{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6603223926976112, "eval_steps": 500, "global_step": 1700, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003884249368809478, "grad_norm": 1.9375, "learning_rate": 0.0, "loss": 1.2052046060562134, "step": 1 }, { "epoch": 0.0007768498737618955, "grad_norm": 1.640625, "learning_rate": 7.017543859649123e-07, "loss": 1.156843662261963, "step": 2 }, { "epoch": 0.0011652748106428432, "grad_norm": 2.40625, "learning_rate": 1.4035087719298246e-06, "loss": 1.4691178798675537, "step": 3 }, { "epoch": 0.001553699747523791, "grad_norm": 2.109375, "learning_rate": 2.105263157894737e-06, "loss": 1.2414131164550781, "step": 4 }, { "epoch": 0.0019421246844047388, "grad_norm": 1.71875, "learning_rate": 2.8070175438596493e-06, "loss": 1.2551203966140747, "step": 5 }, { "epoch": 0.0023305496212856864, "grad_norm": 1.5625, "learning_rate": 3.5087719298245615e-06, "loss": 1.3325955867767334, "step": 6 }, { "epoch": 0.0027189745581666343, "grad_norm": 1.75, "learning_rate": 4.210526315789474e-06, "loss": 1.1091190576553345, "step": 7 }, { "epoch": 0.003107399495047582, "grad_norm": 1.4609375, "learning_rate": 4.912280701754386e-06, "loss": 1.400167465209961, "step": 8 }, { "epoch": 0.0034958244319285296, "grad_norm": 1.5859375, "learning_rate": 5.6140350877192985e-06, "loss": 1.5834916830062866, "step": 9 }, { "epoch": 0.0038842493688094775, "grad_norm": 1.5078125, "learning_rate": 6.31578947368421e-06, "loss": 1.1965928077697754, "step": 10 }, { "epoch": 0.004272674305690425, "grad_norm": 1.4609375, "learning_rate": 7.017543859649123e-06, "loss": 1.3334778547286987, "step": 11 }, { "epoch": 0.004661099242571373, "grad_norm": 1.546875, "learning_rate": 7.719298245614036e-06, "loss": 1.185213327407837, "step": 12 }, { "epoch": 0.005049524179452321, "grad_norm": 1.8125, "learning_rate": 8.421052631578948e-06, "loss": 1.1630058288574219, "step": 13 }, { "epoch": 0.005437949116333269, "grad_norm": 1.4765625, "learning_rate": 9.12280701754386e-06, "loss": 1.2109041213989258, "step": 14 }, { "epoch": 0.0058263740532142165, "grad_norm": 1.765625, "learning_rate": 9.824561403508772e-06, "loss": 1.1873822212219238, "step": 15 }, { "epoch": 0.006214798990095164, "grad_norm": 1.40625, "learning_rate": 1.0526315789473684e-05, "loss": 1.301408052444458, "step": 16 }, { "epoch": 0.006603223926976112, "grad_norm": 2.34375, "learning_rate": 1.1228070175438597e-05, "loss": 1.562785267829895, "step": 17 }, { "epoch": 0.006991648863857059, "grad_norm": 1.125, "learning_rate": 1.192982456140351e-05, "loss": 1.2167153358459473, "step": 18 }, { "epoch": 0.007380073800738007, "grad_norm": 1.28125, "learning_rate": 1.263157894736842e-05, "loss": 1.451377034187317, "step": 19 }, { "epoch": 0.007768498737618955, "grad_norm": 1.3671875, "learning_rate": 1.3333333333333333e-05, "loss": 1.358604073524475, "step": 20 }, { "epoch": 0.008156923674499902, "grad_norm": 1.3671875, "learning_rate": 1.4035087719298246e-05, "loss": 1.2347151041030884, "step": 21 }, { "epoch": 0.00854534861138085, "grad_norm": 1.171875, "learning_rate": 1.4736842105263159e-05, "loss": 1.723771572113037, "step": 22 }, { "epoch": 0.008933773548261798, "grad_norm": 1.1328125, "learning_rate": 1.543859649122807e-05, "loss": 1.1897292137145996, "step": 23 }, { "epoch": 0.009322198485142746, "grad_norm": 1.3046875, "learning_rate": 1.6140350877192984e-05, "loss": 1.5507746934890747, "step": 24 }, { "epoch": 0.009710623422023694, "grad_norm": 1.1640625, "learning_rate": 1.6842105263157896e-05, "loss": 1.6277556419372559, "step": 25 }, { "epoch": 0.010099048358904641, "grad_norm": 1.25, "learning_rate": 1.754385964912281e-05, "loss": 1.1887000799179077, "step": 26 }, { "epoch": 0.01048747329578559, "grad_norm": 1.1484375, "learning_rate": 1.824561403508772e-05, "loss": 1.4003136157989502, "step": 27 }, { "epoch": 0.010875898232666537, "grad_norm": 1.1484375, "learning_rate": 1.894736842105263e-05, "loss": 1.3821872472763062, "step": 28 }, { "epoch": 0.011264323169547485, "grad_norm": 1.2265625, "learning_rate": 1.9649122807017544e-05, "loss": 1.0592646598815918, "step": 29 }, { "epoch": 0.011652748106428433, "grad_norm": 1.125, "learning_rate": 2.035087719298246e-05, "loss": 1.339829683303833, "step": 30 }, { "epoch": 0.012041173043309381, "grad_norm": 1.046875, "learning_rate": 2.105263157894737e-05, "loss": 1.243274450302124, "step": 31 }, { "epoch": 0.012429597980190329, "grad_norm": 1.1796875, "learning_rate": 2.1754385964912285e-05, "loss": 1.1084136962890625, "step": 32 }, { "epoch": 0.012818022917071277, "grad_norm": 1.296875, "learning_rate": 2.2456140350877194e-05, "loss": 1.1394574642181396, "step": 33 }, { "epoch": 0.013206447853952225, "grad_norm": 1.1640625, "learning_rate": 2.3157894736842107e-05, "loss": 1.1227282285690308, "step": 34 }, { "epoch": 0.01359487279083317, "grad_norm": 1.140625, "learning_rate": 2.385964912280702e-05, "loss": 1.3615882396697998, "step": 35 }, { "epoch": 0.013983297727714119, "grad_norm": 1.3046875, "learning_rate": 2.4561403508771932e-05, "loss": 1.0999336242675781, "step": 36 }, { "epoch": 0.014371722664595066, "grad_norm": 1.1171875, "learning_rate": 2.526315789473684e-05, "loss": 1.3443243503570557, "step": 37 }, { "epoch": 0.014760147601476014, "grad_norm": 1.0703125, "learning_rate": 2.5964912280701757e-05, "loss": 1.3359915018081665, "step": 38 }, { "epoch": 0.015148572538356962, "grad_norm": 1.1171875, "learning_rate": 2.6666666666666667e-05, "loss": 1.3015429973602295, "step": 39 }, { "epoch": 0.01553699747523791, "grad_norm": 1.296875, "learning_rate": 2.7368421052631583e-05, "loss": 1.3782955408096313, "step": 40 }, { "epoch": 0.015925422412118858, "grad_norm": 0.9375, "learning_rate": 2.8070175438596492e-05, "loss": 0.9564651250839233, "step": 41 }, { "epoch": 0.016313847348999804, "grad_norm": 1.25, "learning_rate": 2.8771929824561408e-05, "loss": 1.1631165742874146, "step": 42 }, { "epoch": 0.016702272285880754, "grad_norm": 1.171875, "learning_rate": 2.9473684210526317e-05, "loss": 1.3167998790740967, "step": 43 }, { "epoch": 0.0170906972227617, "grad_norm": 1.09375, "learning_rate": 3.017543859649123e-05, "loss": 1.247291088104248, "step": 44 }, { "epoch": 0.01747912215964265, "grad_norm": 1.0703125, "learning_rate": 3.087719298245614e-05, "loss": 1.1648046970367432, "step": 45 }, { "epoch": 0.017867547096523596, "grad_norm": 1.03125, "learning_rate": 3.157894736842106e-05, "loss": 1.1113438606262207, "step": 46 }, { "epoch": 0.018255972033404545, "grad_norm": 1.015625, "learning_rate": 3.228070175438597e-05, "loss": 1.1884386539459229, "step": 47 }, { "epoch": 0.01864439697028549, "grad_norm": 1.0625, "learning_rate": 3.298245614035088e-05, "loss": 1.1113172769546509, "step": 48 }, { "epoch": 0.01903282190716644, "grad_norm": 1.2109375, "learning_rate": 3.368421052631579e-05, "loss": 1.350731611251831, "step": 49 }, { "epoch": 0.019421246844047387, "grad_norm": 1.109375, "learning_rate": 3.43859649122807e-05, "loss": 1.1782677173614502, "step": 50 }, { "epoch": 0.019809671780928337, "grad_norm": 1.0390625, "learning_rate": 3.508771929824562e-05, "loss": 1.09257173538208, "step": 51 }, { "epoch": 0.020198096717809283, "grad_norm": 1.3125, "learning_rate": 3.578947368421053e-05, "loss": 1.3578851222991943, "step": 52 }, { "epoch": 0.020586521654690233, "grad_norm": 1.5546875, "learning_rate": 3.649122807017544e-05, "loss": 1.2399784326553345, "step": 53 }, { "epoch": 0.02097494659157118, "grad_norm": 1.0625, "learning_rate": 3.719298245614035e-05, "loss": 1.2907016277313232, "step": 54 }, { "epoch": 0.02136337152845213, "grad_norm": 0.94140625, "learning_rate": 3.789473684210526e-05, "loss": 1.1072217226028442, "step": 55 }, { "epoch": 0.021751796465333074, "grad_norm": 1.0703125, "learning_rate": 3.859649122807018e-05, "loss": 1.0099533796310425, "step": 56 }, { "epoch": 0.02214022140221402, "grad_norm": 1.0, "learning_rate": 3.929824561403509e-05, "loss": 1.1833176612854004, "step": 57 }, { "epoch": 0.02252864633909497, "grad_norm": 0.89453125, "learning_rate": 4e-05, "loss": 1.1713721752166748, "step": 58 }, { "epoch": 0.022917071275975916, "grad_norm": 1.0703125, "learning_rate": 3.999998719260585e-05, "loss": 1.0900102853775024, "step": 59 }, { "epoch": 0.023305496212856866, "grad_norm": 1.09375, "learning_rate": 3.999994877043978e-05, "loss": 1.3123880624771118, "step": 60 }, { "epoch": 0.023693921149737812, "grad_norm": 1.0625, "learning_rate": 3.9999884733551e-05, "loss": 1.3595911264419556, "step": 61 }, { "epoch": 0.024082346086618762, "grad_norm": 1.0078125, "learning_rate": 3.9999795082021543e-05, "loss": 1.2159225940704346, "step": 62 }, { "epoch": 0.024470771023499708, "grad_norm": 1.015625, "learning_rate": 3.999967981596622e-05, "loss": 1.247667670249939, "step": 63 }, { "epoch": 0.024859195960380658, "grad_norm": 1.125, "learning_rate": 3.999953893553265e-05, "loss": 1.2842686176300049, "step": 64 }, { "epoch": 0.025247620897261604, "grad_norm": 1.3359375, "learning_rate": 3.999937244090127e-05, "loss": 1.1858339309692383, "step": 65 }, { "epoch": 0.025636045834142553, "grad_norm": 0.99609375, "learning_rate": 3.999918033228531e-05, "loss": 1.274332880973816, "step": 66 }, { "epoch": 0.0260244707710235, "grad_norm": 1.046875, "learning_rate": 3.9998962609930824e-05, "loss": 1.1893506050109863, "step": 67 }, { "epoch": 0.02641289570790445, "grad_norm": 1.1796875, "learning_rate": 3.999871927411664e-05, "loss": 1.154479742050171, "step": 68 }, { "epoch": 0.026801320644785395, "grad_norm": 1.03125, "learning_rate": 3.999845032515443e-05, "loss": 1.1241350173950195, "step": 69 }, { "epoch": 0.02718974558166634, "grad_norm": 1.046875, "learning_rate": 3.999815576338862e-05, "loss": 1.038230299949646, "step": 70 }, { "epoch": 0.02757817051854729, "grad_norm": 1.03125, "learning_rate": 3.999783558919649e-05, "loss": 1.1823713779449463, "step": 71 }, { "epoch": 0.027966595455428237, "grad_norm": 1.09375, "learning_rate": 3.9997489802988096e-05, "loss": 1.1666415929794312, "step": 72 }, { "epoch": 0.028355020392309187, "grad_norm": 1.4296875, "learning_rate": 3.99971184052063e-05, "loss": 1.1876068115234375, "step": 73 }, { "epoch": 0.028743445329190133, "grad_norm": 1.0078125, "learning_rate": 3.999672139632675e-05, "loss": 1.559510350227356, "step": 74 }, { "epoch": 0.029131870266071083, "grad_norm": 0.9921875, "learning_rate": 3.9996298776857924e-05, "loss": 1.4810776710510254, "step": 75 }, { "epoch": 0.02952029520295203, "grad_norm": 1.0234375, "learning_rate": 3.999585054734109e-05, "loss": 1.0881098508834839, "step": 76 }, { "epoch": 0.02990872013983298, "grad_norm": 0.953125, "learning_rate": 3.9995376708350304e-05, "loss": 1.0982859134674072, "step": 77 }, { "epoch": 0.030297145076713924, "grad_norm": 1.0234375, "learning_rate": 3.999487726049244e-05, "loss": 0.9817754030227661, "step": 78 }, { "epoch": 0.030685570013594874, "grad_norm": 1.046875, "learning_rate": 3.9994352204407156e-05, "loss": 1.108202338218689, "step": 79 }, { "epoch": 0.03107399495047582, "grad_norm": 1.0234375, "learning_rate": 3.999380154076691e-05, "loss": 1.0929515361785889, "step": 80 }, { "epoch": 0.03146241988735677, "grad_norm": 0.90625, "learning_rate": 3.999322527027696e-05, "loss": 1.2119938135147095, "step": 81 }, { "epoch": 0.031850844824237716, "grad_norm": 0.984375, "learning_rate": 3.999262339367536e-05, "loss": 1.3366830348968506, "step": 82 }, { "epoch": 0.03223926976111866, "grad_norm": 1.046875, "learning_rate": 3.9991995911732955e-05, "loss": 1.3447675704956055, "step": 83 }, { "epoch": 0.03262769469799961, "grad_norm": 1.0703125, "learning_rate": 3.9991342825253385e-05, "loss": 1.0723795890808105, "step": 84 }, { "epoch": 0.03301611963488056, "grad_norm": 1.0390625, "learning_rate": 3.999066413507309e-05, "loss": 1.3833125829696655, "step": 85 }, { "epoch": 0.03340454457176151, "grad_norm": 0.96875, "learning_rate": 3.998995984206128e-05, "loss": 1.0655522346496582, "step": 86 }, { "epoch": 0.033792969508642454, "grad_norm": 1.0703125, "learning_rate": 3.998922994712e-05, "loss": 1.141582727432251, "step": 87 }, { "epoch": 0.0341813944455234, "grad_norm": 0.98828125, "learning_rate": 3.9988474451184026e-05, "loss": 1.5044329166412354, "step": 88 }, { "epoch": 0.03456981938240435, "grad_norm": 1.0546875, "learning_rate": 3.998769335522096e-05, "loss": 1.0026109218597412, "step": 89 }, { "epoch": 0.0349582443192853, "grad_norm": 1.15625, "learning_rate": 3.9986886660231184e-05, "loss": 1.3367772102355957, "step": 90 }, { "epoch": 0.035346669256166245, "grad_norm": 0.9765625, "learning_rate": 3.9986054367247866e-05, "loss": 1.3066233396530151, "step": 91 }, { "epoch": 0.03573509419304719, "grad_norm": 1.0234375, "learning_rate": 3.998519647733696e-05, "loss": 1.2795196771621704, "step": 92 }, { "epoch": 0.036123519129928144, "grad_norm": 1.046875, "learning_rate": 3.9984312991597194e-05, "loss": 1.1808704137802124, "step": 93 }, { "epoch": 0.03651194406680909, "grad_norm": 1.046875, "learning_rate": 3.998340391116008e-05, "loss": 1.3668456077575684, "step": 94 }, { "epoch": 0.03690036900369004, "grad_norm": 1.0078125, "learning_rate": 3.9982469237189914e-05, "loss": 0.9315815567970276, "step": 95 }, { "epoch": 0.03728879394057098, "grad_norm": 0.9921875, "learning_rate": 3.998150897088378e-05, "loss": 1.2619706392288208, "step": 96 }, { "epoch": 0.03767721887745193, "grad_norm": 1.1171875, "learning_rate": 3.998052311347151e-05, "loss": 1.0360974073410034, "step": 97 }, { "epoch": 0.03806564381433288, "grad_norm": 1.0078125, "learning_rate": 3.997951166621575e-05, "loss": 1.0678105354309082, "step": 98 }, { "epoch": 0.03845406875121383, "grad_norm": 1.0390625, "learning_rate": 3.997847463041189e-05, "loss": 1.2845054864883423, "step": 99 }, { "epoch": 0.038842493688094774, "grad_norm": 1.125, "learning_rate": 3.997741200738811e-05, "loss": 1.101405382156372, "step": 100 }, { "epoch": 0.03923091862497572, "grad_norm": 1.046875, "learning_rate": 3.997632379850534e-05, "loss": 1.1669236421585083, "step": 101 }, { "epoch": 0.039619343561856674, "grad_norm": 1.1328125, "learning_rate": 3.997521000515731e-05, "loss": 1.3568894863128662, "step": 102 }, { "epoch": 0.04000776849873762, "grad_norm": 1.0234375, "learning_rate": 3.997407062877048e-05, "loss": 1.388906717300415, "step": 103 }, { "epoch": 0.040396193435618566, "grad_norm": 0.98046875, "learning_rate": 3.99729056708041e-05, "loss": 1.0699307918548584, "step": 104 }, { "epoch": 0.04078461837249951, "grad_norm": 1.171875, "learning_rate": 3.997171513275019e-05, "loss": 0.9556429982185364, "step": 105 }, { "epoch": 0.041173043309380465, "grad_norm": 1.0390625, "learning_rate": 3.997049901613351e-05, "loss": 1.499502182006836, "step": 106 }, { "epoch": 0.04156146824626141, "grad_norm": 1.0859375, "learning_rate": 3.996925732251158e-05, "loss": 1.154077410697937, "step": 107 }, { "epoch": 0.04194989318314236, "grad_norm": 0.97265625, "learning_rate": 3.99679900534747e-05, "loss": 1.2556886672973633, "step": 108 }, { "epoch": 0.042338318120023304, "grad_norm": 1.1640625, "learning_rate": 3.99666972106459e-05, "loss": 1.2007791996002197, "step": 109 }, { "epoch": 0.04272674305690426, "grad_norm": 1.1953125, "learning_rate": 3.9965378795680994e-05, "loss": 1.537756085395813, "step": 110 }, { "epoch": 0.0431151679937852, "grad_norm": 1.125, "learning_rate": 3.9964034810268504e-05, "loss": 1.2897405624389648, "step": 111 }, { "epoch": 0.04350359293066615, "grad_norm": 1.1875, "learning_rate": 3.996266525612973e-05, "loss": 1.7078125476837158, "step": 112 }, { "epoch": 0.043892017867547095, "grad_norm": 1.1015625, "learning_rate": 3.996127013501873e-05, "loss": 1.1563594341278076, "step": 113 }, { "epoch": 0.04428044280442804, "grad_norm": 1.03125, "learning_rate": 3.995984944872228e-05, "loss": 1.1612040996551514, "step": 114 }, { "epoch": 0.044668867741308994, "grad_norm": 1.046875, "learning_rate": 3.99584031990599e-05, "loss": 1.3883051872253418, "step": 115 }, { "epoch": 0.04505729267818994, "grad_norm": 1.140625, "learning_rate": 3.995693138788388e-05, "loss": 1.0139373540878296, "step": 116 }, { "epoch": 0.04544571761507089, "grad_norm": 1.015625, "learning_rate": 3.9955434017079206e-05, "loss": 1.2382270097732544, "step": 117 }, { "epoch": 0.04583414255195183, "grad_norm": 1.109375, "learning_rate": 3.995391108856363e-05, "loss": 1.0666306018829346, "step": 118 }, { "epoch": 0.046222567488832786, "grad_norm": 1.15625, "learning_rate": 3.9952362604287626e-05, "loss": 1.0969278812408447, "step": 119 }, { "epoch": 0.04661099242571373, "grad_norm": 1.0234375, "learning_rate": 3.99507885662344e-05, "loss": 1.2801592350006104, "step": 120 }, { "epoch": 0.04699941736259468, "grad_norm": 0.9609375, "learning_rate": 3.994918897641988e-05, "loss": 1.2142925262451172, "step": 121 }, { "epoch": 0.047387842299475624, "grad_norm": 1.0390625, "learning_rate": 3.9947563836892725e-05, "loss": 1.0754859447479248, "step": 122 }, { "epoch": 0.04777626723635658, "grad_norm": 1.015625, "learning_rate": 3.994591314973432e-05, "loss": 1.383463740348816, "step": 123 }, { "epoch": 0.048164692173237524, "grad_norm": 1.1875, "learning_rate": 3.9944236917058755e-05, "loss": 1.3704304695129395, "step": 124 }, { "epoch": 0.04855311711011847, "grad_norm": 1.0546875, "learning_rate": 3.994253514101286e-05, "loss": 1.2984403371810913, "step": 125 }, { "epoch": 0.048941542046999416, "grad_norm": 1.0546875, "learning_rate": 3.9940807823776155e-05, "loss": 0.9159691333770752, "step": 126 }, { "epoch": 0.04932996698388036, "grad_norm": 0.94140625, "learning_rate": 3.99390549675609e-05, "loss": 1.336573600769043, "step": 127 }, { "epoch": 0.049718391920761315, "grad_norm": 1.1015625, "learning_rate": 3.993727657461202e-05, "loss": 1.2220174074172974, "step": 128 }, { "epoch": 0.05010681685764226, "grad_norm": 1.046875, "learning_rate": 3.9935472647207196e-05, "loss": 1.1247215270996094, "step": 129 }, { "epoch": 0.05049524179452321, "grad_norm": 0.984375, "learning_rate": 3.9933643187656786e-05, "loss": 1.119951844215393, "step": 130 }, { "epoch": 0.050883666731404154, "grad_norm": 0.9609375, "learning_rate": 3.993178819830384e-05, "loss": 1.1053929328918457, "step": 131 }, { "epoch": 0.05127209166828511, "grad_norm": 1.2421875, "learning_rate": 3.992990768152412e-05, "loss": 1.050331473350525, "step": 132 }, { "epoch": 0.05166051660516605, "grad_norm": 0.84375, "learning_rate": 3.992800163972609e-05, "loss": 1.1509952545166016, "step": 133 }, { "epoch": 0.052048941542047, "grad_norm": 0.90625, "learning_rate": 3.992607007535088e-05, "loss": 1.1989561319351196, "step": 134 }, { "epoch": 0.052437366478927945, "grad_norm": 1.0625, "learning_rate": 3.992411299087232e-05, "loss": 1.256465196609497, "step": 135 }, { "epoch": 0.0528257914158089, "grad_norm": 1.046875, "learning_rate": 3.992213038879693e-05, "loss": 1.2814881801605225, "step": 136 }, { "epoch": 0.053214216352689844, "grad_norm": 1.1796875, "learning_rate": 3.9920122271663915e-05, "loss": 1.0654264688491821, "step": 137 }, { "epoch": 0.05360264128957079, "grad_norm": 1.1171875, "learning_rate": 3.9918088642045126e-05, "loss": 1.0711395740509033, "step": 138 }, { "epoch": 0.05399106622645174, "grad_norm": 0.98828125, "learning_rate": 3.991602950254514e-05, "loss": 1.1775628328323364, "step": 139 }, { "epoch": 0.05437949116333268, "grad_norm": 1.046875, "learning_rate": 3.991394485580116e-05, "loss": 1.3190288543701172, "step": 140 }, { "epoch": 0.054767916100213636, "grad_norm": 0.94140625, "learning_rate": 3.9911834704483075e-05, "loss": 1.2752089500427246, "step": 141 }, { "epoch": 0.05515634103709458, "grad_norm": 0.98828125, "learning_rate": 3.9909699051293455e-05, "loss": 1.1450399160385132, "step": 142 }, { "epoch": 0.05554476597397553, "grad_norm": 1.046875, "learning_rate": 3.99075378989675e-05, "loss": 0.992607831954956, "step": 143 }, { "epoch": 0.055933190910856474, "grad_norm": 1.203125, "learning_rate": 3.990535125027308e-05, "loss": 1.1784850358963013, "step": 144 }, { "epoch": 0.05632161584773743, "grad_norm": 1.109375, "learning_rate": 3.990313910801074e-05, "loss": 1.3818050622940063, "step": 145 }, { "epoch": 0.056710040784618373, "grad_norm": 1.2421875, "learning_rate": 3.990090147501365e-05, "loss": 1.4116750955581665, "step": 146 }, { "epoch": 0.05709846572149932, "grad_norm": 1.078125, "learning_rate": 3.989863835414762e-05, "loss": 1.219071388244629, "step": 147 }, { "epoch": 0.057486890658380266, "grad_norm": 0.91015625, "learning_rate": 3.989634974831114e-05, "loss": 1.0858508348464966, "step": 148 }, { "epoch": 0.05787531559526122, "grad_norm": 0.93359375, "learning_rate": 3.9894035660435306e-05, "loss": 1.187249779701233, "step": 149 }, { "epoch": 0.058263740532142165, "grad_norm": 1.0078125, "learning_rate": 3.989169609348387e-05, "loss": 1.1618479490280151, "step": 150 }, { "epoch": 0.05865216546902311, "grad_norm": 1.1640625, "learning_rate": 3.9889331050453195e-05, "loss": 0.8996610045433044, "step": 151 }, { "epoch": 0.05904059040590406, "grad_norm": 1.140625, "learning_rate": 3.988694053437229e-05, "loss": 1.2948083877563477, "step": 152 }, { "epoch": 0.059429015342785003, "grad_norm": 1.0703125, "learning_rate": 3.988452454830279e-05, "loss": 1.2242070436477661, "step": 153 }, { "epoch": 0.05981744027966596, "grad_norm": 1.203125, "learning_rate": 3.9882083095338934e-05, "loss": 1.2814799547195435, "step": 154 }, { "epoch": 0.0602058652165469, "grad_norm": 1.109375, "learning_rate": 3.987961617860759e-05, "loss": 1.1929787397384644, "step": 155 }, { "epoch": 0.06059429015342785, "grad_norm": 1.0078125, "learning_rate": 3.987712380126824e-05, "loss": 0.9702768921852112, "step": 156 }, { "epoch": 0.060982715090308795, "grad_norm": 1.09375, "learning_rate": 3.987460596651296e-05, "loss": 1.1169824600219727, "step": 157 }, { "epoch": 0.06137114002718975, "grad_norm": 1.234375, "learning_rate": 3.987206267756645e-05, "loss": 1.204899549484253, "step": 158 }, { "epoch": 0.061759564964070694, "grad_norm": 1.5703125, "learning_rate": 3.9869493937686e-05, "loss": 1.5161927938461304, "step": 159 }, { "epoch": 0.06214798990095164, "grad_norm": 1.0390625, "learning_rate": 3.986689975016149e-05, "loss": 1.2802753448486328, "step": 160 }, { "epoch": 0.0625364148378326, "grad_norm": 1.0703125, "learning_rate": 3.986428011831541e-05, "loss": 1.0637199878692627, "step": 161 }, { "epoch": 0.06292483977471354, "grad_norm": 1.0390625, "learning_rate": 3.986163504550281e-05, "loss": 1.2565662860870361, "step": 162 }, { "epoch": 0.06331326471159449, "grad_norm": 1.21875, "learning_rate": 3.9858964535111344e-05, "loss": 1.407301664352417, "step": 163 }, { "epoch": 0.06370168964847543, "grad_norm": 1.0703125, "learning_rate": 3.985626859056124e-05, "loss": 1.171770691871643, "step": 164 }, { "epoch": 0.06409011458535638, "grad_norm": 0.9921875, "learning_rate": 3.9853547215305315e-05, "loss": 0.9284271597862244, "step": 165 }, { "epoch": 0.06447853952223732, "grad_norm": 1.15625, "learning_rate": 3.985080041282892e-05, "loss": 0.949539065361023, "step": 166 }, { "epoch": 0.06486696445911827, "grad_norm": 0.96875, "learning_rate": 3.9848028186650004e-05, "loss": 1.1514685153961182, "step": 167 }, { "epoch": 0.06525538939599922, "grad_norm": 0.98828125, "learning_rate": 3.984523054031907e-05, "loss": 1.1563186645507812, "step": 168 }, { "epoch": 0.06564381433288018, "grad_norm": 1.125, "learning_rate": 3.9842407477419166e-05, "loss": 1.4168965816497803, "step": 169 }, { "epoch": 0.06603223926976112, "grad_norm": 1.3515625, "learning_rate": 3.98395590015659e-05, "loss": 1.0440829992294312, "step": 170 }, { "epoch": 0.06642066420664207, "grad_norm": 1.03125, "learning_rate": 3.983668511640743e-05, "loss": 1.1173646450042725, "step": 171 }, { "epoch": 0.06680908914352301, "grad_norm": 0.95703125, "learning_rate": 3.983378582562446e-05, "loss": 1.1747249364852905, "step": 172 }, { "epoch": 0.06719751408040396, "grad_norm": 1.09375, "learning_rate": 3.983086113293021e-05, "loss": 1.5655328035354614, "step": 173 }, { "epoch": 0.06758593901728491, "grad_norm": 1.078125, "learning_rate": 3.982791104207047e-05, "loss": 1.0933256149291992, "step": 174 }, { "epoch": 0.06797436395416585, "grad_norm": 1.1015625, "learning_rate": 3.9824935556823516e-05, "loss": 1.2898495197296143, "step": 175 }, { "epoch": 0.0683627888910468, "grad_norm": 0.94140625, "learning_rate": 3.982193468100019e-05, "loss": 0.9689711332321167, "step": 176 }, { "epoch": 0.06875121382792776, "grad_norm": 1.0546875, "learning_rate": 3.981890841844382e-05, "loss": 1.3364231586456299, "step": 177 }, { "epoch": 0.0691396387648087, "grad_norm": 0.9765625, "learning_rate": 3.981585677303025e-05, "loss": 1.0838921070098877, "step": 178 }, { "epoch": 0.06952806370168965, "grad_norm": 1.15625, "learning_rate": 3.9812779748667874e-05, "loss": 1.1282203197479248, "step": 179 }, { "epoch": 0.0699164886385706, "grad_norm": 1.0078125, "learning_rate": 3.9809677349297525e-05, "loss": 1.2816410064697266, "step": 180 }, { "epoch": 0.07030491357545154, "grad_norm": 1.078125, "learning_rate": 3.9806549578892586e-05, "loss": 1.0542411804199219, "step": 181 }, { "epoch": 0.07069333851233249, "grad_norm": 1.1171875, "learning_rate": 3.9803396441458917e-05, "loss": 1.1153162717819214, "step": 182 }, { "epoch": 0.07108176344921344, "grad_norm": 1.15625, "learning_rate": 3.980021794103485e-05, "loss": 1.0350027084350586, "step": 183 }, { "epoch": 0.07147018838609438, "grad_norm": 1.0390625, "learning_rate": 3.979701408169123e-05, "loss": 1.4738894701004028, "step": 184 }, { "epoch": 0.07185861332297533, "grad_norm": 1.1328125, "learning_rate": 3.979378486753136e-05, "loss": 1.0833780765533447, "step": 185 }, { "epoch": 0.07224703825985629, "grad_norm": 1.1015625, "learning_rate": 3.979053030269103e-05, "loss": 0.987901508808136, "step": 186 }, { "epoch": 0.07263546319673723, "grad_norm": 1.078125, "learning_rate": 3.9787250391338486e-05, "loss": 0.9462146759033203, "step": 187 }, { "epoch": 0.07302388813361818, "grad_norm": 1.109375, "learning_rate": 3.978394513767443e-05, "loss": 0.8942592144012451, "step": 188 }, { "epoch": 0.07341231307049913, "grad_norm": 1.0390625, "learning_rate": 3.9780614545932035e-05, "loss": 1.2562580108642578, "step": 189 }, { "epoch": 0.07380073800738007, "grad_norm": 1.0859375, "learning_rate": 3.9777258620376934e-05, "loss": 1.080777645111084, "step": 190 }, { "epoch": 0.07418916294426102, "grad_norm": 1.1328125, "learning_rate": 3.977387736530717e-05, "loss": 1.0833654403686523, "step": 191 }, { "epoch": 0.07457758788114197, "grad_norm": 1.0546875, "learning_rate": 3.977047078505327e-05, "loss": 1.054891586303711, "step": 192 }, { "epoch": 0.07496601281802291, "grad_norm": 1.0078125, "learning_rate": 3.9767038883978164e-05, "loss": 1.0277347564697266, "step": 193 }, { "epoch": 0.07535443775490386, "grad_norm": 0.9296875, "learning_rate": 3.976358166647723e-05, "loss": 1.330249547958374, "step": 194 }, { "epoch": 0.07574286269178482, "grad_norm": 1.03125, "learning_rate": 3.976009913697827e-05, "loss": 1.2053139209747314, "step": 195 }, { "epoch": 0.07613128762866576, "grad_norm": 1.2265625, "learning_rate": 3.975659129994147e-05, "loss": 0.9735493063926697, "step": 196 }, { "epoch": 0.07651971256554671, "grad_norm": 0.96875, "learning_rate": 3.975305815985948e-05, "loss": 1.2526639699935913, "step": 197 }, { "epoch": 0.07690813750242766, "grad_norm": 1.0390625, "learning_rate": 3.9749499721257316e-05, "loss": 1.414075493812561, "step": 198 }, { "epoch": 0.0772965624393086, "grad_norm": 1.1953125, "learning_rate": 3.974591598869243e-05, "loss": 1.0946063995361328, "step": 199 }, { "epoch": 0.07768498737618955, "grad_norm": 1.1953125, "learning_rate": 3.974230696675463e-05, "loss": 1.1612058877944946, "step": 200 }, { "epoch": 0.0780734123130705, "grad_norm": 1.140625, "learning_rate": 3.973867266006614e-05, "loss": 1.0618281364440918, "step": 201 }, { "epoch": 0.07846183724995144, "grad_norm": 1.21875, "learning_rate": 3.9735013073281564e-05, "loss": 0.9009622931480408, "step": 202 }, { "epoch": 0.0788502621868324, "grad_norm": 0.98046875, "learning_rate": 3.9731328211087875e-05, "loss": 1.0570615530014038, "step": 203 }, { "epoch": 0.07923868712371335, "grad_norm": 1.0390625, "learning_rate": 3.9727618078204416e-05, "loss": 1.1822123527526855, "step": 204 }, { "epoch": 0.0796271120605943, "grad_norm": 1.09375, "learning_rate": 3.972388267938291e-05, "loss": 1.3448340892791748, "step": 205 }, { "epoch": 0.08001553699747524, "grad_norm": 0.95703125, "learning_rate": 3.972012201940743e-05, "loss": 1.3228516578674316, "step": 206 }, { "epoch": 0.08040396193435619, "grad_norm": 1.015625, "learning_rate": 3.971633610309439e-05, "loss": 1.4562703371047974, "step": 207 }, { "epoch": 0.08079238687123713, "grad_norm": 1.0, "learning_rate": 3.971252493529257e-05, "loss": 1.2324013710021973, "step": 208 }, { "epoch": 0.08118081180811808, "grad_norm": 1.046875, "learning_rate": 3.970868852088308e-05, "loss": 1.0579476356506348, "step": 209 }, { "epoch": 0.08156923674499902, "grad_norm": 1.015625, "learning_rate": 3.970482686477937e-05, "loss": 1.0274438858032227, "step": 210 }, { "epoch": 0.08195766168187997, "grad_norm": 1.09375, "learning_rate": 3.970093997192722e-05, "loss": 1.2230676412582397, "step": 211 }, { "epoch": 0.08234608661876093, "grad_norm": 1.03125, "learning_rate": 3.969702784730471e-05, "loss": 1.1012083292007446, "step": 212 }, { "epoch": 0.08273451155564188, "grad_norm": 1.0703125, "learning_rate": 3.9693090495922273e-05, "loss": 0.9031573534011841, "step": 213 }, { "epoch": 0.08312293649252282, "grad_norm": 1.0625, "learning_rate": 3.9689127922822616e-05, "loss": 1.0552705526351929, "step": 214 }, { "epoch": 0.08351136142940377, "grad_norm": 1.0234375, "learning_rate": 3.968514013308077e-05, "loss": 0.9612170457839966, "step": 215 }, { "epoch": 0.08389978636628471, "grad_norm": 1.125, "learning_rate": 3.9681127131804055e-05, "loss": 1.3693969249725342, "step": 216 }, { "epoch": 0.08428821130316566, "grad_norm": 0.9609375, "learning_rate": 3.967708892413207e-05, "loss": 1.3199872970581055, "step": 217 }, { "epoch": 0.08467663624004661, "grad_norm": 1.140625, "learning_rate": 3.967302551523671e-05, "loss": 1.1120353937149048, "step": 218 }, { "epoch": 0.08506506117692755, "grad_norm": 0.98828125, "learning_rate": 3.966893691032216e-05, "loss": 0.9947315454483032, "step": 219 }, { "epoch": 0.08545348611380851, "grad_norm": 1.1484375, "learning_rate": 3.966482311462483e-05, "loss": 0.9557245969772339, "step": 220 }, { "epoch": 0.08584191105068946, "grad_norm": 0.9921875, "learning_rate": 3.9660684133413436e-05, "loss": 1.5002896785736084, "step": 221 }, { "epoch": 0.0862303359875704, "grad_norm": 1.1015625, "learning_rate": 3.965651997198893e-05, "loss": 0.7842967510223389, "step": 222 }, { "epoch": 0.08661876092445135, "grad_norm": 1.2578125, "learning_rate": 3.965233063568453e-05, "loss": 1.0095837116241455, "step": 223 }, { "epoch": 0.0870071858613323, "grad_norm": 1.203125, "learning_rate": 3.964811612986566e-05, "loss": 0.9000999927520752, "step": 224 }, { "epoch": 0.08739561079821324, "grad_norm": 0.98046875, "learning_rate": 3.964387645993003e-05, "loss": 1.3031398057937622, "step": 225 }, { "epoch": 0.08778403573509419, "grad_norm": 0.984375, "learning_rate": 3.9639611631307536e-05, "loss": 1.1282540559768677, "step": 226 }, { "epoch": 0.08817246067197514, "grad_norm": 1.0390625, "learning_rate": 3.963532164946032e-05, "loss": 1.0256094932556152, "step": 227 }, { "epoch": 0.08856088560885608, "grad_norm": 1.0390625, "learning_rate": 3.963100651988272e-05, "loss": 1.0437159538269043, "step": 228 }, { "epoch": 0.08894931054573704, "grad_norm": 1.015625, "learning_rate": 3.96266662481013e-05, "loss": 0.984143853187561, "step": 229 }, { "epoch": 0.08933773548261799, "grad_norm": 0.99609375, "learning_rate": 3.962230083967482e-05, "loss": 1.154150366783142, "step": 230 }, { "epoch": 0.08972616041949893, "grad_norm": 0.95703125, "learning_rate": 3.961791030019423e-05, "loss": 1.301011562347412, "step": 231 }, { "epoch": 0.09011458535637988, "grad_norm": 1.0078125, "learning_rate": 3.961349463528266e-05, "loss": 1.0782634019851685, "step": 232 }, { "epoch": 0.09050301029326083, "grad_norm": 1.6953125, "learning_rate": 3.960905385059543e-05, "loss": 1.6677277088165283, "step": 233 }, { "epoch": 0.09089143523014177, "grad_norm": 1.234375, "learning_rate": 3.960458795182003e-05, "loss": 1.199458360671997, "step": 234 }, { "epoch": 0.09127986016702272, "grad_norm": 0.94140625, "learning_rate": 3.960009694467611e-05, "loss": 1.1999191045761108, "step": 235 }, { "epoch": 0.09166828510390367, "grad_norm": 1.234375, "learning_rate": 3.9595580834915486e-05, "loss": 1.1523382663726807, "step": 236 }, { "epoch": 0.09205671004078461, "grad_norm": 1.140625, "learning_rate": 3.95910396283221e-05, "loss": 1.1133179664611816, "step": 237 }, { "epoch": 0.09244513497766557, "grad_norm": 1.171875, "learning_rate": 3.9586473330712085e-05, "loss": 1.1568917036056519, "step": 238 }, { "epoch": 0.09283355991454652, "grad_norm": 0.98046875, "learning_rate": 3.958188194793365e-05, "loss": 0.9494340419769287, "step": 239 }, { "epoch": 0.09322198485142746, "grad_norm": 1.015625, "learning_rate": 3.957726548586717e-05, "loss": 1.0637331008911133, "step": 240 }, { "epoch": 0.09361040978830841, "grad_norm": 1.015625, "learning_rate": 3.957262395042513e-05, "loss": 1.206292748451233, "step": 241 }, { "epoch": 0.09399883472518936, "grad_norm": 0.91015625, "learning_rate": 3.956795734755213e-05, "loss": 0.8818873763084412, "step": 242 }, { "epoch": 0.0943872596620703, "grad_norm": 1.2734375, "learning_rate": 3.956326568322487e-05, "loss": 1.0712517499923706, "step": 243 }, { "epoch": 0.09477568459895125, "grad_norm": 1.0234375, "learning_rate": 3.9558548963452146e-05, "loss": 1.0740681886672974, "step": 244 }, { "epoch": 0.0951641095358322, "grad_norm": 1.109375, "learning_rate": 3.9553807194274846e-05, "loss": 1.2452702522277832, "step": 245 }, { "epoch": 0.09555253447271315, "grad_norm": 1.046875, "learning_rate": 3.9549040381765955e-05, "loss": 0.9898457527160645, "step": 246 }, { "epoch": 0.0959409594095941, "grad_norm": 1.1171875, "learning_rate": 3.95442485320305e-05, "loss": 1.0777288675308228, "step": 247 }, { "epoch": 0.09632938434647505, "grad_norm": 1.0859375, "learning_rate": 3.9539431651205606e-05, "loss": 1.3255614042282104, "step": 248 }, { "epoch": 0.096717809283356, "grad_norm": 1.1484375, "learning_rate": 3.953458974546043e-05, "loss": 1.2613506317138672, "step": 249 }, { "epoch": 0.09710623422023694, "grad_norm": 1.21875, "learning_rate": 3.95297228209962e-05, "loss": 1.1285110712051392, "step": 250 }, { "epoch": 0.09749465915711789, "grad_norm": 1.0390625, "learning_rate": 3.952483088404617e-05, "loss": 1.2686623334884644, "step": 251 }, { "epoch": 0.09788308409399883, "grad_norm": 1.046875, "learning_rate": 3.951991394087565e-05, "loss": 1.231400728225708, "step": 252 }, { "epoch": 0.09827150903087978, "grad_norm": 0.87890625, "learning_rate": 3.951497199778195e-05, "loss": 0.8219915628433228, "step": 253 }, { "epoch": 0.09865993396776072, "grad_norm": 1.0078125, "learning_rate": 3.951000506109441e-05, "loss": 1.0561715364456177, "step": 254 }, { "epoch": 0.09904835890464168, "grad_norm": 0.98046875, "learning_rate": 3.9505013137174395e-05, "loss": 0.8940542340278625, "step": 255 }, { "epoch": 0.09943678384152263, "grad_norm": 1.125, "learning_rate": 3.949999623241524e-05, "loss": 1.188100814819336, "step": 256 }, { "epoch": 0.09982520877840358, "grad_norm": 1.078125, "learning_rate": 3.949495435324232e-05, "loss": 1.145621418952942, "step": 257 }, { "epoch": 0.10021363371528452, "grad_norm": 1.0546875, "learning_rate": 3.948988750611294e-05, "loss": 1.114254117012024, "step": 258 }, { "epoch": 0.10060205865216547, "grad_norm": 1.0390625, "learning_rate": 3.948479569751643e-05, "loss": 1.1786916255950928, "step": 259 }, { "epoch": 0.10099048358904641, "grad_norm": 1.078125, "learning_rate": 3.9479678933974056e-05, "loss": 1.2439488172531128, "step": 260 }, { "epoch": 0.10137890852592736, "grad_norm": 1.1328125, "learning_rate": 3.947453722203907e-05, "loss": 0.8902916312217712, "step": 261 }, { "epoch": 0.10176733346280831, "grad_norm": 0.91796875, "learning_rate": 3.946937056829666e-05, "loss": 1.296683669090271, "step": 262 }, { "epoch": 0.10215575839968925, "grad_norm": 1.140625, "learning_rate": 3.946417897936396e-05, "loss": 1.2797737121582031, "step": 263 }, { "epoch": 0.10254418333657021, "grad_norm": 1.0625, "learning_rate": 3.945896246189006e-05, "loss": 1.126288652420044, "step": 264 }, { "epoch": 0.10293260827345116, "grad_norm": 1.046875, "learning_rate": 3.9453721022555934e-05, "loss": 0.9240126013755798, "step": 265 }, { "epoch": 0.1033210332103321, "grad_norm": 1.796875, "learning_rate": 3.944845466807451e-05, "loss": 1.4072601795196533, "step": 266 }, { "epoch": 0.10370945814721305, "grad_norm": 1.03125, "learning_rate": 3.9443163405190625e-05, "loss": 0.959816575050354, "step": 267 }, { "epoch": 0.104097883084094, "grad_norm": 1.046875, "learning_rate": 3.9437847240680996e-05, "loss": 1.110178828239441, "step": 268 }, { "epoch": 0.10448630802097494, "grad_norm": 1.21875, "learning_rate": 3.943250618135425e-05, "loss": 1.406478762626648, "step": 269 }, { "epoch": 0.10487473295785589, "grad_norm": 1.0234375, "learning_rate": 3.942714023405089e-05, "loss": 1.042146921157837, "step": 270 }, { "epoch": 0.10526315789473684, "grad_norm": 1.0625, "learning_rate": 3.94217494056433e-05, "loss": 1.0666165351867676, "step": 271 }, { "epoch": 0.1056515828316178, "grad_norm": 1.0234375, "learning_rate": 3.941633370303572e-05, "loss": 1.0088145732879639, "step": 272 }, { "epoch": 0.10604000776849874, "grad_norm": 0.9921875, "learning_rate": 3.9410893133164265e-05, "loss": 1.2333273887634277, "step": 273 }, { "epoch": 0.10642843270537969, "grad_norm": 1.203125, "learning_rate": 3.940542770299687e-05, "loss": 0.9709728360176086, "step": 274 }, { "epoch": 0.10681685764226063, "grad_norm": 1.1328125, "learning_rate": 3.939993741953334e-05, "loss": 1.2241806983947754, "step": 275 }, { "epoch": 0.10720528257914158, "grad_norm": 0.984375, "learning_rate": 3.93944222898053e-05, "loss": 1.2368106842041016, "step": 276 }, { "epoch": 0.10759370751602253, "grad_norm": 0.86328125, "learning_rate": 3.938888232087618e-05, "loss": 0.8722430467605591, "step": 277 }, { "epoch": 0.10798213245290347, "grad_norm": 0.96875, "learning_rate": 3.938331751984125e-05, "loss": 1.1484941244125366, "step": 278 }, { "epoch": 0.10837055738978442, "grad_norm": 1.1484375, "learning_rate": 3.937772789382757e-05, "loss": 1.4616508483886719, "step": 279 }, { "epoch": 0.10875898232666537, "grad_norm": 1.03125, "learning_rate": 3.9372113449993976e-05, "loss": 1.011710524559021, "step": 280 }, { "epoch": 0.10914740726354633, "grad_norm": 1.015625, "learning_rate": 3.936647419553113e-05, "loss": 1.1213209629058838, "step": 281 }, { "epoch": 0.10953583220042727, "grad_norm": 1.234375, "learning_rate": 3.936081013766143e-05, "loss": 1.1031363010406494, "step": 282 }, { "epoch": 0.10992425713730822, "grad_norm": 1.140625, "learning_rate": 3.9355121283639075e-05, "loss": 1.1444246768951416, "step": 283 }, { "epoch": 0.11031268207418916, "grad_norm": 1.1875, "learning_rate": 3.934940764074999e-05, "loss": 1.0126041173934937, "step": 284 }, { "epoch": 0.11070110701107011, "grad_norm": 1.015625, "learning_rate": 3.9343669216311875e-05, "loss": 1.0441282987594604, "step": 285 }, { "epoch": 0.11108953194795106, "grad_norm": 1.0390625, "learning_rate": 3.933790601767415e-05, "loss": 1.385947346687317, "step": 286 }, { "epoch": 0.111477956884832, "grad_norm": 0.984375, "learning_rate": 3.933211805221796e-05, "loss": 0.9738495945930481, "step": 287 }, { "epoch": 0.11186638182171295, "grad_norm": 1.078125, "learning_rate": 3.93263053273562e-05, "loss": 1.4304131269454956, "step": 288 }, { "epoch": 0.11225480675859391, "grad_norm": 0.984375, "learning_rate": 3.9320467850533443e-05, "loss": 1.1276483535766602, "step": 289 }, { "epoch": 0.11264323169547485, "grad_norm": 1.046875, "learning_rate": 3.931460562922598e-05, "loss": 1.6643239259719849, "step": 290 }, { "epoch": 0.1130316566323558, "grad_norm": 1.1171875, "learning_rate": 3.93087186709418e-05, "loss": 1.23006010055542, "step": 291 }, { "epoch": 0.11342008156923675, "grad_norm": 0.91796875, "learning_rate": 3.930280698322053e-05, "loss": 0.9697540998458862, "step": 292 }, { "epoch": 0.1138085065061177, "grad_norm": 1.078125, "learning_rate": 3.929687057363354e-05, "loss": 1.1557683944702148, "step": 293 }, { "epoch": 0.11419693144299864, "grad_norm": 1.0234375, "learning_rate": 3.92909094497838e-05, "loss": 1.0606120824813843, "step": 294 }, { "epoch": 0.11458535637987959, "grad_norm": 1.15625, "learning_rate": 3.928492361930596e-05, "loss": 1.3572219610214233, "step": 295 }, { "epoch": 0.11497378131676053, "grad_norm": 1.2890625, "learning_rate": 3.927891308986631e-05, "loss": 1.6732779741287231, "step": 296 }, { "epoch": 0.11536220625364148, "grad_norm": 1.03125, "learning_rate": 3.927287786916278e-05, "loss": 0.9903056025505066, "step": 297 }, { "epoch": 0.11575063119052244, "grad_norm": 1.171875, "learning_rate": 3.9266817964924905e-05, "loss": 1.1329853534698486, "step": 298 }, { "epoch": 0.11613905612740338, "grad_norm": 1.0, "learning_rate": 3.926073338491384e-05, "loss": 1.1376073360443115, "step": 299 }, { "epoch": 0.11652748106428433, "grad_norm": 0.9609375, "learning_rate": 3.925462413692237e-05, "loss": 0.9302178621292114, "step": 300 }, { "epoch": 0.11691590600116528, "grad_norm": 1.1640625, "learning_rate": 3.924849022877482e-05, "loss": 0.9141783714294434, "step": 301 }, { "epoch": 0.11730433093804622, "grad_norm": 0.98046875, "learning_rate": 3.924233166832714e-05, "loss": 1.1870044469833374, "step": 302 }, { "epoch": 0.11769275587492717, "grad_norm": 1.171875, "learning_rate": 3.923614846346685e-05, "loss": 1.1025910377502441, "step": 303 }, { "epoch": 0.11808118081180811, "grad_norm": 1.0859375, "learning_rate": 3.922994062211301e-05, "loss": 1.183861255645752, "step": 304 }, { "epoch": 0.11846960574868906, "grad_norm": 1.0625, "learning_rate": 3.9223708152216256e-05, "loss": 0.9070916175842285, "step": 305 }, { "epoch": 0.11885803068557001, "grad_norm": 1.1796875, "learning_rate": 3.921745106175875e-05, "loss": 0.9971054792404175, "step": 306 }, { "epoch": 0.11924645562245097, "grad_norm": 0.9765625, "learning_rate": 3.921116935875421e-05, "loss": 1.1449792385101318, "step": 307 }, { "epoch": 0.11963488055933191, "grad_norm": 1.0, "learning_rate": 3.920486305124784e-05, "loss": 1.0535002946853638, "step": 308 }, { "epoch": 0.12002330549621286, "grad_norm": 1.125, "learning_rate": 3.9198532147316385e-05, "loss": 1.3713603019714355, "step": 309 }, { "epoch": 0.1204117304330938, "grad_norm": 0.88671875, "learning_rate": 3.919217665506809e-05, "loss": 0.9621338844299316, "step": 310 }, { "epoch": 0.12080015536997475, "grad_norm": 1.046875, "learning_rate": 3.918579658264268e-05, "loss": 1.1848158836364746, "step": 311 }, { "epoch": 0.1211885803068557, "grad_norm": 7.125, "learning_rate": 3.917939193821136e-05, "loss": 1.1307907104492188, "step": 312 }, { "epoch": 0.12157700524373664, "grad_norm": 1.015625, "learning_rate": 3.9172962729976824e-05, "loss": 1.069643259048462, "step": 313 }, { "epoch": 0.12196543018061759, "grad_norm": 0.953125, "learning_rate": 3.91665089661732e-05, "loss": 0.9884704351425171, "step": 314 }, { "epoch": 0.12235385511749855, "grad_norm": 1.1875, "learning_rate": 3.9160030655066076e-05, "loss": 1.2948812246322632, "step": 315 }, { "epoch": 0.1227422800543795, "grad_norm": 1.0390625, "learning_rate": 3.915352780495249e-05, "loss": 1.175618290901184, "step": 316 }, { "epoch": 0.12313070499126044, "grad_norm": 1.203125, "learning_rate": 3.914700042416089e-05, "loss": 1.256517767906189, "step": 317 }, { "epoch": 0.12351912992814139, "grad_norm": 1.0859375, "learning_rate": 3.914044852105116e-05, "loss": 1.2852438688278198, "step": 318 }, { "epoch": 0.12390755486502233, "grad_norm": 0.953125, "learning_rate": 3.913387210401457e-05, "loss": 0.9687092304229736, "step": 319 }, { "epoch": 0.12429597980190328, "grad_norm": 1.1875, "learning_rate": 3.91272711814738e-05, "loss": 1.0068652629852295, "step": 320 }, { "epoch": 0.12468440473878423, "grad_norm": 1.0625, "learning_rate": 3.9120645761882916e-05, "loss": 1.2731925249099731, "step": 321 }, { "epoch": 0.1250728296756652, "grad_norm": 1.1171875, "learning_rate": 3.911399585372735e-05, "loss": 1.1389235258102417, "step": 322 }, { "epoch": 0.12546125461254612, "grad_norm": 1.109375, "learning_rate": 3.91073214655239e-05, "loss": 1.1034696102142334, "step": 323 }, { "epoch": 0.12584967954942708, "grad_norm": 0.9453125, "learning_rate": 3.910062260582072e-05, "loss": 0.9548723697662354, "step": 324 }, { "epoch": 0.126238104486308, "grad_norm": 1.2890625, "learning_rate": 3.9093899283197306e-05, "loss": 1.390270709991455, "step": 325 }, { "epoch": 0.12662652942318897, "grad_norm": 1.0703125, "learning_rate": 3.9087151506264484e-05, "loss": 0.9853108525276184, "step": 326 }, { "epoch": 0.1270149543600699, "grad_norm": 1.0234375, "learning_rate": 3.9080379283664384e-05, "loss": 0.9549964070320129, "step": 327 }, { "epoch": 0.12740337929695086, "grad_norm": 1.109375, "learning_rate": 3.907358262407048e-05, "loss": 1.1180987358093262, "step": 328 }, { "epoch": 0.12779180423383182, "grad_norm": 1.0, "learning_rate": 3.90667615361875e-05, "loss": 1.2803322076797485, "step": 329 }, { "epoch": 0.12818022917071276, "grad_norm": 1.1015625, "learning_rate": 3.9059916028751496e-05, "loss": 1.3838303089141846, "step": 330 }, { "epoch": 0.12856865410759372, "grad_norm": 1.3046875, "learning_rate": 3.905304611052977e-05, "loss": 0.9730018377304077, "step": 331 }, { "epoch": 0.12895707904447465, "grad_norm": 1.0, "learning_rate": 3.9046151790320905e-05, "loss": 1.0566742420196533, "step": 332 }, { "epoch": 0.1293455039813556, "grad_norm": 1.0703125, "learning_rate": 3.903923307695472e-05, "loss": 0.995861291885376, "step": 333 }, { "epoch": 0.12973392891823654, "grad_norm": 1.0859375, "learning_rate": 3.903228997929229e-05, "loss": 1.0659996271133423, "step": 334 }, { "epoch": 0.1301223538551175, "grad_norm": 0.97265625, "learning_rate": 3.9025322506225915e-05, "loss": 0.9561373591423035, "step": 335 }, { "epoch": 0.13051077879199843, "grad_norm": 1.15625, "learning_rate": 3.901833066667911e-05, "loss": 1.1701858043670654, "step": 336 }, { "epoch": 0.1308992037288794, "grad_norm": 1.0078125, "learning_rate": 3.901131446960659e-05, "loss": 1.1244219541549683, "step": 337 }, { "epoch": 0.13128762866576035, "grad_norm": 0.9765625, "learning_rate": 3.900427392399429e-05, "loss": 0.9817262887954712, "step": 338 }, { "epoch": 0.13167605360264129, "grad_norm": 1.046875, "learning_rate": 3.89972090388593e-05, "loss": 1.049609661102295, "step": 339 }, { "epoch": 0.13206447853952225, "grad_norm": 1.0859375, "learning_rate": 3.899011982324992e-05, "loss": 1.159037470817566, "step": 340 }, { "epoch": 0.13245290347640318, "grad_norm": 1.1171875, "learning_rate": 3.898300628624556e-05, "loss": 1.2854981422424316, "step": 341 }, { "epoch": 0.13284132841328414, "grad_norm": 1.1953125, "learning_rate": 3.8975868436956826e-05, "loss": 1.2691904306411743, "step": 342 }, { "epoch": 0.13322975335016507, "grad_norm": 0.890625, "learning_rate": 3.8968706284525436e-05, "loss": 1.1354483366012573, "step": 343 }, { "epoch": 0.13361817828704603, "grad_norm": 0.93359375, "learning_rate": 3.896151983812424e-05, "loss": 1.1412798166275024, "step": 344 }, { "epoch": 0.134006603223927, "grad_norm": 0.92578125, "learning_rate": 3.8954309106957214e-05, "loss": 0.8203668594360352, "step": 345 }, { "epoch": 0.13439502816080792, "grad_norm": 1.203125, "learning_rate": 3.894707410025941e-05, "loss": 1.1172488927841187, "step": 346 }, { "epoch": 0.13478345309768888, "grad_norm": 1.2734375, "learning_rate": 3.893981482729699e-05, "loss": 1.2423323392868042, "step": 347 }, { "epoch": 0.13517187803456981, "grad_norm": 1.234375, "learning_rate": 3.8932531297367205e-05, "loss": 0.9976466298103333, "step": 348 }, { "epoch": 0.13556030297145077, "grad_norm": 1.1328125, "learning_rate": 3.8925223519798344e-05, "loss": 1.2187296152114868, "step": 349 }, { "epoch": 0.1359487279083317, "grad_norm": 1.0703125, "learning_rate": 3.891789150394976e-05, "loss": 0.9638290405273438, "step": 350 }, { "epoch": 0.13633715284521267, "grad_norm": 0.92578125, "learning_rate": 3.891053525921187e-05, "loss": 1.2471314668655396, "step": 351 }, { "epoch": 0.1367255777820936, "grad_norm": 1.1875, "learning_rate": 3.890315479500611e-05, "loss": 1.2054710388183594, "step": 352 }, { "epoch": 0.13711400271897456, "grad_norm": 1.09375, "learning_rate": 3.889575012078491e-05, "loss": 1.054743766784668, "step": 353 }, { "epoch": 0.13750242765585552, "grad_norm": 0.88671875, "learning_rate": 3.8888321246031754e-05, "loss": 1.1583956480026245, "step": 354 }, { "epoch": 0.13789085259273645, "grad_norm": 1.15625, "learning_rate": 3.888086818026107e-05, "loss": 0.9336003661155701, "step": 355 }, { "epoch": 0.1382792775296174, "grad_norm": 1.25, "learning_rate": 3.88733909330183e-05, "loss": 1.0374633073806763, "step": 356 }, { "epoch": 0.13866770246649834, "grad_norm": 1.0859375, "learning_rate": 3.8865889513879866e-05, "loss": 1.2032431364059448, "step": 357 }, { "epoch": 0.1390561274033793, "grad_norm": 1.203125, "learning_rate": 3.8858363932453116e-05, "loss": 0.9254549145698547, "step": 358 }, { "epoch": 0.13944455234026024, "grad_norm": 1.203125, "learning_rate": 3.885081419837636e-05, "loss": 1.218999981880188, "step": 359 }, { "epoch": 0.1398329772771412, "grad_norm": 1.0234375, "learning_rate": 3.8843240321318835e-05, "loss": 1.0493146181106567, "step": 360 }, { "epoch": 0.14022140221402213, "grad_norm": 0.9765625, "learning_rate": 3.883564231098072e-05, "loss": 1.0284656286239624, "step": 361 }, { "epoch": 0.1406098271509031, "grad_norm": 1.25, "learning_rate": 3.882802017709307e-05, "loss": 0.9331860542297363, "step": 362 }, { "epoch": 0.14099825208778405, "grad_norm": 1.0546875, "learning_rate": 3.882037392941786e-05, "loss": 0.986005961894989, "step": 363 }, { "epoch": 0.14138667702466498, "grad_norm": 1.0, "learning_rate": 3.881270357774795e-05, "loss": 1.0227582454681396, "step": 364 }, { "epoch": 0.14177510196154594, "grad_norm": 1.140625, "learning_rate": 3.880500913190704e-05, "loss": 1.076030969619751, "step": 365 }, { "epoch": 0.14216352689842687, "grad_norm": 1.1171875, "learning_rate": 3.879729060174973e-05, "loss": 0.9582054615020752, "step": 366 }, { "epoch": 0.14255195183530783, "grad_norm": 0.96875, "learning_rate": 3.878954799716143e-05, "loss": 1.238734245300293, "step": 367 }, { "epoch": 0.14294037677218877, "grad_norm": 1.046875, "learning_rate": 3.878178132805841e-05, "loss": 1.0535911321640015, "step": 368 }, { "epoch": 0.14332880170906973, "grad_norm": 1.171875, "learning_rate": 3.8773990604387747e-05, "loss": 1.2706632614135742, "step": 369 }, { "epoch": 0.14371722664595066, "grad_norm": 1.15625, "learning_rate": 3.8766175836127323e-05, "loss": 1.0236033201217651, "step": 370 }, { "epoch": 0.14410565158283162, "grad_norm": 1.1015625, "learning_rate": 3.8758337033285826e-05, "loss": 0.8620764017105103, "step": 371 }, { "epoch": 0.14449407651971258, "grad_norm": 0.890625, "learning_rate": 3.8750474205902715e-05, "loss": 1.0091922283172607, "step": 372 }, { "epoch": 0.1448825014565935, "grad_norm": 1.1640625, "learning_rate": 3.8742587364048223e-05, "loss": 1.1668879985809326, "step": 373 }, { "epoch": 0.14527092639347447, "grad_norm": 0.96875, "learning_rate": 3.873467651782335e-05, "loss": 1.0057437419891357, "step": 374 }, { "epoch": 0.1456593513303554, "grad_norm": 1.046875, "learning_rate": 3.872674167735981e-05, "loss": 0.9468935132026672, "step": 375 }, { "epoch": 0.14604777626723636, "grad_norm": 0.984375, "learning_rate": 3.8718782852820076e-05, "loss": 0.8907188177108765, "step": 376 }, { "epoch": 0.1464362012041173, "grad_norm": 1.2265625, "learning_rate": 3.871080005439733e-05, "loss": 1.2029355764389038, "step": 377 }, { "epoch": 0.14682462614099825, "grad_norm": 1.0546875, "learning_rate": 3.870279329231546e-05, "loss": 1.1410369873046875, "step": 378 }, { "epoch": 0.1472130510778792, "grad_norm": 1.0078125, "learning_rate": 3.8694762576829034e-05, "loss": 0.8683645129203796, "step": 379 }, { "epoch": 0.14760147601476015, "grad_norm": 1.2421875, "learning_rate": 3.8686707918223304e-05, "loss": 1.0769332647323608, "step": 380 }, { "epoch": 0.1479899009516411, "grad_norm": 1.09375, "learning_rate": 3.86786293268142e-05, "loss": 1.186623454093933, "step": 381 }, { "epoch": 0.14837832588852204, "grad_norm": 1.015625, "learning_rate": 3.867052681294828e-05, "loss": 0.9107520580291748, "step": 382 }, { "epoch": 0.148766750825403, "grad_norm": 1.0390625, "learning_rate": 3.866240038700276e-05, "loss": 1.0425362586975098, "step": 383 }, { "epoch": 0.14915517576228393, "grad_norm": 1.09375, "learning_rate": 3.865425005938547e-05, "loss": 0.9944266080856323, "step": 384 }, { "epoch": 0.1495436006991649, "grad_norm": 1.046875, "learning_rate": 3.864607584053486e-05, "loss": 1.115670084953308, "step": 385 }, { "epoch": 0.14993202563604582, "grad_norm": 1.2890625, "learning_rate": 3.863787774091997e-05, "loss": 1.0904215574264526, "step": 386 }, { "epoch": 0.15032045057292678, "grad_norm": 0.98046875, "learning_rate": 3.862965577104043e-05, "loss": 1.3724037408828735, "step": 387 }, { "epoch": 0.15070887550980772, "grad_norm": 1.03125, "learning_rate": 3.862140994142645e-05, "loss": 0.902522087097168, "step": 388 }, { "epoch": 0.15109730044668868, "grad_norm": 1.078125, "learning_rate": 3.8613140262638775e-05, "loss": 1.012543797492981, "step": 389 }, { "epoch": 0.15148572538356964, "grad_norm": 1.2578125, "learning_rate": 3.860484674526872e-05, "loss": 1.583787202835083, "step": 390 }, { "epoch": 0.15187415032045057, "grad_norm": 0.95703125, "learning_rate": 3.8596529399938114e-05, "loss": 0.9430757164955139, "step": 391 }, { "epoch": 0.15226257525733153, "grad_norm": 1.0, "learning_rate": 3.858818823729931e-05, "loss": 1.1282473802566528, "step": 392 }, { "epoch": 0.15265100019421246, "grad_norm": 1.1171875, "learning_rate": 3.857982326803516e-05, "loss": 1.1008602380752563, "step": 393 }, { "epoch": 0.15303942513109342, "grad_norm": 1.21875, "learning_rate": 3.857143450285901e-05, "loss": 1.0468552112579346, "step": 394 }, { "epoch": 0.15342785006797435, "grad_norm": 1.03125, "learning_rate": 3.85630219525147e-05, "loss": 1.0168933868408203, "step": 395 }, { "epoch": 0.1538162750048553, "grad_norm": 1.0390625, "learning_rate": 3.855458562777649e-05, "loss": 1.1111522912979126, "step": 396 }, { "epoch": 0.15420469994173627, "grad_norm": 1.125, "learning_rate": 3.854612553944912e-05, "loss": 0.9024568200111389, "step": 397 }, { "epoch": 0.1545931248786172, "grad_norm": 1.0625, "learning_rate": 3.853764169836777e-05, "loss": 1.033591389656067, "step": 398 }, { "epoch": 0.15498154981549817, "grad_norm": 1.1015625, "learning_rate": 3.8529134115398024e-05, "loss": 1.1722760200500488, "step": 399 }, { "epoch": 0.1553699747523791, "grad_norm": 1.1484375, "learning_rate": 3.8520602801435873e-05, "loss": 1.1893808841705322, "step": 400 }, { "epoch": 0.15575839968926006, "grad_norm": 1.265625, "learning_rate": 3.851204776740771e-05, "loss": 1.108094573020935, "step": 401 }, { "epoch": 0.156146824626141, "grad_norm": 1.03125, "learning_rate": 3.850346902427031e-05, "loss": 1.3161089420318604, "step": 402 }, { "epoch": 0.15653524956302195, "grad_norm": 1.15625, "learning_rate": 3.84948665830108e-05, "loss": 1.1102482080459595, "step": 403 }, { "epoch": 0.15692367449990288, "grad_norm": 1.078125, "learning_rate": 3.848624045464667e-05, "loss": 1.1611213684082031, "step": 404 }, { "epoch": 0.15731209943678384, "grad_norm": 0.9296875, "learning_rate": 3.8477590650225735e-05, "loss": 0.8901349306106567, "step": 405 }, { "epoch": 0.1577005243736648, "grad_norm": 1.1171875, "learning_rate": 3.846891718082615e-05, "loss": 1.0212658643722534, "step": 406 }, { "epoch": 0.15808894931054573, "grad_norm": 1.0390625, "learning_rate": 3.846022005755637e-05, "loss": 0.8722613453865051, "step": 407 }, { "epoch": 0.1584773742474267, "grad_norm": 0.984375, "learning_rate": 3.845149929155514e-05, "loss": 1.0967376232147217, "step": 408 }, { "epoch": 0.15886579918430763, "grad_norm": 1.1796875, "learning_rate": 3.844275489399148e-05, "loss": 1.0451778173446655, "step": 409 }, { "epoch": 0.1592542241211886, "grad_norm": 1.046875, "learning_rate": 3.84339868760647e-05, "loss": 1.0236799716949463, "step": 410 }, { "epoch": 0.15964264905806952, "grad_norm": 0.9921875, "learning_rate": 3.842519524900434e-05, "loss": 1.306217074394226, "step": 411 }, { "epoch": 0.16003107399495048, "grad_norm": 1.1796875, "learning_rate": 3.8416380024070175e-05, "loss": 1.0313096046447754, "step": 412 }, { "epoch": 0.1604194989318314, "grad_norm": 1.0, "learning_rate": 3.8407541212552225e-05, "loss": 1.161032795906067, "step": 413 }, { "epoch": 0.16080792386871237, "grad_norm": 1.296875, "learning_rate": 3.8398678825770694e-05, "loss": 1.5692284107208252, "step": 414 }, { "epoch": 0.16119634880559333, "grad_norm": 0.90234375, "learning_rate": 3.838979287507599e-05, "loss": 0.8742800951004028, "step": 415 }, { "epoch": 0.16158477374247426, "grad_norm": 1.0234375, "learning_rate": 3.838088337184871e-05, "loss": 1.0634121894836426, "step": 416 }, { "epoch": 0.16197319867935522, "grad_norm": 0.99609375, "learning_rate": 3.83719503274996e-05, "loss": 1.2643518447875977, "step": 417 }, { "epoch": 0.16236162361623616, "grad_norm": 1.0703125, "learning_rate": 3.836299375346956e-05, "loss": 0.9095296859741211, "step": 418 }, { "epoch": 0.16275004855311712, "grad_norm": 1.0546875, "learning_rate": 3.8354013661229624e-05, "loss": 1.0854108333587646, "step": 419 }, { "epoch": 0.16313847348999805, "grad_norm": 1.109375, "learning_rate": 3.834501006228096e-05, "loss": 0.9407902956008911, "step": 420 }, { "epoch": 0.163526898426879, "grad_norm": 1.0390625, "learning_rate": 3.833598296815483e-05, "loss": 1.0167287588119507, "step": 421 }, { "epoch": 0.16391532336375994, "grad_norm": 0.98828125, "learning_rate": 3.8326932390412584e-05, "loss": 1.0473847389221191, "step": 422 }, { "epoch": 0.1643037483006409, "grad_norm": 1.125, "learning_rate": 3.831785834064565e-05, "loss": 1.1237565279006958, "step": 423 }, { "epoch": 0.16469217323752186, "grad_norm": 1.0625, "learning_rate": 3.830876083047553e-05, "loss": 1.1198922395706177, "step": 424 }, { "epoch": 0.1650805981744028, "grad_norm": 1.09375, "learning_rate": 3.829963987155377e-05, "loss": 1.0426743030548096, "step": 425 }, { "epoch": 0.16546902311128375, "grad_norm": 0.9140625, "learning_rate": 3.829049547556193e-05, "loss": 1.0585099458694458, "step": 426 }, { "epoch": 0.16585744804816469, "grad_norm": 1.15625, "learning_rate": 3.82813276542116e-05, "loss": 0.977573037147522, "step": 427 }, { "epoch": 0.16624587298504565, "grad_norm": 1.109375, "learning_rate": 3.8272136419244366e-05, "loss": 1.1473420858383179, "step": 428 }, { "epoch": 0.16663429792192658, "grad_norm": 0.98828125, "learning_rate": 3.826292178243181e-05, "loss": 0.9159616231918335, "step": 429 }, { "epoch": 0.16702272285880754, "grad_norm": 1.0546875, "learning_rate": 3.825368375557549e-05, "loss": 0.922444224357605, "step": 430 }, { "epoch": 0.16741114779568847, "grad_norm": 1.0703125, "learning_rate": 3.82444223505069e-05, "loss": 0.975006103515625, "step": 431 }, { "epoch": 0.16779957273256943, "grad_norm": 1.1484375, "learning_rate": 3.823513757908748e-05, "loss": 1.166134238243103, "step": 432 }, { "epoch": 0.1681879976694504, "grad_norm": 1.0546875, "learning_rate": 3.822582945320862e-05, "loss": 1.2750545740127563, "step": 433 }, { "epoch": 0.16857642260633132, "grad_norm": 1.0078125, "learning_rate": 3.8216497984791595e-05, "loss": 1.10509192943573, "step": 434 }, { "epoch": 0.16896484754321228, "grad_norm": 1.0703125, "learning_rate": 3.820714318578758e-05, "loss": 1.138566017150879, "step": 435 }, { "epoch": 0.16935327248009321, "grad_norm": 1.25, "learning_rate": 3.819776506817764e-05, "loss": 0.9191439151763916, "step": 436 }, { "epoch": 0.16974169741697417, "grad_norm": 1.125, "learning_rate": 3.81883636439727e-05, "loss": 1.540093183517456, "step": 437 }, { "epoch": 0.1701301223538551, "grad_norm": 1.171875, "learning_rate": 3.817893892521353e-05, "loss": 1.1050527095794678, "step": 438 }, { "epoch": 0.17051854729073607, "grad_norm": 1.140625, "learning_rate": 3.8169490923970746e-05, "loss": 1.085845947265625, "step": 439 }, { "epoch": 0.17090697222761703, "grad_norm": 1.09375, "learning_rate": 3.8160019652344775e-05, "loss": 1.2088279724121094, "step": 440 }, { "epoch": 0.17129539716449796, "grad_norm": 1.0859375, "learning_rate": 3.8150525122465835e-05, "loss": 0.9586250185966492, "step": 441 }, { "epoch": 0.17168382210137892, "grad_norm": 1.0703125, "learning_rate": 3.8141007346493964e-05, "loss": 1.4006540775299072, "step": 442 }, { "epoch": 0.17207224703825985, "grad_norm": 0.98828125, "learning_rate": 3.813146633661894e-05, "loss": 1.130897879600525, "step": 443 }, { "epoch": 0.1724606719751408, "grad_norm": 0.97265625, "learning_rate": 3.812190210506032e-05, "loss": 1.1845965385437012, "step": 444 }, { "epoch": 0.17284909691202174, "grad_norm": 0.96484375, "learning_rate": 3.8112314664067376e-05, "loss": 1.0014785528182983, "step": 445 }, { "epoch": 0.1732375218489027, "grad_norm": 1.203125, "learning_rate": 3.810270402591914e-05, "loss": 1.2004361152648926, "step": 446 }, { "epoch": 0.17362594678578364, "grad_norm": 1.3046875, "learning_rate": 3.809307020292433e-05, "loss": 1.1330876350402832, "step": 447 }, { "epoch": 0.1740143717226646, "grad_norm": 1.0, "learning_rate": 3.8083413207421344e-05, "loss": 1.2742269039154053, "step": 448 }, { "epoch": 0.17440279665954556, "grad_norm": 1.1640625, "learning_rate": 3.80737330517783e-05, "loss": 1.2973606586456299, "step": 449 }, { "epoch": 0.1747912215964265, "grad_norm": 1.1953125, "learning_rate": 3.806402974839295e-05, "loss": 0.9075708389282227, "step": 450 }, { "epoch": 0.17517964653330745, "grad_norm": 1.0078125, "learning_rate": 3.8054303309692695e-05, "loss": 1.3301018476486206, "step": 451 }, { "epoch": 0.17556807147018838, "grad_norm": 1.0625, "learning_rate": 3.804455374813456e-05, "loss": 1.045426607131958, "step": 452 }, { "epoch": 0.17595649640706934, "grad_norm": 1.046875, "learning_rate": 3.803478107620521e-05, "loss": 1.118249535560608, "step": 453 }, { "epoch": 0.17634492134395027, "grad_norm": 1.1953125, "learning_rate": 3.802498530642088e-05, "loss": 0.9328992366790771, "step": 454 }, { "epoch": 0.17673334628083123, "grad_norm": 1.03125, "learning_rate": 3.801516645132739e-05, "loss": 1.4268983602523804, "step": 455 }, { "epoch": 0.17712177121771217, "grad_norm": 0.9765625, "learning_rate": 3.800532452350016e-05, "loss": 1.2285581827163696, "step": 456 }, { "epoch": 0.17751019615459313, "grad_norm": 1.0078125, "learning_rate": 3.799545953554411e-05, "loss": 1.0173109769821167, "step": 457 }, { "epoch": 0.17789862109147409, "grad_norm": 1.015625, "learning_rate": 3.798557150009373e-05, "loss": 0.7954003810882568, "step": 458 }, { "epoch": 0.17828704602835502, "grad_norm": 0.8984375, "learning_rate": 3.797566042981302e-05, "loss": 1.1985822916030884, "step": 459 }, { "epoch": 0.17867547096523598, "grad_norm": 1.0625, "learning_rate": 3.796572633739547e-05, "loss": 1.3913893699645996, "step": 460 }, { "epoch": 0.1790638959021169, "grad_norm": 1.1640625, "learning_rate": 3.7955769235564066e-05, "loss": 0.8872165679931641, "step": 461 }, { "epoch": 0.17945232083899787, "grad_norm": 1.2734375, "learning_rate": 3.7945789137071264e-05, "loss": 1.1226625442504883, "step": 462 }, { "epoch": 0.1798407457758788, "grad_norm": 0.96484375, "learning_rate": 3.793578605469897e-05, "loss": 0.8778141140937805, "step": 463 }, { "epoch": 0.18022917071275976, "grad_norm": 1.0234375, "learning_rate": 3.792576000125852e-05, "loss": 1.0418932437896729, "step": 464 }, { "epoch": 0.1806175956496407, "grad_norm": 0.875, "learning_rate": 3.791571098959068e-05, "loss": 0.9851300716400146, "step": 465 }, { "epoch": 0.18100602058652165, "grad_norm": 1.0625, "learning_rate": 3.790563903256562e-05, "loss": 0.8794447779655457, "step": 466 }, { "epoch": 0.18139444552340261, "grad_norm": 1.15625, "learning_rate": 3.789554414308288e-05, "loss": 1.2089881896972656, "step": 467 }, { "epoch": 0.18178287046028355, "grad_norm": 0.96875, "learning_rate": 3.78854263340714e-05, "loss": 0.9700504541397095, "step": 468 }, { "epoch": 0.1821712953971645, "grad_norm": 1.0625, "learning_rate": 3.787528561848944e-05, "loss": 0.9787839651107788, "step": 469 }, { "epoch": 0.18255972033404544, "grad_norm": 1.0703125, "learning_rate": 3.7865122009324624e-05, "loss": 1.0101159811019897, "step": 470 }, { "epoch": 0.1829481452709264, "grad_norm": 1.140625, "learning_rate": 3.7854935519593874e-05, "loss": 1.5328278541564941, "step": 471 }, { "epoch": 0.18333657020780733, "grad_norm": 1.234375, "learning_rate": 3.784472616234345e-05, "loss": 1.053279161453247, "step": 472 }, { "epoch": 0.1837249951446883, "grad_norm": 0.94921875, "learning_rate": 3.783449395064887e-05, "loss": 1.049187421798706, "step": 473 }, { "epoch": 0.18411342008156922, "grad_norm": 1.0859375, "learning_rate": 3.782423889761492e-05, "loss": 1.0140115022659302, "step": 474 }, { "epoch": 0.18450184501845018, "grad_norm": 1.21875, "learning_rate": 3.7813961016375655e-05, "loss": 1.0346108675003052, "step": 475 }, { "epoch": 0.18489026995533114, "grad_norm": 0.94921875, "learning_rate": 3.780366032009437e-05, "loss": 1.1364030838012695, "step": 476 }, { "epoch": 0.18527869489221208, "grad_norm": 1.0546875, "learning_rate": 3.779333682196357e-05, "loss": 1.478013277053833, "step": 477 }, { "epoch": 0.18566711982909304, "grad_norm": 1.125, "learning_rate": 3.7782990535204964e-05, "loss": 1.3151485919952393, "step": 478 }, { "epoch": 0.18605554476597397, "grad_norm": 1.1640625, "learning_rate": 3.777262147306945e-05, "loss": 1.1799895763397217, "step": 479 }, { "epoch": 0.18644396970285493, "grad_norm": 1.1875, "learning_rate": 3.77622296488371e-05, "loss": 1.2503812313079834, "step": 480 }, { "epoch": 0.18683239463973586, "grad_norm": 0.90625, "learning_rate": 3.775181507581712e-05, "loss": 1.0117619037628174, "step": 481 }, { "epoch": 0.18722081957661682, "grad_norm": 1.234375, "learning_rate": 3.774137776734788e-05, "loss": 1.0683610439300537, "step": 482 }, { "epoch": 0.18760924451349778, "grad_norm": 1.2265625, "learning_rate": 3.7730917736796835e-05, "loss": 1.2491822242736816, "step": 483 }, { "epoch": 0.1879976694503787, "grad_norm": 0.953125, "learning_rate": 3.7720434997560574e-05, "loss": 1.0882766246795654, "step": 484 }, { "epoch": 0.18838609438725967, "grad_norm": 1.1953125, "learning_rate": 3.770992956306474e-05, "loss": 1.6244347095489502, "step": 485 }, { "epoch": 0.1887745193241406, "grad_norm": 0.96484375, "learning_rate": 3.7699401446764075e-05, "loss": 1.1000465154647827, "step": 486 }, { "epoch": 0.18916294426102157, "grad_norm": 1.1640625, "learning_rate": 3.768885066214234e-05, "loss": 0.9936768412590027, "step": 487 }, { "epoch": 0.1895513691979025, "grad_norm": 1.03125, "learning_rate": 3.7678277222712336e-05, "loss": 1.2515525817871094, "step": 488 }, { "epoch": 0.18993979413478346, "grad_norm": 1.1015625, "learning_rate": 3.7667681142015895e-05, "loss": 1.015694499015808, "step": 489 }, { "epoch": 0.1903282190716644, "grad_norm": 1.109375, "learning_rate": 3.7657062433623825e-05, "loss": 1.0090786218643188, "step": 490 }, { "epoch": 0.19071664400854535, "grad_norm": 0.890625, "learning_rate": 3.7646421111135944e-05, "loss": 0.9640659689903259, "step": 491 }, { "epoch": 0.1911050689454263, "grad_norm": 0.96484375, "learning_rate": 3.763575718818099e-05, "loss": 1.034289002418518, "step": 492 }, { "epoch": 0.19149349388230724, "grad_norm": 0.99609375, "learning_rate": 3.762507067841668e-05, "loss": 1.1559734344482422, "step": 493 }, { "epoch": 0.1918819188191882, "grad_norm": 0.87890625, "learning_rate": 3.7614361595529645e-05, "loss": 0.9001410007476807, "step": 494 }, { "epoch": 0.19227034375606913, "grad_norm": 1.0, "learning_rate": 3.7603629953235435e-05, "loss": 0.9449456930160522, "step": 495 }, { "epoch": 0.1926587686929501, "grad_norm": 1.046875, "learning_rate": 3.759287576527849e-05, "loss": 1.1552667617797852, "step": 496 }, { "epoch": 0.19304719362983103, "grad_norm": 1.5, "learning_rate": 3.758209904543211e-05, "loss": 1.051976203918457, "step": 497 }, { "epoch": 0.193435618566712, "grad_norm": 0.9296875, "learning_rate": 3.757129980749847e-05, "loss": 1.156018853187561, "step": 498 }, { "epoch": 0.19382404350359292, "grad_norm": 1.109375, "learning_rate": 3.756047806530858e-05, "loss": 1.1493191719055176, "step": 499 }, { "epoch": 0.19421246844047388, "grad_norm": 1.125, "learning_rate": 3.754963383272228e-05, "loss": 1.0094928741455078, "step": 500 }, { "epoch": 0.19460089337735484, "grad_norm": 1.1875, "learning_rate": 3.753876712362819e-05, "loss": 1.2630305290222168, "step": 501 }, { "epoch": 0.19498931831423577, "grad_norm": 1.171875, "learning_rate": 3.7527877951943745e-05, "loss": 0.8763682246208191, "step": 502 }, { "epoch": 0.19537774325111673, "grad_norm": 1.1484375, "learning_rate": 3.7516966331615134e-05, "loss": 0.962637186050415, "step": 503 }, { "epoch": 0.19576616818799766, "grad_norm": 1.1953125, "learning_rate": 3.7506032276617295e-05, "loss": 1.3083328008651733, "step": 504 }, { "epoch": 0.19615459312487862, "grad_norm": 1.34375, "learning_rate": 3.749507580095391e-05, "loss": 1.3739006519317627, "step": 505 }, { "epoch": 0.19654301806175956, "grad_norm": 1.0625, "learning_rate": 3.748409691865737e-05, "loss": 0.9271696209907532, "step": 506 }, { "epoch": 0.19693144299864052, "grad_norm": 1.03125, "learning_rate": 3.747309564378875e-05, "loss": 1.0311074256896973, "step": 507 }, { "epoch": 0.19731986793552145, "grad_norm": 1.1328125, "learning_rate": 3.746207199043783e-05, "loss": 1.4827593564987183, "step": 508 }, { "epoch": 0.1977082928724024, "grad_norm": 1.046875, "learning_rate": 3.745102597272302e-05, "loss": 0.8514983057975769, "step": 509 }, { "epoch": 0.19809671780928337, "grad_norm": 1.0390625, "learning_rate": 3.7439957604791414e-05, "loss": 1.264042615890503, "step": 510 }, { "epoch": 0.1984851427461643, "grad_norm": 0.94921875, "learning_rate": 3.742886690081869e-05, "loss": 0.8488145470619202, "step": 511 }, { "epoch": 0.19887356768304526, "grad_norm": 0.9609375, "learning_rate": 3.7417753875009156e-05, "loss": 1.0499143600463867, "step": 512 }, { "epoch": 0.1992619926199262, "grad_norm": 1.015625, "learning_rate": 3.7406618541595704e-05, "loss": 0.9363781213760376, "step": 513 }, { "epoch": 0.19965041755680715, "grad_norm": 0.9453125, "learning_rate": 3.7395460914839787e-05, "loss": 0.883247971534729, "step": 514 }, { "epoch": 0.20003884249368808, "grad_norm": 2.0, "learning_rate": 3.738428100903142e-05, "loss": 1.294025182723999, "step": 515 }, { "epoch": 0.20042726743056904, "grad_norm": 0.98828125, "learning_rate": 3.737307883848916e-05, "loss": 1.136923909187317, "step": 516 }, { "epoch": 0.20081569236744998, "grad_norm": 1.0859375, "learning_rate": 3.736185441756005e-05, "loss": 0.9141234159469604, "step": 517 }, { "epoch": 0.20120411730433094, "grad_norm": 1.140625, "learning_rate": 3.7350607760619656e-05, "loss": 1.0500640869140625, "step": 518 }, { "epoch": 0.2015925422412119, "grad_norm": 1.1796875, "learning_rate": 3.7339338882072024e-05, "loss": 0.9898691177368164, "step": 519 }, { "epoch": 0.20198096717809283, "grad_norm": 1.2890625, "learning_rate": 3.732804779634964e-05, "loss": 1.0182572603225708, "step": 520 }, { "epoch": 0.2023693921149738, "grad_norm": 1.0234375, "learning_rate": 3.731673451791344e-05, "loss": 1.2103452682495117, "step": 521 }, { "epoch": 0.20275781705185472, "grad_norm": 1.0234375, "learning_rate": 3.7305399061252795e-05, "loss": 1.1579413414001465, "step": 522 }, { "epoch": 0.20314624198873568, "grad_norm": 1.078125, "learning_rate": 3.729404144088547e-05, "loss": 1.05728280544281, "step": 523 }, { "epoch": 0.20353466692561661, "grad_norm": 1.0078125, "learning_rate": 3.728266167135761e-05, "loss": 1.1875461339950562, "step": 524 }, { "epoch": 0.20392309186249757, "grad_norm": 1.0234375, "learning_rate": 3.727125976724375e-05, "loss": 1.0130983591079712, "step": 525 }, { "epoch": 0.2043115167993785, "grad_norm": 0.95703125, "learning_rate": 3.7259835743146736e-05, "loss": 0.9817225337028503, "step": 526 }, { "epoch": 0.20469994173625947, "grad_norm": 1.0234375, "learning_rate": 3.7248389613697777e-05, "loss": 0.8963401913642883, "step": 527 }, { "epoch": 0.20508836667314043, "grad_norm": 1.109375, "learning_rate": 3.723692139355638e-05, "loss": 1.0017890930175781, "step": 528 }, { "epoch": 0.20547679161002136, "grad_norm": 1.015625, "learning_rate": 3.722543109741036e-05, "loss": 0.789158046245575, "step": 529 }, { "epoch": 0.20586521654690232, "grad_norm": 1.0234375, "learning_rate": 3.721391873997577e-05, "loss": 1.186380386352539, "step": 530 }, { "epoch": 0.20625364148378325, "grad_norm": 1.2109375, "learning_rate": 3.7202384335996955e-05, "loss": 0.9675403833389282, "step": 531 }, { "epoch": 0.2066420664206642, "grad_norm": 1.203125, "learning_rate": 3.7190827900246474e-05, "loss": 0.9114765524864197, "step": 532 }, { "epoch": 0.20703049135754514, "grad_norm": 1.171875, "learning_rate": 3.717924944752511e-05, "loss": 0.87519770860672, "step": 533 }, { "epoch": 0.2074189162944261, "grad_norm": 1.0703125, "learning_rate": 3.716764899266185e-05, "loss": 1.1204779148101807, "step": 534 }, { "epoch": 0.20780734123130706, "grad_norm": 1.0390625, "learning_rate": 3.715602655051385e-05, "loss": 1.0442743301391602, "step": 535 }, { "epoch": 0.208195766168188, "grad_norm": 1.0546875, "learning_rate": 3.7144382135966434e-05, "loss": 0.8404509425163269, "step": 536 }, { "epoch": 0.20858419110506896, "grad_norm": 1.140625, "learning_rate": 3.7132715763933044e-05, "loss": 1.1538641452789307, "step": 537 }, { "epoch": 0.2089726160419499, "grad_norm": 1.0625, "learning_rate": 3.712102744935529e-05, "loss": 1.1673226356506348, "step": 538 }, { "epoch": 0.20936104097883085, "grad_norm": 1.1953125, "learning_rate": 3.710931720720284e-05, "loss": 1.043501615524292, "step": 539 }, { "epoch": 0.20974946591571178, "grad_norm": 1.140625, "learning_rate": 3.709758505247346e-05, "loss": 0.9944994449615479, "step": 540 }, { "epoch": 0.21013789085259274, "grad_norm": 1.0859375, "learning_rate": 3.7085831000193e-05, "loss": 0.8966817259788513, "step": 541 }, { "epoch": 0.21052631578947367, "grad_norm": 1.109375, "learning_rate": 3.707405506541532e-05, "loss": 1.0579668283462524, "step": 542 }, { "epoch": 0.21091474072635463, "grad_norm": 1.0, "learning_rate": 3.7062257263222334e-05, "loss": 0.893793523311615, "step": 543 }, { "epoch": 0.2113031656632356, "grad_norm": 1.140625, "learning_rate": 3.7050437608723956e-05, "loss": 1.0280640125274658, "step": 544 }, { "epoch": 0.21169159060011652, "grad_norm": 1.0546875, "learning_rate": 3.7038596117058067e-05, "loss": 1.262276530265808, "step": 545 }, { "epoch": 0.21208001553699748, "grad_norm": 1.2578125, "learning_rate": 3.7026732803390554e-05, "loss": 1.115488886833191, "step": 546 }, { "epoch": 0.21246844047387842, "grad_norm": 1.1953125, "learning_rate": 3.701484768291521e-05, "loss": 1.1753761768341064, "step": 547 }, { "epoch": 0.21285686541075938, "grad_norm": 1.0, "learning_rate": 3.7002940770853786e-05, "loss": 1.1671571731567383, "step": 548 }, { "epoch": 0.2132452903476403, "grad_norm": 1.1796875, "learning_rate": 3.699101208245593e-05, "loss": 1.050545334815979, "step": 549 }, { "epoch": 0.21363371528452127, "grad_norm": 0.94921875, "learning_rate": 3.6979061632999193e-05, "loss": 0.9829149842262268, "step": 550 }, { "epoch": 0.2140221402214022, "grad_norm": 1.1015625, "learning_rate": 3.696708943778898e-05, "loss": 1.1237698793411255, "step": 551 }, { "epoch": 0.21441056515828316, "grad_norm": 1.015625, "learning_rate": 3.6955095512158554e-05, "loss": 1.0884979963302612, "step": 552 }, { "epoch": 0.21479899009516412, "grad_norm": 1.046875, "learning_rate": 3.6943079871469005e-05, "loss": 0.9240511059761047, "step": 553 }, { "epoch": 0.21518741503204505, "grad_norm": 1.1953125, "learning_rate": 3.6931042531109246e-05, "loss": 1.228102445602417, "step": 554 }, { "epoch": 0.21557583996892601, "grad_norm": 1.1328125, "learning_rate": 3.6918983506495965e-05, "loss": 1.3352986574172974, "step": 555 }, { "epoch": 0.21596426490580695, "grad_norm": 1.0625, "learning_rate": 3.690690281307364e-05, "loss": 1.080653190612793, "step": 556 }, { "epoch": 0.2163526898426879, "grad_norm": 1.1328125, "learning_rate": 3.689480046631447e-05, "loss": 1.1221870183944702, "step": 557 }, { "epoch": 0.21674111477956884, "grad_norm": 1.1171875, "learning_rate": 3.688267648171843e-05, "loss": 1.2126820087432861, "step": 558 }, { "epoch": 0.2171295397164498, "grad_norm": 1.25, "learning_rate": 3.687053087481318e-05, "loss": 0.9629032015800476, "step": 559 }, { "epoch": 0.21751796465333073, "grad_norm": 1.078125, "learning_rate": 3.6858363661154066e-05, "loss": 1.0099022388458252, "step": 560 }, { "epoch": 0.2179063895902117, "grad_norm": 1.265625, "learning_rate": 3.6846174856324135e-05, "loss": 1.2953517436981201, "step": 561 }, { "epoch": 0.21829481452709265, "grad_norm": 0.953125, "learning_rate": 3.683396447593406e-05, "loss": 0.9897211194038391, "step": 562 }, { "epoch": 0.21868323946397358, "grad_norm": 1.15625, "learning_rate": 3.682173253562216e-05, "loss": 0.9984757304191589, "step": 563 }, { "epoch": 0.21907166440085454, "grad_norm": 1.1015625, "learning_rate": 3.680947905105435e-05, "loss": 0.8959919214248657, "step": 564 }, { "epoch": 0.21946008933773548, "grad_norm": 1.0703125, "learning_rate": 3.679720403792417e-05, "loss": 1.0797672271728516, "step": 565 }, { "epoch": 0.21984851427461644, "grad_norm": 1.1875, "learning_rate": 3.67849075119527e-05, "loss": 1.1887917518615723, "step": 566 }, { "epoch": 0.22023693921149737, "grad_norm": 1.0078125, "learning_rate": 3.67725894888886e-05, "loss": 1.1888139247894287, "step": 567 }, { "epoch": 0.22062536414837833, "grad_norm": 1.0625, "learning_rate": 3.676024998450802e-05, "loss": 1.2955310344696045, "step": 568 }, { "epoch": 0.22101378908525926, "grad_norm": 1.0546875, "learning_rate": 3.674788901461468e-05, "loss": 1.1094638109207153, "step": 569 }, { "epoch": 0.22140221402214022, "grad_norm": 1.0859375, "learning_rate": 3.673550659503975e-05, "loss": 0.9707934856414795, "step": 570 }, { "epoch": 0.22179063895902118, "grad_norm": 0.99609375, "learning_rate": 3.672310274164188e-05, "loss": 1.1504703760147095, "step": 571 }, { "epoch": 0.2221790638959021, "grad_norm": 1.0703125, "learning_rate": 3.6710677470307174e-05, "loss": 1.0779370069503784, "step": 572 }, { "epoch": 0.22256748883278307, "grad_norm": 0.90625, "learning_rate": 3.6698230796949166e-05, "loss": 1.0991334915161133, "step": 573 }, { "epoch": 0.222955913769664, "grad_norm": 1.1171875, "learning_rate": 3.6685762737508805e-05, "loss": 1.1198008060455322, "step": 574 }, { "epoch": 0.22334433870654496, "grad_norm": 1.0546875, "learning_rate": 3.6673273307954426e-05, "loss": 0.9456967711448669, "step": 575 }, { "epoch": 0.2237327636434259, "grad_norm": 1.0234375, "learning_rate": 3.666076252428173e-05, "loss": 1.1340779066085815, "step": 576 }, { "epoch": 0.22412118858030686, "grad_norm": 1.109375, "learning_rate": 3.664823040251378e-05, "loss": 1.0145490169525146, "step": 577 }, { "epoch": 0.22450961351718782, "grad_norm": 1.0546875, "learning_rate": 3.6635676958700946e-05, "loss": 1.157453179359436, "step": 578 }, { "epoch": 0.22489803845406875, "grad_norm": 0.95703125, "learning_rate": 3.6623102208920924e-05, "loss": 1.1519958972930908, "step": 579 }, { "epoch": 0.2252864633909497, "grad_norm": 1.328125, "learning_rate": 3.661050616927869e-05, "loss": 1.149026870727539, "step": 580 }, { "epoch": 0.22567488832783064, "grad_norm": 1.0078125, "learning_rate": 3.659788885590649e-05, "loss": 1.1133066415786743, "step": 581 }, { "epoch": 0.2260633132647116, "grad_norm": 1.2109375, "learning_rate": 3.658525028496382e-05, "loss": 1.1862083673477173, "step": 582 }, { "epoch": 0.22645173820159253, "grad_norm": 1.0625, "learning_rate": 3.657259047263738e-05, "loss": 1.036561369895935, "step": 583 }, { "epoch": 0.2268401631384735, "grad_norm": 1.015625, "learning_rate": 3.6559909435141106e-05, "loss": 0.9766876101493835, "step": 584 }, { "epoch": 0.22722858807535443, "grad_norm": 1.0390625, "learning_rate": 3.6547207188716095e-05, "loss": 1.6319947242736816, "step": 585 }, { "epoch": 0.2276170130122354, "grad_norm": 1.2265625, "learning_rate": 3.6534483749630624e-05, "loss": 1.153091549873352, "step": 586 }, { "epoch": 0.22800543794911635, "grad_norm": 1.1484375, "learning_rate": 3.6521739134180095e-05, "loss": 1.2657543420791626, "step": 587 }, { "epoch": 0.22839386288599728, "grad_norm": 1.046875, "learning_rate": 3.650897335868704e-05, "loss": 1.0386921167373657, "step": 588 }, { "epoch": 0.22878228782287824, "grad_norm": 1.2890625, "learning_rate": 3.649618643950109e-05, "loss": 0.9634569883346558, "step": 589 }, { "epoch": 0.22917071275975917, "grad_norm": 1.0703125, "learning_rate": 3.648337839299895e-05, "loss": 1.4374767541885376, "step": 590 }, { "epoch": 0.22955913769664013, "grad_norm": 1.0234375, "learning_rate": 3.647054923558441e-05, "loss": 0.9629542827606201, "step": 591 }, { "epoch": 0.22994756263352106, "grad_norm": 1.1640625, "learning_rate": 3.645769898368826e-05, "loss": 1.1193279027938843, "step": 592 }, { "epoch": 0.23033598757040202, "grad_norm": 1.1328125, "learning_rate": 3.644482765376833e-05, "loss": 1.016859531402588, "step": 593 }, { "epoch": 0.23072441250728296, "grad_norm": 0.953125, "learning_rate": 3.6431935262309436e-05, "loss": 1.0564347505569458, "step": 594 }, { "epoch": 0.23111283744416392, "grad_norm": 1.046875, "learning_rate": 3.641902182582338e-05, "loss": 1.0572593212127686, "step": 595 }, { "epoch": 0.23150126238104488, "grad_norm": 1.0703125, "learning_rate": 3.64060873608489e-05, "loss": 1.15488600730896, "step": 596 }, { "epoch": 0.2318896873179258, "grad_norm": 1.0234375, "learning_rate": 3.6393131883951685e-05, "loss": 1.0544419288635254, "step": 597 }, { "epoch": 0.23227811225480677, "grad_norm": 1.09375, "learning_rate": 3.638015541172431e-05, "loss": 1.0406293869018555, "step": 598 }, { "epoch": 0.2326665371916877, "grad_norm": 1.0546875, "learning_rate": 3.636715796078627e-05, "loss": 1.3004015684127808, "step": 599 }, { "epoch": 0.23305496212856866, "grad_norm": 0.9375, "learning_rate": 3.635413954778391e-05, "loss": 1.0613385438919067, "step": 600 }, { "epoch": 0.2334433870654496, "grad_norm": 1.109375, "learning_rate": 3.634110018939041e-05, "loss": 1.0500413179397583, "step": 601 }, { "epoch": 0.23383181200233055, "grad_norm": 1.1015625, "learning_rate": 3.6328039902305806e-05, "loss": 1.4993857145309448, "step": 602 }, { "epoch": 0.23422023693921148, "grad_norm": 1.015625, "learning_rate": 3.6314958703256916e-05, "loss": 1.0509413480758667, "step": 603 }, { "epoch": 0.23460866187609244, "grad_norm": 0.96484375, "learning_rate": 3.6301856608997355e-05, "loss": 0.9018488526344299, "step": 604 }, { "epoch": 0.2349970868129734, "grad_norm": 1.2265625, "learning_rate": 3.628873363630748e-05, "loss": 1.230818748474121, "step": 605 }, { "epoch": 0.23538551174985434, "grad_norm": 1.0859375, "learning_rate": 3.627558980199441e-05, "loss": 0.9632076025009155, "step": 606 }, { "epoch": 0.2357739366867353, "grad_norm": 1.1015625, "learning_rate": 3.626242512289196e-05, "loss": 1.0122417211532593, "step": 607 }, { "epoch": 0.23616236162361623, "grad_norm": 1.2421875, "learning_rate": 3.624923961586066e-05, "loss": 1.132332444190979, "step": 608 }, { "epoch": 0.2365507865604972, "grad_norm": 1.0859375, "learning_rate": 3.623603329778771e-05, "loss": 1.288146734237671, "step": 609 }, { "epoch": 0.23693921149737812, "grad_norm": 1.140625, "learning_rate": 3.622280618558696e-05, "loss": 1.2055113315582275, "step": 610 }, { "epoch": 0.23732763643425908, "grad_norm": 1.125, "learning_rate": 3.62095582961989e-05, "loss": 1.0556122064590454, "step": 611 }, { "epoch": 0.23771606137114001, "grad_norm": 1.015625, "learning_rate": 3.619628964659061e-05, "loss": 1.2922437191009521, "step": 612 }, { "epoch": 0.23810448630802097, "grad_norm": 0.97265625, "learning_rate": 3.6183000253755786e-05, "loss": 1.0363426208496094, "step": 613 }, { "epoch": 0.23849291124490193, "grad_norm": 1.0625, "learning_rate": 3.616969013471467e-05, "loss": 1.0232594013214111, "step": 614 }, { "epoch": 0.23888133618178287, "grad_norm": 1.203125, "learning_rate": 3.6156359306514064e-05, "loss": 1.3372410535812378, "step": 615 }, { "epoch": 0.23926976111866383, "grad_norm": 1.296875, "learning_rate": 3.614300778622727e-05, "loss": 1.0787200927734375, "step": 616 }, { "epoch": 0.23965818605554476, "grad_norm": 1.0859375, "learning_rate": 3.612963559095413e-05, "loss": 0.7697679996490479, "step": 617 }, { "epoch": 0.24004661099242572, "grad_norm": 1.0703125, "learning_rate": 3.611624273782092e-05, "loss": 1.206201195716858, "step": 618 }, { "epoch": 0.24043503592930665, "grad_norm": 1.1796875, "learning_rate": 3.6102829243980395e-05, "loss": 1.2077409029006958, "step": 619 }, { "epoch": 0.2408234608661876, "grad_norm": 1.3046875, "learning_rate": 3.6089395126611754e-05, "loss": 1.0691050291061401, "step": 620 }, { "epoch": 0.24121188580306857, "grad_norm": 1.03125, "learning_rate": 3.6075940402920604e-05, "loss": 0.8692009449005127, "step": 621 }, { "epoch": 0.2416003107399495, "grad_norm": 1.265625, "learning_rate": 3.6062465090138936e-05, "loss": 1.1179394721984863, "step": 622 }, { "epoch": 0.24198873567683046, "grad_norm": 1.1171875, "learning_rate": 3.604896920552511e-05, "loss": 1.0165318250656128, "step": 623 }, { "epoch": 0.2423771606137114, "grad_norm": 1.1640625, "learning_rate": 3.6035452766363846e-05, "loss": 0.9038753509521484, "step": 624 }, { "epoch": 0.24276558555059236, "grad_norm": 1.0703125, "learning_rate": 3.602191578996617e-05, "loss": 1.239443063735962, "step": 625 }, { "epoch": 0.2431540104874733, "grad_norm": 1.09375, "learning_rate": 3.6008358293669424e-05, "loss": 1.0732313394546509, "step": 626 }, { "epoch": 0.24354243542435425, "grad_norm": 1.1875, "learning_rate": 3.5994780294837236e-05, "loss": 0.9213586449623108, "step": 627 }, { "epoch": 0.24393086036123518, "grad_norm": 1.1015625, "learning_rate": 3.598118181085948e-05, "loss": 1.2002776861190796, "step": 628 }, { "epoch": 0.24431928529811614, "grad_norm": 1.03125, "learning_rate": 3.596756285915227e-05, "loss": 1.156685471534729, "step": 629 }, { "epoch": 0.2447077102349971, "grad_norm": 1.0625, "learning_rate": 3.595392345715793e-05, "loss": 0.792778491973877, "step": 630 }, { "epoch": 0.24509613517187803, "grad_norm": 1.1640625, "learning_rate": 3.5940263622344974e-05, "loss": 1.0988359451293945, "step": 631 }, { "epoch": 0.245484560108759, "grad_norm": 0.921875, "learning_rate": 3.5926583372208106e-05, "loss": 0.9336706399917603, "step": 632 }, { "epoch": 0.24587298504563992, "grad_norm": 1.0234375, "learning_rate": 3.591288272426816e-05, "loss": 0.8446120619773865, "step": 633 }, { "epoch": 0.24626140998252088, "grad_norm": 1.3203125, "learning_rate": 3.589916169607209e-05, "loss": 0.9841316342353821, "step": 634 }, { "epoch": 0.24664983491940182, "grad_norm": 1.1953125, "learning_rate": 3.588542030519296e-05, "loss": 1.2826591730117798, "step": 635 }, { "epoch": 0.24703825985628278, "grad_norm": 1.296875, "learning_rate": 3.58716585692299e-05, "loss": 0.9190275073051453, "step": 636 }, { "epoch": 0.2474266847931637, "grad_norm": 1.0703125, "learning_rate": 3.5857876505808125e-05, "loss": 0.9876718521118164, "step": 637 }, { "epoch": 0.24781510973004467, "grad_norm": 1.3984375, "learning_rate": 3.5844074132578864e-05, "loss": 1.0019081830978394, "step": 638 }, { "epoch": 0.24820353466692563, "grad_norm": 1.03125, "learning_rate": 3.583025146721934e-05, "loss": 1.2911444902420044, "step": 639 }, { "epoch": 0.24859195960380656, "grad_norm": 1.421875, "learning_rate": 3.581640852743282e-05, "loss": 1.039469599723816, "step": 640 }, { "epoch": 0.24898038454068752, "grad_norm": 1.0625, "learning_rate": 3.5802545330948476e-05, "loss": 0.7957627177238464, "step": 641 }, { "epoch": 0.24936880947756845, "grad_norm": 1.1015625, "learning_rate": 3.5788661895521455e-05, "loss": 0.9870870113372803, "step": 642 }, { "epoch": 0.2497572344144494, "grad_norm": 1.1171875, "learning_rate": 3.577475823893282e-05, "loss": 0.9755269289016724, "step": 643 }, { "epoch": 0.2501456593513304, "grad_norm": 1.328125, "learning_rate": 3.576083437898954e-05, "loss": 1.177359938621521, "step": 644 }, { "epoch": 0.2505340842882113, "grad_norm": 1.0703125, "learning_rate": 3.574689033352445e-05, "loss": 0.935440719127655, "step": 645 }, { "epoch": 0.25092250922509224, "grad_norm": 0.984375, "learning_rate": 3.573292612039623e-05, "loss": 1.0530145168304443, "step": 646 }, { "epoch": 0.2513109341619732, "grad_norm": 1.140625, "learning_rate": 3.571894175748941e-05, "loss": 1.2227333784103394, "step": 647 }, { "epoch": 0.25169935909885416, "grad_norm": 1.0625, "learning_rate": 3.5704937262714294e-05, "loss": 1.0682541131973267, "step": 648 }, { "epoch": 0.2520877840357351, "grad_norm": 1.109375, "learning_rate": 3.5690912654007014e-05, "loss": 1.132361888885498, "step": 649 }, { "epoch": 0.252476208972616, "grad_norm": 0.9375, "learning_rate": 3.567686794932943e-05, "loss": 1.0568245649337769, "step": 650 }, { "epoch": 0.252864633909497, "grad_norm": 1.109375, "learning_rate": 3.566280316666914e-05, "loss": 0.9279691576957703, "step": 651 }, { "epoch": 0.25325305884637794, "grad_norm": 1.390625, "learning_rate": 3.564871832403948e-05, "loss": 1.1678316593170166, "step": 652 }, { "epoch": 0.2536414837832589, "grad_norm": 1.140625, "learning_rate": 3.5634613439479455e-05, "loss": 0.9210705161094666, "step": 653 }, { "epoch": 0.2540299087201398, "grad_norm": 0.99609375, "learning_rate": 3.5620488531053747e-05, "loss": 0.9490594863891602, "step": 654 }, { "epoch": 0.25441833365702077, "grad_norm": 1.1640625, "learning_rate": 3.560634361685269e-05, "loss": 0.9598304033279419, "step": 655 }, { "epoch": 0.2548067585939017, "grad_norm": 1.28125, "learning_rate": 3.559217871499222e-05, "loss": 1.162004828453064, "step": 656 }, { "epoch": 0.2551951835307827, "grad_norm": 1.28125, "learning_rate": 3.5577993843613893e-05, "loss": 1.4163539409637451, "step": 657 }, { "epoch": 0.25558360846766365, "grad_norm": 1.1015625, "learning_rate": 3.556378902088484e-05, "loss": 1.2913762331008911, "step": 658 }, { "epoch": 0.25597203340454455, "grad_norm": 1.15625, "learning_rate": 3.5549564264997724e-05, "loss": 1.12490975856781, "step": 659 }, { "epoch": 0.2563604583414255, "grad_norm": 1.0390625, "learning_rate": 3.553531959417075e-05, "loss": 0.8314489722251892, "step": 660 }, { "epoch": 0.2567488832783065, "grad_norm": 1.03125, "learning_rate": 3.5521055026647655e-05, "loss": 0.8795455098152161, "step": 661 }, { "epoch": 0.25713730821518743, "grad_norm": 1.015625, "learning_rate": 3.55067705806976e-05, "loss": 0.8842142820358276, "step": 662 }, { "epoch": 0.25752573315206834, "grad_norm": 0.921875, "learning_rate": 3.549246627461525e-05, "loss": 0.9562962055206299, "step": 663 }, { "epoch": 0.2579141580889493, "grad_norm": 1.078125, "learning_rate": 3.54781421267207e-05, "loss": 0.977018415927887, "step": 664 }, { "epoch": 0.25830258302583026, "grad_norm": 1.0859375, "learning_rate": 3.546379815535945e-05, "loss": 1.1258656978607178, "step": 665 }, { "epoch": 0.2586910079627112, "grad_norm": 1.5078125, "learning_rate": 3.544943437890238e-05, "loss": 1.1370604038238525, "step": 666 }, { "epoch": 0.2590794328995922, "grad_norm": 0.9453125, "learning_rate": 3.543505081574575e-05, "loss": 0.8206220269203186, "step": 667 }, { "epoch": 0.2594678578364731, "grad_norm": 1.21875, "learning_rate": 3.542064748431116e-05, "loss": 1.0522335767745972, "step": 668 }, { "epoch": 0.25985628277335404, "grad_norm": 1.0859375, "learning_rate": 3.5406224403045516e-05, "loss": 1.1838871240615845, "step": 669 }, { "epoch": 0.260244707710235, "grad_norm": 1.078125, "learning_rate": 3.539178159042103e-05, "loss": 1.010019063949585, "step": 670 }, { "epoch": 0.26063313264711596, "grad_norm": 1.03125, "learning_rate": 3.5377319064935195e-05, "loss": 1.0651133060455322, "step": 671 }, { "epoch": 0.26102155758399687, "grad_norm": 1.0390625, "learning_rate": 3.5362836845110716e-05, "loss": 0.8615675568580627, "step": 672 }, { "epoch": 0.2614099825208778, "grad_norm": 0.9609375, "learning_rate": 3.5348334949495556e-05, "loss": 1.2837333679199219, "step": 673 }, { "epoch": 0.2617984074577588, "grad_norm": 0.95703125, "learning_rate": 3.533381339666285e-05, "loss": 0.9200749397277832, "step": 674 }, { "epoch": 0.26218683239463975, "grad_norm": 1.078125, "learning_rate": 3.5319272205210946e-05, "loss": 1.012538194656372, "step": 675 }, { "epoch": 0.2625752573315207, "grad_norm": 1.2265625, "learning_rate": 3.530471139376331e-05, "loss": 1.0498899221420288, "step": 676 }, { "epoch": 0.2629636822684016, "grad_norm": 1.1171875, "learning_rate": 3.529013098096853e-05, "loss": 1.1642321348190308, "step": 677 }, { "epoch": 0.26335210720528257, "grad_norm": 0.99609375, "learning_rate": 3.5275530985500344e-05, "loss": 1.19815993309021, "step": 678 }, { "epoch": 0.26374053214216353, "grad_norm": 1.0546875, "learning_rate": 3.526091142605752e-05, "loss": 1.0988470315933228, "step": 679 }, { "epoch": 0.2641289570790445, "grad_norm": 1.21875, "learning_rate": 3.5246272321363926e-05, "loss": 1.1635648012161255, "step": 680 }, { "epoch": 0.2645173820159254, "grad_norm": 1.2109375, "learning_rate": 3.523161369016842e-05, "loss": 1.2155308723449707, "step": 681 }, { "epoch": 0.26490580695280636, "grad_norm": 1.1171875, "learning_rate": 3.5216935551244896e-05, "loss": 0.7147628664970398, "step": 682 }, { "epoch": 0.2652942318896873, "grad_norm": 1.1484375, "learning_rate": 3.520223792339222e-05, "loss": 1.089343547821045, "step": 683 }, { "epoch": 0.2656826568265683, "grad_norm": 1.1171875, "learning_rate": 3.5187520825434233e-05, "loss": 1.109845757484436, "step": 684 }, { "epoch": 0.26607108176344924, "grad_norm": 1.15625, "learning_rate": 3.51727842762197e-05, "loss": 1.068721890449524, "step": 685 }, { "epoch": 0.26645950670033014, "grad_norm": 1.046875, "learning_rate": 3.51580282946223e-05, "loss": 1.0747637748718262, "step": 686 }, { "epoch": 0.2668479316372111, "grad_norm": 1.0, "learning_rate": 3.514325289954059e-05, "loss": 0.9964104890823364, "step": 687 }, { "epoch": 0.26723635657409206, "grad_norm": 1.15625, "learning_rate": 3.512845810989801e-05, "loss": 0.9367272257804871, "step": 688 }, { "epoch": 0.267624781510973, "grad_norm": 1.3046875, "learning_rate": 3.5113643944642835e-05, "loss": 1.0798001289367676, "step": 689 }, { "epoch": 0.268013206447854, "grad_norm": 1.265625, "learning_rate": 3.5098810422748144e-05, "loss": 1.3756492137908936, "step": 690 }, { "epoch": 0.2684016313847349, "grad_norm": 1.1484375, "learning_rate": 3.508395756321181e-05, "loss": 1.0519465208053589, "step": 691 }, { "epoch": 0.26879005632161584, "grad_norm": 1.015625, "learning_rate": 3.506908538505648e-05, "loss": 1.118362545967102, "step": 692 }, { "epoch": 0.2691784812584968, "grad_norm": 1.140625, "learning_rate": 3.505419390732954e-05, "loss": 0.9734751582145691, "step": 693 }, { "epoch": 0.26956690619537776, "grad_norm": 1.21875, "learning_rate": 3.5039283149103096e-05, "loss": 0.9823213815689087, "step": 694 }, { "epoch": 0.26995533113225867, "grad_norm": 1.0859375, "learning_rate": 3.502435312947393e-05, "loss": 0.8510763049125671, "step": 695 }, { "epoch": 0.27034375606913963, "grad_norm": 1.125, "learning_rate": 3.500940386756352e-05, "loss": 0.9463381171226501, "step": 696 }, { "epoch": 0.2707321810060206, "grad_norm": 1.1015625, "learning_rate": 3.4994435382517965e-05, "loss": 0.9031418561935425, "step": 697 }, { "epoch": 0.27112060594290155, "grad_norm": 1.109375, "learning_rate": 3.4979447693508e-05, "loss": 1.0440632104873657, "step": 698 }, { "epoch": 0.2715090308797825, "grad_norm": 1.1640625, "learning_rate": 3.4964440819728965e-05, "loss": 1.1765331029891968, "step": 699 }, { "epoch": 0.2718974558166634, "grad_norm": 1.109375, "learning_rate": 3.494941478040072e-05, "loss": 1.08942711353302, "step": 700 }, { "epoch": 0.2722858807535444, "grad_norm": 1.0546875, "learning_rate": 3.493436959476773e-05, "loss": 1.0721678733825684, "step": 701 }, { "epoch": 0.27267430569042533, "grad_norm": 1.21875, "learning_rate": 3.4919305282098946e-05, "loss": 1.1134552955627441, "step": 702 }, { "epoch": 0.2730627306273063, "grad_norm": 1.0390625, "learning_rate": 3.490422186168784e-05, "loss": 0.9920719265937805, "step": 703 }, { "epoch": 0.2734511555641872, "grad_norm": 1.1484375, "learning_rate": 3.4889119352852326e-05, "loss": 1.2347557544708252, "step": 704 }, { "epoch": 0.27383958050106816, "grad_norm": 1.2265625, "learning_rate": 3.4873997774934794e-05, "loss": 1.228805422782898, "step": 705 }, { "epoch": 0.2742280054379491, "grad_norm": 0.93359375, "learning_rate": 3.485885714730205e-05, "loss": 1.2464430332183838, "step": 706 }, { "epoch": 0.2746164303748301, "grad_norm": 1.15625, "learning_rate": 3.484369748934528e-05, "loss": 1.1135258674621582, "step": 707 }, { "epoch": 0.27500485531171104, "grad_norm": 1.203125, "learning_rate": 3.482851882048005e-05, "loss": 1.2803081274032593, "step": 708 }, { "epoch": 0.27539328024859194, "grad_norm": 1.0703125, "learning_rate": 3.48133211601463e-05, "loss": 1.1727495193481445, "step": 709 }, { "epoch": 0.2757817051854729, "grad_norm": 1.203125, "learning_rate": 3.479810452780826e-05, "loss": 0.9062326550483704, "step": 710 }, { "epoch": 0.27617013012235386, "grad_norm": 1.0, "learning_rate": 3.4782868942954476e-05, "loss": 0.8954911828041077, "step": 711 }, { "epoch": 0.2765585550592348, "grad_norm": 1.0, "learning_rate": 3.476761442509776e-05, "loss": 1.1940964460372925, "step": 712 }, { "epoch": 0.2769469799961157, "grad_norm": 1.1875, "learning_rate": 3.475234099377517e-05, "loss": 1.3007962703704834, "step": 713 }, { "epoch": 0.2773354049329967, "grad_norm": 1.0078125, "learning_rate": 3.4737048668547995e-05, "loss": 1.1502760648727417, "step": 714 }, { "epoch": 0.27772382986987765, "grad_norm": 1.1796875, "learning_rate": 3.472173746900172e-05, "loss": 1.0041171312332153, "step": 715 }, { "epoch": 0.2781122548067586, "grad_norm": 1.1953125, "learning_rate": 3.4706407414746e-05, "loss": 1.174515724182129, "step": 716 }, { "epoch": 0.27850067974363957, "grad_norm": 1.3046875, "learning_rate": 3.469105852541464e-05, "loss": 0.9085159301757812, "step": 717 }, { "epoch": 0.27888910468052047, "grad_norm": 1.0859375, "learning_rate": 3.467569082066557e-05, "loss": 1.178394079208374, "step": 718 }, { "epoch": 0.27927752961740143, "grad_norm": 3.015625, "learning_rate": 3.4660304320180814e-05, "loss": 1.0660459995269775, "step": 719 }, { "epoch": 0.2796659545542824, "grad_norm": 1.140625, "learning_rate": 3.464489904366646e-05, "loss": 0.8839026689529419, "step": 720 }, { "epoch": 0.28005437949116335, "grad_norm": 1.171875, "learning_rate": 3.462947501085267e-05, "loss": 1.0874050855636597, "step": 721 }, { "epoch": 0.28044280442804426, "grad_norm": 1.359375, "learning_rate": 3.46140322414936e-05, "loss": 1.0143588781356812, "step": 722 }, { "epoch": 0.2808312293649252, "grad_norm": 0.97265625, "learning_rate": 3.459857075536741e-05, "loss": 1.1248044967651367, "step": 723 }, { "epoch": 0.2812196543018062, "grad_norm": 1.1953125, "learning_rate": 3.458309057227625e-05, "loss": 1.0262174606323242, "step": 724 }, { "epoch": 0.28160807923868714, "grad_norm": 1.1796875, "learning_rate": 3.4567591712046185e-05, "loss": 1.1166930198669434, "step": 725 }, { "epoch": 0.2819965041755681, "grad_norm": 1.0234375, "learning_rate": 3.455207419452722e-05, "loss": 1.3346822261810303, "step": 726 }, { "epoch": 0.282384929112449, "grad_norm": 0.953125, "learning_rate": 3.453653803959326e-05, "loss": 0.9624277353286743, "step": 727 }, { "epoch": 0.28277335404932996, "grad_norm": 1.09375, "learning_rate": 3.4520983267142066e-05, "loss": 1.2167584896087646, "step": 728 }, { "epoch": 0.2831617789862109, "grad_norm": 1.046875, "learning_rate": 3.450540989709524e-05, "loss": 1.2402032613754272, "step": 729 }, { "epoch": 0.2835502039230919, "grad_norm": 1.125, "learning_rate": 3.4489817949398224e-05, "loss": 0.9929094314575195, "step": 730 }, { "epoch": 0.2839386288599728, "grad_norm": 1.015625, "learning_rate": 3.447420744402022e-05, "loss": 1.0042052268981934, "step": 731 }, { "epoch": 0.28432705379685375, "grad_norm": 1.03125, "learning_rate": 3.445857840095425e-05, "loss": 0.9718262553215027, "step": 732 }, { "epoch": 0.2847154787337347, "grad_norm": 1.28125, "learning_rate": 3.444293084021701e-05, "loss": 1.2374653816223145, "step": 733 }, { "epoch": 0.28510390367061567, "grad_norm": 1.0546875, "learning_rate": 3.4427264781848974e-05, "loss": 1.1435883045196533, "step": 734 }, { "epoch": 0.2854923286074966, "grad_norm": 1.1171875, "learning_rate": 3.4411580245914264e-05, "loss": 1.0239133834838867, "step": 735 }, { "epoch": 0.28588075354437753, "grad_norm": 1.1328125, "learning_rate": 3.4395877252500695e-05, "loss": 1.0083296298980713, "step": 736 }, { "epoch": 0.2862691784812585, "grad_norm": 1.421875, "learning_rate": 3.43801558217197e-05, "loss": 1.2628425359725952, "step": 737 }, { "epoch": 0.28665760341813945, "grad_norm": 0.92578125, "learning_rate": 3.436441597370635e-05, "loss": 0.7538553476333618, "step": 738 }, { "epoch": 0.2870460283550204, "grad_norm": 1.0390625, "learning_rate": 3.434865772861927e-05, "loss": 1.0107769966125488, "step": 739 }, { "epoch": 0.2874344532919013, "grad_norm": 1.15625, "learning_rate": 3.4332881106640675e-05, "loss": 1.075883150100708, "step": 740 }, { "epoch": 0.2878228782287823, "grad_norm": 1.0078125, "learning_rate": 3.431708612797631e-05, "loss": 0.7796924710273743, "step": 741 }, { "epoch": 0.28821130316566324, "grad_norm": 1.0390625, "learning_rate": 3.4301272812855425e-05, "loss": 1.1926034688949585, "step": 742 }, { "epoch": 0.2885997281025442, "grad_norm": 1.109375, "learning_rate": 3.428544118153074e-05, "loss": 0.891956627368927, "step": 743 }, { "epoch": 0.28898815303942516, "grad_norm": 1.0625, "learning_rate": 3.4269591254278474e-05, "loss": 1.0473291873931885, "step": 744 }, { "epoch": 0.28937657797630606, "grad_norm": 1.09375, "learning_rate": 3.425372305139824e-05, "loss": 1.032681941986084, "step": 745 }, { "epoch": 0.289765002913187, "grad_norm": 1.2421875, "learning_rate": 3.423783659321307e-05, "loss": 1.1628482341766357, "step": 746 }, { "epoch": 0.290153427850068, "grad_norm": 1.0546875, "learning_rate": 3.422193190006938e-05, "loss": 1.0222368240356445, "step": 747 }, { "epoch": 0.29054185278694894, "grad_norm": 1.21875, "learning_rate": 3.420600899233695e-05, "loss": 0.9927176833152771, "step": 748 }, { "epoch": 0.29093027772382984, "grad_norm": 1.171875, "learning_rate": 3.419006789040885e-05, "loss": 1.154222011566162, "step": 749 }, { "epoch": 0.2913187026607108, "grad_norm": 1.1875, "learning_rate": 3.41741086147015e-05, "loss": 1.2393794059753418, "step": 750 }, { "epoch": 0.29170712759759176, "grad_norm": 0.98828125, "learning_rate": 3.415813118565456e-05, "loss": 0.8544781804084778, "step": 751 }, { "epoch": 0.2920955525344727, "grad_norm": 1.1015625, "learning_rate": 3.4142135623730954e-05, "loss": 1.0507423877716064, "step": 752 }, { "epoch": 0.2924839774713537, "grad_norm": 1.3046875, "learning_rate": 3.412612194941684e-05, "loss": 0.8522463440895081, "step": 753 }, { "epoch": 0.2928724024082346, "grad_norm": 1.1484375, "learning_rate": 3.4110090183221544e-05, "loss": 1.2333531379699707, "step": 754 }, { "epoch": 0.29326082734511555, "grad_norm": 1.2109375, "learning_rate": 3.40940403456776e-05, "loss": 1.0986446142196655, "step": 755 }, { "epoch": 0.2936492522819965, "grad_norm": 1.0859375, "learning_rate": 3.407797245734065e-05, "loss": 1.1682486534118652, "step": 756 }, { "epoch": 0.29403767721887747, "grad_norm": 1.0234375, "learning_rate": 3.406188653878949e-05, "loss": 0.9353405833244324, "step": 757 }, { "epoch": 0.2944261021557584, "grad_norm": 1.2109375, "learning_rate": 3.404578261062597e-05, "loss": 0.8691340684890747, "step": 758 }, { "epoch": 0.29481452709263933, "grad_norm": 1.078125, "learning_rate": 3.402966069347504e-05, "loss": 1.027530550956726, "step": 759 }, { "epoch": 0.2952029520295203, "grad_norm": 1.015625, "learning_rate": 3.401352080798468e-05, "loss": 0.8484542965888977, "step": 760 }, { "epoch": 0.29559137696640125, "grad_norm": 1.03125, "learning_rate": 3.3997362974825855e-05, "loss": 0.8183404207229614, "step": 761 }, { "epoch": 0.2959798019032822, "grad_norm": 1.0234375, "learning_rate": 3.398118721469255e-05, "loss": 1.0077711343765259, "step": 762 }, { "epoch": 0.2963682268401631, "grad_norm": 1.1796875, "learning_rate": 3.396499354830171e-05, "loss": 0.866412341594696, "step": 763 }, { "epoch": 0.2967566517770441, "grad_norm": 0.94140625, "learning_rate": 3.394878199639319e-05, "loss": 1.0070486068725586, "step": 764 }, { "epoch": 0.29714507671392504, "grad_norm": 1.34375, "learning_rate": 3.393255257972977e-05, "loss": 0.9922256469726562, "step": 765 }, { "epoch": 0.297533501650806, "grad_norm": 1.1796875, "learning_rate": 3.39163053190971e-05, "loss": 1.4062093496322632, "step": 766 }, { "epoch": 0.2979219265876869, "grad_norm": 1.1484375, "learning_rate": 3.390004023530367e-05, "loss": 1.1183067560195923, "step": 767 }, { "epoch": 0.29831035152456786, "grad_norm": 1.078125, "learning_rate": 3.388375734918085e-05, "loss": 0.9019991159439087, "step": 768 }, { "epoch": 0.2986987764614488, "grad_norm": 1.1484375, "learning_rate": 3.386745668158274e-05, "loss": 1.2430915832519531, "step": 769 }, { "epoch": 0.2990872013983298, "grad_norm": 1.15625, "learning_rate": 3.385113825338627e-05, "loss": 1.1454029083251953, "step": 770 }, { "epoch": 0.29947562633521074, "grad_norm": 1.046875, "learning_rate": 3.3834802085491085e-05, "loss": 1.2253952026367188, "step": 771 }, { "epoch": 0.29986405127209165, "grad_norm": 1.046875, "learning_rate": 3.381844819881956e-05, "loss": 1.062422513961792, "step": 772 }, { "epoch": 0.3002524762089726, "grad_norm": 1.046875, "learning_rate": 3.3802076614316764e-05, "loss": 1.1638638973236084, "step": 773 }, { "epoch": 0.30064090114585357, "grad_norm": 1.015625, "learning_rate": 3.3785687352950425e-05, "loss": 0.9706148505210876, "step": 774 }, { "epoch": 0.3010293260827345, "grad_norm": 1.0546875, "learning_rate": 3.376928043571092e-05, "loss": 1.07710862159729, "step": 775 }, { "epoch": 0.30141775101961543, "grad_norm": 1.609375, "learning_rate": 3.375285588361124e-05, "loss": 1.322453260421753, "step": 776 }, { "epoch": 0.3018061759564964, "grad_norm": 1.09375, "learning_rate": 3.373641371768695e-05, "loss": 1.1410454511642456, "step": 777 }, { "epoch": 0.30219460089337735, "grad_norm": 1.2109375, "learning_rate": 3.371995395899618e-05, "loss": 1.1494274139404297, "step": 778 }, { "epoch": 0.3025830258302583, "grad_norm": 1.09375, "learning_rate": 3.3703476628619594e-05, "loss": 1.1188944578170776, "step": 779 }, { "epoch": 0.30297145076713927, "grad_norm": 1.125, "learning_rate": 3.3686981747660356e-05, "loss": 1.3288383483886719, "step": 780 }, { "epoch": 0.3033598757040202, "grad_norm": 1.28125, "learning_rate": 3.367046933724411e-05, "loss": 1.1171954870224, "step": 781 }, { "epoch": 0.30374830064090114, "grad_norm": 0.9375, "learning_rate": 3.365393941851895e-05, "loss": 0.9402799010276794, "step": 782 }, { "epoch": 0.3041367255777821, "grad_norm": 1.1015625, "learning_rate": 3.3637392012655406e-05, "loss": 0.9389098286628723, "step": 783 }, { "epoch": 0.30452515051466306, "grad_norm": 1.0234375, "learning_rate": 3.362082714084638e-05, "loss": 1.0487022399902344, "step": 784 }, { "epoch": 0.304913575451544, "grad_norm": 1.0859375, "learning_rate": 3.3604244824307155e-05, "loss": 0.8954732418060303, "step": 785 }, { "epoch": 0.3053020003884249, "grad_norm": 0.91796875, "learning_rate": 3.358764508427537e-05, "loss": 0.8527095317840576, "step": 786 }, { "epoch": 0.3056904253253059, "grad_norm": 1.2734375, "learning_rate": 3.357102794201095e-05, "loss": 1.0043666362762451, "step": 787 }, { "epoch": 0.30607885026218684, "grad_norm": 1.15625, "learning_rate": 3.3554393418796135e-05, "loss": 1.1125026941299438, "step": 788 }, { "epoch": 0.3064672751990678, "grad_norm": 1.2890625, "learning_rate": 3.353774153593541e-05, "loss": 1.295227289199829, "step": 789 }, { "epoch": 0.3068557001359487, "grad_norm": 1.046875, "learning_rate": 3.352107231475551e-05, "loss": 0.8529289960861206, "step": 790 }, { "epoch": 0.30724412507282967, "grad_norm": 0.98828125, "learning_rate": 3.3504385776605345e-05, "loss": 0.7499029040336609, "step": 791 }, { "epoch": 0.3076325500097106, "grad_norm": 1.0, "learning_rate": 3.348768194285604e-05, "loss": 1.0239906311035156, "step": 792 }, { "epoch": 0.3080209749465916, "grad_norm": 1.0546875, "learning_rate": 3.347096083490083e-05, "loss": 0.8945603370666504, "step": 793 }, { "epoch": 0.30840939988347255, "grad_norm": 0.98828125, "learning_rate": 3.345422247415512e-05, "loss": 1.1101937294006348, "step": 794 }, { "epoch": 0.30879782482035345, "grad_norm": 1.03125, "learning_rate": 3.343746688205638e-05, "loss": 1.0916755199432373, "step": 795 }, { "epoch": 0.3091862497572344, "grad_norm": 1.1484375, "learning_rate": 3.342069408006415e-05, "loss": 1.1124159097671509, "step": 796 }, { "epoch": 0.30957467469411537, "grad_norm": 1.078125, "learning_rate": 3.3403904089660035e-05, "loss": 0.9120814800262451, "step": 797 }, { "epoch": 0.30996309963099633, "grad_norm": 1.109375, "learning_rate": 3.3387096932347624e-05, "loss": 1.0716772079467773, "step": 798 }, { "epoch": 0.31035152456787723, "grad_norm": 1.2578125, "learning_rate": 3.337027262965251e-05, "loss": 1.1783028841018677, "step": 799 }, { "epoch": 0.3107399495047582, "grad_norm": 1.0, "learning_rate": 3.335343120312224e-05, "loss": 0.8972504138946533, "step": 800 }, { "epoch": 0.31112837444163916, "grad_norm": 1.28125, "learning_rate": 3.333657267432628e-05, "loss": 0.996420681476593, "step": 801 }, { "epoch": 0.3115167993785201, "grad_norm": 1.3828125, "learning_rate": 3.331969706485604e-05, "loss": 1.2110052108764648, "step": 802 }, { "epoch": 0.3119052243154011, "grad_norm": 1.140625, "learning_rate": 3.330280439632476e-05, "loss": 1.109370470046997, "step": 803 }, { "epoch": 0.312293649252282, "grad_norm": 0.98046875, "learning_rate": 3.328589469036755e-05, "loss": 0.9866268038749695, "step": 804 }, { "epoch": 0.31268207418916294, "grad_norm": 1.296875, "learning_rate": 3.326896796864134e-05, "loss": 0.7599623799324036, "step": 805 }, { "epoch": 0.3130704991260439, "grad_norm": 1.0390625, "learning_rate": 3.325202425282484e-05, "loss": 1.2387405633926392, "step": 806 }, { "epoch": 0.31345892406292486, "grad_norm": 1.15625, "learning_rate": 3.3235063564618544e-05, "loss": 0.865804135799408, "step": 807 }, { "epoch": 0.31384734899980576, "grad_norm": 1.0859375, "learning_rate": 3.321808592574467e-05, "loss": 0.7808165550231934, "step": 808 }, { "epoch": 0.3142357739366867, "grad_norm": 1.2109375, "learning_rate": 3.320109135794716e-05, "loss": 1.4710805416107178, "step": 809 }, { "epoch": 0.3146241988735677, "grad_norm": 1.09375, "learning_rate": 3.3184079882991606e-05, "loss": 1.0665165185928345, "step": 810 }, { "epoch": 0.31501262381044864, "grad_norm": 1.1015625, "learning_rate": 3.316705152266529e-05, "loss": 0.9650043249130249, "step": 811 }, { "epoch": 0.3154010487473296, "grad_norm": 1.0859375, "learning_rate": 3.31500062987771e-05, "loss": 1.150334119796753, "step": 812 }, { "epoch": 0.3157894736842105, "grad_norm": 1.015625, "learning_rate": 3.3132944233157524e-05, "loss": 0.9448800683021545, "step": 813 }, { "epoch": 0.31617789862109147, "grad_norm": 1.125, "learning_rate": 3.3115865347658625e-05, "loss": 1.011758804321289, "step": 814 }, { "epoch": 0.31656632355797243, "grad_norm": 1.1484375, "learning_rate": 3.3098769664154005e-05, "loss": 1.0464892387390137, "step": 815 }, { "epoch": 0.3169547484948534, "grad_norm": 1.3515625, "learning_rate": 3.308165720453878e-05, "loss": 1.212410569190979, "step": 816 }, { "epoch": 0.3173431734317343, "grad_norm": 1.078125, "learning_rate": 3.306452799072955e-05, "loss": 1.0162781476974487, "step": 817 }, { "epoch": 0.31773159836861525, "grad_norm": 1.1015625, "learning_rate": 3.304738204466437e-05, "loss": 1.0756381750106812, "step": 818 }, { "epoch": 0.3181200233054962, "grad_norm": 1.1328125, "learning_rate": 3.303021938830274e-05, "loss": 0.9091431498527527, "step": 819 }, { "epoch": 0.3185084482423772, "grad_norm": 1.03125, "learning_rate": 3.301304004362553e-05, "loss": 1.0791590213775635, "step": 820 }, { "epoch": 0.31889687317925813, "grad_norm": 1.1328125, "learning_rate": 3.2995844032635035e-05, "loss": 0.9696295857429504, "step": 821 }, { "epoch": 0.31928529811613904, "grad_norm": 1.0625, "learning_rate": 3.297863137735483e-05, "loss": 1.0135314464569092, "step": 822 }, { "epoch": 0.31967372305302, "grad_norm": 0.984375, "learning_rate": 3.296140209982987e-05, "loss": 0.866156280040741, "step": 823 }, { "epoch": 0.32006214798990096, "grad_norm": 1.203125, "learning_rate": 3.2944156222126356e-05, "loss": 1.0172420740127563, "step": 824 }, { "epoch": 0.3204505729267819, "grad_norm": 1.046875, "learning_rate": 3.292689376633177e-05, "loss": 0.8920045495033264, "step": 825 }, { "epoch": 0.3208389978636628, "grad_norm": 1.0703125, "learning_rate": 3.29096147545548e-05, "loss": 1.0652507543563843, "step": 826 }, { "epoch": 0.3212274228005438, "grad_norm": 1.015625, "learning_rate": 3.289231920892539e-05, "loss": 1.0118200778961182, "step": 827 }, { "epoch": 0.32161584773742474, "grad_norm": 1.28125, "learning_rate": 3.2875007151594597e-05, "loss": 1.0335946083068848, "step": 828 }, { "epoch": 0.3220042726743057, "grad_norm": 1.203125, "learning_rate": 3.285767860473466e-05, "loss": 1.1417776346206665, "step": 829 }, { "epoch": 0.32239269761118666, "grad_norm": 1.15625, "learning_rate": 3.284033359053895e-05, "loss": 0.971636950969696, "step": 830 }, { "epoch": 0.32278112254806757, "grad_norm": 1.1328125, "learning_rate": 3.28229721312219e-05, "loss": 0.9170125722885132, "step": 831 }, { "epoch": 0.3231695474849485, "grad_norm": 1.125, "learning_rate": 3.280559424901902e-05, "loss": 1.003848910331726, "step": 832 }, { "epoch": 0.3235579724218295, "grad_norm": 1.0859375, "learning_rate": 3.278819996618684e-05, "loss": 1.0153409242630005, "step": 833 }, { "epoch": 0.32394639735871045, "grad_norm": 1.046875, "learning_rate": 3.277078930500289e-05, "loss": 0.8175346255302429, "step": 834 }, { "epoch": 0.32433482229559135, "grad_norm": 0.9375, "learning_rate": 3.275336228776572e-05, "loss": 1.0555111169815063, "step": 835 }, { "epoch": 0.3247232472324723, "grad_norm": 1.109375, "learning_rate": 3.2735918936794786e-05, "loss": 0.9789876937866211, "step": 836 }, { "epoch": 0.32511167216935327, "grad_norm": 0.953125, "learning_rate": 3.271845927443048e-05, "loss": 0.9336972236633301, "step": 837 }, { "epoch": 0.32550009710623423, "grad_norm": 1.1640625, "learning_rate": 3.2700983323034064e-05, "loss": 1.234835147857666, "step": 838 }, { "epoch": 0.3258885220431152, "grad_norm": 1.6640625, "learning_rate": 3.268349110498769e-05, "loss": 0.9891402125358582, "step": 839 }, { "epoch": 0.3262769469799961, "grad_norm": 1.0625, "learning_rate": 3.266598264269433e-05, "loss": 0.9530712962150574, "step": 840 }, { "epoch": 0.32666537191687706, "grad_norm": 0.953125, "learning_rate": 3.264845795857777e-05, "loss": 1.0189599990844727, "step": 841 }, { "epoch": 0.327053796853758, "grad_norm": 1.0234375, "learning_rate": 3.2630917075082545e-05, "loss": 1.0231633186340332, "step": 842 }, { "epoch": 0.327442221790639, "grad_norm": 1.09375, "learning_rate": 3.2613360014673964e-05, "loss": 0.919548511505127, "step": 843 }, { "epoch": 0.3278306467275199, "grad_norm": 1.0390625, "learning_rate": 3.259578679983804e-05, "loss": 0.7798534035682678, "step": 844 }, { "epoch": 0.32821907166440084, "grad_norm": 1.109375, "learning_rate": 3.25781974530815e-05, "loss": 0.8904909491539001, "step": 845 }, { "epoch": 0.3286074966012818, "grad_norm": 0.99609375, "learning_rate": 3.25605919969317e-05, "loss": 1.0327059030532837, "step": 846 }, { "epoch": 0.32899592153816276, "grad_norm": 1.0546875, "learning_rate": 3.2542970453936654e-05, "loss": 0.9862273931503296, "step": 847 }, { "epoch": 0.3293843464750437, "grad_norm": 1.1484375, "learning_rate": 3.252533284666495e-05, "loss": 1.0200446844100952, "step": 848 }, { "epoch": 0.3297727714119246, "grad_norm": 1.09375, "learning_rate": 3.2507679197705774e-05, "loss": 0.9311704635620117, "step": 849 }, { "epoch": 0.3301611963488056, "grad_norm": 1.078125, "learning_rate": 3.2490009529668845e-05, "loss": 0.9758878350257874, "step": 850 }, { "epoch": 0.33054962128568655, "grad_norm": 1.359375, "learning_rate": 3.2472323865184416e-05, "loss": 0.8548144102096558, "step": 851 }, { "epoch": 0.3309380462225675, "grad_norm": 1.0, "learning_rate": 3.24546222269032e-05, "loss": 0.9024887084960938, "step": 852 }, { "epoch": 0.3313264711594484, "grad_norm": 1.1015625, "learning_rate": 3.2436904637496395e-05, "loss": 1.114680528640747, "step": 853 }, { "epoch": 0.33171489609632937, "grad_norm": 1.3984375, "learning_rate": 3.241917111965561e-05, "loss": 1.196089267730713, "step": 854 }, { "epoch": 0.33210332103321033, "grad_norm": 1.0625, "learning_rate": 3.2401421696092864e-05, "loss": 0.9751430153846741, "step": 855 }, { "epoch": 0.3324917459700913, "grad_norm": 1.0078125, "learning_rate": 3.238365638954054e-05, "loss": 0.7727580070495605, "step": 856 }, { "epoch": 0.33288017090697225, "grad_norm": 1.1171875, "learning_rate": 3.236587522275136e-05, "loss": 1.0105152130126953, "step": 857 }, { "epoch": 0.33326859584385315, "grad_norm": 1.0625, "learning_rate": 3.234807821849838e-05, "loss": 0.967509388923645, "step": 858 }, { "epoch": 0.3336570207807341, "grad_norm": 1.0234375, "learning_rate": 3.233026539957492e-05, "loss": 0.8557308316230774, "step": 859 }, { "epoch": 0.3340454457176151, "grad_norm": 0.95703125, "learning_rate": 3.231243678879455e-05, "loss": 0.8929481506347656, "step": 860 }, { "epoch": 0.33443387065449603, "grad_norm": 1.0546875, "learning_rate": 3.2294592408991084e-05, "loss": 1.1645114421844482, "step": 861 }, { "epoch": 0.33482229559137694, "grad_norm": 1.0703125, "learning_rate": 3.227673228301852e-05, "loss": 1.0303351879119873, "step": 862 }, { "epoch": 0.3352107205282579, "grad_norm": 1.234375, "learning_rate": 3.225885643375102e-05, "loss": 1.2674543857574463, "step": 863 }, { "epoch": 0.33559914546513886, "grad_norm": 1.1484375, "learning_rate": 3.2240964884082893e-05, "loss": 0.8605954647064209, "step": 864 }, { "epoch": 0.3359875704020198, "grad_norm": 1.2578125, "learning_rate": 3.2223057656928554e-05, "loss": 1.1922682523727417, "step": 865 }, { "epoch": 0.3363759953389008, "grad_norm": 1.2421875, "learning_rate": 3.22051347752225e-05, "loss": 1.1934959888458252, "step": 866 }, { "epoch": 0.3367644202757817, "grad_norm": 1.3203125, "learning_rate": 3.218719626191926e-05, "loss": 0.9493755102157593, "step": 867 }, { "epoch": 0.33715284521266264, "grad_norm": 0.9609375, "learning_rate": 3.21692421399934e-05, "loss": 0.8809481263160706, "step": 868 }, { "epoch": 0.3375412701495436, "grad_norm": 1.09375, "learning_rate": 3.215127243243947e-05, "loss": 0.8872241973876953, "step": 869 }, { "epoch": 0.33792969508642456, "grad_norm": 0.94921875, "learning_rate": 3.213328716227199e-05, "loss": 1.185905933380127, "step": 870 }, { "epoch": 0.33831812002330547, "grad_norm": 1.109375, "learning_rate": 3.211528635252539e-05, "loss": 0.9705193042755127, "step": 871 }, { "epoch": 0.33870654496018643, "grad_norm": 1.1484375, "learning_rate": 3.209727002625403e-05, "loss": 1.2055739164352417, "step": 872 }, { "epoch": 0.3390949698970674, "grad_norm": 1.3046875, "learning_rate": 3.207923820653213e-05, "loss": 1.1639686822891235, "step": 873 }, { "epoch": 0.33948339483394835, "grad_norm": 1.1640625, "learning_rate": 3.2061190916453745e-05, "loss": 1.1213228702545166, "step": 874 }, { "epoch": 0.3398718197708293, "grad_norm": 1.1796875, "learning_rate": 3.204312817913275e-05, "loss": 0.9107053279876709, "step": 875 }, { "epoch": 0.3402602447077102, "grad_norm": 1.265625, "learning_rate": 3.202505001770281e-05, "loss": 1.2361372709274292, "step": 876 }, { "epoch": 0.3406486696445912, "grad_norm": 1.109375, "learning_rate": 3.200695645531733e-05, "loss": 0.8116262555122375, "step": 877 }, { "epoch": 0.34103709458147213, "grad_norm": 1.1171875, "learning_rate": 3.198884751514946e-05, "loss": 0.8941199779510498, "step": 878 }, { "epoch": 0.3414255195183531, "grad_norm": 1.71875, "learning_rate": 3.197072322039203e-05, "loss": 1.067366123199463, "step": 879 }, { "epoch": 0.34181394445523405, "grad_norm": 1.2734375, "learning_rate": 3.1952583594257536e-05, "loss": 1.120818853378296, "step": 880 }, { "epoch": 0.34220236939211496, "grad_norm": 1.203125, "learning_rate": 3.193442865997811e-05, "loss": 0.9125462770462036, "step": 881 }, { "epoch": 0.3425907943289959, "grad_norm": 1.265625, "learning_rate": 3.191625844080549e-05, "loss": 1.096578598022461, "step": 882 }, { "epoch": 0.3429792192658769, "grad_norm": 1.078125, "learning_rate": 3.1898072960011004e-05, "loss": 0.8717673420906067, "step": 883 }, { "epoch": 0.34336764420275784, "grad_norm": 1.015625, "learning_rate": 3.18798722408855e-05, "loss": 0.7936244010925293, "step": 884 }, { "epoch": 0.34375606913963874, "grad_norm": 1.2109375, "learning_rate": 3.1861656306739375e-05, "loss": 0.8451751470565796, "step": 885 }, { "epoch": 0.3441444940765197, "grad_norm": 1.1640625, "learning_rate": 3.1843425180902476e-05, "loss": 1.0906908512115479, "step": 886 }, { "epoch": 0.34453291901340066, "grad_norm": 1.0703125, "learning_rate": 3.182517888672413e-05, "loss": 1.3840705156326294, "step": 887 }, { "epoch": 0.3449213439502816, "grad_norm": 1.1875, "learning_rate": 3.180691744757309e-05, "loss": 1.1344438791275024, "step": 888 }, { "epoch": 0.3453097688871626, "grad_norm": 1.265625, "learning_rate": 3.17886408868375e-05, "loss": 1.1030287742614746, "step": 889 }, { "epoch": 0.3456981938240435, "grad_norm": 1.1796875, "learning_rate": 3.1770349227924854e-05, "loss": 0.940391480922699, "step": 890 }, { "epoch": 0.34608661876092445, "grad_norm": 1.0, "learning_rate": 3.175204249426202e-05, "loss": 1.1598347425460815, "step": 891 }, { "epoch": 0.3464750436978054, "grad_norm": 1.09375, "learning_rate": 3.173372070929516e-05, "loss": 1.0245498418807983, "step": 892 }, { "epoch": 0.34686346863468637, "grad_norm": 1.0703125, "learning_rate": 3.1715383896489674e-05, "loss": 0.9509010910987854, "step": 893 }, { "epoch": 0.34725189357156727, "grad_norm": 1.0859375, "learning_rate": 3.169703207933028e-05, "loss": 1.030009150505066, "step": 894 }, { "epoch": 0.34764031850844823, "grad_norm": 0.90234375, "learning_rate": 3.167866528132085e-05, "loss": 0.7535997629165649, "step": 895 }, { "epoch": 0.3480287434453292, "grad_norm": 0.96484375, "learning_rate": 3.166028352598446e-05, "loss": 0.892837405204773, "step": 896 }, { "epoch": 0.34841716838221015, "grad_norm": 1.1015625, "learning_rate": 3.1641886836863376e-05, "loss": 0.8793472051620483, "step": 897 }, { "epoch": 0.3488055933190911, "grad_norm": 0.875, "learning_rate": 3.162347523751894e-05, "loss": 0.9006161689758301, "step": 898 }, { "epoch": 0.349194018255972, "grad_norm": 1.28125, "learning_rate": 3.160504875153161e-05, "loss": 0.9657084941864014, "step": 899 }, { "epoch": 0.349582443192853, "grad_norm": 1.203125, "learning_rate": 3.1586607402500935e-05, "loss": 0.9870240092277527, "step": 900 }, { "epoch": 0.34997086812973394, "grad_norm": 0.96484375, "learning_rate": 3.156815121404546e-05, "loss": 0.8896334171295166, "step": 901 }, { "epoch": 0.3503592930666149, "grad_norm": 1.3203125, "learning_rate": 3.1549680209802755e-05, "loss": 1.0631026029586792, "step": 902 }, { "epoch": 0.3507477180034958, "grad_norm": 1.0625, "learning_rate": 3.153119441342938e-05, "loss": 0.9084078073501587, "step": 903 }, { "epoch": 0.35113614294037676, "grad_norm": 1.3359375, "learning_rate": 3.15126938486008e-05, "loss": 1.0293641090393066, "step": 904 }, { "epoch": 0.3515245678772577, "grad_norm": 1.1796875, "learning_rate": 3.149417853901143e-05, "loss": 1.066938877105713, "step": 905 }, { "epoch": 0.3519129928141387, "grad_norm": 0.9453125, "learning_rate": 3.147564850837455e-05, "loss": 0.8579623699188232, "step": 906 }, { "epoch": 0.35230141775101964, "grad_norm": 0.953125, "learning_rate": 3.14571037804223e-05, "loss": 1.0486464500427246, "step": 907 }, { "epoch": 0.35268984268790055, "grad_norm": 1.1640625, "learning_rate": 3.143854437890565e-05, "loss": 0.7752895355224609, "step": 908 }, { "epoch": 0.3530782676247815, "grad_norm": 1.03125, "learning_rate": 3.141997032759436e-05, "loss": 0.9491285681724548, "step": 909 }, { "epoch": 0.35346669256166247, "grad_norm": 1.234375, "learning_rate": 3.140138165027695e-05, "loss": 0.9537655711174011, "step": 910 }, { "epoch": 0.3538551174985434, "grad_norm": 1.3515625, "learning_rate": 3.138277837076066e-05, "loss": 1.0496246814727783, "step": 911 }, { "epoch": 0.35424354243542433, "grad_norm": 1.0625, "learning_rate": 3.136416051287145e-05, "loss": 0.8471577167510986, "step": 912 }, { "epoch": 0.3546319673723053, "grad_norm": 0.99609375, "learning_rate": 3.134552810045394e-05, "loss": 1.0454132556915283, "step": 913 }, { "epoch": 0.35502039230918625, "grad_norm": 1.109375, "learning_rate": 3.132688115737141e-05, "loss": 1.0928782224655151, "step": 914 }, { "epoch": 0.3554088172460672, "grad_norm": 1.1328125, "learning_rate": 3.130821970750572e-05, "loss": 1.172321081161499, "step": 915 }, { "epoch": 0.35579724218294817, "grad_norm": 1.234375, "learning_rate": 3.1289543774757323e-05, "loss": 0.942084789276123, "step": 916 }, { "epoch": 0.3561856671198291, "grad_norm": 1.1171875, "learning_rate": 3.1270853383045237e-05, "loss": 1.0619860887527466, "step": 917 }, { "epoch": 0.35657409205671003, "grad_norm": 1.21875, "learning_rate": 3.125214855630697e-05, "loss": 1.2511012554168701, "step": 918 }, { "epoch": 0.356962516993591, "grad_norm": 1.3359375, "learning_rate": 3.1233429318498534e-05, "loss": 1.4669580459594727, "step": 919 }, { "epoch": 0.35735094193047195, "grad_norm": 1.0390625, "learning_rate": 3.1214695693594404e-05, "loss": 1.0257371664047241, "step": 920 }, { "epoch": 0.35773936686735286, "grad_norm": 1.15625, "learning_rate": 3.119594770558745e-05, "loss": 1.0325714349746704, "step": 921 }, { "epoch": 0.3581277918042338, "grad_norm": 1.15625, "learning_rate": 3.1177185378488984e-05, "loss": 0.9708775281906128, "step": 922 }, { "epoch": 0.3585162167411148, "grad_norm": 1.046875, "learning_rate": 3.1158408736328646e-05, "loss": 1.0907548666000366, "step": 923 }, { "epoch": 0.35890464167799574, "grad_norm": 1.203125, "learning_rate": 3.113961780315442e-05, "loss": 0.8508765697479248, "step": 924 }, { "epoch": 0.3592930666148767, "grad_norm": 1.109375, "learning_rate": 3.112081260303259e-05, "loss": 1.0622004270553589, "step": 925 }, { "epoch": 0.3596814915517576, "grad_norm": 1.1171875, "learning_rate": 3.110199316004774e-05, "loss": 0.9130839109420776, "step": 926 }, { "epoch": 0.36006991648863856, "grad_norm": 1.0703125, "learning_rate": 3.108315949830265e-05, "loss": 1.1228160858154297, "step": 927 }, { "epoch": 0.3604583414255195, "grad_norm": 1.046875, "learning_rate": 3.106431164191833e-05, "loss": 0.9542611837387085, "step": 928 }, { "epoch": 0.3608467663624005, "grad_norm": 1.171875, "learning_rate": 3.1045449615034e-05, "loss": 0.9185218214988708, "step": 929 }, { "epoch": 0.3612351912992814, "grad_norm": 1.03125, "learning_rate": 3.1026573441806976e-05, "loss": 0.9650945663452148, "step": 930 }, { "epoch": 0.36162361623616235, "grad_norm": 1.28125, "learning_rate": 3.100768314641272e-05, "loss": 0.8161953091621399, "step": 931 }, { "epoch": 0.3620120411730433, "grad_norm": 1.234375, "learning_rate": 3.098877875304478e-05, "loss": 1.1131112575531006, "step": 932 }, { "epoch": 0.36240046610992427, "grad_norm": 1.1171875, "learning_rate": 3.096986028591477e-05, "loss": 1.211348295211792, "step": 933 }, { "epoch": 0.36278889104680523, "grad_norm": 1.3671875, "learning_rate": 3.0950927769252306e-05, "loss": 1.2249430418014526, "step": 934 }, { "epoch": 0.36317731598368613, "grad_norm": 1.234375, "learning_rate": 3.0931981227305e-05, "loss": 1.2453396320343018, "step": 935 }, { "epoch": 0.3635657409205671, "grad_norm": 1.171875, "learning_rate": 3.091302068433845e-05, "loss": 1.0976662635803223, "step": 936 }, { "epoch": 0.36395416585744805, "grad_norm": 0.95703125, "learning_rate": 3.0894046164636166e-05, "loss": 1.0680540800094604, "step": 937 }, { "epoch": 0.364342590794329, "grad_norm": 1.0078125, "learning_rate": 3.0875057692499566e-05, "loss": 0.943792462348938, "step": 938 }, { "epoch": 0.3647310157312099, "grad_norm": 1.078125, "learning_rate": 3.085605529224792e-05, "loss": 0.9203507304191589, "step": 939 }, { "epoch": 0.3651194406680909, "grad_norm": 1.2421875, "learning_rate": 3.083703898821837e-05, "loss": 0.9688736200332642, "step": 940 }, { "epoch": 0.36550786560497184, "grad_norm": 1.0, "learning_rate": 3.081800880476584e-05, "loss": 1.0500209331512451, "step": 941 }, { "epoch": 0.3658962905418528, "grad_norm": 1.0234375, "learning_rate": 3.079896476626303e-05, "loss": 0.9061621427536011, "step": 942 }, { "epoch": 0.36628471547873376, "grad_norm": 1.1171875, "learning_rate": 3.077990689710039e-05, "loss": 0.9584693312644958, "step": 943 }, { "epoch": 0.36667314041561466, "grad_norm": 1.359375, "learning_rate": 3.07608352216861e-05, "loss": 0.9847326874732971, "step": 944 }, { "epoch": 0.3670615653524956, "grad_norm": 1.2109375, "learning_rate": 3.0741749764445986e-05, "loss": 1.4677062034606934, "step": 945 }, { "epoch": 0.3674499902893766, "grad_norm": 1.234375, "learning_rate": 3.072265054982356e-05, "loss": 0.973429799079895, "step": 946 }, { "epoch": 0.36783841522625754, "grad_norm": 1.140625, "learning_rate": 3.0703537602279934e-05, "loss": 0.9958919286727905, "step": 947 }, { "epoch": 0.36822684016313845, "grad_norm": 1.1484375, "learning_rate": 3.068441094629381e-05, "loss": 0.9943439960479736, "step": 948 }, { "epoch": 0.3686152651000194, "grad_norm": 1.4375, "learning_rate": 3.0665270606361455e-05, "loss": 1.1063662767410278, "step": 949 }, { "epoch": 0.36900369003690037, "grad_norm": 1.2109375, "learning_rate": 3.064611660699665e-05, "loss": 0.9784060120582581, "step": 950 }, { "epoch": 0.3693921149737813, "grad_norm": 1.046875, "learning_rate": 3.062694897273069e-05, "loss": 1.1553717851638794, "step": 951 }, { "epoch": 0.3697805399106623, "grad_norm": 1.0859375, "learning_rate": 3.060776772811231e-05, "loss": 0.9769039154052734, "step": 952 }, { "epoch": 0.3701689648475432, "grad_norm": 1.125, "learning_rate": 3.058857289770768e-05, "loss": 1.1188766956329346, "step": 953 }, { "epoch": 0.37055738978442415, "grad_norm": 1.1328125, "learning_rate": 3.05693645061004e-05, "loss": 0.9428322315216064, "step": 954 }, { "epoch": 0.3709458147213051, "grad_norm": 1.53125, "learning_rate": 3.0550142577891385e-05, "loss": 1.0381038188934326, "step": 955 }, { "epoch": 0.37133423965818607, "grad_norm": 1.1640625, "learning_rate": 3.053090713769893e-05, "loss": 1.264707326889038, "step": 956 }, { "epoch": 0.371722664595067, "grad_norm": 1.296875, "learning_rate": 3.0511658210158618e-05, "loss": 1.0196032524108887, "step": 957 }, { "epoch": 0.37211108953194794, "grad_norm": 1.0625, "learning_rate": 3.0492395819923318e-05, "loss": 0.9057264924049377, "step": 958 }, { "epoch": 0.3724995144688289, "grad_norm": 1.453125, "learning_rate": 3.047311999166312e-05, "loss": 1.3399500846862793, "step": 959 }, { "epoch": 0.37288793940570986, "grad_norm": 1.2265625, "learning_rate": 3.0453830750065352e-05, "loss": 1.1872987747192383, "step": 960 }, { "epoch": 0.3732763643425908, "grad_norm": 1.1640625, "learning_rate": 3.043452811983449e-05, "loss": 0.80659019947052, "step": 961 }, { "epoch": 0.3736647892794717, "grad_norm": 1.09375, "learning_rate": 3.0415212125692184e-05, "loss": 1.0449628829956055, "step": 962 }, { "epoch": 0.3740532142163527, "grad_norm": 1.046875, "learning_rate": 3.0395882792377186e-05, "loss": 0.9291737079620361, "step": 963 }, { "epoch": 0.37444163915323364, "grad_norm": 1.078125, "learning_rate": 3.037654014464533e-05, "loss": 1.0145061016082764, "step": 964 }, { "epoch": 0.3748300640901146, "grad_norm": 1.15625, "learning_rate": 3.0357184207269515e-05, "loss": 0.9409998655319214, "step": 965 }, { "epoch": 0.37521848902699556, "grad_norm": 1.0859375, "learning_rate": 3.0337815005039647e-05, "loss": 0.9843807220458984, "step": 966 }, { "epoch": 0.37560691396387647, "grad_norm": 1.21875, "learning_rate": 3.031843256276263e-05, "loss": 0.7564846277236938, "step": 967 }, { "epoch": 0.3759953389007574, "grad_norm": 1.1640625, "learning_rate": 3.0299036905262323e-05, "loss": 1.0204367637634277, "step": 968 }, { "epoch": 0.3763837638376384, "grad_norm": 1.0859375, "learning_rate": 3.027962805737951e-05, "loss": 0.8794606924057007, "step": 969 }, { "epoch": 0.37677218877451935, "grad_norm": 1.125, "learning_rate": 3.0260206043971857e-05, "loss": 1.219992756843567, "step": 970 }, { "epoch": 0.37716061371140025, "grad_norm": 1.078125, "learning_rate": 3.0240770889913914e-05, "loss": 0.9973229765892029, "step": 971 }, { "epoch": 0.3775490386482812, "grad_norm": 1.1484375, "learning_rate": 3.0221322620097047e-05, "loss": 0.9200574159622192, "step": 972 }, { "epoch": 0.37793746358516217, "grad_norm": 1.203125, "learning_rate": 3.0201861259429423e-05, "loss": 0.7741478681564331, "step": 973 }, { "epoch": 0.37832588852204313, "grad_norm": 1.3359375, "learning_rate": 3.0182386832835963e-05, "loss": 0.780112624168396, "step": 974 }, { "epoch": 0.3787143134589241, "grad_norm": 1.1484375, "learning_rate": 3.0162899365258343e-05, "loss": 0.7730796933174133, "step": 975 }, { "epoch": 0.379102738395805, "grad_norm": 1.25, "learning_rate": 3.0143398881654926e-05, "loss": 0.9476643204689026, "step": 976 }, { "epoch": 0.37949116333268595, "grad_norm": 1.375, "learning_rate": 3.0123885407000758e-05, "loss": 1.0431631803512573, "step": 977 }, { "epoch": 0.3798795882695669, "grad_norm": 1.015625, "learning_rate": 3.0104358966287503e-05, "loss": 0.8432206511497498, "step": 978 }, { "epoch": 0.3802680132064479, "grad_norm": 1.0078125, "learning_rate": 3.008481958452345e-05, "loss": 0.8431333899497986, "step": 979 }, { "epoch": 0.3806564381433288, "grad_norm": 0.96875, "learning_rate": 3.0065267286733454e-05, "loss": 0.953839898109436, "step": 980 }, { "epoch": 0.38104486308020974, "grad_norm": 0.94140625, "learning_rate": 3.0045702097958918e-05, "loss": 0.9332283735275269, "step": 981 }, { "epoch": 0.3814332880170907, "grad_norm": 1.0703125, "learning_rate": 3.002612404325774e-05, "loss": 1.0046453475952148, "step": 982 }, { "epoch": 0.38182171295397166, "grad_norm": 1.09375, "learning_rate": 3.000653314770432e-05, "loss": 0.7184510231018066, "step": 983 }, { "epoch": 0.3822101378908526, "grad_norm": 1.2109375, "learning_rate": 2.998692943638948e-05, "loss": 1.1594839096069336, "step": 984 }, { "epoch": 0.3825985628277335, "grad_norm": 1.046875, "learning_rate": 2.9967312934420465e-05, "loss": 0.9754316806793213, "step": 985 }, { "epoch": 0.3829869877646145, "grad_norm": 1.0078125, "learning_rate": 2.9947683666920913e-05, "loss": 0.8158805966377258, "step": 986 }, { "epoch": 0.38337541270149544, "grad_norm": 1.0390625, "learning_rate": 2.9928041659030787e-05, "loss": 1.1685631275177002, "step": 987 }, { "epoch": 0.3837638376383764, "grad_norm": 1.0078125, "learning_rate": 2.990838693590639e-05, "loss": 1.1294702291488647, "step": 988 }, { "epoch": 0.3841522625752573, "grad_norm": 1.1484375, "learning_rate": 2.98887195227203e-05, "loss": 1.1791685819625854, "step": 989 }, { "epoch": 0.38454068751213827, "grad_norm": 1.2421875, "learning_rate": 2.986903944466134e-05, "loss": 0.9037692546844482, "step": 990 }, { "epoch": 0.38492911244901923, "grad_norm": 1.0390625, "learning_rate": 2.9849346726934576e-05, "loss": 1.145851731300354, "step": 991 }, { "epoch": 0.3853175373859002, "grad_norm": 1.21875, "learning_rate": 2.982964139476124e-05, "loss": 1.1200162172317505, "step": 992 }, { "epoch": 0.38570596232278115, "grad_norm": 1.1328125, "learning_rate": 2.9809923473378722e-05, "loss": 1.3885529041290283, "step": 993 }, { "epoch": 0.38609438725966205, "grad_norm": 1.2265625, "learning_rate": 2.9790192988040548e-05, "loss": 1.084347128868103, "step": 994 }, { "epoch": 0.386482812196543, "grad_norm": 1.2734375, "learning_rate": 2.9770449964016325e-05, "loss": 1.0061254501342773, "step": 995 }, { "epoch": 0.386871237133424, "grad_norm": 1.2109375, "learning_rate": 2.9750694426591725e-05, "loss": 1.1881721019744873, "step": 996 }, { "epoch": 0.38725966207030493, "grad_norm": 1.234375, "learning_rate": 2.9730926401068442e-05, "loss": 0.9431416392326355, "step": 997 }, { "epoch": 0.38764808700718584, "grad_norm": 1.171875, "learning_rate": 2.9711145912764165e-05, "loss": 0.9370285272598267, "step": 998 }, { "epoch": 0.3880365119440668, "grad_norm": 1.234375, "learning_rate": 2.969135298701255e-05, "loss": 1.227623462677002, "step": 999 }, { "epoch": 0.38842493688094776, "grad_norm": 1.1875, "learning_rate": 2.967154764916316e-05, "loss": 0.8695383667945862, "step": 1000 }, { "epoch": 0.3888133618178287, "grad_norm": 1.0546875, "learning_rate": 2.9651729924581496e-05, "loss": 0.7729289531707764, "step": 1001 }, { "epoch": 0.3892017867547097, "grad_norm": 1.1015625, "learning_rate": 2.9631899838648887e-05, "loss": 0.9160133004188538, "step": 1002 }, { "epoch": 0.3895902116915906, "grad_norm": 1.0859375, "learning_rate": 2.961205741676251e-05, "loss": 0.8569009304046631, "step": 1003 }, { "epoch": 0.38997863662847154, "grad_norm": 1.125, "learning_rate": 2.9592202684335326e-05, "loss": 1.0961055755615234, "step": 1004 }, { "epoch": 0.3903670615653525, "grad_norm": 1.0234375, "learning_rate": 2.957233566679608e-05, "loss": 0.8225727081298828, "step": 1005 }, { "epoch": 0.39075548650223346, "grad_norm": 1.03125, "learning_rate": 2.9552456389589248e-05, "loss": 0.9165477752685547, "step": 1006 }, { "epoch": 0.39114391143911437, "grad_norm": 1.09375, "learning_rate": 2.9532564878175002e-05, "loss": 1.1189424991607666, "step": 1007 }, { "epoch": 0.3915323363759953, "grad_norm": 0.890625, "learning_rate": 2.951266115802918e-05, "loss": 0.7410985231399536, "step": 1008 }, { "epoch": 0.3919207613128763, "grad_norm": 1.1484375, "learning_rate": 2.9492745254643264e-05, "loss": 0.9990947246551514, "step": 1009 }, { "epoch": 0.39230918624975725, "grad_norm": 1.1171875, "learning_rate": 2.947281719352434e-05, "loss": 0.9761753082275391, "step": 1010 }, { "epoch": 0.3926976111866382, "grad_norm": 1.1484375, "learning_rate": 2.9452877000195056e-05, "loss": 0.9839602708816528, "step": 1011 }, { "epoch": 0.3930860361235191, "grad_norm": 1.0546875, "learning_rate": 2.943292470019361e-05, "loss": 0.9511690139770508, "step": 1012 }, { "epoch": 0.39347446106040007, "grad_norm": 0.99609375, "learning_rate": 2.9412960319073694e-05, "loss": 0.9323832988739014, "step": 1013 }, { "epoch": 0.39386288599728103, "grad_norm": 1.1328125, "learning_rate": 2.939298388240448e-05, "loss": 0.9808206558227539, "step": 1014 }, { "epoch": 0.394251310934162, "grad_norm": 1.0, "learning_rate": 2.9372995415770565e-05, "loss": 0.8046866655349731, "step": 1015 }, { "epoch": 0.3946397358710429, "grad_norm": 1.015625, "learning_rate": 2.935299494477199e-05, "loss": 0.9068353772163391, "step": 1016 }, { "epoch": 0.39502816080792386, "grad_norm": 1.234375, "learning_rate": 2.9332982495024133e-05, "loss": 0.9176714420318604, "step": 1017 }, { "epoch": 0.3954165857448048, "grad_norm": 1.0078125, "learning_rate": 2.9312958092157724e-05, "loss": 0.9024088382720947, "step": 1018 }, { "epoch": 0.3958050106816858, "grad_norm": 1.140625, "learning_rate": 2.929292176181881e-05, "loss": 0.864129900932312, "step": 1019 }, { "epoch": 0.39619343561856674, "grad_norm": 1.3046875, "learning_rate": 2.9272873529668708e-05, "loss": 0.9462385177612305, "step": 1020 }, { "epoch": 0.39658186055544764, "grad_norm": 1.109375, "learning_rate": 2.9252813421383978e-05, "loss": 1.0106253623962402, "step": 1021 }, { "epoch": 0.3969702854923286, "grad_norm": 1.09375, "learning_rate": 2.92327414626564e-05, "loss": 0.877783477306366, "step": 1022 }, { "epoch": 0.39735871042920956, "grad_norm": 1.03125, "learning_rate": 2.921265767919291e-05, "loss": 0.8242053389549255, "step": 1023 }, { "epoch": 0.3977471353660905, "grad_norm": 1.0703125, "learning_rate": 2.919256209671561e-05, "loss": 0.8150394558906555, "step": 1024 }, { "epoch": 0.3981355603029714, "grad_norm": 1.546875, "learning_rate": 2.91724547409617e-05, "loss": 0.7928986549377441, "step": 1025 }, { "epoch": 0.3985239852398524, "grad_norm": 1.1015625, "learning_rate": 2.9152335637683463e-05, "loss": 1.2459477186203003, "step": 1026 }, { "epoch": 0.39891241017673335, "grad_norm": 1.328125, "learning_rate": 2.9132204812648227e-05, "loss": 1.0400315523147583, "step": 1027 }, { "epoch": 0.3993008351136143, "grad_norm": 1.8046875, "learning_rate": 2.9112062291638338e-05, "loss": 1.6447676420211792, "step": 1028 }, { "epoch": 0.39968926005049527, "grad_norm": 0.984375, "learning_rate": 2.9091908100451108e-05, "loss": 0.9444699287414551, "step": 1029 }, { "epoch": 0.40007768498737617, "grad_norm": 1.078125, "learning_rate": 2.907174226489882e-05, "loss": 0.8932580947875977, "step": 1030 }, { "epoch": 0.40046610992425713, "grad_norm": 1.1484375, "learning_rate": 2.9051564810808638e-05, "loss": 0.9296973347663879, "step": 1031 }, { "epoch": 0.4008545348611381, "grad_norm": 1.171875, "learning_rate": 2.9031375764022627e-05, "loss": 0.8521270751953125, "step": 1032 }, { "epoch": 0.40124295979801905, "grad_norm": 1.3046875, "learning_rate": 2.9011175150397702e-05, "loss": 1.0031332969665527, "step": 1033 }, { "epoch": 0.40163138473489995, "grad_norm": 1.2578125, "learning_rate": 2.8990962995805577e-05, "loss": 1.1494348049163818, "step": 1034 }, { "epoch": 0.4020198096717809, "grad_norm": 1.1796875, "learning_rate": 2.897073932613276e-05, "loss": 1.078002691268921, "step": 1035 }, { "epoch": 0.4024082346086619, "grad_norm": 1.078125, "learning_rate": 2.895050416728049e-05, "loss": 1.2506316900253296, "step": 1036 }, { "epoch": 0.40279665954554283, "grad_norm": 1.1484375, "learning_rate": 2.8930257545164756e-05, "loss": 0.7518218159675598, "step": 1037 }, { "epoch": 0.4031850844824238, "grad_norm": 1.5390625, "learning_rate": 2.8909999485716187e-05, "loss": 1.0911829471588135, "step": 1038 }, { "epoch": 0.4035735094193047, "grad_norm": 1.171875, "learning_rate": 2.8889730014880076e-05, "loss": 1.0312256813049316, "step": 1039 }, { "epoch": 0.40396193435618566, "grad_norm": 1.0390625, "learning_rate": 2.8869449158616344e-05, "loss": 0.8731828927993774, "step": 1040 }, { "epoch": 0.4043503592930666, "grad_norm": 0.98828125, "learning_rate": 2.884915694289948e-05, "loss": 0.9138638973236084, "step": 1041 }, { "epoch": 0.4047387842299476, "grad_norm": 1.109375, "learning_rate": 2.882885339371852e-05, "loss": 0.8389953970909119, "step": 1042 }, { "epoch": 0.4051272091668285, "grad_norm": 1.125, "learning_rate": 2.880853853707703e-05, "loss": 1.0117634534835815, "step": 1043 }, { "epoch": 0.40551563410370944, "grad_norm": 1.0234375, "learning_rate": 2.8788212398993036e-05, "loss": 1.0204559564590454, "step": 1044 }, { "epoch": 0.4059040590405904, "grad_norm": 1.3984375, "learning_rate": 2.8767875005499034e-05, "loss": 0.9120598435401917, "step": 1045 }, { "epoch": 0.40629248397747136, "grad_norm": 1.109375, "learning_rate": 2.874752638264191e-05, "loss": 0.9504331350326538, "step": 1046 }, { "epoch": 0.4066809089143523, "grad_norm": 1.1171875, "learning_rate": 2.8727166556482962e-05, "loss": 1.0870546102523804, "step": 1047 }, { "epoch": 0.40706933385123323, "grad_norm": 1.21875, "learning_rate": 2.8706795553097824e-05, "loss": 0.7560852766036987, "step": 1048 }, { "epoch": 0.4074577587881142, "grad_norm": 1.046875, "learning_rate": 2.8686413398576425e-05, "loss": 0.9064743518829346, "step": 1049 }, { "epoch": 0.40784618372499515, "grad_norm": 1.1328125, "learning_rate": 2.866602011902301e-05, "loss": 1.0776864290237427, "step": 1050 }, { "epoch": 0.4082346086618761, "grad_norm": 1.0546875, "learning_rate": 2.8645615740556057e-05, "loss": 1.127287745475769, "step": 1051 }, { "epoch": 0.408623033598757, "grad_norm": 1.15625, "learning_rate": 2.8625200289308242e-05, "loss": 1.0801074504852295, "step": 1052 }, { "epoch": 0.409011458535638, "grad_norm": 1.171875, "learning_rate": 2.8604773791426454e-05, "loss": 0.9432052969932556, "step": 1053 }, { "epoch": 0.40939988347251893, "grad_norm": 1.0859375, "learning_rate": 2.858433627307171e-05, "loss": 0.9161993265151978, "step": 1054 }, { "epoch": 0.4097883084093999, "grad_norm": 1.015625, "learning_rate": 2.856388776041914e-05, "loss": 0.932655930519104, "step": 1055 }, { "epoch": 0.41017673334628085, "grad_norm": 1.3359375, "learning_rate": 2.8543428279657965e-05, "loss": 1.0417531728744507, "step": 1056 }, { "epoch": 0.41056515828316176, "grad_norm": 1.1015625, "learning_rate": 2.8522957856991446e-05, "loss": 0.9064041376113892, "step": 1057 }, { "epoch": 0.4109535832200427, "grad_norm": 1.078125, "learning_rate": 2.850247651863686e-05, "loss": 0.8767149448394775, "step": 1058 }, { "epoch": 0.4113420081569237, "grad_norm": 1.0546875, "learning_rate": 2.848198429082546e-05, "loss": 1.3994683027267456, "step": 1059 }, { "epoch": 0.41173043309380464, "grad_norm": 1.0625, "learning_rate": 2.846148119980246e-05, "loss": 1.0234884023666382, "step": 1060 }, { "epoch": 0.4121188580306856, "grad_norm": 1.0, "learning_rate": 2.8440967271826978e-05, "loss": 0.8757262229919434, "step": 1061 }, { "epoch": 0.4125072829675665, "grad_norm": 1.1015625, "learning_rate": 2.8420442533171995e-05, "loss": 1.017824649810791, "step": 1062 }, { "epoch": 0.41289570790444746, "grad_norm": 1.1640625, "learning_rate": 2.839990701012437e-05, "loss": 1.095155954360962, "step": 1063 }, { "epoch": 0.4132841328413284, "grad_norm": 1.0859375, "learning_rate": 2.8379360728984742e-05, "loss": 0.9740644693374634, "step": 1064 }, { "epoch": 0.4136725577782094, "grad_norm": 1.3125, "learning_rate": 2.8358803716067557e-05, "loss": 0.9040789604187012, "step": 1065 }, { "epoch": 0.4140609827150903, "grad_norm": 1.2421875, "learning_rate": 2.833823599770098e-05, "loss": 0.8548009395599365, "step": 1066 }, { "epoch": 0.41444940765197125, "grad_norm": 1.2734375, "learning_rate": 2.8317657600226903e-05, "loss": 0.8083251118659973, "step": 1067 }, { "epoch": 0.4148378325888522, "grad_norm": 1.15625, "learning_rate": 2.8297068550000884e-05, "loss": 0.8326992988586426, "step": 1068 }, { "epoch": 0.41522625752573317, "grad_norm": 1.1953125, "learning_rate": 2.8276468873392154e-05, "loss": 1.1705631017684937, "step": 1069 }, { "epoch": 0.4156146824626141, "grad_norm": 1.2109375, "learning_rate": 2.8255858596783497e-05, "loss": 1.2045000791549683, "step": 1070 }, { "epoch": 0.41600310739949503, "grad_norm": 1.3359375, "learning_rate": 2.8235237746571335e-05, "loss": 1.2351481914520264, "step": 1071 }, { "epoch": 0.416391532336376, "grad_norm": 1.3984375, "learning_rate": 2.8214606349165587e-05, "loss": 1.3715277910232544, "step": 1072 }, { "epoch": 0.41677995727325695, "grad_norm": 1.1015625, "learning_rate": 2.8193964430989713e-05, "loss": 0.9587658643722534, "step": 1073 }, { "epoch": 0.4171683822101379, "grad_norm": 1.2734375, "learning_rate": 2.8173312018480615e-05, "loss": 1.111472487449646, "step": 1074 }, { "epoch": 0.4175568071470188, "grad_norm": 1.1328125, "learning_rate": 2.815264913808866e-05, "loss": 0.895481288433075, "step": 1075 }, { "epoch": 0.4179452320838998, "grad_norm": 1.203125, "learning_rate": 2.813197581627761e-05, "loss": 1.0088659524917603, "step": 1076 }, { "epoch": 0.41833365702078074, "grad_norm": 1.1640625, "learning_rate": 2.81112920795246e-05, "loss": 0.7512674331665039, "step": 1077 }, { "epoch": 0.4187220819576617, "grad_norm": 1.390625, "learning_rate": 2.8090597954320116e-05, "loss": 1.0443449020385742, "step": 1078 }, { "epoch": 0.41911050689454266, "grad_norm": 1.0859375, "learning_rate": 2.806989346716794e-05, "loss": 0.9898313879966736, "step": 1079 }, { "epoch": 0.41949893183142356, "grad_norm": 1.2578125, "learning_rate": 2.804917864458511e-05, "loss": 1.0903726816177368, "step": 1080 }, { "epoch": 0.4198873567683045, "grad_norm": 1.1328125, "learning_rate": 2.8028453513101927e-05, "loss": 1.0957306623458862, "step": 1081 }, { "epoch": 0.4202757817051855, "grad_norm": 1.109375, "learning_rate": 2.8007718099261886e-05, "loss": 0.9879660606384277, "step": 1082 }, { "epoch": 0.42066420664206644, "grad_norm": 1.2421875, "learning_rate": 2.798697242962164e-05, "loss": 1.1972194910049438, "step": 1083 }, { "epoch": 0.42105263157894735, "grad_norm": 1.171875, "learning_rate": 2.7966216530751e-05, "loss": 1.2710044384002686, "step": 1084 }, { "epoch": 0.4214410565158283, "grad_norm": 1.0390625, "learning_rate": 2.794545042923285e-05, "loss": 1.0363808870315552, "step": 1085 }, { "epoch": 0.42182948145270927, "grad_norm": 1.296875, "learning_rate": 2.7924674151663166e-05, "loss": 1.1054917573928833, "step": 1086 }, { "epoch": 0.4222179063895902, "grad_norm": 1.203125, "learning_rate": 2.790388772465093e-05, "loss": 0.9483585357666016, "step": 1087 }, { "epoch": 0.4226063313264712, "grad_norm": 1.1484375, "learning_rate": 2.788309117481815e-05, "loss": 1.0857878923416138, "step": 1088 }, { "epoch": 0.4229947562633521, "grad_norm": 1.2421875, "learning_rate": 2.7862284528799784e-05, "loss": 1.0175448656082153, "step": 1089 }, { "epoch": 0.42338318120023305, "grad_norm": 0.96484375, "learning_rate": 2.7841467813243723e-05, "loss": 0.8457869291305542, "step": 1090 }, { "epoch": 0.423771606137114, "grad_norm": 1.3359375, "learning_rate": 2.782064105481076e-05, "loss": 1.1639456748962402, "step": 1091 }, { "epoch": 0.42416003107399497, "grad_norm": 1.0, "learning_rate": 2.7799804280174547e-05, "loss": 0.8100476861000061, "step": 1092 }, { "epoch": 0.4245484560108759, "grad_norm": 1.3203125, "learning_rate": 2.7778957516021547e-05, "loss": 0.9809389114379883, "step": 1093 }, { "epoch": 0.42493688094775683, "grad_norm": 1.34375, "learning_rate": 2.775810078905105e-05, "loss": 1.1469765901565552, "step": 1094 }, { "epoch": 0.4253253058846378, "grad_norm": 1.0546875, "learning_rate": 2.7737234125975082e-05, "loss": 0.972954273223877, "step": 1095 }, { "epoch": 0.42571373082151875, "grad_norm": 1.203125, "learning_rate": 2.7716357553518393e-05, "loss": 1.0747630596160889, "step": 1096 }, { "epoch": 0.4261021557583997, "grad_norm": 1.28125, "learning_rate": 2.7695471098418445e-05, "loss": 0.8717402815818787, "step": 1097 }, { "epoch": 0.4264905806952806, "grad_norm": 0.97265625, "learning_rate": 2.767457478742533e-05, "loss": 1.1207953691482544, "step": 1098 }, { "epoch": 0.4268790056321616, "grad_norm": 1.2265625, "learning_rate": 2.7653668647301797e-05, "loss": 1.0771404504776, "step": 1099 }, { "epoch": 0.42726743056904254, "grad_norm": 1.171875, "learning_rate": 2.7632752704823152e-05, "loss": 0.9350523948669434, "step": 1100 }, { "epoch": 0.4276558555059235, "grad_norm": 1.3125, "learning_rate": 2.761182698677726e-05, "loss": 1.0086420774459839, "step": 1101 }, { "epoch": 0.4280442804428044, "grad_norm": 1.1171875, "learning_rate": 2.7590891519964523e-05, "loss": 0.8167951107025146, "step": 1102 }, { "epoch": 0.42843270537968536, "grad_norm": 1.234375, "learning_rate": 2.7569946331197815e-05, "loss": 0.8619136810302734, "step": 1103 }, { "epoch": 0.4288211303165663, "grad_norm": 1.09375, "learning_rate": 2.7548991447302475e-05, "loss": 1.072274088859558, "step": 1104 }, { "epoch": 0.4292095552534473, "grad_norm": 1.1015625, "learning_rate": 2.7528026895116232e-05, "loss": 1.039091944694519, "step": 1105 }, { "epoch": 0.42959798019032824, "grad_norm": 1.03125, "learning_rate": 2.7507052701489222e-05, "loss": 1.0185226202011108, "step": 1106 }, { "epoch": 0.42998640512720915, "grad_norm": 1.0390625, "learning_rate": 2.748606889328392e-05, "loss": 1.066543698310852, "step": 1107 }, { "epoch": 0.4303748300640901, "grad_norm": 1.234375, "learning_rate": 2.746507549737512e-05, "loss": 1.0416185855865479, "step": 1108 }, { "epoch": 0.43076325500097107, "grad_norm": 1.0390625, "learning_rate": 2.7444072540649885e-05, "loss": 0.8940176963806152, "step": 1109 }, { "epoch": 0.43115167993785203, "grad_norm": 1.078125, "learning_rate": 2.7423060050007537e-05, "loss": 0.8502472043037415, "step": 1110 }, { "epoch": 0.43154010487473293, "grad_norm": 1.0625, "learning_rate": 2.7402038052359593e-05, "loss": 0.9548631906509399, "step": 1111 }, { "epoch": 0.4319285298116139, "grad_norm": 1.0078125, "learning_rate": 2.7381006574629764e-05, "loss": 0.8037289381027222, "step": 1112 }, { "epoch": 0.43231695474849485, "grad_norm": 1.1171875, "learning_rate": 2.7359965643753886e-05, "loss": 0.8383102416992188, "step": 1113 }, { "epoch": 0.4327053796853758, "grad_norm": 1.1953125, "learning_rate": 2.733891528667991e-05, "loss": 0.9431712627410889, "step": 1114 }, { "epoch": 0.4330938046222568, "grad_norm": 1.140625, "learning_rate": 2.731785553036786e-05, "loss": 0.7477529048919678, "step": 1115 }, { "epoch": 0.4334822295591377, "grad_norm": 1.203125, "learning_rate": 2.7296786401789788e-05, "loss": 0.9400382041931152, "step": 1116 }, { "epoch": 0.43387065449601864, "grad_norm": 1.1953125, "learning_rate": 2.727570792792977e-05, "loss": 0.9733121991157532, "step": 1117 }, { "epoch": 0.4342590794328996, "grad_norm": 1.0703125, "learning_rate": 2.7254620135783826e-05, "loss": 0.8205686211585999, "step": 1118 }, { "epoch": 0.43464750436978056, "grad_norm": 1.1328125, "learning_rate": 2.7233523052359923e-05, "loss": 1.1474733352661133, "step": 1119 }, { "epoch": 0.43503592930666146, "grad_norm": 1.1640625, "learning_rate": 2.7212416704677945e-05, "loss": 1.0017077922821045, "step": 1120 }, { "epoch": 0.4354243542435424, "grad_norm": 1.171875, "learning_rate": 2.7191301119769603e-05, "loss": 0.8891340494155884, "step": 1121 }, { "epoch": 0.4358127791804234, "grad_norm": 1.3984375, "learning_rate": 2.7170176324678466e-05, "loss": 1.1092190742492676, "step": 1122 }, { "epoch": 0.43620120411730434, "grad_norm": 1.4453125, "learning_rate": 2.7149042346459896e-05, "loss": 0.9241911172866821, "step": 1123 }, { "epoch": 0.4365896290541853, "grad_norm": 1.0546875, "learning_rate": 2.7127899212181007e-05, "loss": 0.7552111744880676, "step": 1124 }, { "epoch": 0.4369780539910662, "grad_norm": 1.0390625, "learning_rate": 2.7106746948920645e-05, "loss": 0.8830710649490356, "step": 1125 }, { "epoch": 0.43736647892794717, "grad_norm": 1.0703125, "learning_rate": 2.708558558376935e-05, "loss": 0.9027407169342041, "step": 1126 }, { "epoch": 0.4377549038648281, "grad_norm": 1.25, "learning_rate": 2.706441514382931e-05, "loss": 0.9470610022544861, "step": 1127 }, { "epoch": 0.4381433288017091, "grad_norm": 1.1171875, "learning_rate": 2.704323565621435e-05, "loss": 0.8240618109703064, "step": 1128 }, { "epoch": 0.43853175373859, "grad_norm": 1.171875, "learning_rate": 2.7022047148049865e-05, "loss": 0.989982008934021, "step": 1129 }, { "epoch": 0.43892017867547095, "grad_norm": 1.09375, "learning_rate": 2.7000849646472826e-05, "loss": 1.0662144422531128, "step": 1130 }, { "epoch": 0.4393086036123519, "grad_norm": 1.1640625, "learning_rate": 2.6979643178631696e-05, "loss": 0.8923007845878601, "step": 1131 }, { "epoch": 0.43969702854923287, "grad_norm": 1.15625, "learning_rate": 2.6958427771686442e-05, "loss": 0.8188368082046509, "step": 1132 }, { "epoch": 0.44008545348611383, "grad_norm": 1.078125, "learning_rate": 2.6937203452808467e-05, "loss": 1.0472534894943237, "step": 1133 }, { "epoch": 0.44047387842299474, "grad_norm": 1.1015625, "learning_rate": 2.6915970249180595e-05, "loss": 0.8896868824958801, "step": 1134 }, { "epoch": 0.4408623033598757, "grad_norm": 1.1875, "learning_rate": 2.6894728187997033e-05, "loss": 0.9350288510322571, "step": 1135 }, { "epoch": 0.44125072829675666, "grad_norm": 1.109375, "learning_rate": 2.6873477296463314e-05, "loss": 1.0610427856445312, "step": 1136 }, { "epoch": 0.4416391532336376, "grad_norm": 1.2265625, "learning_rate": 2.68522176017963e-05, "loss": 1.037235140800476, "step": 1137 }, { "epoch": 0.4420275781705185, "grad_norm": 1.0859375, "learning_rate": 2.6830949131224118e-05, "loss": 0.9188534617424011, "step": 1138 }, { "epoch": 0.4424160031073995, "grad_norm": 1.0859375, "learning_rate": 2.680967191198614e-05, "loss": 0.9437439441680908, "step": 1139 }, { "epoch": 0.44280442804428044, "grad_norm": 1.3203125, "learning_rate": 2.678838597133293e-05, "loss": 0.8971627354621887, "step": 1140 }, { "epoch": 0.4431928529811614, "grad_norm": 0.984375, "learning_rate": 2.676709133652624e-05, "loss": 0.9407253265380859, "step": 1141 }, { "epoch": 0.44358127791804236, "grad_norm": 1.0078125, "learning_rate": 2.674578803483894e-05, "loss": 0.9256010055541992, "step": 1142 }, { "epoch": 0.44396970285492326, "grad_norm": 1.25, "learning_rate": 2.6724476093555022e-05, "loss": 0.9604409337043762, "step": 1143 }, { "epoch": 0.4443581277918042, "grad_norm": 1.359375, "learning_rate": 2.670315553996952e-05, "loss": 1.0502903461456299, "step": 1144 }, { "epoch": 0.4447465527286852, "grad_norm": 1.1328125, "learning_rate": 2.6681826401388507e-05, "loss": 0.7921780347824097, "step": 1145 }, { "epoch": 0.44513497766556614, "grad_norm": 1.09375, "learning_rate": 2.6660488705129054e-05, "loss": 0.9469732046127319, "step": 1146 }, { "epoch": 0.44552340260244705, "grad_norm": 1.28125, "learning_rate": 2.6639142478519184e-05, "loss": 0.844677746295929, "step": 1147 }, { "epoch": 0.445911827539328, "grad_norm": 1.1796875, "learning_rate": 2.6617787748897864e-05, "loss": 1.271870493888855, "step": 1148 }, { "epoch": 0.44630025247620897, "grad_norm": 1.03125, "learning_rate": 2.659642454361492e-05, "loss": 0.9017252922058105, "step": 1149 }, { "epoch": 0.44668867741308993, "grad_norm": 1.109375, "learning_rate": 2.657505289003106e-05, "loss": 0.8206577301025391, "step": 1150 }, { "epoch": 0.4470771023499709, "grad_norm": 1.9765625, "learning_rate": 2.655367281551781e-05, "loss": 1.0907068252563477, "step": 1151 }, { "epoch": 0.4474655272868518, "grad_norm": 1.171875, "learning_rate": 2.653228434745746e-05, "loss": 0.9864065051078796, "step": 1152 }, { "epoch": 0.44785395222373275, "grad_norm": 0.96875, "learning_rate": 2.6510887513243075e-05, "loss": 1.0641767978668213, "step": 1153 }, { "epoch": 0.4482423771606137, "grad_norm": 1.15625, "learning_rate": 2.648948234027842e-05, "loss": 1.0065207481384277, "step": 1154 }, { "epoch": 0.4486308020974947, "grad_norm": 1.1328125, "learning_rate": 2.646806885597795e-05, "loss": 0.9280490875244141, "step": 1155 }, { "epoch": 0.44901922703437563, "grad_norm": 1.1796875, "learning_rate": 2.6446647087766746e-05, "loss": 0.9078973531723022, "step": 1156 }, { "epoch": 0.44940765197125654, "grad_norm": 1.2109375, "learning_rate": 2.642521706308052e-05, "loss": 1.1822282075881958, "step": 1157 }, { "epoch": 0.4497960769081375, "grad_norm": 1.078125, "learning_rate": 2.640377880936554e-05, "loss": 1.1624215841293335, "step": 1158 }, { "epoch": 0.45018450184501846, "grad_norm": 1.15625, "learning_rate": 2.6382332354078638e-05, "loss": 0.9183681011199951, "step": 1159 }, { "epoch": 0.4505729267818994, "grad_norm": 1.2265625, "learning_rate": 2.6360877724687123e-05, "loss": 1.000440001487732, "step": 1160 }, { "epoch": 0.4509613517187803, "grad_norm": 1.09375, "learning_rate": 2.6339414948668785e-05, "loss": 0.8714352250099182, "step": 1161 }, { "epoch": 0.4513497766556613, "grad_norm": 1.0703125, "learning_rate": 2.6317944053511853e-05, "loss": 0.8826557397842407, "step": 1162 }, { "epoch": 0.45173820159254224, "grad_norm": 1.0390625, "learning_rate": 2.629646506671494e-05, "loss": 0.8461215496063232, "step": 1163 }, { "epoch": 0.4521266265294232, "grad_norm": 1.0546875, "learning_rate": 2.627497801578704e-05, "loss": 0.8656233549118042, "step": 1164 }, { "epoch": 0.45251505146630416, "grad_norm": 1.4453125, "learning_rate": 2.625348292824747e-05, "loss": 1.2698265314102173, "step": 1165 }, { "epoch": 0.45290347640318507, "grad_norm": 1.1796875, "learning_rate": 2.623197983162582e-05, "loss": 0.8723911643028259, "step": 1166 }, { "epoch": 0.45329190134006603, "grad_norm": 1.2109375, "learning_rate": 2.6210468753461965e-05, "loss": 1.2475305795669556, "step": 1167 }, { "epoch": 0.453680326276947, "grad_norm": 1.2578125, "learning_rate": 2.6188949721305977e-05, "loss": 0.9678373336791992, "step": 1168 }, { "epoch": 0.45406875121382795, "grad_norm": 1.234375, "learning_rate": 2.616742276271815e-05, "loss": 0.9492830038070679, "step": 1169 }, { "epoch": 0.45445717615070885, "grad_norm": 1.9921875, "learning_rate": 2.6145887905268893e-05, "loss": 1.1320171356201172, "step": 1170 }, { "epoch": 0.4548456010875898, "grad_norm": 1.203125, "learning_rate": 2.612434517653875e-05, "loss": 0.9606800079345703, "step": 1171 }, { "epoch": 0.4552340260244708, "grad_norm": 1.09375, "learning_rate": 2.6102794604118345e-05, "loss": 0.892968475818634, "step": 1172 }, { "epoch": 0.45562245096135173, "grad_norm": 1.1015625, "learning_rate": 2.6081236215608346e-05, "loss": 0.889034628868103, "step": 1173 }, { "epoch": 0.4560108758982327, "grad_norm": 1.140625, "learning_rate": 2.6059670038619426e-05, "loss": 1.0893499851226807, "step": 1174 }, { "epoch": 0.4563993008351136, "grad_norm": 1.3515625, "learning_rate": 2.6038096100772245e-05, "loss": 1.0053839683532715, "step": 1175 }, { "epoch": 0.45678772577199456, "grad_norm": 1.0546875, "learning_rate": 2.6016514429697397e-05, "loss": 0.9552468061447144, "step": 1176 }, { "epoch": 0.4571761507088755, "grad_norm": 1.0859375, "learning_rate": 2.5994925053035368e-05, "loss": 0.7734244465827942, "step": 1177 }, { "epoch": 0.4575645756457565, "grad_norm": 1.140625, "learning_rate": 2.5973327998436527e-05, "loss": 1.1894118785858154, "step": 1178 }, { "epoch": 0.4579530005826374, "grad_norm": 1.28125, "learning_rate": 2.5951723293561074e-05, "loss": 1.260696530342102, "step": 1179 }, { "epoch": 0.45834142551951834, "grad_norm": 1.1875, "learning_rate": 2.593011096607901e-05, "loss": 0.9973691701889038, "step": 1180 }, { "epoch": 0.4587298504563993, "grad_norm": 1.2421875, "learning_rate": 2.590849104367009e-05, "loss": 0.8152632713317871, "step": 1181 }, { "epoch": 0.45911827539328026, "grad_norm": 1.1484375, "learning_rate": 2.5886863554023807e-05, "loss": 1.048824667930603, "step": 1182 }, { "epoch": 0.4595067003301612, "grad_norm": 1.0703125, "learning_rate": 2.586522852483934e-05, "loss": 1.0371934175491333, "step": 1183 }, { "epoch": 0.4598951252670421, "grad_norm": 1.2734375, "learning_rate": 2.584358598382551e-05, "loss": 1.0076347589492798, "step": 1184 }, { "epoch": 0.4602835502039231, "grad_norm": 1.0234375, "learning_rate": 2.582193595870078e-05, "loss": 0.9807928800582886, "step": 1185 }, { "epoch": 0.46067197514080405, "grad_norm": 1.5078125, "learning_rate": 2.5800278477193196e-05, "loss": 1.0094497203826904, "step": 1186 }, { "epoch": 0.461060400077685, "grad_norm": 1.0546875, "learning_rate": 2.5778613567040348e-05, "loss": 0.9527488946914673, "step": 1187 }, { "epoch": 0.4614488250145659, "grad_norm": 1.09375, "learning_rate": 2.575694125598933e-05, "loss": 0.9029735326766968, "step": 1188 }, { "epoch": 0.46183724995144687, "grad_norm": 1.109375, "learning_rate": 2.5735261571796734e-05, "loss": 0.8638364672660828, "step": 1189 }, { "epoch": 0.46222567488832783, "grad_norm": 1.1171875, "learning_rate": 2.5713574542228584e-05, "loss": 0.8102701902389526, "step": 1190 }, { "epoch": 0.4626140998252088, "grad_norm": 1.1015625, "learning_rate": 2.5691880195060313e-05, "loss": 0.9498847723007202, "step": 1191 }, { "epoch": 0.46300252476208975, "grad_norm": 1.7109375, "learning_rate": 2.5670178558076724e-05, "loss": 1.043046474456787, "step": 1192 }, { "epoch": 0.46339094969897066, "grad_norm": 1.15625, "learning_rate": 2.5648469659071964e-05, "loss": 0.9823000431060791, "step": 1193 }, { "epoch": 0.4637793746358516, "grad_norm": 1.0546875, "learning_rate": 2.562675352584947e-05, "loss": 0.8637526035308838, "step": 1194 }, { "epoch": 0.4641677995727326, "grad_norm": 1.03125, "learning_rate": 2.5605030186221957e-05, "loss": 0.8792254328727722, "step": 1195 }, { "epoch": 0.46455622450961354, "grad_norm": 1.140625, "learning_rate": 2.5583299668011358e-05, "loss": 0.822880208492279, "step": 1196 }, { "epoch": 0.46494464944649444, "grad_norm": 1.140625, "learning_rate": 2.5561561999048804e-05, "loss": 0.9067865610122681, "step": 1197 }, { "epoch": 0.4653330743833754, "grad_norm": 1.3671875, "learning_rate": 2.5539817207174588e-05, "loss": 0.8090221881866455, "step": 1198 }, { "epoch": 0.46572149932025636, "grad_norm": 1.1640625, "learning_rate": 2.551806532023811e-05, "loss": 1.0030924081802368, "step": 1199 }, { "epoch": 0.4661099242571373, "grad_norm": 1.4296875, "learning_rate": 2.5496306366097884e-05, "loss": 1.1362268924713135, "step": 1200 }, { "epoch": 0.4664983491940183, "grad_norm": 1.1015625, "learning_rate": 2.5474540372621453e-05, "loss": 0.9318096041679382, "step": 1201 }, { "epoch": 0.4668867741308992, "grad_norm": 1.359375, "learning_rate": 2.545276736768538e-05, "loss": 1.1053574085235596, "step": 1202 }, { "epoch": 0.46727519906778014, "grad_norm": 1.1640625, "learning_rate": 2.5430987379175216e-05, "loss": 1.1659109592437744, "step": 1203 }, { "epoch": 0.4676636240046611, "grad_norm": 1.0390625, "learning_rate": 2.5409200434985453e-05, "loss": 0.8250728845596313, "step": 1204 }, { "epoch": 0.46805204894154206, "grad_norm": 1.0703125, "learning_rate": 2.5387406563019483e-05, "loss": 0.7915035486221313, "step": 1205 }, { "epoch": 0.46844047387842297, "grad_norm": 1.1953125, "learning_rate": 2.536560579118958e-05, "loss": 0.8570035099983215, "step": 1206 }, { "epoch": 0.46882889881530393, "grad_norm": 1.1015625, "learning_rate": 2.5343798147416853e-05, "loss": 1.0727910995483398, "step": 1207 }, { "epoch": 0.4692173237521849, "grad_norm": 1.28125, "learning_rate": 2.5321983659631207e-05, "loss": 0.7476020455360413, "step": 1208 }, { "epoch": 0.46960574868906585, "grad_norm": 1.1875, "learning_rate": 2.5300162355771322e-05, "loss": 0.8158115148544312, "step": 1209 }, { "epoch": 0.4699941736259468, "grad_norm": 1.265625, "learning_rate": 2.5278334263784587e-05, "loss": 0.9476877450942993, "step": 1210 }, { "epoch": 0.4703825985628277, "grad_norm": 1.109375, "learning_rate": 2.525649941162712e-05, "loss": 0.8309472799301147, "step": 1211 }, { "epoch": 0.4707710234997087, "grad_norm": 1.1875, "learning_rate": 2.523465782726366e-05, "loss": 1.0223169326782227, "step": 1212 }, { "epoch": 0.47115944843658963, "grad_norm": 1.2421875, "learning_rate": 2.52128095386676e-05, "loss": 0.7896978259086609, "step": 1213 }, { "epoch": 0.4715478733734706, "grad_norm": 1.0625, "learning_rate": 2.51909545738209e-05, "loss": 1.1669301986694336, "step": 1214 }, { "epoch": 0.4719362983103515, "grad_norm": 1.3046875, "learning_rate": 2.5169092960714063e-05, "loss": 1.024320363998413, "step": 1215 }, { "epoch": 0.47232472324723246, "grad_norm": 1.0703125, "learning_rate": 2.5147224727346132e-05, "loss": 0.9026036262512207, "step": 1216 }, { "epoch": 0.4727131481841134, "grad_norm": 1.1875, "learning_rate": 2.5125349901724613e-05, "loss": 0.8754507303237915, "step": 1217 }, { "epoch": 0.4731015731209944, "grad_norm": 1.015625, "learning_rate": 2.5103468511865456e-05, "loss": 0.8402960300445557, "step": 1218 }, { "epoch": 0.47348999805787534, "grad_norm": 1.1640625, "learning_rate": 2.5081580585793017e-05, "loss": 0.9732791185379028, "step": 1219 }, { "epoch": 0.47387842299475624, "grad_norm": 1.171875, "learning_rate": 2.5059686151540024e-05, "loss": 0.9550161957740784, "step": 1220 }, { "epoch": 0.4742668479316372, "grad_norm": 1.46875, "learning_rate": 2.503778523714755e-05, "loss": 0.8574624061584473, "step": 1221 }, { "epoch": 0.47465527286851816, "grad_norm": 1.15625, "learning_rate": 2.5015877870664956e-05, "loss": 0.9567421674728394, "step": 1222 }, { "epoch": 0.4750436978053991, "grad_norm": 1.2421875, "learning_rate": 2.4993964080149866e-05, "loss": 1.2577290534973145, "step": 1223 }, { "epoch": 0.47543212274228003, "grad_norm": 1.1953125, "learning_rate": 2.4972043893668137e-05, "loss": 1.0693660974502563, "step": 1224 }, { "epoch": 0.475820547679161, "grad_norm": 0.98828125, "learning_rate": 2.4950117339293813e-05, "loss": 0.9900684952735901, "step": 1225 }, { "epoch": 0.47620897261604195, "grad_norm": 1.09375, "learning_rate": 2.4928184445109108e-05, "loss": 0.8094958662986755, "step": 1226 }, { "epoch": 0.4765973975529229, "grad_norm": 1.375, "learning_rate": 2.4906245239204336e-05, "loss": 1.0122060775756836, "step": 1227 }, { "epoch": 0.47698582248980387, "grad_norm": 1.09375, "learning_rate": 2.48842997496779e-05, "loss": 0.9573850035667419, "step": 1228 }, { "epoch": 0.4773742474266848, "grad_norm": 1.2421875, "learning_rate": 2.4862348004636257e-05, "loss": 0.930952787399292, "step": 1229 }, { "epoch": 0.47776267236356573, "grad_norm": 1.0078125, "learning_rate": 2.484039003219387e-05, "loss": 0.9312894940376282, "step": 1230 }, { "epoch": 0.4781510973004467, "grad_norm": 0.9453125, "learning_rate": 2.481842586047318e-05, "loss": 0.8494778871536255, "step": 1231 }, { "epoch": 0.47853952223732765, "grad_norm": 1.125, "learning_rate": 2.479645551760457e-05, "loss": 0.9327241778373718, "step": 1232 }, { "epoch": 0.47892794717420856, "grad_norm": 1.0859375, "learning_rate": 2.477447903172632e-05, "loss": 0.7259552478790283, "step": 1233 }, { "epoch": 0.4793163721110895, "grad_norm": 1.1171875, "learning_rate": 2.4752496430984593e-05, "loss": 0.9579362869262695, "step": 1234 }, { "epoch": 0.4797047970479705, "grad_norm": 1.171875, "learning_rate": 2.4730507743533357e-05, "loss": 1.2147228717803955, "step": 1235 }, { "epoch": 0.48009322198485144, "grad_norm": 1.2578125, "learning_rate": 2.4708512997534397e-05, "loss": 0.9478230476379395, "step": 1236 }, { "epoch": 0.4804816469217324, "grad_norm": 1.078125, "learning_rate": 2.4686512221157254e-05, "loss": 0.8297654986381531, "step": 1237 }, { "epoch": 0.4808700718586133, "grad_norm": 1.1328125, "learning_rate": 2.466450544257919e-05, "loss": 1.01932692527771, "step": 1238 }, { "epoch": 0.48125849679549426, "grad_norm": 1.0703125, "learning_rate": 2.464249268998515e-05, "loss": 0.9348945617675781, "step": 1239 }, { "epoch": 0.4816469217323752, "grad_norm": 1.0546875, "learning_rate": 2.4620473991567734e-05, "loss": 1.0425999164581299, "step": 1240 }, { "epoch": 0.4820353466692562, "grad_norm": 1.046875, "learning_rate": 2.459844937552715e-05, "loss": 0.8968407511711121, "step": 1241 }, { "epoch": 0.48242377160613714, "grad_norm": 1.2890625, "learning_rate": 2.457641887007121e-05, "loss": 0.9091005325317383, "step": 1242 }, { "epoch": 0.48281219654301805, "grad_norm": 1.3125, "learning_rate": 2.4554382503415232e-05, "loss": 0.8658621907234192, "step": 1243 }, { "epoch": 0.483200621479899, "grad_norm": 0.97265625, "learning_rate": 2.4532340303782073e-05, "loss": 0.8619877099990845, "step": 1244 }, { "epoch": 0.48358904641677997, "grad_norm": 1.5703125, "learning_rate": 2.451029229940204e-05, "loss": 1.4322502613067627, "step": 1245 }, { "epoch": 0.4839774713536609, "grad_norm": 1.203125, "learning_rate": 2.4488238518512883e-05, "loss": 0.9151092767715454, "step": 1246 }, { "epoch": 0.48436589629054183, "grad_norm": 1.0390625, "learning_rate": 2.446617898935975e-05, "loss": 0.8717566728591919, "step": 1247 }, { "epoch": 0.4847543212274228, "grad_norm": 1.0625, "learning_rate": 2.4444113740195144e-05, "loss": 0.9148802757263184, "step": 1248 }, { "epoch": 0.48514274616430375, "grad_norm": 1.1875, "learning_rate": 2.4422042799278906e-05, "loss": 1.011900544166565, "step": 1249 }, { "epoch": 0.4855311711011847, "grad_norm": 1.1640625, "learning_rate": 2.4399966194878158e-05, "loss": 0.9038481712341309, "step": 1250 }, { "epoch": 0.48591959603806567, "grad_norm": 1.0546875, "learning_rate": 2.437788395526727e-05, "loss": 0.877526044845581, "step": 1251 }, { "epoch": 0.4863080209749466, "grad_norm": 1.5078125, "learning_rate": 2.4355796108727847e-05, "loss": 1.2066154479980469, "step": 1252 }, { "epoch": 0.48669644591182754, "grad_norm": 1.1796875, "learning_rate": 2.4333702683548666e-05, "loss": 1.03080153465271, "step": 1253 }, { "epoch": 0.4870848708487085, "grad_norm": 1.15625, "learning_rate": 2.4311603708025633e-05, "loss": 1.2746516466140747, "step": 1254 }, { "epoch": 0.48747329578558946, "grad_norm": 1.0, "learning_rate": 2.4289499210461788e-05, "loss": 0.8316757082939148, "step": 1255 }, { "epoch": 0.48786172072247036, "grad_norm": 1.1484375, "learning_rate": 2.426738921916723e-05, "loss": 1.1071608066558838, "step": 1256 }, { "epoch": 0.4882501456593513, "grad_norm": 1.3203125, "learning_rate": 2.4245273762459102e-05, "loss": 1.0800890922546387, "step": 1257 }, { "epoch": 0.4886385705962323, "grad_norm": 1.1796875, "learning_rate": 2.4223152868661535e-05, "loss": 1.0340006351470947, "step": 1258 }, { "epoch": 0.48902699553311324, "grad_norm": 1.1953125, "learning_rate": 2.4201026566105626e-05, "loss": 1.004203200340271, "step": 1259 }, { "epoch": 0.4894154204699942, "grad_norm": 1.1796875, "learning_rate": 2.4178894883129412e-05, "loss": 0.9931145906448364, "step": 1260 }, { "epoch": 0.4898038454068751, "grad_norm": 1.140625, "learning_rate": 2.41567578480778e-05, "loss": 1.0205615758895874, "step": 1261 }, { "epoch": 0.49019227034375606, "grad_norm": 1.296875, "learning_rate": 2.4134615489302577e-05, "loss": 0.9469588398933411, "step": 1262 }, { "epoch": 0.490580695280637, "grad_norm": 1.1328125, "learning_rate": 2.411246783516233e-05, "loss": 0.7757263779640198, "step": 1263 }, { "epoch": 0.490969120217518, "grad_norm": 1.0390625, "learning_rate": 2.4090314914022422e-05, "loss": 0.8349518179893494, "step": 1264 }, { "epoch": 0.4913575451543989, "grad_norm": 0.9921875, "learning_rate": 2.4068156754254986e-05, "loss": 0.9987411499023438, "step": 1265 }, { "epoch": 0.49174597009127985, "grad_norm": 1.03125, "learning_rate": 2.404599338423885e-05, "loss": 0.8463017344474792, "step": 1266 }, { "epoch": 0.4921343950281608, "grad_norm": 1.4140625, "learning_rate": 2.4023824832359498e-05, "loss": 0.8845435976982117, "step": 1267 }, { "epoch": 0.49252281996504177, "grad_norm": 1.0859375, "learning_rate": 2.4001651127009094e-05, "loss": 0.8911988735198975, "step": 1268 }, { "epoch": 0.49291124490192273, "grad_norm": 1.125, "learning_rate": 2.3979472296586357e-05, "loss": 1.0929330587387085, "step": 1269 }, { "epoch": 0.49329966983880363, "grad_norm": 1.0078125, "learning_rate": 2.39572883694966e-05, "loss": 0.9423518180847168, "step": 1270 }, { "epoch": 0.4936880947756846, "grad_norm": 1.171875, "learning_rate": 2.3935099374151645e-05, "loss": 0.9789940118789673, "step": 1271 }, { "epoch": 0.49407651971256555, "grad_norm": 1.234375, "learning_rate": 2.3912905338969815e-05, "loss": 0.8428816795349121, "step": 1272 }, { "epoch": 0.4944649446494465, "grad_norm": 1.25, "learning_rate": 2.3890706292375896e-05, "loss": 0.9466828107833862, "step": 1273 }, { "epoch": 0.4948533695863274, "grad_norm": 1.2734375, "learning_rate": 2.3868502262801065e-05, "loss": 0.8659840822219849, "step": 1274 }, { "epoch": 0.4952417945232084, "grad_norm": 1.1484375, "learning_rate": 2.384629327868291e-05, "loss": 1.0009980201721191, "step": 1275 }, { "epoch": 0.49563021946008934, "grad_norm": 1.15625, "learning_rate": 2.382407936846535e-05, "loss": 0.991794228553772, "step": 1276 }, { "epoch": 0.4960186443969703, "grad_norm": 1.2265625, "learning_rate": 2.3801860560598606e-05, "loss": 1.1847240924835205, "step": 1277 }, { "epoch": 0.49640706933385126, "grad_norm": 1.1328125, "learning_rate": 2.37796368835392e-05, "loss": 0.9291219711303711, "step": 1278 }, { "epoch": 0.49679549427073216, "grad_norm": 1.453125, "learning_rate": 2.375740836574985e-05, "loss": 1.2752271890640259, "step": 1279 }, { "epoch": 0.4971839192076131, "grad_norm": 1.0390625, "learning_rate": 2.3735175035699512e-05, "loss": 0.9419888854026794, "step": 1280 }, { "epoch": 0.4975723441444941, "grad_norm": 1.0546875, "learning_rate": 2.371293692186328e-05, "loss": 0.9848824739456177, "step": 1281 }, { "epoch": 0.49796076908137504, "grad_norm": 1.0234375, "learning_rate": 2.3690694052722384e-05, "loss": 1.0773125886917114, "step": 1282 }, { "epoch": 0.49834919401825595, "grad_norm": 1.1171875, "learning_rate": 2.3668446456764144e-05, "loss": 0.9669545292854309, "step": 1283 }, { "epoch": 0.4987376189551369, "grad_norm": 1.265625, "learning_rate": 2.3646194162481927e-05, "loss": 0.9331457614898682, "step": 1284 }, { "epoch": 0.49912604389201787, "grad_norm": 1.0546875, "learning_rate": 2.3623937198375134e-05, "loss": 0.7691850066184998, "step": 1285 }, { "epoch": 0.4995144688288988, "grad_norm": 1.1953125, "learning_rate": 2.3601675592949124e-05, "loss": 1.0641664266586304, "step": 1286 }, { "epoch": 0.4999028937657798, "grad_norm": 1.0390625, "learning_rate": 2.357940937471523e-05, "loss": 0.7529241442680359, "step": 1287 }, { "epoch": 0.5002913187026607, "grad_norm": 1.1328125, "learning_rate": 2.355713857219066e-05, "loss": 1.0036711692810059, "step": 1288 }, { "epoch": 0.5006797436395417, "grad_norm": 1.1875, "learning_rate": 2.3534863213898516e-05, "loss": 0.8190545439720154, "step": 1289 }, { "epoch": 0.5010681685764226, "grad_norm": 1.140625, "learning_rate": 2.3512583328367717e-05, "loss": 1.046462059020996, "step": 1290 }, { "epoch": 0.5014565935133035, "grad_norm": 1.0625, "learning_rate": 2.3490298944133002e-05, "loss": 0.8511462211608887, "step": 1291 }, { "epoch": 0.5018450184501845, "grad_norm": 0.97265625, "learning_rate": 2.3468010089734854e-05, "loss": 0.8906524777412415, "step": 1292 }, { "epoch": 0.5022334433870654, "grad_norm": 1.0703125, "learning_rate": 2.3445716793719496e-05, "loss": 0.9308617115020752, "step": 1293 }, { "epoch": 0.5026218683239464, "grad_norm": 1.3125, "learning_rate": 2.3423419084638824e-05, "loss": 1.057046890258789, "step": 1294 }, { "epoch": 0.5030102932608274, "grad_norm": 1.0078125, "learning_rate": 2.3401116991050387e-05, "loss": 0.9057053327560425, "step": 1295 }, { "epoch": 0.5033987181977083, "grad_norm": 1.2734375, "learning_rate": 2.337881054151737e-05, "loss": 0.9754865765571594, "step": 1296 }, { "epoch": 0.5037871431345893, "grad_norm": 1.390625, "learning_rate": 2.335649976460851e-05, "loss": 1.2160544395446777, "step": 1297 }, { "epoch": 0.5041755680714702, "grad_norm": 1.3203125, "learning_rate": 2.3334184688898107e-05, "loss": 0.9636678099632263, "step": 1298 }, { "epoch": 0.5045639930083511, "grad_norm": 1.1875, "learning_rate": 2.3311865342965955e-05, "loss": 0.899956464767456, "step": 1299 }, { "epoch": 0.504952417945232, "grad_norm": 1.21875, "learning_rate": 2.3289541755397308e-05, "loss": 1.1650490760803223, "step": 1300 }, { "epoch": 0.505340842882113, "grad_norm": 1.2734375, "learning_rate": 2.3267213954782886e-05, "loss": 1.0158982276916504, "step": 1301 }, { "epoch": 0.505729267818994, "grad_norm": 1.34375, "learning_rate": 2.3244881969718768e-05, "loss": 1.0532525777816772, "step": 1302 }, { "epoch": 0.5061176927558749, "grad_norm": 1.296875, "learning_rate": 2.3222545828806407e-05, "loss": 0.9431767463684082, "step": 1303 }, { "epoch": 0.5065061176927559, "grad_norm": 1.15625, "learning_rate": 2.3200205560652594e-05, "loss": 0.7425223588943481, "step": 1304 }, { "epoch": 0.5068945426296368, "grad_norm": 1.2578125, "learning_rate": 2.317786119386937e-05, "loss": 0.9390622973442078, "step": 1305 }, { "epoch": 0.5072829675665178, "grad_norm": 1.3046875, "learning_rate": 2.3155512757074065e-05, "loss": 1.1936928033828735, "step": 1306 }, { "epoch": 0.5076713925033988, "grad_norm": 1.2109375, "learning_rate": 2.3133160278889194e-05, "loss": 1.0904490947723389, "step": 1307 }, { "epoch": 0.5080598174402796, "grad_norm": 1.0078125, "learning_rate": 2.311080378794246e-05, "loss": 1.044896125793457, "step": 1308 }, { "epoch": 0.5084482423771606, "grad_norm": 1.3046875, "learning_rate": 2.3088443312866698e-05, "loss": 0.8973260521888733, "step": 1309 }, { "epoch": 0.5088366673140415, "grad_norm": 1.2109375, "learning_rate": 2.3066078882299854e-05, "loss": 0.9809543490409851, "step": 1310 }, { "epoch": 0.5092250922509225, "grad_norm": 1.078125, "learning_rate": 2.304371052488493e-05, "loss": 0.9834471344947815, "step": 1311 }, { "epoch": 0.5096135171878035, "grad_norm": 1.0859375, "learning_rate": 2.3021338269269968e-05, "loss": 1.1194932460784912, "step": 1312 }, { "epoch": 0.5100019421246844, "grad_norm": 1.3984375, "learning_rate": 2.2998962144107996e-05, "loss": 1.2207896709442139, "step": 1313 }, { "epoch": 0.5103903670615654, "grad_norm": 1.1328125, "learning_rate": 2.2976582178057002e-05, "loss": 0.9341682195663452, "step": 1314 }, { "epoch": 0.5107787919984463, "grad_norm": 1.15625, "learning_rate": 2.2954198399779885e-05, "loss": 0.9724931716918945, "step": 1315 }, { "epoch": 0.5111672169353273, "grad_norm": 1.0078125, "learning_rate": 2.2931810837944433e-05, "loss": 1.1235673427581787, "step": 1316 }, { "epoch": 0.5115556418722081, "grad_norm": 1.203125, "learning_rate": 2.290941952122329e-05, "loss": 0.9877881407737732, "step": 1317 }, { "epoch": 0.5119440668090891, "grad_norm": 1.125, "learning_rate": 2.2887024478293883e-05, "loss": 1.2239453792572021, "step": 1318 }, { "epoch": 0.5123324917459701, "grad_norm": 1.2265625, "learning_rate": 2.2864625737838437e-05, "loss": 0.8496146202087402, "step": 1319 }, { "epoch": 0.512720916682851, "grad_norm": 1.140625, "learning_rate": 2.2842223328543893e-05, "loss": 0.8074788451194763, "step": 1320 }, { "epoch": 0.513109341619732, "grad_norm": 1.0078125, "learning_rate": 2.2819817279101904e-05, "loss": 0.881064772605896, "step": 1321 }, { "epoch": 0.513497766556613, "grad_norm": 0.953125, "learning_rate": 2.2797407618208784e-05, "loss": 0.7488251328468323, "step": 1322 }, { "epoch": 0.5138861914934939, "grad_norm": 1.1015625, "learning_rate": 2.277499437456546e-05, "loss": 1.0235583782196045, "step": 1323 }, { "epoch": 0.5142746164303749, "grad_norm": 1.296875, "learning_rate": 2.2752577576877467e-05, "loss": 0.8385047316551208, "step": 1324 }, { "epoch": 0.5146630413672558, "grad_norm": 1.1328125, "learning_rate": 2.273015725385488e-05, "loss": 0.9572916030883789, "step": 1325 }, { "epoch": 0.5150514663041367, "grad_norm": 1.15625, "learning_rate": 2.2707733434212273e-05, "loss": 0.9369227886199951, "step": 1326 }, { "epoch": 0.5154398912410176, "grad_norm": 1.1875, "learning_rate": 2.2685306146668743e-05, "loss": 0.9428249597549438, "step": 1327 }, { "epoch": 0.5158283161778986, "grad_norm": 1.078125, "learning_rate": 2.2662875419947783e-05, "loss": 0.9839540719985962, "step": 1328 }, { "epoch": 0.5162167411147796, "grad_norm": 1.1015625, "learning_rate": 2.2640441282777314e-05, "loss": 0.9924243688583374, "step": 1329 }, { "epoch": 0.5166051660516605, "grad_norm": 1.1015625, "learning_rate": 2.261800376388962e-05, "loss": 0.9647331237792969, "step": 1330 }, { "epoch": 0.5169935909885415, "grad_norm": 1.0625, "learning_rate": 2.2595562892021314e-05, "loss": 0.97670978307724, "step": 1331 }, { "epoch": 0.5173820159254224, "grad_norm": 1.1953125, "learning_rate": 2.2573118695913303e-05, "loss": 0.9225878119468689, "step": 1332 }, { "epoch": 0.5177704408623034, "grad_norm": 1.1484375, "learning_rate": 2.2550671204310762e-05, "loss": 0.9816463589668274, "step": 1333 }, { "epoch": 0.5181588657991844, "grad_norm": 1.0625, "learning_rate": 2.252822044596307e-05, "loss": 1.0046985149383545, "step": 1334 }, { "epoch": 0.5185472907360652, "grad_norm": 1.0390625, "learning_rate": 2.2505766449623802e-05, "loss": 0.9308055639266968, "step": 1335 }, { "epoch": 0.5189357156729462, "grad_norm": 1.21875, "learning_rate": 2.248330924405067e-05, "loss": 0.8549844026565552, "step": 1336 }, { "epoch": 0.5193241406098271, "grad_norm": 1.1875, "learning_rate": 2.2460848858005515e-05, "loss": 0.7563146948814392, "step": 1337 }, { "epoch": 0.5197125655467081, "grad_norm": 1.7109375, "learning_rate": 2.2438385320254234e-05, "loss": 1.0496511459350586, "step": 1338 }, { "epoch": 0.520100990483589, "grad_norm": 1.0703125, "learning_rate": 2.241591865956676e-05, "loss": 0.8475170731544495, "step": 1339 }, { "epoch": 0.52048941542047, "grad_norm": 1.3203125, "learning_rate": 2.2393448904717032e-05, "loss": 1.1377695798873901, "step": 1340 }, { "epoch": 0.520877840357351, "grad_norm": 1.1484375, "learning_rate": 2.2370976084482955e-05, "loss": 1.0080002546310425, "step": 1341 }, { "epoch": 0.5212662652942319, "grad_norm": 1.0390625, "learning_rate": 2.2348500227646347e-05, "loss": 0.8683527112007141, "step": 1342 }, { "epoch": 0.5216546902311129, "grad_norm": 0.96484375, "learning_rate": 2.232602136299293e-05, "loss": 0.8815857768058777, "step": 1343 }, { "epoch": 0.5220431151679937, "grad_norm": 1.09375, "learning_rate": 2.2303539519312276e-05, "loss": 0.7041352391242981, "step": 1344 }, { "epoch": 0.5224315401048747, "grad_norm": 1.2578125, "learning_rate": 2.2281054725397756e-05, "loss": 1.0449793338775635, "step": 1345 }, { "epoch": 0.5228199650417557, "grad_norm": 1.0859375, "learning_rate": 2.2258567010046546e-05, "loss": 0.825964093208313, "step": 1346 }, { "epoch": 0.5232083899786366, "grad_norm": 1.09375, "learning_rate": 2.2236076402059547e-05, "loss": 1.0300947427749634, "step": 1347 }, { "epoch": 0.5235968149155176, "grad_norm": 1.5, "learning_rate": 2.221358293024136e-05, "loss": 0.7542541027069092, "step": 1348 }, { "epoch": 0.5239852398523985, "grad_norm": 1.1328125, "learning_rate": 2.219108662340026e-05, "loss": 0.9815769195556641, "step": 1349 }, { "epoch": 0.5243736647892795, "grad_norm": 1.078125, "learning_rate": 2.2168587510348167e-05, "loss": 1.0465636253356934, "step": 1350 }, { "epoch": 0.5247620897261605, "grad_norm": 1.2734375, "learning_rate": 2.2146085619900565e-05, "loss": 0.8385406732559204, "step": 1351 }, { "epoch": 0.5251505146630414, "grad_norm": 1.3359375, "learning_rate": 2.212358098087652e-05, "loss": 1.0172611474990845, "step": 1352 }, { "epoch": 0.5255389395999223, "grad_norm": 1.265625, "learning_rate": 2.2101073622098615e-05, "loss": 0.9052812457084656, "step": 1353 }, { "epoch": 0.5259273645368032, "grad_norm": 1.140625, "learning_rate": 2.2078563572392907e-05, "loss": 1.0204092264175415, "step": 1354 }, { "epoch": 0.5263157894736842, "grad_norm": 1.0, "learning_rate": 2.2056050860588897e-05, "loss": 0.9628894925117493, "step": 1355 }, { "epoch": 0.5267042144105651, "grad_norm": 1.1015625, "learning_rate": 2.2033535515519516e-05, "loss": 0.881242036819458, "step": 1356 }, { "epoch": 0.5270926393474461, "grad_norm": 1.265625, "learning_rate": 2.2011017566021042e-05, "loss": 1.0060144662857056, "step": 1357 }, { "epoch": 0.5274810642843271, "grad_norm": 1.78125, "learning_rate": 2.198849704093311e-05, "loss": 0.9282349348068237, "step": 1358 }, { "epoch": 0.527869489221208, "grad_norm": 1.21875, "learning_rate": 2.196597396909864e-05, "loss": 0.9307987689971924, "step": 1359 }, { "epoch": 0.528257914158089, "grad_norm": 1.1484375, "learning_rate": 2.1943448379363813e-05, "loss": 0.8865164518356323, "step": 1360 }, { "epoch": 0.5286463390949699, "grad_norm": 1.078125, "learning_rate": 2.192092030057805e-05, "loss": 0.9043812155723572, "step": 1361 }, { "epoch": 0.5290347640318508, "grad_norm": 1.265625, "learning_rate": 2.1898389761593933e-05, "loss": 1.025873064994812, "step": 1362 }, { "epoch": 0.5294231889687318, "grad_norm": 1.1640625, "learning_rate": 2.1875856791267234e-05, "loss": 0.8218619227409363, "step": 1363 }, { "epoch": 0.5298116139056127, "grad_norm": 1.1953125, "learning_rate": 2.18533214184568e-05, "loss": 0.9024035334587097, "step": 1364 }, { "epoch": 0.5302000388424937, "grad_norm": 1.140625, "learning_rate": 2.183078367202457e-05, "loss": 0.9325942397117615, "step": 1365 }, { "epoch": 0.5305884637793746, "grad_norm": 1.21875, "learning_rate": 2.1808243580835534e-05, "loss": 0.9099087715148926, "step": 1366 }, { "epoch": 0.5309768887162556, "grad_norm": 1.3203125, "learning_rate": 2.1785701173757676e-05, "loss": 0.8846466541290283, "step": 1367 }, { "epoch": 0.5313653136531366, "grad_norm": 1.03125, "learning_rate": 2.1763156479661932e-05, "loss": 1.0101838111877441, "step": 1368 }, { "epoch": 0.5317537385900175, "grad_norm": 1.3984375, "learning_rate": 2.174060952742219e-05, "loss": 0.9552252292633057, "step": 1369 }, { "epoch": 0.5321421635268985, "grad_norm": 1.1640625, "learning_rate": 2.171806034591522e-05, "loss": 0.8258076906204224, "step": 1370 }, { "epoch": 0.5325305884637793, "grad_norm": 1.03125, "learning_rate": 2.169550896402065e-05, "loss": 0.965345561504364, "step": 1371 }, { "epoch": 0.5329190134006603, "grad_norm": 1.3046875, "learning_rate": 2.1672955410620916e-05, "loss": 0.7353599071502686, "step": 1372 }, { "epoch": 0.5333074383375412, "grad_norm": 1.2265625, "learning_rate": 2.1650399714601246e-05, "loss": 0.9690191745758057, "step": 1373 }, { "epoch": 0.5336958632744222, "grad_norm": 1.1484375, "learning_rate": 2.1627841904849612e-05, "loss": 0.9814085960388184, "step": 1374 }, { "epoch": 0.5340842882113032, "grad_norm": 1.21875, "learning_rate": 2.1605282010256687e-05, "loss": 1.038013219833374, "step": 1375 }, { "epoch": 0.5344727131481841, "grad_norm": 1.4140625, "learning_rate": 2.1582720059715817e-05, "loss": 0.8054144382476807, "step": 1376 }, { "epoch": 0.5348611380850651, "grad_norm": 1.34375, "learning_rate": 2.1560156082122985e-05, "loss": 0.9309391379356384, "step": 1377 }, { "epoch": 0.535249563021946, "grad_norm": 1.1875, "learning_rate": 2.1537590106376758e-05, "loss": 0.8818418979644775, "step": 1378 }, { "epoch": 0.535637987958827, "grad_norm": 1.1015625, "learning_rate": 2.1515022161378286e-05, "loss": 0.8282039165496826, "step": 1379 }, { "epoch": 0.536026412895708, "grad_norm": 1.203125, "learning_rate": 2.1492452276031212e-05, "loss": 0.9216827154159546, "step": 1380 }, { "epoch": 0.5364148378325888, "grad_norm": 1.1796875, "learning_rate": 2.146988047924168e-05, "loss": 0.9040237069129944, "step": 1381 }, { "epoch": 0.5368032627694698, "grad_norm": 1.125, "learning_rate": 2.1447306799918285e-05, "loss": 0.955001175403595, "step": 1382 }, { "epoch": 0.5371916877063507, "grad_norm": 1.3515625, "learning_rate": 2.1424731266972022e-05, "loss": 0.7373701930046082, "step": 1383 }, { "epoch": 0.5375801126432317, "grad_norm": 1.4375, "learning_rate": 2.1402153909316267e-05, "loss": 0.8566916584968567, "step": 1384 }, { "epoch": 0.5379685375801126, "grad_norm": 1.2734375, "learning_rate": 2.137957475586674e-05, "loss": 0.9391424655914307, "step": 1385 }, { "epoch": 0.5383569625169936, "grad_norm": 1.140625, "learning_rate": 2.135699383554144e-05, "loss": 0.9461976885795593, "step": 1386 }, { "epoch": 0.5387453874538746, "grad_norm": 1.2734375, "learning_rate": 2.133441117726065e-05, "loss": 0.7462364435195923, "step": 1387 }, { "epoch": 0.5391338123907555, "grad_norm": 1.2734375, "learning_rate": 2.1311826809946866e-05, "loss": 1.0189951658248901, "step": 1388 }, { "epoch": 0.5395222373276365, "grad_norm": 1.28125, "learning_rate": 2.1289240762524784e-05, "loss": 0.8119837045669556, "step": 1389 }, { "epoch": 0.5399106622645173, "grad_norm": 1.21875, "learning_rate": 2.126665306392124e-05, "loss": 1.06306791305542, "step": 1390 }, { "epoch": 0.5402990872013983, "grad_norm": 1.1953125, "learning_rate": 2.124406374306519e-05, "loss": 0.9185832142829895, "step": 1391 }, { "epoch": 0.5406875121382793, "grad_norm": 0.98046875, "learning_rate": 2.1221472828887672e-05, "loss": 0.6832720041275024, "step": 1392 }, { "epoch": 0.5410759370751602, "grad_norm": 1.2421875, "learning_rate": 2.119888035032175e-05, "loss": 0.9574915170669556, "step": 1393 }, { "epoch": 0.5414643620120412, "grad_norm": 1.078125, "learning_rate": 2.1176286336302513e-05, "loss": 1.1064502000808716, "step": 1394 }, { "epoch": 0.5418527869489221, "grad_norm": 1.046875, "learning_rate": 2.1153690815767e-05, "loss": 1.030134916305542, "step": 1395 }, { "epoch": 0.5422412118858031, "grad_norm": 1.125, "learning_rate": 2.1131093817654188e-05, "loss": 0.8586524724960327, "step": 1396 }, { "epoch": 0.5426296368226841, "grad_norm": 1.40625, "learning_rate": 2.1108495370904936e-05, "loss": 1.0125025510787964, "step": 1397 }, { "epoch": 0.543018061759565, "grad_norm": 1.1484375, "learning_rate": 2.1085895504461975e-05, "loss": 0.9850146174430847, "step": 1398 }, { "epoch": 0.5434064866964459, "grad_norm": 1.1953125, "learning_rate": 2.1063294247269838e-05, "loss": 0.8215245604515076, "step": 1399 }, { "epoch": 0.5437949116333268, "grad_norm": 1.078125, "learning_rate": 2.1040691628274846e-05, "loss": 0.9673529267311096, "step": 1400 }, { "epoch": 0.5441833365702078, "grad_norm": 1.0546875, "learning_rate": 2.101808767642507e-05, "loss": 1.0072882175445557, "step": 1401 }, { "epoch": 0.5445717615070887, "grad_norm": 1.1796875, "learning_rate": 2.099548242067028e-05, "loss": 0.869513988494873, "step": 1402 }, { "epoch": 0.5449601864439697, "grad_norm": 1.078125, "learning_rate": 2.097287588996191e-05, "loss": 0.9045069813728333, "step": 1403 }, { "epoch": 0.5453486113808507, "grad_norm": 1.3359375, "learning_rate": 2.0950268113253042e-05, "loss": 0.8702669143676758, "step": 1404 }, { "epoch": 0.5457370363177316, "grad_norm": 1.1875, "learning_rate": 2.092765911949835e-05, "loss": 0.809282660484314, "step": 1405 }, { "epoch": 0.5461254612546126, "grad_norm": 1.2890625, "learning_rate": 2.0905048937654057e-05, "loss": 1.0687072277069092, "step": 1406 }, { "epoch": 0.5465138861914935, "grad_norm": 1.265625, "learning_rate": 2.0882437596677912e-05, "loss": 0.8678672909736633, "step": 1407 }, { "epoch": 0.5469023111283744, "grad_norm": 1.0546875, "learning_rate": 2.0859825125529163e-05, "loss": 0.7148362398147583, "step": 1408 }, { "epoch": 0.5472907360652554, "grad_norm": 1.140625, "learning_rate": 2.0837211553168477e-05, "loss": 0.8650181889533997, "step": 1409 }, { "epoch": 0.5476791610021363, "grad_norm": 1.359375, "learning_rate": 2.0814596908557966e-05, "loss": 0.9805907011032104, "step": 1410 }, { "epoch": 0.5480675859390173, "grad_norm": 1.0859375, "learning_rate": 2.079198122066108e-05, "loss": 0.9370062351226807, "step": 1411 }, { "epoch": 0.5484560108758982, "grad_norm": 1.3203125, "learning_rate": 2.076936451844263e-05, "loss": 1.4173266887664795, "step": 1412 }, { "epoch": 0.5488444358127792, "grad_norm": 1.0859375, "learning_rate": 2.0746746830868714e-05, "loss": 0.9118934273719788, "step": 1413 }, { "epoch": 0.5492328607496602, "grad_norm": 1.078125, "learning_rate": 2.0724128186906697e-05, "loss": 1.0061503648757935, "step": 1414 }, { "epoch": 0.5496212856865411, "grad_norm": 1.328125, "learning_rate": 2.070150861552517e-05, "loss": 0.7185952067375183, "step": 1415 }, { "epoch": 0.5500097106234221, "grad_norm": 1.1015625, "learning_rate": 2.0678888145693912e-05, "loss": 0.7572886347770691, "step": 1416 }, { "epoch": 0.5503981355603029, "grad_norm": 1.046875, "learning_rate": 2.065626680638384e-05, "loss": 0.7314915060997009, "step": 1417 }, { "epoch": 0.5507865604971839, "grad_norm": 1.2109375, "learning_rate": 2.0633644626567007e-05, "loss": 0.9339551329612732, "step": 1418 }, { "epoch": 0.5511749854340648, "grad_norm": 1.4296875, "learning_rate": 2.0611021635216523e-05, "loss": 0.91826331615448, "step": 1419 }, { "epoch": 0.5515634103709458, "grad_norm": 1.0703125, "learning_rate": 2.058839786130655e-05, "loss": 0.9637704491615295, "step": 1420 }, { "epoch": 0.5519518353078268, "grad_norm": 1.203125, "learning_rate": 2.0565773333812244e-05, "loss": 0.881058394908905, "step": 1421 }, { "epoch": 0.5523402602447077, "grad_norm": 1.1328125, "learning_rate": 2.0543148081709726e-05, "loss": 0.8299684524536133, "step": 1422 }, { "epoch": 0.5527286851815887, "grad_norm": 1.4921875, "learning_rate": 2.0520522133976053e-05, "loss": 1.3566796779632568, "step": 1423 }, { "epoch": 0.5531171101184696, "grad_norm": 1.2265625, "learning_rate": 2.0497895519589165e-05, "loss": 1.204673409461975, "step": 1424 }, { "epoch": 0.5535055350553506, "grad_norm": 1.1953125, "learning_rate": 2.0475268267527858e-05, "loss": 1.1095091104507446, "step": 1425 }, { "epoch": 0.5538939599922315, "grad_norm": 1.0234375, "learning_rate": 2.0452640406771754e-05, "loss": 1.1587963104248047, "step": 1426 }, { "epoch": 0.5542823849291124, "grad_norm": 1.078125, "learning_rate": 2.0430011966301233e-05, "loss": 1.0243842601776123, "step": 1427 }, { "epoch": 0.5546708098659934, "grad_norm": 1.265625, "learning_rate": 2.0407382975097442e-05, "loss": 0.9792146682739258, "step": 1428 }, { "epoch": 0.5550592348028743, "grad_norm": 1.2109375, "learning_rate": 2.038475346214222e-05, "loss": 0.8673288822174072, "step": 1429 }, { "epoch": 0.5554476597397553, "grad_norm": 1.1328125, "learning_rate": 2.0362123456418068e-05, "loss": 0.7538125514984131, "step": 1430 }, { "epoch": 0.5558360846766363, "grad_norm": 1.171875, "learning_rate": 2.0339492986908135e-05, "loss": 0.8877299427986145, "step": 1431 }, { "epoch": 0.5562245096135172, "grad_norm": 1.09375, "learning_rate": 2.0316862082596153e-05, "loss": 1.043736457824707, "step": 1432 }, { "epoch": 0.5566129345503982, "grad_norm": 1.1015625, "learning_rate": 2.0294230772466413e-05, "loss": 0.8048237562179565, "step": 1433 }, { "epoch": 0.5570013594872791, "grad_norm": 1.3125, "learning_rate": 2.0271599085503722e-05, "loss": 0.8627485036849976, "step": 1434 }, { "epoch": 0.55738978442416, "grad_norm": 1.125, "learning_rate": 2.0248967050693372e-05, "loss": 1.1441445350646973, "step": 1435 }, { "epoch": 0.5577782093610409, "grad_norm": 1.3203125, "learning_rate": 2.0226334697021118e-05, "loss": 0.8530837893486023, "step": 1436 }, { "epoch": 0.5581666342979219, "grad_norm": 1.1328125, "learning_rate": 2.0203702053473088e-05, "loss": 1.0025904178619385, "step": 1437 }, { "epoch": 0.5585550592348029, "grad_norm": 1.09375, "learning_rate": 2.0181069149035814e-05, "loss": 0.8351298570632935, "step": 1438 }, { "epoch": 0.5589434841716838, "grad_norm": 1.1640625, "learning_rate": 2.0158436012696145e-05, "loss": 1.0042814016342163, "step": 1439 }, { "epoch": 0.5593319091085648, "grad_norm": 1.25, "learning_rate": 2.0135802673441222e-05, "loss": 0.9876765012741089, "step": 1440 }, { "epoch": 0.5597203340454457, "grad_norm": 1.2734375, "learning_rate": 2.011316916025847e-05, "loss": 0.9215753078460693, "step": 1441 }, { "epoch": 0.5601087589823267, "grad_norm": 1.1875, "learning_rate": 2.0090535502135516e-05, "loss": 0.7403925061225891, "step": 1442 }, { "epoch": 0.5604971839192077, "grad_norm": 1.3671875, "learning_rate": 2.0067901728060167e-05, "loss": 1.0433412790298462, "step": 1443 }, { "epoch": 0.5608856088560885, "grad_norm": 1.21875, "learning_rate": 2.0045267867020403e-05, "loss": 0.9769894480705261, "step": 1444 }, { "epoch": 0.5612740337929695, "grad_norm": 1.2421875, "learning_rate": 2.0022633948004296e-05, "loss": 0.96647047996521, "step": 1445 }, { "epoch": 0.5616624587298504, "grad_norm": 1.0625, "learning_rate": 2e-05, "loss": 0.9248301982879639, "step": 1446 }, { "epoch": 0.5620508836667314, "grad_norm": 1.2890625, "learning_rate": 1.9977366051995707e-05, "loss": 0.9713277816772461, "step": 1447 }, { "epoch": 0.5624393086036124, "grad_norm": 1.4453125, "learning_rate": 1.9954732132979604e-05, "loss": 0.9983370304107666, "step": 1448 }, { "epoch": 0.5628277335404933, "grad_norm": 1.015625, "learning_rate": 1.9932098271939833e-05, "loss": 0.692103922367096, "step": 1449 }, { "epoch": 0.5632161584773743, "grad_norm": 1.1953125, "learning_rate": 1.9909464497864487e-05, "loss": 0.9998420476913452, "step": 1450 }, { "epoch": 0.5636045834142552, "grad_norm": 1.15625, "learning_rate": 1.9886830839741535e-05, "loss": 0.7937294840812683, "step": 1451 }, { "epoch": 0.5639930083511362, "grad_norm": 1.21875, "learning_rate": 1.9864197326558784e-05, "loss": 0.9315086603164673, "step": 1452 }, { "epoch": 0.564381433288017, "grad_norm": 1.3828125, "learning_rate": 1.9841563987303865e-05, "loss": 0.9741135239601135, "step": 1453 }, { "epoch": 0.564769858224898, "grad_norm": 1.0859375, "learning_rate": 1.9818930850964186e-05, "loss": 0.89149010181427, "step": 1454 }, { "epoch": 0.565158283161779, "grad_norm": 1.2734375, "learning_rate": 1.9796297946526912e-05, "loss": 1.0224583148956299, "step": 1455 }, { "epoch": 0.5655467080986599, "grad_norm": 1.359375, "learning_rate": 1.977366530297889e-05, "loss": 1.1595604419708252, "step": 1456 }, { "epoch": 0.5659351330355409, "grad_norm": 1.234375, "learning_rate": 1.975103294930663e-05, "loss": 1.1223788261413574, "step": 1457 }, { "epoch": 0.5663235579724218, "grad_norm": 1.0390625, "learning_rate": 1.9728400914496288e-05, "loss": 0.999083399772644, "step": 1458 }, { "epoch": 0.5667119829093028, "grad_norm": 1.40625, "learning_rate": 1.970576922753359e-05, "loss": 0.8769012093544006, "step": 1459 }, { "epoch": 0.5671004078461838, "grad_norm": 1.140625, "learning_rate": 1.968313791740385e-05, "loss": 1.0960745811462402, "step": 1460 }, { "epoch": 0.5674888327830647, "grad_norm": 1.34375, "learning_rate": 1.9660507013091872e-05, "loss": 0.993807315826416, "step": 1461 }, { "epoch": 0.5678772577199456, "grad_norm": 1.1640625, "learning_rate": 1.963787654358194e-05, "loss": 0.8850481510162354, "step": 1462 }, { "epoch": 0.5682656826568265, "grad_norm": 1.4609375, "learning_rate": 1.961524653785779e-05, "loss": 0.8996971845626831, "step": 1463 }, { "epoch": 0.5686541075937075, "grad_norm": 1.1015625, "learning_rate": 1.9592617024902568e-05, "loss": 0.7757315039634705, "step": 1464 }, { "epoch": 0.5690425325305885, "grad_norm": 1.265625, "learning_rate": 1.9569988033698767e-05, "loss": 1.1242129802703857, "step": 1465 }, { "epoch": 0.5694309574674694, "grad_norm": 1.421875, "learning_rate": 1.954735959322825e-05, "loss": 0.9488145112991333, "step": 1466 }, { "epoch": 0.5698193824043504, "grad_norm": 1.1484375, "learning_rate": 1.9524731732472145e-05, "loss": 0.9198830127716064, "step": 1467 }, { "epoch": 0.5702078073412313, "grad_norm": 1.15625, "learning_rate": 1.950210448041084e-05, "loss": 1.293518304824829, "step": 1468 }, { "epoch": 0.5705962322781123, "grad_norm": 1.046875, "learning_rate": 1.9479477866023954e-05, "loss": 0.9204908609390259, "step": 1469 }, { "epoch": 0.5709846572149933, "grad_norm": 1.234375, "learning_rate": 1.9456851918290278e-05, "loss": 0.8739852905273438, "step": 1470 }, { "epoch": 0.5713730821518741, "grad_norm": 1.3203125, "learning_rate": 1.9434226666187763e-05, "loss": 1.0076260566711426, "step": 1471 }, { "epoch": 0.5717615070887551, "grad_norm": 1.265625, "learning_rate": 1.9411602138693457e-05, "loss": 0.9528446197509766, "step": 1472 }, { "epoch": 0.572149932025636, "grad_norm": 1.109375, "learning_rate": 1.9388978364783484e-05, "loss": 0.9609858989715576, "step": 1473 }, { "epoch": 0.572538356962517, "grad_norm": 1.140625, "learning_rate": 1.9366355373433003e-05, "loss": 0.9486883282661438, "step": 1474 }, { "epoch": 0.5729267818993979, "grad_norm": 1.140625, "learning_rate": 1.934373319361616e-05, "loss": 0.8631957769393921, "step": 1475 }, { "epoch": 0.5733152068362789, "grad_norm": 1.0703125, "learning_rate": 1.9321111854306094e-05, "loss": 0.8310002088546753, "step": 1476 }, { "epoch": 0.5737036317731599, "grad_norm": 1.078125, "learning_rate": 1.9298491384474835e-05, "loss": 0.8228086233139038, "step": 1477 }, { "epoch": 0.5740920567100408, "grad_norm": 1.1328125, "learning_rate": 1.927587181309331e-05, "loss": 0.9887425303459167, "step": 1478 }, { "epoch": 0.5744804816469218, "grad_norm": 1.296875, "learning_rate": 1.9253253169131296e-05, "loss": 0.967291533946991, "step": 1479 }, { "epoch": 0.5748689065838026, "grad_norm": 1.2265625, "learning_rate": 1.9230635481557378e-05, "loss": 0.9751973748207092, "step": 1480 }, { "epoch": 0.5752573315206836, "grad_norm": 1.34375, "learning_rate": 1.9208018779338927e-05, "loss": 0.7887195944786072, "step": 1481 }, { "epoch": 0.5756457564575646, "grad_norm": 1.0078125, "learning_rate": 1.9185403091442044e-05, "loss": 0.849962055683136, "step": 1482 }, { "epoch": 0.5760341813944455, "grad_norm": 1.1171875, "learning_rate": 1.9162788446831526e-05, "loss": 0.8540864586830139, "step": 1483 }, { "epoch": 0.5764226063313265, "grad_norm": 1.28125, "learning_rate": 1.9140174874470847e-05, "loss": 0.9641323089599609, "step": 1484 }, { "epoch": 0.5768110312682074, "grad_norm": 1.0859375, "learning_rate": 1.9117562403322087e-05, "loss": 0.8389216065406799, "step": 1485 }, { "epoch": 0.5771994562050884, "grad_norm": 1.28125, "learning_rate": 1.9094951062345947e-05, "loss": 0.7240473628044128, "step": 1486 }, { "epoch": 0.5775878811419694, "grad_norm": 1.1484375, "learning_rate": 1.9072340880501655e-05, "loss": 0.8430283665657043, "step": 1487 }, { "epoch": 0.5779763060788503, "grad_norm": 1.4140625, "learning_rate": 1.9049731886746964e-05, "loss": 0.948866605758667, "step": 1488 }, { "epoch": 0.5783647310157312, "grad_norm": 1.0546875, "learning_rate": 1.9027124110038096e-05, "loss": 0.6990160942077637, "step": 1489 }, { "epoch": 0.5787531559526121, "grad_norm": 1.171875, "learning_rate": 1.900451757932973e-05, "loss": 0.9091046452522278, "step": 1490 }, { "epoch": 0.5791415808894931, "grad_norm": 1.1875, "learning_rate": 1.8981912323574936e-05, "loss": 1.026735782623291, "step": 1491 }, { "epoch": 0.579530005826374, "grad_norm": 1.2109375, "learning_rate": 1.8959308371725157e-05, "loss": 0.9743378758430481, "step": 1492 }, { "epoch": 0.579918430763255, "grad_norm": 1.2421875, "learning_rate": 1.8936705752730172e-05, "loss": 0.9924946427345276, "step": 1493 }, { "epoch": 0.580306855700136, "grad_norm": 1.203125, "learning_rate": 1.8914104495538035e-05, "loss": 1.1228772401809692, "step": 1494 }, { "epoch": 0.5806952806370169, "grad_norm": 1.078125, "learning_rate": 1.889150462909507e-05, "loss": 0.7512722611427307, "step": 1495 }, { "epoch": 0.5810837055738979, "grad_norm": 1.0703125, "learning_rate": 1.8868906182345815e-05, "loss": 0.9830581545829773, "step": 1496 }, { "epoch": 0.5814721305107788, "grad_norm": 1.5, "learning_rate": 1.8846309184233004e-05, "loss": 1.0526983737945557, "step": 1497 }, { "epoch": 0.5818605554476597, "grad_norm": 1.2421875, "learning_rate": 1.882371366369749e-05, "loss": 0.6859850883483887, "step": 1498 }, { "epoch": 0.5822489803845406, "grad_norm": 1.109375, "learning_rate": 1.8801119649678256e-05, "loss": 0.9660393595695496, "step": 1499 }, { "epoch": 0.5826374053214216, "grad_norm": 1.9140625, "learning_rate": 1.8778527171112338e-05, "loss": 1.0100769996643066, "step": 1500 }, { "epoch": 0.5830258302583026, "grad_norm": 1.15625, "learning_rate": 1.8755936256934813e-05, "loss": 1.2443194389343262, "step": 1501 }, { "epoch": 0.5834142551951835, "grad_norm": 1.2109375, "learning_rate": 1.8733346936078768e-05, "loss": 0.7764634490013123, "step": 1502 }, { "epoch": 0.5838026801320645, "grad_norm": 1.0625, "learning_rate": 1.8710759237475226e-05, "loss": 1.0083917379379272, "step": 1503 }, { "epoch": 0.5841911050689454, "grad_norm": 1.296875, "learning_rate": 1.8688173190053144e-05, "loss": 0.864986777305603, "step": 1504 }, { "epoch": 0.5845795300058264, "grad_norm": 1.171875, "learning_rate": 1.866558882273936e-05, "loss": 0.9206494092941284, "step": 1505 }, { "epoch": 0.5849679549427074, "grad_norm": 1.3125, "learning_rate": 1.8643006164458562e-05, "loss": 1.1923764944076538, "step": 1506 }, { "epoch": 0.5853563798795882, "grad_norm": 1.15625, "learning_rate": 1.8620425244133266e-05, "loss": 1.1567422151565552, "step": 1507 }, { "epoch": 0.5857448048164692, "grad_norm": 1.0625, "learning_rate": 1.8597846090683736e-05, "loss": 1.0614526271820068, "step": 1508 }, { "epoch": 0.5861332297533501, "grad_norm": 1.140625, "learning_rate": 1.857526873302799e-05, "loss": 0.8919860124588013, "step": 1509 }, { "epoch": 0.5865216546902311, "grad_norm": 1.140625, "learning_rate": 1.8552693200081725e-05, "loss": 0.9400984048843384, "step": 1510 }, { "epoch": 0.5869100796271121, "grad_norm": 1.2265625, "learning_rate": 1.8530119520758325e-05, "loss": 0.9441542029380798, "step": 1511 }, { "epoch": 0.587298504563993, "grad_norm": 0.9140625, "learning_rate": 1.8507547723968795e-05, "loss": 0.689987063407898, "step": 1512 }, { "epoch": 0.587686929500874, "grad_norm": 1.25, "learning_rate": 1.8484977838621724e-05, "loss": 0.9368295073509216, "step": 1513 }, { "epoch": 0.5880753544377549, "grad_norm": 1.359375, "learning_rate": 1.846240989362325e-05, "loss": 0.9330081939697266, "step": 1514 }, { "epoch": 0.5884637793746359, "grad_norm": 1.3984375, "learning_rate": 1.8439843917877025e-05, "loss": 0.9646498560905457, "step": 1515 }, { "epoch": 0.5888522043115167, "grad_norm": 1.0859375, "learning_rate": 1.8417279940284187e-05, "loss": 1.0207886695861816, "step": 1516 }, { "epoch": 0.5892406292483977, "grad_norm": 1.1640625, "learning_rate": 1.8394717989743316e-05, "loss": 0.9038371443748474, "step": 1517 }, { "epoch": 0.5896290541852787, "grad_norm": 1.15625, "learning_rate": 1.8372158095150394e-05, "loss": 0.780565619468689, "step": 1518 }, { "epoch": 0.5900174791221596, "grad_norm": 1.0625, "learning_rate": 1.834960028539876e-05, "loss": 0.8574860095977783, "step": 1519 }, { "epoch": 0.5904059040590406, "grad_norm": 1.28125, "learning_rate": 1.8327044589379094e-05, "loss": 0.9488846063613892, "step": 1520 }, { "epoch": 0.5907943289959215, "grad_norm": 1.1640625, "learning_rate": 1.830449103597936e-05, "loss": 0.9592880010604858, "step": 1521 }, { "epoch": 0.5911827539328025, "grad_norm": 1.078125, "learning_rate": 1.8281939654084783e-05, "loss": 0.8973768949508667, "step": 1522 }, { "epoch": 0.5915711788696835, "grad_norm": 1.5546875, "learning_rate": 1.8259390472577817e-05, "loss": 1.1413439512252808, "step": 1523 }, { "epoch": 0.5919596038065644, "grad_norm": 1.046875, "learning_rate": 1.8236843520338078e-05, "loss": 1.073927640914917, "step": 1524 }, { "epoch": 0.5923480287434453, "grad_norm": 1.1875, "learning_rate": 1.8214298826242334e-05, "loss": 0.8254389762878418, "step": 1525 }, { "epoch": 0.5927364536803262, "grad_norm": 1.1484375, "learning_rate": 1.8191756419164473e-05, "loss": 1.0262141227722168, "step": 1526 }, { "epoch": 0.5931248786172072, "grad_norm": 1.21875, "learning_rate": 1.816921632797543e-05, "loss": 1.0948424339294434, "step": 1527 }, { "epoch": 0.5935133035540882, "grad_norm": 1.1796875, "learning_rate": 1.8146678581543206e-05, "loss": 0.8442094326019287, "step": 1528 }, { "epoch": 0.5939017284909691, "grad_norm": 0.98828125, "learning_rate": 1.8124143208732773e-05, "loss": 1.0328789949417114, "step": 1529 }, { "epoch": 0.5942901534278501, "grad_norm": 1.2421875, "learning_rate": 1.810161023840607e-05, "loss": 0.8525210618972778, "step": 1530 }, { "epoch": 0.594678578364731, "grad_norm": 1.203125, "learning_rate": 1.8079079699421963e-05, "loss": 0.9936757683753967, "step": 1531 }, { "epoch": 0.595067003301612, "grad_norm": 1.2109375, "learning_rate": 1.805655162063619e-05, "loss": 0.9261599779129028, "step": 1532 }, { "epoch": 0.595455428238493, "grad_norm": 1.1875, "learning_rate": 1.803402603090137e-05, "loss": 0.7032359838485718, "step": 1533 }, { "epoch": 0.5958438531753738, "grad_norm": 1.0546875, "learning_rate": 1.80115029590669e-05, "loss": 0.8936344385147095, "step": 1534 }, { "epoch": 0.5962322781122548, "grad_norm": 1.1953125, "learning_rate": 1.7988982433978964e-05, "loss": 1.0786280632019043, "step": 1535 }, { "epoch": 0.5966207030491357, "grad_norm": 1.1640625, "learning_rate": 1.7966464484480494e-05, "loss": 0.9881832003593445, "step": 1536 }, { "epoch": 0.5970091279860167, "grad_norm": 1.046875, "learning_rate": 1.7943949139411103e-05, "loss": 0.7771439552307129, "step": 1537 }, { "epoch": 0.5973975529228976, "grad_norm": 1.1171875, "learning_rate": 1.79214364276071e-05, "loss": 0.7883205413818359, "step": 1538 }, { "epoch": 0.5977859778597786, "grad_norm": 1.234375, "learning_rate": 1.789892637790139e-05, "loss": 0.9074313640594482, "step": 1539 }, { "epoch": 0.5981744027966596, "grad_norm": 0.94140625, "learning_rate": 1.7876419019123482e-05, "loss": 0.8917738199234009, "step": 1540 }, { "epoch": 0.5985628277335405, "grad_norm": 1.296875, "learning_rate": 1.7853914380099445e-05, "loss": 1.023451328277588, "step": 1541 }, { "epoch": 0.5989512526704215, "grad_norm": 1.4296875, "learning_rate": 1.783141248965184e-05, "loss": 0.9944581985473633, "step": 1542 }, { "epoch": 0.5993396776073023, "grad_norm": 1.375, "learning_rate": 1.7808913376599743e-05, "loss": 0.8328251838684082, "step": 1543 }, { "epoch": 0.5997281025441833, "grad_norm": 1.265625, "learning_rate": 1.7786417069758648e-05, "loss": 0.9342725276947021, "step": 1544 }, { "epoch": 0.6001165274810643, "grad_norm": 1.2109375, "learning_rate": 1.7763923597940463e-05, "loss": 0.8728083372116089, "step": 1545 }, { "epoch": 0.6005049524179452, "grad_norm": 0.96875, "learning_rate": 1.774143298995346e-05, "loss": 0.7257869839668274, "step": 1546 }, { "epoch": 0.6008933773548262, "grad_norm": 1.1875, "learning_rate": 1.771894527460224e-05, "loss": 0.8812562823295593, "step": 1547 }, { "epoch": 0.6012818022917071, "grad_norm": 1.1796875, "learning_rate": 1.7696460480687728e-05, "loss": 0.8879989385604858, "step": 1548 }, { "epoch": 0.6016702272285881, "grad_norm": 1.296875, "learning_rate": 1.7673978637007073e-05, "loss": 1.1152243614196777, "step": 1549 }, { "epoch": 0.602058652165469, "grad_norm": 1.0625, "learning_rate": 1.7651499772353663e-05, "loss": 0.9299732446670532, "step": 1550 }, { "epoch": 0.60244707710235, "grad_norm": 1.328125, "learning_rate": 1.7629023915517058e-05, "loss": 0.8411107063293457, "step": 1551 }, { "epoch": 0.6028355020392309, "grad_norm": 1.234375, "learning_rate": 1.7606551095282978e-05, "loss": 1.1307954788208008, "step": 1552 }, { "epoch": 0.6032239269761118, "grad_norm": 1.125, "learning_rate": 1.758408134043325e-05, "loss": 0.6934619545936584, "step": 1553 }, { "epoch": 0.6036123519129928, "grad_norm": 1.21875, "learning_rate": 1.7561614679745773e-05, "loss": 0.7427366971969604, "step": 1554 }, { "epoch": 0.6040007768498737, "grad_norm": 1.34375, "learning_rate": 1.7539151141994488e-05, "loss": 0.966259241104126, "step": 1555 }, { "epoch": 0.6043892017867547, "grad_norm": 1.359375, "learning_rate": 1.7516690755949333e-05, "loss": 0.921768307685852, "step": 1556 }, { "epoch": 0.6047776267236357, "grad_norm": 1.2890625, "learning_rate": 1.7494233550376208e-05, "loss": 0.9942286610603333, "step": 1557 }, { "epoch": 0.6051660516605166, "grad_norm": 1.3203125, "learning_rate": 1.7471779554036933e-05, "loss": 0.8061314225196838, "step": 1558 }, { "epoch": 0.6055544765973976, "grad_norm": 1.296875, "learning_rate": 1.744932879568924e-05, "loss": 0.9774429202079773, "step": 1559 }, { "epoch": 0.6059429015342785, "grad_norm": 1.1640625, "learning_rate": 1.74268813040867e-05, "loss": 1.0019453763961792, "step": 1560 }, { "epoch": 0.6063313264711594, "grad_norm": 1.3125, "learning_rate": 1.7404437107978693e-05, "loss": 1.1430869102478027, "step": 1561 }, { "epoch": 0.6067197514080404, "grad_norm": 1.046875, "learning_rate": 1.7381996236110386e-05, "loss": 1.098380208015442, "step": 1562 }, { "epoch": 0.6071081763449213, "grad_norm": 1.03125, "learning_rate": 1.7359558717222692e-05, "loss": 0.9038490056991577, "step": 1563 }, { "epoch": 0.6074966012818023, "grad_norm": 1.28125, "learning_rate": 1.7337124580052224e-05, "loss": 1.0441936254501343, "step": 1564 }, { "epoch": 0.6078850262186832, "grad_norm": 1.0625, "learning_rate": 1.7314693853331264e-05, "loss": 0.895246684551239, "step": 1565 }, { "epoch": 0.6082734511555642, "grad_norm": 1.1796875, "learning_rate": 1.729226656578773e-05, "loss": 0.9961055517196655, "step": 1566 }, { "epoch": 0.6086618760924452, "grad_norm": 1.359375, "learning_rate": 1.7269842746145133e-05, "loss": 0.9731868505477905, "step": 1567 }, { "epoch": 0.6090503010293261, "grad_norm": 1.1796875, "learning_rate": 1.7247422423122536e-05, "loss": 1.063889980316162, "step": 1568 }, { "epoch": 0.6094387259662071, "grad_norm": 1.109375, "learning_rate": 1.722500562543454e-05, "loss": 1.035408854484558, "step": 1569 }, { "epoch": 0.609827150903088, "grad_norm": 1.0546875, "learning_rate": 1.7202592381791222e-05, "loss": 0.8740212917327881, "step": 1570 }, { "epoch": 0.6102155758399689, "grad_norm": 1.3828125, "learning_rate": 1.7180182720898103e-05, "loss": 1.1891071796417236, "step": 1571 }, { "epoch": 0.6106040007768498, "grad_norm": 1.40625, "learning_rate": 1.7157776671456114e-05, "loss": 1.144100546836853, "step": 1572 }, { "epoch": 0.6109924257137308, "grad_norm": 1.203125, "learning_rate": 1.7135374262161567e-05, "loss": 0.7290416955947876, "step": 1573 }, { "epoch": 0.6113808506506118, "grad_norm": 1.109375, "learning_rate": 1.711297552170612e-05, "loss": 0.7997403740882874, "step": 1574 }, { "epoch": 0.6117692755874927, "grad_norm": 1.09375, "learning_rate": 1.7090580478776717e-05, "loss": 0.9958997368812561, "step": 1575 }, { "epoch": 0.6121577005243737, "grad_norm": 1.1953125, "learning_rate": 1.706818916205557e-05, "loss": 0.7070245146751404, "step": 1576 }, { "epoch": 0.6125461254612546, "grad_norm": 1.2109375, "learning_rate": 1.7045801600220128e-05, "loss": 1.1302367448806763, "step": 1577 }, { "epoch": 0.6129345503981356, "grad_norm": 1.171875, "learning_rate": 1.702341782194301e-05, "loss": 0.9021425843238831, "step": 1578 }, { "epoch": 0.6133229753350166, "grad_norm": 1.1796875, "learning_rate": 1.7001037855892007e-05, "loss": 0.8579083681106567, "step": 1579 }, { "epoch": 0.6137114002718974, "grad_norm": 1.1640625, "learning_rate": 1.697866173073004e-05, "loss": 0.8023306131362915, "step": 1580 }, { "epoch": 0.6140998252087784, "grad_norm": 1.171875, "learning_rate": 1.6956289475115077e-05, "loss": 0.8973903059959412, "step": 1581 }, { "epoch": 0.6144882501456593, "grad_norm": 1.1796875, "learning_rate": 1.6933921117700156e-05, "loss": 0.8926674723625183, "step": 1582 }, { "epoch": 0.6148766750825403, "grad_norm": 1.2265625, "learning_rate": 1.691155668713331e-05, "loss": 0.8589719533920288, "step": 1583 }, { "epoch": 0.6152651000194213, "grad_norm": 1.2578125, "learning_rate": 1.6889196212057544e-05, "loss": 0.9503453373908997, "step": 1584 }, { "epoch": 0.6156535249563022, "grad_norm": 1.1484375, "learning_rate": 1.686683972111081e-05, "loss": 0.9685193300247192, "step": 1585 }, { "epoch": 0.6160419498931832, "grad_norm": 1.1484375, "learning_rate": 1.684448724292594e-05, "loss": 0.7065480947494507, "step": 1586 }, { "epoch": 0.6164303748300641, "grad_norm": 1.0, "learning_rate": 1.6822138806130634e-05, "loss": 0.8397907018661499, "step": 1587 }, { "epoch": 0.6168187997669451, "grad_norm": 1.265625, "learning_rate": 1.679979443934742e-05, "loss": 0.6110696196556091, "step": 1588 }, { "epoch": 0.6172072247038259, "grad_norm": 1.3125, "learning_rate": 1.6777454171193593e-05, "loss": 0.847122311592102, "step": 1589 }, { "epoch": 0.6175956496407069, "grad_norm": 1.1953125, "learning_rate": 1.675511803028124e-05, "loss": 0.7989035844802856, "step": 1590 }, { "epoch": 0.6179840745775879, "grad_norm": 1.109375, "learning_rate": 1.673278604521712e-05, "loss": 0.9426660537719727, "step": 1591 }, { "epoch": 0.6183724995144688, "grad_norm": 1.1953125, "learning_rate": 1.6710458244602695e-05, "loss": 1.0148433446884155, "step": 1592 }, { "epoch": 0.6187609244513498, "grad_norm": 1.390625, "learning_rate": 1.6688134657034055e-05, "loss": 0.8798336982727051, "step": 1593 }, { "epoch": 0.6191493493882307, "grad_norm": 1.1953125, "learning_rate": 1.6665815311101896e-05, "loss": 1.1171753406524658, "step": 1594 }, { "epoch": 0.6195377743251117, "grad_norm": 1.125, "learning_rate": 1.6643500235391494e-05, "loss": 0.8182234764099121, "step": 1595 }, { "epoch": 0.6199261992619927, "grad_norm": 1.1328125, "learning_rate": 1.6621189458482633e-05, "loss": 0.9257336258888245, "step": 1596 }, { "epoch": 0.6203146241988736, "grad_norm": 1.34375, "learning_rate": 1.6598883008949616e-05, "loss": 0.9460513591766357, "step": 1597 }, { "epoch": 0.6207030491357545, "grad_norm": 0.98828125, "learning_rate": 1.6576580915361186e-05, "loss": 0.6626075506210327, "step": 1598 }, { "epoch": 0.6210914740726354, "grad_norm": 1.140625, "learning_rate": 1.6554283206280504e-05, "loss": 0.9743509292602539, "step": 1599 }, { "epoch": 0.6214798990095164, "grad_norm": 1.21875, "learning_rate": 1.6531989910265146e-05, "loss": 0.8678874373435974, "step": 1600 }, { "epoch": 0.6218683239463973, "grad_norm": 1.1171875, "learning_rate": 1.6509701055867e-05, "loss": 1.0656843185424805, "step": 1601 }, { "epoch": 0.6222567488832783, "grad_norm": 1.3828125, "learning_rate": 1.648741667163229e-05, "loss": 0.819029688835144, "step": 1602 }, { "epoch": 0.6226451738201593, "grad_norm": 1.359375, "learning_rate": 1.6465136786101498e-05, "loss": 0.9206539392471313, "step": 1603 }, { "epoch": 0.6230335987570402, "grad_norm": 1.2265625, "learning_rate": 1.6442861427809343e-05, "loss": 1.0533063411712646, "step": 1604 }, { "epoch": 0.6234220236939212, "grad_norm": 1.1953125, "learning_rate": 1.6420590625284776e-05, "loss": 0.7231786847114563, "step": 1605 }, { "epoch": 0.6238104486308022, "grad_norm": 1.40625, "learning_rate": 1.639832440705088e-05, "loss": 1.2023369073867798, "step": 1606 }, { "epoch": 0.624198873567683, "grad_norm": 1.1953125, "learning_rate": 1.6376062801624873e-05, "loss": 1.1156659126281738, "step": 1607 }, { "epoch": 0.624587298504564, "grad_norm": 1.21875, "learning_rate": 1.6353805837518083e-05, "loss": 0.9612250328063965, "step": 1608 }, { "epoch": 0.6249757234414449, "grad_norm": 1.3125, "learning_rate": 1.633155354323587e-05, "loss": 0.9106634855270386, "step": 1609 }, { "epoch": 0.6253641483783259, "grad_norm": 1.0859375, "learning_rate": 1.630930594727762e-05, "loss": 0.8799929022789001, "step": 1610 }, { "epoch": 0.6257525733152068, "grad_norm": 1.2890625, "learning_rate": 1.6287063078136723e-05, "loss": 0.8176321983337402, "step": 1611 }, { "epoch": 0.6261409982520878, "grad_norm": 1.171875, "learning_rate": 1.626482496430049e-05, "loss": 0.942060112953186, "step": 1612 }, { "epoch": 0.6265294231889688, "grad_norm": 1.2734375, "learning_rate": 1.6242591634250154e-05, "loss": 1.0794072151184082, "step": 1613 }, { "epoch": 0.6269178481258497, "grad_norm": 1.1484375, "learning_rate": 1.622036311646081e-05, "loss": 0.9998536705970764, "step": 1614 }, { "epoch": 0.6273062730627307, "grad_norm": 1.0078125, "learning_rate": 1.6198139439401394e-05, "loss": 1.0494023561477661, "step": 1615 }, { "epoch": 0.6276946979996115, "grad_norm": 1.2734375, "learning_rate": 1.6175920631534657e-05, "loss": 0.8013200759887695, "step": 1616 }, { "epoch": 0.6280831229364925, "grad_norm": 1.328125, "learning_rate": 1.6153706721317097e-05, "loss": 0.938382625579834, "step": 1617 }, { "epoch": 0.6284715478733734, "grad_norm": 1.234375, "learning_rate": 1.6131497737198942e-05, "loss": 0.9575418829917908, "step": 1618 }, { "epoch": 0.6288599728102544, "grad_norm": 1.1796875, "learning_rate": 1.6109293707624117e-05, "loss": 1.0600093603134155, "step": 1619 }, { "epoch": 0.6292483977471354, "grad_norm": 1.03125, "learning_rate": 1.6087094661030185e-05, "loss": 0.6437755823135376, "step": 1620 }, { "epoch": 0.6296368226840163, "grad_norm": 1.59375, "learning_rate": 1.6064900625848358e-05, "loss": 0.8591285347938538, "step": 1621 }, { "epoch": 0.6300252476208973, "grad_norm": 1.4765625, "learning_rate": 1.6042711630503406e-05, "loss": 1.0264102220535278, "step": 1622 }, { "epoch": 0.6304136725577782, "grad_norm": 0.984375, "learning_rate": 1.602052770341365e-05, "loss": 0.9590293169021606, "step": 1623 }, { "epoch": 0.6308020974946592, "grad_norm": 1.21875, "learning_rate": 1.5998348872990913e-05, "loss": 1.0270369052886963, "step": 1624 }, { "epoch": 0.6311905224315401, "grad_norm": 1.265625, "learning_rate": 1.59761751676405e-05, "loss": 1.069689393043518, "step": 1625 }, { "epoch": 0.631578947368421, "grad_norm": 1.65625, "learning_rate": 1.5954006615761158e-05, "loss": 1.269720435142517, "step": 1626 }, { "epoch": 0.631967372305302, "grad_norm": 1.421875, "learning_rate": 1.593184324574502e-05, "loss": 1.1328396797180176, "step": 1627 }, { "epoch": 0.6323557972421829, "grad_norm": 1.0078125, "learning_rate": 1.5909685085977585e-05, "loss": 0.7744377851486206, "step": 1628 }, { "epoch": 0.6327442221790639, "grad_norm": 1.1328125, "learning_rate": 1.588753216483768e-05, "loss": 0.7979485988616943, "step": 1629 }, { "epoch": 0.6331326471159449, "grad_norm": 0.94921875, "learning_rate": 1.5865384510697423e-05, "loss": 0.8825502395629883, "step": 1630 }, { "epoch": 0.6335210720528258, "grad_norm": 1.2109375, "learning_rate": 1.58432421519222e-05, "loss": 0.7930506467819214, "step": 1631 }, { "epoch": 0.6339094969897068, "grad_norm": 1.171875, "learning_rate": 1.5821105116870594e-05, "loss": 1.007103443145752, "step": 1632 }, { "epoch": 0.6342979219265877, "grad_norm": 1.171875, "learning_rate": 1.5798973433894377e-05, "loss": 0.9539839029312134, "step": 1633 }, { "epoch": 0.6346863468634686, "grad_norm": 1.2265625, "learning_rate": 1.5776847131338472e-05, "loss": 0.8074854612350464, "step": 1634 }, { "epoch": 0.6350747718003495, "grad_norm": 1.1796875, "learning_rate": 1.5754726237540905e-05, "loss": 0.8729029893875122, "step": 1635 }, { "epoch": 0.6354631967372305, "grad_norm": 1.375, "learning_rate": 1.5732610780832772e-05, "loss": 0.7142243981361389, "step": 1636 }, { "epoch": 0.6358516216741115, "grad_norm": 1.0390625, "learning_rate": 1.5710500789538216e-05, "loss": 0.8070962429046631, "step": 1637 }, { "epoch": 0.6362400466109924, "grad_norm": 1.359375, "learning_rate": 1.5688396291974374e-05, "loss": 0.9472150802612305, "step": 1638 }, { "epoch": 0.6366284715478734, "grad_norm": 1.2109375, "learning_rate": 1.5666297316451348e-05, "loss": 0.9600380659103394, "step": 1639 }, { "epoch": 0.6370168964847543, "grad_norm": 1.125, "learning_rate": 1.564420389127216e-05, "loss": 1.0328888893127441, "step": 1640 }, { "epoch": 0.6374053214216353, "grad_norm": 1.1796875, "learning_rate": 1.5622116044732733e-05, "loss": 0.8305822610855103, "step": 1641 }, { "epoch": 0.6377937463585163, "grad_norm": 1.328125, "learning_rate": 1.560003380512185e-05, "loss": 0.9473103284835815, "step": 1642 }, { "epoch": 0.6381821712953971, "grad_norm": 1.1640625, "learning_rate": 1.5577957200721097e-05, "loss": 0.7738395929336548, "step": 1643 }, { "epoch": 0.6385705962322781, "grad_norm": 1.09375, "learning_rate": 1.555588625980486e-05, "loss": 0.9083512425422668, "step": 1644 }, { "epoch": 0.638959021169159, "grad_norm": 1.09375, "learning_rate": 1.5533821010640256e-05, "loss": 0.8488430976867676, "step": 1645 }, { "epoch": 0.63934744610604, "grad_norm": 1.203125, "learning_rate": 1.551176148148712e-05, "loss": 0.8478500247001648, "step": 1646 }, { "epoch": 0.639735871042921, "grad_norm": 1.078125, "learning_rate": 1.5489707700597963e-05, "loss": 1.0815755128860474, "step": 1647 }, { "epoch": 0.6401242959798019, "grad_norm": 1.015625, "learning_rate": 1.546765969621793e-05, "loss": 0.8073235154151917, "step": 1648 }, { "epoch": 0.6405127209166829, "grad_norm": 1.1640625, "learning_rate": 1.544561749658477e-05, "loss": 0.8393030166625977, "step": 1649 }, { "epoch": 0.6409011458535638, "grad_norm": 1.28125, "learning_rate": 1.54235811299288e-05, "loss": 0.9303040504455566, "step": 1650 }, { "epoch": 0.6412895707904448, "grad_norm": 1.1015625, "learning_rate": 1.5401550624472846e-05, "loss": 0.9657217860221863, "step": 1651 }, { "epoch": 0.6416779957273256, "grad_norm": 1.1640625, "learning_rate": 1.537952600843227e-05, "loss": 0.9006993770599365, "step": 1652 }, { "epoch": 0.6420664206642066, "grad_norm": 1.2890625, "learning_rate": 1.5357507310014854e-05, "loss": 0.9724708795547485, "step": 1653 }, { "epoch": 0.6424548456010876, "grad_norm": 1.0625, "learning_rate": 1.5335494557420815e-05, "loss": 0.6768783926963806, "step": 1654 }, { "epoch": 0.6428432705379685, "grad_norm": 1.234375, "learning_rate": 1.531348777884275e-05, "loss": 0.9249292016029358, "step": 1655 }, { "epoch": 0.6432316954748495, "grad_norm": 1.09375, "learning_rate": 1.5291487002465603e-05, "loss": 0.9395256638526917, "step": 1656 }, { "epoch": 0.6436201204117304, "grad_norm": 1.3515625, "learning_rate": 1.5269492256466646e-05, "loss": 0.8774154186248779, "step": 1657 }, { "epoch": 0.6440085453486114, "grad_norm": 2.046875, "learning_rate": 1.5247503569015413e-05, "loss": 1.0675946474075317, "step": 1658 }, { "epoch": 0.6443969702854924, "grad_norm": 1.140625, "learning_rate": 1.5225520968273682e-05, "loss": 0.8805907964706421, "step": 1659 }, { "epoch": 0.6447853952223733, "grad_norm": 1.4453125, "learning_rate": 1.5203544482395438e-05, "loss": 1.0703556537628174, "step": 1660 }, { "epoch": 0.6451738201592542, "grad_norm": 1.2890625, "learning_rate": 1.518157413952682e-05, "loss": 0.9873864650726318, "step": 1661 }, { "epoch": 0.6455622450961351, "grad_norm": 1.0234375, "learning_rate": 1.5159609967806135e-05, "loss": 0.8109238743782043, "step": 1662 }, { "epoch": 0.6459506700330161, "grad_norm": 1.234375, "learning_rate": 1.513765199536375e-05, "loss": 0.93097984790802, "step": 1663 }, { "epoch": 0.646339094969897, "grad_norm": 1.0859375, "learning_rate": 1.5115700250322107e-05, "loss": 0.7679112553596497, "step": 1664 }, { "epoch": 0.646727519906778, "grad_norm": 1.15625, "learning_rate": 1.5093754760795674e-05, "loss": 0.9165991544723511, "step": 1665 }, { "epoch": 0.647115944843659, "grad_norm": 1.5390625, "learning_rate": 1.5071815554890899e-05, "loss": 0.9556660652160645, "step": 1666 }, { "epoch": 0.6475043697805399, "grad_norm": 1.578125, "learning_rate": 1.5049882660706187e-05, "loss": 0.931258499622345, "step": 1667 }, { "epoch": 0.6478927947174209, "grad_norm": 1.15625, "learning_rate": 1.502795610633187e-05, "loss": 0.8490819334983826, "step": 1668 }, { "epoch": 0.6482812196543019, "grad_norm": 1.296875, "learning_rate": 1.5006035919850144e-05, "loss": 0.8354121446609497, "step": 1669 }, { "epoch": 0.6486696445911827, "grad_norm": 1.140625, "learning_rate": 1.4984122129335053e-05, "loss": 0.9482797384262085, "step": 1670 }, { "epoch": 0.6490580695280637, "grad_norm": 1.1796875, "learning_rate": 1.4962214762852459e-05, "loss": 0.9794711470603943, "step": 1671 }, { "epoch": 0.6494464944649446, "grad_norm": 1.328125, "learning_rate": 1.4940313848459975e-05, "loss": 0.9440234303474426, "step": 1672 }, { "epoch": 0.6498349194018256, "grad_norm": 1.28125, "learning_rate": 1.4918419414206988e-05, "loss": 0.8302776217460632, "step": 1673 }, { "epoch": 0.6502233443387065, "grad_norm": 1.3515625, "learning_rate": 1.489653148813455e-05, "loss": 1.0263035297393799, "step": 1674 }, { "epoch": 0.6506117692755875, "grad_norm": 1.21875, "learning_rate": 1.4874650098275392e-05, "loss": 0.8148124814033508, "step": 1675 }, { "epoch": 0.6510001942124685, "grad_norm": 1.1171875, "learning_rate": 1.4852775272653874e-05, "loss": 0.9238406419754028, "step": 1676 }, { "epoch": 0.6513886191493494, "grad_norm": 1.453125, "learning_rate": 1.483090703928594e-05, "loss": 0.8295400738716125, "step": 1677 }, { "epoch": 0.6517770440862304, "grad_norm": 1.1875, "learning_rate": 1.480904542617911e-05, "loss": 0.8350507616996765, "step": 1678 }, { "epoch": 0.6521654690231112, "grad_norm": 1.25, "learning_rate": 1.4787190461332404e-05, "loss": 1.6805524826049805, "step": 1679 }, { "epoch": 0.6525538939599922, "grad_norm": 0.98046875, "learning_rate": 1.4765342172736343e-05, "loss": 0.8090325593948364, "step": 1680 }, { "epoch": 0.6529423188968732, "grad_norm": 1.09375, "learning_rate": 1.4743500588372892e-05, "loss": 0.8051353693008423, "step": 1681 }, { "epoch": 0.6533307438337541, "grad_norm": 1.1640625, "learning_rate": 1.4721665736215416e-05, "loss": 0.8369855284690857, "step": 1682 }, { "epoch": 0.6537191687706351, "grad_norm": 1.03125, "learning_rate": 1.4699837644228688e-05, "loss": 0.8185234069824219, "step": 1683 }, { "epoch": 0.654107593707516, "grad_norm": 1.1015625, "learning_rate": 1.4678016340368798e-05, "loss": 0.9448896050453186, "step": 1684 }, { "epoch": 0.654496018644397, "grad_norm": 1.2265625, "learning_rate": 1.4656201852583154e-05, "loss": 0.9722593426704407, "step": 1685 }, { "epoch": 0.654884443581278, "grad_norm": 1.234375, "learning_rate": 1.4634394208810427e-05, "loss": 0.9408589601516724, "step": 1686 }, { "epoch": 0.6552728685181589, "grad_norm": 1.3515625, "learning_rate": 1.461259343698052e-05, "loss": 0.8367534875869751, "step": 1687 }, { "epoch": 0.6556612934550398, "grad_norm": 1.4296875, "learning_rate": 1.459079956501455e-05, "loss": 1.0196623802185059, "step": 1688 }, { "epoch": 0.6560497183919207, "grad_norm": 1.28125, "learning_rate": 1.4569012620824788e-05, "loss": 0.9133743643760681, "step": 1689 }, { "epoch": 0.6564381433288017, "grad_norm": 1.2734375, "learning_rate": 1.4547232632314624e-05, "loss": 1.251158595085144, "step": 1690 }, { "epoch": 0.6568265682656826, "grad_norm": 1.15625, "learning_rate": 1.4525459627378557e-05, "loss": 1.1144676208496094, "step": 1691 }, { "epoch": 0.6572149932025636, "grad_norm": 1.125, "learning_rate": 1.4503693633902128e-05, "loss": 0.9940688014030457, "step": 1692 }, { "epoch": 0.6576034181394446, "grad_norm": 1.3515625, "learning_rate": 1.4481934679761893e-05, "loss": 0.6721562147140503, "step": 1693 }, { "epoch": 0.6579918430763255, "grad_norm": 1.125, "learning_rate": 1.4460182792825422e-05, "loss": 0.8967936635017395, "step": 1694 }, { "epoch": 0.6583802680132065, "grad_norm": 1.0859375, "learning_rate": 1.4438438000951202e-05, "loss": 0.7560436725616455, "step": 1695 }, { "epoch": 0.6587686929500874, "grad_norm": 1.046875, "learning_rate": 1.4416700331988648e-05, "loss": 0.9431365728378296, "step": 1696 }, { "epoch": 0.6591571178869683, "grad_norm": 1.1171875, "learning_rate": 1.4394969813778048e-05, "loss": 0.7840831875801086, "step": 1697 }, { "epoch": 0.6595455428238493, "grad_norm": 1.28125, "learning_rate": 1.437324647415053e-05, "loss": 0.8320786952972412, "step": 1698 }, { "epoch": 0.6599339677607302, "grad_norm": 1.5703125, "learning_rate": 1.4351530340928039e-05, "loss": 1.0653414726257324, "step": 1699 }, { "epoch": 0.6603223926976112, "grad_norm": 1.1875, "learning_rate": 1.4329821441923281e-05, "loss": 0.8682868480682373, "step": 1700 } ], "logging_steps": 1, "max_steps": 2833, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 850, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.59385072388691e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }