{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9894736842105263, "eval_steps": 500, "global_step": 378, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007894736842105263, "grad_norm": 7.0740203857421875, "learning_rate": 2.6315789473684213e-07, "loss": 1.1186, "step": 1 }, { "epoch": 0.015789473684210527, "grad_norm": 7.031124114990234, "learning_rate": 5.263157894736843e-07, "loss": 1.1281, "step": 2 }, { "epoch": 0.02368421052631579, "grad_norm": 6.89492654800415, "learning_rate": 7.894736842105263e-07, "loss": 1.1185, "step": 3 }, { "epoch": 0.031578947368421054, "grad_norm": 7.015303134918213, "learning_rate": 1.0526315789473685e-06, "loss": 1.129, "step": 4 }, { "epoch": 0.039473684210526314, "grad_norm": 6.810600757598877, "learning_rate": 1.3157894736842106e-06, "loss": 1.103, "step": 5 }, { "epoch": 0.04736842105263158, "grad_norm": 6.700727462768555, "learning_rate": 1.5789473684210526e-06, "loss": 1.129, "step": 6 }, { "epoch": 0.05526315789473684, "grad_norm": 5.1340131759643555, "learning_rate": 1.8421052631578948e-06, "loss": 1.0762, "step": 7 }, { "epoch": 0.06315789473684211, "grad_norm": 4.850973606109619, "learning_rate": 2.105263157894737e-06, "loss": 1.0529, "step": 8 }, { "epoch": 0.07105263157894737, "grad_norm": 4.519778728485107, "learning_rate": 2.368421052631579e-06, "loss": 1.0531, "step": 9 }, { "epoch": 0.07894736842105263, "grad_norm": 2.700388193130493, "learning_rate": 2.631578947368421e-06, "loss": 0.9795, "step": 10 }, { "epoch": 0.0868421052631579, "grad_norm": 2.6331188678741455, "learning_rate": 2.8947368421052634e-06, "loss": 1.0095, "step": 11 }, { "epoch": 0.09473684210526316, "grad_norm": 2.4724442958831787, "learning_rate": 3.157894736842105e-06, "loss": 0.9728, "step": 12 }, { "epoch": 0.10263157894736842, "grad_norm": 3.1326963901519775, "learning_rate": 3.421052631578948e-06, "loss": 0.9307, "step": 13 }, { "epoch": 0.11052631578947368, "grad_norm": 3.435920476913452, "learning_rate": 3.6842105263157896e-06, "loss": 0.9612, "step": 14 }, { "epoch": 0.11842105263157894, "grad_norm": 3.3896520137786865, "learning_rate": 3.947368421052632e-06, "loss": 0.9617, "step": 15 }, { "epoch": 0.12631578947368421, "grad_norm": 2.802786111831665, "learning_rate": 4.210526315789474e-06, "loss": 0.9333, "step": 16 }, { "epoch": 0.13421052631578947, "grad_norm": 1.9803308248519897, "learning_rate": 4.473684210526316e-06, "loss": 0.9159, "step": 17 }, { "epoch": 0.14210526315789473, "grad_norm": 2.1656057834625244, "learning_rate": 4.736842105263158e-06, "loss": 0.8997, "step": 18 }, { "epoch": 0.15, "grad_norm": 2.4422030448913574, "learning_rate": 5e-06, "loss": 0.9104, "step": 19 }, { "epoch": 0.15789473684210525, "grad_norm": 2.1623854637145996, "learning_rate": 5.263157894736842e-06, "loss": 0.8804, "step": 20 }, { "epoch": 0.16578947368421051, "grad_norm": 1.7336848974227905, "learning_rate": 5.526315789473685e-06, "loss": 0.8454, "step": 21 }, { "epoch": 0.1736842105263158, "grad_norm": 1.314452886581421, "learning_rate": 5.789473684210527e-06, "loss": 0.8531, "step": 22 }, { "epoch": 0.18157894736842106, "grad_norm": 1.3969931602478027, "learning_rate": 6.0526315789473685e-06, "loss": 0.8296, "step": 23 }, { "epoch": 0.18947368421052632, "grad_norm": 1.5788418054580688, "learning_rate": 6.31578947368421e-06, "loss": 0.828, "step": 24 }, { "epoch": 0.19736842105263158, "grad_norm": 1.5884418487548828, "learning_rate": 6.578947368421054e-06, "loss": 0.8286, "step": 25 }, { "epoch": 0.20526315789473684, "grad_norm": 1.1489850282669067, "learning_rate": 6.842105263157896e-06, "loss": 0.8069, "step": 26 }, { "epoch": 0.2131578947368421, "grad_norm": 1.0133432149887085, "learning_rate": 7.1052631578947375e-06, "loss": 0.8102, "step": 27 }, { "epoch": 0.22105263157894736, "grad_norm": 0.9435691237449646, "learning_rate": 7.368421052631579e-06, "loss": 0.7892, "step": 28 }, { "epoch": 0.22894736842105262, "grad_norm": 1.0618520975112915, "learning_rate": 7.631578947368423e-06, "loss": 0.7961, "step": 29 }, { "epoch": 0.23684210526315788, "grad_norm": 0.953088641166687, "learning_rate": 7.894736842105265e-06, "loss": 0.8092, "step": 30 }, { "epoch": 0.24473684210526317, "grad_norm": 0.843917727470398, "learning_rate": 8.157894736842106e-06, "loss": 0.7846, "step": 31 }, { "epoch": 0.25263157894736843, "grad_norm": 0.8666284680366516, "learning_rate": 8.421052631578948e-06, "loss": 0.7538, "step": 32 }, { "epoch": 0.26052631578947366, "grad_norm": 0.8682100176811218, "learning_rate": 8.68421052631579e-06, "loss": 0.7802, "step": 33 }, { "epoch": 0.26842105263157895, "grad_norm": 0.7664394974708557, "learning_rate": 8.947368421052632e-06, "loss": 0.7718, "step": 34 }, { "epoch": 0.27631578947368424, "grad_norm": 0.7890555262565613, "learning_rate": 9.210526315789474e-06, "loss": 0.7826, "step": 35 }, { "epoch": 0.28421052631578947, "grad_norm": 0.8435359001159668, "learning_rate": 9.473684210526315e-06, "loss": 0.7763, "step": 36 }, { "epoch": 0.29210526315789476, "grad_norm": 0.8021636605262756, "learning_rate": 9.736842105263159e-06, "loss": 0.7564, "step": 37 }, { "epoch": 0.3, "grad_norm": 0.8132019639015198, "learning_rate": 1e-05, "loss": 0.7508, "step": 38 }, { "epoch": 0.3078947368421053, "grad_norm": 0.7610636949539185, "learning_rate": 9.99978655851684e-06, "loss": 0.7492, "step": 39 }, { "epoch": 0.3157894736842105, "grad_norm": 0.8269503712654114, "learning_rate": 9.999146252290264e-06, "loss": 0.7627, "step": 40 }, { "epoch": 0.3236842105263158, "grad_norm": 0.7848504781723022, "learning_rate": 9.998079135987437e-06, "loss": 0.739, "step": 41 }, { "epoch": 0.33157894736842103, "grad_norm": 0.7971328496932983, "learning_rate": 9.996585300715117e-06, "loss": 0.7372, "step": 42 }, { "epoch": 0.3394736842105263, "grad_norm": 0.7159483432769775, "learning_rate": 9.994664874011864e-06, "loss": 0.7537, "step": 43 }, { "epoch": 0.3473684210526316, "grad_norm": 0.7174955606460571, "learning_rate": 9.992318019837171e-06, "loss": 0.7485, "step": 44 }, { "epoch": 0.35526315789473684, "grad_norm": 0.8257307410240173, "learning_rate": 9.989544938557453e-06, "loss": 0.7487, "step": 45 }, { "epoch": 0.3631578947368421, "grad_norm": 0.7395874857902527, "learning_rate": 9.98634586692894e-06, "loss": 0.7465, "step": 46 }, { "epoch": 0.37105263157894736, "grad_norm": 0.8656660914421082, "learning_rate": 9.982721078077474e-06, "loss": 0.7471, "step": 47 }, { "epoch": 0.37894736842105264, "grad_norm": 0.6341963410377502, "learning_rate": 9.978670881475173e-06, "loss": 0.7643, "step": 48 }, { "epoch": 0.3868421052631579, "grad_norm": 0.8313978910446167, "learning_rate": 9.97419562291403e-06, "loss": 0.7386, "step": 49 }, { "epoch": 0.39473684210526316, "grad_norm": 0.93310546875, "learning_rate": 9.96929568447637e-06, "loss": 0.754, "step": 50 }, { "epoch": 0.4026315789473684, "grad_norm": 0.7993932962417603, "learning_rate": 9.963971484502247e-06, "loss": 0.731, "step": 51 }, { "epoch": 0.4105263157894737, "grad_norm": 0.7355050444602966, "learning_rate": 9.958223477553715e-06, "loss": 0.7542, "step": 52 }, { "epoch": 0.41842105263157897, "grad_norm": 0.7957926392555237, "learning_rate": 9.952052154376027e-06, "loss": 0.7626, "step": 53 }, { "epoch": 0.4263157894736842, "grad_norm": 0.9310785531997681, "learning_rate": 9.945458041855732e-06, "loss": 0.7303, "step": 54 }, { "epoch": 0.4342105263157895, "grad_norm": 0.8477827906608582, "learning_rate": 9.938441702975689e-06, "loss": 0.7143, "step": 55 }, { "epoch": 0.4421052631578947, "grad_norm": 0.654358446598053, "learning_rate": 9.931003736767013e-06, "loss": 0.7097, "step": 56 }, { "epoch": 0.45, "grad_norm": 0.8673765063285828, "learning_rate": 9.923144778257918e-06, "loss": 0.7253, "step": 57 }, { "epoch": 0.45789473684210524, "grad_norm": 0.7678876519203186, "learning_rate": 9.91486549841951e-06, "loss": 0.7212, "step": 58 }, { "epoch": 0.46578947368421053, "grad_norm": 0.7339031100273132, "learning_rate": 9.906166604108494e-06, "loss": 0.7254, "step": 59 }, { "epoch": 0.47368421052631576, "grad_norm": 0.7133976221084595, "learning_rate": 9.89704883800683e-06, "loss": 0.7306, "step": 60 }, { "epoch": 0.48157894736842105, "grad_norm": 0.8026192784309387, "learning_rate": 9.887512978558329e-06, "loss": 0.7245, "step": 61 }, { "epoch": 0.48947368421052634, "grad_norm": 0.659774899482727, "learning_rate": 9.877559839902185e-06, "loss": 0.7239, "step": 62 }, { "epoch": 0.49736842105263157, "grad_norm": 0.6718412637710571, "learning_rate": 9.867190271803466e-06, "loss": 0.6966, "step": 63 }, { "epoch": 0.5052631578947369, "grad_norm": 0.782952070236206, "learning_rate": 9.85640515958057e-06, "loss": 0.7246, "step": 64 }, { "epoch": 0.5131578947368421, "grad_norm": 0.749127984046936, "learning_rate": 9.845205424029639e-06, "loss": 0.7121, "step": 65 }, { "epoch": 0.5210526315789473, "grad_norm": 0.773841917514801, "learning_rate": 9.833592021345938e-06, "loss": 0.7341, "step": 66 }, { "epoch": 0.5289473684210526, "grad_norm": 0.7773014903068542, "learning_rate": 9.821565943042225e-06, "loss": 0.707, "step": 67 }, { "epoch": 0.5368421052631579, "grad_norm": 0.748015820980072, "learning_rate": 9.809128215864096e-06, "loss": 0.7118, "step": 68 }, { "epoch": 0.5447368421052632, "grad_norm": 0.6848940849304199, "learning_rate": 9.796279901702326e-06, "loss": 0.6977, "step": 69 }, { "epoch": 0.5526315789473685, "grad_norm": 0.8350101709365845, "learning_rate": 9.783022097502204e-06, "loss": 0.7213, "step": 70 }, { "epoch": 0.5605263157894737, "grad_norm": 0.7320327758789062, "learning_rate": 9.76935593516989e-06, "loss": 0.7079, "step": 71 }, { "epoch": 0.5684210526315789, "grad_norm": 0.7068721055984497, "learning_rate": 9.755282581475769e-06, "loss": 0.7304, "step": 72 }, { "epoch": 0.5763157894736842, "grad_norm": 0.7000291347503662, "learning_rate": 9.74080323795483e-06, "loss": 0.714, "step": 73 }, { "epoch": 0.5842105263157895, "grad_norm": 0.8786887526512146, "learning_rate": 9.7259191408041e-06, "loss": 0.7155, "step": 74 }, { "epoch": 0.5921052631578947, "grad_norm": 0.7386007905006409, "learning_rate": 9.710631560777082e-06, "loss": 0.7088, "step": 75 }, { "epoch": 0.6, "grad_norm": 0.6937209367752075, "learning_rate": 9.694941803075285e-06, "loss": 0.7058, "step": 76 }, { "epoch": 0.6078947368421053, "grad_norm": 0.9193783402442932, "learning_rate": 9.678851207236764e-06, "loss": 0.738, "step": 77 }, { "epoch": 0.6157894736842106, "grad_norm": 0.9059263467788696, "learning_rate": 9.66236114702178e-06, "loss": 0.7116, "step": 78 }, { "epoch": 0.6236842105263158, "grad_norm": 0.6896800994873047, "learning_rate": 9.645473030295496e-06, "loss": 0.7207, "step": 79 }, { "epoch": 0.631578947368421, "grad_norm": 0.8141930103302002, "learning_rate": 9.628188298907782e-06, "loss": 0.7015, "step": 80 }, { "epoch": 0.6394736842105263, "grad_norm": 0.6706604361534119, "learning_rate": 9.610508428570122e-06, "loss": 0.7204, "step": 81 }, { "epoch": 0.6473684210526316, "grad_norm": 0.7771408557891846, "learning_rate": 9.592434928729617e-06, "loss": 0.7215, "step": 82 }, { "epoch": 0.6552631578947369, "grad_norm": 0.6452600955963135, "learning_rate": 9.573969342440107e-06, "loss": 0.7187, "step": 83 }, { "epoch": 0.6631578947368421, "grad_norm": 0.704627275466919, "learning_rate": 9.555113246230443e-06, "loss": 0.7383, "step": 84 }, { "epoch": 0.6710526315789473, "grad_norm": 0.675726056098938, "learning_rate": 9.535868249969882e-06, "loss": 0.7006, "step": 85 }, { "epoch": 0.6789473684210526, "grad_norm": 0.807037353515625, "learning_rate": 9.516235996730645e-06, "loss": 0.7098, "step": 86 }, { "epoch": 0.6868421052631579, "grad_norm": 0.6850730776786804, "learning_rate": 9.496218162647629e-06, "loss": 0.7095, "step": 87 }, { "epoch": 0.6947368421052632, "grad_norm": 0.7169205546379089, "learning_rate": 9.475816456775313e-06, "loss": 0.7209, "step": 88 }, { "epoch": 0.7026315789473684, "grad_norm": 0.8442161083221436, "learning_rate": 9.45503262094184e-06, "loss": 0.7133, "step": 89 }, { "epoch": 0.7105263157894737, "grad_norm": 0.67853182554245, "learning_rate": 9.43386842960031e-06, "loss": 0.7017, "step": 90 }, { "epoch": 0.718421052631579, "grad_norm": 0.6674996018409729, "learning_rate": 9.41232568967728e-06, "loss": 0.7039, "step": 91 }, { "epoch": 0.7263157894736842, "grad_norm": 0.6516873240470886, "learning_rate": 9.39040624041849e-06, "loss": 0.6955, "step": 92 }, { "epoch": 0.7342105263157894, "grad_norm": 0.6596349477767944, "learning_rate": 9.368111953231849e-06, "loss": 0.6895, "step": 93 }, { "epoch": 0.7421052631578947, "grad_norm": 0.6478255987167358, "learning_rate": 9.345444731527642e-06, "loss": 0.6935, "step": 94 }, { "epoch": 0.75, "grad_norm": 0.6235653758049011, "learning_rate": 9.32240651055604e-06, "loss": 0.6787, "step": 95 }, { "epoch": 0.7578947368421053, "grad_norm": 0.638984739780426, "learning_rate": 9.298999257241862e-06, "loss": 0.7282, "step": 96 }, { "epoch": 0.7657894736842106, "grad_norm": 0.710728108882904, "learning_rate": 9.275224970016656e-06, "loss": 0.7166, "step": 97 }, { "epoch": 0.7736842105263158, "grad_norm": 0.6417115926742554, "learning_rate": 9.251085678648072e-06, "loss": 0.6881, "step": 98 }, { "epoch": 0.781578947368421, "grad_norm": 0.7576119303703308, "learning_rate": 9.22658344406657e-06, "loss": 0.7071, "step": 99 }, { "epoch": 0.7894736842105263, "grad_norm": 0.8099032640457153, "learning_rate": 9.201720358189464e-06, "loss": 0.708, "step": 100 }, { "epoch": 0.7973684210526316, "grad_norm": 0.649307131767273, "learning_rate": 9.176498543742328e-06, "loss": 0.696, "step": 101 }, { "epoch": 0.8052631578947368, "grad_norm": 0.7050330638885498, "learning_rate": 9.150920154077753e-06, "loss": 0.7138, "step": 102 }, { "epoch": 0.8131578947368421, "grad_norm": 0.6682823896408081, "learning_rate": 9.124987372991512e-06, "loss": 0.6937, "step": 103 }, { "epoch": 0.8210526315789474, "grad_norm": 0.718499481678009, "learning_rate": 9.098702414536107e-06, "loss": 0.7022, "step": 104 }, { "epoch": 0.8289473684210527, "grad_norm": 0.7294292449951172, "learning_rate": 9.072067522831743e-06, "loss": 0.6789, "step": 105 }, { "epoch": 0.8368421052631579, "grad_norm": 0.6921276450157166, "learning_rate": 9.045084971874738e-06, "loss": 0.7057, "step": 106 }, { "epoch": 0.8447368421052631, "grad_norm": 0.8754315376281738, "learning_rate": 9.017757065343368e-06, "loss": 0.7262, "step": 107 }, { "epoch": 0.8526315789473684, "grad_norm": 0.6188150644302368, "learning_rate": 8.990086136401199e-06, "loss": 0.6853, "step": 108 }, { "epoch": 0.8605263157894737, "grad_norm": 0.7342610955238342, "learning_rate": 8.96207454749787e-06, "loss": 0.7164, "step": 109 }, { "epoch": 0.868421052631579, "grad_norm": 0.7715557217597961, "learning_rate": 8.933724690167417e-06, "loss": 0.7058, "step": 110 }, { "epoch": 0.8763157894736842, "grad_norm": 0.7064869403839111, "learning_rate": 8.905038984824079e-06, "loss": 0.698, "step": 111 }, { "epoch": 0.8842105263157894, "grad_norm": 0.7494336366653442, "learning_rate": 8.87601988055565e-06, "loss": 0.6961, "step": 112 }, { "epoch": 0.8921052631578947, "grad_norm": 0.664580762386322, "learning_rate": 8.846669854914395e-06, "loss": 0.7129, "step": 113 }, { "epoch": 0.9, "grad_norm": 0.7173993587493896, "learning_rate": 8.816991413705515e-06, "loss": 0.716, "step": 114 }, { "epoch": 0.9078947368421053, "grad_norm": 0.6451180577278137, "learning_rate": 8.786987090773214e-06, "loss": 0.7034, "step": 115 }, { "epoch": 0.9157894736842105, "grad_norm": 0.6762385368347168, "learning_rate": 8.756659447784367e-06, "loss": 0.6933, "step": 116 }, { "epoch": 0.9236842105263158, "grad_norm": 0.688220739364624, "learning_rate": 8.726011074009813e-06, "loss": 0.7154, "step": 117 }, { "epoch": 0.9315789473684211, "grad_norm": 0.6731222867965698, "learning_rate": 8.695044586103297e-06, "loss": 0.6916, "step": 118 }, { "epoch": 0.9394736842105263, "grad_norm": 0.6552385687828064, "learning_rate": 8.663762627878059e-06, "loss": 0.7074, "step": 119 }, { "epoch": 0.9473684210526315, "grad_norm": 0.6504393815994263, "learning_rate": 8.632167870081122e-06, "loss": 0.7028, "step": 120 }, { "epoch": 0.9552631578947368, "grad_norm": 0.7418521046638489, "learning_rate": 8.600263010165275e-06, "loss": 0.6997, "step": 121 }, { "epoch": 0.9631578947368421, "grad_norm": 0.6728154420852661, "learning_rate": 8.568050772058763e-06, "loss": 0.7066, "step": 122 }, { "epoch": 0.9710526315789474, "grad_norm": 0.6942195892333984, "learning_rate": 8.535533905932739e-06, "loss": 0.7001, "step": 123 }, { "epoch": 0.9789473684210527, "grad_norm": 0.6641739010810852, "learning_rate": 8.502715187966455e-06, "loss": 0.702, "step": 124 }, { "epoch": 0.9868421052631579, "grad_norm": 0.7299260497093201, "learning_rate": 8.469597420110249e-06, "loss": 0.6927, "step": 125 }, { "epoch": 0.9947368421052631, "grad_norm": 0.6769669055938721, "learning_rate": 8.436183429846314e-06, "loss": 0.6743, "step": 126 }, { "epoch": 1.0052631578947369, "grad_norm": 0.7567976713180542, "learning_rate": 8.402476069947309e-06, "loss": 0.6556, "step": 127 }, { "epoch": 1.013157894736842, "grad_norm": 0.7088767290115356, "learning_rate": 8.368478218232787e-06, "loss": 0.6348, "step": 128 }, { "epoch": 1.0210526315789474, "grad_norm": 0.6701803207397461, "learning_rate": 8.334192777323508e-06, "loss": 0.63, "step": 129 }, { "epoch": 1.0289473684210526, "grad_norm": 0.7436509132385254, "learning_rate": 8.299622674393615e-06, "loss": 0.621, "step": 130 }, { "epoch": 1.0368421052631578, "grad_norm": 0.6837674379348755, "learning_rate": 8.264770860920722e-06, "loss": 0.6294, "step": 131 }, { "epoch": 1.0447368421052632, "grad_norm": 0.6941933035850525, "learning_rate": 8.229640312433938e-06, "loss": 0.6003, "step": 132 }, { "epoch": 1.0526315789473684, "grad_norm": 0.6441699266433716, "learning_rate": 8.194234028259806e-06, "loss": 0.6044, "step": 133 }, { "epoch": 1.0605263157894738, "grad_norm": 0.6503862142562866, "learning_rate": 8.158555031266255e-06, "loss": 0.6059, "step": 134 }, { "epoch": 1.068421052631579, "grad_norm": 0.6873996257781982, "learning_rate": 8.122606367604497e-06, "loss": 0.5948, "step": 135 }, { "epoch": 1.0763157894736841, "grad_norm": 0.6901219487190247, "learning_rate": 8.086391106448965e-06, "loss": 0.617, "step": 136 }, { "epoch": 1.0842105263157895, "grad_norm": 0.5913821458816528, "learning_rate": 8.049912339735284e-06, "loss": 0.6205, "step": 137 }, { "epoch": 1.0921052631578947, "grad_norm": 0.6473250389099121, "learning_rate": 8.013173181896283e-06, "loss": 0.5898, "step": 138 }, { "epoch": 1.1, "grad_norm": 0.6466673016548157, "learning_rate": 7.976176769596095e-06, "loss": 0.6283, "step": 139 }, { "epoch": 1.1078947368421053, "grad_norm": 0.562574565410614, "learning_rate": 7.938926261462366e-06, "loss": 0.5947, "step": 140 }, { "epoch": 1.1157894736842104, "grad_norm": 0.6042968034744263, "learning_rate": 7.90142483781658e-06, "loss": 0.6338, "step": 141 }, { "epoch": 1.1236842105263158, "grad_norm": 0.6527702808380127, "learning_rate": 7.863675700402527e-06, "loss": 0.6113, "step": 142 }, { "epoch": 1.131578947368421, "grad_norm": 0.5683473944664001, "learning_rate": 7.82568207211296e-06, "loss": 0.6254, "step": 143 }, { "epoch": 1.1394736842105262, "grad_norm": 0.6214303374290466, "learning_rate": 7.787447196714428e-06, "loss": 0.6213, "step": 144 }, { "epoch": 1.1473684210526316, "grad_norm": 0.5913365483283997, "learning_rate": 7.748974338570337e-06, "loss": 0.6221, "step": 145 }, { "epoch": 1.1552631578947368, "grad_norm": 0.6266749501228333, "learning_rate": 7.710266782362248e-06, "loss": 0.5909, "step": 146 }, { "epoch": 1.1631578947368422, "grad_norm": 0.6693281531333923, "learning_rate": 7.671327832809442e-06, "loss": 0.6373, "step": 147 }, { "epoch": 1.1710526315789473, "grad_norm": 0.6370000839233398, "learning_rate": 7.63216081438678e-06, "loss": 0.5908, "step": 148 }, { "epoch": 1.1789473684210527, "grad_norm": 0.6319229602813721, "learning_rate": 7.5927690710408606e-06, "loss": 0.6138, "step": 149 }, { "epoch": 1.186842105263158, "grad_norm": 0.6503207683563232, "learning_rate": 7.553155965904535e-06, "loss": 0.6165, "step": 150 }, { "epoch": 1.194736842105263, "grad_norm": 0.6979773640632629, "learning_rate": 7.513324881009769e-06, "loss": 0.6308, "step": 151 }, { "epoch": 1.2026315789473685, "grad_norm": 0.591997504234314, "learning_rate": 7.473279216998896e-06, "loss": 0.6227, "step": 152 }, { "epoch": 1.2105263157894737, "grad_norm": 0.6470649242401123, "learning_rate": 7.4330223928342814e-06, "loss": 0.6248, "step": 153 }, { "epoch": 1.2184210526315788, "grad_norm": 0.6023352742195129, "learning_rate": 7.392557845506433e-06, "loss": 0.6043, "step": 154 }, { "epoch": 1.2263157894736842, "grad_norm": 0.6112087368965149, "learning_rate": 7.351889029740548e-06, "loss": 0.6122, "step": 155 }, { "epoch": 1.2342105263157894, "grad_norm": 0.737591028213501, "learning_rate": 7.311019417701567e-06, "loss": 0.6091, "step": 156 }, { "epoch": 1.2421052631578948, "grad_norm": 0.6064261198043823, "learning_rate": 7.269952498697734e-06, "loss": 0.6092, "step": 157 }, { "epoch": 1.25, "grad_norm": 0.6505361199378967, "learning_rate": 7.2286917788826926e-06, "loss": 0.6164, "step": 158 }, { "epoch": 1.2578947368421054, "grad_norm": 0.6192215085029602, "learning_rate": 7.187240780956133e-06, "loss": 0.6028, "step": 159 }, { "epoch": 1.2657894736842106, "grad_norm": 0.6353914141654968, "learning_rate": 7.145603043863045e-06, "loss": 0.6052, "step": 160 }, { "epoch": 1.2736842105263158, "grad_norm": 0.634981632232666, "learning_rate": 7.103782122491577e-06, "loss": 0.6188, "step": 161 }, { "epoch": 1.2815789473684212, "grad_norm": 0.6064332127571106, "learning_rate": 7.061781587369518e-06, "loss": 0.6288, "step": 162 }, { "epoch": 1.2894736842105263, "grad_norm": 0.5805593729019165, "learning_rate": 7.019605024359475e-06, "loss": 0.6037, "step": 163 }, { "epoch": 1.2973684210526315, "grad_norm": 0.7273927927017212, "learning_rate": 6.977256034352713e-06, "loss": 0.6268, "step": 164 }, { "epoch": 1.305263157894737, "grad_norm": 0.6278533339500427, "learning_rate": 6.934738232961728e-06, "loss": 0.6283, "step": 165 }, { "epoch": 1.313157894736842, "grad_norm": 0.6402047872543335, "learning_rate": 6.892055250211552e-06, "loss": 0.6157, "step": 166 }, { "epoch": 1.3210526315789473, "grad_norm": 0.6458708047866821, "learning_rate": 6.849210730229846e-06, "loss": 0.6187, "step": 167 }, { "epoch": 1.3289473684210527, "grad_norm": 0.5871455073356628, "learning_rate": 6.806208330935766e-06, "loss": 0.6162, "step": 168 }, { "epoch": 1.3368421052631578, "grad_norm": 0.5851725935935974, "learning_rate": 6.763051723727663e-06, "loss": 0.605, "step": 169 }, { "epoch": 1.3447368421052632, "grad_norm": 0.607880175113678, "learning_rate": 6.719744593169642e-06, "loss": 0.5954, "step": 170 }, { "epoch": 1.3526315789473684, "grad_norm": 0.6497223973274231, "learning_rate": 6.67629063667697e-06, "loss": 0.6233, "step": 171 }, { "epoch": 1.3605263157894738, "grad_norm": 0.6154512166976929, "learning_rate": 6.6326935642004165e-06, "loss": 0.6186, "step": 172 }, { "epoch": 1.368421052631579, "grad_norm": 0.6759777665138245, "learning_rate": 6.588957097909509e-06, "loss": 0.6171, "step": 173 }, { "epoch": 1.3763157894736842, "grad_norm": 0.6606743931770325, "learning_rate": 6.545084971874738e-06, "loss": 0.5943, "step": 174 }, { "epoch": 1.3842105263157896, "grad_norm": 0.6321778297424316, "learning_rate": 6.501080931748764e-06, "loss": 0.6191, "step": 175 }, { "epoch": 1.3921052631578947, "grad_norm": 0.622314453125, "learning_rate": 6.456948734446624e-06, "loss": 0.6134, "step": 176 }, { "epoch": 1.4, "grad_norm": 0.6384446620941162, "learning_rate": 6.412692147824976e-06, "loss": 0.6005, "step": 177 }, { "epoch": 1.4078947368421053, "grad_norm": 0.6004672646522522, "learning_rate": 6.368314950360416e-06, "loss": 0.6082, "step": 178 }, { "epoch": 1.4157894736842105, "grad_norm": 0.6092491745948792, "learning_rate": 6.323820930826879e-06, "loss": 0.6306, "step": 179 }, { "epoch": 1.4236842105263157, "grad_norm": 0.6291044354438782, "learning_rate": 6.279213887972179e-06, "loss": 0.6188, "step": 180 }, { "epoch": 1.431578947368421, "grad_norm": 0.5878775119781494, "learning_rate": 6.234497630193666e-06, "loss": 0.6154, "step": 181 }, { "epoch": 1.4394736842105262, "grad_norm": 0.5391842126846313, "learning_rate": 6.189675975213094e-06, "loss": 0.6105, "step": 182 }, { "epoch": 1.4473684210526316, "grad_norm": 0.5454199314117432, "learning_rate": 6.144752749750671e-06, "loss": 0.6023, "step": 183 }, { "epoch": 1.4552631578947368, "grad_norm": 0.6040089130401611, "learning_rate": 6.099731789198344e-06, "loss": 0.6081, "step": 184 }, { "epoch": 1.4631578947368422, "grad_norm": 0.6564128398895264, "learning_rate": 6.05461693729235e-06, "loss": 0.6115, "step": 185 }, { "epoch": 1.4710526315789474, "grad_norm": 0.5978277325630188, "learning_rate": 6.009412045785051e-06, "loss": 0.6044, "step": 186 }, { "epoch": 1.4789473684210526, "grad_norm": 0.6195292472839355, "learning_rate": 5.964120974116085e-06, "loss": 0.6253, "step": 187 }, { "epoch": 1.486842105263158, "grad_norm": 0.5672737956047058, "learning_rate": 5.918747589082853e-06, "loss": 0.614, "step": 188 }, { "epoch": 1.4947368421052631, "grad_norm": 0.5825613737106323, "learning_rate": 5.8732957645103946e-06, "loss": 0.5987, "step": 189 }, { "epoch": 1.5026315789473683, "grad_norm": 0.5839316844940186, "learning_rate": 5.82776938092065e-06, "loss": 0.6099, "step": 190 }, { "epoch": 1.5105263157894737, "grad_norm": 0.592494010925293, "learning_rate": 5.782172325201155e-06, "loss": 0.6272, "step": 191 }, { "epoch": 1.518421052631579, "grad_norm": 0.6230676174163818, "learning_rate": 5.736508490273189e-06, "loss": 0.6132, "step": 192 }, { "epoch": 1.526315789473684, "grad_norm": 0.5931450724601746, "learning_rate": 5.690781774759412e-06, "loss": 0.6022, "step": 193 }, { "epoch": 1.5342105263157895, "grad_norm": 0.6010186076164246, "learning_rate": 5.644996082651018e-06, "loss": 0.6144, "step": 194 }, { "epoch": 1.5421052631578949, "grad_norm": 0.6734380722045898, "learning_rate": 5.5991553229744166e-06, "loss": 0.6343, "step": 195 }, { "epoch": 1.55, "grad_norm": 0.5894003510475159, "learning_rate": 5.553263409457504e-06, "loss": 0.6165, "step": 196 }, { "epoch": 1.5578947368421052, "grad_norm": 0.5955084562301636, "learning_rate": 5.507324260195516e-06, "loss": 0.611, "step": 197 }, { "epoch": 1.5657894736842106, "grad_norm": 0.5771483778953552, "learning_rate": 5.46134179731651e-06, "loss": 0.6079, "step": 198 }, { "epoch": 1.5736842105263158, "grad_norm": 0.6015431880950928, "learning_rate": 5.41531994664652e-06, "loss": 0.5994, "step": 199 }, { "epoch": 1.581578947368421, "grad_norm": 0.5537875890731812, "learning_rate": 5.36926263737437e-06, "loss": 0.597, "step": 200 }, { "epoch": 1.5894736842105264, "grad_norm": 0.5928704142570496, "learning_rate": 5.323173801716222e-06, "loss": 0.596, "step": 201 }, { "epoch": 1.5973684210526315, "grad_norm": 0.6157421469688416, "learning_rate": 5.27705737457985e-06, "loss": 0.6261, "step": 202 }, { "epoch": 1.6052631578947367, "grad_norm": 0.612349271774292, "learning_rate": 5.230917293228699e-06, "loss": 0.6079, "step": 203 }, { "epoch": 1.6131578947368421, "grad_norm": 0.54092937707901, "learning_rate": 5.184757496945726e-06, "loss": 0.5942, "step": 204 }, { "epoch": 1.6210526315789475, "grad_norm": 0.5439791083335876, "learning_rate": 5.138581926697083e-06, "loss": 0.6135, "step": 205 }, { "epoch": 1.6289473684210525, "grad_norm": 0.5337532758712769, "learning_rate": 5.09239452479565e-06, "loss": 0.587, "step": 206 }, { "epoch": 1.6368421052631579, "grad_norm": 0.5457773208618164, "learning_rate": 5.046199234564455e-06, "loss": 0.5961, "step": 207 }, { "epoch": 1.6447368421052633, "grad_norm": 0.5572032332420349, "learning_rate": 5e-06, "loss": 0.6022, "step": 208 }, { "epoch": 1.6526315789473685, "grad_norm": 0.5409188270568848, "learning_rate": 4.953800765435547e-06, "loss": 0.5966, "step": 209 }, { "epoch": 1.6605263157894736, "grad_norm": 0.5420222878456116, "learning_rate": 4.907605475204352e-06, "loss": 0.6069, "step": 210 }, { "epoch": 1.668421052631579, "grad_norm": 0.5424708724021912, "learning_rate": 4.861418073302919e-06, "loss": 0.6009, "step": 211 }, { "epoch": 1.6763157894736842, "grad_norm": 0.5323647856712341, "learning_rate": 4.815242503054277e-06, "loss": 0.6183, "step": 212 }, { "epoch": 1.6842105263157894, "grad_norm": 0.5776340961456299, "learning_rate": 4.7690827067713035e-06, "loss": 0.6109, "step": 213 }, { "epoch": 1.6921052631578948, "grad_norm": 0.5825272798538208, "learning_rate": 4.7229426254201504e-06, "loss": 0.615, "step": 214 }, { "epoch": 1.7, "grad_norm": 0.5867627859115601, "learning_rate": 4.676826198283779e-06, "loss": 0.6114, "step": 215 }, { "epoch": 1.7078947368421051, "grad_norm": 0.5459916591644287, "learning_rate": 4.630737362625631e-06, "loss": 0.6039, "step": 216 }, { "epoch": 1.7157894736842105, "grad_norm": 0.5879709124565125, "learning_rate": 4.584680053353481e-06, "loss": 0.6154, "step": 217 }, { "epoch": 1.723684210526316, "grad_norm": 0.5895808339118958, "learning_rate": 4.53865820268349e-06, "loss": 0.6133, "step": 218 }, { "epoch": 1.731578947368421, "grad_norm": 0.6173647046089172, "learning_rate": 4.492675739804486e-06, "loss": 0.6018, "step": 219 }, { "epoch": 1.7394736842105263, "grad_norm": 0.5456452965736389, "learning_rate": 4.446736590542497e-06, "loss": 0.6083, "step": 220 }, { "epoch": 1.7473684210526317, "grad_norm": 0.5733683705329895, "learning_rate": 4.400844677025585e-06, "loss": 0.6015, "step": 221 }, { "epoch": 1.7552631578947369, "grad_norm": 0.5852586627006531, "learning_rate": 4.355003917348985e-06, "loss": 0.5983, "step": 222 }, { "epoch": 1.763157894736842, "grad_norm": 0.5621438026428223, "learning_rate": 4.309218225240591e-06, "loss": 0.5891, "step": 223 }, { "epoch": 1.7710526315789474, "grad_norm": 0.582830011844635, "learning_rate": 4.263491509726812e-06, "loss": 0.5864, "step": 224 }, { "epoch": 1.7789473684210526, "grad_norm": 0.5376387238502502, "learning_rate": 4.217827674798845e-06, "loss": 0.6084, "step": 225 }, { "epoch": 1.7868421052631578, "grad_norm": 0.6110755801200867, "learning_rate": 4.17223061907935e-06, "loss": 0.6055, "step": 226 }, { "epoch": 1.7947368421052632, "grad_norm": 0.5580980777740479, "learning_rate": 4.126704235489606e-06, "loss": 0.616, "step": 227 }, { "epoch": 1.8026315789473686, "grad_norm": 0.6138029098510742, "learning_rate": 4.081252410917148e-06, "loss": 0.6088, "step": 228 }, { "epoch": 1.8105263157894735, "grad_norm": 0.5521398186683655, "learning_rate": 4.035879025883916e-06, "loss": 0.6146, "step": 229 }, { "epoch": 1.818421052631579, "grad_norm": 0.5325535535812378, "learning_rate": 3.99058795421495e-06, "loss": 0.5895, "step": 230 }, { "epoch": 1.8263157894736843, "grad_norm": 0.6027734875679016, "learning_rate": 3.945383062707652e-06, "loss": 0.5979, "step": 231 }, { "epoch": 1.8342105263157895, "grad_norm": 0.5285260677337646, "learning_rate": 3.9002682108016585e-06, "loss": 0.6119, "step": 232 }, { "epoch": 1.8421052631578947, "grad_norm": 0.5392246246337891, "learning_rate": 3.855247250249331e-06, "loss": 0.6104, "step": 233 }, { "epoch": 1.85, "grad_norm": 0.5578750967979431, "learning_rate": 3.8103240247869077e-06, "loss": 0.606, "step": 234 }, { "epoch": 1.8578947368421053, "grad_norm": 0.5479610562324524, "learning_rate": 3.765502369806334e-06, "loss": 0.6147, "step": 235 }, { "epoch": 1.8657894736842104, "grad_norm": 0.5477908253669739, "learning_rate": 3.720786112027822e-06, "loss": 0.5947, "step": 236 }, { "epoch": 1.8736842105263158, "grad_norm": 0.5339275598526001, "learning_rate": 3.6761790691731207e-06, "loss": 0.5975, "step": 237 }, { "epoch": 1.881578947368421, "grad_norm": 0.5823328495025635, "learning_rate": 3.6316850496395863e-06, "loss": 0.5898, "step": 238 }, { "epoch": 1.8894736842105262, "grad_norm": 0.5550434589385986, "learning_rate": 3.587307852175025e-06, "loss": 0.5953, "step": 239 }, { "epoch": 1.8973684210526316, "grad_norm": 0.5742660164833069, "learning_rate": 3.5430512655533774e-06, "loss": 0.6185, "step": 240 }, { "epoch": 1.905263157894737, "grad_norm": 0.5972154140472412, "learning_rate": 3.498919068251237e-06, "loss": 0.5985, "step": 241 }, { "epoch": 1.913157894736842, "grad_norm": 0.5590418577194214, "learning_rate": 3.4549150281252635e-06, "loss": 0.6052, "step": 242 }, { "epoch": 1.9210526315789473, "grad_norm": 0.58773273229599, "learning_rate": 3.4110429020904924e-06, "loss": 0.6158, "step": 243 }, { "epoch": 1.9289473684210527, "grad_norm": 0.5843963027000427, "learning_rate": 3.3673064357995844e-06, "loss": 0.6079, "step": 244 }, { "epoch": 1.936842105263158, "grad_norm": 0.5518509745597839, "learning_rate": 3.3237093633230323e-06, "loss": 0.6023, "step": 245 }, { "epoch": 1.944736842105263, "grad_norm": 0.5634933114051819, "learning_rate": 3.2802554068303595e-06, "loss": 0.5972, "step": 246 }, { "epoch": 1.9526315789473685, "grad_norm": 0.5546795129776001, "learning_rate": 3.236948276272337e-06, "loss": 0.6012, "step": 247 }, { "epoch": 1.9605263157894737, "grad_norm": 0.55860435962677, "learning_rate": 3.1937916690642356e-06, "loss": 0.6186, "step": 248 }, { "epoch": 1.9684210526315788, "grad_norm": 0.5482208728790283, "learning_rate": 3.150789269770155e-06, "loss": 0.6218, "step": 249 }, { "epoch": 1.9763157894736842, "grad_norm": 0.6123985648155212, "learning_rate": 3.107944749788449e-06, "loss": 0.6088, "step": 250 }, { "epoch": 1.9842105263157894, "grad_norm": 0.5692940354347229, "learning_rate": 3.0652617670382745e-06, "loss": 0.5958, "step": 251 }, { "epoch": 1.9921052631578946, "grad_norm": 0.5609405040740967, "learning_rate": 3.0227439656472878e-06, "loss": 0.6141, "step": 252 }, { "epoch": 2.0026315789473683, "grad_norm": 0.5651322603225708, "learning_rate": 2.980394975640526e-06, "loss": 0.5795, "step": 253 }, { "epoch": 2.0105263157894737, "grad_norm": 0.6917658448219299, "learning_rate": 2.9382184126304834e-06, "loss": 0.5615, "step": 254 }, { "epoch": 2.018421052631579, "grad_norm": 0.6464757323265076, "learning_rate": 2.8962178775084267e-06, "loss": 0.517, "step": 255 }, { "epoch": 2.026315789473684, "grad_norm": 0.5215118527412415, "learning_rate": 2.8543969561369556e-06, "loss": 0.5421, "step": 256 }, { "epoch": 2.0342105263157895, "grad_norm": 0.5354113578796387, "learning_rate": 2.812759219043869e-06, "loss": 0.5397, "step": 257 }, { "epoch": 2.042105263157895, "grad_norm": 0.6157158613204956, "learning_rate": 2.771308221117309e-06, "loss": 0.5523, "step": 258 }, { "epoch": 2.05, "grad_norm": 0.646392822265625, "learning_rate": 2.7300475013022666e-06, "loss": 0.518, "step": 259 }, { "epoch": 2.057894736842105, "grad_norm": 0.5618102550506592, "learning_rate": 2.6889805822984348e-06, "loss": 0.5343, "step": 260 }, { "epoch": 2.0657894736842106, "grad_norm": 0.5714638233184814, "learning_rate": 2.648110970259454e-06, "loss": 0.5421, "step": 261 }, { "epoch": 2.0736842105263156, "grad_norm": 0.55697101354599, "learning_rate": 2.607442154493568e-06, "loss": 0.5382, "step": 262 }, { "epoch": 2.081578947368421, "grad_norm": 0.6028394103050232, "learning_rate": 2.5669776071657194e-06, "loss": 0.5367, "step": 263 }, { "epoch": 2.0894736842105264, "grad_norm": 0.580823540687561, "learning_rate": 2.526720783001107e-06, "loss": 0.5371, "step": 264 }, { "epoch": 2.0973684210526318, "grad_norm": 0.558423638343811, "learning_rate": 2.486675118990233e-06, "loss": 0.5304, "step": 265 }, { "epoch": 2.1052631578947367, "grad_norm": 0.5470890998840332, "learning_rate": 2.4468440340954664e-06, "loss": 0.5286, "step": 266 }, { "epoch": 2.113157894736842, "grad_norm": 0.5406467914581299, "learning_rate": 2.4072309289591394e-06, "loss": 0.5733, "step": 267 }, { "epoch": 2.1210526315789475, "grad_norm": 0.5414575934410095, "learning_rate": 2.3678391856132203e-06, "loss": 0.5464, "step": 268 }, { "epoch": 2.1289473684210525, "grad_norm": 0.5568278431892395, "learning_rate": 2.328672167190558e-06, "loss": 0.5326, "step": 269 }, { "epoch": 2.136842105263158, "grad_norm": 0.5567864775657654, "learning_rate": 2.289733217637753e-06, "loss": 0.5045, "step": 270 }, { "epoch": 2.1447368421052633, "grad_norm": 0.5617071986198425, "learning_rate": 2.2510256614296638e-06, "loss": 0.5214, "step": 271 }, { "epoch": 2.1526315789473682, "grad_norm": 0.5503396391868591, "learning_rate": 2.2125528032855727e-06, "loss": 0.5436, "step": 272 }, { "epoch": 2.1605263157894736, "grad_norm": 0.5096573829650879, "learning_rate": 2.174317927887041e-06, "loss": 0.5268, "step": 273 }, { "epoch": 2.168421052631579, "grad_norm": 0.5499228239059448, "learning_rate": 2.136324299597474e-06, "loss": 0.5281, "step": 274 }, { "epoch": 2.1763157894736844, "grad_norm": 0.4998083710670471, "learning_rate": 2.098575162183422e-06, "loss": 0.5177, "step": 275 }, { "epoch": 2.1842105263157894, "grad_norm": 0.4903915524482727, "learning_rate": 2.061073738537635e-06, "loss": 0.533, "step": 276 }, { "epoch": 2.192105263157895, "grad_norm": 0.5267230868339539, "learning_rate": 2.023823230403907e-06, "loss": 0.5286, "step": 277 }, { "epoch": 2.2, "grad_norm": 0.5258969664573669, "learning_rate": 1.9868268181037186e-06, "loss": 0.5249, "step": 278 }, { "epoch": 2.207894736842105, "grad_norm": 0.5024856925010681, "learning_rate": 1.9500876602647167e-06, "loss": 0.5307, "step": 279 }, { "epoch": 2.2157894736842105, "grad_norm": 0.5637997984886169, "learning_rate": 1.913608893551036e-06, "loss": 0.5511, "step": 280 }, { "epoch": 2.223684210526316, "grad_norm": 0.5119351744651794, "learning_rate": 1.8773936323955055e-06, "loss": 0.5424, "step": 281 }, { "epoch": 2.231578947368421, "grad_norm": 0.5853419303894043, "learning_rate": 1.8414449687337467e-06, "loss": 0.5365, "step": 282 }, { "epoch": 2.2394736842105263, "grad_norm": 0.496768593788147, "learning_rate": 1.8057659717401948e-06, "loss": 0.5303, "step": 283 }, { "epoch": 2.2473684210526317, "grad_norm": 0.5818266272544861, "learning_rate": 1.7703596875660645e-06, "loss": 0.5386, "step": 284 }, { "epoch": 2.2552631578947366, "grad_norm": 0.5075578689575195, "learning_rate": 1.7352291390792798e-06, "loss": 0.5349, "step": 285 }, { "epoch": 2.263157894736842, "grad_norm": 0.4751499593257904, "learning_rate": 1.7003773256063882e-06, "loss": 0.5159, "step": 286 }, { "epoch": 2.2710526315789474, "grad_norm": 0.4846212565898895, "learning_rate": 1.6658072226764949e-06, "loss": 0.5432, "step": 287 }, { "epoch": 2.2789473684210524, "grad_norm": 0.5411502122879028, "learning_rate": 1.6315217817672142e-06, "loss": 0.5133, "step": 288 }, { "epoch": 2.286842105263158, "grad_norm": 0.5099013447761536, "learning_rate": 1.5975239300526924e-06, "loss": 0.5364, "step": 289 }, { "epoch": 2.294736842105263, "grad_norm": 0.5247920155525208, "learning_rate": 1.5638165701536866e-06, "loss": 0.5325, "step": 290 }, { "epoch": 2.3026315789473686, "grad_norm": 0.5047224760055542, "learning_rate": 1.5304025798897521e-06, "loss": 0.5429, "step": 291 }, { "epoch": 2.3105263157894735, "grad_norm": 0.4913298487663269, "learning_rate": 1.4972848120335453e-06, "loss": 0.5384, "step": 292 }, { "epoch": 2.318421052631579, "grad_norm": 0.4855342507362366, "learning_rate": 1.4644660940672628e-06, "loss": 0.5449, "step": 293 }, { "epoch": 2.3263157894736843, "grad_norm": 0.5031416416168213, "learning_rate": 1.4319492279412388e-06, "loss": 0.5404, "step": 294 }, { "epoch": 2.3342105263157893, "grad_norm": 0.4946608245372772, "learning_rate": 1.399736989834728e-06, "loss": 0.5431, "step": 295 }, { "epoch": 2.3421052631578947, "grad_norm": 0.5071939826011658, "learning_rate": 1.3678321299188802e-06, "loss": 0.5221, "step": 296 }, { "epoch": 2.35, "grad_norm": 0.48161017894744873, "learning_rate": 1.336237372121944e-06, "loss": 0.5229, "step": 297 }, { "epoch": 2.3578947368421055, "grad_norm": 0.5062857270240784, "learning_rate": 1.3049554138967052e-06, "loss": 0.5344, "step": 298 }, { "epoch": 2.3657894736842104, "grad_norm": 0.5024528503417969, "learning_rate": 1.2739889259901866e-06, "loss": 0.5432, "step": 299 }, { "epoch": 2.373684210526316, "grad_norm": 0.5160316824913025, "learning_rate": 1.2433405522156334e-06, "loss": 0.5374, "step": 300 }, { "epoch": 2.3815789473684212, "grad_norm": 0.527028501033783, "learning_rate": 1.213012909226786e-06, "loss": 0.5286, "step": 301 }, { "epoch": 2.389473684210526, "grad_norm": 0.5148992538452148, "learning_rate": 1.1830085862944851e-06, "loss": 0.5527, "step": 302 }, { "epoch": 2.3973684210526316, "grad_norm": 0.47975584864616394, "learning_rate": 1.1533301450856054e-06, "loss": 0.567, "step": 303 }, { "epoch": 2.405263157894737, "grad_norm": 0.4557743966579437, "learning_rate": 1.1239801194443507e-06, "loss": 0.5406, "step": 304 }, { "epoch": 2.413157894736842, "grad_norm": 0.5039398074150085, "learning_rate": 1.0949610151759233e-06, "loss": 0.5395, "step": 305 }, { "epoch": 2.4210526315789473, "grad_norm": 0.5025436282157898, "learning_rate": 1.066275309832584e-06, "loss": 0.5485, "step": 306 }, { "epoch": 2.4289473684210527, "grad_norm": 0.48488861322402954, "learning_rate": 1.037925452502131e-06, "loss": 0.5344, "step": 307 }, { "epoch": 2.4368421052631577, "grad_norm": 0.47348925471305847, "learning_rate": 1.0099138635988026e-06, "loss": 0.5647, "step": 308 }, { "epoch": 2.444736842105263, "grad_norm": 0.5118060111999512, "learning_rate": 9.822429346566314e-07, "loss": 0.5219, "step": 309 }, { "epoch": 2.4526315789473685, "grad_norm": 0.48763391375541687, "learning_rate": 9.549150281252633e-07, "loss": 0.5413, "step": 310 }, { "epoch": 2.4605263157894735, "grad_norm": 0.4737011194229126, "learning_rate": 9.279324771682586e-07, "loss": 0.5288, "step": 311 }, { "epoch": 2.468421052631579, "grad_norm": 0.501611590385437, "learning_rate": 9.01297585463895e-07, "loss": 0.5194, "step": 312 }, { "epoch": 2.4763157894736842, "grad_norm": 0.4514743983745575, "learning_rate": 8.750126270084891e-07, "loss": 0.5446, "step": 313 }, { "epoch": 2.4842105263157896, "grad_norm": 0.4870568811893463, "learning_rate": 8.490798459222477e-07, "loss": 0.5544, "step": 314 }, { "epoch": 2.4921052631578946, "grad_norm": 0.4649040102958679, "learning_rate": 8.235014562576732e-07, "loss": 0.5344, "step": 315 }, { "epoch": 2.5, "grad_norm": 0.49083462357521057, "learning_rate": 7.98279641810537e-07, "loss": 0.5304, "step": 316 }, { "epoch": 2.5078947368421054, "grad_norm": 0.49484091997146606, "learning_rate": 7.734165559334327e-07, "loss": 0.5423, "step": 317 }, { "epoch": 2.515789473684211, "grad_norm": 0.5281040072441101, "learning_rate": 7.489143213519301e-07, "loss": 0.5383, "step": 318 }, { "epoch": 2.5236842105263158, "grad_norm": 0.44312354922294617, "learning_rate": 7.24775029983345e-07, "loss": 0.5626, "step": 319 }, { "epoch": 2.531578947368421, "grad_norm": 0.47553321719169617, "learning_rate": 7.010007427581378e-07, "loss": 0.5321, "step": 320 }, { "epoch": 2.5394736842105265, "grad_norm": 0.4963271915912628, "learning_rate": 6.775934894439606e-07, "loss": 0.5305, "step": 321 }, { "epoch": 2.5473684210526315, "grad_norm": 0.4486421048641205, "learning_rate": 6.545552684723583e-07, "loss": 0.5698, "step": 322 }, { "epoch": 2.555263157894737, "grad_norm": 0.49345487356185913, "learning_rate": 6.318880467681527e-07, "loss": 0.5368, "step": 323 }, { "epoch": 2.5631578947368423, "grad_norm": 0.4889611601829529, "learning_rate": 6.095937595815104e-07, "loss": 0.5099, "step": 324 }, { "epoch": 2.5710526315789473, "grad_norm": 0.4943506419658661, "learning_rate": 5.876743103227217e-07, "loss": 0.5048, "step": 325 }, { "epoch": 2.5789473684210527, "grad_norm": 0.48465171456336975, "learning_rate": 5.661315703996905e-07, "loss": 0.5433, "step": 326 }, { "epoch": 2.586842105263158, "grad_norm": 0.49014216661453247, "learning_rate": 5.449673790581611e-07, "loss": 0.5684, "step": 327 }, { "epoch": 2.594736842105263, "grad_norm": 0.47829702496528625, "learning_rate": 5.241835432246888e-07, "loss": 0.5395, "step": 328 }, { "epoch": 2.6026315789473684, "grad_norm": 0.507520854473114, "learning_rate": 5.037818373523723e-07, "loss": 0.5369, "step": 329 }, { "epoch": 2.610526315789474, "grad_norm": 0.4816722869873047, "learning_rate": 4.837640032693558e-07, "loss": 0.5295, "step": 330 }, { "epoch": 2.6184210526315788, "grad_norm": 0.484370619058609, "learning_rate": 4.641317500301173e-07, "loss": 0.5354, "step": 331 }, { "epoch": 2.626315789473684, "grad_norm": 0.45856356620788574, "learning_rate": 4.448867537695578e-07, "loss": 0.563, "step": 332 }, { "epoch": 2.6342105263157896, "grad_norm": 0.47436991333961487, "learning_rate": 4.2603065755989493e-07, "loss": 0.5453, "step": 333 }, { "epoch": 2.6421052631578945, "grad_norm": 0.48465612530708313, "learning_rate": 4.0756507127038494e-07, "loss": 0.5348, "step": 334 }, { "epoch": 2.65, "grad_norm": 0.4726357161998749, "learning_rate": 3.894915714298775e-07, "loss": 0.5247, "step": 335 }, { "epoch": 2.6578947368421053, "grad_norm": 0.4793488681316376, "learning_rate": 3.71811701092219e-07, "loss": 0.5541, "step": 336 }, { "epoch": 2.6657894736842103, "grad_norm": 0.5060055255889893, "learning_rate": 3.5452696970450674e-07, "loss": 0.5609, "step": 337 }, { "epoch": 2.6736842105263157, "grad_norm": 0.5144080519676208, "learning_rate": 3.3763885297822153e-07, "loss": 0.5143, "step": 338 }, { "epoch": 2.681578947368421, "grad_norm": 0.461753249168396, "learning_rate": 3.2114879276323783e-07, "loss": 0.5422, "step": 339 }, { "epoch": 2.6894736842105265, "grad_norm": 0.48919305205345154, "learning_rate": 3.0505819692471797e-07, "loss": 0.5403, "step": 340 }, { "epoch": 2.6973684210526314, "grad_norm": 0.4654390811920166, "learning_rate": 2.893684392229185e-07, "loss": 0.5299, "step": 341 }, { "epoch": 2.705263157894737, "grad_norm": 0.45241108536720276, "learning_rate": 2.7408085919590265e-07, "loss": 0.5385, "step": 342 }, { "epoch": 2.713157894736842, "grad_norm": 0.4543478190898895, "learning_rate": 2.5919676204517073e-07, "loss": 0.5522, "step": 343 }, { "epoch": 2.7210526315789476, "grad_norm": 0.4557345509529114, "learning_rate": 2.447174185242324e-07, "loss": 0.5499, "step": 344 }, { "epoch": 2.7289473684210526, "grad_norm": 0.47247329354286194, "learning_rate": 2.3064406483010947e-07, "loss": 0.539, "step": 345 }, { "epoch": 2.736842105263158, "grad_norm": 0.5086033344268799, "learning_rate": 2.1697790249779638e-07, "loss": 0.5388, "step": 346 }, { "epoch": 2.7447368421052634, "grad_norm": 0.45670172572135925, "learning_rate": 2.0372009829767558e-07, "loss": 0.5378, "step": 347 }, { "epoch": 2.7526315789473683, "grad_norm": 0.47339746356010437, "learning_rate": 1.908717841359048e-07, "loss": 0.5362, "step": 348 }, { "epoch": 2.7605263157894737, "grad_norm": 0.47306424379348755, "learning_rate": 1.7843405695777582e-07, "loss": 0.54, "step": 349 }, { "epoch": 2.768421052631579, "grad_norm": 0.49057334661483765, "learning_rate": 1.664079786540629e-07, "loss": 0.5466, "step": 350 }, { "epoch": 2.776315789473684, "grad_norm": 0.4364169239997864, "learning_rate": 1.547945759703623e-07, "loss": 0.5468, "step": 351 }, { "epoch": 2.7842105263157895, "grad_norm": 0.45972496271133423, "learning_rate": 1.435948404194304e-07, "loss": 0.5359, "step": 352 }, { "epoch": 2.792105263157895, "grad_norm": 0.4681927561759949, "learning_rate": 1.328097281965357e-07, "loss": 0.5218, "step": 353 }, { "epoch": 2.8, "grad_norm": 0.48078453540802, "learning_rate": 1.22440160097817e-07, "loss": 0.5318, "step": 354 }, { "epoch": 2.807894736842105, "grad_norm": 0.5014187097549438, "learning_rate": 1.1248702144167123e-07, "loss": 0.5175, "step": 355 }, { "epoch": 2.8157894736842106, "grad_norm": 0.4723798632621765, "learning_rate": 1.0295116199317057e-07, "loss": 0.5459, "step": 356 }, { "epoch": 2.8236842105263156, "grad_norm": 0.485458105802536, "learning_rate": 9.383339589150776e-08, "loss": 0.5221, "step": 357 }, { "epoch": 2.831578947368421, "grad_norm": 0.4910981059074402, "learning_rate": 8.513450158049109e-08, "loss": 0.5192, "step": 358 }, { "epoch": 2.8394736842105264, "grad_norm": 0.49855437874794006, "learning_rate": 7.685522174208205e-08, "loss": 0.5347, "step": 359 }, { "epoch": 2.8473684210526313, "grad_norm": 0.46188458800315857, "learning_rate": 6.899626323298714e-08, "loss": 0.5286, "step": 360 }, { "epoch": 2.8552631578947367, "grad_norm": 0.488492876291275, "learning_rate": 6.15582970243117e-08, "loss": 0.5202, "step": 361 }, { "epoch": 2.863157894736842, "grad_norm": 0.516136109828949, "learning_rate": 5.454195814427021e-08, "loss": 0.5044, "step": 362 }, { "epoch": 2.8710526315789475, "grad_norm": 0.4855652451515198, "learning_rate": 4.794784562397459e-08, "loss": 0.535, "step": 363 }, { "epoch": 2.8789473684210525, "grad_norm": 0.49933773279190063, "learning_rate": 4.177652244628627e-08, "loss": 0.5383, "step": 364 }, { "epoch": 2.886842105263158, "grad_norm": 0.48090240359306335, "learning_rate": 3.602851549775521e-08, "loss": 0.5396, "step": 365 }, { "epoch": 2.8947368421052633, "grad_norm": 0.4793301820755005, "learning_rate": 3.0704315523631956e-08, "loss": 0.5389, "step": 366 }, { "epoch": 2.9026315789473687, "grad_norm": 0.5019791722297668, "learning_rate": 2.5804377085972278e-08, "loss": 0.5565, "step": 367 }, { "epoch": 2.9105263157894736, "grad_norm": 0.4860020577907562, "learning_rate": 2.1329118524827662e-08, "loss": 0.522, "step": 368 }, { "epoch": 2.918421052631579, "grad_norm": 0.49818456172943115, "learning_rate": 1.7278921922527224e-08, "loss": 0.5146, "step": 369 }, { "epoch": 2.9263157894736844, "grad_norm": 0.46853312849998474, "learning_rate": 1.3654133071059894e-08, "loss": 0.5414, "step": 370 }, { "epoch": 2.9342105263157894, "grad_norm": 0.47929537296295166, "learning_rate": 1.0455061442548597e-08, "loss": 0.5294, "step": 371 }, { "epoch": 2.942105263157895, "grad_norm": 0.4867682158946991, "learning_rate": 7.681980162830283e-09, "loss": 0.5533, "step": 372 }, { "epoch": 2.95, "grad_norm": 0.47420981526374817, "learning_rate": 5.3351259881379016e-09, "loss": 0.5315, "step": 373 }, { "epoch": 2.957894736842105, "grad_norm": 0.5050917863845825, "learning_rate": 3.41469928488547e-09, "loss": 0.5376, "step": 374 }, { "epoch": 2.9657894736842105, "grad_norm": 0.4844934940338135, "learning_rate": 1.9208640125628618e-09, "loss": 0.5484, "step": 375 }, { "epoch": 2.973684210526316, "grad_norm": 0.478400319814682, "learning_rate": 8.537477097364522e-10, "loss": 0.5329, "step": 376 }, { "epoch": 2.981578947368421, "grad_norm": 0.47381076216697693, "learning_rate": 2.1344148316060352e-10, "loss": 0.5301, "step": 377 }, { "epoch": 2.9894736842105263, "grad_norm": 0.5020145177841187, "learning_rate": 0.0, "loss": 0.5492, "step": 378 }, { "epoch": 2.9894736842105263, "step": 378, "total_flos": 3.713025592344904e+17, "train_loss": 0.0, "train_runtime": 12.3976, "train_samples_per_second": 2940.336, "train_steps_per_second": 30.49 } ], "logging_steps": 1, "max_steps": 378, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.713025592344904e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }