{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 100.0, "eval_steps": 16, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.2909090909090909, "grad_norm": 10.451888084411621, "learning_rate": 0.0, "loss": 0.7859, "step": 1 }, { "epoch": 0.5818181818181818, "grad_norm": 9.886292457580566, "learning_rate": 2.5e-09, "loss": 0.7965, "step": 2 }, { "epoch": 0.8727272727272727, "grad_norm": 10.406240463256836, "learning_rate": 5e-09, "loss": 0.7893, "step": 3 }, { "epoch": 1.0, "grad_norm": 12.746437072753906, "learning_rate": 7.5e-09, "loss": 0.8702, "step": 4 }, { "epoch": 1.290909090909091, "grad_norm": 10.291970252990723, "learning_rate": 1e-08, "loss": 0.7785, "step": 5 }, { "epoch": 1.5818181818181818, "grad_norm": 9.746875762939453, "learning_rate": 1.25e-08, "loss": 0.759, "step": 6 }, { "epoch": 1.8727272727272726, "grad_norm": 10.920265197753906, "learning_rate": 1.5e-08, "loss": 0.8308, "step": 7 }, { "epoch": 2.0, "grad_norm": 11.265154838562012, "learning_rate": 1.7499999999999998e-08, "loss": 0.8828, "step": 8 }, { "epoch": 2.290909090909091, "grad_norm": 9.750505447387695, "learning_rate": 2e-08, "loss": 0.8004, "step": 9 }, { "epoch": 2.581818181818182, "grad_norm": 11.47065544128418, "learning_rate": 2.25e-08, "loss": 0.8204, "step": 10 }, { "epoch": 2.8727272727272726, "grad_norm": 10.275605201721191, "learning_rate": 2.5e-08, "loss": 0.7771, "step": 11 }, { "epoch": 3.0, "grad_norm": 11.604477882385254, "learning_rate": 2.75e-08, "loss": 0.8295, "step": 12 }, { "epoch": 3.290909090909091, "grad_norm": 10.712018966674805, "learning_rate": 3e-08, "loss": 0.8378, "step": 13 }, { "epoch": 3.581818181818182, "grad_norm": 10.54987907409668, "learning_rate": 3.25e-08, "loss": 0.8398, "step": 14 }, { "epoch": 3.8727272727272726, "grad_norm": 9.999624252319336, "learning_rate": 3.4999999999999996e-08, "loss": 0.773, "step": 15 }, { "epoch": 4.0, "grad_norm": 10.562870979309082, "learning_rate": 3.75e-08, "loss": 0.7025, "step": 16 }, { "epoch": 4.0, "eval_loss": 0.760595977306366, "eval_runtime": 0.722, "eval_samples_per_second": 18.005, "eval_steps_per_second": 18.005, "step": 16 }, { "epoch": 4.290909090909091, "grad_norm": 9.95614242553711, "learning_rate": 4e-08, "loss": 0.7785, "step": 17 }, { "epoch": 4.581818181818182, "grad_norm": 11.356291770935059, "learning_rate": 4.25e-08, "loss": 0.8645, "step": 18 }, { "epoch": 4.872727272727273, "grad_norm": 10.108142852783203, "learning_rate": 4.5e-08, "loss": 0.7834, "step": 19 }, { "epoch": 5.0, "grad_norm": 10.209877014160156, "learning_rate": 4.7499999999999995e-08, "loss": 0.7744, "step": 20 }, { "epoch": 5.290909090909091, "grad_norm": 9.586356163024902, "learning_rate": 5e-08, "loss": 0.7433, "step": 21 }, { "epoch": 5.581818181818182, "grad_norm": 10.589778900146484, "learning_rate": 5.25e-08, "loss": 0.818, "step": 22 }, { "epoch": 5.872727272727273, "grad_norm": 10.28813362121582, "learning_rate": 5.5e-08, "loss": 0.8245, "step": 23 }, { "epoch": 6.0, "grad_norm": 13.027183532714844, "learning_rate": 5.749999999999999e-08, "loss": 0.8331, "step": 24 }, { "epoch": 6.290909090909091, "grad_norm": 10.363873481750488, "learning_rate": 6e-08, "loss": 0.8331, "step": 25 }, { "epoch": 6.581818181818182, "grad_norm": 9.84264850616455, "learning_rate": 6.25e-08, "loss": 0.755, "step": 26 }, { "epoch": 6.872727272727273, "grad_norm": 10.973934173583984, "learning_rate": 6.5e-08, "loss": 0.8372, "step": 27 }, { "epoch": 7.0, "grad_norm": 10.278410911560059, "learning_rate": 6.75e-08, "loss": 0.7442, "step": 28 }, { "epoch": 7.290909090909091, "grad_norm": 10.205405235290527, "learning_rate": 6.999999999999999e-08, "loss": 0.7851, "step": 29 }, { "epoch": 7.581818181818182, "grad_norm": 10.862798690795898, "learning_rate": 7.25e-08, "loss": 0.7962, "step": 30 }, { "epoch": 7.872727272727273, "grad_norm": 9.971634864807129, "learning_rate": 7.5e-08, "loss": 0.79, "step": 31 }, { "epoch": 8.0, "grad_norm": 10.8460111618042, "learning_rate": 7.75e-08, "loss": 0.9105, "step": 32 }, { "epoch": 8.0, "eval_loss": 0.7589532136917114, "eval_runtime": 0.7554, "eval_samples_per_second": 17.21, "eval_steps_per_second": 17.21, "step": 32 }, { "epoch": 8.290909090909091, "grad_norm": 10.4276704788208, "learning_rate": 8e-08, "loss": 0.7914, "step": 33 }, { "epoch": 8.581818181818182, "grad_norm": 9.807103157043457, "learning_rate": 8.249999999999999e-08, "loss": 0.8344, "step": 34 }, { "epoch": 8.872727272727273, "grad_norm": 9.850166320800781, "learning_rate": 8.5e-08, "loss": 0.7419, "step": 35 }, { "epoch": 9.0, "grad_norm": 12.547399520874023, "learning_rate": 8.75e-08, "loss": 0.8597, "step": 36 }, { "epoch": 9.290909090909091, "grad_norm": 10.39106559753418, "learning_rate": 9e-08, "loss": 0.7911, "step": 37 }, { "epoch": 9.581818181818182, "grad_norm": 10.728227615356445, "learning_rate": 9.25e-08, "loss": 0.852, "step": 38 }, { "epoch": 9.872727272727273, "grad_norm": 10.104507446289062, "learning_rate": 9.499999999999999e-08, "loss": 0.7942, "step": 39 }, { "epoch": 10.0, "grad_norm": 9.163139343261719, "learning_rate": 9.749999999999999e-08, "loss": 0.7006, "step": 40 }, { "epoch": 10.290909090909091, "grad_norm": 9.795455932617188, "learning_rate": 1e-07, "loss": 0.7496, "step": 41 }, { "epoch": 10.581818181818182, "grad_norm": 9.88698959350586, "learning_rate": 9.99982865378877e-08, "loss": 0.7978, "step": 42 }, { "epoch": 10.872727272727273, "grad_norm": 10.60831069946289, "learning_rate": 9.99931462820376e-08, "loss": 0.8437, "step": 43 }, { "epoch": 11.0, "grad_norm": 10.177803039550781, "learning_rate": 9.998457962390006e-08, "loss": 0.7926, "step": 44 }, { "epoch": 11.290909090909091, "grad_norm": 9.594599723815918, "learning_rate": 9.997258721585931e-08, "loss": 0.7521, "step": 45 }, { "epoch": 11.581818181818182, "grad_norm": 9.713711738586426, "learning_rate": 9.99571699711836e-08, "loss": 0.7497, "step": 46 }, { "epoch": 11.872727272727273, "grad_norm": 10.672869682312012, "learning_rate": 9.993832906395581e-08, "loss": 0.8709, "step": 47 }, { "epoch": 12.0, "grad_norm": 10.758075714111328, "learning_rate": 9.991606592898401e-08, "loss": 0.8193, "step": 48 }, { "epoch": 12.0, "eval_loss": 0.7549822926521301, "eval_runtime": 0.7468, "eval_samples_per_second": 17.407, "eval_steps_per_second": 17.407, "step": 48 }, { "epoch": 12.290909090909091, "grad_norm": 10.45877742767334, "learning_rate": 9.989038226169209e-08, "loss": 0.8488, "step": 49 }, { "epoch": 12.581818181818182, "grad_norm": 9.969883918762207, "learning_rate": 9.986128001799076e-08, "loss": 0.7697, "step": 50 }, { "epoch": 12.872727272727273, "grad_norm": 9.769625663757324, "learning_rate": 9.982876141412855e-08, "loss": 0.769, "step": 51 }, { "epoch": 13.0, "grad_norm": 10.126288414001465, "learning_rate": 9.979282892652304e-08, "loss": 0.8111, "step": 52 }, { "epoch": 13.290909090909091, "grad_norm": 10.078240394592285, "learning_rate": 9.975348529157229e-08, "loss": 0.798, "step": 53 }, { "epoch": 13.581818181818182, "grad_norm": 10.470067024230957, "learning_rate": 9.971073350544643e-08, "loss": 0.8011, "step": 54 }, { "epoch": 13.872727272727273, "grad_norm": 9.311027526855469, "learning_rate": 9.966457682385949e-08, "loss": 0.7109, "step": 55 }, { "epoch": 14.0, "grad_norm": 10.48747730255127, "learning_rate": 9.961501876182147e-08, "loss": 0.969, "step": 56 }, { "epoch": 14.290909090909091, "grad_norm": 9.95235538482666, "learning_rate": 9.956206309337066e-08, "loss": 0.7757, "step": 57 }, { "epoch": 14.581818181818182, "grad_norm": 9.8017578125, "learning_rate": 9.950571385128625e-08, "loss": 0.7974, "step": 58 }, { "epoch": 14.872727272727273, "grad_norm": 10.287720680236816, "learning_rate": 9.94459753267812e-08, "loss": 0.842, "step": 59 }, { "epoch": 15.0, "grad_norm": 9.790772438049316, "learning_rate": 9.938285206917541e-08, "loss": 0.7127, "step": 60 }, { "epoch": 15.290909090909091, "grad_norm": 9.698260307312012, "learning_rate": 9.931634888554937e-08, "loss": 0.7662, "step": 61 }, { "epoch": 15.581818181818182, "grad_norm": 8.966428756713867, "learning_rate": 9.924647084037797e-08, "loss": 0.7652, "step": 62 }, { "epoch": 15.872727272727273, "grad_norm": 9.97366714477539, "learning_rate": 9.917322325514488e-08, "loss": 0.854, "step": 63 }, { "epoch": 16.0, "grad_norm": 8.8572359085083, "learning_rate": 9.909661170793732e-08, "loss": 0.6939, "step": 64 }, { "epoch": 16.0, "eval_loss": 0.7460314631462097, "eval_runtime": 0.7685, "eval_samples_per_second": 16.917, "eval_steps_per_second": 16.917, "step": 64 }, { "epoch": 16.29090909090909, "grad_norm": 9.27105712890625, "learning_rate": 9.901664203302125e-08, "loss": 0.7894, "step": 65 }, { "epoch": 16.581818181818182, "grad_norm": 9.316947937011719, "learning_rate": 9.8933320320397e-08, "loss": 0.7476, "step": 66 }, { "epoch": 16.87272727272727, "grad_norm": 9.482162475585938, "learning_rate": 9.884665291533559e-08, "loss": 0.8294, "step": 67 }, { "epoch": 17.0, "grad_norm": 8.661737442016602, "learning_rate": 9.875664641789545e-08, "loss": 0.741, "step": 68 }, { "epoch": 17.29090909090909, "grad_norm": 8.976079940795898, "learning_rate": 9.866330768241983e-08, "loss": 0.7999, "step": 69 }, { "epoch": 17.581818181818182, "grad_norm": 9.500139236450195, "learning_rate": 9.856664381701484e-08, "loss": 0.8324, "step": 70 }, { "epoch": 17.87272727272727, "grad_norm": 8.916980743408203, "learning_rate": 9.846666218300807e-08, "loss": 0.7308, "step": 71 }, { "epoch": 18.0, "grad_norm": 9.658391952514648, "learning_rate": 9.836337039438803e-08, "loss": 0.7691, "step": 72 }, { "epoch": 18.29090909090909, "grad_norm": 9.849284172058105, "learning_rate": 9.825677631722435e-08, "loss": 0.7804, "step": 73 }, { "epoch": 18.581818181818182, "grad_norm": 9.135872840881348, "learning_rate": 9.814688806906868e-08, "loss": 0.808, "step": 74 }, { "epoch": 18.87272727272727, "grad_norm": 8.595244407653809, "learning_rate": 9.80337140183366e-08, "loss": 0.7593, "step": 75 }, { "epoch": 19.0, "grad_norm": 9.557754516601562, "learning_rate": 9.791726278367021e-08, "loss": 0.795, "step": 76 }, { "epoch": 19.29090909090909, "grad_norm": 9.38132095336914, "learning_rate": 9.779754323328191e-08, "loss": 0.8303, "step": 77 }, { "epoch": 19.581818181818182, "grad_norm": 9.702839851379395, "learning_rate": 9.767456448427896e-08, "loss": 0.7404, "step": 78 }, { "epoch": 19.87272727272727, "grad_norm": 8.890022277832031, "learning_rate": 9.754833590196926e-08, "loss": 0.8152, "step": 79 }, { "epoch": 20.0, "grad_norm": 8.762577056884766, "learning_rate": 9.741886709914803e-08, "loss": 0.6623, "step": 80 }, { "epoch": 20.0, "eval_loss": 0.7418057918548584, "eval_runtime": 0.7512, "eval_samples_per_second": 17.305, "eval_steps_per_second": 17.305, "step": 80 }, { "epoch": 20.29090909090909, "grad_norm": 9.43545913696289, "learning_rate": 9.728616793536587e-08, "loss": 0.726, "step": 81 }, { "epoch": 20.581818181818182, "grad_norm": 8.36042308807373, "learning_rate": 9.715024851617789e-08, "loss": 0.7908, "step": 82 }, { "epoch": 20.87272727272727, "grad_norm": 9.46149730682373, "learning_rate": 9.701111919237408e-08, "loss": 0.8219, "step": 83 }, { "epoch": 21.0, "grad_norm": 9.277331352233887, "learning_rate": 9.68687905591911e-08, "loss": 0.7955, "step": 84 }, { "epoch": 21.29090909090909, "grad_norm": 9.980899810791016, "learning_rate": 9.672327345550542e-08, "loss": 0.8459, "step": 85 }, { "epoch": 21.581818181818182, "grad_norm": 8.734892845153809, "learning_rate": 9.65745789630079e-08, "loss": 0.7952, "step": 86 }, { "epoch": 21.87272727272727, "grad_norm": 7.979213714599609, "learning_rate": 9.642271840535982e-08, "loss": 0.6928, "step": 87 }, { "epoch": 22.0, "grad_norm": 9.570889472961426, "learning_rate": 9.626770334733058e-08, "loss": 0.7813, "step": 88 }, { "epoch": 22.29090909090909, "grad_norm": 9.478497505187988, "learning_rate": 9.610954559391703e-08, "loss": 0.783, "step": 89 }, { "epoch": 22.581818181818182, "grad_norm": 8.57199478149414, "learning_rate": 9.594825718944444e-08, "loss": 0.7859, "step": 90 }, { "epoch": 22.87272727272727, "grad_norm": 8.782203674316406, "learning_rate": 9.578385041664925e-08, "loss": 0.7784, "step": 91 }, { "epoch": 23.0, "grad_norm": 9.160470008850098, "learning_rate": 9.561633779574373e-08, "loss": 0.7613, "step": 92 }, { "epoch": 23.29090909090909, "grad_norm": 8.80034065246582, "learning_rate": 9.544573208346251e-08, "loss": 0.7708, "step": 93 }, { "epoch": 23.581818181818182, "grad_norm": 9.001204490661621, "learning_rate": 9.527204627209113e-08, "loss": 0.7975, "step": 94 }, { "epoch": 23.87272727272727, "grad_norm": 8.64294147491455, "learning_rate": 9.509529358847655e-08, "loss": 0.7533, "step": 95 }, { "epoch": 24.0, "grad_norm": 9.539164543151855, "learning_rate": 9.491548749301997e-08, "loss": 0.8112, "step": 96 }, { "epoch": 24.0, "eval_loss": 0.7388671040534973, "eval_runtime": 0.7379, "eval_samples_per_second": 17.617, "eval_steps_per_second": 17.617, "step": 96 }, { "epoch": 24.29090909090909, "grad_norm": 8.564647674560547, "learning_rate": 9.473264167865172e-08, "loss": 0.779, "step": 97 }, { "epoch": 24.581818181818182, "grad_norm": 8.466269493103027, "learning_rate": 9.454677006978843e-08, "loss": 0.7427, "step": 98 }, { "epoch": 24.87272727272727, "grad_norm": 9.549156188964844, "learning_rate": 9.435788682127281e-08, "loss": 0.7749, "step": 99 }, { "epoch": 25.0, "grad_norm": 8.791007041931152, "learning_rate": 9.416600631729548e-08, "loss": 0.8413, "step": 100 }, { "epoch": 25.29090909090909, "grad_norm": 8.481273651123047, "learning_rate": 9.397114317029974e-08, "loss": 0.7987, "step": 101 }, { "epoch": 25.581818181818182, "grad_norm": 7.957334518432617, "learning_rate": 9.377331221986867e-08, "loss": 0.7579, "step": 102 }, { "epoch": 25.87272727272727, "grad_norm": 7.695952415466309, "learning_rate": 9.357252853159505e-08, "loss": 0.7138, "step": 103 }, { "epoch": 26.0, "grad_norm": 8.535294532775879, "learning_rate": 9.336880739593415e-08, "loss": 0.8143, "step": 104 }, { "epoch": 26.29090909090909, "grad_norm": 7.785234451293945, "learning_rate": 9.316216432703917e-08, "loss": 0.7595, "step": 105 }, { "epoch": 26.581818181818182, "grad_norm": 7.210692882537842, "learning_rate": 9.295261506157986e-08, "loss": 0.6892, "step": 106 }, { "epoch": 26.87272727272727, "grad_norm": 7.439105033874512, "learning_rate": 9.274017555754408e-08, "loss": 0.7828, "step": 107 }, { "epoch": 27.0, "grad_norm": 8.5601167678833, "learning_rate": 9.252486199302256e-08, "loss": 0.8267, "step": 108 }, { "epoch": 27.29090909090909, "grad_norm": 7.751751899719238, "learning_rate": 9.230669076497686e-08, "loss": 0.7837, "step": 109 }, { "epoch": 27.581818181818182, "grad_norm": 7.58750057220459, "learning_rate": 9.20856784879907e-08, "loss": 0.7629, "step": 110 }, { "epoch": 27.87272727272727, "grad_norm": 7.078155040740967, "learning_rate": 9.186184199300463e-08, "loss": 0.732, "step": 111 }, { "epoch": 28.0, "grad_norm": 7.075254440307617, "learning_rate": 9.163519832603437e-08, "loss": 0.708, "step": 112 }, { "epoch": 28.0, "eval_loss": 0.7153984904289246, "eval_runtime": 0.7384, "eval_samples_per_second": 17.605, "eval_steps_per_second": 17.605, "step": 112 }, { "epoch": 28.29090909090909, "grad_norm": 7.660149097442627, "learning_rate": 9.140576474687262e-08, "loss": 0.7923, "step": 113 }, { "epoch": 28.581818181818182, "grad_norm": 6.516578674316406, "learning_rate": 9.117355872777476e-08, "loss": 0.6965, "step": 114 }, { "epoch": 28.87272727272727, "grad_norm": 6.818985462188721, "learning_rate": 9.093859795212817e-08, "loss": 0.7564, "step": 115 }, { "epoch": 29.0, "grad_norm": 9.244476318359375, "learning_rate": 9.070090031310558e-08, "loss": 0.7325, "step": 116 }, { "epoch": 29.29090909090909, "grad_norm": 7.331173419952393, "learning_rate": 9.046048391230248e-08, "loss": 0.6957, "step": 117 }, { "epoch": 29.581818181818182, "grad_norm": 7.235352993011475, "learning_rate": 9.021736705835861e-08, "loss": 0.8011, "step": 118 }, { "epoch": 29.87272727272727, "grad_norm": 7.370168209075928, "learning_rate": 8.997156826556369e-08, "loss": 0.7767, "step": 119 }, { "epoch": 30.0, "grad_norm": 6.547177314758301, "learning_rate": 8.97231062524474e-08, "loss": 0.6864, "step": 120 }, { "epoch": 30.29090909090909, "grad_norm": 6.999849796295166, "learning_rate": 8.9471999940354e-08, "loss": 0.757, "step": 121 }, { "epoch": 30.581818181818182, "grad_norm": 7.369142055511475, "learning_rate": 8.921826845200139e-08, "loss": 0.7184, "step": 122 }, { "epoch": 30.87272727272727, "grad_norm": 7.147704601287842, "learning_rate": 8.896193111002475e-08, "loss": 0.8074, "step": 123 }, { "epoch": 31.0, "grad_norm": 6.590007305145264, "learning_rate": 8.87030074355051e-08, "loss": 0.666, "step": 124 }, { "epoch": 31.29090909090909, "grad_norm": 6.5022711753845215, "learning_rate": 8.844151714648274e-08, "loss": 0.7109, "step": 125 }, { "epoch": 31.581818181818182, "grad_norm": 7.46487283706665, "learning_rate": 8.817748015645558e-08, "loss": 0.7848, "step": 126 }, { "epoch": 31.87272727272727, "grad_norm": 7.2371721267700195, "learning_rate": 8.791091657286267e-08, "loss": 0.7756, "step": 127 }, { "epoch": 32.0, "grad_norm": 6.450557708740234, "learning_rate": 8.764184669555293e-08, "loss": 0.6471, "step": 128 }, { "epoch": 32.0, "eval_loss": 0.7097088694572449, "eval_runtime": 0.7853, "eval_samples_per_second": 16.554, "eval_steps_per_second": 16.554, "step": 128 }, { "epoch": 32.29090909090909, "grad_norm": 7.1595611572265625, "learning_rate": 8.737029101523929e-08, "loss": 0.7418, "step": 129 }, { "epoch": 32.58181818181818, "grad_norm": 7.2520294189453125, "learning_rate": 8.709627021193817e-08, "loss": 0.7407, "step": 130 }, { "epoch": 32.872727272727275, "grad_norm": 6.757298469543457, "learning_rate": 8.681980515339464e-08, "loss": 0.7486, "step": 131 }, { "epoch": 33.0, "grad_norm": 6.70634651184082, "learning_rate": 8.65409168934933e-08, "loss": 0.7381, "step": 132 }, { "epoch": 33.29090909090909, "grad_norm": 8.267258644104004, "learning_rate": 8.625962667065488e-08, "loss": 0.8277, "step": 133 }, { "epoch": 33.58181818181818, "grad_norm": 6.568601608276367, "learning_rate": 8.597595590621892e-08, "loss": 0.7345, "step": 134 }, { "epoch": 33.872727272727275, "grad_norm": 6.368529796600342, "learning_rate": 8.568992620281244e-08, "loss": 0.6949, "step": 135 }, { "epoch": 34.0, "grad_norm": 6.077971458435059, "learning_rate": 8.540155934270471e-08, "loss": 0.6427, "step": 136 }, { "epoch": 34.29090909090909, "grad_norm": 6.2005743980407715, "learning_rate": 8.511087728614862e-08, "loss": 0.7113, "step": 137 }, { "epoch": 34.58181818181818, "grad_norm": 6.390923023223877, "learning_rate": 8.481790216970819e-08, "loss": 0.7422, "step": 138 }, { "epoch": 34.872727272727275, "grad_norm": 7.773628234863281, "learning_rate": 8.452265630457283e-08, "loss": 0.7829, "step": 139 }, { "epoch": 35.0, "grad_norm": 7.676466941833496, "learning_rate": 8.422516217485826e-08, "loss": 0.718, "step": 140 }, { "epoch": 35.29090909090909, "grad_norm": 6.630233287811279, "learning_rate": 8.392544243589427e-08, "loss": 0.7046, "step": 141 }, { "epoch": 35.58181818181818, "grad_norm": 6.816230297088623, "learning_rate": 8.362351991249938e-08, "loss": 0.7685, "step": 142 }, { "epoch": 35.872727272727275, "grad_norm": 6.341788291931152, "learning_rate": 8.331941759724268e-08, "loss": 0.6774, "step": 143 }, { "epoch": 36.0, "grad_norm": 8.306670188903809, "learning_rate": 8.301315864869288e-08, "loss": 0.9019, "step": 144 }, { "epoch": 36.0, "eval_loss": 0.7050113677978516, "eval_runtime": 0.7541, "eval_samples_per_second": 17.239, "eval_steps_per_second": 17.239, "step": 144 }, { "epoch": 36.29090909090909, "grad_norm": 6.883708953857422, "learning_rate": 8.270476638965461e-08, "loss": 0.7921, "step": 145 }, { "epoch": 36.58181818181818, "grad_norm": 6.389072418212891, "learning_rate": 8.239426430539243e-08, "loss": 0.6827, "step": 146 }, { "epoch": 36.872727272727275, "grad_norm": 6.924624443054199, "learning_rate": 8.208167604184218e-08, "loss": 0.7774, "step": 147 }, { "epoch": 37.0, "grad_norm": 6.5363450050354, "learning_rate": 8.176702540381035e-08, "loss": 0.6709, "step": 148 }, { "epoch": 37.29090909090909, "grad_norm": 6.154909610748291, "learning_rate": 8.145033635316129e-08, "loss": 0.7302, "step": 149 }, { "epoch": 37.58181818181818, "grad_norm": 6.3788676261901855, "learning_rate": 8.113163300699229e-08, "loss": 0.7301, "step": 150 }, { "epoch": 37.872727272727275, "grad_norm": 7.089733600616455, "learning_rate": 8.081093963579708e-08, "loss": 0.7223, "step": 151 }, { "epoch": 38.0, "grad_norm": 7.056278228759766, "learning_rate": 8.048828066161747e-08, "loss": 0.7989, "step": 152 }, { "epoch": 38.29090909090909, "grad_norm": 5.888208389282227, "learning_rate": 8.016368065618359e-08, "loss": 0.6768, "step": 153 }, { "epoch": 38.58181818181818, "grad_norm": 7.345203399658203, "learning_rate": 7.983716433904262e-08, "loss": 0.7454, "step": 154 }, { "epoch": 38.872727272727275, "grad_norm": 6.323718070983887, "learning_rate": 7.950875657567622e-08, "loss": 0.7511, "step": 155 }, { "epoch": 39.0, "grad_norm": 7.311026096343994, "learning_rate": 7.917848237560708e-08, "loss": 0.8113, "step": 156 }, { "epoch": 39.29090909090909, "grad_norm": 6.139308452606201, "learning_rate": 7.884636689049422e-08, "loss": 0.7076, "step": 157 }, { "epoch": 39.58181818181818, "grad_norm": 6.485006809234619, "learning_rate": 7.851243541221769e-08, "loss": 0.7437, "step": 158 }, { "epoch": 39.872727272727275, "grad_norm": 6.589916706085205, "learning_rate": 7.817671337095244e-08, "loss": 0.7404, "step": 159 }, { "epoch": 40.0, "grad_norm": 6.963124752044678, "learning_rate": 7.78392263332317e-08, "loss": 0.7328, "step": 160 }, { "epoch": 40.0, "eval_loss": 0.7006868124008179, "eval_runtime": 0.7566, "eval_samples_per_second": 17.183, "eval_steps_per_second": 17.183, "step": 160 }, { "epoch": 40.29090909090909, "grad_norm": 6.547840118408203, "learning_rate": 7.75e-08, "loss": 0.7431, "step": 161 }, { "epoch": 40.58181818181818, "grad_norm": 6.299688816070557, "learning_rate": 7.715906020465603e-08, "loss": 0.7585, "step": 162 }, { "epoch": 40.872727272727275, "grad_norm": 6.586760997772217, "learning_rate": 7.681643291108518e-08, "loss": 0.7324, "step": 163 }, { "epoch": 41.0, "grad_norm": 6.389430999755859, "learning_rate": 7.647214421168238e-08, "loss": 0.6533, "step": 164 }, { "epoch": 41.29090909090909, "grad_norm": 6.027109146118164, "learning_rate": 7.612622032536508e-08, "loss": 0.7135, "step": 165 }, { "epoch": 41.58181818181818, "grad_norm": 6.719674110412598, "learning_rate": 7.577868759557654e-08, "loss": 0.7597, "step": 166 }, { "epoch": 41.872727272727275, "grad_norm": 6.458725929260254, "learning_rate": 7.54295724882796e-08, "loss": 0.7109, "step": 167 }, { "epoch": 42.0, "grad_norm": 6.904190540313721, "learning_rate": 7.507890158994139e-08, "loss": 0.7504, "step": 168 }, { "epoch": 42.29090909090909, "grad_norm": 6.580723285675049, "learning_rate": 7.472670160550848e-08, "loss": 0.7096, "step": 169 }, { "epoch": 42.58181818181818, "grad_norm": 6.410011291503906, "learning_rate": 7.437299935637328e-08, "loss": 0.7692, "step": 170 }, { "epoch": 42.872727272727275, "grad_norm": 6.3067827224731445, "learning_rate": 7.401782177833146e-08, "loss": 0.7346, "step": 171 }, { "epoch": 43.0, "grad_norm": 5.755003929138184, "learning_rate": 7.366119591953075e-08, "loss": 0.6633, "step": 172 }, { "epoch": 43.29090909090909, "grad_norm": 6.46678352355957, "learning_rate": 7.3303148938411e-08, "loss": 0.7365, "step": 173 }, { "epoch": 43.58181818181818, "grad_norm": 6.622053623199463, "learning_rate": 7.294370810163607e-08, "loss": 0.7511, "step": 174 }, { "epoch": 43.872727272727275, "grad_norm": 5.266422748565674, "learning_rate": 7.258290078201731e-08, "loss": 0.6481, "step": 175 }, { "epoch": 44.0, "grad_norm": 7.341455936431885, "learning_rate": 7.222075445642904e-08, "loss": 0.8191, "step": 176 }, { "epoch": 44.0, "eval_loss": 0.6937930583953857, "eval_runtime": 0.7725, "eval_samples_per_second": 16.828, "eval_steps_per_second": 16.828, "step": 176 }, { "epoch": 44.29090909090909, "grad_norm": 6.314858436584473, "learning_rate": 7.185729670371604e-08, "loss": 0.7001, "step": 177 }, { "epoch": 44.58181818181818, "grad_norm": 6.364148139953613, "learning_rate": 7.149255520259337e-08, "loss": 0.786, "step": 178 }, { "epoch": 44.872727272727275, "grad_norm": 5.679451942443848, "learning_rate": 7.11265577295385e-08, "loss": 0.6767, "step": 179 }, { "epoch": 45.0, "grad_norm": 6.4454216957092285, "learning_rate": 7.075933215667603e-08, "loss": 0.7351, "step": 180 }, { "epoch": 45.29090909090909, "grad_norm": 5.991427421569824, "learning_rate": 7.039090644965509e-08, "loss": 0.7047, "step": 181 }, { "epoch": 45.58181818181818, "grad_norm": 5.386115550994873, "learning_rate": 7.002130866551968e-08, "loss": 0.7113, "step": 182 }, { "epoch": 45.872727272727275, "grad_norm": 6.815364360809326, "learning_rate": 6.965056695057204e-08, "loss": 0.7255, "step": 183 }, { "epoch": 46.0, "grad_norm": 6.38714599609375, "learning_rate": 6.927870953822915e-08, "loss": 0.7503, "step": 184 }, { "epoch": 46.29090909090909, "grad_norm": 5.759856224060059, "learning_rate": 6.890576474687262e-08, "loss": 0.7008, "step": 185 }, { "epoch": 46.58181818181818, "grad_norm": 5.1396918296813965, "learning_rate": 6.853176097769228e-08, "loss": 0.6925, "step": 186 }, { "epoch": 46.872727272727275, "grad_norm": 5.9070539474487305, "learning_rate": 6.815672671252315e-08, "loss": 0.7409, "step": 187 }, { "epoch": 47.0, "grad_norm": 5.90541410446167, "learning_rate": 6.778069051167653e-08, "loss": 0.702, "step": 188 }, { "epoch": 47.29090909090909, "grad_norm": 5.474076747894287, "learning_rate": 6.740368101176495e-08, "loss": 0.7085, "step": 189 }, { "epoch": 47.58181818181818, "grad_norm": 5.111520767211914, "learning_rate": 6.702572692352155e-08, "loss": 0.685, "step": 190 }, { "epoch": 47.872727272727275, "grad_norm": 5.618140697479248, "learning_rate": 6.664685702961344e-08, "loss": 0.7551, "step": 191 }, { "epoch": 48.0, "grad_norm": 4.961245059967041, "learning_rate": 6.626710018244986e-08, "loss": 0.6327, "step": 192 }, { "epoch": 48.0, "eval_loss": 0.6752312183380127, "eval_runtime": 0.7832, "eval_samples_per_second": 16.599, "eval_steps_per_second": 16.599, "step": 192 }, { "epoch": 48.29090909090909, "grad_norm": 5.36975622177124, "learning_rate": 6.588648530198504e-08, "loss": 0.7312, "step": 193 }, { "epoch": 48.58181818181818, "grad_norm": 5.021007061004639, "learning_rate": 6.550504137351574e-08, "loss": 0.7467, "step": 194 }, { "epoch": 48.872727272727275, "grad_norm": 4.721583843231201, "learning_rate": 6.512279744547392e-08, "loss": 0.6271, "step": 195 }, { "epoch": 49.0, "grad_norm": 5.531439304351807, "learning_rate": 6.473978262721462e-08, "loss": 0.7127, "step": 196 }, { "epoch": 49.29090909090909, "grad_norm": 5.3525309562683105, "learning_rate": 6.435602608679917e-08, "loss": 0.7255, "step": 197 }, { "epoch": 49.58181818181818, "grad_norm": 4.411137104034424, "learning_rate": 6.397155704877387e-08, "loss": 0.6177, "step": 198 }, { "epoch": 49.872727272727275, "grad_norm": 4.907252788543701, "learning_rate": 6.358640479194451e-08, "loss": 0.7295, "step": 199 }, { "epoch": 50.0, "grad_norm": 4.626101493835449, "learning_rate": 6.320059864714664e-08, "loss": 0.7091, "step": 200 }, { "epoch": 50.29090909090909, "grad_norm": 4.853626728057861, "learning_rate": 6.281416799501187e-08, "loss": 0.7432, "step": 201 }, { "epoch": 50.58181818181818, "grad_norm": 4.439899921417236, "learning_rate": 6.242714226373049e-08, "loss": 0.676, "step": 202 }, { "epoch": 50.872727272727275, "grad_norm": 4.5280985832214355, "learning_rate": 6.203955092681039e-08, "loss": 0.7086, "step": 203 }, { "epoch": 51.0, "grad_norm": 4.414018154144287, "learning_rate": 6.165142350083249e-08, "loss": 0.5264, "step": 204 }, { "epoch": 51.29090909090909, "grad_norm": 4.17572021484375, "learning_rate": 6.126278954320294e-08, "loss": 0.7346, "step": 205 }, { "epoch": 51.58181818181818, "grad_norm": 4.015255928039551, "learning_rate": 6.087367864990232e-08, "loss": 0.6239, "step": 206 }, { "epoch": 51.872727272727275, "grad_norm": 4.698182582855225, "learning_rate": 6.048412045323163e-08, "loss": 0.688, "step": 207 }, { "epoch": 52.0, "grad_norm": 5.5075297355651855, "learning_rate": 6.00941446195558e-08, "loss": 0.6903, "step": 208 }, { "epoch": 52.0, "eval_loss": 0.6604220271110535, "eval_runtime": 0.6915, "eval_samples_per_second": 18.8, "eval_steps_per_second": 18.8, "step": 208 }, { "epoch": 52.29090909090909, "grad_norm": 3.8842809200286865, "learning_rate": 5.970378084704441e-08, "loss": 0.6428, "step": 209 }, { "epoch": 52.58181818181818, "grad_norm": 4.9067301750183105, "learning_rate": 5.931305886341008e-08, "loss": 0.7572, "step": 210 }, { "epoch": 52.872727272727275, "grad_norm": 4.025907516479492, "learning_rate": 5.892200842364462e-08, "loss": 0.6545, "step": 211 }, { "epoch": 53.0, "grad_norm": 4.105547904968262, "learning_rate": 5.853065930775303e-08, "loss": 0.6439, "step": 212 }, { "epoch": 53.29090909090909, "grad_norm": 3.7520296573638916, "learning_rate": 5.813904131848564e-08, "loss": 0.677, "step": 213 }, { "epoch": 53.58181818181818, "grad_norm": 3.975045680999756, "learning_rate": 5.7747184279068564e-08, "loss": 0.6321, "step": 214 }, { "epoch": 53.872727272727275, "grad_norm": 4.536473274230957, "learning_rate": 5.735511803093248e-08, "loss": 0.7326, "step": 215 }, { "epoch": 54.0, "grad_norm": 5.148712158203125, "learning_rate": 5.696287243144012e-08, "loss": 0.6819, "step": 216 }, { "epoch": 54.29090909090909, "grad_norm": 3.6721999645233154, "learning_rate": 5.6570477351612554e-08, "loss": 0.6655, "step": 217 }, { "epoch": 54.58181818181818, "grad_norm": 4.29323148727417, "learning_rate": 5.61779626738543e-08, "loss": 0.6743, "step": 218 }, { "epoch": 54.872727272727275, "grad_norm": 4.018572807312012, "learning_rate": 5.5785358289677765e-08, "loss": 0.711, "step": 219 }, { "epoch": 55.0, "grad_norm": 4.6550445556640625, "learning_rate": 5.539269409742683e-08, "loss": 0.6398, "step": 220 }, { "epoch": 55.29090909090909, "grad_norm": 4.599621295928955, "learning_rate": 5.5e-08, "loss": 0.6885, "step": 221 }, { "epoch": 55.58181818181818, "grad_norm": 3.6876866817474365, "learning_rate": 5.460730590257318e-08, "loss": 0.6391, "step": 222 }, { "epoch": 55.872727272727275, "grad_norm": 3.641345262527466, "learning_rate": 5.421464171032224e-08, "loss": 0.6684, "step": 223 }, { "epoch": 56.0, "grad_norm": 4.325244903564453, "learning_rate": 5.382203732614572e-08, "loss": 0.7467, "step": 224 }, { "epoch": 56.0, "eval_loss": 0.6532977819442749, "eval_runtime": 0.746, "eval_samples_per_second": 17.427, "eval_steps_per_second": 17.427, "step": 224 }, { "epoch": 56.29090909090909, "grad_norm": 4.434227466583252, "learning_rate": 5.342952264838747e-08, "loss": 0.7395, "step": 225 }, { "epoch": 56.58181818181818, "grad_norm": 4.03561544418335, "learning_rate": 5.303712756855988e-08, "loss": 0.7176, "step": 226 }, { "epoch": 56.872727272727275, "grad_norm": 3.4329726696014404, "learning_rate": 5.264488196906752e-08, "loss": 0.5565, "step": 227 }, { "epoch": 57.0, "grad_norm": 3.6157584190368652, "learning_rate": 5.225281572093143e-08, "loss": 0.7052, "step": 228 }, { "epoch": 57.29090909090909, "grad_norm": 3.654561996459961, "learning_rate": 5.1860958681514355e-08, "loss": 0.6931, "step": 229 }, { "epoch": 57.58181818181818, "grad_norm": 3.4616754055023193, "learning_rate": 5.1469340692246985e-08, "loss": 0.6126, "step": 230 }, { "epoch": 57.872727272727275, "grad_norm": 4.538090229034424, "learning_rate": 5.107799157635537e-08, "loss": 0.7149, "step": 231 }, { "epoch": 58.0, "grad_norm": 3.8424854278564453, "learning_rate": 5.068694113658992e-08, "loss": 0.6564, "step": 232 }, { "epoch": 58.29090909090909, "grad_norm": 3.360053777694702, "learning_rate": 5.02962191529556e-08, "loss": 0.6657, "step": 233 }, { "epoch": 58.58181818181818, "grad_norm": 4.166203022003174, "learning_rate": 4.9905855380444194e-08, "loss": 0.7461, "step": 234 }, { "epoch": 58.872727272727275, "grad_norm": 3.4333815574645996, "learning_rate": 4.9515879546768366e-08, "loss": 0.5924, "step": 235 }, { "epoch": 59.0, "grad_norm": 4.719890594482422, "learning_rate": 4.912632135009769e-08, "loss": 0.6793, "step": 236 }, { "epoch": 59.29090909090909, "grad_norm": 3.6366472244262695, "learning_rate": 4.873721045679706e-08, "loss": 0.6648, "step": 237 }, { "epoch": 59.58181818181818, "grad_norm": 4.29836893081665, "learning_rate": 4.8348576499167516e-08, "loss": 0.6871, "step": 238 }, { "epoch": 59.872727272727275, "grad_norm": 3.3436715602874756, "learning_rate": 4.7960449073189604e-08, "loss": 0.6136, "step": 239 }, { "epoch": 60.0, "grad_norm": 3.974397897720337, "learning_rate": 4.75728577362695e-08, "loss": 0.7364, "step": 240 }, { "epoch": 60.0, "eval_loss": 0.6488688588142395, "eval_runtime": 0.7429, "eval_samples_per_second": 17.5, "eval_steps_per_second": 17.5, "step": 240 }, { "epoch": 60.29090909090909, "grad_norm": 4.133732318878174, "learning_rate": 4.718583200498813e-08, "loss": 0.7386, "step": 241 }, { "epoch": 60.58181818181818, "grad_norm": 3.358363151550293, "learning_rate": 4.6799401352853365e-08, "loss": 0.6255, "step": 242 }, { "epoch": 60.872727272727275, "grad_norm": 3.73943829536438, "learning_rate": 4.641359520805548e-08, "loss": 0.6834, "step": 243 }, { "epoch": 61.0, "grad_norm": 3.680448532104492, "learning_rate": 4.6028442951226135e-08, "loss": 0.5903, "step": 244 }, { "epoch": 61.29090909090909, "grad_norm": 3.3045241832733154, "learning_rate": 4.564397391320084e-08, "loss": 0.5871, "step": 245 }, { "epoch": 61.58181818181818, "grad_norm": 3.690742015838623, "learning_rate": 4.526021737278537e-08, "loss": 0.6913, "step": 246 }, { "epoch": 61.872727272727275, "grad_norm": 4.233401775360107, "learning_rate": 4.4877202554526084e-08, "loss": 0.7115, "step": 247 }, { "epoch": 62.0, "grad_norm": 3.5080771446228027, "learning_rate": 4.449495862648427e-08, "loss": 0.687, "step": 248 }, { "epoch": 62.29090909090909, "grad_norm": 3.3871119022369385, "learning_rate": 4.4113514698014955e-08, "loss": 0.6901, "step": 249 }, { "epoch": 62.58181818181818, "grad_norm": 3.6088693141937256, "learning_rate": 4.373289981755013e-08, "loss": 0.631, "step": 250 }, { "epoch": 62.872727272727275, "grad_norm": 3.743149518966675, "learning_rate": 4.335314297038656e-08, "loss": 0.6351, "step": 251 }, { "epoch": 63.0, "grad_norm": 4.030084133148193, "learning_rate": 4.297427307647844e-08, "loss": 0.7212, "step": 252 }, { "epoch": 63.29090909090909, "grad_norm": 3.458228349685669, "learning_rate": 4.2596318988235035e-08, "loss": 0.629, "step": 253 }, { "epoch": 63.58181818181818, "grad_norm": 4.063506126403809, "learning_rate": 4.2219309488323486e-08, "loss": 0.6565, "step": 254 }, { "epoch": 63.872727272727275, "grad_norm": 3.257892370223999, "learning_rate": 4.184327328747685e-08, "loss": 0.6644, "step": 255 }, { "epoch": 64.0, "grad_norm": 3.964184284210205, "learning_rate": 4.1468239022307716e-08, "loss": 0.7706, "step": 256 }, { "epoch": 64.0, "eval_loss": 0.6460027694702148, "eval_runtime": 0.7572, "eval_samples_per_second": 17.168, "eval_steps_per_second": 17.168, "step": 256 }, { "epoch": 64.2909090909091, "grad_norm": 3.444884777069092, "learning_rate": 4.1094235253127375e-08, "loss": 0.5848, "step": 257 }, { "epoch": 64.58181818181818, "grad_norm": 3.34226131439209, "learning_rate": 4.072129046177086e-08, "loss": 0.6438, "step": 258 }, { "epoch": 64.87272727272727, "grad_norm": 4.081578254699707, "learning_rate": 4.034943304942796e-08, "loss": 0.7825, "step": 259 }, { "epoch": 65.0, "grad_norm": 3.9306929111480713, "learning_rate": 3.997869133448031e-08, "loss": 0.7003, "step": 260 }, { "epoch": 65.2909090909091, "grad_norm": 3.3377864360809326, "learning_rate": 3.960909355034491e-08, "loss": 0.6723, "step": 261 }, { "epoch": 65.58181818181818, "grad_norm": 4.126795291900635, "learning_rate": 3.924066784332396e-08, "loss": 0.6778, "step": 262 }, { "epoch": 65.87272727272727, "grad_norm": 3.283628225326538, "learning_rate": 3.8873442270461487e-08, "loss": 0.6196, "step": 263 }, { "epoch": 66.0, "grad_norm": 3.599966526031494, "learning_rate": 3.850744479740663e-08, "loss": 0.7125, "step": 264 }, { "epoch": 66.2909090909091, "grad_norm": 3.398857831954956, "learning_rate": 3.814270329628395e-08, "loss": 0.6958, "step": 265 }, { "epoch": 66.58181818181818, "grad_norm": 3.538728952407837, "learning_rate": 3.777924554357096e-08, "loss": 0.6089, "step": 266 }, { "epoch": 66.87272727272727, "grad_norm": 3.549941062927246, "learning_rate": 3.7417099217982684e-08, "loss": 0.6794, "step": 267 }, { "epoch": 67.0, "grad_norm": 3.7104790210723877, "learning_rate": 3.7056291898363926e-08, "loss": 0.6845, "step": 268 }, { "epoch": 67.2909090909091, "grad_norm": 3.7661495208740234, "learning_rate": 3.669685106158899e-08, "loss": 0.6726, "step": 269 }, { "epoch": 67.58181818181818, "grad_norm": 3.2654290199279785, "learning_rate": 3.633880408046926e-08, "loss": 0.6597, "step": 270 }, { "epoch": 67.87272727272727, "grad_norm": 3.3389344215393066, "learning_rate": 3.598217822166854e-08, "loss": 0.5967, "step": 271 }, { "epoch": 68.0, "grad_norm": 4.037345886230469, "learning_rate": 3.5627000643626705e-08, "loss": 0.7777, "step": 272 }, { "epoch": 68.0, "eval_loss": 0.6440867185592651, "eval_runtime": 0.7501, "eval_samples_per_second": 17.331, "eval_steps_per_second": 17.331, "step": 272 }, { "epoch": 68.2909090909091, "grad_norm": 3.359079360961914, "learning_rate": 3.527329839449151e-08, "loss": 0.6824, "step": 273 }, { "epoch": 68.58181818181818, "grad_norm": 3.1845285892486572, "learning_rate": 3.49210984100586e-08, "loss": 0.5956, "step": 274 }, { "epoch": 68.87272727272727, "grad_norm": 3.564899206161499, "learning_rate": 3.4570427511720395e-08, "loss": 0.679, "step": 275 }, { "epoch": 69.0, "grad_norm": 5.116268157958984, "learning_rate": 3.4221312404423483e-08, "loss": 0.7194, "step": 276 }, { "epoch": 69.2909090909091, "grad_norm": 3.2692816257476807, "learning_rate": 3.387377967463493e-08, "loss": 0.5994, "step": 277 }, { "epoch": 69.58181818181818, "grad_norm": 3.1827392578125, "learning_rate": 3.3527855788317614e-08, "loss": 0.6582, "step": 278 }, { "epoch": 69.87272727272727, "grad_norm": 3.776779890060425, "learning_rate": 3.3183567088914834e-08, "loss": 0.7129, "step": 279 }, { "epoch": 70.0, "grad_norm": 4.038694381713867, "learning_rate": 3.2840939795343986e-08, "loss": 0.7197, "step": 280 }, { "epoch": 70.2909090909091, "grad_norm": 4.152867317199707, "learning_rate": 3.250000000000001e-08, "loss": 0.6844, "step": 281 }, { "epoch": 70.58181818181818, "grad_norm": 3.327399492263794, "learning_rate": 3.2160773666768325e-08, "loss": 0.7062, "step": 282 }, { "epoch": 70.87272727272727, "grad_norm": 3.2052102088928223, "learning_rate": 3.182328662904756e-08, "loss": 0.597, "step": 283 }, { "epoch": 71.0, "grad_norm": 3.528104305267334, "learning_rate": 3.14875645877823e-08, "loss": 0.6592, "step": 284 }, { "epoch": 71.2909090909091, "grad_norm": 3.380600929260254, "learning_rate": 3.1153633109505784e-08, "loss": 0.6398, "step": 285 }, { "epoch": 71.58181818181818, "grad_norm": 3.3184642791748047, "learning_rate": 3.082151762439292e-08, "loss": 0.6979, "step": 286 }, { "epoch": 71.87272727272727, "grad_norm": 3.4902994632720947, "learning_rate": 3.049124342432378e-08, "loss": 0.6352, "step": 287 }, { "epoch": 72.0, "grad_norm": 3.682870388031006, "learning_rate": 3.0162835660957385e-08, "loss": 0.6391, "step": 288 }, { "epoch": 72.0, "eval_loss": 0.641921877861023, "eval_runtime": 0.7653, "eval_samples_per_second": 16.986, "eval_steps_per_second": 16.986, "step": 288 }, { "epoch": 72.2909090909091, "grad_norm": 3.4279274940490723, "learning_rate": 2.983631934381639e-08, "loss": 0.6219, "step": 289 }, { "epoch": 72.58181818181818, "grad_norm": 3.627363681793213, "learning_rate": 2.9511719338382535e-08, "loss": 0.6635, "step": 290 }, { "epoch": 72.87272727272727, "grad_norm": 3.1634864807128906, "learning_rate": 2.918906036420294e-08, "loss": 0.6377, "step": 291 }, { "epoch": 73.0, "grad_norm": 3.896449327468872, "learning_rate": 2.886836699300771e-08, "loss": 0.7822, "step": 292 }, { "epoch": 73.2909090909091, "grad_norm": 3.168968677520752, "learning_rate": 2.8549663646838718e-08, "loss": 0.609, "step": 293 }, { "epoch": 73.58181818181818, "grad_norm": 3.3781349658966064, "learning_rate": 2.8232974596189653e-08, "loss": 0.6728, "step": 294 }, { "epoch": 73.87272727272727, "grad_norm": 3.41473650932312, "learning_rate": 2.791832395815782e-08, "loss": 0.7013, "step": 295 }, { "epoch": 74.0, "grad_norm": 3.771911859512329, "learning_rate": 2.760573569460757e-08, "loss": 0.6343, "step": 296 }, { "epoch": 74.2909090909091, "grad_norm": 3.527878761291504, "learning_rate": 2.729523361034538e-08, "loss": 0.6528, "step": 297 }, { "epoch": 74.58181818181818, "grad_norm": 3.105755090713501, "learning_rate": 2.6986841351307128e-08, "loss": 0.6243, "step": 298 }, { "epoch": 74.87272727272727, "grad_norm": 3.3217263221740723, "learning_rate": 2.6680582402757322e-08, "loss": 0.6658, "step": 299 }, { "epoch": 75.0, "grad_norm": 4.193359375, "learning_rate": 2.637648008750062e-08, "loss": 0.7016, "step": 300 }, { "epoch": 75.2909090909091, "grad_norm": 3.2874765396118164, "learning_rate": 2.6074557564105726e-08, "loss": 0.6661, "step": 301 }, { "epoch": 75.58181818181818, "grad_norm": 3.4806275367736816, "learning_rate": 2.5774837825141737e-08, "loss": 0.6277, "step": 302 }, { "epoch": 75.87272727272727, "grad_norm": 3.398120880126953, "learning_rate": 2.547734369542718e-08, "loss": 0.6863, "step": 303 }, { "epoch": 76.0, "grad_norm": 3.1762161254882812, "learning_rate": 2.5182097830291825e-08, "loss": 0.648, "step": 304 }, { "epoch": 76.0, "eval_loss": 0.6407743096351624, "eval_runtime": 0.7838, "eval_samples_per_second": 16.585, "eval_steps_per_second": 16.585, "step": 304 }, { "epoch": 76.2909090909091, "grad_norm": 3.250011444091797, "learning_rate": 2.4889122713851394e-08, "loss": 0.6552, "step": 305 }, { "epoch": 76.58181818181818, "grad_norm": 3.1045658588409424, "learning_rate": 2.4598440657295288e-08, "loss": 0.6147, "step": 306 }, { "epoch": 76.87272727272727, "grad_norm": 4.007096290588379, "learning_rate": 2.4310073797187574e-08, "loss": 0.7181, "step": 307 }, { "epoch": 77.0, "grad_norm": 3.300295829772949, "learning_rate": 2.4024044093781064e-08, "loss": 0.6115, "step": 308 }, { "epoch": 77.2909090909091, "grad_norm": 3.376610517501831, "learning_rate": 2.3740373329345117e-08, "loss": 0.7065, "step": 309 }, { "epoch": 77.58181818181818, "grad_norm": 3.1987497806549072, "learning_rate": 2.3459083106506712e-08, "loss": 0.6265, "step": 310 }, { "epoch": 77.87272727272727, "grad_norm": 3.428140878677368, "learning_rate": 2.3180194846605363e-08, "loss": 0.629, "step": 311 }, { "epoch": 78.0, "grad_norm": 3.489027261734009, "learning_rate": 2.2903729788061836e-08, "loss": 0.6626, "step": 312 }, { "epoch": 78.2909090909091, "grad_norm": 3.7477946281433105, "learning_rate": 2.2629708984760707e-08, "loss": 0.7006, "step": 313 }, { "epoch": 78.58181818181818, "grad_norm": 3.2413809299468994, "learning_rate": 2.2358153304447067e-08, "loss": 0.6363, "step": 314 }, { "epoch": 78.87272727272727, "grad_norm": 3.0365958213806152, "learning_rate": 2.2089083427137328e-08, "loss": 0.6307, "step": 315 }, { "epoch": 79.0, "grad_norm": 3.5392417907714844, "learning_rate": 2.182251984354442e-08, "loss": 0.6594, "step": 316 }, { "epoch": 79.2909090909091, "grad_norm": 3.2169861793518066, "learning_rate": 2.1558482853517254e-08, "loss": 0.6261, "step": 317 }, { "epoch": 79.58181818181818, "grad_norm": 3.1975908279418945, "learning_rate": 2.1296992564494903e-08, "loss": 0.6303, "step": 318 }, { "epoch": 79.87272727272727, "grad_norm": 3.5037009716033936, "learning_rate": 2.103806888997526e-08, "loss": 0.6847, "step": 319 }, { "epoch": 80.0, "grad_norm": 3.49397611618042, "learning_rate": 2.078173154799861e-08, "loss": 0.704, "step": 320 }, { "epoch": 80.0, "eval_loss": 0.6397803425788879, "eval_runtime": 0.7407, "eval_samples_per_second": 17.552, "eval_steps_per_second": 17.552, "step": 320 }, { "epoch": 80.2909090909091, "grad_norm": 3.312922954559326, "learning_rate": 2.0528000059645996e-08, "loss": 0.6742, "step": 321 }, { "epoch": 80.58181818181818, "grad_norm": 3.8569176197052, "learning_rate": 2.027689374755261e-08, "loss": 0.6589, "step": 322 }, { "epoch": 80.87272727272727, "grad_norm": 3.100782871246338, "learning_rate": 2.0028431734436306e-08, "loss": 0.6384, "step": 323 }, { "epoch": 81.0, "grad_norm": 3.488448143005371, "learning_rate": 1.9782632941641373e-08, "loss": 0.6523, "step": 324 }, { "epoch": 81.2909090909091, "grad_norm": 3.713844060897827, "learning_rate": 1.9539516087697516e-08, "loss": 0.7186, "step": 325 }, { "epoch": 81.58181818181818, "grad_norm": 3.2771830558776855, "learning_rate": 1.9299099686894422e-08, "loss": 0.7155, "step": 326 }, { "epoch": 81.87272727272727, "grad_norm": 3.013705015182495, "learning_rate": 1.9061402047871834e-08, "loss": 0.5796, "step": 327 }, { "epoch": 82.0, "grad_norm": 3.9129855632781982, "learning_rate": 1.8826441272225223e-08, "loss": 0.5744, "step": 328 }, { "epoch": 82.2909090909091, "grad_norm": 3.432311773300171, "learning_rate": 1.8594235253127372e-08, "loss": 0.6681, "step": 329 }, { "epoch": 82.58181818181818, "grad_norm": 3.2865946292877197, "learning_rate": 1.8364801673965642e-08, "loss": 0.7103, "step": 330 }, { "epoch": 82.87272727272727, "grad_norm": 3.1664698123931885, "learning_rate": 1.8138158006995365e-08, "loss": 0.5704, "step": 331 }, { "epoch": 83.0, "grad_norm": 3.5827629566192627, "learning_rate": 1.7914321512009295e-08, "loss": 0.7128, "step": 332 }, { "epoch": 83.2909090909091, "grad_norm": 3.228314161300659, "learning_rate": 1.7693309235023126e-08, "loss": 0.6072, "step": 333 }, { "epoch": 83.58181818181818, "grad_norm": 3.2263855934143066, "learning_rate": 1.7475138006977434e-08, "loss": 0.6525, "step": 334 }, { "epoch": 83.87272727272727, "grad_norm": 3.2911698818206787, "learning_rate": 1.7259824442455922e-08, "loss": 0.7323, "step": 335 }, { "epoch": 84.0, "grad_norm": 3.629072666168213, "learning_rate": 1.704738493842015e-08, "loss": 0.6316, "step": 336 }, { "epoch": 84.0, "eval_loss": 0.6387331485748291, "eval_runtime": 0.761, "eval_samples_per_second": 17.082, "eval_steps_per_second": 17.082, "step": 336 }, { "epoch": 84.2909090909091, "grad_norm": 3.0507638454437256, "learning_rate": 1.6837835672960833e-08, "loss": 0.6576, "step": 337 }, { "epoch": 84.58181818181818, "grad_norm": 3.1356823444366455, "learning_rate": 1.663119260406585e-08, "loss": 0.6216, "step": 338 }, { "epoch": 84.87272727272727, "grad_norm": 3.6667861938476562, "learning_rate": 1.642747146840495e-08, "loss": 0.6975, "step": 339 }, { "epoch": 85.0, "grad_norm": 3.397160768508911, "learning_rate": 1.6226687780131337e-08, "loss": 0.6022, "step": 340 }, { "epoch": 85.2909090909091, "grad_norm": 3.163560152053833, "learning_rate": 1.602885682970026e-08, "loss": 0.6316, "step": 341 }, { "epoch": 85.58181818181818, "grad_norm": 3.1787586212158203, "learning_rate": 1.5833993682704515e-08, "loss": 0.6725, "step": 342 }, { "epoch": 85.87272727272727, "grad_norm": 3.379927158355713, "learning_rate": 1.5642113178727193e-08, "loss": 0.6319, "step": 343 }, { "epoch": 86.0, "grad_norm": 3.783219337463379, "learning_rate": 1.5453229930211566e-08, "loss": 0.72, "step": 344 }, { "epoch": 86.2909090909091, "grad_norm": 3.678173065185547, "learning_rate": 1.5267358321348288e-08, "loss": 0.7343, "step": 345 }, { "epoch": 86.58181818181818, "grad_norm": 3.0277297496795654, "learning_rate": 1.5084512506980025e-08, "loss": 0.6112, "step": 346 }, { "epoch": 86.87272727272727, "grad_norm": 2.9994826316833496, "learning_rate": 1.490470641152345e-08, "loss": 0.6377, "step": 347 }, { "epoch": 87.0, "grad_norm": 3.4868507385253906, "learning_rate": 1.4727953727908877e-08, "loss": 0.603, "step": 348 }, { "epoch": 87.2909090909091, "grad_norm": 3.3681607246398926, "learning_rate": 1.4554267916537493e-08, "loss": 0.6832, "step": 349 }, { "epoch": 87.58181818181818, "grad_norm": 3.123229742050171, "learning_rate": 1.438366220425628e-08, "loss": 0.6086, "step": 350 }, { "epoch": 87.87272727272727, "grad_norm": 3.2728774547576904, "learning_rate": 1.4216149583350753e-08, "loss": 0.6735, "step": 351 }, { "epoch": 88.0, "grad_norm": 3.185558319091797, "learning_rate": 1.405174281055556e-08, "loss": 0.6232, "step": 352 }, { "epoch": 88.0, "eval_loss": 0.6380465030670166, "eval_runtime": 0.6934, "eval_samples_per_second": 18.748, "eval_steps_per_second": 18.748, "step": 352 }, { "epoch": 88.2909090909091, "grad_norm": 3.0658743381500244, "learning_rate": 1.3890454406082957e-08, "loss": 0.6388, "step": 353 }, { "epoch": 88.58181818181818, "grad_norm": 3.2591443061828613, "learning_rate": 1.3732296652669417e-08, "loss": 0.6658, "step": 354 }, { "epoch": 88.87272727272727, "grad_norm": 3.252021551132202, "learning_rate": 1.3577281594640182e-08, "loss": 0.6671, "step": 355 }, { "epoch": 89.0, "grad_norm": 3.7164716720581055, "learning_rate": 1.3425421036992096e-08, "loss": 0.6185, "step": 356 }, { "epoch": 89.2909090909091, "grad_norm": 3.1124889850616455, "learning_rate": 1.327672654449457e-08, "loss": 0.6648, "step": 357 }, { "epoch": 89.58181818181818, "grad_norm": 3.4343039989471436, "learning_rate": 1.3131209440808898e-08, "loss": 0.6731, "step": 358 }, { "epoch": 89.87272727272727, "grad_norm": 2.8693790435791016, "learning_rate": 1.2988880807625927e-08, "loss": 0.5938, "step": 359 }, { "epoch": 90.0, "grad_norm": 4.409383773803711, "learning_rate": 1.284975148382211e-08, "loss": 0.7107, "step": 360 }, { "epoch": 90.2909090909091, "grad_norm": 3.080493450164795, "learning_rate": 1.2713832064634124e-08, "loss": 0.6798, "step": 361 }, { "epoch": 90.58181818181818, "grad_norm": 2.9749112129211426, "learning_rate": 1.2581132900851971e-08, "loss": 0.6249, "step": 362 }, { "epoch": 90.87272727272727, "grad_norm": 3.626858949661255, "learning_rate": 1.2451664098030743e-08, "loss": 0.6616, "step": 363 }, { "epoch": 91.0, "grad_norm": 3.859955310821533, "learning_rate": 1.232543551572103e-08, "loss": 0.6418, "step": 364 }, { "epoch": 91.2909090909091, "grad_norm": 3.4120571613311768, "learning_rate": 1.2202456766718091e-08, "loss": 0.707, "step": 365 }, { "epoch": 91.58181818181818, "grad_norm": 2.8337764739990234, "learning_rate": 1.2082737216329793e-08, "loss": 0.6083, "step": 366 }, { "epoch": 91.87272727272727, "grad_norm": 3.1251420974731445, "learning_rate": 1.1966285981663406e-08, "loss": 0.6327, "step": 367 }, { "epoch": 92.0, "grad_norm": 4.236498832702637, "learning_rate": 1.1853111930931313e-08, "loss": 0.6545, "step": 368 }, { "epoch": 92.0, "eval_loss": 0.6371535062789917, "eval_runtime": 0.7492, "eval_samples_per_second": 17.353, "eval_steps_per_second": 17.353, "step": 368 }, { "epoch": 92.2909090909091, "grad_norm": 3.1066205501556396, "learning_rate": 1.174322368277565e-08, "loss": 0.5913, "step": 369 }, { "epoch": 92.58181818181818, "grad_norm": 3.334169626235962, "learning_rate": 1.1636629605611967e-08, "loss": 0.6869, "step": 370 }, { "epoch": 92.87272727272727, "grad_norm": 3.105184316635132, "learning_rate": 1.1533337816991931e-08, "loss": 0.6699, "step": 371 }, { "epoch": 93.0, "grad_norm": 3.6188406944274902, "learning_rate": 1.1433356182985158e-08, "loss": 0.658, "step": 372 }, { "epoch": 93.2909090909091, "grad_norm": 3.376845598220825, "learning_rate": 1.133669231758016e-08, "loss": 0.7064, "step": 373 }, { "epoch": 93.58181818181818, "grad_norm": 3.327584981918335, "learning_rate": 1.1243353582104555e-08, "loss": 0.6268, "step": 374 }, { "epoch": 93.87272727272727, "grad_norm": 2.9586260318756104, "learning_rate": 1.115334708466442e-08, "loss": 0.6366, "step": 375 }, { "epoch": 94.0, "grad_norm": 3.442078113555908, "learning_rate": 1.1066679679602998e-08, "loss": 0.5847, "step": 376 }, { "epoch": 94.2909090909091, "grad_norm": 2.9368817806243896, "learning_rate": 1.0983357966978745e-08, "loss": 0.6578, "step": 377 }, { "epoch": 94.58181818181818, "grad_norm": 3.0193896293640137, "learning_rate": 1.0903388292062667e-08, "loss": 0.6523, "step": 378 }, { "epoch": 94.87272727272727, "grad_norm": 3.727072238922119, "learning_rate": 1.0826776744855121e-08, "loss": 0.6582, "step": 379 }, { "epoch": 95.0, "grad_norm": 3.880216598510742, "learning_rate": 1.0753529159622047e-08, "loss": 0.6047, "step": 380 }, { "epoch": 95.2909090909091, "grad_norm": 3.0474436283111572, "learning_rate": 1.068365111445064e-08, "loss": 0.6078, "step": 381 }, { "epoch": 95.58181818181818, "grad_norm": 3.09653639793396, "learning_rate": 1.0617147930824585e-08, "loss": 0.6347, "step": 382 }, { "epoch": 95.87272727272727, "grad_norm": 3.2464277744293213, "learning_rate": 1.0554024673218806e-08, "loss": 0.6709, "step": 383 }, { "epoch": 96.0, "grad_norm": 3.866807699203491, "learning_rate": 1.0494286148713743e-08, "loss": 0.7126, "step": 384 }, { "epoch": 96.0, "eval_loss": 0.6363654732704163, "eval_runtime": 0.7605, "eval_samples_per_second": 17.094, "eval_steps_per_second": 17.094, "step": 384 }, { "epoch": 96.2909090909091, "grad_norm": 3.1493375301361084, "learning_rate": 1.0437936906629333e-08, "loss": 0.5571, "step": 385 }, { "epoch": 96.58181818181818, "grad_norm": 3.1544456481933594, "learning_rate": 1.0384981238178533e-08, "loss": 0.7043, "step": 386 }, { "epoch": 96.87272727272727, "grad_norm": 3.5196638107299805, "learning_rate": 1.033542317614051e-08, "loss": 0.6956, "step": 387 }, { "epoch": 97.0, "grad_norm": 2.829664707183838, "learning_rate": 1.0289266494553564e-08, "loss": 0.5839, "step": 388 }, { "epoch": 97.2909090909091, "grad_norm": 3.240220308303833, "learning_rate": 1.0246514708427701e-08, "loss": 0.629, "step": 389 }, { "epoch": 97.58181818181818, "grad_norm": 3.419234275817871, "learning_rate": 1.0207171073476952e-08, "loss": 0.7125, "step": 390 }, { "epoch": 97.87272727272727, "grad_norm": 3.266242742538452, "learning_rate": 1.017123858587145e-08, "loss": 0.7004, "step": 391 }, { "epoch": 98.0, "grad_norm": 2.8885867595672607, "learning_rate": 1.0138719982009241e-08, "loss": 0.4986, "step": 392 }, { "epoch": 98.2909090909091, "grad_norm": 3.4574053287506104, "learning_rate": 1.0109617738307912e-08, "loss": 0.7095, "step": 393 }, { "epoch": 98.58181818181818, "grad_norm": 3.2674267292022705, "learning_rate": 1.0083934071015988e-08, "loss": 0.5806, "step": 394 }, { "epoch": 98.87272727272727, "grad_norm": 2.897749423980713, "learning_rate": 1.0061670936044179e-08, "loss": 0.6434, "step": 395 }, { "epoch": 99.0, "grad_norm": 3.9228150844573975, "learning_rate": 1.0042830028816398e-08, "loss": 0.7094, "step": 396 }, { "epoch": 99.2909090909091, "grad_norm": 2.946876287460327, "learning_rate": 1.002741278414069e-08, "loss": 0.5678, "step": 397 }, { "epoch": 99.58181818181818, "grad_norm": 2.9825222492218018, "learning_rate": 1.0015420376099922e-08, "loss": 0.6347, "step": 398 }, { "epoch": 99.87272727272727, "grad_norm": 3.46803879737854, "learning_rate": 1.0006853717962394e-08, "loss": 0.7428, "step": 399 }, { "epoch": 100.0, "grad_norm": 4.149415969848633, "learning_rate": 1.0001713462112291e-08, "loss": 0.6465, "step": 400 }, { "epoch": 100.0, "eval_loss": 0.6363555192947388, "eval_runtime": 0.7511, "eval_samples_per_second": 17.308, "eval_steps_per_second": 17.308, "step": 400 }, { "epoch": 100.0, "step": 400, "total_flos": 1.34153286008832e+17, "train_loss": 0.7119150696694851, "train_runtime": 2950.0217, "train_samples_per_second": 3.729, "train_steps_per_second": 0.136 } ], "logging_steps": 1, "max_steps": 400, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 16, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.34153286008832e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }