| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 100.0, | |
| "eval_steps": 16, | |
| "global_step": 400, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.2909090909090909, | |
| "grad_norm": 10.451888084411621, | |
| "learning_rate": 0.0, | |
| "loss": 0.7859, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.5818181818181818, | |
| "grad_norm": 9.886292457580566, | |
| "learning_rate": 2.5e-09, | |
| "loss": 0.7965, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.8727272727272727, | |
| "grad_norm": 10.406240463256836, | |
| "learning_rate": 5e-09, | |
| "loss": 0.7893, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 12.746437072753906, | |
| "learning_rate": 7.5e-09, | |
| "loss": 0.8702, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 1.290909090909091, | |
| "grad_norm": 10.291970252990723, | |
| "learning_rate": 1e-08, | |
| "loss": 0.7785, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 1.5818181818181818, | |
| "grad_norm": 9.746875762939453, | |
| "learning_rate": 1.25e-08, | |
| "loss": 0.759, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 1.8727272727272726, | |
| "grad_norm": 10.920265197753906, | |
| "learning_rate": 1.5e-08, | |
| "loss": 0.8308, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 11.265154838562012, | |
| "learning_rate": 1.7499999999999998e-08, | |
| "loss": 0.8828, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 2.290909090909091, | |
| "grad_norm": 9.750505447387695, | |
| "learning_rate": 2e-08, | |
| "loss": 0.8004, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 2.581818181818182, | |
| "grad_norm": 11.47065544128418, | |
| "learning_rate": 2.25e-08, | |
| "loss": 0.8204, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 2.8727272727272726, | |
| "grad_norm": 10.275605201721191, | |
| "learning_rate": 2.5e-08, | |
| "loss": 0.7771, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 11.604477882385254, | |
| "learning_rate": 2.75e-08, | |
| "loss": 0.8295, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 3.290909090909091, | |
| "grad_norm": 10.712018966674805, | |
| "learning_rate": 3e-08, | |
| "loss": 0.8378, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 3.581818181818182, | |
| "grad_norm": 10.54987907409668, | |
| "learning_rate": 3.25e-08, | |
| "loss": 0.8398, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 3.8727272727272726, | |
| "grad_norm": 9.999624252319336, | |
| "learning_rate": 3.4999999999999996e-08, | |
| "loss": 0.773, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 10.562870979309082, | |
| "learning_rate": 3.75e-08, | |
| "loss": 0.7025, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_loss": 0.760595977306366, | |
| "eval_runtime": 0.722, | |
| "eval_samples_per_second": 18.005, | |
| "eval_steps_per_second": 18.005, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 4.290909090909091, | |
| "grad_norm": 9.95614242553711, | |
| "learning_rate": 4e-08, | |
| "loss": 0.7785, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 4.581818181818182, | |
| "grad_norm": 11.356291770935059, | |
| "learning_rate": 4.25e-08, | |
| "loss": 0.8645, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 4.872727272727273, | |
| "grad_norm": 10.108142852783203, | |
| "learning_rate": 4.5e-08, | |
| "loss": 0.7834, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 10.209877014160156, | |
| "learning_rate": 4.7499999999999995e-08, | |
| "loss": 0.7744, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 5.290909090909091, | |
| "grad_norm": 9.586356163024902, | |
| "learning_rate": 5e-08, | |
| "loss": 0.7433, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 5.581818181818182, | |
| "grad_norm": 10.589778900146484, | |
| "learning_rate": 5.25e-08, | |
| "loss": 0.818, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 5.872727272727273, | |
| "grad_norm": 10.28813362121582, | |
| "learning_rate": 5.5e-08, | |
| "loss": 0.8245, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "grad_norm": 13.027183532714844, | |
| "learning_rate": 5.749999999999999e-08, | |
| "loss": 0.8331, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 6.290909090909091, | |
| "grad_norm": 10.363873481750488, | |
| "learning_rate": 6e-08, | |
| "loss": 0.8331, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 6.581818181818182, | |
| "grad_norm": 9.84264850616455, | |
| "learning_rate": 6.25e-08, | |
| "loss": 0.755, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 6.872727272727273, | |
| "grad_norm": 10.973934173583984, | |
| "learning_rate": 6.5e-08, | |
| "loss": 0.8372, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "grad_norm": 10.278410911560059, | |
| "learning_rate": 6.75e-08, | |
| "loss": 0.7442, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 7.290909090909091, | |
| "grad_norm": 10.205405235290527, | |
| "learning_rate": 6.999999999999999e-08, | |
| "loss": 0.7851, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 7.581818181818182, | |
| "grad_norm": 10.862798690795898, | |
| "learning_rate": 7.25e-08, | |
| "loss": 0.7962, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 7.872727272727273, | |
| "grad_norm": 9.971634864807129, | |
| "learning_rate": 7.5e-08, | |
| "loss": 0.79, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "grad_norm": 10.8460111618042, | |
| "learning_rate": 7.75e-08, | |
| "loss": 0.9105, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "eval_loss": 0.7589532136917114, | |
| "eval_runtime": 0.7554, | |
| "eval_samples_per_second": 17.21, | |
| "eval_steps_per_second": 17.21, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 8.290909090909091, | |
| "grad_norm": 10.4276704788208, | |
| "learning_rate": 8e-08, | |
| "loss": 0.7914, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 8.581818181818182, | |
| "grad_norm": 9.807103157043457, | |
| "learning_rate": 8.249999999999999e-08, | |
| "loss": 0.8344, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 8.872727272727273, | |
| "grad_norm": 9.850166320800781, | |
| "learning_rate": 8.5e-08, | |
| "loss": 0.7419, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "grad_norm": 12.547399520874023, | |
| "learning_rate": 8.75e-08, | |
| "loss": 0.8597, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 9.290909090909091, | |
| "grad_norm": 10.39106559753418, | |
| "learning_rate": 9e-08, | |
| "loss": 0.7911, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 9.581818181818182, | |
| "grad_norm": 10.728227615356445, | |
| "learning_rate": 9.25e-08, | |
| "loss": 0.852, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 9.872727272727273, | |
| "grad_norm": 10.104507446289062, | |
| "learning_rate": 9.499999999999999e-08, | |
| "loss": 0.7942, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "grad_norm": 9.163139343261719, | |
| "learning_rate": 9.749999999999999e-08, | |
| "loss": 0.7006, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 10.290909090909091, | |
| "grad_norm": 9.795455932617188, | |
| "learning_rate": 1e-07, | |
| "loss": 0.7496, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 10.581818181818182, | |
| "grad_norm": 9.88698959350586, | |
| "learning_rate": 9.99982865378877e-08, | |
| "loss": 0.7978, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 10.872727272727273, | |
| "grad_norm": 10.60831069946289, | |
| "learning_rate": 9.99931462820376e-08, | |
| "loss": 0.8437, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 11.0, | |
| "grad_norm": 10.177803039550781, | |
| "learning_rate": 9.998457962390006e-08, | |
| "loss": 0.7926, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 11.290909090909091, | |
| "grad_norm": 9.594599723815918, | |
| "learning_rate": 9.997258721585931e-08, | |
| "loss": 0.7521, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 11.581818181818182, | |
| "grad_norm": 9.713711738586426, | |
| "learning_rate": 9.99571699711836e-08, | |
| "loss": 0.7497, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 11.872727272727273, | |
| "grad_norm": 10.672869682312012, | |
| "learning_rate": 9.993832906395581e-08, | |
| "loss": 0.8709, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "grad_norm": 10.758075714111328, | |
| "learning_rate": 9.991606592898401e-08, | |
| "loss": 0.8193, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "eval_loss": 0.7549822926521301, | |
| "eval_runtime": 0.7468, | |
| "eval_samples_per_second": 17.407, | |
| "eval_steps_per_second": 17.407, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 12.290909090909091, | |
| "grad_norm": 10.45877742767334, | |
| "learning_rate": 9.989038226169209e-08, | |
| "loss": 0.8488, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 12.581818181818182, | |
| "grad_norm": 9.969883918762207, | |
| "learning_rate": 9.986128001799076e-08, | |
| "loss": 0.7697, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 12.872727272727273, | |
| "grad_norm": 9.769625663757324, | |
| "learning_rate": 9.982876141412855e-08, | |
| "loss": 0.769, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 13.0, | |
| "grad_norm": 10.126288414001465, | |
| "learning_rate": 9.979282892652304e-08, | |
| "loss": 0.8111, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 13.290909090909091, | |
| "grad_norm": 10.078240394592285, | |
| "learning_rate": 9.975348529157229e-08, | |
| "loss": 0.798, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 13.581818181818182, | |
| "grad_norm": 10.470067024230957, | |
| "learning_rate": 9.971073350544643e-08, | |
| "loss": 0.8011, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 13.872727272727273, | |
| "grad_norm": 9.311027526855469, | |
| "learning_rate": 9.966457682385949e-08, | |
| "loss": 0.7109, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 14.0, | |
| "grad_norm": 10.48747730255127, | |
| "learning_rate": 9.961501876182147e-08, | |
| "loss": 0.969, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 14.290909090909091, | |
| "grad_norm": 9.95235538482666, | |
| "learning_rate": 9.956206309337066e-08, | |
| "loss": 0.7757, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 14.581818181818182, | |
| "grad_norm": 9.8017578125, | |
| "learning_rate": 9.950571385128625e-08, | |
| "loss": 0.7974, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 14.872727272727273, | |
| "grad_norm": 10.287720680236816, | |
| "learning_rate": 9.94459753267812e-08, | |
| "loss": 0.842, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "grad_norm": 9.790772438049316, | |
| "learning_rate": 9.938285206917541e-08, | |
| "loss": 0.7127, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 15.290909090909091, | |
| "grad_norm": 9.698260307312012, | |
| "learning_rate": 9.931634888554937e-08, | |
| "loss": 0.7662, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 15.581818181818182, | |
| "grad_norm": 8.966428756713867, | |
| "learning_rate": 9.924647084037797e-08, | |
| "loss": 0.7652, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 15.872727272727273, | |
| "grad_norm": 9.97366714477539, | |
| "learning_rate": 9.917322325514488e-08, | |
| "loss": 0.854, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "grad_norm": 8.8572359085083, | |
| "learning_rate": 9.909661170793732e-08, | |
| "loss": 0.6939, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "eval_loss": 0.7460314631462097, | |
| "eval_runtime": 0.7685, | |
| "eval_samples_per_second": 16.917, | |
| "eval_steps_per_second": 16.917, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 16.29090909090909, | |
| "grad_norm": 9.27105712890625, | |
| "learning_rate": 9.901664203302125e-08, | |
| "loss": 0.7894, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 16.581818181818182, | |
| "grad_norm": 9.316947937011719, | |
| "learning_rate": 9.8933320320397e-08, | |
| "loss": 0.7476, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 16.87272727272727, | |
| "grad_norm": 9.482162475585938, | |
| "learning_rate": 9.884665291533559e-08, | |
| "loss": 0.8294, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 17.0, | |
| "grad_norm": 8.661737442016602, | |
| "learning_rate": 9.875664641789545e-08, | |
| "loss": 0.741, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 17.29090909090909, | |
| "grad_norm": 8.976079940795898, | |
| "learning_rate": 9.866330768241983e-08, | |
| "loss": 0.7999, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 17.581818181818182, | |
| "grad_norm": 9.500139236450195, | |
| "learning_rate": 9.856664381701484e-08, | |
| "loss": 0.8324, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 17.87272727272727, | |
| "grad_norm": 8.916980743408203, | |
| "learning_rate": 9.846666218300807e-08, | |
| "loss": 0.7308, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 18.0, | |
| "grad_norm": 9.658391952514648, | |
| "learning_rate": 9.836337039438803e-08, | |
| "loss": 0.7691, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 18.29090909090909, | |
| "grad_norm": 9.849284172058105, | |
| "learning_rate": 9.825677631722435e-08, | |
| "loss": 0.7804, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 18.581818181818182, | |
| "grad_norm": 9.135872840881348, | |
| "learning_rate": 9.814688806906868e-08, | |
| "loss": 0.808, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 18.87272727272727, | |
| "grad_norm": 8.595244407653809, | |
| "learning_rate": 9.80337140183366e-08, | |
| "loss": 0.7593, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 19.0, | |
| "grad_norm": 9.557754516601562, | |
| "learning_rate": 9.791726278367021e-08, | |
| "loss": 0.795, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 19.29090909090909, | |
| "grad_norm": 9.38132095336914, | |
| "learning_rate": 9.779754323328191e-08, | |
| "loss": 0.8303, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 19.581818181818182, | |
| "grad_norm": 9.702839851379395, | |
| "learning_rate": 9.767456448427896e-08, | |
| "loss": 0.7404, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 19.87272727272727, | |
| "grad_norm": 8.890022277832031, | |
| "learning_rate": 9.754833590196926e-08, | |
| "loss": 0.8152, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "grad_norm": 8.762577056884766, | |
| "learning_rate": 9.741886709914803e-08, | |
| "loss": 0.6623, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "eval_loss": 0.7418057918548584, | |
| "eval_runtime": 0.7512, | |
| "eval_samples_per_second": 17.305, | |
| "eval_steps_per_second": 17.305, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 20.29090909090909, | |
| "grad_norm": 9.43545913696289, | |
| "learning_rate": 9.728616793536587e-08, | |
| "loss": 0.726, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 20.581818181818182, | |
| "grad_norm": 8.36042308807373, | |
| "learning_rate": 9.715024851617789e-08, | |
| "loss": 0.7908, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 20.87272727272727, | |
| "grad_norm": 9.46149730682373, | |
| "learning_rate": 9.701111919237408e-08, | |
| "loss": 0.8219, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 21.0, | |
| "grad_norm": 9.277331352233887, | |
| "learning_rate": 9.68687905591911e-08, | |
| "loss": 0.7955, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 21.29090909090909, | |
| "grad_norm": 9.980899810791016, | |
| "learning_rate": 9.672327345550542e-08, | |
| "loss": 0.8459, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 21.581818181818182, | |
| "grad_norm": 8.734892845153809, | |
| "learning_rate": 9.65745789630079e-08, | |
| "loss": 0.7952, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 21.87272727272727, | |
| "grad_norm": 7.979213714599609, | |
| "learning_rate": 9.642271840535982e-08, | |
| "loss": 0.6928, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 22.0, | |
| "grad_norm": 9.570889472961426, | |
| "learning_rate": 9.626770334733058e-08, | |
| "loss": 0.7813, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 22.29090909090909, | |
| "grad_norm": 9.478497505187988, | |
| "learning_rate": 9.610954559391703e-08, | |
| "loss": 0.783, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 22.581818181818182, | |
| "grad_norm": 8.57199478149414, | |
| "learning_rate": 9.594825718944444e-08, | |
| "loss": 0.7859, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 22.87272727272727, | |
| "grad_norm": 8.782203674316406, | |
| "learning_rate": 9.578385041664925e-08, | |
| "loss": 0.7784, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 23.0, | |
| "grad_norm": 9.160470008850098, | |
| "learning_rate": 9.561633779574373e-08, | |
| "loss": 0.7613, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 23.29090909090909, | |
| "grad_norm": 8.80034065246582, | |
| "learning_rate": 9.544573208346251e-08, | |
| "loss": 0.7708, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 23.581818181818182, | |
| "grad_norm": 9.001204490661621, | |
| "learning_rate": 9.527204627209113e-08, | |
| "loss": 0.7975, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 23.87272727272727, | |
| "grad_norm": 8.64294147491455, | |
| "learning_rate": 9.509529358847655e-08, | |
| "loss": 0.7533, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 24.0, | |
| "grad_norm": 9.539164543151855, | |
| "learning_rate": 9.491548749301997e-08, | |
| "loss": 0.8112, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 24.0, | |
| "eval_loss": 0.7388671040534973, | |
| "eval_runtime": 0.7379, | |
| "eval_samples_per_second": 17.617, | |
| "eval_steps_per_second": 17.617, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 24.29090909090909, | |
| "grad_norm": 8.564647674560547, | |
| "learning_rate": 9.473264167865172e-08, | |
| "loss": 0.779, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 24.581818181818182, | |
| "grad_norm": 8.466269493103027, | |
| "learning_rate": 9.454677006978843e-08, | |
| "loss": 0.7427, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 24.87272727272727, | |
| "grad_norm": 9.549156188964844, | |
| "learning_rate": 9.435788682127281e-08, | |
| "loss": 0.7749, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 25.0, | |
| "grad_norm": 8.791007041931152, | |
| "learning_rate": 9.416600631729548e-08, | |
| "loss": 0.8413, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 25.29090909090909, | |
| "grad_norm": 8.481273651123047, | |
| "learning_rate": 9.397114317029974e-08, | |
| "loss": 0.7987, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 25.581818181818182, | |
| "grad_norm": 7.957334518432617, | |
| "learning_rate": 9.377331221986867e-08, | |
| "loss": 0.7579, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 25.87272727272727, | |
| "grad_norm": 7.695952415466309, | |
| "learning_rate": 9.357252853159505e-08, | |
| "loss": 0.7138, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 26.0, | |
| "grad_norm": 8.535294532775879, | |
| "learning_rate": 9.336880739593415e-08, | |
| "loss": 0.8143, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 26.29090909090909, | |
| "grad_norm": 7.785234451293945, | |
| "learning_rate": 9.316216432703917e-08, | |
| "loss": 0.7595, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 26.581818181818182, | |
| "grad_norm": 7.210692882537842, | |
| "learning_rate": 9.295261506157986e-08, | |
| "loss": 0.6892, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 26.87272727272727, | |
| "grad_norm": 7.439105033874512, | |
| "learning_rate": 9.274017555754408e-08, | |
| "loss": 0.7828, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 27.0, | |
| "grad_norm": 8.5601167678833, | |
| "learning_rate": 9.252486199302256e-08, | |
| "loss": 0.8267, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 27.29090909090909, | |
| "grad_norm": 7.751751899719238, | |
| "learning_rate": 9.230669076497686e-08, | |
| "loss": 0.7837, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 27.581818181818182, | |
| "grad_norm": 7.58750057220459, | |
| "learning_rate": 9.20856784879907e-08, | |
| "loss": 0.7629, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 27.87272727272727, | |
| "grad_norm": 7.078155040740967, | |
| "learning_rate": 9.186184199300463e-08, | |
| "loss": 0.732, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 28.0, | |
| "grad_norm": 7.075254440307617, | |
| "learning_rate": 9.163519832603437e-08, | |
| "loss": 0.708, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 28.0, | |
| "eval_loss": 0.7153984904289246, | |
| "eval_runtime": 0.7384, | |
| "eval_samples_per_second": 17.605, | |
| "eval_steps_per_second": 17.605, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 28.29090909090909, | |
| "grad_norm": 7.660149097442627, | |
| "learning_rate": 9.140576474687262e-08, | |
| "loss": 0.7923, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 28.581818181818182, | |
| "grad_norm": 6.516578674316406, | |
| "learning_rate": 9.117355872777476e-08, | |
| "loss": 0.6965, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 28.87272727272727, | |
| "grad_norm": 6.818985462188721, | |
| "learning_rate": 9.093859795212817e-08, | |
| "loss": 0.7564, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 29.0, | |
| "grad_norm": 9.244476318359375, | |
| "learning_rate": 9.070090031310558e-08, | |
| "loss": 0.7325, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 29.29090909090909, | |
| "grad_norm": 7.331173419952393, | |
| "learning_rate": 9.046048391230248e-08, | |
| "loss": 0.6957, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 29.581818181818182, | |
| "grad_norm": 7.235352993011475, | |
| "learning_rate": 9.021736705835861e-08, | |
| "loss": 0.8011, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 29.87272727272727, | |
| "grad_norm": 7.370168209075928, | |
| "learning_rate": 8.997156826556369e-08, | |
| "loss": 0.7767, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 30.0, | |
| "grad_norm": 6.547177314758301, | |
| "learning_rate": 8.97231062524474e-08, | |
| "loss": 0.6864, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 30.29090909090909, | |
| "grad_norm": 6.999849796295166, | |
| "learning_rate": 8.9471999940354e-08, | |
| "loss": 0.757, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 30.581818181818182, | |
| "grad_norm": 7.369142055511475, | |
| "learning_rate": 8.921826845200139e-08, | |
| "loss": 0.7184, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 30.87272727272727, | |
| "grad_norm": 7.147704601287842, | |
| "learning_rate": 8.896193111002475e-08, | |
| "loss": 0.8074, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 31.0, | |
| "grad_norm": 6.590007305145264, | |
| "learning_rate": 8.87030074355051e-08, | |
| "loss": 0.666, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 31.29090909090909, | |
| "grad_norm": 6.5022711753845215, | |
| "learning_rate": 8.844151714648274e-08, | |
| "loss": 0.7109, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 31.581818181818182, | |
| "grad_norm": 7.46487283706665, | |
| "learning_rate": 8.817748015645558e-08, | |
| "loss": 0.7848, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 31.87272727272727, | |
| "grad_norm": 7.2371721267700195, | |
| "learning_rate": 8.791091657286267e-08, | |
| "loss": 0.7756, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 32.0, | |
| "grad_norm": 6.450557708740234, | |
| "learning_rate": 8.764184669555293e-08, | |
| "loss": 0.6471, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 32.0, | |
| "eval_loss": 0.7097088694572449, | |
| "eval_runtime": 0.7853, | |
| "eval_samples_per_second": 16.554, | |
| "eval_steps_per_second": 16.554, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 32.29090909090909, | |
| "grad_norm": 7.1595611572265625, | |
| "learning_rate": 8.737029101523929e-08, | |
| "loss": 0.7418, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 32.58181818181818, | |
| "grad_norm": 7.2520294189453125, | |
| "learning_rate": 8.709627021193817e-08, | |
| "loss": 0.7407, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 32.872727272727275, | |
| "grad_norm": 6.757298469543457, | |
| "learning_rate": 8.681980515339464e-08, | |
| "loss": 0.7486, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 33.0, | |
| "grad_norm": 6.70634651184082, | |
| "learning_rate": 8.65409168934933e-08, | |
| "loss": 0.7381, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 33.29090909090909, | |
| "grad_norm": 8.267258644104004, | |
| "learning_rate": 8.625962667065488e-08, | |
| "loss": 0.8277, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 33.58181818181818, | |
| "grad_norm": 6.568601608276367, | |
| "learning_rate": 8.597595590621892e-08, | |
| "loss": 0.7345, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 33.872727272727275, | |
| "grad_norm": 6.368529796600342, | |
| "learning_rate": 8.568992620281244e-08, | |
| "loss": 0.6949, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 34.0, | |
| "grad_norm": 6.077971458435059, | |
| "learning_rate": 8.540155934270471e-08, | |
| "loss": 0.6427, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 34.29090909090909, | |
| "grad_norm": 6.2005743980407715, | |
| "learning_rate": 8.511087728614862e-08, | |
| "loss": 0.7113, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 34.58181818181818, | |
| "grad_norm": 6.390923023223877, | |
| "learning_rate": 8.481790216970819e-08, | |
| "loss": 0.7422, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 34.872727272727275, | |
| "grad_norm": 7.773628234863281, | |
| "learning_rate": 8.452265630457283e-08, | |
| "loss": 0.7829, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 35.0, | |
| "grad_norm": 7.676466941833496, | |
| "learning_rate": 8.422516217485826e-08, | |
| "loss": 0.718, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 35.29090909090909, | |
| "grad_norm": 6.630233287811279, | |
| "learning_rate": 8.392544243589427e-08, | |
| "loss": 0.7046, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 35.58181818181818, | |
| "grad_norm": 6.816230297088623, | |
| "learning_rate": 8.362351991249938e-08, | |
| "loss": 0.7685, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 35.872727272727275, | |
| "grad_norm": 6.341788291931152, | |
| "learning_rate": 8.331941759724268e-08, | |
| "loss": 0.6774, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 36.0, | |
| "grad_norm": 8.306670188903809, | |
| "learning_rate": 8.301315864869288e-08, | |
| "loss": 0.9019, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 36.0, | |
| "eval_loss": 0.7050113677978516, | |
| "eval_runtime": 0.7541, | |
| "eval_samples_per_second": 17.239, | |
| "eval_steps_per_second": 17.239, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 36.29090909090909, | |
| "grad_norm": 6.883708953857422, | |
| "learning_rate": 8.270476638965461e-08, | |
| "loss": 0.7921, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 36.58181818181818, | |
| "grad_norm": 6.389072418212891, | |
| "learning_rate": 8.239426430539243e-08, | |
| "loss": 0.6827, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 36.872727272727275, | |
| "grad_norm": 6.924624443054199, | |
| "learning_rate": 8.208167604184218e-08, | |
| "loss": 0.7774, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 37.0, | |
| "grad_norm": 6.5363450050354, | |
| "learning_rate": 8.176702540381035e-08, | |
| "loss": 0.6709, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 37.29090909090909, | |
| "grad_norm": 6.154909610748291, | |
| "learning_rate": 8.145033635316129e-08, | |
| "loss": 0.7302, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 37.58181818181818, | |
| "grad_norm": 6.3788676261901855, | |
| "learning_rate": 8.113163300699229e-08, | |
| "loss": 0.7301, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 37.872727272727275, | |
| "grad_norm": 7.089733600616455, | |
| "learning_rate": 8.081093963579708e-08, | |
| "loss": 0.7223, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 38.0, | |
| "grad_norm": 7.056278228759766, | |
| "learning_rate": 8.048828066161747e-08, | |
| "loss": 0.7989, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 38.29090909090909, | |
| "grad_norm": 5.888208389282227, | |
| "learning_rate": 8.016368065618359e-08, | |
| "loss": 0.6768, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 38.58181818181818, | |
| "grad_norm": 7.345203399658203, | |
| "learning_rate": 7.983716433904262e-08, | |
| "loss": 0.7454, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 38.872727272727275, | |
| "grad_norm": 6.323718070983887, | |
| "learning_rate": 7.950875657567622e-08, | |
| "loss": 0.7511, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 39.0, | |
| "grad_norm": 7.311026096343994, | |
| "learning_rate": 7.917848237560708e-08, | |
| "loss": 0.8113, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 39.29090909090909, | |
| "grad_norm": 6.139308452606201, | |
| "learning_rate": 7.884636689049422e-08, | |
| "loss": 0.7076, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 39.58181818181818, | |
| "grad_norm": 6.485006809234619, | |
| "learning_rate": 7.851243541221769e-08, | |
| "loss": 0.7437, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 39.872727272727275, | |
| "grad_norm": 6.589916706085205, | |
| "learning_rate": 7.817671337095244e-08, | |
| "loss": 0.7404, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 40.0, | |
| "grad_norm": 6.963124752044678, | |
| "learning_rate": 7.78392263332317e-08, | |
| "loss": 0.7328, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 40.0, | |
| "eval_loss": 0.7006868124008179, | |
| "eval_runtime": 0.7566, | |
| "eval_samples_per_second": 17.183, | |
| "eval_steps_per_second": 17.183, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 40.29090909090909, | |
| "grad_norm": 6.547840118408203, | |
| "learning_rate": 7.75e-08, | |
| "loss": 0.7431, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 40.58181818181818, | |
| "grad_norm": 6.299688816070557, | |
| "learning_rate": 7.715906020465603e-08, | |
| "loss": 0.7585, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 40.872727272727275, | |
| "grad_norm": 6.586760997772217, | |
| "learning_rate": 7.681643291108518e-08, | |
| "loss": 0.7324, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 41.0, | |
| "grad_norm": 6.389430999755859, | |
| "learning_rate": 7.647214421168238e-08, | |
| "loss": 0.6533, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 41.29090909090909, | |
| "grad_norm": 6.027109146118164, | |
| "learning_rate": 7.612622032536508e-08, | |
| "loss": 0.7135, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 41.58181818181818, | |
| "grad_norm": 6.719674110412598, | |
| "learning_rate": 7.577868759557654e-08, | |
| "loss": 0.7597, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 41.872727272727275, | |
| "grad_norm": 6.458725929260254, | |
| "learning_rate": 7.54295724882796e-08, | |
| "loss": 0.7109, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 42.0, | |
| "grad_norm": 6.904190540313721, | |
| "learning_rate": 7.507890158994139e-08, | |
| "loss": 0.7504, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 42.29090909090909, | |
| "grad_norm": 6.580723285675049, | |
| "learning_rate": 7.472670160550848e-08, | |
| "loss": 0.7096, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 42.58181818181818, | |
| "grad_norm": 6.410011291503906, | |
| "learning_rate": 7.437299935637328e-08, | |
| "loss": 0.7692, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 42.872727272727275, | |
| "grad_norm": 6.3067827224731445, | |
| "learning_rate": 7.401782177833146e-08, | |
| "loss": 0.7346, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 43.0, | |
| "grad_norm": 5.755003929138184, | |
| "learning_rate": 7.366119591953075e-08, | |
| "loss": 0.6633, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 43.29090909090909, | |
| "grad_norm": 6.46678352355957, | |
| "learning_rate": 7.3303148938411e-08, | |
| "loss": 0.7365, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 43.58181818181818, | |
| "grad_norm": 6.622053623199463, | |
| "learning_rate": 7.294370810163607e-08, | |
| "loss": 0.7511, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 43.872727272727275, | |
| "grad_norm": 5.266422748565674, | |
| "learning_rate": 7.258290078201731e-08, | |
| "loss": 0.6481, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 44.0, | |
| "grad_norm": 7.341455936431885, | |
| "learning_rate": 7.222075445642904e-08, | |
| "loss": 0.8191, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 44.0, | |
| "eval_loss": 0.6937930583953857, | |
| "eval_runtime": 0.7725, | |
| "eval_samples_per_second": 16.828, | |
| "eval_steps_per_second": 16.828, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 44.29090909090909, | |
| "grad_norm": 6.314858436584473, | |
| "learning_rate": 7.185729670371604e-08, | |
| "loss": 0.7001, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 44.58181818181818, | |
| "grad_norm": 6.364148139953613, | |
| "learning_rate": 7.149255520259337e-08, | |
| "loss": 0.786, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 44.872727272727275, | |
| "grad_norm": 5.679451942443848, | |
| "learning_rate": 7.11265577295385e-08, | |
| "loss": 0.6767, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 45.0, | |
| "grad_norm": 6.4454216957092285, | |
| "learning_rate": 7.075933215667603e-08, | |
| "loss": 0.7351, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 45.29090909090909, | |
| "grad_norm": 5.991427421569824, | |
| "learning_rate": 7.039090644965509e-08, | |
| "loss": 0.7047, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 45.58181818181818, | |
| "grad_norm": 5.386115550994873, | |
| "learning_rate": 7.002130866551968e-08, | |
| "loss": 0.7113, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 45.872727272727275, | |
| "grad_norm": 6.815364360809326, | |
| "learning_rate": 6.965056695057204e-08, | |
| "loss": 0.7255, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 46.0, | |
| "grad_norm": 6.38714599609375, | |
| "learning_rate": 6.927870953822915e-08, | |
| "loss": 0.7503, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 46.29090909090909, | |
| "grad_norm": 5.759856224060059, | |
| "learning_rate": 6.890576474687262e-08, | |
| "loss": 0.7008, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 46.58181818181818, | |
| "grad_norm": 5.1396918296813965, | |
| "learning_rate": 6.853176097769228e-08, | |
| "loss": 0.6925, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 46.872727272727275, | |
| "grad_norm": 5.9070539474487305, | |
| "learning_rate": 6.815672671252315e-08, | |
| "loss": 0.7409, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 47.0, | |
| "grad_norm": 5.90541410446167, | |
| "learning_rate": 6.778069051167653e-08, | |
| "loss": 0.702, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 47.29090909090909, | |
| "grad_norm": 5.474076747894287, | |
| "learning_rate": 6.740368101176495e-08, | |
| "loss": 0.7085, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 47.58181818181818, | |
| "grad_norm": 5.111520767211914, | |
| "learning_rate": 6.702572692352155e-08, | |
| "loss": 0.685, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 47.872727272727275, | |
| "grad_norm": 5.618140697479248, | |
| "learning_rate": 6.664685702961344e-08, | |
| "loss": 0.7551, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 48.0, | |
| "grad_norm": 4.961245059967041, | |
| "learning_rate": 6.626710018244986e-08, | |
| "loss": 0.6327, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 48.0, | |
| "eval_loss": 0.6752312183380127, | |
| "eval_runtime": 0.7832, | |
| "eval_samples_per_second": 16.599, | |
| "eval_steps_per_second": 16.599, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 48.29090909090909, | |
| "grad_norm": 5.36975622177124, | |
| "learning_rate": 6.588648530198504e-08, | |
| "loss": 0.7312, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 48.58181818181818, | |
| "grad_norm": 5.021007061004639, | |
| "learning_rate": 6.550504137351574e-08, | |
| "loss": 0.7467, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 48.872727272727275, | |
| "grad_norm": 4.721583843231201, | |
| "learning_rate": 6.512279744547392e-08, | |
| "loss": 0.6271, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 49.0, | |
| "grad_norm": 5.531439304351807, | |
| "learning_rate": 6.473978262721462e-08, | |
| "loss": 0.7127, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 49.29090909090909, | |
| "grad_norm": 5.3525309562683105, | |
| "learning_rate": 6.435602608679917e-08, | |
| "loss": 0.7255, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 49.58181818181818, | |
| "grad_norm": 4.411137104034424, | |
| "learning_rate": 6.397155704877387e-08, | |
| "loss": 0.6177, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 49.872727272727275, | |
| "grad_norm": 4.907252788543701, | |
| "learning_rate": 6.358640479194451e-08, | |
| "loss": 0.7295, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 50.0, | |
| "grad_norm": 4.626101493835449, | |
| "learning_rate": 6.320059864714664e-08, | |
| "loss": 0.7091, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 50.29090909090909, | |
| "grad_norm": 4.853626728057861, | |
| "learning_rate": 6.281416799501187e-08, | |
| "loss": 0.7432, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 50.58181818181818, | |
| "grad_norm": 4.439899921417236, | |
| "learning_rate": 6.242714226373049e-08, | |
| "loss": 0.676, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 50.872727272727275, | |
| "grad_norm": 4.5280985832214355, | |
| "learning_rate": 6.203955092681039e-08, | |
| "loss": 0.7086, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 51.0, | |
| "grad_norm": 4.414018154144287, | |
| "learning_rate": 6.165142350083249e-08, | |
| "loss": 0.5264, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 51.29090909090909, | |
| "grad_norm": 4.17572021484375, | |
| "learning_rate": 6.126278954320294e-08, | |
| "loss": 0.7346, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 51.58181818181818, | |
| "grad_norm": 4.015255928039551, | |
| "learning_rate": 6.087367864990232e-08, | |
| "loss": 0.6239, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 51.872727272727275, | |
| "grad_norm": 4.698182582855225, | |
| "learning_rate": 6.048412045323163e-08, | |
| "loss": 0.688, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 52.0, | |
| "grad_norm": 5.5075297355651855, | |
| "learning_rate": 6.00941446195558e-08, | |
| "loss": 0.6903, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 52.0, | |
| "eval_loss": 0.6604220271110535, | |
| "eval_runtime": 0.6915, | |
| "eval_samples_per_second": 18.8, | |
| "eval_steps_per_second": 18.8, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 52.29090909090909, | |
| "grad_norm": 3.8842809200286865, | |
| "learning_rate": 5.970378084704441e-08, | |
| "loss": 0.6428, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 52.58181818181818, | |
| "grad_norm": 4.9067301750183105, | |
| "learning_rate": 5.931305886341008e-08, | |
| "loss": 0.7572, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 52.872727272727275, | |
| "grad_norm": 4.025907516479492, | |
| "learning_rate": 5.892200842364462e-08, | |
| "loss": 0.6545, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 53.0, | |
| "grad_norm": 4.105547904968262, | |
| "learning_rate": 5.853065930775303e-08, | |
| "loss": 0.6439, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 53.29090909090909, | |
| "grad_norm": 3.7520296573638916, | |
| "learning_rate": 5.813904131848564e-08, | |
| "loss": 0.677, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 53.58181818181818, | |
| "grad_norm": 3.975045680999756, | |
| "learning_rate": 5.7747184279068564e-08, | |
| "loss": 0.6321, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 53.872727272727275, | |
| "grad_norm": 4.536473274230957, | |
| "learning_rate": 5.735511803093248e-08, | |
| "loss": 0.7326, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 54.0, | |
| "grad_norm": 5.148712158203125, | |
| "learning_rate": 5.696287243144012e-08, | |
| "loss": 0.6819, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 54.29090909090909, | |
| "grad_norm": 3.6721999645233154, | |
| "learning_rate": 5.6570477351612554e-08, | |
| "loss": 0.6655, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 54.58181818181818, | |
| "grad_norm": 4.29323148727417, | |
| "learning_rate": 5.61779626738543e-08, | |
| "loss": 0.6743, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 54.872727272727275, | |
| "grad_norm": 4.018572807312012, | |
| "learning_rate": 5.5785358289677765e-08, | |
| "loss": 0.711, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 55.0, | |
| "grad_norm": 4.6550445556640625, | |
| "learning_rate": 5.539269409742683e-08, | |
| "loss": 0.6398, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 55.29090909090909, | |
| "grad_norm": 4.599621295928955, | |
| "learning_rate": 5.5e-08, | |
| "loss": 0.6885, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 55.58181818181818, | |
| "grad_norm": 3.6876866817474365, | |
| "learning_rate": 5.460730590257318e-08, | |
| "loss": 0.6391, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 55.872727272727275, | |
| "grad_norm": 3.641345262527466, | |
| "learning_rate": 5.421464171032224e-08, | |
| "loss": 0.6684, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 56.0, | |
| "grad_norm": 4.325244903564453, | |
| "learning_rate": 5.382203732614572e-08, | |
| "loss": 0.7467, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 56.0, | |
| "eval_loss": 0.6532977819442749, | |
| "eval_runtime": 0.746, | |
| "eval_samples_per_second": 17.427, | |
| "eval_steps_per_second": 17.427, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 56.29090909090909, | |
| "grad_norm": 4.434227466583252, | |
| "learning_rate": 5.342952264838747e-08, | |
| "loss": 0.7395, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 56.58181818181818, | |
| "grad_norm": 4.03561544418335, | |
| "learning_rate": 5.303712756855988e-08, | |
| "loss": 0.7176, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 56.872727272727275, | |
| "grad_norm": 3.4329726696014404, | |
| "learning_rate": 5.264488196906752e-08, | |
| "loss": 0.5565, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 57.0, | |
| "grad_norm": 3.6157584190368652, | |
| "learning_rate": 5.225281572093143e-08, | |
| "loss": 0.7052, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 57.29090909090909, | |
| "grad_norm": 3.654561996459961, | |
| "learning_rate": 5.1860958681514355e-08, | |
| "loss": 0.6931, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 57.58181818181818, | |
| "grad_norm": 3.4616754055023193, | |
| "learning_rate": 5.1469340692246985e-08, | |
| "loss": 0.6126, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 57.872727272727275, | |
| "grad_norm": 4.538090229034424, | |
| "learning_rate": 5.107799157635537e-08, | |
| "loss": 0.7149, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 58.0, | |
| "grad_norm": 3.8424854278564453, | |
| "learning_rate": 5.068694113658992e-08, | |
| "loss": 0.6564, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 58.29090909090909, | |
| "grad_norm": 3.360053777694702, | |
| "learning_rate": 5.02962191529556e-08, | |
| "loss": 0.6657, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 58.58181818181818, | |
| "grad_norm": 4.166203022003174, | |
| "learning_rate": 4.9905855380444194e-08, | |
| "loss": 0.7461, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 58.872727272727275, | |
| "grad_norm": 3.4333815574645996, | |
| "learning_rate": 4.9515879546768366e-08, | |
| "loss": 0.5924, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 59.0, | |
| "grad_norm": 4.719890594482422, | |
| "learning_rate": 4.912632135009769e-08, | |
| "loss": 0.6793, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 59.29090909090909, | |
| "grad_norm": 3.6366472244262695, | |
| "learning_rate": 4.873721045679706e-08, | |
| "loss": 0.6648, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 59.58181818181818, | |
| "grad_norm": 4.29836893081665, | |
| "learning_rate": 4.8348576499167516e-08, | |
| "loss": 0.6871, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 59.872727272727275, | |
| "grad_norm": 3.3436715602874756, | |
| "learning_rate": 4.7960449073189604e-08, | |
| "loss": 0.6136, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 60.0, | |
| "grad_norm": 3.974397897720337, | |
| "learning_rate": 4.75728577362695e-08, | |
| "loss": 0.7364, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 60.0, | |
| "eval_loss": 0.6488688588142395, | |
| "eval_runtime": 0.7429, | |
| "eval_samples_per_second": 17.5, | |
| "eval_steps_per_second": 17.5, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 60.29090909090909, | |
| "grad_norm": 4.133732318878174, | |
| "learning_rate": 4.718583200498813e-08, | |
| "loss": 0.7386, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 60.58181818181818, | |
| "grad_norm": 3.358363151550293, | |
| "learning_rate": 4.6799401352853365e-08, | |
| "loss": 0.6255, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 60.872727272727275, | |
| "grad_norm": 3.73943829536438, | |
| "learning_rate": 4.641359520805548e-08, | |
| "loss": 0.6834, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 61.0, | |
| "grad_norm": 3.680448532104492, | |
| "learning_rate": 4.6028442951226135e-08, | |
| "loss": 0.5903, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 61.29090909090909, | |
| "grad_norm": 3.3045241832733154, | |
| "learning_rate": 4.564397391320084e-08, | |
| "loss": 0.5871, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 61.58181818181818, | |
| "grad_norm": 3.690742015838623, | |
| "learning_rate": 4.526021737278537e-08, | |
| "loss": 0.6913, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 61.872727272727275, | |
| "grad_norm": 4.233401775360107, | |
| "learning_rate": 4.4877202554526084e-08, | |
| "loss": 0.7115, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 62.0, | |
| "grad_norm": 3.5080771446228027, | |
| "learning_rate": 4.449495862648427e-08, | |
| "loss": 0.687, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 62.29090909090909, | |
| "grad_norm": 3.3871119022369385, | |
| "learning_rate": 4.4113514698014955e-08, | |
| "loss": 0.6901, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 62.58181818181818, | |
| "grad_norm": 3.6088693141937256, | |
| "learning_rate": 4.373289981755013e-08, | |
| "loss": 0.631, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 62.872727272727275, | |
| "grad_norm": 3.743149518966675, | |
| "learning_rate": 4.335314297038656e-08, | |
| "loss": 0.6351, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 63.0, | |
| "grad_norm": 4.030084133148193, | |
| "learning_rate": 4.297427307647844e-08, | |
| "loss": 0.7212, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 63.29090909090909, | |
| "grad_norm": 3.458228349685669, | |
| "learning_rate": 4.2596318988235035e-08, | |
| "loss": 0.629, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 63.58181818181818, | |
| "grad_norm": 4.063506126403809, | |
| "learning_rate": 4.2219309488323486e-08, | |
| "loss": 0.6565, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 63.872727272727275, | |
| "grad_norm": 3.257892370223999, | |
| "learning_rate": 4.184327328747685e-08, | |
| "loss": 0.6644, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 64.0, | |
| "grad_norm": 3.964184284210205, | |
| "learning_rate": 4.1468239022307716e-08, | |
| "loss": 0.7706, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 64.0, | |
| "eval_loss": 0.6460027694702148, | |
| "eval_runtime": 0.7572, | |
| "eval_samples_per_second": 17.168, | |
| "eval_steps_per_second": 17.168, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 64.2909090909091, | |
| "grad_norm": 3.444884777069092, | |
| "learning_rate": 4.1094235253127375e-08, | |
| "loss": 0.5848, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 64.58181818181818, | |
| "grad_norm": 3.34226131439209, | |
| "learning_rate": 4.072129046177086e-08, | |
| "loss": 0.6438, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 64.87272727272727, | |
| "grad_norm": 4.081578254699707, | |
| "learning_rate": 4.034943304942796e-08, | |
| "loss": 0.7825, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 65.0, | |
| "grad_norm": 3.9306929111480713, | |
| "learning_rate": 3.997869133448031e-08, | |
| "loss": 0.7003, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 65.2909090909091, | |
| "grad_norm": 3.3377864360809326, | |
| "learning_rate": 3.960909355034491e-08, | |
| "loss": 0.6723, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 65.58181818181818, | |
| "grad_norm": 4.126795291900635, | |
| "learning_rate": 3.924066784332396e-08, | |
| "loss": 0.6778, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 65.87272727272727, | |
| "grad_norm": 3.283628225326538, | |
| "learning_rate": 3.8873442270461487e-08, | |
| "loss": 0.6196, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 66.0, | |
| "grad_norm": 3.599966526031494, | |
| "learning_rate": 3.850744479740663e-08, | |
| "loss": 0.7125, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 66.2909090909091, | |
| "grad_norm": 3.398857831954956, | |
| "learning_rate": 3.814270329628395e-08, | |
| "loss": 0.6958, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 66.58181818181818, | |
| "grad_norm": 3.538728952407837, | |
| "learning_rate": 3.777924554357096e-08, | |
| "loss": 0.6089, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 66.87272727272727, | |
| "grad_norm": 3.549941062927246, | |
| "learning_rate": 3.7417099217982684e-08, | |
| "loss": 0.6794, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 67.0, | |
| "grad_norm": 3.7104790210723877, | |
| "learning_rate": 3.7056291898363926e-08, | |
| "loss": 0.6845, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 67.2909090909091, | |
| "grad_norm": 3.7661495208740234, | |
| "learning_rate": 3.669685106158899e-08, | |
| "loss": 0.6726, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 67.58181818181818, | |
| "grad_norm": 3.2654290199279785, | |
| "learning_rate": 3.633880408046926e-08, | |
| "loss": 0.6597, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 67.87272727272727, | |
| "grad_norm": 3.3389344215393066, | |
| "learning_rate": 3.598217822166854e-08, | |
| "loss": 0.5967, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 68.0, | |
| "grad_norm": 4.037345886230469, | |
| "learning_rate": 3.5627000643626705e-08, | |
| "loss": 0.7777, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 68.0, | |
| "eval_loss": 0.6440867185592651, | |
| "eval_runtime": 0.7501, | |
| "eval_samples_per_second": 17.331, | |
| "eval_steps_per_second": 17.331, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 68.2909090909091, | |
| "grad_norm": 3.359079360961914, | |
| "learning_rate": 3.527329839449151e-08, | |
| "loss": 0.6824, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 68.58181818181818, | |
| "grad_norm": 3.1845285892486572, | |
| "learning_rate": 3.49210984100586e-08, | |
| "loss": 0.5956, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 68.87272727272727, | |
| "grad_norm": 3.564899206161499, | |
| "learning_rate": 3.4570427511720395e-08, | |
| "loss": 0.679, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 69.0, | |
| "grad_norm": 5.116268157958984, | |
| "learning_rate": 3.4221312404423483e-08, | |
| "loss": 0.7194, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 69.2909090909091, | |
| "grad_norm": 3.2692816257476807, | |
| "learning_rate": 3.387377967463493e-08, | |
| "loss": 0.5994, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 69.58181818181818, | |
| "grad_norm": 3.1827392578125, | |
| "learning_rate": 3.3527855788317614e-08, | |
| "loss": 0.6582, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 69.87272727272727, | |
| "grad_norm": 3.776779890060425, | |
| "learning_rate": 3.3183567088914834e-08, | |
| "loss": 0.7129, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 70.0, | |
| "grad_norm": 4.038694381713867, | |
| "learning_rate": 3.2840939795343986e-08, | |
| "loss": 0.7197, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 70.2909090909091, | |
| "grad_norm": 4.152867317199707, | |
| "learning_rate": 3.250000000000001e-08, | |
| "loss": 0.6844, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 70.58181818181818, | |
| "grad_norm": 3.327399492263794, | |
| "learning_rate": 3.2160773666768325e-08, | |
| "loss": 0.7062, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 70.87272727272727, | |
| "grad_norm": 3.2052102088928223, | |
| "learning_rate": 3.182328662904756e-08, | |
| "loss": 0.597, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 71.0, | |
| "grad_norm": 3.528104305267334, | |
| "learning_rate": 3.14875645877823e-08, | |
| "loss": 0.6592, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 71.2909090909091, | |
| "grad_norm": 3.380600929260254, | |
| "learning_rate": 3.1153633109505784e-08, | |
| "loss": 0.6398, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 71.58181818181818, | |
| "grad_norm": 3.3184642791748047, | |
| "learning_rate": 3.082151762439292e-08, | |
| "loss": 0.6979, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 71.87272727272727, | |
| "grad_norm": 3.4902994632720947, | |
| "learning_rate": 3.049124342432378e-08, | |
| "loss": 0.6352, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 72.0, | |
| "grad_norm": 3.682870388031006, | |
| "learning_rate": 3.0162835660957385e-08, | |
| "loss": 0.6391, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 72.0, | |
| "eval_loss": 0.641921877861023, | |
| "eval_runtime": 0.7653, | |
| "eval_samples_per_second": 16.986, | |
| "eval_steps_per_second": 16.986, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 72.2909090909091, | |
| "grad_norm": 3.4279274940490723, | |
| "learning_rate": 2.983631934381639e-08, | |
| "loss": 0.6219, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 72.58181818181818, | |
| "grad_norm": 3.627363681793213, | |
| "learning_rate": 2.9511719338382535e-08, | |
| "loss": 0.6635, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 72.87272727272727, | |
| "grad_norm": 3.1634864807128906, | |
| "learning_rate": 2.918906036420294e-08, | |
| "loss": 0.6377, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 73.0, | |
| "grad_norm": 3.896449327468872, | |
| "learning_rate": 2.886836699300771e-08, | |
| "loss": 0.7822, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 73.2909090909091, | |
| "grad_norm": 3.168968677520752, | |
| "learning_rate": 2.8549663646838718e-08, | |
| "loss": 0.609, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 73.58181818181818, | |
| "grad_norm": 3.3781349658966064, | |
| "learning_rate": 2.8232974596189653e-08, | |
| "loss": 0.6728, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 73.87272727272727, | |
| "grad_norm": 3.41473650932312, | |
| "learning_rate": 2.791832395815782e-08, | |
| "loss": 0.7013, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 74.0, | |
| "grad_norm": 3.771911859512329, | |
| "learning_rate": 2.760573569460757e-08, | |
| "loss": 0.6343, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 74.2909090909091, | |
| "grad_norm": 3.527878761291504, | |
| "learning_rate": 2.729523361034538e-08, | |
| "loss": 0.6528, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 74.58181818181818, | |
| "grad_norm": 3.105755090713501, | |
| "learning_rate": 2.6986841351307128e-08, | |
| "loss": 0.6243, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 74.87272727272727, | |
| "grad_norm": 3.3217263221740723, | |
| "learning_rate": 2.6680582402757322e-08, | |
| "loss": 0.6658, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 75.0, | |
| "grad_norm": 4.193359375, | |
| "learning_rate": 2.637648008750062e-08, | |
| "loss": 0.7016, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 75.2909090909091, | |
| "grad_norm": 3.2874765396118164, | |
| "learning_rate": 2.6074557564105726e-08, | |
| "loss": 0.6661, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 75.58181818181818, | |
| "grad_norm": 3.4806275367736816, | |
| "learning_rate": 2.5774837825141737e-08, | |
| "loss": 0.6277, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 75.87272727272727, | |
| "grad_norm": 3.398120880126953, | |
| "learning_rate": 2.547734369542718e-08, | |
| "loss": 0.6863, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 76.0, | |
| "grad_norm": 3.1762161254882812, | |
| "learning_rate": 2.5182097830291825e-08, | |
| "loss": 0.648, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 76.0, | |
| "eval_loss": 0.6407743096351624, | |
| "eval_runtime": 0.7838, | |
| "eval_samples_per_second": 16.585, | |
| "eval_steps_per_second": 16.585, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 76.2909090909091, | |
| "grad_norm": 3.250011444091797, | |
| "learning_rate": 2.4889122713851394e-08, | |
| "loss": 0.6552, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 76.58181818181818, | |
| "grad_norm": 3.1045658588409424, | |
| "learning_rate": 2.4598440657295288e-08, | |
| "loss": 0.6147, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 76.87272727272727, | |
| "grad_norm": 4.007096290588379, | |
| "learning_rate": 2.4310073797187574e-08, | |
| "loss": 0.7181, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 77.0, | |
| "grad_norm": 3.300295829772949, | |
| "learning_rate": 2.4024044093781064e-08, | |
| "loss": 0.6115, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 77.2909090909091, | |
| "grad_norm": 3.376610517501831, | |
| "learning_rate": 2.3740373329345117e-08, | |
| "loss": 0.7065, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 77.58181818181818, | |
| "grad_norm": 3.1987497806549072, | |
| "learning_rate": 2.3459083106506712e-08, | |
| "loss": 0.6265, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 77.87272727272727, | |
| "grad_norm": 3.428140878677368, | |
| "learning_rate": 2.3180194846605363e-08, | |
| "loss": 0.629, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 78.0, | |
| "grad_norm": 3.489027261734009, | |
| "learning_rate": 2.2903729788061836e-08, | |
| "loss": 0.6626, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 78.2909090909091, | |
| "grad_norm": 3.7477946281433105, | |
| "learning_rate": 2.2629708984760707e-08, | |
| "loss": 0.7006, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 78.58181818181818, | |
| "grad_norm": 3.2413809299468994, | |
| "learning_rate": 2.2358153304447067e-08, | |
| "loss": 0.6363, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 78.87272727272727, | |
| "grad_norm": 3.0365958213806152, | |
| "learning_rate": 2.2089083427137328e-08, | |
| "loss": 0.6307, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 79.0, | |
| "grad_norm": 3.5392417907714844, | |
| "learning_rate": 2.182251984354442e-08, | |
| "loss": 0.6594, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 79.2909090909091, | |
| "grad_norm": 3.2169861793518066, | |
| "learning_rate": 2.1558482853517254e-08, | |
| "loss": 0.6261, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 79.58181818181818, | |
| "grad_norm": 3.1975908279418945, | |
| "learning_rate": 2.1296992564494903e-08, | |
| "loss": 0.6303, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 79.87272727272727, | |
| "grad_norm": 3.5037009716033936, | |
| "learning_rate": 2.103806888997526e-08, | |
| "loss": 0.6847, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 80.0, | |
| "grad_norm": 3.49397611618042, | |
| "learning_rate": 2.078173154799861e-08, | |
| "loss": 0.704, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 80.0, | |
| "eval_loss": 0.6397803425788879, | |
| "eval_runtime": 0.7407, | |
| "eval_samples_per_second": 17.552, | |
| "eval_steps_per_second": 17.552, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 80.2909090909091, | |
| "grad_norm": 3.312922954559326, | |
| "learning_rate": 2.0528000059645996e-08, | |
| "loss": 0.6742, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 80.58181818181818, | |
| "grad_norm": 3.8569176197052, | |
| "learning_rate": 2.027689374755261e-08, | |
| "loss": 0.6589, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 80.87272727272727, | |
| "grad_norm": 3.100782871246338, | |
| "learning_rate": 2.0028431734436306e-08, | |
| "loss": 0.6384, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 81.0, | |
| "grad_norm": 3.488448143005371, | |
| "learning_rate": 1.9782632941641373e-08, | |
| "loss": 0.6523, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 81.2909090909091, | |
| "grad_norm": 3.713844060897827, | |
| "learning_rate": 1.9539516087697516e-08, | |
| "loss": 0.7186, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 81.58181818181818, | |
| "grad_norm": 3.2771830558776855, | |
| "learning_rate": 1.9299099686894422e-08, | |
| "loss": 0.7155, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 81.87272727272727, | |
| "grad_norm": 3.013705015182495, | |
| "learning_rate": 1.9061402047871834e-08, | |
| "loss": 0.5796, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 82.0, | |
| "grad_norm": 3.9129855632781982, | |
| "learning_rate": 1.8826441272225223e-08, | |
| "loss": 0.5744, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 82.2909090909091, | |
| "grad_norm": 3.432311773300171, | |
| "learning_rate": 1.8594235253127372e-08, | |
| "loss": 0.6681, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 82.58181818181818, | |
| "grad_norm": 3.2865946292877197, | |
| "learning_rate": 1.8364801673965642e-08, | |
| "loss": 0.7103, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 82.87272727272727, | |
| "grad_norm": 3.1664698123931885, | |
| "learning_rate": 1.8138158006995365e-08, | |
| "loss": 0.5704, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 83.0, | |
| "grad_norm": 3.5827629566192627, | |
| "learning_rate": 1.7914321512009295e-08, | |
| "loss": 0.7128, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 83.2909090909091, | |
| "grad_norm": 3.228314161300659, | |
| "learning_rate": 1.7693309235023126e-08, | |
| "loss": 0.6072, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 83.58181818181818, | |
| "grad_norm": 3.2263855934143066, | |
| "learning_rate": 1.7475138006977434e-08, | |
| "loss": 0.6525, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 83.87272727272727, | |
| "grad_norm": 3.2911698818206787, | |
| "learning_rate": 1.7259824442455922e-08, | |
| "loss": 0.7323, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 84.0, | |
| "grad_norm": 3.629072666168213, | |
| "learning_rate": 1.704738493842015e-08, | |
| "loss": 0.6316, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 84.0, | |
| "eval_loss": 0.6387331485748291, | |
| "eval_runtime": 0.761, | |
| "eval_samples_per_second": 17.082, | |
| "eval_steps_per_second": 17.082, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 84.2909090909091, | |
| "grad_norm": 3.0507638454437256, | |
| "learning_rate": 1.6837835672960833e-08, | |
| "loss": 0.6576, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 84.58181818181818, | |
| "grad_norm": 3.1356823444366455, | |
| "learning_rate": 1.663119260406585e-08, | |
| "loss": 0.6216, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 84.87272727272727, | |
| "grad_norm": 3.6667861938476562, | |
| "learning_rate": 1.642747146840495e-08, | |
| "loss": 0.6975, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 85.0, | |
| "grad_norm": 3.397160768508911, | |
| "learning_rate": 1.6226687780131337e-08, | |
| "loss": 0.6022, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 85.2909090909091, | |
| "grad_norm": 3.163560152053833, | |
| "learning_rate": 1.602885682970026e-08, | |
| "loss": 0.6316, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 85.58181818181818, | |
| "grad_norm": 3.1787586212158203, | |
| "learning_rate": 1.5833993682704515e-08, | |
| "loss": 0.6725, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 85.87272727272727, | |
| "grad_norm": 3.379927158355713, | |
| "learning_rate": 1.5642113178727193e-08, | |
| "loss": 0.6319, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 86.0, | |
| "grad_norm": 3.783219337463379, | |
| "learning_rate": 1.5453229930211566e-08, | |
| "loss": 0.72, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 86.2909090909091, | |
| "grad_norm": 3.678173065185547, | |
| "learning_rate": 1.5267358321348288e-08, | |
| "loss": 0.7343, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 86.58181818181818, | |
| "grad_norm": 3.0277297496795654, | |
| "learning_rate": 1.5084512506980025e-08, | |
| "loss": 0.6112, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 86.87272727272727, | |
| "grad_norm": 2.9994826316833496, | |
| "learning_rate": 1.490470641152345e-08, | |
| "loss": 0.6377, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 87.0, | |
| "grad_norm": 3.4868507385253906, | |
| "learning_rate": 1.4727953727908877e-08, | |
| "loss": 0.603, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 87.2909090909091, | |
| "grad_norm": 3.3681607246398926, | |
| "learning_rate": 1.4554267916537493e-08, | |
| "loss": 0.6832, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 87.58181818181818, | |
| "grad_norm": 3.123229742050171, | |
| "learning_rate": 1.438366220425628e-08, | |
| "loss": 0.6086, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 87.87272727272727, | |
| "grad_norm": 3.2728774547576904, | |
| "learning_rate": 1.4216149583350753e-08, | |
| "loss": 0.6735, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 88.0, | |
| "grad_norm": 3.185558319091797, | |
| "learning_rate": 1.405174281055556e-08, | |
| "loss": 0.6232, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 88.0, | |
| "eval_loss": 0.6380465030670166, | |
| "eval_runtime": 0.6934, | |
| "eval_samples_per_second": 18.748, | |
| "eval_steps_per_second": 18.748, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 88.2909090909091, | |
| "grad_norm": 3.0658743381500244, | |
| "learning_rate": 1.3890454406082957e-08, | |
| "loss": 0.6388, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 88.58181818181818, | |
| "grad_norm": 3.2591443061828613, | |
| "learning_rate": 1.3732296652669417e-08, | |
| "loss": 0.6658, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 88.87272727272727, | |
| "grad_norm": 3.252021551132202, | |
| "learning_rate": 1.3577281594640182e-08, | |
| "loss": 0.6671, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 89.0, | |
| "grad_norm": 3.7164716720581055, | |
| "learning_rate": 1.3425421036992096e-08, | |
| "loss": 0.6185, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 89.2909090909091, | |
| "grad_norm": 3.1124889850616455, | |
| "learning_rate": 1.327672654449457e-08, | |
| "loss": 0.6648, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 89.58181818181818, | |
| "grad_norm": 3.4343039989471436, | |
| "learning_rate": 1.3131209440808898e-08, | |
| "loss": 0.6731, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 89.87272727272727, | |
| "grad_norm": 2.8693790435791016, | |
| "learning_rate": 1.2988880807625927e-08, | |
| "loss": 0.5938, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 90.0, | |
| "grad_norm": 4.409383773803711, | |
| "learning_rate": 1.284975148382211e-08, | |
| "loss": 0.7107, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 90.2909090909091, | |
| "grad_norm": 3.080493450164795, | |
| "learning_rate": 1.2713832064634124e-08, | |
| "loss": 0.6798, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 90.58181818181818, | |
| "grad_norm": 2.9749112129211426, | |
| "learning_rate": 1.2581132900851971e-08, | |
| "loss": 0.6249, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 90.87272727272727, | |
| "grad_norm": 3.626858949661255, | |
| "learning_rate": 1.2451664098030743e-08, | |
| "loss": 0.6616, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 91.0, | |
| "grad_norm": 3.859955310821533, | |
| "learning_rate": 1.232543551572103e-08, | |
| "loss": 0.6418, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 91.2909090909091, | |
| "grad_norm": 3.4120571613311768, | |
| "learning_rate": 1.2202456766718091e-08, | |
| "loss": 0.707, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 91.58181818181818, | |
| "grad_norm": 2.8337764739990234, | |
| "learning_rate": 1.2082737216329793e-08, | |
| "loss": 0.6083, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 91.87272727272727, | |
| "grad_norm": 3.1251420974731445, | |
| "learning_rate": 1.1966285981663406e-08, | |
| "loss": 0.6327, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 92.0, | |
| "grad_norm": 4.236498832702637, | |
| "learning_rate": 1.1853111930931313e-08, | |
| "loss": 0.6545, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 92.0, | |
| "eval_loss": 0.6371535062789917, | |
| "eval_runtime": 0.7492, | |
| "eval_samples_per_second": 17.353, | |
| "eval_steps_per_second": 17.353, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 92.2909090909091, | |
| "grad_norm": 3.1066205501556396, | |
| "learning_rate": 1.174322368277565e-08, | |
| "loss": 0.5913, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 92.58181818181818, | |
| "grad_norm": 3.334169626235962, | |
| "learning_rate": 1.1636629605611967e-08, | |
| "loss": 0.6869, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 92.87272727272727, | |
| "grad_norm": 3.105184316635132, | |
| "learning_rate": 1.1533337816991931e-08, | |
| "loss": 0.6699, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 93.0, | |
| "grad_norm": 3.6188406944274902, | |
| "learning_rate": 1.1433356182985158e-08, | |
| "loss": 0.658, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 93.2909090909091, | |
| "grad_norm": 3.376845598220825, | |
| "learning_rate": 1.133669231758016e-08, | |
| "loss": 0.7064, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 93.58181818181818, | |
| "grad_norm": 3.327584981918335, | |
| "learning_rate": 1.1243353582104555e-08, | |
| "loss": 0.6268, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 93.87272727272727, | |
| "grad_norm": 2.9586260318756104, | |
| "learning_rate": 1.115334708466442e-08, | |
| "loss": 0.6366, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 94.0, | |
| "grad_norm": 3.442078113555908, | |
| "learning_rate": 1.1066679679602998e-08, | |
| "loss": 0.5847, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 94.2909090909091, | |
| "grad_norm": 2.9368817806243896, | |
| "learning_rate": 1.0983357966978745e-08, | |
| "loss": 0.6578, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 94.58181818181818, | |
| "grad_norm": 3.0193896293640137, | |
| "learning_rate": 1.0903388292062667e-08, | |
| "loss": 0.6523, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 94.87272727272727, | |
| "grad_norm": 3.727072238922119, | |
| "learning_rate": 1.0826776744855121e-08, | |
| "loss": 0.6582, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 95.0, | |
| "grad_norm": 3.880216598510742, | |
| "learning_rate": 1.0753529159622047e-08, | |
| "loss": 0.6047, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 95.2909090909091, | |
| "grad_norm": 3.0474436283111572, | |
| "learning_rate": 1.068365111445064e-08, | |
| "loss": 0.6078, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 95.58181818181818, | |
| "grad_norm": 3.09653639793396, | |
| "learning_rate": 1.0617147930824585e-08, | |
| "loss": 0.6347, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 95.87272727272727, | |
| "grad_norm": 3.2464277744293213, | |
| "learning_rate": 1.0554024673218806e-08, | |
| "loss": 0.6709, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 96.0, | |
| "grad_norm": 3.866807699203491, | |
| "learning_rate": 1.0494286148713743e-08, | |
| "loss": 0.7126, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 96.0, | |
| "eval_loss": 0.6363654732704163, | |
| "eval_runtime": 0.7605, | |
| "eval_samples_per_second": 17.094, | |
| "eval_steps_per_second": 17.094, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 96.2909090909091, | |
| "grad_norm": 3.1493375301361084, | |
| "learning_rate": 1.0437936906629333e-08, | |
| "loss": 0.5571, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 96.58181818181818, | |
| "grad_norm": 3.1544456481933594, | |
| "learning_rate": 1.0384981238178533e-08, | |
| "loss": 0.7043, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 96.87272727272727, | |
| "grad_norm": 3.5196638107299805, | |
| "learning_rate": 1.033542317614051e-08, | |
| "loss": 0.6956, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 97.0, | |
| "grad_norm": 2.829664707183838, | |
| "learning_rate": 1.0289266494553564e-08, | |
| "loss": 0.5839, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 97.2909090909091, | |
| "grad_norm": 3.240220308303833, | |
| "learning_rate": 1.0246514708427701e-08, | |
| "loss": 0.629, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 97.58181818181818, | |
| "grad_norm": 3.419234275817871, | |
| "learning_rate": 1.0207171073476952e-08, | |
| "loss": 0.7125, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 97.87272727272727, | |
| "grad_norm": 3.266242742538452, | |
| "learning_rate": 1.017123858587145e-08, | |
| "loss": 0.7004, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 98.0, | |
| "grad_norm": 2.8885867595672607, | |
| "learning_rate": 1.0138719982009241e-08, | |
| "loss": 0.4986, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 98.2909090909091, | |
| "grad_norm": 3.4574053287506104, | |
| "learning_rate": 1.0109617738307912e-08, | |
| "loss": 0.7095, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 98.58181818181818, | |
| "grad_norm": 3.2674267292022705, | |
| "learning_rate": 1.0083934071015988e-08, | |
| "loss": 0.5806, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 98.87272727272727, | |
| "grad_norm": 2.897749423980713, | |
| "learning_rate": 1.0061670936044179e-08, | |
| "loss": 0.6434, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 99.0, | |
| "grad_norm": 3.9228150844573975, | |
| "learning_rate": 1.0042830028816398e-08, | |
| "loss": 0.7094, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 99.2909090909091, | |
| "grad_norm": 2.946876287460327, | |
| "learning_rate": 1.002741278414069e-08, | |
| "loss": 0.5678, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 99.58181818181818, | |
| "grad_norm": 2.9825222492218018, | |
| "learning_rate": 1.0015420376099922e-08, | |
| "loss": 0.6347, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 99.87272727272727, | |
| "grad_norm": 3.46803879737854, | |
| "learning_rate": 1.0006853717962394e-08, | |
| "loss": 0.7428, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 100.0, | |
| "grad_norm": 4.149415969848633, | |
| "learning_rate": 1.0001713462112291e-08, | |
| "loss": 0.6465, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 100.0, | |
| "eval_loss": 0.6363555192947388, | |
| "eval_runtime": 0.7511, | |
| "eval_samples_per_second": 17.308, | |
| "eval_steps_per_second": 17.308, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 100.0, | |
| "step": 400, | |
| "total_flos": 1.34153286008832e+17, | |
| "train_loss": 0.7119150696694851, | |
| "train_runtime": 2950.0217, | |
| "train_samples_per_second": 3.729, | |
| "train_steps_per_second": 0.136 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 400, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 100, | |
| "save_steps": 16, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.34153286008832e+17, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |