| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.95195530726257, | |
| "eval_steps": 112, | |
| "global_step": 669, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.004464285714285714, | |
| "grad_norm": 5.096107617240597, | |
| "learning_rate": 5.000000000000001e-07, | |
| "loss": 0.81, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.004464285714285714, | |
| "eval_loss": 0.7919066548347473, | |
| "eval_runtime": 32.2179, | |
| "eval_samples_per_second": 82.873, | |
| "eval_steps_per_second": 5.183, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.008928571428571428, | |
| "grad_norm": 4.9041704402978805, | |
| "learning_rate": 1.0000000000000002e-06, | |
| "loss": 0.7839, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.013392857142857142, | |
| "grad_norm": 4.977917588021941, | |
| "learning_rate": 1.5e-06, | |
| "loss": 0.7745, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.017857142857142856, | |
| "grad_norm": 4.7373014121550705, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "loss": 0.7681, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.022321428571428572, | |
| "grad_norm": 4.563297129857777, | |
| "learning_rate": 2.5e-06, | |
| "loss": 0.7589, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.026785714285714284, | |
| "grad_norm": 3.5264108754930787, | |
| "learning_rate": 3e-06, | |
| "loss": 0.7629, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.03125, | |
| "grad_norm": 3.4126703272457166, | |
| "learning_rate": 3.5e-06, | |
| "loss": 0.7181, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.03571428571428571, | |
| "grad_norm": 3.6971429493758636, | |
| "learning_rate": 4.000000000000001e-06, | |
| "loss": 0.667, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.04017857142857143, | |
| "grad_norm": 1.940701767077778, | |
| "learning_rate": 4.5e-06, | |
| "loss": 0.6629, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.044642857142857144, | |
| "grad_norm": 12.40862535155878, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6832, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.049107142857142856, | |
| "grad_norm": 2.8235547786373867, | |
| "learning_rate": 4.992447129909366e-06, | |
| "loss": 0.6213, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.05357142857142857, | |
| "grad_norm": 1.734602557718062, | |
| "learning_rate": 4.984894259818732e-06, | |
| "loss": 0.6279, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.05803571428571429, | |
| "grad_norm": 1.3840828717613172, | |
| "learning_rate": 4.977341389728097e-06, | |
| "loss": 0.6134, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.0625, | |
| "grad_norm": 1.1262172615918475, | |
| "learning_rate": 4.969788519637463e-06, | |
| "loss": 0.6029, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.06696428571428571, | |
| "grad_norm": 0.9263739512436016, | |
| "learning_rate": 4.962235649546828e-06, | |
| "loss": 0.6067, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.07142857142857142, | |
| "grad_norm": 0.8374584146721384, | |
| "learning_rate": 4.954682779456194e-06, | |
| "loss": 0.6014, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.07589285714285714, | |
| "grad_norm": 0.7757487058764371, | |
| "learning_rate": 4.9471299093655595e-06, | |
| "loss": 0.5906, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.08035714285714286, | |
| "grad_norm": 0.6531787929864147, | |
| "learning_rate": 4.939577039274925e-06, | |
| "loss": 0.5589, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.08482142857142858, | |
| "grad_norm": 0.7223853317191319, | |
| "learning_rate": 4.93202416918429e-06, | |
| "loss": 0.5877, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.08928571428571429, | |
| "grad_norm": 0.6793848952870568, | |
| "learning_rate": 4.924471299093656e-06, | |
| "loss": 0.6075, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.09375, | |
| "grad_norm": 0.65689704768199, | |
| "learning_rate": 4.9169184290030215e-06, | |
| "loss": 0.5659, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.09821428571428571, | |
| "grad_norm": 0.6553424447930819, | |
| "learning_rate": 4.909365558912387e-06, | |
| "loss": 0.5728, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.10267857142857142, | |
| "grad_norm": 0.6095768945247507, | |
| "learning_rate": 4.901812688821753e-06, | |
| "loss": 0.5716, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.10714285714285714, | |
| "grad_norm": 0.5874011280839848, | |
| "learning_rate": 4.894259818731118e-06, | |
| "loss": 0.5736, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.11160714285714286, | |
| "grad_norm": 0.642646780124128, | |
| "learning_rate": 4.8867069486404835e-06, | |
| "loss": 0.5889, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.11607142857142858, | |
| "grad_norm": 0.5585168252549827, | |
| "learning_rate": 4.879154078549849e-06, | |
| "loss": 0.5761, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.12053571428571429, | |
| "grad_norm": 0.4948199084353006, | |
| "learning_rate": 4.871601208459215e-06, | |
| "loss": 0.545, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.125, | |
| "grad_norm": 0.580699538693282, | |
| "learning_rate": 4.864048338368581e-06, | |
| "loss": 0.5874, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.12946428571428573, | |
| "grad_norm": 0.5483036703790811, | |
| "learning_rate": 4.8564954682779455e-06, | |
| "loss": 0.553, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.13392857142857142, | |
| "grad_norm": 0.5207516065535861, | |
| "learning_rate": 4.848942598187312e-06, | |
| "loss": 0.54, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.13839285714285715, | |
| "grad_norm": 0.5453894595978196, | |
| "learning_rate": 4.841389728096677e-06, | |
| "loss": 0.5779, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.14285714285714285, | |
| "grad_norm": 0.5223897707340578, | |
| "learning_rate": 4.833836858006043e-06, | |
| "loss": 0.5518, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.14732142857142858, | |
| "grad_norm": 0.5473812522488812, | |
| "learning_rate": 4.826283987915408e-06, | |
| "loss": 0.5831, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.15178571428571427, | |
| "grad_norm": 0.5437546013329395, | |
| "learning_rate": 4.818731117824774e-06, | |
| "loss": 0.5884, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.15625, | |
| "grad_norm": 0.5305579139277316, | |
| "learning_rate": 4.81117824773414e-06, | |
| "loss": 0.5398, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.16071428571428573, | |
| "grad_norm": 0.5258419287632591, | |
| "learning_rate": 4.803625377643505e-06, | |
| "loss": 0.5593, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.16517857142857142, | |
| "grad_norm": 0.5158851354579528, | |
| "learning_rate": 4.79607250755287e-06, | |
| "loss": 0.5564, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.16964285714285715, | |
| "grad_norm": 0.4824844062486759, | |
| "learning_rate": 4.788519637462236e-06, | |
| "loss": 0.5616, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.17410714285714285, | |
| "grad_norm": 0.5173226727977146, | |
| "learning_rate": 4.780966767371602e-06, | |
| "loss": 0.5674, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.17857142857142858, | |
| "grad_norm": 0.4966629400190002, | |
| "learning_rate": 4.773413897280967e-06, | |
| "loss": 0.5833, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.18303571428571427, | |
| "grad_norm": 0.48186435126437244, | |
| "learning_rate": 4.765861027190333e-06, | |
| "loss": 0.5571, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.1875, | |
| "grad_norm": 0.5029558558326376, | |
| "learning_rate": 4.758308157099698e-06, | |
| "loss": 0.5358, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.19196428571428573, | |
| "grad_norm": 0.5249190794688006, | |
| "learning_rate": 4.750755287009064e-06, | |
| "loss": 0.5542, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.19642857142857142, | |
| "grad_norm": 0.4622817114582695, | |
| "learning_rate": 4.743202416918429e-06, | |
| "loss": 0.54, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.20089285714285715, | |
| "grad_norm": 0.49789824735563454, | |
| "learning_rate": 4.735649546827795e-06, | |
| "loss": 0.5308, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.20535714285714285, | |
| "grad_norm": 0.51199116739647, | |
| "learning_rate": 4.728096676737161e-06, | |
| "loss": 0.5784, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.20982142857142858, | |
| "grad_norm": 0.47152096107780506, | |
| "learning_rate": 4.720543806646526e-06, | |
| "loss": 0.5654, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.21428571428571427, | |
| "grad_norm": 0.5233581267403502, | |
| "learning_rate": 4.712990936555891e-06, | |
| "loss": 0.5727, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.21875, | |
| "grad_norm": 0.517908769266997, | |
| "learning_rate": 4.705438066465257e-06, | |
| "loss": 0.5258, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.22321428571428573, | |
| "grad_norm": 0.5103782222366013, | |
| "learning_rate": 4.697885196374623e-06, | |
| "loss": 0.564, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.22767857142857142, | |
| "grad_norm": 0.5894196264675328, | |
| "learning_rate": 4.6903323262839885e-06, | |
| "loss": 0.5355, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.23214285714285715, | |
| "grad_norm": 0.48136408513935, | |
| "learning_rate": 4.682779456193353e-06, | |
| "loss": 0.5714, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.23660714285714285, | |
| "grad_norm": 0.5996790928794941, | |
| "learning_rate": 4.67522658610272e-06, | |
| "loss": 0.5385, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.24107142857142858, | |
| "grad_norm": 0.5231061994660855, | |
| "learning_rate": 4.667673716012085e-06, | |
| "loss": 0.5429, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.24553571428571427, | |
| "grad_norm": 0.5416056944426403, | |
| "learning_rate": 4.6601208459214505e-06, | |
| "loss": 0.5423, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 0.5734105721889117, | |
| "learning_rate": 4.652567975830816e-06, | |
| "loss": 0.5756, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.2544642857142857, | |
| "grad_norm": 0.5318541491738474, | |
| "learning_rate": 4.645015105740182e-06, | |
| "loss": 0.5486, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.25892857142857145, | |
| "grad_norm": 0.4839987879630258, | |
| "learning_rate": 4.637462235649548e-06, | |
| "loss": 0.5517, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.26339285714285715, | |
| "grad_norm": 0.5843387907103592, | |
| "learning_rate": 4.6299093655589125e-06, | |
| "loss": 0.5288, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.26785714285714285, | |
| "grad_norm": 0.5496996598474941, | |
| "learning_rate": 4.622356495468278e-06, | |
| "loss": 0.542, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.27232142857142855, | |
| "grad_norm": 0.5192508705264864, | |
| "learning_rate": 4.614803625377644e-06, | |
| "loss": 0.5291, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.2767857142857143, | |
| "grad_norm": 0.5918952697319948, | |
| "learning_rate": 4.60725075528701e-06, | |
| "loss": 0.5229, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.28125, | |
| "grad_norm": 0.5914173604947244, | |
| "learning_rate": 4.5996978851963745e-06, | |
| "loss": 0.5754, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.2857142857142857, | |
| "grad_norm": 0.5298896547584293, | |
| "learning_rate": 4.592145015105741e-06, | |
| "loss": 0.5687, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.29017857142857145, | |
| "grad_norm": 0.5671535875314645, | |
| "learning_rate": 4.584592145015106e-06, | |
| "loss": 0.5513, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.29464285714285715, | |
| "grad_norm": 0.5234306194196137, | |
| "learning_rate": 4.577039274924472e-06, | |
| "loss": 0.5226, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.29910714285714285, | |
| "grad_norm": 0.4959853502285333, | |
| "learning_rate": 4.569486404833837e-06, | |
| "loss": 0.5335, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.30357142857142855, | |
| "grad_norm": 0.47729026424742405, | |
| "learning_rate": 4.561933534743202e-06, | |
| "loss": 0.5248, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.3080357142857143, | |
| "grad_norm": 0.47294297292566195, | |
| "learning_rate": 4.554380664652569e-06, | |
| "loss": 0.5588, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.3125, | |
| "grad_norm": 0.5275786368006364, | |
| "learning_rate": 4.5468277945619336e-06, | |
| "loss": 0.5483, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.3169642857142857, | |
| "grad_norm": 0.5057705930103896, | |
| "learning_rate": 4.539274924471299e-06, | |
| "loss": 0.5236, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.32142857142857145, | |
| "grad_norm": 0.49625696780589473, | |
| "learning_rate": 4.531722054380665e-06, | |
| "loss": 0.5221, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.32589285714285715, | |
| "grad_norm": 0.49548627058675154, | |
| "learning_rate": 4.524169184290031e-06, | |
| "loss": 0.5176, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.33035714285714285, | |
| "grad_norm": 0.5011897814029462, | |
| "learning_rate": 4.516616314199396e-06, | |
| "loss": 0.5431, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.33482142857142855, | |
| "grad_norm": 0.5168007116628185, | |
| "learning_rate": 4.509063444108761e-06, | |
| "loss": 0.5289, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.3392857142857143, | |
| "grad_norm": 0.5199488476439844, | |
| "learning_rate": 4.501510574018128e-06, | |
| "loss": 0.5437, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.34375, | |
| "grad_norm": 0.5776348374651958, | |
| "learning_rate": 4.493957703927493e-06, | |
| "loss": 0.5424, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.3482142857142857, | |
| "grad_norm": 0.5785274375251012, | |
| "learning_rate": 4.486404833836858e-06, | |
| "loss": 0.5252, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.35267857142857145, | |
| "grad_norm": 0.5624252036929408, | |
| "learning_rate": 4.478851963746224e-06, | |
| "loss": 0.5502, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.35714285714285715, | |
| "grad_norm": 0.5630273292563954, | |
| "learning_rate": 4.47129909365559e-06, | |
| "loss": 0.5249, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.36160714285714285, | |
| "grad_norm": 0.5229643469397499, | |
| "learning_rate": 4.463746223564955e-06, | |
| "loss": 0.572, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.36607142857142855, | |
| "grad_norm": 0.5401109169653766, | |
| "learning_rate": 4.45619335347432e-06, | |
| "loss": 0.5174, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.3705357142857143, | |
| "grad_norm": 0.572853251261138, | |
| "learning_rate": 4.448640483383686e-06, | |
| "loss": 0.5336, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.375, | |
| "grad_norm": 0.49833694539628726, | |
| "learning_rate": 4.441087613293052e-06, | |
| "loss": 0.5205, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.3794642857142857, | |
| "grad_norm": 0.5167418841184229, | |
| "learning_rate": 4.4335347432024175e-06, | |
| "loss": 0.5405, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.38392857142857145, | |
| "grad_norm": 0.5243612918158316, | |
| "learning_rate": 4.425981873111782e-06, | |
| "loss": 0.5204, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.38839285714285715, | |
| "grad_norm": 0.5159624173670619, | |
| "learning_rate": 4.418429003021149e-06, | |
| "loss": 0.5315, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.39285714285714285, | |
| "grad_norm": 0.5227918139330733, | |
| "learning_rate": 4.410876132930514e-06, | |
| "loss": 0.5418, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.39732142857142855, | |
| "grad_norm": 0.48045718180033986, | |
| "learning_rate": 4.4033232628398795e-06, | |
| "loss": 0.506, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.4017857142857143, | |
| "grad_norm": 0.5791314708394477, | |
| "learning_rate": 4.395770392749245e-06, | |
| "loss": 0.596, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.40625, | |
| "grad_norm": 0.6076262289549809, | |
| "learning_rate": 4.38821752265861e-06, | |
| "loss": 0.5312, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.4107142857142857, | |
| "grad_norm": 0.4780377894473343, | |
| "learning_rate": 4.380664652567977e-06, | |
| "loss": 0.549, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.41517857142857145, | |
| "grad_norm": 0.5549693296410254, | |
| "learning_rate": 4.3731117824773415e-06, | |
| "loss": 0.5159, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.41964285714285715, | |
| "grad_norm": 0.5097636365128929, | |
| "learning_rate": 4.365558912386707e-06, | |
| "loss": 0.5612, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.42410714285714285, | |
| "grad_norm": 0.5021825140627123, | |
| "learning_rate": 4.358006042296073e-06, | |
| "loss": 0.5653, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.42857142857142855, | |
| "grad_norm": 0.537945728158161, | |
| "learning_rate": 4.350453172205439e-06, | |
| "loss": 0.528, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.4330357142857143, | |
| "grad_norm": 0.4846132130879552, | |
| "learning_rate": 4.342900302114804e-06, | |
| "loss": 0.5395, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.4375, | |
| "grad_norm": 0.5206116049646974, | |
| "learning_rate": 4.335347432024169e-06, | |
| "loss": 0.5445, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.4419642857142857, | |
| "grad_norm": 0.551262765339688, | |
| "learning_rate": 4.327794561933535e-06, | |
| "loss": 0.5346, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.44642857142857145, | |
| "grad_norm": 0.4984265695742978, | |
| "learning_rate": 4.3202416918429006e-06, | |
| "loss": 0.5161, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.45089285714285715, | |
| "grad_norm": 0.49943465819192145, | |
| "learning_rate": 4.312688821752266e-06, | |
| "loss": 0.5164, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.45535714285714285, | |
| "grad_norm": 0.48826983665318036, | |
| "learning_rate": 4.305135951661632e-06, | |
| "loss": 0.552, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.45982142857142855, | |
| "grad_norm": 0.5320671379410685, | |
| "learning_rate": 4.297583081570998e-06, | |
| "loss": 0.5512, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.4642857142857143, | |
| "grad_norm": 0.5027439491810191, | |
| "learning_rate": 4.2900302114803626e-06, | |
| "loss": 0.5728, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.46875, | |
| "grad_norm": 0.4875858103308158, | |
| "learning_rate": 4.282477341389728e-06, | |
| "loss": 0.5306, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.4732142857142857, | |
| "grad_norm": 0.5205285237402679, | |
| "learning_rate": 4.274924471299094e-06, | |
| "loss": 0.5274, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.47767857142857145, | |
| "grad_norm": 0.47059444782816534, | |
| "learning_rate": 4.26737160120846e-06, | |
| "loss": 0.5092, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.48214285714285715, | |
| "grad_norm": 0.4998056822147382, | |
| "learning_rate": 4.259818731117825e-06, | |
| "loss": 0.5481, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.48660714285714285, | |
| "grad_norm": 0.5403989848602658, | |
| "learning_rate": 4.25226586102719e-06, | |
| "loss": 0.518, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.49107142857142855, | |
| "grad_norm": 0.48519680312136604, | |
| "learning_rate": 4.244712990936557e-06, | |
| "loss": 0.5522, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.4955357142857143, | |
| "grad_norm": 0.5655590239975465, | |
| "learning_rate": 4.237160120845922e-06, | |
| "loss": 0.526, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.49558151987556875, | |
| "learning_rate": 4.229607250755287e-06, | |
| "loss": 0.5275, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "eval_loss": 0.5346882343292236, | |
| "eval_runtime": 32.2399, | |
| "eval_samples_per_second": 82.817, | |
| "eval_steps_per_second": 5.18, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.5050279329608939, | |
| "grad_norm": 0.5662939933016762, | |
| "learning_rate": 4.218512898330804e-06, | |
| "loss": 0.515, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.5094972067039106, | |
| "grad_norm": 0.5225180760232837, | |
| "learning_rate": 4.2109256449165405e-06, | |
| "loss": 0.5038, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.5139664804469274, | |
| "grad_norm": 0.5377410752157915, | |
| "learning_rate": 4.203338391502276e-06, | |
| "loss": 0.5043, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.5184357541899441, | |
| "grad_norm": 0.5915094303426857, | |
| "learning_rate": 4.195751138088012e-06, | |
| "loss": 0.5115, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.5229050279329609, | |
| "grad_norm": 0.5079762717076601, | |
| "learning_rate": 4.1881638846737485e-06, | |
| "loss": 0.5491, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.5273743016759777, | |
| "grad_norm": 0.5933329970849119, | |
| "learning_rate": 4.180576631259484e-06, | |
| "loss": 0.5095, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.5318435754189944, | |
| "grad_norm": 0.5053074762991974, | |
| "learning_rate": 4.17298937784522e-06, | |
| "loss": 0.4881, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.5363128491620112, | |
| "grad_norm": 0.6515342691068152, | |
| "learning_rate": 4.1654021244309564e-06, | |
| "loss": 0.5069, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.5407821229050279, | |
| "grad_norm": 0.5774345075195761, | |
| "learning_rate": 4.157814871016692e-06, | |
| "loss": 0.5007, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.5452513966480447, | |
| "grad_norm": 0.6308991657157696, | |
| "learning_rate": 4.150227617602428e-06, | |
| "loss": 0.5083, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.5497206703910614, | |
| "grad_norm": 0.5258198371155504, | |
| "learning_rate": 4.142640364188164e-06, | |
| "loss": 0.5103, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.5541899441340782, | |
| "grad_norm": 0.5629243370057063, | |
| "learning_rate": 4.1350531107739e-06, | |
| "loss": 0.5016, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.5586592178770949, | |
| "grad_norm": 0.5440095922230602, | |
| "learning_rate": 4.127465857359636e-06, | |
| "loss": 0.491, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.5631284916201117, | |
| "grad_norm": 0.5126793634050918, | |
| "learning_rate": 4.119878603945372e-06, | |
| "loss": 0.5239, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.5675977653631284, | |
| "grad_norm": 0.5476299968831635, | |
| "learning_rate": 4.112291350531108e-06, | |
| "loss": 0.5176, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.5720670391061452, | |
| "grad_norm": 0.5570481457317527, | |
| "learning_rate": 4.104704097116844e-06, | |
| "loss": 0.5084, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.576536312849162, | |
| "grad_norm": 0.49480777896006917, | |
| "learning_rate": 4.09711684370258e-06, | |
| "loss": 0.4966, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.5810055865921788, | |
| "grad_norm": 0.5089889598386482, | |
| "learning_rate": 4.089529590288316e-06, | |
| "loss": 0.5059, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.5854748603351956, | |
| "grad_norm": 0.4954780811768282, | |
| "learning_rate": 4.081942336874052e-06, | |
| "loss": 0.5166, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.5899441340782123, | |
| "grad_norm": 0.4809684003541764, | |
| "learning_rate": 4.074355083459787e-06, | |
| "loss": 0.4857, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.5944134078212291, | |
| "grad_norm": 0.5009748980192239, | |
| "learning_rate": 4.066767830045524e-06, | |
| "loss": 0.523, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.5988826815642458, | |
| "grad_norm": 0.48320950023108783, | |
| "learning_rate": 4.05918057663126e-06, | |
| "loss": 0.4956, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.6033519553072626, | |
| "grad_norm": 0.49399181892388616, | |
| "learning_rate": 4.051593323216995e-06, | |
| "loss": 0.5163, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.6078212290502794, | |
| "grad_norm": 0.4821527638840584, | |
| "learning_rate": 4.044006069802732e-06, | |
| "loss": 0.4981, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.6122905027932961, | |
| "grad_norm": 0.5099314085164022, | |
| "learning_rate": 4.036418816388468e-06, | |
| "loss": 0.5121, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.6167597765363129, | |
| "grad_norm": 0.5323495276796993, | |
| "learning_rate": 4.028831562974203e-06, | |
| "loss": 0.5074, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.6212290502793296, | |
| "grad_norm": 0.48851533865778524, | |
| "learning_rate": 4.02124430955994e-06, | |
| "loss": 0.52, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.6256983240223464, | |
| "grad_norm": 0.5281525574471102, | |
| "learning_rate": 4.0136570561456756e-06, | |
| "loss": 0.5039, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.6301675977653631, | |
| "grad_norm": 0.5279406819094602, | |
| "learning_rate": 4.006069802731411e-06, | |
| "loss": 0.4933, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.6346368715083799, | |
| "grad_norm": 0.45822529010842367, | |
| "learning_rate": 3.998482549317148e-06, | |
| "loss": 0.5151, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.6391061452513966, | |
| "grad_norm": 0.5349296018392958, | |
| "learning_rate": 3.9908952959028835e-06, | |
| "loss": 0.4983, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.6435754189944134, | |
| "grad_norm": 0.5316574655817645, | |
| "learning_rate": 3.983308042488619e-06, | |
| "loss": 0.5344, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.6480446927374302, | |
| "grad_norm": 0.5107562456487705, | |
| "learning_rate": 3.975720789074356e-06, | |
| "loss": 0.52, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.6525139664804469, | |
| "grad_norm": 0.5072583855988344, | |
| "learning_rate": 3.9681335356600915e-06, | |
| "loss": 0.4923, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.6569832402234637, | |
| "grad_norm": 0.5836000010606872, | |
| "learning_rate": 3.960546282245827e-06, | |
| "loss": 0.5295, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.6614525139664804, | |
| "grad_norm": 0.49884721978842206, | |
| "learning_rate": 3.952959028831564e-06, | |
| "loss": 0.5216, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.6659217877094972, | |
| "grad_norm": 0.5120991222561042, | |
| "learning_rate": 3.945371775417299e-06, | |
| "loss": 0.5414, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.6703910614525139, | |
| "grad_norm": 0.49838481084559894, | |
| "learning_rate": 3.937784522003035e-06, | |
| "loss": 0.5381, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.6748603351955307, | |
| "grad_norm": 0.5398799415397277, | |
| "learning_rate": 3.930197268588772e-06, | |
| "loss": 0.4896, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.6793296089385474, | |
| "grad_norm": 0.565748454568293, | |
| "learning_rate": 3.922610015174507e-06, | |
| "loss": 0.4985, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.6837988826815642, | |
| "grad_norm": 0.5986782671726579, | |
| "learning_rate": 3.915022761760243e-06, | |
| "loss": 0.5125, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.6882681564245811, | |
| "grad_norm": 0.5290750194980306, | |
| "learning_rate": 3.907435508345979e-06, | |
| "loss": 0.5078, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.6927374301675978, | |
| "grad_norm": 0.5706479438251948, | |
| "learning_rate": 3.899848254931715e-06, | |
| "loss": 0.5205, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.6972067039106146, | |
| "grad_norm": 0.5173864500214489, | |
| "learning_rate": 3.892261001517451e-06, | |
| "loss": 0.4988, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.7016759776536313, | |
| "grad_norm": 0.4920045809108581, | |
| "learning_rate": 3.884673748103187e-06, | |
| "loss": 0.4954, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.7061452513966481, | |
| "grad_norm": 0.5237901072069291, | |
| "learning_rate": 3.877086494688923e-06, | |
| "loss": 0.5253, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.7106145251396648, | |
| "grad_norm": 0.5212795514388354, | |
| "learning_rate": 3.869499241274659e-06, | |
| "loss": 0.5029, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.7150837988826816, | |
| "grad_norm": 0.48658569533086143, | |
| "learning_rate": 3.861911987860395e-06, | |
| "loss": 0.5023, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.7195530726256983, | |
| "grad_norm": 0.5145837815914728, | |
| "learning_rate": 3.854324734446131e-06, | |
| "loss": 0.5436, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.7240223463687151, | |
| "grad_norm": 0.49182827281373437, | |
| "learning_rate": 3.846737481031867e-06, | |
| "loss": 0.4948, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.7284916201117319, | |
| "grad_norm": 0.5113640333759738, | |
| "learning_rate": 3.839150227617603e-06, | |
| "loss": 0.4778, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.7329608938547486, | |
| "grad_norm": 0.47675402744999507, | |
| "learning_rate": 3.831562974203339e-06, | |
| "loss": 0.5182, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.7374301675977654, | |
| "grad_norm": 0.5214515656344721, | |
| "learning_rate": 3.823975720789075e-06, | |
| "loss": 0.5204, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.7418994413407821, | |
| "grad_norm": 0.5130606167326404, | |
| "learning_rate": 3.816388467374811e-06, | |
| "loss": 0.492, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.7463687150837989, | |
| "grad_norm": 0.49678736596959705, | |
| "learning_rate": 3.8088012139605467e-06, | |
| "loss": 0.4997, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.7508379888268156, | |
| "grad_norm": 0.5275415964748987, | |
| "learning_rate": 3.801213960546283e-06, | |
| "loss": 0.5163, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.7553072625698324, | |
| "grad_norm": 0.5323621998896121, | |
| "learning_rate": 3.7936267071320185e-06, | |
| "loss": 0.4846, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.7597765363128491, | |
| "grad_norm": 0.5302675218909635, | |
| "learning_rate": 3.7860394537177547e-06, | |
| "loss": 0.5193, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.7642458100558659, | |
| "grad_norm": 0.5097255453283106, | |
| "learning_rate": 3.778452200303491e-06, | |
| "loss": 0.5248, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.7687150837988826, | |
| "grad_norm": 0.5221084637587756, | |
| "learning_rate": 3.7708649468892265e-06, | |
| "loss": 0.5217, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.7731843575418994, | |
| "grad_norm": 0.5823307367451829, | |
| "learning_rate": 3.7632776934749626e-06, | |
| "loss": 0.5057, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.7776536312849162, | |
| "grad_norm": 0.48586162675121547, | |
| "learning_rate": 3.7556904400606987e-06, | |
| "loss": 0.5372, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.7821229050279329, | |
| "grad_norm": 0.5099958749272927, | |
| "learning_rate": 3.748103186646434e-06, | |
| "loss": 0.4844, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.7865921787709497, | |
| "grad_norm": 0.5221594966770278, | |
| "learning_rate": 3.74051593323217e-06, | |
| "loss": 0.4968, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.7910614525139665, | |
| "grad_norm": 0.5392129841421105, | |
| "learning_rate": 3.732928679817906e-06, | |
| "loss": 0.4906, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.7955307262569833, | |
| "grad_norm": 0.5239629013619267, | |
| "learning_rate": 3.725341426403642e-06, | |
| "loss": 0.5172, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.537861417175095, | |
| "learning_rate": 3.717754172989378e-06, | |
| "loss": 0.4914, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.8044692737430168, | |
| "grad_norm": 0.5001204419984062, | |
| "learning_rate": 3.710166919575114e-06, | |
| "loss": 0.5014, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.8089385474860336, | |
| "grad_norm": 0.5243758405421232, | |
| "learning_rate": 3.70257966616085e-06, | |
| "loss": 0.4979, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.8134078212290503, | |
| "grad_norm": 0.507560920806033, | |
| "learning_rate": 3.6949924127465856e-06, | |
| "loss": 0.5003, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.8178770949720671, | |
| "grad_norm": 0.5623627396713589, | |
| "learning_rate": 3.6874051593323218e-06, | |
| "loss": 0.5011, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.8223463687150838, | |
| "grad_norm": 0.5225320984203384, | |
| "learning_rate": 3.679817905918058e-06, | |
| "loss": 0.5117, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.8268156424581006, | |
| "grad_norm": 0.5371208831947621, | |
| "learning_rate": 3.6722306525037936e-06, | |
| "loss": 0.5028, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.8312849162011173, | |
| "grad_norm": 0.5412513689241548, | |
| "learning_rate": 3.6646433990895297e-06, | |
| "loss": 0.5111, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.8357541899441341, | |
| "grad_norm": 0.49083593539874787, | |
| "learning_rate": 3.657056145675266e-06, | |
| "loss": 0.5002, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.8402234636871508, | |
| "grad_norm": 0.5144646076400369, | |
| "learning_rate": 3.6494688922610015e-06, | |
| "loss": 0.4875, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.8446927374301676, | |
| "grad_norm": 0.5385458846653849, | |
| "learning_rate": 3.6418816388467377e-06, | |
| "loss": 0.5072, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.8491620111731844, | |
| "grad_norm": 0.48088002744673064, | |
| "learning_rate": 3.6342943854324738e-06, | |
| "loss": 0.549, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.8536312849162011, | |
| "grad_norm": 0.563960197772806, | |
| "learning_rate": 3.6267071320182095e-06, | |
| "loss": 0.5344, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.8581005586592179, | |
| "grad_norm": 0.5235289059912077, | |
| "learning_rate": 3.6191198786039456e-06, | |
| "loss": 0.5172, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.8625698324022346, | |
| "grad_norm": 0.5059904029741168, | |
| "learning_rate": 3.6115326251896813e-06, | |
| "loss": 0.4908, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.8670391061452514, | |
| "grad_norm": 0.507980780742306, | |
| "learning_rate": 3.6039453717754174e-06, | |
| "loss": 0.4951, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.8715083798882681, | |
| "grad_norm": 0.5552145890763519, | |
| "learning_rate": 3.5963581183611536e-06, | |
| "loss": 0.505, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.8759776536312849, | |
| "grad_norm": 0.5379367693029108, | |
| "learning_rate": 3.5887708649468893e-06, | |
| "loss": 0.52, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.8804469273743016, | |
| "grad_norm": 0.5813281708032492, | |
| "learning_rate": 3.5811836115326254e-06, | |
| "loss": 0.4966, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.8849162011173184, | |
| "grad_norm": 0.5138017935214574, | |
| "learning_rate": 3.5735963581183615e-06, | |
| "loss": 0.495, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.8893854748603351, | |
| "grad_norm": 0.5116405158423362, | |
| "learning_rate": 3.5660091047040972e-06, | |
| "loss": 0.519, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.8938547486033519, | |
| "grad_norm": 0.5696283061922035, | |
| "learning_rate": 3.5584218512898333e-06, | |
| "loss": 0.5002, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.8983240223463688, | |
| "grad_norm": 0.5541549121266046, | |
| "learning_rate": 3.5508345978755695e-06, | |
| "loss": 0.5125, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.9027932960893855, | |
| "grad_norm": 0.5017863683272789, | |
| "learning_rate": 3.543247344461305e-06, | |
| "loss": 0.499, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.9072625698324023, | |
| "grad_norm": 0.5776107100347877, | |
| "learning_rate": 3.5356600910470413e-06, | |
| "loss": 0.4883, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.911731843575419, | |
| "grad_norm": 0.5350847562000018, | |
| "learning_rate": 3.528072837632777e-06, | |
| "loss": 0.5052, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.9162011173184358, | |
| "grad_norm": 0.5012614076658582, | |
| "learning_rate": 3.520485584218513e-06, | |
| "loss": 0.5422, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.9206703910614525, | |
| "grad_norm": 0.6107847324258918, | |
| "learning_rate": 3.5128983308042493e-06, | |
| "loss": 0.5204, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.9251396648044693, | |
| "grad_norm": 0.49314942553653257, | |
| "learning_rate": 3.505311077389985e-06, | |
| "loss": 0.5074, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.929608938547486, | |
| "grad_norm": 0.5340833706807174, | |
| "learning_rate": 3.497723823975721e-06, | |
| "loss": 0.4936, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.9340782122905028, | |
| "grad_norm": 0.5342137283430334, | |
| "learning_rate": 3.490136570561457e-06, | |
| "loss": 0.4945, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.9385474860335196, | |
| "grad_norm": 0.5301769352005536, | |
| "learning_rate": 3.482549317147193e-06, | |
| "loss": 0.4963, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.9430167597765363, | |
| "grad_norm": 0.5455545680194068, | |
| "learning_rate": 3.474962063732929e-06, | |
| "loss": 0.4931, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.9474860335195531, | |
| "grad_norm": 0.5282021847913991, | |
| "learning_rate": 3.467374810318665e-06, | |
| "loss": 0.4894, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.9519553072625698, | |
| "grad_norm": 0.5341400870442735, | |
| "learning_rate": 3.459787556904401e-06, | |
| "loss": 0.5102, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.9564245810055866, | |
| "grad_norm": 0.5313633005763995, | |
| "learning_rate": 3.452200303490137e-06, | |
| "loss": 0.501, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.9608938547486033, | |
| "grad_norm": 0.5774555510069799, | |
| "learning_rate": 3.4446130500758727e-06, | |
| "loss": 0.5021, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.9653631284916201, | |
| "grad_norm": 0.521646351973494, | |
| "learning_rate": 3.437025796661609e-06, | |
| "loss": 0.5071, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.9698324022346368, | |
| "grad_norm": 0.5335773786656196, | |
| "learning_rate": 3.429438543247345e-06, | |
| "loss": 0.4993, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.9743016759776536, | |
| "grad_norm": 0.5774515212071393, | |
| "learning_rate": 3.4218512898330806e-06, | |
| "loss": 0.4802, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.9787709497206704, | |
| "grad_norm": 0.5159511449041201, | |
| "learning_rate": 3.4142640364188168e-06, | |
| "loss": 0.483, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.9832402234636871, | |
| "grad_norm": 0.509916149384416, | |
| "learning_rate": 3.406676783004553e-06, | |
| "loss": 0.4774, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.9877094972067039, | |
| "grad_norm": 0.5226659969898425, | |
| "learning_rate": 3.3990895295902886e-06, | |
| "loss": 0.516, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.9921787709497206, | |
| "grad_norm": 0.5632866199425641, | |
| "learning_rate": 3.3915022761760247e-06, | |
| "loss": 0.5346, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.9966480446927374, | |
| "grad_norm": 0.5224220899066443, | |
| "learning_rate": 3.383915022761761e-06, | |
| "loss": 0.5441, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 1.0011173184357542, | |
| "grad_norm": 0.5965341004746808, | |
| "learning_rate": 3.3763277693474965e-06, | |
| "loss": 0.512, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 1.0011173184357542, | |
| "eval_loss": 0.5086758732795715, | |
| "eval_runtime": 32.322, | |
| "eval_samples_per_second": 82.606, | |
| "eval_steps_per_second": 5.167, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 1.005586592178771, | |
| "grad_norm": 0.5234315419190534, | |
| "learning_rate": 3.3687405159332327e-06, | |
| "loss": 0.5054, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 1.0100558659217878, | |
| "grad_norm": 0.5575750854403085, | |
| "learning_rate": 3.3611532625189684e-06, | |
| "loss": 0.4878, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 1.0145251396648045, | |
| "grad_norm": 0.5253448160628447, | |
| "learning_rate": 3.3535660091047045e-06, | |
| "loss": 0.5054, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 1.0189944134078213, | |
| "grad_norm": 0.49020025660118194, | |
| "learning_rate": 3.3459787556904406e-06, | |
| "loss": 0.4818, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 1.0044692737430168, | |
| "grad_norm": 0.6277138162928673, | |
| "learning_rate": 3.3383915022761763e-06, | |
| "loss": 0.5008, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 1.0089385474860335, | |
| "grad_norm": 0.5276575275655174, | |
| "learning_rate": 3.3308042488619125e-06, | |
| "loss": 0.4822, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 1.0134078212290503, | |
| "grad_norm": 0.516729892988387, | |
| "learning_rate": 3.3232169954476486e-06, | |
| "loss": 0.4783, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 1.017877094972067, | |
| "grad_norm": 0.5648907712973277, | |
| "learning_rate": 3.3156297420333843e-06, | |
| "loss": 0.4927, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 1.0223463687150838, | |
| "grad_norm": 0.5521221146190025, | |
| "learning_rate": 3.3080424886191204e-06, | |
| "loss": 0.4742, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 1.0268156424581005, | |
| "grad_norm": 0.5288379239842262, | |
| "learning_rate": 3.3004552352048565e-06, | |
| "loss": 0.4863, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 1.0312849162011173, | |
| "grad_norm": 0.4932863276651226, | |
| "learning_rate": 3.2928679817905922e-06, | |
| "loss": 0.4613, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 1.035754189944134, | |
| "grad_norm": 0.5069055178884726, | |
| "learning_rate": 3.2852807283763284e-06, | |
| "loss": 0.4969, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 1.0402234636871508, | |
| "grad_norm": 0.49009866664123986, | |
| "learning_rate": 3.277693474962064e-06, | |
| "loss": 0.4759, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 1.0446927374301676, | |
| "grad_norm": 0.5220649844244807, | |
| "learning_rate": 3.2701062215478e-06, | |
| "loss": 0.4654, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 1.0491620111731843, | |
| "grad_norm": 0.5354627141925222, | |
| "learning_rate": 3.2625189681335363e-06, | |
| "loss": 0.5009, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 1.053631284916201, | |
| "grad_norm": 0.5561818122893151, | |
| "learning_rate": 3.254931714719272e-06, | |
| "loss": 0.4564, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.0581005586592178, | |
| "grad_norm": 0.48719455414113416, | |
| "learning_rate": 3.2473444613050077e-06, | |
| "loss": 0.4924, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 1.0625698324022346, | |
| "grad_norm": 0.5527905454518657, | |
| "learning_rate": 3.2397572078907434e-06, | |
| "loss": 0.4856, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 1.0670391061452513, | |
| "grad_norm": 0.5381863443762583, | |
| "learning_rate": 3.2321699544764795e-06, | |
| "loss": 0.4935, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 1.071508379888268, | |
| "grad_norm": 0.5355360844432822, | |
| "learning_rate": 3.2245827010622157e-06, | |
| "loss": 0.4858, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 1.0759776536312848, | |
| "grad_norm": 0.5191892602172652, | |
| "learning_rate": 3.2169954476479514e-06, | |
| "loss": 0.4819, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 1.0804469273743016, | |
| "grad_norm": 0.4952672717234948, | |
| "learning_rate": 3.2094081942336875e-06, | |
| "loss": 0.5007, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 1.0849162011173183, | |
| "grad_norm": 0.5331734326280092, | |
| "learning_rate": 3.201820940819423e-06, | |
| "loss": 0.4874, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 1.089385474860335, | |
| "grad_norm": 0.49333273079420603, | |
| "learning_rate": 3.1942336874051593e-06, | |
| "loss": 0.5045, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 1.0938547486033519, | |
| "grad_norm": 0.5143326582021585, | |
| "learning_rate": 3.1866464339908955e-06, | |
| "loss": 0.5047, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 1.0983240223463686, | |
| "grad_norm": 0.5027227799706213, | |
| "learning_rate": 3.179059180576631e-06, | |
| "loss": 0.4803, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.1027932960893856, | |
| "grad_norm": 0.5257359502707071, | |
| "learning_rate": 3.1714719271623673e-06, | |
| "loss": 0.4654, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 1.1072625698324021, | |
| "grad_norm": 0.5244897759431714, | |
| "learning_rate": 3.1638846737481034e-06, | |
| "loss": 0.5329, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 1.111731843575419, | |
| "grad_norm": 0.5028531015372357, | |
| "learning_rate": 3.156297420333839e-06, | |
| "loss": 0.4721, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 1.1162011173184359, | |
| "grad_norm": 0.5342121642252061, | |
| "learning_rate": 3.1487101669195752e-06, | |
| "loss": 0.4962, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 1.1206703910614526, | |
| "grad_norm": 0.49550125400926287, | |
| "learning_rate": 3.1411229135053114e-06, | |
| "loss": 0.4884, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 1.1251396648044694, | |
| "grad_norm": 0.5724921757672813, | |
| "learning_rate": 3.133535660091047e-06, | |
| "loss": 0.4676, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 1.1296089385474861, | |
| "grad_norm": 0.5012184887171401, | |
| "learning_rate": 3.125948406676783e-06, | |
| "loss": 0.48, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 1.1340782122905029, | |
| "grad_norm": 0.5049054849116781, | |
| "learning_rate": 3.118361153262519e-06, | |
| "loss": 0.494, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 1.1385474860335196, | |
| "grad_norm": 0.5291980859522503, | |
| "learning_rate": 3.110773899848255e-06, | |
| "loss": 0.4787, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 1.1430167597765364, | |
| "grad_norm": 0.5235318263865767, | |
| "learning_rate": 3.103186646433991e-06, | |
| "loss": 0.4745, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 1.1474860335195531, | |
| "grad_norm": 0.5067236081675356, | |
| "learning_rate": 3.095599393019727e-06, | |
| "loss": 0.5863, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 1.15195530726257, | |
| "grad_norm": 0.48763909341918976, | |
| "learning_rate": 3.088012139605463e-06, | |
| "loss": 0.5264, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 1.1564245810055866, | |
| "grad_norm": 0.5198110315693464, | |
| "learning_rate": 3.080424886191199e-06, | |
| "loss": 0.4716, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 1.1608938547486034, | |
| "grad_norm": 0.5093655208180958, | |
| "learning_rate": 3.072837632776935e-06, | |
| "loss": 0.4768, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 1.1653631284916202, | |
| "grad_norm": 0.49489186143964614, | |
| "learning_rate": 3.065250379362671e-06, | |
| "loss": 0.4765, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 1.169832402234637, | |
| "grad_norm": 0.5209973994944476, | |
| "learning_rate": 3.057663125948407e-06, | |
| "loss": 0.4869, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 1.1743016759776537, | |
| "grad_norm": 0.4873878645874658, | |
| "learning_rate": 3.0500758725341427e-06, | |
| "loss": 0.4623, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 1.1787709497206704, | |
| "grad_norm": 0.5074910158328607, | |
| "learning_rate": 3.042488619119879e-06, | |
| "loss": 0.4765, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 1.1832402234636872, | |
| "grad_norm": 0.5195087180758534, | |
| "learning_rate": 3.0349013657056146e-06, | |
| "loss": 0.4838, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 1.187709497206704, | |
| "grad_norm": 0.5264228931092361, | |
| "learning_rate": 3.0273141122913507e-06, | |
| "loss": 0.4638, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 1.1921787709497207, | |
| "grad_norm": 0.4919380409463276, | |
| "learning_rate": 3.019726858877087e-06, | |
| "loss": 0.4981, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 1.1966480446927374, | |
| "grad_norm": 0.5060903656555795, | |
| "learning_rate": 3.0121396054628225e-06, | |
| "loss": 0.477, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 1.2011173184357542, | |
| "grad_norm": 0.4938697284936987, | |
| "learning_rate": 3.0045523520485587e-06, | |
| "loss": 0.5003, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 1.205586592178771, | |
| "grad_norm": 0.5599516302969035, | |
| "learning_rate": 2.9969650986342948e-06, | |
| "loss": 0.4673, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 1.2100558659217877, | |
| "grad_norm": 0.5126307823030745, | |
| "learning_rate": 2.9893778452200305e-06, | |
| "loss": 0.4971, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 1.2145251396648045, | |
| "grad_norm": 0.5927333276589611, | |
| "learning_rate": 2.9817905918057666e-06, | |
| "loss": 0.5006, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 1.2189944134078212, | |
| "grad_norm": 0.5367579038483986, | |
| "learning_rate": 2.9742033383915027e-06, | |
| "loss": 0.5143, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 1.223463687150838, | |
| "grad_norm": 0.5179699664039845, | |
| "learning_rate": 2.9666160849772384e-06, | |
| "loss": 0.4733, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 1.2279329608938547, | |
| "grad_norm": 0.5015385250607317, | |
| "learning_rate": 2.9590288315629746e-06, | |
| "loss": 0.4786, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 1.2324022346368715, | |
| "grad_norm": 0.4925764923779409, | |
| "learning_rate": 2.9514415781487103e-06, | |
| "loss": 0.472, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 1.2368715083798882, | |
| "grad_norm": 0.5522056674324646, | |
| "learning_rate": 2.9438543247344464e-06, | |
| "loss": 0.4675, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 1.241340782122905, | |
| "grad_norm": 0.5128909667557014, | |
| "learning_rate": 2.9362670713201825e-06, | |
| "loss": 0.4778, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 1.2458100558659218, | |
| "grad_norm": 0.49958051066954534, | |
| "learning_rate": 2.9286798179059182e-06, | |
| "loss": 0.4744, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 1.2502793296089385, | |
| "grad_norm": 0.5186801983796817, | |
| "learning_rate": 2.9210925644916543e-06, | |
| "loss": 0.47, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 1.2547486033519553, | |
| "grad_norm": 0.49479081933964797, | |
| "learning_rate": 2.9135053110773905e-06, | |
| "loss": 0.4693, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 1.259217877094972, | |
| "grad_norm": 0.5172924586559385, | |
| "learning_rate": 2.905918057663126e-06, | |
| "loss": 0.5034, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 1.2636871508379888, | |
| "grad_norm": 0.49588955761543196, | |
| "learning_rate": 2.8983308042488623e-06, | |
| "loss": 0.4857, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 1.2681564245810055, | |
| "grad_norm": 0.5651670438316321, | |
| "learning_rate": 2.8907435508345984e-06, | |
| "loss": 0.4845, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 1.2726256983240223, | |
| "grad_norm": 0.47720259500943096, | |
| "learning_rate": 2.883156297420334e-06, | |
| "loss": 0.4482, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 1.277094972067039, | |
| "grad_norm": 0.5080092332022862, | |
| "learning_rate": 2.8755690440060702e-06, | |
| "loss": 0.4826, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 1.2815642458100558, | |
| "grad_norm": 0.6016729996144168, | |
| "learning_rate": 2.867981790591806e-06, | |
| "loss": 0.5592, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 1.2860335195530725, | |
| "grad_norm": 0.5456040494522852, | |
| "learning_rate": 2.860394537177542e-06, | |
| "loss": 0.4821, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 1.2905027932960893, | |
| "grad_norm": 0.5858331665900632, | |
| "learning_rate": 2.852807283763278e-06, | |
| "loss": 0.5089, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 1.2949720670391063, | |
| "grad_norm": 0.5895630636847224, | |
| "learning_rate": 2.845220030349014e-06, | |
| "loss": 0.5101, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 1.2994413407821228, | |
| "grad_norm": 0.505172328049275, | |
| "learning_rate": 2.83763277693475e-06, | |
| "loss": 0.5008, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 1.3039106145251398, | |
| "grad_norm": 0.5035894899292893, | |
| "learning_rate": 2.830045523520486e-06, | |
| "loss": 0.4812, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 1.3083798882681563, | |
| "grad_norm": 0.5304264887862729, | |
| "learning_rate": 2.822458270106222e-06, | |
| "loss": 0.4579, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 1.3128491620111733, | |
| "grad_norm": 0.4742288658363735, | |
| "learning_rate": 2.814871016691958e-06, | |
| "loss": 0.4801, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 1.3173184357541898, | |
| "grad_norm": 0.5293412371049411, | |
| "learning_rate": 2.807283763277694e-06, | |
| "loss": 0.4602, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 1.3217877094972068, | |
| "grad_norm": 0.5333831439310788, | |
| "learning_rate": 2.79969650986343e-06, | |
| "loss": 0.4963, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.3262569832402233, | |
| "grad_norm": 0.4947844594727205, | |
| "learning_rate": 2.792109256449166e-06, | |
| "loss": 0.4781, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 1.3307262569832403, | |
| "grad_norm": 0.5323122787674339, | |
| "learning_rate": 2.7845220030349016e-06, | |
| "loss": 0.4998, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 1.3351955307262569, | |
| "grad_norm": 0.48615316005162806, | |
| "learning_rate": 2.7769347496206378e-06, | |
| "loss": 0.466, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 1.3396648044692738, | |
| "grad_norm": 0.5038897355673981, | |
| "learning_rate": 2.769347496206374e-06, | |
| "loss": 0.531, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 1.3441340782122906, | |
| "grad_norm": 0.47725378544672703, | |
| "learning_rate": 2.7617602427921096e-06, | |
| "loss": 0.4917, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 1.3486033519553073, | |
| "grad_norm": 0.5241579403675019, | |
| "learning_rate": 2.7541729893778457e-06, | |
| "loss": 0.4673, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 1.353072625698324, | |
| "grad_norm": 0.5252091486066182, | |
| "learning_rate": 2.746585735963581e-06, | |
| "loss": 0.467, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 1.3575418994413408, | |
| "grad_norm": 0.4803361684555166, | |
| "learning_rate": 2.738998482549317e-06, | |
| "loss": 0.5257, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 1.3620111731843576, | |
| "grad_norm": 0.5300934772683465, | |
| "learning_rate": 2.7314112291350532e-06, | |
| "loss": 0.4889, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 1.3664804469273744, | |
| "grad_norm": 0.5001374114396476, | |
| "learning_rate": 2.723823975720789e-06, | |
| "loss": 0.4847, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 1.3709497206703911, | |
| "grad_norm": 0.5240875065710657, | |
| "learning_rate": 2.716236722306525e-06, | |
| "loss": 0.4578, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 1.3754189944134079, | |
| "grad_norm": 0.4714761354385937, | |
| "learning_rate": 2.708649468892261e-06, | |
| "loss": 0.4947, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 1.3798882681564246, | |
| "grad_norm": 0.5205222669608968, | |
| "learning_rate": 2.701062215477997e-06, | |
| "loss": 0.5049, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 1.3843575418994414, | |
| "grad_norm": 0.5409150510364881, | |
| "learning_rate": 2.693474962063733e-06, | |
| "loss": 0.4946, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 1.3888268156424581, | |
| "grad_norm": 0.49473773833387136, | |
| "learning_rate": 2.6858877086494687e-06, | |
| "loss": 0.5024, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 1.393296089385475, | |
| "grad_norm": 0.5322303350412809, | |
| "learning_rate": 2.678300455235205e-06, | |
| "loss": 0.5129, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 1.3977653631284916, | |
| "grad_norm": 0.5249345159944246, | |
| "learning_rate": 2.670713201820941e-06, | |
| "loss": 0.4852, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 1.4022346368715084, | |
| "grad_norm": 0.526125738227021, | |
| "learning_rate": 2.6631259484066767e-06, | |
| "loss": 0.4755, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 1.4067039106145252, | |
| "grad_norm": 0.5077327993266544, | |
| "learning_rate": 2.655538694992413e-06, | |
| "loss": 0.4708, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 1.411173184357542, | |
| "grad_norm": 0.509006777595202, | |
| "learning_rate": 2.647951441578149e-06, | |
| "loss": 0.4574, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 1.4156424581005587, | |
| "grad_norm": 0.5397063984545032, | |
| "learning_rate": 2.6403641881638846e-06, | |
| "loss": 0.4983, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 1.4201117318435754, | |
| "grad_norm": 0.588039527577832, | |
| "learning_rate": 2.6327769347496208e-06, | |
| "loss": 0.5063, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 1.4245810055865922, | |
| "grad_norm": 0.5086942941008908, | |
| "learning_rate": 2.625189681335357e-06, | |
| "loss": 0.5439, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 1.429050279329609, | |
| "grad_norm": 0.5593378373560197, | |
| "learning_rate": 2.6176024279210926e-06, | |
| "loss": 0.4789, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 1.4335195530726257, | |
| "grad_norm": 0.5473461722884195, | |
| "learning_rate": 2.6100151745068287e-06, | |
| "loss": 0.4861, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 1.4379888268156424, | |
| "grad_norm": 0.5744717523131797, | |
| "learning_rate": 2.6024279210925644e-06, | |
| "loss": 0.4818, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 1.4424581005586592, | |
| "grad_norm": 0.5240335872273282, | |
| "learning_rate": 2.5948406676783005e-06, | |
| "loss": 0.4785, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 1.446927374301676, | |
| "grad_norm": 0.5608475565499077, | |
| "learning_rate": 2.5872534142640367e-06, | |
| "loss": 0.5042, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 1.4513966480446927, | |
| "grad_norm": 0.5363050759442354, | |
| "learning_rate": 2.5796661608497724e-06, | |
| "loss": 0.4995, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 1.4558659217877095, | |
| "grad_norm": 0.5027607995293213, | |
| "learning_rate": 2.5720789074355085e-06, | |
| "loss": 0.4954, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 1.4603351955307262, | |
| "grad_norm": 0.4969954043343912, | |
| "learning_rate": 2.5644916540212446e-06, | |
| "loss": 0.4882, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 1.464804469273743, | |
| "grad_norm": 0.5718337637259987, | |
| "learning_rate": 2.5569044006069803e-06, | |
| "loss": 0.4726, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 1.4692737430167597, | |
| "grad_norm": 0.5234285673769393, | |
| "learning_rate": 2.5493171471927164e-06, | |
| "loss": 0.4697, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 1.4737430167597765, | |
| "grad_norm": 0.5219608223935474, | |
| "learning_rate": 2.5417298937784526e-06, | |
| "loss": 0.4776, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 1.4782122905027932, | |
| "grad_norm": 0.606768683027719, | |
| "learning_rate": 2.5341426403641883e-06, | |
| "loss": 0.4756, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 1.48268156424581, | |
| "grad_norm": 0.5432176614167162, | |
| "learning_rate": 2.5265553869499244e-06, | |
| "loss": 0.4619, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 1.48268156424581, | |
| "eval_loss": 0.5032872557640076, | |
| "eval_runtime": 32.2653, | |
| "eval_samples_per_second": 82.751, | |
| "eval_steps_per_second": 5.176, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 1.4871508379888267, | |
| "grad_norm": 0.5377224565720307, | |
| "learning_rate": 2.51896813353566e-06, | |
| "loss": 0.4667, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 1.4916201117318435, | |
| "grad_norm": 0.5875842821019925, | |
| "learning_rate": 2.5113808801213962e-06, | |
| "loss": 0.4873, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 1.4960893854748603, | |
| "grad_norm": 0.5624618850955563, | |
| "learning_rate": 2.5037936267071324e-06, | |
| "loss": 0.4626, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 1.500558659217877, | |
| "grad_norm": 0.5380665704623208, | |
| "learning_rate": 2.496206373292868e-06, | |
| "loss": 0.4749, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.505027932960894, | |
| "grad_norm": 0.5515048280645092, | |
| "learning_rate": 2.488619119878604e-06, | |
| "loss": 0.4862, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 1.5094972067039105, | |
| "grad_norm": 0.5541495309875792, | |
| "learning_rate": 2.4810318664643403e-06, | |
| "loss": 0.4533, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 1.5139664804469275, | |
| "grad_norm": 0.514569610923219, | |
| "learning_rate": 2.473444613050076e-06, | |
| "loss": 0.4585, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 1.518435754189944, | |
| "grad_norm": 0.5862261680615611, | |
| "learning_rate": 2.465857359635812e-06, | |
| "loss": 0.4835, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 1.522905027932961, | |
| "grad_norm": 0.536975433650959, | |
| "learning_rate": 2.458270106221548e-06, | |
| "loss": 0.4638, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 1.5273743016759775, | |
| "grad_norm": 0.5142279881627296, | |
| "learning_rate": 2.450682852807284e-06, | |
| "loss": 0.4832, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 1.5318435754189945, | |
| "grad_norm": 0.5124062345457664, | |
| "learning_rate": 2.44309559939302e-06, | |
| "loss": 0.4857, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 1.536312849162011, | |
| "grad_norm": 0.509905421315166, | |
| "learning_rate": 2.435508345978756e-06, | |
| "loss": 0.4807, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 1.540782122905028, | |
| "grad_norm": 0.5275507665643101, | |
| "learning_rate": 2.427921092564492e-06, | |
| "loss": 0.4722, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 1.5452513966480446, | |
| "grad_norm": 0.5003951978639241, | |
| "learning_rate": 2.420333839150228e-06, | |
| "loss": 0.4941, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.5497206703910615, | |
| "grad_norm": 0.5251614039487292, | |
| "learning_rate": 2.4127465857359637e-06, | |
| "loss": 0.5376, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 1.554189944134078, | |
| "grad_norm": 0.5128097582851606, | |
| "learning_rate": 2.4051593323217e-06, | |
| "loss": 0.5001, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 1.558659217877095, | |
| "grad_norm": 0.4971288144993544, | |
| "learning_rate": 2.397572078907436e-06, | |
| "loss": 0.4721, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 1.5631284916201116, | |
| "grad_norm": 0.5111491573396423, | |
| "learning_rate": 2.3899848254931717e-06, | |
| "loss": 0.5192, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 1.5675977653631286, | |
| "grad_norm": 0.5025703950184777, | |
| "learning_rate": 2.382397572078908e-06, | |
| "loss": 0.4541, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 1.572067039106145, | |
| "grad_norm": 0.5011840454618501, | |
| "learning_rate": 2.3748103186646435e-06, | |
| "loss": 0.4789, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 1.576536312849162, | |
| "grad_norm": 0.4981690031939279, | |
| "learning_rate": 2.3672230652503792e-06, | |
| "loss": 0.497, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 1.5810055865921788, | |
| "grad_norm": 0.4994506284814992, | |
| "learning_rate": 2.3596358118361154e-06, | |
| "loss": 0.4655, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 1.5854748603351956, | |
| "grad_norm": 0.49694019288140645, | |
| "learning_rate": 2.3520485584218515e-06, | |
| "loss": 0.4883, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 1.5899441340782123, | |
| "grad_norm": 0.5103222760000443, | |
| "learning_rate": 2.344461305007587e-06, | |
| "loss": 0.4615, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.594413407821229, | |
| "grad_norm": 0.4960772434204748, | |
| "learning_rate": 2.3368740515933233e-06, | |
| "loss": 0.4748, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 1.5988826815642458, | |
| "grad_norm": 0.5372239368130248, | |
| "learning_rate": 2.3292867981790594e-06, | |
| "loss": 0.461, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 1.6033519553072626, | |
| "grad_norm": 0.5002183877541576, | |
| "learning_rate": 2.321699544764795e-06, | |
| "loss": 0.4985, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 1.6078212290502794, | |
| "grad_norm": 0.5065612982842012, | |
| "learning_rate": 2.3141122913505313e-06, | |
| "loss": 0.4846, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 1.612290502793296, | |
| "grad_norm": 0.5294145045424744, | |
| "learning_rate": 2.3065250379362674e-06, | |
| "loss": 0.4769, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 1.6167597765363129, | |
| "grad_norm": 0.5019842435412675, | |
| "learning_rate": 2.298937784522003e-06, | |
| "loss": 0.4738, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 1.6212290502793296, | |
| "grad_norm": 0.4973490272845337, | |
| "learning_rate": 2.291350531107739e-06, | |
| "loss": 0.4773, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 1.6256983240223464, | |
| "grad_norm": 0.4917027064554639, | |
| "learning_rate": 2.283763277693475e-06, | |
| "loss": 0.4921, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 1.6301675977653631, | |
| "grad_norm": 0.5191932977137537, | |
| "learning_rate": 2.276176024279211e-06, | |
| "loss": 0.4823, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 1.6346368715083799, | |
| "grad_norm": 0.5130853619632637, | |
| "learning_rate": 2.268588770864947e-06, | |
| "loss": 0.4631, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.6391061452513966, | |
| "grad_norm": 0.5402133951776769, | |
| "learning_rate": 2.261001517450683e-06, | |
| "loss": 0.456, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 1.6435754189944134, | |
| "grad_norm": 0.5136201214507364, | |
| "learning_rate": 2.253414264036419e-06, | |
| "loss": 0.4715, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 1.6480446927374302, | |
| "grad_norm": 0.5397223114168315, | |
| "learning_rate": 2.245827010622155e-06, | |
| "loss": 0.465, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 1.652513966480447, | |
| "grad_norm": 0.5178224930093817, | |
| "learning_rate": 2.238239757207891e-06, | |
| "loss": 0.4526, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 1.6569832402234637, | |
| "grad_norm": 0.5399618966600667, | |
| "learning_rate": 2.230652503793627e-06, | |
| "loss": 0.4873, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 1.6614525139664804, | |
| "grad_norm": 0.5106529140250409, | |
| "learning_rate": 2.223065250379363e-06, | |
| "loss": 0.4771, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 1.6659217877094972, | |
| "grad_norm": 0.5460841160518792, | |
| "learning_rate": 2.2154779969650988e-06, | |
| "loss": 0.4743, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 1.670391061452514, | |
| "grad_norm": 0.5846706720135237, | |
| "learning_rate": 2.207890743550835e-06, | |
| "loss": 0.4552, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 1.6748603351955307, | |
| "grad_norm": 0.534699591229056, | |
| "learning_rate": 2.2003034901365706e-06, | |
| "loss": 0.495, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 1.6793296089385474, | |
| "grad_norm": 0.5377842578832556, | |
| "learning_rate": 2.1927162367223067e-06, | |
| "loss": 0.4824, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.6837988826815642, | |
| "grad_norm": 0.5799148072307362, | |
| "learning_rate": 2.185128983308043e-06, | |
| "loss": 0.4708, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 1.6882681564245812, | |
| "grad_norm": 0.5076181067753225, | |
| "learning_rate": 2.1775417298937786e-06, | |
| "loss": 0.4631, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 1.6927374301675977, | |
| "grad_norm": 0.5025103762077108, | |
| "learning_rate": 2.1699544764795147e-06, | |
| "loss": 0.4676, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 1.6972067039106147, | |
| "grad_norm": 0.48231447315943815, | |
| "learning_rate": 2.162367223065251e-06, | |
| "loss": 0.4694, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 1.7016759776536312, | |
| "grad_norm": 0.5003437712619672, | |
| "learning_rate": 2.1547799696509865e-06, | |
| "loss": 0.4607, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 1.7061452513966482, | |
| "grad_norm": 0.5149834201342615, | |
| "learning_rate": 2.1471927162367226e-06, | |
| "loss": 0.5253, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 1.7106145251396647, | |
| "grad_norm": 0.5406857228194807, | |
| "learning_rate": 2.1396054628224588e-06, | |
| "loss": 0.5352, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 1.7150837988826817, | |
| "grad_norm": 0.5204717082880796, | |
| "learning_rate": 2.1320182094081945e-06, | |
| "loss": 0.4936, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 1.7195530726256982, | |
| "grad_norm": 0.5202749448515015, | |
| "learning_rate": 2.12443095599393e-06, | |
| "loss": 0.4594, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 1.7240223463687152, | |
| "grad_norm": 0.4952910050954741, | |
| "learning_rate": 2.1168437025796663e-06, | |
| "loss": 0.4793, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.7284916201117317, | |
| "grad_norm": 0.48637344511275765, | |
| "learning_rate": 2.109256449165402e-06, | |
| "loss": 0.4745, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 1.7329608938547487, | |
| "grad_norm": 0.4906271558417189, | |
| "learning_rate": 2.101669195751138e-06, | |
| "loss": 0.4652, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 1.7374301675977653, | |
| "grad_norm": 0.5090727761430808, | |
| "learning_rate": 2.0940819423368742e-06, | |
| "loss": 0.5298, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 1.7418994413407822, | |
| "grad_norm": 0.4866596430096002, | |
| "learning_rate": 2.08649468892261e-06, | |
| "loss": 0.4838, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 1.7463687150837988, | |
| "grad_norm": 0.49259128094078225, | |
| "learning_rate": 2.078907435508346e-06, | |
| "loss": 0.4737, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 1.7508379888268157, | |
| "grad_norm": 0.5031034927365164, | |
| "learning_rate": 2.071320182094082e-06, | |
| "loss": 0.4777, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 1.7553072625698323, | |
| "grad_norm": 0.48520021809068, | |
| "learning_rate": 2.063732928679818e-06, | |
| "loss": 0.454, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 1.7597765363128492, | |
| "grad_norm": 0.5217902917498033, | |
| "learning_rate": 2.056145675265554e-06, | |
| "loss": 0.5199, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 1.7642458100558658, | |
| "grad_norm": 0.4963706354662438, | |
| "learning_rate": 2.04855842185129e-06, | |
| "loss": 0.4649, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 1.7687150837988828, | |
| "grad_norm": 0.49552580249683814, | |
| "learning_rate": 2.040971168437026e-06, | |
| "loss": 0.4887, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.7731843575418993, | |
| "grad_norm": 0.49468231483436587, | |
| "learning_rate": 2.033383915022762e-06, | |
| "loss": 0.4757, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 1.7776536312849163, | |
| "grad_norm": 0.47502769186897137, | |
| "learning_rate": 2.0257966616084977e-06, | |
| "loss": 0.4846, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 1.7821229050279328, | |
| "grad_norm": 0.5089134108324753, | |
| "learning_rate": 2.018209408194234e-06, | |
| "loss": 0.4692, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 1.7865921787709498, | |
| "grad_norm": 0.4981731250179096, | |
| "learning_rate": 2.01062215477997e-06, | |
| "loss": 0.4665, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 1.7910614525139665, | |
| "grad_norm": 0.4690174096990647, | |
| "learning_rate": 2.0030349013657056e-06, | |
| "loss": 0.4606, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 1.7955307262569833, | |
| "grad_norm": 0.4914411018326139, | |
| "learning_rate": 1.9954476479514418e-06, | |
| "loss": 0.4847, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 0.5513335287615446, | |
| "learning_rate": 1.987860394537178e-06, | |
| "loss": 0.488, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 1.8044692737430168, | |
| "grad_norm": 0.48992847389625854, | |
| "learning_rate": 1.9802731411229136e-06, | |
| "loss": 0.465, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 1.8089385474860336, | |
| "grad_norm": 0.5185912675971102, | |
| "learning_rate": 1.9726858877086497e-06, | |
| "loss": 0.4872, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 1.8134078212290503, | |
| "grad_norm": 0.47300994432774746, | |
| "learning_rate": 1.965098634294386e-06, | |
| "loss": 0.4534, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 1.817877094972067, | |
| "grad_norm": 0.5034275716173061, | |
| "learning_rate": 1.9575113808801215e-06, | |
| "loss": 0.4668, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 1.8223463687150838, | |
| "grad_norm": 0.46435516708821234, | |
| "learning_rate": 1.9499241274658577e-06, | |
| "loss": 0.491, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 1.8268156424581006, | |
| "grad_norm": 0.4915994950607221, | |
| "learning_rate": 1.9423368740515934e-06, | |
| "loss": 0.4793, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 1.8312849162011173, | |
| "grad_norm": 0.49834550842622505, | |
| "learning_rate": 1.9347496206373295e-06, | |
| "loss": 0.4897, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 1.835754189944134, | |
| "grad_norm": 0.5275123854480922, | |
| "learning_rate": 1.9271623672230656e-06, | |
| "loss": 0.5167, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 1.8402234636871508, | |
| "grad_norm": 0.501447167095016, | |
| "learning_rate": 1.9195751138088013e-06, | |
| "loss": 0.5522, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 1.8446927374301676, | |
| "grad_norm": 0.4929358533949542, | |
| "learning_rate": 1.9119878603945374e-06, | |
| "loss": 0.464, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 1.8491620111731844, | |
| "grad_norm": 0.4753893398726578, | |
| "learning_rate": 1.9044006069802734e-06, | |
| "loss": 0.4707, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 1.853631284916201, | |
| "grad_norm": 0.49249135995729143, | |
| "learning_rate": 1.8968133535660093e-06, | |
| "loss": 0.4674, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 1.8581005586592179, | |
| "grad_norm": 0.4959065755228421, | |
| "learning_rate": 1.8892261001517454e-06, | |
| "loss": 0.4731, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.8625698324022346, | |
| "grad_norm": 0.4830053823448673, | |
| "learning_rate": 1.8816388467374813e-06, | |
| "loss": 0.4688, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 1.8670391061452514, | |
| "grad_norm": 0.5026774544021604, | |
| "learning_rate": 1.874051593323217e-06, | |
| "loss": 0.4777, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 1.8715083798882681, | |
| "grad_norm": 0.48376669794821286, | |
| "learning_rate": 1.866464339908953e-06, | |
| "loss": 0.4532, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 1.8759776536312849, | |
| "grad_norm": 0.5036910075540445, | |
| "learning_rate": 1.858877086494689e-06, | |
| "loss": 0.4749, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 1.8804469273743016, | |
| "grad_norm": 0.464687327497754, | |
| "learning_rate": 1.851289833080425e-06, | |
| "loss": 0.4582, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 1.8849162011173184, | |
| "grad_norm": 0.4713537790626329, | |
| "learning_rate": 1.8437025796661609e-06, | |
| "loss": 0.4954, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 1.8893854748603351, | |
| "grad_norm": 0.4964027557908742, | |
| "learning_rate": 1.8361153262518968e-06, | |
| "loss": 0.4777, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 1.893854748603352, | |
| "grad_norm": 0.5022160749373462, | |
| "learning_rate": 1.828528072837633e-06, | |
| "loss": 0.4723, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 1.8983240223463689, | |
| "grad_norm": 0.4971783807561644, | |
| "learning_rate": 1.8209408194233688e-06, | |
| "loss": 0.4619, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 1.9027932960893854, | |
| "grad_norm": 0.5020482529233811, | |
| "learning_rate": 1.8133535660091047e-06, | |
| "loss": 0.4923, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.9072625698324024, | |
| "grad_norm": 0.5027930539648167, | |
| "learning_rate": 1.8057663125948407e-06, | |
| "loss": 0.4936, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 1.911731843575419, | |
| "grad_norm": 0.5048109425957825, | |
| "learning_rate": 1.7981790591805768e-06, | |
| "loss": 0.4647, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 1.916201117318436, | |
| "grad_norm": 0.5152625504278344, | |
| "learning_rate": 1.7905918057663127e-06, | |
| "loss": 0.4962, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 1.9206703910614524, | |
| "grad_norm": 0.5486472960937676, | |
| "learning_rate": 1.7830045523520486e-06, | |
| "loss": 0.4643, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 1.9251396648044694, | |
| "grad_norm": 0.5074159325604022, | |
| "learning_rate": 1.7754172989377847e-06, | |
| "loss": 0.4838, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 1.929608938547486, | |
| "grad_norm": 0.487237277239409, | |
| "learning_rate": 1.7678300455235207e-06, | |
| "loss": 0.4953, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 1.934078212290503, | |
| "grad_norm": 0.5030243156211434, | |
| "learning_rate": 1.7602427921092566e-06, | |
| "loss": 0.4669, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 1.9385474860335195, | |
| "grad_norm": 0.54024162061195, | |
| "learning_rate": 1.7526555386949925e-06, | |
| "loss": 0.4645, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 1.9430167597765364, | |
| "grad_norm": 0.4924548318787792, | |
| "learning_rate": 1.7450682852807286e-06, | |
| "loss": 0.487, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 1.947486033519553, | |
| "grad_norm": 0.5002700607677886, | |
| "learning_rate": 1.7374810318664645e-06, | |
| "loss": 0.4904, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.95195530726257, | |
| "grad_norm": 0.49052978017153487, | |
| "learning_rate": 1.7298937784522004e-06, | |
| "loss": 0.4721, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 1.9564245810055865, | |
| "grad_norm": 0.5330277427973065, | |
| "learning_rate": 1.7223065250379363e-06, | |
| "loss": 0.4674, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 1.9608938547486034, | |
| "grad_norm": 0.49644600432443986, | |
| "learning_rate": 1.7147192716236725e-06, | |
| "loss": 0.4735, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 1.96536312849162, | |
| "grad_norm": 0.4763428511189237, | |
| "learning_rate": 1.7071320182094084e-06, | |
| "loss": 0.4758, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 1.969832402234637, | |
| "grad_norm": 0.49662051116381534, | |
| "learning_rate": 1.6995447647951443e-06, | |
| "loss": 0.5025, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 1.9743016759776535, | |
| "grad_norm": 0.5368027446646121, | |
| "learning_rate": 1.6919575113808804e-06, | |
| "loss": 0.4708, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 1.9787709497206705, | |
| "grad_norm": 0.49270124177132796, | |
| "learning_rate": 1.6843702579666163e-06, | |
| "loss": 0.4859, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 1.983240223463687, | |
| "grad_norm": 0.4886244120662818, | |
| "learning_rate": 1.6767830045523523e-06, | |
| "loss": 0.4542, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 1.983240223463687, | |
| "eval_loss": 0.49883514642715454, | |
| "eval_runtime": 32.2692, | |
| "eval_samples_per_second": 82.741, | |
| "eval_steps_per_second": 5.175, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 1.987709497206704, | |
| "grad_norm": 0.48474622370426634, | |
| "learning_rate": 1.6691957511380882e-06, | |
| "loss": 0.4752, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 1.9921787709497205, | |
| "grad_norm": 0.5089879136002737, | |
| "learning_rate": 1.6616084977238243e-06, | |
| "loss": 0.4835, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.9966480446927375, | |
| "grad_norm": 0.540987114653738, | |
| "learning_rate": 1.6540212443095602e-06, | |
| "loss": 0.4685, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 2.001117318435754, | |
| "grad_norm": 0.5330263791441335, | |
| "learning_rate": 1.6464339908952961e-06, | |
| "loss": 0.4729, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 2.005586592178771, | |
| "grad_norm": 0.5025591514510047, | |
| "learning_rate": 1.638846737481032e-06, | |
| "loss": 0.4739, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 2.010055865921788, | |
| "grad_norm": 0.4938210284492649, | |
| "learning_rate": 1.6312594840667682e-06, | |
| "loss": 0.4772, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 2.0145251396648045, | |
| "grad_norm": 0.5590423643222178, | |
| "learning_rate": 1.6236722306525039e-06, | |
| "loss": 0.4651, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 2.018994413407821, | |
| "grad_norm": 0.4994221649154916, | |
| "learning_rate": 1.6160849772382398e-06, | |
| "loss": 0.4989, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 2.004469273743017, | |
| "grad_norm": 0.5578750520607872, | |
| "learning_rate": 1.6084977238239757e-06, | |
| "loss": 0.4779, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 2.0089385474860335, | |
| "grad_norm": 0.5277075050354076, | |
| "learning_rate": 1.6009104704097116e-06, | |
| "loss": 0.462, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 2.0134078212290505, | |
| "grad_norm": 0.5125996961203592, | |
| "learning_rate": 1.5933232169954477e-06, | |
| "loss": 0.4892, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 2.017877094972067, | |
| "grad_norm": 0.5150882727539035, | |
| "learning_rate": 1.5857359635811836e-06, | |
| "loss": 0.4913, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 2.022346368715084, | |
| "grad_norm": 0.5395569256138755, | |
| "learning_rate": 1.5781487101669196e-06, | |
| "loss": 0.4789, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 2.0268156424581005, | |
| "grad_norm": 0.4907655840755226, | |
| "learning_rate": 1.5705614567526557e-06, | |
| "loss": 0.4699, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 2.0312849162011175, | |
| "grad_norm": 0.508487377683692, | |
| "learning_rate": 1.5629742033383916e-06, | |
| "loss": 0.4671, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 2.035754189944134, | |
| "grad_norm": 0.5211842893319257, | |
| "learning_rate": 1.5553869499241275e-06, | |
| "loss": 0.48, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 2.040223463687151, | |
| "grad_norm": 0.5540445804429339, | |
| "learning_rate": 1.5477996965098634e-06, | |
| "loss": 0.4839, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 2.0446927374301676, | |
| "grad_norm": 0.502144462775385, | |
| "learning_rate": 1.5402124430955995e-06, | |
| "loss": 0.4991, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 2.0491620111731845, | |
| "grad_norm": 0.4963826920124019, | |
| "learning_rate": 1.5326251896813355e-06, | |
| "loss": 0.5268, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 2.053631284916201, | |
| "grad_norm": 0.48862357759049696, | |
| "learning_rate": 1.5250379362670714e-06, | |
| "loss": 0.4572, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 2.058100558659218, | |
| "grad_norm": 0.48705814461293884, | |
| "learning_rate": 1.5174506828528073e-06, | |
| "loss": 0.4761, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 2.0625698324022346, | |
| "grad_norm": 0.50151248823518, | |
| "learning_rate": 1.5098634294385434e-06, | |
| "loss": 0.468, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 2.0670391061452515, | |
| "grad_norm": 0.47075309247993236, | |
| "learning_rate": 1.5022761760242793e-06, | |
| "loss": 0.4496, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 2.071508379888268, | |
| "grad_norm": 0.5055609634042088, | |
| "learning_rate": 1.4946889226100152e-06, | |
| "loss": 0.4626, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 2.075977653631285, | |
| "grad_norm": 0.4931560723293831, | |
| "learning_rate": 1.4871016691957514e-06, | |
| "loss": 0.4743, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 2.0804469273743016, | |
| "grad_norm": 0.4970179492434698, | |
| "learning_rate": 1.4795144157814873e-06, | |
| "loss": 0.4849, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 2.0849162011173186, | |
| "grad_norm": 0.5205249715468657, | |
| "learning_rate": 1.4719271623672232e-06, | |
| "loss": 0.4647, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 2.089385474860335, | |
| "grad_norm": 0.5106891771853166, | |
| "learning_rate": 1.4643399089529591e-06, | |
| "loss": 0.4698, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 2.093854748603352, | |
| "grad_norm": 0.5030533511191912, | |
| "learning_rate": 1.4567526555386952e-06, | |
| "loss": 0.4435, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 2.0983240223463686, | |
| "grad_norm": 0.5267045945145785, | |
| "learning_rate": 1.4491654021244311e-06, | |
| "loss": 0.4551, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 2.1027932960893856, | |
| "grad_norm": 0.4910735265010368, | |
| "learning_rate": 1.441578148710167e-06, | |
| "loss": 0.4893, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 2.107262569832402, | |
| "grad_norm": 0.5098674286346987, | |
| "learning_rate": 1.433990895295903e-06, | |
| "loss": 0.4631, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 2.111731843575419, | |
| "grad_norm": 0.49833407519162715, | |
| "learning_rate": 1.426403641881639e-06, | |
| "loss": 0.4478, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 2.1162011173184356, | |
| "grad_norm": 0.4968638787102979, | |
| "learning_rate": 1.418816388467375e-06, | |
| "loss": 0.4661, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 2.1206703910614526, | |
| "grad_norm": 0.5287581344581138, | |
| "learning_rate": 1.411229135053111e-06, | |
| "loss": 0.487, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 2.125139664804469, | |
| "grad_norm": 0.5169520017940434, | |
| "learning_rate": 1.403641881638847e-06, | |
| "loss": 0.4641, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 2.129608938547486, | |
| "grad_norm": 0.4889710176907046, | |
| "learning_rate": 1.396054628224583e-06, | |
| "loss": 0.4657, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 2.1340782122905027, | |
| "grad_norm": 0.5068054201127302, | |
| "learning_rate": 1.3884673748103189e-06, | |
| "loss": 0.4655, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 2.1385474860335196, | |
| "grad_norm": 0.5227391076748372, | |
| "learning_rate": 1.3808801213960548e-06, | |
| "loss": 0.4489, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 2.143016759776536, | |
| "grad_norm": 0.4685531353247096, | |
| "learning_rate": 1.3732928679817905e-06, | |
| "loss": 0.4518, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 2.147486033519553, | |
| "grad_norm": 0.48729624697805496, | |
| "learning_rate": 1.3657056145675266e-06, | |
| "loss": 0.4539, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 2.1519553072625697, | |
| "grad_norm": 0.49686659002832273, | |
| "learning_rate": 1.3581183611532625e-06, | |
| "loss": 0.4698, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 2.1564245810055866, | |
| "grad_norm": 0.5165370496352449, | |
| "learning_rate": 1.3505311077389985e-06, | |
| "loss": 0.5282, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 2.160893854748603, | |
| "grad_norm": 0.49390583090447004, | |
| "learning_rate": 1.3429438543247344e-06, | |
| "loss": 0.5011, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 2.16536312849162, | |
| "grad_norm": 0.4966083299512843, | |
| "learning_rate": 1.3353566009104705e-06, | |
| "loss": 0.4888, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 2.1698324022346367, | |
| "grad_norm": 0.47651097727052893, | |
| "learning_rate": 1.3277693474962064e-06, | |
| "loss": 0.4629, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 2.1743016759776537, | |
| "grad_norm": 0.5171229401967155, | |
| "learning_rate": 1.3201820940819423e-06, | |
| "loss": 0.4538, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 2.17877094972067, | |
| "grad_norm": 0.5177579051508318, | |
| "learning_rate": 1.3125948406676784e-06, | |
| "loss": 0.4822, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 2.183240223463687, | |
| "grad_norm": 0.5363730349952008, | |
| "learning_rate": 1.3050075872534144e-06, | |
| "loss": 0.4879, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 2.1877094972067037, | |
| "grad_norm": 0.49547274684149156, | |
| "learning_rate": 1.2974203338391503e-06, | |
| "loss": 0.4732, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 2.1921787709497207, | |
| "grad_norm": 0.48560343902497966, | |
| "learning_rate": 1.2898330804248862e-06, | |
| "loss": 0.4918, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 2.1966480446927372, | |
| "grad_norm": 0.49966297308311575, | |
| "learning_rate": 1.2822458270106223e-06, | |
| "loss": 0.4618, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 2.201117318435754, | |
| "grad_norm": 0.472067295226221, | |
| "learning_rate": 1.2746585735963582e-06, | |
| "loss": 0.4953, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 2.205586592178771, | |
| "grad_norm": 0.5241464474412257, | |
| "learning_rate": 1.2670713201820941e-06, | |
| "loss": 0.4797, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 2.2100558659217877, | |
| "grad_norm": 0.5063960714766558, | |
| "learning_rate": 1.25948406676783e-06, | |
| "loss": 0.4493, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 2.2145251396648042, | |
| "grad_norm": 0.48407847297667117, | |
| "learning_rate": 1.2518968133535662e-06, | |
| "loss": 0.4736, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 2.218994413407821, | |
| "grad_norm": 0.4859389089228068, | |
| "learning_rate": 1.244309559939302e-06, | |
| "loss": 0.4476, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 2.223463687150838, | |
| "grad_norm": 0.49468343574845597, | |
| "learning_rate": 1.236722306525038e-06, | |
| "loss": 0.4447, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 2.2279329608938547, | |
| "grad_norm": 0.45980997966401754, | |
| "learning_rate": 1.229135053110774e-06, | |
| "loss": 0.4599, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 2.2324022346368717, | |
| "grad_norm": 0.5135776498149942, | |
| "learning_rate": 1.22154779969651e-06, | |
| "loss": 0.4852, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 2.2368715083798882, | |
| "grad_norm": 0.48749385873656575, | |
| "learning_rate": 1.213960546282246e-06, | |
| "loss": 0.5157, | |
| "step": 509 | |
| }, | |
| { | |
| "epoch": 2.241340782122905, | |
| "grad_norm": 0.48899276420726745, | |
| "learning_rate": 1.2063732928679819e-06, | |
| "loss": 0.4437, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 2.2458100558659218, | |
| "grad_norm": 0.4918342780724834, | |
| "learning_rate": 1.198786039453718e-06, | |
| "loss": 0.4974, | |
| "step": 511 | |
| }, | |
| { | |
| "epoch": 2.2502793296089387, | |
| "grad_norm": 0.4774969997416472, | |
| "learning_rate": 1.191198786039454e-06, | |
| "loss": 0.4509, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 2.2547486033519553, | |
| "grad_norm": 0.4791749757272505, | |
| "learning_rate": 1.1836115326251896e-06, | |
| "loss": 0.4599, | |
| "step": 513 | |
| }, | |
| { | |
| "epoch": 2.2592178770949722, | |
| "grad_norm": 0.5072314853369309, | |
| "learning_rate": 1.1760242792109257e-06, | |
| "loss": 0.4643, | |
| "step": 514 | |
| }, | |
| { | |
| "epoch": 2.2636871508379888, | |
| "grad_norm": 0.5004472392134528, | |
| "learning_rate": 1.1684370257966617e-06, | |
| "loss": 0.4688, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 2.2681564245810057, | |
| "grad_norm": 0.48150392498478517, | |
| "learning_rate": 1.1608497723823976e-06, | |
| "loss": 0.5069, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 2.2726256983240223, | |
| "grad_norm": 0.4828855445414688, | |
| "learning_rate": 1.1532625189681337e-06, | |
| "loss": 0.4571, | |
| "step": 517 | |
| }, | |
| { | |
| "epoch": 2.2770949720670393, | |
| "grad_norm": 0.49789128855450915, | |
| "learning_rate": 1.1456752655538696e-06, | |
| "loss": 0.4651, | |
| "step": 518 | |
| }, | |
| { | |
| "epoch": 2.281564245810056, | |
| "grad_norm": 0.49890243849633514, | |
| "learning_rate": 1.1380880121396055e-06, | |
| "loss": 0.4757, | |
| "step": 519 | |
| }, | |
| { | |
| "epoch": 2.2860335195530728, | |
| "grad_norm": 0.48825603773254755, | |
| "learning_rate": 1.1305007587253414e-06, | |
| "loss": 0.4668, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 2.2905027932960893, | |
| "grad_norm": 0.48797823917514366, | |
| "learning_rate": 1.1229135053110776e-06, | |
| "loss": 0.4576, | |
| "step": 521 | |
| }, | |
| { | |
| "epoch": 2.2949720670391063, | |
| "grad_norm": 0.4713265522467315, | |
| "learning_rate": 1.1153262518968135e-06, | |
| "loss": 0.4998, | |
| "step": 522 | |
| }, | |
| { | |
| "epoch": 2.299441340782123, | |
| "grad_norm": 0.48731093463299924, | |
| "learning_rate": 1.1077389984825494e-06, | |
| "loss": 0.4583, | |
| "step": 523 | |
| }, | |
| { | |
| "epoch": 2.30391061452514, | |
| "grad_norm": 0.5356992050391173, | |
| "learning_rate": 1.1001517450682853e-06, | |
| "loss": 0.4964, | |
| "step": 524 | |
| }, | |
| { | |
| "epoch": 2.3083798882681563, | |
| "grad_norm": 0.5185855845215128, | |
| "learning_rate": 1.0925644916540214e-06, | |
| "loss": 0.4456, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 2.3128491620111733, | |
| "grad_norm": 0.5340341729511069, | |
| "learning_rate": 1.0849772382397573e-06, | |
| "loss": 0.4667, | |
| "step": 526 | |
| }, | |
| { | |
| "epoch": 2.31731843575419, | |
| "grad_norm": 0.46791114784776194, | |
| "learning_rate": 1.0773899848254933e-06, | |
| "loss": 0.4907, | |
| "step": 527 | |
| }, | |
| { | |
| "epoch": 2.321787709497207, | |
| "grad_norm": 0.5224682012273069, | |
| "learning_rate": 1.0698027314112294e-06, | |
| "loss": 0.4411, | |
| "step": 528 | |
| }, | |
| { | |
| "epoch": 2.3262569832402233, | |
| "grad_norm": 0.5013175127206561, | |
| "learning_rate": 1.062215477996965e-06, | |
| "loss": 0.4872, | |
| "step": 529 | |
| }, | |
| { | |
| "epoch": 2.3307262569832403, | |
| "grad_norm": 0.49567401298066655, | |
| "learning_rate": 1.054628224582701e-06, | |
| "loss": 0.4547, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 2.335195530726257, | |
| "grad_norm": 0.5092075971225333, | |
| "learning_rate": 1.0470409711684371e-06, | |
| "loss": 0.4774, | |
| "step": 531 | |
| }, | |
| { | |
| "epoch": 2.339664804469274, | |
| "grad_norm": 0.5206537811339336, | |
| "learning_rate": 1.039453717754173e-06, | |
| "loss": 0.4593, | |
| "step": 532 | |
| }, | |
| { | |
| "epoch": 2.3441340782122904, | |
| "grad_norm": 0.5247188237207532, | |
| "learning_rate": 1.031866464339909e-06, | |
| "loss": 0.4947, | |
| "step": 533 | |
| }, | |
| { | |
| "epoch": 2.3486033519553073, | |
| "grad_norm": 0.48511496092549006, | |
| "learning_rate": 1.024279210925645e-06, | |
| "loss": 0.4675, | |
| "step": 534 | |
| }, | |
| { | |
| "epoch": 2.353072625698324, | |
| "grad_norm": 0.49031229609343957, | |
| "learning_rate": 1.016691957511381e-06, | |
| "loss": 0.4654, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 2.357541899441341, | |
| "grad_norm": 0.5022437562865831, | |
| "learning_rate": 1.009104704097117e-06, | |
| "loss": 0.4881, | |
| "step": 536 | |
| }, | |
| { | |
| "epoch": 2.3620111731843574, | |
| "grad_norm": 0.48427536854604036, | |
| "learning_rate": 1.0015174506828528e-06, | |
| "loss": 0.4722, | |
| "step": 537 | |
| }, | |
| { | |
| "epoch": 2.3664804469273744, | |
| "grad_norm": 0.4951077677256604, | |
| "learning_rate": 9.93930197268589e-07, | |
| "loss": 0.4816, | |
| "step": 538 | |
| }, | |
| { | |
| "epoch": 2.370949720670391, | |
| "grad_norm": 0.506971274674449, | |
| "learning_rate": 9.863429438543249e-07, | |
| "loss": 0.448, | |
| "step": 539 | |
| }, | |
| { | |
| "epoch": 2.375418994413408, | |
| "grad_norm": 0.4843767211745505, | |
| "learning_rate": 9.787556904400608e-07, | |
| "loss": 0.4644, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 2.3798882681564244, | |
| "grad_norm": 0.4787312911085378, | |
| "learning_rate": 9.711684370257967e-07, | |
| "loss": 0.4681, | |
| "step": 541 | |
| }, | |
| { | |
| "epoch": 2.3843575418994414, | |
| "grad_norm": 0.5040950848521704, | |
| "learning_rate": 9.635811836115328e-07, | |
| "loss": 0.4595, | |
| "step": 542 | |
| }, | |
| { | |
| "epoch": 2.388826815642458, | |
| "grad_norm": 0.4742009596031962, | |
| "learning_rate": 9.559939301972687e-07, | |
| "loss": 0.4542, | |
| "step": 543 | |
| }, | |
| { | |
| "epoch": 2.393296089385475, | |
| "grad_norm": 0.4819914757091554, | |
| "learning_rate": 9.484066767830046e-07, | |
| "loss": 0.4818, | |
| "step": 544 | |
| }, | |
| { | |
| "epoch": 2.3977653631284914, | |
| "grad_norm": 0.4794763925814031, | |
| "learning_rate": 9.408194233687407e-07, | |
| "loss": 0.4459, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 2.4022346368715084, | |
| "grad_norm": 0.5065706041345039, | |
| "learning_rate": 9.332321699544765e-07, | |
| "loss": 0.4986, | |
| "step": 546 | |
| }, | |
| { | |
| "epoch": 2.4067039106145254, | |
| "grad_norm": 0.4918492784353524, | |
| "learning_rate": 9.256449165402125e-07, | |
| "loss": 0.4618, | |
| "step": 547 | |
| }, | |
| { | |
| "epoch": 2.411173184357542, | |
| "grad_norm": 0.48410824249150014, | |
| "learning_rate": 9.180576631259484e-07, | |
| "loss": 0.4358, | |
| "step": 548 | |
| }, | |
| { | |
| "epoch": 2.4156424581005584, | |
| "grad_norm": 0.5071386174484, | |
| "learning_rate": 9.104704097116844e-07, | |
| "loss": 0.4522, | |
| "step": 549 | |
| }, | |
| { | |
| "epoch": 2.4201117318435754, | |
| "grad_norm": 0.5179035085963222, | |
| "learning_rate": 9.028831562974203e-07, | |
| "loss": 0.4718, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 2.4245810055865924, | |
| "grad_norm": 0.48695002255677194, | |
| "learning_rate": 8.952959028831563e-07, | |
| "loss": 0.4946, | |
| "step": 551 | |
| }, | |
| { | |
| "epoch": 2.429050279329609, | |
| "grad_norm": 0.4697467884861594, | |
| "learning_rate": 8.877086494688924e-07, | |
| "loss": 0.482, | |
| "step": 552 | |
| }, | |
| { | |
| "epoch": 2.4335195530726255, | |
| "grad_norm": 0.5039869709064255, | |
| "learning_rate": 8.801213960546283e-07, | |
| "loss": 0.4528, | |
| "step": 553 | |
| }, | |
| { | |
| "epoch": 2.4379888268156424, | |
| "grad_norm": 0.4956541767737936, | |
| "learning_rate": 8.725341426403643e-07, | |
| "loss": 0.4416, | |
| "step": 554 | |
| }, | |
| { | |
| "epoch": 2.4424581005586594, | |
| "grad_norm": 0.5220748437476452, | |
| "learning_rate": 8.649468892261002e-07, | |
| "loss": 0.4534, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 2.446927374301676, | |
| "grad_norm": 0.5004114462806725, | |
| "learning_rate": 8.573596358118362e-07, | |
| "loss": 0.4523, | |
| "step": 556 | |
| }, | |
| { | |
| "epoch": 2.451396648044693, | |
| "grad_norm": 0.5002883358165275, | |
| "learning_rate": 8.497723823975721e-07, | |
| "loss": 0.4483, | |
| "step": 557 | |
| }, | |
| { | |
| "epoch": 2.4558659217877095, | |
| "grad_norm": 0.5001315929982186, | |
| "learning_rate": 8.421851289833082e-07, | |
| "loss": 0.4547, | |
| "step": 558 | |
| }, | |
| { | |
| "epoch": 2.4603351955307264, | |
| "grad_norm": 0.5141518132214271, | |
| "learning_rate": 8.345978755690441e-07, | |
| "loss": 0.5064, | |
| "step": 559 | |
| }, | |
| { | |
| "epoch": 2.464804469273743, | |
| "grad_norm": 0.4960566337702956, | |
| "learning_rate": 8.270106221547801e-07, | |
| "loss": 0.4473, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 2.464804469273743, | |
| "eval_loss": 0.4975128769874573, | |
| "eval_runtime": 32.1892, | |
| "eval_samples_per_second": 82.947, | |
| "eval_steps_per_second": 5.188, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 2.46927374301676, | |
| "grad_norm": 0.4897198333575608, | |
| "learning_rate": 8.19423368740516e-07, | |
| "loss": 0.4559, | |
| "step": 561 | |
| }, | |
| { | |
| "epoch": 2.4737430167597765, | |
| "grad_norm": 0.49985877359817477, | |
| "learning_rate": 8.118361153262519e-07, | |
| "loss": 0.4651, | |
| "step": 562 | |
| }, | |
| { | |
| "epoch": 2.4782122905027935, | |
| "grad_norm": 0.49485306049230043, | |
| "learning_rate": 8.042488619119878e-07, | |
| "loss": 0.4803, | |
| "step": 563 | |
| }, | |
| { | |
| "epoch": 2.48268156424581, | |
| "grad_norm": 0.5208900276481403, | |
| "learning_rate": 7.966616084977239e-07, | |
| "loss": 0.4553, | |
| "step": 564 | |
| }, | |
| { | |
| "epoch": 2.487150837988827, | |
| "grad_norm": 0.48837650054879367, | |
| "learning_rate": 7.890743550834598e-07, | |
| "loss": 0.4681, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 2.4916201117318435, | |
| "grad_norm": 0.47305516115923485, | |
| "learning_rate": 7.814871016691958e-07, | |
| "loss": 0.4564, | |
| "step": 566 | |
| }, | |
| { | |
| "epoch": 2.4960893854748605, | |
| "grad_norm": 0.5049602967209187, | |
| "learning_rate": 7.738998482549317e-07, | |
| "loss": 0.4632, | |
| "step": 567 | |
| }, | |
| { | |
| "epoch": 2.500558659217877, | |
| "grad_norm": 0.48547661489581506, | |
| "learning_rate": 7.663125948406677e-07, | |
| "loss": 0.448, | |
| "step": 568 | |
| }, | |
| { | |
| "epoch": 2.505027932960894, | |
| "grad_norm": 0.5747686136950829, | |
| "learning_rate": 7.587253414264036e-07, | |
| "loss": 0.4432, | |
| "step": 569 | |
| }, | |
| { | |
| "epoch": 2.5094972067039105, | |
| "grad_norm": 0.5499322063415041, | |
| "learning_rate": 7.511380880121397e-07, | |
| "loss": 0.432, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 2.5139664804469275, | |
| "grad_norm": 0.49909881450641125, | |
| "learning_rate": 7.435508345978757e-07, | |
| "loss": 0.4325, | |
| "step": 571 | |
| }, | |
| { | |
| "epoch": 2.518435754189944, | |
| "grad_norm": 0.5137001617086465, | |
| "learning_rate": 7.359635811836116e-07, | |
| "loss": 0.4374, | |
| "step": 572 | |
| }, | |
| { | |
| "epoch": 2.522905027932961, | |
| "grad_norm": 0.5043757440405017, | |
| "learning_rate": 7.283763277693476e-07, | |
| "loss": 0.4751, | |
| "step": 573 | |
| }, | |
| { | |
| "epoch": 2.5273743016759775, | |
| "grad_norm": 0.4882894387336232, | |
| "learning_rate": 7.207890743550835e-07, | |
| "loss": 0.4368, | |
| "step": 574 | |
| }, | |
| { | |
| "epoch": 2.5318435754189945, | |
| "grad_norm": 0.5302357654250439, | |
| "learning_rate": 7.132018209408196e-07, | |
| "loss": 0.4174, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 2.536312849162011, | |
| "grad_norm": 0.5229623842607946, | |
| "learning_rate": 7.056145675265555e-07, | |
| "loss": 0.4353, | |
| "step": 576 | |
| }, | |
| { | |
| "epoch": 2.540782122905028, | |
| "grad_norm": 0.5145290981136341, | |
| "learning_rate": 6.980273141122915e-07, | |
| "loss": 0.4278, | |
| "step": 577 | |
| }, | |
| { | |
| "epoch": 2.5452513966480446, | |
| "grad_norm": 0.49830066459201744, | |
| "learning_rate": 6.904400606980274e-07, | |
| "loss": 0.434, | |
| "step": 578 | |
| }, | |
| { | |
| "epoch": 2.5497206703910615, | |
| "grad_norm": 0.5220459047427843, | |
| "learning_rate": 6.828528072837633e-07, | |
| "loss": 0.4375, | |
| "step": 579 | |
| }, | |
| { | |
| "epoch": 2.554189944134078, | |
| "grad_norm": 0.5111391623585094, | |
| "learning_rate": 6.752655538694992e-07, | |
| "loss": 0.4295, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 2.558659217877095, | |
| "grad_norm": 0.4944043554942434, | |
| "learning_rate": 6.676783004552352e-07, | |
| "loss": 0.4205, | |
| "step": 581 | |
| }, | |
| { | |
| "epoch": 2.5631284916201116, | |
| "grad_norm": 0.49154601956517674, | |
| "learning_rate": 6.600910470409712e-07, | |
| "loss": 0.4502, | |
| "step": 582 | |
| }, | |
| { | |
| "epoch": 2.5675977653631286, | |
| "grad_norm": 0.5008348019898189, | |
| "learning_rate": 6.525037936267072e-07, | |
| "loss": 0.4453, | |
| "step": 583 | |
| }, | |
| { | |
| "epoch": 2.572067039106145, | |
| "grad_norm": 0.5005572885290496, | |
| "learning_rate": 6.449165402124431e-07, | |
| "loss": 0.4353, | |
| "step": 584 | |
| }, | |
| { | |
| "epoch": 2.576536312849162, | |
| "grad_norm": 0.5302967367534124, | |
| "learning_rate": 6.373292867981791e-07, | |
| "loss": 0.4269, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 2.5810055865921786, | |
| "grad_norm": 0.5050943269077487, | |
| "learning_rate": 6.29742033383915e-07, | |
| "loss": 0.4372, | |
| "step": 586 | |
| }, | |
| { | |
| "epoch": 2.5854748603351956, | |
| "grad_norm": 0.5223556534474804, | |
| "learning_rate": 6.22154779969651e-07, | |
| "loss": 0.445, | |
| "step": 587 | |
| }, | |
| { | |
| "epoch": 2.5899441340782126, | |
| "grad_norm": 0.4891137574673572, | |
| "learning_rate": 6.14567526555387e-07, | |
| "loss": 0.4171, | |
| "step": 588 | |
| }, | |
| { | |
| "epoch": 2.594413407821229, | |
| "grad_norm": 0.5172276124686148, | |
| "learning_rate": 6.06980273141123e-07, | |
| "loss": 0.4522, | |
| "step": 589 | |
| }, | |
| { | |
| "epoch": 2.5988826815642456, | |
| "grad_norm": 0.48873823307103154, | |
| "learning_rate": 5.99393019726859e-07, | |
| "loss": 0.4286, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 2.6033519553072626, | |
| "grad_norm": 0.502230133682332, | |
| "learning_rate": 5.918057663125948e-07, | |
| "loss": 0.4461, | |
| "step": 591 | |
| }, | |
| { | |
| "epoch": 2.6078212290502796, | |
| "grad_norm": 0.4898396410998909, | |
| "learning_rate": 5.842185128983308e-07, | |
| "loss": 0.4289, | |
| "step": 592 | |
| }, | |
| { | |
| "epoch": 2.612290502793296, | |
| "grad_norm": 0.48422568768513635, | |
| "learning_rate": 5.766312594840668e-07, | |
| "loss": 0.4423, | |
| "step": 593 | |
| }, | |
| { | |
| "epoch": 2.6167597765363126, | |
| "grad_norm": 0.5003994120834477, | |
| "learning_rate": 5.690440060698028e-07, | |
| "loss": 0.437, | |
| "step": 594 | |
| }, | |
| { | |
| "epoch": 2.6212290502793296, | |
| "grad_norm": 0.5079705228889307, | |
| "learning_rate": 5.614567526555388e-07, | |
| "loss": 0.4516, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 2.6256983240223466, | |
| "grad_norm": 0.49246631419316356, | |
| "learning_rate": 5.538694992412747e-07, | |
| "loss": 0.4368, | |
| "step": 596 | |
| }, | |
| { | |
| "epoch": 2.630167597765363, | |
| "grad_norm": 0.5086922889782935, | |
| "learning_rate": 5.462822458270107e-07, | |
| "loss": 0.4227, | |
| "step": 597 | |
| }, | |
| { | |
| "epoch": 2.6346368715083797, | |
| "grad_norm": 0.4780759168718152, | |
| "learning_rate": 5.386949924127466e-07, | |
| "loss": 0.4462, | |
| "step": 598 | |
| }, | |
| { | |
| "epoch": 2.6391061452513966, | |
| "grad_norm": 0.464930849547806, | |
| "learning_rate": 5.311077389984825e-07, | |
| "loss": 0.4334, | |
| "step": 599 | |
| }, | |
| { | |
| "epoch": 2.6435754189944136, | |
| "grad_norm": 0.5002300455424397, | |
| "learning_rate": 5.235204855842186e-07, | |
| "loss": 0.4641, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 2.64804469273743, | |
| "grad_norm": 0.4901819784256759, | |
| "learning_rate": 5.159332321699545e-07, | |
| "loss": 0.4534, | |
| "step": 601 | |
| }, | |
| { | |
| "epoch": 2.6525139664804467, | |
| "grad_norm": 0.48520187595124403, | |
| "learning_rate": 5.083459787556905e-07, | |
| "loss": 0.4246, | |
| "step": 602 | |
| }, | |
| { | |
| "epoch": 2.6569832402234637, | |
| "grad_norm": 0.5093634792454178, | |
| "learning_rate": 5.007587253414264e-07, | |
| "loss": 0.458, | |
| "step": 603 | |
| }, | |
| { | |
| "epoch": 2.6614525139664806, | |
| "grad_norm": 0.47795678253705354, | |
| "learning_rate": 4.931714719271624e-07, | |
| "loss": 0.4535, | |
| "step": 604 | |
| }, | |
| { | |
| "epoch": 2.665921787709497, | |
| "grad_norm": 0.48340944166993405, | |
| "learning_rate": 4.855842185128983e-07, | |
| "loss": 0.4721, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 2.6703910614525137, | |
| "grad_norm": 0.47996848455446794, | |
| "learning_rate": 4.779969650986344e-07, | |
| "loss": 0.4718, | |
| "step": 606 | |
| }, | |
| { | |
| "epoch": 2.6748603351955307, | |
| "grad_norm": 0.5064110255390989, | |
| "learning_rate": 4.7040971168437033e-07, | |
| "loss": 0.4232, | |
| "step": 607 | |
| }, | |
| { | |
| "epoch": 2.6793296089385477, | |
| "grad_norm": 0.4824240302682651, | |
| "learning_rate": 4.6282245827010624e-07, | |
| "loss": 0.4307, | |
| "step": 608 | |
| }, | |
| { | |
| "epoch": 2.683798882681564, | |
| "grad_norm": 0.4920800315955025, | |
| "learning_rate": 4.552352048558422e-07, | |
| "loss": 0.4435, | |
| "step": 609 | |
| }, | |
| { | |
| "epoch": 2.688268156424581, | |
| "grad_norm": 0.48921504704563673, | |
| "learning_rate": 4.476479514415782e-07, | |
| "loss": 0.4409, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 2.6927374301675977, | |
| "grad_norm": 0.500357475284234, | |
| "learning_rate": 4.4006069802731414e-07, | |
| "loss": 0.4512, | |
| "step": 611 | |
| }, | |
| { | |
| "epoch": 2.6972067039106147, | |
| "grad_norm": 0.4772853981005586, | |
| "learning_rate": 4.324734446130501e-07, | |
| "loss": 0.4305, | |
| "step": 612 | |
| }, | |
| { | |
| "epoch": 2.701675977653631, | |
| "grad_norm": 0.5075964338612743, | |
| "learning_rate": 4.248861911987861e-07, | |
| "loss": 0.4285, | |
| "step": 613 | |
| }, | |
| { | |
| "epoch": 2.706145251396648, | |
| "grad_norm": 0.4916721748305679, | |
| "learning_rate": 4.1729893778452204e-07, | |
| "loss": 0.4586, | |
| "step": 614 | |
| }, | |
| { | |
| "epoch": 2.7106145251396647, | |
| "grad_norm": 0.4746003660659448, | |
| "learning_rate": 4.09711684370258e-07, | |
| "loss": 0.4385, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 2.7150837988826817, | |
| "grad_norm": 0.49004649736827927, | |
| "learning_rate": 4.021244309559939e-07, | |
| "loss": 0.4349, | |
| "step": 616 | |
| }, | |
| { | |
| "epoch": 2.7195530726256982, | |
| "grad_norm": 0.5055169203153188, | |
| "learning_rate": 3.945371775417299e-07, | |
| "loss": 0.4741, | |
| "step": 617 | |
| }, | |
| { | |
| "epoch": 2.724022346368715, | |
| "grad_norm": 0.5491416139361283, | |
| "learning_rate": 3.8694992412746586e-07, | |
| "loss": 0.4285, | |
| "step": 618 | |
| }, | |
| { | |
| "epoch": 2.7284916201117317, | |
| "grad_norm": 0.4860669923591883, | |
| "learning_rate": 3.793626707132018e-07, | |
| "loss": 0.4147, | |
| "step": 619 | |
| }, | |
| { | |
| "epoch": 2.7329608938547487, | |
| "grad_norm": 0.48537127261627433, | |
| "learning_rate": 3.7177541729893784e-07, | |
| "loss": 0.4538, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 2.7374301675977653, | |
| "grad_norm": 0.47557197343237195, | |
| "learning_rate": 3.641881638846738e-07, | |
| "loss": 0.454, | |
| "step": 621 | |
| }, | |
| { | |
| "epoch": 2.7418994413407822, | |
| "grad_norm": 0.4844044051975442, | |
| "learning_rate": 3.566009104704098e-07, | |
| "loss": 0.4273, | |
| "step": 622 | |
| }, | |
| { | |
| "epoch": 2.7463687150837988, | |
| "grad_norm": 0.5000172198487197, | |
| "learning_rate": 3.4901365705614574e-07, | |
| "loss": 0.4315, | |
| "step": 623 | |
| }, | |
| { | |
| "epoch": 2.7508379888268157, | |
| "grad_norm": 0.4848986778566373, | |
| "learning_rate": 3.4142640364188166e-07, | |
| "loss": 0.4495, | |
| "step": 624 | |
| }, | |
| { | |
| "epoch": 2.7553072625698323, | |
| "grad_norm": 0.48082262468066483, | |
| "learning_rate": 3.338391502276176e-07, | |
| "loss": 0.4185, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 2.7597765363128492, | |
| "grad_norm": 0.4865964770369407, | |
| "learning_rate": 3.262518968133536e-07, | |
| "loss": 0.4538, | |
| "step": 626 | |
| }, | |
| { | |
| "epoch": 2.764245810055866, | |
| "grad_norm": 0.4815694373638968, | |
| "learning_rate": 3.1866464339908956e-07, | |
| "loss": 0.459, | |
| "step": 627 | |
| }, | |
| { | |
| "epoch": 2.7687150837988828, | |
| "grad_norm": 0.49485301744734556, | |
| "learning_rate": 3.110773899848255e-07, | |
| "loss": 0.4527, | |
| "step": 628 | |
| }, | |
| { | |
| "epoch": 2.7731843575418993, | |
| "grad_norm": 0.5214886405175438, | |
| "learning_rate": 3.034901365705615e-07, | |
| "loss": 0.4393, | |
| "step": 629 | |
| }, | |
| { | |
| "epoch": 2.7776536312849163, | |
| "grad_norm": 0.4645817613806632, | |
| "learning_rate": 2.959028831562974e-07, | |
| "loss": 0.4723, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 2.782122905027933, | |
| "grad_norm": 0.4887881586065441, | |
| "learning_rate": 2.883156297420334e-07, | |
| "loss": 0.418, | |
| "step": 631 | |
| }, | |
| { | |
| "epoch": 2.78659217877095, | |
| "grad_norm": 0.47562569136752975, | |
| "learning_rate": 2.807283763277694e-07, | |
| "loss": 0.4347, | |
| "step": 632 | |
| }, | |
| { | |
| "epoch": 2.7910614525139668, | |
| "grad_norm": 0.4848933584055101, | |
| "learning_rate": 2.7314112291350536e-07, | |
| "loss": 0.4266, | |
| "step": 633 | |
| }, | |
| { | |
| "epoch": 2.7955307262569833, | |
| "grad_norm": 0.48829866920710585, | |
| "learning_rate": 2.6555386949924127e-07, | |
| "loss": 0.4522, | |
| "step": 634 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 0.4776396492507318, | |
| "learning_rate": 2.5796661608497724e-07, | |
| "loss": 0.4293, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 2.804469273743017, | |
| "grad_norm": 0.48342626581685083, | |
| "learning_rate": 2.503793626707132e-07, | |
| "loss": 0.437, | |
| "step": 636 | |
| }, | |
| { | |
| "epoch": 2.8089385474860338, | |
| "grad_norm": 0.47481545268800873, | |
| "learning_rate": 2.4279210925644917e-07, | |
| "loss": 0.4333, | |
| "step": 637 | |
| }, | |
| { | |
| "epoch": 2.8134078212290503, | |
| "grad_norm": 0.5105857690012979, | |
| "learning_rate": 2.3520485584218516e-07, | |
| "loss": 0.4367, | |
| "step": 638 | |
| }, | |
| { | |
| "epoch": 2.817877094972067, | |
| "grad_norm": 0.49397311695562557, | |
| "learning_rate": 2.276176024279211e-07, | |
| "loss": 0.4355, | |
| "step": 639 | |
| }, | |
| { | |
| "epoch": 2.822346368715084, | |
| "grad_norm": 0.4766966322472669, | |
| "learning_rate": 2.2003034901365707e-07, | |
| "loss": 0.4478, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 2.826815642458101, | |
| "grad_norm": 0.47484165703817144, | |
| "learning_rate": 2.1244309559939304e-07, | |
| "loss": 0.4366, | |
| "step": 641 | |
| }, | |
| { | |
| "epoch": 2.8312849162011173, | |
| "grad_norm": 0.47271019258961905, | |
| "learning_rate": 2.04855842185129e-07, | |
| "loss": 0.4473, | |
| "step": 642 | |
| }, | |
| { | |
| "epoch": 2.835754189944134, | |
| "grad_norm": 0.4668544421054751, | |
| "learning_rate": 1.9726858877086494e-07, | |
| "loss": 0.4341, | |
| "step": 643 | |
| }, | |
| { | |
| "epoch": 2.840223463687151, | |
| "grad_norm": 0.47716986376667936, | |
| "learning_rate": 1.896813353566009e-07, | |
| "loss": 0.4223, | |
| "step": 644 | |
| }, | |
| { | |
| "epoch": 2.844692737430168, | |
| "grad_norm": 0.4939991882199546, | |
| "learning_rate": 1.820940819423369e-07, | |
| "loss": 0.4441, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 2.8491620111731844, | |
| "grad_norm": 0.47705877463455093, | |
| "learning_rate": 1.7450682852807287e-07, | |
| "loss": 0.485, | |
| "step": 646 | |
| }, | |
| { | |
| "epoch": 2.853631284916201, | |
| "grad_norm": 0.49941990019846055, | |
| "learning_rate": 1.669195751138088e-07, | |
| "loss": 0.4701, | |
| "step": 647 | |
| }, | |
| { | |
| "epoch": 2.858100558659218, | |
| "grad_norm": 0.47625945090740057, | |
| "learning_rate": 1.5933232169954478e-07, | |
| "loss": 0.454, | |
| "step": 648 | |
| }, | |
| { | |
| "epoch": 2.862569832402235, | |
| "grad_norm": 0.5028271948555065, | |
| "learning_rate": 1.5174506828528074e-07, | |
| "loss": 0.4296, | |
| "step": 649 | |
| }, | |
| { | |
| "epoch": 2.8670391061452514, | |
| "grad_norm": 0.49358352504671604, | |
| "learning_rate": 1.441578148710167e-07, | |
| "loss": 0.4324, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 2.871508379888268, | |
| "grad_norm": 0.48576223310259714, | |
| "learning_rate": 1.3657056145675268e-07, | |
| "loss": 0.4424, | |
| "step": 651 | |
| }, | |
| { | |
| "epoch": 2.875977653631285, | |
| "grad_norm": 0.47971171549572994, | |
| "learning_rate": 1.2898330804248862e-07, | |
| "loss": 0.4555, | |
| "step": 652 | |
| }, | |
| { | |
| "epoch": 2.880446927374302, | |
| "grad_norm": 0.49955392754214234, | |
| "learning_rate": 1.2139605462822459e-07, | |
| "loss": 0.4338, | |
| "step": 653 | |
| }, | |
| { | |
| "epoch": 2.8849162011173184, | |
| "grad_norm": 0.47232821100099637, | |
| "learning_rate": 1.1380880121396055e-07, | |
| "loss": 0.4349, | |
| "step": 654 | |
| }, | |
| { | |
| "epoch": 2.889385474860335, | |
| "grad_norm": 0.4834170945858496, | |
| "learning_rate": 1.0622154779969652e-07, | |
| "loss": 0.4564, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 2.893854748603352, | |
| "grad_norm": 0.48493984226629705, | |
| "learning_rate": 9.863429438543247e-08, | |
| "loss": 0.4384, | |
| "step": 656 | |
| }, | |
| { | |
| "epoch": 2.898324022346369, | |
| "grad_norm": 0.4601302405573049, | |
| "learning_rate": 9.104704097116845e-08, | |
| "loss": 0.4512, | |
| "step": 657 | |
| }, | |
| { | |
| "epoch": 2.9027932960893854, | |
| "grad_norm": 0.4768143158802154, | |
| "learning_rate": 8.34597875569044e-08, | |
| "loss": 0.4354, | |
| "step": 658 | |
| }, | |
| { | |
| "epoch": 2.9072625698324024, | |
| "grad_norm": 0.49427262358096746, | |
| "learning_rate": 7.587253414264037e-08, | |
| "loss": 0.4259, | |
| "step": 659 | |
| }, | |
| { | |
| "epoch": 2.911731843575419, | |
| "grad_norm": 0.4749814243858473, | |
| "learning_rate": 6.828528072837634e-08, | |
| "loss": 0.4445, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 2.916201117318436, | |
| "grad_norm": 0.48533516583361297, | |
| "learning_rate": 6.069802731411229e-08, | |
| "loss": 0.4799, | |
| "step": 661 | |
| }, | |
| { | |
| "epoch": 2.9206703910614524, | |
| "grad_norm": 0.4839142899304728, | |
| "learning_rate": 5.311077389984826e-08, | |
| "loss": 0.4574, | |
| "step": 662 | |
| }, | |
| { | |
| "epoch": 2.9251396648044694, | |
| "grad_norm": 0.486543073762813, | |
| "learning_rate": 4.5523520485584226e-08, | |
| "loss": 0.4446, | |
| "step": 663 | |
| }, | |
| { | |
| "epoch": 2.929608938547486, | |
| "grad_norm": 0.4980109939724879, | |
| "learning_rate": 3.7936267071320186e-08, | |
| "loss": 0.4315, | |
| "step": 664 | |
| }, | |
| { | |
| "epoch": 2.934078212290503, | |
| "grad_norm": 0.48840296818117557, | |
| "learning_rate": 3.0349013657056146e-08, | |
| "loss": 0.4318, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 2.9385474860335195, | |
| "grad_norm": 0.4903615270099699, | |
| "learning_rate": 2.2761760242792113e-08, | |
| "loss": 0.4357, | |
| "step": 666 | |
| }, | |
| { | |
| "epoch": 2.9430167597765364, | |
| "grad_norm": 0.48850430880690693, | |
| "learning_rate": 1.5174506828528073e-08, | |
| "loss": 0.4334, | |
| "step": 667 | |
| }, | |
| { | |
| "epoch": 2.947486033519553, | |
| "grad_norm": 0.4773488572878224, | |
| "learning_rate": 7.587253414264037e-09, | |
| "loss": 0.4281, | |
| "step": 668 | |
| }, | |
| { | |
| "epoch": 2.95195530726257, | |
| "grad_norm": 0.4760606483764159, | |
| "learning_rate": 0.0, | |
| "loss": 0.4504, | |
| "step": 669 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 669, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 56, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 559881199288320.0, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |