{
  "best_global_step": null,
  "best_metric": 6.391010284423828,
  "best_model_checkpoint": null,
  "epoch": 1.04632568359375,
  "eval_steps": 5000,
  "global_step": 524288,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00095367431640625,
      "grad_norm": 0.6624638438224792,
      "learning_rate": 4.995241165161133e-05,
      "loss": 8.9018,
      "step": 500
    },
    {
      "epoch": 0.0019073486328125,
      "grad_norm": 0.6431354284286499,
      "learning_rate": 4.990472793579102e-05,
      "loss": 8.2495,
      "step": 1000
    },
    {
      "epoch": 0.00286102294921875,
      "grad_norm": 0.5842123031616211,
      "learning_rate": 4.98570442199707e-05,
      "loss": 7.7924,
      "step": 1500
    },
    {
      "epoch": 0.003814697265625,
      "grad_norm": 0.5727596282958984,
      "learning_rate": 4.9809360504150393e-05,
      "loss": 7.5279,
      "step": 2000
    },
    {
      "epoch": 0.00476837158203125,
      "grad_norm": 0.45176011323928833,
      "learning_rate": 4.9761676788330084e-05,
      "loss": 7.3262,
      "step": 2500
    },
    {
      "epoch": 0.0057220458984375,
      "grad_norm": 0.4838062822818756,
      "learning_rate": 4.971399307250977e-05,
      "loss": 7.219,
      "step": 3000
    },
    {
      "epoch": 0.00667572021484375,
      "grad_norm": 0.56992506980896,
      "learning_rate": 4.966630935668946e-05,
      "loss": 7.1168,
      "step": 3500
    },
    {
      "epoch": 0.00762939453125,
      "grad_norm": 0.47706305980682373,
      "learning_rate": 4.961862564086914e-05,
      "loss": 7.0533,
      "step": 4000
    },
    {
      "epoch": 0.00858306884765625,
      "grad_norm": 0.4582304358482361,
      "learning_rate": 4.957094192504883e-05,
      "loss": 6.9537,
      "step": 4500
    },
    {
      "epoch": 0.0095367431640625,
      "grad_norm": 0.4827233850955963,
      "learning_rate": 4.952325820922852e-05,
      "loss": 6.9745,
      "step": 5000
    },
    {
      "epoch": 0.0095367431640625,
      "eval_accuracy": 0.031426614481409,
      "eval_loss": 6.906828880310059,
      "eval_runtime": 236.8104,
      "eval_samples_per_second": 42.228,
      "eval_steps_per_second": 10.557,
      "step": 5000
    },
    {
      "epoch": 0.01049041748046875,
      "grad_norm": 0.4683932960033417,
      "learning_rate": 4.9475574493408205e-05,
      "loss": 6.9377,
      "step": 5500
    },
    {
      "epoch": 0.011444091796875,
      "grad_norm": 0.43654170632362366,
      "learning_rate": 4.9427890777587895e-05,
      "loss": 6.9642,
      "step": 6000
    },
    {
      "epoch": 0.01239776611328125,
      "grad_norm": 0.4411381483078003,
      "learning_rate": 4.938020706176758e-05,
      "loss": 6.9223,
      "step": 6500
    },
    {
      "epoch": 0.0133514404296875,
      "grad_norm": 0.5460022687911987,
      "learning_rate": 4.933252334594727e-05,
      "loss": 6.8951,
      "step": 7000
    },
    {
      "epoch": 0.01430511474609375,
      "grad_norm": 0.48390814661979675,
      "learning_rate": 4.928483963012696e-05,
      "loss": 6.8717,
      "step": 7500
    },
    {
      "epoch": 0.0152587890625,
      "grad_norm": 0.5311650037765503,
      "learning_rate": 4.923715591430664e-05,
      "loss": 6.8812,
      "step": 8000
    },
    {
      "epoch": 0.01621246337890625,
      "grad_norm": 0.5130866765975952,
      "learning_rate": 4.918947219848633e-05,
      "loss": 6.8148,
      "step": 8500
    },
    {
      "epoch": 0.0171661376953125,
      "grad_norm": 0.5294931530952454,
      "learning_rate": 4.9141788482666016e-05,
      "loss": 6.787,
      "step": 9000
    },
    {
      "epoch": 0.01811981201171875,
      "grad_norm": 0.544792652130127,
      "learning_rate": 4.9094104766845706e-05,
      "loss": 6.7906,
      "step": 9500
    },
    {
      "epoch": 0.019073486328125,
      "grad_norm": 0.568236768245697,
      "learning_rate": 4.9046421051025396e-05,
      "loss": 6.7778,
      "step": 10000
    },
    {
      "epoch": 0.019073486328125,
      "eval_accuracy": 0.031314090019569474,
      "eval_loss": 6.7409868240356445,
      "eval_runtime": 236.3476,
      "eval_samples_per_second": 42.311,
      "eval_steps_per_second": 10.578,
      "step": 10000
    },
    {
      "epoch": 0.02002716064453125,
      "grad_norm": 0.7601892352104187,
      "learning_rate": 4.899873733520508e-05,
      "loss": 6.7391,
      "step": 10500
    },
    {
      "epoch": 0.0209808349609375,
      "grad_norm": 0.698943555355072,
      "learning_rate": 4.895105361938477e-05,
      "loss": 6.757,
      "step": 11000
    },
    {
      "epoch": 0.02193450927734375,
      "grad_norm": 0.6955975294113159,
      "learning_rate": 4.890336990356445e-05,
      "loss": 6.7612,
      "step": 11500
    },
    {
      "epoch": 0.02288818359375,
      "grad_norm": 0.625805675983429,
      "learning_rate": 4.8855686187744143e-05,
      "loss": 6.808,
      "step": 12000
    },
    {
      "epoch": 0.02384185791015625,
      "grad_norm": 0.8192827701568604,
      "learning_rate": 4.8808002471923834e-05,
      "loss": 6.7573,
      "step": 12500
    },
    {
      "epoch": 0.0247955322265625,
      "grad_norm": 0.9873795509338379,
      "learning_rate": 4.876031875610352e-05,
      "loss": 6.7737,
      "step": 13000
    },
    {
      "epoch": 0.02574920654296875,
      "grad_norm": 1.016819953918457,
      "learning_rate": 4.871263504028321e-05,
      "loss": 6.6898,
      "step": 13500
    },
    {
      "epoch": 0.026702880859375,
      "grad_norm": 1.170738935470581,
      "learning_rate": 4.866495132446289e-05,
      "loss": 6.7066,
      "step": 14000
    },
    {
      "epoch": 0.02765655517578125,
      "grad_norm": 2.338644027709961,
      "learning_rate": 4.861726760864258e-05,
      "loss": 6.7263,
      "step": 14500
    },
    {
      "epoch": 0.0286102294921875,
      "grad_norm": 1.3942464590072632,
      "learning_rate": 4.856958389282227e-05,
      "loss": 6.715,
      "step": 15000
    },
    {
      "epoch": 0.0286102294921875,
      "eval_accuracy": 0.03333365949119374,
      "eval_loss": 6.643101692199707,
      "eval_runtime": 240.8417,
      "eval_samples_per_second": 41.521,
      "eval_steps_per_second": 10.38,
      "step": 15000
    },
    {
      "epoch": 0.02956390380859375,
      "grad_norm": 1.3489205837249756,
      "learning_rate": 4.8521900177001955e-05,
      "loss": 6.7263,
      "step": 15500
    },
    {
      "epoch": 0.030517578125,
      "grad_norm": 1.4647587537765503,
      "learning_rate": 4.8474216461181645e-05,
      "loss": 6.6834,
      "step": 16000
    },
    {
      "epoch": 0.03147125244140625,
      "grad_norm": 1.9520461559295654,
      "learning_rate": 4.842653274536133e-05,
      "loss": 6.6735,
      "step": 16500
    },
    {
      "epoch": 0.0324249267578125,
      "grad_norm": 1.378394365310669,
      "learning_rate": 4.837884902954102e-05,
      "loss": 6.6402,
      "step": 17000
    },
    {
      "epoch": 0.03337860107421875,
      "grad_norm": 1.9632004499435425,
      "learning_rate": 4.833116531372071e-05,
      "loss": 6.6743,
      "step": 17500
    },
    {
      "epoch": 0.034332275390625,
      "grad_norm": 2.1774470806121826,
      "learning_rate": 4.828348159790039e-05,
      "loss": 6.71,
      "step": 18000
    },
    {
      "epoch": 0.03528594970703125,
      "grad_norm": 2.1665990352630615,
      "learning_rate": 4.823579788208008e-05,
      "loss": 6.6658,
      "step": 18500
    },
    {
      "epoch": 0.0362396240234375,
      "grad_norm": 1.8897229433059692,
      "learning_rate": 4.8188114166259766e-05,
      "loss": 6.6589,
      "step": 19000
    },
    {
      "epoch": 0.03719329833984375,
      "grad_norm": 3.076052188873291,
      "learning_rate": 4.8140430450439456e-05,
      "loss": 6.6456,
      "step": 19500
    },
    {
      "epoch": 0.03814697265625,
      "grad_norm": 2.2930281162261963,
      "learning_rate": 4.8092746734619146e-05,
      "loss": 6.6547,
      "step": 20000
    },
    {
      "epoch": 0.03814697265625,
      "eval_accuracy": 0.0334894324853229,
      "eval_loss": 6.578402996063232,
      "eval_runtime": 239.0673,
      "eval_samples_per_second": 41.829,
      "eval_steps_per_second": 10.457,
      "step": 20000
    },
    {
      "epoch": 0.03910064697265625,
      "grad_norm": 4.879852771759033,
      "learning_rate": 4.804506301879883e-05,
      "loss": 6.6624,
      "step": 20500
    },
    {
      "epoch": 0.0400543212890625,
      "grad_norm": 4.431488037109375,
      "learning_rate": 4.799737930297852e-05,
      "loss": 6.6653,
      "step": 21000
    },
    {
      "epoch": 0.04100799560546875,
      "grad_norm": 2.174495220184326,
      "learning_rate": 4.79496955871582e-05,
      "loss": 6.6332,
      "step": 21500
    },
    {
      "epoch": 0.041961669921875,
      "grad_norm": 3.470643997192383,
      "learning_rate": 4.7902011871337893e-05,
      "loss": 6.6515,
      "step": 22000
    },
    {
      "epoch": 0.04291534423828125,
      "grad_norm": 3.1371328830718994,
      "learning_rate": 4.7854328155517584e-05,
      "loss": 6.6232,
      "step": 22500
    },
    {
      "epoch": 0.0438690185546875,
      "grad_norm": 4.542148113250732,
      "learning_rate": 4.780664443969727e-05,
      "loss": 6.6196,
      "step": 23000
    },
    {
      "epoch": 0.04482269287109375,
      "grad_norm": 2.4484500885009766,
      "learning_rate": 4.775896072387696e-05,
      "loss": 6.5961,
      "step": 23500
    },
    {
      "epoch": 0.0457763671875,
      "grad_norm": 3.348109006881714,
      "learning_rate": 4.771127700805664e-05,
      "loss": 6.6348,
      "step": 24000
    },
    {
      "epoch": 0.04673004150390625,
      "grad_norm": 6.815830707550049,
      "learning_rate": 4.766359329223633e-05,
      "loss": 6.645,
      "step": 24500
    },
    {
      "epoch": 0.0476837158203125,
      "grad_norm": 4.380649566650391,
      "learning_rate": 4.761590957641602e-05,
      "loss": 6.5695,
      "step": 25000
    },
    {
      "epoch": 0.0476837158203125,
      "eval_accuracy": 0.03409236790606654,
      "eval_loss": 6.529794692993164,
      "eval_runtime": 239.0015,
      "eval_samples_per_second": 41.841,
      "eval_steps_per_second": 10.46,
      "step": 25000
    },
    {
      "epoch": 0.04863739013671875,
      "grad_norm": 4.102541446685791,
      "learning_rate": 4.7568225860595705e-05,
      "loss": 6.6159,
      "step": 25500
    },
    {
      "epoch": 0.049591064453125,
      "grad_norm": 4.208073139190674,
      "learning_rate": 4.7520542144775395e-05,
      "loss": 6.5972,
      "step": 26000
    },
    {
      "epoch": 0.05054473876953125,
      "grad_norm": 4.404722690582275,
      "learning_rate": 4.747285842895508e-05,
      "loss": 6.5881,
      "step": 26500
    },
    {
      "epoch": 0.0514984130859375,
      "grad_norm": 4.774337291717529,
      "learning_rate": 4.742517471313477e-05,
      "loss": 6.6055,
      "step": 27000
    },
    {
      "epoch": 0.05245208740234375,
      "grad_norm": 4.009048938751221,
      "learning_rate": 4.737749099731446e-05,
      "loss": 6.5879,
      "step": 27500
    },
    {
      "epoch": 0.05340576171875,
      "grad_norm": 7.490168571472168,
      "learning_rate": 4.732980728149414e-05,
      "loss": 6.5723,
      "step": 28000
    },
    {
      "epoch": 0.05435943603515625,
      "grad_norm": 12.653952598571777,
      "learning_rate": 4.728212356567383e-05,
      "loss": 6.5784,
      "step": 28500
    },
    {
      "epoch": 0.0553131103515625,
      "grad_norm": 6.196599960327148,
      "learning_rate": 4.7234439849853516e-05,
      "loss": 6.5307,
      "step": 29000
    },
    {
      "epoch": 0.05626678466796875,
      "grad_norm": 8.773276329040527,
      "learning_rate": 4.7186756134033206e-05,
      "loss": 6.5646,
      "step": 29500
    },
    {
      "epoch": 0.057220458984375,
      "grad_norm": 6.8058929443359375,
      "learning_rate": 4.7139072418212896e-05,
      "loss": 6.5283,
      "step": 30000
    },
    {
      "epoch": 0.057220458984375,
      "eval_accuracy": 0.03399275929549902,
      "eval_loss": 6.526458740234375,
      "eval_runtime": 241.0876,
      "eval_samples_per_second": 41.479,
      "eval_steps_per_second": 10.37,
      "step": 30000
    },
    {
      "epoch": 0.05817413330078125,
      "grad_norm": 9.646665573120117,
      "learning_rate": 4.709138870239258e-05,
      "loss": 6.5259,
      "step": 30500
    },
    {
      "epoch": 0.0591278076171875,
      "grad_norm": 11.337198257446289,
      "learning_rate": 4.704370498657227e-05,
      "loss": 6.5051,
      "step": 31000
    },
    {
      "epoch": 0.06008148193359375,
      "grad_norm": 17.71413803100586,
      "learning_rate": 4.699602127075195e-05,
      "loss": 6.6211,
      "step": 31500
    },
    {
      "epoch": 0.06103515625,
      "grad_norm": 7.909314155578613,
      "learning_rate": 4.6948337554931643e-05,
      "loss": 6.6081,
      "step": 32000
    },
    {
      "epoch": 0.06198883056640625,
      "grad_norm": 10.922993659973145,
      "learning_rate": 4.6900653839111334e-05,
      "loss": 6.5762,
      "step": 32500
    },
    {
      "epoch": 0.0629425048828125,
      "grad_norm": 17.44997215270996,
      "learning_rate": 4.685297012329102e-05,
      "loss": 6.5255,
      "step": 33000
    },
    {
      "epoch": 0.06389617919921875,
      "grad_norm": 9.439545631408691,
      "learning_rate": 4.680528640747071e-05,
      "loss": 6.6299,
      "step": 33500
    },
    {
      "epoch": 0.064849853515625,
      "grad_norm": 14.736495018005371,
      "learning_rate": 4.675760269165039e-05,
      "loss": 6.5564,
      "step": 34000
    },
    {
      "epoch": 0.06580352783203125,
      "grad_norm": 11.100777626037598,
      "learning_rate": 4.670991897583008e-05,
      "loss": 6.5923,
      "step": 34500
    },
    {
      "epoch": 0.0667572021484375,
      "grad_norm": 15.978229522705078,
      "learning_rate": 4.666223526000977e-05,
      "loss": 6.5826,
      "step": 35000
    },
    {
      "epoch": 0.0667572021484375,
      "eval_accuracy": 0.03404227005870841,
      "eval_loss": 6.490207195281982,
      "eval_runtime": 239.3878,
      "eval_samples_per_second": 41.773,
      "eval_steps_per_second": 10.443,
      "step": 35000
    },
    {
      "epoch": 0.06771087646484375,
      "grad_norm": 9.80553150177002,
      "learning_rate": 4.6614551544189455e-05,
      "loss": 6.574,
      "step": 35500
    },
    {
      "epoch": 0.06866455078125,
      "grad_norm": 9.590557098388672,
      "learning_rate": 4.6566867828369145e-05,
      "loss": 6.5776,
      "step": 36000
    },
    {
      "epoch": 0.06961822509765625,
      "grad_norm": 13.170746803283691,
      "learning_rate": 4.651918411254883e-05,
      "loss": 6.6031,
      "step": 36500
    },
    {
      "epoch": 0.0705718994140625,
      "grad_norm": 17.351354598999023,
      "learning_rate": 4.647150039672852e-05,
      "loss": 6.5395,
      "step": 37000
    },
    {
      "epoch": 0.07152557373046875,
      "grad_norm": 11.295857429504395,
      "learning_rate": 4.642381668090821e-05,
      "loss": 6.5644,
      "step": 37500
    },
    {
      "epoch": 0.072479248046875,
      "grad_norm": 13.997299194335938,
      "learning_rate": 4.637613296508789e-05,
      "loss": 6.57,
      "step": 38000
    },
    {
      "epoch": 0.07343292236328125,
      "grad_norm": 10.547431945800781,
      "learning_rate": 4.632844924926758e-05,
      "loss": 6.5339,
      "step": 38500
    },
    {
      "epoch": 0.0743865966796875,
      "grad_norm": 11.340121269226074,
      "learning_rate": 4.6280765533447266e-05,
      "loss": 6.5829,
      "step": 39000
    },
    {
      "epoch": 0.07534027099609375,
      "grad_norm": 11.731833457946777,
      "learning_rate": 4.6233081817626956e-05,
      "loss": 6.5487,
      "step": 39500
    },
    {
      "epoch": 0.0762939453125,
      "grad_norm": 21.85376739501953,
      "learning_rate": 4.6185398101806646e-05,
      "loss": 6.5631,
      "step": 40000
    },
    {
      "epoch": 0.0762939453125,
      "eval_accuracy": 0.03568610567514677,
      "eval_loss": 6.4704718589782715,
      "eval_runtime": 253.9333,
      "eval_samples_per_second": 39.38,
      "eval_steps_per_second": 9.845,
      "step": 40000
    },
    {
      "epoch": 0.07724761962890625,
      "grad_norm": 25.65736961364746,
      "learning_rate": 4.613771438598633e-05,
      "loss": 6.5646,
      "step": 40500
    },
    {
      "epoch": 0.0782012939453125,
      "grad_norm": 13.945578575134277,
      "learning_rate": 4.609003067016602e-05,
      "loss": 6.5697,
      "step": 41000
    },
    {
      "epoch": 0.07915496826171875,
      "grad_norm": 19.017955780029297,
      "learning_rate": 4.60423469543457e-05,
      "loss": 6.5707,
      "step": 41500
    },
    {
      "epoch": 0.080108642578125,
      "grad_norm": 32.384315490722656,
      "learning_rate": 4.5994663238525393e-05,
      "loss": 6.571,
      "step": 42000
    },
    {
      "epoch": 0.08106231689453125,
      "grad_norm": 21.34004783630371,
      "learning_rate": 4.5946979522705084e-05,
      "loss": 6.5202,
      "step": 42500
    },
    {
      "epoch": 0.0820159912109375,
      "grad_norm": 14.851140022277832,
      "learning_rate": 4.589929580688477e-05,
      "loss": 6.5567,
      "step": 43000
    },
    {
      "epoch": 0.08296966552734375,
      "grad_norm": 39.20492935180664,
      "learning_rate": 4.585161209106446e-05,
      "loss": 6.531,
      "step": 43500
    },
    {
      "epoch": 0.08392333984375,
      "grad_norm": 19.704784393310547,
      "learning_rate": 4.580392837524414e-05,
      "loss": 6.5621,
      "step": 44000
    },
    {
      "epoch": 0.08487701416015625,
      "grad_norm": 19.00480079650879,
      "learning_rate": 4.575624465942383e-05,
      "loss": 6.5331,
      "step": 44500
    },
    {
      "epoch": 0.0858306884765625,
      "grad_norm": 19.51432228088379,
      "learning_rate": 4.570856094360352e-05,
      "loss": 6.5234,
      "step": 45000
    },
    {
      "epoch": 0.0858306884765625,
      "eval_accuracy": 0.0339426614481409,
      "eval_loss": 6.461673736572266,
      "eval_runtime": 240.3668,
      "eval_samples_per_second": 41.603,
      "eval_steps_per_second": 10.401,
      "step": 45000
    },
    {
      "epoch": 0.08678436279296875,
      "grad_norm": 21.277442932128906,
      "learning_rate": 4.5660877227783205e-05,
      "loss": 6.5399,
      "step": 45500
    },
    {
      "epoch": 0.087738037109375,
      "grad_norm": 13.337013244628906,
      "learning_rate": 4.5613193511962895e-05,
      "loss": 6.5251,
      "step": 46000
    },
    {
      "epoch": 0.08869171142578125,
      "grad_norm": 23.25571632385254,
      "learning_rate": 4.556550979614258e-05,
      "loss": 6.496,
      "step": 46500
    },
    {
      "epoch": 0.0896453857421875,
      "grad_norm": 29.124269485473633,
      "learning_rate": 4.551782608032227e-05,
      "loss": 6.5393,
      "step": 47000
    },
    {
      "epoch": 0.09059906005859375,
      "grad_norm": 26.739830017089844,
      "learning_rate": 4.547014236450196e-05,
      "loss": 6.4687,
      "step": 47500
    },
    {
      "epoch": 0.091552734375,
      "grad_norm": 10.364956855773926,
      "learning_rate": 4.542245864868164e-05,
      "loss": 6.4882,
      "step": 48000
    },
    {
      "epoch": 0.09250640869140625,
      "grad_norm": 10.050714492797852,
      "learning_rate": 4.537477493286133e-05,
      "loss": 6.4549,
      "step": 48500
    },
    {
      "epoch": 0.0934600830078125,
      "grad_norm": 11.470427513122559,
      "learning_rate": 4.5327091217041016e-05,
      "loss": 6.504,
      "step": 49000
    },
    {
      "epoch": 0.09441375732421875,
      "grad_norm": 14.701976776123047,
      "learning_rate": 4.5279407501220706e-05,
      "loss": 6.5087,
      "step": 49500
    },
    {
      "epoch": 0.095367431640625,
      "grad_norm": 23.34058952331543,
      "learning_rate": 4.523172378540039e-05,
      "loss": 6.548,
      "step": 50000
    },
    {
      "epoch": 0.095367431640625,
      "eval_accuracy": 0.03502857142857143,
      "eval_loss": 6.44745397567749,
      "eval_runtime": 236.7367,
      "eval_samples_per_second": 42.241,
      "eval_steps_per_second": 10.56,
      "step": 50000
    },
    {
      "epoch": 0.09632110595703125,
      "grad_norm": 18.535306930541992,
      "learning_rate": 4.518404006958008e-05,
      "loss": 6.5755,
      "step": 50500
    },
    {
      "epoch": 0.0972747802734375,
      "grad_norm": 23.872713088989258,
      "learning_rate": 4.513635635375977e-05,
      "loss": 6.5359,
      "step": 51000
    },
    {
      "epoch": 0.09822845458984375,
      "grad_norm": 14.181577682495117,
      "learning_rate": 4.508867263793945e-05,
      "loss": 6.5603,
      "step": 51500
    },
    {
      "epoch": 0.09918212890625,
      "grad_norm": 29.683629989624023,
      "learning_rate": 4.5040988922119143e-05,
      "loss": 6.5151,
      "step": 52000
    },
    {
      "epoch": 0.10013580322265625,
      "grad_norm": 20.120813369750977,
      "learning_rate": 4.499330520629883e-05,
      "loss": 6.5369,
      "step": 52500
    },
    {
      "epoch": 0.1010894775390625,
      "grad_norm": 46.01066589355469,
      "learning_rate": 4.494562149047852e-05,
      "loss": 6.5455,
      "step": 53000
    },
    {
      "epoch": 0.10204315185546875,
      "grad_norm": 171.9969024658203,
      "learning_rate": 4.489793777465821e-05,
      "loss": 6.5004,
      "step": 53500
    },
    {
      "epoch": 0.102996826171875,
      "grad_norm": 17.876941680908203,
      "learning_rate": 4.485025405883789e-05,
      "loss": 6.5654,
      "step": 54000
    },
    {
      "epoch": 0.10395050048828125,
      "grad_norm": 55.01047134399414,
      "learning_rate": 4.480257034301758e-05,
      "loss": 6.4845,
      "step": 54500
    },
    {
      "epoch": 0.1049041748046875,
      "grad_norm": 41.43305587768555,
      "learning_rate": 4.4754886627197264e-05,
      "loss": 6.599,
      "step": 55000
    },
    {
      "epoch": 0.1049041748046875,
      "eval_accuracy": 0.035460665362035224,
      "eval_loss": 6.4514336585998535,
      "eval_runtime": 239.1754,
      "eval_samples_per_second": 41.81,
      "eval_steps_per_second": 10.453,
      "step": 55000
    },
    {
      "epoch": 0.10585784912109375,
      "grad_norm": 65.93293762207031,
      "learning_rate": 4.4707202911376955e-05,
      "loss": 6.5473,
      "step": 55500
    },
    {
      "epoch": 0.1068115234375,
      "grad_norm": 52.11703872680664,
      "learning_rate": 4.4659519195556645e-05,
      "loss": 6.5312,
      "step": 56000
    },
    {
      "epoch": 0.10776519775390625,
      "grad_norm": 32.834877014160156,
      "learning_rate": 4.461183547973633e-05,
      "loss": 6.5875,
      "step": 56500
    },
    {
      "epoch": 0.1087188720703125,
      "grad_norm": 35.849151611328125,
      "learning_rate": 4.456415176391602e-05,
      "loss": 6.5532,
      "step": 57000
    },
    {
      "epoch": 0.10967254638671875,
      "grad_norm": 44.165794372558594,
      "learning_rate": 4.45164680480957e-05,
      "loss": 6.5122,
      "step": 57500
    },
    {
      "epoch": 0.110626220703125,
      "grad_norm": 22.337329864501953,
      "learning_rate": 4.446878433227539e-05,
      "loss": 6.3647,
      "step": 58000
    },
    {
      "epoch": 0.11157989501953125,
      "grad_norm": 21.325393676757812,
      "learning_rate": 4.442110061645508e-05,
      "loss": 6.5854,
      "step": 58500
    },
    {
      "epoch": 0.1125335693359375,
      "grad_norm": 49.023765563964844,
      "learning_rate": 4.4373416900634766e-05,
      "loss": 6.5848,
      "step": 59000
    },
    {
      "epoch": 0.11348724365234375,
      "grad_norm": 28.078752517700195,
      "learning_rate": 4.4325733184814456e-05,
      "loss": 6.5471,
      "step": 59500
    },
    {
      "epoch": 0.11444091796875,
      "grad_norm": 73.4981689453125,
      "learning_rate": 4.427804946899414e-05,
      "loss": 6.547,
      "step": 60000
    },
    {
      "epoch": 0.11444091796875,
      "eval_accuracy": 0.03644618395303327,
      "eval_loss": 6.456771373748779,
      "eval_runtime": 237.6709,
      "eval_samples_per_second": 42.075,
      "eval_steps_per_second": 10.519,
      "step": 60000
    },
    {
      "epoch": 0.11539459228515625,
      "grad_norm": 18.256385803222656,
      "learning_rate": 4.423036575317383e-05,
      "loss": 6.5359,
      "step": 60500
    },
    {
      "epoch": 0.1163482666015625,
      "grad_norm": 42.84888458251953,
      "learning_rate": 4.418268203735352e-05,
      "loss": 6.533,
      "step": 61000
    },
    {
      "epoch": 0.11730194091796875,
      "grad_norm": 23.927953720092773,
      "learning_rate": 4.41349983215332e-05,
      "loss": 6.5096,
      "step": 61500
    },
    {
      "epoch": 0.118255615234375,
      "grad_norm": 45.29853057861328,
      "learning_rate": 4.4087314605712893e-05,
      "loss": 6.5559,
      "step": 62000
    },
    {
      "epoch": 0.11920928955078125,
      "grad_norm": 26.264976501464844,
      "learning_rate": 4.403963088989258e-05,
      "loss": 6.5381,
      "step": 62500
    },
    {
      "epoch": 0.1201629638671875,
      "grad_norm": 44.30765914916992,
      "learning_rate": 4.399194717407227e-05,
      "loss": 6.5864,
      "step": 63000
    },
    {
      "epoch": 0.12111663818359375,
      "grad_norm": 21.413312911987305,
      "learning_rate": 4.394426345825196e-05,
      "loss": 6.5516,
      "step": 63500
    },
    {
      "epoch": 0.1220703125,
      "grad_norm": 59.862831115722656,
      "learning_rate": 4.389657974243164e-05,
      "loss": 6.5113,
      "step": 64000
    },
    {
      "epoch": 0.12302398681640625,
      "grad_norm": 140.16439819335938,
      "learning_rate": 4.384889602661133e-05,
      "loss": 6.5812,
      "step": 64500
    },
    {
      "epoch": 0.1239776611328125,
      "grad_norm": 123.18706512451172,
      "learning_rate": 4.3801212310791014e-05,
      "loss": 6.5265,
      "step": 65000
    },
    {
      "epoch": 0.1239776611328125,
      "eval_accuracy": 0.038078277886497064,
      "eval_loss": 6.470902442932129,
      "eval_runtime": 239.0755,
      "eval_samples_per_second": 41.828,
      "eval_steps_per_second": 10.457,
      "step": 65000
    },
    {
      "epoch": 0.12493133544921875,
      "grad_norm": 39.1711540222168,
      "learning_rate": 4.3753528594970705e-05,
      "loss": 6.5254,
      "step": 65500
    },
    {
      "epoch": 0.125885009765625,
      "grad_norm": 47.702598571777344,
      "learning_rate": 4.3705844879150395e-05,
      "loss": 6.5341,
      "step": 66000
    },
    {
      "epoch": 0.12683868408203125,
      "grad_norm": 28.170778274536133,
      "learning_rate": 4.365816116333008e-05,
      "loss": 6.4917,
      "step": 66500
    },
    {
      "epoch": 0.1277923583984375,
      "grad_norm": 34.447105407714844,
      "learning_rate": 4.361047744750977e-05,
      "loss": 6.515,
      "step": 67000
    },
    {
      "epoch": 0.12874603271484375,
      "grad_norm": 46.904090881347656,
      "learning_rate": 4.356279373168945e-05,
      "loss": 6.5513,
      "step": 67500
    },
    {
      "epoch": 0.12969970703125,
      "grad_norm": 27.520286560058594,
      "learning_rate": 4.351511001586914e-05,
      "loss": 6.6264,
      "step": 68000
    },
    {
      "epoch": 0.13065338134765625,
      "grad_norm": 77.6899185180664,
      "learning_rate": 4.346742630004883e-05,
      "loss": 6.5723,
      "step": 68500
    },
    {
      "epoch": 0.1316070556640625,
      "grad_norm": 35.40989685058594,
      "learning_rate": 4.3419742584228516e-05,
      "loss": 6.5746,
      "step": 69000
    },
    {
      "epoch": 0.13256072998046875,
      "grad_norm": 67.315673828125,
      "learning_rate": 4.3372058868408206e-05,
      "loss": 6.5677,
      "step": 69500
    },
    {
      "epoch": 0.133514404296875,
      "grad_norm": 94.40605163574219,
      "learning_rate": 4.332437515258789e-05,
      "loss": 6.5916,
      "step": 70000
    },
    {
      "epoch": 0.133514404296875,
      "eval_accuracy": 0.03715283757338552,
      "eval_loss": 6.496217250823975,
      "eval_runtime": 239.7511,
      "eval_samples_per_second": 41.71,
      "eval_steps_per_second": 10.427,
      "step": 70000
    },
    {
      "epoch": 0.13446807861328125,
      "grad_norm": 111.58074188232422,
      "learning_rate": 4.327669143676758e-05,
      "loss": 6.6048,
      "step": 70500
    },
    {
      "epoch": 0.1354217529296875,
      "grad_norm": 29.336286544799805,
      "learning_rate": 4.322900772094727e-05,
      "loss": 6.6133,
      "step": 71000
    },
    {
      "epoch": 0.13637542724609375,
      "grad_norm": 52.75020217895508,
      "learning_rate": 4.318132400512695e-05,
      "loss": 6.5921,
      "step": 71500
    },
    {
      "epoch": 0.1373291015625,
      "grad_norm": 72.22640991210938,
      "learning_rate": 4.3133640289306643e-05,
      "loss": 6.6073,
      "step": 72000
    },
    {
      "epoch": 0.13828277587890625,
      "grad_norm": 49.1014289855957,
      "learning_rate": 4.308595657348633e-05,
      "loss": 6.621,
      "step": 72500
    },
    {
      "epoch": 0.1392364501953125,
      "grad_norm": 59.5632209777832,
      "learning_rate": 4.303827285766602e-05,
      "loss": 6.5858,
      "step": 73000
    },
    {
      "epoch": 0.14019012451171875,
      "grad_norm": 87.36494445800781,
      "learning_rate": 4.299058914184571e-05,
      "loss": 6.6278,
      "step": 73500
    },
    {
      "epoch": 0.141143798828125,
      "grad_norm": 32.93346405029297,
      "learning_rate": 4.294290542602539e-05,
      "loss": 6.5914,
      "step": 74000
    },
    {
      "epoch": 0.14209747314453125,
      "grad_norm": 52.131343841552734,
      "learning_rate": 4.289522171020508e-05,
      "loss": 6.5675,
      "step": 74500
    },
    {
      "epoch": 0.1430511474609375,
      "grad_norm": 97.84684753417969,
      "learning_rate": 4.2847537994384764e-05,
      "loss": 6.5919,
      "step": 75000
    },
    {
      "epoch": 0.1430511474609375,
      "eval_accuracy": 0.03751819960861057,
      "eval_loss": 6.482682704925537,
      "eval_runtime": 237.4453,
      "eval_samples_per_second": 42.115,
      "eval_steps_per_second": 10.529,
      "step": 75000
    },
    {
      "epoch": 0.14400482177734375,
      "grad_norm": 46.23030090332031,
      "learning_rate": 4.2799854278564455e-05,
      "loss": 6.6028,
      "step": 75500
    },
    {
      "epoch": 0.14495849609375,
      "grad_norm": 73.1805191040039,
      "learning_rate": 4.2752170562744145e-05,
      "loss": 6.5949,
      "step": 76000
    },
    {
      "epoch": 0.14591217041015625,
      "grad_norm": 103.25483703613281,
      "learning_rate": 4.270448684692383e-05,
      "loss": 6.6279,
      "step": 76500
    },
    {
      "epoch": 0.1468658447265625,
      "grad_norm": 39.62518310546875,
      "learning_rate": 4.265680313110352e-05,
      "loss": 6.6005,
      "step": 77000
    },
    {
      "epoch": 0.14781951904296875,
      "grad_norm": 117.60237121582031,
      "learning_rate": 4.26091194152832e-05,
      "loss": 6.5755,
      "step": 77500
    },
    {
      "epoch": 0.148773193359375,
      "grad_norm": 93.19673156738281,
      "learning_rate": 4.256143569946289e-05,
      "loss": 6.5823,
      "step": 78000
    },
    {
      "epoch": 0.14972686767578125,
      "grad_norm": 24.11821174621582,
      "learning_rate": 4.251375198364258e-05,
      "loss": 6.6,
      "step": 78500
    },
    {
      "epoch": 0.1506805419921875,
      "grad_norm": 63.973106384277344,
      "learning_rate": 4.2466068267822266e-05,
      "loss": 6.592,
      "step": 79000
    },
    {
      "epoch": 0.15163421630859375,
      "grad_norm": 107.64210510253906,
      "learning_rate": 4.2418384552001956e-05,
      "loss": 6.5633,
      "step": 79500
    },
    {
      "epoch": 0.152587890625,
      "grad_norm": 59.94293212890625,
      "learning_rate": 4.237070083618164e-05,
      "loss": 6.4745,
      "step": 80000
    },
    {
      "epoch": 0.152587890625,
      "eval_accuracy": 0.038450489236790605,
      "eval_loss": 6.490172386169434,
      "eval_runtime": 236.0018,
      "eval_samples_per_second": 42.373,
      "eval_steps_per_second": 10.593,
      "step": 80000
    },
    {
      "epoch": 0.15354156494140625,
      "grad_norm": 46.570556640625,
      "learning_rate": 4.232301712036133e-05,
      "loss": 6.5729,
      "step": 80500
    },
    {
      "epoch": 0.1544952392578125,
      "grad_norm": 77.17676544189453,
      "learning_rate": 4.227533340454102e-05,
      "loss": 6.5433,
      "step": 81000
    },
    {
      "epoch": 0.15544891357421875,
      "grad_norm": 156.23880004882812,
      "learning_rate": 4.22276496887207e-05,
      "loss": 6.5623,
      "step": 81500
    },
    {
      "epoch": 0.156402587890625,
      "grad_norm": 293.4957275390625,
      "learning_rate": 4.2179965972900393e-05,
      "loss": 6.5936,
      "step": 82000
    },
    {
      "epoch": 0.15735626220703125,
      "grad_norm": 91.47918701171875,
      "learning_rate": 4.213228225708008e-05,
      "loss": 6.5374,
      "step": 82500
    },
    {
      "epoch": 0.1583099365234375,
      "grad_norm": 123.1861801147461,
      "learning_rate": 4.208459854125977e-05,
      "loss": 6.5576,
      "step": 83000
    },
    {
      "epoch": 0.15926361083984375,
      "grad_norm": 75.75604248046875,
      "learning_rate": 4.203691482543946e-05,
      "loss": 6.5649,
      "step": 83500
    },
    {
      "epoch": 0.16021728515625,
      "grad_norm": 156.376953125,
      "learning_rate": 4.198923110961914e-05,
      "loss": 6.5516,
      "step": 84000
    },
    {
      "epoch": 0.16117095947265625,
      "grad_norm": 198.7659149169922,
      "learning_rate": 4.194154739379883e-05,
      "loss": 6.5659,
      "step": 84500
    },
    {
      "epoch": 0.1621246337890625,
      "grad_norm": 89.94020080566406,
      "learning_rate": 4.1893863677978514e-05,
      "loss": 6.6695,
      "step": 85000
    },
    {
      "epoch": 0.1621246337890625,
      "eval_accuracy": 0.038353033268101765,
      "eval_loss": 6.54418420791626,
      "eval_runtime": 241.1387,
      "eval_samples_per_second": 41.47,
      "eval_steps_per_second": 10.367,
      "step": 85000
    },
    {
      "epoch": 0.16307830810546875,
      "grad_norm": 86.41958618164062,
      "learning_rate": 4.1846179962158205e-05,
      "loss": 6.6861,
      "step": 85500
    },
    {
      "epoch": 0.164031982421875,
      "grad_norm": 172.11248779296875,
      "learning_rate": 4.1798496246337895e-05,
      "loss": 6.6639,
      "step": 86000
    },
    {
      "epoch": 0.16498565673828125,
      "grad_norm": 115.12784576416016,
      "learning_rate": 4.175081253051758e-05,
      "loss": 6.6782,
      "step": 86500
    },
    {
      "epoch": 0.1659393310546875,
      "grad_norm": 248.4824676513672,
      "learning_rate": 4.170312881469727e-05,
      "loss": 6.702,
      "step": 87000
    },
    {
      "epoch": 0.16689300537109375,
      "grad_norm": 45.923828125,
      "learning_rate": 4.165544509887695e-05,
      "loss": 6.6079,
      "step": 87500
    },
    {
      "epoch": 0.1678466796875,
      "grad_norm": 602.0431518554688,
      "learning_rate": 4.160776138305664e-05,
      "loss": 6.6195,
      "step": 88000
    },
    {
      "epoch": 0.16880035400390625,
      "grad_norm": 41.64591598510742,
      "learning_rate": 4.156007766723633e-05,
      "loss": 6.6371,
      "step": 88500
    },
    {
      "epoch": 0.1697540283203125,
      "grad_norm": 213.25375366210938,
      "learning_rate": 4.1512393951416016e-05,
      "loss": 6.5944,
      "step": 89000
    },
    {
      "epoch": 0.17070770263671875,
      "grad_norm": 87.20841217041016,
      "learning_rate": 4.1464710235595706e-05,
      "loss": 6.6422,
      "step": 89500
    },
    {
      "epoch": 0.171661376953125,
      "grad_norm": 62.394229888916016,
      "learning_rate": 4.141702651977539e-05,
      "loss": 6.5994,
      "step": 90000
    },
    {
      "epoch": 0.171661376953125,
      "eval_accuracy": 0.03867651663405088,
      "eval_loss": 6.503583908081055,
      "eval_runtime": 238.2981,
      "eval_samples_per_second": 41.964,
      "eval_steps_per_second": 10.491,
      "step": 90000
    },
    {
      "epoch": 0.17261505126953125,
      "grad_norm": 48.49923324584961,
      "learning_rate": 4.136934280395508e-05,
      "loss": 6.5997,
      "step": 90500
    },
    {
      "epoch": 0.1735687255859375,
      "grad_norm": 85.41634368896484,
      "learning_rate": 4.132165908813477e-05,
      "loss": 6.5842,
      "step": 91000
    },
    {
      "epoch": 0.17452239990234375,
      "grad_norm": 86.25672149658203,
      "learning_rate": 4.127397537231445e-05,
      "loss": 6.6317,
      "step": 91500
    },
    {
      "epoch": 0.17547607421875,
      "grad_norm": 57.90113067626953,
      "learning_rate": 4.1226291656494143e-05,
      "loss": 6.6006,
      "step": 92000
    },
    {
      "epoch": 0.17642974853515625,
      "grad_norm": 644.6544189453125,
      "learning_rate": 4.117860794067383e-05,
      "loss": 6.6067,
      "step": 92500
    },
    {
      "epoch": 0.1773834228515625,
      "grad_norm": 45.653770446777344,
      "learning_rate": 4.113092422485352e-05,
      "loss": 6.6163,
      "step": 93000
    },
    {
      "epoch": 0.17833709716796875,
      "grad_norm": 28.696081161499023,
      "learning_rate": 4.108324050903321e-05,
      "loss": 6.6331,
      "step": 93500
    },
    {
      "epoch": 0.179290771484375,
      "grad_norm": 87.23275756835938,
      "learning_rate": 4.103555679321289e-05,
      "loss": 6.6418,
      "step": 94000
    },
    {
      "epoch": 0.18024444580078125,
      "grad_norm": 57.913978576660156,
      "learning_rate": 4.098787307739258e-05,
      "loss": 6.5708,
      "step": 94500
    },
    {
      "epoch": 0.1811981201171875,
      "grad_norm": 65.90794372558594,
      "learning_rate": 4.0940189361572264e-05,
      "loss": 6.6023,
      "step": 95000
    },
    {
      "epoch": 0.1811981201171875,
      "eval_accuracy": 0.03850430528375734,
      "eval_loss": 6.49788761138916,
      "eval_runtime": 242.9451,
      "eval_samples_per_second": 41.162,
      "eval_steps_per_second": 10.29,
      "step": 95000
    },
    {
      "epoch": 0.18215179443359375,
      "grad_norm": 174.3852081298828,
      "learning_rate": 4.0892505645751955e-05,
      "loss": 6.5774,
      "step": 95500
    },
    {
      "epoch": 0.18310546875,
      "grad_norm": 131.80348205566406,
      "learning_rate": 4.0844821929931645e-05,
      "loss": 6.5997,
      "step": 96000
    },
    {
      "epoch": 0.18405914306640625,
      "grad_norm": 80.42842864990234,
      "learning_rate": 4.079713821411133e-05,
      "loss": 6.5884,
      "step": 96500
    },
    {
      "epoch": 0.1850128173828125,
      "grad_norm": 102.92145538330078,
      "learning_rate": 4.074945449829102e-05,
      "loss": 6.6227,
      "step": 97000
    },
    {
      "epoch": 0.18596649169921875,
      "grad_norm": 83.99175262451172,
      "learning_rate": 4.07017707824707e-05,
      "loss": 6.6208,
      "step": 97500
    },
    {
      "epoch": 0.186920166015625,
      "grad_norm": 115.96537017822266,
      "learning_rate": 4.065408706665039e-05,
      "loss": 6.6298,
      "step": 98000
    },
    {
      "epoch": 0.18787384033203125,
      "grad_norm": 97.76721954345703,
      "learning_rate": 4.060640335083008e-05,
      "loss": 6.6488,
      "step": 98500
    },
    {
      "epoch": 0.1888275146484375,
      "grad_norm": 239.66554260253906,
      "learning_rate": 4.0558719635009766e-05,
      "loss": 6.6078,
      "step": 99000
    },
    {
      "epoch": 0.18978118896484375,
      "grad_norm": 143.55519104003906,
      "learning_rate": 4.0511035919189456e-05,
      "loss": 6.5442,
      "step": 99500
    },
    {
      "epoch": 0.19073486328125,
      "grad_norm": 489.53350830078125,
      "learning_rate": 4.046335220336914e-05,
      "loss": 6.5913,
      "step": 100000
    },
    {
      "epoch": 0.19073486328125,
      "eval_accuracy": 0.03816399217221135,
      "eval_loss": 6.582998275756836,
      "eval_runtime": 238.7992,
      "eval_samples_per_second": 41.876,
      "eval_steps_per_second": 10.469,
      "step": 100000
    },
    {
      "epoch": 0.19168853759765625,
      "grad_norm": 120.52062225341797,
      "learning_rate": 4.041566848754883e-05,
      "loss": 6.5985,
      "step": 100500
    },
    {
      "epoch": 0.1926422119140625,
      "grad_norm": 153.3665008544922,
      "learning_rate": 4.036798477172852e-05,
      "loss": 6.6235,
      "step": 101000
    },
    {
      "epoch": 0.19359588623046875,
      "grad_norm": 146.99871826171875,
      "learning_rate": 4.03203010559082e-05,
      "loss": 6.6944,
      "step": 101500
    },
    {
      "epoch": 0.194549560546875,
      "grad_norm": 128.0095977783203,
      "learning_rate": 4.0272617340087893e-05,
      "loss": 6.6253,
      "step": 102000
    },
    {
      "epoch": 0.19550323486328125,
      "grad_norm": 73.54423522949219,
      "learning_rate": 4.022493362426758e-05,
      "loss": 6.6544,
      "step": 102500
    },
    {
      "epoch": 0.1964569091796875,
      "grad_norm": 110.41128540039062,
      "learning_rate": 4.017724990844727e-05,
      "loss": 6.6509,
      "step": 103000
    },
    {
      "epoch": 0.19741058349609375,
      "grad_norm": 319.73358154296875,
      "learning_rate": 4.012956619262696e-05,
      "loss": 6.6368,
      "step": 103500
    },
    {
      "epoch": 0.1983642578125,
      "grad_norm": 207.58070373535156,
      "learning_rate": 4.008188247680664e-05,
      "loss": 6.6357,
      "step": 104000
    },
    {
      "epoch": 0.19931793212890625,
      "grad_norm": 55.293548583984375,
      "learning_rate": 4.003419876098633e-05,
      "loss": 6.6372,
      "step": 104500
    },
    {
      "epoch": 0.2002716064453125,
      "grad_norm": 284.53204345703125,
      "learning_rate": 3.9986515045166014e-05,
      "loss": 6.6542,
      "step": 105000
    },
    {
      "epoch": 0.2002716064453125,
      "eval_accuracy": 0.038425244618395304,
      "eval_loss": 6.55495023727417,
      "eval_runtime": 240.6255,
      "eval_samples_per_second": 41.558,
      "eval_steps_per_second": 10.39,
      "step": 105000
    },
    {
      "epoch": 0.20122528076171875,
      "grad_norm": 150.2811737060547,
      "learning_rate": 3.9938831329345705e-05,
      "loss": 6.6518,
      "step": 105500
    },
    {
      "epoch": 0.202178955078125,
      "grad_norm": 80.65524291992188,
      "learning_rate": 3.9891147613525395e-05,
      "loss": 6.6464,
      "step": 106000
    },
    {
      "epoch": 0.20313262939453125,
      "grad_norm": 84.09796905517578,
      "learning_rate": 3.984346389770508e-05,
      "loss": 6.6482,
      "step": 106500
    },
    {
      "epoch": 0.2040863037109375,
      "grad_norm": 104.57718658447266,
      "learning_rate": 3.979578018188477e-05,
      "loss": 6.6667,
      "step": 107000
    },
    {
      "epoch": 0.20503997802734375,
      "grad_norm": 327.62109375,
      "learning_rate": 3.974809646606445e-05,
      "loss": 6.6351,
      "step": 107500
    },
    {
      "epoch": 0.20599365234375,
      "grad_norm": 97.85407257080078,
      "learning_rate": 3.970041275024414e-05,
      "loss": 6.5981,
      "step": 108000
    },
    {
      "epoch": 0.20694732666015625,
      "grad_norm": 54.872920989990234,
      "learning_rate": 3.965272903442383e-05,
      "loss": 6.6179,
      "step": 108500
    },
    {
      "epoch": 0.2079010009765625,
      "grad_norm": 74.83031463623047,
      "learning_rate": 3.9605045318603516e-05,
      "loss": 6.6678,
      "step": 109000
    },
    {
      "epoch": 0.20885467529296875,
      "grad_norm": 171.44764709472656,
      "learning_rate": 3.9557361602783206e-05,
      "loss": 6.6617,
      "step": 109500
    },
    {
      "epoch": 0.209808349609375,
      "grad_norm": 107.16600799560547,
      "learning_rate": 3.950967788696289e-05,
      "loss": 6.6602,
      "step": 110000
    },
    {
      "epoch": 0.209808349609375,
      "eval_accuracy": 0.038764383561643835,
      "eval_loss": 6.5538482666015625,
      "eval_runtime": 245.2146,
      "eval_samples_per_second": 40.781,
      "eval_steps_per_second": 10.195,
      "step": 110000
    },
    {
      "epoch": 0.21076202392578125,
      "grad_norm": 97.79281616210938,
      "learning_rate": 3.946199417114258e-05,
      "loss": 6.6494,
      "step": 110500
    },
    {
      "epoch": 0.2117156982421875,
      "grad_norm": 194.7386474609375,
      "learning_rate": 3.941431045532227e-05,
      "loss": 6.6627,
      "step": 111000
    },
    {
      "epoch": 0.21266937255859375,
      "grad_norm": 39.52627944946289,
      "learning_rate": 3.936662673950195e-05,
      "loss": 6.6378,
      "step": 111500
    },
    {
      "epoch": 0.213623046875,
      "grad_norm": 304.6911926269531,
      "learning_rate": 3.9318943023681643e-05,
      "loss": 6.6002,
      "step": 112000
    },
    {
      "epoch": 0.21457672119140625,
      "grad_norm": 318.4564208984375,
      "learning_rate": 3.927125930786133e-05,
      "loss": 6.5922,
      "step": 112500
    },
    {
      "epoch": 0.2155303955078125,
      "grad_norm": 72.45951843261719,
      "learning_rate": 3.922357559204102e-05,
      "loss": 6.6411,
      "step": 113000
    },
    {
      "epoch": 0.21648406982421875,
      "grad_norm": 145.93966674804688,
      "learning_rate": 3.917589187622071e-05,
      "loss": 6.618,
      "step": 113500
    },
    {
      "epoch": 0.217437744140625,
      "grad_norm": 306.8751220703125,
      "learning_rate": 3.912820816040039e-05,
      "loss": 6.6476,
      "step": 114000
    },
    {
      "epoch": 0.21839141845703125,
      "grad_norm": 74.63150024414062,
      "learning_rate": 3.908052444458008e-05,
      "loss": 6.6734,
      "step": 114500
    },
    {
      "epoch": 0.2193450927734375,
      "grad_norm": 82.34027099609375,
      "learning_rate": 3.9032840728759764e-05,
      "loss": 6.6113,
      "step": 115000
    },
    {
      "epoch": 0.2193450927734375,
      "eval_accuracy": 0.039347162426614485,
      "eval_loss": 6.550750732421875,
      "eval_runtime": 237.2859,
      "eval_samples_per_second": 42.143,
      "eval_steps_per_second": 10.536,
      "step": 115000
    },
    {
      "epoch": 0.22029876708984375,
      "grad_norm": 83.30755615234375,
      "learning_rate": 3.8985157012939455e-05,
      "loss": 6.5598,
      "step": 115500
    },
    {
      "epoch": 0.22125244140625,
      "grad_norm": 133.1521453857422,
      "learning_rate": 3.8937473297119145e-05,
      "loss": 6.5783,
      "step": 116000
    },
    {
      "epoch": 0.22220611572265625,
      "grad_norm": 220.6308135986328,
      "learning_rate": 3.888978958129883e-05,
      "loss": 6.5847,
      "step": 116500
    },
    {
      "epoch": 0.2231597900390625,
      "grad_norm": 135.7482147216797,
      "learning_rate": 3.884210586547852e-05,
      "loss": 6.6044,
      "step": 117000
    },
    {
      "epoch": 0.22411346435546875,
      "grad_norm": 159.68698120117188,
      "learning_rate": 3.87944221496582e-05,
      "loss": 6.6709,
      "step": 117500
    },
    {
      "epoch": 0.225067138671875,
      "grad_norm": 370.2187805175781,
      "learning_rate": 3.874673843383789e-05,
      "loss": 6.6519,
      "step": 118000
    },
    {
      "epoch": 0.22602081298828125,
      "grad_norm": 115.01087188720703,
      "learning_rate": 3.869905471801758e-05,
      "loss": 6.6864,
      "step": 118500
    },
    {
      "epoch": 0.2269744873046875,
      "grad_norm": 102.77879333496094,
      "learning_rate": 3.8651371002197266e-05,
      "loss": 6.6534,
      "step": 119000
    },
    {
      "epoch": 0.22792816162109375,
      "grad_norm": 122.78046417236328,
      "learning_rate": 3.8603687286376956e-05,
      "loss": 6.6469,
      "step": 119500
    },
    {
      "epoch": 0.2288818359375,
      "grad_norm": 34.63007354736328,
      "learning_rate": 3.855600357055664e-05,
      "loss": 6.6568,
      "step": 120000
    },
    {
      "epoch": 0.2288818359375,
      "eval_accuracy": 0.03878786692759296,
      "eval_loss": 6.552664756774902,
      "eval_runtime": 241.1332,
      "eval_samples_per_second": 41.471,
      "eval_steps_per_second": 10.368,
      "step": 120000
    },
    {
      "epoch": 0.22983551025390625,
      "grad_norm": 104.76294708251953,
      "learning_rate": 3.850831985473633e-05,
      "loss": 6.6533,
      "step": 120500
    },
    {
      "epoch": 0.2307891845703125,
      "grad_norm": 224.17337036132812,
      "learning_rate": 3.846063613891602e-05,
      "loss": 6.6484,
      "step": 121000
    },
    {
      "epoch": 0.23174285888671875,
      "grad_norm": 73.90188598632812,
      "learning_rate": 3.84129524230957e-05,
      "loss": 6.6222,
      "step": 121500
    },
    {
      "epoch": 0.232696533203125,
      "grad_norm": 73.31129455566406,
      "learning_rate": 3.8365268707275393e-05,
      "loss": 6.6224,
      "step": 122000
    },
    {
      "epoch": 0.23365020751953125,
      "grad_norm": 64.67500305175781,
      "learning_rate": 3.831758499145508e-05,
      "loss": 6.6565,
      "step": 122500
    },
    {
      "epoch": 0.2346038818359375,
      "grad_norm": 175.16627502441406,
      "learning_rate": 3.826990127563477e-05,
      "loss": 6.6196,
      "step": 123000
    },
    {
      "epoch": 0.23555755615234375,
      "grad_norm": 449.0741882324219,
      "learning_rate": 3.822221755981446e-05,
      "loss": 6.6363,
      "step": 123500
    },
    {
      "epoch": 0.23651123046875,
      "grad_norm": 364.6779479980469,
      "learning_rate": 3.817453384399414e-05,
      "loss": 6.6512,
      "step": 124000
    },
    {
      "epoch": 0.23746490478515625,
      "grad_norm": 84.31883239746094,
      "learning_rate": 3.812685012817383e-05,
      "loss": 6.6305,
      "step": 124500
    },
    {
      "epoch": 0.2384185791015625,
      "grad_norm": 259.1290588378906,
      "learning_rate": 3.8079166412353514e-05,
      "loss": 6.6476,
      "step": 125000
    },
    {
      "epoch": 0.2384185791015625,
      "eval_accuracy": 0.039112133072407046,
      "eval_loss": 6.5456719398498535,
      "eval_runtime": 238.7402,
      "eval_samples_per_second": 41.887,
      "eval_steps_per_second": 10.472,
      "step": 125000
    },
    {
      "epoch": 0.23937225341796875,
      "grad_norm": 68.04620361328125,
      "learning_rate": 3.8031482696533205e-05,
      "loss": 6.6502,
      "step": 125500
    },
    {
      "epoch": 0.240325927734375,
      "grad_norm": 46.73642349243164,
      "learning_rate": 3.7983798980712895e-05,
      "loss": 6.6139,
      "step": 126000
    },
    {
      "epoch": 0.24127960205078125,
      "grad_norm": 183.39114379882812,
      "learning_rate": 3.793611526489258e-05,
      "loss": 6.6471,
      "step": 126500
    },
    {
      "epoch": 0.2422332763671875,
      "grad_norm": 208.6868133544922,
      "learning_rate": 3.788843154907227e-05,
      "loss": 6.6326,
      "step": 127000
    },
    {
      "epoch": 0.24318695068359375,
      "grad_norm": 1748.205322265625,
      "learning_rate": 3.784074783325195e-05,
      "loss": 6.6223,
      "step": 127500
    },
    {
      "epoch": 0.244140625,
      "grad_norm": 427.6435852050781,
      "learning_rate": 3.779306411743164e-05,
      "loss": 6.6556,
      "step": 128000
    },
    {
      "epoch": 0.24509429931640625,
      "grad_norm": 123.13027954101562,
      "learning_rate": 3.774538040161133e-05,
      "loss": 6.6449,
      "step": 128500
    },
    {
      "epoch": 0.2460479736328125,
      "grad_norm": 510.61474609375,
      "learning_rate": 3.7697696685791016e-05,
      "loss": 6.6579,
      "step": 129000
    },
    {
      "epoch": 0.24700164794921875,
      "grad_norm": 84.33484649658203,
      "learning_rate": 3.7650012969970706e-05,
      "loss": 6.6373,
      "step": 129500
    },
    {
      "epoch": 0.247955322265625,
      "grad_norm": 286.0257568359375,
      "learning_rate": 3.760232925415039e-05,
      "loss": 6.6636,
      "step": 130000
    },
    {
      "epoch": 0.247955322265625,
      "eval_accuracy": 0.04012641878669276,
      "eval_loss": 6.545810699462891,
      "eval_runtime": 237.6897,
      "eval_samples_per_second": 42.072,
      "eval_steps_per_second": 10.518,
      "step": 130000
    },
    {
      "epoch": 0.24890899658203125,
      "grad_norm": 206.83262634277344,
      "learning_rate": 3.755464553833008e-05,
      "loss": 6.6333,
      "step": 130500
    },
    {
      "epoch": 0.2498626708984375,
      "grad_norm": 93.4488525390625,
      "learning_rate": 3.750696182250977e-05,
      "loss": 6.6089,
      "step": 131000
    },
    {
      "epoch": 0.25081634521484375,
      "grad_norm": 59.847511291503906,
      "learning_rate": 3.745927810668945e-05,
      "loss": 6.6127,
      "step": 131500
    },
    {
      "epoch": 0.25177001953125,
      "grad_norm": 111.11254119873047,
      "learning_rate": 3.7411594390869143e-05,
      "loss": 6.6167,
      "step": 132000
    },
    {
      "epoch": 0.25272369384765625,
      "grad_norm": 243.5362548828125,
      "learning_rate": 3.736391067504883e-05,
      "loss": 6.6063,
      "step": 132500
    },
    {
      "epoch": 0.2536773681640625,
      "grad_norm": 109.8550796508789,
      "learning_rate": 3.731622695922852e-05,
      "loss": 6.6519,
      "step": 133000
    },
    {
      "epoch": 0.25463104248046875,
      "grad_norm": 196.270263671875,
      "learning_rate": 3.726854324340821e-05,
      "loss": 6.6547,
      "step": 133500
    },
    {
      "epoch": 0.255584716796875,
      "grad_norm": 102.67740631103516,
      "learning_rate": 3.722085952758789e-05,
      "loss": 6.6682,
      "step": 134000
    },
    {
      "epoch": 0.25653839111328125,
      "grad_norm": 75.3838882446289,
      "learning_rate": 3.717317581176758e-05,
      "loss": 6.6609,
      "step": 134500
    },
    {
      "epoch": 0.2574920654296875,
      "grad_norm": 247.18411254882812,
      "learning_rate": 3.7125492095947264e-05,
      "loss": 6.6318,
      "step": 135000
    },
    {
      "epoch": 0.2574920654296875,
      "eval_accuracy": 0.03993933463796478,
      "eval_loss": 6.546828269958496,
      "eval_runtime": 238.1837,
      "eval_samples_per_second": 41.984,
      "eval_steps_per_second": 10.496,
      "step": 135000
    },
    {
      "epoch": 0.25844573974609375,
      "grad_norm": 186.8214111328125,
      "learning_rate": 3.7077808380126955e-05,
      "loss": 6.6381,
      "step": 135500
    },
    {
      "epoch": 0.2593994140625,
      "grad_norm": 123.54873657226562,
      "learning_rate": 3.7030124664306645e-05,
      "loss": 6.6224,
      "step": 136000
    },
    {
      "epoch": 0.26035308837890625,
      "grad_norm": 101.96115112304688,
      "learning_rate": 3.698244094848633e-05,
      "loss": 6.634,
      "step": 136500
    },
    {
      "epoch": 0.2613067626953125,
      "grad_norm": 238.8611602783203,
      "learning_rate": 3.693475723266602e-05,
      "loss": 6.6743,
      "step": 137000
    },
    {
      "epoch": 0.26226043701171875,
      "grad_norm": 238.52011108398438,
      "learning_rate": 3.68870735168457e-05,
      "loss": 6.6772,
      "step": 137500
    },
    {
      "epoch": 0.263214111328125,
      "grad_norm": 180.68150329589844,
      "learning_rate": 3.683938980102539e-05,
      "loss": 6.6486,
      "step": 138000
    },
    {
      "epoch": 0.26416778564453125,
      "grad_norm": 108.15036010742188,
      "learning_rate": 3.679170608520508e-05,
      "loss": 6.6651,
      "step": 138500
    },
    {
      "epoch": 0.2651214599609375,
      "grad_norm": 1145.0054931640625,
      "learning_rate": 3.6744022369384766e-05,
      "loss": 6.5743,
      "step": 139000
    },
    {
      "epoch": 0.26607513427734375,
      "grad_norm": 98.10594177246094,
      "learning_rate": 3.6696338653564456e-05,
      "loss": 6.6723,
      "step": 139500
    },
    {
      "epoch": 0.26702880859375,
      "grad_norm": 62.280296325683594,
      "learning_rate": 3.664865493774414e-05,
      "loss": 6.6358,
      "step": 140000
    },
    {
      "epoch": 0.26702880859375,
      "eval_accuracy": 0.037448923679060664,
      "eval_loss": 6.57286262512207,
      "eval_runtime": 237.691,
      "eval_samples_per_second": 42.071,
      "eval_steps_per_second": 10.518,
      "step": 140000
    },
    {
      "epoch": 0.26798248291015625,
      "grad_norm": 84.15850067138672,
      "learning_rate": 3.660097122192383e-05,
      "loss": 6.657,
      "step": 140500
    },
    {
      "epoch": 0.2689361572265625,
      "grad_norm": 103.6788330078125,
      "learning_rate": 3.655328750610352e-05,
      "loss": 6.6567,
      "step": 141000
    },
    {
      "epoch": 0.26988983154296875,
      "grad_norm": 101.2646255493164,
      "learning_rate": 3.65056037902832e-05,
      "loss": 6.6804,
      "step": 141500
    },
    {
      "epoch": 0.270843505859375,
      "grad_norm": 141.26473999023438,
      "learning_rate": 3.6457920074462893e-05,
      "loss": 6.6451,
      "step": 142000
    },
    {
      "epoch": 0.27179718017578125,
      "grad_norm": 179.66439819335938,
      "learning_rate": 3.641023635864258e-05,
      "loss": 6.6583,
      "step": 142500
    },
    {
      "epoch": 0.2727508544921875,
      "grad_norm": 114.78199005126953,
      "learning_rate": 3.636255264282227e-05,
      "loss": 6.6123,
      "step": 143000
    },
    {
      "epoch": 0.27370452880859375,
      "grad_norm": 47.80651092529297,
      "learning_rate": 3.631486892700196e-05,
      "loss": 6.6632,
      "step": 143500
    },
    {
      "epoch": 0.274658203125,
      "grad_norm": 157.00042724609375,
      "learning_rate": 3.626718521118164e-05,
      "loss": 6.6487,
      "step": 144000
    },
    {
      "epoch": 0.27561187744140625,
      "grad_norm": 301.52874755859375,
      "learning_rate": 3.621950149536133e-05,
      "loss": 6.5943,
      "step": 144500
    },
    {
      "epoch": 0.2765655517578125,
      "grad_norm": 136.7102813720703,
      "learning_rate": 3.6171817779541014e-05,
      "loss": 6.639,
      "step": 145000
    },
    {
      "epoch": 0.2765655517578125,
      "eval_accuracy": 0.039391389432485324,
      "eval_loss": 6.5531721115112305,
      "eval_runtime": 243.7736,
      "eval_samples_per_second": 41.022,
      "eval_steps_per_second": 10.255,
      "step": 145000
    },
    {
      "epoch": 0.27751922607421875,
      "grad_norm": 153.75978088378906,
      "learning_rate": 3.6124134063720705e-05,
      "loss": 6.6015,
      "step": 145500
    },
    {
      "epoch": 0.278472900390625,
      "grad_norm": 296.2194519042969,
      "learning_rate": 3.6076450347900395e-05,
      "loss": 6.6052,
      "step": 146000
    },
    {
      "epoch": 0.27942657470703125,
      "grad_norm": 337.25872802734375,
      "learning_rate": 3.602876663208008e-05,
| "loss": 6.6046, | |
| "step": 146500 | |
| }, | |
| { | |
| "epoch": 0.2803802490234375, | |
| "grad_norm": 101.83748626708984, | |
| "learning_rate": 3.598108291625977e-05, | |
| "loss": 6.659, | |
| "step": 147000 | |
| }, | |
| { | |
| "epoch": 0.28133392333984375, | |
| "grad_norm": 248.71856689453125, | |
| "learning_rate": 3.593339920043945e-05, | |
| "loss": 6.6136, | |
| "step": 147500 | |
| }, | |
| { | |
| "epoch": 0.28228759765625, | |
| "grad_norm": 209.75669860839844, | |
| "learning_rate": 3.588571548461914e-05, | |
| "loss": 6.5739, | |
| "step": 148000 | |
| }, | |
| { | |
| "epoch": 0.28324127197265625, | |
| "grad_norm": 86.48981475830078, | |
| "learning_rate": 3.583803176879883e-05, | |
| "loss": 6.5912, | |
| "step": 148500 | |
| }, | |
| { | |
| "epoch": 0.2841949462890625, | |
| "grad_norm": 64.70508575439453, | |
| "learning_rate": 3.5790348052978516e-05, | |
| "loss": 6.6749, | |
| "step": 149000 | |
| }, | |
| { | |
| "epoch": 0.28514862060546875, | |
| "grad_norm": 55.55641555786133, | |
| "learning_rate": 3.5742664337158206e-05, | |
| "loss": 6.6692, | |
| "step": 149500 | |
| }, | |
| { | |
| "epoch": 0.286102294921875, | |
| "grad_norm": 77.39098358154297, | |
| "learning_rate": 3.569498062133789e-05, | |
| "loss": 6.6817, | |
| "step": 150000 | |
| }, | |
| { | |
| "epoch": 0.286102294921875, | |
| "eval_accuracy": 0.03908512720156556, | |
| "eval_loss": 6.55155611038208, | |
| "eval_runtime": 247.8743, | |
| "eval_samples_per_second": 40.343, | |
| "eval_steps_per_second": 10.086, | |
| "step": 150000 | |
| }, | |
{
"epoch": 0.28705596923828125,
"grad_norm": 86.25543975830078,
"learning_rate": 3.564729690551758e-05,
"loss": 6.6456,
"step": 150500
},
{
"epoch": 0.2880096435546875,
"grad_norm": 194.3202362060547,
"learning_rate": 3.559961318969727e-05,
"loss": 6.6447,
"step": 151000
},
{
"epoch": 0.28896331787109375,
"grad_norm": 98.31609344482422,
"learning_rate": 3.555192947387695e-05,
"loss": 6.6226,
"step": 151500
},
{
"epoch": 0.2899169921875,
"grad_norm": 66.98186492919922,
"learning_rate": 3.5504245758056643e-05,
"loss": 6.6559,
"step": 152000
},
{
"epoch": 0.29087066650390625,
"grad_norm": 120.20733642578125,
"learning_rate": 3.545656204223633e-05,
"loss": 6.6021,
"step": 152500
},
{
"epoch": 0.2918243408203125,
"grad_norm": 137.2542724609375,
"learning_rate": 3.540887832641602e-05,
"loss": 6.665,
"step": 153000
},
{
"epoch": 0.29277801513671875,
"grad_norm": 97.15869140625,
"learning_rate": 3.536119461059571e-05,
"loss": 6.6658,
"step": 153500
},
{
"epoch": 0.293731689453125,
"grad_norm": 206.2852020263672,
"learning_rate": 3.531351089477539e-05,
"loss": 6.6735,
"step": 154000
},
{
"epoch": 0.29468536376953125,
"grad_norm": 303.65582275390625,
"learning_rate": 3.526582717895508e-05,
"loss": 6.6759,
"step": 154500
},
{
"epoch": 0.2956390380859375,
"grad_norm": 152.71656799316406,
"learning_rate": 3.5218143463134764e-05,
"loss": 6.6339,
"step": 155000
},
{
"epoch": 0.2956390380859375,
"eval_accuracy": 0.03885968688845401,
"eval_loss": 6.550864219665527,
"eval_runtime": 237.7604,
"eval_samples_per_second": 42.059,
"eval_steps_per_second": 10.515,
"step": 155000
},
{
"epoch": 0.29659271240234375,
"grad_norm": 159.33724975585938,
"learning_rate": 3.5170459747314455e-05,
"loss": 6.6202,
"step": 155500
},
{
"epoch": 0.29754638671875,
"grad_norm": 202.52700805664062,
"learning_rate": 3.5122776031494145e-05,
"loss": 6.6599,
"step": 156000
},
{
"epoch": 0.29850006103515625,
"grad_norm": 1121.404296875,
"learning_rate": 3.507509231567383e-05,
"loss": 6.6501,
"step": 156500
},
{
"epoch": 0.2994537353515625,
"grad_norm": 154.83506774902344,
"learning_rate": 3.502740859985352e-05,
"loss": 6.6193,
"step": 157000
},
{
"epoch": 0.30040740966796875,
"grad_norm": 43.87880325317383,
"learning_rate": 3.49797248840332e-05,
"loss": 6.6633,
"step": 157500
},
{
"epoch": 0.301361083984375,
"grad_norm": 81.32662963867188,
"learning_rate": 3.493204116821289e-05,
"loss": 6.616,
"step": 158000
},
{
"epoch": 0.30231475830078125,
"grad_norm": 62.43561935424805,
"learning_rate": 3.488435745239258e-05,
"loss": 6.6279,
"step": 158500
},
{
"epoch": 0.3032684326171875,
"grad_norm": 67.34500122070312,
"learning_rate": 3.4836673736572266e-05,
"loss": 6.599,
"step": 159000
},
{
"epoch": 0.30422210693359375,
"grad_norm": 123.81517028808594,
"learning_rate": 3.4788990020751956e-05,
"loss": 6.5607,
"step": 159500
},
{
"epoch": 0.30517578125,
"grad_norm": 135.31414794921875,
"learning_rate": 3.474130630493164e-05,
"loss": 6.6132,
"step": 160000
},
{
"epoch": 0.30517578125,
"eval_accuracy": 0.03812700587084149,
"eval_loss": 6.539149284362793,
"eval_runtime": 239.8317,
"eval_samples_per_second": 41.696,
"eval_steps_per_second": 10.424,
"step": 160000
},
{
"epoch": 0.30612945556640625,
"grad_norm": 73.59024047851562,
"learning_rate": 3.469362258911133e-05,
"loss": 6.6094,
"step": 160500
},
{
"epoch": 0.3070831298828125,
"grad_norm": 536.4093017578125,
"learning_rate": 3.464593887329102e-05,
"loss": 6.6274,
"step": 161000
},
{
"epoch": 0.30803680419921875,
"grad_norm": 198.60101318359375,
"learning_rate": 3.45982551574707e-05,
"loss": 6.6282,
"step": 161500
},
{
"epoch": 0.308990478515625,
"grad_norm": 69.17730712890625,
"learning_rate": 3.4550571441650393e-05,
"loss": 6.6177,
"step": 162000
},
{
"epoch": 0.30994415283203125,
"grad_norm": 86.33926391601562,
"learning_rate": 3.450288772583008e-05,
"loss": 6.5894,
"step": 162500
},
{
"epoch": 0.3108978271484375,
"grad_norm": 214.99929809570312,
"learning_rate": 3.445520401000977e-05,
"loss": 6.5999,
"step": 163000
},
{
"epoch": 0.31185150146484375,
"grad_norm": 169.09580993652344,
"learning_rate": 3.440752029418946e-05,
"loss": 6.5878,
"step": 163500
},
{
"epoch": 0.31280517578125,
"grad_norm": 120.34239959716797,
"learning_rate": 3.435983657836914e-05,
"loss": 6.5944,
"step": 164000
},
{
"epoch": 0.31375885009765625,
"grad_norm": 65.94364166259766,
"learning_rate": 3.431215286254883e-05,
"loss": 6.64,
"step": 164500
},
{
"epoch": 0.3147125244140625,
"grad_norm": 75.02478790283203,
"learning_rate": 3.4264469146728514e-05,
"loss": 6.6347,
"step": 165000
},
{
"epoch": 0.3147125244140625,
"eval_accuracy": 0.038639334637964776,
"eval_loss": 6.537603855133057,
"eval_runtime": 248.4175,
"eval_samples_per_second": 40.255,
"eval_steps_per_second": 10.064,
"step": 165000
},
{
"epoch": 0.31566619873046875,
"grad_norm": 55.92184066772461,
"learning_rate": 3.4216785430908205e-05,
"loss": 6.6448,
"step": 165500
},
{
"epoch": 0.316619873046875,
"grad_norm": 58.1660270690918,
"learning_rate": 3.4169101715087895e-05,
"loss": 6.6149,
"step": 166000
},
{
"epoch": 0.31757354736328125,
"grad_norm": 391.7157897949219,
"learning_rate": 3.412141799926758e-05,
"loss": 6.6365,
"step": 166500
},
{
"epoch": 0.3185272216796875,
"grad_norm": 55.8887825012207,
"learning_rate": 3.407373428344727e-05,
"loss": 6.6335,
"step": 167000
},
{
"epoch": 0.31948089599609375,
"grad_norm": 181.20657348632812,
"learning_rate": 3.402605056762695e-05,
"loss": 6.6448,
"step": 167500
},
{
"epoch": 0.3204345703125,
"grad_norm": 94.94180297851562,
"learning_rate": 3.397836685180664e-05,
"loss": 6.6586,
"step": 168000
},
{
"epoch": 0.32138824462890625,
"grad_norm": 184.8909149169922,
"learning_rate": 3.393068313598633e-05,
"loss": 6.6272,
"step": 168500
},
{
"epoch": 0.3223419189453125,
"grad_norm": 67.47586822509766,
"learning_rate": 3.3882999420166016e-05,
"loss": 6.6441,
"step": 169000
},
{
"epoch": 0.32329559326171875,
"grad_norm": 345.04833984375,
"learning_rate": 3.3835315704345706e-05,
"loss": 6.6201,
"step": 169500
},
{
"epoch": 0.324249267578125,
"grad_norm": 114.57809448242188,
"learning_rate": 3.378763198852539e-05,
"loss": 6.6542,
"step": 170000
},
{
"epoch": 0.324249267578125,
"eval_accuracy": 0.03969354207436399,
"eval_loss": 6.537732124328613,
"eval_runtime": 236.6138,
"eval_samples_per_second": 42.263,
"eval_steps_per_second": 10.566,
"step": 170000
},
{
"epoch": 0.32520294189453125,
"grad_norm": 186.77565002441406,
"learning_rate": 3.373994827270508e-05,
"loss": 6.6657,
"step": 170500
},
{
"epoch": 0.3261566162109375,
"grad_norm": 371.9413146972656,
"learning_rate": 3.369226455688477e-05,
"loss": 6.6699,
"step": 171000
},
{
"epoch": 0.32711029052734375,
"grad_norm": 209.8897247314453,
"learning_rate": 3.364458084106445e-05,
"loss": 6.6187,
"step": 171500
},
{
"epoch": 0.32806396484375,
"grad_norm": 206.73399353027344,
"learning_rate": 3.3596897125244143e-05,
"loss": 6.6278,
"step": 172000
},
{
"epoch": 0.32901763916015625,
"grad_norm": 122.4280776977539,
"learning_rate": 3.354921340942383e-05,
"loss": 6.6408,
"step": 172500
},
{
"epoch": 0.3299713134765625,
"grad_norm": 116.70370483398438,
"learning_rate": 3.350152969360352e-05,
"loss": 6.6568,
"step": 173000
},
{
"epoch": 0.33092498779296875,
"grad_norm": 117.74230194091797,
"learning_rate": 3.345384597778321e-05,
"loss": 6.6408,
"step": 173500
},
{
"epoch": 0.331878662109375,
"grad_norm": 258.5967102050781,
"learning_rate": 3.340616226196289e-05,
"loss": 6.6316,
"step": 174000
},
{
"epoch": 0.33283233642578125,
"grad_norm": 142.6446533203125,
"learning_rate": 3.335847854614258e-05,
"loss": 6.6485,
"step": 174500
},
{
"epoch": 0.3337860107421875,
"grad_norm": 119.10045623779297,
"learning_rate": 3.3310794830322264e-05,
"loss": 6.619,
"step": 175000
},
{
"epoch": 0.3337860107421875,
"eval_accuracy": 0.03876301369863014,
"eval_loss": 6.544088840484619,
"eval_runtime": 247.1694,
"eval_samples_per_second": 40.458,
"eval_steps_per_second": 10.115,
"step": 175000
},
{
"epoch": 0.33473968505859375,
"grad_norm": 86.29735565185547,
"learning_rate": 3.3263111114501955e-05,
"loss": 6.6692,
"step": 175500
},
{
"epoch": 0.335693359375,
"grad_norm": 594.6138305664062,
"learning_rate": 3.3215427398681645e-05,
"loss": 6.652,
"step": 176000
},
{
"epoch": 0.33664703369140625,
"grad_norm": 157.3546142578125,
"learning_rate": 3.316774368286133e-05,
"loss": 6.6185,
"step": 176500
},
{
"epoch": 0.3376007080078125,
"grad_norm": 174.54434204101562,
"learning_rate": 3.312005996704102e-05,
"loss": 6.6237,
"step": 177000
},
{
"epoch": 0.33855438232421875,
"grad_norm": 210.47975158691406,
"learning_rate": 3.30723762512207e-05,
"loss": 6.5825,
"step": 177500
},
{
"epoch": 0.339508056640625,
"grad_norm": 146.17979431152344,
"learning_rate": 3.302469253540039e-05,
"loss": 6.5469,
"step": 178000
},
{
"epoch": 0.34046173095703125,
"grad_norm": 114.10115051269531,
"learning_rate": 3.297700881958008e-05,
"loss": 6.5547,
"step": 178500
},
{
"epoch": 0.3414154052734375,
"grad_norm": 75.68565368652344,
"learning_rate": 3.2929325103759766e-05,
"loss": 6.609,
"step": 179000
},
{
"epoch": 0.34236907958984375,
"grad_norm": 79.07707977294922,
"learning_rate": 3.2881641387939456e-05,
"loss": 6.6539,
"step": 179500
},
{
"epoch": 0.34332275390625,
"grad_norm": 584.5313110351562,
"learning_rate": 3.283395767211914e-05,
"loss": 6.6979,
"step": 180000
},
{
"epoch": 0.34332275390625,
"eval_accuracy": 0.03938180039138943,
"eval_loss": 6.555627346038818,
"eval_runtime": 248.0084,
"eval_samples_per_second": 40.321,
"eval_steps_per_second": 10.08,
"step": 180000
},
{
"epoch": 0.34427642822265625,
"grad_norm": 293.7067565917969,
"learning_rate": 3.278627395629883e-05,
"loss": 6.6661,
"step": 180500
},
{
"epoch": 0.3452301025390625,
"grad_norm": 112.20381927490234,
"learning_rate": 3.273859024047852e-05,
"loss": 6.6845,
"step": 181000
},
{
"epoch": 0.34618377685546875,
"grad_norm": 207.96649169921875,
"learning_rate": 3.26909065246582e-05,
"loss": 6.6346,
"step": 181500
},
{
"epoch": 0.347137451171875,
"grad_norm": 114.42713165283203,
"learning_rate": 3.2643222808837893e-05,
"loss": 6.6249,
"step": 182000
},
{
"epoch": 0.34809112548828125,
"grad_norm": 102.50064086914062,
"learning_rate": 3.259553909301758e-05,
"loss": 6.6432,
"step": 182500
},
{
"epoch": 0.3490447998046875,
"grad_norm": 73.7787857055664,
"learning_rate": 3.254785537719727e-05,
"loss": 6.6306,
"step": 183000
},
{
"epoch": 0.34999847412109375,
"grad_norm": 151.01646423339844,
"learning_rate": 3.250017166137696e-05,
"loss": 6.6134,
"step": 183500
},
{
"epoch": 0.3509521484375,
"grad_norm": 58.98166275024414,
"learning_rate": 3.245248794555664e-05,
"loss": 6.649,
"step": 184000
},
{
"epoch": 0.35190582275390625,
"grad_norm": 103.82510375976562,
"learning_rate": 3.240480422973633e-05,
"loss": 6.6382,
"step": 184500
},
{
"epoch": 0.3528594970703125,
"grad_norm": 50.53388977050781,
"learning_rate": 3.2357120513916014e-05,
"loss": 6.6485,
"step": 185000
},
{
"epoch": 0.3528594970703125,
"eval_accuracy": 0.039942270058708415,
"eval_loss": 6.536978244781494,
"eval_runtime": 248.6094,
"eval_samples_per_second": 40.224,
"eval_steps_per_second": 10.056,
"step": 185000
},
{
"epoch": 0.35381317138671875,
"grad_norm": 125.3191909790039,
"learning_rate": 3.2309436798095705e-05,
"loss": 6.6498,
"step": 185500
},
{
"epoch": 0.354766845703125,
"grad_norm": 113.97254943847656,
"learning_rate": 3.2261753082275395e-05,
"loss": 6.5974,
"step": 186000
},
{
"epoch": 0.35572052001953125,
"grad_norm": 134.717529296875,
"learning_rate": 3.221406936645508e-05,
"loss": 6.6141,
"step": 186500
},
{
"epoch": 0.3566741943359375,
"grad_norm": 144.05467224121094,
"learning_rate": 3.216638565063477e-05,
"loss": 6.595,
"step": 187000
},
{
"epoch": 0.35762786865234375,
"grad_norm": 60.121891021728516,
"learning_rate": 3.211870193481445e-05,
"loss": 6.6121,
"step": 187500
},
{
"epoch": 0.35858154296875,
"grad_norm": 44.34535598754883,
"learning_rate": 3.207101821899414e-05,
"loss": 6.6154,
"step": 188000
},
{
"epoch": 0.35953521728515625,
"grad_norm": 227.33172607421875,
"learning_rate": 3.202333450317383e-05,
"loss": 6.594,
"step": 188500
},
{
"epoch": 0.3604888916015625,
"grad_norm": 1378.051025390625,
"learning_rate": 3.1975650787353516e-05,
"loss": 6.6521,
"step": 189000
},
{
"epoch": 0.36144256591796875,
"grad_norm": 46.72209930419922,
"learning_rate": 3.1927967071533206e-05,
"loss": 6.5648,
"step": 189500
},
{
"epoch": 0.362396240234375,
"grad_norm": 131.37269592285156,
"learning_rate": 3.188028335571289e-05,
"loss": 6.6035,
"step": 190000
},
{
"epoch": 0.362396240234375,
"eval_accuracy": 0.03885205479452055,
"eval_loss": 6.5300092697143555,
"eval_runtime": 251.9306,
"eval_samples_per_second": 39.693,
"eval_steps_per_second": 9.923,
"step": 190000
},
{
"epoch": 0.36334991455078125,
"grad_norm": 76.72520446777344,
"learning_rate": 3.183259963989258e-05,
"loss": 6.6213,
"step": 190500
},
{
"epoch": 0.3643035888671875,
"grad_norm": 112.77885437011719,
"learning_rate": 3.178491592407227e-05,
"loss": 6.6474,
"step": 191000
},
{
"epoch": 0.36525726318359375,
"grad_norm": 70.67579650878906,
"learning_rate": 3.173723220825195e-05,
"loss": 6.6288,
"step": 191500
},
{
"epoch": 0.3662109375,
"grad_norm": 59.372642517089844,
"learning_rate": 3.1689548492431643e-05,
"loss": 6.5876,
"step": 192000
},
{
"epoch": 0.36716461181640625,
"grad_norm": 63.93340301513672,
"learning_rate": 3.164186477661133e-05,
"loss": 6.5254,
"step": 192500
},
{
"epoch": 0.3681182861328125,
"grad_norm": 93.77088165283203,
"learning_rate": 3.159418106079102e-05,
"loss": 6.5014,
"step": 193000
},
{
"epoch": 0.36907196044921875,
"grad_norm": 348.0354919433594,
"learning_rate": 3.154649734497071e-05,
"loss": 6.5559,
"step": 193500
},
{
"epoch": 0.370025634765625,
"grad_norm": 108.91474914550781,
"learning_rate": 3.149881362915039e-05,
"loss": 6.5741,
"step": 194000
},
{
"epoch": 0.37097930908203125,
"grad_norm": 121.82709503173828,
"learning_rate": 3.145112991333008e-05,
"loss": 6.6283,
"step": 194500
},
{
"epoch": 0.3719329833984375,
"grad_norm": 123.15145111083984,
"learning_rate": 3.1403446197509764e-05,
"loss": 6.6574,
"step": 195000
},
{
"epoch": 0.3719329833984375,
"eval_accuracy": 0.038532093933463796,
"eval_loss": 6.527206897735596,
"eval_runtime": 254.2011,
"eval_samples_per_second": 39.339,
"eval_steps_per_second": 9.835,
"step": 195000
},
{
"epoch": 0.37288665771484375,
"grad_norm": 97.69001007080078,
"learning_rate": 3.1355762481689455e-05,
"loss": 6.6062,
"step": 195500
},
{
"epoch": 0.37384033203125,
"grad_norm": 160.37985229492188,
"learning_rate": 3.1308078765869145e-05,
"loss": 6.6397,
"step": 196000
},
{
"epoch": 0.37479400634765625,
"grad_norm": 107.5202407836914,
"learning_rate": 3.126039505004883e-05,
"loss": 6.6337,
"step": 196500
},
{
"epoch": 0.3757476806640625,
"grad_norm": 306.0714416503906,
"learning_rate": 3.121271133422852e-05,
"loss": 6.6156,
"step": 197000
},
{
"epoch": 0.37670135498046875,
"grad_norm": 147.80152893066406,
"learning_rate": 3.11650276184082e-05,
"loss": 6.6691,
"step": 197500
},
{
"epoch": 0.377655029296875,
"grad_norm": 69.64010620117188,
"learning_rate": 3.111734390258789e-05,
"loss": 6.6362,
"step": 198000
},
{
"epoch": 0.37860870361328125,
"grad_norm": 61.15127182006836,
"learning_rate": 3.106966018676758e-05,
"loss": 6.6665,
"step": 198500
},
{
"epoch": 0.3795623779296875,
"grad_norm": 524.3397216796875,
"learning_rate": 3.1021976470947266e-05,
"loss": 6.6447,
"step": 199000
},
{
"epoch": 0.38051605224609375,
"grad_norm": 206.949951171875,
"learning_rate": 3.0974292755126956e-05,
"loss": 6.6509,
"step": 199500
},
{
"epoch": 0.3814697265625,
"grad_norm": 109.48726654052734,
"learning_rate": 3.092660903930664e-05,
"loss": 6.6152,
"step": 200000
},
{
"epoch": 0.3814697265625,
"eval_accuracy": 0.03765283757338552,
"eval_loss": 6.532608985900879,
"eval_runtime": 239.0143,
"eval_samples_per_second": 41.839,
"eval_steps_per_second": 10.46,
"step": 200000
},
{
"epoch": 0.38242340087890625,
"grad_norm": 211.54612731933594,
"learning_rate": 3.087892532348633e-05,
"loss": 6.5765,
"step": 200500
},
{
"epoch": 0.3833770751953125,
"grad_norm": 123.34220123291016,
"learning_rate": 3.083124160766602e-05,
"loss": 6.6243,
"step": 201000
},
{
"epoch": 0.38433074951171875,
"grad_norm": 164.30299377441406,
"learning_rate": 3.07835578918457e-05,
"loss": 6.6452,
"step": 201500
},
{
"epoch": 0.385284423828125,
"grad_norm": 156.64134216308594,
"learning_rate": 3.0735874176025393e-05,
"loss": 6.6471,
"step": 202000
},
{
"epoch": 0.38623809814453125,
"grad_norm": 121.67871856689453,
"learning_rate": 3.068819046020508e-05,
"loss": 6.6086,
"step": 202500
},
{
"epoch": 0.3871917724609375,
"grad_norm": 93.36353302001953,
"learning_rate": 3.064050674438477e-05,
"loss": 6.6283,
"step": 203000
},
{
"epoch": 0.38814544677734375,
"grad_norm": 455.4304504394531,
"learning_rate": 3.059282302856446e-05,
"loss": 6.6332,
"step": 203500
},
{
"epoch": 0.38909912109375,
"grad_norm": 118.70697784423828,
"learning_rate": 3.054513931274414e-05,
"loss": 6.6339,
"step": 204000
},
{
"epoch": 0.39005279541015625,
"grad_norm": 59.528892517089844,
"learning_rate": 3.049745559692383e-05,
"loss": 6.5999,
"step": 204500
},
{
"epoch": 0.3910064697265625,
"grad_norm": 169.48011779785156,
"learning_rate": 3.0449771881103518e-05,
"loss": 6.5946,
"step": 205000
},
{
"epoch": 0.3910064697265625,
"eval_accuracy": 0.03863835616438356,
"eval_loss": 6.531431198120117,
"eval_runtime": 247.6939,
"eval_samples_per_second": 40.372,
"eval_steps_per_second": 10.093,
"step": 205000
},
{
"epoch": 0.39196014404296875,
"grad_norm": 188.75572204589844,
"learning_rate": 3.0402088165283205e-05,
"loss": 6.5888,
"step": 205500
},
{
"epoch": 0.392913818359375,
"grad_norm": 67.42155456542969,
"learning_rate": 3.035440444946289e-05,
"loss": 6.586,
"step": 206000
},
{
"epoch": 0.39386749267578125,
"grad_norm": 76.16634368896484,
"learning_rate": 3.0306720733642578e-05,
"loss": 6.5993,
"step": 206500
},
{
"epoch": 0.3948211669921875,
"grad_norm": 259.7588806152344,
"learning_rate": 3.025903701782227e-05,
"loss": 6.6088,
"step": 207000
},
{
"epoch": 0.39577484130859375,
"grad_norm": 75.8446273803711,
"learning_rate": 3.0211353302001955e-05,
"loss": 6.5749,
"step": 207500
},
{
"epoch": 0.396728515625,
"grad_norm": 563.4567260742188,
"learning_rate": 3.0163669586181642e-05,
"loss": 6.5621,
"step": 208000
},
{
"epoch": 0.39768218994140625,
"grad_norm": 181.39927673339844,
"learning_rate": 3.011598587036133e-05,
"loss": 6.5429,
"step": 208500
},
{
"epoch": 0.3986358642578125,
"grad_norm": 78.88523864746094,
"learning_rate": 3.0068302154541016e-05,
"loss": 6.5932,
"step": 209000
},
{
"epoch": 0.39958953857421875,
"grad_norm": 288.15118408203125,
"learning_rate": 3.0020618438720706e-05,
"loss": 6.6502,
"step": 209500
},
{
"epoch": 0.400543212890625,
"grad_norm": 1462.8275146484375,
"learning_rate": 2.9972934722900393e-05,
"loss": 6.6747,
"step": 210000
},
{
"epoch": 0.400543212890625,
"eval_accuracy": 0.039049902152641876,
"eval_loss": 6.518420696258545,
"eval_runtime": 245.3499,
"eval_samples_per_second": 40.758,
"eval_steps_per_second": 10.19,
"step": 210000
},
{
"epoch": 0.40149688720703125,
"grad_norm": 270.0433654785156,
"learning_rate": 2.992525100708008e-05,
"loss": 6.6314,
"step": 210500
},
{
"epoch": 0.4024505615234375,
"grad_norm": 140.15133666992188,
"learning_rate": 2.9877567291259766e-05,
"loss": 6.6099,
"step": 211000
},
{
"epoch": 0.40340423583984375,
"grad_norm": 158.65878295898438,
"learning_rate": 2.9829883575439453e-05,
"loss": 6.6602,
"step": 211500
},
{
"epoch": 0.40435791015625,
"grad_norm": 63.215354919433594,
"learning_rate": 2.9782199859619143e-05,
"loss": 6.5969,
"step": 212000
},
{
"epoch": 0.40531158447265625,
"grad_norm": 434.90972900390625,
"learning_rate": 2.973451614379883e-05,
"loss": 6.5978,
"step": 212500
},
{
"epoch": 0.4062652587890625,
"grad_norm": 123.8073959350586,
"learning_rate": 2.9686832427978517e-05,
"loss": 6.632,
"step": 213000
},
{
"epoch": 0.40721893310546875,
"grad_norm": 131.49026489257812,
"learning_rate": 2.9639148712158204e-05,
"loss": 6.6471,
"step": 213500
},
{
"epoch": 0.408172607421875,
"grad_norm": 275.69775390625,
"learning_rate": 2.959146499633789e-05,
"loss": 6.6522,
"step": 214000
},
{
"epoch": 0.40912628173828125,
"grad_norm": 285.1741638183594,
"learning_rate": 2.954378128051758e-05,
"loss": 6.6183,
"step": 214500
},
{
"epoch": 0.4100799560546875,
"grad_norm": 242.5558624267578,
"learning_rate": 2.9496097564697268e-05,
"loss": 6.618,
"step": 215000
},
{
"epoch": 0.4100799560546875,
"eval_accuracy": 0.03752504892367906,
"eval_loss": 6.5262370109558105,
"eval_runtime": 236.1515,
"eval_samples_per_second": 42.346,
"eval_steps_per_second": 10.586,
"step": 215000
},
{
"epoch": 0.41103363037109375,
"grad_norm": 111.01753234863281,
"learning_rate": 2.9448413848876955e-05,
"loss": 6.5984,
"step": 215500
},
{
"epoch": 0.4119873046875,
"grad_norm": 79.53560638427734,
"learning_rate": 2.940073013305664e-05,
"loss": 6.6158,
"step": 216000
},
{
"epoch": 0.41294097900390625,
"grad_norm": 56.24204635620117,
"learning_rate": 2.9353046417236328e-05,
"loss": 6.6407,
"step": 216500
},
{
"epoch": 0.4138946533203125,
"grad_norm": 407.7383117675781,
"learning_rate": 2.930536270141602e-05,
"loss": 6.5933,
"step": 217000
},
{
"epoch": 0.41484832763671875,
"grad_norm": 111.54969024658203,
"learning_rate": 2.9257678985595705e-05,
"loss": 6.5416,
"step": 217500
},
{
"epoch": 0.415802001953125,
"grad_norm": 573.9189453125,
"learning_rate": 2.9209995269775392e-05,
"loss": 6.627,
"step": 218000
},
{
"epoch": 0.41675567626953125,
"grad_norm": 61.13914489746094,
"learning_rate": 2.916231155395508e-05,
"loss": 6.6041,
"step": 218500
},
{
"epoch": 0.4177093505859375,
"grad_norm": 223.9310302734375,
"learning_rate": 2.9114627838134766e-05,
"loss": 6.6351,
"step": 219000
},
{
"epoch": 0.41866302490234375,
"grad_norm": 222.9752960205078,
"learning_rate": 2.9066944122314456e-05,
"loss": 6.6542,
"step": 219500
},
{
"epoch": 0.41961669921875,
"grad_norm": 93.45650482177734,
"learning_rate": 2.9019260406494143e-05,
"loss": 6.6218,
"step": 220000
},
{
"epoch": 0.41961669921875,
"eval_accuracy": 0.038779843444227004,
"eval_loss": 6.533949375152588,
"eval_runtime": 253.8147,
"eval_samples_per_second": 39.399,
"eval_steps_per_second": 9.85,
"step": 220000
},
{
"epoch": 0.42057037353515625,
"grad_norm": 116.55095672607422,
"learning_rate": 2.897157669067383e-05,
"loss": 6.6154,
"step": 220500
},
{
"epoch": 0.4215240478515625,
"grad_norm": 97.29713439941406,
"learning_rate": 2.8923892974853516e-05,
"loss": 6.5961,
"step": 221000
},
{
"epoch": 0.42247772216796875,
"grad_norm": 54.4910774230957,
"learning_rate": 2.8876209259033203e-05,
"loss": 6.5797,
"step": 221500
},
{
"epoch": 0.423431396484375,
"grad_norm": 365.531494140625,
"learning_rate": 2.8828525543212893e-05,
"loss": 6.5709,
"step": 222000
},
{
"epoch": 0.42438507080078125,
"grad_norm": 59.368221282958984,
"learning_rate": 2.878084182739258e-05,
"loss": 6.5644,
"step": 222500
},
{
"epoch": 0.4253387451171875,
"grad_norm": 70.24651336669922,
"learning_rate": 2.8733158111572267e-05,
"loss": 6.5807,
"step": 223000
},
{
"epoch": 0.42629241943359375,
"grad_norm": 67.14861297607422,
"learning_rate": 2.8685474395751954e-05,
"loss": 6.6404,
"step": 223500
},
{
"epoch": 0.42724609375,
"grad_norm": 115.37001037597656,
"learning_rate": 2.863779067993164e-05,
"loss": 6.6317,
"step": 224000
},
{
"epoch": 0.42819976806640625,
"grad_norm": 134.29444885253906,
"learning_rate": 2.859010696411133e-05,
"loss": 6.653,
"step": 224500
},
{
"epoch": 0.4291534423828125,
"grad_norm": 88.78755187988281,
"learning_rate": 2.8542423248291018e-05,
"loss": 6.6659,
"step": 225000
},
{
"epoch": 0.4291534423828125,
"eval_accuracy": 0.03827005870841487,
"eval_loss": 6.525757312774658,
"eval_runtime": 239.6203,
"eval_samples_per_second": 41.733,
"eval_steps_per_second": 10.433,
"step": 225000
},
{
"epoch": 0.43010711669921875,
"grad_norm": 59.79543685913086,
"learning_rate": 2.8494739532470705e-05,
"loss": 6.6049,
"step": 225500
},
{
"epoch": 0.431060791015625,
"grad_norm": 70.32538604736328,
"learning_rate": 2.844705581665039e-05,
"loss": 6.6035,
"step": 226000
},
{
"epoch": 0.43201446533203125,
"grad_norm": 47.479225158691406,
"learning_rate": 2.8399372100830078e-05,
"loss": 6.665,
"step": 226500
},
{
"epoch": 0.4329681396484375,
"grad_norm": 299.8247375488281,
"learning_rate": 2.835168838500977e-05,
"loss": 6.6103,
"step": 227000
},
{
"epoch": 0.43392181396484375,
"grad_norm": 68.14066314697266,
"learning_rate": 2.8304004669189455e-05,
"loss": 6.6295,
"step": 227500
},
{
"epoch": 0.43487548828125,
"grad_norm": 134.41168212890625,
"learning_rate": 2.8256320953369142e-05,
"loss": 6.5957,
"step": 228000
},
{
"epoch": 0.43582916259765625,
"grad_norm": 121.62809753417969,
"learning_rate": 2.820863723754883e-05,
"loss": 6.646,
"step": 228500
},
{
"epoch": 0.4367828369140625,
"grad_norm": 66.99333190917969,
"learning_rate": 2.8160953521728516e-05,
"loss": 6.6207,
"step": 229000
},
{
"epoch": 0.43773651123046875,
"grad_norm": 105.14512634277344,
"learning_rate": 2.8113269805908206e-05,
"loss": 6.6208,
"step": 229500
},
{
"epoch": 0.438690185546875,
"grad_norm": 80.91681671142578,
"learning_rate": 2.8065586090087893e-05,
"loss": 6.6292,
"step": 230000
},
{
"epoch": 0.438690185546875,
"eval_accuracy": 0.03869960861056752,
"eval_loss": 6.524181365966797,
"eval_runtime": 237.6497,
"eval_samples_per_second": 42.079,
"eval_steps_per_second": 10.52,
"step": 230000
},
{
"epoch": 0.43964385986328125,
"grad_norm": 70.6031494140625,
"learning_rate": 2.801790237426758e-05,
"loss": 6.634,
"step": 230500
},
{
"epoch": 0.4405975341796875,
"grad_norm": 94.75713348388672,
"learning_rate": 2.7970218658447266e-05,
"loss": 6.5882,
"step": 231000
},
{
"epoch": 0.44155120849609375,
"grad_norm": 85.7402572631836,
"learning_rate": 2.7922534942626953e-05,
"loss": 6.6304,
"step": 231500
},
{
"epoch": 0.4425048828125,
"grad_norm": 62.476173400878906,
"learning_rate": 2.7874851226806643e-05,
"loss": 6.602,
"step": 232000
},
{
"epoch": 0.44345855712890625,
"grad_norm": 103.45085144042969,
"learning_rate": 2.782716751098633e-05,
"loss": 6.6056,
"step": 232500
},
{
"epoch": 0.4444122314453125,
"grad_norm": 188.56761169433594,
"learning_rate": 2.7779483795166017e-05,
"loss": 6.611,
"step": 233000
},
{
"epoch": 0.44536590576171875,
"grad_norm": 122.7448959350586,
"learning_rate": 2.7731800079345704e-05,
"loss": 6.5814,
"step": 233500
},
{
"epoch": 0.446319580078125,
"grad_norm": 127.61102294921875,
"learning_rate": 2.768411636352539e-05,
"loss": 6.62,
"step": 234000
},
{
"epoch": 0.44727325439453125,
"grad_norm": 55.343955993652344,
"learning_rate": 2.763643264770508e-05,
"loss": 6.6301,
"step": 234500
},
{
"epoch": 0.4482269287109375,
"grad_norm": 31.93642234802246,
"learning_rate": 2.7588748931884768e-05,
"loss": 6.6608,
"step": 235000
},
{
"epoch": 0.4482269287109375,
"eval_accuracy": 0.035914090019569474,
"eval_loss": 6.558816432952881,
"eval_runtime": 249.3294,
"eval_samples_per_second": 40.108,
"eval_steps_per_second": 10.027,
"step": 235000
},
{
"epoch": 0.44918060302734375,
"grad_norm": 498.0051574707031,
"learning_rate": 2.7541065216064455e-05,
"loss": 6.6317,
"step": 235500
},
{
"epoch": 0.45013427734375,
"grad_norm": 81.84600830078125,
"learning_rate": 2.749338150024414e-05,
"loss": 6.5796,
"step": 236000
},
{
"epoch": 0.45108795166015625,
"grad_norm": 95.89019775390625,
"learning_rate": 2.7445697784423828e-05,
"loss": 6.6014,
"step": 236500
},
{
"epoch": 0.4520416259765625,
"grad_norm": 65.94055938720703,
"learning_rate": 2.739801406860352e-05,
"loss": 6.576,
"step": 237000
},
{
"epoch": 0.45299530029296875,
"grad_norm": 232.42098999023438,
"learning_rate": 2.7350330352783205e-05,
"loss": 6.5618,
"step": 237500
},
{
"epoch": 0.453948974609375,
"grad_norm": 46.52445983886719,
"learning_rate": 2.7302646636962892e-05,
"loss": 6.6523,
"step": 238000
},
{
"epoch": 0.45490264892578125,
"grad_norm": 55.36016082763672,
"learning_rate": 2.725496292114258e-05,
"loss": 6.6198,
"step": 238500
},
{
"epoch": 0.4558563232421875,
"grad_norm": 42.32960891723633,
"learning_rate": 2.7207279205322266e-05,
"loss": 6.6285,
"step": 239000
},
{
"epoch": 0.45680999755859375,
"grad_norm": 65.83206939697266,
"learning_rate": 2.7159595489501956e-05,
"loss": 6.6045,
"step": 239500
},
{
"epoch": 0.457763671875,
"grad_norm": 297.2817077636719,
"learning_rate": 2.7111911773681643e-05,
"loss": 6.5772,
"step": 240000
},
{
"epoch": 0.457763671875,
"eval_accuracy": 0.0389399217221135,
"eval_loss": 6.511717319488525,
"eval_runtime": 239.2997,
"eval_samples_per_second": 41.789,
"eval_steps_per_second": 10.447,
"step": 240000
},
{
"epoch": 0.45871734619140625,
"grad_norm": 284.4686279296875,
"learning_rate": 2.706422805786133e-05,
"loss": 6.6136,
"step": 240500
},
{
"epoch": 0.4596710205078125,
"grad_norm": 57.11293411254883,
"learning_rate": 2.7016544342041016e-05,
"loss": 6.6187,
"step": 241000
},
{
"epoch": 0.46062469482421875,
"grad_norm": 167.8474578857422,
"learning_rate": 2.6968860626220703e-05,
"loss": 6.651,
"step": 241500
},
{
"epoch": 0.461578369140625,
"grad_norm": 47.619842529296875,
"learning_rate": 2.6921176910400393e-05,
"loss": 6.6191,
"step": 242000
},
{
"epoch": 0.46253204345703125,
"grad_norm": 87.00941467285156,
"learning_rate": 2.687349319458008e-05,
"loss": 6.6161,
"step": 242500
},
{
"epoch": 0.4634857177734375,
"grad_norm": 226.7034149169922,
"learning_rate": 2.6825809478759767e-05,
"loss": 6.6149,
"step": 243000
},
{
"epoch": 0.46443939208984375,
"grad_norm": 103.8028335571289,
"learning_rate": 2.6778125762939454e-05,
"loss": 6.6247,
"step": 243500
},
{
"epoch": 0.46539306640625,
"grad_norm": 83.36387634277344,
"learning_rate": 2.673044204711914e-05,
"loss": 6.6019,
"step": 244000
},
{
"epoch": 0.46634674072265625,
"grad_norm": 75.43655395507812,
"learning_rate": 2.668275833129883e-05,
"loss": 6.5738,
"step": 244500
},
{
"epoch": 0.4673004150390625,
"grad_norm": 113.69373321533203,
"learning_rate": 2.6635074615478518e-05,
"loss": 6.5961,
"step": 245000
},
{
"epoch": 0.4673004150390625,
"eval_accuracy": 0.03589549902152642,
"eval_loss": 6.538053035736084,
"eval_runtime": 238.1145,
"eval_samples_per_second": 41.997,
"eval_steps_per_second": 10.499,
"step": 245000
},
{
"epoch": 0.46825408935546875,
"grad_norm": 71.73027801513672,
"learning_rate": 2.6587390899658205e-05,
"loss": 6.6012,
"step": 245500
},
{
"epoch": 0.469207763671875,
"grad_norm": 108.12360382080078,
"learning_rate": 2.653970718383789e-05,
"loss": 6.5581,
"step": 246000
},
{
"epoch": 0.47016143798828125,
"grad_norm": 78.5285873413086,
"learning_rate": 2.6492023468017578e-05,
"loss": 6.5655,
"step": 246500
},
{
"epoch": 0.4711151123046875,
"grad_norm": 42.62543487548828,
"learning_rate": 2.644433975219727e-05,
"loss": 6.595,
"step": 247000
},
{
"epoch": 0.47206878662109375,
"grad_norm": 51.91836929321289,
"learning_rate": 2.6396656036376955e-05,
"loss": 6.5763,
"step": 247500
},
{
"epoch": 0.4730224609375,
"grad_norm": 34.81822204589844,
"learning_rate": 2.6348972320556642e-05,
"loss": 6.6161,
"step": 248000
},
{
"epoch": 0.47397613525390625,
"grad_norm": 33.98869323730469,
"learning_rate": 2.630128860473633e-05,
"loss": 6.5918,
"step": 248500
},
{
"epoch": 0.4749298095703125,
"grad_norm": 36.41916275024414,
"learning_rate": 2.6253604888916016e-05,
"loss": 6.6115,
"step": 249000
},
{
"epoch": 0.47588348388671875,
"grad_norm": 144.3103485107422,
"learning_rate": 2.6205921173095706e-05,
"loss": 6.6107,
"step": 249500
},
{
"epoch": 0.476837158203125,
"grad_norm": 67.23009490966797,
"learning_rate": 2.6158237457275393e-05,
"loss": 6.563,
"step": 250000
},
{
"epoch": 0.476837158203125,
"eval_accuracy": 0.04007847358121331,
"eval_loss": 6.519131183624268,
"eval_runtime": 239.77,
"eval_samples_per_second": 41.707,
"eval_steps_per_second": 10.427,
"step": 250000
},
{
"epoch": 0.47779083251953125,
"grad_norm": 52.45304870605469,
"learning_rate": 2.611055374145508e-05,
"loss": 6.5189,
"step": 250500
},
{
"epoch": 0.4787445068359375,
"grad_norm": 144.16065979003906,
"learning_rate": 2.6062870025634766e-05,
"loss": 6.5234,
"step": 251000
},
{
"epoch": 0.47969818115234375,
"grad_norm": 53.3181037902832,
"learning_rate": 2.6015186309814453e-05,
"loss": 6.5775,
"step": 251500
},
{
"epoch": 0.48065185546875,
"grad_norm": 61.429264068603516,
"learning_rate": 2.5967502593994143e-05,
"loss": 6.5551,
"step": 252000
},
{
"epoch": 0.48160552978515625,
"grad_norm": 77.74535369873047,
"learning_rate": 2.591981887817383e-05,
"loss": 6.6271,
"step": 252500
},
{
"epoch": 0.4825592041015625,
"grad_norm": 62.36492156982422,
"learning_rate": 2.5872135162353517e-05,
"loss": 6.599,
"step": 253000
},
{
"epoch": 0.48351287841796875,
"grad_norm": 48.22749710083008,
"learning_rate": 2.5824451446533204e-05,
"loss": 6.6033,
"step": 253500
},
{
"epoch": 0.484466552734375,
"grad_norm": 66.73870086669922,
"learning_rate": 2.577676773071289e-05,
"loss": 6.5897,
"step": 254000
},
{
"epoch": 0.48542022705078125,
"grad_norm": 114.62262725830078,
"learning_rate": 2.572908401489258e-05,
"loss": 6.4891,
"step": 254500
},
{
"epoch": 0.4863739013671875,
"grad_norm": 47.45481491088867,
"learning_rate": 2.5681400299072268e-05,
"loss": 6.5651,
"step": 255000
},
{
"epoch": 0.4863739013671875,
"eval_accuracy": 0.0385412915851272,
"eval_loss": 6.4979939460754395,
"eval_runtime": 243.7128,
"eval_samples_per_second": 41.032,
"eval_steps_per_second": 10.258,
"step": 255000
},
{
"epoch": 0.48732757568359375,
"grad_norm": 168.27577209472656,
"learning_rate": 2.5633716583251955e-05,
"loss": 6.5793,
"step": 255500
},
{
"epoch": 0.48828125,
"grad_norm": 58.58369064331055,
"learning_rate": 2.558603286743164e-05,
"loss": 6.5713,
"step": 256000
},
{
"epoch": 0.48923492431640625,
"grad_norm": 136.4830780029297,
"learning_rate": 2.5538349151611328e-05,
"loss": 6.5813,
"step": 256500
},
{
"epoch": 0.4901885986328125,
"grad_norm": 141.25311279296875,
"learning_rate": 2.549066543579102e-05,
"loss": 6.5841,
"step": 257000
},
{
"epoch": 0.49114227294921875,
"grad_norm": 62.92488479614258,
"learning_rate": 2.5442981719970705e-05,
"loss": 6.5859,
"step": 257500
},
{
"epoch": 0.492095947265625,
"grad_norm": 322.6897888183594,
"learning_rate": 2.5395298004150392e-05,
"loss": 6.5814,
"step": 258000
},
{
"epoch": 0.49304962158203125,
"grad_norm": 223.2647247314453,
"learning_rate": 2.534761428833008e-05,
"loss": 6.5797,
"step": 258500
},
{
"epoch": 0.4940032958984375,
"grad_norm": 50.54922103881836,
"learning_rate": 2.5299930572509766e-05,
"loss": 6.5791,
"step": 259000
},
{
"epoch": 0.49495697021484375,
"grad_norm": 206.4066619873047,
"learning_rate": 2.5252246856689456e-05,
"loss": 6.6171,
"step": 259500
},
{
"epoch": 0.49591064453125,
"grad_norm": 82.16864776611328,
"learning_rate": 2.5204563140869143e-05,
"loss": 6.5398,
"step": 260000
},
{
"epoch": 0.49591064453125,
"eval_accuracy": 0.03885655577299413,
"eval_loss": 6.49983024597168,
"eval_runtime": 237.2672,
"eval_samples_per_second": 42.147,
"eval_steps_per_second": 10.537,
"step": 260000
},
{
"epoch": 0.49686431884765625,
"grad_norm": 100.80988311767578,
"learning_rate": 2.515687942504883e-05,
"loss": 6.5823,
"step": 260500
},
{
"epoch": 0.4978179931640625,
"grad_norm": 40.91484069824219,
"learning_rate": 2.5109195709228516e-05,
"loss": 6.5807,
"step": 261000
},
{
"epoch": 0.49877166748046875,
"grad_norm": 198.70712280273438,
"learning_rate": 2.5061511993408203e-05,
"loss": 6.5759,
"step": 261500
},
{
"epoch": 0.499725341796875,
"grad_norm": 102.08186340332031,
"learning_rate": 2.5013828277587893e-05,
"loss": 6.6002,
"step": 262000
},
{
"epoch": 0.5006790161132812,
"grad_norm": 114.76848602294922,
"learning_rate": 2.496614456176758e-05,
"loss": 6.5748,
"step": 262500
},
{
"epoch": 0.5016326904296875,
"grad_norm": 475.52337646484375,
"learning_rate": 2.4918460845947267e-05,
"loss": 6.5824,
"step": 263000
},
{
"epoch": 0.5025863647460938,
"grad_norm": 63.79277420043945,
"learning_rate": 2.4870777130126954e-05,
"loss": 6.5644,
"step": 263500
},
{
"epoch": 0.5035400390625,
"grad_norm": 273.6373291015625,
"learning_rate": 2.482309341430664e-05,
"loss": 6.6185,
"step": 264000
},
{
"epoch": 0.5044937133789062,
"grad_norm": 174.2074432373047,
"learning_rate": 2.477540969848633e-05,
"loss": 6.5506,
"step": 264500
},
{
"epoch": 0.5054473876953125,
"grad_norm": 1188.002685546875,
"learning_rate": 2.4727725982666018e-05,
"loss": 6.5368,
"step": 265000
},
{
"epoch": 0.5054473876953125,
"eval_accuracy": 0.03887651663405088,
"eval_loss": 6.4984235763549805,
"eval_runtime": 238.5437,
"eval_samples_per_second": 41.921,
"eval_steps_per_second": 10.48,
"step": 265000
},
{
"epoch": 0.5064010620117188,
"grad_norm": 373.5093078613281,
"learning_rate": 2.4680042266845705e-05,
"loss": 6.5601,
"step": 265500
},
{
"epoch": 0.507354736328125,
"grad_norm": 106.3024673461914,
"learning_rate": 2.463235855102539e-05,
"loss": 6.5036,
"step": 266000
},
{
"epoch": 0.5083084106445312,
"grad_norm": 143.9859619140625,
"learning_rate": 2.4584674835205078e-05,
"loss": 6.5817,
"step": 266500
},
{
"epoch": 0.5092620849609375,
"grad_norm": 62.84480285644531,
"learning_rate": 2.453699111938477e-05,
"loss": 6.5998,
"step": 267000
},
{
"epoch": 0.5102157592773438,
"grad_norm": 235.6577606201172,
"learning_rate": 2.4489307403564455e-05,
"loss": 6.6033,
"step": 267500
},
{
"epoch": 0.51116943359375,
"grad_norm": 140.92276000976562,
"learning_rate": 2.4441623687744142e-05,
"loss": 6.535,
"step": 268000
},
{
"epoch": 0.5121231079101562,
"grad_norm": 153.2882843017578,
"learning_rate": 2.439393997192383e-05,
"loss": 6.5595,
"step": 268500
},
{
"epoch": 0.5130767822265625,
"grad_norm": 66.25297546386719,
"learning_rate": 2.4346256256103516e-05,
"loss": 6.5609,
"step": 269000
},
{
"epoch": 0.5140304565429688,
"grad_norm": 174.41998291015625,
"learning_rate": 2.4298572540283206e-05,
"loss": 6.5772,
"step": 269500
},
{
"epoch": 0.514984130859375,
"grad_norm": 126.60942077636719,
"learning_rate": 2.4250888824462893e-05,
"loss": 6.599,
"step": 270000
},
{
"epoch": 0.514984130859375,
"eval_accuracy": 0.0387761252446184,
"eval_loss": 6.493450164794922,
"eval_runtime": 239.9879,
"eval_samples_per_second": 41.669,
"eval_steps_per_second": 10.417,
"step": 270000
},
{
"epoch": 0.5159378051757812,
"grad_norm": 226.81134033203125,
"learning_rate": 2.420320510864258e-05,
"loss": 6.5803,
"step": 270500
},
{
"epoch": 0.5168914794921875,
"grad_norm": 174.19500732421875,
"learning_rate": 2.4155521392822266e-05,
"loss": 6.5671,
"step": 271000
},
{
"epoch": 0.5178451538085938,
"grad_norm": 124.677001953125,
"learning_rate": 2.4107837677001953e-05,
"loss": 6.5904,
"step": 271500
},
{
"epoch": 0.518798828125,
"grad_norm": 76.14005279541016,
"learning_rate": 2.406015396118164e-05,
"loss": 6.5971,
"step": 272000
},
{
"epoch": 0.5197525024414062,
"grad_norm": 129.07936096191406,
"learning_rate": 2.401247024536133e-05,
"loss": 6.5572,
"step": 272500
},
{
"epoch": 0.5207061767578125,
"grad_norm": 77.52312469482422,
"learning_rate": 2.3964786529541017e-05,
"loss": 6.5508,
"step": 273000
},
{
"epoch": 0.5216598510742188,
"grad_norm": 96.4195556640625,
"learning_rate": 2.3917102813720704e-05,
"loss": 6.5542,
"step": 273500
},
{
"epoch": 0.522613525390625,
"grad_norm": 304.2214050292969,
"learning_rate": 2.386941909790039e-05,
"loss": 6.5653,
"step": 274000
},
{
"epoch": 0.5235671997070312,
"grad_norm": 114.71865844726562,
"learning_rate": 2.3821735382080078e-05,
"loss": 6.5624,
"step": 274500
},
{
"epoch": 0.5245208740234375,
"grad_norm": 87.2974853515625,
"learning_rate": 2.3774051666259768e-05,
"loss": 6.6015,
"step": 275000
},
{
"epoch": 0.5245208740234375,
"eval_accuracy": 0.03879902152641879,
"eval_loss": 6.489101409912109,
"eval_runtime": 237.0997,
"eval_samples_per_second": 42.176,
"eval_steps_per_second": 10.544,
"step": 275000
},
{
"epoch": 0.5254745483398438,
"grad_norm": 89.22550201416016,
"learning_rate": 2.3726367950439455e-05,
"loss": 6.5772,
"step": 275500
},
{
"epoch": 0.52642822265625,
"grad_norm": 239.70816040039062,
"learning_rate": 2.367868423461914e-05,
"loss": 6.5974,
"step": 276000
},
{
"epoch": 0.5273818969726562,
"grad_norm": 100.86592102050781,
"learning_rate": 2.3631000518798828e-05,
"loss": 6.5683,
"step": 276500
},
{
"epoch": 0.5283355712890625,
"grad_norm": 173.64987182617188,
"learning_rate": 2.3583316802978515e-05,
"loss": 6.5515,
"step": 277000
},
{
"epoch": 0.5292892456054688,
"grad_norm": 48.169734954833984,
"learning_rate": 2.3535633087158205e-05,
"loss": 6.5881,
"step": 277500
},
{
"epoch": 0.530242919921875,
"grad_norm": 121.87443542480469,
"learning_rate": 2.3487949371337892e-05,
"loss": 6.5795,
"step": 278000
},
{
"epoch": 0.5311965942382812,
"grad_norm": 149.2621612548828,
"learning_rate": 2.344026565551758e-05,
"loss": 6.5027,
"step": 278500
},
{
"epoch": 0.5321502685546875,
"grad_norm": 98.72933197021484,
"learning_rate": 2.3392581939697266e-05,
"loss": 6.5023,
"step": 279000
},
{
"epoch": 0.5331039428710938,
"grad_norm": 95.72864532470703,
"learning_rate": 2.3344898223876953e-05,
"loss": 6.4943,
"step": 279500
},
{
"epoch": 0.5340576171875,
"grad_norm": 1422.8848876953125,
"learning_rate": 2.3297214508056643e-05,
"loss": 6.5597,
"step": 280000
},
{
"epoch": 0.5340576171875,
"eval_accuracy": 0.03942974559686888,
"eval_loss": 6.477899074554443,
"eval_runtime": 243.6605,
"eval_samples_per_second": 41.041,
"eval_steps_per_second": 10.26,
"step": 280000
},
{
"epoch": 0.5350112915039062,
"grad_norm": 58.64554214477539,
"learning_rate": 2.324953079223633e-05,
"loss": 6.6398,
"step": 280500
},
{
"epoch": 0.5359649658203125,
"grad_norm": 179.9173126220703,
"learning_rate": 2.3201847076416016e-05,
"loss": 6.5941,
"step": 281000
},
{
"epoch": 0.5369186401367188,
"grad_norm": 243.6712188720703,
"learning_rate": 2.3154163360595703e-05,
"loss": 6.5901,
"step": 281500
},
{
"epoch": 0.537872314453125,
"grad_norm": 62.67595291137695,
"learning_rate": 2.310647964477539e-05,
"loss": 6.5734,
"step": 282000
},
{
"epoch": 0.5388259887695312,
"grad_norm": 249.361572265625,
"learning_rate": 2.305879592895508e-05,
"loss": 6.5552,
"step": 282500
},
{
"epoch": 0.5397796630859375,
"grad_norm": 85.4103012084961,
"learning_rate": 2.3011112213134767e-05,
"loss": 6.5943,
"step": 283000
},
{
"epoch": 0.5407333374023438,
"grad_norm": 194.80154418945312,
"learning_rate": 2.2963428497314454e-05,
"loss": 6.5562,
"step": 283500
},
{
"epoch": 0.54168701171875,
"grad_norm": 113.66175842285156,
"learning_rate": 2.291574478149414e-05,
"loss": 6.5685,
"step": 284000
},
{
"epoch": 0.5426406860351562,
"grad_norm": 173.5185546875,
"learning_rate": 2.2868061065673828e-05,
"loss": 6.565,
"step": 284500
},
{
"epoch": 0.5435943603515625,
"grad_norm": 932.0955810546875,
"learning_rate": 2.2820377349853518e-05,
| "loss": 6.5695, | |
| "step": 285000 | |
| }, | |
| { | |
| "epoch": 0.5435943603515625, | |
| "eval_accuracy": 0.039506066536203525, | |
| "eval_loss": 6.4823150634765625, | |
| "eval_runtime": 238.0878, | |
| "eval_samples_per_second": 42.001, | |
| "eval_steps_per_second": 10.5, | |
| "step": 285000 | |
| }, | |
| { | |
| "epoch": 0.5445480346679688, | |
| "grad_norm": 426.4958190917969, | |
| "learning_rate": 2.2772693634033205e-05, | |
| "loss": 6.5879, | |
| "step": 285500 | |
| }, | |
| { | |
| "epoch": 0.545501708984375, | |
| "grad_norm": 165.65635681152344, | |
| "learning_rate": 2.272500991821289e-05, | |
| "loss": 6.5646, | |
| "step": 286000 | |
| }, | |
| { | |
| "epoch": 0.5464553833007812, | |
| "grad_norm": 98.79949951171875, | |
| "learning_rate": 2.2677326202392578e-05, | |
| "loss": 6.5702, | |
| "step": 286500 | |
| }, | |
| { | |
| "epoch": 0.5474090576171875, | |
| "grad_norm": 86.36907196044922, | |
| "learning_rate": 2.2629642486572265e-05, | |
| "loss": 6.5367, | |
| "step": 287000 | |
| }, | |
| { | |
| "epoch": 0.5483627319335938, | |
| "grad_norm": 74.77356719970703, | |
| "learning_rate": 2.2581958770751955e-05, | |
| "loss": 6.5822, | |
| "step": 287500 | |
| }, | |
| { | |
| "epoch": 0.54931640625, | |
| "grad_norm": 353.52392578125, | |
| "learning_rate": 2.2534275054931642e-05, | |
| "loss": 6.5747, | |
| "step": 288000 | |
| }, | |
| { | |
| "epoch": 0.5502700805664062, | |
| "grad_norm": 166.5744171142578, | |
| "learning_rate": 2.248659133911133e-05, | |
| "loss": 6.5156, | |
| "step": 288500 | |
| }, | |
| { | |
| "epoch": 0.5512237548828125, | |
| "grad_norm": 63.69950485229492, | |
| "learning_rate": 2.2438907623291016e-05, | |
| "loss": 6.5647, | |
| "step": 289000 | |
| }, | |
| { | |
| "epoch": 0.5521774291992188, | |
| "grad_norm": 127.87213897705078, | |
| "learning_rate": 2.2391223907470703e-05, | |
| "loss": 6.5736, | |
| "step": 289500 | |
| }, | |
| { | |
| "epoch": 0.553131103515625, | |
| "grad_norm": 202.01283264160156, | |
| "learning_rate": 2.2343540191650393e-05, | |
| "loss": 6.5809, | |
| "step": 290000 | |
| }, | |
| { | |
| "epoch": 0.553131103515625, | |
| "eval_accuracy": 0.038223287671232876, | |
| "eval_loss": 6.4925103187561035, | |
| "eval_runtime": 237.5747, | |
| "eval_samples_per_second": 42.092, | |
| "eval_steps_per_second": 10.523, | |
| "step": 290000 | |
| }, | |
| { | |
| "epoch": 0.5540847778320312, | |
| "grad_norm": 481.0960998535156, | |
| "learning_rate": 2.229585647583008e-05, | |
| "loss": 6.559, | |
| "step": 290500 | |
| }, | |
| { | |
| "epoch": 0.5550384521484375, | |
| "grad_norm": 279.78582763671875, | |
| "learning_rate": 2.2248172760009766e-05, | |
| "loss": 6.5773, | |
| "step": 291000 | |
| }, | |
| { | |
| "epoch": 0.5559921264648438, | |
| "grad_norm": 1313.1634521484375, | |
| "learning_rate": 2.2200489044189453e-05, | |
| "loss": 6.5843, | |
| "step": 291500 | |
| }, | |
| { | |
| "epoch": 0.55694580078125, | |
| "grad_norm": 417.1656494140625, | |
| "learning_rate": 2.215280532836914e-05, | |
| "loss": 6.4884, | |
| "step": 292000 | |
| }, | |
| { | |
| "epoch": 0.5578994750976562, | |
| "grad_norm": 187.26255798339844, | |
| "learning_rate": 2.210512161254883e-05, | |
| "loss": 6.5431, | |
| "step": 292500 | |
| }, | |
| { | |
| "epoch": 0.5588531494140625, | |
| "grad_norm": 130.42347717285156, | |
| "learning_rate": 2.2057437896728517e-05, | |
| "loss": 6.5248, | |
| "step": 293000 | |
| }, | |
| { | |
| "epoch": 0.5598068237304688, | |
| "grad_norm": 266.1620178222656, | |
| "learning_rate": 2.2009754180908204e-05, | |
| "loss": 6.4695, | |
| "step": 293500 | |
| }, | |
| { | |
| "epoch": 0.560760498046875, | |
| "grad_norm": 62.113685607910156, | |
| "learning_rate": 2.196207046508789e-05, | |
| "loss": 6.6018, | |
| "step": 294000 | |
| }, | |
| { | |
| "epoch": 0.5617141723632812, | |
| "grad_norm": 159.71209716796875, | |
| "learning_rate": 2.1914386749267578e-05, | |
| "loss": 6.6181, | |
| "step": 294500 | |
| }, | |
| { | |
| "epoch": 0.5626678466796875, | |
| "grad_norm": 75.88334655761719, | |
| "learning_rate": 2.1866703033447268e-05, | |
| "loss": 6.6522, | |
| "step": 295000 | |
| }, | |
| { | |
| "epoch": 0.5626678466796875, | |
| "eval_accuracy": 0.03903522504892368, | |
| "eval_loss": 6.489808082580566, | |
| "eval_runtime": 237.5721, | |
| "eval_samples_per_second": 42.092, | |
| "eval_steps_per_second": 10.523, | |
| "step": 295000 | |
| }, | |
| { | |
| "epoch": 0.5636215209960938, | |
| "grad_norm": 397.1622314453125, | |
| "learning_rate": 2.1819019317626955e-05, | |
| "loss": 6.5981, | |
| "step": 295500 | |
| }, | |
| { | |
| "epoch": 0.5645751953125, | |
| "grad_norm": 100.62149047851562, | |
| "learning_rate": 2.177133560180664e-05, | |
| "loss": 6.5702, | |
| "step": 296000 | |
| }, | |
| { | |
| "epoch": 0.5655288696289062, | |
| "grad_norm": 139.2002410888672, | |
| "learning_rate": 2.1723651885986328e-05, | |
| "loss": 6.5627, | |
| "step": 296500 | |
| }, | |
| { | |
| "epoch": 0.5664825439453125, | |
| "grad_norm": 209.5233917236328, | |
| "learning_rate": 2.1675968170166015e-05, | |
| "loss": 6.5707, | |
| "step": 297000 | |
| }, | |
| { | |
| "epoch": 0.5674362182617188, | |
| "grad_norm": 252.32769775390625, | |
| "learning_rate": 2.1628284454345705e-05, | |
| "loss": 6.6021, | |
| "step": 297500 | |
| }, | |
| { | |
| "epoch": 0.568389892578125, | |
| "grad_norm": 120.24504852294922, | |
| "learning_rate": 2.1580600738525392e-05, | |
| "loss": 6.6022, | |
| "step": 298000 | |
| }, | |
| { | |
| "epoch": 0.5693435668945312, | |
| "grad_norm": 428.54132080078125, | |
| "learning_rate": 2.153291702270508e-05, | |
| "loss": 6.5826, | |
| "step": 298500 | |
| }, | |
| { | |
| "epoch": 0.5702972412109375, | |
| "grad_norm": 136.40740966796875, | |
| "learning_rate": 2.1485233306884766e-05, | |
| "loss": 6.5511, | |
| "step": 299000 | |
| }, | |
| { | |
| "epoch": 0.5712509155273438, | |
| "grad_norm": 98.76660919189453, | |
| "learning_rate": 2.1437549591064453e-05, | |
| "loss": 6.5606, | |
| "step": 299500 | |
| }, | |
| { | |
| "epoch": 0.57220458984375, | |
| "grad_norm": 121.40555572509766, | |
| "learning_rate": 2.1389865875244143e-05, | |
| "loss": 6.5688, | |
| "step": 300000 | |
| }, | |
| { | |
| "epoch": 0.57220458984375, | |
| "eval_accuracy": 0.03886947162426614, | |
| "eval_loss": 6.491418838500977, | |
| "eval_runtime": 239.8089, | |
| "eval_samples_per_second": 41.7, | |
| "eval_steps_per_second": 10.425, | |
| "step": 300000 | |
| }, | |
| { | |
| "epoch": 0.5731582641601562, | |
| "grad_norm": 65.11746215820312, | |
| "learning_rate": 2.134218215942383e-05, | |
| "loss": 6.593, | |
| "step": 300500 | |
| }, | |
| { | |
| "epoch": 0.5741119384765625, | |
| "grad_norm": 131.08642578125, | |
| "learning_rate": 2.1294498443603516e-05, | |
| "loss": 6.5569, | |
| "step": 301000 | |
| }, | |
| { | |
| "epoch": 0.5750656127929688, | |
| "grad_norm": 213.5546875, | |
| "learning_rate": 2.1246814727783203e-05, | |
| "loss": 6.5626, | |
| "step": 301500 | |
| }, | |
| { | |
| "epoch": 0.576019287109375, | |
| "grad_norm": 164.9312744140625, | |
| "learning_rate": 2.119913101196289e-05, | |
| "loss": 6.5405, | |
| "step": 302000 | |
| }, | |
| { | |
| "epoch": 0.5769729614257812, | |
| "grad_norm": 130.9062042236328, | |
| "learning_rate": 2.115144729614258e-05, | |
| "loss": 6.5618, | |
| "step": 302500 | |
| }, | |
| { | |
| "epoch": 0.5779266357421875, | |
| "grad_norm": 75.20059204101562, | |
| "learning_rate": 2.1103763580322267e-05, | |
| "loss": 6.573, | |
| "step": 303000 | |
| }, | |
| { | |
| "epoch": 0.5788803100585938, | |
| "grad_norm": 98.30027770996094, | |
| "learning_rate": 2.1056079864501954e-05, | |
| "loss": 6.5728, | |
| "step": 303500 | |
| }, | |
| { | |
| "epoch": 0.579833984375, | |
| "grad_norm": 57.365936279296875, | |
| "learning_rate": 2.100839614868164e-05, | |
| "loss": 6.5246, | |
| "step": 304000 | |
| }, | |
| { | |
| "epoch": 0.5807876586914062, | |
| "grad_norm": 118.35264587402344, | |
| "learning_rate": 2.0960712432861328e-05, | |
| "loss": 6.5676, | |
| "step": 304500 | |
| }, | |
| { | |
| "epoch": 0.5817413330078125, | |
| "grad_norm": 92.39289093017578, | |
| "learning_rate": 2.0913028717041018e-05, | |
| "loss": 6.5445, | |
| "step": 305000 | |
| }, | |
| { | |
| "epoch": 0.5817413330078125, | |
| "eval_accuracy": 0.038926614481409, | |
| "eval_loss": 6.482814788818359, | |
| "eval_runtime": 237.5203, | |
| "eval_samples_per_second": 42.102, | |
| "eval_steps_per_second": 10.525, | |
| "step": 305000 | |
| }, | |
| { | |
| "epoch": 0.5826950073242188, | |
| "grad_norm": 139.6799774169922, | |
| "learning_rate": 2.0865345001220705e-05, | |
| "loss": 6.5893, | |
| "step": 305500 | |
| }, | |
| { | |
| "epoch": 0.583648681640625, | |
| "grad_norm": 220.11317443847656, | |
| "learning_rate": 2.081766128540039e-05, | |
| "loss": 6.5063, | |
| "step": 306000 | |
| }, | |
| { | |
| "epoch": 0.5846023559570312, | |
| "grad_norm": 123.89678955078125, | |
| "learning_rate": 2.0769977569580078e-05, | |
| "loss": 6.541, | |
| "step": 306500 | |
| }, | |
| { | |
| "epoch": 0.5855560302734375, | |
| "grad_norm": 150.82089233398438, | |
| "learning_rate": 2.0722293853759765e-05, | |
| "loss": 6.5261, | |
| "step": 307000 | |
| }, | |
| { | |
| "epoch": 0.5865097045898438, | |
| "grad_norm": 110.43388366699219, | |
| "learning_rate": 2.0674610137939455e-05, | |
| "loss": 6.533, | |
| "step": 307500 | |
| }, | |
| { | |
| "epoch": 0.58746337890625, | |
| "grad_norm": 158.13780212402344, | |
| "learning_rate": 2.0626926422119142e-05, | |
| "loss": 6.6145, | |
| "step": 308000 | |
| }, | |
| { | |
| "epoch": 0.5884170532226562, | |
| "grad_norm": 944.8779907226562, | |
| "learning_rate": 2.057924270629883e-05, | |
| "loss": 6.6023, | |
| "step": 308500 | |
| }, | |
| { | |
| "epoch": 0.5893707275390625, | |
| "grad_norm": 105.20059204101562, | |
| "learning_rate": 2.0531558990478516e-05, | |
| "loss": 6.5803, | |
| "step": 309000 | |
| }, | |
| { | |
| "epoch": 0.5903244018554688, | |
| "grad_norm": 88.02117919921875, | |
| "learning_rate": 2.0483875274658203e-05, | |
| "loss": 6.5734, | |
| "step": 309500 | |
| }, | |
| { | |
| "epoch": 0.591278076171875, | |
| "grad_norm": 220.67440795898438, | |
| "learning_rate": 2.0436191558837893e-05, | |
| "loss": 6.5674, | |
| "step": 310000 | |
| }, | |
| { | |
| "epoch": 0.591278076171875, | |
| "eval_accuracy": 0.03877632093933464, | |
| "eval_loss": 6.493968963623047, | |
| "eval_runtime": 240.9683, | |
| "eval_samples_per_second": 41.499, | |
| "eval_steps_per_second": 10.375, | |
| "step": 310000 | |
| }, | |
| { | |
| "epoch": 0.5922317504882812, | |
| "grad_norm": 151.35154724121094, | |
| "learning_rate": 2.038850784301758e-05, | |
| "loss": 6.544, | |
| "step": 310500 | |
| }, | |
| { | |
| "epoch": 0.5931854248046875, | |
| "grad_norm": 280.2740173339844, | |
| "learning_rate": 2.0340824127197266e-05, | |
| "loss": 6.566, | |
| "step": 311000 | |
| }, | |
| { | |
| "epoch": 0.5941390991210938, | |
| "grad_norm": 130.1758270263672, | |
| "learning_rate": 2.0293140411376953e-05, | |
| "loss": 6.5556, | |
| "step": 311500 | |
| }, | |
| { | |
| "epoch": 0.5950927734375, | |
| "grad_norm": 1289.0264892578125, | |
| "learning_rate": 2.024545669555664e-05, | |
| "loss": 6.6155, | |
| "step": 312000 | |
| }, | |
| { | |
| "epoch": 0.5960464477539062, | |
| "grad_norm": 324.85662841796875, | |
| "learning_rate": 2.019777297973633e-05, | |
| "loss": 6.5985, | |
| "step": 312500 | |
| }, | |
| { | |
| "epoch": 0.5970001220703125, | |
| "grad_norm": 251.70790100097656, | |
| "learning_rate": 2.0150089263916017e-05, | |
| "loss": 6.5858, | |
| "step": 313000 | |
| }, | |
| { | |
| "epoch": 0.5979537963867188, | |
| "grad_norm": 154.8018341064453, | |
| "learning_rate": 2.0102405548095704e-05, | |
| "loss": 6.5304, | |
| "step": 313500 | |
| }, | |
| { | |
| "epoch": 0.598907470703125, | |
| "grad_norm": 891.773681640625, | |
| "learning_rate": 2.005472183227539e-05, | |
| "loss": 6.5531, | |
| "step": 314000 | |
| }, | |
| { | |
| "epoch": 0.5998611450195312, | |
| "grad_norm": 116.1676025390625, | |
| "learning_rate": 2.0007038116455078e-05, | |
| "loss": 6.5902, | |
| "step": 314500 | |
| }, | |
| { | |
| "epoch": 0.6008148193359375, | |
| "grad_norm": 64.54122924804688, | |
| "learning_rate": 1.9959354400634768e-05, | |
| "loss": 6.5926, | |
| "step": 315000 | |
| }, | |
| { | |
| "epoch": 0.6008148193359375, | |
| "eval_accuracy": 0.03876692759295499, | |
| "eval_loss": 6.478638172149658, | |
| "eval_runtime": 239.6249, | |
| "eval_samples_per_second": 41.732, | |
| "eval_steps_per_second": 10.433, | |
| "step": 315000 | |
| }, | |
| { | |
| "epoch": 0.6017684936523438, | |
| "grad_norm": 150.49513244628906, | |
| "learning_rate": 1.9911670684814455e-05, | |
| "loss": 6.5588, | |
| "step": 315500 | |
| }, | |
| { | |
| "epoch": 0.60272216796875, | |
| "grad_norm": 206.52194213867188, | |
| "learning_rate": 1.986398696899414e-05, | |
| "loss": 6.5864, | |
| "step": 316000 | |
| }, | |
| { | |
| "epoch": 0.6036758422851562, | |
| "grad_norm": 87.1997299194336, | |
| "learning_rate": 1.9816303253173828e-05, | |
| "loss": 6.5414, | |
| "step": 316500 | |
| }, | |
| { | |
| "epoch": 0.6046295166015625, | |
| "grad_norm": 737.2801513671875, | |
| "learning_rate": 1.9768619537353515e-05, | |
| "loss": 6.5796, | |
| "step": 317000 | |
| }, | |
| { | |
| "epoch": 0.6055831909179688, | |
| "grad_norm": 67.21781158447266, | |
| "learning_rate": 1.9720935821533205e-05, | |
| "loss": 6.5526, | |
| "step": 317500 | |
| }, | |
| { | |
| "epoch": 0.606536865234375, | |
| "grad_norm": 148.92437744140625, | |
| "learning_rate": 1.9673252105712892e-05, | |
| "loss": 6.5496, | |
| "step": 318000 | |
| }, | |
| { | |
| "epoch": 0.6074905395507812, | |
| "grad_norm": 159.55218505859375, | |
| "learning_rate": 1.962556838989258e-05, | |
| "loss": 6.5567, | |
| "step": 318500 | |
| }, | |
| { | |
| "epoch": 0.6084442138671875, | |
| "grad_norm": 298.22796630859375, | |
| "learning_rate": 1.9577884674072266e-05, | |
| "loss": 6.5438, | |
| "step": 319000 | |
| }, | |
| { | |
| "epoch": 0.6093978881835938, | |
| "grad_norm": 119.46234893798828, | |
| "learning_rate": 1.9530200958251953e-05, | |
| "loss": 6.5746, | |
| "step": 319500 | |
| }, | |
| { | |
| "epoch": 0.6103515625, | |
| "grad_norm": 497.9232482910156, | |
| "learning_rate": 1.9482517242431643e-05, | |
| "loss": 6.4979, | |
| "step": 320000 | |
| }, | |
| { | |
| "epoch": 0.6103515625, | |
| "eval_accuracy": 0.03913796477495108, | |
| "eval_loss": 6.475390434265137, | |
| "eval_runtime": 245.799, | |
| "eval_samples_per_second": 40.684, | |
| "eval_steps_per_second": 10.171, | |
| "step": 320000 | |
| }, | |
| { | |
| "epoch": 0.6113052368164062, | |
| "grad_norm": 95.33645629882812, | |
| "learning_rate": 1.943483352661133e-05, | |
| "loss": 6.5177, | |
| "step": 320500 | |
| }, | |
| { | |
| "epoch": 0.6122589111328125, | |
| "grad_norm": 177.78366088867188, | |
| "learning_rate": 1.9387149810791016e-05, | |
| "loss": 6.5235, | |
| "step": 321000 | |
| }, | |
| { | |
| "epoch": 0.6132125854492188, | |
| "grad_norm": 450.78155517578125, | |
| "learning_rate": 1.9339466094970703e-05, | |
| "loss": 6.6073, | |
| "step": 321500 | |
| }, | |
| { | |
| "epoch": 0.614166259765625, | |
| "grad_norm": 176.4562225341797, | |
| "learning_rate": 1.929178237915039e-05, | |
| "loss": 6.6167, | |
| "step": 322000 | |
| }, | |
| { | |
| "epoch": 0.6151199340820312, | |
| "grad_norm": 158.9343719482422, | |
| "learning_rate": 1.924409866333008e-05, | |
| "loss": 6.6144, | |
| "step": 322500 | |
| }, | |
| { | |
| "epoch": 0.6160736083984375, | |
| "grad_norm": 130.91563415527344, | |
| "learning_rate": 1.9196414947509767e-05, | |
| "loss": 6.6087, | |
| "step": 323000 | |
| }, | |
| { | |
| "epoch": 0.6170272827148438, | |
| "grad_norm": 158.33616638183594, | |
| "learning_rate": 1.9148731231689454e-05, | |
| "loss": 6.5553, | |
| "step": 323500 | |
| }, | |
| { | |
| "epoch": 0.61798095703125, | |
| "grad_norm": 104.18647003173828, | |
| "learning_rate": 1.910104751586914e-05, | |
| "loss": 6.5952, | |
| "step": 324000 | |
| }, | |
| { | |
| "epoch": 0.6189346313476562, | |
| "grad_norm": 151.77406311035156, | |
| "learning_rate": 1.9053363800048828e-05, | |
| "loss": 6.5814, | |
| "step": 324500 | |
| }, | |
| { | |
| "epoch": 0.6198883056640625, | |
| "grad_norm": 356.1595153808594, | |
| "learning_rate": 1.9005680084228518e-05, | |
| "loss": 6.5669, | |
| "step": 325000 | |
| }, | |
| { | |
| "epoch": 0.6198883056640625, | |
| "eval_accuracy": 0.039391780821917806, | |
| "eval_loss": 6.472179889678955, | |
| "eval_runtime": 244.9459, | |
| "eval_samples_per_second": 40.825, | |
| "eval_steps_per_second": 10.206, | |
| "step": 325000 | |
| }, | |
| { | |
| "epoch": 0.6208419799804688, | |
| "grad_norm": 314.9290466308594, | |
| "learning_rate": 1.8957996368408205e-05, | |
| "loss": 6.5539, | |
| "step": 325500 | |
| }, | |
| { | |
| "epoch": 0.621795654296875, | |
| "grad_norm": 184.68072509765625, | |
| "learning_rate": 1.891031265258789e-05, | |
| "loss": 6.5505, | |
| "step": 326000 | |
| }, | |
| { | |
| "epoch": 0.6227493286132812, | |
| "grad_norm": 72.39010620117188, | |
| "learning_rate": 1.8862628936767578e-05, | |
| "loss": 6.6057, | |
| "step": 326500 | |
| }, | |
| { | |
| "epoch": 0.6237030029296875, | |
| "grad_norm": 98.22920989990234, | |
| "learning_rate": 1.8814945220947265e-05, | |
| "loss": 6.536, | |
| "step": 327000 | |
| }, | |
| { | |
| "epoch": 0.6246566772460938, | |
| "grad_norm": 92.31875610351562, | |
| "learning_rate": 1.8767261505126955e-05, | |
| "loss": 6.5872, | |
| "step": 327500 | |
| }, | |
| { | |
| "epoch": 0.6256103515625, | |
| "grad_norm": 105.9183578491211, | |
| "learning_rate": 1.8719577789306642e-05, | |
| "loss": 6.5717, | |
| "step": 328000 | |
| }, | |
| { | |
| "epoch": 0.6265640258789062, | |
| "grad_norm": 109.58235931396484, | |
| "learning_rate": 1.867189407348633e-05, | |
| "loss": 6.5964, | |
| "step": 328500 | |
| }, | |
| { | |
| "epoch": 0.6275177001953125, | |
| "grad_norm": 99.37423706054688, | |
| "learning_rate": 1.8624210357666016e-05, | |
| "loss": 6.5457, | |
| "step": 329000 | |
| }, | |
| { | |
| "epoch": 0.6284713745117188, | |
| "grad_norm": 116.90908813476562, | |
| "learning_rate": 1.8576526641845703e-05, | |
| "loss": 6.5854, | |
| "step": 329500 | |
| }, | |
| { | |
| "epoch": 0.629425048828125, | |
| "grad_norm": 106.40868377685547, | |
| "learning_rate": 1.8528842926025393e-05, | |
| "loss": 6.5335, | |
| "step": 330000 | |
| }, | |
| { | |
| "epoch": 0.629425048828125, | |
| "eval_accuracy": 0.03977103718199609, | |
| "eval_loss": 6.473107814788818, | |
| "eval_runtime": 239.2374, | |
| "eval_samples_per_second": 41.799, | |
| "eval_steps_per_second": 10.45, | |
| "step": 330000 | |
| }, | |
| { | |
| "epoch": 0.6303787231445312, | |
| "grad_norm": 146.23626708984375, | |
| "learning_rate": 1.848115921020508e-05, | |
| "loss": 6.626, | |
| "step": 330500 | |
| }, | |
| { | |
| "epoch": 0.6313323974609375, | |
| "grad_norm": 85.45753479003906, | |
| "learning_rate": 1.8433475494384766e-05, | |
| "loss": 6.5638, | |
| "step": 331000 | |
| }, | |
| { | |
| "epoch": 0.6322860717773438, | |
| "grad_norm": 141.6040802001953, | |
| "learning_rate": 1.8385791778564453e-05, | |
| "loss": 6.5574, | |
| "step": 331500 | |
| }, | |
| { | |
| "epoch": 0.63323974609375, | |
| "grad_norm": 85.09579467773438, | |
| "learning_rate": 1.833810806274414e-05, | |
| "loss": 6.5948, | |
| "step": 332000 | |
| }, | |
| { | |
| "epoch": 0.6341934204101562, | |
| "grad_norm": 55.52738952636719, | |
| "learning_rate": 1.829042434692383e-05, | |
| "loss": 6.5454, | |
| "step": 332500 | |
| }, | |
| { | |
| "epoch": 0.6351470947265625, | |
| "grad_norm": 248.0428924560547, | |
| "learning_rate": 1.8242740631103517e-05, | |
| "loss": 6.5177, | |
| "step": 333000 | |
| }, | |
| { | |
| "epoch": 0.6361007690429688, | |
| "grad_norm": 205.9241943359375, | |
| "learning_rate": 1.8195056915283204e-05, | |
| "loss": 6.5136, | |
| "step": 333500 | |
| }, | |
| { | |
| "epoch": 0.637054443359375, | |
| "grad_norm": 81.16824340820312, | |
| "learning_rate": 1.814737319946289e-05, | |
| "loss": 6.5282, | |
| "step": 334000 | |
| }, | |
| { | |
| "epoch": 0.6380081176757812, | |
| "grad_norm": 130.18638610839844, | |
| "learning_rate": 1.8099689483642578e-05, | |
| "loss": 6.537, | |
| "step": 334500 | |
| }, | |
| { | |
| "epoch": 0.6389617919921875, | |
| "grad_norm": 223.24957275390625, | |
| "learning_rate": 1.8052005767822268e-05, | |
| "loss": 6.5727, | |
| "step": 335000 | |
| }, | |
| { | |
| "epoch": 0.6389617919921875, | |
| "eval_accuracy": 0.039358708414872795, | |
| "eval_loss": 6.476977825164795, | |
| "eval_runtime": 241.6364, | |
| "eval_samples_per_second": 41.384, | |
| "eval_steps_per_second": 10.346, | |
| "step": 335000 | |
| }, | |
| { | |
| "epoch": 0.6399154663085938, | |
| "grad_norm": 257.26629638671875, | |
| "learning_rate": 1.8004322052001955e-05, | |
| "loss": 6.6013, | |
| "step": 335500 | |
| }, | |
| { | |
| "epoch": 0.640869140625, | |
| "grad_norm": 526.728271484375, | |
| "learning_rate": 1.795663833618164e-05, | |
| "loss": 6.5763, | |
| "step": 336000 | |
| }, | |
| { | |
| "epoch": 0.6418228149414062, | |
| "grad_norm": 127.42972564697266, | |
| "learning_rate": 1.7908954620361328e-05, | |
| "loss": 6.5615, | |
| "step": 336500 | |
| }, | |
| { | |
| "epoch": 0.6427764892578125, | |
| "grad_norm": 62.457923889160156, | |
| "learning_rate": 1.7861270904541015e-05, | |
| "loss": 6.5083, | |
| "step": 337000 | |
| }, | |
| { | |
| "epoch": 0.6437301635742188, | |
| "grad_norm": 148.3224334716797, | |
| "learning_rate": 1.7813587188720705e-05, | |
| "loss": 6.5489, | |
| "step": 337500 | |
| }, | |
| { | |
| "epoch": 0.644683837890625, | |
| "grad_norm": 293.37652587890625, | |
| "learning_rate": 1.7765903472900392e-05, | |
| "loss": 6.5661, | |
| "step": 338000 | |
| }, | |
| { | |
| "epoch": 0.6456375122070312, | |
| "grad_norm": 233.91455078125, | |
| "learning_rate": 1.771821975708008e-05, | |
| "loss": 6.5598, | |
| "step": 338500 | |
| }, | |
| { | |
| "epoch": 0.6465911865234375, | |
| "grad_norm": 153.2530059814453, | |
| "learning_rate": 1.7670536041259766e-05, | |
| "loss": 6.5883, | |
| "step": 339000 | |
| }, | |
| { | |
| "epoch": 0.6475448608398438, | |
| "grad_norm": 193.26214599609375, | |
| "learning_rate": 1.7622852325439453e-05, | |
| "loss": 6.5564, | |
| "step": 339500 | |
| }, | |
| { | |
| "epoch": 0.64849853515625, | |
| "grad_norm": 347.178955078125, | |
| "learning_rate": 1.7575168609619143e-05, | |
| "loss": 6.5735, | |
| "step": 340000 | |
| }, | |
| { | |
| "epoch": 0.64849853515625, | |
| "eval_accuracy": 0.03903444227005871, | |
| "eval_loss": 6.481354713439941, | |
| "eval_runtime": 245.5901, | |
| "eval_samples_per_second": 40.718, | |
| "eval_steps_per_second": 10.18, | |
| "step": 340000 | |
| }, | |
| { | |
| "epoch": 0.6494522094726562, | |
| "grad_norm": 110.26327514648438, | |
| "learning_rate": 1.752748489379883e-05, | |
| "loss": 6.5545, | |
| "step": 340500 | |
| }, | |
| { | |
| "epoch": 0.6504058837890625, | |
| "grad_norm": 295.2226867675781, | |
| "learning_rate": 1.7479801177978516e-05, | |
| "loss": 6.5221, | |
| "step": 341000 | |
| }, | |
| { | |
| "epoch": 0.6513595581054688, | |
| "grad_norm": 3240.28369140625, | |
| "learning_rate": 1.7432117462158203e-05, | |
| "loss": 6.5653, | |
| "step": 341500 | |
| }, | |
| { | |
| "epoch": 0.652313232421875, | |
| "grad_norm": 415.33953857421875, | |
| "learning_rate": 1.738443374633789e-05, | |
| "loss": 6.5583, | |
| "step": 342000 | |
| }, | |
| { | |
| "epoch": 0.6532669067382812, | |
| "grad_norm": 546.7062377929688, | |
| "learning_rate": 1.733675003051758e-05, | |
| "loss": 6.5814, | |
| "step": 342500 | |
| }, | |
| { | |
| "epoch": 0.6542205810546875, | |
| "grad_norm": 139.72181701660156, | |
| "learning_rate": 1.7289066314697267e-05, | |
| "loss": 6.5635, | |
| "step": 343000 | |
| }, | |
| { | |
| "epoch": 0.6551742553710938, | |
| "grad_norm": 192.7628936767578, | |
| "learning_rate": 1.7241382598876954e-05, | |
| "loss": 6.5176, | |
| "step": 343500 | |
| }, | |
| { | |
| "epoch": 0.6561279296875, | |
| "grad_norm": 296.3890686035156, | |
| "learning_rate": 1.719369888305664e-05, | |
| "loss": 6.5611, | |
| "step": 344000 | |
| }, | |
| { | |
| "epoch": 0.6570816040039062, | |
| "grad_norm": 371.4485778808594, | |
| "learning_rate": 1.7146015167236328e-05, | |
| "loss": 6.5325, | |
| "step": 344500 | |
| }, | |
| { | |
| "epoch": 0.6580352783203125, | |
| "grad_norm": 171.9521942138672, | |
| "learning_rate": 1.7098331451416018e-05, | |
| "loss": 6.5672, | |
| "step": 345000 | |
| }, | |
| { | |
| "epoch": 0.6580352783203125, | |
| "eval_accuracy": 0.03944207436399217, | |
| "eval_loss": 6.461095809936523, | |
| "eval_runtime": 240.9495, | |
| "eval_samples_per_second": 41.502, | |
| "eval_steps_per_second": 10.376, | |
| "step": 345000 | |
| }, | |
| { | |
| "epoch": 0.6589889526367188, | |
| "grad_norm": 135.15243530273438, | |
| "learning_rate": 1.7050647735595705e-05, | |
| "loss": 6.548, | |
| "step": 345500 | |
| }, | |
| { | |
| "epoch": 0.659942626953125, | |
| "grad_norm": 263.35400390625, | |
| "learning_rate": 1.700296401977539e-05, | |
| "loss": 6.5543, | |
| "step": 346000 | |
| }, | |
| { | |
| "epoch": 0.6608963012695312, | |
| "grad_norm": 144.79660034179688, | |
| "learning_rate": 1.6955280303955078e-05, | |
| "loss": 6.5284, | |
| "step": 346500 | |
| }, | |
| { | |
| "epoch": 0.6618499755859375, | |
| "grad_norm": 112.84705352783203, | |
| "learning_rate": 1.6907596588134765e-05, | |
| "loss": 6.4988, | |
| "step": 347000 | |
| }, | |
| { | |
| "epoch": 0.6628036499023438, | |
| "grad_norm": 122.29241943359375, | |
| "learning_rate": 1.6859912872314455e-05, | |
| "loss": 6.5073, | |
| "step": 347500 | |
| }, | |
| { | |
| "epoch": 0.66375732421875, | |
| "grad_norm": 65.16890716552734, | |
| "learning_rate": 1.6812229156494142e-05, | |
| "loss": 6.515, | |
| "step": 348000 | |
| }, | |
| { | |
| "epoch": 0.6647109985351562, | |
| "grad_norm": 198.87631225585938, | |
| "learning_rate": 1.676454544067383e-05, | |
| "loss": 6.6216, | |
| "step": 348500 | |
| }, | |
| { | |
| "epoch": 0.6656646728515625, | |
| "grad_norm": 141.19139099121094, | |
| "learning_rate": 1.6716861724853516e-05, | |
| "loss": 6.5619, | |
| "step": 349000 | |
| }, | |
| { | |
| "epoch": 0.6666183471679688, | |
| "grad_norm": 52.12779235839844, | |
| "learning_rate": 1.6669178009033203e-05, | |
| "loss": 6.5689, | |
| "step": 349500 | |
| }, | |
| { | |
| "epoch": 0.667572021484375, | |
| "grad_norm": 246.66909790039062, | |
| "learning_rate": 1.6621494293212893e-05, | |
| "loss": 6.5223, | |
| "step": 350000 | |
| }, | |
| { | |
| "epoch": 0.667572021484375, | |
| "eval_accuracy": 0.03970039138943249, | |
| "eval_loss": 6.462384223937988, | |
| "eval_runtime": 239.8458, | |
| "eval_samples_per_second": 41.693, | |
| "eval_steps_per_second": 10.423, | |
| "step": 350000 | |
| }, | |
| { | |
| "epoch": 0.6685256958007812, | |
| "grad_norm": 319.72589111328125, | |
| "learning_rate": 1.657381057739258e-05, | |
| "loss": 6.5863, | |
| "step": 350500 | |
| }, | |
| { | |
| "epoch": 0.6694793701171875, | |
| "grad_norm": 196.7140350341797, | |
| "learning_rate": 1.6526126861572266e-05, | |
| "loss": 6.5362, | |
| "step": 351000 | |
| }, | |
| { | |
| "epoch": 0.6704330444335938, | |
| "grad_norm": 502.68743896484375, | |
| "learning_rate": 1.6478443145751953e-05, | |
| "loss": 6.5761, | |
| "step": 351500 | |
| }, | |
| { | |
| "epoch": 0.67138671875, | |
| "grad_norm": 92.79645538330078, | |
| "learning_rate": 1.643075942993164e-05, | |
| "loss": 6.5914, | |
| "step": 352000 | |
| }, | |
| { | |
| "epoch": 0.6723403930664062, | |
| "grad_norm": 236.26942443847656, | |
| "learning_rate": 1.638307571411133e-05, | |
| "loss": 6.5679, | |
| "step": 352500 | |
| }, | |
| { | |
| "epoch": 0.6732940673828125, | |
| "grad_norm": 123.47889709472656, | |
| "learning_rate": 1.6335391998291017e-05, | |
| "loss": 6.5484, | |
| "step": 353000 | |
| }, | |
| { | |
| "epoch": 0.6742477416992188, | |
| "grad_norm": 95.9494857788086, | |
| "learning_rate": 1.6287708282470704e-05, | |
| "loss": 6.5179, | |
| "step": 353500 | |
| }, | |
| { | |
| "epoch": 0.675201416015625, | |
| "grad_norm": 116.54471588134766, | |
| "learning_rate": 1.624002456665039e-05, | |
| "loss": 6.4845, | |
| "step": 354000 | |
| }, | |
| { | |
| "epoch": 0.6761550903320312, | |
| "grad_norm": 137.38478088378906, | |
| "learning_rate": 1.6192340850830078e-05, | |
| "loss": 6.5456, | |
| "step": 354500 | |
| }, | |
| { | |
| "epoch": 0.6771087646484375, | |
| "grad_norm": 102.44841766357422, | |
| "learning_rate": 1.6144657135009768e-05, | |
| "loss": 6.518, | |
| "step": 355000 | |
| }, | |
| { | |
| "epoch": 0.6771087646484375, | |
| "eval_accuracy": 0.040165949119373774, | |
| "eval_loss": 6.473403453826904, | |
| "eval_runtime": 242.3812, | |
| "eval_samples_per_second": 41.257, | |
| "eval_steps_per_second": 10.314, | |
| "step": 355000 | |
| }, | |
| { | |
| "epoch": 0.6780624389648438, | |
| "grad_norm": 143.36158752441406, | |
| "learning_rate": 1.6096973419189455e-05, | |
| "loss": 6.5395, | |
| "step": 355500 | |
| }, | |
| { | |
| "epoch": 0.67901611328125, | |
| "grad_norm": 88.15251922607422, | |
| "learning_rate": 1.604928970336914e-05, | |
| "loss": 6.5379, | |
| "step": 356000 | |
| }, | |
| { | |
| "epoch": 0.6799697875976562, | |
| "grad_norm": 77.68293762207031, | |
| "learning_rate": 1.6001605987548828e-05, | |
| "loss": 6.5462, | |
| "step": 356500 | |
| }, | |
| { | |
| "epoch": 0.6809234619140625, | |
| "grad_norm": 269.4987487792969, | |
| "learning_rate": 1.5953922271728515e-05, | |
| "loss": 6.5495, | |
| "step": 357000 | |
| }, | |
| { | |
| "epoch": 0.6818771362304688, | |
| "grad_norm": 158.91729736328125, | |
| "learning_rate": 1.5906238555908205e-05, | |
| "loss": 6.5422, | |
| "step": 357500 | |
| }, | |
| { | |
| "epoch": 0.682830810546875, | |
| "grad_norm": 104.279052734375, | |
| "learning_rate": 1.5858554840087892e-05, | |
| "loss": 6.53, | |
| "step": 358000 | |
| }, | |
| { | |
| "epoch": 0.6837844848632812, | |
| "grad_norm": 232.02540588378906, | |
| "learning_rate": 1.581087112426758e-05, | |
| "loss": 6.5474, | |
| "step": 358500 | |
| }, | |
| { | |
| "epoch": 0.6847381591796875, | |
| "grad_norm": 145.39788818359375, | |
| "learning_rate": 1.5763187408447266e-05, | |
| "loss": 6.5581, | |
| "step": 359000 | |
| }, | |
| { | |
| "epoch": 0.6856918334960938, | |
| "grad_norm": 208.22569274902344, | |
| "learning_rate": 1.5715503692626953e-05, | |
| "loss": 6.5472, | |
| "step": 359500 | |
| }, | |
| { | |
| "epoch": 0.6866455078125, | |
| "grad_norm": 187.49270629882812, | |
| "learning_rate": 1.5667819976806643e-05, | |
| "loss": 6.5466, | |
| "step": 360000 | |
| }, | |
| { | |
| "epoch": 0.6866455078125, | |
| "eval_accuracy": 0.03987984344422701, | |
| "eval_loss": 6.463972568511963, | |
| "eval_runtime": 242.9807, | |
| "eval_samples_per_second": 41.156, | |
| "eval_steps_per_second": 10.289, | |
| "step": 360000 | |
| }, | |
| { | |
| "epoch": 0.6875991821289062, | |
| "grad_norm": 115.62727355957031, | |
| "learning_rate": 1.562013626098633e-05, | |
| "loss": 6.5215, | |
| "step": 360500 | |
| }, | |
| { | |
| "epoch": 0.6885528564453125, | |
| "grad_norm": 179.17079162597656, | |
| "learning_rate": 1.5572452545166016e-05, | |
| "loss": 6.5396, | |
| "step": 361000 | |
| }, | |
| { | |
| "epoch": 0.6895065307617188, | |
| "grad_norm": 518.125, | |
| "learning_rate": 1.5524768829345703e-05, | |
| "loss": 6.5477, | |
| "step": 361500 | |
| }, | |
| { | |
| "epoch": 0.690460205078125, | |
| "grad_norm": 259.0045166015625, | |
| "learning_rate": 1.547708511352539e-05, | |
| "loss": 6.5385, | |
| "step": 362000 | |
| }, | |
| { | |
| "epoch": 0.6914138793945312, | |
| "grad_norm": 146.81991577148438, | |
| "learning_rate": 1.542940139770508e-05, | |
| "loss": 6.531, | |
| "step": 362500 | |
| }, | |
| { | |
| "epoch": 0.6923675537109375, | |
| "grad_norm": 109.12883758544922, | |
| "learning_rate": 1.5381717681884767e-05, | |
| "loss": 6.4949, | |
| "step": 363000 | |
| }, | |
| { | |
| "epoch": 0.6933212280273438, | |
| "grad_norm": 148.35174560546875, | |
| "learning_rate": 1.5334033966064454e-05, | |
| "loss": 6.4647, | |
| "step": 363500 | |
| }, | |
| { | |
| "epoch": 0.69427490234375, | |
| "grad_norm": 137.14820861816406, | |
| "learning_rate": 1.528635025024414e-05, | |
| "loss": 6.4803, | |
| "step": 364000 | |
| }, | |
| { | |
| "epoch": 0.6952285766601562, | |
| "grad_norm": 194.35723876953125, | |
| "learning_rate": 1.523866653442383e-05, | |
| "loss": 6.4629, | |
| "step": 364500 | |
| }, | |
| { | |
| "epoch": 0.6961822509765625, | |
| "grad_norm": 88.89530944824219, | |
| "learning_rate": 1.5190982818603516e-05, | |
| "loss": 6.5213, | |
| "step": 365000 | |
| }, | |
| { | |
| "epoch": 0.6961822509765625, | |
| "eval_accuracy": 0.04012818003913894, | |
| "eval_loss": 6.462370872497559, | |
| "eval_runtime": 243.706, | |
| "eval_samples_per_second": 41.033, | |
| "eval_steps_per_second": 10.258, | |
| "step": 365000 | |
| }, | |
| { | |
| "epoch": 0.6971359252929688, | |
| "grad_norm": Infinity, | |
| "learning_rate": 1.5143299102783205e-05, | |
| "loss": 6.6055, | |
| "step": 365500 | |
| }, | |
| { | |
| "epoch": 0.698089599609375, | |
| "grad_norm": 180.16140747070312, | |
| "learning_rate": 1.5095615386962891e-05, | |
| "loss": 6.5766, | |
| "step": 366000 | |
| }, | |
| { | |
| "epoch": 0.6990432739257812, | |
| "grad_norm": 196.64334106445312, | |
| "learning_rate": 1.5047931671142578e-05, | |
| "loss": 6.6007, | |
| "step": 366500 | |
| }, | |
| { | |
| "epoch": 0.6999969482421875, | |
| "grad_norm": 246.12445068359375, | |
| "learning_rate": 1.5000247955322267e-05, | |
| "loss": 6.6011, | |
| "step": 367000 | |
| }, | |
| { | |
| "epoch": 0.7009506225585938, | |
| "grad_norm": 99.25709533691406, | |
| "learning_rate": 1.4952564239501954e-05, | |
| "loss": 6.528, | |
| "step": 367500 | |
| }, | |
| { | |
| "epoch": 0.701904296875, | |
| "grad_norm": 255.1567840576172, | |
| "learning_rate": 1.4904880523681642e-05, | |
| "loss": 6.534, | |
| "step": 368000 | |
| }, | |
| { | |
| "epoch": 0.7028579711914062, | |
| "grad_norm": 472.0591125488281, | |
| "learning_rate": 1.4857196807861329e-05, | |
| "loss": 6.5193, | |
| "step": 368500 | |
| }, | |
| { | |
| "epoch": 0.7038116455078125, | |
| "grad_norm": 386.86834716796875, | |
| "learning_rate": 1.4809513092041016e-05, | |
| "loss": 6.547, | |
| "step": 369000 | |
| }, | |
| { | |
| "epoch": 0.7047653198242188, | |
| "grad_norm": 147.0093231201172, | |
| "learning_rate": 1.4761829376220704e-05, | |
| "loss": 6.5236, | |
| "step": 369500 | |
| }, | |
| { | |
| "epoch": 0.705718994140625, | |
| "grad_norm": 447.5209045410156, | |
| "learning_rate": 1.4714145660400391e-05, | |
| "loss": 6.5881, | |
| "step": 370000 | |
| }, | |
| { | |
| "epoch": 0.705718994140625, | |
| "eval_accuracy": 0.03990039138943249, | |
| "eval_loss": 6.450451374053955, | |
| "eval_runtime": 244.0341, | |
| "eval_samples_per_second": 40.978, | |
| "eval_steps_per_second": 10.244, | |
| "step": 370000 | |
| }, | |
| { | |
| "epoch": 0.7066726684570312, | |
| "grad_norm": 243.3019256591797, | |
| "learning_rate": 1.466646194458008e-05, | |
| "loss": 6.5152, | |
| "step": 370500 | |
| }, | |
| { | |
| "epoch": 0.7076263427734375, | |
| "grad_norm": 129.59954833984375, | |
| "learning_rate": 1.4618778228759766e-05, | |
| "loss": 6.5965, | |
| "step": 371000 | |
| }, | |
| { | |
| "epoch": 0.7085800170898438, | |
| "grad_norm": 88.1178970336914, | |
| "learning_rate": 1.4571094512939453e-05, | |
| "loss": 6.5418, | |
| "step": 371500 | |
| }, | |
| { | |
| "epoch": 0.70953369140625, | |
| "grad_norm": 170.9890594482422, | |
| "learning_rate": 1.4523410797119142e-05, | |
| "loss": 6.5358, | |
| "step": 372000 | |
| }, | |
| { | |
| "epoch": 0.7104873657226562, | |
| "grad_norm": 765.4295654296875, | |
| "learning_rate": 1.4475727081298829e-05, | |
| "loss": 6.5584, | |
| "step": 372500 | |
| }, | |
| { | |
| "epoch": 0.7114410400390625, | |
| "grad_norm": 296.3957824707031, | |
| "learning_rate": 1.4428043365478517e-05, | |
| "loss": 6.537, | |
| "step": 373000 | |
| }, | |
| { | |
| "epoch": 0.7123947143554688, | |
| "grad_norm": 238.33856201171875, | |
| "learning_rate": 1.4380359649658204e-05, | |
| "loss": 6.5633, | |
| "step": 373500 | |
| }, | |
| { | |
| "epoch": 0.713348388671875, | |
| "grad_norm": 175.88967895507812, | |
| "learning_rate": 1.433267593383789e-05, | |
| "loss": 6.5366, | |
| "step": 374000 | |
| }, | |
| { | |
| "epoch": 0.7143020629882812, | |
| "grad_norm": 101.9058609008789, | |
| "learning_rate": 1.428499221801758e-05, | |
| "loss": 6.5549, | |
| "step": 374500 | |
| }, | |
| { | |
| "epoch": 0.7152557373046875, | |
| "grad_norm": 140.43959045410156, | |
| "learning_rate": 1.4237308502197266e-05, | |
| "loss": 6.5353, | |
| "step": 375000 | |
| }, | |
| { | |
| "epoch": 0.7152557373046875, | |
| "eval_accuracy": 0.040435616438356164, | |
| "eval_loss": 6.447012901306152, | |
| "eval_runtime": 239.2555, | |
| "eval_samples_per_second": 41.796, | |
| "eval_steps_per_second": 10.449, | |
| "step": 375000 | |
| }, | |
| { | |
| "epoch": 0.7162094116210938, | |
| "grad_norm": 141.53201293945312, | |
| "learning_rate": 1.4189624786376955e-05, | |
| "loss": 6.4952, | |
| "step": 375500 | |
| }, | |
| { | |
| "epoch": 0.7171630859375, | |
| "grad_norm": 264.2409973144531, | |
| "learning_rate": 1.4141941070556641e-05, | |
| "loss": 6.5382, | |
| "step": 376000 | |
| }, | |
| { | |
| "epoch": 0.7181167602539062, | |
| "grad_norm": 667.0358276367188, | |
| "learning_rate": 1.4094257354736328e-05, | |
| "loss": 6.5211, | |
| "step": 376500 | |
| }, | |
| { | |
| "epoch": 0.7190704345703125, | |
| "grad_norm": 391.56927490234375, | |
| "learning_rate": 1.4046573638916017e-05, | |
| "loss": 6.5214, | |
| "step": 377000 | |
| }, | |
| { | |
| "epoch": 0.7200241088867188, | |
| "grad_norm": 141.28468322753906, | |
| "learning_rate": 1.3998889923095704e-05, | |
| "loss": 6.5409, | |
| "step": 377500 | |
| }, | |
| { | |
| "epoch": 0.720977783203125, | |
| "grad_norm": 377.8066711425781, | |
| "learning_rate": 1.3951206207275392e-05, | |
| "loss": 6.5413, | |
| "step": 378000 | |
| }, | |
| { | |
| "epoch": 0.7219314575195312, | |
| "grad_norm": 79.05961608886719, | |
| "learning_rate": 1.3903522491455079e-05, | |
| "loss": 6.5492, | |
| "step": 378500 | |
| }, | |
| { | |
| "epoch": 0.7228851318359375, | |
| "grad_norm": 195.48342895507812, | |
| "learning_rate": 1.3855838775634766e-05, | |
| "loss": 6.5255, | |
| "step": 379000 | |
| }, | |
| { | |
| "epoch": 0.7238388061523438, | |
| "grad_norm": 131.9173583984375, | |
| "learning_rate": 1.3808155059814454e-05, | |
| "loss": 6.5142, | |
| "step": 379500 | |
| }, | |
| { | |
| "epoch": 0.72479248046875, | |
| "grad_norm": 187.51319885253906, | |
| "learning_rate": 1.3760471343994141e-05, | |
| "loss": 6.5371, | |
| "step": 380000 | |
| }, | |
| { | |
| "epoch": 0.72479248046875, | |
| "eval_accuracy": 0.040126810176125245, | |
| "eval_loss": 6.445300102233887, | |
| "eval_runtime": 240.0497, | |
| "eval_samples_per_second": 41.658, | |
| "eval_steps_per_second": 10.415, | |
| "step": 380000 | |
| }, | |
| { | |
| "epoch": 0.7257461547851562, | |
| "grad_norm": 355.3277893066406, | |
| "learning_rate": 1.371278762817383e-05, | |
| "loss": 6.5643, | |
| "step": 380500 | |
| }, | |
| { | |
| "epoch": 0.7266998291015625, | |
| "grad_norm": 132.3724365234375, | |
| "learning_rate": 1.3665103912353516e-05, | |
| "loss": 6.5518, | |
| "step": 381000 | |
| }, | |
| { | |
| "epoch": 0.7276535034179688, | |
| "grad_norm": 138.26895141601562, | |
| "learning_rate": 1.3617420196533203e-05, | |
| "loss": 6.5359, | |
| "step": 381500 | |
| }, | |
| { | |
| "epoch": 0.728607177734375, | |
| "grad_norm": 127.5734634399414, | |
| "learning_rate": 1.3569736480712892e-05, | |
| "loss": 6.5026, | |
| "step": 382000 | |
| }, | |
| { | |
| "epoch": 0.7295608520507812, | |
| "grad_norm": 194.17762756347656, | |
| "learning_rate": 1.3522052764892579e-05, | |
| "loss": 6.4611, | |
| "step": 382500 | |
| }, | |
| { | |
| "epoch": 0.7305145263671875, | |
| "grad_norm": 97.86437225341797, | |
| "learning_rate": 1.3474369049072265e-05, | |
| "loss": 6.4943, | |
| "step": 383000 | |
| }, | |
| { | |
| "epoch": 0.7314682006835938, | |
| "grad_norm": 88.39631652832031, | |
| "learning_rate": 1.3426685333251954e-05, | |
| "loss": 6.4875, | |
| "step": 383500 | |
| }, | |
| { | |
| "epoch": 0.732421875, | |
| "grad_norm": 220.22547912597656, | |
| "learning_rate": 1.337900161743164e-05, | |
| "loss": 6.4643, | |
| "step": 384000 | |
| }, | |
| { | |
| "epoch": 0.7333755493164062, | |
| "grad_norm": 130.28128051757812, | |
| "learning_rate": 1.333131790161133e-05, | |
| "loss": 6.4525, | |
| "step": 384500 | |
| }, | |
| { | |
| "epoch": 0.7343292236328125, | |
| "grad_norm": 151.72984313964844, | |
| "learning_rate": 1.3283634185791016e-05, | |
| "loss": 6.5153, | |
| "step": 385000 | |
| }, | |
| { | |
| "epoch": 0.7343292236328125, | |
| "eval_accuracy": 0.04008082191780822, | |
| "eval_loss": 6.446129322052002, | |
| "eval_runtime": 241.548, | |
| "eval_samples_per_second": 41.4, | |
| "eval_steps_per_second": 10.35, | |
| "step": 385000 | |
| }, | |
| { | |
| "epoch": 0.7352828979492188, | |
| "grad_norm": 91.2538070678711, | |
| "learning_rate": 1.3235950469970703e-05, | |
| "loss": 6.558, | |
| "step": 385500 | |
| }, | |
| { | |
| "epoch": 0.736236572265625, | |
| "grad_norm": 772.9464721679688, | |
| "learning_rate": 1.3188266754150391e-05, | |
| "loss": 6.5602, | |
| "step": 386000 | |
| }, | |
| { | |
| "epoch": 0.7371902465820312, | |
| "grad_norm": 136.80430603027344, | |
| "learning_rate": 1.3140583038330078e-05, | |
| "loss": 6.5403, | |
| "step": 386500 | |
| }, | |
| { | |
| "epoch": 0.7381439208984375, | |
| "grad_norm": 85.5173110961914, | |
| "learning_rate": 1.3092899322509767e-05, | |
| "loss": 6.52, | |
| "step": 387000 | |
| }, | |
| { | |
| "epoch": 0.7390975952148438, | |
| "grad_norm": 575.4596557617188, | |
| "learning_rate": 1.3045215606689454e-05, | |
| "loss": 6.5036, | |
| "step": 387500 | |
| }, | |
| { | |
| "epoch": 0.74005126953125, | |
| "grad_norm": 265.74041748046875, | |
| "learning_rate": 1.299753189086914e-05, | |
| "loss": 6.563, | |
| "step": 388000 | |
| }, | |
| { | |
| "epoch": 0.7410049438476562, | |
| "grad_norm": 202.33619689941406, | |
| "learning_rate": 1.2949848175048829e-05, | |
| "loss": 6.5295, | |
| "step": 388500 | |
| }, | |
| { | |
| "epoch": 0.7419586181640625, | |
| "grad_norm": 158.96939086914062, | |
| "learning_rate": 1.2902164459228516e-05, | |
| "loss": 6.5082, | |
| "step": 389000 | |
| }, | |
| { | |
| "epoch": 0.7429122924804688, | |
| "grad_norm": 277.22186279296875, | |
| "learning_rate": 1.2854480743408204e-05, | |
| "loss": 6.5653, | |
| "step": 389500 | |
| }, | |
| { | |
| "epoch": 0.743865966796875, | |
| "grad_norm": 69.9422836303711, | |
| "learning_rate": 1.2806797027587891e-05, | |
| "loss": 6.5488, | |
| "step": 390000 | |
| }, | |
| { | |
| "epoch": 0.743865966796875, | |
| "eval_accuracy": 0.040149706457925635, | |
| "eval_loss": 6.443368911743164, | |
| "eval_runtime": 240.93, | |
| "eval_samples_per_second": 41.506, | |
| "eval_steps_per_second": 10.376, | |
| "step": 390000 | |
| }, | |
| { | |
| "epoch": 0.7448196411132812, | |
| "grad_norm": 84.9781265258789, | |
| "learning_rate": 1.2759113311767578e-05, | |
| "loss": 6.5277, | |
| "step": 390500 | |
| }, | |
| { | |
| "epoch": 0.7457733154296875, | |
| "grad_norm": 211.03887939453125, | |
| "learning_rate": 1.2711429595947266e-05, | |
| "loss": 6.5508, | |
| "step": 391000 | |
| }, | |
| { | |
| "epoch": 0.7467269897460938, | |
| "grad_norm": 120.260009765625, | |
| "learning_rate": 1.2663745880126953e-05, | |
| "loss": 6.4981, | |
| "step": 391500 | |
| }, | |
| { | |
| "epoch": 0.7476806640625, | |
| "grad_norm": 111.86019897460938, | |
| "learning_rate": 1.2616062164306642e-05, | |
| "loss": 6.5016, | |
| "step": 392000 | |
| }, | |
| { | |
| "epoch": 0.7486343383789062, | |
| "grad_norm": 136.80467224121094, | |
| "learning_rate": 1.2568378448486329e-05, | |
| "loss": 6.5204, | |
| "step": 392500 | |
| }, | |
| { | |
| "epoch": 0.7495880126953125, | |
| "grad_norm": 221.40573120117188, | |
| "learning_rate": 1.2520694732666015e-05, | |
| "loss": 6.5076, | |
| "step": 393000 | |
| }, | |
| { | |
| "epoch": 0.7505416870117188, | |
| "grad_norm": 174.58226013183594, | |
| "learning_rate": 1.2473011016845704e-05, | |
| "loss": 6.5282, | |
| "step": 393500 | |
| }, | |
| { | |
| "epoch": 0.751495361328125, | |
| "grad_norm": 108.00682830810547, | |
| "learning_rate": 1.242532730102539e-05, | |
| "loss": 6.5193, | |
| "step": 394000 | |
| }, | |
| { | |
| "epoch": 0.7524490356445312, | |
| "grad_norm": 110.358154296875, | |
| "learning_rate": 1.237764358520508e-05, | |
| "loss": 6.522, | |
| "step": 394500 | |
| }, | |
| { | |
| "epoch": 0.7534027099609375, | |
| "grad_norm": 114.46678924560547, | |
| "learning_rate": 1.2329959869384766e-05, | |
| "loss": 6.5121, | |
| "step": 395000 | |
| }, | |
| { | |
| "epoch": 0.7534027099609375, | |
| "eval_accuracy": 0.03947162426614481, | |
| "eval_loss": 6.446810722351074, | |
| "eval_runtime": 240.4854, | |
| "eval_samples_per_second": 41.583, | |
| "eval_steps_per_second": 10.396, | |
| "step": 395000 | |
| }, | |
| { | |
| "epoch": 0.7543563842773438, | |
| "grad_norm": 686.0939331054688, | |
| "learning_rate": 1.2282276153564453e-05, | |
| "loss": 6.4873, | |
| "step": 395500 | |
| }, | |
| { | |
| "epoch": 0.75531005859375, | |
| "grad_norm": 323.3516540527344, | |
| "learning_rate": 1.2234592437744141e-05, | |
| "loss": 6.4371, | |
| "step": 396000 | |
| }, | |
| { | |
| "epoch": 0.7562637329101562, | |
| "grad_norm": 552.0935668945312, | |
| "learning_rate": 1.2186908721923828e-05, | |
| "loss": 6.5156, | |
| "step": 396500 | |
| }, | |
| { | |
| "epoch": 0.7572174072265625, | |
| "grad_norm": 145.94496154785156, | |
| "learning_rate": 1.2139225006103517e-05, | |
| "loss": 6.5244, | |
| "step": 397000 | |
| }, | |
| { | |
| "epoch": 0.7581710815429688, | |
| "grad_norm": 213.95999145507812, | |
| "learning_rate": 1.2091541290283204e-05, | |
| "loss": 6.5308, | |
| "step": 397500 | |
| }, | |
| { | |
| "epoch": 0.759124755859375, | |
| "grad_norm": 179.3788299560547, | |
| "learning_rate": 1.204385757446289e-05, | |
| "loss": 6.4978, | |
| "step": 398000 | |
| }, | |
| { | |
| "epoch": 0.7600784301757812, | |
| "grad_norm": 479.97125244140625, | |
| "learning_rate": 1.1996173858642579e-05, | |
| "loss": 6.5211, | |
| "step": 398500 | |
| }, | |
| { | |
| "epoch": 0.7610321044921875, | |
| "grad_norm": 182.0722198486328, | |
| "learning_rate": 1.1948490142822266e-05, | |
| "loss": 6.5162, | |
| "step": 399000 | |
| }, | |
| { | |
| "epoch": 0.7619857788085938, | |
| "grad_norm": 134.76768493652344, | |
| "learning_rate": 1.1900806427001954e-05, | |
| "loss": 6.4966, | |
| "step": 399500 | |
| }, | |
| { | |
| "epoch": 0.762939453125, | |
| "grad_norm": 229.81442260742188, | |
| "learning_rate": 1.1853122711181641e-05, | |
| "loss": 6.5106, | |
| "step": 400000 | |
| }, | |
| { | |
| "epoch": 0.762939453125, | |
| "eval_accuracy": 0.039886692759295496, | |
| "eval_loss": 6.436385154724121, | |
| "eval_runtime": 242.7965, | |
| "eval_samples_per_second": 41.187, | |
| "eval_steps_per_second": 10.297, | |
| "step": 400000 | |
| }, | |
| { | |
| "epoch": 0.7638931274414062, | |
| "grad_norm": 249.11834716796875, | |
| "learning_rate": 1.1805438995361328e-05, | |
| "loss": 6.5044, | |
| "step": 400500 | |
| }, | |
| { | |
| "epoch": 0.7648468017578125, | |
| "grad_norm": 122.83702850341797, | |
| "learning_rate": 1.1757755279541016e-05, | |
| "loss": 6.4648, | |
| "step": 401000 | |
| }, | |
| { | |
| "epoch": 0.7658004760742188, | |
| "grad_norm": 123.14421844482422, | |
| "learning_rate": 1.1710071563720703e-05, | |
| "loss": 6.4555, | |
| "step": 401500 | |
| }, | |
| { | |
| "epoch": 0.766754150390625, | |
| "grad_norm": 275.2742004394531, | |
| "learning_rate": 1.1662387847900392e-05, | |
| "loss": 6.4705, | |
| "step": 402000 | |
| }, | |
| { | |
| "epoch": 0.7677078247070312, | |
| "grad_norm": 132.36386108398438, | |
| "learning_rate": 1.1614704132080079e-05, | |
| "loss": 6.4411, | |
| "step": 402500 | |
| }, | |
| { | |
| "epoch": 0.7686614990234375, | |
| "grad_norm": 75.54257202148438, | |
| "learning_rate": 1.1567020416259765e-05, | |
| "loss": 6.5292, | |
| "step": 403000 | |
| }, | |
| { | |
| "epoch": 0.7696151733398438, | |
| "grad_norm": 352.4110412597656, | |
| "learning_rate": 1.1519336700439454e-05, | |
| "loss": 6.47, | |
| "step": 403500 | |
| }, | |
| { | |
| "epoch": 0.77056884765625, | |
| "grad_norm": 138.96554565429688, | |
| "learning_rate": 1.147165298461914e-05, | |
| "loss": 6.5295, | |
| "step": 404000 | |
| }, | |
| { | |
| "epoch": 0.7715225219726562, | |
| "grad_norm": 162.00111389160156, | |
| "learning_rate": 1.142396926879883e-05, | |
| "loss": 6.5852, | |
| "step": 404500 | |
| }, | |
| { | |
| "epoch": 0.7724761962890625, | |
| "grad_norm": 119.4192886352539, | |
| "learning_rate": 1.1376285552978516e-05, | |
| "loss": 6.5572, | |
| "step": 405000 | |
| }, | |
| { | |
| "epoch": 0.7724761962890625, | |
| "eval_accuracy": 0.0405041095890411, | |
| "eval_loss": 6.444308757781982, | |
| "eval_runtime": 239.539, | |
| "eval_samples_per_second": 41.747, | |
| "eval_steps_per_second": 10.437, | |
| "step": 405000 | |
| }, | |
| { | |
| "epoch": 0.7734298706054688, | |
| "grad_norm": 112.89866638183594, | |
| "learning_rate": 1.1328601837158203e-05, | |
| "loss": 6.5389, | |
| "step": 405500 | |
| }, | |
| { | |
| "epoch": 0.774383544921875, | |
| "grad_norm": 404.556640625, | |
| "learning_rate": 1.1280918121337891e-05, | |
| "loss": 6.563, | |
| "step": 406000 | |
| }, | |
| { | |
| "epoch": 0.7753372192382812, | |
| "grad_norm": 448.70306396484375, | |
| "learning_rate": 1.1233234405517578e-05, | |
| "loss": 6.5147, | |
| "step": 406500 | |
| }, | |
| { | |
| "epoch": 0.7762908935546875, | |
| "grad_norm": 48.68925857543945, | |
| "learning_rate": 1.1185550689697267e-05, | |
| "loss": 6.5531, | |
| "step": 407000 | |
| }, | |
| { | |
| "epoch": 0.7772445678710938, | |
| "grad_norm": 332.9776611328125, | |
| "learning_rate": 1.1137866973876954e-05, | |
| "loss": 6.4849, | |
| "step": 407500 | |
| }, | |
| { | |
| "epoch": 0.7781982421875, | |
| "grad_norm": 134.01174926757812, | |
| "learning_rate": 1.109018325805664e-05, | |
| "loss": 6.4986, | |
| "step": 408000 | |
| }, | |
| { | |
| "epoch": 0.7791519165039062, | |
| "grad_norm": 172.95147705078125, | |
| "learning_rate": 1.1042499542236329e-05, | |
| "loss": 6.5161, | |
| "step": 408500 | |
| }, | |
| { | |
| "epoch": 0.7801055908203125, | |
| "grad_norm": 178.749267578125, | |
| "learning_rate": 1.0994815826416016e-05, | |
| "loss": 6.5186, | |
| "step": 409000 | |
| }, | |
| { | |
| "epoch": 0.7810592651367188, | |
| "grad_norm": 1029.6717529296875, | |
| "learning_rate": 1.0947132110595704e-05, | |
| "loss": 6.5478, | |
| "step": 409500 | |
| }, | |
| { | |
| "epoch": 0.782012939453125, | |
| "grad_norm": 202.45565795898438, | |
| "learning_rate": 1.0899448394775391e-05, | |
| "loss": 6.494, | |
| "step": 410000 | |
| }, | |
| { | |
| "epoch": 0.782012939453125, | |
| "eval_accuracy": 0.040156360078277886, | |
| "eval_loss": 6.433761119842529, | |
| "eval_runtime": 241.6954, | |
| "eval_samples_per_second": 41.374, | |
| "eval_steps_per_second": 10.344, | |
| "step": 410000 | |
| }, | |
| { | |
| "epoch": 0.7829666137695312, | |
| "grad_norm": 999.4024047851562, | |
| "learning_rate": 1.0851764678955078e-05, | |
| "loss": 6.46, | |
| "step": 410500 | |
| }, | |
| { | |
| "epoch": 0.7839202880859375, | |
| "grad_norm": 179.73165893554688, | |
| "learning_rate": 1.0804080963134766e-05, | |
| "loss": 6.5209, | |
| "step": 411000 | |
| }, | |
| { | |
| "epoch": 0.7848739624023438, | |
| "grad_norm": 234.8481903076172, | |
| "learning_rate": 1.0756397247314453e-05, | |
| "loss": 6.5324, | |
| "step": 411500 | |
| }, | |
| { | |
| "epoch": 0.78582763671875, | |
| "grad_norm": 126.8324966430664, | |
| "learning_rate": 1.0708713531494142e-05, | |
| "loss": 6.5223, | |
| "step": 412000 | |
| }, | |
| { | |
| "epoch": 0.7867813110351562, | |
| "grad_norm": 92.26342010498047, | |
| "learning_rate": 1.0661029815673829e-05, | |
| "loss": 6.5303, | |
| "step": 412500 | |
| }, | |
| { | |
| "epoch": 0.7877349853515625, | |
| "grad_norm": 309.4622497558594, | |
| "learning_rate": 1.0613346099853515e-05, | |
| "loss": 6.5365, | |
| "step": 413000 | |
| }, | |
| { | |
| "epoch": 0.7886886596679688, | |
| "grad_norm": 124.4228286743164, | |
| "learning_rate": 1.0565662384033204e-05, | |
| "loss": 6.5303, | |
| "step": 413500 | |
| }, | |
| { | |
| "epoch": 0.789642333984375, | |
| "grad_norm": 134.06324768066406, | |
| "learning_rate": 1.051797866821289e-05, | |
| "loss": 6.5122, | |
| "step": 414000 | |
| }, | |
| { | |
| "epoch": 0.7905960083007812, | |
| "grad_norm": 749.5646362304688, | |
| "learning_rate": 1.047029495239258e-05, | |
| "loss": 6.5199, | |
| "step": 414500 | |
| }, | |
| { | |
| "epoch": 0.7915496826171875, | |
| "grad_norm": 158.85972595214844, | |
| "learning_rate": 1.0422611236572266e-05, | |
| "loss": 6.5233, | |
| "step": 415000 | |
| }, | |
| { | |
| "epoch": 0.7915496826171875, | |
| "eval_accuracy": 0.04043620352250489, | |
| "eval_loss": 6.431772232055664, | |
| "eval_runtime": 247.2315, | |
| "eval_samples_per_second": 40.448, | |
| "eval_steps_per_second": 10.112, | |
| "step": 415000 | |
| }, | |
| { | |
| "epoch": 0.7925033569335938, | |
| "grad_norm": 337.47760009765625, | |
| "learning_rate": 1.0374927520751953e-05, | |
| "loss": 6.5102, | |
| "step": 415500 | |
| }, | |
| { | |
| "epoch": 0.79345703125, | |
| "grad_norm": 134.16856384277344, | |
| "learning_rate": 1.0327243804931641e-05, | |
| "loss": 6.5078, | |
| "step": 416000 | |
| }, | |
| { | |
| "epoch": 0.7944107055664062, | |
| "grad_norm": 394.0135192871094, | |
| "learning_rate": 1.0279560089111328e-05, | |
| "loss": 6.5206, | |
| "step": 416500 | |
| }, | |
| { | |
| "epoch": 0.7953643798828125, | |
| "grad_norm": 176.5369415283203, | |
| "learning_rate": 1.0231876373291017e-05, | |
| "loss": 6.5359, | |
| "step": 417000 | |
| }, | |
| { | |
| "epoch": 0.7963180541992188, | |
| "grad_norm": 180.64622497558594, | |
| "learning_rate": 1.0184192657470704e-05, | |
| "loss": 6.4871, | |
| "step": 417500 | |
| }, | |
| { | |
| "epoch": 0.797271728515625, | |
| "grad_norm": 299.3851013183594, | |
| "learning_rate": 1.013650894165039e-05, | |
| "loss": 6.5244, | |
| "step": 418000 | |
| }, | |
| { | |
| "epoch": 0.7982254028320312, | |
| "grad_norm": 844.39990234375, | |
| "learning_rate": 1.0088825225830079e-05, | |
| "loss": 6.4919, | |
| "step": 418500 | |
| }, | |
| { | |
| "epoch": 0.7991790771484375, | |
| "grad_norm": 264.82891845703125, | |
| "learning_rate": 1.0041141510009766e-05, | |
| "loss": 6.4495, | |
| "step": 419000 | |
| }, | |
| { | |
| "epoch": 0.8001327514648438, | |
| "grad_norm": 470.61968994140625, | |
| "learning_rate": 9.993457794189454e-06, | |
| "loss": 6.4704, | |
| "step": 419500 | |
| }, | |
| { | |
| "epoch": 0.80108642578125, | |
| "grad_norm": 1155.0771484375, | |
| "learning_rate": 9.945774078369141e-06, | |
| "loss": 6.4718, | |
| "step": 420000 | |
| }, | |
| { | |
| "epoch": 0.80108642578125, | |
| "eval_accuracy": 0.041468884540117414, | |
| "eval_loss": 6.425516128540039, | |
| "eval_runtime": 241.3638, | |
| "eval_samples_per_second": 41.431, | |
| "eval_steps_per_second": 10.358, | |
| "step": 420000 | |
| }, | |
| { | |
| "epoch": 0.8020401000976562, | |
| "grad_norm": 182.08621215820312, | |
| "learning_rate": 9.898090362548828e-06, | |
| "loss": 6.4815, | |
| "step": 420500 | |
| }, | |
| { | |
| "epoch": 0.8029937744140625, | |
| "grad_norm": 159.288330078125, | |
| "learning_rate": 9.850406646728516e-06, | |
| "loss": 6.5181, | |
| "step": 421000 | |
| }, | |
| { | |
| "epoch": 0.8039474487304688, | |
| "grad_norm": 503.52655029296875, | |
| "learning_rate": 9.802722930908203e-06, | |
| "loss": 6.5563, | |
| "step": 421500 | |
| }, | |
| { | |
| "epoch": 0.804901123046875, | |
| "grad_norm": 136.9956817626953, | |
| "learning_rate": 9.755039215087892e-06, | |
| "loss": 6.5364, | |
| "step": 422000 | |
| }, | |
| { | |
| "epoch": 0.8058547973632812, | |
| "grad_norm": 124.22718048095703, | |
| "learning_rate": 9.707355499267579e-06, | |
| "loss": 6.5407, | |
| "step": 422500 | |
| }, | |
| { | |
| "epoch": 0.8068084716796875, | |
| "grad_norm": 471.5389404296875, | |
| "learning_rate": 9.659671783447265e-06, | |
| "loss": 6.5318, | |
| "step": 423000 | |
| }, | |
| { | |
| "epoch": 0.8077621459960938, | |
| "grad_norm": 334.5975341796875, | |
| "learning_rate": 9.611988067626954e-06, | |
| "loss": 6.5438, | |
| "step": 423500 | |
| }, | |
| { | |
| "epoch": 0.8087158203125, | |
| "grad_norm": 294.684326171875, | |
| "learning_rate": 9.56430435180664e-06, | |
| "loss": 6.492, | |
| "step": 424000 | |
| }, | |
| { | |
| "epoch": 0.8096694946289062, | |
| "grad_norm": 143.47584533691406, | |
| "learning_rate": 9.51662063598633e-06, | |
| "loss": 6.5305, | |
| "step": 424500 | |
| }, | |
| { | |
| "epoch": 0.8106231689453125, | |
| "grad_norm": 219.26388549804688, | |
| "learning_rate": 9.468936920166016e-06, | |
| "loss": 6.4915, | |
| "step": 425000 | |
| }, | |
| { | |
| "epoch": 0.8106231689453125, | |
| "eval_accuracy": 0.040221135029354205, | |
| "eval_loss": 6.426062107086182, | |
| "eval_runtime": 238.9838, | |
| "eval_samples_per_second": 41.844, | |
| "eval_steps_per_second": 10.461, | |
| "step": 425000 | |
| }, | |
| { | |
| "epoch": 0.8115768432617188, | |
| "grad_norm": 85.5811996459961, | |
| "learning_rate": 9.421253204345703e-06, | |
| "loss": 6.508, | |
| "step": 425500 | |
| }, | |
| { | |
| "epoch": 0.812530517578125, | |
| "grad_norm": 134.37989807128906, | |
| "learning_rate": 9.373569488525391e-06, | |
| "loss": 6.5178, | |
| "step": 426000 | |
| }, | |
| { | |
| "epoch": 0.8134841918945312, | |
| "grad_norm": 824.7955932617188, | |
| "learning_rate": 9.325885772705078e-06, | |
| "loss": 6.5363, | |
| "step": 426500 | |
| }, | |
| { | |
| "epoch": 0.8144378662109375, | |
| "grad_norm": 134.56495666503906, | |
| "learning_rate": 9.278202056884767e-06, | |
| "loss": 6.5228, | |
| "step": 427000 | |
| }, | |
| { | |
| "epoch": 0.8153915405273438, | |
| "grad_norm": 320.2661437988281, | |
| "learning_rate": 9.230518341064454e-06, | |
| "loss": 6.5003, | |
| "step": 427500 | |
| }, | |
| { | |
| "epoch": 0.81634521484375, | |
| "grad_norm": 97.85888671875, | |
| "learning_rate": 9.18283462524414e-06, | |
| "loss": 6.512, | |
| "step": 428000 | |
| }, | |
| { | |
| "epoch": 0.8172988891601562, | |
| "grad_norm": 113.15918731689453, | |
| "learning_rate": 9.135150909423829e-06, | |
| "loss": 6.5378, | |
| "step": 428500 | |
| }, | |
| { | |
| "epoch": 0.8182525634765625, | |
| "grad_norm": 104.32511138916016, | |
| "learning_rate": 9.087467193603516e-06, | |
| "loss": 6.498, | |
| "step": 429000 | |
| }, | |
| { | |
| "epoch": 0.8192062377929688, | |
| "grad_norm": 139.578857421875, | |
| "learning_rate": 9.039783477783204e-06, | |
| "loss": 6.5095, | |
| "step": 429500 | |
| }, | |
| { | |
| "epoch": 0.820159912109375, | |
| "grad_norm": 152.16114807128906, | |
| "learning_rate": 8.992099761962891e-06, | |
| "loss": 6.5007, | |
| "step": 430000 | |
| }, | |
| { | |
| "epoch": 0.820159912109375, | |
| "eval_accuracy": 0.0407105675146771, | |
| "eval_loss": 6.424362659454346, | |
| "eval_runtime": 238.8953, | |
| "eval_samples_per_second": 41.859, | |
| "eval_steps_per_second": 10.465, | |
| "step": 430000 | |
| }, | |
| { | |
| "epoch": 0.8211135864257812, | |
| "grad_norm": 178.8160400390625, | |
| "learning_rate": 8.944416046142578e-06, | |
| "loss": 6.4954, | |
| "step": 430500 | |
| }, | |
| { | |
| "epoch": 0.8220672607421875, | |
| "grad_norm": 467.9737548828125, | |
| "learning_rate": 8.896732330322266e-06, | |
| "loss": 6.522, | |
| "step": 431000 | |
| }, | |
| { | |
| "epoch": 0.8230209350585938, | |
| "grad_norm": 283.6147766113281, | |
| "learning_rate": 8.849048614501953e-06, | |
| "loss": 6.5218, | |
| "step": 431500 | |
| }, | |
| { | |
| "epoch": 0.823974609375, | |
| "grad_norm": 129.71145629882812, | |
| "learning_rate": 8.801364898681642e-06, | |
| "loss": 6.4796, | |
| "step": 432000 | |
| }, | |
| { | |
| "epoch": 0.8249282836914062, | |
| "grad_norm": 83.04669189453125, | |
| "learning_rate": 8.753681182861329e-06, | |
| "loss": 6.5349, | |
| "step": 432500 | |
| }, | |
| { | |
| "epoch": 0.8258819580078125, | |
| "grad_norm": 258.3847351074219, | |
| "learning_rate": 8.705997467041015e-06, | |
| "loss": 6.4963, | |
| "step": 433000 | |
| }, | |
| { | |
| "epoch": 0.8268356323242188, | |
| "grad_norm": 394.8684387207031, | |
| "learning_rate": 8.658313751220704e-06, | |
| "loss": 6.522, | |
| "step": 433500 | |
| }, | |
| { | |
| "epoch": 0.827789306640625, | |
| "grad_norm": 164.72079467773438, | |
| "learning_rate": 8.61063003540039e-06, | |
| "loss": 6.5048, | |
| "step": 434000 | |
| }, | |
| { | |
| "epoch": 0.8287429809570312, | |
| "grad_norm": 265.7942199707031, | |
| "learning_rate": 8.56294631958008e-06, | |
| "loss": 6.5104, | |
| "step": 434500 | |
| }, | |
| { | |
| "epoch": 0.8296966552734375, | |
| "grad_norm": 123.39559936523438, | |
| "learning_rate": 8.515262603759766e-06, | |
| "loss": 6.488, | |
| "step": 435000 | |
| }, | |
| { | |
| "epoch": 0.8296966552734375, | |
| "eval_accuracy": 0.04050900195694716, | |
| "eval_loss": 6.418701171875, | |
| "eval_runtime": 238.8227, | |
| "eval_samples_per_second": 41.872, | |
| "eval_steps_per_second": 10.468, | |
| "step": 435000 | |
| }, | |
| { | |
| "epoch": 0.8306503295898438, | |
| "grad_norm": 109.87480926513672, | |
| "learning_rate": 8.467578887939453e-06, | |
| "loss": 6.4846, | |
| "step": 435500 | |
| }, | |
| { | |
| "epoch": 0.83160400390625, | |
| "grad_norm": 240.57347106933594, | |
| "learning_rate": 8.419895172119141e-06, | |
| "loss": 6.4685, | |
| "step": 436000 | |
| }, | |
| { | |
| "epoch": 0.8325576782226562, | |
| "grad_norm": 178.5532989501953, | |
| "learning_rate": 8.372211456298828e-06, | |
| "loss": 6.4504, | |
| "step": 436500 | |
| }, | |
| { | |
| "epoch": 0.8335113525390625, | |
| "grad_norm": 548.3411254882812, | |
| "learning_rate": 8.324527740478517e-06, | |
| "loss": 6.4365, | |
| "step": 437000 | |
| }, | |
| { | |
| "epoch": 0.8344650268554688, | |
| "grad_norm": 261.7312316894531, | |
| "learning_rate": 8.276844024658204e-06, | |
| "loss": 6.4829, | |
| "step": 437500 | |
| }, | |
| { | |
| "epoch": 0.835418701171875, | |
| "grad_norm": 180.61085510253906, | |
| "learning_rate": 8.22916030883789e-06, | |
| "loss": 6.4908, | |
| "step": 438000 | |
| }, | |
| { | |
| "epoch": 0.8363723754882812, | |
| "grad_norm": 141.17413330078125, | |
| "learning_rate": 8.181476593017579e-06, | |
| "loss": 6.5328, | |
| "step": 438500 | |
| }, | |
| { | |
| "epoch": 0.8373260498046875, | |
| "grad_norm": 100.98048400878906, | |
| "learning_rate": 8.133792877197266e-06, | |
| "loss": 6.5346, | |
| "step": 439000 | |
| }, | |
| { | |
| "epoch": 0.8382797241210938, | |
| "grad_norm": 131.7880401611328, | |
| "learning_rate": 8.086109161376954e-06, | |
| "loss": 6.5343, | |
| "step": 439500 | |
| }, | |
| { | |
| "epoch": 0.8392333984375, | |
| "grad_norm": 143.68992614746094, | |
| "learning_rate": 8.038425445556641e-06, | |
| "loss": 6.5093, | |
| "step": 440000 | |
| }, | |
| { | |
| "epoch": 0.8392333984375, | |
| "eval_accuracy": 0.04017945205479452, | |
| "eval_loss": 6.416311740875244, | |
| "eval_runtime": 239.7034, | |
| "eval_samples_per_second": 41.718, | |
| "eval_steps_per_second": 10.43, | |
| "step": 440000 | |
| }, | |
| { | |
| "epoch": 0.8401870727539062, | |
| "grad_norm": 221.09878540039062, | |
| "learning_rate": 7.990741729736328e-06, | |
| "loss": 6.5131, | |
| "step": 440500 | |
| }, | |
| { | |
| "epoch": 0.8411407470703125, | |
| "grad_norm": 147.80081176757812, | |
| "learning_rate": 7.943058013916016e-06, | |
| "loss": 6.4949, | |
| "step": 441000 | |
| }, | |
| { | |
| "epoch": 0.8420944213867188, | |
| "grad_norm": 98.18143463134766, | |
| "learning_rate": 7.895374298095703e-06, | |
| "loss": 6.4659, | |
| "step": 441500 | |
| }, | |
| { | |
| "epoch": 0.843048095703125, | |
| "grad_norm": 213.72427368164062, | |
| "learning_rate": 7.847690582275392e-06, | |
| "loss": 6.4745, | |
| "step": 442000 | |
| }, | |
| { | |
| "epoch": 0.8440017700195312, | |
| "grad_norm": 171.76968383789062, | |
| "learning_rate": 7.800006866455079e-06, | |
| "loss": 6.5142, | |
| "step": 442500 | |
| }, | |
| { | |
| "epoch": 0.8449554443359375, | |
| "grad_norm": 241.19647216796875, | |
| "learning_rate": 7.752323150634765e-06, | |
| "loss": 6.5102, | |
| "step": 443000 | |
| }, | |
| { | |
| "epoch": 0.8459091186523438, | |
| "grad_norm": 111.6817626953125, | |
| "learning_rate": 7.704639434814454e-06, | |
| "loss": 6.4809, | |
| "step": 443500 | |
| }, | |
| { | |
| "epoch": 0.84686279296875, | |
| "grad_norm": 168.18035888671875, | |
| "learning_rate": 7.65695571899414e-06, | |
| "loss": 6.5021, | |
| "step": 444000 | |
| }, | |
| { | |
| "epoch": 0.8478164672851562, | |
| "grad_norm": 603.36572265625, | |
| "learning_rate": 7.6092720031738284e-06, | |
| "loss": 6.4793, | |
| "step": 444500 | |
| }, | |
| { | |
| "epoch": 0.8487701416015625, | |
| "grad_norm": 228.9732666015625, | |
| "learning_rate": 7.561588287353516e-06, | |
| "loss": 6.5029, | |
| "step": 445000 | |
| }, | |
| { | |
| "epoch": 0.8487701416015625, | |
| "eval_accuracy": 0.040287671232876715, | |
| "eval_loss": 6.415238380432129, | |
| "eval_runtime": 238.84, | |
| "eval_samples_per_second": 41.869, | |
| "eval_steps_per_second": 10.467, | |
| "step": 445000 | |
| }, | |
| { | |
| "epoch": 0.8497238159179688, | |
| "grad_norm": 395.8603820800781, | |
| "learning_rate": 7.513904571533204e-06, | |
| "loss": 6.5165, | |
| "step": 445500 | |
| }, | |
| { | |
| "epoch": 0.850677490234375, | |
| "grad_norm": 324.56048583984375, | |
| "learning_rate": 7.466220855712891e-06, | |
| "loss": 6.5002, | |
| "step": 446000 | |
| }, | |
| { | |
| "epoch": 0.8516311645507812, | |
| "grad_norm": 150.86160278320312, | |
| "learning_rate": 7.418537139892578e-06, | |
| "loss": 6.5325, | |
| "step": 446500 | |
| }, | |
| { | |
| "epoch": 0.8525848388671875, | |
| "grad_norm": 416.0628967285156, | |
| "learning_rate": 7.370853424072266e-06, | |
| "loss": 6.5135, | |
| "step": 447000 | |
| }, | |
| { | |
| "epoch": 0.8535385131835938, | |
| "grad_norm": 210.53988647460938, | |
| "learning_rate": 7.323169708251954e-06, | |
| "loss": 6.5016, | |
| "step": 447500 | |
| }, | |
| { | |
| "epoch": 0.8544921875, | |
| "grad_norm": 228.83592224121094, | |
| "learning_rate": 7.275485992431641e-06, | |
| "loss": 6.529, | |
| "step": 448000 | |
| }, | |
| { | |
| "epoch": 0.8554458618164062, | |
| "grad_norm": 73.7596206665039, | |
| "learning_rate": 7.227802276611328e-06, | |
| "loss": 6.4851, | |
| "step": 448500 | |
| }, | |
| { | |
| "epoch": 0.8563995361328125, | |
| "grad_norm": 113.41374969482422, | |
| "learning_rate": 7.180118560791016e-06, | |
| "loss": 6.4297, | |
| "step": 449000 | |
| }, | |
| { | |
| "epoch": 0.8573532104492188, | |
| "grad_norm": 164.508056640625, | |
| "learning_rate": 7.1324348449707034e-06, | |
| "loss": 6.5079, | |
| "step": 449500 | |
| }, | |
| { | |
| "epoch": 0.858306884765625, | |
| "grad_norm": 103.63333892822266, | |
| "learning_rate": 7.084751129150391e-06, | |
| "loss": 6.3943, | |
| "step": 450000 | |
| }, | |
| { | |
| "epoch": 0.858306884765625, | |
| "eval_accuracy": 0.04016125244618395, | |
| "eval_loss": 6.420052528381348, | |
| "eval_runtime": 238.5095, | |
| "eval_samples_per_second": 41.927, | |
| "eval_steps_per_second": 10.482, | |
| "step": 450000 | |
| }, | |
| { | |
| "epoch": 0.8592605590820312, | |
| "grad_norm": 207.37977600097656, | |
| "learning_rate": 7.037067413330079e-06, | |
| "loss": 6.4935, | |
| "step": 450500 | |
| }, | |
| { | |
| "epoch": 0.8602142333984375, | |
| "grad_norm": 143.06759643554688, | |
| "learning_rate": 6.989383697509766e-06, | |
| "loss": 6.4474, | |
| "step": 451000 | |
| }, | |
| { | |
| "epoch": 0.8611679077148438, | |
| "grad_norm": 237.50033569335938, | |
| "learning_rate": 6.941699981689453e-06, | |
| "loss": 6.5138, | |
| "step": 451500 | |
| }, | |
| { | |
| "epoch": 0.86212158203125, | |
| "grad_norm": 86.28911590576172, | |
| "learning_rate": 6.894016265869141e-06, | |
| "loss": 6.4986, | |
| "step": 452000 | |
| }, | |
| { | |
| "epoch": 0.8630752563476562, | |
| "grad_norm": 249.3565673828125, | |
| "learning_rate": 6.846332550048829e-06, | |
| "loss": 6.4488, | |
| "step": 452500 | |
| }, | |
| { | |
| "epoch": 0.8640289306640625, | |
| "grad_norm": 161.2953643798828, | |
| "learning_rate": 6.798648834228516e-06, | |
| "loss": 6.4201, | |
| "step": 453000 | |
| }, | |
| { | |
| "epoch": 0.8649826049804688, | |
| "grad_norm": 94.91120147705078, | |
| "learning_rate": 6.750965118408203e-06, | |
| "loss": 6.4487, | |
| "step": 453500 | |
| }, | |
| { | |
| "epoch": 0.865936279296875, | |
| "grad_norm": 160.8304901123047, | |
| "learning_rate": 6.703281402587891e-06, | |
| "loss": 6.4593, | |
| "step": 454000 | |
| }, | |
| { | |
| "epoch": 0.8668899536132812, | |
| "grad_norm": 121.05160522460938, | |
| "learning_rate": 6.6555976867675784e-06, | |
| "loss": 6.5208, | |
| "step": 454500 | |
| }, | |
| { | |
| "epoch": 0.8678436279296875, | |
| "grad_norm": 161.79888916015625, | |
| "learning_rate": 6.607913970947266e-06, | |
| "loss": 6.5358, | |
| "step": 455000 | |
| }, | |
| { | |
| "epoch": 0.8678436279296875, | |
| "eval_accuracy": 0.04102739726027397, | |
| "eval_loss": 6.410449028015137, | |
| "eval_runtime": 241.4157, | |
| "eval_samples_per_second": 41.422, | |
| "eval_steps_per_second": 10.356, | |
| "step": 455000 | |
| }, | |
| { | |
| "epoch": 0.8687973022460938, | |
| "grad_norm": 994.7066040039062, | |
| "learning_rate": 6.560230255126954e-06, | |
| "loss": 6.467, | |
| "step": 455500 | |
| }, | |
| { | |
| "epoch": 0.8697509765625, | |
| "grad_norm": 101.90473937988281, | |
| "learning_rate": 6.512546539306641e-06, | |
| "loss": 6.5229, | |
| "step": 456000 | |
| }, | |
| { | |
| "epoch": 0.8707046508789062, | |
| "grad_norm": 317.31573486328125, | |
| "learning_rate": 6.464862823486328e-06, | |
| "loss": 6.4841, | |
| "step": 456500 | |
| }, | |
| { | |
| "epoch": 0.8716583251953125, | |
| "grad_norm": 152.74020385742188, | |
| "learning_rate": 6.417179107666016e-06, | |
| "loss": 6.4979, | |
| "step": 457000 | |
| }, | |
| { | |
| "epoch": 0.8726119995117188, | |
| "grad_norm": 78.52469635009766, | |
| "learning_rate": 6.369495391845704e-06, | |
| "loss": 6.4786, | |
| "step": 457500 | |
| }, | |
| { | |
| "epoch": 0.873565673828125, | |
| "grad_norm": 189.6461181640625, | |
| "learning_rate": 6.321811676025391e-06, | |
| "loss": 6.5052, | |
| "step": 458000 | |
| }, | |
| { | |
| "epoch": 0.8745193481445312, | |
| "grad_norm": 255.77638244628906, | |
| "learning_rate": 6.274127960205078e-06, | |
| "loss": 6.4747, | |
| "step": 458500 | |
| }, | |
| { | |
| "epoch": 0.8754730224609375, | |
| "grad_norm": 665.3084716796875, | |
| "learning_rate": 6.226444244384766e-06, | |
| "loss": 6.5084, | |
| "step": 459000 | |
| }, | |
| { | |
| "epoch": 0.8764266967773438, | |
| "grad_norm": 138.03125, | |
| "learning_rate": 6.1787605285644534e-06, | |
| "loss": 6.4459, | |
| "step": 459500 | |
| }, | |
| { | |
| "epoch": 0.87738037109375, | |
| "grad_norm": 112.24298095703125, | |
| "learning_rate": 6.131076812744141e-06, | |
| "loss": 6.5185, | |
| "step": 460000 | |
| }, | |
| { | |
| "epoch": 0.87738037109375, | |
| "eval_accuracy": 0.041087084148727984, | |
| "eval_loss": 6.409872055053711, | |
| "eval_runtime": 238.0851, | |
| "eval_samples_per_second": 42.002, | |
| "eval_steps_per_second": 10.5, | |
| "step": 460000 | |
| }, | |
| { | |
| "epoch": 0.8783340454101562, | |
| "grad_norm": 120.41625213623047, | |
| "learning_rate": 6.083393096923829e-06, | |
| "loss": 6.5047, | |
| "step": 460500 | |
| }, | |
| { | |
| "epoch": 0.8792877197265625, | |
| "grad_norm": 269.24188232421875, | |
| "learning_rate": 6.035709381103516e-06, | |
| "loss": 6.5034, | |
| "step": 461000 | |
| }, | |
| { | |
| "epoch": 0.8802413940429688, | |
| "grad_norm": 229.2635040283203, | |
| "learning_rate": 5.988025665283203e-06, | |
| "loss": 6.4601, | |
| "step": 461500 | |
| }, | |
| { | |
| "epoch": 0.881195068359375, | |
| "grad_norm": 200.8750457763672, | |
| "learning_rate": 5.940341949462891e-06, | |
| "loss": 6.5129, | |
| "step": 462000 | |
| }, | |
| { | |
| "epoch": 0.8821487426757812, | |
| "grad_norm": 298.3540344238281, | |
| "learning_rate": 5.892658233642579e-06, | |
| "loss": 6.4728, | |
| "step": 462500 | |
| }, | |
| { | |
| "epoch": 0.8831024169921875, | |
| "grad_norm": 376.58282470703125, | |
| "learning_rate": 5.844974517822266e-06, | |
| "loss": 6.5131, | |
| "step": 463000 | |
| }, | |
| { | |
| "epoch": 0.8840560913085938, | |
| "grad_norm": 120.4054946899414, | |
| "learning_rate": 5.797290802001953e-06, | |
| "loss": 6.4966, | |
| "step": 463500 | |
| }, | |
| { | |
| "epoch": 0.885009765625, | |
| "grad_norm": 137.49420166015625, | |
| "learning_rate": 5.749607086181641e-06, | |
| "loss": 6.4923, | |
| "step": 464000 | |
| }, | |
| { | |
| "epoch": 0.8859634399414062, | |
| "grad_norm": 191.45240783691406, | |
| "learning_rate": 5.7019233703613284e-06, | |
| "loss": 6.4865, | |
| "step": 464500 | |
| }, | |
| { | |
| "epoch": 0.8869171142578125, | |
| "grad_norm": 105.69901275634766, | |
| "learning_rate": 5.654239654541016e-06, | |
| "loss": 6.4622, | |
| "step": 465000 | |
| }, | |
| { | |
| "epoch": 0.8869171142578125, | |
| "eval_accuracy": 0.040835812133072406, | |
| "eval_loss": 6.4109673500061035, | |
| "eval_runtime": 240.5013, | |
| "eval_samples_per_second": 41.58, | |
| "eval_steps_per_second": 10.395, | |
| "step": 465000 | |
| }, | |
| { | |
| "epoch": 0.8878707885742188, | |
| "grad_norm": 155.20513916015625, | |
| "learning_rate": 5.606555938720704e-06, | |
| "loss": 6.4755, | |
| "step": 465500 | |
| }, | |
| { | |
| "epoch": 0.888824462890625, | |
| "grad_norm": 138.47914123535156, | |
| "learning_rate": 5.558872222900391e-06, | |
| "loss": 6.4916, | |
| "step": 466000 | |
| }, | |
| { | |
| "epoch": 0.8897781372070312, | |
| "grad_norm": 183.78927612304688, | |
| "learning_rate": 5.511188507080078e-06, | |
| "loss": 6.4476, | |
| "step": 466500 | |
| }, | |
| { | |
| "epoch": 0.8907318115234375, | |
| "grad_norm": 213.38343811035156, | |
| "learning_rate": 5.463504791259766e-06, | |
| "loss": 6.468, | |
| "step": 467000 | |
| }, | |
| { | |
| "epoch": 0.8916854858398438, | |
| "grad_norm": 323.8433532714844, | |
| "learning_rate": 5.415821075439454e-06, | |
| "loss": 6.4651, | |
| "step": 467500 | |
| }, | |
| { | |
| "epoch": 0.89263916015625, | |
| "grad_norm": 478.55859375, | |
| "learning_rate": 5.368137359619141e-06, | |
| "loss": 6.4735, | |
| "step": 468000 | |
| }, | |
| { | |
| "epoch": 0.8935928344726562, | |
| "grad_norm": 130.722412109375, | |
| "learning_rate": 5.320453643798828e-06, | |
| "loss": 6.4699, | |
| "step": 468500 | |
| }, | |
| { | |
| "epoch": 0.8945465087890625, | |
| "grad_norm": 394.5163269042969, | |
| "learning_rate": 5.272769927978516e-06, | |
| "loss": 6.4711, | |
| "step": 469000 | |
| }, | |
| { | |
| "epoch": 0.8955001831054688, | |
| "grad_norm": 146.37515258789062, | |
| "learning_rate": 5.2250862121582034e-06, | |
| "loss": 6.4345, | |
| "step": 469500 | |
| }, | |
| { | |
| "epoch": 0.896453857421875, | |
| "grad_norm": 201.2168426513672, | |
| "learning_rate": 5.177402496337891e-06, | |
| "loss": 6.4632, | |
| "step": 470000 | |
| }, | |
| { | |
| "epoch": 0.896453857421875, | |
| "eval_accuracy": 0.040519569471624266, | |
| "eval_loss": 6.408752918243408, | |
| "eval_runtime": 239.9438, | |
| "eval_samples_per_second": 41.676, | |
| "eval_steps_per_second": 10.419, | |
| "step": 470000 | |
| }, | |
| { | |
| "epoch": 0.8974075317382812, | |
| "grad_norm": 111.96930694580078, | |
| "learning_rate": 5.129718780517579e-06, | |
| "loss": 6.4518, | |
| "step": 470500 | |
| }, | |
| { | |
| "epoch": 0.8983612060546875, | |
| "grad_norm": 94.77079010009766, | |
| "learning_rate": 5.082035064697266e-06, | |
| "loss": 6.5325, | |
| "step": 471000 | |
| }, | |
| { | |
| "epoch": 0.8993148803710938, | |
| "grad_norm": 210.18495178222656, | |
| "learning_rate": 5.034351348876953e-06, | |
| "loss": 6.5485, | |
| "step": 471500 | |
| }, | |
| { | |
| "epoch": 0.9002685546875, | |
| "grad_norm": 206.44248962402344, | |
| "learning_rate": 4.986667633056641e-06, | |
| "loss": 6.515, | |
| "step": 472000 | |
| }, | |
| { | |
| "epoch": 0.9012222290039062, | |
| "grad_norm": 157.20700073242188, | |
| "learning_rate": 4.938983917236329e-06, | |
| "loss": 6.4947, | |
| "step": 472500 | |
| }, | |
| { | |
| "epoch": 0.9021759033203125, | |
| "grad_norm": 218.9573974609375, | |
| "learning_rate": 4.891300201416016e-06, | |
| "loss": 6.467, | |
| "step": 473000 | |
| }, | |
| { | |
| "epoch": 0.9031295776367188, | |
| "grad_norm": 158.2908477783203, | |
| "learning_rate": 4.843616485595703e-06, | |
| "loss": 6.4824, | |
| "step": 473500 | |
| }, | |
| { | |
| "epoch": 0.904083251953125, | |
| "grad_norm": 219.37716674804688, | |
| "learning_rate": 4.795932769775391e-06, | |
| "loss": 6.5119, | |
| "step": 474000 | |
| }, | |
| { | |
| "epoch": 0.9050369262695312, | |
| "grad_norm": 200.3717803955078, | |
| "learning_rate": 4.7482490539550784e-06, | |
| "loss": 6.4907, | |
| "step": 474500 | |
| }, | |
| { | |
| "epoch": 0.9059906005859375, | |
| "grad_norm": 203.56930541992188, | |
| "learning_rate": 4.700565338134766e-06, | |
| "loss": 6.5168, | |
| "step": 475000 | |
| }, | |
| { | |
| "epoch": 0.9059906005859375, | |
| "eval_accuracy": 0.04081624266144814, | |
| "eval_loss": 6.405585765838623, | |
| "eval_runtime": 237.2887, | |
| "eval_samples_per_second": 42.143, | |
| "eval_steps_per_second": 10.536, | |
| "step": 475000 | |
| }, | |
| { | |
| "epoch": 0.9069442749023438, | |
| "grad_norm": 318.3104248046875, | |
| "learning_rate": 4.652881622314453e-06, | |
| "loss": 6.5018, | |
| "step": 475500 | |
| }, | |
| { | |
| "epoch": 0.90789794921875, | |
| "grad_norm": 182.2261962890625, | |
| "learning_rate": 4.605197906494141e-06, | |
| "loss": 6.5036, | |
| "step": 476000 | |
| }, | |
| { | |
| "epoch": 0.9088516235351562, | |
| "grad_norm": 209.1435089111328, | |
| "learning_rate": 4.557514190673828e-06, | |
| "loss": 6.4778, | |
| "step": 476500 | |
| }, | |
| { | |
| "epoch": 0.9098052978515625, | |
| "grad_norm": 351.99151611328125, | |
| "learning_rate": 4.509830474853516e-06, | |
| "loss": 6.489, | |
| "step": 477000 | |
| }, | |
| { | |
| "epoch": 0.9107589721679688, | |
| "grad_norm": 115.95133972167969, | |
| "learning_rate": 4.462146759033204e-06, | |
| "loss": 6.4559, | |
| "step": 477500 | |
| }, | |
| { | |
| "epoch": 0.911712646484375, | |
| "grad_norm": 107.39529418945312, | |
| "learning_rate": 4.4144630432128904e-06, | |
| "loss": 6.5008, | |
| "step": 478000 | |
| }, | |
| { | |
| "epoch": 0.9126663208007812, | |
| "grad_norm": 186.85157775878906, | |
| "learning_rate": 4.366779327392578e-06, | |
| "loss": 6.4971, | |
| "step": 478500 | |
| }, | |
| { | |
| "epoch": 0.9136199951171875, | |
| "grad_norm": 247.09622192382812, | |
| "learning_rate": 4.319095611572266e-06, | |
| "loss": 6.5137, | |
| "step": 479000 | |
| }, | |
| { | |
| "epoch": 0.9145736694335938, | |
| "grad_norm": 54.6359977722168, | |
| "learning_rate": 4.2714118957519534e-06, | |
| "loss": 6.5132, | |
| "step": 479500 | |
| }, | |
| { | |
| "epoch": 0.91552734375, | |
| "grad_norm": 187.11416625976562, | |
| "learning_rate": 4.223728179931641e-06, | |
| "loss": 6.4607, | |
| "step": 480000 | |
| }, | |
| { | |
| "epoch": 0.91552734375, | |
| "eval_accuracy": 0.04039393346379648, | |
| "eval_loss": 6.399998664855957, | |
| "eval_runtime": 238.4674, | |
| "eval_samples_per_second": 41.934, | |
| "eval_steps_per_second": 10.484, | |
| "step": 480000 | |
| }, | |
| { | |
| "epoch": 0.9164810180664062, | |
| "grad_norm": 233.7747039794922, | |
| "learning_rate": 4.176044464111328e-06, | |
| "loss": 6.4675, | |
| "step": 480500 | |
| }, | |
| { | |
| "epoch": 0.9174346923828125, | |
| "grad_norm": 244.14247131347656, | |
| "learning_rate": 4.128360748291016e-06, | |
| "loss": 6.4788, | |
| "step": 481000 | |
| }, | |
| { | |
| "epoch": 0.9183883666992188, | |
| "grad_norm": 151.67935180664062, | |
| "learning_rate": 4.080677032470703e-06, | |
| "loss": 6.4817, | |
| "step": 481500 | |
| }, | |
| { | |
| "epoch": 0.919342041015625, | |
| "grad_norm": 113.63164520263672, | |
| "learning_rate": 4.032993316650391e-06, | |
| "loss": 6.4787, | |
| "step": 482000 | |
| }, | |
| { | |
| "epoch": 0.9202957153320312, | |
| "grad_norm": 368.08343505859375, | |
| "learning_rate": 3.985309600830079e-06, | |
| "loss": 6.5057, | |
| "step": 482500 | |
| }, | |
| { | |
| "epoch": 0.9212493896484375, | |
| "grad_norm": 159.99847412109375, | |
| "learning_rate": 3.9376258850097654e-06, | |
| "loss": 6.4912, | |
| "step": 483000 | |
| }, | |
| { | |
| "epoch": 0.9222030639648438, | |
| "grad_norm": 249.7469024658203, | |
| "learning_rate": 3.889942169189453e-06, | |
| "loss": 6.4585, | |
| "step": 483500 | |
| }, | |
| { | |
| "epoch": 0.92315673828125, | |
| "grad_norm": 156.4268798828125, | |
| "learning_rate": 3.842258453369141e-06, | |
| "loss": 6.5154, | |
| "step": 484000 | |
| }, | |
| { | |
| "epoch": 0.9241104125976562, | |
| "grad_norm": 930.0221557617188, | |
| "learning_rate": 3.7945747375488284e-06, | |
| "loss": 6.4562, | |
| "step": 484500 | |
| }, | |
| { | |
| "epoch": 0.9250640869140625, | |
| "grad_norm": 184.3603057861328, | |
| "learning_rate": 3.7468910217285157e-06, | |
| "loss": 6.4444, | |
| "step": 485000 | |
| }, | |
| { | |
| "epoch": 0.9250640869140625, | |
| "eval_accuracy": 0.04043659491193738, | |
| "eval_loss": 6.402373313903809, | |
| "eval_runtime": 239.0466, | |
| "eval_samples_per_second": 41.833, | |
| "eval_steps_per_second": 10.458, | |
| "step": 485000 | |
| }, | |
| { | |
| "epoch": 0.9260177612304688, | |
| "grad_norm": 206.1715087890625, | |
| "learning_rate": 3.6992073059082034e-06, | |
| "loss": 6.4102, | |
| "step": 485500 | |
| }, | |
| { | |
| "epoch": 0.926971435546875, | |
| "grad_norm": 146.68963623046875, | |
| "learning_rate": 3.6515235900878906e-06, | |
| "loss": 6.4127, | |
| "step": 486000 | |
| }, | |
| { | |
| "epoch": 0.9279251098632812, | |
| "grad_norm": 89.89360046386719, | |
| "learning_rate": 3.6038398742675783e-06, | |
| "loss": 6.4257, | |
| "step": 486500 | |
| }, | |
| { | |
| "epoch": 0.9288787841796875, | |
| "grad_norm": 242.57437133789062, | |
| "learning_rate": 3.556156158447266e-06, | |
| "loss": 6.4693, | |
| "step": 487000 | |
| }, | |
| { | |
| "epoch": 0.9298324584960938, | |
| "grad_norm": 125.63064575195312, | |
| "learning_rate": 3.508472442626953e-06, | |
| "loss": 6.4649, | |
| "step": 487500 | |
| }, | |
| { | |
| "epoch": 0.9307861328125, | |
| "grad_norm": 683.224853515625, | |
| "learning_rate": 3.460788726806641e-06, | |
| "loss": 6.5341, | |
| "step": 488000 | |
| }, | |
| { | |
| "epoch": 0.9317398071289062, | |
| "grad_norm": 167.0951690673828, | |
| "learning_rate": 3.413105010986328e-06, | |
| "loss": 6.5514, | |
| "step": 488500 | |
| }, | |
| { | |
| "epoch": 0.9326934814453125, | |
| "grad_norm": 298.4920959472656, | |
| "learning_rate": 3.3654212951660158e-06, | |
| "loss": 6.4831, | |
| "step": 489000 | |
| }, | |
| { | |
| "epoch": 0.9336471557617188, | |
| "grad_norm": 268.6081237792969, | |
| "learning_rate": 3.3177375793457034e-06, | |
| "loss": 6.5185, | |
| "step": 489500 | |
| }, | |
| { | |
| "epoch": 0.934600830078125, | |
| "grad_norm": 134.19676208496094, | |
| "learning_rate": 3.2700538635253907e-06, | |
| "loss": 6.5035, | |
| "step": 490000 | |
| }, | |
| { | |
| "epoch": 0.934600830078125, | |
| "eval_accuracy": 0.04039667318982387, | |
| "eval_loss": 6.399056434631348, | |
| "eval_runtime": 247.0746, | |
| "eval_samples_per_second": 40.474, | |
| "eval_steps_per_second": 10.118, | |
| "step": 490000 | |
| }, | |
| { | |
| "epoch": 0.9355545043945312, | |
| "grad_norm": 496.64947509765625, | |
| "learning_rate": 3.2223701477050784e-06, | |
| "loss": 6.5038, | |
| "step": 490500 | |
| }, | |
| { | |
| "epoch": 0.9365081787109375, | |
| "grad_norm": 254.47743225097656, | |
| "learning_rate": 3.1746864318847656e-06, | |
| "loss": 6.5193, | |
| "step": 491000 | |
| }, | |
| { | |
| "epoch": 0.9374618530273438, | |
| "grad_norm": 567.7495727539062, | |
| "learning_rate": 3.1270027160644533e-06, | |
| "loss": 6.5031, | |
| "step": 491500 | |
| }, | |
| { | |
| "epoch": 0.93841552734375, | |
| "grad_norm": 133.81227111816406, | |
| "learning_rate": 3.079319000244141e-06, | |
| "loss": 6.4954, | |
| "step": 492000 | |
| }, | |
| { | |
| "epoch": 0.9393692016601562, | |
| "grad_norm": 515.5515747070312, | |
| "learning_rate": 3.031635284423828e-06, | |
| "loss": 6.6057, | |
| "step": 492500 | |
| }, | |
| { | |
| "epoch": 0.9403228759765625, | |
| "grad_norm": 133.3535919189453, | |
| "learning_rate": 2.983951568603516e-06, | |
| "loss": 6.4989, | |
| "step": 493000 | |
| }, | |
| { | |
| "epoch": 0.9412765502929688, | |
| "grad_norm": 206.1186065673828, | |
| "learning_rate": 2.936267852783203e-06, | |
| "loss": 6.4909, | |
| "step": 493500 | |
| }, | |
| { | |
| "epoch": 0.942230224609375, | |
| "grad_norm": 235.87135314941406, | |
| "learning_rate": 2.8885841369628908e-06, | |
| "loss": 6.4921, | |
| "step": 494000 | |
| }, | |
| { | |
| "epoch": 0.9431838989257812, | |
| "grad_norm": 92.96924591064453, | |
| "learning_rate": 2.8409004211425784e-06, | |
| "loss": 6.5005, | |
| "step": 494500 | |
| }, | |
| { | |
| "epoch": 0.9441375732421875, | |
| "grad_norm": 170.0372772216797, | |
| "learning_rate": 2.7932167053222657e-06, | |
| "loss": 6.4861, | |
| "step": 495000 | |
| }, | |
| { | |
| "epoch": 0.9441375732421875, | |
| "eval_accuracy": 0.04069197651663405, | |
| "eval_loss": 6.399594306945801, | |
| "eval_runtime": 236.9558, | |
| "eval_samples_per_second": 42.202, | |
| "eval_steps_per_second": 10.55, | |
| "step": 495000 | |
| }, | |
| { | |
| "epoch": 0.9450912475585938, | |
| "grad_norm": 138.89389038085938, | |
| "learning_rate": 2.7455329895019534e-06, | |
| "loss": 6.4904, | |
| "step": 495500 | |
| }, | |
| { | |
| "epoch": 0.946044921875, | |
| "grad_norm": 428.46466064453125, | |
| "learning_rate": 2.6978492736816406e-06, | |
| "loss": 6.4434, | |
| "step": 496000 | |
| }, | |
| { | |
| "epoch": 0.9469985961914062, | |
| "grad_norm": 94.90145874023438, | |
| "learning_rate": 2.6501655578613283e-06, | |
| "loss": 6.4063, | |
| "step": 496500 | |
| }, | |
| { | |
| "epoch": 0.9479522705078125, | |
| "grad_norm": 215.63514709472656, | |
| "learning_rate": 2.602481842041016e-06, | |
| "loss": 6.4792, | |
| "step": 497000 | |
| }, | |
| { | |
| "epoch": 0.9489059448242188, | |
| "grad_norm": 110.41034698486328, | |
| "learning_rate": 2.554798126220703e-06, | |
| "loss": 6.4369, | |
| "step": 497500 | |
| }, | |
| { | |
| "epoch": 0.949859619140625, | |
| "grad_norm": 89.65414428710938, | |
| "learning_rate": 2.507114410400391e-06, | |
| "loss": 6.4895, | |
| "step": 498000 | |
| }, | |
| { | |
| "epoch": 0.9508132934570312, | |
| "grad_norm": 531.5847778320312, | |
| "learning_rate": 2.459430694580078e-06, | |
| "loss": 6.4937, | |
| "step": 498500 | |
| }, | |
| { | |
| "epoch": 0.9517669677734375, | |
| "grad_norm": 341.2304992675781, | |
| "learning_rate": 2.4117469787597658e-06, | |
| "loss": 6.4776, | |
| "step": 499000 | |
| }, | |
| { | |
| "epoch": 0.9527206420898438, | |
| "grad_norm": 198.3251495361328, | |
| "learning_rate": 2.3640632629394534e-06, | |
| "loss": 6.4901, | |
| "step": 499500 | |
| }, | |
| { | |
| "epoch": 0.95367431640625, | |
| "grad_norm": 275.1803894042969, | |
| "learning_rate": 2.3163795471191407e-06, | |
| "loss": 6.4772, | |
| "step": 500000 | |
| }, | |
| { | |
| "epoch": 0.95367431640625, | |
| "eval_accuracy": 0.04034637964774951, | |
| "eval_loss": 6.396467208862305, | |
| "eval_runtime": 236.7401, | |
| "eval_samples_per_second": 42.24, | |
| "eval_steps_per_second": 10.56, | |
| "step": 500000 | |
| }, | |
| { | |
| "epoch": 1.0009536743164062, | |
| "grad_norm": 660.521484375, | |
| "learning_rate": 2.2686958312988284e-06, | |
| "loss": 6.4742, | |
| "step": 500500 | |
| }, | |
| { | |
| "epoch": 1.0019073486328125, | |
| "grad_norm": 61.67089080810547, | |
| "learning_rate": 2.2210121154785156e-06, | |
| "loss": 6.4717, | |
| "step": 501000 | |
| }, | |
| { | |
| "epoch": 1.0028610229492188, | |
| "grad_norm": 443.39886474609375, | |
| "learning_rate": 2.1733283996582033e-06, | |
| "loss": 6.4642, | |
| "step": 501500 | |
| }, | |
| { | |
| "epoch": 1.003814697265625, | |
| "grad_norm": 176.8584442138672, | |
| "learning_rate": 2.125644683837891e-06, | |
| "loss": 6.4951, | |
| "step": 502000 | |
| }, | |
| { | |
| "epoch": 1.0047683715820312, | |
| "grad_norm": 161.3272247314453, | |
| "learning_rate": 2.077960968017578e-06, | |
| "loss": 6.4766, | |
| "step": 502500 | |
| }, | |
| { | |
| "epoch": 1.0057220458984375, | |
| "grad_norm": 335.29766845703125, | |
| "learning_rate": 2.030277252197266e-06, | |
| "loss": 6.499, | |
| "step": 503000 | |
| }, | |
| { | |
| "epoch": 1.0066757202148438, | |
| "grad_norm": 1061.899169921875, | |
| "learning_rate": 1.982593536376953e-06, | |
| "loss": 6.4728, | |
| "step": 503500 | |
| }, | |
| { | |
| "epoch": 1.00762939453125, | |
| "grad_norm": 812.8785400390625, | |
| "learning_rate": 1.9349098205566408e-06, | |
| "loss": 6.4709, | |
| "step": 504000 | |
| }, | |
| { | |
| "epoch": 1.0085830688476562, | |
| "grad_norm": 449.0191345214844, | |
| "learning_rate": 1.8872261047363282e-06, | |
| "loss": 6.4303, | |
| "step": 504500 | |
| }, | |
| { | |
| "epoch": 1.0095367431640625, | |
| "grad_norm": 87.57962799072266, | |
| "learning_rate": 1.8395423889160157e-06, | |
| "loss": 6.4699, | |
| "step": 505000 | |
| }, | |
| { | |
| "epoch": 1.0095367431640625, | |
| "eval_accuracy": 0.04081682974559687, | |
| "eval_loss": 6.394299507141113, | |
| "eval_runtime": 237.5756, | |
| "eval_samples_per_second": 42.092, | |
| "eval_steps_per_second": 10.523, | |
| "step": 505000 | |
| }, | |
| { | |
| "epoch": 1.0104904174804688, | |
| "grad_norm": 650.8753662109375, | |
| "learning_rate": 1.7918586730957031e-06, | |
| "loss": 6.4638, | |
| "step": 505500 | |
| }, | |
| { | |
| "epoch": 1.011444091796875, | |
| "grad_norm": 344.9217834472656, | |
| "learning_rate": 1.7441749572753908e-06, | |
| "loss": 6.4907, | |
| "step": 506000 | |
| }, | |
| { | |
| "epoch": 1.0123977661132812, | |
| "grad_norm": 264.13104248046875, | |
| "learning_rate": 1.6964912414550783e-06, | |
| "loss": 6.4645, | |
| "step": 506500 | |
| }, | |
| { | |
| "epoch": 1.0133514404296875, | |
| "grad_norm": 110.09146881103516, | |
| "learning_rate": 1.6488075256347657e-06, | |
| "loss": 6.4765, | |
| "step": 507000 | |
| }, | |
| { | |
| "epoch": 1.0143051147460938, | |
| "grad_norm": 451.4542236328125, | |
| "learning_rate": 1.6011238098144532e-06, | |
| "loss": 6.4731, | |
| "step": 507500 | |
| }, | |
| { | |
| "epoch": 1.0152587890625, | |
| "grad_norm": 105.0625, | |
| "learning_rate": 1.5534400939941406e-06, | |
| "loss": 6.4823, | |
| "step": 508000 | |
| }, | |
| { | |
| "epoch": 1.0162124633789062, | |
| "grad_norm": 144.6260528564453, | |
| "learning_rate": 1.505756378173828e-06, | |
| "loss": 6.4399, | |
| "step": 508500 | |
| }, | |
| { | |
| "epoch": 1.0171661376953125, | |
| "grad_norm": 164.07302856445312, | |
| "learning_rate": 1.4580726623535158e-06, | |
| "loss": 6.4199, | |
| "step": 509000 | |
| }, | |
| { | |
| "epoch": 1.0181198120117188, | |
| "grad_norm": 532.625244140625, | |
| "learning_rate": 1.4103889465332032e-06, | |
| "loss": 6.4347, | |
| "step": 509500 | |
| }, | |
| { | |
| "epoch": 1.019073486328125, | |
| "grad_norm": 121.664306640625, | |
| "learning_rate": 1.3627052307128907e-06, | |
| "loss": 6.4423, | |
| "step": 510000 | |
| }, | |
| { | |
| "epoch": 1.019073486328125, | |
| "eval_accuracy": 0.04074931506849315, | |
| "eval_loss": 6.395938873291016, | |
| "eval_runtime": 238.1708, | |
| "eval_samples_per_second": 41.987, | |
| "eval_steps_per_second": 10.497, | |
| "step": 510000 | |
| }, | |
| { | |
| "epoch": 1.0200271606445312, | |
| "grad_norm": 144.7412567138672, | |
| "learning_rate": 1.3150215148925781e-06, | |
| "loss": 6.4167, | |
| "step": 510500 | |
| }, | |
| { | |
| "epoch": 1.0209808349609375, | |
| "grad_norm": 163.46463012695312, | |
| "learning_rate": 1.2673377990722656e-06, | |
| "loss": 6.4411, | |
| "step": 511000 | |
| }, | |
| { | |
| "epoch": 1.0219345092773438, | |
| "grad_norm": 165.60867309570312, | |
| "learning_rate": 1.2196540832519533e-06, | |
| "loss": 6.4644, | |
| "step": 511500 | |
| }, | |
| { | |
| "epoch": 1.02288818359375, | |
| "grad_norm": 142.94607543945312, | |
| "learning_rate": 1.1719703674316407e-06, | |
| "loss": 6.5151, | |
| "step": 512000 | |
| }, | |
| { | |
| "epoch": 1.0238418579101562, | |
| "grad_norm": 203.33934020996094, | |
| "learning_rate": 1.1242866516113282e-06, | |
| "loss": 6.4719, | |
| "step": 512500 | |
| }, | |
| { | |
| "epoch": 1.0247955322265625, | |
| "grad_norm": 142.940185546875, | |
| "learning_rate": 1.0766029357910156e-06, | |
| "loss": 6.5046, | |
| "step": 513000 | |
| }, | |
| { | |
| "epoch": 1.0257492065429688, | |
| "grad_norm": 69.12759399414062, | |
| "learning_rate": 1.028919219970703e-06, | |
| "loss": 6.4209, | |
| "step": 513500 | |
| }, | |
| { | |
| "epoch": 1.026702880859375, | |
| "grad_norm": 334.0024719238281, | |
| "learning_rate": 9.812355041503908e-07, | |
| "loss": 6.4513, | |
| "step": 514000 | |
| }, | |
| { | |
| "epoch": 1.0276565551757812, | |
| "grad_norm": 188.01004028320312, | |
| "learning_rate": 9.335517883300781e-07, | |
| "loss": 6.4816, | |
| "step": 514500 | |
| }, | |
| { | |
| "epoch": 1.0286102294921875, | |
| "grad_norm": 247.91525268554688, | |
| "learning_rate": 8.858680725097657e-07, | |
| "loss": 6.4724, | |
| "step": 515000 | |
| }, | |
| { | |
| "epoch": 1.0286102294921875, | |
| "eval_accuracy": 0.04097964774951076, | |
| "eval_loss": 6.3929643630981445, | |
| "eval_runtime": 239.731, | |
| "eval_samples_per_second": 41.713, | |
| "eval_steps_per_second": 10.428, | |
| "step": 515000 | |
| }, | |
| { | |
| "epoch": 1.0295639038085938, | |
| "grad_norm": 147.8994598388672, | |
| "learning_rate": 8.381843566894531e-07, | |
| "loss": 6.4946, | |
| "step": 515500 | |
| }, | |
| { | |
| "epoch": 1.030517578125, | |
| "grad_norm": 105.9547119140625, | |
| "learning_rate": 7.905006408691407e-07, | |
| "loss": 6.4546, | |
| "step": 516000 | |
| }, | |
| { | |
| "epoch": 1.0314712524414062, | |
| "grad_norm": 259.9665832519531, | |
| "learning_rate": 7.428169250488282e-07, | |
| "loss": 6.457, | |
| "step": 516500 | |
| }, | |
| { | |
| "epoch": 1.0324249267578125, | |
| "grad_norm": 144.90777587890625, | |
| "learning_rate": 6.951332092285156e-07, | |
| "loss": 6.4264, | |
| "step": 517000 | |
| }, | |
| { | |
| "epoch": 1.0333786010742188, | |
| "grad_norm": 185.05178833007812, | |
| "learning_rate": 6.474494934082032e-07, | |
| "loss": 6.4623, | |
| "step": 517500 | |
| }, | |
| { | |
| "epoch": 1.034332275390625, | |
| "grad_norm": 211.78102111816406, | |
| "learning_rate": 5.997657775878906e-07, | |
| "loss": 6.5132, | |
| "step": 518000 | |
| }, | |
| { | |
| "epoch": 1.0352859497070312, | |
| "grad_norm": 111.77483367919922, | |
| "learning_rate": 5.520820617675782e-07, | |
| "loss": 6.4667, | |
| "step": 518500 | |
| }, | |
| { | |
| "epoch": 1.0362396240234375, | |
| "grad_norm": 118.17922973632812, | |
| "learning_rate": 5.043983459472657e-07, | |
| "loss": 6.4682, | |
| "step": 519000 | |
| }, | |
| { | |
| "epoch": 1.0371932983398438, | |
| "grad_norm": 141.0331268310547, | |
| "learning_rate": 4.5671463012695317e-07, | |
| "loss": 6.4657, | |
| "step": 519500 | |
| }, | |
| { | |
| "epoch": 1.03814697265625, | |
| "grad_norm": 388.150390625, | |
| "learning_rate": 4.0903091430664063e-07, | |
| "loss": 6.4827, | |
| "step": 520000 | |
| }, | |
| { | |
| "epoch": 1.03814697265625, | |
| "eval_accuracy": 0.04092270058708415, | |
| "eval_loss": 6.391010284423828, | |
| "eval_runtime": 236.8403, | |
| "eval_samples_per_second": 42.223, | |
| "eval_steps_per_second": 10.556, | |
| "step": 520000 | |
| }, | |
| { | |
| "epoch": 1.0391006469726562, | |
| "grad_norm": 198.49420166015625, | |
| "learning_rate": 3.6134719848632814e-07, | |
| "loss": 6.4965, | |
| "step": 520500 | |
| }, | |
| { | |
| "epoch": 1.0400543212890625, | |
| "grad_norm": 214.90264892578125, | |
| "learning_rate": 3.1366348266601565e-07, | |
| "loss": 6.5029, | |
| "step": 521000 | |
| }, | |
| { | |
| "epoch": 1.0410079956054688, | |
| "grad_norm": 111.66552734375, | |
| "learning_rate": 2.6597976684570316e-07, | |
| "loss": 6.4834, | |
| "step": 521500 | |
| }, | |
| { | |
| "epoch": 1.041961669921875, | |
| "grad_norm": 145.68179321289062, | |
| "learning_rate": 2.1829605102539064e-07, | |
| "loss": 6.4821, | |
| "step": 522000 | |
| }, | |
| { | |
| "epoch": 1.0429153442382812, | |
| "grad_norm": 174.4090576171875, | |
| "learning_rate": 1.7061233520507813e-07, | |
| "loss": 6.4684, | |
| "step": 522500 | |
| }, | |
| { | |
| "epoch": 1.0438690185546875, | |
| "grad_norm": 138.6009521484375, | |
| "learning_rate": 1.2292861938476564e-07, | |
| "loss": 6.4652, | |
| "step": 523000 | |
| }, | |
| { | |
| "epoch": 1.0448226928710938, | |
| "grad_norm": 112.79365539550781, | |
| "learning_rate": 7.524490356445312e-08, | |
| "loss": 6.4511, | |
| "step": 523500 | |
| }, | |
| { | |
| "epoch": 1.0457763671875, | |
| "grad_norm": 297.8994445800781, | |
| "learning_rate": 2.7561187744140627e-08, | |
| "loss": 6.4939, | |
| "step": 524000 | |
| }, | |
| { | |
| "epoch": 1.04632568359375, | |
| "step": 524288, | |
| "total_flos": 5.045399375119909e+18, | |
| "train_loss": 6.577412648592144, | |
| "train_runtime": 154506.2809, | |
| "train_samples_per_second": 13.573, | |
| "train_steps_per_second": 3.393 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 524288, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 9223372036854775807, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5.045399375119909e+18, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
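
The parse error reported at the top of this dump comes from a strict JSON parser (JavaScript's `JSON.parse`, for example) rejecting the bare `Infinity` literal emitted for a non-finite `"grad_norm"` value earlier in `log_history`; strict JSON has no representation for infinities or NaN. Python's `json` module accepts `Infinity` and `NaN` by default, so the state is still recoverable. Below is a minimal sketch, assuming the dump is saved as `trainer_state.json` (the input and output file names are assumptions, not part of this dump): it loads the state tolerantly, nulls out non-finite floats, writes back a strictly valid copy, and pulls out the training and eval loss curves.

```python
import json
import math

def sanitize(obj):
    """Recursively replace non-finite floats (inf, -inf, nan) with None,
    since strict JSON cannot represent them."""
    if isinstance(obj, float) and not math.isfinite(obj):
        return None
    if isinstance(obj, dict):
        return {key: sanitize(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [sanitize(value) for value in obj]
    return obj

# json.load tolerates the bare Infinity literal that broke the strict parser.
with open("trainer_state.json") as f:  # assumed file name
    state = json.load(f)

# allow_nan=False guarantees the cleaned copy is strictly valid JSON.
with open("trainer_state.clean.json", "w") as f:  # assumed output name
    json.dump(sanitize(state), f, indent=2, allow_nan=False)

# log_history interleaves training records (with "loss") and eval records
# (with "eval_loss"); the final summary entry carries "train_loss" instead
# and is excluded by both filters.
train_curve = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
eval_curve = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]
print(f"{len(train_curve)} training points, {len(eval_curve)} eval points")
```

Replacing non-finite values with `null` is one choice among several; a string sentinel such as `"Infinity"` would also round-trip through strict parsers, at the cost of mixing types in the `grad_norm` field.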