| { | |
| "best_metric": 1.6068978309631348, | |
| "best_model_checkpoint": "/home/nlplab5/Desktop/roberta-pretrain/ckpt/roberta/pretrain/medium/checkpoint-12420", | |
| "epoch": 9.049218813243746, | |
| "eval_steps": 90, | |
| "global_step": 12420, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 3.2670376300811768, | |
| "learning_rate": 4.166666666666667e-06, | |
| "loss": 7.2579, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 3.1587345600128174, | |
| "learning_rate": 8.333333333333334e-06, | |
| "loss": 7.2077, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 2.9462554454803467, | |
| "learning_rate": 1.25e-05, | |
| "loss": 7.1099, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 2.7092673778533936, | |
| "learning_rate": 1.6666666666666667e-05, | |
| "loss": 6.9866, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 2.605360269546509, | |
| "learning_rate": 2.0833333333333333e-05, | |
| "loss": 6.87, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 2.5747287273406982, | |
| "learning_rate": 2.5e-05, | |
| "loss": 6.7736, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 2.551903247833252, | |
| "learning_rate": 2.9166666666666666e-05, | |
| "loss": 6.6903, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 2.5231289863586426, | |
| "learning_rate": 3.3333333333333335e-05, | |
| "loss": 6.6151, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 2.503075361251831, | |
| "learning_rate": 3.75e-05, | |
| "loss": 6.5462, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "eval_accuracy": 0.2242063046599867, | |
| "eval_loss": 6.500818729400635, | |
| "eval_runtime": 1083.9947, | |
| "eval_samples_per_second": 460.679, | |
| "eval_steps_per_second": 2.399, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 2.4953837394714355, | |
| "learning_rate": 4.1666666666666665e-05, | |
| "loss": 6.4802, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 2.4765946865081787, | |
| "learning_rate": 4.5833333333333334e-05, | |
| "loss": 6.4127, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 2.4805634021759033, | |
| "learning_rate": 5e-05, | |
| "loss": 6.3425, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 2.4720654487609863, | |
| "learning_rate": 5.416666666666667e-05, | |
| "loss": 6.2706, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 2.454899787902832, | |
| "learning_rate": 5.833333333333333e-05, | |
| "loss": 6.1941, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 2.4357142448425293, | |
| "learning_rate": 6.25e-05, | |
| "loss": 6.1169, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 2.4193003177642822, | |
| "learning_rate": 6.666666666666667e-05, | |
| "loss": 6.0351, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 2.396649122238159, | |
| "learning_rate": 7.083333333333334e-05, | |
| "loss": 5.9532, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 2.3566408157348633, | |
| "learning_rate": 7.5e-05, | |
| "loss": 5.8695, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "eval_accuracy": 0.22404351306840548, | |
| "eval_loss": 5.819457530975342, | |
| "eval_runtime": 1077.6772, | |
| "eval_samples_per_second": 463.38, | |
| "eval_steps_per_second": 2.414, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 2.310816526412964, | |
| "learning_rate": 7.916666666666666e-05, | |
| "loss": 5.7876, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 2.2797272205352783, | |
| "learning_rate": 8.333333333333333e-05, | |
| "loss": 5.706, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 2.2188355922698975, | |
| "learning_rate": 8.75e-05, | |
| "loss": 5.6255, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 2.142122745513916, | |
| "learning_rate": 9.166666666666667e-05, | |
| "loss": 5.5471, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 2.069880485534668, | |
| "learning_rate": 9.583333333333334e-05, | |
| "loss": 5.4704, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 1.957664966583252, | |
| "learning_rate": 0.0001, | |
| "loss": 5.4003, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 1.8441264629364014, | |
| "learning_rate": 0.00010416666666666667, | |
| "loss": 5.3306, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 1.722961664199829, | |
| "learning_rate": 0.00010833333333333334, | |
| "loss": 5.2648, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 1.5622942447662354, | |
| "learning_rate": 0.00011250000000000001, | |
| "loss": 5.2004, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "eval_accuracy": 0.22578753539560809, | |
| "eval_loss": 5.162991046905518, | |
| "eval_runtime": 1079.7321, | |
| "eval_samples_per_second": 462.498, | |
| "eval_steps_per_second": 2.409, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 1.4034879207611084, | |
| "learning_rate": 0.00011666666666666667, | |
| "loss": 5.144, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 1.2136635780334473, | |
| "learning_rate": 0.00012083333333333333, | |
| "loss": 5.087, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 0.9789605140686035, | |
| "learning_rate": 0.000125, | |
| "loss": 5.0347, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 0.7709304094314575, | |
| "learning_rate": 0.00012916666666666667, | |
| "loss": 4.9873, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 0.5693560838699341, | |
| "learning_rate": 0.00013333333333333334, | |
| "loss": 4.95, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 0.42085811495780945, | |
| "learning_rate": 0.0001375, | |
| "loss": 4.9181, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 0.3126681447029114, | |
| "learning_rate": 0.00014166666666666668, | |
| "loss": 4.8959, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 0.24236658215522766, | |
| "learning_rate": 0.00014583333333333335, | |
| "loss": 4.876, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 0.23320983350276947, | |
| "learning_rate": 0.00015, | |
| "loss": 4.8607, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "eval_accuracy": 0.2620490039433166, | |
| "eval_loss": 4.841182231903076, | |
| "eval_runtime": 1079.5667, | |
| "eval_samples_per_second": 462.569, | |
| "eval_steps_per_second": 2.409, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 0.23191139101982117, | |
| "learning_rate": 0.00015416666666666668, | |
| "loss": 4.846, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.2323000133037567, | |
| "learning_rate": 0.00015833333333333332, | |
| "loss": 4.8304, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 0.2213001251220703, | |
| "learning_rate": 0.00016250000000000002, | |
| "loss": 4.816, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 0.21700094640254974, | |
| "learning_rate": 0.00016666666666666666, | |
| "loss": 4.8044, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 0.27367648482322693, | |
| "learning_rate": 0.00017083333333333333, | |
| "loss": 4.787, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 0.3331514000892639, | |
| "learning_rate": 0.000175, | |
| "loss": 4.775, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 0.3531811833381653, | |
| "learning_rate": 0.00017916666666666667, | |
| "loss": 4.7599, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 0.17425844073295593, | |
| "learning_rate": 0.00018333333333333334, | |
| "loss": 4.7471, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 0.16081774234771729, | |
| "learning_rate": 0.0001875, | |
| "loss": 4.732, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "eval_accuracy": 0.2854636107287403, | |
| "eval_loss": 4.713276386260986, | |
| "eval_runtime": 1135.1993, | |
| "eval_samples_per_second": 439.9, | |
| "eval_steps_per_second": 2.291, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 0.5644450783729553, | |
| "learning_rate": 0.00019166666666666667, | |
| "loss": 4.7196, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.4822804629802704, | |
| "learning_rate": 0.00019583333333333334, | |
| "loss": 4.7067, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 0.26140686869621277, | |
| "learning_rate": 0.0002, | |
| "loss": 4.6939, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 0.24341146647930145, | |
| "learning_rate": 0.00020416666666666668, | |
| "loss": 4.6797, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 0.1883888840675354, | |
| "learning_rate": 0.00020833333333333335, | |
| "loss": 4.667, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 0.32793405652046204, | |
| "learning_rate": 0.0002125, | |
| "loss": 4.6568, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 0.6900771856307983, | |
| "learning_rate": 0.00021666666666666668, | |
| "loss": 4.6504, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 0.23155897855758667, | |
| "learning_rate": 0.00022083333333333333, | |
| "loss": 4.6371, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 0.15708310902118683, | |
| "learning_rate": 0.00022500000000000002, | |
| "loss": 4.6273, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "eval_accuracy": 0.2875459051329938, | |
| "eval_loss": 4.6114654541015625, | |
| "eval_runtime": 1075.2001, | |
| "eval_samples_per_second": 464.447, | |
| "eval_steps_per_second": 2.419, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 0.14138343930244446, | |
| "learning_rate": 0.00022916666666666666, | |
| "loss": 4.6187, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.10016636550426483, | |
| "learning_rate": 0.00023333333333333333, | |
| "loss": 4.6101, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.09886801242828369, | |
| "learning_rate": 0.0002375, | |
| "loss": 4.6027, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 0.9051061272621155, | |
| "learning_rate": 0.00024166666666666667, | |
| "loss": 4.5993, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.8797232508659363, | |
| "learning_rate": 0.0002458333333333333, | |
| "loss": 4.5982, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 0.4441538453102112, | |
| "learning_rate": 0.00025, | |
| "loss": 4.5868, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 0.24725468456745148, | |
| "learning_rate": 0.00025416666666666665, | |
| "loss": 4.5836, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 0.08581159263849258, | |
| "learning_rate": 0.00025833333333333334, | |
| "loss": 4.5773, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 0.10642833262681961, | |
| "learning_rate": 0.00026250000000000004, | |
| "loss": 4.572, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "eval_accuracy": 0.28878817310094446, | |
| "eval_loss": 4.563485622406006, | |
| "eval_runtime": 1076.0022, | |
| "eval_samples_per_second": 464.101, | |
| "eval_steps_per_second": 2.417, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 0.17463397979736328, | |
| "learning_rate": 0.0002666666666666667, | |
| "loss": 4.5707, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 0.1806878000497818, | |
| "learning_rate": 0.0002708333333333333, | |
| "loss": 4.5681, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.18553458154201508, | |
| "learning_rate": 0.000275, | |
| "loss": 4.5612, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 0.7338638305664062, | |
| "learning_rate": 0.00027916666666666666, | |
| "loss": 4.5601, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 0.3852124512195587, | |
| "learning_rate": 0.00028333333333333335, | |
| "loss": 4.5567, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 0.2558722198009491, | |
| "learning_rate": 0.0002875, | |
| "loss": 4.555, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 0.1996091902256012, | |
| "learning_rate": 0.0002916666666666667, | |
| "loss": 4.553, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.414126992225647, | |
| "learning_rate": 0.00029583333333333333, | |
| "loss": 4.551, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 0.6828728914260864, | |
| "learning_rate": 0.0003, | |
| "loss": 4.5485, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "eval_accuracy": 0.2897431184302517, | |
| "eval_loss": 4.544471263885498, | |
| "eval_runtime": 1075.524, | |
| "eval_samples_per_second": 464.308, | |
| "eval_steps_per_second": 2.418, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 0.31978073716163635, | |
| "learning_rate": 0.00030416666666666667, | |
| "loss": 4.5478, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 0.23724275827407837, | |
| "learning_rate": 0.00030833333333333337, | |
| "loss": 4.5451, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 1.0507104396820068, | |
| "learning_rate": 0.0003125, | |
| "loss": 4.5441, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 0.38554638624191284, | |
| "learning_rate": 0.00031666666666666665, | |
| "loss": 4.5433, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 0.275704562664032, | |
| "learning_rate": 0.00032083333333333334, | |
| "loss": 4.5415, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 0.2305123656988144, | |
| "learning_rate": 0.00032500000000000004, | |
| "loss": 4.5398, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 0.4813285768032074, | |
| "learning_rate": 0.0003291666666666667, | |
| "loss": 4.5391, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.6520434617996216, | |
| "learning_rate": 0.0003333333333333333, | |
| "loss": 4.5361, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 0.3875904381275177, | |
| "learning_rate": 0.0003375, | |
| "loss": 4.5351, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "eval_accuracy": 0.29011102141892453, | |
| "eval_loss": 4.53138542175293, | |
| "eval_runtime": 1074.6899, | |
| "eval_samples_per_second": 464.668, | |
| "eval_steps_per_second": 2.42, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.6874526143074036, | |
| "learning_rate": 0.00034166666666666666, | |
| "loss": 4.5338, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 0.4180966019630432, | |
| "learning_rate": 0.00034583333333333335, | |
| "loss": 4.5331, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 0.41761451959609985, | |
| "learning_rate": 0.00035, | |
| "loss": 4.5291, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.6088799834251404, | |
| "learning_rate": 0.0003541666666666667, | |
| "loss": 4.5303, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 0.20438095927238464, | |
| "learning_rate": 0.00035833333333333333, | |
| "loss": 4.5298, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 0.4336546063423157, | |
| "learning_rate": 0.0003625, | |
| "loss": 4.5283, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 0.40454909205436707, | |
| "learning_rate": 0.00036666666666666667, | |
| "loss": 4.5301, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.4893989562988281, | |
| "learning_rate": 0.00037083333333333337, | |
| "loss": 4.5286, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.4546484053134918, | |
| "learning_rate": 0.000375, | |
| "loss": 4.5263, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "eval_accuracy": 0.2903379261149848, | |
| "eval_loss": 4.5242390632629395, | |
| "eval_runtime": 1074.5238, | |
| "eval_samples_per_second": 464.74, | |
| "eval_steps_per_second": 2.421, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 0.22772909700870514, | |
| "learning_rate": 0.00037916666666666665, | |
| "loss": 4.5251, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 0.48488083481788635, | |
| "learning_rate": 0.00038333333333333334, | |
| "loss": 4.524, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 0.21915870904922485, | |
| "learning_rate": 0.00038750000000000004, | |
| "loss": 4.5276, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.4013586938381195, | |
| "learning_rate": 0.0003916666666666667, | |
| "loss": 4.5263, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 0.6083785891532898, | |
| "learning_rate": 0.0003958333333333333, | |
| "loss": 4.5226, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 0.5477403402328491, | |
| "learning_rate": 0.0004, | |
| "loss": 4.5223, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 0.25517192482948303, | |
| "learning_rate": 0.00040416666666666666, | |
| "loss": 4.5213, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 0.5681092739105225, | |
| "learning_rate": 0.00040833333333333336, | |
| "loss": 4.5213, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 0.27979689836502075, | |
| "learning_rate": 0.0004125, | |
| "loss": 4.5223, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "eval_accuracy": 0.2904179370099399, | |
| "eval_loss": 4.5171709060668945, | |
| "eval_runtime": 1074.3659, | |
| "eval_samples_per_second": 464.808, | |
| "eval_steps_per_second": 2.421, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 0.278210312128067, | |
| "learning_rate": 0.0004166666666666667, | |
| "loss": 4.5185, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 0.31496554613113403, | |
| "learning_rate": 0.00042083333333333333, | |
| "loss": 4.5184, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 0.28795501589775085, | |
| "learning_rate": 0.000425, | |
| "loss": 4.5202, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.1585451066493988, | |
| "learning_rate": 0.00042916666666666667, | |
| "loss": 4.5177, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.25573596358299255, | |
| "learning_rate": 0.00043333333333333337, | |
| "loss": 4.5157, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 0.9261253476142883, | |
| "learning_rate": 0.0004375, | |
| "loss": 4.5155, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 0.20488545298576355, | |
| "learning_rate": 0.00044166666666666665, | |
| "loss": 4.5171, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 0.19982470571994781, | |
| "learning_rate": 0.00044583333333333335, | |
| "loss": 4.5157, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 0.2195570170879364, | |
| "learning_rate": 0.00045000000000000004, | |
| "loss": 4.511, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "eval_accuracy": 0.29042484814802466, | |
| "eval_loss": 4.511170387268066, | |
| "eval_runtime": 1074.2406, | |
| "eval_samples_per_second": 464.862, | |
| "eval_steps_per_second": 2.421, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 0.5774135589599609, | |
| "learning_rate": 0.0004541666666666667, | |
| "loss": 4.5159, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 0.3652968108654022, | |
| "learning_rate": 0.0004583333333333333, | |
| "loss": 4.5152, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 0.4962700307369232, | |
| "learning_rate": 0.0004625, | |
| "loss": 4.5139, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 0.3226447105407715, | |
| "learning_rate": 0.00046666666666666666, | |
| "loss": 4.5117, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.5751166939735413, | |
| "learning_rate": 0.00047083333333333336, | |
| "loss": 4.5131, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 0.181748166680336, | |
| "learning_rate": 0.000475, | |
| "loss": 4.5136, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 0.6175718903541565, | |
| "learning_rate": 0.0004791666666666667, | |
| "loss": 4.5107, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 0.261405348777771, | |
| "learning_rate": 0.00048333333333333334, | |
| "loss": 4.5124, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 0.18674370646476746, | |
| "learning_rate": 0.0004875, | |
| "loss": 4.5093, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "eval_accuracy": 0.2903912366894582, | |
| "eval_loss": 4.507014274597168, | |
| "eval_runtime": 1075.0378, | |
| "eval_samples_per_second": 464.518, | |
| "eval_steps_per_second": 2.419, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.4493379294872284, | |
| "learning_rate": 0.0004916666666666666, | |
| "loss": 4.5107, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 1.01, | |
| "grad_norm": 0.40603315830230713, | |
| "learning_rate": 0.0004958333333333334, | |
| "loss": 4.5103, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "grad_norm": 0.17475590109825134, | |
| "learning_rate": 0.0005, | |
| "loss": 4.507, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.03, | |
| "grad_norm": 0.6670963168144226, | |
| "learning_rate": 0.0005041666666666667, | |
| "loss": 4.5076, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 0.20023925602436066, | |
| "learning_rate": 0.0005083333333333333, | |
| "loss": 4.5092, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "grad_norm": 0.38801464438438416, | |
| "learning_rate": 0.0005124999999999999, | |
| "loss": 4.5047, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "grad_norm": 0.4637294113636017, | |
| "learning_rate": 0.0005166666666666667, | |
| "loss": 4.5088, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 1.06, | |
| "grad_norm": 0.17603175342082977, | |
| "learning_rate": 0.0005208333333333334, | |
| "loss": 4.5057, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 1.07, | |
| "grad_norm": 0.4553210735321045, | |
| "learning_rate": 0.0005250000000000001, | |
| "loss": 4.505, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 1.07, | |
| "eval_accuracy": 0.29054962247372956, | |
| "eval_loss": 4.502260208129883, | |
| "eval_runtime": 1075.7242, | |
| "eval_samples_per_second": 464.221, | |
| "eval_steps_per_second": 2.418, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 0.30805012583732605, | |
| "learning_rate": 0.0005291666666666667, | |
| "loss": 4.5032, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 1.09, | |
| "grad_norm": 0.5969117283821106, | |
| "learning_rate": 0.0005333333333333334, | |
| "loss": 4.5036, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 0.24606676399707794, | |
| "learning_rate": 0.0005375, | |
| "loss": 4.5042, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 0.2750067710876465, | |
| "learning_rate": 0.0005416666666666666, | |
| "loss": 4.5037, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.11, | |
| "grad_norm": 0.4421214163303375, | |
| "learning_rate": 0.0005458333333333333, | |
| "loss": 4.5014, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 0.2441830188035965, | |
| "learning_rate": 0.00055, | |
| "loss": 4.5005, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 1.13, | |
| "grad_norm": 0.25598272681236267, | |
| "learning_rate": 0.0005541666666666667, | |
| "loss": 4.5007, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 1.14, | |
| "grad_norm": 0.17499062418937683, | |
| "learning_rate": 0.0005583333333333333, | |
| "loss": 4.5031, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "grad_norm": 0.6325914263725281, | |
| "learning_rate": 0.0005625000000000001, | |
| "loss": 4.5003, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "eval_accuracy": 0.2903214025778754, | |
| "eval_loss": 4.499546051025391, | |
| "eval_runtime": 1074.5865, | |
| "eval_samples_per_second": 464.713, | |
| "eval_steps_per_second": 2.42, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 0.29858532547950745, | |
| "learning_rate": 0.0005666666666666667, | |
| "loss": 4.5014, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 0.3625228703022003, | |
| "learning_rate": 0.0005708333333333333, | |
| "loss": 4.4991, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 1.17, | |
| "grad_norm": 0.271508127450943, | |
| "learning_rate": 0.000575, | |
| "loss": 4.499, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 1.18, | |
| "grad_norm": 0.7316662073135376, | |
| "learning_rate": 0.0005791666666666667, | |
| "loss": 4.498, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 1.19, | |
| "grad_norm": 0.3072379529476166, | |
| "learning_rate": 0.0005833333333333334, | |
| "loss": 4.5021, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 0.16316668689250946, | |
| "learning_rate": 0.0005875, | |
| "loss": 4.4963, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 1.21, | |
| "grad_norm": 0.6301301717758179, | |
| "learning_rate": 0.0005916666666666667, | |
| "loss": 4.4978, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "grad_norm": 0.22797346115112305, | |
| "learning_rate": 0.0005958333333333333, | |
| "loss": 4.4964, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "grad_norm": 0.3899094760417938, | |
| "learning_rate": 0.0006, | |
| "loss": 4.4939, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "eval_accuracy": 0.29044920062027546, | |
| "eval_loss": 4.489974021911621, | |
| "eval_runtime": 1075.4503, | |
| "eval_samples_per_second": 464.339, | |
| "eval_steps_per_second": 2.419, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "grad_norm": 0.34080126881599426, | |
| "learning_rate": 0.0006041666666666666, | |
| "loss": 4.493, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "grad_norm": 1.12690007686615, | |
| "learning_rate": 0.0006083333333333333, | |
| "loss": 4.4904, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 0.30394747853279114, | |
| "learning_rate": 0.0006125000000000001, | |
| "loss": 4.489, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 1.26, | |
| "grad_norm": 0.4542248845100403, | |
| "learning_rate": 0.0006166666666666667, | |
| "loss": 4.4841, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "grad_norm": 0.3733484447002411, | |
| "learning_rate": 0.0006208333333333334, | |
| "loss": 4.4738, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "grad_norm": 0.31465083360671997, | |
| "learning_rate": 0.000625, | |
| "loss": 4.4695, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 0.32257241010665894, | |
| "learning_rate": 0.0006291666666666667, | |
| "loss": 4.461, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 1.29, | |
| "grad_norm": 0.7750576734542847, | |
| "learning_rate": 0.0006333333333333333, | |
| "loss": 4.4636, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": 0.35094720125198364, | |
| "learning_rate": 0.0006374999999999999, | |
| "loss": 4.4569, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "eval_accuracy": 0.2906712538997569, | |
| "eval_loss": 4.445650100708008, | |
| "eval_runtime": 1075.3046, | |
| "eval_samples_per_second": 464.402, | |
| "eval_steps_per_second": 2.419, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "grad_norm": 0.5662222504615784, | |
| "learning_rate": 0.0006416666666666667, | |
| "loss": 4.4542, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "grad_norm": 0.6326726675033569, | |
| "learning_rate": 0.0006458333333333334, | |
| "loss": 4.4509, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 1.33, | |
| "grad_norm": 0.4773523807525635, | |
| "learning_rate": 0.0006500000000000001, | |
| "loss": 4.4468, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 1.33, | |
| "grad_norm": 0.4514019191265106, | |
| "learning_rate": 0.0006541666666666667, | |
| "loss": 4.442, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 1.34, | |
| "grad_norm": 0.5631856918334961, | |
| "learning_rate": 0.0006583333333333334, | |
| "loss": 4.44, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "grad_norm": 0.36760690808296204, | |
| "learning_rate": 0.0006625, | |
| "loss": 4.4317, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 1.36, | |
| "grad_norm": 0.669217586517334, | |
| "learning_rate": 0.0006666666666666666, | |
| "loss": 4.4365, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.37, | |
| "grad_norm": 0.4648076891899109, | |
| "learning_rate": 0.0006708333333333333, | |
| "loss": 4.4277, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "grad_norm": 0.45093855261802673, | |
| "learning_rate": 0.000675, | |
| "loss": 4.4215, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "eval_accuracy": 0.29071258845839876, | |
| "eval_loss": 4.40159797668457, | |
| "eval_runtime": 1073.4339, | |
| "eval_samples_per_second": 465.212, | |
| "eval_steps_per_second": 2.423, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 1.39, | |
| "grad_norm": 0.6909874081611633, | |
| "learning_rate": 0.0006791666666666667, | |
| "loss": 4.4243, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 1.39, | |
| "grad_norm": 0.5092808604240417, | |
| "learning_rate": 0.0006833333333333333, | |
| "loss": 4.4162, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 0.7031335234642029, | |
| "learning_rate": 0.0006875, | |
| "loss": 4.409, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 1.41, | |
| "grad_norm": 0.778090238571167, | |
| "learning_rate": 0.0006916666666666667, | |
| "loss": 4.4091, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 1.42, | |
| "grad_norm": 0.935316801071167, | |
| "learning_rate": 0.0006958333333333334, | |
| "loss": 4.4059, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 1.43, | |
| "grad_norm": 0.9648371934890747, | |
| "learning_rate": 0.0007, | |
| "loss": 4.4033, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 0.524691641330719, | |
| "learning_rate": 0.0007041666666666667, | |
| "loss": 4.4012, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 0.5595187544822693, | |
| "learning_rate": 0.0007083333333333334, | |
| "loss": 4.3954, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "grad_norm": 0.5809574723243713, | |
| "learning_rate": 0.0007125, | |
| "loss": 4.3943, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "eval_accuracy": 0.29116029691107925, | |
| "eval_loss": 4.3684492111206055, | |
| "eval_runtime": 1080.2368, | |
| "eval_samples_per_second": 462.282, | |
| "eval_steps_per_second": 2.408, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "grad_norm": 0.4484635293483734, | |
| "learning_rate": 0.0007166666666666667, | |
| "loss": 4.3881, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "grad_norm": 0.6823798418045044, | |
| "learning_rate": 0.0007208333333333333, | |
| "loss": 4.3829, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 0.7428690791130066, | |
| "learning_rate": 0.000725, | |
| "loss": 4.3843, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 1.49, | |
| "grad_norm": 0.6587589979171753, | |
| "learning_rate": 0.0007291666666666666, | |
| "loss": 4.3828, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 0.5860837697982788, | |
| "learning_rate": 0.0007333333333333333, | |
| "loss": 4.3764, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 0.5413070321083069, | |
| "learning_rate": 0.0007375000000000001, | |
| "loss": 4.3752, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 1.51, | |
| "grad_norm": 0.665489137172699, | |
| "learning_rate": 0.0007416666666666667, | |
| "loss": 4.3728, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 0.711599588394165, | |
| "learning_rate": 0.0007458333333333334, | |
| "loss": 4.373, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 1.53, | |
| "grad_norm": 0.6164100170135498, | |
| "learning_rate": 0.00075, | |
| "loss": 4.3677, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.53, | |
| "eval_accuracy": 0.29116748362162354, | |
| "eval_loss": 4.338656902313232, | |
| "eval_runtime": 1078.2447, | |
| "eval_samples_per_second": 463.136, | |
| "eval_steps_per_second": 2.412, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 0.8174536228179932, | |
| "learning_rate": 0.0007541666666666667, | |
| "loss": 4.3622, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 1.55, | |
| "grad_norm": 0.4686708152294159, | |
| "learning_rate": 0.0007583333333333333, | |
| "loss": 4.3615, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 0.726311206817627, | |
| "learning_rate": 0.0007624999999999999, | |
| "loss": 4.3553, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 0.6094339489936829, | |
| "learning_rate": 0.0007666666666666667, | |
| "loss": 4.3584, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 1.57, | |
| "grad_norm": 0.43092453479766846, | |
| "learning_rate": 0.0007708333333333334, | |
| "loss": 4.3515, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 1.58, | |
| "grad_norm": 0.8314465284347534, | |
| "learning_rate": 0.0007750000000000001, | |
| "loss": 4.3503, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 1.59, | |
| "grad_norm": 0.9535554647445679, | |
| "learning_rate": 0.0007791666666666667, | |
| "loss": 4.3459, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 0.9487770795822144, | |
| "learning_rate": 0.0007833333333333334, | |
| "loss": 4.3438, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 1.61, | |
| "grad_norm": 0.65323406457901, | |
| "learning_rate": 0.0007875, | |
| "loss": 4.3382, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 1.61, | |
| "eval_accuracy": 0.2915768463978657, | |
| "eval_loss": 4.299588203430176, | |
| "eval_runtime": 1078.9106, | |
| "eval_samples_per_second": 462.85, | |
| "eval_steps_per_second": 2.411, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 1.61, | |
| "grad_norm": 0.8234522342681885, | |
| "learning_rate": 0.0007916666666666666, | |
| "loss": 4.337, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.62, | |
| "grad_norm": 0.984524667263031, | |
| "learning_rate": 0.0007958333333333333, | |
| "loss": 4.3342, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 1.63, | |
| "grad_norm": 1.087571382522583, | |
| "learning_rate": 0.0008, | |
| "loss": 4.3327, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 1.64, | |
| "grad_norm": 0.8180701732635498, | |
| "learning_rate": 0.0008041666666666667, | |
| "loss": 4.3292, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "grad_norm": 1.209524154663086, | |
| "learning_rate": 0.0008083333333333333, | |
| "loss": 4.3258, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 1.66, | |
| "grad_norm": 1.2440215349197388, | |
| "learning_rate": 0.0008125000000000001, | |
| "loss": 4.322, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "grad_norm": 1.1065999269485474, | |
| "learning_rate": 0.0008166666666666667, | |
| "loss": 4.3207, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "grad_norm": 2.2142093181610107, | |
| "learning_rate": 0.0008208333333333334, | |
| "loss": 4.3183, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 1.68, | |
| "grad_norm": 0.8045121431350708, | |
| "learning_rate": 0.000825, | |
| "loss": 4.3216, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 1.68, | |
| "eval_accuracy": 0.2919771311011085, | |
| "eval_loss": 4.267116069793701, | |
| "eval_runtime": 1079.6166, | |
| "eval_samples_per_second": 462.548, | |
| "eval_steps_per_second": 2.409, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "grad_norm": 0.8329472541809082, | |
| "learning_rate": 0.0008291666666666667, | |
| "loss": 4.3102, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "grad_norm": 1.114058256149292, | |
| "learning_rate": 0.0008333333333333334, | |
| "loss": 4.3138, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "grad_norm": 0.989930272102356, | |
| "learning_rate": 0.0008375, | |
| "loss": 4.3068, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "grad_norm": 0.9290440082550049, | |
| "learning_rate": 0.0008416666666666667, | |
| "loss": 4.3016, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 1.6841094493865967, | |
| "learning_rate": 0.0008458333333333333, | |
| "loss": 4.3031, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 1.49, | |
| "grad_norm": 1.0473177433013916, | |
| "learning_rate": 0.00085, | |
| "loss": 4.3079, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 1.49, | |
| "grad_norm": 1.1396197080612183, | |
| "learning_rate": 0.0008541666666666666, | |
| "loss": 4.2986, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 1.2318438291549683, | |
| "learning_rate": 0.0008583333333333333, | |
| "loss": 4.3005, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 1.51, | |
| "grad_norm": 1.749923825263977, | |
| "learning_rate": 0.0008625000000000001, | |
| "loss": 4.2879, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 1.51, | |
| "eval_accuracy": 0.2931822363307358, | |
| "eval_loss": 4.231507778167725, | |
| "eval_runtime": 1089.4507, | |
| "eval_samples_per_second": 458.378, | |
| "eval_steps_per_second": 2.047, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 0.9980252385139465, | |
| "learning_rate": 0.0008666666666666667, | |
| "loss": 4.2821, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 1.5635493993759155, | |
| "learning_rate": 0.0008708333333333334, | |
| "loss": 4.29, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 1.53, | |
| "grad_norm": 1.6463395357131958, | |
| "learning_rate": 0.000875, | |
| "loss": 4.2856, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 1.3602315187454224, | |
| "learning_rate": 0.0008791666666666667, | |
| "loss": 4.2802, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 0.8902882933616638, | |
| "learning_rate": 0.0008833333333333333, | |
| "loss": 4.2729, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 1.55, | |
| "grad_norm": 1.186219573020935, | |
| "learning_rate": 0.0008874999999999999, | |
| "loss": 4.2691, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 1.0059683322906494, | |
| "learning_rate": 0.0008916666666666667, | |
| "loss": 4.26, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 1.57, | |
| "grad_norm": 1.2011739015579224, | |
| "learning_rate": 0.0008958333333333334, | |
| "loss": 4.2577, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 1.57, | |
| "grad_norm": 1.4442743062973022, | |
| "learning_rate": 0.0009000000000000001, | |
| "loss": 4.263, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 1.57, | |
| "eval_accuracy": 0.2934964665435206, | |
| "eval_loss": 4.21316385269165, | |
| "eval_runtime": 1096.5373, | |
| "eval_samples_per_second": 455.415, | |
| "eval_steps_per_second": 2.034, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 1.58, | |
| "grad_norm": 1.1617356538772583, | |
| "learning_rate": 0.0009041666666666667, | |
| "loss": 4.2661, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 1.59, | |
| "grad_norm": 1.3990079164505005, | |
| "learning_rate": 0.0009083333333333334, | |
| "loss": 4.2475, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 1.2959562540054321, | |
| "learning_rate": 0.0009125, | |
| "loss": 4.2415, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 1.0796222686767578, | |
| "learning_rate": 0.0009166666666666666, | |
| "loss": 4.2337, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 1.61, | |
| "grad_norm": 1.6943458318710327, | |
| "learning_rate": 0.0009208333333333333, | |
| "loss": 4.2281, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 1.62, | |
| "grad_norm": 1.7960783243179321, | |
| "learning_rate": 0.000925, | |
| "loss": 4.2241, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 1.63, | |
| "grad_norm": 2.086534023284912, | |
| "learning_rate": 0.0009291666666666667, | |
| "loss": 4.2269, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 1.63, | |
| "grad_norm": 1.137702226638794, | |
| "learning_rate": 0.0009333333333333333, | |
| "loss": 4.2158, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 1.64, | |
| "grad_norm": 1.1577701568603516, | |
| "learning_rate": 0.0009375, | |
| "loss": 4.2013, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 1.64, | |
| "eval_accuracy": 0.2987269750298371, | |
| "eval_loss": 4.123126029968262, | |
| "eval_runtime": 1097.6256, | |
| "eval_samples_per_second": 454.964, | |
| "eval_steps_per_second": 2.032, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "grad_norm": 1.6455570459365845, | |
| "learning_rate": 0.0009416666666666667, | |
| "loss": 4.1815, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "grad_norm": 1.6025768518447876, | |
| "learning_rate": 0.0009458333333333334, | |
| "loss": 4.1534, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 1.66, | |
| "grad_norm": 1.2223172187805176, | |
| "learning_rate": 0.00095, | |
| "loss": 4.1298, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "grad_norm": 1.766542673110962, | |
| "learning_rate": 0.0009541666666666667, | |
| "loss": 4.1187, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 1.68, | |
| "grad_norm": 2.156003952026367, | |
| "learning_rate": 0.0009583333333333334, | |
| "loss": 4.0858, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 1.68, | |
| "grad_norm": 1.9074057340621948, | |
| "learning_rate": 0.0009625, | |
| "loss": 4.0801, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 1.69, | |
| "grad_norm": 1.6140304803848267, | |
| "learning_rate": 0.0009666666666666667, | |
| "loss": 4.0383, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "grad_norm": 1.5922300815582275, | |
| "learning_rate": 0.0009708333333333333, | |
| "loss": 4.0099, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "grad_norm": 1.9714833498001099, | |
| "learning_rate": 0.000975, | |
| "loss": 3.9757, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "eval_accuracy": 0.3303083702251303, | |
| "eval_loss": 3.764934539794922, | |
| "eval_runtime": 1104.6754, | |
| "eval_samples_per_second": 452.06, | |
| "eval_steps_per_second": 2.019, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "grad_norm": 2.1198415756225586, | |
| "learning_rate": 0.0009791666666666666, | |
| "loss": 3.9507, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": 2.0731935501098633, | |
| "learning_rate": 0.0009833333333333332, | |
| "loss": 3.9258, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 1.73, | |
| "grad_norm": 2.1984808444976807, | |
| "learning_rate": 0.0009875, | |
| "loss": 3.9003, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 1.73, | |
| "grad_norm": 2.033250331878662, | |
| "learning_rate": 0.0009916666666666667, | |
| "loss": 3.8732, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 1.74, | |
| "grad_norm": 1.7183982133865356, | |
| "learning_rate": 0.0009958333333333334, | |
| "loss": 3.8557, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 2.216938018798828, | |
| "learning_rate": 0.001, | |
| "loss": 3.8376, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 2.109079599380493, | |
| "learning_rate": 0.000999009900990099, | |
| "loss": 3.8212, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 1.5402984619140625, | |
| "learning_rate": 0.0009980198019801981, | |
| "loss": 3.8, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 1.77, | |
| "grad_norm": 2.051513433456421, | |
| "learning_rate": 0.000997029702970297, | |
| "loss": 3.7913, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 1.77, | |
| "eval_accuracy": 0.35789052045361985, | |
| "eval_loss": 3.5296359062194824, | |
| "eval_runtime": 1087.9351, | |
| "eval_samples_per_second": 459.016, | |
| "eval_steps_per_second": 2.05, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 1.78, | |
| "grad_norm": 1.8306666612625122, | |
| "learning_rate": 0.000996039603960396, | |
| "loss": 3.7567, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "grad_norm": 1.9114989042282104, | |
| "learning_rate": 0.000995049504950495, | |
| "loss": 3.7491, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "grad_norm": 1.9881885051727295, | |
| "learning_rate": 0.0009940594059405941, | |
| "loss": 3.7297, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 2.2852580547332764, | |
| "learning_rate": 0.0009930693069306932, | |
| "loss": 3.7073, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "grad_norm": 2.090174913406372, | |
| "learning_rate": 0.000992079207920792, | |
| "loss": 3.6902, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "grad_norm": 2.5586419105529785, | |
| "learning_rate": 0.000991089108910891, | |
| "loss": 3.6792, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 1.82, | |
| "grad_norm": 1.9420301914215088, | |
| "learning_rate": 0.0009900990099009901, | |
| "loss": 3.6728, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 1.83, | |
| "grad_norm": 2.319821834564209, | |
| "learning_rate": 0.0009891089108910892, | |
| "loss": 3.6627, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "grad_norm": 2.134413480758667, | |
| "learning_rate": 0.0009881188118811882, | |
| "loss": 3.6435, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "eval_accuracy": 0.3799301143797497, | |
| "eval_loss": 3.3790884017944336, | |
| "eval_runtime": 1089.5448, | |
| "eval_samples_per_second": 458.338, | |
| "eval_steps_per_second": 2.047, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "grad_norm": 1.8554224967956543, | |
| "learning_rate": 0.000987128712871287, | |
| "loss": 3.6265, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 1.85, | |
| "grad_norm": 2.16987681388855, | |
| "learning_rate": 0.000986138613861386, | |
| "loss": 3.6098, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 1.86, | |
| "grad_norm": 1.9863182306289673, | |
| "learning_rate": 0.0009851485148514852, | |
| "loss": 3.5982, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "grad_norm": 2.0247480869293213, | |
| "learning_rate": 0.0009841584158415842, | |
| "loss": 3.5911, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "grad_norm": 2.2719273567199707, | |
| "learning_rate": 0.0009831683168316833, | |
| "loss": 3.5804, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "grad_norm": 2.0588369369506836, | |
| "learning_rate": 0.000982178217821782, | |
| "loss": 3.5654, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 1.89, | |
| "grad_norm": 1.9666892290115356, | |
| "learning_rate": 0.0009811881188118811, | |
| "loss": 3.558, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 1.89, | |
| "grad_norm": 1.752681016921997, | |
| "learning_rate": 0.0009801980198019802, | |
| "loss": 3.5389, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "grad_norm": 2.821775197982788, | |
| "learning_rate": 0.0009792079207920793, | |
| "loss": 3.5327, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "eval_accuracy": 0.39317171253107736, | |
| "eval_loss": 3.2741596698760986, | |
| "eval_runtime": 1086.2288, | |
| "eval_samples_per_second": 459.737, | |
| "eval_steps_per_second": 2.053, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 1.91, | |
| "grad_norm": 2.4307518005371094, | |
| "learning_rate": 0.0009782178217821783, | |
| "loss": 3.5311, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 1.8416870832443237, | |
| "learning_rate": 0.0009772277227722771, | |
| "loss": 3.5199, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 1.7294279336929321, | |
| "learning_rate": 0.0009762376237623762, | |
| "loss": 3.5067, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 1.93, | |
| "grad_norm": 2.0376105308532715, | |
| "learning_rate": 0.0009752475247524752, | |
| "loss": 3.4957, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 1.94, | |
| "grad_norm": 1.845569133758545, | |
| "learning_rate": 0.0009742574257425743, | |
| "loss": 3.4778, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 1.95, | |
| "grad_norm": 2.1370015144348145, | |
| "learning_rate": 0.0009732673267326732, | |
| "loss": 3.4766, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 1.95, | |
| "grad_norm": 2.0046229362487793, | |
| "learning_rate": 0.0009722772277227723, | |
| "loss": 3.4609, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": 1.7367238998413086, | |
| "learning_rate": 0.0009712871287128712, | |
| "loss": 3.4574, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 1.97, | |
| "grad_norm": 2.245299816131592, | |
| "learning_rate": 0.0009702970297029703, | |
| "loss": 3.4402, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 1.97, | |
| "eval_accuracy": 0.40886959953318786, | |
| "eval_loss": 3.1605701446533203, | |
| "eval_runtime": 1086.3963, | |
| "eval_samples_per_second": 459.666, | |
| "eval_steps_per_second": 2.053, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 1.97, | |
| "grad_norm": 1.6792678833007812, | |
| "learning_rate": 0.0009693069306930693, | |
| "loss": 3.4155, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 1.98, | |
| "grad_norm": 2.1290223598480225, | |
| "learning_rate": 0.0009683168316831683, | |
| "loss": 3.3953, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 1.99, | |
| "grad_norm": 1.9963873624801636, | |
| "learning_rate": 0.0009673267326732673, | |
| "loss": 3.3722, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 2.212454080581665, | |
| "learning_rate": 0.0009663366336633663, | |
| "loss": 3.3532, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 2.145552396774292, | |
| "learning_rate": 0.0009653465346534653, | |
| "loss": 3.336, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 2.01, | |
| "grad_norm": 2.423874616622925, | |
| "learning_rate": 0.0009643564356435644, | |
| "loss": 3.3111, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 2.02, | |
| "grad_norm": 2.0116701126098633, | |
| "learning_rate": 0.0009633663366336633, | |
| "loss": 3.305, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 2.03, | |
| "grad_norm": 2.243619203567505, | |
| "learning_rate": 0.0009623762376237624, | |
| "loss": 3.2854, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 2.03, | |
| "grad_norm": 1.5583114624023438, | |
| "learning_rate": 0.0009613861386138613, | |
| "loss": 3.2635, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 2.03, | |
| "eval_accuracy": 0.43169227745021366, | |
| "eval_loss": 2.9848363399505615, | |
| "eval_runtime": 1087.8763, | |
| "eval_samples_per_second": 459.041, | |
| "eval_steps_per_second": 2.05, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 2.04, | |
| "grad_norm": 2.156170606613159, | |
| "learning_rate": 0.0009603960396039604, | |
| "loss": 3.2498, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 2.05, | |
| "grad_norm": 1.917297601699829, | |
| "learning_rate": 0.0009594059405940594, | |
| "loss": 3.2343, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 2.05, | |
| "grad_norm": 1.7647627592086792, | |
| "learning_rate": 0.0009584158415841584, | |
| "loss": 3.2206, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 2.06, | |
| "grad_norm": 1.7406831979751587, | |
| "learning_rate": 0.0009574257425742574, | |
| "loss": 3.2023, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 2.07, | |
| "grad_norm": 1.721940040588379, | |
| "learning_rate": 0.0009564356435643564, | |
| "loss": 3.1896, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 1.5204572677612305, | |
| "learning_rate": 0.0009554455445544554, | |
| "loss": 3.1769, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 2.20760440826416, | |
| "learning_rate": 0.0009544554455445545, | |
| "loss": 3.1706, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 2.09, | |
| "grad_norm": 1.6796480417251587, | |
| "learning_rate": 0.0009534653465346534, | |
| "loss": 3.1552, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 2.1, | |
| "grad_norm": 2.044858455657959, | |
| "learning_rate": 0.0009524752475247525, | |
| "loss": 3.1385, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 2.1, | |
| "eval_accuracy": 0.4465053493029932, | |
| "eval_loss": 2.872570037841797, | |
| "eval_runtime": 1089.0874, | |
| "eval_samples_per_second": 458.531, | |
| "eval_steps_per_second": 2.048, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 2.11, | |
| "grad_norm": 1.3739113807678223, | |
| "learning_rate": 0.0009514851485148514, | |
| "loss": 3.1217, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 2.11, | |
| "grad_norm": 1.631298303604126, | |
| "learning_rate": 0.0009504950495049505, | |
| "loss": 3.1102, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "grad_norm": 1.8213354349136353, | |
| "learning_rate": 0.0009495049504950495, | |
| "loss": 3.092, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 2.13, | |
| "grad_norm": 1.910646915435791, | |
| "learning_rate": 0.0009485148514851485, | |
| "loss": 3.0768, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 2.14, | |
| "grad_norm": 1.6591072082519531, | |
| "learning_rate": 0.0009475247524752475, | |
| "loss": 3.0721, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 2.14, | |
| "grad_norm": 1.921587347984314, | |
| "learning_rate": 0.0009465346534653465, | |
| "loss": 3.0493, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 2.15, | |
| "grad_norm": 1.576114296913147, | |
| "learning_rate": 0.0009455445544554455, | |
| "loss": 3.0392, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "grad_norm": 1.41093909740448, | |
| "learning_rate": 0.0009445544554455446, | |
| "loss": 3.0204, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "grad_norm": 1.41178297996521, | |
| "learning_rate": 0.0009435643564356435, | |
| "loss": 3.0046, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "eval_accuracy": 0.46085574907280247, | |
| "eval_loss": 2.7754335403442383, | |
| "eval_runtime": 1088.9809, | |
| "eval_samples_per_second": 458.576, | |
| "eval_steps_per_second": 2.048, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 2.17, | |
| "grad_norm": 1.5229026079177856, | |
| "learning_rate": 0.0009425742574257426, | |
| "loss": 2.989, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 2.18, | |
| "grad_norm": 1.3193325996398926, | |
| "learning_rate": 0.0009415841584158415, | |
| "loss": 2.9764, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 2.19, | |
| "grad_norm": 1.1938610076904297, | |
| "learning_rate": 0.0009405940594059406, | |
| "loss": 2.9636, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 2.19, | |
| "grad_norm": 1.1402697563171387, | |
| "learning_rate": 0.0009396039603960396, | |
| "loss": 2.9517, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 2.2, | |
| "grad_norm": 1.4980099201202393, | |
| "learning_rate": 0.0009386138613861386, | |
| "loss": 2.9445, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 2.21, | |
| "grad_norm": 1.4591041803359985, | |
| "learning_rate": 0.0009376237623762376, | |
| "loss": 2.9317, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 2.22, | |
| "grad_norm": 1.4302833080291748, | |
| "learning_rate": 0.0009366336633663367, | |
| "loss": 2.9167, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 2.22, | |
| "grad_norm": 1.2571301460266113, | |
| "learning_rate": 0.0009356435643564357, | |
| "loss": 2.9049, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 2.23, | |
| "grad_norm": 1.2039096355438232, | |
| "learning_rate": 0.0009346534653465348, | |
| "loss": 2.8885, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 2.23, | |
| "eval_accuracy": 0.4740639726753192, | |
| "eval_loss": 2.6853535175323486, | |
| "eval_runtime": 1088.5789, | |
| "eval_samples_per_second": 458.745, | |
| "eval_steps_per_second": 2.049, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 1.1458439826965332, | |
| "learning_rate": 0.0009336633663366337, | |
| "loss": 2.8844, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 1.1883801221847534, | |
| "learning_rate": 0.0009326732673267328, | |
| "loss": 2.8777, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 2.25, | |
| "grad_norm": 1.0597162246704102, | |
| "learning_rate": 0.0009316831683168317, | |
| "loss": 2.8647, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 2.26, | |
| "grad_norm": 1.2362898588180542, | |
| "learning_rate": 0.0009306930693069308, | |
| "loss": 2.8565, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 2.27, | |
| "grad_norm": 1.588973879814148, | |
| "learning_rate": 0.0009297029702970298, | |
| "loss": 2.8443, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 2.27, | |
| "grad_norm": 1.7287636995315552, | |
| "learning_rate": 0.0009287128712871288, | |
| "loss": 2.8401, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 2.28, | |
| "grad_norm": 1.2376179695129395, | |
| "learning_rate": 0.0009277227722772278, | |
| "loss": 2.8338, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 2.29, | |
| "grad_norm": 1.1424429416656494, | |
| "learning_rate": 0.0009267326732673268, | |
| "loss": 2.8158, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 2.3, | |
| "grad_norm": 1.0561778545379639, | |
| "learning_rate": 0.0009257425742574258, | |
| "loss": 2.8086, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 2.3, | |
| "eval_accuracy": 0.4839770905517238, | |
| "eval_loss": 2.6107919216156006, | |
| "eval_runtime": 1086.2364, | |
| "eval_samples_per_second": 459.734, | |
| "eval_steps_per_second": 2.053, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 2.3, | |
| "grad_norm": 1.1775144338607788, | |
| "learning_rate": 0.0009247524752475249, | |
| "loss": 2.8002, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 2.31, | |
| "grad_norm": 1.1752933263778687, | |
| "learning_rate": 0.0009237623762376238, | |
| "loss": 2.7913, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 1.35330331325531, | |
| "learning_rate": 0.0009227722772277229, | |
| "loss": 2.7821, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 1.163878321647644, | |
| "learning_rate": 0.0009217821782178218, | |
| "loss": 2.7755, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 2.33, | |
| "grad_norm": 1.0859084129333496, | |
| "learning_rate": 0.0009207920792079209, | |
| "loss": 2.7651, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 2.34, | |
| "grad_norm": 1.1351704597473145, | |
| "learning_rate": 0.0009198019801980199, | |
| "loss": 2.7583, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 2.35, | |
| "grad_norm": 1.0613981485366821, | |
| "learning_rate": 0.0009188118811881188, | |
| "loss": 2.7482, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 2.35, | |
| "grad_norm": 1.1925811767578125, | |
| "learning_rate": 0.0009178217821782179, | |
| "loss": 2.7411, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 2.36, | |
| "grad_norm": 1.00603187084198, | |
| "learning_rate": 0.0009168316831683168, | |
| "loss": 2.7357, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 2.36, | |
| "eval_accuracy": 0.4945267646340819, | |
| "eval_loss": 2.5409770011901855, | |
| "eval_runtime": 1086.0357, | |
| "eval_samples_per_second": 459.819, | |
| "eval_steps_per_second": 2.053, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 2.37, | |
| "grad_norm": 1.0638866424560547, | |
| "learning_rate": 0.0009158415841584159, | |
| "loss": 2.7271, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 2.38, | |
| "grad_norm": 1.0579949617385864, | |
| "learning_rate": 0.000914851485148515, | |
| "loss": 2.7192, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 2.38, | |
| "grad_norm": 0.8697578310966492, | |
| "learning_rate": 0.0009138613861386139, | |
| "loss": 2.7114, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 2.39, | |
| "grad_norm": 1.1074854135513306, | |
| "learning_rate": 0.0009128712871287129, | |
| "loss": 2.7044, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 1.2240349054336548, | |
| "learning_rate": 0.0009118811881188119, | |
| "loss": 2.6981, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 1.0825715065002441, | |
| "learning_rate": 0.0009108910891089109, | |
| "loss": 2.688, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 2.41, | |
| "grad_norm": 0.9786016941070557, | |
| "learning_rate": 0.00090990099009901, | |
| "loss": 2.681, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 2.42, | |
| "grad_norm": 0.7355318665504456, | |
| "learning_rate": 0.0009089108910891089, | |
| "loss": 2.6771, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 2.43, | |
| "grad_norm": 1.0296987295150757, | |
| "learning_rate": 0.000907920792079208, | |
| "loss": 2.6714, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 2.43, | |
| "eval_accuracy": 0.5031676098849697, | |
| "eval_loss": 2.4873406887054443, | |
| "eval_runtime": 1086.6349, | |
| "eval_samples_per_second": 459.566, | |
| "eval_steps_per_second": 2.052, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 2.43, | |
| "grad_norm": 1.0019482374191284, | |
| "learning_rate": 0.0009069306930693069, | |
| "loss": 2.659, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 0.95697021484375, | |
| "learning_rate": 0.000905940594059406, | |
| "loss": 2.6534, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 2.45, | |
| "grad_norm": 0.9206619262695312, | |
| "learning_rate": 0.000904950495049505, | |
| "loss": 2.6499, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 2.46, | |
| "grad_norm": 0.9173060059547424, | |
| "learning_rate": 0.000903960396039604, | |
| "loss": 2.6436, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 2.46, | |
| "grad_norm": 0.946976900100708, | |
| "learning_rate": 0.000902970297029703, | |
| "loss": 2.6386, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 2.47, | |
| "grad_norm": 1.1060245037078857, | |
| "learning_rate": 0.000901980198019802, | |
| "loss": 2.6295, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 0.9128373861312866, | |
| "learning_rate": 0.000900990099009901, | |
| "loss": 2.6207, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 0.7775394916534424, | |
| "learning_rate": 0.0009000000000000001, | |
| "loss": 2.6152, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 2.49, | |
| "grad_norm": 1.0009465217590332, | |
| "learning_rate": 0.000899009900990099, | |
| "loss": 2.6114, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 2.49, | |
| "eval_accuracy": 0.5107387401188807, | |
| "eval_loss": 2.430750608444214, | |
| "eval_runtime": 1085.7185, | |
| "eval_samples_per_second": 459.953, | |
| "eval_steps_per_second": 2.054, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 0.9122986197471619, | |
| "learning_rate": 0.0008980198019801981, | |
| "loss": 2.6046, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 2.51, | |
| "grad_norm": 0.9263846278190613, | |
| "learning_rate": 0.000897029702970297, | |
| "loss": 2.6006, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 2.51, | |
| "grad_norm": 0.9440599083900452, | |
| "learning_rate": 0.0008960396039603961, | |
| "loss": 2.5954, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 2.52, | |
| "grad_norm": 1.0791646242141724, | |
| "learning_rate": 0.0008950495049504951, | |
| "loss": 2.5875, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 2.53, | |
| "grad_norm": 0.8801349401473999, | |
| "learning_rate": 0.0008940594059405941, | |
| "loss": 2.5805, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 2.54, | |
| "grad_norm": 0.8976075053215027, | |
| "learning_rate": 0.0008930693069306931, | |
| "loss": 2.5856, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 2.54, | |
| "grad_norm": 0.9874941110610962, | |
| "learning_rate": 0.0008920792079207921, | |
| "loss": 2.5741, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 2.55, | |
| "grad_norm": 0.8185487985610962, | |
| "learning_rate": 0.0008910891089108911, | |
| "loss": 2.5625, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 1.0372703075408936, | |
| "learning_rate": 0.0008900990099009902, | |
| "loss": 2.5622, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "eval_accuracy": 0.5173882190435195, | |
| "eval_loss": 2.3903918266296387, | |
| "eval_runtime": 1085.7537, | |
| "eval_samples_per_second": 459.939, | |
| "eval_steps_per_second": 2.054, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 0.7737933397293091, | |
| "learning_rate": 0.0008891089108910891, | |
| "loss": 2.5551, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 2.57, | |
| "grad_norm": 0.8418464660644531, | |
| "learning_rate": 0.0008881188118811882, | |
| "loss": 2.5505, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 2.58, | |
| "grad_norm": 0.9638449549674988, | |
| "learning_rate": 0.0008871287128712871, | |
| "loss": 2.5506, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 2.59, | |
| "grad_norm": 0.8606787919998169, | |
| "learning_rate": 0.0008861386138613862, | |
| "loss": 2.5446, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 2.59, | |
| "grad_norm": 0.9567099213600159, | |
| "learning_rate": 0.0008851485148514852, | |
| "loss": 2.5316, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "grad_norm": 0.9098414182662964, | |
| "learning_rate": 0.0008841584158415842, | |
| "loss": 2.5299, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 2.61, | |
| "grad_norm": 0.9305897951126099, | |
| "learning_rate": 0.0008831683168316832, | |
| "loss": 2.53, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 2.62, | |
| "grad_norm": 0.6542484164237976, | |
| "learning_rate": 0.0008821782178217822, | |
| "loss": 2.5182, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 2.62, | |
| "grad_norm": 0.8789640069007874, | |
| "learning_rate": 0.0008811881188118812, | |
| "loss": 2.5145, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 2.62, | |
| "eval_accuracy": 0.5233957340804408, | |
| "eval_loss": 2.3455302715301514, | |
| "eval_runtime": 1084.7897, | |
| "eval_samples_per_second": 460.347, | |
| "eval_steps_per_second": 2.056, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 2.63, | |
| "grad_norm": 0.7239986062049866, | |
| "learning_rate": 0.0008801980198019803, | |
| "loss": 2.5092, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "grad_norm": 0.7931397557258606, | |
| "learning_rate": 0.0008792079207920792, | |
| "loss": 2.4992, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 2.65, | |
| "grad_norm": 0.9696986675262451, | |
| "learning_rate": 0.0008782178217821783, | |
| "loss": 2.5035, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 2.65, | |
| "grad_norm": 0.7151007056236267, | |
| "learning_rate": 0.0008772277227722772, | |
| "loss": 2.4927, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 2.66, | |
| "grad_norm": 1.054768443107605, | |
| "learning_rate": 0.0008762376237623763, | |
| "loss": 2.4831, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 2.67, | |
| "grad_norm": 0.7492119669914246, | |
| "learning_rate": 0.0008752475247524753, | |
| "loss": 2.4872, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 2.67, | |
| "grad_norm": 0.6677684187889099, | |
| "learning_rate": 0.0008742574257425743, | |
| "loss": 2.4797, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 2.68, | |
| "grad_norm": 0.8682121634483337, | |
| "learning_rate": 0.0008732673267326733, | |
| "loss": 2.4747, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 2.69, | |
| "grad_norm": 0.9361952543258667, | |
| "learning_rate": 0.0008722772277227722, | |
| "loss": 2.4741, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 2.69, | |
| "eval_accuracy": 0.5298733491746213, | |
| "eval_loss": 2.301394462585449, | |
| "eval_runtime": 1087.2464, | |
| "eval_samples_per_second": 459.307, | |
| "eval_steps_per_second": 2.051, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 2.7, | |
| "grad_norm": 0.7424592971801758, | |
| "learning_rate": 0.0008712871287128713, | |
| "loss": 2.4641, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 2.7, | |
| "grad_norm": 0.7794514894485474, | |
| "learning_rate": 0.0008702970297029704, | |
| "loss": 2.4558, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 2.71, | |
| "grad_norm": 0.8285810351371765, | |
| "learning_rate": 0.0008693069306930693, | |
| "loss": 2.4541, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 2.72, | |
| "grad_norm": 0.8555008769035339, | |
| "learning_rate": 0.0008683168316831684, | |
| "loss": 2.456, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 2.73, | |
| "grad_norm": 0.7897722721099854, | |
| "learning_rate": 0.0008673267326732673, | |
| "loss": 2.4447, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 2.73, | |
| "grad_norm": 0.7043498754501343, | |
| "learning_rate": 0.0008663366336633663, | |
| "loss": 2.44, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 2.74, | |
| "grad_norm": 0.7770318984985352, | |
| "learning_rate": 0.0008653465346534654, | |
| "loss": 2.439, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 2.75, | |
| "grad_norm": 0.8721603155136108, | |
| "learning_rate": 0.0008643564356435643, | |
| "loss": 2.4363, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 2.75, | |
| "grad_norm": 0.653965413570404, | |
| "learning_rate": 0.0008633663366336634, | |
| "loss": 2.4298, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 2.75, | |
| "eval_accuracy": 0.5352807916398568, | |
| "eval_loss": 2.2673096656799316, | |
| "eval_runtime": 1086.4494, | |
| "eval_samples_per_second": 459.644, | |
| "eval_steps_per_second": 2.053, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 2.76, | |
| "grad_norm": 0.7826604247093201, | |
| "learning_rate": 0.0008623762376237623, | |
| "loss": 2.4204, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 2.77, | |
| "grad_norm": 0.6523510813713074, | |
| "learning_rate": 0.0008613861386138614, | |
| "loss": 2.4194, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 2.78, | |
| "grad_norm": 0.9408191442489624, | |
| "learning_rate": 0.0008603960396039604, | |
| "loss": 2.4135, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 2.78, | |
| "grad_norm": 0.8393464684486389, | |
| "learning_rate": 0.0008594059405940594, | |
| "loss": 2.4179, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 2.79, | |
| "grad_norm": 0.7277712821960449, | |
| "learning_rate": 0.0008584158415841584, | |
| "loss": 2.4087, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 0.6864319443702698, | |
| "learning_rate": 0.0008574257425742574, | |
| "loss": 2.405, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 2.81, | |
| "grad_norm": 0.6246985197067261, | |
| "learning_rate": 0.0008564356435643564, | |
| "loss": 2.3962, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 2.81, | |
| "grad_norm": 0.7276294231414795, | |
| "learning_rate": 0.0008554455445544555, | |
| "loss": 2.4043, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 2.82, | |
| "grad_norm": 0.7767272591590881, | |
| "learning_rate": 0.0008544554455445544, | |
| "loss": 2.3947, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 2.82, | |
| "eval_accuracy": 0.5401726205914658, | |
| "eval_loss": 2.2340025901794434, | |
| "eval_runtime": 1089.0379, | |
| "eval_samples_per_second": 458.552, | |
| "eval_steps_per_second": 2.048, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 2.83, | |
| "grad_norm": 0.717089831829071, | |
| "learning_rate": 0.0008534653465346535, | |
| "loss": 2.3934, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 2.83, | |
| "grad_norm": 0.7381496429443359, | |
| "learning_rate": 0.0008524752475247524, | |
| "loss": 2.384, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 2.84, | |
| "grad_norm": 0.8456007838249207, | |
| "learning_rate": 0.0008514851485148515, | |
| "loss": 2.3842, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 2.85, | |
| "grad_norm": 0.6353156566619873, | |
| "learning_rate": 0.0008504950495049505, | |
| "loss": 2.3774, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 2.86, | |
| "grad_norm": 0.6743925213813782, | |
| "learning_rate": 0.0008495049504950495, | |
| "loss": 2.3775, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 2.86, | |
| "grad_norm": 0.6839917898178101, | |
| "learning_rate": 0.0008485148514851485, | |
| "loss": 2.3737, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 2.87, | |
| "grad_norm": 0.6635532379150391, | |
| "learning_rate": 0.0008475247524752475, | |
| "loss": 2.3689, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 0.7584016919136047, | |
| "learning_rate": 0.0008465346534653465, | |
| "loss": 2.3623, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 2.89, | |
| "grad_norm": 0.9728506803512573, | |
| "learning_rate": 0.0008455445544554456, | |
| "loss": 2.3653, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 2.89, | |
| "eval_accuracy": 0.5444187372461048, | |
| "eval_loss": 2.210555076599121, | |
| "eval_runtime": 1087.1075, | |
| "eval_samples_per_second": 459.366, | |
| "eval_steps_per_second": 2.051, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 2.89, | |
| "grad_norm": 0.7243532538414001, | |
| "learning_rate": 0.0008445544554455445, | |
| "loss": 2.3605, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 2.9, | |
| "grad_norm": 0.5774228572845459, | |
| "learning_rate": 0.0008435643564356436, | |
| "loss": 2.3547, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 2.91, | |
| "grad_norm": 0.9098168611526489, | |
| "learning_rate": 0.0008425742574257425, | |
| "loss": 2.3507, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 2.91, | |
| "grad_norm": 0.8770572543144226, | |
| "learning_rate": 0.0008415841584158416, | |
| "loss": 2.3535, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 2.92, | |
| "grad_norm": 0.5555605292320251, | |
| "learning_rate": 0.0008405940594059406, | |
| "loss": 2.3463, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 2.93, | |
| "grad_norm": 0.8333105444908142, | |
| "learning_rate": 0.0008396039603960396, | |
| "loss": 2.3417, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 2.94, | |
| "grad_norm": 0.724617600440979, | |
| "learning_rate": 0.0008386138613861386, | |
| "loss": 2.3382, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 2.94, | |
| "grad_norm": 0.6441348791122437, | |
| "learning_rate": 0.0008376237623762376, | |
| "loss": 2.3322, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 2.95, | |
| "grad_norm": 0.7889347672462463, | |
| "learning_rate": 0.0008366336633663366, | |
| "loss": 2.332, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 2.95, | |
| "eval_accuracy": 0.5490578933317315, | |
| "eval_loss": 2.1787993907928467, | |
| "eval_runtime": 1089.6908, | |
| "eval_samples_per_second": 458.277, | |
| "eval_steps_per_second": 2.046, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "grad_norm": 0.7708555459976196, | |
| "learning_rate": 0.0008356435643564357, | |
| "loss": 2.3326, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 2.97, | |
| "grad_norm": 0.5825323462486267, | |
| "learning_rate": 0.0008346534653465346, | |
| "loss": 2.3269, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 2.97, | |
| "grad_norm": 0.6252484321594238, | |
| "learning_rate": 0.0008336633663366337, | |
| "loss": 2.3226, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 2.98, | |
| "grad_norm": 0.8860800266265869, | |
| "learning_rate": 0.0008326732673267326, | |
| "loss": 2.3213, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 2.99, | |
| "grad_norm": 0.6472296714782715, | |
| "learning_rate": 0.0008316831683168317, | |
| "loss": 2.3159, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 2.99, | |
| "grad_norm": 0.5913267731666565, | |
| "learning_rate": 0.0008306930693069307, | |
| "loss": 2.31, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 0.9591660499572754, | |
| "learning_rate": 0.0008297029702970297, | |
| "loss": 2.3105, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 3.01, | |
| "grad_norm": 0.6770280599594116, | |
| "learning_rate": 0.0008287128712871287, | |
| "loss": 2.3043, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 3.02, | |
| "grad_norm": 0.6088559031486511, | |
| "learning_rate": 0.0008277227722772277, | |
| "loss": 2.3006, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 3.02, | |
| "eval_accuracy": 0.5543035745580942, | |
| "eval_loss": 2.146070718765259, | |
| "eval_runtime": 1088.9664, | |
| "eval_samples_per_second": 458.582, | |
| "eval_steps_per_second": 2.048, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 3.02, | |
| "grad_norm": 0.8546580076217651, | |
| "learning_rate": 0.0008267326732673267, | |
| "loss": 2.3008, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 3.03, | |
| "grad_norm": 0.5891646146774292, | |
| "learning_rate": 0.0008257425742574258, | |
| "loss": 2.2996, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "grad_norm": 0.7681787610054016, | |
| "learning_rate": 0.0008247524752475247, | |
| "loss": 2.291, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 3.05, | |
| "grad_norm": 0.6504939198493958, | |
| "learning_rate": 0.0008237623762376238, | |
| "loss": 2.2908, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 3.05, | |
| "grad_norm": 0.7509806752204895, | |
| "learning_rate": 0.0008227722772277227, | |
| "loss": 2.2894, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 3.06, | |
| "grad_norm": 0.9019284844398499, | |
| "learning_rate": 0.0008217821782178218, | |
| "loss": 2.2851, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 3.07, | |
| "grad_norm": 0.5588305592536926, | |
| "learning_rate": 0.0008207920792079208, | |
| "loss": 2.284, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 3.07, | |
| "grad_norm": 0.659537672996521, | |
| "learning_rate": 0.0008198019801980197, | |
| "loss": 2.2758, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 3.08, | |
| "grad_norm": 0.6673635840415955, | |
| "learning_rate": 0.0008188118811881188, | |
| "loss": 2.2731, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 3.08, | |
| "eval_accuracy": 0.5577760802425583, | |
| "eval_loss": 2.1262881755828857, | |
| "eval_runtime": 1086.5663, | |
| "eval_samples_per_second": 459.595, | |
| "eval_steps_per_second": 2.052, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 3.09, | |
| "grad_norm": 0.7020514607429504, | |
| "learning_rate": 0.0008178217821782177, | |
| "loss": 2.2744, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 3.1, | |
| "grad_norm": 0.6836090087890625, | |
| "learning_rate": 0.0008168316831683168, | |
| "loss": 2.2726, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 3.1, | |
| "grad_norm": 0.8297485709190369, | |
| "learning_rate": 0.0008158415841584159, | |
| "loss": 2.2696, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 3.11, | |
| "grad_norm": 0.6747464537620544, | |
| "learning_rate": 0.0008148514851485148, | |
| "loss": 2.2667, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 3.12, | |
| "grad_norm": 0.6610013246536255, | |
| "learning_rate": 0.0008138613861386138, | |
| "loss": 2.261, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 3.13, | |
| "grad_norm": 0.9729331731796265, | |
| "learning_rate": 0.0008128712871287128, | |
| "loss": 2.2634, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 3.13, | |
| "grad_norm": 0.580893874168396, | |
| "learning_rate": 0.000811881188118812, | |
| "loss": 2.2643, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 3.14, | |
| "grad_norm": 0.5239897966384888, | |
| "learning_rate": 0.000810891089108911, | |
| "loss": 2.2533, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 3.15, | |
| "grad_norm": 0.9247382283210754, | |
| "learning_rate": 0.00080990099009901, | |
| "loss": 2.2544, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 3.15, | |
| "eval_accuracy": 0.5607038793304535, | |
| "eval_loss": 2.1080663204193115, | |
| "eval_runtime": 1087.2566, | |
| "eval_samples_per_second": 459.303, | |
| "eval_steps_per_second": 2.051, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 3.16, | |
| "grad_norm": 0.7254253029823303, | |
| "learning_rate": 0.000808910891089109, | |
| "loss": 2.2521, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 3.16, | |
| "grad_norm": 0.47916772961616516, | |
| "learning_rate": 0.0008079207920792079, | |
| "loss": 2.2471, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 3.17, | |
| "grad_norm": 0.8148419260978699, | |
| "learning_rate": 0.000806930693069307, | |
| "loss": 2.2479, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 3.18, | |
| "grad_norm": 0.6527644991874695, | |
| "learning_rate": 0.000805940594059406, | |
| "loss": 2.2446, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 3.18, | |
| "grad_norm": 0.7129182815551758, | |
| "learning_rate": 0.000804950495049505, | |
| "loss": 2.2382, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 3.19, | |
| "grad_norm": 0.6954285502433777, | |
| "learning_rate": 0.000803960396039604, | |
| "loss": 2.2399, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "grad_norm": 0.6172522902488708, | |
| "learning_rate": 0.000802970297029703, | |
| "loss": 2.2395, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 3.21, | |
| "grad_norm": 0.8309088349342346, | |
| "learning_rate": 0.000801980198019802, | |
| "loss": 2.2379, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 3.21, | |
| "grad_norm": 0.6792633533477783, | |
| "learning_rate": 0.0008009900990099011, | |
| "loss": 2.2364, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 3.21, | |
| "eval_accuracy": 0.5646760378560493, | |
| "eval_loss": 2.0840134620666504, | |
| "eval_runtime": 1086.4023, | |
| "eval_samples_per_second": 459.664, | |
| "eval_steps_per_second": 2.053, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 3.22, | |
| "grad_norm": 0.6008450388908386, | |
| "learning_rate": 0.0008, | |
| "loss": 2.2289, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 3.23, | |
| "grad_norm": 0.5826246738433838, | |
| "learning_rate": 0.0007990099009900991, | |
| "loss": 2.2259, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 3.24, | |
| "grad_norm": 0.6360362768173218, | |
| "learning_rate": 0.000798019801980198, | |
| "loss": 2.2262, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 3.24, | |
| "grad_norm": 0.7450495362281799, | |
| "learning_rate": 0.0007970297029702971, | |
| "loss": 2.2241, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 3.25, | |
| "grad_norm": 0.5571395754814148, | |
| "learning_rate": 0.0007960396039603961, | |
| "loss": 2.2245, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 3.26, | |
| "grad_norm": 0.622724175453186, | |
| "learning_rate": 0.0007950495049504951, | |
| "loss": 2.2183, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 3.26, | |
| "grad_norm": 0.621284544467926, | |
| "learning_rate": 0.0007940594059405941, | |
| "loss": 2.2153, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 3.27, | |
| "grad_norm": 0.9469096660614014, | |
| "learning_rate": 0.0007930693069306931, | |
| "loss": 2.2156, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 3.28, | |
| "grad_norm": 0.6318257451057434, | |
| "learning_rate": 0.0007920792079207921, | |
| "loss": 2.2143, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 3.28, | |
| "eval_accuracy": 0.5673093799211185, | |
| "eval_loss": 2.0671584606170654, | |
| "eval_runtime": 1088.5283, | |
| "eval_samples_per_second": 458.766, | |
| "eval_steps_per_second": 2.049, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 3.29, | |
| "grad_norm": 0.7229343056678772, | |
| "learning_rate": 0.0007910891089108912, | |
| "loss": 2.2111, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 3.29, | |
| "grad_norm": 0.8711042404174805, | |
| "learning_rate": 0.0007900990099009901, | |
| "loss": 2.2144, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 3.3, | |
| "grad_norm": 0.5540309548377991, | |
| "learning_rate": 0.0007891089108910892, | |
| "loss": 2.2063, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 3.31, | |
| "grad_norm": 0.6157627105712891, | |
| "learning_rate": 0.0007881188118811881, | |
| "loss": 2.2026, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 3.32, | |
| "grad_norm": 1.0260213613510132, | |
| "learning_rate": 0.0007871287128712872, | |
| "loss": 2.2107, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 3.32, | |
| "grad_norm": 0.5229135155677795, | |
| "learning_rate": 0.0007861386138613862, | |
| "loss": 2.2031, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 3.33, | |
| "grad_norm": 0.460483580827713, | |
| "learning_rate": 0.0007851485148514852, | |
| "loss": 2.1956, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 3.34, | |
| "grad_norm": 0.9283266067504883, | |
| "learning_rate": 0.0007841584158415842, | |
| "loss": 2.1976, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 3.34, | |
| "grad_norm": 0.5621626973152161, | |
| "learning_rate": 0.0007831683168316832, | |
| "loss": 2.1972, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 3.34, | |
| "eval_accuracy": 0.5702843964025183, | |
| "eval_loss": 2.049508810043335, | |
| "eval_runtime": 1086.5875, | |
| "eval_samples_per_second": 459.586, | |
| "eval_steps_per_second": 2.052, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 3.35, | |
| "grad_norm": 0.6998817324638367, | |
| "learning_rate": 0.0007821782178217822, | |
| "loss": 2.1905, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 3.36, | |
| "grad_norm": 0.6161481738090515, | |
| "learning_rate": 0.0007811881188118813, | |
| "loss": 2.1916, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 3.37, | |
| "grad_norm": 0.7690967321395874, | |
| "learning_rate": 0.0007801980198019802, | |
| "loss": 2.1904, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 3.37, | |
| "grad_norm": 0.6058160066604614, | |
| "learning_rate": 0.0007792079207920793, | |
| "loss": 2.1894, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 3.38, | |
| "grad_norm": 0.5859819054603577, | |
| "learning_rate": 0.0007782178217821782, | |
| "loss": 2.1823, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 3.39, | |
| "grad_norm": 0.7922290563583374, | |
| "learning_rate": 0.0007772277227722773, | |
| "loss": 2.1816, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 3.4, | |
| "grad_norm": 0.617785632610321, | |
| "learning_rate": 0.0007762376237623763, | |
| "loss": 2.1855, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 3.4, | |
| "grad_norm": 0.6082860827445984, | |
| "learning_rate": 0.0007752475247524753, | |
| "loss": 2.1811, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 3.41, | |
| "grad_norm": 0.44060420989990234, | |
| "learning_rate": 0.0007742574257425743, | |
| "loss": 2.1738, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 3.41, | |
| "eval_accuracy": 0.5732795510180845, | |
| "eval_loss": 2.031883955001831, | |
| "eval_runtime": 1085.1554, | |
| "eval_samples_per_second": 460.192, | |
| "eval_steps_per_second": 2.055, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 3.42, | |
| "grad_norm": 0.6842211484909058, | |
| "learning_rate": 0.0007732673267326733, | |
| "loss": 2.1746, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 3.42, | |
| "grad_norm": 0.7363536357879639, | |
| "learning_rate": 0.0007722772277227723, | |
| "loss": 2.1756, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 3.43, | |
| "grad_norm": 0.657122015953064, | |
| "learning_rate": 0.0007712871287128714, | |
| "loss": 2.1767, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 3.44, | |
| "grad_norm": 0.525112509727478, | |
| "learning_rate": 0.0007702970297029703, | |
| "loss": 2.1705, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 3.45, | |
| "grad_norm": 0.594642162322998, | |
| "learning_rate": 0.0007693069306930694, | |
| "loss": 2.167, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 3.45, | |
| "grad_norm": 0.7353718280792236, | |
| "learning_rate": 0.0007683168316831683, | |
| "loss": 2.1678, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 3.46, | |
| "grad_norm": 0.7462971806526184, | |
| "learning_rate": 0.0007673267326732674, | |
| "loss": 2.1665, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 3.47, | |
| "grad_norm": 0.543685257434845, | |
| "learning_rate": 0.0007663366336633664, | |
| "loss": 2.1624, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 3.48, | |
| "grad_norm": 0.53340744972229, | |
| "learning_rate": 0.0007653465346534654, | |
| "loss": 2.1587, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 3.48, | |
| "eval_accuracy": 0.5758243970327254, | |
| "eval_loss": 2.01580810546875, | |
| "eval_runtime": 1087.4822, | |
| "eval_samples_per_second": 459.208, | |
| "eval_steps_per_second": 2.051, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 3.48, | |
| "grad_norm": 0.819010853767395, | |
| "learning_rate": 0.0007643564356435644, | |
| "loss": 2.1623, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 3.49, | |
| "grad_norm": 0.6191548109054565, | |
| "learning_rate": 0.0007633663366336634, | |
| "loss": 2.1611, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 3.5, | |
| "grad_norm": 0.5724292993545532, | |
| "learning_rate": 0.0007623762376237624, | |
| "loss": 2.1583, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 3.5, | |
| "grad_norm": 0.6278745532035828, | |
| "learning_rate": 0.0007613861386138615, | |
| "loss": 2.1573, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 3.51, | |
| "grad_norm": 0.6978874802589417, | |
| "learning_rate": 0.0007603960396039604, | |
| "loss": 2.1523, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 3.52, | |
| "grad_norm": 0.9318163990974426, | |
| "learning_rate": 0.0007594059405940595, | |
| "loss": 2.1524, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 3.53, | |
| "grad_norm": 0.5397381782531738, | |
| "learning_rate": 0.0007584158415841584, | |
| "loss": 2.1505, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 3.53, | |
| "grad_norm": 0.6277997493743896, | |
| "learning_rate": 0.0007574257425742574, | |
| "loss": 2.15, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 3.54, | |
| "grad_norm": 0.6128600239753723, | |
| "learning_rate": 0.0007564356435643565, | |
| "loss": 2.1466, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 3.54, | |
| "eval_accuracy": 0.5778634145294884, | |
| "eval_loss": 2.0040018558502197, | |
| "eval_runtime": 1086.3606, | |
| "eval_samples_per_second": 459.682, | |
| "eval_steps_per_second": 2.053, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 3.55, | |
| "grad_norm": 0.602790892124176, | |
| "learning_rate": 0.0007554455445544554, | |
| "loss": 2.1483, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 3.56, | |
| "grad_norm": 0.5856905579566956, | |
| "learning_rate": 0.0007544554455445545, | |
| "loss": 2.1471, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 3.56, | |
| "grad_norm": 0.6018987894058228, | |
| "learning_rate": 0.0007534653465346534, | |
| "loss": 2.141, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 3.57, | |
| "grad_norm": 0.6041855216026306, | |
| "learning_rate": 0.0007524752475247525, | |
| "loss": 2.1385, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 3.58, | |
| "grad_norm": 0.580766499042511, | |
| "learning_rate": 0.0007514851485148515, | |
| "loss": 2.1375, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 3.58, | |
| "grad_norm": 0.5637401342391968, | |
| "learning_rate": 0.0007504950495049505, | |
| "loss": 2.1374, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 3.59, | |
| "grad_norm": 0.6017095446586609, | |
| "learning_rate": 0.0007495049504950495, | |
| "loss": 2.1372, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 3.6, | |
| "grad_norm": 0.766730010509491, | |
| "learning_rate": 0.0007485148514851485, | |
| "loss": 2.1336, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 3.61, | |
| "grad_norm": 0.5679196119308472, | |
| "learning_rate": 0.0007475247524752475, | |
| "loss": 2.1339, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 3.61, | |
| "eval_accuracy": 0.5801526152356328, | |
| "eval_loss": 1.9901340007781982, | |
| "eval_runtime": 1085.9898, | |
| "eval_samples_per_second": 459.839, | |
| "eval_steps_per_second": 2.053, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 3.61, | |
| "grad_norm": 0.5875471234321594, | |
| "learning_rate": 0.0007465346534653466, | |
| "loss": 2.1327, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 3.62, | |
| "grad_norm": 0.885311484336853, | |
| "learning_rate": 0.0007455445544554455, | |
| "loss": 2.1319, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 3.63, | |
| "grad_norm": 0.5480872392654419, | |
| "learning_rate": 0.0007445544554455446, | |
| "loss": 2.1283, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 3.64, | |
| "grad_norm": 0.6052006483078003, | |
| "learning_rate": 0.0007435643564356435, | |
| "loss": 2.1258, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 3.64, | |
| "grad_norm": 0.4672467112541199, | |
| "learning_rate": 0.0007425742574257426, | |
| "loss": 2.1246, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 3.65, | |
| "grad_norm": 0.7137532234191895, | |
| "learning_rate": 0.0007415841584158416, | |
| "loss": 2.1225, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 3.66, | |
| "grad_norm": 0.726308286190033, | |
| "learning_rate": 0.0007405940594059406, | |
| "loss": 2.1247, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 3.67, | |
| "grad_norm": 0.4779931902885437, | |
| "learning_rate": 0.0007396039603960396, | |
| "loss": 2.1215, | |
| "step": 5030 | |
| }, | |
| { | |
| "epoch": 3.67, | |
| "grad_norm": 0.5192296504974365, | |
| "learning_rate": 0.0007386138613861386, | |
| "loss": 2.1151, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 3.67, | |
| "eval_accuracy": 0.5818349975818327, | |
| "eval_loss": 1.981979489326477, | |
| "eval_runtime": 1085.0979, | |
| "eval_samples_per_second": 460.217, | |
| "eval_steps_per_second": 2.055, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 3.68, | |
| "grad_norm": 0.824546754360199, | |
| "learning_rate": 0.0007376237623762376, | |
| "loss": 2.1176, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 3.69, | |
| "grad_norm": 0.5459938049316406, | |
| "learning_rate": 0.0007366336633663367, | |
| "loss": 2.1209, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 3.69, | |
| "grad_norm": 0.657993733882904, | |
| "learning_rate": 0.0007356435643564356, | |
| "loss": 2.1174, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 3.7, | |
| "grad_norm": 0.6625123023986816, | |
| "learning_rate": 0.0007346534653465347, | |
| "loss": 2.114, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 3.71, | |
| "grad_norm": 0.6521473526954651, | |
| "learning_rate": 0.0007336633663366336, | |
| "loss": 2.1126, | |
| "step": 5090 | |
| }, | |
| { | |
| "epoch": 3.72, | |
| "grad_norm": 0.540843665599823, | |
| "learning_rate": 0.0007326732673267327, | |
| "loss": 2.1139, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 3.72, | |
| "grad_norm": 0.5456762313842773, | |
| "learning_rate": 0.0007316831683168317, | |
| "loss": 2.1096, | |
| "step": 5110 | |
| }, | |
| { | |
| "epoch": 3.73, | |
| "grad_norm": 0.6277236938476562, | |
| "learning_rate": 0.0007306930693069307, | |
| "loss": 2.1085, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 3.74, | |
| "grad_norm": 0.5047609210014343, | |
| "learning_rate": 0.0007297029702970297, | |
| "loss": 2.1048, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 3.74, | |
| "eval_accuracy": 0.5844656610858342, | |
| "eval_loss": 1.9673104286193848, | |
| "eval_runtime": 1086.0993, | |
| "eval_samples_per_second": 459.792, | |
| "eval_steps_per_second": 2.053, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 3.75, | |
| "grad_norm": 0.5852828025817871, | |
| "learning_rate": 0.0007287128712871287, | |
| "loss": 2.1091, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 3.75, | |
| "grad_norm": 0.49883949756622314, | |
| "learning_rate": 0.0007277227722772277, | |
| "loss": 2.1059, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 3.76, | |
| "grad_norm": 0.4874211251735687, | |
| "learning_rate": 0.0007267326732673268, | |
| "loss": 2.101, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 3.77, | |
| "grad_norm": 0.807388961315155, | |
| "learning_rate": 0.0007257425742574257, | |
| "loss": 2.1019, | |
| "step": 5170 | |
| }, | |
| { | |
| "epoch": 3.77, | |
| "grad_norm": 0.4876428246498108, | |
| "learning_rate": 0.0007247524752475248, | |
| "loss": 2.101, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 3.78, | |
| "grad_norm": 0.5534060597419739, | |
| "learning_rate": 0.0007237623762376237, | |
| "loss": 2.096, | |
| "step": 5190 | |
| }, | |
| { | |
| "epoch": 3.79, | |
| "grad_norm": 0.5464605093002319, | |
| "learning_rate": 0.0007227722772277228, | |
| "loss": 2.0999, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 3.8, | |
| "grad_norm": 0.6738607883453369, | |
| "learning_rate": 0.0007217821782178218, | |
| "loss": 2.0973, | |
| "step": 5210 | |
| }, | |
| { | |
| "epoch": 3.8, | |
| "grad_norm": 0.5829378366470337, | |
| "learning_rate": 0.0007207920792079208, | |
| "loss": 2.0943, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 3.8, | |
| "eval_accuracy": 0.5862642603086486, | |
| "eval_loss": 1.9560039043426514, | |
| "eval_runtime": 1085.7866, | |
| "eval_samples_per_second": 459.925, | |
| "eval_steps_per_second": 2.054, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 3.81, | |
| "grad_norm": 0.6175411939620972, | |
| "learning_rate": 0.0007198019801980198, | |
| "loss": 2.097, | |
| "step": 5230 | |
| }, | |
| { | |
| "epoch": 3.82, | |
| "grad_norm": 0.4551532566547394, | |
| "learning_rate": 0.0007188118811881188, | |
| "loss": 2.0909, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 3.83, | |
| "grad_norm": 0.6612002849578857, | |
| "learning_rate": 0.0007178217821782178, | |
| "loss": 2.0885, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 3.83, | |
| "grad_norm": 0.5608059763908386, | |
| "learning_rate": 0.0007168316831683169, | |
| "loss": 2.0893, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 3.84, | |
| "grad_norm": 0.6450534462928772, | |
| "learning_rate": 0.0007158415841584158, | |
| "loss": 2.0881, | |
| "step": 5270 | |
| }, | |
| { | |
| "epoch": 3.85, | |
| "grad_norm": 0.5012123584747314, | |
| "learning_rate": 0.0007148514851485149, | |
| "loss": 2.0843, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 3.85, | |
| "grad_norm": 0.5907981395721436, | |
| "learning_rate": 0.0007138613861386138, | |
| "loss": 2.0848, | |
| "step": 5290 | |
| }, | |
| { | |
| "epoch": 3.86, | |
| "grad_norm": 0.6378484964370728, | |
| "learning_rate": 0.0007128712871287129, | |
| "loss": 2.0891, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 3.87, | |
| "grad_norm": 0.5008774399757385, | |
| "learning_rate": 0.0007118811881188119, | |
| "loss": 2.0839, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 3.87, | |
| "eval_accuracy": 0.5886411137424359, | |
| "eval_loss": 1.9411782026290894, | |
| "eval_runtime": 1086.0682, | |
| "eval_samples_per_second": 459.805, | |
| "eval_steps_per_second": 2.053, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 3.88, | |
| "grad_norm": 0.5727280378341675, | |
| "learning_rate": 0.0007108910891089109, | |
| "loss": 2.0834, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 3.88, | |
| "grad_norm": 0.5417036414146423, | |
| "learning_rate": 0.0007099009900990099, | |
| "loss": 2.0806, | |
| "step": 5330 | |
| }, | |
| { | |
| "epoch": 3.89, | |
| "grad_norm": 0.5739684104919434, | |
| "learning_rate": 0.0007089108910891088, | |
| "loss": 2.0809, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 3.9, | |
| "grad_norm": 0.4842034876346588, | |
| "learning_rate": 0.0007079207920792079, | |
| "loss": 2.0787, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 3.91, | |
| "grad_norm": 0.7235381603240967, | |
| "learning_rate": 0.000706930693069307, | |
| "loss": 2.0761, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 3.91, | |
| "grad_norm": 0.5333278775215149, | |
| "learning_rate": 0.0007059405940594059, | |
| "loss": 2.082, | |
| "step": 5370 | |
| }, | |
| { | |
| "epoch": 3.92, | |
| "grad_norm": 0.5933953523635864, | |
| "learning_rate": 0.000704950495049505, | |
| "loss": 2.0711, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 3.93, | |
| "grad_norm": 0.3995033800601959, | |
| "learning_rate": 0.0007039603960396039, | |
| "loss": 2.072, | |
| "step": 5390 | |
| }, | |
| { | |
| "epoch": 3.93, | |
| "grad_norm": 0.5461521744728088, | |
| "learning_rate": 0.0007029702970297029, | |
| "loss": 2.0748, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 3.93, | |
| "eval_accuracy": 0.5892188513716441, | |
| "eval_loss": 1.9380106925964355, | |
| "eval_runtime": 1086.2866, | |
| "eval_samples_per_second": 459.713, | |
| "eval_steps_per_second": 2.053, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 3.94, | |
| "grad_norm": 0.6759417057037354, | |
| "learning_rate": 0.000701980198019802, | |
| "loss": 2.0743, | |
| "step": 5410 | |
| }, | |
| { | |
| "epoch": 3.95, | |
| "grad_norm": 0.4820743203163147, | |
| "learning_rate": 0.0007009900990099009, | |
| "loss": 2.0708, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 3.96, | |
| "grad_norm": 0.5824475884437561, | |
| "learning_rate": 0.0007, | |
| "loss": 2.0676, | |
| "step": 5430 | |
| }, | |
| { | |
| "epoch": 3.96, | |
| "grad_norm": 0.5133311748504639, | |
| "learning_rate": 0.0006990099009900989, | |
| "loss": 2.0688, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 3.97, | |
| "grad_norm": 0.5744913220405579, | |
| "learning_rate": 0.000698019801980198, | |
| "loss": 2.0657, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 3.98, | |
| "grad_norm": 0.5103346705436707, | |
| "learning_rate": 0.000697029702970297, | |
| "loss": 2.0674, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 3.99, | |
| "grad_norm": 0.48300009965896606, | |
| "learning_rate": 0.000696039603960396, | |
| "loss": 2.0649, | |
| "step": 5470 | |
| }, | |
| { | |
| "epoch": 3.99, | |
| "grad_norm": 0.64620441198349, | |
| "learning_rate": 0.000695049504950495, | |
| "loss": 2.0658, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 0.5096336603164673, | |
| "learning_rate": 0.000694059405940594, | |
| "loss": 2.0671, | |
| "step": 5490 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_accuracy": 0.5915717983649539, | |
| "eval_loss": 1.9237810373306274, | |
| "eval_runtime": 1088.2291, | |
| "eval_samples_per_second": 458.892, | |
| "eval_steps_per_second": 2.049, | |
| "step": 5490 | |
| }, | |
| { | |
| "epoch": 4.01, | |
| "grad_norm": 0.45002949237823486, | |
| "learning_rate": 0.000693069306930693, | |
| "loss": 2.0612, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 4.01, | |
| "grad_norm": 0.48484668135643005, | |
| "learning_rate": 0.0006920792079207921, | |
| "loss": 2.0598, | |
| "step": 5510 | |
| }, | |
| { | |
| "epoch": 4.02, | |
| "grad_norm": 0.5888765454292297, | |
| "learning_rate": 0.000691089108910891, | |
| "loss": 2.0665, | |
| "step": 5520 | |
| }, | |
| { | |
| "epoch": 4.03, | |
| "grad_norm": 0.6311919689178467, | |
| "learning_rate": 0.0006900990099009901, | |
| "loss": 2.0604, | |
| "step": 5530 | |
| }, | |
| { | |
| "epoch": 4.04, | |
| "grad_norm": 0.4337356686592102, | |
| "learning_rate": 0.000689108910891089, | |
| "loss": 2.0562, | |
| "step": 5540 | |
| }, | |
| { | |
| "epoch": 4.04, | |
| "grad_norm": 0.5855375528335571, | |
| "learning_rate": 0.0006881188118811881, | |
| "loss": 2.0535, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 4.05, | |
| "grad_norm": 0.6715276837348938, | |
| "learning_rate": 0.0006871287128712872, | |
| "loss": 2.0569, | |
| "step": 5560 | |
| }, | |
| { | |
| "epoch": 4.06, | |
| "grad_norm": 0.5453487634658813, | |
| "learning_rate": 0.0006861386138613862, | |
| "loss": 2.0539, | |
| "step": 5570 | |
| }, | |
| { | |
| "epoch": 4.07, | |
| "grad_norm": 0.44611501693725586, | |
| "learning_rate": 0.0006851485148514852, | |
| "loss": 2.0557, | |
| "step": 5580 | |
| }, | |
| { | |
| "epoch": 4.07, | |
| "eval_accuracy": 0.593527753135558, | |
| "eval_loss": 1.9137904644012451, | |
| "eval_runtime": 1092.9474, | |
| "eval_samples_per_second": 456.911, | |
| "eval_steps_per_second": 2.04, | |
| "step": 5580 | |
| }, | |
| { | |
| "epoch": 4.07, | |
| "grad_norm": 0.46945101022720337, | |
| "learning_rate": 0.0006841584158415842, | |
| "loss": 2.0494, | |
| "step": 5590 | |
| }, | |
| { | |
| "epoch": 4.08, | |
| "grad_norm": 0.6355441808700562, | |
| "learning_rate": 0.0006831683168316832, | |
| "loss": 2.0535, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 4.09, | |
| "grad_norm": 0.5079066157341003, | |
| "learning_rate": 0.0006821782178217823, | |
| "loss": 2.0535, | |
| "step": 5610 | |
| }, | |
| { | |
| "epoch": 4.09, | |
| "grad_norm": 0.48156508803367615, | |
| "learning_rate": 0.0006811881188118812, | |
| "loss": 2.0479, | |
| "step": 5620 | |
| }, | |
| { | |
| "epoch": 4.1, | |
| "grad_norm": 0.40759479999542236, | |
| "learning_rate": 0.0006801980198019803, | |
| "loss": 2.0465, | |
| "step": 5630 | |
| }, | |
| { | |
| "epoch": 4.11, | |
| "grad_norm": 0.6267878413200378, | |
| "learning_rate": 0.0006792079207920792, | |
| "loss": 2.047, | |
| "step": 5640 | |
| }, | |
| { | |
| "epoch": 4.12, | |
| "grad_norm": 0.4935464560985565, | |
| "learning_rate": 0.0006782178217821783, | |
| "loss": 2.0457, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 4.12, | |
| "grad_norm": 0.8215575814247131, | |
| "learning_rate": 0.0006772277227722773, | |
| "loss": 2.0459, | |
| "step": 5660 | |
| }, | |
| { | |
| "epoch": 4.13, | |
| "grad_norm": 0.5285871624946594, | |
| "learning_rate": 0.0006762376237623763, | |
| "loss": 2.046, | |
| "step": 5670 | |
| }, | |
| { | |
| "epoch": 4.13, | |
| "eval_accuracy": 0.5947354040081998, | |
| "eval_loss": 1.9072514772415161, | |
| "eval_runtime": 1086.9971, | |
| "eval_samples_per_second": 459.412, | |
| "eval_steps_per_second": 2.052, | |
| "step": 5670 | |
| }, | |
| { | |
| "epoch": 4.14, | |
| "grad_norm": 0.4480656087398529, | |
| "learning_rate": 0.0006752475247524753, | |
| "loss": 2.0419, | |
| "step": 5680 | |
| }, | |
| { | |
| "epoch": 4.15, | |
| "grad_norm": 0.5856757760047913, | |
| "learning_rate": 0.0006742574257425743, | |
| "loss": 2.043, | |
| "step": 5690 | |
| }, | |
| { | |
| "epoch": 4.15, | |
| "grad_norm": 0.5994493365287781, | |
| "learning_rate": 0.0006732673267326733, | |
| "loss": 2.0423, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 4.16, | |
| "grad_norm": 0.5151802897453308, | |
| "learning_rate": 0.0006722772277227724, | |
| "loss": 2.0367, | |
| "step": 5710 | |
| }, | |
| { | |
| "epoch": 4.17, | |
| "grad_norm": 0.5299440622329712, | |
| "learning_rate": 0.0006712871287128713, | |
| "loss": 2.0404, | |
| "step": 5720 | |
| }, | |
| { | |
| "epoch": 4.18, | |
| "grad_norm": 0.5033411979675293, | |
| "learning_rate": 0.0006702970297029704, | |
| "loss": 2.0383, | |
| "step": 5730 | |
| }, | |
| { | |
| "epoch": 4.18, | |
| "grad_norm": 0.5515163540840149, | |
| "learning_rate": 0.0006693069306930693, | |
| "loss": 2.0377, | |
| "step": 5740 | |
| }, | |
| { | |
| "epoch": 4.19, | |
| "grad_norm": 0.6445341110229492, | |
| "learning_rate": 0.0006683168316831684, | |
| "loss": 2.0348, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 4.2, | |
| "grad_norm": 0.48192110657691956, | |
| "learning_rate": 0.0006673267326732674, | |
| "loss": 2.0376, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 4.2, | |
| "eval_accuracy": 0.5963143559474492, | |
| "eval_loss": 1.8983112573623657, | |
| "eval_runtime": 1086.0935, | |
| "eval_samples_per_second": 459.795, | |
| "eval_steps_per_second": 2.053, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 4.2, | |
| "grad_norm": 0.43177658319473267, | |
| "learning_rate": 0.0006663366336633664, | |
| "loss": 2.0304, | |
| "step": 5770 | |
| }, | |
| { | |
| "epoch": 4.21, | |
| "grad_norm": 0.5673606395721436, | |
| "learning_rate": 0.0006653465346534654, | |
| "loss": 2.0342, | |
| "step": 5780 | |
| }, | |
| { | |
| "epoch": 4.22, | |
| "grad_norm": 0.6636048555374146, | |
| "learning_rate": 0.0006643564356435644, | |
| "loss": 2.035, | |
| "step": 5790 | |
| }, | |
| { | |
| "epoch": 4.23, | |
| "grad_norm": 0.5259466171264648, | |
| "learning_rate": 0.0006633663366336634, | |
| "loss": 2.0323, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 4.23, | |
| "grad_norm": 0.561341404914856, | |
| "learning_rate": 0.0006623762376237625, | |
| "loss": 2.0277, | |
| "step": 5810 | |
| }, | |
| { | |
| "epoch": 4.24, | |
| "grad_norm": 0.48961034417152405, | |
| "learning_rate": 0.0006613861386138614, | |
| "loss": 2.0316, | |
| "step": 5820 | |
| }, | |
| { | |
| "epoch": 4.25, | |
| "grad_norm": 0.46912887692451477, | |
| "learning_rate": 0.0006603960396039605, | |
| "loss": 2.0285, | |
| "step": 5830 | |
| }, | |
| { | |
| "epoch": 4.26, | |
| "grad_norm": 0.5009626746177673, | |
| "learning_rate": 0.0006594059405940594, | |
| "loss": 2.0287, | |
| "step": 5840 | |
| }, | |
| { | |
| "epoch": 4.26, | |
| "grad_norm": 0.5485634207725525, | |
| "learning_rate": 0.0006584158415841585, | |
| "loss": 2.0275, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 4.26, | |
| "eval_accuracy": 0.5978907811550054, | |
| "eval_loss": 1.8894693851470947, | |
| "eval_runtime": 1085.9104, | |
| "eval_samples_per_second": 459.872, | |
| "eval_steps_per_second": 2.054, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 4.27, | |
| "grad_norm": 0.5094584822654724, | |
| "learning_rate": 0.0006574257425742575, | |
| "loss": 2.0285, | |
| "step": 5860 | |
| }, | |
| { | |
| "epoch": 4.28, | |
| "grad_norm": 0.5796740651130676, | |
| "learning_rate": 0.0006564356435643565, | |
| "loss": 2.0246, | |
| "step": 5870 | |
| }, | |
| { | |
| "epoch": 4.28, | |
| "grad_norm": 0.5473222136497498, | |
| "learning_rate": 0.0006554455445544555, | |
| "loss": 2.0262, | |
| "step": 5880 | |
| }, | |
| { | |
| "epoch": 4.29, | |
| "grad_norm": 0.514639675617218, | |
| "learning_rate": 0.0006544554455445545, | |
| "loss": 2.0196, | |
| "step": 5890 | |
| }, | |
| { | |
| "epoch": 4.3, | |
| "grad_norm": 0.7184515595436096, | |
| "learning_rate": 0.0006534653465346535, | |
| "loss": 2.0228, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 4.31, | |
| "grad_norm": 0.5729575157165527, | |
| "learning_rate": 0.0006524752475247526, | |
| "loss": 2.0216, | |
| "step": 5910 | |
| }, | |
| { | |
| "epoch": 4.31, | |
| "grad_norm": 0.543946385383606, | |
| "learning_rate": 0.0006514851485148515, | |
| "loss": 2.0221, | |
| "step": 5920 | |
| }, | |
| { | |
| "epoch": 4.32, | |
| "grad_norm": 0.5328618884086609, | |
| "learning_rate": 0.0006504950495049506, | |
| "loss": 2.0205, | |
| "step": 5930 | |
| }, | |
| { | |
| "epoch": 4.33, | |
| "grad_norm": 0.45202726125717163, | |
| "learning_rate": 0.0006495049504950495, | |
| "loss": 2.0171, | |
| "step": 5940 | |
| }, | |
| { | |
| "epoch": 4.33, | |
| "eval_accuracy": 0.5995061137091588, | |
| "eval_loss": 1.880002498626709, | |
| "eval_runtime": 1087.0114, | |
| "eval_samples_per_second": 459.406, | |
| "eval_steps_per_second": 2.051, | |
| "step": 5940 | |
| }, | |
| { | |
| "epoch": 4.34, | |
| "grad_norm": 0.6767913103103638, | |
| "learning_rate": 0.0006485148514851485, | |
| "loss": 2.0196, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 4.34, | |
| "grad_norm": 0.6758072972297668, | |
| "learning_rate": 0.0006475247524752476, | |
| "loss": 2.0187, | |
| "step": 5960 | |
| }, | |
| { | |
| "epoch": 4.35, | |
| "grad_norm": 0.49338895082473755, | |
| "learning_rate": 0.0006465346534653465, | |
| "loss": 2.0203, | |
| "step": 5970 | |
| }, | |
| { | |
| "epoch": 4.36, | |
| "grad_norm": 0.6283939480781555, | |
| "learning_rate": 0.0006455445544554456, | |
| "loss": 2.0141, | |
| "step": 5980 | |
| }, | |
| { | |
| "epoch": 4.36, | |
| "grad_norm": 0.4420575499534607, | |
| "learning_rate": 0.0006445544554455445, | |
| "loss": 2.0169, | |
| "step": 5990 | |
| }, | |
| { | |
| "epoch": 4.37, | |
| "grad_norm": 0.5496084690093994, | |
| "learning_rate": 0.0006435643564356436, | |
| "loss": 2.0132, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 4.38, | |
| "grad_norm": 0.5455350279808044, | |
| "learning_rate": 0.0006425742574257426, | |
| "loss": 2.0135, | |
| "step": 6010 | |
| }, | |
| { | |
| "epoch": 4.39, | |
| "grad_norm": 0.5139738917350769, | |
| "learning_rate": 0.0006415841584158416, | |
| "loss": 2.0165, | |
| "step": 6020 | |
| }, | |
| { | |
| "epoch": 4.39, | |
| "grad_norm": 0.5023753046989441, | |
| "learning_rate": 0.0006405940594059406, | |
| "loss": 2.0107, | |
| "step": 6030 | |
| }, | |
| { | |
| "epoch": 4.39, | |
| "eval_accuracy": 0.6004914418538849, | |
| "eval_loss": 1.8729732036590576, | |
| "eval_runtime": 1086.4568, | |
| "eval_samples_per_second": 459.641, | |
| "eval_steps_per_second": 2.053, | |
| "step": 6030 | |
| }, | |
| { | |
| "epoch": 4.4, | |
| "grad_norm": 0.50832599401474, | |
| "learning_rate": 0.0006396039603960396, | |
| "loss": 2.0125, | |
| "step": 6040 | |
| }, | |
| { | |
| "epoch": 4.41, | |
| "grad_norm": 0.6144891977310181, | |
| "learning_rate": 0.0006386138613861386, | |
| "loss": 2.0074, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 4.42, | |
| "grad_norm": 0.5852723121643066, | |
| "learning_rate": 0.0006376237623762377, | |
| "loss": 2.0116, | |
| "step": 6060 | |
| }, | |
| { | |
| "epoch": 4.42, | |
| "grad_norm": 0.6694257259368896, | |
| "learning_rate": 0.0006366336633663366, | |
| "loss": 2.0068, | |
| "step": 6070 | |
| }, | |
| { | |
| "epoch": 4.43, | |
| "grad_norm": 0.5024294257164001, | |
| "learning_rate": 0.0006356435643564357, | |
| "loss": 2.0097, | |
| "step": 6080 | |
| }, | |
| { | |
| "epoch": 4.44, | |
| "grad_norm": 0.4397622048854828, | |
| "learning_rate": 0.0006346534653465346, | |
| "loss": 2.0058, | |
| "step": 6090 | |
| }, | |
| { | |
| "epoch": 4.44, | |
| "grad_norm": 0.46592214703559875, | |
| "learning_rate": 0.0006336633663366337, | |
| "loss": 2.0036, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 4.45, | |
| "grad_norm": 0.6728220582008362, | |
| "learning_rate": 0.0006326732673267327, | |
| "loss": 2.0027, | |
| "step": 6110 | |
| }, | |
| { | |
| "epoch": 4.46, | |
| "grad_norm": 0.5555120706558228, | |
| "learning_rate": 0.0006316831683168317, | |
| "loss": 2.0037, | |
| "step": 6120 | |
| }, | |
| { | |
| "epoch": 4.46, | |
| "eval_accuracy": 0.6017631778318724, | |
| "eval_loss": 1.8681055307388306, | |
| "eval_runtime": 1084.7583, | |
| "eval_samples_per_second": 460.361, | |
| "eval_steps_per_second": 2.056, | |
| "step": 6120 | |
| }, | |
| { | |
| "epoch": 4.47, | |
| "grad_norm": 0.5361539721488953, | |
| "learning_rate": 0.0006306930693069307, | |
| "loss": 2.0029, | |
| "step": 6130 | |
| }, | |
| { | |
| "epoch": 4.47, | |
| "grad_norm": 0.4616907238960266, | |
| "learning_rate": 0.0006297029702970297, | |
| "loss": 2.004, | |
| "step": 6140 | |
| }, | |
| { | |
| "epoch": 4.48, | |
| "grad_norm": 0.4987693727016449, | |
| "learning_rate": 0.0006287128712871287, | |
| "loss": 2.0023, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 4.49, | |
| "grad_norm": 0.5090926289558411, | |
| "learning_rate": 0.0006277227722772278, | |
| "loss": 1.9998, | |
| "step": 6160 | |
| }, | |
| { | |
| "epoch": 4.5, | |
| "grad_norm": 0.5008625388145447, | |
| "learning_rate": 0.0006267326732673267, | |
| "loss": 1.9983, | |
| "step": 6170 | |
| }, | |
| { | |
| "epoch": 4.5, | |
| "grad_norm": 0.5074314475059509, | |
| "learning_rate": 0.0006257425742574258, | |
| "loss": 1.9971, | |
| "step": 6180 | |
| }, | |
| { | |
| "epoch": 4.51, | |
| "grad_norm": 0.5803602933883667, | |
| "learning_rate": 0.0006247524752475247, | |
| "loss": 1.9974, | |
| "step": 6190 | |
| }, | |
| { | |
| "epoch": 4.52, | |
| "grad_norm": 0.534377932548523, | |
| "learning_rate": 0.0006237623762376238, | |
| "loss": 1.9962, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 4.52, | |
| "grad_norm": 0.5166971683502197, | |
| "learning_rate": 0.0006227722772277228, | |
| "loss": 1.9967, | |
| "step": 6210 | |
| }, | |
| { | |
| "epoch": 4.52, | |
| "eval_accuracy": 0.6030721722960667, | |
| "eval_loss": 1.8595592975616455, | |
| "eval_runtime": 1083.857, | |
| "eval_samples_per_second": 460.743, | |
| "eval_steps_per_second": 2.057, | |
| "step": 6210 | |
| }, | |
| { | |
| "epoch": 4.53, | |
| "grad_norm": 0.610471785068512, | |
| "learning_rate": 0.0006217821782178218, | |
| "loss": 1.9972, | |
| "step": 6220 | |
| }, | |
| { | |
| "epoch": 4.54, | |
| "grad_norm": 0.4919542372226715, | |
| "learning_rate": 0.0006207920792079208, | |
| "loss": 1.9945, | |
| "step": 6230 | |
| }, | |
| { | |
| "epoch": 4.55, | |
| "grad_norm": 0.6607844829559326, | |
| "learning_rate": 0.0006198019801980198, | |
| "loss": 1.9962, | |
| "step": 6240 | |
| }, | |
| { | |
| "epoch": 4.55, | |
| "grad_norm": 0.4905446171760559, | |
| "learning_rate": 0.0006188118811881188, | |
| "loss": 1.9937, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 4.56, | |
| "grad_norm": 0.46790874004364014, | |
| "learning_rate": 0.0006178217821782179, | |
| "loss": 1.99, | |
| "step": 6260 | |
| }, | |
| { | |
| "epoch": 4.57, | |
| "grad_norm": 0.5997541546821594, | |
| "learning_rate": 0.0006168316831683168, | |
| "loss": 1.9928, | |
| "step": 6270 | |
| }, | |
| { | |
| "epoch": 4.58, | |
| "grad_norm": 0.5869884490966797, | |
| "learning_rate": 0.0006158415841584159, | |
| "loss": 1.9913, | |
| "step": 6280 | |
| }, | |
| { | |
| "epoch": 4.58, | |
| "grad_norm": 0.5359517335891724, | |
| "learning_rate": 0.0006148514851485148, | |
| "loss": 1.9892, | |
| "step": 6290 | |
| }, | |
| { | |
| "epoch": 4.59, | |
| "grad_norm": 0.5119579434394836, | |
| "learning_rate": 0.0006138613861386139, | |
| "loss": 1.9892, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 4.59, | |
| "eval_accuracy": 0.6040549869719135, | |
| "eval_loss": 1.8538638353347778, | |
| "eval_runtime": 1084.8597, | |
| "eval_samples_per_second": 460.318, | |
| "eval_steps_per_second": 2.056, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 4.6, | |
| "grad_norm": 0.501132071018219, | |
| "learning_rate": 0.0006128712871287129, | |
| "loss": 1.9874, | |
| "step": 6310 | |
| }, | |
| { | |
| "epoch": 4.6, | |
| "grad_norm": 0.5507422089576721, | |
| "learning_rate": 0.0006118811881188119, | |
| "loss": 1.9904, | |
| "step": 6320 | |
| }, | |
| { | |
| "epoch": 4.61, | |
| "grad_norm": 0.5412635207176208, | |
| "learning_rate": 0.0006108910891089109, | |
| "loss": 1.9873, | |
| "step": 6330 | |
| }, | |
| { | |
| "epoch": 4.62, | |
| "grad_norm": 0.44309449195861816, | |
| "learning_rate": 0.0006099009900990099, | |
| "loss": 1.9869, | |
| "step": 6340 | |
| }, | |
| { | |
| "epoch": 4.63, | |
| "grad_norm": 0.5056418776512146, | |
| "learning_rate": 0.0006089108910891089, | |
| "loss": 1.9855, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 4.63, | |
| "grad_norm": 0.48882943391799927, | |
| "learning_rate": 0.000607920792079208, | |
| "loss": 1.984, | |
| "step": 6360 | |
| }, | |
| { | |
| "epoch": 4.64, | |
| "grad_norm": 0.4717276692390442, | |
| "learning_rate": 0.0006069306930693069, | |
| "loss": 1.9841, | |
| "step": 6370 | |
| }, | |
| { | |
| "epoch": 4.65, | |
| "grad_norm": 0.5123859643936157, | |
| "learning_rate": 0.000605940594059406, | |
| "loss": 1.9847, | |
| "step": 6380 | |
| }, | |
| { | |
| "epoch": 4.66, | |
| "grad_norm": 0.4882528483867645, | |
| "learning_rate": 0.0006049504950495049, | |
| "loss": 1.9824, | |
| "step": 6390 | |
| }, | |
| { | |
| "epoch": 4.66, | |
| "eval_accuracy": 0.6054229902100914, | |
| "eval_loss": 1.8454294204711914, | |
| "eval_runtime": 1084.8895, | |
| "eval_samples_per_second": 460.305, | |
| "eval_steps_per_second": 2.056, | |
| "step": 6390 | |
| }, | |
| { | |
| "epoch": 4.66, | |
| "grad_norm": 0.4354398548603058, | |
| "learning_rate": 0.000603960396039604, | |
| "loss": 1.9806, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 4.67, | |
| "grad_norm": 0.5733498334884644, | |
| "learning_rate": 0.000602970297029703, | |
| "loss": 1.9828, | |
| "step": 6410 | |
| }, | |
| { | |
| "epoch": 4.68, | |
| "grad_norm": 0.5736912488937378, | |
| "learning_rate": 0.000601980198019802, | |
| "loss": 1.9816, | |
| "step": 6420 | |
| }, | |
| { | |
| "epoch": 4.69, | |
| "grad_norm": 0.44418302178382874, | |
| "learning_rate": 0.000600990099009901, | |
| "loss": 1.978, | |
| "step": 6430 | |
| }, | |
| { | |
| "epoch": 4.69, | |
| "grad_norm": 0.5334004759788513, | |
| "learning_rate": 0.0006, | |
| "loss": 1.9814, | |
| "step": 6440 | |
| }, | |
| { | |
| "epoch": 4.7, | |
| "grad_norm": 0.5706362128257751, | |
| "learning_rate": 0.000599009900990099, | |
| "loss": 1.9787, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 4.71, | |
| "grad_norm": 0.8315806984901428, | |
| "learning_rate": 0.000598019801980198, | |
| "loss": 1.9772, | |
| "step": 6460 | |
| }, | |
| { | |
| "epoch": 4.71, | |
| "grad_norm": 0.47482118010520935, | |
| "learning_rate": 0.000597029702970297, | |
| "loss": 1.98, | |
| "step": 6470 | |
| }, | |
| { | |
| "epoch": 4.72, | |
| "grad_norm": 0.43432384729385376, | |
| "learning_rate": 0.000596039603960396, | |
| "loss": 1.9766, | |
| "step": 6480 | |
| }, | |
| { | |
| "epoch": 4.72, | |
| "eval_accuracy": 0.6066947766959975, | |
| "eval_loss": 1.8382798433303833, | |
| "eval_runtime": 1084.0208, | |
| "eval_samples_per_second": 460.674, | |
| "eval_steps_per_second": 2.057, | |
| "step": 6480 | |
| }, | |
| { | |
| "epoch": 4.73, | |
| "grad_norm": 0.4684685170650482, | |
| "learning_rate": 0.000595049504950495, | |
| "loss": 1.9747, | |
| "step": 6490 | |
| }, | |
| { | |
| "epoch": 4.74, | |
| "grad_norm": 0.5347057580947876, | |
| "learning_rate": 0.000594059405940594, | |
| "loss": 1.9773, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 4.74, | |
| "grad_norm": 0.5178421139717102, | |
| "learning_rate": 0.0005930693069306931, | |
| "loss": 1.9754, | |
| "step": 6510 | |
| }, | |
| { | |
| "epoch": 4.75, | |
| "grad_norm": 0.5036115050315857, | |
| "learning_rate": 0.000592079207920792, | |
| "loss": 1.9737, | |
| "step": 6520 | |
| }, | |
| { | |
| "epoch": 4.76, | |
| "grad_norm": 0.48273569345474243, | |
| "learning_rate": 0.0005910891089108911, | |
| "loss": 1.9723, | |
| "step": 6530 | |
| }, | |
| { | |
| "epoch": 4.77, | |
| "grad_norm": 0.4350590109825134, | |
| "learning_rate": 0.00059009900990099, | |
| "loss": 1.9715, | |
| "step": 6540 | |
| }, | |
| { | |
| "epoch": 4.77, | |
| "grad_norm": 0.4483092129230499, | |
| "learning_rate": 0.0005891089108910891, | |
| "loss": 1.9718, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 4.78, | |
| "grad_norm": 0.6620519757270813, | |
| "learning_rate": 0.0005881188118811881, | |
| "loss": 1.9726, | |
| "step": 6560 | |
| }, | |
| { | |
| "epoch": 4.79, | |
| "grad_norm": 0.41184690594673157, | |
| "learning_rate": 0.0005871287128712871, | |
| "loss": 1.9682, | |
| "step": 6570 | |
| }, | |
| { | |
| "epoch": 4.79, | |
| "eval_accuracy": 0.6078188605164281, | |
| "eval_loss": 1.8335860967636108, | |
| "eval_runtime": 1085.8051, | |
| "eval_samples_per_second": 459.917, | |
| "eval_steps_per_second": 2.054, | |
| "step": 6570 | |
| }, | |
| { | |
| "epoch": 4.79, | |
| "grad_norm": 0.6032079458236694, | |
| "learning_rate": 0.0005861386138613861, | |
| "loss": 1.9701, | |
| "step": 6580 | |
| }, | |
| { | |
| "epoch": 4.8, | |
| "grad_norm": 0.5113199949264526, | |
| "learning_rate": 0.0005851485148514851, | |
| "loss": 1.9715, | |
| "step": 6590 | |
| }, | |
| { | |
| "epoch": 4.81, | |
| "grad_norm": 0.43198567628860474, | |
| "learning_rate": 0.0005841584158415841, | |
| "loss": 1.9677, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 4.82, | |
| "grad_norm": 0.511009931564331, | |
| "learning_rate": 0.0005831683168316832, | |
| "loss": 1.9692, | |
| "step": 6610 | |
| }, | |
| { | |
| "epoch": 4.82, | |
| "grad_norm": 0.48394373059272766, | |
| "learning_rate": 0.0005821782178217821, | |
| "loss": 1.9658, | |
| "step": 6620 | |
| }, | |
| { | |
| "epoch": 4.83, | |
| "grad_norm": 0.5180623531341553, | |
| "learning_rate": 0.0005811881188118812, | |
| "loss": 1.9656, | |
| "step": 6630 | |
| }, | |
| { | |
| "epoch": 4.84, | |
| "grad_norm": 0.5282729864120483, | |
| "learning_rate": 0.0005801980198019801, | |
| "loss": 1.9672, | |
| "step": 6640 | |
| }, | |
| { | |
| "epoch": 4.85, | |
| "grad_norm": 0.44400766491889954, | |
| "learning_rate": 0.0005792079207920792, | |
| "loss": 1.9628, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 4.85, | |
| "grad_norm": 0.47033068537712097, | |
| "learning_rate": 0.0005782178217821782, | |
| "loss": 1.9653, | |
| "step": 6660 | |
| }, | |
| { | |
| "epoch": 4.85, | |
| "eval_accuracy": 0.6084373992178661, | |
| "eval_loss": 1.8281679153442383, | |
| "eval_runtime": 1084.5844, | |
| "eval_samples_per_second": 460.434, | |
| "eval_steps_per_second": 2.056, | |
| "step": 6660 | |
| }, | |
| { | |
| "epoch": 4.86, | |
| "grad_norm": 0.7458497881889343, | |
| "learning_rate": 0.0005772277227722772, | |
| "loss": 1.9662, | |
| "step": 6670 | |
| }, | |
| { | |
| "epoch": 4.87, | |
| "grad_norm": 0.4841892421245575, | |
| "learning_rate": 0.0005762376237623762, | |
| "loss": 1.9645, | |
| "step": 6680 | |
| }, | |
| { | |
| "epoch": 4.87, | |
| "grad_norm": 0.472526490688324, | |
| "learning_rate": 0.0005752475247524752, | |
| "loss": 1.9621, | |
| "step": 6690 | |
| }, | |
| { | |
| "epoch": 4.88, | |
| "grad_norm": 0.41672539710998535, | |
| "learning_rate": 0.0005742574257425742, | |
| "loss": 1.9595, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 4.89, | |
| "grad_norm": 0.5024113059043884, | |
| "learning_rate": 0.0005732673267326733, | |
| "loss": 1.9593, | |
| "step": 6710 | |
| }, | |
| { | |
| "epoch": 4.9, | |
| "grad_norm": 0.5023587942123413, | |
| "learning_rate": 0.0005722772277227722, | |
| "loss": 1.9599, | |
| "step": 6720 | |
| }, | |
| { | |
| "epoch": 4.9, | |
| "grad_norm": 0.45752909779548645, | |
| "learning_rate": 0.0005712871287128713, | |
| "loss": 1.9599, | |
| "step": 6730 | |
| }, | |
| { | |
| "epoch": 4.91, | |
| "grad_norm": 0.6170557737350464, | |
| "learning_rate": 0.0005702970297029702, | |
| "loss": 1.9616, | |
| "step": 6740 | |
| }, | |
| { | |
| "epoch": 4.92, | |
| "grad_norm": 0.4267810583114624, | |
| "learning_rate": 0.0005693069306930693, | |
| "loss": 1.9599, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 4.92, | |
| "eval_accuracy": 0.609472718679915, | |
| "eval_loss": 1.8216131925582886, | |
| "eval_runtime": 1085.0974, | |
| "eval_samples_per_second": 460.217, | |
| "eval_steps_per_second": 2.055, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 4.93, | |
| "grad_norm": 0.5765691995620728, | |
| "learning_rate": 0.0005683168316831683, | |
| "loss": 1.9571, | |
| "step": 6760 | |
| }, | |
| { | |
| "epoch": 4.93, | |
| "grad_norm": 0.5143380761146545, | |
| "learning_rate": 0.0005673267326732673, | |
| "loss": 1.9594, | |
| "step": 6770 | |
| }, | |
| { | |
| "epoch": 4.94, | |
| "grad_norm": 0.47696933150291443, | |
| "learning_rate": 0.0005663366336633663, | |
| "loss": 1.9569, | |
| "step": 6780 | |
| }, | |
| { | |
| "epoch": 4.95, | |
| "grad_norm": 0.6473893523216248, | |
| "learning_rate": 0.0005653465346534653, | |
| "loss": 1.959, | |
| "step": 6790 | |
| }, | |
| { | |
| "epoch": 4.95, | |
| "grad_norm": 0.42045238614082336, | |
| "learning_rate": 0.0005643564356435643, | |
| "loss": 1.9537, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 4.96, | |
| "grad_norm": 0.47495463490486145, | |
| "learning_rate": 0.0005633663366336634, | |
| "loss": 1.9539, | |
| "step": 6810 | |
| }, | |
| { | |
| "epoch": 4.97, | |
| "grad_norm": 0.46555668115615845, | |
| "learning_rate": 0.0005623762376237624, | |
| "loss": 1.9532, | |
| "step": 6820 | |
| }, | |
| { | |
| "epoch": 4.98, | |
| "grad_norm": 0.5669355988502502, | |
| "learning_rate": 0.0005613861386138615, | |
| "loss": 1.9539, | |
| "step": 6830 | |
| }, | |
| { | |
| "epoch": 4.98, | |
| "grad_norm": 0.5012803077697754, | |
| "learning_rate": 0.0005603960396039604, | |
| "loss": 1.9516, | |
| "step": 6840 | |
| }, | |
| { | |
| "epoch": 4.98, | |
| "eval_accuracy": 0.6106878911706901, | |
| "eval_loss": 1.8161377906799316, | |
| "eval_runtime": 1086.8974, | |
| "eval_samples_per_second": 459.455, | |
| "eval_steps_per_second": 2.052, | |
| "step": 6840 | |
| }, | |
| { | |
| "epoch": 4.99, | |
| "grad_norm": 0.6031624674797058, | |
| "learning_rate": 0.0005594059405940595, | |
| "loss": 1.951, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 0.48754164576530457, | |
| "learning_rate": 0.0005584158415841585, | |
| "loss": 1.9512, | |
| "step": 6860 | |
| }, | |
| { | |
| "epoch": 5.01, | |
| "grad_norm": 0.5661942362785339, | |
| "learning_rate": 0.0005574257425742575, | |
| "loss": 1.953, | |
| "step": 6870 | |
| }, | |
| { | |
| "epoch": 5.01, | |
| "grad_norm": 0.5200914144515991, | |
| "learning_rate": 0.0005564356435643565, | |
| "loss": 1.9528, | |
| "step": 6880 | |
| }, | |
| { | |
| "epoch": 5.02, | |
| "grad_norm": 0.504625678062439, | |
| "learning_rate": 0.0005554455445544555, | |
| "loss": 1.9473, | |
| "step": 6890 | |
| }, | |
| { | |
| "epoch": 5.03, | |
| "grad_norm": 0.5114207863807678, | |
| "learning_rate": 0.0005544554455445545, | |
| "loss": 1.9503, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 5.03, | |
| "grad_norm": 0.4563724100589752, | |
| "learning_rate": 0.0005534653465346536, | |
| "loss": 1.9522, | |
| "step": 6910 | |
| }, | |
| { | |
| "epoch": 5.04, | |
| "grad_norm": 0.48981210589408875, | |
| "learning_rate": 0.0005524752475247525, | |
| "loss": 1.9463, | |
| "step": 6920 | |
| }, | |
| { | |
| "epoch": 5.05, | |
| "grad_norm": 0.4641856849193573, | |
| "learning_rate": 0.0005514851485148516, | |
| "loss": 1.9473, | |
| "step": 6930 | |
| }, | |
| { | |
| "epoch": 5.05, | |
| "eval_accuracy": 0.6114587633382502, | |
| "eval_loss": 1.8128423690795898, | |
| "eval_runtime": 1088.997, | |
| "eval_samples_per_second": 458.569, | |
| "eval_steps_per_second": 2.048, | |
| "step": 6930 | |
| }, | |
| { | |
| "epoch": 5.06, | |
| "grad_norm": 0.522405743598938, | |
| "learning_rate": 0.0005504950495049505, | |
| "loss": 1.9464, | |
| "step": 6940 | |
| }, | |
| { | |
| "epoch": 5.06, | |
| "grad_norm": 0.5010780692100525, | |
| "learning_rate": 0.0005495049504950496, | |
| "loss": 1.9462, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 5.07, | |
| "grad_norm": 0.4186078906059265, | |
| "learning_rate": 0.0005485148514851486, | |
| "loss": 1.9454, | |
| "step": 6960 | |
| }, | |
| { | |
| "epoch": 5.08, | |
| "grad_norm": 0.43226584792137146, | |
| "learning_rate": 0.0005475247524752476, | |
| "loss": 1.9429, | |
| "step": 6970 | |
| }, | |
| { | |
| "epoch": 5.09, | |
| "grad_norm": 0.4429096579551697, | |
| "learning_rate": 0.0005465346534653466, | |
| "loss": 1.9439, | |
| "step": 6980 | |
| }, | |
| { | |
| "epoch": 5.09, | |
| "grad_norm": 0.5576241612434387, | |
| "learning_rate": 0.0005455445544554456, | |
| "loss": 1.9465, | |
| "step": 6990 | |
| }, | |
| { | |
| "epoch": 5.1, | |
| "grad_norm": 0.5840058326721191, | |
| "learning_rate": 0.0005445544554455446, | |
| "loss": 1.9465, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 5.11, | |
| "grad_norm": 0.42570897936820984, | |
| "learning_rate": 0.0005435643564356437, | |
| "loss": 1.9423, | |
| "step": 7010 | |
| }, | |
| { | |
| "epoch": 5.11, | |
| "grad_norm": 0.4703156650066376, | |
| "learning_rate": 0.0005425742574257426, | |
| "loss": 1.9445, | |
| "step": 7020 | |
| }, | |
| { | |
| "epoch": 5.11, | |
| "eval_accuracy": 0.6122824776592447, | |
| "eval_loss": 1.8064905405044556, | |
| "eval_runtime": 1089.2916, | |
| "eval_samples_per_second": 458.445, | |
| "eval_steps_per_second": 2.047, | |
| "step": 7020 | |
| }, | |
| { | |
| "epoch": 5.12, | |
| "grad_norm": 0.47701114416122437, | |
| "learning_rate": 0.0005415841584158417, | |
| "loss": 1.9414, | |
| "step": 7030 | |
| }, | |
| { | |
| "epoch": 5.13, | |
| "grad_norm": 0.4756263196468353, | |
| "learning_rate": 0.0005405940594059406, | |
| "loss": 1.9412, | |
| "step": 7040 | |
| }, | |
| { | |
| "epoch": 5.14, | |
| "grad_norm": 0.4438433051109314, | |
| "learning_rate": 0.0005396039603960396, | |
| "loss": 1.9396, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 5.14, | |
| "grad_norm": 0.7217634916305542, | |
| "learning_rate": 0.0005386138613861387, | |
| "loss": 1.9405, | |
| "step": 7060 | |
| }, | |
| { | |
| "epoch": 5.15, | |
| "grad_norm": 0.5862283110618591, | |
| "learning_rate": 0.0005376237623762376, | |
| "loss": 1.9408, | |
| "step": 7070 | |
| }, | |
| { | |
| "epoch": 5.16, | |
| "grad_norm": 0.4042336642742157, | |
| "learning_rate": 0.0005366336633663367, | |
| "loss": 1.9389, | |
| "step": 7080 | |
| }, | |
| { | |
| "epoch": 5.17, | |
| "grad_norm": 0.48928365111351013, | |
| "learning_rate": 0.0005356435643564356, | |
| "loss": 1.9387, | |
| "step": 7090 | |
| }, | |
| { | |
| "epoch": 5.17, | |
| "grad_norm": 0.4354238212108612, | |
| "learning_rate": 0.0005346534653465347, | |
| "loss": 1.9383, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 5.18, | |
| "grad_norm": 0.4175672233104706, | |
| "learning_rate": 0.0005336633663366337, | |
| "loss": 1.9352, | |
| "step": 7110 | |
| }, | |
| { | |
| "epoch": 5.18, | |
| "eval_accuracy": 0.6133205485630702, | |
| "eval_loss": 1.8010112047195435, | |
| "eval_runtime": 1088.7719, | |
| "eval_samples_per_second": 458.664, | |
| "eval_steps_per_second": 2.048, | |
| "step": 7110 | |
| }, | |
| { | |
| "epoch": 5.19, | |
| "grad_norm": 0.49512535333633423, | |
| "learning_rate": 0.0005326732673267327, | |
| "loss": 1.936, | |
| "step": 7120 | |
| }, | |
| { | |
| "epoch": 5.2, | |
| "grad_norm": 0.5564088225364685, | |
| "learning_rate": 0.0005316831683168317, | |
| "loss": 1.9352, | |
| "step": 7130 | |
| }, | |
| { | |
| "epoch": 5.2, | |
| "grad_norm": 0.42951545119285583, | |
| "learning_rate": 0.0005306930693069307, | |
| "loss": 1.9382, | |
| "step": 7140 | |
| }, | |
| { | |
| "epoch": 5.21, | |
| "grad_norm": 0.4925052523612976, | |
| "learning_rate": 0.0005297029702970297, | |
| "loss": 1.9332, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 5.22, | |
| "grad_norm": 0.47808635234832764, | |
| "learning_rate": 0.0005287128712871288, | |
| "loss": 1.9347, | |
| "step": 7160 | |
| }, | |
| { | |
| "epoch": 5.22, | |
| "grad_norm": 0.49086272716522217, | |
| "learning_rate": 0.0005277227722772277, | |
| "loss": 1.9346, | |
| "step": 7170 | |
| }, | |
| { | |
| "epoch": 5.23, | |
| "grad_norm": 0.6410700678825378, | |
| "learning_rate": 0.0005267326732673268, | |
| "loss": 1.9355, | |
| "step": 7180 | |
| }, | |
| { | |
| "epoch": 5.24, | |
| "grad_norm": 0.5207043886184692, | |
| "learning_rate": 0.0005257425742574257, | |
| "loss": 1.9362, | |
| "step": 7190 | |
| }, | |
| { | |
| "epoch": 5.25, | |
| "grad_norm": 0.4774588346481323, | |
| "learning_rate": 0.0005247524752475248, | |
| "loss": 1.9275, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 5.25, | |
| "eval_accuracy": 0.6140941851085354, | |
| "eval_loss": 1.795212745666504, | |
| "eval_runtime": 1086.7491, | |
| "eval_samples_per_second": 459.517, | |
| "eval_steps_per_second": 2.052, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 5.25, | |
| "grad_norm": 0.43084409832954407, | |
| "learning_rate": 0.0005237623762376238, | |
| "loss": 1.9323, | |
| "step": 7210 | |
| }, | |
| { | |
| "epoch": 5.26, | |
| "grad_norm": 0.47473257780075073, | |
| "learning_rate": 0.0005227722772277228, | |
| "loss": 1.9325, | |
| "step": 7220 | |
| }, | |
| { | |
| "epoch": 5.27, | |
| "grad_norm": 0.5835340023040771, | |
| "learning_rate": 0.0005217821782178218, | |
| "loss": 1.9307, | |
| "step": 7230 | |
| }, | |
| { | |
| "epoch": 5.28, | |
| "grad_norm": 0.5061103701591492, | |
| "learning_rate": 0.0005207920792079208, | |
| "loss": 1.9312, | |
| "step": 7240 | |
| }, | |
| { | |
| "epoch": 5.28, | |
| "grad_norm": 0.41326335072517395, | |
| "learning_rate": 0.0005198019801980198, | |
| "loss": 1.9302, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 5.29, | |
| "grad_norm": 0.4128727316856384, | |
| "learning_rate": 0.0005188118811881189, | |
| "loss": 1.9306, | |
| "step": 7260 | |
| }, | |
| { | |
| "epoch": 5.3, | |
| "grad_norm": 0.4514748752117157, | |
| "learning_rate": 0.0005178217821782178, | |
| "loss": 1.9306, | |
| "step": 7270 | |
| }, | |
| { | |
| "epoch": 5.3, | |
| "grad_norm": 0.4459412097930908, | |
| "learning_rate": 0.0005168316831683169, | |
| "loss": 1.9273, | |
| "step": 7280 | |
| }, | |
| { | |
| "epoch": 5.31, | |
| "grad_norm": 0.508747935295105, | |
| "learning_rate": 0.0005158415841584158, | |
| "loss": 1.9288, | |
| "step": 7290 | |
| }, | |
| { | |
| "epoch": 5.31, | |
| "eval_accuracy": 0.6147504784530803, | |
| "eval_loss": 1.7935823202133179, | |
| "eval_runtime": 1086.9403, | |
| "eval_samples_per_second": 459.436, | |
| "eval_steps_per_second": 2.052, | |
| "step": 7290 | |
| }, | |
| { | |
| "epoch": 5.32, | |
| "grad_norm": 0.47734731435775757, | |
| "learning_rate": 0.0005148514851485149, | |
| "loss": 1.9298, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 5.33, | |
| "grad_norm": 0.5174552798271179, | |
| "learning_rate": 0.0005138613861386139, | |
| "loss": 1.9265, | |
| "step": 7310 | |
| }, | |
| { | |
| "epoch": 5.33, | |
| "grad_norm": 0.5077902674674988, | |
| "learning_rate": 0.0005128712871287129, | |
| "loss": 1.9261, | |
| "step": 7320 | |
| }, | |
| { | |
| "epoch": 5.34, | |
| "grad_norm": 0.5650092959403992, | |
| "learning_rate": 0.0005118811881188119, | |
| "loss": 1.9298, | |
| "step": 7330 | |
| }, | |
| { | |
| "epoch": 5.35, | |
| "grad_norm": 0.4757165312767029, | |
| "learning_rate": 0.0005108910891089109, | |
| "loss": 1.924, | |
| "step": 7340 | |
| }, | |
| { | |
| "epoch": 5.36, | |
| "grad_norm": 0.39661648869514465, | |
| "learning_rate": 0.0005099009900990099, | |
| "loss": 1.9246, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 5.36, | |
| "grad_norm": 0.4992424547672272, | |
| "learning_rate": 0.000508910891089109, | |
| "loss": 1.9238, | |
| "step": 7360 | |
| }, | |
| { | |
| "epoch": 5.37, | |
| "grad_norm": 0.5065789222717285, | |
| "learning_rate": 0.0005079207920792079, | |
| "loss": 1.9227, | |
| "step": 7370 | |
| }, | |
| { | |
| "epoch": 5.38, | |
| "grad_norm": 0.4040853977203369, | |
| "learning_rate": 0.000506930693069307, | |
| "loss": 1.9216, | |
| "step": 7380 | |
| }, | |
| { | |
| "epoch": 5.38, | |
| "eval_accuracy": 0.6157344716957146, | |
| "eval_loss": 1.7872822284698486, | |
| "eval_runtime": 1088.1718, | |
| "eval_samples_per_second": 458.917, | |
| "eval_steps_per_second": 2.049, | |
| "step": 7380 | |
| }, | |
| { | |
| "epoch": 5.38, | |
| "grad_norm": 0.5322463512420654, | |
| "learning_rate": 0.0005059405940594059, | |
| "loss": 1.9224, | |
| "step": 7390 | |
| }, | |
| { | |
| "epoch": 5.39, | |
| "grad_norm": 0.7337666153907776, | |
| "learning_rate": 0.000504950495049505, | |
| "loss": 1.9215, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 5.4, | |
| "grad_norm": 0.4274744987487793, | |
| "learning_rate": 0.000503960396039604, | |
| "loss": 1.9204, | |
| "step": 7410 | |
| }, | |
| { | |
| "epoch": 5.41, | |
| "grad_norm": 0.5131354928016663, | |
| "learning_rate": 0.000502970297029703, | |
| "loss": 1.9232, | |
| "step": 7420 | |
| }, | |
| { | |
| "epoch": 5.41, | |
| "grad_norm": 0.46376702189445496, | |
| "learning_rate": 0.000501980198019802, | |
| "loss": 1.9206, | |
| "step": 7430 | |
| }, | |
| { | |
| "epoch": 5.42, | |
| "grad_norm": 0.4123290181159973, | |
| "learning_rate": 0.000500990099009901, | |
| "loss": 1.9195, | |
| "step": 7440 | |
| }, | |
| { | |
| "epoch": 5.43, | |
| "grad_norm": 0.5006473660469055, | |
| "learning_rate": 0.0005, | |
| "loss": 1.9193, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 5.44, | |
| "grad_norm": 0.4529099464416504, | |
| "learning_rate": 0.0004990099009900991, | |
| "loss": 1.9194, | |
| "step": 7460 | |
| }, | |
| { | |
| "epoch": 5.44, | |
| "grad_norm": 0.43744367361068726, | |
| "learning_rate": 0.000498019801980198, | |
| "loss": 1.9215, | |
| "step": 7470 | |
| }, | |
| { | |
| "epoch": 5.44, | |
| "eval_accuracy": 0.6164003583042955, | |
| "eval_loss": 1.7831112146377563, | |
| "eval_runtime": 1087.85, | |
| "eval_samples_per_second": 459.052, | |
| "eval_steps_per_second": 2.05, | |
| "step": 7470 | |
| }, | |
| { | |
| "epoch": 5.45, | |
| "grad_norm": 0.4329184591770172, | |
| "learning_rate": 0.0004970297029702971, | |
| "loss": 1.9184, | |
| "step": 7480 | |
| }, | |
| { | |
| "epoch": 5.46, | |
| "grad_norm": 0.4381932020187378, | |
| "learning_rate": 0.000496039603960396, | |
| "loss": 1.9194, | |
| "step": 7490 | |
| }, | |
| { | |
| "epoch": 5.46, | |
| "grad_norm": 0.4533543586730957, | |
| "learning_rate": 0.0004950495049504951, | |
| "loss": 1.9163, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 5.47, | |
| "grad_norm": 0.50531405210495, | |
| "learning_rate": 0.0004940594059405941, | |
| "loss": 1.9196, | |
| "step": 7510 | |
| }, | |
| { | |
| "epoch": 5.48, | |
| "grad_norm": 0.45237472653388977, | |
| "learning_rate": 0.000493069306930693, | |
| "loss": 1.9148, | |
| "step": 7520 | |
| }, | |
| { | |
| "epoch": 5.49, | |
| "grad_norm": 0.4136326313018799, | |
| "learning_rate": 0.0004920792079207921, | |
| "loss": 1.9152, | |
| "step": 7530 | |
| }, | |
| { | |
| "epoch": 5.49, | |
| "grad_norm": 0.5655802488327026, | |
| "learning_rate": 0.000491089108910891, | |
| "loss": 1.9162, | |
| "step": 7540 | |
| }, | |
| { | |
| "epoch": 5.5, | |
| "grad_norm": 0.4693652391433716, | |
| "learning_rate": 0.0004900990099009901, | |
| "loss": 1.9172, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 5.51, | |
| "grad_norm": 0.43454521894454956, | |
| "learning_rate": 0.0004891089108910892, | |
| "loss": 1.9143, | |
| "step": 7560 | |
| }, | |
| { | |
| "epoch": 5.51, | |
| "eval_accuracy": 0.6173679783746377, | |
| "eval_loss": 1.7775607109069824, | |
| "eval_runtime": 1086.5491, | |
| "eval_samples_per_second": 459.602, | |
| "eval_steps_per_second": 2.052, | |
| "step": 7560 | |
| }, | |
| { | |
| "epoch": 5.52, | |
| "grad_norm": 0.5520709156990051, | |
| "learning_rate": 0.0004881188118811881, | |
| "loss": 1.9129, | |
| "step": 7570 | |
| }, | |
| { | |
| "epoch": 5.52, | |
| "grad_norm": 0.5848320722579956, | |
| "learning_rate": 0.00048712871287128715, | |
| "loss": 1.9157, | |
| "step": 7580 | |
| }, | |
| { | |
| "epoch": 5.53, | |
| "grad_norm": 0.4443696141242981, | |
| "learning_rate": 0.00048613861386138615, | |
| "loss": 1.9129, | |
| "step": 7590 | |
| }, | |
| { | |
| "epoch": 5.54, | |
| "grad_norm": 0.4639554023742676, | |
| "learning_rate": 0.00048514851485148515, | |
| "loss": 1.9118, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 5.54, | |
| "grad_norm": 0.4965130090713501, | |
| "learning_rate": 0.00048415841584158414, | |
| "loss": 1.911, | |
| "step": 7610 | |
| }, | |
| { | |
| "epoch": 5.55, | |
| "grad_norm": 0.5710552334785461, | |
| "learning_rate": 0.00048316831683168314, | |
| "loss": 1.9133, | |
| "step": 7620 | |
| }, | |
| { | |
| "epoch": 5.56, | |
| "grad_norm": 0.5551273226737976, | |
| "learning_rate": 0.0004821782178217822, | |
| "loss": 1.9115, | |
| "step": 7630 | |
| }, | |
| { | |
| "epoch": 5.57, | |
| "grad_norm": 0.4237355887889862, | |
| "learning_rate": 0.0004811881188118812, | |
| "loss": 1.9107, | |
| "step": 7640 | |
| }, | |
| { | |
| "epoch": 5.57, | |
| "grad_norm": 0.5999632477760315, | |
| "learning_rate": 0.0004801980198019802, | |
| "loss": 1.9125, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 5.57, | |
| "eval_accuracy": 0.6181112629292015, | |
| "eval_loss": 1.7737356424331665, | |
| "eval_runtime": 1089.6084, | |
| "eval_samples_per_second": 458.311, | |
| "eval_steps_per_second": 2.047, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 5.58, | |
| "grad_norm": 0.4806526303291321, | |
| "learning_rate": 0.0004792079207920792, | |
| "loss": 1.9102, | |
| "step": 7660 | |
| }, | |
| { | |
| "epoch": 5.59, | |
| "grad_norm": 0.4812794327735901, | |
| "learning_rate": 0.0004782178217821782, | |
| "loss": 1.9071, | |
| "step": 7670 | |
| }, | |
| { | |
| "epoch": 5.6, | |
| "grad_norm": 0.4029771089553833, | |
| "learning_rate": 0.00047722772277227724, | |
| "loss": 1.9081, | |
| "step": 7680 | |
| }, | |
| { | |
| "epoch": 5.6, | |
| "grad_norm": 0.40626466274261475, | |
| "learning_rate": 0.00047623762376237624, | |
| "loss": 1.9113, | |
| "step": 7690 | |
| }, | |
| { | |
| "epoch": 5.61, | |
| "grad_norm": 0.4513317942619324, | |
| "learning_rate": 0.00047524752475247524, | |
| "loss": 1.9088, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 5.62, | |
| "grad_norm": 0.4461658000946045, | |
| "learning_rate": 0.00047425742574257423, | |
| "loss": 1.9054, | |
| "step": 7710 | |
| }, | |
| { | |
| "epoch": 5.62, | |
| "grad_norm": 0.4517589807510376, | |
| "learning_rate": 0.00047326732673267323, | |
| "loss": 1.9082, | |
| "step": 7720 | |
| }, | |
| { | |
| "epoch": 5.63, | |
| "grad_norm": 0.45047953724861145, | |
| "learning_rate": 0.0004722772277227723, | |
| "loss": 1.9064, | |
| "step": 7730 | |
| }, | |
| { | |
| "epoch": 5.64, | |
| "grad_norm": 0.4856921434402466, | |
| "learning_rate": 0.0004712871287128713, | |
| "loss": 1.9075, | |
| "step": 7740 | |
| }, | |
| { | |
| "epoch": 5.64, | |
| "eval_accuracy": 0.6187679819070717, | |
| "eval_loss": 1.7702995538711548, | |
| "eval_runtime": 1084.6666, | |
| "eval_samples_per_second": 460.4, | |
| "eval_steps_per_second": 2.056, | |
| "step": 7740 | |
| }, | |
| { | |
| "epoch": 5.65, | |
| "grad_norm": 0.4753153920173645, | |
| "learning_rate": 0.0004702970297029703, | |
| "loss": 1.9063, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 5.65, | |
| "grad_norm": 0.45308151841163635, | |
| "learning_rate": 0.0004693069306930693, | |
| "loss": 1.9077, | |
| "step": 7760 | |
| }, | |
| { | |
| "epoch": 5.66, | |
| "grad_norm": 0.4464881718158722, | |
| "learning_rate": 0.00046831683168316833, | |
| "loss": 1.9038, | |
| "step": 7770 | |
| }, | |
| { | |
| "epoch": 5.67, | |
| "grad_norm": 0.49109694361686707, | |
| "learning_rate": 0.0004673267326732674, | |
| "loss": 1.9041, | |
| "step": 7780 | |
| }, | |
| { | |
| "epoch": 5.68, | |
| "grad_norm": 0.45286795496940613, | |
| "learning_rate": 0.0004663366336633664, | |
| "loss": 1.9028, | |
| "step": 7790 | |
| }, | |
| { | |
| "epoch": 5.68, | |
| "grad_norm": 0.4543616771697998, | |
| "learning_rate": 0.0004653465346534654, | |
| "loss": 1.9055, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 5.69, | |
| "grad_norm": 0.45107975602149963, | |
| "learning_rate": 0.0004643564356435644, | |
| "loss": 1.905, | |
| "step": 7810 | |
| }, | |
| { | |
| "epoch": 5.7, | |
| "grad_norm": 0.4717351198196411, | |
| "learning_rate": 0.0004633663366336634, | |
| "loss": 1.9035, | |
| "step": 7820 | |
| }, | |
| { | |
| "epoch": 5.71, | |
| "grad_norm": 0.47570666670799255, | |
| "learning_rate": 0.00046237623762376243, | |
| "loss": 1.9023, | |
| "step": 7830 | |
| }, | |
| { | |
| "epoch": 5.71, | |
| "eval_accuracy": 0.619487519380987, | |
| "eval_loss": 1.764754295349121, | |
| "eval_runtime": 1085.4383, | |
| "eval_samples_per_second": 460.072, | |
| "eval_steps_per_second": 2.054, | |
| "step": 7830 | |
| }, | |
| { | |
| "epoch": 5.71, | |
| "grad_norm": 0.4219855070114136, | |
| "learning_rate": 0.00046138613861386143, | |
| "loss": 1.9023, | |
| "step": 7840 | |
| }, | |
| { | |
| "epoch": 5.72, | |
| "grad_norm": 0.43669965863227844, | |
| "learning_rate": 0.0004603960396039604, | |
| "loss": 1.9027, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 5.73, | |
| "grad_norm": 0.3779612183570862, | |
| "learning_rate": 0.0004594059405940594, | |
| "loss": 1.8983, | |
| "step": 7860 | |
| }, | |
| { | |
| "epoch": 5.73, | |
| "grad_norm": 0.42528143525123596, | |
| "learning_rate": 0.0004584158415841584, | |
| "loss": 1.9003, | |
| "step": 7870 | |
| }, | |
| { | |
| "epoch": 5.74, | |
| "grad_norm": 0.5401535630226135, | |
| "learning_rate": 0.0004574257425742575, | |
| "loss": 1.902, | |
| "step": 7880 | |
| }, | |
| { | |
| "epoch": 5.75, | |
| "grad_norm": 0.5040842890739441, | |
| "learning_rate": 0.00045643564356435647, | |
| "loss": 1.898, | |
| "step": 7890 | |
| }, | |
| { | |
| "epoch": 5.76, | |
| "grad_norm": 0.40838822722435, | |
| "learning_rate": 0.00045544554455445547, | |
| "loss": 1.9009, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 5.76, | |
| "grad_norm": 0.4518304169178009, | |
| "learning_rate": 0.00045445544554455447, | |
| "loss": 1.8971, | |
| "step": 7910 | |
| }, | |
| { | |
| "epoch": 5.77, | |
| "grad_norm": 0.4375505745410919, | |
| "learning_rate": 0.00045346534653465347, | |
| "loss": 1.8968, | |
| "step": 7920 | |
| }, | |
| { | |
| "epoch": 5.77, | |
| "eval_accuracy": 0.6199796521907708, | |
| "eval_loss": 1.7632313966751099, | |
| "eval_runtime": 1087.3022, | |
| "eval_samples_per_second": 459.284, | |
| "eval_steps_per_second": 2.051, | |
| "step": 7920 | |
| }, | |
| { | |
| "epoch": 5.78, | |
| "grad_norm": 0.4348089098930359, | |
| "learning_rate": 0.0004524752475247525, | |
| "loss": 1.897, | |
| "step": 7930 | |
| }, | |
| { | |
| "epoch": 5.79, | |
| "grad_norm": 0.4852411150932312, | |
| "learning_rate": 0.0004514851485148515, | |
| "loss": 1.8979, | |
| "step": 7940 | |
| }, | |
| { | |
| "epoch": 5.79, | |
| "grad_norm": 0.3691408932209015, | |
| "learning_rate": 0.0004504950495049505, | |
| "loss": 1.8952, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 5.8, | |
| "grad_norm": 0.40795382857322693, | |
| "learning_rate": 0.0004495049504950495, | |
| "loss": 1.8976, | |
| "step": 7960 | |
| }, | |
| { | |
| "epoch": 5.81, | |
| "grad_norm": 0.5768758058547974, | |
| "learning_rate": 0.0004485148514851485, | |
| "loss": 1.9002, | |
| "step": 7970 | |
| }, | |
| { | |
| "epoch": 5.81, | |
| "grad_norm": 0.47147759795188904, | |
| "learning_rate": 0.00044752475247524756, | |
| "loss": 1.8962, | |
| "step": 7980 | |
| }, | |
| { | |
| "epoch": 5.82, | |
| "grad_norm": 0.4033481478691101, | |
| "learning_rate": 0.00044653465346534656, | |
| "loss": 1.8902, | |
| "step": 7990 | |
| }, | |
| { | |
| "epoch": 5.83, | |
| "grad_norm": 0.474514365196228, | |
| "learning_rate": 0.00044554455445544556, | |
| "loss": 1.8914, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 5.84, | |
| "grad_norm": 0.4343509376049042, | |
| "learning_rate": 0.00044455445544554456, | |
| "loss": 1.8909, | |
| "step": 8010 | |
| }, | |
| { | |
| "epoch": 5.84, | |
| "eval_accuracy": 0.6212259728987508, | |
| "eval_loss": 1.755420207977295, | |
| "eval_runtime": 1084.9224, | |
| "eval_samples_per_second": 460.291, | |
| "eval_steps_per_second": 2.055, | |
| "step": 8010 | |
| }, | |
| { | |
| "epoch": 5.84, | |
| "grad_norm": 0.435395210981369, | |
| "learning_rate": 0.00044356435643564356, | |
| "loss": 1.8907, | |
| "step": 8020 | |
| }, | |
| { | |
| "epoch": 5.85, | |
| "grad_norm": 0.48715683817863464, | |
| "learning_rate": 0.0004425742574257426, | |
| "loss": 1.8894, | |
| "step": 8030 | |
| }, | |
| { | |
| "epoch": 5.86, | |
| "grad_norm": 0.4001710116863251, | |
| "learning_rate": 0.0004415841584158416, | |
| "loss": 1.89, | |
| "step": 8040 | |
| }, | |
| { | |
| "epoch": 5.87, | |
| "grad_norm": 0.38079318404197693, | |
| "learning_rate": 0.0004405940594059406, | |
| "loss": 1.8893, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 5.87, | |
| "grad_norm": 0.5229191780090332, | |
| "learning_rate": 0.0004396039603960396, | |
| "loss": 1.8916, | |
| "step": 8060 | |
| }, | |
| { | |
| "epoch": 5.88, | |
| "grad_norm": 0.4705289900302887, | |
| "learning_rate": 0.0004386138613861386, | |
| "loss": 1.891, | |
| "step": 8070 | |
| }, | |
| { | |
| "epoch": 5.89, | |
| "grad_norm": 0.4324556589126587, | |
| "learning_rate": 0.00043762376237623765, | |
| "loss": 1.8872, | |
| "step": 8080 | |
| }, | |
| { | |
| "epoch": 5.89, | |
| "grad_norm": 0.4106965959072113, | |
| "learning_rate": 0.00043663366336633665, | |
| "loss": 1.8861, | |
| "step": 8090 | |
| }, | |
| { | |
| "epoch": 5.9, | |
| "grad_norm": 0.461008220911026, | |
| "learning_rate": 0.00043564356435643565, | |
| "loss": 1.8903, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 5.9, | |
| "eval_accuracy": 0.6217799147539793, | |
| "eval_loss": 1.751676082611084, | |
| "eval_runtime": 1084.7892, | |
| "eval_samples_per_second": 460.348, | |
| "eval_steps_per_second": 2.056, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 5.91, | |
| "grad_norm": 0.4489499032497406, | |
| "learning_rate": 0.00043465346534653465, | |
| "loss": 1.8894, | |
| "step": 8110 | |
| }, | |
| { | |
| "epoch": 5.92, | |
| "grad_norm": 0.40970441699028015, | |
| "learning_rate": 0.00043366336633663365, | |
| "loss": 1.8858, | |
| "step": 8120 | |
| }, | |
| { | |
| "epoch": 5.92, | |
| "grad_norm": 0.397197425365448, | |
| "learning_rate": 0.0004326732673267327, | |
| "loss": 1.8866, | |
| "step": 8130 | |
| }, | |
| { | |
| "epoch": 5.93, | |
| "grad_norm": 0.4046621322631836, | |
| "learning_rate": 0.0004316831683168317, | |
| "loss": 1.8865, | |
| "step": 8140 | |
| }, | |
| { | |
| "epoch": 5.94, | |
| "grad_norm": 0.4488195478916168, | |
| "learning_rate": 0.0004306930693069307, | |
| "loss": 1.8898, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 5.95, | |
| "grad_norm": 0.37480929493904114, | |
| "learning_rate": 0.0004297029702970297, | |
| "loss": 1.883, | |
| "step": 8160 | |
| }, | |
| { | |
| "epoch": 5.95, | |
| "grad_norm": 0.39327338337898254, | |
| "learning_rate": 0.0004287128712871287, | |
| "loss": 1.8837, | |
| "step": 8170 | |
| }, | |
| { | |
| "epoch": 5.96, | |
| "grad_norm": 0.4443519115447998, | |
| "learning_rate": 0.00042772277227722774, | |
| "loss": 1.8825, | |
| "step": 8180 | |
| }, | |
| { | |
| "epoch": 5.97, | |
| "grad_norm": 0.4902293384075165, | |
| "learning_rate": 0.00042673267326732674, | |
| "loss": 1.8837, | |
| "step": 8190 | |
| }, | |
| { | |
| "epoch": 5.97, | |
| "eval_accuracy": 0.622874142241673, | |
| "eval_loss": 1.7469381093978882, | |
| "eval_runtime": 1082.2695, | |
| "eval_samples_per_second": 461.419, | |
| "eval_steps_per_second": 2.06, | |
| "step": 8190 | |
| }, | |
| { | |
| "epoch": 5.97, | |
| "grad_norm": 0.43381059169769287, | |
| "learning_rate": 0.00042574257425742574, | |
| "loss": 1.8812, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 5.98, | |
| "grad_norm": 0.4675629734992981, | |
| "learning_rate": 0.00042475247524752474, | |
| "loss": 1.8808, | |
| "step": 8210 | |
| }, | |
| { | |
| "epoch": 5.99, | |
| "grad_norm": 0.4100710451602936, | |
| "learning_rate": 0.00042376237623762374, | |
| "loss": 1.882, | |
| "step": 8220 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "grad_norm": 0.5555077791213989, | |
| "learning_rate": 0.0004227722772277228, | |
| "loss": 1.8824, | |
| "step": 8230 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "grad_norm": 0.5407902598381042, | |
| "learning_rate": 0.0004217821782178218, | |
| "loss": 1.8813, | |
| "step": 8240 | |
| }, | |
| { | |
| "epoch": 6.01, | |
| "grad_norm": 0.48739898204803467, | |
| "learning_rate": 0.0004207920792079208, | |
| "loss": 1.8782, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 6.02, | |
| "grad_norm": 0.4977239966392517, | |
| "learning_rate": 0.0004198019801980198, | |
| "loss": 1.8801, | |
| "step": 8260 | |
| }, | |
| { | |
| "epoch": 6.03, | |
| "grad_norm": 0.47402721643447876, | |
| "learning_rate": 0.0004188118811881188, | |
| "loss": 1.8811, | |
| "step": 8270 | |
| }, | |
| { | |
| "epoch": 6.03, | |
| "grad_norm": 0.4796026349067688, | |
| "learning_rate": 0.00041782178217821784, | |
| "loss": 1.8801, | |
| "step": 8280 | |
| }, | |
| { | |
| "epoch": 6.03, | |
| "eval_accuracy": 0.6238451571943063, | |
| "eval_loss": 1.741037130355835, | |
| "eval_runtime": 1087.5002, | |
| "eval_samples_per_second": 459.2, | |
| "eval_steps_per_second": 2.051, | |
| "step": 8280 | |
| }, | |
| { | |
| "epoch": 6.04, | |
| "grad_norm": 0.419162392616272, | |
| "learning_rate": 0.00041683168316831683, | |
| "loss": 1.8796, | |
| "step": 8290 | |
| }, | |
| { | |
| "epoch": 6.05, | |
| "grad_norm": 0.409493625164032, | |
| "learning_rate": 0.00041584158415841583, | |
| "loss": 1.8794, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 6.05, | |
| "grad_norm": 0.5155593752861023, | |
| "learning_rate": 0.00041485148514851483, | |
| "loss": 1.8786, | |
| "step": 8310 | |
| }, | |
| { | |
| "epoch": 6.06, | |
| "grad_norm": 0.46643194556236267, | |
| "learning_rate": 0.00041386138613861383, | |
| "loss": 1.8786, | |
| "step": 8320 | |
| }, | |
| { | |
| "epoch": 6.07, | |
| "grad_norm": 0.4528968930244446, | |
| "learning_rate": 0.0004128712871287129, | |
| "loss": 1.8765, | |
| "step": 8330 | |
| }, | |
| { | |
| "epoch": 6.08, | |
| "grad_norm": 0.4780101180076599, | |
| "learning_rate": 0.0004118811881188119, | |
| "loss": 1.8742, | |
| "step": 8340 | |
| }, | |
| { | |
| "epoch": 6.08, | |
| "grad_norm": 0.41187387704849243, | |
| "learning_rate": 0.0004108910891089109, | |
| "loss": 1.8754, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 6.09, | |
| "grad_norm": 0.48807311058044434, | |
| "learning_rate": 0.0004099009900990099, | |
| "loss": 1.877, | |
| "step": 8360 | |
| }, | |
| { | |
| "epoch": 6.1, | |
| "grad_norm": 0.4205549657344818, | |
| "learning_rate": 0.0004089108910891089, | |
| "loss": 1.8769, | |
| "step": 8370 | |
| }, | |
| { | |
| "epoch": 6.1, | |
| "eval_accuracy": 0.6247575321496361, | |
| "eval_loss": 1.7364323139190674, | |
| "eval_runtime": 1085.7284, | |
| "eval_samples_per_second": 459.949, | |
| "eval_steps_per_second": 2.054, | |
| "step": 8370 | |
| }, | |
| { | |
| "epoch": 6.11, | |
| "grad_norm": 0.5422940850257874, | |
| "learning_rate": 0.0004079207920792079, | |
| "loss": 1.8743, | |
| "step": 8380 | |
| }, | |
| { | |
| "epoch": 6.11, | |
| "grad_norm": 0.4061022400856018, | |
| "learning_rate": 0.0004069306930693069, | |
| "loss": 1.8748, | |
| "step": 8390 | |
| }, | |
| { | |
| "epoch": 6.12, | |
| "grad_norm": 0.4819527268409729, | |
| "learning_rate": 0.000405940594059406, | |
| "loss": 1.8734, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 6.13, | |
| "grad_norm": 0.4938518702983856, | |
| "learning_rate": 0.000404950495049505, | |
| "loss": 1.8746, | |
| "step": 8410 | |
| }, | |
| { | |
| "epoch": 6.13, | |
| "grad_norm": 0.39021047949790955, | |
| "learning_rate": 0.00040396039603960397, | |
| "loss": 1.874, | |
| "step": 8420 | |
| }, | |
| { | |
| "epoch": 6.14, | |
| "grad_norm": 0.3815496265888214, | |
| "learning_rate": 0.000402970297029703, | |
| "loss": 1.8722, | |
| "step": 8430 | |
| }, | |
| { | |
| "epoch": 6.15, | |
| "grad_norm": 0.4226173758506775, | |
| "learning_rate": 0.000401980198019802, | |
| "loss": 1.8716, | |
| "step": 8440 | |
| }, | |
| { | |
| "epoch": 6.16, | |
| "grad_norm": 0.4311840236186981, | |
| "learning_rate": 0.000400990099009901, | |
| "loss": 1.8729, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 6.16, | |
| "grad_norm": 0.37644141912460327, | |
| "learning_rate": 0.0004, | |
| "loss": 1.8697, | |
| "step": 8460 | |
| }, | |
| { | |
| "epoch": 6.16, | |
| "eval_accuracy": 0.6258217191475034, | |
| "eval_loss": 1.7313834428787231, | |
| "eval_runtime": 1087.4157, | |
| "eval_samples_per_second": 459.236, | |
| "eval_steps_per_second": 2.051, | |
| "step": 8460 | |
| }, | |
| { | |
| "epoch": 6.17, | |
| "grad_norm": 0.34123849868774414, | |
| "learning_rate": 0.000399009900990099, | |
| "loss": 1.8709, | |
| "step": 8470 | |
| }, | |
| { | |
| "epoch": 6.18, | |
| "grad_norm": 0.5545117259025574, | |
| "learning_rate": 0.00039801980198019807, | |
| "loss": 1.8729, | |
| "step": 8480 | |
| }, | |
| { | |
| "epoch": 6.19, | |
| "grad_norm": 0.44565874338150024, | |
| "learning_rate": 0.00039702970297029707, | |
| "loss": 1.8716, | |
| "step": 8490 | |
| }, | |
| { | |
| "epoch": 6.19, | |
| "grad_norm": 0.4301845133304596, | |
| "learning_rate": 0.00039603960396039607, | |
| "loss": 1.866, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 6.2, | |
| "grad_norm": 0.46877652406692505, | |
| "learning_rate": 0.00039504950495049506, | |
| "loss": 1.8672, | |
| "step": 8510 | |
| }, | |
| { | |
| "epoch": 6.21, | |
| "grad_norm": 0.535370945930481, | |
| "learning_rate": 0.00039405940594059406, | |
| "loss": 1.8712, | |
| "step": 8520 | |
| }, | |
| { | |
| "epoch": 6.22, | |
| "grad_norm": 0.39393341541290283, | |
| "learning_rate": 0.0003930693069306931, | |
| "loss": 1.8671, | |
| "step": 8530 | |
| }, | |
| { | |
| "epoch": 6.22, | |
| "grad_norm": 0.36698782444000244, | |
| "learning_rate": 0.0003920792079207921, | |
| "loss": 1.8685, | |
| "step": 8540 | |
| }, | |
| { | |
| "epoch": 6.23, | |
| "grad_norm": 0.4744206666946411, | |
| "learning_rate": 0.0003910891089108911, | |
| "loss": 1.8673, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 6.23, | |
| "eval_accuracy": 0.6265433755799307, | |
| "eval_loss": 1.7275755405426025, | |
| "eval_runtime": 1085.596, | |
| "eval_samples_per_second": 460.005, | |
| "eval_steps_per_second": 2.054, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 6.24, | |
| "grad_norm": 0.4614261984825134, | |
| "learning_rate": 0.0003900990099009901, | |
| "loss": 1.8631, | |
| "step": 8560 | |
| }, | |
| { | |
| "epoch": 6.24, | |
| "grad_norm": 0.4780130386352539, | |
| "learning_rate": 0.0003891089108910891, | |
| "loss": 1.8658, | |
| "step": 8570 | |
| }, | |
| { | |
| "epoch": 6.25, | |
| "grad_norm": 0.4078359007835388, | |
| "learning_rate": 0.00038811881188118816, | |
| "loss": 1.8644, | |
| "step": 8580 | |
| }, | |
| { | |
| "epoch": 6.26, | |
| "grad_norm": 0.4950817823410034, | |
| "learning_rate": 0.00038712871287128716, | |
| "loss": 1.8631, | |
| "step": 8590 | |
| }, | |
| { | |
| "epoch": 6.27, | |
| "grad_norm": 0.4473728835582733, | |
| "learning_rate": 0.00038613861386138616, | |
| "loss": 1.862, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 6.27, | |
| "grad_norm": 0.41996896266937256, | |
| "learning_rate": 0.00038514851485148515, | |
| "loss": 1.8609, | |
| "step": 8610 | |
| }, | |
| { | |
| "epoch": 6.28, | |
| "grad_norm": 0.4896424114704132, | |
| "learning_rate": 0.00038415841584158415, | |
| "loss": 1.8639, | |
| "step": 8620 | |
| }, | |
| { | |
| "epoch": 6.29, | |
| "grad_norm": 0.5288357138633728, | |
| "learning_rate": 0.0003831683168316832, | |
| "loss": 1.8629, | |
| "step": 8630 | |
| }, | |
| { | |
| "epoch": 6.3, | |
| "grad_norm": 0.414982408285141, | |
| "learning_rate": 0.0003821782178217822, | |
| "loss": 1.8611, | |
| "step": 8640 | |
| }, | |
| { | |
| "epoch": 6.3, | |
| "eval_accuracy": 0.6280281595524545, | |
| "eval_loss": 1.7198432683944702, | |
| "eval_runtime": 1085.3535, | |
| "eval_samples_per_second": 460.108, | |
| "eval_steps_per_second": 2.055, | |
| "step": 8640 | |
| }, | |
| { | |
| "epoch": 6.3, | |
| "grad_norm": 0.45578229427337646, | |
| "learning_rate": 0.0003811881188118812, | |
| "loss": 1.8598, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 6.31, | |
| "grad_norm": 0.4705806374549866, | |
| "learning_rate": 0.0003801980198019802, | |
| "loss": 1.8606, | |
| "step": 8660 | |
| }, | |
| { | |
| "epoch": 6.32, | |
| "grad_norm": 0.40844404697418213, | |
| "learning_rate": 0.0003792079207920792, | |
| "loss": 1.8597, | |
| "step": 8670 | |
| }, | |
| { | |
| "epoch": 6.32, | |
| "grad_norm": 0.3853258490562439, | |
| "learning_rate": 0.00037821782178217825, | |
| "loss": 1.8582, | |
| "step": 8680 | |
| }, | |
| { | |
| "epoch": 6.33, | |
| "grad_norm": 0.4357406198978424, | |
| "learning_rate": 0.00037722772277227725, | |
| "loss": 1.8595, | |
| "step": 8690 | |
| }, | |
| { | |
| "epoch": 6.34, | |
| "grad_norm": 0.5262021422386169, | |
| "learning_rate": 0.00037623762376237625, | |
| "loss": 1.858, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 6.35, | |
| "grad_norm": 0.5264425873756409, | |
| "learning_rate": 0.00037524752475247524, | |
| "loss": 1.8587, | |
| "step": 8710 | |
| }, | |
| { | |
| "epoch": 6.35, | |
| "grad_norm": 0.45019951462745667, | |
| "learning_rate": 0.00037425742574257424, | |
| "loss": 1.8566, | |
| "step": 8720 | |
| }, | |
| { | |
| "epoch": 6.36, | |
| "grad_norm": 0.41189640760421753, | |
| "learning_rate": 0.0003732673267326733, | |
| "loss": 1.8572, | |
| "step": 8730 | |
| }, | |
| { | |
| "epoch": 6.36, | |
| "eval_accuracy": 0.6290112373666048, | |
| "eval_loss": 1.7162350416183472, | |
| "eval_runtime": 1085.09, | |
| "eval_samples_per_second": 460.22, | |
| "eval_steps_per_second": 2.055, | |
| "step": 8730 | |
| }, | |
| { | |
| "epoch": 6.37, | |
| "grad_norm": 0.4982648491859436, | |
| "learning_rate": 0.0003722772277227723, | |
| "loss": 1.8546, | |
| "step": 8740 | |
| }, | |
| { | |
| "epoch": 6.38, | |
| "grad_norm": 0.3992471694946289, | |
| "learning_rate": 0.0003712871287128713, | |
| "loss": 1.8537, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 6.38, | |
| "grad_norm": 0.46979019045829773, | |
| "learning_rate": 0.0003702970297029703, | |
| "loss": 1.8566, | |
| "step": 8760 | |
| }, | |
| { | |
| "epoch": 6.39, | |
| "grad_norm": 0.5312979817390442, | |
| "learning_rate": 0.0003693069306930693, | |
| "loss": 1.8528, | |
| "step": 8770 | |
| }, | |
| { | |
| "epoch": 6.4, | |
| "grad_norm": 0.4439733624458313, | |
| "learning_rate": 0.00036831683168316834, | |
| "loss": 1.855, | |
| "step": 8780 | |
| }, | |
| { | |
| "epoch": 6.4, | |
| "grad_norm": 0.39169180393218994, | |
| "learning_rate": 0.00036732673267326734, | |
| "loss": 1.8528, | |
| "step": 8790 | |
| }, | |
| { | |
| "epoch": 6.41, | |
| "grad_norm": 0.41544708609580994, | |
| "learning_rate": 0.00036633663366336634, | |
| "loss": 1.8508, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 6.42, | |
| "grad_norm": 0.4738489091396332, | |
| "learning_rate": 0.00036534653465346533, | |
| "loss": 1.8515, | |
| "step": 8810 | |
| }, | |
| { | |
| "epoch": 6.43, | |
| "grad_norm": 0.4519752860069275, | |
| "learning_rate": 0.00036435643564356433, | |
| "loss": 1.8507, | |
| "step": 8820 | |
| }, | |
| { | |
| "epoch": 6.43, | |
| "eval_accuracy": 0.6307912179674288, | |
| "eval_loss": 1.707141399383545, | |
| "eval_runtime": 1087.4605, | |
| "eval_samples_per_second": 459.217, | |
| "eval_steps_per_second": 2.051, | |
| "step": 8820 | |
| }, | |
| { | |
| "epoch": 6.43, | |
| "grad_norm": 0.382656991481781, | |
| "learning_rate": 0.0003633663366336634, | |
| "loss": 1.8487, | |
| "step": 8830 | |
| }, | |
| { | |
| "epoch": 6.44, | |
| "grad_norm": 0.4599255919456482, | |
| "learning_rate": 0.0003623762376237624, | |
| "loss": 1.8506, | |
| "step": 8840 | |
| }, | |
| { | |
| "epoch": 6.45, | |
| "grad_norm": 0.4658602476119995, | |
| "learning_rate": 0.0003613861386138614, | |
| "loss": 1.8496, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 6.46, | |
| "grad_norm": 0.43937841057777405, | |
| "learning_rate": 0.0003603960396039604, | |
| "loss": 1.8504, | |
| "step": 8860 | |
| }, | |
| { | |
| "epoch": 6.46, | |
| "grad_norm": 0.4001791477203369, | |
| "learning_rate": 0.0003594059405940594, | |
| "loss": 1.8479, | |
| "step": 8870 | |
| }, | |
| { | |
| "epoch": 6.47, | |
| "grad_norm": 0.47235003113746643, | |
| "learning_rate": 0.00035841584158415843, | |
| "loss": 1.8462, | |
| "step": 8880 | |
| }, | |
| { | |
| "epoch": 6.48, | |
| "grad_norm": 0.42399758100509644, | |
| "learning_rate": 0.00035742574257425743, | |
| "loss": 1.8483, | |
| "step": 8890 | |
| }, | |
| { | |
| "epoch": 6.48, | |
| "grad_norm": 0.3666219413280487, | |
| "learning_rate": 0.0003564356435643564, | |
| "loss": 1.8462, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 6.49, | |
| "grad_norm": 0.4140106737613678, | |
| "learning_rate": 0.0003554455445544554, | |
| "loss": 1.8447, | |
| "step": 8910 | |
| }, | |
| { | |
| "epoch": 6.49, | |
| "eval_accuracy": 0.6324917716108477, | |
| "eval_loss": 1.6985867023468018, | |
| "eval_runtime": 1087.0314, | |
| "eval_samples_per_second": 459.398, | |
| "eval_steps_per_second": 2.051, | |
| "step": 8910 | |
| }, | |
| { | |
| "epoch": 6.5, | |
| "grad_norm": 0.4904990792274475, | |
| "learning_rate": 0.0003544554455445544, | |
| "loss": 1.8451, | |
| "step": 8920 | |
| }, | |
| { | |
| "epoch": 6.51, | |
| "grad_norm": 0.43653395771980286, | |
| "learning_rate": 0.0003534653465346535, | |
| "loss": 1.8434, | |
| "step": 8930 | |
| }, | |
| { | |
| "epoch": 6.51, | |
| "grad_norm": 0.4815143942832947, | |
| "learning_rate": 0.0003524752475247525, | |
| "loss": 1.8405, | |
| "step": 8940 | |
| }, | |
| { | |
| "epoch": 6.52, | |
| "grad_norm": 0.5665289759635925, | |
| "learning_rate": 0.00035148514851485147, | |
| "loss": 1.8426, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 6.53, | |
| "grad_norm": 0.45166435837745667, | |
| "learning_rate": 0.00035049504950495047, | |
| "loss": 1.8431, | |
| "step": 8960 | |
| }, | |
| { | |
| "epoch": 6.54, | |
| "grad_norm": 0.45748040080070496, | |
| "learning_rate": 0.00034950495049504947, | |
| "loss": 1.8434, | |
| "step": 8970 | |
| }, | |
| { | |
| "epoch": 6.54, | |
| "grad_norm": 0.42432862520217896, | |
| "learning_rate": 0.0003485148514851485, | |
| "loss": 1.8408, | |
| "step": 8980 | |
| }, | |
| { | |
| "epoch": 6.55, | |
| "grad_norm": 0.3683374226093292, | |
| "learning_rate": 0.0003475247524752475, | |
| "loss": 1.8388, | |
| "step": 8990 | |
| }, | |
| { | |
| "epoch": 6.56, | |
| "grad_norm": 0.4106079041957855, | |
| "learning_rate": 0.0003465346534653465, | |
| "loss": 1.839, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 6.56, | |
| "eval_accuracy": 0.6338676239837059, | |
| "eval_loss": 1.6945050954818726, | |
| "eval_runtime": 1084.8256, | |
| "eval_samples_per_second": 460.332, | |
| "eval_steps_per_second": 2.056, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 6.56, | |
| "grad_norm": 0.46325406432151794, | |
| "learning_rate": 0.0003455445544554455, | |
| "loss": 1.8383, | |
| "step": 9010 | |
| }, | |
| { | |
| "epoch": 6.57, | |
| "grad_norm": 0.4024347960948944, | |
| "learning_rate": 0.0003445544554455445, | |
| "loss": 1.8397, | |
| "step": 9020 | |
| }, | |
| { | |
| "epoch": 6.58, | |
| "grad_norm": 0.5055080652236938, | |
| "learning_rate": 0.0003435643564356436, | |
| "loss": 1.8386, | |
| "step": 9030 | |
| }, | |
| { | |
| "epoch": 6.59, | |
| "grad_norm": 0.38540130853652954, | |
| "learning_rate": 0.0003425742574257426, | |
| "loss": 1.8381, | |
| "step": 9040 | |
| }, | |
| { | |
| "epoch": 6.59, | |
| "grad_norm": 0.42219218611717224, | |
| "learning_rate": 0.0003415841584158416, | |
| "loss": 1.8379, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 6.6, | |
| "grad_norm": 0.37353622913360596, | |
| "learning_rate": 0.0003405940594059406, | |
| "loss": 1.8354, | |
| "step": 9060 | |
| }, | |
| { | |
| "epoch": 6.61, | |
| "grad_norm": 0.4756326973438263, | |
| "learning_rate": 0.0003396039603960396, | |
| "loss": 1.8345, | |
| "step": 9070 | |
| }, | |
| { | |
| "epoch": 6.62, | |
| "grad_norm": 0.44085556268692017, | |
| "learning_rate": 0.00033861386138613867, | |
| "loss": 1.8355, | |
| "step": 9080 | |
| }, | |
| { | |
| "epoch": 6.62, | |
| "grad_norm": 0.4287554919719696, | |
| "learning_rate": 0.00033762376237623766, | |
| "loss": 1.8345, | |
| "step": 9090 | |
| }, | |
| { | |
| "epoch": 6.62, | |
| "eval_accuracy": 0.6351918568609488, | |
| "eval_loss": 1.6874170303344727, | |
| "eval_runtime": 1085.5989, | |
| "eval_samples_per_second": 460.004, | |
| "eval_steps_per_second": 2.054, | |
| "step": 9090 | |
| }, | |
| { | |
| "epoch": 6.63, | |
| "grad_norm": 0.4309207499027252, | |
| "learning_rate": 0.00033663366336633666, | |
| "loss": 1.8336, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 6.64, | |
| "grad_norm": 0.41971975564956665, | |
| "learning_rate": 0.00033564356435643566, | |
| "loss": 1.83, | |
| "step": 9110 | |
| }, | |
| { | |
| "epoch": 6.64, | |
| "grad_norm": 0.4418039619922638, | |
| "learning_rate": 0.00033465346534653466, | |
| "loss": 1.829, | |
| "step": 9120 | |
| }, | |
| { | |
| "epoch": 6.65, | |
| "grad_norm": 0.44311702251434326, | |
| "learning_rate": 0.0003336633663366337, | |
| "loss": 1.8337, | |
| "step": 9130 | |
| }, | |
| { | |
| "epoch": 6.66, | |
| "grad_norm": 0.46464502811431885, | |
| "learning_rate": 0.0003326732673267327, | |
| "loss": 1.835, | |
| "step": 9140 | |
| }, | |
| { | |
| "epoch": 6.67, | |
| "grad_norm": 0.40370500087738037, | |
| "learning_rate": 0.0003316831683168317, | |
| "loss": 1.8304, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 6.67, | |
| "grad_norm": 0.37608906626701355, | |
| "learning_rate": 0.0003306930693069307, | |
| "loss": 1.8312, | |
| "step": 9160 | |
| }, | |
| { | |
| "epoch": 6.68, | |
| "grad_norm": 0.3990706503391266, | |
| "learning_rate": 0.0003297029702970297, | |
| "loss": 1.8313, | |
| "step": 9170 | |
| }, | |
| { | |
| "epoch": 6.69, | |
| "grad_norm": 0.5069761872291565, | |
| "learning_rate": 0.00032871287128712876, | |
| "loss": 1.8292, | |
| "step": 9180 | |
| }, | |
| { | |
| "epoch": 6.69, | |
| "eval_accuracy": 0.6364002620555558, | |
| "eval_loss": 1.6848387718200684, | |
| "eval_runtime": 1085.7505, | |
| "eval_samples_per_second": 459.94, | |
| "eval_steps_per_second": 2.054, | |
| "step": 9180 | |
| }, | |
| { | |
| "epoch": 6.7, | |
| "grad_norm": 0.450114905834198, | |
| "learning_rate": 0.00032772277227722775, | |
| "loss": 1.8279, | |
| "step": 9190 | |
| }, | |
| { | |
| "epoch": 6.7, | |
| "grad_norm": 0.4129829704761505, | |
| "learning_rate": 0.00032673267326732675, | |
| "loss": 1.8282, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 6.71, | |
| "grad_norm": 0.4841521084308624, | |
| "learning_rate": 0.00032574257425742575, | |
| "loss": 1.8282, | |
| "step": 9210 | |
| }, | |
| { | |
| "epoch": 6.72, | |
| "grad_norm": 0.5160727500915527, | |
| "learning_rate": 0.00032475247524752475, | |
| "loss": 1.8269, | |
| "step": 9220 | |
| }, | |
| { | |
| "epoch": 6.73, | |
| "grad_norm": 0.5048640370368958, | |
| "learning_rate": 0.0003237623762376238, | |
| "loss": 1.8268, | |
| "step": 9230 | |
| }, | |
| { | |
| "epoch": 6.73, | |
| "grad_norm": 0.584237813949585, | |
| "learning_rate": 0.0003227722772277228, | |
| "loss": 1.8295, | |
| "step": 9240 | |
| }, | |
| { | |
| "epoch": 6.74, | |
| "grad_norm": 0.43098556995391846, | |
| "learning_rate": 0.0003217821782178218, | |
| "loss": 1.8249, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 6.75, | |
| "grad_norm": 0.3972054421901703, | |
| "learning_rate": 0.0003207920792079208, | |
| "loss": 1.8246, | |
| "step": 9260 | |
| }, | |
| { | |
| "epoch": 6.75, | |
| "grad_norm": 0.3777482211589813, | |
| "learning_rate": 0.0003198019801980198, | |
| "loss": 1.8243, | |
| "step": 9270 | |
| }, | |
| { | |
| "epoch": 6.75, | |
| "eval_accuracy": 0.6379164581645916, | |
| "eval_loss": 1.6772228479385376, | |
| "eval_runtime": 1085.6894, | |
| "eval_samples_per_second": 459.966, | |
| "eval_steps_per_second": 2.054, | |
| "step": 9270 | |
| }, | |
| { | |
| "epoch": 6.76, | |
| "grad_norm": 0.42633891105651855, | |
| "learning_rate": 0.00031881188118811885, | |
| "loss": 1.8221, | |
| "step": 9280 | |
| }, | |
| { | |
| "epoch": 6.77, | |
| "grad_norm": 0.5893245339393616, | |
| "learning_rate": 0.00031782178217821784, | |
| "loss": 1.8243, | |
| "step": 9290 | |
| }, | |
| { | |
| "epoch": 6.78, | |
| "grad_norm": 0.5304878354072571, | |
| "learning_rate": 0.00031683168316831684, | |
| "loss": 1.8242, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 6.78, | |
| "grad_norm": 0.4657045304775238, | |
| "learning_rate": 0.00031584158415841584, | |
| "loss": 1.8242, | |
| "step": 9310 | |
| }, | |
| { | |
| "epoch": 6.79, | |
| "grad_norm": 0.43276962637901306, | |
| "learning_rate": 0.00031485148514851484, | |
| "loss": 1.8225, | |
| "step": 9320 | |
| }, | |
| { | |
| "epoch": 6.8, | |
| "grad_norm": 0.40515634417533875, | |
| "learning_rate": 0.0003138613861386139, | |
| "loss": 1.8228, | |
| "step": 9330 | |
| }, | |
| { | |
| "epoch": 6.81, | |
| "grad_norm": 0.40448087453842163, | |
| "learning_rate": 0.0003128712871287129, | |
| "loss": 1.8198, | |
| "step": 9340 | |
| }, | |
| { | |
| "epoch": 6.81, | |
| "grad_norm": 0.4380359351634979, | |
| "learning_rate": 0.0003118811881188119, | |
| "loss": 1.8201, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 6.82, | |
| "grad_norm": 0.46539106965065, | |
| "learning_rate": 0.0003108910891089109, | |
| "loss": 1.8217, | |
| "step": 9360 | |
| }, | |
| { | |
| "epoch": 6.82, | |
| "eval_accuracy": 0.6389200973153136, | |
| "eval_loss": 1.6709976196289062, | |
| "eval_runtime": 1085.3788, | |
| "eval_samples_per_second": 460.097, | |
| "eval_steps_per_second": 2.055, | |
| "step": 9360 | |
| }, | |
| { | |
| "epoch": 6.83, | |
| "grad_norm": 0.4772910177707672, | |
| "learning_rate": 0.0003099009900990099, | |
| "loss": 1.8202, | |
| "step": 9370 | |
| }, | |
| { | |
| "epoch": 6.83, | |
| "grad_norm": 0.42799142003059387, | |
| "learning_rate": 0.00030891089108910894, | |
| "loss": 1.819, | |
| "step": 9380 | |
| }, | |
| { | |
| "epoch": 6.84, | |
| "grad_norm": 0.40562522411346436, | |
| "learning_rate": 0.00030792079207920793, | |
| "loss": 1.8208, | |
| "step": 9390 | |
| }, | |
| { | |
| "epoch": 6.85, | |
| "grad_norm": 0.6129370927810669, | |
| "learning_rate": 0.00030693069306930693, | |
| "loss": 1.8158, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 6.86, | |
| "grad_norm": 0.4654337465763092, | |
| "learning_rate": 0.00030594059405940593, | |
| "loss": 1.8175, | |
| "step": 9410 | |
| }, | |
| { | |
| "epoch": 6.86, | |
| "grad_norm": 0.4340899884700775, | |
| "learning_rate": 0.00030495049504950493, | |
| "loss": 1.8156, | |
| "step": 9420 | |
| }, | |
| { | |
| "epoch": 6.87, | |
| "grad_norm": 0.4216438829898834, | |
| "learning_rate": 0.000303960396039604, | |
| "loss": 1.8182, | |
| "step": 9430 | |
| }, | |
| { | |
| "epoch": 6.88, | |
| "grad_norm": 0.3995111584663391, | |
| "learning_rate": 0.000302970297029703, | |
| "loss": 1.8175, | |
| "step": 9440 | |
| }, | |
| { | |
| "epoch": 6.89, | |
| "grad_norm": 0.4166571795940399, | |
| "learning_rate": 0.000301980198019802, | |
| "loss": 1.8171, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 6.89, | |
| "eval_accuracy": 0.6397325747268708, | |
| "eval_loss": 1.6698857545852661, | |
| "eval_runtime": 1086.4276, | |
| "eval_samples_per_second": 459.653, | |
| "eval_steps_per_second": 2.053, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 6.89, | |
| "grad_norm": 0.4397519528865814, | |
| "learning_rate": 0.000300990099009901, | |
| "loss": 1.8171, | |
| "step": 9460 | |
| }, | |
| { | |
| "epoch": 6.9, | |
| "grad_norm": 0.4201519787311554, | |
| "learning_rate": 0.0003, | |
| "loss": 1.8166, | |
| "step": 9470 | |
| }, | |
| { | |
| "epoch": 6.91, | |
| "grad_norm": 0.4194183945655823, | |
| "learning_rate": 0.000299009900990099, | |
| "loss": 1.8137, | |
| "step": 9480 | |
| }, | |
| { | |
| "epoch": 6.91, | |
| "grad_norm": 0.4156358540058136, | |
| "learning_rate": 0.000298019801980198, | |
| "loss": 1.8149, | |
| "step": 9490 | |
| }, | |
| { | |
| "epoch": 6.92, | |
| "grad_norm": 0.43356573581695557, | |
| "learning_rate": 0.000297029702970297, | |
| "loss": 1.8104, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 6.93, | |
| "grad_norm": 0.4354686141014099, | |
| "learning_rate": 0.000296039603960396, | |
| "loss": 1.8131, | |
| "step": 9510 | |
| }, | |
| { | |
| "epoch": 6.94, | |
| "grad_norm": 0.45513659715652466, | |
| "learning_rate": 0.000295049504950495, | |
| "loss": 1.8115, | |
| "step": 9520 | |
| }, | |
| { | |
| "epoch": 6.94, | |
| "grad_norm": 0.39433979988098145, | |
| "learning_rate": 0.00029405940594059407, | |
| "loss": 1.8106, | |
| "step": 9530 | |
| }, | |
| { | |
| "epoch": 6.95, | |
| "grad_norm": 0.5176064372062683, | |
| "learning_rate": 0.00029306930693069307, | |
| "loss": 1.8153, | |
| "step": 9540 | |
| }, | |
| { | |
| "epoch": 6.95, | |
| "eval_accuracy": 0.640691022778346, | |
| "eval_loss": 1.66335129737854, | |
| "eval_runtime": 1082.6564, | |
| "eval_samples_per_second": 461.254, | |
| "eval_steps_per_second": 2.06, | |
| "step": 9540 | |
| }, | |
| { | |
| "epoch": 6.96, | |
| "grad_norm": 0.5153635740280151, | |
| "learning_rate": 0.00029207920792079207, | |
| "loss": 1.8142, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 6.97, | |
| "grad_norm": 0.37104055285453796, | |
| "learning_rate": 0.00029108910891089107, | |
| "loss": 1.812, | |
| "step": 9560 | |
| }, | |
| { | |
| "epoch": 6.97, | |
| "grad_norm": 0.40426042675971985, | |
| "learning_rate": 0.00029009900990099006, | |
| "loss": 1.8119, | |
| "step": 9570 | |
| }, | |
| { | |
| "epoch": 6.98, | |
| "grad_norm": 0.5108228325843811, | |
| "learning_rate": 0.0002891089108910891, | |
| "loss": 1.8131, | |
| "step": 9580 | |
| }, | |
| { | |
| "epoch": 6.99, | |
| "grad_norm": 0.4702747166156769, | |
| "learning_rate": 0.0002881188118811881, | |
| "loss": 1.812, | |
| "step": 9590 | |
| }, | |
| { | |
| "epoch": 6.99, | |
| "grad_norm": 0.3683488965034485, | |
| "learning_rate": 0.0002871287128712871, | |
| "loss": 1.8081, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "grad_norm": 0.42281776666641235, | |
| "learning_rate": 0.0002861386138613861, | |
| "loss": 1.81, | |
| "step": 9610 | |
| }, | |
| { | |
| "epoch": 7.01, | |
| "grad_norm": 0.48128095269203186, | |
| "learning_rate": 0.0002851485148514851, | |
| "loss": 1.81, | |
| "step": 9620 | |
| }, | |
| { | |
| "epoch": 7.02, | |
| "grad_norm": 0.4020933210849762, | |
| "learning_rate": 0.00028415841584158416, | |
| "loss": 1.81, | |
| "step": 9630 | |
| }, | |
| { | |
| "epoch": 7.02, | |
| "eval_accuracy": 0.6416495776980271, | |
| "eval_loss": 1.6598803997039795, | |
| "eval_runtime": 1084.1267, | |
| "eval_samples_per_second": 460.629, | |
| "eval_steps_per_second": 2.057, | |
| "step": 9630 | |
| }, | |
| { | |
| "epoch": 7.02, | |
| "grad_norm": 0.5061802864074707, | |
| "learning_rate": 0.00028316831683168316, | |
| "loss": 1.8091, | |
| "step": 9640 | |
| }, | |
| { | |
| "epoch": 7.03, | |
| "grad_norm": 0.5182695388793945, | |
| "learning_rate": 0.00028217821782178216, | |
| "loss": 1.8101, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 7.04, | |
| "grad_norm": 0.45669490098953247, | |
| "learning_rate": 0.0002811881188118812, | |
| "loss": 1.8059, | |
| "step": 9660 | |
| }, | |
| { | |
| "epoch": 7.05, | |
| "grad_norm": 0.5214717984199524, | |
| "learning_rate": 0.0002801980198019802, | |
| "loss": 1.809, | |
| "step": 9670 | |
| }, | |
| { | |
| "epoch": 7.05, | |
| "grad_norm": 0.46073251962661743, | |
| "learning_rate": 0.00027920792079207926, | |
| "loss": 1.8066, | |
| "step": 9680 | |
| }, | |
| { | |
| "epoch": 7.06, | |
| "grad_norm": 0.3924192190170288, | |
| "learning_rate": 0.00027821782178217826, | |
| "loss": 1.8058, | |
| "step": 9690 | |
| }, | |
| { | |
| "epoch": 7.07, | |
| "grad_norm": 0.42634785175323486, | |
| "learning_rate": 0.00027722772277227726, | |
| "loss": 1.8082, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 7.07, | |
| "grad_norm": 0.46675705909729004, | |
| "learning_rate": 0.00027623762376237626, | |
| "loss": 1.8057, | |
| "step": 9710 | |
| }, | |
| { | |
| "epoch": 7.08, | |
| "grad_norm": 0.43609708547592163, | |
| "learning_rate": 0.00027524752475247525, | |
| "loss": 1.8051, | |
| "step": 9720 | |
| }, | |
| { | |
| "epoch": 7.08, | |
| "eval_accuracy": 0.6424563152815425, | |
| "eval_loss": 1.6557390689849854, | |
| "eval_runtime": 1083.0526, | |
| "eval_samples_per_second": 461.086, | |
| "eval_steps_per_second": 2.059, | |
| "step": 9720 | |
| }, | |
| { | |
| "epoch": 7.09, | |
| "grad_norm": 0.35754847526550293, | |
| "learning_rate": 0.0002742574257425743, | |
| "loss": 1.8053, | |
| "step": 9730 | |
| }, | |
| { | |
| "epoch": 7.1, | |
| "grad_norm": 0.38588428497314453, | |
| "learning_rate": 0.0002732673267326733, | |
| "loss": 1.8059, | |
| "step": 9740 | |
| }, | |
| { | |
| "epoch": 7.1, | |
| "grad_norm": 0.4815811514854431, | |
| "learning_rate": 0.0002722772277227723, | |
| "loss": 1.8053, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 7.11, | |
| "grad_norm": 0.4292014241218567, | |
| "learning_rate": 0.0002712871287128713, | |
| "loss": 1.8018, | |
| "step": 9760 | |
| }, | |
| { | |
| "epoch": 7.12, | |
| "grad_norm": 0.4443877339363098, | |
| "learning_rate": 0.0002702970297029703, | |
| "loss": 1.8022, | |
| "step": 9770 | |
| }, | |
| { | |
| "epoch": 7.13, | |
| "grad_norm": 0.4887067675590515, | |
| "learning_rate": 0.00026930693069306935, | |
| "loss": 1.804, | |
| "step": 9780 | |
| }, | |
| { | |
| "epoch": 7.13, | |
| "grad_norm": 0.4577280282974243, | |
| "learning_rate": 0.00026831683168316835, | |
| "loss": 1.8043, | |
| "step": 9790 | |
| }, | |
| { | |
| "epoch": 7.14, | |
| "grad_norm": 0.38725900650024414, | |
| "learning_rate": 0.00026732673267326735, | |
| "loss": 1.8014, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 7.15, | |
| "grad_norm": 0.45041412115097046, | |
| "learning_rate": 0.00026633663366336635, | |
| "loss": 1.8046, | |
| "step": 9810 | |
| }, | |
| { | |
| "epoch": 7.15, | |
| "eval_accuracy": 0.6431962908340371, | |
| "eval_loss": 1.6528569459915161, | |
| "eval_runtime": 1084.5222, | |
| "eval_samples_per_second": 460.461, | |
| "eval_steps_per_second": 2.056, | |
| "step": 9810 | |
| }, | |
| { | |
| "epoch": 7.15, | |
| "grad_norm": 0.532798707485199, | |
| "learning_rate": 0.00026534653465346534, | |
| "loss": 1.8015, | |
| "step": 9820 | |
| }, | |
| { | |
| "epoch": 7.16, | |
| "grad_norm": 0.43234437704086304, | |
| "learning_rate": 0.0002643564356435644, | |
| "loss": 1.8013, | |
| "step": 9830 | |
| }, | |
| { | |
| "epoch": 7.17, | |
| "grad_norm": 0.4301891624927521, | |
| "learning_rate": 0.0002633663366336634, | |
| "loss": 1.8011, | |
| "step": 9840 | |
| }, | |
| { | |
| "epoch": 7.18, | |
| "grad_norm": 0.41938352584838867, | |
| "learning_rate": 0.0002623762376237624, | |
| "loss": 1.8032, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 7.18, | |
| "grad_norm": 0.4415999948978424, | |
| "learning_rate": 0.0002613861386138614, | |
| "loss": 1.8011, | |
| "step": 9860 | |
| }, | |
| { | |
| "epoch": 7.19, | |
| "grad_norm": 0.46587106585502625, | |
| "learning_rate": 0.0002603960396039604, | |
| "loss": 1.8005, | |
| "step": 9870 | |
| }, | |
| { | |
| "epoch": 7.2, | |
| "grad_norm": 0.3755381107330322, | |
| "learning_rate": 0.00025940594059405944, | |
| "loss": 1.7996, | |
| "step": 9880 | |
| }, | |
| { | |
| "epoch": 7.21, | |
| "grad_norm": 0.37551912665367126, | |
| "learning_rate": 0.00025841584158415844, | |
| "loss": 1.7981, | |
| "step": 9890 | |
| }, | |
| { | |
| "epoch": 7.21, | |
| "grad_norm": 0.4268946349620819, | |
| "learning_rate": 0.00025742574257425744, | |
| "loss": 1.7997, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 7.21, | |
| "eval_accuracy": 0.6438810244549774, | |
| "eval_loss": 1.6489626169204712, | |
| "eval_runtime": 1085.6377, | |
| "eval_samples_per_second": 459.988, | |
| "eval_steps_per_second": 2.054, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 7.22, | |
| "grad_norm": 0.5793518424034119, | |
| "learning_rate": 0.00025643564356435644, | |
| "loss": 1.8, | |
| "step": 9910 | |
| }, | |
| { | |
| "epoch": 7.23, | |
| "grad_norm": 0.37436190247535706, | |
| "learning_rate": 0.00025544554455445543, | |
| "loss": 1.7974, | |
| "step": 9920 | |
| }, | |
| { | |
| "epoch": 7.24, | |
| "grad_norm": 0.5522225499153137, | |
| "learning_rate": 0.0002544554455445545, | |
| "loss": 1.7968, | |
| "step": 9930 | |
| }, | |
| { | |
| "epoch": 7.24, | |
| "grad_norm": 0.4452868402004242, | |
| "learning_rate": 0.0002534653465346535, | |
| "loss": 1.7997, | |
| "step": 9940 | |
| }, | |
| { | |
| "epoch": 7.25, | |
| "grad_norm": 0.41211819648742676, | |
| "learning_rate": 0.0002524752475247525, | |
| "loss": 1.798, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 7.26, | |
| "grad_norm": 0.4052869379520416, | |
| "learning_rate": 0.0002514851485148515, | |
| "loss": 1.7948, | |
| "step": 9960 | |
| }, | |
| { | |
| "epoch": 7.26, | |
| "grad_norm": 0.4514144957065582, | |
| "learning_rate": 0.0002504950495049505, | |
| "loss": 1.7955, | |
| "step": 9970 | |
| }, | |
| { | |
| "epoch": 7.27, | |
| "grad_norm": 0.49351832270622253, | |
| "learning_rate": 0.00024950495049504953, | |
| "loss": 1.7991, | |
| "step": 9980 | |
| }, | |
| { | |
| "epoch": 7.28, | |
| "grad_norm": 0.49827703833580017, | |
| "learning_rate": 0.00024851485148514853, | |
| "loss": 1.7994, | |
| "step": 9990 | |
| }, | |
| { | |
| "epoch": 7.28, | |
| "eval_accuracy": 0.6442858204973027, | |
| "eval_loss": 1.6466220617294312, | |
| "eval_runtime": 1086.2584, | |
| "eval_samples_per_second": 459.725, | |
| "eval_steps_per_second": 2.053, | |
| "step": 9990 | |
| }, | |
| { | |
| "epoch": 7.29, | |
| "grad_norm": 0.4449995160102844, | |
| "learning_rate": 0.00024752475247524753, | |
| "loss": 1.7962, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 7.29, | |
| "grad_norm": 0.3683604300022125, | |
| "learning_rate": 0.0002465346534653465, | |
| "loss": 1.7984, | |
| "step": 10010 | |
| }, | |
| { | |
| "epoch": 7.3, | |
| "grad_norm": 0.48126864433288574, | |
| "learning_rate": 0.0002455445544554455, | |
| "loss": 1.7953, | |
| "step": 10020 | |
| }, | |
| { | |
| "epoch": 7.31, | |
| "grad_norm": 0.3565351068973541, | |
| "learning_rate": 0.0002445544554455446, | |
| "loss": 1.7914, | |
| "step": 10030 | |
| }, | |
| { | |
| "epoch": 7.32, | |
| "grad_norm": 0.4369056820869446, | |
| "learning_rate": 0.00024356435643564357, | |
| "loss": 1.7949, | |
| "step": 10040 | |
| }, | |
| { | |
| "epoch": 7.32, | |
| "grad_norm": 0.5566734671592712, | |
| "learning_rate": 0.00024257425742574257, | |
| "loss": 1.7961, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 7.33, | |
| "grad_norm": 0.4066598117351532, | |
| "learning_rate": 0.00024158415841584157, | |
| "loss": 1.7962, | |
| "step": 10060 | |
| }, | |
| { | |
| "epoch": 7.34, | |
| "grad_norm": 0.4281260669231415, | |
| "learning_rate": 0.0002405940594059406, | |
| "loss": 1.7943, | |
| "step": 10070 | |
| }, | |
| { | |
| "epoch": 7.34, | |
| "grad_norm": 0.34586983919143677, | |
| "learning_rate": 0.0002396039603960396, | |
| "loss": 1.7934, | |
| "step": 10080 | |
| }, | |
| { | |
| "epoch": 7.34, | |
| "eval_accuracy": 0.6448193534487687, | |
| "eval_loss": 1.644710898399353, | |
| "eval_runtime": 1084.6339, | |
| "eval_samples_per_second": 460.413, | |
| "eval_steps_per_second": 2.056, | |
| "step": 10080 | |
| }, | |
| { | |
| "epoch": 7.35, | |
| "grad_norm": 0.3622014820575714, | |
| "learning_rate": 0.00023861386138613862, | |
| "loss": 1.7912, | |
| "step": 10090 | |
| }, | |
| { | |
| "epoch": 7.36, | |
| "grad_norm": 0.456106960773468, | |
| "learning_rate": 0.00023762376237623762, | |
| "loss": 1.7922, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 7.37, | |
| "grad_norm": 0.4329501986503601, | |
| "learning_rate": 0.00023663366336633662, | |
| "loss": 1.7937, | |
| "step": 10110 | |
| }, | |
| { | |
| "epoch": 7.37, | |
| "grad_norm": 0.4640803039073944, | |
| "learning_rate": 0.00023564356435643564, | |
| "loss": 1.793, | |
| "step": 10120 | |
| }, | |
| { | |
| "epoch": 7.38, | |
| "grad_norm": 0.39238548278808594, | |
| "learning_rate": 0.00023465346534653464, | |
| "loss": 1.7916, | |
| "step": 10130 | |
| }, | |
| { | |
| "epoch": 7.39, | |
| "grad_norm": 0.43311530351638794, | |
| "learning_rate": 0.0002336633663366337, | |
| "loss": 1.7943, | |
| "step": 10140 | |
| }, | |
| { | |
| "epoch": 7.4, | |
| "grad_norm": 0.35872432589530945, | |
| "learning_rate": 0.0002326732673267327, | |
| "loss": 1.7918, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 7.4, | |
| "grad_norm": 0.36510738730430603, | |
| "learning_rate": 0.0002316831683168317, | |
| "loss": 1.7907, | |
| "step": 10160 | |
| }, | |
| { | |
| "epoch": 7.41, | |
| "grad_norm": 0.46106651425361633, | |
| "learning_rate": 0.00023069306930693071, | |
| "loss": 1.7917, | |
| "step": 10170 | |
| }, | |
| { | |
| "epoch": 7.41, | |
| "eval_accuracy": 0.6454906168005663, | |
| "eval_loss": 1.641427993774414, | |
| "eval_runtime": 1084.7154, | |
| "eval_samples_per_second": 460.379, | |
| "eval_steps_per_second": 2.056, | |
| "step": 10170 | |
| }, | |
| { | |
| "epoch": 7.42, | |
| "grad_norm": 0.37493211030960083, | |
| "learning_rate": 0.0002297029702970297, | |
| "loss": 1.7921, | |
| "step": 10180 | |
| }, | |
| { | |
| "epoch": 7.42, | |
| "grad_norm": 0.3865686058998108, | |
| "learning_rate": 0.00022871287128712874, | |
| "loss": 1.791, | |
| "step": 10190 | |
| }, | |
| { | |
| "epoch": 7.43, | |
| "grad_norm": 0.3889116942882538, | |
| "learning_rate": 0.00022772277227722774, | |
| "loss": 1.7907, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 7.44, | |
| "grad_norm": 0.6045088768005371, | |
| "learning_rate": 0.00022673267326732673, | |
| "loss": 1.7902, | |
| "step": 10210 | |
| }, | |
| { | |
| "epoch": 7.45, | |
| "grad_norm": 0.4185848832130432, | |
| "learning_rate": 0.00022574257425742576, | |
| "loss": 1.7889, | |
| "step": 10220 | |
| }, | |
| { | |
| "epoch": 7.45, | |
| "grad_norm": 0.3915616571903229, | |
| "learning_rate": 0.00022475247524752476, | |
| "loss": 1.7906, | |
| "step": 10230 | |
| }, | |
| { | |
| "epoch": 7.46, | |
| "grad_norm": 0.3687775433063507, | |
| "learning_rate": 0.00022376237623762378, | |
| "loss": 1.7897, | |
| "step": 10240 | |
| }, | |
| { | |
| "epoch": 7.47, | |
| "grad_norm": 0.39049315452575684, | |
| "learning_rate": 0.00022277227722772278, | |
| "loss": 1.789, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 7.48, | |
| "grad_norm": 0.37289920449256897, | |
| "learning_rate": 0.00022178217821782178, | |
| "loss": 1.7887, | |
| "step": 10260 | |
| }, | |
| { | |
| "epoch": 7.48, | |
| "eval_accuracy": 0.645869271741179, | |
| "eval_loss": 1.639427661895752, | |
| "eval_runtime": 1118.4989, | |
| "eval_samples_per_second": 446.473, | |
| "eval_steps_per_second": 1.994, | |
| "step": 10260 | |
| }, | |
| { | |
| "epoch": 7.48, | |
| "grad_norm": 0.45301392674446106, | |
| "learning_rate": 0.0002207920792079208, | |
| "loss": 1.7911, | |
| "step": 10270 | |
| }, | |
| { | |
| "epoch": 7.49, | |
| "grad_norm": 0.42282310128211975, | |
| "learning_rate": 0.0002198019801980198, | |
| "loss": 1.788, | |
| "step": 10280 | |
| }, | |
| { | |
| "epoch": 7.5, | |
| "grad_norm": 0.34825190901756287, | |
| "learning_rate": 0.00021881188118811883, | |
| "loss": 1.7888, | |
| "step": 10290 | |
| }, | |
| { | |
| "epoch": 7.5, | |
| "grad_norm": 0.4688248038291931, | |
| "learning_rate": 0.00021782178217821783, | |
| "loss": 1.7878, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 7.51, | |
| "grad_norm": 0.4295547902584076, | |
| "learning_rate": 0.00021683168316831682, | |
| "loss": 1.7877, | |
| "step": 10310 | |
| }, | |
| { | |
| "epoch": 7.52, | |
| "grad_norm": 0.3433161675930023, | |
| "learning_rate": 0.00021584158415841585, | |
| "loss": 1.7884, | |
| "step": 10320 | |
| }, | |
| { | |
| "epoch": 7.53, | |
| "grad_norm": 0.34638333320617676, | |
| "learning_rate": 0.00021485148514851485, | |
| "loss": 1.7891, | |
| "step": 10330 | |
| }, | |
| { | |
| "epoch": 7.53, | |
| "grad_norm": 0.40277931094169617, | |
| "learning_rate": 0.00021386138613861387, | |
| "loss": 1.7856, | |
| "step": 10340 | |
| }, | |
| { | |
| "epoch": 7.54, | |
| "grad_norm": 0.42518341541290283, | |
| "learning_rate": 0.00021287128712871287, | |
| "loss": 1.7861, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 7.54, | |
| "eval_accuracy": 0.6466240342094058, | |
| "eval_loss": 1.637886881828308, | |
| "eval_runtime": 1094.5797, | |
| "eval_samples_per_second": 456.23, | |
| "eval_steps_per_second": 2.037, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 7.55, | |
| "grad_norm": 0.3844136893749237, | |
| "learning_rate": 0.00021188118811881187, | |
| "loss": 1.7862, | |
| "step": 10360 | |
| }, | |
| { | |
| "epoch": 7.56, | |
| "grad_norm": 0.34440210461616516, | |
| "learning_rate": 0.0002108910891089109, | |
| "loss": 1.7864, | |
| "step": 10370 | |
| }, | |
| { | |
| "epoch": 7.56, | |
| "grad_norm": 0.501716136932373, | |
| "learning_rate": 0.0002099009900990099, | |
| "loss": 1.7843, | |
| "step": 10380 | |
| }, | |
| { | |
| "epoch": 7.57, | |
| "grad_norm": 0.3695526421070099, | |
| "learning_rate": 0.00020891089108910892, | |
| "loss": 1.7855, | |
| "step": 10390 | |
| }, | |
| { | |
| "epoch": 7.58, | |
| "grad_norm": 0.38437628746032715, | |
| "learning_rate": 0.00020792079207920792, | |
| "loss": 1.7847, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 7.58, | |
| "grad_norm": 0.4197578430175781, | |
| "learning_rate": 0.00020693069306930691, | |
| "loss": 1.7833, | |
| "step": 10410 | |
| }, | |
| { | |
| "epoch": 7.59, | |
| "grad_norm": 0.39093175530433655, | |
| "learning_rate": 0.00020594059405940594, | |
| "loss": 1.786, | |
| "step": 10420 | |
| }, | |
| { | |
| "epoch": 7.6, | |
| "grad_norm": 0.36000731587409973, | |
| "learning_rate": 0.00020495049504950494, | |
| "loss": 1.7872, | |
| "step": 10430 | |
| }, | |
| { | |
| "epoch": 7.61, | |
| "grad_norm": 0.45473846793174744, | |
| "learning_rate": 0.00020396039603960396, | |
| "loss": 1.7853, | |
| "step": 10440 | |
| }, | |
| { | |
| "epoch": 7.61, | |
| "eval_accuracy": 0.6470880404015521, | |
| "eval_loss": 1.6340434551239014, | |
| "eval_runtime": 1091.9524, | |
| "eval_samples_per_second": 457.328, | |
| "eval_steps_per_second": 2.042, | |
| "step": 10440 | |
| }, | |
| { | |
| "epoch": 7.61, | |
| "grad_norm": 0.3739522099494934, | |
| "learning_rate": 0.000202970297029703, | |
| "loss": 1.7824, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 7.62, | |
| "grad_norm": 0.40237516164779663, | |
| "learning_rate": 0.00020198019801980199, | |
| "loss": 1.7837, | |
| "step": 10460 | |
| }, | |
| { | |
| "epoch": 7.63, | |
| "grad_norm": 0.44717445969581604, | |
| "learning_rate": 0.000200990099009901, | |
| "loss": 1.7828, | |
| "step": 10470 | |
| }, | |
| { | |
| "epoch": 7.64, | |
| "grad_norm": 0.4383144676685333, | |
| "learning_rate": 0.0002, | |
| "loss": 1.7841, | |
| "step": 10480 | |
| }, | |
| { | |
| "epoch": 7.64, | |
| "grad_norm": 0.368528813123703, | |
| "learning_rate": 0.00019900990099009903, | |
| "loss": 1.7818, | |
| "step": 10490 | |
| }, | |
| { | |
| "epoch": 7.65, | |
| "grad_norm": 0.40288105607032776, | |
| "learning_rate": 0.00019801980198019803, | |
| "loss": 1.7832, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 7.66, | |
| "grad_norm": 0.3775827884674072, | |
| "learning_rate": 0.00019702970297029703, | |
| "loss": 1.7814, | |
| "step": 10510 | |
| }, | |
| { | |
| "epoch": 7.66, | |
| "grad_norm": 0.4067000448703766, | |
| "learning_rate": 0.00019603960396039606, | |
| "loss": 1.7815, | |
| "step": 10520 | |
| }, | |
| { | |
| "epoch": 7.67, | |
| "grad_norm": 0.45529502630233765, | |
| "learning_rate": 0.00019504950495049505, | |
| "loss": 1.7847, | |
| "step": 10530 | |
| }, | |
| { | |
| "epoch": 7.67, | |
| "eval_accuracy": 0.647348903103301, | |
| "eval_loss": 1.6344057321548462, | |
| "eval_runtime": 1092.9918, | |
| "eval_samples_per_second": 456.893, | |
| "eval_steps_per_second": 2.04, | |
| "step": 10530 | |
| }, | |
| { | |
| "epoch": 7.68, | |
| "grad_norm": 0.4383666217327118, | |
| "learning_rate": 0.00019405940594059408, | |
| "loss": 1.7799, | |
| "step": 10540 | |
| }, | |
| { | |
| "epoch": 7.69, | |
| "grad_norm": 0.41071295738220215, | |
| "learning_rate": 0.00019306930693069308, | |
| "loss": 1.7826, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 7.69, | |
| "grad_norm": 0.4367865324020386, | |
| "learning_rate": 0.00019207920792079208, | |
| "loss": 1.7819, | |
| "step": 10560 | |
| }, | |
| { | |
| "epoch": 7.7, | |
| "grad_norm": 0.3609165549278259, | |
| "learning_rate": 0.0001910891089108911, | |
| "loss": 1.7802, | |
| "step": 10570 | |
| }, | |
| { | |
| "epoch": 7.71, | |
| "grad_norm": 0.39005783200263977, | |
| "learning_rate": 0.0001900990099009901, | |
| "loss": 1.78, | |
| "step": 10580 | |
| }, | |
| { | |
| "epoch": 7.72, | |
| "grad_norm": 0.3287705183029175, | |
| "learning_rate": 0.00018910891089108913, | |
| "loss": 1.7797, | |
| "step": 10590 | |
| }, | |
| { | |
| "epoch": 7.72, | |
| "grad_norm": 0.3266151547431946, | |
| "learning_rate": 0.00018811881188118812, | |
| "loss": 1.7795, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 7.73, | |
| "grad_norm": 0.35796740651130676, | |
| "learning_rate": 0.00018712871287128712, | |
| "loss": 1.7782, | |
| "step": 10610 | |
| }, | |
| { | |
| "epoch": 7.74, | |
| "grad_norm": 0.39394471049308777, | |
| "learning_rate": 0.00018613861386138615, | |
| "loss": 1.7799, | |
| "step": 10620 | |
| }, | |
| { | |
| "epoch": 7.74, | |
| "eval_accuracy": 0.647908722477522, | |
| "eval_loss": 1.6299968957901, | |
| "eval_runtime": 1086.7095, | |
| "eval_samples_per_second": 459.534, | |
| "eval_steps_per_second": 2.052, | |
| "step": 10620 | |
| }, | |
| { | |
| "epoch": 7.75, | |
| "grad_norm": 0.4640734791755676, | |
| "learning_rate": 0.00018514851485148514, | |
| "loss": 1.7794, | |
| "step": 10630 | |
| }, | |
| { | |
| "epoch": 7.75, | |
| "grad_norm": 0.3890862464904785, | |
| "learning_rate": 0.00018415841584158417, | |
| "loss": 1.7794, | |
| "step": 10640 | |
| }, | |
| { | |
| "epoch": 7.76, | |
| "grad_norm": 0.3503568768501282, | |
| "learning_rate": 0.00018316831683168317, | |
| "loss": 1.7776, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 7.77, | |
| "grad_norm": 0.39593997597694397, | |
| "learning_rate": 0.00018217821782178217, | |
| "loss": 1.7798, | |
| "step": 10660 | |
| }, | |
| { | |
| "epoch": 7.77, | |
| "grad_norm": 0.3677063286304474, | |
| "learning_rate": 0.0001811881188118812, | |
| "loss": 1.7769, | |
| "step": 10670 | |
| }, | |
| { | |
| "epoch": 7.78, | |
| "grad_norm": 0.3558836877346039, | |
| "learning_rate": 0.0001801980198019802, | |
| "loss": 1.7803, | |
| "step": 10680 | |
| }, | |
| { | |
| "epoch": 7.79, | |
| "grad_norm": 0.4983728229999542, | |
| "learning_rate": 0.00017920792079207922, | |
| "loss": 1.7774, | |
| "step": 10690 | |
| }, | |
| { | |
| "epoch": 7.8, | |
| "grad_norm": 0.3735315203666687, | |
| "learning_rate": 0.0001782178217821782, | |
| "loss": 1.778, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 7.8, | |
| "grad_norm": 0.3440467417240143, | |
| "learning_rate": 0.0001772277227722772, | |
| "loss": 1.7773, | |
| "step": 10710 | |
| }, | |
| { | |
| "epoch": 7.8, | |
| "eval_accuracy": 0.6480254558570787, | |
| "eval_loss": 1.6308239698410034, | |
| "eval_runtime": 1084.4491, | |
| "eval_samples_per_second": 460.492, | |
| "eval_steps_per_second": 2.056, | |
| "step": 10710 | |
| }, | |
| { | |
| "epoch": 7.81, | |
| "grad_norm": 0.3812803626060486, | |
| "learning_rate": 0.00017623762376237624, | |
| "loss": 1.7776, | |
| "step": 10720 | |
| }, | |
| { | |
| "epoch": 7.82, | |
| "grad_norm": 0.3772016763687134, | |
| "learning_rate": 0.00017524752475247524, | |
| "loss": 1.7751, | |
| "step": 10730 | |
| }, | |
| { | |
| "epoch": 7.83, | |
| "grad_norm": 0.3638882339000702, | |
| "learning_rate": 0.00017425742574257426, | |
| "loss": 1.7794, | |
| "step": 10740 | |
| }, | |
| { | |
| "epoch": 7.83, | |
| "grad_norm": 0.43426576256752014, | |
| "learning_rate": 0.00017326732673267326, | |
| "loss": 1.7787, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 7.84, | |
| "grad_norm": 0.41420796513557434, | |
| "learning_rate": 0.00017227722772277226, | |
| "loss": 1.7766, | |
| "step": 10760 | |
| }, | |
| { | |
| "epoch": 7.85, | |
| "grad_norm": 0.3958096206188202, | |
| "learning_rate": 0.0001712871287128713, | |
| "loss": 1.7765, | |
| "step": 10770 | |
| }, | |
| { | |
| "epoch": 7.85, | |
| "grad_norm": 0.3829286992549896, | |
| "learning_rate": 0.0001702970297029703, | |
| "loss": 1.7773, | |
| "step": 10780 | |
| }, | |
| { | |
| "epoch": 7.86, | |
| "grad_norm": 0.41278573870658875, | |
| "learning_rate": 0.00016930693069306933, | |
| "loss": 1.7769, | |
| "step": 10790 | |
| }, | |
| { | |
| "epoch": 7.87, | |
| "grad_norm": 0.337071031332016, | |
| "learning_rate": 0.00016831683168316833, | |
| "loss": 1.7766, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 7.87, | |
| "eval_accuracy": 0.6483471740815172, | |
| "eval_loss": 1.6272797584533691, | |
| "eval_runtime": 1083.9003, | |
| "eval_samples_per_second": 460.725, | |
| "eval_steps_per_second": 2.057, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 7.88, | |
| "grad_norm": 0.3459130823612213, | |
| "learning_rate": 0.00016732673267326733, | |
| "loss": 1.7781, | |
| "step": 10810 | |
| }, | |
| { | |
| "epoch": 7.88, | |
| "grad_norm": 0.3339349925518036, | |
| "learning_rate": 0.00016633663366336635, | |
| "loss": 1.7754, | |
| "step": 10820 | |
| }, | |
| { | |
| "epoch": 7.89, | |
| "grad_norm": 0.3156519830226898, | |
| "learning_rate": 0.00016534653465346535, | |
| "loss": 1.776, | |
| "step": 10830 | |
| }, | |
| { | |
| "epoch": 7.9, | |
| "grad_norm": 0.3628999590873718, | |
| "learning_rate": 0.00016435643564356438, | |
| "loss": 1.7744, | |
| "step": 10840 | |
| }, | |
| { | |
| "epoch": 7.91, | |
| "grad_norm": 0.350087434053421, | |
| "learning_rate": 0.00016336633663366338, | |
| "loss": 1.7752, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 7.91, | |
| "grad_norm": 0.40664511919021606, | |
| "learning_rate": 0.00016237623762376237, | |
| "loss": 1.7781, | |
| "step": 10860 | |
| }, | |
| { | |
| "epoch": 7.92, | |
| "grad_norm": 0.3364078104496002, | |
| "learning_rate": 0.0001613861386138614, | |
| "loss": 1.776, | |
| "step": 10870 | |
| }, | |
| { | |
| "epoch": 7.93, | |
| "grad_norm": 0.32285594940185547, | |
| "learning_rate": 0.0001603960396039604, | |
| "loss": 1.772, | |
| "step": 10880 | |
| }, | |
| { | |
| "epoch": 7.93, | |
| "grad_norm": 0.39036986231803894, | |
| "learning_rate": 0.00015940594059405942, | |
| "loss": 1.7755, | |
| "step": 10890 | |
| }, | |
| { | |
| "epoch": 7.93, | |
| "eval_accuracy": 0.6488458576263911, | |
| "eval_loss": 1.6268614530563354, | |
| "eval_runtime": 1085.4437, | |
| "eval_samples_per_second": 460.07, | |
| "eval_steps_per_second": 2.054, | |
| "step": 10890 | |
| }, | |
| { | |
| "epoch": 7.94, | |
| "grad_norm": 0.41111525893211365, | |
| "learning_rate": 0.00015841584158415842, | |
| "loss": 1.7747, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 7.95, | |
| "grad_norm": 0.4492338299751282, | |
| "learning_rate": 0.00015742574257425742, | |
| "loss": 1.7753, | |
| "step": 10910 | |
| }, | |
| { | |
| "epoch": 7.96, | |
| "grad_norm": 0.3836340606212616, | |
| "learning_rate": 0.00015643564356435644, | |
| "loss": 1.7727, | |
| "step": 10920 | |
| }, | |
| { | |
| "epoch": 7.96, | |
| "grad_norm": 0.33723685145378113, | |
| "learning_rate": 0.00015544554455445544, | |
| "loss": 1.775, | |
| "step": 10930 | |
| }, | |
| { | |
| "epoch": 7.97, | |
| "grad_norm": 0.4088629484176636, | |
| "learning_rate": 0.00015445544554455447, | |
| "loss": 1.7741, | |
| "step": 10940 | |
| }, | |
| { | |
| "epoch": 7.98, | |
| "grad_norm": 0.3302168548107147, | |
| "learning_rate": 0.00015346534653465347, | |
| "loss": 1.7732, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 7.99, | |
| "grad_norm": 0.3605554401874542, | |
| "learning_rate": 0.00015247524752475246, | |
| "loss": 1.7722, | |
| "step": 10960 | |
| }, | |
| { | |
| "epoch": 7.99, | |
| "grad_norm": 0.355826735496521, | |
| "learning_rate": 0.0001514851485148515, | |
| "loss": 1.7715, | |
| "step": 10970 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "grad_norm": 0.3708418607711792, | |
| "learning_rate": 0.0001504950495049505, | |
| "loss": 1.7721, | |
| "step": 10980 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "eval_accuracy": 0.6491409467366662, | |
| "eval_loss": 1.6255041360855103, | |
| "eval_runtime": 1085.4323, | |
| "eval_samples_per_second": 460.075, | |
| "eval_steps_per_second": 2.054, | |
| "step": 10980 | |
| }, | |
| { | |
| "epoch": 8.01, | |
| "grad_norm": 0.3321845233440399, | |
| "learning_rate": 0.0001495049504950495, | |
| "loss": 1.7719, | |
| "step": 10990 | |
| }, | |
| { | |
| "epoch": 8.01, | |
| "grad_norm": 0.3251964747905731, | |
| "learning_rate": 0.0001485148514851485, | |
| "loss": 1.7731, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 8.02, | |
| "grad_norm": 0.4384077787399292, | |
| "learning_rate": 0.0001475247524752475, | |
| "loss": 1.7753, | |
| "step": 11010 | |
| }, | |
| { | |
| "epoch": 8.03, | |
| "grad_norm": 0.3885464370250702, | |
| "learning_rate": 0.00014653465346534653, | |
| "loss": 1.7707, | |
| "step": 11020 | |
| }, | |
| { | |
| "epoch": 8.04, | |
| "grad_norm": 0.42948633432388306, | |
| "learning_rate": 0.00014554455445544553, | |
| "loss": 1.7716, | |
| "step": 11030 | |
| }, | |
| { | |
| "epoch": 8.04, | |
| "grad_norm": 0.33700281381607056, | |
| "learning_rate": 0.00014455445544554456, | |
| "loss": 1.7711, | |
| "step": 11040 | |
| }, | |
| { | |
| "epoch": 8.05, | |
| "grad_norm": 0.37887144088745117, | |
| "learning_rate": 0.00014356435643564356, | |
| "loss": 1.7707, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 8.06, | |
| "grad_norm": 0.2920531630516052, | |
| "learning_rate": 0.00014257425742574255, | |
| "loss": 1.7717, | |
| "step": 11060 | |
| }, | |
| { | |
| "epoch": 8.07, | |
| "grad_norm": 0.41969195008277893, | |
| "learning_rate": 0.00014158415841584158, | |
| "loss": 1.77, | |
| "step": 11070 | |
| }, | |
| { | |
| "epoch": 8.07, | |
| "eval_accuracy": 0.6496409109474853, | |
| "eval_loss": 1.6225236654281616, | |
| "eval_runtime": 1085.9823, | |
| "eval_samples_per_second": 459.842, | |
| "eval_steps_per_second": 2.053, | |
| "step": 11070 | |
| }, | |
| { | |
| "epoch": 8.07, | |
| "grad_norm": 0.3497646152973175, | |
| "learning_rate": 0.0001405940594059406, | |
| "loss": 1.7712, | |
| "step": 11080 | |
| }, | |
| { | |
| "epoch": 8.08, | |
| "grad_norm": 0.38377320766448975, | |
| "learning_rate": 0.00013960396039603963, | |
| "loss": 1.77, | |
| "step": 11090 | |
| }, | |
| { | |
| "epoch": 8.09, | |
| "grad_norm": 0.3549109101295471, | |
| "learning_rate": 0.00013861386138613863, | |
| "loss": 1.7736, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 8.09, | |
| "grad_norm": 0.3375445604324341, | |
| "learning_rate": 0.00013762376237623763, | |
| "loss": 1.7715, | |
| "step": 11110 | |
| }, | |
| { | |
| "epoch": 8.1, | |
| "grad_norm": 0.30400267243385315, | |
| "learning_rate": 0.00013663366336633665, | |
| "loss": 1.7708, | |
| "step": 11120 | |
| }, | |
| { | |
| "epoch": 8.11, | |
| "grad_norm": 0.36075320839881897, | |
| "learning_rate": 0.00013564356435643565, | |
| "loss": 1.7706, | |
| "step": 11130 | |
| }, | |
| { | |
| "epoch": 8.12, | |
| "grad_norm": 0.40548309683799744, | |
| "learning_rate": 0.00013465346534653468, | |
| "loss": 1.7677, | |
| "step": 11140 | |
| }, | |
| { | |
| "epoch": 8.12, | |
| "grad_norm": 0.45684516429901123, | |
| "learning_rate": 0.00013366336633663367, | |
| "loss": 1.7682, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 8.13, | |
| "grad_norm": 0.35352224111557007, | |
| "learning_rate": 0.00013267326732673267, | |
| "loss": 1.7708, | |
| "step": 11160 | |
| }, | |
| { | |
| "epoch": 8.13, | |
| "eval_accuracy": 0.6498058850737081, | |
| "eval_loss": 1.6215531826019287, | |
| "eval_runtime": 1090.6922, | |
| "eval_samples_per_second": 457.856, | |
| "eval_steps_per_second": 2.045, | |
| "step": 11160 | |
| }, | |
| { | |
| "epoch": 8.14, | |
| "grad_norm": 0.32690539956092834, | |
| "learning_rate": 0.0001316831683168317, | |
| "loss": 1.7701, | |
| "step": 11170 | |
| }, | |
| { | |
| "epoch": 8.15, | |
| "grad_norm": 0.3275192975997925, | |
| "learning_rate": 0.0001306930693069307, | |
| "loss": 1.7664, | |
| "step": 11180 | |
| }, | |
| { | |
| "epoch": 8.15, | |
| "grad_norm": 0.35621440410614014, | |
| "learning_rate": 0.00012970297029702972, | |
| "loss": 1.7708, | |
| "step": 11190 | |
| }, | |
| { | |
| "epoch": 8.16, | |
| "grad_norm": 0.3187929093837738, | |
| "learning_rate": 0.00012871287128712872, | |
| "loss": 1.7686, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 8.17, | |
| "grad_norm": 0.30944034457206726, | |
| "learning_rate": 0.00012772277227722772, | |
| "loss": 1.7705, | |
| "step": 11210 | |
| }, | |
| { | |
| "epoch": 8.17, | |
| "grad_norm": 0.3147297501564026, | |
| "learning_rate": 0.00012673267326732674, | |
| "loss": 1.7668, | |
| "step": 11220 | |
| }, | |
| { | |
| "epoch": 8.18, | |
| "grad_norm": 0.31606265902519226, | |
| "learning_rate": 0.00012574257425742574, | |
| "loss": 1.7664, | |
| "step": 11230 | |
| }, | |
| { | |
| "epoch": 8.19, | |
| "grad_norm": 0.3430984914302826, | |
| "learning_rate": 0.00012475247524752477, | |
| "loss": 1.7688, | |
| "step": 11240 | |
| }, | |
| { | |
| "epoch": 8.2, | |
| "grad_norm": 0.36714431643486023, | |
| "learning_rate": 0.00012376237623762376, | |
| "loss": 1.7686, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 8.2, | |
| "eval_accuracy": 0.6500912899825928, | |
| "eval_loss": 1.6193368434906006, | |
| "eval_runtime": 1084.4098, | |
| "eval_samples_per_second": 460.509, | |
| "eval_steps_per_second": 2.056, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 8.2, | |
| "grad_norm": 0.33857467770576477, | |
| "learning_rate": 0.00012277227722772276, | |
| "loss": 1.7706, | |
| "step": 11260 | |
| }, | |
| { | |
| "epoch": 8.21, | |
| "grad_norm": 0.33994871377944946, | |
| "learning_rate": 0.00012178217821782179, | |
| "loss": 1.7657, | |
| "step": 11270 | |
| }, | |
| { | |
| "epoch": 8.22, | |
| "grad_norm": 0.3522297143936157, | |
| "learning_rate": 0.00012079207920792079, | |
| "loss": 1.7671, | |
| "step": 11280 | |
| }, | |
| { | |
| "epoch": 8.23, | |
| "grad_norm": 0.3409149646759033, | |
| "learning_rate": 0.0001198019801980198, | |
| "loss": 1.7683, | |
| "step": 11290 | |
| }, | |
| { | |
| "epoch": 8.23, | |
| "grad_norm": 0.34772610664367676, | |
| "learning_rate": 0.00011881188118811881, | |
| "loss": 1.7694, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 8.24, | |
| "grad_norm": 0.3905799388885498, | |
| "learning_rate": 0.00011782178217821782, | |
| "loss": 1.7676, | |
| "step": 11310 | |
| }, | |
| { | |
| "epoch": 8.25, | |
| "grad_norm": 0.3478334844112396, | |
| "learning_rate": 0.00011683168316831685, | |
| "loss": 1.7656, | |
| "step": 11320 | |
| }, | |
| { | |
| "epoch": 8.26, | |
| "grad_norm": 0.3372560143470764, | |
| "learning_rate": 0.00011584158415841584, | |
| "loss": 1.7662, | |
| "step": 11330 | |
| }, | |
| { | |
| "epoch": 8.26, | |
| "grad_norm": 0.31676506996154785, | |
| "learning_rate": 0.00011485148514851486, | |
| "loss": 1.7673, | |
| "step": 11340 | |
| }, | |
| { | |
| "epoch": 8.26, | |
| "eval_accuracy": 0.6503277550921033, | |
| "eval_loss": 1.6178277730941772, | |
| "eval_runtime": 1085.1752, | |
| "eval_samples_per_second": 460.184, | |
| "eval_steps_per_second": 2.055, | |
| "step": 11340 | |
| }, | |
| { | |
| "epoch": 8.27, | |
| "grad_norm": 0.31384769082069397, | |
| "learning_rate": 0.00011386138613861387, | |
| "loss": 1.7657, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 8.28, | |
| "grad_norm": 0.3267647922039032, | |
| "learning_rate": 0.00011287128712871288, | |
| "loss": 1.766, | |
| "step": 11360 | |
| }, | |
| { | |
| "epoch": 8.28, | |
| "grad_norm": 0.32966649532318115, | |
| "learning_rate": 0.00011188118811881189, | |
| "loss": 1.7636, | |
| "step": 11370 | |
| }, | |
| { | |
| "epoch": 8.29, | |
| "grad_norm": 0.34511563181877136, | |
| "learning_rate": 0.00011089108910891089, | |
| "loss": 1.767, | |
| "step": 11380 | |
| }, | |
| { | |
| "epoch": 8.3, | |
| "grad_norm": 0.3151010274887085, | |
| "learning_rate": 0.0001099009900990099, | |
| "loss": 1.7663, | |
| "step": 11390 | |
| }, | |
| { | |
| "epoch": 8.31, | |
| "grad_norm": 0.33481037616729736, | |
| "learning_rate": 0.00010891089108910891, | |
| "loss": 1.7687, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 8.31, | |
| "grad_norm": 0.35512518882751465, | |
| "learning_rate": 0.00010792079207920792, | |
| "loss": 1.7681, | |
| "step": 11410 | |
| }, | |
| { | |
| "epoch": 8.32, | |
| "grad_norm": 0.3472909927368164, | |
| "learning_rate": 0.00010693069306930694, | |
| "loss": 1.7653, | |
| "step": 11420 | |
| }, | |
| { | |
| "epoch": 8.33, | |
| "grad_norm": 0.3452986180782318, | |
| "learning_rate": 0.00010594059405940593, | |
| "loss": 1.7666, | |
| "step": 11430 | |
| }, | |
| { | |
| "epoch": 8.33, | |
| "eval_accuracy": 0.6505906465254327, | |
| "eval_loss": 1.6169500350952148, | |
| "eval_runtime": 1083.6278, | |
| "eval_samples_per_second": 460.841, | |
| "eval_steps_per_second": 2.058, | |
| "step": 11430 | |
| }, | |
| { | |
| "epoch": 8.34, | |
| "grad_norm": 0.34459248185157776, | |
| "learning_rate": 0.00010495049504950495, | |
| "loss": 1.7661, | |
| "step": 11440 | |
| }, | |
| { | |
| "epoch": 8.34, | |
| "grad_norm": 0.3042079508304596, | |
| "learning_rate": 0.00010396039603960396, | |
| "loss": 1.761, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 8.35, | |
| "grad_norm": 0.32908689975738525, | |
| "learning_rate": 0.00010297029702970297, | |
| "loss": 1.7657, | |
| "step": 11460 | |
| }, | |
| { | |
| "epoch": 8.36, | |
| "grad_norm": 0.34110862016677856, | |
| "learning_rate": 0.00010198019801980198, | |
| "loss": 1.7644, | |
| "step": 11470 | |
| }, | |
| { | |
| "epoch": 8.36, | |
| "grad_norm": 0.2914797365665436, | |
| "learning_rate": 0.00010099009900990099, | |
| "loss": 1.7659, | |
| "step": 11480 | |
| }, | |
| { | |
| "epoch": 8.37, | |
| "grad_norm": 0.32843562960624695, | |
| "learning_rate": 0.0001, | |
| "loss": 1.7634, | |
| "step": 11490 | |
| }, | |
| { | |
| "epoch": 8.38, | |
| "grad_norm": 0.3443434536457062, | |
| "learning_rate": 9.900990099009902e-05, | |
| "loss": 1.7647, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 8.39, | |
| "grad_norm": 0.3086354732513428, | |
| "learning_rate": 9.801980198019803e-05, | |
| "loss": 1.7633, | |
| "step": 11510 | |
| }, | |
| { | |
| "epoch": 8.39, | |
| "grad_norm": 0.2858017086982727, | |
| "learning_rate": 9.702970297029704e-05, | |
| "loss": 1.7635, | |
| "step": 11520 | |
| }, | |
| { | |
| "epoch": 8.39, | |
| "eval_accuracy": 0.6506746559931541, | |
| "eval_loss": 1.61593496799469, | |
| "eval_runtime": 1085.1265, | |
| "eval_samples_per_second": 460.204, | |
| "eval_steps_per_second": 2.055, | |
| "step": 11520 | |
| }, | |
| { | |
| "epoch": 8.4, | |
| "grad_norm": 0.32728368043899536, | |
| "learning_rate": 9.603960396039604e-05, | |
| "loss": 1.7637, | |
| "step": 11530 | |
| }, | |
| { | |
| "epoch": 8.41, | |
| "grad_norm": 0.3133088946342468, | |
| "learning_rate": 9.504950495049505e-05, | |
| "loss": 1.7629, | |
| "step": 11540 | |
| }, | |
| { | |
| "epoch": 8.42, | |
| "grad_norm": 0.2798636853694916, | |
| "learning_rate": 9.405940594059406e-05, | |
| "loss": 1.766, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 8.42, | |
| "grad_norm": 0.3476144075393677, | |
| "learning_rate": 9.306930693069307e-05, | |
| "loss": 1.7658, | |
| "step": 11560 | |
| }, | |
| { | |
| "epoch": 8.43, | |
| "grad_norm": 0.2828819751739502, | |
| "learning_rate": 9.207920792079209e-05, | |
| "loss": 1.7624, | |
| "step": 11570 | |
| }, | |
| { | |
| "epoch": 8.44, | |
| "grad_norm": 0.27723389863967896, | |
| "learning_rate": 9.108910891089108e-05, | |
| "loss": 1.7635, | |
| "step": 11580 | |
| }, | |
| { | |
| "epoch": 8.44, | |
| "grad_norm": 0.32631412148475647, | |
| "learning_rate": 9.00990099009901e-05, | |
| "loss": 1.7621, | |
| "step": 11590 | |
| }, | |
| { | |
| "epoch": 8.45, | |
| "grad_norm": 0.3203299641609192, | |
| "learning_rate": 8.91089108910891e-05, | |
| "loss": 1.7629, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 8.46, | |
| "grad_norm": 0.31430014967918396, | |
| "learning_rate": 8.811881188118812e-05, | |
| "loss": 1.7631, | |
| "step": 11610 | |
| }, | |
| { | |
| "epoch": 8.46, | |
| "eval_accuracy": 0.6510713372187361, | |
| "eval_loss": 1.6139030456542969, | |
| "eval_runtime": 1084.8096, | |
| "eval_samples_per_second": 460.339, | |
| "eval_steps_per_second": 2.056, | |
| "step": 11610 | |
| }, | |
| { | |
| "epoch": 8.47, | |
| "grad_norm": 0.302937775850296, | |
| "learning_rate": 8.712871287128713e-05, | |
| "loss": 1.7628, | |
| "step": 11620 | |
| }, | |
| { | |
| "epoch": 8.47, | |
| "grad_norm": 0.2944415807723999, | |
| "learning_rate": 8.613861386138613e-05, | |
| "loss": 1.7642, | |
| "step": 11630 | |
| }, | |
| { | |
| "epoch": 8.48, | |
| "grad_norm": 0.3318140506744385, | |
| "learning_rate": 8.514851485148515e-05, | |
| "loss": 1.7626, | |
| "step": 11640 | |
| }, | |
| { | |
| "epoch": 8.49, | |
| "grad_norm": 0.2729699909687042, | |
| "learning_rate": 8.415841584158417e-05, | |
| "loss": 1.7623, | |
| "step": 11650 | |
| }, | |
| { | |
| "epoch": 8.5, | |
| "grad_norm": 0.37606656551361084, | |
| "learning_rate": 8.316831683168318e-05, | |
| "loss": 1.7609, | |
| "step": 11660 | |
| }, | |
| { | |
| "epoch": 8.5, | |
| "grad_norm": 0.30239492654800415, | |
| "learning_rate": 8.217821782178219e-05, | |
| "loss": 1.7621, | |
| "step": 11670 | |
| }, | |
| { | |
| "epoch": 8.51, | |
| "grad_norm": 0.2841242253780365, | |
| "learning_rate": 8.118811881188119e-05, | |
| "loss": 1.7629, | |
| "step": 11680 | |
| }, | |
| { | |
| "epoch": 8.52, | |
| "grad_norm": 0.3027147054672241, | |
| "learning_rate": 8.01980198019802e-05, | |
| "loss": 1.7618, | |
| "step": 11690 | |
| }, | |
| { | |
| "epoch": 8.52, | |
| "grad_norm": 0.2852645218372345, | |
| "learning_rate": 7.920792079207921e-05, | |
| "loss": 1.7633, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 8.52, | |
| "eval_accuracy": 0.6512795497031492, | |
| "eval_loss": 1.6128230094909668, | |
| "eval_runtime": 1083.3765, | |
| "eval_samples_per_second": 460.948, | |
| "eval_steps_per_second": 2.058, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 8.53, | |
| "grad_norm": 0.30832818150520325, | |
| "learning_rate": 7.821782178217822e-05, | |
| "loss": 1.7606, | |
| "step": 11710 | |
| }, | |
| { | |
| "epoch": 8.54, | |
| "grad_norm": 0.30394139885902405, | |
| "learning_rate": 7.722772277227723e-05, | |
| "loss": 1.7609, | |
| "step": 11720 | |
| }, | |
| { | |
| "epoch": 8.55, | |
| "grad_norm": 0.2910577952861786, | |
| "learning_rate": 7.623762376237623e-05, | |
| "loss": 1.7607, | |
| "step": 11730 | |
| }, | |
| { | |
| "epoch": 8.55, | |
| "grad_norm": 0.27040547132492065, | |
| "learning_rate": 7.524752475247524e-05, | |
| "loss": 1.7607, | |
| "step": 11740 | |
| }, | |
| { | |
| "epoch": 8.56, | |
| "grad_norm": 0.2972952723503113, | |
| "learning_rate": 7.425742574257426e-05, | |
| "loss": 1.7615, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 8.57, | |
| "grad_norm": 0.26889026165008545, | |
| "learning_rate": 7.326732673267327e-05, | |
| "loss": 1.7631, | |
| "step": 11760 | |
| }, | |
| { | |
| "epoch": 8.58, | |
| "grad_norm": 0.3071919083595276, | |
| "learning_rate": 7.227722772277228e-05, | |
| "loss": 1.7572, | |
| "step": 11770 | |
| }, | |
| { | |
| "epoch": 8.58, | |
| "grad_norm": 0.30390483140945435, | |
| "learning_rate": 7.128712871287128e-05, | |
| "loss": 1.7593, | |
| "step": 11780 | |
| }, | |
| { | |
| "epoch": 8.59, | |
| "grad_norm": 0.2942393124103546, | |
| "learning_rate": 7.02970297029703e-05, | |
| "loss": 1.7616, | |
| "step": 11790 | |
| }, | |
| { | |
| "epoch": 8.59, | |
| "eval_accuracy": 0.6516749418564994, | |
| "eval_loss": 1.6112834215164185, | |
| "eval_runtime": 1083.6459, | |
| "eval_samples_per_second": 460.833, | |
| "eval_steps_per_second": 2.058, | |
| "step": 11790 | |
| }, | |
| { | |
| "epoch": 8.6, | |
| "grad_norm": 0.2823123037815094, | |
| "learning_rate": 6.930693069306931e-05, | |
| "loss": 1.7615, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 8.6, | |
| "grad_norm": 0.3058627247810364, | |
| "learning_rate": 6.831683168316833e-05, | |
| "loss": 1.7612, | |
| "step": 11810 | |
| }, | |
| { | |
| "epoch": 8.61, | |
| "grad_norm": 0.2954027056694031, | |
| "learning_rate": 6.732673267326734e-05, | |
| "loss": 1.7623, | |
| "step": 11820 | |
| }, | |
| { | |
| "epoch": 8.62, | |
| "grad_norm": 0.32210031151771545, | |
| "learning_rate": 6.633663366336634e-05, | |
| "loss": 1.7607, | |
| "step": 11830 | |
| }, | |
| { | |
| "epoch": 8.63, | |
| "grad_norm": 0.2638227343559265, | |
| "learning_rate": 6.534653465346535e-05, | |
| "loss": 1.7591, | |
| "step": 11840 | |
| }, | |
| { | |
| "epoch": 8.63, | |
| "grad_norm": 0.2716045379638672, | |
| "learning_rate": 6.435643564356436e-05, | |
| "loss": 1.7602, | |
| "step": 11850 | |
| }, | |
| { | |
| "epoch": 8.64, | |
| "grad_norm": 0.2823101282119751, | |
| "learning_rate": 6.336633663366337e-05, | |
| "loss": 1.7602, | |
| "step": 11860 | |
| }, | |
| { | |
| "epoch": 8.65, | |
| "grad_norm": 0.26111429929733276, | |
| "learning_rate": 6.237623762376238e-05, | |
| "loss": 1.7608, | |
| "step": 11870 | |
| }, | |
| { | |
| "epoch": 8.66, | |
| "grad_norm": 0.28957730531692505, | |
| "learning_rate": 6.138613861386138e-05, | |
| "loss": 1.7602, | |
| "step": 11880 | |
| }, | |
| { | |
| "epoch": 8.66, | |
| "eval_accuracy": 0.6518121780771471, | |
| "eval_loss": 1.610386610031128, | |
| "eval_runtime": 1085.801, | |
| "eval_samples_per_second": 459.919, | |
| "eval_steps_per_second": 2.054, | |
| "step": 11880 | |
| }, | |
| { | |
| "epoch": 8.66, | |
| "grad_norm": 0.31074461340904236, | |
| "learning_rate": 6.039603960396039e-05, | |
| "loss": 1.7599, | |
| "step": 11890 | |
| }, | |
| { | |
| "epoch": 8.67, | |
| "grad_norm": 0.2974682152271271, | |
| "learning_rate": 5.9405940594059404e-05, | |
| "loss": 1.7604, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 8.68, | |
| "grad_norm": 0.3034124970436096, | |
| "learning_rate": 5.841584158415842e-05, | |
| "loss": 1.7605, | |
| "step": 11910 | |
| }, | |
| { | |
| "epoch": 8.68, | |
| "grad_norm": 0.28555795550346375, | |
| "learning_rate": 5.742574257425743e-05, | |
| "loss": 1.7568, | |
| "step": 11920 | |
| }, | |
| { | |
| "epoch": 8.69, | |
| "grad_norm": 0.2668933868408203, | |
| "learning_rate": 5.643564356435644e-05, | |
| "loss": 1.7576, | |
| "step": 11930 | |
| }, | |
| { | |
| "epoch": 8.7, | |
| "grad_norm": 0.2799495458602905, | |
| "learning_rate": 5.5445544554455445e-05, | |
| "loss": 1.7595, | |
| "step": 11940 | |
| }, | |
| { | |
| "epoch": 8.71, | |
| "grad_norm": 0.28266316652297974, | |
| "learning_rate": 5.4455445544554456e-05, | |
| "loss": 1.7602, | |
| "step": 11950 | |
| }, | |
| { | |
| "epoch": 8.71, | |
| "grad_norm": 0.27878817915916443, | |
| "learning_rate": 5.346534653465347e-05, | |
| "loss": 1.7607, | |
| "step": 11960 | |
| }, | |
| { | |
| "epoch": 8.72, | |
| "grad_norm": 0.2904963493347168, | |
| "learning_rate": 5.247524752475247e-05, | |
| "loss": 1.7578, | |
| "step": 11970 | |
| }, | |
| { | |
| "epoch": 8.72, | |
| "eval_accuracy": 0.6515499792766575, | |
| "eval_loss": 1.6111468076705933, | |
| "eval_runtime": 1090.6667, | |
| "eval_samples_per_second": 457.867, | |
| "eval_steps_per_second": 2.045, | |
| "step": 11970 | |
| }, | |
| { | |
| "epoch": 8.73, | |
| "grad_norm": 0.289413720369339, | |
| "learning_rate": 5.1485148514851485e-05, | |
| "loss": 1.758, | |
| "step": 11980 | |
| }, | |
| { | |
| "epoch": 8.74, | |
| "grad_norm": 0.2739205062389374, | |
| "learning_rate": 5.0495049504950497e-05, | |
| "loss": 1.7579, | |
| "step": 11990 | |
| }, | |
| { | |
| "epoch": 8.74, | |
| "grad_norm": 0.26597511768341064, | |
| "learning_rate": 4.950495049504951e-05, | |
| "loss": 1.7568, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 8.75, | |
| "grad_norm": 0.24635004997253418, | |
| "learning_rate": 4.851485148514852e-05, | |
| "loss": 1.7584, | |
| "step": 12010 | |
| }, | |
| { | |
| "epoch": 8.76, | |
| "grad_norm": 0.2534136474132538, | |
| "learning_rate": 4.7524752475247525e-05, | |
| "loss": 1.7602, | |
| "step": 12020 | |
| }, | |
| { | |
| "epoch": 8.77, | |
| "grad_norm": 0.26007363200187683, | |
| "learning_rate": 4.653465346534654e-05, | |
| "loss": 1.7567, | |
| "step": 12030 | |
| }, | |
| { | |
| "epoch": 8.77, | |
| "grad_norm": 0.2807808816432953, | |
| "learning_rate": 4.554455445544554e-05, | |
| "loss": 1.7566, | |
| "step": 12040 | |
| }, | |
| { | |
| "epoch": 8.78, | |
| "grad_norm": 0.2677513360977173, | |
| "learning_rate": 4.455445544554455e-05, | |
| "loss": 1.7567, | |
| "step": 12050 | |
| }, | |
| { | |
| "epoch": 8.79, | |
| "grad_norm": 0.2691977620124817, | |
| "learning_rate": 4.3564356435643565e-05, | |
| "loss": 1.757, | |
| "step": 12060 | |
| }, | |
| { | |
| "epoch": 8.79, | |
| "eval_accuracy": 0.6521601327172856, | |
| "eval_loss": 1.60829758644104, | |
| "eval_runtime": 1089.928, | |
| "eval_samples_per_second": 458.177, | |
| "eval_steps_per_second": 2.046, | |
| "step": 12060 | |
| }, | |
| { | |
| "epoch": 8.79, | |
| "grad_norm": 0.2577356696128845, | |
| "learning_rate": 4.257425742574258e-05, | |
| "loss": 1.7584, | |
| "step": 12070 | |
| }, | |
| { | |
| "epoch": 8.8, | |
| "grad_norm": 0.2654874324798584, | |
| "learning_rate": 4.158415841584159e-05, | |
| "loss": 1.7571, | |
| "step": 12080 | |
| }, | |
| { | |
| "epoch": 8.81, | |
| "grad_norm": 0.25344353914260864, | |
| "learning_rate": 4.0594059405940594e-05, | |
| "loss": 1.7581, | |
| "step": 12090 | |
| }, | |
| { | |
| "epoch": 8.82, | |
| "grad_norm": 0.25865158438682556, | |
| "learning_rate": 3.9603960396039605e-05, | |
| "loss": 1.7552, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 8.82, | |
| "grad_norm": 0.28875982761383057, | |
| "learning_rate": 3.861386138613862e-05, | |
| "loss": 1.757, | |
| "step": 12110 | |
| }, | |
| { | |
| "epoch": 8.83, | |
| "grad_norm": 0.2697414755821228, | |
| "learning_rate": 3.762376237623762e-05, | |
| "loss": 1.7579, | |
| "step": 12120 | |
| }, | |
| { | |
| "epoch": 8.84, | |
| "grad_norm": 0.2786589562892914, | |
| "learning_rate": 3.6633663366336634e-05, | |
| "loss": 1.7583, | |
| "step": 12130 | |
| }, | |
| { | |
| "epoch": 8.85, | |
| "grad_norm": 0.258486270904541, | |
| "learning_rate": 3.564356435643564e-05, | |
| "loss": 1.7581, | |
| "step": 12140 | |
| }, | |
| { | |
| "epoch": 8.85, | |
| "grad_norm": 0.2595365345478058, | |
| "learning_rate": 3.465346534653466e-05, | |
| "loss": 1.757, | |
| "step": 12150 | |
| }, | |
| { | |
| "epoch": 8.85, | |
| "eval_accuracy": 0.652040482066107, | |
| "eval_loss": 1.6086018085479736, | |
| "eval_runtime": 1089.5635, | |
| "eval_samples_per_second": 458.33, | |
| "eval_steps_per_second": 2.047, | |
| "step": 12150 | |
| }, | |
| { | |
| "epoch": 8.86, | |
| "grad_norm": 0.25674012303352356, | |
| "learning_rate": 3.366336633663367e-05, | |
| "loss": 1.7595, | |
| "step": 12160 | |
| }, | |
| { | |
| "epoch": 8.87, | |
| "grad_norm": 0.23194921016693115, | |
| "learning_rate": 3.2673267326732674e-05, | |
| "loss": 1.7574, | |
| "step": 12170 | |
| }, | |
| { | |
| "epoch": 8.87, | |
| "grad_norm": 0.2626875936985016, | |
| "learning_rate": 3.1683168316831686e-05, | |
| "loss": 1.7571, | |
| "step": 12180 | |
| }, | |
| { | |
| "epoch": 8.88, | |
| "grad_norm": 0.2361476868391037, | |
| "learning_rate": 3.069306930693069e-05, | |
| "loss": 1.7573, | |
| "step": 12190 | |
| }, | |
| { | |
| "epoch": 8.89, | |
| "grad_norm": 0.2606755793094635, | |
| "learning_rate": 2.9702970297029702e-05, | |
| "loss": 1.7567, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 8.9, | |
| "grad_norm": 0.27499887347221375, | |
| "learning_rate": 2.8712871287128714e-05, | |
| "loss": 1.7579, | |
| "step": 12210 | |
| }, | |
| { | |
| "epoch": 8.9, | |
| "grad_norm": 0.24832656979560852, | |
| "learning_rate": 2.7722772277227722e-05, | |
| "loss": 1.7566, | |
| "step": 12220 | |
| }, | |
| { | |
| "epoch": 8.91, | |
| "grad_norm": 0.24898388981819153, | |
| "learning_rate": 2.6732673267326734e-05, | |
| "loss": 1.7544, | |
| "step": 12230 | |
| }, | |
| { | |
| "epoch": 8.92, | |
| "grad_norm": 0.24266423285007477, | |
| "learning_rate": 2.5742574257425742e-05, | |
| "loss": 1.7559, | |
| "step": 12240 | |
| }, | |
| { | |
| "epoch": 8.92, | |
| "eval_accuracy": 0.6522573824099933, | |
| "eval_loss": 1.6079708337783813, | |
| "eval_runtime": 1089.9176, | |
| "eval_samples_per_second": 458.181, | |
| "eval_steps_per_second": 2.046, | |
| "step": 12240 | |
| }, | |
| { | |
| "epoch": 8.93, | |
| "grad_norm": 0.2438860386610031, | |
| "learning_rate": 2.4752475247524754e-05, | |
| "loss": 1.7554, | |
| "step": 12250 | |
| }, | |
| { | |
| "epoch": 8.93, | |
| "grad_norm": 0.22911418974399567, | |
| "learning_rate": 2.3762376237623762e-05, | |
| "loss": 1.7547, | |
| "step": 12260 | |
| }, | |
| { | |
| "epoch": 8.94, | |
| "grad_norm": 0.2550877034664154, | |
| "learning_rate": 2.277227722772277e-05, | |
| "loss": 1.7567, | |
| "step": 12270 | |
| }, | |
| { | |
| "epoch": 8.95, | |
| "grad_norm": 0.2409505546092987, | |
| "learning_rate": 2.1782178217821783e-05, | |
| "loss": 1.7556, | |
| "step": 12280 | |
| }, | |
| { | |
| "epoch": 8.95, | |
| "grad_norm": 0.23632997274398804, | |
| "learning_rate": 2.0792079207920794e-05, | |
| "loss": 1.7573, | |
| "step": 12290 | |
| }, | |
| { | |
| "epoch": 8.96, | |
| "grad_norm": 0.22292740643024445, | |
| "learning_rate": 1.9801980198019803e-05, | |
| "loss": 1.757, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 8.97, | |
| "grad_norm": 0.2350420504808426, | |
| "learning_rate": 1.881188118811881e-05, | |
| "loss": 1.756, | |
| "step": 12310 | |
| }, | |
| { | |
| "epoch": 8.98, | |
| "grad_norm": 0.22938278317451477, | |
| "learning_rate": 1.782178217821782e-05, | |
| "loss": 1.7562, | |
| "step": 12320 | |
| }, | |
| { | |
| "epoch": 8.98, | |
| "grad_norm": 0.2246268391609192, | |
| "learning_rate": 1.6831683168316834e-05, | |
| "loss": 1.7556, | |
| "step": 12330 | |
| }, | |
| { | |
| "epoch": 8.98, | |
| "eval_accuracy": 0.652376308176148, | |
| "eval_loss": 1.6073620319366455, | |
| "eval_runtime": 1088.9818, | |
| "eval_samples_per_second": 458.575, | |
| "eval_steps_per_second": 2.048, | |
| "step": 12330 | |
| }, | |
| { | |
| "epoch": 8.99, | |
| "grad_norm": 0.22820483148097992, | |
| "learning_rate": 1.5841584158415843e-05, | |
| "loss": 1.7564, | |
| "step": 12340 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "grad_norm": 0.2315167486667633, | |
| "learning_rate": 1.4851485148514851e-05, | |
| "loss": 1.7558, | |
| "step": 12350 | |
| }, | |
| { | |
| "epoch": 9.01, | |
| "grad_norm": 0.21513350307941437, | |
| "learning_rate": 1.3861386138613861e-05, | |
| "loss": 1.757, | |
| "step": 12360 | |
| }, | |
| { | |
| "epoch": 9.01, | |
| "grad_norm": 0.21538245677947998, | |
| "learning_rate": 1.2871287128712871e-05, | |
| "loss": 1.7527, | |
| "step": 12370 | |
| }, | |
| { | |
| "epoch": 9.02, | |
| "grad_norm": 0.22796376049518585, | |
| "learning_rate": 1.1881188118811881e-05, | |
| "loss": 1.7549, | |
| "step": 12380 | |
| }, | |
| { | |
| "epoch": 9.03, | |
| "grad_norm": 0.21846508979797363, | |
| "learning_rate": 1.0891089108910891e-05, | |
| "loss": 1.7527, | |
| "step": 12390 | |
| }, | |
| { | |
| "epoch": 9.03, | |
| "grad_norm": 0.2252340316772461, | |
| "learning_rate": 9.900990099009901e-06, | |
| "loss": 1.757, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 9.04, | |
| "grad_norm": 0.22679966688156128, | |
| "learning_rate": 8.91089108910891e-06, | |
| "loss": 1.7547, | |
| "step": 12410 | |
| }, | |
| { | |
| "epoch": 9.05, | |
| "grad_norm": 0.21749068796634674, | |
| "learning_rate": 7.920792079207921e-06, | |
| "loss": 1.755, | |
| "step": 12420 | |
| }, | |
| { | |
| "epoch": 9.05, | |
| "eval_accuracy": 0.6525192559694988, | |
| "eval_loss": 1.6068978309631348, | |
| "eval_runtime": 1087.147, | |
| "eval_samples_per_second": 459.349, | |
| "eval_steps_per_second": 2.051, | |
| "step": 12420 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 12500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 10, | |
| "save_steps": 90, | |
| "total_flos": 1.28938481325833e+18, | |
| "train_batch_size": 192, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |