| { | |
| "best_metric": 3.9158332347869873, | |
| "best_model_checkpoint": "/content/drive/MyDrive/checkpoints/checkpoint-3342", | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 3342, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.008976660682226212, | |
| "grad_norm": 0.31800901889801025, | |
| "learning_rate": 0.0004997755834829443, | |
| "loss": 45.5673, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.017953321364452424, | |
| "grad_norm": 0.35396715998649597, | |
| "learning_rate": 0.0004995511669658887, | |
| "loss": 45.3249, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.026929982046678635, | |
| "grad_norm": 0.37757596373558044, | |
| "learning_rate": 0.000499326750448833, | |
| "loss": 44.7853, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.03590664272890485, | |
| "grad_norm": 0.38259994983673096, | |
| "learning_rate": 0.0004991023339317774, | |
| "loss": 44.8996, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.04488330341113106, | |
| "grad_norm": 0.38321229815483093, | |
| "learning_rate": 0.0004988779174147217, | |
| "loss": 44.1242, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.05385996409335727, | |
| "grad_norm": 0.42167848348617554, | |
| "learning_rate": 0.0004986535008976661, | |
| "loss": 44.4739, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.06283662477558348, | |
| "grad_norm": 0.40018683671951294, | |
| "learning_rate": 0.0004984290843806105, | |
| "loss": 44.4548, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.0718132854578097, | |
| "grad_norm": 0.39394208788871765, | |
| "learning_rate": 0.0004982046678635547, | |
| "loss": 43.4622, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.0807899461400359, | |
| "grad_norm": 0.3660307228565216, | |
| "learning_rate": 0.0004979802513464991, | |
| "loss": 43.0475, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.08976660682226212, | |
| "grad_norm": 0.36663416028022766, | |
| "learning_rate": 0.0004977558348294434, | |
| "loss": 42.8994, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.09874326750448834, | |
| "grad_norm": 0.40418022871017456, | |
| "learning_rate": 0.0004975314183123878, | |
| "loss": 43.0499, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.10771992818671454, | |
| "grad_norm": 0.32946863770484924, | |
| "learning_rate": 0.0004973070017953322, | |
| "loss": 43.0516, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.11669658886894076, | |
| "grad_norm": 0.36427420377731323, | |
| "learning_rate": 0.0004970825852782765, | |
| "loss": 43.0005, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.12567324955116696, | |
| "grad_norm": 0.3754049837589264, | |
| "learning_rate": 0.0004968581687612209, | |
| "loss": 42.3461, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.13464991023339318, | |
| "grad_norm": 0.3867158889770508, | |
| "learning_rate": 0.0004966337522441652, | |
| "loss": 42.5113, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.1436265709156194, | |
| "grad_norm": 0.35019099712371826, | |
| "learning_rate": 0.0004964093357271095, | |
| "loss": 41.8364, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.1526032315978456, | |
| "grad_norm": 0.37168896198272705, | |
| "learning_rate": 0.0004961849192100539, | |
| "loss": 42.4202, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.1615798922800718, | |
| "grad_norm": 0.36585116386413574, | |
| "learning_rate": 0.0004959605026929982, | |
| "loss": 41.5454, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.17055655296229802, | |
| "grad_norm": 0.4089430272579193, | |
| "learning_rate": 0.0004957360861759426, | |
| "loss": 41.0863, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.17953321364452424, | |
| "grad_norm": 0.3681723475456238, | |
| "learning_rate": 0.0004955116696588868, | |
| "loss": 41.0869, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.18850987432675045, | |
| "grad_norm": 0.3823374807834625, | |
| "learning_rate": 0.0004952872531418312, | |
| "loss": 41.0254, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.19748653500897667, | |
| "grad_norm": 0.3909670412540436, | |
| "learning_rate": 0.0004950628366247755, | |
| "loss": 41.0026, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.20646319569120286, | |
| "grad_norm": 0.39831164479255676, | |
| "learning_rate": 0.0004948384201077199, | |
| "loss": 40.3224, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.21543985637342908, | |
| "grad_norm": 0.3801274597644806, | |
| "learning_rate": 0.0004946140035906643, | |
| "loss": 40.4711, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.2244165170556553, | |
| "grad_norm": 0.39255771040916443, | |
| "learning_rate": 0.0004943895870736086, | |
| "loss": 39.9713, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.2333931777378815, | |
| "grad_norm": 0.400642067193985, | |
| "learning_rate": 0.000494165170556553, | |
| "loss": 39.3574, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.24236983842010773, | |
| "grad_norm": 0.44542375206947327, | |
| "learning_rate": 0.0004939407540394973, | |
| "loss": 39.4756, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.2513464991023339, | |
| "grad_norm": 0.41471394896507263, | |
| "learning_rate": 0.0004937163375224417, | |
| "loss": 39.4551, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.26032315978456017, | |
| "grad_norm": 0.3956909775733948, | |
| "learning_rate": 0.000493491921005386, | |
| "loss": 39.0815, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.26929982046678635, | |
| "grad_norm": 0.5405673384666443, | |
| "learning_rate": 0.0004932675044883304, | |
| "loss": 38.7405, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.27827648114901254, | |
| "grad_norm": 0.4720427691936493, | |
| "learning_rate": 0.0004930430879712747, | |
| "loss": 38.1905, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.2872531418312388, | |
| "grad_norm": 0.4677943289279938, | |
| "learning_rate": 0.0004928186714542191, | |
| "loss": 38.023, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.296229802513465, | |
| "grad_norm": 0.4742816090583801, | |
| "learning_rate": 0.0004925942549371633, | |
| "loss": 37.6844, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.3052064631956912, | |
| "grad_norm": 0.463733047246933, | |
| "learning_rate": 0.0004923698384201077, | |
| "loss": 37.4262, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.3141831238779174, | |
| "grad_norm": 0.48447635769844055, | |
| "learning_rate": 0.000492145421903052, | |
| "loss": 37.1974, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.3231597845601436, | |
| "grad_norm": 0.5126340389251709, | |
| "learning_rate": 0.0004919210053859964, | |
| "loss": 36.4875, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.33213644524236985, | |
| "grad_norm": 0.5128099322319031, | |
| "learning_rate": 0.0004916965888689407, | |
| "loss": 36.8894, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.34111310592459604, | |
| "grad_norm": 0.5677986741065979, | |
| "learning_rate": 0.0004914721723518851, | |
| "loss": 36.0053, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.3500897666068223, | |
| "grad_norm": 0.6088815927505493, | |
| "learning_rate": 0.0004912477558348294, | |
| "loss": 36.2308, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.3590664272890485, | |
| "grad_norm": 0.5765969157218933, | |
| "learning_rate": 0.0004910233393177738, | |
| "loss": 35.4461, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.36804308797127466, | |
| "grad_norm": 0.5858592391014099, | |
| "learning_rate": 0.0004907989228007182, | |
| "loss": 35.0571, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.3770197486535009, | |
| "grad_norm": 0.6825990080833435, | |
| "learning_rate": 0.0004905745062836625, | |
| "loss": 34.717, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.3859964093357271, | |
| "grad_norm": 0.7166014313697815, | |
| "learning_rate": 0.0004903500897666069, | |
| "loss": 33.8366, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.39497307001795334, | |
| "grad_norm": 0.6887209415435791, | |
| "learning_rate": 0.0004901256732495512, | |
| "loss": 33.7563, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.40394973070017953, | |
| "grad_norm": 0.7413772344589233, | |
| "learning_rate": 0.0004899012567324956, | |
| "loss": 33.1205, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.4129263913824057, | |
| "grad_norm": 0.7537035942077637, | |
| "learning_rate": 0.0004896768402154398, | |
| "loss": 32.9826, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.42190305206463197, | |
| "grad_norm": 0.730989396572113, | |
| "learning_rate": 0.0004894524236983842, | |
| "loss": 31.9265, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.43087971274685816, | |
| "grad_norm": 0.9165148735046387, | |
| "learning_rate": 0.0004892280071813285, | |
| "loss": 31.3014, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.4398563734290844, | |
| "grad_norm": 0.8587144613265991, | |
| "learning_rate": 0.0004890035906642729, | |
| "loss": 30.889, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.4488330341113106, | |
| "grad_norm": 0.9183847904205322, | |
| "learning_rate": 0.0004887791741472172, | |
| "loss": 30.1653, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.4578096947935368, | |
| "grad_norm": 0.9044579863548279, | |
| "learning_rate": 0.0004885547576301616, | |
| "loss": 29.9274, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.466786355475763, | |
| "grad_norm": 0.8621285557746887, | |
| "learning_rate": 0.0004883303411131059, | |
| "loss": 29.5479, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.4757630161579892, | |
| "grad_norm": 1.1030315160751343, | |
| "learning_rate": 0.0004881059245960503, | |
| "loss": 29.1674, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.48473967684021546, | |
| "grad_norm": 1.071616768836975, | |
| "learning_rate": 0.00048788150807899463, | |
| "loss": 28.5656, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.49371633752244165, | |
| "grad_norm": 0.9452396035194397, | |
| "learning_rate": 0.000487657091561939, | |
| "loss": 28.1162, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.5026929982046678, | |
| "grad_norm": 0.9999839067459106, | |
| "learning_rate": 0.0004874326750448833, | |
| "loss": 27.1627, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.5116696588868941, | |
| "grad_norm": 1.5522288084030151, | |
| "learning_rate": 0.00048720825852782766, | |
| "loss": 26.8812, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.5206463195691203, | |
| "grad_norm": 1.1541786193847656, | |
| "learning_rate": 0.000486983842010772, | |
| "loss": 26.2589, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.5296229802513465, | |
| "grad_norm": 0.9977880120277405, | |
| "learning_rate": 0.00048675942549371634, | |
| "loss": 26.168, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.5385996409335727, | |
| "grad_norm": 0.9028811454772949, | |
| "learning_rate": 0.0004865350089766607, | |
| "loss": 24.9378, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.547576301615799, | |
| "grad_norm": 1.0026092529296875, | |
| "learning_rate": 0.00048631059245960503, | |
| "loss": 25.0135, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.5565529622980251, | |
| "grad_norm": 1.0198203325271606, | |
| "learning_rate": 0.00048608617594254937, | |
| "loss": 24.6053, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.5655296229802513, | |
| "grad_norm": 1.1564388275146484, | |
| "learning_rate": 0.0004858617594254937, | |
| "loss": 24.0813, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.5745062836624776, | |
| "grad_norm": 0.8892808556556702, | |
| "learning_rate": 0.0004856373429084381, | |
| "loss": 23.7441, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.5834829443447038, | |
| "grad_norm": 1.1114846467971802, | |
| "learning_rate": 0.0004854129263913824, | |
| "loss": 22.8734, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.59245960502693, | |
| "grad_norm": 1.0558847188949585, | |
| "learning_rate": 0.0004851885098743268, | |
| "loss": 22.3045, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.6014362657091562, | |
| "grad_norm": 0.8897343277931213, | |
| "learning_rate": 0.0004849640933572711, | |
| "loss": 21.977, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.6104129263913824, | |
| "grad_norm": 0.9796168208122253, | |
| "learning_rate": 0.0004847396768402155, | |
| "loss": 21.5113, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.6193895870736086, | |
| "grad_norm": 0.8519884943962097, | |
| "learning_rate": 0.0004845152603231598, | |
| "loss": 20.9744, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.6283662477558348, | |
| "grad_norm": 1.1632051467895508, | |
| "learning_rate": 0.00048429084380610416, | |
| "loss": 20.271, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.6373429084380611, | |
| "grad_norm": 0.9868700504302979, | |
| "learning_rate": 0.0004840664272890485, | |
| "loss": 19.9961, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.6463195691202872, | |
| "grad_norm": 0.9679480791091919, | |
| "learning_rate": 0.0004838420107719928, | |
| "loss": 19.4405, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.6552962298025135, | |
| "grad_norm": 1.0145677328109741, | |
| "learning_rate": 0.0004836175942549372, | |
| "loss": 19.2046, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.6642728904847397, | |
| "grad_norm": 1.0279533863067627, | |
| "learning_rate": 0.00048339317773788147, | |
| "loss": 18.2792, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.6732495511669659, | |
| "grad_norm": 1.2876602411270142, | |
| "learning_rate": 0.00048316876122082587, | |
| "loss": 17.8022, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.6822262118491921, | |
| "grad_norm": 1.0419774055480957, | |
| "learning_rate": 0.0004829443447037702, | |
| "loss": 17.4577, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.6912028725314183, | |
| "grad_norm": 1.0887730121612549, | |
| "learning_rate": 0.00048271992818671455, | |
| "loss": 16.5106, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.7001795332136446, | |
| "grad_norm": 1.1203436851501465, | |
| "learning_rate": 0.0004824955116696589, | |
| "loss": 16.4582, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.7091561938958707, | |
| "grad_norm": 1.0770111083984375, | |
| "learning_rate": 0.00048227109515260324, | |
| "loss": 16.003, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.718132854578097, | |
| "grad_norm": 1.2158771753311157, | |
| "learning_rate": 0.0004820466786355476, | |
| "loss": 15.2694, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.7271095152603232, | |
| "grad_norm": 1.1706403493881226, | |
| "learning_rate": 0.000481822262118492, | |
| "loss": 14.9252, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.7360861759425493, | |
| "grad_norm": 1.189310908317566, | |
| "learning_rate": 0.00048159784560143626, | |
| "loss": 14.4921, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.7450628366247756, | |
| "grad_norm": 1.6199108362197876, | |
| "learning_rate": 0.00048137342908438066, | |
| "loss": 13.9443, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.7540394973070018, | |
| "grad_norm": 1.1757200956344604, | |
| "learning_rate": 0.00048114901256732494, | |
| "loss": 13.8288, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.7630161579892281, | |
| "grad_norm": 1.2064054012298584, | |
| "learning_rate": 0.00048092459605026934, | |
| "loss": 12.9563, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.7719928186714542, | |
| "grad_norm": 1.1954108476638794, | |
| "learning_rate": 0.00048070017953321363, | |
| "loss": 12.1382, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.7809694793536804, | |
| "grad_norm": 1.5387598276138306, | |
| "learning_rate": 0.00048047576301615797, | |
| "loss": 12.1248, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.7899461400359067, | |
| "grad_norm": 1.2923359870910645, | |
| "learning_rate": 0.00048025134649910237, | |
| "loss": 11.7902, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.7989228007181328, | |
| "grad_norm": 0.9865145683288574, | |
| "learning_rate": 0.00048002692998204665, | |
| "loss": 10.7329, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.8078994614003591, | |
| "grad_norm": 1.140541672706604, | |
| "learning_rate": 0.00047980251346499105, | |
| "loss": 10.5986, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.8168761220825853, | |
| "grad_norm": 1.1022454500198364, | |
| "learning_rate": 0.00047957809694793534, | |
| "loss": 10.2782, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.8258527827648114, | |
| "grad_norm": 0.8876429200172424, | |
| "learning_rate": 0.00047935368043087973, | |
| "loss": 9.3573, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.8348294434470377, | |
| "grad_norm": 0.9144046306610107, | |
| "learning_rate": 0.0004791292639138241, | |
| "loss": 9.4616, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.8438061041292639, | |
| "grad_norm": 1.022176742553711, | |
| "learning_rate": 0.0004789048473967684, | |
| "loss": 9.0571, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.8527827648114902, | |
| "grad_norm": 0.9050130248069763, | |
| "learning_rate": 0.00047868043087971276, | |
| "loss": 8.4811, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.8617594254937163, | |
| "grad_norm": 0.8372008800506592, | |
| "learning_rate": 0.0004784560143626571, | |
| "loss": 8.3873, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.8707360861759426, | |
| "grad_norm": 0.8663610816001892, | |
| "learning_rate": 0.00047823159784560144, | |
| "loss": 8.0233, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.8797127468581688, | |
| "grad_norm": 0.6936354637145996, | |
| "learning_rate": 0.00047800718132854584, | |
| "loss": 7.8054, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.8886894075403949, | |
| "grad_norm": 0.5529871582984924, | |
| "learning_rate": 0.00047778276481149013, | |
| "loss": 7.6013, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.8976660682226212, | |
| "grad_norm": 0.6260952353477478, | |
| "learning_rate": 0.00047755834829443447, | |
| "loss": 7.4237, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.9066427289048474, | |
| "grad_norm": 0.851337730884552, | |
| "learning_rate": 0.0004773339317773788, | |
| "loss": 7.2549, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.9156193895870736, | |
| "grad_norm": 0.6702756285667419, | |
| "learning_rate": 0.00047710951526032315, | |
| "loss": 7.0967, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.9245960502692998, | |
| "grad_norm": 0.6650304794311523, | |
| "learning_rate": 0.0004768850987432675, | |
| "loss": 6.9988, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.933572710951526, | |
| "grad_norm": 0.551717221736908, | |
| "learning_rate": 0.00047666068222621184, | |
| "loss": 6.5465, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.9425493716337523, | |
| "grad_norm": 0.4560067653656006, | |
| "learning_rate": 0.00047643626570915623, | |
| "loss": 6.5641, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.9515260323159784, | |
| "grad_norm": 0.4556948244571686, | |
| "learning_rate": 0.0004762118491921005, | |
| "loss": 6.6911, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.9605026929982047, | |
| "grad_norm": 0.8652740716934204, | |
| "learning_rate": 0.0004759874326750449, | |
| "loss": 6.7453, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.9694793536804309, | |
| "grad_norm": 0.32210618257522583, | |
| "learning_rate": 0.0004757630161579892, | |
| "loss": 6.5263, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.9784560143626571, | |
| "grad_norm": 1.9738398790359497, | |
| "learning_rate": 0.0004755385996409336, | |
| "loss": 6.4019, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.9874326750448833, | |
| "grad_norm": 0.31478866934776306, | |
| "learning_rate": 0.00047531418312387794, | |
| "loss": 6.266, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.9964093357271095, | |
| "grad_norm": 0.39359068870544434, | |
| "learning_rate": 0.0004750897666068223, | |
| "loss": 6.2422, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 5.025014400482178, | |
| "eval_runtime": 436.9889, | |
| "eval_samples_per_second": 10.197, | |
| "eval_steps_per_second": 1.275, | |
| "step": 1114 | |
| }, | |
| { | |
| "epoch": 1.0053859964093357, | |
| "grad_norm": 0.3087250888347626, | |
| "learning_rate": 0.0004748653500897666, | |
| "loss": 6.1059, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 1.014362657091562, | |
| "grad_norm": 0.4997764825820923, | |
| "learning_rate": 0.00047464093357271097, | |
| "loss": 6.1567, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 1.0233393177737882, | |
| "grad_norm": 0.4492017328739166, | |
| "learning_rate": 0.0004744165170556553, | |
| "loss": 6.0689, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 1.0323159784560143, | |
| "grad_norm": 0.35565611720085144, | |
| "learning_rate": 0.00047419210053859965, | |
| "loss": 5.9551, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 1.0412926391382407, | |
| "grad_norm": 0.28686025738716125, | |
| "learning_rate": 0.000473967684021544, | |
| "loss": 5.9306, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 1.0502692998204668, | |
| "grad_norm": 0.28098103404045105, | |
| "learning_rate": 0.00047374326750448834, | |
| "loss": 5.8205, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 1.059245960502693, | |
| "grad_norm": 0.3124157190322876, | |
| "learning_rate": 0.0004735188509874327, | |
| "loss": 5.7734, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 1.0682226211849193, | |
| "grad_norm": 0.27604150772094727, | |
| "learning_rate": 0.000473294434470377, | |
| "loss": 5.8549, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 1.0771992818671454, | |
| "grad_norm": 0.48105934262275696, | |
| "learning_rate": 0.00047307001795332136, | |
| "loss": 5.8208, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.0861759425493716, | |
| "grad_norm": 0.33073532581329346, | |
| "learning_rate": 0.0004728456014362657, | |
| "loss": 5.7798, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 1.095152603231598, | |
| "grad_norm": 0.24770517647266388, | |
| "learning_rate": 0.0004726211849192101, | |
| "loss": 5.6513, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 1.104129263913824, | |
| "grad_norm": 0.23116350173950195, | |
| "learning_rate": 0.0004723967684021544, | |
| "loss": 5.6458, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 1.1131059245960502, | |
| "grad_norm": 0.2757456302642822, | |
| "learning_rate": 0.0004721723518850988, | |
| "loss": 5.7592, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 1.1220825852782765, | |
| "grad_norm": 0.23286688327789307, | |
| "learning_rate": 0.00047194793536804307, | |
| "loss": 5.6889, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 1.1310592459605027, | |
| "grad_norm": 0.1967301219701767, | |
| "learning_rate": 0.00047172351885098747, | |
| "loss": 5.5865, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 1.140035906642729, | |
| "grad_norm": 0.22576653957366943, | |
| "learning_rate": 0.0004714991023339318, | |
| "loss": 5.4764, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 1.1490125673249552, | |
| "grad_norm": 0.217813640832901, | |
| "learning_rate": 0.00047127468581687615, | |
| "loss": 5.6309, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 1.1579892280071813, | |
| "grad_norm": 0.1798250824213028, | |
| "learning_rate": 0.0004710502692998205, | |
| "loss": 5.4452, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 1.1669658886894076, | |
| "grad_norm": 0.22210471332073212, | |
| "learning_rate": 0.0004708258527827648, | |
| "loss": 5.5905, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.1759425493716338, | |
| "grad_norm": 0.24236564338207245, | |
| "learning_rate": 0.0004706014362657092, | |
| "loss": 5.5106, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 1.18491921005386, | |
| "grad_norm": 0.205738365650177, | |
| "learning_rate": 0.00047037701974865346, | |
| "loss": 5.4863, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 1.1938958707360863, | |
| "grad_norm": 0.2275596708059311, | |
| "learning_rate": 0.00047015260323159786, | |
| "loss": 5.4782, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 1.2028725314183124, | |
| "grad_norm": 0.40637847781181335, | |
| "learning_rate": 0.0004699281867145422, | |
| "loss": 5.4103, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 1.2118491921005385, | |
| "grad_norm": 0.17678338289260864, | |
| "learning_rate": 0.00046970377019748654, | |
| "loss": 5.3858, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 1.220825852782765, | |
| "grad_norm": 0.1862853765487671, | |
| "learning_rate": 0.0004694793536804309, | |
| "loss": 5.379, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 1.229802513464991, | |
| "grad_norm": 0.12334032356739044, | |
| "learning_rate": 0.0004692549371633752, | |
| "loss": 5.396, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 1.2387791741472172, | |
| "grad_norm": 0.15632939338684082, | |
| "learning_rate": 0.00046903052064631957, | |
| "loss": 5.3853, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 1.2477558348294435, | |
| "grad_norm": 0.18021011352539062, | |
| "learning_rate": 0.00046880610412926396, | |
| "loss": 5.2905, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 1.2567324955116697, | |
| "grad_norm": 0.15651032328605652, | |
| "learning_rate": 0.00046858168761220825, | |
| "loss": 5.4102, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.2657091561938958, | |
| "grad_norm": 0.15990717709064484, | |
| "learning_rate": 0.00046835727109515265, | |
| "loss": 5.3213, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 1.2746858168761221, | |
| "grad_norm": 0.23683366179466248, | |
| "learning_rate": 0.00046813285457809694, | |
| "loss": 5.3596, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 1.2836624775583483, | |
| "grad_norm": 0.17186540365219116, | |
| "learning_rate": 0.0004679084380610413, | |
| "loss": 5.2734, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 1.2926391382405744, | |
| "grad_norm": 0.12084522843360901, | |
| "learning_rate": 0.0004676840215439857, | |
| "loss": 5.2741, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 1.3016157989228008, | |
| "grad_norm": 0.13929304480552673, | |
| "learning_rate": 0.00046745960502692996, | |
| "loss": 5.2734, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 1.310592459605027, | |
| "grad_norm": 0.22931580245494843, | |
| "learning_rate": 0.00046723518850987436, | |
| "loss": 5.2281, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 1.319569120287253, | |
| "grad_norm": 0.13986773788928986, | |
| "learning_rate": 0.00046701077199281865, | |
| "loss": 5.2185, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 1.3285457809694794, | |
| "grad_norm": 0.11496925354003906, | |
| "learning_rate": 0.00046678635547576304, | |
| "loss": 5.2082, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 1.3375224416517055, | |
| "grad_norm": 0.2594555616378784, | |
| "learning_rate": 0.00046656193895870733, | |
| "loss": 5.1917, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 1.3464991023339317, | |
| "grad_norm": 0.13332834839820862, | |
| "learning_rate": 0.0004663375224416517, | |
| "loss": 5.1701, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.355475763016158, | |
| "grad_norm": 0.1260669082403183, | |
| "learning_rate": 0.00046611310592459607, | |
| "loss": 5.1703, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 1.3644524236983842, | |
| "grad_norm": 0.17557017505168915, | |
| "learning_rate": 0.0004658886894075404, | |
| "loss": 5.1374, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 1.3734290843806103, | |
| "grad_norm": 0.1354808807373047, | |
| "learning_rate": 0.00046566427289048475, | |
| "loss": 5.1732, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 1.3824057450628366, | |
| "grad_norm": 0.16720908880233765, | |
| "learning_rate": 0.0004654398563734291, | |
| "loss": 5.3396, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 1.3913824057450628, | |
| "grad_norm": 0.19078396260738373, | |
| "learning_rate": 0.00046521543985637343, | |
| "loss": 5.1455, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 1.400359066427289, | |
| "grad_norm": 0.2168230563402176, | |
| "learning_rate": 0.00046499102333931783, | |
| "loss": 5.1026, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 1.4093357271095153, | |
| "grad_norm": 0.12317873537540436, | |
| "learning_rate": 0.0004647666068222621, | |
| "loss": 5.1632, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 1.4183123877917414, | |
| "grad_norm": 0.16298305988311768, | |
| "learning_rate": 0.00046454219030520646, | |
| "loss": 5.1489, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 1.4272890484739678, | |
| "grad_norm": 0.09502866864204407, | |
| "learning_rate": 0.0004643177737881508, | |
| "loss": 5.1068, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 1.436265709156194, | |
| "grad_norm": 0.15911273658275604, | |
| "learning_rate": 0.00046409335727109514, | |
| "loss": 5.0888, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.44524236983842, | |
| "grad_norm": 0.12198328226804733, | |
| "learning_rate": 0.00046386894075403954, | |
| "loss": 5.071, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 1.4542190305206464, | |
| "grad_norm": 0.11831381171941757, | |
| "learning_rate": 0.00046364452423698383, | |
| "loss": 5.0809, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 1.4631956912028725, | |
| "grad_norm": 0.1053285300731659, | |
| "learning_rate": 0.0004634201077199282, | |
| "loss": 5.0774, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 1.4721723518850989, | |
| "grad_norm": 0.1193586066365242, | |
| "learning_rate": 0.0004631956912028725, | |
| "loss": 5.0553, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 1.481149012567325, | |
| "grad_norm": 0.16306863725185394, | |
| "learning_rate": 0.0004629712746858169, | |
| "loss": 5.0607, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 1.4901256732495511, | |
| "grad_norm": 0.12861207127571106, | |
| "learning_rate": 0.0004627468581687612, | |
| "loss": 5.0656, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 1.4991023339317775, | |
| "grad_norm": 0.08006058633327484, | |
| "learning_rate": 0.0004625224416517056, | |
| "loss": 5.0515, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 1.5080789946140036, | |
| "grad_norm": 0.11404240876436234, | |
| "learning_rate": 0.00046229802513464993, | |
| "loss": 5.0098, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 1.5170556552962298, | |
| "grad_norm": 0.13075587153434753, | |
| "learning_rate": 0.0004620736086175943, | |
| "loss": 4.9911, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 1.5260323159784561, | |
| "grad_norm": 0.17212539911270142, | |
| "learning_rate": 0.0004618491921005386, | |
| "loss": 5.0541, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.5350089766606823, | |
| "grad_norm": 0.07674333453178406, | |
| "learning_rate": 0.00046162477558348296, | |
| "loss": 5.0126, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 1.5439856373429084, | |
| "grad_norm": 0.1121719628572464, | |
| "learning_rate": 0.0004614003590664273, | |
| "loss": 5.0082, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 1.5529622980251347, | |
| "grad_norm": 0.16214531660079956, | |
| "learning_rate": 0.00046117594254937164, | |
| "loss": 4.9905, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 1.5619389587073609, | |
| "grad_norm": 0.12353977560997009, | |
| "learning_rate": 0.000460951526032316, | |
| "loss": 4.9644, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 1.570915619389587, | |
| "grad_norm": 0.15267392992973328, | |
| "learning_rate": 0.0004607271095152603, | |
| "loss": 4.9708, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 1.5798922800718134, | |
| "grad_norm": 0.17361833155155182, | |
| "learning_rate": 0.00046050269299820467, | |
| "loss": 4.9869, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 1.5888689407540395, | |
| "grad_norm": 0.2920306622982025, | |
| "learning_rate": 0.000460278276481149, | |
| "loss": 4.9322, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 1.5978456014362656, | |
| "grad_norm": 0.09478717297315598, | |
| "learning_rate": 0.0004600538599640934, | |
| "loss": 4.9247, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 1.606822262118492, | |
| "grad_norm": 0.09164275228977203, | |
| "learning_rate": 0.0004598294434470377, | |
| "loss": 4.9086, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 1.6157989228007181, | |
| "grad_norm": 0.07962439954280853, | |
| "learning_rate": 0.0004596050269299821, | |
| "loss": 4.9412, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.6247755834829443, | |
| "grad_norm": 0.08752849698066711, | |
| "learning_rate": 0.0004593806104129264, | |
| "loss": 4.9291, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 1.6337522441651706, | |
| "grad_norm": 0.09293937683105469, | |
| "learning_rate": 0.0004591561938958708, | |
| "loss": 4.9652, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 1.6427289048473968, | |
| "grad_norm": 0.09523571282625198, | |
| "learning_rate": 0.00045893177737881506, | |
| "loss": 4.9137, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 1.6517055655296229, | |
| "grad_norm": 0.09075015783309937, | |
| "learning_rate": 0.00045870736086175946, | |
| "loss": 4.8925, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 1.6606822262118492, | |
| "grad_norm": 0.14088210463523865, | |
| "learning_rate": 0.0004584829443447038, | |
| "loss": 4.8941, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 1.6696588868940754, | |
| "grad_norm": 0.06859997659921646, | |
| "learning_rate": 0.0004582585278276481, | |
| "loss": 4.8731, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 1.6786355475763015, | |
| "grad_norm": 0.06676523387432098, | |
| "learning_rate": 0.0004580341113105925, | |
| "loss": 4.8615, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 1.6876122082585279, | |
| "grad_norm": 0.08721990138292313, | |
| "learning_rate": 0.00045780969479353677, | |
| "loss": 4.8847, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 1.696588868940754, | |
| "grad_norm": 0.08681096136569977, | |
| "learning_rate": 0.00045758527827648117, | |
| "loss": 4.884, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 1.7055655296229801, | |
| "grad_norm": 0.1754937767982483, | |
| "learning_rate": 0.0004573608617594255, | |
| "loss": 4.8663, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.7145421903052065, | |
| "grad_norm": 0.07060963660478592, | |
| "learning_rate": 0.00045713644524236985, | |
| "loss": 4.9142, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 1.7235188509874326, | |
| "grad_norm": 0.12035933881998062, | |
| "learning_rate": 0.0004569120287253142, | |
| "loss": 4.8599, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 1.7324955116696588, | |
| "grad_norm": 0.11212557554244995, | |
| "learning_rate": 0.00045668761220825853, | |
| "loss": 4.8899, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 1.7414721723518851, | |
| "grad_norm": 0.058452803641557693, | |
| "learning_rate": 0.0004564631956912029, | |
| "loss": 4.8454, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 1.7504488330341115, | |
| "grad_norm": 0.1073731780052185, | |
| "learning_rate": 0.0004562387791741472, | |
| "loss": 4.8546, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 1.7594254937163374, | |
| "grad_norm": 0.12025927007198334, | |
| "learning_rate": 0.00045601436265709156, | |
| "loss": 4.8446, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 1.7684021543985637, | |
| "grad_norm": 0.08838968724012375, | |
| "learning_rate": 0.00045578994614003596, | |
| "loss": 4.8625, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 1.77737881508079, | |
| "grad_norm": 0.0963386595249176, | |
| "learning_rate": 0.00045556552962298024, | |
| "loss": 4.8518, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 1.786355475763016, | |
| "grad_norm": 0.10317738354206085, | |
| "learning_rate": 0.00045534111310592464, | |
| "loss": 4.8161, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 1.7953321364452424, | |
| "grad_norm": 2.792144536972046, | |
| "learning_rate": 0.00045511669658886893, | |
| "loss": 4.8589, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.8043087971274687, | |
| "grad_norm": 0.08297235518693924, | |
| "learning_rate": 0.00045489228007181327, | |
| "loss": 4.8135, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 1.8132854578096946, | |
| "grad_norm": 0.080784372985363, | |
| "learning_rate": 0.00045466786355475767, | |
| "loss": 4.8436, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 1.822262118491921, | |
| "grad_norm": 0.08878181129693985, | |
| "learning_rate": 0.00045444344703770195, | |
| "loss": 4.8532, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 1.8312387791741473, | |
| "grad_norm": 0.0814034566283226, | |
| "learning_rate": 0.00045421903052064635, | |
| "loss": 4.7992, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 1.8402154398563735, | |
| "grad_norm": 0.05908200889825821, | |
| "learning_rate": 0.00045399461400359064, | |
| "loss": 4.8005, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 1.8491921005385996, | |
| "grad_norm": 0.06837856769561768, | |
| "learning_rate": 0.00045377019748653503, | |
| "loss": 4.802, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 1.858168761220826, | |
| "grad_norm": 0.06775591522455215, | |
| "learning_rate": 0.0004535457809694794, | |
| "loss": 4.7847, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 1.867145421903052, | |
| "grad_norm": 0.27018266916275024, | |
| "learning_rate": 0.0004533213644524237, | |
| "loss": 4.7918, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 1.8761220825852782, | |
| "grad_norm": 0.21435914933681488, | |
| "learning_rate": 0.00045309694793536806, | |
| "loss": 4.8033, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 1.8850987432675046, | |
| "grad_norm": 0.07224582880735397, | |
| "learning_rate": 0.0004528725314183124, | |
| "loss": 4.7639, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 1.8940754039497307, | |
| "grad_norm": 0.08708648383617401, | |
| "learning_rate": 0.00045264811490125674, | |
| "loss": 4.7894, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 1.9030520646319569, | |
| "grad_norm": 0.08637712150812149, | |
| "learning_rate": 0.0004524236983842011, | |
| "loss": 4.7745, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 1.9120287253141832, | |
| "grad_norm": 0.06233949586749077, | |
| "learning_rate": 0.0004521992818671454, | |
| "loss": 4.783, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 1.9210053859964094, | |
| "grad_norm": 0.07999356091022491, | |
| "learning_rate": 0.0004519748653500898, | |
| "loss": 4.7585, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 1.9299820466786355, | |
| "grad_norm": 0.09440754354000092, | |
| "learning_rate": 0.0004517504488330341, | |
| "loss": 4.7653, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 1.9389587073608618, | |
| "grad_norm": 0.09272520244121552, | |
| "learning_rate": 0.00045152603231597845, | |
| "loss": 4.7523, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 1.947935368043088, | |
| "grad_norm": 0.08041410148143768, | |
| "learning_rate": 0.0004513016157989228, | |
| "loss": 4.7593, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 1.9569120287253141, | |
| "grad_norm": 0.048107001930475235, | |
| "learning_rate": 0.00045107719928186714, | |
| "loss": 4.7392, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 1.9658886894075405, | |
| "grad_norm": 0.07445549219846725, | |
| "learning_rate": 0.00045085278276481153, | |
| "loss": 4.7288, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 1.9748653500897666, | |
| "grad_norm": 0.06540877372026443, | |
| "learning_rate": 0.0004506283662477558, | |
| "loss": 4.7311, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 1.9838420107719927, | |
| "grad_norm": 0.05422632023692131, | |
| "learning_rate": 0.0004504039497307002, | |
| "loss": 4.728, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 1.992818671454219, | |
| "grad_norm": 0.05353199318051338, | |
| "learning_rate": 0.0004501795332136445, | |
| "loss": 4.7274, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 4.539531707763672, | |
| "eval_runtime": 437.126, | |
| "eval_samples_per_second": 10.194, | |
| "eval_steps_per_second": 1.274, | |
| "step": 2228 | |
| }, | |
| { | |
| "epoch": 2.0017953321364454, | |
| "grad_norm": 0.05564208701252937, | |
| "learning_rate": 0.0004499551166965889, | |
| "loss": 4.7397, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 2.0107719928186714, | |
| "grad_norm": 0.05997077003121376, | |
| "learning_rate": 0.00044973070017953324, | |
| "loss": 4.714, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 2.0197486535008977, | |
| "grad_norm": 0.0496087446808815, | |
| "learning_rate": 0.0004495062836624776, | |
| "loss": 4.7431, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 2.028725314183124, | |
| "grad_norm": 0.08797594904899597, | |
| "learning_rate": 0.0004492818671454219, | |
| "loss": 4.7186, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 2.03770197486535, | |
| "grad_norm": 0.05270407721400261, | |
| "learning_rate": 0.00044905745062836627, | |
| "loss": 4.7419, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 2.0466786355475763, | |
| "grad_norm": 0.06538320332765579, | |
| "learning_rate": 0.0004488330341113106, | |
| "loss": 4.714, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 2.0556552962298027, | |
| "grad_norm": 0.060536161065101624, | |
| "learning_rate": 0.0004486086175942549, | |
| "loss": 4.691, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 2.0646319569120286, | |
| "grad_norm": 0.10158341377973557, | |
| "learning_rate": 0.0004483842010771993, | |
| "loss": 4.702, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 2.073608617594255, | |
| "grad_norm": 0.08171387016773224, | |
| "learning_rate": 0.00044815978456014363, | |
| "loss": 4.7029, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 2.0825852782764813, | |
| "grad_norm": 0.07701843976974487, | |
| "learning_rate": 0.000447935368043088, | |
| "loss": 4.6957, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 2.0915619389587072, | |
| "grad_norm": 0.06302302330732346, | |
| "learning_rate": 0.0004477109515260323, | |
| "loss": 4.6855, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 2.1005385996409336, | |
| "grad_norm": 0.12679466605186462, | |
| "learning_rate": 0.00044748653500897666, | |
| "loss": 4.7147, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 2.10951526032316, | |
| "grad_norm": 0.17339470982551575, | |
| "learning_rate": 0.000447262118491921, | |
| "loss": 4.6697, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 2.118491921005386, | |
| "grad_norm": 0.07397322356700897, | |
| "learning_rate": 0.0004470377019748654, | |
| "loss": 4.6642, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 2.127468581687612, | |
| "grad_norm": 0.0524037629365921, | |
| "learning_rate": 0.0004468132854578097, | |
| "loss": 4.6511, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 2.1364452423698386, | |
| "grad_norm": 0.06674987077713013, | |
| "learning_rate": 0.0004465888689407541, | |
| "loss": 4.6374, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 2.1454219030520645, | |
| "grad_norm": 0.04827852547168732, | |
| "learning_rate": 0.00044636445242369837, | |
| "loss": 4.6531, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 2.154398563734291, | |
| "grad_norm": 0.05094282329082489, | |
| "learning_rate": 0.00044614003590664277, | |
| "loss": 4.6554, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 2.163375224416517, | |
| "grad_norm": 0.0653914213180542, | |
| "learning_rate": 0.00044591561938958705, | |
| "loss": 4.6568, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 2.172351885098743, | |
| "grad_norm": 0.06652519851922989, | |
| "learning_rate": 0.00044569120287253145, | |
| "loss": 4.6533, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 2.1813285457809695, | |
| "grad_norm": 0.051527008414268494, | |
| "learning_rate": 0.0004454667863554758, | |
| "loss": 4.6438, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 2.190305206463196, | |
| "grad_norm": 0.047185543924570084, | |
| "learning_rate": 0.0004452423698384201, | |
| "loss": 4.6157, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 2.1992818671454217, | |
| "grad_norm": 0.0524996742606163, | |
| "learning_rate": 0.0004450179533213645, | |
| "loss": 4.6583, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 2.208258527827648, | |
| "grad_norm": 0.055864688009023666, | |
| "learning_rate": 0.00044479353680430876, | |
| "loss": 4.611, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 2.2172351885098744, | |
| "grad_norm": 0.055937688797712326, | |
| "learning_rate": 0.00044456912028725316, | |
| "loss": 4.6302, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 2.2262118491921004, | |
| "grad_norm": 0.07318311929702759, | |
| "learning_rate": 0.0004443447037701975, | |
| "loss": 4.6383, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 2.2351885098743267, | |
| "grad_norm": 0.05302512273192406, | |
| "learning_rate": 0.00044412028725314184, | |
| "loss": 4.6142, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 2.244165170556553, | |
| "grad_norm": 0.050843581557273865, | |
| "learning_rate": 0.0004438958707360862, | |
| "loss": 4.5937, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 2.253141831238779, | |
| "grad_norm": 0.0519312284886837, | |
| "learning_rate": 0.0004436714542190305, | |
| "loss": 4.6083, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 2.2621184919210053, | |
| "grad_norm": 0.05857894569635391, | |
| "learning_rate": 0.00044344703770197487, | |
| "loss": 4.5765, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 2.2710951526032317, | |
| "grad_norm": 0.05550041422247887, | |
| "learning_rate": 0.00044322262118491926, | |
| "loss": 4.5859, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 2.280071813285458, | |
| "grad_norm": 0.10349979996681213, | |
| "learning_rate": 0.00044299820466786355, | |
| "loss": 4.5765, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 2.289048473967684, | |
| "grad_norm": 0.1185607761144638, | |
| "learning_rate": 0.00044277378815080795, | |
| "loss": 4.5946, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 2.2980251346499103, | |
| "grad_norm": 0.09133188426494598, | |
| "learning_rate": 0.00044254937163375224, | |
| "loss": 4.578, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 2.3070017953321367, | |
| "grad_norm": 0.08713024109601974, | |
| "learning_rate": 0.00044232495511669663, | |
| "loss": 4.6011, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 2.3159784560143626, | |
| "grad_norm": 0.05465725436806679, | |
| "learning_rate": 0.0004421005385996409, | |
| "loss": 4.5755, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 2.324955116696589, | |
| "grad_norm": 0.056493621319532394, | |
| "learning_rate": 0.00044187612208258526, | |
| "loss": 4.5855, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 2.3339317773788153, | |
| "grad_norm": 0.047107528895139694, | |
| "learning_rate": 0.00044165170556552966, | |
| "loss": 4.5681, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 2.342908438061041, | |
| "grad_norm": 0.05533495545387268, | |
| "learning_rate": 0.00044142728904847394, | |
| "loss": 4.5581, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 2.3518850987432676, | |
| "grad_norm": 0.0478278249502182, | |
| "learning_rate": 0.00044120287253141834, | |
| "loss": 4.5425, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 2.360861759425494, | |
| "grad_norm": 0.06553395092487335, | |
| "learning_rate": 0.00044097845601436263, | |
| "loss": 4.5484, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 2.36983842010772, | |
| "grad_norm": 0.07375505566596985, | |
| "learning_rate": 0.000440754039497307, | |
| "loss": 4.541, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 2.378815080789946, | |
| "grad_norm": 0.20693852007389069, | |
| "learning_rate": 0.00044052962298025137, | |
| "loss": 4.5521, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 2.3877917414721725, | |
| "grad_norm": 0.056829433888196945, | |
| "learning_rate": 0.0004403052064631957, | |
| "loss": 4.5588, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 2.3967684021543985, | |
| "grad_norm": 0.05583564192056656, | |
| "learning_rate": 0.00044008078994614005, | |
| "loss": 4.5358, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 2.405745062836625, | |
| "grad_norm": 0.07319542020559311, | |
| "learning_rate": 0.0004398563734290844, | |
| "loss": 4.523, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 2.414721723518851, | |
| "grad_norm": 0.052402835339307785, | |
| "learning_rate": 0.00043963195691202873, | |
| "loss": 4.5096, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 2.423698384201077, | |
| "grad_norm": 0.05206010863184929, | |
| "learning_rate": 0.00043940754039497313, | |
| "loss": 4.5053, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 2.4326750448833034, | |
| "grad_norm": 0.05443358048796654, | |
| "learning_rate": 0.0004391831238779174, | |
| "loss": 4.5501, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 2.44165170556553, | |
| "grad_norm": 0.07843279093503952, | |
| "learning_rate": 0.00043895870736086176, | |
| "loss": 4.5027, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 2.4506283662477557, | |
| "grad_norm": 0.046305350959300995, | |
| "learning_rate": 0.0004387342908438061, | |
| "loss": 4.4975, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 2.459605026929982, | |
| "grad_norm": 0.22592291235923767, | |
| "learning_rate": 0.00043850987432675044, | |
| "loss": 4.5183, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 2.4685816876122084, | |
| "grad_norm": 0.05082382634282112, | |
| "learning_rate": 0.0004382854578096948, | |
| "loss": 4.4864, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 2.4775583482944343, | |
| "grad_norm": 0.06731193512678146, | |
| "learning_rate": 0.00043806104129263913, | |
| "loss": 4.4982, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 2.4865350089766607, | |
| "grad_norm": 185.7747039794922, | |
| "learning_rate": 0.0004378366247755835, | |
| "loss": 4.6692, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 2.495511669658887, | |
| "grad_norm": 0.058124568313360214, | |
| "learning_rate": 0.0004376122082585278, | |
| "loss": 4.4775, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 2.504488330341113, | |
| "grad_norm": 0.08968983590602875, | |
| "learning_rate": 0.0004373877917414722, | |
| "loss": 4.4944, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 2.5134649910233393, | |
| "grad_norm": 0.2788603901863098, | |
| "learning_rate": 0.0004371633752244165, | |
| "loss": 4.503, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 2.5224416517055657, | |
| "grad_norm": 0.05559522658586502, | |
| "learning_rate": 0.0004369389587073609, | |
| "loss": 4.4733, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 2.5314183123877916, | |
| "grad_norm": 0.05935097113251686, | |
| "learning_rate": 0.00043671454219030523, | |
| "loss": 4.4686, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 2.540394973070018, | |
| "grad_norm": 0.05860767886042595, | |
| "learning_rate": 0.0004364901256732496, | |
| "loss": 4.4593, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 2.5493716337522443, | |
| "grad_norm": 0.047259800136089325, | |
| "learning_rate": 0.0004362657091561939, | |
| "loss": 4.4479, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 2.55834829443447, | |
| "grad_norm": 0.04901234060525894, | |
| "learning_rate": 0.00043604129263913826, | |
| "loss": 4.4621, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 2.5673249551166966, | |
| "grad_norm": 0.05742761120200157, | |
| "learning_rate": 0.0004358168761220826, | |
| "loss": 4.4422, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 2.576301615798923, | |
| "grad_norm": 0.05717416852712631, | |
| "learning_rate": 0.0004355924596050269, | |
| "loss": 4.4248, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 2.585278276481149, | |
| "grad_norm": 0.0896502435207367, | |
| "learning_rate": 0.0004353680430879713, | |
| "loss": 4.4368, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 2.594254937163375, | |
| "grad_norm": 0.08746081590652466, | |
| "learning_rate": 0.0004351436265709156, | |
| "loss": 4.4282, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 2.6032315978456015, | |
| "grad_norm": 0.07144750654697418, | |
| "learning_rate": 0.00043491921005385997, | |
| "loss": 4.4794, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 2.6122082585278275, | |
| "grad_norm": 0.05990668013691902, | |
| "learning_rate": 0.0004346947935368043, | |
| "loss": 4.4117, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 2.621184919210054, | |
| "grad_norm": 0.07920947670936584, | |
| "learning_rate": 0.00043447037701974865, | |
| "loss": 4.4179, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 2.63016157989228, | |
| "grad_norm": 0.053824532777071, | |
| "learning_rate": 0.000434245960502693, | |
| "loss": 4.3963, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 2.639138240574506, | |
| "grad_norm": 0.06394129246473312, | |
| "learning_rate": 0.0004340215439856374, | |
| "loss": 4.4045, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 2.6481149012567324, | |
| "grad_norm": 0.2640804648399353, | |
| "learning_rate": 0.0004337971274685817, | |
| "loss": 4.3916, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 2.657091561938959, | |
| "grad_norm": 0.04887564107775688, | |
| "learning_rate": 0.0004335727109515261, | |
| "loss": 4.3892, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 2.6660682226211847, | |
| "grad_norm": 0.05104290321469307, | |
| "learning_rate": 0.00043334829443447036, | |
| "loss": 4.3883, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 2.675044883303411, | |
| "grad_norm": 0.18991751968860626, | |
| "learning_rate": 0.00043312387791741476, | |
| "loss": 4.3747, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 2.6840215439856374, | |
| "grad_norm": 0.3262752294540405, | |
| "learning_rate": 0.0004328994614003591, | |
| "loss": 4.3755, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 2.6929982046678633, | |
| "grad_norm": 0.6619095802307129, | |
| "learning_rate": 0.0004326750448833034, | |
| "loss": 4.3711, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 2.7019748653500897, | |
| "grad_norm": 0.06734511256217957, | |
| "learning_rate": 0.0004324506283662478, | |
| "loss": 4.3606, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 2.710951526032316, | |
| "grad_norm": 0.06055251508951187, | |
| "learning_rate": 0.00043222621184919207, | |
| "loss": 4.3639, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 2.719928186714542, | |
| "grad_norm": 0.08325715363025665, | |
| "learning_rate": 0.00043200179533213647, | |
| "loss": 4.3439, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 2.7289048473967683, | |
| "grad_norm": 0.06473597139120102, | |
| "learning_rate": 0.00043177737881508075, | |
| "loss": 4.3448, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 2.7378815080789947, | |
| "grad_norm": 0.062395766377449036, | |
| "learning_rate": 0.00043155296229802515, | |
| "loss": 4.3262, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 2.7468581687612206, | |
| "grad_norm": 0.054201096296310425, | |
| "learning_rate": 0.0004313285457809695, | |
| "loss": 4.3466, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 2.755834829443447, | |
| "grad_norm": 0.05278482288122177, | |
| "learning_rate": 0.00043110412926391383, | |
| "loss": 4.3493, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 2.7648114901256733, | |
| "grad_norm": 0.05334211513400078, | |
| "learning_rate": 0.0004308797127468582, | |
| "loss": 4.3308, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 2.773788150807899, | |
| "grad_norm": 0.06164594739675522, | |
| "learning_rate": 0.0004306552962298025, | |
| "loss": 4.3166, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 2.7827648114901256, | |
| "grad_norm": 0.07043807953596115, | |
| "learning_rate": 0.00043043087971274686, | |
| "loss": 4.3009, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 2.791741472172352, | |
| "grad_norm": 0.05904858186841011, | |
| "learning_rate": 0.00043020646319569126, | |
| "loss": 4.2982, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 2.800718132854578, | |
| "grad_norm": 0.3487374782562256, | |
| "learning_rate": 0.00042998204667863554, | |
| "loss": 4.2992, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 2.809694793536804, | |
| "grad_norm": 0.06090310215950012, | |
| "learning_rate": 0.00042975763016157994, | |
| "loss": 4.3199, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 2.8186714542190305, | |
| "grad_norm": 0.0674201026558876, | |
| "learning_rate": 0.0004295332136445242, | |
| "loss": 4.2813, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 2.827648114901257, | |
| "grad_norm": 0.0564940869808197, | |
| "learning_rate": 0.00042930879712746857, | |
| "loss": 4.2875, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 2.836624775583483, | |
| "grad_norm": 0.08277291059494019, | |
| "learning_rate": 0.00042908438061041297, | |
| "loss": 4.2741, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 2.845601436265709, | |
| "grad_norm": 0.05882051959633827, | |
| "learning_rate": 0.00042885996409335725, | |
| "loss": 4.2606, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 2.8545780969479355, | |
| "grad_norm": 0.056912124156951904, | |
| "learning_rate": 0.00042863554757630165, | |
| "loss": 4.2387, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 2.8635547576301614, | |
| "grad_norm": 0.06803829967975616, | |
| "learning_rate": 0.00042841113105924594, | |
| "loss": 4.2595, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 2.872531418312388, | |
| "grad_norm": 0.38242146372795105, | |
| "learning_rate": 0.00042818671454219033, | |
| "loss": 4.252, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 2.881508078994614, | |
| "grad_norm": 0.06552311778068542, | |
| "learning_rate": 0.0004279622980251346, | |
| "loss": 4.2386, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 2.89048473967684, | |
| "grad_norm": 0.06190953776240349, | |
| "learning_rate": 0.000427737881508079, | |
| "loss": 4.2279, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 2.8994614003590664, | |
| "grad_norm": 0.07202804833650589, | |
| "learning_rate": 0.00042751346499102336, | |
| "loss": 4.2397, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 2.9084380610412928, | |
| "grad_norm": 0.06415878981351852, | |
| "learning_rate": 0.0004272890484739677, | |
| "loss": 4.2124, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 2.917414721723519, | |
| "grad_norm": 0.06290468573570251, | |
| "learning_rate": 0.00042706463195691204, | |
| "loss": 4.2267, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 2.926391382405745, | |
| "grad_norm": 0.05975602567195892, | |
| "learning_rate": 0.0004268402154398564, | |
| "loss": 4.21, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 2.9353680430879714, | |
| "grad_norm": 0.08022774755954742, | |
| "learning_rate": 0.0004266157989228007, | |
| "loss": 4.2127, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 2.9443447037701977, | |
| "grad_norm": 0.11041318625211716, | |
| "learning_rate": 0.0004263913824057451, | |
| "loss": 4.1903, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 2.9533213644524237, | |
| "grad_norm": 0.06093136593699455, | |
| "learning_rate": 0.0004261669658886894, | |
| "loss": 4.1882, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 2.96229802513465, | |
| "grad_norm": 0.060306135565042496, | |
| "learning_rate": 0.00042594254937163375, | |
| "loss": 4.1874, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 2.9712746858168764, | |
| "grad_norm": 0.0592743381857872, | |
| "learning_rate": 0.0004257181328545781, | |
| "loss": 4.1881, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 2.9802513464991023, | |
| "grad_norm": 0.06113787367939949, | |
| "learning_rate": 0.00042549371633752244, | |
| "loss": 4.1788, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 2.9892280071813286, | |
| "grad_norm": 0.0978228747844696, | |
| "learning_rate": 0.0004252692998204668, | |
| "loss": 4.1714, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 2.998204667863555, | |
| "grad_norm": 0.06704937666654587, | |
| "learning_rate": 0.0004250448833034111, | |
| "loss": 4.1375, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 3.9158332347869873, | |
| "eval_runtime": 437.3013, | |
| "eval_samples_per_second": 10.19, | |
| "eval_steps_per_second": 1.274, | |
| "step": 3342 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 22280, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 20, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "EarlyStoppingCallback": { | |
| "args": { | |
| "early_stopping_patience": 3, | |
| "early_stopping_threshold": 0.0 | |
| }, | |
| "attributes": { | |
| "early_stopping_patience_counter": 0 | |
| } | |
| }, | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.661123660598477e+16, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |