{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9998663994655979,
  "eval_steps": 500,
  "global_step": 1871,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.005344021376085505,
      "grad_norm": 486.19793701171875,
      "learning_rate": 1.7543859649122806e-10,
      "loss": 44.3932,
      "step": 10
    },
    {
      "epoch": 0.01068804275217101,
      "grad_norm": 479.20001220703125,
      "learning_rate": 3.5087719298245613e-10,
      "loss": 45.7403,
      "step": 20
    },
    {
      "epoch": 0.01603206412825651,
      "grad_norm": 404.8466796875,
      "learning_rate": 5.263157894736842e-10,
      "loss": 45.2088,
      "step": 30
    },
    {
      "epoch": 0.02137608550434202,
      "grad_norm": 481.6076965332031,
      "learning_rate": 7.017543859649123e-10,
      "loss": 45.2267,
      "step": 40
    },
    {
      "epoch": 0.026720106880427523,
      "grad_norm": 505.76458740234375,
      "learning_rate": 8.771929824561403e-10,
      "loss": 45.4064,
      "step": 50
    },
    {
      "epoch": 0.03206412825651302,
      "grad_norm": 436.2538146972656,
      "learning_rate": 9.99993251508253e-10,
      "loss": 45.0983,
      "step": 60
    },
    {
      "epoch": 0.03740814963259853,
      "grad_norm": 487.7237243652344,
      "learning_rate": 9.998732833893071e-10,
      "loss": 45.2743,
      "step": 70
    },
    {
      "epoch": 0.04275217100868404,
      "grad_norm": 504.32977294921875,
      "learning_rate": 9.996033902036725e-10,
      "loss": 45.9555,
      "step": 80
    },
    {
      "epoch": 0.04809619238476954,
      "grad_norm": 402.4021911621094,
      "learning_rate": 9.991836528993718e-10,
      "loss": 45.9827,
      "step": 90
    },
    {
      "epoch": 0.053440213760855046,
      "grad_norm": 423.0853271484375,
      "learning_rate": 9.986141973665967e-10,
      "loss": 46.21,
      "step": 100
    },
    {
      "epoch": 0.058784235136940546,
      "grad_norm": 533.7306518554688,
      "learning_rate": 9.978951943999498e-10,
      "loss": 41.8617,
      "step": 110
    },
    {
      "epoch": 0.06412825651302605,
      "grad_norm": 483.0992736816406,
      "learning_rate": 9.970268596472183e-10,
      "loss": 46.5482,
      "step": 120
    },
    {
      "epoch": 0.06947227788911156,
      "grad_norm": 457.9750061035156,
      "learning_rate": 9.960094535446974e-10,
      "loss": 45.5803,
      "step": 130
    },
    {
      "epoch": 0.07481629926519706,
      "grad_norm": 464.4324951171875,
      "learning_rate": 9.948432812390764e-10,
      "loss": 44.9389,
      "step": 140
    },
    {
      "epoch": 0.08016032064128256,
      "grad_norm": 435.9920349121094,
      "learning_rate": 9.935286924959192e-10,
      "loss": 47.8866,
      "step": 150
    },
    {
      "epoch": 0.08550434201736808,
      "grad_norm": 456.80889892578125,
      "learning_rate": 9.920660815947595e-10,
      "loss": 45.0282,
      "step": 160
    },
    {
      "epoch": 0.09084836339345358,
      "grad_norm": 497.8466491699219,
      "learning_rate": 9.904558872108458e-10,
      "loss": 46.1007,
      "step": 170
    },
    {
      "epoch": 0.09619238476953908,
      "grad_norm": 405.5328369140625,
      "learning_rate": 9.886985922835717e-10,
      "loss": 44.4369,
      "step": 180
    },
    {
      "epoch": 0.10153640614562458,
      "grad_norm": 467.2921142578125,
      "learning_rate": 9.867947238716296e-10,
      "loss": 48.2561,
      "step": 190
    },
    {
      "epoch": 0.10688042752171009,
      "grad_norm": 450.1244812011719,
      "learning_rate": 9.847448529949325e-10,
      "loss": 43.8374,
      "step": 200
    },
    {
      "epoch": 0.11222444889779559,
      "grad_norm": 495.2174072265625,
      "learning_rate": 9.82549594463349e-10,
      "loss": 45.4406,
      "step": 210
    },
    {
      "epoch": 0.11756847027388109,
      "grad_norm": 410.7157897949219,
      "learning_rate": 9.802096066923072e-10,
      "loss": 45.8352,
      "step": 220
    },
    {
      "epoch": 0.1229124916499666,
      "grad_norm": 394.4830017089844,
      "learning_rate": 9.777255915053179e-10,
      "loss": 46.1355,
      "step": 230
    },
    {
      "epoch": 0.1282565130260521,
      "grad_norm": 375.6810607910156,
      "learning_rate": 9.75098293923479e-10,
      "loss": 44.0556,
      "step": 240
    },
    {
      "epoch": 0.13360053440213762,
      "grad_norm": 552.1492309570312,
      "learning_rate": 9.723285019420253e-10,
      "loss": 48.5456,
      "step": 250
    },
    {
      "epoch": 0.13894455577822312,
      "grad_norm": 386.6697998046875,
      "learning_rate": 9.69417046293987e-10,
      "loss": 47.2565,
      "step": 260
    },
    {
      "epoch": 0.14428857715430862,
      "grad_norm": 385.3708190917969,
      "learning_rate": 9.66364800201032e-10,
      "loss": 47.0423,
      "step": 270
    },
    {
      "epoch": 0.14963259853039412,
      "grad_norm": 407.2390441894531,
      "learning_rate": 9.631726791115632e-10,
      "loss": 45.1834,
      "step": 280
    },
    {
      "epoch": 0.15497661990647962,
      "grad_norm": 424.76519775390625,
      "learning_rate": 9.598416404261524e-10,
      "loss": 45.0167,
      "step": 290
    },
    {
      "epoch": 0.16032064128256512,
      "grad_norm": 472.8403015136719,
      "learning_rate": 9.5637268321039e-10,
      "loss": 46.5219,
      "step": 300
    },
    {
      "epoch": 0.16566466265865062,
      "grad_norm": 428.8890686035156,
      "learning_rate": 9.527668478952394e-10,
      "loss": 47.501,
      "step": 310
    },
    {
      "epoch": 0.17100868403473615,
      "grad_norm": 399.3135681152344,
      "learning_rate": 9.490252159649852e-10,
      "loss": 44.057,
      "step": 320
    },
    {
      "epoch": 0.17635270541082165,
      "grad_norm": 385.1392517089844,
      "learning_rate": 9.451489096328667e-10,
      "loss": 43.8841,
      "step": 330
    },
    {
      "epoch": 0.18169672678690715,
      "grad_norm": 416.7450866699219,
      "learning_rate": 9.411390915044974e-10,
      "loss": 44.5708,
      "step": 340
    },
    {
      "epoch": 0.18704074816299265,
      "grad_norm": 374.25531005859375,
      "learning_rate": 9.369969642291692e-10,
      "loss": 46.3587,
      "step": 350
    },
    {
      "epoch": 0.19238476953907815,
      "grad_norm": 451.29510498046875,
      "learning_rate": 9.327237701391466e-10,
      "loss": 46.0082,
      "step": 360
    },
    {
      "epoch": 0.19772879091516365,
      "grad_norm": 481.7860412597656,
      "learning_rate": 9.283207908770579e-10,
      "loss": 49.3258,
      "step": 370
    },
    {
      "epoch": 0.20307281229124916,
      "grad_norm": 493.49517822265625,
      "learning_rate": 9.237893470114983e-10,
      "loss": 46.3923,
      "step": 380
    },
    {
      "epoch": 0.20841683366733466,
      "grad_norm": 451.55072021484375,
      "learning_rate": 9.191307976409558e-10,
      "loss": 46.2008,
      "step": 390
    },
    {
      "epoch": 0.21376085504342018,
      "grad_norm": 474.09625244140625,
      "learning_rate": 9.143465399861828e-10,
      "loss": 44.9755,
      "step": 400
    },
    {
      "epoch": 0.21910487641950568,
      "grad_norm": 454.8025207519531,
      "learning_rate": 9.094380089711325e-10,
      "loss": 45.1256,
      "step": 410
    },
    {
      "epoch": 0.22444889779559118,
      "grad_norm": 480.7178955078125,
      "learning_rate": 9.04406676792588e-10,
      "loss": 48.9151,
      "step": 420
    },
    {
      "epoch": 0.22979291917167669,
      "grad_norm": 361.1165771484375,
      "learning_rate": 8.992540524786122e-10,
      "loss": 45.1897,
      "step": 430
    },
    {
      "epoch": 0.23513694054776219,
      "grad_norm": 455.4756164550781,
      "learning_rate": 8.939816814359501e-10,
      "loss": 46.2868,
      "step": 440
    },
    {
      "epoch": 0.24048096192384769,
      "grad_norm": 518.1709594726562,
      "learning_rate": 8.885911449865215e-10,
      "loss": 48.0527,
      "step": 450
    },
    {
      "epoch": 0.2458249832999332,
      "grad_norm": 470.18536376953125,
      "learning_rate": 8.830840598931412e-10,
      "loss": 46.6266,
      "step": 460
    },
    {
      "epoch": 0.2511690046760187,
      "grad_norm": 456.1210632324219,
      "learning_rate": 8.774620778746093e-10,
      "loss": 45.275,
      "step": 470
    },
    {
      "epoch": 0.2565130260521042,
      "grad_norm": 427.2693176269531,
      "learning_rate": 8.71726885110318e-10,
      "loss": 44.1736,
      "step": 480
    },
    {
      "epoch": 0.2618570474281897,
      "grad_norm": 465.0010681152344,
      "learning_rate": 8.658802017345217e-10,
      "loss": 46.5734,
      "step": 490
    },
    {
      "epoch": 0.26720106880427524,
      "grad_norm": 483.6059265136719,
      "learning_rate": 8.599237813204241e-10,
      "loss": 47.0762,
      "step": 500
    },
    {
      "epoch": 0.2725450901803607,
      "grad_norm": 388.6180725097656,
      "learning_rate": 8.538594103542357e-10,
      "loss": 45.9568,
      "step": 510
    },
    {
      "epoch": 0.27788911155644624,
      "grad_norm": 492.0127868652344,
      "learning_rate": 8.476889076993602e-10,
      "loss": 45.8206,
      "step": 520
    },
    {
      "epoch": 0.2832331329325317,
      "grad_norm": 446.49700927734375,
      "learning_rate": 8.414141240508689e-10,
      "loss": 46.4758,
      "step": 530
    },
    {
      "epoch": 0.28857715430861725,
      "grad_norm": 401.5068359375,
      "learning_rate": 8.350369413804303e-10,
      "loss": 45.8422,
      "step": 540
    },
    {
      "epoch": 0.2939211756847027,
      "grad_norm": 443.8550109863281,
      "learning_rate": 8.285592723718561e-10,
      "loss": 46.1345,
      "step": 550
    },
    {
      "epoch": 0.29926519706078825,
      "grad_norm": 385.59033203125,
      "learning_rate": 8.219830598474381e-10,
      "loss": 45.8269,
      "step": 560
    },
    {
      "epoch": 0.3046092184368738,
      "grad_norm": 405.3898010253906,
      "learning_rate": 8.153102761852451e-10,
      "loss": 45.4571,
      "step": 570
    },
    {
      "epoch": 0.30995323981295925,
      "grad_norm": 524.8499145507812,
      "learning_rate": 8.085429227275549e-10,
      "loss": 49.0534,
      "step": 580
    },
    {
      "epoch": 0.3152972611890448,
      "grad_norm": 485.2023010253906,
      "learning_rate": 8.016830291805995e-10,
      "loss": 45.2131,
      "step": 590
    },
    {
      "epoch": 0.32064128256513025,
      "grad_norm": 416.6390686035156,
      "learning_rate": 7.947326530058027e-10,
      "loss": 44.0664,
      "step": 600
    },
    {
      "epoch": 0.3259853039412158,
      "grad_norm": 437.5408630371094,
      "learning_rate": 7.876938788026944e-10,
      "loss": 45.3301,
      "step": 610
    },
    {
      "epoch": 0.33132932531730125,
      "grad_norm": 471.2472229003906,
      "learning_rate": 7.805688176836843e-10,
      "loss": 48.167,
      "step": 620
    },
    {
      "epoch": 0.3366733466933868,
      "grad_norm": 468.9358215332031,
      "learning_rate": 7.73359606640884e-10,
      "loss": 46.2929,
      "step": 630
    },
    {
      "epoch": 0.3420173680694723,
      "grad_norm": 523.02783203125,
      "learning_rate": 7.660684079051672e-10,
      "loss": 46.2754,
      "step": 640
    },
    {
      "epoch": 0.3473613894455578,
      "grad_norm": 439.5931396484375,
      "learning_rate": 7.586974082976608e-10,
      "loss": 45.8867,
      "step": 650
    },
    {
      "epoch": 0.3527054108216433,
      "grad_norm": 464.5501403808594,
      "learning_rate": 7.512488185738588e-10,
      "loss": 45.7995,
      "step": 660
    },
    {
      "epoch": 0.3580494321977288,
      "grad_norm": 463.3254699707031,
      "learning_rate": 7.437248727605602e-10,
      "loss": 45.2951,
      "step": 670
    },
    {
      "epoch": 0.3633934535738143,
      "grad_norm": 416.180908203125,
      "learning_rate": 7.361278274858247e-10,
      "loss": 46.9576,
      "step": 680
    },
    {
      "epoch": 0.3687374749498998,
      "grad_norm": 504.0238037109375,
      "learning_rate": 7.284599613021526e-10,
      "loss": 47.678,
      "step": 690
    },
    {
      "epoch": 0.3740814963259853,
      "grad_norm": 410.3598937988281,
      "learning_rate": 7.207235740030858e-10,
      "loss": 44.9078,
      "step": 700
    },
    {
      "epoch": 0.37942551770207084,
      "grad_norm": 489.8995361328125,
      "learning_rate": 7.1292098593344e-10,
      "loss": 45.3449,
      "step": 710
    },
    {
      "epoch": 0.3847695390781563,
      "grad_norm": 372.5830078125,
      "learning_rate": 7.050545372933732e-10,
      "loss": 45.1218,
      "step": 720
    },
    {
      "epoch": 0.39011356045424184,
      "grad_norm": 502.1708068847656,
      "learning_rate": 6.97126587436498e-10,
      "loss": 47.2275,
      "step": 730
    },
    {
      "epoch": 0.3954575818303273,
      "grad_norm": 431.7566833496094,
      "learning_rate": 6.891395141622495e-10,
      "loss": 45.798,
      "step": 740
    },
    {
      "epoch": 0.40080160320641284,
      "grad_norm": 415.6434631347656,
      "learning_rate": 6.810957130027218e-10,
      "loss": 45.2911,
      "step": 750
    },
    {
      "epoch": 0.4061456245824983,
      "grad_norm": 451.16290283203125,
      "learning_rate": 6.729975965041849e-10,
      "loss": 47.2858,
      "step": 760
    },
    {
      "epoch": 0.41148964595858384,
      "grad_norm": 427.8857116699219,
      "learning_rate": 6.64847593503499e-10,
      "loss": 46.2518,
      "step": 770
    },
    {
      "epoch": 0.4168336673346693,
      "grad_norm": 448.13836669921875,
      "learning_rate": 6.566481483996427e-10,
      "loss": 43.878,
      "step": 780
    },
    {
      "epoch": 0.42217768871075484,
      "grad_norm": 495.7452392578125,
      "learning_rate": 6.484017204205741e-10,
      "loss": 47.3328,
      "step": 790
    },
    {
      "epoch": 0.42752171008684037,
      "grad_norm": 448.1509704589844,
      "learning_rate": 6.401107828856438e-10,
      "loss": 45.6594,
      "step": 800
    },
    {
      "epoch": 0.43286573146292584,
      "grad_norm": 454.22900390625,
      "learning_rate": 6.31777822463782e-10,
      "loss": 46.073,
      "step": 810
    },
    {
      "epoch": 0.43820975283901137,
      "grad_norm": 494.20709228515625,
      "learning_rate": 6.234053384276815e-10,
      "loss": 44.3891,
      "step": 820
    },
    {
      "epoch": 0.44355377421509684,
      "grad_norm": 397.5838928222656,
      "learning_rate": 6.149958419042e-10,
      "loss": 44.5643,
      "step": 830
    },
    {
      "epoch": 0.44889779559118237,
      "grad_norm": 471.062255859375,
      "learning_rate": 6.065518551212083e-10,
      "loss": 46.9195,
      "step": 840
    },
    {
      "epoch": 0.45424181696726784,
      "grad_norm": 486.7655334472656,
      "learning_rate": 5.98075910651107e-10,
      "loss": 47.3481,
      "step": 850
    },
    {
      "epoch": 0.45958583834335337,
      "grad_norm": 515.932373046875,
      "learning_rate": 5.895705506512437e-10,
      "loss": 46.0562,
      "step": 860
    },
    {
      "epoch": 0.4649298597194389,
      "grad_norm": 429.03814697265625,
      "learning_rate": 5.810383261014514e-10,
      "loss": 44.6224,
      "step": 870
    },
    {
      "epoch": 0.47027388109552437,
      "grad_norm": 299.903564453125,
      "learning_rate": 5.724817960389447e-10,
      "loss": 44.7293,
      "step": 880
    },
    {
      "epoch": 0.4756179024716099,
      "grad_norm": 477.40850830078125,
      "learning_rate": 5.639035267907963e-10,
      "loss": 45.3137,
      "step": 890
    },
    {
      "epoch": 0.48096192384769537,
      "grad_norm": 468.72052001953125,
      "learning_rate": 5.553060912042296e-10,
      "loss": 44.8162,
      "step": 900
    },
    {
      "epoch": 0.4863059452237809,
      "grad_norm": 454.3052062988281,
      "learning_rate": 5.466920678749537e-10,
      "loss": 44.9499,
      "step": 910
    },
    {
      "epoch": 0.4916499665998664,
      "grad_norm": 424.9459228515625,
      "learning_rate": 5.380640403737752e-10,
      "loss": 47.8759,
      "step": 920
    },
    {
      "epoch": 0.4969939879759519,
      "grad_norm": 380.2132873535156,
      "learning_rate": 5.294245964717187e-10,
      "loss": 44.8434,
      "step": 930
    },
    {
      "epoch": 0.5023380093520374,
      "grad_norm": 516.621826171875,
      "learning_rate": 5.207763273638852e-10,
      "loss": 46.6005,
      "step": 940
    },
    {
      "epoch": 0.5076820307281229,
      "grad_norm": 421.64404296875,
      "learning_rate": 5.121218268922859e-10,
      "loss": 45.6592,
      "step": 950
    },
    {
      "epoch": 0.5130260521042084,
      "grad_norm": 458.570068359375,
      "learning_rate": 5.03463690767881e-10,
      "loss": 45.8901,
      "step": 960
    },
    {
      "epoch": 0.518370073480294,
      "grad_norm": 443.55267333984375,
      "learning_rate": 4.94804515792058e-10,
      "loss": 44.6454,
      "step": 970
    },
    {
      "epoch": 0.5237140948563794,
      "grad_norm": 427.3360595703125,
      "learning_rate": 4.86146899077783e-10,
      "loss": 45.2378,
      "step": 980
    },
    {
      "epoch": 0.5290581162324649,
      "grad_norm": 299.7554016113281,
      "learning_rate": 4.774934372706585e-10,
      "loss": 44.3535,
      "step": 990
    },
    {
      "epoch": 0.5344021376085505,
      "grad_norm": 445.1256103515625,
      "learning_rate": 4.688467257701225e-10,
      "loss": 45.9619,
      "step": 1000
    },
    {
      "epoch": 0.539746158984636,
      "grad_norm": 467.1534423828125,
      "learning_rate": 4.6020935795101856e-10,
      "loss": 46.8164,
      "step": 1010
    },
    {
      "epoch": 0.5450901803607214,
      "grad_norm": 433.607666015625,
      "learning_rate": 4.5158392438577654e-10,
      "loss": 44.5307,
      "step": 1020
    },
    {
      "epoch": 0.5504342017368069,
      "grad_norm": 457.611328125,
      "learning_rate": 4.429730120674315e-10,
      "loss": 43.718,
      "step": 1030
    },
    {
      "epoch": 0.5557782231128925,
      "grad_norm": 372.8671569824219,
      "learning_rate": 4.343792036337167e-10,
      "loss": 44.3206,
      "step": 1040
    },
    {
      "epoch": 0.561122244488978,
      "grad_norm": 444.3086242675781,
      "learning_rate": 4.258050765924633e-10,
      "loss": 45.5667,
      "step": 1050
    },
    {
      "epoch": 0.5664662658650634,
      "grad_norm": 338.9475402832031,
      "learning_rate": 4.172532025485384e-10,
      "loss": 42.6416,
      "step": 1060
    },
    {
      "epoch": 0.571810287241149,
      "grad_norm": 448.279052734375,
      "learning_rate": 4.0872614643255335e-10,
      "loss": 45.6553,
      "step": 1070
    },
    {
      "epoch": 0.5771543086172345,
      "grad_norm": 459.70159912109375,
      "learning_rate": 4.002264657315738e-10,
      "loss": 46.4637,
      "step": 1080
    },
    {
      "epoch": 0.58249832999332,
      "grad_norm": 433.6397705078125,
      "learning_rate": 3.9175670972206326e-10,
      "loss": 43.3037,
      "step": 1090
    },
    {
      "epoch": 0.5878423513694054,
      "grad_norm": 431.869873046875,
      "learning_rate": 3.8331941870528737e-10,
      "loss": 46.3079,
      "step": 1100
    },
    {
      "epoch": 0.593186372745491,
      "grad_norm": 411.36077880859375,
      "learning_rate": 3.7491712324541183e-10,
      "loss": 46.909,
      "step": 1110
    },
    {
      "epoch": 0.5985303941215765,
      "grad_norm": 400.7826843261719,
      "learning_rate": 3.6655234341052023e-10,
      "loss": 46.5449,
      "step": 1120
    },
    {
      "epoch": 0.603874415497662,
      "grad_norm": 409.33355712890625,
      "learning_rate": 3.5822758801677894e-10,
      "loss": 47.9383,
      "step": 1130
    },
    {
      "epoch": 0.6092184368737475,
      "grad_norm": 399.2182312011719,
      "learning_rate": 3.4994535387597803e-10,
      "loss": 42.633,
      "step": 1140
    },
    {
      "epoch": 0.614562458249833,
      "grad_norm": 315.4638977050781,
      "learning_rate": 3.417081250466723e-10,
      "loss": 43.8757,
      "step": 1150
    },
    {
      "epoch": 0.6199064796259185,
      "grad_norm": 389.83831787109375,
      "learning_rate": 3.3351837208914703e-10,
      "loss": 44.3336,
      "step": 1160
    },
    {
      "epoch": 0.625250501002004,
      "grad_norm": 556.4785766601562,
      "learning_rate": 3.253785513244322e-10,
      "loss": 48.7932,
      "step": 1170
    },
    {
      "epoch": 0.6305945223780896,
      "grad_norm": 457.6601867675781,
      "learning_rate": 3.172911040975875e-10,
      "loss": 45.7914,
      "step": 1180
    },
    {
      "epoch": 0.635938543754175,
      "grad_norm": 497.7450256347656,
      "learning_rate": 3.0925845604547985e-10,
      "loss": 45.789,
      "step": 1190
    },
    {
      "epoch": 0.6412825651302605,
      "grad_norm": 433.0904846191406,
      "learning_rate": 3.012830163692706e-10,
      "loss": 44.0252,
      "step": 1200
    },
    {
      "epoch": 0.6466265865063461,
      "grad_norm": 417.3199157714844,
      "learning_rate": 2.933671771118333e-10,
      "loss": 45.2464,
      "step": 1210
    },
    {
      "epoch": 0.6519706078824316,
      "grad_norm": 439.3309326171875,
      "learning_rate": 2.8551331244031814e-10,
      "loss": 43.0369,
      "step": 1220
    },
    {
      "epoch": 0.657314629258517,
      "grad_norm": 416.8631286621094,
      "learning_rate": 2.7772377793407634e-10,
      "loss": 44.467,
      "step": 1230
    },
    {
      "epoch": 0.6626586506346025,
      "grad_norm": 459.6900329589844,
      "learning_rate": 2.7000090987816086e-10,
      "loss": 45.8894,
      "step": 1240
    },
    {
      "epoch": 0.6680026720106881,
      "grad_norm": 452.77166748046875,
      "learning_rate": 2.623470245626131e-10,
      "loss": 46.3879,
      "step": 1250
    },
    {
      "epoch": 0.6733466933867736,
      "grad_norm": 373.71551513671875,
      "learning_rate": 2.547644175877475e-10,
      "loss": 44.8361,
      "step": 1260
    },
    {
      "epoch": 0.678690714762859,
      "grad_norm": 493.52978515625,
      "learning_rate": 2.472553631756397e-10,
      "loss": 45.5009,
      "step": 1270
    },
    {
      "epoch": 0.6840347361389446,
      "grad_norm": 495.15216064453125,
      "learning_rate": 2.3982211348802956e-10,
      "loss": 45.423,
      "step": 1280
    },
    {
      "epoch": 0.6893787575150301,
      "grad_norm": 426.9014892578125,
      "learning_rate": 2.324668979508382e-10,
      "loss": 45.0799,
      "step": 1290
    },
    {
      "epoch": 0.6947227788911156,
      "grad_norm": 475.074951171875,
      "learning_rate": 2.251919225855041e-10,
      "loss": 45.2446,
      "step": 1300
    },
    {
      "epoch": 0.700066800267201,
      "grad_norm": 479.0338439941406,
      "learning_rate": 2.1799936934734111e-10,
      "loss": 44.113,
      "step": 1310
    },
    {
      "epoch": 0.7054108216432866,
      "grad_norm": 350.0298156738281,
      "learning_rate": 2.1089139547111202e-10,
      "loss": 45.8131,
      "step": 1320
    },
    {
      "epoch": 0.7107548430193721,
      "grad_norm": 459.32635498046875,
      "learning_rate": 2.0387013282401746e-10,
      "loss": 46.7643,
      "step": 1330
    },
    {
      "epoch": 0.7160988643954576,
      "grad_norm": 455.7977294921875,
      "learning_rate": 1.969376872662936e-10,
      "loss": 45.0021,
      "step": 1340
    },
    {
      "epoch": 0.7214428857715431,
      "grad_norm": 425.2674560546875,
      "learning_rate": 1.9009613801960964e-10,
      "loss": 45.1843,
      "step": 1350
    },
    {
      "epoch": 0.7267869071476286,
      "grad_norm": 390.6509704589844,
      "learning_rate": 1.8334753704345403e-10,
      "loss": 44.7194,
      "step": 1360
    },
    {
      "epoch": 0.7321309285237141,
      "grad_norm": 505.694580078125,
      "learning_rate": 1.7669390841969942e-10,
      "loss": 46.2759,
      "step": 1370
    },
    {
      "epoch": 0.7374749498997996,
      "grad_norm": 369.56854248046875,
      "learning_rate": 1.7013724774552676e-10,
      "loss": 44.7077,
      "step": 1380
    },
    {
      "epoch": 0.7428189712758851,
      "grad_norm": 429.40838623046875,
      "learning_rate": 1.6367952153489342e-10,
      "loss": 48.0047,
      "step": 1390
    },
    {
      "epoch": 0.7481629926519706,
      "grad_norm": 479.33917236328125,
      "learning_rate": 1.5732266662872497e-10,
      "loss": 45.8104,
      "step": 1400
    },
    {
      "epoch": 0.7535070140280561,
      "grad_norm": 447.9562072753906,
      "learning_rate": 1.510685896140055e-10,
      "loss": 46.4843,
      "step": 1410
    },
    {
      "epoch": 0.7588510354041417,
      "grad_norm": 404.3109130859375,
      "learning_rate": 1.4491916625194192e-10,
      "loss": 44.9299,
      "step": 1420
    },
    {
      "epoch": 0.7641950567802271,
      "grad_norm": 444.400634765625,
      "learning_rate": 1.3887624091537504e-10,
      "loss": 44.375,
      "step": 1430
    },
    {
      "epoch": 0.7695390781563126,
      "grad_norm": 522.9306030273438,
      "learning_rate": 1.329416260356035e-10,
      "loss": 45.507,
      "step": 1440
    },
    {
      "epoch": 0.7748830995323981,
      "grad_norm": 511.14727783203125,
      "learning_rate": 1.271171015587877e-10,
      "loss": 46.0719,
      "step": 1450
    },
    {
      "epoch": 0.7802271209084837,
      "grad_norm": 459.71435546875,
      "learning_rate": 1.2140441441209837e-10,
      "loss": 44.2746,
      "step": 1460
    },
    {
      "epoch": 0.7855711422845691,
      "grad_norm": 422.275390625,
      "learning_rate": 1.158052779797671e-10,
      "loss": 46.109,
      "step": 1470
    },
    {
      "epoch": 0.7909151636606546,
      "grad_norm": 407.68096923828125,
      "learning_rate": 1.1032137158919697e-10,
      "loss": 44.9659,
      "step": 1480
    },
    {
      "epoch": 0.7962591850367402,
      "grad_norm": 448.97314453125,
      "learning_rate": 1.0495434000728927e-10,
      "loss": 47.4394,
      "step": 1490
    },
    {
      "epoch": 0.8016032064128257,
      "grad_norm": 485.30316162109375,
      "learning_rate": 9.970579294713462e-11,
      "loss": 46.3913,
      "step": 1500
    },
    {
      "epoch": 0.8069472277889111,
      "grad_norm": 432.86883544921875,
      "learning_rate": 9.457730458521747e-11,
      "loss": 47.0394,
      "step": 1510
    },
    {
      "epoch": 0.8122912491649966,
      "grad_norm": 531.20556640625,
      "learning_rate": 8.95704130892801e-11,
      "loss": 45.8065,
      "step": 1520
    },
    {
      "epoch": 0.8176352705410822,
      "grad_norm": 460.9041748046875,
      "learning_rate": 8.468662015698525e-11,
      "loss": 44.684,
      "step": 1530
    },
    {
      "epoch": 0.8229792919171677,
      "grad_norm": 520.4033813476562,
      "learning_rate": 7.99273905655184e-11,
      "loss": 46.8526,
      "step": 1540
    },
    {
      "epoch": 0.8283233132932531,
      "grad_norm": 411.2076416015625,
      "learning_rate": 7.52941517322624e-11,
      "loss": 46.6088,
      "step": 1550
    },
    {
      "epoch": 0.8336673346693386,
      "grad_norm": 419.9526062011719,
      "learning_rate": 7.078829328667747e-11,
      "loss": 46.7982,
      "step": 1560
    },
    {
      "epoch": 0.8390113560454242,
      "grad_norm": 461.82977294921875,
      "learning_rate": 6.641116665351543e-11,
      "loss": 44.4069,
      "step": 1570
    },
    {
      "epoch": 0.8443553774215097,
      "grad_norm": 462.8651123046875,
      "learning_rate": 6.216408464749213e-11,
      "loss": 46.1496,
      "step": 1580
    },
    {
      "epoch": 0.8496993987975952,
      "grad_norm": 462.21368408203125,
      "learning_rate": 5.804832107953923e-11,
      "loss": 43.4678,
      "step": 1590
    },
    {
      "epoch": 0.8550434201736807,
      "grad_norm": 480.9452209472656,
      "learning_rate": 5.406511037475603e-11,
      "loss": 46.4101,
      "step": 1600
    },
    {
      "epoch": 0.8603874415497662,
      "grad_norm": 467.31610107421875,
      "learning_rate": 5.021564720217248e-11,
      "loss": 45.7185,
      "step": 1610
    },
    {
      "epoch": 0.8657314629258517,
      "grad_norm": 411.0181579589844,
      "learning_rate": 4.650108611643672e-11,
      "loss": 43.6447,
      "step": 1620
    },
    {
      "epoch": 0.8710754843019372,
      "grad_norm": 424.5155944824219,
      "learning_rate": 4.292254121153422e-11,
      "loss": 45.3636,
      "step": 1630
    },
    {
      "epoch": 0.8764195056780227,
      "grad_norm": 410.9560241699219,
      "learning_rate": 3.948108578664178e-11,
      "loss": 46.3407,
      "step": 1640
    },
    {
      "epoch": 0.8817635270541082,
      "grad_norm": 477.28240966796875,
      "learning_rate": 3.617775202421675e-11,
      "loss": 44.1375,
      "step": 1650
    },
    {
      "epoch": 0.8871075484301937,
      "grad_norm": 461.23309326171875,
      "learning_rate": 3.301353068041896e-11,
      "loss": 43.7172,
      "step": 1660
    },
    {
      "epoch": 0.8924515698062793,
      "grad_norm": 447.8024597167969,
      "learning_rate": 2.998937078795672e-11,
      "loss": 47.4198,
      "step": 1670
    },
    {
      "epoch": 0.8977955911823647,
      "grad_norm": 511.4967956542969,
      "learning_rate": 2.7106179371447437e-11,
      "loss": 45.0943,
      "step": 1680
    },
    {
      "epoch": 0.9031396125584502,
      "grad_norm": 478.0008850097656,
      "learning_rate": 2.4364821175376806e-11,
      "loss": 46.4821,
      "step": 1690
    },
    {
      "epoch": 0.9084836339345357,
      "grad_norm": 509.50958251953125,
      "learning_rate": 2.1766118404739633e-11,
      "loss": 44.1657,
      "step": 1700
    },
    {
      "epoch": 0.9138276553106213,
      "grad_norm": 423.26739501953125,
      "learning_rate": 1.931085047843889e-11,
      "loss": 47.6892,
      "step": 1710
    },
    {
      "epoch": 0.9191716766867067,
      "grad_norm": 518.2147827148438,
      "learning_rate": 1.6999753795517883e-11,
      "loss": 46.303,
      "step": 1720
    },
    {
      "epoch": 0.9245156980627922,
      "grad_norm": 386.4200134277344,
      "learning_rate": 1.483352151429446e-11,
      "loss": 43.9672,
      "step": 1730
    },
    {
      "epoch": 0.9298597194388778,
      "grad_norm": 476.4378967285156,
      "learning_rate": 1.2812803344465052e-11,
      "loss": 45.2255,
      "step": 1740
    },
    {
      "epoch": 0.9352037408149633,
      "grad_norm": 493.2698669433594,
      "learning_rate": 1.0938205352239883e-11,
      "loss": 45.0955,
      "step": 1750
    },
    {
      "epoch": 0.9405477621910487,
      "grad_norm": 455.5923156738281,
      "learning_rate": 9.210289778567305e-12,
      "loss": 43.4817,
      "step": 1760
    },
    {
      "epoch": 0.9458917835671342,
      "grad_norm": 452.3888854980469,
      "learning_rate": 7.629574870503641e-12,
      "loss": 43.956,
      "step": 1770
    },
    {
      "epoch": 0.9512358049432198,
      "grad_norm": 369.5935363769531,
      "learning_rate": 6.196534725777081e-12,
      "loss": 45.6541,
      "step": 1780
    },
    {
      "epoch": 0.9565798263193053,
      "grad_norm": 434.7474365234375,
      "learning_rate": 4.911599150593193e-12,
      "loss": 45.4734,
      "step": 1790
    },
    {
      "epoch": 0.9619238476953907,
      "grad_norm": 515.116455078125,
      "learning_rate": 3.7751535307252726e-12,
      "loss": 44.1633,
      "step": 1800
    },
    {
      "epoch": 0.9672678690714763,
      "grad_norm": 423.6890563964844,
      "learning_rate": 2.7875387159265744e-12,
      "loss": 45.9123,
      "step": 1810
    },
    {
      "epoch": 0.9726118904475618,
      "grad_norm": 456.1297607421875,
      "learning_rate": 1.949050917700923e-12,
      "loss": 46.53,
      "step": 1820
    },
    {
      "epoch": 0.9779559118236473,
      "grad_norm": 496.5564270019531,
      "learning_rate": 1.259941620460947e-12,
      "loss": 44.7135,
      "step": 1830
    },
    {
      "epoch": 0.9832999331997327,
      "grad_norm": 408.35650634765625,
      "learning_rate": 7.204175061013562e-13,
      "loss": 42.9871,
      "step": 1840
    },
    {
      "epoch": 0.9886439545758183,
      "grad_norm": 438.6337890625,
      "learning_rate": 3.3064039200975115e-13,
      "loss": 44.0024,
      "step": 1850
    },
    {
      "epoch": 0.9939879759519038,
      "grad_norm": 453.5023193359375,
      "learning_rate": 9.072718253316792e-14,
      "loss": 47.6027,
      "step": 1860
    },
    {
      "epoch": 0.9993319973279893,
      "grad_norm": 423.05963134765625,
      "learning_rate": 7.498339156808421e-16,
      "loss": 47.3557,
      "step": 1870
    },
    {
      "epoch": 0.9998663994655979,
      "step": 1871,
      "total_flos": 0.0,
      "train_loss": 45.65529471906604,
      "train_runtime": 8058.9213,
      "train_samples_per_second": 7.43,
      "train_steps_per_second": 0.232
    }
  ],
  "logging_steps": 10,
  "max_steps": 1871,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}