{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998663994655979, "eval_steps": 500, "global_step": 1871, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005344021376085505, "grad_norm": 486.19793701171875, "learning_rate": 1.7543859649122806e-10, "loss": 44.3932, "step": 10 }, { "epoch": 0.01068804275217101, "grad_norm": 479.20001220703125, "learning_rate": 3.5087719298245613e-10, "loss": 45.7403, "step": 20 }, { "epoch": 0.01603206412825651, "grad_norm": 404.8466796875, "learning_rate": 5.263157894736842e-10, "loss": 45.2088, "step": 30 }, { "epoch": 0.02137608550434202, "grad_norm": 481.6076965332031, "learning_rate": 7.017543859649123e-10, "loss": 45.2267, "step": 40 }, { "epoch": 0.026720106880427523, "grad_norm": 505.76458740234375, "learning_rate": 8.771929824561403e-10, "loss": 45.4064, "step": 50 }, { "epoch": 0.03206412825651302, "grad_norm": 436.2538146972656, "learning_rate": 9.99993251508253e-10, "loss": 45.0983, "step": 60 }, { "epoch": 0.03740814963259853, "grad_norm": 487.7237243652344, "learning_rate": 9.998732833893071e-10, "loss": 45.2743, "step": 70 }, { "epoch": 0.04275217100868404, "grad_norm": 504.32977294921875, "learning_rate": 9.996033902036725e-10, "loss": 45.9555, "step": 80 }, { "epoch": 0.04809619238476954, "grad_norm": 402.4021911621094, "learning_rate": 9.991836528993718e-10, "loss": 45.9827, "step": 90 }, { "epoch": 0.053440213760855046, "grad_norm": 423.0853271484375, "learning_rate": 9.986141973665967e-10, "loss": 46.21, "step": 100 }, { "epoch": 0.058784235136940546, "grad_norm": 533.7306518554688, "learning_rate": 9.978951943999498e-10, "loss": 41.8617, "step": 110 }, { "epoch": 0.06412825651302605, "grad_norm": 483.0992736816406, "learning_rate": 9.970268596472183e-10, "loss": 46.5482, "step": 120 }, { "epoch": 0.06947227788911156, "grad_norm": 457.9750061035156, "learning_rate": 9.960094535446974e-10, "loss": 45.5803, "step": 130 }, { "epoch": 0.07481629926519706, "grad_norm": 464.4324951171875, "learning_rate": 9.948432812390764e-10, "loss": 44.9389, "step": 140 }, { "epoch": 0.08016032064128256, "grad_norm": 435.9920349121094, "learning_rate": 9.935286924959192e-10, "loss": 47.8866, "step": 150 }, { "epoch": 0.08550434201736808, "grad_norm": 456.80889892578125, "learning_rate": 9.920660815947595e-10, "loss": 45.0282, "step": 160 }, { "epoch": 0.09084836339345358, "grad_norm": 497.8466491699219, "learning_rate": 9.904558872108458e-10, "loss": 46.1007, "step": 170 }, { "epoch": 0.09619238476953908, "grad_norm": 405.5328369140625, "learning_rate": 9.886985922835717e-10, "loss": 44.4369, "step": 180 }, { "epoch": 0.10153640614562458, "grad_norm": 467.2921142578125, "learning_rate": 9.867947238716296e-10, "loss": 48.2561, "step": 190 }, { "epoch": 0.10688042752171009, "grad_norm": 450.1244812011719, "learning_rate": 9.847448529949325e-10, "loss": 43.8374, "step": 200 }, { "epoch": 0.11222444889779559, "grad_norm": 495.2174072265625, "learning_rate": 9.82549594463349e-10, "loss": 45.4406, "step": 210 }, { "epoch": 0.11756847027388109, "grad_norm": 410.7157897949219, "learning_rate": 9.802096066923072e-10, "loss": 45.8352, "step": 220 }, { "epoch": 0.1229124916499666, "grad_norm": 394.4830017089844, "learning_rate": 9.777255915053179e-10, "loss": 46.1355, "step": 230 }, { "epoch": 0.1282565130260521, "grad_norm": 375.6810607910156, "learning_rate": 9.75098293923479e-10, "loss": 44.0556, "step": 240 }, { "epoch": 0.13360053440213762, "grad_norm": 552.1492309570312, "learning_rate": 9.723285019420253e-10, "loss": 48.5456, "step": 250 }, { "epoch": 0.13894455577822312, "grad_norm": 386.6697998046875, "learning_rate": 9.69417046293987e-10, "loss": 47.2565, "step": 260 }, { "epoch": 0.14428857715430862, "grad_norm": 385.3708190917969, "learning_rate": 9.66364800201032e-10, "loss": 47.0423, "step": 270 }, { "epoch": 0.14963259853039412, "grad_norm": 407.2390441894531, "learning_rate": 9.631726791115632e-10, "loss": 45.1834, "step": 280 }, { "epoch": 0.15497661990647962, "grad_norm": 424.76519775390625, "learning_rate": 9.598416404261524e-10, "loss": 45.0167, "step": 290 }, { "epoch": 0.16032064128256512, "grad_norm": 472.8403015136719, "learning_rate": 9.5637268321039e-10, "loss": 46.5219, "step": 300 }, { "epoch": 0.16566466265865062, "grad_norm": 428.8890686035156, "learning_rate": 9.527668478952394e-10, "loss": 47.501, "step": 310 }, { "epoch": 0.17100868403473615, "grad_norm": 399.3135681152344, "learning_rate": 9.490252159649852e-10, "loss": 44.057, "step": 320 }, { "epoch": 0.17635270541082165, "grad_norm": 385.1392517089844, "learning_rate": 9.451489096328667e-10, "loss": 43.8841, "step": 330 }, { "epoch": 0.18169672678690715, "grad_norm": 416.7450866699219, "learning_rate": 9.411390915044974e-10, "loss": 44.5708, "step": 340 }, { "epoch": 0.18704074816299265, "grad_norm": 374.25531005859375, "learning_rate": 9.369969642291692e-10, "loss": 46.3587, "step": 350 }, { "epoch": 0.19238476953907815, "grad_norm": 451.29510498046875, "learning_rate": 9.327237701391466e-10, "loss": 46.0082, "step": 360 }, { "epoch": 0.19772879091516365, "grad_norm": 481.7860412597656, "learning_rate": 9.283207908770579e-10, "loss": 49.3258, "step": 370 }, { "epoch": 0.20307281229124916, "grad_norm": 493.49517822265625, "learning_rate": 9.237893470114983e-10, "loss": 46.3923, "step": 380 }, { "epoch": 0.20841683366733466, "grad_norm": 451.55072021484375, "learning_rate": 9.191307976409558e-10, "loss": 46.2008, "step": 390 }, { "epoch": 0.21376085504342018, "grad_norm": 474.09625244140625, "learning_rate": 9.143465399861828e-10, "loss": 44.9755, "step": 400 }, { "epoch": 0.21910487641950568, "grad_norm": 454.8025207519531, "learning_rate": 9.094380089711325e-10, "loss": 45.1256, "step": 410 }, { "epoch": 0.22444889779559118, "grad_norm": 480.7178955078125, "learning_rate": 9.04406676792588e-10, "loss": 48.9151, "step": 420 }, { "epoch": 0.22979291917167669, "grad_norm": 361.1165771484375, "learning_rate": 8.992540524786122e-10, "loss": 45.1897, "step": 430 }, { "epoch": 0.23513694054776219, "grad_norm": 455.4756164550781, "learning_rate": 8.939816814359501e-10, "loss": 46.2868, "step": 440 }, { "epoch": 0.24048096192384769, "grad_norm": 518.1709594726562, "learning_rate": 8.885911449865215e-10, "loss": 48.0527, "step": 450 }, { "epoch": 0.2458249832999332, "grad_norm": 470.18536376953125, "learning_rate": 8.830840598931412e-10, "loss": 46.6266, "step": 460 }, { "epoch": 0.2511690046760187, "grad_norm": 456.1210632324219, "learning_rate": 8.774620778746093e-10, "loss": 45.275, "step": 470 }, { "epoch": 0.2565130260521042, "grad_norm": 427.2693176269531, "learning_rate": 8.71726885110318e-10, "loss": 44.1736, "step": 480 }, { "epoch": 0.2618570474281897, "grad_norm": 465.0010681152344, "learning_rate": 8.658802017345217e-10, "loss": 46.5734, "step": 490 }, { "epoch": 0.26720106880427524, "grad_norm": 483.6059265136719, "learning_rate": 8.599237813204241e-10, "loss": 47.0762, "step": 500 }, { "epoch": 0.2725450901803607, "grad_norm": 388.6180725097656, "learning_rate": 8.538594103542357e-10, "loss": 45.9568, "step": 510 }, { "epoch": 0.27788911155644624, "grad_norm": 492.0127868652344, "learning_rate": 8.476889076993602e-10, "loss": 45.8206, "step": 520 }, { "epoch": 0.2832331329325317, "grad_norm": 446.49700927734375, "learning_rate": 8.414141240508689e-10, "loss": 46.4758, "step": 530 }, { "epoch": 0.28857715430861725, "grad_norm": 401.5068359375, "learning_rate": 8.350369413804303e-10, "loss": 45.8422, "step": 540 }, { "epoch": 0.2939211756847027, "grad_norm": 443.8550109863281, "learning_rate": 8.285592723718561e-10, "loss": 46.1345, "step": 550 }, { "epoch": 0.29926519706078825, "grad_norm": 385.59033203125, "learning_rate": 8.219830598474381e-10, "loss": 45.8269, "step": 560 }, { "epoch": 0.3046092184368738, "grad_norm": 405.3898010253906, "learning_rate": 8.153102761852451e-10, "loss": 45.4571, "step": 570 }, { "epoch": 0.30995323981295925, "grad_norm": 524.8499145507812, "learning_rate": 8.085429227275549e-10, "loss": 49.0534, "step": 580 }, { "epoch": 0.3152972611890448, "grad_norm": 485.2023010253906, "learning_rate": 8.016830291805995e-10, "loss": 45.2131, "step": 590 }, { "epoch": 0.32064128256513025, "grad_norm": 416.6390686035156, "learning_rate": 7.947326530058027e-10, "loss": 44.0664, "step": 600 }, { "epoch": 0.3259853039412158, "grad_norm": 437.5408630371094, "learning_rate": 7.876938788026944e-10, "loss": 45.3301, "step": 610 }, { "epoch": 0.33132932531730125, "grad_norm": 471.2472229003906, "learning_rate": 7.805688176836843e-10, "loss": 48.167, "step": 620 }, { "epoch": 0.3366733466933868, "grad_norm": 468.9358215332031, "learning_rate": 7.73359606640884e-10, "loss": 46.2929, "step": 630 }, { "epoch": 0.3420173680694723, "grad_norm": 523.02783203125, "learning_rate": 7.660684079051672e-10, "loss": 46.2754, "step": 640 }, { "epoch": 0.3473613894455578, "grad_norm": 439.5931396484375, "learning_rate": 7.586974082976608e-10, "loss": 45.8867, "step": 650 }, { "epoch": 0.3527054108216433, "grad_norm": 464.5501403808594, "learning_rate": 7.512488185738588e-10, "loss": 45.7995, "step": 660 }, { "epoch": 0.3580494321977288, "grad_norm": 463.3254699707031, "learning_rate": 7.437248727605602e-10, "loss": 45.2951, "step": 670 }, { "epoch": 0.3633934535738143, "grad_norm": 416.180908203125, "learning_rate": 7.361278274858247e-10, "loss": 46.9576, "step": 680 }, { "epoch": 0.3687374749498998, "grad_norm": 504.0238037109375, "learning_rate": 7.284599613021526e-10, "loss": 47.678, "step": 690 }, { "epoch": 0.3740814963259853, "grad_norm": 410.3598937988281, "learning_rate": 7.207235740030858e-10, "loss": 44.9078, "step": 700 }, { "epoch": 0.37942551770207084, "grad_norm": 489.8995361328125, "learning_rate": 7.1292098593344e-10, "loss": 45.3449, "step": 710 }, { "epoch": 0.3847695390781563, "grad_norm": 372.5830078125, "learning_rate": 7.050545372933732e-10, "loss": 45.1218, "step": 720 }, { "epoch": 0.39011356045424184, "grad_norm": 502.1708068847656, "learning_rate": 6.97126587436498e-10, "loss": 47.2275, "step": 730 }, { "epoch": 0.3954575818303273, "grad_norm": 431.7566833496094, "learning_rate": 6.891395141622495e-10, "loss": 45.798, "step": 740 }, { "epoch": 0.40080160320641284, "grad_norm": 415.6434631347656, "learning_rate": 6.810957130027218e-10, "loss": 45.2911, "step": 750 }, { "epoch": 0.4061456245824983, "grad_norm": 451.16290283203125, "learning_rate": 6.729975965041849e-10, "loss": 47.2858, "step": 760 }, { "epoch": 0.41148964595858384, "grad_norm": 427.8857116699219, "learning_rate": 6.64847593503499e-10, "loss": 46.2518, "step": 770 }, { "epoch": 0.4168336673346693, "grad_norm": 448.13836669921875, "learning_rate": 6.566481483996427e-10, "loss": 43.878, "step": 780 }, { "epoch": 0.42217768871075484, "grad_norm": 495.7452392578125, "learning_rate": 6.484017204205741e-10, "loss": 47.3328, "step": 790 }, { "epoch": 0.42752171008684037, "grad_norm": 448.1509704589844, "learning_rate": 6.401107828856438e-10, "loss": 45.6594, "step": 800 }, { "epoch": 0.43286573146292584, "grad_norm": 454.22900390625, "learning_rate": 6.31777822463782e-10, "loss": 46.073, "step": 810 }, { "epoch": 0.43820975283901137, "grad_norm": 494.20709228515625, "learning_rate": 6.234053384276815e-10, "loss": 44.3891, "step": 820 }, { "epoch": 0.44355377421509684, "grad_norm": 397.5838928222656, "learning_rate": 6.149958419042e-10, "loss": 44.5643, "step": 830 }, { "epoch": 0.44889779559118237, "grad_norm": 471.062255859375, "learning_rate": 6.065518551212083e-10, "loss": 46.9195, "step": 840 }, { "epoch": 0.45424181696726784, "grad_norm": 486.7655334472656, "learning_rate": 5.98075910651107e-10, "loss": 47.3481, "step": 850 }, { "epoch": 0.45958583834335337, "grad_norm": 515.932373046875, "learning_rate": 5.895705506512437e-10, "loss": 46.0562, "step": 860 }, { "epoch": 0.4649298597194389, "grad_norm": 429.03814697265625, "learning_rate": 5.810383261014514e-10, "loss": 44.6224, "step": 870 }, { "epoch": 0.47027388109552437, "grad_norm": 299.903564453125, "learning_rate": 5.724817960389447e-10, "loss": 44.7293, "step": 880 }, { "epoch": 0.4756179024716099, "grad_norm": 477.40850830078125, "learning_rate": 5.639035267907963e-10, "loss": 45.3137, "step": 890 }, { "epoch": 0.48096192384769537, "grad_norm": 468.72052001953125, "learning_rate": 5.553060912042296e-10, "loss": 44.8162, "step": 900 }, { "epoch": 0.4863059452237809, "grad_norm": 454.3052062988281, "learning_rate": 5.466920678749537e-10, "loss": 44.9499, "step": 910 }, { "epoch": 0.4916499665998664, "grad_norm": 424.9459228515625, "learning_rate": 5.380640403737752e-10, "loss": 47.8759, "step": 920 }, { "epoch": 0.4969939879759519, "grad_norm": 380.2132873535156, "learning_rate": 5.294245964717187e-10, "loss": 44.8434, "step": 930 }, { "epoch": 0.5023380093520374, "grad_norm": 516.621826171875, "learning_rate": 5.207763273638852e-10, "loss": 46.6005, "step": 940 }, { "epoch": 0.5076820307281229, "grad_norm": 421.64404296875, "learning_rate": 5.121218268922859e-10, "loss": 45.6592, "step": 950 }, { "epoch": 0.5130260521042084, "grad_norm": 458.570068359375, "learning_rate": 5.03463690767881e-10, "loss": 45.8901, "step": 960 }, { "epoch": 0.518370073480294, "grad_norm": 443.55267333984375, "learning_rate": 4.94804515792058e-10, "loss": 44.6454, "step": 970 }, { "epoch": 0.5237140948563794, "grad_norm": 427.3360595703125, "learning_rate": 4.86146899077783e-10, "loss": 45.2378, "step": 980 }, { "epoch": 0.5290581162324649, "grad_norm": 299.7554016113281, "learning_rate": 4.774934372706585e-10, "loss": 44.3535, "step": 990 }, { "epoch": 0.5344021376085505, "grad_norm": 445.1256103515625, "learning_rate": 4.688467257701225e-10, "loss": 45.9619, "step": 1000 }, { "epoch": 0.539746158984636, "grad_norm": 467.1534423828125, "learning_rate": 4.6020935795101856e-10, "loss": 46.8164, "step": 1010 }, { "epoch": 0.5450901803607214, "grad_norm": 433.607666015625, "learning_rate": 4.5158392438577654e-10, "loss": 44.5307, "step": 1020 }, { "epoch": 0.5504342017368069, "grad_norm": 457.611328125, "learning_rate": 4.429730120674315e-10, "loss": 43.718, "step": 1030 }, { "epoch": 0.5557782231128925, "grad_norm": 372.8671569824219, "learning_rate": 4.343792036337167e-10, "loss": 44.3206, "step": 1040 }, { "epoch": 0.561122244488978, "grad_norm": 444.3086242675781, "learning_rate": 4.258050765924633e-10, "loss": 45.5667, "step": 1050 }, { "epoch": 0.5664662658650634, "grad_norm": 338.9475402832031, "learning_rate": 4.172532025485384e-10, "loss": 42.6416, "step": 1060 }, { "epoch": 0.571810287241149, "grad_norm": 448.279052734375, "learning_rate": 4.0872614643255335e-10, "loss": 45.6553, "step": 1070 }, { "epoch": 0.5771543086172345, "grad_norm": 459.70159912109375, "learning_rate": 4.002264657315738e-10, "loss": 46.4637, "step": 1080 }, { "epoch": 0.58249832999332, "grad_norm": 433.6397705078125, "learning_rate": 3.9175670972206326e-10, "loss": 43.3037, "step": 1090 }, { "epoch": 0.5878423513694054, "grad_norm": 431.869873046875, "learning_rate": 3.8331941870528737e-10, "loss": 46.3079, "step": 1100 }, { "epoch": 0.593186372745491, "grad_norm": 411.36077880859375, "learning_rate": 3.7491712324541183e-10, "loss": 46.909, "step": 1110 }, { "epoch": 0.5985303941215765, "grad_norm": 400.7826843261719, "learning_rate": 3.6655234341052023e-10, "loss": 46.5449, "step": 1120 }, { "epoch": 0.603874415497662, "grad_norm": 409.33355712890625, "learning_rate": 3.5822758801677894e-10, "loss": 47.9383, "step": 1130 }, { "epoch": 0.6092184368737475, "grad_norm": 399.2182312011719, "learning_rate": 3.4994535387597803e-10, "loss": 42.633, "step": 1140 }, { "epoch": 0.614562458249833, "grad_norm": 315.4638977050781, "learning_rate": 3.417081250466723e-10, "loss": 43.8757, "step": 1150 }, { "epoch": 0.6199064796259185, "grad_norm": 389.83831787109375, "learning_rate": 3.3351837208914703e-10, "loss": 44.3336, "step": 1160 }, { "epoch": 0.625250501002004, "grad_norm": 556.4785766601562, "learning_rate": 3.253785513244322e-10, "loss": 48.7932, "step": 1170 }, { "epoch": 0.6305945223780896, "grad_norm": 457.6601867675781, "learning_rate": 3.172911040975875e-10, "loss": 45.7914, "step": 1180 }, { "epoch": 0.635938543754175, "grad_norm": 497.7450256347656, "learning_rate": 3.0925845604547985e-10, "loss": 45.789, "step": 1190 }, { "epoch": 0.6412825651302605, "grad_norm": 433.0904846191406, "learning_rate": 3.012830163692706e-10, "loss": 44.0252, "step": 1200 }, { "epoch": 0.6466265865063461, "grad_norm": 417.3199157714844, "learning_rate": 2.933671771118333e-10, "loss": 45.2464, "step": 1210 }, { "epoch": 0.6519706078824316, "grad_norm": 439.3309326171875, "learning_rate": 2.8551331244031814e-10, "loss": 43.0369, "step": 1220 }, { "epoch": 0.657314629258517, "grad_norm": 416.8631286621094, "learning_rate": 2.7772377793407634e-10, "loss": 44.467, "step": 1230 }, { "epoch": 0.6626586506346025, "grad_norm": 459.6900329589844, "learning_rate": 2.7000090987816086e-10, "loss": 45.8894, "step": 1240 }, { "epoch": 0.6680026720106881, "grad_norm": 452.77166748046875, "learning_rate": 2.623470245626131e-10, "loss": 46.3879, "step": 1250 }, { "epoch": 0.6733466933867736, "grad_norm": 373.71551513671875, "learning_rate": 2.547644175877475e-10, "loss": 44.8361, "step": 1260 }, { "epoch": 0.678690714762859, "grad_norm": 493.52978515625, "learning_rate": 2.472553631756397e-10, "loss": 45.5009, "step": 1270 }, { "epoch": 0.6840347361389446, "grad_norm": 495.15216064453125, "learning_rate": 2.3982211348802956e-10, "loss": 45.423, "step": 1280 }, { "epoch": 0.6893787575150301, "grad_norm": 426.9014892578125, "learning_rate": 2.324668979508382e-10, "loss": 45.0799, "step": 1290 }, { "epoch": 0.6947227788911156, "grad_norm": 475.074951171875, "learning_rate": 2.251919225855041e-10, "loss": 45.2446, "step": 1300 }, { "epoch": 0.700066800267201, "grad_norm": 479.0338439941406, "learning_rate": 2.1799936934734111e-10, "loss": 44.113, "step": 1310 }, { "epoch": 0.7054108216432866, "grad_norm": 350.0298156738281, "learning_rate": 2.1089139547111202e-10, "loss": 45.8131, "step": 1320 }, { "epoch": 0.7107548430193721, "grad_norm": 459.32635498046875, "learning_rate": 2.0387013282401746e-10, "loss": 46.7643, "step": 1330 }, { "epoch": 0.7160988643954576, "grad_norm": 455.7977294921875, "learning_rate": 1.969376872662936e-10, "loss": 45.0021, "step": 1340 }, { "epoch": 0.7214428857715431, "grad_norm": 425.2674560546875, "learning_rate": 1.9009613801960964e-10, "loss": 45.1843, "step": 1350 }, { "epoch": 0.7267869071476286, "grad_norm": 390.6509704589844, "learning_rate": 1.8334753704345403e-10, "loss": 44.7194, "step": 1360 }, { "epoch": 0.7321309285237141, "grad_norm": 505.694580078125, "learning_rate": 1.7669390841969942e-10, "loss": 46.2759, "step": 1370 }, { "epoch": 0.7374749498997996, "grad_norm": 369.56854248046875, "learning_rate": 1.7013724774552676e-10, "loss": 44.7077, "step": 1380 }, { "epoch": 0.7428189712758851, "grad_norm": 429.40838623046875, "learning_rate": 1.6367952153489342e-10, "loss": 48.0047, "step": 1390 }, { "epoch": 0.7481629926519706, "grad_norm": 479.33917236328125, "learning_rate": 1.5732266662872497e-10, "loss": 45.8104, "step": 1400 }, { "epoch": 0.7535070140280561, "grad_norm": 447.9562072753906, "learning_rate": 1.510685896140055e-10, "loss": 46.4843, "step": 1410 }, { "epoch": 0.7588510354041417, "grad_norm": 404.3109130859375, "learning_rate": 1.4491916625194192e-10, "loss": 44.9299, "step": 1420 }, { "epoch": 0.7641950567802271, "grad_norm": 444.400634765625, "learning_rate": 1.3887624091537504e-10, "loss": 44.375, "step": 1430 }, { "epoch": 0.7695390781563126, "grad_norm": 522.9306030273438, "learning_rate": 1.329416260356035e-10, "loss": 45.507, "step": 1440 }, { "epoch": 0.7748830995323981, "grad_norm": 511.14727783203125, "learning_rate": 1.271171015587877e-10, "loss": 46.0719, "step": 1450 }, { "epoch": 0.7802271209084837, "grad_norm": 459.71435546875, "learning_rate": 1.2140441441209837e-10, "loss": 44.2746, "step": 1460 }, { "epoch": 0.7855711422845691, "grad_norm": 422.275390625, "learning_rate": 1.158052779797671e-10, "loss": 46.109, "step": 1470 }, { "epoch": 0.7909151636606546, "grad_norm": 407.68096923828125, "learning_rate": 1.1032137158919697e-10, "loss": 44.9659, "step": 1480 }, { "epoch": 0.7962591850367402, "grad_norm": 448.97314453125, "learning_rate": 1.0495434000728927e-10, "loss": 47.4394, "step": 1490 }, { "epoch": 0.8016032064128257, "grad_norm": 485.30316162109375, "learning_rate": 9.970579294713462e-11, "loss": 46.3913, "step": 1500 }, { "epoch": 0.8069472277889111, "grad_norm": 432.86883544921875, "learning_rate": 9.457730458521747e-11, "loss": 47.0394, "step": 1510 }, { "epoch": 0.8122912491649966, "grad_norm": 531.20556640625, "learning_rate": 8.95704130892801e-11, "loss": 45.8065, "step": 1520 }, { "epoch": 0.8176352705410822, "grad_norm": 460.9041748046875, "learning_rate": 8.468662015698525e-11, "loss": 44.684, "step": 1530 }, { "epoch": 0.8229792919171677, "grad_norm": 520.4033813476562, "learning_rate": 7.99273905655184e-11, "loss": 46.8526, "step": 1540 }, { "epoch": 0.8283233132932531, "grad_norm": 411.2076416015625, "learning_rate": 7.52941517322624e-11, "loss": 46.6088, "step": 1550 }, { "epoch": 0.8336673346693386, "grad_norm": 419.9526062011719, "learning_rate": 7.078829328667747e-11, "loss": 46.7982, "step": 1560 }, { "epoch": 0.8390113560454242, "grad_norm": 461.82977294921875, "learning_rate": 6.641116665351543e-11, "loss": 44.4069, "step": 1570 }, { "epoch": 0.8443553774215097, "grad_norm": 462.8651123046875, "learning_rate": 6.216408464749213e-11, "loss": 46.1496, "step": 1580 }, { "epoch": 0.8496993987975952, "grad_norm": 462.21368408203125, "learning_rate": 5.804832107953923e-11, "loss": 43.4678, "step": 1590 }, { "epoch": 0.8550434201736807, "grad_norm": 480.9452209472656, "learning_rate": 5.406511037475603e-11, "loss": 46.4101, "step": 1600 }, { "epoch": 0.8603874415497662, "grad_norm": 467.31610107421875, "learning_rate": 5.021564720217248e-11, "loss": 45.7185, "step": 1610 }, { "epoch": 0.8657314629258517, "grad_norm": 411.0181579589844, "learning_rate": 4.650108611643672e-11, "loss": 43.6447, "step": 1620 }, { "epoch": 0.8710754843019372, "grad_norm": 424.5155944824219, "learning_rate": 4.292254121153422e-11, "loss": 45.3636, "step": 1630 }, { "epoch": 0.8764195056780227, "grad_norm": 410.9560241699219, "learning_rate": 3.948108578664178e-11, "loss": 46.3407, "step": 1640 }, { "epoch": 0.8817635270541082, "grad_norm": 477.28240966796875, "learning_rate": 3.617775202421675e-11, "loss": 44.1375, "step": 1650 }, { "epoch": 0.8871075484301937, "grad_norm": 461.23309326171875, "learning_rate": 3.301353068041896e-11, "loss": 43.7172, "step": 1660 }, { "epoch": 0.8924515698062793, "grad_norm": 447.8024597167969, "learning_rate": 2.998937078795672e-11, "loss": 47.4198, "step": 1670 }, { "epoch": 0.8977955911823647, "grad_norm": 511.4967956542969, "learning_rate": 2.7106179371447437e-11, "loss": 45.0943, "step": 1680 }, { "epoch": 0.9031396125584502, "grad_norm": 478.0008850097656, "learning_rate": 2.4364821175376806e-11, "loss": 46.4821, "step": 1690 }, { "epoch": 0.9084836339345357, "grad_norm": 509.50958251953125, "learning_rate": 2.1766118404739633e-11, "loss": 44.1657, "step": 1700 }, { "epoch": 0.9138276553106213, "grad_norm": 423.26739501953125, "learning_rate": 1.931085047843889e-11, "loss": 47.6892, "step": 1710 }, { "epoch": 0.9191716766867067, "grad_norm": 518.2147827148438, "learning_rate": 1.6999753795517883e-11, "loss": 46.303, "step": 1720 }, { "epoch": 0.9245156980627922, "grad_norm": 386.4200134277344, "learning_rate": 1.483352151429446e-11, "loss": 43.9672, "step": 1730 }, { "epoch": 0.9298597194388778, "grad_norm": 476.4378967285156, "learning_rate": 1.2812803344465052e-11, "loss": 45.2255, "step": 1740 }, { "epoch": 0.9352037408149633, "grad_norm": 493.2698669433594, "learning_rate": 1.0938205352239883e-11, "loss": 45.0955, "step": 1750 }, { "epoch": 0.9405477621910487, "grad_norm": 455.5923156738281, "learning_rate": 9.210289778567305e-12, "loss": 43.4817, "step": 1760 }, { "epoch": 0.9458917835671342, "grad_norm": 452.3888854980469, "learning_rate": 7.629574870503641e-12, "loss": 43.956, "step": 1770 }, { "epoch": 0.9512358049432198, "grad_norm": 369.5935363769531, "learning_rate": 6.196534725777081e-12, "loss": 45.6541, "step": 1780 }, { "epoch": 0.9565798263193053, "grad_norm": 434.7474365234375, "learning_rate": 4.911599150593193e-12, "loss": 45.4734, "step": 1790 }, { "epoch": 0.9619238476953907, "grad_norm": 515.116455078125, "learning_rate": 3.7751535307252726e-12, "loss": 44.1633, "step": 1800 }, { "epoch": 0.9672678690714763, "grad_norm": 423.6890563964844, "learning_rate": 2.7875387159265744e-12, "loss": 45.9123, "step": 1810 }, { "epoch": 0.9726118904475618, "grad_norm": 456.1297607421875, "learning_rate": 1.949050917700923e-12, "loss": 46.53, "step": 1820 }, { "epoch": 0.9779559118236473, "grad_norm": 496.5564270019531, "learning_rate": 1.259941620460947e-12, "loss": 44.7135, "step": 1830 }, { "epoch": 0.9832999331997327, "grad_norm": 408.35650634765625, "learning_rate": 7.204175061013562e-13, "loss": 42.9871, "step": 1840 }, { "epoch": 0.9886439545758183, "grad_norm": 438.6337890625, "learning_rate": 3.3064039200975115e-13, "loss": 44.0024, "step": 1850 }, { "epoch": 0.9939879759519038, "grad_norm": 453.5023193359375, "learning_rate": 9.072718253316792e-14, "loss": 47.6027, "step": 1860 }, { "epoch": 0.9993319973279893, "grad_norm": 423.05963134765625, "learning_rate": 7.498339156808421e-16, "loss": 47.3557, "step": 1870 }, { "epoch": 0.9998663994655979, "step": 1871, "total_flos": 0.0, "train_loss": 45.65529471906604, "train_runtime": 8058.9213, "train_samples_per_second": 7.43, "train_steps_per_second": 0.232 } ], "logging_steps": 10, "max_steps": 1871, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }