{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998899122967965, "eval_steps": 500, "global_step": 3406, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0029356720854280577, "grad_norm": 2.3989222049713135, "learning_rate": 1.9999583060217186e-05, "loss": 1.4046, "step": 10 }, { "epoch": 0.005871344170856115, "grad_norm": 2.4013407230377197, "learning_rate": 1.9998315387870395e-05, "loss": 1.2805, "step": 20 }, { "epoch": 0.008807016256284174, "grad_norm": 2.5987584590911865, "learning_rate": 1.9996197048273697e-05, "loss": 1.2627, "step": 30 }, { "epoch": 0.01174268834171223, "grad_norm": 2.5626957416534424, "learning_rate": 1.999322822165767e-05, "loss": 1.34, "step": 40 }, { "epoch": 0.014678360427140288, "grad_norm": 2.4768199920654297, "learning_rate": 1.998940916061322e-05, "loss": 1.2963, "step": 50 }, { "epoch": 0.017614032512568347, "grad_norm": 2.1327669620513916, "learning_rate": 1.9984740190070102e-05, "loss": 1.3513, "step": 60 }, { "epoch": 0.020549704597996404, "grad_norm": 2.7165908813476562, "learning_rate": 1.9979221707269273e-05, "loss": 1.2629, "step": 70 }, { "epoch": 0.02348537668342446, "grad_norm": 2.5773496627807617, "learning_rate": 1.997285418172908e-05, "loss": 1.2504, "step": 80 }, { "epoch": 0.02642104876885252, "grad_norm": 2.7524304389953613, "learning_rate": 1.9965638155205335e-05, "loss": 1.2854, "step": 90 }, { "epoch": 0.029356720854280576, "grad_norm": 2.542572498321533, "learning_rate": 1.995757424164521e-05, "loss": 1.2583, "step": 100 }, { "epoch": 0.03229239293970863, "grad_norm": 3.595125198364258, "learning_rate": 1.9948663127135003e-05, "loss": 1.2612, "step": 110 }, { "epoch": 0.035228065025136694, "grad_norm": 2.8669538497924805, "learning_rate": 1.9938905569841754e-05, "loss": 1.237, "step": 120 }, { "epoch": 0.03816373711056475, "grad_norm": 3.1492984294891357, "learning_rate": 1.9928302399948767e-05, "loss": 1.2394, "step": 130 }, { "epoch": 0.04109940919599281, "grad_norm": 3.0048630237579346, "learning_rate": 1.991685451958495e-05, "loss": 1.1899, "step": 140 }, { "epoch": 0.04403508128142086, "grad_norm": 2.9907774925231934, "learning_rate": 1.990456290274808e-05, "loss": 1.1939, "step": 150 }, { "epoch": 0.04697075336684892, "grad_norm": 3.010820150375366, "learning_rate": 1.9891428595221914e-05, "loss": 1.2212, "step": 160 }, { "epoch": 0.049906425452276984, "grad_norm": 2.486607551574707, "learning_rate": 1.9877452714487232e-05, "loss": 1.1824, "step": 170 }, { "epoch": 0.05284209753770504, "grad_norm": 2.491534471511841, "learning_rate": 1.9862636449626752e-05, "loss": 1.2118, "step": 180 }, { "epoch": 0.0557777696231331, "grad_norm": 2.5148768424987793, "learning_rate": 1.9846981061223958e-05, "loss": 1.2377, "step": 190 }, { "epoch": 0.05871344170856115, "grad_norm": 2.640007734298706, "learning_rate": 1.9830487881255864e-05, "loss": 1.1995, "step": 200 }, { "epoch": 0.06164911379398921, "grad_norm": 2.6175191402435303, "learning_rate": 1.981315831297966e-05, "loss": 1.1114, "step": 210 }, { "epoch": 0.06458478587941727, "grad_norm": 2.8480780124664307, "learning_rate": 1.9794993830813358e-05, "loss": 1.1897, "step": 220 }, { "epoch": 0.06752045796484532, "grad_norm": 2.635657787322998, "learning_rate": 1.9775995980210306e-05, "loss": 1.1592, "step": 230 }, { "epoch": 0.07045613005027339, "grad_norm": 2.8263065814971924, "learning_rate": 1.9756166377527734e-05, "loss": 1.1712, "step": 240 }, { "epoch": 0.07339180213570144, "grad_norm": 3.2777886390686035, "learning_rate": 1.9735506709889213e-05, "loss": 1.1724, "step": 250 }, { "epoch": 0.0763274742211295, "grad_norm": 2.587409019470215, "learning_rate": 1.9714018735041125e-05, "loss": 1.1767, "step": 260 }, { "epoch": 0.07926314630655756, "grad_norm": 2.9432199001312256, "learning_rate": 1.9691704281203098e-05, "loss": 1.1553, "step": 270 }, { "epoch": 0.08219881839198562, "grad_norm": 3.1995699405670166, "learning_rate": 1.966856524691247e-05, "loss": 1.1577, "step": 280 }, { "epoch": 0.08513449047741367, "grad_norm": 2.805522918701172, "learning_rate": 1.9644603600862753e-05, "loss": 1.1627, "step": 290 }, { "epoch": 0.08807016256284173, "grad_norm": 2.6327457427978516, "learning_rate": 1.961982138173615e-05, "loss": 1.1483, "step": 300 }, { "epoch": 0.09100583464826979, "grad_norm": 4.197879314422607, "learning_rate": 1.959422069803007e-05, "loss": 1.2015, "step": 310 }, { "epoch": 0.09394150673369785, "grad_norm": 3.2353358268737793, "learning_rate": 1.956780372787777e-05, "loss": 1.1068, "step": 320 }, { "epoch": 0.0968771788191259, "grad_norm": 2.8246076107025146, "learning_rate": 1.9540572718863012e-05, "loss": 1.1424, "step": 330 }, { "epoch": 0.09981285090455397, "grad_norm": 4.32204532623291, "learning_rate": 1.9512529987828853e-05, "loss": 1.1508, "step": 340 }, { "epoch": 0.10274852298998202, "grad_norm": 2.8142335414886475, "learning_rate": 1.9483677920680512e-05, "loss": 1.1623, "step": 350 }, { "epoch": 0.10568419507541008, "grad_norm": 2.9336957931518555, "learning_rate": 1.9454018972182383e-05, "loss": 1.161, "step": 360 }, { "epoch": 0.10861986716083813, "grad_norm": 2.9903533458709717, "learning_rate": 1.9423555665749182e-05, "loss": 1.1444, "step": 370 }, { "epoch": 0.1115555392462662, "grad_norm": 2.8897149562835693, "learning_rate": 1.939229059323124e-05, "loss": 1.171, "step": 380 }, { "epoch": 0.11449121133169425, "grad_norm": 3.222294807434082, "learning_rate": 1.9360226414694008e-05, "loss": 1.1365, "step": 390 }, { "epoch": 0.1174268834171223, "grad_norm": 2.9000742435455322, "learning_rate": 1.932736585819171e-05, "loss": 1.1342, "step": 400 }, { "epoch": 0.12036255550255036, "grad_norm": 2.682969093322754, "learning_rate": 1.929371171953526e-05, "loss": 1.0428, "step": 410 }, { "epoch": 0.12329822758797843, "grad_norm": 2.8611629009246826, "learning_rate": 1.9259266862054366e-05, "loss": 1.1135, "step": 420 }, { "epoch": 0.12623389967340648, "grad_norm": 2.9910166263580322, "learning_rate": 1.9224034216353947e-05, "loss": 1.1154, "step": 430 }, { "epoch": 0.12916957175883453, "grad_norm": 3.245227336883545, "learning_rate": 1.9188016780064768e-05, "loss": 1.2029, "step": 440 }, { "epoch": 0.1321052438442626, "grad_norm": 3.2252321243286133, "learning_rate": 1.9151217617588412e-05, "loss": 1.1272, "step": 450 }, { "epoch": 0.13504091592969064, "grad_norm": 3.23498272895813, "learning_rate": 1.9113639859836544e-05, "loss": 1.1421, "step": 460 }, { "epoch": 0.13797658801511872, "grad_norm": 3.0721843242645264, "learning_rate": 1.9075286703964554e-05, "loss": 1.14, "step": 470 }, { "epoch": 0.14091226010054678, "grad_norm": 3.130610704421997, "learning_rate": 1.9036161413099512e-05, "loss": 1.1699, "step": 480 }, { "epoch": 0.14384793218597483, "grad_norm": 3.054914712905884, "learning_rate": 1.899626731606255e-05, "loss": 1.0919, "step": 490 }, { "epoch": 0.14678360427140288, "grad_norm": 3.3167009353637695, "learning_rate": 1.895560780708565e-05, "loss": 1.0625, "step": 500 }, { "epoch": 0.14971927635683094, "grad_norm": 3.075392484664917, "learning_rate": 1.8914186345522846e-05, "loss": 1.0899, "step": 510 }, { "epoch": 0.152654948442259, "grad_norm": 3.1269266605377197, "learning_rate": 1.8872006455555906e-05, "loss": 1.1148, "step": 520 }, { "epoch": 0.15559062052768705, "grad_norm": 3.867361068725586, "learning_rate": 1.8829071725894483e-05, "loss": 1.02, "step": 530 }, { "epoch": 0.15852629261311513, "grad_norm": 3.529639720916748, "learning_rate": 1.87853858094708e-05, "loss": 1.1167, "step": 540 }, { "epoch": 0.16146196469854318, "grad_norm": 3.098249673843384, "learning_rate": 1.8740952423128842e-05, "loss": 1.0181, "step": 550 }, { "epoch": 0.16439763678397123, "grad_norm": 3.1614904403686523, "learning_rate": 1.869577534730812e-05, "loss": 1.1118, "step": 560 }, { "epoch": 0.1673333088693993, "grad_norm": 3.054616928100586, "learning_rate": 1.8649858425722033e-05, "loss": 1.0666, "step": 570 }, { "epoch": 0.17026898095482734, "grad_norm": 3.479527711868286, "learning_rate": 1.8603205565030846e-05, "loss": 1.108, "step": 580 }, { "epoch": 0.1732046530402554, "grad_norm": 2.9523024559020996, "learning_rate": 1.8555820734509297e-05, "loss": 1.0833, "step": 590 }, { "epoch": 0.17614032512568345, "grad_norm": 2.9907584190368652, "learning_rate": 1.8507707965708892e-05, "loss": 1.0283, "step": 600 }, { "epoch": 0.17907599721111153, "grad_norm": 3.2911376953125, "learning_rate": 1.8458871352114894e-05, "loss": 1.0747, "step": 610 }, { "epoch": 0.18201166929653959, "grad_norm": 3.1361849308013916, "learning_rate": 1.840931504879806e-05, "loss": 1.11, "step": 620 }, { "epoch": 0.18494734138196764, "grad_norm": 3.527332067489624, "learning_rate": 1.8359043272061086e-05, "loss": 1.0424, "step": 630 }, { "epoch": 0.1878830134673957, "grad_norm": 3.5494275093078613, "learning_rate": 1.8308060299079926e-05, "loss": 1.0818, "step": 640 }, { "epoch": 0.19081868555282375, "grad_norm": 3.4427106380462646, "learning_rate": 1.8256370467539847e-05, "loss": 1.0883, "step": 650 }, { "epoch": 0.1937543576382518, "grad_norm": 3.092515230178833, "learning_rate": 1.82039781752664e-05, "loss": 1.0285, "step": 660 }, { "epoch": 0.19669002972367985, "grad_norm": 2.667904853820801, "learning_rate": 1.815088787985124e-05, "loss": 0.9751, "step": 670 }, { "epoch": 0.19962570180910794, "grad_norm": 3.5892174243927, "learning_rate": 1.809710409827285e-05, "loss": 1.0603, "step": 680 }, { "epoch": 0.202561373894536, "grad_norm": 3.122434616088867, "learning_rate": 1.804263140651227e-05, "loss": 1.0919, "step": 690 }, { "epoch": 0.20549704597996404, "grad_norm": 2.9182698726654053, "learning_rate": 1.798747443916374e-05, "loss": 1.0553, "step": 700 }, { "epoch": 0.2084327180653921, "grad_norm": 3.260917901992798, "learning_rate": 1.793163788904038e-05, "loss": 1.0711, "step": 710 }, { "epoch": 0.21136839015082015, "grad_norm": 3.4142649173736572, "learning_rate": 1.7875126506774956e-05, "loss": 1.0423, "step": 720 }, { "epoch": 0.2143040622362482, "grad_norm": 3.0127294063568115, "learning_rate": 1.781794510041564e-05, "loss": 1.0679, "step": 730 }, { "epoch": 0.21723973432167626, "grad_norm": 3.398015022277832, "learning_rate": 1.776009853501698e-05, "loss": 1.0558, "step": 740 }, { "epoch": 0.2201754064071043, "grad_norm": 3.6017568111419678, "learning_rate": 1.770159173222595e-05, "loss": 1.0198, "step": 750 }, { "epoch": 0.2231110784925324, "grad_norm": 3.5204339027404785, "learning_rate": 1.7642429669863225e-05, "loss": 0.9951, "step": 760 }, { "epoch": 0.22604675057796045, "grad_norm": 3.3134777545928955, "learning_rate": 1.7582617381499655e-05, "loss": 0.9906, "step": 770 }, { "epoch": 0.2289824226633885, "grad_norm": 3.5017244815826416, "learning_rate": 1.7522159956028003e-05, "loss": 1.0711, "step": 780 }, { "epoch": 0.23191809474881656, "grad_norm": 3.1634137630462646, "learning_rate": 1.7461062537229987e-05, "loss": 0.9909, "step": 790 }, { "epoch": 0.2348537668342446, "grad_norm": 3.368623971939087, "learning_rate": 1.739933032333863e-05, "loss": 0.9815, "step": 800 }, { "epoch": 0.23778943891967266, "grad_norm": 3.1064817905426025, "learning_rate": 1.733696856659599e-05, "loss": 1.0191, "step": 810 }, { "epoch": 0.24072511100510072, "grad_norm": 3.2074899673461914, "learning_rate": 1.7273982572806303e-05, "loss": 1.0314, "step": 820 }, { "epoch": 0.2436607830905288, "grad_norm": 2.5882649421691895, "learning_rate": 1.721037770088455e-05, "loss": 0.958, "step": 830 }, { "epoch": 0.24659645517595685, "grad_norm": 3.730363130569458, "learning_rate": 1.7146159362400515e-05, "loss": 1.0272, "step": 840 }, { "epoch": 0.2495321272613849, "grad_norm": 2.940425395965576, "learning_rate": 1.708133302111837e-05, "loss": 1.0437, "step": 850 }, { "epoch": 0.25246779934681296, "grad_norm": 4.833681106567383, "learning_rate": 1.7015904192531814e-05, "loss": 1.0393, "step": 860 }, { "epoch": 0.255403471432241, "grad_norm": 3.417707681655884, "learning_rate": 1.694987844339479e-05, "loss": 1.0602, "step": 870 }, { "epoch": 0.25833914351766907, "grad_norm": 3.239388942718506, "learning_rate": 1.6883261391247888e-05, "loss": 0.9515, "step": 880 }, { "epoch": 0.2612748156030971, "grad_norm": 3.1867291927337646, "learning_rate": 1.6816058703940366e-05, "loss": 0.9961, "step": 890 }, { "epoch": 0.2642104876885252, "grad_norm": 3.193343162536621, "learning_rate": 1.6748276099147952e-05, "loss": 1.0066, "step": 900 }, { "epoch": 0.26714615977395323, "grad_norm": 3.1413753032684326, "learning_rate": 1.6679919343886376e-05, "loss": 0.9714, "step": 910 }, { "epoch": 0.2700818318593813, "grad_norm": 3.0826566219329834, "learning_rate": 1.661099425402067e-05, "loss": 0.9689, "step": 920 }, { "epoch": 0.2730175039448094, "grad_norm": 3.5959160327911377, "learning_rate": 1.6541506693770403e-05, "loss": 0.9867, "step": 930 }, { "epoch": 0.27595317603023745, "grad_norm": 3.8435122966766357, "learning_rate": 1.647146257521071e-05, "loss": 1.0281, "step": 940 }, { "epoch": 0.2788888481156655, "grad_norm": 3.396488904953003, "learning_rate": 1.6400867857769287e-05, "loss": 0.975, "step": 950 }, { "epoch": 0.28182452020109355, "grad_norm": 3.2766590118408203, "learning_rate": 1.6329728547719375e-05, "loss": 0.9373, "step": 960 }, { "epoch": 0.2847601922865216, "grad_norm": 3.673755645751953, "learning_rate": 1.625805069766873e-05, "loss": 0.9651, "step": 970 }, { "epoch": 0.28769586437194966, "grad_norm": 3.8751864433288574, "learning_rate": 1.6185840406044657e-05, "loss": 0.9262, "step": 980 }, { "epoch": 0.2906315364573777, "grad_norm": 3.708500623703003, "learning_rate": 1.611310381657515e-05, "loss": 0.9972, "step": 990 }, { "epoch": 0.29356720854280577, "grad_norm": 3.4258711338043213, "learning_rate": 1.60398471177662e-05, "loss": 0.9331, "step": 1000 }, { "epoch": 0.2965028806282338, "grad_norm": 3.4662258625030518, "learning_rate": 1.596607654237522e-05, "loss": 0.9592, "step": 1010 }, { "epoch": 0.2994385527136619, "grad_norm": 2.938396453857422, "learning_rate": 1.589179836688081e-05, "loss": 0.9568, "step": 1020 }, { "epoch": 0.30237422479908993, "grad_norm": 3.248762845993042, "learning_rate": 1.5817018910948712e-05, "loss": 0.9928, "step": 1030 }, { "epoch": 0.305309896884518, "grad_norm": 3.423213243484497, "learning_rate": 1.574174453689415e-05, "loss": 0.9387, "step": 1040 }, { "epoch": 0.30824556896994604, "grad_norm": 3.249216318130493, "learning_rate": 1.566598164914049e-05, "loss": 0.8925, "step": 1050 }, { "epoch": 0.3111812410553741, "grad_norm": 3.6318016052246094, "learning_rate": 1.5589736693674372e-05, "loss": 1.0153, "step": 1060 }, { "epoch": 0.31411691314080215, "grad_norm": 3.9752533435821533, "learning_rate": 1.551301615749726e-05, "loss": 0.9323, "step": 1070 }, { "epoch": 0.31705258522623025, "grad_norm": 3.46864914894104, "learning_rate": 1.5435826568073532e-05, "loss": 0.8901, "step": 1080 }, { "epoch": 0.3199882573116583, "grad_norm": 4.399304389953613, "learning_rate": 1.535817449277511e-05, "loss": 0.9118, "step": 1090 }, { "epoch": 0.32292392939708636, "grad_norm": 3.2890026569366455, "learning_rate": 1.5280066538322703e-05, "loss": 0.8655, "step": 1100 }, { "epoch": 0.3258596014825144, "grad_norm": 3.491983652114868, "learning_rate": 1.5201509350223708e-05, "loss": 0.9217, "step": 1110 }, { "epoch": 0.32879527356794247, "grad_norm": 5.0014824867248535, "learning_rate": 1.5122509612206785e-05, "loss": 0.9362, "step": 1120 }, { "epoch": 0.3317309456533705, "grad_norm": 4.092339038848877, "learning_rate": 1.5043074045653215e-05, "loss": 0.9262, "step": 1130 }, { "epoch": 0.3346666177387986, "grad_norm": 3.286433219909668, "learning_rate": 1.496320940902503e-05, "loss": 0.8891, "step": 1140 }, { "epoch": 0.33760228982422663, "grad_norm": 3.6521873474121094, "learning_rate": 1.4882922497290007e-05, "loss": 0.9281, "step": 1150 }, { "epoch": 0.3405379619096547, "grad_norm": 3.8015809059143066, "learning_rate": 1.4802220141343516e-05, "loss": 0.8949, "step": 1160 }, { "epoch": 0.34347363399508274, "grad_norm": 4.149661064147949, "learning_rate": 1.472110920742738e-05, "loss": 0.8889, "step": 1170 }, { "epoch": 0.3464093060805108, "grad_norm": 3.5252785682678223, "learning_rate": 1.4639596596545656e-05, "loss": 0.8397, "step": 1180 }, { "epoch": 0.34934497816593885, "grad_norm": 3.6884541511535645, "learning_rate": 1.4557689243877507e-05, "loss": 0.9142, "step": 1190 }, { "epoch": 0.3522806502513669, "grad_norm": 3.9577550888061523, "learning_rate": 1.4475394118187146e-05, "loss": 0.9809, "step": 1200 }, { "epoch": 0.35521632233679495, "grad_norm": 3.6897339820861816, "learning_rate": 1.4392718221230917e-05, "loss": 0.9141, "step": 1210 }, { "epoch": 0.35815199442222306, "grad_norm": 3.061516046524048, "learning_rate": 1.4309668587161596e-05, "loss": 0.8669, "step": 1220 }, { "epoch": 0.3610876665076511, "grad_norm": 3.1191623210906982, "learning_rate": 1.4226252281929902e-05, "loss": 0.8384, "step": 1230 }, { "epoch": 0.36402333859307917, "grad_norm": 4.198310852050781, "learning_rate": 1.4142476402683327e-05, "loss": 0.8971, "step": 1240 }, { "epoch": 0.3669590106785072, "grad_norm": 3.8184337615966797, "learning_rate": 1.4058348077162301e-05, "loss": 0.8783, "step": 1250 }, { "epoch": 0.3698946827639353, "grad_norm": 3.842637777328491, "learning_rate": 1.3973874463093747e-05, "loss": 0.9623, "step": 1260 }, { "epoch": 0.37283035484936333, "grad_norm": 3.5173559188842773, "learning_rate": 1.3889062747582118e-05, "loss": 0.8092, "step": 1270 }, { "epoch": 0.3757660269347914, "grad_norm": 3.8953890800476074, "learning_rate": 1.3803920146497887e-05, "loss": 0.8762, "step": 1280 }, { "epoch": 0.37870169902021944, "grad_norm": 3.0550928115844727, "learning_rate": 1.3718453903863616e-05, "loss": 0.8321, "step": 1290 }, { "epoch": 0.3816373711056475, "grad_norm": 3.9677445888519287, "learning_rate": 1.3632671291237645e-05, "loss": 0.8566, "step": 1300 }, { "epoch": 0.38457304319107555, "grad_norm": 3.887268304824829, "learning_rate": 1.35465796070954e-05, "loss": 0.8944, "step": 1310 }, { "epoch": 0.3875087152765036, "grad_norm": 3.1006393432617188, "learning_rate": 1.3460186176208439e-05, "loss": 0.7583, "step": 1320 }, { "epoch": 0.39044438736193166, "grad_norm": 3.7594895362854004, "learning_rate": 1.337349834902125e-05, "loss": 0.814, "step": 1330 }, { "epoch": 0.3933800594473597, "grad_norm": 4.34951114654541, "learning_rate": 1.328652350102588e-05, "loss": 0.8006, "step": 1340 }, { "epoch": 0.39631573153278776, "grad_norm": 2.9645636081695557, "learning_rate": 1.3199269032134395e-05, "loss": 0.8129, "step": 1350 }, { "epoch": 0.39925140361821587, "grad_norm": 3.858602285385132, "learning_rate": 1.3111742366049317e-05, "loss": 0.8366, "step": 1360 }, { "epoch": 0.4021870757036439, "grad_norm": 3.2778103351593018, "learning_rate": 1.3023950949631979e-05, "loss": 0.8551, "step": 1370 }, { "epoch": 0.405122747789072, "grad_norm": 3.2402875423431396, "learning_rate": 1.2935902252268965e-05, "loss": 0.8398, "step": 1380 }, { "epoch": 0.40805841987450003, "grad_norm": 4.325957775115967, "learning_rate": 1.2847603765236589e-05, "loss": 0.836, "step": 1390 }, { "epoch": 0.4109940919599281, "grad_norm": 3.5310022830963135, "learning_rate": 1.2759063001063531e-05, "loss": 0.8369, "step": 1400 }, { "epoch": 0.41392976404535614, "grad_norm": 3.5352087020874023, "learning_rate": 1.2670287492891675e-05, "loss": 0.8988, "step": 1410 }, { "epoch": 0.4168654361307842, "grad_norm": 3.190788745880127, "learning_rate": 1.258128479383516e-05, "loss": 0.8352, "step": 1420 }, { "epoch": 0.41980110821621225, "grad_norm": 3.459728240966797, "learning_rate": 1.249206247633778e-05, "loss": 0.8295, "step": 1430 }, { "epoch": 0.4227367803016403, "grad_norm": 3.5794529914855957, "learning_rate": 1.2402628131528686e-05, "loss": 0.8103, "step": 1440 }, { "epoch": 0.42567245238706836, "grad_norm": 4.169612407684326, "learning_rate": 1.2312989368576547e-05, "loss": 0.7757, "step": 1450 }, { "epoch": 0.4286081244724964, "grad_norm": 3.301011562347412, "learning_rate": 1.2223153814042137e-05, "loss": 0.7871, "step": 1460 }, { "epoch": 0.43154379655792446, "grad_norm": 4.524185657501221, "learning_rate": 1.2133129111229466e-05, "loss": 0.851, "step": 1470 }, { "epoch": 0.4344794686433525, "grad_norm": 3.72041392326355, "learning_rate": 1.2042922919535484e-05, "loss": 0.803, "step": 1480 }, { "epoch": 0.43741514072878057, "grad_norm": 3.926424503326416, "learning_rate": 1.1952542913798406e-05, "loss": 0.761, "step": 1490 }, { "epoch": 0.4403508128142086, "grad_norm": 3.5725414752960205, "learning_rate": 1.1861996783644727e-05, "loss": 0.8086, "step": 1500 }, { "epoch": 0.44328648489963673, "grad_norm": 4.109748363494873, "learning_rate": 1.1771292232834983e-05, "loss": 0.8483, "step": 1510 }, { "epoch": 0.4462221569850648, "grad_norm": 3.673794984817505, "learning_rate": 1.1680436978608314e-05, "loss": 0.738, "step": 1520 }, { "epoch": 0.44915782907049284, "grad_norm": 3.831571102142334, "learning_rate": 1.1589438751025852e-05, "loss": 0.7462, "step": 1530 }, { "epoch": 0.4520935011559209, "grad_norm": 4.181507587432861, "learning_rate": 1.149830529231307e-05, "loss": 0.7707, "step": 1540 }, { "epoch": 0.45502917324134895, "grad_norm": 3.3295936584472656, "learning_rate": 1.140704435620104e-05, "loss": 0.7832, "step": 1550 }, { "epoch": 0.457964845326777, "grad_norm": 4.025683403015137, "learning_rate": 1.1315663707266742e-05, "loss": 0.74, "step": 1560 }, { "epoch": 0.46090051741220506, "grad_norm": 3.792701244354248, "learning_rate": 1.1224171120272455e-05, "loss": 0.6698, "step": 1570 }, { "epoch": 0.4638361894976331, "grad_norm": 3.7220959663391113, "learning_rate": 1.1132574379504269e-05, "loss": 0.7604, "step": 1580 }, { "epoch": 0.46677186158306117, "grad_norm": 4.423033714294434, "learning_rate": 1.1040881278109784e-05, "loss": 0.7466, "step": 1590 }, { "epoch": 0.4697075336684892, "grad_norm": 3.633347272872925, "learning_rate": 1.0949099617435062e-05, "loss": 0.7452, "step": 1600 }, { "epoch": 0.4726432057539173, "grad_norm": 3.661238431930542, "learning_rate": 1.0857237206360885e-05, "loss": 0.7637, "step": 1610 }, { "epoch": 0.4755788778393453, "grad_norm": 4.33590030670166, "learning_rate": 1.0765301860638364e-05, "loss": 0.7364, "step": 1620 }, { "epoch": 0.4785145499247734, "grad_norm": 3.7030036449432373, "learning_rate": 1.0673301402223964e-05, "loss": 0.7356, "step": 1630 }, { "epoch": 0.48145022201020143, "grad_norm": 4.784999847412109, "learning_rate": 1.0581243658614013e-05, "loss": 0.765, "step": 1640 }, { "epoch": 0.48438589409562954, "grad_norm": 3.2158679962158203, "learning_rate": 1.0489136462178718e-05, "loss": 0.75, "step": 1650 }, { "epoch": 0.4873215661810576, "grad_norm": 4.584315299987793, "learning_rate": 1.039698764949579e-05, "loss": 0.7347, "step": 1660 }, { "epoch": 0.49025723826648565, "grad_norm": 3.4453585147857666, "learning_rate": 1.0304805060683692e-05, "loss": 0.7887, "step": 1670 }, { "epoch": 0.4931929103519137, "grad_norm": 3.9263744354248047, "learning_rate": 1.021259653873459e-05, "loss": 0.7492, "step": 1680 }, { "epoch": 0.49612858243734176, "grad_norm": 4.6535539627075195, "learning_rate": 1.012036992884708e-05, "loss": 0.7676, "step": 1690 }, { "epoch": 0.4990642545227698, "grad_norm": 4.22018575668335, "learning_rate": 1.0028133077758688e-05, "loss": 0.7088, "step": 1700 }, { "epoch": 0.5019999266081978, "grad_norm": 4.408539295196533, "learning_rate": 9.935893833078284e-06, "loss": 0.7646, "step": 1710 }, { "epoch": 0.5049355986936259, "grad_norm": 5.264422416687012, "learning_rate": 9.843660042618372e-06, "loss": 0.8147, "step": 1720 }, { "epoch": 0.507871270779054, "grad_norm": 4.2693047523498535, "learning_rate": 9.75143955372742e-06, "loss": 0.7104, "step": 1730 }, { "epoch": 0.510806942864482, "grad_norm": 4.856871128082275, "learning_rate": 9.659240212622175e-06, "loss": 0.7367, "step": 1740 }, { "epoch": 0.5137426149499101, "grad_norm": 2.8976457118988037, "learning_rate": 9.567069863720113e-06, "loss": 0.7564, "step": 1750 }, { "epoch": 0.5166782870353381, "grad_norm": 5.992892742156982, "learning_rate": 9.474936348972021e-06, "loss": 0.7735, "step": 1760 }, { "epoch": 0.5196139591207662, "grad_norm": 3.6526339054107666, "learning_rate": 9.382847507194797e-06, "loss": 0.7035, "step": 1770 }, { "epoch": 0.5225496312061942, "grad_norm": 4.040701389312744, "learning_rate": 9.290811173404513e-06, "loss": 0.6347, "step": 1780 }, { "epoch": 0.5254853032916224, "grad_norm": 3.848483085632324, "learning_rate": 9.198835178149807e-06, "loss": 0.6359, "step": 1790 }, { "epoch": 0.5284209753770504, "grad_norm": 3.2821764945983887, "learning_rate": 9.106927346845663e-06, "loss": 0.7137, "step": 1800 }, { "epoch": 0.5313566474624785, "grad_norm": 4.672881603240967, "learning_rate": 9.015095499107578e-06, "loss": 0.7085, "step": 1810 }, { "epoch": 0.5342923195479065, "grad_norm": 3.976231098175049, "learning_rate": 8.923347448086311e-06, "loss": 0.6501, "step": 1820 }, { "epoch": 0.5372279916333346, "grad_norm": 4.726049423217773, "learning_rate": 8.831690999803101e-06, "loss": 0.8129, "step": 1830 }, { "epoch": 0.5401636637187626, "grad_norm": 6.278385162353516, "learning_rate": 8.740133952485515e-06, "loss": 0.6732, "step": 1840 }, { "epoch": 0.5430993358041907, "grad_norm": 4.620763301849365, "learning_rate": 8.648684095904001e-06, "loss": 0.6872, "step": 1850 }, { "epoch": 0.5460350078896188, "grad_norm": 4.494777679443359, "learning_rate": 8.557349210709098e-06, "loss": 0.6686, "step": 1860 }, { "epoch": 0.5489706799750468, "grad_norm": 4.2295637130737305, "learning_rate": 8.46613706776945e-06, "loss": 0.6853, "step": 1870 }, { "epoch": 0.5519063520604749, "grad_norm": 3.5783040523529053, "learning_rate": 8.375055427510673e-06, "loss": 0.6923, "step": 1880 }, { "epoch": 0.5548420241459029, "grad_norm": 3.5585546493530273, "learning_rate": 8.284112039255071e-06, "loss": 0.6744, "step": 1890 }, { "epoch": 0.557777696231331, "grad_norm": 3.939253330230713, "learning_rate": 8.193314640562315e-06, "loss": 0.627, "step": 1900 }, { "epoch": 0.560713368316759, "grad_norm": 3.630519390106201, "learning_rate": 8.102670956571139e-06, "loss": 0.6627, "step": 1910 }, { "epoch": 0.5636490404021871, "grad_norm": 11.943046569824219, "learning_rate": 8.012188699342072e-06, "loss": 0.6476, "step": 1920 }, { "epoch": 0.5665847124876151, "grad_norm": 5.358550071716309, "learning_rate": 7.92187556720126e-06, "loss": 0.6968, "step": 1930 }, { "epoch": 0.5695203845730432, "grad_norm": 3.8031585216522217, "learning_rate": 7.831739244085534e-06, "loss": 0.6811, "step": 1940 }, { "epoch": 0.5724560566584712, "grad_norm": 3.1659951210021973, "learning_rate": 7.741787398888617e-06, "loss": 0.6501, "step": 1950 }, { "epoch": 0.5753917287438993, "grad_norm": 3.7877001762390137, "learning_rate": 7.652027684808644e-06, "loss": 0.6496, "step": 1960 }, { "epoch": 0.5783274008293273, "grad_norm": 4.701345920562744, "learning_rate": 7.56246773869705e-06, "loss": 0.659, "step": 1970 }, { "epoch": 0.5812630729147554, "grad_norm": 4.617175579071045, "learning_rate": 7.47311518040879e-06, "loss": 0.6429, "step": 1980 }, { "epoch": 0.5841987450001834, "grad_norm": 5.269269943237305, "learning_rate": 7.3839776121540385e-06, "loss": 0.6845, "step": 1990 }, { "epoch": 0.5871344170856115, "grad_norm": 3.911558151245117, "learning_rate": 7.2950626178514e-06, "loss": 0.6536, "step": 2000 }, { "epoch": 0.5900700891710396, "grad_norm": NaN, "learning_rate": 7.215235676567183e-06, "loss": 0.6691, "step": 2010 }, { "epoch": 0.5930057612564676, "grad_norm": 5.29760217666626, "learning_rate": 7.126764398128368e-06, "loss": 0.6483, "step": 2020 }, { "epoch": 0.5959414333418958, "grad_norm": 3.4294636249542236, "learning_rate": 7.038537577614009e-06, "loss": 0.5965, "step": 2030 }, { "epoch": 0.5988771054273238, "grad_norm": 3.6569931507110596, "learning_rate": 6.950562721455325e-06, "loss": 0.5782, "step": 2040 }, { "epoch": 0.6018127775127519, "grad_norm": 3.845431089401245, "learning_rate": 6.86284731464614e-06, "loss": 0.6419, "step": 2050 }, { "epoch": 0.6047484495981799, "grad_norm": 3.8947107791900635, "learning_rate": 6.775398820106065e-06, "loss": 0.5942, "step": 2060 }, { "epoch": 0.607684121683608, "grad_norm": 5.501591682434082, "learning_rate": 6.688224678045507e-06, "loss": 0.5874, "step": 2070 }, { "epoch": 0.610619793769036, "grad_norm": 4.684408187866211, "learning_rate": 6.6013323053327065e-06, "loss": 0.6178, "step": 2080 }, { "epoch": 0.6135554658544641, "grad_norm": 4.132544040679932, "learning_rate": 6.5147290948626365e-06, "loss": 0.5972, "step": 2090 }, { "epoch": 0.6164911379398921, "grad_norm": 3.2844135761260986, "learning_rate": 6.428422414928066e-06, "loss": 0.5808, "step": 2100 }, { "epoch": 0.6194268100253202, "grad_norm": 4.8152289390563965, "learning_rate": 6.342419608592626e-06, "loss": 0.6407, "step": 2110 }, { "epoch": 0.6223624821107482, "grad_norm": 4.975841999053955, "learning_rate": 6.25672799306605e-06, "loss": 0.5792, "step": 2120 }, { "epoch": 0.6252981541961763, "grad_norm": 3.772268772125244, "learning_rate": 6.171354859081639e-06, "loss": 0.7062, "step": 2130 }, { "epoch": 0.6282338262816043, "grad_norm": 3.6091275215148926, "learning_rate": 6.086307470275947e-06, "loss": 0.6015, "step": 2140 }, { "epoch": 0.6311694983670324, "grad_norm": 3.9650683403015137, "learning_rate": 6.001593062570776e-06, "loss": 0.699, "step": 2150 }, { "epoch": 0.6341051704524605, "grad_norm": 3.4142041206359863, "learning_rate": 5.917218843557551e-06, "loss": 0.5912, "step": 2160 }, { "epoch": 0.6370408425378885, "grad_norm": 3.6262595653533936, "learning_rate": 5.8415788415375744e-06, "loss": 0.6029, "step": 2170 }, { "epoch": 0.6399765146233166, "grad_norm": 3.612025737762451, "learning_rate": 5.757870733799642e-06, "loss": 0.6054, "step": 2180 }, { "epoch": 0.6429121867087446, "grad_norm": 3.7721731662750244, "learning_rate": 5.6745235509072135e-06, "loss": 0.5703, "step": 2190 }, { "epoch": 0.6458478587941727, "grad_norm": 4.44386100769043, "learning_rate": 5.591544384126769e-06, "loss": 0.6101, "step": 2200 }, { "epoch": 0.6487835308796007, "grad_norm": 3.6553893089294434, "learning_rate": 5.508940293413603e-06, "loss": 0.6131, "step": 2210 }, { "epoch": 0.6517192029650288, "grad_norm": 4.550465106964111, "learning_rate": 5.426718306811134e-06, "loss": 0.5761, "step": 2220 }, { "epoch": 0.6546548750504568, "grad_norm": 3.433598279953003, "learning_rate": 5.344885419852961e-06, "loss": 0.6456, "step": 2230 }, { "epoch": 0.6575905471358849, "grad_norm": 5.087676048278809, "learning_rate": 5.263448594967673e-06, "loss": 0.657, "step": 2240 }, { "epoch": 0.6605262192213129, "grad_norm": 4.578396797180176, "learning_rate": 5.182414760886484e-06, "loss": 0.6083, "step": 2250 }, { "epoch": 0.663461891306741, "grad_norm": 6.043960094451904, "learning_rate": 5.1017908120537105e-06, "loss": 0.5721, "step": 2260 }, { "epoch": 0.666397563392169, "grad_norm": 3.624394178390503, "learning_rate": 5.021583608040208e-06, "loss": 0.5952, "step": 2270 }, { "epoch": 0.6693332354775972, "grad_norm": 2.965820074081421, "learning_rate": 4.941799972959752e-06, "loss": 0.5074, "step": 2280 }, { "epoch": 0.6722689075630253, "grad_norm": 5.590756416320801, "learning_rate": 4.862446694888403e-06, "loss": 0.5274, "step": 2290 }, { "epoch": 0.6752045796484533, "grad_norm": 4.188043594360352, "learning_rate": 4.783530525287006e-06, "loss": 0.5694, "step": 2300 }, { "epoch": 0.6781402517338814, "grad_norm": 3.925184488296509, "learning_rate": 4.705058178426753e-06, "loss": 0.55, "step": 2310 }, { "epoch": 0.6810759238193094, "grad_norm": 4.226954936981201, "learning_rate": 4.627036330817926e-06, "loss": 0.5432, "step": 2320 }, { "epoch": 0.6840115959047375, "grad_norm": 3.9109609127044678, "learning_rate": 4.5494716206418555e-06, "loss": 0.5332, "step": 2330 }, { "epoch": 0.6869472679901655, "grad_norm": 4.698592662811279, "learning_rate": 4.4723706471861385e-06, "loss": 0.5744, "step": 2340 }, { "epoch": 0.6898829400755936, "grad_norm": 4.461889266967773, "learning_rate": 4.3957399702831505e-06, "loss": 0.5314, "step": 2350 }, { "epoch": 0.6928186121610216, "grad_norm": 4.412221908569336, "learning_rate": 4.31958610975195e-06, "loss": 0.554, "step": 2360 }, { "epoch": 0.6957542842464497, "grad_norm": 3.776421308517456, "learning_rate": 4.243915544843549e-06, "loss": 0.4857, "step": 2370 }, { "epoch": 0.6986899563318777, "grad_norm": 4.851159572601318, "learning_rate": 4.168734713689658e-06, "loss": 0.5484, "step": 2380 }, { "epoch": 0.7016256284173058, "grad_norm": 3.8917558193206787, "learning_rate": 4.094050012754925e-06, "loss": 0.4888, "step": 2390 }, { "epoch": 0.7045613005027338, "grad_norm": 4.396358966827393, "learning_rate": 4.019867796292709e-06, "loss": 0.5125, "step": 2400 }, { "epoch": 0.7074969725881619, "grad_norm": 4.374291896820068, "learning_rate": 3.946194375804452e-06, "loss": 0.5262, "step": 2410 }, { "epoch": 0.7104326446735899, "grad_norm": 4.330350875854492, "learning_rate": 3.873036019502716e-06, "loss": 0.5581, "step": 2420 }, { "epoch": 0.713368316759018, "grad_norm": 4.86287784576416, "learning_rate": 3.800398951777845e-06, "loss": 0.5687, "step": 2430 }, { "epoch": 0.7163039888444461, "grad_norm": 5.004453659057617, "learning_rate": 3.7282893526683914e-06, "loss": 0.5136, "step": 2440 }, { "epoch": 0.7192396609298741, "grad_norm": 3.035261631011963, "learning_rate": 3.656713357335334e-06, "loss": 0.5358, "step": 2450 }, { "epoch": 0.7221753330153022, "grad_norm": 3.3477425575256348, "learning_rate": 3.585677055540072e-06, "loss": 0.5214, "step": 2460 }, { "epoch": 0.7251110051007302, "grad_norm": 3.495814323425293, "learning_rate": 3.5151864911263066e-06, "loss": 0.5048, "step": 2470 }, { "epoch": 0.7280466771861583, "grad_norm": 3.35532283782959, "learning_rate": 3.4452476615058316e-06, "loss": 0.509, "step": 2480 }, { "epoch": 0.7309823492715863, "grad_norm": 3.0357613563537598, "learning_rate": 3.3758665171482474e-06, "loss": 0.5361, "step": 2490 }, { "epoch": 0.7339180213570144, "grad_norm": 4.02761173248291, "learning_rate": 3.3070489610747146e-06, "loss": 0.5033, "step": 2500 }, { "epoch": 0.7368536934424424, "grad_norm": 4.085331916809082, "learning_rate": 3.238800848355702e-06, "loss": 0.526, "step": 2510 }, { "epoch": 0.7397893655278706, "grad_norm": 4.161253929138184, "learning_rate": 3.1711279856128387e-06, "loss": 0.5014, "step": 2520 }, { "epoch": 0.7427250376132986, "grad_norm": 3.7220897674560547, "learning_rate": 3.10403613052487e-06, "loss": 0.4514, "step": 2530 }, { "epoch": 0.7456607096987267, "grad_norm": 4.337230682373047, "learning_rate": 3.037530991337807e-06, "loss": 0.5645, "step": 2540 }, { "epoch": 0.7485963817841547, "grad_norm": 4.30481481552124, "learning_rate": 2.9716182263792314e-06, "loss": 0.5026, "step": 2550 }, { "epoch": 0.7515320538695828, "grad_norm": 3.3447349071502686, "learning_rate": 2.9063034435769242e-06, "loss": 0.5318, "step": 2560 }, { "epoch": 0.7544677259550108, "grad_norm": 3.936032295227051, "learning_rate": 2.8415921999816966e-06, "loss": 0.5106, "step": 2570 }, { "epoch": 0.7574033980404389, "grad_norm": 3.9542150497436523, "learning_rate": 2.7774900012946037e-06, "loss": 0.5299, "step": 2580 }, { "epoch": 0.760339070125867, "grad_norm": 4.351448059082031, "learning_rate": 2.714002301398524e-06, "loss": 0.5211, "step": 2590 }, { "epoch": 0.763274742211295, "grad_norm": 4.167295932769775, "learning_rate": 2.6511345018941225e-06, "loss": 0.5071, "step": 2600 }, { "epoch": 0.7662104142967231, "grad_norm": 5.125722408294678, "learning_rate": 2.588891951640288e-06, "loss": 0.5199, "step": 2610 }, { "epoch": 0.7691460863821511, "grad_norm": 4.25960111618042, "learning_rate": 2.527279946299037e-06, "loss": 0.4537, "step": 2620 }, { "epoch": 0.7720817584675792, "grad_norm": 3.875459909439087, "learning_rate": 2.4663037278849665e-06, "loss": 0.4993, "step": 2630 }, { "epoch": 0.7750174305530072, "grad_norm": 4.285188674926758, "learning_rate": 2.405968484319231e-06, "loss": 0.4596, "step": 2640 }, { "epoch": 0.7779531026384353, "grad_norm": 3.827913284301758, "learning_rate": 2.3462793489881884e-06, "loss": 0.5141, "step": 2650 }, { "epoch": 0.7808887747238633, "grad_norm": 4.174901485443115, "learning_rate": 2.2872414003066146e-06, "loss": 0.4483, "step": 2660 }, { "epoch": 0.7838244468092914, "grad_norm": 3.4712812900543213, "learning_rate": 2.2288596612856306e-06, "loss": 0.4834, "step": 2670 }, { "epoch": 0.7867601188947194, "grad_norm": 4.7577972412109375, "learning_rate": 2.1711390991053547e-06, "loss": 0.4913, "step": 2680 }, { "epoch": 0.7896957909801475, "grad_norm": 3.8983209133148193, "learning_rate": 2.1140846246922774e-06, "loss": 0.4748, "step": 2690 }, { "epoch": 0.7926314630655755, "grad_norm": 3.3365228176116943, "learning_rate": 2.0577010923014353e-06, "loss": 0.5014, "step": 2700 }, { "epoch": 0.7955671351510036, "grad_norm": 3.7394635677337646, "learning_rate": 2.001993299103411e-06, "loss": 0.4524, "step": 2710 }, { "epoch": 0.7985028072364317, "grad_norm": 3.325190544128418, "learning_rate": 1.946965984776181e-06, "loss": 0.486, "step": 2720 }, { "epoch": 0.8014384793218597, "grad_norm": 3.337636947631836, "learning_rate": 1.8926238311018551e-06, "loss": 0.4112, "step": 2730 }, { "epoch": 0.8043741514072879, "grad_norm": 4.279343128204346, "learning_rate": 1.8443055276768218e-06, "loss": 0.4758, "step": 2740 }, { "epoch": 0.8073098234927159, "grad_norm": 3.3319621086120605, "learning_rate": 1.7912778684550137e-06, "loss": 0.4209, "step": 2750 }, { "epoch": 0.810245495578144, "grad_norm": 4.332451343536377, "learning_rate": 1.7389486159957436e-06, "loss": 0.4913, "step": 2760 }, { "epoch": 0.813181167663572, "grad_norm": 5.22000789642334, "learning_rate": 1.6873222225271656e-06, "loss": 0.4523, "step": 2770 }, { "epoch": 0.8161168397490001, "grad_norm": 4.080671787261963, "learning_rate": 1.63640308047745e-06, "loss": 0.4545, "step": 2780 }, { "epoch": 0.8190525118344281, "grad_norm": 5.591613292694092, "learning_rate": 1.5861955221010671e-06, "loss": 0.5272, "step": 2790 }, { "epoch": 0.8219881839198562, "grad_norm": 3.4515106678009033, "learning_rate": 1.536703819110198e-06, "loss": 0.4166, "step": 2800 }, { "epoch": 0.8249238560052842, "grad_norm": 3.1613569259643555, "learning_rate": 1.4879321823112802e-06, "loss": 0.5194, "step": 2810 }, { "epoch": 0.8278595280907123, "grad_norm": 4.931222915649414, "learning_rate": 1.4398847612467736e-06, "loss": 0.4626, "step": 2820 }, { "epoch": 0.8307952001761403, "grad_norm": 3.088315963745117, "learning_rate": 1.3925656438420876e-06, "loss": 0.4246, "step": 2830 }, { "epoch": 0.8337308722615684, "grad_norm": 3.7036452293395996, "learning_rate": 1.3459788560577847e-06, "loss": 0.431, "step": 2840 }, { "epoch": 0.8366665443469964, "grad_norm": 4.452617168426514, "learning_rate": 1.3001283615470517e-06, "loss": 0.4478, "step": 2850 }, { "epoch": 0.8396022164324245, "grad_norm": 3.2161977291107178, "learning_rate": 1.255018061318467e-06, "loss": 0.4432, "step": 2860 }, { "epoch": 0.8425378885178526, "grad_norm": 4.302596092224121, "learning_rate": 1.2106517934040917e-06, "loss": 0.4598, "step": 2870 }, { "epoch": 0.8454735606032806, "grad_norm": 4.297342300415039, "learning_rate": 1.1670333325329353e-06, "loss": 0.4908, "step": 2880 }, { "epoch": 0.8484092326887087, "grad_norm": 3.9199209213256836, "learning_rate": 1.1241663898097865e-06, "loss": 0.4239, "step": 2890 }, { "epoch": 0.8513449047741367, "grad_norm": 4.693470001220703, "learning_rate": 1.08205461239948e-06, "loss": 0.4636, "step": 2900 }, { "epoch": 0.8542805768595648, "grad_norm": 4.2040510177612305, "learning_rate": 1.04070158321659e-06, "loss": 0.4595, "step": 2910 }, { "epoch": 0.8572162489449928, "grad_norm": 4.8676252365112305, "learning_rate": 1.00011082062058e-06, "loss": 0.4699, "step": 2920 }, { "epoch": 0.8601519210304209, "grad_norm": 3.176576852798462, "learning_rate": 9.602857781164721e-07, "loss": 0.4599, "step": 2930 }, { "epoch": 0.8630875931158489, "grad_norm": 4.111423969268799, "learning_rate": 9.212298440610101e-07, "loss": 0.4601, "step": 2940 }, { "epoch": 0.866023265201277, "grad_norm": 3.106792449951172, "learning_rate": 8.829463413743811e-07, "loss": 0.453, "step": 2950 }, { "epoch": 0.868958937286705, "grad_norm": 4.940300941467285, "learning_rate": 8.454385272574906e-07, "loss": 0.4298, "step": 2960 }, { "epoch": 0.8718946093721331, "grad_norm": 4.7473249435424805, "learning_rate": 8.087095929148436e-07, "loss": 0.457, "step": 2970 }, { "epoch": 0.8748302814575611, "grad_norm": 4.263439655303955, "learning_rate": 7.727626632830221e-07, "loss": 0.4194, "step": 2980 }, { "epoch": 0.8777659535429893, "grad_norm": 4.69775390625, "learning_rate": 7.376007967648302e-07, "loss": 0.4457, "step": 2990 }, { "epoch": 0.8807016256284173, "grad_norm": 4.177097320556641, "learning_rate": 7.032269849690654e-07, "loss": 0.4532, "step": 3000 }, { "epoch": 0.8836372977138454, "grad_norm": 4.312076568603516, "learning_rate": 6.696441524559983e-07, "loss": 0.4772, "step": 3010 }, { "epoch": 0.8865729697992735, "grad_norm": 4.328220367431641, "learning_rate": 6.368551564885439e-07, "loss": 0.4239, "step": 3020 }, { "epoch": 0.8895086418847015, "grad_norm": 3.3847310543060303, "learning_rate": 6.048627867891665e-07, "loss": 0.4564, "step": 3030 }, { "epoch": 0.8924443139701296, "grad_norm": 3.408613681793213, "learning_rate": 5.736697653025192e-07, "loss": 0.4206, "step": 3040 }, { "epoch": 0.8953799860555576, "grad_norm": 4.045165061950684, "learning_rate": 5.432787459638722e-07, "loss": 0.4751, "step": 3050 }, { "epoch": 0.8983156581409857, "grad_norm": 3.965830087661743, "learning_rate": 5.136923144732997e-07, "loss": 0.4273, "step": 3060 }, { "epoch": 0.9012513302264137, "grad_norm": 3.462986707687378, "learning_rate": 4.849129880756886e-07, "loss": 0.472, "step": 3070 }, { "epoch": 0.9041870023118418, "grad_norm": 5.114270210266113, "learning_rate": 4.569432153465736e-07, "loss": 0.5233, "step": 3080 }, { "epoch": 0.9071226743972698, "grad_norm": 4.655681610107422, "learning_rate": 4.297853759838055e-07, "loss": 0.4543, "step": 3090 }, { "epoch": 0.9100583464826979, "grad_norm": 4.586308002471924, "learning_rate": 4.034417806050872e-07, "loss": 0.4383, "step": 3100 }, { "epoch": 0.9129940185681259, "grad_norm": 3.405298948287964, "learning_rate": 3.779146705513814e-07, "loss": 0.4644, "step": 3110 }, { "epoch": 0.915929690653554, "grad_norm": 3.8995399475097656, "learning_rate": 3.532062176962159e-07, "loss": 0.4591, "step": 3120 }, { "epoch": 0.918865362738982, "grad_norm": 3.6494014263153076, "learning_rate": 3.293185242608954e-07, "loss": 0.4354, "step": 3130 }, { "epoch": 0.9218010348244101, "grad_norm": 4.192446708679199, "learning_rate": 3.062536226356472e-07, "loss": 0.4466, "step": 3140 }, { "epoch": 0.9247367069098382, "grad_norm": 3.4682350158691406, "learning_rate": 2.8401347520670253e-07, "loss": 0.4629, "step": 3150 }, { "epoch": 0.9276723789952662, "grad_norm": 3.987903594970703, "learning_rate": 2.625999741893304e-07, "loss": 0.5156, "step": 3160 }, { "epoch": 0.9306080510806943, "grad_norm": 4.3802103996276855, "learning_rate": 2.420149414668493e-07, "loss": 0.42, "step": 3170 }, { "epoch": 0.9335437231661223, "grad_norm": 4.355963230133057, "learning_rate": 2.222601284356185e-07, "loss": 0.4408, "step": 3180 }, { "epoch": 0.9364793952515504, "grad_norm": 5.095834255218506, "learning_rate": 2.0333721585602984e-07, "loss": 0.4558, "step": 3190 }, { "epoch": 0.9394150673369784, "grad_norm": 3.3932993412017822, "learning_rate": 1.8524781370950262e-07, "loss": 0.4475, "step": 3200 }, { "epoch": 0.9423507394224065, "grad_norm": 5.2142014503479, "learning_rate": 1.679934610615064e-07, "loss": 0.4351, "step": 3210 }, { "epoch": 0.9452864115078345, "grad_norm": 4.271505832672119, "learning_rate": 1.515756259306178e-07, "loss": 0.4431, "step": 3220 }, { "epoch": 0.9482220835932627, "grad_norm": 4.888089656829834, "learning_rate": 1.3599570516361737e-07, "loss": 0.4256, "step": 3230 }, { "epoch": 0.9511577556786907, "grad_norm": 4.216527938842773, "learning_rate": 1.212550243166455e-07, "loss": 0.4811, "step": 3240 }, { "epoch": 0.9540934277641188, "grad_norm": 4.079187393188477, "learning_rate": 1.0735483754242049e-07, "loss": 0.4435, "step": 3250 }, { "epoch": 0.9570290998495468, "grad_norm": 4.337707042694092, "learning_rate": 9.429632748354068e-08, "loss": 0.4152, "step": 3260 }, { "epoch": 0.9599647719349749, "grad_norm": 3.4174439907073975, "learning_rate": 8.208060517185146e-08, "loss": 0.4579, "step": 3270 }, { "epoch": 0.9629004440204029, "grad_norm": 4.021118640899658, "learning_rate": 7.070870993393209e-08, "loss": 0.4531, "step": 3280 }, { "epoch": 0.965836116105831, "grad_norm": 3.2883527278900146, "learning_rate": 6.01816093026586e-08, "loss": 0.4384, "step": 3290 }, { "epoch": 0.9687717881912591, "grad_norm": 3.5450029373168945, "learning_rate": 5.0500198934889665e-08, "loss": 0.4028, "step": 3300 }, { "epoch": 0.9717074602766871, "grad_norm": 3.9169414043426514, "learning_rate": 4.16653025352598e-08, "loss": 0.4894, "step": 3310 }, { "epoch": 0.9746431323621152, "grad_norm": 4.527153015136719, "learning_rate": 3.367767178609982e-08, "loss": 0.4403, "step": 3320 }, { "epoch": 0.9775788044475432, "grad_norm": 4.728188514709473, "learning_rate": 2.6537986283485805e-08, "loss": 0.4123, "step": 3330 }, { "epoch": 0.9805144765329713, "grad_norm": 4.7194037437438965, "learning_rate": 2.024685347941202e-08, "loss": 0.4456, "step": 3340 }, { "epoch": 0.9834501486183993, "grad_norm": 4.34934139251709, "learning_rate": 1.4804808630112244e-08, "loss": 0.4383, "step": 3350 }, { "epoch": 0.9863858207038274, "grad_norm": 4.218658924102783, "learning_rate": 1.0212314750518426e-08, "loss": 0.4368, "step": 3360 }, { "epoch": 0.9893214927892554, "grad_norm": 4.213326454162598, "learning_rate": 6.469762574868866e-09, "loss": 0.4769, "step": 3370 }, { "epoch": 0.9922571648746835, "grad_norm": 3.9344699382781982, "learning_rate": 3.5774705234625783e-09, "loss": 0.4412, "step": 3380 }, { "epoch": 0.9951928369601115, "grad_norm": 3.622770071029663, "learning_rate": 1.5356846755654187e-09, "loss": 0.5059, "step": 3390 }, { "epoch": 0.9981285090455396, "grad_norm": 5.872491359710693, "learning_rate": 3.4457874847793063e-10, "loss": 0.4697, "step": 3400 }, { "epoch": 0.9998899122967965, "step": 3406, "total_flos": 3.2891568428128666e+18, "train_loss": 0.7668657067657847, "train_runtime": 24228.5265, "train_samples_per_second": 4.499, "train_steps_per_second": 0.141 } ], "logging_steps": 10, "max_steps": 3406, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.2891568428128666e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }