{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9996003729852138, "eval_steps": 500, "global_step": 1876, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005328360197149328, "grad_norm": 0.3923943299533747, "learning_rate": 1.0638297872340427e-06, "loss": 1.5282, "step": 1 }, { "epoch": 0.0026641800985746636, "grad_norm": 0.39997680726308193, "learning_rate": 5.319148936170213e-06, "loss": 1.5368, "step": 5 }, { "epoch": 0.005328360197149327, "grad_norm": 0.42655574017759024, "learning_rate": 1.0638297872340426e-05, "loss": 1.5771, "step": 10 }, { "epoch": 0.007992540295723992, "grad_norm": 0.4800547793020182, "learning_rate": 1.595744680851064e-05, "loss": 1.5624, "step": 15 }, { "epoch": 0.010656720394298654, "grad_norm": 0.36870917165643946, "learning_rate": 2.1276595744680852e-05, "loss": 1.5192, "step": 20 }, { "epoch": 0.013320900492873319, "grad_norm": 0.23219684210183744, "learning_rate": 2.6595744680851064e-05, "loss": 1.5223, "step": 25 }, { "epoch": 0.015985080591447983, "grad_norm": 0.2091606557073564, "learning_rate": 3.191489361702128e-05, "loss": 1.4527, "step": 30 }, { "epoch": 0.018649260690022644, "grad_norm": 0.18998890324763035, "learning_rate": 3.723404255319149e-05, "loss": 1.4828, "step": 35 }, { "epoch": 0.02131344078859731, "grad_norm": 0.14996191641971635, "learning_rate": 4.2553191489361704e-05, "loss": 1.4084, "step": 40 }, { "epoch": 0.023977620887171973, "grad_norm": 0.16249029940771254, "learning_rate": 4.787234042553192e-05, "loss": 1.4001, "step": 45 }, { "epoch": 0.026641800985746637, "grad_norm": 0.1460528370416976, "learning_rate": 5.319148936170213e-05, "loss": 1.3865, "step": 50 }, { "epoch": 0.0293059810843213, "grad_norm": 0.12748888043071394, "learning_rate": 5.851063829787234e-05, "loss": 1.3832, "step": 55 }, { "epoch": 0.031970161182895966, "grad_norm": 0.1047553716550612, "learning_rate": 6.382978723404256e-05, "loss": 1.3383, "step": 60 }, { "epoch": 0.03463434128147063, "grad_norm": 1.0945957696792898, "learning_rate": 6.914893617021277e-05, "loss": 1.3216, "step": 65 }, { "epoch": 0.03729852138004529, "grad_norm": 0.0891343071370869, "learning_rate": 7.446808510638298e-05, "loss": 1.3098, "step": 70 }, { "epoch": 0.03996270147861995, "grad_norm": 0.07870697792715504, "learning_rate": 7.978723404255319e-05, "loss": 1.3138, "step": 75 }, { "epoch": 0.04262688157719462, "grad_norm": 0.08760750591416677, "learning_rate": 8.510638297872341e-05, "loss": 1.3006, "step": 80 }, { "epoch": 0.04529106167576928, "grad_norm": 0.08120742351067671, "learning_rate": 9.042553191489363e-05, "loss": 1.2385, "step": 85 }, { "epoch": 0.047955241774343946, "grad_norm": 0.08142820910966997, "learning_rate": 9.574468085106384e-05, "loss": 1.2945, "step": 90 }, { "epoch": 0.05061942187291861, "grad_norm": 0.09177922715037878, "learning_rate": 0.00010106382978723406, "loss": 1.2761, "step": 95 }, { "epoch": 0.053283601971493275, "grad_norm": 0.07677835076593886, "learning_rate": 0.00010638297872340425, "loss": 1.244, "step": 100 }, { "epoch": 0.05594778207006794, "grad_norm": 0.08525332172522379, "learning_rate": 0.00011170212765957446, "loss": 1.2838, "step": 105 }, { "epoch": 0.0586119621686426, "grad_norm": 0.10003274873557398, "learning_rate": 0.00011702127659574468, "loss": 1.2489, "step": 110 }, { "epoch": 0.06127614226721726, "grad_norm": 0.09434455492112725, "learning_rate": 0.0001223404255319149, "loss": 1.2503, "step": 115 }, { "epoch": 0.06394032236579193, "grad_norm": 0.0960230638906738, "learning_rate": 0.00012765957446808513, "loss": 1.2205, "step": 120 }, { "epoch": 0.0666045024643666, "grad_norm": 0.09721981845211174, "learning_rate": 0.00013297872340425532, "loss": 1.2624, "step": 125 }, { "epoch": 0.06926868256294126, "grad_norm": 0.08809066774892928, "learning_rate": 0.00013829787234042554, "loss": 1.2545, "step": 130 }, { "epoch": 0.07193286266151591, "grad_norm": 0.26229245154975894, "learning_rate": 0.00014361702127659576, "loss": 1.2408, "step": 135 }, { "epoch": 0.07459704276009058, "grad_norm": 0.10552899642439768, "learning_rate": 0.00014893617021276596, "loss": 1.2392, "step": 140 }, { "epoch": 0.07726122285866524, "grad_norm": 0.10911221765360271, "learning_rate": 0.00015425531914893618, "loss": 1.2148, "step": 145 }, { "epoch": 0.0799254029572399, "grad_norm": 0.11632059103832315, "learning_rate": 0.00015957446808510637, "loss": 1.2382, "step": 150 }, { "epoch": 0.08258958305581457, "grad_norm": 0.10281933721760748, "learning_rate": 0.00016489361702127662, "loss": 1.226, "step": 155 }, { "epoch": 0.08525376315438923, "grad_norm": 0.10561194502249595, "learning_rate": 0.00017021276595744682, "loss": 1.2531, "step": 160 }, { "epoch": 0.0879179432529639, "grad_norm": 0.10407844313384682, "learning_rate": 0.000175531914893617, "loss": 1.2428, "step": 165 }, { "epoch": 0.09058212335153856, "grad_norm": 0.08952286824052161, "learning_rate": 0.00018085106382978726, "loss": 1.2176, "step": 170 }, { "epoch": 0.09324630345011323, "grad_norm": 0.0938821785588311, "learning_rate": 0.00018617021276595746, "loss": 1.2307, "step": 175 }, { "epoch": 0.09591048354868789, "grad_norm": 0.1162063476232978, "learning_rate": 0.00019148936170212768, "loss": 1.2276, "step": 180 }, { "epoch": 0.09857466364726256, "grad_norm": 0.09632362372953375, "learning_rate": 0.00019680851063829787, "loss": 1.213, "step": 185 }, { "epoch": 0.10123884374583722, "grad_norm": 0.0984821085857903, "learning_rate": 0.00019999930723752516, "loss": 1.2093, "step": 190 }, { "epoch": 0.10390302384441188, "grad_norm": 0.09843324070313318, "learning_rate": 0.00019999151376991434, "loss": 1.2405, "step": 195 }, { "epoch": 0.10656720394298655, "grad_norm": 0.09038802438280195, "learning_rate": 0.00019997506155872244, "loss": 1.2226, "step": 200 }, { "epoch": 0.10923138404156121, "grad_norm": 0.09204563702733434, "learning_rate": 0.00019994995202862512, "loss": 1.1841, "step": 205 }, { "epoch": 0.11189556414013588, "grad_norm": 0.09153264803236888, "learning_rate": 0.00019991618735397672, "loss": 1.1963, "step": 210 }, { "epoch": 0.11455974423871054, "grad_norm": 0.10122556405254433, "learning_rate": 0.00019987377045862202, "loss": 1.1912, "step": 215 }, { "epoch": 0.1172239243372852, "grad_norm": 0.10731171538200276, "learning_rate": 0.00019982270501564284, "loss": 1.2206, "step": 220 }, { "epoch": 0.11988810443585986, "grad_norm": 0.10261300609074893, "learning_rate": 0.00019976299544704026, "loss": 1.2063, "step": 225 }, { "epoch": 0.12255228453443452, "grad_norm": 0.10220846662233612, "learning_rate": 0.00019969464692335152, "loss": 1.2176, "step": 230 }, { "epoch": 0.1252164646330092, "grad_norm": 0.106711961435993, "learning_rate": 0.00019961766536320225, "loss": 1.2338, "step": 235 }, { "epoch": 0.12788064473158386, "grad_norm": 0.08642302944089619, "learning_rate": 0.0001995320574327941, "loss": 1.1834, "step": 240 }, { "epoch": 0.13054482483015853, "grad_norm": 0.10984213170710444, "learning_rate": 0.00019943783054532732, "loss": 1.2157, "step": 245 }, { "epoch": 0.1332090049287332, "grad_norm": 0.09485880486202997, "learning_rate": 0.00019933499286035894, "loss": 1.204, "step": 250 }, { "epoch": 0.13587318502730786, "grad_norm": 0.10445439668189137, "learning_rate": 0.0001992235532830961, "loss": 1.2193, "step": 255 }, { "epoch": 0.13853736512588252, "grad_norm": 0.08363317544398702, "learning_rate": 0.00019910352146362497, "loss": 1.1989, "step": 260 }, { "epoch": 0.1412015452244572, "grad_norm": 0.13246579577470613, "learning_rate": 0.00019897490779607514, "loss": 1.1942, "step": 265 }, { "epoch": 0.14386572532303182, "grad_norm": 0.11140593579855256, "learning_rate": 0.00019883772341771936, "loss": 1.226, "step": 270 }, { "epoch": 0.1465299054216065, "grad_norm": 0.09879562730310984, "learning_rate": 0.0001986919802080093, "loss": 1.2206, "step": 275 }, { "epoch": 0.14919408552018115, "grad_norm": 0.09576696495195093, "learning_rate": 0.00019853769078754686, "loss": 1.2156, "step": 280 }, { "epoch": 0.15185826561875582, "grad_norm": 0.09339602143232201, "learning_rate": 0.00019837486851699104, "loss": 1.2136, "step": 285 }, { "epoch": 0.15452244571733048, "grad_norm": 0.1015702269360243, "learning_rate": 0.0001982035274959014, "loss": 1.228, "step": 290 }, { "epoch": 0.15718662581590515, "grad_norm": 0.10731429321620178, "learning_rate": 0.0001980236825615166, "loss": 1.2084, "step": 295 }, { "epoch": 0.1598508059144798, "grad_norm": 0.11015660781012064, "learning_rate": 0.00019783534928747006, "loss": 1.233, "step": 300 }, { "epoch": 0.16251498601305447, "grad_norm": 0.09317505295867419, "learning_rate": 0.000197638543982441, "loss": 1.1949, "step": 305 }, { "epoch": 0.16517916611162914, "grad_norm": 0.10444172159792364, "learning_rate": 0.00019743328368874237, "loss": 1.2077, "step": 310 }, { "epoch": 0.1678433462102038, "grad_norm": 0.08551570812304626, "learning_rate": 0.00019721958618084507, "loss": 1.206, "step": 315 }, { "epoch": 0.17050752630877847, "grad_norm": 0.08486005829321344, "learning_rate": 0.00019699746996383878, "loss": 1.2162, "step": 320 }, { "epoch": 0.17317170640735313, "grad_norm": 0.08572790019112694, "learning_rate": 0.00019676695427182938, "loss": 1.1866, "step": 325 }, { "epoch": 0.1758358865059278, "grad_norm": 0.0948088094425236, "learning_rate": 0.00019652805906627356, "loss": 1.1903, "step": 330 }, { "epoch": 0.17850006660450246, "grad_norm": 0.09120766035226856, "learning_rate": 0.00019628080503425013, "loss": 1.2231, "step": 335 }, { "epoch": 0.18116424670307713, "grad_norm": 0.10099249661711791, "learning_rate": 0.0001960252135866687, "loss": 1.192, "step": 340 }, { "epoch": 0.1838284268016518, "grad_norm": 0.08142630900860537, "learning_rate": 0.0001957613068564156, "loss": 1.2093, "step": 345 }, { "epoch": 0.18649260690022645, "grad_norm": 0.08243035183498343, "learning_rate": 0.00019548910769643722, "loss": 1.2232, "step": 350 }, { "epoch": 0.18915678699880112, "grad_norm": 0.08113246622113654, "learning_rate": 0.00019520863967776116, "loss": 1.1773, "step": 355 }, { "epoch": 0.19182096709737578, "grad_norm": 0.08217032990907831, "learning_rate": 0.000194919927087455, "loss": 1.1909, "step": 360 }, { "epoch": 0.19448514719595045, "grad_norm": 0.08936356448152723, "learning_rate": 0.00019462299492652336, "loss": 1.1768, "step": 365 }, { "epoch": 0.1971493272945251, "grad_norm": 0.09171279408988006, "learning_rate": 0.00019431786890774264, "loss": 1.1899, "step": 370 }, { "epoch": 0.19981350739309978, "grad_norm": 0.08313643695167447, "learning_rate": 0.00019400457545343464, "loss": 1.224, "step": 375 }, { "epoch": 0.20247768749167444, "grad_norm": 0.08891928174737382, "learning_rate": 0.00019368314169317856, "loss": 1.1723, "step": 380 }, { "epoch": 0.2051418675902491, "grad_norm": 0.08770933776267686, "learning_rate": 0.00019335359546146156, "loss": 1.2028, "step": 385 }, { "epoch": 0.20780604768882377, "grad_norm": 0.08301528792067146, "learning_rate": 0.00019301596529526854, "loss": 1.2056, "step": 390 }, { "epoch": 0.21047022778739843, "grad_norm": 0.08890281842327881, "learning_rate": 0.00019267028043161094, "loss": 1.2138, "step": 395 }, { "epoch": 0.2131344078859731, "grad_norm": 0.0900002819215321, "learning_rate": 0.0001923165708049951, "loss": 1.2051, "step": 400 }, { "epoch": 0.21579858798454776, "grad_norm": 0.10504748978346506, "learning_rate": 0.00019195486704482977, "loss": 1.1954, "step": 405 }, { "epoch": 0.21846276808312243, "grad_norm": 0.09718704604170597, "learning_rate": 0.0001915852004727742, "loss": 1.1639, "step": 410 }, { "epoch": 0.2211269481816971, "grad_norm": 0.08996864675224003, "learning_rate": 0.00019120760310002545, "loss": 1.2265, "step": 415 }, { "epoch": 0.22379112828027176, "grad_norm": 0.07943107198536507, "learning_rate": 0.0001908221076245466, "loss": 1.2169, "step": 420 }, { "epoch": 0.22645530837884642, "grad_norm": 0.08284744366625013, "learning_rate": 0.0001904287474282353, "loss": 1.1828, "step": 425 }, { "epoch": 0.22911948847742108, "grad_norm": 0.07736916644319017, "learning_rate": 0.00019002755657403298, "loss": 1.1979, "step": 430 }, { "epoch": 0.23178366857599575, "grad_norm": 0.08280858698864835, "learning_rate": 0.00018961856980297513, "loss": 1.191, "step": 435 }, { "epoch": 0.2344478486745704, "grad_norm": 0.07937399115370877, "learning_rate": 0.0001892018225311831, "loss": 1.2173, "step": 440 }, { "epoch": 0.23711202877314508, "grad_norm": 0.08409239653616714, "learning_rate": 0.00018877735084679693, "loss": 1.1903, "step": 445 }, { "epoch": 0.23977620887171971, "grad_norm": 0.0825557060792781, "learning_rate": 0.00018834519150685071, "loss": 1.1985, "step": 450 }, { "epoch": 0.24244038897029438, "grad_norm": 0.07982212340626871, "learning_rate": 0.00018790538193408937, "loss": 1.2038, "step": 455 }, { "epoch": 0.24510456906886904, "grad_norm": 0.0774659417434534, "learning_rate": 0.000187457960213728, "loss": 1.1788, "step": 460 }, { "epoch": 0.2477687491674437, "grad_norm": 0.08395126176587492, "learning_rate": 0.00018700296509015406, "loss": 1.1862, "step": 465 }, { "epoch": 0.2504329292660184, "grad_norm": 0.09287329697747186, "learning_rate": 0.00018654043596357217, "loss": 1.2092, "step": 470 }, { "epoch": 0.25309710936459306, "grad_norm": 0.09110234430419713, "learning_rate": 0.00018607041288659236, "loss": 1.1974, "step": 475 }, { "epoch": 0.25576128946316773, "grad_norm": 0.08308651742822264, "learning_rate": 0.00018559293656076166, "loss": 1.1869, "step": 480 }, { "epoch": 0.2584254695617424, "grad_norm": 0.08214254142545631, "learning_rate": 0.0001851080483330396, "loss": 1.1831, "step": 485 }, { "epoch": 0.26108964966031706, "grad_norm": 0.08701064777773787, "learning_rate": 0.00018461579019221774, "loss": 1.1879, "step": 490 }, { "epoch": 0.2637538297588917, "grad_norm": 0.08680397859688439, "learning_rate": 0.00018411620476528362, "loss": 1.1929, "step": 495 }, { "epoch": 0.2664180098574664, "grad_norm": 0.09030897168085561, "learning_rate": 0.0001836093353137297, "loss": 1.1902, "step": 500 }, { "epoch": 0.26908218995604105, "grad_norm": 0.08473182672968443, "learning_rate": 0.00018309522572980673, "loss": 1.2044, "step": 505 }, { "epoch": 0.2717463700546157, "grad_norm": 0.08536458790900281, "learning_rate": 0.00018257392053272345, "loss": 1.2259, "step": 510 }, { "epoch": 0.2744105501531904, "grad_norm": 0.081003787470878, "learning_rate": 0.00018204546486479096, "loss": 1.213, "step": 515 }, { "epoch": 0.27707473025176504, "grad_norm": 0.089272857126091, "learning_rate": 0.00018150990448751394, "loss": 1.1791, "step": 520 }, { "epoch": 0.2797389103503397, "grad_norm": 0.08621840551694437, "learning_rate": 0.0001809672857776278, "loss": 1.2024, "step": 525 }, { "epoch": 0.2824030904489144, "grad_norm": 0.07933042265020962, "learning_rate": 0.00018041765572308278, "loss": 1.2025, "step": 530 }, { "epoch": 0.28506727054748904, "grad_norm": 0.0850752493351441, "learning_rate": 0.00017986106191897493, "loss": 1.1994, "step": 535 }, { "epoch": 0.28773145064606365, "grad_norm": 0.07823580475459424, "learning_rate": 0.00017929755256342479, "loss": 1.2139, "step": 540 }, { "epoch": 0.2903956307446383, "grad_norm": 0.07706877273113974, "learning_rate": 0.0001787271764534035, "loss": 1.1909, "step": 545 }, { "epoch": 0.293059810843213, "grad_norm": 0.07946013242507238, "learning_rate": 0.00017814998298050743, "loss": 1.1795, "step": 550 }, { "epoch": 0.29572399094178764, "grad_norm": 0.08180878258450536, "learning_rate": 0.00017756602212668082, "loss": 1.1906, "step": 555 }, { "epoch": 0.2983881710403623, "grad_norm": 0.09250927603458256, "learning_rate": 0.00017697534445988803, "loss": 1.1779, "step": 560 }, { "epoch": 0.30105235113893697, "grad_norm": 0.09177332060381718, "learning_rate": 0.00017637800112973428, "loss": 1.1723, "step": 565 }, { "epoch": 0.30371653123751163, "grad_norm": 0.0837886550110598, "learning_rate": 0.00017577404386303645, "loss": 1.1954, "step": 570 }, { "epoch": 0.3063807113360863, "grad_norm": 0.09291852958714042, "learning_rate": 0.0001751635249593439, "loss": 1.1913, "step": 575 }, { "epoch": 0.30904489143466096, "grad_norm": 0.0751186613096299, "learning_rate": 0.00017454649728640943, "loss": 1.1884, "step": 580 }, { "epoch": 0.3117090715332356, "grad_norm": 0.08335342725723932, "learning_rate": 0.00017392301427561146, "loss": 1.2182, "step": 585 }, { "epoch": 0.3143732516318103, "grad_norm": 0.10512490135127568, "learning_rate": 0.00017329312991732688, "loss": 1.2022, "step": 590 }, { "epoch": 0.31703743173038496, "grad_norm": 0.08736469147276345, "learning_rate": 0.00017265689875625587, "loss": 1.2034, "step": 595 }, { "epoch": 0.3197016118289596, "grad_norm": 0.08181172036849664, "learning_rate": 0.00017201437588669878, "loss": 1.1734, "step": 600 }, { "epoch": 0.3223657919275343, "grad_norm": 0.10161889430894355, "learning_rate": 0.0001713656169477849, "loss": 1.1819, "step": 605 }, { "epoch": 0.32502997202610895, "grad_norm": 0.08576387885641809, "learning_rate": 0.00017071067811865476, "loss": 1.2189, "step": 610 }, { "epoch": 0.3276941521246836, "grad_norm": 0.07896254402827822, "learning_rate": 0.00017004961611359506, "loss": 1.1975, "step": 615 }, { "epoch": 0.3303583322232583, "grad_norm": 0.0843725771070502, "learning_rate": 0.00016938248817712767, "loss": 1.2049, "step": 620 }, { "epoch": 0.33302251232183294, "grad_norm": 0.08922430755828152, "learning_rate": 0.0001687093520790524, "loss": 1.2, "step": 625 }, { "epoch": 0.3356866924204076, "grad_norm": 0.09060345427129264, "learning_rate": 0.00016803026610944462, "loss": 1.2019, "step": 630 }, { "epoch": 0.33835087251898227, "grad_norm": 0.08487192271777715, "learning_rate": 0.0001673452890736074, "loss": 1.2101, "step": 635 }, { "epoch": 0.34101505261755694, "grad_norm": 0.10237244952453312, "learning_rate": 0.00016665448028697961, "loss": 1.1917, "step": 640 }, { "epoch": 0.3436792327161316, "grad_norm": 0.08297722193267411, "learning_rate": 0.0001659578995699991, "loss": 1.1714, "step": 645 }, { "epoch": 0.34634341281470626, "grad_norm": 0.08034790490692274, "learning_rate": 0.00016525560724292305, "loss": 1.2135, "step": 650 }, { "epoch": 0.34900759291328093, "grad_norm": 0.07638178286632234, "learning_rate": 0.00016454766412060402, "loss": 1.1725, "step": 655 }, { "epoch": 0.3516717730118556, "grad_norm": 0.08455753089941971, "learning_rate": 0.00016383413150722415, "loss": 1.1842, "step": 660 }, { "epoch": 0.35433595311043026, "grad_norm": 0.07805666953810234, "learning_rate": 0.00016311507119098627, "loss": 1.2005, "step": 665 }, { "epoch": 0.3570001332090049, "grad_norm": 0.07613396423899182, "learning_rate": 0.00016239054543876343, "loss": 1.1971, "step": 670 }, { "epoch": 0.3596643133075796, "grad_norm": 0.07964041743179047, "learning_rate": 0.00016166061699070703, "loss": 1.2247, "step": 675 }, { "epoch": 0.36232849340615425, "grad_norm": 0.07545427492471256, "learning_rate": 0.00016092534905481367, "loss": 1.1965, "step": 680 }, { "epoch": 0.3649926735047289, "grad_norm": 0.07935466685302715, "learning_rate": 0.00016018480530145175, "loss": 1.1777, "step": 685 }, { "epoch": 0.3676568536033036, "grad_norm": 0.07697591776205723, "learning_rate": 0.00015943904985784796, "loss": 1.2043, "step": 690 }, { "epoch": 0.37032103370187824, "grad_norm": 0.08353307335035773, "learning_rate": 0.00015868814730253422, "loss": 1.2016, "step": 695 }, { "epoch": 0.3729852138004529, "grad_norm": 0.12083203321167697, "learning_rate": 0.00015793216265975538, "loss": 1.2039, "step": 700 }, { "epoch": 0.3756493938990276, "grad_norm": 0.08002384831591002, "learning_rate": 0.0001571711613938387, "loss": 1.2018, "step": 705 }, { "epoch": 0.37831357399760224, "grad_norm": 0.07416696808155057, "learning_rate": 0.00015640520940352474, "loss": 1.1761, "step": 710 }, { "epoch": 0.3809777540961769, "grad_norm": 0.07477584114343706, "learning_rate": 0.00015563437301626095, "loss": 1.1691, "step": 715 }, { "epoch": 0.38364193419475157, "grad_norm": 0.08021029434297425, "learning_rate": 0.00015485871898245822, "loss": 1.1879, "step": 720 }, { "epoch": 0.38630611429332623, "grad_norm": 0.07655243996754035, "learning_rate": 0.0001540783144697103, "loss": 1.1895, "step": 725 }, { "epoch": 0.3889702943919009, "grad_norm": 0.07480027547778711, "learning_rate": 0.0001532932270569778, "loss": 1.1975, "step": 730 }, { "epoch": 0.39163447449047556, "grad_norm": 0.0777051603884027, "learning_rate": 0.00015250352472873603, "loss": 1.1676, "step": 735 }, { "epoch": 0.3942986545890502, "grad_norm": 0.09274122893001194, "learning_rate": 0.00015170927586908786, "loss": 1.185, "step": 740 }, { "epoch": 0.3969628346876249, "grad_norm": 0.08369763065847857, "learning_rate": 0.00015091054925584204, "loss": 1.1839, "step": 745 }, { "epoch": 0.39962701478619955, "grad_norm": 0.0785954093676615, "learning_rate": 0.0001501074140545575, "loss": 1.195, "step": 750 }, { "epoch": 0.4022911948847742, "grad_norm": 0.08125955697905107, "learning_rate": 0.00014929993981255388, "loss": 1.167, "step": 755 }, { "epoch": 0.4049553749833489, "grad_norm": 0.0795318945758808, "learning_rate": 0.00014848819645288915, "loss": 1.1809, "step": 760 }, { "epoch": 0.40761955508192355, "grad_norm": 0.08215872783779934, "learning_rate": 0.0001476722542683045, "loss": 1.1982, "step": 765 }, { "epoch": 0.4102837351804982, "grad_norm": 0.0902569586423544, "learning_rate": 0.0001468521839151375, "loss": 1.2058, "step": 770 }, { "epoch": 0.4129479152790729, "grad_norm": 0.08977396388549971, "learning_rate": 0.00014602805640720373, "loss": 1.2046, "step": 775 }, { "epoch": 0.41561209537764754, "grad_norm": 0.08797468277446341, "learning_rate": 0.00014519994310964698, "loss": 1.1767, "step": 780 }, { "epoch": 0.4182762754762222, "grad_norm": 0.08154618218534682, "learning_rate": 0.0001443679157327598, "loss": 1.2016, "step": 785 }, { "epoch": 0.42094045557479687, "grad_norm": 0.08123872265319151, "learning_rate": 0.00014353204632577352, "loss": 1.1836, "step": 790 }, { "epoch": 0.42360463567337153, "grad_norm": 0.09034057444914761, "learning_rate": 0.00014269240727061928, "loss": 1.1984, "step": 795 }, { "epoch": 0.4262688157719462, "grad_norm": 0.07751594264193425, "learning_rate": 0.00014184907127566006, "loss": 1.1721, "step": 800 }, { "epoch": 0.42893299587052086, "grad_norm": 0.07817067745827988, "learning_rate": 0.00014100211136939457, "loss": 1.2066, "step": 805 }, { "epoch": 0.4315971759690955, "grad_norm": 0.07666379004066795, "learning_rate": 0.00014015160089413331, "loss": 1.201, "step": 810 }, { "epoch": 0.4342613560676702, "grad_norm": 0.07977661703965382, "learning_rate": 0.00013929761349964755, "loss": 1.1986, "step": 815 }, { "epoch": 0.43692553616624485, "grad_norm": 0.07515238252186936, "learning_rate": 0.00013844022313679166, "loss": 1.1673, "step": 820 }, { "epoch": 0.4395897162648195, "grad_norm": 0.08668062284259438, "learning_rate": 0.00013757950405109926, "loss": 1.2055, "step": 825 }, { "epoch": 0.4422538963633942, "grad_norm": 0.07958110157591258, "learning_rate": 0.00013671553077635403, "loss": 1.2052, "step": 830 }, { "epoch": 0.44491807646196885, "grad_norm": 0.07801662094774196, "learning_rate": 0.00013584837812813554, "loss": 1.191, "step": 835 }, { "epoch": 0.4475822565605435, "grad_norm": 0.0766973195738648, "learning_rate": 0.00013497812119734037, "loss": 1.1918, "step": 840 }, { "epoch": 0.4502464366591182, "grad_norm": 0.07475449320976814, "learning_rate": 0.00013410483534367988, "loss": 1.1837, "step": 845 }, { "epoch": 0.45291061675769284, "grad_norm": 0.07991050463902843, "learning_rate": 0.0001332285961891543, "loss": 1.1941, "step": 850 }, { "epoch": 0.4555747968562675, "grad_norm": 0.08254589522833469, "learning_rate": 0.00013234947961150438, "loss": 1.1782, "step": 855 }, { "epoch": 0.45823897695484217, "grad_norm": 0.07717793943049406, "learning_rate": 0.0001314675617376406, "loss": 1.1773, "step": 860 }, { "epoch": 0.46090315705341683, "grad_norm": 0.07981760014460237, "learning_rate": 0.00013058291893705123, "loss": 1.1587, "step": 865 }, { "epoch": 0.4635673371519915, "grad_norm": 0.0746596071448608, "learning_rate": 0.00012969562781518884, "loss": 1.1672, "step": 870 }, { "epoch": 0.46623151725056616, "grad_norm": 0.07546467676535663, "learning_rate": 0.00012880576520683687, "loss": 1.188, "step": 875 }, { "epoch": 0.4688956973491408, "grad_norm": 0.0822763180376801, "learning_rate": 0.00012791340816945609, "loss": 1.1773, "step": 880 }, { "epoch": 0.4715598774477155, "grad_norm": 0.08094564516346787, "learning_rate": 0.00012701863397651176, "loss": 1.1692, "step": 885 }, { "epoch": 0.47422405754629016, "grad_norm": 0.08080906995900732, "learning_rate": 0.00012612152011078233, "loss": 1.1923, "step": 890 }, { "epoch": 0.47688823764486477, "grad_norm": 0.07799247485478207, "learning_rate": 0.00012522214425764953, "loss": 1.1762, "step": 895 }, { "epoch": 0.47955241774343943, "grad_norm": 0.07890128228587008, "learning_rate": 0.00012432058429837152, "loss": 1.1872, "step": 900 }, { "epoch": 0.4822165978420141, "grad_norm": 0.07930485805581218, "learning_rate": 0.00012341691830333867, "loss": 1.1801, "step": 905 }, { "epoch": 0.48488077794058876, "grad_norm": 0.07523574346910737, "learning_rate": 0.000122511224525313, "loss": 1.1705, "step": 910 }, { "epoch": 0.4875449580391634, "grad_norm": 0.08019564137195122, "learning_rate": 0.00012160358139265202, "loss": 1.1968, "step": 915 }, { "epoch": 0.4902091381377381, "grad_norm": 0.07799871453882441, "learning_rate": 0.00012069406750251713, "loss": 1.2037, "step": 920 }, { "epoch": 0.49287331823631275, "grad_norm": 0.07616406333142303, "learning_rate": 0.00011978276161406756, "loss": 1.1771, "step": 925 }, { "epoch": 0.4955374983348874, "grad_norm": 0.08287250388641591, "learning_rate": 0.00011886974264164037, "loss": 1.1817, "step": 930 }, { "epoch": 0.4982016784334621, "grad_norm": 0.07709252333335805, "learning_rate": 0.00011795508964791659, "loss": 1.1837, "step": 935 }, { "epoch": 0.5008658585320368, "grad_norm": 0.07774534231906956, "learning_rate": 0.00011703888183707512, "loss": 1.1801, "step": 940 }, { "epoch": 0.5035300386306114, "grad_norm": 0.07836641922774835, "learning_rate": 0.00011612119854793377, "loss": 1.1928, "step": 945 }, { "epoch": 0.5061942187291861, "grad_norm": 0.0855819667886154, "learning_rate": 0.00011520211924707917, "loss": 1.2115, "step": 950 }, { "epoch": 0.5088583988277607, "grad_norm": 0.08249753825043267, "learning_rate": 0.00011428172352198534, "loss": 1.1902, "step": 955 }, { "epoch": 0.5115225789263355, "grad_norm": 0.07960165957450928, "learning_rate": 0.00011336009107412162, "loss": 1.1846, "step": 960 }, { "epoch": 0.5141867590249101, "grad_norm": 0.0844585882351588, "learning_rate": 0.00011243730171205118, "loss": 1.1546, "step": 965 }, { "epoch": 0.5168509391234848, "grad_norm": 0.07615164060165457, "learning_rate": 0.00011151343534451994, "loss": 1.1909, "step": 970 }, { "epoch": 0.5195151192220594, "grad_norm": 0.08628193709778877, "learning_rate": 0.00011058857197353683, "loss": 1.1832, "step": 975 }, { "epoch": 0.5221792993206341, "grad_norm": 0.08464663568633256, "learning_rate": 0.0001096627916874461, "loss": 1.19, "step": 980 }, { "epoch": 0.5248434794192087, "grad_norm": 0.07335066380801168, "learning_rate": 0.00010873617465399209, "loss": 1.1962, "step": 985 }, { "epoch": 0.5275076595177834, "grad_norm": 0.07968663002138815, "learning_rate": 0.00010780880111337703, "loss": 1.1882, "step": 990 }, { "epoch": 0.530171839616358, "grad_norm": 0.08264162145143913, "learning_rate": 0.00010688075137131282, "loss": 1.1731, "step": 995 }, { "epoch": 0.5328360197149328, "grad_norm": 0.0801021939451612, "learning_rate": 0.00010595210579206676, "loss": 1.1947, "step": 1000 }, { "epoch": 0.5355001998135074, "grad_norm": 0.08017946990238331, "learning_rate": 0.0001050229447915027, "loss": 1.2001, "step": 1005 }, { "epoch": 0.5381643799120821, "grad_norm": 0.07658836800590599, "learning_rate": 0.0001040933488301171, "loss": 1.2002, "step": 1010 }, { "epoch": 0.5408285600106567, "grad_norm": 0.07982866205360158, "learning_rate": 0.00010316339840607194, "loss": 1.1836, "step": 1015 }, { "epoch": 0.5434927401092314, "grad_norm": 0.07960723865086924, "learning_rate": 0.0001022331740482237, "loss": 1.1777, "step": 1020 }, { "epoch": 0.546156920207806, "grad_norm": 0.0748679705543507, "learning_rate": 0.00010130275630915009, "loss": 1.1921, "step": 1025 }, { "epoch": 0.5488211003063808, "grad_norm": 0.0878356815626402, "learning_rate": 0.00010037222575817475, "loss": 1.1709, "step": 1030 }, { "epoch": 0.5514852804049554, "grad_norm": 0.07848930190109027, "learning_rate": 9.944166297439011e-05, "loss": 1.1896, "step": 1035 }, { "epoch": 0.5541494605035301, "grad_norm": 0.07598028143410929, "learning_rate": 9.85111485396798e-05, "loss": 1.1671, "step": 1040 }, { "epoch": 0.5568136406021047, "grad_norm": 0.07455678199305199, "learning_rate": 9.758076303174082e-05, "loss": 1.1879, "step": 1045 }, { "epoch": 0.5594778207006794, "grad_norm": 0.07933883340689663, "learning_rate": 9.665058701710561e-05, "loss": 1.1906, "step": 1050 }, { "epoch": 0.562142000799254, "grad_norm": 0.0782311193459855, "learning_rate": 9.572070104416566e-05, "loss": 1.1814, "step": 1055 }, { "epoch": 0.5648061808978287, "grad_norm": 0.07728561745740628, "learning_rate": 9.479118563619636e-05, "loss": 1.179, "step": 1060 }, { "epoch": 0.5674703609964034, "grad_norm": 0.08077639691991045, "learning_rate": 9.386212128438412e-05, "loss": 1.1957, "step": 1065 }, { "epoch": 0.5701345410949781, "grad_norm": 0.07773225366778684, "learning_rate": 9.29335884408562e-05, "loss": 1.221, "step": 1070 }, { "epoch": 0.5727987211935527, "grad_norm": 0.07773099884244754, "learning_rate": 9.2005667511714e-05, "loss": 1.158, "step": 1075 }, { "epoch": 0.5754629012921273, "grad_norm": 0.07725515386954404, "learning_rate": 9.107843885007042e-05, "loss": 1.1699, "step": 1080 }, { "epoch": 0.578127081390702, "grad_norm": 0.08162033290693702, "learning_rate": 9.015198274909151e-05, "loss": 1.1885, "step": 1085 }, { "epoch": 0.5807912614892766, "grad_norm": 0.07774378164612243, "learning_rate": 8.922637943504361e-05, "loss": 1.1924, "step": 1090 }, { "epoch": 0.5834554415878513, "grad_norm": 0.08627267331483346, "learning_rate": 8.830170906034625e-05, "loss": 1.1971, "step": 1095 }, { "epoch": 0.586119621686426, "grad_norm": 0.07645366300993464, "learning_rate": 8.737805169663114e-05, "loss": 1.1807, "step": 1100 }, { "epoch": 0.5887838017850007, "grad_norm": 0.08443619410444902, "learning_rate": 8.645548732780864e-05, "loss": 1.1761, "step": 1105 }, { "epoch": 0.5914479818835753, "grad_norm": 0.07706639361213953, "learning_rate": 8.553409584314138e-05, "loss": 1.1902, "step": 1110 }, { "epoch": 0.59411216198215, "grad_norm": 0.08249092376191036, "learning_rate": 8.461395703032638e-05, "loss": 1.1839, "step": 1115 }, { "epoch": 0.5967763420807246, "grad_norm": 0.08065147422026245, "learning_rate": 8.369515056858575e-05, "loss": 1.1731, "step": 1120 }, { "epoch": 0.5994405221792993, "grad_norm": 0.07848658439918688, "learning_rate": 8.277775602176702e-05, "loss": 1.177, "step": 1125 }, { "epoch": 0.6021047022778739, "grad_norm": 0.07816482146783796, "learning_rate": 8.186185283145325e-05, "loss": 1.1625, "step": 1130 }, { "epoch": 0.6047688823764487, "grad_norm": 0.07727396916592474, "learning_rate": 8.094752031008371e-05, "loss": 1.2127, "step": 1135 }, { "epoch": 0.6074330624750233, "grad_norm": 0.0789877501243841, "learning_rate": 8.003483763408603e-05, "loss": 1.1685, "step": 1140 }, { "epoch": 0.610097242573598, "grad_norm": 0.07854117414343613, "learning_rate": 7.912388383701982e-05, "loss": 1.1826, "step": 1145 }, { "epoch": 0.6127614226721726, "grad_norm": 0.07974248826415456, "learning_rate": 7.821473780273279e-05, "loss": 1.1867, "step": 1150 }, { "epoch": 0.6154256027707473, "grad_norm": 0.08234486503543673, "learning_rate": 7.730747825852975e-05, "loss": 1.1928, "step": 1155 }, { "epoch": 0.6180897828693219, "grad_norm": 0.08113984284337296, "learning_rate": 7.64021837683554e-05, "loss": 1.2018, "step": 1160 }, { "epoch": 0.6207539629678966, "grad_norm": 0.07823635237833673, "learning_rate": 7.549893272599098e-05, "loss": 1.1756, "step": 1165 }, { "epoch": 0.6234181430664713, "grad_norm": 0.07948360741609674, "learning_rate": 7.459780334826578e-05, "loss": 1.2052, "step": 1170 }, { "epoch": 0.626082323165046, "grad_norm": 0.07981232728150925, "learning_rate": 7.369887366828405e-05, "loss": 1.1935, "step": 1175 }, { "epoch": 0.6287465032636206, "grad_norm": 0.07772028594630517, "learning_rate": 7.28022215286676e-05, "loss": 1.1742, "step": 1180 }, { "epoch": 0.6314106833621953, "grad_norm": 0.07942822498880386, "learning_rate": 7.190792457481526e-05, "loss": 1.2044, "step": 1185 }, { "epoch": 0.6340748634607699, "grad_norm": 0.08072671416547043, "learning_rate": 7.101606024817888e-05, "loss": 1.2139, "step": 1190 }, { "epoch": 0.6367390435593446, "grad_norm": 0.07657851251411404, "learning_rate": 7.01267057795577e-05, "loss": 1.1771, "step": 1195 }, { "epoch": 0.6394032236579192, "grad_norm": 0.07629161839506614, "learning_rate": 6.923993818241013e-05, "loss": 1.1878, "step": 1200 }, { "epoch": 0.642067403756494, "grad_norm": 0.07750592384625017, "learning_rate": 6.83558342461851e-05, "loss": 1.1965, "step": 1205 }, { "epoch": 0.6447315838550686, "grad_norm": 0.08074061148056243, "learning_rate": 6.747447052967246e-05, "loss": 1.1598, "step": 1210 }, { "epoch": 0.6473957639536433, "grad_norm": 0.08114992875515604, "learning_rate": 6.659592335437321e-05, "loss": 1.1863, "step": 1215 }, { "epoch": 0.6500599440522179, "grad_norm": 0.07837122688644742, "learning_rate": 6.572026879789064e-05, "loss": 1.1789, "step": 1220 }, { "epoch": 0.6527241241507926, "grad_norm": 0.07904963655043487, "learning_rate": 6.484758268734226e-05, "loss": 1.1988, "step": 1225 }, { "epoch": 0.6553883042493672, "grad_norm": 0.07794516218687547, "learning_rate": 6.397794059279376e-05, "loss": 1.1797, "step": 1230 }, { "epoch": 0.658052484347942, "grad_norm": 0.0782549564009468, "learning_rate": 6.311141782071486e-05, "loss": 1.1861, "step": 1235 }, { "epoch": 0.6607166644465166, "grad_norm": 0.08113466467014144, "learning_rate": 6.224808940745814e-05, "loss": 1.1812, "step": 1240 }, { "epoch": 0.6633808445450913, "grad_norm": 0.0751998700186739, "learning_rate": 6.138803011276157e-05, "loss": 1.1903, "step": 1245 }, { "epoch": 0.6660450246436659, "grad_norm": 0.08263899166467202, "learning_rate": 6.0531314413274306e-05, "loss": 1.1652, "step": 1250 }, { "epoch": 0.6687092047422406, "grad_norm": 0.07705824954536489, "learning_rate": 5.9678016496107737e-05, "loss": 1.1811, "step": 1255 }, { "epoch": 0.6713733848408152, "grad_norm": 0.08540183694555796, "learning_rate": 5.8828210252410995e-05, "loss": 1.1896, "step": 1260 }, { "epoch": 0.6740375649393899, "grad_norm": 0.0742001043082849, "learning_rate": 5.798196927097259e-05, "loss": 1.1709, "step": 1265 }, { "epoch": 0.6767017450379645, "grad_norm": 0.2483647268345394, "learning_rate": 5.7139366831847955e-05, "loss": 1.1841, "step": 1270 }, { "epoch": 0.6793659251365393, "grad_norm": 0.07966799417452507, "learning_rate": 5.63004759000136e-05, "loss": 1.1739, "step": 1275 }, { "epoch": 0.6820301052351139, "grad_norm": 0.07969094718480424, "learning_rate": 5.546536911904896e-05, "loss": 1.1903, "step": 1280 }, { "epoch": 0.6846942853336886, "grad_norm": 0.0828241197230709, "learning_rate": 5.463411880484577e-05, "loss": 1.1802, "step": 1285 }, { "epoch": 0.6873584654322632, "grad_norm": 0.09372413723978523, "learning_rate": 5.3806796939345685e-05, "loss": 1.1786, "step": 1290 }, { "epoch": 0.6900226455308379, "grad_norm": 0.07631033783827931, "learning_rate": 5.298347516430748e-05, "loss": 1.1895, "step": 1295 }, { "epoch": 0.6926868256294125, "grad_norm": 0.07690415214661994, "learning_rate": 5.216422477510267e-05, "loss": 1.1913, "step": 1300 }, { "epoch": 0.6953510057279872, "grad_norm": 0.0767097655138434, "learning_rate": 5.1349116714542144e-05, "loss": 1.1685, "step": 1305 }, { "epoch": 0.6980151858265619, "grad_norm": 0.07609380641851764, "learning_rate": 5.053822156673276e-05, "loss": 1.1907, "step": 1310 }, { "epoch": 0.7006793659251366, "grad_norm": 0.07956518870952646, "learning_rate": 4.973160955096496e-05, "loss": 1.1668, "step": 1315 }, { "epoch": 0.7033435460237112, "grad_norm": 0.0781847554601962, "learning_rate": 4.892935051563242e-05, "loss": 1.1898, "step": 1320 }, { "epoch": 0.7060077261222859, "grad_norm": 0.07968044970093434, "learning_rate": 4.8131513932183415e-05, "loss": 1.2072, "step": 1325 }, { "epoch": 0.7086719062208605, "grad_norm": 0.07875194128774458, "learning_rate": 4.733816888910483e-05, "loss": 1.178, "step": 1330 }, { "epoch": 0.7113360863194352, "grad_norm": 0.08181988531463072, "learning_rate": 4.654938408593974e-05, "loss": 1.1679, "step": 1335 }, { "epoch": 0.7140002664180098, "grad_norm": 0.08458887612797164, "learning_rate": 4.576522782733802e-05, "loss": 1.1925, "step": 1340 }, { "epoch": 0.7166644465165846, "grad_norm": 0.07929665848737272, "learning_rate": 4.4985768017142014e-05, "loss": 1.1942, "step": 1345 }, { "epoch": 0.7193286266151592, "grad_norm": 0.0793190517544045, "learning_rate": 4.421107215250586e-05, "loss": 1.1504, "step": 1350 }, { "epoch": 0.7219928067137339, "grad_norm": 0.0793615584488964, "learning_rate": 4.3441207318051005e-05, "loss": 1.1704, "step": 1355 }, { "epoch": 0.7246569868123085, "grad_norm": 0.08055131022376696, "learning_rate": 4.2676240180056856e-05, "loss": 1.1937, "step": 1360 }, { "epoch": 0.7273211669108832, "grad_norm": 0.08173806907452158, "learning_rate": 4.191623698068778e-05, "loss": 1.1779, "step": 1365 }, { "epoch": 0.7299853470094578, "grad_norm": 0.07773805545181321, "learning_rate": 4.116126353225703e-05, "loss": 1.1846, "step": 1370 }, { "epoch": 0.7326495271080325, "grad_norm": 0.07667327254766519, "learning_rate": 4.0411385211527684e-05, "loss": 1.2095, "step": 1375 }, { "epoch": 0.7353137072066072, "grad_norm": 0.08098668895333083, "learning_rate": 3.96666669540512e-05, "loss": 1.1682, "step": 1380 }, { "epoch": 0.7379778873051819, "grad_norm": 0.07984137500350058, "learning_rate": 3.892717324854459e-05, "loss": 1.1729, "step": 1385 }, { "epoch": 0.7406420674037565, "grad_norm": 0.07922048060290626, "learning_rate": 3.8192968131305886e-05, "loss": 1.1775, "step": 1390 }, { "epoch": 0.7433062475023312, "grad_norm": 0.07652665688687964, "learning_rate": 3.746411518066894e-05, "loss": 1.1621, "step": 1395 }, { "epoch": 0.7459704276009058, "grad_norm": 0.0798026979694231, "learning_rate": 3.674067751149796e-05, "loss": 1.1702, "step": 1400 }, { "epoch": 0.7486346076994805, "grad_norm": 0.08300115098412487, "learning_rate": 3.602271776972188e-05, "loss": 1.1533, "step": 1405 }, { "epoch": 0.7512987877980551, "grad_norm": 0.08575297897696614, "learning_rate": 3.5310298126909816e-05, "loss": 1.2051, "step": 1410 }, { "epoch": 0.7539629678966299, "grad_norm": 0.07861668329891834, "learning_rate": 3.46034802748872e-05, "loss": 1.1804, "step": 1415 }, { "epoch": 0.7566271479952045, "grad_norm": 0.07590825262005231, "learning_rate": 3.390232542039352e-05, "loss": 1.1846, "step": 1420 }, { "epoch": 0.7592913280937792, "grad_norm": 0.07760055527146281, "learning_rate": 3.320689427978232e-05, "loss": 1.174, "step": 1425 }, { "epoch": 0.7619555081923538, "grad_norm": 0.07989565355982597, "learning_rate": 3.251724707376324e-05, "loss": 1.1696, "step": 1430 }, { "epoch": 0.7646196882909284, "grad_norm": 0.07764798745610466, "learning_rate": 3.1833443522187454e-05, "loss": 1.1761, "step": 1435 }, { "epoch": 0.7672838683895031, "grad_norm": 0.07976913879065081, "learning_rate": 3.115554283887614e-05, "loss": 1.1909, "step": 1440 }, { "epoch": 0.7699480484880777, "grad_norm": 0.08144781158937257, "learning_rate": 3.0483603726492836e-05, "loss": 1.1718, "step": 1445 }, { "epoch": 0.7726122285866525, "grad_norm": 0.07748040591215276, "learning_rate": 2.9817684371460153e-05, "loss": 1.1867, "step": 1450 }, { "epoch": 0.7752764086852271, "grad_norm": 0.07690798090395808, "learning_rate": 2.9157842438921047e-05, "loss": 1.201, "step": 1455 }, { "epoch": 0.7779405887838018, "grad_norm": 0.07507016568699426, "learning_rate": 2.8504135067745464e-05, "loss": 1.1881, "step": 1460 }, { "epoch": 0.7806047688823764, "grad_norm": 0.07900678246787794, "learning_rate": 2.7856618865582318e-05, "loss": 1.1734, "step": 1465 }, { "epoch": 0.7832689489809511, "grad_norm": 0.08067040264461905, "learning_rate": 2.721534990395752e-05, "loss": 1.2003, "step": 1470 }, { "epoch": 0.7859331290795257, "grad_norm": 0.08005195988665038, "learning_rate": 2.658038371341859e-05, "loss": 1.1898, "step": 1475 }, { "epoch": 0.7885973091781004, "grad_norm": 0.07899201399633156, "learning_rate": 2.5951775278725955e-05, "loss": 1.1934, "step": 1480 }, { "epoch": 0.7912614892766751, "grad_norm": 0.07642642805468737, "learning_rate": 2.5329579034091455e-05, "loss": 1.2073, "step": 1485 }, { "epoch": 0.7939256693752498, "grad_norm": 0.08151590645902157, "learning_rate": 2.4713848858464817e-05, "loss": 1.181, "step": 1490 }, { "epoch": 0.7965898494738244, "grad_norm": 0.07944386459666944, "learning_rate": 2.410463807086786e-05, "loss": 1.1955, "step": 1495 }, { "epoch": 0.7992540295723991, "grad_norm": 0.07711356209897446, "learning_rate": 2.3501999425777432e-05, "loss": 1.1891, "step": 1500 }, { "epoch": 0.8019182096709737, "grad_norm": 0.08004988252971652, "learning_rate": 2.2905985108557114e-05, "loss": 1.1851, "step": 1505 }, { "epoch": 0.8045823897695484, "grad_norm": 0.08534573467737261, "learning_rate": 2.2316646730938196e-05, "loss": 1.1721, "step": 1510 }, { "epoch": 0.807246569868123, "grad_norm": 0.07920097969121198, "learning_rate": 2.173403532655046e-05, "loss": 1.1694, "step": 1515 }, { "epoch": 0.8099107499666978, "grad_norm": 0.07511500294571079, "learning_rate": 2.1158201346502926e-05, "loss": 1.1746, "step": 1520 }, { "epoch": 0.8125749300652724, "grad_norm": 0.07737975818247868, "learning_rate": 2.0589194655014898e-05, "loss": 1.185, "step": 1525 }, { "epoch": 0.8152391101638471, "grad_norm": 0.08190267832244168, "learning_rate": 2.0027064525098236e-05, "loss": 1.195, "step": 1530 }, { "epoch": 0.8179032902624217, "grad_norm": 0.08131306910775678, "learning_rate": 1.9471859634290336e-05, "loss": 1.1742, "step": 1535 }, { "epoch": 0.8205674703609964, "grad_norm": 0.0858733149118693, "learning_rate": 1.8923628060439036e-05, "loss": 1.1898, "step": 1540 }, { "epoch": 0.823231650459571, "grad_norm": 0.07675405619600406, "learning_rate": 1.838241727753931e-05, "loss": 1.1881, "step": 1545 }, { "epoch": 0.8258958305581457, "grad_norm": 0.07682847096326867, "learning_rate": 1.7848274151622234e-05, "loss": 1.1805, "step": 1550 }, { "epoch": 0.8285600106567204, "grad_norm": 0.08364774954218579, "learning_rate": 1.732124493669671e-05, "loss": 1.192, "step": 1555 }, { "epoch": 0.8312241907552951, "grad_norm": 0.07984768371406777, "learning_rate": 1.6801375270743924e-05, "loss": 1.1858, "step": 1560 }, { "epoch": 0.8338883708538697, "grad_norm": 0.08106959134364375, "learning_rate": 1.6288710171765576e-05, "loss": 1.1813, "step": 1565 }, { "epoch": 0.8365525509524444, "grad_norm": 0.07836491305718539, "learning_rate": 1.578329403388541e-05, "loss": 1.1881, "step": 1570 }, { "epoch": 0.839216731051019, "grad_norm": 0.0758115839482243, "learning_rate": 1.528517062350492e-05, "loss": 1.1889, "step": 1575 }, { "epoch": 0.8418809111495937, "grad_norm": 0.07588248787185455, "learning_rate": 1.4794383075513452e-05, "loss": 1.1768, "step": 1580 }, { "epoch": 0.8445450912481683, "grad_norm": 0.08067020299948538, "learning_rate": 1.431097388955297e-05, "loss": 1.2063, "step": 1585 }, { "epoch": 0.8472092713467431, "grad_norm": 0.07642268182718946, "learning_rate": 1.3834984926337657e-05, "loss": 1.1589, "step": 1590 }, { "epoch": 0.8498734514453177, "grad_norm": 0.07651179067647247, "learning_rate": 1.3366457404029275e-05, "loss": 1.2185, "step": 1595 }, { "epoch": 0.8525376315438924, "grad_norm": 0.07900907023829833, "learning_rate": 1.2905431894667553e-05, "loss": 1.163, "step": 1600 }, { "epoch": 0.855201811642467, "grad_norm": 0.07852531932632778, "learning_rate": 1.2451948320657114e-05, "loss": 1.1827, "step": 1605 }, { "epoch": 0.8578659917410417, "grad_norm": 0.08065290367816753, "learning_rate": 1.200604595131033e-05, "loss": 1.1723, "step": 1610 }, { "epoch": 0.8605301718396163, "grad_norm": 0.0779423491553262, "learning_rate": 1.1567763399446718e-05, "loss": 1.1636, "step": 1615 }, { "epoch": 0.863194351938191, "grad_norm": 0.07899604520177515, "learning_rate": 1.1137138618049404e-05, "loss": 1.2024, "step": 1620 }, { "epoch": 0.8658585320367657, "grad_norm": 0.08253045338915561, "learning_rate": 1.0714208896978484e-05, "loss": 1.1735, "step": 1625 }, { "epoch": 0.8685227121353404, "grad_norm": 0.07870321328036661, "learning_rate": 1.0299010859742009e-05, "loss": 1.1731, "step": 1630 }, { "epoch": 0.871186892233915, "grad_norm": 0.08272725502586431, "learning_rate": 9.891580460324523e-06, "loss": 1.1929, "step": 1635 }, { "epoch": 0.8738510723324897, "grad_norm": 0.07935943504829367, "learning_rate": 9.491952980073604e-06, "loss": 1.1709, "step": 1640 }, { "epoch": 0.8765152524310643, "grad_norm": 0.07895683534188976, "learning_rate": 9.100163024644815e-06, "loss": 1.1712, "step": 1645 }, { "epoch": 0.879179432529639, "grad_norm": 0.08082741880893497, "learning_rate": 8.716244521004846e-06, "loss": 1.1698, "step": 1650 }, { "epoch": 0.8818436126282136, "grad_norm": 0.07991694698538618, "learning_rate": 8.34023071449378e-06, "loss": 1.1789, "step": 1655 }, { "epoch": 0.8845077927267884, "grad_norm": 0.08214765689148273, "learning_rate": 7.972154165946155e-06, "loss": 1.1845, "step": 1660 }, { "epoch": 0.887171972825363, "grad_norm": 0.08176590536508709, "learning_rate": 7.612046748871327e-06, "loss": 1.1771, "step": 1665 }, { "epoch": 0.8898361529239377, "grad_norm": 0.0808798899357427, "learning_rate": 7.25993964669347e-06, "loss": 1.2, "step": 1670 }, { "epoch": 0.8925003330225123, "grad_norm": 0.07776019044347303, "learning_rate": 6.915863350051199e-06, "loss": 1.204, "step": 1675 }, { "epoch": 0.895164513121087, "grad_norm": 0.07706209163365262, "learning_rate": 6.579847654157234e-06, "loss": 1.1972, "step": 1680 }, { "epoch": 0.8978286932196616, "grad_norm": 0.07653638119319102, "learning_rate": 6.2519216562183516e-06, "loss": 1.1623, "step": 1685 }, { "epoch": 0.9004928733182364, "grad_norm": 0.07709508406908554, "learning_rate": 5.932113752915658e-06, "loss": 1.165, "step": 1690 }, { "epoch": 0.903157053416811, "grad_norm": 0.07922295657062284, "learning_rate": 5.620451637945567e-06, "loss": 1.19, "step": 1695 }, { "epoch": 0.9058212335153857, "grad_norm": 0.07654014983869019, "learning_rate": 5.316962299621808e-06, "loss": 1.1708, "step": 1700 }, { "epoch": 0.9084854136139603, "grad_norm": 0.07678567172116282, "learning_rate": 5.0216720185381595e-06, "loss": 1.1873, "step": 1705 }, { "epoch": 0.911149593712535, "grad_norm": 0.07752534581690265, "learning_rate": 4.734606365292871e-06, "loss": 1.175, "step": 1710 }, { "epoch": 0.9138137738111096, "grad_norm": 0.07543079231709852, "learning_rate": 4.4557901982743345e-06, "loss": 1.1718, "step": 1715 }, { "epoch": 0.9164779539096843, "grad_norm": 0.0847130046966339, "learning_rate": 4.185247661508396e-06, "loss": 1.1853, "step": 1720 }, { "epoch": 0.919142134008259, "grad_norm": 0.07672126462506362, "learning_rate": 3.923002182567737e-06, "loss": 1.1528, "step": 1725 }, { "epoch": 0.9218063141068337, "grad_norm": 0.08103275321155728, "learning_rate": 3.6690764705430537e-06, "loss": 1.1925, "step": 1730 }, { "epoch": 0.9244704942054083, "grad_norm": 0.07625095781555866, "learning_rate": 3.423492514076654e-06, "loss": 1.1466, "step": 1735 }, { "epoch": 0.927134674303983, "grad_norm": 0.07795348864062035, "learning_rate": 3.186271579458333e-06, "loss": 1.1804, "step": 1740 }, { "epoch": 0.9297988544025576, "grad_norm": 0.0798431550705619, "learning_rate": 2.9574342087837382e-06, "loss": 1.1948, "step": 1745 }, { "epoch": 0.9324630345011323, "grad_norm": 0.07723730498120181, "learning_rate": 2.7370002181757114e-06, "loss": 1.194, "step": 1750 }, { "epoch": 0.9351272145997069, "grad_norm": 0.076024197686587, "learning_rate": 2.52498869606812e-06, "loss": 1.1553, "step": 1755 }, { "epoch": 0.9377913946982817, "grad_norm": 0.07525767304214721, "learning_rate": 2.3214180015530218e-06, "loss": 1.1717, "step": 1760 }, { "epoch": 0.9404555747968563, "grad_norm": 0.07866979015778834, "learning_rate": 2.1263057627908478e-06, "loss": 1.1877, "step": 1765 }, { "epoch": 0.943119754895431, "grad_norm": 0.07918446703667696, "learning_rate": 1.9396688754838355e-06, "loss": 1.1825, "step": 1770 }, { "epoch": 0.9457839349940056, "grad_norm": 0.07807410852748545, "learning_rate": 1.7615235014130205e-06, "loss": 1.1597, "step": 1775 }, { "epoch": 0.9484481150925803, "grad_norm": 0.07773597995281895, "learning_rate": 1.5918850670386676e-06, "loss": 1.1736, "step": 1780 }, { "epoch": 0.9511122951911549, "grad_norm": 0.08101968421115548, "learning_rate": 1.4307682621644392e-06, "loss": 1.1726, "step": 1785 }, { "epoch": 0.9537764752897295, "grad_norm": 0.07640809417651317, "learning_rate": 1.2781870386653017e-06, "loss": 1.176, "step": 1790 }, { "epoch": 0.9564406553883043, "grad_norm": 0.07624826302993815, "learning_rate": 1.1341546092794475e-06, "loss": 1.1712, "step": 1795 }, { "epoch": 0.9591048354868789, "grad_norm": 0.07644512408140866, "learning_rate": 9.986834464640328e-07, "loss": 1.1804, "step": 1800 }, { "epoch": 0.9617690155854536, "grad_norm": 0.078535810502388, "learning_rate": 8.717852813152073e-07, "loss": 1.1634, "step": 1805 }, { "epoch": 0.9644331956840282, "grad_norm": 0.07806596561964084, "learning_rate": 7.534711025522167e-07, "loss": 1.1685, "step": 1810 }, { "epoch": 0.9670973757826029, "grad_norm": 0.07845914770402948, "learning_rate": 6.437511555658748e-07, "loss": 1.1704, "step": 1815 }, { "epoch": 0.9697615558811775, "grad_norm": 0.07631630513554058, "learning_rate": 5.426349415313503e-07, "loss": 1.167, "step": 1820 }, { "epoch": 0.9724257359797522, "grad_norm": 0.07644461634231602, "learning_rate": 4.5013121658538107e-07, "loss": 1.1952, "step": 1825 }, { "epoch": 0.9750899160783268, "grad_norm": 0.07797081390243328, "learning_rate": 3.662479910681027e-07, "loss": 1.1587, "step": 1830 }, { "epoch": 0.9777540961769016, "grad_norm": 0.07746705967903876, "learning_rate": 2.909925288293369e-07, "loss": 1.1729, "step": 1835 }, { "epoch": 0.9804182762754762, "grad_norm": 0.08118653860364235, "learning_rate": 2.2437134659962778e-07, "loss": 1.1637, "step": 1840 }, { "epoch": 0.9830824563740509, "grad_norm": 0.07709302387499754, "learning_rate": 1.6639021342588213e-07, "loss": 1.1714, "step": 1845 }, { "epoch": 0.9857466364726255, "grad_norm": 0.07514251112734556, "learning_rate": 1.1705415017183585e-07, "loss": 1.1826, "step": 1850 }, { "epoch": 0.9884108165712002, "grad_norm": 0.07785113536208763, "learning_rate": 7.636742908324613e-08, "loss": 1.1894, "step": 1855 }, { "epoch": 0.9910749966697748, "grad_norm": 0.08355649548742282, "learning_rate": 4.4333573417953967e-08, "loss": 1.1833, "step": 1860 }, { "epoch": 0.9937391767683496, "grad_norm": 0.07679966879567424, "learning_rate": 2.0955357140783893e-08, "loss": 1.1922, "step": 1865 }, { "epoch": 0.9964033568669242, "grad_norm": 0.07806796015090577, "learning_rate": 6.234804683336038e-09, "loss": 1.1612, "step": 1870 }, { "epoch": 0.9990675369654989, "grad_norm": 0.07653772348938681, "learning_rate": 1.7319076868194117e-10, "loss": 1.1679, "step": 1875 }, { "epoch": 0.9996003729852138, "eval_loss": 1.1644140481948853, "eval_runtime": 1556.6302, "eval_samples_per_second": 8.594, "eval_steps_per_second": 0.538, "step": 1876 }, { "epoch": 0.9996003729852138, "step": 1876, "total_flos": 2.3594858912415744e+16, "train_loss": 1.2015958401694227, "train_runtime": 40566.6998, "train_samples_per_second": 2.961, "train_steps_per_second": 0.046 } ], "logging_steps": 5, "max_steps": 1876, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.3594858912415744e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }