{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.273117937291103, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006366385484641095, "grad_norm": 28.7295833201599, "learning_rate": 0.00011219390703061137, "loss": 6.6223, "step": 10 }, { "epoch": 0.01273277096928219, "grad_norm": 15.081303427956621, "learning_rate": 0.00014596763837756142, "loss": 2.7812, "step": 20 }, { "epoch": 0.019099156453923283, "grad_norm": 17.081283324526982, "learning_rate": 0.00016572400472495783, "loss": 2.6574, "step": 30 }, { "epoch": 0.02546554193856438, "grad_norm": 15.201762160364256, "learning_rate": 0.00017974136972451145, "loss": 2.9039, "step": 40 }, { "epoch": 0.031831927423205474, "grad_norm": 11.557044296106628, "learning_rate": 0.0001906140827142727, "loss": 2.9066, "step": 50 }, { "epoch": 0.03819831290784657, "grad_norm": 14.915432761425704, "learning_rate": 0.00019949773607190786, "loss": 2.8914, "step": 60 }, { "epoch": 0.04456469839248767, "grad_norm": 8.745902020340656, "learning_rate": 0.0002070087579637228, "loss": 2.9014, "step": 70 }, { "epoch": 0.05093108387712876, "grad_norm": 8.857658675771324, "learning_rate": 0.0002135151010714615, "loss": 2.9293, "step": 80 }, { "epoch": 0.057297469361769854, "grad_norm": 11.48593285000581, "learning_rate": 0.0002192541024193043, "loss": 2.8625, "step": 90 }, { "epoch": 0.06366385484641095, "grad_norm": 7.815960040918993, "learning_rate": 0.00022438781406122275, "loss": 3.0063, "step": 100 }, { "epoch": 0.07003024033105204, "grad_norm": 8.632620514774269, "learning_rate": 0.00022903182113161202, "loss": 2.9527, "step": 110 }, { "epoch": 0.07639662581569313, "grad_norm": 7.955348207734164, "learning_rate": 0.00023327146741885792, "loss": 3.0082, "step": 120 }, { "epoch": 0.08276301130033424, "grad_norm": 7.216941061016277, "learning_rate": 0.00023717156393669215, "loss": 2.7871, "step": 130 }, { "epoch": 0.08912939678497533, "grad_norm": 5.822424121539524, "learning_rate": 0.00024078248931067283, "loss": 2.7844, "step": 140 }, { "epoch": 0.09549578226961643, "grad_norm": 7.906464628684026, "learning_rate": 0.00024414418040861915, "loss": 2.8514, "step": 150 }, { "epoch": 0.10186216775425752, "grad_norm": 7.514192904534888, "learning_rate": 0.00024728883241841157, "loss": 3.0086, "step": 160 }, { "epoch": 0.10822855323889861, "grad_norm": 6.425203024033055, "learning_rate": 0.0002502427789216415, "loss": 2.8754, "step": 170 }, { "epoch": 0.11459493872353971, "grad_norm": 7.088171053123606, "learning_rate": 0.00025302783376625435, "loss": 2.9094, "step": 180 }, { "epoch": 0.1209613242081808, "grad_norm": 6.538392044337595, "learning_rate": 0.00025566226965097254, "loss": 2.9262, "step": 190 }, { "epoch": 0.1273277096928219, "grad_norm": 8.672338929603391, "learning_rate": 0.0002581615454081728, "loss": 2.8391, "step": 200 }, { "epoch": 0.133694095177463, "grad_norm": 7.204443295582082, "learning_rate": 0.00026053885565806924, "loss": 2.9121, "step": 210 }, { "epoch": 0.14006048066210408, "grad_norm": 6.463760927732387, "learning_rate": 0.0002628055524785621, "loss": 2.9268, "step": 220 }, { "epoch": 0.1464268661467452, "grad_norm": 6.869739493367925, "learning_rate": 0.0002649714732657648, "loss": 2.8965, "step": 230 }, { "epoch": 0.15279325163138627, "grad_norm": 5.613452359830457, "learning_rate": 0.00026704519876580795, "loss": 2.9688, "step": 240 }, { "epoch": 0.15915963711602737, "grad_norm": 5.7575322101372866, "learning_rate": 0.000269034258397934, "loss": 2.9008, "step": 250 }, { "epoch": 0.16552602260066848, "grad_norm": 7.337481922102622, "learning_rate": 0.00027094529528364224, "loss": 2.9398, "step": 260 }, { "epoch": 0.17189240808530956, "grad_norm": 6.95242288679829, "learning_rate": 0.00027278420011365073, "loss": 2.8973, "step": 270 }, { "epoch": 0.17825879356995067, "grad_norm": 4.554710482953035, "learning_rate": 0.00027455622065762283, "loss": 2.9867, "step": 280 }, { "epoch": 0.18462517905459175, "grad_norm": 7.670629856220385, "learning_rate": 0.00027626605204863905, "loss": 2.9191, "step": 290 }, { "epoch": 0.19099156453923286, "grad_norm": 6.140158435864239, "learning_rate": 0.0002779179117555692, "loss": 2.8199, "step": 300 }, { "epoch": 0.19735795002387393, "grad_norm": 6.06880330501044, "learning_rate": 0.00027951560225766885, "loss": 2.959, "step": 310 }, { "epoch": 0.20372433550851504, "grad_norm": 5.932458703590213, "learning_rate": 0.0002810625637653616, "loss": 2.9984, "step": 320 }, { "epoch": 0.21009072099315615, "grad_norm": 5.413884214713577, "learning_rate": 0.0002825619188259585, "loss": 2.8937, "step": 330 }, { "epoch": 0.21645710647779723, "grad_norm": 5.6133458231383315, "learning_rate": 0.00028401651026859154, "loss": 2.8844, "step": 340 }, { "epoch": 0.22282349196243834, "grad_norm": 6.157515185075721, "learning_rate": 0.00028542893364738413, "loss": 2.9137, "step": 350 }, { "epoch": 0.22918987744707942, "grad_norm": 6.150435968994946, "learning_rate": 0.0002868015651132044, "loss": 2.8086, "step": 360 }, { "epoch": 0.23555626293172052, "grad_norm": 6.1391865377467205, "learning_rate": 0.00028813658546582825, "loss": 2.9246, "step": 370 }, { "epoch": 0.2419226484163616, "grad_norm": 4.720212170517568, "learning_rate": 0.0002894360009979226, "loss": 2.7273, "step": 380 }, { "epoch": 0.2482890339010027, "grad_norm": 5.221438084968306, "learning_rate": 0.0002907016616310386, "loss": 2.8332, "step": 390 }, { "epoch": 0.2546554193856438, "grad_norm": 5.397389581466363, "learning_rate": 0.00029193527675512284, "loss": 2.776, "step": 400 }, { "epoch": 0.26102180487028487, "grad_norm": 7.525981812223918, "learning_rate": 0.0002931384291118962, "loss": 2.8727, "step": 410 }, { "epoch": 0.267388190354926, "grad_norm": 5.7116155190893805, "learning_rate": 0.00029431258700501927, "loss": 2.9766, "step": 420 }, { "epoch": 0.2737545758395671, "grad_norm": 7.012023587035556, "learning_rate": 0.00029545911507334384, "loss": 2.7902, "step": 430 }, { "epoch": 0.28012096132420816, "grad_norm": 4.8865448559050435, "learning_rate": 0.0002965792838255121, "loss": 2.6873, "step": 440 }, { "epoch": 0.2864873468088493, "grad_norm": 4.682656054807096, "learning_rate": 0.0002976742781029656, "loss": 2.8498, "step": 450 }, { "epoch": 0.2928537322934904, "grad_norm": 4.114956894922335, "learning_rate": 0.00029874520461271485, "loss": 2.7461, "step": 460 }, { "epoch": 0.29922011777813146, "grad_norm": 4.388435500348782, "learning_rate": 0.0002997930986499357, "loss": 2.9004, "step": 470 }, { "epoch": 0.30558650326277254, "grad_norm": 4.06992963685517, "learning_rate": 0.0002995048337656213, "loss": 2.8203, "step": 480 }, { "epoch": 0.31195288874741367, "grad_norm": 5.7122354924160375, "learning_rate": 0.0002987974534307946, "loss": 2.8477, "step": 490 }, { "epoch": 0.31831927423205475, "grad_norm": 4.543743759310822, "learning_rate": 0.0002980900730959679, "loss": 2.8168, "step": 500 }, { "epoch": 0.32468565971669583, "grad_norm": 5.2449822235737384, "learning_rate": 0.0002973826927611412, "loss": 2.7021, "step": 510 }, { "epoch": 0.33105204520133696, "grad_norm": 6.42302117474529, "learning_rate": 0.0002966753124263145, "loss": 2.8545, "step": 520 }, { "epoch": 0.33741843068597804, "grad_norm": 6.4116258183908945, "learning_rate": 0.0002959679320914878, "loss": 2.8436, "step": 530 }, { "epoch": 0.3437848161706191, "grad_norm": 4.982523320381435, "learning_rate": 0.00029526055175666113, "loss": 2.7607, "step": 540 }, { "epoch": 0.3501512016552602, "grad_norm": 4.815920229901925, "learning_rate": 0.00029455317142183443, "loss": 2.734, "step": 550 }, { "epoch": 0.35651758713990134, "grad_norm": 9.300484207985622, "learning_rate": 0.00029384579108700774, "loss": 2.7889, "step": 560 }, { "epoch": 0.3628839726245424, "grad_norm": 4.5559522343853285, "learning_rate": 0.00029313841075218105, "loss": 2.7531, "step": 570 }, { "epoch": 0.3692503581091835, "grad_norm": 5.351641807589207, "learning_rate": 0.0002924310304173544, "loss": 2.7451, "step": 580 }, { "epoch": 0.37561674359382463, "grad_norm": 5.696588702079196, "learning_rate": 0.00029172365008252766, "loss": 2.7189, "step": 590 }, { "epoch": 0.3819831290784657, "grad_norm": 4.694068773418385, "learning_rate": 0.00029101626974770096, "loss": 2.6096, "step": 600 }, { "epoch": 0.3883495145631068, "grad_norm": 9.013555425033168, "learning_rate": 0.0002903088894128743, "loss": 2.6605, "step": 610 }, { "epoch": 0.39471590004774787, "grad_norm": 10.964569620769009, "learning_rate": 0.00028960150907804763, "loss": 2.7258, "step": 620 }, { "epoch": 0.401082285532389, "grad_norm": 8.94321766980447, "learning_rate": 0.00028889412874322093, "loss": 2.6688, "step": 630 }, { "epoch": 0.4074486710170301, "grad_norm": 10.742420289850541, "learning_rate": 0.0002881867484083942, "loss": 2.573, "step": 640 }, { "epoch": 0.41381505650167116, "grad_norm": 8.890111021830462, "learning_rate": 0.00028747936807356755, "loss": 2.5846, "step": 650 }, { "epoch": 0.4201814419863123, "grad_norm": 27.744833393643688, "learning_rate": 0.00028677198773874085, "loss": 2.5406, "step": 660 }, { "epoch": 0.4265478274709534, "grad_norm": 8.283239589638123, "learning_rate": 0.00028606460740391416, "loss": 2.602, "step": 670 }, { "epoch": 0.43291421295559446, "grad_norm": 7.738262298672388, "learning_rate": 0.00028535722706908746, "loss": 2.6947, "step": 680 }, { "epoch": 0.43928059844023554, "grad_norm": 11.536106786052837, "learning_rate": 0.00028464984673426077, "loss": 2.6422, "step": 690 }, { "epoch": 0.44564698392487667, "grad_norm": 7.93060742031869, "learning_rate": 0.0002839424663994341, "loss": 2.5859, "step": 700 }, { "epoch": 0.45201336940951775, "grad_norm": 4.031662354294752, "learning_rate": 0.0002832350860646074, "loss": 2.5947, "step": 710 }, { "epoch": 0.45837975489415883, "grad_norm": 5.258278074052536, "learning_rate": 0.0002825277057297807, "loss": 2.64, "step": 720 }, { "epoch": 0.4647461403787999, "grad_norm": 4.323834597980534, "learning_rate": 0.000281820325394954, "loss": 2.6949, "step": 730 }, { "epoch": 0.47111252586344105, "grad_norm": 5.0167173121086135, "learning_rate": 0.0002811129450601273, "loss": 2.6221, "step": 740 }, { "epoch": 0.4774789113480821, "grad_norm": 5.933856548166221, "learning_rate": 0.0002804055647253006, "loss": 2.632, "step": 750 }, { "epoch": 0.4838452968327232, "grad_norm": 5.090852518324657, "learning_rate": 0.0002796981843904739, "loss": 2.8242, "step": 760 }, { "epoch": 0.49021168231736434, "grad_norm": 4.60199924467046, "learning_rate": 0.0002789908040556472, "loss": 2.6357, "step": 770 }, { "epoch": 0.4965780678020054, "grad_norm": 4.62334978934682, "learning_rate": 0.0002782834237208206, "loss": 2.6455, "step": 780 }, { "epoch": 0.5029444532866465, "grad_norm": 5.7098868621282834, "learning_rate": 0.0002775760433859938, "loss": 2.7822, "step": 790 }, { "epoch": 0.5093108387712876, "grad_norm": 5.492909395839608, "learning_rate": 0.00027686866305116713, "loss": 2.5965, "step": 800 }, { "epoch": 0.5156772242559287, "grad_norm": 5.007751033932614, "learning_rate": 0.00027616128271634044, "loss": 2.6355, "step": 810 }, { "epoch": 0.5220436097405697, "grad_norm": 6.170648429353342, "learning_rate": 0.0002754539023815138, "loss": 2.6123, "step": 820 }, { "epoch": 0.5284099952252109, "grad_norm": 4.235113061064731, "learning_rate": 0.0002747465220466871, "loss": 2.6453, "step": 830 }, { "epoch": 0.534776380709852, "grad_norm": 4.538569485793573, "learning_rate": 0.00027403914171186036, "loss": 2.6713, "step": 840 }, { "epoch": 0.5411427661944931, "grad_norm": 4.019460059963649, "learning_rate": 0.0002733317613770337, "loss": 2.5943, "step": 850 }, { "epoch": 0.5475091516791342, "grad_norm": 6.254490760667432, "learning_rate": 0.000272624381042207, "loss": 2.5418, "step": 860 }, { "epoch": 0.5538755371637752, "grad_norm": 4.520259916472461, "learning_rate": 0.0002719170007073803, "loss": 2.584, "step": 870 }, { "epoch": 0.5602419226484163, "grad_norm": 5.065731135931019, "learning_rate": 0.0002712096203725536, "loss": 2.6723, "step": 880 }, { "epoch": 0.5666083081330574, "grad_norm": 3.833075034277501, "learning_rate": 0.00027050224003772694, "loss": 2.5248, "step": 890 }, { "epoch": 0.5729746936176986, "grad_norm": 5.507080157861899, "learning_rate": 0.00026979485970290024, "loss": 2.6379, "step": 900 }, { "epoch": 0.5793410791023397, "grad_norm": 14.128248882331409, "learning_rate": 0.00026908747936807355, "loss": 2.5904, "step": 910 }, { "epoch": 0.5857074645869808, "grad_norm": 6.1778278241357265, "learning_rate": 0.00026838009903324686, "loss": 2.5844, "step": 920 }, { "epoch": 0.5920738500716218, "grad_norm": 8.064848357512728, "learning_rate": 0.00026767271869842016, "loss": 2.6055, "step": 930 }, { "epoch": 0.5984402355562629, "grad_norm": 6.16438539022089, "learning_rate": 0.00026696533836359347, "loss": 2.5506, "step": 940 }, { "epoch": 0.604806621040904, "grad_norm": 5.882124357483614, "learning_rate": 0.0002662579580287668, "loss": 2.4973, "step": 950 }, { "epoch": 0.6111730065255451, "grad_norm": 8.373991467730166, "learning_rate": 0.0002655505776939401, "loss": 2.5428, "step": 960 }, { "epoch": 0.6175393920101863, "grad_norm": 6.831051520016273, "learning_rate": 0.0002648431973591134, "loss": 2.4982, "step": 970 }, { "epoch": 0.6239057774948273, "grad_norm": 5.074869694709693, "learning_rate": 0.0002641358170242867, "loss": 2.5465, "step": 980 }, { "epoch": 0.6302721629794684, "grad_norm": 5.503484162830985, "learning_rate": 0.00026342843668946, "loss": 2.324, "step": 990 }, { "epoch": 0.6366385484641095, "grad_norm": 4.385405886090266, "learning_rate": 0.0002627210563546333, "loss": 2.417, "step": 1000 }, { "epoch": 0.6430049339487506, "grad_norm": 5.034207848822367, "learning_rate": 0.0002620136760198066, "loss": 2.6066, "step": 1010 }, { "epoch": 0.6493713194333917, "grad_norm": 3.696840611162144, "learning_rate": 0.00026130629568497997, "loss": 2.5441, "step": 1020 }, { "epoch": 0.6557377049180327, "grad_norm": 4.6314444028768875, "learning_rate": 0.0002605989153501533, "loss": 2.4752, "step": 1030 }, { "epoch": 0.6621040904026739, "grad_norm": 3.4663340841035093, "learning_rate": 0.0002598915350153265, "loss": 2.4637, "step": 1040 }, { "epoch": 0.668470475887315, "grad_norm": 8.334602958229153, "learning_rate": 0.00025918415468049983, "loss": 2.6082, "step": 1050 }, { "epoch": 0.6748368613719561, "grad_norm": 4.575304861734687, "learning_rate": 0.0002584767743456732, "loss": 2.4842, "step": 1060 }, { "epoch": 0.6812032468565972, "grad_norm": 3.618603659881246, "learning_rate": 0.0002577693940108465, "loss": 2.4982, "step": 1070 }, { "epoch": 0.6875696323412382, "grad_norm": 4.4740750234847475, "learning_rate": 0.00025706201367601975, "loss": 2.4029, "step": 1080 }, { "epoch": 0.6939360178258793, "grad_norm": 4.208151138305904, "learning_rate": 0.0002563546333411931, "loss": 2.4836, "step": 1090 }, { "epoch": 0.7003024033105204, "grad_norm": 5.736418133529195, "learning_rate": 0.0002556472530063664, "loss": 2.6402, "step": 1100 }, { "epoch": 0.7066687887951616, "grad_norm": 3.6852138552936395, "learning_rate": 0.0002549398726715397, "loss": 2.4941, "step": 1110 }, { "epoch": 0.7130351742798027, "grad_norm": 4.11339792490071, "learning_rate": 0.000254232492336713, "loss": 2.4641, "step": 1120 }, { "epoch": 0.7194015597644438, "grad_norm": 3.7487169212790477, "learning_rate": 0.00025352511200188633, "loss": 2.3414, "step": 1130 }, { "epoch": 0.7257679452490848, "grad_norm": 3.442251781584789, "learning_rate": 0.00025281773166705964, "loss": 2.3932, "step": 1140 }, { "epoch": 0.7321343307337259, "grad_norm": 3.8470452186807846, "learning_rate": 0.00025211035133223294, "loss": 2.4334, "step": 1150 }, { "epoch": 0.738500716218367, "grad_norm": 4.506628903442333, "learning_rate": 0.00025140297099740625, "loss": 2.3195, "step": 1160 }, { "epoch": 0.7448671017030081, "grad_norm": 4.325389792461109, "learning_rate": 0.00025069559066257955, "loss": 2.4027, "step": 1170 }, { "epoch": 0.7512334871876493, "grad_norm": 3.8669860452410445, "learning_rate": 0.00024998821032775286, "loss": 2.4686, "step": 1180 }, { "epoch": 0.7575998726722903, "grad_norm": 3.561694549590622, "learning_rate": 0.00024928082999292617, "loss": 2.5395, "step": 1190 }, { "epoch": 0.7639662581569314, "grad_norm": 4.156976510059248, "learning_rate": 0.00024857344965809947, "loss": 2.4416, "step": 1200 }, { "epoch": 0.7703326436415725, "grad_norm": 5.616798309964706, "learning_rate": 0.0002478660693232728, "loss": 2.4055, "step": 1210 }, { "epoch": 0.7766990291262136, "grad_norm": 3.149306910194074, "learning_rate": 0.0002471586889884461, "loss": 2.4488, "step": 1220 }, { "epoch": 0.7830654146108547, "grad_norm": 3.0254085146949974, "learning_rate": 0.00024645130865361944, "loss": 2.44, "step": 1230 }, { "epoch": 0.7894318000954957, "grad_norm": 3.3553234269170815, "learning_rate": 0.0002457439283187927, "loss": 2.4689, "step": 1240 }, { "epoch": 0.7957981855801369, "grad_norm": 3.560968004340506, "learning_rate": 0.000245036547983966, "loss": 2.4328, "step": 1250 }, { "epoch": 0.802164571064778, "grad_norm": 3.670143268942363, "learning_rate": 0.00024432916764913936, "loss": 2.357, "step": 1260 }, { "epoch": 0.8085309565494191, "grad_norm": 4.603180595509743, "learning_rate": 0.00024362178731431264, "loss": 2.4336, "step": 1270 }, { "epoch": 0.8148973420340602, "grad_norm": 3.2173958923931845, "learning_rate": 0.00024291440697948595, "loss": 2.3859, "step": 1280 }, { "epoch": 0.8212637275187012, "grad_norm": 4.238015371838477, "learning_rate": 0.00024220702664465925, "loss": 2.4234, "step": 1290 }, { "epoch": 0.8276301130033423, "grad_norm": 3.350821041839247, "learning_rate": 0.00024149964630983258, "loss": 2.34, "step": 1300 }, { "epoch": 0.8339964984879834, "grad_norm": 3.956822835287818, "learning_rate": 0.00024079226597500586, "loss": 2.409, "step": 1310 }, { "epoch": 0.8403628839726246, "grad_norm": 3.206745954274851, "learning_rate": 0.00024008488564017917, "loss": 2.4766, "step": 1320 }, { "epoch": 0.8467292694572657, "grad_norm": 3.16798874537864, "learning_rate": 0.0002393775053053525, "loss": 2.4918, "step": 1330 }, { "epoch": 0.8530956549419068, "grad_norm": 3.2084119932227613, "learning_rate": 0.0002386701249705258, "loss": 2.3279, "step": 1340 }, { "epoch": 0.8594620404265478, "grad_norm": 2.6626163554970694, "learning_rate": 0.00023796274463569909, "loss": 2.3762, "step": 1350 }, { "epoch": 0.8658284259111889, "grad_norm": 3.543568104802714, "learning_rate": 0.00023725536430087242, "loss": 2.4406, "step": 1360 }, { "epoch": 0.87219481139583, "grad_norm": 3.0950338366546832, "learning_rate": 0.00023654798396604572, "loss": 2.4135, "step": 1370 }, { "epoch": 0.8785611968804711, "grad_norm": 3.4459117956268583, "learning_rate": 0.00023584060363121903, "loss": 2.3371, "step": 1380 }, { "epoch": 0.8849275823651122, "grad_norm": 3.3339357175581026, "learning_rate": 0.00023513322329639234, "loss": 2.3568, "step": 1390 }, { "epoch": 0.8912939678497533, "grad_norm": 2.7770602679788836, "learning_rate": 0.00023442584296156567, "loss": 2.4703, "step": 1400 }, { "epoch": 0.8976603533343944, "grad_norm": 3.1587359306963925, "learning_rate": 0.00023371846262673895, "loss": 2.3687, "step": 1410 }, { "epoch": 0.9040267388190355, "grad_norm": 3.6463373041615057, "learning_rate": 0.00023301108229191225, "loss": 2.5037, "step": 1420 }, { "epoch": 0.9103931243036766, "grad_norm": 3.8799007179484066, "learning_rate": 0.00023230370195708559, "loss": 2.3883, "step": 1430 }, { "epoch": 0.9167595097883177, "grad_norm": 2.650501985473451, "learning_rate": 0.0002315963216222589, "loss": 2.3818, "step": 1440 }, { "epoch": 0.9231258952729587, "grad_norm": 3.3575457214628353, "learning_rate": 0.00023088894128743217, "loss": 2.2014, "step": 1450 }, { "epoch": 0.9294922807575998, "grad_norm": 4.160903578127555, "learning_rate": 0.00023018156095260548, "loss": 2.3586, "step": 1460 }, { "epoch": 0.935858666242241, "grad_norm": 3.7275531046661805, "learning_rate": 0.0002294741806177788, "loss": 2.4684, "step": 1470 }, { "epoch": 0.9422250517268821, "grad_norm": 3.5804971221330515, "learning_rate": 0.00022876680028295211, "loss": 2.3984, "step": 1480 }, { "epoch": 0.9485914372115232, "grad_norm": 2.903402028010888, "learning_rate": 0.00022805941994812542, "loss": 2.3508, "step": 1490 }, { "epoch": 0.9549578226961642, "grad_norm": 3.2883720153819804, "learning_rate": 0.00022735203961329875, "loss": 2.468, "step": 1500 }, { "epoch": 0.9613242081808053, "grad_norm": 3.165068282067837, "learning_rate": 0.00022664465927847203, "loss": 2.3387, "step": 1510 }, { "epoch": 0.9676905936654464, "grad_norm": 2.8318847580527895, "learning_rate": 0.00022593727894364534, "loss": 2.235, "step": 1520 }, { "epoch": 0.9740569791500875, "grad_norm": 3.0627223359274556, "learning_rate": 0.00022522989860881867, "loss": 2.4135, "step": 1530 }, { "epoch": 0.9804233646347287, "grad_norm": 4.63667156924246, "learning_rate": 0.00022452251827399198, "loss": 2.3891, "step": 1540 }, { "epoch": 0.9867897501193698, "grad_norm": 2.7155003830350735, "learning_rate": 0.00022381513793916526, "loss": 2.2814, "step": 1550 }, { "epoch": 0.9931561356040108, "grad_norm": 3.743790594304019, "learning_rate": 0.00022310775760433856, "loss": 2.1729, "step": 1560 }, { "epoch": 0.9995225210886519, "grad_norm": 3.0964479659673336, "learning_rate": 0.0002224003772695119, "loss": 2.3365, "step": 1570 }, { "epoch": 1.005729746936177, "grad_norm": 3.9150118918426307, "learning_rate": 0.0002216929969346852, "loss": 1.9852, "step": 1580 }, { "epoch": 1.012096132420818, "grad_norm": 3.5380613834183614, "learning_rate": 0.0002209856165998585, "loss": 2.0525, "step": 1590 }, { "epoch": 1.0184625179054592, "grad_norm": 3.1487817928894613, "learning_rate": 0.00022027823626503184, "loss": 1.9064, "step": 1600 }, { "epoch": 1.0248289033901004, "grad_norm": 3.266369156342291, "learning_rate": 0.00021957085593020512, "loss": 1.8943, "step": 1610 }, { "epoch": 1.0311952888747413, "grad_norm": 3.4336556947723307, "learning_rate": 0.00021886347559537842, "loss": 1.7861, "step": 1620 }, { "epoch": 1.0375616743593825, "grad_norm": 3.832267113304745, "learning_rate": 0.00021815609526055173, "loss": 2.0424, "step": 1630 }, { "epoch": 1.0439280598440235, "grad_norm": 4.130087016761162, "learning_rate": 0.00021744871492572506, "loss": 1.9139, "step": 1640 }, { "epoch": 1.0502944453286647, "grad_norm": 2.7335284776977886, "learning_rate": 0.00021674133459089834, "loss": 2.0488, "step": 1650 }, { "epoch": 1.0566608308133056, "grad_norm": 3.329599622048459, "learning_rate": 0.00021603395425607165, "loss": 1.9137, "step": 1660 }, { "epoch": 1.0630272162979468, "grad_norm": 3.6314777543543313, "learning_rate": 0.00021532657392124498, "loss": 1.9768, "step": 1670 }, { "epoch": 1.069393601782588, "grad_norm": 4.218111606262662, "learning_rate": 0.00021461919358641828, "loss": 2.0141, "step": 1680 }, { "epoch": 1.075759987267229, "grad_norm": 2.895306844768361, "learning_rate": 0.0002139118132515916, "loss": 1.9783, "step": 1690 }, { "epoch": 1.0821263727518702, "grad_norm": 3.753588342038039, "learning_rate": 0.00021320443291676492, "loss": 2.0227, "step": 1700 }, { "epoch": 1.0884927582365111, "grad_norm": 3.250518576003523, "learning_rate": 0.0002124970525819382, "loss": 1.8848, "step": 1710 }, { "epoch": 1.0948591437211523, "grad_norm": 2.980988290019129, "learning_rate": 0.0002117896722471115, "loss": 1.867, "step": 1720 }, { "epoch": 1.1012255292057933, "grad_norm": 3.3311589155605263, "learning_rate": 0.0002110822919122848, "loss": 1.8766, "step": 1730 }, { "epoch": 1.1075919146904345, "grad_norm": 3.2929109952092523, "learning_rate": 0.00021037491157745815, "loss": 2.0685, "step": 1740 }, { "epoch": 1.1139583001750757, "grad_norm": 3.8495434375573243, "learning_rate": 0.00020966753124263143, "loss": 1.8662, "step": 1750 }, { "epoch": 1.1203246856597167, "grad_norm": 3.223669203555661, "learning_rate": 0.00020896015090780473, "loss": 1.9401, "step": 1760 }, { "epoch": 1.1266910711443578, "grad_norm": 3.068725890159831, "learning_rate": 0.00020825277057297806, "loss": 1.9732, "step": 1770 }, { "epoch": 1.1330574566289988, "grad_norm": 3.7652954958158724, "learning_rate": 0.00020754539023815137, "loss": 1.9264, "step": 1780 }, { "epoch": 1.13942384211364, "grad_norm": 3.283263110696873, "learning_rate": 0.00020683800990332468, "loss": 1.8805, "step": 1790 }, { "epoch": 1.145790227598281, "grad_norm": 3.578786877708257, "learning_rate": 0.00020613062956849795, "loss": 1.8764, "step": 1800 }, { "epoch": 1.1521566130829222, "grad_norm": 3.2073175602979402, "learning_rate": 0.0002054232492336713, "loss": 1.9123, "step": 1810 }, { "epoch": 1.1585229985675634, "grad_norm": 3.960679432169151, "learning_rate": 0.0002047158688988446, "loss": 1.8938, "step": 1820 }, { "epoch": 1.1648893840522043, "grad_norm": 3.026836467606075, "learning_rate": 0.0002040084885640179, "loss": 1.9291, "step": 1830 }, { "epoch": 1.1712557695368455, "grad_norm": 3.368777784279614, "learning_rate": 0.00020330110822919123, "loss": 1.9459, "step": 1840 }, { "epoch": 1.1776221550214865, "grad_norm": 2.843063910159576, "learning_rate": 0.0002025937278943645, "loss": 1.9449, "step": 1850 }, { "epoch": 1.1839885405061277, "grad_norm": 2.9510127362455876, "learning_rate": 0.00020188634755953782, "loss": 1.9309, "step": 1860 }, { "epoch": 1.1903549259907686, "grad_norm": 4.545581163313047, "learning_rate": 0.00020117896722471115, "loss": 1.8869, "step": 1870 }, { "epoch": 1.1967213114754098, "grad_norm": 3.301260365676958, "learning_rate": 0.00020047158688988445, "loss": 1.9613, "step": 1880 }, { "epoch": 1.203087696960051, "grad_norm": 3.299718394999854, "learning_rate": 0.00019976420655505776, "loss": 2.0072, "step": 1890 }, { "epoch": 1.209454082444692, "grad_norm": 3.5617690109472444, "learning_rate": 0.00019905682622023104, "loss": 1.9713, "step": 1900 }, { "epoch": 1.2158204679293332, "grad_norm": 3.1359097580279767, "learning_rate": 0.00019834944588540437, "loss": 1.8154, "step": 1910 }, { "epoch": 1.2221868534139742, "grad_norm": 3.0984711664236206, "learning_rate": 0.00019764206555057768, "loss": 1.8568, "step": 1920 }, { "epoch": 1.2285532388986153, "grad_norm": 2.842733095279462, "learning_rate": 0.00019693468521575098, "loss": 1.851, "step": 1930 }, { "epoch": 1.2349196243832563, "grad_norm": 2.935936669675825, "learning_rate": 0.00019622730488092432, "loss": 1.9604, "step": 1940 }, { "epoch": 1.2412860098678975, "grad_norm": 3.3716657028276096, "learning_rate": 0.0001955199245460976, "loss": 1.9171, "step": 1950 }, { "epoch": 1.2476523953525387, "grad_norm": 2.9604032685493133, "learning_rate": 0.0001948125442112709, "loss": 1.8604, "step": 1960 }, { "epoch": 1.2540187808371797, "grad_norm": 2.7269826434015427, "learning_rate": 0.0001941051638764442, "loss": 1.9076, "step": 1970 }, { "epoch": 1.2603851663218208, "grad_norm": 4.046544087868519, "learning_rate": 0.00019339778354161754, "loss": 1.8299, "step": 1980 }, { "epoch": 1.2667515518064618, "grad_norm": 3.12579708206849, "learning_rate": 0.00019269040320679084, "loss": 1.7545, "step": 1990 }, { "epoch": 1.273117937291103, "grad_norm": 2.7391520471543225, "learning_rate": 0.00019198302287196412, "loss": 1.8973, "step": 2000 } ], "logging_steps": 10, "max_steps": 4713, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }