{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.34134821811936455, "eval_steps": 500, "global_step": 1590, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0021468441391155, "grad_norm": 1.7282733917236328, "learning_rate": 0.0004989265779304422, "loss": 1.4129, "step": 10 }, { "epoch": 0.004293688278231, "grad_norm": 2.1508498191833496, "learning_rate": 0.0004978531558608846, "loss": 1.2225, "step": 20 }, { "epoch": 0.006440532417346501, "grad_norm": 1.6386512517929077, "learning_rate": 0.0004967797337913268, "loss": 1.1663, "step": 30 }, { "epoch": 0.008587376556462, "grad_norm": 1.2367421388626099, "learning_rate": 0.000495706311721769, "loss": 1.1373, "step": 40 }, { "epoch": 0.010734220695577501, "grad_norm": 1.2300989627838135, "learning_rate": 0.0004946328896522112, "loss": 1.1143, "step": 50 }, { "epoch": 0.012881064834693002, "grad_norm": 1.1807990074157715, "learning_rate": 0.0004935594675826536, "loss": 1.0937, "step": 60 }, { "epoch": 0.015027908973808502, "grad_norm": 0.9375188946723938, "learning_rate": 0.0004924860455130958, "loss": 1.0732, "step": 70 }, { "epoch": 0.017174753112924, "grad_norm": 0.9801538586616516, "learning_rate": 0.000491412623443538, "loss": 1.0369, "step": 80 }, { "epoch": 0.019321597252039503, "grad_norm": 0.9229792356491089, "learning_rate": 0.0004903392013739802, "loss": 1.0093, "step": 90 }, { "epoch": 0.021468441391155002, "grad_norm": 1.011305570602417, "learning_rate": 0.0004892657793044225, "loss": 1.0161, "step": 100 }, { "epoch": 0.0236152855302705, "grad_norm": 0.9356452822685242, "learning_rate": 0.00048819235723486477, "loss": 0.9939, "step": 110 }, { "epoch": 0.025762129669386003, "grad_norm": 1.0092449188232422, "learning_rate": 0.00048711893516530704, "loss": 0.9647, "step": 120 }, { "epoch": 0.027908973808501502, "grad_norm": 0.9663442373275757, "learning_rate": 0.0004860455130957492, "loss": 0.9595, "step": 130 }, { "epoch": 0.030055817947617004, "grad_norm": 1.1502243280410767, "learning_rate": 0.0004849720910261915, "loss": 0.9422, "step": 140 }, { "epoch": 0.0322026620867325, "grad_norm": 0.970102846622467, "learning_rate": 0.00048389866895663376, "loss": 0.945, "step": 150 }, { "epoch": 0.034349506225848, "grad_norm": 1.2466392517089844, "learning_rate": 0.00048282524688707604, "loss": 0.9385, "step": 160 }, { "epoch": 0.0364963503649635, "grad_norm": 1.0010186433792114, "learning_rate": 0.00048175182481751826, "loss": 0.9301, "step": 170 }, { "epoch": 0.038643194504079006, "grad_norm": 1.2516905069351196, "learning_rate": 0.0004806784027479605, "loss": 0.919, "step": 180 }, { "epoch": 0.040790038643194505, "grad_norm": 0.8497525453567505, "learning_rate": 0.00047960498067840275, "loss": 0.9054, "step": 190 }, { "epoch": 0.042936882782310004, "grad_norm": 1.0371205806732178, "learning_rate": 0.00047853155860884503, "loss": 0.9109, "step": 200 }, { "epoch": 0.0450837269214255, "grad_norm": 1.3313541412353516, "learning_rate": 0.00047745813653928725, "loss": 0.9131, "step": 210 }, { "epoch": 0.047230571060541, "grad_norm": 0.9448315501213074, "learning_rate": 0.0004763847144697295, "loss": 0.9014, "step": 220 }, { "epoch": 0.04937741519965651, "grad_norm": 1.274882435798645, "learning_rate": 0.00047531129240017175, "loss": 0.8786, "step": 230 }, { "epoch": 0.051524259338772006, "grad_norm": 1.3116368055343628, "learning_rate": 0.000474237870330614, "loss": 0.9075, "step": 240 }, { "epoch": 0.053671103477887505, "grad_norm": 0.9970440864562988, "learning_rate": 0.00047316444826105624, "loss": 0.8932, "step": 250 }, { "epoch": 0.055817947617003004, "grad_norm": 1.698472499847412, "learning_rate": 0.0004720910261914985, "loss": 0.8838, "step": 260 }, { "epoch": 0.0579647917561185, "grad_norm": 1.0129982233047485, "learning_rate": 0.0004710176041219408, "loss": 0.8779, "step": 270 }, { "epoch": 0.06011163589523401, "grad_norm": 1.0594947338104248, "learning_rate": 0.00046994418205238296, "loss": 0.8631, "step": 280 }, { "epoch": 0.06225848003434951, "grad_norm": 0.7768178582191467, "learning_rate": 0.00046887075998282524, "loss": 0.8666, "step": 290 }, { "epoch": 0.064405324173465, "grad_norm": 0.9108049869537354, "learning_rate": 0.0004677973379132675, "loss": 0.8676, "step": 300 }, { "epoch": 0.06655216831258051, "grad_norm": 1.4127992391586304, "learning_rate": 0.0004667239158437098, "loss": 0.8951, "step": 310 }, { "epoch": 0.068699012451696, "grad_norm": 1.1507939100265503, "learning_rate": 0.000465650493774152, "loss": 0.863, "step": 320 }, { "epoch": 0.07084585659081151, "grad_norm": 1.1579265594482422, "learning_rate": 0.00046457707170459423, "loss": 0.8716, "step": 330 }, { "epoch": 0.072992700729927, "grad_norm": 0.9873006343841553, "learning_rate": 0.0004635036496350365, "loss": 0.8569, "step": 340 }, { "epoch": 0.07513954486904251, "grad_norm": 1.1990203857421875, "learning_rate": 0.0004624302275654788, "loss": 0.8776, "step": 350 }, { "epoch": 0.07728638900815801, "grad_norm": 1.1173065900802612, "learning_rate": 0.000461356805495921, "loss": 0.865, "step": 360 }, { "epoch": 0.0794332331472735, "grad_norm": 1.2493510246276855, "learning_rate": 0.0004602833834263633, "loss": 0.8609, "step": 370 }, { "epoch": 0.08158007728638901, "grad_norm": 1.1254737377166748, "learning_rate": 0.0004592099613568055, "loss": 0.8697, "step": 380 }, { "epoch": 0.0837269214255045, "grad_norm": 1.1009331941604614, "learning_rate": 0.0004581365392872477, "loss": 0.8653, "step": 390 }, { "epoch": 0.08587376556462001, "grad_norm": 1.3970990180969238, "learning_rate": 0.00045706311721769, "loss": 0.8542, "step": 400 }, { "epoch": 0.08802060970373551, "grad_norm": 1.278136968612671, "learning_rate": 0.00045598969514813227, "loss": 0.8485, "step": 410 }, { "epoch": 0.090167453842851, "grad_norm": 1.3295845985412598, "learning_rate": 0.00045491627307857454, "loss": 0.8501, "step": 420 }, { "epoch": 0.09231429798196651, "grad_norm": 1.310677170753479, "learning_rate": 0.0004538428510090167, "loss": 0.849, "step": 430 }, { "epoch": 0.094461142121082, "grad_norm": 1.0189110040664673, "learning_rate": 0.000452769428939459, "loss": 0.8566, "step": 440 }, { "epoch": 0.09660798626019751, "grad_norm": 1.2950178384780884, "learning_rate": 0.00045169600686990126, "loss": 0.8369, "step": 450 }, { "epoch": 0.09875483039931301, "grad_norm": 0.8336394429206848, "learning_rate": 0.00045062258480034354, "loss": 0.853, "step": 460 }, { "epoch": 0.1009016745384285, "grad_norm": 1.1623280048370361, "learning_rate": 0.00044954916273078576, "loss": 0.8437, "step": 470 }, { "epoch": 0.10304851867754401, "grad_norm": 1.5341142416000366, "learning_rate": 0.000448475740661228, "loss": 0.8388, "step": 480 }, { "epoch": 0.1051953628166595, "grad_norm": 1.154572606086731, "learning_rate": 0.00044740231859167025, "loss": 0.8499, "step": 490 }, { "epoch": 0.10734220695577501, "grad_norm": 1.291874885559082, "learning_rate": 0.00044632889652211253, "loss": 0.8508, "step": 500 }, { "epoch": 0.10948905109489052, "grad_norm": 2.017030954360962, "learning_rate": 0.00044525547445255475, "loss": 0.8163, "step": 510 }, { "epoch": 0.11163589523400601, "grad_norm": 1.2181349992752075, "learning_rate": 0.000444182052382997, "loss": 0.8304, "step": 520 }, { "epoch": 0.11378273937312151, "grad_norm": 1.1240856647491455, "learning_rate": 0.00044310863031343925, "loss": 0.8339, "step": 530 }, { "epoch": 0.115929583512237, "grad_norm": 1.5953660011291504, "learning_rate": 0.00044203520824388147, "loss": 0.8416, "step": 540 }, { "epoch": 0.11807642765135251, "grad_norm": 0.9097370505332947, "learning_rate": 0.00044096178617432374, "loss": 0.8362, "step": 550 }, { "epoch": 0.12022327179046802, "grad_norm": 1.0670212507247925, "learning_rate": 0.000439888364104766, "loss": 0.8395, "step": 560 }, { "epoch": 0.12237011592958351, "grad_norm": 1.1179403066635132, "learning_rate": 0.0004388149420352083, "loss": 0.8477, "step": 570 }, { "epoch": 0.12451696006869901, "grad_norm": 1.218599557876587, "learning_rate": 0.00043774151996565046, "loss": 0.8295, "step": 580 }, { "epoch": 0.1266638042078145, "grad_norm": 0.9557531476020813, "learning_rate": 0.00043666809789609274, "loss": 0.8257, "step": 590 }, { "epoch": 0.12881064834693, "grad_norm": 0.8345034122467041, "learning_rate": 0.000435594675826535, "loss": 0.8607, "step": 600 }, { "epoch": 0.13095749248604552, "grad_norm": 0.9946607947349548, "learning_rate": 0.0004345212537569773, "loss": 0.8173, "step": 610 }, { "epoch": 0.13310433662516102, "grad_norm": 1.3076237440109253, "learning_rate": 0.0004334478316874195, "loss": 0.8293, "step": 620 }, { "epoch": 0.1352511807642765, "grad_norm": 1.6002768278121948, "learning_rate": 0.00043237440961786173, "loss": 0.8328, "step": 630 }, { "epoch": 0.137398024903392, "grad_norm": 1.03147554397583, "learning_rate": 0.000431300987548304, "loss": 0.8297, "step": 640 }, { "epoch": 0.1395448690425075, "grad_norm": 1.42938232421875, "learning_rate": 0.0004302275654787463, "loss": 0.8328, "step": 650 }, { "epoch": 0.14169171318162302, "grad_norm": 1.319884181022644, "learning_rate": 0.0004291541434091885, "loss": 0.8496, "step": 660 }, { "epoch": 0.14383855732073852, "grad_norm": 1.289533019065857, "learning_rate": 0.0004280807213396308, "loss": 0.8171, "step": 670 }, { "epoch": 0.145985401459854, "grad_norm": 1.4401450157165527, "learning_rate": 0.000427007299270073, "loss": 0.8259, "step": 680 }, { "epoch": 0.1481322455989695, "grad_norm": 1.403343677520752, "learning_rate": 0.0004259338772005152, "loss": 0.8116, "step": 690 }, { "epoch": 0.15027908973808501, "grad_norm": 1.0387822389602661, "learning_rate": 0.0004248604551309575, "loss": 0.8233, "step": 700 }, { "epoch": 0.15242593387720052, "grad_norm": 1.0579140186309814, "learning_rate": 0.00042378703306139977, "loss": 0.8205, "step": 710 }, { "epoch": 0.15457277801631603, "grad_norm": 1.7332643270492554, "learning_rate": 0.00042271361099184204, "loss": 0.845, "step": 720 }, { "epoch": 0.1567196221554315, "grad_norm": 1.8401075601577759, "learning_rate": 0.0004216401889222842, "loss": 0.8441, "step": 730 }, { "epoch": 0.158866466294547, "grad_norm": 1.3133872747421265, "learning_rate": 0.0004205667668527265, "loss": 0.8427, "step": 740 }, { "epoch": 0.16101331043366252, "grad_norm": 2.1324663162231445, "learning_rate": 0.00041949334478316876, "loss": 0.8298, "step": 750 }, { "epoch": 0.16316015457277802, "grad_norm": 1.1304748058319092, "learning_rate": 0.00041841992271361104, "loss": 0.836, "step": 760 }, { "epoch": 0.16530699871189353, "grad_norm": 1.1530399322509766, "learning_rate": 0.00041734650064405326, "loss": 0.803, "step": 770 }, { "epoch": 0.167453842851009, "grad_norm": 0.8117969632148743, "learning_rate": 0.0004162730785744955, "loss": 0.8177, "step": 780 }, { "epoch": 0.1696006869901245, "grad_norm": 1.217517375946045, "learning_rate": 0.00041519965650493775, "loss": 0.8383, "step": 790 }, { "epoch": 0.17174753112924002, "grad_norm": 1.2580839395523071, "learning_rate": 0.00041412623443538, "loss": 0.8257, "step": 800 }, { "epoch": 0.17389437526835552, "grad_norm": 1.7408099174499512, "learning_rate": 0.00041305281236582225, "loss": 0.8201, "step": 810 }, { "epoch": 0.17604121940747103, "grad_norm": 1.1754316091537476, "learning_rate": 0.0004119793902962645, "loss": 0.8094, "step": 820 }, { "epoch": 0.1781880635465865, "grad_norm": 1.5301543474197388, "learning_rate": 0.00041090596822670675, "loss": 0.8112, "step": 830 }, { "epoch": 0.180334907685702, "grad_norm": 0.8299456834793091, "learning_rate": 0.00040983254615714897, "loss": 0.8518, "step": 840 }, { "epoch": 0.18248175182481752, "grad_norm": 1.3171818256378174, "learning_rate": 0.00040875912408759124, "loss": 0.8292, "step": 850 }, { "epoch": 0.18462859596393302, "grad_norm": 1.4290481805801392, "learning_rate": 0.0004076857020180335, "loss": 0.8147, "step": 860 }, { "epoch": 0.18677544010304853, "grad_norm": 0.9816901683807373, "learning_rate": 0.0004066122799484758, "loss": 0.825, "step": 870 }, { "epoch": 0.188922284242164, "grad_norm": 0.8896159529685974, "learning_rate": 0.00040553885787891796, "loss": 0.8245, "step": 880 }, { "epoch": 0.1910691283812795, "grad_norm": 1.5641008615493774, "learning_rate": 0.00040446543580936024, "loss": 0.8204, "step": 890 }, { "epoch": 0.19321597252039502, "grad_norm": 1.174325704574585, "learning_rate": 0.0004033920137398025, "loss": 0.8046, "step": 900 }, { "epoch": 0.19536281665951052, "grad_norm": 1.0568900108337402, "learning_rate": 0.0004023185916702448, "loss": 0.835, "step": 910 }, { "epoch": 0.19750966079862603, "grad_norm": 1.4573074579238892, "learning_rate": 0.000401245169600687, "loss": 0.8151, "step": 920 }, { "epoch": 0.1996565049377415, "grad_norm": 1.7658246755599976, "learning_rate": 0.00040017174753112923, "loss": 0.8012, "step": 930 }, { "epoch": 0.201803349076857, "grad_norm": 1.3144532442092896, "learning_rate": 0.0003990983254615715, "loss": 0.8155, "step": 940 }, { "epoch": 0.20395019321597252, "grad_norm": 1.302480697631836, "learning_rate": 0.0003980249033920137, "loss": 0.8125, "step": 950 }, { "epoch": 0.20609703735508803, "grad_norm": 1.6297829151153564, "learning_rate": 0.000396951481322456, "loss": 0.8157, "step": 960 }, { "epoch": 0.20824388149420353, "grad_norm": 1.2462539672851562, "learning_rate": 0.0003958780592528983, "loss": 0.8135, "step": 970 }, { "epoch": 0.210390725633319, "grad_norm": 1.3543071746826172, "learning_rate": 0.0003948046371833405, "loss": 0.8154, "step": 980 }, { "epoch": 0.21253756977243451, "grad_norm": 1.5854978561401367, "learning_rate": 0.0003937312151137827, "loss": 0.7982, "step": 990 }, { "epoch": 0.21468441391155002, "grad_norm": 1.0589042901992798, "learning_rate": 0.000392657793044225, "loss": 0.8267, "step": 1000 }, { "epoch": 0.21683125805066553, "grad_norm": 1.226970911026001, "learning_rate": 0.00039158437097466727, "loss": 0.8055, "step": 1010 }, { "epoch": 0.21897810218978103, "grad_norm": 1.390030860900879, "learning_rate": 0.00039051094890510954, "loss": 0.8272, "step": 1020 }, { "epoch": 0.2211249463288965, "grad_norm": 1.102220892906189, "learning_rate": 0.0003894375268355517, "loss": 0.8246, "step": 1030 }, { "epoch": 0.22327179046801202, "grad_norm": 1.094040870666504, "learning_rate": 0.000388364104765994, "loss": 0.814, "step": 1040 }, { "epoch": 0.22541863460712752, "grad_norm": 1.4209458827972412, "learning_rate": 0.00038729068269643626, "loss": 0.7972, "step": 1050 }, { "epoch": 0.22756547874624303, "grad_norm": 1.3925952911376953, "learning_rate": 0.0003862172606268785, "loss": 0.809, "step": 1060 }, { "epoch": 0.22971232288535853, "grad_norm": 1.0035127401351929, "learning_rate": 0.00038514383855732076, "loss": 0.8012, "step": 1070 }, { "epoch": 0.231859167024474, "grad_norm": 1.0175857543945312, "learning_rate": 0.000384070416487763, "loss": 0.7916, "step": 1080 }, { "epoch": 0.23400601116358952, "grad_norm": 1.3213493824005127, "learning_rate": 0.00038299699441820525, "loss": 0.8084, "step": 1090 }, { "epoch": 0.23615285530270502, "grad_norm": 1.4422920942306519, "learning_rate": 0.0003819235723486475, "loss": 0.8135, "step": 1100 }, { "epoch": 0.23829969944182053, "grad_norm": 1.228966474533081, "learning_rate": 0.00038085015027908975, "loss": 0.8221, "step": 1110 }, { "epoch": 0.24044654358093603, "grad_norm": 1.5089335441589355, "learning_rate": 0.000379776728209532, "loss": 0.8183, "step": 1120 }, { "epoch": 0.2425933877200515, "grad_norm": 1.2208846807479858, "learning_rate": 0.00037870330613997425, "loss": 0.7888, "step": 1130 }, { "epoch": 0.24474023185916702, "grad_norm": 1.057085633277893, "learning_rate": 0.00037762988407041647, "loss": 0.8064, "step": 1140 }, { "epoch": 0.24688707599828252, "grad_norm": 1.746360421180725, "learning_rate": 0.00037655646200085874, "loss": 0.8209, "step": 1150 }, { "epoch": 0.24903392013739803, "grad_norm": 1.4103171825408936, "learning_rate": 0.000375483039931301, "loss": 0.8161, "step": 1160 }, { "epoch": 0.25118076427651354, "grad_norm": 1.0949628353118896, "learning_rate": 0.0003744096178617433, "loss": 0.7999, "step": 1170 }, { "epoch": 0.253327608415629, "grad_norm": 1.1674295663833618, "learning_rate": 0.00037333619579218546, "loss": 0.7999, "step": 1180 }, { "epoch": 0.25547445255474455, "grad_norm": 1.729760766029358, "learning_rate": 0.00037226277372262774, "loss": 0.8091, "step": 1190 }, { "epoch": 0.25762129669386, "grad_norm": 1.3376595973968506, "learning_rate": 0.00037118935165307, "loss": 0.7909, "step": 1200 }, { "epoch": 0.2597681408329755, "grad_norm": 2.1753225326538086, "learning_rate": 0.00037011592958351223, "loss": 0.7844, "step": 1210 }, { "epoch": 0.26191498497209104, "grad_norm": 1.7476351261138916, "learning_rate": 0.0003690425075139545, "loss": 0.7972, "step": 1220 }, { "epoch": 0.2640618291112065, "grad_norm": 1.241102933883667, "learning_rate": 0.00036796908544439673, "loss": 0.8046, "step": 1230 }, { "epoch": 0.26620867325032205, "grad_norm": 1.7534103393554688, "learning_rate": 0.000366895663374839, "loss": 0.7938, "step": 1240 }, { "epoch": 0.2683555173894375, "grad_norm": 1.2782504558563232, "learning_rate": 0.0003658222413052812, "loss": 0.7891, "step": 1250 }, { "epoch": 0.270502361528553, "grad_norm": 1.1518951654434204, "learning_rate": 0.0003647488192357235, "loss": 0.7995, "step": 1260 }, { "epoch": 0.27264920566766854, "grad_norm": 1.1520744562149048, "learning_rate": 0.0003636753971661658, "loss": 0.7934, "step": 1270 }, { "epoch": 0.274796049806784, "grad_norm": 1.4017630815505981, "learning_rate": 0.000362601975096608, "loss": 0.8272, "step": 1280 }, { "epoch": 0.27694289394589955, "grad_norm": 1.7796710729599, "learning_rate": 0.0003615285530270502, "loss": 0.7782, "step": 1290 }, { "epoch": 0.279089738085015, "grad_norm": 1.5225216150283813, "learning_rate": 0.0003604551309574925, "loss": 0.7978, "step": 1300 }, { "epoch": 0.2812365822241305, "grad_norm": 1.0838427543640137, "learning_rate": 0.00035938170888793477, "loss": 0.8185, "step": 1310 }, { "epoch": 0.28338342636324604, "grad_norm": 1.5116959810256958, "learning_rate": 0.000358308286818377, "loss": 0.7929, "step": 1320 }, { "epoch": 0.2855302705023615, "grad_norm": 1.2074556350708008, "learning_rate": 0.0003572348647488192, "loss": 0.804, "step": 1330 }, { "epoch": 0.28767711464147705, "grad_norm": 1.004355788230896, "learning_rate": 0.0003561614426792615, "loss": 0.813, "step": 1340 }, { "epoch": 0.2898239587805925, "grad_norm": 1.4230481386184692, "learning_rate": 0.00035508802060970376, "loss": 0.7831, "step": 1350 }, { "epoch": 0.291970802919708, "grad_norm": 1.1971302032470703, "learning_rate": 0.000354014598540146, "loss": 0.7673, "step": 1360 }, { "epoch": 0.29411764705882354, "grad_norm": 1.3551030158996582, "learning_rate": 0.00035294117647058826, "loss": 0.7757, "step": 1370 }, { "epoch": 0.296264491197939, "grad_norm": 1.0632190704345703, "learning_rate": 0.0003518677544010305, "loss": 0.7824, "step": 1380 }, { "epoch": 0.29841133533705455, "grad_norm": 1.5460542440414429, "learning_rate": 0.00035079433233147275, "loss": 0.7871, "step": 1390 }, { "epoch": 0.30055817947617003, "grad_norm": 1.8900117874145508, "learning_rate": 0.000349720910261915, "loss": 0.7967, "step": 1400 }, { "epoch": 0.3027050236152855, "grad_norm": 1.5339765548706055, "learning_rate": 0.00034864748819235725, "loss": 0.7759, "step": 1410 }, { "epoch": 0.30485186775440104, "grad_norm": 1.721113681793213, "learning_rate": 0.0003475740661227995, "loss": 0.7792, "step": 1420 }, { "epoch": 0.3069987118935165, "grad_norm": 1.0442615747451782, "learning_rate": 0.0003465006440532417, "loss": 0.7734, "step": 1430 }, { "epoch": 0.30914555603263205, "grad_norm": 1.4723149538040161, "learning_rate": 0.00034542722198368397, "loss": 0.7839, "step": 1440 }, { "epoch": 0.31129240017174753, "grad_norm": 1.4786028861999512, "learning_rate": 0.00034435379991412624, "loss": 0.7995, "step": 1450 }, { "epoch": 0.313439244310863, "grad_norm": 1.392654538154602, "learning_rate": 0.0003432803778445685, "loss": 0.8046, "step": 1460 }, { "epoch": 0.31558608844997854, "grad_norm": 1.730966567993164, "learning_rate": 0.00034220695577501074, "loss": 0.7909, "step": 1470 }, { "epoch": 0.317732932589094, "grad_norm": 1.365211844444275, "learning_rate": 0.00034113353370545296, "loss": 0.7881, "step": 1480 }, { "epoch": 0.31987977672820955, "grad_norm": 1.2406139373779297, "learning_rate": 0.00034006011163589524, "loss": 0.8095, "step": 1490 }, { "epoch": 0.32202662086732503, "grad_norm": 2.0166332721710205, "learning_rate": 0.0003389866895663375, "loss": 0.7694, "step": 1500 }, { "epoch": 0.3241734650064405, "grad_norm": 1.9494292736053467, "learning_rate": 0.00033791326749677973, "loss": 0.8033, "step": 1510 }, { "epoch": 0.32632030914555604, "grad_norm": 1.6693007946014404, "learning_rate": 0.000336839845427222, "loss": 0.8158, "step": 1520 }, { "epoch": 0.3284671532846715, "grad_norm": 1.595958948135376, "learning_rate": 0.00033576642335766423, "loss": 0.7974, "step": 1530 }, { "epoch": 0.33061399742378705, "grad_norm": 1.8875946998596191, "learning_rate": 0.0003346930012881065, "loss": 0.7835, "step": 1540 }, { "epoch": 0.33276084156290253, "grad_norm": 1.5482693910598755, "learning_rate": 0.0003336195792185487, "loss": 0.7866, "step": 1550 }, { "epoch": 0.334907685702018, "grad_norm": 1.1274839639663696, "learning_rate": 0.000332546157148991, "loss": 0.7964, "step": 1560 }, { "epoch": 0.33705452984113354, "grad_norm": 1.5397554636001587, "learning_rate": 0.0003314727350794333, "loss": 0.7802, "step": 1570 }, { "epoch": 0.339201373980249, "grad_norm": 1.2875391244888306, "learning_rate": 0.00033039931300987544, "loss": 0.7764, "step": 1580 }, { "epoch": 0.34134821811936455, "grad_norm": 1.0845388174057007, "learning_rate": 0.0003293258909403177, "loss": 0.7839, "step": 1590 } ], "logging_steps": 10, "max_steps": 4658, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 30, "total_flos": 1.2016090886217754e+17, "train_batch_size": 3, "trial_name": null, "trial_params": null }