| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.34134821811936455, | |
| "eval_steps": 500, | |
| "global_step": 1590, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0021468441391155, | |
| "grad_norm": 1.7282733917236328, | |
| "learning_rate": 0.0004989265779304422, | |
| "loss": 1.4129, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.004293688278231, | |
| "grad_norm": 2.1508498191833496, | |
| "learning_rate": 0.0004978531558608846, | |
| "loss": 1.2225, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.006440532417346501, | |
| "grad_norm": 1.6386512517929077, | |
| "learning_rate": 0.0004967797337913268, | |
| "loss": 1.1663, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.008587376556462, | |
| "grad_norm": 1.2367421388626099, | |
| "learning_rate": 0.000495706311721769, | |
| "loss": 1.1373, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.010734220695577501, | |
| "grad_norm": 1.2300989627838135, | |
| "learning_rate": 0.0004946328896522112, | |
| "loss": 1.1143, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.012881064834693002, | |
| "grad_norm": 1.1807990074157715, | |
| "learning_rate": 0.0004935594675826536, | |
| "loss": 1.0937, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.015027908973808502, | |
| "grad_norm": 0.9375188946723938, | |
| "learning_rate": 0.0004924860455130958, | |
| "loss": 1.0732, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.017174753112924, | |
| "grad_norm": 0.9801538586616516, | |
| "learning_rate": 0.000491412623443538, | |
| "loss": 1.0369, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.019321597252039503, | |
| "grad_norm": 0.9229792356491089, | |
| "learning_rate": 0.0004903392013739802, | |
| "loss": 1.0093, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.021468441391155002, | |
| "grad_norm": 1.011305570602417, | |
| "learning_rate": 0.0004892657793044225, | |
| "loss": 1.0161, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.0236152855302705, | |
| "grad_norm": 0.9356452822685242, | |
| "learning_rate": 0.00048819235723486477, | |
| "loss": 0.9939, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.025762129669386003, | |
| "grad_norm": 1.0092449188232422, | |
| "learning_rate": 0.00048711893516530704, | |
| "loss": 0.9647, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.027908973808501502, | |
| "grad_norm": 0.9663442373275757, | |
| "learning_rate": 0.0004860455130957492, | |
| "loss": 0.9595, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.030055817947617004, | |
| "grad_norm": 1.1502243280410767, | |
| "learning_rate": 0.0004849720910261915, | |
| "loss": 0.9422, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.0322026620867325, | |
| "grad_norm": 0.970102846622467, | |
| "learning_rate": 0.00048389866895663376, | |
| "loss": 0.945, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.034349506225848, | |
| "grad_norm": 1.2466392517089844, | |
| "learning_rate": 0.00048282524688707604, | |
| "loss": 0.9385, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.0364963503649635, | |
| "grad_norm": 1.0010186433792114, | |
| "learning_rate": 0.00048175182481751826, | |
| "loss": 0.9301, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.038643194504079006, | |
| "grad_norm": 1.2516905069351196, | |
| "learning_rate": 0.0004806784027479605, | |
| "loss": 0.919, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.040790038643194505, | |
| "grad_norm": 0.8497525453567505, | |
| "learning_rate": 0.00047960498067840275, | |
| "loss": 0.9054, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.042936882782310004, | |
| "grad_norm": 1.0371205806732178, | |
| "learning_rate": 0.00047853155860884503, | |
| "loss": 0.9109, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.0450837269214255, | |
| "grad_norm": 1.3313541412353516, | |
| "learning_rate": 0.00047745813653928725, | |
| "loss": 0.9131, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.047230571060541, | |
| "grad_norm": 0.9448315501213074, | |
| "learning_rate": 0.0004763847144697295, | |
| "loss": 0.9014, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.04937741519965651, | |
| "grad_norm": 1.274882435798645, | |
| "learning_rate": 0.00047531129240017175, | |
| "loss": 0.8786, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.051524259338772006, | |
| "grad_norm": 1.3116368055343628, | |
| "learning_rate": 0.000474237870330614, | |
| "loss": 0.9075, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.053671103477887505, | |
| "grad_norm": 0.9970440864562988, | |
| "learning_rate": 0.00047316444826105624, | |
| "loss": 0.8932, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.055817947617003004, | |
| "grad_norm": 1.698472499847412, | |
| "learning_rate": 0.0004720910261914985, | |
| "loss": 0.8838, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.0579647917561185, | |
| "grad_norm": 1.0129982233047485, | |
| "learning_rate": 0.0004710176041219408, | |
| "loss": 0.8779, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.06011163589523401, | |
| "grad_norm": 1.0594947338104248, | |
| "learning_rate": 0.00046994418205238296, | |
| "loss": 0.8631, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.06225848003434951, | |
| "grad_norm": 0.7768178582191467, | |
| "learning_rate": 0.00046887075998282524, | |
| "loss": 0.8666, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.064405324173465, | |
| "grad_norm": 0.9108049869537354, | |
| "learning_rate": 0.0004677973379132675, | |
| "loss": 0.8676, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.06655216831258051, | |
| "grad_norm": 1.4127992391586304, | |
| "learning_rate": 0.0004667239158437098, | |
| "loss": 0.8951, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.068699012451696, | |
| "grad_norm": 1.1507939100265503, | |
| "learning_rate": 0.000465650493774152, | |
| "loss": 0.863, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.07084585659081151, | |
| "grad_norm": 1.1579265594482422, | |
| "learning_rate": 0.00046457707170459423, | |
| "loss": 0.8716, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.072992700729927, | |
| "grad_norm": 0.9873006343841553, | |
| "learning_rate": 0.0004635036496350365, | |
| "loss": 0.8569, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.07513954486904251, | |
| "grad_norm": 1.1990203857421875, | |
| "learning_rate": 0.0004624302275654788, | |
| "loss": 0.8776, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.07728638900815801, | |
| "grad_norm": 1.1173065900802612, | |
| "learning_rate": 0.000461356805495921, | |
| "loss": 0.865, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.0794332331472735, | |
| "grad_norm": 1.2493510246276855, | |
| "learning_rate": 0.0004602833834263633, | |
| "loss": 0.8609, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.08158007728638901, | |
| "grad_norm": 1.1254737377166748, | |
| "learning_rate": 0.0004592099613568055, | |
| "loss": 0.8697, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.0837269214255045, | |
| "grad_norm": 1.1009331941604614, | |
| "learning_rate": 0.0004581365392872477, | |
| "loss": 0.8653, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.08587376556462001, | |
| "grad_norm": 1.3970990180969238, | |
| "learning_rate": 0.00045706311721769, | |
| "loss": 0.8542, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.08802060970373551, | |
| "grad_norm": 1.278136968612671, | |
| "learning_rate": 0.00045598969514813227, | |
| "loss": 0.8485, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.090167453842851, | |
| "grad_norm": 1.3295845985412598, | |
| "learning_rate": 0.00045491627307857454, | |
| "loss": 0.8501, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.09231429798196651, | |
| "grad_norm": 1.310677170753479, | |
| "learning_rate": 0.0004538428510090167, | |
| "loss": 0.849, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.094461142121082, | |
| "grad_norm": 1.0189110040664673, | |
| "learning_rate": 0.000452769428939459, | |
| "loss": 0.8566, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.09660798626019751, | |
| "grad_norm": 1.2950178384780884, | |
| "learning_rate": 0.00045169600686990126, | |
| "loss": 0.8369, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.09875483039931301, | |
| "grad_norm": 0.8336394429206848, | |
| "learning_rate": 0.00045062258480034354, | |
| "loss": 0.853, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.1009016745384285, | |
| "grad_norm": 1.1623280048370361, | |
| "learning_rate": 0.00044954916273078576, | |
| "loss": 0.8437, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.10304851867754401, | |
| "grad_norm": 1.5341142416000366, | |
| "learning_rate": 0.000448475740661228, | |
| "loss": 0.8388, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.1051953628166595, | |
| "grad_norm": 1.154572606086731, | |
| "learning_rate": 0.00044740231859167025, | |
| "loss": 0.8499, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.10734220695577501, | |
| "grad_norm": 1.291874885559082, | |
| "learning_rate": 0.00044632889652211253, | |
| "loss": 0.8508, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.10948905109489052, | |
| "grad_norm": 2.017030954360962, | |
| "learning_rate": 0.00044525547445255475, | |
| "loss": 0.8163, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.11163589523400601, | |
| "grad_norm": 1.2181349992752075, | |
| "learning_rate": 0.000444182052382997, | |
| "loss": 0.8304, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.11378273937312151, | |
| "grad_norm": 1.1240856647491455, | |
| "learning_rate": 0.00044310863031343925, | |
| "loss": 0.8339, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.115929583512237, | |
| "grad_norm": 1.5953660011291504, | |
| "learning_rate": 0.00044203520824388147, | |
| "loss": 0.8416, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.11807642765135251, | |
| "grad_norm": 0.9097370505332947, | |
| "learning_rate": 0.00044096178617432374, | |
| "loss": 0.8362, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.12022327179046802, | |
| "grad_norm": 1.0670212507247925, | |
| "learning_rate": 0.000439888364104766, | |
| "loss": 0.8395, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.12237011592958351, | |
| "grad_norm": 1.1179403066635132, | |
| "learning_rate": 0.0004388149420352083, | |
| "loss": 0.8477, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.12451696006869901, | |
| "grad_norm": 1.218599557876587, | |
| "learning_rate": 0.00043774151996565046, | |
| "loss": 0.8295, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.1266638042078145, | |
| "grad_norm": 0.9557531476020813, | |
| "learning_rate": 0.00043666809789609274, | |
| "loss": 0.8257, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.12881064834693, | |
| "grad_norm": 0.8345034122467041, | |
| "learning_rate": 0.000435594675826535, | |
| "loss": 0.8607, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.13095749248604552, | |
| "grad_norm": 0.9946607947349548, | |
| "learning_rate": 0.0004345212537569773, | |
| "loss": 0.8173, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.13310433662516102, | |
| "grad_norm": 1.3076237440109253, | |
| "learning_rate": 0.0004334478316874195, | |
| "loss": 0.8293, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.1352511807642765, | |
| "grad_norm": 1.6002768278121948, | |
| "learning_rate": 0.00043237440961786173, | |
| "loss": 0.8328, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.137398024903392, | |
| "grad_norm": 1.03147554397583, | |
| "learning_rate": 0.000431300987548304, | |
| "loss": 0.8297, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.1395448690425075, | |
| "grad_norm": 1.42938232421875, | |
| "learning_rate": 0.0004302275654787463, | |
| "loss": 0.8328, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.14169171318162302, | |
| "grad_norm": 1.319884181022644, | |
| "learning_rate": 0.0004291541434091885, | |
| "loss": 0.8496, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.14383855732073852, | |
| "grad_norm": 1.289533019065857, | |
| "learning_rate": 0.0004280807213396308, | |
| "loss": 0.8171, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.145985401459854, | |
| "grad_norm": 1.4401450157165527, | |
| "learning_rate": 0.000427007299270073, | |
| "loss": 0.8259, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.1481322455989695, | |
| "grad_norm": 1.403343677520752, | |
| "learning_rate": 0.0004259338772005152, | |
| "loss": 0.8116, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.15027908973808501, | |
| "grad_norm": 1.0387822389602661, | |
| "learning_rate": 0.0004248604551309575, | |
| "loss": 0.8233, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.15242593387720052, | |
| "grad_norm": 1.0579140186309814, | |
| "learning_rate": 0.00042378703306139977, | |
| "loss": 0.8205, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.15457277801631603, | |
| "grad_norm": 1.7332643270492554, | |
| "learning_rate": 0.00042271361099184204, | |
| "loss": 0.845, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.1567196221554315, | |
| "grad_norm": 1.8401075601577759, | |
| "learning_rate": 0.0004216401889222842, | |
| "loss": 0.8441, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.158866466294547, | |
| "grad_norm": 1.3133872747421265, | |
| "learning_rate": 0.0004205667668527265, | |
| "loss": 0.8427, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.16101331043366252, | |
| "grad_norm": 2.1324663162231445, | |
| "learning_rate": 0.00041949334478316876, | |
| "loss": 0.8298, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.16316015457277802, | |
| "grad_norm": 1.1304748058319092, | |
| "learning_rate": 0.00041841992271361104, | |
| "loss": 0.836, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.16530699871189353, | |
| "grad_norm": 1.1530399322509766, | |
| "learning_rate": 0.00041734650064405326, | |
| "loss": 0.803, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.167453842851009, | |
| "grad_norm": 0.8117969632148743, | |
| "learning_rate": 0.0004162730785744955, | |
| "loss": 0.8177, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.1696006869901245, | |
| "grad_norm": 1.217517375946045, | |
| "learning_rate": 0.00041519965650493775, | |
| "loss": 0.8383, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.17174753112924002, | |
| "grad_norm": 1.2580839395523071, | |
| "learning_rate": 0.00041412623443538, | |
| "loss": 0.8257, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.17389437526835552, | |
| "grad_norm": 1.7408099174499512, | |
| "learning_rate": 0.00041305281236582225, | |
| "loss": 0.8201, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.17604121940747103, | |
| "grad_norm": 1.1754316091537476, | |
| "learning_rate": 0.0004119793902962645, | |
| "loss": 0.8094, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.1781880635465865, | |
| "grad_norm": 1.5301543474197388, | |
| "learning_rate": 0.00041090596822670675, | |
| "loss": 0.8112, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.180334907685702, | |
| "grad_norm": 0.8299456834793091, | |
| "learning_rate": 0.00040983254615714897, | |
| "loss": 0.8518, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.18248175182481752, | |
| "grad_norm": 1.3171818256378174, | |
| "learning_rate": 0.00040875912408759124, | |
| "loss": 0.8292, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.18462859596393302, | |
| "grad_norm": 1.4290481805801392, | |
| "learning_rate": 0.0004076857020180335, | |
| "loss": 0.8147, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.18677544010304853, | |
| "grad_norm": 0.9816901683807373, | |
| "learning_rate": 0.0004066122799484758, | |
| "loss": 0.825, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.188922284242164, | |
| "grad_norm": 0.8896159529685974, | |
| "learning_rate": 0.00040553885787891796, | |
| "loss": 0.8245, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.1910691283812795, | |
| "grad_norm": 1.5641008615493774, | |
| "learning_rate": 0.00040446543580936024, | |
| "loss": 0.8204, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.19321597252039502, | |
| "grad_norm": 1.174325704574585, | |
| "learning_rate": 0.0004033920137398025, | |
| "loss": 0.8046, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.19536281665951052, | |
| "grad_norm": 1.0568900108337402, | |
| "learning_rate": 0.0004023185916702448, | |
| "loss": 0.835, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.19750966079862603, | |
| "grad_norm": 1.4573074579238892, | |
| "learning_rate": 0.000401245169600687, | |
| "loss": 0.8151, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.1996565049377415, | |
| "grad_norm": 1.7658246755599976, | |
| "learning_rate": 0.00040017174753112923, | |
| "loss": 0.8012, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.201803349076857, | |
| "grad_norm": 1.3144532442092896, | |
| "learning_rate": 0.0003990983254615715, | |
| "loss": 0.8155, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.20395019321597252, | |
| "grad_norm": 1.302480697631836, | |
| "learning_rate": 0.0003980249033920137, | |
| "loss": 0.8125, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.20609703735508803, | |
| "grad_norm": 1.6297829151153564, | |
| "learning_rate": 0.000396951481322456, | |
| "loss": 0.8157, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.20824388149420353, | |
| "grad_norm": 1.2462539672851562, | |
| "learning_rate": 0.0003958780592528983, | |
| "loss": 0.8135, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.210390725633319, | |
| "grad_norm": 1.3543071746826172, | |
| "learning_rate": 0.0003948046371833405, | |
| "loss": 0.8154, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.21253756977243451, | |
| "grad_norm": 1.5854978561401367, | |
| "learning_rate": 0.0003937312151137827, | |
| "loss": 0.7982, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.21468441391155002, | |
| "grad_norm": 1.0589042901992798, | |
| "learning_rate": 0.000392657793044225, | |
| "loss": 0.8267, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.21683125805066553, | |
| "grad_norm": 1.226970911026001, | |
| "learning_rate": 0.00039158437097466727, | |
| "loss": 0.8055, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.21897810218978103, | |
| "grad_norm": 1.390030860900879, | |
| "learning_rate": 0.00039051094890510954, | |
| "loss": 0.8272, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.2211249463288965, | |
| "grad_norm": 1.102220892906189, | |
| "learning_rate": 0.0003894375268355517, | |
| "loss": 0.8246, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.22327179046801202, | |
| "grad_norm": 1.094040870666504, | |
| "learning_rate": 0.000388364104765994, | |
| "loss": 0.814, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.22541863460712752, | |
| "grad_norm": 1.4209458827972412, | |
| "learning_rate": 0.00038729068269643626, | |
| "loss": 0.7972, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.22756547874624303, | |
| "grad_norm": 1.3925952911376953, | |
| "learning_rate": 0.0003862172606268785, | |
| "loss": 0.809, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.22971232288535853, | |
| "grad_norm": 1.0035127401351929, | |
| "learning_rate": 0.00038514383855732076, | |
| "loss": 0.8012, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.231859167024474, | |
| "grad_norm": 1.0175857543945312, | |
| "learning_rate": 0.000384070416487763, | |
| "loss": 0.7916, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.23400601116358952, | |
| "grad_norm": 1.3213493824005127, | |
| "learning_rate": 0.00038299699441820525, | |
| "loss": 0.8084, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.23615285530270502, | |
| "grad_norm": 1.4422920942306519, | |
| "learning_rate": 0.0003819235723486475, | |
| "loss": 0.8135, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.23829969944182053, | |
| "grad_norm": 1.228966474533081, | |
| "learning_rate": 0.00038085015027908975, | |
| "loss": 0.8221, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.24044654358093603, | |
| "grad_norm": 1.5089335441589355, | |
| "learning_rate": 0.000379776728209532, | |
| "loss": 0.8183, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.2425933877200515, | |
| "grad_norm": 1.2208846807479858, | |
| "learning_rate": 0.00037870330613997425, | |
| "loss": 0.7888, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.24474023185916702, | |
| "grad_norm": 1.057085633277893, | |
| "learning_rate": 0.00037762988407041647, | |
| "loss": 0.8064, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.24688707599828252, | |
| "grad_norm": 1.746360421180725, | |
| "learning_rate": 0.00037655646200085874, | |
| "loss": 0.8209, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.24903392013739803, | |
| "grad_norm": 1.4103171825408936, | |
| "learning_rate": 0.000375483039931301, | |
| "loss": 0.8161, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.25118076427651354, | |
| "grad_norm": 1.0949628353118896, | |
| "learning_rate": 0.0003744096178617433, | |
| "loss": 0.7999, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.253327608415629, | |
| "grad_norm": 1.1674295663833618, | |
| "learning_rate": 0.00037333619579218546, | |
| "loss": 0.7999, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.25547445255474455, | |
| "grad_norm": 1.729760766029358, | |
| "learning_rate": 0.00037226277372262774, | |
| "loss": 0.8091, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.25762129669386, | |
| "grad_norm": 1.3376595973968506, | |
| "learning_rate": 0.00037118935165307, | |
| "loss": 0.7909, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.2597681408329755, | |
| "grad_norm": 2.1753225326538086, | |
| "learning_rate": 0.00037011592958351223, | |
| "loss": 0.7844, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.26191498497209104, | |
| "grad_norm": 1.7476351261138916, | |
| "learning_rate": 0.0003690425075139545, | |
| "loss": 0.7972, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.2640618291112065, | |
| "grad_norm": 1.241102933883667, | |
| "learning_rate": 0.00036796908544439673, | |
| "loss": 0.8046, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.26620867325032205, | |
| "grad_norm": 1.7534103393554688, | |
| "learning_rate": 0.000366895663374839, | |
| "loss": 0.7938, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.2683555173894375, | |
| "grad_norm": 1.2782504558563232, | |
| "learning_rate": 0.0003658222413052812, | |
| "loss": 0.7891, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.270502361528553, | |
| "grad_norm": 1.1518951654434204, | |
| "learning_rate": 0.0003647488192357235, | |
| "loss": 0.7995, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.27264920566766854, | |
| "grad_norm": 1.1520744562149048, | |
| "learning_rate": 0.0003636753971661658, | |
| "loss": 0.7934, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.274796049806784, | |
| "grad_norm": 1.4017630815505981, | |
| "learning_rate": 0.000362601975096608, | |
| "loss": 0.8272, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.27694289394589955, | |
| "grad_norm": 1.7796710729599, | |
| "learning_rate": 0.0003615285530270502, | |
| "loss": 0.7782, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.279089738085015, | |
| "grad_norm": 1.5225216150283813, | |
| "learning_rate": 0.0003604551309574925, | |
| "loss": 0.7978, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.2812365822241305, | |
| "grad_norm": 1.0838427543640137, | |
| "learning_rate": 0.00035938170888793477, | |
| "loss": 0.8185, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.28338342636324604, | |
| "grad_norm": 1.5116959810256958, | |
| "learning_rate": 0.000358308286818377, | |
| "loss": 0.7929, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.2855302705023615, | |
| "grad_norm": 1.2074556350708008, | |
| "learning_rate": 0.0003572348647488192, | |
| "loss": 0.804, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.28767711464147705, | |
| "grad_norm": 1.004355788230896, | |
| "learning_rate": 0.0003561614426792615, | |
| "loss": 0.813, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.2898239587805925, | |
| "grad_norm": 1.4230481386184692, | |
| "learning_rate": 0.00035508802060970376, | |
| "loss": 0.7831, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.291970802919708, | |
| "grad_norm": 1.1971302032470703, | |
| "learning_rate": 0.000354014598540146, | |
| "loss": 0.7673, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.29411764705882354, | |
| "grad_norm": 1.3551030158996582, | |
| "learning_rate": 0.00035294117647058826, | |
| "loss": 0.7757, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.296264491197939, | |
| "grad_norm": 1.0632190704345703, | |
| "learning_rate": 0.0003518677544010305, | |
| "loss": 0.7824, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.29841133533705455, | |
| "grad_norm": 1.5460542440414429, | |
| "learning_rate": 0.00035079433233147275, | |
| "loss": 0.7871, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.30055817947617003, | |
| "grad_norm": 1.8900117874145508, | |
| "learning_rate": 0.000349720910261915, | |
| "loss": 0.7967, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.3027050236152855, | |
| "grad_norm": 1.5339765548706055, | |
| "learning_rate": 0.00034864748819235725, | |
| "loss": 0.7759, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.30485186775440104, | |
| "grad_norm": 1.721113681793213, | |
| "learning_rate": 0.0003475740661227995, | |
| "loss": 0.7792, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.3069987118935165, | |
| "grad_norm": 1.0442615747451782, | |
| "learning_rate": 0.0003465006440532417, | |
| "loss": 0.7734, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.30914555603263205, | |
| "grad_norm": 1.4723149538040161, | |
| "learning_rate": 0.00034542722198368397, | |
| "loss": 0.7839, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.31129240017174753, | |
| "grad_norm": 1.4786028861999512, | |
| "learning_rate": 0.00034435379991412624, | |
| "loss": 0.7995, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.313439244310863, | |
| "grad_norm": 1.392654538154602, | |
| "learning_rate": 0.0003432803778445685, | |
| "loss": 0.8046, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.31558608844997854, | |
| "grad_norm": 1.730966567993164, | |
| "learning_rate": 0.00034220695577501074, | |
| "loss": 0.7909, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.317732932589094, | |
| "grad_norm": 1.365211844444275, | |
| "learning_rate": 0.00034113353370545296, | |
| "loss": 0.7881, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.31987977672820955, | |
| "grad_norm": 1.2406139373779297, | |
| "learning_rate": 0.00034006011163589524, | |
| "loss": 0.8095, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.32202662086732503, | |
| "grad_norm": 2.0166332721710205, | |
| "learning_rate": 0.0003389866895663375, | |
| "loss": 0.7694, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.3241734650064405, | |
| "grad_norm": 1.9494292736053467, | |
| "learning_rate": 0.00033791326749677973, | |
| "loss": 0.8033, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.32632030914555604, | |
| "grad_norm": 1.6693007946014404, | |
| "learning_rate": 0.000336839845427222, | |
| "loss": 0.8158, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.3284671532846715, | |
| "grad_norm": 1.595958948135376, | |
| "learning_rate": 0.00033576642335766423, | |
| "loss": 0.7974, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.33061399742378705, | |
| "grad_norm": 1.8875946998596191, | |
| "learning_rate": 0.0003346930012881065, | |
| "loss": 0.7835, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.33276084156290253, | |
| "grad_norm": 1.5482693910598755, | |
| "learning_rate": 0.0003336195792185487, | |
| "loss": 0.7866, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.334907685702018, | |
| "grad_norm": 1.1274839639663696, | |
| "learning_rate": 0.000332546157148991, | |
| "loss": 0.7964, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.33705452984113354, | |
| "grad_norm": 1.5397554636001587, | |
| "learning_rate": 0.0003314727350794333, | |
| "loss": 0.7802, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.339201373980249, | |
| "grad_norm": 1.2875391244888306, | |
| "learning_rate": 0.00033039931300987544, | |
| "loss": 0.7764, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.34134821811936455, | |
| "grad_norm": 1.0845388174057007, | |
| "learning_rate": 0.0003293258909403177, | |
| "loss": 0.7839, | |
| "step": 1590 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 4658, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 30, | |
| "total_flos": 1.2016090886217754e+17, | |
| "train_batch_size": 3, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |