{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9891774989096567, "eval_steps": 500, "global_step": 110000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008992522717360514, "grad_norm": 0.5267877578735352, "learning_rate": 0.0004999002617394902, "loss": 4.4189, "step": 1000 }, { "epoch": 0.017985045434721028, "grad_norm": 0.4186614155769348, "learning_rate": 0.0004996010866580058, "loss": 3.2692, "step": 2000 }, { "epoch": 0.026977568152081546, "grad_norm": 0.37206095457077026, "learning_rate": 0.0004991027135159133, "loss": 3.055, "step": 3000 }, { "epoch": 0.035970090869442056, "grad_norm": 0.3365996479988098, "learning_rate": 0.0004984055400477128, "loss": 2.9514, "step": 4000 }, { "epoch": 0.04496261358680257, "grad_norm": 0.3055461645126343, "learning_rate": 0.0004975101226436211, "loss": 2.8843, "step": 5000 }, { "epoch": 0.05395513630416309, "grad_norm": 0.2653150260448456, "learning_rate": 0.0004964171759055367, "loss": 2.8359, "step": 6000 }, { "epoch": 0.0629476590215236, "grad_norm": 0.26524779200553894, "learning_rate": 0.0004951275720767395, "loss": 2.7964, "step": 7000 }, { "epoch": 0.07194018173888411, "grad_norm": 0.23818659782409668, "learning_rate": 0.0004936423403457847, "loss": 2.7717, "step": 8000 }, { "epoch": 0.08093270445624463, "grad_norm": 0.24217627942562103, "learning_rate": 0.0004919626660251412, "loss": 2.7383, "step": 9000 }, { "epoch": 0.08992522717360514, "grad_norm": 0.2376166582107544, "learning_rate": 0.0004900898896052357, "loss": 2.7244, "step": 10000 }, { "epoch": 0.09891774989096566, "grad_norm": 0.2203313708305359, "learning_rate": 0.00048802550568465263, "loss": 2.7031, "step": 11000 }, { "epoch": 0.10791027260832618, "grad_norm": 0.20930123329162598, "learning_rate": 0.00048577116177734653, "loss": 2.6876, "step": 12000 }, { "epoch": 0.11690279532568669, "grad_norm": 0.2004300355911255, "learning_rate": 0.0004833286569978177, "loss": 2.6722, "step": 13000 }, { "epoch": 0.1258953180430472, "grad_norm": 0.19555214047431946, "learning_rate": 0.0004806999406253004, "loss": 2.6577, "step": 14000 }, { "epoch": 0.13488784076040772, "grad_norm": 0.1877773553133011, "learning_rate": 0.0004778871105481104, "loss": 2.647, "step": 15000 }, { "epoch": 0.14388036347776822, "grad_norm": 0.18455323576927185, "learning_rate": 0.0004748924115893922, "loss": 2.6388, "step": 16000 }, { "epoch": 0.15287288619512876, "grad_norm": 0.18309278786182404, "learning_rate": 0.0004717214967118909, "loss": 2.6254, "step": 17000 }, { "epoch": 0.16186540891248927, "grad_norm": 0.17841506004333496, "learning_rate": 0.0004683705487586517, "loss": 2.6187, "step": 18000 }, { "epoch": 0.17085793162984977, "grad_norm": 0.17425057291984558, "learning_rate": 0.00046484532676522683, "loss": 2.6115, "step": 19000 }, { "epoch": 0.17985045434721028, "grad_norm": 0.17270472645759583, "learning_rate": 0.00046114864409029877, "loss": 2.6017, "step": 20000 }, { "epoch": 0.18884297706457082, "grad_norm": 0.1661667376756668, "learning_rate": 0.00045728739927797956, "loss": 2.5935, "step": 21000 }, { "epoch": 0.19783549978193132, "grad_norm": 0.16307614743709564, "learning_rate": 0.0004532569441500434, "loss": 2.593, "step": 22000 }, { "epoch": 0.20682802249929183, "grad_norm": 0.15843474864959717, "learning_rate": 0.0004490642766310399, "loss": 2.5831, "step": 23000 }, { "epoch": 0.21582054521665237, "grad_norm": 0.15826140344142914, "learning_rate": 0.0004447171724872102, "loss": 2.575, "step": 24000 }, { "epoch": 0.22481306793401287, "grad_norm": 0.15876850485801697, "learning_rate": 0.0004402103986613901, "loss": 2.5718, "step": 25000 }, { "epoch": 0.23380559065137338, "grad_norm": 0.15799590945243835, "learning_rate": 0.0004355518244446819, "loss": 2.5643, "step": 26000 }, { "epoch": 0.2427981133687339, "grad_norm": 0.15866918861865997, "learning_rate": 0.000430750047042202, "loss": 2.5611, "step": 27000 }, { "epoch": 0.2517906360860944, "grad_norm": 0.1551973819732666, "learning_rate": 0.0004257992860597374, "loss": 2.5532, "step": 28000 }, { "epoch": 0.26078315880345493, "grad_norm": 0.1489175260066986, "learning_rate": 0.00042070822570074265, "loss": 2.5508, "step": 29000 }, { "epoch": 0.26977568152081544, "grad_norm": 0.14987464249134064, "learning_rate": 0.0004154809289657581, "loss": 2.5451, "step": 30000 }, { "epoch": 0.27876820423817594, "grad_norm": 0.14638318121433258, "learning_rate": 0.00041013241527853406, "loss": 2.5382, "step": 31000 }, { "epoch": 0.28776072695553645, "grad_norm": 0.14251314103603363, "learning_rate": 0.0004046455176043083, "loss": 2.5351, "step": 32000 }, { "epoch": 0.296753249672897, "grad_norm": 0.14875943958759308, "learning_rate": 0.00039904087312981354, "loss": 2.5315, "step": 33000 }, { "epoch": 0.3057457723902575, "grad_norm": 0.14232666790485382, "learning_rate": 0.00039331173496701843, "loss": 2.5287, "step": 34000 }, { "epoch": 0.314738295107618, "grad_norm": 0.1461074948310852, "learning_rate": 0.00038746822462720277, "loss": 2.5252, "step": 35000 }, { "epoch": 0.32373081782497853, "grad_norm": 0.14645366370677948, "learning_rate": 0.00038152101205969716, "loss": 2.5184, "step": 36000 }, { "epoch": 0.33272334054233904, "grad_norm": 0.14445528388023376, "learning_rate": 0.00037546293799195995, "loss": 2.5201, "step": 37000 }, { "epoch": 0.34171586325969955, "grad_norm": 0.13728487491607666, "learning_rate": 0.00036930473625947265, "loss": 2.5149, "step": 38000 }, { "epoch": 0.35070838597706006, "grad_norm": 0.13824057579040527, "learning_rate": 0.00036305132151167983, "loss": 2.5122, "step": 39000 }, { "epoch": 0.35970090869442056, "grad_norm": 0.1395253986120224, "learning_rate": 0.00035672045834706856, "loss": 2.5069, "step": 40000 }, { "epoch": 0.3686934314117811, "grad_norm": 0.14017699658870697, "learning_rate": 0.0003502918267002188, "loss": 2.5047, "step": 41000 }, { "epoch": 0.37768595412914163, "grad_norm": 0.1350419521331787, "learning_rate": 0.00034378315558863357, "loss": 2.5011, "step": 42000 }, { "epoch": 0.38667847684650214, "grad_norm": 0.14121927320957184, "learning_rate": 0.00033719963935934026, "loss": 2.496, "step": 43000 }, { "epoch": 0.39567099956386265, "grad_norm": 0.1398804783821106, "learning_rate": 0.0003305532181958758, "loss": 2.4966, "step": 44000 }, { "epoch": 0.40466352228122315, "grad_norm": 0.14015056192874908, "learning_rate": 0.00032383589111626814, "loss": 2.493, "step": 45000 }, { "epoch": 0.41365604499858366, "grad_norm": 0.1367267668247223, "learning_rate": 0.00031705963814611035, "loss": 2.484, "step": 46000 }, { "epoch": 0.42264856771594417, "grad_norm": 0.13933929800987244, "learning_rate": 0.0003102367218727284, "loss": 2.4843, "step": 47000 }, { "epoch": 0.43164109043330473, "grad_norm": 0.13233740627765656, "learning_rate": 0.0003033589288488015, "loss": 2.4813, "step": 48000 }, { "epoch": 0.44063361315066524, "grad_norm": 0.13488726317882538, "learning_rate": 0.0002964385518951125, "loss": 2.4786, "step": 49000 }, { "epoch": 0.44962613586802574, "grad_norm": 0.1470656394958496, "learning_rate": 0.0002894811139269912, "loss": 2.4743, "step": 50000 }, { "epoch": 0.45861865858538625, "grad_norm": 0.1314244419336319, "learning_rate": 0.0002825061730718414, "loss": 2.4751, "step": 51000 }, { "epoch": 0.46761118130274676, "grad_norm": 0.13198289275169373, "learning_rate": 0.0002754913419788723, "loss": 2.4729, "step": 52000 }, { "epoch": 0.47660370402010727, "grad_norm": 0.12992499768733978, "learning_rate": 0.00026845616712076794, "loss": 2.4694, "step": 53000 }, { "epoch": 0.4855962267374678, "grad_norm": 0.13007935881614685, "learning_rate": 0.00026141331841860756, "loss": 2.463, "step": 54000 }, { "epoch": 0.49458874945482834, "grad_norm": 0.1366134136915207, "learning_rate": 0.000254354317669745, "loss": 2.4592, "step": 55000 }, { "epoch": 0.5035812721721888, "grad_norm": 0.12824617326259613, "learning_rate": 0.0002472918418893963, "loss": 2.4585, "step": 56000 }, { "epoch": 0.5125737948895494, "grad_norm": 0.1312493234872818, "learning_rate": 0.00024023858475626683, "loss": 2.4571, "step": 57000 }, { "epoch": 0.5215663176069099, "grad_norm": 0.12798364460468292, "learning_rate": 0.0002331860555380091, "loss": 2.4561, "step": 58000 }, { "epoch": 0.5305588403242704, "grad_norm": 0.1297149360179901, "learning_rate": 0.00022614694495174873, "loss": 2.4524, "step": 59000 }, { "epoch": 0.5395513630416309, "grad_norm": 0.12818291783332825, "learning_rate": 0.00021912687067005265, "loss": 2.4496, "step": 60000 }, { "epoch": 0.5485438857589914, "grad_norm": 0.13012410700321198, "learning_rate": 0.00021213841644168087, "loss": 2.4465, "step": 61000 }, { "epoch": 0.5575364084763519, "grad_norm": 0.13271279633045197, "learning_rate": 0.00020518706618325146, "loss": 2.4471, "step": 62000 }, { "epoch": 0.5665289311937124, "grad_norm": 0.13144823908805847, "learning_rate": 0.00019825751696178184, "loss": 2.4405, "step": 63000 }, { "epoch": 0.5755214539110729, "grad_norm": 0.1301935911178589, "learning_rate": 0.00019136926164015156, "loss": 2.4368, "step": 64000 }, { "epoch": 0.5845139766284334, "grad_norm": 0.1324065774679184, "learning_rate": 0.00018452779749851848, "loss": 2.4383, "step": 65000 }, { "epoch": 0.593506499345794, "grad_norm": 0.13288547098636627, "learning_rate": 0.00017774534578113516, "loss": 2.4307, "step": 66000 }, { "epoch": 0.6024990220631545, "grad_norm": 0.13380388915538788, "learning_rate": 0.00017101374175203582, "loss": 2.4311, "step": 67000 }, { "epoch": 0.611491544780515, "grad_norm": 0.12911923229694366, "learning_rate": 0.00016434517394472685, "loss": 2.4301, "step": 68000 }, { "epoch": 0.6204840674978755, "grad_norm": 0.129085510969162, "learning_rate": 0.0001577515286210997, "loss": 2.4237, "step": 69000 }, { "epoch": 0.629476590215236, "grad_norm": 0.12780623137950897, "learning_rate": 0.00015122486833089863, "loss": 2.4248, "step": 70000 }, { "epoch": 0.6384691129325966, "grad_norm": 0.13019651174545288, "learning_rate": 0.00014478344382118653, "loss": 2.4217, "step": 71000 }, { "epoch": 0.6474616356499571, "grad_norm": 0.13168035447597504, "learning_rate": 0.00013841950088558575, "loss": 2.4179, "step": 72000 }, { "epoch": 0.6564541583673176, "grad_norm": 0.12985067069530487, "learning_rate": 0.000132150835258465, "loss": 2.4188, "step": 73000 }, { "epoch": 0.6654466810846781, "grad_norm": 0.1350133717060089, "learning_rate": 0.00012596990070111393, "loss": 2.4159, "step": 74000 }, { "epoch": 0.6744392038020386, "grad_norm": 0.13242337107658386, "learning_rate": 0.00011988795030912905, "loss": 2.4116, "step": 75000 }, { "epoch": 0.6834317265193991, "grad_norm": 0.1263236254453659, "learning_rate": 0.00011391576252872856, "loss": 2.4121, "step": 76000 }, { "epoch": 0.6924242492367596, "grad_norm": 0.1311461478471756, "learning_rate": 0.00010804614802213383, "loss": 2.4086, "step": 77000 }, { "epoch": 0.7014167719541201, "grad_norm": 0.1291423887014389, "learning_rate": 0.00010229552021442814, "loss": 2.4061, "step": 78000 }, { "epoch": 0.7104092946714806, "grad_norm": 0.13248379528522491, "learning_rate": 9.66569564824003e-05, "loss": 2.403, "step": 79000 }, { "epoch": 0.7194018173888411, "grad_norm": 0.13015016913414001, "learning_rate": 9.115167786655006e-05, "loss": 2.4049, "step": 80000 }, { "epoch": 0.7283943401062017, "grad_norm": 0.13090303540229797, "learning_rate": 8.576201416635018e-05, "loss": 2.4002, "step": 81000 }, { "epoch": 0.7373868628235623, "grad_norm": 0.13276338577270508, "learning_rate": 8.050342316576978e-05, "loss": 2.3996, "step": 82000 }, { "epoch": 0.7463793855409228, "grad_norm": 0.13524140417575836, "learning_rate": 7.538010156580435e-05, "loss": 2.3976, "step": 83000 }, { "epoch": 0.7553719082582833, "grad_norm": 0.13214819133281708, "learning_rate": 7.039613811358328e-05, "loss": 2.3951, "step": 84000 }, { "epoch": 0.7643644309756438, "grad_norm": 0.13076630234718323, "learning_rate": 6.555551033928139e-05, "loss": 2.3936, "step": 85000 }, { "epoch": 0.7733569536930043, "grad_norm": 0.1300399899482727, "learning_rate": 6.08620813817882e-05, "loss": 2.3891, "step": 86000 }, { "epoch": 0.7823494764103648, "grad_norm": 0.12917381525039673, "learning_rate": 5.631959690566982e-05, "loss": 2.3865, "step": 87000 }, { "epoch": 0.7913419991277253, "grad_norm": 0.13091173768043518, "learning_rate": 5.1940301332541934e-05, "loss": 2.3814, "step": 88000 }, { "epoch": 0.8003345218450858, "grad_norm": 0.13044433295726776, "learning_rate": 4.771013851078279e-05, "loss": 2.3799, "step": 89000 }, { "epoch": 0.8093270445624463, "grad_norm": 0.12928803265094757, "learning_rate": 4.364141628461085e-05, "loss": 2.3801, "step": 90000 }, { "epoch": 0.8183195672798068, "grad_norm": 0.1315431147813797, "learning_rate": 3.973738176159078e-05, "loss": 2.3749, "step": 91000 }, { "epoch": 0.8273120899971673, "grad_norm": 0.13129934668540955, "learning_rate": 3.6004802027335776e-05, "loss": 2.3751, "step": 92000 }, { "epoch": 0.8363046127145278, "grad_norm": 0.13383065164089203, "learning_rate": 3.243918379103131e-05, "loss": 2.3747, "step": 93000 }, { "epoch": 0.8452971354318883, "grad_norm": 0.13397032022476196, "learning_rate": 2.905049772583343e-05, "loss": 2.3716, "step": 94000 }, { "epoch": 0.854289658149249, "grad_norm": 0.13112303614616394, "learning_rate": 2.5834664521170502e-05, "loss": 2.3705, "step": 95000 }, { "epoch": 0.8632821808666095, "grad_norm": 0.1314103603363037, "learning_rate": 2.2797729977089537e-05, "loss": 2.3649, "step": 96000 }, { "epoch": 0.87227470358397, "grad_norm": 0.1317732334136963, "learning_rate": 1.994488203786088e-05, "loss": 2.3652, "step": 97000 }, { "epoch": 0.8812672263013305, "grad_norm": 0.13032500445842743, "learning_rate": 1.7272686436475505e-05, "loss": 2.3646, "step": 98000 }, { "epoch": 0.890259749018691, "grad_norm": 0.1355644017457962, "learning_rate": 1.4788615534578526e-05, "loss": 2.363, "step": 99000 }, { "epoch": 0.8992522717360515, "grad_norm": 0.1347675770521164, "learning_rate": 1.2489679000077859e-05, "loss": 2.354, "step": 100000 }, { "epoch": 0.908244794453412, "grad_norm": 0.13323768973350525, "learning_rate": 1.0380291301085198e-05, "loss": 2.3521, "step": 101000 }, { "epoch": 0.9172373171707725, "grad_norm": 0.13284514844417572, "learning_rate": 8.463957980162084e-06, "loss": 2.3492, "step": 102000 }, { "epoch": 0.926229839888133, "grad_norm": 0.1306186467409134, "learning_rate": 6.738372149439731e-06, "loss": 2.3454, "step": 103000 }, { "epoch": 0.9352223626054935, "grad_norm": 0.13234929740428925, "learning_rate": 5.206925074830115e-06, "loss": 2.3452, "step": 104000 }, { "epoch": 0.944214885322854, "grad_norm": 0.1304798424243927, "learning_rate": 3.8708389516820544e-06, "loss": 2.3448, "step": 105000 }, { "epoch": 0.9532074080402145, "grad_norm": 0.1340903788805008, "learning_rate": 2.733262717159124e-06, "loss": 2.338, "step": 106000 }, { "epoch": 0.962199930757575, "grad_norm": 0.13070346415042877, "learning_rate": 1.7905451118825079e-06, "loss": 2.3358, "step": 107000 }, { "epoch": 0.9711924534749355, "grad_norm": 0.13099098205566406, "learning_rate": 1.0459149536610202e-06, "loss": 2.3329, "step": 108000 }, { "epoch": 0.980184976192296, "grad_norm": 0.13041888177394867, "learning_rate": 5.004130549572938e-07, "loss": 2.3332, "step": 109000 }, { "epoch": 0.9891774989096567, "grad_norm": 0.1311406046152115, "learning_rate": 1.5338273892975818e-07, "loss": 2.3253, "step": 110000 } ], "logging_steps": 1000, "max_steps": 111203, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2000, "total_flos": 9.3094825426944e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }