{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 1198, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 2.376227322220802, "epoch": 0.01670843776106934, "grad_norm": 23.182296752929688, "learning_rate": 1.5e-06, "loss": 2.4729, "mean_token_accuracy": 0.5330174341797829, "num_tokens": 23968.0, "step": 10 }, { "entropy": 2.4090343087911608, "epoch": 0.03341687552213868, "grad_norm": 16.90180778503418, "learning_rate": 4e-06, "loss": 2.2456, "mean_token_accuracy": 0.546089893579483, "num_tokens": 50061.0, "step": 20 }, { "entropy": 2.3292118519544602, "epoch": 0.05012531328320802, "grad_norm": 7.097179889678955, "learning_rate": 6.5000000000000004e-06, "loss": 2.0194, "mean_token_accuracy": 0.5704705387353897, "num_tokens": 77221.0, "step": 30 }, { "entropy": 2.38493994474411, "epoch": 0.06683375104427736, "grad_norm": 3.9334213733673096, "learning_rate": 9e-06, "loss": 2.091, "mean_token_accuracy": 0.5746636532247067, "num_tokens": 102592.0, "step": 40 }, { "entropy": 2.4236884146928785, "epoch": 0.0835421888053467, "grad_norm": 2.742833375930786, "learning_rate": 1.1500000000000002e-05, "loss": 1.9327, "mean_token_accuracy": 0.5654355354607106, "num_tokens": 130876.0, "step": 50 }, { "entropy": 2.4756928414106367, "epoch": 0.10025062656641603, "grad_norm": 4.295400142669678, "learning_rate": 1.4e-05, "loss": 2.0381, "mean_token_accuracy": 0.5688402287662029, "num_tokens": 155526.0, "step": 60 }, { "entropy": 2.422525143623352, "epoch": 0.11695906432748537, "grad_norm": 37.36378479003906, "learning_rate": 1.65e-05, "loss": 1.9249, "mean_token_accuracy": 0.5765560433268547, "num_tokens": 181754.0, "step": 70 }, { "entropy": 2.3750332206487657, "epoch": 0.1336675020885547, "grad_norm": 53.244632720947266, "learning_rate": 1.9e-05, "loss": 1.7852, "mean_token_accuracy": 0.5806652165949344, "num_tokens": 208156.0, "step": 80 }, { "entropy": 2.1606528967618943, "epoch": 0.15037593984962405, "grad_norm": 1.9780453443527222, "learning_rate": 2.15e-05, "loss": 1.74, "mean_token_accuracy": 0.595666554570198, "num_tokens": 236321.0, "step": 90 }, { "entropy": 2.1004574805498124, "epoch": 0.1670843776106934, "grad_norm": 2.5225274562835693, "learning_rate": 2.4e-05, "loss": 1.7544, "mean_token_accuracy": 0.5997186966240406, "num_tokens": 263316.0, "step": 100 }, { "entropy": 2.0930860340595245, "epoch": 0.18379281537176273, "grad_norm": 2.0412206649780273, "learning_rate": 2.65e-05, "loss": 1.6994, "mean_token_accuracy": 0.6029496222734452, "num_tokens": 289555.0, "step": 110 }, { "entropy": 2.0005944788455965, "epoch": 0.20050125313283207, "grad_norm": 5.798661231994629, "learning_rate": 2.9e-05, "loss": 1.5928, "mean_token_accuracy": 0.6268587596714497, "num_tokens": 322035.0, "step": 120 }, { "entropy": 2.1824300229549407, "epoch": 0.2172096908939014, "grad_norm": 2.135401964187622, "learning_rate": 2.999770694261986e-05, "loss": 1.7705, "mean_token_accuracy": 0.5905720897018909, "num_tokens": 343574.0, "step": 130 }, { "entropy": 2.317758610844612, "epoch": 0.23391812865497075, "grad_norm": 4.436051368713379, "learning_rate": 2.9983696353001538e-05, "loss": 1.8244, "mean_token_accuracy": 0.578718575835228, "num_tokens": 367065.0, "step": 140 }, { "entropy": 2.0938976377248766, "epoch": 0.2506265664160401, "grad_norm": 1.6219489574432373, "learning_rate": 2.995696097848732e-05, "loss": 1.6849, "mean_token_accuracy": 0.6061625145375729, "num_tokens": 395337.0, "step": 150 }, { "entropy": 2.05442219376564, "epoch": 0.2673350041771094, "grad_norm": 1.6669152975082397, "learning_rate": 2.9917523523881268e-05, "loss": 1.7082, "mean_token_accuracy": 0.6042715333402157, "num_tokens": 420868.0, "step": 160 }, { "entropy": 2.032812547683716, "epoch": 0.28404344193817876, "grad_norm": 1.9019651412963867, "learning_rate": 2.9865417481127877e-05, "loss": 1.546, "mean_token_accuracy": 0.6138139866292477, "num_tokens": 451944.0, "step": 170 }, { "entropy": 2.0105547934770582, "epoch": 0.3007518796992481, "grad_norm": 1.7808424234390259, "learning_rate": 2.9800687100869334e-05, "loss": 1.5909, "mean_token_accuracy": 0.6109806254506112, "num_tokens": 482345.0, "step": 180 }, { "entropy": 2.0666785687208176, "epoch": 0.31746031746031744, "grad_norm": 1.7441576719284058, "learning_rate": 2.972338735486598e-05, "loss": 1.6263, "mean_token_accuracy": 0.6088879898190498, "num_tokens": 508164.0, "step": 190 }, { "entropy": 2.0136130273342134, "epoch": 0.3341687552213868, "grad_norm": 1.8193825483322144, "learning_rate": 2.9633583889312e-05, "loss": 1.6149, "mean_token_accuracy": 0.619242499768734, "num_tokens": 536345.0, "step": 200 }, { "entropy": 2.0101848542690277, "epoch": 0.3508771929824561, "grad_norm": 1.11763596534729, "learning_rate": 2.9531352969085914e-05, "loss": 1.5869, "mean_token_accuracy": 0.6176401123404502, "num_tokens": 563157.0, "step": 210 }, { "entropy": 1.9853627145290376, "epoch": 0.36758563074352546, "grad_norm": 2.025529384613037, "learning_rate": 2.9416781412983223e-05, "loss": 1.6008, "mean_token_accuracy": 0.6206148102879524, "num_tokens": 590929.0, "step": 220 }, { "entropy": 2.108823761343956, "epoch": 0.3842940685045948, "grad_norm": 1.4173170328140259, "learning_rate": 2.9289966519986258e-05, "loss": 1.6553, "mean_token_accuracy": 0.598318774998188, "num_tokens": 615814.0, "step": 230 }, { "entropy": 2.0000301033258436, "epoch": 0.40100250626566414, "grad_norm": 2.162895679473877, "learning_rate": 2.9151015986633762e-05, "loss": 1.6305, "mean_token_accuracy": 0.6174192741513252, "num_tokens": 643303.0, "step": 240 }, { "entropy": 2.0013814836740496, "epoch": 0.4177109440267335, "grad_norm": 3.5182316303253174, "learning_rate": 2.90000478155605e-05, "loss": 1.5884, "mean_token_accuracy": 0.615642835944891, "num_tokens": 670279.0, "step": 250 }, { "entropy": 2.061664643883705, "epoch": 0.4344193817878028, "grad_norm": 2.530327558517456, "learning_rate": 2.883719021528444e-05, "loss": 1.7341, "mean_token_accuracy": 0.6077517293393612, "num_tokens": 695006.0, "step": 260 }, { "entropy": 2.0266662567853926, "epoch": 0.45112781954887216, "grad_norm": 2.2248501777648926, "learning_rate": 2.8662581491326686e-05, "loss": 1.5991, "mean_token_accuracy": 0.6176492936909199, "num_tokens": 720926.0, "step": 270 }, { "entropy": 1.998877900838852, "epoch": 0.4678362573099415, "grad_norm": 1.6995679140090942, "learning_rate": 2.847636992875661e-05, "loss": 1.5343, "mean_token_accuracy": 0.6100999519228936, "num_tokens": 753552.0, "step": 280 }, { "entropy": 2.0063836723566055, "epoch": 0.48454469507101083, "grad_norm": 1.5225791931152344, "learning_rate": 2.827871366626197e-05, "loss": 1.6512, "mean_token_accuracy": 0.6144520066678524, "num_tokens": 776650.0, "step": 290 }, { "entropy": 1.9781533569097518, "epoch": 0.5012531328320802, "grad_norm": 1.590501070022583, "learning_rate": 2.806978056185084e-05, "loss": 1.6408, "mean_token_accuracy": 0.6150619685649872, "num_tokens": 801816.0, "step": 300 }, { "entropy": 1.9714119344949723, "epoch": 0.5179615705931495, "grad_norm": 2.535109519958496, "learning_rate": 2.7849748050299578e-05, "loss": 1.603, "mean_token_accuracy": 0.6225824594497681, "num_tokens": 826564.0, "step": 310 }, { "entropy": 1.94197179377079, "epoch": 0.5346700083542189, "grad_norm": 1.5610677003860474, "learning_rate": 2.7618802992467718e-05, "loss": 1.5703, "mean_token_accuracy": 0.6192868888378144, "num_tokens": 855007.0, "step": 320 }, { "entropy": 1.9983945518732071, "epoch": 0.5513784461152882, "grad_norm": 1.7279447317123413, "learning_rate": 2.7377141516607896e-05, "loss": 1.5837, "mean_token_accuracy": 0.6141151934862137, "num_tokens": 881616.0, "step": 330 }, { "entropy": 1.8347389072179794, "epoch": 0.5680868838763575, "grad_norm": 1.3221409320831299, "learning_rate": 2.712496885180547e-05, "loss": 1.4675, "mean_token_accuracy": 0.6429531842470169, "num_tokens": 913148.0, "step": 340 }, { "entropy": 1.8934204697608947, "epoch": 0.5847953216374269, "grad_norm": 1.4465378522872925, "learning_rate": 2.686249915368938e-05, "loss": 1.4669, "mean_token_accuracy": 0.6286376528441906, "num_tokens": 944937.0, "step": 350 }, { "entropy": 1.9060048371553422, "epoch": 0.6015037593984962, "grad_norm": 1.6246923208236694, "learning_rate": 2.658995532256217e-05, "loss": 1.5736, "mean_token_accuracy": 0.622390191257, "num_tokens": 974558.0, "step": 360 }, { "entropy": 2.025865262746811, "epoch": 0.6182121971595655, "grad_norm": 1.2998696565628052, "learning_rate": 2.6307568814103658e-05, "loss": 1.6277, "mean_token_accuracy": 0.6089055955410003, "num_tokens": 1001232.0, "step": 370 }, { "entropy": 1.9043485343456268, "epoch": 0.6349206349206349, "grad_norm": 1.7911604642868042, "learning_rate": 2.6015579442809094e-05, "loss": 1.5542, "mean_token_accuracy": 0.6255838416516781, "num_tokens": 1029679.0, "step": 380 }, { "entropy": 2.0594927102327345, "epoch": 0.6516290726817042, "grad_norm": 1.6075160503387451, "learning_rate": 2.5714235178328554e-05, "loss": 1.664, "mean_token_accuracy": 0.608974926173687, "num_tokens": 1053040.0, "step": 390 }, { "entropy": 2.034518975019455, "epoch": 0.6683375104427736, "grad_norm": 3.4272360801696777, "learning_rate": 2.540379193488072e-05, "loss": 1.7209, "mean_token_accuracy": 0.6031469151377677, "num_tokens": 1075141.0, "step": 400 }, { "entropy": 2.0515862941741942, "epoch": 0.6850459482038429, "grad_norm": 1.2661142349243164, "learning_rate": 2.5084513353919753e-05, "loss": 1.6599, "mean_token_accuracy": 0.6064547099173069, "num_tokens": 1097768.0, "step": 410 }, { "entropy": 1.9067467480897904, "epoch": 0.7017543859649122, "grad_norm": 2.2614035606384277, "learning_rate": 2.4756670580239894e-05, "loss": 1.5424, "mean_token_accuracy": 0.6350538037717343, "num_tokens": 1125706.0, "step": 420 }, { "entropy": 2.103572541475296, "epoch": 0.7184628237259816, "grad_norm": 2.4501664638519287, "learning_rate": 2.4420542031707905e-05, "loss": 1.703, "mean_token_accuracy": 0.6036252595484257, "num_tokens": 1147357.0, "step": 430 }, { "entropy": 2.072985142469406, "epoch": 0.7351712614870509, "grad_norm": 1.8719037771224976, "learning_rate": 2.4076413162818904e-05, "loss": 1.6609, "mean_token_accuracy": 0.606941219419241, "num_tokens": 1170179.0, "step": 440 }, { "entropy": 2.022436353564262, "epoch": 0.7518796992481203, "grad_norm": 2.076544761657715, "learning_rate": 2.3724576222276405e-05, "loss": 1.7007, "mean_token_accuracy": 0.6098736897110939, "num_tokens": 1196284.0, "step": 450 }, { "entropy": 2.1311296701431273, "epoch": 0.7685881370091896, "grad_norm": 2.9471235275268555, "learning_rate": 2.3365330004802443e-05, "loss": 1.7893, "mean_token_accuracy": 0.5961334474384785, "num_tokens": 1217358.0, "step": 460 }, { "entropy": 1.9371352732181548, "epoch": 0.7852965747702589, "grad_norm": 1.4203218221664429, "learning_rate": 2.2998979597388526e-05, "loss": 1.608, "mean_token_accuracy": 0.6184874981641769, "num_tokens": 1243374.0, "step": 470 }, { "entropy": 2.0747445046901705, "epoch": 0.8020050125313283, "grad_norm": 2.2124152183532715, "learning_rate": 2.262583612020294e-05, "loss": 1.6711, "mean_token_accuracy": 0.6062519766390324, "num_tokens": 1267314.0, "step": 480 }, { "entropy": 1.7878674179315568, "epoch": 0.8187134502923976, "grad_norm": 1.3049612045288086, "learning_rate": 2.224621646237442e-05, "loss": 1.4722, "mean_token_accuracy": 0.6389728732407093, "num_tokens": 1296917.0, "step": 490 }, { "entropy": 1.925000047683716, "epoch": 0.835421888053467, "grad_norm": 1.4271504878997803, "learning_rate": 2.1860443012876574e-05, "loss": 1.6114, "mean_token_accuracy": 0.616053618490696, "num_tokens": 1322826.0, "step": 500 }, { "entropy": 2.0017556965351107, "epoch": 0.8521303258145363, "grad_norm": 1.6036300659179688, "learning_rate": 2.146884338674161e-05, "loss": 1.5247, "mean_token_accuracy": 0.6218138873577118, "num_tokens": 1349542.0, "step": 510 }, { "entropy": 1.995826429128647, "epoch": 0.8688387635756056, "grad_norm": 1.979646921157837, "learning_rate": 2.1071750146835906e-05, "loss": 1.5882, "mean_token_accuracy": 0.6175018042325974, "num_tokens": 1375550.0, "step": 520 }, { "entropy": 1.918816116452217, "epoch": 0.885547201336675, "grad_norm": 1.531023383140564, "learning_rate": 2.06695005214336e-05, "loss": 1.5158, "mean_token_accuracy": 0.6258822798728942, "num_tokens": 1401535.0, "step": 530 }, { "entropy": 1.918640747666359, "epoch": 0.9022556390977443, "grad_norm": 1.6574758291244507, "learning_rate": 2.026243611782818e-05, "loss": 1.4805, "mean_token_accuracy": 0.6370069451630116, "num_tokens": 1430094.0, "step": 540 }, { "entropy": 2.0166410475969316, "epoch": 0.9189640768588136, "grad_norm": 2.085106372833252, "learning_rate": 1.9850902632225228e-05, "loss": 1.6067, "mean_token_accuracy": 0.613443473726511, "num_tokens": 1453767.0, "step": 550 }, { "entropy": 1.9751090347766875, "epoch": 0.935672514619883, "grad_norm": 1.5619549751281738, "learning_rate": 1.9435249556162633e-05, "loss": 1.5186, "mean_token_accuracy": 0.6245452538132668, "num_tokens": 1479122.0, "step": 560 }, { "entropy": 2.0289832055568695, "epoch": 0.9523809523809523, "grad_norm": 1.9765042066574097, "learning_rate": 1.9015829879707705e-05, "loss": 1.635, "mean_token_accuracy": 0.6078214071691036, "num_tokens": 1502393.0, "step": 570 }, { "entropy": 2.0707653611898422, "epoch": 0.9690893901420217, "grad_norm": 2.33292293548584, "learning_rate": 1.8592999791683163e-05, "loss": 1.7044, "mean_token_accuracy": 0.5983508452773094, "num_tokens": 1528688.0, "step": 580 }, { "entropy": 1.830039870738983, "epoch": 0.985797827903091, "grad_norm": 1.7089521884918213, "learning_rate": 1.8167118377176626e-05, "loss": 1.4488, "mean_token_accuracy": 0.6382713422179223, "num_tokens": 1559267.0, "step": 590 }, { "entropy": 2.093587511464169, "epoch": 1.001670843776107, "grad_norm": 1.8502051830291748, "learning_rate": 1.7738547312590426e-05, "loss": 1.6569, "mean_token_accuracy": 0.6035293050502476, "num_tokens": 1579777.0, "step": 600 }, { "entropy": 1.986822435259819, "epoch": 1.0183792815371762, "grad_norm": 1.8095533847808838, "learning_rate": 1.7307650558490842e-05, "loss": 1.5196, "mean_token_accuracy": 0.624878978729248, "num_tokens": 1603371.0, "step": 610 }, { "entropy": 1.8720842987298965, "epoch": 1.0350877192982457, "grad_norm": 1.418859601020813, "learning_rate": 1.687479405051745e-05, "loss": 1.4617, "mean_token_accuracy": 0.6376385740935803, "num_tokens": 1629671.0, "step": 620 }, { "entropy": 1.9420676857233048, "epoch": 1.0517961570593148, "grad_norm": 1.9264510869979858, "learning_rate": 1.6440345388615225e-05, "loss": 1.5184, "mean_token_accuracy": 0.6275502189993858, "num_tokens": 1654665.0, "step": 630 }, { "entropy": 1.877267986536026, "epoch": 1.0685045948203844, "grad_norm": 1.5752804279327393, "learning_rate": 1.6004673524853184e-05, "loss": 1.4846, "mean_token_accuracy": 0.6406380534172058, "num_tokens": 1681208.0, "step": 640 }, { "entropy": 1.7192733228206634, "epoch": 1.0852130325814535, "grad_norm": 1.8777337074279785, "learning_rate": 1.5568148450094788e-05, "loss": 1.3596, "mean_token_accuracy": 0.6518337815999985, "num_tokens": 1711943.0, "step": 650 }, { "entropy": 1.8170194894075393, "epoch": 1.101921470342523, "grad_norm": 1.324455738067627, "learning_rate": 1.5131140879786162e-05, "loss": 1.3627, "mean_token_accuracy": 0.6482401996850967, "num_tokens": 1744406.0, "step": 660 }, { "entropy": 1.9752755105495452, "epoch": 1.1186299081035922, "grad_norm": 1.697969913482666, "learning_rate": 1.4694021939128925e-05, "loss": 1.6435, "mean_token_accuracy": 0.6173132970929146, "num_tokens": 1763684.0, "step": 670 }, { "entropy": 1.810714191198349, "epoch": 1.1353383458646618, "grad_norm": 1.6863676309585571, "learning_rate": 1.4257162847905073e-05, "loss": 1.4572, "mean_token_accuracy": 0.6450446091592312, "num_tokens": 1790387.0, "step": 680 }, { "entropy": 1.7295511305332183, "epoch": 1.1520467836257309, "grad_norm": 1.8147404193878174, "learning_rate": 1.3820934605221554e-05, "loss": 1.4375, "mean_token_accuracy": 0.656483718752861, "num_tokens": 1818936.0, "step": 690 }, { "entropy": 1.876406043767929, "epoch": 1.1687552213868004, "grad_norm": 1.4509400129318237, "learning_rate": 1.3385707674442212e-05, "loss": 1.5038, "mean_token_accuracy": 0.6330045036971569, "num_tokens": 1844285.0, "step": 700 }, { "entropy": 1.7754180505871773, "epoch": 1.1854636591478696, "grad_norm": 1.8553998470306396, "learning_rate": 1.295185166857471e-05, "loss": 1.3342, "mean_token_accuracy": 0.6631456315517426, "num_tokens": 1872386.0, "step": 710 }, { "entropy": 1.8854310810565948, "epoch": 1.2021720969089391, "grad_norm": 1.9020118713378906, "learning_rate": 1.2519735036379654e-05, "loss": 1.5075, "mean_token_accuracy": 0.6296862445771694, "num_tokens": 1898136.0, "step": 720 }, { "entropy": 1.8075116664171218, "epoch": 1.2188805346700082, "grad_norm": 1.4228155612945557, "learning_rate": 1.2089724749468383e-05, "loss": 1.475, "mean_token_accuracy": 0.63388437256217, "num_tokens": 1926755.0, "step": 730 }, { "entropy": 1.9176389634609223, "epoch": 1.2355889724310778, "grad_norm": 1.4030358791351318, "learning_rate": 1.1662185990655285e-05, "loss": 1.4261, "mean_token_accuracy": 0.6366409629583358, "num_tokens": 1954094.0, "step": 740 }, { "entropy": 1.9318130016326904, "epoch": 1.252297410192147, "grad_norm": 2.5528788566589355, "learning_rate": 1.1237481843829162e-05, "loss": 1.637, "mean_token_accuracy": 0.6269572585821152, "num_tokens": 1977028.0, "step": 750 }, { "entropy": 1.8894671678543091, "epoch": 1.2690058479532165, "grad_norm": 1.5508147478103638, "learning_rate": 1.0815972985607149e-05, "loss": 1.5153, "mean_token_accuracy": 0.6365115389227867, "num_tokens": 2000096.0, "step": 760 }, { "entropy": 1.9068289607763291, "epoch": 1.2857142857142856, "grad_norm": 1.2476104497909546, "learning_rate": 1.0398017379032955e-05, "loss": 1.5797, "mean_token_accuracy": 0.6310160294175148, "num_tokens": 2024669.0, "step": 770 }, { "entropy": 1.7813611298799514, "epoch": 1.3024227234753551, "grad_norm": 2.0824921131134033, "learning_rate": 9.983969969579616e-06, "loss": 1.4656, "mean_token_accuracy": 0.6474256098270417, "num_tokens": 2051877.0, "step": 780 }, { "entropy": 1.8404263019561768, "epoch": 1.3191311612364243, "grad_norm": 1.2777612209320068, "learning_rate": 9.574182383714839e-06, "loss": 1.4355, "mean_token_accuracy": 0.6538358055055141, "num_tokens": 2079905.0, "step": 790 }, { "entropy": 1.8420085251331328, "epoch": 1.3358395989974938, "grad_norm": 1.2849156856536865, "learning_rate": 9.169002630285063e-06, "loss": 1.4413, "mean_token_accuracy": 0.6362872712314129, "num_tokens": 2109705.0, "step": 800 }, { "entropy": 1.9093056112527846, "epoch": 1.352548036758563, "grad_norm": 1.3787521123886108, "learning_rate": 8.768774804971705e-06, "loss": 1.4332, "mean_token_accuracy": 0.646092452853918, "num_tokens": 2136002.0, "step": 810 }, { "entropy": 1.8188541814684869, "epoch": 1.3692564745196325, "grad_norm": 1.9946447610855103, "learning_rate": 8.37383879807061e-06, "loss": 1.4213, "mean_token_accuracy": 0.6514127373695373, "num_tokens": 2162939.0, "step": 820 }, { "entropy": 1.767152488231659, "epoch": 1.3859649122807016, "grad_norm": 2.6128296852111816, "learning_rate": 7.98453000584296e-06, "loss": 1.4077, "mean_token_accuracy": 0.6433039925992489, "num_tokens": 2192868.0, "step": 830 }, { "entropy": 1.8834378957748412, "epoch": 1.4026733500417712, "grad_norm": 2.5565028190612793, "learning_rate": 7.601179045682659e-06, "loss": 1.4508, "mean_token_accuracy": 0.6357239708304405, "num_tokens": 2220574.0, "step": 840 }, { "entropy": 1.8659739628434182, "epoch": 1.4193817878028403, "grad_norm": 1.6156526803970337, "learning_rate": 7.2241114753421245e-06, "loss": 1.4151, "mean_token_accuracy": 0.6440570905804635, "num_tokens": 2247980.0, "step": 850 }, { "entropy": 2.0131053000688555, "epoch": 1.4360902255639099, "grad_norm": 1.5411407947540283, "learning_rate": 6.853647516454978e-06, "loss": 1.7022, "mean_token_accuracy": 0.6108858585357666, "num_tokens": 2271288.0, "step": 860 }, { "entropy": 1.9804413557052611, "epoch": 1.452798663324979, "grad_norm": 1.2808128595352173, "learning_rate": 6.490101782590357e-06, "loss": 1.502, "mean_token_accuracy": 0.6262324623763561, "num_tokens": 2297295.0, "step": 870 }, { "entropy": 1.7805092990398408, "epoch": 1.4695071010860485, "grad_norm": 1.404811978340149, "learning_rate": 6.133783012069853e-06, "loss": 1.3104, "mean_token_accuracy": 0.6503325693309308, "num_tokens": 2325314.0, "step": 880 }, { "entropy": 1.8354784041643142, "epoch": 1.4862155388471177, "grad_norm": 1.467010736465454, "learning_rate": 5.784993805773946e-06, "loss": 1.4201, "mean_token_accuracy": 0.6422530055046082, "num_tokens": 2353565.0, "step": 890 }, { "entropy": 1.9021347552537917, "epoch": 1.5029239766081872, "grad_norm": 1.5276216268539429, "learning_rate": 5.44403037016061e-06, "loss": 1.5201, "mean_token_accuracy": 0.6460477262735367, "num_tokens": 2375717.0, "step": 900 }, { "entropy": 1.8378847867250443, "epoch": 1.5196324143692563, "grad_norm": 1.413011908531189, "learning_rate": 5.11118226571434e-06, "loss": 1.4236, "mean_token_accuracy": 0.6440291911363601, "num_tokens": 2404483.0, "step": 910 }, { "entropy": 1.8757817566394805, "epoch": 1.536340852130326, "grad_norm": 1.5251425504684448, "learning_rate": 4.786732161039234e-06, "loss": 1.5295, "mean_token_accuracy": 0.6261114202439785, "num_tokens": 2431919.0, "step": 920 }, { "entropy": 1.8587533831596375, "epoch": 1.553049289891395, "grad_norm": 1.647172451019287, "learning_rate": 4.470955592804871e-06, "loss": 1.3916, "mean_token_accuracy": 0.6414183430373669, "num_tokens": 2461737.0, "step": 930 }, { "entropy": 1.8832595944404602, "epoch": 1.5697577276524646, "grad_norm": 2.1904146671295166, "learning_rate": 4.164120731749072e-06, "loss": 1.5253, "mean_token_accuracy": 0.63388307467103, "num_tokens": 2484053.0, "step": 940 }, { "entropy": 1.9385288596153258, "epoch": 1.5864661654135337, "grad_norm": 1.4704978466033936, "learning_rate": 3.866488154935951e-06, "loss": 1.544, "mean_token_accuracy": 0.6218141697347164, "num_tokens": 2509336.0, "step": 950 }, { "entropy": 1.92947296500206, "epoch": 1.6031746031746033, "grad_norm": 2.599453926086426, "learning_rate": 3.5783106244629837e-06, "loss": 1.4998, "mean_token_accuracy": 0.6333407089114189, "num_tokens": 2534725.0, "step": 960 }, { "entropy": 1.7914826542139053, "epoch": 1.6198830409356724, "grad_norm": 1.3860251903533936, "learning_rate": 3.299832872804765e-06, "loss": 1.4052, "mean_token_accuracy": 0.6594420954585075, "num_tokens": 2561604.0, "step": 970 }, { "entropy": 1.9076363742351532, "epoch": 1.636591478696742, "grad_norm": 1.7287890911102295, "learning_rate": 3.0312913949759196e-06, "loss": 1.4772, "mean_token_accuracy": 0.6380034938454628, "num_tokens": 2586962.0, "step": 980 }, { "entropy": 1.897977489233017, "epoch": 1.653299916457811, "grad_norm": 1.6165651082992554, "learning_rate": 2.77291424768959e-06, "loss": 1.4259, "mean_token_accuracy": 0.6349127419292927, "num_tokens": 2614128.0, "step": 990 }, { "entropy": 1.9598186224699021, "epoch": 1.6700083542188806, "grad_norm": 2.7245564460754395, "learning_rate": 2.5249208556820908e-06, "loss": 1.5447, "mean_token_accuracy": 0.6180414237082005, "num_tokens": 2638378.0, "step": 1000 }, { "entropy": 1.8438974112272262, "epoch": 1.6867167919799497, "grad_norm": 1.519782543182373, "learning_rate": 2.287521825368189e-06, "loss": 1.4452, "mean_token_accuracy": 0.6378182969987393, "num_tokens": 2667006.0, "step": 1010 }, { "entropy": 1.8624888569116593, "epoch": 1.7034252297410193, "grad_norm": 1.3807040452957153, "learning_rate": 2.060918765985288e-06, "loss": 1.4039, "mean_token_accuracy": 0.6466488510370254, "num_tokens": 2694457.0, "step": 1020 }, { "entropy": 1.752406159043312, "epoch": 1.7201336675020884, "grad_norm": 1.9477757215499878, "learning_rate": 1.8453041183783953e-06, "loss": 1.3724, "mean_token_accuracy": 0.6649182379245758, "num_tokens": 2721514.0, "step": 1030 }, { "entropy": 1.867113146185875, "epoch": 1.736842105263158, "grad_norm": 1.5146008729934692, "learning_rate": 1.6408609915712397e-06, "loss": 1.4912, "mean_token_accuracy": 0.6285726055502892, "num_tokens": 2749941.0, "step": 1040 }, { "entropy": 1.881867691874504, "epoch": 1.753550543024227, "grad_norm": 1.1970162391662598, "learning_rate": 1.447763007262431e-06, "loss": 1.4274, "mean_token_accuracy": 0.6479133352637291, "num_tokens": 2774956.0, "step": 1050 }, { "entropy": 1.8124820232391357, "epoch": 1.7702589807852966, "grad_norm": 1.135063886642456, "learning_rate": 1.2661741523785637e-06, "loss": 1.3427, "mean_token_accuracy": 0.6474431194365025, "num_tokens": 2805317.0, "step": 1060 }, { "entropy": 1.9482445955276488, "epoch": 1.7869674185463658, "grad_norm": 1.608454942703247, "learning_rate": 1.0962486398096755e-06, "loss": 1.5699, "mean_token_accuracy": 0.6215325608849526, "num_tokens": 2832036.0, "step": 1070 }, { "entropy": 1.945644235610962, "epoch": 1.8036758563074353, "grad_norm": 2.766500949859619, "learning_rate": 9.381307774451348e-07, "loss": 1.5286, "mean_token_accuracy": 0.6369006544351578, "num_tokens": 2855743.0, "step": 1080 }, { "entropy": 1.8863470137119294, "epoch": 1.8203842940685044, "grad_norm": 1.3567376136779785, "learning_rate": 7.919548456213516e-07, "loss": 1.5037, "mean_token_accuracy": 0.6256943866610527, "num_tokens": 2881629.0, "step": 1090 }, { "entropy": 1.7123635321855546, "epoch": 1.837092731829574, "grad_norm": 1.452407956123352, "learning_rate": 6.578449830852279e-07, "loss": 1.4047, "mean_token_accuracy": 0.65928635597229, "num_tokens": 2908678.0, "step": 1100 }, { "entropy": 1.8440750926733016, "epoch": 1.8538011695906431, "grad_norm": 1.331641435623169, "learning_rate": 5.359150815703629e-07, "loss": 1.4408, "mean_token_accuracy": 0.6417556688189506, "num_tokens": 2937408.0, "step": 1110 }, { "entropy": 1.919699600338936, "epoch": 1.8705096073517127, "grad_norm": 1.5076831579208374, "learning_rate": 4.2626868907536034e-07, "loss": 1.4585, "mean_token_accuracy": 0.6399278536438942, "num_tokens": 2962447.0, "step": 1120 }, { "entropy": 1.8350076168775558, "epoch": 1.8872180451127818, "grad_norm": 1.707216739654541, "learning_rate": 3.289989219265466e-07, "loss": 1.4885, "mean_token_accuracy": 0.6428370043635369, "num_tokens": 2987461.0, "step": 1130 }, { "entropy": 1.8436865717172624, "epoch": 1.9039264828738514, "grad_norm": 2.3200364112854004, "learning_rate": 2.441883856996513e-07, "loss": 1.5211, "mean_token_accuracy": 0.640003713965416, "num_tokens": 3011483.0, "step": 1140 }, { "entropy": 1.8367240995168685, "epoch": 1.9206349206349205, "grad_norm": 1.7375178337097168, "learning_rate": 1.719091050676902e-07, "loss": 1.4348, "mean_token_accuracy": 0.644163503497839, "num_tokens": 3038564.0, "step": 1150 }, { "entropy": 1.9745660215616225, "epoch": 1.93734335839599, "grad_norm": 1.5831917524337769, "learning_rate": 1.1222246263458469e-07, "loss": 1.6556, "mean_token_accuracy": 0.6158465214073658, "num_tokens": 3059313.0, "step": 1160 }, { "entropy": 1.8914725601673126, "epoch": 1.9540517961570592, "grad_norm": 1.6992645263671875, "learning_rate": 6.517914680646386e-08, "loss": 1.39, "mean_token_accuracy": 0.6418868929147721, "num_tokens": 3084800.0, "step": 1170 }, { "entropy": 1.8038861930370331, "epoch": 1.9707602339181287, "grad_norm": 1.4963058233261108, "learning_rate": 3.081910874495497e-08, "loss": 1.3698, "mean_token_accuracy": 0.6503725171089172, "num_tokens": 3113883.0, "step": 1180 }, { "entropy": 1.8201948583126069, "epoch": 1.9874686716791978, "grad_norm": 2.040982723236084, "learning_rate": 9.171528438959852e-09, "loss": 1.4156, "mean_token_accuracy": 0.6591305956244469, "num_tokens": 3138912.0, "step": 1190 } ], "logging_steps": 10, "max_steps": 1198, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.4514045996918374e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }