{ "best_metric": null, "best_model_checkpoint": null, "epoch": 25.0, "eval_steps": 500, "global_step": 3125, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08, "grad_norm": 6.874109268188477, "learning_rate": 2.5559105431309904e-07, "loss": 1.5574, "step": 10 }, { "epoch": 0.16, "grad_norm": 6.183030128479004, "learning_rate": 8.306709265175719e-07, "loss": 1.4724, "step": 20 }, { "epoch": 0.24, "grad_norm": 2.6565823554992676, "learning_rate": 1.4696485623003196e-06, "loss": 1.054, "step": 30 }, { "epoch": 0.32, "grad_norm": 3.227349042892456, "learning_rate": 2.1086261980830672e-06, "loss": 0.8732, "step": 40 }, { "epoch": 0.4, "grad_norm": 1.1049450635910034, "learning_rate": 2.747603833865815e-06, "loss": 0.7617, "step": 50 }, { "epoch": 0.48, "grad_norm": 1.8624663352966309, "learning_rate": 3.386581469648563e-06, "loss": 0.7144, "step": 60 }, { "epoch": 0.56, "grad_norm": 2.1665029525756836, "learning_rate": 4.02555910543131e-06, "loss": 0.7612, "step": 70 }, { "epoch": 0.64, "grad_norm": 1.0787376165390015, "learning_rate": 4.664536741214058e-06, "loss": 0.6864, "step": 80 }, { "epoch": 0.72, "grad_norm": 1.4465893507003784, "learning_rate": 5.303514376996806e-06, "loss": 0.6558, "step": 90 }, { "epoch": 0.8, "grad_norm": 1.1737278699874878, "learning_rate": 5.8785942492012785e-06, "loss": 0.6295, "step": 100 }, { "epoch": 0.88, "grad_norm": 1.7027487754821777, "learning_rate": 6.517571884984026e-06, "loss": 0.638, "step": 110 }, { "epoch": 0.96, "grad_norm": 1.5483449697494507, "learning_rate": 7.156549520766773e-06, "loss": 0.6173, "step": 120 }, { "epoch": 1.04, "grad_norm": 1.9829121828079224, "learning_rate": 7.795527156549521e-06, "loss": 0.5511, "step": 130 }, { "epoch": 1.12, "grad_norm": 1.466599464416504, "learning_rate": 8.434504792332269e-06, "loss": 0.5366, "step": 140 }, { "epoch": 1.2, "grad_norm": 1.5559449195861816, "learning_rate": 9.073482428115017e-06, "loss": 0.5662, "step": 150 }, { "epoch": 1.28, "grad_norm": 1.33512282371521, "learning_rate": 9.712460063897765e-06, "loss": 0.5567, "step": 160 }, { "epoch": 1.3599999999999999, "grad_norm": 0.943450927734375, "learning_rate": 1.035143769968051e-05, "loss": 0.5997, "step": 170 }, { "epoch": 1.44, "grad_norm": 1.2756850719451904, "learning_rate": 1.099041533546326e-05, "loss": 0.5587, "step": 180 }, { "epoch": 1.52, "grad_norm": 1.0768604278564453, "learning_rate": 1.1629392971246008e-05, "loss": 0.6288, "step": 190 }, { "epoch": 1.6, "grad_norm": 1.082158088684082, "learning_rate": 1.2268370607028754e-05, "loss": 0.5475, "step": 200 }, { "epoch": 1.6800000000000002, "grad_norm": 1.4872550964355469, "learning_rate": 1.2907348242811502e-05, "loss": 0.6002, "step": 210 }, { "epoch": 1.76, "grad_norm": 2.0925095081329346, "learning_rate": 1.3546325878594251e-05, "loss": 0.5887, "step": 220 }, { "epoch": 1.8399999999999999, "grad_norm": 1.4571633338928223, "learning_rate": 1.4185303514376998e-05, "loss": 0.6107, "step": 230 }, { "epoch": 1.92, "grad_norm": 2.4871673583984375, "learning_rate": 1.4824281150159745e-05, "loss": 0.5582, "step": 240 }, { "epoch": 2.0, "grad_norm": 1.3184454441070557, "learning_rate": 1.5463258785942495e-05, "loss": 0.582, "step": 250 }, { "epoch": 2.08, "grad_norm": 1.9747495651245117, "learning_rate": 1.610223642172524e-05, "loss": 0.4687, "step": 260 }, { "epoch": 2.16, "grad_norm": 1.7691447734832764, "learning_rate": 1.6741214057507987e-05, "loss": 0.5013, "step": 270 }, { "epoch": 2.24, "grad_norm": 1.8902829885482788, "learning_rate": 1.7380191693290737e-05, "loss": 0.5008, "step": 280 }, { "epoch": 2.32, "grad_norm": 1.3197141885757446, "learning_rate": 1.8019169329073486e-05, "loss": 0.5229, "step": 290 }, { "epoch": 2.4, "grad_norm": 2.325404167175293, "learning_rate": 1.8658146964856232e-05, "loss": 0.4975, "step": 300 }, { "epoch": 2.48, "grad_norm": 1.3323551416397095, "learning_rate": 1.929712460063898e-05, "loss": 0.5254, "step": 310 }, { "epoch": 2.56, "grad_norm": 0.7770269513130188, "learning_rate": 1.9936102236421725e-05, "loss": 0.5304, "step": 320 }, { "epoch": 2.64, "grad_norm": 1.887294054031372, "learning_rate": 1.999949450079496e-05, "loss": 0.4755, "step": 330 }, { "epoch": 2.7199999999999998, "grad_norm": 1.2255381345748901, "learning_rate": 1.9997747161747696e-05, "loss": 0.4982, "step": 340 }, { "epoch": 2.8, "grad_norm": 1.6865767240524292, "learning_rate": 1.9994751960168383e-05, "loss": 0.4857, "step": 350 }, { "epoch": 2.88, "grad_norm": 1.2997519969940186, "learning_rate": 1.999050926990122e-05, "loss": 0.4987, "step": 360 }, { "epoch": 2.96, "grad_norm": 1.4636356830596924, "learning_rate": 1.9985019620494935e-05, "loss": 0.4947, "step": 370 }, { "epoch": 3.04, "grad_norm": 1.3720135688781738, "learning_rate": 1.9978283697136662e-05, "loss": 0.5135, "step": 380 }, { "epoch": 3.12, "grad_norm": 1.6108952760696411, "learning_rate": 1.997030234056645e-05, "loss": 0.4405, "step": 390 }, { "epoch": 3.2, "grad_norm": 2.1790759563446045, "learning_rate": 1.9961076546972304e-05, "loss": 0.4468, "step": 400 }, { "epoch": 3.2800000000000002, "grad_norm": 4.406925678253174, "learning_rate": 1.9950607467865856e-05, "loss": 0.4675, "step": 410 }, { "epoch": 3.36, "grad_norm": 1.7346851825714111, "learning_rate": 1.993889640993864e-05, "loss": 0.4243, "step": 420 }, { "epoch": 3.44, "grad_norm": 1.2865455150604248, "learning_rate": 1.9925944834898992e-05, "loss": 0.3787, "step": 430 }, { "epoch": 3.52, "grad_norm": 1.321824550628662, "learning_rate": 1.991175435928962e-05, "loss": 0.3982, "step": 440 }, { "epoch": 3.6, "grad_norm": 2.45711350440979, "learning_rate": 1.989632675428582e-05, "loss": 0.4811, "step": 450 }, { "epoch": 3.68, "grad_norm": 2.5552310943603516, "learning_rate": 1.9879663945474416e-05, "loss": 0.4255, "step": 460 }, { "epoch": 3.76, "grad_norm": 1.763244867324829, "learning_rate": 1.9861768012613435e-05, "loss": 0.4579, "step": 470 }, { "epoch": 3.84, "grad_norm": 1.0951169729232788, "learning_rate": 1.984264118937249e-05, "loss": 0.398, "step": 480 }, { "epoch": 3.92, "grad_norm": 8.235908508300781, "learning_rate": 1.9822285863054e-05, "loss": 0.4101, "step": 490 }, { "epoch": 4.0, "grad_norm": 19.39371681213379, "learning_rate": 1.9800704574295246e-05, "loss": 0.4645, "step": 500 }, { "epoch": 4.08, "grad_norm": 1.2594205141067505, "learning_rate": 1.9777900016751224e-05, "loss": 0.3632, "step": 510 }, { "epoch": 4.16, "grad_norm": 1.7697833776474, "learning_rate": 1.9753875036758464e-05, "loss": 0.3356, "step": 520 }, { "epoch": 4.24, "grad_norm": 1.66884446144104, "learning_rate": 1.9728632632979746e-05, "loss": 0.382, "step": 530 }, { "epoch": 4.32, "grad_norm": 1.7648805379867554, "learning_rate": 1.970217595602985e-05, "loss": 0.3997, "step": 540 }, { "epoch": 4.4, "grad_norm": 1.6132514476776123, "learning_rate": 1.967450830808228e-05, "loss": 0.3476, "step": 550 }, { "epoch": 4.48, "grad_norm": 1.3217155933380127, "learning_rate": 1.9645633142457143e-05, "loss": 0.3362, "step": 560 }, { "epoch": 4.5600000000000005, "grad_norm": 1.5668702125549316, "learning_rate": 1.9615554063190098e-05, "loss": 0.3996, "step": 570 }, { "epoch": 4.64, "grad_norm": 2.051454544067383, "learning_rate": 1.958427482458253e-05, "loss": 0.4104, "step": 580 }, { "epoch": 4.72, "grad_norm": 1.84868586063385, "learning_rate": 1.9551799330732954e-05, "loss": 0.374, "step": 590 }, { "epoch": 4.8, "grad_norm": 1.3498121500015259, "learning_rate": 1.9518131635049745e-05, "loss": 0.3444, "step": 600 }, { "epoch": 4.88, "grad_norm": 1.962553858757019, "learning_rate": 1.9483275939745184e-05, "loss": 0.3153, "step": 610 }, { "epoch": 4.96, "grad_norm": 1.7132457494735718, "learning_rate": 1.944723659531099e-05, "loss": 0.383, "step": 620 }, { "epoch": 5.04, "grad_norm": 1.8860396146774292, "learning_rate": 1.9410018099975297e-05, "loss": 0.327, "step": 630 }, { "epoch": 5.12, "grad_norm": 1.9154820442199707, "learning_rate": 1.9371625099141223e-05, "loss": 0.2704, "step": 640 }, { "epoch": 5.2, "grad_norm": 1.4077069759368896, "learning_rate": 1.9332062384807058e-05, "loss": 0.2659, "step": 650 }, { "epoch": 5.28, "grad_norm": 1.8489642143249512, "learning_rate": 1.9291334894968133e-05, "loss": 0.2523, "step": 660 }, { "epoch": 5.36, "grad_norm": 2.3917829990386963, "learning_rate": 1.9249447713000515e-05, "loss": 0.2885, "step": 670 }, { "epoch": 5.44, "grad_norm": 2.5429599285125732, "learning_rate": 1.9206406067026506e-05, "loss": 0.3097, "step": 680 }, { "epoch": 5.52, "grad_norm": 3.4684863090515137, "learning_rate": 1.9162215329262115e-05, "loss": 0.2868, "step": 690 }, { "epoch": 5.6, "grad_norm": 2.662013292312622, "learning_rate": 1.9116881015346517e-05, "loss": 0.3281, "step": 700 }, { "epoch": 5.68, "grad_norm": 2.223334312438965, "learning_rate": 1.9070408783653627e-05, "loss": 0.2799, "step": 710 }, { "epoch": 5.76, "grad_norm": 2.992532730102539, "learning_rate": 1.9022804434585854e-05, "loss": 0.3252, "step": 720 }, { "epoch": 5.84, "grad_norm": 1.6079481840133667, "learning_rate": 1.8974073909850125e-05, "loss": 0.2791, "step": 730 }, { "epoch": 5.92, "grad_norm": 2.97104549407959, "learning_rate": 1.8924223291716274e-05, "loss": 0.3029, "step": 740 }, { "epoch": 6.0, "grad_norm": 2.1889212131500244, "learning_rate": 1.887325880225789e-05, "loss": 0.293, "step": 750 }, { "epoch": 6.08, "grad_norm": 2.123610019683838, "learning_rate": 1.882118680257572e-05, "loss": 0.2308, "step": 760 }, { "epoch": 6.16, "grad_norm": 3.71563720703125, "learning_rate": 1.8768013792003683e-05, "loss": 0.216, "step": 770 }, { "epoch": 6.24, "grad_norm": 2.459091901779175, "learning_rate": 1.8713746407297703e-05, "loss": 0.1972, "step": 780 }, { "epoch": 6.32, "grad_norm": 2.669945240020752, "learning_rate": 1.8658391421807313e-05, "loss": 0.2478, "step": 790 }, { "epoch": 6.4, "grad_norm": 2.1581931114196777, "learning_rate": 1.8601955744630255e-05, "loss": 0.2295, "step": 800 }, { "epoch": 6.48, "grad_norm": 2.567908763885498, "learning_rate": 1.8544446419750125e-05, "loss": 0.2239, "step": 810 }, { "epoch": 6.5600000000000005, "grad_norm": 2.0205740928649902, "learning_rate": 1.8485870625157186e-05, "loss": 0.216, "step": 820 }, { "epoch": 6.64, "grad_norm": 2.2898030281066895, "learning_rate": 1.8426235671952452e-05, "loss": 0.2343, "step": 830 }, { "epoch": 6.72, "grad_norm": 2.2003941535949707, "learning_rate": 1.836554900343514e-05, "loss": 0.2076, "step": 840 }, { "epoch": 6.8, "grad_norm": 1.5117710828781128, "learning_rate": 1.8303818194173665e-05, "loss": 0.1904, "step": 850 }, { "epoch": 6.88, "grad_norm": 1.2647337913513184, "learning_rate": 1.824105094906021e-05, "loss": 0.2341, "step": 860 }, { "epoch": 6.96, "grad_norm": 2.00254225730896, "learning_rate": 1.8177255102349047e-05, "loss": 0.2288, "step": 870 }, { "epoch": 7.04, "grad_norm": 2.516129493713379, "learning_rate": 1.8112438616678712e-05, "loss": 0.1685, "step": 880 }, { "epoch": 7.12, "grad_norm": 3.6129150390625, "learning_rate": 1.8046609582078147e-05, "loss": 0.1625, "step": 890 }, { "epoch": 7.2, "grad_norm": 3.1554768085479736, "learning_rate": 1.797977621495696e-05, "loss": 0.2011, "step": 900 }, { "epoch": 7.28, "grad_norm": 3.8677282333374023, "learning_rate": 1.7911946857079886e-05, "loss": 0.1579, "step": 910 }, { "epoch": 7.36, "grad_norm": 3.062568426132202, "learning_rate": 1.784312997452562e-05, "loss": 0.1705, "step": 920 }, { "epoch": 7.44, "grad_norm": 1.8281478881835938, "learning_rate": 1.777333415663014e-05, "loss": 0.1708, "step": 930 }, { "epoch": 7.52, "grad_norm": 1.1312201023101807, "learning_rate": 1.7702568114914607e-05, "loss": 0.1644, "step": 940 }, { "epoch": 7.6, "grad_norm": 1.1482549905776978, "learning_rate": 1.7630840681998068e-05, "loss": 0.1297, "step": 950 }, { "epoch": 7.68, "grad_norm": 3.109949827194214, "learning_rate": 1.755816081049501e-05, "loss": 0.1812, "step": 960 }, { "epoch": 7.76, "grad_norm": 1.2507814168930054, "learning_rate": 1.7484537571897943e-05, "loss": 0.1331, "step": 970 }, { "epoch": 7.84, "grad_norm": 2.898353338241577, "learning_rate": 1.740998015544514e-05, "loss": 0.1625, "step": 980 }, { "epoch": 7.92, "grad_norm": 2.2018096446990967, "learning_rate": 1.7334497866973716e-05, "loss": 0.1441, "step": 990 }, { "epoch": 8.0, "grad_norm": 2.1120376586914062, "learning_rate": 1.725810012775808e-05, "loss": 0.1804, "step": 1000 }, { "epoch": 8.08, "grad_norm": 1.815804362297058, "learning_rate": 1.7180796473334075e-05, "loss": 0.1145, "step": 1010 }, { "epoch": 8.16, "grad_norm": 2.5671639442443848, "learning_rate": 1.7102596552308765e-05, "loss": 0.1094, "step": 1020 }, { "epoch": 8.24, "grad_norm": 0.9062528610229492, "learning_rate": 1.7023510125156173e-05, "loss": 0.0924, "step": 1030 }, { "epoch": 8.32, "grad_norm": 3.5460164546966553, "learning_rate": 1.6943547062999027e-05, "loss": 0.1323, "step": 1040 }, { "epoch": 8.4, "grad_norm": 2.856273889541626, "learning_rate": 1.6862717346376706e-05, "loss": 0.0938, "step": 1050 }, { "epoch": 8.48, "grad_norm": 3.177924633026123, "learning_rate": 1.6781031063999515e-05, "loss": 0.1204, "step": 1060 }, { "epoch": 8.56, "grad_norm": 1.3371851444244385, "learning_rate": 1.6698498411489477e-05, "loss": 0.1342, "step": 1070 }, { "epoch": 8.64, "grad_norm": 2.0833916664123535, "learning_rate": 1.6615129690107773e-05, "loss": 0.1337, "step": 1080 }, { "epoch": 8.72, "grad_norm": 2.4857378005981445, "learning_rate": 1.6530935305469e-05, "loss": 0.1233, "step": 1090 }, { "epoch": 8.8, "grad_norm": 1.7040643692016602, "learning_rate": 1.6445925766242392e-05, "loss": 0.121, "step": 1100 }, { "epoch": 8.88, "grad_norm": 1.2290680408477783, "learning_rate": 1.6360111682840184e-05, "loss": 0.1037, "step": 1110 }, { "epoch": 8.96, "grad_norm": 3.9770867824554443, "learning_rate": 1.62735037660933e-05, "loss": 0.1133, "step": 1120 }, { "epoch": 9.04, "grad_norm": 1.0533943176269531, "learning_rate": 1.618611282591446e-05, "loss": 0.0943, "step": 1130 }, { "epoch": 9.12, "grad_norm": 1.9520882368087769, "learning_rate": 1.609794976994897e-05, "loss": 0.0725, "step": 1140 }, { "epoch": 9.2, "grad_norm": 3.610280990600586, "learning_rate": 1.600902560221329e-05, "loss": 0.1015, "step": 1150 }, { "epoch": 9.28, "grad_norm": 1.3682546615600586, "learning_rate": 1.5919351421721548e-05, "loss": 0.0945, "step": 1160 }, { "epoch": 9.36, "grad_norm": 2.125917673110962, "learning_rate": 1.5828938421100266e-05, "loss": 0.0733, "step": 1170 }, { "epoch": 9.44, "grad_norm": 2.5206658840179443, "learning_rate": 1.5737797885191316e-05, "loss": 0.0646, "step": 1180 }, { "epoch": 9.52, "grad_norm": 2.5717523097991943, "learning_rate": 1.5645941189643444e-05, "loss": 0.0836, "step": 1190 }, { "epoch": 9.6, "grad_norm": 2.1724965572357178, "learning_rate": 1.55533797994924e-05, "loss": 0.0933, "step": 1200 }, { "epoch": 9.68, "grad_norm": 3.0643951892852783, "learning_rate": 1.546012526772996e-05, "loss": 0.083, "step": 1210 }, { "epoch": 9.76, "grad_norm": 1.3585115671157837, "learning_rate": 1.5366189233861933e-05, "loss": 0.0804, "step": 1220 }, { "epoch": 9.84, "grad_norm": 2.106403112411499, "learning_rate": 1.5271583422455373e-05, "loss": 0.0941, "step": 1230 }, { "epoch": 9.92, "grad_norm": 1.7128456830978394, "learning_rate": 1.5176319641675213e-05, "loss": 0.0668, "step": 1240 }, { "epoch": 10.0, "grad_norm": 1.7037241458892822, "learning_rate": 1.5080409781810406e-05, "loss": 0.1043, "step": 1250 }, { "epoch": 10.08, "grad_norm": 0.8989495038986206, "learning_rate": 1.4983865813789869e-05, "loss": 0.0521, "step": 1260 }, { "epoch": 10.16, "grad_norm": 1.3013043403625488, "learning_rate": 1.488669978768833e-05, "loss": 0.0647, "step": 1270 }, { "epoch": 10.24, "grad_norm": 0.6377755999565125, "learning_rate": 1.47889238312223e-05, "loss": 0.0367, "step": 1280 }, { "epoch": 10.32, "grad_norm": 2.2679171562194824, "learning_rate": 1.4690550148236371e-05, "loss": 0.0769, "step": 1290 }, { "epoch": 10.4, "grad_norm": 2.3292899131774902, "learning_rate": 1.4591591017179993e-05, "loss": 0.0908, "step": 1300 }, { "epoch": 10.48, "grad_norm": 1.8322817087173462, "learning_rate": 1.4492058789574948e-05, "loss": 0.0551, "step": 1310 }, { "epoch": 10.56, "grad_norm": 5.285430431365967, "learning_rate": 1.4391965888473705e-05, "loss": 0.0584, "step": 1320 }, { "epoch": 10.64, "grad_norm": 0.768731415271759, "learning_rate": 1.4291324806908846e-05, "loss": 0.0833, "step": 1330 }, { "epoch": 10.72, "grad_norm": 1.1357346773147583, "learning_rate": 1.419014810633374e-05, "loss": 0.0596, "step": 1340 }, { "epoch": 10.8, "grad_norm": 0.9390424489974976, "learning_rate": 1.408844841505473e-05, "loss": 0.0436, "step": 1350 }, { "epoch": 10.88, "grad_norm": 0.775092363357544, "learning_rate": 1.3986238426654894e-05, "loss": 0.0663, "step": 1360 }, { "epoch": 10.96, "grad_norm": 1.6158931255340576, "learning_rate": 1.3883530898409736e-05, "loss": 0.0438, "step": 1370 }, { "epoch": 11.04, "grad_norm": 1.8279812335968018, "learning_rate": 1.3780338649694874e-05, "loss": 0.0944, "step": 1380 }, { "epoch": 11.12, "grad_norm": 1.8661407232284546, "learning_rate": 1.3676674560386018e-05, "loss": 0.0398, "step": 1390 }, { "epoch": 11.2, "grad_norm": 1.8592077493667603, "learning_rate": 1.357255156925136e-05, "loss": 0.0561, "step": 1400 }, { "epoch": 11.28, "grad_norm": 3.479532241821289, "learning_rate": 1.3467982672336633e-05, "loss": 0.0408, "step": 1410 }, { "epoch": 11.36, "grad_norm": 2.4556691646575928, "learning_rate": 1.336298092134302e-05, "loss": 0.0603, "step": 1420 }, { "epoch": 11.44, "grad_norm": 2.028334379196167, "learning_rate": 1.325755942199812e-05, "loss": 0.0423, "step": 1430 }, { "epoch": 11.52, "grad_norm": 0.6319141387939453, "learning_rate": 1.3151731332420152e-05, "loss": 0.0717, "step": 1440 }, { "epoch": 11.6, "grad_norm": 1.3395280838012695, "learning_rate": 1.3045509861475645e-05, "loss": 0.0505, "step": 1450 }, { "epoch": 11.68, "grad_norm": 2.619269847869873, "learning_rate": 1.293890826713077e-05, "loss": 0.0477, "step": 1460 }, { "epoch": 11.76, "grad_norm": 2.6248464584350586, "learning_rate": 1.2831939854796567e-05, "loss": 0.0426, "step": 1470 }, { "epoch": 11.84, "grad_norm": 1.672989845275879, "learning_rate": 1.2724617975668229e-05, "loss": 0.0533, "step": 1480 }, { "epoch": 11.92, "grad_norm": 5.669078350067139, "learning_rate": 1.2616956025058688e-05, "loss": 0.0413, "step": 1490 }, { "epoch": 12.0, "grad_norm": 2.987652063369751, "learning_rate": 1.2508967440726689e-05, "loss": 0.0442, "step": 1500 }, { "epoch": 12.08, "grad_norm": 1.2909622192382812, "learning_rate": 1.2400665701199541e-05, "loss": 0.0565, "step": 1510 }, { "epoch": 12.16, "grad_norm": 0.264878511428833, "learning_rate": 1.2292064324090842e-05, "loss": 0.0243, "step": 1520 }, { "epoch": 12.24, "grad_norm": 2.0698373317718506, "learning_rate": 1.2183176864413262e-05, "loss": 0.0241, "step": 1530 }, { "epoch": 12.32, "grad_norm": 0.20062156021595, "learning_rate": 1.2074016912886683e-05, "loss": 0.0204, "step": 1540 }, { "epoch": 12.4, "grad_norm": 0.5076002478599548, "learning_rate": 1.1964598094241884e-05, "loss": 0.0462, "step": 1550 }, { "epoch": 12.48, "grad_norm": 1.1509472131729126, "learning_rate": 1.1854934065519986e-05, "loss": 0.0302, "step": 1560 }, { "epoch": 12.56, "grad_norm": 1.603930950164795, "learning_rate": 1.1745038514367853e-05, "loss": 0.0282, "step": 1570 }, { "epoch": 12.64, "grad_norm": 0.9757488965988159, "learning_rate": 1.1634925157329668e-05, "loss": 0.0387, "step": 1580 }, { "epoch": 12.72, "grad_norm": 1.058209776878357, "learning_rate": 1.1524607738134928e-05, "loss": 0.0445, "step": 1590 }, { "epoch": 12.8, "grad_norm": 2.3291449546813965, "learning_rate": 1.141410002598301e-05, "loss": 0.0572, "step": 1600 }, { "epoch": 12.88, "grad_norm": 0.8108656406402588, "learning_rate": 1.1303415813824599e-05, "loss": 0.0337, "step": 1610 }, { "epoch": 12.96, "grad_norm": 6.551490783691406, "learning_rate": 1.1192568916640107e-05, "loss": 0.0337, "step": 1620 }, { "epoch": 13.04, "grad_norm": 0.2524939477443695, "learning_rate": 1.1081573169715379e-05, "loss": 0.0218, "step": 1630 }, { "epoch": 13.12, "grad_norm": 0.17596031725406647, "learning_rate": 1.0970442426914847e-05, "loss": 0.0132, "step": 1640 }, { "epoch": 13.2, "grad_norm": 0.7680679559707642, "learning_rate": 1.085919055895237e-05, "loss": 0.0207, "step": 1650 }, { "epoch": 13.28, "grad_norm": 0.24066166579723358, "learning_rate": 1.0747831451659967e-05, "loss": 0.0142, "step": 1660 }, { "epoch": 13.36, "grad_norm": 3.3667285442352295, "learning_rate": 1.0636379004254665e-05, "loss": 0.0183, "step": 1670 }, { "epoch": 13.44, "grad_norm": 1.8831534385681152, "learning_rate": 1.0524847127603677e-05, "loss": 0.0253, "step": 1680 }, { "epoch": 13.52, "grad_norm": 1.329085111618042, "learning_rate": 1.0413249742488132e-05, "loss": 0.0118, "step": 1690 }, { "epoch": 13.6, "grad_norm": 2.0776255130767822, "learning_rate": 1.030160077786556e-05, "loss": 0.0128, "step": 1700 }, { "epoch": 13.68, "grad_norm": 1.02082097530365, "learning_rate": 1.0189914169131341e-05, "loss": 0.0406, "step": 1710 }, { "epoch": 13.76, "grad_norm": 0.7947062253952026, "learning_rate": 1.0078203856379394e-05, "loss": 0.0183, "step": 1720 }, { "epoch": 13.84, "grad_norm": 0.3098564147949219, "learning_rate": 9.966483782662233e-06, "loss": 0.0251, "step": 1730 }, { "epoch": 13.92, "grad_norm": 0.4250534772872925, "learning_rate": 9.854767892250692e-06, "loss": 0.0394, "step": 1740 }, { "epoch": 14.0, "grad_norm": 1.0927317142486572, "learning_rate": 9.743070128893452e-06, "loss": 0.0187, "step": 1750 }, { "epoch": 14.08, "grad_norm": 0.5906019806861877, "learning_rate": 9.631404434076687e-06, "loss": 0.0042, "step": 1760 }, { "epoch": 14.16, "grad_norm": 0.4497601389884949, "learning_rate": 9.519784745283956e-06, "loss": 0.0402, "step": 1770 }, { "epoch": 14.24, "grad_norm": 0.2257859855890274, "learning_rate": 9.408224994256603e-06, "loss": 0.0039, "step": 1780 }, { "epoch": 14.32, "grad_norm": 0.9509989023208618, "learning_rate": 9.296739105254869e-06, "loss": 0.0156, "step": 1790 }, { "epoch": 14.4, "grad_norm": 1.7652183771133423, "learning_rate": 9.185340993319977e-06, "loss": 0.0121, "step": 1800 }, { "epoch": 14.48, "grad_norm": 8.208316802978516, "learning_rate": 9.074044562537284e-06, "loss": 0.0389, "step": 1810 }, { "epoch": 14.56, "grad_norm": 0.13252846896648407, "learning_rate": 8.962863704300893e-06, "loss": 0.011, "step": 1820 }, { "epoch": 14.64, "grad_norm": 0.23998035490512848, "learning_rate": 8.851812295579789e-06, "loss": 0.0248, "step": 1830 }, { "epoch": 14.72, "grad_norm": 0.10990134626626968, "learning_rate": 8.740904197185794e-06, "loss": 0.0204, "step": 1840 }, { "epoch": 14.8, "grad_norm": 3.542644500732422, "learning_rate": 8.630153252043543e-06, "loss": 0.0163, "step": 1850 }, { "epoch": 14.88, "grad_norm": 0.7273716926574707, "learning_rate": 8.519573283462688e-06, "loss": 0.0068, "step": 1860 }, { "epoch": 14.96, "grad_norm": 0.036577966064214706, "learning_rate": 8.409178093412549e-06, "loss": 0.0084, "step": 1870 }, { "epoch": 15.04, "grad_norm": 0.1642957627773285, "learning_rate": 8.298981460799426e-06, "loss": 0.0093, "step": 1880 }, { "epoch": 15.12, "grad_norm": 0.11017989367246628, "learning_rate": 8.188997139746807e-06, "loss": 0.0076, "step": 1890 }, { "epoch": 15.2, "grad_norm": 0.04475900158286095, "learning_rate": 8.079238857878631e-06, "loss": 0.0059, "step": 1900 }, { "epoch": 15.28, "grad_norm": 1.0872098207473755, "learning_rate": 7.969720314605915e-06, "loss": 0.0051, "step": 1910 }, { "epoch": 15.36, "grad_norm": 0.32729968428611755, "learning_rate": 7.860455179416837e-06, "loss": 0.0016, "step": 1920 }, { "epoch": 15.44, "grad_norm": 1.3896279335021973, "learning_rate": 7.751457090170616e-06, "loss": 0.0066, "step": 1930 }, { "epoch": 15.52, "grad_norm": 0.24193909764289856, "learning_rate": 7.642739651395295e-06, "loss": 0.0049, "step": 1940 }, { "epoch": 15.6, "grad_norm": 0.0498543456196785, "learning_rate": 7.534316432589706e-06, "loss": 0.0065, "step": 1950 }, { "epoch": 15.68, "grad_norm": 0.053042277693748474, "learning_rate": 7.426200966529795e-06, "loss": 0.0096, "step": 1960 }, { "epoch": 15.76, "grad_norm": 0.6754854917526245, "learning_rate": 7.318406747579556e-06, "loss": 0.0074, "step": 1970 }, { "epoch": 15.84, "grad_norm": 0.05438900366425514, "learning_rate": 7.210947230006713e-06, "loss": 0.0025, "step": 1980 }, { "epoch": 15.92, "grad_norm": 0.0401092991232872, "learning_rate": 7.103835826303451e-06, "loss": 0.0022, "step": 1990 }, { "epoch": 16.0, "grad_norm": 0.05556059256196022, "learning_rate": 6.997085905512346e-06, "loss": 0.0049, "step": 2000 }, { "epoch": 16.08, "grad_norm": 0.07605039328336716, "learning_rate": 6.8907107915577075e-06, "loss": 0.0051, "step": 2010 }, { "epoch": 16.16, "grad_norm": 0.07472195476293564, "learning_rate": 6.7847237615825636e-06, "loss": 0.0031, "step": 2020 }, { "epoch": 16.24, "grad_norm": 0.203807532787323, "learning_rate": 6.6791380442914866e-06, "loss": 0.0014, "step": 2030 }, { "epoch": 16.32, "grad_norm": 0.08696573972702026, "learning_rate": 6.573966818299461e-06, "loss": 0.0024, "step": 2040 }, { "epoch": 16.4, "grad_norm": 0.025404011830687523, "learning_rate": 6.469223210486992e-06, "loss": 0.0007, "step": 2050 }, { "epoch": 16.48, "grad_norm": 0.05262196436524391, "learning_rate": 6.364920294361701e-06, "loss": 0.0006, "step": 2060 }, { "epoch": 16.56, "grad_norm": 0.029066545888781548, "learning_rate": 6.261071088426546e-06, "loss": 0.0004, "step": 2070 }, { "epoch": 16.64, "grad_norm": 0.02618943341076374, "learning_rate": 6.1576885545549355e-06, "loss": 0.0022, "step": 2080 }, { "epoch": 16.72, "grad_norm": 0.06179804354906082, "learning_rate": 6.054785596372894e-06, "loss": 0.0005, "step": 2090 }, { "epoch": 16.8, "grad_norm": 0.014377947896718979, "learning_rate": 5.952375057648509e-06, "loss": 0.0028, "step": 2100 }, { "epoch": 16.88, "grad_norm": 0.039646223187446594, "learning_rate": 5.850469720688847e-06, "loss": 0.001, "step": 2110 }, { "epoch": 16.96, "grad_norm": 0.44483599066734314, "learning_rate": 5.74908230474453e-06, "loss": 0.0044, "step": 2120 }, { "epoch": 17.04, "grad_norm": 0.024487623944878578, "learning_rate": 5.648225464422189e-06, "loss": 0.0007, "step": 2130 }, { "epoch": 17.12, "grad_norm": 0.0326782651245594, "learning_rate": 5.547911788105001e-06, "loss": 0.0008, "step": 2140 }, { "epoch": 17.2, "grad_norm": 0.015867168083786964, "learning_rate": 5.4481537963814675e-06, "loss": 0.0004, "step": 2150 }, { "epoch": 17.28, "grad_norm": 0.022902317345142365, "learning_rate": 5.348963940482663e-06, "loss": 0.0003, "step": 2160 }, { "epoch": 17.36, "grad_norm": 0.024737635627388954, "learning_rate": 5.25035460072814e-06, "loss": 0.0023, "step": 2170 }, { "epoch": 17.44, "grad_norm": 0.019413290545344353, "learning_rate": 5.15233808498071e-06, "loss": 0.0004, "step": 2180 }, { "epoch": 17.52, "grad_norm": 0.01702185347676277, "learning_rate": 5.054926627110208e-06, "loss": 0.0004, "step": 2190 }, { "epoch": 17.6, "grad_norm": 2.464230537414551, "learning_rate": 4.9581323854665695e-06, "loss": 0.0049, "step": 2200 }, { "epoch": 17.68, "grad_norm": 0.013668897561728954, "learning_rate": 4.861967441362262e-06, "loss": 0.0004, "step": 2210 }, { "epoch": 17.76, "grad_norm": 0.014037170447409153, "learning_rate": 4.766443797564375e-06, "loss": 0.0004, "step": 2220 }, { "epoch": 17.84, "grad_norm": 0.011271145194768906, "learning_rate": 4.671573376796511e-06, "loss": 0.0005, "step": 2230 }, { "epoch": 17.92, "grad_norm": 0.015100213699042797, "learning_rate": 4.57736802025065e-06, "loss": 0.0007, "step": 2240 }, { "epoch": 18.0, "grad_norm": 0.1259368360042572, "learning_rate": 4.48383948610919e-06, "loss": 0.0003, "step": 2250 }, { "epoch": 18.08, "grad_norm": 0.04630790278315544, "learning_rate": 4.390999448077375e-06, "loss": 0.0004, "step": 2260 }, { "epoch": 18.16, "grad_norm": 0.03695525601506233, "learning_rate": 4.298859493926235e-06, "loss": 0.0003, "step": 2270 }, { "epoch": 18.24, "grad_norm": 0.00454062782227993, "learning_rate": 4.207431124046267e-06, "loss": 0.0003, "step": 2280 }, { "epoch": 18.32, "grad_norm": 0.016318323090672493, "learning_rate": 4.116725750012035e-06, "loss": 0.0003, "step": 2290 }, { "epoch": 18.4, "grad_norm": 0.02545909956097603, "learning_rate": 4.026754693157816e-06, "loss": 0.0003, "step": 2300 }, { "epoch": 18.48, "grad_norm": 0.014473868533968925, "learning_rate": 3.937529183164562e-06, "loss": 0.0003, "step": 2310 }, { "epoch": 18.56, "grad_norm": 0.014056684449315071, "learning_rate": 3.84906035665826e-06, "loss": 0.0002, "step": 2320 }, { "epoch": 18.64, "grad_norm": 0.013438849709928036, "learning_rate": 3.7613592558199162e-06, "loss": 0.0002, "step": 2330 }, { "epoch": 18.72, "grad_norm": 0.01617852970957756, "learning_rate": 3.6744368270073393e-06, "loss": 0.0007, "step": 2340 }, { "epoch": 18.8, "grad_norm": 0.013655665330588818, "learning_rate": 3.5883039193888914e-06, "loss": 0.0007, "step": 2350 }, { "epoch": 18.88, "grad_norm": 0.014764860272407532, "learning_rate": 3.502971283589326e-06, "loss": 0.0005, "step": 2360 }, { "epoch": 18.96, "grad_norm": 0.03937000408768654, "learning_rate": 3.418449570347986e-06, "loss": 0.0003, "step": 2370 }, { "epoch": 19.04, "grad_norm": 0.014079142361879349, "learning_rate": 3.334749329189415e-06, "loss": 0.0003, "step": 2380 }, { "epoch": 19.12, "grad_norm": 0.016212107613682747, "learning_rate": 3.2518810071066363e-06, "loss": 0.0002, "step": 2390 }, { "epoch": 19.2, "grad_norm": 0.015338711440563202, "learning_rate": 3.1698549472572203e-06, "loss": 0.0003, "step": 2400 }, { "epoch": 19.28, "grad_norm": 0.02485400065779686, "learning_rate": 3.0886813876723075e-06, "loss": 0.0002, "step": 2410 }, { "epoch": 19.36, "grad_norm": 0.017927484586834908, "learning_rate": 3.0083704599787423e-06, "loss": 0.0003, "step": 2420 }, { "epoch": 19.44, "grad_norm": 0.012980809435248375, "learning_rate": 2.9289321881345257e-06, "loss": 0.0005, "step": 2430 }, { "epoch": 19.52, "grad_norm": 0.03192094713449478, "learning_rate": 2.850376487177656e-06, "loss": 0.0002, "step": 2440 }, { "epoch": 19.6, "grad_norm": 0.019058849662542343, "learning_rate": 2.7727131619886017e-06, "loss": 0.0002, "step": 2450 }, { "epoch": 19.68, "grad_norm": 0.012099268846213818, "learning_rate": 2.6959519060665195e-06, "loss": 0.0002, "step": 2460 }, { "epoch": 19.76, "grad_norm": 0.010341337881982327, "learning_rate": 2.6201023003193437e-06, "loss": 0.0002, "step": 2470 }, { "epoch": 19.84, "grad_norm": 0.014265513978898525, "learning_rate": 2.545173811867977e-06, "loss": 0.0002, "step": 2480 }, { "epoch": 19.92, "grad_norm": 0.01814926601946354, "learning_rate": 2.471175792864642e-06, "loss": 0.001, "step": 2490 }, { "epoch": 20.0, "grad_norm": 0.011018522083759308, "learning_rate": 2.3981174793255956e-06, "loss": 0.0002, "step": 2500 }, { "epoch": 20.08, "grad_norm": 0.015308992005884647, "learning_rate": 2.3260079899783492e-06, "loss": 0.0008, "step": 2510 }, { "epoch": 20.16, "grad_norm": 0.010280388407409191, "learning_rate": 2.254856325123529e-06, "loss": 0.0002, "step": 2520 }, { "epoch": 20.24, "grad_norm": 0.011187535710632801, "learning_rate": 2.1846713655114836e-06, "loss": 0.0002, "step": 2530 }, { "epoch": 20.32, "grad_norm": 0.015644947066903114, "learning_rate": 2.115461871233867e-06, "loss": 0.0002, "step": 2540 }, { "epoch": 20.4, "grad_norm": 0.00875458400696516, "learning_rate": 2.0472364806302313e-06, "loss": 0.0002, "step": 2550 }, { "epoch": 20.48, "grad_norm": 0.017235323786735535, "learning_rate": 1.9800037092098477e-06, "loss": 0.0002, "step": 2560 }, { "epoch": 20.56, "grad_norm": 0.03400786966085434, "learning_rate": 1.9137719485888527e-06, "loss": 0.0002, "step": 2570 }, { "epoch": 20.64, "grad_norm": 0.011186143383383751, "learning_rate": 1.8485494654428482e-06, "loss": 0.0002, "step": 2580 }, { "epoch": 20.72, "grad_norm": 0.008517293259501457, "learning_rate": 1.784344400475093e-06, "loss": 0.0002, "step": 2590 }, { "epoch": 20.8, "grad_norm": 0.010979237034916878, "learning_rate": 1.7211647674004483e-06, "loss": 0.0002, "step": 2600 }, { "epoch": 20.88, "grad_norm": 0.01790277473628521, "learning_rate": 1.659018451945128e-06, "loss": 0.0002, "step": 2610 }, { "epoch": 20.96, "grad_norm": 0.01020762138068676, "learning_rate": 1.5979132108624572e-06, "loss": 0.0002, "step": 2620 }, { "epoch": 21.04, "grad_norm": 0.014280925504863262, "learning_rate": 1.5378566709647225e-06, "loss": 0.0003, "step": 2630 }, { "epoch": 21.12, "grad_norm": 0.012542898766696453, "learning_rate": 1.4788563281712253e-06, "loss": 0.0002, "step": 2640 }, { "epoch": 21.2, "grad_norm": 0.0072167362086474895, "learning_rate": 1.420919546572691e-06, "loss": 0.0002, "step": 2650 }, { "epoch": 21.28, "grad_norm": 0.010835397057235241, "learning_rate": 1.364053557512126e-06, "loss": 0.0002, "step": 2660 }, { "epoch": 21.36, "grad_norm": 0.010230830870568752, "learning_rate": 1.308265458682234e-06, "loss": 0.0004, "step": 2670 }, { "epoch": 21.44, "grad_norm": 0.00813596136868, "learning_rate": 1.2535622132395242e-06, "loss": 0.0005, "step": 2680 }, { "epoch": 21.52, "grad_norm": 0.02058684267103672, "learning_rate": 1.1999506489352208e-06, "loss": 0.0002, "step": 2690 }, { "epoch": 21.6, "grad_norm": 0.022641103714704514, "learning_rate": 1.1474374572630432e-06, "loss": 0.0002, "step": 2700 }, { "epoch": 21.68, "grad_norm": 0.015892324969172478, "learning_rate": 1.0960291926240263e-06, "loss": 0.0002, "step": 2710 }, { "epoch": 21.76, "grad_norm": 0.01219907309859991, "learning_rate": 1.0457322715084305e-06, "loss": 0.0002, "step": 2720 }, { "epoch": 21.84, "grad_norm": 0.009200166910886765, "learning_rate": 9.965529716948684e-07, "loss": 0.0002, "step": 2730 }, { "epoch": 21.92, "grad_norm": 0.00976441241800785, "learning_rate": 9.484974314667561e-07, "loss": 0.0002, "step": 2740 }, { "epoch": 22.0, "grad_norm": 0.010954627767205238, "learning_rate": 9.015716488461656e-07, "loss": 0.0002, "step": 2750 }, { "epoch": 22.08, "grad_norm": 0.016307951882481575, "learning_rate": 8.557814808451737e-07, "loss": 0.0002, "step": 2760 }, { "epoch": 22.16, "grad_norm": 0.009793213568627834, "learning_rate": 8.11132642734841e-07, "loss": 0.0001, "step": 2770 }, { "epoch": 22.24, "grad_norm": 0.023596106097102165, "learning_rate": 7.676307073318479e-07, "loss": 0.0002, "step": 2780 }, { "epoch": 22.32, "grad_norm": 0.011904028244316578, "learning_rate": 7.252811043029373e-07, "loss": 0.0004, "step": 2790 }, { "epoch": 22.4, "grad_norm": 0.013149023056030273, "learning_rate": 6.840891194872112e-07, "loss": 0.0002, "step": 2800 }, { "epoch": 22.48, "grad_norm": 0.011410288512706757, "learning_rate": 6.440598942363796e-07, "loss": 0.0002, "step": 2810 }, { "epoch": 22.56, "grad_norm": 0.008377713151276112, "learning_rate": 6.051984247730447e-07, "loss": 0.0002, "step": 2820 }, { "epoch": 22.64, "grad_norm": 0.19204267859458923, "learning_rate": 5.675095615671144e-07, "loss": 0.0005, "step": 2830 }, { "epoch": 22.72, "grad_norm": 0.017059409990906715, "learning_rate": 5.309980087303713e-07, "loss": 0.0002, "step": 2840 }, { "epoch": 22.8, "grad_norm": 0.00994216650724411, "learning_rate": 4.956683234293491e-07, "loss": 0.0002, "step": 2850 }, { "epoch": 22.88, "grad_norm": 0.017344018444418907, "learning_rate": 4.61524915316528e-07, "loss": 0.0001, "step": 2860 }, { "epoch": 22.96, "grad_norm": 0.009385120123624802, "learning_rate": 4.285720459799425e-07, "loss": 0.0002, "step": 2870 }, { "epoch": 23.04, "grad_norm": 0.009774768725037575, "learning_rate": 3.9681382841128323e-07, "loss": 0.0002, "step": 2880 }, { "epoch": 23.12, "grad_norm": 0.13636986911296844, "learning_rate": 3.6625422649252617e-07, "loss": 0.0005, "step": 2890 }, { "epoch": 23.2, "grad_norm": 0.011067167855799198, "learning_rate": 3.368970545011874e-07, "loss": 0.0002, "step": 2900 }, { "epoch": 23.28, "grad_norm": 0.008906451985239983, "learning_rate": 3.0874597663425046e-07, "loss": 0.0002, "step": 2910 }, { "epoch": 23.36, "grad_norm": 0.011505813337862492, "learning_rate": 2.818045065508168e-07, "loss": 0.0001, "step": 2920 }, { "epoch": 23.44, "grad_norm": 0.01766866073012352, "learning_rate": 2.560760069335511e-07, "loss": 0.0002, "step": 2930 }, { "epoch": 23.52, "grad_norm": 0.01433554571121931, "learning_rate": 2.315636890689743e-07, "loss": 0.0002, "step": 2940 }, { "epoch": 23.6, "grad_norm": 0.009151219390332699, "learning_rate": 2.082706124466416e-07, "loss": 0.0002, "step": 2950 }, { "epoch": 23.68, "grad_norm": 0.014051095582544804, "learning_rate": 1.8619968437727954e-07, "loss": 0.0002, "step": 2960 }, { "epoch": 23.76, "grad_norm": 0.01479571033269167, "learning_rate": 1.6535365962991322e-07, "loss": 0.0002, "step": 2970 }, { "epoch": 23.84, "grad_norm": 0.17555338144302368, "learning_rate": 1.4573514008802693e-07, "loss": 0.0005, "step": 2980 }, { "epoch": 23.92, "grad_norm": 0.01707664132118225, "learning_rate": 1.2734657442481368e-07, "loss": 0.0002, "step": 2990 }, { "epoch": 24.0, "grad_norm": 0.023542573675513268, "learning_rate": 1.1019025779754666e-07, "loss": 0.0002, "step": 3000 }, { "epoch": 24.08, "grad_norm": 0.008962544612586498, "learning_rate": 9.426833156111038e-08, "loss": 0.0002, "step": 3010 }, { "epoch": 24.16, "grad_norm": 0.009664681740105152, "learning_rate": 7.958278300072453e-08, "loss": 0.0002, "step": 3020 }, { "epoch": 24.24, "grad_norm": 0.0093710171058774, "learning_rate": 6.613544508391024e-08, "loss": 0.0001, "step": 3030 }, { "epoch": 24.32, "grad_norm": 0.016148734837770462, "learning_rate": 5.392799623170186e-08, "loss": 0.0002, "step": 3040 }, { "epoch": 24.4, "grad_norm": 0.018474169075489044, "learning_rate": 4.2961960109160205e-08, "loss": 0.0002, "step": 3050 }, { "epoch": 24.48, "grad_norm": 0.007969476282596588, "learning_rate": 3.323870543519458e-08, "loss": 0.0001, "step": 3060 }, { "epoch": 24.56, "grad_norm": 0.009026006795465946, "learning_rate": 2.475944581173173e-08, "loss": 0.0002, "step": 3070 }, { "epoch": 24.64, "grad_norm": 0.016195567324757576, "learning_rate": 1.752523957223362e-08, "loss": 0.0002, "step": 3080 }, { "epoch": 24.72, "grad_norm": 0.004977029282599688, "learning_rate": 1.1536989649608699e-08, "loss": 0.0002, "step": 3090 }, { "epoch": 24.8, "grad_norm": 0.008723889477550983, "learning_rate": 6.795443463509799e-09, "loss": 0.0002, "step": 3100 }, { "epoch": 24.88, "grad_norm": 0.01094700489193201, "learning_rate": 3.3011928270454406e-09, "loss": 0.0004, "step": 3110 }, { "epoch": 24.96, "grad_norm": 0.009475067257881165, "learning_rate": 1.054673872914469e-09, "loss": 0.0005, "step": 3120 }, { "epoch": 25.0, "step": 3125, "total_flos": 1.246578685771776e+17, "train_loss": 0.15220735648881645, "train_runtime": 31927.3271, "train_samples_per_second": 0.392, "train_steps_per_second": 0.098 } ], "logging_steps": 10, "max_steps": 3125, "num_input_tokens_seen": 0, "num_train_epochs": 25, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.246578685771776e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }