{ "best_metric": null, "best_model_checkpoint": null, "epoch": 12.121212121212121, "eval_steps": 500, "global_step": 1750, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07, "grad_norm": 0.03181853145360947, "learning_rate": 0.0002, "loss": 0.0117, "step": 10 }, { "epoch": 0.14, "grad_norm": 0.03301709145307541, "learning_rate": 0.0002, "loss": 0.0061, "step": 20 }, { "epoch": 0.21, "grad_norm": 0.12774519622325897, "learning_rate": 0.0002, "loss": 0.008, "step": 30 }, { "epoch": 0.28, "grad_norm": 0.189144566655159, "learning_rate": 0.0002, "loss": 0.0167, "step": 40 }, { "epoch": 0.35, "grad_norm": 0.15458767116069794, "learning_rate": 0.0002, "loss": 0.0116, "step": 50 }, { "epoch": 0.42, "grad_norm": 0.23779332637786865, "learning_rate": 0.0002, "loss": 0.0124, "step": 60 }, { "epoch": 0.48, "grad_norm": 0.17651872336864471, "learning_rate": 0.0002, "loss": 0.0081, "step": 70 }, { "epoch": 0.55, "grad_norm": 0.2730236053466797, "learning_rate": 0.0002, "loss": 0.0196, "step": 80 }, { "epoch": 0.62, "grad_norm": 0.17761944234371185, "learning_rate": 0.0002, "loss": 0.0123, "step": 90 }, { "epoch": 0.69, "grad_norm": 0.02090522274374962, "learning_rate": 0.0002, "loss": 0.008, "step": 100 }, { "epoch": 0.76, "grad_norm": 0.19568940997123718, "learning_rate": 0.0002, "loss": 0.0137, "step": 110 }, { "epoch": 0.83, "grad_norm": 0.09299980849027634, "learning_rate": 0.0002, "loss": 0.0179, "step": 120 }, { "epoch": 0.9, "grad_norm": 0.039965927600860596, "learning_rate": 0.0002, "loss": 0.0094, "step": 130 }, { "epoch": 0.97, "grad_norm": 0.05453222617506981, "learning_rate": 0.0002, "loss": 0.0082, "step": 140 }, { "epoch": 1.04, "grad_norm": 0.08042442053556442, "learning_rate": 0.0002, "loss": 0.0144, "step": 150 }, { "epoch": 1.11, "grad_norm": 0.066365547478199, "learning_rate": 0.0002, "loss": 0.0106, "step": 160 }, { "epoch": 1.18, "grad_norm": 0.06514760106801987, "learning_rate": 0.0002, "loss": 0.0104, "step": 170 }, { "epoch": 1.25, "grad_norm": 0.019024183973670006, "learning_rate": 0.0002, "loss": 0.0088, "step": 180 }, { "epoch": 1.32, "grad_norm": 0.1361686736345291, "learning_rate": 0.0002, "loss": 0.013, "step": 190 }, { "epoch": 1.39, "grad_norm": 0.07207699865102768, "learning_rate": 0.0002, "loss": 0.0093, "step": 200 }, { "epoch": 1.45, "grad_norm": 0.03459335118532181, "learning_rate": 0.0002, "loss": 0.0106, "step": 210 }, { "epoch": 1.52, "grad_norm": 0.03898346796631813, "learning_rate": 0.0002, "loss": 0.0103, "step": 220 }, { "epoch": 1.59, "grad_norm": 0.17066022753715515, "learning_rate": 0.0002, "loss": 0.0154, "step": 230 }, { "epoch": 1.66, "grad_norm": 0.11916259676218033, "learning_rate": 0.0002, "loss": 0.0054, "step": 240 }, { "epoch": 1.73, "grad_norm": 0.1582280397415161, "learning_rate": 0.0002, "loss": 0.0058, "step": 250 }, { "epoch": 1.8, "grad_norm": 0.11945289373397827, "learning_rate": 0.0002, "loss": 0.0168, "step": 260 }, { "epoch": 1.87, "grad_norm": 0.11030268669128418, "learning_rate": 0.0002, "loss": 0.0062, "step": 270 }, { "epoch": 1.94, "grad_norm": 0.19329409301280975, "learning_rate": 0.0002, "loss": 0.0107, "step": 280 }, { "epoch": 2.01, "grad_norm": 0.19244401156902313, "learning_rate": 0.0002, "loss": 0.0079, "step": 290 }, { "epoch": 2.08, "grad_norm": 0.17133064568042755, "learning_rate": 0.0002, "loss": 0.0121, "step": 300 }, { "epoch": 2.15, "grad_norm": 0.2458006590604782, "learning_rate": 0.0002, "loss": 0.0099, "step": 310 }, { "epoch": 2.22, "grad_norm": 0.04351121559739113, "learning_rate": 0.0002, "loss": 0.0151, "step": 320 }, { "epoch": 2.29, "grad_norm": 0.11653552204370499, "learning_rate": 0.0002, "loss": 0.0215, "step": 330 }, { "epoch": 2.35, "grad_norm": 0.17622575163841248, "learning_rate": 0.0002, "loss": 0.0113, "step": 340 }, { "epoch": 2.42, "grad_norm": 0.06326813995838165, "learning_rate": 0.0002, "loss": 0.0088, "step": 350 }, { "epoch": 2.49, "grad_norm": 0.05258811265230179, "learning_rate": 0.0002, "loss": 0.0209, "step": 360 }, { "epoch": 2.56, "grad_norm": 0.14271198213100433, "learning_rate": 0.0002, "loss": 0.0165, "step": 370 }, { "epoch": 2.63, "grad_norm": 0.2666095495223999, "learning_rate": 0.0002, "loss": 0.0074, "step": 380 }, { "epoch": 2.7, "grad_norm": 0.33327990770339966, "learning_rate": 0.0002, "loss": 0.0119, "step": 390 }, { "epoch": 2.77, "grad_norm": 0.16939572989940643, "learning_rate": 0.0002, "loss": 0.0198, "step": 400 }, { "epoch": 2.84, "grad_norm": 0.15744081139564514, "learning_rate": 0.0002, "loss": 0.0128, "step": 410 }, { "epoch": 2.91, "grad_norm": 0.08405949920415878, "learning_rate": 0.0002, "loss": 0.0097, "step": 420 }, { "epoch": 2.98, "grad_norm": 0.19052201509475708, "learning_rate": 0.0002, "loss": 0.0044, "step": 430 }, { "epoch": 3.05, "grad_norm": 0.07543158531188965, "learning_rate": 0.0002, "loss": 0.013, "step": 440 }, { "epoch": 3.12, "grad_norm": 0.13838960230350494, "learning_rate": 0.0002, "loss": 0.0128, "step": 450 }, { "epoch": 3.19, "grad_norm": 0.1489538550376892, "learning_rate": 0.0002, "loss": 0.0107, "step": 460 }, { "epoch": 3.26, "grad_norm": 0.14335639774799347, "learning_rate": 0.0002, "loss": 0.0095, "step": 470 }, { "epoch": 3.32, "grad_norm": 0.07466119527816772, "learning_rate": 0.0002, "loss": 0.0124, "step": 480 }, { "epoch": 3.39, "grad_norm": 0.175484761595726, "learning_rate": 0.0002, "loss": 0.012, "step": 490 }, { "epoch": 3.46, "grad_norm": 0.15107689797878265, "learning_rate": 0.0002, "loss": 0.0091, "step": 500 }, { "epoch": 3.53, "grad_norm": 0.18606631457805634, "learning_rate": 0.0002, "loss": 0.0141, "step": 510 }, { "epoch": 3.6, "grad_norm": 0.12696044147014618, "learning_rate": 0.0002, "loss": 0.0125, "step": 520 }, { "epoch": 3.67, "grad_norm": 0.11616098135709763, "learning_rate": 0.0002, "loss": 0.0129, "step": 530 }, { "epoch": 3.74, "grad_norm": 0.3160938620567322, "learning_rate": 0.0002, "loss": 0.0085, "step": 540 }, { "epoch": 3.81, "grad_norm": 0.08054643124341965, "learning_rate": 0.0002, "loss": 0.0163, "step": 550 }, { "epoch": 3.88, "grad_norm": 0.06550751626491547, "learning_rate": 0.0002, "loss": 0.0133, "step": 560 }, { "epoch": 3.95, "grad_norm": 0.04109380394220352, "learning_rate": 0.0002, "loss": 0.0085, "step": 570 }, { "epoch": 4.02, "grad_norm": 0.09408660233020782, "learning_rate": 0.0002, "loss": 0.0127, "step": 580 }, { "epoch": 4.09, "grad_norm": 0.2768724858760834, "learning_rate": 0.0002, "loss": 0.0114, "step": 590 }, { "epoch": 4.16, "grad_norm": 0.02514655888080597, "learning_rate": 0.0002, "loss": 0.0071, "step": 600 }, { "epoch": 4.23, "grad_norm": 0.26696744561195374, "learning_rate": 0.0002, "loss": 0.0066, "step": 610 }, { "epoch": 4.29, "grad_norm": 0.19320635497570038, "learning_rate": 0.0002, "loss": 0.0135, "step": 620 }, { "epoch": 4.36, "grad_norm": 0.08179598301649094, "learning_rate": 0.0002, "loss": 0.0067, "step": 630 }, { "epoch": 4.43, "grad_norm": 0.30935776233673096, "learning_rate": 0.0002, "loss": 0.0083, "step": 640 }, { "epoch": 4.5, "grad_norm": 0.21668316423892975, "learning_rate": 0.0002, "loss": 0.0144, "step": 650 }, { "epoch": 4.57, "grad_norm": 0.18344347178936005, "learning_rate": 0.0002, "loss": 0.0131, "step": 660 }, { "epoch": 4.64, "grad_norm": 0.31796884536743164, "learning_rate": 0.0002, "loss": 0.0101, "step": 670 }, { "epoch": 4.71, "grad_norm": 0.04909071326255798, "learning_rate": 0.0002, "loss": 0.0108, "step": 680 }, { "epoch": 4.78, "grad_norm": 0.08679769188165665, "learning_rate": 0.0002, "loss": 0.0197, "step": 690 }, { "epoch": 4.85, "grad_norm": 0.05100365728139877, "learning_rate": 0.0002, "loss": 0.013, "step": 700 }, { "epoch": 4.92, "grad_norm": 0.08279485255479813, "learning_rate": 0.0002, "loss": 0.0062, "step": 710 }, { "epoch": 4.99, "grad_norm": 0.05374281853437424, "learning_rate": 0.0002, "loss": 0.0101, "step": 720 }, { "epoch": 5.06, "grad_norm": 0.10421579331159592, "learning_rate": 0.0002, "loss": 0.0115, "step": 730 }, { "epoch": 5.13, "grad_norm": 0.022278541699051857, "learning_rate": 0.0002, "loss": 0.0071, "step": 740 }, { "epoch": 5.19, "grad_norm": 0.014159414917230606, "learning_rate": 0.0002, "loss": 0.0076, "step": 750 }, { "epoch": 5.26, "grad_norm": 0.15521597862243652, "learning_rate": 0.0002, "loss": 0.0122, "step": 760 }, { "epoch": 5.33, "grad_norm": 0.0737166702747345, "learning_rate": 0.0002, "loss": 0.0077, "step": 770 }, { "epoch": 5.4, "grad_norm": 0.03631032258272171, "learning_rate": 0.0002, "loss": 0.0051, "step": 780 }, { "epoch": 5.47, "grad_norm": 0.11575569957494736, "learning_rate": 0.0002, "loss": 0.0057, "step": 790 }, { "epoch": 5.54, "grad_norm": 0.13107435405254364, "learning_rate": 0.0002, "loss": 0.0104, "step": 800 }, { "epoch": 5.61, "grad_norm": 0.11719845235347748, "learning_rate": 0.0002, "loss": 0.004, "step": 810 }, { "epoch": 5.68, "grad_norm": 0.024636002257466316, "learning_rate": 0.0002, "loss": 0.0048, "step": 820 }, { "epoch": 5.75, "grad_norm": 0.05650615319609642, "learning_rate": 0.0002, "loss": 0.009, "step": 830 }, { "epoch": 5.82, "grad_norm": 0.11958178132772446, "learning_rate": 0.0002, "loss": 0.0096, "step": 840 }, { "epoch": 5.89, "grad_norm": 0.3462698757648468, "learning_rate": 0.0002, "loss": 0.0079, "step": 850 }, { "epoch": 5.96, "grad_norm": 0.07302005589008331, "learning_rate": 0.0002, "loss": 0.007, "step": 860 }, { "epoch": 6.03, "grad_norm": 0.24322502315044403, "learning_rate": 0.0002, "loss": 0.0118, "step": 870 }, { "epoch": 6.1, "grad_norm": 0.024749072268605232, "learning_rate": 0.0002, "loss": 0.0076, "step": 880 }, { "epoch": 6.16, "grad_norm": 0.03444315120577812, "learning_rate": 0.0002, "loss": 0.0056, "step": 890 }, { "epoch": 6.23, "grad_norm": 0.36686971783638, "learning_rate": 0.0002, "loss": 0.0066, "step": 900 }, { "epoch": 6.3, "grad_norm": 0.25445130467414856, "learning_rate": 0.0002, "loss": 0.0114, "step": 910 }, { "epoch": 6.37, "grad_norm": 0.09220030158758163, "learning_rate": 0.0002, "loss": 0.0076, "step": 920 }, { "epoch": 6.44, "grad_norm": 0.2540806531906128, "learning_rate": 0.0002, "loss": 0.0102, "step": 930 }, { "epoch": 6.51, "grad_norm": 0.1557808369398117, "learning_rate": 0.0002, "loss": 0.0074, "step": 940 }, { "epoch": 6.58, "grad_norm": 0.05450637638568878, "learning_rate": 0.0002, "loss": 0.0086, "step": 950 }, { "epoch": 6.65, "grad_norm": 0.04527583718299866, "learning_rate": 0.0002, "loss": 0.0038, "step": 960 }, { "epoch": 6.72, "grad_norm": 0.11897213757038116, "learning_rate": 0.0002, "loss": 0.0034, "step": 970 }, { "epoch": 6.79, "grad_norm": 0.10568214952945709, "learning_rate": 0.0002, "loss": 0.0158, "step": 980 }, { "epoch": 6.86, "grad_norm": 0.07142580300569534, "learning_rate": 0.0002, "loss": 0.0077, "step": 990 }, { "epoch": 6.93, "grad_norm": 0.04490859434008598, "learning_rate": 0.0002, "loss": 0.0061, "step": 1000 }, { "epoch": 7.0, "grad_norm": 0.030944261699914932, "learning_rate": 0.0002, "loss": 0.0049, "step": 1010 }, { "epoch": 7.06, "grad_norm": 0.23092027008533478, "learning_rate": 0.0002, "loss": 0.012, "step": 1020 }, { "epoch": 7.13, "grad_norm": 0.07528150826692581, "learning_rate": 0.0002, "loss": 0.005, "step": 1030 }, { "epoch": 7.2, "grad_norm": 0.011815500445663929, "learning_rate": 0.0002, "loss": 0.0074, "step": 1040 }, { "epoch": 7.27, "grad_norm": 0.08818691223859787, "learning_rate": 0.0002, "loss": 0.007, "step": 1050 }, { "epoch": 7.34, "grad_norm": 0.013633369468152523, "learning_rate": 0.0002, "loss": 0.0069, "step": 1060 }, { "epoch": 7.41, "grad_norm": 0.014418787322938442, "learning_rate": 0.0002, "loss": 0.0071, "step": 1070 }, { "epoch": 7.48, "grad_norm": 0.12523452937602997, "learning_rate": 0.0002, "loss": 0.0072, "step": 1080 }, { "epoch": 7.55, "grad_norm": 0.13877837359905243, "learning_rate": 0.0002, "loss": 0.0097, "step": 1090 }, { "epoch": 7.62, "grad_norm": 0.24948883056640625, "learning_rate": 0.0002, "loss": 0.006, "step": 1100 }, { "epoch": 7.69, "grad_norm": 0.021404897794127464, "learning_rate": 0.0002, "loss": 0.0038, "step": 1110 }, { "epoch": 7.76, "grad_norm": 0.1368672251701355, "learning_rate": 0.0002, "loss": 0.0059, "step": 1120 }, { "epoch": 7.83, "grad_norm": 0.03312192112207413, "learning_rate": 0.0002, "loss": 0.007, "step": 1130 }, { "epoch": 7.9, "grad_norm": 0.032206010073423386, "learning_rate": 0.0002, "loss": 0.0021, "step": 1140 }, { "epoch": 7.97, "grad_norm": 0.03399817645549774, "learning_rate": 0.0002, "loss": 0.0045, "step": 1150 }, { "epoch": 8.03, "grad_norm": 0.18925493955612183, "learning_rate": 0.0002, "loss": 0.0093, "step": 1160 }, { "epoch": 8.1, "grad_norm": 0.1772252917289734, "learning_rate": 0.0002, "loss": 0.003, "step": 1170 }, { "epoch": 8.17, "grad_norm": 0.007787138223648071, "learning_rate": 0.0002, "loss": 0.0034, "step": 1180 }, { "epoch": 8.24, "grad_norm": 0.01511111855506897, "learning_rate": 0.0002, "loss": 0.002, "step": 1190 }, { "epoch": 8.31, "grad_norm": 0.03841459006071091, "learning_rate": 0.0002, "loss": 0.0072, "step": 1200 }, { "epoch": 8.38, "grad_norm": 0.0446455255150795, "learning_rate": 0.0002, "loss": 0.0066, "step": 1210 }, { "epoch": 8.45, "grad_norm": 0.050412483513355255, "learning_rate": 0.0002, "loss": 0.0038, "step": 1220 }, { "epoch": 8.52, "grad_norm": 0.05555203557014465, "learning_rate": 0.0002, "loss": 0.0088, "step": 1230 }, { "epoch": 8.59, "grad_norm": 0.07313160598278046, "learning_rate": 0.0002, "loss": 0.0096, "step": 1240 }, { "epoch": 8.66, "grad_norm": 0.06051426753401756, "learning_rate": 0.0002, "loss": 0.0176, "step": 1250 }, { "epoch": 8.73, "grad_norm": 0.052646949887275696, "learning_rate": 0.0002, "loss": 0.0038, "step": 1260 }, { "epoch": 8.8, "grad_norm": 0.05086590349674225, "learning_rate": 0.0002, "loss": 0.0099, "step": 1270 }, { "epoch": 8.87, "grad_norm": 0.07178761065006256, "learning_rate": 0.0002, "loss": 0.0074, "step": 1280 }, { "epoch": 8.94, "grad_norm": 0.009413959458470345, "learning_rate": 0.0002, "loss": 0.0064, "step": 1290 }, { "epoch": 9.0, "grad_norm": 0.10240955650806427, "learning_rate": 0.0002, "loss": 0.0143, "step": 1300 }, { "epoch": 9.07, "grad_norm": 0.01897740177810192, "learning_rate": 0.0002, "loss": 0.0074, "step": 1310 }, { "epoch": 9.14, "grad_norm": 0.022444820031523705, "learning_rate": 0.0002, "loss": 0.0062, "step": 1320 }, { "epoch": 9.21, "grad_norm": 0.0814051404595375, "learning_rate": 0.0002, "loss": 0.0061, "step": 1330 }, { "epoch": 9.28, "grad_norm": 0.18547141551971436, "learning_rate": 0.0002, "loss": 0.0112, "step": 1340 }, { "epoch": 9.35, "grad_norm": 0.022853808477520943, "learning_rate": 0.0002, "loss": 0.0082, "step": 1350 }, { "epoch": 9.42, "grad_norm": 0.14588187634944916, "learning_rate": 0.0002, "loss": 0.0068, "step": 1360 }, { "epoch": 9.49, "grad_norm": 0.10406211018562317, "learning_rate": 0.0002, "loss": 0.0045, "step": 1370 }, { "epoch": 9.56, "grad_norm": 0.02087993547320366, "learning_rate": 0.0002, "loss": 0.0144, "step": 1380 }, { "epoch": 9.63, "grad_norm": 0.015244157053530216, "learning_rate": 0.0002, "loss": 0.0044, "step": 1390 }, { "epoch": 9.7, "grad_norm": 0.030207380652427673, "learning_rate": 0.0002, "loss": 0.0062, "step": 1400 }, { "epoch": 9.77, "grad_norm": 0.17009080946445465, "learning_rate": 0.0002, "loss": 0.0113, "step": 1410 }, { "epoch": 9.84, "grad_norm": 0.11179855465888977, "learning_rate": 0.0002, "loss": 0.0095, "step": 1420 }, { "epoch": 9.9, "grad_norm": 0.07035847008228302, "learning_rate": 0.0002, "loss": 0.0051, "step": 1430 }, { "epoch": 9.97, "grad_norm": 0.2190101593732834, "learning_rate": 0.0002, "loss": 0.0058, "step": 1440 }, { "epoch": 10.04, "grad_norm": 0.043730515986680984, "learning_rate": 0.0002, "loss": 0.0096, "step": 1450 }, { "epoch": 10.11, "grad_norm": 0.05817865580320358, "learning_rate": 0.0002, "loss": 0.0084, "step": 1460 }, { "epoch": 10.18, "grad_norm": 0.26322343945503235, "learning_rate": 0.0002, "loss": 0.0069, "step": 1470 }, { "epoch": 10.25, "grad_norm": 0.0707869678735733, "learning_rate": 0.0002, "loss": 0.0063, "step": 1480 }, { "epoch": 10.32, "grad_norm": 0.06895614415407181, "learning_rate": 0.0002, "loss": 0.0107, "step": 1490 }, { "epoch": 10.39, "grad_norm": 0.014940924942493439, "learning_rate": 0.0002, "loss": 0.006, "step": 1500 }, { "epoch": 10.46, "grad_norm": 0.12346550822257996, "learning_rate": 0.0002, "loss": 0.0101, "step": 1510 }, { "epoch": 10.53, "grad_norm": 0.06773683428764343, "learning_rate": 0.0002, "loss": 0.0108, "step": 1520 }, { "epoch": 10.6, "grad_norm": 0.08521382510662079, "learning_rate": 0.0002, "loss": 0.0088, "step": 1530 }, { "epoch": 10.67, "grad_norm": 0.3153349459171295, "learning_rate": 0.0002, "loss": 0.0113, "step": 1540 }, { "epoch": 10.74, "grad_norm": 0.32351627945899963, "learning_rate": 0.0002, "loss": 0.0095, "step": 1550 }, { "epoch": 10.81, "grad_norm": 0.1824280321598053, "learning_rate": 0.0002, "loss": 0.0163, "step": 1560 }, { "epoch": 10.87, "grad_norm": 0.1012108325958252, "learning_rate": 0.0002, "loss": 0.0078, "step": 1570 }, { "epoch": 10.94, "grad_norm": 0.12629801034927368, "learning_rate": 0.0002, "loss": 0.0086, "step": 1580 }, { "epoch": 11.01, "grad_norm": 0.0864088237285614, "learning_rate": 0.0002, "loss": 0.0104, "step": 1590 }, { "epoch": 11.08, "grad_norm": 0.09980332106351852, "learning_rate": 0.0002, "loss": 0.0082, "step": 1600 }, { "epoch": 11.15, "grad_norm": 0.3955361545085907, "learning_rate": 0.0002, "loss": 0.0161, "step": 1610 }, { "epoch": 11.22, "grad_norm": 0.3551037013530731, "learning_rate": 0.0002, "loss": 0.0105, "step": 1620 }, { "epoch": 11.29, "grad_norm": 0.1988571435213089, "learning_rate": 0.0002, "loss": 0.013, "step": 1630 }, { "epoch": 11.36, "grad_norm": 0.09209605306386948, "learning_rate": 0.0002, "loss": 0.009, "step": 1640 }, { "epoch": 11.43, "grad_norm": 0.08203499764204025, "learning_rate": 0.0002, "loss": 0.0118, "step": 1650 }, { "epoch": 11.5, "grad_norm": 0.05315827578306198, "learning_rate": 0.0002, "loss": 0.0074, "step": 1660 }, { "epoch": 11.57, "grad_norm": 0.06285399943590164, "learning_rate": 0.0002, "loss": 0.014, "step": 1670 }, { "epoch": 11.64, "grad_norm": 0.2484624832868576, "learning_rate": 0.0002, "loss": 0.0154, "step": 1680 }, { "epoch": 11.71, "grad_norm": 0.2875213623046875, "learning_rate": 0.0002, "loss": 0.0061, "step": 1690 }, { "epoch": 11.77, "grad_norm": 0.043303802609443665, "learning_rate": 0.0002, "loss": 0.0096, "step": 1700 }, { "epoch": 11.84, "grad_norm": 0.1575409173965454, "learning_rate": 0.0002, "loss": 0.0129, "step": 1710 }, { "epoch": 11.91, "grad_norm": 0.07893040031194687, "learning_rate": 0.0002, "loss": 0.0061, "step": 1720 }, { "epoch": 11.98, "grad_norm": 0.18039758503437042, "learning_rate": 0.0002, "loss": 0.0118, "step": 1730 }, { "epoch": 12.05, "grad_norm": 0.07571464776992798, "learning_rate": 0.0002, "loss": 0.0135, "step": 1740 }, { "epoch": 12.12, "grad_norm": 0.3126212954521179, "learning_rate": 0.0002, "loss": 0.0086, "step": 1750 } ], "logging_steps": 10, "max_steps": 1750, "num_input_tokens_seen": 0, "num_train_epochs": 13, "save_steps": 250, "total_flos": 2.123929865873141e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }