{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.931896354047237, "eval_steps": 30, "global_step": 1600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.018344416418252695, "grad_norm": 11.986655235290527, "learning_rate": 1.0975609756097562e-05, "loss": 2.8049, "step": 10 }, { "epoch": 0.03668883283650539, "grad_norm": 1.3442561626434326, "learning_rate": 2.3170731707317075e-05, "loss": 1.2643, "step": 20 }, { "epoch": 0.05503324925475808, "grad_norm": 1.0081877708435059, "learning_rate": 3.5365853658536584e-05, "loss": 0.7278, "step": 30 }, { "epoch": 0.05503324925475808, "eval_loss": 0.6342164874076843, "eval_runtime": 47.3806, "eval_samples_per_second": 4.854, "eval_steps_per_second": 4.854, "step": 30 }, { "epoch": 0.07337766567301078, "grad_norm": 0.8855087757110596, "learning_rate": 4.75609756097561e-05, "loss": 0.5154, "step": 40 }, { "epoch": 0.09172208209126347, "grad_norm": 0.7658259272575378, "learning_rate": 5.975609756097561e-05, "loss": 0.4091, "step": 50 }, { "epoch": 0.11006649850951616, "grad_norm": 1.1029750108718872, "learning_rate": 7.195121951219513e-05, "loss": 0.3197, "step": 60 }, { "epoch": 0.11006649850951616, "eval_loss": 0.26837778091430664, "eval_runtime": 47.1026, "eval_samples_per_second": 4.883, "eval_steps_per_second": 4.883, "step": 60 }, { "epoch": 0.12841091492776885, "grad_norm": 0.5400053262710571, "learning_rate": 8.414634146341464e-05, "loss": 0.2753, "step": 70 }, { "epoch": 0.14675533134602156, "grad_norm": 0.5874791145324707, "learning_rate": 9.634146341463415e-05, "loss": 0.243, "step": 80 }, { "epoch": 0.16509974776427425, "grad_norm": 0.47156110405921936, "learning_rate": 0.00010853658536585367, "loss": 0.2144, "step": 90 }, { "epoch": 0.16509974776427425, "eval_loss": 0.18793882429599762, "eval_runtime": 46.9461, "eval_samples_per_second": 4.899, "eval_steps_per_second": 4.899, "step": 90 }, { "epoch": 0.18344416418252693, "grad_norm": 0.6732025146484375, "learning_rate": 0.00012073170731707318, "loss": 0.1804, "step": 100 }, { "epoch": 0.20178858060077964, "grad_norm": 0.4138810336589813, "learning_rate": 0.0001329268292682927, "loss": 0.1704, "step": 110 }, { "epoch": 0.22013299701903233, "grad_norm": 0.3548312783241272, "learning_rate": 0.0001451219512195122, "loss": 0.1642, "step": 120 }, { "epoch": 0.22013299701903233, "eval_loss": 0.14807145297527313, "eval_runtime": 47.16, "eval_samples_per_second": 4.877, "eval_steps_per_second": 4.877, "step": 120 }, { "epoch": 0.238477413437285, "grad_norm": 0.5005571842193604, "learning_rate": 0.00015731707317073173, "loss": 0.1688, "step": 130 }, { "epoch": 0.2568218298555377, "grad_norm": 0.4382654130458832, "learning_rate": 0.00016951219512195123, "loss": 0.1479, "step": 140 }, { "epoch": 0.27516624627379044, "grad_norm": 0.3288620114326477, "learning_rate": 0.00018170731707317075, "loss": 0.1542, "step": 150 }, { "epoch": 0.27516624627379044, "eval_loss": 0.13612844049930573, "eval_runtime": 46.9463, "eval_samples_per_second": 4.899, "eval_steps_per_second": 4.899, "step": 150 }, { "epoch": 0.2935106626920431, "grad_norm": 0.48989060521125793, "learning_rate": 0.00019390243902439025, "loss": 0.1536, "step": 160 }, { "epoch": 0.3118550791102958, "grad_norm": 0.2770105004310608, "learning_rate": 0.00019999432180005332, "loss": 0.1407, "step": 170 }, { "epoch": 0.3301994955285485, "grad_norm": 1.1703704595565796, "learning_rate": 0.00019994890006944105, "loss": 0.1485, "step": 180 }, { "epoch": 0.3301994955285485, "eval_loss": 0.13879014551639557, "eval_runtime": 46.9507, "eval_samples_per_second": 4.899, "eval_steps_per_second": 4.899, "step": 180 }, { "epoch": 0.3485439119468012, "grad_norm": 0.4198831617832184, "learning_rate": 0.0001998580772407242, "loss": 0.1488, "step": 190 }, { "epoch": 0.36688832836505386, "grad_norm": 0.5260700583457947, "learning_rate": 0.00019972189456954594, "loss": 0.1589, "step": 200 }, { "epoch": 0.3852327447833066, "grad_norm": 0.34109413623809814, "learning_rate": 0.00019954041391594487, "loss": 0.1429, "step": 210 }, { "epoch": 0.3852327447833066, "eval_loss": 0.1368015855550766, "eval_runtime": 46.9592, "eval_samples_per_second": 4.898, "eval_steps_per_second": 4.898, "step": 210 }, { "epoch": 0.4035771612015593, "grad_norm": 0.5479871034622192, "learning_rate": 0.00019931371771625544, "loss": 0.1413, "step": 220 }, { "epoch": 0.42192157761981197, "grad_norm": 0.2849060297012329, "learning_rate": 0.00019904190894566194, "loss": 0.1329, "step": 230 }, { "epoch": 0.44026599403806466, "grad_norm": 0.2200181484222412, "learning_rate": 0.00019872511107142261, "loss": 0.15, "step": 240 }, { "epoch": 0.44026599403806466, "eval_loss": 0.12958292663097382, "eval_runtime": 46.9085, "eval_samples_per_second": 4.903, "eval_steps_per_second": 4.903, "step": 240 }, { "epoch": 0.45861041045631734, "grad_norm": 0.31030556559562683, "learning_rate": 0.00019836346799678568, "loss": 0.1277, "step": 250 }, { "epoch": 0.47695482687457, "grad_norm": 0.1974978893995285, "learning_rate": 0.00019795714399562197, "loss": 0.1293, "step": 260 }, { "epoch": 0.49529924329282277, "grad_norm": 0.40907976031303406, "learning_rate": 0.00019750632363780505, "loss": 0.1437, "step": 270 }, { "epoch": 0.49529924329282277, "eval_loss": 0.12063605338335037, "eval_runtime": 47.4843, "eval_samples_per_second": 4.844, "eval_steps_per_second": 4.844, "step": 270 }, { "epoch": 0.5136436597110754, "grad_norm": 0.37478116154670715, "learning_rate": 0.00019701121170537125, "loss": 0.1388, "step": 280 }, { "epoch": 0.5319880761293282, "grad_norm": 0.24447715282440186, "learning_rate": 0.00019647203309949913, "loss": 0.1316, "step": 290 }, { "epoch": 0.5503324925475809, "grad_norm": 0.44361746311187744, "learning_rate": 0.00019588903273834953, "loss": 0.14, "step": 300 }, { "epoch": 0.5503324925475809, "eval_loss": 0.12744086980819702, "eval_runtime": 47.4937, "eval_samples_per_second": 4.843, "eval_steps_per_second": 4.843, "step": 300 }, { "epoch": 0.5686769089658336, "grad_norm": 0.734635591506958, "learning_rate": 0.00019526247544581312, "loss": 0.1403, "step": 310 }, { "epoch": 0.5870213253840862, "grad_norm": 0.16803057491779327, "learning_rate": 0.00019459264583121622, "loss": 0.138, "step": 320 }, { "epoch": 0.6053657418023389, "grad_norm": 0.29355189204216003, "learning_rate": 0.00019387984816003867, "loss": 0.1292, "step": 330 }, { "epoch": 0.6053657418023389, "eval_loss": 0.11417385935783386, "eval_runtime": 47.4735, "eval_samples_per_second": 4.845, "eval_steps_per_second": 4.845, "step": 330 }, { "epoch": 0.6237101582205916, "grad_norm": 0.14734932780265808, "learning_rate": 0.00019312440621570355, "loss": 0.1319, "step": 340 }, { "epoch": 0.6420545746388443, "grad_norm": 0.1784052848815918, "learning_rate": 0.00019232666315250078, "loss": 0.133, "step": 350 }, { "epoch": 0.660398991057097, "grad_norm": 0.21272070705890656, "learning_rate": 0.00019148698133971155, "loss": 0.1244, "step": 360 }, { "epoch": 0.660398991057097, "eval_loss": 0.11460884660482407, "eval_runtime": 47.2816, "eval_samples_per_second": 4.864, "eval_steps_per_second": 4.864, "step": 360 }, { "epoch": 0.6787434074753497, "grad_norm": 0.14382557570934296, "learning_rate": 0.0001906057421970046, "loss": 0.1252, "step": 370 }, { "epoch": 0.6970878238936024, "grad_norm": 0.138260617852211, "learning_rate": 0.00018968334602117906, "loss": 0.1358, "step": 380 }, { "epoch": 0.715432240311855, "grad_norm": 0.14098672568798065, "learning_rate": 0.00018872021180433232, "loss": 0.1183, "step": 390 }, { "epoch": 0.715432240311855, "eval_loss": 0.11497555673122406, "eval_runtime": 47.5258, "eval_samples_per_second": 4.839, "eval_steps_per_second": 4.839, "step": 390 }, { "epoch": 0.7337766567301077, "grad_norm": 0.5053527355194092, "learning_rate": 0.0001877167770435357, "loss": 0.1181, "step": 400 }, { "epoch": 0.7521210731483605, "grad_norm": 0.12653136253356934, "learning_rate": 0.00018667349754210457, "loss": 0.1274, "step": 410 }, { "epoch": 0.7704654895666132, "grad_norm": 0.11409477144479752, "learning_rate": 0.00018559084720255276, "loss": 0.1336, "step": 420 }, { "epoch": 0.7704654895666132, "eval_loss": 0.11223822832107544, "eval_runtime": 47.4963, "eval_samples_per_second": 4.842, "eval_steps_per_second": 4.842, "step": 420 }, { "epoch": 0.7888099059848659, "grad_norm": 0.12272641062736511, "learning_rate": 0.00018446931781132553, "loss": 0.1297, "step": 430 }, { "epoch": 0.8071543224031186, "grad_norm": 0.1796967089176178, "learning_rate": 0.00018330941881540915, "loss": 0.1378, "step": 440 }, { "epoch": 0.8254987388213713, "grad_norm": 0.11490087956190109, "learning_rate": 0.00018211167709091802, "loss": 0.1262, "step": 450 }, { "epoch": 0.8254987388213713, "eval_loss": 0.11122166365385056, "eval_runtime": 47.6342, "eval_samples_per_second": 4.828, "eval_steps_per_second": 4.828, "step": 450 }, { "epoch": 0.8438431552396239, "grad_norm": 0.1275721788406372, "learning_rate": 0.00018087663670376483, "loss": 0.1244, "step": 460 }, { "epoch": 0.8621875716578766, "grad_norm": 0.10838750749826431, "learning_rate": 0.0001796048586625223, "loss": 0.1278, "step": 470 }, { "epoch": 0.8805319880761293, "grad_norm": 0.132126122713089, "learning_rate": 0.00017829692066358914, "loss": 0.1365, "step": 480 }, { "epoch": 0.8805319880761293, "eval_loss": 0.11278139054775238, "eval_runtime": 47.4269, "eval_samples_per_second": 4.85, "eval_steps_per_second": 4.85, "step": 480 }, { "epoch": 0.898876404494382, "grad_norm": 0.130384624004364, "learning_rate": 0.0001769534168287752, "loss": 0.1215, "step": 490 }, { "epoch": 0.9172208209126347, "grad_norm": 0.11609125882387161, "learning_rate": 0.00017557495743542585, "loss": 0.1178, "step": 500 }, { "epoch": 0.9355652373308874, "grad_norm": 0.12103743851184845, "learning_rate": 0.0001741621686392077, "loss": 0.125, "step": 510 }, { "epoch": 0.9355652373308874, "eval_loss": 0.11113429814577103, "eval_runtime": 47.2207, "eval_samples_per_second": 4.871, "eval_steps_per_second": 4.871, "step": 510 }, { "epoch": 0.95390965374914, "grad_norm": 0.13481061160564423, "learning_rate": 0.00017271569218968175, "loss": 0.1233, "step": 520 }, { "epoch": 0.9722540701673928, "grad_norm": 0.12927769124507904, "learning_rate": 0.00017123618513879295, "loss": 0.1237, "step": 530 }, { "epoch": 0.9905984865856455, "grad_norm": 0.1044018417596817, "learning_rate": 0.00016972431954240906, "loss": 0.1222, "step": 540 }, { "epoch": 0.9905984865856455, "eval_loss": 0.11119447648525238, "eval_runtime": 47.6107, "eval_samples_per_second": 4.831, "eval_steps_per_second": 4.831, "step": 540 }, { "epoch": 1.0073377665673011, "grad_norm": 0.10839453339576721, "learning_rate": 0.0001681807821550438, "loss": 0.1076, "step": 550 }, { "epoch": 1.0256821829855538, "grad_norm": 0.151426762342453, "learning_rate": 0.00016660627411790329, "loss": 0.1131, "step": 560 }, { "epoch": 1.0440265994038065, "grad_norm": 0.1167786568403244, "learning_rate": 0.00016500151064039766, "loss": 0.1332, "step": 570 }, { "epoch": 1.0440265994038065, "eval_loss": 0.11187437176704407, "eval_runtime": 47.4208, "eval_samples_per_second": 4.85, "eval_steps_per_second": 4.85, "step": 570 }, { "epoch": 1.0623710158220592, "grad_norm": 0.0833975151181221, "learning_rate": 0.0001633672206752621, "loss": 0.1236, "step": 580 }, { "epoch": 1.0807154322403119, "grad_norm": 0.08196871727705002, "learning_rate": 0.00016170414658743488, "loss": 0.1177, "step": 590 }, { "epoch": 1.0990598486585645, "grad_norm": 0.1361839324235916, "learning_rate": 0.00016001304381684347, "loss": 0.1103, "step": 600 }, { "epoch": 1.0990598486585645, "eval_loss": 0.11141453683376312, "eval_runtime": 47.4348, "eval_samples_per_second": 4.849, "eval_steps_per_second": 4.849, "step": 600 }, { "epoch": 1.1174042650768172, "grad_norm": 0.1309792697429657, "learning_rate": 0.00015829468053525102, "loss": 0.1285, "step": 610 }, { "epoch": 1.13574868149507, "grad_norm": 0.09610354155302048, "learning_rate": 0.00015654983729731977, "loss": 0.1278, "step": 620 }, { "epoch": 1.1540930979133226, "grad_norm": 0.08027515560388565, "learning_rate": 0.00015477930668604916, "loss": 0.117, "step": 630 }, { "epoch": 1.1540930979133226, "eval_loss": 0.11004864424467087, "eval_runtime": 47.4023, "eval_samples_per_second": 4.852, "eval_steps_per_second": 4.852, "step": 630 }, { "epoch": 1.1724375143315753, "grad_norm": 0.25526005029678345, "learning_rate": 0.00015298389295275098, "loss": 0.1179, "step": 640 }, { "epoch": 1.190781930749828, "grad_norm": 0.2913426458835602, "learning_rate": 0.00015116441165172328, "loss": 0.1226, "step": 650 }, { "epoch": 1.2091263471680807, "grad_norm": 0.12240047007799149, "learning_rate": 0.00014932168926979074, "loss": 0.1206, "step": 660 }, { "epoch": 1.2091263471680807, "eval_loss": 0.11615979671478271, "eval_runtime": 47.3821, "eval_samples_per_second": 4.854, "eval_steps_per_second": 4.854, "step": 660 }, { "epoch": 1.2274707635863333, "grad_norm": 0.5275110602378845, "learning_rate": 0.00014745656285087866, "loss": 0.131, "step": 670 }, { "epoch": 1.245815180004586, "grad_norm": 0.08649874478578568, "learning_rate": 0.00014556987961579146, "loss": 0.1273, "step": 680 }, { "epoch": 1.264159596422839, "grad_norm": 0.21493305265903473, "learning_rate": 0.00014366249657736866, "loss": 0.1238, "step": 690 }, { "epoch": 1.264159596422839, "eval_loss": 0.11726190149784088, "eval_runtime": 47.4729, "eval_samples_per_second": 4.845, "eval_steps_per_second": 4.845, "step": 690 }, { "epoch": 1.2825040128410916, "grad_norm": 0.2242494374513626, "learning_rate": 0.00014173528015119246, "loss": 0.1267, "step": 700 }, { "epoch": 1.3008484292593443, "grad_norm": 0.1080465167760849, "learning_rate": 0.0001397891057620247, "loss": 0.1275, "step": 710 }, { "epoch": 1.319192845677597, "grad_norm": 0.11881080269813538, "learning_rate": 0.00013782485744615096, "loss": 0.1265, "step": 720 }, { "epoch": 1.319192845677597, "eval_loss": 0.11231047660112381, "eval_runtime": 47.5156, "eval_samples_per_second": 4.841, "eval_steps_per_second": 4.841, "step": 720 }, { "epoch": 1.3375372620958497, "grad_norm": 0.11510378122329712, "learning_rate": 0.0001358434274498134, "loss": 0.118, "step": 730 }, { "epoch": 1.3558816785141024, "grad_norm": 0.1533641517162323, "learning_rate": 0.00013384571582391393, "loss": 0.1239, "step": 740 }, { "epoch": 1.374226094932355, "grad_norm": 0.09978242218494415, "learning_rate": 0.00013183263001517224, "loss": 0.1206, "step": 750 }, { "epoch": 1.374226094932355, "eval_loss": 0.1097174733877182, "eval_runtime": 47.3489, "eval_samples_per_second": 4.858, "eval_steps_per_second": 4.858, "step": 750 }, { "epoch": 1.3925705113506077, "grad_norm": 0.10042322427034378, "learning_rate": 0.0001298050844539246, "loss": 0.1244, "step": 760 }, { "epoch": 1.4109149277688604, "grad_norm": 0.09835106879472733, "learning_rate": 0.00012776400013875006, "loss": 0.1157, "step": 770 }, { "epoch": 1.429259344187113, "grad_norm": 0.0930342748761177, "learning_rate": 0.00012571030421811314, "loss": 0.1236, "step": 780 }, { "epoch": 1.429259344187113, "eval_loss": 0.11065001785755157, "eval_runtime": 47.4754, "eval_samples_per_second": 4.845, "eval_steps_per_second": 4.845, "step": 780 }, { "epoch": 1.4476037606053658, "grad_norm": 0.08919622749090195, "learning_rate": 0.0001236449295692131, "loss": 0.1261, "step": 790 }, { "epoch": 1.4659481770236185, "grad_norm": 0.0871356800198555, "learning_rate": 0.00012156881437423103, "loss": 0.1215, "step": 800 }, { "epoch": 1.4842925934418711, "grad_norm": 0.11698926240205765, "learning_rate": 0.00011948290169416682, "loss": 0.1269, "step": 810 }, { "epoch": 1.4842925934418711, "eval_loss": 0.10887381434440613, "eval_runtime": 47.4819, "eval_samples_per_second": 4.844, "eval_steps_per_second": 4.844, "step": 810 }, { "epoch": 1.5026370098601238, "grad_norm": 0.06746023893356323, "learning_rate": 0.00011738813904046044, "loss": 0.1122, "step": 820 }, { "epoch": 1.5209814262783765, "grad_norm": 0.09794127196073532, "learning_rate": 0.00011528547794459128, "loss": 0.1161, "step": 830 }, { "epoch": 1.5393258426966292, "grad_norm": 0.15457604825496674, "learning_rate": 0.00011317587352585157, "loss": 0.1281, "step": 840 }, { "epoch": 1.5393258426966292, "eval_loss": 0.10888072103261948, "eval_runtime": 47.3656, "eval_samples_per_second": 4.856, "eval_steps_per_second": 4.856, "step": 840 }, { "epoch": 1.5576702591148819, "grad_norm": 0.08662489801645279, "learning_rate": 0.00011106028405749005, "loss": 0.1191, "step": 850 }, { "epoch": 1.5760146755331346, "grad_norm": 0.08548998087644577, "learning_rate": 0.00010893967053142296, "loss": 0.1196, "step": 860 }, { "epoch": 1.5943590919513873, "grad_norm": 0.10732567310333252, "learning_rate": 0.00010681499622171005, "loss": 0.1246, "step": 870 }, { "epoch": 1.5943590919513873, "eval_loss": 0.10937893390655518, "eval_runtime": 47.5072, "eval_samples_per_second": 4.841, "eval_steps_per_second": 4.841, "step": 870 }, { "epoch": 1.61270350836964, "grad_norm": 0.07822810858488083, "learning_rate": 0.00010468722624699401, "loss": 0.1121, "step": 880 }, { "epoch": 1.6310479247878926, "grad_norm": 0.07400278747081757, "learning_rate": 0.00010255732713210206, "loss": 0.124, "step": 890 }, { "epoch": 1.6493923412061453, "grad_norm": 0.0789525955915451, "learning_rate": 0.00010042626636900856, "loss": 0.1181, "step": 900 }, { "epoch": 1.6493923412061453, "eval_loss": 0.10813174396753311, "eval_runtime": 47.5685, "eval_samples_per_second": 4.835, "eval_steps_per_second": 4.835, "step": 900 }, { "epoch": 1.667736757624398, "grad_norm": 0.07806720584630966, "learning_rate": 9.829501197735866e-05, "loss": 0.1172, "step": 910 }, { "epoch": 1.6860811740426507, "grad_norm": 0.09959676861763, "learning_rate": 9.616453206475179e-05, "loss": 0.1214, "step": 920 }, { "epoch": 1.7044255904609034, "grad_norm": 0.07471271604299545, "learning_rate": 9.40357943869858e-05, "loss": 0.1212, "step": 930 }, { "epoch": 1.7044255904609034, "eval_loss": 0.1081625297665596, "eval_runtime": 47.4204, "eval_samples_per_second": 4.85, "eval_steps_per_second": 4.85, "step": 930 }, { "epoch": 1.722770006879156, "grad_norm": 0.0786692202091217, "learning_rate": 9.190976590846027e-05, "loss": 0.1187, "step": 940 }, { "epoch": 1.7411144232974087, "grad_norm": 0.09310892224311829, "learning_rate": 8.978741236293973e-05, "loss": 0.115, "step": 950 }, { "epoch": 1.7594588397156614, "grad_norm": 0.10510563850402832, "learning_rate": 8.766969781487578e-05, "loss": 0.1254, "step": 960 }, { "epoch": 1.7594588397156614, "eval_loss": 0.10792473703622818, "eval_runtime": 47.4662, "eval_samples_per_second": 4.846, "eval_steps_per_second": 4.846, "step": 960 }, { "epoch": 1.777803256133914, "grad_norm": 0.07966180145740509, "learning_rate": 8.555758422148745e-05, "loss": 0.121, "step": 970 }, { "epoch": 1.7961476725521668, "grad_norm": 0.08049053698778152, "learning_rate": 8.345203099579874e-05, "loss": 0.11, "step": 980 }, { "epoch": 1.8144920889704195, "grad_norm": 0.06832806766033173, "learning_rate": 8.13539945708319e-05, "loss": 0.1236, "step": 990 }, { "epoch": 1.8144920889704195, "eval_loss": 0.10886505246162415, "eval_runtime": 47.5036, "eval_samples_per_second": 4.842, "eval_steps_per_second": 4.842, "step": 990 }, { "epoch": 1.8328365053886724, "grad_norm": 0.08461681008338928, "learning_rate": 7.926442796515429e-05, "loss": 0.1341, "step": 1000 }, { "epoch": 1.851180921806925, "grad_norm": 0.07851795107126236, "learning_rate": 7.71842803499764e-05, "loss": 0.1306, "step": 1010 }, { "epoch": 1.8695253382251777, "grad_norm": 0.0859571248292923, "learning_rate": 7.51144966179972e-05, "loss": 0.1155, "step": 1020 }, { "epoch": 1.8695253382251777, "eval_loss": 0.107667475938797, "eval_runtime": 47.4797, "eval_samples_per_second": 4.844, "eval_steps_per_second": 4.844, "step": 1020 }, { "epoch": 1.8878697546434304, "grad_norm": 0.08879910409450531, "learning_rate": 7.305601695419323e-05, "loss": 0.1173, "step": 1030 }, { "epoch": 1.9062141710616831, "grad_norm": 0.07893572747707367, "learning_rate": 7.10097764087462e-05, "loss": 0.1109, "step": 1040 }, { "epoch": 1.9245585874799358, "grad_norm": 0.0726647600531578, "learning_rate": 6.897670447230262e-05, "loss": 0.1136, "step": 1050 }, { "epoch": 1.9245585874799358, "eval_loss": 0.10811686515808105, "eval_runtime": 47.401, "eval_samples_per_second": 4.852, "eval_steps_per_second": 4.852, "step": 1050 }, { "epoch": 1.9429030038981885, "grad_norm": 0.08151935786008835, "learning_rate": 6.69577246537593e-05, "loss": 0.1145, "step": 1060 }, { "epoch": 1.9612474203164412, "grad_norm": 0.09550312161445618, "learning_rate": 6.495375406076573e-05, "loss": 0.1215, "step": 1070 }, { "epoch": 1.9795918367346939, "grad_norm": 0.20997484028339386, "learning_rate": 6.296570298313431e-05, "loss": 0.1222, "step": 1080 }, { "epoch": 1.9795918367346939, "eval_loss": 0.1070978045463562, "eval_runtime": 47.3536, "eval_samples_per_second": 4.857, "eval_steps_per_second": 4.857, "step": 1080 }, { "epoch": 1.9979362531529465, "grad_norm": 0.07865249365568161, "learning_rate": 6.099447447934743e-05, "loss": 0.1113, "step": 1090 }, { "epoch": 2.0146755331346022, "grad_norm": 0.08859091252088547, "learning_rate": 5.904096396634935e-05, "loss": 0.1253, "step": 1100 }, { "epoch": 2.033019949552855, "grad_norm": 0.08467495441436768, "learning_rate": 5.710605881280939e-05, "loss": 0.1168, "step": 1110 }, { "epoch": 2.033019949552855, "eval_loss": 0.10781675577163696, "eval_runtime": 47.5138, "eval_samples_per_second": 4.841, "eval_steps_per_second": 4.841, "step": 1110 }, { "epoch": 2.0513643659711076, "grad_norm": 0.07836291193962097, "learning_rate": 5.519063793604067e-05, "loss": 0.1138, "step": 1120 }, { "epoch": 2.0697087823893603, "grad_norm": 0.10746737569570541, "learning_rate": 5.329557140275801e-05, "loss": 0.1146, "step": 1130 }, { "epoch": 2.088053198807613, "grad_norm": 0.06803814321756363, "learning_rate": 5.1421720033856216e-05, "loss": 0.1148, "step": 1140 }, { "epoch": 2.088053198807613, "eval_loss": 0.10701934248209, "eval_runtime": 47.4053, "eval_samples_per_second": 4.852, "eval_steps_per_second": 4.852, "step": 1140 }, { "epoch": 2.1063976152258657, "grad_norm": 0.08060242980718613, "learning_rate": 4.9569935013388125e-05, "loss": 0.1099, "step": 1150 }, { "epoch": 2.1247420316441183, "grad_norm": 0.08353856205940247, "learning_rate": 4.774105750192e-05, "loss": 0.1203, "step": 1160 }, { "epoch": 2.143086448062371, "grad_norm": 0.08800723403692245, "learning_rate": 4.593591825444028e-05, "loss": 0.112, "step": 1170 }, { "epoch": 2.143086448062371, "eval_loss": 0.10760512948036194, "eval_runtime": 47.4958, "eval_samples_per_second": 4.843, "eval_steps_per_second": 4.843, "step": 1170 }, { "epoch": 2.1614308644806237, "grad_norm": 0.07223484665155411, "learning_rate": 4.415533724299471e-05, "loss": 0.1221, "step": 1180 }, { "epoch": 2.1797752808988764, "grad_norm": 0.07669170200824738, "learning_rate": 4.240012328421997e-05, "loss": 0.1088, "step": 1190 }, { "epoch": 2.198119697317129, "grad_norm": 0.07591399550437927, "learning_rate": 4.067107367194397e-05, "loss": 0.1115, "step": 1200 }, { "epoch": 2.198119697317129, "eval_loss": 0.1069360300898552, "eval_runtime": 47.5987, "eval_samples_per_second": 4.832, "eval_steps_per_second": 4.832, "step": 1200 }, { "epoch": 2.2164641137353818, "grad_norm": 0.0850834846496582, "learning_rate": 3.8968973815020806e-05, "loss": 0.1149, "step": 1210 }, { "epoch": 2.2348085301536345, "grad_norm": 0.08643563091754913, "learning_rate": 3.729459688056427e-05, "loss": 0.1125, "step": 1220 }, { "epoch": 2.253152946571887, "grad_norm": 0.07351703196763992, "learning_rate": 3.564870344274185e-05, "loss": 0.1099, "step": 1230 }, { "epoch": 2.253152946571887, "eval_loss": 0.10722808539867401, "eval_runtime": 47.5417, "eval_samples_per_second": 4.838, "eval_steps_per_second": 4.838, "step": 1230 }, { "epoch": 2.27149736299014, "grad_norm": 0.09461668133735657, "learning_rate": 3.403204113728933e-05, "loss": 0.1189, "step": 1240 }, { "epoch": 2.2898417794083925, "grad_norm": 0.09557740390300751, "learning_rate": 3.244534432190225e-05, "loss": 0.1219, "step": 1250 }, { "epoch": 2.308186195826645, "grad_norm": 0.09171107411384583, "learning_rate": 3.088933374265919e-05, "loss": 0.1199, "step": 1260 }, { "epoch": 2.308186195826645, "eval_loss": 0.10666479170322418, "eval_runtime": 47.7241, "eval_samples_per_second": 4.819, "eval_steps_per_second": 4.819, "step": 1260 }, { "epoch": 2.326530612244898, "grad_norm": 0.09569697827100754, "learning_rate": 2.936471620662763e-05, "loss": 0.1018, "step": 1270 }, { "epoch": 2.3448750286631506, "grad_norm": 0.10778363794088364, "learning_rate": 2.7872184260801838e-05, "loss": 0.1156, "step": 1280 }, { "epoch": 2.3632194450814032, "grad_norm": 0.07144766300916672, "learning_rate": 2.6412415877518238e-05, "loss": 0.1171, "step": 1290 }, { "epoch": 2.3632194450814032, "eval_loss": 0.10628043115139008, "eval_runtime": 47.8715, "eval_samples_per_second": 4.805, "eval_steps_per_second": 4.805, "step": 1290 }, { "epoch": 2.381563861499656, "grad_norm": 0.10883186757564545, "learning_rate": 2.4986074146490967e-05, "loss": 0.115, "step": 1300 }, { "epoch": 2.3999082779179086, "grad_norm": 0.08375997096300125, "learning_rate": 2.35938069736081e-05, "loss": 0.1134, "step": 1310 }, { "epoch": 2.4182526943361613, "grad_norm": 0.09004294127225876, "learning_rate": 2.2236246786624792e-05, "loss": 0.1067, "step": 1320 }, { "epoch": 2.4182526943361613, "eval_loss": 0.10667029023170471, "eval_runtime": 47.8027, "eval_samples_per_second": 4.811, "eval_steps_per_second": 4.811, "step": 1320 }, { "epoch": 2.436597110754414, "grad_norm": 0.08431920409202576, "learning_rate": 2.091401024788745e-05, "loss": 0.1175, "step": 1330 }, { "epoch": 2.4549415271726667, "grad_norm": 0.06736938655376434, "learning_rate": 1.962769797421895e-05, "loss": 0.1081, "step": 1340 }, { "epoch": 2.4732859435909194, "grad_norm": 0.0825665220618248, "learning_rate": 1.83778942640927e-05, "loss": 0.1144, "step": 1350 }, { "epoch": 2.4732859435909194, "eval_loss": 0.10638684034347534, "eval_runtime": 47.5125, "eval_samples_per_second": 4.841, "eval_steps_per_second": 4.841, "step": 1350 }, { "epoch": 2.491630360009172, "grad_norm": 0.09597857296466827, "learning_rate": 1.716516683221906e-05, "loss": 0.1174, "step": 1360 }, { "epoch": 2.509974776427425, "grad_norm": 0.087304025888443, "learning_rate": 1.5990066551664906e-05, "loss": 0.1217, "step": 1370 }, { "epoch": 2.528319192845678, "grad_norm": 0.1020963117480278, "learning_rate": 1.4853127203623252e-05, "loss": 0.1206, "step": 1380 }, { "epoch": 2.528319192845678, "eval_loss": 0.10611271858215332, "eval_runtime": 47.5951, "eval_samples_per_second": 4.832, "eval_steps_per_second": 4.832, "step": 1380 }, { "epoch": 2.5466636092639305, "grad_norm": 0.09987211227416992, "learning_rate": 1.3754865234946835e-05, "loss": 0.1237, "step": 1390 }, { "epoch": 2.5650080256821832, "grad_norm": 0.09537294507026672, "learning_rate": 1.2695779523555829e-05, "loss": 0.1074, "step": 1400 }, { "epoch": 2.583352442100436, "grad_norm": 0.06951133906841278, "learning_rate": 1.1676351151825804e-05, "loss": 0.1113, "step": 1410 }, { "epoch": 2.583352442100436, "eval_loss": 0.10620437562465668, "eval_runtime": 47.5731, "eval_samples_per_second": 4.835, "eval_steps_per_second": 4.835, "step": 1410 }, { "epoch": 2.6016968585186886, "grad_norm": 0.08393154293298721, "learning_rate": 1.0697043188059475e-05, "loss": 0.1082, "step": 1420 }, { "epoch": 2.6200412749369413, "grad_norm": 0.08919060230255127, "learning_rate": 9.75830047614117e-06, "loss": 0.1129, "step": 1430 }, { "epoch": 2.638385691355194, "grad_norm": 0.08663026988506317, "learning_rate": 8.860549433469444e-06, "loss": 0.1151, "step": 1440 }, { "epoch": 2.638385691355194, "eval_loss": 0.10626456141471863, "eval_runtime": 47.3921, "eval_samples_per_second": 4.853, "eval_steps_per_second": 4.853, "step": 1440 }, { "epoch": 2.6567301077734466, "grad_norm": 0.09450593590736389, "learning_rate": 8.004197857260042e-06, "loss": 0.1154, "step": 1450 }, { "epoch": 2.6750745241916993, "grad_norm": 0.08700842410326004, "learning_rate": 7.189634739306705e-06, "loss": 0.1113, "step": 1460 }, { "epoch": 2.693418940609952, "grad_norm": 0.08897579461336136, "learning_rate": 6.4172300892844425e-06, "loss": 0.1068, "step": 1470 }, { "epoch": 2.693418940609952, "eval_loss": 0.10611724108457565, "eval_runtime": 47.6236, "eval_samples_per_second": 4.83, "eval_steps_per_second": 4.83, "step": 1470 }, { "epoch": 2.7117633570282047, "grad_norm": 0.08531934022903442, "learning_rate": 5.687334766675123e-06, "loss": 0.1057, "step": 1480 }, { "epoch": 2.7301077734464574, "grad_norm": 0.08912410587072372, "learning_rate": 5.000280321392004e-06, "loss": 0.1179, "step": 1490 }, { "epoch": 2.74845218986471, "grad_norm": 0.0903608500957489, "learning_rate": 4.356378843175446e-06, "loss": 0.1137, "step": 1500 }, { "epoch": 2.74845218986471, "eval_loss": 0.10605704039335251, "eval_runtime": 47.5904, "eval_samples_per_second": 4.833, "eval_steps_per_second": 4.833, "step": 1500 }, { "epoch": 2.7667966062829628, "grad_norm": 0.08206541836261749, "learning_rate": 3.75592281982835e-06, "loss": 0.121, "step": 1510 }, { "epoch": 2.7851410227012154, "grad_norm": 0.08107905834913254, "learning_rate": 3.1991850043555425e-06, "loss": 0.1141, "step": 1520 }, { "epoch": 2.803485439119468, "grad_norm": 0.09001165628433228, "learning_rate": 2.6864182910676273e-06, "loss": 0.1135, "step": 1530 }, { "epoch": 2.803485439119468, "eval_loss": 0.10597212612628937, "eval_runtime": 47.6034, "eval_samples_per_second": 4.832, "eval_steps_per_second": 4.832, "step": 1530 }, { "epoch": 2.821829855537721, "grad_norm": 0.08551483601331711, "learning_rate": 2.2178556007054872e-06, "loss": 0.1209, "step": 1540 }, { "epoch": 2.8401742719559735, "grad_norm": 0.09350485354661942, "learning_rate": 1.793709774637653e-06, "loss": 0.1174, "step": 1550 }, { "epoch": 2.858518688374226, "grad_norm": 0.08843007683753967, "learning_rate": 1.41417347817856e-06, "loss": 0.1146, "step": 1560 }, { "epoch": 2.858518688374226, "eval_loss": 0.10600461810827255, "eval_runtime": 47.5109, "eval_samples_per_second": 4.841, "eval_steps_per_second": 4.841, "step": 1560 }, { "epoch": 2.876863104792479, "grad_norm": 0.0968756377696991, "learning_rate": 1.079419113071678e-06, "loss": 0.1239, "step": 1570 }, { "epoch": 2.8952075212107315, "grad_norm": 0.09828708320856094, "learning_rate": 7.895987391771997e-07, "loss": 0.1206, "step": 1580 }, { "epoch": 2.9135519376289842, "grad_norm": 0.09327838569879532, "learning_rate": 5.448440053999137e-07, "loss": 0.1173, "step": 1590 }, { "epoch": 2.9135519376289842, "eval_loss": 0.10602891445159912, "eval_runtime": 47.5894, "eval_samples_per_second": 4.833, "eval_steps_per_second": 4.833, "step": 1590 }, { "epoch": 2.931896354047237, "grad_norm": 0.07366887480020523, "learning_rate": 3.45266089888574e-07, "loss": 0.1076, "step": 1600 } ], "logging_steps": 10, "max_steps": 1638, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.2220701474596557e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }