{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9985842378480414, "eval_steps": 500, "global_step": 3177, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009438414346389807, "grad_norm": 0.7047261682571029, "learning_rate": 2.0833333333333334e-06, "loss": 0.3368, "step": 10 }, { "epoch": 0.018876828692779613, "grad_norm": 0.44565794909772116, "learning_rate": 4.166666666666667e-06, "loss": 0.2696, "step": 20 }, { "epoch": 0.028315243039169418, "grad_norm": 0.2469001773100438, "learning_rate": 6.25e-06, "loss": 0.2345, "step": 30 }, { "epoch": 0.037753657385559226, "grad_norm": 0.17504888016598083, "learning_rate": 8.333333333333334e-06, "loss": 0.213, "step": 40 }, { "epoch": 0.04719207173194903, "grad_norm": 0.14319985578848382, "learning_rate": 1.0416666666666668e-05, "loss": 0.194, "step": 50 }, { "epoch": 0.056630486078338836, "grad_norm": 0.1594795618687463, "learning_rate": 1.25e-05, "loss": 0.1871, "step": 60 }, { "epoch": 0.06606890042472864, "grad_norm": 0.14625288121586646, "learning_rate": 1.4583333333333333e-05, "loss": 0.1791, "step": 70 }, { "epoch": 0.07550731477111845, "grad_norm": 0.44536813060583275, "learning_rate": 1.6666666666666667e-05, "loss": 0.1777, "step": 80 }, { "epoch": 0.08494572911750826, "grad_norm": 0.17858201185676476, "learning_rate": 1.8750000000000002e-05, "loss": 0.1743, "step": 90 }, { "epoch": 0.09438414346389806, "grad_norm": 0.22679600482590775, "learning_rate": 1.9999916822524766e-05, "loss": 0.173, "step": 100 }, { "epoch": 0.10382255781028787, "grad_norm": 0.2245846959065181, "learning_rate": 1.999898109181919e-05, "loss": 0.1697, "step": 110 }, { "epoch": 0.11326097215667767, "grad_norm": 0.21274961909318216, "learning_rate": 1.9997005756177228e-05, "loss": 0.1669, "step": 120 }, { "epoch": 0.12269938650306748, "grad_norm": 0.15535944242468744, "learning_rate": 1.999399102097668e-05, "loss": 0.1664, "step": 130 }, { "epoch": 0.13213780084945728, "grad_norm": 0.1677351780597522, "learning_rate": 1.9989937199662845e-05, "loss": 0.1652, "step": 140 }, { "epoch": 0.1415762151958471, "grad_norm": 0.15664753634679404, "learning_rate": 1.998484471371593e-05, "loss": 0.1619, "step": 150 }, { "epoch": 0.1510146295422369, "grad_norm": 0.1936710831336268, "learning_rate": 1.9978714092607234e-05, "loss": 0.1606, "step": 160 }, { "epoch": 0.16045304388862672, "grad_norm": 0.21595388620143963, "learning_rate": 1.9971545973744102e-05, "loss": 0.16, "step": 170 }, { "epoch": 0.16989145823501653, "grad_norm": 0.19084712938417736, "learning_rate": 1.9963341102403652e-05, "loss": 0.1582, "step": 180 }, { "epoch": 0.1793298725814063, "grad_norm": 0.19755743286804991, "learning_rate": 1.9954100331655265e-05, "loss": 0.1551, "step": 190 }, { "epoch": 0.18876828692779613, "grad_norm": 0.21785901423412252, "learning_rate": 1.9943824622271934e-05, "loss": 0.1559, "step": 200 }, { "epoch": 0.19820670127418594, "grad_norm": 0.1472275149396309, "learning_rate": 1.9932515042630335e-05, "loss": 0.1534, "step": 210 }, { "epoch": 0.20764511562057575, "grad_norm": 0.14184865805893884, "learning_rate": 1.9920172768599763e-05, "loss": 0.1545, "step": 220 }, { "epoch": 0.21708352996696556, "grad_norm": 0.1833363522186809, "learning_rate": 1.9906799083419865e-05, "loss": 0.1543, "step": 230 }, { "epoch": 0.22652194431335534, "grad_norm": 0.141212536462394, "learning_rate": 1.989239537756723e-05, "loss": 0.1538, "step": 240 }, { "epoch": 0.23596035865974516, "grad_norm": 0.1556507383209179, "learning_rate": 1.987696314861082e-05, "loss": 0.1541, "step": 250 }, { "epoch": 0.24539877300613497, "grad_norm": 0.17173283637998002, "learning_rate": 1.986050400105626e-05, "loss": 0.1518, "step": 260 }, { "epoch": 0.25483718735252475, "grad_norm": 0.150629748875022, "learning_rate": 1.9843019646179014e-05, "loss": 0.1501, "step": 270 }, { "epoch": 0.26427560169891456, "grad_norm": 0.19334006281958221, "learning_rate": 1.9824511901846475e-05, "loss": 0.1483, "step": 280 }, { "epoch": 0.2737140160453044, "grad_norm": 0.12527664802237196, "learning_rate": 1.9804982692328944e-05, "loss": 0.1514, "step": 290 }, { "epoch": 0.2831524303916942, "grad_norm": 0.19858199254198736, "learning_rate": 1.9784434048099565e-05, "loss": 0.151, "step": 300 }, { "epoch": 0.292590844738084, "grad_norm": 0.1677799134496006, "learning_rate": 1.976286810562323e-05, "loss": 0.1498, "step": 310 }, { "epoch": 0.3020292590844738, "grad_norm": 0.15743349400071904, "learning_rate": 1.9740287107134417e-05, "loss": 0.1513, "step": 320 }, { "epoch": 0.3114676734308636, "grad_norm": 0.1616061413079364, "learning_rate": 1.97166934004041e-05, "loss": 0.1489, "step": 330 }, { "epoch": 0.32090608777725343, "grad_norm": 0.1878283137943562, "learning_rate": 1.9692089438495622e-05, "loss": 0.1449, "step": 340 }, { "epoch": 0.33034450212364325, "grad_norm": 0.16166637953640253, "learning_rate": 1.9666477779509655e-05, "loss": 0.1469, "step": 350 }, { "epoch": 0.33978291647003306, "grad_norm": 0.12292532976422958, "learning_rate": 1.963986108631823e-05, "loss": 0.1468, "step": 360 }, { "epoch": 0.3492213308164228, "grad_norm": 0.16469874208354635, "learning_rate": 1.9612242126287876e-05, "loss": 0.1483, "step": 370 }, { "epoch": 0.3586597451628126, "grad_norm": 0.13369950841636608, "learning_rate": 1.958362377099191e-05, "loss": 0.1443, "step": 380 }, { "epoch": 0.36809815950920244, "grad_norm": 0.12551292002845085, "learning_rate": 1.9554008995911837e-05, "loss": 0.1463, "step": 390 }, { "epoch": 0.37753657385559225, "grad_norm": 0.14851921568961518, "learning_rate": 1.9523400880128032e-05, "loss": 0.1471, "step": 400 }, { "epoch": 0.38697498820198206, "grad_norm": 0.13162403368451694, "learning_rate": 1.949180260599957e-05, "loss": 0.1452, "step": 410 }, { "epoch": 0.3964134025483719, "grad_norm": 0.12724887401811377, "learning_rate": 1.945921745883337e-05, "loss": 0.1455, "step": 420 }, { "epoch": 0.4058518168947617, "grad_norm": 0.11779906396951238, "learning_rate": 1.9425648826542618e-05, "loss": 0.1435, "step": 430 }, { "epoch": 0.4152902312411515, "grad_norm": 0.1610933457769652, "learning_rate": 1.939110019929451e-05, "loss": 0.1436, "step": 440 }, { "epoch": 0.4247286455875413, "grad_norm": 0.12250958203512534, "learning_rate": 1.935557516914739e-05, "loss": 0.1451, "step": 450 }, { "epoch": 0.4341670599339311, "grad_norm": 0.1380572524992858, "learning_rate": 1.931907742967727e-05, "loss": 0.1444, "step": 460 }, { "epoch": 0.44360547428032093, "grad_norm": 0.13646993698111895, "learning_rate": 1.92816107755938e-05, "loss": 0.142, "step": 470 }, { "epoch": 0.4530438886267107, "grad_norm": 0.11765542306036501, "learning_rate": 1.9243179102345753e-05, "loss": 0.1406, "step": 480 }, { "epoch": 0.4624823029731005, "grad_norm": 0.1266567901893174, "learning_rate": 1.9203786405715984e-05, "loss": 0.144, "step": 490 }, { "epoch": 0.4719207173194903, "grad_norm": 0.1113634311573256, "learning_rate": 1.9163436781405992e-05, "loss": 0.1428, "step": 500 }, { "epoch": 0.4813591316658801, "grad_norm": 0.13808836428511967, "learning_rate": 1.912213442461009e-05, "loss": 0.1399, "step": 510 }, { "epoch": 0.49079754601226994, "grad_norm": 0.1226613837593307, "learning_rate": 1.9079883629579224e-05, "loss": 0.1396, "step": 520 }, { "epoch": 0.5002359603586597, "grad_norm": 0.14272835200919645, "learning_rate": 1.9036688789174496e-05, "loss": 0.1403, "step": 530 }, { "epoch": 0.5096743747050495, "grad_norm": 0.12981510040553715, "learning_rate": 1.899255439441043e-05, "loss": 0.1399, "step": 540 }, { "epoch": 0.5191127890514393, "grad_norm": 0.1190871345092575, "learning_rate": 1.8947485033988034e-05, "loss": 0.1376, "step": 550 }, { "epoch": 0.5285512033978291, "grad_norm": 0.1271477738963388, "learning_rate": 1.8901485393817724e-05, "loss": 0.1415, "step": 560 }, { "epoch": 0.5379896177442189, "grad_norm": 0.12965211048846748, "learning_rate": 1.8854560256532098e-05, "loss": 0.1423, "step": 570 }, { "epoch": 0.5474280320906088, "grad_norm": 0.13373262160455968, "learning_rate": 1.880671450098871e-05, "loss": 0.139, "step": 580 }, { "epoch": 0.5568664464369986, "grad_norm": 0.1322939697550499, "learning_rate": 1.8757953101762786e-05, "loss": 0.1396, "step": 590 }, { "epoch": 0.5663048607833884, "grad_norm": 0.11918437239832326, "learning_rate": 1.8708281128630023e-05, "loss": 0.138, "step": 600 }, { "epoch": 0.5757432751297782, "grad_norm": 0.12338738357381479, "learning_rate": 1.865770374603948e-05, "loss": 0.1406, "step": 610 }, { "epoch": 0.585181689476168, "grad_norm": 0.11573754594906395, "learning_rate": 1.8606226212576612e-05, "loss": 0.138, "step": 620 }, { "epoch": 0.5946201038225578, "grad_norm": 0.1419588706141848, "learning_rate": 1.8553853880416555e-05, "loss": 0.1408, "step": 630 }, { "epoch": 0.6040585181689476, "grad_norm": 0.13998266637185536, "learning_rate": 1.8500592194767625e-05, "loss": 0.1394, "step": 640 }, { "epoch": 0.6134969325153374, "grad_norm": 0.11868995175822014, "learning_rate": 1.8446446693305194e-05, "loss": 0.1384, "step": 650 }, { "epoch": 0.6229353468617272, "grad_norm": 0.1328472026088287, "learning_rate": 1.8391423005595928e-05, "loss": 0.1393, "step": 660 }, { "epoch": 0.6323737612081171, "grad_norm": 0.11726921800593894, "learning_rate": 1.833552685251246e-05, "loss": 0.1398, "step": 670 }, { "epoch": 0.6418121755545069, "grad_norm": 0.11466260187649016, "learning_rate": 1.827876404563861e-05, "loss": 0.1369, "step": 680 }, { "epoch": 0.6512505899008967, "grad_norm": 0.11234281014514101, "learning_rate": 1.8221140486665125e-05, "loss": 0.1346, "step": 690 }, { "epoch": 0.6606890042472865, "grad_norm": 0.11159741277810285, "learning_rate": 1.8162662166776085e-05, "loss": 0.1357, "step": 700 }, { "epoch": 0.6701274185936763, "grad_norm": 0.12752868267859116, "learning_rate": 1.8103335166026002e-05, "loss": 0.1389, "step": 710 }, { "epoch": 0.6795658329400661, "grad_norm": 0.12084535348559353, "learning_rate": 1.804316565270765e-05, "loss": 0.1375, "step": 720 }, { "epoch": 0.6890042472864559, "grad_norm": 0.12102077085461252, "learning_rate": 1.798215988271075e-05, "loss": 0.1364, "step": 730 }, { "epoch": 0.6984426616328456, "grad_norm": 0.11713742692774234, "learning_rate": 1.7920324198871546e-05, "loss": 0.138, "step": 740 }, { "epoch": 0.7078810759792354, "grad_norm": 0.11656355822805255, "learning_rate": 1.785766503031332e-05, "loss": 0.1346, "step": 750 }, { "epoch": 0.7173194903256253, "grad_norm": 0.11377328844943654, "learning_rate": 1.7794188891777964e-05, "loss": 0.1352, "step": 760 }, { "epoch": 0.7267579046720151, "grad_norm": 0.12103799646507679, "learning_rate": 1.7729902382948617e-05, "loss": 0.1353, "step": 770 }, { "epoch": 0.7361963190184049, "grad_norm": 0.1073585292390918, "learning_rate": 1.76648121877635e-05, "loss": 0.1352, "step": 780 }, { "epoch": 0.7456347333647947, "grad_norm": 0.11214075940260533, "learning_rate": 1.759892507372099e-05, "loss": 0.1341, "step": 790 }, { "epoch": 0.7550731477111845, "grad_norm": 0.11706899066793994, "learning_rate": 1.7532247891175968e-05, "loss": 0.1333, "step": 800 }, { "epoch": 0.7645115620575743, "grad_norm": 0.11789888505768062, "learning_rate": 1.746478757262761e-05, "loss": 0.136, "step": 810 }, { "epoch": 0.7739499764039641, "grad_norm": 0.11237848535926774, "learning_rate": 1.739655113199858e-05, "loss": 0.1336, "step": 820 }, { "epoch": 0.7833883907503539, "grad_norm": 0.10753154987431834, "learning_rate": 1.7327545663905813e-05, "loss": 0.1331, "step": 830 }, { "epoch": 0.7928268050967437, "grad_norm": 0.1441522552747225, "learning_rate": 1.7257778342922853e-05, "loss": 0.1328, "step": 840 }, { "epoch": 0.8022652194431336, "grad_norm": 0.1269707942863234, "learning_rate": 1.7187256422833928e-05, "loss": 0.1319, "step": 850 }, { "epoch": 0.8117036337895234, "grad_norm": 0.11091236494221275, "learning_rate": 1.711598723587975e-05, "loss": 0.1324, "step": 860 }, { "epoch": 0.8211420481359132, "grad_norm": 0.10854265306012167, "learning_rate": 1.7043978191995177e-05, "loss": 0.1325, "step": 870 }, { "epoch": 0.830580462482303, "grad_norm": 0.1110467712060928, "learning_rate": 1.6971236778038806e-05, "loss": 0.1315, "step": 880 }, { "epoch": 0.8400188768286928, "grad_norm": 0.12129611756408008, "learning_rate": 1.6897770557014535e-05, "loss": 0.1328, "step": 890 }, { "epoch": 0.8494572911750826, "grad_norm": 0.106781748696916, "learning_rate": 1.682358716728525e-05, "loss": 0.1351, "step": 900 }, { "epoch": 0.8588957055214724, "grad_norm": 0.11020118439519076, "learning_rate": 1.674869432177864e-05, "loss": 0.1325, "step": 910 }, { "epoch": 0.8683341198678622, "grad_norm": 0.11750342557908768, "learning_rate": 1.667309980718529e-05, "loss": 0.1312, "step": 920 }, { "epoch": 0.877772534214252, "grad_norm": 0.12853148875033116, "learning_rate": 1.6596811483149077e-05, "loss": 0.1317, "step": 930 }, { "epoch": 0.8872109485606419, "grad_norm": 0.11393786746070304, "learning_rate": 1.651983728145e-05, "loss": 0.1355, "step": 940 }, { "epoch": 0.8966493629070316, "grad_norm": 0.11013100846033319, "learning_rate": 1.6442185205179507e-05, "loss": 0.1309, "step": 950 }, { "epoch": 0.9060877772534214, "grad_norm": 0.10641286141618259, "learning_rate": 1.6363863327908405e-05, "loss": 0.1339, "step": 960 }, { "epoch": 0.9155261915998112, "grad_norm": 0.11308702339858176, "learning_rate": 1.6284879792847433e-05, "loss": 0.1299, "step": 970 }, { "epoch": 0.924964605946201, "grad_norm": 0.11315250527471539, "learning_rate": 1.620524281200062e-05, "loss": 0.1305, "step": 980 }, { "epoch": 0.9344030202925908, "grad_norm": 0.0972875949252274, "learning_rate": 1.6124960665311447e-05, "loss": 0.1322, "step": 990 }, { "epoch": 0.9438414346389806, "grad_norm": 0.10070713866086979, "learning_rate": 1.6044041699802005e-05, "loss": 0.129, "step": 1000 }, { "epoch": 0.9532798489853704, "grad_norm": 0.11109074990403733, "learning_rate": 1.5962494328705123e-05, "loss": 0.1321, "step": 1010 }, { "epoch": 0.9627182633317602, "grad_norm": 0.1199186598391774, "learning_rate": 1.588032703058964e-05, "loss": 0.1334, "step": 1020 }, { "epoch": 0.9721566776781501, "grad_norm": 0.10777396066893469, "learning_rate": 1.5797548348478893e-05, "loss": 0.1325, "step": 1030 }, { "epoch": 0.9815950920245399, "grad_norm": 0.11999882098060052, "learning_rate": 1.571416688896246e-05, "loss": 0.132, "step": 1040 }, { "epoch": 0.9910335063709297, "grad_norm": 0.10911342083469809, "learning_rate": 1.563019132130136e-05, "loss": 0.1301, "step": 1050 }, { "epoch": 1.0004719207173194, "grad_norm": 0.11482143235010223, "learning_rate": 1.5545630376526665e-05, "loss": 0.1282, "step": 1060 }, { "epoch": 1.0099103350637093, "grad_norm": 0.11699392564682201, "learning_rate": 1.5460492846531748e-05, "loss": 0.1142, "step": 1070 }, { "epoch": 1.019348749410099, "grad_norm": 0.09530170230868218, "learning_rate": 1.5374787583158188e-05, "loss": 0.1157, "step": 1080 }, { "epoch": 1.028787163756489, "grad_norm": 0.09486822390799159, "learning_rate": 1.5288523497275392e-05, "loss": 0.1143, "step": 1090 }, { "epoch": 1.0382255781028786, "grad_norm": 0.09531194088354993, "learning_rate": 1.5201709557854178e-05, "loss": 0.1128, "step": 1100 }, { "epoch": 1.0476639924492686, "grad_norm": 0.11206649112299381, "learning_rate": 1.5114354791034225e-05, "loss": 0.1161, "step": 1110 }, { "epoch": 1.0571024067956583, "grad_norm": 0.10196697892456508, "learning_rate": 1.5026468279185615e-05, "loss": 0.1159, "step": 1120 }, { "epoch": 1.0665408211420482, "grad_norm": 0.10326390731228648, "learning_rate": 1.4938059159964555e-05, "loss": 0.1161, "step": 1130 }, { "epoch": 1.0759792354884379, "grad_norm": 0.09969628777021497, "learning_rate": 1.4849136625363297e-05, "loss": 0.1141, "step": 1140 }, { "epoch": 1.0854176498348278, "grad_norm": 0.0939091288214549, "learning_rate": 1.4759709920754453e-05, "loss": 0.1125, "step": 1150 }, { "epoch": 1.0948560641812175, "grad_norm": 0.09506323252337912, "learning_rate": 1.4669788343929736e-05, "loss": 0.1141, "step": 1160 }, { "epoch": 1.1042944785276074, "grad_norm": 0.10154441851850998, "learning_rate": 1.4579381244133265e-05, "loss": 0.1128, "step": 1170 }, { "epoch": 1.1137328928739971, "grad_norm": 0.11822773479215737, "learning_rate": 1.4488498021089514e-05, "loss": 0.1137, "step": 1180 }, { "epoch": 1.123171307220387, "grad_norm": 0.1009800661484683, "learning_rate": 1.4397148124025997e-05, "loss": 0.1143, "step": 1190 }, { "epoch": 1.1326097215667768, "grad_norm": 0.10126420585679885, "learning_rate": 1.4305341050690845e-05, "loss": 0.117, "step": 1200 }, { "epoch": 1.1420481359131667, "grad_norm": 0.09588874040008494, "learning_rate": 1.421308634636529e-05, "loss": 0.1137, "step": 1210 }, { "epoch": 1.1514865502595564, "grad_norm": 0.10922875430592754, "learning_rate": 1.412039360287126e-05, "loss": 0.1145, "step": 1220 }, { "epoch": 1.1609249646059463, "grad_norm": 0.11298890031757797, "learning_rate": 1.4027272457574082e-05, "loss": 0.1138, "step": 1230 }, { "epoch": 1.170363378952336, "grad_norm": 0.10857815603466196, "learning_rate": 1.3933732592380485e-05, "loss": 0.1135, "step": 1240 }, { "epoch": 1.1798017932987257, "grad_norm": 0.10213279825434321, "learning_rate": 1.3839783732731966e-05, "loss": 0.1134, "step": 1250 }, { "epoch": 1.1892402076451156, "grad_norm": 0.10027833977041692, "learning_rate": 1.3745435646593613e-05, "loss": 0.1136, "step": 1260 }, { "epoch": 1.1986786219915055, "grad_norm": 0.09590585357482817, "learning_rate": 1.3650698143438534e-05, "loss": 0.113, "step": 1270 }, { "epoch": 1.2081170363378952, "grad_norm": 0.10321073236266613, "learning_rate": 1.3555581073227942e-05, "loss": 0.1167, "step": 1280 }, { "epoch": 1.217555450684285, "grad_norm": 0.09327710523878686, "learning_rate": 1.346009432538705e-05, "loss": 0.1147, "step": 1290 }, { "epoch": 1.2269938650306749, "grad_norm": 0.0933025296287067, "learning_rate": 1.3364247827776854e-05, "loss": 0.1145, "step": 1300 }, { "epoch": 1.2364322793770646, "grad_norm": 0.09493771082819326, "learning_rate": 1.3268051545661937e-05, "loss": 0.1141, "step": 1310 }, { "epoch": 1.2458706937234545, "grad_norm": 0.10053484854502866, "learning_rate": 1.3171515480674342e-05, "loss": 0.1122, "step": 1320 }, { "epoch": 1.2553091080698442, "grad_norm": 0.1108335770631105, "learning_rate": 1.3074649669773716e-05, "loss": 0.1173, "step": 1330 }, { "epoch": 1.2647475224162341, "grad_norm": 0.10521299166726314, "learning_rate": 1.297746418420374e-05, "loss": 0.1103, "step": 1340 }, { "epoch": 1.2741859367626238, "grad_norm": 0.10478814209881943, "learning_rate": 1.2879969128445025e-05, "loss": 0.1122, "step": 1350 }, { "epoch": 1.2836243511090137, "grad_norm": 0.0969588608522638, "learning_rate": 1.2782174639164528e-05, "loss": 0.1112, "step": 1360 }, { "epoch": 1.2930627654554034, "grad_norm": 0.10783687256200783, "learning_rate": 1.2684090884161636e-05, "loss": 0.1125, "step": 1370 }, { "epoch": 1.3025011798017934, "grad_norm": 0.10076692580892369, "learning_rate": 1.2585728061311003e-05, "loss": 0.1107, "step": 1380 }, { "epoch": 1.311939594148183, "grad_norm": 0.09895358395270354, "learning_rate": 1.248709639750228e-05, "loss": 0.1122, "step": 1390 }, { "epoch": 1.321378008494573, "grad_norm": 0.10215738990006902, "learning_rate": 1.2388206147576796e-05, "loss": 0.1124, "step": 1400 }, { "epoch": 1.3308164228409627, "grad_norm": 0.09611638665301472, "learning_rate": 1.2289067593261358e-05, "loss": 0.1151, "step": 1410 }, { "epoch": 1.3402548371873526, "grad_norm": 0.09899073401009041, "learning_rate": 1.2189691042099265e-05, "loss": 0.1124, "step": 1420 }, { "epoch": 1.3496932515337423, "grad_norm": 0.1157109248340035, "learning_rate": 1.209008682637859e-05, "loss": 0.1154, "step": 1430 }, { "epoch": 1.359131665880132, "grad_norm": 0.09358448441833775, "learning_rate": 1.1990265302057948e-05, "loss": 0.1127, "step": 1440 }, { "epoch": 1.368570080226522, "grad_norm": 0.10318474117014907, "learning_rate": 1.1890236847689762e-05, "loss": 0.1134, "step": 1450 }, { "epoch": 1.3780084945729119, "grad_norm": 0.10507403584009326, "learning_rate": 1.1790011863341197e-05, "loss": 0.1145, "step": 1460 }, { "epoch": 1.3874469089193016, "grad_norm": 0.09210160678217687, "learning_rate": 1.1689600769512855e-05, "loss": 0.1128, "step": 1470 }, { "epoch": 1.3968853232656913, "grad_norm": 0.09890271495599744, "learning_rate": 1.1589014006055337e-05, "loss": 0.1158, "step": 1480 }, { "epoch": 1.4063237376120812, "grad_norm": 0.09768042621781026, "learning_rate": 1.1488262031083816e-05, "loss": 0.1107, "step": 1490 }, { "epoch": 1.415762151958471, "grad_norm": 0.09501221720590393, "learning_rate": 1.1387355319890685e-05, "loss": 0.1138, "step": 1500 }, { "epoch": 1.4252005663048608, "grad_norm": 0.09098951194154016, "learning_rate": 1.1286304363856418e-05, "loss": 0.112, "step": 1510 }, { "epoch": 1.4346389806512505, "grad_norm": 0.08856225785605727, "learning_rate": 1.1185119669358792e-05, "loss": 0.1137, "step": 1520 }, { "epoch": 1.4440773949976404, "grad_norm": 0.09111265321801558, "learning_rate": 1.1083811756680523e-05, "loss": 0.1093, "step": 1530 }, { "epoch": 1.4535158093440301, "grad_norm": 0.09329783759350743, "learning_rate": 1.0982391158915441e-05, "loss": 0.1138, "step": 1540 }, { "epoch": 1.46295422369042, "grad_norm": 0.09114905230583735, "learning_rate": 1.0880868420873375e-05, "loss": 0.1135, "step": 1550 }, { "epoch": 1.4723926380368098, "grad_norm": 0.10049302399783284, "learning_rate": 1.0779254097983788e-05, "loss": 0.1104, "step": 1560 }, { "epoch": 1.4818310523831997, "grad_norm": 0.08811790258148439, "learning_rate": 1.0677558755198327e-05, "loss": 0.114, "step": 1570 }, { "epoch": 1.4912694667295894, "grad_norm": 0.09402068008649977, "learning_rate": 1.0575792965892349e-05, "loss": 0.1112, "step": 1580 }, { "epoch": 1.500707881075979, "grad_norm": 0.09144293350756144, "learning_rate": 1.0473967310765629e-05, "loss": 0.1099, "step": 1590 }, { "epoch": 1.510146295422369, "grad_norm": 0.08887643971229772, "learning_rate": 1.0372092376742247e-05, "loss": 0.1109, "step": 1600 }, { "epoch": 1.519584709768759, "grad_norm": 0.09042876745354687, "learning_rate": 1.0270178755869861e-05, "loss": 0.1123, "step": 1610 }, { "epoch": 1.5290231241151486, "grad_norm": 0.08799872003450031, "learning_rate": 1.0168237044218452e-05, "loss": 0.1088, "step": 1620 }, { "epoch": 1.5384615384615383, "grad_norm": 0.08558681617619375, "learning_rate": 1.0066277840778626e-05, "loss": 0.1125, "step": 1630 }, { "epoch": 1.5478999528079282, "grad_norm": 0.08917702916102198, "learning_rate": 9.964311746359631e-06, "loss": 0.1078, "step": 1640 }, { "epoch": 1.5573383671543182, "grad_norm": 0.09365014825092945, "learning_rate": 9.862349362487172e-06, "loss": 0.108, "step": 1650 }, { "epoch": 1.5667767815007079, "grad_norm": 0.08881624424784158, "learning_rate": 9.760401290301164e-06, "loss": 0.1073, "step": 1660 }, { "epoch": 1.5762151958470976, "grad_norm": 0.09040629927788134, "learning_rate": 9.658478129453532e-06, "loss": 0.1095, "step": 1670 }, { "epoch": 1.5856536101934875, "grad_norm": 0.09668654356646131, "learning_rate": 9.556590477006123e-06, "loss": 0.109, "step": 1680 }, { "epoch": 1.5950920245398774, "grad_norm": 0.08945527030759594, "learning_rate": 9.454748926328962e-06, "loss": 0.1111, "step": 1690 }, { "epoch": 1.6045304388862671, "grad_norm": 0.09096700102455489, "learning_rate": 9.352964065998801e-06, "loss": 0.1091, "step": 1700 }, { "epoch": 1.6139688532326568, "grad_norm": 0.098227701202526, "learning_rate": 9.251246478698242e-06, "loss": 0.1124, "step": 1710 }, { "epoch": 1.6234072675790467, "grad_norm": 0.08702267003826049, "learning_rate": 9.149606740115444e-06, "loss": 0.1091, "step": 1720 }, { "epoch": 1.6328456819254367, "grad_norm": 0.0912741057496456, "learning_rate": 9.04805541784454e-06, "loss": 0.1084, "step": 1730 }, { "epoch": 1.6422840962718264, "grad_norm": 0.09473199957469115, "learning_rate": 8.946603070286926e-06, "loss": 0.1071, "step": 1740 }, { "epoch": 1.651722510618216, "grad_norm": 0.09786418318351309, "learning_rate": 8.845260245553493e-06, "loss": 0.1106, "step": 1750 }, { "epoch": 1.661160924964606, "grad_norm": 0.09376970400308367, "learning_rate": 8.744037480367922e-06, "loss": 0.1095, "step": 1760 }, { "epoch": 1.670599339310996, "grad_norm": 0.09927967174299174, "learning_rate": 8.642945298971168e-06, "loss": 0.1086, "step": 1770 }, { "epoch": 1.6800377536573856, "grad_norm": 0.08941325714107755, "learning_rate": 8.54199421202726e-06, "loss": 0.1096, "step": 1780 }, { "epoch": 1.6894761680037753, "grad_norm": 0.09076378690689199, "learning_rate": 8.441194715530472e-06, "loss": 0.111, "step": 1790 }, { "epoch": 1.6989145823501652, "grad_norm": 0.08350889813597084, "learning_rate": 8.340557289714055e-06, "loss": 0.1089, "step": 1800 }, { "epoch": 1.708352996696555, "grad_norm": 0.1102767062542333, "learning_rate": 8.240092397960601e-06, "loss": 0.1077, "step": 1810 }, { "epoch": 1.7177914110429446, "grad_norm": 0.0928092514701947, "learning_rate": 8.139810485714142e-06, "loss": 0.109, "step": 1820 }, { "epoch": 1.7272298253893346, "grad_norm": 0.08827741386544802, "learning_rate": 8.03972197939414e-06, "loss": 0.1103, "step": 1830 }, { "epoch": 1.7366682397357245, "grad_norm": 0.09074532027658296, "learning_rate": 7.939837285311425e-06, "loss": 0.106, "step": 1840 }, { "epoch": 1.7461066540821142, "grad_norm": 0.08863060213083432, "learning_rate": 7.840166788586244e-06, "loss": 0.1111, "step": 1850 }, { "epoch": 1.7555450684285039, "grad_norm": 0.0873194235737418, "learning_rate": 7.740720852068524e-06, "loss": 0.1107, "step": 1860 }, { "epoch": 1.7649834827748938, "grad_norm": 0.09463118593541094, "learning_rate": 7.641509815260412e-06, "loss": 0.1067, "step": 1870 }, { "epoch": 1.7744218971212837, "grad_norm": 0.08277999614215281, "learning_rate": 7.542543993241278e-06, "loss": 0.1092, "step": 1880 }, { "epoch": 1.7838603114676734, "grad_norm": 0.08350764579820083, "learning_rate": 7.443833675595254e-06, "loss": 0.1033, "step": 1890 }, { "epoch": 1.7932987258140631, "grad_norm": 0.0892168369121696, "learning_rate": 7.3453891253413935e-06, "loss": 0.1088, "step": 1900 }, { "epoch": 1.802737140160453, "grad_norm": 0.09068391166704463, "learning_rate": 7.247220577866625e-06, "loss": 0.1074, "step": 1910 }, { "epoch": 1.812175554506843, "grad_norm": 0.09208367707491026, "learning_rate": 7.149338239861579e-06, "loss": 0.1069, "step": 1920 }, { "epoch": 1.8216139688532327, "grad_norm": 0.09334448658561058, "learning_rate": 7.051752288259366e-06, "loss": 0.1051, "step": 1930 }, { "epoch": 1.8310523831996224, "grad_norm": 0.0867013966015152, "learning_rate": 6.954472869177479e-06, "loss": 0.1071, "step": 1940 }, { "epoch": 1.8404907975460123, "grad_norm": 0.08513824070105314, "learning_rate": 6.857510096862901e-06, "loss": 0.108, "step": 1950 }, { "epoch": 1.8499292118924022, "grad_norm": 0.08880515925379688, "learning_rate": 6.760874052640494e-06, "loss": 0.1081, "step": 1960 }, { "epoch": 1.859367626238792, "grad_norm": 0.09395118992476542, "learning_rate": 6.664574783864862e-06, "loss": 0.1079, "step": 1970 }, { "epoch": 1.8688060405851816, "grad_norm": 0.09253843263366972, "learning_rate": 6.568622302875682e-06, "loss": 0.1068, "step": 1980 }, { "epoch": 1.8782444549315715, "grad_norm": 0.09103342170778085, "learning_rate": 6.473026585956736e-06, "loss": 0.106, "step": 1990 }, { "epoch": 1.8876828692779613, "grad_norm": 0.08266259287550586, "learning_rate": 6.377797572298661e-06, "loss": 0.1076, "step": 2000 }, { "epoch": 1.897121283624351, "grad_norm": 0.08360823987901486, "learning_rate": 6.282945162965548e-06, "loss": 0.1079, "step": 2010 }, { "epoch": 1.9065596979707409, "grad_norm": 0.09004863363551058, "learning_rate": 6.188479219865529e-06, "loss": 0.1064, "step": 2020 }, { "epoch": 1.9159981123171308, "grad_norm": 0.0931263680297109, "learning_rate": 6.094409564725435e-06, "loss": 0.1054, "step": 2030 }, { "epoch": 1.9254365266635205, "grad_norm": 0.09051504504706874, "learning_rate": 6.0007459780695885e-06, "loss": 0.1082, "step": 2040 }, { "epoch": 1.9348749410099102, "grad_norm": 0.0904194328886728, "learning_rate": 5.907498198202939e-06, "loss": 0.1081, "step": 2050 }, { "epoch": 1.9443133553563001, "grad_norm": 0.08711490496456058, "learning_rate": 5.8146759201985525e-06, "loss": 0.1069, "step": 2060 }, { "epoch": 1.95375176970269, "grad_norm": 0.08686808156213838, "learning_rate": 5.722288794889603e-06, "loss": 0.1064, "step": 2070 }, { "epoch": 1.9631901840490797, "grad_norm": 0.08817123970600604, "learning_rate": 5.630346427865965e-06, "loss": 0.1045, "step": 2080 }, { "epoch": 1.9726285983954694, "grad_norm": 0.08339231796976605, "learning_rate": 5.538858378475508e-06, "loss": 0.1066, "step": 2090 }, { "epoch": 1.9820670127418594, "grad_norm": 0.08824324960180577, "learning_rate": 5.447834158830202e-06, "loss": 0.1037, "step": 2100 }, { "epoch": 1.9915054270882493, "grad_norm": 0.09311929590923752, "learning_rate": 5.357283232817147e-06, "loss": 0.1054, "step": 2110 }, { "epoch": 2.0009438414346388, "grad_norm": 0.1334431719993605, "learning_rate": 5.267215015114574e-06, "loss": 0.1031, "step": 2120 }, { "epoch": 2.0103822557810287, "grad_norm": 0.10361861757914052, "learning_rate": 5.177638870213008e-06, "loss": 0.0868, "step": 2130 }, { "epoch": 2.0198206701274186, "grad_norm": 0.084172782289755, "learning_rate": 5.088564111441645e-06, "loss": 0.0834, "step": 2140 }, { "epoch": 2.0292590844738085, "grad_norm": 0.08727410339549913, "learning_rate": 5.000000000000003e-06, "loss": 0.0852, "step": 2150 }, { "epoch": 2.038697498820198, "grad_norm": 0.08994662846331379, "learning_rate": 4.911955743995042e-06, "loss": 0.0845, "step": 2160 }, { "epoch": 2.048135913166588, "grad_norm": 0.08715835430152602, "learning_rate": 4.824440497483802e-06, "loss": 0.0847, "step": 2170 }, { "epoch": 2.057574327512978, "grad_norm": 0.0943214139106939, "learning_rate": 4.737463359521618e-06, "loss": 0.0845, "step": 2180 }, { "epoch": 2.067012741859368, "grad_norm": 0.09049982951899538, "learning_rate": 4.6510333732160915e-06, "loss": 0.085, "step": 2190 }, { "epoch": 2.0764511562057573, "grad_norm": 0.08823142440331375, "learning_rate": 4.565159524786888e-06, "loss": 0.0867, "step": 2200 }, { "epoch": 2.085889570552147, "grad_norm": 0.08727162151071796, "learning_rate": 4.479850742631396e-06, "loss": 0.0834, "step": 2210 }, { "epoch": 2.095327984898537, "grad_norm": 0.08243182067038303, "learning_rate": 4.395115896396457e-06, "loss": 0.0849, "step": 2220 }, { "epoch": 2.104766399244927, "grad_norm": 0.08720723034547441, "learning_rate": 4.310963796056168e-06, "loss": 0.084, "step": 2230 }, { "epoch": 2.1142048135913165, "grad_norm": 0.08588933137845103, "learning_rate": 4.227403190995901e-06, "loss": 0.0875, "step": 2240 }, { "epoch": 2.1236432279377064, "grad_norm": 0.09200761679022347, "learning_rate": 4.14444276910263e-06, "loss": 0.0853, "step": 2250 }, { "epoch": 2.1330816422840964, "grad_norm": 0.08831298949051568, "learning_rate": 4.06209115586162e-06, "loss": 0.0867, "step": 2260 }, { "epoch": 2.1425200566304863, "grad_norm": 0.0893828115241757, "learning_rate": 3.980356913459642e-06, "loss": 0.0865, "step": 2270 }, { "epoch": 2.1519584709768758, "grad_norm": 0.09119946740323005, "learning_rate": 3.899248539894756e-06, "loss": 0.0848, "step": 2280 }, { "epoch": 2.1613968853232657, "grad_norm": 0.08882008929472095, "learning_rate": 3.818774468092754e-06, "loss": 0.0843, "step": 2290 }, { "epoch": 2.1708352996696556, "grad_norm": 0.08922739717614447, "learning_rate": 3.738943065030376e-06, "loss": 0.0811, "step": 2300 }, { "epoch": 2.180273714016045, "grad_norm": 0.08622476744102522, "learning_rate": 3.659762630865411e-06, "loss": 0.083, "step": 2310 }, { "epoch": 2.189712128362435, "grad_norm": 0.08399251697806781, "learning_rate": 3.5812413980736916e-06, "loss": 0.0827, "step": 2320 }, { "epoch": 2.199150542708825, "grad_norm": 0.09006770579644241, "learning_rate": 3.5033875305931662e-06, "loss": 0.0849, "step": 2330 }, { "epoch": 2.208588957055215, "grad_norm": 0.08747795506814363, "learning_rate": 3.4262091229750973e-06, "loss": 0.0822, "step": 2340 }, { "epoch": 2.2180273714016043, "grad_norm": 0.08840099962821243, "learning_rate": 3.3497141995424397e-06, "loss": 0.0835, "step": 2350 }, { "epoch": 2.2274657857479943, "grad_norm": 0.08986511740506226, "learning_rate": 3.2739107135555603e-06, "loss": 0.0841, "step": 2360 }, { "epoch": 2.236904200094384, "grad_norm": 0.08924913381765429, "learning_rate": 3.1988065463853204e-06, "loss": 0.0849, "step": 2370 }, { "epoch": 2.246342614440774, "grad_norm": 0.08622955784811655, "learning_rate": 3.1244095066936396e-06, "loss": 0.0848, "step": 2380 }, { "epoch": 2.2557810287871636, "grad_norm": 0.08819044440789944, "learning_rate": 3.050727329621637e-06, "loss": 0.0835, "step": 2390 }, { "epoch": 2.2652194431335535, "grad_norm": 0.08900963097417651, "learning_rate": 2.977767675985377e-06, "loss": 0.0805, "step": 2400 }, { "epoch": 2.2746578574799434, "grad_norm": 0.0889820260869723, "learning_rate": 2.905538131479376e-06, "loss": 0.0844, "step": 2410 }, { "epoch": 2.2840962718263333, "grad_norm": 0.08508956855605507, "learning_rate": 2.8340462058879214e-06, "loss": 0.082, "step": 2420 }, { "epoch": 2.293534686172723, "grad_norm": 0.0859290434178059, "learning_rate": 2.76329933230425e-06, "loss": 0.0819, "step": 2430 }, { "epoch": 2.3029731005191127, "grad_norm": 0.08310233373376713, "learning_rate": 2.6933048663577297e-06, "loss": 0.0811, "step": 2440 }, { "epoch": 2.3124115148655027, "grad_norm": 0.08745596999299074, "learning_rate": 2.6240700854490988e-06, "loss": 0.0824, "step": 2450 }, { "epoch": 2.3218499292118926, "grad_norm": 0.08451577327209396, "learning_rate": 2.5556021879938074e-06, "loss": 0.0828, "step": 2460 }, { "epoch": 2.331288343558282, "grad_norm": 0.09129945645731877, "learning_rate": 2.4879082926735974e-06, "loss": 0.0837, "step": 2470 }, { "epoch": 2.340726757904672, "grad_norm": 0.083910957600724, "learning_rate": 2.4209954376963797e-06, "loss": 0.0816, "step": 2480 }, { "epoch": 2.350165172251062, "grad_norm": 0.08447673558716574, "learning_rate": 2.354870580064439e-06, "loss": 0.0808, "step": 2490 }, { "epoch": 2.3596035865974514, "grad_norm": 0.08671014168934867, "learning_rate": 2.289540594851122e-06, "loss": 0.0814, "step": 2500 }, { "epoch": 2.3690420009438413, "grad_norm": 0.08155772855747594, "learning_rate": 2.225012274486028e-06, "loss": 0.0791, "step": 2510 }, { "epoch": 2.3784804152902312, "grad_norm": 0.08555794455636312, "learning_rate": 2.1612923280487883e-06, "loss": 0.0843, "step": 2520 }, { "epoch": 2.387918829636621, "grad_norm": 0.08566290155470521, "learning_rate": 2.0983873805715216e-06, "loss": 0.0837, "step": 2530 }, { "epoch": 2.397357243983011, "grad_norm": 0.08531077534490512, "learning_rate": 2.0363039723500155e-06, "loss": 0.0838, "step": 2540 }, { "epoch": 2.4067956583294006, "grad_norm": 0.08375539552341336, "learning_rate": 1.9750485582637245e-06, "loss": 0.0822, "step": 2550 }, { "epoch": 2.4162340726757905, "grad_norm": 0.08503293134727295, "learning_rate": 1.9146275071046626e-06, "loss": 0.0849, "step": 2560 }, { "epoch": 2.4256724870221804, "grad_norm": 0.08165485153496227, "learning_rate": 1.8550471009152138e-06, "loss": 0.0803, "step": 2570 }, { "epoch": 2.43511090136857, "grad_norm": 0.08309504205099255, "learning_rate": 1.7963135343349914e-06, "loss": 0.0789, "step": 2580 }, { "epoch": 2.44454931571496, "grad_norm": 0.08294260063335909, "learning_rate": 1.73843291395678e-06, "loss": 0.0823, "step": 2590 }, { "epoch": 2.4539877300613497, "grad_norm": 0.08537750016844735, "learning_rate": 1.6814112576916142e-06, "loss": 0.0825, "step": 2600 }, { "epoch": 2.4634261444077397, "grad_norm": 0.08433197560779272, "learning_rate": 1.6252544941430982e-06, "loss": 0.0813, "step": 2610 }, { "epoch": 2.472864558754129, "grad_norm": 0.08321923421106348, "learning_rate": 1.5699684619909983e-06, "loss": 0.0826, "step": 2620 }, { "epoch": 2.482302973100519, "grad_norm": 0.08373732680479463, "learning_rate": 1.5155589093841939e-06, "loss": 0.0802, "step": 2630 }, { "epoch": 2.491741387446909, "grad_norm": 0.09112397953039225, "learning_rate": 1.4620314933430269e-06, "loss": 0.081, "step": 2640 }, { "epoch": 2.501179801793299, "grad_norm": 0.08246005822696881, "learning_rate": 1.4093917791711497e-06, "loss": 0.0808, "step": 2650 }, { "epoch": 2.5106182161396884, "grad_norm": 0.08399111524479627, "learning_rate": 1.357645239876879e-06, "loss": 0.08, "step": 2660 }, { "epoch": 2.5200566304860783, "grad_norm": 0.08520663335129833, "learning_rate": 1.3067972556041753e-06, "loss": 0.0818, "step": 2670 }, { "epoch": 2.5294950448324682, "grad_norm": 0.08299784168003543, "learning_rate": 1.2568531130732498e-06, "loss": 0.0807, "step": 2680 }, { "epoch": 2.5389334591788577, "grad_norm": 0.08594453806774255, "learning_rate": 1.207818005030904e-06, "loss": 0.0802, "step": 2690 }, { "epoch": 2.5483718735252476, "grad_norm": 0.08039958287229476, "learning_rate": 1.1596970297106458e-06, "loss": 0.0818, "step": 2700 }, { "epoch": 2.5578102878716376, "grad_norm": 0.08085895400758615, "learning_rate": 1.1124951903025981e-06, "loss": 0.0806, "step": 2710 }, { "epoch": 2.5672487022180275, "grad_norm": 0.08458215660632902, "learning_rate": 1.0662173944333288e-06, "loss": 0.081, "step": 2720 }, { "epoch": 2.5766871165644174, "grad_norm": 0.0826446041832577, "learning_rate": 1.0208684536555968e-06, "loss": 0.081, "step": 2730 }, { "epoch": 2.586125530910807, "grad_norm": 0.08442284791430928, "learning_rate": 9.764530829480822e-07, "loss": 0.0832, "step": 2740 }, { "epoch": 2.595563945257197, "grad_norm": 0.08336636617446731, "learning_rate": 9.329759002251726e-07, "loss": 0.0802, "step": 2750 }, { "epoch": 2.6050023596035867, "grad_norm": 0.0846992268529094, "learning_rate": 8.904414258568306e-07, "loss": 0.0799, "step": 2760 }, { "epoch": 2.614440773949976, "grad_norm": 0.08300701320734614, "learning_rate": 8.488540821986035e-07, "loss": 0.0827, "step": 2770 }, { "epoch": 2.623879188296366, "grad_norm": 0.08443465909555346, "learning_rate": 8.082181931318311e-07, "loss": 0.0792, "step": 2780 }, { "epoch": 2.633317602642756, "grad_norm": 0.08476186997139065, "learning_rate": 7.685379836140872e-07, "loss": 0.079, "step": 2790 }, { "epoch": 2.642756016989146, "grad_norm": 0.08472149270611445, "learning_rate": 7.298175792398976e-07, "loss": 0.0818, "step": 2800 }, { "epoch": 2.652194431335536, "grad_norm": 0.08032205169166948, "learning_rate": 6.920610058118105e-07, "loss": 0.0804, "step": 2810 }, { "epoch": 2.6616328456819254, "grad_norm": 0.08367697987102687, "learning_rate": 6.552721889218194e-07, "loss": 0.0816, "step": 2820 }, { "epoch": 2.6710712600283153, "grad_norm": 0.08668064800964889, "learning_rate": 6.194549535432137e-07, "loss": 0.08, "step": 2830 }, { "epoch": 2.680509674374705, "grad_norm": 0.0828804628433115, "learning_rate": 5.846130236329073e-07, "loss": 0.0823, "step": 2840 }, { "epoch": 2.6899480887210947, "grad_norm": 0.08378018124842795, "learning_rate": 5.507500217442341e-07, "loss": 0.0809, "step": 2850 }, { "epoch": 2.6993865030674846, "grad_norm": 0.08220414182971268, "learning_rate": 5.178694686503205e-07, "loss": 0.0784, "step": 2860 }, { "epoch": 2.7088249174138745, "grad_norm": 0.08076213129882256, "learning_rate": 4.85974782978027e-07, "loss": 0.081, "step": 2870 }, { "epoch": 2.718263331760264, "grad_norm": 0.08189865863600006, "learning_rate": 4.5506928085250033e-07, "loss": 0.0778, "step": 2880 }, { "epoch": 2.727701746106654, "grad_norm": 0.08777777733586507, "learning_rate": 4.251561755524036e-07, "loss": 0.0832, "step": 2890 }, { "epoch": 2.737140160453044, "grad_norm": 0.08438414185840341, "learning_rate": 3.9623857717581813e-07, "loss": 0.0808, "step": 2900 }, { "epoch": 2.746578574799434, "grad_norm": 0.08245920785308986, "learning_rate": 3.6831949231689203e-07, "loss": 0.081, "step": 2910 }, { "epoch": 2.7560169891458237, "grad_norm": 0.08356304284398074, "learning_rate": 3.414018237532335e-07, "loss": 0.0821, "step": 2920 }, { "epoch": 2.765455403492213, "grad_norm": 0.08633702580648989, "learning_rate": 3.154883701441136e-07, "loss": 0.08, "step": 2930 }, { "epoch": 2.774893817838603, "grad_norm": 0.0821243443006684, "learning_rate": 2.905818257394799e-07, "loss": 0.0798, "step": 2940 }, { "epoch": 2.784332232184993, "grad_norm": 0.08233971677127812, "learning_rate": 2.666847800998362e-07, "loss": 0.0819, "step": 2950 }, { "epoch": 2.7937706465313825, "grad_norm": 0.08176593022884952, "learning_rate": 2.437997178270035e-07, "loss": 0.0807, "step": 2960 }, { "epoch": 2.8032090608777724, "grad_norm": 0.08485236840414098, "learning_rate": 2.219290183057865e-07, "loss": 0.0806, "step": 2970 }, { "epoch": 2.8126474752241624, "grad_norm": 0.08134044819035506, "learning_rate": 2.0107495545659829e-07, "loss": 0.0778, "step": 2980 }, { "epoch": 2.8220858895705523, "grad_norm": 0.08200978384685277, "learning_rate": 1.8123969749902714e-07, "loss": 0.0777, "step": 2990 }, { "epoch": 2.831524303916942, "grad_norm": 0.08409107175117113, "learning_rate": 1.6242530672641143e-07, "loss": 0.0813, "step": 3000 }, { "epoch": 2.8409627182633317, "grad_norm": 0.08288246905003016, "learning_rate": 1.4463373929141766e-07, "loss": 0.0788, "step": 3010 }, { "epoch": 2.8504011326097216, "grad_norm": 0.08212980978941326, "learning_rate": 1.2786684500265546e-07, "loss": 0.0819, "step": 3020 }, { "epoch": 2.8598395469561115, "grad_norm": 0.0832626354470357, "learning_rate": 1.1212636713235581e-07, "loss": 0.0794, "step": 3030 }, { "epoch": 2.869277961302501, "grad_norm": 0.0836378792472762, "learning_rate": 9.741394223512057e-08, "loss": 0.0814, "step": 3040 }, { "epoch": 2.878716375648891, "grad_norm": 0.08017963643030881, "learning_rate": 8.373109997776185e-08, "loss": 0.0804, "step": 3050 }, { "epoch": 2.888154789995281, "grad_norm": 0.08105963755024889, "learning_rate": 7.10792629802659e-08, "loss": 0.08, "step": 3060 }, { "epoch": 2.8975932043416703, "grad_norm": 0.08196798527897163, "learning_rate": 5.945974666788479e-08, "loss": 0.0809, "step": 3070 }, { "epoch": 2.9070316186880603, "grad_norm": 0.08445010118063752, "learning_rate": 4.887375913436132e-08, "loss": 0.0817, "step": 3080 }, { "epoch": 2.91647003303445, "grad_norm": 0.0830340739185833, "learning_rate": 3.932240101633178e-08, "loss": 0.0823, "step": 3090 }, { "epoch": 2.92590844738084, "grad_norm": 0.08107816362561618, "learning_rate": 3.0806665378884106e-08, "loss": 0.0764, "step": 3100 }, { "epoch": 2.93534686172723, "grad_norm": 0.08498347611137054, "learning_rate": 2.33274376123116e-08, "loss": 0.0784, "step": 3110 }, { "epoch": 2.9447852760736195, "grad_norm": 0.08214101571060746, "learning_rate": 1.68854953400599e-08, "loss": 0.0795, "step": 3120 }, { "epoch": 2.9542236904200094, "grad_norm": 0.0849230883706386, "learning_rate": 1.1481508337869429e-08, "loss": 0.0801, "step": 3130 }, { "epoch": 2.9636621047663994, "grad_norm": 0.09022687172778407, "learning_rate": 7.1160384641455475e-09, "loss": 0.0828, "step": 3140 }, { "epoch": 2.973100519112789, "grad_norm": 0.08399540827662202, "learning_rate": 3.7895396015374955e-09, "loss": 0.0808, "step": 3150 }, { "epoch": 2.9825389334591788, "grad_norm": 0.08027494442622467, "learning_rate": 1.502357609749483e-09, "loss": 0.0803, "step": 3160 }, { "epoch": 2.9919773478055687, "grad_norm": 0.08480380742956374, "learning_rate": 2.5473028957945234e-10, "loss": 0.0813, "step": 3170 }, { "epoch": 2.9985842378480414, "step": 3177, "total_flos": 5.875342281120154e+16, "train_loss": 0.11385704865424108, "train_runtime": 162519.6807, "train_samples_per_second": 3.441, "train_steps_per_second": 0.02 } ], "logging_steps": 10, "max_steps": 3177, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.875342281120154e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }