{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.931896354047237,
  "eval_steps": 30,
  "global_step": 1600,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.018344416418252695,
      "grad_norm": 11.986655235290527,
      "learning_rate": 1.0975609756097562e-05,
      "loss": 2.8049,
      "step": 10
    },
    {
      "epoch": 0.03668883283650539,
      "grad_norm": 1.3442561626434326,
      "learning_rate": 2.3170731707317075e-05,
      "loss": 1.2643,
      "step": 20
    },
    {
      "epoch": 0.05503324925475808,
      "grad_norm": 1.0081877708435059,
      "learning_rate": 3.5365853658536584e-05,
      "loss": 0.7278,
      "step": 30
    },
    {
      "epoch": 0.05503324925475808,
      "eval_loss": 0.6342164874076843,
      "eval_runtime": 47.3806,
      "eval_samples_per_second": 4.854,
      "eval_steps_per_second": 4.854,
      "step": 30
    },
    {
      "epoch": 0.07337766567301078,
      "grad_norm": 0.8855087757110596,
      "learning_rate": 4.75609756097561e-05,
      "loss": 0.5154,
      "step": 40
    },
    {
      "epoch": 0.09172208209126347,
      "grad_norm": 0.7658259272575378,
      "learning_rate": 5.975609756097561e-05,
      "loss": 0.4091,
      "step": 50
    },
    {
      "epoch": 0.11006649850951616,
      "grad_norm": 1.1029750108718872,
      "learning_rate": 7.195121951219513e-05,
      "loss": 0.3197,
      "step": 60
    },
    {
      "epoch": 0.11006649850951616,
      "eval_loss": 0.26837778091430664,
      "eval_runtime": 47.1026,
      "eval_samples_per_second": 4.883,
      "eval_steps_per_second": 4.883,
      "step": 60
    },
    {
      "epoch": 0.12841091492776885,
      "grad_norm": 0.5400053262710571,
      "learning_rate": 8.414634146341464e-05,
      "loss": 0.2753,
      "step": 70
    },
    {
      "epoch": 0.14675533134602156,
      "grad_norm": 0.5874791145324707,
      "learning_rate": 9.634146341463415e-05,
      "loss": 0.243,
      "step": 80
    },
    {
      "epoch": 0.16509974776427425,
      "grad_norm": 0.47156110405921936,
      "learning_rate": 0.00010853658536585367,
      "loss": 0.2144,
      "step": 90
    },
    {
      "epoch": 0.16509974776427425,
      "eval_loss": 0.18793882429599762,
      "eval_runtime": 46.9461,
      "eval_samples_per_second": 4.899,
      "eval_steps_per_second": 4.899,
      "step": 90
    },
    {
      "epoch": 0.18344416418252693,
      "grad_norm": 0.6732025146484375,
      "learning_rate": 0.00012073170731707318,
      "loss": 0.1804,
      "step": 100
    },
    {
      "epoch": 0.20178858060077964,
      "grad_norm": 0.4138810336589813,
      "learning_rate": 0.0001329268292682927,
      "loss": 0.1704,
      "step": 110
    },
    {
      "epoch": 0.22013299701903233,
      "grad_norm": 0.3548312783241272,
      "learning_rate": 0.0001451219512195122,
      "loss": 0.1642,
      "step": 120
    },
    {
      "epoch": 0.22013299701903233,
      "eval_loss": 0.14807145297527313,
      "eval_runtime": 47.16,
      "eval_samples_per_second": 4.877,
      "eval_steps_per_second": 4.877,
      "step": 120
    },
    {
      "epoch": 0.238477413437285,
      "grad_norm": 0.5005571842193604,
      "learning_rate": 0.00015731707317073173,
      "loss": 0.1688,
      "step": 130
    },
    {
      "epoch": 0.2568218298555377,
      "grad_norm": 0.4382654130458832,
      "learning_rate": 0.00016951219512195123,
      "loss": 0.1479,
      "step": 140
    },
    {
      "epoch": 0.27516624627379044,
      "grad_norm": 0.3288620114326477,
      "learning_rate": 0.00018170731707317075,
      "loss": 0.1542,
      "step": 150
    },
    {
      "epoch": 0.27516624627379044,
      "eval_loss": 0.13612844049930573,
      "eval_runtime": 46.9463,
      "eval_samples_per_second": 4.899,
      "eval_steps_per_second": 4.899,
      "step": 150
    },
    {
      "epoch": 0.2935106626920431,
      "grad_norm": 0.48989060521125793,
      "learning_rate": 0.00019390243902439025,
      "loss": 0.1536,
      "step": 160
    },
    {
      "epoch": 0.3118550791102958,
      "grad_norm": 0.2770105004310608,
      "learning_rate": 0.00019999432180005332,
      "loss": 0.1407,
      "step": 170
    },
    {
      "epoch": 0.3301994955285485,
      "grad_norm": 1.1703704595565796,
      "learning_rate": 0.00019994890006944105,
      "loss": 0.1485,
      "step": 180
    },
    {
      "epoch": 0.3301994955285485,
      "eval_loss": 0.13879014551639557,
      "eval_runtime": 46.9507,
      "eval_samples_per_second": 4.899,
      "eval_steps_per_second": 4.899,
      "step": 180
    },
    {
      "epoch": 0.3485439119468012,
      "grad_norm": 0.4198831617832184,
      "learning_rate": 0.0001998580772407242,
      "loss": 0.1488,
      "step": 190
    },
    {
      "epoch": 0.36688832836505386,
      "grad_norm": 0.5260700583457947,
      "learning_rate": 0.00019972189456954594,
      "loss": 0.1589,
      "step": 200
    },
    {
      "epoch": 0.3852327447833066,
      "grad_norm": 0.34109413623809814,
      "learning_rate": 0.00019954041391594487,
      "loss": 0.1429,
      "step": 210
    },
    {
      "epoch": 0.3852327447833066,
      "eval_loss": 0.1368015855550766,
      "eval_runtime": 46.9592,
      "eval_samples_per_second": 4.898,
      "eval_steps_per_second": 4.898,
      "step": 210
    },
    {
      "epoch": 0.4035771612015593,
      "grad_norm": 0.5479871034622192,
      "learning_rate": 0.00019931371771625544,
      "loss": 0.1413,
      "step": 220
    },
    {
      "epoch": 0.42192157761981197,
      "grad_norm": 0.2849060297012329,
      "learning_rate": 0.00019904190894566194,
      "loss": 0.1329,
      "step": 230
    },
    {
      "epoch": 0.44026599403806466,
      "grad_norm": 0.2200181484222412,
      "learning_rate": 0.00019872511107142261,
      "loss": 0.15,
      "step": 240
    },
    {
      "epoch": 0.44026599403806466,
      "eval_loss": 0.12958292663097382,
      "eval_runtime": 46.9085,
      "eval_samples_per_second": 4.903,
      "eval_steps_per_second": 4.903,
      "step": 240
    },
    {
      "epoch": 0.45861041045631734,
      "grad_norm": 0.31030556559562683,
      "learning_rate": 0.00019836346799678568,
      "loss": 0.1277,
      "step": 250
    },
    {
      "epoch": 0.47695482687457,
      "grad_norm": 0.1974978893995285,
      "learning_rate": 0.00019795714399562197,
      "loss": 0.1293,
      "step": 260
    },
    {
      "epoch": 0.49529924329282277,
      "grad_norm": 0.40907976031303406,
      "learning_rate": 0.00019750632363780505,
      "loss": 0.1437,
      "step": 270
    },
    {
      "epoch": 0.49529924329282277,
      "eval_loss": 0.12063605338335037,
      "eval_runtime": 47.4843,
      "eval_samples_per_second": 4.844,
      "eval_steps_per_second": 4.844,
      "step": 270
    },
    {
      "epoch": 0.5136436597110754,
      "grad_norm": 0.37478116154670715,
      "learning_rate": 0.00019701121170537125,
      "loss": 0.1388,
      "step": 280
    },
    {
      "epoch": 0.5319880761293282,
      "grad_norm": 0.24447715282440186,
      "learning_rate": 0.00019647203309949913,
      "loss": 0.1316,
      "step": 290
    },
    {
      "epoch": 0.5503324925475809,
      "grad_norm": 0.44361746311187744,
      "learning_rate": 0.00019588903273834953,
      "loss": 0.14,
      "step": 300
    },
    {
      "epoch": 0.5503324925475809,
      "eval_loss": 0.12744086980819702,
      "eval_runtime": 47.4937,
      "eval_samples_per_second": 4.843,
      "eval_steps_per_second": 4.843,
      "step": 300
    },
    {
      "epoch": 0.5686769089658336,
      "grad_norm": 0.734635591506958,
      "learning_rate": 0.00019526247544581312,
      "loss": 0.1403,
      "step": 310
    },
    {
      "epoch": 0.5870213253840862,
      "grad_norm": 0.16803057491779327,
      "learning_rate": 0.00019459264583121622,
      "loss": 0.138,
      "step": 320
    },
    {
      "epoch": 0.6053657418023389,
      "grad_norm": 0.29355189204216003,
      "learning_rate": 0.00019387984816003867,
      "loss": 0.1292,
      "step": 330
    },
    {
      "epoch": 0.6053657418023389,
      "eval_loss": 0.11417385935783386,
      "eval_runtime": 47.4735,
      "eval_samples_per_second": 4.845,
      "eval_steps_per_second": 4.845,
      "step": 330
    },
    {
      "epoch": 0.6237101582205916,
      "grad_norm": 0.14734932780265808,
      "learning_rate": 0.00019312440621570355,
      "loss": 0.1319,
      "step": 340
    },
    {
      "epoch": 0.6420545746388443,
      "grad_norm": 0.1784052848815918,
      "learning_rate": 0.00019232666315250078,
      "loss": 0.133,
      "step": 350
    },
    {
      "epoch": 0.660398991057097,
      "grad_norm": 0.21272070705890656,
      "learning_rate": 0.00019148698133971155,
      "loss": 0.1244,
      "step": 360
    },
    {
      "epoch": 0.660398991057097,
      "eval_loss": 0.11460884660482407,
      "eval_runtime": 47.2816,
      "eval_samples_per_second": 4.864,
      "eval_steps_per_second": 4.864,
      "step": 360
    },
    {
      "epoch": 0.6787434074753497,
      "grad_norm": 0.14382557570934296,
      "learning_rate": 0.0001906057421970046,
      "loss": 0.1252,
      "step": 370
    },
    {
      "epoch": 0.6970878238936024,
      "grad_norm": 0.138260617852211,
      "learning_rate": 0.00018968334602117906,
      "loss": 0.1358,
      "step": 380
    },
    {
      "epoch": 0.715432240311855,
      "grad_norm": 0.14098672568798065,
      "learning_rate": 0.00018872021180433232,
      "loss": 0.1183,
      "step": 390
    },
    {
      "epoch": 0.715432240311855,
      "eval_loss": 0.11497555673122406,
      "eval_runtime": 47.5258,
      "eval_samples_per_second": 4.839,
      "eval_steps_per_second": 4.839,
      "step": 390
    },
    {
      "epoch": 0.7337766567301077,
      "grad_norm": 0.5053527355194092,
      "learning_rate": 0.0001877167770435357,
      "loss": 0.1181,
      "step": 400
    },
    {
      "epoch": 0.7521210731483605,
      "grad_norm": 0.12653136253356934,
      "learning_rate": 0.00018667349754210457,
      "loss": 0.1274,
      "step": 410
    },
    {
      "epoch": 0.7704654895666132,
      "grad_norm": 0.11409477144479752,
      "learning_rate": 0.00018559084720255276,
      "loss": 0.1336,
      "step": 420
    },
    {
      "epoch": 0.7704654895666132,
      "eval_loss": 0.11223822832107544,
      "eval_runtime": 47.4963,
      "eval_samples_per_second": 4.842,
      "eval_steps_per_second": 4.842,
      "step": 420
    },
    {
      "epoch": 0.7888099059848659,
      "grad_norm": 0.12272641062736511,
      "learning_rate": 0.00018446931781132553,
      "loss": 0.1297,
      "step": 430
    },
    {
      "epoch": 0.8071543224031186,
      "grad_norm": 0.1796967089176178,
      "learning_rate": 0.00018330941881540915,
      "loss": 0.1378,
      "step": 440
    },
    {
      "epoch": 0.8254987388213713,
      "grad_norm": 0.11490087956190109,
      "learning_rate": 0.00018211167709091802,
      "loss": 0.1262,
      "step": 450
    },
    {
      "epoch": 0.8254987388213713,
      "eval_loss": 0.11122166365385056,
      "eval_runtime": 47.6342,
      "eval_samples_per_second": 4.828,
      "eval_steps_per_second": 4.828,
      "step": 450
    },
    {
      "epoch": 0.8438431552396239,
      "grad_norm": 0.1275721788406372,
      "learning_rate": 0.00018087663670376483,
      "loss": 0.1244,
      "step": 460
    },
    {
      "epoch": 0.8621875716578766,
      "grad_norm": 0.10838750749826431,
      "learning_rate": 0.0001796048586625223,
      "loss": 0.1278,
      "step": 470
    },
    {
      "epoch": 0.8805319880761293,
      "grad_norm": 0.132126122713089,
      "learning_rate": 0.00017829692066358914,
      "loss": 0.1365,
      "step": 480
    },
    {
      "epoch": 0.8805319880761293,
      "eval_loss": 0.11278139054775238,
      "eval_runtime": 47.4269,
      "eval_samples_per_second": 4.85,
      "eval_steps_per_second": 4.85,
      "step": 480
    },
    {
      "epoch": 0.898876404494382,
      "grad_norm": 0.130384624004364,
      "learning_rate": 0.0001769534168287752,
      "loss": 0.1215,
      "step": 490
    },
    {
      "epoch": 0.9172208209126347,
      "grad_norm": 0.11609125882387161,
      "learning_rate": 0.00017557495743542585,
      "loss": 0.1178,
      "step": 500
    },
    {
      "epoch": 0.9355652373308874,
      "grad_norm": 0.12103743851184845,
      "learning_rate": 0.0001741621686392077,
      "loss": 0.125,
      "step": 510
    },
    {
      "epoch": 0.9355652373308874,
      "eval_loss": 0.11113429814577103,
      "eval_runtime": 47.2207,
      "eval_samples_per_second": 4.871,
      "eval_steps_per_second": 4.871,
      "step": 510
    },
    {
      "epoch": 0.95390965374914,
      "grad_norm": 0.13481061160564423,
      "learning_rate": 0.00017271569218968175,
      "loss": 0.1233,
      "step": 520
    },
    {
      "epoch": 0.9722540701673928,
      "grad_norm": 0.12927769124507904,
      "learning_rate": 0.00017123618513879295,
      "loss": 0.1237,
      "step": 530
    },
    {
      "epoch": 0.9905984865856455,
      "grad_norm": 0.1044018417596817,
      "learning_rate": 0.00016972431954240906,
      "loss": 0.1222,
      "step": 540
    },
    {
      "epoch": 0.9905984865856455,
      "eval_loss": 0.11119447648525238,
      "eval_runtime": 47.6107,
      "eval_samples_per_second": 4.831,
      "eval_steps_per_second": 4.831,
      "step": 540
    },
    {
      "epoch": 1.0073377665673011,
      "grad_norm": 0.10839453339576721,
      "learning_rate": 0.0001681807821550438,
      "loss": 0.1076,
      "step": 550
    },
    {
      "epoch": 1.0256821829855538,
      "grad_norm": 0.151426762342453,
      "learning_rate": 0.00016660627411790329,
      "loss": 0.1131,
      "step": 560
    },
    {
      "epoch": 1.0440265994038065,
      "grad_norm": 0.1167786568403244,
      "learning_rate": 0.00016500151064039766,
      "loss": 0.1332,
      "step": 570
    },
    {
      "epoch": 1.0440265994038065,
      "eval_loss": 0.11187437176704407,
      "eval_runtime": 47.4208,
      "eval_samples_per_second": 4.85,
      "eval_steps_per_second": 4.85,
      "step": 570
    },
    {
      "epoch": 1.0623710158220592,
      "grad_norm": 0.0833975151181221,
      "learning_rate": 0.0001633672206752621,
      "loss": 0.1236,
      "step": 580
    },
    {
      "epoch": 1.0807154322403119,
      "grad_norm": 0.08196871727705002,
      "learning_rate": 0.00016170414658743488,
      "loss": 0.1177,
      "step": 590
    },
    {
      "epoch": 1.0990598486585645,
      "grad_norm": 0.1361839324235916,
      "learning_rate": 0.00016001304381684347,
      "loss": 0.1103,
      "step": 600
    },
    {
      "epoch": 1.0990598486585645,
      "eval_loss": 0.11141453683376312,
      "eval_runtime": 47.4348,
      "eval_samples_per_second": 4.849,
      "eval_steps_per_second": 4.849,
      "step": 600
    },
    {
      "epoch": 1.1174042650768172,
      "grad_norm": 0.1309792697429657,
      "learning_rate": 0.00015829468053525102,
      "loss": 0.1285,
      "step": 610
    },
    {
      "epoch": 1.13574868149507,
      "grad_norm": 0.09610354155302048,
      "learning_rate": 0.00015654983729731977,
      "loss": 0.1278,
      "step": 620
    },
    {
      "epoch": 1.1540930979133226,
      "grad_norm": 0.08027515560388565,
      "learning_rate": 0.00015477930668604916,
      "loss": 0.117,
      "step": 630
    },
    {
      "epoch": 1.1540930979133226,
      "eval_loss": 0.11004864424467087,
      "eval_runtime": 47.4023,
      "eval_samples_per_second": 4.852,
      "eval_steps_per_second": 4.852,
      "step": 630
    },
    {
      "epoch": 1.1724375143315753,
      "grad_norm": 0.25526005029678345,
      "learning_rate": 0.00015298389295275098,
      "loss": 0.1179,
      "step": 640
    },
    {
      "epoch": 1.190781930749828,
      "grad_norm": 0.2913426458835602,
      "learning_rate": 0.00015116441165172328,
      "loss": 0.1226,
      "step": 650
    },
    {
      "epoch": 1.2091263471680807,
      "grad_norm": 0.12240047007799149,
      "learning_rate": 0.00014932168926979074,
      "loss": 0.1206,
      "step": 660
    },
    {
      "epoch": 1.2091263471680807,
      "eval_loss": 0.11615979671478271,
      "eval_runtime": 47.3821,
      "eval_samples_per_second": 4.854,
      "eval_steps_per_second": 4.854,
      "step": 660
    },
    {
      "epoch": 1.2274707635863333,
      "grad_norm": 0.5275110602378845,
      "learning_rate": 0.00014745656285087866,
      "loss": 0.131,
      "step": 670
    },
    {
      "epoch": 1.245815180004586,
      "grad_norm": 0.08649874478578568,
      "learning_rate": 0.00014556987961579146,
      "loss": 0.1273,
      "step": 680
    },
    {
      "epoch": 1.264159596422839,
      "grad_norm": 0.21493305265903473,
      "learning_rate": 0.00014366249657736866,
      "loss": 0.1238,
      "step": 690
    },
    {
      "epoch": 1.264159596422839,
      "eval_loss": 0.11726190149784088,
      "eval_runtime": 47.4729,
      "eval_samples_per_second": 4.845,
      "eval_steps_per_second": 4.845,
      "step": 690
    },
    {
      "epoch": 1.2825040128410916,
      "grad_norm": 0.2242494374513626,
      "learning_rate": 0.00014173528015119246,
      "loss": 0.1267,
      "step": 700
    },
    {
      "epoch": 1.3008484292593443,
      "grad_norm": 0.1080465167760849,
      "learning_rate": 0.0001397891057620247,
      "loss": 0.1275,
      "step": 710
    },
    {
      "epoch": 1.319192845677597,
      "grad_norm": 0.11881080269813538,
      "learning_rate": 0.00013782485744615096,
      "loss": 0.1265,
      "step": 720
    },
    {
      "epoch": 1.319192845677597,
      "eval_loss": 0.11231047660112381,
      "eval_runtime": 47.5156,
      "eval_samples_per_second": 4.841,
      "eval_steps_per_second": 4.841,
      "step": 720
    },
    {
      "epoch": 1.3375372620958497,
      "grad_norm": 0.11510378122329712,
      "learning_rate": 0.0001358434274498134,
      "loss": 0.118,
      "step": 730
    },
    {
      "epoch": 1.3558816785141024,
      "grad_norm": 0.1533641517162323,
      "learning_rate": 0.00013384571582391393,
      "loss": 0.1239,
      "step": 740
    },
    {
      "epoch": 1.374226094932355,
      "grad_norm": 0.09978242218494415,
      "learning_rate": 0.00013183263001517224,
      "loss": 0.1206,
      "step": 750
    },
    {
      "epoch": 1.374226094932355,
      "eval_loss": 0.1097174733877182,
      "eval_runtime": 47.3489,
      "eval_samples_per_second": 4.858,
      "eval_steps_per_second": 4.858,
      "step": 750
    },
    {
      "epoch": 1.3925705113506077,
      "grad_norm": 0.10042322427034378,
      "learning_rate": 0.0001298050844539246,
      "loss": 0.1244,
      "step": 760
    },
    {
      "epoch": 1.4109149277688604,
      "grad_norm": 0.09835106879472733,
      "learning_rate": 0.00012776400013875006,
      "loss": 0.1157,
      "step": 770
    },
    {
      "epoch": 1.429259344187113,
      "grad_norm": 0.0930342748761177,
      "learning_rate": 0.00012571030421811314,
      "loss": 0.1236,
      "step": 780
    },
    {
      "epoch": 1.429259344187113,
      "eval_loss": 0.11065001785755157,
      "eval_runtime": 47.4754,
      "eval_samples_per_second": 4.845,
      "eval_steps_per_second": 4.845,
      "step": 780
    },
    {
      "epoch": 1.4476037606053658,
      "grad_norm": 0.08919622749090195,
      "learning_rate": 0.0001236449295692131,
      "loss": 0.1261,
      "step": 790
    },
    {
      "epoch": 1.4659481770236185,
      "grad_norm": 0.0871356800198555,
      "learning_rate": 0.00012156881437423103,
      "loss": 0.1215,
      "step": 800
    },
    {
      "epoch": 1.4842925934418711,
      "grad_norm": 0.11698926240205765,
      "learning_rate": 0.00011948290169416682,
      "loss": 0.1269,
      "step": 810
    },
    {
      "epoch": 1.4842925934418711,
      "eval_loss": 0.10887381434440613,
      "eval_runtime": 47.4819,
      "eval_samples_per_second": 4.844,
      "eval_steps_per_second": 4.844,
      "step": 810
    },
    {
      "epoch": 1.5026370098601238,
      "grad_norm": 0.06746023893356323,
      "learning_rate": 0.00011738813904046044,
      "loss": 0.1122,
      "step": 820
    },
    {
      "epoch": 1.5209814262783765,
      "grad_norm": 0.09794127196073532,
      "learning_rate": 0.00011528547794459128,
      "loss": 0.1161,
      "step": 830
    },
    {
      "epoch": 1.5393258426966292,
      "grad_norm": 0.15457604825496674,
      "learning_rate": 0.00011317587352585157,
      "loss": 0.1281,
      "step": 840
    },
    {
      "epoch": 1.5393258426966292,
      "eval_loss": 0.10888072103261948,
      "eval_runtime": 47.3656,
      "eval_samples_per_second": 4.856,
      "eval_steps_per_second": 4.856,
      "step": 840
    },
    {
      "epoch": 1.5576702591148819,
      "grad_norm": 0.08662489801645279,
      "learning_rate": 0.00011106028405749005,
      "loss": 0.1191,
      "step": 850
    },
    {
      "epoch": 1.5760146755331346,
      "grad_norm": 0.08548998087644577,
      "learning_rate": 0.00010893967053142296,
      "loss": 0.1196,
      "step": 860
    },
    {
      "epoch": 1.5943590919513873,
      "grad_norm": 0.10732567310333252,
      "learning_rate": 0.00010681499622171005,
      "loss": 0.1246,
      "step": 870
    },
    {
      "epoch": 1.5943590919513873,
      "eval_loss": 0.10937893390655518,
      "eval_runtime": 47.5072,
      "eval_samples_per_second": 4.841,
      "eval_steps_per_second": 4.841,
      "step": 870
    },
    {
      "epoch": 1.61270350836964,
      "grad_norm": 0.07822810858488083,
      "learning_rate": 0.00010468722624699401,
      "loss": 0.1121,
      "step": 880
    },
    {
      "epoch": 1.6310479247878926,
      "grad_norm": 0.07400278747081757,
      "learning_rate": 0.00010255732713210206,
      "loss": 0.124,
      "step": 890
    },
    {
      "epoch": 1.6493923412061453,
      "grad_norm": 0.0789525955915451,
      "learning_rate": 0.00010042626636900856,
      "loss": 0.1181,
      "step": 900
    },
    {
      "epoch": 1.6493923412061453,
      "eval_loss": 0.10813174396753311,
      "eval_runtime": 47.5685,
      "eval_samples_per_second": 4.835,
      "eval_steps_per_second": 4.835,
      "step": 900
    },
    {
      "epoch": 1.667736757624398,
      "grad_norm": 0.07806720584630966,
      "learning_rate": 9.829501197735866e-05,
      "loss": 0.1172,
      "step": 910
    },
    {
      "epoch": 1.6860811740426507,
      "grad_norm": 0.09959676861763,
      "learning_rate": 9.616453206475179e-05,
      "loss": 0.1214,
      "step": 920
    },
    {
      "epoch": 1.7044255904609034,
      "grad_norm": 0.07471271604299545,
      "learning_rate": 9.40357943869858e-05,
      "loss": 0.1212,
      "step": 930
    },
    {
      "epoch": 1.7044255904609034,
      "eval_loss": 0.1081625297665596,
      "eval_runtime": 47.4204,
      "eval_samples_per_second": 4.85,
      "eval_steps_per_second": 4.85,
      "step": 930
    },
    {
      "epoch": 1.722770006879156,
      "grad_norm": 0.0786692202091217,
      "learning_rate": 9.190976590846027e-05,
      "loss": 0.1187,
      "step": 940
    },
    {
      "epoch": 1.7411144232974087,
      "grad_norm": 0.09310892224311829,
      "learning_rate": 8.978741236293973e-05,
      "loss": 0.115,
      "step": 950
    },
    {
      "epoch": 1.7594588397156614,
      "grad_norm": 0.10510563850402832,
      "learning_rate": 8.766969781487578e-05,
      "loss": 0.1254,
      "step": 960
    },
    {
      "epoch": 1.7594588397156614,
      "eval_loss": 0.10792473703622818,
      "eval_runtime": 47.4662,
      "eval_samples_per_second": 4.846,
      "eval_steps_per_second": 4.846,
      "step": 960
    },
    {
      "epoch": 1.777803256133914,
      "grad_norm": 0.07966180145740509,
      "learning_rate": 8.555758422148745e-05,
      "loss": 0.121,
      "step": 970
    },
    {
      "epoch": 1.7961476725521668,
      "grad_norm": 0.08049053698778152,
      "learning_rate": 8.345203099579874e-05,
      "loss": 0.11,
      "step": 980
    },
    {
      "epoch": 1.8144920889704195,
      "grad_norm": 0.06832806766033173,
      "learning_rate": 8.13539945708319e-05,
      "loss": 0.1236,
      "step": 990
    },
    {
      "epoch": 1.8144920889704195,
      "eval_loss": 0.10886505246162415,
      "eval_runtime": 47.5036,
      "eval_samples_per_second": 4.842,
      "eval_steps_per_second": 4.842,
      "step": 990
    },
    {
      "epoch": 1.8328365053886724,
      "grad_norm": 0.08461681008338928,
      "learning_rate": 7.926442796515429e-05,
      "loss": 0.1341,
      "step": 1000
    },
    {
      "epoch": 1.851180921806925,
      "grad_norm": 0.07851795107126236,
      "learning_rate": 7.71842803499764e-05,
      "loss": 0.1306,
      "step": 1010
    },
    {
      "epoch": 1.8695253382251777,
      "grad_norm": 0.0859571248292923,
      "learning_rate": 7.51144966179972e-05,
      "loss": 0.1155,
      "step": 1020
    },
    {
      "epoch": 1.8695253382251777,
      "eval_loss": 0.107667475938797,
      "eval_runtime": 47.4797,
      "eval_samples_per_second": 4.844,
      "eval_steps_per_second": 4.844,
      "step": 1020
    },
    {
      "epoch": 1.8878697546434304,
      "grad_norm": 0.08879910409450531,
      "learning_rate": 7.305601695419323e-05,
      "loss": 0.1173,
      "step": 1030
    },
    {
      "epoch": 1.9062141710616831,
      "grad_norm": 0.07893572747707367,
      "learning_rate": 7.10097764087462e-05,
      "loss": 0.1109,
      "step": 1040
    },
    {
      "epoch": 1.9245585874799358,
      "grad_norm": 0.0726647600531578,
      "learning_rate": 6.897670447230262e-05,
      "loss": 0.1136,
      "step": 1050
    },
    {
      "epoch": 1.9245585874799358,
      "eval_loss": 0.10811686515808105,
      "eval_runtime": 47.401,
      "eval_samples_per_second": 4.852,
      "eval_steps_per_second": 4.852,
      "step": 1050
    },
    {
      "epoch": 1.9429030038981885,
      "grad_norm": 0.08151935786008835,
      "learning_rate": 6.69577246537593e-05,
      "loss": 0.1145,
      "step": 1060
    },
    {
      "epoch": 1.9612474203164412,
      "grad_norm": 0.09550312161445618,
      "learning_rate": 6.495375406076573e-05,
      "loss": 0.1215,
      "step": 1070
    },
    {
      "epoch": 1.9795918367346939,
      "grad_norm": 0.20997484028339386,
      "learning_rate": 6.296570298313431e-05,
      "loss": 0.1222,
      "step": 1080
    },
    {
      "epoch": 1.9795918367346939,
      "eval_loss": 0.1070978045463562,
      "eval_runtime": 47.3536,
      "eval_samples_per_second": 4.857,
      "eval_steps_per_second": 4.857,
      "step": 1080
    },
    {
      "epoch": 1.9979362531529465,
      "grad_norm": 0.07865249365568161,
      "learning_rate": 6.099447447934743e-05,
      "loss": 0.1113,
      "step": 1090
    },
    {
      "epoch": 2.0146755331346022,
      "grad_norm": 0.08859091252088547,
      "learning_rate": 5.904096396634935e-05,
      "loss": 0.1253,
      "step": 1100
    },
    {
      "epoch": 2.033019949552855,
      "grad_norm": 0.08467495441436768,
      "learning_rate": 5.710605881280939e-05,
      "loss": 0.1168,
      "step": 1110
    },
    {
      "epoch": 2.033019949552855,
      "eval_loss": 0.10781675577163696,
      "eval_runtime": 47.5138,
      "eval_samples_per_second": 4.841,
      "eval_steps_per_second": 4.841,
      "step": 1110
    },
    {
      "epoch": 2.0513643659711076,
      "grad_norm": 0.07836291193962097,
      "learning_rate": 5.519063793604067e-05,
      "loss": 0.1138,
      "step": 1120
    },
    {
      "epoch": 2.0697087823893603,
      "grad_norm": 0.10746737569570541,
      "learning_rate": 5.329557140275801e-05,
      "loss": 0.1146,
      "step": 1130
    },
    {
      "epoch": 2.088053198807613,
      "grad_norm": 0.06803814321756363,
      "learning_rate": 5.1421720033856216e-05,
      "loss": 0.1148,
      "step": 1140
    },
    {
      "epoch": 2.088053198807613,
      "eval_loss": 0.10701934248209,
      "eval_runtime": 47.4053,
      "eval_samples_per_second": 4.852,
      "eval_steps_per_second": 4.852,
      "step": 1140
    },
    {
      "epoch": 2.1063976152258657,
      "grad_norm": 0.08060242980718613,
      "learning_rate": 4.9569935013388125e-05,
      "loss": 0.1099,
      "step": 1150
    },
    {
      "epoch": 2.1247420316441183,
      "grad_norm": 0.08353856205940247,
      "learning_rate": 4.774105750192e-05,
      "loss": 0.1203,
      "step": 1160
    },
    {
      "epoch": 2.143086448062371,
      "grad_norm": 0.08800723403692245,
      "learning_rate": 4.593591825444028e-05,
      "loss": 0.112,
      "step": 1170
    },
    {
      "epoch": 2.143086448062371,
      "eval_loss": 0.10760512948036194,
      "eval_runtime": 47.4958,
      "eval_samples_per_second": 4.843,
      "eval_steps_per_second": 4.843,
      "step": 1170
    },
    {
      "epoch": 2.1614308644806237,
      "grad_norm": 0.07223484665155411,
      "learning_rate": 4.415533724299471e-05,
      "loss": 0.1221,
      "step": 1180
    },
    {
      "epoch": 2.1797752808988764,
      "grad_norm": 0.07669170200824738,
      "learning_rate": 4.240012328421997e-05,
      "loss": 0.1088,
      "step": 1190
    },
    {
      "epoch": 2.198119697317129,
      "grad_norm": 0.07591399550437927,
      "learning_rate": 4.067107367194397e-05,
      "loss": 0.1115,
      "step": 1200
    },
    {
      "epoch": 2.198119697317129,
      "eval_loss": 0.1069360300898552,
      "eval_runtime": 47.5987,
      "eval_samples_per_second": 4.832,
      "eval_steps_per_second": 4.832,
      "step": 1200
    },
    {
      "epoch": 2.2164641137353818,
      "grad_norm": 0.0850834846496582,
      "learning_rate": 3.8968973815020806e-05,
      "loss": 0.1149,
      "step": 1210
    },
    {
      "epoch": 2.2348085301536345,
      "grad_norm": 0.08643563091754913,
      "learning_rate": 3.729459688056427e-05,
      "loss": 0.1125,
      "step": 1220
    },
    {
      "epoch": 2.253152946571887,
      "grad_norm": 0.07351703196763992,
      "learning_rate": 3.564870344274185e-05,
      "loss": 0.1099,
      "step": 1230
    },
    {
      "epoch": 2.253152946571887,
      "eval_loss": 0.10722808539867401,
      "eval_runtime": 47.5417,
      "eval_samples_per_second": 4.838,
      "eval_steps_per_second": 4.838,
      "step": 1230
    },
    {
      "epoch": 2.27149736299014,
      "grad_norm": 0.09461668133735657,
      "learning_rate": 3.403204113728933e-05,
      "loss": 0.1189,
      "step": 1240
    },
    {
      "epoch": 2.2898417794083925,
      "grad_norm": 0.09557740390300751,
      "learning_rate": 3.244534432190225e-05,
      "loss": 0.1219,
      "step": 1250
    },
    {
      "epoch": 2.308186195826645,
      "grad_norm": 0.09171107411384583,
      "learning_rate": 3.088933374265919e-05,
      "loss": 0.1199,
      "step": 1260
    },
    {
      "epoch": 2.308186195826645,
      "eval_loss": 0.10666479170322418,
      "eval_runtime": 47.7241,
      "eval_samples_per_second": 4.819,
      "eval_steps_per_second": 4.819,
      "step": 1260
    },
    {
      "epoch": 2.326530612244898,
      "grad_norm": 0.09569697827100754,
      "learning_rate": 2.936471620662763e-05,
      "loss": 0.1018,
      "step": 1270
    },
    {
      "epoch": 2.3448750286631506,
      "grad_norm": 0.10778363794088364,
      "learning_rate": 2.7872184260801838e-05,
      "loss": 0.1156,
      "step": 1280
    },
    {
      "epoch": 2.3632194450814032,
      "grad_norm": 0.07144766300916672,
      "learning_rate": 2.6412415877518238e-05,
      "loss": 0.1171,
      "step": 1290
    },
    {
      "epoch": 2.3632194450814032,
      "eval_loss": 0.10628043115139008,
      "eval_runtime": 47.8715,
      "eval_samples_per_second": 4.805,
      "eval_steps_per_second": 4.805,
      "step": 1290
    },
    {
      "epoch": 2.381563861499656,
      "grad_norm": 0.10883186757564545,
      "learning_rate": 2.4986074146490967e-05,
      "loss": 0.115,
      "step": 1300
    },
    {
      "epoch": 2.3999082779179086,
      "grad_norm": 0.08375997096300125,
      "learning_rate": 2.35938069736081e-05,
      "loss": 0.1134,
      "step": 1310
    },
    {
      "epoch": 2.4182526943361613,
      "grad_norm": 0.09004294127225876,
      "learning_rate": 2.2236246786624792e-05,
      "loss": 0.1067,
      "step": 1320
    },
    {
      "epoch": 2.4182526943361613,
      "eval_loss": 0.10667029023170471,
      "eval_runtime": 47.8027,
      "eval_samples_per_second": 4.811,
      "eval_steps_per_second": 4.811,
      "step": 1320
    },
    {
      "epoch": 2.436597110754414,
      "grad_norm": 0.08431920409202576,
      "learning_rate": 2.091401024788745e-05,
      "loss": 0.1175,
      "step": 1330
    },
    {
      "epoch": 2.4549415271726667,
      "grad_norm": 0.06736938655376434,
      "learning_rate": 1.962769797421895e-05,
      "loss": 0.1081,
      "step": 1340
    },
    {
      "epoch": 2.4732859435909194,
      "grad_norm": 0.0825665220618248,
      "learning_rate": 1.83778942640927e-05,
      "loss": 0.1144,
      "step": 1350
    },
    {
      "epoch": 2.4732859435909194,
| "eval_loss": 0.10638684034347534, | |
| "eval_runtime": 47.5125, | |
| "eval_samples_per_second": 4.841, | |
| "eval_steps_per_second": 4.841, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 2.491630360009172, | |
| "grad_norm": 0.09597857296466827, | |
| "learning_rate": 1.716516683221906e-05, | |
| "loss": 0.1174, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 2.509974776427425, | |
| "grad_norm": 0.087304025888443, | |
| "learning_rate": 1.5990066551664906e-05, | |
| "loss": 0.1217, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 2.528319192845678, | |
| "grad_norm": 0.1020963117480278, | |
| "learning_rate": 1.4853127203623252e-05, | |
| "loss": 0.1206, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 2.528319192845678, | |
| "eval_loss": 0.10611271858215332, | |
| "eval_runtime": 47.5951, | |
| "eval_samples_per_second": 4.832, | |
| "eval_steps_per_second": 4.832, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 2.5466636092639305, | |
| "grad_norm": 0.09987211227416992, | |
| "learning_rate": 1.3754865234946835e-05, | |
| "loss": 0.1237, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 2.5650080256821832, | |
| "grad_norm": 0.09537294507026672, | |
| "learning_rate": 1.2695779523555829e-05, | |
| "loss": 0.1074, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.583352442100436, | |
| "grad_norm": 0.06951133906841278, | |
| "learning_rate": 1.1676351151825804e-05, | |
| "loss": 0.1113, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 2.583352442100436, | |
| "eval_loss": 0.10620437562465668, | |
| "eval_runtime": 47.5731, | |
| "eval_samples_per_second": 4.835, | |
| "eval_steps_per_second": 4.835, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 2.6016968585186886, | |
| "grad_norm": 0.08393154293298721, | |
| "learning_rate": 1.0697043188059475e-05, | |
| "loss": 0.1082, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 2.6200412749369413, | |
| "grad_norm": 0.08919060230255127, | |
| "learning_rate": 9.75830047614117e-06, | |
| "loss": 0.1129, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 2.638385691355194, | |
| "grad_norm": 0.08663026988506317, | |
| "learning_rate": 8.860549433469444e-06, | |
| "loss": 0.1151, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 2.638385691355194, | |
| "eval_loss": 0.10626456141471863, | |
| "eval_runtime": 47.3921, | |
| "eval_samples_per_second": 4.853, | |
| "eval_steps_per_second": 4.853, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 2.6567301077734466, | |
| "grad_norm": 0.09450593590736389, | |
| "learning_rate": 8.004197857260042e-06, | |
| "loss": 0.1154, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 2.6750745241916993, | |
| "grad_norm": 0.08700842410326004, | |
| "learning_rate": 7.189634739306705e-06, | |
| "loss": 0.1113, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 2.693418940609952, | |
| "grad_norm": 0.08897579461336136, | |
| "learning_rate": 6.4172300892844425e-06, | |
| "loss": 0.1068, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 2.693418940609952, | |
| "eval_loss": 0.10611724108457565, | |
| "eval_runtime": 47.6236, | |
| "eval_samples_per_second": 4.83, | |
| "eval_steps_per_second": 4.83, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 2.7117633570282047, | |
| "grad_norm": 0.08531934022903442, | |
| "learning_rate": 5.687334766675123e-06, | |
| "loss": 0.1057, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 2.7301077734464574, | |
| "grad_norm": 0.08912410587072372, | |
| "learning_rate": 5.000280321392004e-06, | |
| "loss": 0.1179, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 2.74845218986471, | |
| "grad_norm": 0.0903608500957489, | |
| "learning_rate": 4.356378843175446e-06, | |
| "loss": 0.1137, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.74845218986471, | |
| "eval_loss": 0.10605704039335251, | |
| "eval_runtime": 47.5904, | |
| "eval_samples_per_second": 4.833, | |
| "eval_steps_per_second": 4.833, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.7667966062829628, | |
| "grad_norm": 0.08206541836261749, | |
| "learning_rate": 3.75592281982835e-06, | |
| "loss": 0.121, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 2.7851410227012154, | |
| "grad_norm": 0.08107905834913254, | |
| "learning_rate": 3.1991850043555425e-06, | |
| "loss": 0.1141, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 2.803485439119468, | |
| "grad_norm": 0.09001165628433228, | |
| "learning_rate": 2.6864182910676273e-06, | |
| "loss": 0.1135, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 2.803485439119468, | |
| "eval_loss": 0.10597212612628937, | |
| "eval_runtime": 47.6034, | |
| "eval_samples_per_second": 4.832, | |
| "eval_steps_per_second": 4.832, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 2.821829855537721, | |
| "grad_norm": 0.08551483601331711, | |
| "learning_rate": 2.2178556007054872e-06, | |
| "loss": 0.1209, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 2.8401742719559735, | |
| "grad_norm": 0.09350485354661942, | |
| "learning_rate": 1.793709774637653e-06, | |
| "loss": 0.1174, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 2.858518688374226, | |
| "grad_norm": 0.08843007683753967, | |
| "learning_rate": 1.41417347817856e-06, | |
| "loss": 0.1146, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 2.858518688374226, | |
| "eval_loss": 0.10600461810827255, | |
| "eval_runtime": 47.5109, | |
| "eval_samples_per_second": 4.841, | |
| "eval_steps_per_second": 4.841, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 2.876863104792479, | |
| "grad_norm": 0.0968756377696991, | |
| "learning_rate": 1.079419113071678e-06, | |
| "loss": 0.1239, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 2.8952075212107315, | |
| "grad_norm": 0.09828708320856094, | |
| "learning_rate": 7.895987391771997e-07, | |
| "loss": 0.1206, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 2.9135519376289842, | |
| "grad_norm": 0.09327838569879532, | |
| "learning_rate": 5.448440053999137e-07, | |
| "loss": 0.1173, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 2.9135519376289842, | |
| "eval_loss": 0.10602891445159912, | |
| "eval_runtime": 47.5894, | |
| "eval_samples_per_second": 4.833, | |
| "eval_steps_per_second": 4.833, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 2.931896354047237, | |
| "grad_norm": 0.07366887480020523, | |
| "learning_rate": 3.45266089888574e-07, | |
| "loss": 0.1076, | |
| "step": 1600 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 1638, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.2220701474596557e+17, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |