| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.8936719758932417, |
| "eval_steps": 30, |
| "global_step": 1100, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.017219113215669393, |
| "grad_norm": 7.988399028778076, |
| "learning_rate": 1.5384615384615387e-05, |
| "loss": 2.5848, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.034438226431338786, |
| "grad_norm": 0.776594340801239, |
| "learning_rate": 3.247863247863248e-05, |
| "loss": 0.9692, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.05165733964700818, |
| "grad_norm": 0.7523375749588013, |
| "learning_rate": 4.9572649572649575e-05, |
| "loss": 0.6028, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.05165733964700818, |
| "eval_loss": 0.5047398805618286, |
| "eval_runtime": 54.2722, |
| "eval_samples_per_second": 4.514, |
| "eval_steps_per_second": 4.514, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.06887645286267757, |
| "grad_norm": 0.5522134304046631, |
| "learning_rate": 6.666666666666667e-05, |
| "loss": 0.4415, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.08609556607834697, |
| "grad_norm": 0.5470026731491089, |
| "learning_rate": 8.376068376068377e-05, |
| "loss": 0.3325, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.10331467929401636, |
| "grad_norm": 0.5383180975914001, |
| "learning_rate": 0.00010085470085470086, |
| "loss": 0.2818, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.10331467929401636, |
| "eval_loss": 0.24386584758758545, |
| "eval_runtime": 53.5106, |
| "eval_samples_per_second": 4.579, |
| "eval_steps_per_second": 4.579, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.12053379250968575, |
| "grad_norm": 0.4517495632171631, |
| "learning_rate": 0.00011794871794871796, |
| "loss": 0.2372, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.13775290572535515, |
| "grad_norm": 0.6536285877227783, |
| "learning_rate": 0.00013504273504273505, |
| "loss": 0.2293, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.15497201894102453, |
| "grad_norm": 0.44978171586990356, |
| "learning_rate": 0.00015213675213675214, |
| "loss": 0.1987, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.15497201894102453, |
| "eval_loss": 0.17985357344150543, |
| "eval_runtime": 53.3878, |
| "eval_samples_per_second": 4.589, |
| "eval_steps_per_second": 4.589, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.17219113215669393, |
| "grad_norm": 0.4183058440685272, |
| "learning_rate": 0.00016923076923076923, |
| "loss": 0.1891, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.1894102453723633, |
| "grad_norm": 0.3675980269908905, |
| "learning_rate": 0.00018632478632478634, |
| "loss": 0.1746, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.20662935858803272, |
| "grad_norm": 0.29412326216697693, |
| "learning_rate": 0.00019999819242697418, |
| "loss": 0.1701, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.20662935858803272, |
| "eval_loss": 0.14962351322174072, |
| "eval_runtime": 53.3747, |
| "eval_samples_per_second": 4.59, |
| "eval_steps_per_second": 4.59, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.2238484718037021, |
| "grad_norm": 0.6068556904792786, |
| "learning_rate": 0.00019993493423217814, |
| "loss": 0.1631, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.2410675850193715, |
| "grad_norm": 0.4110848605632782, |
| "learning_rate": 0.00019978136272187747, |
| "loss": 0.1457, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.2582866982350409, |
| "grad_norm": 0.2771225571632385, |
| "learning_rate": 0.0001995376166818969, |
| "loss": 0.1598, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.2582866982350409, |
| "eval_loss": 0.15198171138763428, |
| "eval_runtime": 53.4628, |
| "eval_samples_per_second": 4.583, |
| "eval_steps_per_second": 4.583, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.2755058114507103, |
| "grad_norm": 0.2621934711933136, |
| "learning_rate": 0.00019920391639069242, |
| "loss": 0.1463, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.29272492466637967, |
| "grad_norm": 0.22565044462680817, |
| "learning_rate": 0.00019878056342028102, |
| "loss": 0.1561, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.30994403788204905, |
| "grad_norm": 0.2568676173686981, |
| "learning_rate": 0.00019826794036370362, |
| "loss": 0.1493, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.30994403788204905, |
| "eval_loss": 0.13555820286273956, |
| "eval_runtime": 53.598, |
| "eval_samples_per_second": 4.571, |
| "eval_steps_per_second": 4.571, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.3271631510977185, |
| "grad_norm": 0.19471700489521027, |
| "learning_rate": 0.0001976665104892678, |
| "loss": 0.1472, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.34438226431338786, |
| "grad_norm": 0.16541129350662231, |
| "learning_rate": 0.00019697681732188218, |
| "loss": 0.1424, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.36160137752905724, |
| "grad_norm": 0.23701885342597961, |
| "learning_rate": 0.00019619948415186173, |
| "loss": 0.1386, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.36160137752905724, |
| "eval_loss": 0.13430726528167725, |
| "eval_runtime": 53.3368, |
| "eval_samples_per_second": 4.593, |
| "eval_steps_per_second": 4.593, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.3788204907447266, |
| "grad_norm": 0.15007953345775604, |
| "learning_rate": 0.00019533521347164687, |
| "loss": 0.1442, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.39603960396039606, |
| "grad_norm": 0.47527194023132324, |
| "learning_rate": 0.00019438478634094638, |
| "loss": 0.1431, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.41325871717606544, |
| "grad_norm": 0.18094901740550995, |
| "learning_rate": 0.00019334906168087698, |
| "loss": 0.1412, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.41325871717606544, |
| "eval_loss": 0.1281569004058838, |
| "eval_runtime": 53.5115, |
| "eval_samples_per_second": 4.578, |
| "eval_steps_per_second": 4.578, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.4304778303917348, |
| "grad_norm": 0.2504327893257141, |
| "learning_rate": 0.00019222897549773848, |
| "loss": 0.1518, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.4476969436074042, |
| "grad_norm": 0.2442682981491089, |
| "learning_rate": 0.00019102554003712466, |
| "loss": 0.1232, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.46491605682307363, |
| "grad_norm": 0.18342557549476624, |
| "learning_rate": 0.00018973984286913584, |
| "loss": 0.1369, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.46491605682307363, |
| "eval_loss": 0.1246885359287262, |
| "eval_runtime": 53.4863, |
| "eval_samples_per_second": 4.581, |
| "eval_steps_per_second": 4.581, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.482135170038743, |
| "grad_norm": 0.14184711873531342, |
| "learning_rate": 0.0001883730459055186, |
| "loss": 0.1306, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.4993542832544124, |
| "grad_norm": 0.2095550298690796, |
| "learning_rate": 0.00018692638434962143, |
| "loss": 0.1372, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.5165733964700818, |
| "grad_norm": 0.14405280351638794, |
| "learning_rate": 0.0001854011655801157, |
| "loss": 0.141, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.5165733964700818, |
| "eval_loss": 0.12198741734027863, |
| "eval_runtime": 53.9404, |
| "eval_samples_per_second": 4.542, |
| "eval_steps_per_second": 4.542, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.5337925096857512, |
| "grad_norm": 0.2048429548740387, |
| "learning_rate": 0.0001837987679694894, |
| "loss": 0.1358, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.5510116229014206, |
| "grad_norm": 0.16774621605873108, |
| "learning_rate": 0.0001821206396383831, |
| "loss": 0.1332, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.56823073611709, |
| "grad_norm": 0.18126481771469116, |
| "learning_rate": 0.00018036829714689252, |
| "loss": 0.1261, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.56823073611709, |
| "eval_loss": 0.12057841569185257, |
| "eval_runtime": 53.8415, |
| "eval_samples_per_second": 4.55, |
| "eval_steps_per_second": 4.55, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.5854498493327593, |
| "grad_norm": 0.1449306458234787, |
| "learning_rate": 0.00017854332412402108, |
| "loss": 0.1334, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.6026689625484287, |
| "grad_norm": 0.326578825712204, |
| "learning_rate": 0.00017664736983652088, |
| "loss": 0.1342, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.6198880757640981, |
| "grad_norm": 0.1592559516429901, |
| "learning_rate": 0.0001746821476984154, |
| "loss": 0.1299, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.6198880757640981, |
| "eval_loss": 0.11931858956813812, |
| "eval_runtime": 53.859, |
| "eval_samples_per_second": 4.549, |
| "eval_steps_per_second": 4.549, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.6371071889797676, |
| "grad_norm": 0.1399652510881424, |
| "learning_rate": 0.000172649433722551, |
| "loss": 0.1241, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.654326302195437, |
| "grad_norm": 0.12264904379844666, |
| "learning_rate": 0.00017055106491557645, |
| "loss": 0.1267, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.6715454154111064, |
| "grad_norm": 0.12538975477218628, |
| "learning_rate": 0.00016838893761780127, |
| "loss": 0.1265, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.6715454154111064, |
| "eval_loss": 0.11791779100894928, |
| "eval_runtime": 54.0636, |
| "eval_samples_per_second": 4.532, |
| "eval_steps_per_second": 4.532, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.6887645286267757, |
| "grad_norm": 0.13257497549057007, |
| "learning_rate": 0.00016616500578943273, |
| "loss": 0.1226, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.7059836418424451, |
| "grad_norm": 0.10295815765857697, |
| "learning_rate": 0.0001638812792447406, |
| "loss": 0.1278, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.7232027550581145, |
| "grad_norm": 0.12271147221326828, |
| "learning_rate": 0.0001615398218357457, |
| "loss": 0.1337, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.7232027550581145, |
| "eval_loss": 0.11634409427642822, |
| "eval_runtime": 53.9599, |
| "eval_samples_per_second": 4.54, |
| "eval_steps_per_second": 4.54, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.7404218682737839, |
| "grad_norm": 0.10796191543340683, |
| "learning_rate": 0.0001591427495870729, |
| "loss": 0.1282, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.7576409814894532, |
| "grad_norm": 0.13710945844650269, |
| "learning_rate": 0.00015669222878365486, |
| "loss": 0.1324, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.7748600947051227, |
| "grad_norm": 0.1041555106639862, |
| "learning_rate": 0.00015419047401301472, |
| "loss": 0.1321, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.7748600947051227, |
| "eval_loss": 0.11630499362945557, |
| "eval_runtime": 53.8765, |
| "eval_samples_per_second": 4.547, |
| "eval_steps_per_second": 4.547, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.7920792079207921, |
| "grad_norm": 0.13567079603672028, |
| "learning_rate": 0.0001516397461638962, |
| "loss": 0.1281, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.8092983211364615, |
| "grad_norm": 0.13361193239688873, |
| "learning_rate": 0.00014904235038305083, |
| "loss": 0.1227, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.8265174343521309, |
| "grad_norm": 0.14394643902778625, |
| "learning_rate": 0.0001464006339920278, |
| "loss": 0.1366, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.8265174343521309, |
| "eval_loss": 0.11639704555273056, |
| "eval_runtime": 53.9134, |
| "eval_samples_per_second": 4.544, |
| "eval_steps_per_second": 4.544, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.8437365475678003, |
| "grad_norm": 0.30794933438301086, |
| "learning_rate": 0.00014371698436585004, |
| "loss": 0.1287, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.8609556607834696, |
| "grad_norm": 0.11450614035129547, |
| "learning_rate": 0.0001409938267754926, |
| "loss": 0.1284, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.878174773999139, |
| "grad_norm": 1.7245404720306396, |
| "learning_rate": 0.0001382336221961141, |
| "loss": 0.1497, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.878174773999139, |
| "eval_loss": 0.1258104145526886, |
| "eval_runtime": 53.8228, |
| "eval_samples_per_second": 4.552, |
| "eval_steps_per_second": 4.552, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.8953938872148084, |
| "grad_norm": 0.1183474138379097, |
| "learning_rate": 0.00013543886508302148, |
| "loss": 0.1338, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.9126130004304779, |
| "grad_norm": 0.11419904977083206, |
| "learning_rate": 0.00013261208111737765, |
| "loss": 0.1189, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.9298321136461473, |
| "grad_norm": 0.11519365757703781, |
| "learning_rate": 0.00012975582492369016, |
| "loss": 0.1212, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.9298321136461473, |
| "eval_loss": 0.11533824354410172, |
| "eval_runtime": 53.938, |
| "eval_samples_per_second": 4.542, |
| "eval_steps_per_second": 4.542, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.9470512268618166, |
| "grad_norm": 0.10076776891946793, |
| "learning_rate": 0.00012687267776114304, |
| "loss": 0.1235, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.964270340077486, |
| "grad_norm": 0.1136770099401474, |
| "learning_rate": 0.0001239652451908579, |
| "loss": 0.1272, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.9814894532931554, |
| "grad_norm": 0.0829247385263443, |
| "learning_rate": 0.0001210361547211936, |
| "loss": 0.1092, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.9814894532931554, |
| "eval_loss": 0.11433606594800949, |
| "eval_runtime": 53.9521, |
| "eval_samples_per_second": 4.541, |
| "eval_steps_per_second": 4.541, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.9987085665088248, |
| "grad_norm": 0.1091647669672966, |
| "learning_rate": 0.000118088053433211, |
| "loss": 0.1262, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.0154972018941024, |
| "grad_norm": 0.1265544295310974, |
| "learning_rate": 0.00011512360558844994, |
| "loss": 0.1207, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.0327163151097718, |
| "grad_norm": 0.09603710472583771, |
| "learning_rate": 0.00011214549022117967, |
| "loss": 0.1209, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.0327163151097718, |
| "eval_loss": 0.1145610511302948, |
| "eval_runtime": 53.848, |
| "eval_samples_per_second": 4.55, |
| "eval_steps_per_second": 4.55, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.0499354283254412, |
| "grad_norm": 0.08947043865919113, |
| "learning_rate": 0.00010915639871729874, |
| "loss": 0.1079, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.0671545415411106, |
| "grad_norm": 0.1086762472987175, |
| "learning_rate": 0.00010615903238207292, |
| "loss": 0.1264, |
| "step": 620 |
| }, |
| { |
| "epoch": 1.08437365475678, |
| "grad_norm": 0.09653393179178238, |
| "learning_rate": 0.00010315609999890798, |
| "loss": 0.1213, |
| "step": 630 |
| }, |
| { |
| "epoch": 1.08437365475678, |
| "eval_loss": 0.11310213059186935, |
| "eval_runtime": 53.9218, |
| "eval_samples_per_second": 4.544, |
| "eval_steps_per_second": 4.544, |
| "step": 630 |
| }, |
| { |
| "epoch": 1.1015927679724493, |
| "grad_norm": 0.09453985095024109, |
| "learning_rate": 0.00010015031538136518, |
| "loss": 0.1234, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.118811881188119, |
| "grad_norm": 0.09583611786365509, |
| "learning_rate": 9.71443949206304e-05, |
| "loss": 0.1219, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.1360309944037883, |
| "grad_norm": 0.09019785374403, |
| "learning_rate": 9.41410551306537e-05, |
| "loss": 0.1151, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.1360309944037883, |
| "eval_loss": 0.11455921083688736, |
| "eval_runtime": 54.11, |
| "eval_samples_per_second": 4.528, |
| "eval_steps_per_second": 4.528, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.1532501076194577, |
| "grad_norm": 0.08167584985494614, |
| "learning_rate": 9.114301019317854e-05, |
| "loss": 0.1137, |
| "step": 670 |
| }, |
| { |
| "epoch": 1.170469220835127, |
| "grad_norm": 0.09403249621391296, |
| "learning_rate": 8.815296950487804e-05, |
| "loss": 0.1151, |
| "step": 680 |
| }, |
| { |
| "epoch": 1.1876883340507964, |
| "grad_norm": 0.10912812501192093, |
| "learning_rate": 8.517363522881579e-05, |
| "loss": 0.1201, |
| "step": 690 |
| }, |
| { |
| "epoch": 1.1876883340507964, |
| "eval_loss": 0.11227670311927795, |
| "eval_runtime": 53.8875, |
| "eval_samples_per_second": 4.547, |
| "eval_steps_per_second": 4.547, |
| "step": 690 |
| }, |
| { |
| "epoch": 1.2049074472664658, |
| "grad_norm": 0.1018662080168724, |
| "learning_rate": 8.220769985244425e-05, |
| "loss": 0.1239, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.2221265604821352, |
| "grad_norm": 0.08478210121393204, |
| "learning_rate": 7.925784375434629e-05, |
| "loss": 0.1125, |
| "step": 710 |
| }, |
| { |
| "epoch": 1.2393456736978046, |
| "grad_norm": 0.10391442477703094, |
| "learning_rate": 7.63267327819209e-05, |
| "loss": 0.1216, |
| "step": 720 |
| }, |
| { |
| "epoch": 1.2393456736978046, |
| "eval_loss": 0.11215105652809143, |
| "eval_runtime": 54.106, |
| "eval_samples_per_second": 4.528, |
| "eval_steps_per_second": 4.528, |
| "step": 720 |
| }, |
| { |
| "epoch": 1.256564786913474, |
| "grad_norm": 0.09761766344308853, |
| "learning_rate": 7.341701584220006e-05, |
| "loss": 0.1169, |
| "step": 730 |
| }, |
| { |
| "epoch": 1.2737839001291433, |
| "grad_norm": 0.09429631382226944, |
| "learning_rate": 7.05313225079756e-05, |
| "loss": 0.1192, |
| "step": 740 |
| }, |
| { |
| "epoch": 1.2910030133448127, |
| "grad_norm": 0.09800124913454056, |
| "learning_rate": 6.767226064139841e-05, |
| "loss": 0.1213, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.2910030133448127, |
| "eval_loss": 0.11156849563121796, |
| "eval_runtime": 53.9406, |
| "eval_samples_per_second": 4.542, |
| "eval_steps_per_second": 4.542, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.308222126560482, |
| "grad_norm": 0.09360459446907043, |
| "learning_rate": 6.484241403719842e-05, |
| "loss": 0.1207, |
| "step": 760 |
| }, |
| { |
| "epoch": 1.3254412397761515, |
| "grad_norm": 0.10078331083059311, |
| "learning_rate": 6.204434008765458e-05, |
| "loss": 0.1177, |
| "step": 770 |
| }, |
| { |
| "epoch": 1.3426603529918208, |
| "grad_norm": 0.10541950166225433, |
| "learning_rate": 5.9280567471425077e-05, |
| "loss": 0.1156, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.3426603529918208, |
| "eval_loss": 0.11099184304475784, |
| "eval_runtime": 53.971, |
| "eval_samples_per_second": 4.539, |
| "eval_steps_per_second": 4.539, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.3598794662074902, |
| "grad_norm": 0.08638057112693787, |
| "learning_rate": 5.655359386832728e-05, |
| "loss": 0.1208, |
| "step": 790 |
| }, |
| { |
| "epoch": 1.3770985794231598, |
| "grad_norm": 0.08249291032552719, |
| "learning_rate": 5.386588370213124e-05, |
| "loss": 0.1232, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.394317692638829, |
| "grad_norm": 0.08767437189817429, |
| "learning_rate": 5.121986591340808e-05, |
| "loss": 0.1177, |
| "step": 810 |
| }, |
| { |
| "epoch": 1.394317692638829, |
| "eval_loss": 0.1114472895860672, |
| "eval_runtime": 53.9211, |
| "eval_samples_per_second": 4.544, |
| "eval_steps_per_second": 4.544, |
| "step": 810 |
| }, |
| { |
| "epoch": 1.4115368058544986, |
| "grad_norm": 0.09942208975553513, |
| "learning_rate": 4.861793176444479e-05, |
| "loss": 0.1152, |
| "step": 820 |
| }, |
| { |
| "epoch": 1.428755919070168, |
| "grad_norm": 0.9783937931060791, |
| "learning_rate": 4.6062432678209986e-05, |
| "loss": 0.1129, |
| "step": 830 |
| }, |
| { |
| "epoch": 1.4459750322858373, |
| "grad_norm": 0.10314920544624329, |
| "learning_rate": 4.355567811332311e-05, |
| "loss": 0.1149, |
| "step": 840 |
| }, |
| { |
| "epoch": 1.4459750322858373, |
| "eval_loss": 0.11096727102994919, |
| "eval_runtime": 53.9371, |
| "eval_samples_per_second": 4.542, |
| "eval_steps_per_second": 4.542, |
| "step": 840 |
| }, |
| { |
| "epoch": 1.4631941455015067, |
| "grad_norm": 0.11127369850873947, |
| "learning_rate": 4.109993347694781e-05, |
| "loss": 0.1293, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.480413258717176, |
| "grad_norm": 0.09351260960102081, |
| "learning_rate": 3.8697418077495575e-05, |
| "loss": 0.1198, |
| "step": 860 |
| }, |
| { |
| "epoch": 1.4976323719328455, |
| "grad_norm": 0.08981552720069885, |
| "learning_rate": 3.635030311898975e-05, |
| "loss": 0.1166, |
| "step": 870 |
| }, |
| { |
| "epoch": 1.4976323719328455, |
| "eval_loss": 0.11088854819536209, |
| "eval_runtime": 54.014, |
| "eval_samples_per_second": 4.536, |
| "eval_steps_per_second": 4.536, |
| "step": 870 |
| }, |
| { |
| "epoch": 1.5148514851485149, |
| "grad_norm": 0.09358756244182587, |
| "learning_rate": 3.4060709738902485e-05, |
| "loss": 0.1229, |
| "step": 880 |
| }, |
| { |
| "epoch": 1.5320705983641842, |
| "grad_norm": 0.07254694402217865, |
| "learning_rate": 3.183070709123781e-05, |
| "loss": 0.1122, |
| "step": 890 |
| }, |
| { |
| "epoch": 1.5492897115798536, |
| "grad_norm": 0.0889156311750412, |
| "learning_rate": 2.9662310476593492e-05, |
| "loss": 0.1161, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.5492897115798536, |
| "eval_loss": 0.11066355556249619, |
| "eval_runtime": 54.1151, |
| "eval_samples_per_second": 4.527, |
| "eval_steps_per_second": 4.527, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.566508824795523, |
| "grad_norm": 0.08892069011926651, |
| "learning_rate": 2.7557479520891104e-05, |
| "loss": 0.1226, |
| "step": 910 |
| }, |
| { |
| "epoch": 1.5837279380111924, |
| "grad_norm": 0.09420421719551086, |
| "learning_rate": 2.551811640442081e-05, |
| "loss": 0.1157, |
| "step": 920 |
| }, |
| { |
| "epoch": 1.6009470512268618, |
| "grad_norm": 0.0823286697268486, |
| "learning_rate": 2.354606414280045e-05, |
| "loss": 0.1177, |
| "step": 930 |
| }, |
| { |
| "epoch": 1.6009470512268618, |
| "eval_loss": 0.10992853343486786, |
| "eval_runtime": 54.0273, |
| "eval_samples_per_second": 4.535, |
| "eval_steps_per_second": 4.535, |
| "step": 930 |
| }, |
| { |
| "epoch": 1.6181661644425311, |
| "grad_norm": 0.08363740146160126, |
| "learning_rate": 2.1643104921403657e-05, |
| "loss": 0.1164, |
| "step": 940 |
| }, |
| { |
| "epoch": 1.6353852776582007, |
| "grad_norm": 0.08525504916906357, |
| "learning_rate": 1.98109584847609e-05, |
| "loss": 0.1217, |
| "step": 950 |
| }, |
| { |
| "epoch": 1.65260439087387, |
| "grad_norm": 0.08780323714017868, |
| "learning_rate": 1.805128058239014e-05, |
| "loss": 0.1209, |
| "step": 960 |
| }, |
| { |
| "epoch": 1.65260439087387, |
| "eval_loss": 0.10971155762672424, |
| "eval_runtime": 54.0565, |
| "eval_samples_per_second": 4.532, |
| "eval_steps_per_second": 4.532, |
| "step": 960 |
| }, |
| { |
| "epoch": 1.6698235040895395, |
| "grad_norm": 0.08478442579507828, |
| "learning_rate": 1.6365661472460946e-05, |
| "loss": 0.1225, |
| "step": 970 |
| }, |
| { |
| "epoch": 1.6870426173052087, |
| "grad_norm": 0.07923008501529694, |
| "learning_rate": 1.475562448464437e-05, |
| "loss": 0.1251, |
| "step": 980 |
| }, |
| { |
| "epoch": 1.7042617305208783, |
| "grad_norm": 0.08420061320066452, |
| "learning_rate": 1.3222624643447879e-05, |
| "loss": 0.1143, |
| "step": 990 |
| }, |
| { |
| "epoch": 1.7042617305208783, |
| "eval_loss": 0.10979125648736954, |
| "eval_runtime": 54.1033, |
| "eval_samples_per_second": 4.528, |
| "eval_steps_per_second": 4.528, |
| "step": 990 |
| }, |
| { |
| "epoch": 1.7214808437365474, |
| "grad_norm": 0.0820653885602951, |
| "learning_rate": 1.1768047353278721e-05, |
| "loss": 0.1174, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.738699956952217, |
| "grad_norm": 0.09282553195953369, |
| "learning_rate": 1.0393207146424766e-05, |
| "loss": 0.1166, |
| "step": 1010 |
| }, |
| { |
| "epoch": 1.7559190701678864, |
| "grad_norm": 0.08007533848285675, |
| "learning_rate": 9.09934649508375e-06, |
| "loss": 0.1219, |
| "step": 1020 |
| }, |
| { |
| "epoch": 1.7559190701678864, |
| "eval_loss": 0.10961439460515976, |
| "eval_runtime": 54.2265, |
| "eval_samples_per_second": 4.518, |
| "eval_steps_per_second": 4.518, |
| "step": 1020 |
| }, |
| { |
| "epoch": 1.7731381833835558, |
| "grad_norm": 0.0757787823677063, |
| "learning_rate": 7.887634688515e-06, |
| "loss": 0.1205, |
| "step": 1030 |
| }, |
| { |
| "epoch": 1.7903572965992252, |
| "grad_norm": 0.09963615238666534, |
| "learning_rate": 6.759166776327786e-06, |
| "loss": 0.1122, |
| "step": 1040 |
| }, |
| { |
| "epoch": 1.8075764098148945, |
| "grad_norm": 0.08780790865421295, |
| "learning_rate": 5.71496257886196e-06, |
| "loss": 0.1147, |
| "step": 1050 |
| }, |
| { |
| "epoch": 1.8075764098148945, |
| "eval_loss": 0.10968345403671265, |
| "eval_runtime": 54.1218, |
| "eval_samples_per_second": 4.527, |
| "eval_steps_per_second": 4.527, |
| "step": 1050 |
| }, |
| { |
| "epoch": 1.824795523030564, |
| "grad_norm": 0.08804041147232056, |
| "learning_rate": 4.755965765554637e-06, |
| "loss": 0.1118, |
| "step": 1060 |
| }, |
| { |
| "epoch": 1.8420146362462333, |
| "grad_norm": 0.08495019376277924, |
| "learning_rate": 3.883043002126219e-06, |
| "loss": 0.115, |
| "step": 1070 |
| }, |
| { |
| "epoch": 1.8592337494619027, |
| "grad_norm": 0.09160571545362473, |
| "learning_rate": 3.0969831673562042e-06, |
| "loss": 0.1142, |
| "step": 1080 |
| }, |
| { |
| "epoch": 1.8592337494619027, |
| "eval_loss": 0.10952436178922653, |
| "eval_runtime": 54.0717, |
| "eval_samples_per_second": 4.531, |
| "eval_steps_per_second": 4.531, |
| "step": 1080 |
| }, |
| { |
| "epoch": 1.876452862677572, |
| "grad_norm": 0.09295180439949036, |
| "learning_rate": 2.3984966401567e-06, |
| "loss": 0.1159, |
| "step": 1090 |
| }, |
| { |
| "epoch": 1.8936719758932417, |
| "grad_norm": 0.09608935564756393, |
| "learning_rate": 1.7882146575880166e-06, |
| "loss": 0.1164, |
| "step": 1100 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 1162, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.5706792867604582e+17, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|