{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.989247311827957, "eval_steps": 500, "global_step": 696, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.043010752688172046, "grad_norm": 3.3071749210357666, "learning_rate": 2.9984721919587606e-05, "loss": 0.9267, "num_input_tokens_seen": 26208, "step": 10 }, { "epoch": 0.08602150537634409, "grad_norm": 1.9867345094680786, "learning_rate": 2.9938918800982563e-05, "loss": 0.1561, "num_input_tokens_seen": 53152, "step": 20 }, { "epoch": 0.12903225806451613, "grad_norm": 10.124670028686523, "learning_rate": 2.9862683948682103e-05, "loss": 0.1764, "num_input_tokens_seen": 79776, "step": 30 }, { "epoch": 0.17204301075268819, "grad_norm": 2.0668740272521973, "learning_rate": 2.975617265898004e-05, "loss": 0.1505, "num_input_tokens_seen": 106496, "step": 40 }, { "epoch": 0.21505376344086022, "grad_norm": 2.0054426193237305, "learning_rate": 2.961960190361624e-05, "loss": 0.1615, "num_input_tokens_seen": 133728, "step": 50 }, { "epoch": 0.25806451612903225, "grad_norm": 2.5156946182250977, "learning_rate": 2.9453249887788343e-05, "loss": 0.1369, "num_input_tokens_seen": 159936, "step": 60 }, { "epoch": 0.3010752688172043, "grad_norm": 1.1180003881454468, "learning_rate": 2.925745548342631e-05, "loss": 0.1253, "num_input_tokens_seen": 186240, "step": 70 }, { "epoch": 0.34408602150537637, "grad_norm": 13.929261207580566, "learning_rate": 2.9032617538884018e-05, "loss": 0.1479, "num_input_tokens_seen": 212832, "step": 80 }, { "epoch": 0.3870967741935484, "grad_norm": 2.071075439453125, "learning_rate": 2.877919406645433e-05, "loss": 0.1102, "num_input_tokens_seen": 239648, "step": 90 }, { "epoch": 0.43010752688172044, "grad_norm": 2.170992374420166, "learning_rate": 2.84977013093626e-05, "loss": 0.099, "num_input_tokens_seen": 265888, "step": 100 }, { "epoch": 0.4731182795698925, "grad_norm": 5.617093086242676, "learning_rate": 2.818871269013928e-05, "loss": 0.107, "num_input_tokens_seen": 292480, "step": 110 }, { "epoch": 0.5161290322580645, "grad_norm": 6.3603010177612305, "learning_rate": 2.7852857642513838e-05, "loss": 0.1183, "num_input_tokens_seen": 318784, "step": 120 }, { "epoch": 0.5591397849462365, "grad_norm": 3.8436505794525146, "learning_rate": 2.7490820329209546e-05, "loss": 0.1097, "num_input_tokens_seen": 346016, "step": 130 }, { "epoch": 0.6021505376344086, "grad_norm": 1.736432671546936, "learning_rate": 2.7103338248251055e-05, "loss": 0.0946, "num_input_tokens_seen": 372384, "step": 140 }, { "epoch": 0.6451612903225806, "grad_norm": 8.300951957702637, "learning_rate": 2.6691200730623874e-05, "loss": 0.1251, "num_input_tokens_seen": 399328, "step": 150 }, { "epoch": 0.6881720430107527, "grad_norm": 2.540724277496338, "learning_rate": 2.6255247332346036e-05, "loss": 0.1069, "num_input_tokens_seen": 426048, "step": 160 }, { "epoch": 0.7311827956989247, "grad_norm": 1.967483639717102, "learning_rate": 2.5796366124227532e-05, "loss": 0.0904, "num_input_tokens_seen": 452640, "step": 170 }, { "epoch": 0.7741935483870968, "grad_norm": 5.206757545471191, "learning_rate": 2.531549188280135e-05, "loss": 0.1273, "num_input_tokens_seen": 479808, "step": 180 }, { "epoch": 0.8172043010752689, "grad_norm": 3.0387344360351562, "learning_rate": 2.481360418611132e-05, "loss": 0.1206, "num_input_tokens_seen": 506176, "step": 190 }, { "epoch": 0.8602150537634409, "grad_norm": 2.0281670093536377, "learning_rate": 2.4291725418235848e-05, "loss": 0.103, "num_input_tokens_seen": 533216, "step": 200 }, { "epoch": 0.9032258064516129, "grad_norm": 2.5650763511657715, "learning_rate": 2.3750918686612414e-05, "loss": 0.0696, "num_input_tokens_seen": 561056, "step": 210 }, { "epoch": 0.946236559139785, "grad_norm": 8.955713272094727, "learning_rate": 2.3192285656405456e-05, "loss": 0.0822, "num_input_tokens_seen": 588160, "step": 220 }, { "epoch": 0.989247311827957, "grad_norm": 3.034013032913208, "learning_rate": 2.2616964306329183e-05, "loss": 0.0913, "num_input_tokens_seen": 615168, "step": 230 }, { "epoch": 1.0301075268817204, "grad_norm": 2.60020112991333, "learning_rate": 2.2026126610496852e-05, "loss": 0.0735, "num_input_tokens_seen": 639864, "step": 240 }, { "epoch": 1.0731182795698926, "grad_norm": 4.891764163970947, "learning_rate": 2.1420976151018813e-05, "loss": 0.0752, "num_input_tokens_seen": 667224, "step": 250 }, { "epoch": 1.1161290322580646, "grad_norm": 1.1149002313613892, "learning_rate": 2.0802745666212592e-05, "loss": 0.0588, "num_input_tokens_seen": 693848, "step": 260 }, { "epoch": 1.1591397849462366, "grad_norm": 3.1601271629333496, "learning_rate": 2.0172694539419557e-05, "loss": 0.0924, "num_input_tokens_seen": 720568, "step": 270 }, { "epoch": 1.2021505376344086, "grad_norm": 3.555192470550537, "learning_rate": 1.953210623354359e-05, "loss": 0.062, "num_input_tokens_seen": 746872, "step": 280 }, { "epoch": 1.2451612903225806, "grad_norm": 2.235698699951172, "learning_rate": 1.888228567653781e-05, "loss": 0.0621, "num_input_tokens_seen": 773720, "step": 290 }, { "epoch": 1.2881720430107526, "grad_norm": 2.9058539867401123, "learning_rate": 1.8224556603165363e-05, "loss": 0.075, "num_input_tokens_seen": 801464, "step": 300 }, { "epoch": 1.3311827956989246, "grad_norm": 3.204787015914917, "learning_rate": 1.7560258858449248e-05, "loss": 0.0858, "num_input_tokens_seen": 829144, "step": 310 }, { "epoch": 1.3741935483870968, "grad_norm": 1.0885004997253418, "learning_rate": 1.689074566830434e-05, "loss": 0.0697, "num_input_tokens_seen": 855672, "step": 320 }, { "epoch": 1.4172043010752688, "grad_norm": 3.0750925540924072, "learning_rate": 1.621738088291147e-05, "loss": 0.0827, "num_input_tokens_seen": 882424, "step": 330 }, { "epoch": 1.4602150537634409, "grad_norm": 2.689297914505005, "learning_rate": 1.5541536198449044e-05, "loss": 0.0651, "num_input_tokens_seen": 908792, "step": 340 }, { "epoch": 1.5032258064516129, "grad_norm": 2.297851324081421, "learning_rate": 1.4864588362841808e-05, "loss": 0.0607, "num_input_tokens_seen": 935672, "step": 350 }, { "epoch": 1.546236559139785, "grad_norm": 2.712674140930176, "learning_rate": 1.4187916371218739e-05, "loss": 0.056, "num_input_tokens_seen": 961848, "step": 360 }, { "epoch": 1.589247311827957, "grad_norm": 0.8086225986480713, "learning_rate": 1.3512898656793283e-05, "loss": 0.0823, "num_input_tokens_seen": 988600, "step": 370 }, { "epoch": 1.632258064516129, "grad_norm": 2.166210174560547, "learning_rate": 1.2840910282888211e-05, "loss": 0.058, "num_input_tokens_seen": 1014840, "step": 380 }, { "epoch": 1.675268817204301, "grad_norm": 5.169621467590332, "learning_rate": 1.2173320141825232e-05, "loss": 0.0705, "num_input_tokens_seen": 1040856, "step": 390 }, { "epoch": 1.718279569892473, "grad_norm": 1.8176069259643555, "learning_rate": 1.1511488166385349e-05, "loss": 0.0514, "num_input_tokens_seen": 1067544, "step": 400 }, { "epoch": 1.761290322580645, "grad_norm": 3.424694776535034, "learning_rate": 1.0856762559520605e-05, "loss": 0.0834, "num_input_tokens_seen": 1094584, "step": 410 }, { "epoch": 1.8043010752688171, "grad_norm": 1.8838876485824585, "learning_rate": 1.0210477047960303e-05, "loss": 0.0583, "num_input_tokens_seen": 1120760, "step": 420 }, { "epoch": 1.8473118279569891, "grad_norm": 3.7757434844970703, "learning_rate": 9.573948165306438e-06, "loss": 0.0922, "num_input_tokens_seen": 1146776, "step": 430 }, { "epoch": 1.8903225806451613, "grad_norm": 3.0619328022003174, "learning_rate": 8.948472570152874e-06, "loss": 0.0633, "num_input_tokens_seen": 1174424, "step": 440 }, { "epoch": 1.9333333333333333, "grad_norm": 2.5175821781158447, "learning_rate": 8.33532440469145e-06, "loss": 0.0597, "num_input_tokens_seen": 1201048, "step": 450 }, { "epoch": 1.9763440860215054, "grad_norm": 3.6232197284698486, "learning_rate": 7.735752699185711e-06, "loss": 0.0491, "num_input_tokens_seen": 1227576, "step": 460 }, { "epoch": 2.0172043010752687, "grad_norm": 2.8846399784088135, "learning_rate": 7.150978827599619e-06, "loss": 0.0341, "num_input_tokens_seen": 1252160, "step": 470 }, { "epoch": 2.0602150537634407, "grad_norm": 2.196216106414795, "learning_rate": 6.582194019564266e-06, "loss": 0.0373, "num_input_tokens_seen": 1279328, "step": 480 }, { "epoch": 2.1032258064516127, "grad_norm": 1.4616115093231201, "learning_rate": 6.0305569337509225e-06, "loss": 0.0281, "num_input_tokens_seen": 1306304, "step": 490 }, { "epoch": 2.146236559139785, "grad_norm": 0.17581823468208313, "learning_rate": 5.497191297593647e-06, "loss": 0.0183, "num_input_tokens_seen": 1333184, "step": 500 }, { "epoch": 2.189247311827957, "grad_norm": 3.8919403553009033, "learning_rate": 4.98318361816957e-06, "loss": 0.0334, "num_input_tokens_seen": 1359872, "step": 510 }, { "epoch": 2.232258064516129, "grad_norm": 1.3041765689849854, "learning_rate": 4.4895809688998655e-06, "loss": 0.0282, "num_input_tokens_seen": 1387328, "step": 520 }, { "epoch": 2.275268817204301, "grad_norm": 1.669753074645996, "learning_rate": 4.017388856580178e-06, "loss": 0.0562, "num_input_tokens_seen": 1414816, "step": 530 }, { "epoch": 2.318279569892473, "grad_norm": 0.28061679005622864, "learning_rate": 3.567569173085455e-06, "loss": 0.0243, "num_input_tokens_seen": 1441504, "step": 540 }, { "epoch": 2.361290322580645, "grad_norm": 2.324270009994507, "learning_rate": 3.1410382359217645e-06, "loss": 0.044, "num_input_tokens_seen": 1467680, "step": 550 }, { "epoch": 2.404301075268817, "grad_norm": 2.708113670349121, "learning_rate": 2.7386649216166233e-06, "loss": 0.0551, "num_input_tokens_seen": 1494176, "step": 560 }, { "epoch": 2.447311827956989, "grad_norm": 3.16683030128479, "learning_rate": 2.361268895750264e-06, "loss": 0.0258, "num_input_tokens_seen": 1520544, "step": 570 }, { "epoch": 2.490322580645161, "grad_norm": 6.040332794189453, "learning_rate": 2.0096189432334194e-06, "loss": 0.0415, "num_input_tokens_seen": 1547264, "step": 580 }, { "epoch": 2.533333333333333, "grad_norm": 5.078160285949707, "learning_rate": 1.6844314022329676e-06, "loss": 0.0375, "num_input_tokens_seen": 1573920, "step": 590 }, { "epoch": 2.576344086021505, "grad_norm": 4.950022220611572, "learning_rate": 1.3863687049356465e-06, "loss": 0.0235, "num_input_tokens_seen": 1600640, "step": 600 }, { "epoch": 2.6193548387096772, "grad_norm": 1.7687643766403198, "learning_rate": 1.116038028122413e-06, "loss": 0.0354, "num_input_tokens_seen": 1626848, "step": 610 }, { "epoch": 2.6623655913978492, "grad_norm": 3.893580913543701, "learning_rate": 8.7399005630238e-07, "loss": 0.0357, "num_input_tokens_seen": 1653408, "step": 620 }, { "epoch": 2.7053763440860212, "grad_norm": 2.830453395843506, "learning_rate": 6.607178599258268e-07, "loss": 0.0512, "num_input_tokens_seen": 1679968, "step": 630 }, { "epoch": 2.7483870967741937, "grad_norm": 3.638772487640381, "learning_rate": 4.766558909615504e-07, "loss": 0.0243, "num_input_tokens_seen": 1706944, "step": 640 }, { "epoch": 2.7913978494623657, "grad_norm": 1.229244589805603, "learning_rate": 3.22179097884579e-07, "loss": 0.0367, "num_input_tokens_seen": 1733888, "step": 650 }, { "epoch": 2.8344086021505377, "grad_norm": 2.9207515716552734, "learning_rate": 1.9760216187710788e-07, "loss": 0.0377, "num_input_tokens_seen": 1760448, "step": 660 }, { "epoch": 2.8774193548387097, "grad_norm": 3.560971975326538, "learning_rate": 1.0317885579858522e-07, "loss": 0.0471, "num_input_tokens_seen": 1787072, "step": 670 }, { "epoch": 2.9204301075268817, "grad_norm": 0.2031625360250473, "learning_rate": 3.910152723075322e-08, "loss": 0.0222, "num_input_tokens_seen": 1813632, "step": 680 }, { "epoch": 2.9634408602150537, "grad_norm": 4.158380508422852, "learning_rate": 5.50070665074065e-09, "loss": 0.0454, "num_input_tokens_seen": 1840384, "step": 690 }, { "epoch": 2.989247311827957, "num_input_tokens_seen": 1855776, "step": 696, "total_flos": 7.944329136203366e+16, "train_loss": 0.08561765917459097, "train_runtime": 638.811, "train_samples_per_second": 8.721, "train_steps_per_second": 1.09 } ], "logging_steps": 10, "max_steps": 696, "num_input_tokens_seen": 1855776, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.944329136203366e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }