| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.989247311827957, | |
| "eval_steps": 500, | |
| "global_step": 696, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.043010752688172046, | |
| "grad_norm": 3.3071749210357666, | |
| "learning_rate": 2.9984721919587606e-05, | |
| "loss": 0.9267, | |
| "num_input_tokens_seen": 26208, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.08602150537634409, | |
| "grad_norm": 1.9867345094680786, | |
| "learning_rate": 2.9938918800982563e-05, | |
| "loss": 0.1561, | |
| "num_input_tokens_seen": 53152, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.12903225806451613, | |
| "grad_norm": 10.124670028686523, | |
| "learning_rate": 2.9862683948682103e-05, | |
| "loss": 0.1764, | |
| "num_input_tokens_seen": 79776, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.17204301075268819, | |
| "grad_norm": 2.0668740272521973, | |
| "learning_rate": 2.975617265898004e-05, | |
| "loss": 0.1505, | |
| "num_input_tokens_seen": 106496, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.21505376344086022, | |
| "grad_norm": 2.0054426193237305, | |
| "learning_rate": 2.961960190361624e-05, | |
| "loss": 0.1615, | |
| "num_input_tokens_seen": 133728, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.25806451612903225, | |
| "grad_norm": 2.5156946182250977, | |
| "learning_rate": 2.9453249887788343e-05, | |
| "loss": 0.1369, | |
| "num_input_tokens_seen": 159936, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.3010752688172043, | |
| "grad_norm": 1.1180003881454468, | |
| "learning_rate": 2.925745548342631e-05, | |
| "loss": 0.1253, | |
| "num_input_tokens_seen": 186240, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.34408602150537637, | |
| "grad_norm": 13.929261207580566, | |
| "learning_rate": 2.9032617538884018e-05, | |
| "loss": 0.1479, | |
| "num_input_tokens_seen": 212832, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.3870967741935484, | |
| "grad_norm": 2.071075439453125, | |
| "learning_rate": 2.877919406645433e-05, | |
| "loss": 0.1102, | |
| "num_input_tokens_seen": 239648, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.43010752688172044, | |
| "grad_norm": 2.170992374420166, | |
| "learning_rate": 2.84977013093626e-05, | |
| "loss": 0.099, | |
| "num_input_tokens_seen": 265888, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.4731182795698925, | |
| "grad_norm": 5.617093086242676, | |
| "learning_rate": 2.818871269013928e-05, | |
| "loss": 0.107, | |
| "num_input_tokens_seen": 292480, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.5161290322580645, | |
| "grad_norm": 6.3603010177612305, | |
| "learning_rate": 2.7852857642513838e-05, | |
| "loss": 0.1183, | |
| "num_input_tokens_seen": 318784, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.5591397849462365, | |
| "grad_norm": 3.8436505794525146, | |
| "learning_rate": 2.7490820329209546e-05, | |
| "loss": 0.1097, | |
| "num_input_tokens_seen": 346016, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.6021505376344086, | |
| "grad_norm": 1.736432671546936, | |
| "learning_rate": 2.7103338248251055e-05, | |
| "loss": 0.0946, | |
| "num_input_tokens_seen": 372384, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.6451612903225806, | |
| "grad_norm": 8.300951957702637, | |
| "learning_rate": 2.6691200730623874e-05, | |
| "loss": 0.1251, | |
| "num_input_tokens_seen": 399328, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.6881720430107527, | |
| "grad_norm": 2.540724277496338, | |
| "learning_rate": 2.6255247332346036e-05, | |
| "loss": 0.1069, | |
| "num_input_tokens_seen": 426048, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.7311827956989247, | |
| "grad_norm": 1.967483639717102, | |
| "learning_rate": 2.5796366124227532e-05, | |
| "loss": 0.0904, | |
| "num_input_tokens_seen": 452640, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.7741935483870968, | |
| "grad_norm": 5.206757545471191, | |
| "learning_rate": 2.531549188280135e-05, | |
| "loss": 0.1273, | |
| "num_input_tokens_seen": 479808, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.8172043010752689, | |
| "grad_norm": 3.0387344360351562, | |
| "learning_rate": 2.481360418611132e-05, | |
| "loss": 0.1206, | |
| "num_input_tokens_seen": 506176, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.8602150537634409, | |
| "grad_norm": 2.0281670093536377, | |
| "learning_rate": 2.4291725418235848e-05, | |
| "loss": 0.103, | |
| "num_input_tokens_seen": 533216, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.9032258064516129, | |
| "grad_norm": 2.5650763511657715, | |
| "learning_rate": 2.3750918686612414e-05, | |
| "loss": 0.0696, | |
| "num_input_tokens_seen": 561056, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.946236559139785, | |
| "grad_norm": 8.955713272094727, | |
| "learning_rate": 2.3192285656405456e-05, | |
| "loss": 0.0822, | |
| "num_input_tokens_seen": 588160, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.989247311827957, | |
| "grad_norm": 3.034013032913208, | |
| "learning_rate": 2.2616964306329183e-05, | |
| "loss": 0.0913, | |
| "num_input_tokens_seen": 615168, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 1.0301075268817204, | |
| "grad_norm": 2.60020112991333, | |
| "learning_rate": 2.2026126610496852e-05, | |
| "loss": 0.0735, | |
| "num_input_tokens_seen": 639864, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.0731182795698926, | |
| "grad_norm": 4.891764163970947, | |
| "learning_rate": 2.1420976151018813e-05, | |
| "loss": 0.0752, | |
| "num_input_tokens_seen": 667224, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.1161290322580646, | |
| "grad_norm": 1.1149002313613892, | |
| "learning_rate": 2.0802745666212592e-05, | |
| "loss": 0.0588, | |
| "num_input_tokens_seen": 693848, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 1.1591397849462366, | |
| "grad_norm": 3.1601271629333496, | |
| "learning_rate": 2.0172694539419557e-05, | |
| "loss": 0.0924, | |
| "num_input_tokens_seen": 720568, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 1.2021505376344086, | |
| "grad_norm": 3.555192470550537, | |
| "learning_rate": 1.953210623354359e-05, | |
| "loss": 0.062, | |
| "num_input_tokens_seen": 746872, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 1.2451612903225806, | |
| "grad_norm": 2.235698699951172, | |
| "learning_rate": 1.888228567653781e-05, | |
| "loss": 0.0621, | |
| "num_input_tokens_seen": 773720, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 1.2881720430107526, | |
| "grad_norm": 2.9058539867401123, | |
| "learning_rate": 1.8224556603165363e-05, | |
| "loss": 0.075, | |
| "num_input_tokens_seen": 801464, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.3311827956989246, | |
| "grad_norm": 3.204787015914917, | |
| "learning_rate": 1.7560258858449248e-05, | |
| "loss": 0.0858, | |
| "num_input_tokens_seen": 829144, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 1.3741935483870968, | |
| "grad_norm": 1.0885004997253418, | |
| "learning_rate": 1.689074566830434e-05, | |
| "loss": 0.0697, | |
| "num_input_tokens_seen": 855672, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 1.4172043010752688, | |
| "grad_norm": 3.0750925540924072, | |
| "learning_rate": 1.621738088291147e-05, | |
| "loss": 0.0827, | |
| "num_input_tokens_seen": 882424, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 1.4602150537634409, | |
| "grad_norm": 2.689297914505005, | |
| "learning_rate": 1.5541536198449044e-05, | |
| "loss": 0.0651, | |
| "num_input_tokens_seen": 908792, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.5032258064516129, | |
| "grad_norm": 2.297851324081421, | |
| "learning_rate": 1.4864588362841808e-05, | |
| "loss": 0.0607, | |
| "num_input_tokens_seen": 935672, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.546236559139785, | |
| "grad_norm": 2.712674140930176, | |
| "learning_rate": 1.4187916371218739e-05, | |
| "loss": 0.056, | |
| "num_input_tokens_seen": 961848, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.589247311827957, | |
| "grad_norm": 0.8086225986480713, | |
| "learning_rate": 1.3512898656793283e-05, | |
| "loss": 0.0823, | |
| "num_input_tokens_seen": 988600, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.632258064516129, | |
| "grad_norm": 2.166210174560547, | |
| "learning_rate": 1.2840910282888211e-05, | |
| "loss": 0.058, | |
| "num_input_tokens_seen": 1014840, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.675268817204301, | |
| "grad_norm": 5.169621467590332, | |
| "learning_rate": 1.2173320141825232e-05, | |
| "loss": 0.0705, | |
| "num_input_tokens_seen": 1040856, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.718279569892473, | |
| "grad_norm": 1.8176069259643555, | |
| "learning_rate": 1.1511488166385349e-05, | |
| "loss": 0.0514, | |
| "num_input_tokens_seen": 1067544, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.761290322580645, | |
| "grad_norm": 3.424694776535034, | |
| "learning_rate": 1.0856762559520605e-05, | |
| "loss": 0.0834, | |
| "num_input_tokens_seen": 1094584, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 1.8043010752688171, | |
| "grad_norm": 1.8838876485824585, | |
| "learning_rate": 1.0210477047960303e-05, | |
| "loss": 0.0583, | |
| "num_input_tokens_seen": 1120760, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.8473118279569891, | |
| "grad_norm": 3.7757434844970703, | |
| "learning_rate": 9.573948165306438e-06, | |
| "loss": 0.0922, | |
| "num_input_tokens_seen": 1146776, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.8903225806451613, | |
| "grad_norm": 3.0619328022003174, | |
| "learning_rate": 8.948472570152874e-06, | |
| "loss": 0.0633, | |
| "num_input_tokens_seen": 1174424, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.9333333333333333, | |
| "grad_norm": 2.5175821781158447, | |
| "learning_rate": 8.33532440469145e-06, | |
| "loss": 0.0597, | |
| "num_input_tokens_seen": 1201048, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.9763440860215054, | |
| "grad_norm": 3.6232197284698486, | |
| "learning_rate": 7.735752699185711e-06, | |
| "loss": 0.0491, | |
| "num_input_tokens_seen": 1227576, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 2.0172043010752687, | |
| "grad_norm": 2.8846399784088135, | |
| "learning_rate": 7.150978827599619e-06, | |
| "loss": 0.0341, | |
| "num_input_tokens_seen": 1252160, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 2.0602150537634407, | |
| "grad_norm": 2.196216106414795, | |
| "learning_rate": 6.582194019564266e-06, | |
| "loss": 0.0373, | |
| "num_input_tokens_seen": 1279328, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 2.1032258064516127, | |
| "grad_norm": 1.4616115093231201, | |
| "learning_rate": 6.0305569337509225e-06, | |
| "loss": 0.0281, | |
| "num_input_tokens_seen": 1306304, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 2.146236559139785, | |
| "grad_norm": 0.17581823468208313, | |
| "learning_rate": 5.497191297593647e-06, | |
| "loss": 0.0183, | |
| "num_input_tokens_seen": 1333184, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 2.189247311827957, | |
| "grad_norm": 3.8919403553009033, | |
| "learning_rate": 4.98318361816957e-06, | |
| "loss": 0.0334, | |
| "num_input_tokens_seen": 1359872, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 2.232258064516129, | |
| "grad_norm": 1.3041765689849854, | |
| "learning_rate": 4.4895809688998655e-06, | |
| "loss": 0.0282, | |
| "num_input_tokens_seen": 1387328, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 2.275268817204301, | |
| "grad_norm": 1.669753074645996, | |
| "learning_rate": 4.017388856580178e-06, | |
| "loss": 0.0562, | |
| "num_input_tokens_seen": 1414816, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 2.318279569892473, | |
| "grad_norm": 0.28061679005622864, | |
| "learning_rate": 3.567569173085455e-06, | |
| "loss": 0.0243, | |
| "num_input_tokens_seen": 1441504, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 2.361290322580645, | |
| "grad_norm": 2.324270009994507, | |
| "learning_rate": 3.1410382359217645e-06, | |
| "loss": 0.044, | |
| "num_input_tokens_seen": 1467680, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 2.404301075268817, | |
| "grad_norm": 2.708113670349121, | |
| "learning_rate": 2.7386649216166233e-06, | |
| "loss": 0.0551, | |
| "num_input_tokens_seen": 1494176, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 2.447311827956989, | |
| "grad_norm": 3.16683030128479, | |
| "learning_rate": 2.361268895750264e-06, | |
| "loss": 0.0258, | |
| "num_input_tokens_seen": 1520544, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 2.490322580645161, | |
| "grad_norm": 6.040332794189453, | |
| "learning_rate": 2.0096189432334194e-06, | |
| "loss": 0.0415, | |
| "num_input_tokens_seen": 1547264, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 2.533333333333333, | |
| "grad_norm": 5.078160285949707, | |
| "learning_rate": 1.6844314022329676e-06, | |
| "loss": 0.0375, | |
| "num_input_tokens_seen": 1573920, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 2.576344086021505, | |
| "grad_norm": 4.950022220611572, | |
| "learning_rate": 1.3863687049356465e-06, | |
| "loss": 0.0235, | |
| "num_input_tokens_seen": 1600640, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 2.6193548387096772, | |
| "grad_norm": 1.7687643766403198, | |
| "learning_rate": 1.116038028122413e-06, | |
| "loss": 0.0354, | |
| "num_input_tokens_seen": 1626848, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 2.6623655913978492, | |
| "grad_norm": 3.893580913543701, | |
| "learning_rate": 8.7399005630238e-07, | |
| "loss": 0.0357, | |
| "num_input_tokens_seen": 1653408, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 2.7053763440860212, | |
| "grad_norm": 2.830453395843506, | |
| "learning_rate": 6.607178599258268e-07, | |
| "loss": 0.0512, | |
| "num_input_tokens_seen": 1679968, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 2.7483870967741937, | |
| "grad_norm": 3.638772487640381, | |
| "learning_rate": 4.766558909615504e-07, | |
| "loss": 0.0243, | |
| "num_input_tokens_seen": 1706944, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 2.7913978494623657, | |
| "grad_norm": 1.229244589805603, | |
| "learning_rate": 3.22179097884579e-07, | |
| "loss": 0.0367, | |
| "num_input_tokens_seen": 1733888, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 2.8344086021505377, | |
| "grad_norm": 2.9207515716552734, | |
| "learning_rate": 1.9760216187710788e-07, | |
| "loss": 0.0377, | |
| "num_input_tokens_seen": 1760448, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 2.8774193548387097, | |
| "grad_norm": 3.560971975326538, | |
| "learning_rate": 1.0317885579858522e-07, | |
| "loss": 0.0471, | |
| "num_input_tokens_seen": 1787072, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 2.9204301075268817, | |
| "grad_norm": 0.2031625360250473, | |
| "learning_rate": 3.910152723075322e-08, | |
| "loss": 0.0222, | |
| "num_input_tokens_seen": 1813632, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 2.9634408602150537, | |
| "grad_norm": 4.158380508422852, | |
| "learning_rate": 5.50070665074065e-09, | |
| "loss": 0.0454, | |
| "num_input_tokens_seen": 1840384, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 2.989247311827957, | |
| "num_input_tokens_seen": 1855776, | |
| "step": 696, | |
| "total_flos": 7.944329136203366e+16, | |
| "train_loss": 0.08561765917459097, | |
| "train_runtime": 638.811, | |
| "train_samples_per_second": 8.721, | |
| "train_steps_per_second": 1.09 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 696, | |
| "num_input_tokens_seen": 1855776, | |
| "num_train_epochs": 3, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 7.944329136203366e+16, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |