{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 2000, "global_step": 514, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.038910505836575876, "grad_norm": 3.7059597969055176, "learning_rate": 9.994024049928221e-05, "loss": 17.1185, "step": 10 }, { "epoch": 0.07782101167315175, "grad_norm": 2.1613128185272217, "learning_rate": 9.969771232924403e-05, "loss": 9.3101, "step": 20 }, { "epoch": 0.11673151750972763, "grad_norm": 1.1486432552337646, "learning_rate": 9.926958555700134e-05, "loss": 8.2256, "step": 30 }, { "epoch": 0.1556420233463035, "grad_norm": 1.1596895456314087, "learning_rate": 9.865745904348295e-05, "loss": 8.105, "step": 40 }, { "epoch": 0.19455252918287938, "grad_norm": 1.1468164920806885, "learning_rate": 9.786361880589083e-05, "loss": 7.914, "step": 50 }, { "epoch": 0.23346303501945526, "grad_norm": 1.1589274406433105, "learning_rate": 9.689102948045397e-05, "loss": 7.9438, "step": 60 }, { "epoch": 0.2723735408560311, "grad_norm": 1.0185976028442383, "learning_rate": 9.574332325084563e-05, "loss": 7.8356, "step": 70 }, { "epoch": 0.311284046692607, "grad_norm": 0.9653449058532715, "learning_rate": 9.442478628361098e-05, "loss": 7.6501, "step": 80 }, { "epoch": 0.35019455252918286, "grad_norm": 0.959281861782074, "learning_rate": 9.294034272126287e-05, "loss": 7.6496, "step": 90 }, { "epoch": 0.38910505836575876, "grad_norm": 0.9253648519515991, "learning_rate": 9.129553629282448e-05, "loss": 7.421, "step": 100 }, { "epoch": 0.4280155642023346, "grad_norm": 0.9299134016036987, "learning_rate": 8.949650961049478e-05, "loss": 7.5062, "step": 110 }, { "epoch": 0.4669260700389105, "grad_norm": 0.9562272429466248, "learning_rate": 8.754998122975489e-05, "loss": 7.43, "step": 120 }, { "epoch": 0.5058365758754864, "grad_norm": 0.9745796322822571, "learning_rate": 8.546322055858526e-05, "loss": 7.3152, "step": 130 }, { "epoch": 0.5447470817120622, "grad_norm": 1.0300663709640503, "learning_rate": 8.324402070949658e-05, "loss": 7.3288, "step": 140 }, { "epoch": 0.5836575875486382, "grad_norm": 0.8551069498062134, "learning_rate": 8.09006693957597e-05, "loss": 7.288, "step": 150 }, { "epoch": 0.622568093385214, "grad_norm": 0.9526028037071228, "learning_rate": 7.844191798052438e-05, "loss": 7.3128, "step": 160 }, { "epoch": 0.6614785992217899, "grad_norm": 0.8692190647125244, "learning_rate": 7.587694879441401e-05, "loss": 7.0454, "step": 170 }, { "epoch": 0.7003891050583657, "grad_norm": 0.7848012447357178, "learning_rate": 7.321534084365102e-05, "loss": 7.0277, "step": 180 }, { "epoch": 0.7392996108949417, "grad_norm": 0.7873611450195312, "learning_rate": 7.046703403677695e-05, "loss": 6.9448, "step": 190 }, { "epoch": 0.7782101167315175, "grad_norm": 0.8347607254981995, "learning_rate": 6.764229206356498e-05, "loss": 7.0677, "step": 200 }, { "epoch": 0.8171206225680934, "grad_norm": 0.7865148186683655, "learning_rate": 6.475166406475515e-05, "loss": 6.9666, "step": 210 }, { "epoch": 0.8560311284046692, "grad_norm": 0.8299317359924316, "learning_rate": 6.180594523575838e-05, "loss": 7.1694, "step": 220 }, { "epoch": 0.8949416342412452, "grad_norm": 0.8158650994300842, "learning_rate": 5.881613651145732e-05, "loss": 6.7787, "step": 230 }, { "epoch": 0.933852140077821, "grad_norm": 0.8030637502670288, "learning_rate": 5.579340348266251e-05, "loss": 6.7451, "step": 240 }, { "epoch": 0.9727626459143969, "grad_norm": 0.8576043844223022, "learning_rate": 5.27490346976529e-05, "loss": 6.7098, "step": 250 }, { "epoch": 1.0116731517509727, "grad_norm": 1.007199764251709, "learning_rate": 4.969439950452543e-05, "loss": 6.3174, "step": 260 }, { "epoch": 1.0505836575875487, "grad_norm": 0.7966341972351074, "learning_rate": 4.664090559179367e-05, "loss": 5.085, "step": 270 }, { "epoch": 1.0894941634241244, "grad_norm": 0.8274776339530945, "learning_rate": 4.359995638580226e-05, "loss": 5.2567, "step": 280 }, { "epoch": 1.1284046692607004, "grad_norm": 0.8015701770782471, "learning_rate": 4.0582908464058556e-05, "loss": 5.1255, "step": 290 }, { "epoch": 1.1673151750972763, "grad_norm": 0.7696110010147095, "learning_rate": 3.7601029143523764e-05, "loss": 5.1827, "step": 300 }, { "epoch": 1.206225680933852, "grad_norm": 0.7955564260482788, "learning_rate": 3.466545440225193e-05, "loss": 5.1197, "step": 310 }, { "epoch": 1.245136186770428, "grad_norm": 0.7841880917549133, "learning_rate": 3.1787147291520674e-05, "loss": 5.1158, "step": 320 }, { "epoch": 1.2840466926070038, "grad_norm": 0.7522294521331787, "learning_rate": 2.8976856993765766e-05, "loss": 5.1173, "step": 330 }, { "epoch": 1.3229571984435797, "grad_norm": 0.7152595520019531, "learning_rate": 2.6245078679219505e-05, "loss": 5.0291, "step": 340 }, { "epoch": 1.3618677042801557, "grad_norm": 0.697422206401825, "learning_rate": 2.3602014311170523e-05, "loss": 4.9941, "step": 350 }, { "epoch": 1.4007782101167314, "grad_norm": 0.7435976266860962, "learning_rate": 2.1057534546219658e-05, "loss": 5.1369, "step": 360 }, { "epoch": 1.4396887159533074, "grad_norm": 0.8627301454544067, "learning_rate": 1.862114187181705e-05, "loss": 4.9706, "step": 370 }, { "epoch": 1.4785992217898833, "grad_norm": 0.7423489093780518, "learning_rate": 1.6301935118745826e-05, "loss": 4.9751, "step": 380 }, { "epoch": 1.517509727626459, "grad_norm": 0.7178505659103394, "learning_rate": 1.4108575481081521e-05, "loss": 4.8353, "step": 390 }, { "epoch": 1.556420233463035, "grad_norm": 0.7140501737594604, "learning_rate": 1.2049254170527857e-05, "loss": 4.9182, "step": 400 }, { "epoch": 1.595330739299611, "grad_norm": 0.7342681884765625, "learning_rate": 1.013166182592551e-05, "loss": 4.8421, "step": 410 }, { "epoch": 1.6342412451361867, "grad_norm": 0.716266930103302, "learning_rate": 8.36295979217494e-06, "loss": 4.8006, "step": 420 }, { "epoch": 1.6731517509727627, "grad_norm": 0.7294363975524902, "learning_rate": 6.7497533758344665e-06, "loss": 4.7904, "step": 430 }, { "epoch": 1.7120622568093387, "grad_norm": 0.6808627843856812, "learning_rate": 5.298067177271143e-06, "loss": 4.7973, "step": 440 }, { "epoch": 1.7509727626459144, "grad_norm": 0.7073158025741577, "learning_rate": 4.01332259148815e-06, "loss": 4.7772, "step": 450 }, { "epoch": 1.7898832684824901, "grad_norm": 0.6910358667373657, "learning_rate": 2.9003175616530265e-06, "loss": 4.9969, "step": 460 }, { "epoch": 1.8287937743190663, "grad_norm": 0.688657283782959, "learning_rate": 1.963208660937904e-06, "loss": 4.9254, "step": 470 }, { "epoch": 1.867704280155642, "grad_norm": 0.6506223678588867, "learning_rate": 1.205495569588283e-06, "loss": 4.9407, "step": 480 }, { "epoch": 1.9066147859922178, "grad_norm": 0.6927580237388611, "learning_rate": 6.300080051914791e-07, "loss": 4.8365, "step": 490 }, { "epoch": 1.9455252918287937, "grad_norm": 0.682306170463562, "learning_rate": 2.3889515495413296e-07, "loss": 4.8349, "step": 500 }, { "epoch": 1.9844357976653697, "grad_norm": 0.6962385177612305, "learning_rate": 3.361764945473134e-08, "loss": 4.9875, "step": 510 }, { "epoch": 2.0, "step": 514, "total_flos": 3.8982891094409216e+17, "train_loss": 6.394122821347723, "train_runtime": 7794.0822, "train_samples_per_second": 4.219, "train_steps_per_second": 0.066 } ], "logging_steps": 10, "max_steps": 514, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 15000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.8982891094409216e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }