{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 2000,
  "global_step": 514,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.038910505836575876,
      "grad_norm": 3.7059597969055176,
      "learning_rate": 9.994024049928221e-05,
      "loss": 17.1185,
      "step": 10
    },
    {
      "epoch": 0.07782101167315175,
      "grad_norm": 2.1613128185272217,
      "learning_rate": 9.969771232924403e-05,
      "loss": 9.3101,
      "step": 20
    },
    {
      "epoch": 0.11673151750972763,
      "grad_norm": 1.1486432552337646,
      "learning_rate": 9.926958555700134e-05,
      "loss": 8.2256,
      "step": 30
    },
    {
      "epoch": 0.1556420233463035,
      "grad_norm": 1.1596895456314087,
      "learning_rate": 9.865745904348295e-05,
      "loss": 8.105,
      "step": 40
    },
    {
      "epoch": 0.19455252918287938,
      "grad_norm": 1.1468164920806885,
      "learning_rate": 9.786361880589083e-05,
      "loss": 7.914,
      "step": 50
    },
    {
      "epoch": 0.23346303501945526,
      "grad_norm": 1.1589274406433105,
      "learning_rate": 9.689102948045397e-05,
      "loss": 7.9438,
      "step": 60
    },
    {
      "epoch": 0.2723735408560311,
      "grad_norm": 1.0185976028442383,
      "learning_rate": 9.574332325084563e-05,
      "loss": 7.8356,
      "step": 70
    },
    {
      "epoch": 0.311284046692607,
      "grad_norm": 0.9653449058532715,
      "learning_rate": 9.442478628361098e-05,
      "loss": 7.6501,
      "step": 80
    },
    {
      "epoch": 0.35019455252918286,
      "grad_norm": 0.959281861782074,
      "learning_rate": 9.294034272126287e-05,
      "loss": 7.6496,
      "step": 90
    },
    {
      "epoch": 0.38910505836575876,
      "grad_norm": 0.9253648519515991,
      "learning_rate": 9.129553629282448e-05,
      "loss": 7.421,
      "step": 100
    },
    {
      "epoch": 0.4280155642023346,
      "grad_norm": 0.9299134016036987,
      "learning_rate": 8.949650961049478e-05,
      "loss": 7.5062,
      "step": 110
    },
    {
      "epoch": 0.4669260700389105,
      "grad_norm": 0.9562272429466248,
      "learning_rate": 8.754998122975489e-05,
      "loss": 7.43,
      "step": 120
    },
    {
      "epoch": 0.5058365758754864,
      "grad_norm": 0.9745796322822571,
      "learning_rate": 8.546322055858526e-05,
      "loss": 7.3152,
      "step": 130
    },
    {
      "epoch": 0.5447470817120622,
      "grad_norm": 1.0300663709640503,
      "learning_rate": 8.324402070949658e-05,
      "loss": 7.3288,
      "step": 140
    },
    {
      "epoch": 0.5836575875486382,
      "grad_norm": 0.8551069498062134,
      "learning_rate": 8.09006693957597e-05,
      "loss": 7.288,
      "step": 150
    },
    {
      "epoch": 0.622568093385214,
      "grad_norm": 0.9526028037071228,
      "learning_rate": 7.844191798052438e-05,
      "loss": 7.3128,
      "step": 160
    },
    {
      "epoch": 0.6614785992217899,
      "grad_norm": 0.8692190647125244,
      "learning_rate": 7.587694879441401e-05,
      "loss": 7.0454,
      "step": 170
    },
    {
      "epoch": 0.7003891050583657,
      "grad_norm": 0.7848012447357178,
      "learning_rate": 7.321534084365102e-05,
      "loss": 7.0277,
      "step": 180
    },
    {
      "epoch": 0.7392996108949417,
      "grad_norm": 0.7873611450195312,
      "learning_rate": 7.046703403677695e-05,
      "loss": 6.9448,
      "step": 190
    },
    {
      "epoch": 0.7782101167315175,
      "grad_norm": 0.8347607254981995,
      "learning_rate": 6.764229206356498e-05,
      "loss": 7.0677,
      "step": 200
    },
    {
      "epoch": 0.8171206225680934,
      "grad_norm": 0.7865148186683655,
      "learning_rate": 6.475166406475515e-05,
      "loss": 6.9666,
      "step": 210
    },
    {
      "epoch": 0.8560311284046692,
      "grad_norm": 0.8299317359924316,
      "learning_rate": 6.180594523575838e-05,
      "loss": 7.1694,
      "step": 220
    },
    {
      "epoch": 0.8949416342412452,
      "grad_norm": 0.8158650994300842,
      "learning_rate": 5.881613651145732e-05,
      "loss": 6.7787,
      "step": 230
    },
    {
      "epoch": 0.933852140077821,
      "grad_norm": 0.8030637502670288,
      "learning_rate": 5.579340348266251e-05,
      "loss": 6.7451,
      "step": 240
    },
    {
      "epoch": 0.9727626459143969,
      "grad_norm": 0.8576043844223022,
      "learning_rate": 5.27490346976529e-05,
      "loss": 6.7098,
      "step": 250
    },
    {
      "epoch": 1.0116731517509727,
      "grad_norm": 1.007199764251709,
      "learning_rate": 4.969439950452543e-05,
      "loss": 6.3174,
      "step": 260
    },
    {
      "epoch": 1.0505836575875487,
      "grad_norm": 0.7966341972351074,
      "learning_rate": 4.664090559179367e-05,
      "loss": 5.085,
      "step": 270
    },
    {
      "epoch": 1.0894941634241244,
      "grad_norm": 0.8274776339530945,
      "learning_rate": 4.359995638580226e-05,
      "loss": 5.2567,
      "step": 280
    },
    {
      "epoch": 1.1284046692607004,
      "grad_norm": 0.8015701770782471,
      "learning_rate": 4.0582908464058556e-05,
      "loss": 5.1255,
      "step": 290
    },
    {
      "epoch": 1.1673151750972763,
      "grad_norm": 0.7696110010147095,
      "learning_rate": 3.7601029143523764e-05,
      "loss": 5.1827,
      "step": 300
    },
    {
      "epoch": 1.206225680933852,
      "grad_norm": 0.7955564260482788,
      "learning_rate": 3.466545440225193e-05,
      "loss": 5.1197,
      "step": 310
    },
    {
      "epoch": 1.245136186770428,
      "grad_norm": 0.7841880917549133,
      "learning_rate": 3.1787147291520674e-05,
      "loss": 5.1158,
      "step": 320
    },
    {
      "epoch": 1.2840466926070038,
      "grad_norm": 0.7522294521331787,
      "learning_rate": 2.8976856993765766e-05,
      "loss": 5.1173,
      "step": 330
    },
    {
      "epoch": 1.3229571984435797,
      "grad_norm": 0.7152595520019531,
      "learning_rate": 2.6245078679219505e-05,
      "loss": 5.0291,
      "step": 340
    },
    {
      "epoch": 1.3618677042801557,
      "grad_norm": 0.697422206401825,
      "learning_rate": 2.3602014311170523e-05,
      "loss": 4.9941,
      "step": 350
    },
    {
      "epoch": 1.4007782101167314,
      "grad_norm": 0.7435976266860962,
      "learning_rate": 2.1057534546219658e-05,
      "loss": 5.1369,
      "step": 360
    },
    {
      "epoch": 1.4396887159533074,
      "grad_norm": 0.8627301454544067,
      "learning_rate": 1.862114187181705e-05,
      "loss": 4.9706,
      "step": 370
    },
    {
      "epoch": 1.4785992217898833,
      "grad_norm": 0.7423489093780518,
      "learning_rate": 1.6301935118745826e-05,
      "loss": 4.9751,
      "step": 380
    },
    {
      "epoch": 1.517509727626459,
      "grad_norm": 0.7178505659103394,
      "learning_rate": 1.4108575481081521e-05,
      "loss": 4.8353,
      "step": 390
    },
    {
      "epoch": 1.556420233463035,
      "grad_norm": 0.7140501737594604,
      "learning_rate": 1.2049254170527857e-05,
      "loss": 4.9182,
      "step": 400
    },
    {
      "epoch": 1.595330739299611,
      "grad_norm": 0.7342681884765625,
      "learning_rate": 1.013166182592551e-05,
      "loss": 4.8421,
      "step": 410
    },
    {
      "epoch": 1.6342412451361867,
      "grad_norm": 0.716266930103302,
      "learning_rate": 8.36295979217494e-06,
      "loss": 4.8006,
      "step": 420
    },
    {
      "epoch": 1.6731517509727627,
      "grad_norm": 0.7294363975524902,
      "learning_rate": 6.7497533758344665e-06,
      "loss": 4.7904,
      "step": 430
    },
    {
      "epoch": 1.7120622568093387,
      "grad_norm": 0.6808627843856812,
      "learning_rate": 5.298067177271143e-06,
      "loss": 4.7973,
      "step": 440
    },
    {
      "epoch": 1.7509727626459144,
      "grad_norm": 0.7073158025741577,
      "learning_rate": 4.01332259148815e-06,
      "loss": 4.7772,
      "step": 450
    },
    {
      "epoch": 1.7898832684824901,
      "grad_norm": 0.6910358667373657,
      "learning_rate": 2.9003175616530265e-06,
      "loss": 4.9969,
      "step": 460
    },
    {
      "epoch": 1.8287937743190663,
      "grad_norm": 0.688657283782959,
      "learning_rate": 1.963208660937904e-06,
      "loss": 4.9254,
      "step": 470
    },
    {
      "epoch": 1.867704280155642,
      "grad_norm": 0.6506223678588867,
      "learning_rate": 1.205495569588283e-06,
      "loss": 4.9407,
      "step": 480
    },
    {
      "epoch": 1.9066147859922178,
      "grad_norm": 0.6927580237388611,
      "learning_rate": 6.300080051914791e-07,
      "loss": 4.8365,
      "step": 490
    },
    {
      "epoch": 1.9455252918287937,
      "grad_norm": 0.682306170463562,
      "learning_rate": 2.3889515495413296e-07,
      "loss": 4.8349,
      "step": 500
    },
    {
      "epoch": 1.9844357976653697,
      "grad_norm": 0.6962385177612305,
      "learning_rate": 3.361764945473134e-08,
      "loss": 4.9875,
      "step": 510
    },
    {
      "epoch": 2.0,
      "step": 514,
      "total_flos": 3.8982891094409216e+17,
      "train_loss": 6.394122821347723,
      "train_runtime": 7794.0822,
      "train_samples_per_second": 4.219,
      "train_steps_per_second": 0.066
    }
  ],
  "logging_steps": 10,
  "max_steps": 514,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 15000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.8982891094409216e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}