{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 645, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.046511627906976744, "grad_norm": 2.4341416358947754, "learning_rate": 4.1538461538461545e-06, "loss": 0.3952, "step": 10 }, { "epoch": 0.09302325581395349, "grad_norm": 1.8208298683166504, "learning_rate": 8.76923076923077e-06, "loss": 0.2555, "step": 20 }, { "epoch": 0.13953488372093023, "grad_norm": 1.736371636390686, "learning_rate": 1.3384615384615386e-05, "loss": 0.2123, "step": 30 }, { "epoch": 0.18604651162790697, "grad_norm": 1.3780733346939087, "learning_rate": 1.8e-05, "loss": 0.1893, "step": 40 }, { "epoch": 0.23255813953488372, "grad_norm": 1.1913007497787476, "learning_rate": 2.2615384615384615e-05, "loss": 0.1759, "step": 50 }, { "epoch": 0.27906976744186046, "grad_norm": 0.8563091158866882, "learning_rate": 2.7230769230769233e-05, "loss": 0.1704, "step": 60 }, { "epoch": 0.32558139534883723, "grad_norm": 1.2382513284683228, "learning_rate": 2.9996479470277262e-05, "loss": 0.1648, "step": 70 }, { "epoch": 0.37209302325581395, "grad_norm": 0.8481675982475281, "learning_rate": 2.9956892486957502e-05, "loss": 0.1654, "step": 80 }, { "epoch": 0.4186046511627907, "grad_norm": 1.4965696334838867, "learning_rate": 2.9873434360934543e-05, "loss": 0.1613, "step": 90 }, { "epoch": 0.46511627906976744, "grad_norm": 0.8099003434181213, "learning_rate": 2.9746349889271652e-05, "loss": 0.1554, "step": 100 }, { "epoch": 0.5116279069767442, "grad_norm": 0.8638372421264648, "learning_rate": 2.9576011832620583e-05, "loss": 0.1464, "step": 110 }, { "epoch": 0.5581395348837209, "grad_norm": 1.090198040008545, "learning_rate": 2.9362919821850365e-05, "loss": 0.1484, "step": 120 }, { "epoch": 0.6046511627906976, "grad_norm": 0.7675595283508301, "learning_rate": 2.9107698892543862e-05, "loss": 0.1359, "step": 130 }, { "epoch": 0.6511627906976745, "grad_norm": 0.6692689657211304, "learning_rate": 2.8811097651660716e-05, "loss": 0.1334, "step": 140 }, { "epoch": 0.6976744186046512, "grad_norm": 0.7122989892959595, "learning_rate": 2.847398608174417e-05, "loss": 0.1408, "step": 150 }, { "epoch": 0.7441860465116279, "grad_norm": 0.7892124056816101, "learning_rate": 2.8097352989112345e-05, "loss": 0.1348, "step": 160 }, { "epoch": 0.7906976744186046, "grad_norm": 0.6389254927635193, "learning_rate": 2.768230310351898e-05, "loss": 0.133, "step": 170 }, { "epoch": 0.8372093023255814, "grad_norm": 0.9744543433189392, "learning_rate": 2.7230053837790673e-05, "loss": 0.1416, "step": 180 }, { "epoch": 0.8837209302325582, "grad_norm": 0.638029932975769, "learning_rate": 2.6741931716945336e-05, "loss": 0.1281, "step": 190 }, { "epoch": 0.9302325581395349, "grad_norm": 0.6276944279670715, "learning_rate": 2.6219368487265756e-05, "loss": 0.1322, "step": 200 }, { "epoch": 0.9767441860465116, "grad_norm": 0.5903041958808899, "learning_rate": 2.5663896916741064e-05, "loss": 0.1242, "step": 210 }, { "epoch": 1.0232558139534884, "grad_norm": 0.47011125087738037, "learning_rate": 2.5077146299194094e-05, "loss": 0.1253, "step": 220 }, { "epoch": 1.069767441860465, "grad_norm": 0.6500623226165771, "learning_rate": 2.446083767528193e-05, "loss": 0.1245, "step": 230 }, { "epoch": 1.1162790697674418, "grad_norm": 0.5090748071670532, "learning_rate": 2.3816778784387097e-05, "loss": 0.1218, "step": 240 }, { "epoch": 1.1627906976744187, "grad_norm": 0.547989010810852, "learning_rate": 2.3146858762206493e-05, "loss": 0.1201, "step": 250 }, { "epoch": 1.2093023255813953, "grad_norm": 0.6994646787643433, "learning_rate": 2.2453042599590884e-05, "loss": 0.118, "step": 260 }, { "epoch": 1.255813953488372, "grad_norm": 0.6814610362052917, "learning_rate": 2.173736537888819e-05, "loss": 0.121, "step": 270 }, { "epoch": 1.302325581395349, "grad_norm": 0.5295448303222656, "learning_rate": 2.10019263046963e-05, "loss": 0.1194, "step": 280 }, { "epoch": 1.3488372093023255, "grad_norm": 0.6496727466583252, "learning_rate": 2.0248882546534327e-05, "loss": 0.1142, "step": 290 }, { "epoch": 1.3953488372093024, "grad_norm": 0.7958609461784363, "learning_rate": 1.9480442911492706e-05, "loss": 0.1137, "step": 300 }, { "epoch": 1.441860465116279, "grad_norm": 0.5419607758522034, "learning_rate": 1.8698861365421433e-05, "loss": 0.1148, "step": 310 }, { "epoch": 1.4883720930232558, "grad_norm": 0.5299695134162903, "learning_rate": 1.7906430421659876e-05, "loss": 0.119, "step": 320 }, { "epoch": 1.5348837209302326, "grad_norm": 0.5715431571006775, "learning_rate": 1.7105474416700165e-05, "loss": 0.114, "step": 330 }, { "epoch": 1.5813953488372094, "grad_norm": 0.5162650346755981, "learning_rate": 1.6298342692507765e-05, "loss": 0.1132, "step": 340 }, { "epoch": 1.627906976744186, "grad_norm": 0.42877525091171265, "learning_rate": 1.548740270549671e-05, "loss": 0.1131, "step": 350 }, { "epoch": 1.6744186046511627, "grad_norm": 0.41223767399787903, "learning_rate": 1.467503308237204e-05, "loss": 0.1104, "step": 360 }, { "epoch": 1.7209302325581395, "grad_norm": 0.44639158248901367, "learning_rate": 1.3863616643207844e-05, "loss": 0.1091, "step": 370 }, { "epoch": 1.7674418604651163, "grad_norm": 0.4975012242794037, "learning_rate": 1.3055533412225422e-05, "loss": 0.1123, "step": 380 }, { "epoch": 1.8139534883720931, "grad_norm": 0.550603985786438, "learning_rate": 1.2253153636772158e-05, "loss": 0.1109, "step": 390 }, { "epoch": 1.8604651162790697, "grad_norm": 0.4819272458553314, "learning_rate": 1.1458830834977698e-05, "loss": 0.1129, "step": 400 }, { "epoch": 1.9069767441860463, "grad_norm": 0.34267139434814453, "learning_rate": 1.067489489247974e-05, "loss": 0.1085, "step": 410 }, { "epoch": 1.9534883720930232, "grad_norm": 0.48774823546409607, "learning_rate": 9.903645228468024e-06, "loss": 0.1098, "step": 420 }, { "epoch": 2.0, "grad_norm": 0.5222486257553101, "learning_rate": 9.147344051091682e-06, "loss": 0.1076, "step": 430 }, { "epoch": 2.046511627906977, "grad_norm": 0.45477646589279175, "learning_rate": 8.408209722012956e-06, "loss": 0.1044, "step": 440 }, { "epoch": 2.0930232558139537, "grad_norm": 0.4191150963306427, "learning_rate": 7.688410249570214e-06, "loss": 0.1051, "step": 450 }, { "epoch": 2.13953488372093, "grad_norm": 0.5323087573051453, "learning_rate": 6.990056929635958e-06, "loss": 0.1043, "step": 460 }, { "epoch": 2.186046511627907, "grad_norm": 0.3530571162700653, "learning_rate": 6.315198152822273e-06, "loss": 0.1017, "step": 470 }, { "epoch": 2.2325581395348837, "grad_norm": 0.40887731313705444, "learning_rate": 5.66581339619819e-06, "loss": 0.0984, "step": 480 }, { "epoch": 2.2790697674418605, "grad_norm": 0.51576828956604, "learning_rate": 5.043807417142436e-06, "loss": 0.1015, "step": 490 }, { "epoch": 2.3255813953488373, "grad_norm": 0.47971853613853455, "learning_rate": 4.4510046663618e-06, "loss": 0.1008, "step": 500 }, { "epoch": 2.3720930232558137, "grad_norm": 0.38621294498443604, "learning_rate": 3.889143936462915e-06, "loss": 0.1008, "step": 510 }, { "epoch": 2.4186046511627906, "grad_norm": 0.48758363723754883, "learning_rate": 3.359873261773904e-06, "loss": 0.1018, "step": 520 }, { "epoch": 2.4651162790697674, "grad_norm": 0.47174710035324097, "learning_rate": 2.86474508437579e-06, "loss": 0.101, "step": 530 }, { "epoch": 2.511627906976744, "grad_norm": 0.43842917680740356, "learning_rate": 2.4052117005223457e-06, "loss": 0.1045, "step": 540 }, { "epoch": 2.558139534883721, "grad_norm": 0.3704339265823364, "learning_rate": 1.982621000804979e-06, "loss": 0.0947, "step": 550 }, { "epoch": 2.604651162790698, "grad_norm": 0.44818997383117676, "learning_rate": 1.5982125165573941e-06, "loss": 0.1011, "step": 560 }, { "epoch": 2.6511627906976747, "grad_norm": 0.3882347345352173, "learning_rate": 1.25311378409661e-06, "loss": 0.1008, "step": 570 }, { "epoch": 2.697674418604651, "grad_norm": 0.3568360507488251, "learning_rate": 9.483370374646661e-07, "loss": 0.0998, "step": 580 }, { "epoch": 2.744186046511628, "grad_norm": 0.45614829659461975, "learning_rate": 6.847762393717782e-07, "loss": 0.0946, "step": 590 }, { "epoch": 2.7906976744186047, "grad_norm": 0.40172144770622253, "learning_rate": 4.632044590496948e-07, "loss": 0.0971, "step": 600 }, { "epoch": 2.8372093023255816, "grad_norm": 0.5011858344078064, "learning_rate": 2.8427160470641255e-07, "loss": 0.0982, "step": 610 }, { "epoch": 2.883720930232558, "grad_norm": 0.4626616835594177, "learning_rate": 1.4850251723345198e-07, "loss": 0.1001, "step": 620 }, { "epoch": 2.9302325581395348, "grad_norm": 0.4319652020931244, "learning_rate": 5.629543075708177e-08, "loss": 0.0938, "step": 630 }, { "epoch": 2.9767441860465116, "grad_norm": 0.4647758901119232, "learning_rate": 7.920804549007011e-09, "loss": 0.0945, "step": 640 } ], "logging_steps": 10, "max_steps": 645, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 188525150076928.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }