{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 645, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.046511627906976744, "grad_norm": 1.7476660013198853, "learning_rate": 4.1538461538461545e-06, "loss": 0.4053, "step": 10 }, { "epoch": 0.09302325581395349, "grad_norm": 1.2797056436538696, "learning_rate": 8.76923076923077e-06, "loss": 0.2451, "step": 20 }, { "epoch": 0.13953488372093023, "grad_norm": 0.9715517163276672, "learning_rate": 1.3384615384615386e-05, "loss": 0.2026, "step": 30 }, { "epoch": 0.18604651162790697, "grad_norm": 0.8870955109596252, "learning_rate": 1.8e-05, "loss": 0.1818, "step": 40 }, { "epoch": 0.23255813953488372, "grad_norm": 0.706699550151825, "learning_rate": 2.2615384615384615e-05, "loss": 0.1687, "step": 50 }, { "epoch": 0.27906976744186046, "grad_norm": 0.6627687811851501, "learning_rate": 2.7230769230769233e-05, "loss": 0.1621, "step": 60 }, { "epoch": 0.32558139534883723, "grad_norm": 0.5161024928092957, "learning_rate": 2.9996479470277262e-05, "loss": 0.1597, "step": 70 }, { "epoch": 0.37209302325581395, "grad_norm": 0.45436057448387146, "learning_rate": 2.9956892486957502e-05, "loss": 0.1527, "step": 80 }, { "epoch": 0.4186046511627907, "grad_norm": 0.7213302254676819, "learning_rate": 2.9873434360934543e-05, "loss": 0.1536, "step": 90 }, { "epoch": 0.46511627906976744, "grad_norm": 0.41891512274742126, "learning_rate": 2.9746349889271652e-05, "loss": 0.1454, "step": 100 }, { "epoch": 0.5116279069767442, "grad_norm": 0.40932586789131165, "learning_rate": 2.9576011832620583e-05, "loss": 0.1384, "step": 110 }, { "epoch": 0.5581395348837209, "grad_norm": 0.6326035857200623, "learning_rate": 2.9362919821850365e-05, "loss": 0.1401, "step": 120 }, { "epoch": 0.6046511627906976, "grad_norm": 0.47325533628463745, "learning_rate": 2.9107698892543862e-05, "loss": 0.1297, "step": 130 }, { "epoch": 0.6511627906976745, "grad_norm": 0.4126339852809906, "learning_rate": 2.8811097651660716e-05, "loss": 0.1288, "step": 140 }, { "epoch": 0.6976744186046512, "grad_norm": 0.4939899742603302, "learning_rate": 2.847398608174417e-05, "loss": 0.1339, "step": 150 }, { "epoch": 0.7441860465116279, "grad_norm": 0.38328367471694946, "learning_rate": 2.8097352989112345e-05, "loss": 0.1293, "step": 160 }, { "epoch": 0.7906976744186046, "grad_norm": 0.36505013704299927, "learning_rate": 2.768230310351898e-05, "loss": 0.1272, "step": 170 }, { "epoch": 0.8372093023255814, "grad_norm": 0.40006181597709656, "learning_rate": 2.7230053837790673e-05, "loss": 0.1285, "step": 180 }, { "epoch": 0.8837209302325582, "grad_norm": 0.4462541341781616, "learning_rate": 2.6741931716945336e-05, "loss": 0.1236, "step": 190 }, { "epoch": 0.9302325581395349, "grad_norm": 0.34532684087753296, "learning_rate": 2.6219368487265756e-05, "loss": 0.1274, "step": 200 }, { "epoch": 0.9767441860465116, "grad_norm": 0.4481622576713562, "learning_rate": 2.5663896916741064e-05, "loss": 0.1213, "step": 210 }, { "epoch": 1.0232558139534884, "grad_norm": 0.3308081328868866, "learning_rate": 2.5077146299194094e-05, "loss": 0.124, "step": 220 }, { "epoch": 1.069767441860465, "grad_norm": 0.3247145116329193, "learning_rate": 2.446083767528193e-05, "loss": 0.1223, "step": 230 }, { "epoch": 1.1162790697674418, "grad_norm": 0.31839317083358765, "learning_rate": 2.3816778784387097e-05, "loss": 0.1162, "step": 240 }, { "epoch": 1.1627906976744187, "grad_norm": 0.36601680517196655, "learning_rate": 2.3146858762206493e-05, "loss": 0.1166, "step": 250 }, { "epoch": 1.2093023255813953, "grad_norm": 0.39110299944877625, "learning_rate": 2.2453042599590884e-05, "loss": 0.1147, "step": 260 }, { "epoch": 1.255813953488372, "grad_norm": 0.40319550037384033, "learning_rate": 2.173736537888819e-05, "loss": 0.1137, "step": 270 }, { "epoch": 1.302325581395349, "grad_norm": 0.35869157314300537, "learning_rate": 2.10019263046963e-05, "loss": 0.1174, "step": 280 }, { "epoch": 1.3488372093023255, "grad_norm": 0.39020806550979614, "learning_rate": 2.0248882546534327e-05, "loss": 0.1141, "step": 290 }, { "epoch": 1.3953488372093024, "grad_norm": 0.39524269104003906, "learning_rate": 1.9480442911492706e-05, "loss": 0.1132, "step": 300 }, { "epoch": 1.441860465116279, "grad_norm": 0.3699205219745636, "learning_rate": 1.8698861365421433e-05, "loss": 0.1119, "step": 310 }, { "epoch": 1.4883720930232558, "grad_norm": 0.33726540207862854, "learning_rate": 1.7906430421659876e-05, "loss": 0.1166, "step": 320 }, { "epoch": 1.5348837209302326, "grad_norm": 0.41265928745269775, "learning_rate": 1.7105474416700165e-05, "loss": 0.1127, "step": 330 }, { "epoch": 1.5813953488372094, "grad_norm": 0.3320264220237732, "learning_rate": 1.6298342692507765e-05, "loss": 0.1115, "step": 340 }, { "epoch": 1.627906976744186, "grad_norm": 0.2614348828792572, "learning_rate": 1.548740270549671e-05, "loss": 0.111, "step": 350 }, { "epoch": 1.6744186046511627, "grad_norm": 0.29234352707862854, "learning_rate": 1.467503308237204e-05, "loss": 0.1087, "step": 360 }, { "epoch": 1.7209302325581395, "grad_norm": 0.2900184094905853, "learning_rate": 1.3863616643207844e-05, "loss": 0.1074, "step": 370 }, { "epoch": 1.7674418604651163, "grad_norm": 0.29722127318382263, "learning_rate": 1.3055533412225422e-05, "loss": 0.1104, "step": 380 }, { "epoch": 1.8139534883720931, "grad_norm": 0.38949820399284363, "learning_rate": 1.2253153636772158e-05, "loss": 0.1091, "step": 390 }, { "epoch": 1.8604651162790697, "grad_norm": 0.2845311164855957, "learning_rate": 1.1458830834977698e-05, "loss": 0.1111, "step": 400 }, { "epoch": 1.9069767441860463, "grad_norm": 0.24525171518325806, "learning_rate": 1.067489489247974e-05, "loss": 0.107, "step": 410 }, { "epoch": 1.9534883720930232, "grad_norm": 0.29745063185691833, "learning_rate": 9.903645228468024e-06, "loss": 0.1071, "step": 420 }, { "epoch": 2.0, "grad_norm": 0.2796909809112549, "learning_rate": 9.147344051091682e-06, "loss": 0.1059, "step": 430 }, { "epoch": 2.046511627906977, "grad_norm": 0.28045353293418884, "learning_rate": 8.408209722012956e-06, "loss": 0.1023, "step": 440 }, { "epoch": 2.0930232558139537, "grad_norm": 0.21314753592014313, "learning_rate": 7.688410249570214e-06, "loss": 0.1031, "step": 450 }, { "epoch": 2.13953488372093, "grad_norm": 0.3194475471973419, "learning_rate": 6.990056929635958e-06, "loss": 0.1023, "step": 460 }, { "epoch": 2.186046511627907, "grad_norm": 0.21530288457870483, "learning_rate": 6.315198152822273e-06, "loss": 0.0997, "step": 470 }, { "epoch": 2.2325581395348837, "grad_norm": 0.27216485142707825, "learning_rate": 5.66581339619819e-06, "loss": 0.0968, "step": 480 }, { "epoch": 2.2790697674418605, "grad_norm": 0.3025151193141937, "learning_rate": 5.043807417142436e-06, "loss": 0.0999, "step": 490 }, { "epoch": 2.3255813953488373, "grad_norm": 0.27557316422462463, "learning_rate": 4.4510046663618e-06, "loss": 0.0991, "step": 500 }, { "epoch": 2.3720930232558137, "grad_norm": 0.2208959013223648, "learning_rate": 3.889143936462915e-06, "loss": 0.0985, "step": 510 }, { "epoch": 2.4186046511627906, "grad_norm": 0.31590473651885986, "learning_rate": 3.359873261773904e-06, "loss": 0.0994, "step": 520 }, { "epoch": 2.4651162790697674, "grad_norm": 0.28370803594589233, "learning_rate": 2.86474508437579e-06, "loss": 0.0989, "step": 530 }, { "epoch": 2.511627906976744, "grad_norm": 0.2663387358188629, "learning_rate": 2.4052117005223457e-06, "loss": 0.1022, "step": 540 }, { "epoch": 2.558139534883721, "grad_norm": 0.2275928556919098, "learning_rate": 1.982621000804979e-06, "loss": 0.0926, "step": 550 }, { "epoch": 2.604651162790698, "grad_norm": 0.2721380889415741, "learning_rate": 1.5982125165573941e-06, "loss": 0.0991, "step": 560 }, { "epoch": 2.6511627906976747, "grad_norm": 0.24267996847629547, "learning_rate": 1.25311378409661e-06, "loss": 0.0991, "step": 570 }, { "epoch": 2.697674418604651, "grad_norm": 0.22184228897094727, "learning_rate": 9.483370374646661e-07, "loss": 0.0975, "step": 580 }, { "epoch": 2.744186046511628, "grad_norm": 0.28250575065612793, "learning_rate": 6.847762393717782e-07, "loss": 0.0926, "step": 590 }, { "epoch": 2.7906976744186047, "grad_norm": 0.28555381298065186, "learning_rate": 4.632044590496948e-07, "loss": 0.0956, "step": 600 }, { "epoch": 2.8372093023255816, "grad_norm": 0.29760462045669556, "learning_rate": 2.8427160470641255e-07, "loss": 0.0966, "step": 610 }, { "epoch": 2.883720930232558, "grad_norm": 0.28198060393333435, "learning_rate": 1.4850251723345198e-07, "loss": 0.0981, "step": 620 }, { "epoch": 2.9302325581395348, "grad_norm": 0.27139362692832947, "learning_rate": 5.629543075708177e-08, "loss": 0.0919, "step": 630 }, { "epoch": 2.9767441860465116, "grad_norm": 0.2770179808139801, "learning_rate": 7.920804549007011e-09, "loss": 0.0927, "step": 640 } ], "logging_steps": 10, "max_steps": 645, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 475511974002688.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }