{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9910313901345291, "eval_steps": 500, "global_step": 444, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02242152466367713, "grad_norm": 2.390625, "learning_rate": 2.123791785182557e-05, "loss": 0.9148, "step": 5 }, { "epoch": 0.04484304932735426, "grad_norm": 1.3828125, "learning_rate": 4.778531516660753e-05, "loss": 0.7159, "step": 10 }, { "epoch": 0.06726457399103139, "grad_norm": 1.1875, "learning_rate": 7.433271248138951e-05, "loss": 0.6679, "step": 15 }, { "epoch": 0.08968609865470852, "grad_norm": 1.0078125, "learning_rate": 0.00010088010979617146, "loss": 0.6512, "step": 20 }, { "epoch": 0.11210762331838565, "grad_norm": 1.046875, "learning_rate": 0.00012742750711095343, "loss": 0.6596, "step": 25 }, { "epoch": 0.13452914798206278, "grad_norm": 1.015625, "learning_rate": 0.0001539749044257354, "loss": 0.6348, "step": 30 }, { "epoch": 0.15695067264573992, "grad_norm": 1.1015625, "learning_rate": 0.00018052230174051736, "loss": 0.6352, "step": 35 }, { "epoch": 0.17937219730941703, "grad_norm": 1.046875, "learning_rate": 0.00018581809294315903, "loss": 0.6422, "step": 40 }, { "epoch": 0.20179372197309417, "grad_norm": 0.8125, "learning_rate": 0.0001857624936017134, "loss": 0.6619, "step": 45 }, { "epoch": 0.2242152466367713, "grad_norm": 0.84375, "learning_rate": 0.0001856641617644384, "loss": 0.6454, "step": 50 }, { "epoch": 0.24663677130044842, "grad_norm": 0.66015625, "learning_rate": 0.00018552315778910269, "loss": 0.6523, "step": 55 }, { "epoch": 0.26905829596412556, "grad_norm": 0.69140625, "learning_rate": 0.0001853395682263653, "loss": 0.6485, "step": 60 }, { "epoch": 0.2914798206278027, "grad_norm": 0.9453125, "learning_rate": 0.0001851135057666494, "loss": 0.6452, "step": 65 }, { "epoch": 0.31390134529147984, "grad_norm": 1.2109375, "learning_rate": 0.00018484510917097098, "loss": 0.6878, "step": 70 }, { "epoch": 0.336322869955157, "grad_norm": 0.84765625, "learning_rate": 0.00018453454318576493, "loss": 0.6585, "step": 75 }, { "epoch": 0.35874439461883406, "grad_norm": 0.62890625, "learning_rate": 0.00018418199844176086, "loss": 0.6447, "step": 80 }, { "epoch": 0.3811659192825112, "grad_norm": 0.67578125, "learning_rate": 0.00018378769133697064, "loss": 0.618, "step": 85 }, { "epoch": 0.40358744394618834, "grad_norm": 0.6171875, "learning_rate": 0.00018335186390385983, "loss": 0.6242, "step": 90 }, { "epoch": 0.4260089686098655, "grad_norm": 0.62109375, "learning_rate": 0.00018287478366078356, "loss": 0.62, "step": 95 }, { "epoch": 0.4484304932735426, "grad_norm": 0.5234375, "learning_rate": 0.0001823567434477796, "loss": 0.6234, "step": 100 }, { "epoch": 0.47085201793721976, "grad_norm": 0.546875, "learning_rate": 0.00018179806124681794, "loss": 0.6088, "step": 105 }, { "epoch": 0.49327354260089684, "grad_norm": 0.5078125, "learning_rate": 0.00018119907998661786, "loss": 0.6069, "step": 110 }, { "epoch": 0.515695067264574, "grad_norm": 0.51171875, "learning_rate": 0.00018056016733215274, "loss": 0.6037, "step": 115 }, { "epoch": 0.5381165919282511, "grad_norm": 0.50390625, "learning_rate": 0.00017988171545897072, "loss": 0.5977, "step": 120 }, { "epoch": 0.5605381165919282, "grad_norm": 0.5234375, "learning_rate": 0.00017916414081247065, "loss": 0.5903, "step": 125 }, { "epoch": 0.5829596412556054, "grad_norm": 0.4765625, "learning_rate": 0.0001784078838522809, "loss": 0.6063, "step": 130 }, { "epoch": 0.6053811659192825, "grad_norm": 0.58203125, "learning_rate": 0.0001776134087818975, "loss": 0.5831, "step": 135 }, { "epoch": 0.6278026905829597, "grad_norm": 0.5546875, "learning_rate": 0.00017678120326374824, "loss": 0.6045, "step": 140 }, { "epoch": 0.6502242152466368, "grad_norm": 0.478515625, "learning_rate": 0.00017591177811985704, "loss": 0.593, "step": 145 }, { "epoch": 0.672645739910314, "grad_norm": 0.55859375, "learning_rate": 0.00017500566701829254, "loss": 0.5943, "step": 150 }, { "epoch": 0.695067264573991, "grad_norm": 0.51953125, "learning_rate": 0.00017406342614559366, "loss": 0.5716, "step": 155 }, { "epoch": 0.7174887892376681, "grad_norm": 0.466796875, "learning_rate": 0.00017308563386537258, "loss": 0.59, "step": 160 }, { "epoch": 0.7399103139013453, "grad_norm": 0.427734375, "learning_rate": 0.00017207289036330536, "loss": 0.5801, "step": 165 }, { "epoch": 0.7623318385650224, "grad_norm": 0.48828125, "learning_rate": 0.00017102581727872763, "loss": 0.5918, "step": 170 }, { "epoch": 0.7847533632286996, "grad_norm": 0.5078125, "learning_rate": 0.0001699450573230617, "loss": 0.5793, "step": 175 }, { "epoch": 0.8071748878923767, "grad_norm": 0.51953125, "learning_rate": 0.00016883127388530927, "loss": 0.5737, "step": 180 }, { "epoch": 0.8295964125560538, "grad_norm": 0.5546875, "learning_rate": 0.00016768515062485188, "loss": 0.5792, "step": 185 }, { "epoch": 0.852017937219731, "grad_norm": 0.48046875, "learning_rate": 0.000166507391051809, "loss": 0.5656, "step": 190 }, { "epoch": 0.874439461883408, "grad_norm": 0.48046875, "learning_rate": 0.0001652987180952116, "loss": 0.5625, "step": 195 }, { "epoch": 0.8968609865470852, "grad_norm": 0.54296875, "learning_rate": 0.00016405987365925574, "loss": 0.5929, "step": 200 }, { "epoch": 0.9192825112107623, "grad_norm": 0.51953125, "learning_rate": 0.00016279161816790908, "loss": 0.5732, "step": 205 }, { "epoch": 0.9417040358744395, "grad_norm": 0.5625, "learning_rate": 0.00016149473009814973, "loss": 0.5595, "step": 210 }, { "epoch": 0.9641255605381166, "grad_norm": 0.490234375, "learning_rate": 0.00016017000550212353, "loss": 0.5662, "step": 215 }, { "epoch": 0.9865470852017937, "grad_norm": 0.4765625, "learning_rate": 0.0001588182575185139, "loss": 0.5771, "step": 220 }, { "epoch": 0.9955156950672646, "eval_loss": 0.5749066472053528, "eval_runtime": 2.1723, "eval_samples_per_second": 19.335, "eval_steps_per_second": 19.335, "step": 222 }, { "epoch": 1.0089686098654709, "grad_norm": 0.640625, "learning_rate": 0.0001574403158734228, "loss": 0.5053, "step": 225 }, { "epoch": 1.031390134529148, "grad_norm": 0.55078125, "learning_rate": 0.0001560370263710713, "loss": 0.4183, "step": 230 }, { "epoch": 1.053811659192825, "grad_norm": 0.5703125, "learning_rate": 0.00015460925037463, "loss": 0.4084, "step": 235 }, { "epoch": 1.0762331838565022, "grad_norm": 0.51171875, "learning_rate": 0.00015315786427749957, "loss": 0.4123, "step": 240 }, { "epoch": 1.0986547085201794, "grad_norm": 0.498046875, "learning_rate": 0.0001516837589653656, "loss": 0.4045, "step": 245 }, { "epoch": 1.1210762331838564, "grad_norm": 0.4921875, "learning_rate": 0.00015018783926935766, "loss": 0.4039, "step": 250 }, { "epoch": 1.1434977578475336, "grad_norm": 0.458984375, "learning_rate": 0.00014867102341064824, "loss": 0.4171, "step": 255 }, { "epoch": 1.1659192825112108, "grad_norm": 0.49609375, "learning_rate": 0.00014713424243683308, "loss": 0.4107, "step": 260 }, { "epoch": 1.188340807174888, "grad_norm": 0.458984375, "learning_rate": 0.00014557843965043807, "loss": 0.4092, "step": 265 }, { "epoch": 1.210762331838565, "grad_norm": 0.486328125, "learning_rate": 0.000144004570029904, "loss": 0.4068, "step": 270 }, { "epoch": 1.2331838565022422, "grad_norm": 0.45703125, "learning_rate": 0.00014241359964340427, "loss": 0.4143, "step": 275 }, { "epoch": 1.2556053811659194, "grad_norm": 0.453125, "learning_rate": 0.00014080650505585602, "loss": 0.4233, "step": 280 }, { "epoch": 1.2780269058295963, "grad_norm": 0.4765625, "learning_rate": 0.00013918427272948742, "loss": 0.4154, "step": 285 }, { "epoch": 1.3004484304932735, "grad_norm": 0.423828125, "learning_rate": 0.00013754789841833055, "loss": 0.415, "step": 290 }, { "epoch": 1.3228699551569507, "grad_norm": 0.4453125, "learning_rate": 0.00013589838655701027, "loss": 0.4167, "step": 295 }, { "epoch": 1.3452914798206277, "grad_norm": 0.44140625, "learning_rate": 0.00013423674964420517, "loss": 0.4098, "step": 300 }, { "epoch": 1.3677130044843049, "grad_norm": 0.458984375, "learning_rate": 0.00013256400762115845, "loss": 0.4156, "step": 305 }, { "epoch": 1.390134529147982, "grad_norm": 0.470703125, "learning_rate": 0.00013088118724562047, "loss": 0.4148, "step": 310 }, { "epoch": 1.4125560538116593, "grad_norm": 0.453125, "learning_rate": 0.00012918932146160724, "loss": 0.416, "step": 315 }, { "epoch": 1.4349775784753362, "grad_norm": 0.474609375, "learning_rate": 0.0001274894487653617, "loss": 0.4152, "step": 320 }, { "epoch": 1.4573991031390134, "grad_norm": 0.44140625, "learning_rate": 0.00012578261256790684, "loss": 0.4252, "step": 325 }, { "epoch": 1.4798206278026906, "grad_norm": 0.443359375, "learning_rate": 0.00012406986055458216, "loss": 0.4119, "step": 330 }, { "epoch": 1.5022421524663678, "grad_norm": 0.427734375, "learning_rate": 0.0001223522440419565, "loss": 0.4106, "step": 335 }, { "epoch": 1.5246636771300448, "grad_norm": 0.423828125, "learning_rate": 0.00012063081733251175, "loss": 0.412, "step": 340 }, { "epoch": 1.547085201793722, "grad_norm": 0.423828125, "learning_rate": 0.00011890663706749399, "loss": 0.4008, "step": 345 }, { "epoch": 1.5695067264573992, "grad_norm": 0.44140625, "learning_rate": 0.00011718076157832895, "loss": 0.4067, "step": 350 }, { "epoch": 1.5919282511210762, "grad_norm": 0.470703125, "learning_rate": 0.00011545425023700002, "loss": 0.4087, "step": 355 }, { "epoch": 1.6143497757847534, "grad_norm": 0.4296875, "learning_rate": 0.00011372816280578757, "loss": 0.4006, "step": 360 }, { "epoch": 1.6367713004484306, "grad_norm": 0.412109375, "learning_rate": 0.00011200355878676867, "loss": 0.4059, "step": 365 }, { "epoch": 1.6591928251121075, "grad_norm": 0.43359375, "learning_rate": 0.0001102814967714765, "loss": 0.4132, "step": 370 }, { "epoch": 1.6816143497757847, "grad_norm": 0.4453125, "learning_rate": 0.0001085630337911188, "loss": 0.4101, "step": 375 }, { "epoch": 1.704035874439462, "grad_norm": 0.423828125, "learning_rate": 0.00010684922466775389, "loss": 0.4049, "step": 380 }, { "epoch": 1.726457399103139, "grad_norm": 0.40234375, "learning_rate": 0.00010514112136682282, "loss": 0.4077, "step": 385 }, { "epoch": 1.7488789237668163, "grad_norm": 0.447265625, "learning_rate": 0.00010343977235143499, "loss": 0.4049, "step": 390 }, { "epoch": 1.7713004484304933, "grad_norm": 0.416015625, "learning_rate": 0.00010174622193880337, "loss": 0.3886, "step": 395 }, { "epoch": 1.7937219730941703, "grad_norm": 0.40625, "learning_rate": 0.0001000615096592249, "loss": 0.3985, "step": 400 }, { "epoch": 1.8161434977578477, "grad_norm": 0.416015625, "learning_rate": 9.83866696179987e-05, "loss": 0.3987, "step": 405 }, { "epoch": 1.8385650224215246, "grad_norm": 0.40625, "learning_rate": 9.672272986067473e-05, "loss": 0.4011, "step": 410 }, { "epoch": 1.8609865470852018, "grad_norm": 0.388671875, "learning_rate": 9.507071174202163e-05, "loss": 0.3897, "step": 415 }, { "epoch": 1.883408071748879, "grad_norm": 0.42578125, "learning_rate": 9.343162929910169e-05, "loss": 0.3872, "step": 420 }, { "epoch": 1.905829596412556, "grad_norm": 0.431640625, "learning_rate": 9.180648862883761e-05, "loss": 0.3903, "step": 425 }, { "epoch": 1.9282511210762332, "grad_norm": 0.4296875, "learning_rate": 9.019628727045275e-05, "loss": 0.4004, "step": 430 }, { "epoch": 1.9506726457399104, "grad_norm": 0.390625, "learning_rate": 8.860201359316462e-05, "loss": 0.3857, "step": 435 }, { "epoch": 1.9730941704035874, "grad_norm": 0.412109375, "learning_rate": 8.702464618950674e-05, "loss": 0.3916, "step": 440 }, { "epoch": 1.9910313901345291, "eval_loss": 0.5398380160331726, "eval_runtime": 1.9374, "eval_samples_per_second": 21.679, "eval_steps_per_second": 21.679, "step": 444 } ], "logging_steps": 5, "max_steps": 669, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.145669325918044e+17, "train_batch_size": 100, "trial_name": null, "trial_params": null }