{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.988610478359909, "eval_steps": 500, "global_step": 657, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04555808656036447, "grad_norm": 16.628382336058554, "learning_rate": 1.5151515151515152e-06, "loss": 1.5883, "step": 10 }, { "epoch": 0.09111617312072894, "grad_norm": 3.221028223725061, "learning_rate": 3.0303030303030305e-06, "loss": 0.9317, "step": 20 }, { "epoch": 0.1366742596810934, "grad_norm": 1.5103642702449496, "learning_rate": 4.5454545454545455e-06, "loss": 0.5336, "step": 30 }, { "epoch": 0.18223234624145787, "grad_norm": 1.0681798496560502, "learning_rate": 6.060606060606061e-06, "loss": 0.4417, "step": 40 }, { "epoch": 0.22779043280182232, "grad_norm": 0.9679921213671909, "learning_rate": 7.5757575757575764e-06, "loss": 0.3979, "step": 50 }, { "epoch": 0.2733485193621868, "grad_norm": 0.926628742270625, "learning_rate": 9.090909090909091e-06, "loss": 0.3744, "step": 60 }, { "epoch": 0.31890660592255127, "grad_norm": 0.9491261260284587, "learning_rate": 9.998869765883566e-06, "loss": 0.3486, "step": 70 }, { "epoch": 0.36446469248291574, "grad_norm": 0.84920311370356, "learning_rate": 9.986160499534318e-06, "loss": 0.3453, "step": 80 }, { "epoch": 0.41002277904328016, "grad_norm": 0.9789183018412179, "learning_rate": 9.959365197965824e-06, "loss": 0.3572, "step": 90 }, { "epoch": 0.45558086560364464, "grad_norm": 0.9102455500916922, "learning_rate": 9.918559558613344e-06, "loss": 0.3356, "step": 100 }, { "epoch": 0.5011389521640092, "grad_norm": 0.9258850899143908, "learning_rate": 9.863858858486736e-06, "loss": 0.3278, "step": 110 }, { "epoch": 0.5466970387243736, "grad_norm": 0.8922762696366542, "learning_rate": 9.795417628509857e-06, "loss": 0.3268, "step": 120 }, { "epoch": 0.592255125284738, "grad_norm": 0.9396732926347776, "learning_rate": 9.713429216966624e-06, "loss": 0.3108, "step": 130 }, { "epoch": 0.6378132118451025, "grad_norm": 0.845601467481683, "learning_rate": 9.618125243286989e-06, "loss": 0.3164, "step": 140 }, { "epoch": 0.683371298405467, "grad_norm": 0.8159514975969553, "learning_rate": 9.50977494371594e-06, "loss": 0.3127, "step": 150 }, { "epoch": 0.7289293849658315, "grad_norm": 0.8778654186332652, "learning_rate": 9.388684410713977e-06, "loss": 0.3118, "step": 160 }, { "epoch": 0.7744874715261959, "grad_norm": 0.8158041504461108, "learning_rate": 9.255195728237837e-06, "loss": 0.3033, "step": 170 }, { "epoch": 0.8200455580865603, "grad_norm": 0.8241477933179293, "learning_rate": 9.109686005344258e-06, "loss": 0.298, "step": 180 }, { "epoch": 0.8656036446469249, "grad_norm": 0.7836442953404367, "learning_rate": 8.952566310846931e-06, "loss": 0.2967, "step": 190 }, { "epoch": 0.9111617312072893, "grad_norm": 0.8589198296868927, "learning_rate": 8.784280512036235e-06, "loss": 0.2908, "step": 200 }, { "epoch": 0.9567198177676538, "grad_norm": 0.8976108880329798, "learning_rate": 8.60530402074241e-06, "loss": 0.288, "step": 210 }, { "epoch": 1.0, "grad_norm": 0.8244261812910136, "learning_rate": 8.416142450284565e-06, "loss": 0.274, "step": 220 }, { "epoch": 1.0455580865603644, "grad_norm": 0.7709445009431932, "learning_rate": 8.217330187099689e-06, "loss": 0.178, "step": 230 }, { "epoch": 1.0911161731207288, "grad_norm": 0.7371669649380324, "learning_rate": 8.009428881086836e-06, "loss": 0.1706, "step": 240 }, { "epoch": 1.1366742596810935, "grad_norm": 0.8325359027964357, "learning_rate": 7.793025858931317e-06, "loss": 0.1717, "step": 250 }, { "epoch": 1.182232346241458, "grad_norm": 0.7810927662905283, "learning_rate": 7.568732464891293e-06, "loss": 0.1768, "step": 260 }, { "epoch": 1.2277904328018223, "grad_norm": 0.7328567227225008, "learning_rate": 7.33718233373407e-06, "loss": 0.174, "step": 270 }, { "epoch": 1.2733485193621867, "grad_norm": 0.829615067889057, "learning_rate": 7.099029600701144e-06, "loss": 0.1721, "step": 280 }, { "epoch": 1.3189066059225514, "grad_norm": 0.7764413190939856, "learning_rate": 6.854947053558849e-06, "loss": 0.169, "step": 290 }, { "epoch": 1.3644646924829158, "grad_norm": 0.7592152186492857, "learning_rate": 6.6056242319551315e-06, "loss": 0.1667, "step": 300 }, { "epoch": 1.4100227790432802, "grad_norm": 0.7583979219386041, "learning_rate": 6.3517654794518156e-06, "loss": 0.1644, "step": 310 }, { "epoch": 1.4555808656036446, "grad_norm": 0.7329938588396943, "learning_rate": 6.094087953735423e-06, "loss": 0.168, "step": 320 }, { "epoch": 1.501138952164009, "grad_norm": 0.8196364714199265, "learning_rate": 5.8333196006277536e-06, "loss": 0.1617, "step": 330 }, { "epoch": 1.5466970387243735, "grad_norm": 0.7565763296431312, "learning_rate": 5.570197097619688e-06, "loss": 0.1611, "step": 340 }, { "epoch": 1.592255125284738, "grad_norm": 0.8868392512031471, "learning_rate": 5.305463772737812e-06, "loss": 0.1609, "step": 350 }, { "epoch": 1.6378132118451025, "grad_norm": 0.8812463705246544, "learning_rate": 5.039867504623084e-06, "loss": 0.1677, "step": 360 }, { "epoch": 1.683371298405467, "grad_norm": 0.7633777742059934, "learning_rate": 4.774158609753908e-06, "loss": 0.1512, "step": 370 }, { "epoch": 1.7289293849658316, "grad_norm": 0.7820693250794272, "learning_rate": 4.5090877227822424e-06, "loss": 0.1572, "step": 380 }, { "epoch": 1.774487471526196, "grad_norm": 0.7688524374447184, "learning_rate": 4.245403675970877e-06, "loss": 0.1617, "step": 390 }, { "epoch": 1.8200455580865604, "grad_norm": 0.7254423021253176, "learning_rate": 3.9838513837224814e-06, "loss": 0.1519, "step": 400 }, { "epoch": 1.8656036446469249, "grad_norm": 0.8076268890579671, "learning_rate": 3.7251697381767373e-06, "loss": 0.154, "step": 410 }, { "epoch": 1.9111617312072893, "grad_norm": 0.7615743261772087, "learning_rate": 3.4700895218205026e-06, "loss": 0.1468, "step": 420 }, { "epoch": 1.9567198177676537, "grad_norm": 0.7511849885826963, "learning_rate": 3.2193313430079737e-06, "loss": 0.147, "step": 430 }, { "epoch": 2.0, "grad_norm": 0.7584213495047252, "learning_rate": 2.9736036002230332e-06, "loss": 0.1429, "step": 440 }, { "epoch": 2.0455580865603644, "grad_norm": 0.6815225387343684, "learning_rate": 2.7336004808348094e-06, "loss": 0.0751, "step": 450 }, { "epoch": 2.091116173120729, "grad_norm": 0.6174750652979677, "learning_rate": 2.5000000000000015e-06, "loss": 0.0711, "step": 460 }, { "epoch": 2.1366742596810933, "grad_norm": 0.6236626143150734, "learning_rate": 2.273462085252146e-06, "loss": 0.0722, "step": 470 }, { "epoch": 2.1822323462414577, "grad_norm": 0.5971939289865311, "learning_rate": 2.0546267121888863e-06, "loss": 0.0693, "step": 480 }, { "epoch": 2.2277904328018225, "grad_norm": 0.631697145877715, "learning_rate": 1.8441120965239912e-06, "loss": 0.0713, "step": 490 }, { "epoch": 2.273348519362187, "grad_norm": 0.6817300872785331, "learning_rate": 1.642512947611622e-06, "loss": 0.0729, "step": 500 }, { "epoch": 2.3189066059225514, "grad_norm": 0.5682490177071967, "learning_rate": 1.4503987883766857e-06, "loss": 0.0656, "step": 510 }, { "epoch": 2.364464692482916, "grad_norm": 0.5912819077013511, "learning_rate": 1.2683123463975144e-06, "loss": 0.0658, "step": 520 }, { "epoch": 2.41002277904328, "grad_norm": 0.606352905873965, "learning_rate": 1.0967680206861198e-06, "loss": 0.0694, "step": 530 }, { "epoch": 2.4555808656036446, "grad_norm": 0.6353412192229968, "learning_rate": 9.362504284973683e-07, "loss": 0.0701, "step": 540 }, { "epoch": 2.501138952164009, "grad_norm": 0.6607014617898831, "learning_rate": 7.872130362724422e-07, "loss": 0.0691, "step": 550 }, { "epoch": 2.5466970387243735, "grad_norm": 0.6527059303311876, "learning_rate": 6.500768785841482e-07, "loss": 0.0647, "step": 560 }, { "epoch": 2.592255125284738, "grad_norm": 0.6203144834959747, "learning_rate": 5.252293687031196e-07, "loss": 0.0669, "step": 570 }, { "epoch": 2.6378132118451028, "grad_norm": 0.6069382959627451, "learning_rate": 4.130232041450866e-07, "loss": 0.0624, "step": 580 }, { "epoch": 2.6833712984054667, "grad_norm": 0.6304879856190249, "learning_rate": 3.1377537029107174e-07, "loss": 0.0655, "step": 590 }, { "epoch": 2.7289293849658316, "grad_norm": 0.6325528536254185, "learning_rate": 2.2776624489530664e-07, "loss": 0.0667, "step": 600 }, { "epoch": 2.774487471526196, "grad_norm": 0.6179538898003908, "learning_rate": 1.55238806010668e-07, "loss": 0.0644, "step": 610 }, { "epoch": 2.8200455580865604, "grad_norm": 0.5965673740089573, "learning_rate": 9.639794556925041e-08, "loss": 0.0659, "step": 620 }, { "epoch": 2.865603644646925, "grad_norm": 0.6382075048871372, "learning_rate": 5.1409890557246876e-08, "loss": 0.0645, "step": 630 }, { "epoch": 2.9111617312072893, "grad_norm": 0.612194769673645, "learning_rate": 2.0401733419315727e-08, "loss": 0.063, "step": 640 }, { "epoch": 2.9567198177676537, "grad_norm": 0.6571824905728761, "learning_rate": 3.4610730190648423e-09, "loss": 0.0647, "step": 650 } ], "logging_steps": 10, "max_steps": 657, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 418097876697088.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }