{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.006993006993006993, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 6.993006993006993e-05, "grad_norm": 5.159224033355713, "learning_rate": 1.048951048951049e-06, "loss": 11.0749, "step": 1 }, { "epoch": 0.00013986013986013986, "grad_norm": 5.171240329742432, "learning_rate": 2.097902097902098e-06, "loss": 11.076, "step": 2 }, { "epoch": 0.0002097902097902098, "grad_norm": 5.199888229370117, "learning_rate": 3.146853146853147e-06, "loss": 11.0669, "step": 3 }, { "epoch": 0.0002797202797202797, "grad_norm": 5.08656120300293, "learning_rate": 4.195804195804196e-06, "loss": 11.0237, "step": 4 }, { "epoch": 0.00034965034965034965, "grad_norm": 5.065122127532959, "learning_rate": 5.244755244755245e-06, "loss": 10.8949, "step": 5 }, { "epoch": 0.0004195804195804196, "grad_norm": 4.298279762268066, "learning_rate": 6.293706293706294e-06, "loss": 10.675, "step": 6 }, { "epoch": 0.0004895104895104895, "grad_norm": 3.8743011951446533, "learning_rate": 7.342657342657342e-06, "loss": 10.4729, "step": 7 }, { "epoch": 0.0005594405594405594, "grad_norm": 3.7294981479644775, "learning_rate": 8.391608391608391e-06, "loss": 10.3221, "step": 8 }, { "epoch": 0.0006293706293706294, "grad_norm": 3.3471932411193848, "learning_rate": 9.44055944055944e-06, "loss": 10.1544, "step": 9 }, { "epoch": 0.0006993006993006993, "grad_norm": 3.088207721710205, "learning_rate": 1.048951048951049e-05, "loss": 10.0823, "step": 10 }, { "epoch": 0.0007692307692307692, "grad_norm": 2.713122606277466, "learning_rate": 1.1538461538461538e-05, "loss": 9.9324, "step": 11 }, { "epoch": 0.0008391608391608392, "grad_norm": 2.5934906005859375, "learning_rate": 1.2587412587412587e-05, "loss": 9.8238, "step": 12 }, { "epoch": 0.0009090909090909091, "grad_norm": 2.258924722671509, "learning_rate": 1.3636363636363635e-05, "loss": 9.7534, "step": 13 }, { "epoch": 0.000979020979020979, "grad_norm": 2.0431277751922607, "learning_rate": 1.4685314685314684e-05, "loss": 9.683, "step": 14 }, { "epoch": 0.001048951048951049, "grad_norm": 2.0174779891967773, "learning_rate": 1.5734265734265734e-05, "loss": 9.5835, "step": 15 }, { "epoch": 0.0011188811188811189, "grad_norm": 1.9392926692962646, "learning_rate": 1.6783216783216783e-05, "loss": 9.566, "step": 16 }, { "epoch": 0.0011888111888111888, "grad_norm": 1.841917634010315, "learning_rate": 1.783216783216783e-05, "loss": 9.4862, "step": 17 }, { "epoch": 0.0012587412587412587, "grad_norm": 1.6751583814620972, "learning_rate": 1.888111888111888e-05, "loss": 9.4613, "step": 18 }, { "epoch": 0.0013286713286713287, "grad_norm": 1.701292634010315, "learning_rate": 1.9930069930069928e-05, "loss": 9.3861, "step": 19 }, { "epoch": 0.0013986013986013986, "grad_norm": 1.7422462701797485, "learning_rate": 2.097902097902098e-05, "loss": 9.3607, "step": 20 }, { "epoch": 0.0014685314685314685, "grad_norm": 1.7331982851028442, "learning_rate": 2.2027972027972026e-05, "loss": 9.3106, "step": 21 }, { "epoch": 0.0015384615384615385, "grad_norm": 1.8083829879760742, "learning_rate": 2.3076923076923076e-05, "loss": 9.252, "step": 22 }, { "epoch": 0.0016083916083916084, "grad_norm": 1.7314563989639282, "learning_rate": 2.412587412587412e-05, "loss": 9.2325, "step": 23 }, { "epoch": 0.0016783216783216783, "grad_norm": 1.5793718099594116, "learning_rate": 2.5174825174825174e-05, "loss": 9.2489, "step": 24 }, { "epoch": 0.0017482517482517483, "grad_norm": 1.5183552503585815, "learning_rate": 2.622377622377622e-05, "loss": 9.1821, "step": 25 }, { "epoch": 0.0018181818181818182, "grad_norm": 1.5751609802246094, "learning_rate": 2.727272727272727e-05, "loss": 9.1217, "step": 26 }, { "epoch": 0.0018881118881118881, "grad_norm": 1.541367530822754, "learning_rate": 2.832167832167832e-05, "loss": 9.1186, "step": 27 }, { "epoch": 0.001958041958041958, "grad_norm": 1.5820801258087158, "learning_rate": 2.937062937062937e-05, "loss": 9.105, "step": 28 }, { "epoch": 0.002027972027972028, "grad_norm": 1.483319640159607, "learning_rate": 3.0419580419580414e-05, "loss": 9.0772, "step": 29 }, { "epoch": 0.002097902097902098, "grad_norm": 1.5269867181777954, "learning_rate": 3.146853146853147e-05, "loss": 9.0185, "step": 30 }, { "epoch": 0.002167832167832168, "grad_norm": 1.4273761510849, "learning_rate": 3.251748251748251e-05, "loss": 9.0013, "step": 31 }, { "epoch": 0.0022377622377622378, "grad_norm": 1.4377866983413696, "learning_rate": 3.3566433566433566e-05, "loss": 8.9161, "step": 32 }, { "epoch": 0.002307692307692308, "grad_norm": 1.369158148765564, "learning_rate": 3.461538461538461e-05, "loss": 8.917, "step": 33 }, { "epoch": 0.0023776223776223776, "grad_norm": 1.328121542930603, "learning_rate": 3.566433566433566e-05, "loss": 8.8484, "step": 34 }, { "epoch": 0.0024475524475524478, "grad_norm": 1.289856195449829, "learning_rate": 3.671328671328671e-05, "loss": 8.8637, "step": 35 }, { "epoch": 0.0025174825174825175, "grad_norm": 1.2946962118148804, "learning_rate": 3.776223776223776e-05, "loss": 8.8045, "step": 36 }, { "epoch": 0.0025874125874125876, "grad_norm": 1.3396406173706055, "learning_rate": 3.881118881118881e-05, "loss": 8.7569, "step": 37 }, { "epoch": 0.0026573426573426573, "grad_norm": 1.3043384552001953, "learning_rate": 3.9860139860139855e-05, "loss": 8.7184, "step": 38 }, { "epoch": 0.0027272727272727275, "grad_norm": 1.2441431283950806, "learning_rate": 4.09090909090909e-05, "loss": 8.66, "step": 39 }, { "epoch": 0.002797202797202797, "grad_norm": 1.1379547119140625, "learning_rate": 4.195804195804196e-05, "loss": 8.6217, "step": 40 }, { "epoch": 0.0028671328671328673, "grad_norm": 1.1792885065078735, "learning_rate": 4.300699300699301e-05, "loss": 8.5006, "step": 41 }, { "epoch": 0.002937062937062937, "grad_norm": 1.183509349822998, "learning_rate": 4.405594405594405e-05, "loss": 8.5037, "step": 42 }, { "epoch": 0.003006993006993007, "grad_norm": 1.1481878757476807, "learning_rate": 4.51048951048951e-05, "loss": 8.5114, "step": 43 }, { "epoch": 0.003076923076923077, "grad_norm": 1.1832213401794434, "learning_rate": 4.615384615384615e-05, "loss": 8.4117, "step": 44 }, { "epoch": 0.003146853146853147, "grad_norm": 1.0428388118743896, "learning_rate": 4.72027972027972e-05, "loss": 8.4378, "step": 45 }, { "epoch": 0.0032167832167832168, "grad_norm": 1.1736469268798828, "learning_rate": 4.825174825174824e-05, "loss": 8.2981, "step": 46 }, { "epoch": 0.003286713286713287, "grad_norm": 1.0533231496810913, "learning_rate": 4.930069930069929e-05, "loss": 8.3406, "step": 47 }, { "epoch": 0.0033566433566433566, "grad_norm": 1.1464074850082397, "learning_rate": 5.034965034965035e-05, "loss": 8.2388, "step": 48 }, { "epoch": 0.003426573426573427, "grad_norm": 1.224696159362793, "learning_rate": 5.1398601398601395e-05, "loss": 8.208, "step": 49 }, { "epoch": 0.0034965034965034965, "grad_norm": 1.1058298349380493, "learning_rate": 5.244755244755244e-05, "loss": 8.192, "step": 50 }, { "epoch": 0.0035664335664335666, "grad_norm": 1.0586011409759521, "learning_rate": 5.3496503496503493e-05, "loss": 8.1369, "step": 51 }, { "epoch": 0.0036363636363636364, "grad_norm": 1.0969116687774658, "learning_rate": 5.454545454545454e-05, "loss": 8.0515, "step": 52 }, { "epoch": 0.0037062937062937065, "grad_norm": 0.9527297616004944, "learning_rate": 5.559440559440559e-05, "loss": 8.0319, "step": 53 }, { "epoch": 0.0037762237762237762, "grad_norm": 1.0115875005722046, "learning_rate": 5.664335664335664e-05, "loss": 7.9882, "step": 54 }, { "epoch": 0.0038461538461538464, "grad_norm": 1.044940710067749, "learning_rate": 5.769230769230769e-05, "loss": 7.9776, "step": 55 }, { "epoch": 0.003916083916083916, "grad_norm": 0.9222269058227539, "learning_rate": 5.874125874125874e-05, "loss": 7.9327, "step": 56 }, { "epoch": 0.003986013986013986, "grad_norm": 0.9342182278633118, "learning_rate": 5.979020979020978e-05, "loss": 7.853, "step": 57 }, { "epoch": 0.004055944055944056, "grad_norm": 0.9146339297294617, "learning_rate": 6.083916083916083e-05, "loss": 7.8091, "step": 58 }, { "epoch": 0.004125874125874126, "grad_norm": 0.8632137775421143, "learning_rate": 6.188811188811188e-05, "loss": 7.7744, "step": 59 }, { "epoch": 0.004195804195804196, "grad_norm": 1.001086711883545, "learning_rate": 6.293706293706293e-05, "loss": 7.7623, "step": 60 }, { "epoch": 0.0042657342657342655, "grad_norm": 0.9110626578330994, "learning_rate": 6.398601398601397e-05, "loss": 7.7356, "step": 61 }, { "epoch": 0.004335664335664336, "grad_norm": 0.7626254558563232, "learning_rate": 6.503496503496503e-05, "loss": 7.6801, "step": 62 }, { "epoch": 0.004405594405594406, "grad_norm": 1.0254472494125366, "learning_rate": 6.608391608391608e-05, "loss": 7.6111, "step": 63 }, { "epoch": 0.0044755244755244755, "grad_norm": 0.9034542441368103, "learning_rate": 6.713286713286713e-05, "loss": 7.603, "step": 64 }, { "epoch": 0.004545454545454545, "grad_norm": 0.685672402381897, "learning_rate": 6.818181818181817e-05, "loss": 7.5614, "step": 65 }, { "epoch": 0.004615384615384616, "grad_norm": 0.9960275292396545, "learning_rate": 6.923076923076922e-05, "loss": 7.5537, "step": 66 }, { "epoch": 0.0046853146853146855, "grad_norm": 0.6702042818069458, "learning_rate": 7.027972027972028e-05, "loss": 7.4794, "step": 67 }, { "epoch": 0.004755244755244755, "grad_norm": 0.8843338489532471, "learning_rate": 7.132867132867132e-05, "loss": 7.4486, "step": 68 }, { "epoch": 0.004825174825174825, "grad_norm": 0.7096754908561707, "learning_rate": 7.237762237762237e-05, "loss": 7.4253, "step": 69 }, { "epoch": 0.0048951048951048955, "grad_norm": 0.8532392382621765, "learning_rate": 7.342657342657342e-05, "loss": 7.383, "step": 70 }, { "epoch": 0.004965034965034965, "grad_norm": 0.7407330870628357, "learning_rate": 7.447552447552447e-05, "loss": 7.4059, "step": 71 }, { "epoch": 0.005034965034965035, "grad_norm": 0.7601925730705261, "learning_rate": 7.552447552447553e-05, "loss": 7.3522, "step": 72 }, { "epoch": 0.005104895104895105, "grad_norm": 0.6095029711723328, "learning_rate": 7.657342657342657e-05, "loss": 7.3442, "step": 73 }, { "epoch": 0.005174825174825175, "grad_norm": 0.8200113773345947, "learning_rate": 7.762237762237762e-05, "loss": 7.3272, "step": 74 }, { "epoch": 0.005244755244755245, "grad_norm": 0.9570252299308777, "learning_rate": 7.867132867132867e-05, "loss": 7.2258, "step": 75 }, { "epoch": 0.005314685314685315, "grad_norm": 1.005595088005066, "learning_rate": 7.972027972027971e-05, "loss": 7.2552, "step": 76 }, { "epoch": 0.005384615384615384, "grad_norm": 0.7686155438423157, "learning_rate": 8.076923076923076e-05, "loss": 7.2615, "step": 77 }, { "epoch": 0.005454545454545455, "grad_norm": 0.6844556927680969, "learning_rate": 8.18181818181818e-05, "loss": 7.1683, "step": 78 }, { "epoch": 0.005524475524475525, "grad_norm": 0.665515661239624, "learning_rate": 8.286713286713286e-05, "loss": 7.2015, "step": 79 }, { "epoch": 0.005594405594405594, "grad_norm": 0.7764729261398315, "learning_rate": 8.391608391608392e-05, "loss": 7.1474, "step": 80 }, { "epoch": 0.005664335664335664, "grad_norm": 0.6877023577690125, "learning_rate": 8.496503496503496e-05, "loss": 7.1705, "step": 81 }, { "epoch": 0.005734265734265735, "grad_norm": 0.678676426410675, "learning_rate": 8.601398601398601e-05, "loss": 7.1529, "step": 82 }, { "epoch": 0.005804195804195804, "grad_norm": 0.8167915940284729, "learning_rate": 8.706293706293705e-05, "loss": 7.0738, "step": 83 }, { "epoch": 0.005874125874125874, "grad_norm": 0.6965641975402832, "learning_rate": 8.81118881118881e-05, "loss": 7.1085, "step": 84 }, { "epoch": 0.005944055944055944, "grad_norm": 0.9094026684761047, "learning_rate": 8.916083916083914e-05, "loss": 7.0906, "step": 85 }, { "epoch": 0.006013986013986014, "grad_norm": 0.7994691133499146, "learning_rate": 9.02097902097902e-05, "loss": 7.0782, "step": 86 }, { "epoch": 0.006083916083916084, "grad_norm": 1.007351279258728, "learning_rate": 9.125874125874126e-05, "loss": 7.0095, "step": 87 }, { "epoch": 0.006153846153846154, "grad_norm": 0.8736310601234436, "learning_rate": 9.23076923076923e-05, "loss": 6.9629, "step": 88 }, { "epoch": 0.0062237762237762236, "grad_norm": 0.6392287015914917, "learning_rate": 9.335664335664336e-05, "loss": 6.9908, "step": 89 }, { "epoch": 0.006293706293706294, "grad_norm": 1.091141939163208, "learning_rate": 9.44055944055944e-05, "loss": 6.9523, "step": 90 }, { "epoch": 0.006363636363636364, "grad_norm": 0.7908533215522766, "learning_rate": 9.545454545454545e-05, "loss": 6.9238, "step": 91 }, { "epoch": 0.0064335664335664336, "grad_norm": 0.6706556081771851, "learning_rate": 9.650349650349649e-05, "loss": 6.9672, "step": 92 }, { "epoch": 0.006503496503496503, "grad_norm": 0.9139024019241333, "learning_rate": 9.755244755244754e-05, "loss": 6.9997, "step": 93 }, { "epoch": 0.006573426573426574, "grad_norm": 0.7192760109901428, "learning_rate": 9.860139860139858e-05, "loss": 6.936, "step": 94 }, { "epoch": 0.006643356643356644, "grad_norm": 0.7734940052032471, "learning_rate": 9.965034965034964e-05, "loss": 7.0156, "step": 95 }, { "epoch": 0.006713286713286713, "grad_norm": 0.8069332838058472, "learning_rate": 0.0001006993006993007, "loss": 6.8958, "step": 96 }, { "epoch": 0.006783216783216783, "grad_norm": 1.0324732065200806, "learning_rate": 0.00010174825174825174, "loss": 6.8863, "step": 97 }, { "epoch": 0.006853146853146854, "grad_norm": 0.6798093318939209, "learning_rate": 0.00010279720279720279, "loss": 6.8534, "step": 98 }, { "epoch": 0.006923076923076923, "grad_norm": 0.7703111171722412, "learning_rate": 0.00010384615384615383, "loss": 6.7499, "step": 99 }, { "epoch": 0.006993006993006993, "grad_norm": 0.941916823387146, "learning_rate": 0.00010489510489510488, "loss": 6.8574, "step": 100 } ], "logging_steps": 1, "max_steps": 14300, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.7825750933504e+16, "train_batch_size": 64, "trial_name": null, "trial_params": null }