| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 400, | |
| "global_step": 375, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.016, | |
| "grad_norm": 0.5721463561058044, | |
| "learning_rate": 0.000997326203208556, | |
| "loss": 22.6723, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.032, | |
| "grad_norm": 1.5221903324127197, | |
| "learning_rate": 0.0009919786096256684, | |
| "loss": 22.7502, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.048, | |
| "grad_norm": 3.269012212753296, | |
| "learning_rate": 0.0009866310160427808, | |
| "loss": 22.2706, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.064, | |
| "grad_norm": 4.567020416259766, | |
| "learning_rate": 0.0009812834224598931, | |
| "loss": 21.3625, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 7.019204139709473, | |
| "learning_rate": 0.0009759358288770054, | |
| "loss": 20.7279, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.096, | |
| "grad_norm": 8.498096466064453, | |
| "learning_rate": 0.0009705882352941176, | |
| "loss": 20.8221, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.112, | |
| "grad_norm": 7.8151397705078125, | |
| "learning_rate": 0.00096524064171123, | |
| "loss": 20.4136, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.128, | |
| "grad_norm": 8.028499603271484, | |
| "learning_rate": 0.0009598930481283422, | |
| "loss": 20.2719, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.144, | |
| "grad_norm": 8.516434669494629, | |
| "learning_rate": 0.0009545454545454546, | |
| "loss": 20.1681, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 8.52490520477295, | |
| "learning_rate": 0.0009491978609625669, | |
| "loss": 19.8895, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.176, | |
| "grad_norm": 6.709629058837891, | |
| "learning_rate": 0.0009438502673796791, | |
| "loss": 19.93, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.192, | |
| "grad_norm": 6.038687705993652, | |
| "learning_rate": 0.0009385026737967914, | |
| "loss": 19.6312, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.208, | |
| "grad_norm": 5.785665512084961, | |
| "learning_rate": 0.0009331550802139037, | |
| "loss": 19.7683, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.224, | |
| "grad_norm": 5.79067850112915, | |
| "learning_rate": 0.0009278074866310161, | |
| "loss": 19.6965, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 5.166928291320801, | |
| "learning_rate": 0.0009224598930481284, | |
| "loss": 19.4005, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.256, | |
| "grad_norm": 4.578023433685303, | |
| "learning_rate": 0.0009171122994652407, | |
| "loss": 19.3963, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.272, | |
| "grad_norm": 4.7540693283081055, | |
| "learning_rate": 0.0009117647058823529, | |
| "loss": 19.4129, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.288, | |
| "grad_norm": 5.394408226013184, | |
| "learning_rate": 0.0009064171122994653, | |
| "loss": 19.5821, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.304, | |
| "grad_norm": 4.4902753829956055, | |
| "learning_rate": 0.0009010695187165776, | |
| "loss": 19.6562, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 5.49019193649292, | |
| "learning_rate": 0.0008957219251336899, | |
| "loss": 19.3588, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.336, | |
| "grad_norm": 4.184142589569092, | |
| "learning_rate": 0.0008903743315508022, | |
| "loss": 18.9032, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.352, | |
| "grad_norm": 3.98618483543396, | |
| "learning_rate": 0.0008850267379679144, | |
| "loss": 19.1882, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.368, | |
| "grad_norm": 4.851687908172607, | |
| "learning_rate": 0.0008796791443850267, | |
| "loss": 19.4565, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.384, | |
| "grad_norm": 4.108444690704346, | |
| "learning_rate": 0.0008743315508021391, | |
| "loss": 19.6149, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 3.7055838108062744, | |
| "learning_rate": 0.0008689839572192514, | |
| "loss": 18.9573, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.416, | |
| "grad_norm": 4.930137634277344, | |
| "learning_rate": 0.0008636363636363636, | |
| "loss": 19.4389, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.432, | |
| "grad_norm": 3.910098075866699, | |
| "learning_rate": 0.000858288770053476, | |
| "loss": 19.1465, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.448, | |
| "grad_norm": 4.0127716064453125, | |
| "learning_rate": 0.0008529411764705882, | |
| "loss": 19.5038, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.464, | |
| "grad_norm": 4.495028018951416, | |
| "learning_rate": 0.0008475935828877005, | |
| "loss": 19.3252, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 3.7703821659088135, | |
| "learning_rate": 0.0008422459893048129, | |
| "loss": 19.0238, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.496, | |
| "grad_norm": 3.6335291862487793, | |
| "learning_rate": 0.0008368983957219252, | |
| "loss": 19.1296, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.512, | |
| "grad_norm": 3.819183588027954, | |
| "learning_rate": 0.0008315508021390374, | |
| "loss": 18.4946, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.528, | |
| "grad_norm": 3.3171255588531494, | |
| "learning_rate": 0.0008262032085561497, | |
| "loss": 18.8054, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.544, | |
| "grad_norm": 4.316566467285156, | |
| "learning_rate": 0.000820855614973262, | |
| "loss": 19.162, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 3.39648175239563, | |
| "learning_rate": 0.0008155080213903744, | |
| "loss": 18.5671, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.576, | |
| "grad_norm": 3.7200136184692383, | |
| "learning_rate": 0.0008101604278074867, | |
| "loss": 18.9179, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.592, | |
| "grad_norm": 3.6730430126190186, | |
| "learning_rate": 0.0008048128342245989, | |
| "loss": 18.7162, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.608, | |
| "grad_norm": 3.5580945014953613, | |
| "learning_rate": 0.0007994652406417113, | |
| "loss": 19.0574, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.624, | |
| "grad_norm": 3.4793589115142822, | |
| "learning_rate": 0.0007941176470588235, | |
| "loss": 18.8649, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 4.074679374694824, | |
| "learning_rate": 0.0007887700534759359, | |
| "loss": 18.5553, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.656, | |
| "grad_norm": 3.315810441970825, | |
| "learning_rate": 0.0007834224598930482, | |
| "loss": 18.2136, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.672, | |
| "grad_norm": 4.288172721862793, | |
| "learning_rate": 0.0007780748663101605, | |
| "loss": 18.6089, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.688, | |
| "grad_norm": 3.5749149322509766, | |
| "learning_rate": 0.0007727272727272727, | |
| "loss": 18.8697, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.704, | |
| "grad_norm": 3.608825206756592, | |
| "learning_rate": 0.000767379679144385, | |
| "loss": 18.4129, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 3.5199592113494873, | |
| "learning_rate": 0.0007620320855614974, | |
| "loss": 18.1619, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.736, | |
| "grad_norm": 3.5022549629211426, | |
| "learning_rate": 0.0007566844919786096, | |
| "loss": 18.7368, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.752, | |
| "grad_norm": 3.6002230644226074, | |
| "learning_rate": 0.000751336898395722, | |
| "loss": 18.7792, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.768, | |
| "grad_norm": 4.682362079620361, | |
| "learning_rate": 0.0007459893048128342, | |
| "loss": 18.5495, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.784, | |
| "grad_norm": 3.6108767986297607, | |
| "learning_rate": 0.0007406417112299465, | |
| "loss": 18.7077, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 3.4719815254211426, | |
| "learning_rate": 0.0007352941176470589, | |
| "loss": 18.3262, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.816, | |
| "grad_norm": 4.4115986824035645, | |
| "learning_rate": 0.0007299465240641712, | |
| "loss": 18.3416, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.832, | |
| "grad_norm": 3.324169158935547, | |
| "learning_rate": 0.0007245989304812834, | |
| "loss": 18.7297, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.848, | |
| "grad_norm": 3.4287421703338623, | |
| "learning_rate": 0.0007192513368983958, | |
| "loss": 18.4499, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.864, | |
| "grad_norm": 3.9451239109039307, | |
| "learning_rate": 0.000713903743315508, | |
| "loss": 18.2669, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 3.5031988620758057, | |
| "learning_rate": 0.0007085561497326202, | |
| "loss": 18.8895, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.896, | |
| "grad_norm": 3.5174903869628906, | |
| "learning_rate": 0.0007032085561497327, | |
| "loss": 18.2961, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.912, | |
| "grad_norm": 4.080729961395264, | |
| "learning_rate": 0.0006978609625668449, | |
| "loss": 18.5613, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.928, | |
| "grad_norm": 3.7523930072784424, | |
| "learning_rate": 0.0006925133689839572, | |
| "loss": 18.5538, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.944, | |
| "grad_norm": 3.066669225692749, | |
| "learning_rate": 0.0006871657754010695, | |
| "loss": 18.6904, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 4.274256706237793, | |
| "learning_rate": 0.0006818181818181818, | |
| "loss": 18.6147, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.976, | |
| "grad_norm": 3.690139055252075, | |
| "learning_rate": 0.0006764705882352942, | |
| "loss": 18.1693, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.992, | |
| "grad_norm": 3.6681807041168213, | |
| "learning_rate": 0.0006711229946524065, | |
| "loss": 18.2498, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 1.008, | |
| "grad_norm": 3.5203354358673096, | |
| "learning_rate": 0.0006657754010695187, | |
| "loss": 18.4522, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 1.024, | |
| "grad_norm": 4.650991439819336, | |
| "learning_rate": 0.000660427807486631, | |
| "loss": 18.2839, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 3.7944228649139404, | |
| "learning_rate": 0.0006550802139037433, | |
| "loss": 18.051, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 1.056, | |
| "grad_norm": 3.2437500953674316, | |
| "learning_rate": 0.0006497326203208556, | |
| "loss": 18.1842, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 1.072, | |
| "grad_norm": 3.2863543033599854, | |
| "learning_rate": 0.000644385026737968, | |
| "loss": 18.2304, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 1.088, | |
| "grad_norm": 3.553260326385498, | |
| "learning_rate": 0.0006390374331550802, | |
| "loss": 18.1385, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 1.104, | |
| "grad_norm": 3.4277195930480957, | |
| "learning_rate": 0.0006336898395721925, | |
| "loss": 18.1337, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 3.974073886871338, | |
| "learning_rate": 0.0006283422459893048, | |
| "loss": 18.0326, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 1.1360000000000001, | |
| "grad_norm": 3.3450510501861572, | |
| "learning_rate": 0.0006229946524064172, | |
| "loss": 18.2695, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 1.152, | |
| "grad_norm": 3.2181997299194336, | |
| "learning_rate": 0.0006176470588235294, | |
| "loss": 18.0315, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 1.168, | |
| "grad_norm": 3.8346364498138428, | |
| "learning_rate": 0.0006122994652406418, | |
| "loss": 18.4272, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 1.184, | |
| "grad_norm": 3.2085418701171875, | |
| "learning_rate": 0.000606951871657754, | |
| "loss": 18.1768, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 3.462108850479126, | |
| "learning_rate": 0.0006016042780748662, | |
| "loss": 18.1731, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.216, | |
| "grad_norm": 3.444965362548828, | |
| "learning_rate": 0.0005962566844919787, | |
| "loss": 18.3599, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 1.232, | |
| "grad_norm": 3.3701171875, | |
| "learning_rate": 0.0005909090909090909, | |
| "loss": 18.1495, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 1.248, | |
| "grad_norm": 3.5145843029022217, | |
| "learning_rate": 0.0005855614973262032, | |
| "loss": 18.0835, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 1.264, | |
| "grad_norm": 3.4785313606262207, | |
| "learning_rate": 0.0005802139037433155, | |
| "loss": 17.8138, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 3.9735538959503174, | |
| "learning_rate": 0.0005748663101604278, | |
| "loss": 18.0071, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.296, | |
| "grad_norm": 3.650447368621826, | |
| "learning_rate": 0.00056951871657754, | |
| "loss": 18.0124, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 1.312, | |
| "grad_norm": 3.6459813117980957, | |
| "learning_rate": 0.0005641711229946525, | |
| "loss": 18.0059, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 1.328, | |
| "grad_norm": 3.2154831886291504, | |
| "learning_rate": 0.0005588235294117647, | |
| "loss": 17.9694, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 1.3439999999999999, | |
| "grad_norm": 3.367403507232666, | |
| "learning_rate": 0.0005534759358288771, | |
| "loss": 17.6557, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 1.3599999999999999, | |
| "grad_norm": 3.9948298931121826, | |
| "learning_rate": 0.0005481283422459893, | |
| "loss": 18.1942, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 1.376, | |
| "grad_norm": 3.3495073318481445, | |
| "learning_rate": 0.0005427807486631015, | |
| "loss": 18.2016, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 1.392, | |
| "grad_norm": 3.373162269592285, | |
| "learning_rate": 0.000537433155080214, | |
| "loss": 18.0422, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 1.408, | |
| "grad_norm": 4.063633441925049, | |
| "learning_rate": 0.0005320855614973262, | |
| "loss": 18.0809, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 1.424, | |
| "grad_norm": 3.4912514686584473, | |
| "learning_rate": 0.0005267379679144385, | |
| "loss": 18.0674, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 3.5900015830993652, | |
| "learning_rate": 0.0005213903743315508, | |
| "loss": 17.9285, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 1.456, | |
| "grad_norm": 4.066802024841309, | |
| "learning_rate": 0.0005160427807486631, | |
| "loss": 18.1551, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 1.472, | |
| "grad_norm": 3.9782357215881348, | |
| "learning_rate": 0.0005106951871657754, | |
| "loss": 18.0509, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 1.488, | |
| "grad_norm": 3.314682960510254, | |
| "learning_rate": 0.0005053475935828878, | |
| "loss": 17.7608, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 1.504, | |
| "grad_norm": 3.3548595905303955, | |
| "learning_rate": 0.0005, | |
| "loss": 17.8103, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 3.3475797176361084, | |
| "learning_rate": 0.0004946524064171123, | |
| "loss": 17.9465, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 1.536, | |
| "grad_norm": 3.4256432056427, | |
| "learning_rate": 0.0004893048128342246, | |
| "loss": 17.6619, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 1.552, | |
| "grad_norm": 3.390056848526001, | |
| "learning_rate": 0.0004839572192513369, | |
| "loss": 17.9681, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 1.568, | |
| "grad_norm": 3.4441208839416504, | |
| "learning_rate": 0.00047860962566844924, | |
| "loss": 17.9407, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 1.584, | |
| "grad_norm": 3.2374165058135986, | |
| "learning_rate": 0.0004732620320855615, | |
| "loss": 17.7235, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 3.5628514289855957, | |
| "learning_rate": 0.0004679144385026738, | |
| "loss": 18.1743, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.616, | |
| "grad_norm": 3.41139554977417, | |
| "learning_rate": 0.00046256684491978613, | |
| "loss": 17.8456, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 1.6320000000000001, | |
| "grad_norm": 3.423110008239746, | |
| "learning_rate": 0.0004572192513368984, | |
| "loss": 17.6656, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 1.6480000000000001, | |
| "grad_norm": 3.3344337940216064, | |
| "learning_rate": 0.00045187165775401067, | |
| "loss": 17.962, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 1.6640000000000001, | |
| "grad_norm": 3.5036981105804443, | |
| "learning_rate": 0.000446524064171123, | |
| "loss": 18.0875, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 1.6800000000000002, | |
| "grad_norm": 3.4953839778900146, | |
| "learning_rate": 0.0004411764705882353, | |
| "loss": 17.3435, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 1.696, | |
| "grad_norm": 3.6864068508148193, | |
| "learning_rate": 0.0004358288770053476, | |
| "loss": 17.9087, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 1.712, | |
| "grad_norm": 3.4755449295043945, | |
| "learning_rate": 0.0004304812834224599, | |
| "loss": 17.5076, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 1.728, | |
| "grad_norm": 3.8116891384124756, | |
| "learning_rate": 0.0004251336898395722, | |
| "loss": 17.9272, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 1.744, | |
| "grad_norm": 3.18284010887146, | |
| "learning_rate": 0.0004197860962566845, | |
| "loss": 17.7148, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 3.2884979248046875, | |
| "learning_rate": 0.0004144385026737968, | |
| "loss": 17.8813, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 1.776, | |
| "grad_norm": 3.3735768795013428, | |
| "learning_rate": 0.00040909090909090913, | |
| "loss": 18.0372, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 1.792, | |
| "grad_norm": 3.2611794471740723, | |
| "learning_rate": 0.00040374331550802143, | |
| "loss": 17.3771, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 1.808, | |
| "grad_norm": 3.3338570594787598, | |
| "learning_rate": 0.00039839572192513367, | |
| "loss": 18.4657, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 1.8239999999999998, | |
| "grad_norm": 3.405127763748169, | |
| "learning_rate": 0.000393048128342246, | |
| "loss": 17.9076, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 1.8399999999999999, | |
| "grad_norm": 3.561793565750122, | |
| "learning_rate": 0.0003877005347593583, | |
| "loss": 17.8996, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 1.8559999999999999, | |
| "grad_norm": 3.5615479946136475, | |
| "learning_rate": 0.00038235294117647055, | |
| "loss": 17.6746, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 1.8719999999999999, | |
| "grad_norm": 3.4306275844573975, | |
| "learning_rate": 0.0003770053475935829, | |
| "loss": 17.7182, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 1.888, | |
| "grad_norm": 3.5057003498077393, | |
| "learning_rate": 0.0003716577540106952, | |
| "loss": 17.8058, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 1.904, | |
| "grad_norm": 3.3117101192474365, | |
| "learning_rate": 0.0003663101604278075, | |
| "loss": 17.8643, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 3.6897945404052734, | |
| "learning_rate": 0.0003609625668449198, | |
| "loss": 17.8266, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.936, | |
| "grad_norm": 3.7577505111694336, | |
| "learning_rate": 0.0003556149732620321, | |
| "loss": 18.6381, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 1.952, | |
| "grad_norm": 3.2401480674743652, | |
| "learning_rate": 0.0003502673796791444, | |
| "loss": 17.6933, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 1.968, | |
| "grad_norm": 3.6619515419006348, | |
| "learning_rate": 0.0003449197860962567, | |
| "loss": 18.0547, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 1.984, | |
| "grad_norm": 3.8387668132781982, | |
| "learning_rate": 0.000339572192513369, | |
| "loss": 17.7932, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 3.390653371810913, | |
| "learning_rate": 0.0003342245989304813, | |
| "loss": 17.2655, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 2.016, | |
| "grad_norm": 3.40058970451355, | |
| "learning_rate": 0.00032887700534759356, | |
| "loss": 17.703, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 2.032, | |
| "grad_norm": 3.568702220916748, | |
| "learning_rate": 0.0003235294117647059, | |
| "loss": 17.2042, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 2.048, | |
| "grad_norm": 3.529431104660034, | |
| "learning_rate": 0.0003181818181818182, | |
| "loss": 17.5732, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 2.064, | |
| "grad_norm": 3.3919003009796143, | |
| "learning_rate": 0.00031283422459893044, | |
| "loss": 17.6191, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 3.878042459487915, | |
| "learning_rate": 0.0003074866310160428, | |
| "loss": 17.4911, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 2.096, | |
| "grad_norm": 3.772318124771118, | |
| "learning_rate": 0.0003021390374331551, | |
| "loss": 17.7258, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 2.112, | |
| "grad_norm": 3.4453060626983643, | |
| "learning_rate": 0.0002967914438502674, | |
| "loss": 17.4906, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 2.128, | |
| "grad_norm": 3.4957454204559326, | |
| "learning_rate": 0.0002914438502673797, | |
| "loss": 17.5716, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 2.144, | |
| "grad_norm": 3.530831813812256, | |
| "learning_rate": 0.000286096256684492, | |
| "loss": 17.4089, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "grad_norm": 3.7524755001068115, | |
| "learning_rate": 0.0002807486631016043, | |
| "loss": 17.7712, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 2.176, | |
| "grad_norm": 3.297961711883545, | |
| "learning_rate": 0.00027540106951871656, | |
| "loss": 17.4408, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 2.192, | |
| "grad_norm": 3.3661088943481445, | |
| "learning_rate": 0.0002700534759358289, | |
| "loss": 17.6753, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 2.208, | |
| "grad_norm": 3.646210193634033, | |
| "learning_rate": 0.0002647058823529412, | |
| "loss": 17.7821, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 2.224, | |
| "grad_norm": 3.475140333175659, | |
| "learning_rate": 0.00025935828877005345, | |
| "loss": 17.6129, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 3.4734578132629395, | |
| "learning_rate": 0.0002540106951871658, | |
| "loss": 17.6856, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 2.2560000000000002, | |
| "grad_norm": 3.491572380065918, | |
| "learning_rate": 0.0002486631016042781, | |
| "loss": 17.6071, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 2.2720000000000002, | |
| "grad_norm": 3.4102542400360107, | |
| "learning_rate": 0.0002433155080213904, | |
| "loss": 17.352, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 2.288, | |
| "grad_norm": 3.393477439880371, | |
| "learning_rate": 0.00023796791443850268, | |
| "loss": 17.2612, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 2.304, | |
| "grad_norm": 3.112462282180786, | |
| "learning_rate": 0.000232620320855615, | |
| "loss": 17.3272, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 3.3398191928863525, | |
| "learning_rate": 0.00022727272727272727, | |
| "loss": 17.5815, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 2.336, | |
| "grad_norm": 3.5039889812469482, | |
| "learning_rate": 0.00022192513368983957, | |
| "loss": 17.7557, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 2.352, | |
| "grad_norm": 3.532892942428589, | |
| "learning_rate": 0.0002165775401069519, | |
| "loss": 18.0523, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 2.368, | |
| "grad_norm": 3.2969062328338623, | |
| "learning_rate": 0.00021122994652406418, | |
| "loss": 17.7496, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 2.384, | |
| "grad_norm": 3.262855291366577, | |
| "learning_rate": 0.00020588235294117645, | |
| "loss": 17.793, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 3.459914445877075, | |
| "learning_rate": 0.00020053475935828877, | |
| "loss": 17.9245, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 2.416, | |
| "grad_norm": 3.6749696731567383, | |
| "learning_rate": 0.00019518716577540107, | |
| "loss": 17.7125, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 2.432, | |
| "grad_norm": 3.266754150390625, | |
| "learning_rate": 0.0001898395721925134, | |
| "loss": 17.5905, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 2.448, | |
| "grad_norm": 3.1848971843719482, | |
| "learning_rate": 0.00018449197860962566, | |
| "loss": 17.523, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 2.464, | |
| "grad_norm": 3.2962844371795654, | |
| "learning_rate": 0.00017914438502673795, | |
| "loss": 17.5297, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 3.4688000679016113, | |
| "learning_rate": 0.00017379679144385028, | |
| "loss": 17.6315, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 2.496, | |
| "grad_norm": 3.4146833419799805, | |
| "learning_rate": 0.00016844919786096257, | |
| "loss": 17.5776, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 2.512, | |
| "grad_norm": 3.3122944831848145, | |
| "learning_rate": 0.0001631016042780749, | |
| "loss": 17.7264, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 2.528, | |
| "grad_norm": 3.2939462661743164, | |
| "learning_rate": 0.00015775401069518716, | |
| "loss": 17.48, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 2.544, | |
| "grad_norm": 3.8504631519317627, | |
| "learning_rate": 0.00015240641711229946, | |
| "loss": 17.3854, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 4.062356948852539, | |
| "learning_rate": 0.00014705882352941178, | |
| "loss": 17.6811, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 2.576, | |
| "grad_norm": 3.741989850997925, | |
| "learning_rate": 0.00014171122994652407, | |
| "loss": 17.4078, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 2.592, | |
| "grad_norm": 3.7287967205047607, | |
| "learning_rate": 0.00013636363636363637, | |
| "loss": 17.3517, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 2.608, | |
| "grad_norm": 3.6224465370178223, | |
| "learning_rate": 0.00013101604278074866, | |
| "loss": 17.254, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 2.624, | |
| "grad_norm": 3.5674147605895996, | |
| "learning_rate": 0.00012566844919786096, | |
| "loss": 17.869, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "grad_norm": 3.722736358642578, | |
| "learning_rate": 0.00012032085561497325, | |
| "loss": 17.7399, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 2.656, | |
| "grad_norm": 3.6463096141815186, | |
| "learning_rate": 0.00011497326203208556, | |
| "loss": 17.5016, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 2.672, | |
| "grad_norm": 3.5358524322509766, | |
| "learning_rate": 0.00010962566844919786, | |
| "loss": 17.0355, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 2.6879999999999997, | |
| "grad_norm": 3.5321309566497803, | |
| "learning_rate": 0.00010427807486631017, | |
| "loss": 17.5089, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 2.7039999999999997, | |
| "grad_norm": 3.4019291400909424, | |
| "learning_rate": 9.893048128342247e-05, | |
| "loss": 17.3768, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 2.7199999999999998, | |
| "grad_norm": 3.4486570358276367, | |
| "learning_rate": 9.358288770053476e-05, | |
| "loss": 17.488, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 2.7359999999999998, | |
| "grad_norm": 3.7740256786346436, | |
| "learning_rate": 8.823529411764706e-05, | |
| "loss": 17.5768, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 2.752, | |
| "grad_norm": 3.5659339427948, | |
| "learning_rate": 8.288770053475936e-05, | |
| "loss": 17.6865, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 2.768, | |
| "grad_norm": 3.3678972721099854, | |
| "learning_rate": 7.754010695187167e-05, | |
| "loss": 17.4687, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 2.784, | |
| "grad_norm": 3.585134506225586, | |
| "learning_rate": 7.219251336898395e-05, | |
| "loss": 17.536, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 3.6471846103668213, | |
| "learning_rate": 6.684491978609626e-05, | |
| "loss": 17.6269, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 2.816, | |
| "grad_norm": 3.533790111541748, | |
| "learning_rate": 6.149732620320857e-05, | |
| "loss": 17.5771, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 2.832, | |
| "grad_norm": 3.7971367835998535, | |
| "learning_rate": 5.614973262032086e-05, | |
| "loss": 17.874, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 2.848, | |
| "grad_norm": 3.391874074935913, | |
| "learning_rate": 5.080213903743316e-05, | |
| "loss": 17.2528, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 2.864, | |
| "grad_norm": 3.069033145904541, | |
| "learning_rate": 4.545454545454546e-05, | |
| "loss": 17.6175, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 3.780275821685791, | |
| "learning_rate": 4.0106951871657754e-05, | |
| "loss": 17.2663, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 2.896, | |
| "grad_norm": 3.3377978801727295, | |
| "learning_rate": 3.4759358288770055e-05, | |
| "loss": 17.3711, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 2.912, | |
| "grad_norm": 3.356203317642212, | |
| "learning_rate": 2.9411764705882354e-05, | |
| "loss": 17.6077, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 2.928, | |
| "grad_norm": 3.302241563796997, | |
| "learning_rate": 2.4064171122994652e-05, | |
| "loss": 17.4777, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 2.944, | |
| "grad_norm": 3.73811411857605, | |
| "learning_rate": 1.871657754010695e-05, | |
| "loss": 17.3149, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "grad_norm": 3.392902135848999, | |
| "learning_rate": 1.336898395721925e-05, | |
| "loss": 17.8118, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 2.976, | |
| "grad_norm": 3.8080010414123535, | |
| "learning_rate": 8.021390374331552e-06, | |
| "loss": 17.1875, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 2.992, | |
| "grad_norm": 3.5202646255493164, | |
| "learning_rate": 2.67379679144385e-06, | |
| "loss": 17.7556, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "step": 375, | |
| "total_flos": 2.6461914289864704e+17, | |
| "train_loss": 18.264725362141927, | |
| "train_runtime": 1944.3243, | |
| "train_samples_per_second": 24.687, | |
| "train_steps_per_second": 0.193 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 2.2290163040161133, | |
| "eval_runtime": 83.3238, | |
| "eval_samples_per_second": 24.003, | |
| "eval_steps_per_second": 3.0, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 2.226619243621826, | |
| "eval_runtime": 83.9815, | |
| "eval_samples_per_second": 23.815, | |
| "eval_steps_per_second": 2.977, | |
| "step": 375 | |
| } | |
| ], | |
| "logging_steps": 2, | |
| "max_steps": 375, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.6461914289864704e+17, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |