| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.0718232044198897, | |
| "eval_steps": 500, | |
| "global_step": 3000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.006906077348066298, | |
| "grad_norm": 10.042680740356445, | |
| "learning_rate": 4.000000000000001e-06, | |
| "loss": 0.9482, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.013812154696132596, | |
| "grad_norm": 2.674130916595459, | |
| "learning_rate": 8.444444444444446e-06, | |
| "loss": 0.4538, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.020718232044198894, | |
| "grad_norm": 2.9186959266662598, | |
| "learning_rate": 1.2888888888888889e-05, | |
| "loss": 0.2859, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.027624309392265192, | |
| "grad_norm": 2.0180764198303223, | |
| "learning_rate": 1.7333333333333336e-05, | |
| "loss": 0.2322, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.034530386740331494, | |
| "grad_norm": 1.7288544178009033, | |
| "learning_rate": 2.177777777777778e-05, | |
| "loss": 0.2054, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.04143646408839779, | |
| "grad_norm": 2.643184185028076, | |
| "learning_rate": 2.6222222222222226e-05, | |
| "loss": 0.2063, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.04834254143646409, | |
| "grad_norm": 1.8499925136566162, | |
| "learning_rate": 3.066666666666667e-05, | |
| "loss": 0.16, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.055248618784530384, | |
| "grad_norm": 1.6038734912872314, | |
| "learning_rate": 3.511111111111111e-05, | |
| "loss": 0.1561, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.062154696132596686, | |
| "grad_norm": 0.8061902523040771, | |
| "learning_rate": 3.9555555555555556e-05, | |
| "loss": 0.1359, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.06906077348066299, | |
| "grad_norm": 2.517512083053589, | |
| "learning_rate": 4.4000000000000006e-05, | |
| "loss": 0.1365, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.07596685082872928, | |
| "grad_norm": 1.773244857788086, | |
| "learning_rate": 4.844444444444445e-05, | |
| "loss": 0.1359, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.08287292817679558, | |
| "grad_norm": 1.7556045055389404, | |
| "learning_rate": 5.2888888888888885e-05, | |
| "loss": 0.1234, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.08977900552486189, | |
| "grad_norm": 1.2455209493637085, | |
| "learning_rate": 5.7333333333333336e-05, | |
| "loss": 0.116, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.09668508287292818, | |
| "grad_norm": 1.0859894752502441, | |
| "learning_rate": 6.177777777777779e-05, | |
| "loss": 0.1172, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.10359116022099447, | |
| "grad_norm": 1.5400012731552124, | |
| "learning_rate": 6.622222222222224e-05, | |
| "loss": 0.1125, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.11049723756906077, | |
| "grad_norm": 0.9862754940986633, | |
| "learning_rate": 7.066666666666667e-05, | |
| "loss": 0.107, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.11740331491712708, | |
| "grad_norm": 0.7784848809242249, | |
| "learning_rate": 7.511111111111111e-05, | |
| "loss": 0.1013, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.12430939226519337, | |
| "grad_norm": 1.40177583694458, | |
| "learning_rate": 7.955555555555556e-05, | |
| "loss": 0.103, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.13121546961325967, | |
| "grad_norm": 0.9513179659843445, | |
| "learning_rate": 8.4e-05, | |
| "loss": 0.0903, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.13812154696132597, | |
| "grad_norm": 1.2130392789840698, | |
| "learning_rate": 8.844444444444445e-05, | |
| "loss": 0.1034, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.14502762430939226, | |
| "grad_norm": 0.6497124433517456, | |
| "learning_rate": 9.28888888888889e-05, | |
| "loss": 0.1053, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.15193370165745856, | |
| "grad_norm": 1.1570148468017578, | |
| "learning_rate": 9.733333333333335e-05, | |
| "loss": 0.1007, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.15883977900552487, | |
| "grad_norm": 1.1539334058761597, | |
| "learning_rate": 9.999978398337033e-05, | |
| "loss": 0.0842, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.16574585635359115, | |
| "grad_norm": 1.02969491481781, | |
| "learning_rate": 9.999735381772228e-05, | |
| "loss": 0.0855, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.17265193370165746, | |
| "grad_norm": 0.652882993221283, | |
| "learning_rate": 9.999222359731514e-05, | |
| "loss": 0.0817, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.17955801104972377, | |
| "grad_norm": 0.7674477100372314, | |
| "learning_rate": 9.998439359920107e-05, | |
| "loss": 0.0707, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.18646408839779005, | |
| "grad_norm": 0.755526065826416, | |
| "learning_rate": 9.997386424623091e-05, | |
| "loss": 0.0815, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.19337016574585636, | |
| "grad_norm": 0.7330914735794067, | |
| "learning_rate": 9.996063610703137e-05, | |
| "loss": 0.0773, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.20027624309392264, | |
| "grad_norm": 0.5776280164718628, | |
| "learning_rate": 9.994470989597423e-05, | |
| "loss": 0.0861, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.20718232044198895, | |
| "grad_norm": 0.6976489424705505, | |
| "learning_rate": 9.992608647313789e-05, | |
| "loss": 0.071, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.21408839779005526, | |
| "grad_norm": 0.550521194934845, | |
| "learning_rate": 9.990476684426075e-05, | |
| "loss": 0.0743, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.22099447513812154, | |
| "grad_norm": 0.5623897314071655, | |
| "learning_rate": 9.988075216068711e-05, | |
| "loss": 0.068, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.22790055248618785, | |
| "grad_norm": 0.5536251068115234, | |
| "learning_rate": 9.98540437193048e-05, | |
| "loss": 0.0728, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.23480662983425415, | |
| "grad_norm": 0.745213508605957, | |
| "learning_rate": 9.982464296247522e-05, | |
| "loss": 0.0662, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.24171270718232044, | |
| "grad_norm": 0.9005521535873413, | |
| "learning_rate": 9.979255147795549e-05, | |
| "loss": 0.0645, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.24861878453038674, | |
| "grad_norm": 0.4740263819694519, | |
| "learning_rate": 9.975777099881263e-05, | |
| "loss": 0.0669, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.255524861878453, | |
| "grad_norm": 0.7341731786727905, | |
| "learning_rate": 9.972030340333001e-05, | |
| "loss": 0.063, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.26243093922651933, | |
| "grad_norm": 0.5400853753089905, | |
| "learning_rate": 9.968015071490591e-05, | |
| "loss": 0.0574, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.26933701657458564, | |
| "grad_norm": 0.7812430262565613, | |
| "learning_rate": 9.963731510194425e-05, | |
| "loss": 0.0602, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.27624309392265195, | |
| "grad_norm": 0.6542518734931946, | |
| "learning_rate": 9.959179887773744e-05, | |
| "loss": 0.0522, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.28314917127071826, | |
| "grad_norm": 0.619616687297821, | |
| "learning_rate": 9.954360450034155e-05, | |
| "loss": 0.0657, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.2900552486187845, | |
| "grad_norm": 0.7797542214393616, | |
| "learning_rate": 9.949273457244348e-05, | |
| "loss": 0.0568, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.2969613259668508, | |
| "grad_norm": 0.6183603405952454, | |
| "learning_rate": 9.943919184122043e-05, | |
| "loss": 0.0589, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.30386740331491713, | |
| "grad_norm": 0.7843531966209412, | |
| "learning_rate": 9.938297919819157e-05, | |
| "loss": 0.0622, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.31077348066298344, | |
| "grad_norm": 0.629660964012146, | |
| "learning_rate": 9.932409967906184e-05, | |
| "loss": 0.0626, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.31767955801104975, | |
| "grad_norm": 0.693767786026001, | |
| "learning_rate": 9.926255646355804e-05, | |
| "loss": 0.0648, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.324585635359116, | |
| "grad_norm": 0.5813653469085693, | |
| "learning_rate": 9.91983528752571e-05, | |
| "loss": 0.0606, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.3314917127071823, | |
| "grad_norm": 0.88108229637146, | |
| "learning_rate": 9.91314923814066e-05, | |
| "loss": 0.0579, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.3383977900552486, | |
| "grad_norm": 1.2719063758850098, | |
| "learning_rate": 9.906197859273753e-05, | |
| "loss": 0.0601, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.3453038674033149, | |
| "grad_norm": 0.566552460193634, | |
| "learning_rate": 9.89898152632693e-05, | |
| "loss": 0.0578, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.35220994475138123, | |
| "grad_norm": 0.5750138163566589, | |
| "learning_rate": 9.891500629010694e-05, | |
| "loss": 0.0633, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.35911602209944754, | |
| "grad_norm": 0.6586229801177979, | |
| "learning_rate": 9.88375557132308e-05, | |
| "loss": 0.0563, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.3660220994475138, | |
| "grad_norm": 0.6225771903991699, | |
| "learning_rate": 9.875746771527816e-05, | |
| "loss": 0.0568, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.3729281767955801, | |
| "grad_norm": 0.657524049282074, | |
| "learning_rate": 9.867474662131754e-05, | |
| "loss": 0.0489, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.3798342541436464, | |
| "grad_norm": 0.7671463489532471, | |
| "learning_rate": 9.858939689861506e-05, | |
| "loss": 0.0538, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.3867403314917127, | |
| "grad_norm": 0.42189261317253113, | |
| "learning_rate": 9.850142315639312e-05, | |
| "loss": 0.046, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.393646408839779, | |
| "grad_norm": 0.48630833625793457, | |
| "learning_rate": 9.841083014558158e-05, | |
| "loss": 0.0505, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.4005524861878453, | |
| "grad_norm": 0.8498868346214294, | |
| "learning_rate": 9.831762275856118e-05, | |
| "loss": 0.0529, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.4074585635359116, | |
| "grad_norm": 0.5209752321243286, | |
| "learning_rate": 9.82218060288993e-05, | |
| "loss": 0.0504, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.4143646408839779, | |
| "grad_norm": 0.46043241024017334, | |
| "learning_rate": 9.81233851310781e-05, | |
| "loss": 0.0536, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.4212707182320442, | |
| "grad_norm": 0.8280401229858398, | |
| "learning_rate": 9.802236538021518e-05, | |
| "loss": 0.0558, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.4281767955801105, | |
| "grad_norm": 0.9656640887260437, | |
| "learning_rate": 9.791875223177643e-05, | |
| "loss": 0.0512, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.4350828729281768, | |
| "grad_norm": 0.6366522312164307, | |
| "learning_rate": 9.781255128128148e-05, | |
| "loss": 0.0444, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.4419889502762431, | |
| "grad_norm": 0.4692375957965851, | |
| "learning_rate": 9.77037682640015e-05, | |
| "loss": 0.0465, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.4488950276243094, | |
| "grad_norm": 0.450482577085495, | |
| "learning_rate": 9.759240905464946e-05, | |
| "loss": 0.0481, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.4558011049723757, | |
| "grad_norm": 0.47211357951164246, | |
| "learning_rate": 9.74784796670629e-05, | |
| "loss": 0.05, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.462707182320442, | |
| "grad_norm": 0.6791536211967468, | |
| "learning_rate": 9.736198625387916e-05, | |
| "loss": 0.0578, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.4696132596685083, | |
| "grad_norm": 0.7372534275054932, | |
| "learning_rate": 9.724293510620306e-05, | |
| "loss": 0.0585, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.47651933701657456, | |
| "grad_norm": 0.49842604994773865, | |
| "learning_rate": 9.712133265326722e-05, | |
| "loss": 0.0489, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.48342541436464087, | |
| "grad_norm": 0.35578662157058716, | |
| "learning_rate": 9.699718546208484e-05, | |
| "loss": 0.0519, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.4903314917127072, | |
| "grad_norm": 0.5673187375068665, | |
| "learning_rate": 9.6870500237095e-05, | |
| "loss": 0.0623, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.4972375690607735, | |
| "grad_norm": 0.5961718559265137, | |
| "learning_rate": 9.674128381980072e-05, | |
| "loss": 0.0497, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.5041436464088398, | |
| "grad_norm": 0.47361811995506287, | |
| "learning_rate": 9.660954318839933e-05, | |
| "loss": 0.0502, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.511049723756906, | |
| "grad_norm": 0.7358847260475159, | |
| "learning_rate": 9.647528545740573e-05, | |
| "loss": 0.0434, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.5179558011049724, | |
| "grad_norm": 0.7085921168327332, | |
| "learning_rate": 9.633851787726815e-05, | |
| "loss": 0.0534, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.5248618784530387, | |
| "grad_norm": 0.5258869528770447, | |
| "learning_rate": 9.619924783397661e-05, | |
| "loss": 0.0425, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.5317679558011049, | |
| "grad_norm": 0.4688310921192169, | |
| "learning_rate": 9.6057482848664e-05, | |
| "loss": 0.0439, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.5386740331491713, | |
| "grad_norm": 0.4370345175266266, | |
| "learning_rate": 9.591323057719998e-05, | |
| "loss": 0.0464, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.5455801104972375, | |
| "grad_norm": 0.45098453760147095, | |
| "learning_rate": 9.576649880977748e-05, | |
| "loss": 0.0428, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.5524861878453039, | |
| "grad_norm": 0.36217615008354187, | |
| "learning_rate": 9.561729547049199e-05, | |
| "loss": 0.0514, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.5593922651933702, | |
| "grad_norm": 0.7802874445915222, | |
| "learning_rate": 9.546562861691369e-05, | |
| "loss": 0.0487, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.5662983425414365, | |
| "grad_norm": 0.4431716799736023, | |
| "learning_rate": 9.531150643965223e-05, | |
| "loss": 0.0511, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.5732044198895028, | |
| "grad_norm": 0.9877989292144775, | |
| "learning_rate": 9.51549372619145e-05, | |
| "loss": 0.0498, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.580110497237569, | |
| "grad_norm": 0.5740323066711426, | |
| "learning_rate": 9.499592953905504e-05, | |
| "loss": 0.0464, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.5870165745856354, | |
| "grad_norm": 0.3987601101398468, | |
| "learning_rate": 9.483449185811948e-05, | |
| "loss": 0.0569, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.5939226519337016, | |
| "grad_norm": 0.6680428385734558, | |
| "learning_rate": 9.467063293738081e-05, | |
| "loss": 0.049, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.600828729281768, | |
| "grad_norm": 0.5118589997291565, | |
| "learning_rate": 9.450436162586853e-05, | |
| "loss": 0.0462, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.6077348066298343, | |
| "grad_norm": 0.4932630658149719, | |
| "learning_rate": 9.433568690289075e-05, | |
| "loss": 0.0439, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.6146408839779005, | |
| "grad_norm": 0.3190293312072754, | |
| "learning_rate": 9.416461787754935e-05, | |
| "loss": 0.0411, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.6215469613259669, | |
| "grad_norm": 0.3397425711154938, | |
| "learning_rate": 9.3991163788248e-05, | |
| "loss": 0.0458, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.6284530386740331, | |
| "grad_norm": 0.7478238344192505, | |
| "learning_rate": 9.381533400219318e-05, | |
| "loss": 0.0419, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.6353591160220995, | |
| "grad_norm": 0.5642843842506409, | |
| "learning_rate": 9.36371380148885e-05, | |
| "loss": 0.0461, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.6422651933701657, | |
| "grad_norm": 0.3694552183151245, | |
| "learning_rate": 9.345658544962166e-05, | |
| "loss": 0.0415, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.649171270718232, | |
| "grad_norm": 0.45040807127952576, | |
| "learning_rate": 9.327368605694502e-05, | |
| "loss": 0.043, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.6560773480662984, | |
| "grad_norm": 0.31986427307128906, | |
| "learning_rate": 9.30884497141488e-05, | |
| "loss": 0.0478, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.6629834254143646, | |
| "grad_norm": 0.6989539265632629, | |
| "learning_rate": 9.290088642472783e-05, | |
| "loss": 0.0396, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.669889502762431, | |
| "grad_norm": 0.5615038871765137, | |
| "learning_rate": 9.27110063178412e-05, | |
| "loss": 0.0419, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.6767955801104972, | |
| "grad_norm": 0.4937025010585785, | |
| "learning_rate": 9.251881964776535e-05, | |
| "loss": 0.044, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.6837016574585635, | |
| "grad_norm": 0.4085635840892792, | |
| "learning_rate": 9.232433679334018e-05, | |
| "loss": 0.0431, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.6906077348066298, | |
| "grad_norm": 0.548740029335022, | |
| "learning_rate": 9.212756825740873e-05, | |
| "loss": 0.0408, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.6975138121546961, | |
| "grad_norm": 0.5693575739860535, | |
| "learning_rate": 9.192852466624981e-05, | |
| "loss": 0.0411, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.7044198895027625, | |
| "grad_norm": 0.532451868057251, | |
| "learning_rate": 9.172721676900419e-05, | |
| "loss": 0.0426, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.7113259668508287, | |
| "grad_norm": 0.6173306703567505, | |
| "learning_rate": 9.152365543709416e-05, | |
| "loss": 0.0452, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.7182320441988951, | |
| "grad_norm": 0.5103174448013306, | |
| "learning_rate": 9.131785166363638e-05, | |
| "loss": 0.0415, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.7251381215469613, | |
| "grad_norm": 0.3975609242916107, | |
| "learning_rate": 9.11098165628482e-05, | |
| "loss": 0.0346, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.7320441988950276, | |
| "grad_norm": 0.48710817098617554, | |
| "learning_rate": 9.089956136944751e-05, | |
| "loss": 0.0477, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.738950276243094, | |
| "grad_norm": 0.44057202339172363, | |
| "learning_rate": 9.06870974380459e-05, | |
| "loss": 0.0449, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.7458563535911602, | |
| "grad_norm": 0.3550662100315094, | |
| "learning_rate": 9.047243624253564e-05, | |
| "loss": 0.0458, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.7527624309392266, | |
| "grad_norm": 0.5015395879745483, | |
| "learning_rate": 9.025558937546988e-05, | |
| "loss": 0.0347, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.7596685082872928, | |
| "grad_norm": 0.6827583312988281, | |
| "learning_rate": 9.003656854743667e-05, | |
| "loss": 0.0397, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.7665745856353591, | |
| "grad_norm": 0.4369044601917267, | |
| "learning_rate": 8.981538558642663e-05, | |
| "loss": 0.0363, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.7734806629834254, | |
| "grad_norm": 0.41478782892227173, | |
| "learning_rate": 8.959205243719402e-05, | |
| "loss": 0.037, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.7803867403314917, | |
| "grad_norm": 0.3793696463108063, | |
| "learning_rate": 8.936658116061178e-05, | |
| "loss": 0.0462, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.787292817679558, | |
| "grad_norm": 0.39440885186195374, | |
| "learning_rate": 8.913898393302021e-05, | |
| "loss": 0.0435, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.7941988950276243, | |
| "grad_norm": 0.3810882270336151, | |
| "learning_rate": 8.890927304556935e-05, | |
| "loss": 0.0394, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.8011049723756906, | |
| "grad_norm": 0.2765768766403198, | |
| "learning_rate": 8.867746090355525e-05, | |
| "loss": 0.0415, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.8080110497237569, | |
| "grad_norm": 0.626876711845398, | |
| "learning_rate": 8.844356002574996e-05, | |
| "loss": 0.0383, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.8149171270718232, | |
| "grad_norm": 0.4267805516719818, | |
| "learning_rate": 8.820758304372557e-05, | |
| "loss": 0.0345, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.8218232044198895, | |
| "grad_norm": 0.6483110785484314, | |
| "learning_rate": 8.796954270117199e-05, | |
| "loss": 0.0383, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.8287292817679558, | |
| "grad_norm": 0.40159228444099426, | |
| "learning_rate": 8.772945185320875e-05, | |
| "loss": 0.0399, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.835635359116022, | |
| "grad_norm": 0.29305708408355713, | |
| "learning_rate": 8.74873234656908e-05, | |
| "loss": 0.0387, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.8425414364640884, | |
| "grad_norm": 0.49229493737220764, | |
| "learning_rate": 8.724317061450824e-05, | |
| "loss": 0.036, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.8494475138121547, | |
| "grad_norm": 0.328227698802948, | |
| "learning_rate": 8.699700648488027e-05, | |
| "loss": 0.0379, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.856353591160221, | |
| "grad_norm": 0.3456778824329376, | |
| "learning_rate": 8.674884437064302e-05, | |
| "loss": 0.0369, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.8632596685082873, | |
| "grad_norm": 0.5480336546897888, | |
| "learning_rate": 8.64986976735317e-05, | |
| "loss": 0.0411, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.8701657458563536, | |
| "grad_norm": 0.37527692317962646, | |
| "learning_rate": 8.624657990245687e-05, | |
| "loss": 0.0348, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.8770718232044199, | |
| "grad_norm": 0.5111171007156372, | |
| "learning_rate": 8.599250467277483e-05, | |
| "loss": 0.0427, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.8839779005524862, | |
| "grad_norm": 0.416939377784729, | |
| "learning_rate": 8.573648570555245e-05, | |
| "loss": 0.0416, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.8908839779005525, | |
| "grad_norm": 0.36200088262557983, | |
| "learning_rate": 8.547853682682604e-05, | |
| "loss": 0.0408, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.8977900552486188, | |
| "grad_norm": 0.3712616264820099, | |
| "learning_rate": 8.521867196685482e-05, | |
| "loss": 0.0372, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.9046961325966851, | |
| "grad_norm": 0.42901334166526794, | |
| "learning_rate": 8.495690515936852e-05, | |
| "loss": 0.0375, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.9116022099447514, | |
| "grad_norm": 0.44897764921188354, | |
| "learning_rate": 8.46932505408096e-05, | |
| "loss": 0.0386, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.9185082872928176, | |
| "grad_norm": 0.50035160779953, | |
| "learning_rate": 8.442772234956972e-05, | |
| "loss": 0.0344, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.925414364640884, | |
| "grad_norm": 0.3799741864204407, | |
| "learning_rate": 8.416033492522097e-05, | |
| "loss": 0.0395, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.9323204419889503, | |
| "grad_norm": 0.4632473289966583, | |
| "learning_rate": 8.389110270774128e-05, | |
| "loss": 0.0336, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.9392265193370166, | |
| "grad_norm": 0.311646044254303, | |
| "learning_rate": 8.362004023673474e-05, | |
| "loss": 0.036, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.9461325966850829, | |
| "grad_norm": 0.4639626145362854, | |
| "learning_rate": 8.334716215064637e-05, | |
| "loss": 0.0286, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.9530386740331491, | |
| "grad_norm": 0.34951701760292053, | |
| "learning_rate": 8.30724831859716e-05, | |
| "loss": 0.0352, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.9599447513812155, | |
| "grad_norm": 0.42107245326042175, | |
| "learning_rate": 8.279601817646036e-05, | |
| "loss": 0.0321, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.9668508287292817, | |
| "grad_norm": 0.3809627294540405, | |
| "learning_rate": 8.251778205231617e-05, | |
| "loss": 0.0398, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.9737569060773481, | |
| "grad_norm": 0.2785834074020386, | |
| "learning_rate": 8.223778983938962e-05, | |
| "loss": 0.031, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.9806629834254144, | |
| "grad_norm": 0.4540090262889862, | |
| "learning_rate": 8.19560566583671e-05, | |
| "loss": 0.039, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.9875690607734806, | |
| "grad_norm": 0.3515707552433014, | |
| "learning_rate": 8.167259772395415e-05, | |
| "loss": 0.0317, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.994475138121547, | |
| "grad_norm": 0.4071841537952423, | |
| "learning_rate": 8.138742834405386e-05, | |
| "loss": 0.0382, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 1.0013812154696133, | |
| "grad_norm": 0.5569164752960205, | |
| "learning_rate": 8.110056391894005e-05, | |
| "loss": 0.0395, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 1.0082872928176796, | |
| "grad_norm": 0.41124194860458374, | |
| "learning_rate": 8.081201994042573e-05, | |
| "loss": 0.0362, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 1.0151933701657458, | |
| "grad_norm": 0.414541631937027, | |
| "learning_rate": 8.052181199102646e-05, | |
| "loss": 0.0394, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 1.022099447513812, | |
| "grad_norm": 0.47132864594459534, | |
| "learning_rate": 8.022995574311876e-05, | |
| "loss": 0.0389, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 1.0290055248618784, | |
| "grad_norm": 0.39693185687065125, | |
| "learning_rate": 7.993646695809378e-05, | |
| "loss": 0.0377, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 1.0359116022099448, | |
| "grad_norm": 0.39153772592544556, | |
| "learning_rate": 7.96413614855062e-05, | |
| "loss": 0.0404, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.042817679558011, | |
| "grad_norm": 0.49630650877952576, | |
| "learning_rate": 8.364451042705998e-05, | |
| "loss": 0.0372, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 1.0497237569060773, | |
| "grad_norm": 0.2868261933326721, | |
| "learning_rate": 8.33991480157679e-05, | |
| "loss": 0.0363, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 1.0566298342541436, | |
| "grad_norm": 0.38209059834480286, | |
| "learning_rate": 8.315232461754338e-05, | |
| "loss": 0.0395, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 1.06353591160221, | |
| "grad_norm": 0.3526831567287445, | |
| "learning_rate": 8.290405102924144e-05, | |
| "loss": 0.0385, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 1.0704419889502763, | |
| "grad_norm": 0.2756149172782898, | |
| "learning_rate": 8.265433811115316e-05, | |
| "loss": 0.0392, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 1.0773480662983426, | |
| "grad_norm": 0.23444347083568573, | |
| "learning_rate": 8.240319678653049e-05, | |
| "loss": 0.0321, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 1.0842541436464088, | |
| "grad_norm": 0.30609193444252014, | |
| "learning_rate": 8.215063804110857e-05, | |
| "loss": 0.0347, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 1.091160220994475, | |
| "grad_norm": 0.28997406363487244, | |
| "learning_rate": 8.189667292262512e-05, | |
| "loss": 0.0331, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 1.0980662983425415, | |
| "grad_norm": 0.3041757643222809, | |
| "learning_rate": 8.164131254033716e-05, | |
| "loss": 0.0373, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 1.1049723756906078, | |
| "grad_norm": 0.40974169969558716, | |
| "learning_rate": 8.138456806453503e-05, | |
| "loss": 0.0335, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.111878453038674, | |
| "grad_norm": 0.46436306834220886, | |
| "learning_rate": 8.112645072605386e-05, | |
| "loss": 0.0301, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 1.1187845303867403, | |
| "grad_norm": 0.3110021948814392, | |
| "learning_rate": 8.086697181578222e-05, | |
| "loss": 0.0292, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 1.1256906077348066, | |
| "grad_norm": 0.3286142647266388, | |
| "learning_rate": 8.060614268416823e-05, | |
| "loss": 0.0337, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 1.132596685082873, | |
| "grad_norm": 0.457504004240036, | |
| "learning_rate": 8.034397474072309e-05, | |
| "loss": 0.032, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 1.1395027624309393, | |
| "grad_norm": 0.4233052730560303, | |
| "learning_rate": 8.008047945352193e-05, | |
| "loss": 0.0397, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 1.1464088397790055, | |
| "grad_norm": 0.5389223098754883, | |
| "learning_rate": 7.981566834870225e-05, | |
| "loss": 0.0314, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 1.1533149171270718, | |
| "grad_norm": 0.44030722975730896, | |
| "learning_rate": 7.954955300995961e-05, | |
| "loss": 0.0324, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 1.160220994475138, | |
| "grad_norm": 0.4734176993370056, | |
| "learning_rate": 7.928214507804104e-05, | |
| "loss": 0.0382, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 1.1671270718232045, | |
| "grad_norm": 0.4901622235774994, | |
| "learning_rate": 7.901345625023576e-05, | |
| "loss": 0.0353, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 1.1740331491712708, | |
| "grad_norm": 0.30757012963294983, | |
| "learning_rate": 7.874349827986354e-05, | |
| "loss": 0.0313, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.180939226519337, | |
| "grad_norm": 0.421566367149353, | |
| "learning_rate": 7.847228297576053e-05, | |
| "loss": 0.0343, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 1.1878453038674033, | |
| "grad_norm": 0.3917239308357239, | |
| "learning_rate": 7.819982220176276e-05, | |
| "loss": 0.0363, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 1.1947513812154695, | |
| "grad_norm": 0.386471688747406, | |
| "learning_rate": 7.792612787618714e-05, | |
| "loss": 0.033, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 1.201657458563536, | |
| "grad_norm": 0.382355660200119, | |
| "learning_rate": 7.765121197131009e-05, | |
| "loss": 0.0327, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 1.2085635359116023, | |
| "grad_norm": 0.3192639946937561, | |
| "learning_rate": 7.737508651284391e-05, | |
| "loss": 0.0326, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 1.2154696132596685, | |
| "grad_norm": 0.47056350111961365, | |
| "learning_rate": 7.709776357941069e-05, | |
| "loss": 0.0341, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 1.2223756906077348, | |
| "grad_norm": 0.47461798787117004, | |
| "learning_rate": 7.681925530201392e-05, | |
| "loss": 0.0345, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 1.229281767955801, | |
| "grad_norm": 0.3219474256038666, | |
| "learning_rate": 7.65395738635079e-05, | |
| "loss": 0.0317, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 1.2361878453038675, | |
| "grad_norm": 0.39588093757629395, | |
| "learning_rate": 7.62587314980648e-05, | |
| "loss": 0.0378, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 1.2430939226519337, | |
| "grad_norm": 0.42843881249427795, | |
| "learning_rate": 7.597674049063947e-05, | |
| "loss": 0.0308, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 0.433452844619751, | |
| "learning_rate": 7.569361317643211e-05, | |
| "loss": 0.0327, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 1.2569060773480663, | |
| "grad_norm": 0.38872626423835754, | |
| "learning_rate": 7.540936194034865e-05, | |
| "loss": 0.0292, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 1.2638121546961325, | |
| "grad_norm": 0.37333056330680847, | |
| "learning_rate": 7.512399921645901e-05, | |
| "loss": 0.0287, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 1.270718232044199, | |
| "grad_norm": 0.37983638048171997, | |
| "learning_rate": 7.483753748745317e-05, | |
| "loss": 0.0283, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 1.2776243093922652, | |
| "grad_norm": 0.29978811740875244, | |
| "learning_rate": 7.454998928409516e-05, | |
| "loss": 0.0287, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 1.2845303867403315, | |
| "grad_norm": 0.3676117956638336, | |
| "learning_rate": 7.426136718467493e-05, | |
| "loss": 0.0337, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 1.2914364640883977, | |
| "grad_norm": 0.32428571581840515, | |
| "learning_rate": 7.397168381445812e-05, | |
| "loss": 0.0294, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 1.298342541436464, | |
| "grad_norm": 0.5252200365066528, | |
| "learning_rate": 7.368095184513377e-05, | |
| "loss": 0.0321, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 1.3052486187845305, | |
| "grad_norm": 0.570086658000946, | |
| "learning_rate": 7.338918399426005e-05, | |
| "loss": 0.0262, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 1.3121546961325967, | |
| "grad_norm": 0.2742464542388916, | |
| "learning_rate": 7.309639302470801e-05, | |
| "loss": 0.0334, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.319060773480663, | |
| "grad_norm": 0.4127005338668823, | |
| "learning_rate": 7.280259174410312e-05, | |
| "loss": 0.0329, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 1.3259668508287292, | |
| "grad_norm": 0.29896727204322815, | |
| "learning_rate": 7.250779300426517e-05, | |
| "loss": 0.0299, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 1.3328729281767955, | |
| "grad_norm": 0.25678521394729614, | |
| "learning_rate": 7.22120097006461e-05, | |
| "loss": 0.0288, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 1.339779005524862, | |
| "grad_norm": 0.40922263264656067, | |
| "learning_rate": 7.191525477176577e-05, | |
| "loss": 0.0312, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 1.3466850828729282, | |
| "grad_norm": 0.26698246598243713, | |
| "learning_rate": 7.161754119864616e-05, | |
| "loss": 0.0306, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 1.3535911602209945, | |
| "grad_norm": 0.3253629207611084, | |
| "learning_rate": 7.131888200424339e-05, | |
| "loss": 0.0257, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 1.3604972375690607, | |
| "grad_norm": 0.3369474411010742, | |
| "learning_rate": 7.101929025287816e-05, | |
| "loss": 0.0364, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 1.367403314917127, | |
| "grad_norm": 0.2908439040184021, | |
| "learning_rate": 7.071877904966423e-05, | |
| "loss": 0.0364, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 1.3743093922651934, | |
| "grad_norm": 0.31447526812553406, | |
| "learning_rate": 7.04173615399351e-05, | |
| "loss": 0.0298, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 1.3812154696132597, | |
| "grad_norm": 0.314809650182724, | |
| "learning_rate": 7.011505090866913e-05, | |
| "loss": 0.0268, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.388121546961326, | |
| "grad_norm": 0.465692400932312, | |
| "learning_rate": 6.981186037991271e-05, | |
| "loss": 0.0262, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 1.3950276243093922, | |
| "grad_norm": 0.38034024834632874, | |
| "learning_rate": 6.950780321620174e-05, | |
| "loss": 0.0282, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 1.4019337016574585, | |
| "grad_norm": 0.23804844915866852, | |
| "learning_rate": 6.920289271798157e-05, | |
| "loss": 0.0329, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 1.408839779005525, | |
| "grad_norm": 0.32702165842056274, | |
| "learning_rate": 6.889714222302517e-05, | |
| "loss": 0.0363, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 1.4157458563535912, | |
| "grad_norm": 0.2517118453979492, | |
| "learning_rate": 6.85905651058497e-05, | |
| "loss": 0.0296, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 1.4226519337016574, | |
| "grad_norm": 0.29005321860313416, | |
| "learning_rate": 6.82831747771314e-05, | |
| "loss": 0.0282, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 1.4295580110497237, | |
| "grad_norm": 0.3186084032058716, | |
| "learning_rate": 6.797498468311907e-05, | |
| "loss": 0.0269, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 1.43646408839779, | |
| "grad_norm": 0.29946812987327576, | |
| "learning_rate": 6.766600830504585e-05, | |
| "loss": 0.0295, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 1.4433701657458564, | |
| "grad_norm": 0.38879671692848206, | |
| "learning_rate": 6.735625915853942e-05, | |
| "loss": 0.0281, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 1.4502762430939227, | |
| "grad_norm": 0.33375823497772217, | |
| "learning_rate": 6.70457507930309e-05, | |
| "loss": 0.0278, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 1.457182320441989, | |
| "grad_norm": 0.36101585626602173, | |
| "learning_rate": 6.673449679116215e-05, | |
| "loss": 0.0259, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 1.4640883977900552, | |
| "grad_norm": 0.39331042766571045, | |
| "learning_rate": 6.642251076819148e-05, | |
| "loss": 0.0251, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 1.4709944751381214, | |
| "grad_norm": 0.30164289474487305, | |
| "learning_rate": 6.610980637139827e-05, | |
| "loss": 0.0308, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 1.477900552486188, | |
| "grad_norm": 0.274733304977417, | |
| "learning_rate": 6.579639727948583e-05, | |
| "loss": 0.0306, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 1.4848066298342542, | |
| "grad_norm": 0.2641105055809021, | |
| "learning_rate": 6.548229720198315e-05, | |
| "loss": 0.031, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 1.4917127071823204, | |
| "grad_norm": 0.34264707565307617, | |
| "learning_rate": 6.516751987864517e-05, | |
| "loss": 0.0303, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 1.4986187845303867, | |
| "grad_norm": 0.2634020149707794, | |
| "learning_rate": 6.485207907885175e-05, | |
| "loss": 0.0267, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 1.505524861878453, | |
| "grad_norm": 0.32609301805496216, | |
| "learning_rate": 6.453598860100536e-05, | |
| "loss": 0.0267, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 1.5124309392265194, | |
| "grad_norm": 0.3219829499721527, | |
| "learning_rate": 6.421926227192749e-05, | |
| "loss": 0.0289, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 1.5193370165745856, | |
| "grad_norm": 0.1896410584449768, | |
| "learning_rate": 6.390191394625381e-05, | |
| "loss": 0.0248, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 1.526243093922652, | |
| "grad_norm": 0.36194756627082825, | |
| "learning_rate": 6.358395750582817e-05, | |
| "loss": 0.0242, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 1.5331491712707184, | |
| "grad_norm": 0.3969047963619232, | |
| "learning_rate": 6.326540685909532e-05, | |
| "loss": 0.026, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 1.5400552486187844, | |
| "grad_norm": 0.35505595803260803, | |
| "learning_rate": 6.294627594049249e-05, | |
| "loss": 0.0296, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 1.5469613259668509, | |
| "grad_norm": 0.3605109453201294, | |
| "learning_rate": 6.262657870983989e-05, | |
| "loss": 0.0269, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 1.5538674033149171, | |
| "grad_norm": 0.28720366954803467, | |
| "learning_rate": 6.230632915173009e-05, | |
| "loss": 0.0293, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 1.5607734806629834, | |
| "grad_norm": 0.34371650218963623, | |
| "learning_rate": 6.198554127491622e-05, | |
| "loss": 0.0254, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 1.5676795580110499, | |
| "grad_norm": 0.3585488200187683, | |
| "learning_rate": 6.166422911169923e-05, | |
| "loss": 0.0303, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 1.5745856353591159, | |
| "grad_norm": 0.5171918869018555, | |
| "learning_rate": 6.1342406717314e-05, | |
| "loss": 0.0316, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 1.5814917127071824, | |
| "grad_norm": 0.4940383732318878, | |
| "learning_rate": 6.102008816931466e-05, | |
| "loss": 0.0292, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 1.5883977900552486, | |
| "grad_norm": 0.3063161075115204, | |
| "learning_rate": 6.069728756695866e-05, | |
| "loss": 0.0242, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 1.5953038674033149, | |
| "grad_norm": 0.37871360778808594, | |
| "learning_rate": 6.037401903059008e-05, | |
| "loss": 0.0296, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 1.6022099447513813, | |
| "grad_norm": 0.4769527316093445, | |
| "learning_rate": 6.005029670102195e-05, | |
| "loss": 0.0241, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 1.6091160220994474, | |
| "grad_norm": 0.3484481871128082, | |
| "learning_rate": 5.972613473891766e-05, | |
| "loss": 0.0336, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 1.6160220994475138, | |
| "grad_norm": 0.43675076961517334, | |
| "learning_rate": 5.940154732417158e-05, | |
| "loss": 0.0252, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 1.62292817679558, | |
| "grad_norm": 0.30599460005760193, | |
| "learning_rate": 5.907654865528876e-05, | |
| "loss": 0.0297, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 1.6298342541436464, | |
| "grad_norm": 0.4845811426639557, | |
| "learning_rate": 5.875115294876381e-05, | |
| "loss": 0.025, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 1.6367403314917128, | |
| "grad_norm": 0.46819767355918884, | |
| "learning_rate": 5.842537443845908e-05, | |
| "loss": 0.0294, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 1.6436464088397789, | |
| "grad_norm": 0.2887173593044281, | |
| "learning_rate": 5.809922737498198e-05, | |
| "loss": 0.0248, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 1.6505524861878453, | |
| "grad_norm": 0.366458922624588, | |
| "learning_rate": 5.777272602506165e-05, | |
| "loss": 0.0284, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 1.6574585635359116, | |
| "grad_norm": 0.28219345211982727, | |
| "learning_rate": 5.744588467092483e-05, | |
| "loss": 0.0301, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 1.6643646408839778, | |
| "grad_norm": 0.2311965823173523, | |
| "learning_rate": 5.7118717609671194e-05, | |
| "loss": 0.0222, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 1.6712707182320443, | |
| "grad_norm": 0.34282025694847107, | |
| "learning_rate": 5.679123915264786e-05, | |
| "loss": 0.0249, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 1.6781767955801103, | |
| "grad_norm": 0.3125259280204773, | |
| "learning_rate": 5.646346362482342e-05, | |
| "loss": 0.0261, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 1.6850828729281768, | |
| "grad_norm": 0.3358251452445984, | |
| "learning_rate": 5.613540536416132e-05, | |
| "loss": 0.0287, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 1.691988950276243, | |
| "grad_norm": 0.2952382266521454, | |
| "learning_rate": 5.5807078720992645e-05, | |
| "loss": 0.0253, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 1.6988950276243093, | |
| "grad_norm": 0.38777631521224976, | |
| "learning_rate": 5.547849805738836e-05, | |
| "loss": 0.0269, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 1.7058011049723758, | |
| "grad_norm": 0.381124347448349, | |
| "learning_rate": 5.514967774653118e-05, | |
| "loss": 0.0254, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 1.7127071823204418, | |
| "grad_norm": 0.3334466218948364, | |
| "learning_rate": 5.482063217208674e-05, | |
| "loss": 0.0257, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 1.7196132596685083, | |
| "grad_norm": 0.2492898404598236, | |
| "learning_rate": 5.449137572757439e-05, | |
| "loss": 0.0219, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 1.7265193370165746, | |
| "grad_norm": 0.2959388494491577, | |
| "learning_rate": 5.4161922815737696e-05, | |
| "loss": 0.0245, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 1.7334254143646408, | |
| "grad_norm": 0.20602189004421234, | |
| "learning_rate": 5.3832287847914276e-05, | |
| "loss": 0.0244, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 1.7403314917127073, | |
| "grad_norm": 0.3126443028450012, | |
| "learning_rate": 5.35024852434055e-05, | |
| "loss": 0.0223, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 1.7472375690607733, | |
| "grad_norm": 0.24175651371479034, | |
| "learning_rate": 5.317252942884567e-05, | |
| "loss": 0.0221, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 1.7541436464088398, | |
| "grad_norm": 0.2680174708366394, | |
| "learning_rate": 5.284243483757109e-05, | |
| "loss": 0.0245, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 1.761049723756906, | |
| "grad_norm": 0.39669111371040344, | |
| "learning_rate": 5.2512215908988484e-05, | |
| "loss": 0.0291, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 1.7679558011049723, | |
| "grad_norm": 0.4152435064315796, | |
| "learning_rate": 5.218188708794357e-05, | |
| "loss": 0.0249, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 1.7748618784530388, | |
| "grad_norm": 0.3751486539840698, | |
| "learning_rate": 5.18514628240891e-05, | |
| "loss": 0.0253, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 1.7817679558011048, | |
| "grad_norm": 0.369488000869751, | |
| "learning_rate": 5.1520957571252795e-05, | |
| "loss": 0.0229, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 1.7886740331491713, | |
| "grad_norm": 0.3103255331516266, | |
| "learning_rate": 5.1190385786805106e-05, | |
| "loss": 0.0251, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 1.7955801104972375, | |
| "grad_norm": 0.33669042587280273, | |
| "learning_rate": 5.085976193102677e-05, | |
| "loss": 0.0243, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 1.8024861878453038, | |
| "grad_norm": 0.24954897165298462, | |
| "learning_rate": 5.052910046647634e-05, | |
| "loss": 0.0243, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 1.8093922651933703, | |
| "grad_norm": 0.41135188937187195, | |
| "learning_rate": 5.0198415857357464e-05, | |
| "loss": 0.0257, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 1.8162983425414365, | |
| "grad_norm": 0.26518484950065613, | |
| "learning_rate": 4.9867722568886223e-05, | |
| "loss": 0.0222, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 1.8232044198895028, | |
| "grad_norm": 0.3827347457408905, | |
| "learning_rate": 4.9537035066658314e-05, | |
| "loss": 0.0199, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 1.830110497237569, | |
| "grad_norm": 0.3803371489048004, | |
| "learning_rate": 4.920636781601638e-05, | |
| "loss": 0.0275, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 1.8370165745856353, | |
| "grad_norm": 0.27980637550354004, | |
| "learning_rate": 4.88757352814172e-05, | |
| "loss": 0.0304, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 1.8439226519337018, | |
| "grad_norm": 0.2618419826030731, | |
| "learning_rate": 4.8545151925798924e-05, | |
| "loss": 0.0238, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 1.850828729281768, | |
| "grad_norm": 0.28470802307128906, | |
| "learning_rate": 4.821463220994848e-05, | |
| "loss": 0.0299, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 1.8577348066298343, | |
| "grad_norm": 0.2959590256214142, | |
| "learning_rate": 4.788419059186895e-05, | |
| "loss": 0.0224, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 1.8646408839779005, | |
| "grad_norm": 0.36985912919044495, | |
| "learning_rate": 4.7553841526147205e-05, | |
| "loss": 0.0284, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 1.8715469613259668, | |
| "grad_norm": 0.2780645787715912, | |
| "learning_rate": 4.722359946332156e-05, | |
| "loss": 0.0259, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 1.8784530386740332, | |
| "grad_norm": 0.2221125364303589, | |
| "learning_rate": 4.6893478849249654e-05, | |
| "loss": 0.0206, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 1.8853591160220995, | |
| "grad_norm": 0.23101362586021423, | |
| "learning_rate": 4.656349412447664e-05, | |
| "loss": 0.0247, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 1.8922651933701657, | |
| "grad_norm": 0.31796690821647644, | |
| "learning_rate": 4.623365972360337e-05, | |
| "loss": 0.023, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 1.899171270718232, | |
| "grad_norm": 0.36655211448669434, | |
| "learning_rate": 4.590399007465503e-05, | |
| "loss": 0.0201, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 1.9060773480662982, | |
| "grad_norm": 0.30221128463745117, | |
| "learning_rate": 4.557449959845005e-05, | |
| "loss": 0.0222, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 1.9129834254143647, | |
| "grad_norm": 0.2875446677207947, | |
| "learning_rate": 4.524520270796927e-05, | |
| "loss": 0.0232, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 1.919889502762431, | |
| "grad_norm": 0.27749645709991455, | |
| "learning_rate": 4.491611380772545e-05, | |
| "loss": 0.0232, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 1.9267955801104972, | |
| "grad_norm": 0.2576722800731659, | |
| "learning_rate": 4.458724729313318e-05, | |
| "loss": 0.0193, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 1.9337016574585635, | |
| "grad_norm": 0.2539500594139099, | |
| "learning_rate": 4.42586175498792e-05, | |
| "loss": 0.0226, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 1.9406077348066297, | |
| "grad_norm": 0.2775489389896393, | |
| "learning_rate": 4.3930238953293094e-05, | |
| "loss": 0.0226, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 1.9475138121546962, | |
| "grad_norm": 0.34480902552604675, | |
| "learning_rate": 4.360212586771847e-05, | |
| "loss": 0.0171, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 1.9544198895027625, | |
| "grad_norm": 0.21589399874210358, | |
| "learning_rate": 4.327429264588463e-05, | |
| "loss": 0.0237, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 1.9613259668508287, | |
| "grad_norm": 0.21422788500785828, | |
| "learning_rate": 4.2946753628278725e-05, | |
| "loss": 0.0219, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 1.9682320441988952, | |
| "grad_norm": 0.2897253930568695, | |
| "learning_rate": 4.2619523142518474e-05, | |
| "loss": 0.0242, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 1.9751381215469612, | |
| "grad_norm": 0.2709295451641083, | |
| "learning_rate": 4.229261550272539e-05, | |
| "loss": 0.0208, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 1.9820441988950277, | |
| "grad_norm": 0.3681797683238983, | |
| "learning_rate": 4.196604500889868e-05, | |
| "loss": 0.0251, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 1.988950276243094, | |
| "grad_norm": 0.3151683807373047, | |
| "learning_rate": 4.163982594628969e-05, | |
| "loss": 0.021, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 1.9958563535911602, | |
| "grad_norm": 0.42392632365226746, | |
| "learning_rate": 4.131397258477702e-05, | |
| "loss": 0.0223, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 2.0027624309392267, | |
| "grad_norm": 0.416048526763916, | |
| "learning_rate": 4.0988499178242315e-05, | |
| "loss": 0.0218, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 2.0096685082872927, | |
| "grad_norm": 0.4189092516899109, | |
| "learning_rate": 4.066341996394678e-05, | |
| "loss": 0.0306, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 2.016574585635359, | |
| "grad_norm": 0.320120632648468, | |
| "learning_rate": 4.033874916190833e-05, | |
| "loss": 0.0195, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 2.023480662983425, | |
| "grad_norm": 0.306089848279953, | |
| "learning_rate": 4.001450097427966e-05, | |
| "loss": 0.0202, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 2.0303867403314917, | |
| "grad_norm": 0.16026566922664642, | |
| "learning_rate": 3.9690689584726894e-05, | |
| "loss": 0.02, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 2.037292817679558, | |
| "grad_norm": 0.19151918590068817, | |
| "learning_rate": 3.936732915780923e-05, | |
| "loss": 0.0199, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 2.044198895027624, | |
| "grad_norm": 0.24968740344047546, | |
| "learning_rate": 3.904443383835929e-05, | |
| "loss": 0.0265, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 2.0511049723756907, | |
| "grad_norm": 0.28650423884391785, | |
| "learning_rate": 3.872201775086437e-05, | |
| "loss": 0.0247, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 2.0580110497237567, | |
| "grad_norm": 0.23170770704746246, | |
| "learning_rate": 3.8400094998848616e-05, | |
| "loss": 0.0196, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 2.064917127071823, | |
| "grad_norm": 0.33920174837112427, | |
| "learning_rate": 3.807867966425611e-05, | |
| "loss": 0.017, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 2.0718232044198897, | |
| "grad_norm": 0.2694188356399536, | |
| "learning_rate": 3.775778580683481e-05, | |
| "loss": 0.0229, | |
| "step": 3000 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 5000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 4, | |
| "save_steps": 1500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 32, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |