diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,236166 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.0, + "eval_steps": 500, + "global_step": 33732, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00011858176212498517, + "grad_norm": 29.7334115858346, + "learning_rate": 4.940711462450593e-08, + "loss": 1.8796, + "step": 1 + }, + { + "epoch": 0.00023716352424997035, + "grad_norm": 36.986189981478525, + "learning_rate": 9.881422924901186e-08, + "loss": 2.284, + "step": 2 + }, + { + "epoch": 0.0003557452863749555, + "grad_norm": 40.7859748549836, + "learning_rate": 1.4822134387351778e-07, + "loss": 2.2242, + "step": 3 + }, + { + "epoch": 0.0004743270484999407, + "grad_norm": 36.48457054375701, + "learning_rate": 1.976284584980237e-07, + "loss": 2.0602, + "step": 4 + }, + { + "epoch": 0.0005929088106249259, + "grad_norm": 26.626520027662174, + "learning_rate": 2.4703557312252967e-07, + "loss": 2.1763, + "step": 5 + }, + { + "epoch": 0.000711490572749911, + "grad_norm": 28.75938841669468, + "learning_rate": 2.9644268774703555e-07, + "loss": 2.1749, + "step": 6 + }, + { + "epoch": 0.0008300723348748962, + "grad_norm": 26.450691626862728, + "learning_rate": 3.4584980237154154e-07, + "loss": 2.0659, + "step": 7 + }, + { + "epoch": 0.0009486540969998814, + "grad_norm": 32.388409037114734, + "learning_rate": 3.952569169960474e-07, + "loss": 2.269, + "step": 8 + }, + { + "epoch": 0.0010672358591248667, + "grad_norm": 27.818829596147488, + "learning_rate": 4.446640316205534e-07, + "loss": 2.0405, + "step": 9 + }, + { + "epoch": 0.0011858176212498517, + "grad_norm": 27.213551128824204, + "learning_rate": 4.940711462450593e-07, + "loss": 2.0429, + "step": 10 + }, + { + "epoch": 0.001304399383374837, + "grad_norm": 24.677212610588604, + "learning_rate": 5.434782608695653e-07, + "loss": 2.0898, + "step": 11 + }, + { + "epoch": 0.001422981145499822, + "grad_norm": 22.13518130020256, + "learning_rate": 5.928853754940711e-07, + "loss": 1.9264, + "step": 12 + }, + { + "epoch": 0.0015415629076248074, + "grad_norm": 26.311666508606532, + "learning_rate": 6.422924901185771e-07, + "loss": 2.2384, + "step": 13 + }, + { + "epoch": 0.0016601446697497924, + "grad_norm": 20.846841284164107, + "learning_rate": 6.916996047430831e-07, + "loss": 2.0143, + "step": 14 + }, + { + "epoch": 0.0017787264318747777, + "grad_norm": 22.083048308127594, + "learning_rate": 7.411067193675889e-07, + "loss": 1.8876, + "step": 15 + }, + { + "epoch": 0.0018973081939997628, + "grad_norm": 24.8360423578799, + "learning_rate": 7.905138339920948e-07, + "loss": 2.0197, + "step": 16 + }, + { + "epoch": 0.002015889956124748, + "grad_norm": 17.354519140819882, + "learning_rate": 8.399209486166009e-07, + "loss": 1.8757, + "step": 17 + }, + { + "epoch": 0.0021344717182497333, + "grad_norm": 17.472884260289447, + "learning_rate": 8.893280632411068e-07, + "loss": 1.9005, + "step": 18 + }, + { + "epoch": 0.0022530534803747184, + "grad_norm": 20.275167427552432, + "learning_rate": 9.387351778656126e-07, + "loss": 1.7199, + "step": 19 + }, + { + "epoch": 0.0023716352424997035, + "grad_norm": 14.28383937483396, + "learning_rate": 9.881422924901187e-07, + "loss": 1.8742, + "step": 20 + }, + { + "epoch": 0.0024902170046246885, + "grad_norm": 19.785370699910622, + "learning_rate": 1.0375494071146247e-06, + "loss": 1.6949, + "step": 21 + }, + { + "epoch": 0.002608798766749674, + "grad_norm": 12.311019168455724, + "learning_rate": 1.0869565217391306e-06, + "loss": 1.7765, + "step": 22 + }, + { + "epoch": 0.002727380528874659, + "grad_norm": 14.037359974966835, + "learning_rate": 1.1363636363636364e-06, + "loss": 1.4765, + "step": 23 + }, + { + "epoch": 0.002845962290999644, + "grad_norm": 15.329905744804806, + "learning_rate": 1.1857707509881422e-06, + "loss": 1.6874, + "step": 24 + }, + { + "epoch": 0.0029645440531246296, + "grad_norm": 9.241739188353932, + "learning_rate": 1.2351778656126482e-06, + "loss": 1.6788, + "step": 25 + }, + { + "epoch": 0.0030831258152496147, + "grad_norm": 10.658162413453047, + "learning_rate": 1.2845849802371543e-06, + "loss": 1.6139, + "step": 26 + }, + { + "epoch": 0.0032017075773745998, + "grad_norm": 11.484737065357146, + "learning_rate": 1.3339920948616601e-06, + "loss": 1.6647, + "step": 27 + }, + { + "epoch": 0.003320289339499585, + "grad_norm": 5.809020273223719, + "learning_rate": 1.3833992094861662e-06, + "loss": 1.6373, + "step": 28 + }, + { + "epoch": 0.0034388711016245703, + "grad_norm": 5.094718107564571, + "learning_rate": 1.432806324110672e-06, + "loss": 1.5396, + "step": 29 + }, + { + "epoch": 0.0035574528637495554, + "grad_norm": 5.475453114625414, + "learning_rate": 1.4822134387351778e-06, + "loss": 1.3301, + "step": 30 + }, + { + "epoch": 0.0036760346258745405, + "grad_norm": 5.252787264813708, + "learning_rate": 1.5316205533596839e-06, + "loss": 1.2724, + "step": 31 + }, + { + "epoch": 0.0037946163879995255, + "grad_norm": 5.791267103219969, + "learning_rate": 1.5810276679841897e-06, + "loss": 0.8098, + "step": 32 + }, + { + "epoch": 0.003913198150124511, + "grad_norm": 4.485775929175042, + "learning_rate": 1.6304347826086957e-06, + "loss": 1.3726, + "step": 33 + }, + { + "epoch": 0.004031779912249496, + "grad_norm": 4.654827189747514, + "learning_rate": 1.6798418972332018e-06, + "loss": 1.5377, + "step": 34 + }, + { + "epoch": 0.004150361674374482, + "grad_norm": 3.6622689865341744, + "learning_rate": 1.7292490118577076e-06, + "loss": 1.5224, + "step": 35 + }, + { + "epoch": 0.004268943436499467, + "grad_norm": 4.979949858514664, + "learning_rate": 1.7786561264822136e-06, + "loss": 1.5067, + "step": 36 + }, + { + "epoch": 0.004387525198624452, + "grad_norm": 4.501677583957293, + "learning_rate": 1.8280632411067192e-06, + "loss": 1.3756, + "step": 37 + }, + { + "epoch": 0.004506106960749437, + "grad_norm": 4.91743142836715, + "learning_rate": 1.8774703557312253e-06, + "loss": 1.5613, + "step": 38 + }, + { + "epoch": 0.004624688722874422, + "grad_norm": 4.806424592516933, + "learning_rate": 1.9268774703557313e-06, + "loss": 1.1191, + "step": 39 + }, + { + "epoch": 0.004743270484999407, + "grad_norm": 5.047904499811148, + "learning_rate": 1.9762845849802374e-06, + "loss": 1.4491, + "step": 40 + }, + { + "epoch": 0.004861852247124392, + "grad_norm": 4.810873280149616, + "learning_rate": 2.0256916996047434e-06, + "loss": 1.2913, + "step": 41 + }, + { + "epoch": 0.004980434009249377, + "grad_norm": 4.839732027601575, + "learning_rate": 2.0750988142292494e-06, + "loss": 1.4684, + "step": 42 + }, + { + "epoch": 0.005099015771374363, + "grad_norm": 4.205533048559967, + "learning_rate": 2.124505928853755e-06, + "loss": 1.4724, + "step": 43 + }, + { + "epoch": 0.005217597533499348, + "grad_norm": 4.649302570240601, + "learning_rate": 2.173913043478261e-06, + "loss": 1.3319, + "step": 44 + }, + { + "epoch": 0.005336179295624333, + "grad_norm": 4.9705205235957814, + "learning_rate": 2.2233201581027667e-06, + "loss": 1.4927, + "step": 45 + }, + { + "epoch": 0.005454761057749318, + "grad_norm": 4.636719619809015, + "learning_rate": 2.2727272727272728e-06, + "loss": 1.2349, + "step": 46 + }, + { + "epoch": 0.005573342819874303, + "grad_norm": 3.971633369633716, + "learning_rate": 2.322134387351779e-06, + "loss": 1.4677, + "step": 47 + }, + { + "epoch": 0.005691924581999288, + "grad_norm": 3.8899464489610818, + "learning_rate": 2.3715415019762844e-06, + "loss": 1.3009, + "step": 48 + }, + { + "epoch": 0.005810506344124273, + "grad_norm": 4.329629116852109, + "learning_rate": 2.4209486166007905e-06, + "loss": 1.403, + "step": 49 + }, + { + "epoch": 0.005929088106249259, + "grad_norm": 4.218784231886035, + "learning_rate": 2.4703557312252965e-06, + "loss": 1.4691, + "step": 50 + }, + { + "epoch": 0.006047669868374244, + "grad_norm": 3.8196953426633433, + "learning_rate": 2.5197628458498025e-06, + "loss": 1.5109, + "step": 51 + }, + { + "epoch": 0.006166251630499229, + "grad_norm": 3.5799719246204194, + "learning_rate": 2.5691699604743086e-06, + "loss": 1.5903, + "step": 52 + }, + { + "epoch": 0.0062848333926242145, + "grad_norm": 3.5789978015119397, + "learning_rate": 2.6185770750988146e-06, + "loss": 1.3442, + "step": 53 + }, + { + "epoch": 0.0064034151547491995, + "grad_norm": 3.2603597513307245, + "learning_rate": 2.6679841897233202e-06, + "loss": 1.5437, + "step": 54 + }, + { + "epoch": 0.006521996916874185, + "grad_norm": 3.4341440061642086, + "learning_rate": 2.7173913043478263e-06, + "loss": 1.5123, + "step": 55 + }, + { + "epoch": 0.00664057867899917, + "grad_norm": 3.987972758410137, + "learning_rate": 2.7667984189723323e-06, + "loss": 1.3457, + "step": 56 + }, + { + "epoch": 0.006759160441124155, + "grad_norm": 3.977475845211303, + "learning_rate": 2.816205533596838e-06, + "loss": 1.4073, + "step": 57 + }, + { + "epoch": 0.006877742203249141, + "grad_norm": 4.203700275846737, + "learning_rate": 2.865612648221344e-06, + "loss": 1.3345, + "step": 58 + }, + { + "epoch": 0.006996323965374126, + "grad_norm": 3.3146057498522437, + "learning_rate": 2.91501976284585e-06, + "loss": 1.4505, + "step": 59 + }, + { + "epoch": 0.007114905727499111, + "grad_norm": 3.5921841458375234, + "learning_rate": 2.9644268774703556e-06, + "loss": 1.1119, + "step": 60 + }, + { + "epoch": 0.007233487489624096, + "grad_norm": 4.534186220908105, + "learning_rate": 3.0138339920948617e-06, + "loss": 1.2502, + "step": 61 + }, + { + "epoch": 0.007352069251749081, + "grad_norm": 3.4930738853086996, + "learning_rate": 3.0632411067193677e-06, + "loss": 1.4959, + "step": 62 + }, + { + "epoch": 0.007470651013874066, + "grad_norm": 3.793249085036293, + "learning_rate": 3.1126482213438737e-06, + "loss": 1.5658, + "step": 63 + }, + { + "epoch": 0.007589232775999051, + "grad_norm": 3.7272413318236564, + "learning_rate": 3.1620553359683794e-06, + "loss": 1.4786, + "step": 64 + }, + { + "epoch": 0.007707814538124036, + "grad_norm": 3.7666046621727376, + "learning_rate": 3.211462450592886e-06, + "loss": 1.3404, + "step": 65 + }, + { + "epoch": 0.007826396300249021, + "grad_norm": 3.6489802688735518, + "learning_rate": 3.2608695652173914e-06, + "loss": 1.3084, + "step": 66 + }, + { + "epoch": 0.007944978062374007, + "grad_norm": 3.393741805739487, + "learning_rate": 3.310276679841898e-06, + "loss": 1.4177, + "step": 67 + }, + { + "epoch": 0.008063559824498991, + "grad_norm": 5.2200674212050435, + "learning_rate": 3.3596837944664035e-06, + "loss": 1.223, + "step": 68 + }, + { + "epoch": 0.008182141586623977, + "grad_norm": 3.5668605052898017, + "learning_rate": 3.409090909090909e-06, + "loss": 1.2097, + "step": 69 + }, + { + "epoch": 0.008300723348748963, + "grad_norm": 3.6280712755852456, + "learning_rate": 3.458498023715415e-06, + "loss": 1.3289, + "step": 70 + }, + { + "epoch": 0.008419305110873947, + "grad_norm": 3.268044709665363, + "learning_rate": 3.507905138339921e-06, + "loss": 1.564, + "step": 71 + }, + { + "epoch": 0.008537886872998933, + "grad_norm": 3.9903293434366716, + "learning_rate": 3.5573122529644273e-06, + "loss": 1.3967, + "step": 72 + }, + { + "epoch": 0.008656468635123917, + "grad_norm": 3.276506449815736, + "learning_rate": 3.606719367588933e-06, + "loss": 1.4608, + "step": 73 + }, + { + "epoch": 0.008775050397248903, + "grad_norm": 3.349344682930467, + "learning_rate": 3.6561264822134385e-06, + "loss": 1.2848, + "step": 74 + }, + { + "epoch": 0.008893632159373888, + "grad_norm": 3.8759647234303696, + "learning_rate": 3.705533596837945e-06, + "loss": 1.4331, + "step": 75 + }, + { + "epoch": 0.009012213921498874, + "grad_norm": 4.032042318942088, + "learning_rate": 3.7549407114624506e-06, + "loss": 1.2687, + "step": 76 + }, + { + "epoch": 0.00913079568362386, + "grad_norm": 4.05463512824053, + "learning_rate": 3.804347826086957e-06, + "loss": 1.3397, + "step": 77 + }, + { + "epoch": 0.009249377445748844, + "grad_norm": 3.570905635236118, + "learning_rate": 3.853754940711463e-06, + "loss": 1.4733, + "step": 78 + }, + { + "epoch": 0.00936795920787383, + "grad_norm": 3.779305628665041, + "learning_rate": 3.903162055335968e-06, + "loss": 1.4018, + "step": 79 + }, + { + "epoch": 0.009486540969998814, + "grad_norm": 3.7854719623767825, + "learning_rate": 3.952569169960475e-06, + "loss": 1.4833, + "step": 80 + }, + { + "epoch": 0.0096051227321238, + "grad_norm": 3.6505406776124714, + "learning_rate": 4.00197628458498e-06, + "loss": 1.1614, + "step": 81 + }, + { + "epoch": 0.009723704494248784, + "grad_norm": 3.4311507500208274, + "learning_rate": 4.051383399209487e-06, + "loss": 1.5358, + "step": 82 + }, + { + "epoch": 0.00984228625637377, + "grad_norm": 3.1149203272886146, + "learning_rate": 4.1007905138339924e-06, + "loss": 1.3102, + "step": 83 + }, + { + "epoch": 0.009960868018498754, + "grad_norm": 3.3638340217134637, + "learning_rate": 4.150197628458499e-06, + "loss": 1.5811, + "step": 84 + }, + { + "epoch": 0.01007944978062374, + "grad_norm": 3.7390399791979925, + "learning_rate": 4.1996047430830045e-06, + "loss": 1.2584, + "step": 85 + }, + { + "epoch": 0.010198031542748726, + "grad_norm": 3.4182632034218603, + "learning_rate": 4.24901185770751e-06, + "loss": 1.6082, + "step": 86 + }, + { + "epoch": 0.01031661330487371, + "grad_norm": 3.354729488177407, + "learning_rate": 4.298418972332017e-06, + "loss": 1.1889, + "step": 87 + }, + { + "epoch": 0.010435195066998696, + "grad_norm": 3.5089261935914022, + "learning_rate": 4.347826086956522e-06, + "loss": 1.3345, + "step": 88 + }, + { + "epoch": 0.01055377682912368, + "grad_norm": 3.8462814773896525, + "learning_rate": 4.397233201581028e-06, + "loss": 1.1596, + "step": 89 + }, + { + "epoch": 0.010672358591248666, + "grad_norm": 3.6272409788759012, + "learning_rate": 4.4466403162055334e-06, + "loss": 1.4737, + "step": 90 + }, + { + "epoch": 0.01079094035337365, + "grad_norm": 3.13130266377601, + "learning_rate": 4.496047430830039e-06, + "loss": 1.434, + "step": 91 + }, + { + "epoch": 0.010909522115498636, + "grad_norm": 3.196212648459861, + "learning_rate": 4.5454545454545455e-06, + "loss": 1.4654, + "step": 92 + }, + { + "epoch": 0.011028103877623622, + "grad_norm": 3.1293886723406406, + "learning_rate": 4.594861660079051e-06, + "loss": 1.2483, + "step": 93 + }, + { + "epoch": 0.011146685639748606, + "grad_norm": 3.6867170578203434, + "learning_rate": 4.644268774703558e-06, + "loss": 1.5428, + "step": 94 + }, + { + "epoch": 0.011265267401873592, + "grad_norm": 3.212927331712037, + "learning_rate": 4.693675889328063e-06, + "loss": 1.5376, + "step": 95 + }, + { + "epoch": 0.011383849163998577, + "grad_norm": 3.1924142182429613, + "learning_rate": 4.743083003952569e-06, + "loss": 1.7211, + "step": 96 + }, + { + "epoch": 0.011502430926123563, + "grad_norm": 3.381404143138639, + "learning_rate": 4.792490118577075e-06, + "loss": 1.4578, + "step": 97 + }, + { + "epoch": 0.011621012688248547, + "grad_norm": 3.502019885762863, + "learning_rate": 4.841897233201581e-06, + "loss": 1.4686, + "step": 98 + }, + { + "epoch": 0.011739594450373533, + "grad_norm": 3.4239685805832365, + "learning_rate": 4.891304347826087e-06, + "loss": 1.4302, + "step": 99 + }, + { + "epoch": 0.011858176212498519, + "grad_norm": 3.2866255993402373, + "learning_rate": 4.940711462450593e-06, + "loss": 1.5406, + "step": 100 + }, + { + "epoch": 0.011976757974623503, + "grad_norm": 3.8695232957002332, + "learning_rate": 4.9901185770750995e-06, + "loss": 1.2577, + "step": 101 + }, + { + "epoch": 0.012095339736748489, + "grad_norm": 3.1954221106642087, + "learning_rate": 5.039525691699605e-06, + "loss": 1.6125, + "step": 102 + }, + { + "epoch": 0.012213921498873473, + "grad_norm": 3.4547480778911455, + "learning_rate": 5.088932806324111e-06, + "loss": 1.0321, + "step": 103 + }, + { + "epoch": 0.012332503260998459, + "grad_norm": 3.490136777704389, + "learning_rate": 5.138339920948617e-06, + "loss": 1.1058, + "step": 104 + }, + { + "epoch": 0.012451085023123443, + "grad_norm": 3.0967678463388544, + "learning_rate": 5.187747035573123e-06, + "loss": 1.5521, + "step": 105 + }, + { + "epoch": 0.012569666785248429, + "grad_norm": 3.172889434152784, + "learning_rate": 5.237154150197629e-06, + "loss": 1.5146, + "step": 106 + }, + { + "epoch": 0.012688248547373413, + "grad_norm": 3.693898674128569, + "learning_rate": 5.286561264822135e-06, + "loss": 1.3215, + "step": 107 + }, + { + "epoch": 0.012806830309498399, + "grad_norm": 3.398769183694832, + "learning_rate": 5.3359683794466405e-06, + "loss": 1.3303, + "step": 108 + }, + { + "epoch": 0.012925412071623385, + "grad_norm": 3.7096404981782327, + "learning_rate": 5.385375494071147e-06, + "loss": 1.3601, + "step": 109 + }, + { + "epoch": 0.01304399383374837, + "grad_norm": 3.347710020043152, + "learning_rate": 5.4347826086956525e-06, + "loss": 1.4923, + "step": 110 + }, + { + "epoch": 0.013162575595873355, + "grad_norm": 3.584519958005256, + "learning_rate": 5.484189723320159e-06, + "loss": 0.8943, + "step": 111 + }, + { + "epoch": 0.01328115735799834, + "grad_norm": 3.3351266533490636, + "learning_rate": 5.533596837944665e-06, + "loss": 1.4478, + "step": 112 + }, + { + "epoch": 0.013399739120123325, + "grad_norm": 3.2227745201141342, + "learning_rate": 5.58300395256917e-06, + "loss": 1.497, + "step": 113 + }, + { + "epoch": 0.01351832088224831, + "grad_norm": 3.5761240428588277, + "learning_rate": 5.632411067193676e-06, + "loss": 1.2487, + "step": 114 + }, + { + "epoch": 0.013636902644373295, + "grad_norm": 3.5964241142188733, + "learning_rate": 5.681818181818182e-06, + "loss": 1.5315, + "step": 115 + }, + { + "epoch": 0.013755484406498281, + "grad_norm": 3.532485779559769, + "learning_rate": 5.731225296442688e-06, + "loss": 1.5633, + "step": 116 + }, + { + "epoch": 0.013874066168623266, + "grad_norm": 4.458237380787414, + "learning_rate": 5.7806324110671936e-06, + "loss": 1.1755, + "step": 117 + }, + { + "epoch": 0.013992647930748251, + "grad_norm": 3.401693392114225, + "learning_rate": 5.8300395256917e-06, + "loss": 1.1613, + "step": 118 + }, + { + "epoch": 0.014111229692873236, + "grad_norm": 3.43659936937958, + "learning_rate": 5.879446640316206e-06, + "loss": 1.4882, + "step": 119 + }, + { + "epoch": 0.014229811454998222, + "grad_norm": 3.933037361894272, + "learning_rate": 5.928853754940711e-06, + "loss": 1.2865, + "step": 120 + }, + { + "epoch": 0.014348393217123206, + "grad_norm": 3.326242133376254, + "learning_rate": 5.978260869565218e-06, + "loss": 1.2986, + "step": 121 + }, + { + "epoch": 0.014466974979248192, + "grad_norm": 3.1079589329108264, + "learning_rate": 6.027667984189723e-06, + "loss": 1.4292, + "step": 122 + }, + { + "epoch": 0.014585556741373178, + "grad_norm": 3.4170179875454467, + "learning_rate": 6.07707509881423e-06, + "loss": 1.4056, + "step": 123 + }, + { + "epoch": 0.014704138503498162, + "grad_norm": 3.2897278731100053, + "learning_rate": 6.126482213438735e-06, + "loss": 0.9931, + "step": 124 + }, + { + "epoch": 0.014822720265623148, + "grad_norm": 3.0168456876655223, + "learning_rate": 6.175889328063241e-06, + "loss": 1.4632, + "step": 125 + }, + { + "epoch": 0.014941302027748132, + "grad_norm": 4.051435358425506, + "learning_rate": 6.2252964426877475e-06, + "loss": 1.0668, + "step": 126 + }, + { + "epoch": 0.015059883789873118, + "grad_norm": 3.2358187287341686, + "learning_rate": 6.274703557312253e-06, + "loss": 1.4251, + "step": 127 + }, + { + "epoch": 0.015178465551998102, + "grad_norm": 3.6497534834135124, + "learning_rate": 6.324110671936759e-06, + "loss": 1.4442, + "step": 128 + }, + { + "epoch": 0.015297047314123088, + "grad_norm": 3.2445532742932106, + "learning_rate": 6.373517786561266e-06, + "loss": 1.284, + "step": 129 + }, + { + "epoch": 0.015415629076248072, + "grad_norm": 3.6969900055784333, + "learning_rate": 6.422924901185772e-06, + "loss": 1.2353, + "step": 130 + }, + { + "epoch": 0.015534210838373058, + "grad_norm": 3.2840007521188865, + "learning_rate": 6.472332015810277e-06, + "loss": 1.5767, + "step": 131 + }, + { + "epoch": 0.015652792600498042, + "grad_norm": 3.359971817996501, + "learning_rate": 6.521739130434783e-06, + "loss": 1.4194, + "step": 132 + }, + { + "epoch": 0.01577137436262303, + "grad_norm": 3.686163280364927, + "learning_rate": 6.5711462450592885e-06, + "loss": 1.3864, + "step": 133 + }, + { + "epoch": 0.015889956124748014, + "grad_norm": 3.517698750818215, + "learning_rate": 6.620553359683796e-06, + "loss": 1.3154, + "step": 134 + }, + { + "epoch": 0.016008537886873, + "grad_norm": 3.6938279258929088, + "learning_rate": 6.6699604743083014e-06, + "loss": 1.243, + "step": 135 + }, + { + "epoch": 0.016127119648997983, + "grad_norm": 3.8472159360375247, + "learning_rate": 6.719367588932807e-06, + "loss": 1.0296, + "step": 136 + }, + { + "epoch": 0.01624570141112297, + "grad_norm": 3.5873584587564187, + "learning_rate": 6.768774703557313e-06, + "loss": 1.3103, + "step": 137 + }, + { + "epoch": 0.016364283173247954, + "grad_norm": 3.557262653873429, + "learning_rate": 6.818181818181818e-06, + "loss": 1.4158, + "step": 138 + }, + { + "epoch": 0.01648286493537294, + "grad_norm": 3.4128391682193606, + "learning_rate": 6.867588932806325e-06, + "loss": 1.4815, + "step": 139 + }, + { + "epoch": 0.016601446697497926, + "grad_norm": 3.5543485647000184, + "learning_rate": 6.91699604743083e-06, + "loss": 1.2262, + "step": 140 + }, + { + "epoch": 0.01672002845962291, + "grad_norm": 4.040205557480039, + "learning_rate": 6.966403162055336e-06, + "loss": 1.3685, + "step": 141 + }, + { + "epoch": 0.016838610221747895, + "grad_norm": 3.571608477900209, + "learning_rate": 7.015810276679842e-06, + "loss": 1.1649, + "step": 142 + }, + { + "epoch": 0.01695719198387288, + "grad_norm": 3.5082596175908534, + "learning_rate": 7.065217391304347e-06, + "loss": 1.3327, + "step": 143 + }, + { + "epoch": 0.017075773745997867, + "grad_norm": 3.2121355373777987, + "learning_rate": 7.1146245059288545e-06, + "loss": 1.3955, + "step": 144 + }, + { + "epoch": 0.01719435550812285, + "grad_norm": 3.5775456145700737, + "learning_rate": 7.16403162055336e-06, + "loss": 1.419, + "step": 145 + }, + { + "epoch": 0.017312937270247835, + "grad_norm": 3.04031831251046, + "learning_rate": 7.213438735177866e-06, + "loss": 1.5208, + "step": 146 + }, + { + "epoch": 0.017431519032372823, + "grad_norm": 4.129023970677371, + "learning_rate": 7.262845849802371e-06, + "loss": 1.2054, + "step": 147 + }, + { + "epoch": 0.017550100794497807, + "grad_norm": 3.052172301065618, + "learning_rate": 7.312252964426877e-06, + "loss": 1.5402, + "step": 148 + }, + { + "epoch": 0.01766868255662279, + "grad_norm": 3.4338766812762933, + "learning_rate": 7.361660079051384e-06, + "loss": 1.2949, + "step": 149 + }, + { + "epoch": 0.017787264318747775, + "grad_norm": 3.2861687031919553, + "learning_rate": 7.41106719367589e-06, + "loss": 1.4539, + "step": 150 + }, + { + "epoch": 0.017905846080872763, + "grad_norm": 3.614448911565531, + "learning_rate": 7.4604743083003955e-06, + "loss": 1.4611, + "step": 151 + }, + { + "epoch": 0.018024427842997747, + "grad_norm": 3.2149909098883334, + "learning_rate": 7.509881422924901e-06, + "loss": 1.0467, + "step": 152 + }, + { + "epoch": 0.01814300960512273, + "grad_norm": 3.684892340896345, + "learning_rate": 7.559288537549407e-06, + "loss": 1.3944, + "step": 153 + }, + { + "epoch": 0.01826159136724772, + "grad_norm": 3.376670522018421, + "learning_rate": 7.608695652173914e-06, + "loss": 1.3431, + "step": 154 + }, + { + "epoch": 0.018380173129372703, + "grad_norm": 3.3011577275242083, + "learning_rate": 7.65810276679842e-06, + "loss": 1.2032, + "step": 155 + }, + { + "epoch": 0.018498754891497687, + "grad_norm": 3.334271435215534, + "learning_rate": 7.707509881422925e-06, + "loss": 1.1876, + "step": 156 + }, + { + "epoch": 0.01861733665362267, + "grad_norm": 3.4977331301136263, + "learning_rate": 7.756916996047431e-06, + "loss": 1.0686, + "step": 157 + }, + { + "epoch": 0.01873591841574766, + "grad_norm": 3.616540296725585, + "learning_rate": 7.806324110671937e-06, + "loss": 1.1448, + "step": 158 + }, + { + "epoch": 0.018854500177872643, + "grad_norm": 3.647057966520627, + "learning_rate": 7.855731225296444e-06, + "loss": 1.3524, + "step": 159 + }, + { + "epoch": 0.018973081939997628, + "grad_norm": 3.521250636690629, + "learning_rate": 7.90513833992095e-06, + "loss": 1.3396, + "step": 160 + }, + { + "epoch": 0.019091663702122612, + "grad_norm": 3.4665570121511724, + "learning_rate": 7.954545454545455e-06, + "loss": 1.4676, + "step": 161 + }, + { + "epoch": 0.0192102454642476, + "grad_norm": 4.101700616924512, + "learning_rate": 8.00395256916996e-06, + "loss": 1.0273, + "step": 162 + }, + { + "epoch": 0.019328827226372584, + "grad_norm": 3.210628877875299, + "learning_rate": 8.053359683794468e-06, + "loss": 1.4024, + "step": 163 + }, + { + "epoch": 0.019447408988497568, + "grad_norm": 3.4017394309666957, + "learning_rate": 8.102766798418974e-06, + "loss": 1.3566, + "step": 164 + }, + { + "epoch": 0.019565990750622556, + "grad_norm": 3.566329320796935, + "learning_rate": 8.15217391304348e-06, + "loss": 1.2346, + "step": 165 + }, + { + "epoch": 0.01968457251274754, + "grad_norm": 3.4620705984226285, + "learning_rate": 8.201581027667985e-06, + "loss": 1.4025, + "step": 166 + }, + { + "epoch": 0.019803154274872524, + "grad_norm": 3.145918115470483, + "learning_rate": 8.25098814229249e-06, + "loss": 1.3182, + "step": 167 + }, + { + "epoch": 0.019921736036997508, + "grad_norm": 3.297200804982551, + "learning_rate": 8.300395256916998e-06, + "loss": 1.5242, + "step": 168 + }, + { + "epoch": 0.020040317799122496, + "grad_norm": 3.2933296628880044, + "learning_rate": 8.349802371541503e-06, + "loss": 1.4143, + "step": 169 + }, + { + "epoch": 0.02015889956124748, + "grad_norm": 3.1545288713113306, + "learning_rate": 8.399209486166009e-06, + "loss": 1.4978, + "step": 170 + }, + { + "epoch": 0.020277481323372464, + "grad_norm": 3.2121797354568313, + "learning_rate": 8.448616600790515e-06, + "loss": 1.2417, + "step": 171 + }, + { + "epoch": 0.020396063085497452, + "grad_norm": 3.4702710147199416, + "learning_rate": 8.49802371541502e-06, + "loss": 1.5379, + "step": 172 + }, + { + "epoch": 0.020514644847622436, + "grad_norm": 3.327819772531745, + "learning_rate": 8.547430830039528e-06, + "loss": 1.4314, + "step": 173 + }, + { + "epoch": 0.02063322660974742, + "grad_norm": 2.9313494652986996, + "learning_rate": 8.596837944664033e-06, + "loss": 1.494, + "step": 174 + }, + { + "epoch": 0.020751808371872404, + "grad_norm": 3.3530024113440766, + "learning_rate": 8.646245059288539e-06, + "loss": 1.4671, + "step": 175 + }, + { + "epoch": 0.020870390133997392, + "grad_norm": 3.7084244554771724, + "learning_rate": 8.695652173913044e-06, + "loss": 1.2303, + "step": 176 + }, + { + "epoch": 0.020988971896122376, + "grad_norm": 3.4031030399903384, + "learning_rate": 8.74505928853755e-06, + "loss": 1.2403, + "step": 177 + }, + { + "epoch": 0.02110755365824736, + "grad_norm": 3.253593424578815, + "learning_rate": 8.794466403162056e-06, + "loss": 1.4751, + "step": 178 + }, + { + "epoch": 0.021226135420372348, + "grad_norm": 3.4444607383500707, + "learning_rate": 8.843873517786561e-06, + "loss": 1.2758, + "step": 179 + }, + { + "epoch": 0.021344717182497332, + "grad_norm": 3.2018005057847154, + "learning_rate": 8.893280632411067e-06, + "loss": 1.5201, + "step": 180 + }, + { + "epoch": 0.021463298944622317, + "grad_norm": 3.043906797607487, + "learning_rate": 8.942687747035572e-06, + "loss": 1.2787, + "step": 181 + }, + { + "epoch": 0.0215818807067473, + "grad_norm": 3.1622398033223305, + "learning_rate": 8.992094861660078e-06, + "loss": 1.4893, + "step": 182 + }, + { + "epoch": 0.02170046246887229, + "grad_norm": 3.0029573752796885, + "learning_rate": 9.041501976284585e-06, + "loss": 1.3394, + "step": 183 + }, + { + "epoch": 0.021819044230997273, + "grad_norm": 3.4803964110259655, + "learning_rate": 9.090909090909091e-06, + "loss": 1.3506, + "step": 184 + }, + { + "epoch": 0.021937625993122257, + "grad_norm": 3.1091340550524516, + "learning_rate": 9.140316205533597e-06, + "loss": 1.3211, + "step": 185 + }, + { + "epoch": 0.022056207755247245, + "grad_norm": 3.317771728202137, + "learning_rate": 9.189723320158102e-06, + "loss": 1.3114, + "step": 186 + }, + { + "epoch": 0.02217478951737223, + "grad_norm": 3.3820404696210717, + "learning_rate": 9.239130434782608e-06, + "loss": 1.4481, + "step": 187 + }, + { + "epoch": 0.022293371279497213, + "grad_norm": 3.1482997926234413, + "learning_rate": 9.288537549407115e-06, + "loss": 1.0854, + "step": 188 + }, + { + "epoch": 0.022411953041622197, + "grad_norm": 3.04452907062631, + "learning_rate": 9.33794466403162e-06, + "loss": 1.0224, + "step": 189 + }, + { + "epoch": 0.022530534803747185, + "grad_norm": 3.5779674496527667, + "learning_rate": 9.387351778656126e-06, + "loss": 0.8139, + "step": 190 + }, + { + "epoch": 0.02264911656587217, + "grad_norm": 3.1118150715574235, + "learning_rate": 9.436758893280632e-06, + "loss": 1.0896, + "step": 191 + }, + { + "epoch": 0.022767698327997153, + "grad_norm": 3.1440109613818974, + "learning_rate": 9.486166007905138e-06, + "loss": 1.1657, + "step": 192 + }, + { + "epoch": 0.02288628009012214, + "grad_norm": 3.5495503065498046, + "learning_rate": 9.535573122529645e-06, + "loss": 1.2247, + "step": 193 + }, + { + "epoch": 0.023004861852247125, + "grad_norm": 3.2695848907794716, + "learning_rate": 9.58498023715415e-06, + "loss": 1.2869, + "step": 194 + }, + { + "epoch": 0.02312344361437211, + "grad_norm": 3.619782453638658, + "learning_rate": 9.634387351778656e-06, + "loss": 1.1305, + "step": 195 + }, + { + "epoch": 0.023242025376497093, + "grad_norm": 3.77958219860935, + "learning_rate": 9.683794466403162e-06, + "loss": 1.1507, + "step": 196 + }, + { + "epoch": 0.02336060713862208, + "grad_norm": 3.231305756325196, + "learning_rate": 9.733201581027667e-06, + "loss": 1.3795, + "step": 197 + }, + { + "epoch": 0.023479188900747065, + "grad_norm": 3.272144840163199, + "learning_rate": 9.782608695652175e-06, + "loss": 1.2991, + "step": 198 + }, + { + "epoch": 0.02359777066287205, + "grad_norm": 3.2163198350672153, + "learning_rate": 9.83201581027668e-06, + "loss": 1.6104, + "step": 199 + }, + { + "epoch": 0.023716352424997037, + "grad_norm": 2.988892808386588, + "learning_rate": 9.881422924901186e-06, + "loss": 1.435, + "step": 200 + }, + { + "epoch": 0.02383493418712202, + "grad_norm": 3.107283218262279, + "learning_rate": 9.930830039525692e-06, + "loss": 1.4289, + "step": 201 + }, + { + "epoch": 0.023953515949247006, + "grad_norm": 3.4543468151119154, + "learning_rate": 9.980237154150199e-06, + "loss": 1.2277, + "step": 202 + }, + { + "epoch": 0.02407209771137199, + "grad_norm": 2.974171011998451, + "learning_rate": 1.0029644268774705e-05, + "loss": 1.2908, + "step": 203 + }, + { + "epoch": 0.024190679473496977, + "grad_norm": 3.326076310722646, + "learning_rate": 1.007905138339921e-05, + "loss": 1.1085, + "step": 204 + }, + { + "epoch": 0.02430926123562196, + "grad_norm": 2.794461991541987, + "learning_rate": 1.0128458498023716e-05, + "loss": 1.0693, + "step": 205 + }, + { + "epoch": 0.024427842997746946, + "grad_norm": 2.98309651474819, + "learning_rate": 1.0177865612648221e-05, + "loss": 1.4743, + "step": 206 + }, + { + "epoch": 0.02454642475987193, + "grad_norm": 2.736817891772091, + "learning_rate": 1.0227272727272729e-05, + "loss": 1.4711, + "step": 207 + }, + { + "epoch": 0.024665006521996918, + "grad_norm": 3.275712278506568, + "learning_rate": 1.0276679841897234e-05, + "loss": 1.1462, + "step": 208 + }, + { + "epoch": 0.024783588284121902, + "grad_norm": 3.6558416270782783, + "learning_rate": 1.032608695652174e-05, + "loss": 1.3439, + "step": 209 + }, + { + "epoch": 0.024902170046246886, + "grad_norm": 3.232849042632529, + "learning_rate": 1.0375494071146246e-05, + "loss": 1.4936, + "step": 210 + }, + { + "epoch": 0.025020751808371874, + "grad_norm": 3.233829764608336, + "learning_rate": 1.0424901185770751e-05, + "loss": 1.2881, + "step": 211 + }, + { + "epoch": 0.025139333570496858, + "grad_norm": 3.0654443181694226, + "learning_rate": 1.0474308300395258e-05, + "loss": 1.1639, + "step": 212 + }, + { + "epoch": 0.025257915332621842, + "grad_norm": 2.7865196739296247, + "learning_rate": 1.0523715415019764e-05, + "loss": 1.2722, + "step": 213 + }, + { + "epoch": 0.025376497094746826, + "grad_norm": 3.115314074935528, + "learning_rate": 1.057312252964427e-05, + "loss": 1.5186, + "step": 214 + }, + { + "epoch": 0.025495078856871814, + "grad_norm": 3.5258294570883684, + "learning_rate": 1.0622529644268775e-05, + "loss": 1.4668, + "step": 215 + }, + { + "epoch": 0.025613660618996798, + "grad_norm": 2.931127286710215, + "learning_rate": 1.0671936758893281e-05, + "loss": 1.3485, + "step": 216 + }, + { + "epoch": 0.025732242381121782, + "grad_norm": 3.0732311709708835, + "learning_rate": 1.0721343873517788e-05, + "loss": 1.2787, + "step": 217 + }, + { + "epoch": 0.02585082414324677, + "grad_norm": 2.723098769970518, + "learning_rate": 1.0770750988142294e-05, + "loss": 1.2318, + "step": 218 + }, + { + "epoch": 0.025969405905371754, + "grad_norm": 3.071806327744816, + "learning_rate": 1.08201581027668e-05, + "loss": 1.389, + "step": 219 + }, + { + "epoch": 0.02608798766749674, + "grad_norm": 3.2375957118465037, + "learning_rate": 1.0869565217391305e-05, + "loss": 1.1371, + "step": 220 + }, + { + "epoch": 0.026206569429621723, + "grad_norm": 3.001458532632069, + "learning_rate": 1.091897233201581e-05, + "loss": 1.3853, + "step": 221 + }, + { + "epoch": 0.02632515119174671, + "grad_norm": 3.487329871009364, + "learning_rate": 1.0968379446640318e-05, + "loss": 1.095, + "step": 222 + }, + { + "epoch": 0.026443732953871694, + "grad_norm": 3.1835632178903013, + "learning_rate": 1.1017786561264824e-05, + "loss": 1.6332, + "step": 223 + }, + { + "epoch": 0.02656231471599668, + "grad_norm": 3.467523876220792, + "learning_rate": 1.106719367588933e-05, + "loss": 1.0143, + "step": 224 + }, + { + "epoch": 0.026680896478121666, + "grad_norm": 2.911364808687225, + "learning_rate": 1.1116600790513835e-05, + "loss": 1.1766, + "step": 225 + }, + { + "epoch": 0.02679947824024665, + "grad_norm": 3.2167562309227353, + "learning_rate": 1.116600790513834e-05, + "loss": 1.5045, + "step": 226 + }, + { + "epoch": 0.026918060002371635, + "grad_norm": 3.4759934540061406, + "learning_rate": 1.1215415019762846e-05, + "loss": 1.1668, + "step": 227 + }, + { + "epoch": 0.02703664176449662, + "grad_norm": 3.297419934469312, + "learning_rate": 1.1264822134387352e-05, + "loss": 1.3496, + "step": 228 + }, + { + "epoch": 0.027155223526621607, + "grad_norm": 3.364102618831967, + "learning_rate": 1.1314229249011857e-05, + "loss": 1.1864, + "step": 229 + }, + { + "epoch": 0.02727380528874659, + "grad_norm": 3.1796901912912996, + "learning_rate": 1.1363636363636365e-05, + "loss": 1.4592, + "step": 230 + }, + { + "epoch": 0.027392387050871575, + "grad_norm": 2.800510376598655, + "learning_rate": 1.141304347826087e-05, + "loss": 1.2089, + "step": 231 + }, + { + "epoch": 0.027510968812996563, + "grad_norm": 4.005827348979385, + "learning_rate": 1.1462450592885376e-05, + "loss": 1.2746, + "step": 232 + }, + { + "epoch": 0.027629550575121547, + "grad_norm": 3.090867193648447, + "learning_rate": 1.1511857707509881e-05, + "loss": 1.3314, + "step": 233 + }, + { + "epoch": 0.02774813233724653, + "grad_norm": 3.39522468659474, + "learning_rate": 1.1561264822134387e-05, + "loss": 1.2716, + "step": 234 + }, + { + "epoch": 0.027866714099371515, + "grad_norm": 3.0192711391082647, + "learning_rate": 1.1610671936758893e-05, + "loss": 1.4077, + "step": 235 + }, + { + "epoch": 0.027985295861496503, + "grad_norm": 3.2403561609241636, + "learning_rate": 1.16600790513834e-05, + "loss": 1.2828, + "step": 236 + }, + { + "epoch": 0.028103877623621487, + "grad_norm": 3.1603540755334136, + "learning_rate": 1.1709486166007906e-05, + "loss": 1.1665, + "step": 237 + }, + { + "epoch": 0.02822245938574647, + "grad_norm": 3.3813771701826134, + "learning_rate": 1.1758893280632411e-05, + "loss": 1.4781, + "step": 238 + }, + { + "epoch": 0.02834104114787146, + "grad_norm": 3.0019164087983548, + "learning_rate": 1.1808300395256917e-05, + "loss": 1.1131, + "step": 239 + }, + { + "epoch": 0.028459622909996443, + "grad_norm": 3.1578127892215218, + "learning_rate": 1.1857707509881423e-05, + "loss": 1.3916, + "step": 240 + }, + { + "epoch": 0.028578204672121427, + "grad_norm": 2.9470404708927567, + "learning_rate": 1.190711462450593e-05, + "loss": 1.174, + "step": 241 + }, + { + "epoch": 0.02869678643424641, + "grad_norm": 3.1050812047836174, + "learning_rate": 1.1956521739130435e-05, + "loss": 1.3575, + "step": 242 + }, + { + "epoch": 0.0288153681963714, + "grad_norm": 3.1615698850784657, + "learning_rate": 1.2005928853754941e-05, + "loss": 1.2436, + "step": 243 + }, + { + "epoch": 0.028933949958496383, + "grad_norm": 3.3625992795196717, + "learning_rate": 1.2055335968379447e-05, + "loss": 1.3217, + "step": 244 + }, + { + "epoch": 0.029052531720621368, + "grad_norm": 3.3399551855723257, + "learning_rate": 1.2104743083003952e-05, + "loss": 1.5199, + "step": 245 + }, + { + "epoch": 0.029171113482746355, + "grad_norm": 3.129576888473782, + "learning_rate": 1.215415019762846e-05, + "loss": 1.0011, + "step": 246 + }, + { + "epoch": 0.02928969524487134, + "grad_norm": 3.335255382333624, + "learning_rate": 1.2203557312252965e-05, + "loss": 1.4456, + "step": 247 + }, + { + "epoch": 0.029408277006996324, + "grad_norm": 2.9507200211167444, + "learning_rate": 1.225296442687747e-05, + "loss": 1.2832, + "step": 248 + }, + { + "epoch": 0.029526858769121308, + "grad_norm": 2.854895707501369, + "learning_rate": 1.2302371541501976e-05, + "loss": 1.3801, + "step": 249 + }, + { + "epoch": 0.029645440531246296, + "grad_norm": 2.9750434725073496, + "learning_rate": 1.2351778656126482e-05, + "loss": 1.4296, + "step": 250 + }, + { + "epoch": 0.02976402229337128, + "grad_norm": 2.858011029472128, + "learning_rate": 1.240118577075099e-05, + "loss": 1.3666, + "step": 251 + }, + { + "epoch": 0.029882604055496264, + "grad_norm": 2.89441377368366, + "learning_rate": 1.2450592885375495e-05, + "loss": 1.1648, + "step": 252 + }, + { + "epoch": 0.030001185817621248, + "grad_norm": 2.794815472722316, + "learning_rate": 1.25e-05, + "loss": 1.5307, + "step": 253 + }, + { + "epoch": 0.030119767579746236, + "grad_norm": 2.9336641981417286, + "learning_rate": 1.2549407114624506e-05, + "loss": 1.1884, + "step": 254 + }, + { + "epoch": 0.03023834934187122, + "grad_norm": 2.939555396418759, + "learning_rate": 1.2598814229249012e-05, + "loss": 1.236, + "step": 255 + }, + { + "epoch": 0.030356931103996204, + "grad_norm": 3.02904822723086, + "learning_rate": 1.2648221343873517e-05, + "loss": 1.1735, + "step": 256 + }, + { + "epoch": 0.030475512866121192, + "grad_norm": 3.2708882494099565, + "learning_rate": 1.2697628458498023e-05, + "loss": 1.5093, + "step": 257 + }, + { + "epoch": 0.030594094628246176, + "grad_norm": 2.8132740117639696, + "learning_rate": 1.2747035573122532e-05, + "loss": 0.991, + "step": 258 + }, + { + "epoch": 0.03071267639037116, + "grad_norm": 3.628748609206158, + "learning_rate": 1.2796442687747038e-05, + "loss": 1.4408, + "step": 259 + }, + { + "epoch": 0.030831258152496144, + "grad_norm": 2.690392885950218, + "learning_rate": 1.2845849802371543e-05, + "loss": 1.4896, + "step": 260 + }, + { + "epoch": 0.030949839914621132, + "grad_norm": 2.7253767001875304, + "learning_rate": 1.2895256916996049e-05, + "loss": 1.44, + "step": 261 + }, + { + "epoch": 0.031068421676746116, + "grad_norm": 2.996097305741969, + "learning_rate": 1.2944664031620555e-05, + "loss": 1.215, + "step": 262 + }, + { + "epoch": 0.0311870034388711, + "grad_norm": 3.0305816572516777, + "learning_rate": 1.299407114624506e-05, + "loss": 1.3951, + "step": 263 + }, + { + "epoch": 0.031305585200996085, + "grad_norm": 2.813865807960468, + "learning_rate": 1.3043478260869566e-05, + "loss": 1.3024, + "step": 264 + }, + { + "epoch": 0.03142416696312107, + "grad_norm": 3.17286501915085, + "learning_rate": 1.3092885375494071e-05, + "loss": 1.3738, + "step": 265 + }, + { + "epoch": 0.03154274872524606, + "grad_norm": 2.91800033507398, + "learning_rate": 1.3142292490118577e-05, + "loss": 1.2549, + "step": 266 + }, + { + "epoch": 0.03166133048737104, + "grad_norm": 3.0355879144815265, + "learning_rate": 1.3191699604743083e-05, + "loss": 1.5729, + "step": 267 + }, + { + "epoch": 0.03177991224949603, + "grad_norm": 3.268922250348648, + "learning_rate": 1.3241106719367592e-05, + "loss": 1.1893, + "step": 268 + }, + { + "epoch": 0.031898494011621016, + "grad_norm": 3.588822855096603, + "learning_rate": 1.3290513833992097e-05, + "loss": 1.1747, + "step": 269 + }, + { + "epoch": 0.032017075773746, + "grad_norm": 2.9715923441486374, + "learning_rate": 1.3339920948616603e-05, + "loss": 1.0844, + "step": 270 + }, + { + "epoch": 0.032135657535870985, + "grad_norm": 2.9028381236258163, + "learning_rate": 1.3389328063241108e-05, + "loss": 1.328, + "step": 271 + }, + { + "epoch": 0.032254239297995965, + "grad_norm": 3.0676383468952255, + "learning_rate": 1.3438735177865614e-05, + "loss": 1.4886, + "step": 272 + }, + { + "epoch": 0.03237282106012095, + "grad_norm": 3.1607629298318454, + "learning_rate": 1.348814229249012e-05, + "loss": 1.4551, + "step": 273 + }, + { + "epoch": 0.03249140282224594, + "grad_norm": 2.939682519335106, + "learning_rate": 1.3537549407114625e-05, + "loss": 1.3138, + "step": 274 + }, + { + "epoch": 0.03260998458437092, + "grad_norm": 3.1490689635709477, + "learning_rate": 1.3586956521739131e-05, + "loss": 1.364, + "step": 275 + }, + { + "epoch": 0.03272856634649591, + "grad_norm": 3.228298996345851, + "learning_rate": 1.3636363636363637e-05, + "loss": 1.0217, + "step": 276 + }, + { + "epoch": 0.0328471481086209, + "grad_norm": 3.4447793508579405, + "learning_rate": 1.3685770750988142e-05, + "loss": 1.3476, + "step": 277 + }, + { + "epoch": 0.03296572987074588, + "grad_norm": 3.6130432267633124, + "learning_rate": 1.373517786561265e-05, + "loss": 1.1731, + "step": 278 + }, + { + "epoch": 0.033084311632870865, + "grad_norm": 3.42749570332953, + "learning_rate": 1.3784584980237155e-05, + "loss": 1.3049, + "step": 279 + }, + { + "epoch": 0.03320289339499585, + "grad_norm": 2.7633062234667114, + "learning_rate": 1.383399209486166e-05, + "loss": 1.4279, + "step": 280 + }, + { + "epoch": 0.03332147515712083, + "grad_norm": 2.81269753352603, + "learning_rate": 1.3883399209486166e-05, + "loss": 1.3276, + "step": 281 + }, + { + "epoch": 0.03344005691924582, + "grad_norm": 2.992052958959111, + "learning_rate": 1.3932806324110672e-05, + "loss": 1.273, + "step": 282 + }, + { + "epoch": 0.0335586386813708, + "grad_norm": 2.8044042641908282, + "learning_rate": 1.3982213438735178e-05, + "loss": 1.29, + "step": 283 + }, + { + "epoch": 0.03367722044349579, + "grad_norm": 3.0718594133503485, + "learning_rate": 1.4031620553359683e-05, + "loss": 1.3044, + "step": 284 + }, + { + "epoch": 0.03379580220562078, + "grad_norm": 3.007170895230799, + "learning_rate": 1.4081027667984189e-05, + "loss": 1.2988, + "step": 285 + }, + { + "epoch": 0.03391438396774576, + "grad_norm": 3.0035641600216296, + "learning_rate": 1.4130434782608694e-05, + "loss": 1.3672, + "step": 286 + }, + { + "epoch": 0.034032965729870746, + "grad_norm": 2.879150954864047, + "learning_rate": 1.4179841897233202e-05, + "loss": 1.3355, + "step": 287 + }, + { + "epoch": 0.03415154749199573, + "grad_norm": 2.7608409030046612, + "learning_rate": 1.4229249011857709e-05, + "loss": 1.4106, + "step": 288 + }, + { + "epoch": 0.034270129254120714, + "grad_norm": 3.120967959460571, + "learning_rate": 1.4278656126482215e-05, + "loss": 1.3414, + "step": 289 + }, + { + "epoch": 0.0343887110162457, + "grad_norm": 3.29530761471481, + "learning_rate": 1.432806324110672e-05, + "loss": 1.4086, + "step": 290 + }, + { + "epoch": 0.03450729277837069, + "grad_norm": 2.77052267374074, + "learning_rate": 1.4377470355731226e-05, + "loss": 1.2666, + "step": 291 + }, + { + "epoch": 0.03462587454049567, + "grad_norm": 2.8929161211593755, + "learning_rate": 1.4426877470355732e-05, + "loss": 1.2327, + "step": 292 + }, + { + "epoch": 0.03474445630262066, + "grad_norm": 2.6688492013211835, + "learning_rate": 1.4476284584980237e-05, + "loss": 1.1343, + "step": 293 + }, + { + "epoch": 0.034863038064745645, + "grad_norm": 2.8700171016090184, + "learning_rate": 1.4525691699604743e-05, + "loss": 1.3562, + "step": 294 + }, + { + "epoch": 0.034981619826870626, + "grad_norm": 3.07740375797054, + "learning_rate": 1.4575098814229248e-05, + "loss": 1.4666, + "step": 295 + }, + { + "epoch": 0.035100201588995614, + "grad_norm": 2.715821803820841, + "learning_rate": 1.4624505928853754e-05, + "loss": 1.2865, + "step": 296 + }, + { + "epoch": 0.035218783351120594, + "grad_norm": 2.6460045131291086, + "learning_rate": 1.4673913043478263e-05, + "loss": 1.5349, + "step": 297 + }, + { + "epoch": 0.03533736511324558, + "grad_norm": 2.790848655126991, + "learning_rate": 1.4723320158102769e-05, + "loss": 1.0609, + "step": 298 + }, + { + "epoch": 0.03545594687537057, + "grad_norm": 3.0207164786220453, + "learning_rate": 1.4772727272727274e-05, + "loss": 1.4353, + "step": 299 + }, + { + "epoch": 0.03557452863749555, + "grad_norm": 2.8118512131019666, + "learning_rate": 1.482213438735178e-05, + "loss": 0.8828, + "step": 300 + }, + { + "epoch": 0.03569311039962054, + "grad_norm": 2.4199788367860027, + "learning_rate": 1.4871541501976285e-05, + "loss": 1.2519, + "step": 301 + }, + { + "epoch": 0.035811692161745526, + "grad_norm": 2.830151537521618, + "learning_rate": 1.4920948616600791e-05, + "loss": 1.1018, + "step": 302 + }, + { + "epoch": 0.03593027392387051, + "grad_norm": 3.1894980583872172, + "learning_rate": 1.4970355731225297e-05, + "loss": 1.1274, + "step": 303 + }, + { + "epoch": 0.036048855685995494, + "grad_norm": 2.934860539549872, + "learning_rate": 1.5019762845849802e-05, + "loss": 1.3366, + "step": 304 + }, + { + "epoch": 0.03616743744812048, + "grad_norm": 3.005695626896576, + "learning_rate": 1.5069169960474308e-05, + "loss": 1.2271, + "step": 305 + }, + { + "epoch": 0.03628601921024546, + "grad_norm": 3.062027378007026, + "learning_rate": 1.5118577075098814e-05, + "loss": 1.2645, + "step": 306 + }, + { + "epoch": 0.03640460097237045, + "grad_norm": 3.1159478259058675, + "learning_rate": 1.5167984189723323e-05, + "loss": 1.4309, + "step": 307 + }, + { + "epoch": 0.03652318273449544, + "grad_norm": 2.8160745677737067, + "learning_rate": 1.5217391304347828e-05, + "loss": 1.4065, + "step": 308 + }, + { + "epoch": 0.03664176449662042, + "grad_norm": 3.1389496736732228, + "learning_rate": 1.5266798418972334e-05, + "loss": 1.4968, + "step": 309 + }, + { + "epoch": 0.036760346258745406, + "grad_norm": 2.995121412938753, + "learning_rate": 1.531620553359684e-05, + "loss": 1.0021, + "step": 310 + }, + { + "epoch": 0.03687892802087039, + "grad_norm": 3.0908821774613187, + "learning_rate": 1.5365612648221345e-05, + "loss": 1.1323, + "step": 311 + }, + { + "epoch": 0.036997509782995375, + "grad_norm": 2.632724885137674, + "learning_rate": 1.541501976284585e-05, + "loss": 1.1269, + "step": 312 + }, + { + "epoch": 0.03711609154512036, + "grad_norm": 2.6593956631496374, + "learning_rate": 1.5464426877470356e-05, + "loss": 1.3245, + "step": 313 + }, + { + "epoch": 0.03723467330724534, + "grad_norm": 2.8490044181183682, + "learning_rate": 1.5513833992094862e-05, + "loss": 1.3662, + "step": 314 + }, + { + "epoch": 0.03735325506937033, + "grad_norm": 2.901610373823375, + "learning_rate": 1.5563241106719367e-05, + "loss": 1.1619, + "step": 315 + }, + { + "epoch": 0.03747183683149532, + "grad_norm": 2.899104145631089, + "learning_rate": 1.5612648221343873e-05, + "loss": 1.1575, + "step": 316 + }, + { + "epoch": 0.0375904185936203, + "grad_norm": 3.1200289361112645, + "learning_rate": 1.5662055335968382e-05, + "loss": 1.0384, + "step": 317 + }, + { + "epoch": 0.03770900035574529, + "grad_norm": 2.7294800265354238, + "learning_rate": 1.5711462450592888e-05, + "loss": 0.9991, + "step": 318 + }, + { + "epoch": 0.037827582117870275, + "grad_norm": 3.07139464376348, + "learning_rate": 1.5760869565217393e-05, + "loss": 1.3464, + "step": 319 + }, + { + "epoch": 0.037946163879995255, + "grad_norm": 2.8207673297276306, + "learning_rate": 1.58102766798419e-05, + "loss": 1.1102, + "step": 320 + }, + { + "epoch": 0.03806474564212024, + "grad_norm": 2.708080661837101, + "learning_rate": 1.5859683794466405e-05, + "loss": 1.2179, + "step": 321 + }, + { + "epoch": 0.038183327404245224, + "grad_norm": 2.4727939501155034, + "learning_rate": 1.590909090909091e-05, + "loss": 1.0123, + "step": 322 + }, + { + "epoch": 0.03830190916637021, + "grad_norm": 3.3053693464665748, + "learning_rate": 1.5958498023715416e-05, + "loss": 1.2476, + "step": 323 + }, + { + "epoch": 0.0384204909284952, + "grad_norm": 2.550640356902746, + "learning_rate": 1.600790513833992e-05, + "loss": 1.1672, + "step": 324 + }, + { + "epoch": 0.03853907269062018, + "grad_norm": 2.531894398368546, + "learning_rate": 1.6057312252964427e-05, + "loss": 1.3367, + "step": 325 + }, + { + "epoch": 0.03865765445274517, + "grad_norm": 2.8996648833907175, + "learning_rate": 1.6106719367588936e-05, + "loss": 1.3594, + "step": 326 + }, + { + "epoch": 0.038776236214870155, + "grad_norm": 3.2383767106258246, + "learning_rate": 1.615612648221344e-05, + "loss": 0.9857, + "step": 327 + }, + { + "epoch": 0.038894817976995136, + "grad_norm": 2.931043829552459, + "learning_rate": 1.6205533596837947e-05, + "loss": 1.2328, + "step": 328 + }, + { + "epoch": 0.03901339973912012, + "grad_norm": 2.568735956061358, + "learning_rate": 1.6254940711462453e-05, + "loss": 1.229, + "step": 329 + }, + { + "epoch": 0.03913198150124511, + "grad_norm": 2.7393826944786426, + "learning_rate": 1.630434782608696e-05, + "loss": 1.2744, + "step": 330 + }, + { + "epoch": 0.03925056326337009, + "grad_norm": 2.8854648571402017, + "learning_rate": 1.6353754940711464e-05, + "loss": 1.1236, + "step": 331 + }, + { + "epoch": 0.03936914502549508, + "grad_norm": 2.75479551720139, + "learning_rate": 1.640316205533597e-05, + "loss": 1.429, + "step": 332 + }, + { + "epoch": 0.03948772678762007, + "grad_norm": 3.1814163306804795, + "learning_rate": 1.6452569169960475e-05, + "loss": 1.2706, + "step": 333 + }, + { + "epoch": 0.03960630854974505, + "grad_norm": 3.0679336409926017, + "learning_rate": 1.650197628458498e-05, + "loss": 1.3427, + "step": 334 + }, + { + "epoch": 0.039724890311870036, + "grad_norm": 2.833714022158204, + "learning_rate": 1.6551383399209487e-05, + "loss": 1.2048, + "step": 335 + }, + { + "epoch": 0.039843472073995016, + "grad_norm": 2.811441805328102, + "learning_rate": 1.6600790513833996e-05, + "loss": 1.3839, + "step": 336 + }, + { + "epoch": 0.039962053836120004, + "grad_norm": 2.8783084562160934, + "learning_rate": 1.66501976284585e-05, + "loss": 1.4425, + "step": 337 + }, + { + "epoch": 0.04008063559824499, + "grad_norm": 2.869220738097393, + "learning_rate": 1.6699604743083007e-05, + "loss": 1.2402, + "step": 338 + }, + { + "epoch": 0.04019921736036997, + "grad_norm": 2.4912980928093034, + "learning_rate": 1.6749011857707512e-05, + "loss": 1.4056, + "step": 339 + }, + { + "epoch": 0.04031779912249496, + "grad_norm": 2.8780374896546905, + "learning_rate": 1.6798418972332018e-05, + "loss": 1.5192, + "step": 340 + }, + { + "epoch": 0.04043638088461995, + "grad_norm": 2.483578857441582, + "learning_rate": 1.6847826086956524e-05, + "loss": 1.4183, + "step": 341 + }, + { + "epoch": 0.04055496264674493, + "grad_norm": 2.848729534490631, + "learning_rate": 1.689723320158103e-05, + "loss": 1.154, + "step": 342 + }, + { + "epoch": 0.040673544408869916, + "grad_norm": 2.978625663186514, + "learning_rate": 1.6946640316205535e-05, + "loss": 1.2166, + "step": 343 + }, + { + "epoch": 0.040792126170994904, + "grad_norm": 3.0852779736945655, + "learning_rate": 1.699604743083004e-05, + "loss": 1.392, + "step": 344 + }, + { + "epoch": 0.040910707933119884, + "grad_norm": 2.8000682430258, + "learning_rate": 1.7045454545454546e-05, + "loss": 1.1834, + "step": 345 + }, + { + "epoch": 0.04102928969524487, + "grad_norm": 3.471516636774333, + "learning_rate": 1.7094861660079055e-05, + "loss": 1.2817, + "step": 346 + }, + { + "epoch": 0.04114787145736986, + "grad_norm": 2.958564646878144, + "learning_rate": 1.714426877470356e-05, + "loss": 1.1575, + "step": 347 + }, + { + "epoch": 0.04126645321949484, + "grad_norm": 2.7306316127406682, + "learning_rate": 1.7193675889328066e-05, + "loss": 1.3868, + "step": 348 + }, + { + "epoch": 0.04138503498161983, + "grad_norm": 2.6542113314104716, + "learning_rate": 1.7243083003952572e-05, + "loss": 1.0978, + "step": 349 + }, + { + "epoch": 0.04150361674374481, + "grad_norm": 3.0212890662894334, + "learning_rate": 1.7292490118577078e-05, + "loss": 1.0651, + "step": 350 + }, + { + "epoch": 0.0416221985058698, + "grad_norm": 2.589383615461841, + "learning_rate": 1.7341897233201583e-05, + "loss": 1.3817, + "step": 351 + }, + { + "epoch": 0.041740780267994784, + "grad_norm": 2.7158519772469196, + "learning_rate": 1.739130434782609e-05, + "loss": 1.2833, + "step": 352 + }, + { + "epoch": 0.041859362030119765, + "grad_norm": 2.3988236994354692, + "learning_rate": 1.7440711462450594e-05, + "loss": 1.5628, + "step": 353 + }, + { + "epoch": 0.04197794379224475, + "grad_norm": 2.6525156123483264, + "learning_rate": 1.74901185770751e-05, + "loss": 1.155, + "step": 354 + }, + { + "epoch": 0.04209652555436974, + "grad_norm": 2.746334648905017, + "learning_rate": 1.7539525691699606e-05, + "loss": 1.2261, + "step": 355 + }, + { + "epoch": 0.04221510731649472, + "grad_norm": 2.9308772233452474, + "learning_rate": 1.758893280632411e-05, + "loss": 1.3595, + "step": 356 + }, + { + "epoch": 0.04233368907861971, + "grad_norm": 3.2317202838323134, + "learning_rate": 1.7638339920948617e-05, + "loss": 1.1708, + "step": 357 + }, + { + "epoch": 0.042452270840744696, + "grad_norm": 2.7751069380383964, + "learning_rate": 1.7687747035573123e-05, + "loss": 1.4414, + "step": 358 + }, + { + "epoch": 0.04257085260286968, + "grad_norm": 3.2992330129634166, + "learning_rate": 1.7737154150197628e-05, + "loss": 1.3721, + "step": 359 + }, + { + "epoch": 0.042689434364994665, + "grad_norm": 2.6879681968900226, + "learning_rate": 1.7786561264822134e-05, + "loss": 1.3715, + "step": 360 + }, + { + "epoch": 0.04280801612711965, + "grad_norm": 2.5248425908653944, + "learning_rate": 1.783596837944664e-05, + "loss": 1.3598, + "step": 361 + }, + { + "epoch": 0.04292659788924463, + "grad_norm": 2.768215996766254, + "learning_rate": 1.7885375494071145e-05, + "loss": 1.2683, + "step": 362 + }, + { + "epoch": 0.04304517965136962, + "grad_norm": 2.5064096555244366, + "learning_rate": 1.793478260869565e-05, + "loss": 1.5114, + "step": 363 + }, + { + "epoch": 0.0431637614134946, + "grad_norm": 2.6248952213619723, + "learning_rate": 1.7984189723320156e-05, + "loss": 1.0202, + "step": 364 + }, + { + "epoch": 0.04328234317561959, + "grad_norm": 2.7129786968153313, + "learning_rate": 1.8033596837944665e-05, + "loss": 1.5015, + "step": 365 + }, + { + "epoch": 0.04340092493774458, + "grad_norm": 2.7623959890852228, + "learning_rate": 1.808300395256917e-05, + "loss": 1.3076, + "step": 366 + }, + { + "epoch": 0.04351950669986956, + "grad_norm": 2.49062022415326, + "learning_rate": 1.8132411067193676e-05, + "loss": 1.0397, + "step": 367 + }, + { + "epoch": 0.043638088461994545, + "grad_norm": 2.701721090939951, + "learning_rate": 1.8181818181818182e-05, + "loss": 1.2362, + "step": 368 + }, + { + "epoch": 0.04375667022411953, + "grad_norm": 2.693200520797649, + "learning_rate": 1.8231225296442688e-05, + "loss": 1.3149, + "step": 369 + }, + { + "epoch": 0.043875251986244514, + "grad_norm": 2.370850380411606, + "learning_rate": 1.8280632411067193e-05, + "loss": 1.0036, + "step": 370 + }, + { + "epoch": 0.0439938337483695, + "grad_norm": 3.09354127304867, + "learning_rate": 1.83300395256917e-05, + "loss": 1.0679, + "step": 371 + }, + { + "epoch": 0.04411241551049449, + "grad_norm": 2.4601969145644618, + "learning_rate": 1.8379446640316205e-05, + "loss": 1.0765, + "step": 372 + }, + { + "epoch": 0.04423099727261947, + "grad_norm": 2.6708841028139467, + "learning_rate": 1.842885375494071e-05, + "loss": 1.2483, + "step": 373 + }, + { + "epoch": 0.04434957903474446, + "grad_norm": 2.8173795559657973, + "learning_rate": 1.8478260869565216e-05, + "loss": 1.4145, + "step": 374 + }, + { + "epoch": 0.04446816079686944, + "grad_norm": 2.7968738061251455, + "learning_rate": 1.8527667984189725e-05, + "loss": 1.337, + "step": 375 + }, + { + "epoch": 0.044586742558994426, + "grad_norm": 2.6730911336625147, + "learning_rate": 1.857707509881423e-05, + "loss": 1.1814, + "step": 376 + }, + { + "epoch": 0.04470532432111941, + "grad_norm": 2.2466398240845074, + "learning_rate": 1.8626482213438736e-05, + "loss": 1.3667, + "step": 377 + }, + { + "epoch": 0.044823906083244394, + "grad_norm": 2.8047732189056256, + "learning_rate": 1.867588932806324e-05, + "loss": 1.3818, + "step": 378 + }, + { + "epoch": 0.04494248784536938, + "grad_norm": 3.1374057943405527, + "learning_rate": 1.8725296442687747e-05, + "loss": 1.3987, + "step": 379 + }, + { + "epoch": 0.04506106960749437, + "grad_norm": 2.5800989925266578, + "learning_rate": 1.8774703557312253e-05, + "loss": 1.3251, + "step": 380 + }, + { + "epoch": 0.04517965136961935, + "grad_norm": 2.98255007678058, + "learning_rate": 1.882411067193676e-05, + "loss": 1.3252, + "step": 381 + }, + { + "epoch": 0.04529823313174434, + "grad_norm": 2.5866644842917115, + "learning_rate": 1.8873517786561264e-05, + "loss": 1.2671, + "step": 382 + }, + { + "epoch": 0.045416814893869326, + "grad_norm": 2.452559032895594, + "learning_rate": 1.892292490118577e-05, + "loss": 1.2186, + "step": 383 + }, + { + "epoch": 0.045535396655994306, + "grad_norm": 2.192440791600819, + "learning_rate": 1.8972332015810275e-05, + "loss": 1.4327, + "step": 384 + }, + { + "epoch": 0.045653978418119294, + "grad_norm": 2.5565570508693742, + "learning_rate": 1.9021739130434784e-05, + "loss": 1.2116, + "step": 385 + }, + { + "epoch": 0.04577256018024428, + "grad_norm": 2.4046860349727965, + "learning_rate": 1.907114624505929e-05, + "loss": 1.4138, + "step": 386 + }, + { + "epoch": 0.04589114194236926, + "grad_norm": 3.087776738481778, + "learning_rate": 1.9120553359683796e-05, + "loss": 1.1523, + "step": 387 + }, + { + "epoch": 0.04600972370449425, + "grad_norm": 2.40347872311201, + "learning_rate": 1.91699604743083e-05, + "loss": 1.2957, + "step": 388 + }, + { + "epoch": 0.04612830546661923, + "grad_norm": 2.577403784881223, + "learning_rate": 1.9219367588932807e-05, + "loss": 1.2516, + "step": 389 + }, + { + "epoch": 0.04624688722874422, + "grad_norm": 3.627663315251857, + "learning_rate": 1.9268774703557312e-05, + "loss": 1.1396, + "step": 390 + }, + { + "epoch": 0.046365468990869206, + "grad_norm": 2.904375696802533, + "learning_rate": 1.9318181818181818e-05, + "loss": 1.2148, + "step": 391 + }, + { + "epoch": 0.04648405075299419, + "grad_norm": 2.6248782159696846, + "learning_rate": 1.9367588932806324e-05, + "loss": 1.2488, + "step": 392 + }, + { + "epoch": 0.046602632515119174, + "grad_norm": 2.946821241101097, + "learning_rate": 1.941699604743083e-05, + "loss": 1.3025, + "step": 393 + }, + { + "epoch": 0.04672121427724416, + "grad_norm": 2.5918187772174686, + "learning_rate": 1.9466403162055335e-05, + "loss": 1.0872, + "step": 394 + }, + { + "epoch": 0.04683979603936914, + "grad_norm": 3.0203386230678806, + "learning_rate": 1.9515810276679844e-05, + "loss": 1.3952, + "step": 395 + }, + { + "epoch": 0.04695837780149413, + "grad_norm": 2.725483459510198, + "learning_rate": 1.956521739130435e-05, + "loss": 1.3247, + "step": 396 + }, + { + "epoch": 0.04707695956361912, + "grad_norm": 2.6259583528738006, + "learning_rate": 1.9614624505928855e-05, + "loss": 1.2225, + "step": 397 + }, + { + "epoch": 0.0471955413257441, + "grad_norm": 2.698273418177609, + "learning_rate": 1.966403162055336e-05, + "loss": 1.1325, + "step": 398 + }, + { + "epoch": 0.04731412308786909, + "grad_norm": 2.530249329574852, + "learning_rate": 1.9713438735177866e-05, + "loss": 1.0368, + "step": 399 + }, + { + "epoch": 0.047432704849994074, + "grad_norm": 2.7126273436438573, + "learning_rate": 1.9762845849802372e-05, + "loss": 1.2787, + "step": 400 + }, + { + "epoch": 0.047551286612119055, + "grad_norm": 2.8273910219060485, + "learning_rate": 1.9812252964426878e-05, + "loss": 1.3649, + "step": 401 + }, + { + "epoch": 0.04766986837424404, + "grad_norm": 2.453252721776185, + "learning_rate": 1.9861660079051383e-05, + "loss": 1.2477, + "step": 402 + }, + { + "epoch": 0.04778845013636902, + "grad_norm": 2.4934203363558427, + "learning_rate": 1.991106719367589e-05, + "loss": 1.3257, + "step": 403 + }, + { + "epoch": 0.04790703189849401, + "grad_norm": 2.630789345122103, + "learning_rate": 1.9960474308300398e-05, + "loss": 1.4981, + "step": 404 + }, + { + "epoch": 0.048025613660619, + "grad_norm": 2.5627331187175164, + "learning_rate": 2.0009881422924903e-05, + "loss": 1.1666, + "step": 405 + }, + { + "epoch": 0.04814419542274398, + "grad_norm": 2.5838119959030243, + "learning_rate": 2.005928853754941e-05, + "loss": 1.0082, + "step": 406 + }, + { + "epoch": 0.04826277718486897, + "grad_norm": 2.918813289254095, + "learning_rate": 2.0108695652173915e-05, + "loss": 1.2295, + "step": 407 + }, + { + "epoch": 0.048381358946993955, + "grad_norm": 2.7532907852622728, + "learning_rate": 2.015810276679842e-05, + "loss": 1.3116, + "step": 408 + }, + { + "epoch": 0.048499940709118936, + "grad_norm": 2.6796675143457414, + "learning_rate": 2.0207509881422926e-05, + "loss": 1.4023, + "step": 409 + }, + { + "epoch": 0.04861852247124392, + "grad_norm": 2.843202942987257, + "learning_rate": 2.025691699604743e-05, + "loss": 1.2611, + "step": 410 + }, + { + "epoch": 0.04873710423336891, + "grad_norm": 2.534438516774595, + "learning_rate": 2.0306324110671937e-05, + "loss": 1.2428, + "step": 411 + }, + { + "epoch": 0.04885568599549389, + "grad_norm": 2.6450572993182013, + "learning_rate": 2.0355731225296443e-05, + "loss": 1.271, + "step": 412 + }, + { + "epoch": 0.04897426775761888, + "grad_norm": 2.487767986910595, + "learning_rate": 2.040513833992095e-05, + "loss": 1.4073, + "step": 413 + }, + { + "epoch": 0.04909284951974386, + "grad_norm": 2.779679036561318, + "learning_rate": 2.0454545454545457e-05, + "loss": 1.2182, + "step": 414 + }, + { + "epoch": 0.04921143128186885, + "grad_norm": 2.5826758599738158, + "learning_rate": 2.0503952569169963e-05, + "loss": 1.3293, + "step": 415 + }, + { + "epoch": 0.049330013043993835, + "grad_norm": 2.679500276811846, + "learning_rate": 2.055335968379447e-05, + "loss": 1.3586, + "step": 416 + }, + { + "epoch": 0.049448594806118816, + "grad_norm": 2.5570392799908066, + "learning_rate": 2.0602766798418974e-05, + "loss": 1.4393, + "step": 417 + }, + { + "epoch": 0.049567176568243804, + "grad_norm": 2.663263864974672, + "learning_rate": 2.065217391304348e-05, + "loss": 1.1841, + "step": 418 + }, + { + "epoch": 0.04968575833036879, + "grad_norm": 2.747566745937756, + "learning_rate": 2.0701581027667985e-05, + "loss": 1.2873, + "step": 419 + }, + { + "epoch": 0.04980434009249377, + "grad_norm": 2.633546314291064, + "learning_rate": 2.075098814229249e-05, + "loss": 1.2836, + "step": 420 + }, + { + "epoch": 0.04992292185461876, + "grad_norm": 2.7890223401303724, + "learning_rate": 2.0800395256916997e-05, + "loss": 1.1734, + "step": 421 + }, + { + "epoch": 0.05004150361674375, + "grad_norm": 2.8685974366873372, + "learning_rate": 2.0849802371541502e-05, + "loss": 1.416, + "step": 422 + }, + { + "epoch": 0.05016008537886873, + "grad_norm": 2.1011346736711665, + "learning_rate": 2.0899209486166008e-05, + "loss": 1.5013, + "step": 423 + }, + { + "epoch": 0.050278667140993716, + "grad_norm": 2.493094272852067, + "learning_rate": 2.0948616600790517e-05, + "loss": 1.1525, + "step": 424 + }, + { + "epoch": 0.050397248903118703, + "grad_norm": 2.508612821617293, + "learning_rate": 2.0998023715415023e-05, + "loss": 1.2973, + "step": 425 + }, + { + "epoch": 0.050515830665243684, + "grad_norm": 2.5260690991520383, + "learning_rate": 2.1047430830039528e-05, + "loss": 1.2556, + "step": 426 + }, + { + "epoch": 0.05063441242736867, + "grad_norm": 2.3555352058181285, + "learning_rate": 2.1096837944664034e-05, + "loss": 1.1786, + "step": 427 + }, + { + "epoch": 0.05075299418949365, + "grad_norm": 2.89919894739498, + "learning_rate": 2.114624505928854e-05, + "loss": 1.4252, + "step": 428 + }, + { + "epoch": 0.05087157595161864, + "grad_norm": 2.4757561248996245, + "learning_rate": 2.1195652173913045e-05, + "loss": 1.107, + "step": 429 + }, + { + "epoch": 0.05099015771374363, + "grad_norm": 2.682947988089475, + "learning_rate": 2.124505928853755e-05, + "loss": 1.1692, + "step": 430 + }, + { + "epoch": 0.05110873947586861, + "grad_norm": 2.4097428191628185, + "learning_rate": 2.1294466403162056e-05, + "loss": 1.1052, + "step": 431 + }, + { + "epoch": 0.051227321237993596, + "grad_norm": 2.7409542482830416, + "learning_rate": 2.1343873517786562e-05, + "loss": 0.9055, + "step": 432 + }, + { + "epoch": 0.051345903000118584, + "grad_norm": 2.5372125088557453, + "learning_rate": 2.1393280632411067e-05, + "loss": 1.2053, + "step": 433 + }, + { + "epoch": 0.051464484762243565, + "grad_norm": 2.762055197192633, + "learning_rate": 2.1442687747035576e-05, + "loss": 1.298, + "step": 434 + }, + { + "epoch": 0.05158306652436855, + "grad_norm": 2.3011982487989764, + "learning_rate": 2.1492094861660082e-05, + "loss": 1.0514, + "step": 435 + }, + { + "epoch": 0.05170164828649354, + "grad_norm": 2.5255779203953117, + "learning_rate": 2.1541501976284588e-05, + "loss": 1.264, + "step": 436 + }, + { + "epoch": 0.05182023004861852, + "grad_norm": 2.435216582525352, + "learning_rate": 2.1590909090909093e-05, + "loss": 0.7867, + "step": 437 + }, + { + "epoch": 0.05193881181074351, + "grad_norm": 2.353394507643216, + "learning_rate": 2.16403162055336e-05, + "loss": 1.2825, + "step": 438 + }, + { + "epoch": 0.052057393572868496, + "grad_norm": 2.5313845209200934, + "learning_rate": 2.1689723320158105e-05, + "loss": 1.3053, + "step": 439 + }, + { + "epoch": 0.05217597533499348, + "grad_norm": 2.5154295218017158, + "learning_rate": 2.173913043478261e-05, + "loss": 1.1763, + "step": 440 + }, + { + "epoch": 0.052294557097118465, + "grad_norm": 2.3197559609956264, + "learning_rate": 2.1788537549407116e-05, + "loss": 1.2474, + "step": 441 + }, + { + "epoch": 0.052413138859243445, + "grad_norm": 2.4039543623357287, + "learning_rate": 2.183794466403162e-05, + "loss": 1.1774, + "step": 442 + }, + { + "epoch": 0.05253172062136843, + "grad_norm": 2.7607236114077995, + "learning_rate": 2.188735177865613e-05, + "loss": 1.2824, + "step": 443 + }, + { + "epoch": 0.05265030238349342, + "grad_norm": 2.326240623762275, + "learning_rate": 2.1936758893280636e-05, + "loss": 1.3178, + "step": 444 + }, + { + "epoch": 0.0527688841456184, + "grad_norm": 2.2615910131927093, + "learning_rate": 2.198616600790514e-05, + "loss": 1.4656, + "step": 445 + }, + { + "epoch": 0.05288746590774339, + "grad_norm": 2.3310789269739747, + "learning_rate": 2.2035573122529647e-05, + "loss": 1.2193, + "step": 446 + }, + { + "epoch": 0.05300604766986838, + "grad_norm": 2.2586959253526326, + "learning_rate": 2.2084980237154153e-05, + "loss": 1.1863, + "step": 447 + }, + { + "epoch": 0.05312462943199336, + "grad_norm": 2.382749475739641, + "learning_rate": 2.213438735177866e-05, + "loss": 1.1248, + "step": 448 + }, + { + "epoch": 0.053243211194118345, + "grad_norm": 2.138637898312669, + "learning_rate": 2.2183794466403164e-05, + "loss": 1.2273, + "step": 449 + }, + { + "epoch": 0.05336179295624333, + "grad_norm": 3.4037441782123965, + "learning_rate": 2.223320158102767e-05, + "loss": 1.3639, + "step": 450 + }, + { + "epoch": 0.05348037471836831, + "grad_norm": 2.580456951357517, + "learning_rate": 2.2282608695652175e-05, + "loss": 1.0896, + "step": 451 + }, + { + "epoch": 0.0535989564804933, + "grad_norm": 2.7394744607809596, + "learning_rate": 2.233201581027668e-05, + "loss": 1.1623, + "step": 452 + }, + { + "epoch": 0.05371753824261829, + "grad_norm": 2.653299356035562, + "learning_rate": 2.2381422924901187e-05, + "loss": 1.267, + "step": 453 + }, + { + "epoch": 0.05383612000474327, + "grad_norm": 2.500084480235046, + "learning_rate": 2.2430830039525692e-05, + "loss": 1.0464, + "step": 454 + }, + { + "epoch": 0.05395470176686826, + "grad_norm": 2.314329189943909, + "learning_rate": 2.2480237154150198e-05, + "loss": 1.45, + "step": 455 + }, + { + "epoch": 0.05407328352899324, + "grad_norm": 2.4125235643241596, + "learning_rate": 2.2529644268774703e-05, + "loss": 1.0963, + "step": 456 + }, + { + "epoch": 0.054191865291118226, + "grad_norm": 2.3471230259516203, + "learning_rate": 2.257905138339921e-05, + "loss": 1.1924, + "step": 457 + }, + { + "epoch": 0.05431044705324321, + "grad_norm": 2.3298907769978063, + "learning_rate": 2.2628458498023715e-05, + "loss": 1.123, + "step": 458 + }, + { + "epoch": 0.054429028815368194, + "grad_norm": 2.4931427271189066, + "learning_rate": 2.267786561264822e-05, + "loss": 0.9861, + "step": 459 + }, + { + "epoch": 0.05454761057749318, + "grad_norm": 2.7180656690207163, + "learning_rate": 2.272727272727273e-05, + "loss": 1.481, + "step": 460 + }, + { + "epoch": 0.05466619233961817, + "grad_norm": 2.3575846367746567, + "learning_rate": 2.2776679841897235e-05, + "loss": 0.9426, + "step": 461 + }, + { + "epoch": 0.05478477410174315, + "grad_norm": 2.5332645841909343, + "learning_rate": 2.282608695652174e-05, + "loss": 1.312, + "step": 462 + }, + { + "epoch": 0.05490335586386814, + "grad_norm": 2.555982982982781, + "learning_rate": 2.2875494071146246e-05, + "loss": 1.1929, + "step": 463 + }, + { + "epoch": 0.055021937625993125, + "grad_norm": 2.40072650974613, + "learning_rate": 2.2924901185770752e-05, + "loss": 1.2516, + "step": 464 + }, + { + "epoch": 0.055140519388118106, + "grad_norm": 2.3778537022151767, + "learning_rate": 2.2974308300395257e-05, + "loss": 1.4855, + "step": 465 + }, + { + "epoch": 0.055259101150243094, + "grad_norm": 2.5340798929760866, + "learning_rate": 2.3023715415019763e-05, + "loss": 1.4211, + "step": 466 + }, + { + "epoch": 0.055377682912368074, + "grad_norm": 2.477747414468036, + "learning_rate": 2.307312252964427e-05, + "loss": 1.262, + "step": 467 + }, + { + "epoch": 0.05549626467449306, + "grad_norm": 2.500063974888124, + "learning_rate": 2.3122529644268774e-05, + "loss": 1.2145, + "step": 468 + }, + { + "epoch": 0.05561484643661805, + "grad_norm": 2.275696839469494, + "learning_rate": 2.317193675889328e-05, + "loss": 1.4108, + "step": 469 + }, + { + "epoch": 0.05573342819874303, + "grad_norm": 2.28745630330474, + "learning_rate": 2.3221343873517785e-05, + "loss": 1.1709, + "step": 470 + }, + { + "epoch": 0.05585200996086802, + "grad_norm": 2.420649632953623, + "learning_rate": 2.327075098814229e-05, + "loss": 1.4523, + "step": 471 + }, + { + "epoch": 0.055970591722993006, + "grad_norm": 2.6885323833495303, + "learning_rate": 2.33201581027668e-05, + "loss": 1.4183, + "step": 472 + }, + { + "epoch": 0.05608917348511799, + "grad_norm": 2.4894451378739184, + "learning_rate": 2.3369565217391306e-05, + "loss": 1.1483, + "step": 473 + }, + { + "epoch": 0.056207755247242974, + "grad_norm": 2.313024376563048, + "learning_rate": 2.341897233201581e-05, + "loss": 1.3377, + "step": 474 + }, + { + "epoch": 0.05632633700936796, + "grad_norm": 2.294353848201621, + "learning_rate": 2.3468379446640317e-05, + "loss": 1.3604, + "step": 475 + }, + { + "epoch": 0.05644491877149294, + "grad_norm": 2.1362500600365713, + "learning_rate": 2.3517786561264823e-05, + "loss": 1.181, + "step": 476 + }, + { + "epoch": 0.05656350053361793, + "grad_norm": 2.563769832874949, + "learning_rate": 2.3567193675889328e-05, + "loss": 1.2985, + "step": 477 + }, + { + "epoch": 0.05668208229574292, + "grad_norm": 2.9297674376484006, + "learning_rate": 2.3616600790513834e-05, + "loss": 1.5035, + "step": 478 + }, + { + "epoch": 0.0568006640578679, + "grad_norm": 2.618748600000825, + "learning_rate": 2.366600790513834e-05, + "loss": 1.379, + "step": 479 + }, + { + "epoch": 0.056919245819992886, + "grad_norm": 2.382478210556736, + "learning_rate": 2.3715415019762845e-05, + "loss": 1.0395, + "step": 480 + }, + { + "epoch": 0.05703782758211787, + "grad_norm": 2.5129405838238714, + "learning_rate": 2.376482213438735e-05, + "loss": 1.1808, + "step": 481 + }, + { + "epoch": 0.057156409344242855, + "grad_norm": 2.133649214639615, + "learning_rate": 2.381422924901186e-05, + "loss": 1.2312, + "step": 482 + }, + { + "epoch": 0.05727499110636784, + "grad_norm": 2.5288196179800493, + "learning_rate": 2.3863636363636365e-05, + "loss": 1.331, + "step": 483 + }, + { + "epoch": 0.05739357286849282, + "grad_norm": 2.6499928563696824, + "learning_rate": 2.391304347826087e-05, + "loss": 1.1671, + "step": 484 + }, + { + "epoch": 0.05751215463061781, + "grad_norm": 2.3145140816381433, + "learning_rate": 2.3962450592885376e-05, + "loss": 1.3676, + "step": 485 + }, + { + "epoch": 0.0576307363927428, + "grad_norm": 2.4904999874462903, + "learning_rate": 2.4011857707509882e-05, + "loss": 1.3468, + "step": 486 + }, + { + "epoch": 0.05774931815486778, + "grad_norm": 2.2397738579272843, + "learning_rate": 2.4061264822134388e-05, + "loss": 1.2489, + "step": 487 + }, + { + "epoch": 0.05786789991699277, + "grad_norm": 2.3628444614984994, + "learning_rate": 2.4110671936758893e-05, + "loss": 1.2156, + "step": 488 + }, + { + "epoch": 0.057986481679117755, + "grad_norm": 2.469237961456168, + "learning_rate": 2.41600790513834e-05, + "loss": 1.1843, + "step": 489 + }, + { + "epoch": 0.058105063441242735, + "grad_norm": 2.515446717209009, + "learning_rate": 2.4209486166007905e-05, + "loss": 1.2417, + "step": 490 + }, + { + "epoch": 0.05822364520336772, + "grad_norm": 2.7227544170275872, + "learning_rate": 2.425889328063241e-05, + "loss": 1.4287, + "step": 491 + }, + { + "epoch": 0.05834222696549271, + "grad_norm": 2.4335756707662086, + "learning_rate": 2.430830039525692e-05, + "loss": 1.2592, + "step": 492 + }, + { + "epoch": 0.05846080872761769, + "grad_norm": 2.548147958053396, + "learning_rate": 2.4357707509881425e-05, + "loss": 1.2616, + "step": 493 + }, + { + "epoch": 0.05857939048974268, + "grad_norm": 2.252372966105802, + "learning_rate": 2.440711462450593e-05, + "loss": 1.4302, + "step": 494 + }, + { + "epoch": 0.05869797225186766, + "grad_norm": 2.4237053246414058, + "learning_rate": 2.4456521739130436e-05, + "loss": 1.284, + "step": 495 + }, + { + "epoch": 0.05881655401399265, + "grad_norm": 2.4890989881758285, + "learning_rate": 2.450592885375494e-05, + "loss": 1.326, + "step": 496 + }, + { + "epoch": 0.058935135776117635, + "grad_norm": 2.419802188364975, + "learning_rate": 2.4555335968379447e-05, + "loss": 1.4267, + "step": 497 + }, + { + "epoch": 0.059053717538242616, + "grad_norm": 2.407643603680517, + "learning_rate": 2.4604743083003953e-05, + "loss": 1.5514, + "step": 498 + }, + { + "epoch": 0.0591722993003676, + "grad_norm": 3.1014669788604237, + "learning_rate": 2.465415019762846e-05, + "loss": 1.1942, + "step": 499 + }, + { + "epoch": 0.05929088106249259, + "grad_norm": 2.4503206694521635, + "learning_rate": 2.4703557312252964e-05, + "loss": 1.1499, + "step": 500 + }, + { + "epoch": 0.05940946282461757, + "grad_norm": 2.3932687139334203, + "learning_rate": 2.475296442687747e-05, + "loss": 1.1471, + "step": 501 + }, + { + "epoch": 0.05952804458674256, + "grad_norm": 2.277081619942885, + "learning_rate": 2.480237154150198e-05, + "loss": 1.2769, + "step": 502 + }, + { + "epoch": 0.05964662634886755, + "grad_norm": 2.655722352609263, + "learning_rate": 2.4851778656126484e-05, + "loss": 1.116, + "step": 503 + }, + { + "epoch": 0.05976520811099253, + "grad_norm": 2.264160588330383, + "learning_rate": 2.490118577075099e-05, + "loss": 1.3582, + "step": 504 + }, + { + "epoch": 0.059883789873117516, + "grad_norm": 2.993580899931676, + "learning_rate": 2.4950592885375496e-05, + "loss": 1.2383, + "step": 505 + }, + { + "epoch": 0.060002371635242496, + "grad_norm": 2.1615567728250693, + "learning_rate": 2.5e-05, + "loss": 1.3259, + "step": 506 + }, + { + "epoch": 0.060120953397367484, + "grad_norm": 2.4218557620019143, + "learning_rate": 2.5049407114624507e-05, + "loss": 1.2555, + "step": 507 + }, + { + "epoch": 0.06023953515949247, + "grad_norm": 2.601143530731072, + "learning_rate": 2.5098814229249012e-05, + "loss": 1.0902, + "step": 508 + }, + { + "epoch": 0.06035811692161745, + "grad_norm": 2.4329648100540986, + "learning_rate": 2.5148221343873518e-05, + "loss": 1.4724, + "step": 509 + }, + { + "epoch": 0.06047669868374244, + "grad_norm": 2.299271869439856, + "learning_rate": 2.5197628458498024e-05, + "loss": 1.3814, + "step": 510 + }, + { + "epoch": 0.06059528044586743, + "grad_norm": 2.4631391845178703, + "learning_rate": 2.524703557312253e-05, + "loss": 1.3259, + "step": 511 + }, + { + "epoch": 0.06071386220799241, + "grad_norm": 2.394392906763372, + "learning_rate": 2.5296442687747035e-05, + "loss": 1.2841, + "step": 512 + }, + { + "epoch": 0.060832443970117396, + "grad_norm": 2.340144421472407, + "learning_rate": 2.534584980237154e-05, + "loss": 1.1505, + "step": 513 + }, + { + "epoch": 0.060951025732242384, + "grad_norm": 2.3297309759024047, + "learning_rate": 2.5395256916996046e-05, + "loss": 1.2812, + "step": 514 + }, + { + "epoch": 0.061069607494367364, + "grad_norm": 2.270780261398767, + "learning_rate": 2.5444664031620552e-05, + "loss": 1.1226, + "step": 515 + }, + { + "epoch": 0.06118818925649235, + "grad_norm": 2.2020257546475297, + "learning_rate": 2.5494071146245064e-05, + "loss": 1.1403, + "step": 516 + }, + { + "epoch": 0.06130677101861734, + "grad_norm": 2.441677977405497, + "learning_rate": 2.554347826086957e-05, + "loss": 1.1804, + "step": 517 + }, + { + "epoch": 0.06142535278074232, + "grad_norm": 2.4882519208622336, + "learning_rate": 2.5592885375494075e-05, + "loss": 1.1411, + "step": 518 + }, + { + "epoch": 0.06154393454286731, + "grad_norm": 2.0790833616196047, + "learning_rate": 2.564229249011858e-05, + "loss": 1.3033, + "step": 519 + }, + { + "epoch": 0.06166251630499229, + "grad_norm": 2.0819502918020913, + "learning_rate": 2.5691699604743087e-05, + "loss": 1.273, + "step": 520 + }, + { + "epoch": 0.06178109806711728, + "grad_norm": 2.2168657280508373, + "learning_rate": 2.5741106719367592e-05, + "loss": 1.2951, + "step": 521 + }, + { + "epoch": 0.061899679829242264, + "grad_norm": 2.4205586808550716, + "learning_rate": 2.5790513833992098e-05, + "loss": 1.3411, + "step": 522 + }, + { + "epoch": 0.062018261591367245, + "grad_norm": 2.0972680213728694, + "learning_rate": 2.5839920948616603e-05, + "loss": 1.2263, + "step": 523 + }, + { + "epoch": 0.06213684335349223, + "grad_norm": 2.365153783525565, + "learning_rate": 2.588932806324111e-05, + "loss": 1.0177, + "step": 524 + }, + { + "epoch": 0.06225542511561722, + "grad_norm": 2.052962479136888, + "learning_rate": 2.5938735177865615e-05, + "loss": 1.3229, + "step": 525 + }, + { + "epoch": 0.0623740068777422, + "grad_norm": 2.561629781307026, + "learning_rate": 2.598814229249012e-05, + "loss": 1.4536, + "step": 526 + }, + { + "epoch": 0.06249258863986719, + "grad_norm": 2.2803189711579734, + "learning_rate": 2.6037549407114626e-05, + "loss": 1.4335, + "step": 527 + }, + { + "epoch": 0.06261117040199217, + "grad_norm": 2.347795278790725, + "learning_rate": 2.608695652173913e-05, + "loss": 1.5287, + "step": 528 + }, + { + "epoch": 0.06272975216411716, + "grad_norm": 2.138694960674397, + "learning_rate": 2.6136363636363637e-05, + "loss": 1.54, + "step": 529 + }, + { + "epoch": 0.06284833392624214, + "grad_norm": 2.2938027333713453, + "learning_rate": 2.6185770750988143e-05, + "loss": 1.2111, + "step": 530 + }, + { + "epoch": 0.06296691568836713, + "grad_norm": 2.4954802735517307, + "learning_rate": 2.623517786561265e-05, + "loss": 1.1311, + "step": 531 + }, + { + "epoch": 0.06308549745049212, + "grad_norm": 2.5123695054399477, + "learning_rate": 2.6284584980237154e-05, + "loss": 1.1936, + "step": 532 + }, + { + "epoch": 0.0632040792126171, + "grad_norm": 2.240803724579135, + "learning_rate": 2.633399209486166e-05, + "loss": 1.0283, + "step": 533 + }, + { + "epoch": 0.06332266097474208, + "grad_norm": 2.5397508254841075, + "learning_rate": 2.6383399209486165e-05, + "loss": 1.072, + "step": 534 + }, + { + "epoch": 0.06344124273686708, + "grad_norm": 2.1343889089665016, + "learning_rate": 2.643280632411067e-05, + "loss": 1.5775, + "step": 535 + }, + { + "epoch": 0.06355982449899206, + "grad_norm": 2.1018512408543115, + "learning_rate": 2.6482213438735183e-05, + "loss": 1.0445, + "step": 536 + }, + { + "epoch": 0.06367840626111704, + "grad_norm": 2.262576385937686, + "learning_rate": 2.653162055335969e-05, + "loss": 1.3332, + "step": 537 + }, + { + "epoch": 0.06379698802324203, + "grad_norm": 2.595962190337002, + "learning_rate": 2.6581027667984194e-05, + "loss": 1.2729, + "step": 538 + }, + { + "epoch": 0.06391556978536701, + "grad_norm": 2.3214648614472257, + "learning_rate": 2.66304347826087e-05, + "loss": 1.3461, + "step": 539 + }, + { + "epoch": 0.064034151547492, + "grad_norm": 2.33212808719731, + "learning_rate": 2.6679841897233206e-05, + "loss": 1.4264, + "step": 540 + }, + { + "epoch": 0.06415273330961697, + "grad_norm": 2.551232093868217, + "learning_rate": 2.672924901185771e-05, + "loss": 1.2352, + "step": 541 + }, + { + "epoch": 0.06427131507174197, + "grad_norm": 2.126277756022061, + "learning_rate": 2.6778656126482217e-05, + "loss": 1.3146, + "step": 542 + }, + { + "epoch": 0.06438989683386695, + "grad_norm": 2.6343895844534204, + "learning_rate": 2.6828063241106723e-05, + "loss": 1.2963, + "step": 543 + }, + { + "epoch": 0.06450847859599193, + "grad_norm": 3.1269497889316864, + "learning_rate": 2.6877470355731228e-05, + "loss": 1.4217, + "step": 544 + }, + { + "epoch": 0.06462706035811693, + "grad_norm": 2.209949180971256, + "learning_rate": 2.6926877470355734e-05, + "loss": 1.2183, + "step": 545 + }, + { + "epoch": 0.0647456421202419, + "grad_norm": 2.2784093704226214, + "learning_rate": 2.697628458498024e-05, + "loss": 1.2003, + "step": 546 + }, + { + "epoch": 0.06486422388236689, + "grad_norm": 2.0827375183550543, + "learning_rate": 2.7025691699604745e-05, + "loss": 1.1671, + "step": 547 + }, + { + "epoch": 0.06498280564449188, + "grad_norm": 2.0615295255055517, + "learning_rate": 2.707509881422925e-05, + "loss": 1.388, + "step": 548 + }, + { + "epoch": 0.06510138740661686, + "grad_norm": 2.158957186779704, + "learning_rate": 2.7124505928853756e-05, + "loss": 1.149, + "step": 549 + }, + { + "epoch": 0.06521996916874184, + "grad_norm": 2.3494861863352487, + "learning_rate": 2.7173913043478262e-05, + "loss": 1.3205, + "step": 550 + }, + { + "epoch": 0.06533855093086684, + "grad_norm": 2.338850055804069, + "learning_rate": 2.7223320158102767e-05, + "loss": 1.1631, + "step": 551 + }, + { + "epoch": 0.06545713269299182, + "grad_norm": 2.200781494549883, + "learning_rate": 2.7272727272727273e-05, + "loss": 1.338, + "step": 552 + }, + { + "epoch": 0.0655757144551168, + "grad_norm": 2.3502168701130386, + "learning_rate": 2.732213438735178e-05, + "loss": 1.2998, + "step": 553 + }, + { + "epoch": 0.0656942962172418, + "grad_norm": 2.4202855750037924, + "learning_rate": 2.7371541501976284e-05, + "loss": 1.2907, + "step": 554 + }, + { + "epoch": 0.06581287797936677, + "grad_norm": 2.3234784157195643, + "learning_rate": 2.7420948616600793e-05, + "loss": 1.278, + "step": 555 + }, + { + "epoch": 0.06593145974149175, + "grad_norm": 2.4446211192847582, + "learning_rate": 2.74703557312253e-05, + "loss": 1.3567, + "step": 556 + }, + { + "epoch": 0.06605004150361675, + "grad_norm": 2.2866458271720886, + "learning_rate": 2.7519762845849805e-05, + "loss": 0.9767, + "step": 557 + }, + { + "epoch": 0.06616862326574173, + "grad_norm": 2.609262746765853, + "learning_rate": 2.756916996047431e-05, + "loss": 1.443, + "step": 558 + }, + { + "epoch": 0.06628720502786671, + "grad_norm": 2.3740859707682826, + "learning_rate": 2.7618577075098816e-05, + "loss": 1.4475, + "step": 559 + }, + { + "epoch": 0.0664057867899917, + "grad_norm": 2.5762621742943557, + "learning_rate": 2.766798418972332e-05, + "loss": 1.3917, + "step": 560 + }, + { + "epoch": 0.06652436855211669, + "grad_norm": 2.1722378179101027, + "learning_rate": 2.7717391304347827e-05, + "loss": 1.0666, + "step": 561 + }, + { + "epoch": 0.06664295031424167, + "grad_norm": 2.2609664887850993, + "learning_rate": 2.7766798418972333e-05, + "loss": 1.332, + "step": 562 + }, + { + "epoch": 0.06676153207636666, + "grad_norm": 2.190577586592334, + "learning_rate": 2.7816205533596838e-05, + "loss": 1.165, + "step": 563 + }, + { + "epoch": 0.06688011383849164, + "grad_norm": 2.3310695181091603, + "learning_rate": 2.7865612648221344e-05, + "loss": 1.296, + "step": 564 + }, + { + "epoch": 0.06699869560061662, + "grad_norm": 2.913174220423938, + "learning_rate": 2.791501976284585e-05, + "loss": 1.3252, + "step": 565 + }, + { + "epoch": 0.0671172773627416, + "grad_norm": 2.3723399662015074, + "learning_rate": 2.7964426877470355e-05, + "loss": 1.1765, + "step": 566 + }, + { + "epoch": 0.0672358591248666, + "grad_norm": 2.2321120851721448, + "learning_rate": 2.801383399209486e-05, + "loss": 1.2669, + "step": 567 + }, + { + "epoch": 0.06735444088699158, + "grad_norm": 2.348018589720416, + "learning_rate": 2.8063241106719366e-05, + "loss": 1.3448, + "step": 568 + }, + { + "epoch": 0.06747302264911656, + "grad_norm": 2.2628298531037547, + "learning_rate": 2.8112648221343872e-05, + "loss": 1.175, + "step": 569 + }, + { + "epoch": 0.06759160441124155, + "grad_norm": 2.9745657121416373, + "learning_rate": 2.8162055335968378e-05, + "loss": 1.4164, + "step": 570 + }, + { + "epoch": 0.06771018617336654, + "grad_norm": 2.1799016155552278, + "learning_rate": 2.8211462450592883e-05, + "loss": 1.3606, + "step": 571 + }, + { + "epoch": 0.06782876793549152, + "grad_norm": 2.295860594362138, + "learning_rate": 2.826086956521739e-05, + "loss": 1.109, + "step": 572 + }, + { + "epoch": 0.06794734969761651, + "grad_norm": 2.3734313084286085, + "learning_rate": 2.8310276679841894e-05, + "loss": 1.1759, + "step": 573 + }, + { + "epoch": 0.06806593145974149, + "grad_norm": 2.550689100853419, + "learning_rate": 2.8359683794466403e-05, + "loss": 1.2211, + "step": 574 + }, + { + "epoch": 0.06818451322186647, + "grad_norm": 2.2278869913563395, + "learning_rate": 2.8409090909090912e-05, + "loss": 1.1187, + "step": 575 + }, + { + "epoch": 0.06830309498399147, + "grad_norm": 2.1600829534343617, + "learning_rate": 2.8458498023715418e-05, + "loss": 1.047, + "step": 576 + }, + { + "epoch": 0.06842167674611645, + "grad_norm": 2.2318769989219014, + "learning_rate": 2.8507905138339924e-05, + "loss": 1.1978, + "step": 577 + }, + { + "epoch": 0.06854025850824143, + "grad_norm": 2.2365189898352846, + "learning_rate": 2.855731225296443e-05, + "loss": 1.0909, + "step": 578 + }, + { + "epoch": 0.06865884027036642, + "grad_norm": 2.386323796617713, + "learning_rate": 2.8606719367588935e-05, + "loss": 1.1331, + "step": 579 + }, + { + "epoch": 0.0687774220324914, + "grad_norm": 2.3502834224507154, + "learning_rate": 2.865612648221344e-05, + "loss": 1.4039, + "step": 580 + }, + { + "epoch": 0.06889600379461638, + "grad_norm": 1.9468222794647423, + "learning_rate": 2.8705533596837946e-05, + "loss": 1.2398, + "step": 581 + }, + { + "epoch": 0.06901458555674138, + "grad_norm": 2.1965793446547215, + "learning_rate": 2.8754940711462452e-05, + "loss": 1.1226, + "step": 582 + }, + { + "epoch": 0.06913316731886636, + "grad_norm": 2.1468281290600175, + "learning_rate": 2.8804347826086957e-05, + "loss": 1.2343, + "step": 583 + }, + { + "epoch": 0.06925174908099134, + "grad_norm": 2.1819282632068484, + "learning_rate": 2.8853754940711463e-05, + "loss": 1.0693, + "step": 584 + }, + { + "epoch": 0.06937033084311633, + "grad_norm": 2.5568229728407696, + "learning_rate": 2.890316205533597e-05, + "loss": 1.2834, + "step": 585 + }, + { + "epoch": 0.06948891260524132, + "grad_norm": 2.2426630336916493, + "learning_rate": 2.8952569169960474e-05, + "loss": 1.3073, + "step": 586 + }, + { + "epoch": 0.0696074943673663, + "grad_norm": 2.184080807208477, + "learning_rate": 2.900197628458498e-05, + "loss": 1.3569, + "step": 587 + }, + { + "epoch": 0.06972607612949129, + "grad_norm": 2.153072527328173, + "learning_rate": 2.9051383399209485e-05, + "loss": 1.316, + "step": 588 + }, + { + "epoch": 0.06984465789161627, + "grad_norm": 2.20119147513914, + "learning_rate": 2.910079051383399e-05, + "loss": 1.353, + "step": 589 + }, + { + "epoch": 0.06996323965374125, + "grad_norm": 2.1995812868490416, + "learning_rate": 2.9150197628458497e-05, + "loss": 0.8626, + "step": 590 + }, + { + "epoch": 0.07008182141586625, + "grad_norm": 2.63430958551922, + "learning_rate": 2.9199604743083002e-05, + "loss": 1.1934, + "step": 591 + }, + { + "epoch": 0.07020040317799123, + "grad_norm": 2.1543968578695405, + "learning_rate": 2.9249011857707508e-05, + "loss": 0.9758, + "step": 592 + }, + { + "epoch": 0.07031898494011621, + "grad_norm": 2.577090617768706, + "learning_rate": 2.9298418972332014e-05, + "loss": 1.2178, + "step": 593 + }, + { + "epoch": 0.07043756670224119, + "grad_norm": 2.1960636756491163, + "learning_rate": 2.9347826086956526e-05, + "loss": 1.3568, + "step": 594 + }, + { + "epoch": 0.07055614846436618, + "grad_norm": 2.5635171843913875, + "learning_rate": 2.939723320158103e-05, + "loss": 1.162, + "step": 595 + }, + { + "epoch": 0.07067473022649116, + "grad_norm": 2.258402073980407, + "learning_rate": 2.9446640316205537e-05, + "loss": 1.1793, + "step": 596 + }, + { + "epoch": 0.07079331198861614, + "grad_norm": 2.339126866973791, + "learning_rate": 2.9496047430830043e-05, + "loss": 1.3893, + "step": 597 + }, + { + "epoch": 0.07091189375074114, + "grad_norm": 2.3165493505539008, + "learning_rate": 2.954545454545455e-05, + "loss": 1.1491, + "step": 598 + }, + { + "epoch": 0.07103047551286612, + "grad_norm": 2.080024361942203, + "learning_rate": 2.9594861660079054e-05, + "loss": 0.832, + "step": 599 + }, + { + "epoch": 0.0711490572749911, + "grad_norm": 2.059472479536438, + "learning_rate": 2.964426877470356e-05, + "loss": 1.2426, + "step": 600 + }, + { + "epoch": 0.0712676390371161, + "grad_norm": 2.5679202315236536, + "learning_rate": 2.9693675889328065e-05, + "loss": 1.2791, + "step": 601 + }, + { + "epoch": 0.07138622079924108, + "grad_norm": 1.9662487086267828, + "learning_rate": 2.974308300395257e-05, + "loss": 1.2761, + "step": 602 + }, + { + "epoch": 0.07150480256136606, + "grad_norm": 2.0401467865885805, + "learning_rate": 2.9792490118577076e-05, + "loss": 1.347, + "step": 603 + }, + { + "epoch": 0.07162338432349105, + "grad_norm": 2.1185551056585896, + "learning_rate": 2.9841897233201582e-05, + "loss": 0.6096, + "step": 604 + }, + { + "epoch": 0.07174196608561603, + "grad_norm": 2.241714332501538, + "learning_rate": 2.9891304347826088e-05, + "loss": 1.4908, + "step": 605 + }, + { + "epoch": 0.07186054784774101, + "grad_norm": 2.4785543126672924, + "learning_rate": 2.9940711462450593e-05, + "loss": 1.336, + "step": 606 + }, + { + "epoch": 0.07197912960986601, + "grad_norm": 2.3094742794243146, + "learning_rate": 2.99901185770751e-05, + "loss": 1.3999, + "step": 607 + }, + { + "epoch": 0.07209771137199099, + "grad_norm": 2.2997951375743213, + "learning_rate": 3.0039525691699605e-05, + "loss": 1.4017, + "step": 608 + }, + { + "epoch": 0.07221629313411597, + "grad_norm": 2.4009311508809272, + "learning_rate": 3.008893280632411e-05, + "loss": 1.4592, + "step": 609 + }, + { + "epoch": 0.07233487489624096, + "grad_norm": 2.1131382544040815, + "learning_rate": 3.0138339920948616e-05, + "loss": 1.2334, + "step": 610 + }, + { + "epoch": 0.07245345665836594, + "grad_norm": 2.2017485441669047, + "learning_rate": 3.018774703557312e-05, + "loss": 1.2462, + "step": 611 + }, + { + "epoch": 0.07257203842049093, + "grad_norm": 2.0918687069268267, + "learning_rate": 3.0237154150197627e-05, + "loss": 1.3212, + "step": 612 + }, + { + "epoch": 0.07269062018261592, + "grad_norm": 2.2171748388548305, + "learning_rate": 3.0286561264822133e-05, + "loss": 1.0972, + "step": 613 + }, + { + "epoch": 0.0728092019447409, + "grad_norm": 2.185411100980378, + "learning_rate": 3.0335968379446645e-05, + "loss": 1.2474, + "step": 614 + }, + { + "epoch": 0.07292778370686588, + "grad_norm": 2.112212795996242, + "learning_rate": 3.038537549407115e-05, + "loss": 1.4223, + "step": 615 + }, + { + "epoch": 0.07304636546899088, + "grad_norm": 2.4900141329748506, + "learning_rate": 3.0434782608695656e-05, + "loss": 1.1955, + "step": 616 + }, + { + "epoch": 0.07316494723111586, + "grad_norm": 2.1863466934201585, + "learning_rate": 3.0484189723320162e-05, + "loss": 1.1771, + "step": 617 + }, + { + "epoch": 0.07328352899324084, + "grad_norm": 2.055606299634549, + "learning_rate": 3.053359683794467e-05, + "loss": 1.1907, + "step": 618 + }, + { + "epoch": 0.07340211075536582, + "grad_norm": 2.351500214968349, + "learning_rate": 3.058300395256917e-05, + "loss": 1.2593, + "step": 619 + }, + { + "epoch": 0.07352069251749081, + "grad_norm": 2.1343037304166397, + "learning_rate": 3.063241106719368e-05, + "loss": 1.3491, + "step": 620 + }, + { + "epoch": 0.0736392742796158, + "grad_norm": 2.178001910206558, + "learning_rate": 3.068181818181818e-05, + "loss": 1.0337, + "step": 621 + }, + { + "epoch": 0.07375785604174077, + "grad_norm": 2.513016764974934, + "learning_rate": 3.073122529644269e-05, + "loss": 1.3247, + "step": 622 + }, + { + "epoch": 0.07387643780386577, + "grad_norm": 2.430546202910741, + "learning_rate": 3.078063241106719e-05, + "loss": 1.2511, + "step": 623 + }, + { + "epoch": 0.07399501956599075, + "grad_norm": 2.2610743212604643, + "learning_rate": 3.08300395256917e-05, + "loss": 1.0712, + "step": 624 + }, + { + "epoch": 0.07411360132811573, + "grad_norm": 2.086946857646669, + "learning_rate": 3.0879446640316203e-05, + "loss": 1.4171, + "step": 625 + }, + { + "epoch": 0.07423218309024072, + "grad_norm": 2.4141481186267706, + "learning_rate": 3.092885375494071e-05, + "loss": 1.2962, + "step": 626 + }, + { + "epoch": 0.0743507648523657, + "grad_norm": 2.020708114178066, + "learning_rate": 3.0978260869565215e-05, + "loss": 1.2542, + "step": 627 + }, + { + "epoch": 0.07446934661449069, + "grad_norm": 2.1868188417972365, + "learning_rate": 3.1027667984189724e-05, + "loss": 1.1663, + "step": 628 + }, + { + "epoch": 0.07458792837661568, + "grad_norm": 2.2765309833984024, + "learning_rate": 3.1077075098814226e-05, + "loss": 1.4439, + "step": 629 + }, + { + "epoch": 0.07470651013874066, + "grad_norm": 2.1002806417238076, + "learning_rate": 3.1126482213438735e-05, + "loss": 1.2488, + "step": 630 + }, + { + "epoch": 0.07482509190086564, + "grad_norm": 2.4328411398703973, + "learning_rate": 3.117588932806324e-05, + "loss": 1.1749, + "step": 631 + }, + { + "epoch": 0.07494367366299064, + "grad_norm": 2.1633496603266926, + "learning_rate": 3.1225296442687746e-05, + "loss": 1.3822, + "step": 632 + }, + { + "epoch": 0.07506225542511562, + "grad_norm": 2.394648861031172, + "learning_rate": 3.1274703557312255e-05, + "loss": 1.1556, + "step": 633 + }, + { + "epoch": 0.0751808371872406, + "grad_norm": 2.462460018255017, + "learning_rate": 3.1324110671936764e-05, + "loss": 1.2045, + "step": 634 + }, + { + "epoch": 0.07529941894936559, + "grad_norm": 2.1592438434065846, + "learning_rate": 3.1373517786561266e-05, + "loss": 1.295, + "step": 635 + }, + { + "epoch": 0.07541800071149057, + "grad_norm": 2.2949609954914205, + "learning_rate": 3.1422924901185775e-05, + "loss": 0.8925, + "step": 636 + }, + { + "epoch": 0.07553658247361555, + "grad_norm": 2.2454320674029637, + "learning_rate": 3.147233201581028e-05, + "loss": 1.3552, + "step": 637 + }, + { + "epoch": 0.07565516423574055, + "grad_norm": 2.2503175616551663, + "learning_rate": 3.152173913043479e-05, + "loss": 1.3242, + "step": 638 + }, + { + "epoch": 0.07577374599786553, + "grad_norm": 1.9513395050231432, + "learning_rate": 3.157114624505929e-05, + "loss": 1.2112, + "step": 639 + }, + { + "epoch": 0.07589232775999051, + "grad_norm": 2.493416694991229, + "learning_rate": 3.16205533596838e-05, + "loss": 1.1913, + "step": 640 + }, + { + "epoch": 0.0760109095221155, + "grad_norm": 2.403724854397969, + "learning_rate": 3.16699604743083e-05, + "loss": 1.4156, + "step": 641 + }, + { + "epoch": 0.07612949128424049, + "grad_norm": 2.5461074275512305, + "learning_rate": 3.171936758893281e-05, + "loss": 1.2197, + "step": 642 + }, + { + "epoch": 0.07624807304636547, + "grad_norm": 2.3293402994235373, + "learning_rate": 3.176877470355731e-05, + "loss": 1.1746, + "step": 643 + }, + { + "epoch": 0.07636665480849045, + "grad_norm": 2.0781989564578662, + "learning_rate": 3.181818181818182e-05, + "loss": 1.3666, + "step": 644 + }, + { + "epoch": 0.07648523657061544, + "grad_norm": 2.2313907677331697, + "learning_rate": 3.186758893280632e-05, + "loss": 1.5331, + "step": 645 + }, + { + "epoch": 0.07660381833274042, + "grad_norm": 2.3431465246335086, + "learning_rate": 3.191699604743083e-05, + "loss": 1.284, + "step": 646 + }, + { + "epoch": 0.0767224000948654, + "grad_norm": 2.1346601541832686, + "learning_rate": 3.1966403162055334e-05, + "loss": 1.0286, + "step": 647 + }, + { + "epoch": 0.0768409818569904, + "grad_norm": 2.155180433637423, + "learning_rate": 3.201581027667984e-05, + "loss": 1.2454, + "step": 648 + }, + { + "epoch": 0.07695956361911538, + "grad_norm": 1.8240945560814488, + "learning_rate": 3.2065217391304345e-05, + "loss": 1.3571, + "step": 649 + }, + { + "epoch": 0.07707814538124036, + "grad_norm": 2.686597292158509, + "learning_rate": 3.2114624505928854e-05, + "loss": 1.0689, + "step": 650 + }, + { + "epoch": 0.07719672714336535, + "grad_norm": 2.131642033188979, + "learning_rate": 3.2164031620553356e-05, + "loss": 1.2287, + "step": 651 + }, + { + "epoch": 0.07731530890549033, + "grad_norm": 2.047945977270479, + "learning_rate": 3.221343873517787e-05, + "loss": 1.2977, + "step": 652 + }, + { + "epoch": 0.07743389066761532, + "grad_norm": 2.0922079462528087, + "learning_rate": 3.2262845849802374e-05, + "loss": 1.3369, + "step": 653 + }, + { + "epoch": 0.07755247242974031, + "grad_norm": 2.0071577271544836, + "learning_rate": 3.231225296442688e-05, + "loss": 0.9622, + "step": 654 + }, + { + "epoch": 0.07767105419186529, + "grad_norm": 1.8106085019867801, + "learning_rate": 3.2361660079051385e-05, + "loss": 1.3767, + "step": 655 + }, + { + "epoch": 0.07778963595399027, + "grad_norm": 2.024519122811612, + "learning_rate": 3.2411067193675894e-05, + "loss": 1.1097, + "step": 656 + }, + { + "epoch": 0.07790821771611527, + "grad_norm": 2.2896010776180638, + "learning_rate": 3.24604743083004e-05, + "loss": 1.1981, + "step": 657 + }, + { + "epoch": 0.07802679947824025, + "grad_norm": 2.4414534453185883, + "learning_rate": 3.2509881422924906e-05, + "loss": 1.1189, + "step": 658 + }, + { + "epoch": 0.07814538124036523, + "grad_norm": 2.493767082887729, + "learning_rate": 3.255928853754941e-05, + "loss": 1.1435, + "step": 659 + }, + { + "epoch": 0.07826396300249022, + "grad_norm": 2.3659242083809526, + "learning_rate": 3.260869565217392e-05, + "loss": 1.3759, + "step": 660 + }, + { + "epoch": 0.0783825447646152, + "grad_norm": 2.0617954455206298, + "learning_rate": 3.265810276679842e-05, + "loss": 1.2278, + "step": 661 + }, + { + "epoch": 0.07850112652674018, + "grad_norm": 2.3859410364101024, + "learning_rate": 3.270750988142293e-05, + "loss": 1.0266, + "step": 662 + }, + { + "epoch": 0.07861970828886518, + "grad_norm": 2.453665435105549, + "learning_rate": 3.275691699604743e-05, + "loss": 1.3979, + "step": 663 + }, + { + "epoch": 0.07873829005099016, + "grad_norm": 2.40411250015987, + "learning_rate": 3.280632411067194e-05, + "loss": 1.1569, + "step": 664 + }, + { + "epoch": 0.07885687181311514, + "grad_norm": 2.5377065367314877, + "learning_rate": 3.285573122529644e-05, + "loss": 1.208, + "step": 665 + }, + { + "epoch": 0.07897545357524013, + "grad_norm": 2.1587067174195322, + "learning_rate": 3.290513833992095e-05, + "loss": 1.1847, + "step": 666 + }, + { + "epoch": 0.07909403533736512, + "grad_norm": 2.0266377451748063, + "learning_rate": 3.295454545454545e-05, + "loss": 1.1628, + "step": 667 + }, + { + "epoch": 0.0792126170994901, + "grad_norm": 1.9103375000981677, + "learning_rate": 3.300395256916996e-05, + "loss": 1.1041, + "step": 668 + }, + { + "epoch": 0.07933119886161509, + "grad_norm": 2.118907982214127, + "learning_rate": 3.3053359683794464e-05, + "loss": 1.4679, + "step": 669 + }, + { + "epoch": 0.07944978062374007, + "grad_norm": 2.374562165583004, + "learning_rate": 3.310276679841897e-05, + "loss": 1.3944, + "step": 670 + }, + { + "epoch": 0.07956836238586505, + "grad_norm": 2.017225265432069, + "learning_rate": 3.3152173913043475e-05, + "loss": 1.1328, + "step": 671 + }, + { + "epoch": 0.07968694414799003, + "grad_norm": 2.2327847507495133, + "learning_rate": 3.320158102766799e-05, + "loss": 1.1816, + "step": 672 + }, + { + "epoch": 0.07980552591011503, + "grad_norm": 2.334767423555494, + "learning_rate": 3.325098814229249e-05, + "loss": 1.32, + "step": 673 + }, + { + "epoch": 0.07992410767224001, + "grad_norm": 2.095355611749733, + "learning_rate": 3.3300395256917e-05, + "loss": 1.0858, + "step": 674 + }, + { + "epoch": 0.08004268943436499, + "grad_norm": 2.633494345017161, + "learning_rate": 3.3349802371541505e-05, + "loss": 1.2824, + "step": 675 + }, + { + "epoch": 0.08016127119648998, + "grad_norm": 2.1373584515263873, + "learning_rate": 3.3399209486166014e-05, + "loss": 1.2237, + "step": 676 + }, + { + "epoch": 0.08027985295861496, + "grad_norm": 2.343213875301192, + "learning_rate": 3.3448616600790516e-05, + "loss": 1.0752, + "step": 677 + }, + { + "epoch": 0.08039843472073994, + "grad_norm": 2.115296502138324, + "learning_rate": 3.3498023715415025e-05, + "loss": 1.3571, + "step": 678 + }, + { + "epoch": 0.08051701648286494, + "grad_norm": 2.1214780677244547, + "learning_rate": 3.354743083003953e-05, + "loss": 1.0668, + "step": 679 + }, + { + "epoch": 0.08063559824498992, + "grad_norm": 2.1907827608732555, + "learning_rate": 3.3596837944664036e-05, + "loss": 1.269, + "step": 680 + }, + { + "epoch": 0.0807541800071149, + "grad_norm": 1.9420783532672319, + "learning_rate": 3.364624505928854e-05, + "loss": 1.1561, + "step": 681 + }, + { + "epoch": 0.0808727617692399, + "grad_norm": 2.2031239730886836, + "learning_rate": 3.369565217391305e-05, + "loss": 1.5454, + "step": 682 + }, + { + "epoch": 0.08099134353136488, + "grad_norm": 2.2594219734341245, + "learning_rate": 3.374505928853755e-05, + "loss": 1.2732, + "step": 683 + }, + { + "epoch": 0.08110992529348986, + "grad_norm": 2.1544634619454537, + "learning_rate": 3.379446640316206e-05, + "loss": 1.4987, + "step": 684 + }, + { + "epoch": 0.08122850705561485, + "grad_norm": 2.6484524379602132, + "learning_rate": 3.384387351778656e-05, + "loss": 1.188, + "step": 685 + }, + { + "epoch": 0.08134708881773983, + "grad_norm": 1.8930749999761514, + "learning_rate": 3.389328063241107e-05, + "loss": 1.0723, + "step": 686 + }, + { + "epoch": 0.08146567057986481, + "grad_norm": 2.1041458788958898, + "learning_rate": 3.394268774703557e-05, + "loss": 1.292, + "step": 687 + }, + { + "epoch": 0.08158425234198981, + "grad_norm": 2.3393629881270877, + "learning_rate": 3.399209486166008e-05, + "loss": 1.2795, + "step": 688 + }, + { + "epoch": 0.08170283410411479, + "grad_norm": 2.246018686556954, + "learning_rate": 3.404150197628458e-05, + "loss": 1.2298, + "step": 689 + }, + { + "epoch": 0.08182141586623977, + "grad_norm": 2.0415236458011465, + "learning_rate": 3.409090909090909e-05, + "loss": 1.3815, + "step": 690 + }, + { + "epoch": 0.08193999762836476, + "grad_norm": 2.0050226172145584, + "learning_rate": 3.41403162055336e-05, + "loss": 0.827, + "step": 691 + }, + { + "epoch": 0.08205857939048974, + "grad_norm": 2.0528823131532357, + "learning_rate": 3.418972332015811e-05, + "loss": 1.2963, + "step": 692 + }, + { + "epoch": 0.08217716115261472, + "grad_norm": 2.0532336654751235, + "learning_rate": 3.423913043478261e-05, + "loss": 1.4726, + "step": 693 + }, + { + "epoch": 0.08229574291473972, + "grad_norm": 2.013861324264989, + "learning_rate": 3.428853754940712e-05, + "loss": 0.9586, + "step": 694 + }, + { + "epoch": 0.0824143246768647, + "grad_norm": 2.1109504634949308, + "learning_rate": 3.4337944664031624e-05, + "loss": 1.3651, + "step": 695 + }, + { + "epoch": 0.08253290643898968, + "grad_norm": 2.355524109905185, + "learning_rate": 3.438735177865613e-05, + "loss": 1.3987, + "step": 696 + }, + { + "epoch": 0.08265148820111466, + "grad_norm": 2.005912665626445, + "learning_rate": 3.4436758893280635e-05, + "loss": 1.1214, + "step": 697 + }, + { + "epoch": 0.08277006996323966, + "grad_norm": 1.8824213311759646, + "learning_rate": 3.4486166007905144e-05, + "loss": 1.2458, + "step": 698 + }, + { + "epoch": 0.08288865172536464, + "grad_norm": 2.3442890017539333, + "learning_rate": 3.4535573122529646e-05, + "loss": 1.5919, + "step": 699 + }, + { + "epoch": 0.08300723348748962, + "grad_norm": 2.0363567825099587, + "learning_rate": 3.4584980237154155e-05, + "loss": 1.2358, + "step": 700 + }, + { + "epoch": 0.08312581524961461, + "grad_norm": 1.908902725924768, + "learning_rate": 3.463438735177866e-05, + "loss": 1.3315, + "step": 701 + }, + { + "epoch": 0.0832443970117396, + "grad_norm": 1.937704617549985, + "learning_rate": 3.4683794466403166e-05, + "loss": 1.2119, + "step": 702 + }, + { + "epoch": 0.08336297877386457, + "grad_norm": 2.0252018575771142, + "learning_rate": 3.473320158102767e-05, + "loss": 1.1829, + "step": 703 + }, + { + "epoch": 0.08348156053598957, + "grad_norm": 2.200314530837662, + "learning_rate": 3.478260869565218e-05, + "loss": 1.4052, + "step": 704 + }, + { + "epoch": 0.08360014229811455, + "grad_norm": 1.8319711337049067, + "learning_rate": 3.483201581027668e-05, + "loss": 0.8856, + "step": 705 + }, + { + "epoch": 0.08371872406023953, + "grad_norm": 1.8916274781789295, + "learning_rate": 3.488142292490119e-05, + "loss": 1.401, + "step": 706 + }, + { + "epoch": 0.08383730582236452, + "grad_norm": 2.0049433218094683, + "learning_rate": 3.493083003952569e-05, + "loss": 1.333, + "step": 707 + }, + { + "epoch": 0.0839558875844895, + "grad_norm": 2.157756764032009, + "learning_rate": 3.49802371541502e-05, + "loss": 1.2478, + "step": 708 + }, + { + "epoch": 0.08407446934661449, + "grad_norm": 1.8445024831246304, + "learning_rate": 3.50296442687747e-05, + "loss": 1.1563, + "step": 709 + }, + { + "epoch": 0.08419305110873948, + "grad_norm": 2.0844870894129737, + "learning_rate": 3.507905138339921e-05, + "loss": 0.9649, + "step": 710 + }, + { + "epoch": 0.08431163287086446, + "grad_norm": 1.8975178582598577, + "learning_rate": 3.512845849802372e-05, + "loss": 1.0537, + "step": 711 + }, + { + "epoch": 0.08443021463298944, + "grad_norm": 1.9850209175482596, + "learning_rate": 3.517786561264822e-05, + "loss": 1.3354, + "step": 712 + }, + { + "epoch": 0.08454879639511444, + "grad_norm": 2.3859087686276848, + "learning_rate": 3.522727272727273e-05, + "loss": 1.3569, + "step": 713 + }, + { + "epoch": 0.08466737815723942, + "grad_norm": 2.0144627936459827, + "learning_rate": 3.5276679841897234e-05, + "loss": 1.4431, + "step": 714 + }, + { + "epoch": 0.0847859599193644, + "grad_norm": 2.2707148228503495, + "learning_rate": 3.532608695652174e-05, + "loss": 1.196, + "step": 715 + }, + { + "epoch": 0.08490454168148939, + "grad_norm": 1.995004379337582, + "learning_rate": 3.5375494071146245e-05, + "loss": 1.1068, + "step": 716 + }, + { + "epoch": 0.08502312344361437, + "grad_norm": 2.2688964225269035, + "learning_rate": 3.5424901185770754e-05, + "loss": 1.0812, + "step": 717 + }, + { + "epoch": 0.08514170520573935, + "grad_norm": 2.151511985849467, + "learning_rate": 3.5474308300395256e-05, + "loss": 1.524, + "step": 718 + }, + { + "epoch": 0.08526028696786435, + "grad_norm": 1.867339507248027, + "learning_rate": 3.5523715415019765e-05, + "loss": 1.0814, + "step": 719 + }, + { + "epoch": 0.08537886872998933, + "grad_norm": 1.950013348043195, + "learning_rate": 3.557312252964427e-05, + "loss": 1.2955, + "step": 720 + }, + { + "epoch": 0.08549745049211431, + "grad_norm": 2.1881143430762853, + "learning_rate": 3.5622529644268777e-05, + "loss": 1.0316, + "step": 721 + }, + { + "epoch": 0.0856160322542393, + "grad_norm": 2.074620560669965, + "learning_rate": 3.567193675889328e-05, + "loss": 1.4001, + "step": 722 + }, + { + "epoch": 0.08573461401636429, + "grad_norm": 2.2477059414773075, + "learning_rate": 3.572134387351779e-05, + "loss": 1.1642, + "step": 723 + }, + { + "epoch": 0.08585319577848927, + "grad_norm": 2.1598794607792127, + "learning_rate": 3.577075098814229e-05, + "loss": 1.1704, + "step": 724 + }, + { + "epoch": 0.08597177754061425, + "grad_norm": 2.0054288575265873, + "learning_rate": 3.58201581027668e-05, + "loss": 1.2874, + "step": 725 + }, + { + "epoch": 0.08609035930273924, + "grad_norm": 2.018201412986407, + "learning_rate": 3.58695652173913e-05, + "loss": 1.2215, + "step": 726 + }, + { + "epoch": 0.08620894106486422, + "grad_norm": 2.112750670485053, + "learning_rate": 3.591897233201581e-05, + "loss": 1.2149, + "step": 727 + }, + { + "epoch": 0.0863275228269892, + "grad_norm": 2.230391385910561, + "learning_rate": 3.596837944664031e-05, + "loss": 1.2555, + "step": 728 + }, + { + "epoch": 0.0864461045891142, + "grad_norm": 2.147636108655856, + "learning_rate": 3.601778656126482e-05, + "loss": 1.2996, + "step": 729 + }, + { + "epoch": 0.08656468635123918, + "grad_norm": 2.1142289499374405, + "learning_rate": 3.606719367588933e-05, + "loss": 1.2136, + "step": 730 + }, + { + "epoch": 0.08668326811336416, + "grad_norm": 2.148582425413884, + "learning_rate": 3.611660079051384e-05, + "loss": 1.1621, + "step": 731 + }, + { + "epoch": 0.08680184987548915, + "grad_norm": 1.9720848682342424, + "learning_rate": 3.616600790513834e-05, + "loss": 1.2137, + "step": 732 + }, + { + "epoch": 0.08692043163761413, + "grad_norm": 1.911307435494401, + "learning_rate": 3.621541501976285e-05, + "loss": 0.9674, + "step": 733 + }, + { + "epoch": 0.08703901339973912, + "grad_norm": 2.266432678082131, + "learning_rate": 3.626482213438735e-05, + "loss": 1.2808, + "step": 734 + }, + { + "epoch": 0.08715759516186411, + "grad_norm": 1.8934794972549702, + "learning_rate": 3.631422924901186e-05, + "loss": 1.2665, + "step": 735 + }, + { + "epoch": 0.08727617692398909, + "grad_norm": 1.7963249591338462, + "learning_rate": 3.6363636363636364e-05, + "loss": 1.0023, + "step": 736 + }, + { + "epoch": 0.08739475868611407, + "grad_norm": 1.9429790773681141, + "learning_rate": 3.641304347826087e-05, + "loss": 1.1188, + "step": 737 + }, + { + "epoch": 0.08751334044823907, + "grad_norm": 2.1208827102753394, + "learning_rate": 3.6462450592885375e-05, + "loss": 1.3529, + "step": 738 + }, + { + "epoch": 0.08763192221036405, + "grad_norm": 2.1491179885176637, + "learning_rate": 3.6511857707509884e-05, + "loss": 1.2563, + "step": 739 + }, + { + "epoch": 0.08775050397248903, + "grad_norm": 1.9682896204386802, + "learning_rate": 3.656126482213439e-05, + "loss": 1.1543, + "step": 740 + }, + { + "epoch": 0.08786908573461402, + "grad_norm": 2.6134864341662727, + "learning_rate": 3.6610671936758896e-05, + "loss": 1.3361, + "step": 741 + }, + { + "epoch": 0.087987667496739, + "grad_norm": 2.84463656121695, + "learning_rate": 3.66600790513834e-05, + "loss": 1.5155, + "step": 742 + }, + { + "epoch": 0.08810624925886398, + "grad_norm": 2.1573916437470193, + "learning_rate": 3.670948616600791e-05, + "loss": 1.3103, + "step": 743 + }, + { + "epoch": 0.08822483102098898, + "grad_norm": 1.9971857757828326, + "learning_rate": 3.675889328063241e-05, + "loss": 1.3001, + "step": 744 + }, + { + "epoch": 0.08834341278311396, + "grad_norm": 2.088134644103535, + "learning_rate": 3.680830039525692e-05, + "loss": 1.1391, + "step": 745 + }, + { + "epoch": 0.08846199454523894, + "grad_norm": 1.9048938409476335, + "learning_rate": 3.685770750988142e-05, + "loss": 1.2182, + "step": 746 + }, + { + "epoch": 0.08858057630736393, + "grad_norm": 2.140521370998425, + "learning_rate": 3.690711462450593e-05, + "loss": 1.0491, + "step": 747 + }, + { + "epoch": 0.08869915806948891, + "grad_norm": 1.7979602980447402, + "learning_rate": 3.695652173913043e-05, + "loss": 1.1915, + "step": 748 + }, + { + "epoch": 0.0888177398316139, + "grad_norm": 1.9023154713918125, + "learning_rate": 3.700592885375494e-05, + "loss": 1.173, + "step": 749 + }, + { + "epoch": 0.08893632159373888, + "grad_norm": 1.9874490983714652, + "learning_rate": 3.705533596837945e-05, + "loss": 1.3811, + "step": 750 + }, + { + "epoch": 0.08905490335586387, + "grad_norm": 2.066170615101303, + "learning_rate": 3.710474308300396e-05, + "loss": 1.3331, + "step": 751 + }, + { + "epoch": 0.08917348511798885, + "grad_norm": 2.106846147766279, + "learning_rate": 3.715415019762846e-05, + "loss": 1.1641, + "step": 752 + }, + { + "epoch": 0.08929206688011383, + "grad_norm": 2.010016021320213, + "learning_rate": 3.720355731225297e-05, + "loss": 1.2018, + "step": 753 + }, + { + "epoch": 0.08941064864223883, + "grad_norm": 2.269361084684674, + "learning_rate": 3.725296442687747e-05, + "loss": 1.2466, + "step": 754 + }, + { + "epoch": 0.08952923040436381, + "grad_norm": 2.200944492960789, + "learning_rate": 3.730237154150198e-05, + "loss": 1.3754, + "step": 755 + }, + { + "epoch": 0.08964781216648879, + "grad_norm": 1.9588635828677123, + "learning_rate": 3.735177865612648e-05, + "loss": 1.2139, + "step": 756 + }, + { + "epoch": 0.08976639392861378, + "grad_norm": 2.336764382832611, + "learning_rate": 3.740118577075099e-05, + "loss": 1.2051, + "step": 757 + }, + { + "epoch": 0.08988497569073876, + "grad_norm": 2.0604024003027432, + "learning_rate": 3.7450592885375494e-05, + "loss": 1.5451, + "step": 758 + }, + { + "epoch": 0.09000355745286374, + "grad_norm": 1.9087552143741133, + "learning_rate": 3.7500000000000003e-05, + "loss": 1.0638, + "step": 759 + }, + { + "epoch": 0.09012213921498874, + "grad_norm": 2.1629937244115167, + "learning_rate": 3.7549407114624506e-05, + "loss": 1.3252, + "step": 760 + }, + { + "epoch": 0.09024072097711372, + "grad_norm": 1.9272862447690944, + "learning_rate": 3.7598814229249015e-05, + "loss": 1.3971, + "step": 761 + }, + { + "epoch": 0.0903593027392387, + "grad_norm": 1.9707413014850337, + "learning_rate": 3.764822134387352e-05, + "loss": 1.3212, + "step": 762 + }, + { + "epoch": 0.0904778845013637, + "grad_norm": 1.9855428981452112, + "learning_rate": 3.7697628458498026e-05, + "loss": 1.3926, + "step": 763 + }, + { + "epoch": 0.09059646626348868, + "grad_norm": 2.0081166667511017, + "learning_rate": 3.774703557312253e-05, + "loss": 1.1415, + "step": 764 + }, + { + "epoch": 0.09071504802561366, + "grad_norm": 2.132496779620122, + "learning_rate": 3.779644268774704e-05, + "loss": 1.0158, + "step": 765 + }, + { + "epoch": 0.09083362978773865, + "grad_norm": 1.8882261360932378, + "learning_rate": 3.784584980237154e-05, + "loss": 1.2197, + "step": 766 + }, + { + "epoch": 0.09095221154986363, + "grad_norm": 2.072963891619346, + "learning_rate": 3.789525691699605e-05, + "loss": 1.0694, + "step": 767 + }, + { + "epoch": 0.09107079331198861, + "grad_norm": 1.9686449742403433, + "learning_rate": 3.794466403162055e-05, + "loss": 1.1653, + "step": 768 + }, + { + "epoch": 0.09118937507411361, + "grad_norm": 2.394420033356521, + "learning_rate": 3.7994071146245066e-05, + "loss": 1.2517, + "step": 769 + }, + { + "epoch": 0.09130795683623859, + "grad_norm": 1.79923002967305, + "learning_rate": 3.804347826086957e-05, + "loss": 1.4045, + "step": 770 + }, + { + "epoch": 0.09142653859836357, + "grad_norm": 2.45011876921403, + "learning_rate": 3.809288537549408e-05, + "loss": 1.1309, + "step": 771 + }, + { + "epoch": 0.09154512036048856, + "grad_norm": 2.039062136245424, + "learning_rate": 3.814229249011858e-05, + "loss": 1.176, + "step": 772 + }, + { + "epoch": 0.09166370212261354, + "grad_norm": 1.8993568309514166, + "learning_rate": 3.819169960474309e-05, + "loss": 1.4058, + "step": 773 + }, + { + "epoch": 0.09178228388473852, + "grad_norm": 1.987374335073063, + "learning_rate": 3.824110671936759e-05, + "loss": 1.2478, + "step": 774 + }, + { + "epoch": 0.0919008656468635, + "grad_norm": 1.9683668826973582, + "learning_rate": 3.82905138339921e-05, + "loss": 1.0807, + "step": 775 + }, + { + "epoch": 0.0920194474089885, + "grad_norm": 1.726234548030342, + "learning_rate": 3.83399209486166e-05, + "loss": 1.1416, + "step": 776 + }, + { + "epoch": 0.09213802917111348, + "grad_norm": 2.056944594881047, + "learning_rate": 3.838932806324111e-05, + "loss": 1.0965, + "step": 777 + }, + { + "epoch": 0.09225661093323846, + "grad_norm": 2.03884930625359, + "learning_rate": 3.8438735177865614e-05, + "loss": 1.0829, + "step": 778 + }, + { + "epoch": 0.09237519269536346, + "grad_norm": 1.9918481277966453, + "learning_rate": 3.848814229249012e-05, + "loss": 1.2253, + "step": 779 + }, + { + "epoch": 0.09249377445748844, + "grad_norm": 1.8589113310217127, + "learning_rate": 3.8537549407114625e-05, + "loss": 1.4867, + "step": 780 + }, + { + "epoch": 0.09261235621961342, + "grad_norm": 1.7108111605808969, + "learning_rate": 3.8586956521739134e-05, + "loss": 0.9607, + "step": 781 + }, + { + "epoch": 0.09273093798173841, + "grad_norm": 1.9579371064233615, + "learning_rate": 3.8636363636363636e-05, + "loss": 1.0628, + "step": 782 + }, + { + "epoch": 0.09284951974386339, + "grad_norm": 2.3083962779000644, + "learning_rate": 3.8685770750988145e-05, + "loss": 1.0626, + "step": 783 + }, + { + "epoch": 0.09296810150598837, + "grad_norm": 2.0871321095522672, + "learning_rate": 3.873517786561265e-05, + "loss": 1.2407, + "step": 784 + }, + { + "epoch": 0.09308668326811337, + "grad_norm": 1.830627626149258, + "learning_rate": 3.8784584980237156e-05, + "loss": 1.244, + "step": 785 + }, + { + "epoch": 0.09320526503023835, + "grad_norm": 1.9163261155014395, + "learning_rate": 3.883399209486166e-05, + "loss": 1.1841, + "step": 786 + }, + { + "epoch": 0.09332384679236333, + "grad_norm": 2.1374368197020917, + "learning_rate": 3.888339920948617e-05, + "loss": 1.1872, + "step": 787 + }, + { + "epoch": 0.09344242855448832, + "grad_norm": 2.2879414968759813, + "learning_rate": 3.893280632411067e-05, + "loss": 1.2956, + "step": 788 + }, + { + "epoch": 0.0935610103166133, + "grad_norm": 1.919613605997733, + "learning_rate": 3.8982213438735186e-05, + "loss": 1.0762, + "step": 789 + }, + { + "epoch": 0.09367959207873829, + "grad_norm": 1.9117486478745016, + "learning_rate": 3.903162055335969e-05, + "loss": 1.1087, + "step": 790 + }, + { + "epoch": 0.09379817384086328, + "grad_norm": 2.1692338885653166, + "learning_rate": 3.90810276679842e-05, + "loss": 1.3508, + "step": 791 + }, + { + "epoch": 0.09391675560298826, + "grad_norm": 1.9707297484103659, + "learning_rate": 3.91304347826087e-05, + "loss": 1.1895, + "step": 792 + }, + { + "epoch": 0.09403533736511324, + "grad_norm": 2.080641285140318, + "learning_rate": 3.917984189723321e-05, + "loss": 1.2609, + "step": 793 + }, + { + "epoch": 0.09415391912723824, + "grad_norm": 2.0771137201593297, + "learning_rate": 3.922924901185771e-05, + "loss": 1.3348, + "step": 794 + }, + { + "epoch": 0.09427250088936322, + "grad_norm": 2.082356002136901, + "learning_rate": 3.927865612648222e-05, + "loss": 1.1172, + "step": 795 + }, + { + "epoch": 0.0943910826514882, + "grad_norm": 2.0592410333350677, + "learning_rate": 3.932806324110672e-05, + "loss": 1.2573, + "step": 796 + }, + { + "epoch": 0.09450966441361319, + "grad_norm": 2.0423949289939287, + "learning_rate": 3.937747035573123e-05, + "loss": 1.5129, + "step": 797 + }, + { + "epoch": 0.09462824617573817, + "grad_norm": 1.8842003872612643, + "learning_rate": 3.942687747035573e-05, + "loss": 1.2048, + "step": 798 + }, + { + "epoch": 0.09474682793786315, + "grad_norm": 2.1131952328169556, + "learning_rate": 3.947628458498024e-05, + "loss": 1.3676, + "step": 799 + }, + { + "epoch": 0.09486540969998815, + "grad_norm": 2.182430677382619, + "learning_rate": 3.9525691699604744e-05, + "loss": 1.1955, + "step": 800 + }, + { + "epoch": 0.09498399146211313, + "grad_norm": 2.059213024896555, + "learning_rate": 3.957509881422925e-05, + "loss": 1.0985, + "step": 801 + }, + { + "epoch": 0.09510257322423811, + "grad_norm": 1.6369766530788656, + "learning_rate": 3.9624505928853755e-05, + "loss": 1.1896, + "step": 802 + }, + { + "epoch": 0.09522115498636309, + "grad_norm": 1.8255258387813063, + "learning_rate": 3.9673913043478264e-05, + "loss": 1.2477, + "step": 803 + }, + { + "epoch": 0.09533973674848809, + "grad_norm": 2.162579954024575, + "learning_rate": 3.9723320158102766e-05, + "loss": 1.1176, + "step": 804 + }, + { + "epoch": 0.09545831851061307, + "grad_norm": 2.004376525231961, + "learning_rate": 3.9772727272727275e-05, + "loss": 1.215, + "step": 805 + }, + { + "epoch": 0.09557690027273805, + "grad_norm": 1.6962724045968693, + "learning_rate": 3.982213438735178e-05, + "loss": 1.1027, + "step": 806 + }, + { + "epoch": 0.09569548203486304, + "grad_norm": 1.846328983639688, + "learning_rate": 3.987154150197629e-05, + "loss": 1.2272, + "step": 807 + }, + { + "epoch": 0.09581406379698802, + "grad_norm": 2.2400765181151345, + "learning_rate": 3.9920948616600796e-05, + "loss": 1.3806, + "step": 808 + }, + { + "epoch": 0.095932645559113, + "grad_norm": 2.3032432860820515, + "learning_rate": 3.99703557312253e-05, + "loss": 1.3515, + "step": 809 + }, + { + "epoch": 0.096051227321238, + "grad_norm": 1.8562954221125572, + "learning_rate": 4.001976284584981e-05, + "loss": 1.2801, + "step": 810 + }, + { + "epoch": 0.09616980908336298, + "grad_norm": 1.853668007566561, + "learning_rate": 4.006916996047431e-05, + "loss": 1.3078, + "step": 811 + }, + { + "epoch": 0.09628839084548796, + "grad_norm": 2.1704001324965327, + "learning_rate": 4.011857707509882e-05, + "loss": 1.1313, + "step": 812 + }, + { + "epoch": 0.09640697260761295, + "grad_norm": 2.006749582829935, + "learning_rate": 4.016798418972332e-05, + "loss": 1.1217, + "step": 813 + }, + { + "epoch": 0.09652555436973793, + "grad_norm": 1.9572131864352607, + "learning_rate": 4.021739130434783e-05, + "loss": 1.2878, + "step": 814 + }, + { + "epoch": 0.09664413613186291, + "grad_norm": 1.9897941710919402, + "learning_rate": 4.026679841897233e-05, + "loss": 1.1713, + "step": 815 + }, + { + "epoch": 0.09676271789398791, + "grad_norm": 2.0169852356240012, + "learning_rate": 4.031620553359684e-05, + "loss": 1.304, + "step": 816 + }, + { + "epoch": 0.09688129965611289, + "grad_norm": 1.8101163459236398, + "learning_rate": 4.036561264822134e-05, + "loss": 1.2212, + "step": 817 + }, + { + "epoch": 0.09699988141823787, + "grad_norm": 2.045954558587558, + "learning_rate": 4.041501976284585e-05, + "loss": 1.3892, + "step": 818 + }, + { + "epoch": 0.09711846318036287, + "grad_norm": 2.0244257526296154, + "learning_rate": 4.0464426877470354e-05, + "loss": 1.1943, + "step": 819 + }, + { + "epoch": 0.09723704494248785, + "grad_norm": 1.8140753757579653, + "learning_rate": 4.051383399209486e-05, + "loss": 1.3084, + "step": 820 + }, + { + "epoch": 0.09735562670461283, + "grad_norm": 1.8014643051122026, + "learning_rate": 4.0563241106719365e-05, + "loss": 1.0362, + "step": 821 + }, + { + "epoch": 0.09747420846673782, + "grad_norm": 2.0596177893138563, + "learning_rate": 4.0612648221343874e-05, + "loss": 1.5499, + "step": 822 + }, + { + "epoch": 0.0975927902288628, + "grad_norm": 1.818569622680358, + "learning_rate": 4.0662055335968377e-05, + "loss": 1.249, + "step": 823 + }, + { + "epoch": 0.09771137199098778, + "grad_norm": 2.048847924967824, + "learning_rate": 4.0711462450592886e-05, + "loss": 1.4027, + "step": 824 + }, + { + "epoch": 0.09782995375311278, + "grad_norm": 2.0016751634662118, + "learning_rate": 4.076086956521739e-05, + "loss": 1.2888, + "step": 825 + }, + { + "epoch": 0.09794853551523776, + "grad_norm": 2.0902610110569513, + "learning_rate": 4.08102766798419e-05, + "loss": 1.2238, + "step": 826 + }, + { + "epoch": 0.09806711727736274, + "grad_norm": 2.118515128947569, + "learning_rate": 4.0859683794466406e-05, + "loss": 1.4035, + "step": 827 + }, + { + "epoch": 0.09818569903948772, + "grad_norm": 1.9549421008196457, + "learning_rate": 4.0909090909090915e-05, + "loss": 1.0099, + "step": 828 + }, + { + "epoch": 0.09830428080161271, + "grad_norm": 1.9369676130278435, + "learning_rate": 4.095849802371542e-05, + "loss": 1.1756, + "step": 829 + }, + { + "epoch": 0.0984228625637377, + "grad_norm": 2.0467149459912326, + "learning_rate": 4.1007905138339926e-05, + "loss": 1.3032, + "step": 830 + }, + { + "epoch": 0.09854144432586268, + "grad_norm": 2.1771320433376014, + "learning_rate": 4.105731225296443e-05, + "loss": 1.2864, + "step": 831 + }, + { + "epoch": 0.09866002608798767, + "grad_norm": 1.8247310544606976, + "learning_rate": 4.110671936758894e-05, + "loss": 1.2725, + "step": 832 + }, + { + "epoch": 0.09877860785011265, + "grad_norm": 1.7473377159219128, + "learning_rate": 4.115612648221344e-05, + "loss": 1.252, + "step": 833 + }, + { + "epoch": 0.09889718961223763, + "grad_norm": 1.8955558776420427, + "learning_rate": 4.120553359683795e-05, + "loss": 1.2867, + "step": 834 + }, + { + "epoch": 0.09901577137436263, + "grad_norm": 1.8694595682778619, + "learning_rate": 4.125494071146245e-05, + "loss": 1.074, + "step": 835 + }, + { + "epoch": 0.09913435313648761, + "grad_norm": 1.7139564894242536, + "learning_rate": 4.130434782608696e-05, + "loss": 1.188, + "step": 836 + }, + { + "epoch": 0.09925293489861259, + "grad_norm": 1.6488950592388731, + "learning_rate": 4.135375494071146e-05, + "loss": 1.3664, + "step": 837 + }, + { + "epoch": 0.09937151666073758, + "grad_norm": 1.9726394321118084, + "learning_rate": 4.140316205533597e-05, + "loss": 1.0221, + "step": 838 + }, + { + "epoch": 0.09949009842286256, + "grad_norm": 2.0623449352578387, + "learning_rate": 4.145256916996047e-05, + "loss": 1.1312, + "step": 839 + }, + { + "epoch": 0.09960868018498754, + "grad_norm": 1.9312461884967698, + "learning_rate": 4.150197628458498e-05, + "loss": 1.3727, + "step": 840 + }, + { + "epoch": 0.09972726194711254, + "grad_norm": 1.9786099830436268, + "learning_rate": 4.1551383399209484e-05, + "loss": 1.3299, + "step": 841 + }, + { + "epoch": 0.09984584370923752, + "grad_norm": 2.4660285091151515, + "learning_rate": 4.160079051383399e-05, + "loss": 1.3515, + "step": 842 + }, + { + "epoch": 0.0999644254713625, + "grad_norm": 1.9237543580154044, + "learning_rate": 4.1650197628458496e-05, + "loss": 1.4227, + "step": 843 + }, + { + "epoch": 0.1000830072334875, + "grad_norm": 1.7351420748317194, + "learning_rate": 4.1699604743083005e-05, + "loss": 0.6643, + "step": 844 + }, + { + "epoch": 0.10020158899561248, + "grad_norm": 2.0591602458797316, + "learning_rate": 4.174901185770751e-05, + "loss": 1.404, + "step": 845 + }, + { + "epoch": 0.10032017075773746, + "grad_norm": 1.8192293948305147, + "learning_rate": 4.1798418972332016e-05, + "loss": 1.1474, + "step": 846 + }, + { + "epoch": 0.10043875251986245, + "grad_norm": 2.0611499294257922, + "learning_rate": 4.1847826086956525e-05, + "loss": 1.2152, + "step": 847 + }, + { + "epoch": 0.10055733428198743, + "grad_norm": 2.2534487177219074, + "learning_rate": 4.1897233201581034e-05, + "loss": 1.0678, + "step": 848 + }, + { + "epoch": 0.10067591604411241, + "grad_norm": 1.968717669879219, + "learning_rate": 4.1946640316205536e-05, + "loss": 1.1626, + "step": 849 + }, + { + "epoch": 0.10079449780623741, + "grad_norm": 2.156380333963278, + "learning_rate": 4.1996047430830045e-05, + "loss": 1.0764, + "step": 850 + }, + { + "epoch": 0.10091307956836239, + "grad_norm": 1.9497695815682135, + "learning_rate": 4.204545454545455e-05, + "loss": 1.1463, + "step": 851 + }, + { + "epoch": 0.10103166133048737, + "grad_norm": 1.9879877753040118, + "learning_rate": 4.2094861660079056e-05, + "loss": 1.0398, + "step": 852 + }, + { + "epoch": 0.10115024309261236, + "grad_norm": 2.1932365865877395, + "learning_rate": 4.214426877470356e-05, + "loss": 1.3112, + "step": 853 + }, + { + "epoch": 0.10126882485473734, + "grad_norm": 2.2291811291446226, + "learning_rate": 4.219367588932807e-05, + "loss": 1.2053, + "step": 854 + }, + { + "epoch": 0.10138740661686232, + "grad_norm": 2.4204808458141835, + "learning_rate": 4.224308300395257e-05, + "loss": 0.9389, + "step": 855 + }, + { + "epoch": 0.1015059883789873, + "grad_norm": 1.887568640982617, + "learning_rate": 4.229249011857708e-05, + "loss": 1.3766, + "step": 856 + }, + { + "epoch": 0.1016245701411123, + "grad_norm": 2.0088931764450924, + "learning_rate": 4.234189723320158e-05, + "loss": 1.3968, + "step": 857 + }, + { + "epoch": 0.10174315190323728, + "grad_norm": 1.8863378987932518, + "learning_rate": 4.239130434782609e-05, + "loss": 0.9547, + "step": 858 + }, + { + "epoch": 0.10186173366536226, + "grad_norm": 2.166206132233298, + "learning_rate": 4.244071146245059e-05, + "loss": 1.0599, + "step": 859 + }, + { + "epoch": 0.10198031542748726, + "grad_norm": 1.9023084798837424, + "learning_rate": 4.24901185770751e-05, + "loss": 1.3391, + "step": 860 + }, + { + "epoch": 0.10209889718961224, + "grad_norm": 1.966553097866335, + "learning_rate": 4.2539525691699603e-05, + "loss": 1.2819, + "step": 861 + }, + { + "epoch": 0.10221747895173722, + "grad_norm": 1.908856504444068, + "learning_rate": 4.258893280632411e-05, + "loss": 1.1397, + "step": 862 + }, + { + "epoch": 0.10233606071386221, + "grad_norm": 2.0220728533214576, + "learning_rate": 4.2638339920948615e-05, + "loss": 1.3664, + "step": 863 + }, + { + "epoch": 0.10245464247598719, + "grad_norm": 1.9501631201580372, + "learning_rate": 4.2687747035573124e-05, + "loss": 1.2686, + "step": 864 + }, + { + "epoch": 0.10257322423811217, + "grad_norm": 1.769558256501275, + "learning_rate": 4.2737154150197626e-05, + "loss": 1.3436, + "step": 865 + }, + { + "epoch": 0.10269180600023717, + "grad_norm": 1.8578844166907498, + "learning_rate": 4.2786561264822135e-05, + "loss": 1.2387, + "step": 866 + }, + { + "epoch": 0.10281038776236215, + "grad_norm": 1.9678988334101155, + "learning_rate": 4.2835968379446644e-05, + "loss": 1.4244, + "step": 867 + }, + { + "epoch": 0.10292896952448713, + "grad_norm": 2.191004380786469, + "learning_rate": 4.288537549407115e-05, + "loss": 1.1014, + "step": 868 + }, + { + "epoch": 0.10304755128661212, + "grad_norm": 2.074482620996577, + "learning_rate": 4.2934782608695655e-05, + "loss": 1.5526, + "step": 869 + }, + { + "epoch": 0.1031661330487371, + "grad_norm": 2.2228737816081243, + "learning_rate": 4.2984189723320164e-05, + "loss": 1.2706, + "step": 870 + }, + { + "epoch": 0.10328471481086209, + "grad_norm": 1.745876903000705, + "learning_rate": 4.3033596837944666e-05, + "loss": 0.9514, + "step": 871 + }, + { + "epoch": 0.10340329657298708, + "grad_norm": 2.0824681483484464, + "learning_rate": 4.3083003952569175e-05, + "loss": 1.2042, + "step": 872 + }, + { + "epoch": 0.10352187833511206, + "grad_norm": 2.0967339509318044, + "learning_rate": 4.313241106719368e-05, + "loss": 0.8052, + "step": 873 + }, + { + "epoch": 0.10364046009723704, + "grad_norm": 1.8949877611990555, + "learning_rate": 4.318181818181819e-05, + "loss": 1.1511, + "step": 874 + }, + { + "epoch": 0.10375904185936204, + "grad_norm": 1.9043727390088339, + "learning_rate": 4.323122529644269e-05, + "loss": 1.2973, + "step": 875 + }, + { + "epoch": 0.10387762362148702, + "grad_norm": 2.0952638816849505, + "learning_rate": 4.32806324110672e-05, + "loss": 1.1198, + "step": 876 + }, + { + "epoch": 0.103996205383612, + "grad_norm": 2.4468528049292, + "learning_rate": 4.33300395256917e-05, + "loss": 1.1874, + "step": 877 + }, + { + "epoch": 0.10411478714573699, + "grad_norm": 2.2346995754120704, + "learning_rate": 4.337944664031621e-05, + "loss": 1.2748, + "step": 878 + }, + { + "epoch": 0.10423336890786197, + "grad_norm": 1.902601337232446, + "learning_rate": 4.342885375494071e-05, + "loss": 1.2307, + "step": 879 + }, + { + "epoch": 0.10435195066998695, + "grad_norm": 1.9487531221239764, + "learning_rate": 4.347826086956522e-05, + "loss": 1.2994, + "step": 880 + }, + { + "epoch": 0.10447053243211193, + "grad_norm": 1.7596427645246235, + "learning_rate": 4.352766798418972e-05, + "loss": 1.3672, + "step": 881 + }, + { + "epoch": 0.10458911419423693, + "grad_norm": 1.7590861378320122, + "learning_rate": 4.357707509881423e-05, + "loss": 1.2722, + "step": 882 + }, + { + "epoch": 0.10470769595636191, + "grad_norm": 2.044211916033582, + "learning_rate": 4.3626482213438734e-05, + "loss": 0.9985, + "step": 883 + }, + { + "epoch": 0.10482627771848689, + "grad_norm": 1.6468563802673024, + "learning_rate": 4.367588932806324e-05, + "loss": 1.1106, + "step": 884 + }, + { + "epoch": 0.10494485948061189, + "grad_norm": 1.801814450984469, + "learning_rate": 4.3725296442687745e-05, + "loss": 1.3193, + "step": 885 + }, + { + "epoch": 0.10506344124273687, + "grad_norm": 1.9961628868622643, + "learning_rate": 4.377470355731226e-05, + "loss": 1.0353, + "step": 886 + }, + { + "epoch": 0.10518202300486185, + "grad_norm": 1.8436219413080774, + "learning_rate": 4.382411067193676e-05, + "loss": 1.1249, + "step": 887 + }, + { + "epoch": 0.10530060476698684, + "grad_norm": 1.9741144551584124, + "learning_rate": 4.387351778656127e-05, + "loss": 1.1672, + "step": 888 + }, + { + "epoch": 0.10541918652911182, + "grad_norm": 1.6923283503935258, + "learning_rate": 4.3922924901185774e-05, + "loss": 1.2347, + "step": 889 + }, + { + "epoch": 0.1055377682912368, + "grad_norm": 1.6409134952018314, + "learning_rate": 4.397233201581028e-05, + "loss": 0.996, + "step": 890 + }, + { + "epoch": 0.1056563500533618, + "grad_norm": 1.7559903407107664, + "learning_rate": 4.4021739130434786e-05, + "loss": 1.3053, + "step": 891 + }, + { + "epoch": 0.10577493181548678, + "grad_norm": 1.7279329119648792, + "learning_rate": 4.4071146245059295e-05, + "loss": 1.4076, + "step": 892 + }, + { + "epoch": 0.10589351357761176, + "grad_norm": 2.1244467806105707, + "learning_rate": 4.41205533596838e-05, + "loss": 1.4878, + "step": 893 + }, + { + "epoch": 0.10601209533973675, + "grad_norm": 2.3752813238702535, + "learning_rate": 4.4169960474308306e-05, + "loss": 1.2124, + "step": 894 + }, + { + "epoch": 0.10613067710186173, + "grad_norm": 2.05779691288018, + "learning_rate": 4.421936758893281e-05, + "loss": 1.2139, + "step": 895 + }, + { + "epoch": 0.10624925886398671, + "grad_norm": 1.9261420575362562, + "learning_rate": 4.426877470355732e-05, + "loss": 0.9466, + "step": 896 + }, + { + "epoch": 0.10636784062611171, + "grad_norm": 1.7298709781494437, + "learning_rate": 4.431818181818182e-05, + "loss": 1.2658, + "step": 897 + }, + { + "epoch": 0.10648642238823669, + "grad_norm": 1.658945611223593, + "learning_rate": 4.436758893280633e-05, + "loss": 1.2954, + "step": 898 + }, + { + "epoch": 0.10660500415036167, + "grad_norm": 1.8339472691663428, + "learning_rate": 4.441699604743083e-05, + "loss": 1.1393, + "step": 899 + }, + { + "epoch": 0.10672358591248667, + "grad_norm": 1.9285292451385885, + "learning_rate": 4.446640316205534e-05, + "loss": 1.3713, + "step": 900 + }, + { + "epoch": 0.10684216767461165, + "grad_norm": 2.1134731542458156, + "learning_rate": 4.451581027667984e-05, + "loss": 1.1754, + "step": 901 + }, + { + "epoch": 0.10696074943673663, + "grad_norm": 1.8981371458183007, + "learning_rate": 4.456521739130435e-05, + "loss": 1.2121, + "step": 902 + }, + { + "epoch": 0.10707933119886162, + "grad_norm": 1.7672718156652993, + "learning_rate": 4.461462450592885e-05, + "loss": 1.2403, + "step": 903 + }, + { + "epoch": 0.1071979129609866, + "grad_norm": 1.8165985090687495, + "learning_rate": 4.466403162055336e-05, + "loss": 1.2038, + "step": 904 + }, + { + "epoch": 0.10731649472311158, + "grad_norm": 1.8549108104357437, + "learning_rate": 4.471343873517787e-05, + "loss": 1.3917, + "step": 905 + }, + { + "epoch": 0.10743507648523658, + "grad_norm": 1.9023262164992225, + "learning_rate": 4.476284584980237e-05, + "loss": 1.2924, + "step": 906 + }, + { + "epoch": 0.10755365824736156, + "grad_norm": 2.029546637131855, + "learning_rate": 4.481225296442688e-05, + "loss": 1.3235, + "step": 907 + }, + { + "epoch": 0.10767224000948654, + "grad_norm": 1.7274872916585335, + "learning_rate": 4.4861660079051384e-05, + "loss": 1.2639, + "step": 908 + }, + { + "epoch": 0.10779082177161152, + "grad_norm": 2.3181291421702213, + "learning_rate": 4.4911067193675893e-05, + "loss": 1.1122, + "step": 909 + }, + { + "epoch": 0.10790940353373651, + "grad_norm": 1.6709200747355057, + "learning_rate": 4.4960474308300396e-05, + "loss": 0.8722, + "step": 910 + }, + { + "epoch": 0.1080279852958615, + "grad_norm": 1.9262307751965115, + "learning_rate": 4.5009881422924905e-05, + "loss": 1.3221, + "step": 911 + }, + { + "epoch": 0.10814656705798648, + "grad_norm": 1.7541556743157296, + "learning_rate": 4.505928853754941e-05, + "loss": 1.2547, + "step": 912 + }, + { + "epoch": 0.10826514882011147, + "grad_norm": 2.0693202513228264, + "learning_rate": 4.5108695652173916e-05, + "loss": 1.1727, + "step": 913 + }, + { + "epoch": 0.10838373058223645, + "grad_norm": 1.6748716546145963, + "learning_rate": 4.515810276679842e-05, + "loss": 1.0561, + "step": 914 + }, + { + "epoch": 0.10850231234436143, + "grad_norm": 1.7669442762098209, + "learning_rate": 4.520750988142293e-05, + "loss": 1.0247, + "step": 915 + }, + { + "epoch": 0.10862089410648643, + "grad_norm": 1.764858036082851, + "learning_rate": 4.525691699604743e-05, + "loss": 1.3064, + "step": 916 + }, + { + "epoch": 0.10873947586861141, + "grad_norm": 1.8523355766215788, + "learning_rate": 4.530632411067194e-05, + "loss": 1.3436, + "step": 917 + }, + { + "epoch": 0.10885805763073639, + "grad_norm": 1.7208198277303732, + "learning_rate": 4.535573122529644e-05, + "loss": 1.2577, + "step": 918 + }, + { + "epoch": 0.10897663939286138, + "grad_norm": 1.8955864533123026, + "learning_rate": 4.540513833992095e-05, + "loss": 0.9976, + "step": 919 + }, + { + "epoch": 0.10909522115498636, + "grad_norm": 1.9759289842880057, + "learning_rate": 4.545454545454546e-05, + "loss": 1.2256, + "step": 920 + }, + { + "epoch": 0.10921380291711134, + "grad_norm": 1.8432663662277446, + "learning_rate": 4.550395256916996e-05, + "loss": 1.1955, + "step": 921 + }, + { + "epoch": 0.10933238467923634, + "grad_norm": 1.9418810962425284, + "learning_rate": 4.555335968379447e-05, + "loss": 1.2502, + "step": 922 + }, + { + "epoch": 0.10945096644136132, + "grad_norm": 2.1153215664389817, + "learning_rate": 4.560276679841897e-05, + "loss": 1.2582, + "step": 923 + }, + { + "epoch": 0.1095695482034863, + "grad_norm": 2.1726670434809767, + "learning_rate": 4.565217391304348e-05, + "loss": 1.2586, + "step": 924 + }, + { + "epoch": 0.1096881299656113, + "grad_norm": 1.9671307783727052, + "learning_rate": 4.570158102766799e-05, + "loss": 1.3821, + "step": 925 + }, + { + "epoch": 0.10980671172773628, + "grad_norm": 2.0521507782723694, + "learning_rate": 4.575098814229249e-05, + "loss": 1.37, + "step": 926 + }, + { + "epoch": 0.10992529348986126, + "grad_norm": 1.9181584692109688, + "learning_rate": 4.5800395256917e-05, + "loss": 0.9747, + "step": 927 + }, + { + "epoch": 0.11004387525198625, + "grad_norm": 1.8722199623877365, + "learning_rate": 4.5849802371541504e-05, + "loss": 1.0937, + "step": 928 + }, + { + "epoch": 0.11016245701411123, + "grad_norm": 1.7865272778702999, + "learning_rate": 4.589920948616601e-05, + "loss": 1.1477, + "step": 929 + }, + { + "epoch": 0.11028103877623621, + "grad_norm": 1.7119662914166476, + "learning_rate": 4.5948616600790515e-05, + "loss": 1.2592, + "step": 930 + }, + { + "epoch": 0.1103996205383612, + "grad_norm": 1.7885041358769678, + "learning_rate": 4.5998023715415024e-05, + "loss": 1.1396, + "step": 931 + }, + { + "epoch": 0.11051820230048619, + "grad_norm": 2.0130227835378314, + "learning_rate": 4.6047430830039526e-05, + "loss": 1.0512, + "step": 932 + }, + { + "epoch": 0.11063678406261117, + "grad_norm": 1.8315683633769348, + "learning_rate": 4.6096837944664035e-05, + "loss": 1.4377, + "step": 933 + }, + { + "epoch": 0.11075536582473615, + "grad_norm": 1.8732028503154297, + "learning_rate": 4.614624505928854e-05, + "loss": 1.2662, + "step": 934 + }, + { + "epoch": 0.11087394758686114, + "grad_norm": 1.9999463394591568, + "learning_rate": 4.6195652173913046e-05, + "loss": 1.2867, + "step": 935 + }, + { + "epoch": 0.11099252934898612, + "grad_norm": 2.1162199150606886, + "learning_rate": 4.624505928853755e-05, + "loss": 1.232, + "step": 936 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 1.896852560904737, + "learning_rate": 4.629446640316206e-05, + "loss": 0.9709, + "step": 937 + }, + { + "epoch": 0.1112296928732361, + "grad_norm": 2.1124524990856726, + "learning_rate": 4.634387351778656e-05, + "loss": 1.3192, + "step": 938 + }, + { + "epoch": 0.11134827463536108, + "grad_norm": 2.0247553109642897, + "learning_rate": 4.639328063241107e-05, + "loss": 1.0495, + "step": 939 + }, + { + "epoch": 0.11146685639748606, + "grad_norm": 1.837032193157011, + "learning_rate": 4.644268774703557e-05, + "loss": 1.2016, + "step": 940 + }, + { + "epoch": 0.11158543815961106, + "grad_norm": 1.7637329162392503, + "learning_rate": 4.649209486166008e-05, + "loss": 1.1527, + "step": 941 + }, + { + "epoch": 0.11170401992173604, + "grad_norm": 1.7004539282959317, + "learning_rate": 4.654150197628458e-05, + "loss": 0.8982, + "step": 942 + }, + { + "epoch": 0.11182260168386102, + "grad_norm": 1.6509845886045809, + "learning_rate": 4.659090909090909e-05, + "loss": 1.1349, + "step": 943 + }, + { + "epoch": 0.11194118344598601, + "grad_norm": 2.01819605967656, + "learning_rate": 4.66403162055336e-05, + "loss": 1.2484, + "step": 944 + }, + { + "epoch": 0.11205976520811099, + "grad_norm": 1.8885785050848634, + "learning_rate": 4.668972332015811e-05, + "loss": 1.2451, + "step": 945 + }, + { + "epoch": 0.11217834697023597, + "grad_norm": 1.7149115286236059, + "learning_rate": 4.673913043478261e-05, + "loss": 0.9956, + "step": 946 + }, + { + "epoch": 0.11229692873236097, + "grad_norm": 1.858290844796101, + "learning_rate": 4.678853754940712e-05, + "loss": 0.9594, + "step": 947 + }, + { + "epoch": 0.11241551049448595, + "grad_norm": 1.6783958274539101, + "learning_rate": 4.683794466403162e-05, + "loss": 0.8649, + "step": 948 + }, + { + "epoch": 0.11253409225661093, + "grad_norm": 1.8331216161311423, + "learning_rate": 4.688735177865613e-05, + "loss": 1.4045, + "step": 949 + }, + { + "epoch": 0.11265267401873592, + "grad_norm": 2.153906467736849, + "learning_rate": 4.6936758893280634e-05, + "loss": 1.4198, + "step": 950 + }, + { + "epoch": 0.1127712557808609, + "grad_norm": 2.235965827757821, + "learning_rate": 4.698616600790514e-05, + "loss": 1.3424, + "step": 951 + }, + { + "epoch": 0.11288983754298589, + "grad_norm": 2.155332770523107, + "learning_rate": 4.7035573122529645e-05, + "loss": 1.3781, + "step": 952 + }, + { + "epoch": 0.11300841930511088, + "grad_norm": 1.8623449872378037, + "learning_rate": 4.7084980237154154e-05, + "loss": 1.2494, + "step": 953 + }, + { + "epoch": 0.11312700106723586, + "grad_norm": 2.087247285271247, + "learning_rate": 4.7134387351778656e-05, + "loss": 1.2624, + "step": 954 + }, + { + "epoch": 0.11324558282936084, + "grad_norm": 1.813153002810931, + "learning_rate": 4.7183794466403165e-05, + "loss": 1.4037, + "step": 955 + }, + { + "epoch": 0.11336416459148584, + "grad_norm": 1.7715822596606956, + "learning_rate": 4.723320158102767e-05, + "loss": 1.0358, + "step": 956 + }, + { + "epoch": 0.11348274635361082, + "grad_norm": 1.783962441375639, + "learning_rate": 4.7282608695652177e-05, + "loss": 1.1431, + "step": 957 + }, + { + "epoch": 0.1136013281157358, + "grad_norm": 1.8052430807935291, + "learning_rate": 4.733201581027668e-05, + "loss": 1.0908, + "step": 958 + }, + { + "epoch": 0.11371990987786078, + "grad_norm": 1.8586355378281585, + "learning_rate": 4.738142292490119e-05, + "loss": 1.0749, + "step": 959 + }, + { + "epoch": 0.11383849163998577, + "grad_norm": 2.0881727003165755, + "learning_rate": 4.743083003952569e-05, + "loss": 1.1952, + "step": 960 + }, + { + "epoch": 0.11395707340211075, + "grad_norm": 1.6846664201538244, + "learning_rate": 4.74802371541502e-05, + "loss": 1.3378, + "step": 961 + }, + { + "epoch": 0.11407565516423573, + "grad_norm": 2.005243313616091, + "learning_rate": 4.75296442687747e-05, + "loss": 1.2846, + "step": 962 + }, + { + "epoch": 0.11419423692636073, + "grad_norm": 2.3306286589425995, + "learning_rate": 4.757905138339921e-05, + "loss": 1.2981, + "step": 963 + }, + { + "epoch": 0.11431281868848571, + "grad_norm": 1.900634256089495, + "learning_rate": 4.762845849802372e-05, + "loss": 1.4166, + "step": 964 + }, + { + "epoch": 0.11443140045061069, + "grad_norm": 2.0920531846538304, + "learning_rate": 4.767786561264823e-05, + "loss": 1.2489, + "step": 965 + }, + { + "epoch": 0.11454998221273568, + "grad_norm": 1.96156413237464, + "learning_rate": 4.772727272727273e-05, + "loss": 1.2874, + "step": 966 + }, + { + "epoch": 0.11466856397486067, + "grad_norm": 1.9617195638348448, + "learning_rate": 4.777667984189724e-05, + "loss": 1.0668, + "step": 967 + }, + { + "epoch": 0.11478714573698565, + "grad_norm": 2.0481211742199674, + "learning_rate": 4.782608695652174e-05, + "loss": 1.1679, + "step": 968 + }, + { + "epoch": 0.11490572749911064, + "grad_norm": 2.0941894832795107, + "learning_rate": 4.787549407114625e-05, + "loss": 1.1117, + "step": 969 + }, + { + "epoch": 0.11502430926123562, + "grad_norm": 1.659940015126908, + "learning_rate": 4.792490118577075e-05, + "loss": 1.3317, + "step": 970 + }, + { + "epoch": 0.1151428910233606, + "grad_norm": 1.7143135700833254, + "learning_rate": 4.797430830039526e-05, + "loss": 1.2139, + "step": 971 + }, + { + "epoch": 0.1152614727854856, + "grad_norm": 1.8946769265830694, + "learning_rate": 4.8023715415019764e-05, + "loss": 1.1573, + "step": 972 + }, + { + "epoch": 0.11538005454761058, + "grad_norm": 1.9568290156255086, + "learning_rate": 4.807312252964427e-05, + "loss": 1.1493, + "step": 973 + }, + { + "epoch": 0.11549863630973556, + "grad_norm": 1.9682931863285784, + "learning_rate": 4.8122529644268775e-05, + "loss": 1.2079, + "step": 974 + }, + { + "epoch": 0.11561721807186055, + "grad_norm": 2.242643603494696, + "learning_rate": 4.8171936758893284e-05, + "loss": 1.2469, + "step": 975 + }, + { + "epoch": 0.11573579983398553, + "grad_norm": 1.881984303824199, + "learning_rate": 4.822134387351779e-05, + "loss": 1.2232, + "step": 976 + }, + { + "epoch": 0.11585438159611051, + "grad_norm": 1.9788716852599484, + "learning_rate": 4.8270750988142296e-05, + "loss": 1.1689, + "step": 977 + }, + { + "epoch": 0.11597296335823551, + "grad_norm": 1.852786357550817, + "learning_rate": 4.83201581027668e-05, + "loss": 1.1687, + "step": 978 + }, + { + "epoch": 0.11609154512036049, + "grad_norm": 2.013911311112838, + "learning_rate": 4.836956521739131e-05, + "loss": 1.2123, + "step": 979 + }, + { + "epoch": 0.11621012688248547, + "grad_norm": 1.979762269989753, + "learning_rate": 4.841897233201581e-05, + "loss": 1.0937, + "step": 980 + }, + { + "epoch": 0.11632870864461047, + "grad_norm": 1.9586515160489768, + "learning_rate": 4.846837944664032e-05, + "loss": 1.1928, + "step": 981 + }, + { + "epoch": 0.11644729040673545, + "grad_norm": 1.8787502593383414, + "learning_rate": 4.851778656126482e-05, + "loss": 1.2031, + "step": 982 + }, + { + "epoch": 0.11656587216886043, + "grad_norm": 1.8925454821695789, + "learning_rate": 4.8567193675889336e-05, + "loss": 1.035, + "step": 983 + }, + { + "epoch": 0.11668445393098542, + "grad_norm": 1.9625363754307843, + "learning_rate": 4.861660079051384e-05, + "loss": 1.0582, + "step": 984 + }, + { + "epoch": 0.1168030356931104, + "grad_norm": 1.9065950092131323, + "learning_rate": 4.866600790513835e-05, + "loss": 1.3718, + "step": 985 + }, + { + "epoch": 0.11692161745523538, + "grad_norm": 1.8046804656741444, + "learning_rate": 4.871541501976285e-05, + "loss": 1.1547, + "step": 986 + }, + { + "epoch": 0.11704019921736036, + "grad_norm": 1.7622499727660677, + "learning_rate": 4.876482213438736e-05, + "loss": 0.8967, + "step": 987 + }, + { + "epoch": 0.11715878097948536, + "grad_norm": 1.9781241882538605, + "learning_rate": 4.881422924901186e-05, + "loss": 1.3093, + "step": 988 + }, + { + "epoch": 0.11727736274161034, + "grad_norm": 1.724237468843233, + "learning_rate": 4.886363636363637e-05, + "loss": 1.2335, + "step": 989 + }, + { + "epoch": 0.11739594450373532, + "grad_norm": 1.8110047843615895, + "learning_rate": 4.891304347826087e-05, + "loss": 0.9987, + "step": 990 + }, + { + "epoch": 0.11751452626586031, + "grad_norm": 2.051439912230871, + "learning_rate": 4.896245059288538e-05, + "loss": 1.2339, + "step": 991 + }, + { + "epoch": 0.1176331080279853, + "grad_norm": 2.2060499732588257, + "learning_rate": 4.901185770750988e-05, + "loss": 1.1418, + "step": 992 + }, + { + "epoch": 0.11775168979011028, + "grad_norm": 1.8898111790633472, + "learning_rate": 4.906126482213439e-05, + "loss": 0.9866, + "step": 993 + }, + { + "epoch": 0.11787027155223527, + "grad_norm": 1.9105448729196526, + "learning_rate": 4.9110671936758895e-05, + "loss": 1.3148, + "step": 994 + }, + { + "epoch": 0.11798885331436025, + "grad_norm": 1.7988818762938, + "learning_rate": 4.9160079051383404e-05, + "loss": 0.6929, + "step": 995 + }, + { + "epoch": 0.11810743507648523, + "grad_norm": 1.6665733638139224, + "learning_rate": 4.9209486166007906e-05, + "loss": 0.9581, + "step": 996 + }, + { + "epoch": 0.11822601683861023, + "grad_norm": 2.1392490113267217, + "learning_rate": 4.9258893280632415e-05, + "loss": 1.3025, + "step": 997 + }, + { + "epoch": 0.1183445986007352, + "grad_norm": 1.6484191710586578, + "learning_rate": 4.930830039525692e-05, + "loss": 1.3707, + "step": 998 + }, + { + "epoch": 0.11846318036286019, + "grad_norm": 1.948458496037207, + "learning_rate": 4.9357707509881426e-05, + "loss": 1.2303, + "step": 999 + }, + { + "epoch": 0.11858176212498518, + "grad_norm": 1.8026660099959277, + "learning_rate": 4.940711462450593e-05, + "loss": 1.0804, + "step": 1000 + }, + { + "epoch": 0.11870034388711016, + "grad_norm": 1.8026466537417387, + "learning_rate": 4.945652173913044e-05, + "loss": 1.354, + "step": 1001 + }, + { + "epoch": 0.11881892564923514, + "grad_norm": 1.6044287487874351, + "learning_rate": 4.950592885375494e-05, + "loss": 1.3488, + "step": 1002 + }, + { + "epoch": 0.11893750741136014, + "grad_norm": 1.7719903825353065, + "learning_rate": 4.955533596837945e-05, + "loss": 1.1391, + "step": 1003 + }, + { + "epoch": 0.11905608917348512, + "grad_norm": 2.0330938724381307, + "learning_rate": 4.960474308300396e-05, + "loss": 1.0259, + "step": 1004 + }, + { + "epoch": 0.1191746709356101, + "grad_norm": 1.5916787119813633, + "learning_rate": 4.965415019762846e-05, + "loss": 0.8591, + "step": 1005 + }, + { + "epoch": 0.1192932526977351, + "grad_norm": 1.682357430479222, + "learning_rate": 4.970355731225297e-05, + "loss": 1.1286, + "step": 1006 + }, + { + "epoch": 0.11941183445986008, + "grad_norm": 1.932868773806507, + "learning_rate": 4.975296442687747e-05, + "loss": 1.3098, + "step": 1007 + }, + { + "epoch": 0.11953041622198506, + "grad_norm": 1.8164841750466307, + "learning_rate": 4.980237154150198e-05, + "loss": 1.2577, + "step": 1008 + }, + { + "epoch": 0.11964899798411005, + "grad_norm": 1.9510900299128513, + "learning_rate": 4.985177865612648e-05, + "loss": 1.4262, + "step": 1009 + }, + { + "epoch": 0.11976757974623503, + "grad_norm": 1.567392649076469, + "learning_rate": 4.990118577075099e-05, + "loss": 1.2252, + "step": 1010 + }, + { + "epoch": 0.11988616150836001, + "grad_norm": 1.7364980371963226, + "learning_rate": 4.9950592885375493e-05, + "loss": 0.9976, + "step": 1011 + }, + { + "epoch": 0.12000474327048499, + "grad_norm": 1.849831435461812, + "learning_rate": 5e-05, + "loss": 1.228, + "step": 1012 + }, + { + "epoch": 0.12012332503260999, + "grad_norm": 1.9555085269155421, + "learning_rate": 4.999999988476533e-05, + "loss": 0.956, + "step": 1013 + }, + { + "epoch": 0.12024190679473497, + "grad_norm": 1.9440927627835354, + "learning_rate": 4.9999999539061315e-05, + "loss": 1.1064, + "step": 1014 + }, + { + "epoch": 0.12036048855685995, + "grad_norm": 1.84469394830904, + "learning_rate": 4.999999896288796e-05, + "loss": 1.2925, + "step": 1015 + }, + { + "epoch": 0.12047907031898494, + "grad_norm": 1.9158110052876123, + "learning_rate": 4.999999815624527e-05, + "loss": 1.3134, + "step": 1016 + }, + { + "epoch": 0.12059765208110992, + "grad_norm": 2.0117361645944984, + "learning_rate": 4.999999711913327e-05, + "loss": 1.1083, + "step": 1017 + }, + { + "epoch": 0.1207162338432349, + "grad_norm": 1.844216080601768, + "learning_rate": 4.999999585155194e-05, + "loss": 1.2087, + "step": 1018 + }, + { + "epoch": 0.1208348156053599, + "grad_norm": 1.730543135065885, + "learning_rate": 4.999999435350131e-05, + "loss": 1.1102, + "step": 1019 + }, + { + "epoch": 0.12095339736748488, + "grad_norm": 1.9818506190570875, + "learning_rate": 4.9999992624981384e-05, + "loss": 1.3528, + "step": 1020 + }, + { + "epoch": 0.12107197912960986, + "grad_norm": 1.8715099993860853, + "learning_rate": 4.9999990665992186e-05, + "loss": 1.0893, + "step": 1021 + }, + { + "epoch": 0.12119056089173486, + "grad_norm": 1.716167027610077, + "learning_rate": 4.999998847653373e-05, + "loss": 1.1002, + "step": 1022 + }, + { + "epoch": 0.12130914265385984, + "grad_norm": 1.790487121080667, + "learning_rate": 4.9999986056606044e-05, + "loss": 1.2409, + "step": 1023 + }, + { + "epoch": 0.12142772441598482, + "grad_norm": 1.9883428146216597, + "learning_rate": 4.9999983406209127e-05, + "loss": 1.324, + "step": 1024 + }, + { + "epoch": 0.12154630617810981, + "grad_norm": 1.7654449428420373, + "learning_rate": 4.9999980525343035e-05, + "loss": 1.0383, + "step": 1025 + }, + { + "epoch": 0.12166488794023479, + "grad_norm": 1.950085632070345, + "learning_rate": 4.999997741400777e-05, + "loss": 1.102, + "step": 1026 + }, + { + "epoch": 0.12178346970235977, + "grad_norm": 1.8529344004458457, + "learning_rate": 4.9999974072203383e-05, + "loss": 1.2229, + "step": 1027 + }, + { + "epoch": 0.12190205146448477, + "grad_norm": 2.0505550638386665, + "learning_rate": 4.9999970499929884e-05, + "loss": 1.1214, + "step": 1028 + }, + { + "epoch": 0.12202063322660975, + "grad_norm": 1.7682492794989868, + "learning_rate": 4.999996669718731e-05, + "loss": 0.9526, + "step": 1029 + }, + { + "epoch": 0.12213921498873473, + "grad_norm": 1.7107854388329125, + "learning_rate": 4.999996266397571e-05, + "loss": 1.2079, + "step": 1030 + }, + { + "epoch": 0.12225779675085972, + "grad_norm": 1.507303001129272, + "learning_rate": 4.99999584002951e-05, + "loss": 1.0681, + "step": 1031 + }, + { + "epoch": 0.1223763785129847, + "grad_norm": 1.727884210491178, + "learning_rate": 4.9999953906145534e-05, + "loss": 1.2404, + "step": 1032 + }, + { + "epoch": 0.12249496027510968, + "grad_norm": 2.1978094225061695, + "learning_rate": 4.999994918152705e-05, + "loss": 1.1827, + "step": 1033 + }, + { + "epoch": 0.12261354203723468, + "grad_norm": 1.9662562674855946, + "learning_rate": 4.99999442264397e-05, + "loss": 1.1612, + "step": 1034 + }, + { + "epoch": 0.12273212379935966, + "grad_norm": 1.7215098542107097, + "learning_rate": 4.999993904088351e-05, + "loss": 1.3064, + "step": 1035 + }, + { + "epoch": 0.12285070556148464, + "grad_norm": 1.6821097285578939, + "learning_rate": 4.999993362485855e-05, + "loss": 1.0158, + "step": 1036 + }, + { + "epoch": 0.12296928732360964, + "grad_norm": 1.8062673216588407, + "learning_rate": 4.9999927978364846e-05, + "loss": 1.1758, + "step": 1037 + }, + { + "epoch": 0.12308786908573462, + "grad_norm": 1.8673197453023052, + "learning_rate": 4.999992210140248e-05, + "loss": 1.002, + "step": 1038 + }, + { + "epoch": 0.1232064508478596, + "grad_norm": 1.8407668562551853, + "learning_rate": 4.999991599397147e-05, + "loss": 1.3068, + "step": 1039 + }, + { + "epoch": 0.12332503260998458, + "grad_norm": 1.776021028710403, + "learning_rate": 4.99999096560719e-05, + "loss": 1.0876, + "step": 1040 + }, + { + "epoch": 0.12344361437210957, + "grad_norm": 1.9118130429942444, + "learning_rate": 4.999990308770382e-05, + "loss": 1.1303, + "step": 1041 + }, + { + "epoch": 0.12356219613423455, + "grad_norm": 1.9030062723811847, + "learning_rate": 4.999989628886729e-05, + "loss": 1.2033, + "step": 1042 + }, + { + "epoch": 0.12368077789635953, + "grad_norm": 1.8020574134702663, + "learning_rate": 4.999988925956237e-05, + "loss": 1.3159, + "step": 1043 + }, + { + "epoch": 0.12379935965848453, + "grad_norm": 1.833444467228498, + "learning_rate": 4.9999881999789136e-05, + "loss": 1.1946, + "step": 1044 + }, + { + "epoch": 0.12391794142060951, + "grad_norm": 1.9243703033074129, + "learning_rate": 4.999987450954764e-05, + "loss": 1.1783, + "step": 1045 + }, + { + "epoch": 0.12403652318273449, + "grad_norm": 1.8863534023748212, + "learning_rate": 4.9999866788837964e-05, + "loss": 1.3485, + "step": 1046 + }, + { + "epoch": 0.12415510494485948, + "grad_norm": 1.7730614947739338, + "learning_rate": 4.999985883766017e-05, + "loss": 1.252, + "step": 1047 + }, + { + "epoch": 0.12427368670698447, + "grad_norm": 2.011959995811757, + "learning_rate": 4.999985065601433e-05, + "loss": 1.1448, + "step": 1048 + }, + { + "epoch": 0.12439226846910945, + "grad_norm": 1.644928149889462, + "learning_rate": 4.999984224390053e-05, + "loss": 1.2044, + "step": 1049 + }, + { + "epoch": 0.12451085023123444, + "grad_norm": 1.7007102037002007, + "learning_rate": 4.999983360131883e-05, + "loss": 0.9127, + "step": 1050 + }, + { + "epoch": 0.12462943199335942, + "grad_norm": 1.9475956461288497, + "learning_rate": 4.999982472826934e-05, + "loss": 1.3703, + "step": 1051 + }, + { + "epoch": 0.1247480137554844, + "grad_norm": 1.6820796990762905, + "learning_rate": 4.999981562475211e-05, + "loss": 1.21, + "step": 1052 + }, + { + "epoch": 0.1248665955176094, + "grad_norm": 1.9299529576017962, + "learning_rate": 4.999980629076724e-05, + "loss": 1.1906, + "step": 1053 + }, + { + "epoch": 0.12498517727973438, + "grad_norm": 1.730668895714683, + "learning_rate": 4.9999796726314805e-05, + "loss": 1.0917, + "step": 1054 + }, + { + "epoch": 0.12510375904185936, + "grad_norm": 1.6823271421774455, + "learning_rate": 4.999978693139492e-05, + "loss": 0.9525, + "step": 1055 + }, + { + "epoch": 0.12522234080398434, + "grad_norm": 1.8355783680393534, + "learning_rate": 4.999977690600764e-05, + "loss": 1.1243, + "step": 1056 + }, + { + "epoch": 0.12534092256610932, + "grad_norm": 1.6726440650668317, + "learning_rate": 4.999976665015308e-05, + "loss": 1.3405, + "step": 1057 + }, + { + "epoch": 0.12545950432823433, + "grad_norm": 1.7524104511638119, + "learning_rate": 4.9999756163831325e-05, + "loss": 1.4106, + "step": 1058 + }, + { + "epoch": 0.1255780860903593, + "grad_norm": 1.6975359408831927, + "learning_rate": 4.999974544704247e-05, + "loss": 1.1604, + "step": 1059 + }, + { + "epoch": 0.1256966678524843, + "grad_norm": 1.9298643376479314, + "learning_rate": 4.9999734499786635e-05, + "loss": 1.2125, + "step": 1060 + }, + { + "epoch": 0.12581524961460927, + "grad_norm": 1.7416592424032298, + "learning_rate": 4.9999723322063886e-05, + "loss": 1.0412, + "step": 1061 + }, + { + "epoch": 0.12593383137673425, + "grad_norm": 2.031533783288666, + "learning_rate": 4.9999711913874355e-05, + "loss": 1.0601, + "step": 1062 + }, + { + "epoch": 0.12605241313885923, + "grad_norm": 1.5042702179708785, + "learning_rate": 4.999970027521814e-05, + "loss": 1.0035, + "step": 1063 + }, + { + "epoch": 0.12617099490098424, + "grad_norm": 1.743517812999746, + "learning_rate": 4.999968840609533e-05, + "loss": 1.2745, + "step": 1064 + }, + { + "epoch": 0.12628957666310922, + "grad_norm": 1.5570688720184922, + "learning_rate": 4.9999676306506064e-05, + "loss": 0.8286, + "step": 1065 + }, + { + "epoch": 0.1264081584252342, + "grad_norm": 1.5932407197526472, + "learning_rate": 4.999966397645044e-05, + "loss": 1.325, + "step": 1066 + }, + { + "epoch": 0.12652674018735918, + "grad_norm": 2.016366838524765, + "learning_rate": 4.9999651415928564e-05, + "loss": 1.055, + "step": 1067 + }, + { + "epoch": 0.12664532194948416, + "grad_norm": 1.927509589279619, + "learning_rate": 4.999963862494056e-05, + "loss": 1.2582, + "step": 1068 + }, + { + "epoch": 0.12676390371160914, + "grad_norm": 1.897029217529347, + "learning_rate": 4.999962560348654e-05, + "loss": 1.0861, + "step": 1069 + }, + { + "epoch": 0.12688248547373415, + "grad_norm": 1.8148911752814572, + "learning_rate": 4.9999612351566637e-05, + "loss": 1.2267, + "step": 1070 + }, + { + "epoch": 0.12700106723585913, + "grad_norm": 1.769416009370227, + "learning_rate": 4.999959886918096e-05, + "loss": 1.231, + "step": 1071 + }, + { + "epoch": 0.1271196489979841, + "grad_norm": 1.7238857957089018, + "learning_rate": 4.9999585156329634e-05, + "loss": 1.2082, + "step": 1072 + }, + { + "epoch": 0.1272382307601091, + "grad_norm": 1.5468343508179117, + "learning_rate": 4.9999571213012796e-05, + "loss": 0.838, + "step": 1073 + }, + { + "epoch": 0.12735681252223408, + "grad_norm": 1.6335036893171109, + "learning_rate": 4.9999557039230565e-05, + "loss": 1.2508, + "step": 1074 + }, + { + "epoch": 0.12747539428435906, + "grad_norm": 1.972163041577415, + "learning_rate": 4.9999542634983076e-05, + "loss": 1.1852, + "step": 1075 + }, + { + "epoch": 0.12759397604648406, + "grad_norm": 1.8169485001389083, + "learning_rate": 4.999952800027046e-05, + "loss": 1.1128, + "step": 1076 + }, + { + "epoch": 0.12771255780860905, + "grad_norm": 1.6886683116720567, + "learning_rate": 4.9999513135092855e-05, + "loss": 1.1429, + "step": 1077 + }, + { + "epoch": 0.12783113957073403, + "grad_norm": 2.101878852335351, + "learning_rate": 4.9999498039450386e-05, + "loss": 1.0705, + "step": 1078 + }, + { + "epoch": 0.127949721332859, + "grad_norm": 1.7381293148988552, + "learning_rate": 4.9999482713343215e-05, + "loss": 0.9712, + "step": 1079 + }, + { + "epoch": 0.128068303094984, + "grad_norm": 1.5493550093842379, + "learning_rate": 4.9999467156771464e-05, + "loss": 1.1666, + "step": 1080 + }, + { + "epoch": 0.12818688485710897, + "grad_norm": 1.6573753726006677, + "learning_rate": 4.9999451369735276e-05, + "loss": 0.8108, + "step": 1081 + }, + { + "epoch": 0.12830546661923395, + "grad_norm": 1.7568689030898086, + "learning_rate": 4.9999435352234806e-05, + "loss": 1.0184, + "step": 1082 + }, + { + "epoch": 0.12842404838135896, + "grad_norm": 1.948309836515425, + "learning_rate": 4.99994191042702e-05, + "loss": 1.2576, + "step": 1083 + }, + { + "epoch": 0.12854263014348394, + "grad_norm": 2.0497108527418577, + "learning_rate": 4.999940262584161e-05, + "loss": 0.9323, + "step": 1084 + }, + { + "epoch": 0.12866121190560892, + "grad_norm": 1.8350101931266938, + "learning_rate": 4.999938591694918e-05, + "loss": 1.1348, + "step": 1085 + }, + { + "epoch": 0.1287797936677339, + "grad_norm": 1.7046512984904234, + "learning_rate": 4.999936897759306e-05, + "loss": 1.2158, + "step": 1086 + }, + { + "epoch": 0.12889837542985888, + "grad_norm": 2.0563344286444267, + "learning_rate": 4.999935180777343e-05, + "loss": 1.2374, + "step": 1087 + }, + { + "epoch": 0.12901695719198386, + "grad_norm": 1.893878065008453, + "learning_rate": 4.999933440749043e-05, + "loss": 1.2286, + "step": 1088 + }, + { + "epoch": 0.12913553895410887, + "grad_norm": 2.0464174044644508, + "learning_rate": 4.9999316776744206e-05, + "loss": 1.0264, + "step": 1089 + }, + { + "epoch": 0.12925412071623385, + "grad_norm": 2.0049912179641884, + "learning_rate": 4.999929891553495e-05, + "loss": 0.8706, + "step": 1090 + }, + { + "epoch": 0.12937270247835883, + "grad_norm": 2.326455363219795, + "learning_rate": 4.999928082386282e-05, + "loss": 1.2466, + "step": 1091 + }, + { + "epoch": 0.1294912842404838, + "grad_norm": 2.177055176550707, + "learning_rate": 4.999926250172797e-05, + "loss": 1.0925, + "step": 1092 + }, + { + "epoch": 0.1296098660026088, + "grad_norm": 2.139923066543896, + "learning_rate": 4.9999243949130584e-05, + "loss": 1.2492, + "step": 1093 + }, + { + "epoch": 0.12972844776473377, + "grad_norm": 1.8650182102463062, + "learning_rate": 4.999922516607081e-05, + "loss": 1.4138, + "step": 1094 + }, + { + "epoch": 0.12984702952685878, + "grad_norm": 1.727890729251297, + "learning_rate": 4.999920615254884e-05, + "loss": 1.3152, + "step": 1095 + }, + { + "epoch": 0.12996561128898376, + "grad_norm": 2.1479397502814357, + "learning_rate": 4.999918690856485e-05, + "loss": 1.235, + "step": 1096 + }, + { + "epoch": 0.13008419305110874, + "grad_norm": 1.613696030978353, + "learning_rate": 4.999916743411901e-05, + "loss": 1.2745, + "step": 1097 + }, + { + "epoch": 0.13020277481323372, + "grad_norm": 1.7704666297831217, + "learning_rate": 4.999914772921151e-05, + "loss": 1.2599, + "step": 1098 + }, + { + "epoch": 0.1303213565753587, + "grad_norm": 1.8629181856716268, + "learning_rate": 4.999912779384252e-05, + "loss": 1.2683, + "step": 1099 + }, + { + "epoch": 0.13043993833748369, + "grad_norm": 1.8419651470267826, + "learning_rate": 4.999910762801222e-05, + "loss": 1.4847, + "step": 1100 + }, + { + "epoch": 0.1305585200996087, + "grad_norm": 1.809145851451482, + "learning_rate": 4.999908723172081e-05, + "loss": 1.2067, + "step": 1101 + }, + { + "epoch": 0.13067710186173367, + "grad_norm": 1.7503667199447281, + "learning_rate": 4.999906660496847e-05, + "loss": 1.1288, + "step": 1102 + }, + { + "epoch": 0.13079568362385866, + "grad_norm": 1.717215861980404, + "learning_rate": 4.999904574775539e-05, + "loss": 1.2514, + "step": 1103 + }, + { + "epoch": 0.13091426538598364, + "grad_norm": 1.640608828372075, + "learning_rate": 4.999902466008177e-05, + "loss": 1.1283, + "step": 1104 + }, + { + "epoch": 0.13103284714810862, + "grad_norm": 1.8051514984999204, + "learning_rate": 4.999900334194779e-05, + "loss": 1.4241, + "step": 1105 + }, + { + "epoch": 0.1311514289102336, + "grad_norm": 1.704628757020758, + "learning_rate": 4.9998981793353666e-05, + "loss": 1.1074, + "step": 1106 + }, + { + "epoch": 0.13127001067235858, + "grad_norm": 1.8795192043366644, + "learning_rate": 4.999896001429958e-05, + "loss": 1.1803, + "step": 1107 + }, + { + "epoch": 0.1313885924344836, + "grad_norm": 1.530375425014469, + "learning_rate": 4.999893800478573e-05, + "loss": 1.1084, + "step": 1108 + }, + { + "epoch": 0.13150717419660857, + "grad_norm": 1.6456513960475965, + "learning_rate": 4.999891576481234e-05, + "loss": 1.1216, + "step": 1109 + }, + { + "epoch": 0.13162575595873355, + "grad_norm": 1.8076841183456553, + "learning_rate": 4.99988932943796e-05, + "loss": 1.3404, + "step": 1110 + }, + { + "epoch": 0.13174433772085853, + "grad_norm": 1.8679515896365269, + "learning_rate": 4.999887059348772e-05, + "loss": 1.2514, + "step": 1111 + }, + { + "epoch": 0.1318629194829835, + "grad_norm": 1.6942154184592442, + "learning_rate": 4.9998847662136905e-05, + "loss": 1.1239, + "step": 1112 + }, + { + "epoch": 0.1319815012451085, + "grad_norm": 1.8109324430726381, + "learning_rate": 4.9998824500327365e-05, + "loss": 1.1998, + "step": 1113 + }, + { + "epoch": 0.1321000830072335, + "grad_norm": 1.7852251750571821, + "learning_rate": 4.999880110805933e-05, + "loss": 1.2698, + "step": 1114 + }, + { + "epoch": 0.13221866476935848, + "grad_norm": 1.5281057517563001, + "learning_rate": 4.9998777485332996e-05, + "loss": 1.0754, + "step": 1115 + }, + { + "epoch": 0.13233724653148346, + "grad_norm": 1.6715323301310816, + "learning_rate": 4.99987536321486e-05, + "loss": 0.9717, + "step": 1116 + }, + { + "epoch": 0.13245582829360844, + "grad_norm": 1.5752741396364391, + "learning_rate": 4.999872954850634e-05, + "loss": 0.8794, + "step": 1117 + }, + { + "epoch": 0.13257441005573342, + "grad_norm": 1.6534887738899051, + "learning_rate": 4.9998705234406454e-05, + "loss": 1.3386, + "step": 1118 + }, + { + "epoch": 0.1326929918178584, + "grad_norm": 1.5776050571298075, + "learning_rate": 4.9998680689849157e-05, + "loss": 1.1703, + "step": 1119 + }, + { + "epoch": 0.1328115735799834, + "grad_norm": 1.558972000338839, + "learning_rate": 4.999865591483468e-05, + "loss": 1.056, + "step": 1120 + }, + { + "epoch": 0.1329301553421084, + "grad_norm": 1.652550377955272, + "learning_rate": 4.999863090936325e-05, + "loss": 0.9584, + "step": 1121 + }, + { + "epoch": 0.13304873710423337, + "grad_norm": 1.8539743168789626, + "learning_rate": 4.99986056734351e-05, + "loss": 1.2429, + "step": 1122 + }, + { + "epoch": 0.13316731886635835, + "grad_norm": 1.6568608777747698, + "learning_rate": 4.9998580207050466e-05, + "loss": 1.2244, + "step": 1123 + }, + { + "epoch": 0.13328590062848333, + "grad_norm": 1.9346238827442102, + "learning_rate": 4.999855451020957e-05, + "loss": 1.3629, + "step": 1124 + }, + { + "epoch": 0.13340448239060831, + "grad_norm": 1.8240837126904088, + "learning_rate": 4.999852858291266e-05, + "loss": 1.0605, + "step": 1125 + }, + { + "epoch": 0.13352306415273332, + "grad_norm": 1.6109697129490912, + "learning_rate": 4.999850242515998e-05, + "loss": 1.0814, + "step": 1126 + }, + { + "epoch": 0.1336416459148583, + "grad_norm": 1.5982312128969685, + "learning_rate": 4.999847603695175e-05, + "loss": 1.2116, + "step": 1127 + }, + { + "epoch": 0.13376022767698328, + "grad_norm": 2.028970955865662, + "learning_rate": 4.9998449418288234e-05, + "loss": 1.412, + "step": 1128 + }, + { + "epoch": 0.13387880943910827, + "grad_norm": 1.7530959227923946, + "learning_rate": 4.999842256916967e-05, + "loss": 0.9405, + "step": 1129 + }, + { + "epoch": 0.13399739120123325, + "grad_norm": 2.102151242661334, + "learning_rate": 4.99983954895963e-05, + "loss": 1.2924, + "step": 1130 + }, + { + "epoch": 0.13411597296335823, + "grad_norm": 1.7996441930889286, + "learning_rate": 4.999836817956838e-05, + "loss": 1.315, + "step": 1131 + }, + { + "epoch": 0.1342345547254832, + "grad_norm": 1.6438776683665282, + "learning_rate": 4.9998340639086164e-05, + "loss": 1.2632, + "step": 1132 + }, + { + "epoch": 0.13435313648760822, + "grad_norm": 1.514760449719494, + "learning_rate": 4.99983128681499e-05, + "loss": 0.8421, + "step": 1133 + }, + { + "epoch": 0.1344717182497332, + "grad_norm": 1.9035040354587183, + "learning_rate": 4.999828486675984e-05, + "loss": 1.1838, + "step": 1134 + }, + { + "epoch": 0.13459030001185818, + "grad_norm": 1.7344527449097376, + "learning_rate": 4.999825663491625e-05, + "loss": 1.2936, + "step": 1135 + }, + { + "epoch": 0.13470888177398316, + "grad_norm": 1.419892547102826, + "learning_rate": 4.99982281726194e-05, + "loss": 1.028, + "step": 1136 + }, + { + "epoch": 0.13482746353610814, + "grad_norm": 1.8473328333033185, + "learning_rate": 4.999819947986954e-05, + "loss": 1.1984, + "step": 1137 + }, + { + "epoch": 0.13494604529823312, + "grad_norm": 1.783309595734706, + "learning_rate": 4.999817055666692e-05, + "loss": 1.1726, + "step": 1138 + }, + { + "epoch": 0.13506462706035813, + "grad_norm": 1.476040860622353, + "learning_rate": 4.9998141403011836e-05, + "loss": 1.4328, + "step": 1139 + }, + { + "epoch": 0.1351832088224831, + "grad_norm": 1.622774991952162, + "learning_rate": 4.999811201890453e-05, + "loss": 1.1468, + "step": 1140 + }, + { + "epoch": 0.1353017905846081, + "grad_norm": 1.9952505668065472, + "learning_rate": 4.9998082404345295e-05, + "loss": 1.1575, + "step": 1141 + }, + { + "epoch": 0.13542037234673307, + "grad_norm": 1.7312521317568663, + "learning_rate": 4.99980525593344e-05, + "loss": 1.2236, + "step": 1142 + }, + { + "epoch": 0.13553895410885805, + "grad_norm": 1.7562832946281564, + "learning_rate": 4.999802248387211e-05, + "loss": 1.2104, + "step": 1143 + }, + { + "epoch": 0.13565753587098303, + "grad_norm": 1.8064074588280818, + "learning_rate": 4.9997992177958705e-05, + "loss": 1.1883, + "step": 1144 + }, + { + "epoch": 0.13577611763310804, + "grad_norm": 1.7512489356566345, + "learning_rate": 4.999796164159446e-05, + "loss": 1.2332, + "step": 1145 + }, + { + "epoch": 0.13589469939523302, + "grad_norm": 1.9506302754602374, + "learning_rate": 4.999793087477967e-05, + "loss": 1.3006, + "step": 1146 + }, + { + "epoch": 0.136013281157358, + "grad_norm": 1.7782049334377104, + "learning_rate": 4.999789987751462e-05, + "loss": 1.2128, + "step": 1147 + }, + { + "epoch": 0.13613186291948298, + "grad_norm": 1.8294946717763059, + "learning_rate": 4.999786864979958e-05, + "loss": 1.3324, + "step": 1148 + }, + { + "epoch": 0.13625044468160796, + "grad_norm": 1.5996806219328112, + "learning_rate": 4.999783719163485e-05, + "loss": 1.2727, + "step": 1149 + }, + { + "epoch": 0.13636902644373294, + "grad_norm": 1.7814278002985118, + "learning_rate": 4.999780550302071e-05, + "loss": 1.1625, + "step": 1150 + }, + { + "epoch": 0.13648760820585795, + "grad_norm": 1.644250913880291, + "learning_rate": 4.999777358395746e-05, + "loss": 1.2001, + "step": 1151 + }, + { + "epoch": 0.13660618996798293, + "grad_norm": 1.759508106680521, + "learning_rate": 4.999774143444539e-05, + "loss": 1.087, + "step": 1152 + }, + { + "epoch": 0.1367247717301079, + "grad_norm": 1.5933677812136826, + "learning_rate": 4.99977090544848e-05, + "loss": 1.0231, + "step": 1153 + }, + { + "epoch": 0.1368433534922329, + "grad_norm": 1.4502302265609956, + "learning_rate": 4.999767644407599e-05, + "loss": 1.0564, + "step": 1154 + }, + { + "epoch": 0.13696193525435787, + "grad_norm": 2.0823735946209987, + "learning_rate": 4.9997643603219245e-05, + "loss": 1.3283, + "step": 1155 + }, + { + "epoch": 0.13708051701648286, + "grad_norm": 1.97240903344867, + "learning_rate": 4.9997610531914896e-05, + "loss": 1.2341, + "step": 1156 + }, + { + "epoch": 0.13719909877860784, + "grad_norm": 1.8975780811776994, + "learning_rate": 4.999757723016322e-05, + "loss": 1.2824, + "step": 1157 + }, + { + "epoch": 0.13731768054073284, + "grad_norm": 1.9099669591461903, + "learning_rate": 4.999754369796454e-05, + "loss": 0.9507, + "step": 1158 + }, + { + "epoch": 0.13743626230285783, + "grad_norm": 1.9400843364465061, + "learning_rate": 4.9997509935319166e-05, + "loss": 1.1591, + "step": 1159 + }, + { + "epoch": 0.1375548440649828, + "grad_norm": 1.6865899194570029, + "learning_rate": 4.99974759422274e-05, + "loss": 1.2238, + "step": 1160 + }, + { + "epoch": 0.1376734258271078, + "grad_norm": 1.4965875438925413, + "learning_rate": 4.9997441718689555e-05, + "loss": 1.2548, + "step": 1161 + }, + { + "epoch": 0.13779200758923277, + "grad_norm": 1.7167257090672152, + "learning_rate": 4.9997407264705964e-05, + "loss": 1.1094, + "step": 1162 + }, + { + "epoch": 0.13791058935135775, + "grad_norm": 1.9984652261976052, + "learning_rate": 4.999737258027692e-05, + "loss": 1.1271, + "step": 1163 + }, + { + "epoch": 0.13802917111348276, + "grad_norm": 1.7841749850405313, + "learning_rate": 4.999733766540276e-05, + "loss": 1.2, + "step": 1164 + }, + { + "epoch": 0.13814775287560774, + "grad_norm": 1.762805640217121, + "learning_rate": 4.99973025200838e-05, + "loss": 1.3382, + "step": 1165 + }, + { + "epoch": 0.13826633463773272, + "grad_norm": 1.7812156369967262, + "learning_rate": 4.999726714432036e-05, + "loss": 1.2499, + "step": 1166 + }, + { + "epoch": 0.1383849163998577, + "grad_norm": 1.5159957600761718, + "learning_rate": 4.9997231538112775e-05, + "loss": 1.1019, + "step": 1167 + }, + { + "epoch": 0.13850349816198268, + "grad_norm": 1.5730948609362534, + "learning_rate": 4.9997195701461366e-05, + "loss": 1.111, + "step": 1168 + }, + { + "epoch": 0.13862207992410766, + "grad_norm": 1.671554365423232, + "learning_rate": 4.999715963436647e-05, + "loss": 0.9275, + "step": 1169 + }, + { + "epoch": 0.13874066168623267, + "grad_norm": 1.5771847419188025, + "learning_rate": 4.999712333682842e-05, + "loss": 1.2024, + "step": 1170 + }, + { + "epoch": 0.13885924344835765, + "grad_norm": 1.7518683705047189, + "learning_rate": 4.9997086808847534e-05, + "loss": 0.9171, + "step": 1171 + }, + { + "epoch": 0.13897782521048263, + "grad_norm": 2.037762937444253, + "learning_rate": 4.999705005042417e-05, + "loss": 1.1708, + "step": 1172 + }, + { + "epoch": 0.1390964069726076, + "grad_norm": 1.7094853046049336, + "learning_rate": 4.999701306155866e-05, + "loss": 0.9993, + "step": 1173 + }, + { + "epoch": 0.1392149887347326, + "grad_norm": 1.7175331116889134, + "learning_rate": 4.999697584225134e-05, + "loss": 1.1446, + "step": 1174 + }, + { + "epoch": 0.13933357049685757, + "grad_norm": 1.6984031425181925, + "learning_rate": 4.9996938392502545e-05, + "loss": 1.0772, + "step": 1175 + }, + { + "epoch": 0.13945215225898258, + "grad_norm": 2.295462813306635, + "learning_rate": 4.9996900712312644e-05, + "loss": 1.3928, + "step": 1176 + }, + { + "epoch": 0.13957073402110756, + "grad_norm": 2.017049743044323, + "learning_rate": 4.999686280168197e-05, + "loss": 1.4378, + "step": 1177 + }, + { + "epoch": 0.13968931578323254, + "grad_norm": 1.7071582975539423, + "learning_rate": 4.999682466061087e-05, + "loss": 0.9676, + "step": 1178 + }, + { + "epoch": 0.13980789754535752, + "grad_norm": 1.986355541613088, + "learning_rate": 4.999678628909971e-05, + "loss": 1.3657, + "step": 1179 + }, + { + "epoch": 0.1399264793074825, + "grad_norm": 1.7591926645544043, + "learning_rate": 4.9996747687148814e-05, + "loss": 1.2089, + "step": 1180 + }, + { + "epoch": 0.14004506106960748, + "grad_norm": 1.8527089057921522, + "learning_rate": 4.9996708854758576e-05, + "loss": 1.085, + "step": 1181 + }, + { + "epoch": 0.1401636428317325, + "grad_norm": 1.7350098650979797, + "learning_rate": 4.999666979192933e-05, + "loss": 1.3144, + "step": 1182 + }, + { + "epoch": 0.14028222459385747, + "grad_norm": 1.8868965019686643, + "learning_rate": 4.999663049866143e-05, + "loss": 1.2824, + "step": 1183 + }, + { + "epoch": 0.14040080635598245, + "grad_norm": 1.7163461818046193, + "learning_rate": 4.9996590974955266e-05, + "loss": 1.1217, + "step": 1184 + }, + { + "epoch": 0.14051938811810744, + "grad_norm": 1.576818259174689, + "learning_rate": 4.9996551220811184e-05, + "loss": 1.3079, + "step": 1185 + }, + { + "epoch": 0.14063796988023242, + "grad_norm": 1.8157868535658144, + "learning_rate": 4.9996511236229545e-05, + "loss": 1.0443, + "step": 1186 + }, + { + "epoch": 0.1407565516423574, + "grad_norm": 1.7552961924004054, + "learning_rate": 4.9996471021210724e-05, + "loss": 1.1268, + "step": 1187 + }, + { + "epoch": 0.14087513340448238, + "grad_norm": 1.6686548181995204, + "learning_rate": 4.99964305757551e-05, + "loss": 1.1303, + "step": 1188 + }, + { + "epoch": 0.1409937151666074, + "grad_norm": 1.6197950471484572, + "learning_rate": 4.999638989986304e-05, + "loss": 1.0167, + "step": 1189 + }, + { + "epoch": 0.14111229692873237, + "grad_norm": 1.732361576161734, + "learning_rate": 4.999634899353491e-05, + "loss": 0.9718, + "step": 1190 + }, + { + "epoch": 0.14123087869085735, + "grad_norm": 2.066262402635185, + "learning_rate": 4.9996307856771096e-05, + "loss": 1.2907, + "step": 1191 + }, + { + "epoch": 0.14134946045298233, + "grad_norm": 1.793716343723841, + "learning_rate": 4.999626648957198e-05, + "loss": 1.0441, + "step": 1192 + }, + { + "epoch": 0.1414680422151073, + "grad_norm": 1.8863311630968926, + "learning_rate": 4.999622489193794e-05, + "loss": 1.3592, + "step": 1193 + }, + { + "epoch": 0.1415866239772323, + "grad_norm": 1.7735250385507957, + "learning_rate": 4.9996183063869355e-05, + "loss": 0.9285, + "step": 1194 + }, + { + "epoch": 0.1417052057393573, + "grad_norm": 1.756861961412436, + "learning_rate": 4.999614100536662e-05, + "loss": 1.2972, + "step": 1195 + }, + { + "epoch": 0.14182378750148228, + "grad_norm": 1.6064191847559832, + "learning_rate": 4.9996098716430104e-05, + "loss": 1.1341, + "step": 1196 + }, + { + "epoch": 0.14194236926360726, + "grad_norm": 1.6956565278627764, + "learning_rate": 4.999605619706022e-05, + "loss": 0.9694, + "step": 1197 + }, + { + "epoch": 0.14206095102573224, + "grad_norm": 1.6832988393402268, + "learning_rate": 4.999601344725735e-05, + "loss": 1.177, + "step": 1198 + }, + { + "epoch": 0.14217953278785722, + "grad_norm": 1.8715433487081345, + "learning_rate": 4.999597046702189e-05, + "loss": 1.1384, + "step": 1199 + }, + { + "epoch": 0.1422981145499822, + "grad_norm": 1.8804678595689712, + "learning_rate": 4.999592725635424e-05, + "loss": 1.202, + "step": 1200 + }, + { + "epoch": 0.1424166963121072, + "grad_norm": 1.5527524736543963, + "learning_rate": 4.999588381525478e-05, + "loss": 0.7849, + "step": 1201 + }, + { + "epoch": 0.1425352780742322, + "grad_norm": 1.6428617808559762, + "learning_rate": 4.999584014372393e-05, + "loss": 0.8942, + "step": 1202 + }, + { + "epoch": 0.14265385983635717, + "grad_norm": 1.545120757575061, + "learning_rate": 4.9995796241762084e-05, + "loss": 1.1661, + "step": 1203 + }, + { + "epoch": 0.14277244159848215, + "grad_norm": 1.7755186816207469, + "learning_rate": 4.999575210936965e-05, + "loss": 1.309, + "step": 1204 + }, + { + "epoch": 0.14289102336060713, + "grad_norm": 1.5555418872843174, + "learning_rate": 4.999570774654703e-05, + "loss": 1.0607, + "step": 1205 + }, + { + "epoch": 0.14300960512273211, + "grad_norm": 1.902952478040249, + "learning_rate": 4.999566315329464e-05, + "loss": 1.2914, + "step": 1206 + }, + { + "epoch": 0.14312818688485712, + "grad_norm": 1.7056092733989878, + "learning_rate": 4.999561832961288e-05, + "loss": 1.1355, + "step": 1207 + }, + { + "epoch": 0.1432467686469821, + "grad_norm": 1.6053042385025378, + "learning_rate": 4.999557327550218e-05, + "loss": 1.2679, + "step": 1208 + }, + { + "epoch": 0.14336535040910708, + "grad_norm": 1.5607468083661529, + "learning_rate": 4.999552799096294e-05, + "loss": 1.1363, + "step": 1209 + }, + { + "epoch": 0.14348393217123206, + "grad_norm": 1.9027257314605164, + "learning_rate": 4.999548247599559e-05, + "loss": 1.3458, + "step": 1210 + }, + { + "epoch": 0.14360251393335705, + "grad_norm": 1.7321633169997943, + "learning_rate": 4.9995436730600534e-05, + "loss": 1.3314, + "step": 1211 + }, + { + "epoch": 0.14372109569548203, + "grad_norm": 1.499493316242706, + "learning_rate": 4.999539075477821e-05, + "loss": 1.1102, + "step": 1212 + }, + { + "epoch": 0.143839677457607, + "grad_norm": 1.6018622171252828, + "learning_rate": 4.9995344548529036e-05, + "loss": 1.243, + "step": 1213 + }, + { + "epoch": 0.14395825921973202, + "grad_norm": 1.5331922748009625, + "learning_rate": 4.9995298111853425e-05, + "loss": 1.0878, + "step": 1214 + }, + { + "epoch": 0.144076840981857, + "grad_norm": 1.490080635786967, + "learning_rate": 4.9995251444751825e-05, + "loss": 1.1486, + "step": 1215 + }, + { + "epoch": 0.14419542274398198, + "grad_norm": 1.5612252033418892, + "learning_rate": 4.999520454722466e-05, + "loss": 0.9337, + "step": 1216 + }, + { + "epoch": 0.14431400450610696, + "grad_norm": 1.8738043685946761, + "learning_rate": 4.999515741927235e-05, + "loss": 1.1505, + "step": 1217 + }, + { + "epoch": 0.14443258626823194, + "grad_norm": 1.766246009871135, + "learning_rate": 4.999511006089536e-05, + "loss": 1.3653, + "step": 1218 + }, + { + "epoch": 0.14455116803035692, + "grad_norm": 1.8767318977122809, + "learning_rate": 4.999506247209409e-05, + "loss": 1.4427, + "step": 1219 + }, + { + "epoch": 0.14466974979248193, + "grad_norm": 1.593975328322896, + "learning_rate": 4.9995014652869e-05, + "loss": 1.0437, + "step": 1220 + }, + { + "epoch": 0.1447883315546069, + "grad_norm": 1.6004199974583742, + "learning_rate": 4.999496660322052e-05, + "loss": 1.0681, + "step": 1221 + }, + { + "epoch": 0.1449069133167319, + "grad_norm": 1.7266539291336964, + "learning_rate": 4.99949183231491e-05, + "loss": 0.9899, + "step": 1222 + }, + { + "epoch": 0.14502549507885687, + "grad_norm": 1.861815187600703, + "learning_rate": 4.999486981265519e-05, + "loss": 1.2356, + "step": 1223 + }, + { + "epoch": 0.14514407684098185, + "grad_norm": 1.4483260694883633, + "learning_rate": 4.999482107173923e-05, + "loss": 1.2101, + "step": 1224 + }, + { + "epoch": 0.14526265860310683, + "grad_norm": 1.6195627653520417, + "learning_rate": 4.999477210040166e-05, + "loss": 1.1378, + "step": 1225 + }, + { + "epoch": 0.14538124036523184, + "grad_norm": 1.8762253817057535, + "learning_rate": 4.9994722898642954e-05, + "loss": 1.0455, + "step": 1226 + }, + { + "epoch": 0.14549982212735682, + "grad_norm": 1.6191483027767335, + "learning_rate": 4.9994673466463555e-05, + "loss": 1.0258, + "step": 1227 + }, + { + "epoch": 0.1456184038894818, + "grad_norm": 1.875887775892771, + "learning_rate": 4.9994623803863913e-05, + "loss": 1.2244, + "step": 1228 + }, + { + "epoch": 0.14573698565160678, + "grad_norm": 1.9528563844228284, + "learning_rate": 4.999457391084449e-05, + "loss": 1.2241, + "step": 1229 + }, + { + "epoch": 0.14585556741373176, + "grad_norm": 2.454791627347521, + "learning_rate": 4.999452378740574e-05, + "loss": 1.3756, + "step": 1230 + }, + { + "epoch": 0.14597414917585674, + "grad_norm": 1.685031826684907, + "learning_rate": 4.999447343354814e-05, + "loss": 1.3783, + "step": 1231 + }, + { + "epoch": 0.14609273093798175, + "grad_norm": 1.7289974628520888, + "learning_rate": 4.999442284927214e-05, + "loss": 1.198, + "step": 1232 + }, + { + "epoch": 0.14621131270010673, + "grad_norm": 1.667868317618341, + "learning_rate": 4.999437203457821e-05, + "loss": 1.1139, + "step": 1233 + }, + { + "epoch": 0.1463298944622317, + "grad_norm": 1.4831022830691543, + "learning_rate": 4.999432098946683e-05, + "loss": 0.9182, + "step": 1234 + }, + { + "epoch": 0.1464484762243567, + "grad_norm": 1.6292358001358807, + "learning_rate": 4.999426971393845e-05, + "loss": 1.1391, + "step": 1235 + }, + { + "epoch": 0.14656705798648167, + "grad_norm": 1.8952586846307453, + "learning_rate": 4.999421820799355e-05, + "loss": 0.9573, + "step": 1236 + }, + { + "epoch": 0.14668563974860666, + "grad_norm": 1.7177120734797628, + "learning_rate": 4.999416647163262e-05, + "loss": 1.1769, + "step": 1237 + }, + { + "epoch": 0.14680422151073164, + "grad_norm": 1.71426271180194, + "learning_rate": 4.9994114504856114e-05, + "loss": 1.3694, + "step": 1238 + }, + { + "epoch": 0.14692280327285664, + "grad_norm": 1.770585262166228, + "learning_rate": 4.9994062307664524e-05, + "loss": 1.0823, + "step": 1239 + }, + { + "epoch": 0.14704138503498163, + "grad_norm": 1.4668295248851515, + "learning_rate": 4.9994009880058336e-05, + "loss": 1.1306, + "step": 1240 + }, + { + "epoch": 0.1471599667971066, + "grad_norm": 1.8994295369008685, + "learning_rate": 4.9993957222038024e-05, + "loss": 1.153, + "step": 1241 + }, + { + "epoch": 0.1472785485592316, + "grad_norm": 2.534146900128462, + "learning_rate": 4.9993904333604075e-05, + "loss": 1.287, + "step": 1242 + }, + { + "epoch": 0.14739713032135657, + "grad_norm": 1.6395031609579849, + "learning_rate": 4.999385121475698e-05, + "loss": 1.1112, + "step": 1243 + }, + { + "epoch": 0.14751571208348155, + "grad_norm": 1.6593852805239584, + "learning_rate": 4.999379786549722e-05, + "loss": 1.0031, + "step": 1244 + }, + { + "epoch": 0.14763429384560656, + "grad_norm": 1.684307673154832, + "learning_rate": 4.9993744285825294e-05, + "loss": 0.6741, + "step": 1245 + }, + { + "epoch": 0.14775287560773154, + "grad_norm": 1.8788236046229874, + "learning_rate": 4.999369047574171e-05, + "loss": 1.2566, + "step": 1246 + }, + { + "epoch": 0.14787145736985652, + "grad_norm": 1.832074938470291, + "learning_rate": 4.9993636435246925e-05, + "loss": 0.9454, + "step": 1247 + }, + { + "epoch": 0.1479900391319815, + "grad_norm": 1.9481679343587273, + "learning_rate": 4.999358216434148e-05, + "loss": 1.264, + "step": 1248 + }, + { + "epoch": 0.14810862089410648, + "grad_norm": 1.5833200533708982, + "learning_rate": 4.999352766302585e-05, + "loss": 1.2313, + "step": 1249 + }, + { + "epoch": 0.14822720265623146, + "grad_norm": 1.765278360808599, + "learning_rate": 4.999347293130055e-05, + "loss": 1.4331, + "step": 1250 + }, + { + "epoch": 0.14834578441835647, + "grad_norm": 1.5132943499508853, + "learning_rate": 4.999341796916607e-05, + "loss": 1.1507, + "step": 1251 + }, + { + "epoch": 0.14846436618048145, + "grad_norm": 1.638414784503229, + "learning_rate": 4.999336277662292e-05, + "loss": 1.1789, + "step": 1252 + }, + { + "epoch": 0.14858294794260643, + "grad_norm": 1.5462375596262405, + "learning_rate": 4.999330735367163e-05, + "loss": 1.0959, + "step": 1253 + }, + { + "epoch": 0.1487015297047314, + "grad_norm": 1.8943635083491603, + "learning_rate": 4.999325170031268e-05, + "loss": 1.3259, + "step": 1254 + }, + { + "epoch": 0.1488201114668564, + "grad_norm": 1.5685533536017315, + "learning_rate": 4.9993195816546606e-05, + "loss": 1.4312, + "step": 1255 + }, + { + "epoch": 0.14893869322898137, + "grad_norm": 1.7391159423665425, + "learning_rate": 4.999313970237392e-05, + "loss": 0.9545, + "step": 1256 + }, + { + "epoch": 0.14905727499110638, + "grad_norm": 1.518565995609839, + "learning_rate": 4.999308335779512e-05, + "loss": 1.0849, + "step": 1257 + }, + { + "epoch": 0.14917585675323136, + "grad_norm": 1.575538375423931, + "learning_rate": 4.999302678281075e-05, + "loss": 1.1261, + "step": 1258 + }, + { + "epoch": 0.14929443851535634, + "grad_norm": 1.6499055699179195, + "learning_rate": 4.9992969977421326e-05, + "loss": 1.232, + "step": 1259 + }, + { + "epoch": 0.14941302027748132, + "grad_norm": 1.8399268893507483, + "learning_rate": 4.999291294162736e-05, + "loss": 1.1835, + "step": 1260 + }, + { + "epoch": 0.1495316020396063, + "grad_norm": 1.8902854118282284, + "learning_rate": 4.999285567542938e-05, + "loss": 1.2072, + "step": 1261 + }, + { + "epoch": 0.14965018380173128, + "grad_norm": 1.5635735692501445, + "learning_rate": 4.9992798178827935e-05, + "loss": 1.1974, + "step": 1262 + }, + { + "epoch": 0.14976876556385627, + "grad_norm": 1.4881035857644036, + "learning_rate": 4.9992740451823525e-05, + "loss": 1.0461, + "step": 1263 + }, + { + "epoch": 0.14988734732598127, + "grad_norm": 1.619204907852576, + "learning_rate": 4.99926824944167e-05, + "loss": 1.123, + "step": 1264 + }, + { + "epoch": 0.15000592908810625, + "grad_norm": 1.5595905844771718, + "learning_rate": 4.9992624306607996e-05, + "loss": 0.7769, + "step": 1265 + }, + { + "epoch": 0.15012451085023124, + "grad_norm": 1.6648495040578173, + "learning_rate": 4.9992565888397934e-05, + "loss": 1.0967, + "step": 1266 + }, + { + "epoch": 0.15024309261235622, + "grad_norm": 1.7292098187813407, + "learning_rate": 4.999250723978707e-05, + "loss": 0.9824, + "step": 1267 + }, + { + "epoch": 0.1503616743744812, + "grad_norm": 1.6001301790444746, + "learning_rate": 4.9992448360775934e-05, + "loss": 1.0827, + "step": 1268 + }, + { + "epoch": 0.15048025613660618, + "grad_norm": 2.329032038747233, + "learning_rate": 4.999238925136507e-05, + "loss": 1.0457, + "step": 1269 + }, + { + "epoch": 0.15059883789873119, + "grad_norm": 2.121790223281565, + "learning_rate": 4.999232991155503e-05, + "loss": 0.9609, + "step": 1270 + }, + { + "epoch": 0.15071741966085617, + "grad_norm": 2.1223927317500637, + "learning_rate": 4.999227034134635e-05, + "loss": 1.0869, + "step": 1271 + }, + { + "epoch": 0.15083600142298115, + "grad_norm": 1.4723309732232674, + "learning_rate": 4.999221054073959e-05, + "loss": 1.0952, + "step": 1272 + }, + { + "epoch": 0.15095458318510613, + "grad_norm": 2.0059441102497986, + "learning_rate": 4.9992150509735295e-05, + "loss": 1.0492, + "step": 1273 + }, + { + "epoch": 0.1510731649472311, + "grad_norm": 1.574135491265611, + "learning_rate": 4.9992090248334025e-05, + "loss": 1.0685, + "step": 1274 + }, + { + "epoch": 0.1511917467093561, + "grad_norm": 1.5836557229593584, + "learning_rate": 4.999202975653632e-05, + "loss": 0.9973, + "step": 1275 + }, + { + "epoch": 0.1513103284714811, + "grad_norm": 2.0161056400445423, + "learning_rate": 4.9991969034342755e-05, + "loss": 1.2552, + "step": 1276 + }, + { + "epoch": 0.15142891023360608, + "grad_norm": 1.4780318902030296, + "learning_rate": 4.999190808175388e-05, + "loss": 1.1236, + "step": 1277 + }, + { + "epoch": 0.15154749199573106, + "grad_norm": 2.085160808055643, + "learning_rate": 4.999184689877026e-05, + "loss": 1.2996, + "step": 1278 + }, + { + "epoch": 0.15166607375785604, + "grad_norm": 1.4645509468166276, + "learning_rate": 4.999178548539246e-05, + "loss": 1.0603, + "step": 1279 + }, + { + "epoch": 0.15178465551998102, + "grad_norm": 2.300259013177836, + "learning_rate": 4.999172384162104e-05, + "loss": 1.4209, + "step": 1280 + }, + { + "epoch": 0.151903237282106, + "grad_norm": 1.7191836693698612, + "learning_rate": 4.999166196745657e-05, + "loss": 1.0689, + "step": 1281 + }, + { + "epoch": 0.152021819044231, + "grad_norm": 1.8953768239030517, + "learning_rate": 4.999159986289963e-05, + "loss": 0.9715, + "step": 1282 + }, + { + "epoch": 0.152140400806356, + "grad_norm": 1.5667141784888319, + "learning_rate": 4.999153752795079e-05, + "loss": 1.0007, + "step": 1283 + }, + { + "epoch": 0.15225898256848097, + "grad_norm": 1.8265074488912159, + "learning_rate": 4.999147496261062e-05, + "loss": 1.0861, + "step": 1284 + }, + { + "epoch": 0.15237756433060595, + "grad_norm": 1.610189090899638, + "learning_rate": 4.999141216687969e-05, + "loss": 1.1591, + "step": 1285 + }, + { + "epoch": 0.15249614609273093, + "grad_norm": 1.547954513498484, + "learning_rate": 4.999134914075859e-05, + "loss": 1.1521, + "step": 1286 + }, + { + "epoch": 0.1526147278548559, + "grad_norm": 1.7512618319061934, + "learning_rate": 4.99912858842479e-05, + "loss": 1.1999, + "step": 1287 + }, + { + "epoch": 0.1527333096169809, + "grad_norm": 1.582230793080285, + "learning_rate": 4.9991222397348194e-05, + "loss": 1.0295, + "step": 1288 + }, + { + "epoch": 0.1528518913791059, + "grad_norm": 1.7510589960283165, + "learning_rate": 4.999115868006007e-05, + "loss": 1.1902, + "step": 1289 + }, + { + "epoch": 0.15297047314123088, + "grad_norm": 1.8049958783819773, + "learning_rate": 4.9991094732384104e-05, + "loss": 0.9099, + "step": 1290 + }, + { + "epoch": 0.15308905490335586, + "grad_norm": 1.5267877574714108, + "learning_rate": 4.99910305543209e-05, + "loss": 0.8774, + "step": 1291 + }, + { + "epoch": 0.15320763666548085, + "grad_norm": 1.6976143638780485, + "learning_rate": 4.9990966145871023e-05, + "loss": 1.3123, + "step": 1292 + }, + { + "epoch": 0.15332621842760583, + "grad_norm": 1.792778435355543, + "learning_rate": 4.99909015070351e-05, + "loss": 1.1311, + "step": 1293 + }, + { + "epoch": 0.1534448001897308, + "grad_norm": 1.5252738558389507, + "learning_rate": 4.99908366378137e-05, + "loss": 1.007, + "step": 1294 + }, + { + "epoch": 0.15356338195185582, + "grad_norm": 1.5763695995571658, + "learning_rate": 4.9990771538207435e-05, + "loss": 1.081, + "step": 1295 + }, + { + "epoch": 0.1536819637139808, + "grad_norm": 1.826054469333273, + "learning_rate": 4.9990706208216906e-05, + "loss": 0.925, + "step": 1296 + }, + { + "epoch": 0.15380054547610578, + "grad_norm": 1.7731372862635013, + "learning_rate": 4.999064064784271e-05, + "loss": 1.3251, + "step": 1297 + }, + { + "epoch": 0.15391912723823076, + "grad_norm": 1.9228675927892496, + "learning_rate": 4.999057485708546e-05, + "loss": 1.2426, + "step": 1298 + }, + { + "epoch": 0.15403770900035574, + "grad_norm": 1.7422907778919667, + "learning_rate": 4.999050883594575e-05, + "loss": 1.0248, + "step": 1299 + }, + { + "epoch": 0.15415629076248072, + "grad_norm": 1.5990375363783687, + "learning_rate": 4.999044258442419e-05, + "loss": 1.2214, + "step": 1300 + }, + { + "epoch": 0.15427487252460573, + "grad_norm": 1.6756229929557358, + "learning_rate": 4.9990376102521394e-05, + "loss": 1.0051, + "step": 1301 + }, + { + "epoch": 0.1543934542867307, + "grad_norm": 1.4674982656206077, + "learning_rate": 4.999030939023798e-05, + "loss": 1.0562, + "step": 1302 + }, + { + "epoch": 0.1545120360488557, + "grad_norm": 1.4418958174552292, + "learning_rate": 4.999024244757456e-05, + "loss": 1.193, + "step": 1303 + }, + { + "epoch": 0.15463061781098067, + "grad_norm": 1.8164264072967786, + "learning_rate": 4.999017527453174e-05, + "loss": 1.2013, + "step": 1304 + }, + { + "epoch": 0.15474919957310565, + "grad_norm": 1.5694835313937379, + "learning_rate": 4.999010787111016e-05, + "loss": 0.9759, + "step": 1305 + }, + { + "epoch": 0.15486778133523063, + "grad_norm": 1.54852895811122, + "learning_rate": 4.999004023731042e-05, + "loss": 1.3139, + "step": 1306 + }, + { + "epoch": 0.15498636309735564, + "grad_norm": 1.4948946221920836, + "learning_rate": 4.998997237313316e-05, + "loss": 1.2137, + "step": 1307 + }, + { + "epoch": 0.15510494485948062, + "grad_norm": 1.8472444111636044, + "learning_rate": 4.9989904278579004e-05, + "loss": 0.9788, + "step": 1308 + }, + { + "epoch": 0.1552235266216056, + "grad_norm": 1.672584990814677, + "learning_rate": 4.998983595364857e-05, + "loss": 1.1889, + "step": 1309 + }, + { + "epoch": 0.15534210838373058, + "grad_norm": 1.6549177998243312, + "learning_rate": 4.998976739834249e-05, + "loss": 0.8923, + "step": 1310 + }, + { + "epoch": 0.15546069014585556, + "grad_norm": 1.46146487362686, + "learning_rate": 4.9989698612661405e-05, + "loss": 0.7893, + "step": 1311 + }, + { + "epoch": 0.15557927190798054, + "grad_norm": 1.7377509622463765, + "learning_rate": 4.998962959660594e-05, + "loss": 1.2051, + "step": 1312 + }, + { + "epoch": 0.15569785367010555, + "grad_norm": 1.7403031131102804, + "learning_rate": 4.998956035017673e-05, + "loss": 0.7649, + "step": 1313 + }, + { + "epoch": 0.15581643543223053, + "grad_norm": 2.0088239292121397, + "learning_rate": 4.998949087337443e-05, + "loss": 1.3126, + "step": 1314 + }, + { + "epoch": 0.1559350171943555, + "grad_norm": 1.6978563988062945, + "learning_rate": 4.998942116619966e-05, + "loss": 0.898, + "step": 1315 + }, + { + "epoch": 0.1560535989564805, + "grad_norm": 1.6525233218943678, + "learning_rate": 4.998935122865307e-05, + "loss": 0.8661, + "step": 1316 + }, + { + "epoch": 0.15617218071860547, + "grad_norm": 1.5736867371926349, + "learning_rate": 4.998928106073531e-05, + "loss": 1.1599, + "step": 1317 + }, + { + "epoch": 0.15629076248073046, + "grad_norm": 1.776632946810059, + "learning_rate": 4.998921066244702e-05, + "loss": 1.2149, + "step": 1318 + }, + { + "epoch": 0.15640934424285544, + "grad_norm": 1.7595684570916508, + "learning_rate": 4.998914003378885e-05, + "loss": 1.0396, + "step": 1319 + }, + { + "epoch": 0.15652792600498044, + "grad_norm": 1.6960609255284567, + "learning_rate": 4.998906917476146e-05, + "loss": 1.0262, + "step": 1320 + }, + { + "epoch": 0.15664650776710543, + "grad_norm": 1.4378377791277002, + "learning_rate": 4.9988998085365485e-05, + "loss": 1.1506, + "step": 1321 + }, + { + "epoch": 0.1567650895292304, + "grad_norm": 1.758680227858358, + "learning_rate": 4.9988926765601595e-05, + "loss": 1.2901, + "step": 1322 + }, + { + "epoch": 0.1568836712913554, + "grad_norm": 1.8502493514386316, + "learning_rate": 4.9988855215470445e-05, + "loss": 1.2058, + "step": 1323 + }, + { + "epoch": 0.15700225305348037, + "grad_norm": 1.8381725506849718, + "learning_rate": 4.9988783434972694e-05, + "loss": 1.1684, + "step": 1324 + }, + { + "epoch": 0.15712083481560535, + "grad_norm": 1.6551184548015327, + "learning_rate": 4.9988711424109005e-05, + "loss": 1.1479, + "step": 1325 + }, + { + "epoch": 0.15723941657773036, + "grad_norm": 1.5375455606786526, + "learning_rate": 4.9988639182880035e-05, + "loss": 1.013, + "step": 1326 + }, + { + "epoch": 0.15735799833985534, + "grad_norm": 1.8127577910221961, + "learning_rate": 4.9988566711286456e-05, + "loss": 1.0397, + "step": 1327 + }, + { + "epoch": 0.15747658010198032, + "grad_norm": 1.7400170529434082, + "learning_rate": 4.998849400932894e-05, + "loss": 1.1312, + "step": 1328 + }, + { + "epoch": 0.1575951618641053, + "grad_norm": 1.5720575633215734, + "learning_rate": 4.9988421077008144e-05, + "loss": 1.1339, + "step": 1329 + }, + { + "epoch": 0.15771374362623028, + "grad_norm": 1.616725737387187, + "learning_rate": 4.9988347914324754e-05, + "loss": 1.0865, + "step": 1330 + }, + { + "epoch": 0.15783232538835526, + "grad_norm": 1.846202621900896, + "learning_rate": 4.998827452127944e-05, + "loss": 0.9726, + "step": 1331 + }, + { + "epoch": 0.15795090715048027, + "grad_norm": 1.4488651346871844, + "learning_rate": 4.998820089787287e-05, + "loss": 0.9041, + "step": 1332 + }, + { + "epoch": 0.15806948891260525, + "grad_norm": 1.9464798400163108, + "learning_rate": 4.9988127044105735e-05, + "loss": 0.9932, + "step": 1333 + }, + { + "epoch": 0.15818807067473023, + "grad_norm": 1.7024723527878545, + "learning_rate": 4.998805295997872e-05, + "loss": 1.0388, + "step": 1334 + }, + { + "epoch": 0.1583066524368552, + "grad_norm": 1.701304856457065, + "learning_rate": 4.9987978645492485e-05, + "loss": 1.2282, + "step": 1335 + }, + { + "epoch": 0.1584252341989802, + "grad_norm": 1.7499374448902882, + "learning_rate": 4.998790410064773e-05, + "loss": 1.2021, + "step": 1336 + }, + { + "epoch": 0.15854381596110517, + "grad_norm": 1.606080755997141, + "learning_rate": 4.998782932544515e-05, + "loss": 1.2471, + "step": 1337 + }, + { + "epoch": 0.15866239772323018, + "grad_norm": 1.9367341909970133, + "learning_rate": 4.998775431988542e-05, + "loss": 1.0381, + "step": 1338 + }, + { + "epoch": 0.15878097948535516, + "grad_norm": 2.558115803232956, + "learning_rate": 4.998767908396924e-05, + "loss": 1.2334, + "step": 1339 + }, + { + "epoch": 0.15889956124748014, + "grad_norm": 1.7251439119382295, + "learning_rate": 4.99876036176973e-05, + "loss": 1.0612, + "step": 1340 + }, + { + "epoch": 0.15901814300960512, + "grad_norm": 1.693088520139888, + "learning_rate": 4.99875279210703e-05, + "loss": 1.0109, + "step": 1341 + }, + { + "epoch": 0.1591367247717301, + "grad_norm": 1.6539676170027573, + "learning_rate": 4.998745199408893e-05, + "loss": 1.1749, + "step": 1342 + }, + { + "epoch": 0.15925530653385508, + "grad_norm": 1.9358835265570449, + "learning_rate": 4.998737583675389e-05, + "loss": 1.1675, + "step": 1343 + }, + { + "epoch": 0.15937388829598007, + "grad_norm": 1.5526320915382463, + "learning_rate": 4.998729944906589e-05, + "loss": 1.271, + "step": 1344 + }, + { + "epoch": 0.15949247005810507, + "grad_norm": 1.416996187632587, + "learning_rate": 4.998722283102564e-05, + "loss": 1.1774, + "step": 1345 + }, + { + "epoch": 0.15961105182023005, + "grad_norm": 1.8451392660335684, + "learning_rate": 4.998714598263382e-05, + "loss": 1.2966, + "step": 1346 + }, + { + "epoch": 0.15972963358235504, + "grad_norm": 1.6480227665248777, + "learning_rate": 4.9987068903891166e-05, + "loss": 1.0489, + "step": 1347 + }, + { + "epoch": 0.15984821534448002, + "grad_norm": 1.6033367077826735, + "learning_rate": 4.998699159479838e-05, + "loss": 1.1511, + "step": 1348 + }, + { + "epoch": 0.159966797106605, + "grad_norm": 1.5169813261234948, + "learning_rate": 4.9986914055356164e-05, + "loss": 1.1875, + "step": 1349 + }, + { + "epoch": 0.16008537886872998, + "grad_norm": 1.4747467622712633, + "learning_rate": 4.9986836285565245e-05, + "loss": 0.8391, + "step": 1350 + }, + { + "epoch": 0.16020396063085499, + "grad_norm": 1.655132936590586, + "learning_rate": 4.998675828542634e-05, + "loss": 0.9983, + "step": 1351 + }, + { + "epoch": 0.16032254239297997, + "grad_norm": 1.7669111830096487, + "learning_rate": 4.998668005494016e-05, + "loss": 0.9647, + "step": 1352 + }, + { + "epoch": 0.16044112415510495, + "grad_norm": 2.1959404382497616, + "learning_rate": 4.998660159410743e-05, + "loss": 0.9244, + "step": 1353 + }, + { + "epoch": 0.16055970591722993, + "grad_norm": 2.279622871669324, + "learning_rate": 4.9986522902928875e-05, + "loss": 1.2936, + "step": 1354 + }, + { + "epoch": 0.1606782876793549, + "grad_norm": 2.5419759825599613, + "learning_rate": 4.998644398140522e-05, + "loss": 1.2477, + "step": 1355 + }, + { + "epoch": 0.1607968694414799, + "grad_norm": 1.885575999643548, + "learning_rate": 4.998636482953719e-05, + "loss": 1.1913, + "step": 1356 + }, + { + "epoch": 0.1609154512036049, + "grad_norm": 1.4529551505269742, + "learning_rate": 4.998628544732552e-05, + "loss": 0.7107, + "step": 1357 + }, + { + "epoch": 0.16103403296572988, + "grad_norm": 1.470745515845118, + "learning_rate": 4.998620583477094e-05, + "loss": 1.0223, + "step": 1358 + }, + { + "epoch": 0.16115261472785486, + "grad_norm": 1.5322377747313256, + "learning_rate": 4.998612599187418e-05, + "loss": 1.18, + "step": 1359 + }, + { + "epoch": 0.16127119648997984, + "grad_norm": 2.0058627224604413, + "learning_rate": 4.998604591863598e-05, + "loss": 1.0544, + "step": 1360 + }, + { + "epoch": 0.16138977825210482, + "grad_norm": 1.4782689787274887, + "learning_rate": 4.9985965615057074e-05, + "loss": 1.1748, + "step": 1361 + }, + { + "epoch": 0.1615083600142298, + "grad_norm": 1.6602189463986323, + "learning_rate": 4.99858850811382e-05, + "loss": 1.1235, + "step": 1362 + }, + { + "epoch": 0.1616269417763548, + "grad_norm": 1.4690540337794304, + "learning_rate": 4.998580431688011e-05, + "loss": 0.943, + "step": 1363 + }, + { + "epoch": 0.1617455235384798, + "grad_norm": 1.5618257016045878, + "learning_rate": 4.998572332228354e-05, + "loss": 1.0188, + "step": 1364 + }, + { + "epoch": 0.16186410530060477, + "grad_norm": 1.6128620612067248, + "learning_rate": 4.998564209734925e-05, + "loss": 1.1391, + "step": 1365 + }, + { + "epoch": 0.16198268706272975, + "grad_norm": 1.6236778085614243, + "learning_rate": 4.998556064207798e-05, + "loss": 0.8928, + "step": 1366 + }, + { + "epoch": 0.16210126882485473, + "grad_norm": 1.7596560704096995, + "learning_rate": 4.998547895647047e-05, + "loss": 1.2288, + "step": 1367 + }, + { + "epoch": 0.1622198505869797, + "grad_norm": 1.718194006170096, + "learning_rate": 4.998539704052749e-05, + "loss": 0.9102, + "step": 1368 + }, + { + "epoch": 0.1623384323491047, + "grad_norm": 1.6039558904898432, + "learning_rate": 4.998531489424978e-05, + "loss": 0.9787, + "step": 1369 + }, + { + "epoch": 0.1624570141112297, + "grad_norm": 1.7869624821254941, + "learning_rate": 4.9985232517638115e-05, + "loss": 1.0885, + "step": 1370 + }, + { + "epoch": 0.16257559587335468, + "grad_norm": 1.7293962754362249, + "learning_rate": 4.9985149910693244e-05, + "loss": 1.1675, + "step": 1371 + }, + { + "epoch": 0.16269417763547966, + "grad_norm": 1.8622520945426955, + "learning_rate": 4.998506707341592e-05, + "loss": 1.198, + "step": 1372 + }, + { + "epoch": 0.16281275939760464, + "grad_norm": 1.8362667213437427, + "learning_rate": 4.9984984005806925e-05, + "loss": 1.0501, + "step": 1373 + }, + { + "epoch": 0.16293134115972963, + "grad_norm": 1.9210347977058373, + "learning_rate": 4.9984900707867016e-05, + "loss": 1.0317, + "step": 1374 + }, + { + "epoch": 0.1630499229218546, + "grad_norm": 1.7071580333555145, + "learning_rate": 4.998481717959696e-05, + "loss": 1.2073, + "step": 1375 + }, + { + "epoch": 0.16316850468397961, + "grad_norm": 1.4449328645374082, + "learning_rate": 4.998473342099753e-05, + "loss": 0.8054, + "step": 1376 + }, + { + "epoch": 0.1632870864461046, + "grad_norm": 1.6902855839068058, + "learning_rate": 4.998464943206949e-05, + "loss": 1.1447, + "step": 1377 + }, + { + "epoch": 0.16340566820822958, + "grad_norm": 1.547117913099056, + "learning_rate": 4.9984565212813626e-05, + "loss": 0.9696, + "step": 1378 + }, + { + "epoch": 0.16352424997035456, + "grad_norm": 1.7937492179941836, + "learning_rate": 4.998448076323071e-05, + "loss": 1.0036, + "step": 1379 + }, + { + "epoch": 0.16364283173247954, + "grad_norm": 1.4696067729434532, + "learning_rate": 4.998439608332152e-05, + "loss": 0.7646, + "step": 1380 + }, + { + "epoch": 0.16376141349460452, + "grad_norm": 1.6232357800223798, + "learning_rate": 4.998431117308683e-05, + "loss": 1.0554, + "step": 1381 + }, + { + "epoch": 0.16387999525672953, + "grad_norm": 1.7774372565714536, + "learning_rate": 4.9984226032527427e-05, + "loss": 1.1452, + "step": 1382 + }, + { + "epoch": 0.1639985770188545, + "grad_norm": 1.5122969230074477, + "learning_rate": 4.9984140661644104e-05, + "loss": 1.1687, + "step": 1383 + }, + { + "epoch": 0.1641171587809795, + "grad_norm": 1.707824051786149, + "learning_rate": 4.998405506043764e-05, + "loss": 0.992, + "step": 1384 + }, + { + "epoch": 0.16423574054310447, + "grad_norm": 1.7964291947552244, + "learning_rate": 4.998396922890882e-05, + "loss": 1.2336, + "step": 1385 + }, + { + "epoch": 0.16435432230522945, + "grad_norm": 1.8507942404277464, + "learning_rate": 4.9983883167058446e-05, + "loss": 1.1822, + "step": 1386 + }, + { + "epoch": 0.16447290406735443, + "grad_norm": 1.7083047860778546, + "learning_rate": 4.998379687488731e-05, + "loss": 1.0221, + "step": 1387 + }, + { + "epoch": 0.16459148582947944, + "grad_norm": 1.8890501093404444, + "learning_rate": 4.9983710352396194e-05, + "loss": 1.1994, + "step": 1388 + }, + { + "epoch": 0.16471006759160442, + "grad_norm": 1.6359330352054908, + "learning_rate": 4.9983623599585916e-05, + "loss": 0.814, + "step": 1389 + }, + { + "epoch": 0.1648286493537294, + "grad_norm": 1.8083874461192206, + "learning_rate": 4.998353661645726e-05, + "loss": 1.2825, + "step": 1390 + }, + { + "epoch": 0.16494723111585438, + "grad_norm": 1.4181332793894104, + "learning_rate": 4.998344940301104e-05, + "loss": 0.9403, + "step": 1391 + }, + { + "epoch": 0.16506581287797936, + "grad_norm": 1.7019727168625205, + "learning_rate": 4.998336195924804e-05, + "loss": 1.1425, + "step": 1392 + }, + { + "epoch": 0.16518439464010434, + "grad_norm": 1.5952923275470947, + "learning_rate": 4.9983274285169093e-05, + "loss": 1.0634, + "step": 1393 + }, + { + "epoch": 0.16530297640222932, + "grad_norm": 1.8433030883516914, + "learning_rate": 4.9983186380774996e-05, + "loss": 1.2255, + "step": 1394 + }, + { + "epoch": 0.16542155816435433, + "grad_norm": 1.8219505592892096, + "learning_rate": 4.998309824606654e-05, + "loss": 1.1347, + "step": 1395 + }, + { + "epoch": 0.1655401399264793, + "grad_norm": 1.755335266754154, + "learning_rate": 4.998300988104456e-05, + "loss": 1.0888, + "step": 1396 + }, + { + "epoch": 0.1656587216886043, + "grad_norm": 1.675502777760315, + "learning_rate": 4.9982921285709874e-05, + "loss": 1.0251, + "step": 1397 + }, + { + "epoch": 0.16577730345072927, + "grad_norm": 1.6596941761025816, + "learning_rate": 4.998283246006329e-05, + "loss": 1.1501, + "step": 1398 + }, + { + "epoch": 0.16589588521285425, + "grad_norm": 1.4645756874087867, + "learning_rate": 4.998274340410561e-05, + "loss": 1.2798, + "step": 1399 + }, + { + "epoch": 0.16601446697497924, + "grad_norm": 1.5820250620259302, + "learning_rate": 4.998265411783769e-05, + "loss": 1.1002, + "step": 1400 + }, + { + "epoch": 0.16613304873710424, + "grad_norm": 1.4078825987526167, + "learning_rate": 4.998256460126033e-05, + "loss": 0.9054, + "step": 1401 + }, + { + "epoch": 0.16625163049922922, + "grad_norm": 1.5243710760865883, + "learning_rate": 4.998247485437436e-05, + "loss": 0.9966, + "step": 1402 + }, + { + "epoch": 0.1663702122613542, + "grad_norm": 1.6470781801332368, + "learning_rate": 4.99823848771806e-05, + "loss": 0.9787, + "step": 1403 + }, + { + "epoch": 0.1664887940234792, + "grad_norm": 1.5889052541175233, + "learning_rate": 4.998229466967989e-05, + "loss": 1.0884, + "step": 1404 + }, + { + "epoch": 0.16660737578560417, + "grad_norm": 1.3704529808819026, + "learning_rate": 4.998220423187306e-05, + "loss": 1.2483, + "step": 1405 + }, + { + "epoch": 0.16672595754772915, + "grad_norm": 1.927980647407345, + "learning_rate": 4.9982113563760945e-05, + "loss": 1.2073, + "step": 1406 + }, + { + "epoch": 0.16684453930985416, + "grad_norm": 1.5410154330439827, + "learning_rate": 4.998202266534438e-05, + "loss": 1.0818, + "step": 1407 + }, + { + "epoch": 0.16696312107197914, + "grad_norm": 1.402050506527795, + "learning_rate": 4.998193153662419e-05, + "loss": 1.1509, + "step": 1408 + }, + { + "epoch": 0.16708170283410412, + "grad_norm": 1.4241215245509102, + "learning_rate": 4.998184017760123e-05, + "loss": 1.1866, + "step": 1409 + }, + { + "epoch": 0.1672002845962291, + "grad_norm": 1.5441533052492682, + "learning_rate": 4.998174858827634e-05, + "loss": 1.1454, + "step": 1410 + }, + { + "epoch": 0.16731886635835408, + "grad_norm": 1.5800478469216788, + "learning_rate": 4.998165676865037e-05, + "loss": 0.8861, + "step": 1411 + }, + { + "epoch": 0.16743744812047906, + "grad_norm": 1.5705578617296188, + "learning_rate": 4.998156471872415e-05, + "loss": 1.0652, + "step": 1412 + }, + { + "epoch": 0.16755602988260407, + "grad_norm": 1.4863429599215927, + "learning_rate": 4.998147243849855e-05, + "loss": 1.1484, + "step": 1413 + }, + { + "epoch": 0.16767461164472905, + "grad_norm": 1.6347150361636131, + "learning_rate": 4.998137992797439e-05, + "loss": 1.0709, + "step": 1414 + }, + { + "epoch": 0.16779319340685403, + "grad_norm": 1.5368943593595368, + "learning_rate": 4.9981287187152546e-05, + "loss": 1.1044, + "step": 1415 + }, + { + "epoch": 0.167911775168979, + "grad_norm": 1.6502131474726844, + "learning_rate": 4.9981194216033875e-05, + "loss": 1.1846, + "step": 1416 + }, + { + "epoch": 0.168030356931104, + "grad_norm": 1.9846678863048601, + "learning_rate": 4.9981101014619224e-05, + "loss": 0.884, + "step": 1417 + }, + { + "epoch": 0.16814893869322897, + "grad_norm": 1.3600737258061761, + "learning_rate": 4.998100758290946e-05, + "loss": 0.7511, + "step": 1418 + }, + { + "epoch": 0.16826752045535395, + "grad_norm": 1.5691300525267515, + "learning_rate": 4.998091392090544e-05, + "loss": 1.199, + "step": 1419 + }, + { + "epoch": 0.16838610221747896, + "grad_norm": 1.7665247104005, + "learning_rate": 4.9980820028608025e-05, + "loss": 1.1579, + "step": 1420 + }, + { + "epoch": 0.16850468397960394, + "grad_norm": 1.6733398921950076, + "learning_rate": 4.9980725906018074e-05, + "loss": 0.9913, + "step": 1421 + }, + { + "epoch": 0.16862326574172892, + "grad_norm": 1.7739747092382598, + "learning_rate": 4.998063155313647e-05, + "loss": 1.2746, + "step": 1422 + }, + { + "epoch": 0.1687418475038539, + "grad_norm": 1.682865054549608, + "learning_rate": 4.998053696996408e-05, + "loss": 1.13, + "step": 1423 + }, + { + "epoch": 0.16886042926597888, + "grad_norm": 1.8069592696566756, + "learning_rate": 4.998044215650177e-05, + "loss": 1.1755, + "step": 1424 + }, + { + "epoch": 0.16897901102810386, + "grad_norm": 1.6273140479001253, + "learning_rate": 4.9980347112750414e-05, + "loss": 0.8599, + "step": 1425 + }, + { + "epoch": 0.16909759279022887, + "grad_norm": 1.5384645308976084, + "learning_rate": 4.998025183871089e-05, + "loss": 1.0985, + "step": 1426 + }, + { + "epoch": 0.16921617455235385, + "grad_norm": 1.7328047000606086, + "learning_rate": 4.9980156334384084e-05, + "loss": 1.1288, + "step": 1427 + }, + { + "epoch": 0.16933475631447883, + "grad_norm": 1.5348274356762661, + "learning_rate": 4.998006059977086e-05, + "loss": 1.0162, + "step": 1428 + }, + { + "epoch": 0.16945333807660382, + "grad_norm": 1.653035063915359, + "learning_rate": 4.997996463487212e-05, + "loss": 0.9438, + "step": 1429 + }, + { + "epoch": 0.1695719198387288, + "grad_norm": 1.619869964671937, + "learning_rate": 4.997986843968873e-05, + "loss": 1.0679, + "step": 1430 + }, + { + "epoch": 0.16969050160085378, + "grad_norm": 1.7021100950289887, + "learning_rate": 4.997977201422159e-05, + "loss": 1.1952, + "step": 1431 + }, + { + "epoch": 0.16980908336297879, + "grad_norm": 1.7110899386599807, + "learning_rate": 4.997967535847158e-05, + "loss": 1.0525, + "step": 1432 + }, + { + "epoch": 0.16992766512510377, + "grad_norm": 1.8266389853219662, + "learning_rate": 4.99795784724396e-05, + "loss": 1.0271, + "step": 1433 + }, + { + "epoch": 0.17004624688722875, + "grad_norm": 1.8332102111802118, + "learning_rate": 4.997948135612653e-05, + "loss": 0.8833, + "step": 1434 + }, + { + "epoch": 0.17016482864935373, + "grad_norm": 1.6140013024256237, + "learning_rate": 4.997938400953328e-05, + "loss": 1.1543, + "step": 1435 + }, + { + "epoch": 0.1702834104114787, + "grad_norm": 1.6583537380804787, + "learning_rate": 4.997928643266074e-05, + "loss": 1.1544, + "step": 1436 + }, + { + "epoch": 0.1704019921736037, + "grad_norm": 1.9182840917386577, + "learning_rate": 4.9979188625509814e-05, + "loss": 0.8248, + "step": 1437 + }, + { + "epoch": 0.1705205739357287, + "grad_norm": 1.7287301114514082, + "learning_rate": 4.99790905880814e-05, + "loss": 1.2935, + "step": 1438 + }, + { + "epoch": 0.17063915569785368, + "grad_norm": 1.5761391017594453, + "learning_rate": 4.99789923203764e-05, + "loss": 1.0135, + "step": 1439 + }, + { + "epoch": 0.17075773745997866, + "grad_norm": 1.6950791739319335, + "learning_rate": 4.9978893822395724e-05, + "loss": 1.142, + "step": 1440 + }, + { + "epoch": 0.17087631922210364, + "grad_norm": 1.44275203338743, + "learning_rate": 4.9978795094140275e-05, + "loss": 0.7813, + "step": 1441 + }, + { + "epoch": 0.17099490098422862, + "grad_norm": 1.7452004002603028, + "learning_rate": 4.997869613561097e-05, + "loss": 1.1846, + "step": 1442 + }, + { + "epoch": 0.1711134827463536, + "grad_norm": 1.4030538144467086, + "learning_rate": 4.997859694680871e-05, + "loss": 1.1223, + "step": 1443 + }, + { + "epoch": 0.1712320645084786, + "grad_norm": 1.4818477284841778, + "learning_rate": 4.9978497527734426e-05, + "loss": 0.908, + "step": 1444 + }, + { + "epoch": 0.1713506462706036, + "grad_norm": 1.5831523565636467, + "learning_rate": 4.9978397878389024e-05, + "loss": 1.0888, + "step": 1445 + }, + { + "epoch": 0.17146922803272857, + "grad_norm": 1.636573204338768, + "learning_rate": 4.997829799877342e-05, + "loss": 1.1432, + "step": 1446 + }, + { + "epoch": 0.17158780979485355, + "grad_norm": 1.4279258283797855, + "learning_rate": 4.997819788888854e-05, + "loss": 0.9166, + "step": 1447 + }, + { + "epoch": 0.17170639155697853, + "grad_norm": 1.5964752851183637, + "learning_rate": 4.9978097548735306e-05, + "loss": 1.1063, + "step": 1448 + }, + { + "epoch": 0.1718249733191035, + "grad_norm": 1.7517647870587683, + "learning_rate": 4.997799697831464e-05, + "loss": 1.1025, + "step": 1449 + }, + { + "epoch": 0.1719435550812285, + "grad_norm": 1.380213447604419, + "learning_rate": 4.997789617762748e-05, + "loss": 0.7001, + "step": 1450 + }, + { + "epoch": 0.1720621368433535, + "grad_norm": 1.8734690070113083, + "learning_rate": 4.997779514667474e-05, + "loss": 1.2019, + "step": 1451 + }, + { + "epoch": 0.17218071860547848, + "grad_norm": 1.8192272605829554, + "learning_rate": 4.997769388545736e-05, + "loss": 1.1731, + "step": 1452 + }, + { + "epoch": 0.17229930036760346, + "grad_norm": 1.775085065519624, + "learning_rate": 4.9977592393976266e-05, + "loss": 1.2824, + "step": 1453 + }, + { + "epoch": 0.17241788212972844, + "grad_norm": 2.265627287758425, + "learning_rate": 4.997749067223241e-05, + "loss": 1.3224, + "step": 1454 + }, + { + "epoch": 0.17253646389185343, + "grad_norm": 2.064325840663082, + "learning_rate": 4.997738872022672e-05, + "loss": 1.1425, + "step": 1455 + }, + { + "epoch": 0.1726550456539784, + "grad_norm": 1.6187393667641785, + "learning_rate": 4.9977286537960134e-05, + "loss": 0.8909, + "step": 1456 + }, + { + "epoch": 0.17277362741610341, + "grad_norm": 1.9316235828255726, + "learning_rate": 4.997718412543358e-05, + "loss": 1.0881, + "step": 1457 + }, + { + "epoch": 0.1728922091782284, + "grad_norm": 1.799445331389091, + "learning_rate": 4.9977081482648034e-05, + "loss": 1.1179, + "step": 1458 + }, + { + "epoch": 0.17301079094035338, + "grad_norm": 1.6134467133889536, + "learning_rate": 4.997697860960443e-05, + "loss": 1.0122, + "step": 1459 + }, + { + "epoch": 0.17312937270247836, + "grad_norm": 1.6234587648135672, + "learning_rate": 4.99768755063037e-05, + "loss": 1.0887, + "step": 1460 + }, + { + "epoch": 0.17324795446460334, + "grad_norm": 1.4403799320594506, + "learning_rate": 4.997677217274681e-05, + "loss": 1.4116, + "step": 1461 + }, + { + "epoch": 0.17336653622672832, + "grad_norm": 1.482711243411262, + "learning_rate": 4.997666860893471e-05, + "loss": 1.2614, + "step": 1462 + }, + { + "epoch": 0.17348511798885333, + "grad_norm": 1.332450678599432, + "learning_rate": 4.9976564814868355e-05, + "loss": 0.9241, + "step": 1463 + }, + { + "epoch": 0.1736036997509783, + "grad_norm": 1.6241224258990767, + "learning_rate": 4.997646079054869e-05, + "loss": 1.0423, + "step": 1464 + }, + { + "epoch": 0.1737222815131033, + "grad_norm": 1.3316483396682481, + "learning_rate": 4.99763565359767e-05, + "loss": 1.1342, + "step": 1465 + }, + { + "epoch": 0.17384086327522827, + "grad_norm": 1.2856547831546101, + "learning_rate": 4.997625205115332e-05, + "loss": 1.0796, + "step": 1466 + }, + { + "epoch": 0.17395944503735325, + "grad_norm": 1.4410122889544619, + "learning_rate": 4.997614733607953e-05, + "loss": 1.17, + "step": 1467 + }, + { + "epoch": 0.17407802679947823, + "grad_norm": 1.558541533942193, + "learning_rate": 4.997604239075629e-05, + "loss": 0.861, + "step": 1468 + }, + { + "epoch": 0.17419660856160324, + "grad_norm": 1.4902226374816268, + "learning_rate": 4.9975937215184565e-05, + "loss": 0.8577, + "step": 1469 + }, + { + "epoch": 0.17431519032372822, + "grad_norm": 1.841154795727841, + "learning_rate": 4.9975831809365326e-05, + "loss": 1.2982, + "step": 1470 + }, + { + "epoch": 0.1744337720858532, + "grad_norm": 1.5465443159511676, + "learning_rate": 4.997572617329954e-05, + "loss": 1.161, + "step": 1471 + }, + { + "epoch": 0.17455235384797818, + "grad_norm": 1.5677916292699619, + "learning_rate": 4.997562030698819e-05, + "loss": 1.0781, + "step": 1472 + }, + { + "epoch": 0.17467093561010316, + "grad_norm": 2.01829886524247, + "learning_rate": 4.997551421043225e-05, + "loss": 1.2409, + "step": 1473 + }, + { + "epoch": 0.17478951737222814, + "grad_norm": 1.6911966877832518, + "learning_rate": 4.9975407883632694e-05, + "loss": 0.8492, + "step": 1474 + }, + { + "epoch": 0.17490809913435312, + "grad_norm": 1.8276022629462294, + "learning_rate": 4.9975301326590505e-05, + "loss": 1.4104, + "step": 1475 + }, + { + "epoch": 0.17502668089647813, + "grad_norm": 1.2870209023721173, + "learning_rate": 4.997519453930667e-05, + "loss": 1.0204, + "step": 1476 + }, + { + "epoch": 0.1751452626586031, + "grad_norm": 1.8952367266962644, + "learning_rate": 4.997508752178216e-05, + "loss": 1.0593, + "step": 1477 + }, + { + "epoch": 0.1752638444207281, + "grad_norm": 1.6375402616836958, + "learning_rate": 4.997498027401797e-05, + "loss": 1.1603, + "step": 1478 + }, + { + "epoch": 0.17538242618285307, + "grad_norm": 1.694895787038009, + "learning_rate": 4.9974872796015094e-05, + "loss": 1.2543, + "step": 1479 + }, + { + "epoch": 0.17550100794497805, + "grad_norm": 1.4015770953166906, + "learning_rate": 4.997476508777451e-05, + "loss": 0.966, + "step": 1480 + }, + { + "epoch": 0.17561958970710304, + "grad_norm": 1.6679901695572588, + "learning_rate": 4.9974657149297224e-05, + "loss": 1.1412, + "step": 1481 + }, + { + "epoch": 0.17573817146922804, + "grad_norm": 1.591288894090236, + "learning_rate": 4.997454898058422e-05, + "loss": 1.0461, + "step": 1482 + }, + { + "epoch": 0.17585675323135302, + "grad_norm": 1.7695886916045607, + "learning_rate": 4.99744405816365e-05, + "loss": 1.1108, + "step": 1483 + }, + { + "epoch": 0.175975334993478, + "grad_norm": 1.7972988499455775, + "learning_rate": 4.9974331952455066e-05, + "loss": 1.2997, + "step": 1484 + }, + { + "epoch": 0.17609391675560299, + "grad_norm": 1.579948511635081, + "learning_rate": 4.9974223093040914e-05, + "loss": 1.2128, + "step": 1485 + }, + { + "epoch": 0.17621249851772797, + "grad_norm": 1.7119639176786876, + "learning_rate": 4.9974114003395055e-05, + "loss": 1.3646, + "step": 1486 + }, + { + "epoch": 0.17633108027985295, + "grad_norm": 1.3915413627293984, + "learning_rate": 4.9974004683518486e-05, + "loss": 1.0938, + "step": 1487 + }, + { + "epoch": 0.17644966204197796, + "grad_norm": 1.5234884759219331, + "learning_rate": 4.997389513341222e-05, + "loss": 1.0556, + "step": 1488 + }, + { + "epoch": 0.17656824380410294, + "grad_norm": 1.5575362725223327, + "learning_rate": 4.997378535307727e-05, + "loss": 1.1533, + "step": 1489 + }, + { + "epoch": 0.17668682556622792, + "grad_norm": 1.5947918650552788, + "learning_rate": 4.997367534251464e-05, + "loss": 1.1162, + "step": 1490 + }, + { + "epoch": 0.1768054073283529, + "grad_norm": 1.5685493577313225, + "learning_rate": 4.997356510172535e-05, + "loss": 1.1426, + "step": 1491 + }, + { + "epoch": 0.17692398909047788, + "grad_norm": 1.6912356860419224, + "learning_rate": 4.9973454630710415e-05, + "loss": 1.0526, + "step": 1492 + }, + { + "epoch": 0.17704257085260286, + "grad_norm": 1.5945533763197095, + "learning_rate": 4.997334392947085e-05, + "loss": 1.3125, + "step": 1493 + }, + { + "epoch": 0.17716115261472787, + "grad_norm": 1.2198739588388656, + "learning_rate": 4.997323299800768e-05, + "loss": 0.695, + "step": 1494 + }, + { + "epoch": 0.17727973437685285, + "grad_norm": 1.8101120795210968, + "learning_rate": 4.9973121836321926e-05, + "loss": 1.0565, + "step": 1495 + }, + { + "epoch": 0.17739831613897783, + "grad_norm": 1.634643368357219, + "learning_rate": 4.997301044441461e-05, + "loss": 1.0169, + "step": 1496 + }, + { + "epoch": 0.1775168979011028, + "grad_norm": 1.981421224236605, + "learning_rate": 4.997289882228677e-05, + "loss": 1.1409, + "step": 1497 + }, + { + "epoch": 0.1776354796632278, + "grad_norm": 1.5375694100235646, + "learning_rate": 4.997278696993942e-05, + "loss": 0.8433, + "step": 1498 + }, + { + "epoch": 0.17775406142535277, + "grad_norm": 1.9124195112188587, + "learning_rate": 4.9972674887373595e-05, + "loss": 0.9968, + "step": 1499 + }, + { + "epoch": 0.17787264318747775, + "grad_norm": 1.8604747789840341, + "learning_rate": 4.997256257459033e-05, + "loss": 1.1898, + "step": 1500 + }, + { + "epoch": 0.17799122494960276, + "grad_norm": 2.0490196148552973, + "learning_rate": 4.997245003159067e-05, + "loss": 1.2243, + "step": 1501 + }, + { + "epoch": 0.17810980671172774, + "grad_norm": 1.5531823121979078, + "learning_rate": 4.997233725837564e-05, + "loss": 1.1142, + "step": 1502 + }, + { + "epoch": 0.17822838847385272, + "grad_norm": 2.1710003866311265, + "learning_rate": 4.9972224254946287e-05, + "loss": 1.2255, + "step": 1503 + }, + { + "epoch": 0.1783469702359777, + "grad_norm": 1.658832936724596, + "learning_rate": 4.997211102130365e-05, + "loss": 0.9675, + "step": 1504 + }, + { + "epoch": 0.17846555199810268, + "grad_norm": 1.5609785327705152, + "learning_rate": 4.9971997557448754e-05, + "loss": 1.0465, + "step": 1505 + }, + { + "epoch": 0.17858413376022766, + "grad_norm": 1.5189526883854214, + "learning_rate": 4.9971883863382684e-05, + "loss": 0.7537, + "step": 1506 + }, + { + "epoch": 0.17870271552235267, + "grad_norm": 1.5552763545544326, + "learning_rate": 4.997176993910646e-05, + "loss": 0.9973, + "step": 1507 + }, + { + "epoch": 0.17882129728447765, + "grad_norm": 1.5640200550146752, + "learning_rate": 4.997165578462114e-05, + "loss": 1.1428, + "step": 1508 + }, + { + "epoch": 0.17893987904660263, + "grad_norm": 1.630544982785924, + "learning_rate": 4.997154139992778e-05, + "loss": 0.974, + "step": 1509 + }, + { + "epoch": 0.17905846080872762, + "grad_norm": 1.8309397256139732, + "learning_rate": 4.997142678502742e-05, + "loss": 1.1002, + "step": 1510 + }, + { + "epoch": 0.1791770425708526, + "grad_norm": 1.5067299008405102, + "learning_rate": 4.997131193992114e-05, + "loss": 1.0767, + "step": 1511 + }, + { + "epoch": 0.17929562433297758, + "grad_norm": 1.908703220521912, + "learning_rate": 4.9971196864609975e-05, + "loss": 0.914, + "step": 1512 + }, + { + "epoch": 0.17941420609510259, + "grad_norm": 1.595290521437111, + "learning_rate": 4.9971081559095e-05, + "loss": 0.9302, + "step": 1513 + }, + { + "epoch": 0.17953278785722757, + "grad_norm": 1.8397020354562523, + "learning_rate": 4.9970966023377276e-05, + "loss": 0.9759, + "step": 1514 + }, + { + "epoch": 0.17965136961935255, + "grad_norm": 1.5394998969824591, + "learning_rate": 4.997085025745787e-05, + "loss": 1.0322, + "step": 1515 + }, + { + "epoch": 0.17976995138147753, + "grad_norm": 1.666277470371369, + "learning_rate": 4.997073426133784e-05, + "loss": 1.2071, + "step": 1516 + }, + { + "epoch": 0.1798885331436025, + "grad_norm": 1.6803052962291969, + "learning_rate": 4.997061803501826e-05, + "loss": 1.1268, + "step": 1517 + }, + { + "epoch": 0.1800071149057275, + "grad_norm": 1.7071040119875576, + "learning_rate": 4.997050157850021e-05, + "loss": 1.1063, + "step": 1518 + }, + { + "epoch": 0.1801256966678525, + "grad_norm": 1.576905839128205, + "learning_rate": 4.997038489178475e-05, + "loss": 1.0089, + "step": 1519 + }, + { + "epoch": 0.18024427842997748, + "grad_norm": 1.7541192620099566, + "learning_rate": 4.997026797487296e-05, + "loss": 1.1136, + "step": 1520 + }, + { + "epoch": 0.18036286019210246, + "grad_norm": 1.337539222603322, + "learning_rate": 4.9970150827765924e-05, + "loss": 0.8104, + "step": 1521 + }, + { + "epoch": 0.18048144195422744, + "grad_norm": 1.26884287581894, + "learning_rate": 4.9970033450464726e-05, + "loss": 0.9024, + "step": 1522 + }, + { + "epoch": 0.18060002371635242, + "grad_norm": 1.639011493873352, + "learning_rate": 4.9969915842970427e-05, + "loss": 1.3009, + "step": 1523 + }, + { + "epoch": 0.1807186054784774, + "grad_norm": 1.6740358964049336, + "learning_rate": 4.996979800528413e-05, + "loss": 1.3152, + "step": 1524 + }, + { + "epoch": 0.18083718724060238, + "grad_norm": 1.7612752135538128, + "learning_rate": 4.996967993740692e-05, + "loss": 1.1132, + "step": 1525 + }, + { + "epoch": 0.1809557690027274, + "grad_norm": 2.047014102194751, + "learning_rate": 4.996956163933987e-05, + "loss": 1.1026, + "step": 1526 + }, + { + "epoch": 0.18107435076485237, + "grad_norm": 1.6109427837288615, + "learning_rate": 4.996944311108408e-05, + "loss": 0.9948, + "step": 1527 + }, + { + "epoch": 0.18119293252697735, + "grad_norm": 1.4986031352082327, + "learning_rate": 4.9969324352640655e-05, + "loss": 1.1169, + "step": 1528 + }, + { + "epoch": 0.18131151428910233, + "grad_norm": 1.4617013846761768, + "learning_rate": 4.996920536401067e-05, + "loss": 0.9764, + "step": 1529 + }, + { + "epoch": 0.1814300960512273, + "grad_norm": 1.3788176100202822, + "learning_rate": 4.996908614519524e-05, + "loss": 1.0428, + "step": 1530 + }, + { + "epoch": 0.1815486778133523, + "grad_norm": 1.7710026774067769, + "learning_rate": 4.996896669619545e-05, + "loss": 0.8615, + "step": 1531 + }, + { + "epoch": 0.1816672595754773, + "grad_norm": 1.7100416230946485, + "learning_rate": 4.996884701701241e-05, + "loss": 1.0277, + "step": 1532 + }, + { + "epoch": 0.18178584133760228, + "grad_norm": 1.663728112202658, + "learning_rate": 4.996872710764721e-05, + "loss": 0.9948, + "step": 1533 + }, + { + "epoch": 0.18190442309972726, + "grad_norm": 1.7722070223271151, + "learning_rate": 4.996860696810097e-05, + "loss": 1.1701, + "step": 1534 + }, + { + "epoch": 0.18202300486185224, + "grad_norm": 1.7150482641625249, + "learning_rate": 4.99684865983748e-05, + "loss": 1.0361, + "step": 1535 + }, + { + "epoch": 0.18214158662397723, + "grad_norm": 1.7881928321923817, + "learning_rate": 4.996836599846979e-05, + "loss": 1.0756, + "step": 1536 + }, + { + "epoch": 0.1822601683861022, + "grad_norm": 1.7257719944101488, + "learning_rate": 4.996824516838707e-05, + "loss": 1.1809, + "step": 1537 + }, + { + "epoch": 0.18237875014822721, + "grad_norm": 1.961229936757753, + "learning_rate": 4.9968124108127746e-05, + "loss": 1.1316, + "step": 1538 + }, + { + "epoch": 0.1824973319103522, + "grad_norm": 1.4424018846943025, + "learning_rate": 4.996800281769293e-05, + "loss": 0.9507, + "step": 1539 + }, + { + "epoch": 0.18261591367247718, + "grad_norm": 2.0653052528596163, + "learning_rate": 4.996788129708375e-05, + "loss": 1.3239, + "step": 1540 + }, + { + "epoch": 0.18273449543460216, + "grad_norm": 1.5385058486297811, + "learning_rate": 4.996775954630133e-05, + "loss": 0.9697, + "step": 1541 + }, + { + "epoch": 0.18285307719672714, + "grad_norm": 1.6188812183821766, + "learning_rate": 4.9967637565346774e-05, + "loss": 1.1462, + "step": 1542 + }, + { + "epoch": 0.18297165895885212, + "grad_norm": 1.460773623800915, + "learning_rate": 4.996751535422122e-05, + "loss": 1.1681, + "step": 1543 + }, + { + "epoch": 0.18309024072097713, + "grad_norm": 1.5371762096946808, + "learning_rate": 4.996739291292579e-05, + "loss": 1.0415, + "step": 1544 + }, + { + "epoch": 0.1832088224831021, + "grad_norm": 1.4937286727022654, + "learning_rate": 4.996727024146162e-05, + "loss": 1.2516, + "step": 1545 + }, + { + "epoch": 0.1833274042452271, + "grad_norm": 1.6603403180655056, + "learning_rate": 4.9967147339829835e-05, + "loss": 1.1199, + "step": 1546 + }, + { + "epoch": 0.18344598600735207, + "grad_norm": 1.4583604112193427, + "learning_rate": 4.9967024208031566e-05, + "loss": 1.3386, + "step": 1547 + }, + { + "epoch": 0.18356456776947705, + "grad_norm": 1.5544086365430707, + "learning_rate": 4.9966900846067944e-05, + "loss": 1.0127, + "step": 1548 + }, + { + "epoch": 0.18368314953160203, + "grad_norm": 1.5341383614114819, + "learning_rate": 4.996677725394012e-05, + "loss": 0.9757, + "step": 1549 + }, + { + "epoch": 0.183801731293727, + "grad_norm": 1.5440330020968618, + "learning_rate": 4.9966653431649225e-05, + "loss": 1.0375, + "step": 1550 + }, + { + "epoch": 0.18392031305585202, + "grad_norm": 1.6783147772359568, + "learning_rate": 4.9966529379196406e-05, + "loss": 1.0863, + "step": 1551 + }, + { + "epoch": 0.184038894817977, + "grad_norm": 1.7130596092543076, + "learning_rate": 4.996640509658279e-05, + "loss": 1.164, + "step": 1552 + }, + { + "epoch": 0.18415747658010198, + "grad_norm": 1.5557615062034251, + "learning_rate": 4.9966280583809546e-05, + "loss": 0.9441, + "step": 1553 + }, + { + "epoch": 0.18427605834222696, + "grad_norm": 1.8976070426085894, + "learning_rate": 4.99661558408778e-05, + "loss": 1.2119, + "step": 1554 + }, + { + "epoch": 0.18439464010435194, + "grad_norm": 1.755881234025144, + "learning_rate": 4.996603086778873e-05, + "loss": 1.1316, + "step": 1555 + }, + { + "epoch": 0.18451322186647692, + "grad_norm": 1.6336313731120855, + "learning_rate": 4.9965905664543455e-05, + "loss": 1.1469, + "step": 1556 + }, + { + "epoch": 0.18463180362860193, + "grad_norm": 1.7604787396012924, + "learning_rate": 4.996578023114314e-05, + "loss": 1.07, + "step": 1557 + }, + { + "epoch": 0.1847503853907269, + "grad_norm": 1.5672944326904943, + "learning_rate": 4.996565456758897e-05, + "loss": 1.0435, + "step": 1558 + }, + { + "epoch": 0.1848689671528519, + "grad_norm": 1.9068366815275553, + "learning_rate": 4.996552867388206e-05, + "loss": 1.0765, + "step": 1559 + }, + { + "epoch": 0.18498754891497687, + "grad_norm": 1.7476299569943206, + "learning_rate": 4.99654025500236e-05, + "loss": 1.2208, + "step": 1560 + }, + { + "epoch": 0.18510613067710185, + "grad_norm": 1.660763032483411, + "learning_rate": 4.996527619601473e-05, + "loss": 1.0627, + "step": 1561 + }, + { + "epoch": 0.18522471243922684, + "grad_norm": 1.547670732331173, + "learning_rate": 4.996514961185664e-05, + "loss": 0.9164, + "step": 1562 + }, + { + "epoch": 0.18534329420135184, + "grad_norm": 1.730788351270218, + "learning_rate": 4.996502279755049e-05, + "loss": 0.9246, + "step": 1563 + }, + { + "epoch": 0.18546187596347682, + "grad_norm": 1.8215216898016657, + "learning_rate": 4.996489575309743e-05, + "loss": 0.8069, + "step": 1564 + }, + { + "epoch": 0.1855804577256018, + "grad_norm": 1.783487920841857, + "learning_rate": 4.9964768478498655e-05, + "loss": 1.0708, + "step": 1565 + }, + { + "epoch": 0.18569903948772679, + "grad_norm": 1.7990803976020977, + "learning_rate": 4.996464097375532e-05, + "loss": 0.9731, + "step": 1566 + }, + { + "epoch": 0.18581762124985177, + "grad_norm": 1.7644737502639234, + "learning_rate": 4.996451323886862e-05, + "loss": 1.2162, + "step": 1567 + }, + { + "epoch": 0.18593620301197675, + "grad_norm": 1.6336924370427168, + "learning_rate": 4.996438527383971e-05, + "loss": 0.9407, + "step": 1568 + }, + { + "epoch": 0.18605478477410176, + "grad_norm": 1.826532888364708, + "learning_rate": 4.9964257078669785e-05, + "loss": 1.0371, + "step": 1569 + }, + { + "epoch": 0.18617336653622674, + "grad_norm": 1.668822514249424, + "learning_rate": 4.9964128653360024e-05, + "loss": 1.229, + "step": 1570 + }, + { + "epoch": 0.18629194829835172, + "grad_norm": 1.578288551131229, + "learning_rate": 4.9963999997911615e-05, + "loss": 0.9798, + "step": 1571 + }, + { + "epoch": 0.1864105300604767, + "grad_norm": 1.6065515753527535, + "learning_rate": 4.9963871112325736e-05, + "loss": 1.2065, + "step": 1572 + }, + { + "epoch": 0.18652911182260168, + "grad_norm": 1.6926844626428836, + "learning_rate": 4.996374199660357e-05, + "loss": 1.0218, + "step": 1573 + }, + { + "epoch": 0.18664769358472666, + "grad_norm": 1.5660879970131347, + "learning_rate": 4.996361265074632e-05, + "loss": 0.9985, + "step": 1574 + }, + { + "epoch": 0.18676627534685167, + "grad_norm": 1.8349596018395025, + "learning_rate": 4.996348307475518e-05, + "loss": 1.1781, + "step": 1575 + }, + { + "epoch": 0.18688485710897665, + "grad_norm": 1.6200880153490875, + "learning_rate": 4.996335326863133e-05, + "loss": 1.3366, + "step": 1576 + }, + { + "epoch": 0.18700343887110163, + "grad_norm": 1.4726909357333346, + "learning_rate": 4.996322323237598e-05, + "loss": 1.0393, + "step": 1577 + }, + { + "epoch": 0.1871220206332266, + "grad_norm": 1.644644569904305, + "learning_rate": 4.996309296599032e-05, + "loss": 1.2533, + "step": 1578 + }, + { + "epoch": 0.1872406023953516, + "grad_norm": 1.6092633734896153, + "learning_rate": 4.996296246947556e-05, + "loss": 1.1218, + "step": 1579 + }, + { + "epoch": 0.18735918415747657, + "grad_norm": 1.3625614405223025, + "learning_rate": 4.996283174283289e-05, + "loss": 1.0278, + "step": 1580 + }, + { + "epoch": 0.18747776591960155, + "grad_norm": 1.6056241331557157, + "learning_rate": 4.996270078606353e-05, + "loss": 1.1424, + "step": 1581 + }, + { + "epoch": 0.18759634768172656, + "grad_norm": 1.5612493425851182, + "learning_rate": 4.9962569599168674e-05, + "loss": 0.7986, + "step": 1582 + }, + { + "epoch": 0.18771492944385154, + "grad_norm": 1.7312694491015879, + "learning_rate": 4.996243818214954e-05, + "loss": 1.1667, + "step": 1583 + }, + { + "epoch": 0.18783351120597652, + "grad_norm": 1.5825270657591424, + "learning_rate": 4.996230653500734e-05, + "loss": 0.8741, + "step": 1584 + }, + { + "epoch": 0.1879520929681015, + "grad_norm": 1.54593603729105, + "learning_rate": 4.9962174657743286e-05, + "loss": 0.8064, + "step": 1585 + }, + { + "epoch": 0.18807067473022648, + "grad_norm": 1.6697306264364022, + "learning_rate": 4.996204255035858e-05, + "loss": 1.202, + "step": 1586 + }, + { + "epoch": 0.18818925649235146, + "grad_norm": 1.6299478725188448, + "learning_rate": 4.996191021285447e-05, + "loss": 1.0735, + "step": 1587 + }, + { + "epoch": 0.18830783825447647, + "grad_norm": 1.615862379203652, + "learning_rate": 4.996177764523214e-05, + "loss": 0.8661, + "step": 1588 + }, + { + "epoch": 0.18842642001660145, + "grad_norm": 1.6562773593366404, + "learning_rate": 4.996164484749284e-05, + "loss": 0.8983, + "step": 1589 + }, + { + "epoch": 0.18854500177872643, + "grad_norm": 1.8255610662468014, + "learning_rate": 4.9961511819637784e-05, + "loss": 1.1389, + "step": 1590 + }, + { + "epoch": 0.18866358354085142, + "grad_norm": 1.6954937022304208, + "learning_rate": 4.9961378561668204e-05, + "loss": 1.0967, + "step": 1591 + }, + { + "epoch": 0.1887821653029764, + "grad_norm": 1.6979503309447315, + "learning_rate": 4.996124507358532e-05, + "loss": 1.0346, + "step": 1592 + }, + { + "epoch": 0.18890074706510138, + "grad_norm": 1.6459416960962332, + "learning_rate": 4.9961111355390354e-05, + "loss": 1.097, + "step": 1593 + }, + { + "epoch": 0.18901932882722638, + "grad_norm": 1.7014786529806827, + "learning_rate": 4.9960977407084565e-05, + "loss": 1.0413, + "step": 1594 + }, + { + "epoch": 0.18913791058935137, + "grad_norm": 1.479642376560751, + "learning_rate": 4.996084322866917e-05, + "loss": 1.1937, + "step": 1595 + }, + { + "epoch": 0.18925649235147635, + "grad_norm": 1.5638024996354594, + "learning_rate": 4.996070882014541e-05, + "loss": 0.5993, + "step": 1596 + }, + { + "epoch": 0.18937507411360133, + "grad_norm": 1.5882216569684302, + "learning_rate": 4.996057418151452e-05, + "loss": 1.0294, + "step": 1597 + }, + { + "epoch": 0.1894936558757263, + "grad_norm": 1.6466476608655074, + "learning_rate": 4.996043931277774e-05, + "loss": 1.0351, + "step": 1598 + }, + { + "epoch": 0.1896122376378513, + "grad_norm": 1.763414535423779, + "learning_rate": 4.9960304213936325e-05, + "loss": 0.936, + "step": 1599 + }, + { + "epoch": 0.1897308193999763, + "grad_norm": 1.46580655597137, + "learning_rate": 4.996016888499152e-05, + "loss": 1.065, + "step": 1600 + }, + { + "epoch": 0.18984940116210128, + "grad_norm": 1.7656122257680125, + "learning_rate": 4.996003332594455e-05, + "loss": 1.0626, + "step": 1601 + }, + { + "epoch": 0.18996798292422626, + "grad_norm": 1.6396504604458573, + "learning_rate": 4.9959897536796696e-05, + "loss": 0.6935, + "step": 1602 + }, + { + "epoch": 0.19008656468635124, + "grad_norm": 1.665624202116392, + "learning_rate": 4.995976151754919e-05, + "loss": 1.09, + "step": 1603 + }, + { + "epoch": 0.19020514644847622, + "grad_norm": 1.711105281387666, + "learning_rate": 4.995962526820328e-05, + "loss": 1.0259, + "step": 1604 + }, + { + "epoch": 0.1903237282106012, + "grad_norm": 1.9345464158644985, + "learning_rate": 4.995948878876025e-05, + "loss": 1.1515, + "step": 1605 + }, + { + "epoch": 0.19044230997272618, + "grad_norm": 1.70825965581751, + "learning_rate": 4.995935207922133e-05, + "loss": 1.2688, + "step": 1606 + }, + { + "epoch": 0.1905608917348512, + "grad_norm": 1.5389291662264457, + "learning_rate": 4.995921513958779e-05, + "loss": 1.0209, + "step": 1607 + }, + { + "epoch": 0.19067947349697617, + "grad_norm": 1.73641096466723, + "learning_rate": 4.99590779698609e-05, + "loss": 1.2913, + "step": 1608 + }, + { + "epoch": 0.19079805525910115, + "grad_norm": 1.6294992044280474, + "learning_rate": 4.9958940570041915e-05, + "loss": 0.878, + "step": 1609 + }, + { + "epoch": 0.19091663702122613, + "grad_norm": 1.4940994114039168, + "learning_rate": 4.99588029401321e-05, + "loss": 1.0951, + "step": 1610 + }, + { + "epoch": 0.1910352187833511, + "grad_norm": 1.4801674321823637, + "learning_rate": 4.995866508013273e-05, + "loss": 0.9697, + "step": 1611 + }, + { + "epoch": 0.1911538005454761, + "grad_norm": 1.4450292156047881, + "learning_rate": 4.995852699004508e-05, + "loss": 1.0828, + "step": 1612 + }, + { + "epoch": 0.1912723823076011, + "grad_norm": 1.7517146057630513, + "learning_rate": 4.9958388669870416e-05, + "loss": 1.3314, + "step": 1613 + }, + { + "epoch": 0.19139096406972608, + "grad_norm": 1.5972451399551157, + "learning_rate": 4.995825011961001e-05, + "loss": 1.0655, + "step": 1614 + }, + { + "epoch": 0.19150954583185106, + "grad_norm": 1.4740666792983896, + "learning_rate": 4.9958111339265144e-05, + "loss": 0.9281, + "step": 1615 + }, + { + "epoch": 0.19162812759397604, + "grad_norm": 1.7321921331202959, + "learning_rate": 4.99579723288371e-05, + "loss": 1.1222, + "step": 1616 + }, + { + "epoch": 0.19174670935610102, + "grad_norm": 1.4080161674973215, + "learning_rate": 4.9957833088327154e-05, + "loss": 1.0761, + "step": 1617 + }, + { + "epoch": 0.191865291118226, + "grad_norm": 1.5072447382472143, + "learning_rate": 4.99576936177366e-05, + "loss": 0.9452, + "step": 1618 + }, + { + "epoch": 0.19198387288035101, + "grad_norm": 1.7615166325438916, + "learning_rate": 4.9957553917066704e-05, + "loss": 1.0928, + "step": 1619 + }, + { + "epoch": 0.192102454642476, + "grad_norm": 1.6848493445171975, + "learning_rate": 4.995741398631878e-05, + "loss": 1.0243, + "step": 1620 + }, + { + "epoch": 0.19222103640460098, + "grad_norm": 1.577187289991337, + "learning_rate": 4.995727382549409e-05, + "loss": 0.8626, + "step": 1621 + }, + { + "epoch": 0.19233961816672596, + "grad_norm": 1.5829535952372968, + "learning_rate": 4.995713343459395e-05, + "loss": 1.0412, + "step": 1622 + }, + { + "epoch": 0.19245819992885094, + "grad_norm": 1.503173660178691, + "learning_rate": 4.995699281361964e-05, + "loss": 0.9868, + "step": 1623 + }, + { + "epoch": 0.19257678169097592, + "grad_norm": 1.5597795071662501, + "learning_rate": 4.995685196257246e-05, + "loss": 1.0105, + "step": 1624 + }, + { + "epoch": 0.19269536345310093, + "grad_norm": 1.7191342371999896, + "learning_rate": 4.9956710881453706e-05, + "loss": 1.0825, + "step": 1625 + }, + { + "epoch": 0.1928139452152259, + "grad_norm": 1.6608030975252759, + "learning_rate": 4.995656957026469e-05, + "loss": 1.1788, + "step": 1626 + }, + { + "epoch": 0.1929325269773509, + "grad_norm": 1.695166747511289, + "learning_rate": 4.9956428029006696e-05, + "loss": 0.9869, + "step": 1627 + }, + { + "epoch": 0.19305110873947587, + "grad_norm": 1.8141557233582302, + "learning_rate": 4.995628625768105e-05, + "loss": 1.0292, + "step": 1628 + }, + { + "epoch": 0.19316969050160085, + "grad_norm": 2.145845017514409, + "learning_rate": 4.995614425628904e-05, + "loss": 1.2887, + "step": 1629 + }, + { + "epoch": 0.19328827226372583, + "grad_norm": 1.7478758497788793, + "learning_rate": 4.995600202483198e-05, + "loss": 0.9496, + "step": 1630 + }, + { + "epoch": 0.1934068540258508, + "grad_norm": 1.7116924285331965, + "learning_rate": 4.99558595633112e-05, + "loss": 0.9766, + "step": 1631 + }, + { + "epoch": 0.19352543578797582, + "grad_norm": 1.8139529437771327, + "learning_rate": 4.995571687172799e-05, + "loss": 1.138, + "step": 1632 + }, + { + "epoch": 0.1936440175501008, + "grad_norm": 1.4801469341164957, + "learning_rate": 4.9955573950083666e-05, + "loss": 1.073, + "step": 1633 + }, + { + "epoch": 0.19376259931222578, + "grad_norm": 1.5079528918607126, + "learning_rate": 4.995543079837955e-05, + "loss": 1.1936, + "step": 1634 + }, + { + "epoch": 0.19388118107435076, + "grad_norm": 1.542575806221861, + "learning_rate": 4.995528741661698e-05, + "loss": 0.9571, + "step": 1635 + }, + { + "epoch": 0.19399976283647574, + "grad_norm": 1.5332456874515554, + "learning_rate": 4.995514380479726e-05, + "loss": 1.2609, + "step": 1636 + }, + { + "epoch": 0.19411834459860072, + "grad_norm": 1.6082813430068659, + "learning_rate": 4.995499996292171e-05, + "loss": 1.1247, + "step": 1637 + }, + { + "epoch": 0.19423692636072573, + "grad_norm": 1.5853263242043059, + "learning_rate": 4.9954855890991655e-05, + "loss": 1.1335, + "step": 1638 + }, + { + "epoch": 0.1943555081228507, + "grad_norm": 1.599811253749778, + "learning_rate": 4.995471158900844e-05, + "loss": 1.0815, + "step": 1639 + }, + { + "epoch": 0.1944740898849757, + "grad_norm": 1.5591151974354802, + "learning_rate": 4.995456705697339e-05, + "loss": 1.0416, + "step": 1640 + }, + { + "epoch": 0.19459267164710067, + "grad_norm": 1.554402008379964, + "learning_rate": 4.995442229488782e-05, + "loss": 0.9294, + "step": 1641 + }, + { + "epoch": 0.19471125340922565, + "grad_norm": 1.4412584466658627, + "learning_rate": 4.995427730275309e-05, + "loss": 0.8145, + "step": 1642 + }, + { + "epoch": 0.19482983517135063, + "grad_norm": 1.5872184017615707, + "learning_rate": 4.995413208057052e-05, + "loss": 1.0059, + "step": 1643 + }, + { + "epoch": 0.19494841693347564, + "grad_norm": 1.682573626022976, + "learning_rate": 4.9953986628341446e-05, + "loss": 0.9632, + "step": 1644 + }, + { + "epoch": 0.19506699869560062, + "grad_norm": 1.6410360964108557, + "learning_rate": 4.995384094606722e-05, + "loss": 1.0105, + "step": 1645 + }, + { + "epoch": 0.1951855804577256, + "grad_norm": 1.7698020970257424, + "learning_rate": 4.995369503374919e-05, + "loss": 1.1068, + "step": 1646 + }, + { + "epoch": 0.19530416221985059, + "grad_norm": 1.7708635646376054, + "learning_rate": 4.995354889138868e-05, + "loss": 1.2475, + "step": 1647 + }, + { + "epoch": 0.19542274398197557, + "grad_norm": 1.7270509486243872, + "learning_rate": 4.9953402518987055e-05, + "loss": 1.0318, + "step": 1648 + }, + { + "epoch": 0.19554132574410055, + "grad_norm": 1.7145746326288438, + "learning_rate": 4.995325591654566e-05, + "loss": 1.0454, + "step": 1649 + }, + { + "epoch": 0.19565990750622556, + "grad_norm": 1.5585643287270916, + "learning_rate": 4.9953109084065844e-05, + "loss": 1.1918, + "step": 1650 + }, + { + "epoch": 0.19577848926835054, + "grad_norm": 1.5658708845858225, + "learning_rate": 4.9952962021548956e-05, + "loss": 0.9562, + "step": 1651 + }, + { + "epoch": 0.19589707103047552, + "grad_norm": 1.846981826824355, + "learning_rate": 4.9952814728996364e-05, + "loss": 0.9759, + "step": 1652 + }, + { + "epoch": 0.1960156527926005, + "grad_norm": 1.6646992654048087, + "learning_rate": 4.995266720640942e-05, + "loss": 1.0486, + "step": 1653 + }, + { + "epoch": 0.19613423455472548, + "grad_norm": 1.5808156338629298, + "learning_rate": 4.995251945378948e-05, + "loss": 1.2012, + "step": 1654 + }, + { + "epoch": 0.19625281631685046, + "grad_norm": 1.496615175883074, + "learning_rate": 4.9952371471137906e-05, + "loss": 1.1035, + "step": 1655 + }, + { + "epoch": 0.19637139807897544, + "grad_norm": 1.9117194169781258, + "learning_rate": 4.9952223258456073e-05, + "loss": 1.0139, + "step": 1656 + }, + { + "epoch": 0.19648997984110045, + "grad_norm": 1.4165521527467075, + "learning_rate": 4.9952074815745334e-05, + "loss": 1.1724, + "step": 1657 + }, + { + "epoch": 0.19660856160322543, + "grad_norm": 1.41997264004625, + "learning_rate": 4.995192614300707e-05, + "loss": 0.9647, + "step": 1658 + }, + { + "epoch": 0.1967271433653504, + "grad_norm": 1.6020967041237606, + "learning_rate": 4.9951777240242636e-05, + "loss": 1.1342, + "step": 1659 + }, + { + "epoch": 0.1968457251274754, + "grad_norm": 1.7317468486424006, + "learning_rate": 4.995162810745342e-05, + "loss": 1.0423, + "step": 1660 + }, + { + "epoch": 0.19696430688960037, + "grad_norm": 1.7652024631449514, + "learning_rate": 4.995147874464079e-05, + "loss": 0.9864, + "step": 1661 + }, + { + "epoch": 0.19708288865172535, + "grad_norm": 1.842025099507146, + "learning_rate": 4.995132915180612e-05, + "loss": 1.118, + "step": 1662 + }, + { + "epoch": 0.19720147041385036, + "grad_norm": 1.9452712316805012, + "learning_rate": 4.99511793289508e-05, + "loss": 1.2296, + "step": 1663 + }, + { + "epoch": 0.19732005217597534, + "grad_norm": 1.4646914187637992, + "learning_rate": 4.99510292760762e-05, + "loss": 1.0104, + "step": 1664 + }, + { + "epoch": 0.19743863393810032, + "grad_norm": 1.6669624750251892, + "learning_rate": 4.99508789931837e-05, + "loss": 0.9821, + "step": 1665 + }, + { + "epoch": 0.1975572157002253, + "grad_norm": 1.626903801356354, + "learning_rate": 4.99507284802747e-05, + "loss": 1.1056, + "step": 1666 + }, + { + "epoch": 0.19767579746235028, + "grad_norm": 1.605058845384302, + "learning_rate": 4.9950577737350576e-05, + "loss": 1.1125, + "step": 1667 + }, + { + "epoch": 0.19779437922447526, + "grad_norm": 1.969487312458771, + "learning_rate": 4.995042676441273e-05, + "loss": 1.1872, + "step": 1668 + }, + { + "epoch": 0.19791296098660027, + "grad_norm": 1.5932370098646527, + "learning_rate": 4.995027556146254e-05, + "loss": 1.0766, + "step": 1669 + }, + { + "epoch": 0.19803154274872525, + "grad_norm": 1.4561715443258096, + "learning_rate": 4.995012412850141e-05, + "loss": 1.0901, + "step": 1670 + }, + { + "epoch": 0.19815012451085023, + "grad_norm": 1.2345701828227944, + "learning_rate": 4.994997246553073e-05, + "loss": 0.8799, + "step": 1671 + }, + { + "epoch": 0.19826870627297521, + "grad_norm": 1.7080506443303032, + "learning_rate": 4.994982057255189e-05, + "loss": 0.9506, + "step": 1672 + }, + { + "epoch": 0.1983872880351002, + "grad_norm": 1.413957169756916, + "learning_rate": 4.994966844956631e-05, + "loss": 0.9748, + "step": 1673 + }, + { + "epoch": 0.19850586979722518, + "grad_norm": 1.5219243786423844, + "learning_rate": 4.994951609657538e-05, + "loss": 0.9359, + "step": 1674 + }, + { + "epoch": 0.19862445155935018, + "grad_norm": 1.497740175307494, + "learning_rate": 4.9949363513580496e-05, + "loss": 0.9727, + "step": 1675 + }, + { + "epoch": 0.19874303332147517, + "grad_norm": 1.5136099462825126, + "learning_rate": 4.9949210700583085e-05, + "loss": 1.2001, + "step": 1676 + }, + { + "epoch": 0.19886161508360015, + "grad_norm": 1.5189415011992695, + "learning_rate": 4.994905765758455e-05, + "loss": 0.9744, + "step": 1677 + }, + { + "epoch": 0.19898019684572513, + "grad_norm": 1.5936937633766755, + "learning_rate": 4.9948904384586294e-05, + "loss": 0.8591, + "step": 1678 + }, + { + "epoch": 0.1990987786078501, + "grad_norm": 1.7883333249974978, + "learning_rate": 4.994875088158973e-05, + "loss": 1.1883, + "step": 1679 + }, + { + "epoch": 0.1992173603699751, + "grad_norm": 1.8700564302397769, + "learning_rate": 4.994859714859628e-05, + "loss": 1.1277, + "step": 1680 + }, + { + "epoch": 0.1993359421321001, + "grad_norm": 1.6341812365503992, + "learning_rate": 4.9948443185607364e-05, + "loss": 0.7868, + "step": 1681 + }, + { + "epoch": 0.19945452389422508, + "grad_norm": 1.6733235830605675, + "learning_rate": 4.994828899262439e-05, + "loss": 1.138, + "step": 1682 + }, + { + "epoch": 0.19957310565635006, + "grad_norm": 1.7030875989263967, + "learning_rate": 4.994813456964878e-05, + "loss": 1.2738, + "step": 1683 + }, + { + "epoch": 0.19969168741847504, + "grad_norm": 1.6348095529259548, + "learning_rate": 4.994797991668197e-05, + "loss": 0.9949, + "step": 1684 + }, + { + "epoch": 0.19981026918060002, + "grad_norm": 1.7179726782240836, + "learning_rate": 4.9947825033725374e-05, + "loss": 1.1669, + "step": 1685 + }, + { + "epoch": 0.199928850942725, + "grad_norm": 1.484490939792777, + "learning_rate": 4.9947669920780424e-05, + "loss": 0.9256, + "step": 1686 + }, + { + "epoch": 0.20004743270484998, + "grad_norm": 1.6176959254885943, + "learning_rate": 4.994751457784856e-05, + "loss": 1.1282, + "step": 1687 + }, + { + "epoch": 0.200166014466975, + "grad_norm": 1.576108462036272, + "learning_rate": 4.994735900493119e-05, + "loss": 1.2005, + "step": 1688 + }, + { + "epoch": 0.20028459622909997, + "grad_norm": 1.4282791041216865, + "learning_rate": 4.9947203202029775e-05, + "loss": 0.761, + "step": 1689 + }, + { + "epoch": 0.20040317799122495, + "grad_norm": 1.5184503581956634, + "learning_rate": 4.9947047169145734e-05, + "loss": 1.1169, + "step": 1690 + }, + { + "epoch": 0.20052175975334993, + "grad_norm": 1.683957066742088, + "learning_rate": 4.994689090628051e-05, + "loss": 1.1099, + "step": 1691 + }, + { + "epoch": 0.2006403415154749, + "grad_norm": 1.6462819844418561, + "learning_rate": 4.994673441343554e-05, + "loss": 1.2618, + "step": 1692 + }, + { + "epoch": 0.2007589232775999, + "grad_norm": 1.584046479519788, + "learning_rate": 4.994657769061227e-05, + "loss": 1.1248, + "step": 1693 + }, + { + "epoch": 0.2008775050397249, + "grad_norm": 1.4571926508724682, + "learning_rate": 4.9946420737812157e-05, + "loss": 1.0697, + "step": 1694 + }, + { + "epoch": 0.20099608680184988, + "grad_norm": 1.7497312421048874, + "learning_rate": 4.9946263555036626e-05, + "loss": 1.0349, + "step": 1695 + }, + { + "epoch": 0.20111466856397486, + "grad_norm": 1.4432742180577316, + "learning_rate": 4.994610614228714e-05, + "loss": 0.9521, + "step": 1696 + }, + { + "epoch": 0.20123325032609984, + "grad_norm": 1.4523741163492265, + "learning_rate": 4.994594849956514e-05, + "loss": 1.202, + "step": 1697 + }, + { + "epoch": 0.20135183208822482, + "grad_norm": 1.6742439301787915, + "learning_rate": 4.9945790626872094e-05, + "loss": 1.205, + "step": 1698 + }, + { + "epoch": 0.2014704138503498, + "grad_norm": 1.8230204578107454, + "learning_rate": 4.994563252420944e-05, + "loss": 0.9721, + "step": 1699 + }, + { + "epoch": 0.20158899561247481, + "grad_norm": 1.4769890734951583, + "learning_rate": 4.994547419157865e-05, + "loss": 1.1395, + "step": 1700 + }, + { + "epoch": 0.2017075773745998, + "grad_norm": 1.681804084431201, + "learning_rate": 4.994531562898118e-05, + "loss": 0.9031, + "step": 1701 + }, + { + "epoch": 0.20182615913672478, + "grad_norm": 1.45242561534564, + "learning_rate": 4.994515683641849e-05, + "loss": 0.9718, + "step": 1702 + }, + { + "epoch": 0.20194474089884976, + "grad_norm": 1.6177282166256657, + "learning_rate": 4.994499781389204e-05, + "loss": 1.1625, + "step": 1703 + }, + { + "epoch": 0.20206332266097474, + "grad_norm": 1.477489806041545, + "learning_rate": 4.99448385614033e-05, + "loss": 0.9836, + "step": 1704 + }, + { + "epoch": 0.20218190442309972, + "grad_norm": 1.6771280363493812, + "learning_rate": 4.9944679078953736e-05, + "loss": 1.017, + "step": 1705 + }, + { + "epoch": 0.20230048618522473, + "grad_norm": 1.6054470041505133, + "learning_rate": 4.994451936654483e-05, + "loss": 1.1695, + "step": 1706 + }, + { + "epoch": 0.2024190679473497, + "grad_norm": 1.7878402675389176, + "learning_rate": 4.994435942417803e-05, + "loss": 1.1955, + "step": 1707 + }, + { + "epoch": 0.2025376497094747, + "grad_norm": 1.5873248595629983, + "learning_rate": 4.994419925185484e-05, + "loss": 0.9112, + "step": 1708 + }, + { + "epoch": 0.20265623147159967, + "grad_norm": 1.5418366133079218, + "learning_rate": 4.9944038849576715e-05, + "loss": 0.8404, + "step": 1709 + }, + { + "epoch": 0.20277481323372465, + "grad_norm": 1.527032797405593, + "learning_rate": 4.994387821734514e-05, + "loss": 1.1733, + "step": 1710 + }, + { + "epoch": 0.20289339499584963, + "grad_norm": 1.64101637416326, + "learning_rate": 4.994371735516159e-05, + "loss": 0.9959, + "step": 1711 + }, + { + "epoch": 0.2030119767579746, + "grad_norm": 1.556261750926364, + "learning_rate": 4.9943556263027556e-05, + "loss": 0.9996, + "step": 1712 + }, + { + "epoch": 0.20313055852009962, + "grad_norm": 1.5810757376291598, + "learning_rate": 4.9943394940944524e-05, + "loss": 0.9193, + "step": 1713 + }, + { + "epoch": 0.2032491402822246, + "grad_norm": 1.6768514622149213, + "learning_rate": 4.994323338891398e-05, + "loss": 1.0607, + "step": 1714 + }, + { + "epoch": 0.20336772204434958, + "grad_norm": 1.6954795868482353, + "learning_rate": 4.9943071606937406e-05, + "loss": 0.7783, + "step": 1715 + }, + { + "epoch": 0.20348630380647456, + "grad_norm": 1.797214731172522, + "learning_rate": 4.994290959501631e-05, + "loss": 0.8388, + "step": 1716 + }, + { + "epoch": 0.20360488556859954, + "grad_norm": 1.6715602372481064, + "learning_rate": 4.994274735315217e-05, + "loss": 0.8317, + "step": 1717 + }, + { + "epoch": 0.20372346733072452, + "grad_norm": 1.581351744055121, + "learning_rate": 4.994258488134648e-05, + "loss": 0.8884, + "step": 1718 + }, + { + "epoch": 0.20384204909284953, + "grad_norm": 1.6984808000978888, + "learning_rate": 4.9942422179600744e-05, + "loss": 1.0653, + "step": 1719 + }, + { + "epoch": 0.2039606308549745, + "grad_norm": 1.6633539099896346, + "learning_rate": 4.9942259247916466e-05, + "loss": 1.0328, + "step": 1720 + }, + { + "epoch": 0.2040792126170995, + "grad_norm": 1.6844877781073115, + "learning_rate": 4.994209608629514e-05, + "loss": 1.0734, + "step": 1721 + }, + { + "epoch": 0.20419779437922447, + "grad_norm": 1.7034391268998788, + "learning_rate": 4.994193269473828e-05, + "loss": 1.3863, + "step": 1722 + }, + { + "epoch": 0.20431637614134945, + "grad_norm": 1.520890327606274, + "learning_rate": 4.994176907324739e-05, + "loss": 0.7457, + "step": 1723 + }, + { + "epoch": 0.20443495790347443, + "grad_norm": 1.9135702434326587, + "learning_rate": 4.9941605221823966e-05, + "loss": 0.9161, + "step": 1724 + }, + { + "epoch": 0.20455353966559944, + "grad_norm": 1.6540452264322365, + "learning_rate": 4.994144114046953e-05, + "loss": 1.0658, + "step": 1725 + }, + { + "epoch": 0.20467212142772442, + "grad_norm": 1.4760096447067115, + "learning_rate": 4.994127682918559e-05, + "loss": 1.0304, + "step": 1726 + }, + { + "epoch": 0.2047907031898494, + "grad_norm": 1.5426711581734656, + "learning_rate": 4.9941112287973667e-05, + "loss": 1.0164, + "step": 1727 + }, + { + "epoch": 0.20490928495197439, + "grad_norm": 1.441155608760965, + "learning_rate": 4.994094751683527e-05, + "loss": 0.905, + "step": 1728 + }, + { + "epoch": 0.20502786671409937, + "grad_norm": 1.4559611456947479, + "learning_rate": 4.994078251577192e-05, + "loss": 1.1475, + "step": 1729 + }, + { + "epoch": 0.20514644847622435, + "grad_norm": 1.620660284370224, + "learning_rate": 4.994061728478515e-05, + "loss": 1.1511, + "step": 1730 + }, + { + "epoch": 0.20526503023834936, + "grad_norm": 2.31359467476029, + "learning_rate": 4.994045182387646e-05, + "loss": 1.2315, + "step": 1731 + }, + { + "epoch": 0.20538361200047434, + "grad_norm": 1.9084462912569549, + "learning_rate": 4.99402861330474e-05, + "loss": 0.9592, + "step": 1732 + }, + { + "epoch": 0.20550219376259932, + "grad_norm": 1.7834346334879625, + "learning_rate": 4.994012021229947e-05, + "loss": 1.2626, + "step": 1733 + }, + { + "epoch": 0.2056207755247243, + "grad_norm": 1.2791137489708033, + "learning_rate": 4.993995406163423e-05, + "loss": 0.6817, + "step": 1734 + }, + { + "epoch": 0.20573935728684928, + "grad_norm": 1.4828581130834255, + "learning_rate": 4.993978768105319e-05, + "loss": 1.1931, + "step": 1735 + }, + { + "epoch": 0.20585793904897426, + "grad_norm": 1.388861603365636, + "learning_rate": 4.993962107055789e-05, + "loss": 1.2472, + "step": 1736 + }, + { + "epoch": 0.20597652081109924, + "grad_norm": 1.5273895517373055, + "learning_rate": 4.9939454230149876e-05, + "loss": 1.0781, + "step": 1737 + }, + { + "epoch": 0.20609510257322425, + "grad_norm": 1.3887691937906943, + "learning_rate": 4.993928715983066e-05, + "loss": 0.6454, + "step": 1738 + }, + { + "epoch": 0.20621368433534923, + "grad_norm": 1.5077867803934344, + "learning_rate": 4.9939119859601815e-05, + "loss": 1.1834, + "step": 1739 + }, + { + "epoch": 0.2063322660974742, + "grad_norm": 1.8678705487037224, + "learning_rate": 4.993895232946486e-05, + "loss": 1.0303, + "step": 1740 + }, + { + "epoch": 0.2064508478595992, + "grad_norm": 1.5442292056873277, + "learning_rate": 4.993878456942135e-05, + "loss": 0.9555, + "step": 1741 + }, + { + "epoch": 0.20656942962172417, + "grad_norm": 1.447615081627025, + "learning_rate": 4.993861657947282e-05, + "loss": 1.0007, + "step": 1742 + }, + { + "epoch": 0.20668801138384915, + "grad_norm": 1.3730228427725495, + "learning_rate": 4.993844835962083e-05, + "loss": 0.9227, + "step": 1743 + }, + { + "epoch": 0.20680659314597416, + "grad_norm": 1.6208873327976283, + "learning_rate": 4.9938279909866934e-05, + "loss": 1.0335, + "step": 1744 + }, + { + "epoch": 0.20692517490809914, + "grad_norm": 1.6873979395166983, + "learning_rate": 4.993811123021267e-05, + "loss": 0.9054, + "step": 1745 + }, + { + "epoch": 0.20704375667022412, + "grad_norm": 1.5399954937671008, + "learning_rate": 4.9937942320659606e-05, + "loss": 1.1125, + "step": 1746 + }, + { + "epoch": 0.2071623384323491, + "grad_norm": 1.8398817072076532, + "learning_rate": 4.9937773181209303e-05, + "loss": 1.1029, + "step": 1747 + }, + { + "epoch": 0.20728092019447408, + "grad_norm": 1.6226195596555468, + "learning_rate": 4.9937603811863295e-05, + "loss": 1.0397, + "step": 1748 + }, + { + "epoch": 0.20739950195659906, + "grad_norm": 1.589253326721244, + "learning_rate": 4.993743421262317e-05, + "loss": 0.9006, + "step": 1749 + }, + { + "epoch": 0.20751808371872407, + "grad_norm": 1.5925971027788899, + "learning_rate": 4.993726438349048e-05, + "loss": 1.0645, + "step": 1750 + }, + { + "epoch": 0.20763666548084905, + "grad_norm": 1.6467258830513347, + "learning_rate": 4.9937094324466796e-05, + "loss": 1.1085, + "step": 1751 + }, + { + "epoch": 0.20775524724297403, + "grad_norm": 1.8338451967704728, + "learning_rate": 4.9936924035553685e-05, + "loss": 1.1637, + "step": 1752 + }, + { + "epoch": 0.20787382900509901, + "grad_norm": 1.6765550683077919, + "learning_rate": 4.993675351675271e-05, + "loss": 1.398, + "step": 1753 + }, + { + "epoch": 0.207992410767224, + "grad_norm": 1.5325063413242601, + "learning_rate": 4.9936582768065444e-05, + "loss": 0.9329, + "step": 1754 + }, + { + "epoch": 0.20811099252934898, + "grad_norm": 1.6769241364689538, + "learning_rate": 4.9936411789493466e-05, + "loss": 1.2809, + "step": 1755 + }, + { + "epoch": 0.20822957429147398, + "grad_norm": 1.7583528951550318, + "learning_rate": 4.993624058103835e-05, + "loss": 1.135, + "step": 1756 + }, + { + "epoch": 0.20834815605359897, + "grad_norm": 1.4495803853251503, + "learning_rate": 4.993606914270167e-05, + "loss": 0.7078, + "step": 1757 + }, + { + "epoch": 0.20846673781572395, + "grad_norm": 1.703141278093442, + "learning_rate": 4.9935897474485024e-05, + "loss": 1.0222, + "step": 1758 + }, + { + "epoch": 0.20858531957784893, + "grad_norm": 1.6444780116013784, + "learning_rate": 4.993572557638997e-05, + "loss": 0.9826, + "step": 1759 + }, + { + "epoch": 0.2087039013399739, + "grad_norm": 1.3462293716961187, + "learning_rate": 4.993555344841811e-05, + "loss": 0.7668, + "step": 1760 + }, + { + "epoch": 0.2088224831020989, + "grad_norm": 1.6900900173921347, + "learning_rate": 4.993538109057102e-05, + "loss": 1.2862, + "step": 1761 + }, + { + "epoch": 0.20894106486422387, + "grad_norm": 1.697847244274863, + "learning_rate": 4.99352085028503e-05, + "loss": 1.1521, + "step": 1762 + }, + { + "epoch": 0.20905964662634888, + "grad_norm": 1.5904989333198283, + "learning_rate": 4.993503568525753e-05, + "loss": 0.9432, + "step": 1763 + }, + { + "epoch": 0.20917822838847386, + "grad_norm": 1.704765592971509, + "learning_rate": 4.993486263779431e-05, + "loss": 0.9619, + "step": 1764 + }, + { + "epoch": 0.20929681015059884, + "grad_norm": 1.5132574536824275, + "learning_rate": 4.993468936046224e-05, + "loss": 0.7403, + "step": 1765 + }, + { + "epoch": 0.20941539191272382, + "grad_norm": 1.6435486594042532, + "learning_rate": 4.993451585326291e-05, + "loss": 1.1546, + "step": 1766 + }, + { + "epoch": 0.2095339736748488, + "grad_norm": 1.9369754408662387, + "learning_rate": 4.993434211619791e-05, + "loss": 1.1746, + "step": 1767 + }, + { + "epoch": 0.20965255543697378, + "grad_norm": 1.8354854014944162, + "learning_rate": 4.993416814926886e-05, + "loss": 0.8454, + "step": 1768 + }, + { + "epoch": 0.2097711371990988, + "grad_norm": 1.6939830390941384, + "learning_rate": 4.9933993952477356e-05, + "loss": 1.2456, + "step": 1769 + }, + { + "epoch": 0.20988971896122377, + "grad_norm": 2.0557622507307496, + "learning_rate": 4.9933819525825e-05, + "loss": 1.021, + "step": 1770 + }, + { + "epoch": 0.21000830072334875, + "grad_norm": 1.4863482070848308, + "learning_rate": 4.993364486931341e-05, + "loss": 1.1479, + "step": 1771 + }, + { + "epoch": 0.21012688248547373, + "grad_norm": 1.5346269924438494, + "learning_rate": 4.993346998294418e-05, + "loss": 0.7894, + "step": 1772 + }, + { + "epoch": 0.2102454642475987, + "grad_norm": 1.4579088906263657, + "learning_rate": 4.9933294866718944e-05, + "loss": 1.0244, + "step": 1773 + }, + { + "epoch": 0.2103640460097237, + "grad_norm": 1.6202177346562843, + "learning_rate": 4.99331195206393e-05, + "loss": 1.0095, + "step": 1774 + }, + { + "epoch": 0.2104826277718487, + "grad_norm": 1.5905943899383632, + "learning_rate": 4.9932943944706866e-05, + "loss": 1.1442, + "step": 1775 + }, + { + "epoch": 0.21060120953397368, + "grad_norm": 1.466998129111414, + "learning_rate": 4.9932768138923266e-05, + "loss": 0.6028, + "step": 1776 + }, + { + "epoch": 0.21071979129609866, + "grad_norm": 1.776335044910245, + "learning_rate": 4.993259210329012e-05, + "loss": 0.8719, + "step": 1777 + }, + { + "epoch": 0.21083837305822364, + "grad_norm": 1.919229595620604, + "learning_rate": 4.993241583780904e-05, + "loss": 0.9369, + "step": 1778 + }, + { + "epoch": 0.21095695482034862, + "grad_norm": 1.5995657368290241, + "learning_rate": 4.9932239342481675e-05, + "loss": 0.8465, + "step": 1779 + }, + { + "epoch": 0.2110755365824736, + "grad_norm": 1.5582019366268987, + "learning_rate": 4.993206261730963e-05, + "loss": 1.0723, + "step": 1780 + }, + { + "epoch": 0.2111941183445986, + "grad_norm": 1.6606393223791542, + "learning_rate": 4.9931885662294534e-05, + "loss": 1.2432, + "step": 1781 + }, + { + "epoch": 0.2113127001067236, + "grad_norm": 1.6178539362453277, + "learning_rate": 4.993170847743803e-05, + "loss": 0.9048, + "step": 1782 + }, + { + "epoch": 0.21143128186884858, + "grad_norm": 1.5738489570577288, + "learning_rate": 4.9931531062741756e-05, + "loss": 0.8792, + "step": 1783 + }, + { + "epoch": 0.21154986363097356, + "grad_norm": 1.947769728555521, + "learning_rate": 4.993135341820733e-05, + "loss": 1.0994, + "step": 1784 + }, + { + "epoch": 0.21166844539309854, + "grad_norm": 1.5472524602914977, + "learning_rate": 4.9931175543836405e-05, + "loss": 1.0497, + "step": 1785 + }, + { + "epoch": 0.21178702715522352, + "grad_norm": 1.6382354891865967, + "learning_rate": 4.993099743963061e-05, + "loss": 1.2515, + "step": 1786 + }, + { + "epoch": 0.2119056089173485, + "grad_norm": 1.9099621718026734, + "learning_rate": 4.9930819105591586e-05, + "loss": 0.9412, + "step": 1787 + }, + { + "epoch": 0.2120241906794735, + "grad_norm": 1.559666977472927, + "learning_rate": 4.993064054172099e-05, + "loss": 0.8705, + "step": 1788 + }, + { + "epoch": 0.2121427724415985, + "grad_norm": 1.5062377742763124, + "learning_rate": 4.993046174802046e-05, + "loss": 0.8809, + "step": 1789 + }, + { + "epoch": 0.21226135420372347, + "grad_norm": 1.784795651827728, + "learning_rate": 4.9930282724491636e-05, + "loss": 1.0041, + "step": 1790 + }, + { + "epoch": 0.21237993596584845, + "grad_norm": 1.9033927797707701, + "learning_rate": 4.993010347113618e-05, + "loss": 1.1279, + "step": 1791 + }, + { + "epoch": 0.21249851772797343, + "grad_norm": 1.4942523448965166, + "learning_rate": 4.992992398795575e-05, + "loss": 1.1273, + "step": 1792 + }, + { + "epoch": 0.2126170994900984, + "grad_norm": 1.8056002101571806, + "learning_rate": 4.992974427495198e-05, + "loss": 1.0004, + "step": 1793 + }, + { + "epoch": 0.21273568125222342, + "grad_norm": 1.5329191765979016, + "learning_rate": 4.9929564332126544e-05, + "loss": 1.0626, + "step": 1794 + }, + { + "epoch": 0.2128542630143484, + "grad_norm": 1.7662363309001166, + "learning_rate": 4.99293841594811e-05, + "loss": 0.9929, + "step": 1795 + }, + { + "epoch": 0.21297284477647338, + "grad_norm": 1.498106031027585, + "learning_rate": 4.99292037570173e-05, + "loss": 0.9335, + "step": 1796 + }, + { + "epoch": 0.21309142653859836, + "grad_norm": 1.339438713242771, + "learning_rate": 4.9929023124736815e-05, + "loss": 1.1458, + "step": 1797 + }, + { + "epoch": 0.21321000830072334, + "grad_norm": 1.584225465802162, + "learning_rate": 4.99288422626413e-05, + "loss": 0.7808, + "step": 1798 + }, + { + "epoch": 0.21332859006284832, + "grad_norm": 1.513684303964356, + "learning_rate": 4.9928661170732435e-05, + "loss": 0.9652, + "step": 1799 + }, + { + "epoch": 0.21344717182497333, + "grad_norm": 1.4038372598318014, + "learning_rate": 4.992847984901188e-05, + "loss": 0.8727, + "step": 1800 + }, + { + "epoch": 0.2135657535870983, + "grad_norm": 1.31897524492445, + "learning_rate": 4.9928298297481314e-05, + "loss": 0.8711, + "step": 1801 + }, + { + "epoch": 0.2136843353492233, + "grad_norm": 1.695394487890238, + "learning_rate": 4.99281165161424e-05, + "loss": 0.7944, + "step": 1802 + }, + { + "epoch": 0.21380291711134827, + "grad_norm": 1.5551358484408893, + "learning_rate": 4.992793450499682e-05, + "loss": 1.1532, + "step": 1803 + }, + { + "epoch": 0.21392149887347325, + "grad_norm": 1.5269241141660146, + "learning_rate": 4.992775226404626e-05, + "loss": 0.8349, + "step": 1804 + }, + { + "epoch": 0.21404008063559823, + "grad_norm": 1.5805087719348463, + "learning_rate": 4.9927569793292394e-05, + "loss": 1.0319, + "step": 1805 + }, + { + "epoch": 0.21415866239772324, + "grad_norm": 1.6503897720899885, + "learning_rate": 4.99273870927369e-05, + "loss": 1.1086, + "step": 1806 + }, + { + "epoch": 0.21427724415984822, + "grad_norm": 1.5635799591579393, + "learning_rate": 4.992720416238146e-05, + "loss": 0.8309, + "step": 1807 + }, + { + "epoch": 0.2143958259219732, + "grad_norm": 1.6465010435770757, + "learning_rate": 4.992702100222777e-05, + "loss": 1.1674, + "step": 1808 + }, + { + "epoch": 0.21451440768409819, + "grad_norm": 1.8014310311556232, + "learning_rate": 4.992683761227751e-05, + "loss": 0.8525, + "step": 1809 + }, + { + "epoch": 0.21463298944622317, + "grad_norm": 1.8974436233092724, + "learning_rate": 4.992665399253238e-05, + "loss": 1.2909, + "step": 1810 + }, + { + "epoch": 0.21475157120834815, + "grad_norm": 1.6709769946132582, + "learning_rate": 4.9926470142994064e-05, + "loss": 1.1106, + "step": 1811 + }, + { + "epoch": 0.21487015297047315, + "grad_norm": 1.798964123628831, + "learning_rate": 4.992628606366426e-05, + "loss": 1.0588, + "step": 1812 + }, + { + "epoch": 0.21498873473259814, + "grad_norm": 1.374801196280157, + "learning_rate": 4.992610175454466e-05, + "loss": 0.8615, + "step": 1813 + }, + { + "epoch": 0.21510731649472312, + "grad_norm": 1.4826999433025927, + "learning_rate": 4.992591721563698e-05, + "loss": 1.1079, + "step": 1814 + }, + { + "epoch": 0.2152258982568481, + "grad_norm": 1.67673822479578, + "learning_rate": 4.99257324469429e-05, + "loss": 1.061, + "step": 1815 + }, + { + "epoch": 0.21534448001897308, + "grad_norm": 1.4063072308004343, + "learning_rate": 4.992554744846414e-05, + "loss": 0.8714, + "step": 1816 + }, + { + "epoch": 0.21546306178109806, + "grad_norm": 1.3383712365027427, + "learning_rate": 4.9925362220202394e-05, + "loss": 0.8772, + "step": 1817 + }, + { + "epoch": 0.21558164354322304, + "grad_norm": 1.3765919670828244, + "learning_rate": 4.9925176762159374e-05, + "loss": 0.7735, + "step": 1818 + }, + { + "epoch": 0.21570022530534805, + "grad_norm": 1.5172475275785322, + "learning_rate": 4.992499107433679e-05, + "loss": 0.8821, + "step": 1819 + }, + { + "epoch": 0.21581880706747303, + "grad_norm": 1.4200037960979732, + "learning_rate": 4.992480515673635e-05, + "loss": 0.9199, + "step": 1820 + }, + { + "epoch": 0.215937388829598, + "grad_norm": 1.6199742256974323, + "learning_rate": 4.992461900935977e-05, + "loss": 1.0698, + "step": 1821 + }, + { + "epoch": 0.216055970591723, + "grad_norm": 1.320286491922516, + "learning_rate": 4.9924432632208777e-05, + "loss": 0.9704, + "step": 1822 + }, + { + "epoch": 0.21617455235384797, + "grad_norm": 1.6772689637681681, + "learning_rate": 4.992424602528508e-05, + "loss": 0.9608, + "step": 1823 + }, + { + "epoch": 0.21629313411597295, + "grad_norm": 1.7511405558431619, + "learning_rate": 4.992405918859039e-05, + "loss": 1.1828, + "step": 1824 + }, + { + "epoch": 0.21641171587809796, + "grad_norm": 1.7018676054889086, + "learning_rate": 4.992387212212644e-05, + "loss": 1.2891, + "step": 1825 + }, + { + "epoch": 0.21653029764022294, + "grad_norm": 1.6533398538861304, + "learning_rate": 4.9923684825894956e-05, + "loss": 1.1131, + "step": 1826 + }, + { + "epoch": 0.21664887940234792, + "grad_norm": 1.4746690963194322, + "learning_rate": 4.992349729989766e-05, + "loss": 0.8581, + "step": 1827 + }, + { + "epoch": 0.2167674611644729, + "grad_norm": 1.5928974923688193, + "learning_rate": 4.992330954413628e-05, + "loss": 0.8705, + "step": 1828 + }, + { + "epoch": 0.21688604292659788, + "grad_norm": 1.3997086429905612, + "learning_rate": 4.992312155861255e-05, + "loss": 0.9265, + "step": 1829 + }, + { + "epoch": 0.21700462468872286, + "grad_norm": 1.4520132435561812, + "learning_rate": 4.99229333433282e-05, + "loss": 1.0254, + "step": 1830 + }, + { + "epoch": 0.21712320645084787, + "grad_norm": 1.6876108352881252, + "learning_rate": 4.992274489828497e-05, + "loss": 1.0986, + "step": 1831 + }, + { + "epoch": 0.21724178821297285, + "grad_norm": 1.7470058293604902, + "learning_rate": 4.9922556223484597e-05, + "loss": 1.0919, + "step": 1832 + }, + { + "epoch": 0.21736036997509783, + "grad_norm": 1.6596551857657267, + "learning_rate": 4.992236731892881e-05, + "loss": 0.8802, + "step": 1833 + }, + { + "epoch": 0.21747895173722281, + "grad_norm": 1.927651363943336, + "learning_rate": 4.992217818461936e-05, + "loss": 0.9943, + "step": 1834 + }, + { + "epoch": 0.2175975334993478, + "grad_norm": 1.5664505955630812, + "learning_rate": 4.9921988820558e-05, + "loss": 1.0237, + "step": 1835 + }, + { + "epoch": 0.21771611526147278, + "grad_norm": 1.5156700004582557, + "learning_rate": 4.992179922674645e-05, + "loss": 1.0596, + "step": 1836 + }, + { + "epoch": 0.21783469702359778, + "grad_norm": 1.856405408669696, + "learning_rate": 4.992160940318648e-05, + "loss": 1.1608, + "step": 1837 + }, + { + "epoch": 0.21795327878572276, + "grad_norm": 1.542693395552593, + "learning_rate": 4.9921419349879825e-05, + "loss": 1.0861, + "step": 1838 + }, + { + "epoch": 0.21807186054784775, + "grad_norm": 1.7158448204857237, + "learning_rate": 4.9921229066828255e-05, + "loss": 1.0082, + "step": 1839 + }, + { + "epoch": 0.21819044230997273, + "grad_norm": 1.525708005750468, + "learning_rate": 4.9921038554033506e-05, + "loss": 1.0691, + "step": 1840 + }, + { + "epoch": 0.2183090240720977, + "grad_norm": 1.653714723069598, + "learning_rate": 4.992084781149735e-05, + "loss": 0.9966, + "step": 1841 + }, + { + "epoch": 0.2184276058342227, + "grad_norm": 1.3515594570526117, + "learning_rate": 4.9920656839221526e-05, + "loss": 1.3076, + "step": 1842 + }, + { + "epoch": 0.21854618759634767, + "grad_norm": 1.9200494896537197, + "learning_rate": 4.992046563720781e-05, + "loss": 1.0626, + "step": 1843 + }, + { + "epoch": 0.21866476935847268, + "grad_norm": 1.4220167862980555, + "learning_rate": 4.9920274205457964e-05, + "loss": 1.1005, + "step": 1844 + }, + { + "epoch": 0.21878335112059766, + "grad_norm": 1.5352022156011431, + "learning_rate": 4.992008254397375e-05, + "loss": 1.1366, + "step": 1845 + }, + { + "epoch": 0.21890193288272264, + "grad_norm": 1.3843520916336933, + "learning_rate": 4.9919890652756924e-05, + "loss": 1.0221, + "step": 1846 + }, + { + "epoch": 0.21902051464484762, + "grad_norm": 1.57518514189869, + "learning_rate": 4.991969853180927e-05, + "loss": 1.0119, + "step": 1847 + }, + { + "epoch": 0.2191390964069726, + "grad_norm": 1.5895401987736604, + "learning_rate": 4.991950618113256e-05, + "loss": 1.0046, + "step": 1848 + }, + { + "epoch": 0.21925767816909758, + "grad_norm": 1.5784149900482223, + "learning_rate": 4.991931360072855e-05, + "loss": 1.1749, + "step": 1849 + }, + { + "epoch": 0.2193762599312226, + "grad_norm": 1.5334504225497627, + "learning_rate": 4.9919120790599034e-05, + "loss": 0.9645, + "step": 1850 + }, + { + "epoch": 0.21949484169334757, + "grad_norm": 1.4970358394519854, + "learning_rate": 4.991892775074578e-05, + "loss": 0.9116, + "step": 1851 + }, + { + "epoch": 0.21961342345547255, + "grad_norm": 1.7281591112546828, + "learning_rate": 4.991873448117057e-05, + "loss": 1.0225, + "step": 1852 + }, + { + "epoch": 0.21973200521759753, + "grad_norm": 1.6350897061630507, + "learning_rate": 4.991854098187519e-05, + "loss": 1.1064, + "step": 1853 + }, + { + "epoch": 0.2198505869797225, + "grad_norm": 1.7686439260996882, + "learning_rate": 4.991834725286141e-05, + "loss": 0.8749, + "step": 1854 + }, + { + "epoch": 0.2199691687418475, + "grad_norm": 1.7677798921713643, + "learning_rate": 4.991815329413103e-05, + "loss": 1.158, + "step": 1855 + }, + { + "epoch": 0.2200877505039725, + "grad_norm": 1.9020717492809287, + "learning_rate": 4.991795910568584e-05, + "loss": 1.1096, + "step": 1856 + }, + { + "epoch": 0.22020633226609748, + "grad_norm": 1.9522659629924806, + "learning_rate": 4.991776468752761e-05, + "loss": 1.0991, + "step": 1857 + }, + { + "epoch": 0.22032491402822246, + "grad_norm": 1.4349160179485294, + "learning_rate": 4.991757003965816e-05, + "loss": 1.0615, + "step": 1858 + }, + { + "epoch": 0.22044349579034744, + "grad_norm": 1.423929420477791, + "learning_rate": 4.991737516207926e-05, + "loss": 0.8839, + "step": 1859 + }, + { + "epoch": 0.22056207755247242, + "grad_norm": 1.5044535285252192, + "learning_rate": 4.991718005479272e-05, + "loss": 1.1417, + "step": 1860 + }, + { + "epoch": 0.2206806593145974, + "grad_norm": 1.7414041355935288, + "learning_rate": 4.9916984717800344e-05, + "loss": 1.0484, + "step": 1861 + }, + { + "epoch": 0.2207992410767224, + "grad_norm": 1.4437084033895031, + "learning_rate": 4.991678915110391e-05, + "loss": 1.1822, + "step": 1862 + }, + { + "epoch": 0.2209178228388474, + "grad_norm": 1.4455191507665384, + "learning_rate": 4.991659335470524e-05, + "loss": 1.1809, + "step": 1863 + }, + { + "epoch": 0.22103640460097237, + "grad_norm": 1.4945772055832913, + "learning_rate": 4.9916397328606134e-05, + "loss": 1.1299, + "step": 1864 + }, + { + "epoch": 0.22115498636309736, + "grad_norm": 1.3635213557253822, + "learning_rate": 4.99162010728084e-05, + "loss": 0.9142, + "step": 1865 + }, + { + "epoch": 0.22127356812522234, + "grad_norm": 1.509206278558251, + "learning_rate": 4.991600458731385e-05, + "loss": 0.9276, + "step": 1866 + }, + { + "epoch": 0.22139214988734732, + "grad_norm": 1.4188155367741584, + "learning_rate": 4.9915807872124286e-05, + "loss": 1.0059, + "step": 1867 + }, + { + "epoch": 0.2215107316494723, + "grad_norm": 1.3584141328533217, + "learning_rate": 4.991561092724153e-05, + "loss": 0.861, + "step": 1868 + }, + { + "epoch": 0.2216293134115973, + "grad_norm": 1.2936543472621367, + "learning_rate": 4.9915413752667394e-05, + "loss": 0.743, + "step": 1869 + }, + { + "epoch": 0.2217478951737223, + "grad_norm": 1.468589159350368, + "learning_rate": 4.9915216348403696e-05, + "loss": 0.7735, + "step": 1870 + }, + { + "epoch": 0.22186647693584727, + "grad_norm": 1.4481797410067405, + "learning_rate": 4.9915018714452255e-05, + "loss": 0.8009, + "step": 1871 + }, + { + "epoch": 0.22198505869797225, + "grad_norm": 1.46429387221064, + "learning_rate": 4.9914820850814895e-05, + "loss": 1.0691, + "step": 1872 + }, + { + "epoch": 0.22210364046009723, + "grad_norm": 1.6581296181932614, + "learning_rate": 4.9914622757493445e-05, + "loss": 1.1952, + "step": 1873 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 1.6169357029747768, + "learning_rate": 4.9914424434489714e-05, + "loss": 1.0887, + "step": 1874 + }, + { + "epoch": 0.22234080398434722, + "grad_norm": 1.7652292970013435, + "learning_rate": 4.991422588180555e-05, + "loss": 0.9275, + "step": 1875 + }, + { + "epoch": 0.2224593857464722, + "grad_norm": 2.2091805829322753, + "learning_rate": 4.991402709944277e-05, + "loss": 1.0107, + "step": 1876 + }, + { + "epoch": 0.22257796750859718, + "grad_norm": 1.6709860603153783, + "learning_rate": 4.991382808740322e-05, + "loss": 1.0192, + "step": 1877 + }, + { + "epoch": 0.22269654927072216, + "grad_norm": 1.5914127315097557, + "learning_rate": 4.991362884568872e-05, + "loss": 1.0754, + "step": 1878 + }, + { + "epoch": 0.22281513103284714, + "grad_norm": 2.1609347023659455, + "learning_rate": 4.9913429374301117e-05, + "loss": 0.9836, + "step": 1879 + }, + { + "epoch": 0.22293371279497212, + "grad_norm": 1.4324581759099748, + "learning_rate": 4.9913229673242245e-05, + "loss": 0.9519, + "step": 1880 + }, + { + "epoch": 0.22305229455709713, + "grad_norm": 1.6315959590837206, + "learning_rate": 4.991302974251395e-05, + "loss": 0.86, + "step": 1881 + }, + { + "epoch": 0.2231708763192221, + "grad_norm": 1.9131010948498428, + "learning_rate": 4.991282958211806e-05, + "loss": 0.9786, + "step": 1882 + }, + { + "epoch": 0.2232894580813471, + "grad_norm": 1.7170206484114023, + "learning_rate": 4.991262919205644e-05, + "loss": 0.934, + "step": 1883 + }, + { + "epoch": 0.22340803984347207, + "grad_norm": 1.5232059656739256, + "learning_rate": 4.991242857233093e-05, + "loss": 0.9466, + "step": 1884 + }, + { + "epoch": 0.22352662160559705, + "grad_norm": 1.4154528681373066, + "learning_rate": 4.991222772294337e-05, + "loss": 1.0804, + "step": 1885 + }, + { + "epoch": 0.22364520336772203, + "grad_norm": 1.4662514737883914, + "learning_rate": 4.9912026643895626e-05, + "loss": 0.7908, + "step": 1886 + }, + { + "epoch": 0.22376378512984704, + "grad_norm": 1.5075440337244752, + "learning_rate": 4.9911825335189545e-05, + "loss": 0.8633, + "step": 1887 + }, + { + "epoch": 0.22388236689197202, + "grad_norm": 1.5296579985702576, + "learning_rate": 4.991162379682698e-05, + "loss": 0.9413, + "step": 1888 + }, + { + "epoch": 0.224000948654097, + "grad_norm": 1.5047062348677775, + "learning_rate": 4.99114220288098e-05, + "loss": 1.1115, + "step": 1889 + }, + { + "epoch": 0.22411953041622198, + "grad_norm": 1.8633852031190115, + "learning_rate": 4.991122003113985e-05, + "loss": 0.9855, + "step": 1890 + }, + { + "epoch": 0.22423811217834697, + "grad_norm": 1.8703982112153525, + "learning_rate": 4.9911017803819e-05, + "loss": 1.1626, + "step": 1891 + }, + { + "epoch": 0.22435669394047195, + "grad_norm": 1.4825220322872457, + "learning_rate": 4.9910815346849114e-05, + "loss": 0.6765, + "step": 1892 + }, + { + "epoch": 0.22447527570259693, + "grad_norm": 1.6751652057381117, + "learning_rate": 4.991061266023206e-05, + "loss": 0.9255, + "step": 1893 + }, + { + "epoch": 0.22459385746472194, + "grad_norm": 1.5593401623687249, + "learning_rate": 4.99104097439697e-05, + "loss": 0.9443, + "step": 1894 + }, + { + "epoch": 0.22471243922684692, + "grad_norm": 1.580381420129191, + "learning_rate": 4.9910206598063914e-05, + "loss": 0.8111, + "step": 1895 + }, + { + "epoch": 0.2248310209889719, + "grad_norm": 1.553131240829057, + "learning_rate": 4.9910003222516575e-05, + "loss": 0.9166, + "step": 1896 + }, + { + "epoch": 0.22494960275109688, + "grad_norm": 1.4657259942333234, + "learning_rate": 4.990979961732954e-05, + "loss": 0.9469, + "step": 1897 + }, + { + "epoch": 0.22506818451322186, + "grad_norm": 1.7574242945632421, + "learning_rate": 4.990959578250471e-05, + "loss": 0.8758, + "step": 1898 + }, + { + "epoch": 0.22518676627534684, + "grad_norm": 1.7582971652958272, + "learning_rate": 4.9909391718043953e-05, + "loss": 1.0759, + "step": 1899 + }, + { + "epoch": 0.22530534803747185, + "grad_norm": 1.5176181013992036, + "learning_rate": 4.990918742394914e-05, + "loss": 0.9966, + "step": 1900 + }, + { + "epoch": 0.22542392979959683, + "grad_norm": 1.6082399969320567, + "learning_rate": 4.9908982900222175e-05, + "loss": 1.2566, + "step": 1901 + }, + { + "epoch": 0.2255425115617218, + "grad_norm": 1.7371549585852941, + "learning_rate": 4.990877814686493e-05, + "loss": 1.0014, + "step": 1902 + }, + { + "epoch": 0.2256610933238468, + "grad_norm": 1.6710487071226605, + "learning_rate": 4.990857316387929e-05, + "loss": 1.119, + "step": 1903 + }, + { + "epoch": 0.22577967508597177, + "grad_norm": 1.5574127459027793, + "learning_rate": 4.9908367951267156e-05, + "loss": 0.8728, + "step": 1904 + }, + { + "epoch": 0.22589825684809675, + "grad_norm": 1.5277533332292048, + "learning_rate": 4.990816250903041e-05, + "loss": 0.9944, + "step": 1905 + }, + { + "epoch": 0.22601683861022176, + "grad_norm": 1.7278124086170772, + "learning_rate": 4.990795683717095e-05, + "loss": 0.9807, + "step": 1906 + }, + { + "epoch": 0.22613542037234674, + "grad_norm": 1.5763014784081089, + "learning_rate": 4.9907750935690686e-05, + "loss": 0.992, + "step": 1907 + }, + { + "epoch": 0.22625400213447172, + "grad_norm": 1.8351843750735692, + "learning_rate": 4.990754480459149e-05, + "loss": 1.2093, + "step": 1908 + }, + { + "epoch": 0.2263725838965967, + "grad_norm": 1.6613475605159602, + "learning_rate": 4.9907338443875276e-05, + "loss": 1.0477, + "step": 1909 + }, + { + "epoch": 0.22649116565872168, + "grad_norm": 1.6068511644574766, + "learning_rate": 4.990713185354394e-05, + "loss": 1.1514, + "step": 1910 + }, + { + "epoch": 0.22660974742084666, + "grad_norm": 1.4188395394930773, + "learning_rate": 4.99069250335994e-05, + "loss": 0.7674, + "step": 1911 + }, + { + "epoch": 0.22672832918297167, + "grad_norm": 1.5271961528559412, + "learning_rate": 4.990671798404355e-05, + "loss": 0.9017, + "step": 1912 + }, + { + "epoch": 0.22684691094509665, + "grad_norm": 1.81350360089599, + "learning_rate": 4.990651070487831e-05, + "loss": 1.0771, + "step": 1913 + }, + { + "epoch": 0.22696549270722163, + "grad_norm": 2.037348342366854, + "learning_rate": 4.990630319610558e-05, + "loss": 1.089, + "step": 1914 + }, + { + "epoch": 0.22708407446934661, + "grad_norm": 1.6453973283766061, + "learning_rate": 4.9906095457727266e-05, + "loss": 0.9462, + "step": 1915 + }, + { + "epoch": 0.2272026562314716, + "grad_norm": 1.5056713329533733, + "learning_rate": 4.99058874897453e-05, + "loss": 0.9865, + "step": 1916 + }, + { + "epoch": 0.22732123799359658, + "grad_norm": 1.9082870688953486, + "learning_rate": 4.99056792921616e-05, + "loss": 1.4323, + "step": 1917 + }, + { + "epoch": 0.22743981975572156, + "grad_norm": 1.5632063982546813, + "learning_rate": 4.990547086497807e-05, + "loss": 1.0358, + "step": 1918 + }, + { + "epoch": 0.22755840151784656, + "grad_norm": 1.4126413997959382, + "learning_rate": 4.990526220819665e-05, + "loss": 0.8899, + "step": 1919 + }, + { + "epoch": 0.22767698327997155, + "grad_norm": 1.2892210652447933, + "learning_rate": 4.990505332181924e-05, + "loss": 0.748, + "step": 1920 + }, + { + "epoch": 0.22779556504209653, + "grad_norm": 1.3786902145782214, + "learning_rate": 4.990484420584778e-05, + "loss": 0.9859, + "step": 1921 + }, + { + "epoch": 0.2279141468042215, + "grad_norm": 1.7865509519350509, + "learning_rate": 4.9904634860284205e-05, + "loss": 1.031, + "step": 1922 + }, + { + "epoch": 0.2280327285663465, + "grad_norm": 1.3964635841098754, + "learning_rate": 4.990442528513043e-05, + "loss": 1.0724, + "step": 1923 + }, + { + "epoch": 0.22815131032847147, + "grad_norm": 1.9025682596244988, + "learning_rate": 4.99042154803884e-05, + "loss": 1.0905, + "step": 1924 + }, + { + "epoch": 0.22826989209059648, + "grad_norm": 1.4511239729599272, + "learning_rate": 4.990400544606004e-05, + "loss": 1.0908, + "step": 1925 + }, + { + "epoch": 0.22838847385272146, + "grad_norm": 1.4839117555497356, + "learning_rate": 4.990379518214729e-05, + "loss": 1.2301, + "step": 1926 + }, + { + "epoch": 0.22850705561484644, + "grad_norm": 1.446694712344075, + "learning_rate": 4.990358468865208e-05, + "loss": 0.9176, + "step": 1927 + }, + { + "epoch": 0.22862563737697142, + "grad_norm": 1.4566533932211327, + "learning_rate": 4.9903373965576374e-05, + "loss": 0.9359, + "step": 1928 + }, + { + "epoch": 0.2287442191390964, + "grad_norm": 1.5647290237653095, + "learning_rate": 4.990316301292208e-05, + "loss": 0.9452, + "step": 1929 + }, + { + "epoch": 0.22886280090122138, + "grad_norm": 1.683677343503608, + "learning_rate": 4.9902951830691174e-05, + "loss": 0.9945, + "step": 1930 + }, + { + "epoch": 0.2289813826633464, + "grad_norm": 1.4797446705615598, + "learning_rate": 4.9902740418885584e-05, + "loss": 0.7636, + "step": 1931 + }, + { + "epoch": 0.22909996442547137, + "grad_norm": 1.7636490481249374, + "learning_rate": 4.990252877750727e-05, + "loss": 1.1825, + "step": 1932 + }, + { + "epoch": 0.22921854618759635, + "grad_norm": 1.8783997288101326, + "learning_rate": 4.9902316906558176e-05, + "loss": 1.1034, + "step": 1933 + }, + { + "epoch": 0.22933712794972133, + "grad_norm": 1.7742946874538126, + "learning_rate": 4.990210480604026e-05, + "loss": 1.3336, + "step": 1934 + }, + { + "epoch": 0.2294557097118463, + "grad_norm": 1.7007761766981013, + "learning_rate": 4.990189247595547e-05, + "loss": 1.0209, + "step": 1935 + }, + { + "epoch": 0.2295742914739713, + "grad_norm": 1.4748321185817905, + "learning_rate": 4.990167991630577e-05, + "loss": 1.0802, + "step": 1936 + }, + { + "epoch": 0.2296928732360963, + "grad_norm": 1.591573942222844, + "learning_rate": 4.990146712709311e-05, + "loss": 0.9398, + "step": 1937 + }, + { + "epoch": 0.22981145499822128, + "grad_norm": 1.806026602602224, + "learning_rate": 4.990125410831947e-05, + "loss": 1.2122, + "step": 1938 + }, + { + "epoch": 0.22993003676034626, + "grad_norm": 1.5119910869947426, + "learning_rate": 4.99010408599868e-05, + "loss": 0.8531, + "step": 1939 + }, + { + "epoch": 0.23004861852247124, + "grad_norm": 1.450478371624302, + "learning_rate": 4.990082738209707e-05, + "loss": 1.0147, + "step": 1940 + }, + { + "epoch": 0.23016720028459622, + "grad_norm": 1.7400018832520756, + "learning_rate": 4.990061367465225e-05, + "loss": 0.9817, + "step": 1941 + }, + { + "epoch": 0.2302857820467212, + "grad_norm": 1.7258653564423474, + "learning_rate": 4.99003997376543e-05, + "loss": 1.0653, + "step": 1942 + }, + { + "epoch": 0.2304043638088462, + "grad_norm": 1.5900747720824306, + "learning_rate": 4.990018557110521e-05, + "loss": 0.9803, + "step": 1943 + }, + { + "epoch": 0.2305229455709712, + "grad_norm": 1.664635743731406, + "learning_rate": 4.9899971175006935e-05, + "loss": 1.0196, + "step": 1944 + }, + { + "epoch": 0.23064152733309617, + "grad_norm": 1.7589702915611536, + "learning_rate": 4.989975654936145e-05, + "loss": 1.0616, + "step": 1945 + }, + { + "epoch": 0.23076010909522116, + "grad_norm": 1.6676224590932205, + "learning_rate": 4.989954169417076e-05, + "loss": 1.0653, + "step": 1946 + }, + { + "epoch": 0.23087869085734614, + "grad_norm": 1.7437219448372143, + "learning_rate": 4.989932660943683e-05, + "loss": 0.9015, + "step": 1947 + }, + { + "epoch": 0.23099727261947112, + "grad_norm": 1.7960057698934622, + "learning_rate": 4.989911129516163e-05, + "loss": 1.116, + "step": 1948 + }, + { + "epoch": 0.2311158543815961, + "grad_norm": 1.742686596708882, + "learning_rate": 4.989889575134716e-05, + "loss": 1.0465, + "step": 1949 + }, + { + "epoch": 0.2312344361437211, + "grad_norm": 1.621432342286824, + "learning_rate": 4.989867997799541e-05, + "loss": 1.0201, + "step": 1950 + }, + { + "epoch": 0.2313530179058461, + "grad_norm": 1.5817729674707892, + "learning_rate": 4.989846397510836e-05, + "loss": 1.0506, + "step": 1951 + }, + { + "epoch": 0.23147159966797107, + "grad_norm": 1.6586796590566955, + "learning_rate": 4.989824774268801e-05, + "loss": 1.1147, + "step": 1952 + }, + { + "epoch": 0.23159018143009605, + "grad_norm": 1.4491229999935904, + "learning_rate": 4.989803128073634e-05, + "loss": 1.0071, + "step": 1953 + }, + { + "epoch": 0.23170876319222103, + "grad_norm": 1.5262658252815045, + "learning_rate": 4.9897814589255356e-05, + "loss": 1.0998, + "step": 1954 + }, + { + "epoch": 0.231827344954346, + "grad_norm": 1.5414346089595201, + "learning_rate": 4.9897597668247053e-05, + "loss": 0.8681, + "step": 1955 + }, + { + "epoch": 0.23194592671647102, + "grad_norm": 1.6120838818867245, + "learning_rate": 4.9897380517713435e-05, + "loss": 0.886, + "step": 1956 + }, + { + "epoch": 0.232064508478596, + "grad_norm": 1.4884754413795882, + "learning_rate": 4.98971631376565e-05, + "loss": 0.9615, + "step": 1957 + }, + { + "epoch": 0.23218309024072098, + "grad_norm": 1.6553146961222758, + "learning_rate": 4.989694552807825e-05, + "loss": 0.9295, + "step": 1958 + }, + { + "epoch": 0.23230167200284596, + "grad_norm": 1.5886543437484113, + "learning_rate": 4.9896727688980685e-05, + "loss": 1.1488, + "step": 1959 + }, + { + "epoch": 0.23242025376497094, + "grad_norm": 1.4116963124391961, + "learning_rate": 4.9896509620365837e-05, + "loss": 0.7749, + "step": 1960 + }, + { + "epoch": 0.23253883552709592, + "grad_norm": 1.6624504635704234, + "learning_rate": 4.989629132223569e-05, + "loss": 0.9358, + "step": 1961 + }, + { + "epoch": 0.23265741728922093, + "grad_norm": 1.5117295325039906, + "learning_rate": 4.9896072794592265e-05, + "loss": 0.7922, + "step": 1962 + }, + { + "epoch": 0.2327759990513459, + "grad_norm": 1.563330494149231, + "learning_rate": 4.989585403743758e-05, + "loss": 0.8102, + "step": 1963 + }, + { + "epoch": 0.2328945808134709, + "grad_norm": 1.8466863593704017, + "learning_rate": 4.9895635050773655e-05, + "loss": 1.0596, + "step": 1964 + }, + { + "epoch": 0.23301316257559587, + "grad_norm": 1.6115387898798983, + "learning_rate": 4.98954158346025e-05, + "loss": 1.082, + "step": 1965 + }, + { + "epoch": 0.23313174433772085, + "grad_norm": 1.4355556888688414, + "learning_rate": 4.9895196388926134e-05, + "loss": 0.9935, + "step": 1966 + }, + { + "epoch": 0.23325032609984583, + "grad_norm": 1.6208885503856885, + "learning_rate": 4.98949767137466e-05, + "loss": 1.0307, + "step": 1967 + }, + { + "epoch": 0.23336890786197084, + "grad_norm": 1.4891803992804684, + "learning_rate": 4.98947568090659e-05, + "loss": 0.8513, + "step": 1968 + }, + { + "epoch": 0.23348748962409582, + "grad_norm": 1.6014904889871437, + "learning_rate": 4.989453667488607e-05, + "loss": 0.7923, + "step": 1969 + }, + { + "epoch": 0.2336060713862208, + "grad_norm": 1.5880776397570076, + "learning_rate": 4.9894316311209146e-05, + "loss": 0.9867, + "step": 1970 + }, + { + "epoch": 0.23372465314834578, + "grad_norm": 1.688895651743746, + "learning_rate": 4.989409571803714e-05, + "loss": 0.8132, + "step": 1971 + }, + { + "epoch": 0.23384323491047077, + "grad_norm": 1.6150611764032206, + "learning_rate": 4.9893874895372104e-05, + "loss": 1.1188, + "step": 1972 + }, + { + "epoch": 0.23396181667259575, + "grad_norm": 1.5076617450993473, + "learning_rate": 4.989365384321608e-05, + "loss": 0.9619, + "step": 1973 + }, + { + "epoch": 0.23408039843472073, + "grad_norm": 1.7063303632871143, + "learning_rate": 4.989343256157109e-05, + "loss": 0.9928, + "step": 1974 + }, + { + "epoch": 0.23419898019684574, + "grad_norm": 1.9220408091723056, + "learning_rate": 4.989321105043917e-05, + "loss": 1.143, + "step": 1975 + }, + { + "epoch": 0.23431756195897072, + "grad_norm": 1.6124276217819924, + "learning_rate": 4.9892989309822366e-05, + "loss": 1.1244, + "step": 1976 + }, + { + "epoch": 0.2344361437210957, + "grad_norm": 1.5301767942807971, + "learning_rate": 4.989276733972274e-05, + "loss": 1.0421, + "step": 1977 + }, + { + "epoch": 0.23455472548322068, + "grad_norm": 1.654123751710572, + "learning_rate": 4.989254514014231e-05, + "loss": 1.0686, + "step": 1978 + }, + { + "epoch": 0.23467330724534566, + "grad_norm": 1.614887529451615, + "learning_rate": 4.989232271108315e-05, + "loss": 1.0473, + "step": 1979 + }, + { + "epoch": 0.23479188900747064, + "grad_norm": 1.542641082392051, + "learning_rate": 4.98921000525473e-05, + "loss": 0.8383, + "step": 1980 + }, + { + "epoch": 0.23491047076959565, + "grad_norm": 1.5262365828713016, + "learning_rate": 4.989187716453681e-05, + "loss": 1.0046, + "step": 1981 + }, + { + "epoch": 0.23502905253172063, + "grad_norm": 1.53292325365757, + "learning_rate": 4.989165404705374e-05, + "loss": 0.8342, + "step": 1982 + }, + { + "epoch": 0.2351476342938456, + "grad_norm": 1.3475749264992607, + "learning_rate": 4.989143070010014e-05, + "loss": 0.9374, + "step": 1983 + }, + { + "epoch": 0.2352662160559706, + "grad_norm": 1.6221715921766138, + "learning_rate": 4.9891207123678066e-05, + "loss": 1.1065, + "step": 1984 + }, + { + "epoch": 0.23538479781809557, + "grad_norm": 1.4483109153226144, + "learning_rate": 4.98909833177896e-05, + "loss": 0.8322, + "step": 1985 + }, + { + "epoch": 0.23550337958022055, + "grad_norm": 1.5834207187261131, + "learning_rate": 4.989075928243678e-05, + "loss": 1.2424, + "step": 1986 + }, + { + "epoch": 0.23562196134234556, + "grad_norm": 1.6574874702700058, + "learning_rate": 4.989053501762169e-05, + "loss": 0.9054, + "step": 1987 + }, + { + "epoch": 0.23574054310447054, + "grad_norm": 1.4067236059962838, + "learning_rate": 4.989031052334639e-05, + "loss": 1.0597, + "step": 1988 + }, + { + "epoch": 0.23585912486659552, + "grad_norm": 1.6466900317681978, + "learning_rate": 4.989008579961294e-05, + "loss": 1.0568, + "step": 1989 + }, + { + "epoch": 0.2359777066287205, + "grad_norm": 1.3927862769275399, + "learning_rate": 4.9889860846423424e-05, + "loss": 1.0058, + "step": 1990 + }, + { + "epoch": 0.23609628839084548, + "grad_norm": 1.3822215912561235, + "learning_rate": 4.9889635663779924e-05, + "loss": 0.8841, + "step": 1991 + }, + { + "epoch": 0.23621487015297046, + "grad_norm": 1.4574827504673697, + "learning_rate": 4.988941025168449e-05, + "loss": 0.6813, + "step": 1992 + }, + { + "epoch": 0.23633345191509547, + "grad_norm": 1.719004768181018, + "learning_rate": 4.9889184610139224e-05, + "loss": 0.9685, + "step": 1993 + }, + { + "epoch": 0.23645203367722045, + "grad_norm": 2.0527458524994926, + "learning_rate": 4.98889587391462e-05, + "loss": 1.1746, + "step": 1994 + }, + { + "epoch": 0.23657061543934543, + "grad_norm": 1.4936699647859821, + "learning_rate": 4.988873263870749e-05, + "loss": 0.9448, + "step": 1995 + }, + { + "epoch": 0.2366891972014704, + "grad_norm": 1.4439063670962893, + "learning_rate": 4.988850630882518e-05, + "loss": 0.977, + "step": 1996 + }, + { + "epoch": 0.2368077789635954, + "grad_norm": 1.419087140707462, + "learning_rate": 4.9888279749501376e-05, + "loss": 0.9751, + "step": 1997 + }, + { + "epoch": 0.23692636072572038, + "grad_norm": 1.4273422518297376, + "learning_rate": 4.988805296073814e-05, + "loss": 1.1365, + "step": 1998 + }, + { + "epoch": 0.23704494248784536, + "grad_norm": 1.453329512122131, + "learning_rate": 4.988782594253759e-05, + "loss": 0.844, + "step": 1999 + }, + { + "epoch": 0.23716352424997036, + "grad_norm": 1.4305247347925087, + "learning_rate": 4.988759869490179e-05, + "loss": 1.0306, + "step": 2000 + }, + { + "epoch": 0.23728210601209535, + "grad_norm": 1.5011974457556396, + "learning_rate": 4.988737121783286e-05, + "loss": 0.9683, + "step": 2001 + }, + { + "epoch": 0.23740068777422033, + "grad_norm": 1.6997714885769597, + "learning_rate": 4.988714351133288e-05, + "loss": 1.0922, + "step": 2002 + }, + { + "epoch": 0.2375192695363453, + "grad_norm": 1.6200599655597099, + "learning_rate": 4.988691557540396e-05, + "loss": 0.9344, + "step": 2003 + }, + { + "epoch": 0.2376378512984703, + "grad_norm": 1.7060622861567702, + "learning_rate": 4.988668741004819e-05, + "loss": 0.9164, + "step": 2004 + }, + { + "epoch": 0.23775643306059527, + "grad_norm": 1.5158364073916335, + "learning_rate": 4.9886459015267686e-05, + "loss": 1.0081, + "step": 2005 + }, + { + "epoch": 0.23787501482272028, + "grad_norm": 1.5915494494909135, + "learning_rate": 4.9886230391064546e-05, + "loss": 0.9278, + "step": 2006 + }, + { + "epoch": 0.23799359658484526, + "grad_norm": 1.537703096356776, + "learning_rate": 4.988600153744088e-05, + "loss": 1.0062, + "step": 2007 + }, + { + "epoch": 0.23811217834697024, + "grad_norm": 1.4974768986296876, + "learning_rate": 4.988577245439879e-05, + "loss": 1.1319, + "step": 2008 + }, + { + "epoch": 0.23823076010909522, + "grad_norm": 1.351841212945489, + "learning_rate": 4.988554314194041e-05, + "loss": 0.9002, + "step": 2009 + }, + { + "epoch": 0.2383493418712202, + "grad_norm": 1.5079069125205833, + "learning_rate": 4.988531360006783e-05, + "loss": 0.9929, + "step": 2010 + }, + { + "epoch": 0.23846792363334518, + "grad_norm": 1.6035690337295347, + "learning_rate": 4.9885083828783177e-05, + "loss": 0.8551, + "step": 2011 + }, + { + "epoch": 0.2385865053954702, + "grad_norm": 1.6362171810052264, + "learning_rate": 4.988485382808856e-05, + "loss": 1.0243, + "step": 2012 + }, + { + "epoch": 0.23870508715759517, + "grad_norm": 1.7725984468404168, + "learning_rate": 4.9884623597986114e-05, + "loss": 1.0561, + "step": 2013 + }, + { + "epoch": 0.23882366891972015, + "grad_norm": 1.6036716667503024, + "learning_rate": 4.988439313847795e-05, + "loss": 1.0867, + "step": 2014 + }, + { + "epoch": 0.23894225068184513, + "grad_norm": 1.4178518793455746, + "learning_rate": 4.9884162449566195e-05, + "loss": 0.7159, + "step": 2015 + }, + { + "epoch": 0.2390608324439701, + "grad_norm": 1.3227277049945634, + "learning_rate": 4.988393153125298e-05, + "loss": 0.7813, + "step": 2016 + }, + { + "epoch": 0.2391794142060951, + "grad_norm": 1.4046718367654667, + "learning_rate": 4.988370038354043e-05, + "loss": 0.9646, + "step": 2017 + }, + { + "epoch": 0.2392979959682201, + "grad_norm": 1.8928861731811264, + "learning_rate": 4.988346900643068e-05, + "loss": 0.9578, + "step": 2018 + }, + { + "epoch": 0.23941657773034508, + "grad_norm": 1.6294544528772894, + "learning_rate": 4.9883237399925866e-05, + "loss": 1.0142, + "step": 2019 + }, + { + "epoch": 0.23953515949247006, + "grad_norm": 1.66823103934097, + "learning_rate": 4.988300556402811e-05, + "loss": 1.0936, + "step": 2020 + }, + { + "epoch": 0.23965374125459504, + "grad_norm": 1.3626170809403297, + "learning_rate": 4.9882773498739554e-05, + "loss": 0.7495, + "step": 2021 + }, + { + "epoch": 0.23977232301672002, + "grad_norm": 1.6676941995618688, + "learning_rate": 4.988254120406234e-05, + "loss": 0.8901, + "step": 2022 + }, + { + "epoch": 0.239890904778845, + "grad_norm": 1.4399585787567406, + "learning_rate": 4.988230867999861e-05, + "loss": 0.9626, + "step": 2023 + }, + { + "epoch": 0.24000948654096999, + "grad_norm": 1.5074320076237988, + "learning_rate": 4.988207592655051e-05, + "loss": 0.921, + "step": 2024 + }, + { + "epoch": 0.240128068303095, + "grad_norm": 1.6650399626345913, + "learning_rate": 4.9881842943720175e-05, + "loss": 0.9947, + "step": 2025 + }, + { + "epoch": 0.24024665006521997, + "grad_norm": 1.2800387119811232, + "learning_rate": 4.988160973150976e-05, + "loss": 0.7056, + "step": 2026 + }, + { + "epoch": 0.24036523182734496, + "grad_norm": 1.5124841667699283, + "learning_rate": 4.988137628992142e-05, + "loss": 0.9348, + "step": 2027 + }, + { + "epoch": 0.24048381358946994, + "grad_norm": 1.5476100031479807, + "learning_rate": 4.9881142618957306e-05, + "loss": 1.0174, + "step": 2028 + }, + { + "epoch": 0.24060239535159492, + "grad_norm": 1.506464539576529, + "learning_rate": 4.988090871861956e-05, + "loss": 0.7834, + "step": 2029 + }, + { + "epoch": 0.2407209771137199, + "grad_norm": 1.429115642549729, + "learning_rate": 4.988067458891035e-05, + "loss": 1.1353, + "step": 2030 + }, + { + "epoch": 0.2408395588758449, + "grad_norm": 1.3947657879611741, + "learning_rate": 4.988044022983184e-05, + "loss": 0.9847, + "step": 2031 + }, + { + "epoch": 0.2409581406379699, + "grad_norm": 1.6371821895710357, + "learning_rate": 4.9880205641386166e-05, + "loss": 1.1881, + "step": 2032 + }, + { + "epoch": 0.24107672240009487, + "grad_norm": 1.5567591383339119, + "learning_rate": 4.9879970823575516e-05, + "loss": 0.8566, + "step": 2033 + }, + { + "epoch": 0.24119530416221985, + "grad_norm": 1.5132029475982507, + "learning_rate": 4.987973577640204e-05, + "loss": 1.0845, + "step": 2034 + }, + { + "epoch": 0.24131388592434483, + "grad_norm": 1.685839343053115, + "learning_rate": 4.987950049986792e-05, + "loss": 0.9758, + "step": 2035 + }, + { + "epoch": 0.2414324676864698, + "grad_norm": 1.7948089256830078, + "learning_rate": 4.98792649939753e-05, + "loss": 0.9953, + "step": 2036 + }, + { + "epoch": 0.24155104944859482, + "grad_norm": 1.5236375384391327, + "learning_rate": 4.987902925872637e-05, + "loss": 1.1155, + "step": 2037 + }, + { + "epoch": 0.2416696312107198, + "grad_norm": 1.9413252307354085, + "learning_rate": 4.98787932941233e-05, + "loss": 1.1305, + "step": 2038 + }, + { + "epoch": 0.24178821297284478, + "grad_norm": 1.5927993451224265, + "learning_rate": 4.987855710016827e-05, + "loss": 0.8815, + "step": 2039 + }, + { + "epoch": 0.24190679473496976, + "grad_norm": 1.5541187183107388, + "learning_rate": 4.9878320676863447e-05, + "loss": 0.9947, + "step": 2040 + }, + { + "epoch": 0.24202537649709474, + "grad_norm": 1.7437707027452671, + "learning_rate": 4.9878084024211016e-05, + "loss": 1.1141, + "step": 2041 + }, + { + "epoch": 0.24214395825921972, + "grad_norm": 1.3092698415644695, + "learning_rate": 4.987784714221315e-05, + "loss": 0.6535, + "step": 2042 + }, + { + "epoch": 0.24226254002134473, + "grad_norm": 1.4673804816217528, + "learning_rate": 4.987761003087205e-05, + "loss": 0.8086, + "step": 2043 + }, + { + "epoch": 0.2423811217834697, + "grad_norm": 1.611202688500416, + "learning_rate": 4.98773726901899e-05, + "loss": 1.1189, + "step": 2044 + }, + { + "epoch": 0.2424997035455947, + "grad_norm": 1.6697317423480866, + "learning_rate": 4.987713512016886e-05, + "loss": 0.9961, + "step": 2045 + }, + { + "epoch": 0.24261828530771967, + "grad_norm": 1.549985483044613, + "learning_rate": 4.9876897320811156e-05, + "loss": 0.9909, + "step": 2046 + }, + { + "epoch": 0.24273686706984465, + "grad_norm": 1.7362645097211575, + "learning_rate": 4.9876659292118954e-05, + "loss": 1.1183, + "step": 2047 + }, + { + "epoch": 0.24285544883196963, + "grad_norm": 1.3649091551872485, + "learning_rate": 4.9876421034094465e-05, + "loss": 0.9134, + "step": 2048 + }, + { + "epoch": 0.24297403059409464, + "grad_norm": 1.4132473796853338, + "learning_rate": 4.987618254673988e-05, + "loss": 0.9402, + "step": 2049 + }, + { + "epoch": 0.24309261235621962, + "grad_norm": 1.6130785263956027, + "learning_rate": 4.98759438300574e-05, + "loss": 0.9947, + "step": 2050 + }, + { + "epoch": 0.2432111941183446, + "grad_norm": 1.7264268441998956, + "learning_rate": 4.987570488404922e-05, + "loss": 0.9563, + "step": 2051 + }, + { + "epoch": 0.24332977588046958, + "grad_norm": 1.9545428216677283, + "learning_rate": 4.987546570871755e-05, + "loss": 1.0508, + "step": 2052 + }, + { + "epoch": 0.24344835764259457, + "grad_norm": 1.5359874623630718, + "learning_rate": 4.9875226304064585e-05, + "loss": 1.0449, + "step": 2053 + }, + { + "epoch": 0.24356693940471955, + "grad_norm": 1.4272986609242349, + "learning_rate": 4.9874986670092536e-05, + "loss": 1.0308, + "step": 2054 + }, + { + "epoch": 0.24368552116684453, + "grad_norm": 1.529774285897543, + "learning_rate": 4.987474680680361e-05, + "loss": 0.8353, + "step": 2055 + }, + { + "epoch": 0.24380410292896953, + "grad_norm": 1.5517482498026967, + "learning_rate": 4.987450671420003e-05, + "loss": 0.8764, + "step": 2056 + }, + { + "epoch": 0.24392268469109452, + "grad_norm": 1.894789000668694, + "learning_rate": 4.9874266392283995e-05, + "loss": 0.9849, + "step": 2057 + }, + { + "epoch": 0.2440412664532195, + "grad_norm": 1.378135361945518, + "learning_rate": 4.987402584105774e-05, + "loss": 0.7118, + "step": 2058 + }, + { + "epoch": 0.24415984821534448, + "grad_norm": 2.154430703838357, + "learning_rate": 4.9873785060523454e-05, + "loss": 1.1367, + "step": 2059 + }, + { + "epoch": 0.24427842997746946, + "grad_norm": 1.8515920718017587, + "learning_rate": 4.987354405068337e-05, + "loss": 1.1248, + "step": 2060 + }, + { + "epoch": 0.24439701173959444, + "grad_norm": 1.419908808170865, + "learning_rate": 4.987330281153973e-05, + "loss": 1.0317, + "step": 2061 + }, + { + "epoch": 0.24451559350171945, + "grad_norm": 1.5629243897126097, + "learning_rate": 4.987306134309473e-05, + "loss": 0.7564, + "step": 2062 + }, + { + "epoch": 0.24463417526384443, + "grad_norm": 1.7219430406112892, + "learning_rate": 4.98728196453506e-05, + "loss": 1.0869, + "step": 2063 + }, + { + "epoch": 0.2447527570259694, + "grad_norm": 1.3396006974954793, + "learning_rate": 4.987257771830958e-05, + "loss": 0.6436, + "step": 2064 + }, + { + "epoch": 0.2448713387880944, + "grad_norm": 1.6507259608009928, + "learning_rate": 4.987233556197389e-05, + "loss": 0.9214, + "step": 2065 + }, + { + "epoch": 0.24498992055021937, + "grad_norm": 1.7201876753334069, + "learning_rate": 4.987209317634577e-05, + "loss": 1.2168, + "step": 2066 + }, + { + "epoch": 0.24510850231234435, + "grad_norm": 1.6582412831686764, + "learning_rate": 4.987185056142745e-05, + "loss": 1.1449, + "step": 2067 + }, + { + "epoch": 0.24522708407446936, + "grad_norm": 1.5572669492435904, + "learning_rate": 4.987160771722117e-05, + "loss": 0.9175, + "step": 2068 + }, + { + "epoch": 0.24534566583659434, + "grad_norm": 1.6561084514887285, + "learning_rate": 4.987136464372917e-05, + "loss": 0.883, + "step": 2069 + }, + { + "epoch": 0.24546424759871932, + "grad_norm": 2.427111843035678, + "learning_rate": 4.9871121340953674e-05, + "loss": 1.2025, + "step": 2070 + }, + { + "epoch": 0.2455828293608443, + "grad_norm": 1.6293295543066901, + "learning_rate": 4.987087780889695e-05, + "loss": 1.0961, + "step": 2071 + }, + { + "epoch": 0.24570141112296928, + "grad_norm": 1.7243286199876149, + "learning_rate": 4.987063404756123e-05, + "loss": 1.1328, + "step": 2072 + }, + { + "epoch": 0.24581999288509426, + "grad_norm": 1.336293364418295, + "learning_rate": 4.987039005694876e-05, + "loss": 0.7303, + "step": 2073 + }, + { + "epoch": 0.24593857464721927, + "grad_norm": 1.335171807715241, + "learning_rate": 4.987014583706179e-05, + "loss": 0.9421, + "step": 2074 + }, + { + "epoch": 0.24605715640934425, + "grad_norm": 1.5627468303837904, + "learning_rate": 4.986990138790257e-05, + "loss": 1.1774, + "step": 2075 + }, + { + "epoch": 0.24617573817146923, + "grad_norm": 1.5014928735012183, + "learning_rate": 4.986965670947337e-05, + "loss": 0.9542, + "step": 2076 + }, + { + "epoch": 0.2462943199335942, + "grad_norm": 1.3527540617191718, + "learning_rate": 4.986941180177642e-05, + "loss": 1.0114, + "step": 2077 + }, + { + "epoch": 0.2464129016957192, + "grad_norm": 1.2005745079586585, + "learning_rate": 4.9869166664813996e-05, + "loss": 0.911, + "step": 2078 + }, + { + "epoch": 0.24653148345784417, + "grad_norm": 1.4295014467282676, + "learning_rate": 4.986892129858835e-05, + "loss": 1.0468, + "step": 2079 + }, + { + "epoch": 0.24665006521996916, + "grad_norm": 1.5074075350836313, + "learning_rate": 4.986867570310174e-05, + "loss": 0.9018, + "step": 2080 + }, + { + "epoch": 0.24676864698209416, + "grad_norm": 1.492128442222448, + "learning_rate": 4.986842987835645e-05, + "loss": 1.0957, + "step": 2081 + }, + { + "epoch": 0.24688722874421914, + "grad_norm": 1.505625909657182, + "learning_rate": 4.986818382435472e-05, + "loss": 1.041, + "step": 2082 + }, + { + "epoch": 0.24700581050634413, + "grad_norm": 1.417385103898961, + "learning_rate": 4.9867937541098835e-05, + "loss": 0.9335, + "step": 2083 + }, + { + "epoch": 0.2471243922684691, + "grad_norm": 1.373392156625882, + "learning_rate": 4.9867691028591054e-05, + "loss": 1.0242, + "step": 2084 + }, + { + "epoch": 0.2472429740305941, + "grad_norm": 1.636374582256655, + "learning_rate": 4.986744428683367e-05, + "loss": 0.9774, + "step": 2085 + }, + { + "epoch": 0.24736155579271907, + "grad_norm": 1.7366743252086316, + "learning_rate": 4.986719731582894e-05, + "loss": 1.1063, + "step": 2086 + }, + { + "epoch": 0.24748013755484408, + "grad_norm": 1.7257485946559759, + "learning_rate": 4.9866950115579135e-05, + "loss": 0.8362, + "step": 2087 + }, + { + "epoch": 0.24759871931696906, + "grad_norm": 1.8215115838072902, + "learning_rate": 4.986670268608655e-05, + "loss": 1.0129, + "step": 2088 + }, + { + "epoch": 0.24771730107909404, + "grad_norm": 1.5893486207755414, + "learning_rate": 4.986645502735346e-05, + "loss": 1.0106, + "step": 2089 + }, + { + "epoch": 0.24783588284121902, + "grad_norm": 1.596333336718307, + "learning_rate": 4.986620713938215e-05, + "loss": 1.0782, + "step": 2090 + }, + { + "epoch": 0.247954464603344, + "grad_norm": 1.645355479943473, + "learning_rate": 4.986595902217491e-05, + "loss": 1.1981, + "step": 2091 + }, + { + "epoch": 0.24807304636546898, + "grad_norm": 1.5005414462213689, + "learning_rate": 4.986571067573401e-05, + "loss": 1.1614, + "step": 2092 + }, + { + "epoch": 0.248191628127594, + "grad_norm": 1.6092147081247734, + "learning_rate": 4.986546210006175e-05, + "loss": 1.0217, + "step": 2093 + }, + { + "epoch": 0.24831020988971897, + "grad_norm": 1.5854139290624074, + "learning_rate": 4.986521329516043e-05, + "loss": 0.9246, + "step": 2094 + }, + { + "epoch": 0.24842879165184395, + "grad_norm": 1.4527272338093522, + "learning_rate": 4.986496426103232e-05, + "loss": 0.8635, + "step": 2095 + }, + { + "epoch": 0.24854737341396893, + "grad_norm": 1.4962003507951074, + "learning_rate": 4.986471499767974e-05, + "loss": 1.0938, + "step": 2096 + }, + { + "epoch": 0.2486659551760939, + "grad_norm": 1.4057909464966287, + "learning_rate": 4.9864465505104985e-05, + "loss": 1.0784, + "step": 2097 + }, + { + "epoch": 0.2487845369382189, + "grad_norm": 1.6055504181451705, + "learning_rate": 4.9864215783310344e-05, + "loss": 0.9935, + "step": 2098 + }, + { + "epoch": 0.2489031187003439, + "grad_norm": 1.4699657177818262, + "learning_rate": 4.986396583229812e-05, + "loss": 0.6405, + "step": 2099 + }, + { + "epoch": 0.24902170046246888, + "grad_norm": 1.376464715833646, + "learning_rate": 4.986371565207062e-05, + "loss": 0.9561, + "step": 2100 + }, + { + "epoch": 0.24914028222459386, + "grad_norm": 1.5723865013643497, + "learning_rate": 4.986346524263016e-05, + "loss": 0.8234, + "step": 2101 + }, + { + "epoch": 0.24925886398671884, + "grad_norm": 1.2630350121290737, + "learning_rate": 4.986321460397904e-05, + "loss": 0.7528, + "step": 2102 + }, + { + "epoch": 0.24937744574884382, + "grad_norm": 1.4848277535306345, + "learning_rate": 4.986296373611956e-05, + "loss": 0.7253, + "step": 2103 + }, + { + "epoch": 0.2494960275109688, + "grad_norm": 1.6589826827990275, + "learning_rate": 4.986271263905405e-05, + "loss": 0.9045, + "step": 2104 + }, + { + "epoch": 0.24961460927309378, + "grad_norm": 1.6055563779007433, + "learning_rate": 4.986246131278481e-05, + "loss": 0.9267, + "step": 2105 + }, + { + "epoch": 0.2497331910352188, + "grad_norm": 1.6166123649988795, + "learning_rate": 4.9862209757314184e-05, + "loss": 1.118, + "step": 2106 + }, + { + "epoch": 0.24985177279734377, + "grad_norm": 1.5887712600318746, + "learning_rate": 4.986195797264446e-05, + "loss": 1.0529, + "step": 2107 + }, + { + "epoch": 0.24997035455946875, + "grad_norm": 1.6692519959274708, + "learning_rate": 4.986170595877797e-05, + "loss": 0.9421, + "step": 2108 + }, + { + "epoch": 0.25008893632159374, + "grad_norm": 1.7195944350620005, + "learning_rate": 4.986145371571704e-05, + "loss": 1.2998, + "step": 2109 + }, + { + "epoch": 0.2502075180837187, + "grad_norm": 1.4158535188409478, + "learning_rate": 4.9861201243463994e-05, + "loss": 0.9264, + "step": 2110 + }, + { + "epoch": 0.2503260998458437, + "grad_norm": 1.395510290906992, + "learning_rate": 4.986094854202116e-05, + "loss": 0.834, + "step": 2111 + }, + { + "epoch": 0.2504446816079687, + "grad_norm": 1.445087878954799, + "learning_rate": 4.986069561139086e-05, + "loss": 0.9971, + "step": 2112 + }, + { + "epoch": 0.25056326337009366, + "grad_norm": 1.6208342599758634, + "learning_rate": 4.986044245157544e-05, + "loss": 0.8777, + "step": 2113 + }, + { + "epoch": 0.25068184513221864, + "grad_norm": 1.6055488234224757, + "learning_rate": 4.986018906257723e-05, + "loss": 0.942, + "step": 2114 + }, + { + "epoch": 0.2508004268943437, + "grad_norm": 1.7560188830725787, + "learning_rate": 4.9859935444398556e-05, + "loss": 1.0793, + "step": 2115 + }, + { + "epoch": 0.25091900865646866, + "grad_norm": 1.6224173383291136, + "learning_rate": 4.9859681597041765e-05, + "loss": 1.0261, + "step": 2116 + }, + { + "epoch": 0.25103759041859364, + "grad_norm": 1.4325652978105656, + "learning_rate": 4.985942752050919e-05, + "loss": 0.9031, + "step": 2117 + }, + { + "epoch": 0.2511561721807186, + "grad_norm": 1.7606896343631897, + "learning_rate": 4.985917321480319e-05, + "loss": 1.1023, + "step": 2118 + }, + { + "epoch": 0.2512747539428436, + "grad_norm": 1.853382170939425, + "learning_rate": 4.985891867992609e-05, + "loss": 0.9167, + "step": 2119 + }, + { + "epoch": 0.2513933357049686, + "grad_norm": 1.6116968993821403, + "learning_rate": 4.985866391588024e-05, + "loss": 0.723, + "step": 2120 + }, + { + "epoch": 0.25151191746709356, + "grad_norm": 1.8416582616185058, + "learning_rate": 4.9858408922668e-05, + "loss": 1.069, + "step": 2121 + }, + { + "epoch": 0.25163049922921854, + "grad_norm": 1.830199379315522, + "learning_rate": 4.985815370029171e-05, + "loss": 1.0778, + "step": 2122 + }, + { + "epoch": 0.2517490809913435, + "grad_norm": 1.59336869913217, + "learning_rate": 4.985789824875372e-05, + "loss": 0.6965, + "step": 2123 + }, + { + "epoch": 0.2518676627534685, + "grad_norm": 1.5201708003214887, + "learning_rate": 4.9857642568056395e-05, + "loss": 0.9348, + "step": 2124 + }, + { + "epoch": 0.2519862445155935, + "grad_norm": 1.7073867745960254, + "learning_rate": 4.9857386658202086e-05, + "loss": 1.0733, + "step": 2125 + }, + { + "epoch": 0.25210482627771846, + "grad_norm": 1.682780684489853, + "learning_rate": 4.9857130519193164e-05, + "loss": 0.9605, + "step": 2126 + }, + { + "epoch": 0.2522234080398435, + "grad_norm": 2.0031707997058237, + "learning_rate": 4.985687415103197e-05, + "loss": 1.0503, + "step": 2127 + }, + { + "epoch": 0.2523419898019685, + "grad_norm": 1.6097499813200011, + "learning_rate": 4.985661755372088e-05, + "loss": 1.0184, + "step": 2128 + }, + { + "epoch": 0.25246057156409346, + "grad_norm": 1.7698967588309922, + "learning_rate": 4.9856360727262255e-05, + "loss": 1.0644, + "step": 2129 + }, + { + "epoch": 0.25257915332621844, + "grad_norm": 1.627925059741286, + "learning_rate": 4.985610367165847e-05, + "loss": 0.9859, + "step": 2130 + }, + { + "epoch": 0.2526977350883434, + "grad_norm": 1.5307603267014067, + "learning_rate": 4.985584638691189e-05, + "loss": 1.0008, + "step": 2131 + }, + { + "epoch": 0.2528163168504684, + "grad_norm": 1.6600232641719685, + "learning_rate": 4.985558887302488e-05, + "loss": 0.9726, + "step": 2132 + }, + { + "epoch": 0.2529348986125934, + "grad_norm": 1.540626989422014, + "learning_rate": 4.985533112999983e-05, + "loss": 0.7683, + "step": 2133 + }, + { + "epoch": 0.25305348037471836, + "grad_norm": 1.6232508751271337, + "learning_rate": 4.9855073157839104e-05, + "loss": 0.9407, + "step": 2134 + }, + { + "epoch": 0.25317206213684335, + "grad_norm": 1.492004022909737, + "learning_rate": 4.985481495654508e-05, + "loss": 0.7443, + "step": 2135 + }, + { + "epoch": 0.2532906438989683, + "grad_norm": 1.7337049234465745, + "learning_rate": 4.9854556526120144e-05, + "loss": 1.1503, + "step": 2136 + }, + { + "epoch": 0.2534092256610933, + "grad_norm": 1.7877832683475772, + "learning_rate": 4.9854297866566665e-05, + "loss": 1.1425, + "step": 2137 + }, + { + "epoch": 0.2535278074232183, + "grad_norm": 1.7253199632214957, + "learning_rate": 4.9854038977887054e-05, + "loss": 1.0926, + "step": 2138 + }, + { + "epoch": 0.25364638918534327, + "grad_norm": 1.73219076185296, + "learning_rate": 4.9853779860083675e-05, + "loss": 0.9113, + "step": 2139 + }, + { + "epoch": 0.2537649709474683, + "grad_norm": 1.7153471311367947, + "learning_rate": 4.985352051315892e-05, + "loss": 1.0412, + "step": 2140 + }, + { + "epoch": 0.2538835527095933, + "grad_norm": 1.6450974655983241, + "learning_rate": 4.985326093711519e-05, + "loss": 1.0801, + "step": 2141 + }, + { + "epoch": 0.25400213447171827, + "grad_norm": 1.5298974316076808, + "learning_rate": 4.985300113195486e-05, + "loss": 0.9512, + "step": 2142 + }, + { + "epoch": 0.25412071623384325, + "grad_norm": 1.543117649229392, + "learning_rate": 4.9852741097680344e-05, + "loss": 1.1495, + "step": 2143 + }, + { + "epoch": 0.2542392979959682, + "grad_norm": 1.4568123761158644, + "learning_rate": 4.985248083429403e-05, + "loss": 0.9383, + "step": 2144 + }, + { + "epoch": 0.2543578797580932, + "grad_norm": 1.5886075473869836, + "learning_rate": 4.985222034179832e-05, + "loss": 1.0387, + "step": 2145 + }, + { + "epoch": 0.2544764615202182, + "grad_norm": 1.4482718730262762, + "learning_rate": 4.985195962019561e-05, + "loss": 0.9691, + "step": 2146 + }, + { + "epoch": 0.25459504328234317, + "grad_norm": 1.643185622307714, + "learning_rate": 4.985169866948831e-05, + "loss": 0.7969, + "step": 2147 + }, + { + "epoch": 0.25471362504446815, + "grad_norm": 1.6215371189085541, + "learning_rate": 4.985143748967883e-05, + "loss": 0.8904, + "step": 2148 + }, + { + "epoch": 0.25483220680659313, + "grad_norm": 1.4569548540730763, + "learning_rate": 4.985117608076957e-05, + "loss": 0.9873, + "step": 2149 + }, + { + "epoch": 0.2549507885687181, + "grad_norm": 1.9108667612409593, + "learning_rate": 4.985091444276293e-05, + "loss": 1.144, + "step": 2150 + }, + { + "epoch": 0.2550693703308431, + "grad_norm": 1.683342170167743, + "learning_rate": 4.985065257566135e-05, + "loss": 1.2595, + "step": 2151 + }, + { + "epoch": 0.25518795209296813, + "grad_norm": 2.0167574601508518, + "learning_rate": 4.9850390479467214e-05, + "loss": 0.9175, + "step": 2152 + }, + { + "epoch": 0.2553065338550931, + "grad_norm": 1.709546124106927, + "learning_rate": 4.985012815418295e-05, + "loss": 1.0466, + "step": 2153 + }, + { + "epoch": 0.2554251156172181, + "grad_norm": 1.3743328469186065, + "learning_rate": 4.984986559981098e-05, + "loss": 0.9594, + "step": 2154 + }, + { + "epoch": 0.25554369737934307, + "grad_norm": 1.6006546101028003, + "learning_rate": 4.984960281635373e-05, + "loss": 1.0004, + "step": 2155 + }, + { + "epoch": 0.25566227914146805, + "grad_norm": 1.373543147012109, + "learning_rate": 4.984933980381361e-05, + "loss": 0.8517, + "step": 2156 + }, + { + "epoch": 0.25578086090359303, + "grad_norm": 1.4634271540663177, + "learning_rate": 4.9849076562193044e-05, + "loss": 0.8683, + "step": 2157 + }, + { + "epoch": 0.255899442665718, + "grad_norm": 1.5319474259217496, + "learning_rate": 4.984881309149447e-05, + "loss": 0.9806, + "step": 2158 + }, + { + "epoch": 0.256018024427843, + "grad_norm": 1.6519943761700993, + "learning_rate": 4.984854939172031e-05, + "loss": 0.8745, + "step": 2159 + }, + { + "epoch": 0.256136606189968, + "grad_norm": 1.5862225596842816, + "learning_rate": 4.984828546287299e-05, + "loss": 0.8198, + "step": 2160 + }, + { + "epoch": 0.25625518795209296, + "grad_norm": 1.488037554096196, + "learning_rate": 4.984802130495496e-05, + "loss": 0.8924, + "step": 2161 + }, + { + "epoch": 0.25637376971421794, + "grad_norm": 1.5924279560940893, + "learning_rate": 4.9847756917968637e-05, + "loss": 0.8875, + "step": 2162 + }, + { + "epoch": 0.2564923514763429, + "grad_norm": 1.5612196989023017, + "learning_rate": 4.984749230191646e-05, + "loss": 0.9875, + "step": 2163 + }, + { + "epoch": 0.2566109332384679, + "grad_norm": 1.5873992363771936, + "learning_rate": 4.984722745680089e-05, + "loss": 1.0515, + "step": 2164 + }, + { + "epoch": 0.25672951500059293, + "grad_norm": 1.5157166593132512, + "learning_rate": 4.984696238262434e-05, + "loss": 0.9316, + "step": 2165 + }, + { + "epoch": 0.2568480967627179, + "grad_norm": 1.5746688263250848, + "learning_rate": 4.984669707938927e-05, + "loss": 0.875, + "step": 2166 + }, + { + "epoch": 0.2569666785248429, + "grad_norm": 1.4016955373441873, + "learning_rate": 4.984643154709813e-05, + "loss": 0.8289, + "step": 2167 + }, + { + "epoch": 0.2570852602869679, + "grad_norm": 1.6548005907863415, + "learning_rate": 4.984616578575335e-05, + "loss": 0.9674, + "step": 2168 + }, + { + "epoch": 0.25720384204909286, + "grad_norm": 1.8031069859353175, + "learning_rate": 4.9845899795357385e-05, + "loss": 1.1634, + "step": 2169 + }, + { + "epoch": 0.25732242381121784, + "grad_norm": 1.546651082570925, + "learning_rate": 4.9845633575912696e-05, + "loss": 0.6455, + "step": 2170 + }, + { + "epoch": 0.2574410055733428, + "grad_norm": 1.3810770801903356, + "learning_rate": 4.984536712742174e-05, + "loss": 0.792, + "step": 2171 + }, + { + "epoch": 0.2575595873354678, + "grad_norm": 1.6841781699677776, + "learning_rate": 4.984510044988696e-05, + "loss": 1.1331, + "step": 2172 + }, + { + "epoch": 0.2576781690975928, + "grad_norm": 1.8543678668450854, + "learning_rate": 4.984483354331082e-05, + "loss": 1.0579, + "step": 2173 + }, + { + "epoch": 0.25779675085971776, + "grad_norm": 1.730530140611696, + "learning_rate": 4.984456640769579e-05, + "loss": 0.9951, + "step": 2174 + }, + { + "epoch": 0.25791533262184274, + "grad_norm": 1.5446571432852814, + "learning_rate": 4.984429904304432e-05, + "loss": 0.939, + "step": 2175 + }, + { + "epoch": 0.2580339143839677, + "grad_norm": 1.6214628783263327, + "learning_rate": 4.984403144935888e-05, + "loss": 1.0707, + "step": 2176 + }, + { + "epoch": 0.25815249614609276, + "grad_norm": 1.5239598462709665, + "learning_rate": 4.9843763626641924e-05, + "loss": 0.8749, + "step": 2177 + }, + { + "epoch": 0.25827107790821774, + "grad_norm": 1.6541707779285175, + "learning_rate": 4.984349557489595e-05, + "loss": 0.9682, + "step": 2178 + }, + { + "epoch": 0.2583896596703427, + "grad_norm": 1.5466005597786918, + "learning_rate": 4.98432272941234e-05, + "loss": 0.9338, + "step": 2179 + }, + { + "epoch": 0.2585082414324677, + "grad_norm": 1.3353576130769804, + "learning_rate": 4.984295878432677e-05, + "loss": 0.9302, + "step": 2180 + }, + { + "epoch": 0.2586268231945927, + "grad_norm": 1.678865232338637, + "learning_rate": 4.9842690045508514e-05, + "loss": 1.0057, + "step": 2181 + }, + { + "epoch": 0.25874540495671766, + "grad_norm": 1.4092464192696998, + "learning_rate": 4.984242107767112e-05, + "loss": 0.5636, + "step": 2182 + }, + { + "epoch": 0.25886398671884264, + "grad_norm": 1.3882276999778254, + "learning_rate": 4.9842151880817074e-05, + "loss": 0.7967, + "step": 2183 + }, + { + "epoch": 0.2589825684809676, + "grad_norm": 1.7965720480845428, + "learning_rate": 4.984188245494885e-05, + "loss": 0.9934, + "step": 2184 + }, + { + "epoch": 0.2591011502430926, + "grad_norm": 1.632711314953918, + "learning_rate": 4.984161280006893e-05, + "loss": 0.9033, + "step": 2185 + }, + { + "epoch": 0.2592197320052176, + "grad_norm": 1.3772573331821034, + "learning_rate": 4.98413429161798e-05, + "loss": 0.6674, + "step": 2186 + }, + { + "epoch": 0.25933831376734257, + "grad_norm": 1.4466555805588186, + "learning_rate": 4.9841072803283956e-05, + "loss": 0.8062, + "step": 2187 + }, + { + "epoch": 0.25945689552946755, + "grad_norm": 1.871384745228664, + "learning_rate": 4.9840802461383886e-05, + "loss": 1.0916, + "step": 2188 + }, + { + "epoch": 0.2595754772915925, + "grad_norm": 1.6990502069363669, + "learning_rate": 4.984053189048207e-05, + "loss": 0.9982, + "step": 2189 + }, + { + "epoch": 0.25969405905371756, + "grad_norm": 1.437708528667643, + "learning_rate": 4.984026109058102e-05, + "loss": 0.955, + "step": 2190 + }, + { + "epoch": 0.25981264081584254, + "grad_norm": 1.7859525097681908, + "learning_rate": 4.983999006168322e-05, + "loss": 0.7692, + "step": 2191 + }, + { + "epoch": 0.2599312225779675, + "grad_norm": 1.8483293806153176, + "learning_rate": 4.983971880379117e-05, + "loss": 1.1746, + "step": 2192 + }, + { + "epoch": 0.2600498043400925, + "grad_norm": 2.0288037142827964, + "learning_rate": 4.983944731690737e-05, + "loss": 0.907, + "step": 2193 + }, + { + "epoch": 0.2601683861022175, + "grad_norm": 1.5390537565231013, + "learning_rate": 4.983917560103433e-05, + "loss": 1.1079, + "step": 2194 + }, + { + "epoch": 0.26028696786434247, + "grad_norm": 1.516324556302055, + "learning_rate": 4.9838903656174554e-05, + "loss": 0.8732, + "step": 2195 + }, + { + "epoch": 0.26040554962646745, + "grad_norm": 1.4402602453072635, + "learning_rate": 4.9838631482330544e-05, + "loss": 0.8412, + "step": 2196 + }, + { + "epoch": 0.26052413138859243, + "grad_norm": 1.6002422094818345, + "learning_rate": 4.983835907950482e-05, + "loss": 1.0157, + "step": 2197 + }, + { + "epoch": 0.2606427131507174, + "grad_norm": 1.478293098304953, + "learning_rate": 4.983808644769987e-05, + "loss": 1.0273, + "step": 2198 + }, + { + "epoch": 0.2607612949128424, + "grad_norm": 1.7782026275703942, + "learning_rate": 4.9837813586918224e-05, + "loss": 0.6987, + "step": 2199 + }, + { + "epoch": 0.26087987667496737, + "grad_norm": 1.4484772310873528, + "learning_rate": 4.983754049716239e-05, + "loss": 1.021, + "step": 2200 + }, + { + "epoch": 0.26099845843709235, + "grad_norm": 1.7128472077423973, + "learning_rate": 4.9837267178434905e-05, + "loss": 0.9625, + "step": 2201 + }, + { + "epoch": 0.2611170401992174, + "grad_norm": 1.748379826879709, + "learning_rate": 4.9836993630738264e-05, + "loss": 0.9788, + "step": 2202 + }, + { + "epoch": 0.26123562196134237, + "grad_norm": 1.7422509198227587, + "learning_rate": 4.9836719854075005e-05, + "loss": 1.0559, + "step": 2203 + }, + { + "epoch": 0.26135420372346735, + "grad_norm": 1.544675532466091, + "learning_rate": 4.983644584844764e-05, + "loss": 0.9626, + "step": 2204 + }, + { + "epoch": 0.26147278548559233, + "grad_norm": 1.6285447897168717, + "learning_rate": 4.9836171613858705e-05, + "loss": 1.0278, + "step": 2205 + }, + { + "epoch": 0.2615913672477173, + "grad_norm": 1.531423601258921, + "learning_rate": 4.983589715031072e-05, + "loss": 1.1288, + "step": 2206 + }, + { + "epoch": 0.2617099490098423, + "grad_norm": 1.5734436408369779, + "learning_rate": 4.983562245780622e-05, + "loss": 0.9444, + "step": 2207 + }, + { + "epoch": 0.26182853077196727, + "grad_norm": 1.4859973449424488, + "learning_rate": 4.9835347536347736e-05, + "loss": 0.6696, + "step": 2208 + }, + { + "epoch": 0.26194711253409225, + "grad_norm": 1.6205795107790246, + "learning_rate": 4.9835072385937806e-05, + "loss": 0.9353, + "step": 2209 + }, + { + "epoch": 0.26206569429621723, + "grad_norm": 1.5004063496502429, + "learning_rate": 4.983479700657896e-05, + "loss": 1.0302, + "step": 2210 + }, + { + "epoch": 0.2621842760583422, + "grad_norm": 1.5257792347590051, + "learning_rate": 4.983452139827374e-05, + "loss": 0.8633, + "step": 2211 + }, + { + "epoch": 0.2623028578204672, + "grad_norm": 1.7636800159696502, + "learning_rate": 4.983424556102469e-05, + "loss": 0.9552, + "step": 2212 + }, + { + "epoch": 0.2624214395825922, + "grad_norm": 1.5940702075896371, + "learning_rate": 4.983396949483435e-05, + "loss": 1.03, + "step": 2213 + }, + { + "epoch": 0.26254002134471716, + "grad_norm": 1.6484006797332835, + "learning_rate": 4.983369319970526e-05, + "loss": 1.0547, + "step": 2214 + }, + { + "epoch": 0.2626586031068422, + "grad_norm": 1.5736273624894186, + "learning_rate": 4.983341667563998e-05, + "loss": 0.8281, + "step": 2215 + }, + { + "epoch": 0.2627771848689672, + "grad_norm": 1.6087840156818691, + "learning_rate": 4.9833139922641035e-05, + "loss": 0.8274, + "step": 2216 + }, + { + "epoch": 0.26289576663109215, + "grad_norm": 1.6025409699300026, + "learning_rate": 4.9832862940711015e-05, + "loss": 0.9733, + "step": 2217 + }, + { + "epoch": 0.26301434839321713, + "grad_norm": 1.5049979976906493, + "learning_rate": 4.983258572985243e-05, + "loss": 0.7118, + "step": 2218 + }, + { + "epoch": 0.2631329301553421, + "grad_norm": 1.5982744601079337, + "learning_rate": 4.9832308290067865e-05, + "loss": 1.0606, + "step": 2219 + }, + { + "epoch": 0.2632515119174671, + "grad_norm": 1.4913775573609198, + "learning_rate": 4.983203062135987e-05, + "loss": 0.9558, + "step": 2220 + }, + { + "epoch": 0.2633700936795921, + "grad_norm": 1.4983971437597996, + "learning_rate": 4.9831752723731006e-05, + "loss": 0.9606, + "step": 2221 + }, + { + "epoch": 0.26348867544171706, + "grad_norm": 1.3966979712238767, + "learning_rate": 4.983147459718382e-05, + "loss": 0.7713, + "step": 2222 + }, + { + "epoch": 0.26360725720384204, + "grad_norm": 1.508943126036321, + "learning_rate": 4.98311962417209e-05, + "loss": 1.0105, + "step": 2223 + }, + { + "epoch": 0.263725838965967, + "grad_norm": 1.999047387364816, + "learning_rate": 4.98309176573448e-05, + "loss": 1.0228, + "step": 2224 + }, + { + "epoch": 0.263844420728092, + "grad_norm": 1.4607476931341195, + "learning_rate": 4.983063884405809e-05, + "loss": 0.6235, + "step": 2225 + }, + { + "epoch": 0.263963002490217, + "grad_norm": 1.6119254415235267, + "learning_rate": 4.9830359801863334e-05, + "loss": 0.8707, + "step": 2226 + }, + { + "epoch": 0.264081584252342, + "grad_norm": 1.6797512904245933, + "learning_rate": 4.983008053076311e-05, + "loss": 1.1013, + "step": 2227 + }, + { + "epoch": 0.264200166014467, + "grad_norm": 1.7402944002719882, + "learning_rate": 4.982980103076e-05, + "loss": 0.8649, + "step": 2228 + }, + { + "epoch": 0.264318747776592, + "grad_norm": 1.7560916351057352, + "learning_rate": 4.982952130185656e-05, + "loss": 1.1523, + "step": 2229 + }, + { + "epoch": 0.26443732953871696, + "grad_norm": 1.5835855166801942, + "learning_rate": 4.982924134405539e-05, + "loss": 0.5252, + "step": 2230 + }, + { + "epoch": 0.26455591130084194, + "grad_norm": 1.6437484027032823, + "learning_rate": 4.9828961157359064e-05, + "loss": 0.9379, + "step": 2231 + }, + { + "epoch": 0.2646744930629669, + "grad_norm": 1.6911624819781965, + "learning_rate": 4.982868074177016e-05, + "loss": 0.9642, + "step": 2232 + }, + { + "epoch": 0.2647930748250919, + "grad_norm": 1.5717456559719993, + "learning_rate": 4.982840009729127e-05, + "loss": 0.9797, + "step": 2233 + }, + { + "epoch": 0.2649116565872169, + "grad_norm": 1.8251235311549547, + "learning_rate": 4.9828119223924974e-05, + "loss": 0.8802, + "step": 2234 + }, + { + "epoch": 0.26503023834934186, + "grad_norm": 1.572833259712977, + "learning_rate": 4.9827838121673865e-05, + "loss": 1.0434, + "step": 2235 + }, + { + "epoch": 0.26514882011146684, + "grad_norm": 1.59258086953376, + "learning_rate": 4.982755679054053e-05, + "loss": 0.7834, + "step": 2236 + }, + { + "epoch": 0.2652674018735918, + "grad_norm": 1.8604818786034807, + "learning_rate": 4.9827275230527574e-05, + "loss": 0.987, + "step": 2237 + }, + { + "epoch": 0.2653859836357168, + "grad_norm": 1.9526092151832255, + "learning_rate": 4.982699344163758e-05, + "loss": 1.0604, + "step": 2238 + }, + { + "epoch": 0.2655045653978418, + "grad_norm": 1.8261736885883155, + "learning_rate": 4.982671142387316e-05, + "loss": 1.0139, + "step": 2239 + }, + { + "epoch": 0.2656231471599668, + "grad_norm": 1.5544617615957195, + "learning_rate": 4.98264291772369e-05, + "loss": 0.6288, + "step": 2240 + }, + { + "epoch": 0.2657417289220918, + "grad_norm": 1.5234981316922735, + "learning_rate": 4.982614670173141e-05, + "loss": 0.8816, + "step": 2241 + }, + { + "epoch": 0.2658603106842168, + "grad_norm": 1.5496095724245618, + "learning_rate": 4.9825863997359285e-05, + "loss": 1.0243, + "step": 2242 + }, + { + "epoch": 0.26597889244634176, + "grad_norm": 1.7460792920788026, + "learning_rate": 4.982558106412314e-05, + "loss": 1.0361, + "step": 2243 + }, + { + "epoch": 0.26609747420846674, + "grad_norm": 1.64859127945359, + "learning_rate": 4.982529790202558e-05, + "loss": 0.8022, + "step": 2244 + }, + { + "epoch": 0.2662160559705917, + "grad_norm": 1.6444116804048423, + "learning_rate": 4.9825014511069224e-05, + "loss": 1.1802, + "step": 2245 + }, + { + "epoch": 0.2663346377327167, + "grad_norm": 1.4887703484872488, + "learning_rate": 4.9824730891256666e-05, + "loss": 1.0765, + "step": 2246 + }, + { + "epoch": 0.2664532194948417, + "grad_norm": 1.5458836219818344, + "learning_rate": 4.9824447042590537e-05, + "loss": 0.8803, + "step": 2247 + }, + { + "epoch": 0.26657180125696667, + "grad_norm": 1.4257107751417126, + "learning_rate": 4.982416296507345e-05, + "loss": 1.0698, + "step": 2248 + }, + { + "epoch": 0.26669038301909165, + "grad_norm": 1.5052202499178404, + "learning_rate": 4.9823878658708015e-05, + "loss": 1.0725, + "step": 2249 + }, + { + "epoch": 0.26680896478121663, + "grad_norm": 1.3545901892659957, + "learning_rate": 4.982359412349687e-05, + "loss": 0.5528, + "step": 2250 + }, + { + "epoch": 0.2669275465433416, + "grad_norm": 1.6451570791559917, + "learning_rate": 4.982330935944262e-05, + "loss": 0.8372, + "step": 2251 + }, + { + "epoch": 0.26704612830546665, + "grad_norm": 1.6420761030414943, + "learning_rate": 4.98230243665479e-05, + "loss": 0.9582, + "step": 2252 + }, + { + "epoch": 0.2671647100675916, + "grad_norm": 1.3776570541081061, + "learning_rate": 4.982273914481533e-05, + "loss": 0.6706, + "step": 2253 + }, + { + "epoch": 0.2672832918297166, + "grad_norm": 1.5789502447259047, + "learning_rate": 4.982245369424755e-05, + "loss": 0.6965, + "step": 2254 + }, + { + "epoch": 0.2674018735918416, + "grad_norm": 2.07294976044734, + "learning_rate": 4.9822168014847184e-05, + "loss": 0.9643, + "step": 2255 + }, + { + "epoch": 0.26752045535396657, + "grad_norm": 1.482478545110422, + "learning_rate": 4.9821882106616866e-05, + "loss": 0.7547, + "step": 2256 + }, + { + "epoch": 0.26763903711609155, + "grad_norm": 1.4797870893039287, + "learning_rate": 4.982159596955924e-05, + "loss": 0.8033, + "step": 2257 + }, + { + "epoch": 0.26775761887821653, + "grad_norm": 1.9766718282505586, + "learning_rate": 4.9821309603676934e-05, + "loss": 0.8569, + "step": 2258 + }, + { + "epoch": 0.2678762006403415, + "grad_norm": 1.9933158407426539, + "learning_rate": 4.982102300897259e-05, + "loss": 1.1115, + "step": 2259 + }, + { + "epoch": 0.2679947824024665, + "grad_norm": 1.4779440467816958, + "learning_rate": 4.982073618544886e-05, + "loss": 0.9309, + "step": 2260 + }, + { + "epoch": 0.2681133641645915, + "grad_norm": 1.8356509863555672, + "learning_rate": 4.982044913310837e-05, + "loss": 0.7702, + "step": 2261 + }, + { + "epoch": 0.26823194592671645, + "grad_norm": 1.6260396673597868, + "learning_rate": 4.9820161851953776e-05, + "loss": 0.8036, + "step": 2262 + }, + { + "epoch": 0.26835052768884143, + "grad_norm": 1.5249168521541052, + "learning_rate": 4.981987434198773e-05, + "loss": 0.898, + "step": 2263 + }, + { + "epoch": 0.2684691094509664, + "grad_norm": 1.6728766437857052, + "learning_rate": 4.9819586603212875e-05, + "loss": 0.8531, + "step": 2264 + }, + { + "epoch": 0.26858769121309145, + "grad_norm": 1.8197059253344197, + "learning_rate": 4.9819298635631874e-05, + "loss": 0.8003, + "step": 2265 + }, + { + "epoch": 0.26870627297521643, + "grad_norm": 1.763754879739386, + "learning_rate": 4.981901043924737e-05, + "loss": 1.085, + "step": 2266 + }, + { + "epoch": 0.2688248547373414, + "grad_norm": 1.7069536207971974, + "learning_rate": 4.9818722014062026e-05, + "loss": 0.9278, + "step": 2267 + }, + { + "epoch": 0.2689434364994664, + "grad_norm": 1.7613303459881093, + "learning_rate": 4.98184333600785e-05, + "loss": 1.3622, + "step": 2268 + }, + { + "epoch": 0.2690620182615914, + "grad_norm": 1.321905287133801, + "learning_rate": 4.981814447729946e-05, + "loss": 0.8156, + "step": 2269 + }, + { + "epoch": 0.26918060002371635, + "grad_norm": 1.6163533681343276, + "learning_rate": 4.981785536572755e-05, + "loss": 1.1632, + "step": 2270 + }, + { + "epoch": 0.26929918178584134, + "grad_norm": 1.509857902851942, + "learning_rate": 4.981756602536545e-05, + "loss": 0.8462, + "step": 2271 + }, + { + "epoch": 0.2694177635479663, + "grad_norm": 1.554687450517775, + "learning_rate": 4.9817276456215824e-05, + "loss": 1.0378, + "step": 2272 + }, + { + "epoch": 0.2695363453100913, + "grad_norm": 1.6327130379448598, + "learning_rate": 4.981698665828135e-05, + "loss": 0.8297, + "step": 2273 + }, + { + "epoch": 0.2696549270722163, + "grad_norm": 1.407290209389833, + "learning_rate": 4.981669663156468e-05, + "loss": 1.1696, + "step": 2274 + }, + { + "epoch": 0.26977350883434126, + "grad_norm": 1.391307660677606, + "learning_rate": 4.98164063760685e-05, + "loss": 0.8938, + "step": 2275 + }, + { + "epoch": 0.26989209059646624, + "grad_norm": 1.4091273614813733, + "learning_rate": 4.9816115891795494e-05, + "loss": 0.7126, + "step": 2276 + }, + { + "epoch": 0.2700106723585913, + "grad_norm": 1.359466268642746, + "learning_rate": 4.981582517874833e-05, + "loss": 0.6684, + "step": 2277 + }, + { + "epoch": 0.27012925412071626, + "grad_norm": 1.5083895667694889, + "learning_rate": 4.981553423692968e-05, + "loss": 0.8027, + "step": 2278 + }, + { + "epoch": 0.27024783588284124, + "grad_norm": 1.8026952308350164, + "learning_rate": 4.9815243066342244e-05, + "loss": 0.9701, + "step": 2279 + }, + { + "epoch": 0.2703664176449662, + "grad_norm": 2.0804895384463378, + "learning_rate": 4.98149516669887e-05, + "loss": 1.1016, + "step": 2280 + }, + { + "epoch": 0.2704849994070912, + "grad_norm": 1.8346028102579401, + "learning_rate": 4.981466003887172e-05, + "loss": 0.8025, + "step": 2281 + }, + { + "epoch": 0.2706035811692162, + "grad_norm": 1.524770294365194, + "learning_rate": 4.9814368181994015e-05, + "loss": 0.8095, + "step": 2282 + }, + { + "epoch": 0.27072216293134116, + "grad_norm": 1.8018233518594808, + "learning_rate": 4.981407609635826e-05, + "loss": 0.8499, + "step": 2283 + }, + { + "epoch": 0.27084074469346614, + "grad_norm": 1.495876179315143, + "learning_rate": 4.981378378196715e-05, + "loss": 0.8544, + "step": 2284 + }, + { + "epoch": 0.2709593264555911, + "grad_norm": 1.7571659850907795, + "learning_rate": 4.981349123882338e-05, + "loss": 1.0576, + "step": 2285 + }, + { + "epoch": 0.2710779082177161, + "grad_norm": 1.6580855935237229, + "learning_rate": 4.9813198466929664e-05, + "loss": 0.9806, + "step": 2286 + }, + { + "epoch": 0.2711964899798411, + "grad_norm": 1.4988019278436455, + "learning_rate": 4.9812905466288675e-05, + "loss": 1.0478, + "step": 2287 + }, + { + "epoch": 0.27131507174196606, + "grad_norm": 1.4824962295696775, + "learning_rate": 4.981261223690312e-05, + "loss": 0.8817, + "step": 2288 + }, + { + "epoch": 0.27143365350409104, + "grad_norm": 1.4941938146835907, + "learning_rate": 4.981231877877572e-05, + "loss": 0.8142, + "step": 2289 + }, + { + "epoch": 0.2715522352662161, + "grad_norm": 1.5575367610113489, + "learning_rate": 4.981202509190916e-05, + "loss": 0.8436, + "step": 2290 + }, + { + "epoch": 0.27167081702834106, + "grad_norm": 1.4830160665362453, + "learning_rate": 4.9811731176306155e-05, + "loss": 0.767, + "step": 2291 + }, + { + "epoch": 0.27178939879046604, + "grad_norm": 1.4219043161223104, + "learning_rate": 4.981143703196942e-05, + "loss": 0.6989, + "step": 2292 + }, + { + "epoch": 0.271907980552591, + "grad_norm": 1.6943146917458523, + "learning_rate": 4.9811142658901654e-05, + "loss": 1.0129, + "step": 2293 + }, + { + "epoch": 0.272026562314716, + "grad_norm": 1.5028034780402468, + "learning_rate": 4.9810848057105585e-05, + "loss": 0.9935, + "step": 2294 + }, + { + "epoch": 0.272145144076841, + "grad_norm": 2.0035001915090023, + "learning_rate": 4.9810553226583914e-05, + "loss": 1.0074, + "step": 2295 + }, + { + "epoch": 0.27226372583896596, + "grad_norm": 1.4212208784813856, + "learning_rate": 4.981025816733937e-05, + "loss": 0.9766, + "step": 2296 + }, + { + "epoch": 0.27238230760109094, + "grad_norm": 1.4816361482315437, + "learning_rate": 4.980996287937467e-05, + "loss": 0.8218, + "step": 2297 + }, + { + "epoch": 0.2725008893632159, + "grad_norm": 1.4804506855450281, + "learning_rate": 4.9809667362692544e-05, + "loss": 0.7826, + "step": 2298 + }, + { + "epoch": 0.2726194711253409, + "grad_norm": 1.2822091180647575, + "learning_rate": 4.98093716172957e-05, + "loss": 0.6886, + "step": 2299 + }, + { + "epoch": 0.2727380528874659, + "grad_norm": 1.7219809251086458, + "learning_rate": 4.9809075643186874e-05, + "loss": 1.0186, + "step": 2300 + }, + { + "epoch": 0.27285663464959087, + "grad_norm": 1.564623932198742, + "learning_rate": 4.98087794403688e-05, + "loss": 1.055, + "step": 2301 + }, + { + "epoch": 0.2729752164117159, + "grad_norm": 1.5931004960104955, + "learning_rate": 4.98084830088442e-05, + "loss": 0.8173, + "step": 2302 + }, + { + "epoch": 0.2730937981738409, + "grad_norm": 1.924877399858819, + "learning_rate": 4.980818634861581e-05, + "loss": 1.159, + "step": 2303 + }, + { + "epoch": 0.27321237993596587, + "grad_norm": 1.732778358575352, + "learning_rate": 4.980788945968635e-05, + "loss": 1.0589, + "step": 2304 + }, + { + "epoch": 0.27333096169809085, + "grad_norm": 1.5868140485179991, + "learning_rate": 4.980759234205859e-05, + "loss": 1.0167, + "step": 2305 + }, + { + "epoch": 0.2734495434602158, + "grad_norm": 1.7951333614525498, + "learning_rate": 4.980729499573523e-05, + "loss": 0.7651, + "step": 2306 + }, + { + "epoch": 0.2735681252223408, + "grad_norm": 1.4583221021034045, + "learning_rate": 4.980699742071904e-05, + "loss": 0.8476, + "step": 2307 + }, + { + "epoch": 0.2736867069844658, + "grad_norm": 1.317658478209005, + "learning_rate": 4.980669961701276e-05, + "loss": 0.677, + "step": 2308 + }, + { + "epoch": 0.27380528874659077, + "grad_norm": 1.720988007705009, + "learning_rate": 4.9806401584619126e-05, + "loss": 1.1059, + "step": 2309 + }, + { + "epoch": 0.27392387050871575, + "grad_norm": 1.5774053912907289, + "learning_rate": 4.980610332354089e-05, + "loss": 0.8255, + "step": 2310 + }, + { + "epoch": 0.27404245227084073, + "grad_norm": 1.6482463396987364, + "learning_rate": 4.980580483378079e-05, + "loss": 1.0962, + "step": 2311 + }, + { + "epoch": 0.2741610340329657, + "grad_norm": 1.60663566020083, + "learning_rate": 4.98055061153416e-05, + "loss": 0.8796, + "step": 2312 + }, + { + "epoch": 0.2742796157950907, + "grad_norm": 1.5154733818598287, + "learning_rate": 4.980520716822605e-05, + "loss": 0.7362, + "step": 2313 + }, + { + "epoch": 0.2743981975572157, + "grad_norm": 1.3977900450593432, + "learning_rate": 4.980490799243692e-05, + "loss": 0.7525, + "step": 2314 + }, + { + "epoch": 0.2745167793193407, + "grad_norm": 1.5099580337741563, + "learning_rate": 4.9804608587976956e-05, + "loss": 0.7127, + "step": 2315 + }, + { + "epoch": 0.2746353610814657, + "grad_norm": 1.435205956582461, + "learning_rate": 4.9804308954848914e-05, + "loss": 0.617, + "step": 2316 + }, + { + "epoch": 0.27475394284359067, + "grad_norm": 1.8546267361016988, + "learning_rate": 4.980400909305556e-05, + "loss": 1.2456, + "step": 2317 + }, + { + "epoch": 0.27487252460571565, + "grad_norm": 1.9915868614349475, + "learning_rate": 4.980370900259966e-05, + "loss": 0.9704, + "step": 2318 + }, + { + "epoch": 0.27499110636784063, + "grad_norm": 1.5163234322517274, + "learning_rate": 4.9803408683483984e-05, + "loss": 0.8733, + "step": 2319 + }, + { + "epoch": 0.2751096881299656, + "grad_norm": 1.3720475281936442, + "learning_rate": 4.9803108135711286e-05, + "loss": 1.1588, + "step": 2320 + }, + { + "epoch": 0.2752282698920906, + "grad_norm": 1.4655058511409425, + "learning_rate": 4.9802807359284355e-05, + "loss": 0.8837, + "step": 2321 + }, + { + "epoch": 0.2753468516542156, + "grad_norm": 1.5960543089517507, + "learning_rate": 4.980250635420595e-05, + "loss": 1.0131, + "step": 2322 + }, + { + "epoch": 0.27546543341634055, + "grad_norm": 1.3538657951985782, + "learning_rate": 4.980220512047886e-05, + "loss": 0.8084, + "step": 2323 + }, + { + "epoch": 0.27558401517846554, + "grad_norm": 1.7799238241875865, + "learning_rate": 4.980190365810584e-05, + "loss": 0.9622, + "step": 2324 + }, + { + "epoch": 0.2757025969405905, + "grad_norm": 1.5375595362129812, + "learning_rate": 4.9801601967089693e-05, + "loss": 0.7816, + "step": 2325 + }, + { + "epoch": 0.2758211787027155, + "grad_norm": 1.6333305758753784, + "learning_rate": 4.980130004743319e-05, + "loss": 0.6349, + "step": 2326 + }, + { + "epoch": 0.27593976046484053, + "grad_norm": 1.6710919558394968, + "learning_rate": 4.98009978991391e-05, + "loss": 0.9131, + "step": 2327 + }, + { + "epoch": 0.2760583422269655, + "grad_norm": 1.6287896002599938, + "learning_rate": 4.980069552221024e-05, + "loss": 0.7864, + "step": 2328 + }, + { + "epoch": 0.2761769239890905, + "grad_norm": 1.6131836385996834, + "learning_rate": 4.980039291664937e-05, + "loss": 0.8226, + "step": 2329 + }, + { + "epoch": 0.2762955057512155, + "grad_norm": 1.5932362862822576, + "learning_rate": 4.980009008245929e-05, + "loss": 0.999, + "step": 2330 + }, + { + "epoch": 0.27641408751334046, + "grad_norm": 1.7634165043714547, + "learning_rate": 4.97997870196428e-05, + "loss": 0.8163, + "step": 2331 + }, + { + "epoch": 0.27653266927546544, + "grad_norm": 1.5615923071196849, + "learning_rate": 4.979948372820267e-05, + "loss": 0.8127, + "step": 2332 + }, + { + "epoch": 0.2766512510375904, + "grad_norm": 1.6540325937889555, + "learning_rate": 4.979918020814172e-05, + "loss": 1.0004, + "step": 2333 + }, + { + "epoch": 0.2767698327997154, + "grad_norm": 1.7429460374434826, + "learning_rate": 4.979887645946274e-05, + "loss": 0.9948, + "step": 2334 + }, + { + "epoch": 0.2768884145618404, + "grad_norm": 1.9278768852265296, + "learning_rate": 4.979857248216853e-05, + "loss": 1.018, + "step": 2335 + }, + { + "epoch": 0.27700699632396536, + "grad_norm": 1.4999276115477773, + "learning_rate": 4.979826827626188e-05, + "loss": 0.7011, + "step": 2336 + }, + { + "epoch": 0.27712557808609034, + "grad_norm": 1.4598177691290897, + "learning_rate": 4.9797963841745624e-05, + "loss": 0.8391, + "step": 2337 + }, + { + "epoch": 0.2772441598482153, + "grad_norm": 1.5108863025708286, + "learning_rate": 4.979765917862254e-05, + "loss": 0.8332, + "step": 2338 + }, + { + "epoch": 0.2773627416103403, + "grad_norm": 1.5628769425476425, + "learning_rate": 4.9797354286895446e-05, + "loss": 1.0507, + "step": 2339 + }, + { + "epoch": 0.27748132337246534, + "grad_norm": 1.3580658331795292, + "learning_rate": 4.9797049166567164e-05, + "loss": 0.6127, + "step": 2340 + }, + { + "epoch": 0.2775999051345903, + "grad_norm": 1.4937611856361923, + "learning_rate": 4.979674381764049e-05, + "loss": 0.9559, + "step": 2341 + }, + { + "epoch": 0.2777184868967153, + "grad_norm": 1.6145460289546036, + "learning_rate": 4.979643824011825e-05, + "loss": 0.808, + "step": 2342 + }, + { + "epoch": 0.2778370686588403, + "grad_norm": 1.738272955057422, + "learning_rate": 4.979613243400325e-05, + "loss": 0.9047, + "step": 2343 + }, + { + "epoch": 0.27795565042096526, + "grad_norm": 1.7674905924830935, + "learning_rate": 4.979582639929832e-05, + "loss": 0.9718, + "step": 2344 + }, + { + "epoch": 0.27807423218309024, + "grad_norm": 1.6862318775331195, + "learning_rate": 4.9795520136006274e-05, + "loss": 0.8843, + "step": 2345 + }, + { + "epoch": 0.2781928139452152, + "grad_norm": 1.8997882913714228, + "learning_rate": 4.979521364412995e-05, + "loss": 0.9614, + "step": 2346 + }, + { + "epoch": 0.2783113957073402, + "grad_norm": 1.6493405366054128, + "learning_rate": 4.9794906923672146e-05, + "loss": 0.8881, + "step": 2347 + }, + { + "epoch": 0.2784299774694652, + "grad_norm": 1.8992771542503692, + "learning_rate": 4.979459997463571e-05, + "loss": 1.0411, + "step": 2348 + }, + { + "epoch": 0.27854855923159016, + "grad_norm": 1.3926693823490475, + "learning_rate": 4.9794292797023475e-05, + "loss": 0.6415, + "step": 2349 + }, + { + "epoch": 0.27866714099371515, + "grad_norm": 1.5795305337883796, + "learning_rate": 4.979398539083826e-05, + "loss": 0.9302, + "step": 2350 + }, + { + "epoch": 0.2787857227558401, + "grad_norm": 2.0269354261427166, + "learning_rate": 4.97936777560829e-05, + "loss": 1.0145, + "step": 2351 + }, + { + "epoch": 0.27890430451796516, + "grad_norm": 1.3509875297531597, + "learning_rate": 4.979336989276024e-05, + "loss": 1.0907, + "step": 2352 + }, + { + "epoch": 0.27902288628009014, + "grad_norm": 1.3776882640084553, + "learning_rate": 4.979306180087312e-05, + "loss": 0.7369, + "step": 2353 + }, + { + "epoch": 0.2791414680422151, + "grad_norm": 1.3447027082182557, + "learning_rate": 4.979275348042436e-05, + "loss": 0.8507, + "step": 2354 + }, + { + "epoch": 0.2792600498043401, + "grad_norm": 1.501206382220582, + "learning_rate": 4.9792444931416824e-05, + "loss": 0.8519, + "step": 2355 + }, + { + "epoch": 0.2793786315664651, + "grad_norm": 1.3781663804520787, + "learning_rate": 4.979213615385334e-05, + "loss": 0.6508, + "step": 2356 + }, + { + "epoch": 0.27949721332859007, + "grad_norm": 1.7375104070283125, + "learning_rate": 4.9791827147736777e-05, + "loss": 0.9627, + "step": 2357 + }, + { + "epoch": 0.27961579509071505, + "grad_norm": 1.600400254068593, + "learning_rate": 4.9791517913069966e-05, + "loss": 1.0308, + "step": 2358 + }, + { + "epoch": 0.27973437685284, + "grad_norm": 1.7044306390133839, + "learning_rate": 4.979120844985575e-05, + "loss": 0.8975, + "step": 2359 + }, + { + "epoch": 0.279852958614965, + "grad_norm": 1.5963307212014914, + "learning_rate": 4.9790898758097e-05, + "loss": 0.8561, + "step": 2360 + }, + { + "epoch": 0.27997154037709, + "grad_norm": 1.8424300966720464, + "learning_rate": 4.9790588837796566e-05, + "loss": 0.8956, + "step": 2361 + }, + { + "epoch": 0.28009012213921497, + "grad_norm": 1.6962694860934189, + "learning_rate": 4.97902786889573e-05, + "loss": 0.8921, + "step": 2362 + }, + { + "epoch": 0.28020870390133995, + "grad_norm": 1.8879000873979064, + "learning_rate": 4.978996831158206e-05, + "loss": 0.9548, + "step": 2363 + }, + { + "epoch": 0.280327285663465, + "grad_norm": 1.8732744939240091, + "learning_rate": 4.978965770567372e-05, + "loss": 0.5602, + "step": 2364 + }, + { + "epoch": 0.28044586742558997, + "grad_norm": 2.342423842334272, + "learning_rate": 4.978934687123513e-05, + "loss": 1.174, + "step": 2365 + }, + { + "epoch": 0.28056444918771495, + "grad_norm": 2.1020523110662905, + "learning_rate": 4.978903580826917e-05, + "loss": 1.0082, + "step": 2366 + }, + { + "epoch": 0.28068303094983993, + "grad_norm": 1.5400195148870632, + "learning_rate": 4.978872451677868e-05, + "loss": 0.7915, + "step": 2367 + }, + { + "epoch": 0.2808016127119649, + "grad_norm": 1.6456400683839234, + "learning_rate": 4.9788412996766565e-05, + "loss": 0.962, + "step": 2368 + }, + { + "epoch": 0.2809201944740899, + "grad_norm": 1.599856550585295, + "learning_rate": 4.978810124823567e-05, + "loss": 1.1514, + "step": 2369 + }, + { + "epoch": 0.28103877623621487, + "grad_norm": 1.6554155146302068, + "learning_rate": 4.978778927118889e-05, + "loss": 1.1269, + "step": 2370 + }, + { + "epoch": 0.28115735799833985, + "grad_norm": 1.5222823208211833, + "learning_rate": 4.9787477065629076e-05, + "loss": 1.0928, + "step": 2371 + }, + { + "epoch": 0.28127593976046483, + "grad_norm": 1.6160714938964338, + "learning_rate": 4.978716463155913e-05, + "loss": 1.2369, + "step": 2372 + }, + { + "epoch": 0.2813945215225898, + "grad_norm": 1.3738300214158063, + "learning_rate": 4.9786851968981916e-05, + "loss": 0.9255, + "step": 2373 + }, + { + "epoch": 0.2815131032847148, + "grad_norm": 1.25735968450066, + "learning_rate": 4.978653907790032e-05, + "loss": 0.8168, + "step": 2374 + }, + { + "epoch": 0.2816316850468398, + "grad_norm": 1.511509307572403, + "learning_rate": 4.9786225958317237e-05, + "loss": 0.959, + "step": 2375 + }, + { + "epoch": 0.28175026680896476, + "grad_norm": 1.2785000313988124, + "learning_rate": 4.978591261023554e-05, + "loss": 0.9499, + "step": 2376 + }, + { + "epoch": 0.2818688485710898, + "grad_norm": 1.3774059919242738, + "learning_rate": 4.9785599033658125e-05, + "loss": 0.8132, + "step": 2377 + }, + { + "epoch": 0.2819874303332148, + "grad_norm": 1.4855844999222076, + "learning_rate": 4.9785285228587885e-05, + "loss": 0.8463, + "step": 2378 + }, + { + "epoch": 0.28210601209533975, + "grad_norm": 1.4942637851503844, + "learning_rate": 4.97849711950277e-05, + "loss": 0.8871, + "step": 2379 + }, + { + "epoch": 0.28222459385746473, + "grad_norm": 1.5422906574622464, + "learning_rate": 4.978465693298048e-05, + "loss": 0.9864, + "step": 2380 + }, + { + "epoch": 0.2823431756195897, + "grad_norm": 1.347215576910993, + "learning_rate": 4.9784342442449115e-05, + "loss": 1.0103, + "step": 2381 + }, + { + "epoch": 0.2824617573817147, + "grad_norm": 1.504008556970501, + "learning_rate": 4.9784027723436505e-05, + "loss": 1.0114, + "step": 2382 + }, + { + "epoch": 0.2825803391438397, + "grad_norm": 1.6930182151257767, + "learning_rate": 4.978371277594555e-05, + "loss": 0.797, + "step": 2383 + }, + { + "epoch": 0.28269892090596466, + "grad_norm": 1.3268759644495594, + "learning_rate": 4.978339759997916e-05, + "loss": 0.702, + "step": 2384 + }, + { + "epoch": 0.28281750266808964, + "grad_norm": 1.4438104742677618, + "learning_rate": 4.978308219554023e-05, + "loss": 0.8245, + "step": 2385 + }, + { + "epoch": 0.2829360844302146, + "grad_norm": 1.5063078728190158, + "learning_rate": 4.9782766562631675e-05, + "loss": 0.8495, + "step": 2386 + }, + { + "epoch": 0.2830546661923396, + "grad_norm": 1.7472729568551792, + "learning_rate": 4.9782450701256405e-05, + "loss": 1.0, + "step": 2387 + }, + { + "epoch": 0.2831732479544646, + "grad_norm": 1.5258581507948437, + "learning_rate": 4.978213461141733e-05, + "loss": 0.8206, + "step": 2388 + }, + { + "epoch": 0.2832918297165896, + "grad_norm": 1.4474755435901978, + "learning_rate": 4.978181829311736e-05, + "loss": 0.658, + "step": 2389 + }, + { + "epoch": 0.2834104114787146, + "grad_norm": 1.4129592213485813, + "learning_rate": 4.9781501746359416e-05, + "loss": 0.953, + "step": 2390 + }, + { + "epoch": 0.2835289932408396, + "grad_norm": 1.534427866639826, + "learning_rate": 4.9781184971146415e-05, + "loss": 0.7199, + "step": 2391 + }, + { + "epoch": 0.28364757500296456, + "grad_norm": 1.56943773618129, + "learning_rate": 4.978086796748128e-05, + "loss": 1.0846, + "step": 2392 + }, + { + "epoch": 0.28376615676508954, + "grad_norm": 1.4865639121992624, + "learning_rate": 4.9780550735366926e-05, + "loss": 0.6404, + "step": 2393 + }, + { + "epoch": 0.2838847385272145, + "grad_norm": 1.6075098162842552, + "learning_rate": 4.9780233274806286e-05, + "loss": 0.912, + "step": 2394 + }, + { + "epoch": 0.2840033202893395, + "grad_norm": 1.5383713194163655, + "learning_rate": 4.977991558580228e-05, + "loss": 0.9617, + "step": 2395 + }, + { + "epoch": 0.2841219020514645, + "grad_norm": 1.7899617678234088, + "learning_rate": 4.9779597668357844e-05, + "loss": 1.0784, + "step": 2396 + }, + { + "epoch": 0.28424048381358946, + "grad_norm": 1.7320488989887202, + "learning_rate": 4.97792795224759e-05, + "loss": 1.0967, + "step": 2397 + }, + { + "epoch": 0.28435906557571444, + "grad_norm": 1.5400649448911345, + "learning_rate": 4.9778961148159384e-05, + "loss": 0.7434, + "step": 2398 + }, + { + "epoch": 0.2844776473378394, + "grad_norm": 1.6161655988288448, + "learning_rate": 4.9778642545411236e-05, + "loss": 0.8305, + "step": 2399 + }, + { + "epoch": 0.2845962290999644, + "grad_norm": 1.5004247017968646, + "learning_rate": 4.977832371423439e-05, + "loss": 0.871, + "step": 2400 + }, + { + "epoch": 0.2847148108620894, + "grad_norm": 1.6302941174498264, + "learning_rate": 4.977800465463178e-05, + "loss": 1.0917, + "step": 2401 + }, + { + "epoch": 0.2848333926242144, + "grad_norm": 1.9500142453998417, + "learning_rate": 4.977768536660635e-05, + "loss": 0.9573, + "step": 2402 + }, + { + "epoch": 0.2849519743863394, + "grad_norm": 1.7279385961801683, + "learning_rate": 4.977736585016105e-05, + "loss": 0.8734, + "step": 2403 + }, + { + "epoch": 0.2850705561484644, + "grad_norm": 1.5789031368206385, + "learning_rate": 4.977704610529883e-05, + "loss": 1.0316, + "step": 2404 + }, + { + "epoch": 0.28518913791058936, + "grad_norm": 1.5441335709818145, + "learning_rate": 4.977672613202261e-05, + "loss": 0.8706, + "step": 2405 + }, + { + "epoch": 0.28530771967271434, + "grad_norm": 1.45866301566283, + "learning_rate": 4.9776405930335365e-05, + "loss": 0.9468, + "step": 2406 + }, + { + "epoch": 0.2854263014348393, + "grad_norm": 1.8185536141867098, + "learning_rate": 4.977608550024005e-05, + "loss": 1.035, + "step": 2407 + }, + { + "epoch": 0.2855448831969643, + "grad_norm": 1.6712295967439859, + "learning_rate": 4.97757648417396e-05, + "loss": 0.9168, + "step": 2408 + }, + { + "epoch": 0.2856634649590893, + "grad_norm": 1.4939317295222134, + "learning_rate": 4.977544395483699e-05, + "loss": 1.0048, + "step": 2409 + }, + { + "epoch": 0.28578204672121427, + "grad_norm": 1.5496299542817438, + "learning_rate": 4.977512283953515e-05, + "loss": 1.0024, + "step": 2410 + }, + { + "epoch": 0.28590062848333925, + "grad_norm": 1.469882899699358, + "learning_rate": 4.9774801495837074e-05, + "loss": 1.1496, + "step": 2411 + }, + { + "epoch": 0.28601921024546423, + "grad_norm": 1.6512217810272487, + "learning_rate": 4.97744799237457e-05, + "loss": 0.9102, + "step": 2412 + }, + { + "epoch": 0.2861377920075892, + "grad_norm": 1.4709189995620464, + "learning_rate": 4.977415812326401e-05, + "loss": 1.0569, + "step": 2413 + }, + { + "epoch": 0.28625637376971425, + "grad_norm": 1.3953818969620164, + "learning_rate": 4.977383609439496e-05, + "loss": 0.9195, + "step": 2414 + }, + { + "epoch": 0.2863749555318392, + "grad_norm": 1.5530593465931122, + "learning_rate": 4.9773513837141516e-05, + "loss": 1.0856, + "step": 2415 + }, + { + "epoch": 0.2864935372939642, + "grad_norm": 1.3699945452734779, + "learning_rate": 4.9773191351506654e-05, + "loss": 0.8359, + "step": 2416 + }, + { + "epoch": 0.2866121190560892, + "grad_norm": 1.4870348851076065, + "learning_rate": 4.977286863749335e-05, + "loss": 1.0555, + "step": 2417 + }, + { + "epoch": 0.28673070081821417, + "grad_norm": 1.4359111663921689, + "learning_rate": 4.977254569510457e-05, + "loss": 1.0983, + "step": 2418 + }, + { + "epoch": 0.28684928258033915, + "grad_norm": 1.410138983438062, + "learning_rate": 4.97722225243433e-05, + "loss": 0.8596, + "step": 2419 + }, + { + "epoch": 0.28696786434246413, + "grad_norm": 1.7809862578142337, + "learning_rate": 4.9771899125212524e-05, + "loss": 0.9321, + "step": 2420 + }, + { + "epoch": 0.2870864461045891, + "grad_norm": 1.8669371676755382, + "learning_rate": 4.9771575497715206e-05, + "loss": 0.9885, + "step": 2421 + }, + { + "epoch": 0.2872050278667141, + "grad_norm": 1.4964475734731368, + "learning_rate": 4.977125164185434e-05, + "loss": 0.9255, + "step": 2422 + }, + { + "epoch": 0.28732360962883907, + "grad_norm": 1.6679845036740746, + "learning_rate": 4.977092755763292e-05, + "loss": 1.0333, + "step": 2423 + }, + { + "epoch": 0.28744219139096405, + "grad_norm": 1.5799589531737646, + "learning_rate": 4.977060324505391e-05, + "loss": 1.0092, + "step": 2424 + }, + { + "epoch": 0.28756077315308903, + "grad_norm": 1.6957229771398297, + "learning_rate": 4.977027870412032e-05, + "loss": 0.8155, + "step": 2425 + }, + { + "epoch": 0.287679354915214, + "grad_norm": 1.387962184634277, + "learning_rate": 4.976995393483513e-05, + "loss": 0.7769, + "step": 2426 + }, + { + "epoch": 0.28779793667733905, + "grad_norm": 1.587797760950652, + "learning_rate": 4.9769628937201355e-05, + "loss": 0.9381, + "step": 2427 + }, + { + "epoch": 0.28791651843946403, + "grad_norm": 1.5781299114198215, + "learning_rate": 4.976930371122196e-05, + "loss": 0.879, + "step": 2428 + }, + { + "epoch": 0.288035100201589, + "grad_norm": 1.6842908631420495, + "learning_rate": 4.976897825689997e-05, + "loss": 1.1819, + "step": 2429 + }, + { + "epoch": 0.288153681963714, + "grad_norm": 1.6456598767281734, + "learning_rate": 4.976865257423836e-05, + "loss": 0.6421, + "step": 2430 + }, + { + "epoch": 0.288272263725839, + "grad_norm": 1.6081830903014547, + "learning_rate": 4.976832666324016e-05, + "loss": 0.8206, + "step": 2431 + }, + { + "epoch": 0.28839084548796395, + "grad_norm": 1.5311996265100005, + "learning_rate": 4.976800052390836e-05, + "loss": 0.8542, + "step": 2432 + }, + { + "epoch": 0.28850942725008893, + "grad_norm": 1.3851321623650494, + "learning_rate": 4.976767415624596e-05, + "loss": 0.7755, + "step": 2433 + }, + { + "epoch": 0.2886280090122139, + "grad_norm": 2.0135001249696174, + "learning_rate": 4.9767347560255986e-05, + "loss": 1.164, + "step": 2434 + }, + { + "epoch": 0.2887465907743389, + "grad_norm": 2.26102826066825, + "learning_rate": 4.9767020735941426e-05, + "loss": 1.1262, + "step": 2435 + }, + { + "epoch": 0.2888651725364639, + "grad_norm": 1.5980948256977554, + "learning_rate": 4.976669368330532e-05, + "loss": 0.7784, + "step": 2436 + }, + { + "epoch": 0.28898375429858886, + "grad_norm": 1.396902187945008, + "learning_rate": 4.9766366402350664e-05, + "loss": 0.8269, + "step": 2437 + }, + { + "epoch": 0.28910233606071384, + "grad_norm": 1.4686863146321578, + "learning_rate": 4.976603889308048e-05, + "loss": 0.8729, + "step": 2438 + }, + { + "epoch": 0.2892209178228389, + "grad_norm": 1.4972790957768372, + "learning_rate": 4.976571115549779e-05, + "loss": 0.8332, + "step": 2439 + }, + { + "epoch": 0.28933949958496386, + "grad_norm": 1.4774115268183194, + "learning_rate": 4.976538318960561e-05, + "loss": 0.9241, + "step": 2440 + }, + { + "epoch": 0.28945808134708884, + "grad_norm": 1.463869044411452, + "learning_rate": 4.976505499540696e-05, + "loss": 0.935, + "step": 2441 + }, + { + "epoch": 0.2895766631092138, + "grad_norm": 1.6434574831572348, + "learning_rate": 4.976472657290488e-05, + "loss": 1.0176, + "step": 2442 + }, + { + "epoch": 0.2896952448713388, + "grad_norm": 1.372632154226201, + "learning_rate": 4.9764397922102393e-05, + "loss": 0.7306, + "step": 2443 + }, + { + "epoch": 0.2898138266334638, + "grad_norm": 1.47297405661419, + "learning_rate": 4.9764069043002516e-05, + "loss": 0.7562, + "step": 2444 + }, + { + "epoch": 0.28993240839558876, + "grad_norm": 1.4207201533286626, + "learning_rate": 4.9763739935608286e-05, + "loss": 0.7087, + "step": 2445 + }, + { + "epoch": 0.29005099015771374, + "grad_norm": 1.5189953106395007, + "learning_rate": 4.976341059992275e-05, + "loss": 0.7495, + "step": 2446 + }, + { + "epoch": 0.2901695719198387, + "grad_norm": 1.5766578427788733, + "learning_rate": 4.976308103594894e-05, + "loss": 0.8623, + "step": 2447 + }, + { + "epoch": 0.2902881536819637, + "grad_norm": 1.6111442449254452, + "learning_rate": 4.9762751243689885e-05, + "loss": 1.1065, + "step": 2448 + }, + { + "epoch": 0.2904067354440887, + "grad_norm": 1.6997701859633294, + "learning_rate": 4.976242122314863e-05, + "loss": 0.8941, + "step": 2449 + }, + { + "epoch": 0.29052531720621366, + "grad_norm": 1.7355967636695475, + "learning_rate": 4.976209097432821e-05, + "loss": 0.8303, + "step": 2450 + }, + { + "epoch": 0.29064389896833864, + "grad_norm": 1.8976717022593483, + "learning_rate": 4.9761760497231677e-05, + "loss": 0.8139, + "step": 2451 + }, + { + "epoch": 0.2907624807304637, + "grad_norm": 1.7551599543917875, + "learning_rate": 4.9761429791862094e-05, + "loss": 0.9635, + "step": 2452 + }, + { + "epoch": 0.29088106249258866, + "grad_norm": 2.1549159818777888, + "learning_rate": 4.976109885822248e-05, + "loss": 1.1164, + "step": 2453 + }, + { + "epoch": 0.29099964425471364, + "grad_norm": 1.8153668812167236, + "learning_rate": 4.976076769631589e-05, + "loss": 0.9211, + "step": 2454 + }, + { + "epoch": 0.2911182260168386, + "grad_norm": 1.7011277934190991, + "learning_rate": 4.97604363061454e-05, + "loss": 1.1271, + "step": 2455 + }, + { + "epoch": 0.2912368077789636, + "grad_norm": 1.6852902298470798, + "learning_rate": 4.976010468771405e-05, + "loss": 0.9859, + "step": 2456 + }, + { + "epoch": 0.2913553895410886, + "grad_norm": 1.6563526036450935, + "learning_rate": 4.975977284102489e-05, + "loss": 0.9522, + "step": 2457 + }, + { + "epoch": 0.29147397130321356, + "grad_norm": 1.5370122152361108, + "learning_rate": 4.975944076608099e-05, + "loss": 0.8795, + "step": 2458 + }, + { + "epoch": 0.29159255306533854, + "grad_norm": 1.5597852455302597, + "learning_rate": 4.9759108462885404e-05, + "loss": 0.8938, + "step": 2459 + }, + { + "epoch": 0.2917111348274635, + "grad_norm": 1.4439289768870136, + "learning_rate": 4.9758775931441204e-05, + "loss": 0.7363, + "step": 2460 + }, + { + "epoch": 0.2918297165895885, + "grad_norm": 1.8901467329119255, + "learning_rate": 4.975844317175145e-05, + "loss": 0.858, + "step": 2461 + }, + { + "epoch": 0.2919482983517135, + "grad_norm": 1.456985236465551, + "learning_rate": 4.9758110183819215e-05, + "loss": 1.1251, + "step": 2462 + }, + { + "epoch": 0.29206688011383847, + "grad_norm": 1.5815380036151172, + "learning_rate": 4.975777696764756e-05, + "loss": 0.9014, + "step": 2463 + }, + { + "epoch": 0.2921854618759635, + "grad_norm": 1.5078973699400564, + "learning_rate": 4.975744352323956e-05, + "loss": 0.9357, + "step": 2464 + }, + { + "epoch": 0.2923040436380885, + "grad_norm": 1.6320533758673677, + "learning_rate": 4.9757109850598295e-05, + "loss": 1.0957, + "step": 2465 + }, + { + "epoch": 0.29242262540021347, + "grad_norm": 1.533129535635482, + "learning_rate": 4.975677594972683e-05, + "loss": 1.0212, + "step": 2466 + }, + { + "epoch": 0.29254120716233845, + "grad_norm": 1.343228638405399, + "learning_rate": 4.975644182062825e-05, + "loss": 0.7514, + "step": 2467 + }, + { + "epoch": 0.2926597889244634, + "grad_norm": 1.3048993275906209, + "learning_rate": 4.9756107463305645e-05, + "loss": 0.6401, + "step": 2468 + }, + { + "epoch": 0.2927783706865884, + "grad_norm": 1.4004138509470117, + "learning_rate": 4.975577287776208e-05, + "loss": 0.9109, + "step": 2469 + }, + { + "epoch": 0.2928969524487134, + "grad_norm": 1.5108247036926876, + "learning_rate": 4.975543806400065e-05, + "loss": 0.8359, + "step": 2470 + }, + { + "epoch": 0.29301553421083837, + "grad_norm": 1.1662701489021907, + "learning_rate": 4.975510302202443e-05, + "loss": 0.698, + "step": 2471 + }, + { + "epoch": 0.29313411597296335, + "grad_norm": 1.6722735594043467, + "learning_rate": 4.9754767751836523e-05, + "loss": 1.0934, + "step": 2472 + }, + { + "epoch": 0.29325269773508833, + "grad_norm": 1.6570678169279243, + "learning_rate": 4.975443225344002e-05, + "loss": 0.9784, + "step": 2473 + }, + { + "epoch": 0.2933712794972133, + "grad_norm": 1.4538665492823626, + "learning_rate": 4.975409652683799e-05, + "loss": 0.8224, + "step": 2474 + }, + { + "epoch": 0.2934898612593383, + "grad_norm": 1.620074595506638, + "learning_rate": 4.975376057203356e-05, + "loss": 0.9211, + "step": 2475 + }, + { + "epoch": 0.2936084430214633, + "grad_norm": 1.4825846884460248, + "learning_rate": 4.975342438902981e-05, + "loss": 0.6733, + "step": 2476 + }, + { + "epoch": 0.2937270247835883, + "grad_norm": 1.4023811884534079, + "learning_rate": 4.975308797782984e-05, + "loss": 1.0042, + "step": 2477 + }, + { + "epoch": 0.2938456065457133, + "grad_norm": 1.4679035191682481, + "learning_rate": 4.975275133843675e-05, + "loss": 0.9633, + "step": 2478 + }, + { + "epoch": 0.29396418830783827, + "grad_norm": 1.5207144099481416, + "learning_rate": 4.975241447085365e-05, + "loss": 0.9552, + "step": 2479 + }, + { + "epoch": 0.29408277006996325, + "grad_norm": 1.5906830869911035, + "learning_rate": 4.9752077375083636e-05, + "loss": 0.8537, + "step": 2480 + }, + { + "epoch": 0.29420135183208823, + "grad_norm": 1.7732865528408428, + "learning_rate": 4.975174005112984e-05, + "loss": 0.8556, + "step": 2481 + }, + { + "epoch": 0.2943199335942132, + "grad_norm": 1.543750249856277, + "learning_rate": 4.9751402498995334e-05, + "loss": 0.8888, + "step": 2482 + }, + { + "epoch": 0.2944385153563382, + "grad_norm": 1.9281247230521332, + "learning_rate": 4.975106471868325e-05, + "loss": 1.0933, + "step": 2483 + }, + { + "epoch": 0.2945570971184632, + "grad_norm": 1.7627602015000823, + "learning_rate": 4.975072671019671e-05, + "loss": 1.0963, + "step": 2484 + }, + { + "epoch": 0.29467567888058815, + "grad_norm": 1.3745490362794537, + "learning_rate": 4.975038847353882e-05, + "loss": 0.945, + "step": 2485 + }, + { + "epoch": 0.29479426064271314, + "grad_norm": 1.7791728116352725, + "learning_rate": 4.9750050008712696e-05, + "loss": 1.2101, + "step": 2486 + }, + { + "epoch": 0.2949128424048381, + "grad_norm": 1.4757730441072967, + "learning_rate": 4.974971131572146e-05, + "loss": 0.8775, + "step": 2487 + }, + { + "epoch": 0.2950314241669631, + "grad_norm": 1.5079020424842573, + "learning_rate": 4.974937239456824e-05, + "loss": 0.8159, + "step": 2488 + }, + { + "epoch": 0.29515000592908813, + "grad_norm": 1.3711467806605337, + "learning_rate": 4.974903324525615e-05, + "loss": 0.8425, + "step": 2489 + }, + { + "epoch": 0.2952685876912131, + "grad_norm": 1.388534906127683, + "learning_rate": 4.974869386778833e-05, + "loss": 0.8557, + "step": 2490 + }, + { + "epoch": 0.2953871694533381, + "grad_norm": 1.5431256961615287, + "learning_rate": 4.97483542621679e-05, + "loss": 1.0615, + "step": 2491 + }, + { + "epoch": 0.2955057512154631, + "grad_norm": 1.3979012359819658, + "learning_rate": 4.9748014428397996e-05, + "loss": 0.9582, + "step": 2492 + }, + { + "epoch": 0.29562433297758806, + "grad_norm": 1.4037425953107994, + "learning_rate": 4.9747674366481734e-05, + "loss": 0.9099, + "step": 2493 + }, + { + "epoch": 0.29574291473971304, + "grad_norm": 1.5096630353844176, + "learning_rate": 4.974733407642227e-05, + "loss": 0.9342, + "step": 2494 + }, + { + "epoch": 0.295861496501838, + "grad_norm": 1.7251883883446064, + "learning_rate": 4.974699355822273e-05, + "loss": 0.9831, + "step": 2495 + }, + { + "epoch": 0.295980078263963, + "grad_norm": 1.6940908272405255, + "learning_rate": 4.974665281188626e-05, + "loss": 1.1715, + "step": 2496 + }, + { + "epoch": 0.296098660026088, + "grad_norm": 1.5063970898475894, + "learning_rate": 4.9746311837416e-05, + "loss": 0.8927, + "step": 2497 + }, + { + "epoch": 0.29621724178821296, + "grad_norm": 1.5959764203463707, + "learning_rate": 4.974597063481509e-05, + "loss": 0.9756, + "step": 2498 + }, + { + "epoch": 0.29633582355033794, + "grad_norm": 1.4694363782588318, + "learning_rate": 4.9745629204086666e-05, + "loss": 0.7107, + "step": 2499 + }, + { + "epoch": 0.2964544053124629, + "grad_norm": 1.7715520245566363, + "learning_rate": 4.97452875452339e-05, + "loss": 0.8678, + "step": 2500 + }, + { + "epoch": 0.2965729870745879, + "grad_norm": 1.719779403221391, + "learning_rate": 4.974494565825992e-05, + "loss": 0.7108, + "step": 2501 + }, + { + "epoch": 0.29669156883671294, + "grad_norm": 1.6909317137489985, + "learning_rate": 4.9744603543167887e-05, + "loss": 1.051, + "step": 2502 + }, + { + "epoch": 0.2968101505988379, + "grad_norm": 1.624849096638231, + "learning_rate": 4.974426119996095e-05, + "loss": 1.0165, + "step": 2503 + }, + { + "epoch": 0.2969287323609629, + "grad_norm": 1.3354729663321676, + "learning_rate": 4.974391862864227e-05, + "loss": 0.7568, + "step": 2504 + }, + { + "epoch": 0.2970473141230879, + "grad_norm": 1.6751802870583177, + "learning_rate": 4.974357582921501e-05, + "loss": 1.1076, + "step": 2505 + }, + { + "epoch": 0.29716589588521286, + "grad_norm": 1.4878367576048521, + "learning_rate": 4.974323280168231e-05, + "loss": 0.9715, + "step": 2506 + }, + { + "epoch": 0.29728447764733784, + "grad_norm": 1.4687316386714155, + "learning_rate": 4.9742889546047356e-05, + "loss": 0.8072, + "step": 2507 + }, + { + "epoch": 0.2974030594094628, + "grad_norm": 1.4020801471124975, + "learning_rate": 4.974254606231329e-05, + "loss": 0.838, + "step": 2508 + }, + { + "epoch": 0.2975216411715878, + "grad_norm": 1.4308406017369033, + "learning_rate": 4.97422023504833e-05, + "loss": 1.0792, + "step": 2509 + }, + { + "epoch": 0.2976402229337128, + "grad_norm": 1.6967117127736508, + "learning_rate": 4.9741858410560546e-05, + "loss": 1.077, + "step": 2510 + }, + { + "epoch": 0.29775880469583776, + "grad_norm": 1.5168504739120334, + "learning_rate": 4.97415142425482e-05, + "loss": 0.8955, + "step": 2511 + }, + { + "epoch": 0.29787738645796275, + "grad_norm": 1.4729172729059716, + "learning_rate": 4.974116984644943e-05, + "loss": 0.722, + "step": 2512 + }, + { + "epoch": 0.2979959682200877, + "grad_norm": 1.6603166010675723, + "learning_rate": 4.974082522226741e-05, + "loss": 0.9697, + "step": 2513 + }, + { + "epoch": 0.29811454998221276, + "grad_norm": 1.4771539161281177, + "learning_rate": 4.9740480370005324e-05, + "loss": 0.9614, + "step": 2514 + }, + { + "epoch": 0.29823313174433774, + "grad_norm": 1.2436134692586345, + "learning_rate": 4.9740135289666346e-05, + "loss": 0.5818, + "step": 2515 + }, + { + "epoch": 0.2983517135064627, + "grad_norm": 1.474971165310201, + "learning_rate": 4.973978998125366e-05, + "loss": 1.0244, + "step": 2516 + }, + { + "epoch": 0.2984702952685877, + "grad_norm": 1.5170666539393374, + "learning_rate": 4.973944444477045e-05, + "loss": 0.922, + "step": 2517 + }, + { + "epoch": 0.2985888770307127, + "grad_norm": 1.890515246560515, + "learning_rate": 4.97390986802199e-05, + "loss": 0.9076, + "step": 2518 + }, + { + "epoch": 0.29870745879283767, + "grad_norm": 1.55404467869752, + "learning_rate": 4.973875268760519e-05, + "loss": 0.9594, + "step": 2519 + }, + { + "epoch": 0.29882604055496265, + "grad_norm": 1.4606248062809595, + "learning_rate": 4.973840646692953e-05, + "loss": 0.7005, + "step": 2520 + }, + { + "epoch": 0.2989446223170876, + "grad_norm": 1.6411613131427558, + "learning_rate": 4.9738060018196085e-05, + "loss": 1.1581, + "step": 2521 + }, + { + "epoch": 0.2990632040792126, + "grad_norm": 1.466427373853806, + "learning_rate": 4.973771334140807e-05, + "loss": 0.7981, + "step": 2522 + }, + { + "epoch": 0.2991817858413376, + "grad_norm": 1.487091372512933, + "learning_rate": 4.973736643656867e-05, + "loss": 0.8585, + "step": 2523 + }, + { + "epoch": 0.29930036760346257, + "grad_norm": 1.6807062201555787, + "learning_rate": 4.9737019303681085e-05, + "loss": 0.9157, + "step": 2524 + }, + { + "epoch": 0.29941894936558755, + "grad_norm": 1.5828903958457703, + "learning_rate": 4.9736671942748526e-05, + "loss": 1.0478, + "step": 2525 + }, + { + "epoch": 0.29953753112771253, + "grad_norm": 1.4624945339510396, + "learning_rate": 4.973632435377418e-05, + "loss": 1.0318, + "step": 2526 + }, + { + "epoch": 0.29965611288983757, + "grad_norm": 1.2269386621513128, + "learning_rate": 4.9735976536761256e-05, + "loss": 0.7653, + "step": 2527 + }, + { + "epoch": 0.29977469465196255, + "grad_norm": 1.5215876464080502, + "learning_rate": 4.973562849171296e-05, + "loss": 0.8221, + "step": 2528 + }, + { + "epoch": 0.29989327641408753, + "grad_norm": 1.607624170716136, + "learning_rate": 4.973528021863251e-05, + "loss": 0.9775, + "step": 2529 + }, + { + "epoch": 0.3000118581762125, + "grad_norm": 1.645590583933082, + "learning_rate": 4.97349317175231e-05, + "loss": 0.931, + "step": 2530 + }, + { + "epoch": 0.3001304399383375, + "grad_norm": 1.4391606533173735, + "learning_rate": 4.9734582988387954e-05, + "loss": 0.9401, + "step": 2531 + }, + { + "epoch": 0.30024902170046247, + "grad_norm": 1.6451140065260246, + "learning_rate": 4.973423403123029e-05, + "loss": 1.0595, + "step": 2532 + }, + { + "epoch": 0.30036760346258745, + "grad_norm": 1.6129355092836206, + "learning_rate": 4.9733884846053314e-05, + "loss": 1.0479, + "step": 2533 + }, + { + "epoch": 0.30048618522471243, + "grad_norm": 1.5091322616920457, + "learning_rate": 4.9733535432860246e-05, + "loss": 0.9707, + "step": 2534 + }, + { + "epoch": 0.3006047669868374, + "grad_norm": 1.2857608346899063, + "learning_rate": 4.973318579165432e-05, + "loss": 0.6603, + "step": 2535 + }, + { + "epoch": 0.3007233487489624, + "grad_norm": 1.4144053479158973, + "learning_rate": 4.973283592243875e-05, + "loss": 0.877, + "step": 2536 + }, + { + "epoch": 0.3008419305110874, + "grad_norm": 1.6757614114070765, + "learning_rate": 4.973248582521676e-05, + "loss": 1.0667, + "step": 2537 + }, + { + "epoch": 0.30096051227321235, + "grad_norm": 1.7293242060366243, + "learning_rate": 4.9732135499991575e-05, + "loss": 0.8467, + "step": 2538 + }, + { + "epoch": 0.3010790940353374, + "grad_norm": 1.3856228030841544, + "learning_rate": 4.9731784946766435e-05, + "loss": 0.8273, + "step": 2539 + }, + { + "epoch": 0.30119767579746237, + "grad_norm": 1.3508044585325354, + "learning_rate": 4.973143416554457e-05, + "loss": 0.8825, + "step": 2540 + }, + { + "epoch": 0.30131625755958735, + "grad_norm": 1.5219045444439614, + "learning_rate": 4.9731083156329196e-05, + "loss": 0.8772, + "step": 2541 + }, + { + "epoch": 0.30143483932171233, + "grad_norm": 1.4213795692867122, + "learning_rate": 4.9730731919123575e-05, + "loss": 0.9348, + "step": 2542 + }, + { + "epoch": 0.3015534210838373, + "grad_norm": 1.6097094915903436, + "learning_rate": 4.9730380453930926e-05, + "loss": 0.8343, + "step": 2543 + }, + { + "epoch": 0.3016720028459623, + "grad_norm": 1.6377048285092566, + "learning_rate": 4.97300287607545e-05, + "loss": 1.0675, + "step": 2544 + }, + { + "epoch": 0.3017905846080873, + "grad_norm": 1.422753237788405, + "learning_rate": 4.972967683959753e-05, + "loss": 0.9265, + "step": 2545 + }, + { + "epoch": 0.30190916637021226, + "grad_norm": 1.6579036301187857, + "learning_rate": 4.972932469046326e-05, + "loss": 1.1289, + "step": 2546 + }, + { + "epoch": 0.30202774813233724, + "grad_norm": 1.6965738314625463, + "learning_rate": 4.972897231335496e-05, + "loss": 0.9131, + "step": 2547 + }, + { + "epoch": 0.3021463298944622, + "grad_norm": 1.6105316742110862, + "learning_rate": 4.972861970827585e-05, + "loss": 0.9157, + "step": 2548 + }, + { + "epoch": 0.3022649116565872, + "grad_norm": 1.5336097991416138, + "learning_rate": 4.9728266875229187e-05, + "loss": 0.8835, + "step": 2549 + }, + { + "epoch": 0.3023834934187122, + "grad_norm": 1.6969230508997102, + "learning_rate": 4.972791381421823e-05, + "loss": 1.0882, + "step": 2550 + }, + { + "epoch": 0.30250207518083716, + "grad_norm": 1.8540808408366418, + "learning_rate": 4.9727560525246234e-05, + "loss": 0.935, + "step": 2551 + }, + { + "epoch": 0.3026206569429622, + "grad_norm": 1.8986895243253412, + "learning_rate": 4.972720700831645e-05, + "loss": 1.3103, + "step": 2552 + }, + { + "epoch": 0.3027392387050872, + "grad_norm": 1.475388450852394, + "learning_rate": 4.972685326343213e-05, + "loss": 0.8544, + "step": 2553 + }, + { + "epoch": 0.30285782046721216, + "grad_norm": 1.4505693549702259, + "learning_rate": 4.972649929059656e-05, + "loss": 0.8083, + "step": 2554 + }, + { + "epoch": 0.30297640222933714, + "grad_norm": 1.2993510353824953, + "learning_rate": 4.972614508981298e-05, + "loss": 0.8974, + "step": 2555 + }, + { + "epoch": 0.3030949839914621, + "grad_norm": 1.530886957891809, + "learning_rate": 4.972579066108467e-05, + "loss": 0.9336, + "step": 2556 + }, + { + "epoch": 0.3032135657535871, + "grad_norm": 1.3390048195672548, + "learning_rate": 4.972543600441488e-05, + "loss": 0.847, + "step": 2557 + }, + { + "epoch": 0.3033321475157121, + "grad_norm": 1.4820355050045977, + "learning_rate": 4.9725081119806906e-05, + "loss": 0.9657, + "step": 2558 + }, + { + "epoch": 0.30345072927783706, + "grad_norm": 1.3398319578874784, + "learning_rate": 4.972472600726399e-05, + "loss": 0.9304, + "step": 2559 + }, + { + "epoch": 0.30356931103996204, + "grad_norm": 1.3590243926911865, + "learning_rate": 4.9724370666789424e-05, + "loss": 1.0812, + "step": 2560 + }, + { + "epoch": 0.303687892802087, + "grad_norm": 1.466930909587523, + "learning_rate": 4.972401509838648e-05, + "loss": 0.7857, + "step": 2561 + }, + { + "epoch": 0.303806474564212, + "grad_norm": 1.7058262201137064, + "learning_rate": 4.972365930205844e-05, + "loss": 0.7288, + "step": 2562 + }, + { + "epoch": 0.303925056326337, + "grad_norm": 1.587130364137291, + "learning_rate": 4.972330327780857e-05, + "loss": 0.8536, + "step": 2563 + }, + { + "epoch": 0.304043638088462, + "grad_norm": 1.715769603271045, + "learning_rate": 4.972294702564017e-05, + "loss": 0.7871, + "step": 2564 + }, + { + "epoch": 0.304162219850587, + "grad_norm": 1.6852421762577392, + "learning_rate": 4.9722590545556516e-05, + "loss": 0.8263, + "step": 2565 + }, + { + "epoch": 0.304280801612712, + "grad_norm": 1.5455181894644734, + "learning_rate": 4.972223383756089e-05, + "loss": 0.755, + "step": 2566 + }, + { + "epoch": 0.30439938337483696, + "grad_norm": 1.4859971423366658, + "learning_rate": 4.972187690165658e-05, + "loss": 0.6916, + "step": 2567 + }, + { + "epoch": 0.30451796513696194, + "grad_norm": 1.7446534965871885, + "learning_rate": 4.972151973784689e-05, + "loss": 0.8266, + "step": 2568 + }, + { + "epoch": 0.3046365468990869, + "grad_norm": 2.010701507626782, + "learning_rate": 4.97211623461351e-05, + "loss": 0.984, + "step": 2569 + }, + { + "epoch": 0.3047551286612119, + "grad_norm": 1.8291165760086305, + "learning_rate": 4.9720804726524504e-05, + "loss": 0.9564, + "step": 2570 + }, + { + "epoch": 0.3048737104233369, + "grad_norm": 1.5580193151761066, + "learning_rate": 4.9720446879018414e-05, + "loss": 0.8761, + "step": 2571 + }, + { + "epoch": 0.30499229218546187, + "grad_norm": 1.4154722087209504, + "learning_rate": 4.972008880362011e-05, + "loss": 0.7328, + "step": 2572 + }, + { + "epoch": 0.30511087394758685, + "grad_norm": 1.4905693002974378, + "learning_rate": 4.9719730500332895e-05, + "loss": 0.8294, + "step": 2573 + }, + { + "epoch": 0.3052294557097118, + "grad_norm": 1.632979845700117, + "learning_rate": 4.9719371969160086e-05, + "loss": 0.9339, + "step": 2574 + }, + { + "epoch": 0.3053480374718368, + "grad_norm": 1.4621386131526322, + "learning_rate": 4.971901321010498e-05, + "loss": 0.8916, + "step": 2575 + }, + { + "epoch": 0.3054666192339618, + "grad_norm": 1.4814258969399958, + "learning_rate": 4.9718654223170884e-05, + "loss": 0.9591, + "step": 2576 + }, + { + "epoch": 0.3055852009960868, + "grad_norm": 1.2969267923326158, + "learning_rate": 4.971829500836111e-05, + "loss": 0.7788, + "step": 2577 + }, + { + "epoch": 0.3057037827582118, + "grad_norm": 1.4427795961104461, + "learning_rate": 4.971793556567896e-05, + "loss": 0.7664, + "step": 2578 + }, + { + "epoch": 0.3058223645203368, + "grad_norm": 1.6143677658883506, + "learning_rate": 4.971757589512777e-05, + "loss": 0.9594, + "step": 2579 + }, + { + "epoch": 0.30594094628246177, + "grad_norm": 1.5199534968373927, + "learning_rate": 4.971721599671083e-05, + "loss": 0.8734, + "step": 2580 + }, + { + "epoch": 0.30605952804458675, + "grad_norm": 1.7345809244002213, + "learning_rate": 4.971685587043147e-05, + "loss": 0.9996, + "step": 2581 + }, + { + "epoch": 0.30617810980671173, + "grad_norm": 1.2547650177353251, + "learning_rate": 4.9716495516293006e-05, + "loss": 0.5779, + "step": 2582 + }, + { + "epoch": 0.3062966915688367, + "grad_norm": 1.449675058444994, + "learning_rate": 4.971613493429877e-05, + "loss": 0.8739, + "step": 2583 + }, + { + "epoch": 0.3064152733309617, + "grad_norm": 1.380035294120362, + "learning_rate": 4.9715774124452076e-05, + "loss": 0.674, + "step": 2584 + }, + { + "epoch": 0.30653385509308667, + "grad_norm": 1.5749245337471962, + "learning_rate": 4.971541308675625e-05, + "loss": 0.8355, + "step": 2585 + }, + { + "epoch": 0.30665243685521165, + "grad_norm": 1.47932020546663, + "learning_rate": 4.971505182121462e-05, + "loss": 0.6311, + "step": 2586 + }, + { + "epoch": 0.30677101861733663, + "grad_norm": 1.5421659610812786, + "learning_rate": 4.971469032783053e-05, + "loss": 0.7349, + "step": 2587 + }, + { + "epoch": 0.3068896003794616, + "grad_norm": 1.4579061987067918, + "learning_rate": 4.9714328606607296e-05, + "loss": 0.5985, + "step": 2588 + }, + { + "epoch": 0.30700818214158665, + "grad_norm": 1.7765789338821452, + "learning_rate": 4.971396665754826e-05, + "loss": 0.9933, + "step": 2589 + }, + { + "epoch": 0.30712676390371163, + "grad_norm": 1.6220379350890641, + "learning_rate": 4.971360448065676e-05, + "loss": 0.7988, + "step": 2590 + }, + { + "epoch": 0.3072453456658366, + "grad_norm": 1.5878553627182814, + "learning_rate": 4.971324207593613e-05, + "loss": 0.8747, + "step": 2591 + }, + { + "epoch": 0.3073639274279616, + "grad_norm": 1.4551534605513254, + "learning_rate": 4.9712879443389715e-05, + "loss": 0.9099, + "step": 2592 + }, + { + "epoch": 0.3074825091900866, + "grad_norm": 1.5078157084711428, + "learning_rate": 4.9712516583020854e-05, + "loss": 0.9787, + "step": 2593 + }, + { + "epoch": 0.30760109095221155, + "grad_norm": 1.4219650038883662, + "learning_rate": 4.971215349483289e-05, + "loss": 0.6654, + "step": 2594 + }, + { + "epoch": 0.30771967271433653, + "grad_norm": 1.5018418918247791, + "learning_rate": 4.971179017882919e-05, + "loss": 0.841, + "step": 2595 + }, + { + "epoch": 0.3078382544764615, + "grad_norm": 1.478086151175857, + "learning_rate": 4.9711426635013076e-05, + "loss": 0.5684, + "step": 2596 + }, + { + "epoch": 0.3079568362385865, + "grad_norm": 1.7522568180604647, + "learning_rate": 4.9711062863387915e-05, + "loss": 0.9465, + "step": 2597 + }, + { + "epoch": 0.3080754180007115, + "grad_norm": 1.4582486901807175, + "learning_rate": 4.971069886395706e-05, + "loss": 0.775, + "step": 2598 + }, + { + "epoch": 0.30819399976283646, + "grad_norm": 1.8510213048731483, + "learning_rate": 4.971033463672385e-05, + "loss": 0.7992, + "step": 2599 + }, + { + "epoch": 0.30831258152496144, + "grad_norm": 1.6151131238928218, + "learning_rate": 4.970997018169167e-05, + "loss": 0.7525, + "step": 2600 + }, + { + "epoch": 0.3084311632870864, + "grad_norm": 1.6836601205663049, + "learning_rate": 4.9709605498863865e-05, + "loss": 0.9163, + "step": 2601 + }, + { + "epoch": 0.30854974504921145, + "grad_norm": 1.7062777568567788, + "learning_rate": 4.97092405882438e-05, + "loss": 1.0714, + "step": 2602 + }, + { + "epoch": 0.30866832681133644, + "grad_norm": 1.3897848854666475, + "learning_rate": 4.970887544983483e-05, + "loss": 0.7758, + "step": 2603 + }, + { + "epoch": 0.3087869085734614, + "grad_norm": 1.822950464884883, + "learning_rate": 4.970851008364034e-05, + "loss": 0.8179, + "step": 2604 + }, + { + "epoch": 0.3089054903355864, + "grad_norm": 1.5606074460297912, + "learning_rate": 4.9708144489663675e-05, + "loss": 0.7607, + "step": 2605 + }, + { + "epoch": 0.3090240720977114, + "grad_norm": 1.6103331138012362, + "learning_rate": 4.970777866790822e-05, + "loss": 1.0334, + "step": 2606 + }, + { + "epoch": 0.30914265385983636, + "grad_norm": 1.3150429649914626, + "learning_rate": 4.970741261837734e-05, + "loss": 0.6621, + "step": 2607 + }, + { + "epoch": 0.30926123562196134, + "grad_norm": 1.5630751426917011, + "learning_rate": 4.970704634107442e-05, + "loss": 0.8753, + "step": 2608 + }, + { + "epoch": 0.3093798173840863, + "grad_norm": 1.4200733841857318, + "learning_rate": 4.9706679836002836e-05, + "loss": 0.7946, + "step": 2609 + }, + { + "epoch": 0.3094983991462113, + "grad_norm": 1.833466194263778, + "learning_rate": 4.970631310316595e-05, + "loss": 0.8894, + "step": 2610 + }, + { + "epoch": 0.3096169809083363, + "grad_norm": 1.3462887980066904, + "learning_rate": 4.970594614256716e-05, + "loss": 0.745, + "step": 2611 + }, + { + "epoch": 0.30973556267046126, + "grad_norm": 1.4386182090423194, + "learning_rate": 4.970557895420984e-05, + "loss": 0.8175, + "step": 2612 + }, + { + "epoch": 0.30985414443258624, + "grad_norm": 1.5409815392447845, + "learning_rate": 4.970521153809738e-05, + "loss": 0.7803, + "step": 2613 + }, + { + "epoch": 0.3099727261947113, + "grad_norm": 1.3749228546246528, + "learning_rate": 4.9704843894233164e-05, + "loss": 0.5846, + "step": 2614 + }, + { + "epoch": 0.31009130795683626, + "grad_norm": 1.5263414253033938, + "learning_rate": 4.970447602262058e-05, + "loss": 0.6874, + "step": 2615 + }, + { + "epoch": 0.31020988971896124, + "grad_norm": 1.7794791138627477, + "learning_rate": 4.970410792326303e-05, + "loss": 0.9489, + "step": 2616 + }, + { + "epoch": 0.3103284714810862, + "grad_norm": 1.760711225890816, + "learning_rate": 4.970373959616389e-05, + "loss": 1.063, + "step": 2617 + }, + { + "epoch": 0.3104470532432112, + "grad_norm": 1.5668682954357929, + "learning_rate": 4.970337104132657e-05, + "loss": 0.9784, + "step": 2618 + }, + { + "epoch": 0.3105656350053362, + "grad_norm": 1.635818448919912, + "learning_rate": 4.970300225875446e-05, + "loss": 0.9689, + "step": 2619 + }, + { + "epoch": 0.31068421676746116, + "grad_norm": 1.9725092733775906, + "learning_rate": 4.970263324845096e-05, + "loss": 0.8159, + "step": 2620 + }, + { + "epoch": 0.31080279852958614, + "grad_norm": 1.5012003477409537, + "learning_rate": 4.970226401041948e-05, + "loss": 0.9158, + "step": 2621 + }, + { + "epoch": 0.3109213802917111, + "grad_norm": 2.0004391485720228, + "learning_rate": 4.970189454466342e-05, + "loss": 0.8863, + "step": 2622 + }, + { + "epoch": 0.3110399620538361, + "grad_norm": 1.6453957143104205, + "learning_rate": 4.970152485118618e-05, + "loss": 0.8352, + "step": 2623 + }, + { + "epoch": 0.3111585438159611, + "grad_norm": 1.4973576448470858, + "learning_rate": 4.970115492999117e-05, + "loss": 1.0024, + "step": 2624 + }, + { + "epoch": 0.31127712557808607, + "grad_norm": 1.5803617812198987, + "learning_rate": 4.9700784781081796e-05, + "loss": 0.827, + "step": 2625 + }, + { + "epoch": 0.3113957073402111, + "grad_norm": 1.7660374366710427, + "learning_rate": 4.970041440446148e-05, + "loss": 0.9267, + "step": 2626 + }, + { + "epoch": 0.3115142891023361, + "grad_norm": 1.6011907976076205, + "learning_rate": 4.970004380013364e-05, + "loss": 0.9827, + "step": 2627 + }, + { + "epoch": 0.31163287086446106, + "grad_norm": 1.6072789459622534, + "learning_rate": 4.969967296810168e-05, + "loss": 0.9632, + "step": 2628 + }, + { + "epoch": 0.31175145262658605, + "grad_norm": 1.3322846308689626, + "learning_rate": 4.9699301908369025e-05, + "loss": 0.7393, + "step": 2629 + }, + { + "epoch": 0.311870034388711, + "grad_norm": 1.4847747822411748, + "learning_rate": 4.969893062093909e-05, + "loss": 1.0349, + "step": 2630 + }, + { + "epoch": 0.311988616150836, + "grad_norm": 1.3543384788385817, + "learning_rate": 4.969855910581531e-05, + "loss": 0.9077, + "step": 2631 + }, + { + "epoch": 0.312107197912961, + "grad_norm": 1.5721906040927294, + "learning_rate": 4.96981873630011e-05, + "loss": 1.0494, + "step": 2632 + }, + { + "epoch": 0.31222577967508597, + "grad_norm": 1.7243191094359915, + "learning_rate": 4.9697815392499894e-05, + "loss": 0.8057, + "step": 2633 + }, + { + "epoch": 0.31234436143721095, + "grad_norm": 1.4379544903133519, + "learning_rate": 4.9697443194315105e-05, + "loss": 0.9455, + "step": 2634 + }, + { + "epoch": 0.31246294319933593, + "grad_norm": 1.3368235556260124, + "learning_rate": 4.9697070768450184e-05, + "loss": 0.8406, + "step": 2635 + }, + { + "epoch": 0.3125815249614609, + "grad_norm": 1.3095789823362, + "learning_rate": 4.969669811490856e-05, + "loss": 0.7857, + "step": 2636 + }, + { + "epoch": 0.3127001067235859, + "grad_norm": 1.429464353700218, + "learning_rate": 4.9696325233693655e-05, + "loss": 0.9826, + "step": 2637 + }, + { + "epoch": 0.31281868848571087, + "grad_norm": 1.6413598191801042, + "learning_rate": 4.969595212480892e-05, + "loss": 0.9167, + "step": 2638 + }, + { + "epoch": 0.3129372702478359, + "grad_norm": 1.432700278522245, + "learning_rate": 4.969557878825779e-05, + "loss": 0.7019, + "step": 2639 + }, + { + "epoch": 0.3130558520099609, + "grad_norm": 1.5048084279873926, + "learning_rate": 4.96952052240437e-05, + "loss": 0.8958, + "step": 2640 + }, + { + "epoch": 0.31317443377208587, + "grad_norm": 1.7805021359273707, + "learning_rate": 4.969483143217012e-05, + "loss": 0.9748, + "step": 2641 + }, + { + "epoch": 0.31329301553421085, + "grad_norm": 1.389726066484738, + "learning_rate": 4.969445741264046e-05, + "loss": 0.7684, + "step": 2642 + }, + { + "epoch": 0.31341159729633583, + "grad_norm": 1.3655791222700386, + "learning_rate": 4.969408316545819e-05, + "loss": 0.8109, + "step": 2643 + }, + { + "epoch": 0.3135301790584608, + "grad_norm": 1.6702168018498835, + "learning_rate": 4.969370869062676e-05, + "loss": 0.8914, + "step": 2644 + }, + { + "epoch": 0.3136487608205858, + "grad_norm": 1.5746687507393857, + "learning_rate": 4.969333398814961e-05, + "loss": 0.9526, + "step": 2645 + }, + { + "epoch": 0.3137673425827108, + "grad_norm": 1.3273528179635306, + "learning_rate": 4.96929590580302e-05, + "loss": 0.914, + "step": 2646 + }, + { + "epoch": 0.31388592434483575, + "grad_norm": 1.4826094318989829, + "learning_rate": 4.9692583900271996e-05, + "loss": 0.8494, + "step": 2647 + }, + { + "epoch": 0.31400450610696073, + "grad_norm": 1.5554826478075097, + "learning_rate": 4.9692208514878444e-05, + "loss": 0.6351, + "step": 2648 + }, + { + "epoch": 0.3141230878690857, + "grad_norm": 1.7102026713677392, + "learning_rate": 4.9691832901853006e-05, + "loss": 0.9757, + "step": 2649 + }, + { + "epoch": 0.3142416696312107, + "grad_norm": 1.6146548626478319, + "learning_rate": 4.969145706119915e-05, + "loss": 0.9551, + "step": 2650 + }, + { + "epoch": 0.31436025139333573, + "grad_norm": 1.3389391007527207, + "learning_rate": 4.969108099292035e-05, + "loss": 0.7567, + "step": 2651 + }, + { + "epoch": 0.3144788331554607, + "grad_norm": 1.6437079099692233, + "learning_rate": 4.969070469702005e-05, + "loss": 0.961, + "step": 2652 + }, + { + "epoch": 0.3145974149175857, + "grad_norm": 1.6589477068104022, + "learning_rate": 4.9690328173501734e-05, + "loss": 0.8826, + "step": 2653 + }, + { + "epoch": 0.3147159966797107, + "grad_norm": 1.1360292348701742, + "learning_rate": 4.9689951422368866e-05, + "loss": 0.6122, + "step": 2654 + }, + { + "epoch": 0.31483457844183566, + "grad_norm": 1.51308237033301, + "learning_rate": 4.968957444362492e-05, + "loss": 0.9353, + "step": 2655 + }, + { + "epoch": 0.31495316020396064, + "grad_norm": 1.6549714117141376, + "learning_rate": 4.968919723727338e-05, + "loss": 1.1647, + "step": 2656 + }, + { + "epoch": 0.3150717419660856, + "grad_norm": 1.7603797713426332, + "learning_rate": 4.968881980331772e-05, + "loss": 0.7734, + "step": 2657 + }, + { + "epoch": 0.3151903237282106, + "grad_norm": 1.5695819809878564, + "learning_rate": 4.968844214176142e-05, + "loss": 0.7266, + "step": 2658 + }, + { + "epoch": 0.3153089054903356, + "grad_norm": 1.6032529088845697, + "learning_rate": 4.968806425260794e-05, + "loss": 1.0162, + "step": 2659 + }, + { + "epoch": 0.31542748725246056, + "grad_norm": 1.3886534291144894, + "learning_rate": 4.96876861358608e-05, + "loss": 0.8603, + "step": 2660 + }, + { + "epoch": 0.31554606901458554, + "grad_norm": 1.4051979279527427, + "learning_rate": 4.968730779152346e-05, + "loss": 0.742, + "step": 2661 + }, + { + "epoch": 0.3156646507767105, + "grad_norm": 1.486309612007322, + "learning_rate": 4.968692921959942e-05, + "loss": 0.6914, + "step": 2662 + }, + { + "epoch": 0.3157832325388355, + "grad_norm": 1.644788206480169, + "learning_rate": 4.968655042009216e-05, + "loss": 0.919, + "step": 2663 + }, + { + "epoch": 0.31590181430096054, + "grad_norm": 1.4512727914309704, + "learning_rate": 4.9686171393005174e-05, + "loss": 0.7048, + "step": 2664 + }, + { + "epoch": 0.3160203960630855, + "grad_norm": 1.4854024363489926, + "learning_rate": 4.968579213834197e-05, + "loss": 0.6105, + "step": 2665 + }, + { + "epoch": 0.3161389778252105, + "grad_norm": 1.649956629720194, + "learning_rate": 4.968541265610603e-05, + "loss": 0.995, + "step": 2666 + }, + { + "epoch": 0.3162575595873355, + "grad_norm": 1.4139044989294303, + "learning_rate": 4.9685032946300855e-05, + "loss": 0.5695, + "step": 2667 + }, + { + "epoch": 0.31637614134946046, + "grad_norm": 1.6930928952874693, + "learning_rate": 4.968465300892996e-05, + "loss": 0.8023, + "step": 2668 + }, + { + "epoch": 0.31649472311158544, + "grad_norm": 1.7352741521881156, + "learning_rate": 4.968427284399681e-05, + "loss": 0.8135, + "step": 2669 + }, + { + "epoch": 0.3166133048737104, + "grad_norm": 1.7099178691237082, + "learning_rate": 4.9683892451504955e-05, + "loss": 0.7621, + "step": 2670 + }, + { + "epoch": 0.3167318866358354, + "grad_norm": 1.8840255962750865, + "learning_rate": 4.968351183145787e-05, + "loss": 0.7483, + "step": 2671 + }, + { + "epoch": 0.3168504683979604, + "grad_norm": 1.8020612147598967, + "learning_rate": 4.9683130983859086e-05, + "loss": 0.8415, + "step": 2672 + }, + { + "epoch": 0.31696905016008536, + "grad_norm": 2.1774718662523913, + "learning_rate": 4.968274990871209e-05, + "loss": 0.9584, + "step": 2673 + }, + { + "epoch": 0.31708763192221034, + "grad_norm": 1.618495300544766, + "learning_rate": 4.968236860602041e-05, + "loss": 0.789, + "step": 2674 + }, + { + "epoch": 0.3172062136843353, + "grad_norm": 1.7209048977814367, + "learning_rate": 4.9681987075787576e-05, + "loss": 0.7891, + "step": 2675 + }, + { + "epoch": 0.31732479544646036, + "grad_norm": 1.5649218262829656, + "learning_rate": 4.9681605318017066e-05, + "loss": 0.9379, + "step": 2676 + }, + { + "epoch": 0.31744337720858534, + "grad_norm": 1.4423781212296602, + "learning_rate": 4.968122333271243e-05, + "loss": 0.7404, + "step": 2677 + }, + { + "epoch": 0.3175619589707103, + "grad_norm": 1.4483519041783814, + "learning_rate": 4.9680841119877184e-05, + "loss": 0.993, + "step": 2678 + }, + { + "epoch": 0.3176805407328353, + "grad_norm": 1.5923483763814457, + "learning_rate": 4.968045867951484e-05, + "loss": 0.895, + "step": 2679 + }, + { + "epoch": 0.3177991224949603, + "grad_norm": 1.5844197951628598, + "learning_rate": 4.968007601162894e-05, + "loss": 0.8953, + "step": 2680 + }, + { + "epoch": 0.31791770425708527, + "grad_norm": 1.3874534619501644, + "learning_rate": 4.9679693116223e-05, + "loss": 0.8436, + "step": 2681 + }, + { + "epoch": 0.31803628601921025, + "grad_norm": 1.1986487574212918, + "learning_rate": 4.967930999330056e-05, + "loss": 0.6752, + "step": 2682 + }, + { + "epoch": 0.3181548677813352, + "grad_norm": 1.598011115707571, + "learning_rate": 4.967892664286514e-05, + "loss": 0.878, + "step": 2683 + }, + { + "epoch": 0.3182734495434602, + "grad_norm": 1.8463185876241386, + "learning_rate": 4.967854306492028e-05, + "loss": 1.1226, + "step": 2684 + }, + { + "epoch": 0.3183920313055852, + "grad_norm": 1.405426475257274, + "learning_rate": 4.967815925946951e-05, + "loss": 0.9405, + "step": 2685 + }, + { + "epoch": 0.31851061306771017, + "grad_norm": 1.4328218943422086, + "learning_rate": 4.967777522651639e-05, + "loss": 0.9408, + "step": 2686 + }, + { + "epoch": 0.31862919482983515, + "grad_norm": 1.4000156335004044, + "learning_rate": 4.967739096606443e-05, + "loss": 1.1054, + "step": 2687 + }, + { + "epoch": 0.31874777659196013, + "grad_norm": 1.42324732835267, + "learning_rate": 4.9677006478117195e-05, + "loss": 0.6972, + "step": 2688 + }, + { + "epoch": 0.31886635835408517, + "grad_norm": 1.5464415545746586, + "learning_rate": 4.967662176267822e-05, + "loss": 0.8433, + "step": 2689 + }, + { + "epoch": 0.31898494011621015, + "grad_norm": 1.483059713974556, + "learning_rate": 4.967623681975105e-05, + "loss": 0.7086, + "step": 2690 + }, + { + "epoch": 0.31910352187833513, + "grad_norm": 1.3967410649533993, + "learning_rate": 4.9675851649339234e-05, + "loss": 0.8798, + "step": 2691 + }, + { + "epoch": 0.3192221036404601, + "grad_norm": 1.713256336302126, + "learning_rate": 4.967546625144633e-05, + "loss": 0.9062, + "step": 2692 + }, + { + "epoch": 0.3193406854025851, + "grad_norm": 1.8156979784314708, + "learning_rate": 4.9675080626075885e-05, + "loss": 1.1516, + "step": 2693 + }, + { + "epoch": 0.31945926716471007, + "grad_norm": 1.7429895717933346, + "learning_rate": 4.967469477323146e-05, + "loss": 1.0175, + "step": 2694 + }, + { + "epoch": 0.31957784892683505, + "grad_norm": 1.2684034593628921, + "learning_rate": 4.967430869291659e-05, + "loss": 0.5538, + "step": 2695 + }, + { + "epoch": 0.31969643068896003, + "grad_norm": 1.6112117856295023, + "learning_rate": 4.967392238513487e-05, + "loss": 0.8699, + "step": 2696 + }, + { + "epoch": 0.319815012451085, + "grad_norm": 1.7298997979520403, + "learning_rate": 4.9673535849889834e-05, + "loss": 0.7953, + "step": 2697 + }, + { + "epoch": 0.31993359421321, + "grad_norm": 1.4851996145557502, + "learning_rate": 4.967314908718506e-05, + "loss": 0.6843, + "step": 2698 + }, + { + "epoch": 0.320052175975335, + "grad_norm": 1.5034993079186254, + "learning_rate": 4.9672762097024106e-05, + "loss": 0.7931, + "step": 2699 + }, + { + "epoch": 0.32017075773745995, + "grad_norm": 1.6694327106982263, + "learning_rate": 4.9672374879410536e-05, + "loss": 0.8841, + "step": 2700 + }, + { + "epoch": 0.320289339499585, + "grad_norm": 1.5191427430641669, + "learning_rate": 4.967198743434793e-05, + "loss": 0.8175, + "step": 2701 + }, + { + "epoch": 0.32040792126170997, + "grad_norm": 1.898652546812242, + "learning_rate": 4.967159976183985e-05, + "loss": 0.6929, + "step": 2702 + }, + { + "epoch": 0.32052650302383495, + "grad_norm": 1.4914277779343135, + "learning_rate": 4.9671211861889874e-05, + "loss": 0.9222, + "step": 2703 + }, + { + "epoch": 0.32064508478595993, + "grad_norm": 1.4914992246795422, + "learning_rate": 4.967082373450158e-05, + "loss": 0.8679, + "step": 2704 + }, + { + "epoch": 0.3207636665480849, + "grad_norm": 1.639095466114579, + "learning_rate": 4.967043537967855e-05, + "loss": 0.7045, + "step": 2705 + }, + { + "epoch": 0.3208822483102099, + "grad_norm": 1.6225478976843937, + "learning_rate": 4.967004679742436e-05, + "loss": 0.758, + "step": 2706 + }, + { + "epoch": 0.3210008300723349, + "grad_norm": 1.6415244527084527, + "learning_rate": 4.966965798774258e-05, + "loss": 0.7123, + "step": 2707 + }, + { + "epoch": 0.32111941183445986, + "grad_norm": 1.7771767698210414, + "learning_rate": 4.9669268950636815e-05, + "loss": 0.8649, + "step": 2708 + }, + { + "epoch": 0.32123799359658484, + "grad_norm": 1.427174591417389, + "learning_rate": 4.966887968611064e-05, + "loss": 0.6612, + "step": 2709 + }, + { + "epoch": 0.3213565753587098, + "grad_norm": 2.2487945318053058, + "learning_rate": 4.966849019416764e-05, + "loss": 1.0242, + "step": 2710 + }, + { + "epoch": 0.3214751571208348, + "grad_norm": 1.8989091526360655, + "learning_rate": 4.966810047481142e-05, + "loss": 0.8491, + "step": 2711 + }, + { + "epoch": 0.3215937388829598, + "grad_norm": 1.4886489194472363, + "learning_rate": 4.966771052804555e-05, + "loss": 0.6541, + "step": 2712 + }, + { + "epoch": 0.32171232064508476, + "grad_norm": 1.9425249096952568, + "learning_rate": 4.966732035387365e-05, + "loss": 0.7138, + "step": 2713 + }, + { + "epoch": 0.3218309024072098, + "grad_norm": 1.5603168881662295, + "learning_rate": 4.96669299522993e-05, + "loss": 0.8729, + "step": 2714 + }, + { + "epoch": 0.3219494841693348, + "grad_norm": 1.4372828896893561, + "learning_rate": 4.9666539323326106e-05, + "loss": 0.7963, + "step": 2715 + }, + { + "epoch": 0.32206806593145976, + "grad_norm": 1.4630052135750407, + "learning_rate": 4.966614846695767e-05, + "loss": 0.7536, + "step": 2716 + }, + { + "epoch": 0.32218664769358474, + "grad_norm": 1.7179239631099212, + "learning_rate": 4.966575738319759e-05, + "loss": 0.7555, + "step": 2717 + }, + { + "epoch": 0.3223052294557097, + "grad_norm": 1.777404528290335, + "learning_rate": 4.966536607204948e-05, + "loss": 1.0564, + "step": 2718 + }, + { + "epoch": 0.3224238112178347, + "grad_norm": 1.2913568877915826, + "learning_rate": 4.966497453351693e-05, + "loss": 0.7498, + "step": 2719 + }, + { + "epoch": 0.3225423929799597, + "grad_norm": 1.5428853804520588, + "learning_rate": 4.966458276760357e-05, + "loss": 0.89, + "step": 2720 + }, + { + "epoch": 0.32266097474208466, + "grad_norm": 1.5111616582472553, + "learning_rate": 4.9664190774313e-05, + "loss": 0.7881, + "step": 2721 + }, + { + "epoch": 0.32277955650420964, + "grad_norm": 1.4869216829819702, + "learning_rate": 4.9663798553648834e-05, + "loss": 0.8756, + "step": 2722 + }, + { + "epoch": 0.3228981382663346, + "grad_norm": 1.4748085806439688, + "learning_rate": 4.966340610561469e-05, + "loss": 0.8295, + "step": 2723 + }, + { + "epoch": 0.3230167200284596, + "grad_norm": 1.6795243846872943, + "learning_rate": 4.966301343021419e-05, + "loss": 0.9624, + "step": 2724 + }, + { + "epoch": 0.3231353017905846, + "grad_norm": 1.5703761146324269, + "learning_rate": 4.966262052745094e-05, + "loss": 0.8193, + "step": 2725 + }, + { + "epoch": 0.3232538835527096, + "grad_norm": 1.4118069984151005, + "learning_rate": 4.966222739732859e-05, + "loss": 0.9, + "step": 2726 + }, + { + "epoch": 0.3233724653148346, + "grad_norm": 1.5061008334007595, + "learning_rate": 4.966183403985073e-05, + "loss": 0.7043, + "step": 2727 + }, + { + "epoch": 0.3234910470769596, + "grad_norm": 1.5529059476868734, + "learning_rate": 4.966144045502101e-05, + "loss": 0.8262, + "step": 2728 + }, + { + "epoch": 0.32360962883908456, + "grad_norm": 1.7654291314310488, + "learning_rate": 4.966104664284305e-05, + "loss": 1.0376, + "step": 2729 + }, + { + "epoch": 0.32372821060120954, + "grad_norm": 1.6535180662890188, + "learning_rate": 4.966065260332048e-05, + "loss": 0.9945, + "step": 2730 + }, + { + "epoch": 0.3238467923633345, + "grad_norm": 1.3232890366181171, + "learning_rate": 4.966025833645693e-05, + "loss": 0.8465, + "step": 2731 + }, + { + "epoch": 0.3239653741254595, + "grad_norm": 1.8552384102608477, + "learning_rate": 4.965986384225605e-05, + "loss": 0.9806, + "step": 2732 + }, + { + "epoch": 0.3240839558875845, + "grad_norm": 1.6633362597333998, + "learning_rate": 4.9659469120721456e-05, + "loss": 1.0228, + "step": 2733 + }, + { + "epoch": 0.32420253764970947, + "grad_norm": 1.8039601811575963, + "learning_rate": 4.96590741718568e-05, + "loss": 1.0149, + "step": 2734 + }, + { + "epoch": 0.32432111941183445, + "grad_norm": 1.496510299099411, + "learning_rate": 4.965867899566572e-05, + "loss": 0.8182, + "step": 2735 + }, + { + "epoch": 0.3244397011739594, + "grad_norm": 1.5003034536532909, + "learning_rate": 4.9658283592151855e-05, + "loss": 0.7613, + "step": 2736 + }, + { + "epoch": 0.3245582829360844, + "grad_norm": 1.9844302900875372, + "learning_rate": 4.965788796131885e-05, + "loss": 1.1964, + "step": 2737 + }, + { + "epoch": 0.3246768646982094, + "grad_norm": 1.415547845892196, + "learning_rate": 4.965749210317037e-05, + "loss": 0.8273, + "step": 2738 + }, + { + "epoch": 0.3247954464603344, + "grad_norm": 1.4280341983919356, + "learning_rate": 4.9657096017710034e-05, + "loss": 0.9482, + "step": 2739 + }, + { + "epoch": 0.3249140282224594, + "grad_norm": 1.4673923875595514, + "learning_rate": 4.965669970494151e-05, + "loss": 1.0265, + "step": 2740 + }, + { + "epoch": 0.3250326099845844, + "grad_norm": 1.5639943883450238, + "learning_rate": 4.9656303164868454e-05, + "loss": 0.8925, + "step": 2741 + }, + { + "epoch": 0.32515119174670937, + "grad_norm": 1.2772132468819737, + "learning_rate": 4.965590639749452e-05, + "loss": 0.7292, + "step": 2742 + }, + { + "epoch": 0.32526977350883435, + "grad_norm": 1.3935507203213573, + "learning_rate": 4.965550940282337e-05, + "loss": 0.8946, + "step": 2743 + }, + { + "epoch": 0.32538835527095933, + "grad_norm": 1.3417704909645944, + "learning_rate": 4.9655112180858656e-05, + "loss": 0.769, + "step": 2744 + }, + { + "epoch": 0.3255069370330843, + "grad_norm": 1.6819792822071984, + "learning_rate": 4.9654714731604036e-05, + "loss": 1.049, + "step": 2745 + }, + { + "epoch": 0.3256255187952093, + "grad_norm": 1.7624632124897714, + "learning_rate": 4.965431705506318e-05, + "loss": 0.9167, + "step": 2746 + }, + { + "epoch": 0.32574410055733427, + "grad_norm": 1.8125552206933606, + "learning_rate": 4.965391915123975e-05, + "loss": 0.9906, + "step": 2747 + }, + { + "epoch": 0.32586268231945925, + "grad_norm": 1.5585853476730893, + "learning_rate": 4.965352102013743e-05, + "loss": 0.9564, + "step": 2748 + }, + { + "epoch": 0.32598126408158423, + "grad_norm": 1.4782312524801715, + "learning_rate": 4.9653122661759866e-05, + "loss": 0.8484, + "step": 2749 + }, + { + "epoch": 0.3260998458437092, + "grad_norm": 1.492183036900416, + "learning_rate": 4.9652724076110754e-05, + "loss": 0.8403, + "step": 2750 + }, + { + "epoch": 0.32621842760583425, + "grad_norm": 1.6551330285410444, + "learning_rate": 4.965232526319375e-05, + "loss": 1.0068, + "step": 2751 + }, + { + "epoch": 0.32633700936795923, + "grad_norm": 1.4293310224809268, + "learning_rate": 4.9651926223012536e-05, + "loss": 0.7726, + "step": 2752 + }, + { + "epoch": 0.3264555911300842, + "grad_norm": 1.5640227735740697, + "learning_rate": 4.9651526955570795e-05, + "loss": 0.7291, + "step": 2753 + }, + { + "epoch": 0.3265741728922092, + "grad_norm": 1.5129509564578678, + "learning_rate": 4.965112746087221e-05, + "loss": 0.7347, + "step": 2754 + }, + { + "epoch": 0.32669275465433417, + "grad_norm": 1.7005463169985402, + "learning_rate": 4.9650727738920456e-05, + "loss": 1.119, + "step": 2755 + }, + { + "epoch": 0.32681133641645915, + "grad_norm": 1.7813645301392322, + "learning_rate": 4.965032778971922e-05, + "loss": 0.9366, + "step": 2756 + }, + { + "epoch": 0.32692991817858413, + "grad_norm": 1.6658920949671843, + "learning_rate": 4.9649927613272184e-05, + "loss": 0.9145, + "step": 2757 + }, + { + "epoch": 0.3270484999407091, + "grad_norm": 1.5530916201485885, + "learning_rate": 4.964952720958305e-05, + "loss": 0.7077, + "step": 2758 + }, + { + "epoch": 0.3271670817028341, + "grad_norm": 1.983070495308045, + "learning_rate": 4.96491265786555e-05, + "loss": 1.0396, + "step": 2759 + }, + { + "epoch": 0.3272856634649591, + "grad_norm": 1.6429419397195484, + "learning_rate": 4.964872572049324e-05, + "loss": 0.9702, + "step": 2760 + }, + { + "epoch": 0.32740424522708406, + "grad_norm": 1.567598038429543, + "learning_rate": 4.964832463509994e-05, + "loss": 0.9339, + "step": 2761 + }, + { + "epoch": 0.32752282698920904, + "grad_norm": 1.4689677873242704, + "learning_rate": 4.964792332247932e-05, + "loss": 0.8379, + "step": 2762 + }, + { + "epoch": 0.327641408751334, + "grad_norm": 1.6379684180806189, + "learning_rate": 4.964752178263508e-05, + "loss": 0.8064, + "step": 2763 + }, + { + "epoch": 0.32775999051345905, + "grad_norm": 1.6612423213752732, + "learning_rate": 4.9647120015570894e-05, + "loss": 0.9853, + "step": 2764 + }, + { + "epoch": 0.32787857227558403, + "grad_norm": 1.5492941388907633, + "learning_rate": 4.96467180212905e-05, + "loss": 0.8689, + "step": 2765 + }, + { + "epoch": 0.327997154037709, + "grad_norm": 1.4700239159035646, + "learning_rate": 4.964631579979758e-05, + "loss": 0.8644, + "step": 2766 + }, + { + "epoch": 0.328115735799834, + "grad_norm": 1.7397612302161707, + "learning_rate": 4.964591335109586e-05, + "loss": 0.8844, + "step": 2767 + }, + { + "epoch": 0.328234317561959, + "grad_norm": 1.4451745171867123, + "learning_rate": 4.964551067518904e-05, + "loss": 0.8543, + "step": 2768 + }, + { + "epoch": 0.32835289932408396, + "grad_norm": 1.3240068738470825, + "learning_rate": 4.9645107772080834e-05, + "loss": 0.6118, + "step": 2769 + }, + { + "epoch": 0.32847148108620894, + "grad_norm": 1.5794189295605323, + "learning_rate": 4.9644704641774963e-05, + "loss": 1.0382, + "step": 2770 + }, + { + "epoch": 0.3285900628483339, + "grad_norm": 1.4758957542508462, + "learning_rate": 4.964430128427513e-05, + "loss": 0.7699, + "step": 2771 + }, + { + "epoch": 0.3287086446104589, + "grad_norm": 1.6164342833908791, + "learning_rate": 4.9643897699585056e-05, + "loss": 0.8955, + "step": 2772 + }, + { + "epoch": 0.3288272263725839, + "grad_norm": 1.6202558170918955, + "learning_rate": 4.964349388770847e-05, + "loss": 0.8362, + "step": 2773 + }, + { + "epoch": 0.32894580813470886, + "grad_norm": 1.4348110533810172, + "learning_rate": 4.964308984864909e-05, + "loss": 0.7605, + "step": 2774 + }, + { + "epoch": 0.32906438989683384, + "grad_norm": 1.4218686201839146, + "learning_rate": 4.964268558241064e-05, + "loss": 0.67, + "step": 2775 + }, + { + "epoch": 0.3291829716589589, + "grad_norm": 1.525256457738026, + "learning_rate": 4.964228108899684e-05, + "loss": 0.6838, + "step": 2776 + }, + { + "epoch": 0.32930155342108386, + "grad_norm": 1.4343599954804653, + "learning_rate": 4.964187636841144e-05, + "loss": 0.7929, + "step": 2777 + }, + { + "epoch": 0.32942013518320884, + "grad_norm": 1.6673166920532625, + "learning_rate": 4.9641471420658154e-05, + "loss": 0.7381, + "step": 2778 + }, + { + "epoch": 0.3295387169453338, + "grad_norm": 1.5933448255015839, + "learning_rate": 4.964106624574072e-05, + "loss": 0.7367, + "step": 2779 + }, + { + "epoch": 0.3296572987074588, + "grad_norm": 1.6123113902170438, + "learning_rate": 4.9640660843662865e-05, + "loss": 0.8604, + "step": 2780 + }, + { + "epoch": 0.3297758804695838, + "grad_norm": 2.1716368279114278, + "learning_rate": 4.964025521442834e-05, + "loss": 0.9571, + "step": 2781 + }, + { + "epoch": 0.32989446223170876, + "grad_norm": 2.169942548016722, + "learning_rate": 4.9639849358040874e-05, + "loss": 0.9362, + "step": 2782 + }, + { + "epoch": 0.33001304399383374, + "grad_norm": 1.834795633740072, + "learning_rate": 4.963944327450422e-05, + "loss": 0.841, + "step": 2783 + }, + { + "epoch": 0.3301316257559587, + "grad_norm": 1.84713720722264, + "learning_rate": 4.96390369638221e-05, + "loss": 0.6122, + "step": 2784 + }, + { + "epoch": 0.3302502075180837, + "grad_norm": 1.34013181753607, + "learning_rate": 4.9638630425998285e-05, + "loss": 0.6473, + "step": 2785 + }, + { + "epoch": 0.3303687892802087, + "grad_norm": 1.6089334234479604, + "learning_rate": 4.963822366103651e-05, + "loss": 0.9525, + "step": 2786 + }, + { + "epoch": 0.33048737104233367, + "grad_norm": 1.8312307755719461, + "learning_rate": 4.963781666894052e-05, + "loss": 1.0734, + "step": 2787 + }, + { + "epoch": 0.33060595280445865, + "grad_norm": 1.65368028349116, + "learning_rate": 4.9637409449714076e-05, + "loss": 0.9452, + "step": 2788 + }, + { + "epoch": 0.3307245345665837, + "grad_norm": 1.520381340220029, + "learning_rate": 4.963700200336093e-05, + "loss": 0.9324, + "step": 2789 + }, + { + "epoch": 0.33084311632870866, + "grad_norm": 1.5031168137028363, + "learning_rate": 4.963659432988484e-05, + "loss": 1.0041, + "step": 2790 + }, + { + "epoch": 0.33096169809083364, + "grad_norm": 1.8458510736368334, + "learning_rate": 4.963618642928956e-05, + "loss": 0.9691, + "step": 2791 + }, + { + "epoch": 0.3310802798529586, + "grad_norm": 1.4212338358323864, + "learning_rate": 4.963577830157885e-05, + "loss": 0.7868, + "step": 2792 + }, + { + "epoch": 0.3311988616150836, + "grad_norm": 1.48049567821041, + "learning_rate": 4.963536994675648e-05, + "loss": 0.8595, + "step": 2793 + }, + { + "epoch": 0.3313174433772086, + "grad_norm": 1.5153884021235164, + "learning_rate": 4.9634961364826206e-05, + "loss": 0.8579, + "step": 2794 + }, + { + "epoch": 0.33143602513933357, + "grad_norm": 1.594545454096086, + "learning_rate": 4.963455255579179e-05, + "loss": 0.9207, + "step": 2795 + }, + { + "epoch": 0.33155460690145855, + "grad_norm": 1.4923294895537007, + "learning_rate": 4.9634143519657014e-05, + "loss": 0.7358, + "step": 2796 + }, + { + "epoch": 0.33167318866358353, + "grad_norm": 1.4473185155513781, + "learning_rate": 4.9633734256425644e-05, + "loss": 0.7292, + "step": 2797 + }, + { + "epoch": 0.3317917704257085, + "grad_norm": 1.54527523654456, + "learning_rate": 4.963332476610145e-05, + "loss": 0.8929, + "step": 2798 + }, + { + "epoch": 0.3319103521878335, + "grad_norm": 1.72342725632366, + "learning_rate": 4.963291504868822e-05, + "loss": 0.9952, + "step": 2799 + }, + { + "epoch": 0.33202893394995847, + "grad_norm": 1.3668676221515605, + "learning_rate": 4.963250510418971e-05, + "loss": 0.7836, + "step": 2800 + }, + { + "epoch": 0.3321475157120835, + "grad_norm": 1.6140539758231358, + "learning_rate": 4.963209493260971e-05, + "loss": 0.8181, + "step": 2801 + }, + { + "epoch": 0.3322660974742085, + "grad_norm": 1.5186487093143384, + "learning_rate": 4.9631684533951996e-05, + "loss": 0.6405, + "step": 2802 + }, + { + "epoch": 0.33238467923633347, + "grad_norm": 2.081595154271113, + "learning_rate": 4.9631273908220364e-05, + "loss": 0.9064, + "step": 2803 + }, + { + "epoch": 0.33250326099845845, + "grad_norm": 1.4832673412218698, + "learning_rate": 4.9630863055418586e-05, + "loss": 0.8756, + "step": 2804 + }, + { + "epoch": 0.33262184276058343, + "grad_norm": 2.1309771132101045, + "learning_rate": 4.963045197555046e-05, + "loss": 0.7442, + "step": 2805 + }, + { + "epoch": 0.3327404245227084, + "grad_norm": 1.653285934336091, + "learning_rate": 4.963004066861977e-05, + "loss": 0.776, + "step": 2806 + }, + { + "epoch": 0.3328590062848334, + "grad_norm": 1.8030736793595858, + "learning_rate": 4.9629629134630306e-05, + "loss": 0.7965, + "step": 2807 + }, + { + "epoch": 0.3329775880469584, + "grad_norm": 1.5396209943692214, + "learning_rate": 4.962921737358587e-05, + "loss": 0.6777, + "step": 2808 + }, + { + "epoch": 0.33309616980908335, + "grad_norm": 1.3184378776717363, + "learning_rate": 4.962880538549024e-05, + "loss": 0.5342, + "step": 2809 + }, + { + "epoch": 0.33321475157120833, + "grad_norm": 1.3876034318785482, + "learning_rate": 4.962839317034723e-05, + "loss": 0.6402, + "step": 2810 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.3753280647202086, + "learning_rate": 4.962798072816065e-05, + "loss": 0.5978, + "step": 2811 + }, + { + "epoch": 0.3334519150954583, + "grad_norm": 1.660007722079822, + "learning_rate": 4.9627568058934274e-05, + "loss": 0.8813, + "step": 2812 + }, + { + "epoch": 0.3335704968575833, + "grad_norm": 1.9798030696132103, + "learning_rate": 4.962715516267192e-05, + "loss": 1.0693, + "step": 2813 + }, + { + "epoch": 0.3336890786197083, + "grad_norm": 1.5290415078224444, + "learning_rate": 4.962674203937741e-05, + "loss": 0.8477, + "step": 2814 + }, + { + "epoch": 0.3338076603818333, + "grad_norm": 1.5955194156465384, + "learning_rate": 4.962632868905452e-05, + "loss": 0.6505, + "step": 2815 + }, + { + "epoch": 0.3339262421439583, + "grad_norm": 1.4776903754309634, + "learning_rate": 4.962591511170709e-05, + "loss": 0.7045, + "step": 2816 + }, + { + "epoch": 0.33404482390608325, + "grad_norm": 1.6896571199719752, + "learning_rate": 4.962550130733891e-05, + "loss": 0.7483, + "step": 2817 + }, + { + "epoch": 0.33416340566820824, + "grad_norm": 1.5807609014437038, + "learning_rate": 4.962508727595382e-05, + "loss": 0.7775, + "step": 2818 + }, + { + "epoch": 0.3342819874303332, + "grad_norm": 1.9799473341463127, + "learning_rate": 4.962467301755561e-05, + "loss": 0.9784, + "step": 2819 + }, + { + "epoch": 0.3344005691924582, + "grad_norm": 1.724025274585934, + "learning_rate": 4.9624258532148114e-05, + "loss": 0.9772, + "step": 2820 + }, + { + "epoch": 0.3345191509545832, + "grad_norm": 1.7189110824497051, + "learning_rate": 4.962384381973516e-05, + "loss": 1.0267, + "step": 2821 + }, + { + "epoch": 0.33463773271670816, + "grad_norm": 1.414210404307473, + "learning_rate": 4.9623428880320556e-05, + "loss": 0.6544, + "step": 2822 + }, + { + "epoch": 0.33475631447883314, + "grad_norm": 1.654770549949875, + "learning_rate": 4.962301371390813e-05, + "loss": 0.8016, + "step": 2823 + }, + { + "epoch": 0.3348748962409581, + "grad_norm": 1.460637026310849, + "learning_rate": 4.962259832050171e-05, + "loss": 0.8311, + "step": 2824 + }, + { + "epoch": 0.3349934780030831, + "grad_norm": 1.6273694842527817, + "learning_rate": 4.962218270010513e-05, + "loss": 0.7325, + "step": 2825 + }, + { + "epoch": 0.33511205976520814, + "grad_norm": 1.4939469121769486, + "learning_rate": 4.9621766852722224e-05, + "loss": 0.8355, + "step": 2826 + }, + { + "epoch": 0.3352306415273331, + "grad_norm": 1.7283197028868467, + "learning_rate": 4.962135077835682e-05, + "loss": 0.7549, + "step": 2827 + }, + { + "epoch": 0.3353492232894581, + "grad_norm": 1.9593240951420299, + "learning_rate": 4.962093447701275e-05, + "loss": 1.3127, + "step": 2828 + }, + { + "epoch": 0.3354678050515831, + "grad_norm": 1.4316475921318357, + "learning_rate": 4.962051794869386e-05, + "loss": 0.8735, + "step": 2829 + }, + { + "epoch": 0.33558638681370806, + "grad_norm": 1.3372951686821986, + "learning_rate": 4.962010119340399e-05, + "loss": 0.7494, + "step": 2830 + }, + { + "epoch": 0.33570496857583304, + "grad_norm": 1.3746263273388082, + "learning_rate": 4.961968421114697e-05, + "loss": 0.8242, + "step": 2831 + }, + { + "epoch": 0.335823550337958, + "grad_norm": 1.5671956067437434, + "learning_rate": 4.961926700192665e-05, + "loss": 0.9125, + "step": 2832 + }, + { + "epoch": 0.335942132100083, + "grad_norm": 1.5245976019969716, + "learning_rate": 4.9618849565746886e-05, + "loss": 0.7008, + "step": 2833 + }, + { + "epoch": 0.336060713862208, + "grad_norm": 1.6046706174628338, + "learning_rate": 4.961843190261152e-05, + "loss": 0.7532, + "step": 2834 + }, + { + "epoch": 0.33617929562433296, + "grad_norm": 1.4291737061886658, + "learning_rate": 4.9618014012524395e-05, + "loss": 0.8646, + "step": 2835 + }, + { + "epoch": 0.33629787738645794, + "grad_norm": 2.240609001073133, + "learning_rate": 4.9617595895489375e-05, + "loss": 0.8901, + "step": 2836 + }, + { + "epoch": 0.3364164591485829, + "grad_norm": 1.7589961771319107, + "learning_rate": 4.961717755151031e-05, + "loss": 0.9592, + "step": 2837 + }, + { + "epoch": 0.3365350409107079, + "grad_norm": 1.3448616237641957, + "learning_rate": 4.961675898059104e-05, + "loss": 0.455, + "step": 2838 + }, + { + "epoch": 0.33665362267283294, + "grad_norm": 1.1899828364806053, + "learning_rate": 4.9616340182735457e-05, + "loss": 0.6567, + "step": 2839 + }, + { + "epoch": 0.3367722044349579, + "grad_norm": 1.5444616283351622, + "learning_rate": 4.96159211579474e-05, + "loss": 0.7597, + "step": 2840 + }, + { + "epoch": 0.3368907861970829, + "grad_norm": 1.507572360636242, + "learning_rate": 4.9615501906230734e-05, + "loss": 0.7383, + "step": 2841 + }, + { + "epoch": 0.3370093679592079, + "grad_norm": 2.057334748719598, + "learning_rate": 4.961508242758932e-05, + "loss": 0.9777, + "step": 2842 + }, + { + "epoch": 0.33712794972133286, + "grad_norm": 1.6090544302351908, + "learning_rate": 4.961466272202704e-05, + "loss": 0.8335, + "step": 2843 + }, + { + "epoch": 0.33724653148345785, + "grad_norm": 1.7514103983026081, + "learning_rate": 4.961424278954775e-05, + "loss": 0.802, + "step": 2844 + }, + { + "epoch": 0.3373651132455828, + "grad_norm": 1.4103369286139642, + "learning_rate": 4.961382263015533e-05, + "loss": 0.7398, + "step": 2845 + }, + { + "epoch": 0.3374836950077078, + "grad_norm": 1.4969483703583482, + "learning_rate": 4.9613402243853644e-05, + "loss": 0.7988, + "step": 2846 + }, + { + "epoch": 0.3376022767698328, + "grad_norm": 1.4832407581082243, + "learning_rate": 4.961298163064657e-05, + "loss": 0.6867, + "step": 2847 + }, + { + "epoch": 0.33772085853195777, + "grad_norm": 1.5675731404656152, + "learning_rate": 4.961256079053799e-05, + "loss": 0.9691, + "step": 2848 + }, + { + "epoch": 0.33783944029408275, + "grad_norm": 1.5585724472907976, + "learning_rate": 4.961213972353178e-05, + "loss": 0.9006, + "step": 2849 + }, + { + "epoch": 0.33795802205620773, + "grad_norm": 1.4264671873097026, + "learning_rate": 4.961171842963183e-05, + "loss": 1.0245, + "step": 2850 + }, + { + "epoch": 0.33807660381833277, + "grad_norm": 1.5627125350800048, + "learning_rate": 4.961129690884201e-05, + "loss": 0.9583, + "step": 2851 + }, + { + "epoch": 0.33819518558045775, + "grad_norm": 1.4026114383891077, + "learning_rate": 4.9610875161166214e-05, + "loss": 0.7778, + "step": 2852 + }, + { + "epoch": 0.3383137673425827, + "grad_norm": 1.4174218463707347, + "learning_rate": 4.961045318660833e-05, + "loss": 0.9183, + "step": 2853 + }, + { + "epoch": 0.3384323491047077, + "grad_norm": 1.5009330505642695, + "learning_rate": 4.9610030985172254e-05, + "loss": 0.7885, + "step": 2854 + }, + { + "epoch": 0.3385509308668327, + "grad_norm": 1.4590603134232503, + "learning_rate": 4.9609608556861864e-05, + "loss": 0.7576, + "step": 2855 + }, + { + "epoch": 0.33866951262895767, + "grad_norm": 1.4101792816062046, + "learning_rate": 4.960918590168106e-05, + "loss": 0.7472, + "step": 2856 + }, + { + "epoch": 0.33878809439108265, + "grad_norm": 1.377982379433936, + "learning_rate": 4.960876301963374e-05, + "loss": 0.7622, + "step": 2857 + }, + { + "epoch": 0.33890667615320763, + "grad_norm": 1.6369897854224613, + "learning_rate": 4.9608339910723805e-05, + "loss": 0.8572, + "step": 2858 + }, + { + "epoch": 0.3390252579153326, + "grad_norm": 1.7746473769897542, + "learning_rate": 4.960791657495515e-05, + "loss": 0.8703, + "step": 2859 + }, + { + "epoch": 0.3391438396774576, + "grad_norm": 1.7089409180061108, + "learning_rate": 4.9607493012331685e-05, + "loss": 0.7029, + "step": 2860 + }, + { + "epoch": 0.3392624214395826, + "grad_norm": 1.5637088972903435, + "learning_rate": 4.9607069222857306e-05, + "loss": 0.8632, + "step": 2861 + }, + { + "epoch": 0.33938100320170755, + "grad_norm": 1.8350189716827918, + "learning_rate": 4.960664520653593e-05, + "loss": 0.9446, + "step": 2862 + }, + { + "epoch": 0.3394995849638326, + "grad_norm": 1.8785922565171425, + "learning_rate": 4.960622096337145e-05, + "loss": 0.9769, + "step": 2863 + }, + { + "epoch": 0.33961816672595757, + "grad_norm": 1.7552565041603954, + "learning_rate": 4.960579649336779e-05, + "loss": 0.8059, + "step": 2864 + }, + { + "epoch": 0.33973674848808255, + "grad_norm": 1.49552979531624, + "learning_rate": 4.9605371796528864e-05, + "loss": 0.786, + "step": 2865 + }, + { + "epoch": 0.33985533025020753, + "grad_norm": 1.7502690392214282, + "learning_rate": 4.9604946872858585e-05, + "loss": 1.1297, + "step": 2866 + }, + { + "epoch": 0.3399739120123325, + "grad_norm": 1.6302192735156025, + "learning_rate": 4.960452172236086e-05, + "loss": 0.9746, + "step": 2867 + }, + { + "epoch": 0.3400924937744575, + "grad_norm": 1.670406605489306, + "learning_rate": 4.960409634503962e-05, + "loss": 1.0384, + "step": 2868 + }, + { + "epoch": 0.3402110755365825, + "grad_norm": 1.7978646739080515, + "learning_rate": 4.960367074089879e-05, + "loss": 0.9543, + "step": 2869 + }, + { + "epoch": 0.34032965729870746, + "grad_norm": 1.2808967466357806, + "learning_rate": 4.9603244909942274e-05, + "loss": 0.6514, + "step": 2870 + }, + { + "epoch": 0.34044823906083244, + "grad_norm": 1.4142144379265653, + "learning_rate": 4.960281885217402e-05, + "loss": 0.693, + "step": 2871 + }, + { + "epoch": 0.3405668208229574, + "grad_norm": 1.402273416382213, + "learning_rate": 4.9602392567597946e-05, + "loss": 0.7622, + "step": 2872 + }, + { + "epoch": 0.3406854025850824, + "grad_norm": 1.5503730452737443, + "learning_rate": 4.9601966056217974e-05, + "loss": 0.7811, + "step": 2873 + }, + { + "epoch": 0.3408039843472074, + "grad_norm": 1.6623532190872874, + "learning_rate": 4.960153931803805e-05, + "loss": 1.1486, + "step": 2874 + }, + { + "epoch": 0.34092256610933236, + "grad_norm": 1.3508075864689846, + "learning_rate": 4.9601112353062104e-05, + "loss": 0.8321, + "step": 2875 + }, + { + "epoch": 0.3410411478714574, + "grad_norm": 1.4919573525634406, + "learning_rate": 4.960068516129407e-05, + "loss": 1.1293, + "step": 2876 + }, + { + "epoch": 0.3411597296335824, + "grad_norm": 1.5536254652927997, + "learning_rate": 4.9600257742737885e-05, + "loss": 0.7924, + "step": 2877 + }, + { + "epoch": 0.34127831139570736, + "grad_norm": 1.7777949990598614, + "learning_rate": 4.959983009739748e-05, + "loss": 0.7825, + "step": 2878 + }, + { + "epoch": 0.34139689315783234, + "grad_norm": 1.489579095403158, + "learning_rate": 4.9599402225276825e-05, + "loss": 0.7607, + "step": 2879 + }, + { + "epoch": 0.3415154749199573, + "grad_norm": 1.7204057525117367, + "learning_rate": 4.9598974126379836e-05, + "loss": 0.8257, + "step": 2880 + }, + { + "epoch": 0.3416340566820823, + "grad_norm": 1.5548134399020723, + "learning_rate": 4.959854580071047e-05, + "loss": 0.7232, + "step": 2881 + }, + { + "epoch": 0.3417526384442073, + "grad_norm": 1.5267958144256801, + "learning_rate": 4.9598117248272684e-05, + "loss": 0.8521, + "step": 2882 + }, + { + "epoch": 0.34187122020633226, + "grad_norm": 1.7164534194811047, + "learning_rate": 4.959768846907042e-05, + "loss": 0.8078, + "step": 2883 + }, + { + "epoch": 0.34198980196845724, + "grad_norm": 2.121653714241097, + "learning_rate": 4.959725946310763e-05, + "loss": 1.0854, + "step": 2884 + }, + { + "epoch": 0.3421083837305822, + "grad_norm": 1.4848838808553477, + "learning_rate": 4.959683023038827e-05, + "loss": 0.6335, + "step": 2885 + }, + { + "epoch": 0.3422269654927072, + "grad_norm": 1.6805762609491637, + "learning_rate": 4.9596400770916295e-05, + "loss": 0.7168, + "step": 2886 + }, + { + "epoch": 0.3423455472548322, + "grad_norm": 1.791311544759708, + "learning_rate": 4.959597108469567e-05, + "loss": 0.7745, + "step": 2887 + }, + { + "epoch": 0.3424641290169572, + "grad_norm": 1.8439552954657066, + "learning_rate": 4.9595541171730355e-05, + "loss": 1.0191, + "step": 2888 + }, + { + "epoch": 0.3425827107790822, + "grad_norm": 1.8490681041122836, + "learning_rate": 4.959511103202431e-05, + "loss": 0.8734, + "step": 2889 + }, + { + "epoch": 0.3427012925412072, + "grad_norm": 1.6413582286901611, + "learning_rate": 4.959468066558151e-05, + "loss": 0.6911, + "step": 2890 + }, + { + "epoch": 0.34281987430333216, + "grad_norm": 1.6828091334256225, + "learning_rate": 4.95942500724059e-05, + "loss": 0.8553, + "step": 2891 + }, + { + "epoch": 0.34293845606545714, + "grad_norm": 1.3734560340484727, + "learning_rate": 4.959381925250147e-05, + "loss": 0.6126, + "step": 2892 + }, + { + "epoch": 0.3430570378275821, + "grad_norm": 1.7643219648293704, + "learning_rate": 4.9593388205872184e-05, + "loss": 0.9363, + "step": 2893 + }, + { + "epoch": 0.3431756195897071, + "grad_norm": 1.601004157565287, + "learning_rate": 4.9592956932522015e-05, + "loss": 0.8632, + "step": 2894 + }, + { + "epoch": 0.3432942013518321, + "grad_norm": 1.5337521076998217, + "learning_rate": 4.959252543245495e-05, + "loss": 0.8709, + "step": 2895 + }, + { + "epoch": 0.34341278311395707, + "grad_norm": 1.4482297831974633, + "learning_rate": 4.959209370567495e-05, + "loss": 0.7381, + "step": 2896 + }, + { + "epoch": 0.34353136487608205, + "grad_norm": 1.5149045970592943, + "learning_rate": 4.9591661752186e-05, + "loss": 0.8716, + "step": 2897 + }, + { + "epoch": 0.343649946638207, + "grad_norm": 1.4319461546205026, + "learning_rate": 4.959122957199209e-05, + "loss": 0.8863, + "step": 2898 + }, + { + "epoch": 0.343768528400332, + "grad_norm": 1.4846130598672929, + "learning_rate": 4.95907971650972e-05, + "loss": 0.8486, + "step": 2899 + }, + { + "epoch": 0.343887110162457, + "grad_norm": 1.4783318079775827, + "learning_rate": 4.959036453150532e-05, + "loss": 1.0275, + "step": 2900 + }, + { + "epoch": 0.344005691924582, + "grad_norm": 1.5878967901218546, + "learning_rate": 4.9589931671220426e-05, + "loss": 0.971, + "step": 2901 + }, + { + "epoch": 0.344124273686707, + "grad_norm": 1.5381404860042924, + "learning_rate": 4.9589498584246516e-05, + "loss": 0.8736, + "step": 2902 + }, + { + "epoch": 0.344242855448832, + "grad_norm": 1.4901685133057792, + "learning_rate": 4.958906527058759e-05, + "loss": 0.7956, + "step": 2903 + }, + { + "epoch": 0.34436143721095697, + "grad_norm": 1.4918793113371167, + "learning_rate": 4.958863173024763e-05, + "loss": 0.8814, + "step": 2904 + }, + { + "epoch": 0.34448001897308195, + "grad_norm": 1.5736887725373363, + "learning_rate": 4.9588197963230634e-05, + "loss": 0.8224, + "step": 2905 + }, + { + "epoch": 0.34459860073520693, + "grad_norm": 1.4567855777889704, + "learning_rate": 4.9587763969540615e-05, + "loss": 0.806, + "step": 2906 + }, + { + "epoch": 0.3447171824973319, + "grad_norm": 1.7400888865345627, + "learning_rate": 4.958732974918156e-05, + "loss": 1.1106, + "step": 2907 + }, + { + "epoch": 0.3448357642594569, + "grad_norm": 1.6463626810786787, + "learning_rate": 4.9586895302157466e-05, + "loss": 0.8044, + "step": 2908 + }, + { + "epoch": 0.34495434602158187, + "grad_norm": 2.0682076583454396, + "learning_rate": 4.958646062847235e-05, + "loss": 0.956, + "step": 2909 + }, + { + "epoch": 0.34507292778370685, + "grad_norm": 1.7049160832110697, + "learning_rate": 4.958602572813023e-05, + "loss": 0.8021, + "step": 2910 + }, + { + "epoch": 0.34519150954583183, + "grad_norm": 2.0271936111919713, + "learning_rate": 4.958559060113509e-05, + "loss": 0.7953, + "step": 2911 + }, + { + "epoch": 0.3453100913079568, + "grad_norm": 1.9357673191340123, + "learning_rate": 4.958515524749096e-05, + "loss": 1.1852, + "step": 2912 + }, + { + "epoch": 0.34542867307008185, + "grad_norm": 1.8974394315908816, + "learning_rate": 4.958471966720185e-05, + "loss": 0.8866, + "step": 2913 + }, + { + "epoch": 0.34554725483220683, + "grad_norm": 1.5913551807067745, + "learning_rate": 4.9584283860271766e-05, + "loss": 0.8123, + "step": 2914 + }, + { + "epoch": 0.3456658365943318, + "grad_norm": 1.6841784536416802, + "learning_rate": 4.958384782670473e-05, + "loss": 0.9463, + "step": 2915 + }, + { + "epoch": 0.3457844183564568, + "grad_norm": 1.4918550317000312, + "learning_rate": 4.958341156650477e-05, + "loss": 0.7815, + "step": 2916 + }, + { + "epoch": 0.34590300011858177, + "grad_norm": 1.5962074725422235, + "learning_rate": 4.95829750796759e-05, + "loss": 1.057, + "step": 2917 + }, + { + "epoch": 0.34602158188070675, + "grad_norm": 1.4915102033567327, + "learning_rate": 4.958253836622214e-05, + "loss": 0.8276, + "step": 2918 + }, + { + "epoch": 0.34614016364283173, + "grad_norm": 1.4575562855826463, + "learning_rate": 4.9582101426147524e-05, + "loss": 0.8253, + "step": 2919 + }, + { + "epoch": 0.3462587454049567, + "grad_norm": 1.3138631171233541, + "learning_rate": 4.958166425945608e-05, + "loss": 0.6927, + "step": 2920 + }, + { + "epoch": 0.3463773271670817, + "grad_norm": 1.5390301051786683, + "learning_rate": 4.958122686615183e-05, + "loss": 0.749, + "step": 2921 + }, + { + "epoch": 0.3464959089292067, + "grad_norm": 1.5113020196288256, + "learning_rate": 4.958078924623882e-05, + "loss": 0.8579, + "step": 2922 + }, + { + "epoch": 0.34661449069133166, + "grad_norm": 1.5584020877082554, + "learning_rate": 4.958035139972107e-05, + "loss": 0.8704, + "step": 2923 + }, + { + "epoch": 0.34673307245345664, + "grad_norm": 1.581652104238569, + "learning_rate": 4.957991332660262e-05, + "loss": 0.9784, + "step": 2924 + }, + { + "epoch": 0.3468516542155816, + "grad_norm": 1.3080677917923837, + "learning_rate": 4.957947502688752e-05, + "loss": 0.6602, + "step": 2925 + }, + { + "epoch": 0.34697023597770665, + "grad_norm": 1.6629534110270083, + "learning_rate": 4.95790365005798e-05, + "loss": 0.9093, + "step": 2926 + }, + { + "epoch": 0.34708881773983163, + "grad_norm": 1.3684825803811067, + "learning_rate": 4.95785977476835e-05, + "loss": 0.7229, + "step": 2927 + }, + { + "epoch": 0.3472073995019566, + "grad_norm": 1.699546159398271, + "learning_rate": 4.957815876820268e-05, + "loss": 0.9919, + "step": 2928 + }, + { + "epoch": 0.3473259812640816, + "grad_norm": 1.4523691409449255, + "learning_rate": 4.9577719562141354e-05, + "loss": 0.7973, + "step": 2929 + }, + { + "epoch": 0.3474445630262066, + "grad_norm": 1.7118166424198409, + "learning_rate": 4.957728012950361e-05, + "loss": 0.8463, + "step": 2930 + }, + { + "epoch": 0.34756314478833156, + "grad_norm": 1.4668118436738335, + "learning_rate": 4.957684047029348e-05, + "loss": 0.7836, + "step": 2931 + }, + { + "epoch": 0.34768172655045654, + "grad_norm": 1.436301002663976, + "learning_rate": 4.9576400584515016e-05, + "loss": 0.8215, + "step": 2932 + }, + { + "epoch": 0.3478003083125815, + "grad_norm": 1.5620087406130712, + "learning_rate": 4.9575960472172274e-05, + "loss": 0.8432, + "step": 2933 + }, + { + "epoch": 0.3479188900747065, + "grad_norm": 1.5848392804953346, + "learning_rate": 4.957552013326931e-05, + "loss": 0.802, + "step": 2934 + }, + { + "epoch": 0.3480374718368315, + "grad_norm": 1.658576623667986, + "learning_rate": 4.95750795678102e-05, + "loss": 0.7934, + "step": 2935 + }, + { + "epoch": 0.34815605359895646, + "grad_norm": 1.7102619746485916, + "learning_rate": 4.957463877579899e-05, + "loss": 0.8402, + "step": 2936 + }, + { + "epoch": 0.34827463536108144, + "grad_norm": 1.4234612054929487, + "learning_rate": 4.957419775723974e-05, + "loss": 0.9876, + "step": 2937 + }, + { + "epoch": 0.3483932171232065, + "grad_norm": 1.7310833873898996, + "learning_rate": 4.957375651213651e-05, + "loss": 0.9304, + "step": 2938 + }, + { + "epoch": 0.34851179888533146, + "grad_norm": 1.5280488326137254, + "learning_rate": 4.9573315040493396e-05, + "loss": 0.7575, + "step": 2939 + }, + { + "epoch": 0.34863038064745644, + "grad_norm": 1.4080776199050185, + "learning_rate": 4.9572873342314454e-05, + "loss": 0.6303, + "step": 2940 + }, + { + "epoch": 0.3487489624095814, + "grad_norm": 1.3147304229328767, + "learning_rate": 4.9572431417603734e-05, + "loss": 0.6367, + "step": 2941 + }, + { + "epoch": 0.3488675441717064, + "grad_norm": 1.7144716924788561, + "learning_rate": 4.957198926636535e-05, + "loss": 0.7634, + "step": 2942 + }, + { + "epoch": 0.3489861259338314, + "grad_norm": 1.6234874142773068, + "learning_rate": 4.957154688860335e-05, + "loss": 0.7326, + "step": 2943 + }, + { + "epoch": 0.34910470769595636, + "grad_norm": 1.556993440137025, + "learning_rate": 4.957110428432181e-05, + "loss": 0.8707, + "step": 2944 + }, + { + "epoch": 0.34922328945808134, + "grad_norm": 1.6126807612520484, + "learning_rate": 4.957066145352483e-05, + "loss": 0.8636, + "step": 2945 + }, + { + "epoch": 0.3493418712202063, + "grad_norm": 1.6328306517189772, + "learning_rate": 4.9570218396216484e-05, + "loss": 0.8413, + "step": 2946 + }, + { + "epoch": 0.3494604529823313, + "grad_norm": 1.4379299411178703, + "learning_rate": 4.9569775112400844e-05, + "loss": 0.7204, + "step": 2947 + }, + { + "epoch": 0.3495790347444563, + "grad_norm": 1.6388481213265393, + "learning_rate": 4.9569331602082015e-05, + "loss": 0.8935, + "step": 2948 + }, + { + "epoch": 0.34969761650658127, + "grad_norm": 1.6032559555797714, + "learning_rate": 4.956888786526407e-05, + "loss": 0.7418, + "step": 2949 + }, + { + "epoch": 0.34981619826870625, + "grad_norm": 1.4713099466171342, + "learning_rate": 4.956844390195111e-05, + "loss": 0.758, + "step": 2950 + }, + { + "epoch": 0.3499347800308313, + "grad_norm": 1.6545557755063045, + "learning_rate": 4.956799971214723e-05, + "loss": 0.8655, + "step": 2951 + }, + { + "epoch": 0.35005336179295626, + "grad_norm": 1.687222754277601, + "learning_rate": 4.956755529585651e-05, + "loss": 0.9229, + "step": 2952 + }, + { + "epoch": 0.35017194355508124, + "grad_norm": 1.3206347253748691, + "learning_rate": 4.956711065308307e-05, + "loss": 0.5424, + "step": 2953 + }, + { + "epoch": 0.3502905253172062, + "grad_norm": 1.8243610828010588, + "learning_rate": 4.956666578383099e-05, + "loss": 1.026, + "step": 2954 + }, + { + "epoch": 0.3504091070793312, + "grad_norm": 1.6742606863312652, + "learning_rate": 4.956622068810438e-05, + "loss": 0.9345, + "step": 2955 + }, + { + "epoch": 0.3505276888414562, + "grad_norm": 1.726377272865767, + "learning_rate": 4.9565775365907334e-05, + "loss": 0.8389, + "step": 2956 + }, + { + "epoch": 0.35064627060358117, + "grad_norm": 1.5685248012943251, + "learning_rate": 4.956532981724397e-05, + "loss": 0.7085, + "step": 2957 + }, + { + "epoch": 0.35076485236570615, + "grad_norm": 1.7795963662027212, + "learning_rate": 4.956488404211839e-05, + "loss": 0.9965, + "step": 2958 + }, + { + "epoch": 0.35088343412783113, + "grad_norm": 1.9510552695806374, + "learning_rate": 4.9564438040534694e-05, + "loss": 0.9712, + "step": 2959 + }, + { + "epoch": 0.3510020158899561, + "grad_norm": 1.3049175952627046, + "learning_rate": 4.9563991812497006e-05, + "loss": 0.7541, + "step": 2960 + }, + { + "epoch": 0.3511205976520811, + "grad_norm": 1.49258541064587, + "learning_rate": 4.9563545358009436e-05, + "loss": 0.7934, + "step": 2961 + }, + { + "epoch": 0.35123917941420607, + "grad_norm": 1.3036530677258027, + "learning_rate": 4.9563098677076106e-05, + "loss": 0.5105, + "step": 2962 + }, + { + "epoch": 0.3513577611763311, + "grad_norm": 1.4782586639585151, + "learning_rate": 4.956265176970112e-05, + "loss": 0.7925, + "step": 2963 + }, + { + "epoch": 0.3514763429384561, + "grad_norm": 1.5469636403054137, + "learning_rate": 4.95622046358886e-05, + "loss": 1.0086, + "step": 2964 + }, + { + "epoch": 0.35159492470058107, + "grad_norm": 1.5570075982462888, + "learning_rate": 4.956175727564268e-05, + "loss": 0.7613, + "step": 2965 + }, + { + "epoch": 0.35171350646270605, + "grad_norm": 1.573615697636306, + "learning_rate": 4.9561309688967486e-05, + "loss": 1.0269, + "step": 2966 + }, + { + "epoch": 0.35183208822483103, + "grad_norm": 1.4107038297967127, + "learning_rate": 4.956086187586712e-05, + "loss": 0.7883, + "step": 2967 + }, + { + "epoch": 0.351950669986956, + "grad_norm": 1.7837398546505594, + "learning_rate": 4.9560413836345746e-05, + "loss": 0.7122, + "step": 2968 + }, + { + "epoch": 0.352069251749081, + "grad_norm": 1.5832268673622538, + "learning_rate": 4.9559965570407454e-05, + "loss": 0.861, + "step": 2969 + }, + { + "epoch": 0.35218783351120597, + "grad_norm": 1.7826131916159293, + "learning_rate": 4.955951707805641e-05, + "loss": 0.7119, + "step": 2970 + }, + { + "epoch": 0.35230641527333095, + "grad_norm": 1.6887855542094625, + "learning_rate": 4.955906835929672e-05, + "loss": 0.7696, + "step": 2971 + }, + { + "epoch": 0.35242499703545593, + "grad_norm": 1.7890858742706244, + "learning_rate": 4.955861941413256e-05, + "loss": 0.7846, + "step": 2972 + }, + { + "epoch": 0.3525435787975809, + "grad_norm": 1.4596349390894916, + "learning_rate": 4.9558170242568024e-05, + "loss": 0.7303, + "step": 2973 + }, + { + "epoch": 0.3526621605597059, + "grad_norm": 1.923570221633485, + "learning_rate": 4.955772084460728e-05, + "loss": 0.8321, + "step": 2974 + }, + { + "epoch": 0.3527807423218309, + "grad_norm": 1.6502747991989648, + "learning_rate": 4.9557271220254467e-05, + "loss": 0.7305, + "step": 2975 + }, + { + "epoch": 0.3528993240839559, + "grad_norm": 1.6285583035323776, + "learning_rate": 4.955682136951373e-05, + "loss": 0.562, + "step": 2976 + }, + { + "epoch": 0.3530179058460809, + "grad_norm": 1.6674177660973286, + "learning_rate": 4.95563712923892e-05, + "loss": 0.7961, + "step": 2977 + }, + { + "epoch": 0.3531364876082059, + "grad_norm": 1.6808646309155737, + "learning_rate": 4.955592098888505e-05, + "loss": 0.9521, + "step": 2978 + }, + { + "epoch": 0.35325506937033085, + "grad_norm": 1.6761801043979336, + "learning_rate": 4.955547045900543e-05, + "loss": 0.8625, + "step": 2979 + }, + { + "epoch": 0.35337365113245583, + "grad_norm": 1.8601542255577375, + "learning_rate": 4.9555019702754465e-05, + "loss": 0.9694, + "step": 2980 + }, + { + "epoch": 0.3534922328945808, + "grad_norm": 1.5292544616314994, + "learning_rate": 4.955456872013634e-05, + "loss": 0.8918, + "step": 2981 + }, + { + "epoch": 0.3536108146567058, + "grad_norm": 1.4340198285115624, + "learning_rate": 4.95541175111552e-05, + "loss": 0.5483, + "step": 2982 + }, + { + "epoch": 0.3537293964188308, + "grad_norm": 1.5800962189833856, + "learning_rate": 4.955366607581521e-05, + "loss": 0.9857, + "step": 2983 + }, + { + "epoch": 0.35384797818095576, + "grad_norm": 1.5339571706420119, + "learning_rate": 4.955321441412053e-05, + "loss": 0.8825, + "step": 2984 + }, + { + "epoch": 0.35396655994308074, + "grad_norm": 1.5178552469809528, + "learning_rate": 4.955276252607531e-05, + "loss": 0.7755, + "step": 2985 + }, + { + "epoch": 0.3540851417052057, + "grad_norm": 1.569887299930257, + "learning_rate": 4.9552310411683746e-05, + "loss": 0.9048, + "step": 2986 + }, + { + "epoch": 0.3542037234673307, + "grad_norm": 1.4289257071239987, + "learning_rate": 4.9551858070949974e-05, + "loss": 0.7332, + "step": 2987 + }, + { + "epoch": 0.35432230522945574, + "grad_norm": 1.5354610492983838, + "learning_rate": 4.9551405503878184e-05, + "loss": 1.0801, + "step": 2988 + }, + { + "epoch": 0.3544408869915807, + "grad_norm": 1.5137555515960313, + "learning_rate": 4.955095271047254e-05, + "loss": 0.7765, + "step": 2989 + }, + { + "epoch": 0.3545594687537057, + "grad_norm": 1.9405116904435764, + "learning_rate": 4.955049969073722e-05, + "loss": 0.9585, + "step": 2990 + }, + { + "epoch": 0.3546780505158307, + "grad_norm": 1.362903773780495, + "learning_rate": 4.9550046444676404e-05, + "loss": 0.6924, + "step": 2991 + }, + { + "epoch": 0.35479663227795566, + "grad_norm": 1.5390871410249416, + "learning_rate": 4.9549592972294255e-05, + "loss": 0.9379, + "step": 2992 + }, + { + "epoch": 0.35491521404008064, + "grad_norm": 1.4105440772947262, + "learning_rate": 4.954913927359497e-05, + "loss": 0.8856, + "step": 2993 + }, + { + "epoch": 0.3550337958022056, + "grad_norm": 1.4581988888287054, + "learning_rate": 4.9548685348582716e-05, + "loss": 0.8131, + "step": 2994 + }, + { + "epoch": 0.3551523775643306, + "grad_norm": 1.719569923507095, + "learning_rate": 4.9548231197261694e-05, + "loss": 0.7819, + "step": 2995 + }, + { + "epoch": 0.3552709593264556, + "grad_norm": 1.488327990204134, + "learning_rate": 4.954777681963608e-05, + "loss": 0.8274, + "step": 2996 + }, + { + "epoch": 0.35538954108858056, + "grad_norm": 1.7581560916023447, + "learning_rate": 4.954732221571007e-05, + "loss": 0.8761, + "step": 2997 + }, + { + "epoch": 0.35550812285070554, + "grad_norm": 1.6533360909876231, + "learning_rate": 4.954686738548785e-05, + "loss": 0.7798, + "step": 2998 + }, + { + "epoch": 0.3556267046128305, + "grad_norm": 1.5634067505450038, + "learning_rate": 4.954641232897362e-05, + "loss": 0.9799, + "step": 2999 + }, + { + "epoch": 0.3557452863749555, + "grad_norm": 1.4407973692789942, + "learning_rate": 4.954595704617157e-05, + "loss": 0.7325, + "step": 3000 + }, + { + "epoch": 0.35586386813708054, + "grad_norm": 1.5072455336585853, + "learning_rate": 4.9545501537085883e-05, + "loss": 0.8016, + "step": 3001 + }, + { + "epoch": 0.3559824498992055, + "grad_norm": 1.5984563111311543, + "learning_rate": 4.9545045801720775e-05, + "loss": 0.744, + "step": 3002 + }, + { + "epoch": 0.3561010316613305, + "grad_norm": 1.562704885265823, + "learning_rate": 4.954458984008044e-05, + "loss": 0.7576, + "step": 3003 + }, + { + "epoch": 0.3562196134234555, + "grad_norm": 1.4125712517411957, + "learning_rate": 4.9544133652169104e-05, + "loss": 0.7392, + "step": 3004 + }, + { + "epoch": 0.35633819518558046, + "grad_norm": 1.725572241192222, + "learning_rate": 4.954367723799094e-05, + "loss": 1.0061, + "step": 3005 + }, + { + "epoch": 0.35645677694770544, + "grad_norm": 1.6055368127299405, + "learning_rate": 4.9543220597550165e-05, + "loss": 0.9393, + "step": 3006 + }, + { + "epoch": 0.3565753587098304, + "grad_norm": 1.8379252399502464, + "learning_rate": 4.9542763730851e-05, + "loss": 0.9558, + "step": 3007 + }, + { + "epoch": 0.3566939404719554, + "grad_norm": 1.247226392234636, + "learning_rate": 4.954230663789765e-05, + "loss": 0.4901, + "step": 3008 + }, + { + "epoch": 0.3568125222340804, + "grad_norm": 1.4178700508244053, + "learning_rate": 4.9541849318694325e-05, + "loss": 0.6649, + "step": 3009 + }, + { + "epoch": 0.35693110399620537, + "grad_norm": 1.4462425078907981, + "learning_rate": 4.9541391773245246e-05, + "loss": 0.5956, + "step": 3010 + }, + { + "epoch": 0.35704968575833035, + "grad_norm": 1.5583754084070844, + "learning_rate": 4.9540934001554636e-05, + "loss": 0.78, + "step": 3011 + }, + { + "epoch": 0.35716826752045533, + "grad_norm": 1.7814783610954232, + "learning_rate": 4.95404760036267e-05, + "loss": 0.8628, + "step": 3012 + }, + { + "epoch": 0.35728684928258037, + "grad_norm": 1.9135780625032741, + "learning_rate": 4.9540017779465674e-05, + "loss": 0.7793, + "step": 3013 + }, + { + "epoch": 0.35740543104470535, + "grad_norm": 2.089838912557425, + "learning_rate": 4.953955932907577e-05, + "loss": 1.039, + "step": 3014 + }, + { + "epoch": 0.3575240128068303, + "grad_norm": 1.5963057652398853, + "learning_rate": 4.953910065246123e-05, + "loss": 0.7339, + "step": 3015 + }, + { + "epoch": 0.3576425945689553, + "grad_norm": 1.6094023989165038, + "learning_rate": 4.953864174962627e-05, + "loss": 0.7641, + "step": 3016 + }, + { + "epoch": 0.3577611763310803, + "grad_norm": 1.3763792051623827, + "learning_rate": 4.953818262057512e-05, + "loss": 0.8104, + "step": 3017 + }, + { + "epoch": 0.35787975809320527, + "grad_norm": 1.534363063593355, + "learning_rate": 4.9537723265312025e-05, + "loss": 0.9068, + "step": 3018 + }, + { + "epoch": 0.35799833985533025, + "grad_norm": 1.3666112664273016, + "learning_rate": 4.9537263683841204e-05, + "loss": 0.7003, + "step": 3019 + }, + { + "epoch": 0.35811692161745523, + "grad_norm": 1.2125257799696063, + "learning_rate": 4.953680387616691e-05, + "loss": 0.4938, + "step": 3020 + }, + { + "epoch": 0.3582355033795802, + "grad_norm": 1.6479222974773404, + "learning_rate": 4.9536343842293364e-05, + "loss": 0.8523, + "step": 3021 + }, + { + "epoch": 0.3583540851417052, + "grad_norm": 1.4147977363952473, + "learning_rate": 4.953588358222482e-05, + "loss": 0.7956, + "step": 3022 + }, + { + "epoch": 0.3584726669038302, + "grad_norm": 1.43393137191777, + "learning_rate": 4.953542309596552e-05, + "loss": 0.8112, + "step": 3023 + }, + { + "epoch": 0.35859124866595515, + "grad_norm": 1.4328320955107607, + "learning_rate": 4.953496238351971e-05, + "loss": 0.6698, + "step": 3024 + }, + { + "epoch": 0.35870983042808013, + "grad_norm": 1.604865243270586, + "learning_rate": 4.953450144489162e-05, + "loss": 0.7421, + "step": 3025 + }, + { + "epoch": 0.35882841219020517, + "grad_norm": 1.5917373343691805, + "learning_rate": 4.9534040280085525e-05, + "loss": 0.6728, + "step": 3026 + }, + { + "epoch": 0.35894699395233015, + "grad_norm": 1.6733732284370273, + "learning_rate": 4.953357888910566e-05, + "loss": 0.7577, + "step": 3027 + }, + { + "epoch": 0.35906557571445513, + "grad_norm": 1.6558906471205739, + "learning_rate": 4.953311727195629e-05, + "loss": 0.8126, + "step": 3028 + }, + { + "epoch": 0.3591841574765801, + "grad_norm": 1.5738568788593073, + "learning_rate": 4.953265542864165e-05, + "loss": 0.7169, + "step": 3029 + }, + { + "epoch": 0.3593027392387051, + "grad_norm": 1.8758615822996598, + "learning_rate": 4.953219335916602e-05, + "loss": 0.8914, + "step": 3030 + }, + { + "epoch": 0.3594213210008301, + "grad_norm": 1.5471360054195962, + "learning_rate": 4.953173106353365e-05, + "loss": 0.5907, + "step": 3031 + }, + { + "epoch": 0.35953990276295505, + "grad_norm": 2.0368190995309, + "learning_rate": 4.953126854174879e-05, + "loss": 0.8821, + "step": 3032 + }, + { + "epoch": 0.35965848452508004, + "grad_norm": 2.2349378020224298, + "learning_rate": 4.953080579381573e-05, + "loss": 0.9962, + "step": 3033 + }, + { + "epoch": 0.359777066287205, + "grad_norm": 1.5120927156983903, + "learning_rate": 4.953034281973872e-05, + "loss": 0.7009, + "step": 3034 + }, + { + "epoch": 0.35989564804933, + "grad_norm": 1.6131889636097, + "learning_rate": 4.952987961952202e-05, + "loss": 0.657, + "step": 3035 + }, + { + "epoch": 0.360014229811455, + "grad_norm": 2.0291359075603954, + "learning_rate": 4.9529416193169934e-05, + "loss": 0.771, + "step": 3036 + }, + { + "epoch": 0.36013281157357996, + "grad_norm": 1.489274538431269, + "learning_rate": 4.952895254068669e-05, + "loss": 0.854, + "step": 3037 + }, + { + "epoch": 0.360251393335705, + "grad_norm": 1.5076793196768112, + "learning_rate": 4.952848866207659e-05, + "loss": 0.9301, + "step": 3038 + }, + { + "epoch": 0.36036997509783, + "grad_norm": 1.692543501144454, + "learning_rate": 4.95280245573439e-05, + "loss": 1.0165, + "step": 3039 + }, + { + "epoch": 0.36048855685995496, + "grad_norm": 1.5134494109263879, + "learning_rate": 4.9527560226492904e-05, + "loss": 0.8951, + "step": 3040 + }, + { + "epoch": 0.36060713862207994, + "grad_norm": 1.5210311465001456, + "learning_rate": 4.952709566952788e-05, + "loss": 1.0127, + "step": 3041 + }, + { + "epoch": 0.3607257203842049, + "grad_norm": 1.3956213050753752, + "learning_rate": 4.9526630886453105e-05, + "loss": 0.8691, + "step": 3042 + }, + { + "epoch": 0.3608443021463299, + "grad_norm": 1.442554976220813, + "learning_rate": 4.952616587727288e-05, + "loss": 0.771, + "step": 3043 + }, + { + "epoch": 0.3609628839084549, + "grad_norm": 1.461985484018066, + "learning_rate": 4.952570064199148e-05, + "loss": 0.787, + "step": 3044 + }, + { + "epoch": 0.36108146567057986, + "grad_norm": 1.3279848473548264, + "learning_rate": 4.9525235180613195e-05, + "loss": 0.7694, + "step": 3045 + }, + { + "epoch": 0.36120004743270484, + "grad_norm": 1.396806118629168, + "learning_rate": 4.952476949314231e-05, + "loss": 0.7692, + "step": 3046 + }, + { + "epoch": 0.3613186291948298, + "grad_norm": 1.5233668443311017, + "learning_rate": 4.952430357958313e-05, + "loss": 0.6486, + "step": 3047 + }, + { + "epoch": 0.3614372109569548, + "grad_norm": 1.8053216469751772, + "learning_rate": 4.9523837439939944e-05, + "loss": 0.9482, + "step": 3048 + }, + { + "epoch": 0.3615557927190798, + "grad_norm": 1.6896153413715158, + "learning_rate": 4.952337107421705e-05, + "loss": 0.7094, + "step": 3049 + }, + { + "epoch": 0.36167437448120476, + "grad_norm": 1.4259300385178255, + "learning_rate": 4.9522904482418756e-05, + "loss": 0.631, + "step": 3050 + }, + { + "epoch": 0.3617929562433298, + "grad_norm": 1.7758430335378435, + "learning_rate": 4.9522437664549335e-05, + "loss": 1.0085, + "step": 3051 + }, + { + "epoch": 0.3619115380054548, + "grad_norm": 1.8920695082568961, + "learning_rate": 4.952197062061312e-05, + "loss": 0.7856, + "step": 3052 + }, + { + "epoch": 0.36203011976757976, + "grad_norm": 1.366156133602365, + "learning_rate": 4.952150335061441e-05, + "loss": 0.6513, + "step": 3053 + }, + { + "epoch": 0.36214870152970474, + "grad_norm": 1.7236594256537405, + "learning_rate": 4.9521035854557505e-05, + "loss": 0.9887, + "step": 3054 + }, + { + "epoch": 0.3622672832918297, + "grad_norm": 1.730354589363647, + "learning_rate": 4.952056813244673e-05, + "loss": 0.8043, + "step": 3055 + }, + { + "epoch": 0.3623858650539547, + "grad_norm": 1.932169929444306, + "learning_rate": 4.952010018428637e-05, + "loss": 1.0445, + "step": 3056 + }, + { + "epoch": 0.3625044468160797, + "grad_norm": 1.6779594466665573, + "learning_rate": 4.951963201008076e-05, + "loss": 0.7471, + "step": 3057 + }, + { + "epoch": 0.36262302857820466, + "grad_norm": 1.482827695342862, + "learning_rate": 4.9519163609834215e-05, + "loss": 0.6568, + "step": 3058 + }, + { + "epoch": 0.36274161034032965, + "grad_norm": 1.584255872841578, + "learning_rate": 4.951869498355105e-05, + "loss": 0.9082, + "step": 3059 + }, + { + "epoch": 0.3628601921024546, + "grad_norm": 1.519188816814503, + "learning_rate": 4.951822613123558e-05, + "loss": 0.6978, + "step": 3060 + }, + { + "epoch": 0.3629787738645796, + "grad_norm": 1.5248519900422424, + "learning_rate": 4.951775705289213e-05, + "loss": 0.7838, + "step": 3061 + }, + { + "epoch": 0.3630973556267046, + "grad_norm": 1.542970212077856, + "learning_rate": 4.951728774852503e-05, + "loss": 0.9152, + "step": 3062 + }, + { + "epoch": 0.3632159373888296, + "grad_norm": 1.8395570880387342, + "learning_rate": 4.951681821813859e-05, + "loss": 0.9609, + "step": 3063 + }, + { + "epoch": 0.3633345191509546, + "grad_norm": 1.4848233052778923, + "learning_rate": 4.9516348461737165e-05, + "loss": 0.987, + "step": 3064 + }, + { + "epoch": 0.3634531009130796, + "grad_norm": 1.504856122057004, + "learning_rate": 4.951587847932507e-05, + "loss": 0.7416, + "step": 3065 + }, + { + "epoch": 0.36357168267520457, + "grad_norm": 1.4910796893326475, + "learning_rate": 4.9515408270906634e-05, + "loss": 0.8023, + "step": 3066 + }, + { + "epoch": 0.36369026443732955, + "grad_norm": 1.383019189698515, + "learning_rate": 4.95149378364862e-05, + "loss": 0.9304, + "step": 3067 + }, + { + "epoch": 0.3638088461994545, + "grad_norm": 1.3726024665149232, + "learning_rate": 4.9514467176068095e-05, + "loss": 0.6963, + "step": 3068 + }, + { + "epoch": 0.3639274279615795, + "grad_norm": 1.3781340571948948, + "learning_rate": 4.9513996289656675e-05, + "loss": 0.7768, + "step": 3069 + }, + { + "epoch": 0.3640460097237045, + "grad_norm": 1.2081512756433084, + "learning_rate": 4.951352517725626e-05, + "loss": 0.6691, + "step": 3070 + }, + { + "epoch": 0.36416459148582947, + "grad_norm": 1.6018988435288883, + "learning_rate": 4.9513053838871206e-05, + "loss": 0.8019, + "step": 3071 + }, + { + "epoch": 0.36428317324795445, + "grad_norm": 1.5986818618229484, + "learning_rate": 4.9512582274505856e-05, + "loss": 0.9968, + "step": 3072 + }, + { + "epoch": 0.36440175501007943, + "grad_norm": 1.3852726037248357, + "learning_rate": 4.951211048416455e-05, + "loss": 0.6909, + "step": 3073 + }, + { + "epoch": 0.3645203367722044, + "grad_norm": 1.3987378329116338, + "learning_rate": 4.9511638467851655e-05, + "loss": 0.5926, + "step": 3074 + }, + { + "epoch": 0.3646389185343294, + "grad_norm": 1.3433850029677807, + "learning_rate": 4.951116622557151e-05, + "loss": 0.7893, + "step": 3075 + }, + { + "epoch": 0.36475750029645443, + "grad_norm": 1.3974852777274687, + "learning_rate": 4.9510693757328466e-05, + "loss": 0.659, + "step": 3076 + }, + { + "epoch": 0.3648760820585794, + "grad_norm": 1.6547634926704458, + "learning_rate": 4.951022106312688e-05, + "loss": 0.7087, + "step": 3077 + }, + { + "epoch": 0.3649946638207044, + "grad_norm": 1.6940226680935768, + "learning_rate": 4.950974814297112e-05, + "loss": 0.7476, + "step": 3078 + }, + { + "epoch": 0.36511324558282937, + "grad_norm": 1.3826705109332402, + "learning_rate": 4.950927499686553e-05, + "loss": 0.7631, + "step": 3079 + }, + { + "epoch": 0.36523182734495435, + "grad_norm": 1.53213496447221, + "learning_rate": 4.950880162481449e-05, + "loss": 0.7835, + "step": 3080 + }, + { + "epoch": 0.36535040910707933, + "grad_norm": 1.446269617506082, + "learning_rate": 4.950832802682235e-05, + "loss": 0.7253, + "step": 3081 + }, + { + "epoch": 0.3654689908692043, + "grad_norm": 1.5452729747288778, + "learning_rate": 4.9507854202893476e-05, + "loss": 0.6578, + "step": 3082 + }, + { + "epoch": 0.3655875726313293, + "grad_norm": 1.6925066698535058, + "learning_rate": 4.9507380153032235e-05, + "loss": 0.9766, + "step": 3083 + }, + { + "epoch": 0.3657061543934543, + "grad_norm": 1.5861017323242954, + "learning_rate": 4.950690587724301e-05, + "loss": 0.7515, + "step": 3084 + }, + { + "epoch": 0.36582473615557926, + "grad_norm": 1.5421785132360604, + "learning_rate": 4.950643137553017e-05, + "loss": 0.7139, + "step": 3085 + }, + { + "epoch": 0.36594331791770424, + "grad_norm": 1.797656007354017, + "learning_rate": 4.950595664789807e-05, + "loss": 1.0194, + "step": 3086 + }, + { + "epoch": 0.3660618996798292, + "grad_norm": 1.6612192330026478, + "learning_rate": 4.950548169435111e-05, + "loss": 0.7712, + "step": 3087 + }, + { + "epoch": 0.36618048144195425, + "grad_norm": 1.408446272288491, + "learning_rate": 4.950500651489366e-05, + "loss": 0.7585, + "step": 3088 + }, + { + "epoch": 0.36629906320407923, + "grad_norm": 1.4142255940348891, + "learning_rate": 4.950453110953009e-05, + "loss": 0.7657, + "step": 3089 + }, + { + "epoch": 0.3664176449662042, + "grad_norm": 1.8816504784867716, + "learning_rate": 4.950405547826481e-05, + "loss": 0.935, + "step": 3090 + }, + { + "epoch": 0.3665362267283292, + "grad_norm": 1.4370066479372023, + "learning_rate": 4.9503579621102176e-05, + "loss": 0.6169, + "step": 3091 + }, + { + "epoch": 0.3666548084904542, + "grad_norm": 1.470547888839295, + "learning_rate": 4.950310353804659e-05, + "loss": 0.879, + "step": 3092 + }, + { + "epoch": 0.36677339025257916, + "grad_norm": 1.7029642618683662, + "learning_rate": 4.950262722910243e-05, + "loss": 0.8014, + "step": 3093 + }, + { + "epoch": 0.36689197201470414, + "grad_norm": 1.3426190199931218, + "learning_rate": 4.9502150694274104e-05, + "loss": 0.8665, + "step": 3094 + }, + { + "epoch": 0.3670105537768291, + "grad_norm": 1.6361554051165519, + "learning_rate": 4.950167393356599e-05, + "loss": 0.857, + "step": 3095 + }, + { + "epoch": 0.3671291355389541, + "grad_norm": 1.4564511792028059, + "learning_rate": 4.9501196946982485e-05, + "loss": 0.7492, + "step": 3096 + }, + { + "epoch": 0.3672477173010791, + "grad_norm": 1.7369176612102415, + "learning_rate": 4.9500719734527995e-05, + "loss": 0.868, + "step": 3097 + }, + { + "epoch": 0.36736629906320406, + "grad_norm": 1.5893430490275953, + "learning_rate": 4.950024229620691e-05, + "loss": 0.895, + "step": 3098 + }, + { + "epoch": 0.36748488082532904, + "grad_norm": 1.7438388426276605, + "learning_rate": 4.949976463202364e-05, + "loss": 0.9188, + "step": 3099 + }, + { + "epoch": 0.367603462587454, + "grad_norm": 1.8046485382079513, + "learning_rate": 4.9499286741982583e-05, + "loss": 0.9495, + "step": 3100 + }, + { + "epoch": 0.36772204434957906, + "grad_norm": 1.5677319628989352, + "learning_rate": 4.949880862608814e-05, + "loss": 0.9387, + "step": 3101 + }, + { + "epoch": 0.36784062611170404, + "grad_norm": 1.646655819356231, + "learning_rate": 4.949833028434473e-05, + "loss": 0.8451, + "step": 3102 + }, + { + "epoch": 0.367959207873829, + "grad_norm": 1.7950707400402195, + "learning_rate": 4.949785171675675e-05, + "loss": 0.8476, + "step": 3103 + }, + { + "epoch": 0.368077789635954, + "grad_norm": 1.3742436774358162, + "learning_rate": 4.949737292332862e-05, + "loss": 0.7807, + "step": 3104 + }, + { + "epoch": 0.368196371398079, + "grad_norm": 1.7144668741626803, + "learning_rate": 4.949689390406476e-05, + "loss": 0.9348, + "step": 3105 + }, + { + "epoch": 0.36831495316020396, + "grad_norm": 1.7201192604467652, + "learning_rate": 4.949641465896957e-05, + "loss": 0.9535, + "step": 3106 + }, + { + "epoch": 0.36843353492232894, + "grad_norm": 1.5269513895875113, + "learning_rate": 4.9495935188047485e-05, + "loss": 0.7667, + "step": 3107 + }, + { + "epoch": 0.3685521166844539, + "grad_norm": 1.4375063723949297, + "learning_rate": 4.94954554913029e-05, + "loss": 0.827, + "step": 3108 + }, + { + "epoch": 0.3686706984465789, + "grad_norm": 1.4010376341069477, + "learning_rate": 4.9494975568740264e-05, + "loss": 0.7737, + "step": 3109 + }, + { + "epoch": 0.3687892802087039, + "grad_norm": 1.506937707201339, + "learning_rate": 4.9494495420364e-05, + "loss": 0.9038, + "step": 3110 + }, + { + "epoch": 0.36890786197082887, + "grad_norm": 1.4371368168437029, + "learning_rate": 4.949401504617851e-05, + "loss": 0.8653, + "step": 3111 + }, + { + "epoch": 0.36902644373295385, + "grad_norm": 1.2752521208307115, + "learning_rate": 4.949353444618825e-05, + "loss": 0.748, + "step": 3112 + }, + { + "epoch": 0.3691450254950789, + "grad_norm": 1.4067886922378956, + "learning_rate": 4.949305362039763e-05, + "loss": 0.8786, + "step": 3113 + }, + { + "epoch": 0.36926360725720386, + "grad_norm": 1.5498589541877545, + "learning_rate": 4.949257256881109e-05, + "loss": 0.9742, + "step": 3114 + }, + { + "epoch": 0.36938218901932884, + "grad_norm": 1.377755709501552, + "learning_rate": 4.949209129143307e-05, + "loss": 0.7635, + "step": 3115 + }, + { + "epoch": 0.3695007707814538, + "grad_norm": 1.426476550326406, + "learning_rate": 4.9491609788268e-05, + "loss": 0.7045, + "step": 3116 + }, + { + "epoch": 0.3696193525435788, + "grad_norm": 1.518607682738011, + "learning_rate": 4.949112805932033e-05, + "loss": 0.6794, + "step": 3117 + }, + { + "epoch": 0.3697379343057038, + "grad_norm": 1.4301568253555041, + "learning_rate": 4.9490646104594484e-05, + "loss": 0.6418, + "step": 3118 + }, + { + "epoch": 0.36985651606782877, + "grad_norm": 1.504160682879259, + "learning_rate": 4.9490163924094915e-05, + "loss": 0.8151, + "step": 3119 + }, + { + "epoch": 0.36997509782995375, + "grad_norm": 1.4592808378482456, + "learning_rate": 4.948968151782607e-05, + "loss": 0.7903, + "step": 3120 + }, + { + "epoch": 0.37009367959207873, + "grad_norm": 1.6521590023815609, + "learning_rate": 4.9489198885792384e-05, + "loss": 1.0075, + "step": 3121 + }, + { + "epoch": 0.3702122613542037, + "grad_norm": 1.9312519868948987, + "learning_rate": 4.948871602799832e-05, + "loss": 1.0681, + "step": 3122 + }, + { + "epoch": 0.3703308431163287, + "grad_norm": 1.738615563657647, + "learning_rate": 4.948823294444832e-05, + "loss": 0.9093, + "step": 3123 + }, + { + "epoch": 0.37044942487845367, + "grad_norm": 1.8128934960529652, + "learning_rate": 4.9487749635146854e-05, + "loss": 0.9639, + "step": 3124 + }, + { + "epoch": 0.3705680066405787, + "grad_norm": 1.3787411891821522, + "learning_rate": 4.948726610009835e-05, + "loss": 0.4826, + "step": 3125 + }, + { + "epoch": 0.3706865884027037, + "grad_norm": 1.779818670080989, + "learning_rate": 4.948678233930729e-05, + "loss": 1.0307, + "step": 3126 + }, + { + "epoch": 0.37080517016482867, + "grad_norm": 1.2947148610204073, + "learning_rate": 4.948629835277812e-05, + "loss": 0.7339, + "step": 3127 + }, + { + "epoch": 0.37092375192695365, + "grad_norm": 1.4135005245490013, + "learning_rate": 4.948581414051531e-05, + "loss": 0.6742, + "step": 3128 + }, + { + "epoch": 0.37104233368907863, + "grad_norm": 1.6987002172737578, + "learning_rate": 4.948532970252332e-05, + "loss": 0.917, + "step": 3129 + }, + { + "epoch": 0.3711609154512036, + "grad_norm": 1.4449456552337652, + "learning_rate": 4.948484503880662e-05, + "loss": 0.7827, + "step": 3130 + }, + { + "epoch": 0.3712794972133286, + "grad_norm": 1.4233277765094676, + "learning_rate": 4.948436014936966e-05, + "loss": 0.7593, + "step": 3131 + }, + { + "epoch": 0.37139807897545357, + "grad_norm": 1.4465663303522114, + "learning_rate": 4.948387503421693e-05, + "loss": 1.0046, + "step": 3132 + }, + { + "epoch": 0.37151666073757855, + "grad_norm": 1.4623876244790341, + "learning_rate": 4.94833896933529e-05, + "loss": 1.0284, + "step": 3133 + }, + { + "epoch": 0.37163524249970353, + "grad_norm": 1.4300904266193268, + "learning_rate": 4.948290412678204e-05, + "loss": 0.6498, + "step": 3134 + }, + { + "epoch": 0.3717538242618285, + "grad_norm": 1.4717989807132263, + "learning_rate": 4.948241833450883e-05, + "loss": 0.8905, + "step": 3135 + }, + { + "epoch": 0.3718724060239535, + "grad_norm": 1.3262491376951644, + "learning_rate": 4.9481932316537736e-05, + "loss": 0.5522, + "step": 3136 + }, + { + "epoch": 0.3719909877860785, + "grad_norm": 1.4481642258155922, + "learning_rate": 4.948144607287326e-05, + "loss": 0.9073, + "step": 3137 + }, + { + "epoch": 0.3721095695482035, + "grad_norm": 1.4563062956790982, + "learning_rate": 4.948095960351987e-05, + "loss": 0.8969, + "step": 3138 + }, + { + "epoch": 0.3722281513103285, + "grad_norm": 1.520419960996801, + "learning_rate": 4.948047290848204e-05, + "loss": 0.8179, + "step": 3139 + }, + { + "epoch": 0.3723467330724535, + "grad_norm": 1.3450777803673666, + "learning_rate": 4.947998598776428e-05, + "loss": 0.8044, + "step": 3140 + }, + { + "epoch": 0.37246531483457845, + "grad_norm": 1.5606968122080485, + "learning_rate": 4.947949884137107e-05, + "loss": 0.6789, + "step": 3141 + }, + { + "epoch": 0.37258389659670343, + "grad_norm": 1.4062469307833254, + "learning_rate": 4.9479011469306894e-05, + "loss": 0.8931, + "step": 3142 + }, + { + "epoch": 0.3727024783588284, + "grad_norm": 1.3980507100251707, + "learning_rate": 4.9478523871576256e-05, + "loss": 0.5461, + "step": 3143 + }, + { + "epoch": 0.3728210601209534, + "grad_norm": 1.6939582675399725, + "learning_rate": 4.947803604818364e-05, + "loss": 0.7129, + "step": 3144 + }, + { + "epoch": 0.3729396418830784, + "grad_norm": 1.4783008286089696, + "learning_rate": 4.947754799913355e-05, + "loss": 0.761, + "step": 3145 + }, + { + "epoch": 0.37305822364520336, + "grad_norm": 1.5948483024844975, + "learning_rate": 4.947705972443049e-05, + "loss": 0.7131, + "step": 3146 + }, + { + "epoch": 0.37317680540732834, + "grad_norm": 2.222387833273549, + "learning_rate": 4.947657122407895e-05, + "loss": 0.8595, + "step": 3147 + }, + { + "epoch": 0.3732953871694533, + "grad_norm": 1.876941547816875, + "learning_rate": 4.9476082498083436e-05, + "loss": 0.9518, + "step": 3148 + }, + { + "epoch": 0.3734139689315783, + "grad_norm": 1.5063571787320436, + "learning_rate": 4.9475593546448464e-05, + "loss": 0.7863, + "step": 3149 + }, + { + "epoch": 0.37353255069370334, + "grad_norm": 1.5768965514697513, + "learning_rate": 4.9475104369178525e-05, + "loss": 0.6851, + "step": 3150 + }, + { + "epoch": 0.3736511324558283, + "grad_norm": 1.4691952437462241, + "learning_rate": 4.947461496627814e-05, + "loss": 0.5351, + "step": 3151 + }, + { + "epoch": 0.3737697142179533, + "grad_norm": 1.8114812930950623, + "learning_rate": 4.9474125337751816e-05, + "loss": 0.734, + "step": 3152 + }, + { + "epoch": 0.3738882959800783, + "grad_norm": 1.4240446503931021, + "learning_rate": 4.947363548360407e-05, + "loss": 0.6649, + "step": 3153 + }, + { + "epoch": 0.37400687774220326, + "grad_norm": 1.6531309937676175, + "learning_rate": 4.947314540383942e-05, + "loss": 0.7416, + "step": 3154 + }, + { + "epoch": 0.37412545950432824, + "grad_norm": 1.4708664664760922, + "learning_rate": 4.947265509846237e-05, + "loss": 0.7357, + "step": 3155 + }, + { + "epoch": 0.3742440412664532, + "grad_norm": 1.8048843383732813, + "learning_rate": 4.947216456747747e-05, + "loss": 1.0099, + "step": 3156 + }, + { + "epoch": 0.3743626230285782, + "grad_norm": 1.5648079189735453, + "learning_rate": 4.94716738108892e-05, + "loss": 0.6917, + "step": 3157 + }, + { + "epoch": 0.3744812047907032, + "grad_norm": 1.6302908568409067, + "learning_rate": 4.9471182828702117e-05, + "loss": 0.8869, + "step": 3158 + }, + { + "epoch": 0.37459978655282816, + "grad_norm": 1.7228509019924438, + "learning_rate": 4.947069162092074e-05, + "loss": 0.6653, + "step": 3159 + }, + { + "epoch": 0.37471836831495314, + "grad_norm": 1.4773590169339796, + "learning_rate": 4.947020018754959e-05, + "loss": 0.8771, + "step": 3160 + }, + { + "epoch": 0.3748369500770781, + "grad_norm": 1.7114689799955096, + "learning_rate": 4.9469708528593195e-05, + "loss": 0.7831, + "step": 3161 + }, + { + "epoch": 0.3749555318392031, + "grad_norm": 1.7996657150148339, + "learning_rate": 4.9469216644056105e-05, + "loss": 0.7412, + "step": 3162 + }, + { + "epoch": 0.37507411360132814, + "grad_norm": 1.6191683638540606, + "learning_rate": 4.946872453394284e-05, + "loss": 0.7563, + "step": 3163 + }, + { + "epoch": 0.3751926953634531, + "grad_norm": 1.3636154314902038, + "learning_rate": 4.946823219825794e-05, + "loss": 0.817, + "step": 3164 + }, + { + "epoch": 0.3753112771255781, + "grad_norm": 1.5712974505498554, + "learning_rate": 4.946773963700594e-05, + "loss": 0.5072, + "step": 3165 + }, + { + "epoch": 0.3754298588877031, + "grad_norm": 1.7789265812221249, + "learning_rate": 4.9467246850191396e-05, + "loss": 0.8249, + "step": 3166 + }, + { + "epoch": 0.37554844064982806, + "grad_norm": 1.7366284351646062, + "learning_rate": 4.946675383781883e-05, + "loss": 0.8146, + "step": 3167 + }, + { + "epoch": 0.37566702241195304, + "grad_norm": 1.6318880366672075, + "learning_rate": 4.94662605998928e-05, + "loss": 0.733, + "step": 3168 + }, + { + "epoch": 0.375785604174078, + "grad_norm": 1.715867773946733, + "learning_rate": 4.9465767136417854e-05, + "loss": 0.9381, + "step": 3169 + }, + { + "epoch": 0.375904185936203, + "grad_norm": 1.470873895639843, + "learning_rate": 4.946527344739852e-05, + "loss": 0.745, + "step": 3170 + }, + { + "epoch": 0.376022767698328, + "grad_norm": 1.376725209381632, + "learning_rate": 4.946477953283938e-05, + "loss": 0.6302, + "step": 3171 + }, + { + "epoch": 0.37614134946045297, + "grad_norm": 1.3574409045506295, + "learning_rate": 4.946428539274497e-05, + "loss": 0.6984, + "step": 3172 + }, + { + "epoch": 0.37625993122257795, + "grad_norm": 1.6063654419338014, + "learning_rate": 4.9463791027119855e-05, + "loss": 0.8708, + "step": 3173 + }, + { + "epoch": 0.37637851298470293, + "grad_norm": 1.675272102404614, + "learning_rate": 4.946329643596859e-05, + "loss": 0.9243, + "step": 3174 + }, + { + "epoch": 0.37649709474682796, + "grad_norm": 1.300527882520946, + "learning_rate": 4.946280161929572e-05, + "loss": 0.5008, + "step": 3175 + }, + { + "epoch": 0.37661567650895295, + "grad_norm": 1.4619334510515205, + "learning_rate": 4.946230657710581e-05, + "loss": 0.9637, + "step": 3176 + }, + { + "epoch": 0.3767342582710779, + "grad_norm": 1.50147265079686, + "learning_rate": 4.946181130940345e-05, + "loss": 0.7774, + "step": 3177 + }, + { + "epoch": 0.3768528400332029, + "grad_norm": 1.5529070294363772, + "learning_rate": 4.946131581619318e-05, + "loss": 0.7223, + "step": 3178 + }, + { + "epoch": 0.3769714217953279, + "grad_norm": 1.537729678627995, + "learning_rate": 4.946082009747957e-05, + "loss": 0.9531, + "step": 3179 + }, + { + "epoch": 0.37709000355745287, + "grad_norm": 1.3073230151784905, + "learning_rate": 4.946032415326719e-05, + "loss": 0.596, + "step": 3180 + }, + { + "epoch": 0.37720858531957785, + "grad_norm": 1.574068515762463, + "learning_rate": 4.945982798356062e-05, + "loss": 0.7335, + "step": 3181 + }, + { + "epoch": 0.37732716708170283, + "grad_norm": 1.5812123527383504, + "learning_rate": 4.945933158836444e-05, + "loss": 0.6088, + "step": 3182 + }, + { + "epoch": 0.3774457488438278, + "grad_norm": 1.7007120737005263, + "learning_rate": 4.945883496768321e-05, + "loss": 1.0282, + "step": 3183 + }, + { + "epoch": 0.3775643306059528, + "grad_norm": 1.5701105661628214, + "learning_rate": 4.945833812152152e-05, + "loss": 0.7584, + "step": 3184 + }, + { + "epoch": 0.37768291236807777, + "grad_norm": 1.7147346433184019, + "learning_rate": 4.945784104988394e-05, + "loss": 0.8626, + "step": 3185 + }, + { + "epoch": 0.37780149413020275, + "grad_norm": 1.5344752122503422, + "learning_rate": 4.9457343752775056e-05, + "loss": 0.7486, + "step": 3186 + }, + { + "epoch": 0.37792007589232773, + "grad_norm": 1.5028270917437605, + "learning_rate": 4.945684623019946e-05, + "loss": 0.6737, + "step": 3187 + }, + { + "epoch": 0.37803865765445277, + "grad_norm": 1.6534090106340509, + "learning_rate": 4.945634848216173e-05, + "loss": 0.7625, + "step": 3188 + }, + { + "epoch": 0.37815723941657775, + "grad_norm": 1.6028359755042079, + "learning_rate": 4.945585050866646e-05, + "loss": 0.5606, + "step": 3189 + }, + { + "epoch": 0.37827582117870273, + "grad_norm": 1.6952371649735012, + "learning_rate": 4.945535230971823e-05, + "loss": 0.9395, + "step": 3190 + }, + { + "epoch": 0.3783944029408277, + "grad_norm": 1.7421054560489009, + "learning_rate": 4.9454853885321646e-05, + "loss": 0.8346, + "step": 3191 + }, + { + "epoch": 0.3785129847029527, + "grad_norm": 1.5658393562438548, + "learning_rate": 4.94543552354813e-05, + "loss": 0.7322, + "step": 3192 + }, + { + "epoch": 0.3786315664650777, + "grad_norm": 1.4965581241651604, + "learning_rate": 4.945385636020178e-05, + "loss": 0.7484, + "step": 3193 + }, + { + "epoch": 0.37875014822720265, + "grad_norm": 1.6106757061715484, + "learning_rate": 4.9453357259487695e-05, + "loss": 0.9881, + "step": 3194 + }, + { + "epoch": 0.37886872998932764, + "grad_norm": 1.9682943419689864, + "learning_rate": 4.9452857933343644e-05, + "loss": 0.9061, + "step": 3195 + }, + { + "epoch": 0.3789873117514526, + "grad_norm": 1.7297291745174646, + "learning_rate": 4.9452358381774235e-05, + "loss": 0.4589, + "step": 3196 + }, + { + "epoch": 0.3791058935135776, + "grad_norm": 1.5167427422582236, + "learning_rate": 4.945185860478405e-05, + "loss": 0.6759, + "step": 3197 + }, + { + "epoch": 0.3792244752757026, + "grad_norm": 2.107242678501436, + "learning_rate": 4.945135860237773e-05, + "loss": 1.088, + "step": 3198 + }, + { + "epoch": 0.37934305703782756, + "grad_norm": 1.7420784052269787, + "learning_rate": 4.945085837455986e-05, + "loss": 0.8665, + "step": 3199 + }, + { + "epoch": 0.3794616387999526, + "grad_norm": 1.6553981378431084, + "learning_rate": 4.945035792133507e-05, + "loss": 0.8491, + "step": 3200 + }, + { + "epoch": 0.3795802205620776, + "grad_norm": 1.3770407071189947, + "learning_rate": 4.9449857242707945e-05, + "loss": 0.6946, + "step": 3201 + }, + { + "epoch": 0.37969880232420256, + "grad_norm": 1.546841518273277, + "learning_rate": 4.944935633868313e-05, + "loss": 0.8674, + "step": 3202 + }, + { + "epoch": 0.37981738408632754, + "grad_norm": 1.5558363194793887, + "learning_rate": 4.944885520926523e-05, + "loss": 0.736, + "step": 3203 + }, + { + "epoch": 0.3799359658484525, + "grad_norm": 1.2731785550420782, + "learning_rate": 4.944835385445886e-05, + "loss": 0.5703, + "step": 3204 + }, + { + "epoch": 0.3800545476105775, + "grad_norm": 1.4275409889352497, + "learning_rate": 4.944785227426866e-05, + "loss": 0.7343, + "step": 3205 + }, + { + "epoch": 0.3801731293727025, + "grad_norm": 1.628861859844902, + "learning_rate": 4.944735046869924e-05, + "loss": 0.5253, + "step": 3206 + }, + { + "epoch": 0.38029171113482746, + "grad_norm": 1.4770288410451435, + "learning_rate": 4.944684843775522e-05, + "loss": 0.8356, + "step": 3207 + }, + { + "epoch": 0.38041029289695244, + "grad_norm": 1.4984531193115744, + "learning_rate": 4.944634618144124e-05, + "loss": 0.7241, + "step": 3208 + }, + { + "epoch": 0.3805288746590774, + "grad_norm": 1.7930841598970384, + "learning_rate": 4.944584369976192e-05, + "loss": 0.8751, + "step": 3209 + }, + { + "epoch": 0.3806474564212024, + "grad_norm": 1.5391882347829557, + "learning_rate": 4.944534099272191e-05, + "loss": 0.6944, + "step": 3210 + }, + { + "epoch": 0.3807660381833274, + "grad_norm": 1.4620488319081386, + "learning_rate": 4.9444838060325824e-05, + "loss": 0.5565, + "step": 3211 + }, + { + "epoch": 0.38088461994545236, + "grad_norm": 1.7795897274277042, + "learning_rate": 4.9444334902578315e-05, + "loss": 0.6879, + "step": 3212 + }, + { + "epoch": 0.3810032017075774, + "grad_norm": 2.0318057224496577, + "learning_rate": 4.9443831519484006e-05, + "loss": 1.0165, + "step": 3213 + }, + { + "epoch": 0.3811217834697024, + "grad_norm": 1.9500451352331196, + "learning_rate": 4.944332791104755e-05, + "loss": 0.8578, + "step": 3214 + }, + { + "epoch": 0.38124036523182736, + "grad_norm": 1.6877660051036747, + "learning_rate": 4.944282407727359e-05, + "loss": 0.7278, + "step": 3215 + }, + { + "epoch": 0.38135894699395234, + "grad_norm": 1.4272935823812445, + "learning_rate": 4.944232001816676e-05, + "loss": 0.6532, + "step": 3216 + }, + { + "epoch": 0.3814775287560773, + "grad_norm": 1.8710862340985495, + "learning_rate": 4.944181573373171e-05, + "loss": 1.0306, + "step": 3217 + }, + { + "epoch": 0.3815961105182023, + "grad_norm": 1.4347736502535138, + "learning_rate": 4.944131122397309e-05, + "loss": 0.6131, + "step": 3218 + }, + { + "epoch": 0.3817146922803273, + "grad_norm": 1.6325334188961806, + "learning_rate": 4.944080648889556e-05, + "loss": 0.8383, + "step": 3219 + }, + { + "epoch": 0.38183327404245226, + "grad_norm": 1.3338105719880446, + "learning_rate": 4.9440301528503766e-05, + "loss": 0.6039, + "step": 3220 + }, + { + "epoch": 0.38195185580457724, + "grad_norm": 1.4045480050218806, + "learning_rate": 4.9439796342802355e-05, + "loss": 0.64, + "step": 3221 + }, + { + "epoch": 0.3820704375667022, + "grad_norm": 1.4466372993883514, + "learning_rate": 4.943929093179599e-05, + "loss": 0.7163, + "step": 3222 + }, + { + "epoch": 0.3821890193288272, + "grad_norm": 1.5500648734178097, + "learning_rate": 4.9438785295489345e-05, + "loss": 0.734, + "step": 3223 + }, + { + "epoch": 0.3823076010909522, + "grad_norm": 1.570605331982657, + "learning_rate": 4.943827943388706e-05, + "loss": 0.7607, + "step": 3224 + }, + { + "epoch": 0.3824261828530772, + "grad_norm": 1.3646505079483133, + "learning_rate": 4.94377733469938e-05, + "loss": 0.8536, + "step": 3225 + }, + { + "epoch": 0.3825447646152022, + "grad_norm": 1.4424544203799678, + "learning_rate": 4.943726703481425e-05, + "loss": 0.6229, + "step": 3226 + }, + { + "epoch": 0.3826633463773272, + "grad_norm": 1.7042540177205865, + "learning_rate": 4.943676049735306e-05, + "loss": 0.764, + "step": 3227 + }, + { + "epoch": 0.38278192813945217, + "grad_norm": 1.723461587132966, + "learning_rate": 4.94362537346149e-05, + "loss": 0.8225, + "step": 3228 + }, + { + "epoch": 0.38290050990157715, + "grad_norm": 1.7172024465548814, + "learning_rate": 4.943574674660445e-05, + "loss": 0.854, + "step": 3229 + }, + { + "epoch": 0.3830190916637021, + "grad_norm": 1.3082006367327952, + "learning_rate": 4.9435239533326385e-05, + "loss": 0.6813, + "step": 3230 + }, + { + "epoch": 0.3831376734258271, + "grad_norm": 1.6411378008009438, + "learning_rate": 4.943473209478537e-05, + "loss": 0.797, + "step": 3231 + }, + { + "epoch": 0.3832562551879521, + "grad_norm": 1.8692058454766995, + "learning_rate": 4.9434224430986085e-05, + "loss": 0.6085, + "step": 3232 + }, + { + "epoch": 0.38337483695007707, + "grad_norm": 1.734893282235405, + "learning_rate": 4.943371654193322e-05, + "loss": 0.7662, + "step": 3233 + }, + { + "epoch": 0.38349341871220205, + "grad_norm": 1.4542707627433027, + "learning_rate": 4.943320842763145e-05, + "loss": 0.6431, + "step": 3234 + }, + { + "epoch": 0.38361200047432703, + "grad_norm": 1.5525541947145052, + "learning_rate": 4.943270008808546e-05, + "loss": 0.5759, + "step": 3235 + }, + { + "epoch": 0.383730582236452, + "grad_norm": 1.7276720241415515, + "learning_rate": 4.943219152329994e-05, + "loss": 0.7173, + "step": 3236 + }, + { + "epoch": 0.383849163998577, + "grad_norm": 1.6572381957177205, + "learning_rate": 4.9431682733279574e-05, + "loss": 0.7443, + "step": 3237 + }, + { + "epoch": 0.38396774576070203, + "grad_norm": 1.8280189074308468, + "learning_rate": 4.943117371802906e-05, + "loss": 1.0962, + "step": 3238 + }, + { + "epoch": 0.384086327522827, + "grad_norm": 1.7206032164545222, + "learning_rate": 4.9430664477553065e-05, + "loss": 0.8564, + "step": 3239 + }, + { + "epoch": 0.384204909284952, + "grad_norm": 1.6109114175140071, + "learning_rate": 4.943015501185632e-05, + "loss": 0.689, + "step": 3240 + }, + { + "epoch": 0.38432349104707697, + "grad_norm": 1.560493881262144, + "learning_rate": 4.942964532094349e-05, + "loss": 0.7849, + "step": 3241 + }, + { + "epoch": 0.38444207280920195, + "grad_norm": 1.6714374623450774, + "learning_rate": 4.942913540481929e-05, + "loss": 0.8819, + "step": 3242 + }, + { + "epoch": 0.38456065457132693, + "grad_norm": 1.3746458332764817, + "learning_rate": 4.942862526348843e-05, + "loss": 0.8442, + "step": 3243 + }, + { + "epoch": 0.3846792363334519, + "grad_norm": 1.5135044505373825, + "learning_rate": 4.942811489695559e-05, + "loss": 0.5896, + "step": 3244 + }, + { + "epoch": 0.3847978180955769, + "grad_norm": 1.5176651260072362, + "learning_rate": 4.9427604305225495e-05, + "loss": 0.9982, + "step": 3245 + }, + { + "epoch": 0.3849163998577019, + "grad_norm": 1.6595143628642668, + "learning_rate": 4.942709348830284e-05, + "loss": 0.8944, + "step": 3246 + }, + { + "epoch": 0.38503498161982685, + "grad_norm": 1.6771260511618922, + "learning_rate": 4.9426582446192335e-05, + "loss": 0.9621, + "step": 3247 + }, + { + "epoch": 0.38515356338195184, + "grad_norm": 1.3944462078929023, + "learning_rate": 4.942607117889869e-05, + "loss": 0.7567, + "step": 3248 + }, + { + "epoch": 0.3852721451440768, + "grad_norm": 1.6254229427188742, + "learning_rate": 4.9425559686426635e-05, + "loss": 0.9746, + "step": 3249 + }, + { + "epoch": 0.38539072690620185, + "grad_norm": 1.662723354770451, + "learning_rate": 4.9425047968780866e-05, + "loss": 0.9029, + "step": 3250 + }, + { + "epoch": 0.38550930866832683, + "grad_norm": 1.4871712832674107, + "learning_rate": 4.9424536025966106e-05, + "loss": 0.7813, + "step": 3251 + }, + { + "epoch": 0.3856278904304518, + "grad_norm": 1.6183753481259633, + "learning_rate": 4.9424023857987065e-05, + "loss": 0.7846, + "step": 3252 + }, + { + "epoch": 0.3857464721925768, + "grad_norm": 1.7266362277947056, + "learning_rate": 4.942351146484849e-05, + "loss": 0.85, + "step": 3253 + }, + { + "epoch": 0.3858650539547018, + "grad_norm": 1.429297893713081, + "learning_rate": 4.9422998846555084e-05, + "loss": 0.8742, + "step": 3254 + }, + { + "epoch": 0.38598363571682676, + "grad_norm": 1.5598735043186809, + "learning_rate": 4.942248600311158e-05, + "loss": 0.6862, + "step": 3255 + }, + { + "epoch": 0.38610221747895174, + "grad_norm": 1.3599248605267584, + "learning_rate": 4.9421972934522695e-05, + "loss": 0.5351, + "step": 3256 + }, + { + "epoch": 0.3862207992410767, + "grad_norm": 1.3516309865025078, + "learning_rate": 4.942145964079318e-05, + "loss": 0.8849, + "step": 3257 + }, + { + "epoch": 0.3863393810032017, + "grad_norm": 1.6872664530749308, + "learning_rate": 4.942094612192775e-05, + "loss": 0.7987, + "step": 3258 + }, + { + "epoch": 0.3864579627653267, + "grad_norm": 1.4703973708752998, + "learning_rate": 4.942043237793114e-05, + "loss": 0.8241, + "step": 3259 + }, + { + "epoch": 0.38657654452745166, + "grad_norm": 1.7339408503007132, + "learning_rate": 4.94199184088081e-05, + "loss": 0.9833, + "step": 3260 + }, + { + "epoch": 0.38669512628957664, + "grad_norm": 1.7571404510200175, + "learning_rate": 4.9419404214563346e-05, + "loss": 0.7578, + "step": 3261 + }, + { + "epoch": 0.3868137080517016, + "grad_norm": 1.352606476313795, + "learning_rate": 4.941888979520163e-05, + "loss": 0.6468, + "step": 3262 + }, + { + "epoch": 0.38693228981382666, + "grad_norm": 1.5753864298295939, + "learning_rate": 4.94183751507277e-05, + "loss": 0.7856, + "step": 3263 + }, + { + "epoch": 0.38705087157595164, + "grad_norm": 1.3628070917687545, + "learning_rate": 4.94178602811463e-05, + "loss": 0.674, + "step": 3264 + }, + { + "epoch": 0.3871694533380766, + "grad_norm": 1.4276681805990732, + "learning_rate": 4.941734518646216e-05, + "loss": 0.87, + "step": 3265 + }, + { + "epoch": 0.3872880351002016, + "grad_norm": 1.447348059374916, + "learning_rate": 4.941682986668005e-05, + "loss": 0.8123, + "step": 3266 + }, + { + "epoch": 0.3874066168623266, + "grad_norm": 1.577136083871205, + "learning_rate": 4.9416314321804705e-05, + "loss": 1.1176, + "step": 3267 + }, + { + "epoch": 0.38752519862445156, + "grad_norm": 1.5432176237036788, + "learning_rate": 4.9415798551840884e-05, + "loss": 0.8998, + "step": 3268 + }, + { + "epoch": 0.38764378038657654, + "grad_norm": 1.5614019493619116, + "learning_rate": 4.941528255679334e-05, + "loss": 0.8637, + "step": 3269 + }, + { + "epoch": 0.3877623621487015, + "grad_norm": 1.2358149862891883, + "learning_rate": 4.941476633666683e-05, + "loss": 0.6726, + "step": 3270 + }, + { + "epoch": 0.3878809439108265, + "grad_norm": 1.435991032563116, + "learning_rate": 4.941424989146612e-05, + "loss": 0.8389, + "step": 3271 + }, + { + "epoch": 0.3879995256729515, + "grad_norm": 1.7090639872696989, + "learning_rate": 4.9413733221195954e-05, + "loss": 0.8458, + "step": 3272 + }, + { + "epoch": 0.38811810743507646, + "grad_norm": 1.370839445725039, + "learning_rate": 4.941321632586112e-05, + "loss": 0.6159, + "step": 3273 + }, + { + "epoch": 0.38823668919720145, + "grad_norm": 1.6330061753200962, + "learning_rate": 4.941269920546636e-05, + "loss": 0.873, + "step": 3274 + }, + { + "epoch": 0.3883552709593265, + "grad_norm": 1.5734966018109338, + "learning_rate": 4.941218186001645e-05, + "loss": 0.9558, + "step": 3275 + }, + { + "epoch": 0.38847385272145146, + "grad_norm": 1.708319256953241, + "learning_rate": 4.9411664289516154e-05, + "loss": 0.7515, + "step": 3276 + }, + { + "epoch": 0.38859243448357644, + "grad_norm": 1.678446161771092, + "learning_rate": 4.941114649397026e-05, + "loss": 0.807, + "step": 3277 + }, + { + "epoch": 0.3887110162457014, + "grad_norm": 2.0889158301984665, + "learning_rate": 4.941062847338353e-05, + "loss": 1.0104, + "step": 3278 + }, + { + "epoch": 0.3888295980078264, + "grad_norm": 1.3491515048262477, + "learning_rate": 4.941011022776073e-05, + "loss": 0.6331, + "step": 3279 + }, + { + "epoch": 0.3889481797699514, + "grad_norm": 1.8027892738195368, + "learning_rate": 4.9409591757106655e-05, + "loss": 0.8477, + "step": 3280 + }, + { + "epoch": 0.38906676153207637, + "grad_norm": 1.6248770939591675, + "learning_rate": 4.9409073061426084e-05, + "loss": 1.0085, + "step": 3281 + }, + { + "epoch": 0.38918534329420135, + "grad_norm": 1.5348868739595138, + "learning_rate": 4.940855414072377e-05, + "loss": 0.7302, + "step": 3282 + }, + { + "epoch": 0.3893039250563263, + "grad_norm": 1.460753591592435, + "learning_rate": 4.940803499500454e-05, + "loss": 0.8878, + "step": 3283 + }, + { + "epoch": 0.3894225068184513, + "grad_norm": 1.6395725770935132, + "learning_rate": 4.940751562427315e-05, + "loss": 0.7117, + "step": 3284 + }, + { + "epoch": 0.3895410885805763, + "grad_norm": 1.7980389044425864, + "learning_rate": 4.940699602853439e-05, + "loss": 0.8298, + "step": 3285 + }, + { + "epoch": 0.38965967034270127, + "grad_norm": 1.423547702493396, + "learning_rate": 4.940647620779307e-05, + "loss": 0.7047, + "step": 3286 + }, + { + "epoch": 0.38977825210482625, + "grad_norm": 1.4263621663054926, + "learning_rate": 4.940595616205396e-05, + "loss": 0.8817, + "step": 3287 + }, + { + "epoch": 0.3898968338669513, + "grad_norm": 1.422393723432806, + "learning_rate": 4.940543589132186e-05, + "loss": 0.7458, + "step": 3288 + }, + { + "epoch": 0.39001541562907627, + "grad_norm": 1.4784881977921902, + "learning_rate": 4.9404915395601574e-05, + "loss": 0.6777, + "step": 3289 + }, + { + "epoch": 0.39013399739120125, + "grad_norm": 1.5326485409600152, + "learning_rate": 4.940439467489789e-05, + "loss": 0.8657, + "step": 3290 + }, + { + "epoch": 0.39025257915332623, + "grad_norm": 1.7660718154046016, + "learning_rate": 4.940387372921562e-05, + "loss": 0.9972, + "step": 3291 + }, + { + "epoch": 0.3903711609154512, + "grad_norm": 1.7339433884538618, + "learning_rate": 4.9403352558559564e-05, + "loss": 0.793, + "step": 3292 + }, + { + "epoch": 0.3904897426775762, + "grad_norm": 1.8361831142060703, + "learning_rate": 4.940283116293452e-05, + "loss": 0.7304, + "step": 3293 + }, + { + "epoch": 0.39060832443970117, + "grad_norm": 1.6550205962358926, + "learning_rate": 4.9402309542345294e-05, + "loss": 0.8737, + "step": 3294 + }, + { + "epoch": 0.39072690620182615, + "grad_norm": 1.9644551239812384, + "learning_rate": 4.9401787696796695e-05, + "loss": 0.7666, + "step": 3295 + }, + { + "epoch": 0.39084548796395113, + "grad_norm": 1.7150059625474834, + "learning_rate": 4.9401265626293534e-05, + "loss": 0.8499, + "step": 3296 + }, + { + "epoch": 0.3909640697260761, + "grad_norm": 1.5227352248686579, + "learning_rate": 4.9400743330840634e-05, + "loss": 0.8438, + "step": 3297 + }, + { + "epoch": 0.3910826514882011, + "grad_norm": 1.578894035824458, + "learning_rate": 4.940022081044281e-05, + "loss": 0.9757, + "step": 3298 + }, + { + "epoch": 0.3912012332503261, + "grad_norm": 1.6238206759532363, + "learning_rate": 4.939969806510486e-05, + "loss": 0.8863, + "step": 3299 + }, + { + "epoch": 0.3913198150124511, + "grad_norm": 1.3440291685675794, + "learning_rate": 4.939917509483162e-05, + "loss": 0.7646, + "step": 3300 + }, + { + "epoch": 0.3914383967745761, + "grad_norm": 1.5086282919821539, + "learning_rate": 4.9398651899627904e-05, + "loss": 0.6555, + "step": 3301 + }, + { + "epoch": 0.3915569785367011, + "grad_norm": 1.1122777817209073, + "learning_rate": 4.9398128479498535e-05, + "loss": 0.4731, + "step": 3302 + }, + { + "epoch": 0.39167556029882605, + "grad_norm": 1.4516725550872647, + "learning_rate": 4.939760483444834e-05, + "loss": 0.6484, + "step": 3303 + }, + { + "epoch": 0.39179414206095103, + "grad_norm": 1.4504480110526297, + "learning_rate": 4.939708096448216e-05, + "loss": 0.8402, + "step": 3304 + }, + { + "epoch": 0.391912723823076, + "grad_norm": 1.1918348332356694, + "learning_rate": 4.939655686960479e-05, + "loss": 0.6642, + "step": 3305 + }, + { + "epoch": 0.392031305585201, + "grad_norm": 1.4163219298573644, + "learning_rate": 4.93960325498211e-05, + "loss": 0.7743, + "step": 3306 + }, + { + "epoch": 0.392149887347326, + "grad_norm": 1.6273564640794183, + "learning_rate": 4.939550800513589e-05, + "loss": 0.8476, + "step": 3307 + }, + { + "epoch": 0.39226846910945096, + "grad_norm": 1.5597893820936082, + "learning_rate": 4.9394983235554026e-05, + "loss": 0.7616, + "step": 3308 + }, + { + "epoch": 0.39238705087157594, + "grad_norm": 1.5828427790084643, + "learning_rate": 4.939445824108033e-05, + "loss": 0.7975, + "step": 3309 + }, + { + "epoch": 0.3925056326337009, + "grad_norm": 1.3681850527160384, + "learning_rate": 4.939393302171964e-05, + "loss": 0.7218, + "step": 3310 + }, + { + "epoch": 0.3926242143958259, + "grad_norm": 1.6608662129483673, + "learning_rate": 4.93934075774768e-05, + "loss": 0.803, + "step": 3311 + }, + { + "epoch": 0.3927427961579509, + "grad_norm": 1.4518046880463196, + "learning_rate": 4.9392881908356656e-05, + "loss": 0.7858, + "step": 3312 + }, + { + "epoch": 0.3928613779200759, + "grad_norm": 1.7026444158450262, + "learning_rate": 4.9392356014364064e-05, + "loss": 0.8648, + "step": 3313 + }, + { + "epoch": 0.3929799596822009, + "grad_norm": 1.3049209607878027, + "learning_rate": 4.939182989550384e-05, + "loss": 0.5429, + "step": 3314 + }, + { + "epoch": 0.3930985414443259, + "grad_norm": 1.4527370492530713, + "learning_rate": 4.9391303551780875e-05, + "loss": 0.7274, + "step": 3315 + }, + { + "epoch": 0.39321712320645086, + "grad_norm": 1.3896346423597878, + "learning_rate": 4.9390776983199995e-05, + "loss": 0.6508, + "step": 3316 + }, + { + "epoch": 0.39333570496857584, + "grad_norm": 1.6249645979739245, + "learning_rate": 4.939025018976606e-05, + "loss": 0.7569, + "step": 3317 + }, + { + "epoch": 0.3934542867307008, + "grad_norm": 1.5190156063919769, + "learning_rate": 4.938972317148392e-05, + "loss": 0.7588, + "step": 3318 + }, + { + "epoch": 0.3935728684928258, + "grad_norm": 1.6685653326119232, + "learning_rate": 4.9389195928358455e-05, + "loss": 0.9356, + "step": 3319 + }, + { + "epoch": 0.3936914502549508, + "grad_norm": 1.8987857422184213, + "learning_rate": 4.93886684603945e-05, + "loss": 0.8156, + "step": 3320 + }, + { + "epoch": 0.39381003201707576, + "grad_norm": 1.5318532587333362, + "learning_rate": 4.938814076759694e-05, + "loss": 0.8025, + "step": 3321 + }, + { + "epoch": 0.39392861377920074, + "grad_norm": 1.5395002117451884, + "learning_rate": 4.938761284997062e-05, + "loss": 0.8123, + "step": 3322 + }, + { + "epoch": 0.3940471955413257, + "grad_norm": 1.5854882781999662, + "learning_rate": 4.9387084707520424e-05, + "loss": 0.6995, + "step": 3323 + }, + { + "epoch": 0.3941657773034507, + "grad_norm": 1.7024722292091725, + "learning_rate": 4.938655634025121e-05, + "loss": 0.7811, + "step": 3324 + }, + { + "epoch": 0.39428435906557574, + "grad_norm": 1.53892323711443, + "learning_rate": 4.938602774816784e-05, + "loss": 0.6055, + "step": 3325 + }, + { + "epoch": 0.3944029408277007, + "grad_norm": 1.5910635459703626, + "learning_rate": 4.938549893127522e-05, + "loss": 0.6579, + "step": 3326 + }, + { + "epoch": 0.3945215225898257, + "grad_norm": 1.4956944844954958, + "learning_rate": 4.938496988957818e-05, + "loss": 0.9684, + "step": 3327 + }, + { + "epoch": 0.3946401043519507, + "grad_norm": 1.5579680423675262, + "learning_rate": 4.9384440623081634e-05, + "loss": 0.86, + "step": 3328 + }, + { + "epoch": 0.39475868611407566, + "grad_norm": 1.6918983159602863, + "learning_rate": 4.938391113179045e-05, + "loss": 0.8824, + "step": 3329 + }, + { + "epoch": 0.39487726787620064, + "grad_norm": 1.6496230498757027, + "learning_rate": 4.938338141570949e-05, + "loss": 0.7569, + "step": 3330 + }, + { + "epoch": 0.3949958496383256, + "grad_norm": 1.6788619839830157, + "learning_rate": 4.938285147484367e-05, + "loss": 0.813, + "step": 3331 + }, + { + "epoch": 0.3951144314004506, + "grad_norm": 1.4458562756329658, + "learning_rate": 4.938232130919786e-05, + "loss": 0.6295, + "step": 3332 + }, + { + "epoch": 0.3952330131625756, + "grad_norm": 1.505330626191431, + "learning_rate": 4.9381790918776944e-05, + "loss": 0.7627, + "step": 3333 + }, + { + "epoch": 0.39535159492470057, + "grad_norm": 1.4694504990661184, + "learning_rate": 4.938126030358581e-05, + "loss": 0.6246, + "step": 3334 + }, + { + "epoch": 0.39547017668682555, + "grad_norm": 1.4346908739765836, + "learning_rate": 4.938072946362936e-05, + "loss": 0.7796, + "step": 3335 + }, + { + "epoch": 0.39558875844895053, + "grad_norm": 1.293304158845047, + "learning_rate": 4.9380198398912484e-05, + "loss": 0.5415, + "step": 3336 + }, + { + "epoch": 0.3957073402110755, + "grad_norm": 1.7584943751560544, + "learning_rate": 4.9379667109440066e-05, + "loss": 0.8135, + "step": 3337 + }, + { + "epoch": 0.39582592197320055, + "grad_norm": 1.7645390824554963, + "learning_rate": 4.937913559521702e-05, + "loss": 0.8113, + "step": 3338 + }, + { + "epoch": 0.3959445037353255, + "grad_norm": 1.4701346530124066, + "learning_rate": 4.937860385624824e-05, + "loss": 0.5539, + "step": 3339 + }, + { + "epoch": 0.3960630854974505, + "grad_norm": 1.239994218604369, + "learning_rate": 4.937807189253862e-05, + "loss": 0.5161, + "step": 3340 + }, + { + "epoch": 0.3961816672595755, + "grad_norm": 1.4749371391915223, + "learning_rate": 4.937753970409308e-05, + "loss": 0.7041, + "step": 3341 + }, + { + "epoch": 0.39630024902170047, + "grad_norm": 2.29390332576408, + "learning_rate": 4.937700729091652e-05, + "loss": 1.0802, + "step": 3342 + }, + { + "epoch": 0.39641883078382545, + "grad_norm": 2.0241809279902667, + "learning_rate": 4.9376474653013836e-05, + "loss": 1.0442, + "step": 3343 + }, + { + "epoch": 0.39653741254595043, + "grad_norm": 1.647959462601484, + "learning_rate": 4.937594179038996e-05, + "loss": 0.8566, + "step": 3344 + }, + { + "epoch": 0.3966559943080754, + "grad_norm": 1.5882247130810698, + "learning_rate": 4.937540870304978e-05, + "loss": 0.588, + "step": 3345 + }, + { + "epoch": 0.3967745760702004, + "grad_norm": 1.4582133961969428, + "learning_rate": 4.937487539099823e-05, + "loss": 0.6158, + "step": 3346 + }, + { + "epoch": 0.39689315783232537, + "grad_norm": 1.9055394927783285, + "learning_rate": 4.937434185424021e-05, + "loss": 0.9306, + "step": 3347 + }, + { + "epoch": 0.39701173959445035, + "grad_norm": 1.6881088256853234, + "learning_rate": 4.937380809278066e-05, + "loss": 0.7005, + "step": 3348 + }, + { + "epoch": 0.39713032135657533, + "grad_norm": 1.7204046429486144, + "learning_rate": 4.9373274106624486e-05, + "loss": 0.718, + "step": 3349 + }, + { + "epoch": 0.39724890311870037, + "grad_norm": 1.813095108852824, + "learning_rate": 4.937273989577661e-05, + "loss": 0.7771, + "step": 3350 + }, + { + "epoch": 0.39736748488082535, + "grad_norm": 1.6675996370654633, + "learning_rate": 4.937220546024196e-05, + "loss": 0.8399, + "step": 3351 + }, + { + "epoch": 0.39748606664295033, + "grad_norm": 1.8288119795391033, + "learning_rate": 4.937167080002546e-05, + "loss": 0.9696, + "step": 3352 + }, + { + "epoch": 0.3976046484050753, + "grad_norm": 1.6153792563060803, + "learning_rate": 4.9371135915132046e-05, + "loss": 0.7503, + "step": 3353 + }, + { + "epoch": 0.3977232301672003, + "grad_norm": 1.2700027823378905, + "learning_rate": 4.9370600805566644e-05, + "loss": 0.82, + "step": 3354 + }, + { + "epoch": 0.3978418119293253, + "grad_norm": 1.364088379972212, + "learning_rate": 4.9370065471334195e-05, + "loss": 0.6156, + "step": 3355 + }, + { + "epoch": 0.39796039369145025, + "grad_norm": 1.503570511929708, + "learning_rate": 4.936952991243961e-05, + "loss": 0.8318, + "step": 3356 + }, + { + "epoch": 0.39807897545357523, + "grad_norm": 1.4208174714595145, + "learning_rate": 4.936899412888786e-05, + "loss": 0.5889, + "step": 3357 + }, + { + "epoch": 0.3981975572157002, + "grad_norm": 1.263580305900277, + "learning_rate": 4.936845812068386e-05, + "loss": 0.7254, + "step": 3358 + }, + { + "epoch": 0.3983161389778252, + "grad_norm": 1.6364896714071715, + "learning_rate": 4.9367921887832555e-05, + "loss": 0.6205, + "step": 3359 + }, + { + "epoch": 0.3984347207399502, + "grad_norm": 1.6880092895329795, + "learning_rate": 4.93673854303389e-05, + "loss": 0.7069, + "step": 3360 + }, + { + "epoch": 0.39855330250207516, + "grad_norm": 1.7146483234076988, + "learning_rate": 4.9366848748207825e-05, + "loss": 0.9251, + "step": 3361 + }, + { + "epoch": 0.3986718842642002, + "grad_norm": 1.2106349928792526, + "learning_rate": 4.93663118414443e-05, + "loss": 0.4962, + "step": 3362 + }, + { + "epoch": 0.3987904660263252, + "grad_norm": 1.5855827639331197, + "learning_rate": 4.936577471005325e-05, + "loss": 0.892, + "step": 3363 + }, + { + "epoch": 0.39890904778845016, + "grad_norm": 1.5816083378500552, + "learning_rate": 4.936523735403963e-05, + "loss": 0.6376, + "step": 3364 + }, + { + "epoch": 0.39902762955057514, + "grad_norm": 1.4855177848230925, + "learning_rate": 4.9364699773408406e-05, + "loss": 0.6907, + "step": 3365 + }, + { + "epoch": 0.3991462113127001, + "grad_norm": 1.3170190881884658, + "learning_rate": 4.9364161968164524e-05, + "loss": 0.6577, + "step": 3366 + }, + { + "epoch": 0.3992647930748251, + "grad_norm": 1.6298589072215264, + "learning_rate": 4.936362393831295e-05, + "loss": 0.8367, + "step": 3367 + }, + { + "epoch": 0.3993833748369501, + "grad_norm": 1.5387479753800952, + "learning_rate": 4.9363085683858644e-05, + "loss": 0.9639, + "step": 3368 + }, + { + "epoch": 0.39950195659907506, + "grad_norm": 1.5261479950227446, + "learning_rate": 4.936254720480656e-05, + "loss": 0.7021, + "step": 3369 + }, + { + "epoch": 0.39962053836120004, + "grad_norm": 1.9399829320051425, + "learning_rate": 4.936200850116166e-05, + "loss": 0.8731, + "step": 3370 + }, + { + "epoch": 0.399739120123325, + "grad_norm": 1.4958363778256412, + "learning_rate": 4.9361469572928925e-05, + "loss": 0.6063, + "step": 3371 + }, + { + "epoch": 0.39985770188545, + "grad_norm": 1.6236347143526728, + "learning_rate": 4.936093042011331e-05, + "loss": 0.9497, + "step": 3372 + }, + { + "epoch": 0.399976283647575, + "grad_norm": 1.8810137861852714, + "learning_rate": 4.9360391042719786e-05, + "loss": 0.9255, + "step": 3373 + }, + { + "epoch": 0.40009486540969996, + "grad_norm": 1.3787712495858466, + "learning_rate": 4.935985144075334e-05, + "loss": 0.6541, + "step": 3374 + }, + { + "epoch": 0.400213447171825, + "grad_norm": 1.5274191748814043, + "learning_rate": 4.9359311614218925e-05, + "loss": 0.9342, + "step": 3375 + }, + { + "epoch": 0.40033202893395, + "grad_norm": 1.344509935976214, + "learning_rate": 4.9358771563121544e-05, + "loss": 0.4745, + "step": 3376 + }, + { + "epoch": 0.40045061069607496, + "grad_norm": 1.6576988718001862, + "learning_rate": 4.935823128746615e-05, + "loss": 0.8457, + "step": 3377 + }, + { + "epoch": 0.40056919245819994, + "grad_norm": 1.5238953304913703, + "learning_rate": 4.9357690787257727e-05, + "loss": 0.8041, + "step": 3378 + }, + { + "epoch": 0.4006877742203249, + "grad_norm": 1.405020032574401, + "learning_rate": 4.935715006250127e-05, + "loss": 0.6469, + "step": 3379 + }, + { + "epoch": 0.4008063559824499, + "grad_norm": 1.5413253189765799, + "learning_rate": 4.9356609113201765e-05, + "loss": 0.8746, + "step": 3380 + }, + { + "epoch": 0.4009249377445749, + "grad_norm": 1.5114623058462089, + "learning_rate": 4.935606793936418e-05, + "loss": 0.8747, + "step": 3381 + }, + { + "epoch": 0.40104351950669986, + "grad_norm": 1.2867647258247454, + "learning_rate": 4.935552654099352e-05, + "loss": 0.8175, + "step": 3382 + }, + { + "epoch": 0.40116210126882484, + "grad_norm": 1.3478712432482556, + "learning_rate": 4.935498491809478e-05, + "loss": 0.6771, + "step": 3383 + }, + { + "epoch": 0.4012806830309498, + "grad_norm": 1.7330315836675476, + "learning_rate": 4.935444307067294e-05, + "loss": 0.7197, + "step": 3384 + }, + { + "epoch": 0.4013992647930748, + "grad_norm": 1.4665619216901866, + "learning_rate": 4.935390099873299e-05, + "loss": 0.7805, + "step": 3385 + }, + { + "epoch": 0.4015178465551998, + "grad_norm": 1.440689315931767, + "learning_rate": 4.935335870227995e-05, + "loss": 0.8184, + "step": 3386 + }, + { + "epoch": 0.4016364283173248, + "grad_norm": 1.6097100683343846, + "learning_rate": 4.93528161813188e-05, + "loss": 0.8286, + "step": 3387 + }, + { + "epoch": 0.4017550100794498, + "grad_norm": 1.5429713573431623, + "learning_rate": 4.935227343585456e-05, + "loss": 0.9283, + "step": 3388 + }, + { + "epoch": 0.4018735918415748, + "grad_norm": 1.3851640255278754, + "learning_rate": 4.935173046589221e-05, + "loss": 0.8534, + "step": 3389 + }, + { + "epoch": 0.40199217360369977, + "grad_norm": 1.5980185114582355, + "learning_rate": 4.935118727143677e-05, + "loss": 0.8216, + "step": 3390 + }, + { + "epoch": 0.40211075536582475, + "grad_norm": 1.628011852553332, + "learning_rate": 4.9350643852493246e-05, + "loss": 0.9178, + "step": 3391 + }, + { + "epoch": 0.4022293371279497, + "grad_norm": 1.2916267016545147, + "learning_rate": 4.935010020906664e-05, + "loss": 0.7533, + "step": 3392 + }, + { + "epoch": 0.4023479188900747, + "grad_norm": 1.3867717963000399, + "learning_rate": 4.934955634116198e-05, + "loss": 0.5335, + "step": 3393 + }, + { + "epoch": 0.4024665006521997, + "grad_norm": 1.5893106206145253, + "learning_rate": 4.934901224878427e-05, + "loss": 0.8569, + "step": 3394 + }, + { + "epoch": 0.40258508241432467, + "grad_norm": 1.1778019412265597, + "learning_rate": 4.9348467931938513e-05, + "loss": 0.5039, + "step": 3395 + }, + { + "epoch": 0.40270366417644965, + "grad_norm": 1.8783924132261551, + "learning_rate": 4.9347923390629754e-05, + "loss": 1.0726, + "step": 3396 + }, + { + "epoch": 0.40282224593857463, + "grad_norm": 1.2022616088968383, + "learning_rate": 4.9347378624862996e-05, + "loss": 0.5291, + "step": 3397 + }, + { + "epoch": 0.4029408277006996, + "grad_norm": 1.4995959226947364, + "learning_rate": 4.9346833634643254e-05, + "loss": 0.75, + "step": 3398 + }, + { + "epoch": 0.4030594094628246, + "grad_norm": 1.8034446989076438, + "learning_rate": 4.934628841997557e-05, + "loss": 0.7166, + "step": 3399 + }, + { + "epoch": 0.40317799122494963, + "grad_norm": 1.3233313178205286, + "learning_rate": 4.9345742980864965e-05, + "loss": 0.6449, + "step": 3400 + }, + { + "epoch": 0.4032965729870746, + "grad_norm": 1.4255309221165924, + "learning_rate": 4.934519731731645e-05, + "loss": 0.7342, + "step": 3401 + }, + { + "epoch": 0.4034151547491996, + "grad_norm": 1.5792627725796746, + "learning_rate": 4.934465142933509e-05, + "loss": 0.6658, + "step": 3402 + }, + { + "epoch": 0.40353373651132457, + "grad_norm": 1.727653623241166, + "learning_rate": 4.934410531692588e-05, + "loss": 0.7189, + "step": 3403 + }, + { + "epoch": 0.40365231827344955, + "grad_norm": 2.1020966399488366, + "learning_rate": 4.9343558980093886e-05, + "loss": 0.8857, + "step": 3404 + }, + { + "epoch": 0.40377090003557453, + "grad_norm": 1.7388821362600073, + "learning_rate": 4.934301241884412e-05, + "loss": 0.9631, + "step": 3405 + }, + { + "epoch": 0.4038894817976995, + "grad_norm": 1.4400083498352596, + "learning_rate": 4.934246563318163e-05, + "loss": 0.4774, + "step": 3406 + }, + { + "epoch": 0.4040080635598245, + "grad_norm": 1.9420579513930956, + "learning_rate": 4.9341918623111465e-05, + "loss": 0.6372, + "step": 3407 + }, + { + "epoch": 0.4041266453219495, + "grad_norm": 1.5099200276144495, + "learning_rate": 4.9341371388638654e-05, + "loss": 0.9154, + "step": 3408 + }, + { + "epoch": 0.40424522708407445, + "grad_norm": 1.5452078716447772, + "learning_rate": 4.9340823929768256e-05, + "loss": 0.7251, + "step": 3409 + }, + { + "epoch": 0.40436380884619944, + "grad_norm": 1.3661487184230467, + "learning_rate": 4.9340276246505304e-05, + "loss": 0.6121, + "step": 3410 + }, + { + "epoch": 0.4044823906083244, + "grad_norm": 1.961458200481552, + "learning_rate": 4.9339728338854854e-05, + "loss": 1.0919, + "step": 3411 + }, + { + "epoch": 0.40460097237044945, + "grad_norm": 1.5911094321446138, + "learning_rate": 4.9339180206821955e-05, + "loss": 0.7319, + "step": 3412 + }, + { + "epoch": 0.40471955413257443, + "grad_norm": 1.7372527818237073, + "learning_rate": 4.933863185041167e-05, + "loss": 0.766, + "step": 3413 + }, + { + "epoch": 0.4048381358946994, + "grad_norm": 1.4697016060135826, + "learning_rate": 4.9338083269629033e-05, + "loss": 0.7045, + "step": 3414 + }, + { + "epoch": 0.4049567176568244, + "grad_norm": 1.3166802504153585, + "learning_rate": 4.9337534464479115e-05, + "loss": 0.6637, + "step": 3415 + }, + { + "epoch": 0.4050752994189494, + "grad_norm": 1.277464774511906, + "learning_rate": 4.9336985434966974e-05, + "loss": 0.5968, + "step": 3416 + }, + { + "epoch": 0.40519388118107436, + "grad_norm": 1.5931220291623915, + "learning_rate": 4.933643618109768e-05, + "loss": 0.9288, + "step": 3417 + }, + { + "epoch": 0.40531246294319934, + "grad_norm": 1.2518560200232918, + "learning_rate": 4.933588670287628e-05, + "loss": 0.7393, + "step": 3418 + }, + { + "epoch": 0.4054310447053243, + "grad_norm": 1.5079765308091393, + "learning_rate": 4.933533700030785e-05, + "loss": 0.6285, + "step": 3419 + }, + { + "epoch": 0.4055496264674493, + "grad_norm": 1.6601107216223718, + "learning_rate": 4.9334787073397457e-05, + "loss": 0.5472, + "step": 3420 + }, + { + "epoch": 0.4056682082295743, + "grad_norm": 1.3242192456250603, + "learning_rate": 4.9334236922150165e-05, + "loss": 0.6104, + "step": 3421 + }, + { + "epoch": 0.40578678999169926, + "grad_norm": 1.7438692663798263, + "learning_rate": 4.933368654657105e-05, + "loss": 0.9018, + "step": 3422 + }, + { + "epoch": 0.40590537175382424, + "grad_norm": 1.4367483782157127, + "learning_rate": 4.9333135946665184e-05, + "loss": 0.6488, + "step": 3423 + }, + { + "epoch": 0.4060239535159492, + "grad_norm": 1.5978080753042538, + "learning_rate": 4.9332585122437645e-05, + "loss": 0.6762, + "step": 3424 + }, + { + "epoch": 0.40614253527807426, + "grad_norm": 1.4488677975801816, + "learning_rate": 4.933203407389351e-05, + "loss": 0.5466, + "step": 3425 + }, + { + "epoch": 0.40626111704019924, + "grad_norm": 1.5824385254384985, + "learning_rate": 4.9331482801037856e-05, + "loss": 0.7685, + "step": 3426 + }, + { + "epoch": 0.4063796988023242, + "grad_norm": 1.9380282970927374, + "learning_rate": 4.933093130387577e-05, + "loss": 1.0251, + "step": 3427 + }, + { + "epoch": 0.4064982805644492, + "grad_norm": 1.4500720640721474, + "learning_rate": 4.933037958241233e-05, + "loss": 0.6312, + "step": 3428 + }, + { + "epoch": 0.4066168623265742, + "grad_norm": 1.5916545605778114, + "learning_rate": 4.9329827636652634e-05, + "loss": 0.7453, + "step": 3429 + }, + { + "epoch": 0.40673544408869916, + "grad_norm": 1.6341541487652664, + "learning_rate": 4.932927546660176e-05, + "loss": 0.6945, + "step": 3430 + }, + { + "epoch": 0.40685402585082414, + "grad_norm": 1.7196416855281644, + "learning_rate": 4.9328723072264796e-05, + "loss": 0.8219, + "step": 3431 + }, + { + "epoch": 0.4069726076129491, + "grad_norm": 1.5825117829133402, + "learning_rate": 4.932817045364684e-05, + "loss": 0.7061, + "step": 3432 + }, + { + "epoch": 0.4070911893750741, + "grad_norm": 1.5792277168757893, + "learning_rate": 4.932761761075299e-05, + "loss": 0.7178, + "step": 3433 + }, + { + "epoch": 0.4072097711371991, + "grad_norm": 1.5135792919178124, + "learning_rate": 4.932706454358834e-05, + "loss": 0.7541, + "step": 3434 + }, + { + "epoch": 0.40732835289932406, + "grad_norm": 1.44129597228733, + "learning_rate": 4.932651125215798e-05, + "loss": 0.5207, + "step": 3435 + }, + { + "epoch": 0.40744693466144905, + "grad_norm": 1.418601843407384, + "learning_rate": 4.9325957736467024e-05, + "loss": 0.82, + "step": 3436 + }, + { + "epoch": 0.4075655164235741, + "grad_norm": 1.6785701775057487, + "learning_rate": 4.9325403996520556e-05, + "loss": 0.8429, + "step": 3437 + }, + { + "epoch": 0.40768409818569906, + "grad_norm": 1.5274176025786574, + "learning_rate": 4.932485003232371e-05, + "loss": 0.7568, + "step": 3438 + }, + { + "epoch": 0.40780267994782404, + "grad_norm": 1.4593084696703231, + "learning_rate": 4.932429584388156e-05, + "loss": 0.8834, + "step": 3439 + }, + { + "epoch": 0.407921261709949, + "grad_norm": 1.5031065216817876, + "learning_rate": 4.932374143119924e-05, + "loss": 0.6498, + "step": 3440 + }, + { + "epoch": 0.408039843472074, + "grad_norm": 1.5074483218756476, + "learning_rate": 4.9323186794281854e-05, + "loss": 0.9409, + "step": 3441 + }, + { + "epoch": 0.408158425234199, + "grad_norm": 1.3944935465291692, + "learning_rate": 4.93226319331345e-05, + "loss": 0.6616, + "step": 3442 + }, + { + "epoch": 0.40827700699632397, + "grad_norm": 1.7637032208267687, + "learning_rate": 4.9322076847762325e-05, + "loss": 0.886, + "step": 3443 + }, + { + "epoch": 0.40839558875844895, + "grad_norm": 1.5682612317748061, + "learning_rate": 4.932152153817041e-05, + "loss": 0.7478, + "step": 3444 + }, + { + "epoch": 0.4085141705205739, + "grad_norm": 1.5443493968539377, + "learning_rate": 4.93209660043639e-05, + "loss": 0.8019, + "step": 3445 + }, + { + "epoch": 0.4086327522826989, + "grad_norm": 1.5744348073970764, + "learning_rate": 4.93204102463479e-05, + "loss": 0.8721, + "step": 3446 + }, + { + "epoch": 0.4087513340448239, + "grad_norm": 1.5048826586613848, + "learning_rate": 4.931985426412755e-05, + "loss": 0.7389, + "step": 3447 + }, + { + "epoch": 0.40886991580694887, + "grad_norm": 1.6222487926822102, + "learning_rate": 4.9319298057707963e-05, + "loss": 0.7643, + "step": 3448 + }, + { + "epoch": 0.40898849756907385, + "grad_norm": 1.3656604469741485, + "learning_rate": 4.9318741627094274e-05, + "loss": 0.6644, + "step": 3449 + }, + { + "epoch": 0.4091070793311989, + "grad_norm": 1.6069248588623468, + "learning_rate": 4.9318184972291605e-05, + "loss": 0.8331, + "step": 3450 + }, + { + "epoch": 0.40922566109332387, + "grad_norm": 1.6028723093677715, + "learning_rate": 4.931762809330509e-05, + "loss": 0.8465, + "step": 3451 + }, + { + "epoch": 0.40934424285544885, + "grad_norm": 1.6336867309047571, + "learning_rate": 4.931707099013987e-05, + "loss": 0.7729, + "step": 3452 + }, + { + "epoch": 0.40946282461757383, + "grad_norm": 1.5954690811340562, + "learning_rate": 4.931651366280107e-05, + "loss": 0.8472, + "step": 3453 + }, + { + "epoch": 0.4095814063796988, + "grad_norm": 1.4966391219210247, + "learning_rate": 4.9315956111293834e-05, + "loss": 0.6397, + "step": 3454 + }, + { + "epoch": 0.4096999881418238, + "grad_norm": 1.4440028027098661, + "learning_rate": 4.93153983356233e-05, + "loss": 0.7314, + "step": 3455 + }, + { + "epoch": 0.40981856990394877, + "grad_norm": 1.5852238093115323, + "learning_rate": 4.931484033579461e-05, + "loss": 0.5464, + "step": 3456 + }, + { + "epoch": 0.40993715166607375, + "grad_norm": 1.5729654258586305, + "learning_rate": 4.931428211181292e-05, + "loss": 0.7612, + "step": 3457 + }, + { + "epoch": 0.41005573342819873, + "grad_norm": 1.3487204548532536, + "learning_rate": 4.9313723663683347e-05, + "loss": 0.4851, + "step": 3458 + }, + { + "epoch": 0.4101743151903237, + "grad_norm": 1.4905212843137303, + "learning_rate": 4.9313164991411064e-05, + "loss": 0.6278, + "step": 3459 + }, + { + "epoch": 0.4102928969524487, + "grad_norm": 1.3755600685344214, + "learning_rate": 4.9312606095001226e-05, + "loss": 0.7017, + "step": 3460 + }, + { + "epoch": 0.4104114787145737, + "grad_norm": 1.8752606132270313, + "learning_rate": 4.931204697445896e-05, + "loss": 0.7082, + "step": 3461 + }, + { + "epoch": 0.4105300604766987, + "grad_norm": 1.577139917113333, + "learning_rate": 4.9311487629789435e-05, + "loss": 0.6306, + "step": 3462 + }, + { + "epoch": 0.4106486422388237, + "grad_norm": 1.5914606207220774, + "learning_rate": 4.9310928060997816e-05, + "loss": 0.8667, + "step": 3463 + }, + { + "epoch": 0.41076722400094867, + "grad_norm": 1.7841461888097216, + "learning_rate": 4.931036826808925e-05, + "loss": 0.8442, + "step": 3464 + }, + { + "epoch": 0.41088580576307365, + "grad_norm": 1.5518005588465547, + "learning_rate": 4.930980825106889e-05, + "loss": 0.6227, + "step": 3465 + }, + { + "epoch": 0.41100438752519863, + "grad_norm": 1.7562414908245274, + "learning_rate": 4.9309248009941914e-05, + "loss": 0.725, + "step": 3466 + }, + { + "epoch": 0.4111229692873236, + "grad_norm": 1.4494233795813607, + "learning_rate": 4.930868754471348e-05, + "loss": 0.6459, + "step": 3467 + }, + { + "epoch": 0.4112415510494486, + "grad_norm": 1.442957451447209, + "learning_rate": 4.9308126855388766e-05, + "loss": 0.7303, + "step": 3468 + }, + { + "epoch": 0.4113601328115736, + "grad_norm": 1.708564518054293, + "learning_rate": 4.930756594197292e-05, + "loss": 0.9952, + "step": 3469 + }, + { + "epoch": 0.41147871457369856, + "grad_norm": 1.5289381108194489, + "learning_rate": 4.930700480447113e-05, + "loss": 0.5792, + "step": 3470 + }, + { + "epoch": 0.41159729633582354, + "grad_norm": 1.439925454497166, + "learning_rate": 4.930644344288856e-05, + "loss": 0.5569, + "step": 3471 + }, + { + "epoch": 0.4117158780979485, + "grad_norm": 1.617093750744133, + "learning_rate": 4.930588185723039e-05, + "loss": 0.9871, + "step": 3472 + }, + { + "epoch": 0.4118344598600735, + "grad_norm": 1.4453161687567218, + "learning_rate": 4.930532004750179e-05, + "loss": 0.768, + "step": 3473 + }, + { + "epoch": 0.4119530416221985, + "grad_norm": 1.4527607790123462, + "learning_rate": 4.9304758013707955e-05, + "loss": 0.7126, + "step": 3474 + }, + { + "epoch": 0.4120716233843235, + "grad_norm": 1.5060731668936405, + "learning_rate": 4.9304195755854046e-05, + "loss": 0.7668, + "step": 3475 + }, + { + "epoch": 0.4121902051464485, + "grad_norm": 1.5718959237329786, + "learning_rate": 4.930363327394526e-05, + "loss": 0.7469, + "step": 3476 + }, + { + "epoch": 0.4123087869085735, + "grad_norm": 1.6925607504881741, + "learning_rate": 4.9303070567986776e-05, + "loss": 0.7178, + "step": 3477 + }, + { + "epoch": 0.41242736867069846, + "grad_norm": 1.6654667133548968, + "learning_rate": 4.930250763798378e-05, + "loss": 0.8337, + "step": 3478 + }, + { + "epoch": 0.41254595043282344, + "grad_norm": 1.6908092294152437, + "learning_rate": 4.9301944483941477e-05, + "loss": 0.8391, + "step": 3479 + }, + { + "epoch": 0.4126645321949484, + "grad_norm": 1.5207017324353465, + "learning_rate": 4.930138110586504e-05, + "loss": 0.7875, + "step": 3480 + }, + { + "epoch": 0.4127831139570734, + "grad_norm": 1.9390192450079304, + "learning_rate": 4.9300817503759666e-05, + "loss": 0.6412, + "step": 3481 + }, + { + "epoch": 0.4129016957191984, + "grad_norm": 1.5247446077824394, + "learning_rate": 4.930025367763056e-05, + "loss": 0.7827, + "step": 3482 + }, + { + "epoch": 0.41302027748132336, + "grad_norm": 1.4392877651827398, + "learning_rate": 4.929968962748291e-05, + "loss": 0.7235, + "step": 3483 + }, + { + "epoch": 0.41313885924344834, + "grad_norm": 1.7384104651675722, + "learning_rate": 4.929912535332192e-05, + "loss": 0.8048, + "step": 3484 + }, + { + "epoch": 0.4132574410055733, + "grad_norm": 1.2921828206814736, + "learning_rate": 4.9298560855152795e-05, + "loss": 0.467, + "step": 3485 + }, + { + "epoch": 0.4133760227676983, + "grad_norm": 1.5098218903705045, + "learning_rate": 4.9297996132980734e-05, + "loss": 0.5982, + "step": 3486 + }, + { + "epoch": 0.41349460452982334, + "grad_norm": 1.498529510436589, + "learning_rate": 4.9297431186810946e-05, + "loss": 0.6793, + "step": 3487 + }, + { + "epoch": 0.4136131862919483, + "grad_norm": 2.012622463968173, + "learning_rate": 4.9296866016648635e-05, + "loss": 1.0554, + "step": 3488 + }, + { + "epoch": 0.4137317680540733, + "grad_norm": 1.304370021550811, + "learning_rate": 4.929630062249901e-05, + "loss": 0.54, + "step": 3489 + }, + { + "epoch": 0.4138503498161983, + "grad_norm": 1.5910058209386557, + "learning_rate": 4.92957350043673e-05, + "loss": 0.7642, + "step": 3490 + }, + { + "epoch": 0.41396893157832326, + "grad_norm": 1.6771178219265093, + "learning_rate": 4.9295169162258706e-05, + "loss": 0.8248, + "step": 3491 + }, + { + "epoch": 0.41408751334044824, + "grad_norm": 1.7524390891311574, + "learning_rate": 4.929460309617844e-05, + "loss": 0.7506, + "step": 3492 + }, + { + "epoch": 0.4142060951025732, + "grad_norm": 1.3186552976242494, + "learning_rate": 4.9294036806131716e-05, + "loss": 0.4944, + "step": 3493 + }, + { + "epoch": 0.4143246768646982, + "grad_norm": 1.5676041178828677, + "learning_rate": 4.929347029212378e-05, + "loss": 0.6818, + "step": 3494 + }, + { + "epoch": 0.4144432586268232, + "grad_norm": 1.2629778387783006, + "learning_rate": 4.9292903554159825e-05, + "loss": 0.6744, + "step": 3495 + }, + { + "epoch": 0.41456184038894817, + "grad_norm": 1.5543199898423563, + "learning_rate": 4.929233659224509e-05, + "loss": 0.9598, + "step": 3496 + }, + { + "epoch": 0.41468042215107315, + "grad_norm": 1.392466191742914, + "learning_rate": 4.9291769406384815e-05, + "loss": 0.5856, + "step": 3497 + }, + { + "epoch": 0.4147990039131981, + "grad_norm": 1.4195353735619303, + "learning_rate": 4.92912019965842e-05, + "loss": 0.6725, + "step": 3498 + }, + { + "epoch": 0.4149175856753231, + "grad_norm": 1.7339981168627936, + "learning_rate": 4.9290634362848496e-05, + "loss": 0.8707, + "step": 3499 + }, + { + "epoch": 0.41503616743744814, + "grad_norm": 1.6001785860816717, + "learning_rate": 4.929006650518293e-05, + "loss": 0.8114, + "step": 3500 + }, + { + "epoch": 0.4151547491995731, + "grad_norm": 1.386350899960089, + "learning_rate": 4.928949842359274e-05, + "loss": 0.8131, + "step": 3501 + }, + { + "epoch": 0.4152733309616981, + "grad_norm": 1.8402233478375718, + "learning_rate": 4.928893011808316e-05, + "loss": 0.7719, + "step": 3502 + }, + { + "epoch": 0.4153919127238231, + "grad_norm": 1.5626656738683897, + "learning_rate": 4.9288361588659425e-05, + "loss": 0.961, + "step": 3503 + }, + { + "epoch": 0.41551049448594807, + "grad_norm": 1.9040769285908825, + "learning_rate": 4.928779283532679e-05, + "loss": 1.0175, + "step": 3504 + }, + { + "epoch": 0.41562907624807305, + "grad_norm": 1.5862661063074501, + "learning_rate": 4.9287223858090475e-05, + "loss": 0.6777, + "step": 3505 + }, + { + "epoch": 0.41574765801019803, + "grad_norm": 1.4479039744577782, + "learning_rate": 4.928665465695574e-05, + "loss": 0.5448, + "step": 3506 + }, + { + "epoch": 0.415866239772323, + "grad_norm": 1.8070495012197811, + "learning_rate": 4.928608523192784e-05, + "loss": 0.859, + "step": 3507 + }, + { + "epoch": 0.415984821534448, + "grad_norm": 1.576782667520276, + "learning_rate": 4.9285515583012014e-05, + "loss": 0.7788, + "step": 3508 + }, + { + "epoch": 0.41610340329657297, + "grad_norm": 1.564038324448268, + "learning_rate": 4.9284945710213514e-05, + "loss": 0.72, + "step": 3509 + }, + { + "epoch": 0.41622198505869795, + "grad_norm": 1.637643711315041, + "learning_rate": 4.928437561353759e-05, + "loss": 0.6804, + "step": 3510 + }, + { + "epoch": 0.41634056682082293, + "grad_norm": 1.393310972684991, + "learning_rate": 4.9283805292989514e-05, + "loss": 0.6632, + "step": 3511 + }, + { + "epoch": 0.41645914858294797, + "grad_norm": 1.3798318232090196, + "learning_rate": 4.9283234748574524e-05, + "loss": 0.6344, + "step": 3512 + }, + { + "epoch": 0.41657773034507295, + "grad_norm": 1.7109262826729672, + "learning_rate": 4.928266398029789e-05, + "loss": 0.8602, + "step": 3513 + }, + { + "epoch": 0.41669631210719793, + "grad_norm": 1.5900011759965962, + "learning_rate": 4.928209298816487e-05, + "loss": 0.7648, + "step": 3514 + }, + { + "epoch": 0.4168148938693229, + "grad_norm": 1.854129126487447, + "learning_rate": 4.928152177218073e-05, + "loss": 0.9259, + "step": 3515 + }, + { + "epoch": 0.4169334756314479, + "grad_norm": 1.5047202853638824, + "learning_rate": 4.9280950332350725e-05, + "loss": 0.7767, + "step": 3516 + }, + { + "epoch": 0.4170520573935729, + "grad_norm": 1.8199421049242315, + "learning_rate": 4.928037866868015e-05, + "loss": 0.9127, + "step": 3517 + }, + { + "epoch": 0.41717063915569785, + "grad_norm": 1.4314315752243256, + "learning_rate": 4.927980678117426e-05, + "loss": 0.6471, + "step": 3518 + }, + { + "epoch": 0.41728922091782283, + "grad_norm": 1.3300663970491629, + "learning_rate": 4.9279234669838304e-05, + "loss": 0.8425, + "step": 3519 + }, + { + "epoch": 0.4174078026799478, + "grad_norm": 1.4923891540841085, + "learning_rate": 4.9278662334677594e-05, + "loss": 0.6898, + "step": 3520 + }, + { + "epoch": 0.4175263844420728, + "grad_norm": 1.5828728769972298, + "learning_rate": 4.9278089775697386e-05, + "loss": 0.8851, + "step": 3521 + }, + { + "epoch": 0.4176449662041978, + "grad_norm": 1.5270001649144636, + "learning_rate": 4.927751699290296e-05, + "loss": 0.5907, + "step": 3522 + }, + { + "epoch": 0.41776354796632276, + "grad_norm": 1.43211114771798, + "learning_rate": 4.92769439862996e-05, + "loss": 0.5983, + "step": 3523 + }, + { + "epoch": 0.41788212972844774, + "grad_norm": 1.7378783966698976, + "learning_rate": 4.927637075589259e-05, + "loss": 0.8189, + "step": 3524 + }, + { + "epoch": 0.4180007114905728, + "grad_norm": 1.8966172742010539, + "learning_rate": 4.92757973016872e-05, + "loss": 0.9898, + "step": 3525 + }, + { + "epoch": 0.41811929325269775, + "grad_norm": 1.3002902120509119, + "learning_rate": 4.927522362368874e-05, + "loss": 0.6857, + "step": 3526 + }, + { + "epoch": 0.41823787501482274, + "grad_norm": 1.4700536867257123, + "learning_rate": 4.9274649721902476e-05, + "loss": 0.814, + "step": 3527 + }, + { + "epoch": 0.4183564567769477, + "grad_norm": 1.6486968124252013, + "learning_rate": 4.927407559633373e-05, + "loss": 0.657, + "step": 3528 + }, + { + "epoch": 0.4184750385390727, + "grad_norm": 1.784845725906869, + "learning_rate": 4.927350124698775e-05, + "loss": 0.7814, + "step": 3529 + }, + { + "epoch": 0.4185936203011977, + "grad_norm": 1.3335203233829882, + "learning_rate": 4.927292667386986e-05, + "loss": 0.606, + "step": 3530 + }, + { + "epoch": 0.41871220206332266, + "grad_norm": 1.2989491997378126, + "learning_rate": 4.927235187698536e-05, + "loss": 0.6775, + "step": 3531 + }, + { + "epoch": 0.41883078382544764, + "grad_norm": 1.4535845474685962, + "learning_rate": 4.9271776856339535e-05, + "loss": 0.7103, + "step": 3532 + }, + { + "epoch": 0.4189493655875726, + "grad_norm": 1.5546267145940014, + "learning_rate": 4.927120161193769e-05, + "loss": 0.7025, + "step": 3533 + }, + { + "epoch": 0.4190679473496976, + "grad_norm": 1.6460273879388636, + "learning_rate": 4.927062614378514e-05, + "loss": 0.8167, + "step": 3534 + }, + { + "epoch": 0.4191865291118226, + "grad_norm": 1.3792996867155898, + "learning_rate": 4.927005045188716e-05, + "loss": 0.555, + "step": 3535 + }, + { + "epoch": 0.41930511087394756, + "grad_norm": 1.432479930635415, + "learning_rate": 4.9269474536249094e-05, + "loss": 0.5637, + "step": 3536 + }, + { + "epoch": 0.4194236926360726, + "grad_norm": 1.3652992542784572, + "learning_rate": 4.926889839687623e-05, + "loss": 0.6221, + "step": 3537 + }, + { + "epoch": 0.4195422743981976, + "grad_norm": 1.3187452075054733, + "learning_rate": 4.926832203377388e-05, + "loss": 0.6293, + "step": 3538 + }, + { + "epoch": 0.41966085616032256, + "grad_norm": 1.6330179847261659, + "learning_rate": 4.9267745446947356e-05, + "loss": 0.6778, + "step": 3539 + }, + { + "epoch": 0.41977943792244754, + "grad_norm": 1.975727614539572, + "learning_rate": 4.926716863640198e-05, + "loss": 1.0026, + "step": 3540 + }, + { + "epoch": 0.4198980196845725, + "grad_norm": 1.958121796048007, + "learning_rate": 4.926659160214307e-05, + "loss": 0.7335, + "step": 3541 + }, + { + "epoch": 0.4200166014466975, + "grad_norm": 1.6498080152457262, + "learning_rate": 4.926601434417595e-05, + "loss": 0.6311, + "step": 3542 + }, + { + "epoch": 0.4201351832088225, + "grad_norm": 1.7130506452521574, + "learning_rate": 4.926543686250593e-05, + "loss": 0.8736, + "step": 3543 + }, + { + "epoch": 0.42025376497094746, + "grad_norm": 1.5842018150112882, + "learning_rate": 4.9264859157138345e-05, + "loss": 0.8008, + "step": 3544 + }, + { + "epoch": 0.42037234673307244, + "grad_norm": 1.8150917980430235, + "learning_rate": 4.92642812280785e-05, + "loss": 0.9648, + "step": 3545 + }, + { + "epoch": 0.4204909284951974, + "grad_norm": 1.6833113126739736, + "learning_rate": 4.9263703075331745e-05, + "loss": 0.726, + "step": 3546 + }, + { + "epoch": 0.4206095102573224, + "grad_norm": 1.6405329373035606, + "learning_rate": 4.9263124698903404e-05, + "loss": 0.6916, + "step": 3547 + }, + { + "epoch": 0.4207280920194474, + "grad_norm": 1.423317558747031, + "learning_rate": 4.92625460987988e-05, + "loss": 0.7874, + "step": 3548 + }, + { + "epoch": 0.42084667378157237, + "grad_norm": 1.9312037684789953, + "learning_rate": 4.926196727502328e-05, + "loss": 1.0744, + "step": 3549 + }, + { + "epoch": 0.4209652555436974, + "grad_norm": 1.5607958927225387, + "learning_rate": 4.9261388227582164e-05, + "loss": 0.7294, + "step": 3550 + }, + { + "epoch": 0.4210838373058224, + "grad_norm": 1.5326288238551535, + "learning_rate": 4.926080895648081e-05, + "loss": 0.5466, + "step": 3551 + }, + { + "epoch": 0.42120241906794736, + "grad_norm": 1.4614499762077864, + "learning_rate": 4.926022946172454e-05, + "loss": 0.5313, + "step": 3552 + }, + { + "epoch": 0.42132100083007235, + "grad_norm": 1.2185440339609745, + "learning_rate": 4.9259649743318715e-05, + "loss": 0.6272, + "step": 3553 + }, + { + "epoch": 0.4214395825921973, + "grad_norm": 1.23296350371015, + "learning_rate": 4.925906980126866e-05, + "loss": 0.6447, + "step": 3554 + }, + { + "epoch": 0.4215581643543223, + "grad_norm": 1.5527661833930966, + "learning_rate": 4.925848963557973e-05, + "loss": 0.5731, + "step": 3555 + }, + { + "epoch": 0.4216767461164473, + "grad_norm": 1.416556079216272, + "learning_rate": 4.925790924625727e-05, + "loss": 0.7796, + "step": 3556 + }, + { + "epoch": 0.42179532787857227, + "grad_norm": 1.4456527337256488, + "learning_rate": 4.925732863330664e-05, + "loss": 0.6367, + "step": 3557 + }, + { + "epoch": 0.42191390964069725, + "grad_norm": 1.5393838260331123, + "learning_rate": 4.925674779673319e-05, + "loss": 0.4778, + "step": 3558 + }, + { + "epoch": 0.42203249140282223, + "grad_norm": 1.1712302774523844, + "learning_rate": 4.925616673654226e-05, + "loss": 0.4387, + "step": 3559 + }, + { + "epoch": 0.4221510731649472, + "grad_norm": 1.7376767364009615, + "learning_rate": 4.925558545273923e-05, + "loss": 0.8326, + "step": 3560 + }, + { + "epoch": 0.4222696549270722, + "grad_norm": 1.781427047547457, + "learning_rate": 4.925500394532944e-05, + "loss": 0.785, + "step": 3561 + }, + { + "epoch": 0.4223882366891972, + "grad_norm": 2.062542179565209, + "learning_rate": 4.925442221431825e-05, + "loss": 0.7379, + "step": 3562 + }, + { + "epoch": 0.4225068184513222, + "grad_norm": 1.69686844221876, + "learning_rate": 4.9253840259711044e-05, + "loss": 0.7686, + "step": 3563 + }, + { + "epoch": 0.4226254002134472, + "grad_norm": 1.6955283678200248, + "learning_rate": 4.925325808151316e-05, + "loss": 0.7466, + "step": 3564 + }, + { + "epoch": 0.42274398197557217, + "grad_norm": 1.846277668698648, + "learning_rate": 4.925267567972999e-05, + "loss": 0.816, + "step": 3565 + }, + { + "epoch": 0.42286256373769715, + "grad_norm": 1.8550378681862272, + "learning_rate": 4.925209305436688e-05, + "loss": 0.7164, + "step": 3566 + }, + { + "epoch": 0.42298114549982213, + "grad_norm": 1.826908253972868, + "learning_rate": 4.925151020542922e-05, + "loss": 0.916, + "step": 3567 + }, + { + "epoch": 0.4230997272619471, + "grad_norm": 1.3874577073278027, + "learning_rate": 4.925092713292237e-05, + "loss": 0.6935, + "step": 3568 + }, + { + "epoch": 0.4232183090240721, + "grad_norm": 1.5609961314884795, + "learning_rate": 4.925034383685171e-05, + "loss": 0.7876, + "step": 3569 + }, + { + "epoch": 0.4233368907861971, + "grad_norm": 1.4713092804102157, + "learning_rate": 4.924976031722262e-05, + "loss": 0.6411, + "step": 3570 + }, + { + "epoch": 0.42345547254832205, + "grad_norm": 1.4704412599397514, + "learning_rate": 4.924917657404048e-05, + "loss": 0.7979, + "step": 3571 + }, + { + "epoch": 0.42357405431044703, + "grad_norm": 1.520337509263915, + "learning_rate": 4.924859260731066e-05, + "loss": 0.7182, + "step": 3572 + }, + { + "epoch": 0.423692636072572, + "grad_norm": 1.3856828313346083, + "learning_rate": 4.924800841703856e-05, + "loss": 0.6619, + "step": 3573 + }, + { + "epoch": 0.423811217834697, + "grad_norm": 1.4983785203297317, + "learning_rate": 4.9247424003229545e-05, + "loss": 0.6431, + "step": 3574 + }, + { + "epoch": 0.42392979959682203, + "grad_norm": 1.3017804331234106, + "learning_rate": 4.924683936588903e-05, + "loss": 0.6925, + "step": 3575 + }, + { + "epoch": 0.424048381358947, + "grad_norm": 1.2734043362734362, + "learning_rate": 4.9246254505022386e-05, + "loss": 0.6866, + "step": 3576 + }, + { + "epoch": 0.424166963121072, + "grad_norm": 1.6619665718376493, + "learning_rate": 4.9245669420635e-05, + "loss": 0.6716, + "step": 3577 + }, + { + "epoch": 0.424285544883197, + "grad_norm": 1.486131225865649, + "learning_rate": 4.9245084112732276e-05, + "loss": 0.4649, + "step": 3578 + }, + { + "epoch": 0.42440412664532196, + "grad_norm": 1.2932597737835567, + "learning_rate": 4.924449858131961e-05, + "loss": 0.5919, + "step": 3579 + }, + { + "epoch": 0.42452270840744694, + "grad_norm": 2.2149386210146718, + "learning_rate": 4.924391282640241e-05, + "loss": 0.9465, + "step": 3580 + }, + { + "epoch": 0.4246412901695719, + "grad_norm": 1.547804336265583, + "learning_rate": 4.9243326847986043e-05, + "loss": 0.7911, + "step": 3581 + }, + { + "epoch": 0.4247598719316969, + "grad_norm": 1.4684751791866157, + "learning_rate": 4.9242740646075944e-05, + "loss": 0.5629, + "step": 3582 + }, + { + "epoch": 0.4248784536938219, + "grad_norm": 2.00338335890547, + "learning_rate": 4.9242154220677496e-05, + "loss": 0.8294, + "step": 3583 + }, + { + "epoch": 0.42499703545594686, + "grad_norm": 1.4541086795040787, + "learning_rate": 4.924156757179612e-05, + "loss": 0.785, + "step": 3584 + }, + { + "epoch": 0.42511561721807184, + "grad_norm": 1.377526745544194, + "learning_rate": 4.924098069943722e-05, + "loss": 0.6608, + "step": 3585 + }, + { + "epoch": 0.4252341989801968, + "grad_norm": 1.5842606218308786, + "learning_rate": 4.92403936036062e-05, + "loss": 0.7119, + "step": 3586 + }, + { + "epoch": 0.42535278074232186, + "grad_norm": 1.4559660956518086, + "learning_rate": 4.923980628430848e-05, + "loss": 0.5201, + "step": 3587 + }, + { + "epoch": 0.42547136250444684, + "grad_norm": 2.0140504689614858, + "learning_rate": 4.9239218741549465e-05, + "loss": 1.0032, + "step": 3588 + }, + { + "epoch": 0.4255899442665718, + "grad_norm": 1.6600662389753644, + "learning_rate": 4.923863097533458e-05, + "loss": 0.7328, + "step": 3589 + }, + { + "epoch": 0.4257085260286968, + "grad_norm": 1.7216566567609004, + "learning_rate": 4.923804298566924e-05, + "loss": 0.776, + "step": 3590 + }, + { + "epoch": 0.4258271077908218, + "grad_norm": 1.6586163931785904, + "learning_rate": 4.9237454772558875e-05, + "loss": 0.768, + "step": 3591 + }, + { + "epoch": 0.42594568955294676, + "grad_norm": 1.4797600851048218, + "learning_rate": 4.923686633600889e-05, + "loss": 0.6951, + "step": 3592 + }, + { + "epoch": 0.42606427131507174, + "grad_norm": 1.9780805386359785, + "learning_rate": 4.923627767602472e-05, + "loss": 0.8154, + "step": 3593 + }, + { + "epoch": 0.4261828530771967, + "grad_norm": 1.746785944833219, + "learning_rate": 4.923568879261179e-05, + "loss": 0.9425, + "step": 3594 + }, + { + "epoch": 0.4263014348393217, + "grad_norm": 1.8735693465200096, + "learning_rate": 4.923509968577553e-05, + "loss": 0.7171, + "step": 3595 + }, + { + "epoch": 0.4264200166014467, + "grad_norm": 1.3773955300560434, + "learning_rate": 4.923451035552137e-05, + "loss": 0.7619, + "step": 3596 + }, + { + "epoch": 0.42653859836357166, + "grad_norm": 1.6528130108882608, + "learning_rate": 4.9233920801854735e-05, + "loss": 0.7374, + "step": 3597 + }, + { + "epoch": 0.42665718012569664, + "grad_norm": 1.5627615246274467, + "learning_rate": 4.923333102478108e-05, + "loss": 0.6958, + "step": 3598 + }, + { + "epoch": 0.4267757618878217, + "grad_norm": 1.1670083686722073, + "learning_rate": 4.9232741024305827e-05, + "loss": 0.5657, + "step": 3599 + }, + { + "epoch": 0.42689434364994666, + "grad_norm": 1.2898307555228037, + "learning_rate": 4.9232150800434415e-05, + "loss": 0.5877, + "step": 3600 + }, + { + "epoch": 0.42701292541207164, + "grad_norm": 1.6677942892979996, + "learning_rate": 4.9231560353172293e-05, + "loss": 1.029, + "step": 3601 + }, + { + "epoch": 0.4271315071741966, + "grad_norm": 1.324527837267684, + "learning_rate": 4.923096968252489e-05, + "loss": 0.8308, + "step": 3602 + }, + { + "epoch": 0.4272500889363216, + "grad_norm": 1.5354695954732414, + "learning_rate": 4.923037878849767e-05, + "loss": 0.7351, + "step": 3603 + }, + { + "epoch": 0.4273686706984466, + "grad_norm": 1.2628997696640223, + "learning_rate": 4.9229787671096064e-05, + "loss": 0.5528, + "step": 3604 + }, + { + "epoch": 0.42748725246057157, + "grad_norm": 1.475423898243588, + "learning_rate": 4.922919633032553e-05, + "loss": 0.6277, + "step": 3605 + }, + { + "epoch": 0.42760583422269655, + "grad_norm": 1.5262853356718367, + "learning_rate": 4.922860476619152e-05, + "loss": 0.6486, + "step": 3606 + }, + { + "epoch": 0.4277244159848215, + "grad_norm": 1.7899732784824407, + "learning_rate": 4.9228012978699485e-05, + "loss": 0.753, + "step": 3607 + }, + { + "epoch": 0.4278429977469465, + "grad_norm": 1.152657026452765, + "learning_rate": 4.9227420967854876e-05, + "loss": 0.4617, + "step": 3608 + }, + { + "epoch": 0.4279615795090715, + "grad_norm": 1.5508543915785118, + "learning_rate": 4.922682873366316e-05, + "loss": 0.6744, + "step": 3609 + }, + { + "epoch": 0.42808016127119647, + "grad_norm": 2.041771537754529, + "learning_rate": 4.922623627612979e-05, + "loss": 0.8721, + "step": 3610 + }, + { + "epoch": 0.42819874303332145, + "grad_norm": 1.5526423343742715, + "learning_rate": 4.9225643595260226e-05, + "loss": 0.6584, + "step": 3611 + }, + { + "epoch": 0.4283173247954465, + "grad_norm": 1.429012045737173, + "learning_rate": 4.922505069105995e-05, + "loss": 0.6408, + "step": 3612 + }, + { + "epoch": 0.42843590655757147, + "grad_norm": 1.6461667999253002, + "learning_rate": 4.92244575635344e-05, + "loss": 0.7386, + "step": 3613 + }, + { + "epoch": 0.42855448831969645, + "grad_norm": 1.6958129230644987, + "learning_rate": 4.922386421268906e-05, + "loss": 0.7408, + "step": 3614 + }, + { + "epoch": 0.42867307008182143, + "grad_norm": 1.3678971934138342, + "learning_rate": 4.9223270638529395e-05, + "loss": 0.6246, + "step": 3615 + }, + { + "epoch": 0.4287916518439464, + "grad_norm": 2.2514740346975484, + "learning_rate": 4.9222676841060884e-05, + "loss": 0.9973, + "step": 3616 + }, + { + "epoch": 0.4289102336060714, + "grad_norm": 1.7456557993212967, + "learning_rate": 4.9222082820288995e-05, + "loss": 0.6772, + "step": 3617 + }, + { + "epoch": 0.42902881536819637, + "grad_norm": 1.5644205018426343, + "learning_rate": 4.9221488576219204e-05, + "loss": 0.7423, + "step": 3618 + }, + { + "epoch": 0.42914739713032135, + "grad_norm": 1.5231342306745959, + "learning_rate": 4.9220894108856993e-05, + "loss": 0.7717, + "step": 3619 + }, + { + "epoch": 0.42926597889244633, + "grad_norm": 1.5828451146834837, + "learning_rate": 4.922029941820784e-05, + "loss": 0.9748, + "step": 3620 + }, + { + "epoch": 0.4293845606545713, + "grad_norm": 1.57544429361408, + "learning_rate": 4.921970450427722e-05, + "loss": 0.8111, + "step": 3621 + }, + { + "epoch": 0.4295031424166963, + "grad_norm": 1.3832598400448304, + "learning_rate": 4.9219109367070634e-05, + "loss": 0.8215, + "step": 3622 + }, + { + "epoch": 0.4296217241788213, + "grad_norm": 1.4107701692129464, + "learning_rate": 4.921851400659355e-05, + "loss": 0.8369, + "step": 3623 + }, + { + "epoch": 0.4297403059409463, + "grad_norm": 1.5164034619063047, + "learning_rate": 4.921791842285147e-05, + "loss": 0.7799, + "step": 3624 + }, + { + "epoch": 0.4298588877030713, + "grad_norm": 1.326331700257152, + "learning_rate": 4.921732261584989e-05, + "loss": 0.5584, + "step": 3625 + }, + { + "epoch": 0.42997746946519627, + "grad_norm": 1.7467407500074026, + "learning_rate": 4.921672658559428e-05, + "loss": 1.0535, + "step": 3626 + }, + { + "epoch": 0.43009605122732125, + "grad_norm": 1.472094091833865, + "learning_rate": 4.921613033209015e-05, + "loss": 0.6261, + "step": 3627 + }, + { + "epoch": 0.43021463298944623, + "grad_norm": 1.5939412527840942, + "learning_rate": 4.9215533855343e-05, + "loss": 0.827, + "step": 3628 + }, + { + "epoch": 0.4303332147515712, + "grad_norm": 1.4491678625367623, + "learning_rate": 4.9214937155358314e-05, + "loss": 0.7466, + "step": 3629 + }, + { + "epoch": 0.4304517965136962, + "grad_norm": 1.7112870242884934, + "learning_rate": 4.921434023214161e-05, + "loss": 0.9231, + "step": 3630 + }, + { + "epoch": 0.4305703782758212, + "grad_norm": 1.486700237721347, + "learning_rate": 4.921374308569837e-05, + "loss": 0.7307, + "step": 3631 + }, + { + "epoch": 0.43068896003794616, + "grad_norm": 1.494024944603728, + "learning_rate": 4.9213145716034126e-05, + "loss": 0.5931, + "step": 3632 + }, + { + "epoch": 0.43080754180007114, + "grad_norm": 1.400027914651356, + "learning_rate": 4.921254812315437e-05, + "loss": 0.6403, + "step": 3633 + }, + { + "epoch": 0.4309261235621961, + "grad_norm": 1.9638210870232098, + "learning_rate": 4.92119503070646e-05, + "loss": 0.7431, + "step": 3634 + }, + { + "epoch": 0.4310447053243211, + "grad_norm": 1.7555278779462034, + "learning_rate": 4.921135226777035e-05, + "loss": 0.743, + "step": 3635 + }, + { + "epoch": 0.4311632870864461, + "grad_norm": 1.6976217148129749, + "learning_rate": 4.921075400527712e-05, + "loss": 0.714, + "step": 3636 + }, + { + "epoch": 0.4312818688485711, + "grad_norm": 1.5782072039517103, + "learning_rate": 4.9210155519590415e-05, + "loss": 0.8035, + "step": 3637 + }, + { + "epoch": 0.4314004506106961, + "grad_norm": 1.4685744763962134, + "learning_rate": 4.9209556810715776e-05, + "loss": 0.7166, + "step": 3638 + }, + { + "epoch": 0.4315190323728211, + "grad_norm": 1.4634655088235458, + "learning_rate": 4.920895787865871e-05, + "loss": 0.8275, + "step": 3639 + }, + { + "epoch": 0.43163761413494606, + "grad_norm": 1.3656673551094536, + "learning_rate": 4.920835872342474e-05, + "loss": 0.5801, + "step": 3640 + }, + { + "epoch": 0.43175619589707104, + "grad_norm": 1.0789543302385225, + "learning_rate": 4.920775934501939e-05, + "loss": 0.4636, + "step": 3641 + }, + { + "epoch": 0.431874777659196, + "grad_norm": 1.691919714558414, + "learning_rate": 4.920715974344817e-05, + "loss": 0.7791, + "step": 3642 + }, + { + "epoch": 0.431993359421321, + "grad_norm": 1.7764887615934934, + "learning_rate": 4.9206559918716646e-05, + "loss": 0.8471, + "step": 3643 + }, + { + "epoch": 0.432111941183446, + "grad_norm": 1.6564892670750262, + "learning_rate": 4.92059598708303e-05, + "loss": 0.6381, + "step": 3644 + }, + { + "epoch": 0.43223052294557096, + "grad_norm": 1.4607827108712397, + "learning_rate": 4.9205359599794705e-05, + "loss": 0.7051, + "step": 3645 + }, + { + "epoch": 0.43234910470769594, + "grad_norm": 1.462463283883136, + "learning_rate": 4.9204759105615374e-05, + "loss": 0.6712, + "step": 3646 + }, + { + "epoch": 0.4324676864698209, + "grad_norm": 1.3440757466446647, + "learning_rate": 4.920415838829784e-05, + "loss": 0.8212, + "step": 3647 + }, + { + "epoch": 0.4325862682319459, + "grad_norm": 1.5180572758932502, + "learning_rate": 4.920355744784765e-05, + "loss": 0.6539, + "step": 3648 + }, + { + "epoch": 0.43270484999407094, + "grad_norm": 1.6734326087894855, + "learning_rate": 4.920295628427034e-05, + "loss": 0.707, + "step": 3649 + }, + { + "epoch": 0.4328234317561959, + "grad_norm": 1.371012472460331, + "learning_rate": 4.920235489757145e-05, + "loss": 0.6526, + "step": 3650 + }, + { + "epoch": 0.4329420135183209, + "grad_norm": 1.5815516194415, + "learning_rate": 4.9201753287756525e-05, + "loss": 0.7194, + "step": 3651 + }, + { + "epoch": 0.4330605952804459, + "grad_norm": 1.4451949171689138, + "learning_rate": 4.920115145483112e-05, + "loss": 0.6913, + "step": 3652 + }, + { + "epoch": 0.43317917704257086, + "grad_norm": 1.8469045899301022, + "learning_rate": 4.9200549398800776e-05, + "loss": 0.8504, + "step": 3653 + }, + { + "epoch": 0.43329775880469584, + "grad_norm": 1.2330769137244062, + "learning_rate": 4.919994711967104e-05, + "loss": 0.4834, + "step": 3654 + }, + { + "epoch": 0.4334163405668208, + "grad_norm": 1.6734947812265601, + "learning_rate": 4.919934461744746e-05, + "loss": 0.8655, + "step": 3655 + }, + { + "epoch": 0.4335349223289458, + "grad_norm": 1.7832224715093994, + "learning_rate": 4.9198741892135615e-05, + "loss": 0.6894, + "step": 3656 + }, + { + "epoch": 0.4336535040910708, + "grad_norm": 1.5120723118565969, + "learning_rate": 4.919813894374103e-05, + "loss": 0.7809, + "step": 3657 + }, + { + "epoch": 0.43377208585319577, + "grad_norm": 1.4009153144787523, + "learning_rate": 4.919753577226928e-05, + "loss": 0.7459, + "step": 3658 + }, + { + "epoch": 0.43389066761532075, + "grad_norm": 1.758782589767061, + "learning_rate": 4.919693237772593e-05, + "loss": 0.8602, + "step": 3659 + }, + { + "epoch": 0.4340092493774457, + "grad_norm": 1.8162248823592884, + "learning_rate": 4.919632876011653e-05, + "loss": 0.851, + "step": 3660 + }, + { + "epoch": 0.4341278311395707, + "grad_norm": 1.5342316444953457, + "learning_rate": 4.9195724919446654e-05, + "loss": 0.8674, + "step": 3661 + }, + { + "epoch": 0.43424641290169574, + "grad_norm": 1.71097521152454, + "learning_rate": 4.919512085572186e-05, + "loss": 0.8472, + "step": 3662 + }, + { + "epoch": 0.4343649946638207, + "grad_norm": 1.2693315980607447, + "learning_rate": 4.919451656894772e-05, + "loss": 0.6663, + "step": 3663 + }, + { + "epoch": 0.4344835764259457, + "grad_norm": 1.500321649820223, + "learning_rate": 4.9193912059129814e-05, + "loss": 0.7773, + "step": 3664 + }, + { + "epoch": 0.4346021581880707, + "grad_norm": 1.4185710089312993, + "learning_rate": 4.9193307326273704e-05, + "loss": 0.5322, + "step": 3665 + }, + { + "epoch": 0.43472073995019567, + "grad_norm": 1.3746099979327262, + "learning_rate": 4.919270237038497e-05, + "loss": 0.702, + "step": 3666 + }, + { + "epoch": 0.43483932171232065, + "grad_norm": 1.3613530508839398, + "learning_rate": 4.919209719146919e-05, + "loss": 0.8026, + "step": 3667 + }, + { + "epoch": 0.43495790347444563, + "grad_norm": 1.4506416775954576, + "learning_rate": 4.9191491789531926e-05, + "loss": 0.6914, + "step": 3668 + }, + { + "epoch": 0.4350764852365706, + "grad_norm": 1.3045897269564, + "learning_rate": 4.9190886164578785e-05, + "loss": 0.5917, + "step": 3669 + }, + { + "epoch": 0.4351950669986956, + "grad_norm": 1.2444689054122857, + "learning_rate": 4.9190280316615347e-05, + "loss": 0.601, + "step": 3670 + }, + { + "epoch": 0.43531364876082057, + "grad_norm": 1.4338860185182447, + "learning_rate": 4.9189674245647174e-05, + "loss": 0.6681, + "step": 3671 + }, + { + "epoch": 0.43543223052294555, + "grad_norm": 1.8262187051994045, + "learning_rate": 4.9189067951679866e-05, + "loss": 0.8211, + "step": 3672 + }, + { + "epoch": 0.43555081228507053, + "grad_norm": 1.5012674199011737, + "learning_rate": 4.9188461434719025e-05, + "loss": 0.5294, + "step": 3673 + }, + { + "epoch": 0.43566939404719557, + "grad_norm": 1.8051003770663558, + "learning_rate": 4.918785469477022e-05, + "loss": 0.8733, + "step": 3674 + }, + { + "epoch": 0.43578797580932055, + "grad_norm": 1.7767769820440422, + "learning_rate": 4.918724773183907e-05, + "loss": 0.7308, + "step": 3675 + }, + { + "epoch": 0.43590655757144553, + "grad_norm": 1.659089147148862, + "learning_rate": 4.918664054593114e-05, + "loss": 0.5209, + "step": 3676 + }, + { + "epoch": 0.4360251393335705, + "grad_norm": 1.4663770956953726, + "learning_rate": 4.9186033137052055e-05, + "loss": 0.593, + "step": 3677 + }, + { + "epoch": 0.4361437210956955, + "grad_norm": 1.656563482684807, + "learning_rate": 4.918542550520741e-05, + "loss": 0.8125, + "step": 3678 + }, + { + "epoch": 0.43626230285782047, + "grad_norm": 2.719523158507605, + "learning_rate": 4.9184817650402784e-05, + "loss": 0.7172, + "step": 3679 + }, + { + "epoch": 0.43638088461994545, + "grad_norm": 1.5811272923199091, + "learning_rate": 4.91842095726438e-05, + "loss": 0.6138, + "step": 3680 + }, + { + "epoch": 0.43649946638207043, + "grad_norm": 1.9327206016566538, + "learning_rate": 4.918360127193606e-05, + "loss": 1.0, + "step": 3681 + }, + { + "epoch": 0.4366180481441954, + "grad_norm": 1.8397618233140558, + "learning_rate": 4.918299274828519e-05, + "loss": 0.8983, + "step": 3682 + }, + { + "epoch": 0.4367366299063204, + "grad_norm": 1.4124432541445955, + "learning_rate": 4.918238400169676e-05, + "loss": 0.568, + "step": 3683 + }, + { + "epoch": 0.4368552116684454, + "grad_norm": 1.327914249272055, + "learning_rate": 4.918177503217641e-05, + "loss": 0.4976, + "step": 3684 + }, + { + "epoch": 0.43697379343057036, + "grad_norm": 1.7923800931396248, + "learning_rate": 4.918116583972975e-05, + "loss": 0.6769, + "step": 3685 + }, + { + "epoch": 0.43709237519269534, + "grad_norm": 1.3925756345635616, + "learning_rate": 4.9180556424362396e-05, + "loss": 0.6912, + "step": 3686 + }, + { + "epoch": 0.4372109569548204, + "grad_norm": 1.4603745488788336, + "learning_rate": 4.917994678607996e-05, + "loss": 0.855, + "step": 3687 + }, + { + "epoch": 0.43732953871694535, + "grad_norm": 1.8095696079747157, + "learning_rate": 4.917933692488807e-05, + "loss": 0.7382, + "step": 3688 + }, + { + "epoch": 0.43744812047907033, + "grad_norm": 1.849922042895717, + "learning_rate": 4.917872684079234e-05, + "loss": 0.9788, + "step": 3689 + }, + { + "epoch": 0.4375667022411953, + "grad_norm": 1.7149869859652864, + "learning_rate": 4.91781165337984e-05, + "loss": 0.7345, + "step": 3690 + }, + { + "epoch": 0.4376852840033203, + "grad_norm": 1.4496827083648698, + "learning_rate": 4.917750600391188e-05, + "loss": 0.583, + "step": 3691 + }, + { + "epoch": 0.4378038657654453, + "grad_norm": 1.5589365568101325, + "learning_rate": 4.917689525113839e-05, + "loss": 0.753, + "step": 3692 + }, + { + "epoch": 0.43792244752757026, + "grad_norm": 1.4597175602431152, + "learning_rate": 4.917628427548358e-05, + "loss": 0.6889, + "step": 3693 + }, + { + "epoch": 0.43804102928969524, + "grad_norm": 1.823583848891648, + "learning_rate": 4.9175673076953076e-05, + "loss": 0.8768, + "step": 3694 + }, + { + "epoch": 0.4381596110518202, + "grad_norm": 1.7123028027872849, + "learning_rate": 4.917506165555252e-05, + "loss": 0.6102, + "step": 3695 + }, + { + "epoch": 0.4382781928139452, + "grad_norm": 1.4893953471738448, + "learning_rate": 4.917445001128753e-05, + "loss": 0.6914, + "step": 3696 + }, + { + "epoch": 0.4383967745760702, + "grad_norm": 1.4332809003314921, + "learning_rate": 4.9173838144163765e-05, + "loss": 0.7672, + "step": 3697 + }, + { + "epoch": 0.43851535633819516, + "grad_norm": 1.8586618413710596, + "learning_rate": 4.917322605418685e-05, + "loss": 0.9991, + "step": 3698 + }, + { + "epoch": 0.4386339381003202, + "grad_norm": 1.440583183728854, + "learning_rate": 4.9172613741362436e-05, + "loss": 0.7482, + "step": 3699 + }, + { + "epoch": 0.4387525198624452, + "grad_norm": 1.5492880173245118, + "learning_rate": 4.917200120569616e-05, + "loss": 0.9971, + "step": 3700 + }, + { + "epoch": 0.43887110162457016, + "grad_norm": 1.3793141066878345, + "learning_rate": 4.917138844719368e-05, + "loss": 0.6898, + "step": 3701 + }, + { + "epoch": 0.43898968338669514, + "grad_norm": 1.6170643774535185, + "learning_rate": 4.917077546586064e-05, + "loss": 0.9576, + "step": 3702 + }, + { + "epoch": 0.4391082651488201, + "grad_norm": 1.3901001031585087, + "learning_rate": 4.917016226170269e-05, + "loss": 0.5633, + "step": 3703 + }, + { + "epoch": 0.4392268469109451, + "grad_norm": 1.3449762708164168, + "learning_rate": 4.9169548834725486e-05, + "loss": 0.7096, + "step": 3704 + }, + { + "epoch": 0.4393454286730701, + "grad_norm": 1.2260697050694453, + "learning_rate": 4.9168935184934674e-05, + "loss": 0.5112, + "step": 3705 + }, + { + "epoch": 0.43946401043519506, + "grad_norm": 1.5825576765995726, + "learning_rate": 4.916832131233592e-05, + "loss": 0.7319, + "step": 3706 + }, + { + "epoch": 0.43958259219732004, + "grad_norm": 1.5680591056232225, + "learning_rate": 4.916770721693488e-05, + "loss": 0.7233, + "step": 3707 + }, + { + "epoch": 0.439701173959445, + "grad_norm": 1.4229518217731505, + "learning_rate": 4.9167092898737224e-05, + "loss": 0.5274, + "step": 3708 + }, + { + "epoch": 0.43981975572157, + "grad_norm": 1.5300068456784883, + "learning_rate": 4.91664783577486e-05, + "loss": 0.7531, + "step": 3709 + }, + { + "epoch": 0.439938337483695, + "grad_norm": 1.5310814384588105, + "learning_rate": 4.916586359397468e-05, + "loss": 0.6816, + "step": 3710 + }, + { + "epoch": 0.44005691924581997, + "grad_norm": 2.1172809827624763, + "learning_rate": 4.9165248607421136e-05, + "loss": 1.1004, + "step": 3711 + }, + { + "epoch": 0.440175501007945, + "grad_norm": 1.5464155690483254, + "learning_rate": 4.9164633398093624e-05, + "loss": 0.702, + "step": 3712 + }, + { + "epoch": 0.44029408277007, + "grad_norm": 1.4943619429465729, + "learning_rate": 4.9164017965997836e-05, + "loss": 0.8011, + "step": 3713 + }, + { + "epoch": 0.44041266453219496, + "grad_norm": 1.6845957672465786, + "learning_rate": 4.916340231113942e-05, + "loss": 0.7088, + "step": 3714 + }, + { + "epoch": 0.44053124629431994, + "grad_norm": 2.0549469251025894, + "learning_rate": 4.916278643352408e-05, + "loss": 0.9282, + "step": 3715 + }, + { + "epoch": 0.4406498280564449, + "grad_norm": 1.5611075286804772, + "learning_rate": 4.916217033315747e-05, + "loss": 0.7549, + "step": 3716 + }, + { + "epoch": 0.4407684098185699, + "grad_norm": 1.4213731132756906, + "learning_rate": 4.9161554010045283e-05, + "loss": 0.4818, + "step": 3717 + }, + { + "epoch": 0.4408869915806949, + "grad_norm": 1.6234472103234088, + "learning_rate": 4.9160937464193205e-05, + "loss": 0.9881, + "step": 3718 + }, + { + "epoch": 0.44100557334281987, + "grad_norm": 1.5957563866931437, + "learning_rate": 4.91603206956069e-05, + "loss": 0.6846, + "step": 3719 + }, + { + "epoch": 0.44112415510494485, + "grad_norm": 1.4734756467575763, + "learning_rate": 4.9159703704292064e-05, + "loss": 0.6638, + "step": 3720 + }, + { + "epoch": 0.44124273686706983, + "grad_norm": 1.8775096623885879, + "learning_rate": 4.915908649025439e-05, + "loss": 0.9088, + "step": 3721 + }, + { + "epoch": 0.4413613186291948, + "grad_norm": 1.4129643210765253, + "learning_rate": 4.9158469053499565e-05, + "loss": 0.55, + "step": 3722 + }, + { + "epoch": 0.4414799003913198, + "grad_norm": 1.4467042404042083, + "learning_rate": 4.915785139403328e-05, + "loss": 0.5976, + "step": 3723 + }, + { + "epoch": 0.4415984821534448, + "grad_norm": 1.8067560227426842, + "learning_rate": 4.915723351186123e-05, + "loss": 0.8322, + "step": 3724 + }, + { + "epoch": 0.4417170639155698, + "grad_norm": 1.501098409605572, + "learning_rate": 4.9156615406989117e-05, + "loss": 0.6459, + "step": 3725 + }, + { + "epoch": 0.4418356456776948, + "grad_norm": 1.6027400691424911, + "learning_rate": 4.915599707942263e-05, + "loss": 0.5494, + "step": 3726 + }, + { + "epoch": 0.44195422743981977, + "grad_norm": 1.5644844153572834, + "learning_rate": 4.915537852916747e-05, + "loss": 0.9371, + "step": 3727 + }, + { + "epoch": 0.44207280920194475, + "grad_norm": 1.8197042138895774, + "learning_rate": 4.915475975622934e-05, + "loss": 0.7445, + "step": 3728 + }, + { + "epoch": 0.44219139096406973, + "grad_norm": 1.4482409800832963, + "learning_rate": 4.9154140760613944e-05, + "loss": 0.6838, + "step": 3729 + }, + { + "epoch": 0.4423099727261947, + "grad_norm": 1.9586948516918967, + "learning_rate": 4.9153521542326994e-05, + "loss": 0.8686, + "step": 3730 + }, + { + "epoch": 0.4424285544883197, + "grad_norm": 1.8051914067186734, + "learning_rate": 4.9152902101374184e-05, + "loss": 0.7008, + "step": 3731 + }, + { + "epoch": 0.4425471362504447, + "grad_norm": 1.4752092220789619, + "learning_rate": 4.915228243776125e-05, + "loss": 0.5936, + "step": 3732 + }, + { + "epoch": 0.44266571801256965, + "grad_norm": 1.4247274664317182, + "learning_rate": 4.9151662551493884e-05, + "loss": 0.5571, + "step": 3733 + }, + { + "epoch": 0.44278429977469463, + "grad_norm": 2.1067707284390162, + "learning_rate": 4.91510424425778e-05, + "loss": 0.9149, + "step": 3734 + }, + { + "epoch": 0.4429028815368196, + "grad_norm": 1.5361355697085894, + "learning_rate": 4.915042211101872e-05, + "loss": 0.7869, + "step": 3735 + }, + { + "epoch": 0.4430214632989446, + "grad_norm": 1.3418294877099208, + "learning_rate": 4.9149801556822374e-05, + "loss": 0.5111, + "step": 3736 + }, + { + "epoch": 0.44314004506106963, + "grad_norm": 1.5931091827568034, + "learning_rate": 4.914918077999447e-05, + "loss": 0.8624, + "step": 3737 + }, + { + "epoch": 0.4432586268231946, + "grad_norm": 1.7827612958946482, + "learning_rate": 4.9148559780540726e-05, + "loss": 0.9248, + "step": 3738 + }, + { + "epoch": 0.4433772085853196, + "grad_norm": 1.4053596753527213, + "learning_rate": 4.9147938558466876e-05, + "loss": 0.7461, + "step": 3739 + }, + { + "epoch": 0.4434957903474446, + "grad_norm": 1.4329184616364687, + "learning_rate": 4.9147317113778645e-05, + "loss": 0.5927, + "step": 3740 + }, + { + "epoch": 0.44361437210956955, + "grad_norm": 1.5630833810454374, + "learning_rate": 4.914669544648177e-05, + "loss": 0.5502, + "step": 3741 + }, + { + "epoch": 0.44373295387169454, + "grad_norm": 1.56400966014751, + "learning_rate": 4.914607355658196e-05, + "loss": 0.6651, + "step": 3742 + }, + { + "epoch": 0.4438515356338195, + "grad_norm": 1.7616046534451268, + "learning_rate": 4.9145451444084966e-05, + "loss": 0.6851, + "step": 3743 + }, + { + "epoch": 0.4439701173959445, + "grad_norm": 1.5878650119468072, + "learning_rate": 4.914482910899653e-05, + "loss": 0.6144, + "step": 3744 + }, + { + "epoch": 0.4440886991580695, + "grad_norm": 1.3360251556162615, + "learning_rate": 4.914420655132236e-05, + "loss": 0.587, + "step": 3745 + }, + { + "epoch": 0.44420728092019446, + "grad_norm": 1.3908556014931897, + "learning_rate": 4.914358377106822e-05, + "loss": 0.6415, + "step": 3746 + }, + { + "epoch": 0.44432586268231944, + "grad_norm": 1.431619376695129, + "learning_rate": 4.914296076823985e-05, + "loss": 0.5723, + "step": 3747 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 1.556430548594164, + "learning_rate": 4.9142337542842985e-05, + "loss": 0.7124, + "step": 3748 + }, + { + "epoch": 0.44456302620656946, + "grad_norm": 1.7201589634273486, + "learning_rate": 4.914171409488337e-05, + "loss": 0.7343, + "step": 3749 + }, + { + "epoch": 0.44468160796869444, + "grad_norm": 1.8242733860975722, + "learning_rate": 4.914109042436676e-05, + "loss": 0.8909, + "step": 3750 + }, + { + "epoch": 0.4448001897308194, + "grad_norm": 1.4112996987407562, + "learning_rate": 4.9140466531298904e-05, + "loss": 0.576, + "step": 3751 + }, + { + "epoch": 0.4449187714929444, + "grad_norm": 1.630391042923391, + "learning_rate": 4.913984241568554e-05, + "loss": 0.84, + "step": 3752 + }, + { + "epoch": 0.4450373532550694, + "grad_norm": 1.6409502129124651, + "learning_rate": 4.913921807753244e-05, + "loss": 0.7816, + "step": 3753 + }, + { + "epoch": 0.44515593501719436, + "grad_norm": 1.414269556362842, + "learning_rate": 4.913859351684534e-05, + "loss": 0.5965, + "step": 3754 + }, + { + "epoch": 0.44527451677931934, + "grad_norm": 1.6129567739004687, + "learning_rate": 4.913796873363001e-05, + "loss": 0.6209, + "step": 3755 + }, + { + "epoch": 0.4453930985414443, + "grad_norm": 1.8511535227983973, + "learning_rate": 4.913734372789221e-05, + "loss": 0.9404, + "step": 3756 + }, + { + "epoch": 0.4455116803035693, + "grad_norm": 1.707954855749132, + "learning_rate": 4.91367184996377e-05, + "loss": 0.6642, + "step": 3757 + }, + { + "epoch": 0.4456302620656943, + "grad_norm": 1.818370375033754, + "learning_rate": 4.913609304887225e-05, + "loss": 0.6689, + "step": 3758 + }, + { + "epoch": 0.44574884382781926, + "grad_norm": 1.6860975368572186, + "learning_rate": 4.9135467375601616e-05, + "loss": 0.7352, + "step": 3759 + }, + { + "epoch": 0.44586742558994424, + "grad_norm": 2.0096452690905955, + "learning_rate": 4.9134841479831576e-05, + "loss": 0.9705, + "step": 3760 + }, + { + "epoch": 0.4459860073520692, + "grad_norm": 1.535999674825247, + "learning_rate": 4.913421536156788e-05, + "loss": 0.6785, + "step": 3761 + }, + { + "epoch": 0.44610458911419426, + "grad_norm": 1.2198690861992927, + "learning_rate": 4.913358902081632e-05, + "loss": 0.3961, + "step": 3762 + }, + { + "epoch": 0.44622317087631924, + "grad_norm": 1.7567315605999472, + "learning_rate": 4.913296245758266e-05, + "loss": 0.7412, + "step": 3763 + }, + { + "epoch": 0.4463417526384442, + "grad_norm": 1.6277791287368613, + "learning_rate": 4.9132335671872686e-05, + "loss": 0.7336, + "step": 3764 + }, + { + "epoch": 0.4464603344005692, + "grad_norm": 1.4532483072853397, + "learning_rate": 4.913170866369217e-05, + "loss": 0.6846, + "step": 3765 + }, + { + "epoch": 0.4465789161626942, + "grad_norm": 1.4885014287867977, + "learning_rate": 4.913108143304689e-05, + "loss": 0.5426, + "step": 3766 + }, + { + "epoch": 0.44669749792481916, + "grad_norm": 1.5773922728543264, + "learning_rate": 4.913045397994263e-05, + "loss": 1.0181, + "step": 3767 + }, + { + "epoch": 0.44681607968694415, + "grad_norm": 1.3610474505671297, + "learning_rate": 4.912982630438518e-05, + "loss": 0.6499, + "step": 3768 + }, + { + "epoch": 0.4469346614490691, + "grad_norm": 1.651622434406524, + "learning_rate": 4.912919840638032e-05, + "loss": 0.8597, + "step": 3769 + }, + { + "epoch": 0.4470532432111941, + "grad_norm": 1.0520033149010617, + "learning_rate": 4.912857028593384e-05, + "loss": 0.3987, + "step": 3770 + }, + { + "epoch": 0.4471718249733191, + "grad_norm": 1.6425076495795228, + "learning_rate": 4.9127941943051525e-05, + "loss": 0.883, + "step": 3771 + }, + { + "epoch": 0.44729040673544407, + "grad_norm": 1.2731972881379685, + "learning_rate": 4.9127313377739176e-05, + "loss": 0.589, + "step": 3772 + }, + { + "epoch": 0.44740898849756905, + "grad_norm": 1.9231656365556318, + "learning_rate": 4.912668459000258e-05, + "loss": 0.985, + "step": 3773 + }, + { + "epoch": 0.4475275702596941, + "grad_norm": 1.2825504668294465, + "learning_rate": 4.9126055579847545e-05, + "loss": 0.7065, + "step": 3774 + }, + { + "epoch": 0.44764615202181907, + "grad_norm": 1.719422836285591, + "learning_rate": 4.912542634727986e-05, + "loss": 0.722, + "step": 3775 + }, + { + "epoch": 0.44776473378394405, + "grad_norm": 1.4388996069150808, + "learning_rate": 4.912479689230533e-05, + "loss": 0.628, + "step": 3776 + }, + { + "epoch": 0.447883315546069, + "grad_norm": 1.4902129687725527, + "learning_rate": 4.9124167214929755e-05, + "loss": 0.7974, + "step": 3777 + }, + { + "epoch": 0.448001897308194, + "grad_norm": 1.4918084741995443, + "learning_rate": 4.912353731515894e-05, + "loss": 0.6942, + "step": 3778 + }, + { + "epoch": 0.448120479070319, + "grad_norm": 1.729523899728431, + "learning_rate": 4.91229071929987e-05, + "loss": 0.7166, + "step": 3779 + }, + { + "epoch": 0.44823906083244397, + "grad_norm": 1.623715462843089, + "learning_rate": 4.912227684845483e-05, + "loss": 0.6594, + "step": 3780 + }, + { + "epoch": 0.44835764259456895, + "grad_norm": 1.7478286897809354, + "learning_rate": 4.912164628153315e-05, + "loss": 0.9286, + "step": 3781 + }, + { + "epoch": 0.44847622435669393, + "grad_norm": 1.4604256847957344, + "learning_rate": 4.9121015492239476e-05, + "loss": 0.4092, + "step": 3782 + }, + { + "epoch": 0.4485948061188189, + "grad_norm": 1.3089477128924802, + "learning_rate": 4.912038448057961e-05, + "loss": 0.673, + "step": 3783 + }, + { + "epoch": 0.4487133878809439, + "grad_norm": 1.5961967442156921, + "learning_rate": 4.911975324655939e-05, + "loss": 0.7159, + "step": 3784 + }, + { + "epoch": 0.4488319696430689, + "grad_norm": 1.3301222494624887, + "learning_rate": 4.911912179018461e-05, + "loss": 0.7359, + "step": 3785 + }, + { + "epoch": 0.44895055140519385, + "grad_norm": 1.8420222578934395, + "learning_rate": 4.91184901114611e-05, + "loss": 1.0092, + "step": 3786 + }, + { + "epoch": 0.4490691331673189, + "grad_norm": 1.378966185998779, + "learning_rate": 4.9117858210394695e-05, + "loss": 0.7268, + "step": 3787 + }, + { + "epoch": 0.44918771492944387, + "grad_norm": 1.7787160039407819, + "learning_rate": 4.911722608699122e-05, + "loss": 0.7847, + "step": 3788 + }, + { + "epoch": 0.44930629669156885, + "grad_norm": 1.3109885218541657, + "learning_rate": 4.911659374125649e-05, + "loss": 0.5242, + "step": 3789 + }, + { + "epoch": 0.44942487845369383, + "grad_norm": 1.6146944022912781, + "learning_rate": 4.911596117319633e-05, + "loss": 0.7144, + "step": 3790 + }, + { + "epoch": 0.4495434602158188, + "grad_norm": 1.607551373462184, + "learning_rate": 4.911532838281659e-05, + "loss": 0.7032, + "step": 3791 + }, + { + "epoch": 0.4496620419779438, + "grad_norm": 1.5228137846995211, + "learning_rate": 4.911469537012309e-05, + "loss": 0.8923, + "step": 3792 + }, + { + "epoch": 0.4497806237400688, + "grad_norm": 1.3007008424278421, + "learning_rate": 4.911406213512167e-05, + "loss": 0.6418, + "step": 3793 + }, + { + "epoch": 0.44989920550219376, + "grad_norm": 1.2963399765415107, + "learning_rate": 4.911342867781817e-05, + "loss": 0.6311, + "step": 3794 + }, + { + "epoch": 0.45001778726431874, + "grad_norm": 1.7411019417764901, + "learning_rate": 4.9112794998218434e-05, + "loss": 0.6924, + "step": 3795 + }, + { + "epoch": 0.4501363690264437, + "grad_norm": 1.783734064891071, + "learning_rate": 4.9112161096328294e-05, + "loss": 0.7682, + "step": 3796 + }, + { + "epoch": 0.4502549507885687, + "grad_norm": 1.293956454833335, + "learning_rate": 4.911152697215359e-05, + "loss": 0.5522, + "step": 3797 + }, + { + "epoch": 0.4503735325506937, + "grad_norm": 1.535777114104377, + "learning_rate": 4.911089262570018e-05, + "loss": 0.5626, + "step": 3798 + }, + { + "epoch": 0.4504921143128187, + "grad_norm": 1.2685280933203757, + "learning_rate": 4.9110258056973904e-05, + "loss": 0.5296, + "step": 3799 + }, + { + "epoch": 0.4506106960749437, + "grad_norm": 1.2798759735181464, + "learning_rate": 4.910962326598062e-05, + "loss": 0.4881, + "step": 3800 + }, + { + "epoch": 0.4507292778370687, + "grad_norm": 1.7206375722837797, + "learning_rate": 4.9108988252726165e-05, + "loss": 0.7156, + "step": 3801 + }, + { + "epoch": 0.45084785959919366, + "grad_norm": 1.7938982466354982, + "learning_rate": 4.9108353017216416e-05, + "loss": 0.8591, + "step": 3802 + }, + { + "epoch": 0.45096644136131864, + "grad_norm": 2.0662464710521116, + "learning_rate": 4.91077175594572e-05, + "loss": 0.8492, + "step": 3803 + }, + { + "epoch": 0.4510850231234436, + "grad_norm": 1.2959847820625885, + "learning_rate": 4.9107081879454405e-05, + "loss": 0.5016, + "step": 3804 + }, + { + "epoch": 0.4512036048855686, + "grad_norm": 1.843461402589745, + "learning_rate": 4.9106445977213866e-05, + "loss": 0.7092, + "step": 3805 + }, + { + "epoch": 0.4513221866476936, + "grad_norm": 1.6160542140543694, + "learning_rate": 4.9105809852741466e-05, + "loss": 0.6072, + "step": 3806 + }, + { + "epoch": 0.45144076840981856, + "grad_norm": 1.562064584699593, + "learning_rate": 4.9105173506043054e-05, + "loss": 0.5655, + "step": 3807 + }, + { + "epoch": 0.45155935017194354, + "grad_norm": 1.7022415719479005, + "learning_rate": 4.91045369371245e-05, + "loss": 0.8045, + "step": 3808 + }, + { + "epoch": 0.4516779319340685, + "grad_norm": 1.4270914321392234, + "learning_rate": 4.910390014599168e-05, + "loss": 0.5626, + "step": 3809 + }, + { + "epoch": 0.4517965136961935, + "grad_norm": 1.313578023957952, + "learning_rate": 4.9103263132650455e-05, + "loss": 0.659, + "step": 3810 + }, + { + "epoch": 0.4519150954583185, + "grad_norm": 1.3195594561473414, + "learning_rate": 4.9102625897106704e-05, + "loss": 0.3815, + "step": 3811 + }, + { + "epoch": 0.4520336772204435, + "grad_norm": 1.6990982758180448, + "learning_rate": 4.9101988439366295e-05, + "loss": 0.6752, + "step": 3812 + }, + { + "epoch": 0.4521522589825685, + "grad_norm": 1.4737468802858278, + "learning_rate": 4.910135075943512e-05, + "loss": 0.7983, + "step": 3813 + }, + { + "epoch": 0.4522708407446935, + "grad_norm": 1.4934705597449243, + "learning_rate": 4.910071285731903e-05, + "loss": 0.833, + "step": 3814 + }, + { + "epoch": 0.45238942250681846, + "grad_norm": 1.760011942289493, + "learning_rate": 4.910007473302393e-05, + "loss": 0.7476, + "step": 3815 + }, + { + "epoch": 0.45250800426894344, + "grad_norm": 1.4224654529365122, + "learning_rate": 4.9099436386555694e-05, + "loss": 0.6646, + "step": 3816 + }, + { + "epoch": 0.4526265860310684, + "grad_norm": 1.2455874492846077, + "learning_rate": 4.9098797817920205e-05, + "loss": 0.5943, + "step": 3817 + }, + { + "epoch": 0.4527451677931934, + "grad_norm": 1.508674142573095, + "learning_rate": 4.9098159027123355e-05, + "loss": 0.7141, + "step": 3818 + }, + { + "epoch": 0.4528637495553184, + "grad_norm": 1.4666608965950758, + "learning_rate": 4.909752001417103e-05, + "loss": 0.6025, + "step": 3819 + }, + { + "epoch": 0.45298233131744337, + "grad_norm": 1.469878899838565, + "learning_rate": 4.9096880779069124e-05, + "loss": 0.6373, + "step": 3820 + }, + { + "epoch": 0.45310091307956835, + "grad_norm": 1.4411637406241682, + "learning_rate": 4.909624132182352e-05, + "loss": 0.7271, + "step": 3821 + }, + { + "epoch": 0.4532194948416933, + "grad_norm": 1.4021438941792483, + "learning_rate": 4.9095601642440115e-05, + "loss": 0.6757, + "step": 3822 + }, + { + "epoch": 0.4533380766038183, + "grad_norm": 1.8623158577827397, + "learning_rate": 4.9094961740924815e-05, + "loss": 0.6938, + "step": 3823 + }, + { + "epoch": 0.45345665836594334, + "grad_norm": 1.3933268720226462, + "learning_rate": 4.909432161728352e-05, + "loss": 0.6903, + "step": 3824 + }, + { + "epoch": 0.4535752401280683, + "grad_norm": 1.6096553276358954, + "learning_rate": 4.909368127152213e-05, + "loss": 0.6633, + "step": 3825 + }, + { + "epoch": 0.4536938218901933, + "grad_norm": 1.7989893279415112, + "learning_rate": 4.9093040703646534e-05, + "loss": 0.8226, + "step": 3826 + }, + { + "epoch": 0.4538124036523183, + "grad_norm": 1.5464451450628212, + "learning_rate": 4.9092399913662654e-05, + "loss": 0.5812, + "step": 3827 + }, + { + "epoch": 0.45393098541444327, + "grad_norm": 1.617298821479472, + "learning_rate": 4.909175890157638e-05, + "loss": 0.6264, + "step": 3828 + }, + { + "epoch": 0.45404956717656825, + "grad_norm": 1.6430659180240612, + "learning_rate": 4.909111766739365e-05, + "loss": 0.8252, + "step": 3829 + }, + { + "epoch": 0.45416814893869323, + "grad_norm": 1.4731128455602542, + "learning_rate": 4.909047621112035e-05, + "loss": 0.7514, + "step": 3830 + }, + { + "epoch": 0.4542867307008182, + "grad_norm": 1.7273395973357997, + "learning_rate": 4.908983453276239e-05, + "loss": 0.6379, + "step": 3831 + }, + { + "epoch": 0.4544053124629432, + "grad_norm": 1.467912585798283, + "learning_rate": 4.908919263232571e-05, + "loss": 0.5499, + "step": 3832 + }, + { + "epoch": 0.45452389422506817, + "grad_norm": 1.7255056888954539, + "learning_rate": 4.908855050981621e-05, + "loss": 0.8297, + "step": 3833 + }, + { + "epoch": 0.45464247598719315, + "grad_norm": 1.6881986314049335, + "learning_rate": 4.9087908165239814e-05, + "loss": 0.7899, + "step": 3834 + }, + { + "epoch": 0.45476105774931813, + "grad_norm": 1.5771814855496442, + "learning_rate": 4.908726559860244e-05, + "loss": 0.8075, + "step": 3835 + }, + { + "epoch": 0.4548796395114431, + "grad_norm": 1.373764834619931, + "learning_rate": 4.908662280991002e-05, + "loss": 0.5749, + "step": 3836 + }, + { + "epoch": 0.45499822127356815, + "grad_norm": 1.474062270759114, + "learning_rate": 4.908597979916847e-05, + "loss": 0.5688, + "step": 3837 + }, + { + "epoch": 0.45511680303569313, + "grad_norm": 2.1548988058717793, + "learning_rate": 4.908533656638372e-05, + "loss": 1.0108, + "step": 3838 + }, + { + "epoch": 0.4552353847978181, + "grad_norm": 1.402959183715603, + "learning_rate": 4.908469311156171e-05, + "loss": 0.8226, + "step": 3839 + }, + { + "epoch": 0.4553539665599431, + "grad_norm": 1.2477160800837894, + "learning_rate": 4.908404943470836e-05, + "loss": 0.5615, + "step": 3840 + }, + { + "epoch": 0.45547254832206807, + "grad_norm": 1.3527597071957347, + "learning_rate": 4.9083405535829606e-05, + "loss": 0.7796, + "step": 3841 + }, + { + "epoch": 0.45559113008419305, + "grad_norm": 1.7408160138616826, + "learning_rate": 4.908276141493139e-05, + "loss": 0.9229, + "step": 3842 + }, + { + "epoch": 0.45570971184631803, + "grad_norm": 1.784155513595179, + "learning_rate": 4.908211707201965e-05, + "loss": 0.8841, + "step": 3843 + }, + { + "epoch": 0.455828293608443, + "grad_norm": 1.7403243643152146, + "learning_rate": 4.908147250710032e-05, + "loss": 0.7747, + "step": 3844 + }, + { + "epoch": 0.455946875370568, + "grad_norm": 1.3585062648161286, + "learning_rate": 4.9080827720179335e-05, + "loss": 0.6185, + "step": 3845 + }, + { + "epoch": 0.456065457132693, + "grad_norm": 1.3027210885049758, + "learning_rate": 4.9080182711262655e-05, + "loss": 0.5389, + "step": 3846 + }, + { + "epoch": 0.45618403889481796, + "grad_norm": 1.5884605082388368, + "learning_rate": 4.907953748035622e-05, + "loss": 0.8242, + "step": 3847 + }, + { + "epoch": 0.45630262065694294, + "grad_norm": 1.4418144374708464, + "learning_rate": 4.907889202746598e-05, + "loss": 0.8891, + "step": 3848 + }, + { + "epoch": 0.456421202419068, + "grad_norm": 1.2886277141789806, + "learning_rate": 4.907824635259788e-05, + "loss": 0.7018, + "step": 3849 + }, + { + "epoch": 0.45653978418119295, + "grad_norm": 1.6661091468174534, + "learning_rate": 4.9077600455757874e-05, + "loss": 0.8326, + "step": 3850 + }, + { + "epoch": 0.45665836594331793, + "grad_norm": 1.099852964184064, + "learning_rate": 4.907695433695192e-05, + "loss": 0.482, + "step": 3851 + }, + { + "epoch": 0.4567769477054429, + "grad_norm": 1.5194879750727894, + "learning_rate": 4.9076307996185965e-05, + "loss": 0.8902, + "step": 3852 + }, + { + "epoch": 0.4568955294675679, + "grad_norm": 1.3209386539182897, + "learning_rate": 4.907566143346598e-05, + "loss": 0.5603, + "step": 3853 + }, + { + "epoch": 0.4570141112296929, + "grad_norm": 1.2800404838749628, + "learning_rate": 4.907501464879792e-05, + "loss": 0.5532, + "step": 3854 + }, + { + "epoch": 0.45713269299181786, + "grad_norm": 1.7332696694044751, + "learning_rate": 4.9074367642187755e-05, + "loss": 0.8841, + "step": 3855 + }, + { + "epoch": 0.45725127475394284, + "grad_norm": 1.8382919453597066, + "learning_rate": 4.907372041364143e-05, + "loss": 0.8905, + "step": 3856 + }, + { + "epoch": 0.4573698565160678, + "grad_norm": 1.6235708588779727, + "learning_rate": 4.907307296316493e-05, + "loss": 0.8045, + "step": 3857 + }, + { + "epoch": 0.4574884382781928, + "grad_norm": 1.8703965672329288, + "learning_rate": 4.9072425290764215e-05, + "loss": 0.7944, + "step": 3858 + }, + { + "epoch": 0.4576070200403178, + "grad_norm": 1.4852276869980678, + "learning_rate": 4.9071777396445264e-05, + "loss": 0.6896, + "step": 3859 + }, + { + "epoch": 0.45772560180244276, + "grad_norm": 1.4395871313724196, + "learning_rate": 4.907112928021404e-05, + "loss": 0.564, + "step": 3860 + }, + { + "epoch": 0.4578441835645678, + "grad_norm": 1.4192767665836947, + "learning_rate": 4.9070480942076525e-05, + "loss": 0.6113, + "step": 3861 + }, + { + "epoch": 0.4579627653266928, + "grad_norm": 1.5545815853504679, + "learning_rate": 4.906983238203869e-05, + "loss": 0.6205, + "step": 3862 + }, + { + "epoch": 0.45808134708881776, + "grad_norm": 1.6435934277835695, + "learning_rate": 4.9069183600106514e-05, + "loss": 0.9305, + "step": 3863 + }, + { + "epoch": 0.45819992885094274, + "grad_norm": 1.2115130005462942, + "learning_rate": 4.9068534596285986e-05, + "loss": 0.3547, + "step": 3864 + }, + { + "epoch": 0.4583185106130677, + "grad_norm": 1.7952156415594516, + "learning_rate": 4.906788537058308e-05, + "loss": 0.6765, + "step": 3865 + }, + { + "epoch": 0.4584370923751927, + "grad_norm": 1.6301437198739848, + "learning_rate": 4.9067235923003785e-05, + "loss": 0.5921, + "step": 3866 + }, + { + "epoch": 0.4585556741373177, + "grad_norm": 1.478657463941508, + "learning_rate": 4.906658625355409e-05, + "loss": 0.4462, + "step": 3867 + }, + { + "epoch": 0.45867425589944266, + "grad_norm": 1.6473083149437346, + "learning_rate": 4.906593636223998e-05, + "loss": 0.6595, + "step": 3868 + }, + { + "epoch": 0.45879283766156764, + "grad_norm": 1.8653975638249014, + "learning_rate": 4.906528624906746e-05, + "loss": 0.9367, + "step": 3869 + }, + { + "epoch": 0.4589114194236926, + "grad_norm": 1.831296404398805, + "learning_rate": 4.9064635914042496e-05, + "loss": 0.8648, + "step": 3870 + }, + { + "epoch": 0.4590300011858176, + "grad_norm": 1.74845795477681, + "learning_rate": 4.90639853571711e-05, + "loss": 0.647, + "step": 3871 + }, + { + "epoch": 0.4591485829479426, + "grad_norm": 1.5669897049683417, + "learning_rate": 4.906333457845927e-05, + "loss": 0.7053, + "step": 3872 + }, + { + "epoch": 0.45926716471006757, + "grad_norm": 1.7906588997586064, + "learning_rate": 4.9062683577913016e-05, + "loss": 0.9526, + "step": 3873 + }, + { + "epoch": 0.4593857464721926, + "grad_norm": 1.531394444625228, + "learning_rate": 4.906203235553831e-05, + "loss": 1.0061, + "step": 3874 + }, + { + "epoch": 0.4595043282343176, + "grad_norm": 1.3768643370734988, + "learning_rate": 4.906138091134118e-05, + "loss": 0.5589, + "step": 3875 + }, + { + "epoch": 0.45962290999644256, + "grad_norm": 1.4626533863096403, + "learning_rate": 4.906072924532763e-05, + "loss": 0.7018, + "step": 3876 + }, + { + "epoch": 0.45974149175856754, + "grad_norm": 1.4904213663766408, + "learning_rate": 4.906007735750365e-05, + "loss": 0.851, + "step": 3877 + }, + { + "epoch": 0.4598600735206925, + "grad_norm": 1.537247342365679, + "learning_rate": 4.905942524787527e-05, + "loss": 0.6715, + "step": 3878 + }, + { + "epoch": 0.4599786552828175, + "grad_norm": 1.5440679214942663, + "learning_rate": 4.905877291644849e-05, + "loss": 0.5307, + "step": 3879 + }, + { + "epoch": 0.4600972370449425, + "grad_norm": 1.5552460485875799, + "learning_rate": 4.905812036322933e-05, + "loss": 0.8204, + "step": 3880 + }, + { + "epoch": 0.46021581880706747, + "grad_norm": 1.526570790249855, + "learning_rate": 4.905746758822379e-05, + "loss": 0.65, + "step": 3881 + }, + { + "epoch": 0.46033440056919245, + "grad_norm": 1.7120137627091885, + "learning_rate": 4.9056814591437915e-05, + "loss": 0.7747, + "step": 3882 + }, + { + "epoch": 0.46045298233131743, + "grad_norm": 1.171639455548391, + "learning_rate": 4.90561613728777e-05, + "loss": 0.5756, + "step": 3883 + }, + { + "epoch": 0.4605715640934424, + "grad_norm": 1.3705261990298039, + "learning_rate": 4.905550793254918e-05, + "loss": 0.6278, + "step": 3884 + }, + { + "epoch": 0.4606901458555674, + "grad_norm": 1.4925580393884696, + "learning_rate": 4.905485427045838e-05, + "loss": 0.8162, + "step": 3885 + }, + { + "epoch": 0.4608087276176924, + "grad_norm": 1.7788135537556984, + "learning_rate": 4.905420038661132e-05, + "loss": 0.8564, + "step": 3886 + }, + { + "epoch": 0.4609273093798174, + "grad_norm": 1.6144537417747584, + "learning_rate": 4.9053546281014015e-05, + "loss": 0.7105, + "step": 3887 + }, + { + "epoch": 0.4610458911419424, + "grad_norm": 1.4439448607274257, + "learning_rate": 4.905289195367253e-05, + "loss": 0.6366, + "step": 3888 + }, + { + "epoch": 0.46116447290406737, + "grad_norm": 1.5405235151430978, + "learning_rate": 4.905223740459287e-05, + "loss": 0.6678, + "step": 3889 + }, + { + "epoch": 0.46128305466619235, + "grad_norm": 1.7674029583984499, + "learning_rate": 4.905158263378107e-05, + "loss": 0.6945, + "step": 3890 + }, + { + "epoch": 0.46140163642831733, + "grad_norm": 1.3702347820301166, + "learning_rate": 4.905092764124318e-05, + "loss": 0.4349, + "step": 3891 + }, + { + "epoch": 0.4615202181904423, + "grad_norm": 1.5827924155064585, + "learning_rate": 4.905027242698522e-05, + "loss": 0.6405, + "step": 3892 + }, + { + "epoch": 0.4616387999525673, + "grad_norm": 1.9073888376252277, + "learning_rate": 4.904961699101325e-05, + "loss": 0.6366, + "step": 3893 + }, + { + "epoch": 0.46175738171469227, + "grad_norm": 1.7506606794365636, + "learning_rate": 4.90489613333333e-05, + "loss": 0.9597, + "step": 3894 + }, + { + "epoch": 0.46187596347681725, + "grad_norm": 1.3437260359346264, + "learning_rate": 4.904830545395142e-05, + "loss": 0.5944, + "step": 3895 + }, + { + "epoch": 0.46199454523894223, + "grad_norm": 1.600913318675501, + "learning_rate": 4.904764935287365e-05, + "loss": 0.8692, + "step": 3896 + }, + { + "epoch": 0.4621131270010672, + "grad_norm": 1.7949099350667155, + "learning_rate": 4.9046993030106045e-05, + "loss": 0.7092, + "step": 3897 + }, + { + "epoch": 0.4622317087631922, + "grad_norm": 1.4142425472326396, + "learning_rate": 4.904633648565465e-05, + "loss": 0.5069, + "step": 3898 + }, + { + "epoch": 0.46235029052531723, + "grad_norm": 1.3599823223103784, + "learning_rate": 4.904567971952552e-05, + "loss": 0.4836, + "step": 3899 + }, + { + "epoch": 0.4624688722874422, + "grad_norm": 1.6288667276987718, + "learning_rate": 4.904502273172471e-05, + "loss": 0.9368, + "step": 3900 + }, + { + "epoch": 0.4625874540495672, + "grad_norm": 1.6049253877551126, + "learning_rate": 4.904436552225829e-05, + "loss": 0.8813, + "step": 3901 + }, + { + "epoch": 0.4627060358116922, + "grad_norm": 1.6354578853495267, + "learning_rate": 4.9043708091132276e-05, + "loss": 0.7031, + "step": 3902 + }, + { + "epoch": 0.46282461757381715, + "grad_norm": 1.6208141587150502, + "learning_rate": 4.904305043835278e-05, + "loss": 0.78, + "step": 3903 + }, + { + "epoch": 0.46294319933594213, + "grad_norm": 1.4385332523695364, + "learning_rate": 4.904239256392584e-05, + "loss": 0.55, + "step": 3904 + }, + { + "epoch": 0.4630617810980671, + "grad_norm": 1.3830612769863857, + "learning_rate": 4.904173446785751e-05, + "loss": 0.6586, + "step": 3905 + }, + { + "epoch": 0.4631803628601921, + "grad_norm": 1.5659664568763982, + "learning_rate": 4.9041076150153884e-05, + "loss": 0.7248, + "step": 3906 + }, + { + "epoch": 0.4632989446223171, + "grad_norm": 1.3295450119027863, + "learning_rate": 4.9040417610821024e-05, + "loss": 0.7001, + "step": 3907 + }, + { + "epoch": 0.46341752638444206, + "grad_norm": 1.4114730771787645, + "learning_rate": 4.9039758849864974e-05, + "loss": 0.6531, + "step": 3908 + }, + { + "epoch": 0.46353610814656704, + "grad_norm": 1.4185807252531328, + "learning_rate": 4.903909986729184e-05, + "loss": 0.609, + "step": 3909 + }, + { + "epoch": 0.463654689908692, + "grad_norm": 1.5495475489397543, + "learning_rate": 4.9038440663107675e-05, + "loss": 0.7076, + "step": 3910 + }, + { + "epoch": 0.46377327167081706, + "grad_norm": 1.5026385015757315, + "learning_rate": 4.903778123731857e-05, + "loss": 0.6378, + "step": 3911 + }, + { + "epoch": 0.46389185343294204, + "grad_norm": 1.9550129098826767, + "learning_rate": 4.90371215899306e-05, + "loss": 0.7823, + "step": 3912 + }, + { + "epoch": 0.464010435195067, + "grad_norm": 1.6753413200228116, + "learning_rate": 4.903646172094985e-05, + "loss": 0.7834, + "step": 3913 + }, + { + "epoch": 0.464129016957192, + "grad_norm": 1.637177941525017, + "learning_rate": 4.903580163038239e-05, + "loss": 0.6965, + "step": 3914 + }, + { + "epoch": 0.464247598719317, + "grad_norm": 1.735417630666785, + "learning_rate": 4.903514131823431e-05, + "loss": 0.8469, + "step": 3915 + }, + { + "epoch": 0.46436618048144196, + "grad_norm": 1.5843422197656436, + "learning_rate": 4.9034480784511716e-05, + "loss": 0.5814, + "step": 3916 + }, + { + "epoch": 0.46448476224356694, + "grad_norm": 1.430894230393008, + "learning_rate": 4.9033820029220664e-05, + "loss": 0.6954, + "step": 3917 + }, + { + "epoch": 0.4646033440056919, + "grad_norm": 1.4931532965248684, + "learning_rate": 4.903315905236728e-05, + "loss": 0.6591, + "step": 3918 + }, + { + "epoch": 0.4647219257678169, + "grad_norm": 1.3851523589471468, + "learning_rate": 4.903249785395763e-05, + "loss": 0.6661, + "step": 3919 + }, + { + "epoch": 0.4648405075299419, + "grad_norm": 1.4100291446072577, + "learning_rate": 4.903183643399783e-05, + "loss": 0.7618, + "step": 3920 + }, + { + "epoch": 0.46495908929206686, + "grad_norm": 1.476628845485094, + "learning_rate": 4.903117479249396e-05, + "loss": 0.7454, + "step": 3921 + }, + { + "epoch": 0.46507767105419184, + "grad_norm": 1.30179915827753, + "learning_rate": 4.903051292945213e-05, + "loss": 0.607, + "step": 3922 + }, + { + "epoch": 0.4651962528163168, + "grad_norm": 1.4579507027328646, + "learning_rate": 4.9029850844878434e-05, + "loss": 0.5942, + "step": 3923 + }, + { + "epoch": 0.46531483457844186, + "grad_norm": 1.2556975059942581, + "learning_rate": 4.9029188538778984e-05, + "loss": 0.5013, + "step": 3924 + }, + { + "epoch": 0.46543341634056684, + "grad_norm": 1.5365495039134514, + "learning_rate": 4.902852601115989e-05, + "loss": 0.737, + "step": 3925 + }, + { + "epoch": 0.4655519981026918, + "grad_norm": 1.4403506397512933, + "learning_rate": 4.902786326202724e-05, + "loss": 0.6221, + "step": 3926 + }, + { + "epoch": 0.4656705798648168, + "grad_norm": 1.4277837516883107, + "learning_rate": 4.902720029138716e-05, + "loss": 0.6272, + "step": 3927 + }, + { + "epoch": 0.4657891616269418, + "grad_norm": 1.3315725734733932, + "learning_rate": 4.902653709924576e-05, + "loss": 0.6386, + "step": 3928 + }, + { + "epoch": 0.46590774338906676, + "grad_norm": 1.40808178588933, + "learning_rate": 4.902587368560915e-05, + "loss": 0.5094, + "step": 3929 + }, + { + "epoch": 0.46602632515119174, + "grad_norm": 1.4438750135640057, + "learning_rate": 4.902521005048344e-05, + "loss": 0.4953, + "step": 3930 + }, + { + "epoch": 0.4661449069133167, + "grad_norm": 1.938640573701615, + "learning_rate": 4.9024546193874766e-05, + "loss": 0.9957, + "step": 3931 + }, + { + "epoch": 0.4662634886754417, + "grad_norm": 1.5640975204949963, + "learning_rate": 4.9023882115789235e-05, + "loss": 0.6202, + "step": 3932 + }, + { + "epoch": 0.4663820704375667, + "grad_norm": 1.374178567427156, + "learning_rate": 4.902321781623297e-05, + "loss": 0.5316, + "step": 3933 + }, + { + "epoch": 0.46650065219969167, + "grad_norm": 1.8903055510531646, + "learning_rate": 4.9022553295212096e-05, + "loss": 0.7963, + "step": 3934 + }, + { + "epoch": 0.46661923396181665, + "grad_norm": 1.751624954078587, + "learning_rate": 4.902188855273273e-05, + "loss": 0.6553, + "step": 3935 + }, + { + "epoch": 0.4667378157239417, + "grad_norm": 2.1899938311854474, + "learning_rate": 4.9021223588801015e-05, + "loss": 0.8237, + "step": 3936 + }, + { + "epoch": 0.46685639748606667, + "grad_norm": 1.5922649326482845, + "learning_rate": 4.902055840342308e-05, + "loss": 0.8273, + "step": 3937 + }, + { + "epoch": 0.46697497924819165, + "grad_norm": 1.7151532308645714, + "learning_rate": 4.901989299660505e-05, + "loss": 0.7354, + "step": 3938 + }, + { + "epoch": 0.4670935610103166, + "grad_norm": 1.7443760513214397, + "learning_rate": 4.901922736835306e-05, + "loss": 0.5804, + "step": 3939 + }, + { + "epoch": 0.4672121427724416, + "grad_norm": 1.751310645529247, + "learning_rate": 4.9018561518673244e-05, + "loss": 0.504, + "step": 3940 + }, + { + "epoch": 0.4673307245345666, + "grad_norm": 1.7285568000634297, + "learning_rate": 4.901789544757175e-05, + "loss": 0.8114, + "step": 3941 + }, + { + "epoch": 0.46744930629669157, + "grad_norm": 1.5402565423477548, + "learning_rate": 4.901722915505471e-05, + "loss": 0.6299, + "step": 3942 + }, + { + "epoch": 0.46756788805881655, + "grad_norm": 1.3436366313004255, + "learning_rate": 4.9016562641128274e-05, + "loss": 0.5137, + "step": 3943 + }, + { + "epoch": 0.46768646982094153, + "grad_norm": 1.3481116358833505, + "learning_rate": 4.901589590579858e-05, + "loss": 0.5649, + "step": 3944 + }, + { + "epoch": 0.4678050515830665, + "grad_norm": 1.6811926161648858, + "learning_rate": 4.9015228949071775e-05, + "loss": 0.7793, + "step": 3945 + }, + { + "epoch": 0.4679236333451915, + "grad_norm": 1.4634823852735208, + "learning_rate": 4.9014561770954e-05, + "loss": 0.597, + "step": 3946 + }, + { + "epoch": 0.4680422151073165, + "grad_norm": 1.4135062307625486, + "learning_rate": 4.901389437145143e-05, + "loss": 0.684, + "step": 3947 + }, + { + "epoch": 0.46816079686944145, + "grad_norm": 1.682643521684982, + "learning_rate": 4.901322675057019e-05, + "loss": 0.7634, + "step": 3948 + }, + { + "epoch": 0.4682793786315665, + "grad_norm": 1.5275828697094942, + "learning_rate": 4.9012558908316456e-05, + "loss": 0.8555, + "step": 3949 + }, + { + "epoch": 0.46839796039369147, + "grad_norm": 1.7360246657558485, + "learning_rate": 4.901189084469636e-05, + "loss": 0.9502, + "step": 3950 + }, + { + "epoch": 0.46851654215581645, + "grad_norm": 1.6032419262345101, + "learning_rate": 4.901122255971609e-05, + "loss": 0.6202, + "step": 3951 + }, + { + "epoch": 0.46863512391794143, + "grad_norm": 1.7432053249266153, + "learning_rate": 4.901055405338179e-05, + "loss": 0.7833, + "step": 3952 + }, + { + "epoch": 0.4687537056800664, + "grad_norm": 1.4496754789134678, + "learning_rate": 4.900988532569962e-05, + "loss": 0.571, + "step": 3953 + }, + { + "epoch": 0.4688722874421914, + "grad_norm": 1.314564641626623, + "learning_rate": 4.900921637667575e-05, + "loss": 0.6157, + "step": 3954 + }, + { + "epoch": 0.4689908692043164, + "grad_norm": 1.7614749590123622, + "learning_rate": 4.900854720631635e-05, + "loss": 0.6339, + "step": 3955 + }, + { + "epoch": 0.46910945096644135, + "grad_norm": 1.5547691706950737, + "learning_rate": 4.900787781462759e-05, + "loss": 0.6558, + "step": 3956 + }, + { + "epoch": 0.46922803272856634, + "grad_norm": 2.055557692717598, + "learning_rate": 4.900720820161563e-05, + "loss": 1.0625, + "step": 3957 + }, + { + "epoch": 0.4693466144906913, + "grad_norm": 1.4193368362369563, + "learning_rate": 4.900653836728665e-05, + "loss": 0.6164, + "step": 3958 + }, + { + "epoch": 0.4694651962528163, + "grad_norm": 1.318129806750987, + "learning_rate": 4.900586831164683e-05, + "loss": 0.5594, + "step": 3959 + }, + { + "epoch": 0.4695837780149413, + "grad_norm": 1.0981519983463217, + "learning_rate": 4.9005198034702334e-05, + "loss": 0.4395, + "step": 3960 + }, + { + "epoch": 0.4697023597770663, + "grad_norm": 1.594499940337011, + "learning_rate": 4.900452753645936e-05, + "loss": 0.6927, + "step": 3961 + }, + { + "epoch": 0.4698209415391913, + "grad_norm": 1.4850699575153201, + "learning_rate": 4.900385681692408e-05, + "loss": 0.6133, + "step": 3962 + }, + { + "epoch": 0.4699395233013163, + "grad_norm": 1.4197672629729599, + "learning_rate": 4.900318587610266e-05, + "loss": 0.6068, + "step": 3963 + }, + { + "epoch": 0.47005810506344126, + "grad_norm": 1.3895682920916126, + "learning_rate": 4.9002514714001316e-05, + "loss": 0.5485, + "step": 3964 + }, + { + "epoch": 0.47017668682556624, + "grad_norm": 1.4942194581011752, + "learning_rate": 4.9001843330626206e-05, + "loss": 0.755, + "step": 3965 + }, + { + "epoch": 0.4702952685876912, + "grad_norm": 1.4330464968844026, + "learning_rate": 4.900117172598354e-05, + "loss": 0.6341, + "step": 3966 + }, + { + "epoch": 0.4704138503498162, + "grad_norm": 1.4457597417394499, + "learning_rate": 4.9000499900079514e-05, + "loss": 0.5418, + "step": 3967 + }, + { + "epoch": 0.4705324321119412, + "grad_norm": 2.019120421486503, + "learning_rate": 4.89998278529203e-05, + "loss": 0.7049, + "step": 3968 + }, + { + "epoch": 0.47065101387406616, + "grad_norm": 1.4719025360944575, + "learning_rate": 4.89991555845121e-05, + "loss": 0.6184, + "step": 3969 + }, + { + "epoch": 0.47076959563619114, + "grad_norm": 1.3611737649430142, + "learning_rate": 4.899848309486112e-05, + "loss": 0.4403, + "step": 3970 + }, + { + "epoch": 0.4708881773983161, + "grad_norm": 1.419523577489354, + "learning_rate": 4.8997810383973555e-05, + "loss": 0.4889, + "step": 3971 + }, + { + "epoch": 0.4710067591604411, + "grad_norm": 1.5603446084461012, + "learning_rate": 4.89971374518556e-05, + "loss": 0.625, + "step": 3972 + }, + { + "epoch": 0.4711253409225661, + "grad_norm": 1.8143458039256453, + "learning_rate": 4.899646429851348e-05, + "loss": 0.7447, + "step": 3973 + }, + { + "epoch": 0.4712439226846911, + "grad_norm": 2.1663887676376348, + "learning_rate": 4.899579092395337e-05, + "loss": 0.8614, + "step": 3974 + }, + { + "epoch": 0.4713625044468161, + "grad_norm": 1.5155045664094833, + "learning_rate": 4.89951173281815e-05, + "loss": 0.7454, + "step": 3975 + }, + { + "epoch": 0.4714810862089411, + "grad_norm": 1.5983429094696837, + "learning_rate": 4.8994443511204066e-05, + "loss": 0.7299, + "step": 3976 + }, + { + "epoch": 0.47159966797106606, + "grad_norm": 1.3322413839941973, + "learning_rate": 4.89937694730273e-05, + "loss": 0.5266, + "step": 3977 + }, + { + "epoch": 0.47171824973319104, + "grad_norm": 1.656333853396519, + "learning_rate": 4.899309521365739e-05, + "loss": 0.7718, + "step": 3978 + }, + { + "epoch": 0.471836831495316, + "grad_norm": 1.5622427901651401, + "learning_rate": 4.8992420733100575e-05, + "loss": 0.9564, + "step": 3979 + }, + { + "epoch": 0.471955413257441, + "grad_norm": 1.521875384698585, + "learning_rate": 4.899174603136306e-05, + "loss": 0.6321, + "step": 3980 + }, + { + "epoch": 0.472073995019566, + "grad_norm": 1.3876440854518506, + "learning_rate": 4.899107110845106e-05, + "loss": 0.523, + "step": 3981 + }, + { + "epoch": 0.47219257678169096, + "grad_norm": 1.484977238624346, + "learning_rate": 4.899039596437081e-05, + "loss": 0.6178, + "step": 3982 + }, + { + "epoch": 0.47231115854381595, + "grad_norm": 1.5739026044610969, + "learning_rate": 4.898972059912853e-05, + "loss": 0.8231, + "step": 3983 + }, + { + "epoch": 0.4724297403059409, + "grad_norm": 1.3631026074105321, + "learning_rate": 4.898904501273044e-05, + "loss": 0.7308, + "step": 3984 + }, + { + "epoch": 0.4725483220680659, + "grad_norm": 1.3962003734040087, + "learning_rate": 4.898836920518277e-05, + "loss": 0.6757, + "step": 3985 + }, + { + "epoch": 0.47266690383019094, + "grad_norm": 1.4260222496595076, + "learning_rate": 4.8987693176491755e-05, + "loss": 0.6364, + "step": 3986 + }, + { + "epoch": 0.4727854855923159, + "grad_norm": 1.7031115856236982, + "learning_rate": 4.8987016926663634e-05, + "loss": 0.7404, + "step": 3987 + }, + { + "epoch": 0.4729040673544409, + "grad_norm": 1.4270711873264215, + "learning_rate": 4.898634045570462e-05, + "loss": 0.8067, + "step": 3988 + }, + { + "epoch": 0.4730226491165659, + "grad_norm": 1.3196458940706794, + "learning_rate": 4.898566376362096e-05, + "loss": 0.5122, + "step": 3989 + }, + { + "epoch": 0.47314123087869087, + "grad_norm": 1.5540140525155384, + "learning_rate": 4.89849868504189e-05, + "loss": 0.7291, + "step": 3990 + }, + { + "epoch": 0.47325981264081585, + "grad_norm": 1.5398439188462036, + "learning_rate": 4.898430971610467e-05, + "loss": 0.8767, + "step": 3991 + }, + { + "epoch": 0.4733783944029408, + "grad_norm": 1.5213517661258038, + "learning_rate": 4.8983632360684515e-05, + "loss": 0.587, + "step": 3992 + }, + { + "epoch": 0.4734969761650658, + "grad_norm": 1.544424965668842, + "learning_rate": 4.898295478416469e-05, + "loss": 0.591, + "step": 3993 + }, + { + "epoch": 0.4736155579271908, + "grad_norm": 1.7192561139355211, + "learning_rate": 4.8982276986551425e-05, + "loss": 0.7074, + "step": 3994 + }, + { + "epoch": 0.47373413968931577, + "grad_norm": 2.1650992817174517, + "learning_rate": 4.8981598967850976e-05, + "loss": 1.085, + "step": 3995 + }, + { + "epoch": 0.47385272145144075, + "grad_norm": 1.3411474107828578, + "learning_rate": 4.898092072806959e-05, + "loss": 0.5091, + "step": 3996 + }, + { + "epoch": 0.47397130321356573, + "grad_norm": 1.3029771241772945, + "learning_rate": 4.8980242267213515e-05, + "loss": 0.5191, + "step": 3997 + }, + { + "epoch": 0.4740898849756907, + "grad_norm": 1.8634113149361473, + "learning_rate": 4.8979563585289026e-05, + "loss": 0.852, + "step": 3998 + }, + { + "epoch": 0.47420846673781575, + "grad_norm": 1.3602374025023698, + "learning_rate": 4.897888468230236e-05, + "loss": 0.6197, + "step": 3999 + }, + { + "epoch": 0.47432704849994073, + "grad_norm": 1.6569787616312923, + "learning_rate": 4.897820555825978e-05, + "loss": 0.6495, + "step": 4000 + }, + { + "epoch": 0.4744456302620657, + "grad_norm": 1.9807303477404288, + "learning_rate": 4.897752621316756e-05, + "loss": 0.7887, + "step": 4001 + }, + { + "epoch": 0.4745642120241907, + "grad_norm": 1.2689286980761965, + "learning_rate": 4.8976846647031935e-05, + "loss": 0.3509, + "step": 4002 + }, + { + "epoch": 0.47468279378631567, + "grad_norm": 1.4730594431095392, + "learning_rate": 4.8976166859859195e-05, + "loss": 0.5824, + "step": 4003 + }, + { + "epoch": 0.47480137554844065, + "grad_norm": 1.443382242028131, + "learning_rate": 4.89754868516556e-05, + "loss": 0.6988, + "step": 4004 + }, + { + "epoch": 0.47491995731056563, + "grad_norm": 1.4456194629572676, + "learning_rate": 4.897480662242742e-05, + "loss": 0.6312, + "step": 4005 + }, + { + "epoch": 0.4750385390726906, + "grad_norm": 1.7066445885245023, + "learning_rate": 4.8974126172180914e-05, + "loss": 0.7714, + "step": 4006 + }, + { + "epoch": 0.4751571208348156, + "grad_norm": 1.7300554750289905, + "learning_rate": 4.897344550092237e-05, + "loss": 0.8768, + "step": 4007 + }, + { + "epoch": 0.4752757025969406, + "grad_norm": 1.6961087913619637, + "learning_rate": 4.897276460865805e-05, + "loss": 0.7856, + "step": 4008 + }, + { + "epoch": 0.47539428435906556, + "grad_norm": 1.3678737900741764, + "learning_rate": 4.8972083495394245e-05, + "loss": 0.5315, + "step": 4009 + }, + { + "epoch": 0.47551286612119054, + "grad_norm": 1.5793692209423693, + "learning_rate": 4.8971402161137226e-05, + "loss": 0.6267, + "step": 4010 + }, + { + "epoch": 0.4756314478833156, + "grad_norm": 1.588296401621017, + "learning_rate": 4.8970720605893275e-05, + "loss": 0.7175, + "step": 4011 + }, + { + "epoch": 0.47575002964544055, + "grad_norm": 1.5988067220892, + "learning_rate": 4.897003882966866e-05, + "loss": 0.6675, + "step": 4012 + }, + { + "epoch": 0.47586861140756553, + "grad_norm": 1.397453887214475, + "learning_rate": 4.89693568324697e-05, + "loss": 0.6756, + "step": 4013 + }, + { + "epoch": 0.4759871931696905, + "grad_norm": 1.8477954354125279, + "learning_rate": 4.8968674614302656e-05, + "loss": 0.8347, + "step": 4014 + }, + { + "epoch": 0.4761057749318155, + "grad_norm": 1.4740456338380772, + "learning_rate": 4.896799217517383e-05, + "loss": 0.7265, + "step": 4015 + }, + { + "epoch": 0.4762243566939405, + "grad_norm": 1.4444323844621767, + "learning_rate": 4.8967309515089496e-05, + "loss": 0.7475, + "step": 4016 + }, + { + "epoch": 0.47634293845606546, + "grad_norm": 1.5344883361660566, + "learning_rate": 4.896662663405597e-05, + "loss": 0.654, + "step": 4017 + }, + { + "epoch": 0.47646152021819044, + "grad_norm": 1.7910780444585894, + "learning_rate": 4.896594353207953e-05, + "loss": 0.6878, + "step": 4018 + }, + { + "epoch": 0.4765801019803154, + "grad_norm": 1.6883597187225852, + "learning_rate": 4.896526020916647e-05, + "loss": 0.813, + "step": 4019 + }, + { + "epoch": 0.4766986837424404, + "grad_norm": 1.2489307469669835, + "learning_rate": 4.896457666532311e-05, + "loss": 0.3902, + "step": 4020 + }, + { + "epoch": 0.4768172655045654, + "grad_norm": 1.529896729236191, + "learning_rate": 4.8963892900555745e-05, + "loss": 0.6819, + "step": 4021 + }, + { + "epoch": 0.47693584726669036, + "grad_norm": 1.4559016496272676, + "learning_rate": 4.896320891487067e-05, + "loss": 0.508, + "step": 4022 + }, + { + "epoch": 0.47705442902881534, + "grad_norm": 1.1620507816987244, + "learning_rate": 4.8962524708274195e-05, + "loss": 0.5299, + "step": 4023 + }, + { + "epoch": 0.4771730107909404, + "grad_norm": 1.4597397148806852, + "learning_rate": 4.896184028077263e-05, + "loss": 0.5113, + "step": 4024 + }, + { + "epoch": 0.47729159255306536, + "grad_norm": 2.019835618225478, + "learning_rate": 4.896115563237227e-05, + "loss": 0.8913, + "step": 4025 + }, + { + "epoch": 0.47741017431519034, + "grad_norm": 1.7249381078122055, + "learning_rate": 4.8960470763079444e-05, + "loss": 0.6928, + "step": 4026 + }, + { + "epoch": 0.4775287560773153, + "grad_norm": 1.452990692402297, + "learning_rate": 4.895978567290047e-05, + "loss": 0.5525, + "step": 4027 + }, + { + "epoch": 0.4776473378394403, + "grad_norm": 1.3474797999461565, + "learning_rate": 4.8959100361841643e-05, + "loss": 0.8029, + "step": 4028 + }, + { + "epoch": 0.4777659196015653, + "grad_norm": 1.7018900648720616, + "learning_rate": 4.8958414829909296e-05, + "loss": 0.7859, + "step": 4029 + }, + { + "epoch": 0.47788450136369026, + "grad_norm": 1.6926426370638796, + "learning_rate": 4.895772907710974e-05, + "loss": 0.6492, + "step": 4030 + }, + { + "epoch": 0.47800308312581524, + "grad_norm": 1.7433949304626866, + "learning_rate": 4.8957043103449296e-05, + "loss": 0.4681, + "step": 4031 + }, + { + "epoch": 0.4781216648879402, + "grad_norm": 1.477728800187049, + "learning_rate": 4.89563569089343e-05, + "loss": 0.7129, + "step": 4032 + }, + { + "epoch": 0.4782402466500652, + "grad_norm": 1.3005606221716723, + "learning_rate": 4.895567049357107e-05, + "loss": 0.6076, + "step": 4033 + }, + { + "epoch": 0.4783588284121902, + "grad_norm": 1.5038771543549763, + "learning_rate": 4.895498385736593e-05, + "loss": 0.734, + "step": 4034 + }, + { + "epoch": 0.47847741017431517, + "grad_norm": 1.6848296533218543, + "learning_rate": 4.8954297000325225e-05, + "loss": 0.8285, + "step": 4035 + }, + { + "epoch": 0.4785959919364402, + "grad_norm": 1.6794823041534357, + "learning_rate": 4.895360992245527e-05, + "loss": 0.6397, + "step": 4036 + }, + { + "epoch": 0.4787145736985652, + "grad_norm": 1.7476007210168303, + "learning_rate": 4.89529226237624e-05, + "loss": 0.7239, + "step": 4037 + }, + { + "epoch": 0.47883315546069016, + "grad_norm": 1.5703064005891074, + "learning_rate": 4.895223510425296e-05, + "loss": 0.6421, + "step": 4038 + }, + { + "epoch": 0.47895173722281514, + "grad_norm": 1.87443774142814, + "learning_rate": 4.895154736393329e-05, + "loss": 0.9704, + "step": 4039 + }, + { + "epoch": 0.4790703189849401, + "grad_norm": 1.5653107126365349, + "learning_rate": 4.8950859402809724e-05, + "loss": 0.6423, + "step": 4040 + }, + { + "epoch": 0.4791889007470651, + "grad_norm": 1.617704406298519, + "learning_rate": 4.89501712208886e-05, + "loss": 0.6999, + "step": 4041 + }, + { + "epoch": 0.4793074825091901, + "grad_norm": 1.5171819855324005, + "learning_rate": 4.894948281817626e-05, + "loss": 0.8446, + "step": 4042 + }, + { + "epoch": 0.47942606427131507, + "grad_norm": 1.3941988529859095, + "learning_rate": 4.894879419467907e-05, + "loss": 0.5789, + "step": 4043 + }, + { + "epoch": 0.47954464603344005, + "grad_norm": 1.298817209928704, + "learning_rate": 4.894810535040336e-05, + "loss": 0.4436, + "step": 4044 + }, + { + "epoch": 0.47966322779556503, + "grad_norm": 1.671129374601477, + "learning_rate": 4.894741628535549e-05, + "loss": 0.6212, + "step": 4045 + }, + { + "epoch": 0.47978180955769, + "grad_norm": 1.6626633351931168, + "learning_rate": 4.89467269995418e-05, + "loss": 0.6574, + "step": 4046 + }, + { + "epoch": 0.479900391319815, + "grad_norm": 1.7987262732440326, + "learning_rate": 4.894603749296866e-05, + "loss": 0.6805, + "step": 4047 + }, + { + "epoch": 0.48001897308193997, + "grad_norm": 1.8410941049156422, + "learning_rate": 4.8945347765642414e-05, + "loss": 0.7511, + "step": 4048 + }, + { + "epoch": 0.480137554844065, + "grad_norm": 1.7156379650873421, + "learning_rate": 4.894465781756943e-05, + "loss": 0.5072, + "step": 4049 + }, + { + "epoch": 0.48025613660619, + "grad_norm": 1.4723610685674535, + "learning_rate": 4.894396764875606e-05, + "loss": 0.588, + "step": 4050 + }, + { + "epoch": 0.48037471836831497, + "grad_norm": 1.6741522949163534, + "learning_rate": 4.894327725920868e-05, + "loss": 0.8677, + "step": 4051 + }, + { + "epoch": 0.48049330013043995, + "grad_norm": 1.4864454634420534, + "learning_rate": 4.894258664893363e-05, + "loss": 0.7783, + "step": 4052 + }, + { + "epoch": 0.48061188189256493, + "grad_norm": 1.6999465145214452, + "learning_rate": 4.89418958179373e-05, + "loss": 0.811, + "step": 4053 + }, + { + "epoch": 0.4807304636546899, + "grad_norm": 1.516233615843892, + "learning_rate": 4.894120476622605e-05, + "loss": 0.4956, + "step": 4054 + }, + { + "epoch": 0.4808490454168149, + "grad_norm": 1.5693543010984206, + "learning_rate": 4.894051349380624e-05, + "loss": 0.6724, + "step": 4055 + }, + { + "epoch": 0.48096762717893987, + "grad_norm": 1.6279231489096926, + "learning_rate": 4.893982200068426e-05, + "loss": 0.8112, + "step": 4056 + }, + { + "epoch": 0.48108620894106485, + "grad_norm": 1.706148097966865, + "learning_rate": 4.893913028686649e-05, + "loss": 0.5358, + "step": 4057 + }, + { + "epoch": 0.48120479070318983, + "grad_norm": 1.1372754006512094, + "learning_rate": 4.893843835235928e-05, + "loss": 0.4509, + "step": 4058 + }, + { + "epoch": 0.4813233724653148, + "grad_norm": 1.6060460644852042, + "learning_rate": 4.8937746197169026e-05, + "loss": 0.5663, + "step": 4059 + }, + { + "epoch": 0.4814419542274398, + "grad_norm": 1.8041408234356868, + "learning_rate": 4.893705382130211e-05, + "loss": 0.711, + "step": 4060 + }, + { + "epoch": 0.48156053598956483, + "grad_norm": 1.5240559669729792, + "learning_rate": 4.893636122476491e-05, + "loss": 0.8108, + "step": 4061 + }, + { + "epoch": 0.4816791177516898, + "grad_norm": 1.4873789488663596, + "learning_rate": 4.893566840756382e-05, + "loss": 0.7501, + "step": 4062 + }, + { + "epoch": 0.4817976995138148, + "grad_norm": 1.5210579920654985, + "learning_rate": 4.8934975369705206e-05, + "loss": 0.5496, + "step": 4063 + }, + { + "epoch": 0.4819162812759398, + "grad_norm": 1.4447483588927672, + "learning_rate": 4.893428211119547e-05, + "loss": 0.553, + "step": 4064 + }, + { + "epoch": 0.48203486303806475, + "grad_norm": 1.6740466848860047, + "learning_rate": 4.893358863204102e-05, + "loss": 0.7344, + "step": 4065 + }, + { + "epoch": 0.48215344480018973, + "grad_norm": 1.6292526941699856, + "learning_rate": 4.893289493224821e-05, + "loss": 0.5869, + "step": 4066 + }, + { + "epoch": 0.4822720265623147, + "grad_norm": 1.6136912403114538, + "learning_rate": 4.893220101182348e-05, + "loss": 0.8246, + "step": 4067 + }, + { + "epoch": 0.4823906083244397, + "grad_norm": 1.6158382972926673, + "learning_rate": 4.893150687077319e-05, + "loss": 0.5992, + "step": 4068 + }, + { + "epoch": 0.4825091900865647, + "grad_norm": 1.546305554071709, + "learning_rate": 4.8930812509103754e-05, + "loss": 0.7114, + "step": 4069 + }, + { + "epoch": 0.48262777184868966, + "grad_norm": 1.6256671101198086, + "learning_rate": 4.8930117926821575e-05, + "loss": 0.6918, + "step": 4070 + }, + { + "epoch": 0.48274635361081464, + "grad_norm": 1.537034421437289, + "learning_rate": 4.892942312393305e-05, + "loss": 0.6889, + "step": 4071 + }, + { + "epoch": 0.4828649353729396, + "grad_norm": 1.2500742039316979, + "learning_rate": 4.892872810044459e-05, + "loss": 0.5916, + "step": 4072 + }, + { + "epoch": 0.4829835171350646, + "grad_norm": 1.6405389578015364, + "learning_rate": 4.8928032856362606e-05, + "loss": 0.6848, + "step": 4073 + }, + { + "epoch": 0.48310209889718964, + "grad_norm": 1.6960840410535822, + "learning_rate": 4.8927337391693496e-05, + "loss": 0.7937, + "step": 4074 + }, + { + "epoch": 0.4832206806593146, + "grad_norm": 1.9319655568202767, + "learning_rate": 4.892664170644368e-05, + "loss": 0.6337, + "step": 4075 + }, + { + "epoch": 0.4833392624214396, + "grad_norm": 1.4384555355081734, + "learning_rate": 4.892594580061956e-05, + "loss": 0.5111, + "step": 4076 + }, + { + "epoch": 0.4834578441835646, + "grad_norm": 1.39379814598019, + "learning_rate": 4.892524967422757e-05, + "loss": 0.5648, + "step": 4077 + }, + { + "epoch": 0.48357642594568956, + "grad_norm": 1.7211400203628826, + "learning_rate": 4.892455332727411e-05, + "loss": 0.5459, + "step": 4078 + }, + { + "epoch": 0.48369500770781454, + "grad_norm": 1.6520252665648292, + "learning_rate": 4.892385675976561e-05, + "loss": 0.7197, + "step": 4079 + }, + { + "epoch": 0.4838135894699395, + "grad_norm": 1.5109557819232557, + "learning_rate": 4.8923159971708486e-05, + "loss": 0.7451, + "step": 4080 + }, + { + "epoch": 0.4839321712320645, + "grad_norm": 1.6118726597728827, + "learning_rate": 4.8922462963109164e-05, + "loss": 0.7617, + "step": 4081 + }, + { + "epoch": 0.4840507529941895, + "grad_norm": 1.7294407075149008, + "learning_rate": 4.892176573397407e-05, + "loss": 0.9104, + "step": 4082 + }, + { + "epoch": 0.48416933475631446, + "grad_norm": 1.6104780839581314, + "learning_rate": 4.892106828430963e-05, + "loss": 0.6309, + "step": 4083 + }, + { + "epoch": 0.48428791651843944, + "grad_norm": 1.8083183201491937, + "learning_rate": 4.8920370614122276e-05, + "loss": 0.8299, + "step": 4084 + }, + { + "epoch": 0.4844064982805644, + "grad_norm": 1.4528208733084937, + "learning_rate": 4.8919672723418445e-05, + "loss": 0.6245, + "step": 4085 + }, + { + "epoch": 0.48452508004268946, + "grad_norm": 1.265037789273784, + "learning_rate": 4.891897461220455e-05, + "loss": 0.5078, + "step": 4086 + }, + { + "epoch": 0.48464366180481444, + "grad_norm": 1.7523662108839682, + "learning_rate": 4.891827628048705e-05, + "loss": 0.8144, + "step": 4087 + }, + { + "epoch": 0.4847622435669394, + "grad_norm": 1.940696520113745, + "learning_rate": 4.8917577728272366e-05, + "loss": 0.9523, + "step": 4088 + }, + { + "epoch": 0.4848808253290644, + "grad_norm": 1.4980767489192377, + "learning_rate": 4.8916878955566944e-05, + "loss": 0.611, + "step": 4089 + }, + { + "epoch": 0.4849994070911894, + "grad_norm": 1.8046204333877207, + "learning_rate": 4.891617996237724e-05, + "loss": 0.7514, + "step": 4090 + }, + { + "epoch": 0.48511798885331436, + "grad_norm": 1.32352084124772, + "learning_rate": 4.8915480748709676e-05, + "loss": 0.6854, + "step": 4091 + }, + { + "epoch": 0.48523657061543934, + "grad_norm": 1.5245406008052478, + "learning_rate": 4.89147813145707e-05, + "loss": 0.7774, + "step": 4092 + }, + { + "epoch": 0.4853551523775643, + "grad_norm": 1.5098346099708075, + "learning_rate": 4.891408165996678e-05, + "loss": 0.8574, + "step": 4093 + }, + { + "epoch": 0.4854737341396893, + "grad_norm": 1.4580324273644576, + "learning_rate": 4.891338178490435e-05, + "loss": 0.5788, + "step": 4094 + }, + { + "epoch": 0.4855923159018143, + "grad_norm": 1.6065984179723092, + "learning_rate": 4.891268168938985e-05, + "loss": 0.793, + "step": 4095 + }, + { + "epoch": 0.48571089766393927, + "grad_norm": 1.3341504926731984, + "learning_rate": 4.891198137342976e-05, + "loss": 0.5563, + "step": 4096 + }, + { + "epoch": 0.48582947942606425, + "grad_norm": 1.4323232108624768, + "learning_rate": 4.8911280837030524e-05, + "loss": 0.5846, + "step": 4097 + }, + { + "epoch": 0.4859480611881893, + "grad_norm": 1.7164217240185693, + "learning_rate": 4.8910580080198595e-05, + "loss": 0.5719, + "step": 4098 + }, + { + "epoch": 0.48606664295031426, + "grad_norm": 1.5967946442701428, + "learning_rate": 4.890987910294045e-05, + "loss": 0.6094, + "step": 4099 + }, + { + "epoch": 0.48618522471243925, + "grad_norm": 1.4920737327854732, + "learning_rate": 4.890917790526254e-05, + "loss": 0.7785, + "step": 4100 + }, + { + "epoch": 0.4863038064745642, + "grad_norm": 1.5188401832717104, + "learning_rate": 4.890847648717132e-05, + "loss": 0.8239, + "step": 4101 + }, + { + "epoch": 0.4864223882366892, + "grad_norm": 1.5397230860448081, + "learning_rate": 4.8907774848673266e-05, + "loss": 0.5947, + "step": 4102 + }, + { + "epoch": 0.4865409699988142, + "grad_norm": 1.4253714923762693, + "learning_rate": 4.8907072989774846e-05, + "loss": 0.5785, + "step": 4103 + }, + { + "epoch": 0.48665955176093917, + "grad_norm": 1.4849977100756124, + "learning_rate": 4.890637091048253e-05, + "loss": 0.554, + "step": 4104 + }, + { + "epoch": 0.48677813352306415, + "grad_norm": 2.130122589854132, + "learning_rate": 4.890566861080279e-05, + "loss": 0.8115, + "step": 4105 + }, + { + "epoch": 0.48689671528518913, + "grad_norm": 1.5268531439250852, + "learning_rate": 4.8904966090742105e-05, + "loss": 0.6931, + "step": 4106 + }, + { + "epoch": 0.4870152970473141, + "grad_norm": 1.7458415197480863, + "learning_rate": 4.8904263350306946e-05, + "loss": 0.6334, + "step": 4107 + }, + { + "epoch": 0.4871338788094391, + "grad_norm": 1.5900275340024916, + "learning_rate": 4.8903560389503785e-05, + "loss": 0.701, + "step": 4108 + }, + { + "epoch": 0.48725246057156407, + "grad_norm": 1.5468873138125376, + "learning_rate": 4.8902857208339115e-05, + "loss": 0.6714, + "step": 4109 + }, + { + "epoch": 0.48737104233368905, + "grad_norm": 1.6606342853677105, + "learning_rate": 4.890215380681942e-05, + "loss": 0.6874, + "step": 4110 + }, + { + "epoch": 0.4874896240958141, + "grad_norm": 1.5581855374473612, + "learning_rate": 4.890145018495117e-05, + "loss": 0.7354, + "step": 4111 + }, + { + "epoch": 0.48760820585793907, + "grad_norm": 2.102185665828938, + "learning_rate": 4.8900746342740854e-05, + "loss": 0.9245, + "step": 4112 + }, + { + "epoch": 0.48772678762006405, + "grad_norm": 1.6588631539365102, + "learning_rate": 4.890004228019497e-05, + "loss": 0.6962, + "step": 4113 + }, + { + "epoch": 0.48784536938218903, + "grad_norm": 1.6970211134979376, + "learning_rate": 4.889933799732001e-05, + "loss": 0.8241, + "step": 4114 + }, + { + "epoch": 0.487963951144314, + "grad_norm": 1.6667289133910457, + "learning_rate": 4.889863349412246e-05, + "loss": 0.6617, + "step": 4115 + }, + { + "epoch": 0.488082532906439, + "grad_norm": 1.253757782531047, + "learning_rate": 4.889792877060881e-05, + "loss": 0.5734, + "step": 4116 + }, + { + "epoch": 0.488201114668564, + "grad_norm": 1.55631041443559, + "learning_rate": 4.8897223826785566e-05, + "loss": 0.6085, + "step": 4117 + }, + { + "epoch": 0.48831969643068895, + "grad_norm": 1.6873564117248379, + "learning_rate": 4.8896518662659216e-05, + "loss": 0.669, + "step": 4118 + }, + { + "epoch": 0.48843827819281393, + "grad_norm": 2.4422979425445517, + "learning_rate": 4.889581327823628e-05, + "loss": 0.7793, + "step": 4119 + }, + { + "epoch": 0.4885568599549389, + "grad_norm": 1.503631848742357, + "learning_rate": 4.889510767352325e-05, + "loss": 0.7079, + "step": 4120 + }, + { + "epoch": 0.4886754417170639, + "grad_norm": 1.4481281580581187, + "learning_rate": 4.889440184852661e-05, + "loss": 0.6951, + "step": 4121 + }, + { + "epoch": 0.4887940234791889, + "grad_norm": 1.3778356376513048, + "learning_rate": 4.8893695803252906e-05, + "loss": 0.5633, + "step": 4122 + }, + { + "epoch": 0.4889126052413139, + "grad_norm": 1.6849604465025894, + "learning_rate": 4.889298953770861e-05, + "loss": 0.7162, + "step": 4123 + }, + { + "epoch": 0.4890311870034389, + "grad_norm": 1.5255935518260273, + "learning_rate": 4.889228305190026e-05, + "loss": 0.6022, + "step": 4124 + }, + { + "epoch": 0.4891497687655639, + "grad_norm": 1.4820373294779996, + "learning_rate": 4.8891576345834356e-05, + "loss": 0.7192, + "step": 4125 + }, + { + "epoch": 0.48926835052768886, + "grad_norm": 1.8751054113127694, + "learning_rate": 4.889086941951742e-05, + "loss": 0.8899, + "step": 4126 + }, + { + "epoch": 0.48938693228981384, + "grad_norm": 1.24773879548174, + "learning_rate": 4.8890162272955965e-05, + "loss": 0.5906, + "step": 4127 + }, + { + "epoch": 0.4895055140519388, + "grad_norm": 1.704567914020501, + "learning_rate": 4.8889454906156505e-05, + "loss": 0.6906, + "step": 4128 + }, + { + "epoch": 0.4896240958140638, + "grad_norm": 2.0079580171076024, + "learning_rate": 4.888874731912557e-05, + "loss": 0.7996, + "step": 4129 + }, + { + "epoch": 0.4897426775761888, + "grad_norm": 1.805758434603456, + "learning_rate": 4.888803951186968e-05, + "loss": 0.8334, + "step": 4130 + }, + { + "epoch": 0.48986125933831376, + "grad_norm": 1.6825341754045586, + "learning_rate": 4.888733148439535e-05, + "loss": 0.5857, + "step": 4131 + }, + { + "epoch": 0.48997984110043874, + "grad_norm": 1.5677698233854147, + "learning_rate": 4.888662323670913e-05, + "loss": 0.4552, + "step": 4132 + }, + { + "epoch": 0.4900984228625637, + "grad_norm": 1.6767162847898258, + "learning_rate": 4.888591476881752e-05, + "loss": 0.773, + "step": 4133 + }, + { + "epoch": 0.4902170046246887, + "grad_norm": 1.5594819633686676, + "learning_rate": 4.888520608072707e-05, + "loss": 0.7898, + "step": 4134 + }, + { + "epoch": 0.4903355863868137, + "grad_norm": 1.5421139748801498, + "learning_rate": 4.888449717244432e-05, + "loss": 0.4921, + "step": 4135 + }, + { + "epoch": 0.4904541681489387, + "grad_norm": 1.34077188423937, + "learning_rate": 4.88837880439758e-05, + "loss": 0.4277, + "step": 4136 + }, + { + "epoch": 0.4905727499110637, + "grad_norm": 1.561133768175664, + "learning_rate": 4.8883078695328024e-05, + "loss": 0.8, + "step": 4137 + }, + { + "epoch": 0.4906913316731887, + "grad_norm": 1.6404591085136184, + "learning_rate": 4.888236912650756e-05, + "loss": 0.6402, + "step": 4138 + }, + { + "epoch": 0.49080991343531366, + "grad_norm": 1.417770694364946, + "learning_rate": 4.8881659337520936e-05, + "loss": 0.6099, + "step": 4139 + }, + { + "epoch": 0.49092849519743864, + "grad_norm": 1.6909666240490429, + "learning_rate": 4.8880949328374706e-05, + "loss": 0.5555, + "step": 4140 + }, + { + "epoch": 0.4910470769595636, + "grad_norm": 1.245433334589589, + "learning_rate": 4.88802390990754e-05, + "loss": 0.5044, + "step": 4141 + }, + { + "epoch": 0.4911656587216886, + "grad_norm": 1.8321746367368652, + "learning_rate": 4.8879528649629576e-05, + "loss": 0.9042, + "step": 4142 + }, + { + "epoch": 0.4912842404838136, + "grad_norm": 1.305741089603041, + "learning_rate": 4.887881798004378e-05, + "loss": 0.4757, + "step": 4143 + }, + { + "epoch": 0.49140282224593856, + "grad_norm": 1.3943127188450002, + "learning_rate": 4.8878107090324566e-05, + "loss": 0.5234, + "step": 4144 + }, + { + "epoch": 0.49152140400806354, + "grad_norm": 1.6508612661443025, + "learning_rate": 4.887739598047848e-05, + "loss": 0.6387, + "step": 4145 + }, + { + "epoch": 0.4916399857701885, + "grad_norm": 2.0678054677163806, + "learning_rate": 4.887668465051209e-05, + "loss": 0.9759, + "step": 4146 + }, + { + "epoch": 0.4917585675323135, + "grad_norm": 1.5928100850259528, + "learning_rate": 4.887597310043194e-05, + "loss": 0.6206, + "step": 4147 + }, + { + "epoch": 0.49187714929443854, + "grad_norm": 1.8251664810641781, + "learning_rate": 4.88752613302446e-05, + "loss": 0.8042, + "step": 4148 + }, + { + "epoch": 0.4919957310565635, + "grad_norm": 1.699626363261998, + "learning_rate": 4.8874549339956634e-05, + "loss": 0.5784, + "step": 4149 + }, + { + "epoch": 0.4921143128186885, + "grad_norm": 1.7501436979317118, + "learning_rate": 4.8873837129574595e-05, + "loss": 0.6352, + "step": 4150 + }, + { + "epoch": 0.4922328945808135, + "grad_norm": 1.3134200765468103, + "learning_rate": 4.887312469910506e-05, + "loss": 0.517, + "step": 4151 + }, + { + "epoch": 0.49235147634293847, + "grad_norm": 1.4403613011533414, + "learning_rate": 4.887241204855458e-05, + "loss": 0.753, + "step": 4152 + }, + { + "epoch": 0.49247005810506345, + "grad_norm": 1.5750643229769377, + "learning_rate": 4.887169917792974e-05, + "loss": 0.7375, + "step": 4153 + }, + { + "epoch": 0.4925886398671884, + "grad_norm": 1.7407713983302688, + "learning_rate": 4.887098608723711e-05, + "loss": 0.7501, + "step": 4154 + }, + { + "epoch": 0.4927072216293134, + "grad_norm": 1.2546618616730723, + "learning_rate": 4.8870272776483263e-05, + "loss": 0.559, + "step": 4155 + }, + { + "epoch": 0.4928258033914384, + "grad_norm": 1.5695898163434259, + "learning_rate": 4.886955924567476e-05, + "loss": 0.6674, + "step": 4156 + }, + { + "epoch": 0.49294438515356337, + "grad_norm": 1.3865669006483519, + "learning_rate": 4.886884549481821e-05, + "loss": 0.5162, + "step": 4157 + }, + { + "epoch": 0.49306296691568835, + "grad_norm": 1.5574696071485596, + "learning_rate": 4.886813152392016e-05, + "loss": 0.5564, + "step": 4158 + }, + { + "epoch": 0.49318154867781333, + "grad_norm": 1.50565383158194, + "learning_rate": 4.886741733298721e-05, + "loss": 0.6875, + "step": 4159 + }, + { + "epoch": 0.4933001304399383, + "grad_norm": 1.174068859498064, + "learning_rate": 4.8866702922025934e-05, + "loss": 0.4053, + "step": 4160 + }, + { + "epoch": 0.49341871220206335, + "grad_norm": 1.477747394461469, + "learning_rate": 4.886598829104293e-05, + "loss": 0.5704, + "step": 4161 + }, + { + "epoch": 0.49353729396418833, + "grad_norm": 1.6593838599138053, + "learning_rate": 4.8865273440044784e-05, + "loss": 0.6486, + "step": 4162 + }, + { + "epoch": 0.4936558757263133, + "grad_norm": 1.257871526941035, + "learning_rate": 4.8864558369038084e-05, + "loss": 0.6281, + "step": 4163 + }, + { + "epoch": 0.4937744574884383, + "grad_norm": 1.4212857980846925, + "learning_rate": 4.8863843078029416e-05, + "loss": 0.3895, + "step": 4164 + }, + { + "epoch": 0.49389303925056327, + "grad_norm": 1.6167045505101534, + "learning_rate": 4.886312756702537e-05, + "loss": 0.6028, + "step": 4165 + }, + { + "epoch": 0.49401162101268825, + "grad_norm": 1.5976736478926108, + "learning_rate": 4.886241183603256e-05, + "loss": 0.5893, + "step": 4166 + }, + { + "epoch": 0.49413020277481323, + "grad_norm": 1.8155228437003454, + "learning_rate": 4.886169588505758e-05, + "loss": 0.7451, + "step": 4167 + }, + { + "epoch": 0.4942487845369382, + "grad_norm": 1.8801691852585987, + "learning_rate": 4.886097971410701e-05, + "loss": 0.588, + "step": 4168 + }, + { + "epoch": 0.4943673662990632, + "grad_norm": 1.897293570669478, + "learning_rate": 4.8860263323187484e-05, + "loss": 0.6171, + "step": 4169 + }, + { + "epoch": 0.4944859480611882, + "grad_norm": 2.014350674090454, + "learning_rate": 4.885954671230558e-05, + "loss": 0.838, + "step": 4170 + }, + { + "epoch": 0.49460452982331315, + "grad_norm": 1.4582943101312837, + "learning_rate": 4.885882988146791e-05, + "loss": 0.6183, + "step": 4171 + }, + { + "epoch": 0.49472311158543814, + "grad_norm": 1.534112061742086, + "learning_rate": 4.8858112830681103e-05, + "loss": 0.5194, + "step": 4172 + }, + { + "epoch": 0.49484169334756317, + "grad_norm": 1.477448897460018, + "learning_rate": 4.885739555995174e-05, + "loss": 0.4589, + "step": 4173 + }, + { + "epoch": 0.49496027510968815, + "grad_norm": 1.6422734516772388, + "learning_rate": 4.8856678069286444e-05, + "loss": 0.657, + "step": 4174 + }, + { + "epoch": 0.49507885687181313, + "grad_norm": 1.4317073687155395, + "learning_rate": 4.885596035869184e-05, + "loss": 0.511, + "step": 4175 + }, + { + "epoch": 0.4951974386339381, + "grad_norm": 1.1763927505840408, + "learning_rate": 4.885524242817453e-05, + "loss": 0.4551, + "step": 4176 + }, + { + "epoch": 0.4953160203960631, + "grad_norm": 1.411962298666528, + "learning_rate": 4.885452427774114e-05, + "loss": 0.7799, + "step": 4177 + }, + { + "epoch": 0.4954346021581881, + "grad_norm": 1.5564797788444686, + "learning_rate": 4.8853805907398285e-05, + "loss": 0.7688, + "step": 4178 + }, + { + "epoch": 0.49555318392031306, + "grad_norm": 1.5399659255151243, + "learning_rate": 4.885308731715259e-05, + "loss": 0.5685, + "step": 4179 + }, + { + "epoch": 0.49567176568243804, + "grad_norm": 1.4136804299191332, + "learning_rate": 4.885236850701068e-05, + "loss": 0.5345, + "step": 4180 + }, + { + "epoch": 0.495790347444563, + "grad_norm": 1.6974687606245338, + "learning_rate": 4.8851649476979186e-05, + "loss": 0.7021, + "step": 4181 + }, + { + "epoch": 0.495908929206688, + "grad_norm": 1.479164042532574, + "learning_rate": 4.885093022706474e-05, + "loss": 0.6877, + "step": 4182 + }, + { + "epoch": 0.496027510968813, + "grad_norm": 1.7268811040623062, + "learning_rate": 4.885021075727395e-05, + "loss": 0.7108, + "step": 4183 + }, + { + "epoch": 0.49614609273093796, + "grad_norm": 1.7785163771219035, + "learning_rate": 4.884949106761347e-05, + "loss": 0.8364, + "step": 4184 + }, + { + "epoch": 0.49626467449306294, + "grad_norm": 1.6849634177784654, + "learning_rate": 4.884877115808994e-05, + "loss": 0.8014, + "step": 4185 + }, + { + "epoch": 0.496383256255188, + "grad_norm": 1.5578554150730428, + "learning_rate": 4.884805102870997e-05, + "loss": 0.789, + "step": 4186 + }, + { + "epoch": 0.49650183801731296, + "grad_norm": 1.1898274144184005, + "learning_rate": 4.884733067948022e-05, + "loss": 0.3891, + "step": 4187 + }, + { + "epoch": 0.49662041977943794, + "grad_norm": 1.6693449916857581, + "learning_rate": 4.884661011040732e-05, + "loss": 0.7627, + "step": 4188 + }, + { + "epoch": 0.4967390015415629, + "grad_norm": 1.5523424827098533, + "learning_rate": 4.8845889321497924e-05, + "loss": 0.6997, + "step": 4189 + }, + { + "epoch": 0.4968575833036879, + "grad_norm": 1.3891164936677896, + "learning_rate": 4.8845168312758666e-05, + "loss": 0.5352, + "step": 4190 + }, + { + "epoch": 0.4969761650658129, + "grad_norm": 1.6715473176601217, + "learning_rate": 4.88444470841962e-05, + "loss": 0.7345, + "step": 4191 + }, + { + "epoch": 0.49709474682793786, + "grad_norm": 1.7407339993306394, + "learning_rate": 4.884372563581717e-05, + "loss": 0.8162, + "step": 4192 + }, + { + "epoch": 0.49721332859006284, + "grad_norm": 1.4784768258988155, + "learning_rate": 4.884300396762823e-05, + "loss": 0.5728, + "step": 4193 + }, + { + "epoch": 0.4973319103521878, + "grad_norm": 1.489602126228985, + "learning_rate": 4.884228207963603e-05, + "loss": 0.6675, + "step": 4194 + }, + { + "epoch": 0.4974504921143128, + "grad_norm": 1.6384792962331787, + "learning_rate": 4.884155997184722e-05, + "loss": 0.7756, + "step": 4195 + }, + { + "epoch": 0.4975690738764378, + "grad_norm": 1.861307279641469, + "learning_rate": 4.8840837644268475e-05, + "loss": 0.6619, + "step": 4196 + }, + { + "epoch": 0.49768765563856276, + "grad_norm": 1.473318276156382, + "learning_rate": 4.884011509690644e-05, + "loss": 0.6956, + "step": 4197 + }, + { + "epoch": 0.4978062374006878, + "grad_norm": 1.061196047377696, + "learning_rate": 4.883939232976777e-05, + "loss": 0.371, + "step": 4198 + }, + { + "epoch": 0.4979248191628128, + "grad_norm": 1.6278635107735988, + "learning_rate": 4.883866934285914e-05, + "loss": 0.754, + "step": 4199 + }, + { + "epoch": 0.49804340092493776, + "grad_norm": 1.5076234190528521, + "learning_rate": 4.883794613618722e-05, + "loss": 0.7768, + "step": 4200 + }, + { + "epoch": 0.49816198268706274, + "grad_norm": 1.872306013792876, + "learning_rate": 4.883722270975867e-05, + "loss": 0.9127, + "step": 4201 + }, + { + "epoch": 0.4982805644491877, + "grad_norm": 1.4074893571222526, + "learning_rate": 4.8836499063580146e-05, + "loss": 0.5926, + "step": 4202 + }, + { + "epoch": 0.4983991462113127, + "grad_norm": 1.5216243608245041, + "learning_rate": 4.883577519765833e-05, + "loss": 0.7436, + "step": 4203 + }, + { + "epoch": 0.4985177279734377, + "grad_norm": 1.4405062317053228, + "learning_rate": 4.883505111199991e-05, + "loss": 0.7063, + "step": 4204 + }, + { + "epoch": 0.49863630973556267, + "grad_norm": 1.779269922314004, + "learning_rate": 4.883432680661154e-05, + "loss": 0.8743, + "step": 4205 + }, + { + "epoch": 0.49875489149768765, + "grad_norm": 1.4789425910200547, + "learning_rate": 4.88336022814999e-05, + "loss": 0.7553, + "step": 4206 + }, + { + "epoch": 0.4988734732598126, + "grad_norm": 1.7794041384353698, + "learning_rate": 4.883287753667168e-05, + "loss": 0.619, + "step": 4207 + }, + { + "epoch": 0.4989920550219376, + "grad_norm": 1.5933419171339571, + "learning_rate": 4.883215257213355e-05, + "loss": 0.7724, + "step": 4208 + }, + { + "epoch": 0.4991106367840626, + "grad_norm": 1.4844177478767082, + "learning_rate": 4.88314273878922e-05, + "loss": 0.5486, + "step": 4209 + }, + { + "epoch": 0.49922921854618757, + "grad_norm": 1.1878547377186868, + "learning_rate": 4.8830701983954315e-05, + "loss": 0.5809, + "step": 4210 + }, + { + "epoch": 0.4993478003083126, + "grad_norm": 1.3873467405899866, + "learning_rate": 4.882997636032658e-05, + "loss": 0.702, + "step": 4211 + }, + { + "epoch": 0.4994663820704376, + "grad_norm": 1.6044905952323638, + "learning_rate": 4.8829250517015684e-05, + "loss": 0.7674, + "step": 4212 + }, + { + "epoch": 0.49958496383256257, + "grad_norm": 1.2884582427232414, + "learning_rate": 4.8828524454028316e-05, + "loss": 0.5025, + "step": 4213 + }, + { + "epoch": 0.49970354559468755, + "grad_norm": 1.4136532623261207, + "learning_rate": 4.882779817137118e-05, + "loss": 0.7417, + "step": 4214 + }, + { + "epoch": 0.49982212735681253, + "grad_norm": 1.3912320284418225, + "learning_rate": 4.882707166905096e-05, + "loss": 0.5058, + "step": 4215 + }, + { + "epoch": 0.4999407091189375, + "grad_norm": 1.3825235372558804, + "learning_rate": 4.8826344947074366e-05, + "loss": 0.6605, + "step": 4216 + }, + { + "epoch": 0.5000592908810625, + "grad_norm": 1.3774562264883885, + "learning_rate": 4.882561800544808e-05, + "loss": 0.5235, + "step": 4217 + }, + { + "epoch": 0.5001778726431875, + "grad_norm": 1.801641450892196, + "learning_rate": 4.882489084417882e-05, + "loss": 0.8543, + "step": 4218 + }, + { + "epoch": 0.5002964544053125, + "grad_norm": 1.7233188633556076, + "learning_rate": 4.882416346327328e-05, + "loss": 0.7866, + "step": 4219 + }, + { + "epoch": 0.5004150361674374, + "grad_norm": 1.5515120599469079, + "learning_rate": 4.8823435862738164e-05, + "loss": 0.8005, + "step": 4220 + }, + { + "epoch": 0.5005336179295624, + "grad_norm": 1.5546421398652037, + "learning_rate": 4.882270804258018e-05, + "loss": 0.7, + "step": 4221 + }, + { + "epoch": 0.5006521996916874, + "grad_norm": 1.3279820447450426, + "learning_rate": 4.882198000280605e-05, + "loss": 0.5414, + "step": 4222 + }, + { + "epoch": 0.5007707814538124, + "grad_norm": 1.6117348888181129, + "learning_rate": 4.882125174342247e-05, + "loss": 0.7985, + "step": 4223 + }, + { + "epoch": 0.5008893632159374, + "grad_norm": 1.3126769099056539, + "learning_rate": 4.882052326443617e-05, + "loss": 0.5891, + "step": 4224 + }, + { + "epoch": 0.5010079449780623, + "grad_norm": 1.2497594271467203, + "learning_rate": 4.881979456585385e-05, + "loss": 0.6086, + "step": 4225 + }, + { + "epoch": 0.5011265267401873, + "grad_norm": 1.4286193726464136, + "learning_rate": 4.881906564768224e-05, + "loss": 0.4473, + "step": 4226 + }, + { + "epoch": 0.5012451085023123, + "grad_norm": 1.514045026920415, + "learning_rate": 4.881833650992804e-05, + "loss": 0.4986, + "step": 4227 + }, + { + "epoch": 0.5013636902644373, + "grad_norm": 1.2983605260850395, + "learning_rate": 4.8817607152598e-05, + "loss": 0.5017, + "step": 4228 + }, + { + "epoch": 0.5014822720265624, + "grad_norm": 1.5663459204185326, + "learning_rate": 4.881687757569882e-05, + "loss": 0.5535, + "step": 4229 + }, + { + "epoch": 0.5016008537886874, + "grad_norm": 1.5590348755485475, + "learning_rate": 4.8816147779237233e-05, + "loss": 0.616, + "step": 4230 + }, + { + "epoch": 0.5017194355508123, + "grad_norm": 1.6849064477247209, + "learning_rate": 4.881541776321997e-05, + "loss": 0.5921, + "step": 4231 + }, + { + "epoch": 0.5018380173129373, + "grad_norm": 1.3703863095399684, + "learning_rate": 4.881468752765377e-05, + "loss": 0.5513, + "step": 4232 + }, + { + "epoch": 0.5019565990750623, + "grad_norm": 1.7246021852827493, + "learning_rate": 4.881395707254535e-05, + "loss": 0.5612, + "step": 4233 + }, + { + "epoch": 0.5020751808371873, + "grad_norm": 1.6816272680633655, + "learning_rate": 4.8813226397901445e-05, + "loss": 0.5198, + "step": 4234 + }, + { + "epoch": 0.5021937625993123, + "grad_norm": 1.356148412104441, + "learning_rate": 4.881249550372879e-05, + "loss": 0.5843, + "step": 4235 + }, + { + "epoch": 0.5023123443614372, + "grad_norm": 1.996944208667317, + "learning_rate": 4.8811764390034136e-05, + "loss": 0.9025, + "step": 4236 + }, + { + "epoch": 0.5024309261235622, + "grad_norm": 1.6982715690074102, + "learning_rate": 4.881103305682421e-05, + "loss": 0.6618, + "step": 4237 + }, + { + "epoch": 0.5025495078856872, + "grad_norm": 1.6151975918804486, + "learning_rate": 4.881030150410576e-05, + "loss": 0.7062, + "step": 4238 + }, + { + "epoch": 0.5026680896478122, + "grad_norm": 1.8491658047589286, + "learning_rate": 4.880956973188553e-05, + "loss": 0.5378, + "step": 4239 + }, + { + "epoch": 0.5027866714099372, + "grad_norm": 1.4313157141007176, + "learning_rate": 4.8808837740170255e-05, + "loss": 0.6601, + "step": 4240 + }, + { + "epoch": 0.5029052531720621, + "grad_norm": 1.740632426834221, + "learning_rate": 4.88081055289667e-05, + "loss": 0.5644, + "step": 4241 + }, + { + "epoch": 0.5030238349341871, + "grad_norm": 1.7504321603976716, + "learning_rate": 4.880737309828161e-05, + "loss": 0.8082, + "step": 4242 + }, + { + "epoch": 0.5031424166963121, + "grad_norm": 1.4353924714650723, + "learning_rate": 4.8806640448121734e-05, + "loss": 0.5907, + "step": 4243 + }, + { + "epoch": 0.5032609984584371, + "grad_norm": 1.6107280250245621, + "learning_rate": 4.8805907578493815e-05, + "loss": 0.6974, + "step": 4244 + }, + { + "epoch": 0.5033795802205621, + "grad_norm": 1.41990326212491, + "learning_rate": 4.880517448940463e-05, + "loss": 0.4632, + "step": 4245 + }, + { + "epoch": 0.503498161982687, + "grad_norm": 1.5788742408691216, + "learning_rate": 4.880444118086093e-05, + "loss": 0.6662, + "step": 4246 + }, + { + "epoch": 0.503616743744812, + "grad_norm": 1.6118784305872118, + "learning_rate": 4.880370765286947e-05, + "loss": 0.5502, + "step": 4247 + }, + { + "epoch": 0.503735325506937, + "grad_norm": 2.1054420277646564, + "learning_rate": 4.8802973905437014e-05, + "loss": 1.0403, + "step": 4248 + }, + { + "epoch": 0.503853907269062, + "grad_norm": 1.4594871542109626, + "learning_rate": 4.880223993857033e-05, + "loss": 0.7616, + "step": 4249 + }, + { + "epoch": 0.503972489031187, + "grad_norm": 1.462582950889475, + "learning_rate": 4.880150575227619e-05, + "loss": 0.6323, + "step": 4250 + }, + { + "epoch": 0.504091070793312, + "grad_norm": 1.7575824499643078, + "learning_rate": 4.8800771346561344e-05, + "loss": 0.7085, + "step": 4251 + }, + { + "epoch": 0.5042096525554369, + "grad_norm": 1.3885744676303227, + "learning_rate": 4.8800036721432576e-05, + "loss": 0.5506, + "step": 4252 + }, + { + "epoch": 0.5043282343175619, + "grad_norm": 1.573913101757252, + "learning_rate": 4.8799301876896654e-05, + "loss": 0.5521, + "step": 4253 + }, + { + "epoch": 0.504446816079687, + "grad_norm": 1.6100738796254352, + "learning_rate": 4.879856681296035e-05, + "loss": 0.7821, + "step": 4254 + }, + { + "epoch": 0.504565397841812, + "grad_norm": 1.2580862955276049, + "learning_rate": 4.8797831529630444e-05, + "loss": 0.5561, + "step": 4255 + }, + { + "epoch": 0.504683979603937, + "grad_norm": 1.2241080064586294, + "learning_rate": 4.8797096026913716e-05, + "loss": 0.4425, + "step": 4256 + }, + { + "epoch": 0.5048025613660619, + "grad_norm": 1.7110876613388555, + "learning_rate": 4.8796360304816946e-05, + "loss": 0.6743, + "step": 4257 + }, + { + "epoch": 0.5049211431281869, + "grad_norm": 1.357456952015221, + "learning_rate": 4.8795624363346915e-05, + "loss": 0.7113, + "step": 4258 + }, + { + "epoch": 0.5050397248903119, + "grad_norm": 1.0427786329694098, + "learning_rate": 4.879488820251041e-05, + "loss": 0.4747, + "step": 4259 + }, + { + "epoch": 0.5051583066524369, + "grad_norm": 1.1352901158948334, + "learning_rate": 4.879415182231421e-05, + "loss": 0.4775, + "step": 4260 + }, + { + "epoch": 0.5052768884145619, + "grad_norm": 1.721441822370237, + "learning_rate": 4.879341522276512e-05, + "loss": 0.6987, + "step": 4261 + }, + { + "epoch": 0.5053954701766868, + "grad_norm": 1.5464069681153891, + "learning_rate": 4.87926784038699e-05, + "loss": 0.5903, + "step": 4262 + }, + { + "epoch": 0.5055140519388118, + "grad_norm": 1.5314737576847943, + "learning_rate": 4.879194136563537e-05, + "loss": 0.6618, + "step": 4263 + }, + { + "epoch": 0.5056326337009368, + "grad_norm": 1.4400615373627441, + "learning_rate": 4.8791204108068325e-05, + "loss": 0.5703, + "step": 4264 + }, + { + "epoch": 0.5057512154630618, + "grad_norm": 1.5214081666390176, + "learning_rate": 4.879046663117554e-05, + "loss": 0.6375, + "step": 4265 + }, + { + "epoch": 0.5058697972251868, + "grad_norm": 1.4304312983777487, + "learning_rate": 4.878972893496383e-05, + "loss": 0.4897, + "step": 4266 + }, + { + "epoch": 0.5059883789873117, + "grad_norm": 1.525342411241944, + "learning_rate": 4.8788991019439994e-05, + "loss": 0.4666, + "step": 4267 + }, + { + "epoch": 0.5061069607494367, + "grad_norm": 1.6547747495172347, + "learning_rate": 4.878825288461083e-05, + "loss": 0.8045, + "step": 4268 + }, + { + "epoch": 0.5062255425115617, + "grad_norm": 1.7061044359475626, + "learning_rate": 4.8787514530483154e-05, + "loss": 0.8173, + "step": 4269 + }, + { + "epoch": 0.5063441242736867, + "grad_norm": 1.6459805379134642, + "learning_rate": 4.878677595706376e-05, + "loss": 0.6463, + "step": 4270 + }, + { + "epoch": 0.5064627060358117, + "grad_norm": 1.5001298552477893, + "learning_rate": 4.878603716435946e-05, + "loss": 0.431, + "step": 4271 + }, + { + "epoch": 0.5065812877979367, + "grad_norm": 1.5647085445165811, + "learning_rate": 4.8785298152377056e-05, + "loss": 0.7245, + "step": 4272 + }, + { + "epoch": 0.5066998695600616, + "grad_norm": 2.014055464191367, + "learning_rate": 4.878455892112338e-05, + "loss": 0.6241, + "step": 4273 + }, + { + "epoch": 0.5068184513221866, + "grad_norm": 1.6431332446322693, + "learning_rate": 4.878381947060524e-05, + "loss": 0.7089, + "step": 4274 + }, + { + "epoch": 0.5069370330843116, + "grad_norm": 1.5592536658261191, + "learning_rate": 4.878307980082945e-05, + "loss": 0.6915, + "step": 4275 + }, + { + "epoch": 0.5070556148464366, + "grad_norm": 1.3562566241530667, + "learning_rate": 4.878233991180282e-05, + "loss": 0.5154, + "step": 4276 + }, + { + "epoch": 0.5071741966085616, + "grad_norm": 1.795781477147339, + "learning_rate": 4.8781599803532186e-05, + "loss": 0.6748, + "step": 4277 + }, + { + "epoch": 0.5072927783706865, + "grad_norm": 1.2232488770553969, + "learning_rate": 4.8780859476024365e-05, + "loss": 0.4768, + "step": 4278 + }, + { + "epoch": 0.5074113601328116, + "grad_norm": 1.5798784371533379, + "learning_rate": 4.878011892928618e-05, + "loss": 0.5479, + "step": 4279 + }, + { + "epoch": 0.5075299418949366, + "grad_norm": 1.5229794354372765, + "learning_rate": 4.877937816332446e-05, + "loss": 0.6582, + "step": 4280 + }, + { + "epoch": 0.5076485236570616, + "grad_norm": 1.5747052056397377, + "learning_rate": 4.8778637178146034e-05, + "loss": 0.7234, + "step": 4281 + }, + { + "epoch": 0.5077671054191866, + "grad_norm": 1.6317492098254287, + "learning_rate": 4.8777895973757735e-05, + "loss": 0.6669, + "step": 4282 + }, + { + "epoch": 0.5078856871813116, + "grad_norm": 1.4086505952961939, + "learning_rate": 4.877715455016639e-05, + "loss": 0.4567, + "step": 4283 + }, + { + "epoch": 0.5080042689434365, + "grad_norm": 1.7872303841028734, + "learning_rate": 4.877641290737884e-05, + "loss": 0.7972, + "step": 4284 + }, + { + "epoch": 0.5081228507055615, + "grad_norm": 1.839904312015274, + "learning_rate": 4.8775671045401917e-05, + "loss": 0.7262, + "step": 4285 + }, + { + "epoch": 0.5082414324676865, + "grad_norm": 1.7398210483128675, + "learning_rate": 4.877492896424247e-05, + "loss": 0.9163, + "step": 4286 + }, + { + "epoch": 0.5083600142298115, + "grad_norm": 1.2991097468156927, + "learning_rate": 4.8774186663907324e-05, + "loss": 0.454, + "step": 4287 + }, + { + "epoch": 0.5084785959919365, + "grad_norm": 1.448745979056906, + "learning_rate": 4.877344414440333e-05, + "loss": 0.6695, + "step": 4288 + }, + { + "epoch": 0.5085971777540614, + "grad_norm": 1.204904170374811, + "learning_rate": 4.8772701405737344e-05, + "loss": 0.4157, + "step": 4289 + }, + { + "epoch": 0.5087157595161864, + "grad_norm": 1.7133064179963, + "learning_rate": 4.877195844791619e-05, + "loss": 0.7037, + "step": 4290 + }, + { + "epoch": 0.5088343412783114, + "grad_norm": 1.9414203542059185, + "learning_rate": 4.877121527094674e-05, + "loss": 0.6867, + "step": 4291 + }, + { + "epoch": 0.5089529230404364, + "grad_norm": 1.7554749595806534, + "learning_rate": 4.877047187483583e-05, + "loss": 0.6805, + "step": 4292 + }, + { + "epoch": 0.5090715048025614, + "grad_norm": 1.3328127396353469, + "learning_rate": 4.876972825959032e-05, + "loss": 0.5189, + "step": 4293 + }, + { + "epoch": 0.5091900865646863, + "grad_norm": 1.5364637849081937, + "learning_rate": 4.876898442521707e-05, + "loss": 0.5138, + "step": 4294 + }, + { + "epoch": 0.5093086683268113, + "grad_norm": 1.67316819522203, + "learning_rate": 4.876824037172292e-05, + "loss": 0.7078, + "step": 4295 + }, + { + "epoch": 0.5094272500889363, + "grad_norm": 1.8755695795416718, + "learning_rate": 4.876749609911475e-05, + "loss": 0.6817, + "step": 4296 + }, + { + "epoch": 0.5095458318510613, + "grad_norm": 1.6341065759798845, + "learning_rate": 4.8766751607399406e-05, + "loss": 0.7713, + "step": 4297 + }, + { + "epoch": 0.5096644136131863, + "grad_norm": 1.9452685899135145, + "learning_rate": 4.876600689658376e-05, + "loss": 0.7185, + "step": 4298 + }, + { + "epoch": 0.5097829953753112, + "grad_norm": 1.5140360276676623, + "learning_rate": 4.876526196667467e-05, + "loss": 0.9291, + "step": 4299 + }, + { + "epoch": 0.5099015771374362, + "grad_norm": 1.4011209200325194, + "learning_rate": 4.876451681767901e-05, + "loss": 0.6767, + "step": 4300 + }, + { + "epoch": 0.5100201588995612, + "grad_norm": 1.1976750679777761, + "learning_rate": 4.876377144960365e-05, + "loss": 0.4699, + "step": 4301 + }, + { + "epoch": 0.5101387406616862, + "grad_norm": 1.490329002873059, + "learning_rate": 4.876302586245545e-05, + "loss": 0.5952, + "step": 4302 + }, + { + "epoch": 0.5102573224238112, + "grad_norm": 1.2275232555015385, + "learning_rate": 4.876228005624129e-05, + "loss": 0.5118, + "step": 4303 + }, + { + "epoch": 0.5103759041859363, + "grad_norm": 1.3172748855868155, + "learning_rate": 4.876153403096806e-05, + "loss": 0.6384, + "step": 4304 + }, + { + "epoch": 0.5104944859480612, + "grad_norm": 1.336336256458007, + "learning_rate": 4.876078778664262e-05, + "loss": 0.5737, + "step": 4305 + }, + { + "epoch": 0.5106130677101862, + "grad_norm": 1.729024748724994, + "learning_rate": 4.876004132327185e-05, + "loss": 0.8029, + "step": 4306 + }, + { + "epoch": 0.5107316494723112, + "grad_norm": 1.3874971652252845, + "learning_rate": 4.8759294640862644e-05, + "loss": 0.537, + "step": 4307 + }, + { + "epoch": 0.5108502312344362, + "grad_norm": 1.7491461912373207, + "learning_rate": 4.875854773942187e-05, + "loss": 0.8612, + "step": 4308 + }, + { + "epoch": 0.5109688129965612, + "grad_norm": 1.882271306943973, + "learning_rate": 4.875780061895642e-05, + "loss": 0.8454, + "step": 4309 + }, + { + "epoch": 0.5110873947586861, + "grad_norm": 1.3244913070521658, + "learning_rate": 4.875705327947319e-05, + "loss": 0.5742, + "step": 4310 + }, + { + "epoch": 0.5112059765208111, + "grad_norm": 1.4003872733668077, + "learning_rate": 4.875630572097904e-05, + "loss": 0.5193, + "step": 4311 + }, + { + "epoch": 0.5113245582829361, + "grad_norm": 1.5406857483333913, + "learning_rate": 4.87555579434809e-05, + "loss": 0.7657, + "step": 4312 + }, + { + "epoch": 0.5114431400450611, + "grad_norm": 1.4874976336138959, + "learning_rate": 4.8754809946985655e-05, + "loss": 0.6416, + "step": 4313 + }, + { + "epoch": 0.5115617218071861, + "grad_norm": 1.669455380926477, + "learning_rate": 4.8754061731500175e-05, + "loss": 0.68, + "step": 4314 + }, + { + "epoch": 0.511680303569311, + "grad_norm": 1.661920317808168, + "learning_rate": 4.875331329703139e-05, + "loss": 0.825, + "step": 4315 + }, + { + "epoch": 0.511798885331436, + "grad_norm": 1.7875666062831155, + "learning_rate": 4.875256464358618e-05, + "loss": 0.7185, + "step": 4316 + }, + { + "epoch": 0.511917467093561, + "grad_norm": 1.3440039791447256, + "learning_rate": 4.875181577117144e-05, + "loss": 0.53, + "step": 4317 + }, + { + "epoch": 0.512036048855686, + "grad_norm": 1.6952333574965321, + "learning_rate": 4.87510666797941e-05, + "loss": 0.8539, + "step": 4318 + }, + { + "epoch": 0.512154630617811, + "grad_norm": 1.694943913024686, + "learning_rate": 4.875031736946104e-05, + "loss": 0.6808, + "step": 4319 + }, + { + "epoch": 0.512273212379936, + "grad_norm": 1.4261773998506648, + "learning_rate": 4.8749567840179185e-05, + "loss": 0.4835, + "step": 4320 + }, + { + "epoch": 0.5123917941420609, + "grad_norm": 1.7274971706842293, + "learning_rate": 4.8748818091955434e-05, + "loss": 0.6794, + "step": 4321 + }, + { + "epoch": 0.5125103759041859, + "grad_norm": 1.795531929554698, + "learning_rate": 4.8748068124796706e-05, + "loss": 0.8452, + "step": 4322 + }, + { + "epoch": 0.5126289576663109, + "grad_norm": 1.407552464228538, + "learning_rate": 4.8747317938709905e-05, + "loss": 0.6288, + "step": 4323 + }, + { + "epoch": 0.5127475394284359, + "grad_norm": 1.6506024859839095, + "learning_rate": 4.8746567533701957e-05, + "loss": 0.6969, + "step": 4324 + }, + { + "epoch": 0.5128661211905609, + "grad_norm": 1.2598084894610087, + "learning_rate": 4.8745816909779786e-05, + "loss": 0.5747, + "step": 4325 + }, + { + "epoch": 0.5129847029526858, + "grad_norm": 1.2269588253904922, + "learning_rate": 4.874506606695029e-05, + "loss": 0.4926, + "step": 4326 + }, + { + "epoch": 0.5131032847148108, + "grad_norm": 1.7530810964390664, + "learning_rate": 4.874431500522041e-05, + "loss": 0.7086, + "step": 4327 + }, + { + "epoch": 0.5132218664769358, + "grad_norm": 1.3065926719476002, + "learning_rate": 4.8743563724597056e-05, + "loss": 0.5385, + "step": 4328 + }, + { + "epoch": 0.5133404482390609, + "grad_norm": 1.2566848999354667, + "learning_rate": 4.8742812225087164e-05, + "loss": 0.3724, + "step": 4329 + }, + { + "epoch": 0.5134590300011859, + "grad_norm": 1.8048932031989933, + "learning_rate": 4.874206050669766e-05, + "loss": 0.5741, + "step": 4330 + }, + { + "epoch": 0.5135776117633108, + "grad_norm": 1.4327277197720851, + "learning_rate": 4.874130856943547e-05, + "loss": 0.4936, + "step": 4331 + }, + { + "epoch": 0.5136961935254358, + "grad_norm": 2.0533666083397892, + "learning_rate": 4.8740556413307536e-05, + "loss": 0.6106, + "step": 4332 + }, + { + "epoch": 0.5138147752875608, + "grad_norm": 1.7843774945246158, + "learning_rate": 4.873980403832078e-05, + "loss": 0.7307, + "step": 4333 + }, + { + "epoch": 0.5139333570496858, + "grad_norm": 1.4792530379858109, + "learning_rate": 4.873905144448214e-05, + "loss": 0.6169, + "step": 4334 + }, + { + "epoch": 0.5140519388118108, + "grad_norm": 1.835368481503766, + "learning_rate": 4.873829863179856e-05, + "loss": 0.7943, + "step": 4335 + }, + { + "epoch": 0.5141705205739358, + "grad_norm": 1.7910123407167908, + "learning_rate": 4.873754560027697e-05, + "loss": 0.4681, + "step": 4336 + }, + { + "epoch": 0.5142891023360607, + "grad_norm": 1.6340724223139678, + "learning_rate": 4.873679234992432e-05, + "loss": 0.7023, + "step": 4337 + }, + { + "epoch": 0.5144076840981857, + "grad_norm": 1.5833296410382676, + "learning_rate": 4.873603888074756e-05, + "loss": 0.613, + "step": 4338 + }, + { + "epoch": 0.5145262658603107, + "grad_norm": 1.18941440568035, + "learning_rate": 4.873528519275362e-05, + "loss": 0.4358, + "step": 4339 + }, + { + "epoch": 0.5146448476224357, + "grad_norm": 1.239768102946135, + "learning_rate": 4.8734531285949464e-05, + "loss": 0.5141, + "step": 4340 + }, + { + "epoch": 0.5147634293845607, + "grad_norm": 1.3547619010597018, + "learning_rate": 4.873377716034203e-05, + "loss": 0.6689, + "step": 4341 + }, + { + "epoch": 0.5148820111466856, + "grad_norm": 1.513421177250175, + "learning_rate": 4.8733022815938276e-05, + "loss": 0.7213, + "step": 4342 + }, + { + "epoch": 0.5150005929088106, + "grad_norm": 1.452987781700197, + "learning_rate": 4.873226825274516e-05, + "loss": 0.6429, + "step": 4343 + }, + { + "epoch": 0.5151191746709356, + "grad_norm": 1.479006782101966, + "learning_rate": 4.8731513470769624e-05, + "loss": 0.7134, + "step": 4344 + }, + { + "epoch": 0.5152377564330606, + "grad_norm": 1.4103549864026725, + "learning_rate": 4.873075847001864e-05, + "loss": 0.451, + "step": 4345 + }, + { + "epoch": 0.5153563381951856, + "grad_norm": 1.3421869823528818, + "learning_rate": 4.873000325049916e-05, + "loss": 0.4948, + "step": 4346 + }, + { + "epoch": 0.5154749199573105, + "grad_norm": 1.212312606478388, + "learning_rate": 4.8729247812218154e-05, + "loss": 0.4737, + "step": 4347 + }, + { + "epoch": 0.5155935017194355, + "grad_norm": 1.423683265756171, + "learning_rate": 4.872849215518258e-05, + "loss": 0.637, + "step": 4348 + }, + { + "epoch": 0.5157120834815605, + "grad_norm": 2.2363116010565225, + "learning_rate": 4.8727736279399404e-05, + "loss": 0.7408, + "step": 4349 + }, + { + "epoch": 0.5158306652436855, + "grad_norm": 1.2298470032179871, + "learning_rate": 4.87269801848756e-05, + "loss": 0.5297, + "step": 4350 + }, + { + "epoch": 0.5159492470058105, + "grad_norm": 1.7169089715602739, + "learning_rate": 4.872622387161814e-05, + "loss": 0.7142, + "step": 4351 + }, + { + "epoch": 0.5160678287679354, + "grad_norm": 1.8538833576270843, + "learning_rate": 4.872546733963398e-05, + "loss": 0.5517, + "step": 4352 + }, + { + "epoch": 0.5161864105300604, + "grad_norm": 2.055110615270066, + "learning_rate": 4.8724710588930114e-05, + "loss": 0.7678, + "step": 4353 + }, + { + "epoch": 0.5163049922921855, + "grad_norm": 1.7505100473095558, + "learning_rate": 4.87239536195135e-05, + "loss": 0.6489, + "step": 4354 + }, + { + "epoch": 0.5164235740543105, + "grad_norm": 1.719659719884238, + "learning_rate": 4.872319643139113e-05, + "loss": 0.5574, + "step": 4355 + }, + { + "epoch": 0.5165421558164355, + "grad_norm": 1.2815322830499518, + "learning_rate": 4.8722439024569986e-05, + "loss": 0.5784, + "step": 4356 + }, + { + "epoch": 0.5166607375785605, + "grad_norm": 1.4048186273630272, + "learning_rate": 4.872168139905704e-05, + "loss": 0.6516, + "step": 4357 + }, + { + "epoch": 0.5167793193406854, + "grad_norm": 1.4863377727791536, + "learning_rate": 4.872092355485928e-05, + "loss": 0.7097, + "step": 4358 + }, + { + "epoch": 0.5168979011028104, + "grad_norm": 1.402777851749483, + "learning_rate": 4.872016549198369e-05, + "loss": 0.5733, + "step": 4359 + }, + { + "epoch": 0.5170164828649354, + "grad_norm": 1.4310797835071725, + "learning_rate": 4.871940721043727e-05, + "loss": 0.7479, + "step": 4360 + }, + { + "epoch": 0.5171350646270604, + "grad_norm": 1.4056118446343133, + "learning_rate": 4.8718648710227e-05, + "loss": 0.5605, + "step": 4361 + }, + { + "epoch": 0.5172536463891854, + "grad_norm": 1.6034416104079923, + "learning_rate": 4.8717889991359873e-05, + "loss": 0.599, + "step": 4362 + }, + { + "epoch": 0.5173722281513103, + "grad_norm": 1.421445906078631, + "learning_rate": 4.871713105384288e-05, + "loss": 0.5901, + "step": 4363 + }, + { + "epoch": 0.5174908099134353, + "grad_norm": 1.733640423948289, + "learning_rate": 4.871637189768303e-05, + "loss": 0.6949, + "step": 4364 + }, + { + "epoch": 0.5176093916755603, + "grad_norm": 1.2853813523999746, + "learning_rate": 4.8715612522887314e-05, + "loss": 0.5228, + "step": 4365 + }, + { + "epoch": 0.5177279734376853, + "grad_norm": 1.8764146874812087, + "learning_rate": 4.871485292946273e-05, + "loss": 0.6983, + "step": 4366 + }, + { + "epoch": 0.5178465551998103, + "grad_norm": 1.2681443587754253, + "learning_rate": 4.871409311741628e-05, + "loss": 0.4408, + "step": 4367 + }, + { + "epoch": 0.5179651369619352, + "grad_norm": 1.6384801968529032, + "learning_rate": 4.8713333086754975e-05, + "loss": 0.6928, + "step": 4368 + }, + { + "epoch": 0.5180837187240602, + "grad_norm": 1.3750824198102838, + "learning_rate": 4.871257283748582e-05, + "loss": 0.5717, + "step": 4369 + }, + { + "epoch": 0.5182023004861852, + "grad_norm": 1.5850602281076362, + "learning_rate": 4.871181236961582e-05, + "loss": 0.6826, + "step": 4370 + }, + { + "epoch": 0.5183208822483102, + "grad_norm": 1.615717861638507, + "learning_rate": 4.871105168315199e-05, + "loss": 0.5857, + "step": 4371 + }, + { + "epoch": 0.5184394640104352, + "grad_norm": 1.6014217047581945, + "learning_rate": 4.871029077810133e-05, + "loss": 0.6551, + "step": 4372 + }, + { + "epoch": 0.5185580457725601, + "grad_norm": 1.596441921016555, + "learning_rate": 4.8709529654470874e-05, + "loss": 0.6224, + "step": 4373 + }, + { + "epoch": 0.5186766275346851, + "grad_norm": 1.6422005677022933, + "learning_rate": 4.870876831226763e-05, + "loss": 0.6878, + "step": 4374 + }, + { + "epoch": 0.5187952092968101, + "grad_norm": 1.1944226113604948, + "learning_rate": 4.8708006751498607e-05, + "loss": 0.4179, + "step": 4375 + }, + { + "epoch": 0.5189137910589351, + "grad_norm": 1.7688920948212643, + "learning_rate": 4.870724497217084e-05, + "loss": 0.757, + "step": 4376 + }, + { + "epoch": 0.5190323728210601, + "grad_norm": 1.567790241085552, + "learning_rate": 4.870648297429134e-05, + "loss": 0.6371, + "step": 4377 + }, + { + "epoch": 0.519150954583185, + "grad_norm": 1.3964084687908715, + "learning_rate": 4.870572075786715e-05, + "loss": 0.5215, + "step": 4378 + }, + { + "epoch": 0.5192695363453101, + "grad_norm": 1.5268418666301682, + "learning_rate": 4.8704958322905284e-05, + "loss": 0.653, + "step": 4379 + }, + { + "epoch": 0.5193881181074351, + "grad_norm": 1.4719155273625835, + "learning_rate": 4.8704195669412754e-05, + "loss": 0.5743, + "step": 4380 + }, + { + "epoch": 0.5195066998695601, + "grad_norm": 1.4249703400761256, + "learning_rate": 4.8703432797396626e-05, + "loss": 0.6077, + "step": 4381 + }, + { + "epoch": 0.5196252816316851, + "grad_norm": 1.584314998404907, + "learning_rate": 4.870266970686391e-05, + "loss": 0.7967, + "step": 4382 + }, + { + "epoch": 0.5197438633938101, + "grad_norm": 1.5201360748304293, + "learning_rate": 4.870190639782164e-05, + "loss": 0.6638, + "step": 4383 + }, + { + "epoch": 0.519862445155935, + "grad_norm": 1.5792573939940566, + "learning_rate": 4.870114287027686e-05, + "loss": 0.6002, + "step": 4384 + }, + { + "epoch": 0.51998102691806, + "grad_norm": 1.3670244656283548, + "learning_rate": 4.870037912423661e-05, + "loss": 0.4267, + "step": 4385 + }, + { + "epoch": 0.520099608680185, + "grad_norm": 1.7266240762366005, + "learning_rate": 4.8699615159707925e-05, + "loss": 0.5855, + "step": 4386 + }, + { + "epoch": 0.52021819044231, + "grad_norm": 2.0213895654162513, + "learning_rate": 4.8698850976697854e-05, + "loss": 0.8816, + "step": 4387 + }, + { + "epoch": 0.520336772204435, + "grad_norm": 1.608368658650489, + "learning_rate": 4.8698086575213436e-05, + "loss": 0.6886, + "step": 4388 + }, + { + "epoch": 0.52045535396656, + "grad_norm": 1.4906512669942593, + "learning_rate": 4.869732195526172e-05, + "loss": 0.5743, + "step": 4389 + }, + { + "epoch": 0.5205739357286849, + "grad_norm": 1.231440785445485, + "learning_rate": 4.869655711684975e-05, + "loss": 0.4757, + "step": 4390 + }, + { + "epoch": 0.5206925174908099, + "grad_norm": 1.6043355989429942, + "learning_rate": 4.869579205998459e-05, + "loss": 0.5134, + "step": 4391 + }, + { + "epoch": 0.5208110992529349, + "grad_norm": 1.476836755291894, + "learning_rate": 4.869502678467329e-05, + "loss": 0.5482, + "step": 4392 + }, + { + "epoch": 0.5209296810150599, + "grad_norm": 1.5454196814220742, + "learning_rate": 4.869426129092289e-05, + "loss": 0.5682, + "step": 4393 + }, + { + "epoch": 0.5210482627771849, + "grad_norm": 1.5800482959728221, + "learning_rate": 4.8693495578740455e-05, + "loss": 0.5445, + "step": 4394 + }, + { + "epoch": 0.5211668445393098, + "grad_norm": 2.0692935688005814, + "learning_rate": 4.869272964813305e-05, + "loss": 0.7565, + "step": 4395 + }, + { + "epoch": 0.5212854263014348, + "grad_norm": 1.4852198544233544, + "learning_rate": 4.869196349910773e-05, + "loss": 0.6052, + "step": 4396 + }, + { + "epoch": 0.5214040080635598, + "grad_norm": 1.660823847328371, + "learning_rate": 4.869119713167156e-05, + "loss": 0.5963, + "step": 4397 + }, + { + "epoch": 0.5215225898256848, + "grad_norm": 1.243286114023559, + "learning_rate": 4.869043054583161e-05, + "loss": 0.447, + "step": 4398 + }, + { + "epoch": 0.5216411715878098, + "grad_norm": 1.678068850240401, + "learning_rate": 4.868966374159494e-05, + "loss": 0.6476, + "step": 4399 + }, + { + "epoch": 0.5217597533499347, + "grad_norm": 1.445100279676624, + "learning_rate": 4.8688896718968616e-05, + "loss": 0.6506, + "step": 4400 + }, + { + "epoch": 0.5218783351120597, + "grad_norm": 2.038456547215106, + "learning_rate": 4.868812947795971e-05, + "loss": 0.8992, + "step": 4401 + }, + { + "epoch": 0.5219969168741847, + "grad_norm": 1.5332951920831164, + "learning_rate": 4.868736201857531e-05, + "loss": 0.4276, + "step": 4402 + }, + { + "epoch": 0.5221154986363097, + "grad_norm": 1.5139872669907983, + "learning_rate": 4.868659434082247e-05, + "loss": 0.71, + "step": 4403 + }, + { + "epoch": 0.5222340803984348, + "grad_norm": 1.6006944262613803, + "learning_rate": 4.868582644470828e-05, + "loss": 0.6845, + "step": 4404 + }, + { + "epoch": 0.5223526621605598, + "grad_norm": 1.7769887828234796, + "learning_rate": 4.868505833023982e-05, + "loss": 0.7685, + "step": 4405 + }, + { + "epoch": 0.5224712439226847, + "grad_norm": 1.3576119350879052, + "learning_rate": 4.868428999742416e-05, + "loss": 0.4362, + "step": 4406 + }, + { + "epoch": 0.5225898256848097, + "grad_norm": 1.308231904349106, + "learning_rate": 4.868352144626839e-05, + "loss": 0.5516, + "step": 4407 + }, + { + "epoch": 0.5227084074469347, + "grad_norm": 1.5768439955419067, + "learning_rate": 4.86827526767796e-05, + "loss": 0.618, + "step": 4408 + }, + { + "epoch": 0.5228269892090597, + "grad_norm": 1.4994561336583552, + "learning_rate": 4.868198368896487e-05, + "loss": 0.7538, + "step": 4409 + }, + { + "epoch": 0.5229455709711847, + "grad_norm": 1.4866490704201185, + "learning_rate": 4.8681214482831286e-05, + "loss": 0.5659, + "step": 4410 + }, + { + "epoch": 0.5230641527333096, + "grad_norm": 1.5778425787056913, + "learning_rate": 4.8680445058385946e-05, + "loss": 0.6206, + "step": 4411 + }, + { + "epoch": 0.5231827344954346, + "grad_norm": 1.6963734844616165, + "learning_rate": 4.867967541563594e-05, + "loss": 0.623, + "step": 4412 + }, + { + "epoch": 0.5233013162575596, + "grad_norm": 1.420854388197455, + "learning_rate": 4.867890555458837e-05, + "loss": 0.5316, + "step": 4413 + }, + { + "epoch": 0.5234198980196846, + "grad_norm": 1.4783436884954324, + "learning_rate": 4.867813547525033e-05, + "loss": 0.5288, + "step": 4414 + }, + { + "epoch": 0.5235384797818096, + "grad_norm": 1.49724289146969, + "learning_rate": 4.867736517762891e-05, + "loss": 0.4132, + "step": 4415 + }, + { + "epoch": 0.5236570615439345, + "grad_norm": 1.682038987345143, + "learning_rate": 4.867659466173122e-05, + "loss": 0.6077, + "step": 4416 + }, + { + "epoch": 0.5237756433060595, + "grad_norm": 1.9428196826213506, + "learning_rate": 4.867582392756437e-05, + "loss": 0.8046, + "step": 4417 + }, + { + "epoch": 0.5238942250681845, + "grad_norm": 1.5279885282713979, + "learning_rate": 4.867505297513545e-05, + "loss": 0.5574, + "step": 4418 + }, + { + "epoch": 0.5240128068303095, + "grad_norm": 1.5314906046574306, + "learning_rate": 4.867428180445157e-05, + "loss": 0.5091, + "step": 4419 + }, + { + "epoch": 0.5241313885924345, + "grad_norm": 1.7760739042591318, + "learning_rate": 4.867351041551984e-05, + "loss": 0.6293, + "step": 4420 + }, + { + "epoch": 0.5242499703545594, + "grad_norm": 1.4129206158173455, + "learning_rate": 4.8672738808347384e-05, + "loss": 0.4576, + "step": 4421 + }, + { + "epoch": 0.5243685521166844, + "grad_norm": 1.4479174937295485, + "learning_rate": 4.8671966982941306e-05, + "loss": 0.5571, + "step": 4422 + }, + { + "epoch": 0.5244871338788094, + "grad_norm": 1.8224902975448736, + "learning_rate": 4.8671194939308716e-05, + "loss": 0.5339, + "step": 4423 + }, + { + "epoch": 0.5246057156409344, + "grad_norm": 1.7952334135203418, + "learning_rate": 4.8670422677456735e-05, + "loss": 0.6919, + "step": 4424 + }, + { + "epoch": 0.5247242974030594, + "grad_norm": 1.33303578988859, + "learning_rate": 4.8669650197392486e-05, + "loss": 0.4483, + "step": 4425 + }, + { + "epoch": 0.5248428791651844, + "grad_norm": 1.3571959830512428, + "learning_rate": 4.866887749912309e-05, + "loss": 0.464, + "step": 4426 + }, + { + "epoch": 0.5249614609273093, + "grad_norm": 1.552878108234724, + "learning_rate": 4.866810458265566e-05, + "loss": 0.6021, + "step": 4427 + }, + { + "epoch": 0.5250800426894343, + "grad_norm": 1.7853948568256417, + "learning_rate": 4.866733144799734e-05, + "loss": 0.7552, + "step": 4428 + }, + { + "epoch": 0.5251986244515594, + "grad_norm": 1.4469313468748655, + "learning_rate": 4.866655809515524e-05, + "loss": 0.4424, + "step": 4429 + }, + { + "epoch": 0.5253172062136844, + "grad_norm": 1.4555099461536596, + "learning_rate": 4.86657845241365e-05, + "loss": 0.5232, + "step": 4430 + }, + { + "epoch": 0.5254357879758094, + "grad_norm": 2.001668353118628, + "learning_rate": 4.8665010734948245e-05, + "loss": 0.7381, + "step": 4431 + }, + { + "epoch": 0.5255543697379343, + "grad_norm": 1.59099077803781, + "learning_rate": 4.8664236727597615e-05, + "loss": 0.5631, + "step": 4432 + }, + { + "epoch": 0.5256729515000593, + "grad_norm": 2.0847687272798905, + "learning_rate": 4.866346250209173e-05, + "loss": 0.8796, + "step": 4433 + }, + { + "epoch": 0.5257915332621843, + "grad_norm": 1.2139686609665075, + "learning_rate": 4.866268805843775e-05, + "loss": 0.4307, + "step": 4434 + }, + { + "epoch": 0.5259101150243093, + "grad_norm": 1.75236246527139, + "learning_rate": 4.86619133966428e-05, + "loss": 0.7721, + "step": 4435 + }, + { + "epoch": 0.5260286967864343, + "grad_norm": 1.7146225047154127, + "learning_rate": 4.866113851671402e-05, + "loss": 0.8131, + "step": 4436 + }, + { + "epoch": 0.5261472785485592, + "grad_norm": 1.438602205053691, + "learning_rate": 4.8660363418658575e-05, + "loss": 0.6005, + "step": 4437 + }, + { + "epoch": 0.5262658603106842, + "grad_norm": 1.6661042257577836, + "learning_rate": 4.865958810248358e-05, + "loss": 0.7491, + "step": 4438 + }, + { + "epoch": 0.5263844420728092, + "grad_norm": 1.2587764999281459, + "learning_rate": 4.8658812568196195e-05, + "loss": 0.5335, + "step": 4439 + }, + { + "epoch": 0.5265030238349342, + "grad_norm": 1.3591468933557243, + "learning_rate": 4.8658036815803573e-05, + "loss": 0.5218, + "step": 4440 + }, + { + "epoch": 0.5266216055970592, + "grad_norm": 1.1845006969337681, + "learning_rate": 4.865726084531286e-05, + "loss": 0.4206, + "step": 4441 + }, + { + "epoch": 0.5267401873591842, + "grad_norm": 1.3101098756465026, + "learning_rate": 4.8656484656731214e-05, + "loss": 0.4367, + "step": 4442 + }, + { + "epoch": 0.5268587691213091, + "grad_norm": 1.286824883534715, + "learning_rate": 4.865570825006579e-05, + "loss": 0.532, + "step": 4443 + }, + { + "epoch": 0.5269773508834341, + "grad_norm": 1.9545118725163724, + "learning_rate": 4.8654931625323746e-05, + "loss": 0.7781, + "step": 4444 + }, + { + "epoch": 0.5270959326455591, + "grad_norm": 1.3976575411226106, + "learning_rate": 4.865415478251224e-05, + "loss": 0.6011, + "step": 4445 + }, + { + "epoch": 0.5272145144076841, + "grad_norm": 1.4652432206964596, + "learning_rate": 4.8653377721638435e-05, + "loss": 0.6347, + "step": 4446 + }, + { + "epoch": 0.5273330961698091, + "grad_norm": 1.1703396970276103, + "learning_rate": 4.8652600442709484e-05, + "loss": 0.3956, + "step": 4447 + }, + { + "epoch": 0.527451677931934, + "grad_norm": 1.8154968278186074, + "learning_rate": 4.8651822945732575e-05, + "loss": 0.574, + "step": 4448 + }, + { + "epoch": 0.527570259694059, + "grad_norm": 1.7647361481699273, + "learning_rate": 4.8651045230714857e-05, + "loss": 0.5506, + "step": 4449 + }, + { + "epoch": 0.527688841456184, + "grad_norm": 1.6353746349683205, + "learning_rate": 4.86502672976635e-05, + "loss": 0.5793, + "step": 4450 + }, + { + "epoch": 0.527807423218309, + "grad_norm": 1.4846798224644913, + "learning_rate": 4.864948914658568e-05, + "loss": 0.6354, + "step": 4451 + }, + { + "epoch": 0.527926004980434, + "grad_norm": 1.5826525768354285, + "learning_rate": 4.8648710777488576e-05, + "loss": 0.5599, + "step": 4452 + }, + { + "epoch": 0.5280445867425589, + "grad_norm": 1.4254534292732386, + "learning_rate": 4.8647932190379355e-05, + "loss": 0.5717, + "step": 4453 + }, + { + "epoch": 0.528163168504684, + "grad_norm": 1.636891898032809, + "learning_rate": 4.86471533852652e-05, + "loss": 0.7132, + "step": 4454 + }, + { + "epoch": 0.528281750266809, + "grad_norm": 1.4810434232465695, + "learning_rate": 4.864637436215329e-05, + "loss": 0.4975, + "step": 4455 + }, + { + "epoch": 0.528400332028934, + "grad_norm": 1.8671196121855447, + "learning_rate": 4.86455951210508e-05, + "loss": 0.7043, + "step": 4456 + }, + { + "epoch": 0.528518913791059, + "grad_norm": 1.8399867544031365, + "learning_rate": 4.864481566196493e-05, + "loss": 0.6749, + "step": 4457 + }, + { + "epoch": 0.528637495553184, + "grad_norm": 1.6316633399417124, + "learning_rate": 4.8644035984902846e-05, + "loss": 0.7841, + "step": 4458 + }, + { + "epoch": 0.5287560773153089, + "grad_norm": 1.546258909723542, + "learning_rate": 4.864325608987175e-05, + "loss": 0.6653, + "step": 4459 + }, + { + "epoch": 0.5288746590774339, + "grad_norm": 1.601330454369243, + "learning_rate": 4.864247597687882e-05, + "loss": 0.6869, + "step": 4460 + }, + { + "epoch": 0.5289932408395589, + "grad_norm": 1.2041070205524016, + "learning_rate": 4.8641695645931264e-05, + "loss": 0.4076, + "step": 4461 + }, + { + "epoch": 0.5291118226016839, + "grad_norm": 1.3710377494112709, + "learning_rate": 4.8640915097036255e-05, + "loss": 0.5934, + "step": 4462 + }, + { + "epoch": 0.5292304043638089, + "grad_norm": 1.4190468532166456, + "learning_rate": 4.8640134330201004e-05, + "loss": 0.621, + "step": 4463 + }, + { + "epoch": 0.5293489861259338, + "grad_norm": 1.0811306885788852, + "learning_rate": 4.86393533454327e-05, + "loss": 0.3705, + "step": 4464 + }, + { + "epoch": 0.5294675678880588, + "grad_norm": 1.4354605043642081, + "learning_rate": 4.8638572142738545e-05, + "loss": 0.6095, + "step": 4465 + }, + { + "epoch": 0.5295861496501838, + "grad_norm": 1.792087480721045, + "learning_rate": 4.863779072212575e-05, + "loss": 0.8468, + "step": 4466 + }, + { + "epoch": 0.5297047314123088, + "grad_norm": 1.6378610706041166, + "learning_rate": 4.86370090836015e-05, + "loss": 0.6032, + "step": 4467 + }, + { + "epoch": 0.5298233131744338, + "grad_norm": 1.3982686477806978, + "learning_rate": 4.8636227227173024e-05, + "loss": 0.5256, + "step": 4468 + }, + { + "epoch": 0.5299418949365587, + "grad_norm": 1.5318600235278557, + "learning_rate": 4.863544515284752e-05, + "loss": 0.6063, + "step": 4469 + }, + { + "epoch": 0.5300604766986837, + "grad_norm": 1.4953853685168577, + "learning_rate": 4.863466286063218e-05, + "loss": 0.6626, + "step": 4470 + }, + { + "epoch": 0.5301790584608087, + "grad_norm": 1.3024439760060738, + "learning_rate": 4.8633880350534245e-05, + "loss": 0.4093, + "step": 4471 + }, + { + "epoch": 0.5302976402229337, + "grad_norm": 1.5799478365202173, + "learning_rate": 4.863309762256091e-05, + "loss": 0.5423, + "step": 4472 + }, + { + "epoch": 0.5304162219850587, + "grad_norm": 1.313413430077652, + "learning_rate": 4.863231467671939e-05, + "loss": 0.4018, + "step": 4473 + }, + { + "epoch": 0.5305348037471836, + "grad_norm": 1.7397068473489572, + "learning_rate": 4.863153151301692e-05, + "loss": 0.9764, + "step": 4474 + }, + { + "epoch": 0.5306533855093086, + "grad_norm": 1.280254851410496, + "learning_rate": 4.86307481314607e-05, + "loss": 0.4616, + "step": 4475 + }, + { + "epoch": 0.5307719672714336, + "grad_norm": 1.787276865710569, + "learning_rate": 4.862996453205796e-05, + "loss": 0.8909, + "step": 4476 + }, + { + "epoch": 0.5308905490335586, + "grad_norm": 1.630146750970933, + "learning_rate": 4.8629180714815926e-05, + "loss": 0.5798, + "step": 4477 + }, + { + "epoch": 0.5310091307956836, + "grad_norm": 1.8237607381914631, + "learning_rate": 4.8628396679741825e-05, + "loss": 0.738, + "step": 4478 + }, + { + "epoch": 0.5311277125578087, + "grad_norm": 1.353789499643375, + "learning_rate": 4.8627612426842875e-05, + "loss": 0.551, + "step": 4479 + }, + { + "epoch": 0.5312462943199336, + "grad_norm": 1.860142452235117, + "learning_rate": 4.862682795612632e-05, + "loss": 0.7932, + "step": 4480 + }, + { + "epoch": 0.5313648760820586, + "grad_norm": 1.5224150679079351, + "learning_rate": 4.862604326759938e-05, + "loss": 0.5583, + "step": 4481 + }, + { + "epoch": 0.5314834578441836, + "grad_norm": 1.1972188330289004, + "learning_rate": 4.86252583612693e-05, + "loss": 0.4846, + "step": 4482 + }, + { + "epoch": 0.5316020396063086, + "grad_norm": 1.340221200696147, + "learning_rate": 4.86244732371433e-05, + "loss": 0.4552, + "step": 4483 + }, + { + "epoch": 0.5317206213684336, + "grad_norm": 1.6678770263135654, + "learning_rate": 4.862368789522863e-05, + "loss": 0.8071, + "step": 4484 + }, + { + "epoch": 0.5318392031305585, + "grad_norm": 1.4869153711237448, + "learning_rate": 4.862290233553253e-05, + "loss": 0.7134, + "step": 4485 + }, + { + "epoch": 0.5319577848926835, + "grad_norm": 1.6989992856069864, + "learning_rate": 4.862211655806223e-05, + "loss": 0.8094, + "step": 4486 + }, + { + "epoch": 0.5320763666548085, + "grad_norm": 1.5296606323292, + "learning_rate": 4.862133056282499e-05, + "loss": 0.6472, + "step": 4487 + }, + { + "epoch": 0.5321949484169335, + "grad_norm": 1.517409423047397, + "learning_rate": 4.862054434982804e-05, + "loss": 0.6976, + "step": 4488 + }, + { + "epoch": 0.5323135301790585, + "grad_norm": 1.8003615121850975, + "learning_rate": 4.861975791907865e-05, + "loss": 0.7889, + "step": 4489 + }, + { + "epoch": 0.5324321119411835, + "grad_norm": 1.7327397917690093, + "learning_rate": 4.861897127058405e-05, + "loss": 0.6698, + "step": 4490 + }, + { + "epoch": 0.5325506937033084, + "grad_norm": 1.136977040545438, + "learning_rate": 4.86181844043515e-05, + "loss": 0.5029, + "step": 4491 + }, + { + "epoch": 0.5326692754654334, + "grad_norm": 1.3722089214981026, + "learning_rate": 4.8617397320388245e-05, + "loss": 0.5926, + "step": 4492 + }, + { + "epoch": 0.5327878572275584, + "grad_norm": 1.5475499665914598, + "learning_rate": 4.861661001870156e-05, + "loss": 0.7534, + "step": 4493 + }, + { + "epoch": 0.5329064389896834, + "grad_norm": 1.4771305097509935, + "learning_rate": 4.8615822499298685e-05, + "loss": 0.5686, + "step": 4494 + }, + { + "epoch": 0.5330250207518084, + "grad_norm": 1.3673161311932296, + "learning_rate": 4.8615034762186884e-05, + "loss": 0.564, + "step": 4495 + }, + { + "epoch": 0.5331436025139333, + "grad_norm": 1.4271981794223385, + "learning_rate": 4.861424680737343e-05, + "loss": 0.6364, + "step": 4496 + }, + { + "epoch": 0.5332621842760583, + "grad_norm": 1.366274636109926, + "learning_rate": 4.861345863486557e-05, + "loss": 0.6445, + "step": 4497 + }, + { + "epoch": 0.5333807660381833, + "grad_norm": 1.3985172631840035, + "learning_rate": 4.861267024467058e-05, + "loss": 0.5857, + "step": 4498 + }, + { + "epoch": 0.5334993478003083, + "grad_norm": 1.2002096316971793, + "learning_rate": 4.861188163679572e-05, + "loss": 0.4708, + "step": 4499 + }, + { + "epoch": 0.5336179295624333, + "grad_norm": 1.3621755441598689, + "learning_rate": 4.8611092811248276e-05, + "loss": 0.4241, + "step": 4500 + }, + { + "epoch": 0.5337365113245582, + "grad_norm": 1.4595258304678125, + "learning_rate": 4.86103037680355e-05, + "loss": 0.4027, + "step": 4501 + }, + { + "epoch": 0.5338550930866832, + "grad_norm": 1.671460675986169, + "learning_rate": 4.860951450716469e-05, + "loss": 0.6597, + "step": 4502 + }, + { + "epoch": 0.5339736748488082, + "grad_norm": 1.8442624663088294, + "learning_rate": 4.86087250286431e-05, + "loss": 0.5986, + "step": 4503 + }, + { + "epoch": 0.5340922566109333, + "grad_norm": 1.323794113837236, + "learning_rate": 4.860793533247802e-05, + "loss": 0.4808, + "step": 4504 + }, + { + "epoch": 0.5342108383730583, + "grad_norm": 1.5107610694247755, + "learning_rate": 4.860714541867672e-05, + "loss": 0.5817, + "step": 4505 + }, + { + "epoch": 0.5343294201351833, + "grad_norm": 1.7863992132424475, + "learning_rate": 4.8606355287246484e-05, + "loss": 0.7367, + "step": 4506 + }, + { + "epoch": 0.5344480018973082, + "grad_norm": 1.5563343380681909, + "learning_rate": 4.860556493819461e-05, + "loss": 0.649, + "step": 4507 + }, + { + "epoch": 0.5345665836594332, + "grad_norm": 1.742828077740644, + "learning_rate": 4.860477437152837e-05, + "loss": 0.8467, + "step": 4508 + }, + { + "epoch": 0.5346851654215582, + "grad_norm": 1.7124348705341195, + "learning_rate": 4.860398358725506e-05, + "loss": 0.8007, + "step": 4509 + }, + { + "epoch": 0.5348037471836832, + "grad_norm": 1.657080396614561, + "learning_rate": 4.860319258538197e-05, + "loss": 0.5213, + "step": 4510 + }, + { + "epoch": 0.5349223289458082, + "grad_norm": 1.4225477720166364, + "learning_rate": 4.860240136591639e-05, + "loss": 0.6349, + "step": 4511 + }, + { + "epoch": 0.5350409107079331, + "grad_norm": 1.4595090070937666, + "learning_rate": 4.86016099288656e-05, + "loss": 0.5302, + "step": 4512 + }, + { + "epoch": 0.5351594924700581, + "grad_norm": 1.3798087519975897, + "learning_rate": 4.860081827423691e-05, + "loss": 0.5068, + "step": 4513 + }, + { + "epoch": 0.5352780742321831, + "grad_norm": 1.4031454881732235, + "learning_rate": 4.860002640203762e-05, + "loss": 0.7924, + "step": 4514 + }, + { + "epoch": 0.5353966559943081, + "grad_norm": 1.648309404417123, + "learning_rate": 4.8599234312275034e-05, + "loss": 0.6164, + "step": 4515 + }, + { + "epoch": 0.5355152377564331, + "grad_norm": 1.7082430628021479, + "learning_rate": 4.859844200495644e-05, + "loss": 0.6831, + "step": 4516 + }, + { + "epoch": 0.535633819518558, + "grad_norm": 1.6864370943632745, + "learning_rate": 4.8597649480089145e-05, + "loss": 0.679, + "step": 4517 + }, + { + "epoch": 0.535752401280683, + "grad_norm": 1.5030761845828482, + "learning_rate": 4.859685673768046e-05, + "loss": 0.6223, + "step": 4518 + }, + { + "epoch": 0.535870983042808, + "grad_norm": 1.4004484345929218, + "learning_rate": 4.8596063777737696e-05, + "loss": 0.5802, + "step": 4519 + }, + { + "epoch": 0.535989564804933, + "grad_norm": 1.5409929354888585, + "learning_rate": 4.8595270600268163e-05, + "loss": 0.6202, + "step": 4520 + }, + { + "epoch": 0.536108146567058, + "grad_norm": 1.4774107959411877, + "learning_rate": 4.859447720527917e-05, + "loss": 0.5905, + "step": 4521 + }, + { + "epoch": 0.536226728329183, + "grad_norm": 1.7787169900058584, + "learning_rate": 4.8593683592778026e-05, + "loss": 0.5795, + "step": 4522 + }, + { + "epoch": 0.5363453100913079, + "grad_norm": 1.7857562651066252, + "learning_rate": 4.859288976277205e-05, + "loss": 0.6754, + "step": 4523 + }, + { + "epoch": 0.5364638918534329, + "grad_norm": 1.844227814672596, + "learning_rate": 4.859209571526857e-05, + "loss": 0.6247, + "step": 4524 + }, + { + "epoch": 0.5365824736155579, + "grad_norm": 1.901479811499885, + "learning_rate": 4.859130145027488e-05, + "loss": 0.8008, + "step": 4525 + }, + { + "epoch": 0.5367010553776829, + "grad_norm": 1.3934750170791916, + "learning_rate": 4.859050696779834e-05, + "loss": 0.5063, + "step": 4526 + }, + { + "epoch": 0.5368196371398078, + "grad_norm": 1.5384934157456482, + "learning_rate": 4.8589712267846244e-05, + "loss": 0.7642, + "step": 4527 + }, + { + "epoch": 0.5369382189019328, + "grad_norm": 1.4707245854077236, + "learning_rate": 4.858891735042593e-05, + "loss": 0.7965, + "step": 4528 + }, + { + "epoch": 0.5370568006640579, + "grad_norm": 1.4481291311832314, + "learning_rate": 4.858812221554473e-05, + "loss": 0.6285, + "step": 4529 + }, + { + "epoch": 0.5371753824261829, + "grad_norm": 1.4822541988510196, + "learning_rate": 4.858732686320997e-05, + "loss": 0.5763, + "step": 4530 + }, + { + "epoch": 0.5372939641883079, + "grad_norm": 1.6973982384199575, + "learning_rate": 4.858653129342897e-05, + "loss": 0.7594, + "step": 4531 + }, + { + "epoch": 0.5374125459504329, + "grad_norm": 1.5699154064362828, + "learning_rate": 4.858573550620908e-05, + "loss": 0.5418, + "step": 4532 + }, + { + "epoch": 0.5375311277125578, + "grad_norm": 1.5213510043525413, + "learning_rate": 4.8584939501557634e-05, + "loss": 0.5525, + "step": 4533 + }, + { + "epoch": 0.5376497094746828, + "grad_norm": 1.5765305943780945, + "learning_rate": 4.858414327948196e-05, + "loss": 0.8182, + "step": 4534 + }, + { + "epoch": 0.5377682912368078, + "grad_norm": 1.510408828778692, + "learning_rate": 4.858334683998942e-05, + "loss": 0.5502, + "step": 4535 + }, + { + "epoch": 0.5378868729989328, + "grad_norm": 1.524344193847123, + "learning_rate": 4.858255018308733e-05, + "loss": 0.5779, + "step": 4536 + }, + { + "epoch": 0.5380054547610578, + "grad_norm": 1.5596379090061872, + "learning_rate": 4.858175330878305e-05, + "loss": 0.5943, + "step": 4537 + }, + { + "epoch": 0.5381240365231827, + "grad_norm": 1.6721294318190811, + "learning_rate": 4.858095621708393e-05, + "loss": 0.6473, + "step": 4538 + }, + { + "epoch": 0.5382426182853077, + "grad_norm": 1.700351629336764, + "learning_rate": 4.8580158907997295e-05, + "loss": 0.9848, + "step": 4539 + }, + { + "epoch": 0.5383612000474327, + "grad_norm": 1.4145495283065948, + "learning_rate": 4.857936138153052e-05, + "loss": 0.5433, + "step": 4540 + }, + { + "epoch": 0.5384797818095577, + "grad_norm": 2.001175553349072, + "learning_rate": 4.857856363769095e-05, + "loss": 0.8281, + "step": 4541 + }, + { + "epoch": 0.5385983635716827, + "grad_norm": 1.3672911746088445, + "learning_rate": 4.857776567648593e-05, + "loss": 0.4978, + "step": 4542 + }, + { + "epoch": 0.5387169453338077, + "grad_norm": 1.465039645425579, + "learning_rate": 4.857696749792283e-05, + "loss": 0.7244, + "step": 4543 + }, + { + "epoch": 0.5388355270959326, + "grad_norm": 1.279671997198366, + "learning_rate": 4.8576169102009e-05, + "loss": 0.6594, + "step": 4544 + }, + { + "epoch": 0.5389541088580576, + "grad_norm": 1.2647805066759357, + "learning_rate": 4.85753704887518e-05, + "loss": 0.6247, + "step": 4545 + }, + { + "epoch": 0.5390726906201826, + "grad_norm": 1.6110394791778222, + "learning_rate": 4.8574571658158594e-05, + "loss": 0.9907, + "step": 4546 + }, + { + "epoch": 0.5391912723823076, + "grad_norm": 1.4993085343428467, + "learning_rate": 4.8573772610236744e-05, + "loss": 0.6429, + "step": 4547 + }, + { + "epoch": 0.5393098541444326, + "grad_norm": 1.5389457475036012, + "learning_rate": 4.857297334499362e-05, + "loss": 0.7495, + "step": 4548 + }, + { + "epoch": 0.5394284359065575, + "grad_norm": 1.4203145801526837, + "learning_rate": 4.8572173862436596e-05, + "loss": 0.626, + "step": 4549 + }, + { + "epoch": 0.5395470176686825, + "grad_norm": 1.4256003600834117, + "learning_rate": 4.857137416257303e-05, + "loss": 0.6833, + "step": 4550 + }, + { + "epoch": 0.5396655994308075, + "grad_norm": 1.5497445940406736, + "learning_rate": 4.85705742454103e-05, + "loss": 0.6396, + "step": 4551 + }, + { + "epoch": 0.5397841811929325, + "grad_norm": 1.4338265249968913, + "learning_rate": 4.856977411095578e-05, + "loss": 0.5476, + "step": 4552 + }, + { + "epoch": 0.5399027629550575, + "grad_norm": 1.304666068720877, + "learning_rate": 4.856897375921684e-05, + "loss": 0.4012, + "step": 4553 + }, + { + "epoch": 0.5400213447171825, + "grad_norm": 1.345022828888145, + "learning_rate": 4.856817319020087e-05, + "loss": 0.615, + "step": 4554 + }, + { + "epoch": 0.5401399264793075, + "grad_norm": 1.9320226815715766, + "learning_rate": 4.8567372403915246e-05, + "loss": 0.8215, + "step": 4555 + }, + { + "epoch": 0.5402585082414325, + "grad_norm": 1.6554837950547712, + "learning_rate": 4.8566571400367345e-05, + "loss": 0.7498, + "step": 4556 + }, + { + "epoch": 0.5403770900035575, + "grad_norm": 1.513809841285043, + "learning_rate": 4.8565770179564554e-05, + "loss": 0.5893, + "step": 4557 + }, + { + "epoch": 0.5404956717656825, + "grad_norm": 1.4524262532412162, + "learning_rate": 4.856496874151426e-05, + "loss": 0.4446, + "step": 4558 + }, + { + "epoch": 0.5406142535278075, + "grad_norm": 2.0805819836113932, + "learning_rate": 4.8564167086223856e-05, + "loss": 0.7636, + "step": 4559 + }, + { + "epoch": 0.5407328352899324, + "grad_norm": 1.7068089375369258, + "learning_rate": 4.856336521370073e-05, + "loss": 0.6354, + "step": 4560 + }, + { + "epoch": 0.5408514170520574, + "grad_norm": 1.6527353729288665, + "learning_rate": 4.856256312395227e-05, + "loss": 0.4953, + "step": 4561 + }, + { + "epoch": 0.5409699988141824, + "grad_norm": 1.5325358208548603, + "learning_rate": 4.856176081698586e-05, + "loss": 0.5759, + "step": 4562 + }, + { + "epoch": 0.5410885805763074, + "grad_norm": 1.3712259098753012, + "learning_rate": 4.8560958292808925e-05, + "loss": 0.6332, + "step": 4563 + }, + { + "epoch": 0.5412071623384324, + "grad_norm": 1.4830517171258473, + "learning_rate": 4.8560155551428837e-05, + "loss": 0.5893, + "step": 4564 + }, + { + "epoch": 0.5413257441005573, + "grad_norm": 1.5539116933746375, + "learning_rate": 4.855935259285301e-05, + "loss": 0.5408, + "step": 4565 + }, + { + "epoch": 0.5414443258626823, + "grad_norm": 1.8225975591005366, + "learning_rate": 4.855854941708884e-05, + "loss": 0.6158, + "step": 4566 + }, + { + "epoch": 0.5415629076248073, + "grad_norm": 1.4953083124956517, + "learning_rate": 4.8557746024143736e-05, + "loss": 0.6045, + "step": 4567 + }, + { + "epoch": 0.5416814893869323, + "grad_norm": 1.5670407121280645, + "learning_rate": 4.85569424140251e-05, + "loss": 0.5554, + "step": 4568 + }, + { + "epoch": 0.5418000711490573, + "grad_norm": 1.9093041853009627, + "learning_rate": 4.8556138586740344e-05, + "loss": 0.829, + "step": 4569 + }, + { + "epoch": 0.5419186529111822, + "grad_norm": 1.199880427591941, + "learning_rate": 4.8555334542296885e-05, + "loss": 0.3598, + "step": 4570 + }, + { + "epoch": 0.5420372346733072, + "grad_norm": 1.5597833551139135, + "learning_rate": 4.855453028070212e-05, + "loss": 0.5221, + "step": 4571 + }, + { + "epoch": 0.5421558164354322, + "grad_norm": 1.6736452601432108, + "learning_rate": 4.855372580196346e-05, + "loss": 0.4581, + "step": 4572 + }, + { + "epoch": 0.5422743981975572, + "grad_norm": 1.5696202275843527, + "learning_rate": 4.855292110608835e-05, + "loss": 0.6765, + "step": 4573 + }, + { + "epoch": 0.5423929799596822, + "grad_norm": 1.266090208846806, + "learning_rate": 4.855211619308417e-05, + "loss": 0.548, + "step": 4574 + }, + { + "epoch": 0.5425115617218071, + "grad_norm": 3.748677711905531, + "learning_rate": 4.8551311062958374e-05, + "loss": 0.503, + "step": 4575 + }, + { + "epoch": 0.5426301434839321, + "grad_norm": 1.6031353901749559, + "learning_rate": 4.855050571571837e-05, + "loss": 0.6942, + "step": 4576 + }, + { + "epoch": 0.5427487252460571, + "grad_norm": 1.9424007887873835, + "learning_rate": 4.854970015137158e-05, + "loss": 0.7993, + "step": 4577 + }, + { + "epoch": 0.5428673070081821, + "grad_norm": 2.057405537222361, + "learning_rate": 4.8548894369925426e-05, + "loss": 0.7838, + "step": 4578 + }, + { + "epoch": 0.5429858887703072, + "grad_norm": 2.1087385339597446, + "learning_rate": 4.854808837138736e-05, + "loss": 0.7553, + "step": 4579 + }, + { + "epoch": 0.5431044705324322, + "grad_norm": 1.5995196439629515, + "learning_rate": 4.8547282155764784e-05, + "loss": 0.5758, + "step": 4580 + }, + { + "epoch": 0.5432230522945571, + "grad_norm": 1.5627636921531796, + "learning_rate": 4.854647572306514e-05, + "loss": 0.6802, + "step": 4581 + }, + { + "epoch": 0.5433416340566821, + "grad_norm": 1.435792822202445, + "learning_rate": 4.854566907329587e-05, + "loss": 0.5722, + "step": 4582 + }, + { + "epoch": 0.5434602158188071, + "grad_norm": 1.6145852056353225, + "learning_rate": 4.85448622064644e-05, + "loss": 0.5525, + "step": 4583 + }, + { + "epoch": 0.5435787975809321, + "grad_norm": 1.2926664456706392, + "learning_rate": 4.854405512257818e-05, + "loss": 0.5524, + "step": 4584 + }, + { + "epoch": 0.5436973793430571, + "grad_norm": 1.379386956991928, + "learning_rate": 4.854324782164464e-05, + "loss": 0.543, + "step": 4585 + }, + { + "epoch": 0.543815961105182, + "grad_norm": 1.4340811902686066, + "learning_rate": 4.8542440303671226e-05, + "loss": 0.8497, + "step": 4586 + }, + { + "epoch": 0.543934542867307, + "grad_norm": 1.42394526021585, + "learning_rate": 4.8541632568665385e-05, + "loss": 0.5296, + "step": 4587 + }, + { + "epoch": 0.544053124629432, + "grad_norm": 1.6041244510431334, + "learning_rate": 4.854082461663455e-05, + "loss": 0.6074, + "step": 4588 + }, + { + "epoch": 0.544171706391557, + "grad_norm": 2.043163493707532, + "learning_rate": 4.854001644758619e-05, + "loss": 0.703, + "step": 4589 + }, + { + "epoch": 0.544290288153682, + "grad_norm": 1.3550393164625205, + "learning_rate": 4.853920806152774e-05, + "loss": 0.524, + "step": 4590 + }, + { + "epoch": 0.544408869915807, + "grad_norm": 2.0747483982910673, + "learning_rate": 4.8538399458466666e-05, + "loss": 0.9109, + "step": 4591 + }, + { + "epoch": 0.5445274516779319, + "grad_norm": 1.5238513630205874, + "learning_rate": 4.85375906384104e-05, + "loss": 0.6593, + "step": 4592 + }, + { + "epoch": 0.5446460334400569, + "grad_norm": 1.5221841855082954, + "learning_rate": 4.8536781601366424e-05, + "loss": 0.631, + "step": 4593 + }, + { + "epoch": 0.5447646152021819, + "grad_norm": 1.4639806396358668, + "learning_rate": 4.853597234734218e-05, + "loss": 0.571, + "step": 4594 + }, + { + "epoch": 0.5448831969643069, + "grad_norm": 1.5013713815499385, + "learning_rate": 4.853516287634513e-05, + "loss": 0.5912, + "step": 4595 + }, + { + "epoch": 0.5450017787264319, + "grad_norm": 1.8578979102017428, + "learning_rate": 4.853435318838274e-05, + "loss": 0.8344, + "step": 4596 + }, + { + "epoch": 0.5451203604885568, + "grad_norm": 1.3112386535825042, + "learning_rate": 4.853354328346248e-05, + "loss": 0.6306, + "step": 4597 + }, + { + "epoch": 0.5452389422506818, + "grad_norm": 1.266581678151801, + "learning_rate": 4.85327331615918e-05, + "loss": 0.5989, + "step": 4598 + }, + { + "epoch": 0.5453575240128068, + "grad_norm": 1.43080876654775, + "learning_rate": 4.853192282277818e-05, + "loss": 0.4892, + "step": 4599 + }, + { + "epoch": 0.5454761057749318, + "grad_norm": 1.3247113555229515, + "learning_rate": 4.853111226702909e-05, + "loss": 0.613, + "step": 4600 + }, + { + "epoch": 0.5455946875370568, + "grad_norm": 1.5555763399140856, + "learning_rate": 4.8530301494352004e-05, + "loss": 0.5953, + "step": 4601 + }, + { + "epoch": 0.5457132692991817, + "grad_norm": 1.3705638385522079, + "learning_rate": 4.852949050475439e-05, + "loss": 0.6699, + "step": 4602 + }, + { + "epoch": 0.5458318510613067, + "grad_norm": 1.3759747949781405, + "learning_rate": 4.852867929824373e-05, + "loss": 0.6054, + "step": 4603 + }, + { + "epoch": 0.5459504328234318, + "grad_norm": 1.481210708410744, + "learning_rate": 4.852786787482749e-05, + "loss": 0.8254, + "step": 4604 + }, + { + "epoch": 0.5460690145855568, + "grad_norm": 1.1940831467489172, + "learning_rate": 4.852705623451317e-05, + "loss": 0.3735, + "step": 4605 + }, + { + "epoch": 0.5461875963476818, + "grad_norm": 1.4326360498307749, + "learning_rate": 4.852624437730824e-05, + "loss": 0.7431, + "step": 4606 + }, + { + "epoch": 0.5463061781098068, + "grad_norm": 1.8471006512679942, + "learning_rate": 4.8525432303220186e-05, + "loss": 0.7113, + "step": 4607 + }, + { + "epoch": 0.5464247598719317, + "grad_norm": 1.626825725420159, + "learning_rate": 4.852462001225649e-05, + "loss": 0.7306, + "step": 4608 + }, + { + "epoch": 0.5465433416340567, + "grad_norm": 1.959271360131848, + "learning_rate": 4.852380750442466e-05, + "loss": 0.768, + "step": 4609 + }, + { + "epoch": 0.5466619233961817, + "grad_norm": 1.35295276556761, + "learning_rate": 4.8522994779732154e-05, + "loss": 0.6207, + "step": 4610 + }, + { + "epoch": 0.5467805051583067, + "grad_norm": 1.3987928426264178, + "learning_rate": 4.852218183818649e-05, + "loss": 0.5558, + "step": 4611 + }, + { + "epoch": 0.5468990869204317, + "grad_norm": 1.5404127949910325, + "learning_rate": 4.8521368679795154e-05, + "loss": 0.6791, + "step": 4612 + }, + { + "epoch": 0.5470176686825566, + "grad_norm": 1.636783925818582, + "learning_rate": 4.8520555304565646e-05, + "loss": 0.5459, + "step": 4613 + }, + { + "epoch": 0.5471362504446816, + "grad_norm": 1.3270388970451197, + "learning_rate": 4.8519741712505455e-05, + "loss": 0.4578, + "step": 4614 + }, + { + "epoch": 0.5472548322068066, + "grad_norm": 1.8371891466297028, + "learning_rate": 4.851892790362209e-05, + "loss": 0.6889, + "step": 4615 + }, + { + "epoch": 0.5473734139689316, + "grad_norm": 1.8381220910372034, + "learning_rate": 4.8518113877923054e-05, + "loss": 0.7627, + "step": 4616 + }, + { + "epoch": 0.5474919957310566, + "grad_norm": 1.3218563459848849, + "learning_rate": 4.851729963541584e-05, + "loss": 0.5165, + "step": 4617 + }, + { + "epoch": 0.5476105774931815, + "grad_norm": 1.629902695086319, + "learning_rate": 4.851648517610797e-05, + "loss": 0.8131, + "step": 4618 + }, + { + "epoch": 0.5477291592553065, + "grad_norm": 1.3314720565563267, + "learning_rate": 4.8515670500006935e-05, + "loss": 0.4345, + "step": 4619 + }, + { + "epoch": 0.5478477410174315, + "grad_norm": 1.3665419691198926, + "learning_rate": 4.8514855607120266e-05, + "loss": 0.6458, + "step": 4620 + }, + { + "epoch": 0.5479663227795565, + "grad_norm": 1.7710392519737554, + "learning_rate": 4.851404049745546e-05, + "loss": 0.8572, + "step": 4621 + }, + { + "epoch": 0.5480849045416815, + "grad_norm": 1.4450706464752117, + "learning_rate": 4.851322517102003e-05, + "loss": 0.6613, + "step": 4622 + }, + { + "epoch": 0.5482034863038064, + "grad_norm": 1.2646704610345536, + "learning_rate": 4.85124096278215e-05, + "loss": 0.4361, + "step": 4623 + }, + { + "epoch": 0.5483220680659314, + "grad_norm": 1.6154187263908246, + "learning_rate": 4.851159386786739e-05, + "loss": 0.7703, + "step": 4624 + }, + { + "epoch": 0.5484406498280564, + "grad_norm": 1.5436568167590163, + "learning_rate": 4.8510777891165214e-05, + "loss": 0.7988, + "step": 4625 + }, + { + "epoch": 0.5485592315901814, + "grad_norm": 1.6124992786089767, + "learning_rate": 4.8509961697722494e-05, + "loss": 0.6282, + "step": 4626 + }, + { + "epoch": 0.5486778133523064, + "grad_norm": 1.5571841946311238, + "learning_rate": 4.850914528754676e-05, + "loss": 0.6455, + "step": 4627 + }, + { + "epoch": 0.5487963951144313, + "grad_norm": 1.4406391110076393, + "learning_rate": 4.8508328660645544e-05, + "loss": 0.598, + "step": 4628 + }, + { + "epoch": 0.5489149768765564, + "grad_norm": 1.534214814261759, + "learning_rate": 4.850751181702635e-05, + "loss": 0.7429, + "step": 4629 + }, + { + "epoch": 0.5490335586386814, + "grad_norm": 1.485976411771782, + "learning_rate": 4.850669475669674e-05, + "loss": 0.7817, + "step": 4630 + }, + { + "epoch": 0.5491521404008064, + "grad_norm": 1.4144126757985458, + "learning_rate": 4.850587747966421e-05, + "loss": 0.6631, + "step": 4631 + }, + { + "epoch": 0.5492707221629314, + "grad_norm": 1.654672630981689, + "learning_rate": 4.850505998593633e-05, + "loss": 0.7824, + "step": 4632 + }, + { + "epoch": 0.5493893039250564, + "grad_norm": 1.5067149626030398, + "learning_rate": 4.850424227552062e-05, + "loss": 0.5677, + "step": 4633 + }, + { + "epoch": 0.5495078856871813, + "grad_norm": 1.3087298132083809, + "learning_rate": 4.850342434842461e-05, + "loss": 0.4576, + "step": 4634 + }, + { + "epoch": 0.5496264674493063, + "grad_norm": 1.421116745263161, + "learning_rate": 4.8502606204655854e-05, + "loss": 0.5428, + "step": 4635 + }, + { + "epoch": 0.5497450492114313, + "grad_norm": 1.4667154064970254, + "learning_rate": 4.850178784422189e-05, + "loss": 0.613, + "step": 4636 + }, + { + "epoch": 0.5498636309735563, + "grad_norm": 1.5191311181763694, + "learning_rate": 4.850096926713026e-05, + "loss": 0.5944, + "step": 4637 + }, + { + "epoch": 0.5499822127356813, + "grad_norm": 1.4913363938637343, + "learning_rate": 4.850015047338852e-05, + "loss": 0.5489, + "step": 4638 + }, + { + "epoch": 0.5501007944978062, + "grad_norm": 1.5453091345302459, + "learning_rate": 4.84993314630042e-05, + "loss": 0.6337, + "step": 4639 + }, + { + "epoch": 0.5502193762599312, + "grad_norm": 1.4735449918341383, + "learning_rate": 4.849851223598486e-05, + "loss": 0.5379, + "step": 4640 + }, + { + "epoch": 0.5503379580220562, + "grad_norm": 1.7777655359549152, + "learning_rate": 4.8497692792338064e-05, + "loss": 0.5701, + "step": 4641 + }, + { + "epoch": 0.5504565397841812, + "grad_norm": 1.4978570225269476, + "learning_rate": 4.849687313207135e-05, + "loss": 0.4802, + "step": 4642 + }, + { + "epoch": 0.5505751215463062, + "grad_norm": 1.4481843596496, + "learning_rate": 4.849605325519227e-05, + "loss": 0.6916, + "step": 4643 + }, + { + "epoch": 0.5506937033084311, + "grad_norm": 1.7247600070035276, + "learning_rate": 4.849523316170841e-05, + "loss": 0.6928, + "step": 4644 + }, + { + "epoch": 0.5508122850705561, + "grad_norm": 1.8117250593739083, + "learning_rate": 4.84944128516273e-05, + "loss": 0.5815, + "step": 4645 + }, + { + "epoch": 0.5509308668326811, + "grad_norm": 1.437510525277727, + "learning_rate": 4.849359232495652e-05, + "loss": 0.6572, + "step": 4646 + }, + { + "epoch": 0.5510494485948061, + "grad_norm": 1.7884800094653588, + "learning_rate": 4.849277158170362e-05, + "loss": 0.7655, + "step": 4647 + }, + { + "epoch": 0.5511680303569311, + "grad_norm": 1.7814238512343543, + "learning_rate": 4.849195062187618e-05, + "loss": 0.486, + "step": 4648 + }, + { + "epoch": 0.551286612119056, + "grad_norm": 1.3723979022327824, + "learning_rate": 4.8491129445481766e-05, + "loss": 0.5038, + "step": 4649 + }, + { + "epoch": 0.551405193881181, + "grad_norm": 1.3725248342736727, + "learning_rate": 4.8490308052527936e-05, + "loss": 0.4731, + "step": 4650 + }, + { + "epoch": 0.551523775643306, + "grad_norm": 1.2718068279408583, + "learning_rate": 4.848948644302228e-05, + "loss": 0.5683, + "step": 4651 + }, + { + "epoch": 0.551642357405431, + "grad_norm": 1.378205646778303, + "learning_rate": 4.8488664616972365e-05, + "loss": 0.4383, + "step": 4652 + }, + { + "epoch": 0.551760939167556, + "grad_norm": 1.5557979870047707, + "learning_rate": 4.848784257438576e-05, + "loss": 0.7173, + "step": 4653 + }, + { + "epoch": 0.5518795209296811, + "grad_norm": 1.4708995358421324, + "learning_rate": 4.848702031527005e-05, + "loss": 0.5448, + "step": 4654 + }, + { + "epoch": 0.551998102691806, + "grad_norm": 1.5862606452688908, + "learning_rate": 4.848619783963282e-05, + "loss": 0.5746, + "step": 4655 + }, + { + "epoch": 0.552116684453931, + "grad_norm": 1.3570112103228538, + "learning_rate": 4.8485375147481636e-05, + "loss": 0.5887, + "step": 4656 + }, + { + "epoch": 0.552235266216056, + "grad_norm": 1.3192960878927074, + "learning_rate": 4.84845522388241e-05, + "loss": 0.4946, + "step": 4657 + }, + { + "epoch": 0.552353847978181, + "grad_norm": 1.1100948018697705, + "learning_rate": 4.84837291136678e-05, + "loss": 0.4363, + "step": 4658 + }, + { + "epoch": 0.552472429740306, + "grad_norm": 1.1618763729669603, + "learning_rate": 4.8482905772020296e-05, + "loss": 0.4306, + "step": 4659 + }, + { + "epoch": 0.552591011502431, + "grad_norm": 1.4838317130767715, + "learning_rate": 4.8482082213889214e-05, + "loss": 0.608, + "step": 4660 + }, + { + "epoch": 0.5527095932645559, + "grad_norm": 1.465500839212871, + "learning_rate": 4.848125843928212e-05, + "loss": 0.7675, + "step": 4661 + }, + { + "epoch": 0.5528281750266809, + "grad_norm": 1.3787459229974652, + "learning_rate": 4.848043444820662e-05, + "loss": 0.6611, + "step": 4662 + }, + { + "epoch": 0.5529467567888059, + "grad_norm": 1.592739776022908, + "learning_rate": 4.84796102406703e-05, + "loss": 0.6465, + "step": 4663 + }, + { + "epoch": 0.5530653385509309, + "grad_norm": 1.8181612569413934, + "learning_rate": 4.847878581668078e-05, + "loss": 0.6718, + "step": 4664 + }, + { + "epoch": 0.5531839203130559, + "grad_norm": 1.572554949515031, + "learning_rate": 4.847796117624565e-05, + "loss": 0.5006, + "step": 4665 + }, + { + "epoch": 0.5533025020751808, + "grad_norm": 1.4664958797035863, + "learning_rate": 4.8477136319372494e-05, + "loss": 0.5049, + "step": 4666 + }, + { + "epoch": 0.5534210838373058, + "grad_norm": 1.417737222512323, + "learning_rate": 4.847631124606893e-05, + "loss": 0.5159, + "step": 4667 + }, + { + "epoch": 0.5535396655994308, + "grad_norm": 1.5379533131922494, + "learning_rate": 4.8475485956342575e-05, + "loss": 0.6252, + "step": 4668 + }, + { + "epoch": 0.5536582473615558, + "grad_norm": 1.3735876928837583, + "learning_rate": 4.847466045020102e-05, + "loss": 0.6743, + "step": 4669 + }, + { + "epoch": 0.5537768291236808, + "grad_norm": 1.5440398032493083, + "learning_rate": 4.847383472765189e-05, + "loss": 0.6194, + "step": 4670 + }, + { + "epoch": 0.5538954108858057, + "grad_norm": 1.5087751063996322, + "learning_rate": 4.8473008788702786e-05, + "loss": 0.4916, + "step": 4671 + }, + { + "epoch": 0.5540139926479307, + "grad_norm": 1.3835571049237725, + "learning_rate": 4.847218263336132e-05, + "loss": 0.5027, + "step": 4672 + }, + { + "epoch": 0.5541325744100557, + "grad_norm": 1.3514424438852948, + "learning_rate": 4.847135626163511e-05, + "loss": 0.6236, + "step": 4673 + }, + { + "epoch": 0.5542511561721807, + "grad_norm": 1.8779877575413155, + "learning_rate": 4.8470529673531796e-05, + "loss": 0.6995, + "step": 4674 + }, + { + "epoch": 0.5543697379343057, + "grad_norm": 1.3948193924559493, + "learning_rate": 4.846970286905896e-05, + "loss": 0.5496, + "step": 4675 + }, + { + "epoch": 0.5544883196964306, + "grad_norm": 1.4197887700049203, + "learning_rate": 4.846887584822426e-05, + "loss": 0.4846, + "step": 4676 + }, + { + "epoch": 0.5546069014585556, + "grad_norm": 1.7061204244017096, + "learning_rate": 4.84680486110353e-05, + "loss": 0.6688, + "step": 4677 + }, + { + "epoch": 0.5547254832206806, + "grad_norm": 1.7532678636480827, + "learning_rate": 4.846722115749971e-05, + "loss": 0.8254, + "step": 4678 + }, + { + "epoch": 0.5548440649828057, + "grad_norm": 1.230650002435006, + "learning_rate": 4.846639348762511e-05, + "loss": 0.5609, + "step": 4679 + }, + { + "epoch": 0.5549626467449307, + "grad_norm": 1.4142387976366944, + "learning_rate": 4.846556560141915e-05, + "loss": 0.5596, + "step": 4680 + }, + { + "epoch": 0.5550812285070557, + "grad_norm": 1.3097042300343653, + "learning_rate": 4.846473749888944e-05, + "loss": 0.4881, + "step": 4681 + }, + { + "epoch": 0.5551998102691806, + "grad_norm": 1.25289964416459, + "learning_rate": 4.846390918004363e-05, + "loss": 0.4854, + "step": 4682 + }, + { + "epoch": 0.5553183920313056, + "grad_norm": 1.3622091259624007, + "learning_rate": 4.8463080644889346e-05, + "loss": 0.5836, + "step": 4683 + }, + { + "epoch": 0.5554369737934306, + "grad_norm": 2.033151753569081, + "learning_rate": 4.846225189343424e-05, + "loss": 0.601, + "step": 4684 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 1.6818109487391475, + "learning_rate": 4.8461422925685937e-05, + "loss": 0.5596, + "step": 4685 + }, + { + "epoch": 0.5556741373176806, + "grad_norm": 1.3900598457850954, + "learning_rate": 4.846059374165208e-05, + "loss": 0.4947, + "step": 4686 + }, + { + "epoch": 0.5557927190798055, + "grad_norm": 1.652638580117675, + "learning_rate": 4.8459764341340334e-05, + "loss": 0.5035, + "step": 4687 + }, + { + "epoch": 0.5559113008419305, + "grad_norm": 1.568445144867368, + "learning_rate": 4.845893472475832e-05, + "loss": 0.5453, + "step": 4688 + }, + { + "epoch": 0.5560298826040555, + "grad_norm": 1.6056317835735008, + "learning_rate": 4.845810489191369e-05, + "loss": 0.7067, + "step": 4689 + }, + { + "epoch": 0.5561484643661805, + "grad_norm": 2.252388874015356, + "learning_rate": 4.84572748428141e-05, + "loss": 0.8045, + "step": 4690 + }, + { + "epoch": 0.5562670461283055, + "grad_norm": 1.2569445382281208, + "learning_rate": 4.8456444577467205e-05, + "loss": 0.4281, + "step": 4691 + }, + { + "epoch": 0.5563856278904304, + "grad_norm": 1.436047509478209, + "learning_rate": 4.8455614095880656e-05, + "loss": 0.5263, + "step": 4692 + }, + { + "epoch": 0.5565042096525554, + "grad_norm": 1.685276282305297, + "learning_rate": 4.8454783398062106e-05, + "loss": 0.6148, + "step": 4693 + }, + { + "epoch": 0.5566227914146804, + "grad_norm": 2.0011247234410243, + "learning_rate": 4.845395248401921e-05, + "loss": 0.8917, + "step": 4694 + }, + { + "epoch": 0.5567413731768054, + "grad_norm": 1.4461293790502088, + "learning_rate": 4.845312135375965e-05, + "loss": 0.5344, + "step": 4695 + }, + { + "epoch": 0.5568599549389304, + "grad_norm": 1.711145297475681, + "learning_rate": 4.845229000729106e-05, + "loss": 0.697, + "step": 4696 + }, + { + "epoch": 0.5569785367010553, + "grad_norm": 1.7393647887081458, + "learning_rate": 4.845145844462111e-05, + "loss": 0.7389, + "step": 4697 + }, + { + "epoch": 0.5570971184631803, + "grad_norm": 1.333362153747004, + "learning_rate": 4.845062666575748e-05, + "loss": 0.4318, + "step": 4698 + }, + { + "epoch": 0.5572157002253053, + "grad_norm": 1.6545421383184948, + "learning_rate": 4.844979467070783e-05, + "loss": 0.6979, + "step": 4699 + }, + { + "epoch": 0.5573342819874303, + "grad_norm": 1.1268896779327935, + "learning_rate": 4.844896245947982e-05, + "loss": 0.4277, + "step": 4700 + }, + { + "epoch": 0.5574528637495553, + "grad_norm": 1.3272964697442295, + "learning_rate": 4.844813003208113e-05, + "loss": 0.4994, + "step": 4701 + }, + { + "epoch": 0.5575714455116803, + "grad_norm": 1.5825628757853425, + "learning_rate": 4.844729738851945e-05, + "loss": 0.9193, + "step": 4702 + }, + { + "epoch": 0.5576900272738052, + "grad_norm": 1.3459667922437653, + "learning_rate": 4.8446464528802424e-05, + "loss": 0.4887, + "step": 4703 + }, + { + "epoch": 0.5578086090359303, + "grad_norm": 1.2766478998110884, + "learning_rate": 4.844563145293776e-05, + "loss": 0.5471, + "step": 4704 + }, + { + "epoch": 0.5579271907980553, + "grad_norm": 1.406882937258525, + "learning_rate": 4.844479816093312e-05, + "loss": 0.6037, + "step": 4705 + }, + { + "epoch": 0.5580457725601803, + "grad_norm": 1.6933303970551348, + "learning_rate": 4.844396465279619e-05, + "loss": 0.6191, + "step": 4706 + }, + { + "epoch": 0.5581643543223053, + "grad_norm": 1.5386064235388461, + "learning_rate": 4.844313092853466e-05, + "loss": 0.6303, + "step": 4707 + }, + { + "epoch": 0.5582829360844302, + "grad_norm": 1.2455152767985185, + "learning_rate": 4.84422969881562e-05, + "loss": 0.4464, + "step": 4708 + }, + { + "epoch": 0.5584015178465552, + "grad_norm": 1.4263966404465287, + "learning_rate": 4.844146283166852e-05, + "loss": 0.5418, + "step": 4709 + }, + { + "epoch": 0.5585200996086802, + "grad_norm": 1.59294352986505, + "learning_rate": 4.8440628459079296e-05, + "loss": 0.6001, + "step": 4710 + }, + { + "epoch": 0.5586386813708052, + "grad_norm": 1.6031613022146738, + "learning_rate": 4.8439793870396214e-05, + "loss": 0.5417, + "step": 4711 + }, + { + "epoch": 0.5587572631329302, + "grad_norm": 1.6887126982323588, + "learning_rate": 4.843895906562699e-05, + "loss": 0.518, + "step": 4712 + }, + { + "epoch": 0.5588758448950552, + "grad_norm": 1.5029864895759364, + "learning_rate": 4.8438124044779297e-05, + "loss": 0.6802, + "step": 4713 + }, + { + "epoch": 0.5589944266571801, + "grad_norm": 1.5880976756775973, + "learning_rate": 4.8437288807860836e-05, + "loss": 0.7062, + "step": 4714 + }, + { + "epoch": 0.5591130084193051, + "grad_norm": 1.9453882148092791, + "learning_rate": 4.843645335487932e-05, + "loss": 0.8142, + "step": 4715 + }, + { + "epoch": 0.5592315901814301, + "grad_norm": 1.5855143360761945, + "learning_rate": 4.843561768584245e-05, + "loss": 0.737, + "step": 4716 + }, + { + "epoch": 0.5593501719435551, + "grad_norm": 1.8650019382239083, + "learning_rate": 4.843478180075792e-05, + "loss": 0.7356, + "step": 4717 + }, + { + "epoch": 0.55946875370568, + "grad_norm": 1.3330071835986503, + "learning_rate": 4.843394569963343e-05, + "loss": 0.5695, + "step": 4718 + }, + { + "epoch": 0.559587335467805, + "grad_norm": 1.268471120248681, + "learning_rate": 4.843310938247671e-05, + "loss": 0.5086, + "step": 4719 + }, + { + "epoch": 0.55970591722993, + "grad_norm": 1.743544632564371, + "learning_rate": 4.843227284929545e-05, + "loss": 0.7171, + "step": 4720 + }, + { + "epoch": 0.559824498992055, + "grad_norm": 1.3831640952292945, + "learning_rate": 4.843143610009737e-05, + "loss": 0.5205, + "step": 4721 + }, + { + "epoch": 0.55994308075418, + "grad_norm": 1.968358051109738, + "learning_rate": 4.843059913489019e-05, + "loss": 0.7948, + "step": 4722 + }, + { + "epoch": 0.560061662516305, + "grad_norm": 1.3426533177863889, + "learning_rate": 4.842976195368161e-05, + "loss": 0.5066, + "step": 4723 + }, + { + "epoch": 0.5601802442784299, + "grad_norm": 1.5651902997881924, + "learning_rate": 4.8428924556479354e-05, + "loss": 0.6285, + "step": 4724 + }, + { + "epoch": 0.5602988260405549, + "grad_norm": 1.4239696680602174, + "learning_rate": 4.8428086943291153e-05, + "loss": 0.6066, + "step": 4725 + }, + { + "epoch": 0.5604174078026799, + "grad_norm": 1.0554433953802749, + "learning_rate": 4.842724911412471e-05, + "loss": 0.4432, + "step": 4726 + }, + { + "epoch": 0.5605359895648049, + "grad_norm": 1.6259876325984461, + "learning_rate": 4.842641106898776e-05, + "loss": 0.5761, + "step": 4727 + }, + { + "epoch": 0.56065457132693, + "grad_norm": 1.5507685335836658, + "learning_rate": 4.8425572807888034e-05, + "loss": 0.607, + "step": 4728 + }, + { + "epoch": 0.560773153089055, + "grad_norm": 1.1633739207168885, + "learning_rate": 4.842473433083325e-05, + "loss": 0.4129, + "step": 4729 + }, + { + "epoch": 0.5608917348511799, + "grad_norm": 2.503725586544328, + "learning_rate": 4.842389563783114e-05, + "loss": 0.8213, + "step": 4730 + }, + { + "epoch": 0.5610103166133049, + "grad_norm": 1.1477729893822515, + "learning_rate": 4.8423056728889436e-05, + "loss": 0.394, + "step": 4731 + }, + { + "epoch": 0.5611288983754299, + "grad_norm": 1.3363624447267668, + "learning_rate": 4.842221760401587e-05, + "loss": 0.4036, + "step": 4732 + }, + { + "epoch": 0.5612474801375549, + "grad_norm": 1.8180172140413364, + "learning_rate": 4.842137826321819e-05, + "loss": 0.7912, + "step": 4733 + }, + { + "epoch": 0.5613660618996799, + "grad_norm": 1.6318346633882967, + "learning_rate": 4.8420538706504115e-05, + "loss": 0.6826, + "step": 4734 + }, + { + "epoch": 0.5614846436618048, + "grad_norm": 1.5000196535253845, + "learning_rate": 4.84196989338814e-05, + "loss": 0.4979, + "step": 4735 + }, + { + "epoch": 0.5616032254239298, + "grad_norm": 1.9539195300356078, + "learning_rate": 4.841885894535777e-05, + "loss": 0.5689, + "step": 4736 + }, + { + "epoch": 0.5617218071860548, + "grad_norm": 1.7550890054268293, + "learning_rate": 4.8418018740940985e-05, + "loss": 0.6404, + "step": 4737 + }, + { + "epoch": 0.5618403889481798, + "grad_norm": 0.9798494563074818, + "learning_rate": 4.841717832063878e-05, + "loss": 0.3949, + "step": 4738 + }, + { + "epoch": 0.5619589707103048, + "grad_norm": 1.8223535306150611, + "learning_rate": 4.841633768445891e-05, + "loss": 0.9005, + "step": 4739 + }, + { + "epoch": 0.5620775524724297, + "grad_norm": 1.6247666035661312, + "learning_rate": 4.841549683240913e-05, + "loss": 0.5935, + "step": 4740 + }, + { + "epoch": 0.5621961342345547, + "grad_norm": 1.3391693512383898, + "learning_rate": 4.841465576449717e-05, + "loss": 0.4234, + "step": 4741 + }, + { + "epoch": 0.5623147159966797, + "grad_norm": 1.3294420812500403, + "learning_rate": 4.8413814480730804e-05, + "loss": 0.4427, + "step": 4742 + }, + { + "epoch": 0.5624332977588047, + "grad_norm": 1.387040981635194, + "learning_rate": 4.841297298111778e-05, + "loss": 0.5338, + "step": 4743 + }, + { + "epoch": 0.5625518795209297, + "grad_norm": 1.7385073060511986, + "learning_rate": 4.841213126566585e-05, + "loss": 0.6415, + "step": 4744 + }, + { + "epoch": 0.5626704612830546, + "grad_norm": 1.3972870876051866, + "learning_rate": 4.841128933438278e-05, + "loss": 0.5391, + "step": 4745 + }, + { + "epoch": 0.5627890430451796, + "grad_norm": 1.4282435188573608, + "learning_rate": 4.841044718727633e-05, + "loss": 0.4641, + "step": 4746 + }, + { + "epoch": 0.5629076248073046, + "grad_norm": 1.6772967185691308, + "learning_rate": 4.840960482435427e-05, + "loss": 0.8723, + "step": 4747 + }, + { + "epoch": 0.5630262065694296, + "grad_norm": 1.3003424369656504, + "learning_rate": 4.840876224562436e-05, + "loss": 0.5178, + "step": 4748 + }, + { + "epoch": 0.5631447883315546, + "grad_norm": 1.3592832465302598, + "learning_rate": 4.840791945109437e-05, + "loss": 0.4712, + "step": 4749 + }, + { + "epoch": 0.5632633700936795, + "grad_norm": 1.7409042990156294, + "learning_rate": 4.840707644077207e-05, + "loss": 0.5227, + "step": 4750 + }, + { + "epoch": 0.5633819518558045, + "grad_norm": 1.3834563986530066, + "learning_rate": 4.840623321466522e-05, + "loss": 0.4417, + "step": 4751 + }, + { + "epoch": 0.5635005336179295, + "grad_norm": 1.5743968641708326, + "learning_rate": 4.8405389772781604e-05, + "loss": 0.658, + "step": 4752 + }, + { + "epoch": 0.5636191153800546, + "grad_norm": 1.4946443049600469, + "learning_rate": 4.8404546115129e-05, + "loss": 0.6127, + "step": 4753 + }, + { + "epoch": 0.5637376971421796, + "grad_norm": 1.5343673568153964, + "learning_rate": 4.8403702241715174e-05, + "loss": 0.6689, + "step": 4754 + }, + { + "epoch": 0.5638562789043046, + "grad_norm": 2.018980568926531, + "learning_rate": 4.8402858152547925e-05, + "loss": 0.8016, + "step": 4755 + }, + { + "epoch": 0.5639748606664295, + "grad_norm": 1.7339979206308598, + "learning_rate": 4.840201384763501e-05, + "loss": 0.8267, + "step": 4756 + }, + { + "epoch": 0.5640934424285545, + "grad_norm": 1.4941538325678674, + "learning_rate": 4.840116932698423e-05, + "loss": 0.6551, + "step": 4757 + }, + { + "epoch": 0.5642120241906795, + "grad_norm": 1.2041908726621509, + "learning_rate": 4.840032459060336e-05, + "loss": 0.4446, + "step": 4758 + }, + { + "epoch": 0.5643306059528045, + "grad_norm": 1.770585855121199, + "learning_rate": 4.8399479638500204e-05, + "loss": 0.7724, + "step": 4759 + }, + { + "epoch": 0.5644491877149295, + "grad_norm": 1.0882064892022583, + "learning_rate": 4.8398634470682535e-05, + "loss": 0.3883, + "step": 4760 + }, + { + "epoch": 0.5645677694770544, + "grad_norm": 1.635651968994243, + "learning_rate": 4.8397789087158155e-05, + "loss": 0.6636, + "step": 4761 + }, + { + "epoch": 0.5646863512391794, + "grad_norm": 1.4499353026304567, + "learning_rate": 4.839694348793484e-05, + "loss": 0.6702, + "step": 4762 + }, + { + "epoch": 0.5648049330013044, + "grad_norm": 1.4592886735342656, + "learning_rate": 4.8396097673020404e-05, + "loss": 0.6178, + "step": 4763 + }, + { + "epoch": 0.5649235147634294, + "grad_norm": 1.3886414655596917, + "learning_rate": 4.839525164242263e-05, + "loss": 0.4692, + "step": 4764 + }, + { + "epoch": 0.5650420965255544, + "grad_norm": 1.5319432267792714, + "learning_rate": 4.839440539614933e-05, + "loss": 0.7857, + "step": 4765 + }, + { + "epoch": 0.5651606782876794, + "grad_norm": 1.4527857393827315, + "learning_rate": 4.8393558934208306e-05, + "loss": 0.6159, + "step": 4766 + }, + { + "epoch": 0.5652792600498043, + "grad_norm": 1.3981707300645687, + "learning_rate": 4.839271225660735e-05, + "loss": 0.5045, + "step": 4767 + }, + { + "epoch": 0.5653978418119293, + "grad_norm": 1.6417564114427956, + "learning_rate": 4.8391865363354276e-05, + "loss": 0.8158, + "step": 4768 + }, + { + "epoch": 0.5655164235740543, + "grad_norm": 1.447321410698706, + "learning_rate": 4.839101825445689e-05, + "loss": 0.4194, + "step": 4769 + }, + { + "epoch": 0.5656350053361793, + "grad_norm": 1.6137992842584659, + "learning_rate": 4.8390170929922996e-05, + "loss": 0.6676, + "step": 4770 + }, + { + "epoch": 0.5657535870983043, + "grad_norm": 1.4412999180797266, + "learning_rate": 4.838932338976041e-05, + "loss": 0.6983, + "step": 4771 + }, + { + "epoch": 0.5658721688604292, + "grad_norm": 1.3089931996286113, + "learning_rate": 4.838847563397694e-05, + "loss": 0.4258, + "step": 4772 + }, + { + "epoch": 0.5659907506225542, + "grad_norm": 1.354614232310771, + "learning_rate": 4.838762766258041e-05, + "loss": 0.4004, + "step": 4773 + }, + { + "epoch": 0.5661093323846792, + "grad_norm": 1.6229047366852642, + "learning_rate": 4.8386779475578634e-05, + "loss": 0.6694, + "step": 4774 + }, + { + "epoch": 0.5662279141468042, + "grad_norm": 1.237000712125034, + "learning_rate": 4.838593107297943e-05, + "loss": 0.4017, + "step": 4775 + }, + { + "epoch": 0.5663464959089292, + "grad_norm": 1.731393118404643, + "learning_rate": 4.8385082454790607e-05, + "loss": 0.6467, + "step": 4776 + }, + { + "epoch": 0.5664650776710541, + "grad_norm": 2.0433069948636176, + "learning_rate": 4.8384233621020014e-05, + "loss": 0.7634, + "step": 4777 + }, + { + "epoch": 0.5665836594331792, + "grad_norm": 1.4422596845204285, + "learning_rate": 4.838338457167546e-05, + "loss": 0.4358, + "step": 4778 + }, + { + "epoch": 0.5667022411953042, + "grad_norm": 1.7644370197805088, + "learning_rate": 4.8382535306764775e-05, + "loss": 0.6541, + "step": 4779 + }, + { + "epoch": 0.5668208229574292, + "grad_norm": 2.6601285494283116, + "learning_rate": 4.838168582629579e-05, + "loss": 0.7526, + "step": 4780 + }, + { + "epoch": 0.5669394047195542, + "grad_norm": 1.3288981878863133, + "learning_rate": 4.838083613027633e-05, + "loss": 0.5614, + "step": 4781 + }, + { + "epoch": 0.5670579864816792, + "grad_norm": 1.5775407125323604, + "learning_rate": 4.837998621871424e-05, + "loss": 0.5134, + "step": 4782 + }, + { + "epoch": 0.5671765682438041, + "grad_norm": 1.4348938273809067, + "learning_rate": 4.837913609161733e-05, + "loss": 0.4587, + "step": 4783 + }, + { + "epoch": 0.5672951500059291, + "grad_norm": 1.3490908365432095, + "learning_rate": 4.837828574899347e-05, + "loss": 0.5812, + "step": 4784 + }, + { + "epoch": 0.5674137317680541, + "grad_norm": 1.356587603427273, + "learning_rate": 4.837743519085048e-05, + "loss": 0.5454, + "step": 4785 + }, + { + "epoch": 0.5675323135301791, + "grad_norm": 1.2704706964790042, + "learning_rate": 4.83765844171962e-05, + "loss": 0.503, + "step": 4786 + }, + { + "epoch": 0.5676508952923041, + "grad_norm": 1.339491676843473, + "learning_rate": 4.8375733428038486e-05, + "loss": 0.4458, + "step": 4787 + }, + { + "epoch": 0.567769477054429, + "grad_norm": 1.4618347600618564, + "learning_rate": 4.837488222338517e-05, + "loss": 0.5593, + "step": 4788 + }, + { + "epoch": 0.567888058816554, + "grad_norm": 1.403965826210278, + "learning_rate": 4.8374030803244105e-05, + "loss": 0.4938, + "step": 4789 + }, + { + "epoch": 0.568006640578679, + "grad_norm": 1.2494360523289174, + "learning_rate": 4.837317916762314e-05, + "loss": 0.501, + "step": 4790 + }, + { + "epoch": 0.568125222340804, + "grad_norm": 1.237857500938287, + "learning_rate": 4.8372327316530125e-05, + "loss": 0.386, + "step": 4791 + }, + { + "epoch": 0.568243804102929, + "grad_norm": 1.7492511135738555, + "learning_rate": 4.837147524997291e-05, + "loss": 0.6377, + "step": 4792 + }, + { + "epoch": 0.5683623858650539, + "grad_norm": 1.751645263389289, + "learning_rate": 4.8370622967959356e-05, + "loss": 0.6275, + "step": 4793 + }, + { + "epoch": 0.5684809676271789, + "grad_norm": 1.624975742164375, + "learning_rate": 4.836977047049732e-05, + "loss": 0.6145, + "step": 4794 + }, + { + "epoch": 0.5685995493893039, + "grad_norm": 1.4316480031103396, + "learning_rate": 4.8368917757594654e-05, + "loss": 0.4117, + "step": 4795 + }, + { + "epoch": 0.5687181311514289, + "grad_norm": 1.5419529746291927, + "learning_rate": 4.836806482925923e-05, + "loss": 0.5081, + "step": 4796 + }, + { + "epoch": 0.5688367129135539, + "grad_norm": 1.4269507270524802, + "learning_rate": 4.836721168549889e-05, + "loss": 0.4771, + "step": 4797 + }, + { + "epoch": 0.5689552946756788, + "grad_norm": 1.6029893055680104, + "learning_rate": 4.836635832632153e-05, + "loss": 0.4848, + "step": 4798 + }, + { + "epoch": 0.5690738764378038, + "grad_norm": 1.5445006162504786, + "learning_rate": 4.836550475173499e-05, + "loss": 0.5116, + "step": 4799 + }, + { + "epoch": 0.5691924581999288, + "grad_norm": 1.502185585772934, + "learning_rate": 4.8364650961747146e-05, + "loss": 0.5192, + "step": 4800 + }, + { + "epoch": 0.5693110399620538, + "grad_norm": 1.7086379684746391, + "learning_rate": 4.836379695636588e-05, + "loss": 0.5508, + "step": 4801 + }, + { + "epoch": 0.5694296217241788, + "grad_norm": 1.4074781314720364, + "learning_rate": 4.836294273559905e-05, + "loss": 0.5024, + "step": 4802 + }, + { + "epoch": 0.5695482034863039, + "grad_norm": 1.855059781593241, + "learning_rate": 4.836208829945454e-05, + "loss": 0.7017, + "step": 4803 + }, + { + "epoch": 0.5696667852484288, + "grad_norm": 1.2440513454133826, + "learning_rate": 4.836123364794023e-05, + "loss": 0.3513, + "step": 4804 + }, + { + "epoch": 0.5697853670105538, + "grad_norm": 1.4409402767150834, + "learning_rate": 4.836037878106399e-05, + "loss": 0.5128, + "step": 4805 + }, + { + "epoch": 0.5699039487726788, + "grad_norm": 1.3500115409600408, + "learning_rate": 4.83595236988337e-05, + "loss": 0.5079, + "step": 4806 + }, + { + "epoch": 0.5700225305348038, + "grad_norm": 2.09348074297985, + "learning_rate": 4.835866840125725e-05, + "loss": 1.0849, + "step": 4807 + }, + { + "epoch": 0.5701411122969288, + "grad_norm": 1.4319208906596437, + "learning_rate": 4.835781288834251e-05, + "loss": 0.4667, + "step": 4808 + }, + { + "epoch": 0.5702596940590537, + "grad_norm": 1.1776032007354145, + "learning_rate": 4.835695716009739e-05, + "loss": 0.4756, + "step": 4809 + }, + { + "epoch": 0.5703782758211787, + "grad_norm": 1.5882373105671523, + "learning_rate": 4.835610121652977e-05, + "loss": 0.6918, + "step": 4810 + }, + { + "epoch": 0.5704968575833037, + "grad_norm": 1.483576580982176, + "learning_rate": 4.8355245057647535e-05, + "loss": 0.5241, + "step": 4811 + }, + { + "epoch": 0.5706154393454287, + "grad_norm": 1.3828462724350303, + "learning_rate": 4.835438868345858e-05, + "loss": 0.5974, + "step": 4812 + }, + { + "epoch": 0.5707340211075537, + "grad_norm": 1.742183237128271, + "learning_rate": 4.83535320939708e-05, + "loss": 0.6456, + "step": 4813 + }, + { + "epoch": 0.5708526028696786, + "grad_norm": 1.3167680761724174, + "learning_rate": 4.835267528919209e-05, + "loss": 0.6312, + "step": 4814 + }, + { + "epoch": 0.5709711846318036, + "grad_norm": 1.4243607218591852, + "learning_rate": 4.8351818269130356e-05, + "loss": 0.6578, + "step": 4815 + }, + { + "epoch": 0.5710897663939286, + "grad_norm": 1.5903253628833416, + "learning_rate": 4.835096103379348e-05, + "loss": 0.5378, + "step": 4816 + }, + { + "epoch": 0.5712083481560536, + "grad_norm": 1.5103955404738643, + "learning_rate": 4.83501035831894e-05, + "loss": 0.5794, + "step": 4817 + }, + { + "epoch": 0.5713269299181786, + "grad_norm": 1.4801726843092147, + "learning_rate": 4.834924591732598e-05, + "loss": 0.4829, + "step": 4818 + }, + { + "epoch": 0.5714455116803036, + "grad_norm": 1.3714042197381975, + "learning_rate": 4.834838803621115e-05, + "loss": 0.6635, + "step": 4819 + }, + { + "epoch": 0.5715640934424285, + "grad_norm": 1.3490181247544406, + "learning_rate": 4.834752993985282e-05, + "loss": 0.6085, + "step": 4820 + }, + { + "epoch": 0.5716826752045535, + "grad_norm": 1.4270539644102322, + "learning_rate": 4.8346671628258896e-05, + "loss": 0.7319, + "step": 4821 + }, + { + "epoch": 0.5718012569666785, + "grad_norm": 1.4508676104723428, + "learning_rate": 4.834581310143728e-05, + "loss": 0.5925, + "step": 4822 + }, + { + "epoch": 0.5719198387288035, + "grad_norm": 1.4625955935839727, + "learning_rate": 4.83449543593959e-05, + "loss": 0.5281, + "step": 4823 + }, + { + "epoch": 0.5720384204909285, + "grad_norm": 1.3399936912690542, + "learning_rate": 4.834409540214267e-05, + "loss": 0.6165, + "step": 4824 + }, + { + "epoch": 0.5721570022530534, + "grad_norm": 1.5446371523837137, + "learning_rate": 4.8343236229685506e-05, + "loss": 0.6953, + "step": 4825 + }, + { + "epoch": 0.5722755840151784, + "grad_norm": 1.4771672213875835, + "learning_rate": 4.834237684203233e-05, + "loss": 0.6779, + "step": 4826 + }, + { + "epoch": 0.5723941657773034, + "grad_norm": 1.7909951036044593, + "learning_rate": 4.834151723919106e-05, + "loss": 0.837, + "step": 4827 + }, + { + "epoch": 0.5725127475394285, + "grad_norm": 1.3642387076290443, + "learning_rate": 4.834065742116962e-05, + "loss": 0.529, + "step": 4828 + }, + { + "epoch": 0.5726313293015535, + "grad_norm": 1.5104856199540138, + "learning_rate": 4.8339797387975946e-05, + "loss": 0.5356, + "step": 4829 + }, + { + "epoch": 0.5727499110636785, + "grad_norm": 1.3724178144977826, + "learning_rate": 4.8338937139617966e-05, + "loss": 0.4764, + "step": 4830 + }, + { + "epoch": 0.5728684928258034, + "grad_norm": 1.429346299228928, + "learning_rate": 4.833807667610361e-05, + "loss": 0.6614, + "step": 4831 + }, + { + "epoch": 0.5729870745879284, + "grad_norm": 1.1783626037822665, + "learning_rate": 4.8337215997440794e-05, + "loss": 0.4015, + "step": 4832 + }, + { + "epoch": 0.5731056563500534, + "grad_norm": 1.532441301078974, + "learning_rate": 4.8336355103637467e-05, + "loss": 0.6641, + "step": 4833 + }, + { + "epoch": 0.5732242381121784, + "grad_norm": 1.7186878417362013, + "learning_rate": 4.833549399470157e-05, + "loss": 0.7059, + "step": 4834 + }, + { + "epoch": 0.5733428198743034, + "grad_norm": 1.5725811830755916, + "learning_rate": 4.833463267064102e-05, + "loss": 0.6757, + "step": 4835 + }, + { + "epoch": 0.5734614016364283, + "grad_norm": 1.6018501065712376, + "learning_rate": 4.833377113146379e-05, + "loss": 0.5661, + "step": 4836 + }, + { + "epoch": 0.5735799833985533, + "grad_norm": 1.4558711381852625, + "learning_rate": 4.833290937717779e-05, + "loss": 0.5581, + "step": 4837 + }, + { + "epoch": 0.5736985651606783, + "grad_norm": 1.7372117374703748, + "learning_rate": 4.833204740779098e-05, + "loss": 0.6422, + "step": 4838 + }, + { + "epoch": 0.5738171469228033, + "grad_norm": 1.5390687407327341, + "learning_rate": 4.8331185223311314e-05, + "loss": 0.5437, + "step": 4839 + }, + { + "epoch": 0.5739357286849283, + "grad_norm": 1.5383781286764457, + "learning_rate": 4.833032282374672e-05, + "loss": 0.5266, + "step": 4840 + }, + { + "epoch": 0.5740543104470532, + "grad_norm": 1.4064972067128003, + "learning_rate": 4.832946020910516e-05, + "loss": 0.4996, + "step": 4841 + }, + { + "epoch": 0.5741728922091782, + "grad_norm": 1.6782330732049084, + "learning_rate": 4.832859737939459e-05, + "loss": 0.676, + "step": 4842 + }, + { + "epoch": 0.5742914739713032, + "grad_norm": 1.5288524495336786, + "learning_rate": 4.832773433462296e-05, + "loss": 0.5151, + "step": 4843 + }, + { + "epoch": 0.5744100557334282, + "grad_norm": 1.4646171075380927, + "learning_rate": 4.832687107479823e-05, + "loss": 0.4943, + "step": 4844 + }, + { + "epoch": 0.5745286374955532, + "grad_norm": 1.5785370476225866, + "learning_rate": 4.832600759992835e-05, + "loss": 0.4718, + "step": 4845 + }, + { + "epoch": 0.5746472192576781, + "grad_norm": 2.081196254034374, + "learning_rate": 4.832514391002128e-05, + "loss": 0.5636, + "step": 4846 + }, + { + "epoch": 0.5747658010198031, + "grad_norm": 1.4689289371789283, + "learning_rate": 4.832428000508498e-05, + "loss": 0.5522, + "step": 4847 + }, + { + "epoch": 0.5748843827819281, + "grad_norm": 1.4572796675293904, + "learning_rate": 4.8323415885127434e-05, + "loss": 0.6254, + "step": 4848 + }, + { + "epoch": 0.5750029645440531, + "grad_norm": 1.2066909395011887, + "learning_rate": 4.832255155015659e-05, + "loss": 0.4543, + "step": 4849 + }, + { + "epoch": 0.5751215463061781, + "grad_norm": 1.294971829524541, + "learning_rate": 4.832168700018042e-05, + "loss": 0.6465, + "step": 4850 + }, + { + "epoch": 0.575240128068303, + "grad_norm": 1.3934296154693546, + "learning_rate": 4.8320822235206894e-05, + "loss": 0.6258, + "step": 4851 + }, + { + "epoch": 0.575358709830428, + "grad_norm": 1.4605865133660587, + "learning_rate": 4.831995725524399e-05, + "loss": 0.4917, + "step": 4852 + }, + { + "epoch": 0.5754772915925531, + "grad_norm": 1.3047208604074407, + "learning_rate": 4.831909206029967e-05, + "loss": 0.3776, + "step": 4853 + }, + { + "epoch": 0.5755958733546781, + "grad_norm": 1.4203365445979046, + "learning_rate": 4.8318226650381914e-05, + "loss": 0.6028, + "step": 4854 + }, + { + "epoch": 0.5757144551168031, + "grad_norm": 1.3692200425246621, + "learning_rate": 4.8317361025498706e-05, + "loss": 0.5687, + "step": 4855 + }, + { + "epoch": 0.5758330368789281, + "grad_norm": 1.5018778378169475, + "learning_rate": 4.831649518565803e-05, + "loss": 0.4905, + "step": 4856 + }, + { + "epoch": 0.575951618641053, + "grad_norm": 1.791021420538173, + "learning_rate": 4.831562913086786e-05, + "loss": 0.7953, + "step": 4857 + }, + { + "epoch": 0.576070200403178, + "grad_norm": 1.8302943990569807, + "learning_rate": 4.831476286113618e-05, + "loss": 0.6317, + "step": 4858 + }, + { + "epoch": 0.576188782165303, + "grad_norm": 1.3034777713381278, + "learning_rate": 4.831389637647097e-05, + "loss": 0.4473, + "step": 4859 + }, + { + "epoch": 0.576307363927428, + "grad_norm": 1.2148598598879456, + "learning_rate": 4.8313029676880226e-05, + "loss": 0.4011, + "step": 4860 + }, + { + "epoch": 0.576425945689553, + "grad_norm": 2.001698884347611, + "learning_rate": 4.831216276237194e-05, + "loss": 0.8231, + "step": 4861 + }, + { + "epoch": 0.576544527451678, + "grad_norm": 1.7160727286238915, + "learning_rate": 4.8311295632954103e-05, + "loss": 0.6059, + "step": 4862 + }, + { + "epoch": 0.5766631092138029, + "grad_norm": 1.465416724556384, + "learning_rate": 4.8310428288634704e-05, + "loss": 0.6369, + "step": 4863 + }, + { + "epoch": 0.5767816909759279, + "grad_norm": 1.5785509865479557, + "learning_rate": 4.8309560729421745e-05, + "loss": 0.7225, + "step": 4864 + }, + { + "epoch": 0.5769002727380529, + "grad_norm": 1.7649696416874066, + "learning_rate": 4.8308692955323215e-05, + "loss": 0.6663, + "step": 4865 + }, + { + "epoch": 0.5770188545001779, + "grad_norm": 1.4939002469595135, + "learning_rate": 4.830782496634712e-05, + "loss": 0.5848, + "step": 4866 + }, + { + "epoch": 0.5771374362623028, + "grad_norm": 1.4417408068000643, + "learning_rate": 4.830695676250146e-05, + "loss": 0.6056, + "step": 4867 + }, + { + "epoch": 0.5772560180244278, + "grad_norm": 1.3491585682881249, + "learning_rate": 4.830608834379424e-05, + "loss": 0.5059, + "step": 4868 + }, + { + "epoch": 0.5773745997865528, + "grad_norm": 1.6677528219985802, + "learning_rate": 4.8305219710233465e-05, + "loss": 0.7372, + "step": 4869 + }, + { + "epoch": 0.5774931815486778, + "grad_norm": 1.4175237891988401, + "learning_rate": 4.8304350861827145e-05, + "loss": 0.6367, + "step": 4870 + }, + { + "epoch": 0.5776117633108028, + "grad_norm": 1.1290033384211742, + "learning_rate": 4.830348179858329e-05, + "loss": 0.5079, + "step": 4871 + }, + { + "epoch": 0.5777303450729278, + "grad_norm": 1.3794354136245188, + "learning_rate": 4.8302612520509905e-05, + "loss": 0.5143, + "step": 4872 + }, + { + "epoch": 0.5778489268350527, + "grad_norm": 1.348214567955498, + "learning_rate": 4.830174302761501e-05, + "loss": 0.5502, + "step": 4873 + }, + { + "epoch": 0.5779675085971777, + "grad_norm": 1.3002658754122565, + "learning_rate": 4.8300873319906614e-05, + "loss": 0.5729, + "step": 4874 + }, + { + "epoch": 0.5780860903593027, + "grad_norm": 1.43187834388376, + "learning_rate": 4.830000339739275e-05, + "loss": 0.5805, + "step": 4875 + }, + { + "epoch": 0.5782046721214277, + "grad_norm": 1.681641316296796, + "learning_rate": 4.829913326008142e-05, + "loss": 0.9402, + "step": 4876 + }, + { + "epoch": 0.5783232538835527, + "grad_norm": 1.5014871931669835, + "learning_rate": 4.829826290798065e-05, + "loss": 0.4971, + "step": 4877 + }, + { + "epoch": 0.5784418356456777, + "grad_norm": 1.7353059919276639, + "learning_rate": 4.829739234109847e-05, + "loss": 0.7456, + "step": 4878 + }, + { + "epoch": 0.5785604174078027, + "grad_norm": 1.8220243907551628, + "learning_rate": 4.82965215594429e-05, + "loss": 0.7658, + "step": 4879 + }, + { + "epoch": 0.5786789991699277, + "grad_norm": 2.5123982332767842, + "learning_rate": 4.829565056302197e-05, + "loss": 0.8431, + "step": 4880 + }, + { + "epoch": 0.5787975809320527, + "grad_norm": 1.5265359858221195, + "learning_rate": 4.829477935184371e-05, + "loss": 0.4993, + "step": 4881 + }, + { + "epoch": 0.5789161626941777, + "grad_norm": 1.4847234182651676, + "learning_rate": 4.829390792591615e-05, + "loss": 0.6811, + "step": 4882 + }, + { + "epoch": 0.5790347444563027, + "grad_norm": 1.2364768714494359, + "learning_rate": 4.829303628524732e-05, + "loss": 0.5412, + "step": 4883 + }, + { + "epoch": 0.5791533262184276, + "grad_norm": 1.2376614496573963, + "learning_rate": 4.8292164429845266e-05, + "loss": 0.5348, + "step": 4884 + }, + { + "epoch": 0.5792719079805526, + "grad_norm": 1.9906586451505632, + "learning_rate": 4.8291292359718014e-05, + "loss": 0.6483, + "step": 4885 + }, + { + "epoch": 0.5793904897426776, + "grad_norm": 1.7061451496650775, + "learning_rate": 4.829042007487361e-05, + "loss": 0.7485, + "step": 4886 + }, + { + "epoch": 0.5795090715048026, + "grad_norm": 1.2467610140821408, + "learning_rate": 4.828954757532009e-05, + "loss": 0.6076, + "step": 4887 + }, + { + "epoch": 0.5796276532669276, + "grad_norm": 1.4763793485777257, + "learning_rate": 4.828867486106551e-05, + "loss": 0.7074, + "step": 4888 + }, + { + "epoch": 0.5797462350290525, + "grad_norm": 1.2236173759463442, + "learning_rate": 4.828780193211789e-05, + "loss": 0.4563, + "step": 4889 + }, + { + "epoch": 0.5798648167911775, + "grad_norm": 1.2750074759086636, + "learning_rate": 4.828692878848531e-05, + "loss": 0.4611, + "step": 4890 + }, + { + "epoch": 0.5799833985533025, + "grad_norm": 1.7650976204135476, + "learning_rate": 4.8286055430175794e-05, + "loss": 0.7461, + "step": 4891 + }, + { + "epoch": 0.5801019803154275, + "grad_norm": 1.709284016971087, + "learning_rate": 4.828518185719741e-05, + "loss": 0.767, + "step": 4892 + }, + { + "epoch": 0.5802205620775525, + "grad_norm": 1.3724087995714866, + "learning_rate": 4.828430806955819e-05, + "loss": 0.5897, + "step": 4893 + }, + { + "epoch": 0.5803391438396774, + "grad_norm": 1.2858287207298587, + "learning_rate": 4.828343406726622e-05, + "loss": 0.5592, + "step": 4894 + }, + { + "epoch": 0.5804577256018024, + "grad_norm": 1.6962204296770302, + "learning_rate": 4.828255985032953e-05, + "loss": 0.7195, + "step": 4895 + }, + { + "epoch": 0.5805763073639274, + "grad_norm": 1.3551283545324055, + "learning_rate": 4.8281685418756184e-05, + "loss": 0.4472, + "step": 4896 + }, + { + "epoch": 0.5806948891260524, + "grad_norm": 1.4174282369468612, + "learning_rate": 4.8280810772554256e-05, + "loss": 0.4871, + "step": 4897 + }, + { + "epoch": 0.5808134708881774, + "grad_norm": 1.5523607403460744, + "learning_rate": 4.82799359117318e-05, + "loss": 0.6806, + "step": 4898 + }, + { + "epoch": 0.5809320526503023, + "grad_norm": 1.6277294374772822, + "learning_rate": 4.8279060836296887e-05, + "loss": 0.696, + "step": 4899 + }, + { + "epoch": 0.5810506344124273, + "grad_norm": 1.2411936235414647, + "learning_rate": 4.827818554625757e-05, + "loss": 0.3197, + "step": 4900 + }, + { + "epoch": 0.5811692161745523, + "grad_norm": 1.4946612232004128, + "learning_rate": 4.827731004162194e-05, + "loss": 0.5203, + "step": 4901 + }, + { + "epoch": 0.5812877979366773, + "grad_norm": 1.3031251426609378, + "learning_rate": 4.827643432239804e-05, + "loss": 0.5806, + "step": 4902 + }, + { + "epoch": 0.5814063796988024, + "grad_norm": 1.672332650845054, + "learning_rate": 4.827555838859397e-05, + "loss": 0.6177, + "step": 4903 + }, + { + "epoch": 0.5815249614609274, + "grad_norm": 1.491810752581244, + "learning_rate": 4.8274682240217796e-05, + "loss": 0.5896, + "step": 4904 + }, + { + "epoch": 0.5816435432230523, + "grad_norm": 1.7958790336490063, + "learning_rate": 4.827380587727759e-05, + "loss": 0.7725, + "step": 4905 + }, + { + "epoch": 0.5817621249851773, + "grad_norm": 1.237186784610053, + "learning_rate": 4.8272929299781436e-05, + "loss": 0.47, + "step": 4906 + }, + { + "epoch": 0.5818807067473023, + "grad_norm": 1.4772997688547798, + "learning_rate": 4.8272052507737404e-05, + "loss": 0.6071, + "step": 4907 + }, + { + "epoch": 0.5819992885094273, + "grad_norm": 1.4568872642056592, + "learning_rate": 4.8271175501153594e-05, + "loss": 0.6089, + "step": 4908 + }, + { + "epoch": 0.5821178702715523, + "grad_norm": 1.5142635627058525, + "learning_rate": 4.8270298280038076e-05, + "loss": 0.5942, + "step": 4909 + }, + { + "epoch": 0.5822364520336772, + "grad_norm": 1.278251935284489, + "learning_rate": 4.826942084439895e-05, + "loss": 0.3848, + "step": 4910 + }, + { + "epoch": 0.5823550337958022, + "grad_norm": 1.2796707713353495, + "learning_rate": 4.82685431942443e-05, + "loss": 0.3911, + "step": 4911 + }, + { + "epoch": 0.5824736155579272, + "grad_norm": 1.2421889716053645, + "learning_rate": 4.826766532958221e-05, + "loss": 0.4456, + "step": 4912 + }, + { + "epoch": 0.5825921973200522, + "grad_norm": 1.2465262725294888, + "learning_rate": 4.8266787250420786e-05, + "loss": 0.5373, + "step": 4913 + }, + { + "epoch": 0.5827107790821772, + "grad_norm": 1.7212458919102307, + "learning_rate": 4.826590895676811e-05, + "loss": 0.7864, + "step": 4914 + }, + { + "epoch": 0.5828293608443021, + "grad_norm": 1.5982046664302494, + "learning_rate": 4.826503044863228e-05, + "loss": 0.4658, + "step": 4915 + }, + { + "epoch": 0.5829479426064271, + "grad_norm": 1.5409932073947756, + "learning_rate": 4.8264151726021406e-05, + "loss": 0.7047, + "step": 4916 + }, + { + "epoch": 0.5830665243685521, + "grad_norm": 1.528264294478824, + "learning_rate": 4.826327278894358e-05, + "loss": 0.5087, + "step": 4917 + }, + { + "epoch": 0.5831851061306771, + "grad_norm": 1.5136365235774667, + "learning_rate": 4.82623936374069e-05, + "loss": 0.5912, + "step": 4918 + }, + { + "epoch": 0.5833036878928021, + "grad_norm": 1.5453141096085028, + "learning_rate": 4.826151427141948e-05, + "loss": 0.4833, + "step": 4919 + }, + { + "epoch": 0.583422269654927, + "grad_norm": 1.9146128166401783, + "learning_rate": 4.8260634690989424e-05, + "loss": 0.6219, + "step": 4920 + }, + { + "epoch": 0.583540851417052, + "grad_norm": 1.457959066505368, + "learning_rate": 4.8259754896124845e-05, + "loss": 0.5451, + "step": 4921 + }, + { + "epoch": 0.583659433179177, + "grad_norm": 1.209992708492431, + "learning_rate": 4.825887488683385e-05, + "loss": 0.4083, + "step": 4922 + }, + { + "epoch": 0.583778014941302, + "grad_norm": 1.320595721456107, + "learning_rate": 4.825799466312454e-05, + "loss": 0.3837, + "step": 4923 + }, + { + "epoch": 0.583896596703427, + "grad_norm": 1.6044715009048895, + "learning_rate": 4.8257114225005054e-05, + "loss": 0.5695, + "step": 4924 + }, + { + "epoch": 0.584015178465552, + "grad_norm": 1.6327763504325006, + "learning_rate": 4.825623357248348e-05, + "loss": 0.5824, + "step": 4925 + }, + { + "epoch": 0.5841337602276769, + "grad_norm": 1.907440586977788, + "learning_rate": 4.8255352705567956e-05, + "loss": 0.7961, + "step": 4926 + }, + { + "epoch": 0.5842523419898019, + "grad_norm": 1.7577270539680532, + "learning_rate": 4.825447162426661e-05, + "loss": 0.6645, + "step": 4927 + }, + { + "epoch": 0.584370923751927, + "grad_norm": 1.3533206572565841, + "learning_rate": 4.8253590328587534e-05, + "loss": 0.4939, + "step": 4928 + }, + { + "epoch": 0.584489505514052, + "grad_norm": 1.477625250675079, + "learning_rate": 4.8252708818538876e-05, + "loss": 0.3235, + "step": 4929 + }, + { + "epoch": 0.584608087276177, + "grad_norm": 1.2720623407303047, + "learning_rate": 4.825182709412876e-05, + "loss": 0.4572, + "step": 4930 + }, + { + "epoch": 0.584726669038302, + "grad_norm": 1.4111563928870146, + "learning_rate": 4.825094515536531e-05, + "loss": 0.4253, + "step": 4931 + }, + { + "epoch": 0.5848452508004269, + "grad_norm": 1.2850744846249977, + "learning_rate": 4.8250063002256665e-05, + "loss": 0.4387, + "step": 4932 + }, + { + "epoch": 0.5849638325625519, + "grad_norm": 1.4407256947563034, + "learning_rate": 4.824918063481094e-05, + "loss": 0.4781, + "step": 4933 + }, + { + "epoch": 0.5850824143246769, + "grad_norm": 1.4627077910974255, + "learning_rate": 4.824829805303628e-05, + "loss": 0.5704, + "step": 4934 + }, + { + "epoch": 0.5852009960868019, + "grad_norm": 1.616998837186328, + "learning_rate": 4.824741525694083e-05, + "loss": 0.6087, + "step": 4935 + }, + { + "epoch": 0.5853195778489269, + "grad_norm": 1.7573537873370957, + "learning_rate": 4.824653224653271e-05, + "loss": 0.7319, + "step": 4936 + }, + { + "epoch": 0.5854381596110518, + "grad_norm": 1.7875488432993492, + "learning_rate": 4.824564902182007e-05, + "loss": 0.5707, + "step": 4937 + }, + { + "epoch": 0.5855567413731768, + "grad_norm": 1.8596330290759115, + "learning_rate": 4.8244765582811055e-05, + "loss": 0.8066, + "step": 4938 + }, + { + "epoch": 0.5856753231353018, + "grad_norm": 1.5386017555599303, + "learning_rate": 4.8243881929513806e-05, + "loss": 0.5972, + "step": 4939 + }, + { + "epoch": 0.5857939048974268, + "grad_norm": 1.618275291694729, + "learning_rate": 4.8242998061936465e-05, + "loss": 0.6311, + "step": 4940 + }, + { + "epoch": 0.5859124866595518, + "grad_norm": 1.5911571473570507, + "learning_rate": 4.824211398008719e-05, + "loss": 0.5098, + "step": 4941 + }, + { + "epoch": 0.5860310684216767, + "grad_norm": 1.3264386395090593, + "learning_rate": 4.824122968397412e-05, + "loss": 0.4223, + "step": 4942 + }, + { + "epoch": 0.5861496501838017, + "grad_norm": 1.497922970845622, + "learning_rate": 4.8240345173605416e-05, + "loss": 0.5207, + "step": 4943 + }, + { + "epoch": 0.5862682319459267, + "grad_norm": 1.4870085530922106, + "learning_rate": 4.823946044898923e-05, + "loss": 0.5525, + "step": 4944 + }, + { + "epoch": 0.5863868137080517, + "grad_norm": 1.341605095592941, + "learning_rate": 4.823857551013372e-05, + "loss": 0.5122, + "step": 4945 + }, + { + "epoch": 0.5865053954701767, + "grad_norm": 1.4482539332360331, + "learning_rate": 4.823769035704704e-05, + "loss": 0.5254, + "step": 4946 + }, + { + "epoch": 0.5866239772323016, + "grad_norm": 1.3644827985664498, + "learning_rate": 4.823680498973734e-05, + "loss": 0.5206, + "step": 4947 + }, + { + "epoch": 0.5867425589944266, + "grad_norm": 1.6178064567117574, + "learning_rate": 4.8235919408212806e-05, + "loss": 0.6114, + "step": 4948 + }, + { + "epoch": 0.5868611407565516, + "grad_norm": 1.819789341404011, + "learning_rate": 4.823503361248158e-05, + "loss": 0.5999, + "step": 4949 + }, + { + "epoch": 0.5869797225186766, + "grad_norm": 1.4902867539268205, + "learning_rate": 4.823414760255184e-05, + "loss": 0.6032, + "step": 4950 + }, + { + "epoch": 0.5870983042808016, + "grad_norm": 1.6149339418010462, + "learning_rate": 4.823326137843175e-05, + "loss": 0.6059, + "step": 4951 + }, + { + "epoch": 0.5872168860429265, + "grad_norm": 1.3815233879204223, + "learning_rate": 4.8232374940129476e-05, + "loss": 0.4664, + "step": 4952 + }, + { + "epoch": 0.5873354678050516, + "grad_norm": 1.6781942553517675, + "learning_rate": 4.82314882876532e-05, + "loss": 0.7298, + "step": 4953 + }, + { + "epoch": 0.5874540495671766, + "grad_norm": 1.4619533068127624, + "learning_rate": 4.823060142101109e-05, + "loss": 0.3982, + "step": 4954 + }, + { + "epoch": 0.5875726313293016, + "grad_norm": 1.7030783033466592, + "learning_rate": 4.822971434021133e-05, + "loss": 0.5932, + "step": 4955 + }, + { + "epoch": 0.5876912130914266, + "grad_norm": 1.6074995225559945, + "learning_rate": 4.822882704526207e-05, + "loss": 0.4573, + "step": 4956 + }, + { + "epoch": 0.5878097948535516, + "grad_norm": 1.4244886061663535, + "learning_rate": 4.822793953617153e-05, + "loss": 0.543, + "step": 4957 + }, + { + "epoch": 0.5879283766156765, + "grad_norm": 1.7175738909380698, + "learning_rate": 4.822705181294786e-05, + "loss": 0.6094, + "step": 4958 + }, + { + "epoch": 0.5880469583778015, + "grad_norm": 1.4467631430827608, + "learning_rate": 4.822616387559926e-05, + "loss": 0.5011, + "step": 4959 + }, + { + "epoch": 0.5881655401399265, + "grad_norm": 1.4514160374905725, + "learning_rate": 4.822527572413391e-05, + "loss": 0.4559, + "step": 4960 + }, + { + "epoch": 0.5882841219020515, + "grad_norm": 1.4473463579580572, + "learning_rate": 4.822438735856e-05, + "loss": 0.4995, + "step": 4961 + }, + { + "epoch": 0.5884027036641765, + "grad_norm": 1.754711607251629, + "learning_rate": 4.822349877888571e-05, + "loss": 0.554, + "step": 4962 + }, + { + "epoch": 0.5885212854263014, + "grad_norm": 1.3504422328642773, + "learning_rate": 4.8222609985119246e-05, + "loss": 0.6406, + "step": 4963 + }, + { + "epoch": 0.5886398671884264, + "grad_norm": 1.5499165521212537, + "learning_rate": 4.822172097726879e-05, + "loss": 0.5446, + "step": 4964 + }, + { + "epoch": 0.5887584489505514, + "grad_norm": 1.592515051568355, + "learning_rate": 4.822083175534255e-05, + "loss": 0.4602, + "step": 4965 + }, + { + "epoch": 0.5888770307126764, + "grad_norm": 1.5556564806774136, + "learning_rate": 4.821994231934872e-05, + "loss": 0.5139, + "step": 4966 + }, + { + "epoch": 0.5889956124748014, + "grad_norm": 1.4603666284415884, + "learning_rate": 4.8219052669295486e-05, + "loss": 0.6581, + "step": 4967 + }, + { + "epoch": 0.5891141942369263, + "grad_norm": 1.7060853053175797, + "learning_rate": 4.821816280519106e-05, + "loss": 0.6731, + "step": 4968 + }, + { + "epoch": 0.5892327759990513, + "grad_norm": 1.4016686350949437, + "learning_rate": 4.821727272704365e-05, + "loss": 0.505, + "step": 4969 + }, + { + "epoch": 0.5893513577611763, + "grad_norm": 1.7723583783415964, + "learning_rate": 4.821638243486145e-05, + "loss": 0.7017, + "step": 4970 + }, + { + "epoch": 0.5894699395233013, + "grad_norm": 1.2811194732013829, + "learning_rate": 4.8215491928652684e-05, + "loss": 0.5106, + "step": 4971 + }, + { + "epoch": 0.5895885212854263, + "grad_norm": 1.322377238091314, + "learning_rate": 4.821460120842555e-05, + "loss": 0.5125, + "step": 4972 + }, + { + "epoch": 0.5897071030475513, + "grad_norm": 1.139025068738447, + "learning_rate": 4.821371027418825e-05, + "loss": 0.4502, + "step": 4973 + }, + { + "epoch": 0.5898256848096762, + "grad_norm": 1.314113897093728, + "learning_rate": 4.8212819125949014e-05, + "loss": 0.5854, + "step": 4974 + }, + { + "epoch": 0.5899442665718012, + "grad_norm": 1.33884531763015, + "learning_rate": 4.8211927763716055e-05, + "loss": 0.5843, + "step": 4975 + }, + { + "epoch": 0.5900628483339262, + "grad_norm": 1.4795076650833305, + "learning_rate": 4.821103618749758e-05, + "loss": 0.677, + "step": 4976 + }, + { + "epoch": 0.5901814300960512, + "grad_norm": 1.288852601890635, + "learning_rate": 4.8210144397301815e-05, + "loss": 0.5311, + "step": 4977 + }, + { + "epoch": 0.5903000118581763, + "grad_norm": 1.0097546511317559, + "learning_rate": 4.820925239313698e-05, + "loss": 0.3021, + "step": 4978 + }, + { + "epoch": 0.5904185936203012, + "grad_norm": 1.1988651343845003, + "learning_rate": 4.8208360175011306e-05, + "loss": 0.5135, + "step": 4979 + }, + { + "epoch": 0.5905371753824262, + "grad_norm": 1.356345268493025, + "learning_rate": 4.820746774293301e-05, + "loss": 0.5099, + "step": 4980 + }, + { + "epoch": 0.5906557571445512, + "grad_norm": 1.407120066057279, + "learning_rate": 4.820657509691032e-05, + "loss": 0.5495, + "step": 4981 + }, + { + "epoch": 0.5907743389066762, + "grad_norm": 1.365321287874761, + "learning_rate": 4.820568223695146e-05, + "loss": 0.5503, + "step": 4982 + }, + { + "epoch": 0.5908929206688012, + "grad_norm": 1.3491492936160716, + "learning_rate": 4.820478916306467e-05, + "loss": 0.4661, + "step": 4983 + }, + { + "epoch": 0.5910115024309261, + "grad_norm": 1.5255851747424745, + "learning_rate": 4.820389587525817e-05, + "loss": 0.6205, + "step": 4984 + }, + { + "epoch": 0.5911300841930511, + "grad_norm": 1.456832334799871, + "learning_rate": 4.820300237354022e-05, + "loss": 0.5399, + "step": 4985 + }, + { + "epoch": 0.5912486659551761, + "grad_norm": 1.535963031926081, + "learning_rate": 4.820210865791904e-05, + "loss": 0.5216, + "step": 4986 + }, + { + "epoch": 0.5913672477173011, + "grad_norm": 2.0268605010777496, + "learning_rate": 4.820121472840286e-05, + "loss": 0.8168, + "step": 4987 + }, + { + "epoch": 0.5914858294794261, + "grad_norm": 1.589261815186201, + "learning_rate": 4.820032058499994e-05, + "loss": 0.7101, + "step": 4988 + }, + { + "epoch": 0.591604411241551, + "grad_norm": 1.8683914871233251, + "learning_rate": 4.819942622771851e-05, + "loss": 0.7882, + "step": 4989 + }, + { + "epoch": 0.591722993003676, + "grad_norm": 1.8113528801407002, + "learning_rate": 4.819853165656682e-05, + "loss": 0.4888, + "step": 4990 + }, + { + "epoch": 0.591841574765801, + "grad_norm": 1.6207678613892398, + "learning_rate": 4.8197636871553114e-05, + "loss": 0.5498, + "step": 4991 + }, + { + "epoch": 0.591960156527926, + "grad_norm": 1.590397159740747, + "learning_rate": 4.819674187268565e-05, + "loss": 0.6903, + "step": 4992 + }, + { + "epoch": 0.592078738290051, + "grad_norm": 1.2108344148713195, + "learning_rate": 4.819584665997266e-05, + "loss": 0.3907, + "step": 4993 + }, + { + "epoch": 0.592197320052176, + "grad_norm": 1.6242475188544707, + "learning_rate": 4.819495123342242e-05, + "loss": 0.6247, + "step": 4994 + }, + { + "epoch": 0.5923159018143009, + "grad_norm": 1.4751523860161722, + "learning_rate": 4.819405559304316e-05, + "loss": 0.5935, + "step": 4995 + }, + { + "epoch": 0.5924344835764259, + "grad_norm": 1.5586573031421582, + "learning_rate": 4.819315973884316e-05, + "loss": 0.6489, + "step": 4996 + }, + { + "epoch": 0.5925530653385509, + "grad_norm": 1.3447268438941193, + "learning_rate": 4.8192263670830675e-05, + "loss": 0.4068, + "step": 4997 + }, + { + "epoch": 0.5926716471006759, + "grad_norm": 1.5205767686251805, + "learning_rate": 4.8191367389013946e-05, + "loss": 0.4971, + "step": 4998 + }, + { + "epoch": 0.5927902288628009, + "grad_norm": 1.036501958788195, + "learning_rate": 4.8190470893401255e-05, + "loss": 0.293, + "step": 4999 + }, + { + "epoch": 0.5929088106249258, + "grad_norm": 1.304283859209261, + "learning_rate": 4.8189574184000865e-05, + "loss": 0.4701, + "step": 5000 + }, + { + "epoch": 0.5930273923870508, + "grad_norm": 1.5925884145244469, + "learning_rate": 4.8188677260821027e-05, + "loss": 0.5358, + "step": 5001 + }, + { + "epoch": 0.5931459741491758, + "grad_norm": 1.3731194112450575, + "learning_rate": 4.818778012387002e-05, + "loss": 0.4784, + "step": 5002 + }, + { + "epoch": 0.5932645559113009, + "grad_norm": 1.655299439064095, + "learning_rate": 4.8186882773156125e-05, + "loss": 0.5839, + "step": 5003 + }, + { + "epoch": 0.5933831376734259, + "grad_norm": 1.381934959238323, + "learning_rate": 4.81859852086876e-05, + "loss": 0.4568, + "step": 5004 + }, + { + "epoch": 0.5935017194355509, + "grad_norm": 1.6590105470167593, + "learning_rate": 4.8185087430472727e-05, + "loss": 0.5548, + "step": 5005 + }, + { + "epoch": 0.5936203011976758, + "grad_norm": 2.009023871064922, + "learning_rate": 4.818418943851978e-05, + "loss": 0.6168, + "step": 5006 + }, + { + "epoch": 0.5937388829598008, + "grad_norm": 2.0528108940107805, + "learning_rate": 4.818329123283702e-05, + "loss": 0.6673, + "step": 5007 + }, + { + "epoch": 0.5938574647219258, + "grad_norm": 1.4913889646436398, + "learning_rate": 4.818239281343276e-05, + "loss": 0.48, + "step": 5008 + }, + { + "epoch": 0.5939760464840508, + "grad_norm": 2.0361704625406727, + "learning_rate": 4.8181494180315266e-05, + "loss": 0.7281, + "step": 5009 + }, + { + "epoch": 0.5940946282461758, + "grad_norm": 1.0951507274817396, + "learning_rate": 4.818059533349282e-05, + "loss": 0.3338, + "step": 5010 + }, + { + "epoch": 0.5942132100083007, + "grad_norm": 1.4371995806300184, + "learning_rate": 4.817969627297372e-05, + "loss": 0.7081, + "step": 5011 + }, + { + "epoch": 0.5943317917704257, + "grad_norm": 1.3951470256183967, + "learning_rate": 4.817879699876623e-05, + "loss": 0.5001, + "step": 5012 + }, + { + "epoch": 0.5944503735325507, + "grad_norm": 1.2482106993029958, + "learning_rate": 4.817789751087866e-05, + "loss": 0.3636, + "step": 5013 + }, + { + "epoch": 0.5945689552946757, + "grad_norm": 1.2272823607269867, + "learning_rate": 4.81769978093193e-05, + "loss": 0.4199, + "step": 5014 + }, + { + "epoch": 0.5946875370568007, + "grad_norm": 1.4579172828149722, + "learning_rate": 4.817609789409644e-05, + "loss": 0.3917, + "step": 5015 + }, + { + "epoch": 0.5948061188189256, + "grad_norm": 1.350376592082812, + "learning_rate": 4.8175197765218385e-05, + "loss": 0.5415, + "step": 5016 + }, + { + "epoch": 0.5949247005810506, + "grad_norm": 1.147667523341512, + "learning_rate": 4.817429742269342e-05, + "loss": 0.3403, + "step": 5017 + }, + { + "epoch": 0.5950432823431756, + "grad_norm": 1.5236424173203509, + "learning_rate": 4.817339686652985e-05, + "loss": 0.5902, + "step": 5018 + }, + { + "epoch": 0.5951618641053006, + "grad_norm": 1.6132229551059678, + "learning_rate": 4.8172496096735976e-05, + "loss": 0.5698, + "step": 5019 + }, + { + "epoch": 0.5952804458674256, + "grad_norm": 1.5071274358419369, + "learning_rate": 4.8171595113320104e-05, + "loss": 0.4969, + "step": 5020 + }, + { + "epoch": 0.5953990276295505, + "grad_norm": 1.7766606087607815, + "learning_rate": 4.8170693916290546e-05, + "loss": 0.5888, + "step": 5021 + }, + { + "epoch": 0.5955176093916755, + "grad_norm": 1.6802536101754872, + "learning_rate": 4.8169792505655595e-05, + "loss": 0.5, + "step": 5022 + }, + { + "epoch": 0.5956361911538005, + "grad_norm": 1.7475861796650667, + "learning_rate": 4.816889088142358e-05, + "loss": 0.7127, + "step": 5023 + }, + { + "epoch": 0.5957547729159255, + "grad_norm": 1.2816485424257849, + "learning_rate": 4.8167989043602797e-05, + "loss": 0.4385, + "step": 5024 + }, + { + "epoch": 0.5958733546780505, + "grad_norm": 1.6013429040675669, + "learning_rate": 4.8167086992201564e-05, + "loss": 0.4201, + "step": 5025 + }, + { + "epoch": 0.5959919364401755, + "grad_norm": 2.0827835955798863, + "learning_rate": 4.81661847272282e-05, + "loss": 0.7108, + "step": 5026 + }, + { + "epoch": 0.5961105182023004, + "grad_norm": 1.7877122123699725, + "learning_rate": 4.816528224869102e-05, + "loss": 0.7152, + "step": 5027 + }, + { + "epoch": 0.5962290999644255, + "grad_norm": 1.1697880232255828, + "learning_rate": 4.816437955659835e-05, + "loss": 0.376, + "step": 5028 + }, + { + "epoch": 0.5963476817265505, + "grad_norm": 1.5406976529876533, + "learning_rate": 4.81634766509585e-05, + "loss": 0.621, + "step": 5029 + }, + { + "epoch": 0.5964662634886755, + "grad_norm": 1.3486942314516916, + "learning_rate": 4.816257353177981e-05, + "loss": 0.3697, + "step": 5030 + }, + { + "epoch": 0.5965848452508005, + "grad_norm": 1.395307489759187, + "learning_rate": 4.8161670199070586e-05, + "loss": 0.4806, + "step": 5031 + }, + { + "epoch": 0.5967034270129254, + "grad_norm": 1.599650537995054, + "learning_rate": 4.816076665283917e-05, + "loss": 0.5306, + "step": 5032 + }, + { + "epoch": 0.5968220087750504, + "grad_norm": 1.1676089629735835, + "learning_rate": 4.815986289309389e-05, + "loss": 0.3826, + "step": 5033 + }, + { + "epoch": 0.5969405905371754, + "grad_norm": 1.1269684152675676, + "learning_rate": 4.815895891984307e-05, + "loss": 0.4442, + "step": 5034 + }, + { + "epoch": 0.5970591722993004, + "grad_norm": 1.4461293942964697, + "learning_rate": 4.8158054733095045e-05, + "loss": 0.535, + "step": 5035 + }, + { + "epoch": 0.5971777540614254, + "grad_norm": 2.112066157513325, + "learning_rate": 4.815715033285816e-05, + "loss": 0.7253, + "step": 5036 + }, + { + "epoch": 0.5972963358235504, + "grad_norm": 1.8450571824618847, + "learning_rate": 4.815624571914074e-05, + "loss": 0.7515, + "step": 5037 + }, + { + "epoch": 0.5974149175856753, + "grad_norm": 1.6674886579166792, + "learning_rate": 4.815534089195113e-05, + "loss": 0.562, + "step": 5038 + }, + { + "epoch": 0.5975334993478003, + "grad_norm": 1.1630090547709313, + "learning_rate": 4.8154435851297673e-05, + "loss": 0.4729, + "step": 5039 + }, + { + "epoch": 0.5976520811099253, + "grad_norm": 1.7221406184015344, + "learning_rate": 4.815353059718872e-05, + "loss": 0.5069, + "step": 5040 + }, + { + "epoch": 0.5977706628720503, + "grad_norm": 1.202078816308715, + "learning_rate": 4.81526251296326e-05, + "loss": 0.3556, + "step": 5041 + }, + { + "epoch": 0.5978892446341753, + "grad_norm": 1.3701079388096837, + "learning_rate": 4.815171944863767e-05, + "loss": 0.4533, + "step": 5042 + }, + { + "epoch": 0.5980078263963002, + "grad_norm": 1.3295549401634659, + "learning_rate": 4.815081355421228e-05, + "loss": 0.4942, + "step": 5043 + }, + { + "epoch": 0.5981264081584252, + "grad_norm": 1.5257462490018319, + "learning_rate": 4.814990744636477e-05, + "loss": 0.6476, + "step": 5044 + }, + { + "epoch": 0.5982449899205502, + "grad_norm": 1.8060255446536868, + "learning_rate": 4.814900112510351e-05, + "loss": 0.6456, + "step": 5045 + }, + { + "epoch": 0.5983635716826752, + "grad_norm": 1.5143404463942465, + "learning_rate": 4.814809459043684e-05, + "loss": 0.4752, + "step": 5046 + }, + { + "epoch": 0.5984821534448002, + "grad_norm": 1.6755462619347488, + "learning_rate": 4.814718784237313e-05, + "loss": 0.7621, + "step": 5047 + }, + { + "epoch": 0.5986007352069251, + "grad_norm": 1.2205560495921137, + "learning_rate": 4.814628088092073e-05, + "loss": 0.3825, + "step": 5048 + }, + { + "epoch": 0.5987193169690501, + "grad_norm": 1.5061517408543588, + "learning_rate": 4.8145373706088e-05, + "loss": 0.4932, + "step": 5049 + }, + { + "epoch": 0.5988378987311751, + "grad_norm": 1.3708880758975066, + "learning_rate": 4.814446631788332e-05, + "loss": 0.5724, + "step": 5050 + }, + { + "epoch": 0.5989564804933001, + "grad_norm": 1.6305563377604562, + "learning_rate": 4.814355871631503e-05, + "loss": 0.5918, + "step": 5051 + }, + { + "epoch": 0.5990750622554251, + "grad_norm": 1.523597534166945, + "learning_rate": 4.8142650901391516e-05, + "loss": 0.5747, + "step": 5052 + }, + { + "epoch": 0.5991936440175502, + "grad_norm": 1.6243422643459395, + "learning_rate": 4.814174287312113e-05, + "loss": 0.7525, + "step": 5053 + }, + { + "epoch": 0.5993122257796751, + "grad_norm": 1.368847099186889, + "learning_rate": 4.814083463151227e-05, + "loss": 0.5065, + "step": 5054 + }, + { + "epoch": 0.5994308075418001, + "grad_norm": 1.3765580247103222, + "learning_rate": 4.813992617657328e-05, + "loss": 0.3931, + "step": 5055 + }, + { + "epoch": 0.5995493893039251, + "grad_norm": 1.4335668156992916, + "learning_rate": 4.813901750831254e-05, + "loss": 0.5606, + "step": 5056 + }, + { + "epoch": 0.5996679710660501, + "grad_norm": 1.207092782522536, + "learning_rate": 4.813810862673845e-05, + "loss": 0.4633, + "step": 5057 + }, + { + "epoch": 0.5997865528281751, + "grad_norm": 1.586137675844984, + "learning_rate": 4.813719953185937e-05, + "loss": 0.6844, + "step": 5058 + }, + { + "epoch": 0.5999051345903, + "grad_norm": 2.043314230762433, + "learning_rate": 4.813629022368367e-05, + "loss": 0.5798, + "step": 5059 + }, + { + "epoch": 0.600023716352425, + "grad_norm": 1.4685239147452585, + "learning_rate": 4.8135380702219755e-05, + "loss": 0.5759, + "step": 5060 + }, + { + "epoch": 0.60014229811455, + "grad_norm": 1.534315762735501, + "learning_rate": 4.8134470967476e-05, + "loss": 0.482, + "step": 5061 + }, + { + "epoch": 0.600260879876675, + "grad_norm": 2.014959559902432, + "learning_rate": 4.8133561019460795e-05, + "loss": 0.6582, + "step": 5062 + }, + { + "epoch": 0.6003794616388, + "grad_norm": 1.1957021034173538, + "learning_rate": 4.8132650858182514e-05, + "loss": 0.3154, + "step": 5063 + }, + { + "epoch": 0.6004980434009249, + "grad_norm": 1.280188073057867, + "learning_rate": 4.8131740483649567e-05, + "loss": 0.4791, + "step": 5064 + }, + { + "epoch": 0.6006166251630499, + "grad_norm": 2.084445902131785, + "learning_rate": 4.8130829895870335e-05, + "loss": 0.646, + "step": 5065 + }, + { + "epoch": 0.6007352069251749, + "grad_norm": 1.4665718207375038, + "learning_rate": 4.812991909485323e-05, + "loss": 0.5222, + "step": 5066 + }, + { + "epoch": 0.6008537886872999, + "grad_norm": 1.5475381866587354, + "learning_rate": 4.812900808060662e-05, + "loss": 0.5606, + "step": 5067 + }, + { + "epoch": 0.6009723704494249, + "grad_norm": 1.7782790579332706, + "learning_rate": 4.812809685313892e-05, + "loss": 0.6577, + "step": 5068 + }, + { + "epoch": 0.6010909522115498, + "grad_norm": 1.5189955314164496, + "learning_rate": 4.812718541245853e-05, + "loss": 0.6553, + "step": 5069 + }, + { + "epoch": 0.6012095339736748, + "grad_norm": 1.446299832130682, + "learning_rate": 4.8126273758573854e-05, + "loss": 0.5396, + "step": 5070 + }, + { + "epoch": 0.6013281157357998, + "grad_norm": 1.803972743755853, + "learning_rate": 4.812536189149329e-05, + "loss": 0.6776, + "step": 5071 + }, + { + "epoch": 0.6014466974979248, + "grad_norm": 2.034633867002782, + "learning_rate": 4.812444981122525e-05, + "loss": 0.6492, + "step": 5072 + }, + { + "epoch": 0.6015652792600498, + "grad_norm": 1.3264444989925404, + "learning_rate": 4.812353751777814e-05, + "loss": 0.4247, + "step": 5073 + }, + { + "epoch": 0.6016838610221747, + "grad_norm": 1.170876866577602, + "learning_rate": 4.812262501116037e-05, + "loss": 0.4952, + "step": 5074 + }, + { + "epoch": 0.6018024427842997, + "grad_norm": 1.3611782355399467, + "learning_rate": 4.812171229138035e-05, + "loss": 0.5397, + "step": 5075 + }, + { + "epoch": 0.6019210245464247, + "grad_norm": 1.454323748334674, + "learning_rate": 4.81207993584465e-05, + "loss": 0.5909, + "step": 5076 + }, + { + "epoch": 0.6020396063085497, + "grad_norm": 1.4979392136927356, + "learning_rate": 4.811988621236723e-05, + "loss": 0.5057, + "step": 5077 + }, + { + "epoch": 0.6021581880706748, + "grad_norm": 1.2290751377485638, + "learning_rate": 4.811897285315096e-05, + "loss": 0.4264, + "step": 5078 + }, + { + "epoch": 0.6022767698327998, + "grad_norm": 1.3270491265591648, + "learning_rate": 4.81180592808061e-05, + "loss": 0.5107, + "step": 5079 + }, + { + "epoch": 0.6023953515949247, + "grad_norm": 1.2720075368407704, + "learning_rate": 4.81171454953411e-05, + "loss": 0.4438, + "step": 5080 + }, + { + "epoch": 0.6025139333570497, + "grad_norm": 1.4428491984928162, + "learning_rate": 4.811623149676436e-05, + "loss": 0.6836, + "step": 5081 + }, + { + "epoch": 0.6026325151191747, + "grad_norm": 1.3771981484159757, + "learning_rate": 4.811531728508432e-05, + "loss": 0.4773, + "step": 5082 + }, + { + "epoch": 0.6027510968812997, + "grad_norm": 1.5259425503507265, + "learning_rate": 4.8114402860309396e-05, + "loss": 0.41, + "step": 5083 + }, + { + "epoch": 0.6028696786434247, + "grad_norm": 1.255963532930276, + "learning_rate": 4.811348822244801e-05, + "loss": 0.3973, + "step": 5084 + }, + { + "epoch": 0.6029882604055496, + "grad_norm": 1.463816366164392, + "learning_rate": 4.811257337150862e-05, + "loss": 0.4047, + "step": 5085 + }, + { + "epoch": 0.6031068421676746, + "grad_norm": 1.322751692876159, + "learning_rate": 4.811165830749964e-05, + "loss": 0.4266, + "step": 5086 + }, + { + "epoch": 0.6032254239297996, + "grad_norm": 2.283883748747858, + "learning_rate": 4.8110743030429514e-05, + "loss": 0.7769, + "step": 5087 + }, + { + "epoch": 0.6033440056919246, + "grad_norm": 2.090669614735181, + "learning_rate": 4.8109827540306676e-05, + "loss": 0.5406, + "step": 5088 + }, + { + "epoch": 0.6034625874540496, + "grad_norm": 1.845831134186188, + "learning_rate": 4.810891183713957e-05, + "loss": 0.5868, + "step": 5089 + }, + { + "epoch": 0.6035811692161746, + "grad_norm": 1.7525573940139887, + "learning_rate": 4.8107995920936633e-05, + "loss": 0.5753, + "step": 5090 + }, + { + "epoch": 0.6036997509782995, + "grad_norm": 1.6485296637588127, + "learning_rate": 4.810707979170631e-05, + "loss": 0.5443, + "step": 5091 + }, + { + "epoch": 0.6038183327404245, + "grad_norm": 1.3287627646672227, + "learning_rate": 4.8106163449457057e-05, + "loss": 0.4508, + "step": 5092 + }, + { + "epoch": 0.6039369145025495, + "grad_norm": 1.7404530258525939, + "learning_rate": 4.81052468941973e-05, + "loss": 0.5225, + "step": 5093 + }, + { + "epoch": 0.6040554962646745, + "grad_norm": 1.6955720314758524, + "learning_rate": 4.81043301259355e-05, + "loss": 0.8031, + "step": 5094 + }, + { + "epoch": 0.6041740780267995, + "grad_norm": 1.248630116551128, + "learning_rate": 4.810341314468011e-05, + "loss": 0.3369, + "step": 5095 + }, + { + "epoch": 0.6042926597889244, + "grad_norm": 1.3354499394473458, + "learning_rate": 4.8102495950439585e-05, + "loss": 0.4917, + "step": 5096 + }, + { + "epoch": 0.6044112415510494, + "grad_norm": 1.705364132502535, + "learning_rate": 4.8101578543222375e-05, + "loss": 0.7415, + "step": 5097 + }, + { + "epoch": 0.6045298233131744, + "grad_norm": 1.2119152177979147, + "learning_rate": 4.8100660923036945e-05, + "loss": 0.5734, + "step": 5098 + }, + { + "epoch": 0.6046484050752994, + "grad_norm": 1.3310192005446184, + "learning_rate": 4.809974308989174e-05, + "loss": 0.5632, + "step": 5099 + }, + { + "epoch": 0.6047669868374244, + "grad_norm": 1.2851467890658483, + "learning_rate": 4.809882504379524e-05, + "loss": 0.4588, + "step": 5100 + }, + { + "epoch": 0.6048855685995493, + "grad_norm": 1.1475512823446283, + "learning_rate": 4.809790678475589e-05, + "loss": 0.4402, + "step": 5101 + }, + { + "epoch": 0.6050041503616743, + "grad_norm": 1.6553213736038617, + "learning_rate": 4.8096988312782174e-05, + "loss": 0.6156, + "step": 5102 + }, + { + "epoch": 0.6051227321237994, + "grad_norm": 1.5164183733783878, + "learning_rate": 4.809606962788254e-05, + "loss": 0.5389, + "step": 5103 + }, + { + "epoch": 0.6052413138859244, + "grad_norm": 1.4729074338556065, + "learning_rate": 4.809515073006547e-05, + "loss": 0.5824, + "step": 5104 + }, + { + "epoch": 0.6053598956480494, + "grad_norm": 1.2646764134269506, + "learning_rate": 4.8094231619339435e-05, + "loss": 0.4538, + "step": 5105 + }, + { + "epoch": 0.6054784774101744, + "grad_norm": 1.5081382738102669, + "learning_rate": 4.80933122957129e-05, + "loss": 0.4854, + "step": 5106 + }, + { + "epoch": 0.6055970591722993, + "grad_norm": 1.2250886490878297, + "learning_rate": 4.809239275919434e-05, + "loss": 0.4201, + "step": 5107 + }, + { + "epoch": 0.6057156409344243, + "grad_norm": 1.3157920222591222, + "learning_rate": 4.809147300979224e-05, + "loss": 0.4371, + "step": 5108 + }, + { + "epoch": 0.6058342226965493, + "grad_norm": 1.3092736601165913, + "learning_rate": 4.809055304751509e-05, + "loss": 0.4836, + "step": 5109 + }, + { + "epoch": 0.6059528044586743, + "grad_norm": 1.3869044162792492, + "learning_rate": 4.808963287237134e-05, + "loss": 0.6041, + "step": 5110 + }, + { + "epoch": 0.6060713862207993, + "grad_norm": 1.6228777284260985, + "learning_rate": 4.8088712484369494e-05, + "loss": 0.4997, + "step": 5111 + }, + { + "epoch": 0.6061899679829242, + "grad_norm": 1.4427988729910017, + "learning_rate": 4.808779188351803e-05, + "loss": 0.5446, + "step": 5112 + }, + { + "epoch": 0.6063085497450492, + "grad_norm": 2.227934020482448, + "learning_rate": 4.8086871069825435e-05, + "loss": 0.8166, + "step": 5113 + }, + { + "epoch": 0.6064271315071742, + "grad_norm": 1.6808964006028664, + "learning_rate": 4.80859500433002e-05, + "loss": 0.6543, + "step": 5114 + }, + { + "epoch": 0.6065457132692992, + "grad_norm": 1.4034665807392264, + "learning_rate": 4.808502880395082e-05, + "loss": 0.5773, + "step": 5115 + }, + { + "epoch": 0.6066642950314242, + "grad_norm": 1.5314651436677196, + "learning_rate": 4.808410735178578e-05, + "loss": 0.621, + "step": 5116 + }, + { + "epoch": 0.6067828767935491, + "grad_norm": 1.3412577030774167, + "learning_rate": 4.808318568681358e-05, + "loss": 0.4374, + "step": 5117 + }, + { + "epoch": 0.6069014585556741, + "grad_norm": 1.200046615929627, + "learning_rate": 4.808226380904272e-05, + "loss": 0.4401, + "step": 5118 + }, + { + "epoch": 0.6070200403177991, + "grad_norm": 1.3545963892312394, + "learning_rate": 4.808134171848168e-05, + "loss": 0.4434, + "step": 5119 + }, + { + "epoch": 0.6071386220799241, + "grad_norm": 1.306121381545117, + "learning_rate": 4.8080419415138986e-05, + "loss": 0.3969, + "step": 5120 + }, + { + "epoch": 0.6072572038420491, + "grad_norm": 1.3590499409311205, + "learning_rate": 4.807949689902312e-05, + "loss": 0.4967, + "step": 5121 + }, + { + "epoch": 0.607375785604174, + "grad_norm": 1.679793443183019, + "learning_rate": 4.80785741701426e-05, + "loss": 0.406, + "step": 5122 + }, + { + "epoch": 0.607494367366299, + "grad_norm": 1.6556580616080594, + "learning_rate": 4.807765122850592e-05, + "loss": 0.4159, + "step": 5123 + }, + { + "epoch": 0.607612949128424, + "grad_norm": 1.2735924703557584, + "learning_rate": 4.80767280741216e-05, + "loss": 0.4884, + "step": 5124 + }, + { + "epoch": 0.607731530890549, + "grad_norm": 1.7318688957363033, + "learning_rate": 4.807580470699814e-05, + "loss": 0.6042, + "step": 5125 + }, + { + "epoch": 0.607850112652674, + "grad_norm": 1.4601316998654799, + "learning_rate": 4.807488112714407e-05, + "loss": 0.5996, + "step": 5126 + }, + { + "epoch": 0.607968694414799, + "grad_norm": 1.5438255111418846, + "learning_rate": 4.807395733456788e-05, + "loss": 0.5591, + "step": 5127 + }, + { + "epoch": 0.608087276176924, + "grad_norm": 1.2824193444668521, + "learning_rate": 4.8073033329278104e-05, + "loss": 0.4232, + "step": 5128 + }, + { + "epoch": 0.608205857939049, + "grad_norm": 1.4757143740395897, + "learning_rate": 4.807210911128326e-05, + "loss": 0.5815, + "step": 5129 + }, + { + "epoch": 0.608324439701174, + "grad_norm": 1.5958787077249146, + "learning_rate": 4.807118468059185e-05, + "loss": 0.4734, + "step": 5130 + }, + { + "epoch": 0.608443021463299, + "grad_norm": 2.0296200293057987, + "learning_rate": 4.8070260037212424e-05, + "loss": 0.6734, + "step": 5131 + }, + { + "epoch": 0.608561603225424, + "grad_norm": 1.6131794578410847, + "learning_rate": 4.806933518115348e-05, + "loss": 0.6257, + "step": 5132 + }, + { + "epoch": 0.608680184987549, + "grad_norm": 1.2646315449483865, + "learning_rate": 4.8068410112423555e-05, + "loss": 0.4488, + "step": 5133 + }, + { + "epoch": 0.6087987667496739, + "grad_norm": 1.6432191514603876, + "learning_rate": 4.806748483103118e-05, + "loss": 0.51, + "step": 5134 + }, + { + "epoch": 0.6089173485117989, + "grad_norm": 1.6247178936227242, + "learning_rate": 4.806655933698488e-05, + "loss": 0.5586, + "step": 5135 + }, + { + "epoch": 0.6090359302739239, + "grad_norm": 1.495842626160169, + "learning_rate": 4.80656336302932e-05, + "loss": 0.5205, + "step": 5136 + }, + { + "epoch": 0.6091545120360489, + "grad_norm": 1.6563708630979666, + "learning_rate": 4.806470771096465e-05, + "loss": 0.6364, + "step": 5137 + }, + { + "epoch": 0.6092730937981738, + "grad_norm": 1.1918572658453652, + "learning_rate": 4.806378157900779e-05, + "loss": 0.4134, + "step": 5138 + }, + { + "epoch": 0.6093916755602988, + "grad_norm": 1.823178871433116, + "learning_rate": 4.8062855234431134e-05, + "loss": 0.5548, + "step": 5139 + }, + { + "epoch": 0.6095102573224238, + "grad_norm": 1.6771415979459658, + "learning_rate": 4.806192867724324e-05, + "loss": 0.6072, + "step": 5140 + }, + { + "epoch": 0.6096288390845488, + "grad_norm": 1.4395101602316298, + "learning_rate": 4.806100190745265e-05, + "loss": 0.5975, + "step": 5141 + }, + { + "epoch": 0.6097474208466738, + "grad_norm": 1.2113407184956972, + "learning_rate": 4.80600749250679e-05, + "loss": 0.4883, + "step": 5142 + }, + { + "epoch": 0.6098660026087988, + "grad_norm": 1.2680769663525484, + "learning_rate": 4.805914773009753e-05, + "loss": 0.5614, + "step": 5143 + }, + { + "epoch": 0.6099845843709237, + "grad_norm": 1.3358051694707977, + "learning_rate": 4.8058220322550094e-05, + "loss": 0.4739, + "step": 5144 + }, + { + "epoch": 0.6101031661330487, + "grad_norm": 1.5752538821877609, + "learning_rate": 4.8057292702434147e-05, + "loss": 0.5615, + "step": 5145 + }, + { + "epoch": 0.6102217478951737, + "grad_norm": 1.2554048610845336, + "learning_rate": 4.8056364869758236e-05, + "loss": 0.5755, + "step": 5146 + }, + { + "epoch": 0.6103403296572987, + "grad_norm": 1.5992112813743595, + "learning_rate": 4.805543682453091e-05, + "loss": 0.5736, + "step": 5147 + }, + { + "epoch": 0.6104589114194237, + "grad_norm": 1.3776964998674834, + "learning_rate": 4.805450856676074e-05, + "loss": 0.5165, + "step": 5148 + }, + { + "epoch": 0.6105774931815486, + "grad_norm": 1.8055149877469596, + "learning_rate": 4.8053580096456265e-05, + "loss": 0.6152, + "step": 5149 + }, + { + "epoch": 0.6106960749436736, + "grad_norm": 1.3763332249235518, + "learning_rate": 4.805265141362605e-05, + "loss": 0.5127, + "step": 5150 + }, + { + "epoch": 0.6108146567057986, + "grad_norm": 1.4485612680719788, + "learning_rate": 4.8051722518278664e-05, + "loss": 0.5133, + "step": 5151 + }, + { + "epoch": 0.6109332384679236, + "grad_norm": 1.6429563401720764, + "learning_rate": 4.805079341042266e-05, + "loss": 0.5892, + "step": 5152 + }, + { + "epoch": 0.6110518202300487, + "grad_norm": 1.3003741727557958, + "learning_rate": 4.80498640900666e-05, + "loss": 0.3985, + "step": 5153 + }, + { + "epoch": 0.6111704019921737, + "grad_norm": 1.4314448943887212, + "learning_rate": 4.804893455721907e-05, + "loss": 0.6161, + "step": 5154 + }, + { + "epoch": 0.6112889837542986, + "grad_norm": 1.4687152518408269, + "learning_rate": 4.8048004811888624e-05, + "loss": 0.5328, + "step": 5155 + }, + { + "epoch": 0.6114075655164236, + "grad_norm": 1.5951057997564002, + "learning_rate": 4.804707485408384e-05, + "loss": 0.6635, + "step": 5156 + }, + { + "epoch": 0.6115261472785486, + "grad_norm": 1.5184467909990438, + "learning_rate": 4.804614468381329e-05, + "loss": 0.7319, + "step": 5157 + }, + { + "epoch": 0.6116447290406736, + "grad_norm": 1.2006157893956546, + "learning_rate": 4.804521430108554e-05, + "loss": 0.4571, + "step": 5158 + }, + { + "epoch": 0.6117633108027986, + "grad_norm": 1.4682327984056156, + "learning_rate": 4.804428370590918e-05, + "loss": 0.4983, + "step": 5159 + }, + { + "epoch": 0.6118818925649235, + "grad_norm": 1.4388321601640484, + "learning_rate": 4.804335289829277e-05, + "loss": 0.4815, + "step": 5160 + }, + { + "epoch": 0.6120004743270485, + "grad_norm": 1.68211222151509, + "learning_rate": 4.8042421878244915e-05, + "loss": 0.5805, + "step": 5161 + }, + { + "epoch": 0.6121190560891735, + "grad_norm": 1.8390806269762374, + "learning_rate": 4.804149064577419e-05, + "loss": 0.7274, + "step": 5162 + }, + { + "epoch": 0.6122376378512985, + "grad_norm": 1.2740463950572787, + "learning_rate": 4.804055920088917e-05, + "loss": 0.5688, + "step": 5163 + }, + { + "epoch": 0.6123562196134235, + "grad_norm": 1.564295333640591, + "learning_rate": 4.8039627543598444e-05, + "loss": 0.5073, + "step": 5164 + }, + { + "epoch": 0.6124748013755484, + "grad_norm": 1.4010758169918685, + "learning_rate": 4.803869567391062e-05, + "loss": 0.5881, + "step": 5165 + }, + { + "epoch": 0.6125933831376734, + "grad_norm": 1.7071053775573315, + "learning_rate": 4.803776359183426e-05, + "loss": 0.6914, + "step": 5166 + }, + { + "epoch": 0.6127119648997984, + "grad_norm": 2.054779043526917, + "learning_rate": 4.803683129737798e-05, + "loss": 0.6838, + "step": 5167 + }, + { + "epoch": 0.6128305466619234, + "grad_norm": 1.4190222045602578, + "learning_rate": 4.803589879055036e-05, + "loss": 0.6484, + "step": 5168 + }, + { + "epoch": 0.6129491284240484, + "grad_norm": 1.3618126331403309, + "learning_rate": 4.803496607136e-05, + "loss": 0.495, + "step": 5169 + }, + { + "epoch": 0.6130677101861733, + "grad_norm": 1.1509822957298423, + "learning_rate": 4.8034033139815505e-05, + "loss": 0.5288, + "step": 5170 + }, + { + "epoch": 0.6131862919482983, + "grad_norm": 1.1875282847795874, + "learning_rate": 4.8033099995925466e-05, + "loss": 0.4257, + "step": 5171 + }, + { + "epoch": 0.6133048737104233, + "grad_norm": 1.3974707128650843, + "learning_rate": 4.803216663969849e-05, + "loss": 0.6168, + "step": 5172 + }, + { + "epoch": 0.6134234554725483, + "grad_norm": 1.3375323310327434, + "learning_rate": 4.8031233071143184e-05, + "loss": 0.4502, + "step": 5173 + }, + { + "epoch": 0.6135420372346733, + "grad_norm": 1.4539077153064321, + "learning_rate": 4.803029929026815e-05, + "loss": 0.613, + "step": 5174 + }, + { + "epoch": 0.6136606189967982, + "grad_norm": 1.3237519671060893, + "learning_rate": 4.8029365297082e-05, + "loss": 0.4789, + "step": 5175 + }, + { + "epoch": 0.6137792007589232, + "grad_norm": 1.2961344081980335, + "learning_rate": 4.802843109159334e-05, + "loss": 0.5592, + "step": 5176 + }, + { + "epoch": 0.6138977825210482, + "grad_norm": 1.2586755603118107, + "learning_rate": 4.802749667381079e-05, + "loss": 0.3885, + "step": 5177 + }, + { + "epoch": 0.6140163642831733, + "grad_norm": 1.700934675478272, + "learning_rate": 4.802656204374296e-05, + "loss": 0.5543, + "step": 5178 + }, + { + "epoch": 0.6141349460452983, + "grad_norm": 1.2312058048800438, + "learning_rate": 4.802562720139846e-05, + "loss": 0.4855, + "step": 5179 + }, + { + "epoch": 0.6142535278074233, + "grad_norm": 1.2940083801497975, + "learning_rate": 4.802469214678591e-05, + "loss": 0.4376, + "step": 5180 + }, + { + "epoch": 0.6143721095695482, + "grad_norm": 1.2518996598134267, + "learning_rate": 4.802375687991394e-05, + "loss": 0.3781, + "step": 5181 + }, + { + "epoch": 0.6144906913316732, + "grad_norm": 1.3161944717652352, + "learning_rate": 4.802282140079116e-05, + "loss": 0.516, + "step": 5182 + }, + { + "epoch": 0.6146092730937982, + "grad_norm": 1.2198231595549482, + "learning_rate": 4.80218857094262e-05, + "loss": 0.5449, + "step": 5183 + }, + { + "epoch": 0.6147278548559232, + "grad_norm": 2.015855901107701, + "learning_rate": 4.802094980582769e-05, + "loss": 0.7629, + "step": 5184 + }, + { + "epoch": 0.6148464366180482, + "grad_norm": 1.5393474726112903, + "learning_rate": 4.802001369000425e-05, + "loss": 0.4966, + "step": 5185 + }, + { + "epoch": 0.6149650183801731, + "grad_norm": 2.144833383921898, + "learning_rate": 4.8019077361964507e-05, + "loss": 0.7549, + "step": 5186 + }, + { + "epoch": 0.6150836001422981, + "grad_norm": 1.810907509388293, + "learning_rate": 4.801814082171711e-05, + "loss": 0.5941, + "step": 5187 + }, + { + "epoch": 0.6152021819044231, + "grad_norm": 1.4412754687430986, + "learning_rate": 4.801720406927067e-05, + "loss": 0.5888, + "step": 5188 + }, + { + "epoch": 0.6153207636665481, + "grad_norm": 1.518589572930433, + "learning_rate": 4.801626710463384e-05, + "loss": 0.5193, + "step": 5189 + }, + { + "epoch": 0.6154393454286731, + "grad_norm": 1.63115004289834, + "learning_rate": 4.8015329927815245e-05, + "loss": 0.625, + "step": 5190 + }, + { + "epoch": 0.615557927190798, + "grad_norm": 1.38023197790358, + "learning_rate": 4.801439253882354e-05, + "loss": 0.5813, + "step": 5191 + }, + { + "epoch": 0.615676508952923, + "grad_norm": 1.5094069168227893, + "learning_rate": 4.8013454937667354e-05, + "loss": 0.6756, + "step": 5192 + }, + { + "epoch": 0.615795090715048, + "grad_norm": 1.2429148414529627, + "learning_rate": 4.801251712435534e-05, + "loss": 0.418, + "step": 5193 + }, + { + "epoch": 0.615913672477173, + "grad_norm": 1.4793392823217313, + "learning_rate": 4.8011579098896133e-05, + "loss": 0.6597, + "step": 5194 + }, + { + "epoch": 0.616032254239298, + "grad_norm": 1.602949514458845, + "learning_rate": 4.8010640861298385e-05, + "loss": 0.6783, + "step": 5195 + }, + { + "epoch": 0.616150836001423, + "grad_norm": 1.192940029114069, + "learning_rate": 4.800970241157074e-05, + "loss": 0.4723, + "step": 5196 + }, + { + "epoch": 0.6162694177635479, + "grad_norm": 1.9375481194456907, + "learning_rate": 4.8008763749721864e-05, + "loss": 0.6617, + "step": 5197 + }, + { + "epoch": 0.6163879995256729, + "grad_norm": 1.3285995089012554, + "learning_rate": 4.80078248757604e-05, + "loss": 0.4455, + "step": 5198 + }, + { + "epoch": 0.6165065812877979, + "grad_norm": 1.8530405728766532, + "learning_rate": 4.8006885789695e-05, + "loss": 0.7117, + "step": 5199 + }, + { + "epoch": 0.6166251630499229, + "grad_norm": 1.2311028382268137, + "learning_rate": 4.800594649153433e-05, + "loss": 0.4284, + "step": 5200 + }, + { + "epoch": 0.6167437448120479, + "grad_norm": 1.453170854914148, + "learning_rate": 4.800500698128705e-05, + "loss": 0.5454, + "step": 5201 + }, + { + "epoch": 0.6168623265741728, + "grad_norm": 1.2923622946712152, + "learning_rate": 4.8004067258961804e-05, + "loss": 0.586, + "step": 5202 + }, + { + "epoch": 0.6169809083362979, + "grad_norm": 1.4827790428823606, + "learning_rate": 4.800312732456728e-05, + "loss": 0.4296, + "step": 5203 + }, + { + "epoch": 0.6170994900984229, + "grad_norm": 1.1771660476046084, + "learning_rate": 4.800218717811213e-05, + "loss": 0.438, + "step": 5204 + }, + { + "epoch": 0.6172180718605479, + "grad_norm": 1.3120265328464134, + "learning_rate": 4.800124681960501e-05, + "loss": 0.4572, + "step": 5205 + }, + { + "epoch": 0.6173366536226729, + "grad_norm": 1.4274768825943103, + "learning_rate": 4.800030624905461e-05, + "loss": 0.4599, + "step": 5206 + }, + { + "epoch": 0.6174552353847979, + "grad_norm": 1.5852538817939432, + "learning_rate": 4.799936546646958e-05, + "loss": 0.7439, + "step": 5207 + }, + { + "epoch": 0.6175738171469228, + "grad_norm": 1.6253354775753432, + "learning_rate": 4.799842447185862e-05, + "loss": 0.5808, + "step": 5208 + }, + { + "epoch": 0.6176923989090478, + "grad_norm": 1.491964456725777, + "learning_rate": 4.799748326523037e-05, + "loss": 0.462, + "step": 5209 + }, + { + "epoch": 0.6178109806711728, + "grad_norm": 1.7656878350355303, + "learning_rate": 4.799654184659353e-05, + "loss": 0.6902, + "step": 5210 + }, + { + "epoch": 0.6179295624332978, + "grad_norm": 1.6540136315847704, + "learning_rate": 4.799560021595679e-05, + "loss": 0.5757, + "step": 5211 + }, + { + "epoch": 0.6180481441954228, + "grad_norm": 1.4330364142853929, + "learning_rate": 4.7994658373328804e-05, + "loss": 0.6828, + "step": 5212 + }, + { + "epoch": 0.6181667259575477, + "grad_norm": 1.188715352179939, + "learning_rate": 4.799371631871827e-05, + "loss": 0.3853, + "step": 5213 + }, + { + "epoch": 0.6182853077196727, + "grad_norm": 1.5531137280959875, + "learning_rate": 4.799277405213386e-05, + "loss": 0.598, + "step": 5214 + }, + { + "epoch": 0.6184038894817977, + "grad_norm": 1.3596991854182263, + "learning_rate": 4.799183157358428e-05, + "loss": 0.4502, + "step": 5215 + }, + { + "epoch": 0.6185224712439227, + "grad_norm": 1.2682220929396502, + "learning_rate": 4.79908888830782e-05, + "loss": 0.3503, + "step": 5216 + }, + { + "epoch": 0.6186410530060477, + "grad_norm": 1.6152658151749104, + "learning_rate": 4.7989945980624316e-05, + "loss": 0.6165, + "step": 5217 + }, + { + "epoch": 0.6187596347681726, + "grad_norm": 1.4928378141660166, + "learning_rate": 4.7989002866231325e-05, + "loss": 0.6361, + "step": 5218 + }, + { + "epoch": 0.6188782165302976, + "grad_norm": 2.116836739659109, + "learning_rate": 4.7988059539907925e-05, + "loss": 0.6442, + "step": 5219 + }, + { + "epoch": 0.6189967982924226, + "grad_norm": 1.5715249091356747, + "learning_rate": 4.7987116001662804e-05, + "loss": 0.6779, + "step": 5220 + }, + { + "epoch": 0.6191153800545476, + "grad_norm": 1.3270125809588837, + "learning_rate": 4.798617225150465e-05, + "loss": 0.4091, + "step": 5221 + }, + { + "epoch": 0.6192339618166726, + "grad_norm": 1.1953556167546557, + "learning_rate": 4.798522828944218e-05, + "loss": 0.3607, + "step": 5222 + }, + { + "epoch": 0.6193525435787975, + "grad_norm": 1.3274089814673415, + "learning_rate": 4.79842841154841e-05, + "loss": 0.4251, + "step": 5223 + }, + { + "epoch": 0.6194711253409225, + "grad_norm": 1.1965923880788638, + "learning_rate": 4.79833397296391e-05, + "loss": 0.3778, + "step": 5224 + }, + { + "epoch": 0.6195897071030475, + "grad_norm": 1.655606942835597, + "learning_rate": 4.798239513191589e-05, + "loss": 0.5837, + "step": 5225 + }, + { + "epoch": 0.6197082888651725, + "grad_norm": 1.367628512959878, + "learning_rate": 4.798145032232319e-05, + "loss": 0.4299, + "step": 5226 + }, + { + "epoch": 0.6198268706272976, + "grad_norm": 1.7302250399300894, + "learning_rate": 4.798050530086969e-05, + "loss": 0.5681, + "step": 5227 + }, + { + "epoch": 0.6199454523894226, + "grad_norm": 1.3939233160302065, + "learning_rate": 4.797956006756411e-05, + "loss": 0.4819, + "step": 5228 + }, + { + "epoch": 0.6200640341515475, + "grad_norm": 1.3601445546394562, + "learning_rate": 4.797861462241517e-05, + "loss": 0.4351, + "step": 5229 + }, + { + "epoch": 0.6201826159136725, + "grad_norm": 1.2433793157266273, + "learning_rate": 4.7977668965431584e-05, + "loss": 0.4173, + "step": 5230 + }, + { + "epoch": 0.6203011976757975, + "grad_norm": 1.6020965579517303, + "learning_rate": 4.797672309662206e-05, + "loss": 0.5502, + "step": 5231 + }, + { + "epoch": 0.6204197794379225, + "grad_norm": 1.6778860197342216, + "learning_rate": 4.797577701599533e-05, + "loss": 0.6392, + "step": 5232 + }, + { + "epoch": 0.6205383612000475, + "grad_norm": 1.077994781475414, + "learning_rate": 4.797483072356011e-05, + "loss": 0.3416, + "step": 5233 + }, + { + "epoch": 0.6206569429621724, + "grad_norm": 1.9310318167058182, + "learning_rate": 4.797388421932513e-05, + "loss": 0.6625, + "step": 5234 + }, + { + "epoch": 0.6207755247242974, + "grad_norm": 1.7579325934771286, + "learning_rate": 4.79729375032991e-05, + "loss": 0.5411, + "step": 5235 + }, + { + "epoch": 0.6208941064864224, + "grad_norm": 1.60394214135641, + "learning_rate": 4.797199057549076e-05, + "loss": 0.6265, + "step": 5236 + }, + { + "epoch": 0.6210126882485474, + "grad_norm": 1.5044085247749917, + "learning_rate": 4.797104343590883e-05, + "loss": 0.4975, + "step": 5237 + }, + { + "epoch": 0.6211312700106724, + "grad_norm": 2.4618772764711974, + "learning_rate": 4.797009608456206e-05, + "loss": 0.5313, + "step": 5238 + }, + { + "epoch": 0.6212498517727973, + "grad_norm": 1.9685998396889728, + "learning_rate": 4.796914852145917e-05, + "loss": 0.6543, + "step": 5239 + }, + { + "epoch": 0.6213684335349223, + "grad_norm": 2.123503393469146, + "learning_rate": 4.7968200746608896e-05, + "loss": 0.8104, + "step": 5240 + }, + { + "epoch": 0.6214870152970473, + "grad_norm": 1.5245313250283807, + "learning_rate": 4.7967252760019976e-05, + "loss": 0.4941, + "step": 5241 + }, + { + "epoch": 0.6216055970591723, + "grad_norm": 1.419204909119834, + "learning_rate": 4.796630456170115e-05, + "loss": 0.4651, + "step": 5242 + }, + { + "epoch": 0.6217241788212973, + "grad_norm": 1.5637572136428668, + "learning_rate": 4.796535615166116e-05, + "loss": 0.7094, + "step": 5243 + }, + { + "epoch": 0.6218427605834222, + "grad_norm": 1.3782029370496744, + "learning_rate": 4.796440752990875e-05, + "loss": 0.5342, + "step": 5244 + }, + { + "epoch": 0.6219613423455472, + "grad_norm": 1.3345791144494041, + "learning_rate": 4.796345869645266e-05, + "loss": 0.4697, + "step": 5245 + }, + { + "epoch": 0.6220799241076722, + "grad_norm": 1.4478013872788873, + "learning_rate": 4.796250965130164e-05, + "loss": 0.5671, + "step": 5246 + }, + { + "epoch": 0.6221985058697972, + "grad_norm": 1.4113780002777003, + "learning_rate": 4.7961560394464445e-05, + "loss": 0.7073, + "step": 5247 + }, + { + "epoch": 0.6223170876319222, + "grad_norm": 1.2016300524607295, + "learning_rate": 4.796061092594982e-05, + "loss": 0.3753, + "step": 5248 + }, + { + "epoch": 0.6224356693940472, + "grad_norm": 1.1606463363521238, + "learning_rate": 4.795966124576651e-05, + "loss": 0.4465, + "step": 5249 + }, + { + "epoch": 0.6225542511561721, + "grad_norm": 1.471461781009048, + "learning_rate": 4.795871135392329e-05, + "loss": 0.5272, + "step": 5250 + }, + { + "epoch": 0.6226728329182971, + "grad_norm": 1.2371438287816041, + "learning_rate": 4.795776125042889e-05, + "loss": 0.5247, + "step": 5251 + }, + { + "epoch": 0.6227914146804222, + "grad_norm": 0.9522004131849651, + "learning_rate": 4.7956810935292095e-05, + "loss": 0.3285, + "step": 5252 + }, + { + "epoch": 0.6229099964425472, + "grad_norm": 1.6152034614240498, + "learning_rate": 4.795586040852165e-05, + "loss": 0.7852, + "step": 5253 + }, + { + "epoch": 0.6230285782046722, + "grad_norm": 1.3824032134032107, + "learning_rate": 4.7954909670126323e-05, + "loss": 0.5268, + "step": 5254 + }, + { + "epoch": 0.6231471599667971, + "grad_norm": 1.746365282396822, + "learning_rate": 4.7953958720114886e-05, + "loss": 0.4973, + "step": 5255 + }, + { + "epoch": 0.6232657417289221, + "grad_norm": 1.3657154891230598, + "learning_rate": 4.7953007558496086e-05, + "loss": 0.5596, + "step": 5256 + }, + { + "epoch": 0.6233843234910471, + "grad_norm": 1.5449486685269789, + "learning_rate": 4.795205618527871e-05, + "loss": 0.3757, + "step": 5257 + }, + { + "epoch": 0.6235029052531721, + "grad_norm": 2.2666428743029092, + "learning_rate": 4.795110460047152e-05, + "loss": 0.7168, + "step": 5258 + }, + { + "epoch": 0.6236214870152971, + "grad_norm": 1.3079556025725079, + "learning_rate": 4.795015280408329e-05, + "loss": 0.4163, + "step": 5259 + }, + { + "epoch": 0.623740068777422, + "grad_norm": 1.3214027249906304, + "learning_rate": 4.7949200796122796e-05, + "loss": 0.4414, + "step": 5260 + }, + { + "epoch": 0.623858650539547, + "grad_norm": 1.6522168254567975, + "learning_rate": 4.794824857659881e-05, + "loss": 0.4816, + "step": 5261 + }, + { + "epoch": 0.623977232301672, + "grad_norm": 1.603538000697992, + "learning_rate": 4.7947296145520115e-05, + "loss": 0.7394, + "step": 5262 + }, + { + "epoch": 0.624095814063797, + "grad_norm": 1.7949231221731508, + "learning_rate": 4.7946343502895485e-05, + "loss": 0.6377, + "step": 5263 + }, + { + "epoch": 0.624214395825922, + "grad_norm": 1.7999610725121924, + "learning_rate": 4.7945390648733714e-05, + "loss": 0.5805, + "step": 5264 + }, + { + "epoch": 0.624332977588047, + "grad_norm": 1.4886701509539344, + "learning_rate": 4.794443758304358e-05, + "loss": 0.5166, + "step": 5265 + }, + { + "epoch": 0.6244515593501719, + "grad_norm": 1.6024398964964393, + "learning_rate": 4.794348430583386e-05, + "loss": 0.5625, + "step": 5266 + }, + { + "epoch": 0.6245701411122969, + "grad_norm": 1.3267546946989428, + "learning_rate": 4.794253081711335e-05, + "loss": 0.41, + "step": 5267 + }, + { + "epoch": 0.6246887228744219, + "grad_norm": 1.354950208517627, + "learning_rate": 4.7941577116890844e-05, + "loss": 0.4797, + "step": 5268 + }, + { + "epoch": 0.6248073046365469, + "grad_norm": 1.388058405252482, + "learning_rate": 4.7940623205175127e-05, + "loss": 0.5673, + "step": 5269 + }, + { + "epoch": 0.6249258863986719, + "grad_norm": 1.5142074365401943, + "learning_rate": 4.7939669081974994e-05, + "loss": 0.6708, + "step": 5270 + }, + { + "epoch": 0.6250444681607968, + "grad_norm": 1.517379599536678, + "learning_rate": 4.793871474729925e-05, + "loss": 0.5848, + "step": 5271 + }, + { + "epoch": 0.6251630499229218, + "grad_norm": 2.173920314289835, + "learning_rate": 4.7937760201156675e-05, + "loss": 0.8125, + "step": 5272 + }, + { + "epoch": 0.6252816316850468, + "grad_norm": 1.6303194616283676, + "learning_rate": 4.7936805443556086e-05, + "loss": 0.6143, + "step": 5273 + }, + { + "epoch": 0.6254002134471718, + "grad_norm": 1.5740363650831375, + "learning_rate": 4.7935850474506284e-05, + "loss": 0.52, + "step": 5274 + }, + { + "epoch": 0.6255187952092968, + "grad_norm": 1.7926411918258756, + "learning_rate": 4.793489529401606e-05, + "loss": 0.6414, + "step": 5275 + }, + { + "epoch": 0.6256373769714217, + "grad_norm": 1.4016383586236063, + "learning_rate": 4.793393990209422e-05, + "loss": 0.5189, + "step": 5276 + }, + { + "epoch": 0.6257559587335468, + "grad_norm": 1.4494240614145348, + "learning_rate": 4.7932984298749584e-05, + "loss": 0.4827, + "step": 5277 + }, + { + "epoch": 0.6258745404956718, + "grad_norm": 1.3389014150554297, + "learning_rate": 4.793202848399095e-05, + "loss": 0.5063, + "step": 5278 + }, + { + "epoch": 0.6259931222577968, + "grad_norm": 1.325004279898177, + "learning_rate": 4.7931072457827145e-05, + "loss": 0.4938, + "step": 5279 + }, + { + "epoch": 0.6261117040199218, + "grad_norm": 1.3346700428632767, + "learning_rate": 4.793011622026696e-05, + "loss": 0.4091, + "step": 5280 + }, + { + "epoch": 0.6262302857820468, + "grad_norm": 1.5350788608674566, + "learning_rate": 4.792915977131923e-05, + "loss": 0.5043, + "step": 5281 + }, + { + "epoch": 0.6263488675441717, + "grad_norm": 1.2796133345645224, + "learning_rate": 4.7928203110992766e-05, + "loss": 0.4634, + "step": 5282 + }, + { + "epoch": 0.6264674493062967, + "grad_norm": 1.2956359825959152, + "learning_rate": 4.792724623929637e-05, + "loss": 0.4306, + "step": 5283 + }, + { + "epoch": 0.6265860310684217, + "grad_norm": 1.5706668820443535, + "learning_rate": 4.79262891562389e-05, + "loss": 0.5849, + "step": 5284 + }, + { + "epoch": 0.6267046128305467, + "grad_norm": 1.609065747407046, + "learning_rate": 4.792533186182915e-05, + "loss": 0.6741, + "step": 5285 + }, + { + "epoch": 0.6268231945926717, + "grad_norm": 1.4213773612693494, + "learning_rate": 4.7924374356075955e-05, + "loss": 0.5323, + "step": 5286 + }, + { + "epoch": 0.6269417763547966, + "grad_norm": 1.5844943953545954, + "learning_rate": 4.792341663898813e-05, + "loss": 0.6792, + "step": 5287 + }, + { + "epoch": 0.6270603581169216, + "grad_norm": 1.3695743725875875, + "learning_rate": 4.792245871057452e-05, + "loss": 0.5285, + "step": 5288 + }, + { + "epoch": 0.6271789398790466, + "grad_norm": 1.5135072876060478, + "learning_rate": 4.7921500570843955e-05, + "loss": 0.6335, + "step": 5289 + }, + { + "epoch": 0.6272975216411716, + "grad_norm": 1.2789365778974466, + "learning_rate": 4.792054221980525e-05, + "loss": 0.5112, + "step": 5290 + }, + { + "epoch": 0.6274161034032966, + "grad_norm": 1.2680910751092334, + "learning_rate": 4.791958365746727e-05, + "loss": 0.4354, + "step": 5291 + }, + { + "epoch": 0.6275346851654215, + "grad_norm": 1.5424033643003792, + "learning_rate": 4.7918624883838824e-05, + "loss": 0.5543, + "step": 5292 + }, + { + "epoch": 0.6276532669275465, + "grad_norm": 1.515140621517723, + "learning_rate": 4.791766589892877e-05, + "loss": 0.5969, + "step": 5293 + }, + { + "epoch": 0.6277718486896715, + "grad_norm": 1.3953447862366992, + "learning_rate": 4.791670670274593e-05, + "loss": 0.5517, + "step": 5294 + }, + { + "epoch": 0.6278904304517965, + "grad_norm": 1.7660143833776598, + "learning_rate": 4.791574729529916e-05, + "loss": 0.691, + "step": 5295 + }, + { + "epoch": 0.6280090122139215, + "grad_norm": 1.5577937516447835, + "learning_rate": 4.7914787676597296e-05, + "loss": 0.5777, + "step": 5296 + }, + { + "epoch": 0.6281275939760464, + "grad_norm": 1.3602367688853483, + "learning_rate": 4.7913827846649196e-05, + "loss": 0.489, + "step": 5297 + }, + { + "epoch": 0.6282461757381714, + "grad_norm": 1.588140437823987, + "learning_rate": 4.7912867805463704e-05, + "loss": 0.5147, + "step": 5298 + }, + { + "epoch": 0.6283647575002964, + "grad_norm": 1.2285836215654025, + "learning_rate": 4.791190755304966e-05, + "loss": 0.399, + "step": 5299 + }, + { + "epoch": 0.6284833392624214, + "grad_norm": 1.3350271505634235, + "learning_rate": 4.791094708941594e-05, + "loss": 0.4623, + "step": 5300 + }, + { + "epoch": 0.6286019210245464, + "grad_norm": 1.4538059357218496, + "learning_rate": 4.790998641457137e-05, + "loss": 0.5195, + "step": 5301 + }, + { + "epoch": 0.6287205027866715, + "grad_norm": 1.291021045182473, + "learning_rate": 4.790902552852482e-05, + "loss": 0.402, + "step": 5302 + }, + { + "epoch": 0.6288390845487964, + "grad_norm": 1.5663951260754305, + "learning_rate": 4.7908064431285146e-05, + "loss": 0.545, + "step": 5303 + }, + { + "epoch": 0.6289576663109214, + "grad_norm": 1.2711913373147226, + "learning_rate": 4.7907103122861217e-05, + "loss": 0.4595, + "step": 5304 + }, + { + "epoch": 0.6290762480730464, + "grad_norm": 1.733945287599582, + "learning_rate": 4.7906141603261884e-05, + "loss": 0.665, + "step": 5305 + }, + { + "epoch": 0.6291948298351714, + "grad_norm": 1.2425178389956333, + "learning_rate": 4.790517987249602e-05, + "loss": 0.3812, + "step": 5306 + }, + { + "epoch": 0.6293134115972964, + "grad_norm": 1.3721106080274412, + "learning_rate": 4.7904217930572474e-05, + "loss": 0.4459, + "step": 5307 + }, + { + "epoch": 0.6294319933594213, + "grad_norm": 1.2730843573854453, + "learning_rate": 4.790325577750013e-05, + "loss": 0.436, + "step": 5308 + }, + { + "epoch": 0.6295505751215463, + "grad_norm": 1.5932038503093149, + "learning_rate": 4.790229341328786e-05, + "loss": 0.5711, + "step": 5309 + }, + { + "epoch": 0.6296691568836713, + "grad_norm": 1.6000006533912348, + "learning_rate": 4.7901330837944525e-05, + "loss": 0.4992, + "step": 5310 + }, + { + "epoch": 0.6297877386457963, + "grad_norm": 1.5628922023545795, + "learning_rate": 4.7900368051479e-05, + "loss": 0.4981, + "step": 5311 + }, + { + "epoch": 0.6299063204079213, + "grad_norm": 1.301749285990473, + "learning_rate": 4.7899405053900174e-05, + "loss": 0.4748, + "step": 5312 + }, + { + "epoch": 0.6300249021700463, + "grad_norm": 1.5502551861093352, + "learning_rate": 4.789844184521691e-05, + "loss": 0.5546, + "step": 5313 + }, + { + "epoch": 0.6301434839321712, + "grad_norm": 1.616725992364907, + "learning_rate": 4.7897478425438086e-05, + "loss": 0.6546, + "step": 5314 + }, + { + "epoch": 0.6302620656942962, + "grad_norm": 1.3352346431213222, + "learning_rate": 4.7896514794572595e-05, + "loss": 0.5913, + "step": 5315 + }, + { + "epoch": 0.6303806474564212, + "grad_norm": 1.5155149505113874, + "learning_rate": 4.789555095262931e-05, + "loss": 0.6493, + "step": 5316 + }, + { + "epoch": 0.6304992292185462, + "grad_norm": 1.6332759299045392, + "learning_rate": 4.7894586899617134e-05, + "loss": 0.7251, + "step": 5317 + }, + { + "epoch": 0.6306178109806712, + "grad_norm": 1.9917679330040885, + "learning_rate": 4.789362263554493e-05, + "loss": 0.6212, + "step": 5318 + }, + { + "epoch": 0.6307363927427961, + "grad_norm": 1.8216377879125303, + "learning_rate": 4.78926581604216e-05, + "loss": 0.6579, + "step": 5319 + }, + { + "epoch": 0.6308549745049211, + "grad_norm": 1.2042622774016865, + "learning_rate": 4.789169347425604e-05, + "loss": 0.3977, + "step": 5320 + }, + { + "epoch": 0.6309735562670461, + "grad_norm": 1.415696365251393, + "learning_rate": 4.789072857705713e-05, + "loss": 0.567, + "step": 5321 + }, + { + "epoch": 0.6310921380291711, + "grad_norm": 1.2970759302848736, + "learning_rate": 4.788976346883378e-05, + "loss": 0.4339, + "step": 5322 + }, + { + "epoch": 0.6312107197912961, + "grad_norm": 1.4556488312873683, + "learning_rate": 4.7888798149594884e-05, + "loss": 0.5294, + "step": 5323 + }, + { + "epoch": 0.631329301553421, + "grad_norm": 1.4087624411583701, + "learning_rate": 4.7887832619349326e-05, + "loss": 0.5091, + "step": 5324 + }, + { + "epoch": 0.631447883315546, + "grad_norm": 1.635384123807785, + "learning_rate": 4.788686687810603e-05, + "loss": 0.4875, + "step": 5325 + }, + { + "epoch": 0.631566465077671, + "grad_norm": 1.3787383970469083, + "learning_rate": 4.7885900925873874e-05, + "loss": 0.5108, + "step": 5326 + }, + { + "epoch": 0.6316850468397961, + "grad_norm": 1.4586136524507132, + "learning_rate": 4.788493476266179e-05, + "loss": 0.507, + "step": 5327 + }, + { + "epoch": 0.6318036286019211, + "grad_norm": 1.5983837928584794, + "learning_rate": 4.788396838847866e-05, + "loss": 0.7177, + "step": 5328 + }, + { + "epoch": 0.631922210364046, + "grad_norm": 1.5333005814466545, + "learning_rate": 4.7883001803333404e-05, + "loss": 0.2756, + "step": 5329 + }, + { + "epoch": 0.632040792126171, + "grad_norm": 1.29924547790288, + "learning_rate": 4.788203500723494e-05, + "loss": 0.4193, + "step": 5330 + }, + { + "epoch": 0.632159373888296, + "grad_norm": 1.3015094821585442, + "learning_rate": 4.7881068000192165e-05, + "loss": 0.4286, + "step": 5331 + }, + { + "epoch": 0.632277955650421, + "grad_norm": 1.4332629460486779, + "learning_rate": 4.788010078221401e-05, + "loss": 0.623, + "step": 5332 + }, + { + "epoch": 0.632396537412546, + "grad_norm": 1.495691841870641, + "learning_rate": 4.787913335330938e-05, + "loss": 0.4498, + "step": 5333 + }, + { + "epoch": 0.632515119174671, + "grad_norm": 2.1246004742534104, + "learning_rate": 4.787816571348719e-05, + "loss": 0.7647, + "step": 5334 + }, + { + "epoch": 0.6326337009367959, + "grad_norm": 2.064230678139192, + "learning_rate": 4.7877197862756374e-05, + "loss": 0.713, + "step": 5335 + }, + { + "epoch": 0.6327522826989209, + "grad_norm": 1.7724726253011236, + "learning_rate": 4.787622980112585e-05, + "loss": 0.6464, + "step": 5336 + }, + { + "epoch": 0.6328708644610459, + "grad_norm": 1.9981101405751718, + "learning_rate": 4.787526152860453e-05, + "loss": 0.8288, + "step": 5337 + }, + { + "epoch": 0.6329894462231709, + "grad_norm": 1.5545201705295337, + "learning_rate": 4.7874293045201355e-05, + "loss": 0.5627, + "step": 5338 + }, + { + "epoch": 0.6331080279852959, + "grad_norm": 1.3343313387755458, + "learning_rate": 4.7873324350925256e-05, + "loss": 0.4463, + "step": 5339 + }, + { + "epoch": 0.6332266097474208, + "grad_norm": 1.7357327950331438, + "learning_rate": 4.787235544578514e-05, + "loss": 0.8354, + "step": 5340 + }, + { + "epoch": 0.6333451915095458, + "grad_norm": 1.3043689970064647, + "learning_rate": 4.7871386329789966e-05, + "loss": 0.4894, + "step": 5341 + }, + { + "epoch": 0.6334637732716708, + "grad_norm": 1.3328476551613646, + "learning_rate": 4.787041700294866e-05, + "loss": 0.3624, + "step": 5342 + }, + { + "epoch": 0.6335823550337958, + "grad_norm": 1.6437652720914036, + "learning_rate": 4.7869447465270144e-05, + "loss": 0.5244, + "step": 5343 + }, + { + "epoch": 0.6337009367959208, + "grad_norm": 1.3153442402192532, + "learning_rate": 4.7868477716763374e-05, + "loss": 0.4051, + "step": 5344 + }, + { + "epoch": 0.6338195185580457, + "grad_norm": 1.4523196332534853, + "learning_rate": 4.7867507757437286e-05, + "loss": 0.5913, + "step": 5345 + }, + { + "epoch": 0.6339381003201707, + "grad_norm": 1.20628361711499, + "learning_rate": 4.7866537587300805e-05, + "loss": 0.4049, + "step": 5346 + }, + { + "epoch": 0.6340566820822957, + "grad_norm": 1.726914157885531, + "learning_rate": 4.78655672063629e-05, + "loss": 0.5617, + "step": 5347 + }, + { + "epoch": 0.6341752638444207, + "grad_norm": 1.1713739385238473, + "learning_rate": 4.786459661463251e-05, + "loss": 0.4262, + "step": 5348 + }, + { + "epoch": 0.6342938456065457, + "grad_norm": 1.3258495618267032, + "learning_rate": 4.786362581211856e-05, + "loss": 0.5309, + "step": 5349 + }, + { + "epoch": 0.6344124273686707, + "grad_norm": 1.8039607954533463, + "learning_rate": 4.786265479883003e-05, + "loss": 0.508, + "step": 5350 + }, + { + "epoch": 0.6345310091307956, + "grad_norm": 1.1221650876828526, + "learning_rate": 4.786168357477586e-05, + "loss": 0.3208, + "step": 5351 + }, + { + "epoch": 0.6346495908929207, + "grad_norm": 1.167831826222556, + "learning_rate": 4.786071213996499e-05, + "loss": 0.3924, + "step": 5352 + }, + { + "epoch": 0.6347681726550457, + "grad_norm": 1.6008869277754967, + "learning_rate": 4.7859740494406404e-05, + "loss": 0.4953, + "step": 5353 + }, + { + "epoch": 0.6348867544171707, + "grad_norm": 1.5108926279564776, + "learning_rate": 4.785876863810903e-05, + "loss": 0.4596, + "step": 5354 + }, + { + "epoch": 0.6350053361792957, + "grad_norm": 1.6840400245069476, + "learning_rate": 4.785779657108185e-05, + "loss": 0.6373, + "step": 5355 + }, + { + "epoch": 0.6351239179414206, + "grad_norm": 1.4753817736596173, + "learning_rate": 4.78568242933338e-05, + "loss": 0.6196, + "step": 5356 + }, + { + "epoch": 0.6352424997035456, + "grad_norm": 1.6554269218594935, + "learning_rate": 4.785585180487388e-05, + "loss": 0.6495, + "step": 5357 + }, + { + "epoch": 0.6353610814656706, + "grad_norm": 2.040615283682844, + "learning_rate": 4.785487910571102e-05, + "loss": 0.6354, + "step": 5358 + }, + { + "epoch": 0.6354796632277956, + "grad_norm": 1.7051311838868, + "learning_rate": 4.7853906195854206e-05, + "loss": 0.551, + "step": 5359 + }, + { + "epoch": 0.6355982449899206, + "grad_norm": 1.5037967534210024, + "learning_rate": 4.7852933075312404e-05, + "loss": 0.4127, + "step": 5360 + }, + { + "epoch": 0.6357168267520455, + "grad_norm": 1.5210801659146729, + "learning_rate": 4.7851959744094575e-05, + "loss": 0.4595, + "step": 5361 + }, + { + "epoch": 0.6358354085141705, + "grad_norm": 1.4883497493153646, + "learning_rate": 4.7850986202209704e-05, + "loss": 0.5642, + "step": 5362 + }, + { + "epoch": 0.6359539902762955, + "grad_norm": 1.4567994230823906, + "learning_rate": 4.7850012449666756e-05, + "loss": 0.4535, + "step": 5363 + }, + { + "epoch": 0.6360725720384205, + "grad_norm": 1.251316372445524, + "learning_rate": 4.784903848647473e-05, + "loss": 0.4453, + "step": 5364 + }, + { + "epoch": 0.6361911538005455, + "grad_norm": 1.4596461663833633, + "learning_rate": 4.784806431264258e-05, + "loss": 0.6271, + "step": 5365 + }, + { + "epoch": 0.6363097355626705, + "grad_norm": 1.2123895120579264, + "learning_rate": 4.784708992817929e-05, + "loss": 0.3851, + "step": 5366 + }, + { + "epoch": 0.6364283173247954, + "grad_norm": 1.5623063767197949, + "learning_rate": 4.7846115333093853e-05, + "loss": 0.4981, + "step": 5367 + }, + { + "epoch": 0.6365468990869204, + "grad_norm": 1.1546380231793918, + "learning_rate": 4.784514052739525e-05, + "loss": 0.431, + "step": 5368 + }, + { + "epoch": 0.6366654808490454, + "grad_norm": 1.2563692084485647, + "learning_rate": 4.7844165511092466e-05, + "loss": 0.4069, + "step": 5369 + }, + { + "epoch": 0.6367840626111704, + "grad_norm": 1.7000793848217148, + "learning_rate": 4.7843190284194484e-05, + "loss": 0.7841, + "step": 5370 + }, + { + "epoch": 0.6369026443732954, + "grad_norm": 1.3885910695022639, + "learning_rate": 4.78422148467103e-05, + "loss": 0.4804, + "step": 5371 + }, + { + "epoch": 0.6370212261354203, + "grad_norm": 1.2620339626398784, + "learning_rate": 4.784123919864891e-05, + "loss": 0.508, + "step": 5372 + }, + { + "epoch": 0.6371398078975453, + "grad_norm": 1.3536776450119712, + "learning_rate": 4.78402633400193e-05, + "loss": 0.5372, + "step": 5373 + }, + { + "epoch": 0.6372583896596703, + "grad_norm": 1.4834589181078492, + "learning_rate": 4.783928727083048e-05, + "loss": 0.585, + "step": 5374 + }, + { + "epoch": 0.6373769714217953, + "grad_norm": 1.3341343414257616, + "learning_rate": 4.783831099109143e-05, + "loss": 0.3697, + "step": 5375 + }, + { + "epoch": 0.6374955531839203, + "grad_norm": 1.1878416365902786, + "learning_rate": 4.783733450081116e-05, + "loss": 0.5103, + "step": 5376 + }, + { + "epoch": 0.6376141349460454, + "grad_norm": 1.2858984708942673, + "learning_rate": 4.783635779999867e-05, + "loss": 0.4269, + "step": 5377 + }, + { + "epoch": 0.6377327167081703, + "grad_norm": 1.3969941516227795, + "learning_rate": 4.7835380888662975e-05, + "loss": 0.4547, + "step": 5378 + }, + { + "epoch": 0.6378512984702953, + "grad_norm": 1.506017036790242, + "learning_rate": 4.783440376681306e-05, + "loss": 0.5481, + "step": 5379 + }, + { + "epoch": 0.6379698802324203, + "grad_norm": 1.0497963886332582, + "learning_rate": 4.783342643445794e-05, + "loss": 0.3821, + "step": 5380 + }, + { + "epoch": 0.6380884619945453, + "grad_norm": 1.3461982766222045, + "learning_rate": 4.783244889160664e-05, + "loss": 0.4196, + "step": 5381 + }, + { + "epoch": 0.6382070437566703, + "grad_norm": 1.4151672866323572, + "learning_rate": 4.783147113826815e-05, + "loss": 0.5746, + "step": 5382 + }, + { + "epoch": 0.6383256255187952, + "grad_norm": 2.197494737440352, + "learning_rate": 4.78304931744515e-05, + "loss": 0.9454, + "step": 5383 + }, + { + "epoch": 0.6384442072809202, + "grad_norm": 1.4815301123104077, + "learning_rate": 4.78295150001657e-05, + "loss": 0.4622, + "step": 5384 + }, + { + "epoch": 0.6385627890430452, + "grad_norm": 1.1691218925618796, + "learning_rate": 4.7828536615419767e-05, + "loss": 0.313, + "step": 5385 + }, + { + "epoch": 0.6386813708051702, + "grad_norm": 1.442858975514327, + "learning_rate": 4.7827558020222716e-05, + "loss": 0.4164, + "step": 5386 + }, + { + "epoch": 0.6387999525672952, + "grad_norm": 1.1769087341457927, + "learning_rate": 4.7826579214583576e-05, + "loss": 0.4112, + "step": 5387 + }, + { + "epoch": 0.6389185343294201, + "grad_norm": 1.687819661768108, + "learning_rate": 4.782560019851137e-05, + "loss": 0.5531, + "step": 5388 + }, + { + "epoch": 0.6390371160915451, + "grad_norm": 1.6280674560497048, + "learning_rate": 4.782462097201512e-05, + "loss": 0.5681, + "step": 5389 + }, + { + "epoch": 0.6391556978536701, + "grad_norm": 1.604990187828565, + "learning_rate": 4.7823641535103855e-05, + "loss": 0.4568, + "step": 5390 + }, + { + "epoch": 0.6392742796157951, + "grad_norm": 2.2182105996576196, + "learning_rate": 4.78226618877866e-05, + "loss": 0.9161, + "step": 5391 + }, + { + "epoch": 0.6393928613779201, + "grad_norm": 1.6357106831984065, + "learning_rate": 4.782168203007239e-05, + "loss": 0.6083, + "step": 5392 + }, + { + "epoch": 0.639511443140045, + "grad_norm": 2.1762357100866385, + "learning_rate": 4.782070196197026e-05, + "loss": 0.4393, + "step": 5393 + }, + { + "epoch": 0.63963002490217, + "grad_norm": 1.3596593328892868, + "learning_rate": 4.781972168348924e-05, + "loss": 0.4523, + "step": 5394 + }, + { + "epoch": 0.639748606664295, + "grad_norm": 1.6920758061302337, + "learning_rate": 4.7818741194638375e-05, + "loss": 0.6521, + "step": 5395 + }, + { + "epoch": 0.63986718842642, + "grad_norm": 1.4814534633552972, + "learning_rate": 4.781776049542669e-05, + "loss": 0.5402, + "step": 5396 + }, + { + "epoch": 0.639985770188545, + "grad_norm": 1.8233535182112188, + "learning_rate": 4.7816779585863235e-05, + "loss": 0.7446, + "step": 5397 + }, + { + "epoch": 0.64010435195067, + "grad_norm": 1.111412618242199, + "learning_rate": 4.781579846595705e-05, + "loss": 0.365, + "step": 5398 + }, + { + "epoch": 0.6402229337127949, + "grad_norm": 1.4031088236797145, + "learning_rate": 4.781481713571719e-05, + "loss": 0.4705, + "step": 5399 + }, + { + "epoch": 0.6403415154749199, + "grad_norm": 1.767670869916356, + "learning_rate": 4.7813835595152686e-05, + "loss": 0.6815, + "step": 5400 + }, + { + "epoch": 0.6404600972370449, + "grad_norm": 1.085042201217061, + "learning_rate": 4.78128538442726e-05, + "loss": 0.3866, + "step": 5401 + }, + { + "epoch": 0.64057867899917, + "grad_norm": 1.6226055255289282, + "learning_rate": 4.781187188308597e-05, + "loss": 0.8709, + "step": 5402 + }, + { + "epoch": 0.640697260761295, + "grad_norm": 1.2127150714557444, + "learning_rate": 4.781088971160186e-05, + "loss": 0.3767, + "step": 5403 + }, + { + "epoch": 0.6408158425234199, + "grad_norm": 1.391408584789372, + "learning_rate": 4.780990732982932e-05, + "loss": 0.6142, + "step": 5404 + }, + { + "epoch": 0.6409344242855449, + "grad_norm": 1.305139199911052, + "learning_rate": 4.780892473777741e-05, + "loss": 0.3902, + "step": 5405 + }, + { + "epoch": 0.6410530060476699, + "grad_norm": 1.4319342781171889, + "learning_rate": 4.780794193545517e-05, + "loss": 0.3422, + "step": 5406 + }, + { + "epoch": 0.6411715878097949, + "grad_norm": 1.279484739567199, + "learning_rate": 4.7806958922871686e-05, + "loss": 0.4586, + "step": 5407 + }, + { + "epoch": 0.6412901695719199, + "grad_norm": 1.1384447848896209, + "learning_rate": 4.7805975700036005e-05, + "loss": 0.4097, + "step": 5408 + }, + { + "epoch": 0.6414087513340448, + "grad_norm": 1.4458636367381004, + "learning_rate": 4.780499226695719e-05, + "loss": 0.597, + "step": 5409 + }, + { + "epoch": 0.6415273330961698, + "grad_norm": 1.7424479547365863, + "learning_rate": 4.780400862364432e-05, + "loss": 0.5852, + "step": 5410 + }, + { + "epoch": 0.6416459148582948, + "grad_norm": 1.3822484462263114, + "learning_rate": 4.7803024770106454e-05, + "loss": 0.37, + "step": 5411 + }, + { + "epoch": 0.6417644966204198, + "grad_norm": 1.4809781945287175, + "learning_rate": 4.780204070635266e-05, + "loss": 0.5341, + "step": 5412 + }, + { + "epoch": 0.6418830783825448, + "grad_norm": 1.3051568073139832, + "learning_rate": 4.780105643239201e-05, + "loss": 0.4986, + "step": 5413 + }, + { + "epoch": 0.6420016601446698, + "grad_norm": 1.3572770339767615, + "learning_rate": 4.780007194823358e-05, + "loss": 0.3548, + "step": 5414 + }, + { + "epoch": 0.6421202419067947, + "grad_norm": 1.3235423696564603, + "learning_rate": 4.779908725388645e-05, + "loss": 0.3902, + "step": 5415 + }, + { + "epoch": 0.6422388236689197, + "grad_norm": 1.7832450959702961, + "learning_rate": 4.7798102349359686e-05, + "loss": 0.4074, + "step": 5416 + }, + { + "epoch": 0.6423574054310447, + "grad_norm": 1.5770425977734124, + "learning_rate": 4.7797117234662384e-05, + "loss": 0.5872, + "step": 5417 + }, + { + "epoch": 0.6424759871931697, + "grad_norm": 1.4405424789587729, + "learning_rate": 4.779613190980362e-05, + "loss": 0.4384, + "step": 5418 + }, + { + "epoch": 0.6425945689552947, + "grad_norm": 1.6766428881256972, + "learning_rate": 4.779514637479247e-05, + "loss": 0.5144, + "step": 5419 + }, + { + "epoch": 0.6427131507174196, + "grad_norm": 1.2938108995565323, + "learning_rate": 4.779416062963801e-05, + "loss": 0.3308, + "step": 5420 + }, + { + "epoch": 0.6428317324795446, + "grad_norm": 1.1932600299923726, + "learning_rate": 4.7793174674349354e-05, + "loss": 0.3199, + "step": 5421 + }, + { + "epoch": 0.6429503142416696, + "grad_norm": 1.6319704656123462, + "learning_rate": 4.779218850893558e-05, + "loss": 0.5819, + "step": 5422 + }, + { + "epoch": 0.6430688960037946, + "grad_norm": 1.2588346124421088, + "learning_rate": 4.7791202133405774e-05, + "loss": 0.4005, + "step": 5423 + }, + { + "epoch": 0.6431874777659196, + "grad_norm": 1.436827623485643, + "learning_rate": 4.7790215547769034e-05, + "loss": 0.5772, + "step": 5424 + }, + { + "epoch": 0.6433060595280445, + "grad_norm": 1.5224763509491426, + "learning_rate": 4.778922875203445e-05, + "loss": 0.5826, + "step": 5425 + }, + { + "epoch": 0.6434246412901695, + "grad_norm": 1.3611826402810052, + "learning_rate": 4.778824174621113e-05, + "loss": 0.4757, + "step": 5426 + }, + { + "epoch": 0.6435432230522946, + "grad_norm": 1.781825167157929, + "learning_rate": 4.7787254530308155e-05, + "loss": 0.7802, + "step": 5427 + }, + { + "epoch": 0.6436618048144196, + "grad_norm": 1.4242538877061368, + "learning_rate": 4.778626710433465e-05, + "loss": 0.6275, + "step": 5428 + }, + { + "epoch": 0.6437803865765446, + "grad_norm": 1.6152927803156125, + "learning_rate": 4.7785279468299696e-05, + "loss": 0.5277, + "step": 5429 + }, + { + "epoch": 0.6438989683386696, + "grad_norm": 1.3666454356801594, + "learning_rate": 4.778429162221241e-05, + "loss": 0.4746, + "step": 5430 + }, + { + "epoch": 0.6440175501007945, + "grad_norm": 1.3421190270357968, + "learning_rate": 4.778330356608189e-05, + "loss": 0.5557, + "step": 5431 + }, + { + "epoch": 0.6441361318629195, + "grad_norm": 1.2908804192661198, + "learning_rate": 4.7782315299917256e-05, + "loss": 0.4707, + "step": 5432 + }, + { + "epoch": 0.6442547136250445, + "grad_norm": 1.4459911459496857, + "learning_rate": 4.7781326823727616e-05, + "loss": 0.57, + "step": 5433 + }, + { + "epoch": 0.6443732953871695, + "grad_norm": 1.2327369333227085, + "learning_rate": 4.7780338137522065e-05, + "loss": 0.3389, + "step": 5434 + }, + { + "epoch": 0.6444918771492945, + "grad_norm": 1.4481332285178652, + "learning_rate": 4.7779349241309745e-05, + "loss": 0.4996, + "step": 5435 + }, + { + "epoch": 0.6446104589114194, + "grad_norm": 1.2755738162590706, + "learning_rate": 4.7778360135099756e-05, + "loss": 0.4161, + "step": 5436 + }, + { + "epoch": 0.6447290406735444, + "grad_norm": 1.3636204266738823, + "learning_rate": 4.777737081890121e-05, + "loss": 0.3916, + "step": 5437 + }, + { + "epoch": 0.6448476224356694, + "grad_norm": 1.4398972014008835, + "learning_rate": 4.777638129272325e-05, + "loss": 0.4856, + "step": 5438 + }, + { + "epoch": 0.6449662041977944, + "grad_norm": 1.7389978213229575, + "learning_rate": 4.7775391556574974e-05, + "loss": 0.6661, + "step": 5439 + }, + { + "epoch": 0.6450847859599194, + "grad_norm": 1.3828946160487154, + "learning_rate": 4.777440161046552e-05, + "loss": 0.4211, + "step": 5440 + }, + { + "epoch": 0.6452033677220443, + "grad_norm": 1.5544160825809339, + "learning_rate": 4.7773411454404014e-05, + "loss": 0.4651, + "step": 5441 + }, + { + "epoch": 0.6453219494841693, + "grad_norm": 1.300913010799981, + "learning_rate": 4.777242108839958e-05, + "loss": 0.376, + "step": 5442 + }, + { + "epoch": 0.6454405312462943, + "grad_norm": 1.4494053943460876, + "learning_rate": 4.777143051246135e-05, + "loss": 0.4348, + "step": 5443 + }, + { + "epoch": 0.6455591130084193, + "grad_norm": 1.4202220089777073, + "learning_rate": 4.777043972659845e-05, + "loss": 0.5935, + "step": 5444 + }, + { + "epoch": 0.6456776947705443, + "grad_norm": 1.4142857304078622, + "learning_rate": 4.776944873082002e-05, + "loss": 0.3556, + "step": 5445 + }, + { + "epoch": 0.6457962765326692, + "grad_norm": 1.902449747429788, + "learning_rate": 4.776845752513519e-05, + "loss": 0.5438, + "step": 5446 + }, + { + "epoch": 0.6459148582947942, + "grad_norm": 1.2283827210140938, + "learning_rate": 4.7767466109553114e-05, + "loss": 0.4787, + "step": 5447 + }, + { + "epoch": 0.6460334400569192, + "grad_norm": 1.8411403399388864, + "learning_rate": 4.7766474484082914e-05, + "loss": 0.5716, + "step": 5448 + }, + { + "epoch": 0.6461520218190442, + "grad_norm": 1.8226755890417932, + "learning_rate": 4.776548264873373e-05, + "loss": 0.6661, + "step": 5449 + }, + { + "epoch": 0.6462706035811692, + "grad_norm": 1.857965825763071, + "learning_rate": 4.776449060351472e-05, + "loss": 0.642, + "step": 5450 + }, + { + "epoch": 0.6463891853432941, + "grad_norm": 1.6287274664600788, + "learning_rate": 4.7763498348435025e-05, + "loss": 0.4888, + "step": 5451 + }, + { + "epoch": 0.6465077671054192, + "grad_norm": 1.5013313828070372, + "learning_rate": 4.776250588350379e-05, + "loss": 0.4368, + "step": 5452 + }, + { + "epoch": 0.6466263488675442, + "grad_norm": 1.2167042778289083, + "learning_rate": 4.776151320873016e-05, + "loss": 0.4817, + "step": 5453 + }, + { + "epoch": 0.6467449306296692, + "grad_norm": 1.035093828969511, + "learning_rate": 4.776052032412329e-05, + "loss": 0.3316, + "step": 5454 + }, + { + "epoch": 0.6468635123917942, + "grad_norm": 1.5167762525439679, + "learning_rate": 4.775952722969233e-05, + "loss": 0.5834, + "step": 5455 + }, + { + "epoch": 0.6469820941539192, + "grad_norm": 1.604613759417812, + "learning_rate": 4.775853392544645e-05, + "loss": 0.4661, + "step": 5456 + }, + { + "epoch": 0.6471006759160441, + "grad_norm": 1.4732534189462014, + "learning_rate": 4.775754041139478e-05, + "loss": 0.5017, + "step": 5457 + }, + { + "epoch": 0.6472192576781691, + "grad_norm": 1.2803143846964333, + "learning_rate": 4.775654668754651e-05, + "loss": 0.4826, + "step": 5458 + }, + { + "epoch": 0.6473378394402941, + "grad_norm": 1.6869216406363567, + "learning_rate": 4.7755552753910785e-05, + "loss": 0.7759, + "step": 5459 + }, + { + "epoch": 0.6474564212024191, + "grad_norm": 1.4575977667369064, + "learning_rate": 4.775455861049676e-05, + "loss": 0.5912, + "step": 5460 + }, + { + "epoch": 0.6475750029645441, + "grad_norm": 1.475492662804033, + "learning_rate": 4.7753564257313605e-05, + "loss": 0.4078, + "step": 5461 + }, + { + "epoch": 0.647693584726669, + "grad_norm": 1.2937100753879756, + "learning_rate": 4.7752569694370494e-05, + "loss": 0.4788, + "step": 5462 + }, + { + "epoch": 0.647812166488794, + "grad_norm": 1.8864308269305865, + "learning_rate": 4.775157492167659e-05, + "loss": 0.7408, + "step": 5463 + }, + { + "epoch": 0.647930748250919, + "grad_norm": 1.1742415556966193, + "learning_rate": 4.775057993924107e-05, + "loss": 0.4594, + "step": 5464 + }, + { + "epoch": 0.648049330013044, + "grad_norm": 1.5269020082885998, + "learning_rate": 4.77495847470731e-05, + "loss": 0.5104, + "step": 5465 + }, + { + "epoch": 0.648167911775169, + "grad_norm": 1.3442383685824795, + "learning_rate": 4.774858934518185e-05, + "loss": 0.5688, + "step": 5466 + }, + { + "epoch": 0.648286493537294, + "grad_norm": 1.0926569470969445, + "learning_rate": 4.7747593733576514e-05, + "loss": 0.4409, + "step": 5467 + }, + { + "epoch": 0.6484050752994189, + "grad_norm": 1.4978326774658637, + "learning_rate": 4.774659791226625e-05, + "loss": 0.4878, + "step": 5468 + }, + { + "epoch": 0.6485236570615439, + "grad_norm": 1.4800648307170936, + "learning_rate": 4.774560188126025e-05, + "loss": 0.656, + "step": 5469 + }, + { + "epoch": 0.6486422388236689, + "grad_norm": 1.539092887665531, + "learning_rate": 4.774460564056769e-05, + "loss": 0.5315, + "step": 5470 + }, + { + "epoch": 0.6487608205857939, + "grad_norm": 1.5627909291876174, + "learning_rate": 4.774360919019776e-05, + "loss": 0.5972, + "step": 5471 + }, + { + "epoch": 0.6488794023479189, + "grad_norm": 1.7599269592639044, + "learning_rate": 4.774261253015964e-05, + "loss": 0.5613, + "step": 5472 + }, + { + "epoch": 0.6489979841100438, + "grad_norm": 1.5551229481979199, + "learning_rate": 4.774161566046253e-05, + "loss": 0.5281, + "step": 5473 + }, + { + "epoch": 0.6491165658721688, + "grad_norm": 1.483141266923919, + "learning_rate": 4.7740618581115604e-05, + "loss": 0.5445, + "step": 5474 + }, + { + "epoch": 0.6492351476342938, + "grad_norm": 1.2551531696643516, + "learning_rate": 4.773962129212806e-05, + "loss": 0.3909, + "step": 5475 + }, + { + "epoch": 0.6493537293964188, + "grad_norm": 1.4868683617729141, + "learning_rate": 4.77386237935091e-05, + "loss": 0.6902, + "step": 5476 + }, + { + "epoch": 0.6494723111585439, + "grad_norm": 1.0283816277048843, + "learning_rate": 4.7737626085267906e-05, + "loss": 0.35, + "step": 5477 + }, + { + "epoch": 0.6495908929206688, + "grad_norm": 1.7448799760932534, + "learning_rate": 4.773662816741368e-05, + "loss": 0.6958, + "step": 5478 + }, + { + "epoch": 0.6497094746827938, + "grad_norm": 1.3956807400238729, + "learning_rate": 4.773563003995563e-05, + "loss": 0.4433, + "step": 5479 + }, + { + "epoch": 0.6498280564449188, + "grad_norm": 1.8317350105845756, + "learning_rate": 4.7734631702902946e-05, + "loss": 0.5411, + "step": 5480 + }, + { + "epoch": 0.6499466382070438, + "grad_norm": 1.5882487845778857, + "learning_rate": 4.773363315626484e-05, + "loss": 0.5275, + "step": 5481 + }, + { + "epoch": 0.6500652199691688, + "grad_norm": 1.218483442532768, + "learning_rate": 4.7732634400050514e-05, + "loss": 0.3981, + "step": 5482 + }, + { + "epoch": 0.6501838017312938, + "grad_norm": 1.331287565834744, + "learning_rate": 4.7731635434269175e-05, + "loss": 0.3678, + "step": 5483 + }, + { + "epoch": 0.6503023834934187, + "grad_norm": 1.9894203123083969, + "learning_rate": 4.7730636258930035e-05, + "loss": 0.814, + "step": 5484 + }, + { + "epoch": 0.6504209652555437, + "grad_norm": 1.7955292253996447, + "learning_rate": 4.7729636874042303e-05, + "loss": 0.6522, + "step": 5485 + }, + { + "epoch": 0.6505395470176687, + "grad_norm": 1.6984635155707117, + "learning_rate": 4.772863727961518e-05, + "loss": 0.6717, + "step": 5486 + }, + { + "epoch": 0.6506581287797937, + "grad_norm": 1.9689116033896847, + "learning_rate": 4.772763747565791e-05, + "loss": 0.8422, + "step": 5487 + }, + { + "epoch": 0.6507767105419187, + "grad_norm": 1.0935900590479302, + "learning_rate": 4.7726637462179685e-05, + "loss": 0.3968, + "step": 5488 + }, + { + "epoch": 0.6508952923040436, + "grad_norm": 1.2643656976840127, + "learning_rate": 4.7725637239189735e-05, + "loss": 0.4154, + "step": 5489 + }, + { + "epoch": 0.6510138740661686, + "grad_norm": 1.4353385150991158, + "learning_rate": 4.772463680669728e-05, + "loss": 0.5626, + "step": 5490 + }, + { + "epoch": 0.6511324558282936, + "grad_norm": 0.9953620070113895, + "learning_rate": 4.772363616471153e-05, + "loss": 0.3132, + "step": 5491 + }, + { + "epoch": 0.6512510375904186, + "grad_norm": 1.4692244547511863, + "learning_rate": 4.772263531324173e-05, + "loss": 0.4421, + "step": 5492 + }, + { + "epoch": 0.6513696193525436, + "grad_norm": 1.6434472817410828, + "learning_rate": 4.7721634252297085e-05, + "loss": 0.6394, + "step": 5493 + }, + { + "epoch": 0.6514882011146685, + "grad_norm": 1.1895448360683918, + "learning_rate": 4.7720632981886846e-05, + "loss": 0.4162, + "step": 5494 + }, + { + "epoch": 0.6516067828767935, + "grad_norm": 1.237032113347861, + "learning_rate": 4.771963150202023e-05, + "loss": 0.341, + "step": 5495 + }, + { + "epoch": 0.6517253646389185, + "grad_norm": 1.5835959520451128, + "learning_rate": 4.771862981270647e-05, + "loss": 0.3984, + "step": 5496 + }, + { + "epoch": 0.6518439464010435, + "grad_norm": 1.5580807261587124, + "learning_rate": 4.771762791395481e-05, + "loss": 0.5832, + "step": 5497 + }, + { + "epoch": 0.6519625281631685, + "grad_norm": 2.086172142013894, + "learning_rate": 4.771662580577447e-05, + "loss": 0.7653, + "step": 5498 + }, + { + "epoch": 0.6520811099252934, + "grad_norm": 1.5960225712978253, + "learning_rate": 4.77156234881747e-05, + "loss": 0.4222, + "step": 5499 + }, + { + "epoch": 0.6521996916874184, + "grad_norm": 1.4705984340965006, + "learning_rate": 4.7714620961164735e-05, + "loss": 0.7105, + "step": 5500 + }, + { + "epoch": 0.6523182734495434, + "grad_norm": 1.5453492320340896, + "learning_rate": 4.771361822475382e-05, + "loss": 0.6626, + "step": 5501 + }, + { + "epoch": 0.6524368552116685, + "grad_norm": 1.3432120353182777, + "learning_rate": 4.77126152789512e-05, + "loss": 0.4712, + "step": 5502 + }, + { + "epoch": 0.6525554369737935, + "grad_norm": 1.0210423125604198, + "learning_rate": 4.771161212376612e-05, + "loss": 0.3562, + "step": 5503 + }, + { + "epoch": 0.6526740187359185, + "grad_norm": 1.2342993600058514, + "learning_rate": 4.7710608759207833e-05, + "loss": 0.4858, + "step": 5504 + }, + { + "epoch": 0.6527926004980434, + "grad_norm": 1.5932450921206622, + "learning_rate": 4.770960518528557e-05, + "loss": 0.727, + "step": 5505 + }, + { + "epoch": 0.6529111822601684, + "grad_norm": 1.1883263554735541, + "learning_rate": 4.77086014020086e-05, + "loss": 0.4113, + "step": 5506 + }, + { + "epoch": 0.6530297640222934, + "grad_norm": 1.196053688951196, + "learning_rate": 4.770759740938618e-05, + "loss": 0.4282, + "step": 5507 + }, + { + "epoch": 0.6531483457844184, + "grad_norm": 1.4200075896739233, + "learning_rate": 4.770659320742755e-05, + "loss": 0.5613, + "step": 5508 + }, + { + "epoch": 0.6532669275465434, + "grad_norm": 1.5345439021036995, + "learning_rate": 4.7705588796141974e-05, + "loss": 0.483, + "step": 5509 + }, + { + "epoch": 0.6533855093086683, + "grad_norm": 1.1808965279501091, + "learning_rate": 4.7704584175538715e-05, + "loss": 0.4488, + "step": 5510 + }, + { + "epoch": 0.6535040910707933, + "grad_norm": 1.4190273238530384, + "learning_rate": 4.7703579345627035e-05, + "loss": 0.4623, + "step": 5511 + }, + { + "epoch": 0.6536226728329183, + "grad_norm": 1.17892022842376, + "learning_rate": 4.770257430641619e-05, + "loss": 0.4379, + "step": 5512 + }, + { + "epoch": 0.6537412545950433, + "grad_norm": 1.7285906954612507, + "learning_rate": 4.770156905791545e-05, + "loss": 0.5286, + "step": 5513 + }, + { + "epoch": 0.6538598363571683, + "grad_norm": 1.3250402095894565, + "learning_rate": 4.7700563600134086e-05, + "loss": 0.4404, + "step": 5514 + }, + { + "epoch": 0.6539784181192932, + "grad_norm": 1.7166885057330972, + "learning_rate": 4.769955793308136e-05, + "loss": 0.5474, + "step": 5515 + }, + { + "epoch": 0.6540969998814182, + "grad_norm": 1.4913435454276924, + "learning_rate": 4.769855205676654e-05, + "loss": 0.6202, + "step": 5516 + }, + { + "epoch": 0.6542155816435432, + "grad_norm": 1.6706950089432584, + "learning_rate": 4.769754597119892e-05, + "loss": 0.6473, + "step": 5517 + }, + { + "epoch": 0.6543341634056682, + "grad_norm": 1.303131461306497, + "learning_rate": 4.769653967638775e-05, + "loss": 0.4517, + "step": 5518 + }, + { + "epoch": 0.6544527451677932, + "grad_norm": 1.4084224731721087, + "learning_rate": 4.769553317234232e-05, + "loss": 0.5158, + "step": 5519 + }, + { + "epoch": 0.6545713269299182, + "grad_norm": 1.6007399761276142, + "learning_rate": 4.7694526459071894e-05, + "loss": 0.5913, + "step": 5520 + }, + { + "epoch": 0.6546899086920431, + "grad_norm": 1.416039974432949, + "learning_rate": 4.769351953658578e-05, + "loss": 0.4333, + "step": 5521 + }, + { + "epoch": 0.6548084904541681, + "grad_norm": 1.164617499655951, + "learning_rate": 4.769251240489324e-05, + "loss": 0.4208, + "step": 5522 + }, + { + "epoch": 0.6549270722162931, + "grad_norm": 1.5514947160366102, + "learning_rate": 4.7691505064003554e-05, + "loss": 0.5274, + "step": 5523 + }, + { + "epoch": 0.6550456539784181, + "grad_norm": 1.3909723046319775, + "learning_rate": 4.7690497513926025e-05, + "loss": 0.5395, + "step": 5524 + }, + { + "epoch": 0.655164235740543, + "grad_norm": 1.4909464887992252, + "learning_rate": 4.768948975466993e-05, + "loss": 0.5379, + "step": 5525 + }, + { + "epoch": 0.655282817502668, + "grad_norm": 1.801353281118953, + "learning_rate": 4.768848178624457e-05, + "loss": 0.7727, + "step": 5526 + }, + { + "epoch": 0.6554013992647931, + "grad_norm": 1.5167777058503251, + "learning_rate": 4.768747360865922e-05, + "loss": 0.6603, + "step": 5527 + }, + { + "epoch": 0.6555199810269181, + "grad_norm": 1.4949367346257258, + "learning_rate": 4.768646522192319e-05, + "loss": 0.5946, + "step": 5528 + }, + { + "epoch": 0.6556385627890431, + "grad_norm": 1.423002051414685, + "learning_rate": 4.7685456626045774e-05, + "loss": 0.3946, + "step": 5529 + }, + { + "epoch": 0.6557571445511681, + "grad_norm": 1.87574418389901, + "learning_rate": 4.768444782103627e-05, + "loss": 0.7253, + "step": 5530 + }, + { + "epoch": 0.655875726313293, + "grad_norm": 1.5229904053889278, + "learning_rate": 4.7683438806903964e-05, + "loss": 0.55, + "step": 5531 + }, + { + "epoch": 0.655994308075418, + "grad_norm": 1.3751278644252654, + "learning_rate": 4.768242958365817e-05, + "loss": 0.4518, + "step": 5532 + }, + { + "epoch": 0.656112889837543, + "grad_norm": 1.4412807630448152, + "learning_rate": 4.76814201513082e-05, + "loss": 0.4618, + "step": 5533 + }, + { + "epoch": 0.656231471599668, + "grad_norm": 1.4078066925670683, + "learning_rate": 4.7680410509863337e-05, + "loss": 0.4111, + "step": 5534 + }, + { + "epoch": 0.656350053361793, + "grad_norm": 1.402411008421877, + "learning_rate": 4.767940065933291e-05, + "loss": 0.6071, + "step": 5535 + }, + { + "epoch": 0.656468635123918, + "grad_norm": 1.1915533163873622, + "learning_rate": 4.767839059972622e-05, + "loss": 0.3973, + "step": 5536 + }, + { + "epoch": 0.6565872168860429, + "grad_norm": 1.2714516289447388, + "learning_rate": 4.767738033105257e-05, + "loss": 0.4725, + "step": 5537 + }, + { + "epoch": 0.6567057986481679, + "grad_norm": 1.3804594917645507, + "learning_rate": 4.767636985332129e-05, + "loss": 0.5487, + "step": 5538 + }, + { + "epoch": 0.6568243804102929, + "grad_norm": 1.6629809928032673, + "learning_rate": 4.767535916654169e-05, + "loss": 0.5527, + "step": 5539 + }, + { + "epoch": 0.6569429621724179, + "grad_norm": 1.2884819379466255, + "learning_rate": 4.767434827072308e-05, + "loss": 0.4546, + "step": 5540 + }, + { + "epoch": 0.6570615439345429, + "grad_norm": 1.4469821637263613, + "learning_rate": 4.7673337165874785e-05, + "loss": 0.388, + "step": 5541 + }, + { + "epoch": 0.6571801256966678, + "grad_norm": 1.2178555229960908, + "learning_rate": 4.767232585200613e-05, + "loss": 0.4765, + "step": 5542 + }, + { + "epoch": 0.6572987074587928, + "grad_norm": 2.120933015542263, + "learning_rate": 4.7671314329126426e-05, + "loss": 0.6191, + "step": 5543 + }, + { + "epoch": 0.6574172892209178, + "grad_norm": 1.672918110318068, + "learning_rate": 4.767030259724501e-05, + "loss": 0.6743, + "step": 5544 + }, + { + "epoch": 0.6575358709830428, + "grad_norm": 1.5198129997614163, + "learning_rate": 4.7669290656371205e-05, + "loss": 0.4906, + "step": 5545 + }, + { + "epoch": 0.6576544527451678, + "grad_norm": 1.2960251588640668, + "learning_rate": 4.766827850651433e-05, + "loss": 0.3755, + "step": 5546 + }, + { + "epoch": 0.6577730345072927, + "grad_norm": 1.7814779645457421, + "learning_rate": 4.766726614768373e-05, + "loss": 0.6217, + "step": 5547 + }, + { + "epoch": 0.6578916162694177, + "grad_norm": 1.4857419047062352, + "learning_rate": 4.766625357988873e-05, + "loss": 0.4397, + "step": 5548 + }, + { + "epoch": 0.6580101980315427, + "grad_norm": 1.5592794432789863, + "learning_rate": 4.766524080313868e-05, + "loss": 0.4251, + "step": 5549 + }, + { + "epoch": 0.6581287797936677, + "grad_norm": 1.167676471813798, + "learning_rate": 4.766422781744289e-05, + "loss": 0.348, + "step": 5550 + }, + { + "epoch": 0.6582473615557927, + "grad_norm": 1.5103516982437075, + "learning_rate": 4.7663214622810715e-05, + "loss": 0.488, + "step": 5551 + }, + { + "epoch": 0.6583659433179178, + "grad_norm": 1.6266517907577251, + "learning_rate": 4.76622012192515e-05, + "loss": 0.6529, + "step": 5552 + }, + { + "epoch": 0.6584845250800427, + "grad_norm": 1.812538822833785, + "learning_rate": 4.7661187606774574e-05, + "loss": 0.7525, + "step": 5553 + }, + { + "epoch": 0.6586031068421677, + "grad_norm": 1.8026108733493802, + "learning_rate": 4.7660173785389285e-05, + "loss": 0.6529, + "step": 5554 + }, + { + "epoch": 0.6587216886042927, + "grad_norm": 1.4898206548250443, + "learning_rate": 4.765915975510499e-05, + "loss": 0.5007, + "step": 5555 + }, + { + "epoch": 0.6588402703664177, + "grad_norm": 1.7670391952159232, + "learning_rate": 4.765814551593102e-05, + "loss": 0.5546, + "step": 5556 + }, + { + "epoch": 0.6589588521285427, + "grad_norm": 1.5583754038727928, + "learning_rate": 4.7657131067876734e-05, + "loss": 0.4738, + "step": 5557 + }, + { + "epoch": 0.6590774338906676, + "grad_norm": 1.410780151393153, + "learning_rate": 4.765611641095149e-05, + "loss": 0.5144, + "step": 5558 + }, + { + "epoch": 0.6591960156527926, + "grad_norm": 1.5352844931344198, + "learning_rate": 4.765510154516463e-05, + "loss": 0.7187, + "step": 5559 + }, + { + "epoch": 0.6593145974149176, + "grad_norm": 1.715649013469572, + "learning_rate": 4.765408647052552e-05, + "loss": 0.7461, + "step": 5560 + }, + { + "epoch": 0.6594331791770426, + "grad_norm": 1.5716769792718548, + "learning_rate": 4.765307118704351e-05, + "loss": 0.6152, + "step": 5561 + }, + { + "epoch": 0.6595517609391676, + "grad_norm": 1.4602411962986155, + "learning_rate": 4.765205569472796e-05, + "loss": 0.4704, + "step": 5562 + }, + { + "epoch": 0.6596703427012925, + "grad_norm": 1.2454119480019343, + "learning_rate": 4.765103999358823e-05, + "loss": 0.5274, + "step": 5563 + }, + { + "epoch": 0.6597889244634175, + "grad_norm": 1.2977436715118487, + "learning_rate": 4.76500240836337e-05, + "loss": 0.5524, + "step": 5564 + }, + { + "epoch": 0.6599075062255425, + "grad_norm": 1.4396780924175279, + "learning_rate": 4.764900796487371e-05, + "loss": 0.5353, + "step": 5565 + }, + { + "epoch": 0.6600260879876675, + "grad_norm": 1.4270331336616673, + "learning_rate": 4.7647991637317656e-05, + "loss": 0.4987, + "step": 5566 + }, + { + "epoch": 0.6601446697497925, + "grad_norm": 1.2737595673598705, + "learning_rate": 4.7646975100974884e-05, + "loss": 0.4507, + "step": 5567 + }, + { + "epoch": 0.6602632515119174, + "grad_norm": 1.4701661176921312, + "learning_rate": 4.7645958355854766e-05, + "loss": 0.6398, + "step": 5568 + }, + { + "epoch": 0.6603818332740424, + "grad_norm": 1.5511452836945798, + "learning_rate": 4.764494140196669e-05, + "loss": 0.5602, + "step": 5569 + }, + { + "epoch": 0.6605004150361674, + "grad_norm": 1.2797118295655874, + "learning_rate": 4.7643924239320023e-05, + "loss": 0.4516, + "step": 5570 + }, + { + "epoch": 0.6606189967982924, + "grad_norm": 1.3468545053419123, + "learning_rate": 4.764290686792415e-05, + "loss": 0.4556, + "step": 5571 + }, + { + "epoch": 0.6607375785604174, + "grad_norm": 1.2871862747597538, + "learning_rate": 4.7641889287788435e-05, + "loss": 0.4775, + "step": 5572 + }, + { + "epoch": 0.6608561603225424, + "grad_norm": 1.2654100247589102, + "learning_rate": 4.764087149892226e-05, + "loss": 0.4236, + "step": 5573 + }, + { + "epoch": 0.6609747420846673, + "grad_norm": 1.5751204936076302, + "learning_rate": 4.763985350133502e-05, + "loss": 0.6585, + "step": 5574 + }, + { + "epoch": 0.6610933238467923, + "grad_norm": 1.6906274343266041, + "learning_rate": 4.76388352950361e-05, + "loss": 0.7877, + "step": 5575 + }, + { + "epoch": 0.6612119056089173, + "grad_norm": 1.571394152040877, + "learning_rate": 4.7637816880034866e-05, + "loss": 0.5419, + "step": 5576 + }, + { + "epoch": 0.6613304873710424, + "grad_norm": 1.4774632252518067, + "learning_rate": 4.763679825634073e-05, + "loss": 0.6181, + "step": 5577 + }, + { + "epoch": 0.6614490691331674, + "grad_norm": 1.425563574060063, + "learning_rate": 4.7635779423963075e-05, + "loss": 0.5019, + "step": 5578 + }, + { + "epoch": 0.6615676508952923, + "grad_norm": 2.1006436594990423, + "learning_rate": 4.7634760382911283e-05, + "loss": 0.7034, + "step": 5579 + }, + { + "epoch": 0.6616862326574173, + "grad_norm": 1.7035131521081666, + "learning_rate": 4.763374113319476e-05, + "loss": 0.5857, + "step": 5580 + }, + { + "epoch": 0.6618048144195423, + "grad_norm": 1.160551420182838, + "learning_rate": 4.763272167482291e-05, + "loss": 0.3458, + "step": 5581 + }, + { + "epoch": 0.6619233961816673, + "grad_norm": 1.5750743937732545, + "learning_rate": 4.76317020078051e-05, + "loss": 0.5356, + "step": 5582 + }, + { + "epoch": 0.6620419779437923, + "grad_norm": 1.2207003357773936, + "learning_rate": 4.763068213215076e-05, + "loss": 0.4344, + "step": 5583 + }, + { + "epoch": 0.6621605597059173, + "grad_norm": 1.3631054391252084, + "learning_rate": 4.7629662047869286e-05, + "loss": 0.4548, + "step": 5584 + }, + { + "epoch": 0.6622791414680422, + "grad_norm": 1.4000093442380195, + "learning_rate": 4.762864175497008e-05, + "loss": 0.6564, + "step": 5585 + }, + { + "epoch": 0.6623977232301672, + "grad_norm": 1.3602457084005724, + "learning_rate": 4.762762125346254e-05, + "loss": 0.6229, + "step": 5586 + }, + { + "epoch": 0.6625163049922922, + "grad_norm": 1.3037879926474403, + "learning_rate": 4.762660054335608e-05, + "loss": 0.4283, + "step": 5587 + }, + { + "epoch": 0.6626348867544172, + "grad_norm": 1.3634137545182907, + "learning_rate": 4.762557962466011e-05, + "loss": 0.5352, + "step": 5588 + }, + { + "epoch": 0.6627534685165422, + "grad_norm": 1.2981601889327796, + "learning_rate": 4.7624558497384045e-05, + "loss": 0.4211, + "step": 5589 + }, + { + "epoch": 0.6628720502786671, + "grad_norm": 1.6832766198162132, + "learning_rate": 4.762353716153729e-05, + "loss": 0.6646, + "step": 5590 + }, + { + "epoch": 0.6629906320407921, + "grad_norm": 1.0753178293381827, + "learning_rate": 4.7622515617129274e-05, + "loss": 0.3756, + "step": 5591 + }, + { + "epoch": 0.6631092138029171, + "grad_norm": 1.4718499875401652, + "learning_rate": 4.76214938641694e-05, + "loss": 0.4729, + "step": 5592 + }, + { + "epoch": 0.6632277955650421, + "grad_norm": 1.5267701312617328, + "learning_rate": 4.7620471902667084e-05, + "loss": 0.5159, + "step": 5593 + }, + { + "epoch": 0.6633463773271671, + "grad_norm": 1.2547282420401475, + "learning_rate": 4.7619449732631763e-05, + "loss": 0.417, + "step": 5594 + }, + { + "epoch": 0.663464959089292, + "grad_norm": 1.6506899632869252, + "learning_rate": 4.7618427354072855e-05, + "loss": 0.5606, + "step": 5595 + }, + { + "epoch": 0.663583540851417, + "grad_norm": 1.4150494938916278, + "learning_rate": 4.7617404766999786e-05, + "loss": 0.4706, + "step": 5596 + }, + { + "epoch": 0.663702122613542, + "grad_norm": 1.3883322554639808, + "learning_rate": 4.7616381971421973e-05, + "loss": 0.4994, + "step": 5597 + }, + { + "epoch": 0.663820704375667, + "grad_norm": 1.298505621090561, + "learning_rate": 4.761535896734886e-05, + "loss": 0.3917, + "step": 5598 + }, + { + "epoch": 0.663939286137792, + "grad_norm": 1.6245541923363853, + "learning_rate": 4.7614335754789865e-05, + "loss": 0.4814, + "step": 5599 + }, + { + "epoch": 0.6640578678999169, + "grad_norm": 1.4323275480078919, + "learning_rate": 4.761331233375442e-05, + "loss": 0.3568, + "step": 5600 + }, + { + "epoch": 0.6641764496620419, + "grad_norm": 1.7343320352651483, + "learning_rate": 4.7612288704251977e-05, + "loss": 0.5769, + "step": 5601 + }, + { + "epoch": 0.664295031424167, + "grad_norm": 1.486590100745642, + "learning_rate": 4.761126486629196e-05, + "loss": 0.4244, + "step": 5602 + }, + { + "epoch": 0.664413613186292, + "grad_norm": 1.7283797876027613, + "learning_rate": 4.76102408198838e-05, + "loss": 0.6618, + "step": 5603 + }, + { + "epoch": 0.664532194948417, + "grad_norm": 1.4008102849992075, + "learning_rate": 4.760921656503696e-05, + "loss": 0.5685, + "step": 5604 + }, + { + "epoch": 0.664650776710542, + "grad_norm": 1.7647114296346926, + "learning_rate": 4.760819210176086e-05, + "loss": 0.5761, + "step": 5605 + }, + { + "epoch": 0.6647693584726669, + "grad_norm": 1.5851175574799725, + "learning_rate": 4.760716743006495e-05, + "loss": 0.589, + "step": 5606 + }, + { + "epoch": 0.6648879402347919, + "grad_norm": 1.683217390993462, + "learning_rate": 4.7606142549958685e-05, + "loss": 0.6387, + "step": 5607 + }, + { + "epoch": 0.6650065219969169, + "grad_norm": 1.369570236936051, + "learning_rate": 4.76051174614515e-05, + "loss": 0.3804, + "step": 5608 + }, + { + "epoch": 0.6651251037590419, + "grad_norm": 1.3981975987954398, + "learning_rate": 4.760409216455286e-05, + "loss": 0.3623, + "step": 5609 + }, + { + "epoch": 0.6652436855211669, + "grad_norm": 1.4237396758140843, + "learning_rate": 4.76030666592722e-05, + "loss": 0.4478, + "step": 5610 + }, + { + "epoch": 0.6653622672832918, + "grad_norm": 1.3952658588201083, + "learning_rate": 4.760204094561899e-05, + "loss": 0.3809, + "step": 5611 + }, + { + "epoch": 0.6654808490454168, + "grad_norm": 1.5883259544870982, + "learning_rate": 4.760101502360268e-05, + "loss": 0.4881, + "step": 5612 + }, + { + "epoch": 0.6655994308075418, + "grad_norm": 1.4108621852212344, + "learning_rate": 4.7599988893232725e-05, + "loss": 0.4827, + "step": 5613 + }, + { + "epoch": 0.6657180125696668, + "grad_norm": 1.6265104001112098, + "learning_rate": 4.759896255451858e-05, + "loss": 0.557, + "step": 5614 + }, + { + "epoch": 0.6658365943317918, + "grad_norm": 1.5611252479710365, + "learning_rate": 4.7597936007469725e-05, + "loss": 0.5015, + "step": 5615 + }, + { + "epoch": 0.6659551760939167, + "grad_norm": 1.2938329926648473, + "learning_rate": 4.75969092520956e-05, + "loss": 0.5601, + "step": 5616 + }, + { + "epoch": 0.6660737578560417, + "grad_norm": 1.2489978934788148, + "learning_rate": 4.759588228840569e-05, + "loss": 0.3358, + "step": 5617 + }, + { + "epoch": 0.6661923396181667, + "grad_norm": 1.3207432830211974, + "learning_rate": 4.7594855116409457e-05, + "loss": 0.4072, + "step": 5618 + }, + { + "epoch": 0.6663109213802917, + "grad_norm": 1.4921468210346427, + "learning_rate": 4.759382773611636e-05, + "loss": 0.5415, + "step": 5619 + }, + { + "epoch": 0.6664295031424167, + "grad_norm": 1.4011866199606544, + "learning_rate": 4.7592800147535876e-05, + "loss": 0.5408, + "step": 5620 + }, + { + "epoch": 0.6665480849045416, + "grad_norm": 1.2505193533049175, + "learning_rate": 4.759177235067748e-05, + "loss": 0.413, + "step": 5621 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 1.4800908942904514, + "learning_rate": 4.7590744345550654e-05, + "loss": 0.501, + "step": 5622 + }, + { + "epoch": 0.6667852484287916, + "grad_norm": 1.4130622374022237, + "learning_rate": 4.758971613216486e-05, + "loss": 0.4447, + "step": 5623 + }, + { + "epoch": 0.6669038301909166, + "grad_norm": 1.3244612267950255, + "learning_rate": 4.758868771052959e-05, + "loss": 0.3931, + "step": 5624 + }, + { + "epoch": 0.6670224119530416, + "grad_norm": 1.9695968973840774, + "learning_rate": 4.7587659080654314e-05, + "loss": 0.749, + "step": 5625 + }, + { + "epoch": 0.6671409937151666, + "grad_norm": 1.2362170653629911, + "learning_rate": 4.7586630242548526e-05, + "loss": 0.423, + "step": 5626 + }, + { + "epoch": 0.6672595754772916, + "grad_norm": 1.6283913308991367, + "learning_rate": 4.7585601196221704e-05, + "loss": 0.4763, + "step": 5627 + }, + { + "epoch": 0.6673781572394166, + "grad_norm": 1.5627561708035391, + "learning_rate": 4.758457194168333e-05, + "loss": 0.5114, + "step": 5628 + }, + { + "epoch": 0.6674967390015416, + "grad_norm": 1.5970542858356094, + "learning_rate": 4.75835424789429e-05, + "loss": 0.5661, + "step": 5629 + }, + { + "epoch": 0.6676153207636666, + "grad_norm": 1.5134203044744, + "learning_rate": 4.75825128080099e-05, + "loss": 0.4794, + "step": 5630 + }, + { + "epoch": 0.6677339025257916, + "grad_norm": 1.5649060463357696, + "learning_rate": 4.758148292889383e-05, + "loss": 0.4563, + "step": 5631 + }, + { + "epoch": 0.6678524842879165, + "grad_norm": 1.5387929982081128, + "learning_rate": 4.7580452841604176e-05, + "loss": 0.4775, + "step": 5632 + }, + { + "epoch": 0.6679710660500415, + "grad_norm": 1.6389631850263902, + "learning_rate": 4.757942254615044e-05, + "loss": 0.5751, + "step": 5633 + }, + { + "epoch": 0.6680896478121665, + "grad_norm": 1.6622997003336746, + "learning_rate": 4.757839204254212e-05, + "loss": 0.6983, + "step": 5634 + }, + { + "epoch": 0.6682082295742915, + "grad_norm": 1.7377481934036711, + "learning_rate": 4.75773613307887e-05, + "loss": 0.6953, + "step": 5635 + }, + { + "epoch": 0.6683268113364165, + "grad_norm": 1.771795710755704, + "learning_rate": 4.7576330410899704e-05, + "loss": 0.543, + "step": 5636 + }, + { + "epoch": 0.6684453930985415, + "grad_norm": 1.7943604728199727, + "learning_rate": 4.757529928288462e-05, + "loss": 0.7238, + "step": 5637 + }, + { + "epoch": 0.6685639748606664, + "grad_norm": 1.2736568039209832, + "learning_rate": 4.757426794675297e-05, + "loss": 0.3546, + "step": 5638 + }, + { + "epoch": 0.6686825566227914, + "grad_norm": 1.606088964861244, + "learning_rate": 4.757323640251424e-05, + "loss": 0.6108, + "step": 5639 + }, + { + "epoch": 0.6688011383849164, + "grad_norm": 1.3915937094430204, + "learning_rate": 4.7572204650177966e-05, + "loss": 0.4997, + "step": 5640 + }, + { + "epoch": 0.6689197201470414, + "grad_norm": 1.5551136035331687, + "learning_rate": 4.7571172689753626e-05, + "loss": 0.6009, + "step": 5641 + }, + { + "epoch": 0.6690383019091664, + "grad_norm": 1.6491315161968312, + "learning_rate": 4.757014052125077e-05, + "loss": 0.5775, + "step": 5642 + }, + { + "epoch": 0.6691568836712913, + "grad_norm": 1.2786286915601188, + "learning_rate": 4.7569108144678886e-05, + "loss": 0.5127, + "step": 5643 + }, + { + "epoch": 0.6692754654334163, + "grad_norm": 1.1776203045019915, + "learning_rate": 4.7568075560047506e-05, + "loss": 0.4247, + "step": 5644 + }, + { + "epoch": 0.6693940471955413, + "grad_norm": 1.4844253440843935, + "learning_rate": 4.7567042767366146e-05, + "loss": 0.5413, + "step": 5645 + }, + { + "epoch": 0.6695126289576663, + "grad_norm": 1.2491382627483054, + "learning_rate": 4.756600976664431e-05, + "loss": 0.5179, + "step": 5646 + }, + { + "epoch": 0.6696312107197913, + "grad_norm": 1.3026591333069073, + "learning_rate": 4.756497655789155e-05, + "loss": 0.4513, + "step": 5647 + }, + { + "epoch": 0.6697497924819162, + "grad_norm": 1.7041582338216834, + "learning_rate": 4.756394314111738e-05, + "loss": 0.521, + "step": 5648 + }, + { + "epoch": 0.6698683742440412, + "grad_norm": 1.6921353140670534, + "learning_rate": 4.7562909516331325e-05, + "loss": 0.8663, + "step": 5649 + }, + { + "epoch": 0.6699869560061662, + "grad_norm": 1.2498501265020507, + "learning_rate": 4.75618756835429e-05, + "loss": 0.4141, + "step": 5650 + }, + { + "epoch": 0.6701055377682912, + "grad_norm": 1.3257997617322248, + "learning_rate": 4.7560841642761654e-05, + "loss": 0.4519, + "step": 5651 + }, + { + "epoch": 0.6702241195304163, + "grad_norm": 1.137537694655641, + "learning_rate": 4.7559807393997115e-05, + "loss": 0.3897, + "step": 5652 + }, + { + "epoch": 0.6703427012925413, + "grad_norm": 1.5822737179893402, + "learning_rate": 4.7558772937258815e-05, + "loss": 0.5599, + "step": 5653 + }, + { + "epoch": 0.6704612830546662, + "grad_norm": 1.572115193929137, + "learning_rate": 4.7557738272556294e-05, + "loss": 0.6356, + "step": 5654 + }, + { + "epoch": 0.6705798648167912, + "grad_norm": 1.3504263760920325, + "learning_rate": 4.755670339989909e-05, + "loss": 0.4882, + "step": 5655 + }, + { + "epoch": 0.6706984465789162, + "grad_norm": 1.7429438985929042, + "learning_rate": 4.7555668319296735e-05, + "loss": 0.6602, + "step": 5656 + }, + { + "epoch": 0.6708170283410412, + "grad_norm": 1.3334103399657982, + "learning_rate": 4.7554633030758786e-05, + "loss": 0.4111, + "step": 5657 + }, + { + "epoch": 0.6709356101031662, + "grad_norm": 1.6771003581283996, + "learning_rate": 4.7553597534294766e-05, + "loss": 0.6284, + "step": 5658 + }, + { + "epoch": 0.6710541918652911, + "grad_norm": 1.312761309800299, + "learning_rate": 4.755256182991425e-05, + "loss": 0.5118, + "step": 5659 + }, + { + "epoch": 0.6711727736274161, + "grad_norm": 1.1984560864675602, + "learning_rate": 4.755152591762676e-05, + "loss": 0.4398, + "step": 5660 + }, + { + "epoch": 0.6712913553895411, + "grad_norm": 1.4078255807019586, + "learning_rate": 4.755048979744185e-05, + "loss": 0.448, + "step": 5661 + }, + { + "epoch": 0.6714099371516661, + "grad_norm": 1.5702655905154772, + "learning_rate": 4.754945346936909e-05, + "loss": 0.4503, + "step": 5662 + }, + { + "epoch": 0.6715285189137911, + "grad_norm": 1.8173984831999148, + "learning_rate": 4.754841693341801e-05, + "loss": 0.6234, + "step": 5663 + }, + { + "epoch": 0.671647100675916, + "grad_norm": 1.4377111177575816, + "learning_rate": 4.754738018959819e-05, + "loss": 0.5765, + "step": 5664 + }, + { + "epoch": 0.671765682438041, + "grad_norm": 1.1930389051003656, + "learning_rate": 4.754634323791917e-05, + "loss": 0.3348, + "step": 5665 + }, + { + "epoch": 0.671884264200166, + "grad_norm": 1.552262420096495, + "learning_rate": 4.7545306078390504e-05, + "loss": 0.5953, + "step": 5666 + }, + { + "epoch": 0.672002845962291, + "grad_norm": 1.321534240760336, + "learning_rate": 4.7544268711021774e-05, + "loss": 0.391, + "step": 5667 + }, + { + "epoch": 0.672121427724416, + "grad_norm": 1.6097570745358494, + "learning_rate": 4.7543231135822526e-05, + "loss": 0.4784, + "step": 5668 + }, + { + "epoch": 0.672240009486541, + "grad_norm": 1.4615312544821546, + "learning_rate": 4.7542193352802336e-05, + "loss": 0.3915, + "step": 5669 + }, + { + "epoch": 0.6723585912486659, + "grad_norm": 2.00719680700747, + "learning_rate": 4.7541155361970756e-05, + "loss": 0.9051, + "step": 5670 + }, + { + "epoch": 0.6724771730107909, + "grad_norm": 1.6156403653975195, + "learning_rate": 4.754011716333737e-05, + "loss": 0.545, + "step": 5671 + }, + { + "epoch": 0.6725957547729159, + "grad_norm": 1.248186354039386, + "learning_rate": 4.7539078756911745e-05, + "loss": 0.4915, + "step": 5672 + }, + { + "epoch": 0.6727143365350409, + "grad_norm": 1.3289574053025608, + "learning_rate": 4.7538040142703456e-05, + "loss": 0.4955, + "step": 5673 + }, + { + "epoch": 0.6728329182971658, + "grad_norm": 1.140908851326464, + "learning_rate": 4.753700132072207e-05, + "loss": 0.3346, + "step": 5674 + }, + { + "epoch": 0.6729515000592908, + "grad_norm": 1.3713771008625937, + "learning_rate": 4.753596229097718e-05, + "loss": 0.5204, + "step": 5675 + }, + { + "epoch": 0.6730700818214158, + "grad_norm": 1.5101763232070664, + "learning_rate": 4.753492305347834e-05, + "loss": 0.4512, + "step": 5676 + }, + { + "epoch": 0.6731886635835409, + "grad_norm": 1.5487907524312163, + "learning_rate": 4.753388360823515e-05, + "loss": 0.4119, + "step": 5677 + }, + { + "epoch": 0.6733072453456659, + "grad_norm": 1.3383500000262378, + "learning_rate": 4.7532843955257186e-05, + "loss": 0.2748, + "step": 5678 + }, + { + "epoch": 0.6734258271077909, + "grad_norm": 1.9082305590113289, + "learning_rate": 4.7531804094554026e-05, + "loss": 0.7034, + "step": 5679 + }, + { + "epoch": 0.6735444088699158, + "grad_norm": 1.3134558299978445, + "learning_rate": 4.7530764026135267e-05, + "loss": 0.463, + "step": 5680 + }, + { + "epoch": 0.6736629906320408, + "grad_norm": 1.219087299787808, + "learning_rate": 4.752972375001049e-05, + "loss": 0.3625, + "step": 5681 + }, + { + "epoch": 0.6737815723941658, + "grad_norm": 1.2344574496439853, + "learning_rate": 4.75286832661893e-05, + "loss": 0.4653, + "step": 5682 + }, + { + "epoch": 0.6739001541562908, + "grad_norm": 1.4002967700738496, + "learning_rate": 4.7527642574681264e-05, + "loss": 0.3827, + "step": 5683 + }, + { + "epoch": 0.6740187359184158, + "grad_norm": 1.3748216956989279, + "learning_rate": 4.752660167549599e-05, + "loss": 0.4092, + "step": 5684 + }, + { + "epoch": 0.6741373176805407, + "grad_norm": 1.4187299090214673, + "learning_rate": 4.7525560568643075e-05, + "loss": 0.4458, + "step": 5685 + }, + { + "epoch": 0.6742558994426657, + "grad_norm": 1.6932882310584985, + "learning_rate": 4.752451925413212e-05, + "loss": 0.6198, + "step": 5686 + }, + { + "epoch": 0.6743744812047907, + "grad_norm": 1.6693948181196396, + "learning_rate": 4.752347773197271e-05, + "loss": 0.6604, + "step": 5687 + }, + { + "epoch": 0.6744930629669157, + "grad_norm": 1.5463446134157632, + "learning_rate": 4.752243600217445e-05, + "loss": 0.4411, + "step": 5688 + }, + { + "epoch": 0.6746116447290407, + "grad_norm": 1.3680417975062844, + "learning_rate": 4.752139406474696e-05, + "loss": 0.54, + "step": 5689 + }, + { + "epoch": 0.6747302264911657, + "grad_norm": 1.178473452164607, + "learning_rate": 4.7520351919699835e-05, + "loss": 0.3254, + "step": 5690 + }, + { + "epoch": 0.6748488082532906, + "grad_norm": 1.676362450203553, + "learning_rate": 4.751930956704268e-05, + "loss": 0.5942, + "step": 5691 + }, + { + "epoch": 0.6749673900154156, + "grad_norm": 1.8847627242455596, + "learning_rate": 4.75182670067851e-05, + "loss": 0.6309, + "step": 5692 + }, + { + "epoch": 0.6750859717775406, + "grad_norm": 1.8031157287255333, + "learning_rate": 4.751722423893672e-05, + "loss": 0.4821, + "step": 5693 + }, + { + "epoch": 0.6752045535396656, + "grad_norm": 1.7835041679124368, + "learning_rate": 4.751618126350714e-05, + "loss": 0.7409, + "step": 5694 + }, + { + "epoch": 0.6753231353017906, + "grad_norm": 1.1621828788924649, + "learning_rate": 4.7515138080505975e-05, + "loss": 0.3697, + "step": 5695 + }, + { + "epoch": 0.6754417170639155, + "grad_norm": 1.3005362854337295, + "learning_rate": 4.751409468994286e-05, + "loss": 0.5506, + "step": 5696 + }, + { + "epoch": 0.6755602988260405, + "grad_norm": 1.3127126657447157, + "learning_rate": 4.751305109182739e-05, + "loss": 0.491, + "step": 5697 + }, + { + "epoch": 0.6756788805881655, + "grad_norm": 1.6029909907554378, + "learning_rate": 4.7512007286169194e-05, + "loss": 0.5559, + "step": 5698 + }, + { + "epoch": 0.6757974623502905, + "grad_norm": 1.6368868822699887, + "learning_rate": 4.751096327297791e-05, + "loss": 0.7514, + "step": 5699 + }, + { + "epoch": 0.6759160441124155, + "grad_norm": 1.3486319765784842, + "learning_rate": 4.7509919052263135e-05, + "loss": 0.4731, + "step": 5700 + }, + { + "epoch": 0.6760346258745404, + "grad_norm": 1.6520943689150631, + "learning_rate": 4.750887462403452e-05, + "loss": 0.6394, + "step": 5701 + }, + { + "epoch": 0.6761532076366655, + "grad_norm": 1.9549205113576387, + "learning_rate": 4.750782998830168e-05, + "loss": 0.7417, + "step": 5702 + }, + { + "epoch": 0.6762717893987905, + "grad_norm": 1.4019045517710704, + "learning_rate": 4.750678514507424e-05, + "loss": 0.5043, + "step": 5703 + }, + { + "epoch": 0.6763903711609155, + "grad_norm": 1.2331103373410803, + "learning_rate": 4.750574009436185e-05, + "loss": 0.3457, + "step": 5704 + }, + { + "epoch": 0.6765089529230405, + "grad_norm": 1.2888198546063856, + "learning_rate": 4.750469483617414e-05, + "loss": 0.428, + "step": 5705 + }, + { + "epoch": 0.6766275346851655, + "grad_norm": 1.5027087734449516, + "learning_rate": 4.750364937052073e-05, + "loss": 0.4914, + "step": 5706 + }, + { + "epoch": 0.6767461164472904, + "grad_norm": 1.3758288242204866, + "learning_rate": 4.7502603697411275e-05, + "loss": 0.4446, + "step": 5707 + }, + { + "epoch": 0.6768646982094154, + "grad_norm": 1.3814191688905333, + "learning_rate": 4.75015578168554e-05, + "loss": 0.5369, + "step": 5708 + }, + { + "epoch": 0.6769832799715404, + "grad_norm": 1.39700404214086, + "learning_rate": 4.7500511728862765e-05, + "loss": 0.6476, + "step": 5709 + }, + { + "epoch": 0.6771018617336654, + "grad_norm": 1.382339708010986, + "learning_rate": 4.7499465433442994e-05, + "loss": 0.5057, + "step": 5710 + }, + { + "epoch": 0.6772204434957904, + "grad_norm": 1.141384171384917, + "learning_rate": 4.7498418930605746e-05, + "loss": 0.3883, + "step": 5711 + }, + { + "epoch": 0.6773390252579153, + "grad_norm": 1.519447735802292, + "learning_rate": 4.749737222036067e-05, + "loss": 0.483, + "step": 5712 + }, + { + "epoch": 0.6774576070200403, + "grad_norm": 1.4408536670783207, + "learning_rate": 4.749632530271741e-05, + "loss": 0.6194, + "step": 5713 + }, + { + "epoch": 0.6775761887821653, + "grad_norm": 1.255692066259264, + "learning_rate": 4.749527817768561e-05, + "loss": 0.3159, + "step": 5714 + }, + { + "epoch": 0.6776947705442903, + "grad_norm": 1.6091269946393139, + "learning_rate": 4.749423084527494e-05, + "loss": 0.4129, + "step": 5715 + }, + { + "epoch": 0.6778133523064153, + "grad_norm": 1.5598367103172588, + "learning_rate": 4.749318330549504e-05, + "loss": 0.6161, + "step": 5716 + }, + { + "epoch": 0.6779319340685402, + "grad_norm": 1.6045261438155523, + "learning_rate": 4.7492135558355575e-05, + "loss": 0.5549, + "step": 5717 + }, + { + "epoch": 0.6780505158306652, + "grad_norm": 1.401595317048469, + "learning_rate": 4.74910876038662e-05, + "loss": 0.4333, + "step": 5718 + }, + { + "epoch": 0.6781690975927902, + "grad_norm": 1.4040754506770416, + "learning_rate": 4.7490039442036586e-05, + "loss": 0.4989, + "step": 5719 + }, + { + "epoch": 0.6782876793549152, + "grad_norm": 1.34523836474768, + "learning_rate": 4.748899107287638e-05, + "loss": 0.2966, + "step": 5720 + }, + { + "epoch": 0.6784062611170402, + "grad_norm": 1.7956853970353286, + "learning_rate": 4.748794249639524e-05, + "loss": 0.7161, + "step": 5721 + }, + { + "epoch": 0.6785248428791651, + "grad_norm": 1.8319856047771041, + "learning_rate": 4.7486893712602864e-05, + "loss": 0.6027, + "step": 5722 + }, + { + "epoch": 0.6786434246412901, + "grad_norm": 1.1677178195233482, + "learning_rate": 4.7485844721508904e-05, + "loss": 0.3812, + "step": 5723 + }, + { + "epoch": 0.6787620064034151, + "grad_norm": 1.405859399955405, + "learning_rate": 4.7484795523123024e-05, + "loss": 0.4021, + "step": 5724 + }, + { + "epoch": 0.6788805881655401, + "grad_norm": 1.4871355049729083, + "learning_rate": 4.748374611745491e-05, + "loss": 0.6018, + "step": 5725 + }, + { + "epoch": 0.6789991699276652, + "grad_norm": 1.6977391140875737, + "learning_rate": 4.748269650451422e-05, + "loss": 0.6082, + "step": 5726 + }, + { + "epoch": 0.6791177516897902, + "grad_norm": 1.642297111535135, + "learning_rate": 4.7481646684310636e-05, + "loss": 0.4946, + "step": 5727 + }, + { + "epoch": 0.6792363334519151, + "grad_norm": 1.380392274427136, + "learning_rate": 4.748059665685385e-05, + "loss": 0.4981, + "step": 5728 + }, + { + "epoch": 0.6793549152140401, + "grad_norm": 1.0693675794851472, + "learning_rate": 4.747954642215352e-05, + "loss": 0.3795, + "step": 5729 + }, + { + "epoch": 0.6794734969761651, + "grad_norm": 1.7329321286892665, + "learning_rate": 4.747849598021934e-05, + "loss": 0.5827, + "step": 5730 + }, + { + "epoch": 0.6795920787382901, + "grad_norm": 1.553647587651021, + "learning_rate": 4.7477445331060995e-05, + "loss": 0.5982, + "step": 5731 + }, + { + "epoch": 0.6797106605004151, + "grad_norm": 1.3527526266461087, + "learning_rate": 4.747639447468816e-05, + "loss": 0.4878, + "step": 5732 + }, + { + "epoch": 0.67982924226254, + "grad_norm": 1.561262727665943, + "learning_rate": 4.747534341111054e-05, + "loss": 0.5842, + "step": 5733 + }, + { + "epoch": 0.679947824024665, + "grad_norm": 1.4717740938943071, + "learning_rate": 4.747429214033781e-05, + "loss": 0.5314, + "step": 5734 + }, + { + "epoch": 0.68006640578679, + "grad_norm": 1.2152414558564422, + "learning_rate": 4.747324066237967e-05, + "loss": 0.3428, + "step": 5735 + }, + { + "epoch": 0.680184987548915, + "grad_norm": 1.0286953562102505, + "learning_rate": 4.747218897724581e-05, + "loss": 0.2679, + "step": 5736 + }, + { + "epoch": 0.68030356931104, + "grad_norm": 1.471763413017837, + "learning_rate": 4.7471137084945914e-05, + "loss": 0.613, + "step": 5737 + }, + { + "epoch": 0.680422151073165, + "grad_norm": 1.025446495488857, + "learning_rate": 4.7470084985489706e-05, + "loss": 0.3155, + "step": 5738 + }, + { + "epoch": 0.6805407328352899, + "grad_norm": 1.4335370507410663, + "learning_rate": 4.746903267888686e-05, + "loss": 0.5187, + "step": 5739 + }, + { + "epoch": 0.6806593145974149, + "grad_norm": 1.7907229488384093, + "learning_rate": 4.746798016514708e-05, + "loss": 0.6627, + "step": 5740 + }, + { + "epoch": 0.6807778963595399, + "grad_norm": 1.629955398676479, + "learning_rate": 4.746692744428008e-05, + "loss": 0.5257, + "step": 5741 + }, + { + "epoch": 0.6808964781216649, + "grad_norm": 1.261510817769618, + "learning_rate": 4.746587451629557e-05, + "loss": 0.3229, + "step": 5742 + }, + { + "epoch": 0.6810150598837899, + "grad_norm": 1.2617854639955823, + "learning_rate": 4.746482138120324e-05, + "loss": 0.3327, + "step": 5743 + }, + { + "epoch": 0.6811336416459148, + "grad_norm": 1.427248778387898, + "learning_rate": 4.74637680390128e-05, + "loss": 0.526, + "step": 5744 + }, + { + "epoch": 0.6812522234080398, + "grad_norm": 1.633108309381934, + "learning_rate": 4.7462714489733966e-05, + "loss": 0.4754, + "step": 5745 + }, + { + "epoch": 0.6813708051701648, + "grad_norm": 1.5079288725901714, + "learning_rate": 4.746166073337646e-05, + "loss": 0.5699, + "step": 5746 + }, + { + "epoch": 0.6814893869322898, + "grad_norm": 1.5063429564856952, + "learning_rate": 4.7460606769949975e-05, + "loss": 0.5583, + "step": 5747 + }, + { + "epoch": 0.6816079686944148, + "grad_norm": 1.653845585743617, + "learning_rate": 4.745955259946424e-05, + "loss": 0.5072, + "step": 5748 + }, + { + "epoch": 0.6817265504565397, + "grad_norm": 1.5650740215091177, + "learning_rate": 4.7458498221928976e-05, + "loss": 0.5043, + "step": 5749 + }, + { + "epoch": 0.6818451322186647, + "grad_norm": 1.3560233151331795, + "learning_rate": 4.745744363735389e-05, + "loss": 0.3191, + "step": 5750 + }, + { + "epoch": 0.6819637139807898, + "grad_norm": 1.610234975595318, + "learning_rate": 4.7456388845748726e-05, + "loss": 0.4708, + "step": 5751 + }, + { + "epoch": 0.6820822957429148, + "grad_norm": 1.6766647512056025, + "learning_rate": 4.745533384712319e-05, + "loss": 0.6165, + "step": 5752 + }, + { + "epoch": 0.6822008775050398, + "grad_norm": 2.362638743684445, + "learning_rate": 4.745427864148701e-05, + "loss": 0.637, + "step": 5753 + }, + { + "epoch": 0.6823194592671648, + "grad_norm": 1.7308657564322087, + "learning_rate": 4.7453223228849916e-05, + "loss": 0.6062, + "step": 5754 + }, + { + "epoch": 0.6824380410292897, + "grad_norm": 1.3885745290325822, + "learning_rate": 4.745216760922164e-05, + "loss": 0.4075, + "step": 5755 + }, + { + "epoch": 0.6825566227914147, + "grad_norm": 1.5995432831001777, + "learning_rate": 4.745111178261191e-05, + "loss": 0.5634, + "step": 5756 + }, + { + "epoch": 0.6826752045535397, + "grad_norm": 1.1359592449952887, + "learning_rate": 4.745005574903046e-05, + "loss": 0.3389, + "step": 5757 + }, + { + "epoch": 0.6827937863156647, + "grad_norm": 1.2134720756085207, + "learning_rate": 4.744899950848703e-05, + "loss": 0.4107, + "step": 5758 + }, + { + "epoch": 0.6829123680777897, + "grad_norm": 1.562839897557436, + "learning_rate": 4.7447943060991354e-05, + "loss": 0.7269, + "step": 5759 + }, + { + "epoch": 0.6830309498399146, + "grad_norm": 1.8055121715893887, + "learning_rate": 4.7446886406553165e-05, + "loss": 0.5841, + "step": 5760 + }, + { + "epoch": 0.6831495316020396, + "grad_norm": 1.2775953107678784, + "learning_rate": 4.744582954518221e-05, + "loss": 0.4745, + "step": 5761 + }, + { + "epoch": 0.6832681133641646, + "grad_norm": 1.1828260212046602, + "learning_rate": 4.744477247688823e-05, + "loss": 0.3799, + "step": 5762 + }, + { + "epoch": 0.6833866951262896, + "grad_norm": 1.4155944055942016, + "learning_rate": 4.744371520168098e-05, + "loss": 0.4373, + "step": 5763 + }, + { + "epoch": 0.6835052768884146, + "grad_norm": 1.1952576433089332, + "learning_rate": 4.744265771957019e-05, + "loss": 0.3561, + "step": 5764 + }, + { + "epoch": 0.6836238586505395, + "grad_norm": 1.3890968927883893, + "learning_rate": 4.744160003056562e-05, + "loss": 0.3628, + "step": 5765 + }, + { + "epoch": 0.6837424404126645, + "grad_norm": 1.7639634469047607, + "learning_rate": 4.744054213467702e-05, + "loss": 0.416, + "step": 5766 + }, + { + "epoch": 0.6838610221747895, + "grad_norm": 1.4345110767900837, + "learning_rate": 4.7439484031914136e-05, + "loss": 0.3605, + "step": 5767 + }, + { + "epoch": 0.6839796039369145, + "grad_norm": 1.9889336129228103, + "learning_rate": 4.743842572228673e-05, + "loss": 0.6876, + "step": 5768 + }, + { + "epoch": 0.6840981856990395, + "grad_norm": 1.2993523100532105, + "learning_rate": 4.743736720580455e-05, + "loss": 0.2974, + "step": 5769 + }, + { + "epoch": 0.6842167674611644, + "grad_norm": 1.63393060305369, + "learning_rate": 4.743630848247735e-05, + "loss": 0.5947, + "step": 5770 + }, + { + "epoch": 0.6843353492232894, + "grad_norm": 2.1189957292946624, + "learning_rate": 4.743524955231492e-05, + "loss": 0.7836, + "step": 5771 + }, + { + "epoch": 0.6844539309854144, + "grad_norm": 1.8974490791467324, + "learning_rate": 4.7434190415326986e-05, + "loss": 0.5277, + "step": 5772 + }, + { + "epoch": 0.6845725127475394, + "grad_norm": 1.64567402334388, + "learning_rate": 4.7433131071523336e-05, + "loss": 0.5086, + "step": 5773 + }, + { + "epoch": 0.6846910945096644, + "grad_norm": 1.5410066066019805, + "learning_rate": 4.7432071520913726e-05, + "loss": 0.4219, + "step": 5774 + }, + { + "epoch": 0.6848096762717893, + "grad_norm": 1.4106540472369506, + "learning_rate": 4.743101176350791e-05, + "loss": 0.472, + "step": 5775 + }, + { + "epoch": 0.6849282580339144, + "grad_norm": 1.6071839898585567, + "learning_rate": 4.742995179931569e-05, + "loss": 0.6883, + "step": 5776 + }, + { + "epoch": 0.6850468397960394, + "grad_norm": 1.16745620335201, + "learning_rate": 4.742889162834681e-05, + "loss": 0.4036, + "step": 5777 + }, + { + "epoch": 0.6851654215581644, + "grad_norm": 1.2407815939140547, + "learning_rate": 4.742783125061106e-05, + "loss": 0.3848, + "step": 5778 + }, + { + "epoch": 0.6852840033202894, + "grad_norm": 1.4931621214811472, + "learning_rate": 4.7426770666118205e-05, + "loss": 0.4568, + "step": 5779 + }, + { + "epoch": 0.6854025850824144, + "grad_norm": 1.0254139308822854, + "learning_rate": 4.7425709874878024e-05, + "loss": 0.2936, + "step": 5780 + }, + { + "epoch": 0.6855211668445393, + "grad_norm": 1.523952301381571, + "learning_rate": 4.74246488769003e-05, + "loss": 0.4815, + "step": 5781 + }, + { + "epoch": 0.6856397486066643, + "grad_norm": 1.1871311239244178, + "learning_rate": 4.742358767219481e-05, + "loss": 0.4113, + "step": 5782 + }, + { + "epoch": 0.6857583303687893, + "grad_norm": 1.3933697555569393, + "learning_rate": 4.742252626077135e-05, + "loss": 0.5476, + "step": 5783 + }, + { + "epoch": 0.6858769121309143, + "grad_norm": 1.333752144454184, + "learning_rate": 4.742146464263968e-05, + "loss": 0.369, + "step": 5784 + }, + { + "epoch": 0.6859954938930393, + "grad_norm": 1.050728266739734, + "learning_rate": 4.742040281780961e-05, + "loss": 0.2619, + "step": 5785 + }, + { + "epoch": 0.6861140756551642, + "grad_norm": 1.5663966866200687, + "learning_rate": 4.741934078629092e-05, + "loss": 0.5648, + "step": 5786 + }, + { + "epoch": 0.6862326574172892, + "grad_norm": 1.5306240122144832, + "learning_rate": 4.741827854809339e-05, + "loss": 0.3877, + "step": 5787 + }, + { + "epoch": 0.6863512391794142, + "grad_norm": 2.0252082689044117, + "learning_rate": 4.7417216103226836e-05, + "loss": 0.4912, + "step": 5788 + }, + { + "epoch": 0.6864698209415392, + "grad_norm": 1.745513464000421, + "learning_rate": 4.741615345170103e-05, + "loss": 0.5605, + "step": 5789 + }, + { + "epoch": 0.6865884027036642, + "grad_norm": 1.6721958193318687, + "learning_rate": 4.7415090593525777e-05, + "loss": 0.6811, + "step": 5790 + }, + { + "epoch": 0.6867069844657891, + "grad_norm": 1.4525285813379845, + "learning_rate": 4.741402752871089e-05, + "loss": 0.3884, + "step": 5791 + }, + { + "epoch": 0.6868255662279141, + "grad_norm": 1.319502126596369, + "learning_rate": 4.741296425726614e-05, + "loss": 0.4509, + "step": 5792 + }, + { + "epoch": 0.6869441479900391, + "grad_norm": 1.6559489958152063, + "learning_rate": 4.741190077920135e-05, + "loss": 0.5439, + "step": 5793 + }, + { + "epoch": 0.6870627297521641, + "grad_norm": 1.5271169908275757, + "learning_rate": 4.741083709452632e-05, + "loss": 0.404, + "step": 5794 + }, + { + "epoch": 0.6871813115142891, + "grad_norm": 1.4201823236537994, + "learning_rate": 4.740977320325085e-05, + "loss": 0.4206, + "step": 5795 + }, + { + "epoch": 0.687299893276414, + "grad_norm": 1.3733678658698703, + "learning_rate": 4.740870910538475e-05, + "loss": 0.3867, + "step": 5796 + }, + { + "epoch": 0.687418475038539, + "grad_norm": 1.6173386157352332, + "learning_rate": 4.740764480093783e-05, + "loss": 0.5293, + "step": 5797 + }, + { + "epoch": 0.687537056800664, + "grad_norm": 1.6228498561005573, + "learning_rate": 4.740658028991991e-05, + "loss": 0.6621, + "step": 5798 + }, + { + "epoch": 0.687655638562789, + "grad_norm": 1.6840440604156015, + "learning_rate": 4.74055155723408e-05, + "loss": 0.6152, + "step": 5799 + }, + { + "epoch": 0.687774220324914, + "grad_norm": 1.3685307912183806, + "learning_rate": 4.7404450648210306e-05, + "loss": 0.4294, + "step": 5800 + }, + { + "epoch": 0.6878928020870391, + "grad_norm": 1.9204184077808022, + "learning_rate": 4.740338551753825e-05, + "loss": 0.6511, + "step": 5801 + }, + { + "epoch": 0.688011383849164, + "grad_norm": 1.1336783293914625, + "learning_rate": 4.740232018033445e-05, + "loss": 0.326, + "step": 5802 + }, + { + "epoch": 0.688129965611289, + "grad_norm": 1.7780522058694028, + "learning_rate": 4.740125463660874e-05, + "loss": 0.6763, + "step": 5803 + }, + { + "epoch": 0.688248547373414, + "grad_norm": 1.7670303771803042, + "learning_rate": 4.740018888637092e-05, + "loss": 0.7297, + "step": 5804 + }, + { + "epoch": 0.688367129135539, + "grad_norm": 1.4592300955971549, + "learning_rate": 4.7399122929630835e-05, + "loss": 0.5218, + "step": 5805 + }, + { + "epoch": 0.688485710897664, + "grad_norm": 1.3675071710589848, + "learning_rate": 4.73980567663983e-05, + "loss": 0.5019, + "step": 5806 + }, + { + "epoch": 0.688604292659789, + "grad_norm": 1.2105402872397526, + "learning_rate": 4.739699039668315e-05, + "loss": 0.4821, + "step": 5807 + }, + { + "epoch": 0.6887228744219139, + "grad_norm": 1.4429391343794966, + "learning_rate": 4.7395923820495216e-05, + "loss": 0.4568, + "step": 5808 + }, + { + "epoch": 0.6888414561840389, + "grad_norm": 1.7244619432020698, + "learning_rate": 4.7394857037844326e-05, + "loss": 0.5436, + "step": 5809 + }, + { + "epoch": 0.6889600379461639, + "grad_norm": 1.5212172194719675, + "learning_rate": 4.7393790048740315e-05, + "loss": 0.3645, + "step": 5810 + }, + { + "epoch": 0.6890786197082889, + "grad_norm": 1.3023759315402172, + "learning_rate": 4.739272285319302e-05, + "loss": 0.383, + "step": 5811 + }, + { + "epoch": 0.6891972014704139, + "grad_norm": 1.5013055581939643, + "learning_rate": 4.739165545121228e-05, + "loss": 0.5427, + "step": 5812 + }, + { + "epoch": 0.6893157832325388, + "grad_norm": 1.3645606562415795, + "learning_rate": 4.7390587842807945e-05, + "loss": 0.4807, + "step": 5813 + }, + { + "epoch": 0.6894343649946638, + "grad_norm": 1.7429523223751278, + "learning_rate": 4.738952002798983e-05, + "loss": 0.5391, + "step": 5814 + }, + { + "epoch": 0.6895529467567888, + "grad_norm": 1.4612758481364325, + "learning_rate": 4.738845200676781e-05, + "loss": 0.4797, + "step": 5815 + }, + { + "epoch": 0.6896715285189138, + "grad_norm": 1.4122391607989557, + "learning_rate": 4.738738377915172e-05, + "loss": 0.3728, + "step": 5816 + }, + { + "epoch": 0.6897901102810388, + "grad_norm": 1.2757257102006745, + "learning_rate": 4.738631534515139e-05, + "loss": 0.4348, + "step": 5817 + }, + { + "epoch": 0.6899086920431637, + "grad_norm": 1.4468298303499614, + "learning_rate": 4.738524670477669e-05, + "loss": 0.5377, + "step": 5818 + }, + { + "epoch": 0.6900272738052887, + "grad_norm": 2.460603324225845, + "learning_rate": 4.7384177858037474e-05, + "loss": 0.944, + "step": 5819 + }, + { + "epoch": 0.6901458555674137, + "grad_norm": 1.4839328462933412, + "learning_rate": 4.7383108804943574e-05, + "loss": 0.6181, + "step": 5820 + }, + { + "epoch": 0.6902644373295387, + "grad_norm": 1.4411617972886719, + "learning_rate": 4.738203954550487e-05, + "loss": 0.4295, + "step": 5821 + }, + { + "epoch": 0.6903830190916637, + "grad_norm": 1.2831902185515272, + "learning_rate": 4.73809700797312e-05, + "loss": 0.3694, + "step": 5822 + }, + { + "epoch": 0.6905016008537886, + "grad_norm": 1.9024922599655691, + "learning_rate": 4.737990040763244e-05, + "loss": 0.628, + "step": 5823 + }, + { + "epoch": 0.6906201826159136, + "grad_norm": 1.8004161491489894, + "learning_rate": 4.737883052921844e-05, + "loss": 0.6793, + "step": 5824 + }, + { + "epoch": 0.6907387643780386, + "grad_norm": 1.3782960645486948, + "learning_rate": 4.737776044449906e-05, + "loss": 0.5947, + "step": 5825 + }, + { + "epoch": 0.6908573461401637, + "grad_norm": 1.5623152300599972, + "learning_rate": 4.737669015348417e-05, + "loss": 0.5731, + "step": 5826 + }, + { + "epoch": 0.6909759279022887, + "grad_norm": 1.61871095612396, + "learning_rate": 4.7375619656183644e-05, + "loss": 0.7085, + "step": 5827 + }, + { + "epoch": 0.6910945096644137, + "grad_norm": 1.1084739368144534, + "learning_rate": 4.737454895260734e-05, + "loss": 0.368, + "step": 5828 + }, + { + "epoch": 0.6912130914265386, + "grad_norm": 1.211126403356313, + "learning_rate": 4.737347804276513e-05, + "loss": 0.5287, + "step": 5829 + }, + { + "epoch": 0.6913316731886636, + "grad_norm": 0.9986801684586978, + "learning_rate": 4.737240692666689e-05, + "loss": 0.3306, + "step": 5830 + }, + { + "epoch": 0.6914502549507886, + "grad_norm": 1.2615411251529267, + "learning_rate": 4.73713356043225e-05, + "loss": 0.5272, + "step": 5831 + }, + { + "epoch": 0.6915688367129136, + "grad_norm": 1.528425079750217, + "learning_rate": 4.737026407574182e-05, + "loss": 0.6236, + "step": 5832 + }, + { + "epoch": 0.6916874184750386, + "grad_norm": 1.334203849686508, + "learning_rate": 4.736919234093474e-05, + "loss": 0.4551, + "step": 5833 + }, + { + "epoch": 0.6918060002371635, + "grad_norm": 1.4331442130400391, + "learning_rate": 4.736812039991113e-05, + "loss": 0.5355, + "step": 5834 + }, + { + "epoch": 0.6919245819992885, + "grad_norm": 1.2440821015156103, + "learning_rate": 4.736704825268089e-05, + "loss": 0.5246, + "step": 5835 + }, + { + "epoch": 0.6920431637614135, + "grad_norm": 1.4219206266067421, + "learning_rate": 4.736597589925389e-05, + "loss": 0.6297, + "step": 5836 + }, + { + "epoch": 0.6921617455235385, + "grad_norm": 1.3086883771654296, + "learning_rate": 4.736490333964002e-05, + "loss": 0.4994, + "step": 5837 + }, + { + "epoch": 0.6922803272856635, + "grad_norm": 1.4388139831673052, + "learning_rate": 4.7363830573849166e-05, + "loss": 0.5046, + "step": 5838 + }, + { + "epoch": 0.6923989090477884, + "grad_norm": 1.4741280403822496, + "learning_rate": 4.736275760189122e-05, + "loss": 0.5176, + "step": 5839 + }, + { + "epoch": 0.6925174908099134, + "grad_norm": 1.4945539866120008, + "learning_rate": 4.736168442377607e-05, + "loss": 0.5921, + "step": 5840 + }, + { + "epoch": 0.6926360725720384, + "grad_norm": 1.5291303375321381, + "learning_rate": 4.736061103951361e-05, + "loss": 0.5943, + "step": 5841 + }, + { + "epoch": 0.6927546543341634, + "grad_norm": 1.2516326357163647, + "learning_rate": 4.735953744911374e-05, + "loss": 0.4633, + "step": 5842 + }, + { + "epoch": 0.6928732360962884, + "grad_norm": 1.3847101409884288, + "learning_rate": 4.735846365258635e-05, + "loss": 0.4081, + "step": 5843 + }, + { + "epoch": 0.6929918178584134, + "grad_norm": 1.8363616681263069, + "learning_rate": 4.7357389649941355e-05, + "loss": 0.5753, + "step": 5844 + }, + { + "epoch": 0.6931103996205383, + "grad_norm": 1.1289569515749396, + "learning_rate": 4.735631544118863e-05, + "loss": 0.3255, + "step": 5845 + }, + { + "epoch": 0.6932289813826633, + "grad_norm": 1.7183448942153636, + "learning_rate": 4.735524102633809e-05, + "loss": 0.728, + "step": 5846 + }, + { + "epoch": 0.6933475631447883, + "grad_norm": 1.7629938273922963, + "learning_rate": 4.735416640539966e-05, + "loss": 0.4884, + "step": 5847 + }, + { + "epoch": 0.6934661449069133, + "grad_norm": 1.4779273893577392, + "learning_rate": 4.7353091578383215e-05, + "loss": 0.3993, + "step": 5848 + }, + { + "epoch": 0.6935847266690383, + "grad_norm": 1.4366976387709554, + "learning_rate": 4.735201654529867e-05, + "loss": 0.4832, + "step": 5849 + }, + { + "epoch": 0.6937033084311632, + "grad_norm": 1.6048615250847587, + "learning_rate": 4.735094130615595e-05, + "loss": 0.4833, + "step": 5850 + }, + { + "epoch": 0.6938218901932883, + "grad_norm": 1.7905581277549416, + "learning_rate": 4.734986586096496e-05, + "loss": 0.6759, + "step": 5851 + }, + { + "epoch": 0.6939404719554133, + "grad_norm": 1.7620761364125563, + "learning_rate": 4.734879020973562e-05, + "loss": 0.7111, + "step": 5852 + }, + { + "epoch": 0.6940590537175383, + "grad_norm": 1.3730986457909102, + "learning_rate": 4.734771435247783e-05, + "loss": 0.5369, + "step": 5853 + }, + { + "epoch": 0.6941776354796633, + "grad_norm": 1.4499867947033716, + "learning_rate": 4.734663828920152e-05, + "loss": 0.5283, + "step": 5854 + }, + { + "epoch": 0.6942962172417882, + "grad_norm": 1.5452711121660943, + "learning_rate": 4.734556201991661e-05, + "loss": 0.6338, + "step": 5855 + }, + { + "epoch": 0.6944147990039132, + "grad_norm": 1.374513338033488, + "learning_rate": 4.734448554463302e-05, + "loss": 0.4917, + "step": 5856 + }, + { + "epoch": 0.6945333807660382, + "grad_norm": 1.2117202112112078, + "learning_rate": 4.734340886336068e-05, + "loss": 0.3614, + "step": 5857 + }, + { + "epoch": 0.6946519625281632, + "grad_norm": 1.1981286588434894, + "learning_rate": 4.73423319761095e-05, + "loss": 0.4032, + "step": 5858 + }, + { + "epoch": 0.6947705442902882, + "grad_norm": 1.1396402289290508, + "learning_rate": 4.734125488288942e-05, + "loss": 0.375, + "step": 5859 + }, + { + "epoch": 0.6948891260524132, + "grad_norm": 1.2497182299420218, + "learning_rate": 4.734017758371037e-05, + "loss": 0.416, + "step": 5860 + }, + { + "epoch": 0.6950077078145381, + "grad_norm": 1.591261943859206, + "learning_rate": 4.733910007858228e-05, + "loss": 0.6118, + "step": 5861 + }, + { + "epoch": 0.6951262895766631, + "grad_norm": 1.7658282301585015, + "learning_rate": 4.733802236751507e-05, + "loss": 0.596, + "step": 5862 + }, + { + "epoch": 0.6952448713387881, + "grad_norm": 1.6658263073469215, + "learning_rate": 4.7336944450518696e-05, + "loss": 0.5929, + "step": 5863 + }, + { + "epoch": 0.6953634531009131, + "grad_norm": 1.7425330592934143, + "learning_rate": 4.7335866327603086e-05, + "loss": 0.5903, + "step": 5864 + }, + { + "epoch": 0.6954820348630381, + "grad_norm": 1.6361601819130394, + "learning_rate": 4.733478799877817e-05, + "loss": 0.5193, + "step": 5865 + }, + { + "epoch": 0.695600616625163, + "grad_norm": 1.823413887170786, + "learning_rate": 4.733370946405391e-05, + "loss": 0.6462, + "step": 5866 + }, + { + "epoch": 0.695719198387288, + "grad_norm": 1.5333372358825776, + "learning_rate": 4.7332630723440225e-05, + "loss": 0.6184, + "step": 5867 + }, + { + "epoch": 0.695837780149413, + "grad_norm": 1.3254614858725624, + "learning_rate": 4.7331551776947075e-05, + "loss": 0.451, + "step": 5868 + }, + { + "epoch": 0.695956361911538, + "grad_norm": 1.386051915383401, + "learning_rate": 4.7330472624584396e-05, + "loss": 0.3332, + "step": 5869 + }, + { + "epoch": 0.696074943673663, + "grad_norm": 1.812729380861544, + "learning_rate": 4.732939326636216e-05, + "loss": 0.6201, + "step": 5870 + }, + { + "epoch": 0.6961935254357879, + "grad_norm": 1.3666421580671622, + "learning_rate": 4.7328313702290285e-05, + "loss": 0.3739, + "step": 5871 + }, + { + "epoch": 0.6963121071979129, + "grad_norm": 1.415734444424527, + "learning_rate": 4.732723393237875e-05, + "loss": 0.3941, + "step": 5872 + }, + { + "epoch": 0.6964306889600379, + "grad_norm": 1.7268802499312779, + "learning_rate": 4.732615395663748e-05, + "loss": 0.5275, + "step": 5873 + }, + { + "epoch": 0.6965492707221629, + "grad_norm": 1.5786202907707103, + "learning_rate": 4.732507377507646e-05, + "loss": 0.5458, + "step": 5874 + }, + { + "epoch": 0.6966678524842879, + "grad_norm": 1.4921647485503657, + "learning_rate": 4.732399338770563e-05, + "loss": 0.4846, + "step": 5875 + }, + { + "epoch": 0.696786434246413, + "grad_norm": 1.3034398436639854, + "learning_rate": 4.732291279453497e-05, + "loss": 0.3817, + "step": 5876 + }, + { + "epoch": 0.6969050160085379, + "grad_norm": 1.1344967833578508, + "learning_rate": 4.7321831995574414e-05, + "loss": 0.4116, + "step": 5877 + }, + { + "epoch": 0.6970235977706629, + "grad_norm": 1.3882287457673606, + "learning_rate": 4.7320750990833954e-05, + "loss": 0.439, + "step": 5878 + }, + { + "epoch": 0.6971421795327879, + "grad_norm": 1.4382200368897877, + "learning_rate": 4.731966978032353e-05, + "loss": 0.4349, + "step": 5879 + }, + { + "epoch": 0.6972607612949129, + "grad_norm": 1.6500673192625435, + "learning_rate": 4.7318588364053126e-05, + "loss": 0.5658, + "step": 5880 + }, + { + "epoch": 0.6973793430570379, + "grad_norm": 1.2696162915212013, + "learning_rate": 4.73175067420327e-05, + "loss": 0.3454, + "step": 5881 + }, + { + "epoch": 0.6974979248191628, + "grad_norm": 1.024509747364139, + "learning_rate": 4.731642491427224e-05, + "loss": 0.3429, + "step": 5882 + }, + { + "epoch": 0.6976165065812878, + "grad_norm": 1.665208322971569, + "learning_rate": 4.73153428807817e-05, + "loss": 0.4705, + "step": 5883 + }, + { + "epoch": 0.6977350883434128, + "grad_norm": 1.2298907692723107, + "learning_rate": 4.7314260641571065e-05, + "loss": 0.3774, + "step": 5884 + }, + { + "epoch": 0.6978536701055378, + "grad_norm": 1.1498817325914694, + "learning_rate": 4.731317819665031e-05, + "loss": 0.3075, + "step": 5885 + }, + { + "epoch": 0.6979722518676628, + "grad_norm": 1.502461154985352, + "learning_rate": 4.731209554602942e-05, + "loss": 0.3905, + "step": 5886 + }, + { + "epoch": 0.6980908336297877, + "grad_norm": 1.545365812659794, + "learning_rate": 4.731101268971836e-05, + "loss": 0.4738, + "step": 5887 + }, + { + "epoch": 0.6982094153919127, + "grad_norm": 1.4836968000950312, + "learning_rate": 4.730992962772714e-05, + "loss": 0.3957, + "step": 5888 + }, + { + "epoch": 0.6983279971540377, + "grad_norm": 1.6354786113236879, + "learning_rate": 4.7308846360065706e-05, + "loss": 0.5732, + "step": 5889 + }, + { + "epoch": 0.6984465789161627, + "grad_norm": 1.770928217338458, + "learning_rate": 4.730776288674408e-05, + "loss": 0.5702, + "step": 5890 + }, + { + "epoch": 0.6985651606782877, + "grad_norm": 2.0275870867200885, + "learning_rate": 4.730667920777222e-05, + "loss": 0.6628, + "step": 5891 + }, + { + "epoch": 0.6986837424404126, + "grad_norm": 1.7326626731569186, + "learning_rate": 4.730559532316015e-05, + "loss": 0.4809, + "step": 5892 + }, + { + "epoch": 0.6988023242025376, + "grad_norm": 1.902389066678117, + "learning_rate": 4.7304511232917836e-05, + "loss": 0.5499, + "step": 5893 + }, + { + "epoch": 0.6989209059646626, + "grad_norm": 1.5488931618511972, + "learning_rate": 4.730342693705528e-05, + "loss": 0.4387, + "step": 5894 + }, + { + "epoch": 0.6990394877267876, + "grad_norm": 1.6868137672314207, + "learning_rate": 4.7302342435582476e-05, + "loss": 0.4819, + "step": 5895 + }, + { + "epoch": 0.6991580694889126, + "grad_norm": 1.3542243404793528, + "learning_rate": 4.730125772850943e-05, + "loss": 0.4079, + "step": 5896 + }, + { + "epoch": 0.6992766512510376, + "grad_norm": 1.5639698899531111, + "learning_rate": 4.730017281584613e-05, + "loss": 0.5336, + "step": 5897 + }, + { + "epoch": 0.6993952330131625, + "grad_norm": 1.5811606975490025, + "learning_rate": 4.7299087697602587e-05, + "loss": 0.4721, + "step": 5898 + }, + { + "epoch": 0.6995138147752875, + "grad_norm": 1.732402612606219, + "learning_rate": 4.7298002373788795e-05, + "loss": 0.5615, + "step": 5899 + }, + { + "epoch": 0.6996323965374125, + "grad_norm": 1.4524728076027031, + "learning_rate": 4.729691684441476e-05, + "loss": 0.4574, + "step": 5900 + }, + { + "epoch": 0.6997509782995376, + "grad_norm": 1.197924847755885, + "learning_rate": 4.729583110949051e-05, + "loss": 0.4067, + "step": 5901 + }, + { + "epoch": 0.6998695600616626, + "grad_norm": 1.1416280794254032, + "learning_rate": 4.7294745169026024e-05, + "loss": 0.3958, + "step": 5902 + }, + { + "epoch": 0.6999881418237875, + "grad_norm": 1.3998865562194645, + "learning_rate": 4.729365902303133e-05, + "loss": 0.4019, + "step": 5903 + }, + { + "epoch": 0.7001067235859125, + "grad_norm": 1.3192129175817104, + "learning_rate": 4.7292572671516443e-05, + "loss": 0.3784, + "step": 5904 + }, + { + "epoch": 0.7002253053480375, + "grad_norm": 1.5874148871177438, + "learning_rate": 4.729148611449137e-05, + "loss": 0.5182, + "step": 5905 + }, + { + "epoch": 0.7003438871101625, + "grad_norm": 1.2384219320303527, + "learning_rate": 4.729039935196613e-05, + "loss": 0.4747, + "step": 5906 + }, + { + "epoch": 0.7004624688722875, + "grad_norm": 1.3975944832787983, + "learning_rate": 4.7289312383950746e-05, + "loss": 0.5296, + "step": 5907 + }, + { + "epoch": 0.7005810506344124, + "grad_norm": 1.746167561330862, + "learning_rate": 4.7288225210455237e-05, + "loss": 0.5548, + "step": 5908 + }, + { + "epoch": 0.7006996323965374, + "grad_norm": 1.4595847540456612, + "learning_rate": 4.728713783148961e-05, + "loss": 0.5001, + "step": 5909 + }, + { + "epoch": 0.7008182141586624, + "grad_norm": 1.000173164655837, + "learning_rate": 4.7286050247063916e-05, + "loss": 0.3067, + "step": 5910 + }, + { + "epoch": 0.7009367959207874, + "grad_norm": 1.0916913065665843, + "learning_rate": 4.728496245718816e-05, + "loss": 0.3675, + "step": 5911 + }, + { + "epoch": 0.7010553776829124, + "grad_norm": 1.345617170667106, + "learning_rate": 4.728387446187238e-05, + "loss": 0.4811, + "step": 5912 + }, + { + "epoch": 0.7011739594450374, + "grad_norm": 1.6765521877058907, + "learning_rate": 4.728278626112661e-05, + "loss": 0.5702, + "step": 5913 + }, + { + "epoch": 0.7012925412071623, + "grad_norm": 1.7942172738157531, + "learning_rate": 4.7281697854960874e-05, + "loss": 0.46, + "step": 5914 + }, + { + "epoch": 0.7014111229692873, + "grad_norm": 1.4679362350645122, + "learning_rate": 4.72806092433852e-05, + "loss": 0.4439, + "step": 5915 + }, + { + "epoch": 0.7015297047314123, + "grad_norm": 1.4064822935685775, + "learning_rate": 4.727952042640963e-05, + "loss": 0.4108, + "step": 5916 + }, + { + "epoch": 0.7016482864935373, + "grad_norm": 1.6994344565965738, + "learning_rate": 4.727843140404421e-05, + "loss": 0.4493, + "step": 5917 + }, + { + "epoch": 0.7017668682556623, + "grad_norm": 2.104917069903942, + "learning_rate": 4.727734217629897e-05, + "loss": 0.648, + "step": 5918 + }, + { + "epoch": 0.7018854500177872, + "grad_norm": 1.4274694245798705, + "learning_rate": 4.7276252743183945e-05, + "loss": 0.4147, + "step": 5919 + }, + { + "epoch": 0.7020040317799122, + "grad_norm": 1.6318014827359697, + "learning_rate": 4.72751631047092e-05, + "loss": 0.4388, + "step": 5920 + }, + { + "epoch": 0.7021226135420372, + "grad_norm": 1.3463002339299923, + "learning_rate": 4.727407326088476e-05, + "loss": 0.3649, + "step": 5921 + }, + { + "epoch": 0.7022411953041622, + "grad_norm": 1.3621512858616547, + "learning_rate": 4.7272983211720676e-05, + "loss": 0.4368, + "step": 5922 + }, + { + "epoch": 0.7023597770662872, + "grad_norm": 1.485238884881027, + "learning_rate": 4.7271892957227e-05, + "loss": 0.4407, + "step": 5923 + }, + { + "epoch": 0.7024783588284121, + "grad_norm": 1.5654029799509774, + "learning_rate": 4.727080249741379e-05, + "loss": 0.5948, + "step": 5924 + }, + { + "epoch": 0.7025969405905371, + "grad_norm": 1.6040530458222044, + "learning_rate": 4.726971183229109e-05, + "loss": 0.606, + "step": 5925 + }, + { + "epoch": 0.7027155223526622, + "grad_norm": 1.5521128202492758, + "learning_rate": 4.7268620961868956e-05, + "loss": 0.425, + "step": 5926 + }, + { + "epoch": 0.7028341041147872, + "grad_norm": 1.4568612287662037, + "learning_rate": 4.726752988615743e-05, + "loss": 0.4354, + "step": 5927 + }, + { + "epoch": 0.7029526858769122, + "grad_norm": 1.4386641990131188, + "learning_rate": 4.726643860516661e-05, + "loss": 0.5651, + "step": 5928 + }, + { + "epoch": 0.7030712676390372, + "grad_norm": 1.367753465330739, + "learning_rate": 4.726534711890651e-05, + "loss": 0.4008, + "step": 5929 + }, + { + "epoch": 0.7031898494011621, + "grad_norm": 1.5000165608998892, + "learning_rate": 4.726425542738722e-05, + "loss": 0.7037, + "step": 5930 + }, + { + "epoch": 0.7033084311632871, + "grad_norm": 1.5747477998382635, + "learning_rate": 4.72631635306188e-05, + "loss": 0.4796, + "step": 5931 + }, + { + "epoch": 0.7034270129254121, + "grad_norm": 1.3833280612224677, + "learning_rate": 4.726207142861131e-05, + "loss": 0.4967, + "step": 5932 + }, + { + "epoch": 0.7035455946875371, + "grad_norm": 1.895884851920397, + "learning_rate": 4.726097912137482e-05, + "loss": 0.7004, + "step": 5933 + }, + { + "epoch": 0.7036641764496621, + "grad_norm": 1.410781548415178, + "learning_rate": 4.72598866089194e-05, + "loss": 0.3574, + "step": 5934 + }, + { + "epoch": 0.703782758211787, + "grad_norm": 1.373035936774265, + "learning_rate": 4.7258793891255124e-05, + "loss": 0.4072, + "step": 5935 + }, + { + "epoch": 0.703901339973912, + "grad_norm": 1.6744581797437508, + "learning_rate": 4.7257700968392074e-05, + "loss": 0.4675, + "step": 5936 + }, + { + "epoch": 0.704019921736037, + "grad_norm": 1.1821717302901371, + "learning_rate": 4.72566078403403e-05, + "loss": 0.3443, + "step": 5937 + }, + { + "epoch": 0.704138503498162, + "grad_norm": 1.4611561842336995, + "learning_rate": 4.72555145071099e-05, + "loss": 0.5054, + "step": 5938 + }, + { + "epoch": 0.704257085260287, + "grad_norm": 1.7927054987476099, + "learning_rate": 4.7254420968710945e-05, + "loss": 0.6275, + "step": 5939 + }, + { + "epoch": 0.7043756670224119, + "grad_norm": 1.3376478924166504, + "learning_rate": 4.725332722515352e-05, + "loss": 0.4088, + "step": 5940 + }, + { + "epoch": 0.7044942487845369, + "grad_norm": 1.2137412295429737, + "learning_rate": 4.725223327644771e-05, + "loss": 0.3314, + "step": 5941 + }, + { + "epoch": 0.7046128305466619, + "grad_norm": 1.4230484652856537, + "learning_rate": 4.72511391226036e-05, + "loss": 0.4057, + "step": 5942 + }, + { + "epoch": 0.7047314123087869, + "grad_norm": 1.2226011256534264, + "learning_rate": 4.725004476363127e-05, + "loss": 0.3479, + "step": 5943 + }, + { + "epoch": 0.7048499940709119, + "grad_norm": 1.5094247786964183, + "learning_rate": 4.724895019954081e-05, + "loss": 0.4414, + "step": 5944 + }, + { + "epoch": 0.7049685758330368, + "grad_norm": 1.4776662812611645, + "learning_rate": 4.7247855430342315e-05, + "loss": 0.5321, + "step": 5945 + }, + { + "epoch": 0.7050871575951618, + "grad_norm": 1.1026220443720005, + "learning_rate": 4.7246760456045866e-05, + "loss": 0.2955, + "step": 5946 + }, + { + "epoch": 0.7052057393572868, + "grad_norm": 1.2130178216068794, + "learning_rate": 4.724566527666158e-05, + "loss": 0.2889, + "step": 5947 + }, + { + "epoch": 0.7053243211194118, + "grad_norm": 1.3021672927987606, + "learning_rate": 4.724456989219953e-05, + "loss": 0.3943, + "step": 5948 + }, + { + "epoch": 0.7054429028815368, + "grad_norm": 1.5558689035364544, + "learning_rate": 4.7243474302669834e-05, + "loss": 0.5451, + "step": 5949 + }, + { + "epoch": 0.7055614846436618, + "grad_norm": 1.815468782358231, + "learning_rate": 4.724237850808257e-05, + "loss": 0.6548, + "step": 5950 + }, + { + "epoch": 0.7056800664057868, + "grad_norm": 2.108305672181066, + "learning_rate": 4.724128250844786e-05, + "loss": 0.6917, + "step": 5951 + }, + { + "epoch": 0.7057986481679118, + "grad_norm": 1.6886394623265477, + "learning_rate": 4.7240186303775794e-05, + "loss": 0.4883, + "step": 5952 + }, + { + "epoch": 0.7059172299300368, + "grad_norm": 1.4978929230391342, + "learning_rate": 4.723908989407648e-05, + "loss": 0.4674, + "step": 5953 + }, + { + "epoch": 0.7060358116921618, + "grad_norm": 1.7491778616671785, + "learning_rate": 4.7237993279360036e-05, + "loss": 0.5268, + "step": 5954 + }, + { + "epoch": 0.7061543934542868, + "grad_norm": 1.528305047675958, + "learning_rate": 4.723689645963656e-05, + "loss": 0.4883, + "step": 5955 + }, + { + "epoch": 0.7062729752164117, + "grad_norm": 1.2759809321609918, + "learning_rate": 4.723579943491617e-05, + "loss": 0.3816, + "step": 5956 + }, + { + "epoch": 0.7063915569785367, + "grad_norm": 1.5935467831374166, + "learning_rate": 4.7234702205208974e-05, + "loss": 0.5671, + "step": 5957 + }, + { + "epoch": 0.7065101387406617, + "grad_norm": 1.5548924218578515, + "learning_rate": 4.7233604770525095e-05, + "loss": 0.5028, + "step": 5958 + }, + { + "epoch": 0.7066287205027867, + "grad_norm": 1.4301672275800712, + "learning_rate": 4.723250713087464e-05, + "loss": 0.447, + "step": 5959 + }, + { + "epoch": 0.7067473022649117, + "grad_norm": 1.482943506343618, + "learning_rate": 4.723140928626773e-05, + "loss": 0.5458, + "step": 5960 + }, + { + "epoch": 0.7068658840270367, + "grad_norm": 1.2833827823711732, + "learning_rate": 4.7230311236714495e-05, + "loss": 0.3465, + "step": 5961 + }, + { + "epoch": 0.7069844657891616, + "grad_norm": 1.242530107871678, + "learning_rate": 4.722921298222505e-05, + "loss": 0.4536, + "step": 5962 + }, + { + "epoch": 0.7071030475512866, + "grad_norm": 1.1598507232416433, + "learning_rate": 4.722811452280951e-05, + "loss": 0.3775, + "step": 5963 + }, + { + "epoch": 0.7072216293134116, + "grad_norm": 1.439248138879495, + "learning_rate": 4.7227015858478024e-05, + "loss": 0.5237, + "step": 5964 + }, + { + "epoch": 0.7073402110755366, + "grad_norm": 2.5165198662421266, + "learning_rate": 4.72259169892407e-05, + "loss": 0.727, + "step": 5965 + }, + { + "epoch": 0.7074587928376616, + "grad_norm": 1.721405742762175, + "learning_rate": 4.7224817915107686e-05, + "loss": 0.5552, + "step": 5966 + }, + { + "epoch": 0.7075773745997865, + "grad_norm": 1.3256628256889362, + "learning_rate": 4.72237186360891e-05, + "loss": 0.4454, + "step": 5967 + }, + { + "epoch": 0.7076959563619115, + "grad_norm": 1.6477022857390915, + "learning_rate": 4.722261915219508e-05, + "loss": 0.6967, + "step": 5968 + }, + { + "epoch": 0.7078145381240365, + "grad_norm": 1.0429301349454834, + "learning_rate": 4.722151946343576e-05, + "loss": 0.362, + "step": 5969 + }, + { + "epoch": 0.7079331198861615, + "grad_norm": 1.387324199367693, + "learning_rate": 4.722041956982128e-05, + "loss": 0.4778, + "step": 5970 + }, + { + "epoch": 0.7080517016482865, + "grad_norm": 1.5172439270254272, + "learning_rate": 4.721931947136179e-05, + "loss": 0.651, + "step": 5971 + }, + { + "epoch": 0.7081702834104114, + "grad_norm": 1.4978675611510586, + "learning_rate": 4.721821916806741e-05, + "loss": 0.6066, + "step": 5972 + }, + { + "epoch": 0.7082888651725364, + "grad_norm": 1.1445779510254601, + "learning_rate": 4.72171186599483e-05, + "loss": 0.3915, + "step": 5973 + }, + { + "epoch": 0.7084074469346614, + "grad_norm": 1.5565573812420421, + "learning_rate": 4.72160179470146e-05, + "loss": 0.5772, + "step": 5974 + }, + { + "epoch": 0.7085260286967864, + "grad_norm": 1.5208319074838659, + "learning_rate": 4.721491702927646e-05, + "loss": 0.4085, + "step": 5975 + }, + { + "epoch": 0.7086446104589115, + "grad_norm": 1.5829729189712474, + "learning_rate": 4.7213815906744026e-05, + "loss": 0.6353, + "step": 5976 + }, + { + "epoch": 0.7087631922210365, + "grad_norm": 1.3525127662085084, + "learning_rate": 4.7212714579427444e-05, + "loss": 0.4676, + "step": 5977 + }, + { + "epoch": 0.7088817739831614, + "grad_norm": 1.4653520896630094, + "learning_rate": 4.721161304733688e-05, + "loss": 0.5458, + "step": 5978 + }, + { + "epoch": 0.7090003557452864, + "grad_norm": 1.2492672836237313, + "learning_rate": 4.7210511310482476e-05, + "loss": 0.35, + "step": 5979 + }, + { + "epoch": 0.7091189375074114, + "grad_norm": 1.223145669993536, + "learning_rate": 4.72094093688744e-05, + "loss": 0.3711, + "step": 5980 + }, + { + "epoch": 0.7092375192695364, + "grad_norm": 1.2580518152776057, + "learning_rate": 4.7208307222522805e-05, + "loss": 0.4231, + "step": 5981 + }, + { + "epoch": 0.7093561010316614, + "grad_norm": 1.11427313023881, + "learning_rate": 4.720720487143784e-05, + "loss": 0.432, + "step": 5982 + }, + { + "epoch": 0.7094746827937863, + "grad_norm": 1.4003536407768433, + "learning_rate": 4.7206102315629685e-05, + "loss": 0.5864, + "step": 5983 + }, + { + "epoch": 0.7095932645559113, + "grad_norm": 1.2311598632068457, + "learning_rate": 4.72049995551085e-05, + "loss": 0.4259, + "step": 5984 + }, + { + "epoch": 0.7097118463180363, + "grad_norm": 1.4012744913169684, + "learning_rate": 4.7203896589884444e-05, + "loss": 0.3723, + "step": 5985 + }, + { + "epoch": 0.7098304280801613, + "grad_norm": 1.243354222337033, + "learning_rate": 4.720279341996769e-05, + "loss": 0.4444, + "step": 5986 + }, + { + "epoch": 0.7099490098422863, + "grad_norm": 1.461754893101291, + "learning_rate": 4.720169004536842e-05, + "loss": 0.4897, + "step": 5987 + }, + { + "epoch": 0.7100675916044112, + "grad_norm": 1.3485648434952067, + "learning_rate": 4.720058646609678e-05, + "loss": 0.3231, + "step": 5988 + }, + { + "epoch": 0.7101861733665362, + "grad_norm": 1.1887650968885484, + "learning_rate": 4.719948268216296e-05, + "loss": 0.364, + "step": 5989 + }, + { + "epoch": 0.7103047551286612, + "grad_norm": 1.6067977576490993, + "learning_rate": 4.7198378693577125e-05, + "loss": 0.4317, + "step": 5990 + }, + { + "epoch": 0.7104233368907862, + "grad_norm": 1.2634600163052283, + "learning_rate": 4.7197274500349475e-05, + "loss": 0.3408, + "step": 5991 + }, + { + "epoch": 0.7105419186529112, + "grad_norm": 1.5235591373752373, + "learning_rate": 4.7196170102490165e-05, + "loss": 0.3395, + "step": 5992 + }, + { + "epoch": 0.7106605004150361, + "grad_norm": 1.3898478875330689, + "learning_rate": 4.7195065500009386e-05, + "loss": 0.3994, + "step": 5993 + }, + { + "epoch": 0.7107790821771611, + "grad_norm": 1.2420743272771753, + "learning_rate": 4.719396069291733e-05, + "loss": 0.3463, + "step": 5994 + }, + { + "epoch": 0.7108976639392861, + "grad_norm": 1.8728331013760244, + "learning_rate": 4.719285568122416e-05, + "loss": 0.4337, + "step": 5995 + }, + { + "epoch": 0.7110162457014111, + "grad_norm": 1.9429262801750444, + "learning_rate": 4.719175046494008e-05, + "loss": 0.5696, + "step": 5996 + }, + { + "epoch": 0.7111348274635361, + "grad_norm": 1.5893827750058425, + "learning_rate": 4.719064504407526e-05, + "loss": 0.4821, + "step": 5997 + }, + { + "epoch": 0.711253409225661, + "grad_norm": 1.4002367476420166, + "learning_rate": 4.718953941863993e-05, + "loss": 0.3533, + "step": 5998 + }, + { + "epoch": 0.711371990987786, + "grad_norm": 1.8679366696872277, + "learning_rate": 4.7188433588644246e-05, + "loss": 0.6416, + "step": 5999 + }, + { + "epoch": 0.711490572749911, + "grad_norm": 3.0503118304620536, + "learning_rate": 4.718732755409841e-05, + "loss": 0.7112, + "step": 6000 + }, + { + "epoch": 0.7116091545120361, + "grad_norm": 1.7045087574206623, + "learning_rate": 4.718622131501263e-05, + "loss": 0.6012, + "step": 6001 + }, + { + "epoch": 0.7117277362741611, + "grad_norm": 1.696515044979516, + "learning_rate": 4.7185114871397086e-05, + "loss": 0.4959, + "step": 6002 + }, + { + "epoch": 0.7118463180362861, + "grad_norm": 1.6356284959822913, + "learning_rate": 4.718400822326199e-05, + "loss": 0.5346, + "step": 6003 + }, + { + "epoch": 0.711964899798411, + "grad_norm": 1.6989017943775164, + "learning_rate": 4.7182901370617546e-05, + "loss": 0.58, + "step": 6004 + }, + { + "epoch": 0.712083481560536, + "grad_norm": 1.6512814543362178, + "learning_rate": 4.718179431347395e-05, + "loss": 0.6236, + "step": 6005 + }, + { + "epoch": 0.712202063322661, + "grad_norm": 1.5570988112192905, + "learning_rate": 4.7180687051841416e-05, + "loss": 0.5012, + "step": 6006 + }, + { + "epoch": 0.712320645084786, + "grad_norm": 1.2465712638269832, + "learning_rate": 4.717957958573015e-05, + "loss": 0.4705, + "step": 6007 + }, + { + "epoch": 0.712439226846911, + "grad_norm": 1.4098136796235567, + "learning_rate": 4.7178471915150344e-05, + "loss": 0.4736, + "step": 6008 + }, + { + "epoch": 0.712557808609036, + "grad_norm": 1.6799569338660605, + "learning_rate": 4.7177364040112236e-05, + "loss": 0.5198, + "step": 6009 + }, + { + "epoch": 0.7126763903711609, + "grad_norm": 1.3489622627853803, + "learning_rate": 4.717625596062602e-05, + "loss": 0.5322, + "step": 6010 + }, + { + "epoch": 0.7127949721332859, + "grad_norm": 1.1519376170329596, + "learning_rate": 4.717514767670193e-05, + "loss": 0.4263, + "step": 6011 + }, + { + "epoch": 0.7129135538954109, + "grad_norm": 1.3553575209621211, + "learning_rate": 4.717403918835017e-05, + "loss": 0.573, + "step": 6012 + }, + { + "epoch": 0.7130321356575359, + "grad_norm": 1.3496918712492023, + "learning_rate": 4.7172930495580945e-05, + "loss": 0.4021, + "step": 6013 + }, + { + "epoch": 0.7131507174196609, + "grad_norm": 1.207950603079253, + "learning_rate": 4.717182159840451e-05, + "loss": 0.4735, + "step": 6014 + }, + { + "epoch": 0.7132692991817858, + "grad_norm": 1.5058323829145883, + "learning_rate": 4.717071249683106e-05, + "loss": 0.4265, + "step": 6015 + }, + { + "epoch": 0.7133878809439108, + "grad_norm": 1.5216611171563474, + "learning_rate": 4.716960319087082e-05, + "loss": 0.474, + "step": 6016 + }, + { + "epoch": 0.7135064627060358, + "grad_norm": 1.2316875303428252, + "learning_rate": 4.7168493680534034e-05, + "loss": 0.4708, + "step": 6017 + }, + { + "epoch": 0.7136250444681608, + "grad_norm": 1.365902239593643, + "learning_rate": 4.716738396583092e-05, + "loss": 0.3592, + "step": 6018 + }, + { + "epoch": 0.7137436262302858, + "grad_norm": 1.4722511555458055, + "learning_rate": 4.7166274046771714e-05, + "loss": 0.6663, + "step": 6019 + }, + { + "epoch": 0.7138622079924107, + "grad_norm": 2.2329775512133896, + "learning_rate": 4.716516392336664e-05, + "loss": 0.6219, + "step": 6020 + }, + { + "epoch": 0.7139807897545357, + "grad_norm": 1.385866409848301, + "learning_rate": 4.7164053595625934e-05, + "loss": 0.499, + "step": 6021 + }, + { + "epoch": 0.7140993715166607, + "grad_norm": 1.3480872785222318, + "learning_rate": 4.716294306355983e-05, + "loss": 0.4731, + "step": 6022 + }, + { + "epoch": 0.7142179532787857, + "grad_norm": 1.4786621663330546, + "learning_rate": 4.716183232717857e-05, + "loss": 0.5559, + "step": 6023 + }, + { + "epoch": 0.7143365350409107, + "grad_norm": 1.7232082315966315, + "learning_rate": 4.71607213864924e-05, + "loss": 0.6616, + "step": 6024 + }, + { + "epoch": 0.7144551168030356, + "grad_norm": 1.0566713350334818, + "learning_rate": 4.7159610241511545e-05, + "loss": 0.3917, + "step": 6025 + }, + { + "epoch": 0.7145736985651607, + "grad_norm": 1.275234150206778, + "learning_rate": 4.715849889224626e-05, + "loss": 0.3643, + "step": 6026 + }, + { + "epoch": 0.7146922803272857, + "grad_norm": 1.91446337837352, + "learning_rate": 4.715738733870679e-05, + "loss": 0.5402, + "step": 6027 + }, + { + "epoch": 0.7148108620894107, + "grad_norm": 1.2693028220829001, + "learning_rate": 4.715627558090338e-05, + "loss": 0.4081, + "step": 6028 + }, + { + "epoch": 0.7149294438515357, + "grad_norm": 1.5687933386689075, + "learning_rate": 4.7155163618846274e-05, + "loss": 0.4446, + "step": 6029 + }, + { + "epoch": 0.7150480256136607, + "grad_norm": 1.4604014205736202, + "learning_rate": 4.715405145254573e-05, + "loss": 0.3999, + "step": 6030 + }, + { + "epoch": 0.7151666073757856, + "grad_norm": 1.3775410782322095, + "learning_rate": 4.715293908201201e-05, + "loss": 0.4105, + "step": 6031 + }, + { + "epoch": 0.7152851891379106, + "grad_norm": 1.569618936490067, + "learning_rate": 4.7151826507255346e-05, + "loss": 0.5459, + "step": 6032 + }, + { + "epoch": 0.7154037709000356, + "grad_norm": 1.4234929001217402, + "learning_rate": 4.7150713728286014e-05, + "loss": 0.5213, + "step": 6033 + }, + { + "epoch": 0.7155223526621606, + "grad_norm": 1.3599099970854809, + "learning_rate": 4.714960074511425e-05, + "loss": 0.4318, + "step": 6034 + }, + { + "epoch": 0.7156409344242856, + "grad_norm": 1.185096464777227, + "learning_rate": 4.714848755775034e-05, + "loss": 0.4058, + "step": 6035 + }, + { + "epoch": 0.7157595161864105, + "grad_norm": 1.3312068217066193, + "learning_rate": 4.7147374166204525e-05, + "loss": 0.341, + "step": 6036 + }, + { + "epoch": 0.7158780979485355, + "grad_norm": 1.3527698017855552, + "learning_rate": 4.714626057048709e-05, + "loss": 0.5029, + "step": 6037 + }, + { + "epoch": 0.7159966797106605, + "grad_norm": 1.7018056504987147, + "learning_rate": 4.714514677060829e-05, + "loss": 0.5775, + "step": 6038 + }, + { + "epoch": 0.7161152614727855, + "grad_norm": 1.4503923855099095, + "learning_rate": 4.7144032766578386e-05, + "loss": 0.3905, + "step": 6039 + }, + { + "epoch": 0.7162338432349105, + "grad_norm": 1.041306270461812, + "learning_rate": 4.714291855840766e-05, + "loss": 0.3394, + "step": 6040 + }, + { + "epoch": 0.7163524249970354, + "grad_norm": 1.2064904598779385, + "learning_rate": 4.7141804146106374e-05, + "loss": 0.3862, + "step": 6041 + }, + { + "epoch": 0.7164710067591604, + "grad_norm": 1.6417707836099262, + "learning_rate": 4.7140689529684814e-05, + "loss": 0.4147, + "step": 6042 + }, + { + "epoch": 0.7165895885212854, + "grad_norm": 1.34049993247524, + "learning_rate": 4.7139574709153236e-05, + "loss": 0.4876, + "step": 6043 + }, + { + "epoch": 0.7167081702834104, + "grad_norm": 1.5479176655938736, + "learning_rate": 4.7138459684521935e-05, + "loss": 0.4892, + "step": 6044 + }, + { + "epoch": 0.7168267520455354, + "grad_norm": 1.5159484971048887, + "learning_rate": 4.713734445580118e-05, + "loss": 0.4567, + "step": 6045 + }, + { + "epoch": 0.7169453338076603, + "grad_norm": 1.0177933904238412, + "learning_rate": 4.713622902300126e-05, + "loss": 0.359, + "step": 6046 + }, + { + "epoch": 0.7170639155697853, + "grad_norm": 1.3862692328244868, + "learning_rate": 4.713511338613245e-05, + "loss": 0.397, + "step": 6047 + }, + { + "epoch": 0.7171824973319103, + "grad_norm": 1.3885611342207496, + "learning_rate": 4.713399754520505e-05, + "loss": 0.3334, + "step": 6048 + }, + { + "epoch": 0.7173010790940353, + "grad_norm": 2.13315786800925, + "learning_rate": 4.713288150022932e-05, + "loss": 0.7461, + "step": 6049 + }, + { + "epoch": 0.7174196608561603, + "grad_norm": 1.5981453321549308, + "learning_rate": 4.7131765251215565e-05, + "loss": 0.4815, + "step": 6050 + }, + { + "epoch": 0.7175382426182854, + "grad_norm": 1.913126525779876, + "learning_rate": 4.7130648798174085e-05, + "loss": 0.5337, + "step": 6051 + }, + { + "epoch": 0.7176568243804103, + "grad_norm": 1.6175865442186839, + "learning_rate": 4.712953214111515e-05, + "loss": 0.5532, + "step": 6052 + }, + { + "epoch": 0.7177754061425353, + "grad_norm": 1.4064948750719986, + "learning_rate": 4.712841528004907e-05, + "loss": 0.4733, + "step": 6053 + }, + { + "epoch": 0.7178939879046603, + "grad_norm": 1.1546817956854067, + "learning_rate": 4.712729821498614e-05, + "loss": 0.3239, + "step": 6054 + }, + { + "epoch": 0.7180125696667853, + "grad_norm": 1.4473153536356218, + "learning_rate": 4.712618094593665e-05, + "loss": 0.433, + "step": 6055 + }, + { + "epoch": 0.7181311514289103, + "grad_norm": 1.5849077953621233, + "learning_rate": 4.712506347291091e-05, + "loss": 0.4741, + "step": 6056 + }, + { + "epoch": 0.7182497331910352, + "grad_norm": 1.3400456664627796, + "learning_rate": 4.712394579591921e-05, + "loss": 0.4599, + "step": 6057 + }, + { + "epoch": 0.7183683149531602, + "grad_norm": 2.1700411447037706, + "learning_rate": 4.7122827914971864e-05, + "loss": 0.6045, + "step": 6058 + }, + { + "epoch": 0.7184868967152852, + "grad_norm": 1.221094944651475, + "learning_rate": 4.712170983007917e-05, + "loss": 0.4555, + "step": 6059 + }, + { + "epoch": 0.7186054784774102, + "grad_norm": 1.5110335358415627, + "learning_rate": 4.7120591541251444e-05, + "loss": 0.6035, + "step": 6060 + }, + { + "epoch": 0.7187240602395352, + "grad_norm": 1.238183171251342, + "learning_rate": 4.711947304849899e-05, + "loss": 0.4659, + "step": 6061 + }, + { + "epoch": 0.7188426420016601, + "grad_norm": 1.3234519559758533, + "learning_rate": 4.711835435183211e-05, + "loss": 0.5861, + "step": 6062 + }, + { + "epoch": 0.7189612237637851, + "grad_norm": 1.270018465954381, + "learning_rate": 4.711723545126113e-05, + "loss": 0.4681, + "step": 6063 + }, + { + "epoch": 0.7190798055259101, + "grad_norm": 1.565894749531541, + "learning_rate": 4.711611634679637e-05, + "loss": 0.6446, + "step": 6064 + }, + { + "epoch": 0.7191983872880351, + "grad_norm": 1.4628545783303284, + "learning_rate": 4.711499703844813e-05, + "loss": 0.4773, + "step": 6065 + }, + { + "epoch": 0.7193169690501601, + "grad_norm": 1.3100905789092787, + "learning_rate": 4.7113877526226733e-05, + "loss": 0.4971, + "step": 6066 + }, + { + "epoch": 0.719435550812285, + "grad_norm": 1.3882434648454482, + "learning_rate": 4.711275781014251e-05, + "loss": 0.4591, + "step": 6067 + }, + { + "epoch": 0.71955413257441, + "grad_norm": 1.6472013814594941, + "learning_rate": 4.7111637890205775e-05, + "loss": 0.6753, + "step": 6068 + }, + { + "epoch": 0.719672714336535, + "grad_norm": 1.4302680300845416, + "learning_rate": 4.7110517766426855e-05, + "loss": 0.4397, + "step": 6069 + }, + { + "epoch": 0.71979129609866, + "grad_norm": 1.1305982517149848, + "learning_rate": 4.710939743881607e-05, + "loss": 0.317, + "step": 6070 + }, + { + "epoch": 0.719909877860785, + "grad_norm": 1.2798688675403849, + "learning_rate": 4.710827690738376e-05, + "loss": 0.4581, + "step": 6071 + }, + { + "epoch": 0.72002845962291, + "grad_norm": 1.3004879211362474, + "learning_rate": 4.7107156172140244e-05, + "loss": 0.5232, + "step": 6072 + }, + { + "epoch": 0.7201470413850349, + "grad_norm": 1.4320610922750447, + "learning_rate": 4.710603523309586e-05, + "loss": 0.5267, + "step": 6073 + }, + { + "epoch": 0.7202656231471599, + "grad_norm": 1.621326979491838, + "learning_rate": 4.710491409026093e-05, + "loss": 0.56, + "step": 6074 + }, + { + "epoch": 0.7203842049092849, + "grad_norm": 1.4149332751544246, + "learning_rate": 4.7103792743645806e-05, + "loss": 0.4817, + "step": 6075 + }, + { + "epoch": 0.72050278667141, + "grad_norm": 1.2001455843915008, + "learning_rate": 4.710267119326082e-05, + "loss": 0.3876, + "step": 6076 + }, + { + "epoch": 0.720621368433535, + "grad_norm": 1.4304629414419554, + "learning_rate": 4.710154943911631e-05, + "loss": 0.5728, + "step": 6077 + }, + { + "epoch": 0.72073995019566, + "grad_norm": 1.574098980406714, + "learning_rate": 4.7100427481222614e-05, + "loss": 0.5394, + "step": 6078 + }, + { + "epoch": 0.7208585319577849, + "grad_norm": 1.6026621344997338, + "learning_rate": 4.709930531959008e-05, + "loss": 0.4271, + "step": 6079 + }, + { + "epoch": 0.7209771137199099, + "grad_norm": 1.2532736098180903, + "learning_rate": 4.7098182954229046e-05, + "loss": 0.3911, + "step": 6080 + }, + { + "epoch": 0.7210956954820349, + "grad_norm": 1.6874586795923738, + "learning_rate": 4.709706038514987e-05, + "loss": 0.5757, + "step": 6081 + }, + { + "epoch": 0.7212142772441599, + "grad_norm": 1.3503404322641774, + "learning_rate": 4.709593761236289e-05, + "loss": 0.4268, + "step": 6082 + }, + { + "epoch": 0.7213328590062849, + "grad_norm": 1.949233066843344, + "learning_rate": 4.709481463587846e-05, + "loss": 0.5557, + "step": 6083 + }, + { + "epoch": 0.7214514407684098, + "grad_norm": 1.5681717012804985, + "learning_rate": 4.709369145570694e-05, + "loss": 0.566, + "step": 6084 + }, + { + "epoch": 0.7215700225305348, + "grad_norm": 1.9754683427462, + "learning_rate": 4.709256807185867e-05, + "loss": 0.6635, + "step": 6085 + }, + { + "epoch": 0.7216886042926598, + "grad_norm": 1.6730690473100094, + "learning_rate": 4.709144448434402e-05, + "loss": 0.5168, + "step": 6086 + }, + { + "epoch": 0.7218071860547848, + "grad_norm": 1.5928167540314504, + "learning_rate": 4.709032069317334e-05, + "loss": 0.5256, + "step": 6087 + }, + { + "epoch": 0.7219257678169098, + "grad_norm": 1.899830145254083, + "learning_rate": 4.708919669835699e-05, + "loss": 0.792, + "step": 6088 + }, + { + "epoch": 0.7220443495790347, + "grad_norm": 1.1766100104843935, + "learning_rate": 4.7088072499905345e-05, + "loss": 0.3604, + "step": 6089 + }, + { + "epoch": 0.7221629313411597, + "grad_norm": 1.8544421159246804, + "learning_rate": 4.7086948097828744e-05, + "loss": 0.7563, + "step": 6090 + }, + { + "epoch": 0.7222815131032847, + "grad_norm": 1.352637704505466, + "learning_rate": 4.708582349213757e-05, + "loss": 0.5267, + "step": 6091 + }, + { + "epoch": 0.7224000948654097, + "grad_norm": 1.7480040727560886, + "learning_rate": 4.708469868284219e-05, + "loss": 0.5927, + "step": 6092 + }, + { + "epoch": 0.7225186766275347, + "grad_norm": 1.4510495934835725, + "learning_rate": 4.7083573669952975e-05, + "loss": 0.5037, + "step": 6093 + }, + { + "epoch": 0.7226372583896596, + "grad_norm": 1.4686067214467082, + "learning_rate": 4.7082448453480284e-05, + "loss": 0.6015, + "step": 6094 + }, + { + "epoch": 0.7227558401517846, + "grad_norm": 1.275748824337452, + "learning_rate": 4.70813230334345e-05, + "loss": 0.3506, + "step": 6095 + }, + { + "epoch": 0.7228744219139096, + "grad_norm": 1.4045493416032426, + "learning_rate": 4.708019740982599e-05, + "loss": 0.494, + "step": 6096 + }, + { + "epoch": 0.7229930036760346, + "grad_norm": 1.433057706514785, + "learning_rate": 4.7079071582665144e-05, + "loss": 0.3871, + "step": 6097 + }, + { + "epoch": 0.7231115854381596, + "grad_norm": 1.096336106973171, + "learning_rate": 4.707794555196233e-05, + "loss": 0.3349, + "step": 6098 + }, + { + "epoch": 0.7232301672002845, + "grad_norm": 1.4653026073170068, + "learning_rate": 4.707681931772793e-05, + "loss": 0.4744, + "step": 6099 + }, + { + "epoch": 0.7233487489624095, + "grad_norm": 1.4702433964655757, + "learning_rate": 4.707569287997233e-05, + "loss": 0.5952, + "step": 6100 + }, + { + "epoch": 0.7234673307245346, + "grad_norm": 1.3176293396136916, + "learning_rate": 4.707456623870591e-05, + "loss": 0.4497, + "step": 6101 + }, + { + "epoch": 0.7235859124866596, + "grad_norm": 1.2449112754993978, + "learning_rate": 4.707343939393906e-05, + "loss": 0.467, + "step": 6102 + }, + { + "epoch": 0.7237044942487846, + "grad_norm": 1.2954216031080554, + "learning_rate": 4.707231234568217e-05, + "loss": 0.4541, + "step": 6103 + }, + { + "epoch": 0.7238230760109096, + "grad_norm": 1.4922824390051814, + "learning_rate": 4.707118509394563e-05, + "loss": 0.4825, + "step": 6104 + }, + { + "epoch": 0.7239416577730345, + "grad_norm": 1.6072524405283446, + "learning_rate": 4.707005763873982e-05, + "loss": 0.6649, + "step": 6105 + }, + { + "epoch": 0.7240602395351595, + "grad_norm": 1.1182452364273174, + "learning_rate": 4.706892998007515e-05, + "loss": 0.3064, + "step": 6106 + }, + { + "epoch": 0.7241788212972845, + "grad_norm": 1.375058087510993, + "learning_rate": 4.7067802117962e-05, + "loss": 0.3902, + "step": 6107 + }, + { + "epoch": 0.7242974030594095, + "grad_norm": 1.6848531189701381, + "learning_rate": 4.7066674052410784e-05, + "loss": 0.4881, + "step": 6108 + }, + { + "epoch": 0.7244159848215345, + "grad_norm": 1.3644043370041457, + "learning_rate": 4.7065545783431894e-05, + "loss": 0.5179, + "step": 6109 + }, + { + "epoch": 0.7245345665836594, + "grad_norm": 1.7573666511370971, + "learning_rate": 4.7064417311035725e-05, + "loss": 0.5681, + "step": 6110 + }, + { + "epoch": 0.7246531483457844, + "grad_norm": 2.1447510082031775, + "learning_rate": 4.7063288635232685e-05, + "loss": 0.6742, + "step": 6111 + }, + { + "epoch": 0.7247717301079094, + "grad_norm": 1.2430738151333283, + "learning_rate": 4.706215975603319e-05, + "loss": 0.3815, + "step": 6112 + }, + { + "epoch": 0.7248903118700344, + "grad_norm": 1.8866850891453646, + "learning_rate": 4.706103067344762e-05, + "loss": 0.6571, + "step": 6113 + }, + { + "epoch": 0.7250088936321594, + "grad_norm": 1.6442078766946733, + "learning_rate": 4.705990138748641e-05, + "loss": 0.5493, + "step": 6114 + }, + { + "epoch": 0.7251274753942843, + "grad_norm": 1.298919915916663, + "learning_rate": 4.7058771898159956e-05, + "loss": 0.4686, + "step": 6115 + }, + { + "epoch": 0.7252460571564093, + "grad_norm": 1.5927623984654276, + "learning_rate": 4.705764220547868e-05, + "loss": 0.4893, + "step": 6116 + }, + { + "epoch": 0.7253646389185343, + "grad_norm": 1.4456115134497547, + "learning_rate": 4.7056512309452996e-05, + "loss": 0.4801, + "step": 6117 + }, + { + "epoch": 0.7254832206806593, + "grad_norm": 1.0179228604227588, + "learning_rate": 4.705538221009331e-05, + "loss": 0.3511, + "step": 6118 + }, + { + "epoch": 0.7256018024427843, + "grad_norm": 1.4692050879565346, + "learning_rate": 4.705425190741004e-05, + "loss": 0.5707, + "step": 6119 + }, + { + "epoch": 0.7257203842049093, + "grad_norm": 1.1014355604907669, + "learning_rate": 4.7053121401413626e-05, + "loss": 0.3457, + "step": 6120 + }, + { + "epoch": 0.7258389659670342, + "grad_norm": 1.3306585520792216, + "learning_rate": 4.7051990692114465e-05, + "loss": 0.4919, + "step": 6121 + }, + { + "epoch": 0.7259575477291592, + "grad_norm": 1.1324490986329399, + "learning_rate": 4.705085977952299e-05, + "loss": 0.4341, + "step": 6122 + }, + { + "epoch": 0.7260761294912842, + "grad_norm": 1.366145845239357, + "learning_rate": 4.704972866364964e-05, + "loss": 0.4886, + "step": 6123 + }, + { + "epoch": 0.7261947112534092, + "grad_norm": 1.4748933511031468, + "learning_rate": 4.704859734450483e-05, + "loss": 0.5491, + "step": 6124 + }, + { + "epoch": 0.7263132930155342, + "grad_norm": 1.4539515014673203, + "learning_rate": 4.704746582209898e-05, + "loss": 0.6176, + "step": 6125 + }, + { + "epoch": 0.7264318747776592, + "grad_norm": 1.1830381252103113, + "learning_rate": 4.704633409644254e-05, + "loss": 0.5141, + "step": 6126 + }, + { + "epoch": 0.7265504565397842, + "grad_norm": 1.526427468684614, + "learning_rate": 4.704520216754593e-05, + "loss": 0.5824, + "step": 6127 + }, + { + "epoch": 0.7266690383019092, + "grad_norm": 1.2105971405103786, + "learning_rate": 4.704407003541959e-05, + "loss": 0.3537, + "step": 6128 + }, + { + "epoch": 0.7267876200640342, + "grad_norm": 1.2663308055695888, + "learning_rate": 4.7042937700073955e-05, + "loss": 0.4999, + "step": 6129 + }, + { + "epoch": 0.7269062018261592, + "grad_norm": 1.2129383480140594, + "learning_rate": 4.704180516151947e-05, + "loss": 0.344, + "step": 6130 + }, + { + "epoch": 0.7270247835882842, + "grad_norm": 1.5349390517440467, + "learning_rate": 4.7040672419766566e-05, + "loss": 0.4624, + "step": 6131 + }, + { + "epoch": 0.7271433653504091, + "grad_norm": 1.5005467457808328, + "learning_rate": 4.703953947482569e-05, + "loss": 0.4436, + "step": 6132 + }, + { + "epoch": 0.7272619471125341, + "grad_norm": 1.5601081139531303, + "learning_rate": 4.70384063267073e-05, + "loss": 0.5558, + "step": 6133 + }, + { + "epoch": 0.7273805288746591, + "grad_norm": 1.7806659284043609, + "learning_rate": 4.703727297542181e-05, + "loss": 0.4965, + "step": 6134 + }, + { + "epoch": 0.7274991106367841, + "grad_norm": 1.4418226123121622, + "learning_rate": 4.70361394209797e-05, + "loss": 0.4394, + "step": 6135 + }, + { + "epoch": 0.727617692398909, + "grad_norm": 1.9574513130486417, + "learning_rate": 4.703500566339139e-05, + "loss": 0.6418, + "step": 6136 + }, + { + "epoch": 0.727736274161034, + "grad_norm": 1.6760989125310386, + "learning_rate": 4.7033871702667373e-05, + "loss": 0.5388, + "step": 6137 + }, + { + "epoch": 0.727854855923159, + "grad_norm": 1.074912096626831, + "learning_rate": 4.7032737538818063e-05, + "loss": 0.3293, + "step": 6138 + }, + { + "epoch": 0.727973437685284, + "grad_norm": 1.237697926830716, + "learning_rate": 4.703160317185393e-05, + "loss": 0.4246, + "step": 6139 + }, + { + "epoch": 0.728092019447409, + "grad_norm": 1.0479735513351773, + "learning_rate": 4.703046860178544e-05, + "loss": 0.3414, + "step": 6140 + }, + { + "epoch": 0.728210601209534, + "grad_norm": 1.6145324244652244, + "learning_rate": 4.702933382862304e-05, + "loss": 0.5514, + "step": 6141 + }, + { + "epoch": 0.7283291829716589, + "grad_norm": 1.6033699810295368, + "learning_rate": 4.70281988523772e-05, + "loss": 0.4655, + "step": 6142 + }, + { + "epoch": 0.7284477647337839, + "grad_norm": 1.79396504532826, + "learning_rate": 4.7027063673058385e-05, + "loss": 0.4765, + "step": 6143 + }, + { + "epoch": 0.7285663464959089, + "grad_norm": 1.04439722632225, + "learning_rate": 4.7025928290677045e-05, + "loss": 0.3362, + "step": 6144 + }, + { + "epoch": 0.7286849282580339, + "grad_norm": 1.5059568415885345, + "learning_rate": 4.702479270524366e-05, + "loss": 0.4121, + "step": 6145 + }, + { + "epoch": 0.7288035100201589, + "grad_norm": 1.423702517425788, + "learning_rate": 4.70236569167687e-05, + "loss": 0.5228, + "step": 6146 + }, + { + "epoch": 0.7289220917822838, + "grad_norm": 1.4425488350041447, + "learning_rate": 4.702252092526262e-05, + "loss": 0.4691, + "step": 6147 + }, + { + "epoch": 0.7290406735444088, + "grad_norm": 1.5632808079508402, + "learning_rate": 4.7021384730735906e-05, + "loss": 0.5509, + "step": 6148 + }, + { + "epoch": 0.7291592553065338, + "grad_norm": 1.1668290562334627, + "learning_rate": 4.7020248333199037e-05, + "loss": 0.2994, + "step": 6149 + }, + { + "epoch": 0.7292778370686588, + "grad_norm": 1.4527775611446538, + "learning_rate": 4.701911173266247e-05, + "loss": 0.4897, + "step": 6150 + }, + { + "epoch": 0.7293964188307839, + "grad_norm": 1.4109943976971817, + "learning_rate": 4.7017974929136696e-05, + "loss": 0.4182, + "step": 6151 + }, + { + "epoch": 0.7295150005929089, + "grad_norm": 1.5438995468183552, + "learning_rate": 4.70168379226322e-05, + "loss": 0.5807, + "step": 6152 + }, + { + "epoch": 0.7296335823550338, + "grad_norm": 1.4552135237747463, + "learning_rate": 4.701570071315945e-05, + "loss": 0.4432, + "step": 6153 + }, + { + "epoch": 0.7297521641171588, + "grad_norm": 0.9821154338245045, + "learning_rate": 4.7014563300728945e-05, + "loss": 0.3268, + "step": 6154 + }, + { + "epoch": 0.7298707458792838, + "grad_norm": 1.7380504818896743, + "learning_rate": 4.701342568535115e-05, + "loss": 0.6064, + "step": 6155 + }, + { + "epoch": 0.7299893276414088, + "grad_norm": 1.554833338188221, + "learning_rate": 4.7012287867036574e-05, + "loss": 0.4653, + "step": 6156 + }, + { + "epoch": 0.7301079094035338, + "grad_norm": 1.4807020832089637, + "learning_rate": 4.70111498457957e-05, + "loss": 0.5737, + "step": 6157 + }, + { + "epoch": 0.7302264911656587, + "grad_norm": 1.0859189164372294, + "learning_rate": 4.7010011621639014e-05, + "loss": 0.3405, + "step": 6158 + }, + { + "epoch": 0.7303450729277837, + "grad_norm": 1.6008969398029698, + "learning_rate": 4.700887319457701e-05, + "loss": 0.5306, + "step": 6159 + }, + { + "epoch": 0.7304636546899087, + "grad_norm": 1.390401548286397, + "learning_rate": 4.700773456462018e-05, + "loss": 0.38, + "step": 6160 + }, + { + "epoch": 0.7305822364520337, + "grad_norm": 1.34474864340512, + "learning_rate": 4.700659573177903e-05, + "loss": 0.423, + "step": 6161 + }, + { + "epoch": 0.7307008182141587, + "grad_norm": 1.0637285030638972, + "learning_rate": 4.700545669606405e-05, + "loss": 0.3454, + "step": 6162 + }, + { + "epoch": 0.7308193999762836, + "grad_norm": 1.1108455204122984, + "learning_rate": 4.700431745748575e-05, + "loss": 0.3514, + "step": 6163 + }, + { + "epoch": 0.7309379817384086, + "grad_norm": 1.315841038732577, + "learning_rate": 4.700317801605462e-05, + "loss": 0.4903, + "step": 6164 + }, + { + "epoch": 0.7310565635005336, + "grad_norm": 1.4186120888516303, + "learning_rate": 4.700203837178118e-05, + "loss": 0.5242, + "step": 6165 + }, + { + "epoch": 0.7311751452626586, + "grad_norm": 1.2177020366633406, + "learning_rate": 4.700089852467592e-05, + "loss": 0.3933, + "step": 6166 + }, + { + "epoch": 0.7312937270247836, + "grad_norm": 1.48942028659979, + "learning_rate": 4.699975847474936e-05, + "loss": 0.3509, + "step": 6167 + }, + { + "epoch": 0.7314123087869085, + "grad_norm": 1.600936471918868, + "learning_rate": 4.6998618222011994e-05, + "loss": 0.5253, + "step": 6168 + }, + { + "epoch": 0.7315308905490335, + "grad_norm": 1.8584273433299079, + "learning_rate": 4.699747776647435e-05, + "loss": 0.521, + "step": 6169 + }, + { + "epoch": 0.7316494723111585, + "grad_norm": 1.4689302738485108, + "learning_rate": 4.6996337108146934e-05, + "loss": 0.4587, + "step": 6170 + }, + { + "epoch": 0.7317680540732835, + "grad_norm": 1.3531526689751616, + "learning_rate": 4.6995196247040265e-05, + "loss": 0.3281, + "step": 6171 + }, + { + "epoch": 0.7318866358354085, + "grad_norm": 1.13607446537375, + "learning_rate": 4.6994055183164866e-05, + "loss": 0.3423, + "step": 6172 + }, + { + "epoch": 0.7320052175975335, + "grad_norm": 1.0883323003081955, + "learning_rate": 4.6992913916531246e-05, + "loss": 0.3638, + "step": 6173 + }, + { + "epoch": 0.7321237993596584, + "grad_norm": 1.2238382530147618, + "learning_rate": 4.6991772447149916e-05, + "loss": 0.502, + "step": 6174 + }, + { + "epoch": 0.7322423811217834, + "grad_norm": 1.6985040053128915, + "learning_rate": 4.699063077503143e-05, + "loss": 0.6741, + "step": 6175 + }, + { + "epoch": 0.7323609628839085, + "grad_norm": 1.3960324352697886, + "learning_rate": 4.6989488900186286e-05, + "loss": 0.4824, + "step": 6176 + }, + { + "epoch": 0.7324795446460335, + "grad_norm": 1.5055420216694377, + "learning_rate": 4.698834682262502e-05, + "loss": 0.5284, + "step": 6177 + }, + { + "epoch": 0.7325981264081585, + "grad_norm": 1.7294854946126987, + "learning_rate": 4.6987204542358165e-05, + "loss": 0.4152, + "step": 6178 + }, + { + "epoch": 0.7327167081702834, + "grad_norm": 2.274259617256788, + "learning_rate": 4.6986062059396244e-05, + "loss": 0.7068, + "step": 6179 + }, + { + "epoch": 0.7328352899324084, + "grad_norm": 1.443472081441776, + "learning_rate": 4.698491937374979e-05, + "loss": 0.5453, + "step": 6180 + }, + { + "epoch": 0.7329538716945334, + "grad_norm": 1.3129736414853164, + "learning_rate": 4.6983776485429334e-05, + "loss": 0.4856, + "step": 6181 + }, + { + "epoch": 0.7330724534566584, + "grad_norm": 1.5444262671347668, + "learning_rate": 4.698263339444543e-05, + "loss": 0.5288, + "step": 6182 + }, + { + "epoch": 0.7331910352187834, + "grad_norm": 1.8276854081251293, + "learning_rate": 4.698149010080859e-05, + "loss": 0.5939, + "step": 6183 + }, + { + "epoch": 0.7333096169809084, + "grad_norm": 1.2800965119769772, + "learning_rate": 4.6980346604529373e-05, + "loss": 0.3721, + "step": 6184 + }, + { + "epoch": 0.7334281987430333, + "grad_norm": 1.1686261227419101, + "learning_rate": 4.697920290561831e-05, + "loss": 0.4384, + "step": 6185 + }, + { + "epoch": 0.7335467805051583, + "grad_norm": 1.2403214374962317, + "learning_rate": 4.697805900408596e-05, + "loss": 0.3816, + "step": 6186 + }, + { + "epoch": 0.7336653622672833, + "grad_norm": 1.2670523518517676, + "learning_rate": 4.6976914899942846e-05, + "loss": 0.3659, + "step": 6187 + }, + { + "epoch": 0.7337839440294083, + "grad_norm": 1.63392549491091, + "learning_rate": 4.697577059319953e-05, + "loss": 0.6715, + "step": 6188 + }, + { + "epoch": 0.7339025257915333, + "grad_norm": 1.175237018573613, + "learning_rate": 4.697462608386656e-05, + "loss": 0.3582, + "step": 6189 + }, + { + "epoch": 0.7340211075536582, + "grad_norm": 1.2353414323680718, + "learning_rate": 4.697348137195449e-05, + "loss": 0.3591, + "step": 6190 + }, + { + "epoch": 0.7341396893157832, + "grad_norm": 1.5115238865837388, + "learning_rate": 4.697233645747385e-05, + "loss": 0.5769, + "step": 6191 + }, + { + "epoch": 0.7342582710779082, + "grad_norm": 1.860457604990858, + "learning_rate": 4.697119134043523e-05, + "loss": 0.6327, + "step": 6192 + }, + { + "epoch": 0.7343768528400332, + "grad_norm": 1.1299392051405932, + "learning_rate": 4.6970046020849165e-05, + "loss": 0.3577, + "step": 6193 + }, + { + "epoch": 0.7344954346021582, + "grad_norm": 1.5139762648327395, + "learning_rate": 4.6968900498726206e-05, + "loss": 0.566, + "step": 6194 + }, + { + "epoch": 0.7346140163642831, + "grad_norm": 1.6584783683673403, + "learning_rate": 4.696775477407693e-05, + "loss": 0.5875, + "step": 6195 + }, + { + "epoch": 0.7347325981264081, + "grad_norm": 1.3178724581766523, + "learning_rate": 4.69666088469119e-05, + "loss": 0.3837, + "step": 6196 + }, + { + "epoch": 0.7348511798885331, + "grad_norm": 1.5518933656315026, + "learning_rate": 4.696546271724167e-05, + "loss": 0.523, + "step": 6197 + }, + { + "epoch": 0.7349697616506581, + "grad_norm": 1.4776493106478672, + "learning_rate": 4.6964316385076815e-05, + "loss": 0.5226, + "step": 6198 + }, + { + "epoch": 0.7350883434127831, + "grad_norm": 1.2762043009316175, + "learning_rate": 4.696316985042788e-05, + "loss": 0.4173, + "step": 6199 + }, + { + "epoch": 0.735206925174908, + "grad_norm": 1.5719072045010867, + "learning_rate": 4.696202311330547e-05, + "loss": 0.511, + "step": 6200 + }, + { + "epoch": 0.7353255069370331, + "grad_norm": 1.9067606581198728, + "learning_rate": 4.696087617372012e-05, + "loss": 0.7745, + "step": 6201 + }, + { + "epoch": 0.7354440886991581, + "grad_norm": 1.722439127736495, + "learning_rate": 4.6959729031682444e-05, + "loss": 0.748, + "step": 6202 + }, + { + "epoch": 0.7355626704612831, + "grad_norm": 1.1810700100815466, + "learning_rate": 4.695858168720298e-05, + "loss": 0.4605, + "step": 6203 + }, + { + "epoch": 0.7356812522234081, + "grad_norm": 1.150253266435276, + "learning_rate": 4.695743414029233e-05, + "loss": 0.3639, + "step": 6204 + }, + { + "epoch": 0.7357998339855331, + "grad_norm": 1.2596372997562701, + "learning_rate": 4.695628639096106e-05, + "loss": 0.4115, + "step": 6205 + }, + { + "epoch": 0.735918415747658, + "grad_norm": 1.3016674762842935, + "learning_rate": 4.695513843921975e-05, + "loss": 0.3847, + "step": 6206 + }, + { + "epoch": 0.736036997509783, + "grad_norm": 1.1780769422285366, + "learning_rate": 4.6953990285078985e-05, + "loss": 0.3555, + "step": 6207 + }, + { + "epoch": 0.736155579271908, + "grad_norm": 1.2754956577478063, + "learning_rate": 4.695284192854936e-05, + "loss": 0.4648, + "step": 6208 + }, + { + "epoch": 0.736274161034033, + "grad_norm": 1.1123335540746864, + "learning_rate": 4.6951693369641445e-05, + "loss": 0.3568, + "step": 6209 + }, + { + "epoch": 0.736392742796158, + "grad_norm": 1.4984428450490659, + "learning_rate": 4.695054460836584e-05, + "loss": 0.4622, + "step": 6210 + }, + { + "epoch": 0.7365113245582829, + "grad_norm": 1.139312819189427, + "learning_rate": 4.694939564473313e-05, + "loss": 0.3493, + "step": 6211 + }, + { + "epoch": 0.7366299063204079, + "grad_norm": 1.3611111494634414, + "learning_rate": 4.694824647875391e-05, + "loss": 0.604, + "step": 6212 + }, + { + "epoch": 0.7367484880825329, + "grad_norm": 1.3571835127280327, + "learning_rate": 4.694709711043877e-05, + "loss": 0.5375, + "step": 6213 + }, + { + "epoch": 0.7368670698446579, + "grad_norm": 1.6814267035413153, + "learning_rate": 4.6945947539798304e-05, + "loss": 0.4604, + "step": 6214 + }, + { + "epoch": 0.7369856516067829, + "grad_norm": 1.2757036064202247, + "learning_rate": 4.694479776684312e-05, + "loss": 0.4486, + "step": 6215 + }, + { + "epoch": 0.7371042333689078, + "grad_norm": 1.5798852454912613, + "learning_rate": 4.694364779158381e-05, + "loss": 0.4504, + "step": 6216 + }, + { + "epoch": 0.7372228151310328, + "grad_norm": 1.3489750600329697, + "learning_rate": 4.694249761403098e-05, + "loss": 0.3882, + "step": 6217 + }, + { + "epoch": 0.7373413968931578, + "grad_norm": 1.4958926842604283, + "learning_rate": 4.6941347234195224e-05, + "loss": 0.3677, + "step": 6218 + }, + { + "epoch": 0.7374599786552828, + "grad_norm": 1.6205198729670163, + "learning_rate": 4.694019665208715e-05, + "loss": 0.5831, + "step": 6219 + }, + { + "epoch": 0.7375785604174078, + "grad_norm": 1.4242864209446198, + "learning_rate": 4.693904586771738e-05, + "loss": 0.371, + "step": 6220 + }, + { + "epoch": 0.7376971421795327, + "grad_norm": 2.060640282675291, + "learning_rate": 4.69378948810965e-05, + "loss": 0.6788, + "step": 6221 + }, + { + "epoch": 0.7378157239416577, + "grad_norm": 1.5737639808977788, + "learning_rate": 4.693674369223513e-05, + "loss": 0.5484, + "step": 6222 + }, + { + "epoch": 0.7379343057037827, + "grad_norm": 1.4721894584509985, + "learning_rate": 4.693559230114389e-05, + "loss": 0.5108, + "step": 6223 + }, + { + "epoch": 0.7380528874659077, + "grad_norm": 1.780398861560955, + "learning_rate": 4.693444070783338e-05, + "loss": 0.5076, + "step": 6224 + }, + { + "epoch": 0.7381714692280328, + "grad_norm": 1.4972951523777451, + "learning_rate": 4.693328891231423e-05, + "loss": 0.3972, + "step": 6225 + }, + { + "epoch": 0.7382900509901578, + "grad_norm": 1.6700400692469686, + "learning_rate": 4.6932136914597055e-05, + "loss": 0.4477, + "step": 6226 + }, + { + "epoch": 0.7384086327522827, + "grad_norm": 1.9875202458537773, + "learning_rate": 4.693098471469247e-05, + "loss": 0.7133, + "step": 6227 + }, + { + "epoch": 0.7385272145144077, + "grad_norm": 1.5074036409175564, + "learning_rate": 4.69298323126111e-05, + "loss": 0.5093, + "step": 6228 + }, + { + "epoch": 0.7386457962765327, + "grad_norm": 1.5168591566486045, + "learning_rate": 4.692867970836357e-05, + "loss": 0.6296, + "step": 6229 + }, + { + "epoch": 0.7387643780386577, + "grad_norm": 1.5856867070195186, + "learning_rate": 4.6927526901960505e-05, + "loss": 0.5616, + "step": 6230 + }, + { + "epoch": 0.7388829598007827, + "grad_norm": 1.3820906911864215, + "learning_rate": 4.6926373893412526e-05, + "loss": 0.4506, + "step": 6231 + }, + { + "epoch": 0.7390015415629076, + "grad_norm": 1.166112472442942, + "learning_rate": 4.692522068273027e-05, + "loss": 0.3213, + "step": 6232 + }, + { + "epoch": 0.7391201233250326, + "grad_norm": 1.9808266662163505, + "learning_rate": 4.6924067269924365e-05, + "loss": 0.735, + "step": 6233 + }, + { + "epoch": 0.7392387050871576, + "grad_norm": 1.1112223004449522, + "learning_rate": 4.692291365500545e-05, + "loss": 0.3481, + "step": 6234 + }, + { + "epoch": 0.7393572868492826, + "grad_norm": 1.6713131974593587, + "learning_rate": 4.6921759837984145e-05, + "loss": 0.5537, + "step": 6235 + }, + { + "epoch": 0.7394758686114076, + "grad_norm": 1.523757764031677, + "learning_rate": 4.69206058188711e-05, + "loss": 0.4888, + "step": 6236 + }, + { + "epoch": 0.7395944503735326, + "grad_norm": 1.5293078920365217, + "learning_rate": 4.6919451597676954e-05, + "loss": 0.4426, + "step": 6237 + }, + { + "epoch": 0.7397130321356575, + "grad_norm": 1.5612062343967632, + "learning_rate": 4.691829717441234e-05, + "loss": 0.6002, + "step": 6238 + }, + { + "epoch": 0.7398316138977825, + "grad_norm": 1.4664504308550452, + "learning_rate": 4.691714254908791e-05, + "loss": 0.4037, + "step": 6239 + }, + { + "epoch": 0.7399501956599075, + "grad_norm": 1.4605012174306575, + "learning_rate": 4.6915987721714294e-05, + "loss": 0.516, + "step": 6240 + }, + { + "epoch": 0.7400687774220325, + "grad_norm": 1.3064238209515766, + "learning_rate": 4.691483269230215e-05, + "loss": 0.3833, + "step": 6241 + }, + { + "epoch": 0.7401873591841575, + "grad_norm": 1.7329444590012557, + "learning_rate": 4.6913677460862124e-05, + "loss": 0.4952, + "step": 6242 + }, + { + "epoch": 0.7403059409462824, + "grad_norm": 1.3613866062947992, + "learning_rate": 4.691252202740486e-05, + "loss": 0.4433, + "step": 6243 + }, + { + "epoch": 0.7404245227084074, + "grad_norm": 1.4322400382294767, + "learning_rate": 4.6911366391941016e-05, + "loss": 0.6692, + "step": 6244 + }, + { + "epoch": 0.7405431044705324, + "grad_norm": 1.8028155918342084, + "learning_rate": 4.691021055448125e-05, + "loss": 0.5646, + "step": 6245 + }, + { + "epoch": 0.7406616862326574, + "grad_norm": 1.2414361610253701, + "learning_rate": 4.69090545150362e-05, + "loss": 0.3502, + "step": 6246 + }, + { + "epoch": 0.7407802679947824, + "grad_norm": 1.3857572150620012, + "learning_rate": 4.690789827361654e-05, + "loss": 0.4687, + "step": 6247 + }, + { + "epoch": 0.7408988497569073, + "grad_norm": 1.1439656211907105, + "learning_rate": 4.6906741830232924e-05, + "loss": 0.3641, + "step": 6248 + }, + { + "epoch": 0.7410174315190323, + "grad_norm": 1.2691498579985017, + "learning_rate": 4.690558518489601e-05, + "loss": 0.3836, + "step": 6249 + }, + { + "epoch": 0.7411360132811574, + "grad_norm": 1.7393228924994169, + "learning_rate": 4.6904428337616465e-05, + "loss": 0.547, + "step": 6250 + }, + { + "epoch": 0.7412545950432824, + "grad_norm": 1.4299207937025349, + "learning_rate": 4.690327128840495e-05, + "loss": 0.5943, + "step": 6251 + }, + { + "epoch": 0.7413731768054074, + "grad_norm": 1.583028220196199, + "learning_rate": 4.6902114037272124e-05, + "loss": 0.4931, + "step": 6252 + }, + { + "epoch": 0.7414917585675324, + "grad_norm": 1.4690036246475067, + "learning_rate": 4.690095658422868e-05, + "loss": 0.6038, + "step": 6253 + }, + { + "epoch": 0.7416103403296573, + "grad_norm": 1.3607355683654334, + "learning_rate": 4.689979892928526e-05, + "loss": 0.4787, + "step": 6254 + }, + { + "epoch": 0.7417289220917823, + "grad_norm": 1.3121263825412204, + "learning_rate": 4.689864107245256e-05, + "loss": 0.3524, + "step": 6255 + }, + { + "epoch": 0.7418475038539073, + "grad_norm": 1.349502106272725, + "learning_rate": 4.689748301374124e-05, + "loss": 0.5354, + "step": 6256 + }, + { + "epoch": 0.7419660856160323, + "grad_norm": 1.2986048992157135, + "learning_rate": 4.689632475316198e-05, + "loss": 0.4841, + "step": 6257 + }, + { + "epoch": 0.7420846673781573, + "grad_norm": 1.4643493803807395, + "learning_rate": 4.6895166290725456e-05, + "loss": 0.5016, + "step": 6258 + }, + { + "epoch": 0.7422032491402822, + "grad_norm": 1.4210685599000255, + "learning_rate": 4.689400762644235e-05, + "loss": 0.4371, + "step": 6259 + }, + { + "epoch": 0.7423218309024072, + "grad_norm": 1.574054715333771, + "learning_rate": 4.6892848760323334e-05, + "loss": 0.6711, + "step": 6260 + }, + { + "epoch": 0.7424404126645322, + "grad_norm": 1.742093082502485, + "learning_rate": 4.6891689692379104e-05, + "loss": 0.6061, + "step": 6261 + }, + { + "epoch": 0.7425589944266572, + "grad_norm": 1.1977217080585496, + "learning_rate": 4.6890530422620336e-05, + "loss": 0.398, + "step": 6262 + }, + { + "epoch": 0.7426775761887822, + "grad_norm": 1.0460934136921634, + "learning_rate": 4.688937095105773e-05, + "loss": 0.4373, + "step": 6263 + }, + { + "epoch": 0.7427961579509071, + "grad_norm": 1.2671592083371985, + "learning_rate": 4.688821127770197e-05, + "loss": 0.3046, + "step": 6264 + }, + { + "epoch": 0.7429147397130321, + "grad_norm": 1.3426680731401661, + "learning_rate": 4.688705140256373e-05, + "loss": 0.4547, + "step": 6265 + }, + { + "epoch": 0.7430333214751571, + "grad_norm": 1.521964705164523, + "learning_rate": 4.688589132565372e-05, + "loss": 0.5848, + "step": 6266 + }, + { + "epoch": 0.7431519032372821, + "grad_norm": 1.4341454164559786, + "learning_rate": 4.6884731046982634e-05, + "loss": 0.5758, + "step": 6267 + }, + { + "epoch": 0.7432704849994071, + "grad_norm": 1.4419929430364473, + "learning_rate": 4.6883570566561166e-05, + "loss": 0.3937, + "step": 6268 + }, + { + "epoch": 0.743389066761532, + "grad_norm": 1.3850729824248333, + "learning_rate": 4.6882409884400004e-05, + "loss": 0.3761, + "step": 6269 + }, + { + "epoch": 0.743507648523657, + "grad_norm": 1.4464350177976217, + "learning_rate": 4.688124900050986e-05, + "loss": 0.5444, + "step": 6270 + }, + { + "epoch": 0.743626230285782, + "grad_norm": 1.3448928629987063, + "learning_rate": 4.688008791490144e-05, + "loss": 0.4035, + "step": 6271 + }, + { + "epoch": 0.743744812047907, + "grad_norm": 1.2816235349919038, + "learning_rate": 4.687892662758543e-05, + "loss": 0.3779, + "step": 6272 + }, + { + "epoch": 0.743863393810032, + "grad_norm": 1.1283749770743732, + "learning_rate": 4.6877765138572555e-05, + "loss": 0.3269, + "step": 6273 + }, + { + "epoch": 0.743981975572157, + "grad_norm": 1.5368536326371092, + "learning_rate": 4.6876603447873515e-05, + "loss": 0.4891, + "step": 6274 + }, + { + "epoch": 0.744100557334282, + "grad_norm": 1.6549517169697632, + "learning_rate": 4.6875441555499014e-05, + "loss": 0.4507, + "step": 6275 + }, + { + "epoch": 0.744219139096407, + "grad_norm": 1.6954866653335727, + "learning_rate": 4.687427946145976e-05, + "loss": 0.5253, + "step": 6276 + }, + { + "epoch": 0.744337720858532, + "grad_norm": 1.453392360831875, + "learning_rate": 4.687311716576648e-05, + "loss": 0.475, + "step": 6277 + }, + { + "epoch": 0.744456302620657, + "grad_norm": 1.737531614806573, + "learning_rate": 4.6871954668429886e-05, + "loss": 0.5169, + "step": 6278 + }, + { + "epoch": 0.744574884382782, + "grad_norm": 1.3979650710063858, + "learning_rate": 4.6870791969460685e-05, + "loss": 0.4533, + "step": 6279 + }, + { + "epoch": 0.744693466144907, + "grad_norm": 2.0513704355663553, + "learning_rate": 4.68696290688696e-05, + "loss": 0.7054, + "step": 6280 + }, + { + "epoch": 0.7448120479070319, + "grad_norm": 1.1660077427246562, + "learning_rate": 4.6868465966667364e-05, + "loss": 0.2725, + "step": 6281 + }, + { + "epoch": 0.7449306296691569, + "grad_norm": 1.4810992969157994, + "learning_rate": 4.686730266286468e-05, + "loss": 0.4782, + "step": 6282 + }, + { + "epoch": 0.7450492114312819, + "grad_norm": 1.361708325245046, + "learning_rate": 4.6866139157472277e-05, + "loss": 0.4006, + "step": 6283 + }, + { + "epoch": 0.7451677931934069, + "grad_norm": 1.3644987167457872, + "learning_rate": 4.6864975450500895e-05, + "loss": 0.3732, + "step": 6284 + }, + { + "epoch": 0.7452863749555318, + "grad_norm": 1.281873756301015, + "learning_rate": 4.6863811541961245e-05, + "loss": 0.4067, + "step": 6285 + }, + { + "epoch": 0.7454049567176568, + "grad_norm": 1.6036346586664754, + "learning_rate": 4.686264743186407e-05, + "loss": 0.4754, + "step": 6286 + }, + { + "epoch": 0.7455235384797818, + "grad_norm": 1.5166698070384537, + "learning_rate": 4.6861483120220096e-05, + "loss": 0.4348, + "step": 6287 + }, + { + "epoch": 0.7456421202419068, + "grad_norm": 1.4206860095627198, + "learning_rate": 4.686031860704005e-05, + "loss": 0.5118, + "step": 6288 + }, + { + "epoch": 0.7457607020040318, + "grad_norm": 1.2567813032335244, + "learning_rate": 4.685915389233467e-05, + "loss": 0.3513, + "step": 6289 + }, + { + "epoch": 0.7458792837661568, + "grad_norm": 1.6151164669959355, + "learning_rate": 4.685798897611471e-05, + "loss": 0.6388, + "step": 6290 + }, + { + "epoch": 0.7459978655282817, + "grad_norm": 1.240300975613239, + "learning_rate": 4.6856823858390884e-05, + "loss": 0.4301, + "step": 6291 + }, + { + "epoch": 0.7461164472904067, + "grad_norm": 1.0376382955659638, + "learning_rate": 4.6855658539173955e-05, + "loss": 0.4068, + "step": 6292 + }, + { + "epoch": 0.7462350290525317, + "grad_norm": 0.7771924152856599, + "learning_rate": 4.685449301847465e-05, + "loss": 0.2652, + "step": 6293 + }, + { + "epoch": 0.7463536108146567, + "grad_norm": 1.4235350644691864, + "learning_rate": 4.6853327296303714e-05, + "loss": 0.4568, + "step": 6294 + }, + { + "epoch": 0.7464721925767817, + "grad_norm": 1.5592215962676588, + "learning_rate": 4.6852161372671896e-05, + "loss": 0.4522, + "step": 6295 + }, + { + "epoch": 0.7465907743389066, + "grad_norm": 1.0785573479148498, + "learning_rate": 4.6850995247589955e-05, + "loss": 0.3245, + "step": 6296 + }, + { + "epoch": 0.7467093561010316, + "grad_norm": 1.5374996539087347, + "learning_rate": 4.684982892106863e-05, + "loss": 0.3583, + "step": 6297 + }, + { + "epoch": 0.7468279378631566, + "grad_norm": 1.9464221293527022, + "learning_rate": 4.6848662393118684e-05, + "loss": 0.5642, + "step": 6298 + }, + { + "epoch": 0.7469465196252816, + "grad_norm": 1.3666059234077679, + "learning_rate": 4.684749566375085e-05, + "loss": 0.422, + "step": 6299 + }, + { + "epoch": 0.7470651013874067, + "grad_norm": 1.6365293180621938, + "learning_rate": 4.684632873297591e-05, + "loss": 0.604, + "step": 6300 + }, + { + "epoch": 0.7471836831495317, + "grad_norm": 1.6855090959004302, + "learning_rate": 4.68451616008046e-05, + "loss": 0.6004, + "step": 6301 + }, + { + "epoch": 0.7473022649116566, + "grad_norm": 1.560049246259256, + "learning_rate": 4.6843994267247695e-05, + "loss": 0.5244, + "step": 6302 + }, + { + "epoch": 0.7474208466737816, + "grad_norm": 0.8921498237627483, + "learning_rate": 4.684282673231595e-05, + "loss": 0.275, + "step": 6303 + }, + { + "epoch": 0.7475394284359066, + "grad_norm": 1.2897714879029365, + "learning_rate": 4.684165899602012e-05, + "loss": 0.4828, + "step": 6304 + }, + { + "epoch": 0.7476580101980316, + "grad_norm": 1.2839966808908505, + "learning_rate": 4.684049105837098e-05, + "loss": 0.3765, + "step": 6305 + }, + { + "epoch": 0.7477765919601566, + "grad_norm": 1.2264015874872136, + "learning_rate": 4.68393229193793e-05, + "loss": 0.368, + "step": 6306 + }, + { + "epoch": 0.7478951737222815, + "grad_norm": 1.5062032870307644, + "learning_rate": 4.683815457905584e-05, + "loss": 0.5263, + "step": 6307 + }, + { + "epoch": 0.7480137554844065, + "grad_norm": 1.5185731592604483, + "learning_rate": 4.683698603741138e-05, + "loss": 0.4129, + "step": 6308 + }, + { + "epoch": 0.7481323372465315, + "grad_norm": 0.9903665611370346, + "learning_rate": 4.683581729445668e-05, + "loss": 0.3155, + "step": 6309 + }, + { + "epoch": 0.7482509190086565, + "grad_norm": 1.1603193654960873, + "learning_rate": 4.6834648350202525e-05, + "loss": 0.2924, + "step": 6310 + }, + { + "epoch": 0.7483695007707815, + "grad_norm": 1.3101852942949073, + "learning_rate": 4.6833479204659695e-05, + "loss": 0.3953, + "step": 6311 + }, + { + "epoch": 0.7484880825329064, + "grad_norm": 1.898794508044423, + "learning_rate": 4.6832309857838946e-05, + "loss": 0.5173, + "step": 6312 + }, + { + "epoch": 0.7486066642950314, + "grad_norm": 1.2706464865648388, + "learning_rate": 4.683114030975108e-05, + "loss": 0.454, + "step": 6313 + }, + { + "epoch": 0.7487252460571564, + "grad_norm": 1.930766446772458, + "learning_rate": 4.682997056040687e-05, + "loss": 0.6507, + "step": 6314 + }, + { + "epoch": 0.7488438278192814, + "grad_norm": 1.580148496131274, + "learning_rate": 4.68288006098171e-05, + "loss": 0.4644, + "step": 6315 + }, + { + "epoch": 0.7489624095814064, + "grad_norm": 1.1073748525734413, + "learning_rate": 4.682763045799256e-05, + "loss": 0.3289, + "step": 6316 + }, + { + "epoch": 0.7490809913435313, + "grad_norm": 1.6606635855200873, + "learning_rate": 4.6826460104944035e-05, + "loss": 0.6108, + "step": 6317 + }, + { + "epoch": 0.7491995731056563, + "grad_norm": 1.711615462563066, + "learning_rate": 4.682528955068231e-05, + "loss": 0.5268, + "step": 6318 + }, + { + "epoch": 0.7493181548677813, + "grad_norm": 1.1788679942905094, + "learning_rate": 4.682411879521818e-05, + "loss": 0.4082, + "step": 6319 + }, + { + "epoch": 0.7494367366299063, + "grad_norm": 0.9787727467315984, + "learning_rate": 4.682294783856244e-05, + "loss": 0.2536, + "step": 6320 + }, + { + "epoch": 0.7495553183920313, + "grad_norm": 1.560339068923449, + "learning_rate": 4.6821776680725876e-05, + "loss": 0.4788, + "step": 6321 + }, + { + "epoch": 0.7496739001541562, + "grad_norm": 1.137331913528358, + "learning_rate": 4.68206053217193e-05, + "loss": 0.4129, + "step": 6322 + }, + { + "epoch": 0.7497924819162812, + "grad_norm": 1.5547947060057168, + "learning_rate": 4.681943376155349e-05, + "loss": 0.4586, + "step": 6323 + }, + { + "epoch": 0.7499110636784062, + "grad_norm": 1.4711917056605195, + "learning_rate": 4.6818262000239264e-05, + "loss": 0.3921, + "step": 6324 + }, + { + "epoch": 0.7500296454405313, + "grad_norm": 1.3853343833891825, + "learning_rate": 4.6817090037787416e-05, + "loss": 0.4461, + "step": 6325 + }, + { + "epoch": 0.7501482272026563, + "grad_norm": 1.5295401507384065, + "learning_rate": 4.6815917874208746e-05, + "loss": 0.4513, + "step": 6326 + }, + { + "epoch": 0.7502668089647813, + "grad_norm": 1.5731740575822744, + "learning_rate": 4.681474550951408e-05, + "loss": 0.5532, + "step": 6327 + }, + { + "epoch": 0.7503853907269062, + "grad_norm": 1.3947035345596193, + "learning_rate": 4.68135729437142e-05, + "loss": 0.3727, + "step": 6328 + }, + { + "epoch": 0.7505039724890312, + "grad_norm": 1.6756604812947764, + "learning_rate": 4.681240017681993e-05, + "loss": 0.4802, + "step": 6329 + }, + { + "epoch": 0.7506225542511562, + "grad_norm": 2.2677092230008826, + "learning_rate": 4.6811227208842076e-05, + "loss": 0.6117, + "step": 6330 + }, + { + "epoch": 0.7507411360132812, + "grad_norm": 1.5782431756054374, + "learning_rate": 4.681005403979146e-05, + "loss": 0.426, + "step": 6331 + }, + { + "epoch": 0.7508597177754062, + "grad_norm": 1.3447748555349468, + "learning_rate": 4.680888066967889e-05, + "loss": 0.3053, + "step": 6332 + }, + { + "epoch": 0.7509782995375311, + "grad_norm": 1.7228981969016572, + "learning_rate": 4.680770709851517e-05, + "loss": 0.5614, + "step": 6333 + }, + { + "epoch": 0.7510968812996561, + "grad_norm": 2.2698421409751544, + "learning_rate": 4.680653332631115e-05, + "loss": 0.9063, + "step": 6334 + }, + { + "epoch": 0.7512154630617811, + "grad_norm": 1.6218961033729142, + "learning_rate": 4.680535935307763e-05, + "loss": 0.5079, + "step": 6335 + }, + { + "epoch": 0.7513340448239061, + "grad_norm": 1.311630948240553, + "learning_rate": 4.6804185178825434e-05, + "loss": 0.3865, + "step": 6336 + }, + { + "epoch": 0.7514526265860311, + "grad_norm": 1.6246719049163065, + "learning_rate": 4.680301080356539e-05, + "loss": 0.5459, + "step": 6337 + }, + { + "epoch": 0.751571208348156, + "grad_norm": 1.972300465129122, + "learning_rate": 4.680183622730833e-05, + "loss": 0.6248, + "step": 6338 + }, + { + "epoch": 0.751689790110281, + "grad_norm": 1.343925631327889, + "learning_rate": 4.680066145006506e-05, + "loss": 0.4111, + "step": 6339 + }, + { + "epoch": 0.751808371872406, + "grad_norm": 1.3245543720241955, + "learning_rate": 4.679948647184643e-05, + "loss": 0.4471, + "step": 6340 + }, + { + "epoch": 0.751926953634531, + "grad_norm": 1.1057681698699755, + "learning_rate": 4.6798311292663265e-05, + "loss": 0.3182, + "step": 6341 + }, + { + "epoch": 0.752045535396656, + "grad_norm": 1.1435927650034214, + "learning_rate": 4.679713591252641e-05, + "loss": 0.3889, + "step": 6342 + }, + { + "epoch": 0.752164117158781, + "grad_norm": 1.302082473723719, + "learning_rate": 4.6795960331446684e-05, + "loss": 0.3491, + "step": 6343 + }, + { + "epoch": 0.7522826989209059, + "grad_norm": 1.4407345549067014, + "learning_rate": 4.679478454943494e-05, + "loss": 0.5028, + "step": 6344 + }, + { + "epoch": 0.7524012806830309, + "grad_norm": 1.2970821893293363, + "learning_rate": 4.6793608566502e-05, + "loss": 0.4172, + "step": 6345 + }, + { + "epoch": 0.7525198624451559, + "grad_norm": 1.7610362612712684, + "learning_rate": 4.679243238265872e-05, + "loss": 0.5603, + "step": 6346 + }, + { + "epoch": 0.7526384442072809, + "grad_norm": 1.5376896733591277, + "learning_rate": 4.6791255997915926e-05, + "loss": 0.5489, + "step": 6347 + }, + { + "epoch": 0.7527570259694059, + "grad_norm": 1.4221101972515386, + "learning_rate": 4.6790079412284485e-05, + "loss": 0.4881, + "step": 6348 + }, + { + "epoch": 0.7528756077315308, + "grad_norm": 1.2570054042193417, + "learning_rate": 4.678890262577523e-05, + "loss": 0.3891, + "step": 6349 + }, + { + "epoch": 0.7529941894936559, + "grad_norm": 1.2780964507939017, + "learning_rate": 4.6787725638399016e-05, + "loss": 0.4088, + "step": 6350 + }, + { + "epoch": 0.7531127712557809, + "grad_norm": 1.2651787367497171, + "learning_rate": 4.6786548450166686e-05, + "loss": 0.4207, + "step": 6351 + }, + { + "epoch": 0.7532313530179059, + "grad_norm": 1.7399243249642313, + "learning_rate": 4.6785371061089095e-05, + "loss": 0.4997, + "step": 6352 + }, + { + "epoch": 0.7533499347800309, + "grad_norm": 1.4677619673833766, + "learning_rate": 4.678419347117709e-05, + "loss": 0.4275, + "step": 6353 + }, + { + "epoch": 0.7534685165421559, + "grad_norm": 1.274939620926509, + "learning_rate": 4.6783015680441547e-05, + "loss": 0.6621, + "step": 6354 + }, + { + "epoch": 0.7535870983042808, + "grad_norm": 1.647710065775666, + "learning_rate": 4.6781837688893307e-05, + "loss": 0.507, + "step": 6355 + }, + { + "epoch": 0.7537056800664058, + "grad_norm": 1.2466373663376678, + "learning_rate": 4.6780659496543235e-05, + "loss": 0.3296, + "step": 6356 + }, + { + "epoch": 0.7538242618285308, + "grad_norm": 1.5647744791648217, + "learning_rate": 4.677948110340219e-05, + "loss": 0.4529, + "step": 6357 + }, + { + "epoch": 0.7539428435906558, + "grad_norm": 1.296580911664781, + "learning_rate": 4.6778302509481046e-05, + "loss": 0.3588, + "step": 6358 + }, + { + "epoch": 0.7540614253527808, + "grad_norm": 1.5830925680805785, + "learning_rate": 4.6777123714790646e-05, + "loss": 0.5504, + "step": 6359 + }, + { + "epoch": 0.7541800071149057, + "grad_norm": 1.529421546470144, + "learning_rate": 4.677594471934188e-05, + "loss": 0.4854, + "step": 6360 + }, + { + "epoch": 0.7542985888770307, + "grad_norm": 1.3575573438663817, + "learning_rate": 4.6774765523145596e-05, + "loss": 0.5334, + "step": 6361 + }, + { + "epoch": 0.7544171706391557, + "grad_norm": 1.1371287831170098, + "learning_rate": 4.677358612621269e-05, + "loss": 0.3899, + "step": 6362 + }, + { + "epoch": 0.7545357524012807, + "grad_norm": 1.879954146029522, + "learning_rate": 4.677240652855401e-05, + "loss": 0.6789, + "step": 6363 + }, + { + "epoch": 0.7546543341634057, + "grad_norm": 1.9507377658242837, + "learning_rate": 4.677122673018044e-05, + "loss": 0.6271, + "step": 6364 + }, + { + "epoch": 0.7547729159255306, + "grad_norm": 1.567099515901128, + "learning_rate": 4.677004673110286e-05, + "loss": 0.3931, + "step": 6365 + }, + { + "epoch": 0.7548914976876556, + "grad_norm": 1.5516136497773267, + "learning_rate": 4.6768866531332146e-05, + "loss": 0.5701, + "step": 6366 + }, + { + "epoch": 0.7550100794497806, + "grad_norm": 1.385937021189053, + "learning_rate": 4.676768613087918e-05, + "loss": 0.4048, + "step": 6367 + }, + { + "epoch": 0.7551286612119056, + "grad_norm": 1.3338284263764733, + "learning_rate": 4.6766505529754836e-05, + "loss": 0.4834, + "step": 6368 + }, + { + "epoch": 0.7552472429740306, + "grad_norm": 1.4188792856800085, + "learning_rate": 4.676532472797001e-05, + "loss": 0.5361, + "step": 6369 + }, + { + "epoch": 0.7553658247361555, + "grad_norm": 1.302520009652949, + "learning_rate": 4.676414372553558e-05, + "loss": 0.4428, + "step": 6370 + }, + { + "epoch": 0.7554844064982805, + "grad_norm": 1.5704706519737799, + "learning_rate": 4.676296252246242e-05, + "loss": 0.6478, + "step": 6371 + }, + { + "epoch": 0.7556029882604055, + "grad_norm": 1.4803086760575612, + "learning_rate": 4.676178111876145e-05, + "loss": 0.5026, + "step": 6372 + }, + { + "epoch": 0.7557215700225305, + "grad_norm": 1.5731255429612399, + "learning_rate": 4.6760599514443534e-05, + "loss": 0.5063, + "step": 6373 + }, + { + "epoch": 0.7558401517846555, + "grad_norm": 1.382580382360104, + "learning_rate": 4.675941770951958e-05, + "loss": 0.4246, + "step": 6374 + }, + { + "epoch": 0.7559587335467806, + "grad_norm": 1.340642397854399, + "learning_rate": 4.6758235704000475e-05, + "loss": 0.4869, + "step": 6375 + }, + { + "epoch": 0.7560773153089055, + "grad_norm": 1.2699941466730815, + "learning_rate": 4.675705349789712e-05, + "loss": 0.5821, + "step": 6376 + }, + { + "epoch": 0.7561958970710305, + "grad_norm": 1.311415128077252, + "learning_rate": 4.675587109122041e-05, + "loss": 0.388, + "step": 6377 + }, + { + "epoch": 0.7563144788331555, + "grad_norm": 1.5121760083020388, + "learning_rate": 4.6754688483981245e-05, + "loss": 0.4529, + "step": 6378 + }, + { + "epoch": 0.7564330605952805, + "grad_norm": 1.4939643932700748, + "learning_rate": 4.6753505676190534e-05, + "loss": 0.4166, + "step": 6379 + }, + { + "epoch": 0.7565516423574055, + "grad_norm": 1.3654979596748529, + "learning_rate": 4.675232266785918e-05, + "loss": 0.4754, + "step": 6380 + }, + { + "epoch": 0.7566702241195304, + "grad_norm": 1.7431542579573476, + "learning_rate": 4.675113945899808e-05, + "loss": 0.4331, + "step": 6381 + }, + { + "epoch": 0.7567888058816554, + "grad_norm": 1.3703199685142164, + "learning_rate": 4.674995604961815e-05, + "loss": 0.4035, + "step": 6382 + }, + { + "epoch": 0.7569073876437804, + "grad_norm": 1.6277486479822678, + "learning_rate": 4.6748772439730296e-05, + "loss": 0.6543, + "step": 6383 + }, + { + "epoch": 0.7570259694059054, + "grad_norm": 2.320300536021157, + "learning_rate": 4.674758862934543e-05, + "loss": 0.6792, + "step": 6384 + }, + { + "epoch": 0.7571445511680304, + "grad_norm": 1.313674046161056, + "learning_rate": 4.674640461847446e-05, + "loss": 0.3928, + "step": 6385 + }, + { + "epoch": 0.7572631329301553, + "grad_norm": 1.6008584178966698, + "learning_rate": 4.674522040712831e-05, + "loss": 0.4775, + "step": 6386 + }, + { + "epoch": 0.7573817146922803, + "grad_norm": 1.204795718682405, + "learning_rate": 4.6744035995317904e-05, + "loss": 0.4268, + "step": 6387 + }, + { + "epoch": 0.7575002964544053, + "grad_norm": 1.3008577256366438, + "learning_rate": 4.674285138305414e-05, + "loss": 0.4624, + "step": 6388 + }, + { + "epoch": 0.7576188782165303, + "grad_norm": 0.9339434218965825, + "learning_rate": 4.6741666570347955e-05, + "loss": 0.2961, + "step": 6389 + }, + { + "epoch": 0.7577374599786553, + "grad_norm": 1.3218351015514316, + "learning_rate": 4.6740481557210255e-05, + "loss": 0.4566, + "step": 6390 + }, + { + "epoch": 0.7578560417407803, + "grad_norm": 1.4449707954912088, + "learning_rate": 4.6739296343651984e-05, + "loss": 0.4949, + "step": 6391 + }, + { + "epoch": 0.7579746235029052, + "grad_norm": 1.1416505505809214, + "learning_rate": 4.673811092968405e-05, + "loss": 0.3755, + "step": 6392 + }, + { + "epoch": 0.7580932052650302, + "grad_norm": 1.5579435616217685, + "learning_rate": 4.6736925315317405e-05, + "loss": 0.429, + "step": 6393 + }, + { + "epoch": 0.7582117870271552, + "grad_norm": 1.9696530106094354, + "learning_rate": 4.673573950056295e-05, + "loss": 0.7039, + "step": 6394 + }, + { + "epoch": 0.7583303687892802, + "grad_norm": 1.0107883887827513, + "learning_rate": 4.6734553485431644e-05, + "loss": 0.3101, + "step": 6395 + }, + { + "epoch": 0.7584489505514052, + "grad_norm": 1.167814399850664, + "learning_rate": 4.6733367269934396e-05, + "loss": 0.3866, + "step": 6396 + }, + { + "epoch": 0.7585675323135301, + "grad_norm": 1.598100452253808, + "learning_rate": 4.673218085408215e-05, + "loss": 0.5933, + "step": 6397 + }, + { + "epoch": 0.7586861140756551, + "grad_norm": 1.5499286159558943, + "learning_rate": 4.673099423788586e-05, + "loss": 0.5904, + "step": 6398 + }, + { + "epoch": 0.7588046958377801, + "grad_norm": 1.6069915179791623, + "learning_rate": 4.6729807421356445e-05, + "loss": 0.6372, + "step": 6399 + }, + { + "epoch": 0.7589232775999052, + "grad_norm": 1.393395391411628, + "learning_rate": 4.672862040450485e-05, + "loss": 0.4713, + "step": 6400 + }, + { + "epoch": 0.7590418593620302, + "grad_norm": 1.5929691332121063, + "learning_rate": 4.672743318734203e-05, + "loss": 0.5967, + "step": 6401 + }, + { + "epoch": 0.7591604411241551, + "grad_norm": 1.6923247459856718, + "learning_rate": 4.672624576987891e-05, + "loss": 0.6099, + "step": 6402 + }, + { + "epoch": 0.7592790228862801, + "grad_norm": 1.3349663724973453, + "learning_rate": 4.672505815212645e-05, + "loss": 0.4296, + "step": 6403 + }, + { + "epoch": 0.7593976046484051, + "grad_norm": 1.359473232695507, + "learning_rate": 4.672387033409559e-05, + "loss": 0.4382, + "step": 6404 + }, + { + "epoch": 0.7595161864105301, + "grad_norm": 1.4302299571309112, + "learning_rate": 4.67226823157973e-05, + "loss": 0.5272, + "step": 6405 + }, + { + "epoch": 0.7596347681726551, + "grad_norm": 1.147234166712874, + "learning_rate": 4.67214940972425e-05, + "loss": 0.5173, + "step": 6406 + }, + { + "epoch": 0.75975334993478, + "grad_norm": 1.7253445215826135, + "learning_rate": 4.672030567844217e-05, + "loss": 0.4518, + "step": 6407 + }, + { + "epoch": 0.759871931696905, + "grad_norm": 1.4651173014109806, + "learning_rate": 4.671911705940725e-05, + "loss": 0.6164, + "step": 6408 + }, + { + "epoch": 0.75999051345903, + "grad_norm": 1.4604058134744229, + "learning_rate": 4.671792824014871e-05, + "loss": 0.4845, + "step": 6409 + }, + { + "epoch": 0.760109095221155, + "grad_norm": 1.342755721730925, + "learning_rate": 4.67167392206775e-05, + "loss": 0.4405, + "step": 6410 + }, + { + "epoch": 0.76022767698328, + "grad_norm": 1.2155231330655305, + "learning_rate": 4.6715550001004595e-05, + "loss": 0.4055, + "step": 6411 + }, + { + "epoch": 0.760346258745405, + "grad_norm": 1.7705242987871812, + "learning_rate": 4.6714360581140935e-05, + "loss": 0.547, + "step": 6412 + }, + { + "epoch": 0.7604648405075299, + "grad_norm": 1.2026494247519293, + "learning_rate": 4.671317096109751e-05, + "loss": 0.322, + "step": 6413 + }, + { + "epoch": 0.7605834222696549, + "grad_norm": 1.7333295397221768, + "learning_rate": 4.6711981140885266e-05, + "loss": 0.4771, + "step": 6414 + }, + { + "epoch": 0.7607020040317799, + "grad_norm": 1.8285567583799063, + "learning_rate": 4.6710791120515186e-05, + "loss": 0.4589, + "step": 6415 + }, + { + "epoch": 0.7608205857939049, + "grad_norm": 1.8510524984689483, + "learning_rate": 4.6709600899998227e-05, + "loss": 0.6604, + "step": 6416 + }, + { + "epoch": 0.7609391675560299, + "grad_norm": 1.0666000983178736, + "learning_rate": 4.670841047934538e-05, + "loss": 0.3965, + "step": 6417 + }, + { + "epoch": 0.7610577493181548, + "grad_norm": 1.5513888927779644, + "learning_rate": 4.670721985856761e-05, + "loss": 0.4553, + "step": 6418 + }, + { + "epoch": 0.7611763310802798, + "grad_norm": 1.824373617801327, + "learning_rate": 4.670602903767589e-05, + "loss": 0.6176, + "step": 6419 + }, + { + "epoch": 0.7612949128424048, + "grad_norm": 1.5627240704132948, + "learning_rate": 4.6704838016681194e-05, + "loss": 0.4372, + "step": 6420 + }, + { + "epoch": 0.7614134946045298, + "grad_norm": 1.80460435216683, + "learning_rate": 4.670364679559451e-05, + "loss": 0.5977, + "step": 6421 + }, + { + "epoch": 0.7615320763666548, + "grad_norm": 1.7914159435872152, + "learning_rate": 4.670245537442682e-05, + "loss": 0.6279, + "step": 6422 + }, + { + "epoch": 0.7616506581287797, + "grad_norm": 1.413865018881453, + "learning_rate": 4.670126375318911e-05, + "loss": 0.4546, + "step": 6423 + }, + { + "epoch": 0.7617692398909047, + "grad_norm": 1.330705173222603, + "learning_rate": 4.670007193189235e-05, + "loss": 0.4226, + "step": 6424 + }, + { + "epoch": 0.7618878216530298, + "grad_norm": 1.3134575371428086, + "learning_rate": 4.669887991054755e-05, + "loss": 0.4454, + "step": 6425 + }, + { + "epoch": 0.7620064034151548, + "grad_norm": 1.3163422520332084, + "learning_rate": 4.669768768916567e-05, + "loss": 0.4224, + "step": 6426 + }, + { + "epoch": 0.7621249851772798, + "grad_norm": 1.2029832276604893, + "learning_rate": 4.669649526775772e-05, + "loss": 0.3225, + "step": 6427 + }, + { + "epoch": 0.7622435669394048, + "grad_norm": 1.3088757485088305, + "learning_rate": 4.6695302646334696e-05, + "loss": 0.4099, + "step": 6428 + }, + { + "epoch": 0.7623621487015297, + "grad_norm": 1.3195542098819792, + "learning_rate": 4.669410982490758e-05, + "loss": 0.3074, + "step": 6429 + }, + { + "epoch": 0.7624807304636547, + "grad_norm": 1.5325658883459392, + "learning_rate": 4.669291680348738e-05, + "loss": 0.4068, + "step": 6430 + }, + { + "epoch": 0.7625993122257797, + "grad_norm": 1.3203077647651247, + "learning_rate": 4.669172358208509e-05, + "loss": 0.3552, + "step": 6431 + }, + { + "epoch": 0.7627178939879047, + "grad_norm": 1.5172277575313575, + "learning_rate": 4.6690530160711706e-05, + "loss": 0.4217, + "step": 6432 + }, + { + "epoch": 0.7628364757500297, + "grad_norm": 1.3277092787811597, + "learning_rate": 4.668933653937823e-05, + "loss": 0.3657, + "step": 6433 + }, + { + "epoch": 0.7629550575121546, + "grad_norm": 1.4330126501601583, + "learning_rate": 4.6688142718095674e-05, + "loss": 0.4196, + "step": 6434 + }, + { + "epoch": 0.7630736392742796, + "grad_norm": 1.405381881800787, + "learning_rate": 4.668694869687503e-05, + "loss": 0.5217, + "step": 6435 + }, + { + "epoch": 0.7631922210364046, + "grad_norm": 1.6006976445324543, + "learning_rate": 4.6685754475727314e-05, + "loss": 0.519, + "step": 6436 + }, + { + "epoch": 0.7633108027985296, + "grad_norm": 1.6376109359697357, + "learning_rate": 4.668456005466354e-05, + "loss": 0.5264, + "step": 6437 + }, + { + "epoch": 0.7634293845606546, + "grad_norm": 1.539930971646191, + "learning_rate": 4.668336543369471e-05, + "loss": 0.536, + "step": 6438 + }, + { + "epoch": 0.7635479663227795, + "grad_norm": 1.3223322924182594, + "learning_rate": 4.668217061283185e-05, + "loss": 0.3156, + "step": 6439 + }, + { + "epoch": 0.7636665480849045, + "grad_norm": 1.3355908715578695, + "learning_rate": 4.668097559208596e-05, + "loss": 0.387, + "step": 6440 + }, + { + "epoch": 0.7637851298470295, + "grad_norm": 1.4916414550098231, + "learning_rate": 4.667978037146806e-05, + "loss": 0.5523, + "step": 6441 + }, + { + "epoch": 0.7639037116091545, + "grad_norm": 1.1308474400435427, + "learning_rate": 4.6678584950989166e-05, + "loss": 0.3363, + "step": 6442 + }, + { + "epoch": 0.7640222933712795, + "grad_norm": 1.316071484681721, + "learning_rate": 4.667738933066031e-05, + "loss": 0.4495, + "step": 6443 + }, + { + "epoch": 0.7641408751334045, + "grad_norm": 1.500205501935983, + "learning_rate": 4.6676193510492506e-05, + "loss": 0.62, + "step": 6444 + }, + { + "epoch": 0.7642594568955294, + "grad_norm": 1.5590775239592332, + "learning_rate": 4.667499749049678e-05, + "loss": 0.5531, + "step": 6445 + }, + { + "epoch": 0.7643780386576544, + "grad_norm": 1.2984710585101096, + "learning_rate": 4.667380127068415e-05, + "loss": 0.5089, + "step": 6446 + }, + { + "epoch": 0.7644966204197794, + "grad_norm": 1.2114479726217011, + "learning_rate": 4.667260485106565e-05, + "loss": 0.3264, + "step": 6447 + }, + { + "epoch": 0.7646152021819044, + "grad_norm": 1.329612608425185, + "learning_rate": 4.667140823165232e-05, + "loss": 0.4754, + "step": 6448 + }, + { + "epoch": 0.7647337839440294, + "grad_norm": 1.704977618714458, + "learning_rate": 4.667021141245518e-05, + "loss": 0.5764, + "step": 6449 + }, + { + "epoch": 0.7648523657061544, + "grad_norm": 1.638804696965265, + "learning_rate": 4.6669014393485254e-05, + "loss": 0.4876, + "step": 6450 + }, + { + "epoch": 0.7649709474682794, + "grad_norm": 1.1010650840052179, + "learning_rate": 4.666781717475359e-05, + "loss": 0.2961, + "step": 6451 + }, + { + "epoch": 0.7650895292304044, + "grad_norm": 1.378726161184451, + "learning_rate": 4.6666619756271236e-05, + "loss": 0.4631, + "step": 6452 + }, + { + "epoch": 0.7652081109925294, + "grad_norm": 1.4812342936156728, + "learning_rate": 4.66654221380492e-05, + "loss": 0.5238, + "step": 6453 + }, + { + "epoch": 0.7653266927546544, + "grad_norm": 1.2047353970987953, + "learning_rate": 4.666422432009855e-05, + "loss": 0.3812, + "step": 6454 + }, + { + "epoch": 0.7654452745167794, + "grad_norm": 1.3230846691937228, + "learning_rate": 4.666302630243031e-05, + "loss": 0.409, + "step": 6455 + }, + { + "epoch": 0.7655638562789043, + "grad_norm": 1.6184003334444839, + "learning_rate": 4.666182808505554e-05, + "loss": 0.5414, + "step": 6456 + }, + { + "epoch": 0.7656824380410293, + "grad_norm": 1.1282027779603376, + "learning_rate": 4.666062966798528e-05, + "loss": 0.342, + "step": 6457 + }, + { + "epoch": 0.7658010198031543, + "grad_norm": 1.3697230127732885, + "learning_rate": 4.665943105123057e-05, + "loss": 0.4596, + "step": 6458 + }, + { + "epoch": 0.7659196015652793, + "grad_norm": 2.453823672789991, + "learning_rate": 4.665823223480247e-05, + "loss": 0.742, + "step": 6459 + }, + { + "epoch": 0.7660381833274043, + "grad_norm": 1.2935627740031765, + "learning_rate": 4.665703321871203e-05, + "loss": 0.381, + "step": 6460 + }, + { + "epoch": 0.7661567650895292, + "grad_norm": 1.4180040867851065, + "learning_rate": 4.66558340029703e-05, + "loss": 0.4217, + "step": 6461 + }, + { + "epoch": 0.7662753468516542, + "grad_norm": 1.5261154115877416, + "learning_rate": 4.665463458758833e-05, + "loss": 0.6565, + "step": 6462 + }, + { + "epoch": 0.7663939286137792, + "grad_norm": 1.17520212426376, + "learning_rate": 4.665343497257719e-05, + "loss": 0.3684, + "step": 6463 + }, + { + "epoch": 0.7665125103759042, + "grad_norm": 1.3179439747098858, + "learning_rate": 4.665223515794793e-05, + "loss": 0.5264, + "step": 6464 + }, + { + "epoch": 0.7666310921380292, + "grad_norm": 1.2429272828802582, + "learning_rate": 4.665103514371162e-05, + "loss": 0.3837, + "step": 6465 + }, + { + "epoch": 0.7667496739001541, + "grad_norm": 1.3587044291326027, + "learning_rate": 4.664983492987931e-05, + "loss": 0.3617, + "step": 6466 + }, + { + "epoch": 0.7668682556622791, + "grad_norm": 1.256189492911367, + "learning_rate": 4.6648634516462075e-05, + "loss": 0.4963, + "step": 6467 + }, + { + "epoch": 0.7669868374244041, + "grad_norm": 1.2474470533267716, + "learning_rate": 4.664743390347097e-05, + "loss": 0.3878, + "step": 6468 + }, + { + "epoch": 0.7671054191865291, + "grad_norm": 1.453369973399849, + "learning_rate": 4.664623309091708e-05, + "loss": 0.4113, + "step": 6469 + }, + { + "epoch": 0.7672240009486541, + "grad_norm": 1.2840396128370917, + "learning_rate": 4.6645032078811454e-05, + "loss": 0.4027, + "step": 6470 + }, + { + "epoch": 0.767342582710779, + "grad_norm": 1.595395091681589, + "learning_rate": 4.664383086716518e-05, + "loss": 0.633, + "step": 6471 + }, + { + "epoch": 0.767461164472904, + "grad_norm": 1.1877442668559632, + "learning_rate": 4.664262945598933e-05, + "loss": 0.3749, + "step": 6472 + }, + { + "epoch": 0.767579746235029, + "grad_norm": 1.4172857992591599, + "learning_rate": 4.664142784529497e-05, + "loss": 0.4716, + "step": 6473 + }, + { + "epoch": 0.767698327997154, + "grad_norm": 1.0966281256963835, + "learning_rate": 4.664022603509319e-05, + "loss": 0.2825, + "step": 6474 + }, + { + "epoch": 0.7678169097592791, + "grad_norm": 1.6544893955028661, + "learning_rate": 4.663902402539506e-05, + "loss": 0.5551, + "step": 6475 + }, + { + "epoch": 0.7679354915214041, + "grad_norm": 1.4694299699863405, + "learning_rate": 4.663782181621167e-05, + "loss": 0.4179, + "step": 6476 + }, + { + "epoch": 0.768054073283529, + "grad_norm": 1.1701320782118931, + "learning_rate": 4.663661940755409e-05, + "loss": 0.2864, + "step": 6477 + }, + { + "epoch": 0.768172655045654, + "grad_norm": 1.6338833511766266, + "learning_rate": 4.663541679943341e-05, + "loss": 0.4764, + "step": 6478 + }, + { + "epoch": 0.768291236807779, + "grad_norm": 1.7776164992610815, + "learning_rate": 4.6634213991860725e-05, + "loss": 0.4201, + "step": 6479 + }, + { + "epoch": 0.768409818569904, + "grad_norm": 1.5395720778779978, + "learning_rate": 4.663301098484711e-05, + "loss": 0.4249, + "step": 6480 + }, + { + "epoch": 0.768528400332029, + "grad_norm": 1.1992309264281984, + "learning_rate": 4.663180777840367e-05, + "loss": 0.4957, + "step": 6481 + }, + { + "epoch": 0.7686469820941539, + "grad_norm": 2.0749187178196884, + "learning_rate": 4.663060437254149e-05, + "loss": 0.6147, + "step": 6482 + }, + { + "epoch": 0.7687655638562789, + "grad_norm": 1.2843186067622823, + "learning_rate": 4.662940076727165e-05, + "loss": 0.3225, + "step": 6483 + }, + { + "epoch": 0.7688841456184039, + "grad_norm": 1.2583362540876608, + "learning_rate": 4.6628196962605264e-05, + "loss": 0.4189, + "step": 6484 + }, + { + "epoch": 0.7690027273805289, + "grad_norm": 1.3250166006018702, + "learning_rate": 4.662699295855343e-05, + "loss": 0.4007, + "step": 6485 + }, + { + "epoch": 0.7691213091426539, + "grad_norm": 1.4168172092669973, + "learning_rate": 4.6625788755127234e-05, + "loss": 0.3607, + "step": 6486 + }, + { + "epoch": 0.7692398909047788, + "grad_norm": 1.2787063208223133, + "learning_rate": 4.6624584352337785e-05, + "loss": 0.3277, + "step": 6487 + }, + { + "epoch": 0.7693584726669038, + "grad_norm": 1.547480863147103, + "learning_rate": 4.66233797501962e-05, + "loss": 0.5388, + "step": 6488 + }, + { + "epoch": 0.7694770544290288, + "grad_norm": 1.472271960827582, + "learning_rate": 4.662217494871355e-05, + "loss": 0.4292, + "step": 6489 + }, + { + "epoch": 0.7695956361911538, + "grad_norm": 1.346044493131191, + "learning_rate": 4.662096994790097e-05, + "loss": 0.4433, + "step": 6490 + }, + { + "epoch": 0.7697142179532788, + "grad_norm": 1.3774488443649382, + "learning_rate": 4.661976474776957e-05, + "loss": 0.3946, + "step": 6491 + }, + { + "epoch": 0.7698327997154037, + "grad_norm": 1.3387218577332938, + "learning_rate": 4.661855934833044e-05, + "loss": 0.4436, + "step": 6492 + }, + { + "epoch": 0.7699513814775287, + "grad_norm": 1.5211497409189045, + "learning_rate": 4.6617353749594705e-05, + "loss": 0.4678, + "step": 6493 + }, + { + "epoch": 0.7700699632396537, + "grad_norm": 1.5620165297944306, + "learning_rate": 4.6616147951573473e-05, + "loss": 0.4204, + "step": 6494 + }, + { + "epoch": 0.7701885450017787, + "grad_norm": 1.273343397846066, + "learning_rate": 4.6614941954277874e-05, + "loss": 0.3989, + "step": 6495 + }, + { + "epoch": 0.7703071267639037, + "grad_norm": 1.4827005910864917, + "learning_rate": 4.6613735757719015e-05, + "loss": 0.2968, + "step": 6496 + }, + { + "epoch": 0.7704257085260287, + "grad_norm": 1.6263568272872708, + "learning_rate": 4.661252936190801e-05, + "loss": 0.4583, + "step": 6497 + }, + { + "epoch": 0.7705442902881536, + "grad_norm": 1.7261357024183326, + "learning_rate": 4.661132276685599e-05, + "loss": 0.5942, + "step": 6498 + }, + { + "epoch": 0.7706628720502786, + "grad_norm": 1.2871280905432627, + "learning_rate": 4.6610115972574076e-05, + "loss": 0.3665, + "step": 6499 + }, + { + "epoch": 0.7707814538124037, + "grad_norm": 1.6336164844780496, + "learning_rate": 4.66089089790734e-05, + "loss": 0.6051, + "step": 6500 + }, + { + "epoch": 0.7709000355745287, + "grad_norm": 1.0089709225249612, + "learning_rate": 4.6607701786365073e-05, + "loss": 0.3319, + "step": 6501 + }, + { + "epoch": 0.7710186173366537, + "grad_norm": 1.1306694839257165, + "learning_rate": 4.6606494394460234e-05, + "loss": 0.3932, + "step": 6502 + }, + { + "epoch": 0.7711371990987786, + "grad_norm": 0.9224723105363929, + "learning_rate": 4.660528680337001e-05, + "loss": 0.2361, + "step": 6503 + }, + { + "epoch": 0.7712557808609036, + "grad_norm": 1.3306746757284846, + "learning_rate": 4.660407901310554e-05, + "loss": 0.368, + "step": 6504 + }, + { + "epoch": 0.7713743626230286, + "grad_norm": 1.244160563890268, + "learning_rate": 4.6602871023677954e-05, + "loss": 0.2994, + "step": 6505 + }, + { + "epoch": 0.7714929443851536, + "grad_norm": 1.4076381142777747, + "learning_rate": 4.660166283509839e-05, + "loss": 0.3604, + "step": 6506 + }, + { + "epoch": 0.7716115261472786, + "grad_norm": 1.1310523313151444, + "learning_rate": 4.6600454447377984e-05, + "loss": 0.3696, + "step": 6507 + }, + { + "epoch": 0.7717301079094036, + "grad_norm": 1.6816453083968836, + "learning_rate": 4.659924586052787e-05, + "loss": 0.6305, + "step": 6508 + }, + { + "epoch": 0.7718486896715285, + "grad_norm": 1.3113942665916933, + "learning_rate": 4.6598037074559206e-05, + "loss": 0.3883, + "step": 6509 + }, + { + "epoch": 0.7719672714336535, + "grad_norm": 1.3267456865479466, + "learning_rate": 4.659682808948311e-05, + "loss": 0.4406, + "step": 6510 + }, + { + "epoch": 0.7720858531957785, + "grad_norm": 1.3727033883731172, + "learning_rate": 4.659561890531075e-05, + "loss": 0.367, + "step": 6511 + }, + { + "epoch": 0.7722044349579035, + "grad_norm": 1.4160859909008723, + "learning_rate": 4.6594409522053274e-05, + "loss": 0.4587, + "step": 6512 + }, + { + "epoch": 0.7723230167200285, + "grad_norm": 1.4093932507475724, + "learning_rate": 4.659319993972182e-05, + "loss": 0.428, + "step": 6513 + }, + { + "epoch": 0.7724415984821534, + "grad_norm": 2.0514626920292423, + "learning_rate": 4.659199015832754e-05, + "loss": 0.7161, + "step": 6514 + }, + { + "epoch": 0.7725601802442784, + "grad_norm": 1.248877552649357, + "learning_rate": 4.659078017788159e-05, + "loss": 0.3889, + "step": 6515 + }, + { + "epoch": 0.7726787620064034, + "grad_norm": 1.4449355139035729, + "learning_rate": 4.658956999839512e-05, + "loss": 0.4634, + "step": 6516 + }, + { + "epoch": 0.7727973437685284, + "grad_norm": 1.4226979592672477, + "learning_rate": 4.658835961987929e-05, + "loss": 0.4873, + "step": 6517 + }, + { + "epoch": 0.7729159255306534, + "grad_norm": 1.4008376908230498, + "learning_rate": 4.658714904234526e-05, + "loss": 0.4552, + "step": 6518 + }, + { + "epoch": 0.7730345072927783, + "grad_norm": 1.2283476758321132, + "learning_rate": 4.6585938265804185e-05, + "loss": 0.4203, + "step": 6519 + }, + { + "epoch": 0.7731530890549033, + "grad_norm": 1.8445197291233268, + "learning_rate": 4.658472729026723e-05, + "loss": 0.4691, + "step": 6520 + }, + { + "epoch": 0.7732716708170283, + "grad_norm": 1.120060473976515, + "learning_rate": 4.658351611574556e-05, + "loss": 0.334, + "step": 6521 + }, + { + "epoch": 0.7733902525791533, + "grad_norm": 0.9966498068422353, + "learning_rate": 4.658230474225034e-05, + "loss": 0.2891, + "step": 6522 + }, + { + "epoch": 0.7735088343412783, + "grad_norm": 1.531459228654834, + "learning_rate": 4.6581093169792736e-05, + "loss": 0.4761, + "step": 6523 + }, + { + "epoch": 0.7736274161034032, + "grad_norm": 1.218605961609238, + "learning_rate": 4.657988139838391e-05, + "loss": 0.3765, + "step": 6524 + }, + { + "epoch": 0.7737459978655283, + "grad_norm": 1.2168243628158208, + "learning_rate": 4.6578669428035046e-05, + "loss": 0.3852, + "step": 6525 + }, + { + "epoch": 0.7738645796276533, + "grad_norm": 1.4128000594398538, + "learning_rate": 4.6577457258757305e-05, + "loss": 0.4081, + "step": 6526 + }, + { + "epoch": 0.7739831613897783, + "grad_norm": 1.8018811795496454, + "learning_rate": 4.657624489056188e-05, + "loss": 0.8212, + "step": 6527 + }, + { + "epoch": 0.7741017431519033, + "grad_norm": 1.3932370558432718, + "learning_rate": 4.6575032323459936e-05, + "loss": 0.564, + "step": 6528 + }, + { + "epoch": 0.7742203249140283, + "grad_norm": 1.4671316374111003, + "learning_rate": 4.657381955746264e-05, + "loss": 0.4626, + "step": 6529 + }, + { + "epoch": 0.7743389066761532, + "grad_norm": 1.6585652593287696, + "learning_rate": 4.6572606592581185e-05, + "loss": 0.5325, + "step": 6530 + }, + { + "epoch": 0.7744574884382782, + "grad_norm": 1.8186272342583483, + "learning_rate": 4.657139342882676e-05, + "loss": 0.5823, + "step": 6531 + }, + { + "epoch": 0.7745760702004032, + "grad_norm": 1.582671612198208, + "learning_rate": 4.6570180066210535e-05, + "loss": 0.5073, + "step": 6532 + }, + { + "epoch": 0.7746946519625282, + "grad_norm": 1.5956702678256094, + "learning_rate": 4.6568966504743695e-05, + "loss": 0.5744, + "step": 6533 + }, + { + "epoch": 0.7748132337246532, + "grad_norm": 1.252404815012282, + "learning_rate": 4.6567752744437444e-05, + "loss": 0.4314, + "step": 6534 + }, + { + "epoch": 0.7749318154867781, + "grad_norm": 1.188551376885332, + "learning_rate": 4.6566538785302954e-05, + "loss": 0.3851, + "step": 6535 + }, + { + "epoch": 0.7750503972489031, + "grad_norm": 1.2470106562023522, + "learning_rate": 4.656532462735143e-05, + "loss": 0.491, + "step": 6536 + }, + { + "epoch": 0.7751689790110281, + "grad_norm": 1.2937397339877341, + "learning_rate": 4.656411027059405e-05, + "loss": 0.4969, + "step": 6537 + }, + { + "epoch": 0.7752875607731531, + "grad_norm": 1.5625229148846294, + "learning_rate": 4.656289571504202e-05, + "loss": 0.6669, + "step": 6538 + }, + { + "epoch": 0.7754061425352781, + "grad_norm": 1.3401974795476501, + "learning_rate": 4.6561680960706534e-05, + "loss": 0.3877, + "step": 6539 + }, + { + "epoch": 0.775524724297403, + "grad_norm": 0.9860766231121926, + "learning_rate": 4.656046600759879e-05, + "loss": 0.3324, + "step": 6540 + }, + { + "epoch": 0.775643306059528, + "grad_norm": 1.5322818544569272, + "learning_rate": 4.6559250855729987e-05, + "loss": 0.4561, + "step": 6541 + }, + { + "epoch": 0.775761887821653, + "grad_norm": 1.321905160995246, + "learning_rate": 4.655803550511133e-05, + "loss": 0.4752, + "step": 6542 + }, + { + "epoch": 0.775880469583778, + "grad_norm": 1.1170411025960196, + "learning_rate": 4.6556819955754024e-05, + "loss": 0.3534, + "step": 6543 + }, + { + "epoch": 0.775999051345903, + "grad_norm": 1.3305223122317076, + "learning_rate": 4.6555604207669266e-05, + "loss": 0.408, + "step": 6544 + }, + { + "epoch": 0.776117633108028, + "grad_norm": 1.3377376979450066, + "learning_rate": 4.6554388260868275e-05, + "loss": 0.4269, + "step": 6545 + }, + { + "epoch": 0.7762362148701529, + "grad_norm": 2.21015347267266, + "learning_rate": 4.655317211536226e-05, + "loss": 0.5413, + "step": 6546 + }, + { + "epoch": 0.7763547966322779, + "grad_norm": 1.7716878362483046, + "learning_rate": 4.6551955771162425e-05, + "loss": 0.5767, + "step": 6547 + }, + { + "epoch": 0.7764733783944029, + "grad_norm": 1.377367055606228, + "learning_rate": 4.655073922827998e-05, + "loss": 0.4409, + "step": 6548 + }, + { + "epoch": 0.7765919601565279, + "grad_norm": 1.261813075480174, + "learning_rate": 4.654952248672616e-05, + "loss": 0.2847, + "step": 6549 + }, + { + "epoch": 0.776710541918653, + "grad_norm": 1.3986104484093065, + "learning_rate": 4.654830554651216e-05, + "loss": 0.4353, + "step": 6550 + }, + { + "epoch": 0.7768291236807779, + "grad_norm": 1.4572227997162819, + "learning_rate": 4.654708840764921e-05, + "loss": 0.5021, + "step": 6551 + }, + { + "epoch": 0.7769477054429029, + "grad_norm": 1.4022894493442308, + "learning_rate": 4.654587107014853e-05, + "loss": 0.3983, + "step": 6552 + }, + { + "epoch": 0.7770662872050279, + "grad_norm": 1.2834708178445597, + "learning_rate": 4.6544653534021334e-05, + "loss": 0.4158, + "step": 6553 + }, + { + "epoch": 0.7771848689671529, + "grad_norm": 3.0489271833187592, + "learning_rate": 4.654343579927885e-05, + "loss": 0.8787, + "step": 6554 + }, + { + "epoch": 0.7773034507292779, + "grad_norm": 1.3080379289422677, + "learning_rate": 4.6542217865932315e-05, + "loss": 0.2629, + "step": 6555 + }, + { + "epoch": 0.7774220324914028, + "grad_norm": 1.343897645741176, + "learning_rate": 4.654099973399294e-05, + "loss": 0.336, + "step": 6556 + }, + { + "epoch": 0.7775406142535278, + "grad_norm": 1.447205648137889, + "learning_rate": 4.653978140347197e-05, + "loss": 0.4831, + "step": 6557 + }, + { + "epoch": 0.7776591960156528, + "grad_norm": 1.2807828992554564, + "learning_rate": 4.653856287438062e-05, + "loss": 0.4801, + "step": 6558 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 1.2902055525111253, + "learning_rate": 4.653734414673014e-05, + "loss": 0.4083, + "step": 6559 + }, + { + "epoch": 0.7778963595399028, + "grad_norm": 1.2372292089986545, + "learning_rate": 4.6536125220531755e-05, + "loss": 0.4319, + "step": 6560 + }, + { + "epoch": 0.7780149413020278, + "grad_norm": 1.3271920961881103, + "learning_rate": 4.6534906095796706e-05, + "loss": 0.4973, + "step": 6561 + }, + { + "epoch": 0.7781335230641527, + "grad_norm": 1.4005573852124653, + "learning_rate": 4.653368677253624e-05, + "loss": 0.3038, + "step": 6562 + }, + { + "epoch": 0.7782521048262777, + "grad_norm": 1.4876545931199878, + "learning_rate": 4.653246725076157e-05, + "loss": 0.4208, + "step": 6563 + }, + { + "epoch": 0.7783706865884027, + "grad_norm": 1.6838067541636368, + "learning_rate": 4.653124753048397e-05, + "loss": 0.5805, + "step": 6564 + }, + { + "epoch": 0.7784892683505277, + "grad_norm": 1.3954592638945884, + "learning_rate": 4.653002761171467e-05, + "loss": 0.4218, + "step": 6565 + }, + { + "epoch": 0.7786078501126527, + "grad_norm": 1.5621459629155585, + "learning_rate": 4.652880749446491e-05, + "loss": 0.5241, + "step": 6566 + }, + { + "epoch": 0.7787264318747776, + "grad_norm": 1.2926988465946827, + "learning_rate": 4.6527587178745944e-05, + "loss": 0.3251, + "step": 6567 + }, + { + "epoch": 0.7788450136369026, + "grad_norm": 1.2667694853422353, + "learning_rate": 4.652636666456903e-05, + "loss": 0.3381, + "step": 6568 + }, + { + "epoch": 0.7789635953990276, + "grad_norm": 1.531268806586195, + "learning_rate": 4.6525145951945414e-05, + "loss": 0.4225, + "step": 6569 + }, + { + "epoch": 0.7790821771611526, + "grad_norm": 1.377772852536876, + "learning_rate": 4.652392504088634e-05, + "loss": 0.4351, + "step": 6570 + }, + { + "epoch": 0.7792007589232776, + "grad_norm": 1.5343216038050087, + "learning_rate": 4.6522703931403076e-05, + "loss": 0.5383, + "step": 6571 + }, + { + "epoch": 0.7793193406854025, + "grad_norm": 1.3832083923892833, + "learning_rate": 4.6521482623506884e-05, + "loss": 0.3849, + "step": 6572 + }, + { + "epoch": 0.7794379224475275, + "grad_norm": 1.5905603114586269, + "learning_rate": 4.6520261117209e-05, + "loss": 0.3746, + "step": 6573 + }, + { + "epoch": 0.7795565042096525, + "grad_norm": 1.3059507069915892, + "learning_rate": 4.6519039412520705e-05, + "loss": 0.4432, + "step": 6574 + }, + { + "epoch": 0.7796750859717776, + "grad_norm": 1.3754256838610053, + "learning_rate": 4.6517817509453255e-05, + "loss": 0.4308, + "step": 6575 + }, + { + "epoch": 0.7797936677339026, + "grad_norm": 1.1673034472723682, + "learning_rate": 4.651659540801791e-05, + "loss": 0.3347, + "step": 6576 + }, + { + "epoch": 0.7799122494960276, + "grad_norm": 1.2175770901414391, + "learning_rate": 4.6515373108225947e-05, + "loss": 0.3486, + "step": 6577 + }, + { + "epoch": 0.7800308312581525, + "grad_norm": 1.2881720123690121, + "learning_rate": 4.6514150610088625e-05, + "loss": 0.4074, + "step": 6578 + }, + { + "epoch": 0.7801494130202775, + "grad_norm": 1.3597629581164035, + "learning_rate": 4.651292791361722e-05, + "loss": 0.4276, + "step": 6579 + }, + { + "epoch": 0.7802679947824025, + "grad_norm": 1.7343670804077032, + "learning_rate": 4.6511705018822994e-05, + "loss": 0.4493, + "step": 6580 + }, + { + "epoch": 0.7803865765445275, + "grad_norm": 1.4995219045695767, + "learning_rate": 4.651048192571723e-05, + "loss": 0.3193, + "step": 6581 + }, + { + "epoch": 0.7805051583066525, + "grad_norm": 1.9612531233107218, + "learning_rate": 4.6509258634311204e-05, + "loss": 0.5545, + "step": 6582 + }, + { + "epoch": 0.7806237400687774, + "grad_norm": 1.251403274746046, + "learning_rate": 4.650803514461618e-05, + "loss": 0.3055, + "step": 6583 + }, + { + "epoch": 0.7807423218309024, + "grad_norm": 2.045916086418424, + "learning_rate": 4.650681145664345e-05, + "loss": 0.6073, + "step": 6584 + }, + { + "epoch": 0.7808609035930274, + "grad_norm": 1.0903603631561924, + "learning_rate": 4.6505587570404294e-05, + "loss": 0.2879, + "step": 6585 + }, + { + "epoch": 0.7809794853551524, + "grad_norm": 1.1904648057298506, + "learning_rate": 4.650436348591e-05, + "loss": 0.3152, + "step": 6586 + }, + { + "epoch": 0.7810980671172774, + "grad_norm": 1.7696324971328283, + "learning_rate": 4.6503139203171835e-05, + "loss": 0.4376, + "step": 6587 + }, + { + "epoch": 0.7812166488794023, + "grad_norm": 1.321851338481891, + "learning_rate": 4.65019147222011e-05, + "loss": 0.4104, + "step": 6588 + }, + { + "epoch": 0.7813352306415273, + "grad_norm": 1.6544236989936176, + "learning_rate": 4.650069004300907e-05, + "loss": 0.4254, + "step": 6589 + }, + { + "epoch": 0.7814538124036523, + "grad_norm": 1.1617717346810583, + "learning_rate": 4.649946516560706e-05, + "loss": 0.2722, + "step": 6590 + }, + { + "epoch": 0.7815723941657773, + "grad_norm": 1.5239958868019585, + "learning_rate": 4.649824009000634e-05, + "loss": 0.4815, + "step": 6591 + }, + { + "epoch": 0.7816909759279023, + "grad_norm": 1.3215836343969878, + "learning_rate": 4.6497014816218204e-05, + "loss": 0.4013, + "step": 6592 + }, + { + "epoch": 0.7818095576900272, + "grad_norm": 1.7773785611082737, + "learning_rate": 4.6495789344253956e-05, + "loss": 0.6061, + "step": 6593 + }, + { + "epoch": 0.7819281394521522, + "grad_norm": 1.6180311324097827, + "learning_rate": 4.6494563674124885e-05, + "loss": 0.585, + "step": 6594 + }, + { + "epoch": 0.7820467212142772, + "grad_norm": 1.2182669749934008, + "learning_rate": 4.64933378058423e-05, + "loss": 0.4376, + "step": 6595 + }, + { + "epoch": 0.7821653029764022, + "grad_norm": 1.4204672644715368, + "learning_rate": 4.64921117394175e-05, + "loss": 0.4075, + "step": 6596 + }, + { + "epoch": 0.7822838847385272, + "grad_norm": 1.552530833583739, + "learning_rate": 4.649088547486178e-05, + "loss": 0.5427, + "step": 6597 + }, + { + "epoch": 0.7824024665006521, + "grad_norm": 1.6896210923377524, + "learning_rate": 4.648965901218645e-05, + "loss": 0.3838, + "step": 6598 + }, + { + "epoch": 0.7825210482627771, + "grad_norm": 1.1646877590311608, + "learning_rate": 4.6488432351402814e-05, + "loss": 0.3693, + "step": 6599 + }, + { + "epoch": 0.7826396300249022, + "grad_norm": 1.3872232845816346, + "learning_rate": 4.648720549252219e-05, + "loss": 0.3612, + "step": 6600 + }, + { + "epoch": 0.7827582117870272, + "grad_norm": 1.2235432309515755, + "learning_rate": 4.6485978435555876e-05, + "loss": 0.4067, + "step": 6601 + }, + { + "epoch": 0.7828767935491522, + "grad_norm": 1.5887460736133132, + "learning_rate": 4.6484751180515195e-05, + "loss": 0.5259, + "step": 6602 + }, + { + "epoch": 0.7829953753112772, + "grad_norm": 1.3352573761402948, + "learning_rate": 4.648352372741145e-05, + "loss": 0.4603, + "step": 6603 + }, + { + "epoch": 0.7831139570734021, + "grad_norm": 1.6872872542466342, + "learning_rate": 4.648229607625595e-05, + "loss": 0.5336, + "step": 6604 + }, + { + "epoch": 0.7832325388355271, + "grad_norm": 1.2490662215353685, + "learning_rate": 4.648106822706004e-05, + "loss": 0.4372, + "step": 6605 + }, + { + "epoch": 0.7833511205976521, + "grad_norm": 2.371324581906594, + "learning_rate": 4.647984017983501e-05, + "loss": 0.7074, + "step": 6606 + }, + { + "epoch": 0.7834697023597771, + "grad_norm": 1.6153160111139822, + "learning_rate": 4.647861193459221e-05, + "loss": 0.5244, + "step": 6607 + }, + { + "epoch": 0.7835882841219021, + "grad_norm": 1.5251649159924985, + "learning_rate": 4.647738349134294e-05, + "loss": 0.5058, + "step": 6608 + }, + { + "epoch": 0.783706865884027, + "grad_norm": 1.5961512332210726, + "learning_rate": 4.647615485009852e-05, + "loss": 0.4258, + "step": 6609 + }, + { + "epoch": 0.783825447646152, + "grad_norm": 1.3324454357961255, + "learning_rate": 4.64749260108703e-05, + "loss": 0.3645, + "step": 6610 + }, + { + "epoch": 0.783944029408277, + "grad_norm": 1.5615902586942711, + "learning_rate": 4.647369697366959e-05, + "loss": 0.5477, + "step": 6611 + }, + { + "epoch": 0.784062611170402, + "grad_norm": 1.065742833650226, + "learning_rate": 4.647246773850773e-05, + "loss": 0.2854, + "step": 6612 + }, + { + "epoch": 0.784181192932527, + "grad_norm": 1.0915283346077382, + "learning_rate": 4.6471238305396056e-05, + "loss": 0.3757, + "step": 6613 + }, + { + "epoch": 0.784299774694652, + "grad_norm": 1.116887846202958, + "learning_rate": 4.647000867434588e-05, + "loss": 0.4114, + "step": 6614 + }, + { + "epoch": 0.7844183564567769, + "grad_norm": 1.2776495097600167, + "learning_rate": 4.6468778845368566e-05, + "loss": 0.3584, + "step": 6615 + }, + { + "epoch": 0.7845369382189019, + "grad_norm": 1.1430431398979126, + "learning_rate": 4.646754881847543e-05, + "loss": 0.3368, + "step": 6616 + }, + { + "epoch": 0.7846555199810269, + "grad_norm": 1.1785757549882503, + "learning_rate": 4.646631859367782e-05, + "loss": 0.2818, + "step": 6617 + }, + { + "epoch": 0.7847741017431519, + "grad_norm": 1.0665188018319987, + "learning_rate": 4.6465088170987076e-05, + "loss": 0.3598, + "step": 6618 + }, + { + "epoch": 0.7848926835052769, + "grad_norm": 1.5582696771865934, + "learning_rate": 4.646385755041454e-05, + "loss": 0.546, + "step": 6619 + }, + { + "epoch": 0.7850112652674018, + "grad_norm": 1.4715024267403254, + "learning_rate": 4.646262673197156e-05, + "loss": 0.3677, + "step": 6620 + }, + { + "epoch": 0.7851298470295268, + "grad_norm": 1.301571737301572, + "learning_rate": 4.646139571566949e-05, + "loss": 0.4136, + "step": 6621 + }, + { + "epoch": 0.7852484287916518, + "grad_norm": 1.5401951086643946, + "learning_rate": 4.646016450151966e-05, + "loss": 0.5287, + "step": 6622 + }, + { + "epoch": 0.7853670105537768, + "grad_norm": 1.2142059957851523, + "learning_rate": 4.6458933089533416e-05, + "loss": 0.3049, + "step": 6623 + }, + { + "epoch": 0.7854855923159018, + "grad_norm": 1.6300977595146753, + "learning_rate": 4.645770147972214e-05, + "loss": 0.4982, + "step": 6624 + }, + { + "epoch": 0.7856041740780269, + "grad_norm": 1.4395547715528687, + "learning_rate": 4.645646967209717e-05, + "loss": 0.3524, + "step": 6625 + }, + { + "epoch": 0.7857227558401518, + "grad_norm": 1.8461863985088411, + "learning_rate": 4.645523766666986e-05, + "loss": 0.597, + "step": 6626 + }, + { + "epoch": 0.7858413376022768, + "grad_norm": 1.7274946257097235, + "learning_rate": 4.645400546345157e-05, + "loss": 0.593, + "step": 6627 + }, + { + "epoch": 0.7859599193644018, + "grad_norm": 1.7788116565995369, + "learning_rate": 4.6452773062453646e-05, + "loss": 0.5449, + "step": 6628 + }, + { + "epoch": 0.7860785011265268, + "grad_norm": 1.2055902326119754, + "learning_rate": 4.645154046368747e-05, + "loss": 0.3368, + "step": 6629 + }, + { + "epoch": 0.7861970828886518, + "grad_norm": 1.4037112010367212, + "learning_rate": 4.645030766716441e-05, + "loss": 0.5296, + "step": 6630 + }, + { + "epoch": 0.7863156646507767, + "grad_norm": 1.0622710806818882, + "learning_rate": 4.64490746728958e-05, + "loss": 0.3247, + "step": 6631 + }, + { + "epoch": 0.7864342464129017, + "grad_norm": 1.961402237177919, + "learning_rate": 4.644784148089303e-05, + "loss": 0.5242, + "step": 6632 + }, + { + "epoch": 0.7865528281750267, + "grad_norm": 1.7623930305234956, + "learning_rate": 4.6446608091167456e-05, + "loss": 0.4759, + "step": 6633 + }, + { + "epoch": 0.7866714099371517, + "grad_norm": 1.6666386109048865, + "learning_rate": 4.644537450373046e-05, + "loss": 0.3788, + "step": 6634 + }, + { + "epoch": 0.7867899916992767, + "grad_norm": 1.6457684504133057, + "learning_rate": 4.644414071859341e-05, + "loss": 0.5141, + "step": 6635 + }, + { + "epoch": 0.7869085734614016, + "grad_norm": 1.385959081706645, + "learning_rate": 4.644290673576768e-05, + "loss": 0.4218, + "step": 6636 + }, + { + "epoch": 0.7870271552235266, + "grad_norm": 1.1663279395481374, + "learning_rate": 4.6441672555264645e-05, + "loss": 0.3106, + "step": 6637 + }, + { + "epoch": 0.7871457369856516, + "grad_norm": 1.1331546249382438, + "learning_rate": 4.644043817709568e-05, + "loss": 0.3627, + "step": 6638 + }, + { + "epoch": 0.7872643187477766, + "grad_norm": 1.1383862064615475, + "learning_rate": 4.643920360127217e-05, + "loss": 0.3647, + "step": 6639 + }, + { + "epoch": 0.7873829005099016, + "grad_norm": 1.7257923375862236, + "learning_rate": 4.6437968827805486e-05, + "loss": 0.446, + "step": 6640 + }, + { + "epoch": 0.7875014822720265, + "grad_norm": 1.160297109887058, + "learning_rate": 4.6436733856707025e-05, + "loss": 0.3328, + "step": 6641 + }, + { + "epoch": 0.7876200640341515, + "grad_norm": 1.454392590721401, + "learning_rate": 4.643549868798817e-05, + "loss": 0.413, + "step": 6642 + }, + { + "epoch": 0.7877386457962765, + "grad_norm": 1.2750985393389926, + "learning_rate": 4.643426332166029e-05, + "loss": 0.3623, + "step": 6643 + }, + { + "epoch": 0.7878572275584015, + "grad_norm": 1.6313985992965347, + "learning_rate": 4.643302775773479e-05, + "loss": 0.4306, + "step": 6644 + }, + { + "epoch": 0.7879758093205265, + "grad_norm": 1.314634510862468, + "learning_rate": 4.643179199622307e-05, + "loss": 0.4179, + "step": 6645 + }, + { + "epoch": 0.7880943910826514, + "grad_norm": 1.0705459733454226, + "learning_rate": 4.6430556037136495e-05, + "loss": 0.297, + "step": 6646 + }, + { + "epoch": 0.7882129728447764, + "grad_norm": 1.3726886085215728, + "learning_rate": 4.642931988048648e-05, + "loss": 0.3636, + "step": 6647 + }, + { + "epoch": 0.7883315546069014, + "grad_norm": 1.2046000989361858, + "learning_rate": 4.642808352628442e-05, + "loss": 0.351, + "step": 6648 + }, + { + "epoch": 0.7884501363690264, + "grad_norm": 1.3509071657805323, + "learning_rate": 4.6426846974541696e-05, + "loss": 0.438, + "step": 6649 + }, + { + "epoch": 0.7885687181311515, + "grad_norm": 1.4352684943496476, + "learning_rate": 4.642561022526972e-05, + "loss": 0.4469, + "step": 6650 + }, + { + "epoch": 0.7886872998932765, + "grad_norm": 1.2221690424330958, + "learning_rate": 4.642437327847989e-05, + "loss": 0.3184, + "step": 6651 + }, + { + "epoch": 0.7888058816554014, + "grad_norm": 1.2661749258692732, + "learning_rate": 4.642313613418361e-05, + "loss": 0.3689, + "step": 6652 + }, + { + "epoch": 0.7889244634175264, + "grad_norm": 1.5911924899119352, + "learning_rate": 4.642189879239229e-05, + "loss": 0.4863, + "step": 6653 + }, + { + "epoch": 0.7890430451796514, + "grad_norm": 1.922344576928574, + "learning_rate": 4.642066125311733e-05, + "loss": 0.5796, + "step": 6654 + }, + { + "epoch": 0.7891616269417764, + "grad_norm": 1.8658967166874785, + "learning_rate": 4.6419423516370144e-05, + "loss": 0.4865, + "step": 6655 + }, + { + "epoch": 0.7892802087039014, + "grad_norm": 1.7106563560318973, + "learning_rate": 4.641818558216214e-05, + "loss": 0.5357, + "step": 6656 + }, + { + "epoch": 0.7893987904660263, + "grad_norm": 1.417282263547433, + "learning_rate": 4.641694745050473e-05, + "loss": 0.4297, + "step": 6657 + }, + { + "epoch": 0.7895173722281513, + "grad_norm": 1.7981016934455447, + "learning_rate": 4.641570912140932e-05, + "loss": 0.4232, + "step": 6658 + }, + { + "epoch": 0.7896359539902763, + "grad_norm": 1.4458587949690043, + "learning_rate": 4.641447059488734e-05, + "loss": 0.4114, + "step": 6659 + }, + { + "epoch": 0.7897545357524013, + "grad_norm": 0.9917431294155186, + "learning_rate": 4.64132318709502e-05, + "loss": 0.253, + "step": 6660 + }, + { + "epoch": 0.7898731175145263, + "grad_norm": 1.2872793736742065, + "learning_rate": 4.6411992949609315e-05, + "loss": 0.4621, + "step": 6661 + }, + { + "epoch": 0.7899916992766512, + "grad_norm": 1.2676016765212923, + "learning_rate": 4.6410753830876123e-05, + "loss": 0.328, + "step": 6662 + }, + { + "epoch": 0.7901102810387762, + "grad_norm": 1.5118165672069042, + "learning_rate": 4.640951451476203e-05, + "loss": 0.5385, + "step": 6663 + }, + { + "epoch": 0.7902288628009012, + "grad_norm": 1.3920626060264858, + "learning_rate": 4.640827500127847e-05, + "loss": 0.5197, + "step": 6664 + }, + { + "epoch": 0.7903474445630262, + "grad_norm": 1.4117466536387884, + "learning_rate": 4.6407035290436864e-05, + "loss": 0.4164, + "step": 6665 + }, + { + "epoch": 0.7904660263251512, + "grad_norm": 1.5723947483681118, + "learning_rate": 4.640579538224865e-05, + "loss": 0.5232, + "step": 6666 + }, + { + "epoch": 0.7905846080872762, + "grad_norm": 1.2088978653510445, + "learning_rate": 4.640455527672525e-05, + "loss": 0.3552, + "step": 6667 + }, + { + "epoch": 0.7907031898494011, + "grad_norm": 1.2753403078876697, + "learning_rate": 4.6403314973878084e-05, + "loss": 0.4394, + "step": 6668 + }, + { + "epoch": 0.7908217716115261, + "grad_norm": 1.666791683833038, + "learning_rate": 4.6402074473718615e-05, + "loss": 0.6381, + "step": 6669 + }, + { + "epoch": 0.7909403533736511, + "grad_norm": 1.0753106163391204, + "learning_rate": 4.6400833776258264e-05, + "loss": 0.3624, + "step": 6670 + }, + { + "epoch": 0.7910589351357761, + "grad_norm": 1.345230400217538, + "learning_rate": 4.639959288150847e-05, + "loss": 0.482, + "step": 6671 + }, + { + "epoch": 0.7911775168979011, + "grad_norm": 1.251326785063224, + "learning_rate": 4.639835178948066e-05, + "loss": 0.3132, + "step": 6672 + }, + { + "epoch": 0.791296098660026, + "grad_norm": 1.3020222012003455, + "learning_rate": 4.63971105001863e-05, + "loss": 0.4366, + "step": 6673 + }, + { + "epoch": 0.791414680422151, + "grad_norm": 1.586571164742949, + "learning_rate": 4.639586901363682e-05, + "loss": 0.4924, + "step": 6674 + }, + { + "epoch": 0.7915332621842761, + "grad_norm": 1.431575939082728, + "learning_rate": 4.6394627329843654e-05, + "loss": 0.4961, + "step": 6675 + }, + { + "epoch": 0.7916518439464011, + "grad_norm": 1.220244542300123, + "learning_rate": 4.6393385448818264e-05, + "loss": 0.4741, + "step": 6676 + }, + { + "epoch": 0.7917704257085261, + "grad_norm": 1.6706780264398904, + "learning_rate": 4.63921433705721e-05, + "loss": 0.5741, + "step": 6677 + }, + { + "epoch": 0.791889007470651, + "grad_norm": 1.1431285251770469, + "learning_rate": 4.63909010951166e-05, + "loss": 0.3353, + "step": 6678 + }, + { + "epoch": 0.792007589232776, + "grad_norm": 1.1266289472718634, + "learning_rate": 4.638965862246323e-05, + "loss": 0.3645, + "step": 6679 + }, + { + "epoch": 0.792126170994901, + "grad_norm": 1.1443844818166071, + "learning_rate": 4.638841595262343e-05, + "loss": 0.3593, + "step": 6680 + }, + { + "epoch": 0.792244752757026, + "grad_norm": 1.746237188602518, + "learning_rate": 4.638717308560867e-05, + "loss": 0.4405, + "step": 6681 + }, + { + "epoch": 0.792363334519151, + "grad_norm": 1.8852425773643846, + "learning_rate": 4.6385930021430396e-05, + "loss": 0.4881, + "step": 6682 + }, + { + "epoch": 0.792481916281276, + "grad_norm": 1.3265351915529169, + "learning_rate": 4.638468676010007e-05, + "loss": 0.3897, + "step": 6683 + }, + { + "epoch": 0.7926004980434009, + "grad_norm": 1.3100356335585341, + "learning_rate": 4.638344330162916e-05, + "loss": 0.4241, + "step": 6684 + }, + { + "epoch": 0.7927190798055259, + "grad_norm": 1.7136016588992626, + "learning_rate": 4.6382199646029124e-05, + "loss": 0.5109, + "step": 6685 + }, + { + "epoch": 0.7928376615676509, + "grad_norm": 1.5351855879811311, + "learning_rate": 4.638095579331143e-05, + "loss": 0.4514, + "step": 6686 + }, + { + "epoch": 0.7929562433297759, + "grad_norm": 1.4090800811536346, + "learning_rate": 4.637971174348754e-05, + "loss": 0.4267, + "step": 6687 + }, + { + "epoch": 0.7930748250919009, + "grad_norm": 1.2238600826879646, + "learning_rate": 4.637846749656893e-05, + "loss": 0.3288, + "step": 6688 + }, + { + "epoch": 0.7931934068540258, + "grad_norm": 1.6666269896008972, + "learning_rate": 4.637722305256706e-05, + "loss": 0.6522, + "step": 6689 + }, + { + "epoch": 0.7933119886161508, + "grad_norm": 1.336914280757268, + "learning_rate": 4.637597841149341e-05, + "loss": 0.3779, + "step": 6690 + }, + { + "epoch": 0.7934305703782758, + "grad_norm": 1.3165339599593469, + "learning_rate": 4.6374733573359453e-05, + "loss": 0.3525, + "step": 6691 + }, + { + "epoch": 0.7935491521404008, + "grad_norm": 1.4495846181705692, + "learning_rate": 4.637348853817666e-05, + "loss": 0.4622, + "step": 6692 + }, + { + "epoch": 0.7936677339025258, + "grad_norm": 1.2397410901664694, + "learning_rate": 4.637224330595652e-05, + "loss": 0.4024, + "step": 6693 + }, + { + "epoch": 0.7937863156646507, + "grad_norm": 2.178579944601011, + "learning_rate": 4.63709978767105e-05, + "loss": 0.6014, + "step": 6694 + }, + { + "epoch": 0.7939048974267757, + "grad_norm": 1.6911426207568345, + "learning_rate": 4.6369752250450085e-05, + "loss": 0.6074, + "step": 6695 + }, + { + "epoch": 0.7940234791889007, + "grad_norm": 1.2733973553725428, + "learning_rate": 4.636850642718677e-05, + "loss": 0.4048, + "step": 6696 + }, + { + "epoch": 0.7941420609510257, + "grad_norm": 1.2705817896073603, + "learning_rate": 4.636726040693201e-05, + "loss": 0.4256, + "step": 6697 + }, + { + "epoch": 0.7942606427131507, + "grad_norm": 1.26153417010475, + "learning_rate": 4.6366014189697326e-05, + "loss": 0.4381, + "step": 6698 + }, + { + "epoch": 0.7943792244752758, + "grad_norm": 1.4458932724137932, + "learning_rate": 4.636476777549418e-05, + "loss": 0.4574, + "step": 6699 + }, + { + "epoch": 0.7944978062374007, + "grad_norm": 1.3847718982744737, + "learning_rate": 4.636352116433409e-05, + "loss": 0.4626, + "step": 6700 + }, + { + "epoch": 0.7946163879995257, + "grad_norm": 1.1733619221177414, + "learning_rate": 4.636227435622852e-05, + "loss": 0.3841, + "step": 6701 + }, + { + "epoch": 0.7947349697616507, + "grad_norm": 1.4828340460944935, + "learning_rate": 4.636102735118899e-05, + "loss": 0.4184, + "step": 6702 + }, + { + "epoch": 0.7948535515237757, + "grad_norm": 1.3719508096530924, + "learning_rate": 4.6359780149226966e-05, + "loss": 0.5399, + "step": 6703 + }, + { + "epoch": 0.7949721332859007, + "grad_norm": 1.1755658468001415, + "learning_rate": 4.635853275035397e-05, + "loss": 0.3341, + "step": 6704 + }, + { + "epoch": 0.7950907150480256, + "grad_norm": 1.3386974465661787, + "learning_rate": 4.63572851545815e-05, + "loss": 0.3725, + "step": 6705 + }, + { + "epoch": 0.7952092968101506, + "grad_norm": 1.1763790099697307, + "learning_rate": 4.635603736192104e-05, + "loss": 0.3718, + "step": 6706 + }, + { + "epoch": 0.7953278785722756, + "grad_norm": 1.263641923261262, + "learning_rate": 4.635478937238411e-05, + "loss": 0.3392, + "step": 6707 + }, + { + "epoch": 0.7954464603344006, + "grad_norm": 1.4254586789644557, + "learning_rate": 4.6353541185982206e-05, + "loss": 0.4604, + "step": 6708 + }, + { + "epoch": 0.7955650420965256, + "grad_norm": 1.365725702813229, + "learning_rate": 4.635229280272684e-05, + "loss": 0.3901, + "step": 6709 + }, + { + "epoch": 0.7956836238586505, + "grad_norm": 1.6291768065748808, + "learning_rate": 4.635104422262952e-05, + "loss": 0.3718, + "step": 6710 + }, + { + "epoch": 0.7958022056207755, + "grad_norm": 1.486035358264411, + "learning_rate": 4.634979544570175e-05, + "loss": 0.3896, + "step": 6711 + }, + { + "epoch": 0.7959207873829005, + "grad_norm": 1.1973979446494234, + "learning_rate": 4.6348546471955046e-05, + "loss": 0.3376, + "step": 6712 + }, + { + "epoch": 0.7960393691450255, + "grad_norm": 1.2559156132832991, + "learning_rate": 4.634729730140093e-05, + "loss": 0.3614, + "step": 6713 + }, + { + "epoch": 0.7961579509071505, + "grad_norm": 1.2888361019704597, + "learning_rate": 4.634604793405091e-05, + "loss": 0.2916, + "step": 6714 + }, + { + "epoch": 0.7962765326692754, + "grad_norm": 1.4107957533908084, + "learning_rate": 4.63447983699165e-05, + "loss": 0.4299, + "step": 6715 + }, + { + "epoch": 0.7963951144314004, + "grad_norm": 2.2125795983768137, + "learning_rate": 4.634354860900923e-05, + "loss": 0.53, + "step": 6716 + }, + { + "epoch": 0.7965136961935254, + "grad_norm": 1.5867592989727513, + "learning_rate": 4.63422986513406e-05, + "loss": 0.4669, + "step": 6717 + }, + { + "epoch": 0.7966322779556504, + "grad_norm": 1.8583076776464382, + "learning_rate": 4.634104849692216e-05, + "loss": 0.734, + "step": 6718 + }, + { + "epoch": 0.7967508597177754, + "grad_norm": 1.3744373514353871, + "learning_rate": 4.633979814576542e-05, + "loss": 0.3466, + "step": 6719 + }, + { + "epoch": 0.7968694414799004, + "grad_norm": 1.64428214350906, + "learning_rate": 4.633854759788191e-05, + "loss": 0.5211, + "step": 6720 + }, + { + "epoch": 0.7969880232420253, + "grad_norm": 1.343838823858824, + "learning_rate": 4.633729685328316e-05, + "loss": 0.33, + "step": 6721 + }, + { + "epoch": 0.7971066050041503, + "grad_norm": 1.439220607769147, + "learning_rate": 4.63360459119807e-05, + "loss": 0.5244, + "step": 6722 + }, + { + "epoch": 0.7972251867662753, + "grad_norm": 1.4946768571618467, + "learning_rate": 4.633479477398606e-05, + "loss": 0.5399, + "step": 6723 + }, + { + "epoch": 0.7973437685284004, + "grad_norm": 1.3879956847294836, + "learning_rate": 4.633354343931077e-05, + "loss": 0.4467, + "step": 6724 + }, + { + "epoch": 0.7974623502905254, + "grad_norm": 2.030469936187782, + "learning_rate": 4.633229190796637e-05, + "loss": 0.8327, + "step": 6725 + }, + { + "epoch": 0.7975809320526503, + "grad_norm": 1.2631796234751382, + "learning_rate": 4.6331040179964405e-05, + "loss": 0.4938, + "step": 6726 + }, + { + "epoch": 0.7976995138147753, + "grad_norm": 1.667410457936118, + "learning_rate": 4.6329788255316397e-05, + "loss": 0.6807, + "step": 6727 + }, + { + "epoch": 0.7978180955769003, + "grad_norm": 1.2729248595532918, + "learning_rate": 4.6328536134033905e-05, + "loss": 0.3497, + "step": 6728 + }, + { + "epoch": 0.7979366773390253, + "grad_norm": 1.0734125280368987, + "learning_rate": 4.632728381612846e-05, + "loss": 0.4101, + "step": 6729 + }, + { + "epoch": 0.7980552591011503, + "grad_norm": 1.631379794346881, + "learning_rate": 4.6326031301611615e-05, + "loss": 0.6069, + "step": 6730 + }, + { + "epoch": 0.7981738408632753, + "grad_norm": 1.1221636771970052, + "learning_rate": 4.632477859049492e-05, + "loss": 0.351, + "step": 6731 + }, + { + "epoch": 0.7982924226254002, + "grad_norm": 1.1297512728667818, + "learning_rate": 4.6323525682789904e-05, + "loss": 0.3994, + "step": 6732 + }, + { + "epoch": 0.7984110043875252, + "grad_norm": 1.0434640166474354, + "learning_rate": 4.632227257850813e-05, + "loss": 0.3483, + "step": 6733 + }, + { + "epoch": 0.7985295861496502, + "grad_norm": 1.8741624457167243, + "learning_rate": 4.632101927766116e-05, + "loss": 0.7487, + "step": 6734 + }, + { + "epoch": 0.7986481679117752, + "grad_norm": 1.5213760539938963, + "learning_rate": 4.6319765780260535e-05, + "loss": 0.4414, + "step": 6735 + }, + { + "epoch": 0.7987667496739002, + "grad_norm": 1.3818962983760439, + "learning_rate": 4.6318512086317814e-05, + "loss": 0.4314, + "step": 6736 + }, + { + "epoch": 0.7988853314360251, + "grad_norm": 1.112353064202862, + "learning_rate": 4.631725819584455e-05, + "loss": 0.4016, + "step": 6737 + }, + { + "epoch": 0.7990039131981501, + "grad_norm": 1.6388235361765107, + "learning_rate": 4.6316004108852305e-05, + "loss": 0.4274, + "step": 6738 + }, + { + "epoch": 0.7991224949602751, + "grad_norm": 1.0504024429323928, + "learning_rate": 4.631474982535265e-05, + "loss": 0.3547, + "step": 6739 + }, + { + "epoch": 0.7992410767224001, + "grad_norm": 1.4512305876863028, + "learning_rate": 4.631349534535713e-05, + "loss": 0.5515, + "step": 6740 + }, + { + "epoch": 0.7993596584845251, + "grad_norm": 1.324672131983842, + "learning_rate": 4.6312240668877324e-05, + "loss": 0.4533, + "step": 6741 + }, + { + "epoch": 0.79947824024665, + "grad_norm": 1.1436706263509928, + "learning_rate": 4.631098579592479e-05, + "loss": 0.3875, + "step": 6742 + }, + { + "epoch": 0.799596822008775, + "grad_norm": 1.6441173047516482, + "learning_rate": 4.6309730726511106e-05, + "loss": 0.536, + "step": 6743 + }, + { + "epoch": 0.7997154037709, + "grad_norm": 1.091040571826638, + "learning_rate": 4.630847546064784e-05, + "loss": 0.2915, + "step": 6744 + }, + { + "epoch": 0.799833985533025, + "grad_norm": 1.4804899540293577, + "learning_rate": 4.630721999834655e-05, + "loss": 0.411, + "step": 6745 + }, + { + "epoch": 0.79995256729515, + "grad_norm": 1.2747105966455636, + "learning_rate": 4.6305964339618824e-05, + "loss": 0.3576, + "step": 6746 + }, + { + "epoch": 0.8000711490572749, + "grad_norm": 1.425162608608509, + "learning_rate": 4.6304708484476235e-05, + "loss": 0.4836, + "step": 6747 + }, + { + "epoch": 0.8001897308193999, + "grad_norm": 1.3921251442110798, + "learning_rate": 4.630345243293036e-05, + "loss": 0.478, + "step": 6748 + }, + { + "epoch": 0.800308312581525, + "grad_norm": 1.4628555432334376, + "learning_rate": 4.6302196184992776e-05, + "loss": 0.4291, + "step": 6749 + }, + { + "epoch": 0.80042689434365, + "grad_norm": 1.690504358428121, + "learning_rate": 4.630093974067506e-05, + "loss": 0.4614, + "step": 6750 + }, + { + "epoch": 0.800545476105775, + "grad_norm": 1.3294904304974926, + "learning_rate": 4.629968309998881e-05, + "loss": 0.3572, + "step": 6751 + }, + { + "epoch": 0.8006640578679, + "grad_norm": 1.696175808254425, + "learning_rate": 4.62984262629456e-05, + "loss": 0.4245, + "step": 6752 + }, + { + "epoch": 0.8007826396300249, + "grad_norm": 1.2519467803042679, + "learning_rate": 4.629716922955701e-05, + "loss": 0.3736, + "step": 6753 + }, + { + "epoch": 0.8009012213921499, + "grad_norm": 1.2912918378135467, + "learning_rate": 4.6295911999834643e-05, + "loss": 0.3877, + "step": 6754 + }, + { + "epoch": 0.8010198031542749, + "grad_norm": 1.321579796961692, + "learning_rate": 4.629465457379008e-05, + "loss": 0.4068, + "step": 6755 + }, + { + "epoch": 0.8011383849163999, + "grad_norm": 1.5981171486208698, + "learning_rate": 4.629339695143492e-05, + "loss": 0.4079, + "step": 6756 + }, + { + "epoch": 0.8012569666785249, + "grad_norm": 1.1584180896054728, + "learning_rate": 4.6292139132780746e-05, + "loss": 0.389, + "step": 6757 + }, + { + "epoch": 0.8013755484406498, + "grad_norm": 1.208777134320982, + "learning_rate": 4.629088111783916e-05, + "loss": 0.4297, + "step": 6758 + }, + { + "epoch": 0.8014941302027748, + "grad_norm": 1.9780625139852064, + "learning_rate": 4.628962290662177e-05, + "loss": 0.501, + "step": 6759 + }, + { + "epoch": 0.8016127119648998, + "grad_norm": 1.4860800296803933, + "learning_rate": 4.628836449914015e-05, + "loss": 0.4463, + "step": 6760 + }, + { + "epoch": 0.8017312937270248, + "grad_norm": 1.8130228353690667, + "learning_rate": 4.6287105895405914e-05, + "loss": 0.6035, + "step": 6761 + }, + { + "epoch": 0.8018498754891498, + "grad_norm": 1.2689596267660956, + "learning_rate": 4.628584709543067e-05, + "loss": 0.3198, + "step": 6762 + }, + { + "epoch": 0.8019684572512747, + "grad_norm": 0.9467343406728173, + "learning_rate": 4.6284588099226015e-05, + "loss": 0.2962, + "step": 6763 + }, + { + "epoch": 0.8020870390133997, + "grad_norm": 1.2152850459659028, + "learning_rate": 4.6283328906803564e-05, + "loss": 0.3632, + "step": 6764 + }, + { + "epoch": 0.8022056207755247, + "grad_norm": 1.2096351643534689, + "learning_rate": 4.6282069518174917e-05, + "loss": 0.4263, + "step": 6765 + }, + { + "epoch": 0.8023242025376497, + "grad_norm": 1.0272148183248624, + "learning_rate": 4.628080993335169e-05, + "loss": 0.2338, + "step": 6766 + }, + { + "epoch": 0.8024427842997747, + "grad_norm": 1.3215016855140491, + "learning_rate": 4.627955015234548e-05, + "loss": 0.3884, + "step": 6767 + }, + { + "epoch": 0.8025613660618997, + "grad_norm": 1.4910180206463592, + "learning_rate": 4.6278290175167924e-05, + "loss": 0.4355, + "step": 6768 + }, + { + "epoch": 0.8026799478240246, + "grad_norm": 1.1596409284079148, + "learning_rate": 4.627703000183062e-05, + "loss": 0.367, + "step": 6769 + }, + { + "epoch": 0.8027985295861496, + "grad_norm": 1.8307524181095074, + "learning_rate": 4.62757696323452e-05, + "loss": 0.6003, + "step": 6770 + }, + { + "epoch": 0.8029171113482746, + "grad_norm": 1.049280438150683, + "learning_rate": 4.627450906672327e-05, + "loss": 0.3249, + "step": 6771 + }, + { + "epoch": 0.8030356931103996, + "grad_norm": 1.4884200270717227, + "learning_rate": 4.627324830497645e-05, + "loss": 0.41, + "step": 6772 + }, + { + "epoch": 0.8031542748725246, + "grad_norm": 1.182004658549509, + "learning_rate": 4.6271987347116376e-05, + "loss": 0.4377, + "step": 6773 + }, + { + "epoch": 0.8032728566346496, + "grad_norm": 1.1774188919633044, + "learning_rate": 4.627072619315466e-05, + "loss": 0.3573, + "step": 6774 + }, + { + "epoch": 0.8033914383967746, + "grad_norm": 1.3717639480403467, + "learning_rate": 4.626946484310293e-05, + "loss": 0.3323, + "step": 6775 + }, + { + "epoch": 0.8035100201588996, + "grad_norm": 1.3581740389125025, + "learning_rate": 4.6268203296972814e-05, + "loss": 0.326, + "step": 6776 + }, + { + "epoch": 0.8036286019210246, + "grad_norm": 1.1182919662356705, + "learning_rate": 4.626694155477595e-05, + "loss": 0.3199, + "step": 6777 + }, + { + "epoch": 0.8037471836831496, + "grad_norm": 1.8229771363188922, + "learning_rate": 4.626567961652396e-05, + "loss": 0.5005, + "step": 6778 + }, + { + "epoch": 0.8038657654452745, + "grad_norm": 1.0649165194462038, + "learning_rate": 4.6264417482228485e-05, + "loss": 0.2618, + "step": 6779 + }, + { + "epoch": 0.8039843472073995, + "grad_norm": 1.390740927899871, + "learning_rate": 4.6263155151901154e-05, + "loss": 0.3322, + "step": 6780 + }, + { + "epoch": 0.8041029289695245, + "grad_norm": 1.3957716576277635, + "learning_rate": 4.6261892625553615e-05, + "loss": 0.3581, + "step": 6781 + }, + { + "epoch": 0.8042215107316495, + "grad_norm": 1.1368453091336201, + "learning_rate": 4.626062990319749e-05, + "loss": 0.3366, + "step": 6782 + }, + { + "epoch": 0.8043400924937745, + "grad_norm": 1.8273001753181397, + "learning_rate": 4.6259366984844435e-05, + "loss": 0.6242, + "step": 6783 + }, + { + "epoch": 0.8044586742558995, + "grad_norm": 1.2600447884060986, + "learning_rate": 4.625810387050608e-05, + "loss": 0.3155, + "step": 6784 + }, + { + "epoch": 0.8045772560180244, + "grad_norm": 2.3890332584770415, + "learning_rate": 4.625684056019407e-05, + "loss": 0.6922, + "step": 6785 + }, + { + "epoch": 0.8046958377801494, + "grad_norm": 1.5823722143213492, + "learning_rate": 4.625557705392007e-05, + "loss": 0.392, + "step": 6786 + }, + { + "epoch": 0.8048144195422744, + "grad_norm": 1.7098485650327264, + "learning_rate": 4.625431335169571e-05, + "loss": 0.341, + "step": 6787 + }, + { + "epoch": 0.8049330013043994, + "grad_norm": 1.6615347613720943, + "learning_rate": 4.625304945353265e-05, + "loss": 0.5365, + "step": 6788 + }, + { + "epoch": 0.8050515830665244, + "grad_norm": 1.8418967655655545, + "learning_rate": 4.6251785359442526e-05, + "loss": 0.4528, + "step": 6789 + }, + { + "epoch": 0.8051701648286493, + "grad_norm": 1.198112842999033, + "learning_rate": 4.6250521069437014e-05, + "loss": 0.3127, + "step": 6790 + }, + { + "epoch": 0.8052887465907743, + "grad_norm": 1.4498693372491798, + "learning_rate": 4.6249256583527744e-05, + "loss": 0.4799, + "step": 6791 + }, + { + "epoch": 0.8054073283528993, + "grad_norm": 1.4145148818523667, + "learning_rate": 4.62479919017264e-05, + "loss": 0.3954, + "step": 6792 + }, + { + "epoch": 0.8055259101150243, + "grad_norm": 1.2566889507502206, + "learning_rate": 4.624672702404461e-05, + "loss": 0.366, + "step": 6793 + }, + { + "epoch": 0.8056444918771493, + "grad_norm": 1.5341680553321753, + "learning_rate": 4.624546195049406e-05, + "loss": 0.5208, + "step": 6794 + }, + { + "epoch": 0.8057630736392742, + "grad_norm": 1.5019981134785798, + "learning_rate": 4.62441966810864e-05, + "loss": 0.4101, + "step": 6795 + }, + { + "epoch": 0.8058816554013992, + "grad_norm": 1.515981419582991, + "learning_rate": 4.624293121583331e-05, + "loss": 0.4551, + "step": 6796 + }, + { + "epoch": 0.8060002371635242, + "grad_norm": 1.2863712799731994, + "learning_rate": 4.624166555474643e-05, + "loss": 0.4005, + "step": 6797 + }, + { + "epoch": 0.8061188189256492, + "grad_norm": 1.8588366002040344, + "learning_rate": 4.624039969783745e-05, + "loss": 0.573, + "step": 6798 + }, + { + "epoch": 0.8062374006877743, + "grad_norm": 1.3573818324239404, + "learning_rate": 4.6239133645118026e-05, + "loss": 0.3359, + "step": 6799 + }, + { + "epoch": 0.8063559824498993, + "grad_norm": 1.218570330752488, + "learning_rate": 4.623786739659984e-05, + "loss": 0.3297, + "step": 6800 + }, + { + "epoch": 0.8064745642120242, + "grad_norm": 2.0186060556263894, + "learning_rate": 4.623660095229456e-05, + "loss": 0.6217, + "step": 6801 + }, + { + "epoch": 0.8065931459741492, + "grad_norm": 1.1611995997099565, + "learning_rate": 4.6235334312213865e-05, + "loss": 0.371, + "step": 6802 + }, + { + "epoch": 0.8067117277362742, + "grad_norm": 1.646205038824545, + "learning_rate": 4.623406747636941e-05, + "loss": 0.5689, + "step": 6803 + }, + { + "epoch": 0.8068303094983992, + "grad_norm": 1.6496057037328447, + "learning_rate": 4.62328004447729e-05, + "loss": 0.5761, + "step": 6804 + }, + { + "epoch": 0.8069488912605242, + "grad_norm": 1.3385475317127633, + "learning_rate": 4.623153321743602e-05, + "loss": 0.4549, + "step": 6805 + }, + { + "epoch": 0.8070674730226491, + "grad_norm": 1.50874515272626, + "learning_rate": 4.623026579437043e-05, + "loss": 0.449, + "step": 6806 + }, + { + "epoch": 0.8071860547847741, + "grad_norm": 1.7168630795434103, + "learning_rate": 4.6228998175587824e-05, + "loss": 0.5428, + "step": 6807 + }, + { + "epoch": 0.8073046365468991, + "grad_norm": 1.45642122619262, + "learning_rate": 4.622773036109989e-05, + "loss": 0.5034, + "step": 6808 + }, + { + "epoch": 0.8074232183090241, + "grad_norm": 1.3175714980746869, + "learning_rate": 4.622646235091831e-05, + "loss": 0.4434, + "step": 6809 + }, + { + "epoch": 0.8075418000711491, + "grad_norm": 1.0743635370924898, + "learning_rate": 4.622519414505478e-05, + "loss": 0.2778, + "step": 6810 + }, + { + "epoch": 0.807660381833274, + "grad_norm": 2.029662397279706, + "learning_rate": 4.6223925743520986e-05, + "loss": 0.5856, + "step": 6811 + }, + { + "epoch": 0.807778963595399, + "grad_norm": 0.9836644394188714, + "learning_rate": 4.6222657146328624e-05, + "loss": 0.3557, + "step": 6812 + }, + { + "epoch": 0.807897545357524, + "grad_norm": 1.3522311663609816, + "learning_rate": 4.6221388353489385e-05, + "loss": 0.5474, + "step": 6813 + }, + { + "epoch": 0.808016127119649, + "grad_norm": 1.4476947608117725, + "learning_rate": 4.622011936501497e-05, + "loss": 0.4398, + "step": 6814 + }, + { + "epoch": 0.808134708881774, + "grad_norm": 1.3965037285273243, + "learning_rate": 4.6218850180917085e-05, + "loss": 0.4168, + "step": 6815 + }, + { + "epoch": 0.808253290643899, + "grad_norm": 1.2598709160885075, + "learning_rate": 4.62175808012074e-05, + "loss": 0.3739, + "step": 6816 + }, + { + "epoch": 0.8083718724060239, + "grad_norm": 1.3622694534504167, + "learning_rate": 4.621631122589766e-05, + "loss": 0.5131, + "step": 6817 + }, + { + "epoch": 0.8084904541681489, + "grad_norm": 1.2664714790914144, + "learning_rate": 4.621504145499954e-05, + "loss": 0.3809, + "step": 6818 + }, + { + "epoch": 0.8086090359302739, + "grad_norm": 1.2767506904838986, + "learning_rate": 4.621377148852476e-05, + "loss": 0.3427, + "step": 6819 + }, + { + "epoch": 0.8087276176923989, + "grad_norm": 1.1508347904751135, + "learning_rate": 4.6212501326485015e-05, + "loss": 0.3688, + "step": 6820 + }, + { + "epoch": 0.8088461994545239, + "grad_norm": 1.3192845946962655, + "learning_rate": 4.621123096889202e-05, + "loss": 0.4747, + "step": 6821 + }, + { + "epoch": 0.8089647812166488, + "grad_norm": 1.1994216502204298, + "learning_rate": 4.620996041575748e-05, + "loss": 0.3925, + "step": 6822 + }, + { + "epoch": 0.8090833629787738, + "grad_norm": 1.2703867564385012, + "learning_rate": 4.620868966709312e-05, + "loss": 0.3827, + "step": 6823 + }, + { + "epoch": 0.8092019447408989, + "grad_norm": 1.584460457119976, + "learning_rate": 4.6207418722910653e-05, + "loss": 0.473, + "step": 6824 + }, + { + "epoch": 0.8093205265030239, + "grad_norm": 1.247271986232191, + "learning_rate": 4.6206147583221785e-05, + "loss": 0.3332, + "step": 6825 + }, + { + "epoch": 0.8094391082651489, + "grad_norm": 1.3532235124007201, + "learning_rate": 4.620487624803824e-05, + "loss": 0.4384, + "step": 6826 + }, + { + "epoch": 0.8095576900272738, + "grad_norm": 1.495863954669008, + "learning_rate": 4.620360471737175e-05, + "loss": 0.4208, + "step": 6827 + }, + { + "epoch": 0.8096762717893988, + "grad_norm": 1.1005020585795202, + "learning_rate": 4.620233299123402e-05, + "loss": 0.3214, + "step": 6828 + }, + { + "epoch": 0.8097948535515238, + "grad_norm": 1.4401486466099782, + "learning_rate": 4.6201061069636775e-05, + "loss": 0.2959, + "step": 6829 + }, + { + "epoch": 0.8099134353136488, + "grad_norm": 1.6438650175049159, + "learning_rate": 4.619978895259176e-05, + "loss": 0.4883, + "step": 6830 + }, + { + "epoch": 0.8100320170757738, + "grad_norm": 1.4991136535956624, + "learning_rate": 4.619851664011067e-05, + "loss": 0.393, + "step": 6831 + }, + { + "epoch": 0.8101505988378987, + "grad_norm": 1.3816487362715175, + "learning_rate": 4.619724413220526e-05, + "loss": 0.4107, + "step": 6832 + }, + { + "epoch": 0.8102691806000237, + "grad_norm": 1.6535599943865003, + "learning_rate": 4.6195971428887256e-05, + "loss": 0.5028, + "step": 6833 + }, + { + "epoch": 0.8103877623621487, + "grad_norm": 1.2260893731413467, + "learning_rate": 4.619469853016838e-05, + "loss": 0.3284, + "step": 6834 + }, + { + "epoch": 0.8105063441242737, + "grad_norm": 2.5253034677050983, + "learning_rate": 4.619342543606038e-05, + "loss": 0.5981, + "step": 6835 + }, + { + "epoch": 0.8106249258863987, + "grad_norm": 1.2429938988860127, + "learning_rate": 4.619215214657498e-05, + "loss": 0.3117, + "step": 6836 + }, + { + "epoch": 0.8107435076485237, + "grad_norm": 1.1821435787255659, + "learning_rate": 4.6190878661723936e-05, + "loss": 0.3213, + "step": 6837 + }, + { + "epoch": 0.8108620894106486, + "grad_norm": 1.6662543864195645, + "learning_rate": 4.618960498151897e-05, + "loss": 0.6369, + "step": 6838 + }, + { + "epoch": 0.8109806711727736, + "grad_norm": 1.133639612588016, + "learning_rate": 4.618833110597183e-05, + "loss": 0.3696, + "step": 6839 + }, + { + "epoch": 0.8110992529348986, + "grad_norm": 1.8596646892366844, + "learning_rate": 4.618705703509426e-05, + "loss": 0.4764, + "step": 6840 + }, + { + "epoch": 0.8112178346970236, + "grad_norm": 1.2308297537799875, + "learning_rate": 4.6185782768898005e-05, + "loss": 0.4001, + "step": 6841 + }, + { + "epoch": 0.8113364164591486, + "grad_norm": 1.2644091652304206, + "learning_rate": 4.6184508307394816e-05, + "loss": 0.3262, + "step": 6842 + }, + { + "epoch": 0.8114549982212735, + "grad_norm": 1.2634073886030825, + "learning_rate": 4.618323365059644e-05, + "loss": 0.3653, + "step": 6843 + }, + { + "epoch": 0.8115735799833985, + "grad_norm": 1.7115150399734311, + "learning_rate": 4.618195879851462e-05, + "loss": 0.5678, + "step": 6844 + }, + { + "epoch": 0.8116921617455235, + "grad_norm": 1.4235026811763483, + "learning_rate": 4.618068375116111e-05, + "loss": 0.443, + "step": 6845 + }, + { + "epoch": 0.8118107435076485, + "grad_norm": 1.3909175873789308, + "learning_rate": 4.617940850854768e-05, + "loss": 0.4015, + "step": 6846 + }, + { + "epoch": 0.8119293252697735, + "grad_norm": 1.5217482587372124, + "learning_rate": 4.617813307068607e-05, + "loss": 0.4855, + "step": 6847 + }, + { + "epoch": 0.8120479070318984, + "grad_norm": 1.220270104125729, + "learning_rate": 4.617685743758805e-05, + "loss": 0.3778, + "step": 6848 + }, + { + "epoch": 0.8121664887940235, + "grad_norm": 1.1134563614733826, + "learning_rate": 4.617558160926537e-05, + "loss": 0.418, + "step": 6849 + }, + { + "epoch": 0.8122850705561485, + "grad_norm": 0.9838504620732718, + "learning_rate": 4.617430558572979e-05, + "loss": 0.3121, + "step": 6850 + }, + { + "epoch": 0.8124036523182735, + "grad_norm": 1.3759878540917931, + "learning_rate": 4.617302936699309e-05, + "loss": 0.4981, + "step": 6851 + }, + { + "epoch": 0.8125222340803985, + "grad_norm": 1.4386836280513957, + "learning_rate": 4.617175295306701e-05, + "loss": 0.4608, + "step": 6852 + }, + { + "epoch": 0.8126408158425235, + "grad_norm": 1.5804106025878415, + "learning_rate": 4.617047634396334e-05, + "loss": 0.7304, + "step": 6853 + }, + { + "epoch": 0.8127593976046484, + "grad_norm": 0.9954793797329403, + "learning_rate": 4.616919953969383e-05, + "loss": 0.333, + "step": 6854 + }, + { + "epoch": 0.8128779793667734, + "grad_norm": 1.4196540831214783, + "learning_rate": 4.6167922540270264e-05, + "loss": 0.5437, + "step": 6855 + }, + { + "epoch": 0.8129965611288984, + "grad_norm": 1.3429692168458418, + "learning_rate": 4.6166645345704404e-05, + "loss": 0.3509, + "step": 6856 + }, + { + "epoch": 0.8131151428910234, + "grad_norm": 1.168199214597201, + "learning_rate": 4.6165367956008046e-05, + "loss": 0.3219, + "step": 6857 + }, + { + "epoch": 0.8132337246531484, + "grad_norm": 1.1000093113469422, + "learning_rate": 4.616409037119294e-05, + "loss": 0.3962, + "step": 6858 + }, + { + "epoch": 0.8133523064152733, + "grad_norm": 2.065214904002503, + "learning_rate": 4.616281259127087e-05, + "loss": 0.4488, + "step": 6859 + }, + { + "epoch": 0.8134708881773983, + "grad_norm": 1.0881223381663112, + "learning_rate": 4.6161534616253625e-05, + "loss": 0.3594, + "step": 6860 + }, + { + "epoch": 0.8135894699395233, + "grad_norm": 1.221095541680054, + "learning_rate": 4.616025644615298e-05, + "loss": 0.3308, + "step": 6861 + }, + { + "epoch": 0.8137080517016483, + "grad_norm": 2.051796715699612, + "learning_rate": 4.6158978080980717e-05, + "loss": 0.7428, + "step": 6862 + }, + { + "epoch": 0.8138266334637733, + "grad_norm": 1.3447349237850492, + "learning_rate": 4.615769952074862e-05, + "loss": 0.3481, + "step": 6863 + }, + { + "epoch": 0.8139452152258982, + "grad_norm": 1.1663130974185572, + "learning_rate": 4.615642076546849e-05, + "loss": 0.3472, + "step": 6864 + }, + { + "epoch": 0.8140637969880232, + "grad_norm": 1.3202199780962067, + "learning_rate": 4.61551418151521e-05, + "loss": 0.4315, + "step": 6865 + }, + { + "epoch": 0.8141823787501482, + "grad_norm": 1.5380359064610487, + "learning_rate": 4.615386266981124e-05, + "loss": 0.4525, + "step": 6866 + }, + { + "epoch": 0.8143009605122732, + "grad_norm": 1.2127741088355166, + "learning_rate": 4.615258332945771e-05, + "loss": 0.28, + "step": 6867 + }, + { + "epoch": 0.8144195422743982, + "grad_norm": 1.3933895424341884, + "learning_rate": 4.6151303794103296e-05, + "loss": 0.4104, + "step": 6868 + }, + { + "epoch": 0.8145381240365231, + "grad_norm": 1.4148077766710137, + "learning_rate": 4.61500240637598e-05, + "loss": 0.4638, + "step": 6869 + }, + { + "epoch": 0.8146567057986481, + "grad_norm": 1.0787234244221315, + "learning_rate": 4.6148744138439025e-05, + "loss": 0.2766, + "step": 6870 + }, + { + "epoch": 0.8147752875607731, + "grad_norm": 1.2476899152375787, + "learning_rate": 4.6147464018152756e-05, + "loss": 0.2847, + "step": 6871 + }, + { + "epoch": 0.8148938693228981, + "grad_norm": 1.1583569318640041, + "learning_rate": 4.61461837029128e-05, + "loss": 0.2996, + "step": 6872 + }, + { + "epoch": 0.8150124510850231, + "grad_norm": 1.7182155188424093, + "learning_rate": 4.614490319273097e-05, + "loss": 0.5489, + "step": 6873 + }, + { + "epoch": 0.8151310328471482, + "grad_norm": 2.2439345219015663, + "learning_rate": 4.6143622487619056e-05, + "loss": 0.8763, + "step": 6874 + }, + { + "epoch": 0.8152496146092731, + "grad_norm": 1.2263807428693767, + "learning_rate": 4.6142341587588876e-05, + "loss": 0.3048, + "step": 6875 + }, + { + "epoch": 0.8153681963713981, + "grad_norm": 1.3209078645695385, + "learning_rate": 4.614106049265223e-05, + "loss": 0.4705, + "step": 6876 + }, + { + "epoch": 0.8154867781335231, + "grad_norm": 1.3214971876632662, + "learning_rate": 4.613977920282093e-05, + "loss": 0.4664, + "step": 6877 + }, + { + "epoch": 0.8156053598956481, + "grad_norm": 1.671277770411789, + "learning_rate": 4.6138497718106785e-05, + "loss": 0.4233, + "step": 6878 + }, + { + "epoch": 0.8157239416577731, + "grad_norm": 1.3204691268025084, + "learning_rate": 4.613721603852162e-05, + "loss": 0.4971, + "step": 6879 + }, + { + "epoch": 0.815842523419898, + "grad_norm": 1.2626571910719147, + "learning_rate": 4.613593416407724e-05, + "loss": 0.3536, + "step": 6880 + }, + { + "epoch": 0.815961105182023, + "grad_norm": 1.3128331705669765, + "learning_rate": 4.6134652094785466e-05, + "loss": 0.3741, + "step": 6881 + }, + { + "epoch": 0.816079686944148, + "grad_norm": 1.363344905576394, + "learning_rate": 4.6133369830658116e-05, + "loss": 0.4291, + "step": 6882 + }, + { + "epoch": 0.816198268706273, + "grad_norm": 1.5375017234643447, + "learning_rate": 4.613208737170701e-05, + "loss": 0.3893, + "step": 6883 + }, + { + "epoch": 0.816316850468398, + "grad_norm": 1.1178229881091326, + "learning_rate": 4.6130804717943974e-05, + "loss": 0.3799, + "step": 6884 + }, + { + "epoch": 0.816435432230523, + "grad_norm": 1.4015221514688507, + "learning_rate": 4.612952186938084e-05, + "loss": 0.5045, + "step": 6885 + }, + { + "epoch": 0.8165540139926479, + "grad_norm": 1.1770781327348847, + "learning_rate": 4.612823882602941e-05, + "loss": 0.3156, + "step": 6886 + }, + { + "epoch": 0.8166725957547729, + "grad_norm": 2.9339463454413317, + "learning_rate": 4.612695558790154e-05, + "loss": 0.4756, + "step": 6887 + }, + { + "epoch": 0.8167911775168979, + "grad_norm": 1.583152740399592, + "learning_rate": 4.6125672155009036e-05, + "loss": 0.5406, + "step": 6888 + }, + { + "epoch": 0.8169097592790229, + "grad_norm": 1.1108214359170132, + "learning_rate": 4.612438852736375e-05, + "loss": 0.3156, + "step": 6889 + }, + { + "epoch": 0.8170283410411479, + "grad_norm": 1.6948750920137083, + "learning_rate": 4.61231047049775e-05, + "loss": 0.4081, + "step": 6890 + }, + { + "epoch": 0.8171469228032728, + "grad_norm": 1.0349214086108491, + "learning_rate": 4.6121820687862125e-05, + "loss": 0.2773, + "step": 6891 + }, + { + "epoch": 0.8172655045653978, + "grad_norm": 1.4674265718232409, + "learning_rate": 4.612053647602947e-05, + "loss": 0.4955, + "step": 6892 + }, + { + "epoch": 0.8173840863275228, + "grad_norm": 1.0359480098102285, + "learning_rate": 4.611925206949137e-05, + "loss": 0.2466, + "step": 6893 + }, + { + "epoch": 0.8175026680896478, + "grad_norm": 1.6240907284068187, + "learning_rate": 4.611796746825966e-05, + "loss": 0.7163, + "step": 6894 + }, + { + "epoch": 0.8176212498517728, + "grad_norm": 1.3752290232796494, + "learning_rate": 4.611668267234619e-05, + "loss": 0.3387, + "step": 6895 + }, + { + "epoch": 0.8177398316138977, + "grad_norm": 1.2286465627789163, + "learning_rate": 4.611539768176279e-05, + "loss": 0.3316, + "step": 6896 + }, + { + "epoch": 0.8178584133760227, + "grad_norm": 1.6973273589311157, + "learning_rate": 4.6114112496521334e-05, + "loss": 0.6149, + "step": 6897 + }, + { + "epoch": 0.8179769951381477, + "grad_norm": 1.4361424993983911, + "learning_rate": 4.611282711663364e-05, + "loss": 0.5676, + "step": 6898 + }, + { + "epoch": 0.8180955769002728, + "grad_norm": 1.0995131663956217, + "learning_rate": 4.611154154211158e-05, + "loss": 0.4743, + "step": 6899 + }, + { + "epoch": 0.8182141586623978, + "grad_norm": 1.499281265397698, + "learning_rate": 4.6110255772966985e-05, + "loss": 0.4602, + "step": 6900 + }, + { + "epoch": 0.8183327404245228, + "grad_norm": 1.8426544611975424, + "learning_rate": 4.610896980921173e-05, + "loss": 0.7691, + "step": 6901 + }, + { + "epoch": 0.8184513221866477, + "grad_norm": 1.384752758375079, + "learning_rate": 4.610768365085765e-05, + "loss": 0.4458, + "step": 6902 + }, + { + "epoch": 0.8185699039487727, + "grad_norm": 1.3961589614193446, + "learning_rate": 4.6106397297916614e-05, + "loss": 0.4618, + "step": 6903 + }, + { + "epoch": 0.8186884857108977, + "grad_norm": 1.1320302161193398, + "learning_rate": 4.6105110750400475e-05, + "loss": 0.2972, + "step": 6904 + }, + { + "epoch": 0.8188070674730227, + "grad_norm": 1.5923364126934356, + "learning_rate": 4.610382400832109e-05, + "loss": 0.409, + "step": 6905 + }, + { + "epoch": 0.8189256492351477, + "grad_norm": 1.0330369975817901, + "learning_rate": 4.6102537071690344e-05, + "loss": 0.3401, + "step": 6906 + }, + { + "epoch": 0.8190442309972726, + "grad_norm": 0.9943889280669523, + "learning_rate": 4.6101249940520074e-05, + "loss": 0.2859, + "step": 6907 + }, + { + "epoch": 0.8191628127593976, + "grad_norm": 1.5970227772598284, + "learning_rate": 4.6099962614822156e-05, + "loss": 0.6476, + "step": 6908 + }, + { + "epoch": 0.8192813945215226, + "grad_norm": 1.5420964472368286, + "learning_rate": 4.6098675094608455e-05, + "loss": 0.4387, + "step": 6909 + }, + { + "epoch": 0.8193999762836476, + "grad_norm": 1.4424727064761438, + "learning_rate": 4.609738737989084e-05, + "loss": 0.4405, + "step": 6910 + }, + { + "epoch": 0.8195185580457726, + "grad_norm": 1.4532229649857227, + "learning_rate": 4.609609947068119e-05, + "loss": 0.4847, + "step": 6911 + }, + { + "epoch": 0.8196371398078975, + "grad_norm": 1.2975224320987593, + "learning_rate": 4.609481136699138e-05, + "loss": 0.4096, + "step": 6912 + }, + { + "epoch": 0.8197557215700225, + "grad_norm": 1.4117700992675135, + "learning_rate": 4.609352306883326e-05, + "loss": 0.4583, + "step": 6913 + }, + { + "epoch": 0.8198743033321475, + "grad_norm": 1.2785442007117673, + "learning_rate": 4.609223457621874e-05, + "loss": 0.3698, + "step": 6914 + }, + { + "epoch": 0.8199928850942725, + "grad_norm": 1.253609643605438, + "learning_rate": 4.6090945889159676e-05, + "loss": 0.4613, + "step": 6915 + }, + { + "epoch": 0.8201114668563975, + "grad_norm": 1.0669771517665434, + "learning_rate": 4.6089657007667955e-05, + "loss": 0.2513, + "step": 6916 + }, + { + "epoch": 0.8202300486185224, + "grad_norm": 1.2736998279597227, + "learning_rate": 4.6088367931755446e-05, + "loss": 0.4175, + "step": 6917 + }, + { + "epoch": 0.8203486303806474, + "grad_norm": 1.5503459770239172, + "learning_rate": 4.608707866143406e-05, + "loss": 0.5814, + "step": 6918 + }, + { + "epoch": 0.8204672121427724, + "grad_norm": 1.6334806158264925, + "learning_rate": 4.6085789196715665e-05, + "loss": 0.4759, + "step": 6919 + }, + { + "epoch": 0.8205857939048974, + "grad_norm": 1.2758505874743802, + "learning_rate": 4.608449953761215e-05, + "loss": 0.3696, + "step": 6920 + }, + { + "epoch": 0.8207043756670224, + "grad_norm": 1.6456903507379623, + "learning_rate": 4.60832096841354e-05, + "loss": 0.7055, + "step": 6921 + }, + { + "epoch": 0.8208229574291473, + "grad_norm": 1.219467641823786, + "learning_rate": 4.608191963629732e-05, + "loss": 0.3305, + "step": 6922 + }, + { + "epoch": 0.8209415391912723, + "grad_norm": 1.441447590162349, + "learning_rate": 4.608062939410979e-05, + "loss": 0.5057, + "step": 6923 + }, + { + "epoch": 0.8210601209533974, + "grad_norm": 0.946588941949069, + "learning_rate": 4.6079338957584694e-05, + "loss": 0.298, + "step": 6924 + }, + { + "epoch": 0.8211787027155224, + "grad_norm": 1.8476888465786403, + "learning_rate": 4.607804832673396e-05, + "loss": 0.6497, + "step": 6925 + }, + { + "epoch": 0.8212972844776474, + "grad_norm": 1.1156148696486041, + "learning_rate": 4.607675750156946e-05, + "loss": 0.2807, + "step": 6926 + }, + { + "epoch": 0.8214158662397724, + "grad_norm": 1.4831571916055704, + "learning_rate": 4.60754664821031e-05, + "loss": 0.6265, + "step": 6927 + }, + { + "epoch": 0.8215344480018973, + "grad_norm": 1.2703220386945084, + "learning_rate": 4.6074175268346785e-05, + "loss": 0.473, + "step": 6928 + }, + { + "epoch": 0.8216530297640223, + "grad_norm": 1.503550051002392, + "learning_rate": 4.607288386031242e-05, + "loss": 0.5291, + "step": 6929 + }, + { + "epoch": 0.8217716115261473, + "grad_norm": 1.4119589172598495, + "learning_rate": 4.607159225801191e-05, + "loss": 0.4992, + "step": 6930 + }, + { + "epoch": 0.8218901932882723, + "grad_norm": 1.6574402216183943, + "learning_rate": 4.607030046145715e-05, + "loss": 0.4943, + "step": 6931 + }, + { + "epoch": 0.8220087750503973, + "grad_norm": 1.458623539129203, + "learning_rate": 4.606900847066006e-05, + "loss": 0.5854, + "step": 6932 + }, + { + "epoch": 0.8221273568125222, + "grad_norm": 1.2781132734012055, + "learning_rate": 4.606771628563256e-05, + "loss": 0.3845, + "step": 6933 + }, + { + "epoch": 0.8222459385746472, + "grad_norm": 1.1542569844297383, + "learning_rate": 4.606642390638654e-05, + "loss": 0.3649, + "step": 6934 + }, + { + "epoch": 0.8223645203367722, + "grad_norm": 1.3554223559110044, + "learning_rate": 4.606513133293392e-05, + "loss": 0.3884, + "step": 6935 + }, + { + "epoch": 0.8224831020988972, + "grad_norm": 1.1868936047195198, + "learning_rate": 4.606383856528663e-05, + "loss": 0.3245, + "step": 6936 + }, + { + "epoch": 0.8226016838610222, + "grad_norm": 1.2623820950366946, + "learning_rate": 4.606254560345657e-05, + "loss": 0.4638, + "step": 6937 + }, + { + "epoch": 0.8227202656231472, + "grad_norm": 1.3341412846569125, + "learning_rate": 4.6061252447455686e-05, + "loss": 0.3575, + "step": 6938 + }, + { + "epoch": 0.8228388473852721, + "grad_norm": 1.974442373446165, + "learning_rate": 4.6059959097295867e-05, + "loss": 0.6477, + "step": 6939 + }, + { + "epoch": 0.8229574291473971, + "grad_norm": 1.5354035250887335, + "learning_rate": 4.6058665552989055e-05, + "loss": 0.4042, + "step": 6940 + }, + { + "epoch": 0.8230760109095221, + "grad_norm": 1.5673451621791408, + "learning_rate": 4.6057371814547165e-05, + "loss": 0.5021, + "step": 6941 + }, + { + "epoch": 0.8231945926716471, + "grad_norm": 1.457820319200091, + "learning_rate": 4.6056077881982135e-05, + "loss": 0.3568, + "step": 6942 + }, + { + "epoch": 0.823313174433772, + "grad_norm": 1.6666815646071398, + "learning_rate": 4.6054783755305884e-05, + "loss": 0.5243, + "step": 6943 + }, + { + "epoch": 0.823431756195897, + "grad_norm": 1.136554379497956, + "learning_rate": 4.6053489434530354e-05, + "loss": 0.3402, + "step": 6944 + }, + { + "epoch": 0.823550337958022, + "grad_norm": 1.3902832161326595, + "learning_rate": 4.6052194919667455e-05, + "loss": 0.3765, + "step": 6945 + }, + { + "epoch": 0.823668919720147, + "grad_norm": 1.2423878576582859, + "learning_rate": 4.605090021072914e-05, + "loss": 0.3154, + "step": 6946 + }, + { + "epoch": 0.823787501482272, + "grad_norm": 1.5028227362354765, + "learning_rate": 4.604960530772734e-05, + "loss": 0.5243, + "step": 6947 + }, + { + "epoch": 0.823906083244397, + "grad_norm": 1.2211456071869247, + "learning_rate": 4.604831021067399e-05, + "loss": 0.3656, + "step": 6948 + }, + { + "epoch": 0.824024665006522, + "grad_norm": 1.8252466684665545, + "learning_rate": 4.604701491958104e-05, + "loss": 0.6281, + "step": 6949 + }, + { + "epoch": 0.824143246768647, + "grad_norm": 1.808838162721825, + "learning_rate": 4.604571943446042e-05, + "loss": 0.5226, + "step": 6950 + }, + { + "epoch": 0.824261828530772, + "grad_norm": 1.3533664398911698, + "learning_rate": 4.604442375532407e-05, + "loss": 0.3456, + "step": 6951 + }, + { + "epoch": 0.824380410292897, + "grad_norm": 1.231223686368119, + "learning_rate": 4.604312788218394e-05, + "loss": 0.4113, + "step": 6952 + }, + { + "epoch": 0.824498992055022, + "grad_norm": 1.1756746085916812, + "learning_rate": 4.604183181505198e-05, + "loss": 0.319, + "step": 6953 + }, + { + "epoch": 0.824617573817147, + "grad_norm": 1.423148296952245, + "learning_rate": 4.604053555394012e-05, + "loss": 0.4873, + "step": 6954 + }, + { + "epoch": 0.8247361555792719, + "grad_norm": 1.3273088697151094, + "learning_rate": 4.6039239098860344e-05, + "loss": 0.4259, + "step": 6955 + }, + { + "epoch": 0.8248547373413969, + "grad_norm": 1.5229004248873121, + "learning_rate": 4.603794244982457e-05, + "loss": 0.4482, + "step": 6956 + }, + { + "epoch": 0.8249733191035219, + "grad_norm": 1.4933619675322012, + "learning_rate": 4.603664560684476e-05, + "loss": 0.4042, + "step": 6957 + }, + { + "epoch": 0.8250919008656469, + "grad_norm": 1.750799471665653, + "learning_rate": 4.603534856993289e-05, + "loss": 0.5006, + "step": 6958 + }, + { + "epoch": 0.8252104826277719, + "grad_norm": 1.397020327172941, + "learning_rate": 4.603405133910089e-05, + "loss": 0.3103, + "step": 6959 + }, + { + "epoch": 0.8253290643898968, + "grad_norm": 1.795183617652027, + "learning_rate": 4.603275391436074e-05, + "loss": 0.5132, + "step": 6960 + }, + { + "epoch": 0.8254476461520218, + "grad_norm": 1.6485748786931103, + "learning_rate": 4.6031456295724383e-05, + "loss": 0.5868, + "step": 6961 + }, + { + "epoch": 0.8255662279141468, + "grad_norm": 1.5684953372904735, + "learning_rate": 4.603015848320379e-05, + "loss": 0.4464, + "step": 6962 + }, + { + "epoch": 0.8256848096762718, + "grad_norm": 1.5632327602450327, + "learning_rate": 4.602886047681093e-05, + "loss": 0.4294, + "step": 6963 + }, + { + "epoch": 0.8258033914383968, + "grad_norm": 1.5001463880236297, + "learning_rate": 4.602756227655776e-05, + "loss": 0.4626, + "step": 6964 + }, + { + "epoch": 0.8259219732005217, + "grad_norm": 1.4080889700794406, + "learning_rate": 4.6026263882456254e-05, + "loss": 0.4097, + "step": 6965 + }, + { + "epoch": 0.8260405549626467, + "grad_norm": 1.3794714294782777, + "learning_rate": 4.6024965294518375e-05, + "loss": 0.4318, + "step": 6966 + }, + { + "epoch": 0.8261591367247717, + "grad_norm": 1.0716514553930818, + "learning_rate": 4.60236665127561e-05, + "loss": 0.2836, + "step": 6967 + }, + { + "epoch": 0.8262777184868967, + "grad_norm": 1.5122739657269972, + "learning_rate": 4.60223675371814e-05, + "loss": 0.426, + "step": 6968 + }, + { + "epoch": 0.8263963002490217, + "grad_norm": 1.7292348727545745, + "learning_rate": 4.6021068367806254e-05, + "loss": 0.3883, + "step": 6969 + }, + { + "epoch": 0.8265148820111466, + "grad_norm": 1.301283454409529, + "learning_rate": 4.601976900464263e-05, + "loss": 0.3437, + "step": 6970 + }, + { + "epoch": 0.8266334637732716, + "grad_norm": 1.2990687082770953, + "learning_rate": 4.601846944770252e-05, + "loss": 0.4559, + "step": 6971 + }, + { + "epoch": 0.8267520455353966, + "grad_norm": 1.677805448024113, + "learning_rate": 4.6017169696997885e-05, + "loss": 0.5687, + "step": 6972 + }, + { + "epoch": 0.8268706272975216, + "grad_norm": 1.275535243061787, + "learning_rate": 4.6015869752540726e-05, + "loss": 0.3545, + "step": 6973 + }, + { + "epoch": 0.8269892090596467, + "grad_norm": 1.7512093991972848, + "learning_rate": 4.6014569614343015e-05, + "loss": 0.5906, + "step": 6974 + }, + { + "epoch": 0.8271077908217717, + "grad_norm": 1.4128695816313643, + "learning_rate": 4.601326928241675e-05, + "loss": 0.6029, + "step": 6975 + }, + { + "epoch": 0.8272263725838966, + "grad_norm": 1.2237777444155091, + "learning_rate": 4.601196875677391e-05, + "loss": 0.4132, + "step": 6976 + }, + { + "epoch": 0.8273449543460216, + "grad_norm": 1.2238879378782095, + "learning_rate": 4.601066803742647e-05, + "loss": 0.3819, + "step": 6977 + }, + { + "epoch": 0.8274635361081466, + "grad_norm": 1.192288417155017, + "learning_rate": 4.6009367124386446e-05, + "loss": 0.4162, + "step": 6978 + }, + { + "epoch": 0.8275821178702716, + "grad_norm": 1.3825399164737215, + "learning_rate": 4.600806601766582e-05, + "loss": 0.4884, + "step": 6979 + }, + { + "epoch": 0.8277006996323966, + "grad_norm": 1.2906187145592458, + "learning_rate": 4.600676471727658e-05, + "loss": 0.4675, + "step": 6980 + }, + { + "epoch": 0.8278192813945215, + "grad_norm": 1.2957363112187992, + "learning_rate": 4.6005463223230747e-05, + "loss": 0.3702, + "step": 6981 + }, + { + "epoch": 0.8279378631566465, + "grad_norm": 1.286769334666312, + "learning_rate": 4.600416153554029e-05, + "loss": 0.3803, + "step": 6982 + }, + { + "epoch": 0.8280564449187715, + "grad_norm": 1.201440980691219, + "learning_rate": 4.600285965421722e-05, + "loss": 0.4141, + "step": 6983 + }, + { + "epoch": 0.8281750266808965, + "grad_norm": 1.4629779993047347, + "learning_rate": 4.600155757927354e-05, + "loss": 0.5193, + "step": 6984 + }, + { + "epoch": 0.8282936084430215, + "grad_norm": 1.3676391736639164, + "learning_rate": 4.6000255310721254e-05, + "loss": 0.3998, + "step": 6985 + }, + { + "epoch": 0.8284121902051464, + "grad_norm": 1.4316776908638378, + "learning_rate": 4.5998952848572365e-05, + "loss": 0.4908, + "step": 6986 + }, + { + "epoch": 0.8285307719672714, + "grad_norm": 1.4839328772079194, + "learning_rate": 4.599765019283888e-05, + "loss": 0.6005, + "step": 6987 + }, + { + "epoch": 0.8286493537293964, + "grad_norm": 1.4377477131650604, + "learning_rate": 4.599634734353282e-05, + "loss": 0.4846, + "step": 6988 + }, + { + "epoch": 0.8287679354915214, + "grad_norm": 1.056073186504529, + "learning_rate": 4.599504430066617e-05, + "loss": 0.2784, + "step": 6989 + }, + { + "epoch": 0.8288865172536464, + "grad_norm": 1.3670670524725252, + "learning_rate": 4.599374106425097e-05, + "loss": 0.5444, + "step": 6990 + }, + { + "epoch": 0.8290050990157714, + "grad_norm": 1.0912284053833328, + "learning_rate": 4.599243763429921e-05, + "loss": 0.2909, + "step": 6991 + }, + { + "epoch": 0.8291236807778963, + "grad_norm": 1.0938204015012638, + "learning_rate": 4.599113401082293e-05, + "loss": 0.3233, + "step": 6992 + }, + { + "epoch": 0.8292422625400213, + "grad_norm": 1.6186364371919248, + "learning_rate": 4.5989830193834125e-05, + "loss": 0.5083, + "step": 6993 + }, + { + "epoch": 0.8293608443021463, + "grad_norm": 1.20463992571718, + "learning_rate": 4.5988526183344835e-05, + "loss": 0.2843, + "step": 6994 + }, + { + "epoch": 0.8294794260642713, + "grad_norm": 1.297824341384146, + "learning_rate": 4.598722197936706e-05, + "loss": 0.3951, + "step": 6995 + }, + { + "epoch": 0.8295980078263963, + "grad_norm": 1.3483414726032132, + "learning_rate": 4.5985917581912844e-05, + "loss": 0.3697, + "step": 6996 + }, + { + "epoch": 0.8297165895885212, + "grad_norm": 1.9845987365090307, + "learning_rate": 4.598461299099419e-05, + "loss": 0.5435, + "step": 6997 + }, + { + "epoch": 0.8298351713506462, + "grad_norm": 2.023208583641064, + "learning_rate": 4.5983308206623156e-05, + "loss": 0.6105, + "step": 6998 + }, + { + "epoch": 0.8299537531127713, + "grad_norm": 1.3047258227526979, + "learning_rate": 4.598200322881174e-05, + "loss": 0.3732, + "step": 6999 + }, + { + "epoch": 0.8300723348748963, + "grad_norm": 1.7415085323948678, + "learning_rate": 4.598069805757198e-05, + "loss": 0.4355, + "step": 7000 + }, + { + "epoch": 0.8301909166370213, + "grad_norm": 1.323108007798805, + "learning_rate": 4.5979392692915926e-05, + "loss": 0.3705, + "step": 7001 + }, + { + "epoch": 0.8303094983991463, + "grad_norm": 1.3177974363006077, + "learning_rate": 4.5978087134855584e-05, + "loss": 0.3832, + "step": 7002 + }, + { + "epoch": 0.8304280801612712, + "grad_norm": 1.4414650383152594, + "learning_rate": 4.597678138340301e-05, + "loss": 0.4101, + "step": 7003 + }, + { + "epoch": 0.8305466619233962, + "grad_norm": 1.2780971317848426, + "learning_rate": 4.597547543857024e-05, + "loss": 0.4068, + "step": 7004 + }, + { + "epoch": 0.8306652436855212, + "grad_norm": 1.474912335855074, + "learning_rate": 4.5974169300369294e-05, + "loss": 0.4016, + "step": 7005 + }, + { + "epoch": 0.8307838254476462, + "grad_norm": 1.4057706102959007, + "learning_rate": 4.5972862968812234e-05, + "loss": 0.4014, + "step": 7006 + }, + { + "epoch": 0.8309024072097712, + "grad_norm": 1.2985387662916321, + "learning_rate": 4.59715564439111e-05, + "loss": 0.426, + "step": 7007 + }, + { + "epoch": 0.8310209889718961, + "grad_norm": 1.217946199130656, + "learning_rate": 4.597024972567794e-05, + "loss": 0.3529, + "step": 7008 + }, + { + "epoch": 0.8311395707340211, + "grad_norm": 1.222818215572617, + "learning_rate": 4.596894281412478e-05, + "loss": 0.489, + "step": 7009 + }, + { + "epoch": 0.8312581524961461, + "grad_norm": 1.1529796810884687, + "learning_rate": 4.596763570926368e-05, + "loss": 0.4121, + "step": 7010 + }, + { + "epoch": 0.8313767342582711, + "grad_norm": 1.177618172693117, + "learning_rate": 4.596632841110669e-05, + "loss": 0.2922, + "step": 7011 + }, + { + "epoch": 0.8314953160203961, + "grad_norm": 1.9259664332612327, + "learning_rate": 4.596502091966587e-05, + "loss": 0.4881, + "step": 7012 + }, + { + "epoch": 0.831613897782521, + "grad_norm": 1.4278897329416238, + "learning_rate": 4.596371323495327e-05, + "loss": 0.3484, + "step": 7013 + }, + { + "epoch": 0.831732479544646, + "grad_norm": 1.4325292044115803, + "learning_rate": 4.596240535698094e-05, + "loss": 0.5325, + "step": 7014 + }, + { + "epoch": 0.831851061306771, + "grad_norm": 1.6014831359545445, + "learning_rate": 4.596109728576093e-05, + "loss": 0.4207, + "step": 7015 + }, + { + "epoch": 0.831969643068896, + "grad_norm": 1.3809892362724274, + "learning_rate": 4.595978902130531e-05, + "loss": 0.4005, + "step": 7016 + }, + { + "epoch": 0.832088224831021, + "grad_norm": 1.3629826009209878, + "learning_rate": 4.595848056362614e-05, + "loss": 0.4076, + "step": 7017 + }, + { + "epoch": 0.8322068065931459, + "grad_norm": 1.2771677390653018, + "learning_rate": 4.595717191273548e-05, + "loss": 0.3686, + "step": 7018 + }, + { + "epoch": 0.8323253883552709, + "grad_norm": 1.4861027517996053, + "learning_rate": 4.59558630686454e-05, + "loss": 0.4515, + "step": 7019 + }, + { + "epoch": 0.8324439701173959, + "grad_norm": 1.4179092921865928, + "learning_rate": 4.5954554031367944e-05, + "loss": 0.3706, + "step": 7020 + }, + { + "epoch": 0.8325625518795209, + "grad_norm": 1.7431695250480128, + "learning_rate": 4.595324480091521e-05, + "loss": 0.5178, + "step": 7021 + }, + { + "epoch": 0.8326811336416459, + "grad_norm": 1.4752113720152238, + "learning_rate": 4.595193537729925e-05, + "loss": 0.4909, + "step": 7022 + }, + { + "epoch": 0.8327997154037708, + "grad_norm": 1.303208485134168, + "learning_rate": 4.595062576053214e-05, + "loss": 0.5388, + "step": 7023 + }, + { + "epoch": 0.8329182971658959, + "grad_norm": 1.3472554104888557, + "learning_rate": 4.594931595062595e-05, + "loss": 0.3815, + "step": 7024 + }, + { + "epoch": 0.8330368789280209, + "grad_norm": 1.5961029701650007, + "learning_rate": 4.5948005947592755e-05, + "loss": 0.5441, + "step": 7025 + }, + { + "epoch": 0.8331554606901459, + "grad_norm": 1.2503076915700297, + "learning_rate": 4.5946695751444633e-05, + "loss": 0.4817, + "step": 7026 + }, + { + "epoch": 0.8332740424522709, + "grad_norm": 1.2956803279040314, + "learning_rate": 4.5945385362193675e-05, + "loss": 0.343, + "step": 7027 + }, + { + "epoch": 0.8333926242143959, + "grad_norm": 1.407970940727167, + "learning_rate": 4.5944074779851933e-05, + "loss": 0.4484, + "step": 7028 + }, + { + "epoch": 0.8335112059765208, + "grad_norm": 1.2176798813934289, + "learning_rate": 4.594276400443151e-05, + "loss": 0.2747, + "step": 7029 + }, + { + "epoch": 0.8336297877386458, + "grad_norm": 1.3433885806433268, + "learning_rate": 4.5941453035944485e-05, + "loss": 0.5087, + "step": 7030 + }, + { + "epoch": 0.8337483695007708, + "grad_norm": 1.3279504014628545, + "learning_rate": 4.594014187440294e-05, + "loss": 0.4877, + "step": 7031 + }, + { + "epoch": 0.8338669512628958, + "grad_norm": 1.1905338524480276, + "learning_rate": 4.593883051981897e-05, + "loss": 0.4173, + "step": 7032 + }, + { + "epoch": 0.8339855330250208, + "grad_norm": 1.0905512495411593, + "learning_rate": 4.5937518972204665e-05, + "loss": 0.3036, + "step": 7033 + }, + { + "epoch": 0.8341041147871457, + "grad_norm": 1.5315953398477353, + "learning_rate": 4.59362072315721e-05, + "loss": 0.552, + "step": 7034 + }, + { + "epoch": 0.8342226965492707, + "grad_norm": 1.1345950037972643, + "learning_rate": 4.593489529793338e-05, + "loss": 0.3352, + "step": 7035 + }, + { + "epoch": 0.8343412783113957, + "grad_norm": 1.3299967567027684, + "learning_rate": 4.59335831713006e-05, + "loss": 0.3623, + "step": 7036 + }, + { + "epoch": 0.8344598600735207, + "grad_norm": 1.385481521275535, + "learning_rate": 4.5932270851685854e-05, + "loss": 0.3638, + "step": 7037 + }, + { + "epoch": 0.8345784418356457, + "grad_norm": 1.152502041801949, + "learning_rate": 4.593095833910124e-05, + "loss": 0.4568, + "step": 7038 + }, + { + "epoch": 0.8346970235977706, + "grad_norm": 1.1840830489668557, + "learning_rate": 4.592964563355886e-05, + "loss": 0.362, + "step": 7039 + }, + { + "epoch": 0.8348156053598956, + "grad_norm": 1.1721577865440138, + "learning_rate": 4.5928332735070804e-05, + "loss": 0.3801, + "step": 7040 + }, + { + "epoch": 0.8349341871220206, + "grad_norm": 1.6825170299424048, + "learning_rate": 4.5927019643649184e-05, + "loss": 0.4554, + "step": 7041 + }, + { + "epoch": 0.8350527688841456, + "grad_norm": 1.277039033052309, + "learning_rate": 4.592570635930611e-05, + "loss": 0.4206, + "step": 7042 + }, + { + "epoch": 0.8351713506462706, + "grad_norm": 1.5029019206662342, + "learning_rate": 4.592439288205368e-05, + "loss": 0.4647, + "step": 7043 + }, + { + "epoch": 0.8352899324083956, + "grad_norm": 1.1695388257990311, + "learning_rate": 4.592307921190402e-05, + "loss": 0.4493, + "step": 7044 + }, + { + "epoch": 0.8354085141705205, + "grad_norm": 1.121537457174954, + "learning_rate": 4.592176534886922e-05, + "loss": 0.3126, + "step": 7045 + }, + { + "epoch": 0.8355270959326455, + "grad_norm": 1.3393426096482335, + "learning_rate": 4.592045129296139e-05, + "loss": 0.2942, + "step": 7046 + }, + { + "epoch": 0.8356456776947705, + "grad_norm": 1.651560973230691, + "learning_rate": 4.591913704419266e-05, + "loss": 0.5033, + "step": 7047 + }, + { + "epoch": 0.8357642594568955, + "grad_norm": 1.6291888578208766, + "learning_rate": 4.591782260257514e-05, + "loss": 0.5085, + "step": 7048 + }, + { + "epoch": 0.8358828412190206, + "grad_norm": 1.5552880942083616, + "learning_rate": 4.5916507968120955e-05, + "loss": 0.5093, + "step": 7049 + }, + { + "epoch": 0.8360014229811455, + "grad_norm": 1.3934271721112037, + "learning_rate": 4.5915193140842204e-05, + "loss": 0.3936, + "step": 7050 + }, + { + "epoch": 0.8361200047432705, + "grad_norm": 1.4141378237785043, + "learning_rate": 4.5913878120751026e-05, + "loss": 0.4652, + "step": 7051 + }, + { + "epoch": 0.8362385865053955, + "grad_norm": 1.465209834025231, + "learning_rate": 4.5912562907859536e-05, + "loss": 0.4979, + "step": 7052 + }, + { + "epoch": 0.8363571682675205, + "grad_norm": 1.997946803572848, + "learning_rate": 4.591124750217986e-05, + "loss": 0.4689, + "step": 7053 + }, + { + "epoch": 0.8364757500296455, + "grad_norm": 1.3791090166662403, + "learning_rate": 4.590993190372412e-05, + "loss": 0.4335, + "step": 7054 + }, + { + "epoch": 0.8365943317917705, + "grad_norm": 1.3987936562100025, + "learning_rate": 4.590861611250447e-05, + "loss": 0.4838, + "step": 7055 + }, + { + "epoch": 0.8367129135538954, + "grad_norm": 1.4040617553804757, + "learning_rate": 4.5907300128532994e-05, + "loss": 0.4218, + "step": 7056 + }, + { + "epoch": 0.8368314953160204, + "grad_norm": 1.3195334841099744, + "learning_rate": 4.5905983951821864e-05, + "loss": 0.4051, + "step": 7057 + }, + { + "epoch": 0.8369500770781454, + "grad_norm": 1.3992787459947145, + "learning_rate": 4.590466758238319e-05, + "loss": 0.5247, + "step": 7058 + }, + { + "epoch": 0.8370686588402704, + "grad_norm": 1.3510459284521714, + "learning_rate": 4.590335102022912e-05, + "loss": 0.4949, + "step": 7059 + }, + { + "epoch": 0.8371872406023954, + "grad_norm": 1.34493887941941, + "learning_rate": 4.590203426537179e-05, + "loss": 0.4047, + "step": 7060 + }, + { + "epoch": 0.8373058223645203, + "grad_norm": 1.3277005269775548, + "learning_rate": 4.5900717317823336e-05, + "loss": 0.4176, + "step": 7061 + }, + { + "epoch": 0.8374244041266453, + "grad_norm": 1.4865371761835748, + "learning_rate": 4.589940017759589e-05, + "loss": 0.5581, + "step": 7062 + }, + { + "epoch": 0.8375429858887703, + "grad_norm": 1.3166918451989758, + "learning_rate": 4.5898082844701605e-05, + "loss": 0.4412, + "step": 7063 + }, + { + "epoch": 0.8376615676508953, + "grad_norm": 1.3356943839391102, + "learning_rate": 4.589676531915264e-05, + "loss": 0.4443, + "step": 7064 + }, + { + "epoch": 0.8377801494130203, + "grad_norm": 1.7792351803217843, + "learning_rate": 4.5895447600961104e-05, + "loss": 0.6642, + "step": 7065 + }, + { + "epoch": 0.8378987311751452, + "grad_norm": 1.3061607388991763, + "learning_rate": 4.589412969013917e-05, + "loss": 0.4908, + "step": 7066 + }, + { + "epoch": 0.8380173129372702, + "grad_norm": 1.7249280742012534, + "learning_rate": 4.589281158669898e-05, + "loss": 0.5537, + "step": 7067 + }, + { + "epoch": 0.8381358946993952, + "grad_norm": 1.4440626827836363, + "learning_rate": 4.58914932906527e-05, + "loss": 0.4878, + "step": 7068 + }, + { + "epoch": 0.8382544764615202, + "grad_norm": 1.2131991762409167, + "learning_rate": 4.5890174802012454e-05, + "loss": 0.416, + "step": 7069 + }, + { + "epoch": 0.8383730582236452, + "grad_norm": 1.598470333856422, + "learning_rate": 4.588885612079042e-05, + "loss": 0.4335, + "step": 7070 + }, + { + "epoch": 0.8384916399857701, + "grad_norm": 1.1999180085997176, + "learning_rate": 4.588753724699875e-05, + "loss": 0.4231, + "step": 7071 + }, + { + "epoch": 0.8386102217478951, + "grad_norm": 1.3047713823503646, + "learning_rate": 4.588621818064961e-05, + "loss": 0.3889, + "step": 7072 + }, + { + "epoch": 0.8387288035100201, + "grad_norm": 1.2617567814350175, + "learning_rate": 4.588489892175514e-05, + "loss": 0.3253, + "step": 7073 + }, + { + "epoch": 0.8388473852721452, + "grad_norm": 1.2286731625684353, + "learning_rate": 4.588357947032751e-05, + "loss": 0.335, + "step": 7074 + }, + { + "epoch": 0.8389659670342702, + "grad_norm": 1.5926813583968211, + "learning_rate": 4.588225982637889e-05, + "loss": 0.3221, + "step": 7075 + }, + { + "epoch": 0.8390845487963952, + "grad_norm": 1.1186845032068036, + "learning_rate": 4.588093998992144e-05, + "loss": 0.2998, + "step": 7076 + }, + { + "epoch": 0.8392031305585201, + "grad_norm": 2.095914160803281, + "learning_rate": 4.5879619960967334e-05, + "loss": 0.5813, + "step": 7077 + }, + { + "epoch": 0.8393217123206451, + "grad_norm": 0.9974205208866932, + "learning_rate": 4.5878299739528726e-05, + "loss": 0.3211, + "step": 7078 + }, + { + "epoch": 0.8394402940827701, + "grad_norm": 1.5250642261056486, + "learning_rate": 4.5876979325617805e-05, + "loss": 0.4861, + "step": 7079 + }, + { + "epoch": 0.8395588758448951, + "grad_norm": 2.074707334032716, + "learning_rate": 4.587565871924674e-05, + "loss": 0.6672, + "step": 7080 + }, + { + "epoch": 0.8396774576070201, + "grad_norm": 1.3666359990800323, + "learning_rate": 4.58743379204277e-05, + "loss": 0.3735, + "step": 7081 + }, + { + "epoch": 0.839796039369145, + "grad_norm": 1.6357284388119466, + "learning_rate": 4.587301692917285e-05, + "loss": 0.4192, + "step": 7082 + }, + { + "epoch": 0.83991462113127, + "grad_norm": 1.04344638958246, + "learning_rate": 4.5871695745494395e-05, + "loss": 0.4071, + "step": 7083 + }, + { + "epoch": 0.840033202893395, + "grad_norm": 1.2036471971444174, + "learning_rate": 4.587037436940449e-05, + "loss": 0.304, + "step": 7084 + }, + { + "epoch": 0.84015178465552, + "grad_norm": 1.813439379748862, + "learning_rate": 4.586905280091533e-05, + "loss": 0.4434, + "step": 7085 + }, + { + "epoch": 0.840270366417645, + "grad_norm": 1.1928467386921653, + "learning_rate": 4.58677310400391e-05, + "loss": 0.3803, + "step": 7086 + }, + { + "epoch": 0.84038894817977, + "grad_norm": 1.0191750949701213, + "learning_rate": 4.586640908678797e-05, + "loss": 0.3215, + "step": 7087 + }, + { + "epoch": 0.8405075299418949, + "grad_norm": 1.2531707690622746, + "learning_rate": 4.586508694117414e-05, + "loss": 0.3416, + "step": 7088 + }, + { + "epoch": 0.8406261117040199, + "grad_norm": 1.1534343618513794, + "learning_rate": 4.586376460320979e-05, + "loss": 0.3042, + "step": 7089 + }, + { + "epoch": 0.8407446934661449, + "grad_norm": 1.7457155837884886, + "learning_rate": 4.586244207290712e-05, + "loss": 0.4733, + "step": 7090 + }, + { + "epoch": 0.8408632752282699, + "grad_norm": 1.4711053836806198, + "learning_rate": 4.586111935027832e-05, + "loss": 0.471, + "step": 7091 + }, + { + "epoch": 0.8409818569903948, + "grad_norm": 0.9898910560389165, + "learning_rate": 4.585979643533558e-05, + "loss": 0.2836, + "step": 7092 + }, + { + "epoch": 0.8411004387525198, + "grad_norm": 1.5649234658071023, + "learning_rate": 4.58584733280911e-05, + "loss": 0.4703, + "step": 7093 + }, + { + "epoch": 0.8412190205146448, + "grad_norm": 1.2220656056139696, + "learning_rate": 4.585715002855706e-05, + "loss": 0.4042, + "step": 7094 + }, + { + "epoch": 0.8413376022767698, + "grad_norm": 1.4315612745030768, + "learning_rate": 4.585582653674568e-05, + "loss": 0.4769, + "step": 7095 + }, + { + "epoch": 0.8414561840388948, + "grad_norm": 1.4934335763558044, + "learning_rate": 4.5854502852669164e-05, + "loss": 0.5221, + "step": 7096 + }, + { + "epoch": 0.8415747658010198, + "grad_norm": 1.626095139825517, + "learning_rate": 4.585317897633969e-05, + "loss": 0.5574, + "step": 7097 + }, + { + "epoch": 0.8416933475631447, + "grad_norm": 1.74507221386313, + "learning_rate": 4.5851854907769494e-05, + "loss": 0.5843, + "step": 7098 + }, + { + "epoch": 0.8418119293252698, + "grad_norm": 1.3088064323612574, + "learning_rate": 4.585053064697076e-05, + "loss": 0.3782, + "step": 7099 + }, + { + "epoch": 0.8419305110873948, + "grad_norm": 1.129915061734744, + "learning_rate": 4.5849206193955696e-05, + "loss": 0.421, + "step": 7100 + }, + { + "epoch": 0.8420490928495198, + "grad_norm": 1.1488437001107814, + "learning_rate": 4.5847881548736525e-05, + "loss": 0.3437, + "step": 7101 + }, + { + "epoch": 0.8421676746116448, + "grad_norm": 1.4538757355004113, + "learning_rate": 4.584655671132545e-05, + "loss": 0.6788, + "step": 7102 + }, + { + "epoch": 0.8422862563737697, + "grad_norm": 1.051608260963614, + "learning_rate": 4.584523168173468e-05, + "loss": 0.3167, + "step": 7103 + }, + { + "epoch": 0.8424048381358947, + "grad_norm": 1.0065907038271593, + "learning_rate": 4.5843906459976444e-05, + "loss": 0.2773, + "step": 7104 + }, + { + "epoch": 0.8425234198980197, + "grad_norm": 1.2168388787979423, + "learning_rate": 4.584258104606295e-05, + "loss": 0.3864, + "step": 7105 + }, + { + "epoch": 0.8426420016601447, + "grad_norm": 1.511921838918595, + "learning_rate": 4.584125544000642e-05, + "loss": 0.4618, + "step": 7106 + }, + { + "epoch": 0.8427605834222697, + "grad_norm": 1.1175289116855611, + "learning_rate": 4.583992964181906e-05, + "loss": 0.3626, + "step": 7107 + }, + { + "epoch": 0.8428791651843947, + "grad_norm": 1.9887317852487079, + "learning_rate": 4.583860365151311e-05, + "loss": 0.7368, + "step": 7108 + }, + { + "epoch": 0.8429977469465196, + "grad_norm": 1.3783653745217441, + "learning_rate": 4.583727746910079e-05, + "loss": 0.4926, + "step": 7109 + }, + { + "epoch": 0.8431163287086446, + "grad_norm": 1.5291942744241511, + "learning_rate": 4.583595109459432e-05, + "loss": 0.4481, + "step": 7110 + }, + { + "epoch": 0.8432349104707696, + "grad_norm": 1.4389913859236114, + "learning_rate": 4.5834624528005945e-05, + "loss": 0.4191, + "step": 7111 + }, + { + "epoch": 0.8433534922328946, + "grad_norm": 1.310822690491155, + "learning_rate": 4.583329776934787e-05, + "loss": 0.3674, + "step": 7112 + }, + { + "epoch": 0.8434720739950196, + "grad_norm": 1.424451152306023, + "learning_rate": 4.583197081863233e-05, + "loss": 0.513, + "step": 7113 + }, + { + "epoch": 0.8435906557571445, + "grad_norm": 1.6624625637064836, + "learning_rate": 4.5830643675871575e-05, + "loss": 0.5131, + "step": 7114 + }, + { + "epoch": 0.8437092375192695, + "grad_norm": 1.196579809051718, + "learning_rate": 4.5829316341077836e-05, + "loss": 0.3587, + "step": 7115 + }, + { + "epoch": 0.8438278192813945, + "grad_norm": 1.4197360568337678, + "learning_rate": 4.5827988814263326e-05, + "loss": 0.4254, + "step": 7116 + }, + { + "epoch": 0.8439464010435195, + "grad_norm": 1.152824042999966, + "learning_rate": 4.582666109544032e-05, + "loss": 0.3434, + "step": 7117 + }, + { + "epoch": 0.8440649828056445, + "grad_norm": 1.3464443804008592, + "learning_rate": 4.582533318462102e-05, + "loss": 0.4305, + "step": 7118 + }, + { + "epoch": 0.8441835645677694, + "grad_norm": 1.3909288006190614, + "learning_rate": 4.582400508181769e-05, + "loss": 0.4006, + "step": 7119 + }, + { + "epoch": 0.8443021463298944, + "grad_norm": 1.218464298333134, + "learning_rate": 4.5822676787042576e-05, + "loss": 0.3378, + "step": 7120 + }, + { + "epoch": 0.8444207280920194, + "grad_norm": 1.6334404048672002, + "learning_rate": 4.582134830030791e-05, + "loss": 0.5409, + "step": 7121 + }, + { + "epoch": 0.8445393098541444, + "grad_norm": 1.4340212694181875, + "learning_rate": 4.582001962162595e-05, + "loss": 0.3957, + "step": 7122 + }, + { + "epoch": 0.8446578916162694, + "grad_norm": 1.2939794677996002, + "learning_rate": 4.5818690751008944e-05, + "loss": 0.2975, + "step": 7123 + }, + { + "epoch": 0.8447764733783945, + "grad_norm": 1.6975760181967092, + "learning_rate": 4.581736168846913e-05, + "loss": 0.6343, + "step": 7124 + }, + { + "epoch": 0.8448950551405194, + "grad_norm": 1.2706705289630686, + "learning_rate": 4.581603243401877e-05, + "loss": 0.3457, + "step": 7125 + }, + { + "epoch": 0.8450136369026444, + "grad_norm": 1.4775136783990745, + "learning_rate": 4.5814702987670116e-05, + "loss": 0.4266, + "step": 7126 + }, + { + "epoch": 0.8451322186647694, + "grad_norm": 1.2670524345010614, + "learning_rate": 4.581337334943543e-05, + "loss": 0.358, + "step": 7127 + }, + { + "epoch": 0.8452508004268944, + "grad_norm": 1.3161225225340951, + "learning_rate": 4.5812043519326964e-05, + "loss": 0.4068, + "step": 7128 + }, + { + "epoch": 0.8453693821890194, + "grad_norm": 1.3684417052894882, + "learning_rate": 4.581071349735698e-05, + "loss": 0.4186, + "step": 7129 + }, + { + "epoch": 0.8454879639511443, + "grad_norm": 1.1282050336378227, + "learning_rate": 4.5809383283537736e-05, + "loss": 0.3188, + "step": 7130 + }, + { + "epoch": 0.8456065457132693, + "grad_norm": 1.4499393382749237, + "learning_rate": 4.580805287788149e-05, + "loss": 0.3888, + "step": 7131 + }, + { + "epoch": 0.8457251274753943, + "grad_norm": 1.181365451756441, + "learning_rate": 4.580672228040052e-05, + "loss": 0.3403, + "step": 7132 + }, + { + "epoch": 0.8458437092375193, + "grad_norm": 1.2331775536551288, + "learning_rate": 4.580539149110708e-05, + "loss": 0.404, + "step": 7133 + }, + { + "epoch": 0.8459622909996443, + "grad_norm": 1.0477784082142796, + "learning_rate": 4.5804060510013453e-05, + "loss": 0.3819, + "step": 7134 + }, + { + "epoch": 0.8460808727617692, + "grad_norm": 1.0686391539434563, + "learning_rate": 4.580272933713189e-05, + "loss": 0.3071, + "step": 7135 + }, + { + "epoch": 0.8461994545238942, + "grad_norm": 0.8335998591155342, + "learning_rate": 4.580139797247468e-05, + "loss": 0.2631, + "step": 7136 + }, + { + "epoch": 0.8463180362860192, + "grad_norm": 1.5728601191348452, + "learning_rate": 4.580006641605409e-05, + "loss": 0.5338, + "step": 7137 + }, + { + "epoch": 0.8464366180481442, + "grad_norm": 1.7777007394438038, + "learning_rate": 4.579873466788238e-05, + "loss": 0.5172, + "step": 7138 + }, + { + "epoch": 0.8465551998102692, + "grad_norm": 1.3866859914169023, + "learning_rate": 4.5797402727971863e-05, + "loss": 0.483, + "step": 7139 + }, + { + "epoch": 0.8466737815723941, + "grad_norm": 1.2534540039187196, + "learning_rate": 4.579607059633478e-05, + "loss": 0.406, + "step": 7140 + }, + { + "epoch": 0.8467923633345191, + "grad_norm": 1.7880250666648718, + "learning_rate": 4.5794738272983426e-05, + "loss": 0.3913, + "step": 7141 + }, + { + "epoch": 0.8469109450966441, + "grad_norm": 1.4751278428566217, + "learning_rate": 4.57934057579301e-05, + "loss": 0.4833, + "step": 7142 + }, + { + "epoch": 0.8470295268587691, + "grad_norm": 1.0430644173990251, + "learning_rate": 4.579207305118706e-05, + "loss": 0.2729, + "step": 7143 + }, + { + "epoch": 0.8471481086208941, + "grad_norm": 1.1248108711877975, + "learning_rate": 4.579074015276661e-05, + "loss": 0.2185, + "step": 7144 + }, + { + "epoch": 0.847266690383019, + "grad_norm": 1.1208936854298543, + "learning_rate": 4.578940706268102e-05, + "loss": 0.3103, + "step": 7145 + }, + { + "epoch": 0.847385272145144, + "grad_norm": 1.1388180427140813, + "learning_rate": 4.57880737809426e-05, + "loss": 0.2301, + "step": 7146 + }, + { + "epoch": 0.847503853907269, + "grad_norm": 1.5256019270633212, + "learning_rate": 4.5786740307563636e-05, + "loss": 0.3796, + "step": 7147 + }, + { + "epoch": 0.847622435669394, + "grad_norm": 1.0476899328724834, + "learning_rate": 4.57854066425564e-05, + "loss": 0.2943, + "step": 7148 + }, + { + "epoch": 0.8477410174315191, + "grad_norm": 1.300570746925377, + "learning_rate": 4.5784072785933226e-05, + "loss": 0.3186, + "step": 7149 + }, + { + "epoch": 0.8478595991936441, + "grad_norm": 1.5456489869381351, + "learning_rate": 4.578273873770638e-05, + "loss": 0.4647, + "step": 7150 + }, + { + "epoch": 0.847978180955769, + "grad_norm": 1.429610395998424, + "learning_rate": 4.578140449788816e-05, + "loss": 0.4229, + "step": 7151 + }, + { + "epoch": 0.848096762717894, + "grad_norm": 1.6774194032141285, + "learning_rate": 4.578007006649088e-05, + "loss": 0.4414, + "step": 7152 + }, + { + "epoch": 0.848215344480019, + "grad_norm": 1.3601667902058143, + "learning_rate": 4.577873544352683e-05, + "loss": 0.3169, + "step": 7153 + }, + { + "epoch": 0.848333926242144, + "grad_norm": 1.6154494276466942, + "learning_rate": 4.577740062900833e-05, + "loss": 0.4521, + "step": 7154 + }, + { + "epoch": 0.848452508004269, + "grad_norm": 1.2351056370607754, + "learning_rate": 4.5776065622947665e-05, + "loss": 0.2803, + "step": 7155 + }, + { + "epoch": 0.848571089766394, + "grad_norm": 1.9120271250490748, + "learning_rate": 4.577473042535716e-05, + "loss": 0.5532, + "step": 7156 + }, + { + "epoch": 0.8486896715285189, + "grad_norm": 1.6792942219496667, + "learning_rate": 4.577339503624911e-05, + "loss": 0.4541, + "step": 7157 + }, + { + "epoch": 0.8488082532906439, + "grad_norm": 1.2644364383548468, + "learning_rate": 4.5772059455635844e-05, + "loss": 0.415, + "step": 7158 + }, + { + "epoch": 0.8489268350527689, + "grad_norm": 1.3922781116616507, + "learning_rate": 4.577072368352965e-05, + "loss": 0.3586, + "step": 7159 + }, + { + "epoch": 0.8490454168148939, + "grad_norm": 0.935445422358795, + "learning_rate": 4.576938771994286e-05, + "loss": 0.26, + "step": 7160 + }, + { + "epoch": 0.8491639985770189, + "grad_norm": 1.5077433486512617, + "learning_rate": 4.576805156488778e-05, + "loss": 0.4953, + "step": 7161 + }, + { + "epoch": 0.8492825803391438, + "grad_norm": 1.3186123811017356, + "learning_rate": 4.576671521837673e-05, + "loss": 0.3984, + "step": 7162 + }, + { + "epoch": 0.8494011621012688, + "grad_norm": 1.330872120112983, + "learning_rate": 4.576537868042204e-05, + "loss": 0.3489, + "step": 7163 + }, + { + "epoch": 0.8495197438633938, + "grad_norm": 1.0891832917653554, + "learning_rate": 4.576404195103602e-05, + "loss": 0.3281, + "step": 7164 + }, + { + "epoch": 0.8496383256255188, + "grad_norm": 1.515086410381966, + "learning_rate": 4.576270503023099e-05, + "loss": 0.4242, + "step": 7165 + }, + { + "epoch": 0.8497569073876438, + "grad_norm": 1.7235485331824882, + "learning_rate": 4.576136791801929e-05, + "loss": 0.5893, + "step": 7166 + }, + { + "epoch": 0.8498754891497687, + "grad_norm": 1.3256536449194125, + "learning_rate": 4.576003061441323e-05, + "loss": 0.3107, + "step": 7167 + }, + { + "epoch": 0.8499940709118937, + "grad_norm": 1.3174273370121818, + "learning_rate": 4.575869311942515e-05, + "loss": 0.4094, + "step": 7168 + }, + { + "epoch": 0.8501126526740187, + "grad_norm": 1.2243347140538134, + "learning_rate": 4.5757355433067373e-05, + "loss": 0.3756, + "step": 7169 + }, + { + "epoch": 0.8502312344361437, + "grad_norm": 1.3082500947105287, + "learning_rate": 4.5756017555352226e-05, + "loss": 0.4354, + "step": 7170 + }, + { + "epoch": 0.8503498161982687, + "grad_norm": 1.0801769316093435, + "learning_rate": 4.5754679486292064e-05, + "loss": 0.2775, + "step": 7171 + }, + { + "epoch": 0.8504683979603936, + "grad_norm": 1.0242673999179286, + "learning_rate": 4.57533412258992e-05, + "loss": 0.294, + "step": 7172 + }, + { + "epoch": 0.8505869797225186, + "grad_norm": 1.1738601544046217, + "learning_rate": 4.575200277418598e-05, + "loss": 0.3292, + "step": 7173 + }, + { + "epoch": 0.8507055614846437, + "grad_norm": 1.4139951983912715, + "learning_rate": 4.575066413116474e-05, + "loss": 0.4115, + "step": 7174 + }, + { + "epoch": 0.8508241432467687, + "grad_norm": 1.2164110478035621, + "learning_rate": 4.574932529684783e-05, + "loss": 0.3473, + "step": 7175 + }, + { + "epoch": 0.8509427250088937, + "grad_norm": 1.2618028074409744, + "learning_rate": 4.574798627124758e-05, + "loss": 0.3162, + "step": 7176 + }, + { + "epoch": 0.8510613067710187, + "grad_norm": 1.428061747159574, + "learning_rate": 4.574664705437634e-05, + "loss": 0.3791, + "step": 7177 + }, + { + "epoch": 0.8511798885331436, + "grad_norm": 1.3850605702384007, + "learning_rate": 4.574530764624645e-05, + "loss": 0.4022, + "step": 7178 + }, + { + "epoch": 0.8512984702952686, + "grad_norm": 1.4093269129504427, + "learning_rate": 4.574396804687028e-05, + "loss": 0.4825, + "step": 7179 + }, + { + "epoch": 0.8514170520573936, + "grad_norm": 1.5341413164778281, + "learning_rate": 4.574262825626014e-05, + "loss": 0.3963, + "step": 7180 + }, + { + "epoch": 0.8515356338195186, + "grad_norm": 1.3567623961945527, + "learning_rate": 4.574128827442842e-05, + "loss": 0.4112, + "step": 7181 + }, + { + "epoch": 0.8516542155816436, + "grad_norm": 2.446230034976584, + "learning_rate": 4.5739948101387455e-05, + "loss": 0.7262, + "step": 7182 + }, + { + "epoch": 0.8517727973437685, + "grad_norm": 1.2086296192747115, + "learning_rate": 4.5738607737149594e-05, + "loss": 0.342, + "step": 7183 + }, + { + "epoch": 0.8518913791058935, + "grad_norm": 1.321827540265069, + "learning_rate": 4.573726718172721e-05, + "loss": 0.3345, + "step": 7184 + }, + { + "epoch": 0.8520099608680185, + "grad_norm": 1.2018009716091063, + "learning_rate": 4.5735926435132644e-05, + "loss": 0.3284, + "step": 7185 + }, + { + "epoch": 0.8521285426301435, + "grad_norm": 0.8700022424153238, + "learning_rate": 4.573458549737827e-05, + "loss": 0.2763, + "step": 7186 + }, + { + "epoch": 0.8522471243922685, + "grad_norm": 1.3451856798328825, + "learning_rate": 4.573324436847644e-05, + "loss": 0.4308, + "step": 7187 + }, + { + "epoch": 0.8523657061543934, + "grad_norm": 0.9938305128542443, + "learning_rate": 4.573190304843953e-05, + "loss": 0.2891, + "step": 7188 + }, + { + "epoch": 0.8524842879165184, + "grad_norm": 1.450430468433095, + "learning_rate": 4.573056153727989e-05, + "loss": 0.4329, + "step": 7189 + }, + { + "epoch": 0.8526028696786434, + "grad_norm": 1.151552524427243, + "learning_rate": 4.572921983500989e-05, + "loss": 0.3145, + "step": 7190 + }, + { + "epoch": 0.8527214514407684, + "grad_norm": 1.07585458432924, + "learning_rate": 4.57278779416419e-05, + "loss": 0.3334, + "step": 7191 + }, + { + "epoch": 0.8528400332028934, + "grad_norm": 1.9184042944349622, + "learning_rate": 4.5726535857188316e-05, + "loss": 0.7702, + "step": 7192 + }, + { + "epoch": 0.8529586149650183, + "grad_norm": 1.2997294969661808, + "learning_rate": 4.572519358166147e-05, + "loss": 0.3647, + "step": 7193 + }, + { + "epoch": 0.8530771967271433, + "grad_norm": 1.2397749364991366, + "learning_rate": 4.572385111507376e-05, + "loss": 0.3666, + "step": 7194 + }, + { + "epoch": 0.8531957784892683, + "grad_norm": 1.3291678853883875, + "learning_rate": 4.572250845743755e-05, + "loss": 0.4318, + "step": 7195 + }, + { + "epoch": 0.8533143602513933, + "grad_norm": 1.184146312080771, + "learning_rate": 4.5721165608765225e-05, + "loss": 0.385, + "step": 7196 + }, + { + "epoch": 0.8534329420135183, + "grad_norm": 1.081240635788778, + "learning_rate": 4.571982256906917e-05, + "loss": 0.406, + "step": 7197 + }, + { + "epoch": 0.8535515237756434, + "grad_norm": 1.2394027547660684, + "learning_rate": 4.571847933836175e-05, + "loss": 0.42, + "step": 7198 + }, + { + "epoch": 0.8536701055377683, + "grad_norm": 1.212363036208342, + "learning_rate": 4.571713591665536e-05, + "loss": 0.3068, + "step": 7199 + }, + { + "epoch": 0.8537886872998933, + "grad_norm": 1.6620619292836225, + "learning_rate": 4.5715792303962386e-05, + "loss": 0.5966, + "step": 7200 + }, + { + "epoch": 0.8539072690620183, + "grad_norm": 1.4126581643668192, + "learning_rate": 4.57144485002952e-05, + "loss": 0.3321, + "step": 7201 + }, + { + "epoch": 0.8540258508241433, + "grad_norm": 1.3820042969067414, + "learning_rate": 4.5713104505666213e-05, + "loss": 0.3713, + "step": 7202 + }, + { + "epoch": 0.8541444325862683, + "grad_norm": 1.2879440354263358, + "learning_rate": 4.5711760320087803e-05, + "loss": 0.4177, + "step": 7203 + }, + { + "epoch": 0.8542630143483932, + "grad_norm": 1.4790864268609756, + "learning_rate": 4.571041594357235e-05, + "loss": 0.4029, + "step": 7204 + }, + { + "epoch": 0.8543815961105182, + "grad_norm": 1.5732935327504733, + "learning_rate": 4.570907137613227e-05, + "loss": 0.5195, + "step": 7205 + }, + { + "epoch": 0.8545001778726432, + "grad_norm": 1.1219793877650712, + "learning_rate": 4.5707726617779943e-05, + "loss": 0.2925, + "step": 7206 + }, + { + "epoch": 0.8546187596347682, + "grad_norm": 1.244067696569319, + "learning_rate": 4.570638166852776e-05, + "loss": 0.3946, + "step": 7207 + }, + { + "epoch": 0.8547373413968932, + "grad_norm": 1.7764259090649517, + "learning_rate": 4.570503652838815e-05, + "loss": 0.4663, + "step": 7208 + }, + { + "epoch": 0.8548559231590181, + "grad_norm": 1.2322336770989168, + "learning_rate": 4.570369119737348e-05, + "loss": 0.3218, + "step": 7209 + }, + { + "epoch": 0.8549745049211431, + "grad_norm": 1.573261261542684, + "learning_rate": 4.5702345675496166e-05, + "loss": 0.4099, + "step": 7210 + }, + { + "epoch": 0.8550930866832681, + "grad_norm": 1.2343368110775814, + "learning_rate": 4.5700999962768614e-05, + "loss": 0.2962, + "step": 7211 + }, + { + "epoch": 0.8552116684453931, + "grad_norm": 1.6675441839032255, + "learning_rate": 4.5699654059203225e-05, + "loss": 0.474, + "step": 7212 + }, + { + "epoch": 0.8553302502075181, + "grad_norm": 1.1715109092534155, + "learning_rate": 4.569830796481241e-05, + "loss": 0.3609, + "step": 7213 + }, + { + "epoch": 0.855448831969643, + "grad_norm": 1.6089259214319938, + "learning_rate": 4.5696961679608584e-05, + "loss": 0.4562, + "step": 7214 + }, + { + "epoch": 0.855567413731768, + "grad_norm": 1.4412469978839653, + "learning_rate": 4.5695615203604146e-05, + "loss": 0.3955, + "step": 7215 + }, + { + "epoch": 0.855685995493893, + "grad_norm": 1.3827848078066278, + "learning_rate": 4.5694268536811514e-05, + "loss": 0.4146, + "step": 7216 + }, + { + "epoch": 0.855804577256018, + "grad_norm": 1.2689538946340666, + "learning_rate": 4.569292167924311e-05, + "loss": 0.4168, + "step": 7217 + }, + { + "epoch": 0.855923159018143, + "grad_norm": 1.6551243620809284, + "learning_rate": 4.5691574630911336e-05, + "loss": 0.4076, + "step": 7218 + }, + { + "epoch": 0.856041740780268, + "grad_norm": 1.4175770036879314, + "learning_rate": 4.5690227391828624e-05, + "loss": 0.566, + "step": 7219 + }, + { + "epoch": 0.8561603225423929, + "grad_norm": 1.6677587536128793, + "learning_rate": 4.568887996200738e-05, + "loss": 0.5586, + "step": 7220 + }, + { + "epoch": 0.8562789043045179, + "grad_norm": 1.5621912674001708, + "learning_rate": 4.5687532341460035e-05, + "loss": 0.4438, + "step": 7221 + }, + { + "epoch": 0.8563974860666429, + "grad_norm": 1.5197221377336454, + "learning_rate": 4.568618453019901e-05, + "loss": 0.4657, + "step": 7222 + }, + { + "epoch": 0.856516067828768, + "grad_norm": 1.4254461731026589, + "learning_rate": 4.568483652823673e-05, + "loss": 0.48, + "step": 7223 + }, + { + "epoch": 0.856634649590893, + "grad_norm": 1.4387945232129289, + "learning_rate": 4.568348833558563e-05, + "loss": 0.4482, + "step": 7224 + }, + { + "epoch": 0.856753231353018, + "grad_norm": 1.1836569369377596, + "learning_rate": 4.568213995225812e-05, + "loss": 0.3391, + "step": 7225 + }, + { + "epoch": 0.8568718131151429, + "grad_norm": 1.2195785612446952, + "learning_rate": 4.568079137826665e-05, + "loss": 0.4256, + "step": 7226 + }, + { + "epoch": 0.8569903948772679, + "grad_norm": 1.4204226500928934, + "learning_rate": 4.5679442613623644e-05, + "loss": 0.3891, + "step": 7227 + }, + { + "epoch": 0.8571089766393929, + "grad_norm": 1.3435542071392623, + "learning_rate": 4.567809365834154e-05, + "loss": 0.3556, + "step": 7228 + }, + { + "epoch": 0.8572275584015179, + "grad_norm": 1.4999118970009129, + "learning_rate": 4.567674451243276e-05, + "loss": 0.4395, + "step": 7229 + }, + { + "epoch": 0.8573461401636429, + "grad_norm": 1.1289444365423957, + "learning_rate": 4.567539517590975e-05, + "loss": 0.3238, + "step": 7230 + }, + { + "epoch": 0.8574647219257678, + "grad_norm": 1.264474620617719, + "learning_rate": 4.567404564878496e-05, + "loss": 0.3813, + "step": 7231 + }, + { + "epoch": 0.8575833036878928, + "grad_norm": 1.4625307855594791, + "learning_rate": 4.5672695931070806e-05, + "loss": 0.5091, + "step": 7232 + }, + { + "epoch": 0.8577018854500178, + "grad_norm": 1.1142959631792297, + "learning_rate": 4.567134602277976e-05, + "loss": 0.3361, + "step": 7233 + }, + { + "epoch": 0.8578204672121428, + "grad_norm": 1.4860897652517404, + "learning_rate": 4.566999592392425e-05, + "loss": 0.535, + "step": 7234 + }, + { + "epoch": 0.8579390489742678, + "grad_norm": 1.5479028396742451, + "learning_rate": 4.5668645634516726e-05, + "loss": 0.4568, + "step": 7235 + }, + { + "epoch": 0.8580576307363927, + "grad_norm": 1.2961927964718378, + "learning_rate": 4.566729515456963e-05, + "loss": 0.4268, + "step": 7236 + }, + { + "epoch": 0.8581762124985177, + "grad_norm": 1.5087685748886726, + "learning_rate": 4.566594448409542e-05, + "loss": 0.3923, + "step": 7237 + }, + { + "epoch": 0.8582947942606427, + "grad_norm": 1.0165692840669505, + "learning_rate": 4.5664593623106544e-05, + "loss": 0.2594, + "step": 7238 + }, + { + "epoch": 0.8584133760227677, + "grad_norm": 2.277122188758163, + "learning_rate": 4.566324257161545e-05, + "loss": 0.6586, + "step": 7239 + }, + { + "epoch": 0.8585319577848927, + "grad_norm": 0.9789638963805193, + "learning_rate": 4.566189132963461e-05, + "loss": 0.2685, + "step": 7240 + }, + { + "epoch": 0.8586505395470176, + "grad_norm": 1.2959229486519026, + "learning_rate": 4.566053989717646e-05, + "loss": 0.3687, + "step": 7241 + }, + { + "epoch": 0.8587691213091426, + "grad_norm": 1.90405346566684, + "learning_rate": 4.5659188274253465e-05, + "loss": 0.5954, + "step": 7242 + }, + { + "epoch": 0.8588877030712676, + "grad_norm": 1.407913105975911, + "learning_rate": 4.56578364608781e-05, + "loss": 0.5968, + "step": 7243 + }, + { + "epoch": 0.8590062848333926, + "grad_norm": 1.4751038551192563, + "learning_rate": 4.565648445706281e-05, + "loss": 0.3709, + "step": 7244 + }, + { + "epoch": 0.8591248665955176, + "grad_norm": 1.4039496073803597, + "learning_rate": 4.5655132262820056e-05, + "loss": 0.4476, + "step": 7245 + }, + { + "epoch": 0.8592434483576425, + "grad_norm": 1.1966749253824351, + "learning_rate": 4.565377987816232e-05, + "loss": 0.3857, + "step": 7246 + }, + { + "epoch": 0.8593620301197675, + "grad_norm": 1.535926851951442, + "learning_rate": 4.565242730310206e-05, + "loss": 0.4, + "step": 7247 + }, + { + "epoch": 0.8594806118818926, + "grad_norm": 1.3177854834484768, + "learning_rate": 4.565107453765174e-05, + "loss": 0.4102, + "step": 7248 + }, + { + "epoch": 0.8595991936440176, + "grad_norm": 1.1153218508366356, + "learning_rate": 4.564972158182384e-05, + "loss": 0.3483, + "step": 7249 + }, + { + "epoch": 0.8597177754061426, + "grad_norm": 0.9991740844194126, + "learning_rate": 4.5648368435630834e-05, + "loss": 0.3274, + "step": 7250 + }, + { + "epoch": 0.8598363571682676, + "grad_norm": 1.1991559897951332, + "learning_rate": 4.564701509908519e-05, + "loss": 0.3228, + "step": 7251 + }, + { + "epoch": 0.8599549389303925, + "grad_norm": 1.580928061408572, + "learning_rate": 4.564566157219938e-05, + "loss": 0.6491, + "step": 7252 + }, + { + "epoch": 0.8600735206925175, + "grad_norm": 1.3620669990585974, + "learning_rate": 4.5644307854985894e-05, + "loss": 0.4298, + "step": 7253 + }, + { + "epoch": 0.8601921024546425, + "grad_norm": 1.2506920734641562, + "learning_rate": 4.56429539474572e-05, + "loss": 0.3907, + "step": 7254 + }, + { + "epoch": 0.8603106842167675, + "grad_norm": 1.81220072272726, + "learning_rate": 4.5641599849625794e-05, + "loss": 0.549, + "step": 7255 + }, + { + "epoch": 0.8604292659788925, + "grad_norm": 1.6901194818035248, + "learning_rate": 4.564024556150414e-05, + "loss": 0.5167, + "step": 7256 + }, + { + "epoch": 0.8605478477410174, + "grad_norm": 1.6550745626631662, + "learning_rate": 4.5638891083104743e-05, + "loss": 0.5885, + "step": 7257 + }, + { + "epoch": 0.8606664295031424, + "grad_norm": 1.196782985394039, + "learning_rate": 4.5637536414440065e-05, + "loss": 0.3136, + "step": 7258 + }, + { + "epoch": 0.8607850112652674, + "grad_norm": 1.247181198969873, + "learning_rate": 4.563618155552262e-05, + "loss": 0.4084, + "step": 7259 + }, + { + "epoch": 0.8609035930273924, + "grad_norm": 1.2218942297562996, + "learning_rate": 4.563482650636488e-05, + "loss": 0.2182, + "step": 7260 + }, + { + "epoch": 0.8610221747895174, + "grad_norm": 1.2992411118129639, + "learning_rate": 4.563347126697935e-05, + "loss": 0.4852, + "step": 7261 + }, + { + "epoch": 0.8611407565516424, + "grad_norm": 1.351514278724903, + "learning_rate": 4.563211583737851e-05, + "loss": 0.4644, + "step": 7262 + }, + { + "epoch": 0.8612593383137673, + "grad_norm": 1.4680034859532027, + "learning_rate": 4.563076021757487e-05, + "loss": 0.4816, + "step": 7263 + }, + { + "epoch": 0.8613779200758923, + "grad_norm": 1.3502992356354593, + "learning_rate": 4.562940440758091e-05, + "loss": 0.3825, + "step": 7264 + }, + { + "epoch": 0.8614965018380173, + "grad_norm": 1.0757651705675577, + "learning_rate": 4.5628048407409155e-05, + "loss": 0.3005, + "step": 7265 + }, + { + "epoch": 0.8616150836001423, + "grad_norm": 1.6180130033516864, + "learning_rate": 4.5626692217072074e-05, + "loss": 0.5993, + "step": 7266 + }, + { + "epoch": 0.8617336653622673, + "grad_norm": 1.5881343174749234, + "learning_rate": 4.5625335836582195e-05, + "loss": 0.533, + "step": 7267 + }, + { + "epoch": 0.8618522471243922, + "grad_norm": 1.3225607468519462, + "learning_rate": 4.562397926595201e-05, + "loss": 0.506, + "step": 7268 + }, + { + "epoch": 0.8619708288865172, + "grad_norm": 1.2355348588850255, + "learning_rate": 4.5622622505194024e-05, + "loss": 0.3525, + "step": 7269 + }, + { + "epoch": 0.8620894106486422, + "grad_norm": 1.5086741503387628, + "learning_rate": 4.562126555432075e-05, + "loss": 0.5134, + "step": 7270 + }, + { + "epoch": 0.8622079924107672, + "grad_norm": 1.6567325945214533, + "learning_rate": 4.5619908413344694e-05, + "loss": 0.5708, + "step": 7271 + }, + { + "epoch": 0.8623265741728922, + "grad_norm": 1.337268046844791, + "learning_rate": 4.561855108227836e-05, + "loss": 0.299, + "step": 7272 + }, + { + "epoch": 0.8624451559350172, + "grad_norm": 1.8350238533769732, + "learning_rate": 4.561719356113428e-05, + "loss": 0.4984, + "step": 7273 + }, + { + "epoch": 0.8625637376971422, + "grad_norm": 1.4626656977901709, + "learning_rate": 4.5615835849924955e-05, + "loss": 0.4396, + "step": 7274 + }, + { + "epoch": 0.8626823194592672, + "grad_norm": 1.3794101013005622, + "learning_rate": 4.561447794866291e-05, + "loss": 0.4039, + "step": 7275 + }, + { + "epoch": 0.8628009012213922, + "grad_norm": 1.4743154596282135, + "learning_rate": 4.561311985736064e-05, + "loss": 0.4753, + "step": 7276 + }, + { + "epoch": 0.8629194829835172, + "grad_norm": 1.3044749067279588, + "learning_rate": 4.56117615760307e-05, + "loss": 0.448, + "step": 7277 + }, + { + "epoch": 0.8630380647456422, + "grad_norm": 1.2313529451815786, + "learning_rate": 4.561040310468558e-05, + "loss": 0.3353, + "step": 7278 + }, + { + "epoch": 0.8631566465077671, + "grad_norm": 0.9761117459170572, + "learning_rate": 4.5609044443337834e-05, + "loss": 0.2572, + "step": 7279 + }, + { + "epoch": 0.8632752282698921, + "grad_norm": 1.4772815452986243, + "learning_rate": 4.560768559199996e-05, + "loss": 0.4457, + "step": 7280 + }, + { + "epoch": 0.8633938100320171, + "grad_norm": 1.5556919850799436, + "learning_rate": 4.56063265506845e-05, + "loss": 0.5264, + "step": 7281 + }, + { + "epoch": 0.8635123917941421, + "grad_norm": 1.3856253910314773, + "learning_rate": 4.560496731940398e-05, + "loss": 0.3668, + "step": 7282 + }, + { + "epoch": 0.8636309735562671, + "grad_norm": 1.327469926517054, + "learning_rate": 4.560360789817092e-05, + "loss": 0.4575, + "step": 7283 + }, + { + "epoch": 0.863749555318392, + "grad_norm": 1.2585881078908525, + "learning_rate": 4.560224828699786e-05, + "loss": 0.4443, + "step": 7284 + }, + { + "epoch": 0.863868137080517, + "grad_norm": 1.4485166280687547, + "learning_rate": 4.5600888485897346e-05, + "loss": 0.3869, + "step": 7285 + }, + { + "epoch": 0.863986718842642, + "grad_norm": 1.117627504897691, + "learning_rate": 4.55995284948819e-05, + "loss": 0.333, + "step": 7286 + }, + { + "epoch": 0.864105300604767, + "grad_norm": 1.612365582833256, + "learning_rate": 4.5598168313964064e-05, + "loss": 0.4684, + "step": 7287 + }, + { + "epoch": 0.864223882366892, + "grad_norm": 1.3844785365958128, + "learning_rate": 4.559680794315637e-05, + "loss": 0.4613, + "step": 7288 + }, + { + "epoch": 0.8643424641290169, + "grad_norm": 1.4126870795048978, + "learning_rate": 4.559544738247136e-05, + "loss": 0.4052, + "step": 7289 + }, + { + "epoch": 0.8644610458911419, + "grad_norm": 1.998573003667266, + "learning_rate": 4.5594086631921584e-05, + "loss": 0.5056, + "step": 7290 + }, + { + "epoch": 0.8645796276532669, + "grad_norm": 1.2040630968484565, + "learning_rate": 4.559272569151959e-05, + "loss": 0.35, + "step": 7291 + }, + { + "epoch": 0.8646982094153919, + "grad_norm": 1.65640061563667, + "learning_rate": 4.559136456127792e-05, + "loss": 0.4595, + "step": 7292 + }, + { + "epoch": 0.8648167911775169, + "grad_norm": 1.4814822336600904, + "learning_rate": 4.559000324120911e-05, + "loss": 0.367, + "step": 7293 + }, + { + "epoch": 0.8649353729396418, + "grad_norm": 1.5132246235247184, + "learning_rate": 4.558864173132572e-05, + "loss": 0.5336, + "step": 7294 + }, + { + "epoch": 0.8650539547017668, + "grad_norm": 1.1176134558115531, + "learning_rate": 4.55872800316403e-05, + "loss": 0.3577, + "step": 7295 + }, + { + "epoch": 0.8651725364638918, + "grad_norm": 1.090306694268348, + "learning_rate": 4.558591814216541e-05, + "loss": 0.2857, + "step": 7296 + }, + { + "epoch": 0.8652911182260168, + "grad_norm": 1.433719317049325, + "learning_rate": 4.558455606291359e-05, + "loss": 0.4027, + "step": 7297 + }, + { + "epoch": 0.8654096999881419, + "grad_norm": 1.6721417577004416, + "learning_rate": 4.5583193793897414e-05, + "loss": 0.5055, + "step": 7298 + }, + { + "epoch": 0.8655282817502669, + "grad_norm": 1.2089968469512535, + "learning_rate": 4.558183133512943e-05, + "loss": 0.4274, + "step": 7299 + }, + { + "epoch": 0.8656468635123918, + "grad_norm": 1.6055891498798285, + "learning_rate": 4.5580468686622194e-05, + "loss": 0.5293, + "step": 7300 + }, + { + "epoch": 0.8657654452745168, + "grad_norm": 1.4425252929514543, + "learning_rate": 4.557910584838828e-05, + "loss": 0.3672, + "step": 7301 + }, + { + "epoch": 0.8658840270366418, + "grad_norm": 1.2747186941396376, + "learning_rate": 4.557774282044025e-05, + "loss": 0.4354, + "step": 7302 + }, + { + "epoch": 0.8660026087987668, + "grad_norm": 1.4102791704856672, + "learning_rate": 4.557637960279066e-05, + "loss": 0.4173, + "step": 7303 + }, + { + "epoch": 0.8661211905608918, + "grad_norm": 1.2143207345699873, + "learning_rate": 4.557501619545208e-05, + "loss": 0.3154, + "step": 7304 + }, + { + "epoch": 0.8662397723230167, + "grad_norm": 1.176887771433976, + "learning_rate": 4.557365259843708e-05, + "loss": 0.3401, + "step": 7305 + }, + { + "epoch": 0.8663583540851417, + "grad_norm": 1.6748457475953071, + "learning_rate": 4.557228881175824e-05, + "loss": 0.3923, + "step": 7306 + }, + { + "epoch": 0.8664769358472667, + "grad_norm": 1.8718797406931287, + "learning_rate": 4.557092483542812e-05, + "loss": 0.648, + "step": 7307 + }, + { + "epoch": 0.8665955176093917, + "grad_norm": 1.3440814574674707, + "learning_rate": 4.55695606694593e-05, + "loss": 0.398, + "step": 7308 + }, + { + "epoch": 0.8667140993715167, + "grad_norm": 1.2981662704299994, + "learning_rate": 4.5568196313864356e-05, + "loss": 0.3446, + "step": 7309 + }, + { + "epoch": 0.8668326811336416, + "grad_norm": 1.1927849497728173, + "learning_rate": 4.556683176865586e-05, + "loss": 0.4024, + "step": 7310 + }, + { + "epoch": 0.8669512628957666, + "grad_norm": 1.257503295530811, + "learning_rate": 4.5565467033846406e-05, + "loss": 0.3471, + "step": 7311 + }, + { + "epoch": 0.8670698446578916, + "grad_norm": 1.41468994836031, + "learning_rate": 4.5564102109448554e-05, + "loss": 0.4202, + "step": 7312 + }, + { + "epoch": 0.8671884264200166, + "grad_norm": 1.4653973129115616, + "learning_rate": 4.55627369954749e-05, + "loss": 0.5187, + "step": 7313 + }, + { + "epoch": 0.8673070081821416, + "grad_norm": 1.2161406509889499, + "learning_rate": 4.5561371691938024e-05, + "loss": 0.2886, + "step": 7314 + }, + { + "epoch": 0.8674255899442666, + "grad_norm": 1.2474042306496682, + "learning_rate": 4.5560006198850526e-05, + "loss": 0.3675, + "step": 7315 + }, + { + "epoch": 0.8675441717063915, + "grad_norm": 2.017584314892035, + "learning_rate": 4.555864051622497e-05, + "loss": 0.6731, + "step": 7316 + }, + { + "epoch": 0.8676627534685165, + "grad_norm": 1.5942216169585766, + "learning_rate": 4.5557274644073974e-05, + "loss": 0.4822, + "step": 7317 + }, + { + "epoch": 0.8677813352306415, + "grad_norm": 1.3491880081944014, + "learning_rate": 4.55559085824101e-05, + "loss": 0.3458, + "step": 7318 + }, + { + "epoch": 0.8678999169927665, + "grad_norm": 1.1501681512053088, + "learning_rate": 4.555454233124596e-05, + "loss": 0.3004, + "step": 7319 + }, + { + "epoch": 0.8680184987548915, + "grad_norm": 1.4401299590622108, + "learning_rate": 4.555317589059415e-05, + "loss": 0.3717, + "step": 7320 + }, + { + "epoch": 0.8681370805170164, + "grad_norm": 1.1386699355063572, + "learning_rate": 4.555180926046726e-05, + "loss": 0.3464, + "step": 7321 + }, + { + "epoch": 0.8682556622791414, + "grad_norm": 1.4254965165847429, + "learning_rate": 4.5550442440877896e-05, + "loss": 0.5318, + "step": 7322 + }, + { + "epoch": 0.8683742440412665, + "grad_norm": 1.2053035241929624, + "learning_rate": 4.554907543183865e-05, + "loss": 0.3676, + "step": 7323 + }, + { + "epoch": 0.8684928258033915, + "grad_norm": 1.8512066705449168, + "learning_rate": 4.5547708233362117e-05, + "loss": 0.7014, + "step": 7324 + }, + { + "epoch": 0.8686114075655165, + "grad_norm": 1.117403592933197, + "learning_rate": 4.554634084546092e-05, + "loss": 0.3245, + "step": 7325 + }, + { + "epoch": 0.8687299893276414, + "grad_norm": 1.5440509552945505, + "learning_rate": 4.5544973268147664e-05, + "loss": 0.4129, + "step": 7326 + }, + { + "epoch": 0.8688485710897664, + "grad_norm": 1.2824227864574294, + "learning_rate": 4.5543605501434935e-05, + "loss": 0.387, + "step": 7327 + }, + { + "epoch": 0.8689671528518914, + "grad_norm": 1.0650820880751042, + "learning_rate": 4.554223754533536e-05, + "loss": 0.2984, + "step": 7328 + }, + { + "epoch": 0.8690857346140164, + "grad_norm": 1.0576952981903065, + "learning_rate": 4.554086939986155e-05, + "loss": 0.3245, + "step": 7329 + }, + { + "epoch": 0.8692043163761414, + "grad_norm": 1.4285320037603566, + "learning_rate": 4.553950106502611e-05, + "loss": 0.4047, + "step": 7330 + }, + { + "epoch": 0.8693228981382664, + "grad_norm": 1.329206109413159, + "learning_rate": 4.553813254084166e-05, + "loss": 0.3306, + "step": 7331 + }, + { + "epoch": 0.8694414799003913, + "grad_norm": 1.8243529509906597, + "learning_rate": 4.55367638273208e-05, + "loss": 0.4869, + "step": 7332 + }, + { + "epoch": 0.8695600616625163, + "grad_norm": 1.2139700087511072, + "learning_rate": 4.553539492447618e-05, + "loss": 0.3355, + "step": 7333 + }, + { + "epoch": 0.8696786434246413, + "grad_norm": 1.3971359873504205, + "learning_rate": 4.5534025832320395e-05, + "loss": 0.4779, + "step": 7334 + }, + { + "epoch": 0.8697972251867663, + "grad_norm": 1.2427392172016494, + "learning_rate": 4.5532656550866066e-05, + "loss": 0.3066, + "step": 7335 + }, + { + "epoch": 0.8699158069488913, + "grad_norm": 1.71943801000163, + "learning_rate": 4.553128708012583e-05, + "loss": 0.4433, + "step": 7336 + }, + { + "epoch": 0.8700343887110162, + "grad_norm": 1.3957801775648397, + "learning_rate": 4.55299174201123e-05, + "loss": 0.3785, + "step": 7337 + }, + { + "epoch": 0.8701529704731412, + "grad_norm": 1.7273384349037717, + "learning_rate": 4.552854757083811e-05, + "loss": 0.5541, + "step": 7338 + }, + { + "epoch": 0.8702715522352662, + "grad_norm": 1.88961035694688, + "learning_rate": 4.552717753231588e-05, + "loss": 0.5279, + "step": 7339 + }, + { + "epoch": 0.8703901339973912, + "grad_norm": 1.172766369151261, + "learning_rate": 4.552580730455824e-05, + "loss": 0.3119, + "step": 7340 + }, + { + "epoch": 0.8705087157595162, + "grad_norm": 1.1879080005085287, + "learning_rate": 4.552443688757784e-05, + "loss": 0.3303, + "step": 7341 + }, + { + "epoch": 0.8706272975216411, + "grad_norm": 1.5657196552764294, + "learning_rate": 4.552306628138729e-05, + "loss": 0.4455, + "step": 7342 + }, + { + "epoch": 0.8707458792837661, + "grad_norm": 1.51141049630003, + "learning_rate": 4.552169548599925e-05, + "loss": 0.4988, + "step": 7343 + }, + { + "epoch": 0.8708644610458911, + "grad_norm": 1.4465151909557281, + "learning_rate": 4.552032450142633e-05, + "loss": 0.4373, + "step": 7344 + }, + { + "epoch": 0.8709830428080161, + "grad_norm": 1.304997528595201, + "learning_rate": 4.551895332768119e-05, + "loss": 0.3168, + "step": 7345 + }, + { + "epoch": 0.8711016245701411, + "grad_norm": 1.4006279009759413, + "learning_rate": 4.551758196477646e-05, + "loss": 0.4118, + "step": 7346 + }, + { + "epoch": 0.871220206332266, + "grad_norm": 1.6058380543940962, + "learning_rate": 4.551621041272478e-05, + "loss": 0.3664, + "step": 7347 + }, + { + "epoch": 0.8713387880943911, + "grad_norm": 1.282195515787323, + "learning_rate": 4.5514838671538804e-05, + "loss": 0.2976, + "step": 7348 + }, + { + "epoch": 0.8714573698565161, + "grad_norm": 1.139299830685338, + "learning_rate": 4.551346674123117e-05, + "loss": 0.2998, + "step": 7349 + }, + { + "epoch": 0.8715759516186411, + "grad_norm": 1.6773530100454919, + "learning_rate": 4.551209462181453e-05, + "loss": 0.4718, + "step": 7350 + }, + { + "epoch": 0.8716945333807661, + "grad_norm": 1.4767097813520451, + "learning_rate": 4.551072231330153e-05, + "loss": 0.5514, + "step": 7351 + }, + { + "epoch": 0.8718131151428911, + "grad_norm": 1.3425300748359141, + "learning_rate": 4.550934981570482e-05, + "loss": 0.497, + "step": 7352 + }, + { + "epoch": 0.871931696905016, + "grad_norm": 1.1034425783569044, + "learning_rate": 4.550797712903705e-05, + "loss": 0.2837, + "step": 7353 + }, + { + "epoch": 0.872050278667141, + "grad_norm": 1.576009466773589, + "learning_rate": 4.5506604253310895e-05, + "loss": 0.5901, + "step": 7354 + }, + { + "epoch": 0.872168860429266, + "grad_norm": 1.4206196372364686, + "learning_rate": 4.5505231188538974e-05, + "loss": 0.3551, + "step": 7355 + }, + { + "epoch": 0.872287442191391, + "grad_norm": 1.4692112578420655, + "learning_rate": 4.5503857934733985e-05, + "loss": 0.4228, + "step": 7356 + }, + { + "epoch": 0.872406023953516, + "grad_norm": 1.359715528153113, + "learning_rate": 4.550248449190856e-05, + "loss": 0.4521, + "step": 7357 + }, + { + "epoch": 0.8725246057156409, + "grad_norm": 1.1886360175929769, + "learning_rate": 4.550111086007538e-05, + "loss": 0.2916, + "step": 7358 + }, + { + "epoch": 0.8726431874777659, + "grad_norm": 1.4211127106401604, + "learning_rate": 4.5499737039247084e-05, + "loss": 0.4596, + "step": 7359 + }, + { + "epoch": 0.8727617692398909, + "grad_norm": 1.14806114369829, + "learning_rate": 4.5498363029436364e-05, + "loss": 0.3095, + "step": 7360 + }, + { + "epoch": 0.8728803510020159, + "grad_norm": 1.3734226065174444, + "learning_rate": 4.5496988830655866e-05, + "loss": 0.4486, + "step": 7361 + }, + { + "epoch": 0.8729989327641409, + "grad_norm": 1.1246872719593164, + "learning_rate": 4.549561444291827e-05, + "loss": 0.3625, + "step": 7362 + }, + { + "epoch": 0.8731175145262658, + "grad_norm": 1.216917599264302, + "learning_rate": 4.549423986623623e-05, + "loss": 0.3113, + "step": 7363 + }, + { + "epoch": 0.8732360962883908, + "grad_norm": 1.3633339304097434, + "learning_rate": 4.549286510062245e-05, + "loss": 0.3906, + "step": 7364 + }, + { + "epoch": 0.8733546780505158, + "grad_norm": 1.1557294005356344, + "learning_rate": 4.549149014608957e-05, + "loss": 0.3898, + "step": 7365 + }, + { + "epoch": 0.8734732598126408, + "grad_norm": 1.4791551645135457, + "learning_rate": 4.5490115002650286e-05, + "loss": 0.4225, + "step": 7366 + }, + { + "epoch": 0.8735918415747658, + "grad_norm": 1.2526673774739785, + "learning_rate": 4.548873967031727e-05, + "loss": 0.3602, + "step": 7367 + }, + { + "epoch": 0.8737104233368908, + "grad_norm": 1.283930356058675, + "learning_rate": 4.5487364149103185e-05, + "loss": 0.335, + "step": 7368 + }, + { + "epoch": 0.8738290050990157, + "grad_norm": 1.2593852666426293, + "learning_rate": 4.548598843902074e-05, + "loss": 0.3793, + "step": 7369 + }, + { + "epoch": 0.8739475868611407, + "grad_norm": 1.2698143995947708, + "learning_rate": 4.5484612540082594e-05, + "loss": 0.4456, + "step": 7370 + }, + { + "epoch": 0.8740661686232657, + "grad_norm": 1.309544748815254, + "learning_rate": 4.548323645230145e-05, + "loss": 0.4765, + "step": 7371 + }, + { + "epoch": 0.8741847503853907, + "grad_norm": 1.246448442342234, + "learning_rate": 4.5481860175689974e-05, + "loss": 0.4613, + "step": 7372 + }, + { + "epoch": 0.8743033321475158, + "grad_norm": 1.133107490221832, + "learning_rate": 4.548048371026087e-05, + "loss": 0.2896, + "step": 7373 + }, + { + "epoch": 0.8744219139096407, + "grad_norm": 1.5834921412981975, + "learning_rate": 4.547910705602682e-05, + "loss": 0.5313, + "step": 7374 + }, + { + "epoch": 0.8745404956717657, + "grad_norm": 1.6771592893318699, + "learning_rate": 4.547773021300051e-05, + "loss": 0.5145, + "step": 7375 + }, + { + "epoch": 0.8746590774338907, + "grad_norm": 1.5251563510450794, + "learning_rate": 4.5476353181194654e-05, + "loss": 0.4674, + "step": 7376 + }, + { + "epoch": 0.8747776591960157, + "grad_norm": 1.4855851966590392, + "learning_rate": 4.547497596062192e-05, + "loss": 0.4321, + "step": 7377 + }, + { + "epoch": 0.8748962409581407, + "grad_norm": 1.1008718533772521, + "learning_rate": 4.547359855129502e-05, + "loss": 0.3546, + "step": 7378 + }, + { + "epoch": 0.8750148227202657, + "grad_norm": 1.1344392645136292, + "learning_rate": 4.547222095322664e-05, + "loss": 0.2922, + "step": 7379 + }, + { + "epoch": 0.8751334044823906, + "grad_norm": 1.1223864985053509, + "learning_rate": 4.54708431664295e-05, + "loss": 0.3644, + "step": 7380 + }, + { + "epoch": 0.8752519862445156, + "grad_norm": 1.140189346351853, + "learning_rate": 4.546946519091628e-05, + "loss": 0.3066, + "step": 7381 + }, + { + "epoch": 0.8753705680066406, + "grad_norm": 1.6984563160798498, + "learning_rate": 4.546808702669969e-05, + "loss": 0.5904, + "step": 7382 + }, + { + "epoch": 0.8754891497687656, + "grad_norm": 0.8285924862770526, + "learning_rate": 4.546670867379245e-05, + "loss": 0.2548, + "step": 7383 + }, + { + "epoch": 0.8756077315308906, + "grad_norm": 1.771462391051809, + "learning_rate": 4.546533013220724e-05, + "loss": 0.5366, + "step": 7384 + }, + { + "epoch": 0.8757263132930155, + "grad_norm": 1.2733237047674875, + "learning_rate": 4.5463951401956784e-05, + "loss": 0.4741, + "step": 7385 + }, + { + "epoch": 0.8758448950551405, + "grad_norm": 1.3081674955061557, + "learning_rate": 4.54625724830538e-05, + "loss": 0.4689, + "step": 7386 + }, + { + "epoch": 0.8759634768172655, + "grad_norm": 1.2106821970544333, + "learning_rate": 4.546119337551098e-05, + "loss": 0.4048, + "step": 7387 + }, + { + "epoch": 0.8760820585793905, + "grad_norm": 1.497032896249036, + "learning_rate": 4.5459814079341054e-05, + "loss": 0.4431, + "step": 7388 + }, + { + "epoch": 0.8762006403415155, + "grad_norm": 1.3555800419645154, + "learning_rate": 4.545843459455673e-05, + "loss": 0.4848, + "step": 7389 + }, + { + "epoch": 0.8763192221036404, + "grad_norm": 1.3329015842868543, + "learning_rate": 4.545705492117073e-05, + "loss": 0.3176, + "step": 7390 + }, + { + "epoch": 0.8764378038657654, + "grad_norm": 1.2645174177689273, + "learning_rate": 4.545567505919577e-05, + "loss": 0.4389, + "step": 7391 + }, + { + "epoch": 0.8765563856278904, + "grad_norm": 1.623111428326522, + "learning_rate": 4.545429500864456e-05, + "loss": 0.5109, + "step": 7392 + }, + { + "epoch": 0.8766749673900154, + "grad_norm": 1.1740662203383168, + "learning_rate": 4.545291476952985e-05, + "loss": 0.3536, + "step": 7393 + }, + { + "epoch": 0.8767935491521404, + "grad_norm": 1.6322385074365713, + "learning_rate": 4.545153434186433e-05, + "loss": 0.6483, + "step": 7394 + }, + { + "epoch": 0.8769121309142653, + "grad_norm": 1.6570657756963805, + "learning_rate": 4.5450153725660755e-05, + "loss": 0.5423, + "step": 7395 + }, + { + "epoch": 0.8770307126763903, + "grad_norm": 1.238899546264121, + "learning_rate": 4.5448772920931836e-05, + "loss": 0.335, + "step": 7396 + }, + { + "epoch": 0.8771492944385153, + "grad_norm": 1.2575776314704383, + "learning_rate": 4.544739192769031e-05, + "loss": 0.4462, + "step": 7397 + }, + { + "epoch": 0.8772678762006404, + "grad_norm": 1.2907989550238699, + "learning_rate": 4.5446010745948895e-05, + "loss": 0.3954, + "step": 7398 + }, + { + "epoch": 0.8773864579627654, + "grad_norm": 1.2403822736466519, + "learning_rate": 4.544462937572034e-05, + "loss": 0.3314, + "step": 7399 + }, + { + "epoch": 0.8775050397248904, + "grad_norm": 0.8921602166907387, + "learning_rate": 4.544324781701737e-05, + "loss": 0.2817, + "step": 7400 + }, + { + "epoch": 0.8776236214870153, + "grad_norm": 1.1047767010083496, + "learning_rate": 4.544186606985273e-05, + "loss": 0.3303, + "step": 7401 + }, + { + "epoch": 0.8777422032491403, + "grad_norm": 1.3439043672133621, + "learning_rate": 4.5440484134239146e-05, + "loss": 0.3863, + "step": 7402 + }, + { + "epoch": 0.8778607850112653, + "grad_norm": 1.2147159944492925, + "learning_rate": 4.5439102010189375e-05, + "loss": 0.4477, + "step": 7403 + }, + { + "epoch": 0.8779793667733903, + "grad_norm": 1.455164523621153, + "learning_rate": 4.543771969771613e-05, + "loss": 0.541, + "step": 7404 + }, + { + "epoch": 0.8780979485355153, + "grad_norm": 1.2539517111536944, + "learning_rate": 4.543633719683219e-05, + "loss": 0.3689, + "step": 7405 + }, + { + "epoch": 0.8782165302976402, + "grad_norm": 1.3011222947766705, + "learning_rate": 4.543495450755027e-05, + "loss": 0.4651, + "step": 7406 + }, + { + "epoch": 0.8783351120597652, + "grad_norm": 1.485531209048176, + "learning_rate": 4.5433571629883134e-05, + "loss": 0.5985, + "step": 7407 + }, + { + "epoch": 0.8784536938218902, + "grad_norm": 1.6911435362466862, + "learning_rate": 4.5432188563843516e-05, + "loss": 0.4357, + "step": 7408 + }, + { + "epoch": 0.8785722755840152, + "grad_norm": 1.8140269950860681, + "learning_rate": 4.543080530944418e-05, + "loss": 0.6002, + "step": 7409 + }, + { + "epoch": 0.8786908573461402, + "grad_norm": 1.3130561502283085, + "learning_rate": 4.542942186669788e-05, + "loss": 0.4256, + "step": 7410 + }, + { + "epoch": 0.8788094391082651, + "grad_norm": 1.4585855044897411, + "learning_rate": 4.5428038235617356e-05, + "loss": 0.4432, + "step": 7411 + }, + { + "epoch": 0.8789280208703901, + "grad_norm": 1.2304939384987652, + "learning_rate": 4.542665441621537e-05, + "loss": 0.3274, + "step": 7412 + }, + { + "epoch": 0.8790466026325151, + "grad_norm": 1.1922538363357853, + "learning_rate": 4.5425270408504676e-05, + "loss": 0.3028, + "step": 7413 + }, + { + "epoch": 0.8791651843946401, + "grad_norm": 1.0594932421220316, + "learning_rate": 4.542388621249805e-05, + "loss": 0.3196, + "step": 7414 + }, + { + "epoch": 0.8792837661567651, + "grad_norm": 1.3611899596243262, + "learning_rate": 4.542250182820822e-05, + "loss": 0.3945, + "step": 7415 + }, + { + "epoch": 0.87940234791889, + "grad_norm": 1.1249516245148061, + "learning_rate": 4.542111725564798e-05, + "loss": 0.3685, + "step": 7416 + }, + { + "epoch": 0.879520929681015, + "grad_norm": 1.3807266475930489, + "learning_rate": 4.541973249483008e-05, + "loss": 0.3935, + "step": 7417 + }, + { + "epoch": 0.87963951144314, + "grad_norm": 1.6217665528333411, + "learning_rate": 4.5418347545767276e-05, + "loss": 0.3805, + "step": 7418 + }, + { + "epoch": 0.879758093205265, + "grad_norm": 1.4248023288678837, + "learning_rate": 4.5416962408472354e-05, + "loss": 0.5099, + "step": 7419 + }, + { + "epoch": 0.87987667496739, + "grad_norm": 1.4665969005309927, + "learning_rate": 4.541557708295807e-05, + "loss": 0.3685, + "step": 7420 + }, + { + "epoch": 0.879995256729515, + "grad_norm": 1.2759976079611317, + "learning_rate": 4.541419156923721e-05, + "loss": 0.3833, + "step": 7421 + }, + { + "epoch": 0.8801138384916399, + "grad_norm": 1.364078508758613, + "learning_rate": 4.541280586732253e-05, + "loss": 0.5891, + "step": 7422 + }, + { + "epoch": 0.880232420253765, + "grad_norm": 1.1495915234589507, + "learning_rate": 4.541141997722682e-05, + "loss": 0.3645, + "step": 7423 + }, + { + "epoch": 0.88035100201589, + "grad_norm": 1.1085110813404089, + "learning_rate": 4.541003389896284e-05, + "loss": 0.2659, + "step": 7424 + }, + { + "epoch": 0.880469583778015, + "grad_norm": 0.927660882508683, + "learning_rate": 4.5408647632543374e-05, + "loss": 0.2395, + "step": 7425 + }, + { + "epoch": 0.88058816554014, + "grad_norm": 1.7633260255846206, + "learning_rate": 4.540726117798121e-05, + "loss": 0.53, + "step": 7426 + }, + { + "epoch": 0.880706747302265, + "grad_norm": 1.5225850659388012, + "learning_rate": 4.540587453528912e-05, + "loss": 0.4563, + "step": 7427 + }, + { + "epoch": 0.8808253290643899, + "grad_norm": 1.1524167988124348, + "learning_rate": 4.540448770447989e-05, + "loss": 0.3048, + "step": 7428 + }, + { + "epoch": 0.8809439108265149, + "grad_norm": 1.458953566367985, + "learning_rate": 4.540310068556631e-05, + "loss": 0.438, + "step": 7429 + }, + { + "epoch": 0.8810624925886399, + "grad_norm": 1.5565289837742644, + "learning_rate": 4.540171347856116e-05, + "loss": 0.3996, + "step": 7430 + }, + { + "epoch": 0.8811810743507649, + "grad_norm": 1.1035806845238953, + "learning_rate": 4.540032608347722e-05, + "loss": 0.2983, + "step": 7431 + }, + { + "epoch": 0.8812996561128899, + "grad_norm": 1.3940222241105604, + "learning_rate": 4.5398938500327306e-05, + "loss": 0.2827, + "step": 7432 + }, + { + "epoch": 0.8814182378750148, + "grad_norm": 1.4378767981369645, + "learning_rate": 4.539755072912418e-05, + "loss": 0.41, + "step": 7433 + }, + { + "epoch": 0.8815368196371398, + "grad_norm": 1.2877424340437988, + "learning_rate": 4.539616276988066e-05, + "loss": 0.3699, + "step": 7434 + }, + { + "epoch": 0.8816554013992648, + "grad_norm": 1.228235176875652, + "learning_rate": 4.5394774622609525e-05, + "loss": 0.365, + "step": 7435 + }, + { + "epoch": 0.8817739831613898, + "grad_norm": 1.3388305434404646, + "learning_rate": 4.539338628732358e-05, + "loss": 0.4483, + "step": 7436 + }, + { + "epoch": 0.8818925649235148, + "grad_norm": 1.800362397471492, + "learning_rate": 4.539199776403562e-05, + "loss": 0.5145, + "step": 7437 + }, + { + "epoch": 0.8820111466856397, + "grad_norm": 1.253933885820247, + "learning_rate": 4.539060905275844e-05, + "loss": 0.3825, + "step": 7438 + }, + { + "epoch": 0.8821297284477647, + "grad_norm": 1.1439189396406135, + "learning_rate": 4.538922015350486e-05, + "loss": 0.2829, + "step": 7439 + }, + { + "epoch": 0.8822483102098897, + "grad_norm": 1.4009906300650106, + "learning_rate": 4.5387831066287664e-05, + "loss": 0.3966, + "step": 7440 + }, + { + "epoch": 0.8823668919720147, + "grad_norm": 1.4328750243049735, + "learning_rate": 4.5386441791119674e-05, + "loss": 0.46, + "step": 7441 + }, + { + "epoch": 0.8824854737341397, + "grad_norm": 1.068162529221504, + "learning_rate": 4.538505232801369e-05, + "loss": 0.3141, + "step": 7442 + }, + { + "epoch": 0.8826040554962646, + "grad_norm": 1.7132475595448937, + "learning_rate": 4.538366267698252e-05, + "loss": 0.5287, + "step": 7443 + }, + { + "epoch": 0.8827226372583896, + "grad_norm": 1.2951134115714935, + "learning_rate": 4.538227283803897e-05, + "loss": 0.4536, + "step": 7444 + }, + { + "epoch": 0.8828412190205146, + "grad_norm": 1.315576384671872, + "learning_rate": 4.5380882811195866e-05, + "loss": 0.4417, + "step": 7445 + }, + { + "epoch": 0.8829598007826396, + "grad_norm": 1.2220443778786267, + "learning_rate": 4.537949259646601e-05, + "loss": 0.4138, + "step": 7446 + }, + { + "epoch": 0.8830783825447646, + "grad_norm": 1.1616525945594198, + "learning_rate": 4.5378102193862224e-05, + "loss": 0.3506, + "step": 7447 + }, + { + "epoch": 0.8831969643068897, + "grad_norm": 1.1348782513685463, + "learning_rate": 4.537671160339733e-05, + "loss": 0.3406, + "step": 7448 + }, + { + "epoch": 0.8833155460690146, + "grad_norm": 1.2295321090690572, + "learning_rate": 4.537532082508414e-05, + "loss": 0.4341, + "step": 7449 + }, + { + "epoch": 0.8834341278311396, + "grad_norm": 1.0330260534403048, + "learning_rate": 4.5373929858935483e-05, + "loss": 0.2963, + "step": 7450 + }, + { + "epoch": 0.8835527095932646, + "grad_norm": 1.225152965075056, + "learning_rate": 4.5372538704964174e-05, + "loss": 0.3985, + "step": 7451 + }, + { + "epoch": 0.8836712913553896, + "grad_norm": 1.1716191465770958, + "learning_rate": 4.5371147363183034e-05, + "loss": 0.3311, + "step": 7452 + }, + { + "epoch": 0.8837898731175146, + "grad_norm": 1.105796903943377, + "learning_rate": 4.53697558336049e-05, + "loss": 0.3375, + "step": 7453 + }, + { + "epoch": 0.8839084548796395, + "grad_norm": 1.2820648972825353, + "learning_rate": 4.53683641162426e-05, + "loss": 0.3775, + "step": 7454 + }, + { + "epoch": 0.8840270366417645, + "grad_norm": 1.2631314141152539, + "learning_rate": 4.5366972211108953e-05, + "loss": 0.4757, + "step": 7455 + }, + { + "epoch": 0.8841456184038895, + "grad_norm": 1.5617999303736578, + "learning_rate": 4.536558011821681e-05, + "loss": 0.3675, + "step": 7456 + }, + { + "epoch": 0.8842642001660145, + "grad_norm": 1.379780220704358, + "learning_rate": 4.5364187837578975e-05, + "loss": 0.4975, + "step": 7457 + }, + { + "epoch": 0.8843827819281395, + "grad_norm": 1.983911440215965, + "learning_rate": 4.536279536920831e-05, + "loss": 0.6987, + "step": 7458 + }, + { + "epoch": 0.8845013636902644, + "grad_norm": 1.3495770625559849, + "learning_rate": 4.536140271311764e-05, + "loss": 0.3996, + "step": 7459 + }, + { + "epoch": 0.8846199454523894, + "grad_norm": 1.4744562866523487, + "learning_rate": 4.536000986931981e-05, + "loss": 0.4256, + "step": 7460 + }, + { + "epoch": 0.8847385272145144, + "grad_norm": 0.9318111110010412, + "learning_rate": 4.535861683782766e-05, + "loss": 0.3048, + "step": 7461 + }, + { + "epoch": 0.8848571089766394, + "grad_norm": 1.4473418359441452, + "learning_rate": 4.535722361865402e-05, + "loss": 0.3552, + "step": 7462 + }, + { + "epoch": 0.8849756907387644, + "grad_norm": 1.5702515037311215, + "learning_rate": 4.535583021181174e-05, + "loss": 0.5568, + "step": 7463 + }, + { + "epoch": 0.8850942725008893, + "grad_norm": 1.514145693167217, + "learning_rate": 4.5354436617313674e-05, + "loss": 0.4409, + "step": 7464 + }, + { + "epoch": 0.8852128542630143, + "grad_norm": 1.046737380531244, + "learning_rate": 4.5353042835172656e-05, + "loss": 0.267, + "step": 7465 + }, + { + "epoch": 0.8853314360251393, + "grad_norm": 1.2783467075664185, + "learning_rate": 4.535164886540155e-05, + "loss": 0.316, + "step": 7466 + }, + { + "epoch": 0.8854500177872643, + "grad_norm": 1.5101740311875667, + "learning_rate": 4.535025470801319e-05, + "loss": 0.5353, + "step": 7467 + }, + { + "epoch": 0.8855685995493893, + "grad_norm": 1.293522863787606, + "learning_rate": 4.534886036302044e-05, + "loss": 0.3852, + "step": 7468 + }, + { + "epoch": 0.8856871813115142, + "grad_norm": 1.4374488666137482, + "learning_rate": 4.534746583043615e-05, + "loss": 0.4818, + "step": 7469 + }, + { + "epoch": 0.8858057630736392, + "grad_norm": 1.3772340669162901, + "learning_rate": 4.534607111027318e-05, + "loss": 0.3932, + "step": 7470 + }, + { + "epoch": 0.8859243448357642, + "grad_norm": 1.1397633828786091, + "learning_rate": 4.5344676202544376e-05, + "loss": 0.2866, + "step": 7471 + }, + { + "epoch": 0.8860429265978892, + "grad_norm": 1.5595051087781746, + "learning_rate": 4.534328110726262e-05, + "loss": 0.5676, + "step": 7472 + }, + { + "epoch": 0.8861615083600143, + "grad_norm": 1.2735092668574903, + "learning_rate": 4.534188582444075e-05, + "loss": 0.3575, + "step": 7473 + }, + { + "epoch": 0.8862800901221393, + "grad_norm": 1.3024464942242433, + "learning_rate": 4.534049035409164e-05, + "loss": 0.4319, + "step": 7474 + }, + { + "epoch": 0.8863986718842642, + "grad_norm": 1.635675784725513, + "learning_rate": 4.533909469622815e-05, + "loss": 0.458, + "step": 7475 + }, + { + "epoch": 0.8865172536463892, + "grad_norm": 1.2411939669213774, + "learning_rate": 4.533769885086315e-05, + "loss": 0.3037, + "step": 7476 + }, + { + "epoch": 0.8866358354085142, + "grad_norm": 1.161875858115989, + "learning_rate": 4.53363028180095e-05, + "loss": 0.3842, + "step": 7477 + }, + { + "epoch": 0.8867544171706392, + "grad_norm": 1.626727765953996, + "learning_rate": 4.533490659768008e-05, + "loss": 0.6438, + "step": 7478 + }, + { + "epoch": 0.8868729989327642, + "grad_norm": 1.237703173633034, + "learning_rate": 4.533351018988776e-05, + "loss": 0.3479, + "step": 7479 + }, + { + "epoch": 0.8869915806948891, + "grad_norm": 1.4961251329445258, + "learning_rate": 4.533211359464541e-05, + "loss": 0.352, + "step": 7480 + }, + { + "epoch": 0.8871101624570141, + "grad_norm": 1.5143411420458186, + "learning_rate": 4.5330716811965905e-05, + "loss": 0.4711, + "step": 7481 + }, + { + "epoch": 0.8872287442191391, + "grad_norm": 1.4223487328311617, + "learning_rate": 4.532931984186212e-05, + "loss": 0.421, + "step": 7482 + }, + { + "epoch": 0.8873473259812641, + "grad_norm": 1.574922957186531, + "learning_rate": 4.532792268434694e-05, + "loss": 0.4992, + "step": 7483 + }, + { + "epoch": 0.8874659077433891, + "grad_norm": 1.8334756847513334, + "learning_rate": 4.5326525339433236e-05, + "loss": 0.5734, + "step": 7484 + }, + { + "epoch": 0.887584489505514, + "grad_norm": 1.5073881985684319, + "learning_rate": 4.53251278071339e-05, + "loss": 0.3589, + "step": 7485 + }, + { + "epoch": 0.887703071267639, + "grad_norm": 1.2309674932839478, + "learning_rate": 4.53237300874618e-05, + "loss": 0.4563, + "step": 7486 + }, + { + "epoch": 0.887821653029764, + "grad_norm": 1.3431965679090963, + "learning_rate": 4.532233218042984e-05, + "loss": 0.2716, + "step": 7487 + }, + { + "epoch": 0.887940234791889, + "grad_norm": 1.658740503806538, + "learning_rate": 4.53209340860509e-05, + "loss": 0.4523, + "step": 7488 + }, + { + "epoch": 0.888058816554014, + "grad_norm": 1.5982759811054892, + "learning_rate": 4.5319535804337866e-05, + "loss": 0.5507, + "step": 7489 + }, + { + "epoch": 0.888177398316139, + "grad_norm": 1.1605884425655701, + "learning_rate": 4.531813733530362e-05, + "loss": 0.377, + "step": 7490 + }, + { + "epoch": 0.8882959800782639, + "grad_norm": 1.4024739922754021, + "learning_rate": 4.531673867896107e-05, + "loss": 0.4278, + "step": 7491 + }, + { + "epoch": 0.8884145618403889, + "grad_norm": 1.3233299472995177, + "learning_rate": 4.53153398353231e-05, + "loss": 0.3907, + "step": 7492 + }, + { + "epoch": 0.8885331436025139, + "grad_norm": 1.3560703556260767, + "learning_rate": 4.5313940804402615e-05, + "loss": 0.2931, + "step": 7493 + }, + { + "epoch": 0.8886517253646389, + "grad_norm": 1.1687818495918931, + "learning_rate": 4.5312541586212505e-05, + "loss": 0.3532, + "step": 7494 + }, + { + "epoch": 0.8887703071267639, + "grad_norm": 1.116227499762771, + "learning_rate": 4.531114218076567e-05, + "loss": 0.2988, + "step": 7495 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 1.6347523654235325, + "learning_rate": 4.530974258807501e-05, + "loss": 0.4236, + "step": 7496 + }, + { + "epoch": 0.8890074706510138, + "grad_norm": 1.6800674514267606, + "learning_rate": 4.5308342808153426e-05, + "loss": 0.5833, + "step": 7497 + }, + { + "epoch": 0.8891260524131389, + "grad_norm": 1.1475240099820048, + "learning_rate": 4.530694284101383e-05, + "loss": 0.291, + "step": 7498 + }, + { + "epoch": 0.8892446341752639, + "grad_norm": 1.319536894339529, + "learning_rate": 4.530554268666912e-05, + "loss": 0.3983, + "step": 7499 + }, + { + "epoch": 0.8893632159373889, + "grad_norm": 1.467827202603896, + "learning_rate": 4.530414234513221e-05, + "loss": 0.4666, + "step": 7500 + }, + { + "epoch": 0.8894817976995139, + "grad_norm": 1.2655876163217412, + "learning_rate": 4.530274181641601e-05, + "loss": 0.2855, + "step": 7501 + }, + { + "epoch": 0.8896003794616388, + "grad_norm": 1.2835862033966863, + "learning_rate": 4.530134110053341e-05, + "loss": 0.3567, + "step": 7502 + }, + { + "epoch": 0.8897189612237638, + "grad_norm": 1.654656670846327, + "learning_rate": 4.529994019749735e-05, + "loss": 0.5484, + "step": 7503 + }, + { + "epoch": 0.8898375429858888, + "grad_norm": 1.0611448647848152, + "learning_rate": 4.529853910732074e-05, + "loss": 0.3054, + "step": 7504 + }, + { + "epoch": 0.8899561247480138, + "grad_norm": 1.6389657966975257, + "learning_rate": 4.529713783001649e-05, + "loss": 0.5649, + "step": 7505 + }, + { + "epoch": 0.8900747065101388, + "grad_norm": 1.271627396448637, + "learning_rate": 4.529573636559752e-05, + "loss": 0.3167, + "step": 7506 + }, + { + "epoch": 0.8901932882722637, + "grad_norm": 1.340538347093671, + "learning_rate": 4.529433471407674e-05, + "loss": 0.345, + "step": 7507 + }, + { + "epoch": 0.8903118700343887, + "grad_norm": 1.4105896354868457, + "learning_rate": 4.5292932875467087e-05, + "loss": 0.4468, + "step": 7508 + }, + { + "epoch": 0.8904304517965137, + "grad_norm": 1.4869243922911697, + "learning_rate": 4.5291530849781475e-05, + "loss": 0.4622, + "step": 7509 + }, + { + "epoch": 0.8905490335586387, + "grad_norm": 1.6814625459797656, + "learning_rate": 4.529012863703284e-05, + "loss": 0.6405, + "step": 7510 + }, + { + "epoch": 0.8906676153207637, + "grad_norm": 1.2751004639507215, + "learning_rate": 4.528872623723409e-05, + "loss": 0.4186, + "step": 7511 + }, + { + "epoch": 0.8907861970828886, + "grad_norm": 1.233910030843671, + "learning_rate": 4.5287323650398164e-05, + "loss": 0.3098, + "step": 7512 + }, + { + "epoch": 0.8909047788450136, + "grad_norm": 1.2005533935268675, + "learning_rate": 4.5285920876538e-05, + "loss": 0.37, + "step": 7513 + }, + { + "epoch": 0.8910233606071386, + "grad_norm": 1.575228780921675, + "learning_rate": 4.528451791566651e-05, + "loss": 0.3426, + "step": 7514 + }, + { + "epoch": 0.8911419423692636, + "grad_norm": 1.023563386327721, + "learning_rate": 4.528311476779665e-05, + "loss": 0.3365, + "step": 7515 + }, + { + "epoch": 0.8912605241313886, + "grad_norm": 1.0517907720678317, + "learning_rate": 4.528171143294134e-05, + "loss": 0.3054, + "step": 7516 + }, + { + "epoch": 0.8913791058935135, + "grad_norm": 1.1236773799483701, + "learning_rate": 4.528030791111353e-05, + "loss": 0.3884, + "step": 7517 + }, + { + "epoch": 0.8914976876556385, + "grad_norm": 1.146417296845729, + "learning_rate": 4.527890420232614e-05, + "loss": 0.3212, + "step": 7518 + }, + { + "epoch": 0.8916162694177635, + "grad_norm": 1.6294921046640385, + "learning_rate": 4.527750030659212e-05, + "loss": 0.5284, + "step": 7519 + }, + { + "epoch": 0.8917348511798885, + "grad_norm": 1.4007261924705083, + "learning_rate": 4.527609622392442e-05, + "loss": 0.3842, + "step": 7520 + }, + { + "epoch": 0.8918534329420135, + "grad_norm": 1.4826411130064627, + "learning_rate": 4.527469195433598e-05, + "loss": 0.4624, + "step": 7521 + }, + { + "epoch": 0.8919720147041384, + "grad_norm": 1.453207265717605, + "learning_rate": 4.5273287497839735e-05, + "loss": 0.4269, + "step": 7522 + }, + { + "epoch": 0.8920905964662635, + "grad_norm": 1.1749695273630563, + "learning_rate": 4.527188285444864e-05, + "loss": 0.3403, + "step": 7523 + }, + { + "epoch": 0.8922091782283885, + "grad_norm": 1.207116221028276, + "learning_rate": 4.527047802417564e-05, + "loss": 0.2872, + "step": 7524 + }, + { + "epoch": 0.8923277599905135, + "grad_norm": 1.1727978791380111, + "learning_rate": 4.5269073007033703e-05, + "loss": 0.3454, + "step": 7525 + }, + { + "epoch": 0.8924463417526385, + "grad_norm": 1.2077134239271858, + "learning_rate": 4.526766780303576e-05, + "loss": 0.3354, + "step": 7526 + }, + { + "epoch": 0.8925649235147635, + "grad_norm": 1.574055736474613, + "learning_rate": 4.5266262412194774e-05, + "loss": 0.2895, + "step": 7527 + }, + { + "epoch": 0.8926835052768884, + "grad_norm": 1.2618052450169956, + "learning_rate": 4.52648568345237e-05, + "loss": 0.3141, + "step": 7528 + }, + { + "epoch": 0.8928020870390134, + "grad_norm": 1.1666752797445716, + "learning_rate": 4.52634510700355e-05, + "loss": 0.3105, + "step": 7529 + }, + { + "epoch": 0.8929206688011384, + "grad_norm": 1.480077403906003, + "learning_rate": 4.526204511874313e-05, + "loss": 0.4035, + "step": 7530 + }, + { + "epoch": 0.8930392505632634, + "grad_norm": 1.2362963662189608, + "learning_rate": 4.5260638980659546e-05, + "loss": 0.3356, + "step": 7531 + }, + { + "epoch": 0.8931578323253884, + "grad_norm": 1.129740865355936, + "learning_rate": 4.5259232655797725e-05, + "loss": 0.3093, + "step": 7532 + }, + { + "epoch": 0.8932764140875133, + "grad_norm": 1.3157891275921105, + "learning_rate": 4.5257826144170615e-05, + "loss": 0.2934, + "step": 7533 + }, + { + "epoch": 0.8933949958496383, + "grad_norm": 1.6843390716777602, + "learning_rate": 4.5256419445791185e-05, + "loss": 0.457, + "step": 7534 + }, + { + "epoch": 0.8935135776117633, + "grad_norm": 1.1981148226494402, + "learning_rate": 4.525501256067242e-05, + "loss": 0.3601, + "step": 7535 + }, + { + "epoch": 0.8936321593738883, + "grad_norm": 1.9627346791174598, + "learning_rate": 4.5253605488827264e-05, + "loss": 0.5465, + "step": 7536 + }, + { + "epoch": 0.8937507411360133, + "grad_norm": 1.3178186334718536, + "learning_rate": 4.5252198230268713e-05, + "loss": 0.3977, + "step": 7537 + }, + { + "epoch": 0.8938693228981383, + "grad_norm": 1.481868833752932, + "learning_rate": 4.525079078500972e-05, + "loss": 0.4507, + "step": 7538 + }, + { + "epoch": 0.8939879046602632, + "grad_norm": 0.9290292170322662, + "learning_rate": 4.5249383153063286e-05, + "loss": 0.2478, + "step": 7539 + }, + { + "epoch": 0.8941064864223882, + "grad_norm": 1.413300412541907, + "learning_rate": 4.5247975334442356e-05, + "loss": 0.3135, + "step": 7540 + }, + { + "epoch": 0.8942250681845132, + "grad_norm": 1.4785063813656771, + "learning_rate": 4.524656732915993e-05, + "loss": 0.3627, + "step": 7541 + }, + { + "epoch": 0.8943436499466382, + "grad_norm": 1.3926038791625226, + "learning_rate": 4.524515913722897e-05, + "loss": 0.3796, + "step": 7542 + }, + { + "epoch": 0.8944622317087632, + "grad_norm": 1.5291764411367228, + "learning_rate": 4.524375075866248e-05, + "loss": 0.4498, + "step": 7543 + }, + { + "epoch": 0.8945808134708881, + "grad_norm": 1.3630393563145187, + "learning_rate": 4.524234219347343e-05, + "loss": 0.3841, + "step": 7544 + }, + { + "epoch": 0.8946993952330131, + "grad_norm": 1.3361139917799805, + "learning_rate": 4.5240933441674806e-05, + "loss": 0.3533, + "step": 7545 + }, + { + "epoch": 0.8948179769951381, + "grad_norm": 1.3442169705210165, + "learning_rate": 4.523952450327959e-05, + "loss": 0.4053, + "step": 7546 + }, + { + "epoch": 0.8949365587572631, + "grad_norm": 1.367150186718433, + "learning_rate": 4.5238115378300795e-05, + "loss": 0.3624, + "step": 7547 + }, + { + "epoch": 0.8950551405193882, + "grad_norm": 1.4583397445951978, + "learning_rate": 4.5236706066751377e-05, + "loss": 0.5091, + "step": 7548 + }, + { + "epoch": 0.8951737222815132, + "grad_norm": 1.4740976024399954, + "learning_rate": 4.523529656864436e-05, + "loss": 0.3879, + "step": 7549 + }, + { + "epoch": 0.8952923040436381, + "grad_norm": 1.0557585225640393, + "learning_rate": 4.523388688399272e-05, + "loss": 0.2845, + "step": 7550 + }, + { + "epoch": 0.8954108858057631, + "grad_norm": 1.2039208162487118, + "learning_rate": 4.5232477012809444e-05, + "loss": 0.3087, + "step": 7551 + }, + { + "epoch": 0.8955294675678881, + "grad_norm": 1.1957511329251593, + "learning_rate": 4.523106695510755e-05, + "loss": 0.3128, + "step": 7552 + }, + { + "epoch": 0.8956480493300131, + "grad_norm": 1.197687010172547, + "learning_rate": 4.522965671090002e-05, + "loss": 0.4216, + "step": 7553 + }, + { + "epoch": 0.895766631092138, + "grad_norm": 1.029743689777649, + "learning_rate": 4.522824628019986e-05, + "loss": 0.3026, + "step": 7554 + }, + { + "epoch": 0.895885212854263, + "grad_norm": 1.364639951730422, + "learning_rate": 4.522683566302008e-05, + "loss": 0.3192, + "step": 7555 + }, + { + "epoch": 0.896003794616388, + "grad_norm": 1.3058909251919892, + "learning_rate": 4.522542485937369e-05, + "loss": 0.3117, + "step": 7556 + }, + { + "epoch": 0.896122376378513, + "grad_norm": 1.598257496011694, + "learning_rate": 4.5224013869273676e-05, + "loss": 0.4408, + "step": 7557 + }, + { + "epoch": 0.896240958140638, + "grad_norm": 1.6348324884773038, + "learning_rate": 4.522260269273305e-05, + "loss": 0.3923, + "step": 7558 + }, + { + "epoch": 0.896359539902763, + "grad_norm": 1.401434591464561, + "learning_rate": 4.5221191329764836e-05, + "loss": 0.4256, + "step": 7559 + }, + { + "epoch": 0.8964781216648879, + "grad_norm": 1.4165761747850558, + "learning_rate": 4.521977978038202e-05, + "loss": 0.358, + "step": 7560 + }, + { + "epoch": 0.8965967034270129, + "grad_norm": 1.5190554216741796, + "learning_rate": 4.521836804459764e-05, + "loss": 0.4619, + "step": 7561 + }, + { + "epoch": 0.8967152851891379, + "grad_norm": 1.368581521688346, + "learning_rate": 4.5216956122424705e-05, + "loss": 0.4075, + "step": 7562 + }, + { + "epoch": 0.8968338669512629, + "grad_norm": 1.1085603365538104, + "learning_rate": 4.521554401387621e-05, + "loss": 0.3227, + "step": 7563 + }, + { + "epoch": 0.8969524487133879, + "grad_norm": 1.2513947550875535, + "learning_rate": 4.52141317189652e-05, + "loss": 0.244, + "step": 7564 + }, + { + "epoch": 0.8970710304755128, + "grad_norm": 1.6458696727327704, + "learning_rate": 4.521271923770468e-05, + "loss": 0.4539, + "step": 7565 + }, + { + "epoch": 0.8971896122376378, + "grad_norm": 1.4217868752516982, + "learning_rate": 4.5211306570107683e-05, + "loss": 0.4374, + "step": 7566 + }, + { + "epoch": 0.8973081939997628, + "grad_norm": 1.311860388229482, + "learning_rate": 4.520989371618722e-05, + "loss": 0.3176, + "step": 7567 + }, + { + "epoch": 0.8974267757618878, + "grad_norm": 1.4163368181148714, + "learning_rate": 4.520848067595632e-05, + "loss": 0.5609, + "step": 7568 + }, + { + "epoch": 0.8975453575240128, + "grad_norm": 1.0460203776654151, + "learning_rate": 4.5207067449428004e-05, + "loss": 0.2779, + "step": 7569 + }, + { + "epoch": 0.8976639392861377, + "grad_norm": 2.1344436901762385, + "learning_rate": 4.520565403661531e-05, + "loss": 0.6181, + "step": 7570 + }, + { + "epoch": 0.8977825210482627, + "grad_norm": 1.2701913697384388, + "learning_rate": 4.520424043753126e-05, + "loss": 0.3862, + "step": 7571 + }, + { + "epoch": 0.8979011028103877, + "grad_norm": 1.4259866855576457, + "learning_rate": 4.520282665218889e-05, + "loss": 0.3782, + "step": 7572 + }, + { + "epoch": 0.8980196845725128, + "grad_norm": 1.7200997754684433, + "learning_rate": 4.520141268060124e-05, + "loss": 0.608, + "step": 7573 + }, + { + "epoch": 0.8981382663346378, + "grad_norm": 1.1349932982948479, + "learning_rate": 4.519999852278133e-05, + "loss": 0.3569, + "step": 7574 + }, + { + "epoch": 0.8982568480967628, + "grad_norm": 1.4156876254977044, + "learning_rate": 4.519858417874221e-05, + "loss": 0.4542, + "step": 7575 + }, + { + "epoch": 0.8983754298588877, + "grad_norm": 1.0241545326244827, + "learning_rate": 4.5197169648496905e-05, + "loss": 0.256, + "step": 7576 + }, + { + "epoch": 0.8984940116210127, + "grad_norm": 1.2722574831569817, + "learning_rate": 4.519575493205847e-05, + "loss": 0.4176, + "step": 7577 + }, + { + "epoch": 0.8986125933831377, + "grad_norm": 1.2569880195476344, + "learning_rate": 4.519434002943994e-05, + "loss": 0.347, + "step": 7578 + }, + { + "epoch": 0.8987311751452627, + "grad_norm": 1.3754892970874861, + "learning_rate": 4.519292494065436e-05, + "loss": 0.4595, + "step": 7579 + }, + { + "epoch": 0.8988497569073877, + "grad_norm": 1.2149123111509081, + "learning_rate": 4.519150966571477e-05, + "loss": 0.3042, + "step": 7580 + }, + { + "epoch": 0.8989683386695126, + "grad_norm": 1.2666562658212768, + "learning_rate": 4.5190094204634225e-05, + "loss": 0.3809, + "step": 7581 + }, + { + "epoch": 0.8990869204316376, + "grad_norm": 1.314368838388571, + "learning_rate": 4.5188678557425765e-05, + "loss": 0.3581, + "step": 7582 + }, + { + "epoch": 0.8992055021937626, + "grad_norm": 1.123515797024075, + "learning_rate": 4.5187262724102455e-05, + "loss": 0.2839, + "step": 7583 + }, + { + "epoch": 0.8993240839558876, + "grad_norm": 1.2574596153794537, + "learning_rate": 4.518584670467733e-05, + "loss": 0.4264, + "step": 7584 + }, + { + "epoch": 0.8994426657180126, + "grad_norm": 1.4240014958594773, + "learning_rate": 4.518443049916346e-05, + "loss": 0.4715, + "step": 7585 + }, + { + "epoch": 0.8995612474801375, + "grad_norm": 1.3102892472566667, + "learning_rate": 4.518301410757388e-05, + "loss": 0.3989, + "step": 7586 + }, + { + "epoch": 0.8996798292422625, + "grad_norm": 1.655841783384617, + "learning_rate": 4.518159752992166e-05, + "loss": 0.5198, + "step": 7587 + }, + { + "epoch": 0.8997984110043875, + "grad_norm": 1.5988325287391265, + "learning_rate": 4.518018076621987e-05, + "loss": 0.4547, + "step": 7588 + }, + { + "epoch": 0.8999169927665125, + "grad_norm": 1.441588761446772, + "learning_rate": 4.5178763816481554e-05, + "loss": 0.3601, + "step": 7589 + }, + { + "epoch": 0.9000355745286375, + "grad_norm": 1.162418093704123, + "learning_rate": 4.517734668071979e-05, + "loss": 0.3459, + "step": 7590 + }, + { + "epoch": 0.9001541562907625, + "grad_norm": 1.526947218950448, + "learning_rate": 4.517592935894762e-05, + "loss": 0.4758, + "step": 7591 + }, + { + "epoch": 0.9002727380528874, + "grad_norm": 1.4384887434471367, + "learning_rate": 4.5174511851178126e-05, + "loss": 0.3656, + "step": 7592 + }, + { + "epoch": 0.9003913198150124, + "grad_norm": 1.4003422015973448, + "learning_rate": 4.517309415742437e-05, + "loss": 0.3354, + "step": 7593 + }, + { + "epoch": 0.9005099015771374, + "grad_norm": 1.6139938693952318, + "learning_rate": 4.517167627769943e-05, + "loss": 0.4804, + "step": 7594 + }, + { + "epoch": 0.9006284833392624, + "grad_norm": 1.833876686615319, + "learning_rate": 4.517025821201637e-05, + "loss": 0.5674, + "step": 7595 + }, + { + "epoch": 0.9007470651013874, + "grad_norm": 1.1505440490053005, + "learning_rate": 4.5168839960388257e-05, + "loss": 0.2948, + "step": 7596 + }, + { + "epoch": 0.9008656468635123, + "grad_norm": 1.2744372883229853, + "learning_rate": 4.516742152282818e-05, + "loss": 0.3201, + "step": 7597 + }, + { + "epoch": 0.9009842286256374, + "grad_norm": 1.3577729157388, + "learning_rate": 4.5166002899349205e-05, + "loss": 0.4334, + "step": 7598 + }, + { + "epoch": 0.9011028103877624, + "grad_norm": 1.1139368066947266, + "learning_rate": 4.5164584089964414e-05, + "loss": 0.3076, + "step": 7599 + }, + { + "epoch": 0.9012213921498874, + "grad_norm": 0.9938339029374205, + "learning_rate": 4.516316509468688e-05, + "loss": 0.2953, + "step": 7600 + }, + { + "epoch": 0.9013399739120124, + "grad_norm": 1.7276343683208693, + "learning_rate": 4.51617459135297e-05, + "loss": 0.5405, + "step": 7601 + }, + { + "epoch": 0.9014585556741374, + "grad_norm": 1.362089511712901, + "learning_rate": 4.516032654650593e-05, + "loss": 0.3423, + "step": 7602 + }, + { + "epoch": 0.9015771374362623, + "grad_norm": 1.1769861374575195, + "learning_rate": 4.515890699362868e-05, + "loss": 0.3706, + "step": 7603 + }, + { + "epoch": 0.9016957191983873, + "grad_norm": 1.8069746495561436, + "learning_rate": 4.515748725491103e-05, + "loss": 0.746, + "step": 7604 + }, + { + "epoch": 0.9018143009605123, + "grad_norm": 1.5217458235243717, + "learning_rate": 4.515606733036607e-05, + "loss": 0.4631, + "step": 7605 + }, + { + "epoch": 0.9019328827226373, + "grad_norm": 1.361591502477298, + "learning_rate": 4.515464722000687e-05, + "loss": 0.3687, + "step": 7606 + }, + { + "epoch": 0.9020514644847623, + "grad_norm": 1.6113232327837583, + "learning_rate": 4.515322692384656e-05, + "loss": 0.4565, + "step": 7607 + }, + { + "epoch": 0.9021700462468872, + "grad_norm": 1.4923675269265537, + "learning_rate": 4.51518064418982e-05, + "loss": 0.4211, + "step": 7608 + }, + { + "epoch": 0.9022886280090122, + "grad_norm": 1.371461780432518, + "learning_rate": 4.51503857741749e-05, + "loss": 0.3482, + "step": 7609 + }, + { + "epoch": 0.9024072097711372, + "grad_norm": 1.5277033450761366, + "learning_rate": 4.514896492068974e-05, + "loss": 0.4395, + "step": 7610 + }, + { + "epoch": 0.9025257915332622, + "grad_norm": 1.6352280182067422, + "learning_rate": 4.514754388145584e-05, + "loss": 0.4457, + "step": 7611 + }, + { + "epoch": 0.9026443732953872, + "grad_norm": 1.0569352654481983, + "learning_rate": 4.51461226564863e-05, + "loss": 0.3433, + "step": 7612 + }, + { + "epoch": 0.9027629550575121, + "grad_norm": 1.240984361371545, + "learning_rate": 4.514470124579421e-05, + "loss": 0.3419, + "step": 7613 + }, + { + "epoch": 0.9028815368196371, + "grad_norm": 1.156852058944384, + "learning_rate": 4.5143279649392664e-05, + "loss": 0.2841, + "step": 7614 + }, + { + "epoch": 0.9030001185817621, + "grad_norm": 1.4283757661143701, + "learning_rate": 4.51418578672948e-05, + "loss": 0.4653, + "step": 7615 + }, + { + "epoch": 0.9031187003438871, + "grad_norm": 1.2757610090235179, + "learning_rate": 4.51404358995137e-05, + "loss": 0.3994, + "step": 7616 + }, + { + "epoch": 0.9032372821060121, + "grad_norm": 1.0841713201786027, + "learning_rate": 4.513901374606248e-05, + "loss": 0.3764, + "step": 7617 + }, + { + "epoch": 0.903355863868137, + "grad_norm": 1.9116776328309448, + "learning_rate": 4.513759140695424e-05, + "loss": 0.5992, + "step": 7618 + }, + { + "epoch": 0.903474445630262, + "grad_norm": 1.0820298334938925, + "learning_rate": 4.513616888220211e-05, + "loss": 0.2809, + "step": 7619 + }, + { + "epoch": 0.903593027392387, + "grad_norm": 1.3262955824543454, + "learning_rate": 4.513474617181919e-05, + "loss": 0.4512, + "step": 7620 + }, + { + "epoch": 0.903711609154512, + "grad_norm": 1.7892672199925554, + "learning_rate": 4.513332327581861e-05, + "loss": 0.583, + "step": 7621 + }, + { + "epoch": 0.903830190916637, + "grad_norm": 1.6235759935535257, + "learning_rate": 4.5131900194213475e-05, + "loss": 0.4629, + "step": 7622 + }, + { + "epoch": 0.9039487726787621, + "grad_norm": 1.5072165077648865, + "learning_rate": 4.51304769270169e-05, + "loss": 0.4914, + "step": 7623 + }, + { + "epoch": 0.904067354440887, + "grad_norm": 1.0347153279879222, + "learning_rate": 4.5129053474242024e-05, + "loss": 0.2785, + "step": 7624 + }, + { + "epoch": 0.904185936203012, + "grad_norm": 1.5113352185737285, + "learning_rate": 4.512762983590195e-05, + "loss": 0.3312, + "step": 7625 + }, + { + "epoch": 0.904304517965137, + "grad_norm": 1.1976578552888544, + "learning_rate": 4.5126206012009814e-05, + "loss": 0.3467, + "step": 7626 + }, + { + "epoch": 0.904423099727262, + "grad_norm": 1.0677171228276967, + "learning_rate": 4.512478200257874e-05, + "loss": 0.2718, + "step": 7627 + }, + { + "epoch": 0.904541681489387, + "grad_norm": 1.3467072116902425, + "learning_rate": 4.512335780762186e-05, + "loss": 0.3378, + "step": 7628 + }, + { + "epoch": 0.9046602632515119, + "grad_norm": 1.4685889967389898, + "learning_rate": 4.512193342715229e-05, + "loss": 0.3656, + "step": 7629 + }, + { + "epoch": 0.9047788450136369, + "grad_norm": 1.7047580839614918, + "learning_rate": 4.512050886118317e-05, + "loss": 0.385, + "step": 7630 + }, + { + "epoch": 0.9048974267757619, + "grad_norm": 1.1407480669824916, + "learning_rate": 4.5119084109727625e-05, + "loss": 0.3554, + "step": 7631 + }, + { + "epoch": 0.9050160085378869, + "grad_norm": 1.2689037438633692, + "learning_rate": 4.511765917279881e-05, + "loss": 0.3311, + "step": 7632 + }, + { + "epoch": 0.9051345903000119, + "grad_norm": 1.1727043063497256, + "learning_rate": 4.5116234050409844e-05, + "loss": 0.3193, + "step": 7633 + }, + { + "epoch": 0.9052531720621368, + "grad_norm": 1.0472693290287487, + "learning_rate": 4.511480874257387e-05, + "loss": 0.3126, + "step": 7634 + }, + { + "epoch": 0.9053717538242618, + "grad_norm": 1.189323386561484, + "learning_rate": 4.511338324930402e-05, + "loss": 0.2846, + "step": 7635 + }, + { + "epoch": 0.9054903355863868, + "grad_norm": 1.3210375123031637, + "learning_rate": 4.511195757061344e-05, + "loss": 0.3441, + "step": 7636 + }, + { + "epoch": 0.9056089173485118, + "grad_norm": 1.3587720228212135, + "learning_rate": 4.5110531706515276e-05, + "loss": 0.3931, + "step": 7637 + }, + { + "epoch": 0.9057274991106368, + "grad_norm": 1.436033234600989, + "learning_rate": 4.510910565702267e-05, + "loss": 0.4052, + "step": 7638 + }, + { + "epoch": 0.9058460808727617, + "grad_norm": 0.9418791559367704, + "learning_rate": 4.510767942214878e-05, + "loss": 0.2322, + "step": 7639 + }, + { + "epoch": 0.9059646626348867, + "grad_norm": 1.972050158895936, + "learning_rate": 4.510625300190673e-05, + "loss": 0.5868, + "step": 7640 + }, + { + "epoch": 0.9060832443970117, + "grad_norm": 1.1638788297177294, + "learning_rate": 4.510482639630969e-05, + "loss": 0.2705, + "step": 7641 + }, + { + "epoch": 0.9062018261591367, + "grad_norm": 1.0694999404555772, + "learning_rate": 4.510339960537079e-05, + "loss": 0.2388, + "step": 7642 + }, + { + "epoch": 0.9063204079212617, + "grad_norm": 1.615111097332677, + "learning_rate": 4.5101972629103214e-05, + "loss": 0.3293, + "step": 7643 + }, + { + "epoch": 0.9064389896833867, + "grad_norm": 1.2812729394065092, + "learning_rate": 4.5100545467520095e-05, + "loss": 0.3697, + "step": 7644 + }, + { + "epoch": 0.9065575714455116, + "grad_norm": 1.4810629737856582, + "learning_rate": 4.50991181206346e-05, + "loss": 0.422, + "step": 7645 + }, + { + "epoch": 0.9066761532076366, + "grad_norm": 1.380906431966548, + "learning_rate": 4.509769058845988e-05, + "loss": 0.3633, + "step": 7646 + }, + { + "epoch": 0.9067947349697616, + "grad_norm": 1.0861355708242368, + "learning_rate": 4.5096262871009096e-05, + "loss": 0.2496, + "step": 7647 + }, + { + "epoch": 0.9069133167318867, + "grad_norm": 1.0844665747632678, + "learning_rate": 4.509483496829542e-05, + "loss": 0.2956, + "step": 7648 + }, + { + "epoch": 0.9070318984940117, + "grad_norm": 1.304548659951187, + "learning_rate": 4.5093406880332e-05, + "loss": 0.3878, + "step": 7649 + }, + { + "epoch": 0.9071504802561366, + "grad_norm": 1.4042248641449977, + "learning_rate": 4.5091978607132015e-05, + "loss": 0.354, + "step": 7650 + }, + { + "epoch": 0.9072690620182616, + "grad_norm": 1.2627285659242993, + "learning_rate": 4.509055014870862e-05, + "loss": 0.3794, + "step": 7651 + }, + { + "epoch": 0.9073876437803866, + "grad_norm": 1.1345841769064382, + "learning_rate": 4.508912150507499e-05, + "loss": 0.3156, + "step": 7652 + }, + { + "epoch": 0.9075062255425116, + "grad_norm": 1.3792118275635625, + "learning_rate": 4.5087692676244295e-05, + "loss": 0.3413, + "step": 7653 + }, + { + "epoch": 0.9076248073046366, + "grad_norm": 1.1826370230088494, + "learning_rate": 4.508626366222971e-05, + "loss": 0.3494, + "step": 7654 + }, + { + "epoch": 0.9077433890667616, + "grad_norm": 1.018137837716447, + "learning_rate": 4.50848344630444e-05, + "loss": 0.3236, + "step": 7655 + }, + { + "epoch": 0.9078619708288865, + "grad_norm": 2.1759517445493075, + "learning_rate": 4.5083405078701554e-05, + "loss": 0.5963, + "step": 7656 + }, + { + "epoch": 0.9079805525910115, + "grad_norm": 1.6062260201587681, + "learning_rate": 4.508197550921434e-05, + "loss": 0.5152, + "step": 7657 + }, + { + "epoch": 0.9080991343531365, + "grad_norm": 0.8331092582519394, + "learning_rate": 4.508054575459593e-05, + "loss": 0.2494, + "step": 7658 + }, + { + "epoch": 0.9082177161152615, + "grad_norm": 1.1139760386927537, + "learning_rate": 4.507911581485952e-05, + "loss": 0.331, + "step": 7659 + }, + { + "epoch": 0.9083362978773865, + "grad_norm": 1.4758245264035799, + "learning_rate": 4.5077685690018276e-05, + "loss": 0.4066, + "step": 7660 + }, + { + "epoch": 0.9084548796395114, + "grad_norm": 1.4839626983062053, + "learning_rate": 4.50762553800854e-05, + "loss": 0.4746, + "step": 7661 + }, + { + "epoch": 0.9085734614016364, + "grad_norm": 1.517629519944083, + "learning_rate": 4.507482488507406e-05, + "loss": 0.4439, + "step": 7662 + }, + { + "epoch": 0.9086920431637614, + "grad_norm": 1.2109338771160942, + "learning_rate": 4.507339420499746e-05, + "loss": 0.4219, + "step": 7663 + }, + { + "epoch": 0.9088106249258864, + "grad_norm": 1.4511374615506445, + "learning_rate": 4.5071963339868775e-05, + "loss": 0.4183, + "step": 7664 + }, + { + "epoch": 0.9089292066880114, + "grad_norm": 1.8907267873200995, + "learning_rate": 4.507053228970121e-05, + "loss": 0.5963, + "step": 7665 + }, + { + "epoch": 0.9090477884501363, + "grad_norm": 1.063339428668752, + "learning_rate": 4.506910105450795e-05, + "loss": 0.2566, + "step": 7666 + }, + { + "epoch": 0.9091663702122613, + "grad_norm": 1.635825715213325, + "learning_rate": 4.506766963430218e-05, + "loss": 0.5889, + "step": 7667 + }, + { + "epoch": 0.9092849519743863, + "grad_norm": 1.3002983081858677, + "learning_rate": 4.50662380290971e-05, + "loss": 0.2589, + "step": 7668 + }, + { + "epoch": 0.9094035337365113, + "grad_norm": 1.6255231142940179, + "learning_rate": 4.506480623890592e-05, + "loss": 0.4548, + "step": 7669 + }, + { + "epoch": 0.9095221154986363, + "grad_norm": 1.171122896763931, + "learning_rate": 4.506337426374183e-05, + "loss": 0.2956, + "step": 7670 + }, + { + "epoch": 0.9096406972607612, + "grad_norm": 1.0574921610417867, + "learning_rate": 4.506194210361804e-05, + "loss": 0.2886, + "step": 7671 + }, + { + "epoch": 0.9097592790228862, + "grad_norm": 0.8213130926893109, + "learning_rate": 4.506050975854773e-05, + "loss": 0.2092, + "step": 7672 + }, + { + "epoch": 0.9098778607850113, + "grad_norm": 1.1599921557383326, + "learning_rate": 4.505907722854413e-05, + "loss": 0.3549, + "step": 7673 + }, + { + "epoch": 0.9099964425471363, + "grad_norm": 1.3208738309051264, + "learning_rate": 4.505764451362044e-05, + "loss": 0.412, + "step": 7674 + }, + { + "epoch": 0.9101150243092613, + "grad_norm": 1.511083454075665, + "learning_rate": 4.5056211613789856e-05, + "loss": 0.5389, + "step": 7675 + }, + { + "epoch": 0.9102336060713863, + "grad_norm": 1.2081578218188465, + "learning_rate": 4.50547785290656e-05, + "loss": 0.3078, + "step": 7676 + }, + { + "epoch": 0.9103521878335112, + "grad_norm": 1.4144430412015212, + "learning_rate": 4.505334525946088e-05, + "loss": 0.3656, + "step": 7677 + }, + { + "epoch": 0.9104707695956362, + "grad_norm": 1.1437440407991664, + "learning_rate": 4.50519118049889e-05, + "loss": 0.3226, + "step": 7678 + }, + { + "epoch": 0.9105893513577612, + "grad_norm": 1.1660838375465763, + "learning_rate": 4.505047816566289e-05, + "loss": 0.3526, + "step": 7679 + }, + { + "epoch": 0.9107079331198862, + "grad_norm": 1.477119765731285, + "learning_rate": 4.504904434149606e-05, + "loss": 0.3818, + "step": 7680 + }, + { + "epoch": 0.9108265148820112, + "grad_norm": 1.2036628431667507, + "learning_rate": 4.5047610332501624e-05, + "loss": 0.2814, + "step": 7681 + }, + { + "epoch": 0.9109450966441361, + "grad_norm": 1.7013263901607723, + "learning_rate": 4.50461761386928e-05, + "loss": 0.4727, + "step": 7682 + }, + { + "epoch": 0.9110636784062611, + "grad_norm": 1.675130444889837, + "learning_rate": 4.504474176008282e-05, + "loss": 0.5428, + "step": 7683 + }, + { + "epoch": 0.9111822601683861, + "grad_norm": 1.4137985124790995, + "learning_rate": 4.5043307196684895e-05, + "loss": 0.4122, + "step": 7684 + }, + { + "epoch": 0.9113008419305111, + "grad_norm": 1.273262266576916, + "learning_rate": 4.504187244851226e-05, + "loss": 0.3498, + "step": 7685 + }, + { + "epoch": 0.9114194236926361, + "grad_norm": 1.5569199956430209, + "learning_rate": 4.504043751557814e-05, + "loss": 0.4072, + "step": 7686 + }, + { + "epoch": 0.911538005454761, + "grad_norm": 1.4150380293196185, + "learning_rate": 4.503900239789576e-05, + "loss": 0.4747, + "step": 7687 + }, + { + "epoch": 0.911656587216886, + "grad_norm": 1.38414872306566, + "learning_rate": 4.503756709547835e-05, + "loss": 0.3839, + "step": 7688 + }, + { + "epoch": 0.911775168979011, + "grad_norm": 1.3807314393300076, + "learning_rate": 4.503613160833915e-05, + "loss": 0.3979, + "step": 7689 + }, + { + "epoch": 0.911893750741136, + "grad_norm": 1.2061788482990152, + "learning_rate": 4.503469593649138e-05, + "loss": 0.3431, + "step": 7690 + }, + { + "epoch": 0.912012332503261, + "grad_norm": 0.9712296714000724, + "learning_rate": 4.503326007994828e-05, + "loss": 0.309, + "step": 7691 + }, + { + "epoch": 0.912130914265386, + "grad_norm": 1.3255974120177378, + "learning_rate": 4.503182403872309e-05, + "loss": 0.2816, + "step": 7692 + }, + { + "epoch": 0.9122494960275109, + "grad_norm": 1.7071403192633634, + "learning_rate": 4.503038781282905e-05, + "loss": 0.4749, + "step": 7693 + }, + { + "epoch": 0.9123680777896359, + "grad_norm": 1.3729855806599593, + "learning_rate": 4.50289514022794e-05, + "loss": 0.3956, + "step": 7694 + }, + { + "epoch": 0.9124866595517609, + "grad_norm": 1.2462613834121212, + "learning_rate": 4.502751480708737e-05, + "loss": 0.3871, + "step": 7695 + }, + { + "epoch": 0.9126052413138859, + "grad_norm": 1.138251638426353, + "learning_rate": 4.502607802726621e-05, + "loss": 0.3142, + "step": 7696 + }, + { + "epoch": 0.912723823076011, + "grad_norm": 1.2921820241406097, + "learning_rate": 4.5024641062829185e-05, + "loss": 0.3981, + "step": 7697 + }, + { + "epoch": 0.912842404838136, + "grad_norm": 0.9615263250883, + "learning_rate": 4.502320391378951e-05, + "loss": 0.2454, + "step": 7698 + }, + { + "epoch": 0.9129609866002609, + "grad_norm": 1.4042691672533647, + "learning_rate": 4.502176658016046e-05, + "loss": 0.5004, + "step": 7699 + }, + { + "epoch": 0.9130795683623859, + "grad_norm": 1.1200615216348662, + "learning_rate": 4.502032906195527e-05, + "loss": 0.337, + "step": 7700 + }, + { + "epoch": 0.9131981501245109, + "grad_norm": 1.4519937717845874, + "learning_rate": 4.501889135918719e-05, + "loss": 0.5086, + "step": 7701 + }, + { + "epoch": 0.9133167318866359, + "grad_norm": 1.6130620612462245, + "learning_rate": 4.5017453471869484e-05, + "loss": 0.4938, + "step": 7702 + }, + { + "epoch": 0.9134353136487608, + "grad_norm": 1.1965081450360677, + "learning_rate": 4.501601540001541e-05, + "loss": 0.3261, + "step": 7703 + }, + { + "epoch": 0.9135538954108858, + "grad_norm": 1.2206727168967226, + "learning_rate": 4.501457714363821e-05, + "loss": 0.3007, + "step": 7704 + }, + { + "epoch": 0.9136724771730108, + "grad_norm": 1.6039047338414572, + "learning_rate": 4.5013138702751166e-05, + "loss": 0.4393, + "step": 7705 + }, + { + "epoch": 0.9137910589351358, + "grad_norm": 2.2491317851873167, + "learning_rate": 4.5011700077367516e-05, + "loss": 0.7463, + "step": 7706 + }, + { + "epoch": 0.9139096406972608, + "grad_norm": 1.1874133827961217, + "learning_rate": 4.501026126750053e-05, + "loss": 0.3606, + "step": 7707 + }, + { + "epoch": 0.9140282224593858, + "grad_norm": 0.9512269570210055, + "learning_rate": 4.500882227316348e-05, + "loss": 0.2632, + "step": 7708 + }, + { + "epoch": 0.9141468042215107, + "grad_norm": 0.8199160742025952, + "learning_rate": 4.500738309436962e-05, + "loss": 0.2457, + "step": 7709 + }, + { + "epoch": 0.9142653859836357, + "grad_norm": 1.2683709105002199, + "learning_rate": 4.500594373113222e-05, + "loss": 0.3402, + "step": 7710 + }, + { + "epoch": 0.9143839677457607, + "grad_norm": 1.4816787440459755, + "learning_rate": 4.5004504183464555e-05, + "loss": 0.496, + "step": 7711 + }, + { + "epoch": 0.9145025495078857, + "grad_norm": 1.634744056547108, + "learning_rate": 4.50030644513799e-05, + "loss": 0.4356, + "step": 7712 + }, + { + "epoch": 0.9146211312700107, + "grad_norm": 1.174486586674016, + "learning_rate": 4.5001624534891506e-05, + "loss": 0.3455, + "step": 7713 + }, + { + "epoch": 0.9147397130321356, + "grad_norm": 1.213793091609422, + "learning_rate": 4.500018443401267e-05, + "loss": 0.3747, + "step": 7714 + }, + { + "epoch": 0.9148582947942606, + "grad_norm": 1.1225562729783105, + "learning_rate": 4.499874414875666e-05, + "loss": 0.3452, + "step": 7715 + }, + { + "epoch": 0.9149768765563856, + "grad_norm": 1.8597111459046882, + "learning_rate": 4.4997303679136745e-05, + "loss": 0.4706, + "step": 7716 + }, + { + "epoch": 0.9150954583185106, + "grad_norm": 1.985115860260531, + "learning_rate": 4.4995863025166216e-05, + "loss": 0.6212, + "step": 7717 + }, + { + "epoch": 0.9152140400806356, + "grad_norm": 1.2896826761593936, + "learning_rate": 4.499442218685835e-05, + "loss": 0.348, + "step": 7718 + }, + { + "epoch": 0.9153326218427605, + "grad_norm": 1.3813634717277044, + "learning_rate": 4.4992981164226437e-05, + "loss": 0.3686, + "step": 7719 + }, + { + "epoch": 0.9154512036048855, + "grad_norm": 1.4682222762545816, + "learning_rate": 4.499153995728374e-05, + "loss": 0.4991, + "step": 7720 + }, + { + "epoch": 0.9155697853670105, + "grad_norm": 1.0376009256948489, + "learning_rate": 4.499009856604358e-05, + "loss": 0.2197, + "step": 7721 + }, + { + "epoch": 0.9156883671291356, + "grad_norm": 1.3851420555517167, + "learning_rate": 4.4988656990519215e-05, + "loss": 0.506, + "step": 7722 + }, + { + "epoch": 0.9158069488912606, + "grad_norm": 1.1444727163929822, + "learning_rate": 4.4987215230723944e-05, + "loss": 0.3404, + "step": 7723 + }, + { + "epoch": 0.9159255306533856, + "grad_norm": 1.3359278138288624, + "learning_rate": 4.4985773286671055e-05, + "loss": 0.4172, + "step": 7724 + }, + { + "epoch": 0.9160441124155105, + "grad_norm": 1.274091100791459, + "learning_rate": 4.498433115837385e-05, + "loss": 0.2933, + "step": 7725 + }, + { + "epoch": 0.9161626941776355, + "grad_norm": 1.223343759689751, + "learning_rate": 4.498288884584562e-05, + "loss": 0.2683, + "step": 7726 + }, + { + "epoch": 0.9162812759397605, + "grad_norm": 1.106464979742115, + "learning_rate": 4.498144634909965e-05, + "loss": 0.3483, + "step": 7727 + }, + { + "epoch": 0.9163998577018855, + "grad_norm": 1.4926558516131125, + "learning_rate": 4.498000366814925e-05, + "loss": 0.3605, + "step": 7728 + }, + { + "epoch": 0.9165184394640105, + "grad_norm": 1.2760492343248497, + "learning_rate": 4.4978560803007726e-05, + "loss": 0.3403, + "step": 7729 + }, + { + "epoch": 0.9166370212261354, + "grad_norm": 1.6229488094191824, + "learning_rate": 4.4977117753688365e-05, + "loss": 0.5266, + "step": 7730 + }, + { + "epoch": 0.9167556029882604, + "grad_norm": 1.4070076177336808, + "learning_rate": 4.497567452020447e-05, + "loss": 0.4731, + "step": 7731 + }, + { + "epoch": 0.9168741847503854, + "grad_norm": 1.199092379035364, + "learning_rate": 4.497423110256936e-05, + "loss": 0.3266, + "step": 7732 + }, + { + "epoch": 0.9169927665125104, + "grad_norm": 1.7048799958656824, + "learning_rate": 4.497278750079633e-05, + "loss": 0.5039, + "step": 7733 + }, + { + "epoch": 0.9171113482746354, + "grad_norm": 1.6579895101685898, + "learning_rate": 4.497134371489869e-05, + "loss": 0.509, + "step": 7734 + }, + { + "epoch": 0.9172299300367603, + "grad_norm": 1.4785513169855502, + "learning_rate": 4.4969899744889754e-05, + "loss": 0.3045, + "step": 7735 + }, + { + "epoch": 0.9173485117988853, + "grad_norm": 1.2631043574427585, + "learning_rate": 4.496845559078283e-05, + "loss": 0.3695, + "step": 7736 + }, + { + "epoch": 0.9174670935610103, + "grad_norm": 0.9002564889910122, + "learning_rate": 4.496701125259124e-05, + "loss": 0.2759, + "step": 7737 + }, + { + "epoch": 0.9175856753231353, + "grad_norm": 1.1540190694457242, + "learning_rate": 4.4965566730328276e-05, + "loss": 0.3049, + "step": 7738 + }, + { + "epoch": 0.9177042570852603, + "grad_norm": 1.7374797453709618, + "learning_rate": 4.4964122024007284e-05, + "loss": 0.5661, + "step": 7739 + }, + { + "epoch": 0.9178228388473852, + "grad_norm": 1.8064114537865115, + "learning_rate": 4.4962677133641565e-05, + "loss": 0.3956, + "step": 7740 + }, + { + "epoch": 0.9179414206095102, + "grad_norm": 1.2339673062852203, + "learning_rate": 4.496123205924444e-05, + "loss": 0.3407, + "step": 7741 + }, + { + "epoch": 0.9180600023716352, + "grad_norm": 1.5520125529019726, + "learning_rate": 4.495978680082923e-05, + "loss": 0.4136, + "step": 7742 + }, + { + "epoch": 0.9181785841337602, + "grad_norm": 1.342748952675133, + "learning_rate": 4.495834135840927e-05, + "loss": 0.3739, + "step": 7743 + }, + { + "epoch": 0.9182971658958852, + "grad_norm": 1.1636095490783083, + "learning_rate": 4.495689573199787e-05, + "loss": 0.2398, + "step": 7744 + }, + { + "epoch": 0.9184157476580102, + "grad_norm": 0.9907607123120113, + "learning_rate": 4.495544992160837e-05, + "loss": 0.2759, + "step": 7745 + }, + { + "epoch": 0.9185343294201351, + "grad_norm": 1.1290160577879085, + "learning_rate": 4.495400392725409e-05, + "loss": 0.4066, + "step": 7746 + }, + { + "epoch": 0.9186529111822602, + "grad_norm": 1.453740632832844, + "learning_rate": 4.4952557748948365e-05, + "loss": 0.4522, + "step": 7747 + }, + { + "epoch": 0.9187714929443852, + "grad_norm": 1.2145791747992798, + "learning_rate": 4.4951111386704524e-05, + "loss": 0.2882, + "step": 7748 + }, + { + "epoch": 0.9188900747065102, + "grad_norm": 1.0987434552313782, + "learning_rate": 4.49496648405359e-05, + "loss": 0.3841, + "step": 7749 + }, + { + "epoch": 0.9190086564686352, + "grad_norm": 1.3533597149787853, + "learning_rate": 4.4948218110455834e-05, + "loss": 0.3534, + "step": 7750 + }, + { + "epoch": 0.9191272382307601, + "grad_norm": 1.3079569691993735, + "learning_rate": 4.4946771196477656e-05, + "loss": 0.413, + "step": 7751 + }, + { + "epoch": 0.9192458199928851, + "grad_norm": 1.0710746039232166, + "learning_rate": 4.4945324098614703e-05, + "loss": 0.3431, + "step": 7752 + }, + { + "epoch": 0.9193644017550101, + "grad_norm": 1.5123226067311801, + "learning_rate": 4.4943876816880325e-05, + "loss": 0.4487, + "step": 7753 + }, + { + "epoch": 0.9194829835171351, + "grad_norm": 1.3054719213535602, + "learning_rate": 4.4942429351287865e-05, + "loss": 0.422, + "step": 7754 + }, + { + "epoch": 0.9196015652792601, + "grad_norm": 1.2754969400118263, + "learning_rate": 4.494098170185066e-05, + "loss": 0.4157, + "step": 7755 + }, + { + "epoch": 0.919720147041385, + "grad_norm": 1.0353761916412845, + "learning_rate": 4.493953386858205e-05, + "loss": 0.2645, + "step": 7756 + }, + { + "epoch": 0.91983872880351, + "grad_norm": 0.960425663683585, + "learning_rate": 4.4938085851495396e-05, + "loss": 0.2899, + "step": 7757 + }, + { + "epoch": 0.919957310565635, + "grad_norm": 1.4405704187411301, + "learning_rate": 4.493663765060403e-05, + "loss": 0.374, + "step": 7758 + }, + { + "epoch": 0.92007589232776, + "grad_norm": 1.406107698127145, + "learning_rate": 4.493518926592132e-05, + "loss": 0.3383, + "step": 7759 + }, + { + "epoch": 0.920194474089885, + "grad_norm": 1.392004196012602, + "learning_rate": 4.493374069746061e-05, + "loss": 0.261, + "step": 7760 + }, + { + "epoch": 0.92031305585201, + "grad_norm": 1.9732687724175109, + "learning_rate": 4.493229194523526e-05, + "loss": 0.5543, + "step": 7761 + }, + { + "epoch": 0.9204316376141349, + "grad_norm": 1.3085620373539597, + "learning_rate": 4.493084300925862e-05, + "loss": 0.3644, + "step": 7762 + }, + { + "epoch": 0.9205502193762599, + "grad_norm": 1.0632606161570382, + "learning_rate": 4.492939388954404e-05, + "loss": 0.2715, + "step": 7763 + }, + { + "epoch": 0.9206688011383849, + "grad_norm": 1.5046128090348205, + "learning_rate": 4.492794458610489e-05, + "loss": 0.4582, + "step": 7764 + }, + { + "epoch": 0.9207873829005099, + "grad_norm": 1.3508783636181265, + "learning_rate": 4.492649509895453e-05, + "loss": 0.4085, + "step": 7765 + }, + { + "epoch": 0.9209059646626349, + "grad_norm": 1.2293576191152629, + "learning_rate": 4.4925045428106326e-05, + "loss": 0.2648, + "step": 7766 + }, + { + "epoch": 0.9210245464247598, + "grad_norm": 1.1868116106358735, + "learning_rate": 4.492359557357363e-05, + "loss": 0.3057, + "step": 7767 + }, + { + "epoch": 0.9211431281868848, + "grad_norm": 1.2450503505631003, + "learning_rate": 4.492214553536982e-05, + "loss": 0.3368, + "step": 7768 + }, + { + "epoch": 0.9212617099490098, + "grad_norm": 1.1714029604154623, + "learning_rate": 4.492069531350826e-05, + "loss": 0.2999, + "step": 7769 + }, + { + "epoch": 0.9213802917111348, + "grad_norm": 1.1053039104032298, + "learning_rate": 4.4919244908002306e-05, + "loss": 0.2773, + "step": 7770 + }, + { + "epoch": 0.9214988734732598, + "grad_norm": 1.7467015601541371, + "learning_rate": 4.4917794318865345e-05, + "loss": 0.4541, + "step": 7771 + }, + { + "epoch": 0.9216174552353849, + "grad_norm": 1.8997667684381532, + "learning_rate": 4.491634354611075e-05, + "loss": 0.4688, + "step": 7772 + }, + { + "epoch": 0.9217360369975098, + "grad_norm": 1.326029053658454, + "learning_rate": 4.491489258975189e-05, + "loss": 0.3658, + "step": 7773 + }, + { + "epoch": 0.9218546187596348, + "grad_norm": 1.234995333356106, + "learning_rate": 4.491344144980214e-05, + "loss": 0.3367, + "step": 7774 + }, + { + "epoch": 0.9219732005217598, + "grad_norm": 1.28478611468339, + "learning_rate": 4.491199012627487e-05, + "loss": 0.3281, + "step": 7775 + }, + { + "epoch": 0.9220917822838848, + "grad_norm": 1.0645342712730936, + "learning_rate": 4.4910538619183484e-05, + "loss": 0.3272, + "step": 7776 + }, + { + "epoch": 0.9222103640460098, + "grad_norm": 1.694435002766982, + "learning_rate": 4.490908692854134e-05, + "loss": 0.4679, + "step": 7777 + }, + { + "epoch": 0.9223289458081347, + "grad_norm": 1.1724174861342698, + "learning_rate": 4.490763505436183e-05, + "loss": 0.3552, + "step": 7778 + }, + { + "epoch": 0.9224475275702597, + "grad_norm": 1.1921281212186352, + "learning_rate": 4.490618299665834e-05, + "loss": 0.3254, + "step": 7779 + }, + { + "epoch": 0.9225661093323847, + "grad_norm": 1.849401987964552, + "learning_rate": 4.490473075544425e-05, + "loss": 0.4285, + "step": 7780 + }, + { + "epoch": 0.9226846910945097, + "grad_norm": 1.6810878097266828, + "learning_rate": 4.490327833073295e-05, + "loss": 0.5888, + "step": 7781 + }, + { + "epoch": 0.9228032728566347, + "grad_norm": 1.0557502911062124, + "learning_rate": 4.490182572253783e-05, + "loss": 0.2591, + "step": 7782 + }, + { + "epoch": 0.9229218546187596, + "grad_norm": 1.4082389944819793, + "learning_rate": 4.4900372930872293e-05, + "loss": 0.441, + "step": 7783 + }, + { + "epoch": 0.9230404363808846, + "grad_norm": 1.6583354988312733, + "learning_rate": 4.489891995574971e-05, + "loss": 0.4747, + "step": 7784 + }, + { + "epoch": 0.9231590181430096, + "grad_norm": 1.3582323683861814, + "learning_rate": 4.489746679718349e-05, + "loss": 0.476, + "step": 7785 + }, + { + "epoch": 0.9232775999051346, + "grad_norm": 1.1417796657090502, + "learning_rate": 4.489601345518702e-05, + "loss": 0.2925, + "step": 7786 + }, + { + "epoch": 0.9233961816672596, + "grad_norm": 1.114183049129594, + "learning_rate": 4.489455992977371e-05, + "loss": 0.2992, + "step": 7787 + }, + { + "epoch": 0.9235147634293845, + "grad_norm": 1.1692663367404215, + "learning_rate": 4.4893106220956956e-05, + "loss": 0.3529, + "step": 7788 + }, + { + "epoch": 0.9236333451915095, + "grad_norm": 0.9311203608842018, + "learning_rate": 4.4891652328750154e-05, + "loss": 0.3438, + "step": 7789 + }, + { + "epoch": 0.9237519269536345, + "grad_norm": 1.3408234057398656, + "learning_rate": 4.489019825316671e-05, + "loss": 0.4041, + "step": 7790 + }, + { + "epoch": 0.9238705087157595, + "grad_norm": 1.1566618021141706, + "learning_rate": 4.4888743994220026e-05, + "loss": 0.3666, + "step": 7791 + }, + { + "epoch": 0.9239890904778845, + "grad_norm": 1.0540250896765162, + "learning_rate": 4.4887289551923514e-05, + "loss": 0.3165, + "step": 7792 + }, + { + "epoch": 0.9241076722400094, + "grad_norm": 1.0502523512011264, + "learning_rate": 4.488583492629058e-05, + "loss": 0.2931, + "step": 7793 + }, + { + "epoch": 0.9242262540021344, + "grad_norm": 1.7305521282442995, + "learning_rate": 4.488438011733464e-05, + "loss": 0.5915, + "step": 7794 + }, + { + "epoch": 0.9243448357642594, + "grad_norm": 1.4355646696957487, + "learning_rate": 4.4882925125069096e-05, + "loss": 0.381, + "step": 7795 + }, + { + "epoch": 0.9244634175263844, + "grad_norm": 1.2119279890716508, + "learning_rate": 4.488146994950736e-05, + "loss": 0.3972, + "step": 7796 + }, + { + "epoch": 0.9245819992885095, + "grad_norm": 1.3533266868043552, + "learning_rate": 4.4880014590662856e-05, + "loss": 0.3728, + "step": 7797 + }, + { + "epoch": 0.9247005810506345, + "grad_norm": 1.0461933564120343, + "learning_rate": 4.487855904854899e-05, + "loss": 0.3, + "step": 7798 + }, + { + "epoch": 0.9248191628127594, + "grad_norm": 1.4354944586498377, + "learning_rate": 4.487710332317919e-05, + "loss": 0.4518, + "step": 7799 + }, + { + "epoch": 0.9249377445748844, + "grad_norm": 1.1135224619163588, + "learning_rate": 4.4875647414566876e-05, + "loss": 0.3069, + "step": 7800 + }, + { + "epoch": 0.9250563263370094, + "grad_norm": 1.34825534621954, + "learning_rate": 4.4874191322725455e-05, + "loss": 0.3885, + "step": 7801 + }, + { + "epoch": 0.9251749080991344, + "grad_norm": 1.103581954364754, + "learning_rate": 4.487273504766837e-05, + "loss": 0.2995, + "step": 7802 + }, + { + "epoch": 0.9252934898612594, + "grad_norm": 1.049265812119674, + "learning_rate": 4.487127858940904e-05, + "loss": 0.2386, + "step": 7803 + }, + { + "epoch": 0.9254120716233843, + "grad_norm": 1.2216943670211076, + "learning_rate": 4.486982194796088e-05, + "loss": 0.3258, + "step": 7804 + }, + { + "epoch": 0.9255306533855093, + "grad_norm": 1.1602022379160677, + "learning_rate": 4.486836512333734e-05, + "loss": 0.3356, + "step": 7805 + }, + { + "epoch": 0.9256492351476343, + "grad_norm": 1.3074472591938728, + "learning_rate": 4.4866908115551826e-05, + "loss": 0.4347, + "step": 7806 + }, + { + "epoch": 0.9257678169097593, + "grad_norm": 2.23380788788739, + "learning_rate": 4.4865450924617786e-05, + "loss": 0.6296, + "step": 7807 + }, + { + "epoch": 0.9258863986718843, + "grad_norm": 1.2642368374191435, + "learning_rate": 4.486399355054865e-05, + "loss": 0.3576, + "step": 7808 + }, + { + "epoch": 0.9260049804340093, + "grad_norm": 1.327129019083146, + "learning_rate": 4.486253599335785e-05, + "loss": 0.4236, + "step": 7809 + }, + { + "epoch": 0.9261235621961342, + "grad_norm": 1.2427406330294402, + "learning_rate": 4.486107825305883e-05, + "loss": 0.4971, + "step": 7810 + }, + { + "epoch": 0.9262421439582592, + "grad_norm": 1.0740218556655832, + "learning_rate": 4.485962032966502e-05, + "loss": 0.3215, + "step": 7811 + }, + { + "epoch": 0.9263607257203842, + "grad_norm": 1.0993673509794941, + "learning_rate": 4.485816222318986e-05, + "loss": 0.2751, + "step": 7812 + }, + { + "epoch": 0.9264793074825092, + "grad_norm": 1.4535560799380876, + "learning_rate": 4.4856703933646805e-05, + "loss": 0.4098, + "step": 7813 + }, + { + "epoch": 0.9265978892446342, + "grad_norm": 1.2492611872200596, + "learning_rate": 4.485524546104928e-05, + "loss": 0.3422, + "step": 7814 + }, + { + "epoch": 0.9267164710067591, + "grad_norm": 1.2266483888374327, + "learning_rate": 4.485378680541075e-05, + "loss": 0.3532, + "step": 7815 + }, + { + "epoch": 0.9268350527688841, + "grad_norm": 1.5210786781429928, + "learning_rate": 4.485232796674465e-05, + "loss": 0.5057, + "step": 7816 + }, + { + "epoch": 0.9269536345310091, + "grad_norm": 1.2822330896168308, + "learning_rate": 4.485086894506442e-05, + "loss": 0.4074, + "step": 7817 + }, + { + "epoch": 0.9270722162931341, + "grad_norm": 1.4353409075424353, + "learning_rate": 4.484940974038353e-05, + "loss": 0.3878, + "step": 7818 + }, + { + "epoch": 0.9271907980552591, + "grad_norm": 1.247079212779066, + "learning_rate": 4.484795035271542e-05, + "loss": 0.349, + "step": 7819 + }, + { + "epoch": 0.927309379817384, + "grad_norm": 1.393291279251636, + "learning_rate": 4.484649078207355e-05, + "loss": 0.3462, + "step": 7820 + }, + { + "epoch": 0.927427961579509, + "grad_norm": 1.7772361292463945, + "learning_rate": 4.484503102847137e-05, + "loss": 0.413, + "step": 7821 + }, + { + "epoch": 0.9275465433416341, + "grad_norm": 0.9408083827503743, + "learning_rate": 4.4843571091922346e-05, + "loss": 0.2288, + "step": 7822 + }, + { + "epoch": 0.9276651251037591, + "grad_norm": 1.4865304903677392, + "learning_rate": 4.484211097243992e-05, + "loss": 0.3711, + "step": 7823 + }, + { + "epoch": 0.9277837068658841, + "grad_norm": 1.3373965518004143, + "learning_rate": 4.484065067003757e-05, + "loss": 0.4139, + "step": 7824 + }, + { + "epoch": 0.927902288628009, + "grad_norm": 1.2838123002941628, + "learning_rate": 4.483919018472875e-05, + "loss": 0.3333, + "step": 7825 + }, + { + "epoch": 0.928020870390134, + "grad_norm": 1.1496044077867684, + "learning_rate": 4.4837729516526924e-05, + "loss": 0.316, + "step": 7826 + }, + { + "epoch": 0.928139452152259, + "grad_norm": 1.1830967789911926, + "learning_rate": 4.483626866544556e-05, + "loss": 0.2934, + "step": 7827 + }, + { + "epoch": 0.928258033914384, + "grad_norm": 1.1933893354549958, + "learning_rate": 4.483480763149812e-05, + "loss": 0.3351, + "step": 7828 + }, + { + "epoch": 0.928376615676509, + "grad_norm": 1.2476646950412273, + "learning_rate": 4.483334641469808e-05, + "loss": 0.2912, + "step": 7829 + }, + { + "epoch": 0.928495197438634, + "grad_norm": 1.5287684976455276, + "learning_rate": 4.48318850150589e-05, + "loss": 0.4633, + "step": 7830 + }, + { + "epoch": 0.9286137792007589, + "grad_norm": 1.5873219347335292, + "learning_rate": 4.483042343259407e-05, + "loss": 0.5446, + "step": 7831 + }, + { + "epoch": 0.9287323609628839, + "grad_norm": 1.2483968199882163, + "learning_rate": 4.4828961667317046e-05, + "loss": 0.2844, + "step": 7832 + }, + { + "epoch": 0.9288509427250089, + "grad_norm": 1.457496674730435, + "learning_rate": 4.482749971924132e-05, + "loss": 0.3786, + "step": 7833 + }, + { + "epoch": 0.9289695244871339, + "grad_norm": 1.1203515871820942, + "learning_rate": 4.482603758838035e-05, + "loss": 0.3402, + "step": 7834 + }, + { + "epoch": 0.9290881062492589, + "grad_norm": 1.2665410600997544, + "learning_rate": 4.4824575274747635e-05, + "loss": 0.3176, + "step": 7835 + }, + { + "epoch": 0.9292066880113838, + "grad_norm": 1.595380170842477, + "learning_rate": 4.482311277835664e-05, + "loss": 0.4811, + "step": 7836 + }, + { + "epoch": 0.9293252697735088, + "grad_norm": 2.1240734028816495, + "learning_rate": 4.482165009922086e-05, + "loss": 0.5238, + "step": 7837 + }, + { + "epoch": 0.9294438515356338, + "grad_norm": 0.9491026579214498, + "learning_rate": 4.482018723735377e-05, + "loss": 0.253, + "step": 7838 + }, + { + "epoch": 0.9295624332977588, + "grad_norm": 1.5822020288784522, + "learning_rate": 4.481872419276886e-05, + "loss": 0.5196, + "step": 7839 + }, + { + "epoch": 0.9296810150598838, + "grad_norm": 1.5967015821742594, + "learning_rate": 4.481726096547961e-05, + "loss": 0.4201, + "step": 7840 + }, + { + "epoch": 0.9297995968220087, + "grad_norm": 1.1176860166406442, + "learning_rate": 4.481579755549952e-05, + "loss": 0.3291, + "step": 7841 + }, + { + "epoch": 0.9299181785841337, + "grad_norm": 1.4521404203464887, + "learning_rate": 4.481433396284207e-05, + "loss": 0.4124, + "step": 7842 + }, + { + "epoch": 0.9300367603462587, + "grad_norm": 0.9294814839923479, + "learning_rate": 4.4812870187520766e-05, + "loss": 0.3229, + "step": 7843 + }, + { + "epoch": 0.9301553421083837, + "grad_norm": 1.1213255975334284, + "learning_rate": 4.48114062295491e-05, + "loss": 0.3144, + "step": 7844 + }, + { + "epoch": 0.9302739238705087, + "grad_norm": 1.1760118526969023, + "learning_rate": 4.480994208894055e-05, + "loss": 0.3546, + "step": 7845 + }, + { + "epoch": 0.9303925056326336, + "grad_norm": 1.4061415612488872, + "learning_rate": 4.4808477765708636e-05, + "loss": 0.4154, + "step": 7846 + }, + { + "epoch": 0.9305110873947587, + "grad_norm": 1.2376305672650991, + "learning_rate": 4.480701325986685e-05, + "loss": 0.3207, + "step": 7847 + }, + { + "epoch": 0.9306296691568837, + "grad_norm": 0.7375877856855838, + "learning_rate": 4.480554857142868e-05, + "loss": 0.1926, + "step": 7848 + }, + { + "epoch": 0.9307482509190087, + "grad_norm": 1.1229883476650957, + "learning_rate": 4.480408370040765e-05, + "loss": 0.2563, + "step": 7849 + }, + { + "epoch": 0.9308668326811337, + "grad_norm": 1.854630983384136, + "learning_rate": 4.480261864681724e-05, + "loss": 0.5666, + "step": 7850 + }, + { + "epoch": 0.9309854144432587, + "grad_norm": 1.7259865506022278, + "learning_rate": 4.480115341067098e-05, + "loss": 0.4155, + "step": 7851 + }, + { + "epoch": 0.9311039962053836, + "grad_norm": 1.6291433492084548, + "learning_rate": 4.479968799198237e-05, + "loss": 0.5189, + "step": 7852 + }, + { + "epoch": 0.9312225779675086, + "grad_norm": 1.1049110082900269, + "learning_rate": 4.479822239076491e-05, + "loss": 0.2703, + "step": 7853 + }, + { + "epoch": 0.9313411597296336, + "grad_norm": 1.1478362112696097, + "learning_rate": 4.479675660703212e-05, + "loss": 0.3749, + "step": 7854 + }, + { + "epoch": 0.9314597414917586, + "grad_norm": 1.0511792951050525, + "learning_rate": 4.479529064079751e-05, + "loss": 0.2883, + "step": 7855 + }, + { + "epoch": 0.9315783232538836, + "grad_norm": 1.3709253421738306, + "learning_rate": 4.47938244920746e-05, + "loss": 0.3887, + "step": 7856 + }, + { + "epoch": 0.9316969050160085, + "grad_norm": 1.3121005183673742, + "learning_rate": 4.479235816087689e-05, + "loss": 0.4166, + "step": 7857 + }, + { + "epoch": 0.9318154867781335, + "grad_norm": 1.2588755475209268, + "learning_rate": 4.479089164721792e-05, + "loss": 0.3557, + "step": 7858 + }, + { + "epoch": 0.9319340685402585, + "grad_norm": 1.1317530987234352, + "learning_rate": 4.478942495111119e-05, + "loss": 0.232, + "step": 7859 + }, + { + "epoch": 0.9320526503023835, + "grad_norm": 1.0478994811404854, + "learning_rate": 4.4787958072570236e-05, + "loss": 0.2672, + "step": 7860 + }, + { + "epoch": 0.9321712320645085, + "grad_norm": 1.6150924653668606, + "learning_rate": 4.4786491011608564e-05, + "loss": 0.4797, + "step": 7861 + }, + { + "epoch": 0.9322898138266335, + "grad_norm": 1.7604349412057645, + "learning_rate": 4.478502376823972e-05, + "loss": 0.4635, + "step": 7862 + }, + { + "epoch": 0.9324083955887584, + "grad_norm": 1.563629897448531, + "learning_rate": 4.4783556342477215e-05, + "loss": 0.4081, + "step": 7863 + }, + { + "epoch": 0.9325269773508834, + "grad_norm": 1.3254193678936212, + "learning_rate": 4.478208873433458e-05, + "loss": 0.2975, + "step": 7864 + }, + { + "epoch": 0.9326455591130084, + "grad_norm": 1.581870455184607, + "learning_rate": 4.4780620943825344e-05, + "loss": 0.4012, + "step": 7865 + }, + { + "epoch": 0.9327641408751334, + "grad_norm": 1.2284418806157484, + "learning_rate": 4.4779152970963044e-05, + "loss": 0.3011, + "step": 7866 + }, + { + "epoch": 0.9328827226372584, + "grad_norm": 1.9716541877979046, + "learning_rate": 4.477768481576121e-05, + "loss": 0.4165, + "step": 7867 + }, + { + "epoch": 0.9330013043993833, + "grad_norm": 1.2946986859962948, + "learning_rate": 4.477621647823337e-05, + "loss": 0.378, + "step": 7868 + }, + { + "epoch": 0.9331198861615083, + "grad_norm": 0.9908641081653118, + "learning_rate": 4.477474795839306e-05, + "loss": 0.3587, + "step": 7869 + }, + { + "epoch": 0.9332384679236333, + "grad_norm": 0.9760000239221137, + "learning_rate": 4.477327925625383e-05, + "loss": 0.2375, + "step": 7870 + }, + { + "epoch": 0.9333570496857583, + "grad_norm": 1.3885695656913952, + "learning_rate": 4.4771810371829206e-05, + "loss": 0.4132, + "step": 7871 + }, + { + "epoch": 0.9334756314478834, + "grad_norm": 1.4924264398595317, + "learning_rate": 4.477034130513274e-05, + "loss": 0.4154, + "step": 7872 + }, + { + "epoch": 0.9335942132100084, + "grad_norm": 1.8066033486954594, + "learning_rate": 4.476887205617798e-05, + "loss": 0.4199, + "step": 7873 + }, + { + "epoch": 0.9337127949721333, + "grad_norm": 1.4149581234162891, + "learning_rate": 4.4767402624978456e-05, + "loss": 0.45, + "step": 7874 + }, + { + "epoch": 0.9338313767342583, + "grad_norm": 1.2024014224278727, + "learning_rate": 4.476593301154772e-05, + "loss": 0.3622, + "step": 7875 + }, + { + "epoch": 0.9339499584963833, + "grad_norm": 1.2812416206331476, + "learning_rate": 4.476446321589931e-05, + "loss": 0.3227, + "step": 7876 + }, + { + "epoch": 0.9340685402585083, + "grad_norm": 1.0629513956954166, + "learning_rate": 4.4762993238046805e-05, + "loss": 0.2742, + "step": 7877 + }, + { + "epoch": 0.9341871220206333, + "grad_norm": 2.3772212337473686, + "learning_rate": 4.476152307800372e-05, + "loss": 0.71, + "step": 7878 + }, + { + "epoch": 0.9343057037827582, + "grad_norm": 1.399959406051211, + "learning_rate": 4.4760052735783634e-05, + "loss": 0.4355, + "step": 7879 + }, + { + "epoch": 0.9344242855448832, + "grad_norm": 1.3084336705994624, + "learning_rate": 4.47585822114001e-05, + "loss": 0.3662, + "step": 7880 + }, + { + "epoch": 0.9345428673070082, + "grad_norm": 1.2793369175301326, + "learning_rate": 4.475711150486666e-05, + "loss": 0.3712, + "step": 7881 + }, + { + "epoch": 0.9346614490691332, + "grad_norm": 1.0757156580989016, + "learning_rate": 4.475564061619688e-05, + "loss": 0.3256, + "step": 7882 + }, + { + "epoch": 0.9347800308312582, + "grad_norm": 1.0934839206249856, + "learning_rate": 4.475416954540431e-05, + "loss": 0.3677, + "step": 7883 + }, + { + "epoch": 0.9348986125933831, + "grad_norm": 1.0348354180289654, + "learning_rate": 4.475269829250254e-05, + "loss": 0.312, + "step": 7884 + }, + { + "epoch": 0.9350171943555081, + "grad_norm": 1.793066961184684, + "learning_rate": 4.47512268575051e-05, + "loss": 0.4712, + "step": 7885 + }, + { + "epoch": 0.9351357761176331, + "grad_norm": 1.2234240240236505, + "learning_rate": 4.474975524042557e-05, + "loss": 0.4183, + "step": 7886 + }, + { + "epoch": 0.9352543578797581, + "grad_norm": 1.2305756453587218, + "learning_rate": 4.4748283441277514e-05, + "loss": 0.3182, + "step": 7887 + }, + { + "epoch": 0.9353729396418831, + "grad_norm": 1.2386729558645155, + "learning_rate": 4.474681146007451e-05, + "loss": 0.3458, + "step": 7888 + }, + { + "epoch": 0.935491521404008, + "grad_norm": 1.2798982321391188, + "learning_rate": 4.474533929683011e-05, + "loss": 0.3886, + "step": 7889 + }, + { + "epoch": 0.935610103166133, + "grad_norm": 1.1533686314110427, + "learning_rate": 4.474386695155791e-05, + "loss": 0.3434, + "step": 7890 + }, + { + "epoch": 0.935728684928258, + "grad_norm": 1.1045975223822042, + "learning_rate": 4.4742394424271455e-05, + "loss": 0.2739, + "step": 7891 + }, + { + "epoch": 0.935847266690383, + "grad_norm": 1.3697305999648342, + "learning_rate": 4.474092171498434e-05, + "loss": 0.3759, + "step": 7892 + }, + { + "epoch": 0.935965848452508, + "grad_norm": 1.220260455499924, + "learning_rate": 4.473944882371013e-05, + "loss": 0.3749, + "step": 7893 + }, + { + "epoch": 0.936084430214633, + "grad_norm": 1.3036442036027642, + "learning_rate": 4.4737975750462405e-05, + "loss": 0.3402, + "step": 7894 + }, + { + "epoch": 0.9362030119767579, + "grad_norm": 1.1568325114301774, + "learning_rate": 4.473650249525476e-05, + "loss": 0.3627, + "step": 7895 + }, + { + "epoch": 0.9363215937388829, + "grad_norm": 1.5322585575929735, + "learning_rate": 4.473502905810075e-05, + "loss": 0.3776, + "step": 7896 + }, + { + "epoch": 0.936440175501008, + "grad_norm": 1.4225899707476533, + "learning_rate": 4.473355543901398e-05, + "loss": 0.4446, + "step": 7897 + }, + { + "epoch": 0.936558757263133, + "grad_norm": 1.3442218016470409, + "learning_rate": 4.4732081638008024e-05, + "loss": 0.3422, + "step": 7898 + }, + { + "epoch": 0.936677339025258, + "grad_norm": 1.574901528723671, + "learning_rate": 4.473060765509648e-05, + "loss": 0.4623, + "step": 7899 + }, + { + "epoch": 0.9367959207873829, + "grad_norm": 1.7024167657787856, + "learning_rate": 4.472913349029292e-05, + "loss": 0.5328, + "step": 7900 + }, + { + "epoch": 0.9369145025495079, + "grad_norm": 1.271409654843525, + "learning_rate": 4.472765914361095e-05, + "loss": 0.3721, + "step": 7901 + }, + { + "epoch": 0.9370330843116329, + "grad_norm": 1.8097583526972605, + "learning_rate": 4.4726184615064154e-05, + "loss": 0.3364, + "step": 7902 + }, + { + "epoch": 0.9371516660737579, + "grad_norm": 0.9557662243946908, + "learning_rate": 4.4724709904666126e-05, + "loss": 0.2922, + "step": 7903 + }, + { + "epoch": 0.9372702478358829, + "grad_norm": 1.224507933366907, + "learning_rate": 4.4723235012430454e-05, + "loss": 0.307, + "step": 7904 + }, + { + "epoch": 0.9373888295980078, + "grad_norm": 1.3849379511128523, + "learning_rate": 4.472175993837074e-05, + "loss": 0.4773, + "step": 7905 + }, + { + "epoch": 0.9375074113601328, + "grad_norm": 1.4278112443010327, + "learning_rate": 4.472028468250059e-05, + "loss": 0.4522, + "step": 7906 + }, + { + "epoch": 0.9376259931222578, + "grad_norm": 1.3976732154105562, + "learning_rate": 4.471880924483361e-05, + "loss": 0.3006, + "step": 7907 + }, + { + "epoch": 0.9377445748843828, + "grad_norm": 0.9630806697750414, + "learning_rate": 4.4717333625383375e-05, + "loss": 0.2114, + "step": 7908 + }, + { + "epoch": 0.9378631566465078, + "grad_norm": 1.196954884523647, + "learning_rate": 4.47158578241635e-05, + "loss": 0.3476, + "step": 7909 + }, + { + "epoch": 0.9379817384086327, + "grad_norm": 1.3880275673979963, + "learning_rate": 4.4714381841187595e-05, + "loss": 0.454, + "step": 7910 + }, + { + "epoch": 0.9381003201707577, + "grad_norm": 1.5369109457235928, + "learning_rate": 4.471290567646927e-05, + "loss": 0.4362, + "step": 7911 + }, + { + "epoch": 0.9382189019328827, + "grad_norm": 1.146488087851261, + "learning_rate": 4.471142933002213e-05, + "loss": 0.3229, + "step": 7912 + }, + { + "epoch": 0.9383374836950077, + "grad_norm": 1.6331038547310854, + "learning_rate": 4.470995280185977e-05, + "loss": 0.3831, + "step": 7913 + }, + { + "epoch": 0.9384560654571327, + "grad_norm": 1.2847545929734496, + "learning_rate": 4.470847609199583e-05, + "loss": 0.2991, + "step": 7914 + }, + { + "epoch": 0.9385746472192577, + "grad_norm": 1.4174673583184492, + "learning_rate": 4.47069992004439e-05, + "loss": 0.3823, + "step": 7915 + }, + { + "epoch": 0.9386932289813826, + "grad_norm": 1.4382369954495928, + "learning_rate": 4.47055221272176e-05, + "loss": 0.3721, + "step": 7916 + }, + { + "epoch": 0.9388118107435076, + "grad_norm": 0.9368905265203433, + "learning_rate": 4.470404487233057e-05, + "loss": 0.2866, + "step": 7917 + }, + { + "epoch": 0.9389303925056326, + "grad_norm": 1.4405861502475428, + "learning_rate": 4.470256743579638e-05, + "loss": 0.395, + "step": 7918 + }, + { + "epoch": 0.9390489742677576, + "grad_norm": 1.3578860487971665, + "learning_rate": 4.47010898176287e-05, + "loss": 0.4248, + "step": 7919 + }, + { + "epoch": 0.9391675560298826, + "grad_norm": 1.358097664003968, + "learning_rate": 4.4699612017841124e-05, + "loss": 0.2689, + "step": 7920 + }, + { + "epoch": 0.9392861377920075, + "grad_norm": 1.3707335460118961, + "learning_rate": 4.4698134036447283e-05, + "loss": 0.414, + "step": 7921 + }, + { + "epoch": 0.9394047195541326, + "grad_norm": 1.344652650490059, + "learning_rate": 4.469665587346081e-05, + "loss": 0.3809, + "step": 7922 + }, + { + "epoch": 0.9395233013162576, + "grad_norm": 1.350668547688967, + "learning_rate": 4.4695177528895316e-05, + "loss": 0.2968, + "step": 7923 + }, + { + "epoch": 0.9396418830783826, + "grad_norm": 1.7592958012027944, + "learning_rate": 4.469369900276443e-05, + "loss": 0.5406, + "step": 7924 + }, + { + "epoch": 0.9397604648405076, + "grad_norm": 1.098127321887825, + "learning_rate": 4.4692220295081796e-05, + "loss": 0.2966, + "step": 7925 + }, + { + "epoch": 0.9398790466026326, + "grad_norm": 1.2430880166752971, + "learning_rate": 4.469074140586105e-05, + "loss": 0.3225, + "step": 7926 + }, + { + "epoch": 0.9399976283647575, + "grad_norm": 1.550801812963555, + "learning_rate": 4.46892623351158e-05, + "loss": 0.4522, + "step": 7927 + }, + { + "epoch": 0.9401162101268825, + "grad_norm": 1.7003616251711866, + "learning_rate": 4.46877830828597e-05, + "loss": 0.5074, + "step": 7928 + }, + { + "epoch": 0.9402347918890075, + "grad_norm": 1.7703051802855216, + "learning_rate": 4.468630364910638e-05, + "loss": 0.4702, + "step": 7929 + }, + { + "epoch": 0.9403533736511325, + "grad_norm": 1.0100421450087218, + "learning_rate": 4.468482403386949e-05, + "loss": 0.2823, + "step": 7930 + }, + { + "epoch": 0.9404719554132575, + "grad_norm": 1.4845330597300348, + "learning_rate": 4.468334423716265e-05, + "loss": 0.469, + "step": 7931 + }, + { + "epoch": 0.9405905371753824, + "grad_norm": 1.2598366819098834, + "learning_rate": 4.468186425899952e-05, + "loss": 0.2735, + "step": 7932 + }, + { + "epoch": 0.9407091189375074, + "grad_norm": 1.2323541666291946, + "learning_rate": 4.4680384099393736e-05, + "loss": 0.3348, + "step": 7933 + }, + { + "epoch": 0.9408277006996324, + "grad_norm": 1.30826069212377, + "learning_rate": 4.467890375835894e-05, + "loss": 0.3411, + "step": 7934 + }, + { + "epoch": 0.9409462824617574, + "grad_norm": 1.4817824306521818, + "learning_rate": 4.467742323590879e-05, + "loss": 0.4869, + "step": 7935 + }, + { + "epoch": 0.9410648642238824, + "grad_norm": 1.4365390922754468, + "learning_rate": 4.467594253205693e-05, + "loss": 0.4806, + "step": 7936 + }, + { + "epoch": 0.9411834459860073, + "grad_norm": 1.5521876920960798, + "learning_rate": 4.4674461646817e-05, + "loss": 0.5316, + "step": 7937 + }, + { + "epoch": 0.9413020277481323, + "grad_norm": 1.2196857701445063, + "learning_rate": 4.4672980580202664e-05, + "loss": 0.3046, + "step": 7938 + }, + { + "epoch": 0.9414206095102573, + "grad_norm": 1.306708166824195, + "learning_rate": 4.467149933222757e-05, + "loss": 0.4167, + "step": 7939 + }, + { + "epoch": 0.9415391912723823, + "grad_norm": 0.9658113327414094, + "learning_rate": 4.467001790290538e-05, + "loss": 0.2461, + "step": 7940 + }, + { + "epoch": 0.9416577730345073, + "grad_norm": 1.4775720569357749, + "learning_rate": 4.466853629224974e-05, + "loss": 0.2694, + "step": 7941 + }, + { + "epoch": 0.9417763547966322, + "grad_norm": 1.2558645890650024, + "learning_rate": 4.4667054500274313e-05, + "loss": 0.3378, + "step": 7942 + }, + { + "epoch": 0.9418949365587572, + "grad_norm": 1.29225538000051, + "learning_rate": 4.466557252699277e-05, + "loss": 0.3043, + "step": 7943 + }, + { + "epoch": 0.9420135183208822, + "grad_norm": 1.782053385644077, + "learning_rate": 4.4664090372418755e-05, + "loss": 0.5266, + "step": 7944 + }, + { + "epoch": 0.9421321000830072, + "grad_norm": 1.5351069697400619, + "learning_rate": 4.466260803656595e-05, + "loss": 0.416, + "step": 7945 + }, + { + "epoch": 0.9422506818451322, + "grad_norm": 1.098101520337271, + "learning_rate": 4.4661125519447997e-05, + "loss": 0.3123, + "step": 7946 + }, + { + "epoch": 0.9423692636072573, + "grad_norm": 1.634881170880433, + "learning_rate": 4.465964282107859e-05, + "loss": 0.5117, + "step": 7947 + }, + { + "epoch": 0.9424878453693822, + "grad_norm": 1.3250041333750595, + "learning_rate": 4.465815994147138e-05, + "loss": 0.3313, + "step": 7948 + }, + { + "epoch": 0.9426064271315072, + "grad_norm": 1.1521457703142357, + "learning_rate": 4.465667688064003e-05, + "loss": 0.3062, + "step": 7949 + }, + { + "epoch": 0.9427250088936322, + "grad_norm": 1.3427906734972042, + "learning_rate": 4.465519363859824e-05, + "loss": 0.3321, + "step": 7950 + }, + { + "epoch": 0.9428435906557572, + "grad_norm": 1.5998830895568372, + "learning_rate": 4.4653710215359655e-05, + "loss": 0.4044, + "step": 7951 + }, + { + "epoch": 0.9429621724178822, + "grad_norm": 1.2654796668555095, + "learning_rate": 4.4652226610937974e-05, + "loss": 0.4068, + "step": 7952 + }, + { + "epoch": 0.9430807541800071, + "grad_norm": 1.42276818249597, + "learning_rate": 4.465074282534686e-05, + "loss": 0.4116, + "step": 7953 + }, + { + "epoch": 0.9431993359421321, + "grad_norm": 1.181018237444917, + "learning_rate": 4.464925885859999e-05, + "loss": 0.2817, + "step": 7954 + }, + { + "epoch": 0.9433179177042571, + "grad_norm": 2.146627859528423, + "learning_rate": 4.464777471071106e-05, + "loss": 0.6179, + "step": 7955 + }, + { + "epoch": 0.9434364994663821, + "grad_norm": 1.4502575502158863, + "learning_rate": 4.464629038169372e-05, + "loss": 0.4363, + "step": 7956 + }, + { + "epoch": 0.9435550812285071, + "grad_norm": 1.6312090706986382, + "learning_rate": 4.464480587156169e-05, + "loss": 0.4231, + "step": 7957 + }, + { + "epoch": 0.943673662990632, + "grad_norm": 1.2278445287221056, + "learning_rate": 4.464332118032864e-05, + "loss": 0.4086, + "step": 7958 + }, + { + "epoch": 0.943792244752757, + "grad_norm": 1.7196454044303109, + "learning_rate": 4.464183630800825e-05, + "loss": 0.4648, + "step": 7959 + }, + { + "epoch": 0.943910826514882, + "grad_norm": 1.219768617827713, + "learning_rate": 4.464035125461422e-05, + "loss": 0.3525, + "step": 7960 + }, + { + "epoch": 0.944029408277007, + "grad_norm": 1.626080831652033, + "learning_rate": 4.463886602016024e-05, + "loss": 0.5388, + "step": 7961 + }, + { + "epoch": 0.944147990039132, + "grad_norm": 1.0452816723928051, + "learning_rate": 4.463738060466e-05, + "loss": 0.2611, + "step": 7962 + }, + { + "epoch": 0.944266571801257, + "grad_norm": 1.1541678277579022, + "learning_rate": 4.4635895008127176e-05, + "loss": 0.3874, + "step": 7963 + }, + { + "epoch": 0.9443851535633819, + "grad_norm": 1.2299553803873313, + "learning_rate": 4.463440923057549e-05, + "loss": 0.3195, + "step": 7964 + }, + { + "epoch": 0.9445037353255069, + "grad_norm": 1.3624632472750375, + "learning_rate": 4.463292327201862e-05, + "loss": 0.3454, + "step": 7965 + }, + { + "epoch": 0.9446223170876319, + "grad_norm": 1.3286463576522978, + "learning_rate": 4.463143713247029e-05, + "loss": 0.4108, + "step": 7966 + }, + { + "epoch": 0.9447408988497569, + "grad_norm": 1.314053233597529, + "learning_rate": 4.462995081194417e-05, + "loss": 0.4709, + "step": 7967 + }, + { + "epoch": 0.9448594806118819, + "grad_norm": 1.0905643205542164, + "learning_rate": 4.462846431045398e-05, + "loss": 0.2367, + "step": 7968 + }, + { + "epoch": 0.9449780623740068, + "grad_norm": 1.501224340697668, + "learning_rate": 4.462697762801341e-05, + "loss": 0.3225, + "step": 7969 + }, + { + "epoch": 0.9450966441361318, + "grad_norm": 1.2710424475121913, + "learning_rate": 4.4625490764636185e-05, + "loss": 0.3987, + "step": 7970 + }, + { + "epoch": 0.9452152258982568, + "grad_norm": 1.2347031385639617, + "learning_rate": 4.4624003720335995e-05, + "loss": 0.3689, + "step": 7971 + }, + { + "epoch": 0.9453338076603819, + "grad_norm": 1.3388408952108617, + "learning_rate": 4.462251649512656e-05, + "loss": 0.3252, + "step": 7972 + }, + { + "epoch": 0.9454523894225069, + "grad_norm": 1.2119969118017877, + "learning_rate": 4.462102908902159e-05, + "loss": 0.3643, + "step": 7973 + }, + { + "epoch": 0.9455709711846318, + "grad_norm": 1.3811258090387222, + "learning_rate": 4.4619541502034775e-05, + "loss": 0.3506, + "step": 7974 + }, + { + "epoch": 0.9456895529467568, + "grad_norm": 1.5207276715785374, + "learning_rate": 4.4618053734179865e-05, + "loss": 0.426, + "step": 7975 + }, + { + "epoch": 0.9458081347088818, + "grad_norm": 1.4089576816611045, + "learning_rate": 4.461656578547055e-05, + "loss": 0.3792, + "step": 7976 + }, + { + "epoch": 0.9459267164710068, + "grad_norm": 1.1704743736236165, + "learning_rate": 4.461507765592056e-05, + "loss": 0.2951, + "step": 7977 + }, + { + "epoch": 0.9460452982331318, + "grad_norm": 1.1165227329779759, + "learning_rate": 4.46135893455436e-05, + "loss": 0.3139, + "step": 7978 + }, + { + "epoch": 0.9461638799952568, + "grad_norm": 1.0189928153408196, + "learning_rate": 4.46121008543534e-05, + "loss": 0.2775, + "step": 7979 + }, + { + "epoch": 0.9462824617573817, + "grad_norm": 1.1659942035214148, + "learning_rate": 4.4610612182363675e-05, + "loss": 0.3271, + "step": 7980 + }, + { + "epoch": 0.9464010435195067, + "grad_norm": 1.3539645741259374, + "learning_rate": 4.460912332958816e-05, + "loss": 0.4138, + "step": 7981 + }, + { + "epoch": 0.9465196252816317, + "grad_norm": 1.2452454810924032, + "learning_rate": 4.460763429604057e-05, + "loss": 0.3684, + "step": 7982 + }, + { + "epoch": 0.9466382070437567, + "grad_norm": 1.19627613211601, + "learning_rate": 4.4606145081734644e-05, + "loss": 0.3264, + "step": 7983 + }, + { + "epoch": 0.9467567888058817, + "grad_norm": 1.6222523164440736, + "learning_rate": 4.4604655686684095e-05, + "loss": 0.5196, + "step": 7984 + }, + { + "epoch": 0.9468753705680066, + "grad_norm": 1.8338982544227187, + "learning_rate": 4.460316611090266e-05, + "loss": 0.4368, + "step": 7985 + }, + { + "epoch": 0.9469939523301316, + "grad_norm": 1.235964315610443, + "learning_rate": 4.460167635440408e-05, + "loss": 0.3448, + "step": 7986 + }, + { + "epoch": 0.9471125340922566, + "grad_norm": 1.1664407967539072, + "learning_rate": 4.4600186417202075e-05, + "loss": 0.2947, + "step": 7987 + }, + { + "epoch": 0.9472311158543816, + "grad_norm": 1.2098949845788092, + "learning_rate": 4.4598696299310394e-05, + "loss": 0.3396, + "step": 7988 + }, + { + "epoch": 0.9473496976165066, + "grad_norm": 1.0223607314559817, + "learning_rate": 4.4597206000742763e-05, + "loss": 0.2021, + "step": 7989 + }, + { + "epoch": 0.9474682793786315, + "grad_norm": 1.2822612013548682, + "learning_rate": 4.459571552151292e-05, + "loss": 0.3637, + "step": 7990 + }, + { + "epoch": 0.9475868611407565, + "grad_norm": 1.0786323328616567, + "learning_rate": 4.4594224861634616e-05, + "loss": 0.2747, + "step": 7991 + }, + { + "epoch": 0.9477054429028815, + "grad_norm": 1.4711053511842929, + "learning_rate": 4.4592734021121584e-05, + "loss": 0.3975, + "step": 7992 + }, + { + "epoch": 0.9478240246650065, + "grad_norm": 1.4899548444957251, + "learning_rate": 4.459124299998757e-05, + "loss": 0.4031, + "step": 7993 + }, + { + "epoch": 0.9479426064271315, + "grad_norm": 1.251338062631199, + "learning_rate": 4.458975179824632e-05, + "loss": 0.4014, + "step": 7994 + }, + { + "epoch": 0.9480611881892564, + "grad_norm": 1.2872533830052764, + "learning_rate": 4.458826041591158e-05, + "loss": 0.251, + "step": 7995 + }, + { + "epoch": 0.9481797699513814, + "grad_norm": 1.195781829172686, + "learning_rate": 4.458676885299711e-05, + "loss": 0.2668, + "step": 7996 + }, + { + "epoch": 0.9482983517135065, + "grad_norm": 1.2430773268634088, + "learning_rate": 4.458527710951663e-05, + "loss": 0.318, + "step": 7997 + }, + { + "epoch": 0.9484169334756315, + "grad_norm": 1.637819629720816, + "learning_rate": 4.458378518548393e-05, + "loss": 0.5257, + "step": 7998 + }, + { + "epoch": 0.9485355152377565, + "grad_norm": 1.1604200159891602, + "learning_rate": 4.458229308091274e-05, + "loss": 0.2454, + "step": 7999 + }, + { + "epoch": 0.9486540969998815, + "grad_norm": 1.720486806738014, + "learning_rate": 4.4580800795816814e-05, + "loss": 0.4358, + "step": 8000 + }, + { + "epoch": 0.9487726787620064, + "grad_norm": 2.274460163138203, + "learning_rate": 4.457930833020992e-05, + "loss": 0.6192, + "step": 8001 + }, + { + "epoch": 0.9488912605241314, + "grad_norm": 1.2245046785730396, + "learning_rate": 4.4577815684105814e-05, + "loss": 0.2759, + "step": 8002 + }, + { + "epoch": 0.9490098422862564, + "grad_norm": 1.3312692981864966, + "learning_rate": 4.457632285751826e-05, + "loss": 0.3275, + "step": 8003 + }, + { + "epoch": 0.9491284240483814, + "grad_norm": 1.3202877879297386, + "learning_rate": 4.4574829850461017e-05, + "loss": 0.4959, + "step": 8004 + }, + { + "epoch": 0.9492470058105064, + "grad_norm": 1.5399677169971753, + "learning_rate": 4.457333666294784e-05, + "loss": 0.4554, + "step": 8005 + }, + { + "epoch": 0.9493655875726313, + "grad_norm": 1.7444500829184237, + "learning_rate": 4.45718432949925e-05, + "loss": 0.5667, + "step": 8006 + }, + { + "epoch": 0.9494841693347563, + "grad_norm": 1.6085755538928557, + "learning_rate": 4.457034974660877e-05, + "loss": 0.5003, + "step": 8007 + }, + { + "epoch": 0.9496027510968813, + "grad_norm": 1.459773056167545, + "learning_rate": 4.456885601781041e-05, + "loss": 0.4111, + "step": 8008 + }, + { + "epoch": 0.9497213328590063, + "grad_norm": 1.0352725219581451, + "learning_rate": 4.45673621086112e-05, + "loss": 0.3308, + "step": 8009 + }, + { + "epoch": 0.9498399146211313, + "grad_norm": 1.1345589257293933, + "learning_rate": 4.45658680190249e-05, + "loss": 0.2684, + "step": 8010 + }, + { + "epoch": 0.9499584963832562, + "grad_norm": 1.4336598903907307, + "learning_rate": 4.45643737490653e-05, + "loss": 0.4025, + "step": 8011 + }, + { + "epoch": 0.9500770781453812, + "grad_norm": 0.8289305175252019, + "learning_rate": 4.4562879298746165e-05, + "loss": 0.2475, + "step": 8012 + }, + { + "epoch": 0.9501956599075062, + "grad_norm": 1.0358434516792998, + "learning_rate": 4.4561384668081265e-05, + "loss": 0.2642, + "step": 8013 + }, + { + "epoch": 0.9503142416696312, + "grad_norm": 1.0190059513705128, + "learning_rate": 4.455988985708438e-05, + "loss": 0.2709, + "step": 8014 + }, + { + "epoch": 0.9504328234317562, + "grad_norm": 1.264418083727074, + "learning_rate": 4.45583948657693e-05, + "loss": 0.4059, + "step": 8015 + }, + { + "epoch": 0.9505514051938811, + "grad_norm": 1.571236078747199, + "learning_rate": 4.455689969414982e-05, + "loss": 0.4957, + "step": 8016 + }, + { + "epoch": 0.9506699869560061, + "grad_norm": 1.3738764560336314, + "learning_rate": 4.455540434223969e-05, + "loss": 0.4545, + "step": 8017 + }, + { + "epoch": 0.9507885687181311, + "grad_norm": 0.9581658720322292, + "learning_rate": 4.455390881005272e-05, + "loss": 0.2882, + "step": 8018 + }, + { + "epoch": 0.9509071504802561, + "grad_norm": 1.062481471360761, + "learning_rate": 4.4552413097602684e-05, + "loss": 0.3373, + "step": 8019 + }, + { + "epoch": 0.9510257322423811, + "grad_norm": 1.6575485901743823, + "learning_rate": 4.455091720490338e-05, + "loss": 0.5094, + "step": 8020 + }, + { + "epoch": 0.951144314004506, + "grad_norm": 1.186919668441879, + "learning_rate": 4.4549421131968595e-05, + "loss": 0.308, + "step": 8021 + }, + { + "epoch": 0.9512628957666311, + "grad_norm": 1.447026596795619, + "learning_rate": 4.454792487881212e-05, + "loss": 0.3831, + "step": 8022 + }, + { + "epoch": 0.9513814775287561, + "grad_norm": 1.6912972492309262, + "learning_rate": 4.454642844544774e-05, + "loss": 0.535, + "step": 8023 + }, + { + "epoch": 0.9515000592908811, + "grad_norm": 1.2166015554651435, + "learning_rate": 4.454493183188927e-05, + "loss": 0.3196, + "step": 8024 + }, + { + "epoch": 0.9516186410530061, + "grad_norm": 1.3215703094254, + "learning_rate": 4.454343503815049e-05, + "loss": 0.3335, + "step": 8025 + }, + { + "epoch": 0.9517372228151311, + "grad_norm": 1.5619485374557895, + "learning_rate": 4.454193806424521e-05, + "loss": 0.4424, + "step": 8026 + }, + { + "epoch": 0.951855804577256, + "grad_norm": 1.3594956119314252, + "learning_rate": 4.454044091018722e-05, + "loss": 0.3284, + "step": 8027 + }, + { + "epoch": 0.951974386339381, + "grad_norm": 1.1236233546296095, + "learning_rate": 4.453894357599033e-05, + "loss": 0.2971, + "step": 8028 + }, + { + "epoch": 0.952092968101506, + "grad_norm": 1.0960494765237532, + "learning_rate": 4.453744606166834e-05, + "loss": 0.2601, + "step": 8029 + }, + { + "epoch": 0.952211549863631, + "grad_norm": 1.3952580742142333, + "learning_rate": 4.453594836723505e-05, + "loss": 0.3329, + "step": 8030 + }, + { + "epoch": 0.952330131625756, + "grad_norm": 1.222815159519293, + "learning_rate": 4.453445049270428e-05, + "loss": 0.3059, + "step": 8031 + }, + { + "epoch": 0.952448713387881, + "grad_norm": 1.362012167741883, + "learning_rate": 4.453295243808983e-05, + "loss": 0.2748, + "step": 8032 + }, + { + "epoch": 0.9525672951500059, + "grad_norm": 1.2150285777074779, + "learning_rate": 4.453145420340551e-05, + "loss": 0.2689, + "step": 8033 + }, + { + "epoch": 0.9526858769121309, + "grad_norm": 1.2855838883865331, + "learning_rate": 4.452995578866513e-05, + "loss": 0.27, + "step": 8034 + }, + { + "epoch": 0.9528044586742559, + "grad_norm": 1.207240116834465, + "learning_rate": 4.452845719388251e-05, + "loss": 0.348, + "step": 8035 + }, + { + "epoch": 0.9529230404363809, + "grad_norm": 1.976037561523375, + "learning_rate": 4.452695841907146e-05, + "loss": 0.4221, + "step": 8036 + }, + { + "epoch": 0.9530416221985059, + "grad_norm": 1.0882559818571111, + "learning_rate": 4.452545946424581e-05, + "loss": 0.2842, + "step": 8037 + }, + { + "epoch": 0.9531602039606308, + "grad_norm": 1.232931717864638, + "learning_rate": 4.452396032941935e-05, + "loss": 0.2973, + "step": 8038 + }, + { + "epoch": 0.9532787857227558, + "grad_norm": 1.1271075117335136, + "learning_rate": 4.452246101460593e-05, + "loss": 0.2876, + "step": 8039 + }, + { + "epoch": 0.9533973674848808, + "grad_norm": 1.5444116758723774, + "learning_rate": 4.452096151981936e-05, + "loss": 0.3832, + "step": 8040 + }, + { + "epoch": 0.9535159492470058, + "grad_norm": 0.9664276374759817, + "learning_rate": 4.4519461845073455e-05, + "loss": 0.1938, + "step": 8041 + }, + { + "epoch": 0.9536345310091308, + "grad_norm": 1.3147803835707417, + "learning_rate": 4.451796199038205e-05, + "loss": 0.4125, + "step": 8042 + }, + { + "epoch": 0.9537531127712557, + "grad_norm": 1.3761153329754388, + "learning_rate": 4.451646195575898e-05, + "loss": 0.3475, + "step": 8043 + }, + { + "epoch": 0.9538716945333807, + "grad_norm": 1.2878342541408505, + "learning_rate": 4.451496174121805e-05, + "loss": 0.3174, + "step": 8044 + }, + { + "epoch": 0.9539902762955057, + "grad_norm": 0.8866667723474136, + "learning_rate": 4.451346134677311e-05, + "loss": 0.2203, + "step": 8045 + }, + { + "epoch": 0.9541088580576307, + "grad_norm": 1.629654219561143, + "learning_rate": 4.451196077243798e-05, + "loss": 0.4417, + "step": 8046 + }, + { + "epoch": 0.9542274398197558, + "grad_norm": 1.4519373801476008, + "learning_rate": 4.4510460018226507e-05, + "loss": 0.4393, + "step": 8047 + }, + { + "epoch": 0.9543460215818808, + "grad_norm": 1.6852260665064158, + "learning_rate": 4.450895908415251e-05, + "loss": 0.3636, + "step": 8048 + }, + { + "epoch": 0.9544646033440057, + "grad_norm": 1.2445821204015943, + "learning_rate": 4.450745797022984e-05, + "loss": 0.3764, + "step": 8049 + }, + { + "epoch": 0.9545831851061307, + "grad_norm": 1.5855886874560812, + "learning_rate": 4.450595667647233e-05, + "loss": 0.4117, + "step": 8050 + }, + { + "epoch": 0.9547017668682557, + "grad_norm": 1.2995239847937021, + "learning_rate": 4.450445520289381e-05, + "loss": 0.375, + "step": 8051 + }, + { + "epoch": 0.9548203486303807, + "grad_norm": 1.3887038252275121, + "learning_rate": 4.450295354950814e-05, + "loss": 0.3991, + "step": 8052 + }, + { + "epoch": 0.9549389303925057, + "grad_norm": 1.2866676184614656, + "learning_rate": 4.450145171632915e-05, + "loss": 0.3988, + "step": 8053 + }, + { + "epoch": 0.9550575121546306, + "grad_norm": 1.0383869810289579, + "learning_rate": 4.449994970337069e-05, + "loss": 0.298, + "step": 8054 + }, + { + "epoch": 0.9551760939167556, + "grad_norm": 1.1664270313904719, + "learning_rate": 4.4498447510646615e-05, + "loss": 0.2781, + "step": 8055 + }, + { + "epoch": 0.9552946756788806, + "grad_norm": 1.3295822960354065, + "learning_rate": 4.449694513817075e-05, + "loss": 0.2839, + "step": 8056 + }, + { + "epoch": 0.9554132574410056, + "grad_norm": 1.0728539216328117, + "learning_rate": 4.449544258595697e-05, + "loss": 0.2943, + "step": 8057 + }, + { + "epoch": 0.9555318392031306, + "grad_norm": 3.8834231896939686, + "learning_rate": 4.449393985401911e-05, + "loss": 0.2671, + "step": 8058 + }, + { + "epoch": 0.9556504209652555, + "grad_norm": 1.592659370679597, + "learning_rate": 4.449243694237103e-05, + "loss": 0.4821, + "step": 8059 + }, + { + "epoch": 0.9557690027273805, + "grad_norm": 1.4868973791137006, + "learning_rate": 4.449093385102659e-05, + "loss": 0.4547, + "step": 8060 + }, + { + "epoch": 0.9558875844895055, + "grad_norm": 1.157942205419363, + "learning_rate": 4.4489430579999634e-05, + "loss": 0.3171, + "step": 8061 + }, + { + "epoch": 0.9560061662516305, + "grad_norm": 1.6566377794557878, + "learning_rate": 4.448792712930404e-05, + "loss": 0.455, + "step": 8062 + }, + { + "epoch": 0.9561247480137555, + "grad_norm": 1.830326292639886, + "learning_rate": 4.448642349895364e-05, + "loss": 0.5038, + "step": 8063 + }, + { + "epoch": 0.9562433297758804, + "grad_norm": 1.303351828879128, + "learning_rate": 4.448491968896232e-05, + "loss": 0.3944, + "step": 8064 + }, + { + "epoch": 0.9563619115380054, + "grad_norm": 1.4179668729976078, + "learning_rate": 4.448341569934393e-05, + "loss": 0.4467, + "step": 8065 + }, + { + "epoch": 0.9564804933001304, + "grad_norm": 2.635737760184117, + "learning_rate": 4.448191153011234e-05, + "loss": 0.4645, + "step": 8066 + }, + { + "epoch": 0.9565990750622554, + "grad_norm": 1.1044657407700522, + "learning_rate": 4.448040718128142e-05, + "loss": 0.3475, + "step": 8067 + }, + { + "epoch": 0.9567176568243804, + "grad_norm": 1.5548963917429848, + "learning_rate": 4.4478902652865034e-05, + "loss": 0.4924, + "step": 8068 + }, + { + "epoch": 0.9568362385865053, + "grad_norm": 1.5749362287182267, + "learning_rate": 4.447739794487705e-05, + "loss": 0.5497, + "step": 8069 + }, + { + "epoch": 0.9569548203486303, + "grad_norm": 1.0655730865525967, + "learning_rate": 4.4475893057331343e-05, + "loss": 0.3746, + "step": 8070 + }, + { + "epoch": 0.9570734021107553, + "grad_norm": 1.1173530949868353, + "learning_rate": 4.4474387990241783e-05, + "loss": 0.2728, + "step": 8071 + }, + { + "epoch": 0.9571919838728804, + "grad_norm": 1.6784232313488763, + "learning_rate": 4.447288274362225e-05, + "loss": 0.6521, + "step": 8072 + }, + { + "epoch": 0.9573105656350054, + "grad_norm": 1.360226467524376, + "learning_rate": 4.447137731748661e-05, + "loss": 0.3514, + "step": 8073 + }, + { + "epoch": 0.9574291473971304, + "grad_norm": 0.9404683682160336, + "learning_rate": 4.446987171184875e-05, + "loss": 0.2472, + "step": 8074 + }, + { + "epoch": 0.9575477291592553, + "grad_norm": 1.0707695114687983, + "learning_rate": 4.446836592672255e-05, + "loss": 0.3252, + "step": 8075 + }, + { + "epoch": 0.9576663109213803, + "grad_norm": 1.368796861831619, + "learning_rate": 4.4466859962121896e-05, + "loss": 0.4068, + "step": 8076 + }, + { + "epoch": 0.9577848926835053, + "grad_norm": 1.0747107215480836, + "learning_rate": 4.446535381806066e-05, + "loss": 0.2278, + "step": 8077 + }, + { + "epoch": 0.9579034744456303, + "grad_norm": 1.34691718125261, + "learning_rate": 4.446384749455274e-05, + "loss": 0.3826, + "step": 8078 + }, + { + "epoch": 0.9580220562077553, + "grad_norm": 1.6579510358080132, + "learning_rate": 4.4462340991612004e-05, + "loss": 0.3867, + "step": 8079 + }, + { + "epoch": 0.9581406379698802, + "grad_norm": 1.266088452291217, + "learning_rate": 4.446083430925235e-05, + "loss": 0.3902, + "step": 8080 + }, + { + "epoch": 0.9582592197320052, + "grad_norm": 1.5800724406047364, + "learning_rate": 4.445932744748767e-05, + "loss": 0.358, + "step": 8081 + }, + { + "epoch": 0.9583778014941302, + "grad_norm": 0.930351579158421, + "learning_rate": 4.4457820406331864e-05, + "loss": 0.2522, + "step": 8082 + }, + { + "epoch": 0.9584963832562552, + "grad_norm": 1.7147880752328326, + "learning_rate": 4.4456313185798803e-05, + "loss": 0.4437, + "step": 8083 + }, + { + "epoch": 0.9586149650183802, + "grad_norm": 1.0164223936533918, + "learning_rate": 4.445480578590239e-05, + "loss": 0.3444, + "step": 8084 + }, + { + "epoch": 0.9587335467805052, + "grad_norm": 1.4095027425860642, + "learning_rate": 4.4453298206656535e-05, + "loss": 0.4074, + "step": 8085 + }, + { + "epoch": 0.9588521285426301, + "grad_norm": 1.5968720502712217, + "learning_rate": 4.445179044807512e-05, + "loss": 0.3697, + "step": 8086 + }, + { + "epoch": 0.9589707103047551, + "grad_norm": 1.2775421103105544, + "learning_rate": 4.445028251017205e-05, + "loss": 0.3318, + "step": 8087 + }, + { + "epoch": 0.9590892920668801, + "grad_norm": 1.2787958473073109, + "learning_rate": 4.4448774392961226e-05, + "loss": 0.3619, + "step": 8088 + }, + { + "epoch": 0.9592078738290051, + "grad_norm": 1.5330875908652577, + "learning_rate": 4.444726609645656e-05, + "loss": 0.3618, + "step": 8089 + }, + { + "epoch": 0.9593264555911301, + "grad_norm": 1.243347735894414, + "learning_rate": 4.444575762067194e-05, + "loss": 0.2987, + "step": 8090 + }, + { + "epoch": 0.959445037353255, + "grad_norm": 1.6740693188633327, + "learning_rate": 4.444424896562128e-05, + "loss": 0.4515, + "step": 8091 + }, + { + "epoch": 0.95956361911538, + "grad_norm": 0.9688552680963481, + "learning_rate": 4.4442740131318496e-05, + "loss": 0.2767, + "step": 8092 + }, + { + "epoch": 0.959682200877505, + "grad_norm": 1.2008829208974772, + "learning_rate": 4.444123111777748e-05, + "loss": 0.3455, + "step": 8093 + }, + { + "epoch": 0.95980078263963, + "grad_norm": 1.153830083536485, + "learning_rate": 4.443972192501217e-05, + "loss": 0.2359, + "step": 8094 + }, + { + "epoch": 0.959919364401755, + "grad_norm": 1.1596798150145227, + "learning_rate": 4.443821255303645e-05, + "loss": 0.2659, + "step": 8095 + }, + { + "epoch": 0.9600379461638799, + "grad_norm": 1.155618774065739, + "learning_rate": 4.443670300186425e-05, + "loss": 0.3263, + "step": 8096 + }, + { + "epoch": 0.960156527926005, + "grad_norm": 1.3068599972133255, + "learning_rate": 4.443519327150948e-05, + "loss": 0.3976, + "step": 8097 + }, + { + "epoch": 0.96027510968813, + "grad_norm": 1.013263370286823, + "learning_rate": 4.443368336198607e-05, + "loss": 0.3024, + "step": 8098 + }, + { + "epoch": 0.960393691450255, + "grad_norm": 1.177026460800368, + "learning_rate": 4.443217327330792e-05, + "loss": 0.3888, + "step": 8099 + }, + { + "epoch": 0.96051227321238, + "grad_norm": 1.3293424581752964, + "learning_rate": 4.4430663005488957e-05, + "loss": 0.3226, + "step": 8100 + }, + { + "epoch": 0.960630854974505, + "grad_norm": 1.195968863039665, + "learning_rate": 4.442915255854312e-05, + "loss": 0.3837, + "step": 8101 + }, + { + "epoch": 0.9607494367366299, + "grad_norm": 1.3272635881102783, + "learning_rate": 4.442764193248432e-05, + "loss": 0.3465, + "step": 8102 + }, + { + "epoch": 0.9608680184987549, + "grad_norm": 1.077597182339218, + "learning_rate": 4.442613112732649e-05, + "loss": 0.2887, + "step": 8103 + }, + { + "epoch": 0.9609866002608799, + "grad_norm": 1.037463614026974, + "learning_rate": 4.442462014308354e-05, + "loss": 0.2723, + "step": 8104 + }, + { + "epoch": 0.9611051820230049, + "grad_norm": 1.2431475837003096, + "learning_rate": 4.442310897976942e-05, + "loss": 0.3588, + "step": 8105 + }, + { + "epoch": 0.9612237637851299, + "grad_norm": 1.3342722848806092, + "learning_rate": 4.442159763739805e-05, + "loss": 0.4415, + "step": 8106 + }, + { + "epoch": 0.9613423455472548, + "grad_norm": 1.8874440134539356, + "learning_rate": 4.4420086115983375e-05, + "loss": 0.5666, + "step": 8107 + }, + { + "epoch": 0.9614609273093798, + "grad_norm": 1.0096441313942262, + "learning_rate": 4.441857441553932e-05, + "loss": 0.2771, + "step": 8108 + }, + { + "epoch": 0.9615795090715048, + "grad_norm": 1.0963651312820257, + "learning_rate": 4.4417062536079815e-05, + "loss": 0.2775, + "step": 8109 + }, + { + "epoch": 0.9616980908336298, + "grad_norm": 1.4486858102987352, + "learning_rate": 4.44155504776188e-05, + "loss": 0.356, + "step": 8110 + }, + { + "epoch": 0.9618166725957548, + "grad_norm": 1.2102181480741514, + "learning_rate": 4.4414038240170225e-05, + "loss": 0.4111, + "step": 8111 + }, + { + "epoch": 0.9619352543578797, + "grad_norm": 0.9538152790884604, + "learning_rate": 4.441252582374802e-05, + "loss": 0.2945, + "step": 8112 + }, + { + "epoch": 0.9620538361200047, + "grad_norm": 0.9847115894546697, + "learning_rate": 4.441101322836614e-05, + "loss": 0.2325, + "step": 8113 + }, + { + "epoch": 0.9621724178821297, + "grad_norm": 0.9949457091988496, + "learning_rate": 4.440950045403851e-05, + "loss": 0.3063, + "step": 8114 + }, + { + "epoch": 0.9622909996442547, + "grad_norm": 1.2777596830826339, + "learning_rate": 4.4407987500779096e-05, + "loss": 0.2634, + "step": 8115 + }, + { + "epoch": 0.9624095814063797, + "grad_norm": 1.3962735110854212, + "learning_rate": 4.4406474368601835e-05, + "loss": 0.3875, + "step": 8116 + }, + { + "epoch": 0.9625281631685046, + "grad_norm": 1.416522443703208, + "learning_rate": 4.440496105752068e-05, + "loss": 0.2966, + "step": 8117 + }, + { + "epoch": 0.9626467449306296, + "grad_norm": 1.1606800810437023, + "learning_rate": 4.440344756754958e-05, + "loss": 0.268, + "step": 8118 + }, + { + "epoch": 0.9627653266927546, + "grad_norm": 1.1209591625658175, + "learning_rate": 4.440193389870249e-05, + "loss": 0.3025, + "step": 8119 + }, + { + "epoch": 0.9628839084548796, + "grad_norm": 1.463136962747462, + "learning_rate": 4.440042005099335e-05, + "loss": 0.3809, + "step": 8120 + }, + { + "epoch": 0.9630024902170046, + "grad_norm": 1.6301401110139413, + "learning_rate": 4.439890602443614e-05, + "loss": 0.401, + "step": 8121 + }, + { + "epoch": 0.9631210719791297, + "grad_norm": 1.3878120382227506, + "learning_rate": 4.4397391819044795e-05, + "loss": 0.3625, + "step": 8122 + }, + { + "epoch": 0.9632396537412546, + "grad_norm": 1.7591390657824497, + "learning_rate": 4.439587743483329e-05, + "loss": 0.5031, + "step": 8123 + }, + { + "epoch": 0.9633582355033796, + "grad_norm": 1.3904886805795058, + "learning_rate": 4.439436287181558e-05, + "loss": 0.429, + "step": 8124 + }, + { + "epoch": 0.9634768172655046, + "grad_norm": 1.1540343421284536, + "learning_rate": 4.439284813000563e-05, + "loss": 0.3056, + "step": 8125 + }, + { + "epoch": 0.9635953990276296, + "grad_norm": 1.3151689253188135, + "learning_rate": 4.43913332094174e-05, + "loss": 0.3113, + "step": 8126 + }, + { + "epoch": 0.9637139807897546, + "grad_norm": 1.1899261058339594, + "learning_rate": 4.4389818110064846e-05, + "loss": 0.3188, + "step": 8127 + }, + { + "epoch": 0.9638325625518795, + "grad_norm": 1.235129911718782, + "learning_rate": 4.4388302831961956e-05, + "loss": 0.2627, + "step": 8128 + }, + { + "epoch": 0.9639511443140045, + "grad_norm": 0.9892017396639945, + "learning_rate": 4.438678737512269e-05, + "loss": 0.2591, + "step": 8129 + }, + { + "epoch": 0.9640697260761295, + "grad_norm": 1.612438679686364, + "learning_rate": 4.438527173956101e-05, + "loss": 0.5128, + "step": 8130 + }, + { + "epoch": 0.9641883078382545, + "grad_norm": 1.3890739605982219, + "learning_rate": 4.43837559252909e-05, + "loss": 0.3803, + "step": 8131 + }, + { + "epoch": 0.9643068896003795, + "grad_norm": 1.1976421045171879, + "learning_rate": 4.438223993232634e-05, + "loss": 0.291, + "step": 8132 + }, + { + "epoch": 0.9644254713625044, + "grad_norm": 1.2402878385418235, + "learning_rate": 4.438072376068129e-05, + "loss": 0.4378, + "step": 8133 + }, + { + "epoch": 0.9645440531246294, + "grad_norm": 1.1393739353235415, + "learning_rate": 4.4379207410369725e-05, + "loss": 0.2987, + "step": 8134 + }, + { + "epoch": 0.9646626348867544, + "grad_norm": 1.5372456120979854, + "learning_rate": 4.4377690881405646e-05, + "loss": 0.516, + "step": 8135 + }, + { + "epoch": 0.9647812166488794, + "grad_norm": 1.257998111986321, + "learning_rate": 4.437617417380301e-05, + "loss": 0.3824, + "step": 8136 + }, + { + "epoch": 0.9648997984110044, + "grad_norm": 1.8242940507519305, + "learning_rate": 4.4374657287575807e-05, + "loss": 0.5142, + "step": 8137 + }, + { + "epoch": 0.9650183801731294, + "grad_norm": 1.2009205796839926, + "learning_rate": 4.4373140222738027e-05, + "loss": 0.2375, + "step": 8138 + }, + { + "epoch": 0.9651369619352543, + "grad_norm": 1.3957704949994112, + "learning_rate": 4.437162297930365e-05, + "loss": 0.3383, + "step": 8139 + }, + { + "epoch": 0.9652555436973793, + "grad_norm": 1.0481689531167053, + "learning_rate": 4.437010555728667e-05, + "loss": 0.2921, + "step": 8140 + }, + { + "epoch": 0.9653741254595043, + "grad_norm": 1.0674281857959564, + "learning_rate": 4.436858795670106e-05, + "loss": 0.2875, + "step": 8141 + }, + { + "epoch": 0.9654927072216293, + "grad_norm": 1.2253518955248046, + "learning_rate": 4.436707017756083e-05, + "loss": 0.2562, + "step": 8142 + }, + { + "epoch": 0.9656112889837543, + "grad_norm": 1.1128549873415263, + "learning_rate": 4.436555221987996e-05, + "loss": 0.2745, + "step": 8143 + }, + { + "epoch": 0.9657298707458792, + "grad_norm": 1.1623819644161728, + "learning_rate": 4.436403408367243e-05, + "loss": 0.3094, + "step": 8144 + }, + { + "epoch": 0.9658484525080042, + "grad_norm": 1.2204119488892813, + "learning_rate": 4.436251576895227e-05, + "loss": 0.3275, + "step": 8145 + }, + { + "epoch": 0.9659670342701292, + "grad_norm": 1.9673665393571227, + "learning_rate": 4.436099727573346e-05, + "loss": 0.5316, + "step": 8146 + }, + { + "epoch": 0.9660856160322543, + "grad_norm": 0.9989171554370674, + "learning_rate": 4.435947860402999e-05, + "loss": 0.267, + "step": 8147 + }, + { + "epoch": 0.9662041977943793, + "grad_norm": 1.635825437099239, + "learning_rate": 4.4357959753855866e-05, + "loss": 0.4859, + "step": 8148 + }, + { + "epoch": 0.9663227795565043, + "grad_norm": 1.4734499987039664, + "learning_rate": 4.4356440725225094e-05, + "loss": 0.3534, + "step": 8149 + }, + { + "epoch": 0.9664413613186292, + "grad_norm": 1.0632644293957512, + "learning_rate": 4.435492151815168e-05, + "loss": 0.2979, + "step": 8150 + }, + { + "epoch": 0.9665599430807542, + "grad_norm": 1.3189364320380723, + "learning_rate": 4.4353402132649615e-05, + "loss": 0.3005, + "step": 8151 + }, + { + "epoch": 0.9666785248428792, + "grad_norm": 1.0159099655762984, + "learning_rate": 4.4351882568732916e-05, + "loss": 0.2753, + "step": 8152 + }, + { + "epoch": 0.9667971066050042, + "grad_norm": 1.0748361457002256, + "learning_rate": 4.4350362826415594e-05, + "loss": 0.2713, + "step": 8153 + }, + { + "epoch": 0.9669156883671292, + "grad_norm": 1.256553425006719, + "learning_rate": 4.4348842905711656e-05, + "loss": 0.3459, + "step": 8154 + }, + { + "epoch": 0.9670342701292541, + "grad_norm": 1.493076337103224, + "learning_rate": 4.4347322806635115e-05, + "loss": 0.3048, + "step": 8155 + }, + { + "epoch": 0.9671528518913791, + "grad_norm": 1.1247932360153043, + "learning_rate": 4.434580252919998e-05, + "loss": 0.3384, + "step": 8156 + }, + { + "epoch": 0.9672714336535041, + "grad_norm": 1.6417396546811136, + "learning_rate": 4.434428207342027e-05, + "loss": 0.5175, + "step": 8157 + }, + { + "epoch": 0.9673900154156291, + "grad_norm": 1.2887541285621864, + "learning_rate": 4.434276143931e-05, + "loss": 0.315, + "step": 8158 + }, + { + "epoch": 0.9675085971777541, + "grad_norm": 1.0724426456594587, + "learning_rate": 4.434124062688319e-05, + "loss": 0.2762, + "step": 8159 + }, + { + "epoch": 0.967627178939879, + "grad_norm": 1.269726237926335, + "learning_rate": 4.433971963615386e-05, + "loss": 0.3186, + "step": 8160 + }, + { + "epoch": 0.967745760702004, + "grad_norm": 1.5213560838382587, + "learning_rate": 4.433819846713603e-05, + "loss": 0.3389, + "step": 8161 + }, + { + "epoch": 0.967864342464129, + "grad_norm": 1.4235110474144252, + "learning_rate": 4.4336677119843726e-05, + "loss": 0.3686, + "step": 8162 + }, + { + "epoch": 0.967982924226254, + "grad_norm": 2.0779846240463296, + "learning_rate": 4.4335155594290966e-05, + "loss": 0.4383, + "step": 8163 + }, + { + "epoch": 0.968101505988379, + "grad_norm": 1.3051941282844637, + "learning_rate": 4.433363389049179e-05, + "loss": 0.3209, + "step": 8164 + }, + { + "epoch": 0.9682200877505039, + "grad_norm": 1.1817107583731892, + "learning_rate": 4.433211200846021e-05, + "loss": 0.3527, + "step": 8165 + }, + { + "epoch": 0.9683386695126289, + "grad_norm": 1.37474012577105, + "learning_rate": 4.433058994821027e-05, + "loss": 0.4202, + "step": 8166 + }, + { + "epoch": 0.9684572512747539, + "grad_norm": 1.4037019268230604, + "learning_rate": 4.432906770975599e-05, + "loss": 0.4617, + "step": 8167 + }, + { + "epoch": 0.9685758330368789, + "grad_norm": 1.2809090366264535, + "learning_rate": 4.43275452931114e-05, + "loss": 0.3073, + "step": 8168 + }, + { + "epoch": 0.9686944147990039, + "grad_norm": 1.3158900853300421, + "learning_rate": 4.432602269829056e-05, + "loss": 0.2932, + "step": 8169 + }, + { + "epoch": 0.9688129965611288, + "grad_norm": 1.2815179877077714, + "learning_rate": 4.432449992530748e-05, + "loss": 0.3894, + "step": 8170 + }, + { + "epoch": 0.9689315783232538, + "grad_norm": 1.065872399761948, + "learning_rate": 4.432297697417622e-05, + "loss": 0.226, + "step": 8171 + }, + { + "epoch": 0.9690501600853789, + "grad_norm": 1.3683395850497797, + "learning_rate": 4.43214538449108e-05, + "loss": 0.4333, + "step": 8172 + }, + { + "epoch": 0.9691687418475039, + "grad_norm": 0.9104760576518058, + "learning_rate": 4.431993053752527e-05, + "loss": 0.2458, + "step": 8173 + }, + { + "epoch": 0.9692873236096289, + "grad_norm": 1.77293872559727, + "learning_rate": 4.431840705203367e-05, + "loss": 0.5781, + "step": 8174 + }, + { + "epoch": 0.9694059053717539, + "grad_norm": 1.1878893923556353, + "learning_rate": 4.431688338845005e-05, + "loss": 0.3127, + "step": 8175 + }, + { + "epoch": 0.9695244871338788, + "grad_norm": 1.1888864110771686, + "learning_rate": 4.431535954678845e-05, + "loss": 0.361, + "step": 8176 + }, + { + "epoch": 0.9696430688960038, + "grad_norm": 0.9074047903410065, + "learning_rate": 4.4313835527062916e-05, + "loss": 0.2467, + "step": 8177 + }, + { + "epoch": 0.9697616506581288, + "grad_norm": 1.3513410669722292, + "learning_rate": 4.431231132928752e-05, + "loss": 0.4258, + "step": 8178 + }, + { + "epoch": 0.9698802324202538, + "grad_norm": 2.4271383614477706, + "learning_rate": 4.431078695347628e-05, + "loss": 0.4968, + "step": 8179 + }, + { + "epoch": 0.9699988141823788, + "grad_norm": 1.2491309480356787, + "learning_rate": 4.430926239964327e-05, + "loss": 0.428, + "step": 8180 + }, + { + "epoch": 0.9701173959445037, + "grad_norm": 1.7306745460985185, + "learning_rate": 4.4307737667802535e-05, + "loss": 0.5438, + "step": 8181 + }, + { + "epoch": 0.9702359777066287, + "grad_norm": 1.987216902175568, + "learning_rate": 4.4306212757968136e-05, + "loss": 0.4798, + "step": 8182 + }, + { + "epoch": 0.9703545594687537, + "grad_norm": 1.4188095173894528, + "learning_rate": 4.430468767015413e-05, + "loss": 0.3463, + "step": 8183 + }, + { + "epoch": 0.9704731412308787, + "grad_norm": 1.493202920179971, + "learning_rate": 4.430316240437459e-05, + "loss": 0.4963, + "step": 8184 + }, + { + "epoch": 0.9705917229930037, + "grad_norm": 1.6176218137372922, + "learning_rate": 4.4301636960643547e-05, + "loss": 0.5062, + "step": 8185 + }, + { + "epoch": 0.9707103047551287, + "grad_norm": 1.1218699444425575, + "learning_rate": 4.430011133897509e-05, + "loss": 0.3898, + "step": 8186 + }, + { + "epoch": 0.9708288865172536, + "grad_norm": 1.404717250212203, + "learning_rate": 4.429858553938327e-05, + "loss": 0.3283, + "step": 8187 + }, + { + "epoch": 0.9709474682793786, + "grad_norm": 1.2652422308033182, + "learning_rate": 4.429705956188215e-05, + "loss": 0.3631, + "step": 8188 + }, + { + "epoch": 0.9710660500415036, + "grad_norm": 1.0702481052654365, + "learning_rate": 4.429553340648582e-05, + "loss": 0.2792, + "step": 8189 + }, + { + "epoch": 0.9711846318036286, + "grad_norm": 1.3641518937897057, + "learning_rate": 4.429400707320832e-05, + "loss": 0.3565, + "step": 8190 + }, + { + "epoch": 0.9713032135657536, + "grad_norm": 1.7613119523323126, + "learning_rate": 4.4292480562063744e-05, + "loss": 0.5023, + "step": 8191 + }, + { + "epoch": 0.9714217953278785, + "grad_norm": 1.440184518153207, + "learning_rate": 4.429095387306616e-05, + "loss": 0.3049, + "step": 8192 + }, + { + "epoch": 0.9715403770900035, + "grad_norm": 1.4479183717940678, + "learning_rate": 4.428942700622962e-05, + "loss": 0.4287, + "step": 8193 + }, + { + "epoch": 0.9716589588521285, + "grad_norm": 1.6790783285911148, + "learning_rate": 4.4287899961568225e-05, + "loss": 0.4686, + "step": 8194 + }, + { + "epoch": 0.9717775406142535, + "grad_norm": 1.3622350586926437, + "learning_rate": 4.428637273909605e-05, + "loss": 0.3239, + "step": 8195 + }, + { + "epoch": 0.9718961223763786, + "grad_norm": 1.2892139289553168, + "learning_rate": 4.428484533882716e-05, + "loss": 0.313, + "step": 8196 + }, + { + "epoch": 0.9720147041385035, + "grad_norm": 1.3401894177361846, + "learning_rate": 4.4283317760775655e-05, + "loss": 0.313, + "step": 8197 + }, + { + "epoch": 0.9721332859006285, + "grad_norm": 1.6402893736383397, + "learning_rate": 4.4281790004955596e-05, + "loss": 0.4722, + "step": 8198 + }, + { + "epoch": 0.9722518676627535, + "grad_norm": 1.3413166391540747, + "learning_rate": 4.4280262071381086e-05, + "loss": 0.3511, + "step": 8199 + }, + { + "epoch": 0.9723704494248785, + "grad_norm": 1.2495459317850015, + "learning_rate": 4.42787339600662e-05, + "loss": 0.3397, + "step": 8200 + }, + { + "epoch": 0.9724890311870035, + "grad_norm": 1.3937855029562911, + "learning_rate": 4.427720567102503e-05, + "loss": 0.3753, + "step": 8201 + }, + { + "epoch": 0.9726076129491285, + "grad_norm": 0.9350646121531249, + "learning_rate": 4.427567720427166e-05, + "loss": 0.2669, + "step": 8202 + }, + { + "epoch": 0.9727261947112534, + "grad_norm": 1.0361441724849219, + "learning_rate": 4.4274148559820184e-05, + "loss": 0.3553, + "step": 8203 + }, + { + "epoch": 0.9728447764733784, + "grad_norm": 1.541816279716457, + "learning_rate": 4.427261973768469e-05, + "loss": 0.398, + "step": 8204 + }, + { + "epoch": 0.9729633582355034, + "grad_norm": 1.723326328994623, + "learning_rate": 4.4271090737879284e-05, + "loss": 0.4114, + "step": 8205 + }, + { + "epoch": 0.9730819399976284, + "grad_norm": 1.452991294167984, + "learning_rate": 4.426956156041805e-05, + "loss": 0.3887, + "step": 8206 + }, + { + "epoch": 0.9732005217597534, + "grad_norm": 1.0077558517289296, + "learning_rate": 4.426803220531509e-05, + "loss": 0.3006, + "step": 8207 + }, + { + "epoch": 0.9733191035218783, + "grad_norm": 1.2721200412751315, + "learning_rate": 4.42665026725845e-05, + "loss": 0.3163, + "step": 8208 + }, + { + "epoch": 0.9734376852840033, + "grad_norm": 1.1337580601403356, + "learning_rate": 4.426497296224038e-05, + "loss": 0.3492, + "step": 8209 + }, + { + "epoch": 0.9735562670461283, + "grad_norm": 1.076771339224893, + "learning_rate": 4.4263443074296836e-05, + "loss": 0.3269, + "step": 8210 + }, + { + "epoch": 0.9736748488082533, + "grad_norm": 1.0294929896144827, + "learning_rate": 4.426191300876796e-05, + "loss": 0.2766, + "step": 8211 + }, + { + "epoch": 0.9737934305703783, + "grad_norm": 1.2721147737790501, + "learning_rate": 4.4260382765667875e-05, + "loss": 0.5007, + "step": 8212 + }, + { + "epoch": 0.9739120123325032, + "grad_norm": 1.1534284645168305, + "learning_rate": 4.425885234501068e-05, + "loss": 0.2473, + "step": 8213 + }, + { + "epoch": 0.9740305940946282, + "grad_norm": 0.9918955543628654, + "learning_rate": 4.425732174681048e-05, + "loss": 0.2563, + "step": 8214 + }, + { + "epoch": 0.9741491758567532, + "grad_norm": 1.1017125839990123, + "learning_rate": 4.425579097108139e-05, + "loss": 0.2688, + "step": 8215 + }, + { + "epoch": 0.9742677576188782, + "grad_norm": 1.6039275702104385, + "learning_rate": 4.425426001783752e-05, + "loss": 0.3914, + "step": 8216 + }, + { + "epoch": 0.9743863393810032, + "grad_norm": 1.413733879228654, + "learning_rate": 4.4252728887092985e-05, + "loss": 0.3802, + "step": 8217 + }, + { + "epoch": 0.9745049211431281, + "grad_norm": 1.9617133293881306, + "learning_rate": 4.4251197578861895e-05, + "loss": 0.519, + "step": 8218 + }, + { + "epoch": 0.9746235029052531, + "grad_norm": 1.049980515791884, + "learning_rate": 4.4249666093158375e-05, + "loss": 0.2601, + "step": 8219 + }, + { + "epoch": 0.9747420846673781, + "grad_norm": 1.282932316984849, + "learning_rate": 4.424813442999654e-05, + "loss": 0.3163, + "step": 8220 + }, + { + "epoch": 0.9748606664295032, + "grad_norm": 2.089350268423967, + "learning_rate": 4.4246602589390505e-05, + "loss": 0.4115, + "step": 8221 + }, + { + "epoch": 0.9749792481916282, + "grad_norm": 1.3385941295003538, + "learning_rate": 4.42450705713544e-05, + "loss": 0.3306, + "step": 8222 + }, + { + "epoch": 0.9750978299537532, + "grad_norm": 1.6387965707964869, + "learning_rate": 4.424353837590234e-05, + "loss": 0.4756, + "step": 8223 + }, + { + "epoch": 0.9752164117158781, + "grad_norm": 1.3945664719658082, + "learning_rate": 4.4242006003048455e-05, + "loss": 0.45, + "step": 8224 + }, + { + "epoch": 0.9753349934780031, + "grad_norm": 1.3718169850503437, + "learning_rate": 4.424047345280688e-05, + "loss": 0.3953, + "step": 8225 + }, + { + "epoch": 0.9754535752401281, + "grad_norm": 1.490019740919339, + "learning_rate": 4.423894072519173e-05, + "loss": 0.4175, + "step": 8226 + }, + { + "epoch": 0.9755721570022531, + "grad_norm": 1.390351439311583, + "learning_rate": 4.423740782021713e-05, + "loss": 0.4115, + "step": 8227 + }, + { + "epoch": 0.9756907387643781, + "grad_norm": 1.482690610601037, + "learning_rate": 4.423587473789722e-05, + "loss": 0.3932, + "step": 8228 + }, + { + "epoch": 0.975809320526503, + "grad_norm": 0.9740296380707301, + "learning_rate": 4.4234341478246135e-05, + "loss": 0.3463, + "step": 8229 + }, + { + "epoch": 0.975927902288628, + "grad_norm": 1.0893854687165152, + "learning_rate": 4.423280804127802e-05, + "loss": 0.2413, + "step": 8230 + }, + { + "epoch": 0.976046484050753, + "grad_norm": 1.3056862616797247, + "learning_rate": 4.423127442700699e-05, + "loss": 0.4037, + "step": 8231 + }, + { + "epoch": 0.976165065812878, + "grad_norm": 1.5161811346437355, + "learning_rate": 4.422974063544719e-05, + "loss": 0.3927, + "step": 8232 + }, + { + "epoch": 0.976283647575003, + "grad_norm": 1.1830751205590924, + "learning_rate": 4.422820666661276e-05, + "loss": 0.3047, + "step": 8233 + }, + { + "epoch": 0.976402229337128, + "grad_norm": 1.460136211669708, + "learning_rate": 4.422667252051785e-05, + "loss": 0.3145, + "step": 8234 + }, + { + "epoch": 0.9765208110992529, + "grad_norm": 1.3029913160266968, + "learning_rate": 4.422513819717661e-05, + "loss": 0.3256, + "step": 8235 + }, + { + "epoch": 0.9766393928613779, + "grad_norm": 1.3197674250621456, + "learning_rate": 4.422360369660316e-05, + "loss": 0.3718, + "step": 8236 + }, + { + "epoch": 0.9767579746235029, + "grad_norm": 1.819194100709286, + "learning_rate": 4.4222069018811646e-05, + "loss": 0.4408, + "step": 8237 + }, + { + "epoch": 0.9768765563856279, + "grad_norm": 1.1865882030349053, + "learning_rate": 4.422053416381624e-05, + "loss": 0.2861, + "step": 8238 + }, + { + "epoch": 0.9769951381477529, + "grad_norm": 1.378766527501846, + "learning_rate": 4.421899913163108e-05, + "loss": 0.3727, + "step": 8239 + }, + { + "epoch": 0.9771137199098778, + "grad_norm": 1.1750551265831204, + "learning_rate": 4.421746392227031e-05, + "loss": 0.285, + "step": 8240 + }, + { + "epoch": 0.9772323016720028, + "grad_norm": 1.5919439824388095, + "learning_rate": 4.421592853574808e-05, + "loss": 0.4557, + "step": 8241 + }, + { + "epoch": 0.9773508834341278, + "grad_norm": 1.4915030825085096, + "learning_rate": 4.4214392972078564e-05, + "loss": 0.3796, + "step": 8242 + }, + { + "epoch": 0.9774694651962528, + "grad_norm": 1.4856681422083686, + "learning_rate": 4.421285723127591e-05, + "loss": 0.4703, + "step": 8243 + }, + { + "epoch": 0.9775880469583778, + "grad_norm": 1.5037894558235063, + "learning_rate": 4.421132131335427e-05, + "loss": 0.47, + "step": 8244 + }, + { + "epoch": 0.9777066287205027, + "grad_norm": 2.290613850905578, + "learning_rate": 4.42097852183278e-05, + "loss": 0.439, + "step": 8245 + }, + { + "epoch": 0.9778252104826278, + "grad_norm": 1.9440260274002121, + "learning_rate": 4.420824894621068e-05, + "loss": 0.4844, + "step": 8246 + }, + { + "epoch": 0.9779437922447528, + "grad_norm": 1.1735948052483787, + "learning_rate": 4.4206712497017044e-05, + "loss": 0.3585, + "step": 8247 + }, + { + "epoch": 0.9780623740068778, + "grad_norm": 1.4126376662162046, + "learning_rate": 4.4205175870761084e-05, + "loss": 0.4634, + "step": 8248 + }, + { + "epoch": 0.9781809557690028, + "grad_norm": 1.155936304663757, + "learning_rate": 4.420363906745695e-05, + "loss": 0.3024, + "step": 8249 + }, + { + "epoch": 0.9782995375311277, + "grad_norm": 1.3232305058601213, + "learning_rate": 4.4202102087118816e-05, + "loss": 0.3661, + "step": 8250 + }, + { + "epoch": 0.9784181192932527, + "grad_norm": 1.2903021732971605, + "learning_rate": 4.420056492976083e-05, + "loss": 0.4302, + "step": 8251 + }, + { + "epoch": 0.9785367010553777, + "grad_norm": 1.0388218051379199, + "learning_rate": 4.41990275953972e-05, + "loss": 0.2812, + "step": 8252 + }, + { + "epoch": 0.9786552828175027, + "grad_norm": 1.2177193996533395, + "learning_rate": 4.419749008404207e-05, + "loss": 0.3629, + "step": 8253 + }, + { + "epoch": 0.9787738645796277, + "grad_norm": 1.2441246059391018, + "learning_rate": 4.4195952395709626e-05, + "loss": 0.4034, + "step": 8254 + }, + { + "epoch": 0.9788924463417527, + "grad_norm": 1.954026238271305, + "learning_rate": 4.419441453041404e-05, + "loss": 0.6383, + "step": 8255 + }, + { + "epoch": 0.9790110281038776, + "grad_norm": 1.5236650976977129, + "learning_rate": 4.419287648816949e-05, + "loss": 0.3747, + "step": 8256 + }, + { + "epoch": 0.9791296098660026, + "grad_norm": 1.4158456378821245, + "learning_rate": 4.4191338268990155e-05, + "loss": 0.3576, + "step": 8257 + }, + { + "epoch": 0.9792481916281276, + "grad_norm": 1.0445991961790237, + "learning_rate": 4.418979987289021e-05, + "loss": 0.3024, + "step": 8258 + }, + { + "epoch": 0.9793667733902526, + "grad_norm": 1.8982252414516851, + "learning_rate": 4.4188261299883855e-05, + "loss": 0.4167, + "step": 8259 + }, + { + "epoch": 0.9794853551523776, + "grad_norm": 1.7302093215581753, + "learning_rate": 4.418672254998525e-05, + "loss": 0.5263, + "step": 8260 + }, + { + "epoch": 0.9796039369145025, + "grad_norm": 1.3160211013671157, + "learning_rate": 4.4185183623208596e-05, + "loss": 0.3069, + "step": 8261 + }, + { + "epoch": 0.9797225186766275, + "grad_norm": 1.5029790123065305, + "learning_rate": 4.4183644519568074e-05, + "loss": 0.3576, + "step": 8262 + }, + { + "epoch": 0.9798411004387525, + "grad_norm": 1.1714167229553873, + "learning_rate": 4.4182105239077874e-05, + "loss": 0.2556, + "step": 8263 + }, + { + "epoch": 0.9799596822008775, + "grad_norm": 1.3695227954239373, + "learning_rate": 4.4180565781752185e-05, + "loss": 0.3805, + "step": 8264 + }, + { + "epoch": 0.9800782639630025, + "grad_norm": 1.5265943569979792, + "learning_rate": 4.4179026147605204e-05, + "loss": 0.4969, + "step": 8265 + }, + { + "epoch": 0.9801968457251274, + "grad_norm": 1.0496227491883274, + "learning_rate": 4.417748633665112e-05, + "loss": 0.2916, + "step": 8266 + }, + { + "epoch": 0.9803154274872524, + "grad_norm": 1.1745081791655443, + "learning_rate": 4.417594634890413e-05, + "loss": 0.3455, + "step": 8267 + }, + { + "epoch": 0.9804340092493774, + "grad_norm": 1.3660139898043802, + "learning_rate": 4.4174406184378423e-05, + "loss": 0.3067, + "step": 8268 + }, + { + "epoch": 0.9805525910115024, + "grad_norm": 1.4143690696727151, + "learning_rate": 4.417286584308821e-05, + "loss": 0.3741, + "step": 8269 + }, + { + "epoch": 0.9806711727736274, + "grad_norm": 1.6161539479091347, + "learning_rate": 4.417132532504768e-05, + "loss": 0.4053, + "step": 8270 + }, + { + "epoch": 0.9807897545357525, + "grad_norm": 1.9189972634805195, + "learning_rate": 4.4169784630271044e-05, + "loss": 0.4676, + "step": 8271 + }, + { + "epoch": 0.9809083362978774, + "grad_norm": 1.5210829500560445, + "learning_rate": 4.41682437587725e-05, + "loss": 0.4304, + "step": 8272 + }, + { + "epoch": 0.9810269180600024, + "grad_norm": 1.2980712379443737, + "learning_rate": 4.4166702710566253e-05, + "loss": 0.3041, + "step": 8273 + }, + { + "epoch": 0.9811454998221274, + "grad_norm": 1.042771555026388, + "learning_rate": 4.4165161485666515e-05, + "loss": 0.321, + "step": 8274 + }, + { + "epoch": 0.9812640815842524, + "grad_norm": 1.6762945741493627, + "learning_rate": 4.416362008408749e-05, + "loss": 0.3537, + "step": 8275 + }, + { + "epoch": 0.9813826633463774, + "grad_norm": 1.5355305933543786, + "learning_rate": 4.416207850584338e-05, + "loss": 0.5004, + "step": 8276 + }, + { + "epoch": 0.9815012451085023, + "grad_norm": 1.3440459745088584, + "learning_rate": 4.416053675094841e-05, + "loss": 0.3886, + "step": 8277 + }, + { + "epoch": 0.9816198268706273, + "grad_norm": 1.7659723243297478, + "learning_rate": 4.4158994819416785e-05, + "loss": 0.5173, + "step": 8278 + }, + { + "epoch": 0.9817384086327523, + "grad_norm": 0.9384418976409911, + "learning_rate": 4.415745271126273e-05, + "loss": 0.2449, + "step": 8279 + }, + { + "epoch": 0.9818569903948773, + "grad_norm": 1.134335210879764, + "learning_rate": 4.4155910426500445e-05, + "loss": 0.2208, + "step": 8280 + }, + { + "epoch": 0.9819755721570023, + "grad_norm": 1.4023215298284348, + "learning_rate": 4.415436796514416e-05, + "loss": 0.3452, + "step": 8281 + }, + { + "epoch": 0.9820941539191272, + "grad_norm": 1.2038328093017134, + "learning_rate": 4.415282532720809e-05, + "loss": 0.3374, + "step": 8282 + }, + { + "epoch": 0.9822127356812522, + "grad_norm": 1.8541931352399736, + "learning_rate": 4.415128251270646e-05, + "loss": 0.5911, + "step": 8283 + }, + { + "epoch": 0.9823313174433772, + "grad_norm": 1.1325715717383371, + "learning_rate": 4.414973952165349e-05, + "loss": 0.3457, + "step": 8284 + }, + { + "epoch": 0.9824498992055022, + "grad_norm": 1.3175636894067353, + "learning_rate": 4.41481963540634e-05, + "loss": 0.3728, + "step": 8285 + }, + { + "epoch": 0.9825684809676272, + "grad_norm": 1.1846950592231786, + "learning_rate": 4.414665300995042e-05, + "loss": 0.3901, + "step": 8286 + }, + { + "epoch": 0.9826870627297521, + "grad_norm": 1.3406832503286963, + "learning_rate": 4.414510948932878e-05, + "loss": 0.4634, + "step": 8287 + }, + { + "epoch": 0.9828056444918771, + "grad_norm": 1.1666157425659556, + "learning_rate": 4.414356579221271e-05, + "loss": 0.3602, + "step": 8288 + }, + { + "epoch": 0.9829242262540021, + "grad_norm": 1.1461167083692458, + "learning_rate": 4.4142021918616436e-05, + "loss": 0.3556, + "step": 8289 + }, + { + "epoch": 0.9830428080161271, + "grad_norm": 1.3339416297478541, + "learning_rate": 4.41404778685542e-05, + "loss": 0.366, + "step": 8290 + }, + { + "epoch": 0.9831613897782521, + "grad_norm": 1.0992496175637299, + "learning_rate": 4.413893364204022e-05, + "loss": 0.2794, + "step": 8291 + }, + { + "epoch": 0.983279971540377, + "grad_norm": 1.0496558124389437, + "learning_rate": 4.413738923908874e-05, + "loss": 0.3312, + "step": 8292 + }, + { + "epoch": 0.983398553302502, + "grad_norm": 1.0041303520195561, + "learning_rate": 4.413584465971401e-05, + "loss": 0.2563, + "step": 8293 + }, + { + "epoch": 0.983517135064627, + "grad_norm": 1.1994406671413391, + "learning_rate": 4.413429990393026e-05, + "loss": 0.3521, + "step": 8294 + }, + { + "epoch": 0.983635716826752, + "grad_norm": 1.1228716758603077, + "learning_rate": 4.4132754971751725e-05, + "loss": 0.2919, + "step": 8295 + }, + { + "epoch": 0.9837542985888771, + "grad_norm": 1.5484890592929543, + "learning_rate": 4.413120986319265e-05, + "loss": 0.5048, + "step": 8296 + }, + { + "epoch": 0.9838728803510021, + "grad_norm": 1.1216547936045358, + "learning_rate": 4.412966457826727e-05, + "loss": 0.3163, + "step": 8297 + }, + { + "epoch": 0.983991462113127, + "grad_norm": 1.216994063474826, + "learning_rate": 4.4128119116989853e-05, + "loss": 0.3122, + "step": 8298 + }, + { + "epoch": 0.984110043875252, + "grad_norm": 1.356051453945428, + "learning_rate": 4.412657347937463e-05, + "loss": 0.3979, + "step": 8299 + }, + { + "epoch": 0.984228625637377, + "grad_norm": 1.5565817440412268, + "learning_rate": 4.4125027665435855e-05, + "loss": 0.4419, + "step": 8300 + }, + { + "epoch": 0.984347207399502, + "grad_norm": 1.2939810403576215, + "learning_rate": 4.412348167518778e-05, + "loss": 0.3484, + "step": 8301 + }, + { + "epoch": 0.984465789161627, + "grad_norm": 1.6675655207334301, + "learning_rate": 4.412193550864465e-05, + "loss": 0.443, + "step": 8302 + }, + { + "epoch": 0.984584370923752, + "grad_norm": 1.1275146104501859, + "learning_rate": 4.412038916582072e-05, + "loss": 0.3197, + "step": 8303 + }, + { + "epoch": 0.9847029526858769, + "grad_norm": 1.4852302003618498, + "learning_rate": 4.4118842646730265e-05, + "loss": 0.4782, + "step": 8304 + }, + { + "epoch": 0.9848215344480019, + "grad_norm": 1.0994196022871845, + "learning_rate": 4.411729595138751e-05, + "loss": 0.2952, + "step": 8305 + }, + { + "epoch": 0.9849401162101269, + "grad_norm": 1.1325394807586397, + "learning_rate": 4.4115749079806735e-05, + "loss": 0.3134, + "step": 8306 + }, + { + "epoch": 0.9850586979722519, + "grad_norm": 1.8362708734907416, + "learning_rate": 4.4114202032002196e-05, + "loss": 0.5704, + "step": 8307 + }, + { + "epoch": 0.9851772797343769, + "grad_norm": 1.1073745324526318, + "learning_rate": 4.411265480798815e-05, + "loss": 0.3177, + "step": 8308 + }, + { + "epoch": 0.9852958614965018, + "grad_norm": 0.9190692121407944, + "learning_rate": 4.411110740777887e-05, + "loss": 0.2273, + "step": 8309 + }, + { + "epoch": 0.9854144432586268, + "grad_norm": 0.9970089836211617, + "learning_rate": 4.410955983138861e-05, + "loss": 0.2746, + "step": 8310 + }, + { + "epoch": 0.9855330250207518, + "grad_norm": 1.0994086346776935, + "learning_rate": 4.410801207883165e-05, + "loss": 0.3126, + "step": 8311 + }, + { + "epoch": 0.9856516067828768, + "grad_norm": 1.0749081274117072, + "learning_rate": 4.410646415012224e-05, + "loss": 0.2873, + "step": 8312 + }, + { + "epoch": 0.9857701885450018, + "grad_norm": 1.005669069933978, + "learning_rate": 4.4104916045274666e-05, + "loss": 0.2925, + "step": 8313 + }, + { + "epoch": 0.9858887703071267, + "grad_norm": 1.327819062712485, + "learning_rate": 4.41033677643032e-05, + "loss": 0.4411, + "step": 8314 + }, + { + "epoch": 0.9860073520692517, + "grad_norm": 1.1203345668524824, + "learning_rate": 4.410181930722209e-05, + "loss": 0.3527, + "step": 8315 + }, + { + "epoch": 0.9861259338313767, + "grad_norm": 1.377269764827712, + "learning_rate": 4.4100270674045644e-05, + "loss": 0.3138, + "step": 8316 + }, + { + "epoch": 0.9862445155935017, + "grad_norm": 1.1932555608511186, + "learning_rate": 4.409872186478812e-05, + "loss": 0.3424, + "step": 8317 + }, + { + "epoch": 0.9863630973556267, + "grad_norm": 1.3516302408045655, + "learning_rate": 4.409717287946381e-05, + "loss": 0.3693, + "step": 8318 + }, + { + "epoch": 0.9864816791177516, + "grad_norm": 1.5059061622839247, + "learning_rate": 4.4095623718086975e-05, + "loss": 0.3634, + "step": 8319 + }, + { + "epoch": 0.9866002608798766, + "grad_norm": 1.52567461120596, + "learning_rate": 4.409407438067191e-05, + "loss": 0.3972, + "step": 8320 + }, + { + "epoch": 0.9867188426420017, + "grad_norm": 1.6358471629146132, + "learning_rate": 4.409252486723289e-05, + "loss": 0.5007, + "step": 8321 + }, + { + "epoch": 0.9868374244041267, + "grad_norm": 1.9357186421675099, + "learning_rate": 4.409097517778421e-05, + "loss": 0.4328, + "step": 8322 + }, + { + "epoch": 0.9869560061662517, + "grad_norm": 1.144977940217153, + "learning_rate": 4.408942531234015e-05, + "loss": 0.2859, + "step": 8323 + }, + { + "epoch": 0.9870745879283767, + "grad_norm": 1.119934173802696, + "learning_rate": 4.408787527091499e-05, + "loss": 0.2795, + "step": 8324 + }, + { + "epoch": 0.9871931696905016, + "grad_norm": 1.2478435845979823, + "learning_rate": 4.4086325053523036e-05, + "loss": 0.3489, + "step": 8325 + }, + { + "epoch": 0.9873117514526266, + "grad_norm": 1.3078614196705987, + "learning_rate": 4.408477466017856e-05, + "loss": 0.3519, + "step": 8326 + }, + { + "epoch": 0.9874303332147516, + "grad_norm": 1.412297391448768, + "learning_rate": 4.408322409089587e-05, + "loss": 0.32, + "step": 8327 + }, + { + "epoch": 0.9875489149768766, + "grad_norm": 1.3341375931682915, + "learning_rate": 4.408167334568926e-05, + "loss": 0.3771, + "step": 8328 + }, + { + "epoch": 0.9876674967390016, + "grad_norm": 1.2051370130038352, + "learning_rate": 4.4080122424573014e-05, + "loss": 0.2873, + "step": 8329 + }, + { + "epoch": 0.9877860785011265, + "grad_norm": 0.8677588652279163, + "learning_rate": 4.407857132756144e-05, + "loss": 0.2477, + "step": 8330 + }, + { + "epoch": 0.9879046602632515, + "grad_norm": 1.603924359562475, + "learning_rate": 4.407702005466884e-05, + "loss": 0.5179, + "step": 8331 + }, + { + "epoch": 0.9880232420253765, + "grad_norm": 1.3642989926233313, + "learning_rate": 4.4075468605909495e-05, + "loss": 0.4478, + "step": 8332 + }, + { + "epoch": 0.9881418237875015, + "grad_norm": 1.3186306307710889, + "learning_rate": 4.407391698129773e-05, + "loss": 0.2738, + "step": 8333 + }, + { + "epoch": 0.9882604055496265, + "grad_norm": 1.2434007412517496, + "learning_rate": 4.407236518084784e-05, + "loss": 0.2545, + "step": 8334 + }, + { + "epoch": 0.9883789873117514, + "grad_norm": 1.4309835706668015, + "learning_rate": 4.407081320457414e-05, + "loss": 0.3443, + "step": 8335 + }, + { + "epoch": 0.9884975690738764, + "grad_norm": 1.220267684942932, + "learning_rate": 4.4069261052490905e-05, + "loss": 0.2703, + "step": 8336 + }, + { + "epoch": 0.9886161508360014, + "grad_norm": 1.7623157504592633, + "learning_rate": 4.406770872461249e-05, + "loss": 0.4866, + "step": 8337 + }, + { + "epoch": 0.9887347325981264, + "grad_norm": 1.5232810964712822, + "learning_rate": 4.406615622095317e-05, + "loss": 0.4773, + "step": 8338 + }, + { + "epoch": 0.9888533143602514, + "grad_norm": 1.6629620261011426, + "learning_rate": 4.406460354152728e-05, + "loss": 0.4104, + "step": 8339 + }, + { + "epoch": 0.9889718961223763, + "grad_norm": 1.4503818380698115, + "learning_rate": 4.4063050686349115e-05, + "loss": 0.4208, + "step": 8340 + }, + { + "epoch": 0.9890904778845013, + "grad_norm": 1.2750722881846672, + "learning_rate": 4.406149765543301e-05, + "loss": 0.3748, + "step": 8341 + }, + { + "epoch": 0.9892090596466263, + "grad_norm": 1.1198582269859838, + "learning_rate": 4.405994444879325e-05, + "loss": 0.285, + "step": 8342 + }, + { + "epoch": 0.9893276414087513, + "grad_norm": 1.3072882171867417, + "learning_rate": 4.405839106644419e-05, + "loss": 0.3668, + "step": 8343 + }, + { + "epoch": 0.9894462231708763, + "grad_norm": 1.1834262559508983, + "learning_rate": 4.405683750840014e-05, + "loss": 0.3053, + "step": 8344 + }, + { + "epoch": 0.9895648049330013, + "grad_norm": 2.103725946320049, + "learning_rate": 4.405528377467541e-05, + "loss": 0.6149, + "step": 8345 + }, + { + "epoch": 0.9896833866951263, + "grad_norm": 1.198405940078596, + "learning_rate": 4.405372986528433e-05, + "loss": 0.3677, + "step": 8346 + }, + { + "epoch": 0.9898019684572513, + "grad_norm": 1.2908765298526426, + "learning_rate": 4.405217578024123e-05, + "loss": 0.3976, + "step": 8347 + }, + { + "epoch": 0.9899205502193763, + "grad_norm": 0.9963238427189444, + "learning_rate": 4.4050621519560434e-05, + "loss": 0.3091, + "step": 8348 + }, + { + "epoch": 0.9900391319815013, + "grad_norm": 1.1757533106434608, + "learning_rate": 4.4049067083256266e-05, + "loss": 0.3467, + "step": 8349 + }, + { + "epoch": 0.9901577137436263, + "grad_norm": 0.9119039467758296, + "learning_rate": 4.404751247134306e-05, + "loss": 0.2468, + "step": 8350 + }, + { + "epoch": 0.9902762955057512, + "grad_norm": 1.0647396531477253, + "learning_rate": 4.404595768383514e-05, + "loss": 0.2835, + "step": 8351 + }, + { + "epoch": 0.9903948772678762, + "grad_norm": 0.8893043141503866, + "learning_rate": 4.4044402720746854e-05, + "loss": 0.1991, + "step": 8352 + }, + { + "epoch": 0.9905134590300012, + "grad_norm": 1.31823956288799, + "learning_rate": 4.404284758209253e-05, + "loss": 0.4616, + "step": 8353 + }, + { + "epoch": 0.9906320407921262, + "grad_norm": 1.455877491605922, + "learning_rate": 4.404129226788649e-05, + "loss": 0.4176, + "step": 8354 + }, + { + "epoch": 0.9907506225542512, + "grad_norm": 1.1537333783276036, + "learning_rate": 4.4039736778143103e-05, + "loss": 0.2802, + "step": 8355 + }, + { + "epoch": 0.9908692043163762, + "grad_norm": 1.1508236510848868, + "learning_rate": 4.403818111287668e-05, + "loss": 0.3065, + "step": 8356 + }, + { + "epoch": 0.9909877860785011, + "grad_norm": 0.9467449206899237, + "learning_rate": 4.4036625272101575e-05, + "loss": 0.2593, + "step": 8357 + }, + { + "epoch": 0.9911063678406261, + "grad_norm": 1.0830540285871675, + "learning_rate": 4.403506925583213e-05, + "loss": 0.3039, + "step": 8358 + }, + { + "epoch": 0.9912249496027511, + "grad_norm": 1.6873746458445982, + "learning_rate": 4.4033513064082684e-05, + "loss": 0.5691, + "step": 8359 + }, + { + "epoch": 0.9913435313648761, + "grad_norm": 1.4196795585458912, + "learning_rate": 4.403195669686759e-05, + "loss": 0.351, + "step": 8360 + }, + { + "epoch": 0.991462113127001, + "grad_norm": 1.1219907812060252, + "learning_rate": 4.40304001542012e-05, + "loss": 0.3611, + "step": 8361 + }, + { + "epoch": 0.991580694889126, + "grad_norm": 2.278143101768903, + "learning_rate": 4.402884343609785e-05, + "loss": 0.4256, + "step": 8362 + }, + { + "epoch": 0.991699276651251, + "grad_norm": 1.2288900606411612, + "learning_rate": 4.402728654257191e-05, + "loss": 0.3703, + "step": 8363 + }, + { + "epoch": 0.991817858413376, + "grad_norm": 1.07848626114996, + "learning_rate": 4.402572947363771e-05, + "loss": 0.3306, + "step": 8364 + }, + { + "epoch": 0.991936440175501, + "grad_norm": 1.391748692174989, + "learning_rate": 4.402417222930962e-05, + "loss": 0.3726, + "step": 8365 + }, + { + "epoch": 0.992055021937626, + "grad_norm": 1.3466464512086804, + "learning_rate": 4.4022614809601985e-05, + "loss": 0.3493, + "step": 8366 + }, + { + "epoch": 0.9921736036997509, + "grad_norm": 1.191317824634217, + "learning_rate": 4.402105721452918e-05, + "loss": 0.3208, + "step": 8367 + }, + { + "epoch": 0.9922921854618759, + "grad_norm": 0.8967699348752601, + "learning_rate": 4.401949944410554e-05, + "loss": 0.261, + "step": 8368 + }, + { + "epoch": 0.9924107672240009, + "grad_norm": 1.116273200530575, + "learning_rate": 4.401794149834544e-05, + "loss": 0.3364, + "step": 8369 + }, + { + "epoch": 0.9925293489861259, + "grad_norm": 1.3887421289050765, + "learning_rate": 4.401638337726325e-05, + "loss": 0.4445, + "step": 8370 + }, + { + "epoch": 0.992647930748251, + "grad_norm": 1.1396667876560393, + "learning_rate": 4.401482508087331e-05, + "loss": 0.3227, + "step": 8371 + }, + { + "epoch": 0.992766512510376, + "grad_norm": 1.2113054683819284, + "learning_rate": 4.401326660919002e-05, + "loss": 0.3597, + "step": 8372 + }, + { + "epoch": 0.9928850942725009, + "grad_norm": 0.9920963243245934, + "learning_rate": 4.401170796222771e-05, + "loss": 0.2495, + "step": 8373 + }, + { + "epoch": 0.9930036760346259, + "grad_norm": 1.4840581313377206, + "learning_rate": 4.401014914000078e-05, + "loss": 0.3918, + "step": 8374 + }, + { + "epoch": 0.9931222577967509, + "grad_norm": 1.2860677942356737, + "learning_rate": 4.4008590142523576e-05, + "loss": 0.394, + "step": 8375 + }, + { + "epoch": 0.9932408395588759, + "grad_norm": 1.2054274465654053, + "learning_rate": 4.4007030969810484e-05, + "loss": 0.3894, + "step": 8376 + }, + { + "epoch": 0.9933594213210009, + "grad_norm": 1.36404192030046, + "learning_rate": 4.4005471621875874e-05, + "loss": 0.3879, + "step": 8377 + }, + { + "epoch": 0.9934780030831258, + "grad_norm": 1.191411666390149, + "learning_rate": 4.4003912098734125e-05, + "loss": 0.3305, + "step": 8378 + }, + { + "epoch": 0.9935965848452508, + "grad_norm": 1.5263877000463102, + "learning_rate": 4.40023524003996e-05, + "loss": 0.5024, + "step": 8379 + }, + { + "epoch": 0.9937151666073758, + "grad_norm": 1.1770925707619329, + "learning_rate": 4.40007925268867e-05, + "loss": 0.2975, + "step": 8380 + }, + { + "epoch": 0.9938337483695008, + "grad_norm": 1.4316975666712985, + "learning_rate": 4.3999232478209796e-05, + "loss": 0.4419, + "step": 8381 + }, + { + "epoch": 0.9939523301316258, + "grad_norm": 1.110856695490159, + "learning_rate": 4.399767225438326e-05, + "loss": 0.2998, + "step": 8382 + }, + { + "epoch": 0.9940709118937507, + "grad_norm": 1.1922122693325554, + "learning_rate": 4.399611185542148e-05, + "loss": 0.2887, + "step": 8383 + }, + { + "epoch": 0.9941894936558757, + "grad_norm": 1.1728264172234133, + "learning_rate": 4.399455128133885e-05, + "loss": 0.2738, + "step": 8384 + }, + { + "epoch": 0.9943080754180007, + "grad_norm": 1.2844702506003918, + "learning_rate": 4.399299053214974e-05, + "loss": 0.3066, + "step": 8385 + }, + { + "epoch": 0.9944266571801257, + "grad_norm": 0.9807667478936694, + "learning_rate": 4.399142960786855e-05, + "loss": 0.2697, + "step": 8386 + }, + { + "epoch": 0.9945452389422507, + "grad_norm": 1.3044634803674768, + "learning_rate": 4.398986850850968e-05, + "loss": 0.3512, + "step": 8387 + }, + { + "epoch": 0.9946638207043756, + "grad_norm": 1.1771487897411643, + "learning_rate": 4.39883072340875e-05, + "loss": 0.2565, + "step": 8388 + }, + { + "epoch": 0.9947824024665006, + "grad_norm": 1.2420486685758554, + "learning_rate": 4.398674578461641e-05, + "loss": 0.3277, + "step": 8389 + }, + { + "epoch": 0.9949009842286256, + "grad_norm": 1.387581676215533, + "learning_rate": 4.398518416011081e-05, + "loss": 0.391, + "step": 8390 + }, + { + "epoch": 0.9950195659907506, + "grad_norm": 1.3726693436681654, + "learning_rate": 4.3983622360585094e-05, + "loss": 0.4378, + "step": 8391 + }, + { + "epoch": 0.9951381477528756, + "grad_norm": 1.3224357748157405, + "learning_rate": 4.398206038605366e-05, + "loss": 0.3967, + "step": 8392 + }, + { + "epoch": 0.9952567295150005, + "grad_norm": 1.0072762819409165, + "learning_rate": 4.3980498236530906e-05, + "loss": 0.2664, + "step": 8393 + }, + { + "epoch": 0.9953753112771255, + "grad_norm": 1.363029913503713, + "learning_rate": 4.3978935912031236e-05, + "loss": 0.4206, + "step": 8394 + }, + { + "epoch": 0.9954938930392505, + "grad_norm": 1.812383238024004, + "learning_rate": 4.397737341256904e-05, + "loss": 0.519, + "step": 8395 + }, + { + "epoch": 0.9956124748013756, + "grad_norm": 1.3389757404108453, + "learning_rate": 4.397581073815874e-05, + "loss": 0.3537, + "step": 8396 + }, + { + "epoch": 0.9957310565635006, + "grad_norm": 1.6177210072311754, + "learning_rate": 4.3974247888814736e-05, + "loss": 0.4413, + "step": 8397 + }, + { + "epoch": 0.9958496383256256, + "grad_norm": 1.5360715266935046, + "learning_rate": 4.3972684864551435e-05, + "loss": 0.4536, + "step": 8398 + }, + { + "epoch": 0.9959682200877505, + "grad_norm": 1.3953794592339568, + "learning_rate": 4.397112166538324e-05, + "loss": 0.3692, + "step": 8399 + }, + { + "epoch": 0.9960868018498755, + "grad_norm": 0.8565468997939193, + "learning_rate": 4.396955829132457e-05, + "loss": 0.2118, + "step": 8400 + }, + { + "epoch": 0.9962053836120005, + "grad_norm": 1.327332589709776, + "learning_rate": 4.396799474238984e-05, + "loss": 0.3484, + "step": 8401 + }, + { + "epoch": 0.9963239653741255, + "grad_norm": 0.965588737634401, + "learning_rate": 4.396643101859345e-05, + "loss": 0.2552, + "step": 8402 + }, + { + "epoch": 0.9964425471362505, + "grad_norm": 0.9267118186393212, + "learning_rate": 4.3964867119949826e-05, + "loss": 0.2815, + "step": 8403 + }, + { + "epoch": 0.9965611288983754, + "grad_norm": 1.2807303921377133, + "learning_rate": 4.3963303046473385e-05, + "loss": 0.3806, + "step": 8404 + }, + { + "epoch": 0.9966797106605004, + "grad_norm": 1.1912467045154984, + "learning_rate": 4.396173879817855e-05, + "loss": 0.3124, + "step": 8405 + }, + { + "epoch": 0.9967982924226254, + "grad_norm": 1.0972122119147063, + "learning_rate": 4.396017437507973e-05, + "loss": 0.2609, + "step": 8406 + }, + { + "epoch": 0.9969168741847504, + "grad_norm": 1.1517713969281416, + "learning_rate": 4.395860977719135e-05, + "loss": 0.2991, + "step": 8407 + }, + { + "epoch": 0.9970354559468754, + "grad_norm": 1.2341343789475054, + "learning_rate": 4.395704500452784e-05, + "loss": 0.3819, + "step": 8408 + }, + { + "epoch": 0.9971540377090004, + "grad_norm": 1.3898659377744782, + "learning_rate": 4.395548005710362e-05, + "loss": 0.3926, + "step": 8409 + }, + { + "epoch": 0.9972726194711253, + "grad_norm": 1.2453737077752018, + "learning_rate": 4.395391493493312e-05, + "loss": 0.2993, + "step": 8410 + }, + { + "epoch": 0.9973912012332503, + "grad_norm": 1.1577730934032031, + "learning_rate": 4.395234963803076e-05, + "loss": 0.201, + "step": 8411 + }, + { + "epoch": 0.9975097829953753, + "grad_norm": 1.3986908322431455, + "learning_rate": 4.395078416641099e-05, + "loss": 0.377, + "step": 8412 + }, + { + "epoch": 0.9976283647575003, + "grad_norm": 1.2659900958531087, + "learning_rate": 4.394921852008822e-05, + "loss": 0.3554, + "step": 8413 + }, + { + "epoch": 0.9977469465196253, + "grad_norm": 1.067785201855373, + "learning_rate": 4.39476526990769e-05, + "loss": 0.3007, + "step": 8414 + }, + { + "epoch": 0.9978655282817502, + "grad_norm": 0.9299346891534348, + "learning_rate": 4.394608670339145e-05, + "loss": 0.2628, + "step": 8415 + }, + { + "epoch": 0.9979841100438752, + "grad_norm": 1.459633649190856, + "learning_rate": 4.394452053304632e-05, + "loss": 0.4162, + "step": 8416 + }, + { + "epoch": 0.9981026918060002, + "grad_norm": 1.2886448968308508, + "learning_rate": 4.3942954188055934e-05, + "loss": 0.3561, + "step": 8417 + }, + { + "epoch": 0.9982212735681252, + "grad_norm": 1.6839758299514358, + "learning_rate": 4.394138766843474e-05, + "loss": 0.4649, + "step": 8418 + }, + { + "epoch": 0.9983398553302502, + "grad_norm": 1.2005170950324398, + "learning_rate": 4.393982097419719e-05, + "loss": 0.3372, + "step": 8419 + }, + { + "epoch": 0.9984584370923751, + "grad_norm": 1.0673854953428692, + "learning_rate": 4.393825410535771e-05, + "loss": 0.2691, + "step": 8420 + }, + { + "epoch": 0.9985770188545002, + "grad_norm": 1.5751152785993752, + "learning_rate": 4.3936687061930746e-05, + "loss": 0.3658, + "step": 8421 + }, + { + "epoch": 0.9986956006166252, + "grad_norm": 0.9716758827686145, + "learning_rate": 4.393511984393076e-05, + "loss": 0.2788, + "step": 8422 + }, + { + "epoch": 0.9988141823787502, + "grad_norm": 1.0730029202344653, + "learning_rate": 4.393355245137218e-05, + "loss": 0.2501, + "step": 8423 + }, + { + "epoch": 0.9989327641408752, + "grad_norm": 1.1905833741134881, + "learning_rate": 4.393198488426946e-05, + "loss": 0.3428, + "step": 8424 + }, + { + "epoch": 0.9990513459030002, + "grad_norm": 1.1001311224650676, + "learning_rate": 4.3930417142637064e-05, + "loss": 0.3724, + "step": 8425 + }, + { + "epoch": 0.9991699276651251, + "grad_norm": 1.228732629885281, + "learning_rate": 4.392884922648943e-05, + "loss": 0.3143, + "step": 8426 + }, + { + "epoch": 0.9992885094272501, + "grad_norm": 1.3814827585714875, + "learning_rate": 4.392728113584103e-05, + "loss": 0.3525, + "step": 8427 + }, + { + "epoch": 0.9994070911893751, + "grad_norm": 1.485714849450434, + "learning_rate": 4.392571287070629e-05, + "loss": 0.3761, + "step": 8428 + }, + { + "epoch": 0.9995256729515001, + "grad_norm": 1.2142553362948696, + "learning_rate": 4.39241444310997e-05, + "loss": 0.3448, + "step": 8429 + }, + { + "epoch": 0.9996442547136251, + "grad_norm": 1.8951449955056623, + "learning_rate": 4.3922575817035706e-05, + "loss": 0.5687, + "step": 8430 + }, + { + "epoch": 0.99976283647575, + "grad_norm": 1.6129581696271182, + "learning_rate": 4.3921007028528755e-05, + "loss": 0.4994, + "step": 8431 + }, + { + "epoch": 0.999881418237875, + "grad_norm": 1.2824675334568216, + "learning_rate": 4.391943806559333e-05, + "loss": 0.3286, + "step": 8432 + }, + { + "epoch": 1.0, + "grad_norm": 1.366681362063659, + "learning_rate": 4.3917868928243885e-05, + "loss": 0.3314, + "step": 8433 + }, + { + "epoch": 1.000118581762125, + "grad_norm": 1.0116535075431734, + "learning_rate": 4.391629961649488e-05, + "loss": 0.1771, + "step": 8434 + }, + { + "epoch": 1.00023716352425, + "grad_norm": 1.2193812128411812, + "learning_rate": 4.391473013036081e-05, + "loss": 0.2422, + "step": 8435 + }, + { + "epoch": 1.000355745286375, + "grad_norm": 1.4742052430419397, + "learning_rate": 4.3913160469856103e-05, + "loss": 0.4023, + "step": 8436 + }, + { + "epoch": 1.0004743270485, + "grad_norm": 1.5130150378610312, + "learning_rate": 4.391159063499525e-05, + "loss": 0.3015, + "step": 8437 + }, + { + "epoch": 1.000592908810625, + "grad_norm": 0.9946705772061516, + "learning_rate": 4.391002062579273e-05, + "loss": 0.2675, + "step": 8438 + }, + { + "epoch": 1.0007114905727499, + "grad_norm": 1.3997477424240494, + "learning_rate": 4.3908450442263005e-05, + "loss": 0.4026, + "step": 8439 + }, + { + "epoch": 1.0008300723348749, + "grad_norm": 1.1401223154126967, + "learning_rate": 4.390688008442055e-05, + "loss": 0.2775, + "step": 8440 + }, + { + "epoch": 1.0009486540969998, + "grad_norm": 0.8500053707219434, + "learning_rate": 4.390530955227986e-05, + "loss": 0.1858, + "step": 8441 + }, + { + "epoch": 1.0010672358591248, + "grad_norm": 1.3240812480486093, + "learning_rate": 4.390373884585539e-05, + "loss": 0.2419, + "step": 8442 + }, + { + "epoch": 1.0011858176212498, + "grad_norm": 1.172378877902708, + "learning_rate": 4.390216796516163e-05, + "loss": 0.2735, + "step": 8443 + }, + { + "epoch": 1.0013043993833748, + "grad_norm": 0.9041682156217384, + "learning_rate": 4.390059691021306e-05, + "loss": 0.1845, + "step": 8444 + }, + { + "epoch": 1.0014229811454998, + "grad_norm": 1.5993065022336403, + "learning_rate": 4.389902568102416e-05, + "loss": 0.216, + "step": 8445 + }, + { + "epoch": 1.0015415629076247, + "grad_norm": 0.7578723816842607, + "learning_rate": 4.389745427760943e-05, + "loss": 0.1625, + "step": 8446 + }, + { + "epoch": 1.0016601446697497, + "grad_norm": 1.4286839894077132, + "learning_rate": 4.389588269998334e-05, + "loss": 0.339, + "step": 8447 + }, + { + "epoch": 1.0017787264318747, + "grad_norm": 1.0012377571934274, + "learning_rate": 4.389431094816038e-05, + "loss": 0.1976, + "step": 8448 + }, + { + "epoch": 1.0018973081939997, + "grad_norm": 1.1289661925722079, + "learning_rate": 4.3892739022155035e-05, + "loss": 0.2083, + "step": 8449 + }, + { + "epoch": 1.0020158899561247, + "grad_norm": 0.9524707574705489, + "learning_rate": 4.389116692198182e-05, + "loss": 0.1963, + "step": 8450 + }, + { + "epoch": 1.0021344717182497, + "grad_norm": 1.3839711981021519, + "learning_rate": 4.38895946476552e-05, + "loss": 0.3207, + "step": 8451 + }, + { + "epoch": 1.0022530534803746, + "grad_norm": 1.17078525078618, + "learning_rate": 4.388802219918969e-05, + "loss": 0.2806, + "step": 8452 + }, + { + "epoch": 1.0023716352424996, + "grad_norm": 1.0086992151293408, + "learning_rate": 4.388644957659978e-05, + "loss": 0.1685, + "step": 8453 + }, + { + "epoch": 1.0024902170046246, + "grad_norm": 0.956868915552307, + "learning_rate": 4.388487677989995e-05, + "loss": 0.2242, + "step": 8454 + }, + { + "epoch": 1.0026087987667496, + "grad_norm": 1.0151949699309901, + "learning_rate": 4.3883303809104725e-05, + "loss": 0.2477, + "step": 8455 + }, + { + "epoch": 1.0027273805288746, + "grad_norm": 1.4989823584636237, + "learning_rate": 4.3881730664228586e-05, + "loss": 0.2747, + "step": 8456 + }, + { + "epoch": 1.0028459622909995, + "grad_norm": 1.0611127957650728, + "learning_rate": 4.388015734528605e-05, + "loss": 0.2439, + "step": 8457 + }, + { + "epoch": 1.0029645440531247, + "grad_norm": 1.1526288922072745, + "learning_rate": 4.387858385229162e-05, + "loss": 0.2337, + "step": 8458 + }, + { + "epoch": 1.0030831258152497, + "grad_norm": 1.006097871421932, + "learning_rate": 4.38770101852598e-05, + "loss": 0.2658, + "step": 8459 + }, + { + "epoch": 1.0032017075773747, + "grad_norm": 1.057263664126593, + "learning_rate": 4.387543634420509e-05, + "loss": 0.2374, + "step": 8460 + }, + { + "epoch": 1.0033202893394997, + "grad_norm": 1.0953231929775675, + "learning_rate": 4.3873862329142004e-05, + "loss": 0.2184, + "step": 8461 + }, + { + "epoch": 1.0034388711016247, + "grad_norm": 2.310140707232505, + "learning_rate": 4.3872288140085044e-05, + "loss": 0.5143, + "step": 8462 + }, + { + "epoch": 1.0035574528637496, + "grad_norm": 1.4511061270652497, + "learning_rate": 4.3870713777048736e-05, + "loss": 0.2653, + "step": 8463 + }, + { + "epoch": 1.0036760346258746, + "grad_norm": 0.9548306235049033, + "learning_rate": 4.386913924004759e-05, + "loss": 0.2277, + "step": 8464 + }, + { + "epoch": 1.0037946163879996, + "grad_norm": 1.1681892298685417, + "learning_rate": 4.3867564529096117e-05, + "loss": 0.243, + "step": 8465 + }, + { + "epoch": 1.0039131981501246, + "grad_norm": 1.1651845115311485, + "learning_rate": 4.386598964420884e-05, + "loss": 0.2535, + "step": 8466 + }, + { + "epoch": 1.0040317799122496, + "grad_norm": 1.4461354750105, + "learning_rate": 4.386441458540027e-05, + "loss": 0.3373, + "step": 8467 + }, + { + "epoch": 1.0041503616743745, + "grad_norm": 1.3104766460533388, + "learning_rate": 4.3862839352684925e-05, + "loss": 0.271, + "step": 8468 + }, + { + "epoch": 1.0042689434364995, + "grad_norm": 1.4133080049472115, + "learning_rate": 4.386126394607734e-05, + "loss": 0.2635, + "step": 8469 + }, + { + "epoch": 1.0043875251986245, + "grad_norm": 1.3595634842399655, + "learning_rate": 4.385968836559203e-05, + "loss": 0.26, + "step": 8470 + }, + { + "epoch": 1.0045061069607495, + "grad_norm": 1.223973498053663, + "learning_rate": 4.385811261124352e-05, + "loss": 0.2421, + "step": 8471 + }, + { + "epoch": 1.0046246887228745, + "grad_norm": 0.9985368000156944, + "learning_rate": 4.385653668304633e-05, + "loss": 0.2285, + "step": 8472 + }, + { + "epoch": 1.0047432704849995, + "grad_norm": 0.7920777545129393, + "learning_rate": 4.3854960581015e-05, + "loss": 0.1803, + "step": 8473 + }, + { + "epoch": 1.0048618522471244, + "grad_norm": 1.3979043692838444, + "learning_rate": 4.3853384305164055e-05, + "loss": 0.3115, + "step": 8474 + }, + { + "epoch": 1.0049804340092494, + "grad_norm": 1.059978332471856, + "learning_rate": 4.385180785550803e-05, + "loss": 0.2578, + "step": 8475 + }, + { + "epoch": 1.0050990157713744, + "grad_norm": 1.125049197367131, + "learning_rate": 4.3850231232061454e-05, + "loss": 0.2247, + "step": 8476 + }, + { + "epoch": 1.0052175975334994, + "grad_norm": 1.1846929368143908, + "learning_rate": 4.384865443483886e-05, + "loss": 0.3053, + "step": 8477 + }, + { + "epoch": 1.0053361792956244, + "grad_norm": 1.0256781277566005, + "learning_rate": 4.3847077463854784e-05, + "loss": 0.2091, + "step": 8478 + }, + { + "epoch": 1.0054547610577493, + "grad_norm": 1.0894297031011426, + "learning_rate": 4.384550031912377e-05, + "loss": 0.2476, + "step": 8479 + }, + { + "epoch": 1.0055733428198743, + "grad_norm": 1.2996948287644685, + "learning_rate": 4.3843923000660345e-05, + "loss": 0.2577, + "step": 8480 + }, + { + "epoch": 1.0056919245819993, + "grad_norm": 1.0352343956731125, + "learning_rate": 4.3842345508479064e-05, + "loss": 0.227, + "step": 8481 + }, + { + "epoch": 1.0058105063441243, + "grad_norm": 1.3341259711406626, + "learning_rate": 4.3840767842594466e-05, + "loss": 0.2441, + "step": 8482 + }, + { + "epoch": 1.0059290881062493, + "grad_norm": 1.2805923260634549, + "learning_rate": 4.3839190003021093e-05, + "loss": 0.2847, + "step": 8483 + }, + { + "epoch": 1.0060476698683742, + "grad_norm": 1.2672059214015086, + "learning_rate": 4.383761198977348e-05, + "loss": 0.3032, + "step": 8484 + }, + { + "epoch": 1.0061662516304992, + "grad_norm": 1.3003501316059527, + "learning_rate": 4.383603380286619e-05, + "loss": 0.2902, + "step": 8485 + }, + { + "epoch": 1.0062848333926242, + "grad_norm": 1.055031321568214, + "learning_rate": 4.3834455442313767e-05, + "loss": 0.2601, + "step": 8486 + }, + { + "epoch": 1.0064034151547492, + "grad_norm": 1.4761709617770922, + "learning_rate": 4.383287690813076e-05, + "loss": 0.314, + "step": 8487 + }, + { + "epoch": 1.0065219969168742, + "grad_norm": 1.0079408379731285, + "learning_rate": 4.3831298200331725e-05, + "loss": 0.2375, + "step": 8488 + }, + { + "epoch": 1.0066405786789991, + "grad_norm": 1.1380826431607163, + "learning_rate": 4.382971931893121e-05, + "loss": 0.2554, + "step": 8489 + }, + { + "epoch": 1.0067591604411241, + "grad_norm": 1.1348118086708896, + "learning_rate": 4.382814026394377e-05, + "loss": 0.2344, + "step": 8490 + }, + { + "epoch": 1.006877742203249, + "grad_norm": 1.4530174379540401, + "learning_rate": 4.382656103538397e-05, + "loss": 0.2875, + "step": 8491 + }, + { + "epoch": 1.006996323965374, + "grad_norm": 1.0407495624650387, + "learning_rate": 4.382498163326637e-05, + "loss": 0.2099, + "step": 8492 + }, + { + "epoch": 1.007114905727499, + "grad_norm": 1.122949505292771, + "learning_rate": 4.382340205760552e-05, + "loss": 0.1919, + "step": 8493 + }, + { + "epoch": 1.007233487489624, + "grad_norm": 0.9637810590025562, + "learning_rate": 4.382182230841598e-05, + "loss": 0.2381, + "step": 8494 + }, + { + "epoch": 1.007352069251749, + "grad_norm": 1.276754080857855, + "learning_rate": 4.382024238571233e-05, + "loss": 0.2988, + "step": 8495 + }, + { + "epoch": 1.007470651013874, + "grad_norm": 1.2003274390474081, + "learning_rate": 4.3818662289509116e-05, + "loss": 0.2915, + "step": 8496 + }, + { + "epoch": 1.007589232775999, + "grad_norm": 1.0495622467007464, + "learning_rate": 4.381708201982091e-05, + "loss": 0.2239, + "step": 8497 + }, + { + "epoch": 1.007707814538124, + "grad_norm": 1.337063216628371, + "learning_rate": 4.381550157666229e-05, + "loss": 0.2481, + "step": 8498 + }, + { + "epoch": 1.007826396300249, + "grad_norm": 1.0775840448888152, + "learning_rate": 4.3813920960047816e-05, + "loss": 0.2041, + "step": 8499 + }, + { + "epoch": 1.007944978062374, + "grad_norm": 1.857912393293358, + "learning_rate": 4.381234016999206e-05, + "loss": 0.3008, + "step": 8500 + }, + { + "epoch": 1.008063559824499, + "grad_norm": 0.9832865972245178, + "learning_rate": 4.38107592065096e-05, + "loss": 0.2294, + "step": 8501 + }, + { + "epoch": 1.008182141586624, + "grad_norm": 1.3910149826257736, + "learning_rate": 4.3809178069615e-05, + "loss": 0.3314, + "step": 8502 + }, + { + "epoch": 1.0083007233487489, + "grad_norm": 1.0976676155204652, + "learning_rate": 4.3807596759322855e-05, + "loss": 0.2184, + "step": 8503 + }, + { + "epoch": 1.0084193051108739, + "grad_norm": 1.362783412764019, + "learning_rate": 4.3806015275647724e-05, + "loss": 0.2922, + "step": 8504 + }, + { + "epoch": 1.0085378868729988, + "grad_norm": 1.0779840743153437, + "learning_rate": 4.3804433618604195e-05, + "loss": 0.2165, + "step": 8505 + }, + { + "epoch": 1.0086564686351238, + "grad_norm": 1.2584119587314155, + "learning_rate": 4.380285178820685e-05, + "loss": 0.2399, + "step": 8506 + }, + { + "epoch": 1.0087750503972488, + "grad_norm": 1.543329074444893, + "learning_rate": 4.3801269784470266e-05, + "loss": 0.3001, + "step": 8507 + }, + { + "epoch": 1.0088936321593738, + "grad_norm": 1.2425083163548134, + "learning_rate": 4.379968760740903e-05, + "loss": 0.2451, + "step": 8508 + }, + { + "epoch": 1.009012213921499, + "grad_norm": 1.2111355707958291, + "learning_rate": 4.379810525703773e-05, + "loss": 0.2215, + "step": 8509 + }, + { + "epoch": 1.009130795683624, + "grad_norm": 1.405106430613689, + "learning_rate": 4.379652273337095e-05, + "loss": 0.2892, + "step": 8510 + }, + { + "epoch": 1.009249377445749, + "grad_norm": 1.3252634958095557, + "learning_rate": 4.379494003642328e-05, + "loss": 0.341, + "step": 8511 + }, + { + "epoch": 1.009367959207874, + "grad_norm": 1.4089357504463023, + "learning_rate": 4.3793357166209313e-05, + "loss": 0.338, + "step": 8512 + }, + { + "epoch": 1.009486540969999, + "grad_norm": 1.3332979909294422, + "learning_rate": 4.379177412274363e-05, + "loss": 0.3292, + "step": 8513 + }, + { + "epoch": 1.0096051227321239, + "grad_norm": 1.9475234250562536, + "learning_rate": 4.3790190906040846e-05, + "loss": 0.5017, + "step": 8514 + }, + { + "epoch": 1.0097237044942489, + "grad_norm": 1.4424283663277089, + "learning_rate": 4.378860751611554e-05, + "loss": 0.2839, + "step": 8515 + }, + { + "epoch": 1.0098422862563738, + "grad_norm": 1.7639078241817572, + "learning_rate": 4.378702395298231e-05, + "loss": 0.2891, + "step": 8516 + }, + { + "epoch": 1.0099608680184988, + "grad_norm": 1.302431562174773, + "learning_rate": 4.378544021665576e-05, + "loss": 0.276, + "step": 8517 + }, + { + "epoch": 1.0100794497806238, + "grad_norm": 1.3781757993327952, + "learning_rate": 4.378385630715048e-05, + "loss": 0.305, + "step": 8518 + }, + { + "epoch": 1.0101980315427488, + "grad_norm": 1.067353111985892, + "learning_rate": 4.378227222448109e-05, + "loss": 0.2444, + "step": 8519 + }, + { + "epoch": 1.0103166133048738, + "grad_norm": 0.85282009762846, + "learning_rate": 4.378068796866218e-05, + "loss": 0.2051, + "step": 8520 + }, + { + "epoch": 1.0104351950669987, + "grad_norm": 1.055959983747183, + "learning_rate": 4.377910353970836e-05, + "loss": 0.2515, + "step": 8521 + }, + { + "epoch": 1.0105537768291237, + "grad_norm": 1.2216361808148561, + "learning_rate": 4.377751893763422e-05, + "loss": 0.2512, + "step": 8522 + }, + { + "epoch": 1.0106723585912487, + "grad_norm": 1.055060757462321, + "learning_rate": 4.377593416245439e-05, + "loss": 0.2039, + "step": 8523 + }, + { + "epoch": 1.0107909403533737, + "grad_norm": 1.2240033679493745, + "learning_rate": 4.3774349214183474e-05, + "loss": 0.2579, + "step": 8524 + }, + { + "epoch": 1.0109095221154987, + "grad_norm": 0.9350299926816681, + "learning_rate": 4.377276409283608e-05, + "loss": 0.2046, + "step": 8525 + }, + { + "epoch": 1.0110281038776237, + "grad_norm": 1.1864168428543627, + "learning_rate": 4.377117879842682e-05, + "loss": 0.298, + "step": 8526 + }, + { + "epoch": 1.0111466856397486, + "grad_norm": 1.2583408940588878, + "learning_rate": 4.3769593330970314e-05, + "loss": 0.2666, + "step": 8527 + }, + { + "epoch": 1.0112652674018736, + "grad_norm": 1.154412114982071, + "learning_rate": 4.376800769048117e-05, + "loss": 0.2235, + "step": 8528 + }, + { + "epoch": 1.0113838491639986, + "grad_norm": 1.1597480377614724, + "learning_rate": 4.376642187697401e-05, + "loss": 0.2364, + "step": 8529 + }, + { + "epoch": 1.0115024309261236, + "grad_norm": 1.1466625855854315, + "learning_rate": 4.376483589046345e-05, + "loss": 0.2277, + "step": 8530 + }, + { + "epoch": 1.0116210126882486, + "grad_norm": 1.2750514131529698, + "learning_rate": 4.3763249730964126e-05, + "loss": 0.3053, + "step": 8531 + }, + { + "epoch": 1.0117395944503735, + "grad_norm": 1.162166929675154, + "learning_rate": 4.376166339849064e-05, + "loss": 0.2499, + "step": 8532 + }, + { + "epoch": 1.0118581762124985, + "grad_norm": 0.9022452117208262, + "learning_rate": 4.376007689305762e-05, + "loss": 0.1549, + "step": 8533 + }, + { + "epoch": 1.0119767579746235, + "grad_norm": 0.9628951717996728, + "learning_rate": 4.3758490214679705e-05, + "loss": 0.197, + "step": 8534 + }, + { + "epoch": 1.0120953397367485, + "grad_norm": 1.5805408359281443, + "learning_rate": 4.375690336337151e-05, + "loss": 0.3197, + "step": 8535 + }, + { + "epoch": 1.0122139214988735, + "grad_norm": 1.8595952180351867, + "learning_rate": 4.375531633914767e-05, + "loss": 0.4343, + "step": 8536 + }, + { + "epoch": 1.0123325032609984, + "grad_norm": 1.1186694543055207, + "learning_rate": 4.375372914202281e-05, + "loss": 0.1961, + "step": 8537 + }, + { + "epoch": 1.0124510850231234, + "grad_norm": 1.1082097063203298, + "learning_rate": 4.375214177201157e-05, + "loss": 0.2299, + "step": 8538 + }, + { + "epoch": 1.0125696667852484, + "grad_norm": 0.9746838122112957, + "learning_rate": 4.375055422912857e-05, + "loss": 0.1949, + "step": 8539 + }, + { + "epoch": 1.0126882485473734, + "grad_norm": 1.1290862891330993, + "learning_rate": 4.3748966513388456e-05, + "loss": 0.3089, + "step": 8540 + }, + { + "epoch": 1.0128068303094984, + "grad_norm": 1.221681544702482, + "learning_rate": 4.374737862480586e-05, + "loss": 0.288, + "step": 8541 + }, + { + "epoch": 1.0129254120716233, + "grad_norm": 1.2629133963078014, + "learning_rate": 4.374579056339543e-05, + "loss": 0.2651, + "step": 8542 + }, + { + "epoch": 1.0130439938337483, + "grad_norm": 2.024871823279411, + "learning_rate": 4.374420232917179e-05, + "loss": 0.5251, + "step": 8543 + }, + { + "epoch": 1.0131625755958733, + "grad_norm": 1.2181195195908356, + "learning_rate": 4.37426139221496e-05, + "loss": 0.2844, + "step": 8544 + }, + { + "epoch": 1.0132811573579983, + "grad_norm": 1.2655830160350774, + "learning_rate": 4.374102534234348e-05, + "loss": 0.259, + "step": 8545 + }, + { + "epoch": 1.0133997391201233, + "grad_norm": 1.3374148346873538, + "learning_rate": 4.37394365897681e-05, + "loss": 0.2643, + "step": 8546 + }, + { + "epoch": 1.0135183208822482, + "grad_norm": 1.3434120321238792, + "learning_rate": 4.3737847664438084e-05, + "loss": 0.3005, + "step": 8547 + }, + { + "epoch": 1.0136369026443732, + "grad_norm": 1.1140270026468941, + "learning_rate": 4.37362585663681e-05, + "loss": 0.3034, + "step": 8548 + }, + { + "epoch": 1.0137554844064982, + "grad_norm": 1.0476829904503462, + "learning_rate": 4.3734669295572786e-05, + "loss": 0.2417, + "step": 8549 + }, + { + "epoch": 1.0138740661686232, + "grad_norm": 0.8781076452117877, + "learning_rate": 4.3733079852066795e-05, + "loss": 0.1618, + "step": 8550 + }, + { + "epoch": 1.0139926479307482, + "grad_norm": 0.7836120547345586, + "learning_rate": 4.373149023586477e-05, + "loss": 0.182, + "step": 8551 + }, + { + "epoch": 1.0141112296928732, + "grad_norm": 1.0635611802699878, + "learning_rate": 4.372990044698139e-05, + "loss": 0.2641, + "step": 8552 + }, + { + "epoch": 1.0142298114549981, + "grad_norm": 1.0905779848649926, + "learning_rate": 4.372831048543129e-05, + "loss": 0.2228, + "step": 8553 + }, + { + "epoch": 1.0143483932171231, + "grad_norm": 1.4739789595871764, + "learning_rate": 4.372672035122913e-05, + "loss": 0.3055, + "step": 8554 + }, + { + "epoch": 1.014466974979248, + "grad_norm": 1.8320097436038831, + "learning_rate": 4.3725130044389576e-05, + "loss": 0.3883, + "step": 8555 + }, + { + "epoch": 1.014585556741373, + "grad_norm": 1.096099754601582, + "learning_rate": 4.3723539564927285e-05, + "loss": 0.2171, + "step": 8556 + }, + { + "epoch": 1.014704138503498, + "grad_norm": 1.1946322294060978, + "learning_rate": 4.372194891285691e-05, + "loss": 0.2437, + "step": 8557 + }, + { + "epoch": 1.0148227202656233, + "grad_norm": 1.5795724462056968, + "learning_rate": 4.372035808819314e-05, + "loss": 0.3218, + "step": 8558 + }, + { + "epoch": 1.0149413020277482, + "grad_norm": 1.1833073236381562, + "learning_rate": 4.371876709095062e-05, + "loss": 0.2017, + "step": 8559 + }, + { + "epoch": 1.0150598837898732, + "grad_norm": 1.1839020467958385, + "learning_rate": 4.3717175921144014e-05, + "loss": 0.2859, + "step": 8560 + }, + { + "epoch": 1.0151784655519982, + "grad_norm": 1.2676361647536176, + "learning_rate": 4.371558457878801e-05, + "loss": 0.2671, + "step": 8561 + }, + { + "epoch": 1.0152970473141232, + "grad_norm": 1.1290241232784106, + "learning_rate": 4.371399306389726e-05, + "loss": 0.2056, + "step": 8562 + }, + { + "epoch": 1.0154156290762482, + "grad_norm": 1.3565608920727867, + "learning_rate": 4.371240137648645e-05, + "loss": 0.3137, + "step": 8563 + }, + { + "epoch": 1.0155342108383731, + "grad_norm": 1.6175094240854497, + "learning_rate": 4.371080951657024e-05, + "loss": 0.3397, + "step": 8564 + }, + { + "epoch": 1.0156527926004981, + "grad_norm": 1.1415218168424681, + "learning_rate": 4.370921748416331e-05, + "loss": 0.2181, + "step": 8565 + }, + { + "epoch": 1.015771374362623, + "grad_norm": 1.0185096003738565, + "learning_rate": 4.370762527928034e-05, + "loss": 0.2077, + "step": 8566 + }, + { + "epoch": 1.015889956124748, + "grad_norm": 1.6076607894450587, + "learning_rate": 4.3706032901936e-05, + "loss": 0.3515, + "step": 8567 + }, + { + "epoch": 1.016008537886873, + "grad_norm": 1.3478731083037627, + "learning_rate": 4.370444035214498e-05, + "loss": 0.251, + "step": 8568 + }, + { + "epoch": 1.016127119648998, + "grad_norm": 0.8551066517021854, + "learning_rate": 4.370284762992196e-05, + "loss": 0.197, + "step": 8569 + }, + { + "epoch": 1.016245701411123, + "grad_norm": 1.777650980670058, + "learning_rate": 4.3701254735281616e-05, + "loss": 0.3857, + "step": 8570 + }, + { + "epoch": 1.016364283173248, + "grad_norm": 1.0350879916798532, + "learning_rate": 4.3699661668238635e-05, + "loss": 0.1891, + "step": 8571 + }, + { + "epoch": 1.016482864935373, + "grad_norm": 1.185270775390171, + "learning_rate": 4.3698068428807706e-05, + "loss": 0.2342, + "step": 8572 + }, + { + "epoch": 1.016601446697498, + "grad_norm": 1.4536409818391902, + "learning_rate": 4.369647501700351e-05, + "loss": 0.2734, + "step": 8573 + }, + { + "epoch": 1.016720028459623, + "grad_norm": 1.3510088550288675, + "learning_rate": 4.369488143284075e-05, + "loss": 0.2676, + "step": 8574 + }, + { + "epoch": 1.016838610221748, + "grad_norm": 1.4935079821909465, + "learning_rate": 4.3693287676334106e-05, + "loss": 0.218, + "step": 8575 + }, + { + "epoch": 1.016957191983873, + "grad_norm": 1.2993281064220215, + "learning_rate": 4.3691693747498276e-05, + "loss": 0.2893, + "step": 8576 + }, + { + "epoch": 1.017075773745998, + "grad_norm": 1.23285325843082, + "learning_rate": 4.3690099646347946e-05, + "loss": 0.3107, + "step": 8577 + }, + { + "epoch": 1.0171943555081229, + "grad_norm": 1.0671111000727624, + "learning_rate": 4.368850537289782e-05, + "loss": 0.2652, + "step": 8578 + }, + { + "epoch": 1.0173129372702479, + "grad_norm": 1.0361573956546033, + "learning_rate": 4.368691092716258e-05, + "loss": 0.305, + "step": 8579 + }, + { + "epoch": 1.0174315190323728, + "grad_norm": 1.7640161514388055, + "learning_rate": 4.368531630915695e-05, + "loss": 0.304, + "step": 8580 + }, + { + "epoch": 1.0175501007944978, + "grad_norm": 1.2224999735062672, + "learning_rate": 4.3683721518895616e-05, + "loss": 0.2788, + "step": 8581 + }, + { + "epoch": 1.0176686825566228, + "grad_norm": 1.4266725900455859, + "learning_rate": 4.3682126556393274e-05, + "loss": 0.301, + "step": 8582 + }, + { + "epoch": 1.0177872643187478, + "grad_norm": 1.224107914486952, + "learning_rate": 4.368053142166465e-05, + "loss": 0.3261, + "step": 8583 + }, + { + "epoch": 1.0179058460808728, + "grad_norm": 1.2162984970764834, + "learning_rate": 4.367893611472442e-05, + "loss": 0.2351, + "step": 8584 + }, + { + "epoch": 1.0180244278429977, + "grad_norm": 1.8551110503179864, + "learning_rate": 4.3677340635587305e-05, + "loss": 0.2918, + "step": 8585 + }, + { + "epoch": 1.0181430096051227, + "grad_norm": 0.9759350665903611, + "learning_rate": 4.367574498426802e-05, + "loss": 0.1903, + "step": 8586 + }, + { + "epoch": 1.0182615913672477, + "grad_norm": 1.2567856377894209, + "learning_rate": 4.3674149160781256e-05, + "loss": 0.3184, + "step": 8587 + }, + { + "epoch": 1.0183801731293727, + "grad_norm": 1.170773376323589, + "learning_rate": 4.367255316514175e-05, + "loss": 0.268, + "step": 8588 + }, + { + "epoch": 1.0184987548914977, + "grad_norm": 1.0343395661729524, + "learning_rate": 4.3670956997364194e-05, + "loss": 0.2083, + "step": 8589 + }, + { + "epoch": 1.0186173366536226, + "grad_norm": 1.3085903682871336, + "learning_rate": 4.366936065746331e-05, + "loss": 0.2503, + "step": 8590 + }, + { + "epoch": 1.0187359184157476, + "grad_norm": 1.0182033537148185, + "learning_rate": 4.3667764145453816e-05, + "loss": 0.2435, + "step": 8591 + }, + { + "epoch": 1.0188545001778726, + "grad_norm": 1.3080754554820382, + "learning_rate": 4.366616746135043e-05, + "loss": 0.2516, + "step": 8592 + }, + { + "epoch": 1.0189730819399976, + "grad_norm": 1.546386948849375, + "learning_rate": 4.366457060516787e-05, + "loss": 0.3783, + "step": 8593 + }, + { + "epoch": 1.0190916637021226, + "grad_norm": 0.9466666465358384, + "learning_rate": 4.3662973576920854e-05, + "loss": 0.2242, + "step": 8594 + }, + { + "epoch": 1.0192102454642475, + "grad_norm": 1.4825237752593396, + "learning_rate": 4.366137637662411e-05, + "loss": 0.2938, + "step": 8595 + }, + { + "epoch": 1.0193288272263725, + "grad_norm": 1.0017315564352482, + "learning_rate": 4.3659779004292364e-05, + "loss": 0.1942, + "step": 8596 + }, + { + "epoch": 1.0194474089884975, + "grad_norm": 1.05011877294742, + "learning_rate": 4.365818145994033e-05, + "loss": 0.1975, + "step": 8597 + }, + { + "epoch": 1.0195659907506225, + "grad_norm": 1.2409887967350528, + "learning_rate": 4.3656583743582745e-05, + "loss": 0.2636, + "step": 8598 + }, + { + "epoch": 1.0196845725127475, + "grad_norm": 1.763804159577235, + "learning_rate": 4.365498585523434e-05, + "loss": 0.266, + "step": 8599 + }, + { + "epoch": 1.0198031542748724, + "grad_norm": 1.5727264498746163, + "learning_rate": 4.365338779490984e-05, + "loss": 0.3748, + "step": 8600 + }, + { + "epoch": 1.0199217360369974, + "grad_norm": 1.1128405014607599, + "learning_rate": 4.365178956262398e-05, + "loss": 0.2391, + "step": 8601 + }, + { + "epoch": 1.0200403177991224, + "grad_norm": 0.9792512394428746, + "learning_rate": 4.365019115839148e-05, + "loss": 0.1877, + "step": 8602 + }, + { + "epoch": 1.0201588995612474, + "grad_norm": 1.255373785436766, + "learning_rate": 4.364859258222711e-05, + "loss": 0.2588, + "step": 8603 + }, + { + "epoch": 1.0202774813233724, + "grad_norm": 1.2574673564154812, + "learning_rate": 4.364699383414557e-05, + "loss": 0.2384, + "step": 8604 + }, + { + "epoch": 1.0203960630854974, + "grad_norm": 1.5460089415797247, + "learning_rate": 4.364539491416162e-05, + "loss": 0.282, + "step": 8605 + }, + { + "epoch": 1.0205146448476223, + "grad_norm": 1.3123412455537662, + "learning_rate": 4.364379582228999e-05, + "loss": 0.2299, + "step": 8606 + }, + { + "epoch": 1.0206332266097473, + "grad_norm": 1.0849822710258632, + "learning_rate": 4.364219655854542e-05, + "loss": 0.2088, + "step": 8607 + }, + { + "epoch": 1.0207518083718723, + "grad_norm": 1.0046549242582878, + "learning_rate": 4.364059712294267e-05, + "loss": 0.2362, + "step": 8608 + }, + { + "epoch": 1.0208703901339975, + "grad_norm": 1.0871466528388025, + "learning_rate": 4.363899751549646e-05, + "loss": 0.2419, + "step": 8609 + }, + { + "epoch": 1.0209889718961225, + "grad_norm": 1.693557573374471, + "learning_rate": 4.363739773622156e-05, + "loss": 0.3573, + "step": 8610 + }, + { + "epoch": 1.0211075536582475, + "grad_norm": 1.2342037127426082, + "learning_rate": 4.3635797785132706e-05, + "loss": 0.3145, + "step": 8611 + }, + { + "epoch": 1.0212261354203724, + "grad_norm": 1.192676659756125, + "learning_rate": 4.363419766224465e-05, + "loss": 0.2205, + "step": 8612 + }, + { + "epoch": 1.0213447171824974, + "grad_norm": 1.0235318233881867, + "learning_rate": 4.3632597367572134e-05, + "loss": 0.1873, + "step": 8613 + }, + { + "epoch": 1.0214632989446224, + "grad_norm": 1.0484160852669286, + "learning_rate": 4.3630996901129925e-05, + "loss": 0.1826, + "step": 8614 + }, + { + "epoch": 1.0215818807067474, + "grad_norm": 1.2313775590420182, + "learning_rate": 4.3629396262932764e-05, + "loss": 0.2684, + "step": 8615 + }, + { + "epoch": 1.0217004624688724, + "grad_norm": 0.9980435957168756, + "learning_rate": 4.3627795452995425e-05, + "loss": 0.1999, + "step": 8616 + }, + { + "epoch": 1.0218190442309973, + "grad_norm": 1.0631005237042457, + "learning_rate": 4.362619447133265e-05, + "loss": 0.2169, + "step": 8617 + }, + { + "epoch": 1.0219376259931223, + "grad_norm": 1.1568639554661395, + "learning_rate": 4.3624593317959204e-05, + "loss": 0.2532, + "step": 8618 + }, + { + "epoch": 1.0220562077552473, + "grad_norm": 1.2794863953608537, + "learning_rate": 4.3622991992889847e-05, + "loss": 0.296, + "step": 8619 + }, + { + "epoch": 1.0221747895173723, + "grad_norm": 1.0152022377417569, + "learning_rate": 4.362139049613934e-05, + "loss": 0.2217, + "step": 8620 + }, + { + "epoch": 1.0222933712794973, + "grad_norm": 1.100257457769699, + "learning_rate": 4.3619788827722445e-05, + "loss": 0.2168, + "step": 8621 + }, + { + "epoch": 1.0224119530416222, + "grad_norm": 1.0345103681849837, + "learning_rate": 4.361818698765394e-05, + "loss": 0.2418, + "step": 8622 + }, + { + "epoch": 1.0225305348037472, + "grad_norm": 2.107323870670551, + "learning_rate": 4.361658497594857e-05, + "loss": 0.4822, + "step": 8623 + }, + { + "epoch": 1.0226491165658722, + "grad_norm": 1.4758190611139208, + "learning_rate": 4.3614982792621115e-05, + "loss": 0.4374, + "step": 8624 + }, + { + "epoch": 1.0227676983279972, + "grad_norm": 1.0801703291989793, + "learning_rate": 4.3613380437686356e-05, + "loss": 0.2475, + "step": 8625 + }, + { + "epoch": 1.0228862800901222, + "grad_norm": 1.348185040645482, + "learning_rate": 4.361177791115905e-05, + "loss": 0.3261, + "step": 8626 + }, + { + "epoch": 1.0230048618522471, + "grad_norm": 1.1377051753775735, + "learning_rate": 4.3610175213053977e-05, + "loss": 0.2569, + "step": 8627 + }, + { + "epoch": 1.0231234436143721, + "grad_norm": 0.7827188177737925, + "learning_rate": 4.360857234338591e-05, + "loss": 0.1651, + "step": 8628 + }, + { + "epoch": 1.023242025376497, + "grad_norm": 1.127017370847579, + "learning_rate": 4.360696930216962e-05, + "loss": 0.2222, + "step": 8629 + }, + { + "epoch": 1.023360607138622, + "grad_norm": 1.2987630458343589, + "learning_rate": 4.3605366089419894e-05, + "loss": 0.3068, + "step": 8630 + }, + { + "epoch": 1.023479188900747, + "grad_norm": 1.0790454260782965, + "learning_rate": 4.360376270515151e-05, + "loss": 0.256, + "step": 8631 + }, + { + "epoch": 1.023597770662872, + "grad_norm": 0.9345125406912789, + "learning_rate": 4.360215914937924e-05, + "loss": 0.2261, + "step": 8632 + }, + { + "epoch": 1.023716352424997, + "grad_norm": 1.2805315332452034, + "learning_rate": 4.360055542211788e-05, + "loss": 0.2827, + "step": 8633 + }, + { + "epoch": 1.023834934187122, + "grad_norm": 1.1373654659518602, + "learning_rate": 4.359895152338221e-05, + "loss": 0.2656, + "step": 8634 + }, + { + "epoch": 1.023953515949247, + "grad_norm": 1.0349107668777628, + "learning_rate": 4.3597347453187006e-05, + "loss": 0.2204, + "step": 8635 + }, + { + "epoch": 1.024072097711372, + "grad_norm": 1.5066624709931495, + "learning_rate": 4.359574321154708e-05, + "loss": 0.4631, + "step": 8636 + }, + { + "epoch": 1.024190679473497, + "grad_norm": 1.4406702036177779, + "learning_rate": 4.359413879847719e-05, + "loss": 0.2948, + "step": 8637 + }, + { + "epoch": 1.024309261235622, + "grad_norm": 1.257949406115359, + "learning_rate": 4.3592534213992154e-05, + "loss": 0.2531, + "step": 8638 + }, + { + "epoch": 1.024427842997747, + "grad_norm": 1.344263420829829, + "learning_rate": 4.359092945810674e-05, + "loss": 0.3121, + "step": 8639 + }, + { + "epoch": 1.024546424759872, + "grad_norm": 1.1011753445772992, + "learning_rate": 4.3589324530835763e-05, + "loss": 0.2807, + "step": 8640 + }, + { + "epoch": 1.0246650065219969, + "grad_norm": 1.163955025529501, + "learning_rate": 4.358771943219401e-05, + "loss": 0.2398, + "step": 8641 + }, + { + "epoch": 1.0247835882841219, + "grad_norm": 1.1846884286282486, + "learning_rate": 4.358611416219627e-05, + "loss": 0.276, + "step": 8642 + }, + { + "epoch": 1.0249021700462468, + "grad_norm": 0.9238875793461886, + "learning_rate": 4.358450872085735e-05, + "loss": 0.2381, + "step": 8643 + }, + { + "epoch": 1.0250207518083718, + "grad_norm": 1.0422962886427154, + "learning_rate": 4.3582903108192054e-05, + "loss": 0.2317, + "step": 8644 + }, + { + "epoch": 1.0251393335704968, + "grad_norm": 1.6624129115416268, + "learning_rate": 4.358129732421518e-05, + "loss": 0.2337, + "step": 8645 + }, + { + "epoch": 1.0252579153326218, + "grad_norm": 1.1133572928251048, + "learning_rate": 4.3579691368941525e-05, + "loss": 0.2645, + "step": 8646 + }, + { + "epoch": 1.0253764970947468, + "grad_norm": 1.3500513350857724, + "learning_rate": 4.35780852423859e-05, + "loss": 0.2658, + "step": 8647 + }, + { + "epoch": 1.0254950788568717, + "grad_norm": 1.265081611690696, + "learning_rate": 4.357647894456312e-05, + "loss": 0.2891, + "step": 8648 + }, + { + "epoch": 1.0256136606189967, + "grad_norm": 1.20832128093361, + "learning_rate": 4.3574872475487974e-05, + "loss": 0.2433, + "step": 8649 + }, + { + "epoch": 1.0257322423811217, + "grad_norm": 1.015469979660728, + "learning_rate": 4.357326583517528e-05, + "loss": 0.2232, + "step": 8650 + }, + { + "epoch": 1.0258508241432467, + "grad_norm": 1.2756741788228796, + "learning_rate": 4.3571659023639866e-05, + "loss": 0.2253, + "step": 8651 + }, + { + "epoch": 1.0259694059053717, + "grad_norm": 1.5126450377075686, + "learning_rate": 4.357005204089651e-05, + "loss": 0.2861, + "step": 8652 + }, + { + "epoch": 1.0260879876674966, + "grad_norm": 1.1260895676288503, + "learning_rate": 4.356844488696006e-05, + "loss": 0.2471, + "step": 8653 + }, + { + "epoch": 1.0262065694296216, + "grad_norm": 1.375178808750325, + "learning_rate": 4.356683756184532e-05, + "loss": 0.2569, + "step": 8654 + }, + { + "epoch": 1.0263251511917466, + "grad_norm": 1.0525314230008287, + "learning_rate": 4.3565230065567096e-05, + "loss": 0.212, + "step": 8655 + }, + { + "epoch": 1.0264437329538716, + "grad_norm": 1.2332741800940186, + "learning_rate": 4.3563622398140225e-05, + "loss": 0.278, + "step": 8656 + }, + { + "epoch": 1.0265623147159966, + "grad_norm": 1.0165661462476312, + "learning_rate": 4.3562014559579515e-05, + "loss": 0.1787, + "step": 8657 + }, + { + "epoch": 1.0266808964781218, + "grad_norm": 1.3748725008699616, + "learning_rate": 4.3560406549899793e-05, + "loss": 0.2766, + "step": 8658 + }, + { + "epoch": 1.0267994782402468, + "grad_norm": 1.2954567251766318, + "learning_rate": 4.3558798369115886e-05, + "loss": 0.2646, + "step": 8659 + }, + { + "epoch": 1.0269180600023717, + "grad_norm": 1.449890863718068, + "learning_rate": 4.355719001724262e-05, + "loss": 0.3121, + "step": 8660 + }, + { + "epoch": 1.0270366417644967, + "grad_norm": 1.16587726818535, + "learning_rate": 4.3555581494294806e-05, + "loss": 0.2784, + "step": 8661 + }, + { + "epoch": 1.0271552235266217, + "grad_norm": 1.2940707718686897, + "learning_rate": 4.355397280028729e-05, + "loss": 0.2863, + "step": 8662 + }, + { + "epoch": 1.0272738052887467, + "grad_norm": 1.4868700893655498, + "learning_rate": 4.355236393523491e-05, + "loss": 0.3262, + "step": 8663 + }, + { + "epoch": 1.0273923870508717, + "grad_norm": 1.0282925553844282, + "learning_rate": 4.355075489915247e-05, + "loss": 0.2224, + "step": 8664 + }, + { + "epoch": 1.0275109688129966, + "grad_norm": 1.3438010538190113, + "learning_rate": 4.354914569205482e-05, + "loss": 0.2806, + "step": 8665 + }, + { + "epoch": 1.0276295505751216, + "grad_norm": 1.5136575599641917, + "learning_rate": 4.35475363139568e-05, + "loss": 0.2856, + "step": 8666 + }, + { + "epoch": 1.0277481323372466, + "grad_norm": 1.2312771558396547, + "learning_rate": 4.3545926764873235e-05, + "loss": 0.2265, + "step": 8667 + }, + { + "epoch": 1.0278667140993716, + "grad_norm": 1.554837302441255, + "learning_rate": 4.354431704481897e-05, + "loss": 0.3886, + "step": 8668 + }, + { + "epoch": 1.0279852958614966, + "grad_norm": 1.00322057370471, + "learning_rate": 4.3542707153808835e-05, + "loss": 0.1962, + "step": 8669 + }, + { + "epoch": 1.0281038776236215, + "grad_norm": 1.2974210375724637, + "learning_rate": 4.354109709185769e-05, + "loss": 0.3219, + "step": 8670 + }, + { + "epoch": 1.0282224593857465, + "grad_norm": 1.5103470047795027, + "learning_rate": 4.353948685898036e-05, + "loss": 0.311, + "step": 8671 + }, + { + "epoch": 1.0283410411478715, + "grad_norm": 0.9170341863504202, + "learning_rate": 4.3537876455191694e-05, + "loss": 0.2145, + "step": 8672 + }, + { + "epoch": 1.0284596229099965, + "grad_norm": 1.098818793018443, + "learning_rate": 4.353626588050655e-05, + "loss": 0.2459, + "step": 8673 + }, + { + "epoch": 1.0285782046721215, + "grad_norm": 1.0529108121386994, + "learning_rate": 4.3534655134939754e-05, + "loss": 0.2663, + "step": 8674 + }, + { + "epoch": 1.0286967864342464, + "grad_norm": 1.2126640959620578, + "learning_rate": 4.353304421850616e-05, + "loss": 0.3154, + "step": 8675 + }, + { + "epoch": 1.0288153681963714, + "grad_norm": 1.3364425863511333, + "learning_rate": 4.353143313122064e-05, + "loss": 0.2916, + "step": 8676 + }, + { + "epoch": 1.0289339499584964, + "grad_norm": 1.1624370747883905, + "learning_rate": 4.352982187309802e-05, + "loss": 0.2097, + "step": 8677 + }, + { + "epoch": 1.0290525317206214, + "grad_norm": 1.6019569235270579, + "learning_rate": 4.352821044415318e-05, + "loss": 0.4394, + "step": 8678 + }, + { + "epoch": 1.0291711134827464, + "grad_norm": 1.561617553273008, + "learning_rate": 4.3526598844400944e-05, + "loss": 0.3642, + "step": 8679 + }, + { + "epoch": 1.0292896952448713, + "grad_norm": 1.452169521764443, + "learning_rate": 4.3524987073856196e-05, + "loss": 0.3088, + "step": 8680 + }, + { + "epoch": 1.0294082770069963, + "grad_norm": 1.5145734356436107, + "learning_rate": 4.3523375132533784e-05, + "loss": 0.334, + "step": 8681 + }, + { + "epoch": 1.0295268587691213, + "grad_norm": 0.8724238163042343, + "learning_rate": 4.352176302044856e-05, + "loss": 0.2184, + "step": 8682 + }, + { + "epoch": 1.0296454405312463, + "grad_norm": 1.3311126649851854, + "learning_rate": 4.352015073761541e-05, + "loss": 0.3208, + "step": 8683 + }, + { + "epoch": 1.0297640222933713, + "grad_norm": 1.4273368448124097, + "learning_rate": 4.351853828404916e-05, + "loss": 0.3032, + "step": 8684 + }, + { + "epoch": 1.0298826040554963, + "grad_norm": 1.0024048730139754, + "learning_rate": 4.351692565976471e-05, + "loss": 0.2047, + "step": 8685 + }, + { + "epoch": 1.0300011858176212, + "grad_norm": 1.1413840710263234, + "learning_rate": 4.351531286477691e-05, + "loss": 0.2301, + "step": 8686 + }, + { + "epoch": 1.0301197675797462, + "grad_norm": 1.5495909259010694, + "learning_rate": 4.3513699899100625e-05, + "loss": 0.3071, + "step": 8687 + }, + { + "epoch": 1.0302383493418712, + "grad_norm": 1.2274409169074116, + "learning_rate": 4.351208676275074e-05, + "loss": 0.2501, + "step": 8688 + }, + { + "epoch": 1.0303569311039962, + "grad_norm": 1.1330816233984966, + "learning_rate": 4.351047345574211e-05, + "loss": 0.2604, + "step": 8689 + }, + { + "epoch": 1.0304755128661212, + "grad_norm": 1.03018211653985, + "learning_rate": 4.350885997808961e-05, + "loss": 0.2421, + "step": 8690 + }, + { + "epoch": 1.0305940946282461, + "grad_norm": 1.4782467227042544, + "learning_rate": 4.350724632980812e-05, + "loss": 0.2479, + "step": 8691 + }, + { + "epoch": 1.0307126763903711, + "grad_norm": 1.3529534470570095, + "learning_rate": 4.350563251091252e-05, + "loss": 0.322, + "step": 8692 + }, + { + "epoch": 1.030831258152496, + "grad_norm": 1.2563931807512323, + "learning_rate": 4.350401852141768e-05, + "loss": 0.2842, + "step": 8693 + }, + { + "epoch": 1.030949839914621, + "grad_norm": 1.799749181714516, + "learning_rate": 4.3502404361338465e-05, + "loss": 0.4179, + "step": 8694 + }, + { + "epoch": 1.031068421676746, + "grad_norm": 0.9816304716450948, + "learning_rate": 4.350079003068979e-05, + "loss": 0.166, + "step": 8695 + }, + { + "epoch": 1.031187003438871, + "grad_norm": 0.9343594515011191, + "learning_rate": 4.349917552948651e-05, + "loss": 0.259, + "step": 8696 + }, + { + "epoch": 1.031305585200996, + "grad_norm": 1.5170821471604519, + "learning_rate": 4.349756085774352e-05, + "loss": 0.3389, + "step": 8697 + }, + { + "epoch": 1.031424166963121, + "grad_norm": 1.3630021742951526, + "learning_rate": 4.3495946015475696e-05, + "loss": 0.2963, + "step": 8698 + }, + { + "epoch": 1.031542748725246, + "grad_norm": 1.0877449347402883, + "learning_rate": 4.349433100269794e-05, + "loss": 0.2391, + "step": 8699 + }, + { + "epoch": 1.031661330487371, + "grad_norm": 1.016996287299599, + "learning_rate": 4.349271581942512e-05, + "loss": 0.2726, + "step": 8700 + }, + { + "epoch": 1.031779912249496, + "grad_norm": 0.8723684196893926, + "learning_rate": 4.349110046567215e-05, + "loss": 0.1678, + "step": 8701 + }, + { + "epoch": 1.031898494011621, + "grad_norm": 1.0390996424226597, + "learning_rate": 4.3489484941453906e-05, + "loss": 0.1867, + "step": 8702 + }, + { + "epoch": 1.032017075773746, + "grad_norm": 1.1726741762108155, + "learning_rate": 4.348786924678528e-05, + "loss": 0.2271, + "step": 8703 + }, + { + "epoch": 1.0321356575358709, + "grad_norm": 1.6747149529521428, + "learning_rate": 4.3486253381681166e-05, + "loss": 0.3035, + "step": 8704 + }, + { + "epoch": 1.0322542392979959, + "grad_norm": 1.6946372994231809, + "learning_rate": 4.3484637346156475e-05, + "loss": 0.3954, + "step": 8705 + }, + { + "epoch": 1.0323728210601208, + "grad_norm": 1.3241743348220776, + "learning_rate": 4.348302114022609e-05, + "loss": 0.2994, + "step": 8706 + }, + { + "epoch": 1.032491402822246, + "grad_norm": 1.136246431497065, + "learning_rate": 4.348140476390492e-05, + "loss": 0.2315, + "step": 8707 + }, + { + "epoch": 1.0326099845843708, + "grad_norm": 1.2393174987085978, + "learning_rate": 4.3479788217207854e-05, + "loss": 0.2246, + "step": 8708 + }, + { + "epoch": 1.032728566346496, + "grad_norm": 1.2274423095927245, + "learning_rate": 4.347817150014981e-05, + "loss": 0.2129, + "step": 8709 + }, + { + "epoch": 1.032847148108621, + "grad_norm": 1.3447853875204068, + "learning_rate": 4.347655461274568e-05, + "loss": 0.2691, + "step": 8710 + }, + { + "epoch": 1.032965729870746, + "grad_norm": 1.2125514069554881, + "learning_rate": 4.347493755501038e-05, + "loss": 0.2693, + "step": 8711 + }, + { + "epoch": 1.033084311632871, + "grad_norm": 0.995722339224991, + "learning_rate": 4.34733203269588e-05, + "loss": 0.1993, + "step": 8712 + }, + { + "epoch": 1.033202893394996, + "grad_norm": 1.0156587330666167, + "learning_rate": 4.3471702928605864e-05, + "loss": 0.2063, + "step": 8713 + }, + { + "epoch": 1.033321475157121, + "grad_norm": 1.176509565691804, + "learning_rate": 4.347008535996648e-05, + "loss": 0.2646, + "step": 8714 + }, + { + "epoch": 1.033440056919246, + "grad_norm": 1.472871713133462, + "learning_rate": 4.346846762105556e-05, + "loss": 0.2844, + "step": 8715 + }, + { + "epoch": 1.0335586386813709, + "grad_norm": 0.9656809403492589, + "learning_rate": 4.3466849711888017e-05, + "loss": 0.2099, + "step": 8716 + }, + { + "epoch": 1.0336772204434959, + "grad_norm": 1.1951250258776642, + "learning_rate": 4.346523163247876e-05, + "loss": 0.2382, + "step": 8717 + }, + { + "epoch": 1.0337958022056208, + "grad_norm": 1.2998572872598482, + "learning_rate": 4.3463613382842714e-05, + "loss": 0.2655, + "step": 8718 + }, + { + "epoch": 1.0339143839677458, + "grad_norm": 1.2992851735719566, + "learning_rate": 4.346199496299479e-05, + "loss": 0.2203, + "step": 8719 + }, + { + "epoch": 1.0340329657298708, + "grad_norm": 0.9981478588874155, + "learning_rate": 4.346037637294992e-05, + "loss": 0.1841, + "step": 8720 + }, + { + "epoch": 1.0341515474919958, + "grad_norm": 1.4126861220404356, + "learning_rate": 4.3458757612723014e-05, + "loss": 0.289, + "step": 8721 + }, + { + "epoch": 1.0342701292541208, + "grad_norm": 1.3344594151870173, + "learning_rate": 4.3457138682329e-05, + "loss": 0.3428, + "step": 8722 + }, + { + "epoch": 1.0343887110162457, + "grad_norm": 1.3051806547541311, + "learning_rate": 4.3455519581782794e-05, + "loss": 0.2788, + "step": 8723 + }, + { + "epoch": 1.0345072927783707, + "grad_norm": 1.0293813660546045, + "learning_rate": 4.345390031109934e-05, + "loss": 0.2292, + "step": 8724 + }, + { + "epoch": 1.0346258745404957, + "grad_norm": 0.8618549552234482, + "learning_rate": 4.345228087029355e-05, + "loss": 0.1766, + "step": 8725 + }, + { + "epoch": 1.0347444563026207, + "grad_norm": 0.9951411448236145, + "learning_rate": 4.3450661259380354e-05, + "loss": 0.2155, + "step": 8726 + }, + { + "epoch": 1.0348630380647457, + "grad_norm": 0.9098048947576263, + "learning_rate": 4.344904147837469e-05, + "loss": 0.2203, + "step": 8727 + }, + { + "epoch": 1.0349816198268706, + "grad_norm": 1.0960464828673955, + "learning_rate": 4.3447421527291485e-05, + "loss": 0.269, + "step": 8728 + }, + { + "epoch": 1.0351002015889956, + "grad_norm": 1.4080394397051454, + "learning_rate": 4.344580140614568e-05, + "loss": 0.299, + "step": 8729 + }, + { + "epoch": 1.0352187833511206, + "grad_norm": 1.2151715038059747, + "learning_rate": 4.34441811149522e-05, + "loss": 0.2634, + "step": 8730 + }, + { + "epoch": 1.0353373651132456, + "grad_norm": 1.2135902872442277, + "learning_rate": 4.3442560653725995e-05, + "loss": 0.2671, + "step": 8731 + }, + { + "epoch": 1.0354559468753706, + "grad_norm": 1.5541375428365387, + "learning_rate": 4.344094002248199e-05, + "loss": 0.2784, + "step": 8732 + }, + { + "epoch": 1.0355745286374956, + "grad_norm": 1.2453193184981581, + "learning_rate": 4.3439319221235134e-05, + "loss": 0.3015, + "step": 8733 + }, + { + "epoch": 1.0356931103996205, + "grad_norm": 1.3538594183892736, + "learning_rate": 4.3437698250000365e-05, + "loss": 0.2545, + "step": 8734 + }, + { + "epoch": 1.0358116921617455, + "grad_norm": 1.4433193194285805, + "learning_rate": 4.343607710879263e-05, + "loss": 0.3047, + "step": 8735 + }, + { + "epoch": 1.0359302739238705, + "grad_norm": 1.1398840434970634, + "learning_rate": 4.343445579762687e-05, + "loss": 0.2618, + "step": 8736 + }, + { + "epoch": 1.0360488556859955, + "grad_norm": 1.1998045997009865, + "learning_rate": 4.343283431651803e-05, + "loss": 0.2572, + "step": 8737 + }, + { + "epoch": 1.0361674374481205, + "grad_norm": 1.4733235172440011, + "learning_rate": 4.343121266548107e-05, + "loss": 0.3285, + "step": 8738 + }, + { + "epoch": 1.0362860192102454, + "grad_norm": 0.8893544149415541, + "learning_rate": 4.3429590844530935e-05, + "loss": 0.1835, + "step": 8739 + }, + { + "epoch": 1.0364046009723704, + "grad_norm": 0.8827264664372044, + "learning_rate": 4.342796885368256e-05, + "loss": 0.2134, + "step": 8740 + }, + { + "epoch": 1.0365231827344954, + "grad_norm": 1.567845394523652, + "learning_rate": 4.342634669295091e-05, + "loss": 0.2758, + "step": 8741 + }, + { + "epoch": 1.0366417644966204, + "grad_norm": 1.406028092529751, + "learning_rate": 4.342472436235095e-05, + "loss": 0.3393, + "step": 8742 + }, + { + "epoch": 1.0367603462587454, + "grad_norm": 1.0314139850107558, + "learning_rate": 4.342310186189762e-05, + "loss": 0.234, + "step": 8743 + }, + { + "epoch": 1.0368789280208703, + "grad_norm": 1.4233805817855374, + "learning_rate": 4.342147919160588e-05, + "loss": 0.2794, + "step": 8744 + }, + { + "epoch": 1.0369975097829953, + "grad_norm": 1.1977579374524407, + "learning_rate": 4.3419856351490694e-05, + "loss": 0.2372, + "step": 8745 + }, + { + "epoch": 1.0371160915451203, + "grad_norm": 1.3026654757368779, + "learning_rate": 4.341823334156702e-05, + "loss": 0.2697, + "step": 8746 + }, + { + "epoch": 1.0372346733072453, + "grad_norm": 1.1288654667301898, + "learning_rate": 4.341661016184982e-05, + "loss": 0.2789, + "step": 8747 + }, + { + "epoch": 1.0373532550693703, + "grad_norm": 0.8463236514993734, + "learning_rate": 4.341498681235406e-05, + "loss": 0.2149, + "step": 8748 + }, + { + "epoch": 1.0374718368314952, + "grad_norm": 1.2312868921741293, + "learning_rate": 4.34133632930947e-05, + "loss": 0.2762, + "step": 8749 + }, + { + "epoch": 1.0375904185936202, + "grad_norm": 1.308121474171523, + "learning_rate": 4.341173960408672e-05, + "loss": 0.258, + "step": 8750 + }, + { + "epoch": 1.0377090003557452, + "grad_norm": 1.6606574225528736, + "learning_rate": 4.341011574534507e-05, + "loss": 0.3799, + "step": 8751 + }, + { + "epoch": 1.0378275821178702, + "grad_norm": 1.0812082247667596, + "learning_rate": 4.340849171688473e-05, + "loss": 0.2239, + "step": 8752 + }, + { + "epoch": 1.0379461638799952, + "grad_norm": 0.964034702374784, + "learning_rate": 4.3406867518720675e-05, + "loss": 0.2049, + "step": 8753 + }, + { + "epoch": 1.0380647456421201, + "grad_norm": 1.714727204527804, + "learning_rate": 4.3405243150867864e-05, + "loss": 0.3433, + "step": 8754 + }, + { + "epoch": 1.0381833274042451, + "grad_norm": 1.2635026607600452, + "learning_rate": 4.340361861334129e-05, + "loss": 0.2785, + "step": 8755 + }, + { + "epoch": 1.03830190916637, + "grad_norm": 1.2343559142942158, + "learning_rate": 4.340199390615591e-05, + "loss": 0.2403, + "step": 8756 + }, + { + "epoch": 1.038420490928495, + "grad_norm": 1.2816855849696513, + "learning_rate": 4.340036902932673e-05, + "loss": 0.2704, + "step": 8757 + }, + { + "epoch": 1.0385390726906203, + "grad_norm": 0.987885643020447, + "learning_rate": 4.339874398286869e-05, + "loss": 0.2086, + "step": 8758 + }, + { + "epoch": 1.0386576544527453, + "grad_norm": 1.527245303444485, + "learning_rate": 4.339711876679681e-05, + "loss": 0.2944, + "step": 8759 + }, + { + "epoch": 1.0387762362148703, + "grad_norm": 1.4074646608389771, + "learning_rate": 4.339549338112605e-05, + "loss": 0.359, + "step": 8760 + }, + { + "epoch": 1.0388948179769952, + "grad_norm": 1.045346746673983, + "learning_rate": 4.3393867825871395e-05, + "loss": 0.2346, + "step": 8761 + }, + { + "epoch": 1.0390133997391202, + "grad_norm": 1.15375939389964, + "learning_rate": 4.3392242101047835e-05, + "loss": 0.2454, + "step": 8762 + }, + { + "epoch": 1.0391319815012452, + "grad_norm": 1.1677095748083433, + "learning_rate": 4.339061620667036e-05, + "loss": 0.2468, + "step": 8763 + }, + { + "epoch": 1.0392505632633702, + "grad_norm": 1.2720548306530906, + "learning_rate": 4.338899014275395e-05, + "loss": 0.2608, + "step": 8764 + }, + { + "epoch": 1.0393691450254952, + "grad_norm": 1.246176241739673, + "learning_rate": 4.33873639093136e-05, + "loss": 0.2598, + "step": 8765 + }, + { + "epoch": 1.0394877267876201, + "grad_norm": 1.1726450025804496, + "learning_rate": 4.338573750636432e-05, + "loss": 0.2873, + "step": 8766 + }, + { + "epoch": 1.0396063085497451, + "grad_norm": 1.3884145795692522, + "learning_rate": 4.3384110933921076e-05, + "loss": 0.3105, + "step": 8767 + }, + { + "epoch": 1.03972489031187, + "grad_norm": 1.160071445463676, + "learning_rate": 4.3382484191998865e-05, + "loss": 0.2186, + "step": 8768 + }, + { + "epoch": 1.039843472073995, + "grad_norm": 1.3139286227333213, + "learning_rate": 4.3380857280612704e-05, + "loss": 0.2862, + "step": 8769 + }, + { + "epoch": 1.03996205383612, + "grad_norm": 1.0447294964010192, + "learning_rate": 4.337923019977757e-05, + "loss": 0.1789, + "step": 8770 + }, + { + "epoch": 1.040080635598245, + "grad_norm": 2.2519352602999994, + "learning_rate": 4.337760294950848e-05, + "loss": 0.3521, + "step": 8771 + }, + { + "epoch": 1.04019921736037, + "grad_norm": 1.0381403781013068, + "learning_rate": 4.3375975529820414e-05, + "loss": 0.2436, + "step": 8772 + }, + { + "epoch": 1.040317799122495, + "grad_norm": 1.0012960772412955, + "learning_rate": 4.33743479407284e-05, + "loss": 0.2095, + "step": 8773 + }, + { + "epoch": 1.04043638088462, + "grad_norm": 1.520478511185997, + "learning_rate": 4.3372720182247426e-05, + "loss": 0.2404, + "step": 8774 + }, + { + "epoch": 1.040554962646745, + "grad_norm": 1.0465651929800464, + "learning_rate": 4.33710922543925e-05, + "loss": 0.1991, + "step": 8775 + }, + { + "epoch": 1.04067354440887, + "grad_norm": 0.8151450244121861, + "learning_rate": 4.336946415717864e-05, + "loss": 0.1673, + "step": 8776 + }, + { + "epoch": 1.040792126170995, + "grad_norm": 1.0194437005826082, + "learning_rate": 4.336783589062083e-05, + "loss": 0.2326, + "step": 8777 + }, + { + "epoch": 1.04091070793312, + "grad_norm": 1.0662929685560938, + "learning_rate": 4.3366207454734114e-05, + "loss": 0.2142, + "step": 8778 + }, + { + "epoch": 1.0410292896952449, + "grad_norm": 1.231154637162679, + "learning_rate": 4.336457884953348e-05, + "loss": 0.2079, + "step": 8779 + }, + { + "epoch": 1.0411478714573699, + "grad_norm": 0.9266943633340246, + "learning_rate": 4.336295007503395e-05, + "loss": 0.1922, + "step": 8780 + }, + { + "epoch": 1.0412664532194948, + "grad_norm": 1.024038553262166, + "learning_rate": 4.3361321131250545e-05, + "loss": 0.2087, + "step": 8781 + }, + { + "epoch": 1.0413850349816198, + "grad_norm": 1.3326645804226134, + "learning_rate": 4.335969201819826e-05, + "loss": 0.2493, + "step": 8782 + }, + { + "epoch": 1.0415036167437448, + "grad_norm": 1.0418875404568255, + "learning_rate": 4.335806273589214e-05, + "loss": 0.1913, + "step": 8783 + }, + { + "epoch": 1.0416221985058698, + "grad_norm": 1.3346171624502143, + "learning_rate": 4.335643328434719e-05, + "loss": 0.2546, + "step": 8784 + }, + { + "epoch": 1.0417407802679948, + "grad_norm": 1.328033259233724, + "learning_rate": 4.3354803663578433e-05, + "loss": 0.2373, + "step": 8785 + }, + { + "epoch": 1.0418593620301198, + "grad_norm": 1.121354638130676, + "learning_rate": 4.3353173873600904e-05, + "loss": 0.2081, + "step": 8786 + }, + { + "epoch": 1.0419779437922447, + "grad_norm": 1.1537376938316042, + "learning_rate": 4.3351543914429615e-05, + "loss": 0.2208, + "step": 8787 + }, + { + "epoch": 1.0420965255543697, + "grad_norm": 0.8725607853417957, + "learning_rate": 4.334991378607959e-05, + "loss": 0.1884, + "step": 8788 + }, + { + "epoch": 1.0422151073164947, + "grad_norm": 1.342452993715864, + "learning_rate": 4.3348283488565866e-05, + "loss": 0.2047, + "step": 8789 + }, + { + "epoch": 1.0423336890786197, + "grad_norm": 0.9416868486520811, + "learning_rate": 4.334665302190346e-05, + "loss": 0.2002, + "step": 8790 + }, + { + "epoch": 1.0424522708407447, + "grad_norm": 1.2038602348958505, + "learning_rate": 4.3345022386107426e-05, + "loss": 0.2792, + "step": 8791 + }, + { + "epoch": 1.0425708526028696, + "grad_norm": 1.437030084870201, + "learning_rate": 4.334339158119277e-05, + "loss": 0.2944, + "step": 8792 + }, + { + "epoch": 1.0426894343649946, + "grad_norm": 1.0618987990180877, + "learning_rate": 4.3341760607174544e-05, + "loss": 0.1933, + "step": 8793 + }, + { + "epoch": 1.0428080161271196, + "grad_norm": 1.4106723649317154, + "learning_rate": 4.334012946406777e-05, + "loss": 0.3085, + "step": 8794 + }, + { + "epoch": 1.0429265978892446, + "grad_norm": 2.4255884991628043, + "learning_rate": 4.3338498151887495e-05, + "loss": 0.4899, + "step": 8795 + }, + { + "epoch": 1.0430451796513696, + "grad_norm": 1.2560243748426505, + "learning_rate": 4.333686667064876e-05, + "loss": 0.3121, + "step": 8796 + }, + { + "epoch": 1.0431637614134945, + "grad_norm": 1.4073067198024598, + "learning_rate": 4.3335235020366596e-05, + "loss": 0.3117, + "step": 8797 + }, + { + "epoch": 1.0432823431756195, + "grad_norm": 1.2844028885794645, + "learning_rate": 4.333360320105605e-05, + "loss": 0.2589, + "step": 8798 + }, + { + "epoch": 1.0434009249377445, + "grad_norm": 1.2192694668117827, + "learning_rate": 4.333197121273217e-05, + "loss": 0.2239, + "step": 8799 + }, + { + "epoch": 1.0435195066998695, + "grad_norm": 0.9557466153046051, + "learning_rate": 4.333033905540999e-05, + "loss": 0.2309, + "step": 8800 + }, + { + "epoch": 1.0436380884619945, + "grad_norm": 1.2515369872502398, + "learning_rate": 4.332870672910456e-05, + "loss": 0.2328, + "step": 8801 + }, + { + "epoch": 1.0437566702241194, + "grad_norm": 1.0751337574664546, + "learning_rate": 4.3327074233830936e-05, + "loss": 0.2434, + "step": 8802 + }, + { + "epoch": 1.0438752519862444, + "grad_norm": 1.1733544926862942, + "learning_rate": 4.332544156960416e-05, + "loss": 0.2431, + "step": 8803 + }, + { + "epoch": 1.0439938337483694, + "grad_norm": 1.603492290251434, + "learning_rate": 4.3323808736439286e-05, + "loss": 0.3572, + "step": 8804 + }, + { + "epoch": 1.0441124155104944, + "grad_norm": 1.5965113525621177, + "learning_rate": 4.332217573435137e-05, + "loss": 0.3333, + "step": 8805 + }, + { + "epoch": 1.0442309972726194, + "grad_norm": 1.2903221023860152, + "learning_rate": 4.332054256335545e-05, + "loss": 0.2308, + "step": 8806 + }, + { + "epoch": 1.0443495790347446, + "grad_norm": 1.067358006662811, + "learning_rate": 4.33189092234666e-05, + "loss": 0.2849, + "step": 8807 + }, + { + "epoch": 1.0444681607968693, + "grad_norm": 1.2049656230679275, + "learning_rate": 4.331727571469987e-05, + "loss": 0.2301, + "step": 8808 + }, + { + "epoch": 1.0445867425589945, + "grad_norm": 1.3029317489105614, + "learning_rate": 4.331564203707032e-05, + "loss": 0.2835, + "step": 8809 + }, + { + "epoch": 1.0447053243211195, + "grad_norm": 1.1872153267628787, + "learning_rate": 4.331400819059301e-05, + "loss": 0.2829, + "step": 8810 + }, + { + "epoch": 1.0448239060832445, + "grad_norm": 1.075999343223569, + "learning_rate": 4.3312374175283004e-05, + "loss": 0.2398, + "step": 8811 + }, + { + "epoch": 1.0449424878453695, + "grad_norm": 0.9417120065820003, + "learning_rate": 4.3310739991155365e-05, + "loss": 0.2485, + "step": 8812 + }, + { + "epoch": 1.0450610696074945, + "grad_norm": 0.9161381571324625, + "learning_rate": 4.330910563822516e-05, + "loss": 0.2055, + "step": 8813 + }, + { + "epoch": 1.0451796513696194, + "grad_norm": 1.0111974834244464, + "learning_rate": 4.330747111650744e-05, + "loss": 0.2435, + "step": 8814 + }, + { + "epoch": 1.0452982331317444, + "grad_norm": 0.9821113880939647, + "learning_rate": 4.33058364260173e-05, + "loss": 0.2338, + "step": 8815 + }, + { + "epoch": 1.0454168148938694, + "grad_norm": 1.24870532998228, + "learning_rate": 4.330420156676979e-05, + "loss": 0.2424, + "step": 8816 + }, + { + "epoch": 1.0455353966559944, + "grad_norm": 1.363187185728759, + "learning_rate": 4.330256653877999e-05, + "loss": 0.2645, + "step": 8817 + }, + { + "epoch": 1.0456539784181194, + "grad_norm": 1.061073696883084, + "learning_rate": 4.330093134206297e-05, + "loss": 0.2196, + "step": 8818 + }, + { + "epoch": 1.0457725601802443, + "grad_norm": 1.1266140754123848, + "learning_rate": 4.32992959766338e-05, + "loss": 0.2903, + "step": 8819 + }, + { + "epoch": 1.0458911419423693, + "grad_norm": 1.1667155412209742, + "learning_rate": 4.329766044250757e-05, + "loss": 0.2576, + "step": 8820 + }, + { + "epoch": 1.0460097237044943, + "grad_norm": 1.288270297229462, + "learning_rate": 4.329602473969935e-05, + "loss": 0.2887, + "step": 8821 + }, + { + "epoch": 1.0461283054666193, + "grad_norm": 1.2296189744952115, + "learning_rate": 4.329438886822421e-05, + "loss": 0.26, + "step": 8822 + }, + { + "epoch": 1.0462468872287443, + "grad_norm": 1.3036444824644098, + "learning_rate": 4.3292752828097236e-05, + "loss": 0.2429, + "step": 8823 + }, + { + "epoch": 1.0463654689908692, + "grad_norm": 1.1042970330903388, + "learning_rate": 4.329111661933353e-05, + "loss": 0.2531, + "step": 8824 + }, + { + "epoch": 1.0464840507529942, + "grad_norm": 1.1349324656388533, + "learning_rate": 4.328948024194814e-05, + "loss": 0.2451, + "step": 8825 + }, + { + "epoch": 1.0466026325151192, + "grad_norm": 0.8544340971900954, + "learning_rate": 4.3287843695956185e-05, + "loss": 0.1866, + "step": 8826 + }, + { + "epoch": 1.0467212142772442, + "grad_norm": 1.0537107714389413, + "learning_rate": 4.328620698137272e-05, + "loss": 0.2267, + "step": 8827 + }, + { + "epoch": 1.0468397960393692, + "grad_norm": 1.8128610367506484, + "learning_rate": 4.328457009821286e-05, + "loss": 0.3897, + "step": 8828 + }, + { + "epoch": 1.0469583778014941, + "grad_norm": 1.2589400105622177, + "learning_rate": 4.328293304649169e-05, + "loss": 0.2679, + "step": 8829 + }, + { + "epoch": 1.0470769595636191, + "grad_norm": 1.2807313444723736, + "learning_rate": 4.32812958262243e-05, + "loss": 0.2581, + "step": 8830 + }, + { + "epoch": 1.047195541325744, + "grad_norm": 1.1296185928476177, + "learning_rate": 4.3279658437425775e-05, + "loss": 0.2015, + "step": 8831 + }, + { + "epoch": 1.047314123087869, + "grad_norm": 1.4441841270194349, + "learning_rate": 4.327802088011121e-05, + "loss": 0.2837, + "step": 8832 + }, + { + "epoch": 1.047432704849994, + "grad_norm": 1.150429917494715, + "learning_rate": 4.327638315429571e-05, + "loss": 0.2888, + "step": 8833 + }, + { + "epoch": 1.047551286612119, + "grad_norm": 1.400317009237685, + "learning_rate": 4.327474525999437e-05, + "loss": 0.3091, + "step": 8834 + }, + { + "epoch": 1.047669868374244, + "grad_norm": 1.3868193789651517, + "learning_rate": 4.327310719722229e-05, + "loss": 0.3058, + "step": 8835 + }, + { + "epoch": 1.047788450136369, + "grad_norm": 1.6883445466841218, + "learning_rate": 4.3271468965994564e-05, + "loss": 0.2998, + "step": 8836 + }, + { + "epoch": 1.047907031898494, + "grad_norm": 1.2884518325711596, + "learning_rate": 4.326983056632631e-05, + "loss": 0.2696, + "step": 8837 + }, + { + "epoch": 1.048025613660619, + "grad_norm": 1.2804090394545162, + "learning_rate": 4.326819199823261e-05, + "loss": 0.2649, + "step": 8838 + }, + { + "epoch": 1.048144195422744, + "grad_norm": 1.3449961276881859, + "learning_rate": 4.3266553261728595e-05, + "loss": 0.2686, + "step": 8839 + }, + { + "epoch": 1.048262777184869, + "grad_norm": 1.2266141106712674, + "learning_rate": 4.326491435682936e-05, + "loss": 0.2885, + "step": 8840 + }, + { + "epoch": 1.048381358946994, + "grad_norm": 1.3522164905710596, + "learning_rate": 4.326327528355e-05, + "loss": 0.3456, + "step": 8841 + }, + { + "epoch": 1.048499940709119, + "grad_norm": 1.2234743528214456, + "learning_rate": 4.326163604190564e-05, + "loss": 0.2582, + "step": 8842 + }, + { + "epoch": 1.0486185224712439, + "grad_norm": 0.881793714689452, + "learning_rate": 4.325999663191139e-05, + "loss": 0.2151, + "step": 8843 + }, + { + "epoch": 1.0487371042333689, + "grad_norm": 1.0167903365917756, + "learning_rate": 4.3258357053582364e-05, + "loss": 0.2123, + "step": 8844 + }, + { + "epoch": 1.0488556859954938, + "grad_norm": 1.0141325216494252, + "learning_rate": 4.325671730693368e-05, + "loss": 0.2233, + "step": 8845 + }, + { + "epoch": 1.0489742677576188, + "grad_norm": 1.1507496767609884, + "learning_rate": 4.325507739198045e-05, + "loss": 0.2524, + "step": 8846 + }, + { + "epoch": 1.0490928495197438, + "grad_norm": 1.1619724040131971, + "learning_rate": 4.3253437308737786e-05, + "loss": 0.2489, + "step": 8847 + }, + { + "epoch": 1.0492114312818688, + "grad_norm": 1.5593631672506056, + "learning_rate": 4.325179705722082e-05, + "loss": 0.3491, + "step": 8848 + }, + { + "epoch": 1.0493300130439938, + "grad_norm": 1.3662738111680175, + "learning_rate": 4.325015663744466e-05, + "loss": 0.2712, + "step": 8849 + }, + { + "epoch": 1.0494485948061187, + "grad_norm": 1.11739699865356, + "learning_rate": 4.324851604942445e-05, + "loss": 0.2472, + "step": 8850 + }, + { + "epoch": 1.0495671765682437, + "grad_norm": 0.8798053927196526, + "learning_rate": 4.324687529317528e-05, + "loss": 0.1874, + "step": 8851 + }, + { + "epoch": 1.0496857583303687, + "grad_norm": 1.4305799800818872, + "learning_rate": 4.324523436871231e-05, + "loss": 0.2831, + "step": 8852 + }, + { + "epoch": 1.0498043400924937, + "grad_norm": 1.454677101607734, + "learning_rate": 4.324359327605065e-05, + "loss": 0.3216, + "step": 8853 + }, + { + "epoch": 1.0499229218546187, + "grad_norm": 1.4454906560852592, + "learning_rate": 4.3241952015205426e-05, + "loss": 0.2819, + "step": 8854 + }, + { + "epoch": 1.0500415036167436, + "grad_norm": 1.2015396646799459, + "learning_rate": 4.324031058619178e-05, + "loss": 0.2573, + "step": 8855 + }, + { + "epoch": 1.0501600853788686, + "grad_norm": 0.9736835330095573, + "learning_rate": 4.323866898902483e-05, + "loss": 0.1928, + "step": 8856 + }, + { + "epoch": 1.0502786671409936, + "grad_norm": 1.031610897540319, + "learning_rate": 4.323702722371973e-05, + "loss": 0.2458, + "step": 8857 + }, + { + "epoch": 1.0503972489031188, + "grad_norm": 1.0462355883352463, + "learning_rate": 4.3235385290291596e-05, + "loss": 0.2368, + "step": 8858 + }, + { + "epoch": 1.0505158306652438, + "grad_norm": 1.1710986055152672, + "learning_rate": 4.323374318875557e-05, + "loss": 0.2579, + "step": 8859 + }, + { + "epoch": 1.0506344124273688, + "grad_norm": 1.066640261811951, + "learning_rate": 4.32321009191268e-05, + "loss": 0.2307, + "step": 8860 + }, + { + "epoch": 1.0507529941894937, + "grad_norm": 1.0752604307279687, + "learning_rate": 4.323045848142041e-05, + "loss": 0.2504, + "step": 8861 + }, + { + "epoch": 1.0508715759516187, + "grad_norm": 1.1638489414006856, + "learning_rate": 4.322881587565155e-05, + "loss": 0.1969, + "step": 8862 + }, + { + "epoch": 1.0509901577137437, + "grad_norm": 1.3483171204363438, + "learning_rate": 4.322717310183536e-05, + "loss": 0.2722, + "step": 8863 + }, + { + "epoch": 1.0511087394758687, + "grad_norm": 1.270579955871703, + "learning_rate": 4.3225530159986984e-05, + "loss": 0.2379, + "step": 8864 + }, + { + "epoch": 1.0512273212379937, + "grad_norm": 1.4308481473300818, + "learning_rate": 4.322388705012157e-05, + "loss": 0.3347, + "step": 8865 + }, + { + "epoch": 1.0513459030001187, + "grad_norm": 1.2329590279852176, + "learning_rate": 4.322224377225427e-05, + "loss": 0.2808, + "step": 8866 + }, + { + "epoch": 1.0514644847622436, + "grad_norm": 1.6365087366627815, + "learning_rate": 4.322060032640022e-05, + "loss": 0.3423, + "step": 8867 + }, + { + "epoch": 1.0515830665243686, + "grad_norm": 1.393620158262112, + "learning_rate": 4.3218956712574585e-05, + "loss": 0.3367, + "step": 8868 + }, + { + "epoch": 1.0517016482864936, + "grad_norm": 1.1915877328615943, + "learning_rate": 4.321731293079251e-05, + "loss": 0.2407, + "step": 8869 + }, + { + "epoch": 1.0518202300486186, + "grad_norm": 0.9902889047417804, + "learning_rate": 4.3215668981069155e-05, + "loss": 0.1838, + "step": 8870 + }, + { + "epoch": 1.0519388118107436, + "grad_norm": 1.3464064652896202, + "learning_rate": 4.3214024863419656e-05, + "loss": 0.2616, + "step": 8871 + }, + { + "epoch": 1.0520573935728685, + "grad_norm": 1.072765244046311, + "learning_rate": 4.321238057785919e-05, + "loss": 0.2378, + "step": 8872 + }, + { + "epoch": 1.0521759753349935, + "grad_norm": 1.1827099644413859, + "learning_rate": 4.3210736124402904e-05, + "loss": 0.2382, + "step": 8873 + }, + { + "epoch": 1.0522945570971185, + "grad_norm": 1.5199985647204575, + "learning_rate": 4.3209091503065974e-05, + "loss": 0.3294, + "step": 8874 + }, + { + "epoch": 1.0524131388592435, + "grad_norm": 1.784573543790307, + "learning_rate": 4.320744671386354e-05, + "loss": 0.441, + "step": 8875 + }, + { + "epoch": 1.0525317206213685, + "grad_norm": 1.4740787580854022, + "learning_rate": 4.320580175681078e-05, + "loss": 0.3039, + "step": 8876 + }, + { + "epoch": 1.0526503023834934, + "grad_norm": 0.9469816093936522, + "learning_rate": 4.3204156631922854e-05, + "loss": 0.2525, + "step": 8877 + }, + { + "epoch": 1.0527688841456184, + "grad_norm": 0.8960987478180966, + "learning_rate": 4.3202511339214925e-05, + "loss": 0.1686, + "step": 8878 + }, + { + "epoch": 1.0528874659077434, + "grad_norm": 1.0265006529984584, + "learning_rate": 4.320086587870216e-05, + "loss": 0.1584, + "step": 8879 + }, + { + "epoch": 1.0530060476698684, + "grad_norm": 1.1098375818949238, + "learning_rate": 4.319922025039973e-05, + "loss": 0.2569, + "step": 8880 + }, + { + "epoch": 1.0531246294319934, + "grad_norm": 1.1059638779722991, + "learning_rate": 4.319757445432281e-05, + "loss": 0.2536, + "step": 8881 + }, + { + "epoch": 1.0532432111941183, + "grad_norm": 1.720020472988096, + "learning_rate": 4.319592849048657e-05, + "loss": 0.3193, + "step": 8882 + }, + { + "epoch": 1.0533617929562433, + "grad_norm": 0.8792292719247136, + "learning_rate": 4.319428235890618e-05, + "loss": 0.1824, + "step": 8883 + }, + { + "epoch": 1.0534803747183683, + "grad_norm": 0.836266542622384, + "learning_rate": 4.319263605959683e-05, + "loss": 0.1576, + "step": 8884 + }, + { + "epoch": 1.0535989564804933, + "grad_norm": 1.111317085103575, + "learning_rate": 4.319098959257366e-05, + "loss": 0.281, + "step": 8885 + }, + { + "epoch": 1.0537175382426183, + "grad_norm": 1.181459652873555, + "learning_rate": 4.318934295785189e-05, + "loss": 0.2215, + "step": 8886 + }, + { + "epoch": 1.0538361200047432, + "grad_norm": 1.1938403572142715, + "learning_rate": 4.318769615544668e-05, + "loss": 0.2211, + "step": 8887 + }, + { + "epoch": 1.0539547017668682, + "grad_norm": 1.1544891816711473, + "learning_rate": 4.318604918537321e-05, + "loss": 0.1942, + "step": 8888 + }, + { + "epoch": 1.0540732835289932, + "grad_norm": 1.054571048726214, + "learning_rate": 4.318440204764668e-05, + "loss": 0.2406, + "step": 8889 + }, + { + "epoch": 1.0541918652911182, + "grad_norm": 1.1860359193342804, + "learning_rate": 4.3182754742282256e-05, + "loss": 0.2804, + "step": 8890 + }, + { + "epoch": 1.0543104470532432, + "grad_norm": 1.2779103613385185, + "learning_rate": 4.318110726929513e-05, + "loss": 0.2807, + "step": 8891 + }, + { + "epoch": 1.0544290288153682, + "grad_norm": 0.9972226531094392, + "learning_rate": 4.3179459628700485e-05, + "loss": 0.1994, + "step": 8892 + }, + { + "epoch": 1.0545476105774931, + "grad_norm": 1.4029352478770503, + "learning_rate": 4.317781182051353e-05, + "loss": 0.2824, + "step": 8893 + }, + { + "epoch": 1.0546661923396181, + "grad_norm": 1.4149659892357525, + "learning_rate": 4.317616384474942e-05, + "loss": 0.2862, + "step": 8894 + }, + { + "epoch": 1.054784774101743, + "grad_norm": 1.2544310738692328, + "learning_rate": 4.317451570142338e-05, + "loss": 0.2375, + "step": 8895 + }, + { + "epoch": 1.054903355863868, + "grad_norm": 1.1436078162441063, + "learning_rate": 4.31728673905506e-05, + "loss": 0.2226, + "step": 8896 + }, + { + "epoch": 1.055021937625993, + "grad_norm": 1.4329395612525961, + "learning_rate": 4.3171218912146254e-05, + "loss": 0.3414, + "step": 8897 + }, + { + "epoch": 1.055140519388118, + "grad_norm": 1.259362573078985, + "learning_rate": 4.316957026622556e-05, + "loss": 0.2341, + "step": 8898 + }, + { + "epoch": 1.055259101150243, + "grad_norm": 1.2978788058913768, + "learning_rate": 4.31679214528037e-05, + "loss": 0.2884, + "step": 8899 + }, + { + "epoch": 1.055377682912368, + "grad_norm": 1.0885381186198704, + "learning_rate": 4.3166272471895896e-05, + "loss": 0.2235, + "step": 8900 + }, + { + "epoch": 1.055496264674493, + "grad_norm": 1.316034831765526, + "learning_rate": 4.3164623323517336e-05, + "loss": 0.3118, + "step": 8901 + }, + { + "epoch": 1.055614846436618, + "grad_norm": 1.4534453300777241, + "learning_rate": 4.316297400768321e-05, + "loss": 0.2875, + "step": 8902 + }, + { + "epoch": 1.055733428198743, + "grad_norm": 1.2913230911341091, + "learning_rate": 4.316132452440875e-05, + "loss": 0.2314, + "step": 8903 + }, + { + "epoch": 1.055852009960868, + "grad_norm": 1.0389501436781414, + "learning_rate": 4.315967487370915e-05, + "loss": 0.1906, + "step": 8904 + }, + { + "epoch": 1.055970591722993, + "grad_norm": 1.2412164804531642, + "learning_rate": 4.3158025055599606e-05, + "loss": 0.2636, + "step": 8905 + }, + { + "epoch": 1.0560891734851179, + "grad_norm": 1.4873595366288763, + "learning_rate": 4.315637507009535e-05, + "loss": 0.3961, + "step": 8906 + }, + { + "epoch": 1.056207755247243, + "grad_norm": 1.123406053570726, + "learning_rate": 4.3154724917211566e-05, + "loss": 0.2129, + "step": 8907 + }, + { + "epoch": 1.056326337009368, + "grad_norm": 1.5617426636062754, + "learning_rate": 4.3153074596963486e-05, + "loss": 0.3703, + "step": 8908 + }, + { + "epoch": 1.056444918771493, + "grad_norm": 1.438374245803892, + "learning_rate": 4.315142410936632e-05, + "loss": 0.3027, + "step": 8909 + }, + { + "epoch": 1.056563500533618, + "grad_norm": 1.1159150963963393, + "learning_rate": 4.3149773454435295e-05, + "loss": 0.1945, + "step": 8910 + }, + { + "epoch": 1.056682082295743, + "grad_norm": 1.343367968291009, + "learning_rate": 4.31481226321856e-05, + "loss": 0.3615, + "step": 8911 + }, + { + "epoch": 1.056800664057868, + "grad_norm": 1.1729068056748642, + "learning_rate": 4.314647164263247e-05, + "loss": 0.2827, + "step": 8912 + }, + { + "epoch": 1.056919245819993, + "grad_norm": 1.6603533184132033, + "learning_rate": 4.3144820485791124e-05, + "loss": 0.339, + "step": 8913 + }, + { + "epoch": 1.057037827582118, + "grad_norm": 0.9710939359691912, + "learning_rate": 4.314316916167679e-05, + "loss": 0.2192, + "step": 8914 + }, + { + "epoch": 1.057156409344243, + "grad_norm": 1.0978209726910404, + "learning_rate": 4.3141517670304686e-05, + "loss": 0.2285, + "step": 8915 + }, + { + "epoch": 1.057274991106368, + "grad_norm": 1.0412556571995824, + "learning_rate": 4.3139866011690034e-05, + "loss": 0.2728, + "step": 8916 + }, + { + "epoch": 1.057393572868493, + "grad_norm": 1.2514926894578777, + "learning_rate": 4.313821418584806e-05, + "loss": 0.3034, + "step": 8917 + }, + { + "epoch": 1.0575121546306179, + "grad_norm": 1.079763254399404, + "learning_rate": 4.3136562192793995e-05, + "loss": 0.2283, + "step": 8918 + }, + { + "epoch": 1.0576307363927429, + "grad_norm": 1.1644936700450295, + "learning_rate": 4.313491003254306e-05, + "loss": 0.2714, + "step": 8919 + }, + { + "epoch": 1.0577493181548678, + "grad_norm": 1.4955168034119999, + "learning_rate": 4.31332577051105e-05, + "loss": 0.2799, + "step": 8920 + }, + { + "epoch": 1.0578678999169928, + "grad_norm": 1.4482151049037677, + "learning_rate": 4.313160521051154e-05, + "loss": 0.3154, + "step": 8921 + }, + { + "epoch": 1.0579864816791178, + "grad_norm": 0.9561902265401493, + "learning_rate": 4.312995254876141e-05, + "loss": 0.2089, + "step": 8922 + }, + { + "epoch": 1.0581050634412428, + "grad_norm": 1.2550428432264324, + "learning_rate": 4.312829971987536e-05, + "loss": 0.2357, + "step": 8923 + }, + { + "epoch": 1.0582236452033678, + "grad_norm": 1.1274592034914908, + "learning_rate": 4.312664672386861e-05, + "loss": 0.2621, + "step": 8924 + }, + { + "epoch": 1.0583422269654927, + "grad_norm": 1.0554154600257084, + "learning_rate": 4.31249935607564e-05, + "loss": 0.2087, + "step": 8925 + }, + { + "epoch": 1.0584608087276177, + "grad_norm": 1.3625757716577287, + "learning_rate": 4.312334023055399e-05, + "loss": 0.3026, + "step": 8926 + }, + { + "epoch": 1.0585793904897427, + "grad_norm": 1.296293967206312, + "learning_rate": 4.31216867332766e-05, + "loss": 0.2705, + "step": 8927 + }, + { + "epoch": 1.0586979722518677, + "grad_norm": 1.2736016172555966, + "learning_rate": 4.3120033068939474e-05, + "loss": 0.3028, + "step": 8928 + }, + { + "epoch": 1.0588165540139927, + "grad_norm": 1.1639038129477708, + "learning_rate": 4.311837923755787e-05, + "loss": 0.255, + "step": 8929 + }, + { + "epoch": 1.0589351357761176, + "grad_norm": 1.0060205868278775, + "learning_rate": 4.311672523914703e-05, + "loss": 0.214, + "step": 8930 + }, + { + "epoch": 1.0590537175382426, + "grad_norm": 0.8776638959645615, + "learning_rate": 4.311507107372219e-05, + "loss": 0.2223, + "step": 8931 + }, + { + "epoch": 1.0591722993003676, + "grad_norm": 1.0564195305153439, + "learning_rate": 4.311341674129862e-05, + "loss": 0.2338, + "step": 8932 + }, + { + "epoch": 1.0592908810624926, + "grad_norm": 1.1491375042693968, + "learning_rate": 4.311176224189155e-05, + "loss": 0.204, + "step": 8933 + }, + { + "epoch": 1.0594094628246176, + "grad_norm": 1.720202087651263, + "learning_rate": 4.311010757551625e-05, + "loss": 0.4948, + "step": 8934 + }, + { + "epoch": 1.0595280445867425, + "grad_norm": 1.0621428950579845, + "learning_rate": 4.3108452742187966e-05, + "loss": 0.2183, + "step": 8935 + }, + { + "epoch": 1.0596466263488675, + "grad_norm": 1.1162439123203791, + "learning_rate": 4.310679774192195e-05, + "loss": 0.2368, + "step": 8936 + }, + { + "epoch": 1.0597652081109925, + "grad_norm": 0.7739099401719993, + "learning_rate": 4.310514257473347e-05, + "loss": 0.1811, + "step": 8937 + }, + { + "epoch": 1.0598837898731175, + "grad_norm": 1.066524205596646, + "learning_rate": 4.310348724063776e-05, + "loss": 0.2337, + "step": 8938 + }, + { + "epoch": 1.0600023716352425, + "grad_norm": 1.4653809793886634, + "learning_rate": 4.3101831739650114e-05, + "loss": 0.3493, + "step": 8939 + }, + { + "epoch": 1.0601209533973674, + "grad_norm": 1.5810304447391754, + "learning_rate": 4.310017607178578e-05, + "loss": 0.3119, + "step": 8940 + }, + { + "epoch": 1.0602395351594924, + "grad_norm": 1.1318451697779535, + "learning_rate": 4.309852023706e-05, + "loss": 0.2284, + "step": 8941 + }, + { + "epoch": 1.0603581169216174, + "grad_norm": 0.9160378736782959, + "learning_rate": 4.3096864235488074e-05, + "loss": 0.2057, + "step": 8942 + }, + { + "epoch": 1.0604766986837424, + "grad_norm": 1.15808341029762, + "learning_rate": 4.309520806708525e-05, + "loss": 0.3078, + "step": 8943 + }, + { + "epoch": 1.0605952804458674, + "grad_norm": 1.3413889035162123, + "learning_rate": 4.3093551731866786e-05, + "loss": 0.286, + "step": 8944 + }, + { + "epoch": 1.0607138622079924, + "grad_norm": 0.7965623356903778, + "learning_rate": 4.3091895229847973e-05, + "loss": 0.1911, + "step": 8945 + }, + { + "epoch": 1.0608324439701173, + "grad_norm": 1.3946552556219765, + "learning_rate": 4.309023856104407e-05, + "loss": 0.357, + "step": 8946 + }, + { + "epoch": 1.0609510257322423, + "grad_norm": 1.1183816706160334, + "learning_rate": 4.3088581725470344e-05, + "loss": 0.2413, + "step": 8947 + }, + { + "epoch": 1.0610696074943673, + "grad_norm": 1.3686165939414234, + "learning_rate": 4.308692472314209e-05, + "loss": 0.2337, + "step": 8948 + }, + { + "epoch": 1.0611881892564923, + "grad_norm": 0.9217715080278231, + "learning_rate": 4.308526755407456e-05, + "loss": 0.2478, + "step": 8949 + }, + { + "epoch": 1.0613067710186173, + "grad_norm": 1.3718073212395634, + "learning_rate": 4.308361021828304e-05, + "loss": 0.2803, + "step": 8950 + }, + { + "epoch": 1.0614253527807422, + "grad_norm": 1.229293830563055, + "learning_rate": 4.3081952715782814e-05, + "loss": 0.2515, + "step": 8951 + }, + { + "epoch": 1.0615439345428672, + "grad_norm": 0.9984651829924758, + "learning_rate": 4.308029504658915e-05, + "loss": 0.287, + "step": 8952 + }, + { + "epoch": 1.0616625163049922, + "grad_norm": 1.119880996453459, + "learning_rate": 4.307863721071734e-05, + "loss": 0.2202, + "step": 8953 + }, + { + "epoch": 1.0617810980671172, + "grad_norm": 1.2831647967365851, + "learning_rate": 4.307697920818268e-05, + "loss": 0.221, + "step": 8954 + }, + { + "epoch": 1.0618996798292422, + "grad_norm": 1.1470940780362604, + "learning_rate": 4.307532103900042e-05, + "loss": 0.2104, + "step": 8955 + }, + { + "epoch": 1.0620182615913671, + "grad_norm": 1.1047220487826643, + "learning_rate": 4.307366270318587e-05, + "loss": 0.1999, + "step": 8956 + }, + { + "epoch": 1.0621368433534921, + "grad_norm": 1.1045817095076218, + "learning_rate": 4.307200420075431e-05, + "loss": 0.2959, + "step": 8957 + }, + { + "epoch": 1.0622554251156173, + "grad_norm": 1.2946283968846024, + "learning_rate": 4.3070345531721046e-05, + "loss": 0.26, + "step": 8958 + }, + { + "epoch": 1.0623740068777423, + "grad_norm": 1.0736141605270482, + "learning_rate": 4.3068686696101344e-05, + "loss": 0.2717, + "step": 8959 + }, + { + "epoch": 1.0624925886398673, + "grad_norm": 0.8447187597186173, + "learning_rate": 4.306702769391051e-05, + "loss": 0.1633, + "step": 8960 + }, + { + "epoch": 1.0626111704019923, + "grad_norm": 1.2851040672241327, + "learning_rate": 4.306536852516384e-05, + "loss": 0.2848, + "step": 8961 + }, + { + "epoch": 1.0627297521641172, + "grad_norm": 1.3266376397554454, + "learning_rate": 4.306370918987662e-05, + "loss": 0.2524, + "step": 8962 + }, + { + "epoch": 1.0628483339262422, + "grad_norm": 1.3851189973140525, + "learning_rate": 4.306204968806415e-05, + "loss": 0.2685, + "step": 8963 + }, + { + "epoch": 1.0629669156883672, + "grad_norm": 1.517439237185437, + "learning_rate": 4.306039001974174e-05, + "loss": 0.3562, + "step": 8964 + }, + { + "epoch": 1.0630854974504922, + "grad_norm": 1.036291773794552, + "learning_rate": 4.305873018492468e-05, + "loss": 0.2132, + "step": 8965 + }, + { + "epoch": 1.0632040792126172, + "grad_norm": 1.761000048706805, + "learning_rate": 4.305707018362827e-05, + "loss": 0.2972, + "step": 8966 + }, + { + "epoch": 1.0633226609747422, + "grad_norm": 1.009252824133555, + "learning_rate": 4.3055410015867816e-05, + "loss": 0.1978, + "step": 8967 + }, + { + "epoch": 1.0634412427368671, + "grad_norm": 1.150959082696007, + "learning_rate": 4.3053749681658626e-05, + "loss": 0.3112, + "step": 8968 + }, + { + "epoch": 1.0635598244989921, + "grad_norm": 1.3942882394484684, + "learning_rate": 4.3052089181016e-05, + "loss": 0.3407, + "step": 8969 + }, + { + "epoch": 1.063678406261117, + "grad_norm": 1.3836221604775842, + "learning_rate": 4.3050428513955246e-05, + "loss": 0.2642, + "step": 8970 + }, + { + "epoch": 1.063796988023242, + "grad_norm": 1.0041272823612306, + "learning_rate": 4.304876768049168e-05, + "loss": 0.1541, + "step": 8971 + }, + { + "epoch": 1.063915569785367, + "grad_norm": 1.518318407834466, + "learning_rate": 4.30471066806406e-05, + "loss": 0.2788, + "step": 8972 + }, + { + "epoch": 1.064034151547492, + "grad_norm": 0.979473770560935, + "learning_rate": 4.3045445514417335e-05, + "loss": 0.1912, + "step": 8973 + }, + { + "epoch": 1.064152733309617, + "grad_norm": 1.2893948876148587, + "learning_rate": 4.304378418183719e-05, + "loss": 0.3314, + "step": 8974 + }, + { + "epoch": 1.064271315071742, + "grad_norm": 1.2384475259864083, + "learning_rate": 4.304212268291549e-05, + "loss": 0.2289, + "step": 8975 + }, + { + "epoch": 1.064389896833867, + "grad_norm": 1.281637207632504, + "learning_rate": 4.304046101766753e-05, + "loss": 0.3511, + "step": 8976 + }, + { + "epoch": 1.064508478595992, + "grad_norm": 1.1446901988744174, + "learning_rate": 4.3038799186108645e-05, + "loss": 0.261, + "step": 8977 + }, + { + "epoch": 1.064627060358117, + "grad_norm": 0.9719391537309511, + "learning_rate": 4.303713718825416e-05, + "loss": 0.2018, + "step": 8978 + }, + { + "epoch": 1.064745642120242, + "grad_norm": 1.3742015684465356, + "learning_rate": 4.3035475024119375e-05, + "loss": 0.2756, + "step": 8979 + }, + { + "epoch": 1.064864223882367, + "grad_norm": 1.3608032787627595, + "learning_rate": 4.3033812693719646e-05, + "loss": 0.2581, + "step": 8980 + }, + { + "epoch": 1.0649828056444919, + "grad_norm": 1.2074513873318233, + "learning_rate": 4.303215019707026e-05, + "loss": 0.2456, + "step": 8981 + }, + { + "epoch": 1.0651013874066169, + "grad_norm": 1.3830806370005657, + "learning_rate": 4.303048753418657e-05, + "loss": 0.2876, + "step": 8982 + }, + { + "epoch": 1.0652199691687418, + "grad_norm": 1.2143586582734758, + "learning_rate": 4.302882470508389e-05, + "loss": 0.269, + "step": 8983 + }, + { + "epoch": 1.0653385509308668, + "grad_norm": 1.2168797711966175, + "learning_rate": 4.302716170977756e-05, + "loss": 0.2994, + "step": 8984 + }, + { + "epoch": 1.0654571326929918, + "grad_norm": 1.1437484886528926, + "learning_rate": 4.302549854828291e-05, + "loss": 0.273, + "step": 8985 + }, + { + "epoch": 1.0655757144551168, + "grad_norm": 0.9078382308545851, + "learning_rate": 4.302383522061526e-05, + "loss": 0.1879, + "step": 8986 + }, + { + "epoch": 1.0656942962172418, + "grad_norm": 0.8795419409066834, + "learning_rate": 4.302217172678995e-05, + "loss": 0.1623, + "step": 8987 + }, + { + "epoch": 1.0658128779793667, + "grad_norm": 1.355713394107112, + "learning_rate": 4.302050806682233e-05, + "loss": 0.3057, + "step": 8988 + }, + { + "epoch": 1.0659314597414917, + "grad_norm": 1.0960067785588234, + "learning_rate": 4.301884424072771e-05, + "loss": 0.2258, + "step": 8989 + }, + { + "epoch": 1.0660500415036167, + "grad_norm": 1.32699973326967, + "learning_rate": 4.301718024852145e-05, + "loss": 0.2873, + "step": 8990 + }, + { + "epoch": 1.0661686232657417, + "grad_norm": 1.092241496650955, + "learning_rate": 4.3015516090218874e-05, + "loss": 0.2718, + "step": 8991 + }, + { + "epoch": 1.0662872050278667, + "grad_norm": 1.1706829987355307, + "learning_rate": 4.3013851765835336e-05, + "loss": 0.2231, + "step": 8992 + }, + { + "epoch": 1.0664057867899916, + "grad_norm": 0.9454342360156471, + "learning_rate": 4.301218727538618e-05, + "loss": 0.2196, + "step": 8993 + }, + { + "epoch": 1.0665243685521166, + "grad_norm": 1.1967164930367047, + "learning_rate": 4.301052261888674e-05, + "loss": 0.1941, + "step": 8994 + }, + { + "epoch": 1.0666429503142416, + "grad_norm": 1.2763947562754276, + "learning_rate": 4.300885779635237e-05, + "loss": 0.3452, + "step": 8995 + }, + { + "epoch": 1.0667615320763666, + "grad_norm": 1.1591706203728867, + "learning_rate": 4.300719280779841e-05, + "loss": 0.2402, + "step": 8996 + }, + { + "epoch": 1.0668801138384916, + "grad_norm": 1.8672851178921273, + "learning_rate": 4.300552765324022e-05, + "loss": 0.3687, + "step": 8997 + }, + { + "epoch": 1.0669986956006166, + "grad_norm": 1.3462986581049743, + "learning_rate": 4.300386233269315e-05, + "loss": 0.3041, + "step": 8998 + }, + { + "epoch": 1.0671172773627415, + "grad_norm": 1.2106621143018468, + "learning_rate": 4.300219684617253e-05, + "loss": 0.244, + "step": 8999 + }, + { + "epoch": 1.0672358591248665, + "grad_norm": 1.4677575036285666, + "learning_rate": 4.300053119369374e-05, + "loss": 0.3419, + "step": 9000 + }, + { + "epoch": 1.0673544408869915, + "grad_norm": 0.9462512514032652, + "learning_rate": 4.299886537527213e-05, + "loss": 0.1823, + "step": 9001 + }, + { + "epoch": 1.0674730226491165, + "grad_norm": 1.3269085245164685, + "learning_rate": 4.299719939092305e-05, + "loss": 0.2497, + "step": 9002 + }, + { + "epoch": 1.0675916044112415, + "grad_norm": 1.339687899300843, + "learning_rate": 4.2995533240661856e-05, + "loss": 0.2999, + "step": 9003 + }, + { + "epoch": 1.0677101861733664, + "grad_norm": 1.0578807432046256, + "learning_rate": 4.2993866924503924e-05, + "loss": 0.1824, + "step": 9004 + }, + { + "epoch": 1.0678287679354914, + "grad_norm": 1.1039892393448962, + "learning_rate": 4.29922004424646e-05, + "loss": 0.2371, + "step": 9005 + }, + { + "epoch": 1.0679473496976164, + "grad_norm": 1.1791841363515745, + "learning_rate": 4.299053379455925e-05, + "loss": 0.272, + "step": 9006 + }, + { + "epoch": 1.0680659314597416, + "grad_norm": 1.2211231411830847, + "learning_rate": 4.298886698080323e-05, + "loss": 0.2399, + "step": 9007 + }, + { + "epoch": 1.0681845132218664, + "grad_norm": 1.3593767532217618, + "learning_rate": 4.2987200001211935e-05, + "loss": 0.3127, + "step": 9008 + }, + { + "epoch": 1.0683030949839916, + "grad_norm": 1.1826771488184598, + "learning_rate": 4.2985532855800705e-05, + "loss": 0.3167, + "step": 9009 + }, + { + "epoch": 1.0684216767461165, + "grad_norm": 1.2477171845014305, + "learning_rate": 4.298386554458491e-05, + "loss": 0.2417, + "step": 9010 + }, + { + "epoch": 1.0685402585082415, + "grad_norm": 1.065942137851997, + "learning_rate": 4.298219806757994e-05, + "loss": 0.281, + "step": 9011 + }, + { + "epoch": 1.0686588402703665, + "grad_norm": 1.2075585359525516, + "learning_rate": 4.2980530424801146e-05, + "loss": 0.2582, + "step": 9012 + }, + { + "epoch": 1.0687774220324915, + "grad_norm": 0.9711309583236689, + "learning_rate": 4.2978862616263915e-05, + "loss": 0.2643, + "step": 9013 + }, + { + "epoch": 1.0688960037946165, + "grad_norm": 1.7254556682709705, + "learning_rate": 4.2977194641983614e-05, + "loss": 0.475, + "step": 9014 + }, + { + "epoch": 1.0690145855567414, + "grad_norm": 1.0788452736813126, + "learning_rate": 4.2975526501975624e-05, + "loss": 0.2638, + "step": 9015 + }, + { + "epoch": 1.0691331673188664, + "grad_norm": 1.0433752002193102, + "learning_rate": 4.297385819625533e-05, + "loss": 0.2589, + "step": 9016 + }, + { + "epoch": 1.0692517490809914, + "grad_norm": 1.2275920345196518, + "learning_rate": 4.2972189724838095e-05, + "loss": 0.2631, + "step": 9017 + }, + { + "epoch": 1.0693703308431164, + "grad_norm": 1.146696780834424, + "learning_rate": 4.2970521087739314e-05, + "loss": 0.2788, + "step": 9018 + }, + { + "epoch": 1.0694889126052414, + "grad_norm": 1.1909728252723029, + "learning_rate": 4.296885228497437e-05, + "loss": 0.3278, + "step": 9019 + }, + { + "epoch": 1.0696074943673664, + "grad_norm": 1.262996716676593, + "learning_rate": 4.296718331655863e-05, + "loss": 0.2283, + "step": 9020 + }, + { + "epoch": 1.0697260761294913, + "grad_norm": 1.2604578148124501, + "learning_rate": 4.2965514182507505e-05, + "loss": 0.2741, + "step": 9021 + }, + { + "epoch": 1.0698446578916163, + "grad_norm": 1.6119046725692376, + "learning_rate": 4.296384488283637e-05, + "loss": 0.4329, + "step": 9022 + }, + { + "epoch": 1.0699632396537413, + "grad_norm": 1.4748898571930285, + "learning_rate": 4.296217541756061e-05, + "loss": 0.2737, + "step": 9023 + }, + { + "epoch": 1.0700818214158663, + "grad_norm": 0.9677703894960874, + "learning_rate": 4.2960505786695616e-05, + "loss": 0.1868, + "step": 9024 + }, + { + "epoch": 1.0702004031779913, + "grad_norm": 0.9897652041647286, + "learning_rate": 4.295883599025679e-05, + "loss": 0.2071, + "step": 9025 + }, + { + "epoch": 1.0703189849401162, + "grad_norm": 1.4089377201804612, + "learning_rate": 4.295716602825951e-05, + "loss": 0.2608, + "step": 9026 + }, + { + "epoch": 1.0704375667022412, + "grad_norm": 1.3841278894862825, + "learning_rate": 4.295549590071919e-05, + "loss": 0.3057, + "step": 9027 + }, + { + "epoch": 1.0705561484643662, + "grad_norm": 1.0201634744279666, + "learning_rate": 4.295382560765121e-05, + "loss": 0.1894, + "step": 9028 + }, + { + "epoch": 1.0706747302264912, + "grad_norm": 1.2110874357248926, + "learning_rate": 4.295215514907097e-05, + "loss": 0.2433, + "step": 9029 + }, + { + "epoch": 1.0707933119886162, + "grad_norm": 1.337181480885989, + "learning_rate": 4.2950484524993885e-05, + "loss": 0.309, + "step": 9030 + }, + { + "epoch": 1.0709118937507411, + "grad_norm": 1.420344963179303, + "learning_rate": 4.294881373543533e-05, + "loss": 0.3151, + "step": 9031 + }, + { + "epoch": 1.0710304755128661, + "grad_norm": 1.3460807564752804, + "learning_rate": 4.2947142780410746e-05, + "loss": 0.28, + "step": 9032 + }, + { + "epoch": 1.071149057274991, + "grad_norm": 1.4409974752364296, + "learning_rate": 4.29454716599355e-05, + "loss": 0.2758, + "step": 9033 + }, + { + "epoch": 1.071267639037116, + "grad_norm": 1.0416911648245097, + "learning_rate": 4.2943800374025e-05, + "loss": 0.2346, + "step": 9034 + }, + { + "epoch": 1.071386220799241, + "grad_norm": 1.318664041998203, + "learning_rate": 4.294212892269468e-05, + "loss": 0.2875, + "step": 9035 + }, + { + "epoch": 1.071504802561366, + "grad_norm": 1.1107026645148796, + "learning_rate": 4.294045730595993e-05, + "loss": 0.1974, + "step": 9036 + }, + { + "epoch": 1.071623384323491, + "grad_norm": 1.4585842315934443, + "learning_rate": 4.2938785523836164e-05, + "loss": 0.3108, + "step": 9037 + }, + { + "epoch": 1.071741966085616, + "grad_norm": 1.0238285458899163, + "learning_rate": 4.2937113576338796e-05, + "loss": 0.2193, + "step": 9038 + }, + { + "epoch": 1.071860547847741, + "grad_norm": 1.2088399274242076, + "learning_rate": 4.293544146348323e-05, + "loss": 0.2737, + "step": 9039 + }, + { + "epoch": 1.071979129609866, + "grad_norm": 0.970123567436288, + "learning_rate": 4.2933769185284894e-05, + "loss": 0.2311, + "step": 9040 + }, + { + "epoch": 1.072097711371991, + "grad_norm": 1.277235530249087, + "learning_rate": 4.293209674175919e-05, + "loss": 0.2831, + "step": 9041 + }, + { + "epoch": 1.072216293134116, + "grad_norm": 1.2274878147646704, + "learning_rate": 4.293042413292156e-05, + "loss": 0.2733, + "step": 9042 + }, + { + "epoch": 1.072334874896241, + "grad_norm": 0.9828680729303578, + "learning_rate": 4.292875135878739e-05, + "loss": 0.2253, + "step": 9043 + }, + { + "epoch": 1.072453456658366, + "grad_norm": 1.2091115868605609, + "learning_rate": 4.292707841937212e-05, + "loss": 0.2135, + "step": 9044 + }, + { + "epoch": 1.0725720384204909, + "grad_norm": 0.9443778743518986, + "learning_rate": 4.292540531469118e-05, + "loss": 0.1519, + "step": 9045 + }, + { + "epoch": 1.0726906201826159, + "grad_norm": 1.0735978982149512, + "learning_rate": 4.292373204475998e-05, + "loss": 0.226, + "step": 9046 + }, + { + "epoch": 1.0728092019447408, + "grad_norm": 1.0355469928970396, + "learning_rate": 4.2922058609593944e-05, + "loss": 0.2394, + "step": 9047 + }, + { + "epoch": 1.0729277837068658, + "grad_norm": 1.466589834932478, + "learning_rate": 4.292038500920851e-05, + "loss": 0.3803, + "step": 9048 + }, + { + "epoch": 1.0730463654689908, + "grad_norm": 1.266371288808464, + "learning_rate": 4.2918711243619106e-05, + "loss": 0.2768, + "step": 9049 + }, + { + "epoch": 1.0731649472311158, + "grad_norm": 1.0677907120441679, + "learning_rate": 4.291703731284116e-05, + "loss": 0.2027, + "step": 9050 + }, + { + "epoch": 1.0732835289932408, + "grad_norm": 0.9783059391947715, + "learning_rate": 4.291536321689009e-05, + "loss": 0.2359, + "step": 9051 + }, + { + "epoch": 1.0734021107553657, + "grad_norm": 1.2089250305357704, + "learning_rate": 4.291368895578134e-05, + "loss": 0.2077, + "step": 9052 + }, + { + "epoch": 1.0735206925174907, + "grad_norm": 1.189631386197382, + "learning_rate": 4.2912014529530355e-05, + "loss": 0.2602, + "step": 9053 + }, + { + "epoch": 1.0736392742796157, + "grad_norm": 1.159821925012147, + "learning_rate": 4.291033993815255e-05, + "loss": 0.2753, + "step": 9054 + }, + { + "epoch": 1.0737578560417407, + "grad_norm": 1.3166078600238478, + "learning_rate": 4.290866518166339e-05, + "loss": 0.2946, + "step": 9055 + }, + { + "epoch": 1.0738764378038659, + "grad_norm": 1.2401311408483102, + "learning_rate": 4.290699026007828e-05, + "loss": 0.3629, + "step": 9056 + }, + { + "epoch": 1.0739950195659906, + "grad_norm": 1.3273181340018039, + "learning_rate": 4.2905315173412694e-05, + "loss": 0.3066, + "step": 9057 + }, + { + "epoch": 1.0741136013281158, + "grad_norm": 0.9131526394426734, + "learning_rate": 4.290363992168205e-05, + "loss": 0.164, + "step": 9058 + }, + { + "epoch": 1.0742321830902408, + "grad_norm": 1.1052317584105726, + "learning_rate": 4.29019645049018e-05, + "loss": 0.206, + "step": 9059 + }, + { + "epoch": 1.0743507648523658, + "grad_norm": 1.3106412631445592, + "learning_rate": 4.29002889230874e-05, + "loss": 0.2342, + "step": 9060 + }, + { + "epoch": 1.0744693466144908, + "grad_norm": 1.2825405808535364, + "learning_rate": 4.289861317625427e-05, + "loss": 0.2902, + "step": 9061 + }, + { + "epoch": 1.0745879283766158, + "grad_norm": 1.0182269493619722, + "learning_rate": 4.289693726441789e-05, + "loss": 0.219, + "step": 9062 + }, + { + "epoch": 1.0747065101387407, + "grad_norm": 1.4575696016292132, + "learning_rate": 4.28952611875937e-05, + "loss": 0.3334, + "step": 9063 + }, + { + "epoch": 1.0748250919008657, + "grad_norm": 0.9844982821095701, + "learning_rate": 4.289358494579714e-05, + "loss": 0.1773, + "step": 9064 + }, + { + "epoch": 1.0749436736629907, + "grad_norm": 1.1053084135359166, + "learning_rate": 4.289190853904366e-05, + "loss": 0.253, + "step": 9065 + }, + { + "epoch": 1.0750622554251157, + "grad_norm": 1.036669505544453, + "learning_rate": 4.289023196734873e-05, + "loss": 0.2321, + "step": 9066 + }, + { + "epoch": 1.0751808371872407, + "grad_norm": 1.4857114172216324, + "learning_rate": 4.28885552307278e-05, + "loss": 0.3441, + "step": 9067 + }, + { + "epoch": 1.0752994189493656, + "grad_norm": 1.1418084859174489, + "learning_rate": 4.288687832919633e-05, + "loss": 0.2302, + "step": 9068 + }, + { + "epoch": 1.0754180007114906, + "grad_norm": 0.848256405668171, + "learning_rate": 4.288520126276977e-05, + "loss": 0.1786, + "step": 9069 + }, + { + "epoch": 1.0755365824736156, + "grad_norm": 1.106485565416263, + "learning_rate": 4.28835240314636e-05, + "loss": 0.2467, + "step": 9070 + }, + { + "epoch": 1.0756551642357406, + "grad_norm": 0.8534562842799192, + "learning_rate": 4.288184663529325e-05, + "loss": 0.1805, + "step": 9071 + }, + { + "epoch": 1.0757737459978656, + "grad_norm": 0.986074766616734, + "learning_rate": 4.288016907427421e-05, + "loss": 0.1752, + "step": 9072 + }, + { + "epoch": 1.0758923277599906, + "grad_norm": 1.5563381240086647, + "learning_rate": 4.287849134842194e-05, + "loss": 0.2918, + "step": 9073 + }, + { + "epoch": 1.0760109095221155, + "grad_norm": 1.2825886543748068, + "learning_rate": 4.2876813457751904e-05, + "loss": 0.3083, + "step": 9074 + }, + { + "epoch": 1.0761294912842405, + "grad_norm": 1.2646774288141014, + "learning_rate": 4.287513540227956e-05, + "loss": 0.2603, + "step": 9075 + }, + { + "epoch": 1.0762480730463655, + "grad_norm": 1.5849857832828784, + "learning_rate": 4.2873457182020405e-05, + "loss": 0.309, + "step": 9076 + }, + { + "epoch": 1.0763666548084905, + "grad_norm": 1.1444843842234906, + "learning_rate": 4.287177879698988e-05, + "loss": 0.2511, + "step": 9077 + }, + { + "epoch": 1.0764852365706155, + "grad_norm": 1.015343810650589, + "learning_rate": 4.287010024720347e-05, + "loss": 0.1957, + "step": 9078 + }, + { + "epoch": 1.0766038183327404, + "grad_norm": 1.2639319711716797, + "learning_rate": 4.286842153267666e-05, + "loss": 0.2812, + "step": 9079 + }, + { + "epoch": 1.0767224000948654, + "grad_norm": 0.9139499707734923, + "learning_rate": 4.286674265342491e-05, + "loss": 0.2612, + "step": 9080 + }, + { + "epoch": 1.0768409818569904, + "grad_norm": 1.3629518507191536, + "learning_rate": 4.2865063609463695e-05, + "loss": 0.2624, + "step": 9081 + }, + { + "epoch": 1.0769595636191154, + "grad_norm": 1.0527014830219137, + "learning_rate": 4.2863384400808506e-05, + "loss": 0.1867, + "step": 9082 + }, + { + "epoch": 1.0770781453812404, + "grad_norm": 1.4530877707706533, + "learning_rate": 4.286170502747482e-05, + "loss": 0.2993, + "step": 9083 + }, + { + "epoch": 1.0771967271433653, + "grad_norm": 1.2894894112699467, + "learning_rate": 4.2860025489478115e-05, + "loss": 0.2551, + "step": 9084 + }, + { + "epoch": 1.0773153089054903, + "grad_norm": 1.2307716213749689, + "learning_rate": 4.2858345786833874e-05, + "loss": 0.2915, + "step": 9085 + }, + { + "epoch": 1.0774338906676153, + "grad_norm": 1.50179134210579, + "learning_rate": 4.285666591955759e-05, + "loss": 0.3032, + "step": 9086 + }, + { + "epoch": 1.0775524724297403, + "grad_norm": 1.114447449720133, + "learning_rate": 4.2854985887664746e-05, + "loss": 0.24, + "step": 9087 + }, + { + "epoch": 1.0776710541918653, + "grad_norm": 0.9870700067468474, + "learning_rate": 4.285330569117082e-05, + "loss": 0.1597, + "step": 9088 + }, + { + "epoch": 1.0777896359539902, + "grad_norm": 1.0596746168299818, + "learning_rate": 4.285162533009131e-05, + "loss": 0.2242, + "step": 9089 + }, + { + "epoch": 1.0779082177161152, + "grad_norm": 0.8406095922616023, + "learning_rate": 4.284994480444171e-05, + "loss": 0.1847, + "step": 9090 + }, + { + "epoch": 1.0780267994782402, + "grad_norm": 1.0345510340878588, + "learning_rate": 4.28482641142375e-05, + "loss": 0.2179, + "step": 9091 + }, + { + "epoch": 1.0781453812403652, + "grad_norm": 0.975461620638216, + "learning_rate": 4.2846583259494186e-05, + "loss": 0.2105, + "step": 9092 + }, + { + "epoch": 1.0782639630024902, + "grad_norm": 1.1048821905712536, + "learning_rate": 4.2844902240227264e-05, + "loss": 0.2554, + "step": 9093 + }, + { + "epoch": 1.0783825447646151, + "grad_norm": 1.0167333207986267, + "learning_rate": 4.284322105645222e-05, + "loss": 0.2359, + "step": 9094 + }, + { + "epoch": 1.0785011265267401, + "grad_norm": 1.2860619685496621, + "learning_rate": 4.284153970818456e-05, + "loss": 0.2611, + "step": 9095 + }, + { + "epoch": 1.078619708288865, + "grad_norm": 1.1089967922582202, + "learning_rate": 4.283985819543979e-05, + "loss": 0.2274, + "step": 9096 + }, + { + "epoch": 1.07873829005099, + "grad_norm": 1.2213558658432468, + "learning_rate": 4.28381765182334e-05, + "loss": 0.219, + "step": 9097 + }, + { + "epoch": 1.078856871813115, + "grad_norm": 1.201846758472491, + "learning_rate": 4.283649467658089e-05, + "loss": 0.3088, + "step": 9098 + }, + { + "epoch": 1.07897545357524, + "grad_norm": 1.1980682166810548, + "learning_rate": 4.2834812670497774e-05, + "loss": 0.2464, + "step": 9099 + }, + { + "epoch": 1.079094035337365, + "grad_norm": 1.1537468838766292, + "learning_rate": 4.283313049999956e-05, + "loss": 0.2556, + "step": 9100 + }, + { + "epoch": 1.07921261709949, + "grad_norm": 1.2103706594429784, + "learning_rate": 4.2831448165101754e-05, + "loss": 0.2641, + "step": 9101 + }, + { + "epoch": 1.079331198861615, + "grad_norm": 1.7141584244553743, + "learning_rate": 4.2829765665819856e-05, + "loss": 0.3217, + "step": 9102 + }, + { + "epoch": 1.07944978062374, + "grad_norm": 1.4181461952332843, + "learning_rate": 4.2828083002169384e-05, + "loss": 0.3241, + "step": 9103 + }, + { + "epoch": 1.079568362385865, + "grad_norm": 1.6922053572592974, + "learning_rate": 4.282640017416585e-05, + "loss": 0.3536, + "step": 9104 + }, + { + "epoch": 1.07968694414799, + "grad_norm": 0.9957961595399599, + "learning_rate": 4.282471718182476e-05, + "loss": 0.1937, + "step": 9105 + }, + { + "epoch": 1.079805525910115, + "grad_norm": 1.1275301425599134, + "learning_rate": 4.282303402516165e-05, + "loss": 0.2115, + "step": 9106 + }, + { + "epoch": 1.0799241076722401, + "grad_norm": 1.5309215131145724, + "learning_rate": 4.282135070419201e-05, + "loss": 0.3131, + "step": 9107 + }, + { + "epoch": 1.0800426894343649, + "grad_norm": 1.2793855889529975, + "learning_rate": 4.281966721893137e-05, + "loss": 0.2787, + "step": 9108 + }, + { + "epoch": 1.08016127119649, + "grad_norm": 1.05056530809754, + "learning_rate": 4.281798356939526e-05, + "loss": 0.2119, + "step": 9109 + }, + { + "epoch": 1.080279852958615, + "grad_norm": 0.951603772277236, + "learning_rate": 4.281629975559918e-05, + "loss": 0.1715, + "step": 9110 + }, + { + "epoch": 1.08039843472074, + "grad_norm": 0.9866128667801015, + "learning_rate": 4.281461577755868e-05, + "loss": 0.2061, + "step": 9111 + }, + { + "epoch": 1.080517016482865, + "grad_norm": 1.099851354773601, + "learning_rate": 4.281293163528925e-05, + "loss": 0.1831, + "step": 9112 + }, + { + "epoch": 1.08063559824499, + "grad_norm": 1.1523562247462882, + "learning_rate": 4.281124732880644e-05, + "loss": 0.2845, + "step": 9113 + }, + { + "epoch": 1.080754180007115, + "grad_norm": 1.200690538203935, + "learning_rate": 4.280956285812577e-05, + "loss": 0.2529, + "step": 9114 + }, + { + "epoch": 1.08087276176924, + "grad_norm": 1.2545414315480166, + "learning_rate": 4.2807878223262774e-05, + "loss": 0.3057, + "step": 9115 + }, + { + "epoch": 1.080991343531365, + "grad_norm": 1.0644210265753857, + "learning_rate": 4.280619342423298e-05, + "loss": 0.2379, + "step": 9116 + }, + { + "epoch": 1.08110992529349, + "grad_norm": 1.5232123307371377, + "learning_rate": 4.280450846105192e-05, + "loss": 0.3437, + "step": 9117 + }, + { + "epoch": 1.081228507055615, + "grad_norm": 1.3497817162933794, + "learning_rate": 4.280282333373512e-05, + "loss": 0.2693, + "step": 9118 + }, + { + "epoch": 1.0813470888177399, + "grad_norm": 1.223590859907483, + "learning_rate": 4.280113804229812e-05, + "loss": 0.2421, + "step": 9119 + }, + { + "epoch": 1.0814656705798649, + "grad_norm": 0.9718899217488182, + "learning_rate": 4.279945258675646e-05, + "loss": 0.201, + "step": 9120 + }, + { + "epoch": 1.0815842523419898, + "grad_norm": 1.0675611368400064, + "learning_rate": 4.279776696712567e-05, + "loss": 0.2179, + "step": 9121 + }, + { + "epoch": 1.0817028341041148, + "grad_norm": 1.3146545463121166, + "learning_rate": 4.2796081183421295e-05, + "loss": 0.2807, + "step": 9122 + }, + { + "epoch": 1.0818214158662398, + "grad_norm": 1.06222449513102, + "learning_rate": 4.2794395235658875e-05, + "loss": 0.2194, + "step": 9123 + }, + { + "epoch": 1.0819399976283648, + "grad_norm": 1.1032776745367123, + "learning_rate": 4.279270912385395e-05, + "loss": 0.1914, + "step": 9124 + }, + { + "epoch": 1.0820585793904898, + "grad_norm": 0.9790355291698437, + "learning_rate": 4.279102284802207e-05, + "loss": 0.1856, + "step": 9125 + }, + { + "epoch": 1.0821771611526148, + "grad_norm": 1.3815266053179054, + "learning_rate": 4.278933640817877e-05, + "loss": 0.2608, + "step": 9126 + }, + { + "epoch": 1.0822957429147397, + "grad_norm": 1.124652971956732, + "learning_rate": 4.278764980433961e-05, + "loss": 0.1799, + "step": 9127 + }, + { + "epoch": 1.0824143246768647, + "grad_norm": 1.3162363543550377, + "learning_rate": 4.278596303652013e-05, + "loss": 0.2467, + "step": 9128 + }, + { + "epoch": 1.0825329064389897, + "grad_norm": 1.1470482453271884, + "learning_rate": 4.278427610473588e-05, + "loss": 0.2879, + "step": 9129 + }, + { + "epoch": 1.0826514882011147, + "grad_norm": 1.258312303289879, + "learning_rate": 4.278258900900241e-05, + "loss": 0.287, + "step": 9130 + }, + { + "epoch": 1.0827700699632397, + "grad_norm": 1.1689072241485048, + "learning_rate": 4.278090174933528e-05, + "loss": 0.2393, + "step": 9131 + }, + { + "epoch": 1.0828886517253646, + "grad_norm": 1.0902874096881485, + "learning_rate": 4.277921432575004e-05, + "loss": 0.2568, + "step": 9132 + }, + { + "epoch": 1.0830072334874896, + "grad_norm": 1.6724578178500933, + "learning_rate": 4.277752673826224e-05, + "loss": 0.3572, + "step": 9133 + }, + { + "epoch": 1.0831258152496146, + "grad_norm": 0.9578315286110548, + "learning_rate": 4.2775838986887454e-05, + "loss": 0.1935, + "step": 9134 + }, + { + "epoch": 1.0832443970117396, + "grad_norm": 1.5675319808445998, + "learning_rate": 4.277415107164122e-05, + "loss": 0.317, + "step": 9135 + }, + { + "epoch": 1.0833629787738646, + "grad_norm": 0.9029388678156982, + "learning_rate": 4.277246299253911e-05, + "loss": 0.1845, + "step": 9136 + }, + { + "epoch": 1.0834815605359895, + "grad_norm": 1.432714886806861, + "learning_rate": 4.27707747495967e-05, + "loss": 0.2867, + "step": 9137 + }, + { + "epoch": 1.0836001422981145, + "grad_norm": 1.3860162177963657, + "learning_rate": 4.276908634282952e-05, + "loss": 0.2932, + "step": 9138 + }, + { + "epoch": 1.0837187240602395, + "grad_norm": 1.4439714118398406, + "learning_rate": 4.2767397772253164e-05, + "loss": 0.277, + "step": 9139 + }, + { + "epoch": 1.0838373058223645, + "grad_norm": 1.3531109777878096, + "learning_rate": 4.2765709037883183e-05, + "loss": 0.3361, + "step": 9140 + }, + { + "epoch": 1.0839558875844895, + "grad_norm": 1.4539372963791206, + "learning_rate": 4.276402013973516e-05, + "loss": 0.2965, + "step": 9141 + }, + { + "epoch": 1.0840744693466144, + "grad_norm": 1.1794682083605723, + "learning_rate": 4.276233107782465e-05, + "loss": 0.2737, + "step": 9142 + }, + { + "epoch": 1.0841930511087394, + "grad_norm": 1.1602231114919903, + "learning_rate": 4.2760641852167225e-05, + "loss": 0.2407, + "step": 9143 + }, + { + "epoch": 1.0843116328708644, + "grad_norm": 1.023415265184284, + "learning_rate": 4.275895246277847e-05, + "loss": 0.1972, + "step": 9144 + }, + { + "epoch": 1.0844302146329894, + "grad_norm": 1.024978479089089, + "learning_rate": 4.275726290967394e-05, + "loss": 0.24, + "step": 9145 + }, + { + "epoch": 1.0845487963951144, + "grad_norm": 1.3752313401616316, + "learning_rate": 4.2755573192869226e-05, + "loss": 0.3268, + "step": 9146 + }, + { + "epoch": 1.0846673781572393, + "grad_norm": 1.1315551526965504, + "learning_rate": 4.275388331237991e-05, + "loss": 0.274, + "step": 9147 + }, + { + "epoch": 1.0847859599193643, + "grad_norm": 1.3658292891205541, + "learning_rate": 4.275219326822154e-05, + "loss": 0.3034, + "step": 9148 + }, + { + "epoch": 1.0849045416814893, + "grad_norm": 0.9785929654468707, + "learning_rate": 4.275050306040974e-05, + "loss": 0.2411, + "step": 9149 + }, + { + "epoch": 1.0850231234436143, + "grad_norm": 1.3315396216297213, + "learning_rate": 4.274881268896005e-05, + "loss": 0.3065, + "step": 9150 + }, + { + "epoch": 1.0851417052057393, + "grad_norm": 0.9102992321442495, + "learning_rate": 4.274712215388809e-05, + "loss": 0.147, + "step": 9151 + }, + { + "epoch": 1.0852602869678643, + "grad_norm": 1.1409938041952026, + "learning_rate": 4.2745431455209415e-05, + "loss": 0.2508, + "step": 9152 + }, + { + "epoch": 1.0853788687299892, + "grad_norm": 1.2035567030437635, + "learning_rate": 4.2743740592939616e-05, + "loss": 0.2943, + "step": 9153 + }, + { + "epoch": 1.0854974504921142, + "grad_norm": 0.9963882076772004, + "learning_rate": 4.27420495670943e-05, + "loss": 0.2472, + "step": 9154 + }, + { + "epoch": 1.0856160322542392, + "grad_norm": 1.6850783138322156, + "learning_rate": 4.274035837768904e-05, + "loss": 0.3729, + "step": 9155 + }, + { + "epoch": 1.0857346140163644, + "grad_norm": 1.4619231760322848, + "learning_rate": 4.2738667024739435e-05, + "loss": 0.2795, + "step": 9156 + }, + { + "epoch": 1.0858531957784892, + "grad_norm": 1.4228647173252573, + "learning_rate": 4.273697550826107e-05, + "loss": 0.2669, + "step": 9157 + }, + { + "epoch": 1.0859717775406144, + "grad_norm": 1.2624370539535106, + "learning_rate": 4.273528382826953e-05, + "loss": 0.2582, + "step": 9158 + }, + { + "epoch": 1.0860903593027393, + "grad_norm": 0.9940680320561808, + "learning_rate": 4.273359198478044e-05, + "loss": 0.2037, + "step": 9159 + }, + { + "epoch": 1.0862089410648643, + "grad_norm": 1.1891623404083551, + "learning_rate": 4.2731899977809366e-05, + "loss": 0.2766, + "step": 9160 + }, + { + "epoch": 1.0863275228269893, + "grad_norm": 1.3533342152120247, + "learning_rate": 4.273020780737192e-05, + "loss": 0.2513, + "step": 9161 + }, + { + "epoch": 1.0864461045891143, + "grad_norm": 1.4396197859206152, + "learning_rate": 4.27285154734837e-05, + "loss": 0.2331, + "step": 9162 + }, + { + "epoch": 1.0865646863512393, + "grad_norm": 1.3325042244407195, + "learning_rate": 4.272682297616032e-05, + "loss": 0.2743, + "step": 9163 + }, + { + "epoch": 1.0866832681133642, + "grad_norm": 1.228057402075941, + "learning_rate": 4.2725130315417354e-05, + "loss": 0.2842, + "step": 9164 + }, + { + "epoch": 1.0868018498754892, + "grad_norm": 1.3663128711474655, + "learning_rate": 4.272343749127043e-05, + "loss": 0.2357, + "step": 9165 + }, + { + "epoch": 1.0869204316376142, + "grad_norm": 1.2592922710360221, + "learning_rate": 4.2721744503735144e-05, + "loss": 0.2487, + "step": 9166 + }, + { + "epoch": 1.0870390133997392, + "grad_norm": 0.8347615286178848, + "learning_rate": 4.2720051352827106e-05, + "loss": 0.1447, + "step": 9167 + }, + { + "epoch": 1.0871575951618642, + "grad_norm": 1.2128423519082276, + "learning_rate": 4.2718358038561926e-05, + "loss": 0.2305, + "step": 9168 + }, + { + "epoch": 1.0872761769239891, + "grad_norm": 1.046602381197557, + "learning_rate": 4.271666456095521e-05, + "loss": 0.225, + "step": 9169 + }, + { + "epoch": 1.0873947586861141, + "grad_norm": 1.2569836006634818, + "learning_rate": 4.2714970920022574e-05, + "loss": 0.2962, + "step": 9170 + }, + { + "epoch": 1.087513340448239, + "grad_norm": 0.9337378634987525, + "learning_rate": 4.2713277115779626e-05, + "loss": 0.22, + "step": 9171 + }, + { + "epoch": 1.087631922210364, + "grad_norm": 1.1662354634181071, + "learning_rate": 4.2711583148241994e-05, + "loss": 0.2707, + "step": 9172 + }, + { + "epoch": 1.087750503972489, + "grad_norm": 2.0172091037622746, + "learning_rate": 4.2709889017425276e-05, + "loss": 0.4478, + "step": 9173 + }, + { + "epoch": 1.087869085734614, + "grad_norm": 1.3518027956778271, + "learning_rate": 4.27081947233451e-05, + "loss": 0.2468, + "step": 9174 + }, + { + "epoch": 1.087987667496739, + "grad_norm": 1.241243656519273, + "learning_rate": 4.270650026601708e-05, + "loss": 0.2238, + "step": 9175 + }, + { + "epoch": 1.088106249258864, + "grad_norm": 1.4276651035296595, + "learning_rate": 4.2704805645456845e-05, + "loss": 0.2564, + "step": 9176 + }, + { + "epoch": 1.088224831020989, + "grad_norm": 1.325456462086742, + "learning_rate": 4.270311086168002e-05, + "loss": 0.3167, + "step": 9177 + }, + { + "epoch": 1.088343412783114, + "grad_norm": 1.2902247078436213, + "learning_rate": 4.270141591470221e-05, + "loss": 0.2358, + "step": 9178 + }, + { + "epoch": 1.088461994545239, + "grad_norm": 1.2492777980281433, + "learning_rate": 4.269972080453906e-05, + "loss": 0.3023, + "step": 9179 + }, + { + "epoch": 1.088580576307364, + "grad_norm": 1.2660422587450932, + "learning_rate": 4.269802553120619e-05, + "loss": 0.3272, + "step": 9180 + }, + { + "epoch": 1.088699158069489, + "grad_norm": 1.3105136237045314, + "learning_rate": 4.2696330094719224e-05, + "loss": 0.2296, + "step": 9181 + }, + { + "epoch": 1.088817739831614, + "grad_norm": 1.0491650788946492, + "learning_rate": 4.2694634495093796e-05, + "loss": 0.2497, + "step": 9182 + }, + { + "epoch": 1.0889363215937389, + "grad_norm": 0.9287014668920095, + "learning_rate": 4.2692938732345536e-05, + "loss": 0.2148, + "step": 9183 + }, + { + "epoch": 1.0890549033558639, + "grad_norm": 0.907189967760282, + "learning_rate": 4.269124280649007e-05, + "loss": 0.1785, + "step": 9184 + }, + { + "epoch": 1.0891734851179888, + "grad_norm": 0.9734860544620869, + "learning_rate": 4.268954671754305e-05, + "loss": 0.201, + "step": 9185 + }, + { + "epoch": 1.0892920668801138, + "grad_norm": 1.338161799297769, + "learning_rate": 4.26878504655201e-05, + "loss": 0.3328, + "step": 9186 + }, + { + "epoch": 1.0894106486422388, + "grad_norm": 1.0606684645354016, + "learning_rate": 4.2686154050436854e-05, + "loss": 0.239, + "step": 9187 + }, + { + "epoch": 1.0895292304043638, + "grad_norm": 1.1197765737542993, + "learning_rate": 4.2684457472308956e-05, + "loss": 0.2608, + "step": 9188 + }, + { + "epoch": 1.0896478121664888, + "grad_norm": 1.1839268066559638, + "learning_rate": 4.268276073115206e-05, + "loss": 0.2588, + "step": 9189 + }, + { + "epoch": 1.0897663939286137, + "grad_norm": 1.0694378280198684, + "learning_rate": 4.268106382698178e-05, + "loss": 0.2596, + "step": 9190 + }, + { + "epoch": 1.0898849756907387, + "grad_norm": 1.1414435999241115, + "learning_rate": 4.267936675981378e-05, + "loss": 0.2449, + "step": 9191 + }, + { + "epoch": 1.0900035574528637, + "grad_norm": 0.9573981697396776, + "learning_rate": 4.267766952966369e-05, + "loss": 0.22, + "step": 9192 + }, + { + "epoch": 1.0901221392149887, + "grad_norm": 1.3568428763112232, + "learning_rate": 4.267597213654717e-05, + "loss": 0.3183, + "step": 9193 + }, + { + "epoch": 1.0902407209771137, + "grad_norm": 1.3316362067110235, + "learning_rate": 4.2674274580479866e-05, + "loss": 0.3866, + "step": 9194 + }, + { + "epoch": 1.0903593027392386, + "grad_norm": 1.4468905974318667, + "learning_rate": 4.2672576861477424e-05, + "loss": 0.2767, + "step": 9195 + }, + { + "epoch": 1.0904778845013636, + "grad_norm": 1.009169263608157, + "learning_rate": 4.2670878979555485e-05, + "loss": 0.2077, + "step": 9196 + }, + { + "epoch": 1.0905964662634886, + "grad_norm": 1.5731328503957709, + "learning_rate": 4.266918093472971e-05, + "loss": 0.3541, + "step": 9197 + }, + { + "epoch": 1.0907150480256136, + "grad_norm": 1.1164722787619714, + "learning_rate": 4.266748272701577e-05, + "loss": 0.2489, + "step": 9198 + }, + { + "epoch": 1.0908336297877386, + "grad_norm": 1.1480336569784453, + "learning_rate": 4.266578435642929e-05, + "loss": 0.2491, + "step": 9199 + }, + { + "epoch": 1.0909522115498635, + "grad_norm": 0.8443236756973576, + "learning_rate": 4.2664085822985956e-05, + "loss": 0.1959, + "step": 9200 + }, + { + "epoch": 1.0910707933119885, + "grad_norm": 1.4577428611434744, + "learning_rate": 4.26623871267014e-05, + "loss": 0.3376, + "step": 9201 + }, + { + "epoch": 1.0911893750741135, + "grad_norm": 1.2416899346552959, + "learning_rate": 4.2660688267591305e-05, + "loss": 0.2388, + "step": 9202 + }, + { + "epoch": 1.0913079568362385, + "grad_norm": 1.2904374202958961, + "learning_rate": 4.265898924567131e-05, + "loss": 0.2398, + "step": 9203 + }, + { + "epoch": 1.0914265385983635, + "grad_norm": 1.0849135622709687, + "learning_rate": 4.265729006095709e-05, + "loss": 0.2205, + "step": 9204 + }, + { + "epoch": 1.0915451203604887, + "grad_norm": 1.1317579512181228, + "learning_rate": 4.265559071346431e-05, + "loss": 0.2069, + "step": 9205 + }, + { + "epoch": 1.0916637021226134, + "grad_norm": 1.2168133551357452, + "learning_rate": 4.265389120320864e-05, + "loss": 0.2235, + "step": 9206 + }, + { + "epoch": 1.0917822838847386, + "grad_norm": 1.5190023256565974, + "learning_rate": 4.2652191530205735e-05, + "loss": 0.4071, + "step": 9207 + }, + { + "epoch": 1.0919008656468634, + "grad_norm": 1.1790934394785428, + "learning_rate": 4.265049169447127e-05, + "loss": 0.2389, + "step": 9208 + }, + { + "epoch": 1.0920194474089886, + "grad_norm": 0.8175285527868961, + "learning_rate": 4.2648791696020924e-05, + "loss": 0.1622, + "step": 9209 + }, + { + "epoch": 1.0921380291711136, + "grad_norm": 1.1053471140418476, + "learning_rate": 4.2647091534870355e-05, + "loss": 0.2419, + "step": 9210 + }, + { + "epoch": 1.0922566109332386, + "grad_norm": 1.1381055310431798, + "learning_rate": 4.264539121103525e-05, + "loss": 0.24, + "step": 9211 + }, + { + "epoch": 1.0923751926953635, + "grad_norm": 1.3492833534512376, + "learning_rate": 4.264369072453126e-05, + "loss": 0.3022, + "step": 9212 + }, + { + "epoch": 1.0924937744574885, + "grad_norm": 1.3436732399569105, + "learning_rate": 4.2641990075374096e-05, + "loss": 0.2961, + "step": 9213 + }, + { + "epoch": 1.0926123562196135, + "grad_norm": 1.3048377282071555, + "learning_rate": 4.26402892635794e-05, + "loss": 0.3426, + "step": 9214 + }, + { + "epoch": 1.0927309379817385, + "grad_norm": 1.3778724835090845, + "learning_rate": 4.263858828916288e-05, + "loss": 0.3846, + "step": 9215 + }, + { + "epoch": 1.0928495197438635, + "grad_norm": 1.387870575135757, + "learning_rate": 4.2636887152140205e-05, + "loss": 0.2875, + "step": 9216 + }, + { + "epoch": 1.0929681015059884, + "grad_norm": 1.1941143423445184, + "learning_rate": 4.263518585252706e-05, + "loss": 0.2805, + "step": 9217 + }, + { + "epoch": 1.0930866832681134, + "grad_norm": 1.1993962512969953, + "learning_rate": 4.2633484390339115e-05, + "loss": 0.2647, + "step": 9218 + }, + { + "epoch": 1.0932052650302384, + "grad_norm": 1.44899802330021, + "learning_rate": 4.263178276559208e-05, + "loss": 0.3078, + "step": 9219 + }, + { + "epoch": 1.0933238467923634, + "grad_norm": 1.1603161817205814, + "learning_rate": 4.263008097830162e-05, + "loss": 0.258, + "step": 9220 + }, + { + "epoch": 1.0934424285544884, + "grad_norm": 0.7747780431231321, + "learning_rate": 4.2628379028483444e-05, + "loss": 0.1485, + "step": 9221 + }, + { + "epoch": 1.0935610103166133, + "grad_norm": 1.135129841060286, + "learning_rate": 4.262667691615323e-05, + "loss": 0.2348, + "step": 9222 + }, + { + "epoch": 1.0936795920787383, + "grad_norm": 1.7300863176428136, + "learning_rate": 4.262497464132666e-05, + "loss": 0.3484, + "step": 9223 + }, + { + "epoch": 1.0937981738408633, + "grad_norm": 1.1776284904295375, + "learning_rate": 4.262327220401945e-05, + "loss": 0.212, + "step": 9224 + }, + { + "epoch": 1.0939167556029883, + "grad_norm": 1.0105541776136007, + "learning_rate": 4.262156960424727e-05, + "loss": 0.254, + "step": 9225 + }, + { + "epoch": 1.0940353373651133, + "grad_norm": 1.1437405045016575, + "learning_rate": 4.261986684202583e-05, + "loss": 0.2024, + "step": 9226 + }, + { + "epoch": 1.0941539191272383, + "grad_norm": 1.7185291081138585, + "learning_rate": 4.2618163917370824e-05, + "loss": 0.3264, + "step": 9227 + }, + { + "epoch": 1.0942725008893632, + "grad_norm": 1.2394068408409242, + "learning_rate": 4.261646083029795e-05, + "loss": 0.2373, + "step": 9228 + }, + { + "epoch": 1.0943910826514882, + "grad_norm": 1.5932971881120734, + "learning_rate": 4.261475758082292e-05, + "loss": 0.3077, + "step": 9229 + }, + { + "epoch": 1.0945096644136132, + "grad_norm": 0.9455972669532549, + "learning_rate": 4.261305416896142e-05, + "loss": 0.2038, + "step": 9230 + }, + { + "epoch": 1.0946282461757382, + "grad_norm": 1.2390059917903147, + "learning_rate": 4.261135059472915e-05, + "loss": 0.2463, + "step": 9231 + }, + { + "epoch": 1.0947468279378632, + "grad_norm": 1.0405097254882079, + "learning_rate": 4.260964685814184e-05, + "loss": 0.1948, + "step": 9232 + }, + { + "epoch": 1.0948654096999881, + "grad_norm": 1.1895912385411036, + "learning_rate": 4.260794295921516e-05, + "loss": 0.2732, + "step": 9233 + }, + { + "epoch": 1.0949839914621131, + "grad_norm": 1.6060138076384047, + "learning_rate": 4.260623889796486e-05, + "loss": 0.3613, + "step": 9234 + }, + { + "epoch": 1.095102573224238, + "grad_norm": 1.0005079505915218, + "learning_rate": 4.260453467440662e-05, + "loss": 0.2031, + "step": 9235 + }, + { + "epoch": 1.095221154986363, + "grad_norm": 0.9418734985927831, + "learning_rate": 4.260283028855615e-05, + "loss": 0.2036, + "step": 9236 + }, + { + "epoch": 1.095339736748488, + "grad_norm": 1.081796061344973, + "learning_rate": 4.2601125740429185e-05, + "loss": 0.2159, + "step": 9237 + }, + { + "epoch": 1.095458318510613, + "grad_norm": 1.6082870112706082, + "learning_rate": 4.259942103004141e-05, + "loss": 0.2494, + "step": 9238 + }, + { + "epoch": 1.095576900272738, + "grad_norm": 1.0747976483561006, + "learning_rate": 4.259771615740856e-05, + "loss": 0.2067, + "step": 9239 + }, + { + "epoch": 1.095695482034863, + "grad_norm": 1.0649857995394778, + "learning_rate": 4.259601112254635e-05, + "loss": 0.317, + "step": 9240 + }, + { + "epoch": 1.095814063796988, + "grad_norm": 1.1169876828508543, + "learning_rate": 4.2594305925470495e-05, + "loss": 0.252, + "step": 9241 + }, + { + "epoch": 1.095932645559113, + "grad_norm": 1.5995477954645707, + "learning_rate": 4.259260056619671e-05, + "loss": 0.3153, + "step": 9242 + }, + { + "epoch": 1.096051227321238, + "grad_norm": 1.845271639597029, + "learning_rate": 4.259089504474073e-05, + "loss": 0.4786, + "step": 9243 + }, + { + "epoch": 1.096169809083363, + "grad_norm": 1.2015727728066514, + "learning_rate": 4.2589189361118266e-05, + "loss": 0.2989, + "step": 9244 + }, + { + "epoch": 1.096288390845488, + "grad_norm": 1.0870535652718698, + "learning_rate": 4.258748351534504e-05, + "loss": 0.2191, + "step": 9245 + }, + { + "epoch": 1.0964069726076129, + "grad_norm": 1.0206711736917058, + "learning_rate": 4.258577750743678e-05, + "loss": 0.2153, + "step": 9246 + }, + { + "epoch": 1.0965255543697379, + "grad_norm": 1.4982287131448218, + "learning_rate": 4.2584071337409224e-05, + "loss": 0.3037, + "step": 9247 + }, + { + "epoch": 1.0966441361318628, + "grad_norm": 1.2135255609443034, + "learning_rate": 4.25823650052781e-05, + "loss": 0.2948, + "step": 9248 + }, + { + "epoch": 1.0967627178939878, + "grad_norm": 1.0505610530285765, + "learning_rate": 4.258065851105913e-05, + "loss": 0.208, + "step": 9249 + }, + { + "epoch": 1.0968812996561128, + "grad_norm": 1.207656906988747, + "learning_rate": 4.257895185476803e-05, + "loss": 0.2315, + "step": 9250 + }, + { + "epoch": 1.0969998814182378, + "grad_norm": 1.07808251993154, + "learning_rate": 4.257724503642056e-05, + "loss": 0.2068, + "step": 9251 + }, + { + "epoch": 1.0971184631803628, + "grad_norm": 1.014163606801111, + "learning_rate": 4.257553805603245e-05, + "loss": 0.2029, + "step": 9252 + }, + { + "epoch": 1.0972370449424877, + "grad_norm": 1.6482281269095507, + "learning_rate": 4.257383091361943e-05, + "loss": 0.4504, + "step": 9253 + }, + { + "epoch": 1.0973556267046127, + "grad_norm": 0.8389435966004057, + "learning_rate": 4.257212360919724e-05, + "loss": 0.185, + "step": 9254 + }, + { + "epoch": 1.0974742084667377, + "grad_norm": 1.306398236415953, + "learning_rate": 4.257041614278161e-05, + "loss": 0.2544, + "step": 9255 + }, + { + "epoch": 1.097592790228863, + "grad_norm": 1.422092395725231, + "learning_rate": 4.25687085143883e-05, + "loss": 0.3106, + "step": 9256 + }, + { + "epoch": 1.0977113719909877, + "grad_norm": 1.1284047401973236, + "learning_rate": 4.256700072403304e-05, + "loss": 0.2933, + "step": 9257 + }, + { + "epoch": 1.0978299537531129, + "grad_norm": 1.9658526117436346, + "learning_rate": 4.256529277173157e-05, + "loss": 0.3745, + "step": 9258 + }, + { + "epoch": 1.0979485355152379, + "grad_norm": 1.2658310956504497, + "learning_rate": 4.2563584657499645e-05, + "loss": 0.2285, + "step": 9259 + }, + { + "epoch": 1.0980671172773628, + "grad_norm": 1.2102194512398154, + "learning_rate": 4.256187638135301e-05, + "loss": 0.2746, + "step": 9260 + }, + { + "epoch": 1.0981856990394878, + "grad_norm": 1.3414506716796777, + "learning_rate": 4.2560167943307406e-05, + "loss": 0.2977, + "step": 9261 + }, + { + "epoch": 1.0983042808016128, + "grad_norm": 1.0481854078930073, + "learning_rate": 4.255845934337859e-05, + "loss": 0.2249, + "step": 9262 + }, + { + "epoch": 1.0984228625637378, + "grad_norm": 1.300661663056757, + "learning_rate": 4.2556750581582304e-05, + "loss": 0.2312, + "step": 9263 + }, + { + "epoch": 1.0985414443258628, + "grad_norm": 1.1118936067513359, + "learning_rate": 4.255504165793431e-05, + "loss": 0.2392, + "step": 9264 + }, + { + "epoch": 1.0986600260879877, + "grad_norm": 1.5436207924174938, + "learning_rate": 4.2553332572450364e-05, + "loss": 0.2995, + "step": 9265 + }, + { + "epoch": 1.0987786078501127, + "grad_norm": 0.9481634240913727, + "learning_rate": 4.2551623325146205e-05, + "loss": 0.2188, + "step": 9266 + }, + { + "epoch": 1.0988971896122377, + "grad_norm": 1.3985926245759273, + "learning_rate": 4.254991391603762e-05, + "loss": 0.2845, + "step": 9267 + }, + { + "epoch": 1.0990157713743627, + "grad_norm": 0.9849354756045638, + "learning_rate": 4.2548204345140344e-05, + "loss": 0.219, + "step": 9268 + }, + { + "epoch": 1.0991343531364877, + "grad_norm": 1.1724496177206996, + "learning_rate": 4.254649461247013e-05, + "loss": 0.2375, + "step": 9269 + }, + { + "epoch": 1.0992529348986126, + "grad_norm": 1.324494958360901, + "learning_rate": 4.254478471804276e-05, + "loss": 0.271, + "step": 9270 + }, + { + "epoch": 1.0993715166607376, + "grad_norm": 1.478864038368792, + "learning_rate": 4.2543074661874e-05, + "loss": 0.3473, + "step": 9271 + }, + { + "epoch": 1.0994900984228626, + "grad_norm": 1.0925536690567057, + "learning_rate": 4.25413644439796e-05, + "loss": 0.1966, + "step": 9272 + }, + { + "epoch": 1.0996086801849876, + "grad_norm": 1.2416507817736837, + "learning_rate": 4.253965406437532e-05, + "loss": 0.2354, + "step": 9273 + }, + { + "epoch": 1.0997272619471126, + "grad_norm": 1.158745199801717, + "learning_rate": 4.253794352307695e-05, + "loss": 0.2514, + "step": 9274 + }, + { + "epoch": 1.0998458437092375, + "grad_norm": 0.9640739236464869, + "learning_rate": 4.253623282010024e-05, + "loss": 0.1944, + "step": 9275 + }, + { + "epoch": 1.0999644254713625, + "grad_norm": 1.304531495557837, + "learning_rate": 4.253452195546097e-05, + "loss": 0.2977, + "step": 9276 + }, + { + "epoch": 1.1000830072334875, + "grad_norm": 1.3722428705246934, + "learning_rate": 4.253281092917492e-05, + "loss": 0.2273, + "step": 9277 + }, + { + "epoch": 1.1002015889956125, + "grad_norm": 1.5070009801935171, + "learning_rate": 4.253109974125784e-05, + "loss": 0.2608, + "step": 9278 + }, + { + "epoch": 1.1003201707577375, + "grad_norm": 1.1194295565612786, + "learning_rate": 4.252938839172552e-05, + "loss": 0.2368, + "step": 9279 + }, + { + "epoch": 1.1004387525198625, + "grad_norm": 0.8767194532975879, + "learning_rate": 4.252767688059374e-05, + "loss": 0.1911, + "step": 9280 + }, + { + "epoch": 1.1005573342819874, + "grad_norm": 1.608366223269653, + "learning_rate": 4.252596520787826e-05, + "loss": 0.3964, + "step": 9281 + }, + { + "epoch": 1.1006759160441124, + "grad_norm": 1.2353274023448717, + "learning_rate": 4.252425337359489e-05, + "loss": 0.2736, + "step": 9282 + }, + { + "epoch": 1.1007944978062374, + "grad_norm": 1.1803574404904464, + "learning_rate": 4.2522541377759385e-05, + "loss": 0.2129, + "step": 9283 + }, + { + "epoch": 1.1009130795683624, + "grad_norm": 1.6385121351753162, + "learning_rate": 4.252082922038754e-05, + "loss": 0.3201, + "step": 9284 + }, + { + "epoch": 1.1010316613304874, + "grad_norm": 1.008114358409775, + "learning_rate": 4.251911690149513e-05, + "loss": 0.2495, + "step": 9285 + }, + { + "epoch": 1.1011502430926123, + "grad_norm": 0.9658146832001766, + "learning_rate": 4.251740442109794e-05, + "loss": 0.2042, + "step": 9286 + }, + { + "epoch": 1.1012688248547373, + "grad_norm": 1.046071983152849, + "learning_rate": 4.251569177921178e-05, + "loss": 0.2538, + "step": 9287 + }, + { + "epoch": 1.1013874066168623, + "grad_norm": 1.3460545568560764, + "learning_rate": 4.251397897585241e-05, + "loss": 0.2636, + "step": 9288 + }, + { + "epoch": 1.1015059883789873, + "grad_norm": 0.9859513809783174, + "learning_rate": 4.251226601103563e-05, + "loss": 0.2045, + "step": 9289 + }, + { + "epoch": 1.1016245701411123, + "grad_norm": 1.2161621367087014, + "learning_rate": 4.251055288477723e-05, + "loss": 0.2516, + "step": 9290 + }, + { + "epoch": 1.1017431519032372, + "grad_norm": 0.9868836912484316, + "learning_rate": 4.250883959709301e-05, + "loss": 0.2032, + "step": 9291 + }, + { + "epoch": 1.1018617336653622, + "grad_norm": 1.4434264549451872, + "learning_rate": 4.250712614799876e-05, + "loss": 0.2946, + "step": 9292 + }, + { + "epoch": 1.1019803154274872, + "grad_norm": 1.4270436182454667, + "learning_rate": 4.250541253751028e-05, + "loss": 0.2856, + "step": 9293 + }, + { + "epoch": 1.1020988971896122, + "grad_norm": 1.6870539921653178, + "learning_rate": 4.250369876564335e-05, + "loss": 0.3867, + "step": 9294 + }, + { + "epoch": 1.1022174789517372, + "grad_norm": 1.4011801443581566, + "learning_rate": 4.2501984832413786e-05, + "loss": 0.2266, + "step": 9295 + }, + { + "epoch": 1.1023360607138621, + "grad_norm": 1.1675747330713295, + "learning_rate": 4.250027073783739e-05, + "loss": 0.254, + "step": 9296 + }, + { + "epoch": 1.1024546424759871, + "grad_norm": 1.4292703642100115, + "learning_rate": 4.2498556481929966e-05, + "loss": 0.2699, + "step": 9297 + }, + { + "epoch": 1.102573224238112, + "grad_norm": 1.2193584701203546, + "learning_rate": 4.249684206470729e-05, + "loss": 0.275, + "step": 9298 + }, + { + "epoch": 1.102691806000237, + "grad_norm": 1.3126048209807395, + "learning_rate": 4.24951274861852e-05, + "loss": 0.3379, + "step": 9299 + }, + { + "epoch": 1.102810387762362, + "grad_norm": 1.5592181336519266, + "learning_rate": 4.249341274637948e-05, + "loss": 0.3044, + "step": 9300 + }, + { + "epoch": 1.102928969524487, + "grad_norm": 1.3367268275100888, + "learning_rate": 4.249169784530595e-05, + "loss": 0.2421, + "step": 9301 + }, + { + "epoch": 1.103047551286612, + "grad_norm": 1.5044796789621735, + "learning_rate": 4.248998278298042e-05, + "loss": 0.2968, + "step": 9302 + }, + { + "epoch": 1.103166133048737, + "grad_norm": 1.3621607552184096, + "learning_rate": 4.2488267559418696e-05, + "loss": 0.2765, + "step": 9303 + }, + { + "epoch": 1.103284714810862, + "grad_norm": 1.0552376575642626, + "learning_rate": 4.248655217463658e-05, + "loss": 0.2265, + "step": 9304 + }, + { + "epoch": 1.1034032965729872, + "grad_norm": 1.6813935415320185, + "learning_rate": 4.2484836628649905e-05, + "loss": 0.3181, + "step": 9305 + }, + { + "epoch": 1.103521878335112, + "grad_norm": 1.2767808830664802, + "learning_rate": 4.248312092147447e-05, + "loss": 0.29, + "step": 9306 + }, + { + "epoch": 1.1036404600972372, + "grad_norm": 1.399857059821546, + "learning_rate": 4.248140505312611e-05, + "loss": 0.2794, + "step": 9307 + }, + { + "epoch": 1.1037590418593621, + "grad_norm": 1.2093920531586615, + "learning_rate": 4.247968902362063e-05, + "loss": 0.2454, + "step": 9308 + }, + { + "epoch": 1.1038776236214871, + "grad_norm": 1.3994969088574207, + "learning_rate": 4.2477972832973846e-05, + "loss": 0.3146, + "step": 9309 + }, + { + "epoch": 1.103996205383612, + "grad_norm": 1.4788199275818366, + "learning_rate": 4.247625648120159e-05, + "loss": 0.3718, + "step": 9310 + }, + { + "epoch": 1.104114787145737, + "grad_norm": 0.9641998105988202, + "learning_rate": 4.2474539968319684e-05, + "loss": 0.2068, + "step": 9311 + }, + { + "epoch": 1.104233368907862, + "grad_norm": 1.1436147867511381, + "learning_rate": 4.247282329434394e-05, + "loss": 0.2217, + "step": 9312 + }, + { + "epoch": 1.104351950669987, + "grad_norm": 1.4741519451135268, + "learning_rate": 4.24711064592902e-05, + "loss": 0.4207, + "step": 9313 + }, + { + "epoch": 1.104470532432112, + "grad_norm": 1.0264992321106334, + "learning_rate": 4.246938946317427e-05, + "loss": 0.2454, + "step": 9314 + }, + { + "epoch": 1.104589114194237, + "grad_norm": 0.8517851214075344, + "learning_rate": 4.2467672306012e-05, + "loss": 0.1638, + "step": 9315 + }, + { + "epoch": 1.104707695956362, + "grad_norm": 0.917733094422889, + "learning_rate": 4.246595498781922e-05, + "loss": 0.2032, + "step": 9316 + }, + { + "epoch": 1.104826277718487, + "grad_norm": 1.3223196503888859, + "learning_rate": 4.2464237508611735e-05, + "loss": 0.2828, + "step": 9317 + }, + { + "epoch": 1.104944859480612, + "grad_norm": 1.264934546104584, + "learning_rate": 4.246251986840541e-05, + "loss": 0.2937, + "step": 9318 + }, + { + "epoch": 1.105063441242737, + "grad_norm": 1.0605045406961018, + "learning_rate": 4.2460802067216066e-05, + "loss": 0.2254, + "step": 9319 + }, + { + "epoch": 1.105182023004862, + "grad_norm": 1.1910642578594683, + "learning_rate": 4.2459084105059534e-05, + "loss": 0.2063, + "step": 9320 + }, + { + "epoch": 1.1053006047669869, + "grad_norm": 0.7763314191221506, + "learning_rate": 4.245736598195165e-05, + "loss": 0.214, + "step": 9321 + }, + { + "epoch": 1.1054191865291119, + "grad_norm": 1.0913437753766362, + "learning_rate": 4.2455647697908266e-05, + "loss": 0.2314, + "step": 9322 + }, + { + "epoch": 1.1055377682912368, + "grad_norm": 1.1599151168793158, + "learning_rate": 4.2453929252945224e-05, + "loss": 0.2661, + "step": 9323 + }, + { + "epoch": 1.1056563500533618, + "grad_norm": 0.9895800269070656, + "learning_rate": 4.2452210647078346e-05, + "loss": 0.1995, + "step": 9324 + }, + { + "epoch": 1.1057749318154868, + "grad_norm": 1.6174268668353318, + "learning_rate": 4.245049188032349e-05, + "loss": 0.4101, + "step": 9325 + }, + { + "epoch": 1.1058935135776118, + "grad_norm": 1.0487360735263698, + "learning_rate": 4.2448772952696495e-05, + "loss": 0.2091, + "step": 9326 + }, + { + "epoch": 1.1060120953397368, + "grad_norm": 1.333670053505605, + "learning_rate": 4.2447053864213215e-05, + "loss": 0.3632, + "step": 9327 + }, + { + "epoch": 1.1061306771018617, + "grad_norm": 1.2898698433917748, + "learning_rate": 4.24453346148895e-05, + "loss": 0.2301, + "step": 9328 + }, + { + "epoch": 1.1062492588639867, + "grad_norm": 0.9058129632799933, + "learning_rate": 4.244361520474118e-05, + "loss": 0.2089, + "step": 9329 + }, + { + "epoch": 1.1063678406261117, + "grad_norm": 1.2382270100198876, + "learning_rate": 4.2441895633784126e-05, + "loss": 0.2847, + "step": 9330 + }, + { + "epoch": 1.1064864223882367, + "grad_norm": 1.2288051950548058, + "learning_rate": 4.244017590203418e-05, + "loss": 0.2881, + "step": 9331 + }, + { + "epoch": 1.1066050041503617, + "grad_norm": 1.0930599724047654, + "learning_rate": 4.2438456009507196e-05, + "loss": 0.2054, + "step": 9332 + }, + { + "epoch": 1.1067235859124867, + "grad_norm": 0.8781065280117154, + "learning_rate": 4.2436735956219034e-05, + "loss": 0.1791, + "step": 9333 + }, + { + "epoch": 1.1068421676746116, + "grad_norm": 1.066966617252039, + "learning_rate": 4.243501574218555e-05, + "loss": 0.2307, + "step": 9334 + }, + { + "epoch": 1.1069607494367366, + "grad_norm": 1.1921986389372716, + "learning_rate": 4.243329536742261e-05, + "loss": 0.2581, + "step": 9335 + }, + { + "epoch": 1.1070793311988616, + "grad_norm": 1.2531371679081817, + "learning_rate": 4.2431574831946055e-05, + "loss": 0.2805, + "step": 9336 + }, + { + "epoch": 1.1071979129609866, + "grad_norm": 1.4622659554263169, + "learning_rate": 4.242985413577175e-05, + "loss": 0.2714, + "step": 9337 + }, + { + "epoch": 1.1073164947231116, + "grad_norm": 0.9395156927588587, + "learning_rate": 4.2428133278915574e-05, + "loss": 0.2438, + "step": 9338 + }, + { + "epoch": 1.1074350764852365, + "grad_norm": 1.2496546659695043, + "learning_rate": 4.2426412261393375e-05, + "loss": 0.2962, + "step": 9339 + }, + { + "epoch": 1.1075536582473615, + "grad_norm": 1.3173542123018438, + "learning_rate": 4.242469108322104e-05, + "loss": 0.2726, + "step": 9340 + }, + { + "epoch": 1.1076722400094865, + "grad_norm": 0.8307401541361101, + "learning_rate": 4.2422969744414396e-05, + "loss": 0.2089, + "step": 9341 + }, + { + "epoch": 1.1077908217716115, + "grad_norm": 1.2067841137441755, + "learning_rate": 4.2421248244989356e-05, + "loss": 0.3581, + "step": 9342 + }, + { + "epoch": 1.1079094035337365, + "grad_norm": 1.2759106507290412, + "learning_rate": 4.241952658496176e-05, + "loss": 0.2579, + "step": 9343 + }, + { + "epoch": 1.1080279852958614, + "grad_norm": 0.976026287081038, + "learning_rate": 4.241780476434749e-05, + "loss": 0.1961, + "step": 9344 + }, + { + "epoch": 1.1081465670579864, + "grad_norm": 1.3154586542919822, + "learning_rate": 4.241608278316243e-05, + "loss": 0.3101, + "step": 9345 + }, + { + "epoch": 1.1082651488201114, + "grad_norm": 1.308331170537835, + "learning_rate": 4.241436064142244e-05, + "loss": 0.2776, + "step": 9346 + }, + { + "epoch": 1.1083837305822364, + "grad_norm": 1.2661938045211834, + "learning_rate": 4.241263833914339e-05, + "loss": 0.2554, + "step": 9347 + }, + { + "epoch": 1.1085023123443614, + "grad_norm": 1.3989914547762192, + "learning_rate": 4.241091587634117e-05, + "loss": 0.3095, + "step": 9348 + }, + { + "epoch": 1.1086208941064863, + "grad_norm": 0.8965737987075469, + "learning_rate": 4.240919325303167e-05, + "loss": 0.1786, + "step": 9349 + }, + { + "epoch": 1.1087394758686113, + "grad_norm": 0.9400241904867619, + "learning_rate": 4.2407470469230735e-05, + "loss": 0.2369, + "step": 9350 + }, + { + "epoch": 1.1088580576307363, + "grad_norm": 1.306578829516904, + "learning_rate": 4.240574752495429e-05, + "loss": 0.2902, + "step": 9351 + }, + { + "epoch": 1.1089766393928613, + "grad_norm": 1.1881484947589362, + "learning_rate": 4.240402442021818e-05, + "loss": 0.2438, + "step": 9352 + }, + { + "epoch": 1.1090952211549863, + "grad_norm": 1.0134823822588304, + "learning_rate": 4.240230115503832e-05, + "loss": 0.1934, + "step": 9353 + }, + { + "epoch": 1.1092138029171112, + "grad_norm": 1.205425685088804, + "learning_rate": 4.240057772943057e-05, + "loss": 0.2863, + "step": 9354 + }, + { + "epoch": 1.1093323846792362, + "grad_norm": 1.0698543787884205, + "learning_rate": 4.239885414341084e-05, + "loss": 0.257, + "step": 9355 + }, + { + "epoch": 1.1094509664413614, + "grad_norm": 1.3221395074864668, + "learning_rate": 4.239713039699501e-05, + "loss": 0.3091, + "step": 9356 + }, + { + "epoch": 1.1095695482034862, + "grad_norm": 1.2481239201762337, + "learning_rate": 4.2395406490198973e-05, + "loss": 0.2768, + "step": 9357 + }, + { + "epoch": 1.1096881299656114, + "grad_norm": 1.74169538114162, + "learning_rate": 4.239368242303862e-05, + "loss": 0.4179, + "step": 9358 + }, + { + "epoch": 1.1098067117277364, + "grad_norm": 1.3180577683217374, + "learning_rate": 4.2391958195529846e-05, + "loss": 0.2779, + "step": 9359 + }, + { + "epoch": 1.1099252934898614, + "grad_norm": 1.2701918132334962, + "learning_rate": 4.239023380768854e-05, + "loss": 0.2635, + "step": 9360 + }, + { + "epoch": 1.1100438752519863, + "grad_norm": 1.3631838812835353, + "learning_rate": 4.2388509259530604e-05, + "loss": 0.3254, + "step": 9361 + }, + { + "epoch": 1.1101624570141113, + "grad_norm": 0.7972353374308723, + "learning_rate": 4.238678455107194e-05, + "loss": 0.163, + "step": 9362 + }, + { + "epoch": 1.1102810387762363, + "grad_norm": 1.0371460946691855, + "learning_rate": 4.2385059682328444e-05, + "loss": 0.1939, + "step": 9363 + }, + { + "epoch": 1.1103996205383613, + "grad_norm": 1.1915398286047387, + "learning_rate": 4.238333465331601e-05, + "loss": 0.2173, + "step": 9364 + }, + { + "epoch": 1.1105182023004863, + "grad_norm": 1.1284254492101695, + "learning_rate": 4.238160946405055e-05, + "loss": 0.2083, + "step": 9365 + }, + { + "epoch": 1.1106367840626112, + "grad_norm": 1.0585046521096821, + "learning_rate": 4.2379884114547965e-05, + "loss": 0.2446, + "step": 9366 + }, + { + "epoch": 1.1107553658247362, + "grad_norm": 1.0933002639459048, + "learning_rate": 4.237815860482416e-05, + "loss": 0.2608, + "step": 9367 + }, + { + "epoch": 1.1108739475868612, + "grad_norm": 0.9340681428361943, + "learning_rate": 4.237643293489505e-05, + "loss": 0.184, + "step": 9368 + }, + { + "epoch": 1.1109925293489862, + "grad_norm": 0.9209828699727749, + "learning_rate": 4.237470710477653e-05, + "loss": 0.1983, + "step": 9369 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 1.2018983308457902, + "learning_rate": 4.237298111448452e-05, + "loss": 0.2774, + "step": 9370 + }, + { + "epoch": 1.1112296928732361, + "grad_norm": 1.3194538894159258, + "learning_rate": 4.237125496403492e-05, + "loss": 0.2944, + "step": 9371 + }, + { + "epoch": 1.1113482746353611, + "grad_norm": 1.3496461288269717, + "learning_rate": 4.236952865344367e-05, + "loss": 0.2635, + "step": 9372 + }, + { + "epoch": 1.111466856397486, + "grad_norm": 1.1640917505115014, + "learning_rate": 4.2367802182726653e-05, + "loss": 0.2499, + "step": 9373 + }, + { + "epoch": 1.111585438159611, + "grad_norm": 1.2814509491174613, + "learning_rate": 4.23660755518998e-05, + "loss": 0.328, + "step": 9374 + }, + { + "epoch": 1.111704019921736, + "grad_norm": 0.8983563021354333, + "learning_rate": 4.236434876097902e-05, + "loss": 0.2128, + "step": 9375 + }, + { + "epoch": 1.111822601683861, + "grad_norm": 1.0511252212535487, + "learning_rate": 4.236262180998025e-05, + "loss": 0.1914, + "step": 9376 + }, + { + "epoch": 1.111941183445986, + "grad_norm": 1.0179478862264009, + "learning_rate": 4.2360894698919384e-05, + "loss": 0.2648, + "step": 9377 + }, + { + "epoch": 1.112059765208111, + "grad_norm": 1.305092932579881, + "learning_rate": 4.235916742781236e-05, + "loss": 0.2481, + "step": 9378 + }, + { + "epoch": 1.112178346970236, + "grad_norm": 1.6380154858166616, + "learning_rate": 4.2357439996675116e-05, + "loss": 0.3442, + "step": 9379 + }, + { + "epoch": 1.112296928732361, + "grad_norm": 0.9438072231157584, + "learning_rate": 4.235571240552355e-05, + "loss": 0.2474, + "step": 9380 + }, + { + "epoch": 1.112415510494486, + "grad_norm": 1.45313262963439, + "learning_rate": 4.235398465437359e-05, + "loss": 0.2867, + "step": 9381 + }, + { + "epoch": 1.112534092256611, + "grad_norm": 1.698131573426022, + "learning_rate": 4.235225674324118e-05, + "loss": 0.4831, + "step": 9382 + }, + { + "epoch": 1.112652674018736, + "grad_norm": 1.2655280682618903, + "learning_rate": 4.235052867214225e-05, + "loss": 0.3239, + "step": 9383 + }, + { + "epoch": 1.112771255780861, + "grad_norm": 1.142053069965225, + "learning_rate": 4.234880044109271e-05, + "loss": 0.2545, + "step": 9384 + }, + { + "epoch": 1.1128898375429859, + "grad_norm": 1.0063932051429743, + "learning_rate": 4.234707205010851e-05, + "loss": 0.236, + "step": 9385 + }, + { + "epoch": 1.1130084193051109, + "grad_norm": 0.9415770148740454, + "learning_rate": 4.234534349920558e-05, + "loss": 0.1664, + "step": 9386 + }, + { + "epoch": 1.1131270010672358, + "grad_norm": 1.4801871318378694, + "learning_rate": 4.234361478839985e-05, + "loss": 0.27, + "step": 9387 + }, + { + "epoch": 1.1132455828293608, + "grad_norm": 1.3533804243634349, + "learning_rate": 4.2341885917707256e-05, + "loss": 0.3012, + "step": 9388 + }, + { + "epoch": 1.1133641645914858, + "grad_norm": 1.1878874764927498, + "learning_rate": 4.234015688714374e-05, + "loss": 0.2827, + "step": 9389 + }, + { + "epoch": 1.1134827463536108, + "grad_norm": 1.104991891328091, + "learning_rate": 4.2338427696725245e-05, + "loss": 0.2021, + "step": 9390 + }, + { + "epoch": 1.1136013281157358, + "grad_norm": 1.0568907081559218, + "learning_rate": 4.233669834646772e-05, + "loss": 0.2498, + "step": 9391 + }, + { + "epoch": 1.1137199098778607, + "grad_norm": 1.1869196093944383, + "learning_rate": 4.233496883638708e-05, + "loss": 0.2458, + "step": 9392 + }, + { + "epoch": 1.1138384916399857, + "grad_norm": 0.9460801052239469, + "learning_rate": 4.2333239166499284e-05, + "loss": 0.1794, + "step": 9393 + }, + { + "epoch": 1.1139570734021107, + "grad_norm": 1.11204845178699, + "learning_rate": 4.2331509336820284e-05, + "loss": 0.1973, + "step": 9394 + }, + { + "epoch": 1.1140756551642357, + "grad_norm": 1.1705044798617903, + "learning_rate": 4.232977934736602e-05, + "loss": 0.2412, + "step": 9395 + }, + { + "epoch": 1.1141942369263607, + "grad_norm": 1.4712884444335375, + "learning_rate": 4.232804919815245e-05, + "loss": 0.3128, + "step": 9396 + }, + { + "epoch": 1.1143128186884856, + "grad_norm": 0.8508198771548285, + "learning_rate": 4.232631888919551e-05, + "loss": 0.1653, + "step": 9397 + }, + { + "epoch": 1.1144314004506106, + "grad_norm": 1.3948802280579062, + "learning_rate": 4.2324588420511145e-05, + "loss": 0.3025, + "step": 9398 + }, + { + "epoch": 1.1145499822127356, + "grad_norm": 1.2557174325111164, + "learning_rate": 4.232285779211533e-05, + "loss": 0.241, + "step": 9399 + }, + { + "epoch": 1.1146685639748606, + "grad_norm": 1.5892370728618912, + "learning_rate": 4.2321127004024014e-05, + "loss": 0.3075, + "step": 9400 + }, + { + "epoch": 1.1147871457369856, + "grad_norm": 1.0022969959599117, + "learning_rate": 4.2319396056253134e-05, + "loss": 0.1913, + "step": 9401 + }, + { + "epoch": 1.1149057274991105, + "grad_norm": 1.2818853799490868, + "learning_rate": 4.231766494881867e-05, + "loss": 0.2249, + "step": 9402 + }, + { + "epoch": 1.1150243092612355, + "grad_norm": 1.2097298036876527, + "learning_rate": 4.231593368173658e-05, + "loss": 0.2342, + "step": 9403 + }, + { + "epoch": 1.1151428910233605, + "grad_norm": 0.9926001076374248, + "learning_rate": 4.23142022550228e-05, + "loss": 0.2513, + "step": 9404 + }, + { + "epoch": 1.1152614727854857, + "grad_norm": 1.009162547971583, + "learning_rate": 4.2312470668693314e-05, + "loss": 0.2196, + "step": 9405 + }, + { + "epoch": 1.1153800545476105, + "grad_norm": 1.07226202427023, + "learning_rate": 4.231073892276408e-05, + "loss": 0.2315, + "step": 9406 + }, + { + "epoch": 1.1154986363097357, + "grad_norm": 1.2399523483983215, + "learning_rate": 4.2309007017251056e-05, + "loss": 0.2621, + "step": 9407 + }, + { + "epoch": 1.1156172180718607, + "grad_norm": 1.4588646823172582, + "learning_rate": 4.230727495217021e-05, + "loss": 0.3572, + "step": 9408 + }, + { + "epoch": 1.1157357998339856, + "grad_norm": 1.35815004987775, + "learning_rate": 4.2305542727537515e-05, + "loss": 0.3358, + "step": 9409 + }, + { + "epoch": 1.1158543815961106, + "grad_norm": 0.9866226145159706, + "learning_rate": 4.230381034336894e-05, + "loss": 0.2286, + "step": 9410 + }, + { + "epoch": 1.1159729633582356, + "grad_norm": 0.7855986223441359, + "learning_rate": 4.2302077799680465e-05, + "loss": 0.1689, + "step": 9411 + }, + { + "epoch": 1.1160915451203606, + "grad_norm": 1.2162999648820363, + "learning_rate": 4.230034509648804e-05, + "loss": 0.2547, + "step": 9412 + }, + { + "epoch": 1.1162101268824856, + "grad_norm": 1.0287737232251362, + "learning_rate": 4.229861223380764e-05, + "loss": 0.2105, + "step": 9413 + }, + { + "epoch": 1.1163287086446105, + "grad_norm": 1.298022986063039, + "learning_rate": 4.229687921165525e-05, + "loss": 0.236, + "step": 9414 + }, + { + "epoch": 1.1164472904067355, + "grad_norm": 1.265667465140052, + "learning_rate": 4.2295146030046854e-05, + "loss": 0.3069, + "step": 9415 + }, + { + "epoch": 1.1165658721688605, + "grad_norm": 1.4455172004118615, + "learning_rate": 4.2293412688998416e-05, + "loss": 0.3132, + "step": 9416 + }, + { + "epoch": 1.1166844539309855, + "grad_norm": 1.3317260469665473, + "learning_rate": 4.229167918852592e-05, + "loss": 0.3028, + "step": 9417 + }, + { + "epoch": 1.1168030356931105, + "grad_norm": 0.956842399620342, + "learning_rate": 4.2289945528645344e-05, + "loss": 0.203, + "step": 9418 + }, + { + "epoch": 1.1169216174552354, + "grad_norm": 1.134431042887857, + "learning_rate": 4.2288211709372674e-05, + "loss": 0.2265, + "step": 9419 + }, + { + "epoch": 1.1170401992173604, + "grad_norm": 1.7916629101204904, + "learning_rate": 4.228647773072389e-05, + "loss": 0.4385, + "step": 9420 + }, + { + "epoch": 1.1171587809794854, + "grad_norm": 1.4361868079613795, + "learning_rate": 4.228474359271498e-05, + "loss": 0.2951, + "step": 9421 + }, + { + "epoch": 1.1172773627416104, + "grad_norm": 1.0682058176687264, + "learning_rate": 4.228300929536193e-05, + "loss": 0.2653, + "step": 9422 + }, + { + "epoch": 1.1173959445037354, + "grad_norm": 1.0706852418572903, + "learning_rate": 4.2281274838680735e-05, + "loss": 0.1983, + "step": 9423 + }, + { + "epoch": 1.1175145262658603, + "grad_norm": 0.9550560463854861, + "learning_rate": 4.227954022268738e-05, + "loss": 0.1798, + "step": 9424 + }, + { + "epoch": 1.1176331080279853, + "grad_norm": 1.4683541252510492, + "learning_rate": 4.227780544739783e-05, + "loss": 0.2582, + "step": 9425 + }, + { + "epoch": 1.1177516897901103, + "grad_norm": 1.5650915054174988, + "learning_rate": 4.227607051282813e-05, + "loss": 0.4009, + "step": 9426 + }, + { + "epoch": 1.1178702715522353, + "grad_norm": 1.037122860398587, + "learning_rate": 4.2274335418994224e-05, + "loss": 0.2252, + "step": 9427 + }, + { + "epoch": 1.1179888533143603, + "grad_norm": 1.1212525627206786, + "learning_rate": 4.2272600165912144e-05, + "loss": 0.2361, + "step": 9428 + }, + { + "epoch": 1.1181074350764852, + "grad_norm": 1.6344693417473175, + "learning_rate": 4.227086475359786e-05, + "loss": 0.4027, + "step": 9429 + }, + { + "epoch": 1.1182260168386102, + "grad_norm": 0.941061109470346, + "learning_rate": 4.226912918206739e-05, + "loss": 0.2002, + "step": 9430 + }, + { + "epoch": 1.1183445986007352, + "grad_norm": 1.035921419363779, + "learning_rate": 4.2267393451336724e-05, + "loss": 0.2161, + "step": 9431 + }, + { + "epoch": 1.1184631803628602, + "grad_norm": 1.0542905322076106, + "learning_rate": 4.226565756142186e-05, + "loss": 0.1981, + "step": 9432 + }, + { + "epoch": 1.1185817621249852, + "grad_norm": 1.1377264948338868, + "learning_rate": 4.226392151233881e-05, + "loss": 0.2324, + "step": 9433 + }, + { + "epoch": 1.1187003438871101, + "grad_norm": 1.122719189518195, + "learning_rate": 4.2262185304103574e-05, + "loss": 0.2762, + "step": 9434 + }, + { + "epoch": 1.1188189256492351, + "grad_norm": 1.0785445076213456, + "learning_rate": 4.2260448936732155e-05, + "loss": 0.2352, + "step": 9435 + }, + { + "epoch": 1.11893750741136, + "grad_norm": 0.8194734162702694, + "learning_rate": 4.225871241024056e-05, + "loss": 0.2154, + "step": 9436 + }, + { + "epoch": 1.119056089173485, + "grad_norm": 1.1417342467989287, + "learning_rate": 4.2256975724644815e-05, + "loss": 0.2885, + "step": 9437 + }, + { + "epoch": 1.11917467093561, + "grad_norm": 1.255196960447618, + "learning_rate": 4.22552388799609e-05, + "loss": 0.2737, + "step": 9438 + }, + { + "epoch": 1.119293252697735, + "grad_norm": 0.9707589884965621, + "learning_rate": 4.225350187620485e-05, + "loss": 0.2033, + "step": 9439 + }, + { + "epoch": 1.11941183445986, + "grad_norm": 1.2235711176442867, + "learning_rate": 4.225176471339267e-05, + "loss": 0.251, + "step": 9440 + }, + { + "epoch": 1.119530416221985, + "grad_norm": 1.2315301315296676, + "learning_rate": 4.225002739154038e-05, + "loss": 0.2634, + "step": 9441 + }, + { + "epoch": 1.11964899798411, + "grad_norm": 1.1656298272407657, + "learning_rate": 4.2248289910663976e-05, + "loss": 0.222, + "step": 9442 + }, + { + "epoch": 1.119767579746235, + "grad_norm": 0.9635057965986775, + "learning_rate": 4.2246552270779504e-05, + "loss": 0.1756, + "step": 9443 + }, + { + "epoch": 1.11988616150836, + "grad_norm": 1.0789148210403412, + "learning_rate": 4.224481447190296e-05, + "loss": 0.2206, + "step": 9444 + }, + { + "epoch": 1.120004743270485, + "grad_norm": 1.8488426018393302, + "learning_rate": 4.224307651405038e-05, + "loss": 0.5383, + "step": 9445 + }, + { + "epoch": 1.12012332503261, + "grad_norm": 1.2793495416222123, + "learning_rate": 4.224133839723778e-05, + "loss": 0.2208, + "step": 9446 + }, + { + "epoch": 1.120241906794735, + "grad_norm": 1.470932349421977, + "learning_rate": 4.223960012148118e-05, + "loss": 0.3492, + "step": 9447 + }, + { + "epoch": 1.1203604885568599, + "grad_norm": 1.565275508030098, + "learning_rate": 4.223786168679661e-05, + "loss": 0.3517, + "step": 9448 + }, + { + "epoch": 1.1204790703189849, + "grad_norm": 0.9734316245000455, + "learning_rate": 4.223612309320009e-05, + "loss": 0.2507, + "step": 9449 + }, + { + "epoch": 1.1205976520811098, + "grad_norm": 1.400912080661782, + "learning_rate": 4.223438434070765e-05, + "loss": 0.3433, + "step": 9450 + }, + { + "epoch": 1.1207162338432348, + "grad_norm": 1.8926519354009306, + "learning_rate": 4.223264542933532e-05, + "loss": 0.4143, + "step": 9451 + }, + { + "epoch": 1.1208348156053598, + "grad_norm": 0.9864296708791939, + "learning_rate": 4.2230906359099136e-05, + "loss": 0.2065, + "step": 9452 + }, + { + "epoch": 1.1209533973674848, + "grad_norm": 1.101689237497081, + "learning_rate": 4.222916713001512e-05, + "loss": 0.281, + "step": 9453 + }, + { + "epoch": 1.1210719791296098, + "grad_norm": 1.0995415390116752, + "learning_rate": 4.222742774209932e-05, + "loss": 0.2245, + "step": 9454 + }, + { + "epoch": 1.1211905608917347, + "grad_norm": 1.1280014544469223, + "learning_rate": 4.2225688195367755e-05, + "loss": 0.2517, + "step": 9455 + }, + { + "epoch": 1.12130914265386, + "grad_norm": 0.8191708169134806, + "learning_rate": 4.2223948489836475e-05, + "loss": 0.1978, + "step": 9456 + }, + { + "epoch": 1.1214277244159847, + "grad_norm": 1.2390824850476863, + "learning_rate": 4.22222086255215e-05, + "loss": 0.2462, + "step": 9457 + }, + { + "epoch": 1.12154630617811, + "grad_norm": 1.3340081564607897, + "learning_rate": 4.222046860243889e-05, + "loss": 0.2695, + "step": 9458 + }, + { + "epoch": 1.121664887940235, + "grad_norm": 0.8523200799988407, + "learning_rate": 4.221872842060467e-05, + "loss": 0.1478, + "step": 9459 + }, + { + "epoch": 1.1217834697023599, + "grad_norm": 1.5991545322615488, + "learning_rate": 4.221698808003489e-05, + "loss": 0.3828, + "step": 9460 + }, + { + "epoch": 1.1219020514644849, + "grad_norm": 0.847421306682847, + "learning_rate": 4.22152475807456e-05, + "loss": 0.1735, + "step": 9461 + }, + { + "epoch": 1.1220206332266098, + "grad_norm": 1.1153896131730445, + "learning_rate": 4.2213506922752836e-05, + "loss": 0.2368, + "step": 9462 + }, + { + "epoch": 1.1221392149887348, + "grad_norm": 1.4103354425365464, + "learning_rate": 4.2211766106072637e-05, + "loss": 0.3396, + "step": 9463 + }, + { + "epoch": 1.1222577967508598, + "grad_norm": 1.6252600445479943, + "learning_rate": 4.221002513072106e-05, + "loss": 0.3657, + "step": 9464 + }, + { + "epoch": 1.1223763785129848, + "grad_norm": 1.4062158536725813, + "learning_rate": 4.220828399671417e-05, + "loss": 0.2796, + "step": 9465 + }, + { + "epoch": 1.1224949602751098, + "grad_norm": 1.3783722370947131, + "learning_rate": 4.2206542704068e-05, + "loss": 0.3252, + "step": 9466 + }, + { + "epoch": 1.1226135420372347, + "grad_norm": 1.0755219536313942, + "learning_rate": 4.220480125279861e-05, + "loss": 0.2281, + "step": 9467 + }, + { + "epoch": 1.1227321237993597, + "grad_norm": 1.0249358945202947, + "learning_rate": 4.220305964292204e-05, + "loss": 0.2502, + "step": 9468 + }, + { + "epoch": 1.1228507055614847, + "grad_norm": 1.014782925424238, + "learning_rate": 4.2201317874454363e-05, + "loss": 0.2281, + "step": 9469 + }, + { + "epoch": 1.1229692873236097, + "grad_norm": 1.1291828665297095, + "learning_rate": 4.2199575947411617e-05, + "loss": 0.1808, + "step": 9470 + }, + { + "epoch": 1.1230878690857347, + "grad_norm": 1.1030525076530564, + "learning_rate": 4.2197833861809886e-05, + "loss": 0.2661, + "step": 9471 + }, + { + "epoch": 1.1232064508478596, + "grad_norm": 0.9824430815731175, + "learning_rate": 4.219609161766521e-05, + "loss": 0.1645, + "step": 9472 + }, + { + "epoch": 1.1233250326099846, + "grad_norm": 1.3746952793411726, + "learning_rate": 4.2194349214993656e-05, + "loss": 0.2386, + "step": 9473 + }, + { + "epoch": 1.1234436143721096, + "grad_norm": 1.3378276795758457, + "learning_rate": 4.219260665381129e-05, + "loss": 0.3062, + "step": 9474 + }, + { + "epoch": 1.1235621961342346, + "grad_norm": 1.0161395882345192, + "learning_rate": 4.2190863934134174e-05, + "loss": 0.2225, + "step": 9475 + }, + { + "epoch": 1.1236807778963596, + "grad_norm": 1.291029472157644, + "learning_rate": 4.218912105597838e-05, + "loss": 0.2624, + "step": 9476 + }, + { + "epoch": 1.1237993596584845, + "grad_norm": 1.457849123181044, + "learning_rate": 4.218737801935995e-05, + "loss": 0.2934, + "step": 9477 + }, + { + "epoch": 1.1239179414206095, + "grad_norm": 0.8935444942793107, + "learning_rate": 4.218563482429498e-05, + "loss": 0.1693, + "step": 9478 + }, + { + "epoch": 1.1240365231827345, + "grad_norm": 0.9377205719367073, + "learning_rate": 4.2183891470799536e-05, + "loss": 0.1835, + "step": 9479 + }, + { + "epoch": 1.1241551049448595, + "grad_norm": 1.181045941812752, + "learning_rate": 4.2182147958889685e-05, + "loss": 0.2492, + "step": 9480 + }, + { + "epoch": 1.1242736867069845, + "grad_norm": 0.9478662631935663, + "learning_rate": 4.2180404288581494e-05, + "loss": 0.2, + "step": 9481 + }, + { + "epoch": 1.1243922684691094, + "grad_norm": 1.023618891006282, + "learning_rate": 4.217866045989105e-05, + "loss": 0.2594, + "step": 9482 + }, + { + "epoch": 1.1245108502312344, + "grad_norm": 1.38338608803272, + "learning_rate": 4.2176916472834414e-05, + "loss": 0.296, + "step": 9483 + }, + { + "epoch": 1.1246294319933594, + "grad_norm": 0.9512473885866779, + "learning_rate": 4.217517232742768e-05, + "loss": 0.2053, + "step": 9484 + }, + { + "epoch": 1.1247480137554844, + "grad_norm": 1.5949051944152366, + "learning_rate": 4.217342802368691e-05, + "loss": 0.3626, + "step": 9485 + }, + { + "epoch": 1.1248665955176094, + "grad_norm": 1.1557801894731636, + "learning_rate": 4.2171683561628206e-05, + "loss": 0.233, + "step": 9486 + }, + { + "epoch": 1.1249851772797343, + "grad_norm": 1.0951945761196924, + "learning_rate": 4.216993894126763e-05, + "loss": 0.2461, + "step": 9487 + }, + { + "epoch": 1.1251037590418593, + "grad_norm": 1.2276149908987795, + "learning_rate": 4.2168194162621263e-05, + "loss": 0.2507, + "step": 9488 + }, + { + "epoch": 1.1252223408039843, + "grad_norm": 1.0878350625309503, + "learning_rate": 4.2166449225705214e-05, + "loss": 0.1807, + "step": 9489 + }, + { + "epoch": 1.1253409225661093, + "grad_norm": 1.2118505474902643, + "learning_rate": 4.216470413053555e-05, + "loss": 0.2719, + "step": 9490 + }, + { + "epoch": 1.1254595043282343, + "grad_norm": 1.1973040154396364, + "learning_rate": 4.2162958877128354e-05, + "loss": 0.2641, + "step": 9491 + }, + { + "epoch": 1.1255780860903593, + "grad_norm": 1.0506003237249484, + "learning_rate": 4.216121346549973e-05, + "loss": 0.2018, + "step": 9492 + }, + { + "epoch": 1.1256966678524842, + "grad_norm": 1.9136858208945668, + "learning_rate": 4.215946789566576e-05, + "loss": 0.4221, + "step": 9493 + }, + { + "epoch": 1.1258152496146092, + "grad_norm": 1.1709987133461066, + "learning_rate": 4.215772216764254e-05, + "loss": 0.2251, + "step": 9494 + }, + { + "epoch": 1.1259338313767342, + "grad_norm": 1.145604640747553, + "learning_rate": 4.215597628144616e-05, + "loss": 0.2274, + "step": 9495 + }, + { + "epoch": 1.1260524131388592, + "grad_norm": 1.0202132638577668, + "learning_rate": 4.215423023709272e-05, + "loss": 0.2514, + "step": 9496 + }, + { + "epoch": 1.1261709949009842, + "grad_norm": 1.0252527934689764, + "learning_rate": 4.215248403459832e-05, + "loss": 0.2196, + "step": 9497 + }, + { + "epoch": 1.1262895766631091, + "grad_norm": 1.0970065037496002, + "learning_rate": 4.215073767397904e-05, + "loss": 0.2066, + "step": 9498 + }, + { + "epoch": 1.1264081584252341, + "grad_norm": 1.277479675639411, + "learning_rate": 4.2148991155250996e-05, + "loss": 0.2971, + "step": 9499 + }, + { + "epoch": 1.126526740187359, + "grad_norm": 0.9927236121404561, + "learning_rate": 4.214724447843028e-05, + "loss": 0.2231, + "step": 9500 + }, + { + "epoch": 1.126645321949484, + "grad_norm": 1.3171319767368792, + "learning_rate": 4.2145497643532994e-05, + "loss": 0.2953, + "step": 9501 + }, + { + "epoch": 1.126763903711609, + "grad_norm": 1.2263969195848192, + "learning_rate": 4.2143750650575254e-05, + "loss": 0.2591, + "step": 9502 + }, + { + "epoch": 1.1268824854737343, + "grad_norm": 1.364307485943857, + "learning_rate": 4.2142003499573147e-05, + "loss": 0.2547, + "step": 9503 + }, + { + "epoch": 1.127001067235859, + "grad_norm": 1.9876024658814844, + "learning_rate": 4.214025619054279e-05, + "loss": 0.3253, + "step": 9504 + }, + { + "epoch": 1.1271196489979842, + "grad_norm": 1.1343553708268583, + "learning_rate": 4.213850872350029e-05, + "loss": 0.2207, + "step": 9505 + }, + { + "epoch": 1.127238230760109, + "grad_norm": 0.9945084851093255, + "learning_rate": 4.2136761098461766e-05, + "loss": 0.1504, + "step": 9506 + }, + { + "epoch": 1.1273568125222342, + "grad_norm": 1.1092792721881386, + "learning_rate": 4.213501331544331e-05, + "loss": 0.2354, + "step": 9507 + }, + { + "epoch": 1.127475394284359, + "grad_norm": 1.3410752683953866, + "learning_rate": 4.213326537446104e-05, + "loss": 0.2537, + "step": 9508 + }, + { + "epoch": 1.1275939760464841, + "grad_norm": 1.1883038857120036, + "learning_rate": 4.213151727553108e-05, + "loss": 0.2682, + "step": 9509 + }, + { + "epoch": 1.1277125578086091, + "grad_norm": 1.470514039271162, + "learning_rate": 4.212976901866954e-05, + "loss": 0.3706, + "step": 9510 + }, + { + "epoch": 1.127831139570734, + "grad_norm": 1.2652989776827253, + "learning_rate": 4.212802060389253e-05, + "loss": 0.2781, + "step": 9511 + }, + { + "epoch": 1.127949721332859, + "grad_norm": 1.6635125300242328, + "learning_rate": 4.212627203121617e-05, + "loss": 0.4596, + "step": 9512 + }, + { + "epoch": 1.128068303094984, + "grad_norm": 1.0303967704026114, + "learning_rate": 4.212452330065659e-05, + "loss": 0.1951, + "step": 9513 + }, + { + "epoch": 1.128186884857109, + "grad_norm": 0.9587794202929916, + "learning_rate": 4.2122774412229914e-05, + "loss": 0.144, + "step": 9514 + }, + { + "epoch": 1.128305466619234, + "grad_norm": 1.285648178940054, + "learning_rate": 4.2121025365952244e-05, + "loss": 0.2962, + "step": 9515 + }, + { + "epoch": 1.128424048381359, + "grad_norm": 0.9857441434565866, + "learning_rate": 4.2119276161839715e-05, + "loss": 0.2392, + "step": 9516 + }, + { + "epoch": 1.128542630143484, + "grad_norm": 1.1940985457581383, + "learning_rate": 4.2117526799908455e-05, + "loss": 0.3145, + "step": 9517 + }, + { + "epoch": 1.128661211905609, + "grad_norm": 1.188513840834025, + "learning_rate": 4.211577728017459e-05, + "loss": 0.243, + "step": 9518 + }, + { + "epoch": 1.128779793667734, + "grad_norm": 1.3144092783553007, + "learning_rate": 4.211402760265425e-05, + "loss": 0.3017, + "step": 9519 + }, + { + "epoch": 1.128898375429859, + "grad_norm": 0.974110437345182, + "learning_rate": 4.211227776736355e-05, + "loss": 0.2307, + "step": 9520 + }, + { + "epoch": 1.129016957191984, + "grad_norm": 0.978076044607706, + "learning_rate": 4.211052777431865e-05, + "loss": 0.2176, + "step": 9521 + }, + { + "epoch": 1.129135538954109, + "grad_norm": 1.042448048296959, + "learning_rate": 4.2108777623535657e-05, + "loss": 0.233, + "step": 9522 + }, + { + "epoch": 1.1292541207162339, + "grad_norm": 1.278477695545076, + "learning_rate": 4.210702731503071e-05, + "loss": 0.2548, + "step": 9523 + }, + { + "epoch": 1.1293727024783589, + "grad_norm": 1.1975634102002457, + "learning_rate": 4.2105276848819956e-05, + "loss": 0.2135, + "step": 9524 + }, + { + "epoch": 1.1294912842404838, + "grad_norm": 0.98552305655481, + "learning_rate": 4.210352622491953e-05, + "loss": 0.229, + "step": 9525 + }, + { + "epoch": 1.1296098660026088, + "grad_norm": 1.0491441536666648, + "learning_rate": 4.210177544334555e-05, + "loss": 0.2579, + "step": 9526 + }, + { + "epoch": 1.1297284477647338, + "grad_norm": 1.1252154564018293, + "learning_rate": 4.2100024504114185e-05, + "loss": 0.213, + "step": 9527 + }, + { + "epoch": 1.1298470295268588, + "grad_norm": 0.8729775627013676, + "learning_rate": 4.2098273407241565e-05, + "loss": 0.1629, + "step": 9528 + }, + { + "epoch": 1.1299656112889838, + "grad_norm": 0.9234447564164187, + "learning_rate": 4.2096522152743824e-05, + "loss": 0.1628, + "step": 9529 + }, + { + "epoch": 1.1300841930511087, + "grad_norm": 1.2140585616289477, + "learning_rate": 4.209477074063711e-05, + "loss": 0.1852, + "step": 9530 + }, + { + "epoch": 1.1302027748132337, + "grad_norm": 1.015256998014857, + "learning_rate": 4.209301917093759e-05, + "loss": 0.2104, + "step": 9531 + }, + { + "epoch": 1.1303213565753587, + "grad_norm": 0.9750203100611816, + "learning_rate": 4.2091267443661375e-05, + "loss": 0.2307, + "step": 9532 + }, + { + "epoch": 1.1304399383374837, + "grad_norm": 1.188611217910767, + "learning_rate": 4.2089515558824646e-05, + "loss": 0.2329, + "step": 9533 + }, + { + "epoch": 1.1305585200996087, + "grad_norm": 1.224883317964741, + "learning_rate": 4.2087763516443536e-05, + "loss": 0.2325, + "step": 9534 + }, + { + "epoch": 1.1306771018617336, + "grad_norm": 1.5112535019283295, + "learning_rate": 4.20860113165342e-05, + "loss": 0.341, + "step": 9535 + }, + { + "epoch": 1.1307956836238586, + "grad_norm": 1.2058590230305346, + "learning_rate": 4.2084258959112785e-05, + "loss": 0.2811, + "step": 9536 + }, + { + "epoch": 1.1309142653859836, + "grad_norm": 1.3351531074481564, + "learning_rate": 4.208250644419546e-05, + "loss": 0.2721, + "step": 9537 + }, + { + "epoch": 1.1310328471481086, + "grad_norm": 1.6231952693205658, + "learning_rate": 4.208075377179837e-05, + "loss": 0.3286, + "step": 9538 + }, + { + "epoch": 1.1311514289102336, + "grad_norm": 1.0566508518965614, + "learning_rate": 4.2079000941937685e-05, + "loss": 0.234, + "step": 9539 + }, + { + "epoch": 1.1312700106723586, + "grad_norm": 1.089411180702893, + "learning_rate": 4.207724795462955e-05, + "loss": 0.2016, + "step": 9540 + }, + { + "epoch": 1.1313885924344835, + "grad_norm": 0.8923797106022836, + "learning_rate": 4.207549480989013e-05, + "loss": 0.195, + "step": 9541 + }, + { + "epoch": 1.1315071741966085, + "grad_norm": 1.401385264675306, + "learning_rate": 4.2073741507735586e-05, + "loss": 0.3539, + "step": 9542 + }, + { + "epoch": 1.1316257559587335, + "grad_norm": 1.147712283664233, + "learning_rate": 4.207198804818209e-05, + "loss": 0.2629, + "step": 9543 + }, + { + "epoch": 1.1317443377208585, + "grad_norm": 1.3836353840532343, + "learning_rate": 4.2070234431245794e-05, + "loss": 0.3229, + "step": 9544 + }, + { + "epoch": 1.1318629194829835, + "grad_norm": 1.3996515972963874, + "learning_rate": 4.206848065694286e-05, + "loss": 0.2966, + "step": 9545 + }, + { + "epoch": 1.1319815012451084, + "grad_norm": 1.253022189614683, + "learning_rate": 4.206672672528947e-05, + "loss": 0.3101, + "step": 9546 + }, + { + "epoch": 1.1321000830072334, + "grad_norm": 0.8757351680813305, + "learning_rate": 4.20649726363018e-05, + "loss": 0.2038, + "step": 9547 + }, + { + "epoch": 1.1322186647693584, + "grad_norm": 1.0727111538596061, + "learning_rate": 4.2063218389995995e-05, + "loss": 0.2097, + "step": 9548 + }, + { + "epoch": 1.1323372465314834, + "grad_norm": 1.183969955631657, + "learning_rate": 4.206146398638825e-05, + "loss": 0.2514, + "step": 9549 + }, + { + "epoch": 1.1324558282936084, + "grad_norm": 1.561851846783253, + "learning_rate": 4.205970942549472e-05, + "loss": 0.3829, + "step": 9550 + }, + { + "epoch": 1.1325744100557333, + "grad_norm": 0.8617201147556757, + "learning_rate": 4.205795470733159e-05, + "loss": 0.2066, + "step": 9551 + }, + { + "epoch": 1.1326929918178583, + "grad_norm": 1.1490108560894308, + "learning_rate": 4.2056199831915045e-05, + "loss": 0.2834, + "step": 9552 + }, + { + "epoch": 1.1328115735799833, + "grad_norm": 1.0567016518899273, + "learning_rate": 4.2054444799261253e-05, + "loss": 0.2268, + "step": 9553 + }, + { + "epoch": 1.1329301553421085, + "grad_norm": 1.3169108425541456, + "learning_rate": 4.205268960938638e-05, + "loss": 0.2654, + "step": 9554 + }, + { + "epoch": 1.1330487371042333, + "grad_norm": 1.0324119472935256, + "learning_rate": 4.205093426230663e-05, + "loss": 0.2512, + "step": 9555 + }, + { + "epoch": 1.1331673188663585, + "grad_norm": 1.018042423918306, + "learning_rate": 4.2049178758038174e-05, + "loss": 0.1959, + "step": 9556 + }, + { + "epoch": 1.1332859006284832, + "grad_norm": 1.1911158289418267, + "learning_rate": 4.204742309659719e-05, + "loss": 0.264, + "step": 9557 + }, + { + "epoch": 1.1334044823906084, + "grad_norm": 0.902694393814431, + "learning_rate": 4.204566727799988e-05, + "loss": 0.2082, + "step": 9558 + }, + { + "epoch": 1.1335230641527334, + "grad_norm": 1.1052615841407465, + "learning_rate": 4.204391130226242e-05, + "loss": 0.2447, + "step": 9559 + }, + { + "epoch": 1.1336416459148584, + "grad_norm": 1.1963124685820308, + "learning_rate": 4.2042155169401e-05, + "loss": 0.2321, + "step": 9560 + }, + { + "epoch": 1.1337602276769834, + "grad_norm": 1.116310676071142, + "learning_rate": 4.2040398879431794e-05, + "loss": 0.2254, + "step": 9561 + }, + { + "epoch": 1.1338788094391083, + "grad_norm": 1.0201071065009046, + "learning_rate": 4.203864243237102e-05, + "loss": 0.2199, + "step": 9562 + }, + { + "epoch": 1.1339973912012333, + "grad_norm": 1.1849076342217482, + "learning_rate": 4.2036885828234856e-05, + "loss": 0.2684, + "step": 9563 + }, + { + "epoch": 1.1341159729633583, + "grad_norm": 1.238528657583238, + "learning_rate": 4.203512906703949e-05, + "loss": 0.242, + "step": 9564 + }, + { + "epoch": 1.1342345547254833, + "grad_norm": 1.0563521716306172, + "learning_rate": 4.203337214880113e-05, + "loss": 0.21, + "step": 9565 + }, + { + "epoch": 1.1343531364876083, + "grad_norm": 1.1532044032164726, + "learning_rate": 4.2031615073535965e-05, + "loss": 0.2708, + "step": 9566 + }, + { + "epoch": 1.1344717182497333, + "grad_norm": 1.1309636208754372, + "learning_rate": 4.202985784126019e-05, + "loss": 0.2449, + "step": 9567 + }, + { + "epoch": 1.1345903000118582, + "grad_norm": 1.4646997201424854, + "learning_rate": 4.202810045199001e-05, + "loss": 0.3174, + "step": 9568 + }, + { + "epoch": 1.1347088817739832, + "grad_norm": 1.4811087011274067, + "learning_rate": 4.202634290574163e-05, + "loss": 0.2856, + "step": 9569 + }, + { + "epoch": 1.1348274635361082, + "grad_norm": 1.6831835754356492, + "learning_rate": 4.2024585202531246e-05, + "loss": 0.3862, + "step": 9570 + }, + { + "epoch": 1.1349460452982332, + "grad_norm": 1.2042969711393754, + "learning_rate": 4.202282734237506e-05, + "loss": 0.3111, + "step": 9571 + }, + { + "epoch": 1.1350646270603582, + "grad_norm": 1.262085680311482, + "learning_rate": 4.202106932528929e-05, + "loss": 0.2455, + "step": 9572 + }, + { + "epoch": 1.1351832088224831, + "grad_norm": 1.5966754916798365, + "learning_rate": 4.2019311151290116e-05, + "loss": 0.3011, + "step": 9573 + }, + { + "epoch": 1.1353017905846081, + "grad_norm": 1.1659659255569643, + "learning_rate": 4.201755282039378e-05, + "loss": 0.2475, + "step": 9574 + }, + { + "epoch": 1.135420372346733, + "grad_norm": 1.1686446104522077, + "learning_rate": 4.201579433261647e-05, + "loss": 0.2568, + "step": 9575 + }, + { + "epoch": 1.135538954108858, + "grad_norm": 0.7386834513813855, + "learning_rate": 4.201403568797441e-05, + "loss": 0.1647, + "step": 9576 + }, + { + "epoch": 1.135657535870983, + "grad_norm": 0.8415000485117524, + "learning_rate": 4.201227688648379e-05, + "loss": 0.1841, + "step": 9577 + }, + { + "epoch": 1.135776117633108, + "grad_norm": 0.9381977190076284, + "learning_rate": 4.201051792816085e-05, + "loss": 0.1888, + "step": 9578 + }, + { + "epoch": 1.135894699395233, + "grad_norm": 1.1067299226740845, + "learning_rate": 4.2008758813021784e-05, + "loss": 0.2386, + "step": 9579 + }, + { + "epoch": 1.136013281157358, + "grad_norm": 1.1754939672333016, + "learning_rate": 4.200699954108283e-05, + "loss": 0.2551, + "step": 9580 + }, + { + "epoch": 1.136131862919483, + "grad_norm": 1.307308090706873, + "learning_rate": 4.2005240112360186e-05, + "loss": 0.2559, + "step": 9581 + }, + { + "epoch": 1.136250444681608, + "grad_norm": 1.120992503095006, + "learning_rate": 4.200348052687009e-05, + "loss": 0.2629, + "step": 9582 + }, + { + "epoch": 1.136369026443733, + "grad_norm": 0.9921956894919534, + "learning_rate": 4.200172078462875e-05, + "loss": 0.2325, + "step": 9583 + }, + { + "epoch": 1.136487608205858, + "grad_norm": 1.0673488629815806, + "learning_rate": 4.199996088565239e-05, + "loss": 0.2488, + "step": 9584 + }, + { + "epoch": 1.136606189967983, + "grad_norm": 1.2691425838262738, + "learning_rate": 4.1998200829957245e-05, + "loss": 0.2912, + "step": 9585 + }, + { + "epoch": 1.1367247717301079, + "grad_norm": 1.0965999845803793, + "learning_rate": 4.199644061755953e-05, + "loss": 0.2255, + "step": 9586 + }, + { + "epoch": 1.1368433534922329, + "grad_norm": 1.567321678194128, + "learning_rate": 4.1994680248475473e-05, + "loss": 0.3782, + "step": 9587 + }, + { + "epoch": 1.1369619352543578, + "grad_norm": 1.7361132592874282, + "learning_rate": 4.199291972272131e-05, + "loss": 0.3626, + "step": 9588 + }, + { + "epoch": 1.1370805170164828, + "grad_norm": 1.1186819039045572, + "learning_rate": 4.199115904031326e-05, + "loss": 0.2095, + "step": 9589 + }, + { + "epoch": 1.1371990987786078, + "grad_norm": 1.197814395753835, + "learning_rate": 4.198939820126756e-05, + "loss": 0.1897, + "step": 9590 + }, + { + "epoch": 1.1373176805407328, + "grad_norm": 0.9863672280123341, + "learning_rate": 4.198763720560044e-05, + "loss": 0.1915, + "step": 9591 + }, + { + "epoch": 1.1374362623028578, + "grad_norm": 1.1899704416817065, + "learning_rate": 4.1985876053328146e-05, + "loss": 0.2432, + "step": 9592 + }, + { + "epoch": 1.1375548440649828, + "grad_norm": 1.2791669673584152, + "learning_rate": 4.1984114744466895e-05, + "loss": 0.2516, + "step": 9593 + }, + { + "epoch": 1.1376734258271077, + "grad_norm": 1.5714901633926353, + "learning_rate": 4.198235327903294e-05, + "loss": 0.4119, + "step": 9594 + }, + { + "epoch": 1.1377920075892327, + "grad_norm": 1.3336019214432762, + "learning_rate": 4.19805916570425e-05, + "loss": 0.2824, + "step": 9595 + }, + { + "epoch": 1.1379105893513577, + "grad_norm": 0.8608551177534084, + "learning_rate": 4.197882987851185e-05, + "loss": 0.1853, + "step": 9596 + }, + { + "epoch": 1.1380291711134827, + "grad_norm": 1.050869385855372, + "learning_rate": 4.197706794345719e-05, + "loss": 0.2448, + "step": 9597 + }, + { + "epoch": 1.1381477528756077, + "grad_norm": 1.4875774953492171, + "learning_rate": 4.1975305851894786e-05, + "loss": 0.3488, + "step": 9598 + }, + { + "epoch": 1.1382663346377326, + "grad_norm": 1.5861479695328355, + "learning_rate": 4.197354360384088e-05, + "loss": 0.3372, + "step": 9599 + }, + { + "epoch": 1.1383849163998576, + "grad_norm": 1.7164875957372139, + "learning_rate": 4.197178119931172e-05, + "loss": 0.334, + "step": 9600 + }, + { + "epoch": 1.1385034981619826, + "grad_norm": 1.0260462479222048, + "learning_rate": 4.197001863832355e-05, + "loss": 0.2566, + "step": 9601 + }, + { + "epoch": 1.1386220799241076, + "grad_norm": 1.210948758665711, + "learning_rate": 4.1968255920892614e-05, + "loss": 0.2843, + "step": 9602 + }, + { + "epoch": 1.1387406616862328, + "grad_norm": 1.4568212438871426, + "learning_rate": 4.196649304703516e-05, + "loss": 0.2899, + "step": 9603 + }, + { + "epoch": 1.1388592434483575, + "grad_norm": 1.0234970520188253, + "learning_rate": 4.196473001676746e-05, + "loss": 0.2393, + "step": 9604 + }, + { + "epoch": 1.1389778252104827, + "grad_norm": 1.0343728187165242, + "learning_rate": 4.1962966830105744e-05, + "loss": 0.1754, + "step": 9605 + }, + { + "epoch": 1.1390964069726075, + "grad_norm": 1.0023079223415905, + "learning_rate": 4.196120348706628e-05, + "loss": 0.1937, + "step": 9606 + }, + { + "epoch": 1.1392149887347327, + "grad_norm": 1.661691120208774, + "learning_rate": 4.195943998766531e-05, + "loss": 0.3357, + "step": 9607 + }, + { + "epoch": 1.1393335704968575, + "grad_norm": 1.0633490580018665, + "learning_rate": 4.1957676331919104e-05, + "loss": 0.2522, + "step": 9608 + }, + { + "epoch": 1.1394521522589827, + "grad_norm": 0.803769495005734, + "learning_rate": 4.195591251984393e-05, + "loss": 0.2054, + "step": 9609 + }, + { + "epoch": 1.1395707340211076, + "grad_norm": 1.3841534324338196, + "learning_rate": 4.195414855145602e-05, + "loss": 0.2764, + "step": 9610 + }, + { + "epoch": 1.1396893157832326, + "grad_norm": 1.5709376501502126, + "learning_rate": 4.195238442677166e-05, + "loss": 0.3666, + "step": 9611 + }, + { + "epoch": 1.1398078975453576, + "grad_norm": 0.9138681078302203, + "learning_rate": 4.19506201458071e-05, + "loss": 0.2376, + "step": 9612 + }, + { + "epoch": 1.1399264793074826, + "grad_norm": 1.1423654455587167, + "learning_rate": 4.1948855708578606e-05, + "loss": 0.2838, + "step": 9613 + }, + { + "epoch": 1.1400450610696076, + "grad_norm": 0.9160321897640372, + "learning_rate": 4.1947091115102446e-05, + "loss": 0.2102, + "step": 9614 + }, + { + "epoch": 1.1401636428317325, + "grad_norm": 1.7556103785568193, + "learning_rate": 4.194532636539489e-05, + "loss": 0.4117, + "step": 9615 + }, + { + "epoch": 1.1402822245938575, + "grad_norm": 1.593385660312138, + "learning_rate": 4.1943561459472205e-05, + "loss": 0.4248, + "step": 9616 + }, + { + "epoch": 1.1404008063559825, + "grad_norm": 1.057610929153699, + "learning_rate": 4.1941796397350665e-05, + "loss": 0.2348, + "step": 9617 + }, + { + "epoch": 1.1405193881181075, + "grad_norm": 1.5464695195585707, + "learning_rate": 4.194003117904653e-05, + "loss": 0.3333, + "step": 9618 + }, + { + "epoch": 1.1406379698802325, + "grad_norm": 1.2185727957403742, + "learning_rate": 4.193826580457609e-05, + "loss": 0.2235, + "step": 9619 + }, + { + "epoch": 1.1407565516423575, + "grad_norm": 1.1896955917140066, + "learning_rate": 4.19365002739556e-05, + "loss": 0.2704, + "step": 9620 + }, + { + "epoch": 1.1408751334044824, + "grad_norm": 1.0562558401923945, + "learning_rate": 4.193473458720135e-05, + "loss": 0.229, + "step": 9621 + }, + { + "epoch": 1.1409937151666074, + "grad_norm": 1.0066642658022935, + "learning_rate": 4.193296874432961e-05, + "loss": 0.2498, + "step": 9622 + }, + { + "epoch": 1.1411122969287324, + "grad_norm": 1.3317183331496318, + "learning_rate": 4.1931202745356665e-05, + "loss": 0.353, + "step": 9623 + }, + { + "epoch": 1.1412308786908574, + "grad_norm": 1.1823374380379026, + "learning_rate": 4.192943659029881e-05, + "loss": 0.2581, + "step": 9624 + }, + { + "epoch": 1.1413494604529824, + "grad_norm": 1.028652549460109, + "learning_rate": 4.1927670279172294e-05, + "loss": 0.2389, + "step": 9625 + }, + { + "epoch": 1.1414680422151073, + "grad_norm": 0.9556883308619061, + "learning_rate": 4.1925903811993416e-05, + "loss": 0.1688, + "step": 9626 + }, + { + "epoch": 1.1415866239772323, + "grad_norm": 1.3017711349919292, + "learning_rate": 4.192413718877846e-05, + "loss": 0.1886, + "step": 9627 + }, + { + "epoch": 1.1417052057393573, + "grad_norm": 0.978280168234373, + "learning_rate": 4.192237040954372e-05, + "loss": 0.2512, + "step": 9628 + }, + { + "epoch": 1.1418237875014823, + "grad_norm": 1.0990891187615237, + "learning_rate": 4.1920603474305465e-05, + "loss": 0.1985, + "step": 9629 + }, + { + "epoch": 1.1419423692636073, + "grad_norm": 1.3484275075951457, + "learning_rate": 4.191883638308001e-05, + "loss": 0.314, + "step": 9630 + }, + { + "epoch": 1.1420609510257322, + "grad_norm": 1.6838172421743567, + "learning_rate": 4.191706913588362e-05, + "loss": 0.316, + "step": 9631 + }, + { + "epoch": 1.1421795327878572, + "grad_norm": 1.1578423336095374, + "learning_rate": 4.1915301732732605e-05, + "loss": 0.2099, + "step": 9632 + }, + { + "epoch": 1.1422981145499822, + "grad_norm": 1.0051581195829182, + "learning_rate": 4.191353417364324e-05, + "loss": 0.2551, + "step": 9633 + }, + { + "epoch": 1.1424166963121072, + "grad_norm": 1.18736384805768, + "learning_rate": 4.191176645863184e-05, + "loss": 0.2448, + "step": 9634 + }, + { + "epoch": 1.1425352780742322, + "grad_norm": 1.2802547233398902, + "learning_rate": 4.190999858771469e-05, + "loss": 0.3054, + "step": 9635 + }, + { + "epoch": 1.1426538598363571, + "grad_norm": 1.1671891164888442, + "learning_rate": 4.1908230560908086e-05, + "loss": 0.2145, + "step": 9636 + }, + { + "epoch": 1.1427724415984821, + "grad_norm": 0.9044852976777734, + "learning_rate": 4.190646237822833e-05, + "loss": 0.1591, + "step": 9637 + }, + { + "epoch": 1.142891023360607, + "grad_norm": 0.9705924900045289, + "learning_rate": 4.190469403969173e-05, + "loss": 0.2228, + "step": 9638 + }, + { + "epoch": 1.143009605122732, + "grad_norm": 1.3543573290259971, + "learning_rate": 4.190292554531458e-05, + "loss": 0.2791, + "step": 9639 + }, + { + "epoch": 1.143128186884857, + "grad_norm": 1.06463583370425, + "learning_rate": 4.190115689511318e-05, + "loss": 0.2715, + "step": 9640 + }, + { + "epoch": 1.143246768646982, + "grad_norm": 1.6372946834206963, + "learning_rate": 4.189938808910384e-05, + "loss": 0.3417, + "step": 9641 + }, + { + "epoch": 1.143365350409107, + "grad_norm": 0.9512463580407413, + "learning_rate": 4.189761912730286e-05, + "loss": 0.2276, + "step": 9642 + }, + { + "epoch": 1.143483932171232, + "grad_norm": 0.8910832626881221, + "learning_rate": 4.1895850009726564e-05, + "loss": 0.1938, + "step": 9643 + }, + { + "epoch": 1.143602513933357, + "grad_norm": 1.202146728787868, + "learning_rate": 4.189408073639124e-05, + "loss": 0.225, + "step": 9644 + }, + { + "epoch": 1.143721095695482, + "grad_norm": 0.997064125410387, + "learning_rate": 4.189231130731321e-05, + "loss": 0.2005, + "step": 9645 + }, + { + "epoch": 1.143839677457607, + "grad_norm": 1.6341589767831413, + "learning_rate": 4.189054172250879e-05, + "loss": 0.3481, + "step": 9646 + }, + { + "epoch": 1.143958259219732, + "grad_norm": 0.9985844789141011, + "learning_rate": 4.188877198199429e-05, + "loss": 0.2219, + "step": 9647 + }, + { + "epoch": 1.144076840981857, + "grad_norm": 1.5725267493933235, + "learning_rate": 4.188700208578601e-05, + "loss": 0.3693, + "step": 9648 + }, + { + "epoch": 1.144195422743982, + "grad_norm": 1.4405483176444003, + "learning_rate": 4.188523203390029e-05, + "loss": 0.3217, + "step": 9649 + }, + { + "epoch": 1.1443140045061069, + "grad_norm": 1.1204798776371678, + "learning_rate": 4.188346182635343e-05, + "loss": 0.2644, + "step": 9650 + }, + { + "epoch": 1.1444325862682319, + "grad_norm": 0.8976956338231642, + "learning_rate": 4.188169146316177e-05, + "loss": 0.1998, + "step": 9651 + }, + { + "epoch": 1.1445511680303568, + "grad_norm": 0.9660082546175005, + "learning_rate": 4.1879920944341597e-05, + "loss": 0.2122, + "step": 9652 + }, + { + "epoch": 1.1446697497924818, + "grad_norm": 1.4408541033882343, + "learning_rate": 4.187815026990926e-05, + "loss": 0.2839, + "step": 9653 + }, + { + "epoch": 1.144788331554607, + "grad_norm": 0.8218421461902522, + "learning_rate": 4.187637943988108e-05, + "loss": 0.1677, + "step": 9654 + }, + { + "epoch": 1.1449069133167318, + "grad_norm": 1.0757035127994463, + "learning_rate": 4.187460845427337e-05, + "loss": 0.2009, + "step": 9655 + }, + { + "epoch": 1.145025495078857, + "grad_norm": 1.511063346253126, + "learning_rate": 4.187283731310247e-05, + "loss": 0.3588, + "step": 9656 + }, + { + "epoch": 1.1451440768409817, + "grad_norm": 1.3304812910171966, + "learning_rate": 4.1871066016384694e-05, + "loss": 0.2471, + "step": 9657 + }, + { + "epoch": 1.145262658603107, + "grad_norm": 1.0134322941260123, + "learning_rate": 4.1869294564136383e-05, + "loss": 0.2195, + "step": 9658 + }, + { + "epoch": 1.145381240365232, + "grad_norm": 1.0809127651312571, + "learning_rate": 4.1867522956373854e-05, + "loss": 0.1924, + "step": 9659 + }, + { + "epoch": 1.145499822127357, + "grad_norm": 1.2802743484357586, + "learning_rate": 4.186575119311346e-05, + "loss": 0.2419, + "step": 9660 + }, + { + "epoch": 1.1456184038894819, + "grad_norm": 1.274586313360551, + "learning_rate": 4.186397927437151e-05, + "loss": 0.2481, + "step": 9661 + }, + { + "epoch": 1.1457369856516069, + "grad_norm": 1.2764285283071424, + "learning_rate": 4.186220720016436e-05, + "loss": 0.2441, + "step": 9662 + }, + { + "epoch": 1.1458555674137318, + "grad_norm": 1.1555743912083654, + "learning_rate": 4.186043497050833e-05, + "loss": 0.2376, + "step": 9663 + }, + { + "epoch": 1.1459741491758568, + "grad_norm": 1.572899136117067, + "learning_rate": 4.185866258541977e-05, + "loss": 0.3171, + "step": 9664 + }, + { + "epoch": 1.1460927309379818, + "grad_norm": 1.1180596797838855, + "learning_rate": 4.185689004491502e-05, + "loss": 0.2574, + "step": 9665 + }, + { + "epoch": 1.1462113127001068, + "grad_norm": 1.1042516220423861, + "learning_rate": 4.185511734901041e-05, + "loss": 0.2113, + "step": 9666 + }, + { + "epoch": 1.1463298944622318, + "grad_norm": 0.9760002736847903, + "learning_rate": 4.185334449772228e-05, + "loss": 0.2148, + "step": 9667 + }, + { + "epoch": 1.1464484762243567, + "grad_norm": 1.0250787578744802, + "learning_rate": 4.185157149106699e-05, + "loss": 0.2324, + "step": 9668 + }, + { + "epoch": 1.1465670579864817, + "grad_norm": 1.0226348762612674, + "learning_rate": 4.184979832906088e-05, + "loss": 0.2139, + "step": 9669 + }, + { + "epoch": 1.1466856397486067, + "grad_norm": 1.241533754088076, + "learning_rate": 4.184802501172028e-05, + "loss": 0.2974, + "step": 9670 + }, + { + "epoch": 1.1468042215107317, + "grad_norm": 1.6700638512684611, + "learning_rate": 4.184625153906155e-05, + "loss": 0.3145, + "step": 9671 + }, + { + "epoch": 1.1469228032728567, + "grad_norm": 1.1324410861247851, + "learning_rate": 4.184447791110104e-05, + "loss": 0.253, + "step": 9672 + }, + { + "epoch": 1.1470413850349817, + "grad_norm": 1.1366312943049606, + "learning_rate": 4.1842704127855106e-05, + "loss": 0.2645, + "step": 9673 + }, + { + "epoch": 1.1471599667971066, + "grad_norm": 0.9839740839698634, + "learning_rate": 4.18409301893401e-05, + "loss": 0.1953, + "step": 9674 + }, + { + "epoch": 1.1472785485592316, + "grad_norm": 1.0306888482476495, + "learning_rate": 4.183915609557236e-05, + "loss": 0.1884, + "step": 9675 + }, + { + "epoch": 1.1473971303213566, + "grad_norm": 0.9410340946536296, + "learning_rate": 4.1837381846568246e-05, + "loss": 0.242, + "step": 9676 + }, + { + "epoch": 1.1475157120834816, + "grad_norm": 1.1464753188951873, + "learning_rate": 4.183560744234413e-05, + "loss": 0.2693, + "step": 9677 + }, + { + "epoch": 1.1476342938456066, + "grad_norm": 1.0778195080349073, + "learning_rate": 4.1833832882916346e-05, + "loss": 0.2208, + "step": 9678 + }, + { + "epoch": 1.1477528756077315, + "grad_norm": 1.7559538043195355, + "learning_rate": 4.1832058168301266e-05, + "loss": 0.2907, + "step": 9679 + }, + { + "epoch": 1.1478714573698565, + "grad_norm": 1.0785611852367463, + "learning_rate": 4.183028329851526e-05, + "loss": 0.2656, + "step": 9680 + }, + { + "epoch": 1.1479900391319815, + "grad_norm": 1.1450716076064014, + "learning_rate": 4.182850827357467e-05, + "loss": 0.2628, + "step": 9681 + }, + { + "epoch": 1.1481086208941065, + "grad_norm": 0.9655641781103969, + "learning_rate": 4.1826733093495884e-05, + "loss": 0.1816, + "step": 9682 + }, + { + "epoch": 1.1482272026562315, + "grad_norm": 1.7014758720187313, + "learning_rate": 4.1824957758295244e-05, + "loss": 0.3545, + "step": 9683 + }, + { + "epoch": 1.1483457844183564, + "grad_norm": 0.9986963102955382, + "learning_rate": 4.182318226798913e-05, + "loss": 0.225, + "step": 9684 + }, + { + "epoch": 1.1484643661804814, + "grad_norm": 1.2945107823339637, + "learning_rate": 4.1821406622593904e-05, + "loss": 0.2757, + "step": 9685 + }, + { + "epoch": 1.1485829479426064, + "grad_norm": 1.143919673027592, + "learning_rate": 4.1819630822125934e-05, + "loss": 0.3033, + "step": 9686 + }, + { + "epoch": 1.1487015297047314, + "grad_norm": 0.8922398897989006, + "learning_rate": 4.1817854866601596e-05, + "loss": 0.2158, + "step": 9687 + }, + { + "epoch": 1.1488201114668564, + "grad_norm": 1.1197180930064663, + "learning_rate": 4.1816078756037256e-05, + "loss": 0.249, + "step": 9688 + }, + { + "epoch": 1.1489386932289813, + "grad_norm": 1.0191240075722947, + "learning_rate": 4.18143024904493e-05, + "loss": 0.2065, + "step": 9689 + }, + { + "epoch": 1.1490572749911063, + "grad_norm": 1.408094052512272, + "learning_rate": 4.1812526069854087e-05, + "loss": 0.2859, + "step": 9690 + }, + { + "epoch": 1.1491758567532313, + "grad_norm": 1.115043332026719, + "learning_rate": 4.1810749494268e-05, + "loss": 0.2125, + "step": 9691 + }, + { + "epoch": 1.1492944385153563, + "grad_norm": 1.322023439674604, + "learning_rate": 4.180897276370742e-05, + "loss": 0.2721, + "step": 9692 + }, + { + "epoch": 1.1494130202774813, + "grad_norm": 1.1035833364794494, + "learning_rate": 4.180719587818872e-05, + "loss": 0.2306, + "step": 9693 + }, + { + "epoch": 1.1495316020396062, + "grad_norm": 1.1553647181045057, + "learning_rate": 4.180541883772829e-05, + "loss": 0.2369, + "step": 9694 + }, + { + "epoch": 1.1496501838017312, + "grad_norm": 1.3394598211267421, + "learning_rate": 4.180364164234251e-05, + "loss": 0.2809, + "step": 9695 + }, + { + "epoch": 1.1497687655638562, + "grad_norm": 1.1603123838940432, + "learning_rate": 4.180186429204775e-05, + "loss": 0.2481, + "step": 9696 + }, + { + "epoch": 1.1498873473259812, + "grad_norm": 1.2810664759132562, + "learning_rate": 4.1800086786860406e-05, + "loss": 0.1886, + "step": 9697 + }, + { + "epoch": 1.1500059290881062, + "grad_norm": 1.3071092148340149, + "learning_rate": 4.179830912679686e-05, + "loss": 0.2472, + "step": 9698 + }, + { + "epoch": 1.1501245108502312, + "grad_norm": 1.241831256126419, + "learning_rate": 4.179653131187352e-05, + "loss": 0.3243, + "step": 9699 + }, + { + "epoch": 1.1502430926123561, + "grad_norm": 1.5699625508136668, + "learning_rate": 4.179475334210674e-05, + "loss": 0.31, + "step": 9700 + }, + { + "epoch": 1.1503616743744811, + "grad_norm": 1.197611415715529, + "learning_rate": 4.179297521751294e-05, + "loss": 0.2212, + "step": 9701 + }, + { + "epoch": 1.150480256136606, + "grad_norm": 1.190163368161288, + "learning_rate": 4.17911969381085e-05, + "loss": 0.2417, + "step": 9702 + }, + { + "epoch": 1.1505988378987313, + "grad_norm": 1.102185605154122, + "learning_rate": 4.178941850390981e-05, + "loss": 0.2305, + "step": 9703 + }, + { + "epoch": 1.150717419660856, + "grad_norm": 1.500447182652989, + "learning_rate": 4.1787639914933276e-05, + "loss": 0.2804, + "step": 9704 + }, + { + "epoch": 1.1508360014229813, + "grad_norm": 1.2376228886375582, + "learning_rate": 4.178586117119528e-05, + "loss": 0.2378, + "step": 9705 + }, + { + "epoch": 1.150954583185106, + "grad_norm": 1.3941356109738277, + "learning_rate": 4.178408227271224e-05, + "loss": 0.2521, + "step": 9706 + }, + { + "epoch": 1.1510731649472312, + "grad_norm": 0.8628374206930609, + "learning_rate": 4.178230321950054e-05, + "loss": 0.2044, + "step": 9707 + }, + { + "epoch": 1.151191746709356, + "grad_norm": 1.291538331549025, + "learning_rate": 4.1780524011576585e-05, + "loss": 0.2517, + "step": 9708 + }, + { + "epoch": 1.1513103284714812, + "grad_norm": 1.0365636059719874, + "learning_rate": 4.1778744648956767e-05, + "loss": 0.213, + "step": 9709 + }, + { + "epoch": 1.1514289102336062, + "grad_norm": 1.212879415988456, + "learning_rate": 4.177696513165751e-05, + "loss": 0.2173, + "step": 9710 + }, + { + "epoch": 1.1515474919957311, + "grad_norm": 1.198726704882082, + "learning_rate": 4.17751854596952e-05, + "loss": 0.2018, + "step": 9711 + }, + { + "epoch": 1.1516660737578561, + "grad_norm": 1.2350194253516336, + "learning_rate": 4.177340563308625e-05, + "loss": 0.2033, + "step": 9712 + }, + { + "epoch": 1.151784655519981, + "grad_norm": 1.0262618324698836, + "learning_rate": 4.177162565184707e-05, + "loss": 0.2147, + "step": 9713 + }, + { + "epoch": 1.151903237282106, + "grad_norm": 1.5794155626842283, + "learning_rate": 4.176984551599408e-05, + "loss": 0.2982, + "step": 9714 + }, + { + "epoch": 1.152021819044231, + "grad_norm": 0.8901675161974288, + "learning_rate": 4.176806522554366e-05, + "loss": 0.2129, + "step": 9715 + }, + { + "epoch": 1.152140400806356, + "grad_norm": 1.0848323518197425, + "learning_rate": 4.1766284780512255e-05, + "loss": 0.2593, + "step": 9716 + }, + { + "epoch": 1.152258982568481, + "grad_norm": 0.9251817762780647, + "learning_rate": 4.1764504180916264e-05, + "loss": 0.2564, + "step": 9717 + }, + { + "epoch": 1.152377564330606, + "grad_norm": 1.1773942713653593, + "learning_rate": 4.17627234267721e-05, + "loss": 0.1827, + "step": 9718 + }, + { + "epoch": 1.152496146092731, + "grad_norm": 0.9096649744936289, + "learning_rate": 4.176094251809618e-05, + "loss": 0.1824, + "step": 9719 + }, + { + "epoch": 1.152614727854856, + "grad_norm": 0.6915340935104606, + "learning_rate": 4.1759161454904924e-05, + "loss": 0.1727, + "step": 9720 + }, + { + "epoch": 1.152733309616981, + "grad_norm": 1.1324786579056412, + "learning_rate": 4.175738023721475e-05, + "loss": 0.2605, + "step": 9721 + }, + { + "epoch": 1.152851891379106, + "grad_norm": 0.9165939411140239, + "learning_rate": 4.175559886504208e-05, + "loss": 0.1771, + "step": 9722 + }, + { + "epoch": 1.152970473141231, + "grad_norm": 1.5248751684269233, + "learning_rate": 4.175381733840334e-05, + "loss": 0.3501, + "step": 9723 + }, + { + "epoch": 1.153089054903356, + "grad_norm": 1.2449844756770585, + "learning_rate": 4.175203565731493e-05, + "loss": 0.3235, + "step": 9724 + }, + { + "epoch": 1.1532076366654809, + "grad_norm": 1.2546645115452637, + "learning_rate": 4.1750253821793314e-05, + "loss": 0.3042, + "step": 9725 + }, + { + "epoch": 1.1533262184276059, + "grad_norm": 1.411971220307702, + "learning_rate": 4.17484718318549e-05, + "loss": 0.3095, + "step": 9726 + }, + { + "epoch": 1.1534448001897308, + "grad_norm": 2.0731875539536015, + "learning_rate": 4.1746689687516106e-05, + "loss": 0.4783, + "step": 9727 + }, + { + "epoch": 1.1535633819518558, + "grad_norm": 1.119912412113706, + "learning_rate": 4.174490738879336e-05, + "loss": 0.22, + "step": 9728 + }, + { + "epoch": 1.1536819637139808, + "grad_norm": 1.2737620732853396, + "learning_rate": 4.1743124935703123e-05, + "loss": 0.2579, + "step": 9729 + }, + { + "epoch": 1.1538005454761058, + "grad_norm": 0.9031692459746484, + "learning_rate": 4.174134232826179e-05, + "loss": 0.2415, + "step": 9730 + }, + { + "epoch": 1.1539191272382308, + "grad_norm": 1.1292252058815346, + "learning_rate": 4.173955956648582e-05, + "loss": 0.1949, + "step": 9731 + }, + { + "epoch": 1.1540377090003557, + "grad_norm": 0.9293089415154345, + "learning_rate": 4.173777665039163e-05, + "loss": 0.2468, + "step": 9732 + }, + { + "epoch": 1.1541562907624807, + "grad_norm": 1.0394854986713913, + "learning_rate": 4.173599357999567e-05, + "loss": 0.2069, + "step": 9733 + }, + { + "epoch": 1.1542748725246057, + "grad_norm": 0.9306677355959424, + "learning_rate": 4.1734210355314375e-05, + "loss": 0.2359, + "step": 9734 + }, + { + "epoch": 1.1543934542867307, + "grad_norm": 1.227113592635986, + "learning_rate": 4.173242697636417e-05, + "loss": 0.2583, + "step": 9735 + }, + { + "epoch": 1.1545120360488557, + "grad_norm": 1.0811616302298324, + "learning_rate": 4.173064344316151e-05, + "loss": 0.2062, + "step": 9736 + }, + { + "epoch": 1.1546306178109806, + "grad_norm": 1.097442212989027, + "learning_rate": 4.172885975572284e-05, + "loss": 0.2436, + "step": 9737 + }, + { + "epoch": 1.1547491995731056, + "grad_norm": 1.1538305820840913, + "learning_rate": 4.1727075914064594e-05, + "loss": 0.2317, + "step": 9738 + }, + { + "epoch": 1.1548677813352306, + "grad_norm": 1.2054163815806151, + "learning_rate": 4.1725291918203224e-05, + "loss": 0.2991, + "step": 9739 + }, + { + "epoch": 1.1549863630973556, + "grad_norm": 1.1594772665600603, + "learning_rate": 4.172350776815517e-05, + "loss": 0.2234, + "step": 9740 + }, + { + "epoch": 1.1551049448594806, + "grad_norm": 1.335084165955372, + "learning_rate": 4.172172346393688e-05, + "loss": 0.2907, + "step": 9741 + }, + { + "epoch": 1.1552235266216055, + "grad_norm": 1.2799698569810303, + "learning_rate": 4.171993900556481e-05, + "loss": 0.284, + "step": 9742 + }, + { + "epoch": 1.1553421083837305, + "grad_norm": 0.9033953699337584, + "learning_rate": 4.17181543930554e-05, + "loss": 0.1952, + "step": 9743 + }, + { + "epoch": 1.1554606901458555, + "grad_norm": 1.0318392144194215, + "learning_rate": 4.1716369626425115e-05, + "loss": 0.23, + "step": 9744 + }, + { + "epoch": 1.1555792719079805, + "grad_norm": 1.4668517690740324, + "learning_rate": 4.1714584705690394e-05, + "loss": 0.2414, + "step": 9745 + }, + { + "epoch": 1.1556978536701055, + "grad_norm": 1.3069755772806961, + "learning_rate": 4.17127996308677e-05, + "loss": 0.2536, + "step": 9746 + }, + { + "epoch": 1.1558164354322304, + "grad_norm": 1.6714885893640758, + "learning_rate": 4.171101440197349e-05, + "loss": 0.3386, + "step": 9747 + }, + { + "epoch": 1.1559350171943554, + "grad_norm": 1.3462706176178598, + "learning_rate": 4.170922901902422e-05, + "loss": 0.2946, + "step": 9748 + }, + { + "epoch": 1.1560535989564804, + "grad_norm": 1.4802040680786244, + "learning_rate": 4.1707443482036344e-05, + "loss": 0.4042, + "step": 9749 + }, + { + "epoch": 1.1561721807186054, + "grad_norm": 0.8312086513375094, + "learning_rate": 4.1705657791026334e-05, + "loss": 0.1692, + "step": 9750 + }, + { + "epoch": 1.1562907624807304, + "grad_norm": 1.1998655089222094, + "learning_rate": 4.1703871946010644e-05, + "loss": 0.2446, + "step": 9751 + }, + { + "epoch": 1.1564093442428554, + "grad_norm": 0.780388296784744, + "learning_rate": 4.170208594700573e-05, + "loss": 0.1699, + "step": 9752 + }, + { + "epoch": 1.1565279260049803, + "grad_norm": 0.9342335149491425, + "learning_rate": 4.1700299794028073e-05, + "loss": 0.2217, + "step": 9753 + }, + { + "epoch": 1.1566465077671055, + "grad_norm": 1.0633226550107404, + "learning_rate": 4.1698513487094126e-05, + "loss": 0.1839, + "step": 9754 + }, + { + "epoch": 1.1567650895292303, + "grad_norm": 1.1909133663811307, + "learning_rate": 4.169672702622037e-05, + "loss": 0.2673, + "step": 9755 + }, + { + "epoch": 1.1568836712913555, + "grad_norm": 1.3012722295630257, + "learning_rate": 4.169494041142326e-05, + "loss": 0.2886, + "step": 9756 + }, + { + "epoch": 1.1570022530534803, + "grad_norm": 1.3091384940568283, + "learning_rate": 4.169315364271927e-05, + "loss": 0.274, + "step": 9757 + }, + { + "epoch": 1.1571208348156055, + "grad_norm": 0.8991335564339591, + "learning_rate": 4.169136672012487e-05, + "loss": 0.2264, + "step": 9758 + }, + { + "epoch": 1.1572394165777304, + "grad_norm": 1.2097664518602953, + "learning_rate": 4.168957964365655e-05, + "loss": 0.2355, + "step": 9759 + }, + { + "epoch": 1.1573579983398554, + "grad_norm": 1.0346514373866775, + "learning_rate": 4.168779241333076e-05, + "loss": 0.231, + "step": 9760 + }, + { + "epoch": 1.1574765801019804, + "grad_norm": 1.0664691030531535, + "learning_rate": 4.1686005029164e-05, + "loss": 0.2143, + "step": 9761 + }, + { + "epoch": 1.1575951618641054, + "grad_norm": 1.0553275029458815, + "learning_rate": 4.1684217491172725e-05, + "loss": 0.2128, + "step": 9762 + }, + { + "epoch": 1.1577137436262304, + "grad_norm": 1.0243106800741328, + "learning_rate": 4.168242979937343e-05, + "loss": 0.1982, + "step": 9763 + }, + { + "epoch": 1.1578323253883553, + "grad_norm": 1.3418307299101522, + "learning_rate": 4.1680641953782586e-05, + "loss": 0.2899, + "step": 9764 + }, + { + "epoch": 1.1579509071504803, + "grad_norm": 0.9839271912034352, + "learning_rate": 4.167885395441668e-05, + "loss": 0.2123, + "step": 9765 + }, + { + "epoch": 1.1580694889126053, + "grad_norm": 1.1827584601587167, + "learning_rate": 4.167706580129219e-05, + "loss": 0.2943, + "step": 9766 + }, + { + "epoch": 1.1581880706747303, + "grad_norm": 1.4771893247582986, + "learning_rate": 4.167527749442561e-05, + "loss": 0.3659, + "step": 9767 + }, + { + "epoch": 1.1583066524368553, + "grad_norm": 1.6592091652561418, + "learning_rate": 4.167348903383342e-05, + "loss": 0.2699, + "step": 9768 + }, + { + "epoch": 1.1584252341989802, + "grad_norm": 1.170107441710715, + "learning_rate": 4.16717004195321e-05, + "loss": 0.2564, + "step": 9769 + }, + { + "epoch": 1.1585438159611052, + "grad_norm": 1.0123080467567847, + "learning_rate": 4.166991165153815e-05, + "loss": 0.2002, + "step": 9770 + }, + { + "epoch": 1.1586623977232302, + "grad_norm": 0.921675071556106, + "learning_rate": 4.166812272986806e-05, + "loss": 0.1614, + "step": 9771 + }, + { + "epoch": 1.1587809794853552, + "grad_norm": 1.0787532502577128, + "learning_rate": 4.1666333654538315e-05, + "loss": 0.2756, + "step": 9772 + }, + { + "epoch": 1.1588995612474802, + "grad_norm": 0.8226939628998595, + "learning_rate": 4.1664544425565413e-05, + "loss": 0.1796, + "step": 9773 + }, + { + "epoch": 1.1590181430096052, + "grad_norm": 1.6073377841583745, + "learning_rate": 4.166275504296585e-05, + "loss": 0.3575, + "step": 9774 + }, + { + "epoch": 1.1591367247717301, + "grad_norm": 1.454711590410997, + "learning_rate": 4.166096550675611e-05, + "loss": 0.3194, + "step": 9775 + }, + { + "epoch": 1.1592553065338551, + "grad_norm": 1.4885901661737309, + "learning_rate": 4.165917581695271e-05, + "loss": 0.38, + "step": 9776 + }, + { + "epoch": 1.15937388829598, + "grad_norm": 1.1313666844815329, + "learning_rate": 4.1657385973572125e-05, + "loss": 0.2185, + "step": 9777 + }, + { + "epoch": 1.159492470058105, + "grad_norm": 0.9167553498515507, + "learning_rate": 4.165559597663088e-05, + "loss": 0.2294, + "step": 9778 + }, + { + "epoch": 1.15961105182023, + "grad_norm": 1.3199569743343973, + "learning_rate": 4.165380582614545e-05, + "loss": 0.2633, + "step": 9779 + }, + { + "epoch": 1.159729633582355, + "grad_norm": 1.2466368348250365, + "learning_rate": 4.165201552213237e-05, + "loss": 0.2618, + "step": 9780 + }, + { + "epoch": 1.15984821534448, + "grad_norm": 1.0819060012568134, + "learning_rate": 4.1650225064608114e-05, + "loss": 0.2655, + "step": 9781 + }, + { + "epoch": 1.159966797106605, + "grad_norm": 1.048671910140433, + "learning_rate": 4.1648434453589205e-05, + "loss": 0.219, + "step": 9782 + }, + { + "epoch": 1.16008537886873, + "grad_norm": 1.2394270012766493, + "learning_rate": 4.164664368909215e-05, + "loss": 0.2569, + "step": 9783 + }, + { + "epoch": 1.160203960630855, + "grad_norm": 0.9911566886348759, + "learning_rate": 4.164485277113345e-05, + "loss": 0.2063, + "step": 9784 + }, + { + "epoch": 1.16032254239298, + "grad_norm": 0.9551759817701917, + "learning_rate": 4.1643061699729616e-05, + "loss": 0.2033, + "step": 9785 + }, + { + "epoch": 1.160441124155105, + "grad_norm": 1.5045113175578044, + "learning_rate": 4.164127047489716e-05, + "loss": 0.3585, + "step": 9786 + }, + { + "epoch": 1.16055970591723, + "grad_norm": 1.1165018265564772, + "learning_rate": 4.1639479096652615e-05, + "loss": 0.2774, + "step": 9787 + }, + { + "epoch": 1.1606782876793549, + "grad_norm": 0.9124884888775723, + "learning_rate": 4.1637687565012454e-05, + "loss": 0.231, + "step": 9788 + }, + { + "epoch": 1.1607968694414799, + "grad_norm": 1.0139559934694458, + "learning_rate": 4.1635895879993234e-05, + "loss": 0.248, + "step": 9789 + }, + { + "epoch": 1.1609154512036048, + "grad_norm": 1.4112774099724366, + "learning_rate": 4.163410404161144e-05, + "loss": 0.2861, + "step": 9790 + }, + { + "epoch": 1.1610340329657298, + "grad_norm": 1.1131576384905648, + "learning_rate": 4.163231204988362e-05, + "loss": 0.2568, + "step": 9791 + }, + { + "epoch": 1.1611526147278548, + "grad_norm": 0.9758611704113783, + "learning_rate": 4.163051990482626e-05, + "loss": 0.1853, + "step": 9792 + }, + { + "epoch": 1.1612711964899798, + "grad_norm": 1.285430208001626, + "learning_rate": 4.1628727606455916e-05, + "loss": 0.211, + "step": 9793 + }, + { + "epoch": 1.1613897782521048, + "grad_norm": 1.226431189814614, + "learning_rate": 4.162693515478909e-05, + "loss": 0.2379, + "step": 9794 + }, + { + "epoch": 1.1615083600142297, + "grad_norm": 1.25437174139283, + "learning_rate": 4.1625142549842314e-05, + "loss": 0.2994, + "step": 9795 + }, + { + "epoch": 1.1616269417763547, + "grad_norm": 1.3057988891547039, + "learning_rate": 4.1623349791632106e-05, + "loss": 0.2627, + "step": 9796 + }, + { + "epoch": 1.1617455235384797, + "grad_norm": 1.104830886125268, + "learning_rate": 4.1621556880174996e-05, + "loss": 0.2294, + "step": 9797 + }, + { + "epoch": 1.1618641053006047, + "grad_norm": 0.8775242047857768, + "learning_rate": 4.1619763815487526e-05, + "loss": 0.2051, + "step": 9798 + }, + { + "epoch": 1.1619826870627297, + "grad_norm": 1.0529561409815242, + "learning_rate": 4.1617970597586195e-05, + "loss": 0.2155, + "step": 9799 + }, + { + "epoch": 1.1621012688248546, + "grad_norm": 1.644925106954984, + "learning_rate": 4.161617722648757e-05, + "loss": 0.4443, + "step": 9800 + }, + { + "epoch": 1.1622198505869796, + "grad_norm": 1.1177934138536756, + "learning_rate": 4.161438370220816e-05, + "loss": 0.2234, + "step": 9801 + }, + { + "epoch": 1.1623384323491046, + "grad_norm": 1.088155934293742, + "learning_rate": 4.161259002476451e-05, + "loss": 0.238, + "step": 9802 + }, + { + "epoch": 1.1624570141112298, + "grad_norm": 1.1632184199437015, + "learning_rate": 4.161079619417314e-05, + "loss": 0.2541, + "step": 9803 + }, + { + "epoch": 1.1625755958733546, + "grad_norm": 1.4332054654630273, + "learning_rate": 4.1609002210450614e-05, + "loss": 0.2794, + "step": 9804 + }, + { + "epoch": 1.1626941776354798, + "grad_norm": 1.2202499467622312, + "learning_rate": 4.1607208073613444e-05, + "loss": 0.2355, + "step": 9805 + }, + { + "epoch": 1.1628127593976045, + "grad_norm": 1.0039622196510454, + "learning_rate": 4.160541378367819e-05, + "loss": 0.1935, + "step": 9806 + }, + { + "epoch": 1.1629313411597297, + "grad_norm": 1.1528751059491416, + "learning_rate": 4.160361934066138e-05, + "loss": 0.2299, + "step": 9807 + }, + { + "epoch": 1.1630499229218545, + "grad_norm": 1.3755391092912754, + "learning_rate": 4.160182474457955e-05, + "loss": 0.2727, + "step": 9808 + }, + { + "epoch": 1.1631685046839797, + "grad_norm": 1.3194624648364048, + "learning_rate": 4.1600029995449275e-05, + "loss": 0.3031, + "step": 9809 + }, + { + "epoch": 1.1632870864461047, + "grad_norm": 1.0407352475986944, + "learning_rate": 4.1598235093287066e-05, + "loss": 0.2053, + "step": 9810 + }, + { + "epoch": 1.1634056682082297, + "grad_norm": 1.0585452329867486, + "learning_rate": 4.1596440038109486e-05, + "loss": 0.1945, + "step": 9811 + }, + { + "epoch": 1.1635242499703546, + "grad_norm": 1.0840101349385005, + "learning_rate": 4.159464482993308e-05, + "loss": 0.2667, + "step": 9812 + }, + { + "epoch": 1.1636428317324796, + "grad_norm": 1.4310247026262102, + "learning_rate": 4.15928494687744e-05, + "loss": 0.334, + "step": 9813 + }, + { + "epoch": 1.1637614134946046, + "grad_norm": 1.2410633879844142, + "learning_rate": 4.1591053954649995e-05, + "loss": 0.2281, + "step": 9814 + }, + { + "epoch": 1.1638799952567296, + "grad_norm": 0.9290741487012489, + "learning_rate": 4.158925828757642e-05, + "loss": 0.1955, + "step": 9815 + }, + { + "epoch": 1.1639985770188546, + "grad_norm": 1.0803453483057235, + "learning_rate": 4.1587462467570214e-05, + "loss": 0.2263, + "step": 9816 + }, + { + "epoch": 1.1641171587809795, + "grad_norm": 1.47772207533791, + "learning_rate": 4.158566649464796e-05, + "loss": 0.2412, + "step": 9817 + }, + { + "epoch": 1.1642357405431045, + "grad_norm": 1.2927298286013915, + "learning_rate": 4.15838703688262e-05, + "loss": 0.2552, + "step": 9818 + }, + { + "epoch": 1.1643543223052295, + "grad_norm": 0.9744732739082427, + "learning_rate": 4.158207409012148e-05, + "loss": 0.2238, + "step": 9819 + }, + { + "epoch": 1.1644729040673545, + "grad_norm": 1.122468525704558, + "learning_rate": 4.158027765855038e-05, + "loss": 0.2521, + "step": 9820 + }, + { + "epoch": 1.1645914858294795, + "grad_norm": 1.1402595988510433, + "learning_rate": 4.157848107412945e-05, + "loss": 0.2352, + "step": 9821 + }, + { + "epoch": 1.1647100675916044, + "grad_norm": 1.2724707505654502, + "learning_rate": 4.157668433687525e-05, + "loss": 0.265, + "step": 9822 + }, + { + "epoch": 1.1648286493537294, + "grad_norm": 1.0017155432098488, + "learning_rate": 4.1574887446804355e-05, + "loss": 0.1616, + "step": 9823 + }, + { + "epoch": 1.1649472311158544, + "grad_norm": 1.3190684631312177, + "learning_rate": 4.1573090403933325e-05, + "loss": 0.2589, + "step": 9824 + }, + { + "epoch": 1.1650658128779794, + "grad_norm": 1.2840647576528115, + "learning_rate": 4.1571293208278714e-05, + "loss": 0.2458, + "step": 9825 + }, + { + "epoch": 1.1651843946401044, + "grad_norm": 1.3983914609076906, + "learning_rate": 4.156949585985711e-05, + "loss": 0.3042, + "step": 9826 + }, + { + "epoch": 1.1653029764022294, + "grad_norm": 1.6631557381441104, + "learning_rate": 4.1567698358685066e-05, + "loss": 0.2582, + "step": 9827 + }, + { + "epoch": 1.1654215581643543, + "grad_norm": 1.0759956898146648, + "learning_rate": 4.1565900704779166e-05, + "loss": 0.2936, + "step": 9828 + }, + { + "epoch": 1.1655401399264793, + "grad_norm": 0.9521726567399845, + "learning_rate": 4.156410289815597e-05, + "loss": 0.2113, + "step": 9829 + }, + { + "epoch": 1.1656587216886043, + "grad_norm": 1.144028532434178, + "learning_rate": 4.1562304938832065e-05, + "loss": 0.2072, + "step": 9830 + }, + { + "epoch": 1.1657773034507293, + "grad_norm": 1.25410391992121, + "learning_rate": 4.156050682682401e-05, + "loss": 0.2852, + "step": 9831 + }, + { + "epoch": 1.1658958852128543, + "grad_norm": 1.0774115931922472, + "learning_rate": 4.15587085621484e-05, + "loss": 0.2632, + "step": 9832 + }, + { + "epoch": 1.1660144669749792, + "grad_norm": 1.2431147371794047, + "learning_rate": 4.155691014482179e-05, + "loss": 0.2287, + "step": 9833 + }, + { + "epoch": 1.1661330487371042, + "grad_norm": 1.2406532194871465, + "learning_rate": 4.1555111574860785e-05, + "loss": 0.2741, + "step": 9834 + }, + { + "epoch": 1.1662516304992292, + "grad_norm": 0.9772674313310136, + "learning_rate": 4.155331285228194e-05, + "loss": 0.2026, + "step": 9835 + }, + { + "epoch": 1.1663702122613542, + "grad_norm": 0.993123148868634, + "learning_rate": 4.1551513977101855e-05, + "loss": 0.2521, + "step": 9836 + }, + { + "epoch": 1.1664887940234792, + "grad_norm": 1.6563210480289874, + "learning_rate": 4.154971494933712e-05, + "loss": 0.4134, + "step": 9837 + }, + { + "epoch": 1.1666073757856041, + "grad_norm": 1.150657114019511, + "learning_rate": 4.154791576900429e-05, + "loss": 0.2637, + "step": 9838 + }, + { + "epoch": 1.1667259575477291, + "grad_norm": 1.2727136471032243, + "learning_rate": 4.1546116436119974e-05, + "loss": 0.2992, + "step": 9839 + }, + { + "epoch": 1.166844539309854, + "grad_norm": 0.9388260427273223, + "learning_rate": 4.154431695070076e-05, + "loss": 0.1929, + "step": 9840 + }, + { + "epoch": 1.166963121071979, + "grad_norm": 0.7656858863570207, + "learning_rate": 4.154251731276323e-05, + "loss": 0.2054, + "step": 9841 + }, + { + "epoch": 1.167081702834104, + "grad_norm": 1.0439938466513767, + "learning_rate": 4.154071752232397e-05, + "loss": 0.2419, + "step": 9842 + }, + { + "epoch": 1.167200284596229, + "grad_norm": 1.1932076156053617, + "learning_rate": 4.153891757939958e-05, + "loss": 0.2065, + "step": 9843 + }, + { + "epoch": 1.167318866358354, + "grad_norm": 1.5972977229375684, + "learning_rate": 4.153711748400665e-05, + "loss": 0.3152, + "step": 9844 + }, + { + "epoch": 1.167437448120479, + "grad_norm": 0.9843814492084669, + "learning_rate": 4.1535317236161786e-05, + "loss": 0.1989, + "step": 9845 + }, + { + "epoch": 1.167556029882604, + "grad_norm": 1.2540010542527213, + "learning_rate": 4.153351683588157e-05, + "loss": 0.2573, + "step": 9846 + }, + { + "epoch": 1.167674611644729, + "grad_norm": 1.3198587568886864, + "learning_rate": 4.15317162831826e-05, + "loss": 0.3202, + "step": 9847 + }, + { + "epoch": 1.167793193406854, + "grad_norm": 1.027119386475778, + "learning_rate": 4.152991557808148e-05, + "loss": 0.204, + "step": 9848 + }, + { + "epoch": 1.167911775168979, + "grad_norm": 1.4814958043091926, + "learning_rate": 4.15281147205948e-05, + "loss": 0.3037, + "step": 9849 + }, + { + "epoch": 1.168030356931104, + "grad_norm": 1.0264446863245735, + "learning_rate": 4.1526313710739184e-05, + "loss": 0.2106, + "step": 9850 + }, + { + "epoch": 1.1681489386932289, + "grad_norm": 0.8266353023627034, + "learning_rate": 4.1524512548531216e-05, + "loss": 0.2119, + "step": 9851 + }, + { + "epoch": 1.1682675204553539, + "grad_norm": 0.9795202024734199, + "learning_rate": 4.15227112339875e-05, + "loss": 0.1908, + "step": 9852 + }, + { + "epoch": 1.1683861022174789, + "grad_norm": 1.3482002795640746, + "learning_rate": 4.152090976712466e-05, + "loss": 0.3398, + "step": 9853 + }, + { + "epoch": 1.168504683979604, + "grad_norm": 1.0874607370834783, + "learning_rate": 4.1519108147959284e-05, + "loss": 0.2613, + "step": 9854 + }, + { + "epoch": 1.1686232657417288, + "grad_norm": 1.0334165623123646, + "learning_rate": 4.1517306376507994e-05, + "loss": 0.2502, + "step": 9855 + }, + { + "epoch": 1.168741847503854, + "grad_norm": 1.1147390942817372, + "learning_rate": 4.151550445278739e-05, + "loss": 0.2406, + "step": 9856 + }, + { + "epoch": 1.1688604292659788, + "grad_norm": 0.9847574238416811, + "learning_rate": 4.151370237681408e-05, + "loss": 0.2393, + "step": 9857 + }, + { + "epoch": 1.168979011028104, + "grad_norm": 1.2351941771138877, + "learning_rate": 4.151190014860469e-05, + "loss": 0.2272, + "step": 9858 + }, + { + "epoch": 1.169097592790229, + "grad_norm": 1.2827311339507022, + "learning_rate": 4.151009776817583e-05, + "loss": 0.2621, + "step": 9859 + }, + { + "epoch": 1.169216174552354, + "grad_norm": 1.338624658076609, + "learning_rate": 4.150829523554412e-05, + "loss": 0.2221, + "step": 9860 + }, + { + "epoch": 1.169334756314479, + "grad_norm": 1.0858753494377686, + "learning_rate": 4.150649255072616e-05, + "loss": 0.2398, + "step": 9861 + }, + { + "epoch": 1.169453338076604, + "grad_norm": 1.1636164376677873, + "learning_rate": 4.150468971373859e-05, + "loss": 0.204, + "step": 9862 + }, + { + "epoch": 1.1695719198387289, + "grad_norm": 1.40193557302608, + "learning_rate": 4.1502886724598015e-05, + "loss": 0.299, + "step": 9863 + }, + { + "epoch": 1.1696905016008539, + "grad_norm": 1.2759804439624638, + "learning_rate": 4.1501083583321065e-05, + "loss": 0.2481, + "step": 9864 + }, + { + "epoch": 1.1698090833629788, + "grad_norm": 1.3464461917648929, + "learning_rate": 4.149928028992436e-05, + "loss": 0.3104, + "step": 9865 + }, + { + "epoch": 1.1699276651251038, + "grad_norm": 0.9735145839018802, + "learning_rate": 4.1497476844424514e-05, + "loss": 0.167, + "step": 9866 + }, + { + "epoch": 1.1700462468872288, + "grad_norm": 1.3373323055609687, + "learning_rate": 4.149567324683818e-05, + "loss": 0.2528, + "step": 9867 + }, + { + "epoch": 1.1701648286493538, + "grad_norm": 1.1418564074701067, + "learning_rate": 4.149386949718195e-05, + "loss": 0.2487, + "step": 9868 + }, + { + "epoch": 1.1702834104114788, + "grad_norm": 1.1166387221895955, + "learning_rate": 4.1492065595472476e-05, + "loss": 0.2517, + "step": 9869 + }, + { + "epoch": 1.1704019921736037, + "grad_norm": 1.6459870477127574, + "learning_rate": 4.1490261541726384e-05, + "loss": 0.3932, + "step": 9870 + }, + { + "epoch": 1.1705205739357287, + "grad_norm": 1.3640329752856664, + "learning_rate": 4.1488457335960294e-05, + "loss": 0.2512, + "step": 9871 + }, + { + "epoch": 1.1706391556978537, + "grad_norm": 0.8366678951735907, + "learning_rate": 4.1486652978190855e-05, + "loss": 0.1671, + "step": 9872 + }, + { + "epoch": 1.1707577374599787, + "grad_norm": 1.4251123419599485, + "learning_rate": 4.148484846843469e-05, + "loss": 0.2617, + "step": 9873 + }, + { + "epoch": 1.1708763192221037, + "grad_norm": 1.3714201235435066, + "learning_rate": 4.148304380670843e-05, + "loss": 0.3043, + "step": 9874 + }, + { + "epoch": 1.1709949009842286, + "grad_norm": 0.8941006401503712, + "learning_rate": 4.1481238993028734e-05, + "loss": 0.2101, + "step": 9875 + }, + { + "epoch": 1.1711134827463536, + "grad_norm": 0.9609752992345864, + "learning_rate": 4.147943402741221e-05, + "loss": 0.1801, + "step": 9876 + }, + { + "epoch": 1.1712320645084786, + "grad_norm": 1.2463443210245537, + "learning_rate": 4.147762890987552e-05, + "loss": 0.2546, + "step": 9877 + }, + { + "epoch": 1.1713506462706036, + "grad_norm": 1.4654431682726694, + "learning_rate": 4.14758236404353e-05, + "loss": 0.2724, + "step": 9878 + }, + { + "epoch": 1.1714692280327286, + "grad_norm": 1.304400839203449, + "learning_rate": 4.147401821910819e-05, + "loss": 0.3232, + "step": 9879 + }, + { + "epoch": 1.1715878097948536, + "grad_norm": 1.3041892830182265, + "learning_rate": 4.147221264591083e-05, + "loss": 0.3115, + "step": 9880 + }, + { + "epoch": 1.1717063915569785, + "grad_norm": 0.9615758740407383, + "learning_rate": 4.147040692085987e-05, + "loss": 0.2444, + "step": 9881 + }, + { + "epoch": 1.1718249733191035, + "grad_norm": 1.3555152780357733, + "learning_rate": 4.1468601043971954e-05, + "loss": 0.3392, + "step": 9882 + }, + { + "epoch": 1.1719435550812285, + "grad_norm": 1.1847817443463542, + "learning_rate": 4.146679501526373e-05, + "loss": 0.2324, + "step": 9883 + }, + { + "epoch": 1.1720621368433535, + "grad_norm": 1.1107691152678436, + "learning_rate": 4.146498883475186e-05, + "loss": 0.2426, + "step": 9884 + }, + { + "epoch": 1.1721807186054785, + "grad_norm": 1.049560617634572, + "learning_rate": 4.146318250245297e-05, + "loss": 0.2291, + "step": 9885 + }, + { + "epoch": 1.1722993003676034, + "grad_norm": 0.9992669082628642, + "learning_rate": 4.1461376018383736e-05, + "loss": 0.2381, + "step": 9886 + }, + { + "epoch": 1.1724178821297284, + "grad_norm": 1.0685505397464568, + "learning_rate": 4.145956938256079e-05, + "loss": 0.2727, + "step": 9887 + }, + { + "epoch": 1.1725364638918534, + "grad_norm": 1.2198324948515844, + "learning_rate": 4.145776259500081e-05, + "loss": 0.2798, + "step": 9888 + }, + { + "epoch": 1.1726550456539784, + "grad_norm": 1.1744359395413069, + "learning_rate": 4.1455955655720433e-05, + "loss": 0.2951, + "step": 9889 + }, + { + "epoch": 1.1727736274161034, + "grad_norm": 1.2864017940752666, + "learning_rate": 4.1454148564736325e-05, + "loss": 0.2742, + "step": 9890 + }, + { + "epoch": 1.1728922091782283, + "grad_norm": 1.1695769711880684, + "learning_rate": 4.1452341322065144e-05, + "loss": 0.208, + "step": 9891 + }, + { + "epoch": 1.1730107909403533, + "grad_norm": 1.1502385274142486, + "learning_rate": 4.1450533927723565e-05, + "loss": 0.2369, + "step": 9892 + }, + { + "epoch": 1.1731293727024783, + "grad_norm": 1.185523774907947, + "learning_rate": 4.144872638172822e-05, + "loss": 0.2686, + "step": 9893 + }, + { + "epoch": 1.1732479544646033, + "grad_norm": 1.2080063366779088, + "learning_rate": 4.144691868409579e-05, + "loss": 0.2312, + "step": 9894 + }, + { + "epoch": 1.1733665362267283, + "grad_norm": 0.7867245553011812, + "learning_rate": 4.1445110834842945e-05, + "loss": 0.179, + "step": 9895 + }, + { + "epoch": 1.1734851179888532, + "grad_norm": 1.3281814605842954, + "learning_rate": 4.1443302833986345e-05, + "loss": 0.2988, + "step": 9896 + }, + { + "epoch": 1.1736036997509782, + "grad_norm": 1.2534884365291157, + "learning_rate": 4.1441494681542655e-05, + "loss": 0.3231, + "step": 9897 + }, + { + "epoch": 1.1737222815131032, + "grad_norm": 1.5221586241226934, + "learning_rate": 4.143968637752854e-05, + "loss": 0.3021, + "step": 9898 + }, + { + "epoch": 1.1738408632752282, + "grad_norm": 1.0607087089451566, + "learning_rate": 4.1437877921960684e-05, + "loss": 0.2052, + "step": 9899 + }, + { + "epoch": 1.1739594450373532, + "grad_norm": 1.043886961719104, + "learning_rate": 4.143606931485574e-05, + "loss": 0.2224, + "step": 9900 + }, + { + "epoch": 1.1740780267994781, + "grad_norm": 1.6441170754799121, + "learning_rate": 4.143426055623041e-05, + "loss": 0.3937, + "step": 9901 + }, + { + "epoch": 1.1741966085616031, + "grad_norm": 1.332322003409126, + "learning_rate": 4.1432451646101336e-05, + "loss": 0.3119, + "step": 9902 + }, + { + "epoch": 1.1743151903237283, + "grad_norm": 1.1270320230376007, + "learning_rate": 4.143064258448521e-05, + "loss": 0.2808, + "step": 9903 + }, + { + "epoch": 1.174433772085853, + "grad_norm": 1.242482093182845, + "learning_rate": 4.1428833371398715e-05, + "loss": 0.2851, + "step": 9904 + }, + { + "epoch": 1.1745523538479783, + "grad_norm": 1.03475857944016, + "learning_rate": 4.1427024006858514e-05, + "loss": 0.2278, + "step": 9905 + }, + { + "epoch": 1.174670935610103, + "grad_norm": 1.295992646111444, + "learning_rate": 4.1425214490881305e-05, + "loss": 0.272, + "step": 9906 + }, + { + "epoch": 1.1747895173722283, + "grad_norm": 1.2126493211612044, + "learning_rate": 4.1423404823483756e-05, + "loss": 0.2915, + "step": 9907 + }, + { + "epoch": 1.174908099134353, + "grad_norm": 0.9577608443502641, + "learning_rate": 4.142159500468256e-05, + "loss": 0.1889, + "step": 9908 + }, + { + "epoch": 1.1750266808964782, + "grad_norm": 1.070377433805415, + "learning_rate": 4.1419785034494385e-05, + "loss": 0.249, + "step": 9909 + }, + { + "epoch": 1.1751452626586032, + "grad_norm": 1.0185744640922725, + "learning_rate": 4.141797491293593e-05, + "loss": 0.2583, + "step": 9910 + }, + { + "epoch": 1.1752638444207282, + "grad_norm": 1.3401447537764066, + "learning_rate": 4.141616464002388e-05, + "loss": 0.2689, + "step": 9911 + }, + { + "epoch": 1.1753824261828532, + "grad_norm": 1.166643613473188, + "learning_rate": 4.141435421577493e-05, + "loss": 0.2798, + "step": 9912 + }, + { + "epoch": 1.1755010079449781, + "grad_norm": 0.8489100320189321, + "learning_rate": 4.141254364020576e-05, + "loss": 0.1562, + "step": 9913 + }, + { + "epoch": 1.1756195897071031, + "grad_norm": 1.6571421388439227, + "learning_rate": 4.1410732913333056e-05, + "loss": 0.4044, + "step": 9914 + }, + { + "epoch": 1.175738171469228, + "grad_norm": 1.1316985155984705, + "learning_rate": 4.140892203517353e-05, + "loss": 0.242, + "step": 9915 + }, + { + "epoch": 1.175856753231353, + "grad_norm": 1.1661889056594879, + "learning_rate": 4.1407111005743855e-05, + "loss": 0.2853, + "step": 9916 + }, + { + "epoch": 1.175975334993478, + "grad_norm": 1.1691422034512469, + "learning_rate": 4.140529982506074e-05, + "loss": 0.2805, + "step": 9917 + }, + { + "epoch": 1.176093916755603, + "grad_norm": 1.2094653984802082, + "learning_rate": 4.140348849314087e-05, + "loss": 0.3113, + "step": 9918 + }, + { + "epoch": 1.176212498517728, + "grad_norm": 1.0417101181432251, + "learning_rate": 4.140167701000097e-05, + "loss": 0.2401, + "step": 9919 + }, + { + "epoch": 1.176331080279853, + "grad_norm": 0.8956477035028819, + "learning_rate": 4.139986537565771e-05, + "loss": 0.2167, + "step": 9920 + }, + { + "epoch": 1.176449662041978, + "grad_norm": 1.0118632710777948, + "learning_rate": 4.13980535901278e-05, + "loss": 0.2284, + "step": 9921 + }, + { + "epoch": 1.176568243804103, + "grad_norm": 1.0105674038917705, + "learning_rate": 4.139624165342794e-05, + "loss": 0.2221, + "step": 9922 + }, + { + "epoch": 1.176686825566228, + "grad_norm": 1.6524583559164925, + "learning_rate": 4.1394429565574847e-05, + "loss": 0.2712, + "step": 9923 + }, + { + "epoch": 1.176805407328353, + "grad_norm": 1.2149345787799346, + "learning_rate": 4.1392617326585215e-05, + "loss": 0.2246, + "step": 9924 + }, + { + "epoch": 1.176923989090478, + "grad_norm": 1.2909319929148482, + "learning_rate": 4.139080493647576e-05, + "loss": 0.246, + "step": 9925 + }, + { + "epoch": 1.1770425708526029, + "grad_norm": 1.0137554901265642, + "learning_rate": 4.138899239526317e-05, + "loss": 0.2327, + "step": 9926 + }, + { + "epoch": 1.1771611526147279, + "grad_norm": 0.6798392232470067, + "learning_rate": 4.138717970296419e-05, + "loss": 0.1331, + "step": 9927 + }, + { + "epoch": 1.1772797343768528, + "grad_norm": 1.0245696709648877, + "learning_rate": 4.138536685959549e-05, + "loss": 0.2533, + "step": 9928 + }, + { + "epoch": 1.1773983161389778, + "grad_norm": 0.8418573536443829, + "learning_rate": 4.13835538651738e-05, + "loss": 0.2054, + "step": 9929 + }, + { + "epoch": 1.1775168979011028, + "grad_norm": 1.6388329351227398, + "learning_rate": 4.138174071971583e-05, + "loss": 0.495, + "step": 9930 + }, + { + "epoch": 1.1776354796632278, + "grad_norm": 0.8757003463885359, + "learning_rate": 4.1379927423238315e-05, + "loss": 0.1773, + "step": 9931 + }, + { + "epoch": 1.1777540614253528, + "grad_norm": 1.1489619684296808, + "learning_rate": 4.1378113975757945e-05, + "loss": 0.2602, + "step": 9932 + }, + { + "epoch": 1.1778726431874778, + "grad_norm": 1.1702775921045052, + "learning_rate": 4.137630037729145e-05, + "loss": 0.267, + "step": 9933 + }, + { + "epoch": 1.1779912249496027, + "grad_norm": 0.9014630821867519, + "learning_rate": 4.137448662785555e-05, + "loss": 0.176, + "step": 9934 + }, + { + "epoch": 1.1781098067117277, + "grad_norm": 1.5502465033962605, + "learning_rate": 4.1372672727466963e-05, + "loss": 0.2839, + "step": 9935 + }, + { + "epoch": 1.1782283884738527, + "grad_norm": 1.0602085439812772, + "learning_rate": 4.137085867614241e-05, + "loss": 0.2311, + "step": 9936 + }, + { + "epoch": 1.1783469702359777, + "grad_norm": 1.009000720919505, + "learning_rate": 4.1369044473898614e-05, + "loss": 0.1745, + "step": 9937 + }, + { + "epoch": 1.1784655519981027, + "grad_norm": 0.7878211407300001, + "learning_rate": 4.136723012075231e-05, + "loss": 0.2336, + "step": 9938 + }, + { + "epoch": 1.1785841337602276, + "grad_norm": 1.096632040419667, + "learning_rate": 4.13654156167202e-05, + "loss": 0.2149, + "step": 9939 + }, + { + "epoch": 1.1787027155223526, + "grad_norm": 1.2029146150017398, + "learning_rate": 4.1363600961819035e-05, + "loss": 0.2328, + "step": 9940 + }, + { + "epoch": 1.1788212972844776, + "grad_norm": 1.5155268917453095, + "learning_rate": 4.136178615606553e-05, + "loss": 0.3418, + "step": 9941 + }, + { + "epoch": 1.1789398790466026, + "grad_norm": 1.1378860172565022, + "learning_rate": 4.1359971199476426e-05, + "loss": 0.2228, + "step": 9942 + }, + { + "epoch": 1.1790584608087276, + "grad_norm": 0.8742737294710003, + "learning_rate": 4.135815609206844e-05, + "loss": 0.1631, + "step": 9943 + }, + { + "epoch": 1.1791770425708525, + "grad_norm": 1.2351671383363447, + "learning_rate": 4.135634083385833e-05, + "loss": 0.2512, + "step": 9944 + }, + { + "epoch": 1.1792956243329775, + "grad_norm": 1.1057055300556915, + "learning_rate": 4.135452542486281e-05, + "loss": 0.2318, + "step": 9945 + }, + { + "epoch": 1.1794142060951025, + "grad_norm": 1.3033374750276467, + "learning_rate": 4.135270986509862e-05, + "loss": 0.2086, + "step": 9946 + }, + { + "epoch": 1.1795327878572275, + "grad_norm": 1.4020015000568131, + "learning_rate": 4.13508941545825e-05, + "loss": 0.3649, + "step": 9947 + }, + { + "epoch": 1.1796513696193525, + "grad_norm": 1.3336228107245442, + "learning_rate": 4.1349078293331176e-05, + "loss": 0.2111, + "step": 9948 + }, + { + "epoch": 1.1797699513814774, + "grad_norm": 1.3476119594423759, + "learning_rate": 4.1347262281361406e-05, + "loss": 0.2503, + "step": 9949 + }, + { + "epoch": 1.1798885331436024, + "grad_norm": 1.011779596905743, + "learning_rate": 4.134544611868992e-05, + "loss": 0.1999, + "step": 9950 + }, + { + "epoch": 1.1800071149057274, + "grad_norm": 1.1745208847776751, + "learning_rate": 4.1343629805333476e-05, + "loss": 0.2082, + "step": 9951 + }, + { + "epoch": 1.1801256966678526, + "grad_norm": 1.1142083367689635, + "learning_rate": 4.13418133413088e-05, + "loss": 0.2073, + "step": 9952 + }, + { + "epoch": 1.1802442784299774, + "grad_norm": 1.0875870060473547, + "learning_rate": 4.1339996726632645e-05, + "loss": 0.2165, + "step": 9953 + }, + { + "epoch": 1.1803628601921026, + "grad_norm": 0.9972110800563697, + "learning_rate": 4.1338179961321754e-05, + "loss": 0.2071, + "step": 9954 + }, + { + "epoch": 1.1804814419542273, + "grad_norm": 1.1627154791961782, + "learning_rate": 4.133636304539289e-05, + "loss": 0.3033, + "step": 9955 + }, + { + "epoch": 1.1806000237163525, + "grad_norm": 0.7330426456515045, + "learning_rate": 4.133454597886278e-05, + "loss": 0.1702, + "step": 9956 + }, + { + "epoch": 1.1807186054784773, + "grad_norm": 0.9822046096735356, + "learning_rate": 4.1332728761748196e-05, + "loss": 0.1811, + "step": 9957 + }, + { + "epoch": 1.1808371872406025, + "grad_norm": 1.1843700820217926, + "learning_rate": 4.1330911394065874e-05, + "loss": 0.2647, + "step": 9958 + }, + { + "epoch": 1.1809557690027275, + "grad_norm": 1.000045061574419, + "learning_rate": 4.1329093875832574e-05, + "loss": 0.2403, + "step": 9959 + }, + { + "epoch": 1.1810743507648525, + "grad_norm": 1.2402739748959346, + "learning_rate": 4.132727620706506e-05, + "loss": 0.3181, + "step": 9960 + }, + { + "epoch": 1.1811929325269774, + "grad_norm": 1.368562143952398, + "learning_rate": 4.132545838778008e-05, + "loss": 0.3102, + "step": 9961 + }, + { + "epoch": 1.1813115142891024, + "grad_norm": 0.931065424641166, + "learning_rate": 4.132364041799438e-05, + "loss": 0.1997, + "step": 9962 + }, + { + "epoch": 1.1814300960512274, + "grad_norm": 1.1081104477681922, + "learning_rate": 4.132182229772475e-05, + "loss": 0.2184, + "step": 9963 + }, + { + "epoch": 1.1815486778133524, + "grad_norm": 1.2246360042289595, + "learning_rate": 4.132000402698792e-05, + "loss": 0.2452, + "step": 9964 + }, + { + "epoch": 1.1816672595754774, + "grad_norm": 1.0581212692083337, + "learning_rate": 4.1318185605800674e-05, + "loss": 0.2098, + "step": 9965 + }, + { + "epoch": 1.1817858413376023, + "grad_norm": 1.2033934801249544, + "learning_rate": 4.131636703417977e-05, + "loss": 0.2729, + "step": 9966 + }, + { + "epoch": 1.1819044230997273, + "grad_norm": 1.0613843690185267, + "learning_rate": 4.1314548312141965e-05, + "loss": 0.2236, + "step": 9967 + }, + { + "epoch": 1.1820230048618523, + "grad_norm": 1.1566663203111867, + "learning_rate": 4.131272943970403e-05, + "loss": 0.2318, + "step": 9968 + }, + { + "epoch": 1.1821415866239773, + "grad_norm": 1.0669259034299514, + "learning_rate": 4.131091041688274e-05, + "loss": 0.2135, + "step": 9969 + }, + { + "epoch": 1.1822601683861023, + "grad_norm": 1.1012305396783593, + "learning_rate": 4.130909124369485e-05, + "loss": 0.273, + "step": 9970 + }, + { + "epoch": 1.1823787501482272, + "grad_norm": 1.4773069732204058, + "learning_rate": 4.130727192015714e-05, + "loss": 0.3403, + "step": 9971 + }, + { + "epoch": 1.1824973319103522, + "grad_norm": 0.899201688894045, + "learning_rate": 4.130545244628639e-05, + "loss": 0.2304, + "step": 9972 + }, + { + "epoch": 1.1826159136724772, + "grad_norm": 1.944712495797384, + "learning_rate": 4.130363282209935e-05, + "loss": 0.4701, + "step": 9973 + }, + { + "epoch": 1.1827344954346022, + "grad_norm": 0.8137255167219068, + "learning_rate": 4.130181304761281e-05, + "loss": 0.1747, + "step": 9974 + }, + { + "epoch": 1.1828530771967272, + "grad_norm": 1.631956465355469, + "learning_rate": 4.129999312284355e-05, + "loss": 0.3132, + "step": 9975 + }, + { + "epoch": 1.1829716589588521, + "grad_norm": 1.0177764839277437, + "learning_rate": 4.129817304780834e-05, + "loss": 0.2113, + "step": 9976 + }, + { + "epoch": 1.1830902407209771, + "grad_norm": 0.7716437464896979, + "learning_rate": 4.129635282252396e-05, + "loss": 0.1778, + "step": 9977 + }, + { + "epoch": 1.183208822483102, + "grad_norm": 1.2346177430571956, + "learning_rate": 4.1294532447007194e-05, + "loss": 0.257, + "step": 9978 + }, + { + "epoch": 1.183327404245227, + "grad_norm": 1.5415806149421716, + "learning_rate": 4.129271192127482e-05, + "loss": 0.3339, + "step": 9979 + }, + { + "epoch": 1.183445986007352, + "grad_norm": 1.020769336155493, + "learning_rate": 4.129089124534362e-05, + "loss": 0.2212, + "step": 9980 + }, + { + "epoch": 1.183564567769477, + "grad_norm": 1.0864521433861034, + "learning_rate": 4.128907041923038e-05, + "loss": 0.2162, + "step": 9981 + }, + { + "epoch": 1.183683149531602, + "grad_norm": 1.2607925093747274, + "learning_rate": 4.128724944295188e-05, + "loss": 0.3024, + "step": 9982 + }, + { + "epoch": 1.183801731293727, + "grad_norm": 0.8371038524811966, + "learning_rate": 4.128542831652492e-05, + "loss": 0.2069, + "step": 9983 + }, + { + "epoch": 1.183920313055852, + "grad_norm": 1.247981430858475, + "learning_rate": 4.128360703996628e-05, + "loss": 0.2543, + "step": 9984 + }, + { + "epoch": 1.184038894817977, + "grad_norm": 1.030058554382611, + "learning_rate": 4.128178561329275e-05, + "loss": 0.2269, + "step": 9985 + }, + { + "epoch": 1.184157476580102, + "grad_norm": 1.2731577244632462, + "learning_rate": 4.1279964036521126e-05, + "loss": 0.305, + "step": 9986 + }, + { + "epoch": 1.184276058342227, + "grad_norm": 0.9746936237369059, + "learning_rate": 4.1278142309668193e-05, + "loss": 0.1719, + "step": 9987 + }, + { + "epoch": 1.184394640104352, + "grad_norm": 1.2081597023803257, + "learning_rate": 4.127632043275075e-05, + "loss": 0.3007, + "step": 9988 + }, + { + "epoch": 1.184513221866477, + "grad_norm": 1.1196884408606516, + "learning_rate": 4.1274498405785594e-05, + "loss": 0.2249, + "step": 9989 + }, + { + "epoch": 1.1846318036286019, + "grad_norm": 1.1754652987737844, + "learning_rate": 4.127267622878952e-05, + "loss": 0.2012, + "step": 9990 + }, + { + "epoch": 1.1847503853907269, + "grad_norm": 1.1389923545073035, + "learning_rate": 4.1270853901779326e-05, + "loss": 0.2571, + "step": 9991 + }, + { + "epoch": 1.1848689671528518, + "grad_norm": 1.0346201405897446, + "learning_rate": 4.126903142477181e-05, + "loss": 0.2623, + "step": 9992 + }, + { + "epoch": 1.1849875489149768, + "grad_norm": 0.8393651470813581, + "learning_rate": 4.1267208797783774e-05, + "loss": 0.1585, + "step": 9993 + }, + { + "epoch": 1.1851061306771018, + "grad_norm": 0.9388010736867392, + "learning_rate": 4.126538602083202e-05, + "loss": 0.1895, + "step": 9994 + }, + { + "epoch": 1.1852247124392268, + "grad_norm": 0.969202782116698, + "learning_rate": 4.126356309393335e-05, + "loss": 0.2397, + "step": 9995 + }, + { + "epoch": 1.1853432942013518, + "grad_norm": 1.34194856154961, + "learning_rate": 4.1261740017104585e-05, + "loss": 0.2568, + "step": 9996 + }, + { + "epoch": 1.1854618759634767, + "grad_norm": 1.0472136527742688, + "learning_rate": 4.1259916790362504e-05, + "loss": 0.1761, + "step": 9997 + }, + { + "epoch": 1.1855804577256017, + "grad_norm": 1.0150440806496466, + "learning_rate": 4.1258093413723944e-05, + "loss": 0.1927, + "step": 9998 + }, + { + "epoch": 1.1856990394877267, + "grad_norm": 0.8894385428180444, + "learning_rate": 4.125626988720569e-05, + "loss": 0.2027, + "step": 9999 + }, + { + "epoch": 1.1858176212498517, + "grad_norm": 0.881098564401981, + "learning_rate": 4.125444621082456e-05, + "loss": 0.1994, + "step": 10000 + }, + { + "epoch": 1.1859362030119767, + "grad_norm": 0.9737219072876793, + "learning_rate": 4.1252622384597375e-05, + "loss": 0.2348, + "step": 10001 + }, + { + "epoch": 1.1860547847741016, + "grad_norm": 0.8533148673286778, + "learning_rate": 4.125079840854094e-05, + "loss": 0.2001, + "step": 10002 + }, + { + "epoch": 1.1861733665362268, + "grad_norm": 1.4647298068381414, + "learning_rate": 4.124897428267207e-05, + "loss": 0.2837, + "step": 10003 + }, + { + "epoch": 1.1862919482983516, + "grad_norm": 1.3187824055324755, + "learning_rate": 4.124715000700758e-05, + "loss": 0.2605, + "step": 10004 + }, + { + "epoch": 1.1864105300604768, + "grad_norm": 1.7101149000760858, + "learning_rate": 4.1245325581564295e-05, + "loss": 0.386, + "step": 10005 + }, + { + "epoch": 1.1865291118226016, + "grad_norm": 1.1252107640982718, + "learning_rate": 4.124350100635904e-05, + "loss": 0.2499, + "step": 10006 + }, + { + "epoch": 1.1866476935847268, + "grad_norm": 0.8412698370908729, + "learning_rate": 4.124167628140861e-05, + "loss": 0.1456, + "step": 10007 + }, + { + "epoch": 1.1867662753468518, + "grad_norm": 1.1121246103907676, + "learning_rate": 4.123985140672984e-05, + "loss": 0.2193, + "step": 10008 + }, + { + "epoch": 1.1868848571089767, + "grad_norm": 1.4925966309436212, + "learning_rate": 4.1238026382339564e-05, + "loss": 0.2634, + "step": 10009 + }, + { + "epoch": 1.1870034388711017, + "grad_norm": 1.6233904180461858, + "learning_rate": 4.123620120825459e-05, + "loss": 0.4017, + "step": 10010 + }, + { + "epoch": 1.1871220206332267, + "grad_norm": 1.872639664823403, + "learning_rate": 4.123437588449176e-05, + "loss": 0.3652, + "step": 10011 + }, + { + "epoch": 1.1872406023953517, + "grad_norm": 1.3672756235604047, + "learning_rate": 4.123255041106788e-05, + "loss": 0.2782, + "step": 10012 + }, + { + "epoch": 1.1873591841574767, + "grad_norm": 0.975838306491177, + "learning_rate": 4.123072478799981e-05, + "loss": 0.1946, + "step": 10013 + }, + { + "epoch": 1.1874777659196016, + "grad_norm": 0.9565713435940582, + "learning_rate": 4.122889901530434e-05, + "loss": 0.1999, + "step": 10014 + }, + { + "epoch": 1.1875963476817266, + "grad_norm": 0.8348810217156815, + "learning_rate": 4.122707309299834e-05, + "loss": 0.1577, + "step": 10015 + }, + { + "epoch": 1.1877149294438516, + "grad_norm": 1.2287444585258527, + "learning_rate": 4.122524702109861e-05, + "loss": 0.3203, + "step": 10016 + }, + { + "epoch": 1.1878335112059766, + "grad_norm": 1.2112152892478654, + "learning_rate": 4.1223420799622e-05, + "loss": 0.254, + "step": 10017 + }, + { + "epoch": 1.1879520929681016, + "grad_norm": 1.0246462970061452, + "learning_rate": 4.122159442858535e-05, + "loss": 0.1981, + "step": 10018 + }, + { + "epoch": 1.1880706747302265, + "grad_norm": 1.2763974390949775, + "learning_rate": 4.1219767908005494e-05, + "loss": 0.2967, + "step": 10019 + }, + { + "epoch": 1.1881892564923515, + "grad_norm": 1.0191008102146855, + "learning_rate": 4.121794123789926e-05, + "loss": 0.2076, + "step": 10020 + }, + { + "epoch": 1.1883078382544765, + "grad_norm": 1.0982502205477862, + "learning_rate": 4.12161144182835e-05, + "loss": 0.2287, + "step": 10021 + }, + { + "epoch": 1.1884264200166015, + "grad_norm": 1.2775144964224978, + "learning_rate": 4.121428744917506e-05, + "loss": 0.2909, + "step": 10022 + }, + { + "epoch": 1.1885450017787265, + "grad_norm": 1.0537366426450205, + "learning_rate": 4.1212460330590755e-05, + "loss": 0.2321, + "step": 10023 + }, + { + "epoch": 1.1886635835408514, + "grad_norm": 0.9737574393698872, + "learning_rate": 4.121063306254746e-05, + "loss": 0.2385, + "step": 10024 + }, + { + "epoch": 1.1887821653029764, + "grad_norm": 1.1708321551836631, + "learning_rate": 4.1208805645062e-05, + "loss": 0.2869, + "step": 10025 + }, + { + "epoch": 1.1889007470651014, + "grad_norm": 1.1327364750686586, + "learning_rate": 4.120697807815124e-05, + "loss": 0.2442, + "step": 10026 + }, + { + "epoch": 1.1890193288272264, + "grad_norm": 1.0383264198009878, + "learning_rate": 4.1205150361832004e-05, + "loss": 0.2008, + "step": 10027 + }, + { + "epoch": 1.1891379105893514, + "grad_norm": 1.342640713691856, + "learning_rate": 4.1203322496121156e-05, + "loss": 0.2899, + "step": 10028 + }, + { + "epoch": 1.1892564923514763, + "grad_norm": 0.9124764615881577, + "learning_rate": 4.120149448103554e-05, + "loss": 0.1979, + "step": 10029 + }, + { + "epoch": 1.1893750741136013, + "grad_norm": 1.1089271492263129, + "learning_rate": 4.119966631659202e-05, + "loss": 0.259, + "step": 10030 + }, + { + "epoch": 1.1894936558757263, + "grad_norm": 1.2162980141605995, + "learning_rate": 4.1197838002807444e-05, + "loss": 0.2527, + "step": 10031 + }, + { + "epoch": 1.1896122376378513, + "grad_norm": 1.2187457255297345, + "learning_rate": 4.1196009539698665e-05, + "loss": 0.2635, + "step": 10032 + }, + { + "epoch": 1.1897308193999763, + "grad_norm": 1.3382176576802678, + "learning_rate": 4.1194180927282534e-05, + "loss": 0.2451, + "step": 10033 + }, + { + "epoch": 1.1898494011621013, + "grad_norm": 1.3328680025956914, + "learning_rate": 4.119235216557591e-05, + "loss": 0.2787, + "step": 10034 + }, + { + "epoch": 1.1899679829242262, + "grad_norm": 1.7024757450642831, + "learning_rate": 4.119052325459566e-05, + "loss": 0.3156, + "step": 10035 + }, + { + "epoch": 1.1900865646863512, + "grad_norm": 1.305684560019124, + "learning_rate": 4.118869419435864e-05, + "loss": 0.2792, + "step": 10036 + }, + { + "epoch": 1.1902051464484762, + "grad_norm": 1.4580031137522618, + "learning_rate": 4.118686498488171e-05, + "loss": 0.3527, + "step": 10037 + }, + { + "epoch": 1.1903237282106012, + "grad_norm": 1.2809001717622404, + "learning_rate": 4.118503562618173e-05, + "loss": 0.2849, + "step": 10038 + }, + { + "epoch": 1.1904423099727262, + "grad_norm": 1.1052808408011252, + "learning_rate": 4.118320611827558e-05, + "loss": 0.2333, + "step": 10039 + }, + { + "epoch": 1.1905608917348511, + "grad_norm": 0.9615671160407735, + "learning_rate": 4.118137646118011e-05, + "loss": 0.1941, + "step": 10040 + }, + { + "epoch": 1.1906794734969761, + "grad_norm": 0.99486064818346, + "learning_rate": 4.1179546654912194e-05, + "loss": 0.2225, + "step": 10041 + }, + { + "epoch": 1.190798055259101, + "grad_norm": 1.5000986941521068, + "learning_rate": 4.1177716699488686e-05, + "loss": 0.3139, + "step": 10042 + }, + { + "epoch": 1.190916637021226, + "grad_norm": 0.9148342288496802, + "learning_rate": 4.117588659492648e-05, + "loss": 0.2072, + "step": 10043 + }, + { + "epoch": 1.191035218783351, + "grad_norm": 0.7814348508110738, + "learning_rate": 4.117405634124244e-05, + "loss": 0.1848, + "step": 10044 + }, + { + "epoch": 1.191153800545476, + "grad_norm": 0.9514781810529025, + "learning_rate": 4.117222593845343e-05, + "loss": 0.2113, + "step": 10045 + }, + { + "epoch": 1.191272382307601, + "grad_norm": 1.1813576145761178, + "learning_rate": 4.1170395386576324e-05, + "loss": 0.2665, + "step": 10046 + }, + { + "epoch": 1.191390964069726, + "grad_norm": 1.0993275348987124, + "learning_rate": 4.1168564685628006e-05, + "loss": 0.2255, + "step": 10047 + }, + { + "epoch": 1.191509545831851, + "grad_norm": 0.8468399693028651, + "learning_rate": 4.116673383562535e-05, + "loss": 0.1559, + "step": 10048 + }, + { + "epoch": 1.191628127593976, + "grad_norm": 1.2752742849027023, + "learning_rate": 4.116490283658523e-05, + "loss": 0.271, + "step": 10049 + }, + { + "epoch": 1.191746709356101, + "grad_norm": 1.01190539989614, + "learning_rate": 4.116307168852452e-05, + "loss": 0.2137, + "step": 10050 + }, + { + "epoch": 1.191865291118226, + "grad_norm": 1.10561980420319, + "learning_rate": 4.116124039146012e-05, + "loss": 0.2095, + "step": 10051 + }, + { + "epoch": 1.1919838728803511, + "grad_norm": 1.1340971778359288, + "learning_rate": 4.1159408945408905e-05, + "loss": 0.233, + "step": 10052 + }, + { + "epoch": 1.1921024546424759, + "grad_norm": 1.480943538506598, + "learning_rate": 4.1157577350387746e-05, + "loss": 0.345, + "step": 10053 + }, + { + "epoch": 1.192221036404601, + "grad_norm": 1.254460877011242, + "learning_rate": 4.1155745606413546e-05, + "loss": 0.2563, + "step": 10054 + }, + { + "epoch": 1.1923396181667258, + "grad_norm": 1.1760302150679185, + "learning_rate": 4.115391371350318e-05, + "loss": 0.2273, + "step": 10055 + }, + { + "epoch": 1.192458199928851, + "grad_norm": 0.9354294146194708, + "learning_rate": 4.1152081671673536e-05, + "loss": 0.2428, + "step": 10056 + }, + { + "epoch": 1.1925767816909758, + "grad_norm": 0.8622930063910595, + "learning_rate": 4.115024948094151e-05, + "loss": 0.1836, + "step": 10057 + }, + { + "epoch": 1.192695363453101, + "grad_norm": 1.11930570261697, + "learning_rate": 4.114841714132399e-05, + "loss": 0.2401, + "step": 10058 + }, + { + "epoch": 1.192813945215226, + "grad_norm": 1.002601771952509, + "learning_rate": 4.1146584652837864e-05, + "loss": 0.2655, + "step": 10059 + }, + { + "epoch": 1.192932526977351, + "grad_norm": 1.1950791654862472, + "learning_rate": 4.114475201550003e-05, + "loss": 0.2535, + "step": 10060 + }, + { + "epoch": 1.193051108739476, + "grad_norm": 1.2809739855902749, + "learning_rate": 4.114291922932737e-05, + "loss": 0.2397, + "step": 10061 + }, + { + "epoch": 1.193169690501601, + "grad_norm": 1.0530885695552947, + "learning_rate": 4.114108629433681e-05, + "loss": 0.2424, + "step": 10062 + }, + { + "epoch": 1.193288272263726, + "grad_norm": 1.1318306463590113, + "learning_rate": 4.11392532105452e-05, + "loss": 0.228, + "step": 10063 + }, + { + "epoch": 1.193406854025851, + "grad_norm": 1.3409888128825862, + "learning_rate": 4.113741997796949e-05, + "loss": 0.295, + "step": 10064 + }, + { + "epoch": 1.1935254357879759, + "grad_norm": 1.4078713392286897, + "learning_rate": 4.113558659662655e-05, + "loss": 0.2619, + "step": 10065 + }, + { + "epoch": 1.1936440175501009, + "grad_norm": 0.9910690185514176, + "learning_rate": 4.1133753066533287e-05, + "loss": 0.1899, + "step": 10066 + }, + { + "epoch": 1.1937625993122258, + "grad_norm": 1.8223101539495545, + "learning_rate": 4.113191938770661e-05, + "loss": 0.2983, + "step": 10067 + }, + { + "epoch": 1.1938811810743508, + "grad_norm": 0.8850762581803602, + "learning_rate": 4.1130085560163413e-05, + "loss": 0.1862, + "step": 10068 + }, + { + "epoch": 1.1939997628364758, + "grad_norm": 1.2139070059443837, + "learning_rate": 4.112825158392061e-05, + "loss": 0.3049, + "step": 10069 + }, + { + "epoch": 1.1941183445986008, + "grad_norm": 1.0318096176686138, + "learning_rate": 4.112641745899509e-05, + "loss": 0.2284, + "step": 10070 + }, + { + "epoch": 1.1942369263607258, + "grad_norm": 1.4071475200101742, + "learning_rate": 4.11245831854038e-05, + "loss": 0.3364, + "step": 10071 + }, + { + "epoch": 1.1943555081228507, + "grad_norm": 0.6896891120633695, + "learning_rate": 4.112274876316362e-05, + "loss": 0.1537, + "step": 10072 + }, + { + "epoch": 1.1944740898849757, + "grad_norm": 1.1305616906142837, + "learning_rate": 4.112091419229146e-05, + "loss": 0.1962, + "step": 10073 + }, + { + "epoch": 1.1945926716471007, + "grad_norm": 1.2711411237027763, + "learning_rate": 4.111907947280424e-05, + "loss": 0.2851, + "step": 10074 + }, + { + "epoch": 1.1947112534092257, + "grad_norm": 0.9873327457294878, + "learning_rate": 4.1117244604718885e-05, + "loss": 0.2085, + "step": 10075 + }, + { + "epoch": 1.1948298351713507, + "grad_norm": 0.9906426628166938, + "learning_rate": 4.111540958805229e-05, + "loss": 0.1932, + "step": 10076 + }, + { + "epoch": 1.1949484169334756, + "grad_norm": 1.0290947331949611, + "learning_rate": 4.1113574422821386e-05, + "loss": 0.2523, + "step": 10077 + }, + { + "epoch": 1.1950669986956006, + "grad_norm": 1.0091965916158459, + "learning_rate": 4.111173910904308e-05, + "loss": 0.2118, + "step": 10078 + }, + { + "epoch": 1.1951855804577256, + "grad_norm": 1.3905273924028243, + "learning_rate": 4.110990364673429e-05, + "loss": 0.2777, + "step": 10079 + }, + { + "epoch": 1.1953041622198506, + "grad_norm": 1.0163832196534681, + "learning_rate": 4.1108068035911953e-05, + "loss": 0.1906, + "step": 10080 + }, + { + "epoch": 1.1954227439819756, + "grad_norm": 1.2341878186217987, + "learning_rate": 4.110623227659298e-05, + "loss": 0.2672, + "step": 10081 + }, + { + "epoch": 1.1955413257441005, + "grad_norm": 1.0828517545624288, + "learning_rate": 4.1104396368794305e-05, + "loss": 0.2219, + "step": 10082 + }, + { + "epoch": 1.1956599075062255, + "grad_norm": 1.3479305823618697, + "learning_rate": 4.1102560312532836e-05, + "loss": 0.3233, + "step": 10083 + }, + { + "epoch": 1.1957784892683505, + "grad_norm": 1.425892492914055, + "learning_rate": 4.110072410782551e-05, + "loss": 0.296, + "step": 10084 + }, + { + "epoch": 1.1958970710304755, + "grad_norm": 1.0874423473863672, + "learning_rate": 4.1098887754689244e-05, + "loss": 0.2449, + "step": 10085 + }, + { + "epoch": 1.1960156527926005, + "grad_norm": 1.0316214363441358, + "learning_rate": 4.109705125314098e-05, + "loss": 0.1924, + "step": 10086 + }, + { + "epoch": 1.1961342345547255, + "grad_norm": 1.1897217944846108, + "learning_rate": 4.109521460319764e-05, + "loss": 0.224, + "step": 10087 + }, + { + "epoch": 1.1962528163168504, + "grad_norm": 1.0796877274996588, + "learning_rate": 4.1093377804876164e-05, + "loss": 0.223, + "step": 10088 + }, + { + "epoch": 1.1963713980789754, + "grad_norm": 1.4292158933650818, + "learning_rate": 4.109154085819348e-05, + "loss": 0.3201, + "step": 10089 + }, + { + "epoch": 1.1964899798411004, + "grad_norm": 1.077672814351693, + "learning_rate": 4.108970376316652e-05, + "loss": 0.2314, + "step": 10090 + }, + { + "epoch": 1.1966085616032254, + "grad_norm": 0.9703955192781616, + "learning_rate": 4.108786651981222e-05, + "loss": 0.2307, + "step": 10091 + }, + { + "epoch": 1.1967271433653504, + "grad_norm": 0.9929272234996583, + "learning_rate": 4.108602912814752e-05, + "loss": 0.2473, + "step": 10092 + }, + { + "epoch": 1.1968457251274753, + "grad_norm": 1.6204028045635575, + "learning_rate": 4.108419158818935e-05, + "loss": 0.2824, + "step": 10093 + }, + { + "epoch": 1.1969643068896003, + "grad_norm": 1.1126336367559204, + "learning_rate": 4.108235389995467e-05, + "loss": 0.2098, + "step": 10094 + }, + { + "epoch": 1.1970828886517253, + "grad_norm": 1.3047808175374893, + "learning_rate": 4.1080516063460404e-05, + "loss": 0.257, + "step": 10095 + }, + { + "epoch": 1.1972014704138503, + "grad_norm": 1.5239748642365574, + "learning_rate": 4.1078678078723496e-05, + "loss": 0.4302, + "step": 10096 + }, + { + "epoch": 1.1973200521759753, + "grad_norm": 0.9021567292839666, + "learning_rate": 4.10768399457609e-05, + "loss": 0.1945, + "step": 10097 + }, + { + "epoch": 1.1974386339381002, + "grad_norm": 1.2210753135489492, + "learning_rate": 4.107500166458954e-05, + "loss": 0.2492, + "step": 10098 + }, + { + "epoch": 1.1975572157002252, + "grad_norm": 1.7901417681180676, + "learning_rate": 4.107316323522639e-05, + "loss": 0.3029, + "step": 10099 + }, + { + "epoch": 1.1976757974623502, + "grad_norm": 1.2207776807407729, + "learning_rate": 4.107132465768838e-05, + "loss": 0.27, + "step": 10100 + }, + { + "epoch": 1.1977943792244752, + "grad_norm": 1.1523701573429772, + "learning_rate": 4.1069485931992476e-05, + "loss": 0.2149, + "step": 10101 + }, + { + "epoch": 1.1979129609866002, + "grad_norm": 1.3618277761590907, + "learning_rate": 4.10676470581556e-05, + "loss": 0.2334, + "step": 10102 + }, + { + "epoch": 1.1980315427487254, + "grad_norm": 1.220199250547045, + "learning_rate": 4.106580803619473e-05, + "loss": 0.2753, + "step": 10103 + }, + { + "epoch": 1.1981501245108501, + "grad_norm": 1.064974511226199, + "learning_rate": 4.106396886612681e-05, + "loss": 0.1995, + "step": 10104 + }, + { + "epoch": 1.1982687062729753, + "grad_norm": 1.4312149235302567, + "learning_rate": 4.10621295479688e-05, + "loss": 0.2297, + "step": 10105 + }, + { + "epoch": 1.1983872880351, + "grad_norm": 1.1388420398243768, + "learning_rate": 4.106029008173765e-05, + "loss": 0.244, + "step": 10106 + }, + { + "epoch": 1.1985058697972253, + "grad_norm": 0.9473173268032994, + "learning_rate": 4.1058450467450314e-05, + "loss": 0.1968, + "step": 10107 + }, + { + "epoch": 1.1986244515593503, + "grad_norm": 1.1399923720215428, + "learning_rate": 4.105661070512377e-05, + "loss": 0.2484, + "step": 10108 + }, + { + "epoch": 1.1987430333214752, + "grad_norm": 0.7975345000555337, + "learning_rate": 4.1054770794774955e-05, + "loss": 0.164, + "step": 10109 + }, + { + "epoch": 1.1988616150836002, + "grad_norm": 1.3836767097907945, + "learning_rate": 4.105293073642084e-05, + "loss": 0.3196, + "step": 10110 + }, + { + "epoch": 1.1989801968457252, + "grad_norm": 0.838408440100419, + "learning_rate": 4.105109053007839e-05, + "loss": 0.1594, + "step": 10111 + }, + { + "epoch": 1.1990987786078502, + "grad_norm": 1.0772857030288403, + "learning_rate": 4.104925017576458e-05, + "loss": 0.2494, + "step": 10112 + }, + { + "epoch": 1.1992173603699752, + "grad_norm": 1.4583294271764262, + "learning_rate": 4.104740967349635e-05, + "loss": 0.3401, + "step": 10113 + }, + { + "epoch": 1.1993359421321002, + "grad_norm": 1.7702943474503443, + "learning_rate": 4.104556902329069e-05, + "loss": 0.3734, + "step": 10114 + }, + { + "epoch": 1.1994545238942251, + "grad_norm": 1.1529287201234797, + "learning_rate": 4.1043728225164544e-05, + "loss": 0.2544, + "step": 10115 + }, + { + "epoch": 1.1995731056563501, + "grad_norm": 1.5990518288830944, + "learning_rate": 4.104188727913492e-05, + "loss": 0.3738, + "step": 10116 + }, + { + "epoch": 1.199691687418475, + "grad_norm": 1.0887062679382393, + "learning_rate": 4.1040046185218756e-05, + "loss": 0.2249, + "step": 10117 + }, + { + "epoch": 1.1998102691806, + "grad_norm": 1.4283878740281928, + "learning_rate": 4.1038204943433036e-05, + "loss": 0.3386, + "step": 10118 + }, + { + "epoch": 1.199928850942725, + "grad_norm": 1.2021059683705713, + "learning_rate": 4.103636355379473e-05, + "loss": 0.2849, + "step": 10119 + }, + { + "epoch": 1.20004743270485, + "grad_norm": 1.1353258113817053, + "learning_rate": 4.103452201632083e-05, + "loss": 0.2579, + "step": 10120 + }, + { + "epoch": 1.200166014466975, + "grad_norm": 1.0716221811004785, + "learning_rate": 4.1032680331028285e-05, + "loss": 0.2471, + "step": 10121 + }, + { + "epoch": 1.2002845962291, + "grad_norm": 1.064036276251583, + "learning_rate": 4.1030838497934096e-05, + "loss": 0.2319, + "step": 10122 + }, + { + "epoch": 1.200403177991225, + "grad_norm": 1.0551527579060316, + "learning_rate": 4.102899651705523e-05, + "loss": 0.1501, + "step": 10123 + }, + { + "epoch": 1.20052175975335, + "grad_norm": 1.389950639256929, + "learning_rate": 4.102715438840867e-05, + "loss": 0.3737, + "step": 10124 + }, + { + "epoch": 1.200640341515475, + "grad_norm": 1.1791191977489621, + "learning_rate": 4.1025312112011415e-05, + "loss": 0.2206, + "step": 10125 + }, + { + "epoch": 1.2007589232776, + "grad_norm": 1.1576957436745274, + "learning_rate": 4.102346968788042e-05, + "loss": 0.3197, + "step": 10126 + }, + { + "epoch": 1.200877505039725, + "grad_norm": 1.1800929722347389, + "learning_rate": 4.102162711603269e-05, + "loss": 0.2357, + "step": 10127 + }, + { + "epoch": 1.2009960868018499, + "grad_norm": 1.0819533359904676, + "learning_rate": 4.1019784396485205e-05, + "loss": 0.2657, + "step": 10128 + }, + { + "epoch": 1.2011146685639749, + "grad_norm": 1.8265215751647212, + "learning_rate": 4.1017941529254955e-05, + "loss": 0.3905, + "step": 10129 + }, + { + "epoch": 1.2012332503260998, + "grad_norm": 1.1358342153756338, + "learning_rate": 4.101609851435892e-05, + "loss": 0.2339, + "step": 10130 + }, + { + "epoch": 1.2013518320882248, + "grad_norm": 1.101327878669638, + "learning_rate": 4.10142553518141e-05, + "loss": 0.1796, + "step": 10131 + }, + { + "epoch": 1.2014704138503498, + "grad_norm": 0.936585809948274, + "learning_rate": 4.1012412041637483e-05, + "loss": 0.2048, + "step": 10132 + }, + { + "epoch": 1.2015889956124748, + "grad_norm": 1.0308399496397147, + "learning_rate": 4.101056858384606e-05, + "loss": 0.2356, + "step": 10133 + }, + { + "epoch": 1.2017075773745998, + "grad_norm": 0.9546384594511809, + "learning_rate": 4.100872497845683e-05, + "loss": 0.2272, + "step": 10134 + }, + { + "epoch": 1.2018261591367247, + "grad_norm": 1.195640423158244, + "learning_rate": 4.1006881225486794e-05, + "loss": 0.227, + "step": 10135 + }, + { + "epoch": 1.2019447408988497, + "grad_norm": 1.7188517006694826, + "learning_rate": 4.1005037324952934e-05, + "loss": 0.3612, + "step": 10136 + }, + { + "epoch": 1.2020633226609747, + "grad_norm": 1.2529696254001514, + "learning_rate": 4.1003193276872256e-05, + "loss": 0.3062, + "step": 10137 + }, + { + "epoch": 1.2021819044230997, + "grad_norm": 1.0569311222613382, + "learning_rate": 4.1001349081261756e-05, + "loss": 0.2565, + "step": 10138 + }, + { + "epoch": 1.2023004861852247, + "grad_norm": 1.0313310799546145, + "learning_rate": 4.099950473813845e-05, + "loss": 0.2163, + "step": 10139 + }, + { + "epoch": 1.2024190679473497, + "grad_norm": 1.4167794733864791, + "learning_rate": 4.0997660247519324e-05, + "loss": 0.2814, + "step": 10140 + }, + { + "epoch": 1.2025376497094746, + "grad_norm": 0.9442673676433876, + "learning_rate": 4.0995815609421394e-05, + "loss": 0.221, + "step": 10141 + }, + { + "epoch": 1.2026562314715996, + "grad_norm": 1.434688029996051, + "learning_rate": 4.099397082386166e-05, + "loss": 0.2853, + "step": 10142 + }, + { + "epoch": 1.2027748132337246, + "grad_norm": 1.1550169483429702, + "learning_rate": 4.099212589085712e-05, + "loss": 0.2873, + "step": 10143 + }, + { + "epoch": 1.2028933949958496, + "grad_norm": 1.5369203569969059, + "learning_rate": 4.09902808104248e-05, + "loss": 0.3733, + "step": 10144 + }, + { + "epoch": 1.2030119767579746, + "grad_norm": 1.1232691428485828, + "learning_rate": 4.098843558258168e-05, + "loss": 0.2502, + "step": 10145 + }, + { + "epoch": 1.2031305585200995, + "grad_norm": 1.6668873679469038, + "learning_rate": 4.098659020734481e-05, + "loss": 0.2806, + "step": 10146 + }, + { + "epoch": 1.2032491402822245, + "grad_norm": 1.134385391374349, + "learning_rate": 4.098474468473118e-05, + "loss": 0.2468, + "step": 10147 + }, + { + "epoch": 1.2033677220443495, + "grad_norm": 1.3438778018297415, + "learning_rate": 4.09828990147578e-05, + "loss": 0.2268, + "step": 10148 + }, + { + "epoch": 1.2034863038064745, + "grad_norm": 1.2946938991409886, + "learning_rate": 4.098105319744169e-05, + "loss": 0.3574, + "step": 10149 + }, + { + "epoch": 1.2036048855685995, + "grad_norm": 1.1016701317290052, + "learning_rate": 4.097920723279988e-05, + "loss": 0.2365, + "step": 10150 + }, + { + "epoch": 1.2037234673307244, + "grad_norm": 1.2243748979913949, + "learning_rate": 4.0977361120849353e-05, + "loss": 0.2229, + "step": 10151 + }, + { + "epoch": 1.2038420490928496, + "grad_norm": 1.3555098215189718, + "learning_rate": 4.0975514861607165e-05, + "loss": 0.2736, + "step": 10152 + }, + { + "epoch": 1.2039606308549744, + "grad_norm": 0.9198562896039261, + "learning_rate": 4.097366845509031e-05, + "loss": 0.1869, + "step": 10153 + }, + { + "epoch": 1.2040792126170996, + "grad_norm": 1.373201116406091, + "learning_rate": 4.097182190131583e-05, + "loss": 0.2407, + "step": 10154 + }, + { + "epoch": 1.2041977943792244, + "grad_norm": 0.9618674156468626, + "learning_rate": 4.096997520030073e-05, + "loss": 0.2317, + "step": 10155 + }, + { + "epoch": 1.2043163761413496, + "grad_norm": 0.9527515931393938, + "learning_rate": 4.096812835206204e-05, + "loss": 0.1907, + "step": 10156 + }, + { + "epoch": 1.2044349579034743, + "grad_norm": 1.5549003051983408, + "learning_rate": 4.09662813566168e-05, + "loss": 0.3926, + "step": 10157 + }, + { + "epoch": 1.2045535396655995, + "grad_norm": 1.1657579496902652, + "learning_rate": 4.096443421398202e-05, + "loss": 0.2506, + "step": 10158 + }, + { + "epoch": 1.2046721214277245, + "grad_norm": 1.0796789124999318, + "learning_rate": 4.096258692417473e-05, + "loss": 0.2003, + "step": 10159 + }, + { + "epoch": 1.2047907031898495, + "grad_norm": 1.0353586043321399, + "learning_rate": 4.0960739487211965e-05, + "loss": 0.2044, + "step": 10160 + }, + { + "epoch": 1.2049092849519745, + "grad_norm": 1.5863920207631093, + "learning_rate": 4.0958891903110764e-05, + "loss": 0.3363, + "step": 10161 + }, + { + "epoch": 1.2050278667140994, + "grad_norm": 1.190550368433452, + "learning_rate": 4.095704417188814e-05, + "loss": 0.2218, + "step": 10162 + }, + { + "epoch": 1.2051464484762244, + "grad_norm": 1.4497766015624562, + "learning_rate": 4.0955196293561134e-05, + "loss": 0.3323, + "step": 10163 + }, + { + "epoch": 1.2052650302383494, + "grad_norm": 1.3540693300328088, + "learning_rate": 4.095334826814678e-05, + "loss": 0.3355, + "step": 10164 + }, + { + "epoch": 1.2053836120004744, + "grad_norm": 0.9217079983859838, + "learning_rate": 4.095150009566213e-05, + "loss": 0.1792, + "step": 10165 + }, + { + "epoch": 1.2055021937625994, + "grad_norm": 1.465627581681539, + "learning_rate": 4.0949651776124205e-05, + "loss": 0.3237, + "step": 10166 + }, + { + "epoch": 1.2056207755247244, + "grad_norm": 1.3377624261039944, + "learning_rate": 4.094780330955005e-05, + "loss": 0.2164, + "step": 10167 + }, + { + "epoch": 1.2057393572868493, + "grad_norm": 0.9847702014764638, + "learning_rate": 4.09459546959567e-05, + "loss": 0.2422, + "step": 10168 + }, + { + "epoch": 1.2058579390489743, + "grad_norm": 1.0074892722766746, + "learning_rate": 4.094410593536121e-05, + "loss": 0.2148, + "step": 10169 + }, + { + "epoch": 1.2059765208110993, + "grad_norm": 1.0186595149053943, + "learning_rate": 4.094225702778061e-05, + "loss": 0.2394, + "step": 10170 + }, + { + "epoch": 1.2060951025732243, + "grad_norm": 1.2149873654893601, + "learning_rate": 4.094040797323195e-05, + "loss": 0.2761, + "step": 10171 + }, + { + "epoch": 1.2062136843353493, + "grad_norm": 1.0304441089442282, + "learning_rate": 4.093855877173228e-05, + "loss": 0.1904, + "step": 10172 + }, + { + "epoch": 1.2063322660974742, + "grad_norm": 0.9338174234477378, + "learning_rate": 4.093670942329864e-05, + "loss": 0.1893, + "step": 10173 + }, + { + "epoch": 1.2064508478595992, + "grad_norm": 0.7584986024159174, + "learning_rate": 4.093485992794809e-05, + "loss": 0.1817, + "step": 10174 + }, + { + "epoch": 1.2065694296217242, + "grad_norm": 1.2762350995987157, + "learning_rate": 4.093301028569766e-05, + "loss": 0.2703, + "step": 10175 + }, + { + "epoch": 1.2066880113838492, + "grad_norm": 1.0565452179733306, + "learning_rate": 4.093116049656442e-05, + "loss": 0.2249, + "step": 10176 + }, + { + "epoch": 1.2068065931459742, + "grad_norm": 1.1925868722593789, + "learning_rate": 4.092931056056542e-05, + "loss": 0.3015, + "step": 10177 + }, + { + "epoch": 1.2069251749080991, + "grad_norm": 0.9155573596006452, + "learning_rate": 4.092746047771769e-05, + "loss": 0.1967, + "step": 10178 + }, + { + "epoch": 1.2070437566702241, + "grad_norm": 0.9694333383942133, + "learning_rate": 4.0925610248038326e-05, + "loss": 0.1916, + "step": 10179 + }, + { + "epoch": 1.207162338432349, + "grad_norm": 1.205043977822242, + "learning_rate": 4.092375987154436e-05, + "loss": 0.2978, + "step": 10180 + }, + { + "epoch": 1.207280920194474, + "grad_norm": 2.0187936664709665, + "learning_rate": 4.0921909348252856e-05, + "loss": 0.4646, + "step": 10181 + }, + { + "epoch": 1.207399501956599, + "grad_norm": 1.1556435811578174, + "learning_rate": 4.092005867818087e-05, + "loss": 0.2126, + "step": 10182 + }, + { + "epoch": 1.207518083718724, + "grad_norm": 0.9428100955597953, + "learning_rate": 4.091820786134547e-05, + "loss": 0.1871, + "step": 10183 + }, + { + "epoch": 1.207636665480849, + "grad_norm": 1.1357021714374476, + "learning_rate": 4.091635689776371e-05, + "loss": 0.2679, + "step": 10184 + }, + { + "epoch": 1.207755247242974, + "grad_norm": 0.9922596132523961, + "learning_rate": 4.091450578745266e-05, + "loss": 0.2135, + "step": 10185 + }, + { + "epoch": 1.207873829005099, + "grad_norm": 1.072713743020643, + "learning_rate": 4.091265453042937e-05, + "loss": 0.185, + "step": 10186 + }, + { + "epoch": 1.207992410767224, + "grad_norm": 0.9650377249297603, + "learning_rate": 4.091080312671094e-05, + "loss": 0.2411, + "step": 10187 + }, + { + "epoch": 1.208110992529349, + "grad_norm": 1.1383697024052295, + "learning_rate": 4.0908951576314404e-05, + "loss": 0.2015, + "step": 10188 + }, + { + "epoch": 1.208229574291474, + "grad_norm": 0.9502630178513273, + "learning_rate": 4.0907099879256836e-05, + "loss": 0.2003, + "step": 10189 + }, + { + "epoch": 1.208348156053599, + "grad_norm": 0.7955936632410228, + "learning_rate": 4.090524803555533e-05, + "loss": 0.1835, + "step": 10190 + }, + { + "epoch": 1.208466737815724, + "grad_norm": 1.2611954705512252, + "learning_rate": 4.090339604522693e-05, + "loss": 0.2318, + "step": 10191 + }, + { + "epoch": 1.2085853195778489, + "grad_norm": 0.9190337939460484, + "learning_rate": 4.090154390828872e-05, + "loss": 0.1776, + "step": 10192 + }, + { + "epoch": 1.2087039013399739, + "grad_norm": 1.1839339866022651, + "learning_rate": 4.0899691624757786e-05, + "loss": 0.2182, + "step": 10193 + }, + { + "epoch": 1.2088224831020988, + "grad_norm": 1.2847279009215706, + "learning_rate": 4.089783919465118e-05, + "loss": 0.3102, + "step": 10194 + }, + { + "epoch": 1.2089410648642238, + "grad_norm": 1.1023585371658664, + "learning_rate": 4.0895986617986e-05, + "loss": 0.2306, + "step": 10195 + }, + { + "epoch": 1.2090596466263488, + "grad_norm": 1.1244134350481656, + "learning_rate": 4.0894133894779316e-05, + "loss": 0.2552, + "step": 10196 + }, + { + "epoch": 1.2091782283884738, + "grad_norm": 0.9931473144063955, + "learning_rate": 4.0892281025048204e-05, + "loss": 0.2399, + "step": 10197 + }, + { + "epoch": 1.2092968101505988, + "grad_norm": 1.1822287499739839, + "learning_rate": 4.089042800880975e-05, + "loss": 0.2841, + "step": 10198 + }, + { + "epoch": 1.2094153919127237, + "grad_norm": 0.9060762683156744, + "learning_rate": 4.088857484608104e-05, + "loss": 0.1796, + "step": 10199 + }, + { + "epoch": 1.2095339736748487, + "grad_norm": 0.872322498006015, + "learning_rate": 4.088672153687915e-05, + "loss": 0.2016, + "step": 10200 + }, + { + "epoch": 1.2096525554369737, + "grad_norm": 0.8663670952857345, + "learning_rate": 4.0884868081221174e-05, + "loss": 0.1608, + "step": 10201 + }, + { + "epoch": 1.2097711371990987, + "grad_norm": 0.8651539947637432, + "learning_rate": 4.088301447912418e-05, + "loss": 0.2232, + "step": 10202 + }, + { + "epoch": 1.2098897189612239, + "grad_norm": 1.608270089772309, + "learning_rate": 4.088116073060528e-05, + "loss": 0.3295, + "step": 10203 + }, + { + "epoch": 1.2100083007233486, + "grad_norm": 1.2352770109862852, + "learning_rate": 4.087930683568156e-05, + "loss": 0.2213, + "step": 10204 + }, + { + "epoch": 1.2101268824854738, + "grad_norm": 0.8391347101612521, + "learning_rate": 4.087745279437009e-05, + "loss": 0.1597, + "step": 10205 + }, + { + "epoch": 1.2102454642475986, + "grad_norm": 1.2410967343531722, + "learning_rate": 4.087559860668798e-05, + "loss": 0.2857, + "step": 10206 + }, + { + "epoch": 1.2103640460097238, + "grad_norm": 1.1709539095412569, + "learning_rate": 4.0873744272652325e-05, + "loss": 0.1823, + "step": 10207 + }, + { + "epoch": 1.2104826277718488, + "grad_norm": 1.3002403148307262, + "learning_rate": 4.08718897922802e-05, + "loss": 0.2529, + "step": 10208 + }, + { + "epoch": 1.2106012095339738, + "grad_norm": 1.4675355092217133, + "learning_rate": 4.087003516558873e-05, + "loss": 0.3008, + "step": 10209 + }, + { + "epoch": 1.2107197912960987, + "grad_norm": 1.28416685775307, + "learning_rate": 4.086818039259498e-05, + "loss": 0.2312, + "step": 10210 + }, + { + "epoch": 1.2108383730582237, + "grad_norm": 1.262348458846978, + "learning_rate": 4.086632547331608e-05, + "loss": 0.2084, + "step": 10211 + }, + { + "epoch": 1.2109569548203487, + "grad_norm": 1.2289776941981818, + "learning_rate": 4.0864470407769114e-05, + "loss": 0.2568, + "step": 10212 + }, + { + "epoch": 1.2110755365824737, + "grad_norm": 1.7375408420393477, + "learning_rate": 4.086261519597118e-05, + "loss": 0.3488, + "step": 10213 + }, + { + "epoch": 1.2111941183445987, + "grad_norm": 1.3243500338490757, + "learning_rate": 4.0860759837939386e-05, + "loss": 0.2741, + "step": 10214 + }, + { + "epoch": 1.2113127001067236, + "grad_norm": 0.7907342780452906, + "learning_rate": 4.085890433369084e-05, + "loss": 0.1431, + "step": 10215 + }, + { + "epoch": 1.2114312818688486, + "grad_norm": 1.1458723521952625, + "learning_rate": 4.085704868324264e-05, + "loss": 0.273, + "step": 10216 + }, + { + "epoch": 1.2115498636309736, + "grad_norm": 1.5893087495292009, + "learning_rate": 4.08551928866119e-05, + "loss": 0.253, + "step": 10217 + }, + { + "epoch": 1.2116684453930986, + "grad_norm": 0.891560372571105, + "learning_rate": 4.085333694381572e-05, + "loss": 0.1791, + "step": 10218 + }, + { + "epoch": 1.2117870271552236, + "grad_norm": 1.3952797129616346, + "learning_rate": 4.085148085487122e-05, + "loss": 0.2631, + "step": 10219 + }, + { + "epoch": 1.2119056089173486, + "grad_norm": 1.0798993190310615, + "learning_rate": 4.0849624619795504e-05, + "loss": 0.209, + "step": 10220 + }, + { + "epoch": 1.2120241906794735, + "grad_norm": 1.1906924347185477, + "learning_rate": 4.0847768238605676e-05, + "loss": 0.2098, + "step": 10221 + }, + { + "epoch": 1.2121427724415985, + "grad_norm": 1.1539969931428957, + "learning_rate": 4.084591171131886e-05, + "loss": 0.1802, + "step": 10222 + }, + { + "epoch": 1.2122613542037235, + "grad_norm": 1.0432603102133833, + "learning_rate": 4.084405503795218e-05, + "loss": 0.222, + "step": 10223 + }, + { + "epoch": 1.2123799359658485, + "grad_norm": 1.164833997568905, + "learning_rate": 4.084219821852274e-05, + "loss": 0.2373, + "step": 10224 + }, + { + "epoch": 1.2124985177279735, + "grad_norm": 0.965253810681616, + "learning_rate": 4.084034125304765e-05, + "loss": 0.253, + "step": 10225 + }, + { + "epoch": 1.2126170994900984, + "grad_norm": 0.9264097863019384, + "learning_rate": 4.083848414154405e-05, + "loss": 0.1713, + "step": 10226 + }, + { + "epoch": 1.2127356812522234, + "grad_norm": 1.0592164498487315, + "learning_rate": 4.083662688402904e-05, + "loss": 0.2185, + "step": 10227 + }, + { + "epoch": 1.2128542630143484, + "grad_norm": 1.457275860385385, + "learning_rate": 4.083476948051975e-05, + "loss": 0.2816, + "step": 10228 + }, + { + "epoch": 1.2129728447764734, + "grad_norm": 1.4903222080627936, + "learning_rate": 4.083291193103331e-05, + "loss": 0.3186, + "step": 10229 + }, + { + "epoch": 1.2130914265385984, + "grad_norm": 1.122566659933671, + "learning_rate": 4.0831054235586836e-05, + "loss": 0.221, + "step": 10230 + }, + { + "epoch": 1.2132100083007233, + "grad_norm": 1.0338368345738906, + "learning_rate": 4.082919639419745e-05, + "loss": 0.2021, + "step": 10231 + }, + { + "epoch": 1.2133285900628483, + "grad_norm": 0.9875112832883731, + "learning_rate": 4.082733840688229e-05, + "loss": 0.1724, + "step": 10232 + }, + { + "epoch": 1.2134471718249733, + "grad_norm": 1.081448727879954, + "learning_rate": 4.082548027365847e-05, + "loss": 0.2649, + "step": 10233 + }, + { + "epoch": 1.2135657535870983, + "grad_norm": 1.502245481414067, + "learning_rate": 4.0823621994543136e-05, + "loss": 0.3143, + "step": 10234 + }, + { + "epoch": 1.2136843353492233, + "grad_norm": 1.1478157591358962, + "learning_rate": 4.082176356955342e-05, + "loss": 0.2655, + "step": 10235 + }, + { + "epoch": 1.2138029171113482, + "grad_norm": 1.178054348545664, + "learning_rate": 4.0819904998706424e-05, + "loss": 0.2694, + "step": 10236 + }, + { + "epoch": 1.2139214988734732, + "grad_norm": 0.8321173507579744, + "learning_rate": 4.0818046282019315e-05, + "loss": 0.1633, + "step": 10237 + }, + { + "epoch": 1.2140400806355982, + "grad_norm": 1.454685502400587, + "learning_rate": 4.0816187419509224e-05, + "loss": 0.2763, + "step": 10238 + }, + { + "epoch": 1.2141586623977232, + "grad_norm": 0.9586447640542258, + "learning_rate": 4.081432841119327e-05, + "loss": 0.1947, + "step": 10239 + }, + { + "epoch": 1.2142772441598482, + "grad_norm": 1.2382880855381473, + "learning_rate": 4.0812469257088605e-05, + "loss": 0.2724, + "step": 10240 + }, + { + "epoch": 1.2143958259219731, + "grad_norm": 0.8753838249992335, + "learning_rate": 4.081060995721237e-05, + "loss": 0.1828, + "step": 10241 + }, + { + "epoch": 1.2145144076840981, + "grad_norm": 1.3642913008776578, + "learning_rate": 4.080875051158168e-05, + "loss": 0.2903, + "step": 10242 + }, + { + "epoch": 1.214632989446223, + "grad_norm": 0.9746579883733816, + "learning_rate": 4.080689092021372e-05, + "loss": 0.2368, + "step": 10243 + }, + { + "epoch": 1.214751571208348, + "grad_norm": 0.9208365052701576, + "learning_rate": 4.0805031183125594e-05, + "loss": 0.2216, + "step": 10244 + }, + { + "epoch": 1.214870152970473, + "grad_norm": 1.3900279026328566, + "learning_rate": 4.080317130033446e-05, + "loss": 0.2324, + "step": 10245 + }, + { + "epoch": 1.214988734732598, + "grad_norm": 1.295535716624153, + "learning_rate": 4.0801311271857476e-05, + "loss": 0.2556, + "step": 10246 + }, + { + "epoch": 1.215107316494723, + "grad_norm": 0.9883270327179872, + "learning_rate": 4.079945109771177e-05, + "loss": 0.2261, + "step": 10247 + }, + { + "epoch": 1.215225898256848, + "grad_norm": 1.3227013525179128, + "learning_rate": 4.0797590777914506e-05, + "loss": 0.3109, + "step": 10248 + }, + { + "epoch": 1.215344480018973, + "grad_norm": 1.746518958296574, + "learning_rate": 4.079573031248283e-05, + "loss": 0.3725, + "step": 10249 + }, + { + "epoch": 1.215463061781098, + "grad_norm": 1.9459292155309782, + "learning_rate": 4.0793869701433883e-05, + "loss": 0.3743, + "step": 10250 + }, + { + "epoch": 1.215581643543223, + "grad_norm": 1.6105986647054182, + "learning_rate": 4.0792008944784826e-05, + "loss": 0.4305, + "step": 10251 + }, + { + "epoch": 1.2157002253053482, + "grad_norm": 0.9910418377457845, + "learning_rate": 4.079014804255281e-05, + "loss": 0.2051, + "step": 10252 + }, + { + "epoch": 1.215818807067473, + "grad_norm": 1.2028534158447939, + "learning_rate": 4.0788286994755e-05, + "loss": 0.3145, + "step": 10253 + }, + { + "epoch": 1.2159373888295981, + "grad_norm": 0.9788081307773414, + "learning_rate": 4.078642580140854e-05, + "loss": 0.1889, + "step": 10254 + }, + { + "epoch": 1.2160559705917229, + "grad_norm": 0.9477192842644316, + "learning_rate": 4.078456446253059e-05, + "loss": 0.2279, + "step": 10255 + }, + { + "epoch": 1.216174552353848, + "grad_norm": 1.1928961694698341, + "learning_rate": 4.0782702978138323e-05, + "loss": 0.2094, + "step": 10256 + }, + { + "epoch": 1.2162931341159728, + "grad_norm": 1.1109019807824705, + "learning_rate": 4.0780841348248875e-05, + "loss": 0.2139, + "step": 10257 + }, + { + "epoch": 1.216411715878098, + "grad_norm": 1.5549994049655793, + "learning_rate": 4.077897957287943e-05, + "loss": 0.3279, + "step": 10258 + }, + { + "epoch": 1.216530297640223, + "grad_norm": 0.8197984422674387, + "learning_rate": 4.077711765204714e-05, + "loss": 0.1871, + "step": 10259 + }, + { + "epoch": 1.216648879402348, + "grad_norm": 1.4058301647667417, + "learning_rate": 4.077525558576918e-05, + "loss": 0.2794, + "step": 10260 + }, + { + "epoch": 1.216767461164473, + "grad_norm": 1.0770198909695483, + "learning_rate": 4.07733933740627e-05, + "loss": 0.237, + "step": 10261 + }, + { + "epoch": 1.216886042926598, + "grad_norm": 1.3475563051762618, + "learning_rate": 4.077153101694487e-05, + "loss": 0.2454, + "step": 10262 + }, + { + "epoch": 1.217004624688723, + "grad_norm": 1.3316602949519247, + "learning_rate": 4.076966851443287e-05, + "loss": 0.265, + "step": 10263 + }, + { + "epoch": 1.217123206450848, + "grad_norm": 1.020668856314294, + "learning_rate": 4.076780586654387e-05, + "loss": 0.2394, + "step": 10264 + }, + { + "epoch": 1.217241788212973, + "grad_norm": 1.3940977115137962, + "learning_rate": 4.0765943073295035e-05, + "loss": 0.3089, + "step": 10265 + }, + { + "epoch": 1.217360369975098, + "grad_norm": 1.4387248312315777, + "learning_rate": 4.076408013470353e-05, + "loss": 0.3201, + "step": 10266 + }, + { + "epoch": 1.2174789517372229, + "grad_norm": 1.4484811668436048, + "learning_rate": 4.0762217050786544e-05, + "loss": 0.2844, + "step": 10267 + }, + { + "epoch": 1.2175975334993479, + "grad_norm": 1.370809982485104, + "learning_rate": 4.0760353821561245e-05, + "loss": 0.3152, + "step": 10268 + }, + { + "epoch": 1.2177161152614728, + "grad_norm": 0.8523853760152775, + "learning_rate": 4.0758490447044805e-05, + "loss": 0.1921, + "step": 10269 + }, + { + "epoch": 1.2178346970235978, + "grad_norm": 1.351058469628779, + "learning_rate": 4.0756626927254414e-05, + "loss": 0.3423, + "step": 10270 + }, + { + "epoch": 1.2179532787857228, + "grad_norm": 0.8813399971637731, + "learning_rate": 4.075476326220724e-05, + "loss": 0.1994, + "step": 10271 + }, + { + "epoch": 1.2180718605478478, + "grad_norm": 1.2305426745761106, + "learning_rate": 4.0752899451920475e-05, + "loss": 0.2988, + "step": 10272 + }, + { + "epoch": 1.2181904423099728, + "grad_norm": 1.4218744293762504, + "learning_rate": 4.075103549641129e-05, + "loss": 0.3867, + "step": 10273 + }, + { + "epoch": 1.2183090240720977, + "grad_norm": 1.2401304491389225, + "learning_rate": 4.074917139569687e-05, + "loss": 0.3173, + "step": 10274 + }, + { + "epoch": 1.2184276058342227, + "grad_norm": 1.0296452519508474, + "learning_rate": 4.074730714979441e-05, + "loss": 0.2013, + "step": 10275 + }, + { + "epoch": 1.2185461875963477, + "grad_norm": 1.080363999820064, + "learning_rate": 4.074544275872109e-05, + "loss": 0.2161, + "step": 10276 + }, + { + "epoch": 1.2186647693584727, + "grad_norm": 1.074733604937567, + "learning_rate": 4.074357822249408e-05, + "loss": 0.2555, + "step": 10277 + }, + { + "epoch": 1.2187833511205977, + "grad_norm": 0.9837244023557823, + "learning_rate": 4.07417135411306e-05, + "loss": 0.2022, + "step": 10278 + }, + { + "epoch": 1.2189019328827226, + "grad_norm": 1.1730818126155909, + "learning_rate": 4.0739848714647814e-05, + "loss": 0.2586, + "step": 10279 + }, + { + "epoch": 1.2190205146448476, + "grad_norm": 0.9761842475621036, + "learning_rate": 4.0737983743062935e-05, + "loss": 0.1841, + "step": 10280 + }, + { + "epoch": 1.2191390964069726, + "grad_norm": 0.9474092624997897, + "learning_rate": 4.073611862639314e-05, + "loss": 0.2345, + "step": 10281 + }, + { + "epoch": 1.2192576781690976, + "grad_norm": 1.0317486687126054, + "learning_rate": 4.073425336465563e-05, + "loss": 0.2102, + "step": 10282 + }, + { + "epoch": 1.2193762599312226, + "grad_norm": 1.041609820733456, + "learning_rate": 4.0732387957867596e-05, + "loss": 0.2133, + "step": 10283 + }, + { + "epoch": 1.2194948416933475, + "grad_norm": 0.9366668988897691, + "learning_rate": 4.073052240604624e-05, + "loss": 0.1954, + "step": 10284 + }, + { + "epoch": 1.2196134234554725, + "grad_norm": 1.5611131767879702, + "learning_rate": 4.0728656709208755e-05, + "loss": 0.3343, + "step": 10285 + }, + { + "epoch": 1.2197320052175975, + "grad_norm": 1.1809157245042148, + "learning_rate": 4.072679086737235e-05, + "loss": 0.2618, + "step": 10286 + }, + { + "epoch": 1.2198505869797225, + "grad_norm": 1.031965069462004, + "learning_rate": 4.072492488055421e-05, + "loss": 0.1799, + "step": 10287 + }, + { + "epoch": 1.2199691687418475, + "grad_norm": 1.0258788719396572, + "learning_rate": 4.072305874877155e-05, + "loss": 0.2401, + "step": 10288 + }, + { + "epoch": 1.2200877505039724, + "grad_norm": 0.9382033281246679, + "learning_rate": 4.072119247204157e-05, + "loss": 0.1892, + "step": 10289 + }, + { + "epoch": 1.2202063322660974, + "grad_norm": 1.2299562016584125, + "learning_rate": 4.0719326050381465e-05, + "loss": 0.2897, + "step": 10290 + }, + { + "epoch": 1.2203249140282224, + "grad_norm": 1.3039396432243335, + "learning_rate": 4.071745948380846e-05, + "loss": 0.2879, + "step": 10291 + }, + { + "epoch": 1.2204434957903474, + "grad_norm": 1.3139195329004056, + "learning_rate": 4.071559277233975e-05, + "loss": 0.2549, + "step": 10292 + }, + { + "epoch": 1.2205620775524724, + "grad_norm": 0.8750268377486163, + "learning_rate": 4.071372591599255e-05, + "loss": 0.2093, + "step": 10293 + }, + { + "epoch": 1.2206806593145973, + "grad_norm": 0.8916521372738684, + "learning_rate": 4.0711858914784054e-05, + "loss": 0.164, + "step": 10294 + }, + { + "epoch": 1.2207992410767223, + "grad_norm": 1.1495165176746145, + "learning_rate": 4.0709991768731504e-05, + "loss": 0.202, + "step": 10295 + }, + { + "epoch": 1.2209178228388473, + "grad_norm": 1.2823566867758875, + "learning_rate": 4.070812447785208e-05, + "loss": 0.1934, + "step": 10296 + }, + { + "epoch": 1.2210364046009723, + "grad_norm": 0.9309214755027723, + "learning_rate": 4.070625704216302e-05, + "loss": 0.1552, + "step": 10297 + }, + { + "epoch": 1.2211549863630973, + "grad_norm": 1.0766286903510887, + "learning_rate": 4.070438946168153e-05, + "loss": 0.2329, + "step": 10298 + }, + { + "epoch": 1.2212735681252223, + "grad_norm": 1.5477588018276742, + "learning_rate": 4.070252173642481e-05, + "loss": 0.31, + "step": 10299 + }, + { + "epoch": 1.2213921498873472, + "grad_norm": 1.1052874841189015, + "learning_rate": 4.070065386641011e-05, + "loss": 0.2729, + "step": 10300 + }, + { + "epoch": 1.2215107316494722, + "grad_norm": 1.2032268934610537, + "learning_rate": 4.069878585165464e-05, + "loss": 0.2847, + "step": 10301 + }, + { + "epoch": 1.2216293134115972, + "grad_norm": 1.1666917251255546, + "learning_rate": 4.0696917692175605e-05, + "loss": 0.2604, + "step": 10302 + }, + { + "epoch": 1.2217478951737224, + "grad_norm": 1.0444779220342593, + "learning_rate": 4.069504938799025e-05, + "loss": 0.2461, + "step": 10303 + }, + { + "epoch": 1.2218664769358472, + "grad_norm": 1.2182181974880055, + "learning_rate": 4.069318093911577e-05, + "loss": 0.1934, + "step": 10304 + }, + { + "epoch": 1.2219850586979724, + "grad_norm": 1.0179658366359854, + "learning_rate": 4.069131234556942e-05, + "loss": 0.2186, + "step": 10305 + }, + { + "epoch": 1.2221036404600971, + "grad_norm": 1.2856905857748602, + "learning_rate": 4.0689443607368405e-05, + "loss": 0.2333, + "step": 10306 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 1.17300439391137, + "learning_rate": 4.068757472452996e-05, + "loss": 0.2742, + "step": 10307 + }, + { + "epoch": 1.2223408039843473, + "grad_norm": 1.338646078889366, + "learning_rate": 4.0685705697071317e-05, + "loss": 0.2201, + "step": 10308 + }, + { + "epoch": 1.2224593857464723, + "grad_norm": 0.7568836205283361, + "learning_rate": 4.06838365250097e-05, + "loss": 0.1354, + "step": 10309 + }, + { + "epoch": 1.2225779675085973, + "grad_norm": 0.9603537433216494, + "learning_rate": 4.0681967208362344e-05, + "loss": 0.1734, + "step": 10310 + }, + { + "epoch": 1.2226965492707222, + "grad_norm": 1.0761707993782752, + "learning_rate": 4.0680097747146474e-05, + "loss": 0.2543, + "step": 10311 + }, + { + "epoch": 1.2228151310328472, + "grad_norm": 1.184485265370033, + "learning_rate": 4.0678228141379336e-05, + "loss": 0.2354, + "step": 10312 + }, + { + "epoch": 1.2229337127949722, + "grad_norm": 1.1399778149041233, + "learning_rate": 4.067635839107816e-05, + "loss": 0.253, + "step": 10313 + }, + { + "epoch": 1.2230522945570972, + "grad_norm": 1.3069735074016442, + "learning_rate": 4.067448849626019e-05, + "loss": 0.2566, + "step": 10314 + }, + { + "epoch": 1.2231708763192222, + "grad_norm": 1.2106966747555683, + "learning_rate": 4.0672618456942654e-05, + "loss": 0.1975, + "step": 10315 + }, + { + "epoch": 1.2232894580813471, + "grad_norm": 1.0399887941766268, + "learning_rate": 4.067074827314279e-05, + "loss": 0.2532, + "step": 10316 + }, + { + "epoch": 1.2234080398434721, + "grad_norm": 0.8376393667687961, + "learning_rate": 4.0668877944877846e-05, + "loss": 0.1509, + "step": 10317 + }, + { + "epoch": 1.223526621605597, + "grad_norm": 1.281604557577526, + "learning_rate": 4.066700747216507e-05, + "loss": 0.335, + "step": 10318 + }, + { + "epoch": 1.223645203367722, + "grad_norm": 0.9539099633635346, + "learning_rate": 4.066513685502168e-05, + "loss": 0.2265, + "step": 10319 + }, + { + "epoch": 1.223763785129847, + "grad_norm": 1.0609306719199592, + "learning_rate": 4.066326609346495e-05, + "loss": 0.2395, + "step": 10320 + }, + { + "epoch": 1.223882366891972, + "grad_norm": 1.1311756235082806, + "learning_rate": 4.066139518751211e-05, + "loss": 0.1945, + "step": 10321 + }, + { + "epoch": 1.224000948654097, + "grad_norm": 1.1209256170493511, + "learning_rate": 4.065952413718041e-05, + "loss": 0.2149, + "step": 10322 + }, + { + "epoch": 1.224119530416222, + "grad_norm": 1.1253779253064846, + "learning_rate": 4.0657652942487114e-05, + "loss": 0.233, + "step": 10323 + }, + { + "epoch": 1.224238112178347, + "grad_norm": 1.4419135324799837, + "learning_rate": 4.065578160344944e-05, + "loss": 0.387, + "step": 10324 + }, + { + "epoch": 1.224356693940472, + "grad_norm": 1.042097524145393, + "learning_rate": 4.065391012008467e-05, + "loss": 0.179, + "step": 10325 + }, + { + "epoch": 1.224475275702597, + "grad_norm": 1.1054802779202186, + "learning_rate": 4.065203849241004e-05, + "loss": 0.2723, + "step": 10326 + }, + { + "epoch": 1.224593857464722, + "grad_norm": 0.9513154067175317, + "learning_rate": 4.065016672044281e-05, + "loss": 0.2806, + "step": 10327 + }, + { + "epoch": 1.224712439226847, + "grad_norm": 1.005001205353516, + "learning_rate": 4.064829480420023e-05, + "loss": 0.2596, + "step": 10328 + }, + { + "epoch": 1.224831020988972, + "grad_norm": 1.1072610700673473, + "learning_rate": 4.0646422743699574e-05, + "loss": 0.2189, + "step": 10329 + }, + { + "epoch": 1.2249496027510969, + "grad_norm": 1.0456194052300545, + "learning_rate": 4.064455053895807e-05, + "loss": 0.241, + "step": 10330 + }, + { + "epoch": 1.2250681845132219, + "grad_norm": 1.375734181396911, + "learning_rate": 4.064267818999301e-05, + "loss": 0.2874, + "step": 10331 + }, + { + "epoch": 1.2251867662753468, + "grad_norm": 0.954844341113241, + "learning_rate": 4.064080569682163e-05, + "loss": 0.222, + "step": 10332 + }, + { + "epoch": 1.2253053480374718, + "grad_norm": 1.3563880438139067, + "learning_rate": 4.06389330594612e-05, + "loss": 0.2648, + "step": 10333 + }, + { + "epoch": 1.2254239297995968, + "grad_norm": 1.2044479236654095, + "learning_rate": 4.0637060277928994e-05, + "loss": 0.212, + "step": 10334 + }, + { + "epoch": 1.2255425115617218, + "grad_norm": 1.0231161242173488, + "learning_rate": 4.063518735224227e-05, + "loss": 0.2666, + "step": 10335 + }, + { + "epoch": 1.2256610933238468, + "grad_norm": 1.0431945913763834, + "learning_rate": 4.0633314282418275e-05, + "loss": 0.2232, + "step": 10336 + }, + { + "epoch": 1.2257796750859717, + "grad_norm": 0.8807268737440037, + "learning_rate": 4.063144106847431e-05, + "loss": 0.2353, + "step": 10337 + }, + { + "epoch": 1.2258982568480967, + "grad_norm": 1.051394813427057, + "learning_rate": 4.062956771042762e-05, + "loss": 0.2021, + "step": 10338 + }, + { + "epoch": 1.2260168386102217, + "grad_norm": 1.2109507975941567, + "learning_rate": 4.062769420829547e-05, + "loss": 0.2134, + "step": 10339 + }, + { + "epoch": 1.2261354203723467, + "grad_norm": 1.6161296959054892, + "learning_rate": 4.0625820562095165e-05, + "loss": 0.4353, + "step": 10340 + }, + { + "epoch": 1.2262540021344717, + "grad_norm": 1.022331559835647, + "learning_rate": 4.062394677184395e-05, + "loss": 0.1981, + "step": 10341 + }, + { + "epoch": 1.2263725838965966, + "grad_norm": 1.1901834835802496, + "learning_rate": 4.0622072837559096e-05, + "loss": 0.2936, + "step": 10342 + }, + { + "epoch": 1.2264911656587216, + "grad_norm": 1.157240761062325, + "learning_rate": 4.0620198759257887e-05, + "loss": 0.2376, + "step": 10343 + }, + { + "epoch": 1.2266097474208466, + "grad_norm": 1.4580050676316623, + "learning_rate": 4.061832453695761e-05, + "loss": 0.2759, + "step": 10344 + }, + { + "epoch": 1.2267283291829716, + "grad_norm": 1.4070927018441504, + "learning_rate": 4.061645017067552e-05, + "loss": 0.3392, + "step": 10345 + }, + { + "epoch": 1.2268469109450966, + "grad_norm": 0.9715904545330502, + "learning_rate": 4.0614575660428927e-05, + "loss": 0.2124, + "step": 10346 + }, + { + "epoch": 1.2269654927072216, + "grad_norm": 1.3475876987384037, + "learning_rate": 4.0612701006235085e-05, + "loss": 0.3005, + "step": 10347 + }, + { + "epoch": 1.2270840744693465, + "grad_norm": 1.2259016992322325, + "learning_rate": 4.0610826208111284e-05, + "loss": 0.2501, + "step": 10348 + }, + { + "epoch": 1.2272026562314715, + "grad_norm": 1.0860151188507194, + "learning_rate": 4.060895126607481e-05, + "loss": 0.2425, + "step": 10349 + }, + { + "epoch": 1.2273212379935965, + "grad_norm": 1.013581272847547, + "learning_rate": 4.060707618014295e-05, + "loss": 0.177, + "step": 10350 + }, + { + "epoch": 1.2274398197557215, + "grad_norm": 1.484871612701876, + "learning_rate": 4.060520095033298e-05, + "loss": 0.27, + "step": 10351 + }, + { + "epoch": 1.2275584015178467, + "grad_norm": 1.3484467951333505, + "learning_rate": 4.06033255766622e-05, + "loss": 0.3577, + "step": 10352 + }, + { + "epoch": 1.2276769832799714, + "grad_norm": 1.35252320910601, + "learning_rate": 4.060145005914789e-05, + "loss": 0.3128, + "step": 10353 + }, + { + "epoch": 1.2277955650420966, + "grad_norm": 1.3558320560697066, + "learning_rate": 4.0599574397807335e-05, + "loss": 0.2908, + "step": 10354 + }, + { + "epoch": 1.2279141468042214, + "grad_norm": 1.2253928242668557, + "learning_rate": 4.059769859265785e-05, + "loss": 0.207, + "step": 10355 + }, + { + "epoch": 1.2280327285663466, + "grad_norm": 0.9305918148828173, + "learning_rate": 4.05958226437167e-05, + "loss": 0.176, + "step": 10356 + }, + { + "epoch": 1.2281513103284714, + "grad_norm": 1.0311692335832507, + "learning_rate": 4.059394655100119e-05, + "loss": 0.2427, + "step": 10357 + }, + { + "epoch": 1.2282698920905966, + "grad_norm": 0.9009773624812605, + "learning_rate": 4.0592070314528614e-05, + "loss": 0.2192, + "step": 10358 + }, + { + "epoch": 1.2283884738527215, + "grad_norm": 1.2035279914437245, + "learning_rate": 4.0590193934316274e-05, + "loss": 0.2474, + "step": 10359 + }, + { + "epoch": 1.2285070556148465, + "grad_norm": 1.1113872087730987, + "learning_rate": 4.058831741038146e-05, + "loss": 0.2708, + "step": 10360 + }, + { + "epoch": 1.2286256373769715, + "grad_norm": 1.0945385584320637, + "learning_rate": 4.058644074274148e-05, + "loss": 0.2138, + "step": 10361 + }, + { + "epoch": 1.2287442191390965, + "grad_norm": 1.5598622141249898, + "learning_rate": 4.058456393141362e-05, + "loss": 0.3975, + "step": 10362 + }, + { + "epoch": 1.2288628009012215, + "grad_norm": 0.8715948448383642, + "learning_rate": 4.0582686976415206e-05, + "loss": 0.1905, + "step": 10363 + }, + { + "epoch": 1.2289813826633464, + "grad_norm": 1.2610287703431173, + "learning_rate": 4.0580809877763513e-05, + "loss": 0.2849, + "step": 10364 + }, + { + "epoch": 1.2290999644254714, + "grad_norm": 1.1218073106246012, + "learning_rate": 4.057893263547587e-05, + "loss": 0.1861, + "step": 10365 + }, + { + "epoch": 1.2292185461875964, + "grad_norm": 1.0025476230465227, + "learning_rate": 4.057705524956957e-05, + "loss": 0.2125, + "step": 10366 + }, + { + "epoch": 1.2293371279497214, + "grad_norm": 0.9176731337235872, + "learning_rate": 4.057517772006192e-05, + "loss": 0.217, + "step": 10367 + }, + { + "epoch": 1.2294557097118464, + "grad_norm": 0.9488008286584205, + "learning_rate": 4.0573300046970233e-05, + "loss": 0.2148, + "step": 10368 + }, + { + "epoch": 1.2295742914739713, + "grad_norm": 1.0656062754853088, + "learning_rate": 4.057142223031181e-05, + "loss": 0.2003, + "step": 10369 + }, + { + "epoch": 1.2296928732360963, + "grad_norm": 1.1532068067971386, + "learning_rate": 4.056954427010398e-05, + "loss": 0.2795, + "step": 10370 + }, + { + "epoch": 1.2298114549982213, + "grad_norm": 1.1386834669612202, + "learning_rate": 4.0567666166364035e-05, + "loss": 0.2567, + "step": 10371 + }, + { + "epoch": 1.2299300367603463, + "grad_norm": 1.5347243857253139, + "learning_rate": 4.0565787919109307e-05, + "loss": 0.3564, + "step": 10372 + }, + { + "epoch": 1.2300486185224713, + "grad_norm": 1.4663995886488497, + "learning_rate": 4.056390952835709e-05, + "loss": 0.338, + "step": 10373 + }, + { + "epoch": 1.2301672002845963, + "grad_norm": 1.1885281558809169, + "learning_rate": 4.056203099412472e-05, + "loss": 0.322, + "step": 10374 + }, + { + "epoch": 1.2302857820467212, + "grad_norm": 1.2903020845770692, + "learning_rate": 4.0560152316429514e-05, + "loss": 0.3541, + "step": 10375 + }, + { + "epoch": 1.2304043638088462, + "grad_norm": 1.0443563001458418, + "learning_rate": 4.055827349528878e-05, + "loss": 0.2192, + "step": 10376 + }, + { + "epoch": 1.2305229455709712, + "grad_norm": 1.2919293087861499, + "learning_rate": 4.055639453071984e-05, + "loss": 0.317, + "step": 10377 + }, + { + "epoch": 1.2306415273330962, + "grad_norm": 1.2225131419413489, + "learning_rate": 4.055451542274002e-05, + "loss": 0.2503, + "step": 10378 + }, + { + "epoch": 1.2307601090952212, + "grad_norm": 1.1360546293642564, + "learning_rate": 4.0552636171366644e-05, + "loss": 0.2569, + "step": 10379 + }, + { + "epoch": 1.2308786908573461, + "grad_norm": 1.469185870123054, + "learning_rate": 4.055075677661703e-05, + "loss": 0.3153, + "step": 10380 + }, + { + "epoch": 1.2309972726194711, + "grad_norm": 1.316407755210929, + "learning_rate": 4.054887723850852e-05, + "loss": 0.266, + "step": 10381 + }, + { + "epoch": 1.231115854381596, + "grad_norm": 1.0166231201054174, + "learning_rate": 4.0546997557058416e-05, + "loss": 0.1845, + "step": 10382 + }, + { + "epoch": 1.231234436143721, + "grad_norm": 1.0772726607225094, + "learning_rate": 4.0545117732284065e-05, + "loss": 0.195, + "step": 10383 + }, + { + "epoch": 1.231353017905846, + "grad_norm": 1.436140172439385, + "learning_rate": 4.054323776420279e-05, + "loss": 0.2549, + "step": 10384 + }, + { + "epoch": 1.231471599667971, + "grad_norm": 0.8609131422718318, + "learning_rate": 4.054135765283192e-05, + "loss": 0.1775, + "step": 10385 + }, + { + "epoch": 1.231590181430096, + "grad_norm": 1.062905195915382, + "learning_rate": 4.05394773981888e-05, + "loss": 0.206, + "step": 10386 + }, + { + "epoch": 1.231708763192221, + "grad_norm": 1.134812544122325, + "learning_rate": 4.0537597000290744e-05, + "loss": 0.2421, + "step": 10387 + }, + { + "epoch": 1.231827344954346, + "grad_norm": 1.2308492256809125, + "learning_rate": 4.05357164591551e-05, + "loss": 0.2588, + "step": 10388 + }, + { + "epoch": 1.231945926716471, + "grad_norm": 0.9739838362179883, + "learning_rate": 4.053383577479921e-05, + "loss": 0.1823, + "step": 10389 + }, + { + "epoch": 1.232064508478596, + "grad_norm": 1.0102670363665212, + "learning_rate": 4.053195494724039e-05, + "loss": 0.2026, + "step": 10390 + }, + { + "epoch": 1.232183090240721, + "grad_norm": 1.202890138384261, + "learning_rate": 4.0530073976496e-05, + "loss": 0.3331, + "step": 10391 + }, + { + "epoch": 1.232301672002846, + "grad_norm": 1.4521372366814496, + "learning_rate": 4.052819286258337e-05, + "loss": 0.308, + "step": 10392 + }, + { + "epoch": 1.2324202537649709, + "grad_norm": 1.2687170600942417, + "learning_rate": 4.052631160551984e-05, + "loss": 0.2613, + "step": 10393 + }, + { + "epoch": 1.2325388355270959, + "grad_norm": 1.0096210892206983, + "learning_rate": 4.052443020532276e-05, + "loss": 0.2179, + "step": 10394 + }, + { + "epoch": 1.2326574172892208, + "grad_norm": 1.162285964577226, + "learning_rate": 4.052254866200947e-05, + "loss": 0.2242, + "step": 10395 + }, + { + "epoch": 1.2327759990513458, + "grad_norm": 1.2616258094738422, + "learning_rate": 4.052066697559732e-05, + "loss": 0.2734, + "step": 10396 + }, + { + "epoch": 1.2328945808134708, + "grad_norm": 1.3323279307450362, + "learning_rate": 4.051878514610365e-05, + "loss": 0.2801, + "step": 10397 + }, + { + "epoch": 1.2330131625755958, + "grad_norm": 1.5041861726248658, + "learning_rate": 4.0516903173545816e-05, + "loss": 0.3196, + "step": 10398 + }, + { + "epoch": 1.2331317443377208, + "grad_norm": 0.9259855329577563, + "learning_rate": 4.051502105794115e-05, + "loss": 0.185, + "step": 10399 + }, + { + "epoch": 1.2332503260998458, + "grad_norm": 1.253751688405963, + "learning_rate": 4.051313879930703e-05, + "loss": 0.2858, + "step": 10400 + }, + { + "epoch": 1.233368907861971, + "grad_norm": 1.5697294893196387, + "learning_rate": 4.051125639766079e-05, + "loss": 0.4019, + "step": 10401 + }, + { + "epoch": 1.2334874896240957, + "grad_norm": 1.1682495191139026, + "learning_rate": 4.0509373853019785e-05, + "loss": 0.2028, + "step": 10402 + }, + { + "epoch": 1.233606071386221, + "grad_norm": 1.1083219549893202, + "learning_rate": 4.050749116540137e-05, + "loss": 0.2328, + "step": 10403 + }, + { + "epoch": 1.2337246531483457, + "grad_norm": 0.9695602431625172, + "learning_rate": 4.050560833482291e-05, + "loss": 0.2011, + "step": 10404 + }, + { + "epoch": 1.2338432349104709, + "grad_norm": 1.4674056766231716, + "learning_rate": 4.0503725361301746e-05, + "loss": 0.3292, + "step": 10405 + }, + { + "epoch": 1.2339618166725956, + "grad_norm": 1.243027512160164, + "learning_rate": 4.050184224485525e-05, + "loss": 0.2848, + "step": 10406 + }, + { + "epoch": 1.2340803984347208, + "grad_norm": 1.3414468410444953, + "learning_rate": 4.049995898550078e-05, + "loss": 0.2861, + "step": 10407 + }, + { + "epoch": 1.2341989801968458, + "grad_norm": 1.0913395307614915, + "learning_rate": 4.049807558325569e-05, + "loss": 0.2028, + "step": 10408 + }, + { + "epoch": 1.2343175619589708, + "grad_norm": 1.017994955442891, + "learning_rate": 4.049619203813736e-05, + "loss": 0.2191, + "step": 10409 + }, + { + "epoch": 1.2344361437210958, + "grad_norm": 1.0807346083553915, + "learning_rate": 4.049430835016313e-05, + "loss": 0.2222, + "step": 10410 + }, + { + "epoch": 1.2345547254832208, + "grad_norm": 1.4643312074749304, + "learning_rate": 4.049242451935038e-05, + "loss": 0.289, + "step": 10411 + }, + { + "epoch": 1.2346733072453457, + "grad_norm": 1.337008294321385, + "learning_rate": 4.049054054571648e-05, + "loss": 0.3339, + "step": 10412 + }, + { + "epoch": 1.2347918890074707, + "grad_norm": 1.2171753287405522, + "learning_rate": 4.048865642927879e-05, + "loss": 0.2292, + "step": 10413 + }, + { + "epoch": 1.2349104707695957, + "grad_norm": 1.2654810421773326, + "learning_rate": 4.0486772170054675e-05, + "loss": 0.2801, + "step": 10414 + }, + { + "epoch": 1.2350290525317207, + "grad_norm": 1.1019240865802675, + "learning_rate": 4.048488776806152e-05, + "loss": 0.221, + "step": 10415 + }, + { + "epoch": 1.2351476342938457, + "grad_norm": 1.1903615042512776, + "learning_rate": 4.048300322331668e-05, + "loss": 0.2567, + "step": 10416 + }, + { + "epoch": 1.2352662160559706, + "grad_norm": 1.0366270642293351, + "learning_rate": 4.0481118535837546e-05, + "loss": 0.2571, + "step": 10417 + }, + { + "epoch": 1.2353847978180956, + "grad_norm": 1.323492449155374, + "learning_rate": 4.047923370564147e-05, + "loss": 0.2305, + "step": 10418 + }, + { + "epoch": 1.2355033795802206, + "grad_norm": 1.4476333954939236, + "learning_rate": 4.047734873274586e-05, + "loss": 0.246, + "step": 10419 + }, + { + "epoch": 1.2356219613423456, + "grad_norm": 1.1994137485677294, + "learning_rate": 4.047546361716806e-05, + "loss": 0.2672, + "step": 10420 + }, + { + "epoch": 1.2357405431044706, + "grad_norm": 1.0268641673309007, + "learning_rate": 4.047357835892546e-05, + "loss": 0.2415, + "step": 10421 + }, + { + "epoch": 1.2358591248665955, + "grad_norm": 1.6020146113053215, + "learning_rate": 4.047169295803545e-05, + "loss": 0.3262, + "step": 10422 + }, + { + "epoch": 1.2359777066287205, + "grad_norm": 1.1474300405533635, + "learning_rate": 4.04698074145154e-05, + "loss": 0.191, + "step": 10423 + }, + { + "epoch": 1.2360962883908455, + "grad_norm": 1.21633269032001, + "learning_rate": 4.046792172838271e-05, + "loss": 0.2752, + "step": 10424 + }, + { + "epoch": 1.2362148701529705, + "grad_norm": 1.1147321407881818, + "learning_rate": 4.046603589965473e-05, + "loss": 0.2546, + "step": 10425 + }, + { + "epoch": 1.2363334519150955, + "grad_norm": 1.0387741437188411, + "learning_rate": 4.0464149928348874e-05, + "loss": 0.2171, + "step": 10426 + }, + { + "epoch": 1.2364520336772205, + "grad_norm": 1.689542317782831, + "learning_rate": 4.0462263814482514e-05, + "loss": 0.4004, + "step": 10427 + }, + { + "epoch": 1.2365706154393454, + "grad_norm": 1.2335922217392163, + "learning_rate": 4.046037755807305e-05, + "loss": 0.2815, + "step": 10428 + }, + { + "epoch": 1.2366891972014704, + "grad_norm": 1.4114706557345076, + "learning_rate": 4.0458491159137854e-05, + "loss": 0.3576, + "step": 10429 + }, + { + "epoch": 1.2368077789635954, + "grad_norm": 1.2118961293792692, + "learning_rate": 4.045660461769434e-05, + "loss": 0.225, + "step": 10430 + }, + { + "epoch": 1.2369263607257204, + "grad_norm": 1.0786544296009202, + "learning_rate": 4.0454717933759875e-05, + "loss": 0.2503, + "step": 10431 + }, + { + "epoch": 1.2370449424878454, + "grad_norm": 1.4845167559253982, + "learning_rate": 4.045283110735187e-05, + "loss": 0.2648, + "step": 10432 + }, + { + "epoch": 1.2371635242499703, + "grad_norm": 0.9447311265303799, + "learning_rate": 4.04509441384877e-05, + "loss": 0.1867, + "step": 10433 + }, + { + "epoch": 1.2372821060120953, + "grad_norm": 1.023828830443806, + "learning_rate": 4.044905702718479e-05, + "loss": 0.2398, + "step": 10434 + }, + { + "epoch": 1.2374006877742203, + "grad_norm": 0.8192392423249338, + "learning_rate": 4.04471697734605e-05, + "loss": 0.1692, + "step": 10435 + }, + { + "epoch": 1.2375192695363453, + "grad_norm": 1.4831804850193884, + "learning_rate": 4.044528237733226e-05, + "loss": 0.2746, + "step": 10436 + }, + { + "epoch": 1.2376378512984703, + "grad_norm": 1.1623504757281935, + "learning_rate": 4.044339483881745e-05, + "loss": 0.2242, + "step": 10437 + }, + { + "epoch": 1.2377564330605952, + "grad_norm": 1.2146398150404634, + "learning_rate": 4.0441507157933475e-05, + "loss": 0.2517, + "step": 10438 + }, + { + "epoch": 1.2378750148227202, + "grad_norm": 1.141496008480598, + "learning_rate": 4.0439619334697754e-05, + "loss": 0.2611, + "step": 10439 + }, + { + "epoch": 1.2379935965848452, + "grad_norm": 0.9239401835496538, + "learning_rate": 4.043773136912766e-05, + "loss": 0.2127, + "step": 10440 + }, + { + "epoch": 1.2381121783469702, + "grad_norm": 1.1043275231075644, + "learning_rate": 4.043584326124062e-05, + "loss": 0.2257, + "step": 10441 + }, + { + "epoch": 1.2382307601090952, + "grad_norm": 1.1207237987783494, + "learning_rate": 4.0433955011054034e-05, + "loss": 0.2553, + "step": 10442 + }, + { + "epoch": 1.2383493418712201, + "grad_norm": 0.9587476298544969, + "learning_rate": 4.043206661858531e-05, + "loss": 0.1517, + "step": 10443 + }, + { + "epoch": 1.2384679236333451, + "grad_norm": 1.3635760389166562, + "learning_rate": 4.0430178083851855e-05, + "loss": 0.2611, + "step": 10444 + }, + { + "epoch": 1.23858650539547, + "grad_norm": 1.3829757900433353, + "learning_rate": 4.042828940687108e-05, + "loss": 0.2461, + "step": 10445 + }, + { + "epoch": 1.238705087157595, + "grad_norm": 1.1239778764736725, + "learning_rate": 4.0426400587660396e-05, + "loss": 0.2488, + "step": 10446 + }, + { + "epoch": 1.23882366891972, + "grad_norm": 1.2029635580188662, + "learning_rate": 4.042451162623722e-05, + "loss": 0.2676, + "step": 10447 + }, + { + "epoch": 1.238942250681845, + "grad_norm": 1.1371281154508992, + "learning_rate": 4.042262252261896e-05, + "loss": 0.3013, + "step": 10448 + }, + { + "epoch": 1.23906083244397, + "grad_norm": 1.138781374418562, + "learning_rate": 4.042073327682303e-05, + "loss": 0.2858, + "step": 10449 + }, + { + "epoch": 1.239179414206095, + "grad_norm": 1.1682930838709649, + "learning_rate": 4.0418843888866845e-05, + "loss": 0.2044, + "step": 10450 + }, + { + "epoch": 1.23929799596822, + "grad_norm": 1.228310868759271, + "learning_rate": 4.0416954358767835e-05, + "loss": 0.2554, + "step": 10451 + }, + { + "epoch": 1.2394165777303452, + "grad_norm": 1.0876521876803484, + "learning_rate": 4.04150646865434e-05, + "loss": 0.2446, + "step": 10452 + }, + { + "epoch": 1.23953515949247, + "grad_norm": 0.9352704547807149, + "learning_rate": 4.041317487221098e-05, + "loss": 0.1803, + "step": 10453 + }, + { + "epoch": 1.2396537412545952, + "grad_norm": 1.1896564090733655, + "learning_rate": 4.0411284915787994e-05, + "loss": 0.2628, + "step": 10454 + }, + { + "epoch": 1.23977232301672, + "grad_norm": 1.0410338235440035, + "learning_rate": 4.040939481729185e-05, + "loss": 0.1841, + "step": 10455 + }, + { + "epoch": 1.2398909047788451, + "grad_norm": 1.487967823511107, + "learning_rate": 4.0407504576739984e-05, + "loss": 0.3427, + "step": 10456 + }, + { + "epoch": 1.2400094865409699, + "grad_norm": 0.7966016917705205, + "learning_rate": 4.040561419414982e-05, + "loss": 0.1743, + "step": 10457 + }, + { + "epoch": 1.240128068303095, + "grad_norm": 1.4797816232336247, + "learning_rate": 4.0403723669538796e-05, + "loss": 0.2855, + "step": 10458 + }, + { + "epoch": 1.24024665006522, + "grad_norm": 1.1285288014030752, + "learning_rate": 4.040183300292432e-05, + "loss": 0.2423, + "step": 10459 + }, + { + "epoch": 1.240365231827345, + "grad_norm": 1.320081852621455, + "learning_rate": 4.0399942194323826e-05, + "loss": 0.2785, + "step": 10460 + }, + { + "epoch": 1.24048381358947, + "grad_norm": 1.201354055542455, + "learning_rate": 4.039805124375475e-05, + "loss": 0.2592, + "step": 10461 + }, + { + "epoch": 1.240602395351595, + "grad_norm": 2.105710918664603, + "learning_rate": 4.039616015123453e-05, + "loss": 0.195, + "step": 10462 + }, + { + "epoch": 1.24072097711372, + "grad_norm": 1.0975713323866878, + "learning_rate": 4.0394268916780594e-05, + "loss": 0.2166, + "step": 10463 + }, + { + "epoch": 1.240839558875845, + "grad_norm": 1.0910338190612414, + "learning_rate": 4.0392377540410375e-05, + "loss": 0.2122, + "step": 10464 + }, + { + "epoch": 1.24095814063797, + "grad_norm": 1.0113785130900554, + "learning_rate": 4.0390486022141306e-05, + "loss": 0.1932, + "step": 10465 + }, + { + "epoch": 1.241076722400095, + "grad_norm": 1.294315778340242, + "learning_rate": 4.038859436199083e-05, + "loss": 0.2959, + "step": 10466 + }, + { + "epoch": 1.24119530416222, + "grad_norm": 1.1686581154915705, + "learning_rate": 4.0386702559976385e-05, + "loss": 0.2836, + "step": 10467 + }, + { + "epoch": 1.2413138859243449, + "grad_norm": 1.2776349342091515, + "learning_rate": 4.0384810616115415e-05, + "loss": 0.2468, + "step": 10468 + }, + { + "epoch": 1.2414324676864699, + "grad_norm": 1.2149778562527616, + "learning_rate": 4.038291853042536e-05, + "loss": 0.2385, + "step": 10469 + }, + { + "epoch": 1.2415510494485948, + "grad_norm": 1.0338435217152584, + "learning_rate": 4.038102630292365e-05, + "loss": 0.2494, + "step": 10470 + }, + { + "epoch": 1.2416696312107198, + "grad_norm": 1.097141839281573, + "learning_rate": 4.0379133933627746e-05, + "loss": 0.2385, + "step": 10471 + }, + { + "epoch": 1.2417882129728448, + "grad_norm": 1.2308875405541801, + "learning_rate": 4.037724142255508e-05, + "loss": 0.2567, + "step": 10472 + }, + { + "epoch": 1.2419067947349698, + "grad_norm": 1.126828822169031, + "learning_rate": 4.037534876972311e-05, + "loss": 0.1779, + "step": 10473 + }, + { + "epoch": 1.2420253764970948, + "grad_norm": 0.9739911621170491, + "learning_rate": 4.0373455975149275e-05, + "loss": 0.2246, + "step": 10474 + }, + { + "epoch": 1.2421439582592197, + "grad_norm": 2.040190204424857, + "learning_rate": 4.037156303885103e-05, + "loss": 0.4745, + "step": 10475 + }, + { + "epoch": 1.2422625400213447, + "grad_norm": 1.165764032297576, + "learning_rate": 4.0369669960845826e-05, + "loss": 0.2342, + "step": 10476 + }, + { + "epoch": 1.2423811217834697, + "grad_norm": 1.2054712782500459, + "learning_rate": 4.036777674115111e-05, + "loss": 0.2642, + "step": 10477 + }, + { + "epoch": 1.2424997035455947, + "grad_norm": 0.9936247732604012, + "learning_rate": 4.0365883379784345e-05, + "loss": 0.2328, + "step": 10478 + }, + { + "epoch": 1.2426182853077197, + "grad_norm": 0.9038161000004851, + "learning_rate": 4.036398987676296e-05, + "loss": 0.2207, + "step": 10479 + }, + { + "epoch": 1.2427368670698447, + "grad_norm": 0.9434547096866889, + "learning_rate": 4.036209623210444e-05, + "loss": 0.2178, + "step": 10480 + }, + { + "epoch": 1.2428554488319696, + "grad_norm": 1.0554341977106048, + "learning_rate": 4.0360202445826236e-05, + "loss": 0.2744, + "step": 10481 + }, + { + "epoch": 1.2429740305940946, + "grad_norm": 0.9447917242351296, + "learning_rate": 4.03583085179458e-05, + "loss": 0.195, + "step": 10482 + }, + { + "epoch": 1.2430926123562196, + "grad_norm": 1.0645737766898753, + "learning_rate": 4.0356414448480585e-05, + "loss": 0.2231, + "step": 10483 + }, + { + "epoch": 1.2432111941183446, + "grad_norm": 2.1094616417604195, + "learning_rate": 4.035452023744807e-05, + "loss": 0.5596, + "step": 10484 + }, + { + "epoch": 1.2433297758804696, + "grad_norm": 0.7706261524419524, + "learning_rate": 4.03526258848657e-05, + "loss": 0.1499, + "step": 10485 + }, + { + "epoch": 1.2434483576425945, + "grad_norm": 0.9375155371413499, + "learning_rate": 4.0350731390750953e-05, + "loss": 0.2152, + "step": 10486 + }, + { + "epoch": 1.2435669394047195, + "grad_norm": 0.9043959443944448, + "learning_rate": 4.0348836755121274e-05, + "loss": 0.2472, + "step": 10487 + }, + { + "epoch": 1.2436855211668445, + "grad_norm": 1.217952308141904, + "learning_rate": 4.034694197799416e-05, + "loss": 0.2351, + "step": 10488 + }, + { + "epoch": 1.2438041029289695, + "grad_norm": 1.2363116913687606, + "learning_rate": 4.034504705938704e-05, + "loss": 0.2598, + "step": 10489 + }, + { + "epoch": 1.2439226846910945, + "grad_norm": 1.2799765754349888, + "learning_rate": 4.034315199931742e-05, + "loss": 0.2584, + "step": 10490 + }, + { + "epoch": 1.2440412664532194, + "grad_norm": 1.0190593038893978, + "learning_rate": 4.034125679780275e-05, + "loss": 0.2682, + "step": 10491 + }, + { + "epoch": 1.2441598482153444, + "grad_norm": 0.9391231118246326, + "learning_rate": 4.03393614548605e-05, + "loss": 0.1844, + "step": 10492 + }, + { + "epoch": 1.2442784299774694, + "grad_norm": 1.3192451946167671, + "learning_rate": 4.033746597050815e-05, + "loss": 0.2613, + "step": 10493 + }, + { + "epoch": 1.2443970117395944, + "grad_norm": 1.3958438620056595, + "learning_rate": 4.033557034476318e-05, + "loss": 0.259, + "step": 10494 + }, + { + "epoch": 1.2445155935017194, + "grad_norm": 1.0093463452116567, + "learning_rate": 4.033367457764304e-05, + "loss": 0.2251, + "step": 10495 + }, + { + "epoch": 1.2446341752638443, + "grad_norm": 0.9499408745125743, + "learning_rate": 4.033177866916523e-05, + "loss": 0.1551, + "step": 10496 + }, + { + "epoch": 1.2447527570259693, + "grad_norm": 1.0931319390045557, + "learning_rate": 4.032988261934723e-05, + "loss": 0.242, + "step": 10497 + }, + { + "epoch": 1.2448713387880943, + "grad_norm": 1.094501074541544, + "learning_rate": 4.032798642820651e-05, + "loss": 0.2536, + "step": 10498 + }, + { + "epoch": 1.2449899205502193, + "grad_norm": 1.2738152160055225, + "learning_rate": 4.032609009576055e-05, + "loss": 0.2876, + "step": 10499 + }, + { + "epoch": 1.2451085023123443, + "grad_norm": 1.0256005041217717, + "learning_rate": 4.032419362202683e-05, + "loss": 0.2323, + "step": 10500 + }, + { + "epoch": 1.2452270840744695, + "grad_norm": 0.9663864134321976, + "learning_rate": 4.0322297007022835e-05, + "loss": 0.2164, + "step": 10501 + }, + { + "epoch": 1.2453456658365942, + "grad_norm": 1.2343799216721234, + "learning_rate": 4.032040025076606e-05, + "loss": 0.2781, + "step": 10502 + }, + { + "epoch": 1.2454642475987194, + "grad_norm": 0.9479766832494176, + "learning_rate": 4.0318503353273975e-05, + "loss": 0.2279, + "step": 10503 + }, + { + "epoch": 1.2455828293608442, + "grad_norm": 0.9689145773770386, + "learning_rate": 4.0316606314564065e-05, + "loss": 0.2116, + "step": 10504 + }, + { + "epoch": 1.2457014111229694, + "grad_norm": 1.2615590637776244, + "learning_rate": 4.031470913465384e-05, + "loss": 0.2794, + "step": 10505 + }, + { + "epoch": 1.2458199928850942, + "grad_norm": 1.210611703776856, + "learning_rate": 4.031281181356077e-05, + "loss": 0.2498, + "step": 10506 + }, + { + "epoch": 1.2459385746472194, + "grad_norm": 1.0439387585696862, + "learning_rate": 4.031091435130237e-05, + "loss": 0.1851, + "step": 10507 + }, + { + "epoch": 1.2460571564093443, + "grad_norm": 1.0718587396981918, + "learning_rate": 4.030901674789609e-05, + "loss": 0.2336, + "step": 10508 + }, + { + "epoch": 1.2461757381714693, + "grad_norm": 1.2807458861142087, + "learning_rate": 4.030711900335946e-05, + "loss": 0.2318, + "step": 10509 + }, + { + "epoch": 1.2462943199335943, + "grad_norm": 1.5963507083245414, + "learning_rate": 4.0305221117709965e-05, + "loss": 0.3859, + "step": 10510 + }, + { + "epoch": 1.2464129016957193, + "grad_norm": 1.1154691834953003, + "learning_rate": 4.03033230909651e-05, + "loss": 0.2371, + "step": 10511 + }, + { + "epoch": 1.2465314834578443, + "grad_norm": 1.4643078988972753, + "learning_rate": 4.030142492314236e-05, + "loss": 0.4185, + "step": 10512 + }, + { + "epoch": 1.2466500652199692, + "grad_norm": 1.1373260571900035, + "learning_rate": 4.029952661425924e-05, + "loss": 0.228, + "step": 10513 + }, + { + "epoch": 1.2467686469820942, + "grad_norm": 1.189883913259819, + "learning_rate": 4.0297628164333264e-05, + "loss": 0.3118, + "step": 10514 + }, + { + "epoch": 1.2468872287442192, + "grad_norm": 1.4707296608570126, + "learning_rate": 4.0295729573381905e-05, + "loss": 0.2544, + "step": 10515 + }, + { + "epoch": 1.2470058105063442, + "grad_norm": 1.694070977915552, + "learning_rate": 4.0293830841422674e-05, + "loss": 0.3553, + "step": 10516 + }, + { + "epoch": 1.2471243922684692, + "grad_norm": 1.442309445209932, + "learning_rate": 4.0291931968473075e-05, + "loss": 0.255, + "step": 10517 + }, + { + "epoch": 1.2472429740305941, + "grad_norm": 1.2375233932447167, + "learning_rate": 4.0290032954550623e-05, + "loss": 0.2511, + "step": 10518 + }, + { + "epoch": 1.2473615557927191, + "grad_norm": 1.0892822408191938, + "learning_rate": 4.028813379967281e-05, + "loss": 0.218, + "step": 10519 + }, + { + "epoch": 1.247480137554844, + "grad_norm": 1.1583590969337567, + "learning_rate": 4.0286234503857156e-05, + "loss": 0.2445, + "step": 10520 + }, + { + "epoch": 1.247598719316969, + "grad_norm": 0.8883643924792928, + "learning_rate": 4.028433506712116e-05, + "loss": 0.2233, + "step": 10521 + }, + { + "epoch": 1.247717301079094, + "grad_norm": 0.8079169845569238, + "learning_rate": 4.028243548948234e-05, + "loss": 0.1984, + "step": 10522 + }, + { + "epoch": 1.247835882841219, + "grad_norm": 1.0283133469508123, + "learning_rate": 4.028053577095821e-05, + "loss": 0.2129, + "step": 10523 + }, + { + "epoch": 1.247954464603344, + "grad_norm": 1.1660471976518556, + "learning_rate": 4.027863591156627e-05, + "loss": 0.2954, + "step": 10524 + }, + { + "epoch": 1.248073046365469, + "grad_norm": 1.1271958540673999, + "learning_rate": 4.027673591132405e-05, + "loss": 0.2472, + "step": 10525 + }, + { + "epoch": 1.248191628127594, + "grad_norm": 1.1330934887037956, + "learning_rate": 4.027483577024906e-05, + "loss": 0.2825, + "step": 10526 + }, + { + "epoch": 1.248310209889719, + "grad_norm": 1.408379322310984, + "learning_rate": 4.027293548835881e-05, + "loss": 0.2967, + "step": 10527 + }, + { + "epoch": 1.248428791651844, + "grad_norm": 0.8739466059611773, + "learning_rate": 4.027103506567083e-05, + "loss": 0.186, + "step": 10528 + }, + { + "epoch": 1.248547373413969, + "grad_norm": 1.0366278478477022, + "learning_rate": 4.0269134502202626e-05, + "loss": 0.2945, + "step": 10529 + }, + { + "epoch": 1.248665955176094, + "grad_norm": 0.7808945938209709, + "learning_rate": 4.0267233797971725e-05, + "loss": 0.174, + "step": 10530 + }, + { + "epoch": 1.248784536938219, + "grad_norm": 1.3738126035192106, + "learning_rate": 4.026533295299566e-05, + "loss": 0.3106, + "step": 10531 + }, + { + "epoch": 1.2489031187003439, + "grad_norm": 1.0693772289922485, + "learning_rate": 4.026343196729194e-05, + "loss": 0.2113, + "step": 10532 + }, + { + "epoch": 1.2490217004624689, + "grad_norm": 0.7010631579168635, + "learning_rate": 4.026153084087809e-05, + "loss": 0.1841, + "step": 10533 + }, + { + "epoch": 1.2491402822245938, + "grad_norm": 1.1328815676799033, + "learning_rate": 4.025962957377164e-05, + "loss": 0.2591, + "step": 10534 + }, + { + "epoch": 1.2492588639867188, + "grad_norm": 1.4828476463484515, + "learning_rate": 4.025772816599013e-05, + "loss": 0.2572, + "step": 10535 + }, + { + "epoch": 1.2493774457488438, + "grad_norm": 0.9294358605405638, + "learning_rate": 4.025582661755106e-05, + "loss": 0.206, + "step": 10536 + }, + { + "epoch": 1.2494960275109688, + "grad_norm": 1.7796900458900629, + "learning_rate": 4.0253924928471984e-05, + "loss": 0.4766, + "step": 10537 + }, + { + "epoch": 1.2496146092730938, + "grad_norm": 1.054048524475053, + "learning_rate": 4.025202309877043e-05, + "loss": 0.2147, + "step": 10538 + }, + { + "epoch": 1.2497331910352187, + "grad_norm": 1.4888406814829203, + "learning_rate": 4.0250121128463924e-05, + "loss": 0.3532, + "step": 10539 + }, + { + "epoch": 1.2498517727973437, + "grad_norm": 1.2863881454807318, + "learning_rate": 4.024821901757e-05, + "loss": 0.2555, + "step": 10540 + }, + { + "epoch": 1.2499703545594687, + "grad_norm": 1.608068289493836, + "learning_rate": 4.02463167661062e-05, + "loss": 0.409, + "step": 10541 + }, + { + "epoch": 1.2500889363215937, + "grad_norm": 1.098914217182304, + "learning_rate": 4.024441437409005e-05, + "loss": 0.2133, + "step": 10542 + }, + { + "epoch": 1.2502075180837187, + "grad_norm": 0.904090266091376, + "learning_rate": 4.0242511841539095e-05, + "loss": 0.197, + "step": 10543 + }, + { + "epoch": 1.2503260998458436, + "grad_norm": 0.8821204150052284, + "learning_rate": 4.0240609168470876e-05, + "loss": 0.1482, + "step": 10544 + }, + { + "epoch": 1.2504446816079686, + "grad_norm": 1.421411973876386, + "learning_rate": 4.023870635490292e-05, + "loss": 0.3042, + "step": 10545 + }, + { + "epoch": 1.2505632633700936, + "grad_norm": 0.8906788752085187, + "learning_rate": 4.023680340085279e-05, + "loss": 0.2034, + "step": 10546 + }, + { + "epoch": 1.2506818451322186, + "grad_norm": 1.1714934346290409, + "learning_rate": 4.023490030633801e-05, + "loss": 0.2372, + "step": 10547 + }, + { + "epoch": 1.2508004268943438, + "grad_norm": 1.2621119886885481, + "learning_rate": 4.023299707137613e-05, + "loss": 0.2667, + "step": 10548 + }, + { + "epoch": 1.2509190086564685, + "grad_norm": 1.026897548318463, + "learning_rate": 4.023109369598471e-05, + "loss": 0.2092, + "step": 10549 + }, + { + "epoch": 1.2510375904185937, + "grad_norm": 1.1165845469337354, + "learning_rate": 4.022919018018127e-05, + "loss": 0.2402, + "step": 10550 + }, + { + "epoch": 1.2511561721807185, + "grad_norm": 0.9019856573736542, + "learning_rate": 4.022728652398338e-05, + "loss": 0.1358, + "step": 10551 + }, + { + "epoch": 1.2512747539428437, + "grad_norm": 1.699215762337985, + "learning_rate": 4.0225382727408565e-05, + "loss": 0.3368, + "step": 10552 + }, + { + "epoch": 1.2513933357049685, + "grad_norm": 1.400880118837076, + "learning_rate": 4.0223478790474415e-05, + "loss": 0.264, + "step": 10553 + }, + { + "epoch": 1.2515119174670937, + "grad_norm": 1.1040214887182063, + "learning_rate": 4.0221574713198445e-05, + "loss": 0.2091, + "step": 10554 + }, + { + "epoch": 1.2516304992292184, + "grad_norm": 1.0639382091714573, + "learning_rate": 4.021967049559823e-05, + "loss": 0.277, + "step": 10555 + }, + { + "epoch": 1.2517490809913436, + "grad_norm": 1.2770344151229518, + "learning_rate": 4.0217766137691304e-05, + "loss": 0.2566, + "step": 10556 + }, + { + "epoch": 1.2518676627534684, + "grad_norm": 1.269552253933047, + "learning_rate": 4.021586163949525e-05, + "loss": 0.2185, + "step": 10557 + }, + { + "epoch": 1.2519862445155936, + "grad_norm": 1.1906228199535545, + "learning_rate": 4.02139570010276e-05, + "loss": 0.2745, + "step": 10558 + }, + { + "epoch": 1.2521048262777184, + "grad_norm": 1.0876754735520582, + "learning_rate": 4.021205222230593e-05, + "loss": 0.2211, + "step": 10559 + }, + { + "epoch": 1.2522234080398436, + "grad_norm": 0.9004020313894792, + "learning_rate": 4.0210147303347786e-05, + "loss": 0.1939, + "step": 10560 + }, + { + "epoch": 1.2523419898019685, + "grad_norm": 1.3485927057076532, + "learning_rate": 4.020824224417074e-05, + "loss": 0.374, + "step": 10561 + }, + { + "epoch": 1.2524605715640935, + "grad_norm": 1.2431035257963305, + "learning_rate": 4.0206337044792345e-05, + "loss": 0.2769, + "step": 10562 + }, + { + "epoch": 1.2525791533262185, + "grad_norm": 1.0736428194254974, + "learning_rate": 4.020443170523018e-05, + "loss": 0.1874, + "step": 10563 + }, + { + "epoch": 1.2526977350883435, + "grad_norm": 1.386910940161215, + "learning_rate": 4.020252622550179e-05, + "loss": 0.333, + "step": 10564 + }, + { + "epoch": 1.2528163168504685, + "grad_norm": 1.2457872332610398, + "learning_rate": 4.020062060562475e-05, + "loss": 0.2795, + "step": 10565 + }, + { + "epoch": 1.2529348986125934, + "grad_norm": 1.22455121212678, + "learning_rate": 4.019871484561664e-05, + "loss": 0.283, + "step": 10566 + }, + { + "epoch": 1.2530534803747184, + "grad_norm": 0.6701795808994715, + "learning_rate": 4.019680894549501e-05, + "loss": 0.1446, + "step": 10567 + }, + { + "epoch": 1.2531720621368434, + "grad_norm": 1.106329211406439, + "learning_rate": 4.0194902905277436e-05, + "loss": 0.214, + "step": 10568 + }, + { + "epoch": 1.2532906438989684, + "grad_norm": 0.8740744638317279, + "learning_rate": 4.019299672498149e-05, + "loss": 0.1585, + "step": 10569 + }, + { + "epoch": 1.2534092256610934, + "grad_norm": 0.8324948620813536, + "learning_rate": 4.019109040462474e-05, + "loss": 0.2267, + "step": 10570 + }, + { + "epoch": 1.2535278074232183, + "grad_norm": 0.9907605752278357, + "learning_rate": 4.018918394422477e-05, + "loss": 0.2655, + "step": 10571 + }, + { + "epoch": 1.2536463891853433, + "grad_norm": 1.4124107508401187, + "learning_rate": 4.018727734379916e-05, + "loss": 0.2846, + "step": 10572 + }, + { + "epoch": 1.2537649709474683, + "grad_norm": 1.6581570178786065, + "learning_rate": 4.018537060336547e-05, + "loss": 0.2759, + "step": 10573 + }, + { + "epoch": 1.2538835527095933, + "grad_norm": 1.0927094739030272, + "learning_rate": 4.018346372294128e-05, + "loss": 0.2855, + "step": 10574 + }, + { + "epoch": 1.2540021344717183, + "grad_norm": 0.9463406600167925, + "learning_rate": 4.018155670254418e-05, + "loss": 0.215, + "step": 10575 + }, + { + "epoch": 1.2541207162338432, + "grad_norm": 1.7170418486546057, + "learning_rate": 4.017964954219174e-05, + "loss": 0.3879, + "step": 10576 + }, + { + "epoch": 1.2542392979959682, + "grad_norm": 1.1740391120139588, + "learning_rate": 4.0177742241901546e-05, + "loss": 0.2926, + "step": 10577 + }, + { + "epoch": 1.2543578797580932, + "grad_norm": 0.9455168386012042, + "learning_rate": 4.017583480169118e-05, + "loss": 0.2242, + "step": 10578 + }, + { + "epoch": 1.2544764615202182, + "grad_norm": 0.9289529212328501, + "learning_rate": 4.017392722157823e-05, + "loss": 0.2322, + "step": 10579 + }, + { + "epoch": 1.2545950432823432, + "grad_norm": 0.9035402776839989, + "learning_rate": 4.017201950158027e-05, + "loss": 0.2264, + "step": 10580 + }, + { + "epoch": 1.2547136250444682, + "grad_norm": 0.7481993170686446, + "learning_rate": 4.0170111641714905e-05, + "loss": 0.15, + "step": 10581 + }, + { + "epoch": 1.2548322068065931, + "grad_norm": 1.380032675601541, + "learning_rate": 4.016820364199971e-05, + "loss": 0.2922, + "step": 10582 + }, + { + "epoch": 1.2549507885687181, + "grad_norm": 1.0341225557928495, + "learning_rate": 4.0166295502452275e-05, + "loss": 0.2689, + "step": 10583 + }, + { + "epoch": 1.255069370330843, + "grad_norm": 1.3940712219323408, + "learning_rate": 4.01643872230902e-05, + "loss": 0.3234, + "step": 10584 + }, + { + "epoch": 1.255187952092968, + "grad_norm": 1.0438485305803942, + "learning_rate": 4.016247880393107e-05, + "loss": 0.1867, + "step": 10585 + }, + { + "epoch": 1.255306533855093, + "grad_norm": 1.0408866102631629, + "learning_rate": 4.016057024499248e-05, + "loss": 0.2099, + "step": 10586 + }, + { + "epoch": 1.255425115617218, + "grad_norm": 1.3765915397470567, + "learning_rate": 4.015866154629202e-05, + "loss": 0.2929, + "step": 10587 + }, + { + "epoch": 1.255543697379343, + "grad_norm": 1.3526164220262473, + "learning_rate": 4.0156752707847284e-05, + "loss": 0.3264, + "step": 10588 + }, + { + "epoch": 1.255662279141468, + "grad_norm": 1.4115313455036778, + "learning_rate": 4.015484372967588e-05, + "loss": 0.3204, + "step": 10589 + }, + { + "epoch": 1.255780860903593, + "grad_norm": 1.2925640279323831, + "learning_rate": 4.0152934611795404e-05, + "loss": 0.2742, + "step": 10590 + }, + { + "epoch": 1.255899442665718, + "grad_norm": 1.7454706134272517, + "learning_rate": 4.015102535422345e-05, + "loss": 0.3437, + "step": 10591 + }, + { + "epoch": 1.256018024427843, + "grad_norm": 1.0844293065490622, + "learning_rate": 4.014911595697762e-05, + "loss": 0.2418, + "step": 10592 + }, + { + "epoch": 1.256136606189968, + "grad_norm": 1.0145025765498206, + "learning_rate": 4.014720642007552e-05, + "loss": 0.2199, + "step": 10593 + }, + { + "epoch": 1.256255187952093, + "grad_norm": 1.0365113537959116, + "learning_rate": 4.014529674353474e-05, + "loss": 0.1514, + "step": 10594 + }, + { + "epoch": 1.2563737697142179, + "grad_norm": 0.7556012572626248, + "learning_rate": 4.014338692737291e-05, + "loss": 0.1682, + "step": 10595 + }, + { + "epoch": 1.2564923514763429, + "grad_norm": 0.8389109172342724, + "learning_rate": 4.0141476971607624e-05, + "loss": 0.1901, + "step": 10596 + }, + { + "epoch": 1.2566109332384678, + "grad_norm": 0.7607359171287091, + "learning_rate": 4.0139566876256476e-05, + "loss": 0.1607, + "step": 10597 + }, + { + "epoch": 1.2567295150005928, + "grad_norm": 1.3394907749120355, + "learning_rate": 4.01376566413371e-05, + "loss": 0.3087, + "step": 10598 + }, + { + "epoch": 1.256848096762718, + "grad_norm": 1.267931054374564, + "learning_rate": 4.013574626686708e-05, + "loss": 0.2562, + "step": 10599 + }, + { + "epoch": 1.2569666785248428, + "grad_norm": 0.7678971535225139, + "learning_rate": 4.0133835752864054e-05, + "loss": 0.1463, + "step": 10600 + }, + { + "epoch": 1.257085260286968, + "grad_norm": 0.8204870371285615, + "learning_rate": 4.013192509934561e-05, + "loss": 0.1616, + "step": 10601 + }, + { + "epoch": 1.2572038420490927, + "grad_norm": 0.7218933771661717, + "learning_rate": 4.013001430632938e-05, + "loss": 0.1621, + "step": 10602 + }, + { + "epoch": 1.257322423811218, + "grad_norm": 1.2838740272270943, + "learning_rate": 4.012810337383297e-05, + "loss": 0.2476, + "step": 10603 + }, + { + "epoch": 1.2574410055733427, + "grad_norm": 1.0388327256912735, + "learning_rate": 4.012619230187399e-05, + "loss": 0.2334, + "step": 10604 + }, + { + "epoch": 1.257559587335468, + "grad_norm": 0.9688822144774484, + "learning_rate": 4.012428109047007e-05, + "loss": 0.2317, + "step": 10605 + }, + { + "epoch": 1.2576781690975927, + "grad_norm": 1.520105372878284, + "learning_rate": 4.0122369739638835e-05, + "loss": 0.2545, + "step": 10606 + }, + { + "epoch": 1.2577967508597179, + "grad_norm": 1.2582582498061827, + "learning_rate": 4.012045824939788e-05, + "loss": 0.2729, + "step": 10607 + }, + { + "epoch": 1.2579153326218426, + "grad_norm": 1.0146580804416334, + "learning_rate": 4.011854661976485e-05, + "loss": 0.2163, + "step": 10608 + }, + { + "epoch": 1.2580339143839678, + "grad_norm": 1.1060118683948428, + "learning_rate": 4.011663485075736e-05, + "loss": 0.2125, + "step": 10609 + }, + { + "epoch": 1.2581524961460928, + "grad_norm": 1.0140132599832952, + "learning_rate": 4.011472294239303e-05, + "loss": 0.1977, + "step": 10610 + }, + { + "epoch": 1.2582710779082178, + "grad_norm": 0.8981796227561046, + "learning_rate": 4.01128108946895e-05, + "loss": 0.181, + "step": 10611 + }, + { + "epoch": 1.2583896596703428, + "grad_norm": 1.364893122035823, + "learning_rate": 4.011089870766437e-05, + "loss": 0.297, + "step": 10612 + }, + { + "epoch": 1.2585082414324678, + "grad_norm": 1.156791081761517, + "learning_rate": 4.0108986381335304e-05, + "loss": 0.2761, + "step": 10613 + }, + { + "epoch": 1.2586268231945927, + "grad_norm": 1.2617731370628869, + "learning_rate": 4.010707391571989e-05, + "loss": 0.2399, + "step": 10614 + }, + { + "epoch": 1.2587454049567177, + "grad_norm": 1.133165690336073, + "learning_rate": 4.010516131083579e-05, + "loss": 0.2842, + "step": 10615 + }, + { + "epoch": 1.2588639867188427, + "grad_norm": 1.1016158317968447, + "learning_rate": 4.010324856670063e-05, + "loss": 0.2692, + "step": 10616 + }, + { + "epoch": 1.2589825684809677, + "grad_norm": 1.1502618974028418, + "learning_rate": 4.010133568333204e-05, + "loss": 0.2598, + "step": 10617 + }, + { + "epoch": 1.2591011502430927, + "grad_norm": 1.074467199959485, + "learning_rate": 4.0099422660747645e-05, + "loss": 0.1515, + "step": 10618 + }, + { + "epoch": 1.2592197320052176, + "grad_norm": 1.2558737245545464, + "learning_rate": 4.009750949896509e-05, + "loss": 0.279, + "step": 10619 + }, + { + "epoch": 1.2593383137673426, + "grad_norm": 0.9119357655452501, + "learning_rate": 4.0095596198002014e-05, + "loss": 0.1888, + "step": 10620 + }, + { + "epoch": 1.2594568955294676, + "grad_norm": 1.1871888170577531, + "learning_rate": 4.009368275787606e-05, + "loss": 0.2591, + "step": 10621 + }, + { + "epoch": 1.2595754772915926, + "grad_norm": 0.8084549804476707, + "learning_rate": 4.009176917860484e-05, + "loss": 0.1749, + "step": 10622 + }, + { + "epoch": 1.2596940590537176, + "grad_norm": 1.080155702942602, + "learning_rate": 4.008985546020603e-05, + "loss": 0.2267, + "step": 10623 + }, + { + "epoch": 1.2598126408158425, + "grad_norm": 0.924844015233843, + "learning_rate": 4.008794160269725e-05, + "loss": 0.2337, + "step": 10624 + }, + { + "epoch": 1.2599312225779675, + "grad_norm": 0.9787574871091232, + "learning_rate": 4.008602760609616e-05, + "loss": 0.2354, + "step": 10625 + }, + { + "epoch": 1.2600498043400925, + "grad_norm": 1.0769525463952392, + "learning_rate": 4.0084113470420395e-05, + "loss": 0.1958, + "step": 10626 + }, + { + "epoch": 1.2601683861022175, + "grad_norm": 1.3767634672578035, + "learning_rate": 4.0082199195687594e-05, + "loss": 0.273, + "step": 10627 + }, + { + "epoch": 1.2602869678643425, + "grad_norm": 1.4158339523503718, + "learning_rate": 4.008028478191541e-05, + "loss": 0.3039, + "step": 10628 + }, + { + "epoch": 1.2604055496264674, + "grad_norm": 1.0004243844852918, + "learning_rate": 4.00783702291215e-05, + "loss": 0.2462, + "step": 10629 + }, + { + "epoch": 1.2605241313885924, + "grad_norm": 1.483396105839095, + "learning_rate": 4.007645553732351e-05, + "loss": 0.3742, + "step": 10630 + }, + { + "epoch": 1.2606427131507174, + "grad_norm": 1.1498480101278847, + "learning_rate": 4.0074540706539075e-05, + "loss": 0.2041, + "step": 10631 + }, + { + "epoch": 1.2607612949128424, + "grad_norm": 1.1919065555603667, + "learning_rate": 4.007262573678587e-05, + "loss": 0.2624, + "step": 10632 + }, + { + "epoch": 1.2608798766749674, + "grad_norm": 1.0832644062741912, + "learning_rate": 4.0070710628081534e-05, + "loss": 0.2236, + "step": 10633 + }, + { + "epoch": 1.2609984584370924, + "grad_norm": 0.9841329454685708, + "learning_rate": 4.006879538044372e-05, + "loss": 0.2265, + "step": 10634 + }, + { + "epoch": 1.2611170401992173, + "grad_norm": 1.2642497825940258, + "learning_rate": 4.006687999389011e-05, + "loss": 0.2764, + "step": 10635 + }, + { + "epoch": 1.2612356219613423, + "grad_norm": 1.32225812823312, + "learning_rate": 4.006496446843833e-05, + "loss": 0.296, + "step": 10636 + }, + { + "epoch": 1.2613542037234673, + "grad_norm": 1.6928657714658077, + "learning_rate": 4.006304880410605e-05, + "loss": 0.3346, + "step": 10637 + }, + { + "epoch": 1.2614727854855923, + "grad_norm": 1.0778045676224994, + "learning_rate": 4.006113300091093e-05, + "loss": 0.2829, + "step": 10638 + }, + { + "epoch": 1.2615913672477173, + "grad_norm": 1.1336464785442355, + "learning_rate": 4.0059217058870644e-05, + "loss": 0.2867, + "step": 10639 + }, + { + "epoch": 1.2617099490098422, + "grad_norm": 1.094497285344225, + "learning_rate": 4.0057300978002835e-05, + "loss": 0.2105, + "step": 10640 + }, + { + "epoch": 1.2618285307719672, + "grad_norm": 0.9013090009877783, + "learning_rate": 4.0055384758325185e-05, + "loss": 0.2085, + "step": 10641 + }, + { + "epoch": 1.2619471125340922, + "grad_norm": 1.241823199372829, + "learning_rate": 4.0053468399855345e-05, + "loss": 0.2617, + "step": 10642 + }, + { + "epoch": 1.2620656942962172, + "grad_norm": 1.384808779288354, + "learning_rate": 4.005155190261099e-05, + "loss": 0.2278, + "step": 10643 + }, + { + "epoch": 1.2621842760583422, + "grad_norm": 1.4409858363564618, + "learning_rate": 4.004963526660977e-05, + "loss": 0.2997, + "step": 10644 + }, + { + "epoch": 1.2623028578204671, + "grad_norm": 1.0048667941659195, + "learning_rate": 4.004771849186937e-05, + "loss": 0.2385, + "step": 10645 + }, + { + "epoch": 1.2624214395825921, + "grad_norm": 0.9905365513971506, + "learning_rate": 4.004580157840747e-05, + "loss": 0.2325, + "step": 10646 + }, + { + "epoch": 1.262540021344717, + "grad_norm": 1.0684044545888134, + "learning_rate": 4.004388452624172e-05, + "loss": 0.2372, + "step": 10647 + }, + { + "epoch": 1.2626586031068423, + "grad_norm": 1.0016323278891466, + "learning_rate": 4.004196733538981e-05, + "loss": 0.2203, + "step": 10648 + }, + { + "epoch": 1.262777184868967, + "grad_norm": 1.1041340292123465, + "learning_rate": 4.00400500058694e-05, + "loss": 0.2474, + "step": 10649 + }, + { + "epoch": 1.2628957666310923, + "grad_norm": 1.1591464328132572, + "learning_rate": 4.003813253769818e-05, + "loss": 0.2468, + "step": 10650 + }, + { + "epoch": 1.263014348393217, + "grad_norm": 1.126682481589318, + "learning_rate": 4.003621493089381e-05, + "loss": 0.276, + "step": 10651 + }, + { + "epoch": 1.2631329301553422, + "grad_norm": 1.5903999591298772, + "learning_rate": 4.003429718547399e-05, + "loss": 0.283, + "step": 10652 + }, + { + "epoch": 1.263251511917467, + "grad_norm": 1.1106224161184617, + "learning_rate": 4.0032379301456365e-05, + "loss": 0.2229, + "step": 10653 + }, + { + "epoch": 1.2633700936795922, + "grad_norm": 1.2156439127984733, + "learning_rate": 4.003046127885866e-05, + "loss": 0.2629, + "step": 10654 + }, + { + "epoch": 1.263488675441717, + "grad_norm": 0.6691641439512508, + "learning_rate": 4.002854311769852e-05, + "loss": 0.1881, + "step": 10655 + }, + { + "epoch": 1.2636072572038421, + "grad_norm": 0.8796583983295096, + "learning_rate": 4.002662481799365e-05, + "loss": 0.1548, + "step": 10656 + }, + { + "epoch": 1.263725838965967, + "grad_norm": 1.6707682254743603, + "learning_rate": 4.002470637976172e-05, + "loss": 0.2782, + "step": 10657 + }, + { + "epoch": 1.263844420728092, + "grad_norm": 1.3260456038689135, + "learning_rate": 4.002278780302042e-05, + "loss": 0.254, + "step": 10658 + }, + { + "epoch": 1.2639630024902169, + "grad_norm": 1.560101862213038, + "learning_rate": 4.002086908778744e-05, + "loss": 0.407, + "step": 10659 + }, + { + "epoch": 1.264081584252342, + "grad_norm": 0.9617866406249037, + "learning_rate": 4.001895023408047e-05, + "loss": 0.1848, + "step": 10660 + }, + { + "epoch": 1.264200166014467, + "grad_norm": 0.8342875921736955, + "learning_rate": 4.001703124191719e-05, + "loss": 0.1887, + "step": 10661 + }, + { + "epoch": 1.264318747776592, + "grad_norm": 1.1156495441699488, + "learning_rate": 4.00151121113153e-05, + "loss": 0.2343, + "step": 10662 + }, + { + "epoch": 1.264437329538717, + "grad_norm": 1.1603740761500894, + "learning_rate": 4.001319284229249e-05, + "loss": 0.1994, + "step": 10663 + }, + { + "epoch": 1.264555911300842, + "grad_norm": 1.4474455836035063, + "learning_rate": 4.001127343486646e-05, + "loss": 0.3058, + "step": 10664 + }, + { + "epoch": 1.264674493062967, + "grad_norm": 1.267754510075988, + "learning_rate": 4.000935388905489e-05, + "loss": 0.2659, + "step": 10665 + }, + { + "epoch": 1.264793074825092, + "grad_norm": 1.2412552669574035, + "learning_rate": 4.000743420487548e-05, + "loss": 0.2466, + "step": 10666 + }, + { + "epoch": 1.264911656587217, + "grad_norm": 1.0244323095552779, + "learning_rate": 4.000551438234593e-05, + "loss": 0.2061, + "step": 10667 + }, + { + "epoch": 1.265030238349342, + "grad_norm": 1.1490675767566865, + "learning_rate": 4.000359442148395e-05, + "loss": 0.2131, + "step": 10668 + }, + { + "epoch": 1.265148820111467, + "grad_norm": 1.0305381817892802, + "learning_rate": 4.000167432230721e-05, + "loss": 0.2136, + "step": 10669 + }, + { + "epoch": 1.2652674018735919, + "grad_norm": 1.1307076152535613, + "learning_rate": 3.999975408483344e-05, + "loss": 0.2314, + "step": 10670 + }, + { + "epoch": 1.2653859836357169, + "grad_norm": 1.2307108637618005, + "learning_rate": 3.999783370908033e-05, + "loss": 0.2778, + "step": 10671 + }, + { + "epoch": 1.2655045653978418, + "grad_norm": 0.8770689618885671, + "learning_rate": 3.999591319506558e-05, + "loss": 0.2063, + "step": 10672 + }, + { + "epoch": 1.2656231471599668, + "grad_norm": 1.2937093111149467, + "learning_rate": 3.999399254280691e-05, + "loss": 0.2523, + "step": 10673 + }, + { + "epoch": 1.2657417289220918, + "grad_norm": 1.7955767299821062, + "learning_rate": 3.999207175232201e-05, + "loss": 0.3439, + "step": 10674 + }, + { + "epoch": 1.2658603106842168, + "grad_norm": 1.5653976613227858, + "learning_rate": 3.999015082362859e-05, + "loss": 0.3406, + "step": 10675 + }, + { + "epoch": 1.2659788924463418, + "grad_norm": 0.9386469808658074, + "learning_rate": 3.998822975674437e-05, + "loss": 0.1864, + "step": 10676 + }, + { + "epoch": 1.2660974742084667, + "grad_norm": 1.1223111904740737, + "learning_rate": 3.998630855168705e-05, + "loss": 0.2015, + "step": 10677 + }, + { + "epoch": 1.2662160559705917, + "grad_norm": 1.745198469381485, + "learning_rate": 3.9984387208474325e-05, + "loss": 0.3706, + "step": 10678 + }, + { + "epoch": 1.2663346377327167, + "grad_norm": 1.192219137685321, + "learning_rate": 3.9982465727123944e-05, + "loss": 0.2628, + "step": 10679 + }, + { + "epoch": 1.2664532194948417, + "grad_norm": 0.8707276362436793, + "learning_rate": 3.998054410765359e-05, + "loss": 0.1823, + "step": 10680 + }, + { + "epoch": 1.2665718012569667, + "grad_norm": 1.4036879120328158, + "learning_rate": 3.9978622350081e-05, + "loss": 0.2951, + "step": 10681 + }, + { + "epoch": 1.2666903830190916, + "grad_norm": 0.8838612897522572, + "learning_rate": 3.997670045442388e-05, + "loss": 0.1718, + "step": 10682 + }, + { + "epoch": 1.2668089647812166, + "grad_norm": 1.00351266415984, + "learning_rate": 3.9974778420699935e-05, + "loss": 0.2139, + "step": 10683 + }, + { + "epoch": 1.2669275465433416, + "grad_norm": 1.3169446281343504, + "learning_rate": 3.997285624892691e-05, + "loss": 0.2999, + "step": 10684 + }, + { + "epoch": 1.2670461283054666, + "grad_norm": 1.0294864483851398, + "learning_rate": 3.99709339391225e-05, + "loss": 0.2103, + "step": 10685 + }, + { + "epoch": 1.2671647100675916, + "grad_norm": 0.9507200771841938, + "learning_rate": 3.996901149130444e-05, + "loss": 0.1867, + "step": 10686 + }, + { + "epoch": 1.2672832918297166, + "grad_norm": 0.9775158417424439, + "learning_rate": 3.996708890549046e-05, + "loss": 0.1851, + "step": 10687 + }, + { + "epoch": 1.2674018735918415, + "grad_norm": 1.423459055489313, + "learning_rate": 3.9965166181698266e-05, + "loss": 0.2574, + "step": 10688 + }, + { + "epoch": 1.2675204553539665, + "grad_norm": 0.965467331594832, + "learning_rate": 3.996324331994559e-05, + "loss": 0.1984, + "step": 10689 + }, + { + "epoch": 1.2676390371160915, + "grad_norm": 1.1541669513876114, + "learning_rate": 3.996132032025016e-05, + "loss": 0.2321, + "step": 10690 + }, + { + "epoch": 1.2677576188782165, + "grad_norm": 1.0800643790434257, + "learning_rate": 3.995939718262971e-05, + "loss": 0.2147, + "step": 10691 + }, + { + "epoch": 1.2678762006403415, + "grad_norm": 0.9554983101076154, + "learning_rate": 3.995747390710196e-05, + "loss": 0.1826, + "step": 10692 + }, + { + "epoch": 1.2679947824024664, + "grad_norm": 1.251250476876661, + "learning_rate": 3.995555049368463e-05, + "loss": 0.2345, + "step": 10693 + }, + { + "epoch": 1.2681133641645914, + "grad_norm": 0.9288623968813257, + "learning_rate": 3.995362694239548e-05, + "loss": 0.2067, + "step": 10694 + }, + { + "epoch": 1.2682319459267164, + "grad_norm": 1.1048194892975658, + "learning_rate": 3.995170325325223e-05, + "loss": 0.2068, + "step": 10695 + }, + { + "epoch": 1.2683505276888414, + "grad_norm": 1.3590855806006066, + "learning_rate": 3.9949779426272594e-05, + "loss": 0.2619, + "step": 10696 + }, + { + "epoch": 1.2684691094509664, + "grad_norm": 1.0691928941437427, + "learning_rate": 3.994785546147434e-05, + "loss": 0.2116, + "step": 10697 + }, + { + "epoch": 1.2685876912130913, + "grad_norm": 0.9484664944888733, + "learning_rate": 3.994593135887518e-05, + "loss": 0.1741, + "step": 10698 + }, + { + "epoch": 1.2687062729752165, + "grad_norm": 1.0407278170795875, + "learning_rate": 3.994400711849286e-05, + "loss": 0.1771, + "step": 10699 + }, + { + "epoch": 1.2688248547373413, + "grad_norm": 1.2180017255120155, + "learning_rate": 3.994208274034512e-05, + "loss": 0.2914, + "step": 10700 + }, + { + "epoch": 1.2689434364994665, + "grad_norm": 1.213265787973207, + "learning_rate": 3.994015822444971e-05, + "loss": 0.1975, + "step": 10701 + }, + { + "epoch": 1.2690620182615913, + "grad_norm": 1.3029946474889964, + "learning_rate": 3.993823357082435e-05, + "loss": 0.2691, + "step": 10702 + }, + { + "epoch": 1.2691806000237165, + "grad_norm": 0.8872970550052439, + "learning_rate": 3.99363087794868e-05, + "loss": 0.1836, + "step": 10703 + }, + { + "epoch": 1.2692991817858412, + "grad_norm": 0.9807060953964426, + "learning_rate": 3.99343838504548e-05, + "loss": 0.1923, + "step": 10704 + }, + { + "epoch": 1.2694177635479664, + "grad_norm": 0.8879667168084098, + "learning_rate": 3.99324587837461e-05, + "loss": 0.1681, + "step": 10705 + }, + { + "epoch": 1.2695363453100912, + "grad_norm": 1.2566264357518295, + "learning_rate": 3.993053357937843e-05, + "loss": 0.25, + "step": 10706 + }, + { + "epoch": 1.2696549270722164, + "grad_norm": 1.2781308141744867, + "learning_rate": 3.9928608237369556e-05, + "loss": 0.2842, + "step": 10707 + }, + { + "epoch": 1.2697735088343411, + "grad_norm": 1.1030641155428695, + "learning_rate": 3.992668275773722e-05, + "loss": 0.2188, + "step": 10708 + }, + { + "epoch": 1.2698920905964663, + "grad_norm": 0.7663596927902078, + "learning_rate": 3.992475714049917e-05, + "loss": 0.1508, + "step": 10709 + }, + { + "epoch": 1.2700106723585913, + "grad_norm": 1.0249351449811366, + "learning_rate": 3.9922831385673164e-05, + "loss": 0.2102, + "step": 10710 + }, + { + "epoch": 1.2701292541207163, + "grad_norm": 1.0020940310372732, + "learning_rate": 3.992090549327695e-05, + "loss": 0.2422, + "step": 10711 + }, + { + "epoch": 1.2702478358828413, + "grad_norm": 1.4685510705660805, + "learning_rate": 3.991897946332829e-05, + "loss": 0.3161, + "step": 10712 + }, + { + "epoch": 1.2703664176449663, + "grad_norm": 0.9749512040969037, + "learning_rate": 3.991705329584493e-05, + "loss": 0.2146, + "step": 10713 + }, + { + "epoch": 1.2704849994070913, + "grad_norm": 1.1043507980188454, + "learning_rate": 3.991512699084463e-05, + "loss": 0.2592, + "step": 10714 + }, + { + "epoch": 1.2706035811692162, + "grad_norm": 0.9464422163388854, + "learning_rate": 3.991320054834515e-05, + "loss": 0.1796, + "step": 10715 + }, + { + "epoch": 1.2707221629313412, + "grad_norm": 1.3141493894758716, + "learning_rate": 3.9911273968364255e-05, + "loss": 0.3514, + "step": 10716 + }, + { + "epoch": 1.2708407446934662, + "grad_norm": 1.1854478000384914, + "learning_rate": 3.990934725091969e-05, + "loss": 0.2624, + "step": 10717 + }, + { + "epoch": 1.2709593264555912, + "grad_norm": 1.5170063723404184, + "learning_rate": 3.9907420396029236e-05, + "loss": 0.3642, + "step": 10718 + }, + { + "epoch": 1.2710779082177162, + "grad_norm": 1.1767563666292837, + "learning_rate": 3.9905493403710634e-05, + "loss": 0.2707, + "step": 10719 + }, + { + "epoch": 1.2711964899798411, + "grad_norm": 1.0204997563820253, + "learning_rate": 3.990356627398167e-05, + "loss": 0.2561, + "step": 10720 + }, + { + "epoch": 1.2713150717419661, + "grad_norm": 0.8506459529618069, + "learning_rate": 3.9901639006860104e-05, + "loss": 0.1581, + "step": 10721 + }, + { + "epoch": 1.271433653504091, + "grad_norm": 1.318313785767226, + "learning_rate": 3.9899711602363696e-05, + "loss": 0.2488, + "step": 10722 + }, + { + "epoch": 1.271552235266216, + "grad_norm": 0.9829156483484653, + "learning_rate": 3.989778406051022e-05, + "loss": 0.1948, + "step": 10723 + }, + { + "epoch": 1.271670817028341, + "grad_norm": 1.5084978239224585, + "learning_rate": 3.989585638131744e-05, + "loss": 0.3187, + "step": 10724 + }, + { + "epoch": 1.271789398790466, + "grad_norm": 1.1319984850210534, + "learning_rate": 3.989392856480313e-05, + "loss": 0.2382, + "step": 10725 + }, + { + "epoch": 1.271907980552591, + "grad_norm": 1.1888636637056829, + "learning_rate": 3.989200061098506e-05, + "loss": 0.3327, + "step": 10726 + }, + { + "epoch": 1.272026562314716, + "grad_norm": 1.1398874179319805, + "learning_rate": 3.9890072519881016e-05, + "loss": 0.2559, + "step": 10727 + }, + { + "epoch": 1.272145144076841, + "grad_norm": 1.1705299011701351, + "learning_rate": 3.9888144291508756e-05, + "loss": 0.2712, + "step": 10728 + }, + { + "epoch": 1.272263725838966, + "grad_norm": 1.0852975003506278, + "learning_rate": 3.988621592588607e-05, + "loss": 0.2585, + "step": 10729 + }, + { + "epoch": 1.272382307601091, + "grad_norm": 1.3521493261108521, + "learning_rate": 3.988428742303072e-05, + "loss": 0.3471, + "step": 10730 + }, + { + "epoch": 1.272500889363216, + "grad_norm": 0.9796288780391064, + "learning_rate": 3.9882358782960497e-05, + "loss": 0.224, + "step": 10731 + }, + { + "epoch": 1.272619471125341, + "grad_norm": 1.0472668117434936, + "learning_rate": 3.988043000569317e-05, + "loss": 0.2175, + "step": 10732 + }, + { + "epoch": 1.2727380528874659, + "grad_norm": 0.9516384691870745, + "learning_rate": 3.987850109124653e-05, + "loss": 0.2174, + "step": 10733 + }, + { + "epoch": 1.2728566346495909, + "grad_norm": 0.9278705480811084, + "learning_rate": 3.987657203963835e-05, + "loss": 0.2052, + "step": 10734 + }, + { + "epoch": 1.2729752164117158, + "grad_norm": 0.996511502275814, + "learning_rate": 3.9874642850886426e-05, + "loss": 0.2108, + "step": 10735 + }, + { + "epoch": 1.2730937981738408, + "grad_norm": 1.3032703879335963, + "learning_rate": 3.9872713525008535e-05, + "loss": 0.2754, + "step": 10736 + }, + { + "epoch": 1.2732123799359658, + "grad_norm": 1.181016433098123, + "learning_rate": 3.987078406202246e-05, + "loss": 0.2387, + "step": 10737 + }, + { + "epoch": 1.2733309616980908, + "grad_norm": 1.5296302438097418, + "learning_rate": 3.9868854461945994e-05, + "loss": 0.314, + "step": 10738 + }, + { + "epoch": 1.2734495434602158, + "grad_norm": 0.9750327744606769, + "learning_rate": 3.986692472479692e-05, + "loss": 0.2094, + "step": 10739 + }, + { + "epoch": 1.2735681252223408, + "grad_norm": 1.1705208848042112, + "learning_rate": 3.9864994850593036e-05, + "loss": 0.2283, + "step": 10740 + }, + { + "epoch": 1.2736867069844657, + "grad_norm": 1.143361343776401, + "learning_rate": 3.986306483935213e-05, + "loss": 0.2315, + "step": 10741 + }, + { + "epoch": 1.2738052887465907, + "grad_norm": 1.178502739696743, + "learning_rate": 3.986113469109198e-05, + "loss": 0.2495, + "step": 10742 + }, + { + "epoch": 1.2739238705087157, + "grad_norm": 0.8692228637376979, + "learning_rate": 3.98592044058304e-05, + "loss": 0.1729, + "step": 10743 + }, + { + "epoch": 1.2740424522708407, + "grad_norm": 1.0705477451196304, + "learning_rate": 3.9857273983585183e-05, + "loss": 0.2202, + "step": 10744 + }, + { + "epoch": 1.2741610340329657, + "grad_norm": 0.9122347761068905, + "learning_rate": 3.985534342437411e-05, + "loss": 0.1694, + "step": 10745 + }, + { + "epoch": 1.2742796157950906, + "grad_norm": 0.9126119873345295, + "learning_rate": 3.985341272821499e-05, + "loss": 0.2235, + "step": 10746 + }, + { + "epoch": 1.2743981975572156, + "grad_norm": 0.8083512508531157, + "learning_rate": 3.985148189512562e-05, + "loss": 0.1431, + "step": 10747 + }, + { + "epoch": 1.2745167793193408, + "grad_norm": 1.0064643942516724, + "learning_rate": 3.98495509251238e-05, + "loss": 0.1957, + "step": 10748 + }, + { + "epoch": 1.2746353610814656, + "grad_norm": 0.8961918358572555, + "learning_rate": 3.984761981822732e-05, + "loss": 0.2004, + "step": 10749 + }, + { + "epoch": 1.2747539428435908, + "grad_norm": 1.2652547478491485, + "learning_rate": 3.984568857445401e-05, + "loss": 0.2099, + "step": 10750 + }, + { + "epoch": 1.2748725246057155, + "grad_norm": 1.5208584319985956, + "learning_rate": 3.9843757193821645e-05, + "loss": 0.2939, + "step": 10751 + }, + { + "epoch": 1.2749911063678407, + "grad_norm": 0.9807201050845276, + "learning_rate": 3.9841825676348047e-05, + "loss": 0.2089, + "step": 10752 + }, + { + "epoch": 1.2751096881299655, + "grad_norm": 1.126037823143914, + "learning_rate": 3.9839894022051013e-05, + "loss": 0.1855, + "step": 10753 + }, + { + "epoch": 1.2752282698920907, + "grad_norm": 1.2960040228685163, + "learning_rate": 3.983796223094835e-05, + "loss": 0.2512, + "step": 10754 + }, + { + "epoch": 1.2753468516542155, + "grad_norm": 1.0928294296753787, + "learning_rate": 3.9836030303057883e-05, + "loss": 0.2833, + "step": 10755 + }, + { + "epoch": 1.2754654334163407, + "grad_norm": 0.9802863026186557, + "learning_rate": 3.9834098238397396e-05, + "loss": 0.1926, + "step": 10756 + }, + { + "epoch": 1.2755840151784654, + "grad_norm": 1.123754737479265, + "learning_rate": 3.983216603698472e-05, + "loss": 0.1898, + "step": 10757 + }, + { + "epoch": 1.2757025969405906, + "grad_norm": 1.0710651706434926, + "learning_rate": 3.983023369883766e-05, + "loss": 0.2737, + "step": 10758 + }, + { + "epoch": 1.2758211787027154, + "grad_norm": 0.970645167287315, + "learning_rate": 3.982830122397403e-05, + "loss": 0.2004, + "step": 10759 + }, + { + "epoch": 1.2759397604648406, + "grad_norm": 1.071601575329305, + "learning_rate": 3.982636861241165e-05, + "loss": 0.2341, + "step": 10760 + }, + { + "epoch": 1.2760583422269656, + "grad_norm": 1.4662226414449882, + "learning_rate": 3.982443586416834e-05, + "loss": 0.3167, + "step": 10761 + }, + { + "epoch": 1.2761769239890906, + "grad_norm": 1.2488398534175147, + "learning_rate": 3.98225029792619e-05, + "loss": 0.2417, + "step": 10762 + }, + { + "epoch": 1.2762955057512155, + "grad_norm": 0.8550336124779022, + "learning_rate": 3.9820569957710167e-05, + "loss": 0.1687, + "step": 10763 + }, + { + "epoch": 1.2764140875133405, + "grad_norm": 1.0073469473416086, + "learning_rate": 3.981863679953094e-05, + "loss": 0.2044, + "step": 10764 + }, + { + "epoch": 1.2765326692754655, + "grad_norm": 0.9557902299502421, + "learning_rate": 3.981670350474206e-05, + "loss": 0.1839, + "step": 10765 + }, + { + "epoch": 1.2766512510375905, + "grad_norm": 1.339610070479773, + "learning_rate": 3.981477007336135e-05, + "loss": 0.2652, + "step": 10766 + }, + { + "epoch": 1.2767698327997155, + "grad_norm": 1.2781064307178756, + "learning_rate": 3.9812836505406614e-05, + "loss": 0.2661, + "step": 10767 + }, + { + "epoch": 1.2768884145618404, + "grad_norm": 1.1278602704510179, + "learning_rate": 3.98109028008957e-05, + "loss": 0.2513, + "step": 10768 + }, + { + "epoch": 1.2770069963239654, + "grad_norm": 1.2117200114402091, + "learning_rate": 3.9808968959846416e-05, + "loss": 0.2622, + "step": 10769 + }, + { + "epoch": 1.2771255780860904, + "grad_norm": 1.3018365154736355, + "learning_rate": 3.980703498227661e-05, + "loss": 0.2758, + "step": 10770 + }, + { + "epoch": 1.2772441598482154, + "grad_norm": 1.0535607868851413, + "learning_rate": 3.980510086820409e-05, + "loss": 0.2359, + "step": 10771 + }, + { + "epoch": 1.2773627416103404, + "grad_norm": 1.0191695626980086, + "learning_rate": 3.98031666176467e-05, + "loss": 0.2391, + "step": 10772 + }, + { + "epoch": 1.2774813233724653, + "grad_norm": 1.0529741652759002, + "learning_rate": 3.9801232230622266e-05, + "loss": 0.2414, + "step": 10773 + }, + { + "epoch": 1.2775999051345903, + "grad_norm": 1.0766937141432322, + "learning_rate": 3.979929770714862e-05, + "loss": 0.2038, + "step": 10774 + }, + { + "epoch": 1.2777184868967153, + "grad_norm": 1.1711256827254262, + "learning_rate": 3.97973630472436e-05, + "loss": 0.2177, + "step": 10775 + }, + { + "epoch": 1.2778370686588403, + "grad_norm": 1.2287305986693369, + "learning_rate": 3.979542825092503e-05, + "loss": 0.2456, + "step": 10776 + }, + { + "epoch": 1.2779556504209653, + "grad_norm": 1.0670594279674128, + "learning_rate": 3.979349331821076e-05, + "loss": 0.2782, + "step": 10777 + }, + { + "epoch": 1.2780742321830902, + "grad_norm": 1.1196378719961013, + "learning_rate": 3.979155824911863e-05, + "loss": 0.1896, + "step": 10778 + }, + { + "epoch": 1.2781928139452152, + "grad_norm": 2.662878470385042, + "learning_rate": 3.978962304366646e-05, + "loss": 0.4028, + "step": 10779 + }, + { + "epoch": 1.2783113957073402, + "grad_norm": 0.9258389119173809, + "learning_rate": 3.97876877018721e-05, + "loss": 0.1828, + "step": 10780 + }, + { + "epoch": 1.2784299774694652, + "grad_norm": 1.6205214831015902, + "learning_rate": 3.9785752223753406e-05, + "loss": 0.2429, + "step": 10781 + }, + { + "epoch": 1.2785485592315902, + "grad_norm": 1.0531458938995457, + "learning_rate": 3.97838166093282e-05, + "loss": 0.2343, + "step": 10782 + }, + { + "epoch": 1.2786671409937151, + "grad_norm": 1.3234376795672296, + "learning_rate": 3.978188085861434e-05, + "loss": 0.2859, + "step": 10783 + }, + { + "epoch": 1.2787857227558401, + "grad_norm": 0.7062582871288987, + "learning_rate": 3.977994497162966e-05, + "loss": 0.136, + "step": 10784 + }, + { + "epoch": 1.278904304517965, + "grad_norm": 1.077109028626667, + "learning_rate": 3.9778008948392006e-05, + "loss": 0.2251, + "step": 10785 + }, + { + "epoch": 1.27902288628009, + "grad_norm": 1.133612752429743, + "learning_rate": 3.977607278891924e-05, + "loss": 0.2201, + "step": 10786 + }, + { + "epoch": 1.279141468042215, + "grad_norm": 1.3091837012733343, + "learning_rate": 3.977413649322921e-05, + "loss": 0.2802, + "step": 10787 + }, + { + "epoch": 1.27926004980434, + "grad_norm": 0.9619663244214097, + "learning_rate": 3.9772200061339746e-05, + "loss": 0.241, + "step": 10788 + }, + { + "epoch": 1.279378631566465, + "grad_norm": 0.9414846498318026, + "learning_rate": 3.977026349326871e-05, + "loss": 0.2532, + "step": 10789 + }, + { + "epoch": 1.27949721332859, + "grad_norm": 0.8944483944557797, + "learning_rate": 3.9768326789033967e-05, + "loss": 0.2136, + "step": 10790 + }, + { + "epoch": 1.279615795090715, + "grad_norm": 0.9732708054867222, + "learning_rate": 3.976638994865336e-05, + "loss": 0.2023, + "step": 10791 + }, + { + "epoch": 1.27973437685284, + "grad_norm": 1.1212607027694528, + "learning_rate": 3.976445297214474e-05, + "loss": 0.2341, + "step": 10792 + }, + { + "epoch": 1.279852958614965, + "grad_norm": 1.15262076829852, + "learning_rate": 3.976251585952597e-05, + "loss": 0.2004, + "step": 10793 + }, + { + "epoch": 1.27997154037709, + "grad_norm": 1.2344976101366896, + "learning_rate": 3.9760578610814906e-05, + "loss": 0.2737, + "step": 10794 + }, + { + "epoch": 1.280090122139215, + "grad_norm": 1.2336276643157398, + "learning_rate": 3.975864122602941e-05, + "loss": 0.3141, + "step": 10795 + }, + { + "epoch": 1.28020870390134, + "grad_norm": 1.2102905537655648, + "learning_rate": 3.975670370518734e-05, + "loss": 0.2558, + "step": 10796 + }, + { + "epoch": 1.280327285663465, + "grad_norm": 1.0172884340862178, + "learning_rate": 3.9754766048306555e-05, + "loss": 0.1813, + "step": 10797 + }, + { + "epoch": 1.2804458674255899, + "grad_norm": 1.385540104185659, + "learning_rate": 3.975282825540493e-05, + "loss": 0.2708, + "step": 10798 + }, + { + "epoch": 1.280564449187715, + "grad_norm": 1.0339832845056875, + "learning_rate": 3.9750890326500313e-05, + "loss": 0.1863, + "step": 10799 + }, + { + "epoch": 1.2806830309498398, + "grad_norm": 1.0651917868920195, + "learning_rate": 3.9748952261610575e-05, + "loss": 0.2136, + "step": 10800 + }, + { + "epoch": 1.280801612711965, + "grad_norm": 1.0448083465828053, + "learning_rate": 3.974701406075358e-05, + "loss": 0.2798, + "step": 10801 + }, + { + "epoch": 1.2809201944740898, + "grad_norm": 0.9630826586442516, + "learning_rate": 3.974507572394721e-05, + "loss": 0.1939, + "step": 10802 + }, + { + "epoch": 1.281038776236215, + "grad_norm": 1.1392357919925158, + "learning_rate": 3.9743137251209315e-05, + "loss": 0.2562, + "step": 10803 + }, + { + "epoch": 1.2811573579983397, + "grad_norm": 1.0889679097238374, + "learning_rate": 3.974119864255778e-05, + "loss": 0.2037, + "step": 10804 + }, + { + "epoch": 1.281275939760465, + "grad_norm": 1.3210974350315363, + "learning_rate": 3.973925989801046e-05, + "loss": 0.2681, + "step": 10805 + }, + { + "epoch": 1.2813945215225897, + "grad_norm": 0.9583683508913416, + "learning_rate": 3.973732101758525e-05, + "loss": 0.2037, + "step": 10806 + }, + { + "epoch": 1.281513103284715, + "grad_norm": 1.3491855242132251, + "learning_rate": 3.97353820013e-05, + "loss": 0.2649, + "step": 10807 + }, + { + "epoch": 1.2816316850468397, + "grad_norm": 1.1048654093660821, + "learning_rate": 3.973344284917261e-05, + "loss": 0.222, + "step": 10808 + }, + { + "epoch": 1.2817502668089649, + "grad_norm": 1.0619835299881157, + "learning_rate": 3.9731503561220936e-05, + "loss": 0.2439, + "step": 10809 + }, + { + "epoch": 1.2818688485710898, + "grad_norm": 1.130662680304318, + "learning_rate": 3.972956413746286e-05, + "loss": 0.1912, + "step": 10810 + }, + { + "epoch": 1.2819874303332148, + "grad_norm": 0.8626962528850692, + "learning_rate": 3.9727624577916276e-05, + "loss": 0.1679, + "step": 10811 + }, + { + "epoch": 1.2821060120953398, + "grad_norm": 0.9733274136432473, + "learning_rate": 3.972568488259905e-05, + "loss": 0.1916, + "step": 10812 + }, + { + "epoch": 1.2822245938574648, + "grad_norm": 1.8788321132432857, + "learning_rate": 3.972374505152907e-05, + "loss": 0.3866, + "step": 10813 + }, + { + "epoch": 1.2823431756195898, + "grad_norm": 1.1036534492203087, + "learning_rate": 3.97218050847242e-05, + "loss": 0.2761, + "step": 10814 + }, + { + "epoch": 1.2824617573817148, + "grad_norm": 1.0978372989135854, + "learning_rate": 3.9719864982202364e-05, + "loss": 0.2496, + "step": 10815 + }, + { + "epoch": 1.2825803391438397, + "grad_norm": 1.2898888556319095, + "learning_rate": 3.9717924743981415e-05, + "loss": 0.3215, + "step": 10816 + }, + { + "epoch": 1.2826989209059647, + "grad_norm": 1.4634120850778907, + "learning_rate": 3.971598437007924e-05, + "loss": 0.365, + "step": 10817 + }, + { + "epoch": 1.2828175026680897, + "grad_norm": 0.9240282952853114, + "learning_rate": 3.9714043860513745e-05, + "loss": 0.1991, + "step": 10818 + }, + { + "epoch": 1.2829360844302147, + "grad_norm": 1.0256191252637823, + "learning_rate": 3.971210321530281e-05, + "loss": 0.2314, + "step": 10819 + }, + { + "epoch": 1.2830546661923397, + "grad_norm": 1.2544047813305328, + "learning_rate": 3.9710162434464314e-05, + "loss": 0.2486, + "step": 10820 + }, + { + "epoch": 1.2831732479544646, + "grad_norm": 1.1109123590789722, + "learning_rate": 3.970822151801616e-05, + "loss": 0.1996, + "step": 10821 + }, + { + "epoch": 1.2832918297165896, + "grad_norm": 1.2570038056889168, + "learning_rate": 3.970628046597625e-05, + "loss": 0.2712, + "step": 10822 + }, + { + "epoch": 1.2834104114787146, + "grad_norm": 1.182870748167845, + "learning_rate": 3.970433927836246e-05, + "loss": 0.246, + "step": 10823 + }, + { + "epoch": 1.2835289932408396, + "grad_norm": 1.1187804258687966, + "learning_rate": 3.97023979551927e-05, + "loss": 0.2353, + "step": 10824 + }, + { + "epoch": 1.2836475750029646, + "grad_norm": 1.2380539765894887, + "learning_rate": 3.970045649648485e-05, + "loss": 0.239, + "step": 10825 + }, + { + "epoch": 1.2837661567650895, + "grad_norm": 0.8080707455040895, + "learning_rate": 3.969851490225684e-05, + "loss": 0.1913, + "step": 10826 + }, + { + "epoch": 1.2838847385272145, + "grad_norm": 1.257570774927042, + "learning_rate": 3.969657317252652e-05, + "loss": 0.2509, + "step": 10827 + }, + { + "epoch": 1.2840033202893395, + "grad_norm": 1.0750878412680054, + "learning_rate": 3.969463130731183e-05, + "loss": 0.2294, + "step": 10828 + }, + { + "epoch": 1.2841219020514645, + "grad_norm": 1.0073654636667686, + "learning_rate": 3.969268930663066e-05, + "loss": 0.221, + "step": 10829 + }, + { + "epoch": 1.2842404838135895, + "grad_norm": 1.0101777759893429, + "learning_rate": 3.9690747170500906e-05, + "loss": 0.2062, + "step": 10830 + }, + { + "epoch": 1.2843590655757144, + "grad_norm": 1.0182331324116962, + "learning_rate": 3.9688804898940484e-05, + "loss": 0.228, + "step": 10831 + }, + { + "epoch": 1.2844776473378394, + "grad_norm": 1.081276421226411, + "learning_rate": 3.968686249196729e-05, + "loss": 0.2219, + "step": 10832 + }, + { + "epoch": 1.2845962290999644, + "grad_norm": 0.934956973006679, + "learning_rate": 3.9684919949599234e-05, + "loss": 0.1682, + "step": 10833 + }, + { + "epoch": 1.2847148108620894, + "grad_norm": 1.216757134148545, + "learning_rate": 3.968297727185423e-05, + "loss": 0.2496, + "step": 10834 + }, + { + "epoch": 1.2848333926242144, + "grad_norm": 1.6238736171895398, + "learning_rate": 3.968103445875018e-05, + "loss": 0.3954, + "step": 10835 + }, + { + "epoch": 1.2849519743863393, + "grad_norm": 1.0826496181446703, + "learning_rate": 3.967909151030499e-05, + "loss": 0.2013, + "step": 10836 + }, + { + "epoch": 1.2850705561484643, + "grad_norm": 1.1371912530561223, + "learning_rate": 3.9677148426536584e-05, + "loss": 0.2466, + "step": 10837 + }, + { + "epoch": 1.2851891379105893, + "grad_norm": 1.4511816622349496, + "learning_rate": 3.9675205207462854e-05, + "loss": 0.2694, + "step": 10838 + }, + { + "epoch": 1.2853077196727143, + "grad_norm": 1.131842578408116, + "learning_rate": 3.967326185310174e-05, + "loss": 0.2596, + "step": 10839 + }, + { + "epoch": 1.2854263014348393, + "grad_norm": 1.2886355944190715, + "learning_rate": 3.967131836347114e-05, + "loss": 0.2769, + "step": 10840 + }, + { + "epoch": 1.2855448831969642, + "grad_norm": 1.112183255024301, + "learning_rate": 3.9669374738588985e-05, + "loss": 0.2234, + "step": 10841 + }, + { + "epoch": 1.2856634649590892, + "grad_norm": 0.8106646531315126, + "learning_rate": 3.966743097847317e-05, + "loss": 0.234, + "step": 10842 + }, + { + "epoch": 1.2857820467212142, + "grad_norm": 1.488070038886557, + "learning_rate": 3.9665487083141636e-05, + "loss": 0.2762, + "step": 10843 + }, + { + "epoch": 1.2859006284833392, + "grad_norm": 1.046731742393729, + "learning_rate": 3.966354305261229e-05, + "loss": 0.213, + "step": 10844 + }, + { + "epoch": 1.2860192102454642, + "grad_norm": 1.159844054641228, + "learning_rate": 3.966159888690306e-05, + "loss": 0.2194, + "step": 10845 + }, + { + "epoch": 1.2861377920075892, + "grad_norm": 1.2967631573296186, + "learning_rate": 3.965965458603188e-05, + "loss": 0.3036, + "step": 10846 + }, + { + "epoch": 1.2862563737697141, + "grad_norm": 0.9062555437708207, + "learning_rate": 3.965771015001665e-05, + "loss": 0.1988, + "step": 10847 + }, + { + "epoch": 1.2863749555318393, + "grad_norm": 0.9766186773423624, + "learning_rate": 3.9655765578875305e-05, + "loss": 0.1658, + "step": 10848 + }, + { + "epoch": 1.286493537293964, + "grad_norm": 1.258270882776296, + "learning_rate": 3.965382087262578e-05, + "loss": 0.3109, + "step": 10849 + }, + { + "epoch": 1.2866121190560893, + "grad_norm": 1.2383936069880728, + "learning_rate": 3.965187603128598e-05, + "loss": 0.263, + "step": 10850 + }, + { + "epoch": 1.286730700818214, + "grad_norm": 1.0029879068069858, + "learning_rate": 3.964993105487387e-05, + "loss": 0.1763, + "step": 10851 + }, + { + "epoch": 1.2868492825803393, + "grad_norm": 0.776745860103505, + "learning_rate": 3.964798594340735e-05, + "loss": 0.145, + "step": 10852 + }, + { + "epoch": 1.286967864342464, + "grad_norm": 1.6526214290791585, + "learning_rate": 3.964604069690437e-05, + "loss": 0.296, + "step": 10853 + }, + { + "epoch": 1.2870864461045892, + "grad_norm": 1.2472859369583584, + "learning_rate": 3.9644095315382853e-05, + "loss": 0.2215, + "step": 10854 + }, + { + "epoch": 1.287205027866714, + "grad_norm": 1.0424786910271304, + "learning_rate": 3.9642149798860725e-05, + "loss": 0.2474, + "step": 10855 + }, + { + "epoch": 1.2873236096288392, + "grad_norm": 1.029839603055906, + "learning_rate": 3.964020414735594e-05, + "loss": 0.2264, + "step": 10856 + }, + { + "epoch": 1.287442191390964, + "grad_norm": 1.1329540560047393, + "learning_rate": 3.9638258360886425e-05, + "loss": 0.18, + "step": 10857 + }, + { + "epoch": 1.2875607731530891, + "grad_norm": 0.9460007286515121, + "learning_rate": 3.963631243947013e-05, + "loss": 0.1864, + "step": 10858 + }, + { + "epoch": 1.287679354915214, + "grad_norm": 1.0052041708525885, + "learning_rate": 3.963436638312496e-05, + "loss": 0.2188, + "step": 10859 + }, + { + "epoch": 1.287797936677339, + "grad_norm": 0.8110774137365021, + "learning_rate": 3.963242019186889e-05, + "loss": 0.1522, + "step": 10860 + }, + { + "epoch": 1.287916518439464, + "grad_norm": 0.9982825850996764, + "learning_rate": 3.963047386571985e-05, + "loss": 0.2381, + "step": 10861 + }, + { + "epoch": 1.288035100201589, + "grad_norm": 1.0251395933398348, + "learning_rate": 3.962852740469578e-05, + "loss": 0.2118, + "step": 10862 + }, + { + "epoch": 1.288153681963714, + "grad_norm": 1.0194117620348124, + "learning_rate": 3.962658080881462e-05, + "loss": 0.1888, + "step": 10863 + }, + { + "epoch": 1.288272263725839, + "grad_norm": 1.0366235112332165, + "learning_rate": 3.962463407809434e-05, + "loss": 0.2524, + "step": 10864 + }, + { + "epoch": 1.288390845487964, + "grad_norm": 1.0293300616643928, + "learning_rate": 3.962268721255284e-05, + "loss": 0.2, + "step": 10865 + }, + { + "epoch": 1.288509427250089, + "grad_norm": 1.4032066970026995, + "learning_rate": 3.962074021220812e-05, + "loss": 0.2279, + "step": 10866 + }, + { + "epoch": 1.288628009012214, + "grad_norm": 1.0953174453187209, + "learning_rate": 3.961879307707809e-05, + "loss": 0.201, + "step": 10867 + }, + { + "epoch": 1.288746590774339, + "grad_norm": 1.250480613723757, + "learning_rate": 3.961684580718072e-05, + "loss": 0.2494, + "step": 10868 + }, + { + "epoch": 1.288865172536464, + "grad_norm": 1.0564991976863756, + "learning_rate": 3.961489840253396e-05, + "loss": 0.1897, + "step": 10869 + }, + { + "epoch": 1.288983754298589, + "grad_norm": 1.488951499483191, + "learning_rate": 3.961295086315575e-05, + "loss": 0.3025, + "step": 10870 + }, + { + "epoch": 1.289102336060714, + "grad_norm": 1.1348335103772185, + "learning_rate": 3.9611003189064055e-05, + "loss": 0.2997, + "step": 10871 + }, + { + "epoch": 1.2892209178228389, + "grad_norm": 1.0958328606780834, + "learning_rate": 3.960905538027683e-05, + "loss": 0.29, + "step": 10872 + }, + { + "epoch": 1.2893394995849639, + "grad_norm": 1.5853389934726192, + "learning_rate": 3.960710743681203e-05, + "loss": 0.3408, + "step": 10873 + }, + { + "epoch": 1.2894580813470888, + "grad_norm": 1.3553147386580882, + "learning_rate": 3.960515935868761e-05, + "loss": 0.3124, + "step": 10874 + }, + { + "epoch": 1.2895766631092138, + "grad_norm": 1.0956780669018842, + "learning_rate": 3.9603211145921534e-05, + "loss": 0.262, + "step": 10875 + }, + { + "epoch": 1.2896952448713388, + "grad_norm": 1.2314281805100147, + "learning_rate": 3.960126279853175e-05, + "loss": 0.2984, + "step": 10876 + }, + { + "epoch": 1.2898138266334638, + "grad_norm": 1.262612356495236, + "learning_rate": 3.959931431653624e-05, + "loss": 0.2522, + "step": 10877 + }, + { + "epoch": 1.2899324083955888, + "grad_norm": 1.0905198432499834, + "learning_rate": 3.9597365699952946e-05, + "loss": 0.1944, + "step": 10878 + }, + { + "epoch": 1.2900509901577137, + "grad_norm": 1.0671757046165362, + "learning_rate": 3.9595416948799844e-05, + "loss": 0.2373, + "step": 10879 + }, + { + "epoch": 1.2901695719198387, + "grad_norm": 1.3834351524923154, + "learning_rate": 3.9593468063094893e-05, + "loss": 0.278, + "step": 10880 + }, + { + "epoch": 1.2902881536819637, + "grad_norm": 0.8893796294768886, + "learning_rate": 3.959151904285606e-05, + "loss": 0.1666, + "step": 10881 + }, + { + "epoch": 1.2904067354440887, + "grad_norm": 1.4486572060457568, + "learning_rate": 3.9589569888101326e-05, + "loss": 0.3251, + "step": 10882 + }, + { + "epoch": 1.2905253172062137, + "grad_norm": 1.292692264632047, + "learning_rate": 3.958762059884864e-05, + "loss": 0.3335, + "step": 10883 + }, + { + "epoch": 1.2906438989683386, + "grad_norm": 1.017353759313209, + "learning_rate": 3.958567117511599e-05, + "loss": 0.2442, + "step": 10884 + }, + { + "epoch": 1.2907624807304636, + "grad_norm": 0.8731184855866532, + "learning_rate": 3.958372161692132e-05, + "loss": 0.2272, + "step": 10885 + }, + { + "epoch": 1.2908810624925886, + "grad_norm": 1.2201772270588465, + "learning_rate": 3.958177192428264e-05, + "loss": 0.3452, + "step": 10886 + }, + { + "epoch": 1.2909996442547136, + "grad_norm": 1.6345679025474167, + "learning_rate": 3.957982209721789e-05, + "loss": 0.2544, + "step": 10887 + }, + { + "epoch": 1.2911182260168386, + "grad_norm": 1.3845043580271446, + "learning_rate": 3.957787213574506e-05, + "loss": 0.2538, + "step": 10888 + }, + { + "epoch": 1.2912368077789635, + "grad_norm": 1.0251734897505893, + "learning_rate": 3.9575922039882135e-05, + "loss": 0.2239, + "step": 10889 + }, + { + "epoch": 1.2913553895410885, + "grad_norm": 1.3402822784085564, + "learning_rate": 3.957397180964708e-05, + "loss": 0.2598, + "step": 10890 + }, + { + "epoch": 1.2914739713032135, + "grad_norm": 1.2416829162990342, + "learning_rate": 3.9572021445057874e-05, + "loss": 0.257, + "step": 10891 + }, + { + "epoch": 1.2915925530653385, + "grad_norm": 0.998296227017748, + "learning_rate": 3.9570070946132496e-05, + "loss": 0.1974, + "step": 10892 + }, + { + "epoch": 1.2917111348274635, + "grad_norm": 0.9488605144187368, + "learning_rate": 3.9568120312888935e-05, + "loss": 0.213, + "step": 10893 + }, + { + "epoch": 1.2918297165895885, + "grad_norm": 1.456984093395168, + "learning_rate": 3.956616954534517e-05, + "loss": 0.2892, + "step": 10894 + }, + { + "epoch": 1.2919482983517134, + "grad_norm": 1.4549135026537598, + "learning_rate": 3.956421864351919e-05, + "loss": 0.3438, + "step": 10895 + }, + { + "epoch": 1.2920668801138384, + "grad_norm": 1.3033457191151183, + "learning_rate": 3.956226760742896e-05, + "loss": 0.2935, + "step": 10896 + }, + { + "epoch": 1.2921854618759636, + "grad_norm": 1.3611386550927653, + "learning_rate": 3.956031643709249e-05, + "loss": 0.333, + "step": 10897 + }, + { + "epoch": 1.2923040436380884, + "grad_norm": 0.8482019499211835, + "learning_rate": 3.955836513252775e-05, + "loss": 0.1906, + "step": 10898 + }, + { + "epoch": 1.2924226254002136, + "grad_norm": 0.7592516879020468, + "learning_rate": 3.955641369375275e-05, + "loss": 0.1465, + "step": 10899 + }, + { + "epoch": 1.2925412071623383, + "grad_norm": 0.9114152660113787, + "learning_rate": 3.955446212078545e-05, + "loss": 0.2112, + "step": 10900 + }, + { + "epoch": 1.2926597889244635, + "grad_norm": 1.0176965527249355, + "learning_rate": 3.9552510413643876e-05, + "loss": 0.2444, + "step": 10901 + }, + { + "epoch": 1.2927783706865883, + "grad_norm": 1.3238726854403902, + "learning_rate": 3.955055857234599e-05, + "loss": 0.2632, + "step": 10902 + }, + { + "epoch": 1.2928969524487135, + "grad_norm": 1.2086654054290191, + "learning_rate": 3.954860659690979e-05, + "loss": 0.2145, + "step": 10903 + }, + { + "epoch": 1.2930155342108383, + "grad_norm": 1.166604912384743, + "learning_rate": 3.954665448735329e-05, + "loss": 0.2532, + "step": 10904 + }, + { + "epoch": 1.2931341159729635, + "grad_norm": 1.0259644434566155, + "learning_rate": 3.954470224369446e-05, + "loss": 0.2614, + "step": 10905 + }, + { + "epoch": 1.2932526977350882, + "grad_norm": 1.421475489821214, + "learning_rate": 3.9542749865951324e-05, + "loss": 0.325, + "step": 10906 + }, + { + "epoch": 1.2933712794972134, + "grad_norm": 0.6892659324852397, + "learning_rate": 3.9540797354141864e-05, + "loss": 0.1574, + "step": 10907 + }, + { + "epoch": 1.2934898612593382, + "grad_norm": 1.1943262884891912, + "learning_rate": 3.953884470828409e-05, + "loss": 0.2958, + "step": 10908 + }, + { + "epoch": 1.2936084430214634, + "grad_norm": 1.0059936374831007, + "learning_rate": 3.953689192839598e-05, + "loss": 0.2033, + "step": 10909 + }, + { + "epoch": 1.2937270247835884, + "grad_norm": 1.3049417394248664, + "learning_rate": 3.9534939014495566e-05, + "loss": 0.2484, + "step": 10910 + }, + { + "epoch": 1.2938456065457133, + "grad_norm": 1.2831307623216306, + "learning_rate": 3.953298596660083e-05, + "loss": 0.323, + "step": 10911 + }, + { + "epoch": 1.2939641883078383, + "grad_norm": 1.1153406161859811, + "learning_rate": 3.953103278472979e-05, + "loss": 0.3101, + "step": 10912 + }, + { + "epoch": 1.2940827700699633, + "grad_norm": 1.1685298747146309, + "learning_rate": 3.952907946890044e-05, + "loss": 0.2439, + "step": 10913 + }, + { + "epoch": 1.2942013518320883, + "grad_norm": 1.1133327678044826, + "learning_rate": 3.95271260191308e-05, + "loss": 0.2383, + "step": 10914 + }, + { + "epoch": 1.2943199335942133, + "grad_norm": 1.1485858526637018, + "learning_rate": 3.952517243543888e-05, + "loss": 0.2285, + "step": 10915 + }, + { + "epoch": 1.2944385153563382, + "grad_norm": 1.0614519519421255, + "learning_rate": 3.952321871784267e-05, + "loss": 0.2451, + "step": 10916 + }, + { + "epoch": 1.2945570971184632, + "grad_norm": 1.3389531246040944, + "learning_rate": 3.9521264866360205e-05, + "loss": 0.2777, + "step": 10917 + }, + { + "epoch": 1.2946756788805882, + "grad_norm": 0.9536185829351614, + "learning_rate": 3.951931088100947e-05, + "loss": 0.1754, + "step": 10918 + }, + { + "epoch": 1.2947942606427132, + "grad_norm": 1.442286288284422, + "learning_rate": 3.9517356761808505e-05, + "loss": 0.2943, + "step": 10919 + }, + { + "epoch": 1.2949128424048382, + "grad_norm": 1.0108269207595781, + "learning_rate": 3.951540250877531e-05, + "loss": 0.2064, + "step": 10920 + }, + { + "epoch": 1.2950314241669632, + "grad_norm": 1.1874412648391464, + "learning_rate": 3.9513448121927904e-05, + "loss": 0.2504, + "step": 10921 + }, + { + "epoch": 1.2951500059290881, + "grad_norm": 1.8577874653148276, + "learning_rate": 3.95114936012843e-05, + "loss": 0.3956, + "step": 10922 + }, + { + "epoch": 1.2952685876912131, + "grad_norm": 1.2745280955023635, + "learning_rate": 3.9509538946862525e-05, + "loss": 0.1945, + "step": 10923 + }, + { + "epoch": 1.295387169453338, + "grad_norm": 0.9017677114698852, + "learning_rate": 3.950758415868059e-05, + "loss": 0.2091, + "step": 10924 + }, + { + "epoch": 1.295505751215463, + "grad_norm": 1.36057581619686, + "learning_rate": 3.9505629236756526e-05, + "loss": 0.2721, + "step": 10925 + }, + { + "epoch": 1.295624332977588, + "grad_norm": 1.021380764669379, + "learning_rate": 3.9503674181108344e-05, + "loss": 0.2127, + "step": 10926 + }, + { + "epoch": 1.295742914739713, + "grad_norm": 1.3653272173770985, + "learning_rate": 3.950171899175407e-05, + "loss": 0.258, + "step": 10927 + }, + { + "epoch": 1.295861496501838, + "grad_norm": 1.103043843459674, + "learning_rate": 3.9499763668711725e-05, + "loss": 0.2232, + "step": 10928 + }, + { + "epoch": 1.295980078263963, + "grad_norm": 1.3460112082826168, + "learning_rate": 3.949780821199935e-05, + "loss": 0.2699, + "step": 10929 + }, + { + "epoch": 1.296098660026088, + "grad_norm": 0.8451803341454924, + "learning_rate": 3.949585262163496e-05, + "loss": 0.2076, + "step": 10930 + }, + { + "epoch": 1.296217241788213, + "grad_norm": 1.5258215906603612, + "learning_rate": 3.9493896897636587e-05, + "loss": 0.3435, + "step": 10931 + }, + { + "epoch": 1.296335823550338, + "grad_norm": 1.0594440382660262, + "learning_rate": 3.9491941040022245e-05, + "loss": 0.2177, + "step": 10932 + }, + { + "epoch": 1.296454405312463, + "grad_norm": 1.347716419928417, + "learning_rate": 3.9489985048809984e-05, + "loss": 0.2527, + "step": 10933 + }, + { + "epoch": 1.296572987074588, + "grad_norm": 1.039553294688813, + "learning_rate": 3.9488028924017836e-05, + "loss": 0.1952, + "step": 10934 + }, + { + "epoch": 1.2966915688367129, + "grad_norm": 1.4532963276403335, + "learning_rate": 3.9486072665663825e-05, + "loss": 0.2859, + "step": 10935 + }, + { + "epoch": 1.2968101505988379, + "grad_norm": 1.4832038361414788, + "learning_rate": 3.948411627376599e-05, + "loss": 0.3221, + "step": 10936 + }, + { + "epoch": 1.2969287323609628, + "grad_norm": 1.0660062250236255, + "learning_rate": 3.9482159748342354e-05, + "loss": 0.2303, + "step": 10937 + }, + { + "epoch": 1.2970473141230878, + "grad_norm": 1.0076370736756224, + "learning_rate": 3.948020308941097e-05, + "loss": 0.2754, + "step": 10938 + }, + { + "epoch": 1.2971658958852128, + "grad_norm": 0.967483758669861, + "learning_rate": 3.9478246296989873e-05, + "loss": 0.2274, + "step": 10939 + }, + { + "epoch": 1.2972844776473378, + "grad_norm": 1.0028042922740734, + "learning_rate": 3.9476289371097105e-05, + "loss": 0.2468, + "step": 10940 + }, + { + "epoch": 1.2974030594094628, + "grad_norm": 0.9473837525029029, + "learning_rate": 3.9474332311750696e-05, + "loss": 0.1803, + "step": 10941 + }, + { + "epoch": 1.2975216411715877, + "grad_norm": 1.1306772008251902, + "learning_rate": 3.947237511896869e-05, + "loss": 0.2152, + "step": 10942 + }, + { + "epoch": 1.2976402229337127, + "grad_norm": 0.9383691048947969, + "learning_rate": 3.947041779276913e-05, + "loss": 0.21, + "step": 10943 + }, + { + "epoch": 1.2977588046958377, + "grad_norm": 0.9319631144337592, + "learning_rate": 3.946846033317006e-05, + "loss": 0.1636, + "step": 10944 + }, + { + "epoch": 1.2978773864579627, + "grad_norm": 1.1611219824955554, + "learning_rate": 3.9466502740189544e-05, + "loss": 0.2503, + "step": 10945 + }, + { + "epoch": 1.2979959682200877, + "grad_norm": 1.1984024787537124, + "learning_rate": 3.94645450138456e-05, + "loss": 0.234, + "step": 10946 + }, + { + "epoch": 1.2981145499822127, + "grad_norm": 0.9987341047902589, + "learning_rate": 3.946258715415629e-05, + "loss": 0.1925, + "step": 10947 + }, + { + "epoch": 1.2982331317443379, + "grad_norm": 1.4300347102787359, + "learning_rate": 3.9460629161139676e-05, + "loss": 0.2923, + "step": 10948 + }, + { + "epoch": 1.2983517135064626, + "grad_norm": 0.9738341906524622, + "learning_rate": 3.9458671034813775e-05, + "loss": 0.243, + "step": 10949 + }, + { + "epoch": 1.2984702952685878, + "grad_norm": 1.1434331591476317, + "learning_rate": 3.945671277519667e-05, + "loss": 0.2379, + "step": 10950 + }, + { + "epoch": 1.2985888770307126, + "grad_norm": 1.2803592011531808, + "learning_rate": 3.945475438230639e-05, + "loss": 0.2531, + "step": 10951 + }, + { + "epoch": 1.2987074587928378, + "grad_norm": 0.8684743616037215, + "learning_rate": 3.9452795856161004e-05, + "loss": 0.1877, + "step": 10952 + }, + { + "epoch": 1.2988260405549625, + "grad_norm": 1.1635908093456846, + "learning_rate": 3.945083719677857e-05, + "loss": 0.2258, + "step": 10953 + }, + { + "epoch": 1.2989446223170877, + "grad_norm": 1.6479160793408856, + "learning_rate": 3.944887840417713e-05, + "loss": 0.3551, + "step": 10954 + }, + { + "epoch": 1.2990632040792125, + "grad_norm": 1.1475337948813804, + "learning_rate": 3.944691947837475e-05, + "loss": 0.2441, + "step": 10955 + }, + { + "epoch": 1.2991817858413377, + "grad_norm": 1.161098695672196, + "learning_rate": 3.944496041938949e-05, + "loss": 0.2649, + "step": 10956 + }, + { + "epoch": 1.2993003676034625, + "grad_norm": 1.1909902634831517, + "learning_rate": 3.944300122723941e-05, + "loss": 0.2574, + "step": 10957 + }, + { + "epoch": 1.2994189493655877, + "grad_norm": 1.1562883122580063, + "learning_rate": 3.9441041901942566e-05, + "loss": 0.256, + "step": 10958 + }, + { + "epoch": 1.2995375311277124, + "grad_norm": 0.8194073254128489, + "learning_rate": 3.9439082443517026e-05, + "loss": 0.1515, + "step": 10959 + }, + { + "epoch": 1.2996561128898376, + "grad_norm": 0.9454543556052423, + "learning_rate": 3.9437122851980855e-05, + "loss": 0.2575, + "step": 10960 + }, + { + "epoch": 1.2997746946519626, + "grad_norm": 0.8832766822840542, + "learning_rate": 3.943516312735211e-05, + "loss": 0.2172, + "step": 10961 + }, + { + "epoch": 1.2998932764140876, + "grad_norm": 1.2171855192479484, + "learning_rate": 3.9433203269648875e-05, + "loss": 0.2177, + "step": 10962 + }, + { + "epoch": 1.3000118581762126, + "grad_norm": 1.1632174660850447, + "learning_rate": 3.9431243278889197e-05, + "loss": 0.3033, + "step": 10963 + }, + { + "epoch": 1.3001304399383375, + "grad_norm": 1.2348335482179527, + "learning_rate": 3.942928315509115e-05, + "loss": 0.2972, + "step": 10964 + }, + { + "epoch": 1.3002490217004625, + "grad_norm": 1.1811467688845099, + "learning_rate": 3.942732289827281e-05, + "loss": 0.2101, + "step": 10965 + }, + { + "epoch": 1.3003676034625875, + "grad_norm": 1.0741466451234243, + "learning_rate": 3.942536250845225e-05, + "loss": 0.2256, + "step": 10966 + }, + { + "epoch": 1.3004861852247125, + "grad_norm": 1.452026231644293, + "learning_rate": 3.9423401985647526e-05, + "loss": 0.3787, + "step": 10967 + }, + { + "epoch": 1.3006047669868375, + "grad_norm": 0.9900339027586272, + "learning_rate": 3.942144132987673e-05, + "loss": 0.2116, + "step": 10968 + }, + { + "epoch": 1.3007233487489624, + "grad_norm": 1.5798822304224407, + "learning_rate": 3.941948054115792e-05, + "loss": 0.3551, + "step": 10969 + }, + { + "epoch": 1.3008419305110874, + "grad_norm": 1.0506342922115095, + "learning_rate": 3.941751961950919e-05, + "loss": 0.2405, + "step": 10970 + }, + { + "epoch": 1.3009605122732124, + "grad_norm": 1.052018683018909, + "learning_rate": 3.941555856494861e-05, + "loss": 0.2778, + "step": 10971 + }, + { + "epoch": 1.3010790940353374, + "grad_norm": 1.1147736653837486, + "learning_rate": 3.941359737749426e-05, + "loss": 0.2802, + "step": 10972 + }, + { + "epoch": 1.3011976757974624, + "grad_norm": 1.083706638428422, + "learning_rate": 3.9411636057164206e-05, + "loss": 0.2301, + "step": 10973 + }, + { + "epoch": 1.3013162575595874, + "grad_norm": 1.0532423278475032, + "learning_rate": 3.940967460397656e-05, + "loss": 0.174, + "step": 10974 + }, + { + "epoch": 1.3014348393217123, + "grad_norm": 0.987517799962196, + "learning_rate": 3.940771301794937e-05, + "loss": 0.2389, + "step": 10975 + }, + { + "epoch": 1.3015534210838373, + "grad_norm": 1.1261967856232036, + "learning_rate": 3.940575129910073e-05, + "loss": 0.2794, + "step": 10976 + }, + { + "epoch": 1.3016720028459623, + "grad_norm": 0.8671480576021543, + "learning_rate": 3.940378944744873e-05, + "loss": 0.1769, + "step": 10977 + }, + { + "epoch": 1.3017905846080873, + "grad_norm": 0.9708060629952727, + "learning_rate": 3.9401827463011465e-05, + "loss": 0.2216, + "step": 10978 + }, + { + "epoch": 1.3019091663702123, + "grad_norm": 1.009598848587565, + "learning_rate": 3.9399865345807e-05, + "loss": 0.2333, + "step": 10979 + }, + { + "epoch": 1.3020277481323372, + "grad_norm": 0.9939228181681855, + "learning_rate": 3.939790309585344e-05, + "loss": 0.232, + "step": 10980 + }, + { + "epoch": 1.3021463298944622, + "grad_norm": 0.9821716127518101, + "learning_rate": 3.939594071316887e-05, + "loss": 0.2662, + "step": 10981 + }, + { + "epoch": 1.3022649116565872, + "grad_norm": 0.9092369229597885, + "learning_rate": 3.939397819777138e-05, + "loss": 0.1931, + "step": 10982 + }, + { + "epoch": 1.3023834934187122, + "grad_norm": 0.9401910332321576, + "learning_rate": 3.939201554967906e-05, + "loss": 0.1702, + "step": 10983 + }, + { + "epoch": 1.3025020751808372, + "grad_norm": 1.1173591216225927, + "learning_rate": 3.939005276891001e-05, + "loss": 0.2367, + "step": 10984 + }, + { + "epoch": 1.3026206569429621, + "grad_norm": 0.9796743162363702, + "learning_rate": 3.938808985548231e-05, + "loss": 0.2115, + "step": 10985 + }, + { + "epoch": 1.3027392387050871, + "grad_norm": 1.0020710483908057, + "learning_rate": 3.9386126809414076e-05, + "loss": 0.1485, + "step": 10986 + }, + { + "epoch": 1.302857820467212, + "grad_norm": 1.7965607422536327, + "learning_rate": 3.938416363072339e-05, + "loss": 0.3728, + "step": 10987 + }, + { + "epoch": 1.302976402229337, + "grad_norm": 1.543581253018415, + "learning_rate": 3.938220031942835e-05, + "loss": 0.3855, + "step": 10988 + }, + { + "epoch": 1.303094983991462, + "grad_norm": 0.9895761832005385, + "learning_rate": 3.938023687554707e-05, + "loss": 0.2016, + "step": 10989 + }, + { + "epoch": 1.303213565753587, + "grad_norm": 1.5064454957267597, + "learning_rate": 3.9378273299097635e-05, + "loss": 0.3449, + "step": 10990 + }, + { + "epoch": 1.303332147515712, + "grad_norm": 0.9317585629842473, + "learning_rate": 3.937630959009815e-05, + "loss": 0.1831, + "step": 10991 + }, + { + "epoch": 1.303450729277837, + "grad_norm": 1.200933103085418, + "learning_rate": 3.937434574856672e-05, + "loss": 0.2867, + "step": 10992 + }, + { + "epoch": 1.303569311039962, + "grad_norm": 1.0706253583238232, + "learning_rate": 3.937238177452145e-05, + "loss": 0.2506, + "step": 10993 + }, + { + "epoch": 1.303687892802087, + "grad_norm": 1.0648096672750937, + "learning_rate": 3.9370417667980446e-05, + "loss": 0.2155, + "step": 10994 + }, + { + "epoch": 1.303806474564212, + "grad_norm": 1.1169240209925375, + "learning_rate": 3.936845342896181e-05, + "loss": 0.2264, + "step": 10995 + }, + { + "epoch": 1.303925056326337, + "grad_norm": 1.1864142136562092, + "learning_rate": 3.936648905748366e-05, + "loss": 0.2526, + "step": 10996 + }, + { + "epoch": 1.3040436380884621, + "grad_norm": 1.0362168935782063, + "learning_rate": 3.936452455356409e-05, + "loss": 0.2581, + "step": 10997 + }, + { + "epoch": 1.304162219850587, + "grad_norm": 1.049977026796436, + "learning_rate": 3.936255991722122e-05, + "loss": 0.2088, + "step": 10998 + }, + { + "epoch": 1.304280801612712, + "grad_norm": 0.931230089782395, + "learning_rate": 3.936059514847317e-05, + "loss": 0.181, + "step": 10999 + }, + { + "epoch": 1.3043993833748369, + "grad_norm": 0.9931849029603441, + "learning_rate": 3.9358630247338034e-05, + "loss": 0.1968, + "step": 11000 + }, + { + "epoch": 1.304517965136962, + "grad_norm": 1.0262459760439553, + "learning_rate": 3.9356665213833935e-05, + "loss": 0.2265, + "step": 11001 + }, + { + "epoch": 1.3046365468990868, + "grad_norm": 1.1955885270566438, + "learning_rate": 3.9354700047978986e-05, + "loss": 0.2763, + "step": 11002 + }, + { + "epoch": 1.304755128661212, + "grad_norm": 1.6458841590910434, + "learning_rate": 3.935273474979131e-05, + "loss": 0.2739, + "step": 11003 + }, + { + "epoch": 1.3048737104233368, + "grad_norm": 1.1783239350545998, + "learning_rate": 3.935076931928902e-05, + "loss": 0.2534, + "step": 11004 + }, + { + "epoch": 1.304992292185462, + "grad_norm": 1.8678454303605045, + "learning_rate": 3.9348803756490234e-05, + "loss": 0.5127, + "step": 11005 + }, + { + "epoch": 1.3051108739475867, + "grad_norm": 1.2591141646030697, + "learning_rate": 3.934683806141307e-05, + "loss": 0.2674, + "step": 11006 + }, + { + "epoch": 1.305229455709712, + "grad_norm": 1.4611253160457696, + "learning_rate": 3.934487223407566e-05, + "loss": 0.3019, + "step": 11007 + }, + { + "epoch": 1.3053480374718367, + "grad_norm": 0.902960384666571, + "learning_rate": 3.934290627449611e-05, + "loss": 0.1996, + "step": 11008 + }, + { + "epoch": 1.305466619233962, + "grad_norm": 1.0365335102156557, + "learning_rate": 3.9340940182692556e-05, + "loss": 0.2329, + "step": 11009 + }, + { + "epoch": 1.3055852009960869, + "grad_norm": 1.1667042248386066, + "learning_rate": 3.9338973958683115e-05, + "loss": 0.2371, + "step": 11010 + }, + { + "epoch": 1.3057037827582119, + "grad_norm": 1.1851575265309677, + "learning_rate": 3.933700760248593e-05, + "loss": 0.2399, + "step": 11011 + }, + { + "epoch": 1.3058223645203368, + "grad_norm": 1.03041721271189, + "learning_rate": 3.93350411141191e-05, + "loss": 0.2241, + "step": 11012 + }, + { + "epoch": 1.3059409462824618, + "grad_norm": 1.3439700183422978, + "learning_rate": 3.9333074493600786e-05, + "loss": 0.2866, + "step": 11013 + }, + { + "epoch": 1.3060595280445868, + "grad_norm": 1.0741181542259692, + "learning_rate": 3.933110774094909e-05, + "loss": 0.2257, + "step": 11014 + }, + { + "epoch": 1.3061781098067118, + "grad_norm": 1.2022991159271619, + "learning_rate": 3.932914085618217e-05, + "loss": 0.2792, + "step": 11015 + }, + { + "epoch": 1.3062966915688368, + "grad_norm": 1.1630013474836398, + "learning_rate": 3.932717383931812e-05, + "loss": 0.2461, + "step": 11016 + }, + { + "epoch": 1.3064152733309617, + "grad_norm": 0.9713579937469023, + "learning_rate": 3.932520669037511e-05, + "loss": 0.22, + "step": 11017 + }, + { + "epoch": 1.3065338550930867, + "grad_norm": 1.5403182852809536, + "learning_rate": 3.932323940937126e-05, + "loss": 0.3859, + "step": 11018 + }, + { + "epoch": 1.3066524368552117, + "grad_norm": 0.7076946892835153, + "learning_rate": 3.9321271996324706e-05, + "loss": 0.1386, + "step": 11019 + }, + { + "epoch": 1.3067710186173367, + "grad_norm": 1.4937714930209656, + "learning_rate": 3.931930445125358e-05, + "loss": 0.2549, + "step": 11020 + }, + { + "epoch": 1.3068896003794617, + "grad_norm": 1.03606494176738, + "learning_rate": 3.931733677417604e-05, + "loss": 0.2184, + "step": 11021 + }, + { + "epoch": 1.3070081821415866, + "grad_norm": 1.2388664837649044, + "learning_rate": 3.93153689651102e-05, + "loss": 0.209, + "step": 11022 + }, + { + "epoch": 1.3071267639037116, + "grad_norm": 0.9619298248351297, + "learning_rate": 3.931340102407421e-05, + "loss": 0.1912, + "step": 11023 + }, + { + "epoch": 1.3072453456658366, + "grad_norm": 1.1925713485668128, + "learning_rate": 3.9311432951086234e-05, + "loss": 0.2375, + "step": 11024 + }, + { + "epoch": 1.3073639274279616, + "grad_norm": 1.6650479425514484, + "learning_rate": 3.930946474616438e-05, + "loss": 0.3977, + "step": 11025 + }, + { + "epoch": 1.3074825091900866, + "grad_norm": 1.168668308668476, + "learning_rate": 3.9307496409326813e-05, + "loss": 0.2205, + "step": 11026 + }, + { + "epoch": 1.3076010909522116, + "grad_norm": 1.0875723042302883, + "learning_rate": 3.930552794059167e-05, + "loss": 0.2358, + "step": 11027 + }, + { + "epoch": 1.3077196727143365, + "grad_norm": 0.9141667471590318, + "learning_rate": 3.930355933997711e-05, + "loss": 0.2215, + "step": 11028 + }, + { + "epoch": 1.3078382544764615, + "grad_norm": 0.9600842082955442, + "learning_rate": 3.930159060750127e-05, + "loss": 0.1985, + "step": 11029 + }, + { + "epoch": 1.3079568362385865, + "grad_norm": 1.1549740524700642, + "learning_rate": 3.92996217431823e-05, + "loss": 0.2365, + "step": 11030 + }, + { + "epoch": 1.3080754180007115, + "grad_norm": 0.9633976416225218, + "learning_rate": 3.929765274703835e-05, + "loss": 0.2157, + "step": 11031 + }, + { + "epoch": 1.3081939997628365, + "grad_norm": 0.9757738777353772, + "learning_rate": 3.929568361908759e-05, + "loss": 0.1594, + "step": 11032 + }, + { + "epoch": 1.3083125815249614, + "grad_norm": 1.2527918634111816, + "learning_rate": 3.9293714359348135e-05, + "loss": 0.2412, + "step": 11033 + }, + { + "epoch": 1.3084311632870864, + "grad_norm": 1.3882991721045297, + "learning_rate": 3.929174496783818e-05, + "loss": 0.306, + "step": 11034 + }, + { + "epoch": 1.3085497450492114, + "grad_norm": 0.8849318706492632, + "learning_rate": 3.928977544457585e-05, + "loss": 0.2242, + "step": 11035 + }, + { + "epoch": 1.3086683268113364, + "grad_norm": 0.9885320485109715, + "learning_rate": 3.9287805789579326e-05, + "loss": 0.2017, + "step": 11036 + }, + { + "epoch": 1.3087869085734614, + "grad_norm": 0.863515234431776, + "learning_rate": 3.928583600286674e-05, + "loss": 0.2075, + "step": 11037 + }, + { + "epoch": 1.3089054903355863, + "grad_norm": 1.7540145162464784, + "learning_rate": 3.9283866084456265e-05, + "loss": 0.3283, + "step": 11038 + }, + { + "epoch": 1.3090240720977113, + "grad_norm": 1.500305891615403, + "learning_rate": 3.9281896034366075e-05, + "loss": 0.3512, + "step": 11039 + }, + { + "epoch": 1.3091426538598363, + "grad_norm": 0.8754008521091413, + "learning_rate": 3.927992585261431e-05, + "loss": 0.1753, + "step": 11040 + }, + { + "epoch": 1.3092612356219613, + "grad_norm": 1.0513044993372331, + "learning_rate": 3.927795553921914e-05, + "loss": 0.2164, + "step": 11041 + }, + { + "epoch": 1.3093798173840863, + "grad_norm": 1.266350547388877, + "learning_rate": 3.927598509419872e-05, + "loss": 0.2283, + "step": 11042 + }, + { + "epoch": 1.3094983991462112, + "grad_norm": 1.168973706337675, + "learning_rate": 3.927401451757123e-05, + "loss": 0.2615, + "step": 11043 + }, + { + "epoch": 1.3096169809083362, + "grad_norm": 1.603386605597296, + "learning_rate": 3.927204380935483e-05, + "loss": 0.3235, + "step": 11044 + }, + { + "epoch": 1.3097355626704612, + "grad_norm": 1.1697360939469852, + "learning_rate": 3.927007296956769e-05, + "loss": 0.2066, + "step": 11045 + }, + { + "epoch": 1.3098541444325862, + "grad_norm": 0.8139674193183849, + "learning_rate": 3.9268101998227976e-05, + "loss": 0.2113, + "step": 11046 + }, + { + "epoch": 1.3099727261947112, + "grad_norm": 0.9381711979380831, + "learning_rate": 3.926613089535386e-05, + "loss": 0.2164, + "step": 11047 + }, + { + "epoch": 1.3100913079568364, + "grad_norm": 1.110368195045124, + "learning_rate": 3.926415966096351e-05, + "loss": 0.254, + "step": 11048 + }, + { + "epoch": 1.3102098897189611, + "grad_norm": 1.4317934763089903, + "learning_rate": 3.92621882950751e-05, + "loss": 0.3357, + "step": 11049 + }, + { + "epoch": 1.3103284714810863, + "grad_norm": 1.3330238817727107, + "learning_rate": 3.9260216797706794e-05, + "loss": 0.2271, + "step": 11050 + }, + { + "epoch": 1.310447053243211, + "grad_norm": 1.8283583099543241, + "learning_rate": 3.925824516887679e-05, + "loss": 0.1875, + "step": 11051 + }, + { + "epoch": 1.3105656350053363, + "grad_norm": 1.112126808617559, + "learning_rate": 3.925627340860324e-05, + "loss": 0.2711, + "step": 11052 + }, + { + "epoch": 1.310684216767461, + "grad_norm": 1.0631748504201781, + "learning_rate": 3.9254301516904333e-05, + "loss": 0.2657, + "step": 11053 + }, + { + "epoch": 1.3108027985295863, + "grad_norm": 1.1373032644686027, + "learning_rate": 3.925232949379825e-05, + "loss": 0.2242, + "step": 11054 + }, + { + "epoch": 1.310921380291711, + "grad_norm": 1.0891894437720193, + "learning_rate": 3.9250357339303166e-05, + "loss": 0.2861, + "step": 11055 + }, + { + "epoch": 1.3110399620538362, + "grad_norm": 1.177565738783588, + "learning_rate": 3.9248385053437256e-05, + "loss": 0.3025, + "step": 11056 + }, + { + "epoch": 1.311158543815961, + "grad_norm": 1.021076895052079, + "learning_rate": 3.9246412636218715e-05, + "loss": 0.2359, + "step": 11057 + }, + { + "epoch": 1.3112771255780862, + "grad_norm": 0.7903176657034932, + "learning_rate": 3.9244440087665724e-05, + "loss": 0.2004, + "step": 11058 + }, + { + "epoch": 1.3113957073402112, + "grad_norm": 1.0865025883221286, + "learning_rate": 3.924246740779645e-05, + "loss": 0.2561, + "step": 11059 + }, + { + "epoch": 1.3115142891023361, + "grad_norm": 1.363566619894067, + "learning_rate": 3.9240494596629105e-05, + "loss": 0.3204, + "step": 11060 + }, + { + "epoch": 1.3116328708644611, + "grad_norm": 1.599123338318329, + "learning_rate": 3.923852165418185e-05, + "loss": 0.2995, + "step": 11061 + }, + { + "epoch": 1.311751452626586, + "grad_norm": 1.0673267631199486, + "learning_rate": 3.92365485804729e-05, + "loss": 0.2122, + "step": 11062 + }, + { + "epoch": 1.311870034388711, + "grad_norm": 0.8686340651229295, + "learning_rate": 3.923457537552042e-05, + "loss": 0.2269, + "step": 11063 + }, + { + "epoch": 1.311988616150836, + "grad_norm": 1.4547613599040894, + "learning_rate": 3.9232602039342614e-05, + "loss": 0.328, + "step": 11064 + }, + { + "epoch": 1.312107197912961, + "grad_norm": 0.917687321440443, + "learning_rate": 3.923062857195766e-05, + "loss": 0.2163, + "step": 11065 + }, + { + "epoch": 1.312225779675086, + "grad_norm": 1.8920434866946785, + "learning_rate": 3.9228654973383775e-05, + "loss": 0.4399, + "step": 11066 + }, + { + "epoch": 1.312344361437211, + "grad_norm": 1.3631238730417725, + "learning_rate": 3.922668124363914e-05, + "loss": 0.2841, + "step": 11067 + }, + { + "epoch": 1.312462943199336, + "grad_norm": 1.2049065224743407, + "learning_rate": 3.922470738274194e-05, + "loss": 0.2271, + "step": 11068 + }, + { + "epoch": 1.312581524961461, + "grad_norm": 0.7978080042036371, + "learning_rate": 3.922273339071039e-05, + "loss": 0.1822, + "step": 11069 + }, + { + "epoch": 1.312700106723586, + "grad_norm": 1.1817334623932707, + "learning_rate": 3.922075926756267e-05, + "loss": 0.2245, + "step": 11070 + }, + { + "epoch": 1.312818688485711, + "grad_norm": 0.9933488590652596, + "learning_rate": 3.9218785013316995e-05, + "loss": 0.1844, + "step": 11071 + }, + { + "epoch": 1.312937270247836, + "grad_norm": 1.1383745819867643, + "learning_rate": 3.9216810627991556e-05, + "loss": 0.2846, + "step": 11072 + }, + { + "epoch": 1.3130558520099609, + "grad_norm": 1.3634093720258977, + "learning_rate": 3.921483611160456e-05, + "loss": 0.2502, + "step": 11073 + }, + { + "epoch": 1.3131744337720859, + "grad_norm": 0.9425253275901578, + "learning_rate": 3.92128614641742e-05, + "loss": 0.1906, + "step": 11074 + }, + { + "epoch": 1.3132930155342109, + "grad_norm": 1.7546724133971325, + "learning_rate": 3.9210886685718696e-05, + "loss": 0.4319, + "step": 11075 + }, + { + "epoch": 1.3134115972963358, + "grad_norm": 0.8193982962349261, + "learning_rate": 3.9208911776256236e-05, + "loss": 0.1817, + "step": 11076 + }, + { + "epoch": 1.3135301790584608, + "grad_norm": 0.9706696553986108, + "learning_rate": 3.920693673580504e-05, + "loss": 0.2144, + "step": 11077 + }, + { + "epoch": 1.3136487608205858, + "grad_norm": 1.0387828538734425, + "learning_rate": 3.92049615643833e-05, + "loss": 0.2343, + "step": 11078 + }, + { + "epoch": 1.3137673425827108, + "grad_norm": 1.5092123672303897, + "learning_rate": 3.9202986262009245e-05, + "loss": 0.3625, + "step": 11079 + }, + { + "epoch": 1.3138859243448358, + "grad_norm": 1.0013589593752805, + "learning_rate": 3.920101082870106e-05, + "loss": 0.2126, + "step": 11080 + }, + { + "epoch": 1.3140045061069607, + "grad_norm": 1.455391233515485, + "learning_rate": 3.919903526447698e-05, + "loss": 0.3778, + "step": 11081 + }, + { + "epoch": 1.3141230878690857, + "grad_norm": 1.3306614495803373, + "learning_rate": 3.91970595693552e-05, + "loss": 0.2743, + "step": 11082 + }, + { + "epoch": 1.3142416696312107, + "grad_norm": 0.8854717817415506, + "learning_rate": 3.9195083743353946e-05, + "loss": 0.2086, + "step": 11083 + }, + { + "epoch": 1.3143602513933357, + "grad_norm": 0.9309228558840259, + "learning_rate": 3.9193107786491425e-05, + "loss": 0.1691, + "step": 11084 + }, + { + "epoch": 1.3144788331554607, + "grad_norm": 0.9639274883011228, + "learning_rate": 3.9191131698785854e-05, + "loss": 0.266, + "step": 11085 + }, + { + "epoch": 1.3145974149175856, + "grad_norm": 1.0099147993284139, + "learning_rate": 3.918915548025545e-05, + "loss": 0.2355, + "step": 11086 + }, + { + "epoch": 1.3147159966797106, + "grad_norm": 1.040950819401661, + "learning_rate": 3.918717913091843e-05, + "loss": 0.2711, + "step": 11087 + }, + { + "epoch": 1.3148345784418356, + "grad_norm": 0.8979440066396986, + "learning_rate": 3.918520265079302e-05, + "loss": 0.1599, + "step": 11088 + }, + { + "epoch": 1.3149531602039606, + "grad_norm": 1.4035668725970163, + "learning_rate": 3.918322603989743e-05, + "loss": 0.2931, + "step": 11089 + }, + { + "epoch": 1.3150717419660856, + "grad_norm": 1.016047967536112, + "learning_rate": 3.91812492982499e-05, + "loss": 0.1977, + "step": 11090 + }, + { + "epoch": 1.3151903237282105, + "grad_norm": 1.296860020455167, + "learning_rate": 3.917927242586864e-05, + "loss": 0.2835, + "step": 11091 + }, + { + "epoch": 1.3153089054903355, + "grad_norm": 1.132837159576494, + "learning_rate": 3.917729542277188e-05, + "loss": 0.244, + "step": 11092 + }, + { + "epoch": 1.3154274872524605, + "grad_norm": 0.9722535317061051, + "learning_rate": 3.9175318288977826e-05, + "loss": 0.2607, + "step": 11093 + }, + { + "epoch": 1.3155460690145855, + "grad_norm": 1.0111097029649245, + "learning_rate": 3.917334102450473e-05, + "loss": 0.2556, + "step": 11094 + }, + { + "epoch": 1.3156646507767105, + "grad_norm": 1.4376006768170924, + "learning_rate": 3.9171363629370804e-05, + "loss": 0.3211, + "step": 11095 + }, + { + "epoch": 1.3157832325388354, + "grad_norm": 0.9542360103268489, + "learning_rate": 3.916938610359429e-05, + "loss": 0.2045, + "step": 11096 + }, + { + "epoch": 1.3159018143009606, + "grad_norm": 1.1433301710210642, + "learning_rate": 3.916740844719341e-05, + "loss": 0.2653, + "step": 11097 + }, + { + "epoch": 1.3160203960630854, + "grad_norm": 1.2583909812749718, + "learning_rate": 3.9165430660186395e-05, + "loss": 0.3208, + "step": 11098 + }, + { + "epoch": 1.3161389778252106, + "grad_norm": 0.9847176257532655, + "learning_rate": 3.916345274259149e-05, + "loss": 0.2103, + "step": 11099 + }, + { + "epoch": 1.3162575595873354, + "grad_norm": 1.0560567427517145, + "learning_rate": 3.916147469442691e-05, + "loss": 0.2264, + "step": 11100 + }, + { + "epoch": 1.3163761413494606, + "grad_norm": 0.8502604719882291, + "learning_rate": 3.91594965157109e-05, + "loss": 0.2165, + "step": 11101 + }, + { + "epoch": 1.3164947231115853, + "grad_norm": 1.0935678469581984, + "learning_rate": 3.9157518206461694e-05, + "loss": 0.2414, + "step": 11102 + }, + { + "epoch": 1.3166133048737105, + "grad_norm": 1.0069150189229674, + "learning_rate": 3.915553976669754e-05, + "loss": 0.1997, + "step": 11103 + }, + { + "epoch": 1.3167318866358353, + "grad_norm": 1.0710528384740567, + "learning_rate": 3.9153561196436655e-05, + "loss": 0.202, + "step": 11104 + }, + { + "epoch": 1.3168504683979605, + "grad_norm": 1.166389040839692, + "learning_rate": 3.9151582495697305e-05, + "loss": 0.2152, + "step": 11105 + }, + { + "epoch": 1.3169690501600853, + "grad_norm": 0.9925540734786864, + "learning_rate": 3.91496036644977e-05, + "loss": 0.2319, + "step": 11106 + }, + { + "epoch": 1.3170876319222105, + "grad_norm": 1.117108087192255, + "learning_rate": 3.9147624702856125e-05, + "loss": 0.2334, + "step": 11107 + }, + { + "epoch": 1.3172062136843352, + "grad_norm": 1.190714820906956, + "learning_rate": 3.914564561079078e-05, + "loss": 0.2508, + "step": 11108 + }, + { + "epoch": 1.3173247954464604, + "grad_norm": 1.1961034923263305, + "learning_rate": 3.9143666388319933e-05, + "loss": 0.2563, + "step": 11109 + }, + { + "epoch": 1.3174433772085854, + "grad_norm": 1.4239545831995282, + "learning_rate": 3.914168703546183e-05, + "loss": 0.3119, + "step": 11110 + }, + { + "epoch": 1.3175619589707104, + "grad_norm": 0.8067246450069098, + "learning_rate": 3.913970755223472e-05, + "loss": 0.1634, + "step": 11111 + }, + { + "epoch": 1.3176805407328354, + "grad_norm": 1.0418585295439347, + "learning_rate": 3.913772793865683e-05, + "loss": 0.2279, + "step": 11112 + }, + { + "epoch": 1.3177991224949603, + "grad_norm": 1.2580135676688358, + "learning_rate": 3.913574819474644e-05, + "loss": 0.2971, + "step": 11113 + }, + { + "epoch": 1.3179177042570853, + "grad_norm": 1.2206671450237274, + "learning_rate": 3.913376832052178e-05, + "loss": 0.2903, + "step": 11114 + }, + { + "epoch": 1.3180362860192103, + "grad_norm": 0.8259670550928198, + "learning_rate": 3.913178831600111e-05, + "loss": 0.1669, + "step": 11115 + }, + { + "epoch": 1.3181548677813353, + "grad_norm": 1.154410999065161, + "learning_rate": 3.9129808181202676e-05, + "loss": 0.2217, + "step": 11116 + }, + { + "epoch": 1.3182734495434603, + "grad_norm": 1.1027419717510611, + "learning_rate": 3.912782791614473e-05, + "loss": 0.2292, + "step": 11117 + }, + { + "epoch": 1.3183920313055852, + "grad_norm": 1.1118561356501484, + "learning_rate": 3.912584752084555e-05, + "loss": 0.2497, + "step": 11118 + }, + { + "epoch": 1.3185106130677102, + "grad_norm": 1.168217443821604, + "learning_rate": 3.912386699532338e-05, + "loss": 0.2019, + "step": 11119 + }, + { + "epoch": 1.3186291948298352, + "grad_norm": 1.0279788244783217, + "learning_rate": 3.912188633959647e-05, + "loss": 0.2118, + "step": 11120 + }, + { + "epoch": 1.3187477765919602, + "grad_norm": 1.065350736431159, + "learning_rate": 3.911990555368308e-05, + "loss": 0.2389, + "step": 11121 + }, + { + "epoch": 1.3188663583540852, + "grad_norm": 1.7740945195203377, + "learning_rate": 3.911792463760149e-05, + "loss": 0.3953, + "step": 11122 + }, + { + "epoch": 1.3189849401162101, + "grad_norm": 1.2900500047263377, + "learning_rate": 3.911594359136993e-05, + "loss": 0.3559, + "step": 11123 + }, + { + "epoch": 1.3191035218783351, + "grad_norm": 1.1239349192456733, + "learning_rate": 3.911396241500669e-05, + "loss": 0.2205, + "step": 11124 + }, + { + "epoch": 1.31922210364046, + "grad_norm": 0.851071550010116, + "learning_rate": 3.911198110853003e-05, + "loss": 0.2023, + "step": 11125 + }, + { + "epoch": 1.319340685402585, + "grad_norm": 1.1291811921484072, + "learning_rate": 3.91099996719582e-05, + "loss": 0.246, + "step": 11126 + }, + { + "epoch": 1.31945926716471, + "grad_norm": 1.3932749920944223, + "learning_rate": 3.910801810530947e-05, + "loss": 0.2198, + "step": 11127 + }, + { + "epoch": 1.319577848926835, + "grad_norm": 1.182963377053831, + "learning_rate": 3.910603640860213e-05, + "loss": 0.2414, + "step": 11128 + }, + { + "epoch": 1.31969643068896, + "grad_norm": 1.3252350794263683, + "learning_rate": 3.910405458185442e-05, + "loss": 0.24, + "step": 11129 + }, + { + "epoch": 1.319815012451085, + "grad_norm": 0.8419268955558591, + "learning_rate": 3.910207262508463e-05, + "loss": 0.1393, + "step": 11130 + }, + { + "epoch": 1.31993359421321, + "grad_norm": 1.0711879766345491, + "learning_rate": 3.910009053831102e-05, + "loss": 0.2844, + "step": 11131 + }, + { + "epoch": 1.320052175975335, + "grad_norm": 1.1714430922191352, + "learning_rate": 3.909810832155186e-05, + "loss": 0.2655, + "step": 11132 + }, + { + "epoch": 1.32017075773746, + "grad_norm": 0.9792247598536512, + "learning_rate": 3.909612597482544e-05, + "loss": 0.2163, + "step": 11133 + }, + { + "epoch": 1.320289339499585, + "grad_norm": 0.8938720946758849, + "learning_rate": 3.9094143498150024e-05, + "loss": 0.2095, + "step": 11134 + }, + { + "epoch": 1.32040792126171, + "grad_norm": 1.0121530705308068, + "learning_rate": 3.909216089154388e-05, + "loss": 0.206, + "step": 11135 + }, + { + "epoch": 1.320526503023835, + "grad_norm": 1.6050876195324195, + "learning_rate": 3.90901781550253e-05, + "loss": 0.311, + "step": 11136 + }, + { + "epoch": 1.3206450847859599, + "grad_norm": 0.9355379678853118, + "learning_rate": 3.908819528861255e-05, + "loss": 0.2077, + "step": 11137 + }, + { + "epoch": 1.3207636665480849, + "grad_norm": 0.9297858434421828, + "learning_rate": 3.908621229232392e-05, + "loss": 0.1914, + "step": 11138 + }, + { + "epoch": 1.3208822483102098, + "grad_norm": 0.8550043052466346, + "learning_rate": 3.908422916617768e-05, + "loss": 0.1857, + "step": 11139 + }, + { + "epoch": 1.3210008300723348, + "grad_norm": 1.2616102303168062, + "learning_rate": 3.908224591019213e-05, + "loss": 0.2665, + "step": 11140 + }, + { + "epoch": 1.3211194118344598, + "grad_norm": 0.8969710874401429, + "learning_rate": 3.9080262524385536e-05, + "loss": 0.162, + "step": 11141 + }, + { + "epoch": 1.3212379935965848, + "grad_norm": 1.868253852967507, + "learning_rate": 3.907827900877619e-05, + "loss": 0.3202, + "step": 11142 + }, + { + "epoch": 1.3213565753587098, + "grad_norm": 1.0402102381527065, + "learning_rate": 3.907629536338236e-05, + "loss": 0.2029, + "step": 11143 + }, + { + "epoch": 1.3214751571208347, + "grad_norm": 1.1418412596259881, + "learning_rate": 3.9074311588222365e-05, + "loss": 0.1954, + "step": 11144 + }, + { + "epoch": 1.3215937388829597, + "grad_norm": 1.6388374811549664, + "learning_rate": 3.907232768331448e-05, + "loss": 0.3523, + "step": 11145 + }, + { + "epoch": 1.3217123206450847, + "grad_norm": 0.9468571963888505, + "learning_rate": 3.9070343648676976e-05, + "loss": 0.1764, + "step": 11146 + }, + { + "epoch": 1.3218309024072097, + "grad_norm": 1.043918059069733, + "learning_rate": 3.906835948432816e-05, + "loss": 0.22, + "step": 11147 + }, + { + "epoch": 1.3219494841693349, + "grad_norm": 1.1679314105015715, + "learning_rate": 3.906637519028633e-05, + "loss": 0.3077, + "step": 11148 + }, + { + "epoch": 1.3220680659314596, + "grad_norm": 1.1833998830447887, + "learning_rate": 3.906439076656977e-05, + "loss": 0.3058, + "step": 11149 + }, + { + "epoch": 1.3221866476935848, + "grad_norm": 0.966930902337545, + "learning_rate": 3.906240621319677e-05, + "loss": 0.2242, + "step": 11150 + }, + { + "epoch": 1.3223052294557096, + "grad_norm": 1.2967596647394277, + "learning_rate": 3.9060421530185623e-05, + "loss": 0.2763, + "step": 11151 + }, + { + "epoch": 1.3224238112178348, + "grad_norm": 1.4658769922684316, + "learning_rate": 3.9058436717554634e-05, + "loss": 0.3028, + "step": 11152 + }, + { + "epoch": 1.3225423929799596, + "grad_norm": 0.8857357137773915, + "learning_rate": 3.905645177532211e-05, + "loss": 0.1549, + "step": 11153 + }, + { + "epoch": 1.3226609747420848, + "grad_norm": 1.1049731273922976, + "learning_rate": 3.905446670350633e-05, + "loss": 0.2055, + "step": 11154 + }, + { + "epoch": 1.3227795565042095, + "grad_norm": 1.5346513620353337, + "learning_rate": 3.905248150212559e-05, + "loss": 0.4317, + "step": 11155 + }, + { + "epoch": 1.3228981382663347, + "grad_norm": 1.1371586792297637, + "learning_rate": 3.905049617119822e-05, + "loss": 0.1905, + "step": 11156 + }, + { + "epoch": 1.3230167200284595, + "grad_norm": 1.1571624989369873, + "learning_rate": 3.9048510710742505e-05, + "loss": 0.3097, + "step": 11157 + }, + { + "epoch": 1.3231353017905847, + "grad_norm": 1.1201197341147762, + "learning_rate": 3.9046525120776735e-05, + "loss": 0.2724, + "step": 11158 + }, + { + "epoch": 1.3232538835527097, + "grad_norm": 1.3189291942202692, + "learning_rate": 3.904453940131924e-05, + "loss": 0.2274, + "step": 11159 + }, + { + "epoch": 1.3233724653148347, + "grad_norm": 1.1363257131888023, + "learning_rate": 3.9042553552388304e-05, + "loss": 0.2552, + "step": 11160 + }, + { + "epoch": 1.3234910470769596, + "grad_norm": 1.0371309303330907, + "learning_rate": 3.9040567574002255e-05, + "loss": 0.2771, + "step": 11161 + }, + { + "epoch": 1.3236096288390846, + "grad_norm": 1.5136333042803067, + "learning_rate": 3.9038581466179386e-05, + "loss": 0.3229, + "step": 11162 + }, + { + "epoch": 1.3237282106012096, + "grad_norm": 0.8992216782241905, + "learning_rate": 3.903659522893801e-05, + "loss": 0.1991, + "step": 11163 + }, + { + "epoch": 1.3238467923633346, + "grad_norm": 1.2322927338098717, + "learning_rate": 3.903460886229644e-05, + "loss": 0.2677, + "step": 11164 + }, + { + "epoch": 1.3239653741254596, + "grad_norm": 0.9856368743401103, + "learning_rate": 3.903262236627299e-05, + "loss": 0.1896, + "step": 11165 + }, + { + "epoch": 1.3240839558875845, + "grad_norm": 1.1980670440785075, + "learning_rate": 3.9030635740885964e-05, + "loss": 0.2463, + "step": 11166 + }, + { + "epoch": 1.3242025376497095, + "grad_norm": 0.8476944387448164, + "learning_rate": 3.902864898615368e-05, + "loss": 0.1982, + "step": 11167 + }, + { + "epoch": 1.3243211194118345, + "grad_norm": 1.47158841706548, + "learning_rate": 3.9026662102094466e-05, + "loss": 0.3217, + "step": 11168 + }, + { + "epoch": 1.3244397011739595, + "grad_norm": 1.0298730310411413, + "learning_rate": 3.902467508872662e-05, + "loss": 0.2318, + "step": 11169 + }, + { + "epoch": 1.3245582829360845, + "grad_norm": 0.8706070593077753, + "learning_rate": 3.902268794606847e-05, + "loss": 0.1781, + "step": 11170 + }, + { + "epoch": 1.3246768646982094, + "grad_norm": 1.4207262526605546, + "learning_rate": 3.9020700674138334e-05, + "loss": 0.2787, + "step": 11171 + }, + { + "epoch": 1.3247954464603344, + "grad_norm": 1.0686297896799677, + "learning_rate": 3.901871327295453e-05, + "loss": 0.302, + "step": 11172 + }, + { + "epoch": 1.3249140282224594, + "grad_norm": 1.1692507743382499, + "learning_rate": 3.901672574253538e-05, + "loss": 0.3112, + "step": 11173 + }, + { + "epoch": 1.3250326099845844, + "grad_norm": 0.9730206222842813, + "learning_rate": 3.9014738082899204e-05, + "loss": 0.2001, + "step": 11174 + }, + { + "epoch": 1.3251511917467094, + "grad_norm": 1.1668338114185135, + "learning_rate": 3.901275029406434e-05, + "loss": 0.2391, + "step": 11175 + }, + { + "epoch": 1.3252697735088343, + "grad_norm": 0.9980371856304229, + "learning_rate": 3.901076237604909e-05, + "loss": 0.2374, + "step": 11176 + }, + { + "epoch": 1.3253883552709593, + "grad_norm": 1.1058211183288547, + "learning_rate": 3.90087743288718e-05, + "loss": 0.2514, + "step": 11177 + }, + { + "epoch": 1.3255069370330843, + "grad_norm": 0.814542366525708, + "learning_rate": 3.900678615255078e-05, + "loss": 0.1891, + "step": 11178 + }, + { + "epoch": 1.3256255187952093, + "grad_norm": 0.98966923127372, + "learning_rate": 3.900479784710438e-05, + "loss": 0.2742, + "step": 11179 + }, + { + "epoch": 1.3257441005573343, + "grad_norm": 0.7462805746393683, + "learning_rate": 3.9002809412550904e-05, + "loss": 0.1607, + "step": 11180 + }, + { + "epoch": 1.3258626823194593, + "grad_norm": 1.2398633190978576, + "learning_rate": 3.90008208489087e-05, + "loss": 0.2815, + "step": 11181 + }, + { + "epoch": 1.3259812640815842, + "grad_norm": 1.232846255983191, + "learning_rate": 3.8998832156196105e-05, + "loss": 0.283, + "step": 11182 + }, + { + "epoch": 1.3260998458437092, + "grad_norm": 0.8954867942803527, + "learning_rate": 3.899684333443144e-05, + "loss": 0.2049, + "step": 11183 + }, + { + "epoch": 1.3262184276058342, + "grad_norm": 1.0221004531487041, + "learning_rate": 3.899485438363304e-05, + "loss": 0.1974, + "step": 11184 + }, + { + "epoch": 1.3263370093679592, + "grad_norm": 0.93661717740525, + "learning_rate": 3.899286530381925e-05, + "loss": 0.2107, + "step": 11185 + }, + { + "epoch": 1.3264555911300842, + "grad_norm": 1.2537972839120064, + "learning_rate": 3.89908760950084e-05, + "loss": 0.3179, + "step": 11186 + }, + { + "epoch": 1.3265741728922091, + "grad_norm": 1.5728288947230045, + "learning_rate": 3.898888675721882e-05, + "loss": 0.2736, + "step": 11187 + }, + { + "epoch": 1.3266927546543341, + "grad_norm": 1.4314146265329413, + "learning_rate": 3.898689729046887e-05, + "loss": 0.3051, + "step": 11188 + }, + { + "epoch": 1.326811336416459, + "grad_norm": 1.3162707797449178, + "learning_rate": 3.898490769477688e-05, + "loss": 0.3067, + "step": 11189 + }, + { + "epoch": 1.326929918178584, + "grad_norm": 0.9717003695123547, + "learning_rate": 3.898291797016118e-05, + "loss": 0.2164, + "step": 11190 + }, + { + "epoch": 1.327048499940709, + "grad_norm": 1.0948849456320158, + "learning_rate": 3.898092811664013e-05, + "loss": 0.3064, + "step": 11191 + }, + { + "epoch": 1.327167081702834, + "grad_norm": 1.3771509115857439, + "learning_rate": 3.897893813423207e-05, + "loss": 0.292, + "step": 11192 + }, + { + "epoch": 1.327285663464959, + "grad_norm": 1.0252068657863302, + "learning_rate": 3.897694802295533e-05, + "loss": 0.2133, + "step": 11193 + }, + { + "epoch": 1.327404245227084, + "grad_norm": 0.9909389666011352, + "learning_rate": 3.897495778282828e-05, + "loss": 0.2067, + "step": 11194 + }, + { + "epoch": 1.327522826989209, + "grad_norm": 1.2635780433384007, + "learning_rate": 3.8972967413869254e-05, + "loss": 0.2325, + "step": 11195 + }, + { + "epoch": 1.327641408751334, + "grad_norm": 1.125877656536, + "learning_rate": 3.89709769160966e-05, + "loss": 0.25, + "step": 11196 + }, + { + "epoch": 1.3277599905134592, + "grad_norm": 1.0371067657512503, + "learning_rate": 3.8968986289528676e-05, + "loss": 0.2666, + "step": 11197 + }, + { + "epoch": 1.327878572275584, + "grad_norm": 0.9289754136664076, + "learning_rate": 3.896699553418382e-05, + "loss": 0.2024, + "step": 11198 + }, + { + "epoch": 1.3279971540377091, + "grad_norm": 1.58602006852646, + "learning_rate": 3.896500465008041e-05, + "loss": 0.2498, + "step": 11199 + }, + { + "epoch": 1.3281157357998339, + "grad_norm": 1.3338355380105875, + "learning_rate": 3.8963013637236766e-05, + "loss": 0.2906, + "step": 11200 + }, + { + "epoch": 1.328234317561959, + "grad_norm": 1.2582563349818527, + "learning_rate": 3.896102249567126e-05, + "loss": 0.2398, + "step": 11201 + }, + { + "epoch": 1.3283528993240838, + "grad_norm": 1.3798226878058262, + "learning_rate": 3.895903122540225e-05, + "loss": 0.2738, + "step": 11202 + }, + { + "epoch": 1.328471481086209, + "grad_norm": 1.2658459017053347, + "learning_rate": 3.8957039826448095e-05, + "loss": 0.2158, + "step": 11203 + }, + { + "epoch": 1.3285900628483338, + "grad_norm": 0.8943762040453472, + "learning_rate": 3.8955048298827146e-05, + "loss": 0.1746, + "step": 11204 + }, + { + "epoch": 1.328708644610459, + "grad_norm": 1.7916217925231084, + "learning_rate": 3.895305664255776e-05, + "loss": 0.4186, + "step": 11205 + }, + { + "epoch": 1.3288272263725838, + "grad_norm": 1.4504184329701393, + "learning_rate": 3.895106485765831e-05, + "loss": 0.3588, + "step": 11206 + }, + { + "epoch": 1.328945808134709, + "grad_norm": 1.063908695127009, + "learning_rate": 3.894907294414714e-05, + "loss": 0.1859, + "step": 11207 + }, + { + "epoch": 1.3290643898968337, + "grad_norm": 1.152374106520925, + "learning_rate": 3.894708090204263e-05, + "loss": 0.2801, + "step": 11208 + }, + { + "epoch": 1.329182971658959, + "grad_norm": 1.0283576636217397, + "learning_rate": 3.894508873136313e-05, + "loss": 0.2404, + "step": 11209 + }, + { + "epoch": 1.329301553421084, + "grad_norm": 1.2286602507556899, + "learning_rate": 3.894309643212701e-05, + "loss": 0.2117, + "step": 11210 + }, + { + "epoch": 1.329420135183209, + "grad_norm": 1.3524100330640447, + "learning_rate": 3.8941104004352655e-05, + "loss": 0.2606, + "step": 11211 + }, + { + "epoch": 1.3295387169453339, + "grad_norm": 1.1824870501575757, + "learning_rate": 3.8939111448058404e-05, + "loss": 0.224, + "step": 11212 + }, + { + "epoch": 1.3296572987074589, + "grad_norm": 0.9078572172656802, + "learning_rate": 3.893711876326265e-05, + "loss": 0.1746, + "step": 11213 + }, + { + "epoch": 1.3297758804695838, + "grad_norm": 1.0956395918222634, + "learning_rate": 3.893512594998374e-05, + "loss": 0.2466, + "step": 11214 + }, + { + "epoch": 1.3298944622317088, + "grad_norm": 0.9727696491368, + "learning_rate": 3.893313300824006e-05, + "loss": 0.2101, + "step": 11215 + }, + { + "epoch": 1.3300130439938338, + "grad_norm": 0.9793900576167454, + "learning_rate": 3.8931139938049975e-05, + "loss": 0.2394, + "step": 11216 + }, + { + "epoch": 1.3301316257559588, + "grad_norm": 1.4516921665387146, + "learning_rate": 3.892914673943188e-05, + "loss": 0.4469, + "step": 11217 + }, + { + "epoch": 1.3302502075180838, + "grad_norm": 0.9181860852591295, + "learning_rate": 3.892715341240411e-05, + "loss": 0.2238, + "step": 11218 + }, + { + "epoch": 1.3303687892802087, + "grad_norm": 1.0330185531287335, + "learning_rate": 3.892515995698508e-05, + "loss": 0.2292, + "step": 11219 + }, + { + "epoch": 1.3304873710423337, + "grad_norm": 1.0152625107201747, + "learning_rate": 3.892316637319315e-05, + "loss": 0.204, + "step": 11220 + }, + { + "epoch": 1.3306059528044587, + "grad_norm": 1.1931067144123002, + "learning_rate": 3.8921172661046704e-05, + "loss": 0.2631, + "step": 11221 + }, + { + "epoch": 1.3307245345665837, + "grad_norm": 1.1448001865217976, + "learning_rate": 3.891917882056411e-05, + "loss": 0.2791, + "step": 11222 + }, + { + "epoch": 1.3308431163287087, + "grad_norm": 1.1847539294720653, + "learning_rate": 3.891718485176375e-05, + "loss": 0.1951, + "step": 11223 + }, + { + "epoch": 1.3309616980908336, + "grad_norm": 1.0216674629133529, + "learning_rate": 3.891519075466402e-05, + "loss": 0.2178, + "step": 11224 + }, + { + "epoch": 1.3310802798529586, + "grad_norm": 1.009929664738147, + "learning_rate": 3.891319652928329e-05, + "loss": 0.2413, + "step": 11225 + }, + { + "epoch": 1.3311988616150836, + "grad_norm": 1.0409785535547231, + "learning_rate": 3.891120217563996e-05, + "loss": 0.1977, + "step": 11226 + }, + { + "epoch": 1.3313174433772086, + "grad_norm": 1.1472355774891425, + "learning_rate": 3.890920769375239e-05, + "loss": 0.2539, + "step": 11227 + }, + { + "epoch": 1.3314360251393336, + "grad_norm": 1.3276148382084325, + "learning_rate": 3.8907213083638996e-05, + "loss": 0.2639, + "step": 11228 + }, + { + "epoch": 1.3315546069014585, + "grad_norm": 1.3518018683738118, + "learning_rate": 3.8905218345318145e-05, + "loss": 0.3331, + "step": 11229 + }, + { + "epoch": 1.3316731886635835, + "grad_norm": 0.8234501011890035, + "learning_rate": 3.890322347880824e-05, + "loss": 0.182, + "step": 11230 + }, + { + "epoch": 1.3317917704257085, + "grad_norm": 0.9277818973713268, + "learning_rate": 3.890122848412765e-05, + "loss": 0.2058, + "step": 11231 + }, + { + "epoch": 1.3319103521878335, + "grad_norm": 1.29952062849528, + "learning_rate": 3.889923336129479e-05, + "loss": 0.3209, + "step": 11232 + }, + { + "epoch": 1.3320289339499585, + "grad_norm": 1.1602620814774403, + "learning_rate": 3.8897238110328035e-05, + "loss": 0.2327, + "step": 11233 + }, + { + "epoch": 1.3321475157120835, + "grad_norm": 1.053312910796751, + "learning_rate": 3.8895242731245795e-05, + "loss": 0.2567, + "step": 11234 + }, + { + "epoch": 1.3322660974742084, + "grad_norm": 0.830245580918566, + "learning_rate": 3.8893247224066456e-05, + "loss": 0.1902, + "step": 11235 + }, + { + "epoch": 1.3323846792363334, + "grad_norm": 0.9868038362806985, + "learning_rate": 3.889125158880841e-05, + "loss": 0.2272, + "step": 11236 + }, + { + "epoch": 1.3325032609984584, + "grad_norm": 1.083798719160613, + "learning_rate": 3.888925582549006e-05, + "loss": 0.2216, + "step": 11237 + }, + { + "epoch": 1.3326218427605834, + "grad_norm": 0.8347503057780409, + "learning_rate": 3.88872599341298e-05, + "loss": 0.177, + "step": 11238 + }, + { + "epoch": 1.3327404245227084, + "grad_norm": 0.9595527061280946, + "learning_rate": 3.888526391474604e-05, + "loss": 0.2268, + "step": 11239 + }, + { + "epoch": 1.3328590062848333, + "grad_norm": 1.3962729285315243, + "learning_rate": 3.888326776735717e-05, + "loss": 0.3504, + "step": 11240 + }, + { + "epoch": 1.3329775880469583, + "grad_norm": 1.3932683405037798, + "learning_rate": 3.8881271491981595e-05, + "loss": 0.3092, + "step": 11241 + }, + { + "epoch": 1.3330961698090833, + "grad_norm": 0.8335247694979706, + "learning_rate": 3.8879275088637726e-05, + "loss": 0.1445, + "step": 11242 + }, + { + "epoch": 1.3332147515712083, + "grad_norm": 0.7837249590522793, + "learning_rate": 3.8877278557343953e-05, + "loss": 0.1632, + "step": 11243 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 1.3284391978526873, + "learning_rate": 3.8875281898118695e-05, + "loss": 0.2674, + "step": 11244 + }, + { + "epoch": 1.3334519150954582, + "grad_norm": 0.992701857047241, + "learning_rate": 3.887328511098035e-05, + "loss": 0.2044, + "step": 11245 + }, + { + "epoch": 1.3335704968575832, + "grad_norm": 1.2452310236846582, + "learning_rate": 3.8871288195947326e-05, + "loss": 0.2911, + "step": 11246 + }, + { + "epoch": 1.3336890786197082, + "grad_norm": 0.7335653048920354, + "learning_rate": 3.886929115303804e-05, + "loss": 0.1407, + "step": 11247 + }, + { + "epoch": 1.3338076603818334, + "grad_norm": 1.7904225999663235, + "learning_rate": 3.886729398227089e-05, + "loss": 0.3361, + "step": 11248 + }, + { + "epoch": 1.3339262421439582, + "grad_norm": 0.8677102444193976, + "learning_rate": 3.88652966836643e-05, + "loss": 0.1532, + "step": 11249 + }, + { + "epoch": 1.3340448239060834, + "grad_norm": 0.970815483022368, + "learning_rate": 3.886329925723667e-05, + "loss": 0.1918, + "step": 11250 + }, + { + "epoch": 1.3341634056682081, + "grad_norm": 1.012658791404795, + "learning_rate": 3.8861301703006436e-05, + "loss": 0.1698, + "step": 11251 + }, + { + "epoch": 1.3342819874303333, + "grad_norm": 1.1246213908167446, + "learning_rate": 3.885930402099199e-05, + "loss": 0.1729, + "step": 11252 + }, + { + "epoch": 1.334400569192458, + "grad_norm": 0.9725757240337546, + "learning_rate": 3.885730621121175e-05, + "loss": 0.1859, + "step": 11253 + }, + { + "epoch": 1.3345191509545833, + "grad_norm": 1.5709184863805237, + "learning_rate": 3.8855308273684154e-05, + "loss": 0.3373, + "step": 11254 + }, + { + "epoch": 1.334637732716708, + "grad_norm": 1.253627156440161, + "learning_rate": 3.88533102084276e-05, + "loss": 0.2681, + "step": 11255 + }, + { + "epoch": 1.3347563144788333, + "grad_norm": 1.6703355699729674, + "learning_rate": 3.8851312015460504e-05, + "loss": 0.4387, + "step": 11256 + }, + { + "epoch": 1.334874896240958, + "grad_norm": 0.8692713520733102, + "learning_rate": 3.884931369480131e-05, + "loss": 0.2175, + "step": 11257 + }, + { + "epoch": 1.3349934780030832, + "grad_norm": 1.0531562053632904, + "learning_rate": 3.8847315246468425e-05, + "loss": 0.1719, + "step": 11258 + }, + { + "epoch": 1.3351120597652082, + "grad_norm": 1.188939968258312, + "learning_rate": 3.884531667048027e-05, + "loss": 0.2454, + "step": 11259 + }, + { + "epoch": 1.3352306415273332, + "grad_norm": 1.1542499281706509, + "learning_rate": 3.884331796685527e-05, + "loss": 0.2549, + "step": 11260 + }, + { + "epoch": 1.3353492232894582, + "grad_norm": 1.2036655887981451, + "learning_rate": 3.884131913561187e-05, + "loss": 0.2418, + "step": 11261 + }, + { + "epoch": 1.3354678050515831, + "grad_norm": 2.2302916028728417, + "learning_rate": 3.8839320176768465e-05, + "loss": 0.4717, + "step": 11262 + }, + { + "epoch": 1.3355863868137081, + "grad_norm": 0.8781422164635321, + "learning_rate": 3.88373210903435e-05, + "loss": 0.2021, + "step": 11263 + }, + { + "epoch": 1.335704968575833, + "grad_norm": 0.9116620045616166, + "learning_rate": 3.8835321876355407e-05, + "loss": 0.1792, + "step": 11264 + }, + { + "epoch": 1.335823550337958, + "grad_norm": 0.8887350135988367, + "learning_rate": 3.883332253482261e-05, + "loss": 0.2056, + "step": 11265 + }, + { + "epoch": 1.335942132100083, + "grad_norm": 1.1686091102705327, + "learning_rate": 3.883132306576355e-05, + "loss": 0.2434, + "step": 11266 + }, + { + "epoch": 1.336060713862208, + "grad_norm": 1.4135044068997948, + "learning_rate": 3.882932346919664e-05, + "loss": 0.3566, + "step": 11267 + }, + { + "epoch": 1.336179295624333, + "grad_norm": 1.1352004610235988, + "learning_rate": 3.8827323745140335e-05, + "loss": 0.2483, + "step": 11268 + }, + { + "epoch": 1.336297877386458, + "grad_norm": 0.76470348283328, + "learning_rate": 3.882532389361306e-05, + "loss": 0.2041, + "step": 11269 + }, + { + "epoch": 1.336416459148583, + "grad_norm": 0.9922064509909124, + "learning_rate": 3.882332391463325e-05, + "loss": 0.213, + "step": 11270 + }, + { + "epoch": 1.336535040910708, + "grad_norm": 1.3803752073725748, + "learning_rate": 3.882132380821935e-05, + "loss": 0.2415, + "step": 11271 + }, + { + "epoch": 1.336653622672833, + "grad_norm": 1.1780003722908097, + "learning_rate": 3.8819323574389785e-05, + "loss": 0.2849, + "step": 11272 + }, + { + "epoch": 1.336772204434958, + "grad_norm": 1.1499848683718805, + "learning_rate": 3.881732321316301e-05, + "loss": 0.2097, + "step": 11273 + }, + { + "epoch": 1.336890786197083, + "grad_norm": 1.080363129310601, + "learning_rate": 3.881532272455746e-05, + "loss": 0.2452, + "step": 11274 + }, + { + "epoch": 1.3370093679592079, + "grad_norm": 1.0365344969424422, + "learning_rate": 3.881332210859158e-05, + "loss": 0.218, + "step": 11275 + }, + { + "epoch": 1.3371279497213329, + "grad_norm": 1.0682448782830036, + "learning_rate": 3.8811321365283795e-05, + "loss": 0.1961, + "step": 11276 + }, + { + "epoch": 1.3372465314834578, + "grad_norm": 1.2393598476928143, + "learning_rate": 3.880932049465257e-05, + "loss": 0.2748, + "step": 11277 + }, + { + "epoch": 1.3373651132455828, + "grad_norm": 0.9513710483996061, + "learning_rate": 3.8807319496716345e-05, + "loss": 0.2445, + "step": 11278 + }, + { + "epoch": 1.3374836950077078, + "grad_norm": 1.111162744198963, + "learning_rate": 3.880531837149357e-05, + "loss": 0.2858, + "step": 11279 + }, + { + "epoch": 1.3376022767698328, + "grad_norm": 1.005747160062515, + "learning_rate": 3.880331711900268e-05, + "loss": 0.2245, + "step": 11280 + }, + { + "epoch": 1.3377208585319578, + "grad_norm": 0.8062097509414904, + "learning_rate": 3.880131573926213e-05, + "loss": 0.166, + "step": 11281 + }, + { + "epoch": 1.3378394402940827, + "grad_norm": 0.9556260969318162, + "learning_rate": 3.879931423229039e-05, + "loss": 0.1947, + "step": 11282 + }, + { + "epoch": 1.3379580220562077, + "grad_norm": 1.1212506081955949, + "learning_rate": 3.879731259810588e-05, + "loss": 0.2737, + "step": 11283 + }, + { + "epoch": 1.3380766038183327, + "grad_norm": 1.739173350783522, + "learning_rate": 3.8795310836727075e-05, + "loss": 0.3432, + "step": 11284 + }, + { + "epoch": 1.3381951855804577, + "grad_norm": 0.9166752738505088, + "learning_rate": 3.879330894817242e-05, + "loss": 0.193, + "step": 11285 + }, + { + "epoch": 1.3383137673425827, + "grad_norm": 0.9765640424530158, + "learning_rate": 3.879130693246037e-05, + "loss": 0.2067, + "step": 11286 + }, + { + "epoch": 1.3384323491047077, + "grad_norm": 0.901567343393729, + "learning_rate": 3.878930478960938e-05, + "loss": 0.1536, + "step": 11287 + }, + { + "epoch": 1.3385509308668326, + "grad_norm": 1.5418684006745853, + "learning_rate": 3.8787302519637914e-05, + "loss": 0.3063, + "step": 11288 + }, + { + "epoch": 1.3386695126289576, + "grad_norm": 0.9845392278434172, + "learning_rate": 3.8785300122564423e-05, + "loss": 0.1894, + "step": 11289 + }, + { + "epoch": 1.3387880943910826, + "grad_norm": 1.148671952840746, + "learning_rate": 3.878329759840736e-05, + "loss": 0.2334, + "step": 11290 + }, + { + "epoch": 1.3389066761532076, + "grad_norm": 1.371663308902444, + "learning_rate": 3.878129494718521e-05, + "loss": 0.2517, + "step": 11291 + }, + { + "epoch": 1.3390252579153326, + "grad_norm": 0.9274019524173648, + "learning_rate": 3.877929216891642e-05, + "loss": 0.1688, + "step": 11292 + }, + { + "epoch": 1.3391438396774575, + "grad_norm": 0.9408543554200137, + "learning_rate": 3.8777289263619444e-05, + "loss": 0.1895, + "step": 11293 + }, + { + "epoch": 1.3392624214395825, + "grad_norm": 1.2408909932500836, + "learning_rate": 3.877528623131276e-05, + "loss": 0.2811, + "step": 11294 + }, + { + "epoch": 1.3393810032017075, + "grad_norm": 0.8675798781545533, + "learning_rate": 3.877328307201483e-05, + "loss": 0.168, + "step": 11295 + }, + { + "epoch": 1.3394995849638325, + "grad_norm": 0.9478996748363286, + "learning_rate": 3.877127978574412e-05, + "loss": 0.2533, + "step": 11296 + }, + { + "epoch": 1.3396181667259577, + "grad_norm": 1.4484043257380494, + "learning_rate": 3.8769276372519096e-05, + "loss": 0.2429, + "step": 11297 + }, + { + "epoch": 1.3397367484880824, + "grad_norm": 1.0754606199095988, + "learning_rate": 3.876727283235823e-05, + "loss": 0.1772, + "step": 11298 + }, + { + "epoch": 1.3398553302502076, + "grad_norm": 1.1222928923049134, + "learning_rate": 3.8765269165279996e-05, + "loss": 0.2131, + "step": 11299 + }, + { + "epoch": 1.3399739120123324, + "grad_norm": 0.8496479001579185, + "learning_rate": 3.876326537130285e-05, + "loss": 0.1566, + "step": 11300 + }, + { + "epoch": 1.3400924937744576, + "grad_norm": 1.3498337021953544, + "learning_rate": 3.876126145044529e-05, + "loss": 0.2472, + "step": 11301 + }, + { + "epoch": 1.3402110755365824, + "grad_norm": 0.993948007357881, + "learning_rate": 3.8759257402725754e-05, + "loss": 0.2336, + "step": 11302 + }, + { + "epoch": 1.3403296572987076, + "grad_norm": 1.0710542913698144, + "learning_rate": 3.875725322816276e-05, + "loss": 0.2273, + "step": 11303 + }, + { + "epoch": 1.3404482390608323, + "grad_norm": 1.3599400447407781, + "learning_rate": 3.8755248926774746e-05, + "loss": 0.2475, + "step": 11304 + }, + { + "epoch": 1.3405668208229575, + "grad_norm": 1.1222051854444417, + "learning_rate": 3.875324449858021e-05, + "loss": 0.2136, + "step": 11305 + }, + { + "epoch": 1.3406854025850823, + "grad_norm": 1.060425263752431, + "learning_rate": 3.875123994359762e-05, + "loss": 0.2757, + "step": 11306 + }, + { + "epoch": 1.3408039843472075, + "grad_norm": 1.0919499683526668, + "learning_rate": 3.874923526184547e-05, + "loss": 0.2002, + "step": 11307 + }, + { + "epoch": 1.3409225661093322, + "grad_norm": 1.0642931248160736, + "learning_rate": 3.874723045334222e-05, + "loss": 0.2324, + "step": 11308 + }, + { + "epoch": 1.3410411478714575, + "grad_norm": 1.124702511248524, + "learning_rate": 3.874522551810637e-05, + "loss": 0.2085, + "step": 11309 + }, + { + "epoch": 1.3411597296335824, + "grad_norm": 1.202544248780369, + "learning_rate": 3.8743220456156394e-05, + "loss": 0.2283, + "step": 11310 + }, + { + "epoch": 1.3412783113957074, + "grad_norm": 1.1278686163628868, + "learning_rate": 3.8741215267510774e-05, + "loss": 0.2649, + "step": 11311 + }, + { + "epoch": 1.3413968931578324, + "grad_norm": 1.1755367294967765, + "learning_rate": 3.8739209952188016e-05, + "loss": 0.2057, + "step": 11312 + }, + { + "epoch": 1.3415154749199574, + "grad_norm": 1.3943350174542728, + "learning_rate": 3.873720451020658e-05, + "loss": 0.3089, + "step": 11313 + }, + { + "epoch": 1.3416340566820824, + "grad_norm": 1.0356761183256773, + "learning_rate": 3.873519894158496e-05, + "loss": 0.1948, + "step": 11314 + }, + { + "epoch": 1.3417526384442073, + "grad_norm": 0.9850707570593819, + "learning_rate": 3.873319324634165e-05, + "loss": 0.248, + "step": 11315 + }, + { + "epoch": 1.3418712202063323, + "grad_norm": 1.0574554017676447, + "learning_rate": 3.8731187424495146e-05, + "loss": 0.2225, + "step": 11316 + }, + { + "epoch": 1.3419898019684573, + "grad_norm": 1.3256186803482177, + "learning_rate": 3.872918147606393e-05, + "loss": 0.3169, + "step": 11317 + }, + { + "epoch": 1.3421083837305823, + "grad_norm": 0.9716412614070841, + "learning_rate": 3.87271754010665e-05, + "loss": 0.2113, + "step": 11318 + }, + { + "epoch": 1.3422269654927073, + "grad_norm": 1.2149523214277407, + "learning_rate": 3.8725169199521337e-05, + "loss": 0.2758, + "step": 11319 + }, + { + "epoch": 1.3423455472548322, + "grad_norm": 1.1028203157003447, + "learning_rate": 3.872316287144695e-05, + "loss": 0.2388, + "step": 11320 + }, + { + "epoch": 1.3424641290169572, + "grad_norm": 0.7664476288888664, + "learning_rate": 3.872115641686183e-05, + "loss": 0.2028, + "step": 11321 + }, + { + "epoch": 1.3425827107790822, + "grad_norm": 1.221894212915627, + "learning_rate": 3.871914983578447e-05, + "loss": 0.263, + "step": 11322 + }, + { + "epoch": 1.3427012925412072, + "grad_norm": 1.0800877832243465, + "learning_rate": 3.8717143128233375e-05, + "loss": 0.1931, + "step": 11323 + }, + { + "epoch": 1.3428198743033322, + "grad_norm": 1.4751327660435922, + "learning_rate": 3.871513629422704e-05, + "loss": 0.3222, + "step": 11324 + }, + { + "epoch": 1.3429384560654571, + "grad_norm": 1.2998631384232902, + "learning_rate": 3.8713129333783965e-05, + "loss": 0.3299, + "step": 11325 + }, + { + "epoch": 1.3430570378275821, + "grad_norm": 1.209338552274968, + "learning_rate": 3.871112224692266e-05, + "loss": 0.2394, + "step": 11326 + }, + { + "epoch": 1.343175619589707, + "grad_norm": 1.0163696750028253, + "learning_rate": 3.870911503366161e-05, + "loss": 0.1864, + "step": 11327 + }, + { + "epoch": 1.343294201351832, + "grad_norm": 0.9901762702210827, + "learning_rate": 3.8707107694019346e-05, + "loss": 0.2212, + "step": 11328 + }, + { + "epoch": 1.343412783113957, + "grad_norm": 1.099552960998955, + "learning_rate": 3.870510022801435e-05, + "loss": 0.2792, + "step": 11329 + }, + { + "epoch": 1.343531364876082, + "grad_norm": 1.0047563598139304, + "learning_rate": 3.870309263566514e-05, + "loss": 0.2313, + "step": 11330 + }, + { + "epoch": 1.343649946638207, + "grad_norm": 1.1638245034528119, + "learning_rate": 3.8701084916990216e-05, + "loss": 0.2668, + "step": 11331 + }, + { + "epoch": 1.343768528400332, + "grad_norm": 0.9574047059617741, + "learning_rate": 3.869907707200809e-05, + "loss": 0.1951, + "step": 11332 + }, + { + "epoch": 1.343887110162457, + "grad_norm": 1.063473166655854, + "learning_rate": 3.869706910073727e-05, + "loss": 0.173, + "step": 11333 + }, + { + "epoch": 1.344005691924582, + "grad_norm": 1.6178873368123503, + "learning_rate": 3.869506100319628e-05, + "loss": 0.316, + "step": 11334 + }, + { + "epoch": 1.344124273686707, + "grad_norm": 1.1821358700715576, + "learning_rate": 3.8693052779403613e-05, + "loss": 0.2505, + "step": 11335 + }, + { + "epoch": 1.344242855448832, + "grad_norm": 1.0610453122840684, + "learning_rate": 3.8691044429377786e-05, + "loss": 0.2024, + "step": 11336 + }, + { + "epoch": 1.344361437210957, + "grad_norm": 1.2133089250425866, + "learning_rate": 3.868903595313733e-05, + "loss": 0.2575, + "step": 11337 + }, + { + "epoch": 1.344480018973082, + "grad_norm": 1.3509699021809551, + "learning_rate": 3.868702735070074e-05, + "loss": 0.2935, + "step": 11338 + }, + { + "epoch": 1.3445986007352069, + "grad_norm": 1.303693410970034, + "learning_rate": 3.8685018622086545e-05, + "loss": 0.2496, + "step": 11339 + }, + { + "epoch": 1.3447171824973319, + "grad_norm": 0.9019047192998477, + "learning_rate": 3.868300976731326e-05, + "loss": 0.2097, + "step": 11340 + }, + { + "epoch": 1.3448357642594568, + "grad_norm": 0.9753806188134191, + "learning_rate": 3.86810007863994e-05, + "loss": 0.2016, + "step": 11341 + }, + { + "epoch": 1.3449543460215818, + "grad_norm": 0.9469060395657887, + "learning_rate": 3.86789916793635e-05, + "loss": 0.211, + "step": 11342 + }, + { + "epoch": 1.3450729277837068, + "grad_norm": 1.1572520794034677, + "learning_rate": 3.8676982446224065e-05, + "loss": 0.2879, + "step": 11343 + }, + { + "epoch": 1.3451915095458318, + "grad_norm": 1.2405378200255515, + "learning_rate": 3.867497308699962e-05, + "loss": 0.2758, + "step": 11344 + }, + { + "epoch": 1.3453100913079568, + "grad_norm": 0.8999392253151615, + "learning_rate": 3.86729636017087e-05, + "loss": 0.1871, + "step": 11345 + }, + { + "epoch": 1.345428673070082, + "grad_norm": 1.6378741959606367, + "learning_rate": 3.867095399036982e-05, + "loss": 0.3584, + "step": 11346 + }, + { + "epoch": 1.3455472548322067, + "grad_norm": 1.3443030618956133, + "learning_rate": 3.866894425300151e-05, + "loss": 0.2694, + "step": 11347 + }, + { + "epoch": 1.345665836594332, + "grad_norm": 0.9166398372519783, + "learning_rate": 3.866693438962229e-05, + "loss": 0.1568, + "step": 11348 + }, + { + "epoch": 1.3457844183564567, + "grad_norm": 1.355685940404418, + "learning_rate": 3.86649244002507e-05, + "loss": 0.221, + "step": 11349 + }, + { + "epoch": 1.3459030001185819, + "grad_norm": 0.9832628473411721, + "learning_rate": 3.866291428490526e-05, + "loss": 0.2008, + "step": 11350 + }, + { + "epoch": 1.3460215818807066, + "grad_norm": 1.1189094352556128, + "learning_rate": 3.866090404360451e-05, + "loss": 0.2125, + "step": 11351 + }, + { + "epoch": 1.3461401636428318, + "grad_norm": 1.0890232148020365, + "learning_rate": 3.8658893676366967e-05, + "loss": 0.2521, + "step": 11352 + }, + { + "epoch": 1.3462587454049566, + "grad_norm": 1.1745467162633203, + "learning_rate": 3.865688318321119e-05, + "loss": 0.2452, + "step": 11353 + }, + { + "epoch": 1.3463773271670818, + "grad_norm": 1.3025485627765956, + "learning_rate": 3.865487256415569e-05, + "loss": 0.326, + "step": 11354 + }, + { + "epoch": 1.3464959089292066, + "grad_norm": 1.0396327064829547, + "learning_rate": 3.8652861819219e-05, + "loss": 0.2477, + "step": 11355 + }, + { + "epoch": 1.3466144906913318, + "grad_norm": 1.1944055820533044, + "learning_rate": 3.865085094841968e-05, + "loss": 0.2793, + "step": 11356 + }, + { + "epoch": 1.3467330724534565, + "grad_norm": 1.097262654238483, + "learning_rate": 3.864883995177625e-05, + "loss": 0.2502, + "step": 11357 + }, + { + "epoch": 1.3468516542155817, + "grad_norm": 1.1176841114775684, + "learning_rate": 3.8646828829307245e-05, + "loss": 0.2505, + "step": 11358 + }, + { + "epoch": 1.3469702359777067, + "grad_norm": 1.4079513930277667, + "learning_rate": 3.864481758103122e-05, + "loss": 0.2878, + "step": 11359 + }, + { + "epoch": 1.3470888177398317, + "grad_norm": 1.7026719016718004, + "learning_rate": 3.8642806206966705e-05, + "loss": 0.4218, + "step": 11360 + }, + { + "epoch": 1.3472073995019567, + "grad_norm": 1.1766599164811282, + "learning_rate": 3.864079470713226e-05, + "loss": 0.2466, + "step": 11361 + }, + { + "epoch": 1.3473259812640817, + "grad_norm": 1.3158027699156127, + "learning_rate": 3.8638783081546396e-05, + "loss": 0.3565, + "step": 11362 + }, + { + "epoch": 1.3474445630262066, + "grad_norm": 1.513043263900365, + "learning_rate": 3.863677133022769e-05, + "loss": 0.32, + "step": 11363 + }, + { + "epoch": 1.3475631447883316, + "grad_norm": 1.273552397767931, + "learning_rate": 3.863475945319467e-05, + "loss": 0.2669, + "step": 11364 + }, + { + "epoch": 1.3476817265504566, + "grad_norm": 1.1142239243212528, + "learning_rate": 3.8632747450465886e-05, + "loss": 0.2699, + "step": 11365 + }, + { + "epoch": 1.3478003083125816, + "grad_norm": 0.7984180621583492, + "learning_rate": 3.8630735322059896e-05, + "loss": 0.1732, + "step": 11366 + }, + { + "epoch": 1.3479188900747066, + "grad_norm": 0.9098685216599087, + "learning_rate": 3.862872306799523e-05, + "loss": 0.1916, + "step": 11367 + }, + { + "epoch": 1.3480374718368315, + "grad_norm": 1.2101692578184067, + "learning_rate": 3.862671068829046e-05, + "loss": 0.2378, + "step": 11368 + }, + { + "epoch": 1.3481560535989565, + "grad_norm": 1.1230849396548137, + "learning_rate": 3.862469818296413e-05, + "loss": 0.2779, + "step": 11369 + }, + { + "epoch": 1.3482746353610815, + "grad_norm": 1.150992440136119, + "learning_rate": 3.8622685552034775e-05, + "loss": 0.2212, + "step": 11370 + }, + { + "epoch": 1.3483932171232065, + "grad_norm": 1.084377620769417, + "learning_rate": 3.8620672795520975e-05, + "loss": 0.2845, + "step": 11371 + }, + { + "epoch": 1.3485117988853315, + "grad_norm": 1.2633831808677862, + "learning_rate": 3.861865991344128e-05, + "loss": 0.2135, + "step": 11372 + }, + { + "epoch": 1.3486303806474564, + "grad_norm": 1.0637690717644257, + "learning_rate": 3.861664690581423e-05, + "loss": 0.2256, + "step": 11373 + }, + { + "epoch": 1.3487489624095814, + "grad_norm": 1.1129911926959992, + "learning_rate": 3.86146337726584e-05, + "loss": 0.2639, + "step": 11374 + }, + { + "epoch": 1.3488675441717064, + "grad_norm": 1.1531771686187675, + "learning_rate": 3.861262051399234e-05, + "loss": 0.3201, + "step": 11375 + }, + { + "epoch": 1.3489861259338314, + "grad_norm": 0.9055265714674992, + "learning_rate": 3.861060712983461e-05, + "loss": 0.2725, + "step": 11376 + }, + { + "epoch": 1.3491047076959564, + "grad_norm": 1.1716273964328419, + "learning_rate": 3.860859362020377e-05, + "loss": 0.2554, + "step": 11377 + }, + { + "epoch": 1.3492232894580813, + "grad_norm": 0.9134539562475308, + "learning_rate": 3.86065799851184e-05, + "loss": 0.2136, + "step": 11378 + }, + { + "epoch": 1.3493418712202063, + "grad_norm": 1.6427361047093187, + "learning_rate": 3.860456622459703e-05, + "loss": 0.4836, + "step": 11379 + }, + { + "epoch": 1.3494604529823313, + "grad_norm": 1.2976628892276372, + "learning_rate": 3.860255233865825e-05, + "loss": 0.3006, + "step": 11380 + }, + { + "epoch": 1.3495790347444563, + "grad_norm": 0.9517775640458085, + "learning_rate": 3.860053832732062e-05, + "loss": 0.2244, + "step": 11381 + }, + { + "epoch": 1.3496976165065813, + "grad_norm": 1.2441614374894716, + "learning_rate": 3.85985241906027e-05, + "loss": 0.212, + "step": 11382 + }, + { + "epoch": 1.3498161982687062, + "grad_norm": 1.2780967276346673, + "learning_rate": 3.859650992852306e-05, + "loss": 0.2544, + "step": 11383 + }, + { + "epoch": 1.3499347800308312, + "grad_norm": 1.047859594729622, + "learning_rate": 3.8594495541100284e-05, + "loss": 0.2207, + "step": 11384 + }, + { + "epoch": 1.3500533617929562, + "grad_norm": 1.1028029192838678, + "learning_rate": 3.8592481028352925e-05, + "loss": 0.2796, + "step": 11385 + }, + { + "epoch": 1.3501719435550812, + "grad_norm": 1.5292838791949017, + "learning_rate": 3.859046639029955e-05, + "loss": 0.291, + "step": 11386 + }, + { + "epoch": 1.3502905253172062, + "grad_norm": 0.9107623643634299, + "learning_rate": 3.858845162695875e-05, + "loss": 0.1913, + "step": 11387 + }, + { + "epoch": 1.3504091070793312, + "grad_norm": 1.0288877012352384, + "learning_rate": 3.8586436738349085e-05, + "loss": 0.249, + "step": 11388 + }, + { + "epoch": 1.3505276888414561, + "grad_norm": 1.5006707839421543, + "learning_rate": 3.8584421724489133e-05, + "loss": 0.3838, + "step": 11389 + }, + { + "epoch": 1.3506462706035811, + "grad_norm": 0.7504197158820977, + "learning_rate": 3.858240658539747e-05, + "loss": 0.1602, + "step": 11390 + }, + { + "epoch": 1.350764852365706, + "grad_norm": 1.140577413657662, + "learning_rate": 3.8580391321092677e-05, + "loss": 0.2087, + "step": 11391 + }, + { + "epoch": 1.350883434127831, + "grad_norm": 1.3040168860826156, + "learning_rate": 3.857837593159334e-05, + "loss": 0.3368, + "step": 11392 + }, + { + "epoch": 1.351002015889956, + "grad_norm": 1.0446807976651733, + "learning_rate": 3.857636041691801e-05, + "loss": 0.2482, + "step": 11393 + }, + { + "epoch": 1.351120597652081, + "grad_norm": 0.9951716259451728, + "learning_rate": 3.857434477708529e-05, + "loss": 0.2663, + "step": 11394 + }, + { + "epoch": 1.351239179414206, + "grad_norm": 0.9788369625422153, + "learning_rate": 3.857232901211376e-05, + "loss": 0.2551, + "step": 11395 + }, + { + "epoch": 1.351357761176331, + "grad_norm": 0.9907211688186556, + "learning_rate": 3.857031312202201e-05, + "loss": 0.2622, + "step": 11396 + }, + { + "epoch": 1.3514763429384562, + "grad_norm": 0.7347940306863251, + "learning_rate": 3.85682971068286e-05, + "loss": 0.2129, + "step": 11397 + }, + { + "epoch": 1.351594924700581, + "grad_norm": 1.1894439322544605, + "learning_rate": 3.856628096655213e-05, + "loss": 0.2705, + "step": 11398 + }, + { + "epoch": 1.3517135064627062, + "grad_norm": 0.9316688260721552, + "learning_rate": 3.856426470121119e-05, + "loss": 0.2016, + "step": 11399 + }, + { + "epoch": 1.351832088224831, + "grad_norm": 0.8100558185461376, + "learning_rate": 3.8562248310824364e-05, + "loss": 0.1823, + "step": 11400 + }, + { + "epoch": 1.3519506699869561, + "grad_norm": 0.9245203688113711, + "learning_rate": 3.8560231795410235e-05, + "loss": 0.2065, + "step": 11401 + }, + { + "epoch": 1.3520692517490809, + "grad_norm": 1.0173902926773999, + "learning_rate": 3.85582151549874e-05, + "loss": 0.2115, + "step": 11402 + }, + { + "epoch": 1.352187833511206, + "grad_norm": 1.357252232103516, + "learning_rate": 3.855619838957445e-05, + "loss": 0.2949, + "step": 11403 + }, + { + "epoch": 1.3523064152733308, + "grad_norm": 0.997860482678172, + "learning_rate": 3.855418149918997e-05, + "loss": 0.1918, + "step": 11404 + }, + { + "epoch": 1.352424997035456, + "grad_norm": 1.144264533259226, + "learning_rate": 3.8552164483852556e-05, + "loss": 0.256, + "step": 11405 + }, + { + "epoch": 1.3525435787975808, + "grad_norm": 1.1206608728417224, + "learning_rate": 3.855014734358081e-05, + "loss": 0.2218, + "step": 11406 + }, + { + "epoch": 1.352662160559706, + "grad_norm": 1.3682389552199388, + "learning_rate": 3.854813007839332e-05, + "loss": 0.2574, + "step": 11407 + }, + { + "epoch": 1.3527807423218308, + "grad_norm": 1.1305745854554832, + "learning_rate": 3.854611268830869e-05, + "loss": 0.306, + "step": 11408 + }, + { + "epoch": 1.352899324083956, + "grad_norm": 1.318054436467911, + "learning_rate": 3.85440951733455e-05, + "loss": 0.239, + "step": 11409 + }, + { + "epoch": 1.353017905846081, + "grad_norm": 1.0033563126130476, + "learning_rate": 3.854207753352237e-05, + "loss": 0.2536, + "step": 11410 + }, + { + "epoch": 1.353136487608206, + "grad_norm": 0.7125593617735019, + "learning_rate": 3.854005976885789e-05, + "loss": 0.1806, + "step": 11411 + }, + { + "epoch": 1.353255069370331, + "grad_norm": 1.3974778399922243, + "learning_rate": 3.853804187937066e-05, + "loss": 0.3547, + "step": 11412 + }, + { + "epoch": 1.353373651132456, + "grad_norm": 1.1104395110232534, + "learning_rate": 3.853602386507929e-05, + "loss": 0.2493, + "step": 11413 + }, + { + "epoch": 1.3534922328945809, + "grad_norm": 1.263538284543217, + "learning_rate": 3.8534005726002376e-05, + "loss": 0.2669, + "step": 11414 + }, + { + "epoch": 1.3536108146567059, + "grad_norm": 1.2049415415286113, + "learning_rate": 3.8531987462158524e-05, + "loss": 0.274, + "step": 11415 + }, + { + "epoch": 1.3537293964188308, + "grad_norm": 1.1146059623406104, + "learning_rate": 3.8529969073566344e-05, + "loss": 0.255, + "step": 11416 + }, + { + "epoch": 1.3538479781809558, + "grad_norm": 1.010098811693673, + "learning_rate": 3.852795056024445e-05, + "loss": 0.2466, + "step": 11417 + }, + { + "epoch": 1.3539665599430808, + "grad_norm": 1.1992444672807792, + "learning_rate": 3.8525931922211426e-05, + "loss": 0.2359, + "step": 11418 + }, + { + "epoch": 1.3540851417052058, + "grad_norm": 0.8510258760655327, + "learning_rate": 3.8523913159485904e-05, + "loss": 0.1981, + "step": 11419 + }, + { + "epoch": 1.3542037234673308, + "grad_norm": 0.906450223969736, + "learning_rate": 3.852189427208648e-05, + "loss": 0.201, + "step": 11420 + }, + { + "epoch": 1.3543223052294557, + "grad_norm": 1.1278762662824224, + "learning_rate": 3.851987526003178e-05, + "loss": 0.2281, + "step": 11421 + }, + { + "epoch": 1.3544408869915807, + "grad_norm": 1.0047371649560812, + "learning_rate": 3.85178561233404e-05, + "loss": 0.2413, + "step": 11422 + }, + { + "epoch": 1.3545594687537057, + "grad_norm": 1.2722109638708838, + "learning_rate": 3.851583686203097e-05, + "loss": 0.3338, + "step": 11423 + }, + { + "epoch": 1.3546780505158307, + "grad_norm": 1.1748380733464143, + "learning_rate": 3.85138174761221e-05, + "loss": 0.2291, + "step": 11424 + }, + { + "epoch": 1.3547966322779557, + "grad_norm": 1.2066028118976486, + "learning_rate": 3.8511797965632405e-05, + "loss": 0.2925, + "step": 11425 + }, + { + "epoch": 1.3549152140400806, + "grad_norm": 1.3245818944903474, + "learning_rate": 3.85097783305805e-05, + "loss": 0.2233, + "step": 11426 + }, + { + "epoch": 1.3550337958022056, + "grad_norm": 0.9354892351774108, + "learning_rate": 3.850775857098501e-05, + "loss": 0.1735, + "step": 11427 + }, + { + "epoch": 1.3551523775643306, + "grad_norm": 0.8693213181997481, + "learning_rate": 3.850573868686454e-05, + "loss": 0.2126, + "step": 11428 + }, + { + "epoch": 1.3552709593264556, + "grad_norm": 1.1131390248192903, + "learning_rate": 3.850371867823773e-05, + "loss": 0.2133, + "step": 11429 + }, + { + "epoch": 1.3553895410885806, + "grad_norm": 1.2837820255641308, + "learning_rate": 3.85016985451232e-05, + "loss": 0.2621, + "step": 11430 + }, + { + "epoch": 1.3555081228507055, + "grad_norm": 1.1926858099527637, + "learning_rate": 3.849967828753955e-05, + "loss": 0.2202, + "step": 11431 + }, + { + "epoch": 1.3556267046128305, + "grad_norm": 1.1856603051927073, + "learning_rate": 3.8497657905505425e-05, + "loss": 0.2476, + "step": 11432 + }, + { + "epoch": 1.3557452863749555, + "grad_norm": 1.0296567531718979, + "learning_rate": 3.849563739903944e-05, + "loss": 0.2513, + "step": 11433 + }, + { + "epoch": 1.3558638681370805, + "grad_norm": 1.0617623246765036, + "learning_rate": 3.8493616768160245e-05, + "loss": 0.1896, + "step": 11434 + }, + { + "epoch": 1.3559824498992055, + "grad_norm": 1.4600344102096678, + "learning_rate": 3.849159601288643e-05, + "loss": 0.2537, + "step": 11435 + }, + { + "epoch": 1.3561010316613304, + "grad_norm": 1.7070587640269037, + "learning_rate": 3.8489575133236666e-05, + "loss": 0.4201, + "step": 11436 + }, + { + "epoch": 1.3562196134234554, + "grad_norm": 1.4783749561369266, + "learning_rate": 3.848755412922955e-05, + "loss": 0.2688, + "step": 11437 + }, + { + "epoch": 1.3563381951855804, + "grad_norm": 0.9654320979982584, + "learning_rate": 3.848553300088372e-05, + "loss": 0.1826, + "step": 11438 + }, + { + "epoch": 1.3564567769477054, + "grad_norm": 1.219432606481704, + "learning_rate": 3.8483511748217814e-05, + "loss": 0.2474, + "step": 11439 + }, + { + "epoch": 1.3565753587098304, + "grad_norm": 0.7289999967829813, + "learning_rate": 3.8481490371250475e-05, + "loss": 0.1249, + "step": 11440 + }, + { + "epoch": 1.3566939404719554, + "grad_norm": 1.2263985525783696, + "learning_rate": 3.8479468870000316e-05, + "loss": 0.2651, + "step": 11441 + }, + { + "epoch": 1.3568125222340803, + "grad_norm": 1.0052616128333935, + "learning_rate": 3.8477447244485984e-05, + "loss": 0.2072, + "step": 11442 + }, + { + "epoch": 1.3569311039962053, + "grad_norm": 1.0048199491247325, + "learning_rate": 3.847542549472612e-05, + "loss": 0.168, + "step": 11443 + }, + { + "epoch": 1.3570496857583303, + "grad_norm": 1.2621016803257394, + "learning_rate": 3.847340362073936e-05, + "loss": 0.2765, + "step": 11444 + }, + { + "epoch": 1.3571682675204553, + "grad_norm": 1.0924651932739444, + "learning_rate": 3.8471381622544334e-05, + "loss": 0.2425, + "step": 11445 + }, + { + "epoch": 1.3572868492825805, + "grad_norm": 0.8015895859765464, + "learning_rate": 3.84693595001597e-05, + "loss": 0.1887, + "step": 11446 + }, + { + "epoch": 1.3574054310447052, + "grad_norm": 1.005734182431909, + "learning_rate": 3.8467337253604075e-05, + "loss": 0.1423, + "step": 11447 + }, + { + "epoch": 1.3575240128068304, + "grad_norm": 1.3060679015396828, + "learning_rate": 3.8465314882896116e-05, + "loss": 0.2507, + "step": 11448 + }, + { + "epoch": 1.3576425945689552, + "grad_norm": 1.0049221637216814, + "learning_rate": 3.8463292388054483e-05, + "loss": 0.1895, + "step": 11449 + }, + { + "epoch": 1.3577611763310804, + "grad_norm": 0.9269149054986429, + "learning_rate": 3.8461269769097784e-05, + "loss": 0.2073, + "step": 11450 + }, + { + "epoch": 1.3578797580932052, + "grad_norm": 1.197734536741664, + "learning_rate": 3.84592470260447e-05, + "loss": 0.2147, + "step": 11451 + }, + { + "epoch": 1.3579983398553304, + "grad_norm": 0.7225249640486606, + "learning_rate": 3.845722415891385e-05, + "loss": 0.1632, + "step": 11452 + }, + { + "epoch": 1.3581169216174551, + "grad_norm": 1.171865478296089, + "learning_rate": 3.84552011677239e-05, + "loss": 0.275, + "step": 11453 + }, + { + "epoch": 1.3582355033795803, + "grad_norm": 1.0862442512134167, + "learning_rate": 3.84531780524935e-05, + "loss": 0.2094, + "step": 11454 + }, + { + "epoch": 1.358354085141705, + "grad_norm": 0.8161204653052915, + "learning_rate": 3.845115481324129e-05, + "loss": 0.1788, + "step": 11455 + }, + { + "epoch": 1.3584726669038303, + "grad_norm": 1.2225481205543498, + "learning_rate": 3.844913144998593e-05, + "loss": 0.2468, + "step": 11456 + }, + { + "epoch": 1.358591248665955, + "grad_norm": 1.3136179162515014, + "learning_rate": 3.8447107962746077e-05, + "loss": 0.2373, + "step": 11457 + }, + { + "epoch": 1.3587098304280802, + "grad_norm": 0.898385227629991, + "learning_rate": 3.844508435154037e-05, + "loss": 0.1921, + "step": 11458 + }, + { + "epoch": 1.3588284121902052, + "grad_norm": 1.3606633751763424, + "learning_rate": 3.844306061638747e-05, + "loss": 0.3157, + "step": 11459 + }, + { + "epoch": 1.3589469939523302, + "grad_norm": 1.2579571844195148, + "learning_rate": 3.844103675730605e-05, + "loss": 0.2832, + "step": 11460 + }, + { + "epoch": 1.3590655757144552, + "grad_norm": 1.0408158775680338, + "learning_rate": 3.8439012774314744e-05, + "loss": 0.26, + "step": 11461 + }, + { + "epoch": 1.3591841574765802, + "grad_norm": 1.0095491466523432, + "learning_rate": 3.843698866743221e-05, + "loss": 0.2029, + "step": 11462 + }, + { + "epoch": 1.3593027392387051, + "grad_norm": 1.0058870186760083, + "learning_rate": 3.843496443667713e-05, + "loss": 0.2253, + "step": 11463 + }, + { + "epoch": 1.3594213210008301, + "grad_norm": 1.107608745386719, + "learning_rate": 3.843294008206816e-05, + "loss": 0.2284, + "step": 11464 + }, + { + "epoch": 1.359539902762955, + "grad_norm": 0.8224455272525313, + "learning_rate": 3.843091560362395e-05, + "loss": 0.2049, + "step": 11465 + }, + { + "epoch": 1.35965848452508, + "grad_norm": 1.1190316814987422, + "learning_rate": 3.8428891001363157e-05, + "loss": 0.215, + "step": 11466 + }, + { + "epoch": 1.359777066287205, + "grad_norm": 0.9114033992956317, + "learning_rate": 3.8426866275304475e-05, + "loss": 0.2165, + "step": 11467 + }, + { + "epoch": 1.35989564804933, + "grad_norm": 0.977401549705111, + "learning_rate": 3.842484142546654e-05, + "loss": 0.2453, + "step": 11468 + }, + { + "epoch": 1.360014229811455, + "grad_norm": 0.976894446755083, + "learning_rate": 3.8422816451868026e-05, + "loss": 0.1721, + "step": 11469 + }, + { + "epoch": 1.36013281157358, + "grad_norm": 0.8630101727429302, + "learning_rate": 3.842079135452761e-05, + "loss": 0.198, + "step": 11470 + }, + { + "epoch": 1.360251393335705, + "grad_norm": 1.434368217782574, + "learning_rate": 3.841876613346395e-05, + "loss": 0.2354, + "step": 11471 + }, + { + "epoch": 1.36036997509783, + "grad_norm": 1.3024601317440734, + "learning_rate": 3.8416740788695725e-05, + "loss": 0.3054, + "step": 11472 + }, + { + "epoch": 1.360488556859955, + "grad_norm": 1.6649248820599774, + "learning_rate": 3.8414715320241607e-05, + "loss": 0.3261, + "step": 11473 + }, + { + "epoch": 1.36060713862208, + "grad_norm": 0.9175847447753697, + "learning_rate": 3.841268972812026e-05, + "loss": 0.1727, + "step": 11474 + }, + { + "epoch": 1.360725720384205, + "grad_norm": 0.7431815482127834, + "learning_rate": 3.841066401235036e-05, + "loss": 0.1701, + "step": 11475 + }, + { + "epoch": 1.36084430214633, + "grad_norm": 0.900434844073702, + "learning_rate": 3.840863817295058e-05, + "loss": 0.1682, + "step": 11476 + }, + { + "epoch": 1.3609628839084549, + "grad_norm": 0.773803422724477, + "learning_rate": 3.8406612209939615e-05, + "loss": 0.1528, + "step": 11477 + }, + { + "epoch": 1.3610814656705799, + "grad_norm": 1.256355294037502, + "learning_rate": 3.840458612333611e-05, + "loss": 0.2643, + "step": 11478 + }, + { + "epoch": 1.3612000474327048, + "grad_norm": 0.9112005880448195, + "learning_rate": 3.840255991315876e-05, + "loss": 0.21, + "step": 11479 + }, + { + "epoch": 1.3613186291948298, + "grad_norm": 1.0700169181612262, + "learning_rate": 3.840053357942625e-05, + "loss": 0.2439, + "step": 11480 + }, + { + "epoch": 1.3614372109569548, + "grad_norm": 0.9452277840257948, + "learning_rate": 3.839850712215724e-05, + "loss": 0.1754, + "step": 11481 + }, + { + "epoch": 1.3615557927190798, + "grad_norm": 0.7380744725673793, + "learning_rate": 3.839648054137044e-05, + "loss": 0.1918, + "step": 11482 + }, + { + "epoch": 1.3616743744812048, + "grad_norm": 0.953591297228943, + "learning_rate": 3.839445383708451e-05, + "loss": 0.1652, + "step": 11483 + }, + { + "epoch": 1.3617929562433297, + "grad_norm": 0.8149213630635902, + "learning_rate": 3.8392427009318144e-05, + "loss": 0.1637, + "step": 11484 + }, + { + "epoch": 1.3619115380054547, + "grad_norm": 0.9999671125470065, + "learning_rate": 3.839040005809002e-05, + "loss": 0.1996, + "step": 11485 + }, + { + "epoch": 1.3620301197675797, + "grad_norm": 1.6573318953247655, + "learning_rate": 3.838837298341883e-05, + "loss": 0.3543, + "step": 11486 + }, + { + "epoch": 1.3621487015297047, + "grad_norm": 0.9747477288308484, + "learning_rate": 3.838634578532326e-05, + "loss": 0.2647, + "step": 11487 + }, + { + "epoch": 1.3622672832918297, + "grad_norm": 1.0588165656150592, + "learning_rate": 3.8384318463822e-05, + "loss": 0.2237, + "step": 11488 + }, + { + "epoch": 1.3623858650539546, + "grad_norm": 1.4264621898063805, + "learning_rate": 3.838229101893373e-05, + "loss": 0.2617, + "step": 11489 + }, + { + "epoch": 1.3625044468160796, + "grad_norm": 1.2926945236393466, + "learning_rate": 3.838026345067714e-05, + "loss": 0.2908, + "step": 11490 + }, + { + "epoch": 1.3626230285782046, + "grad_norm": 1.2289080478488237, + "learning_rate": 3.837823575907095e-05, + "loss": 0.2753, + "step": 11491 + }, + { + "epoch": 1.3627416103403296, + "grad_norm": 0.8101018912421671, + "learning_rate": 3.8376207944133815e-05, + "loss": 0.1868, + "step": 11492 + }, + { + "epoch": 1.3628601921024546, + "grad_norm": 0.8769091064184533, + "learning_rate": 3.8374180005884454e-05, + "loss": 0.2225, + "step": 11493 + }, + { + "epoch": 1.3629787738645796, + "grad_norm": 1.195922243412925, + "learning_rate": 3.8372151944341553e-05, + "loss": 0.2523, + "step": 11494 + }, + { + "epoch": 1.3630973556267045, + "grad_norm": 0.9095187598873063, + "learning_rate": 3.837012375952381e-05, + "loss": 0.1529, + "step": 11495 + }, + { + "epoch": 1.3632159373888295, + "grad_norm": 0.9448403451574178, + "learning_rate": 3.8368095451449915e-05, + "loss": 0.194, + "step": 11496 + }, + { + "epoch": 1.3633345191509547, + "grad_norm": 1.1431082287109404, + "learning_rate": 3.836606702013858e-05, + "loss": 0.2198, + "step": 11497 + }, + { + "epoch": 1.3634531009130795, + "grad_norm": 1.1404967249509663, + "learning_rate": 3.836403846560849e-05, + "loss": 0.24, + "step": 11498 + }, + { + "epoch": 1.3635716826752047, + "grad_norm": 1.36666311170755, + "learning_rate": 3.836200978787836e-05, + "loss": 0.2736, + "step": 11499 + }, + { + "epoch": 1.3636902644373294, + "grad_norm": 1.025543368466899, + "learning_rate": 3.835998098696688e-05, + "loss": 0.2328, + "step": 11500 + }, + { + "epoch": 1.3638088461994546, + "grad_norm": 0.8847607228113296, + "learning_rate": 3.835795206289277e-05, + "loss": 0.1679, + "step": 11501 + }, + { + "epoch": 1.3639274279615794, + "grad_norm": 0.7970909242100421, + "learning_rate": 3.835592301567471e-05, + "loss": 0.1229, + "step": 11502 + }, + { + "epoch": 1.3640460097237046, + "grad_norm": 1.49047387835025, + "learning_rate": 3.835389384533142e-05, + "loss": 0.2874, + "step": 11503 + }, + { + "epoch": 1.3641645914858294, + "grad_norm": 0.912482209221664, + "learning_rate": 3.835186455188161e-05, + "loss": 0.2111, + "step": 11504 + }, + { + "epoch": 1.3642831732479546, + "grad_norm": 1.1077398766680024, + "learning_rate": 3.834983513534398e-05, + "loss": 0.228, + "step": 11505 + }, + { + "epoch": 1.3644017550100793, + "grad_norm": 2.052197205717078, + "learning_rate": 3.834780559573724e-05, + "loss": 0.4021, + "step": 11506 + }, + { + "epoch": 1.3645203367722045, + "grad_norm": 1.063067780351459, + "learning_rate": 3.83457759330801e-05, + "loss": 0.2013, + "step": 11507 + }, + { + "epoch": 1.3646389185343293, + "grad_norm": 1.0036823501656926, + "learning_rate": 3.834374614739127e-05, + "loss": 0.242, + "step": 11508 + }, + { + "epoch": 1.3647575002964545, + "grad_norm": 1.1326788997355022, + "learning_rate": 3.834171623868946e-05, + "loss": 0.2513, + "step": 11509 + }, + { + "epoch": 1.3648760820585795, + "grad_norm": 0.8493611847781761, + "learning_rate": 3.833968620699339e-05, + "loss": 0.1966, + "step": 11510 + }, + { + "epoch": 1.3649946638207044, + "grad_norm": 0.9890119229361736, + "learning_rate": 3.833765605232178e-05, + "loss": 0.2178, + "step": 11511 + }, + { + "epoch": 1.3651132455828294, + "grad_norm": 1.1343872961005985, + "learning_rate": 3.8335625774693326e-05, + "loss": 0.2626, + "step": 11512 + }, + { + "epoch": 1.3652318273449544, + "grad_norm": 1.2166273732383015, + "learning_rate": 3.833359537412676e-05, + "loss": 0.3019, + "step": 11513 + }, + { + "epoch": 1.3653504091070794, + "grad_norm": 1.1402305228464653, + "learning_rate": 3.83315648506408e-05, + "loss": 0.2164, + "step": 11514 + }, + { + "epoch": 1.3654689908692044, + "grad_norm": 1.503306693888745, + "learning_rate": 3.832953420425415e-05, + "loss": 0.3534, + "step": 11515 + }, + { + "epoch": 1.3655875726313293, + "grad_norm": 1.7253554392617687, + "learning_rate": 3.8327503434985554e-05, + "loss": 0.3209, + "step": 11516 + }, + { + "epoch": 1.3657061543934543, + "grad_norm": 1.093432528015903, + "learning_rate": 3.83254725428537e-05, + "loss": 0.2201, + "step": 11517 + }, + { + "epoch": 1.3658247361555793, + "grad_norm": 0.8079654338126313, + "learning_rate": 3.832344152787735e-05, + "loss": 0.1727, + "step": 11518 + }, + { + "epoch": 1.3659433179177043, + "grad_norm": 1.1586627624400723, + "learning_rate": 3.8321410390075196e-05, + "loss": 0.2257, + "step": 11519 + }, + { + "epoch": 1.3660618996798293, + "grad_norm": 1.1831525800020468, + "learning_rate": 3.831937912946598e-05, + "loss": 0.2179, + "step": 11520 + }, + { + "epoch": 1.3661804814419543, + "grad_norm": 1.0330196534882996, + "learning_rate": 3.831734774606841e-05, + "loss": 0.2669, + "step": 11521 + }, + { + "epoch": 1.3662990632040792, + "grad_norm": 1.1053626875582552, + "learning_rate": 3.831531623990124e-05, + "loss": 0.1981, + "step": 11522 + }, + { + "epoch": 1.3664176449662042, + "grad_norm": 1.340344268892906, + "learning_rate": 3.8313284610983175e-05, + "loss": 0.2764, + "step": 11523 + }, + { + "epoch": 1.3665362267283292, + "grad_norm": 1.3839486056738226, + "learning_rate": 3.8311252859332945e-05, + "loss": 0.3521, + "step": 11524 + }, + { + "epoch": 1.3666548084904542, + "grad_norm": 1.824123001805835, + "learning_rate": 3.83092209849693e-05, + "loss": 0.4044, + "step": 11525 + }, + { + "epoch": 1.3667733902525792, + "grad_norm": 0.8807458330066636, + "learning_rate": 3.830718898791095e-05, + "loss": 0.1763, + "step": 11526 + }, + { + "epoch": 1.3668919720147041, + "grad_norm": 1.1126973618387652, + "learning_rate": 3.8305156868176645e-05, + "loss": 0.2552, + "step": 11527 + }, + { + "epoch": 1.3670105537768291, + "grad_norm": 1.0036934910623472, + "learning_rate": 3.8303124625785104e-05, + "loss": 0.2477, + "step": 11528 + }, + { + "epoch": 1.367129135538954, + "grad_norm": 0.8512367144386453, + "learning_rate": 3.8301092260755064e-05, + "loss": 0.1729, + "step": 11529 + }, + { + "epoch": 1.367247717301079, + "grad_norm": 0.918008725142799, + "learning_rate": 3.829905977310527e-05, + "loss": 0.2528, + "step": 11530 + }, + { + "epoch": 1.367366299063204, + "grad_norm": 0.9827566146622403, + "learning_rate": 3.829702716285445e-05, + "loss": 0.2203, + "step": 11531 + }, + { + "epoch": 1.367484880825329, + "grad_norm": 1.309830675658511, + "learning_rate": 3.829499443002134e-05, + "loss": 0.2768, + "step": 11532 + }, + { + "epoch": 1.367603462587454, + "grad_norm": 1.017628792982879, + "learning_rate": 3.82929615746247e-05, + "loss": 0.2241, + "step": 11533 + }, + { + "epoch": 1.367722044349579, + "grad_norm": 1.2112757510268062, + "learning_rate": 3.829092859668324e-05, + "loss": 0.2735, + "step": 11534 + }, + { + "epoch": 1.367840626111704, + "grad_norm": 0.9531134267394311, + "learning_rate": 3.8288895496215724e-05, + "loss": 0.191, + "step": 11535 + }, + { + "epoch": 1.367959207873829, + "grad_norm": 1.7717897626513173, + "learning_rate": 3.828686227324088e-05, + "loss": 0.504, + "step": 11536 + }, + { + "epoch": 1.368077789635954, + "grad_norm": 1.1539688548838405, + "learning_rate": 3.8284828927777474e-05, + "loss": 0.2758, + "step": 11537 + }, + { + "epoch": 1.368196371398079, + "grad_norm": 1.0438547459125327, + "learning_rate": 3.8282795459844224e-05, + "loss": 0.2184, + "step": 11538 + }, + { + "epoch": 1.368314953160204, + "grad_norm": 1.0848696191484088, + "learning_rate": 3.8280761869459894e-05, + "loss": 0.2216, + "step": 11539 + }, + { + "epoch": 1.3684335349223289, + "grad_norm": 1.127010423394401, + "learning_rate": 3.827872815664322e-05, + "loss": 0.2748, + "step": 11540 + }, + { + "epoch": 1.3685521166844539, + "grad_norm": 1.1294223157916825, + "learning_rate": 3.8276694321412956e-05, + "loss": 0.241, + "step": 11541 + }, + { + "epoch": 1.3686706984465788, + "grad_norm": 0.9698102749287434, + "learning_rate": 3.827466036378785e-05, + "loss": 0.2347, + "step": 11542 + }, + { + "epoch": 1.3687892802087038, + "grad_norm": 1.0640896817871968, + "learning_rate": 3.827262628378666e-05, + "loss": 0.1931, + "step": 11543 + }, + { + "epoch": 1.3689078619708288, + "grad_norm": 0.9317988244678225, + "learning_rate": 3.8270592081428134e-05, + "loss": 0.2018, + "step": 11544 + }, + { + "epoch": 1.3690264437329538, + "grad_norm": 0.9241003050253291, + "learning_rate": 3.826855775673102e-05, + "loss": 0.2148, + "step": 11545 + }, + { + "epoch": 1.369145025495079, + "grad_norm": 0.8968866785751012, + "learning_rate": 3.8266523309714067e-05, + "loss": 0.2259, + "step": 11546 + }, + { + "epoch": 1.3692636072572038, + "grad_norm": 0.9649333983262626, + "learning_rate": 3.8264488740396046e-05, + "loss": 0.1959, + "step": 11547 + }, + { + "epoch": 1.369382189019329, + "grad_norm": 0.9273549362921337, + "learning_rate": 3.826245404879571e-05, + "loss": 0.1734, + "step": 11548 + }, + { + "epoch": 1.3695007707814537, + "grad_norm": 0.9587931340697953, + "learning_rate": 3.8260419234931794e-05, + "loss": 0.1631, + "step": 11549 + }, + { + "epoch": 1.369619352543579, + "grad_norm": 1.100654722733813, + "learning_rate": 3.825838429882308e-05, + "loss": 0.2539, + "step": 11550 + }, + { + "epoch": 1.3697379343057037, + "grad_norm": 1.1756270371914692, + "learning_rate": 3.825634924048832e-05, + "loss": 0.2464, + "step": 11551 + }, + { + "epoch": 1.3698565160678289, + "grad_norm": 0.9665802287563825, + "learning_rate": 3.8254314059946286e-05, + "loss": 0.2167, + "step": 11552 + }, + { + "epoch": 1.3699750978299536, + "grad_norm": 0.8383600202020025, + "learning_rate": 3.825227875721572e-05, + "loss": 0.1653, + "step": 11553 + }, + { + "epoch": 1.3700936795920788, + "grad_norm": 0.7593284772416189, + "learning_rate": 3.82502433323154e-05, + "loss": 0.1445, + "step": 11554 + }, + { + "epoch": 1.3702122613542036, + "grad_norm": 1.3742819439173186, + "learning_rate": 3.824820778526409e-05, + "loss": 0.3419, + "step": 11555 + }, + { + "epoch": 1.3703308431163288, + "grad_norm": 1.093982319973032, + "learning_rate": 3.8246172116080536e-05, + "loss": 0.2115, + "step": 11556 + }, + { + "epoch": 1.3704494248784536, + "grad_norm": 0.8894848345571955, + "learning_rate": 3.824413632478353e-05, + "loss": 0.2066, + "step": 11557 + }, + { + "epoch": 1.3705680066405788, + "grad_norm": 1.1068800778365078, + "learning_rate": 3.8242100411391825e-05, + "loss": 0.2436, + "step": 11558 + }, + { + "epoch": 1.3706865884027037, + "grad_norm": 1.2381972366975642, + "learning_rate": 3.82400643759242e-05, + "loss": 0.2716, + "step": 11559 + }, + { + "epoch": 1.3708051701648287, + "grad_norm": 1.1203800997929338, + "learning_rate": 3.82380282183994e-05, + "loss": 0.2267, + "step": 11560 + }, + { + "epoch": 1.3709237519269537, + "grad_norm": 1.0173883597786517, + "learning_rate": 3.823599193883622e-05, + "loss": 0.1802, + "step": 11561 + }, + { + "epoch": 1.3710423336890787, + "grad_norm": 1.2049314011661474, + "learning_rate": 3.823395553725343e-05, + "loss": 0.2087, + "step": 11562 + }, + { + "epoch": 1.3711609154512037, + "grad_norm": 1.1506471826278661, + "learning_rate": 3.8231919013669805e-05, + "loss": 0.2158, + "step": 11563 + }, + { + "epoch": 1.3712794972133286, + "grad_norm": 1.5351631341909235, + "learning_rate": 3.82298823681041e-05, + "loss": 0.2089, + "step": 11564 + }, + { + "epoch": 1.3713980789754536, + "grad_norm": 1.0754374741018191, + "learning_rate": 3.822784560057512e-05, + "loss": 0.2687, + "step": 11565 + }, + { + "epoch": 1.3715166607375786, + "grad_norm": 1.181780986573648, + "learning_rate": 3.8225808711101606e-05, + "loss": 0.2546, + "step": 11566 + }, + { + "epoch": 1.3716352424997036, + "grad_norm": 1.0863857071217833, + "learning_rate": 3.822377169970236e-05, + "loss": 0.2276, + "step": 11567 + }, + { + "epoch": 1.3717538242618286, + "grad_norm": 1.1557923505945404, + "learning_rate": 3.822173456639616e-05, + "loss": 0.2198, + "step": 11568 + }, + { + "epoch": 1.3718724060239536, + "grad_norm": 0.8602494473676461, + "learning_rate": 3.8219697311201776e-05, + "loss": 0.2225, + "step": 11569 + }, + { + "epoch": 1.3719909877860785, + "grad_norm": 0.9471861373492907, + "learning_rate": 3.8217659934137986e-05, + "loss": 0.1714, + "step": 11570 + }, + { + "epoch": 1.3721095695482035, + "grad_norm": 1.2191310002122917, + "learning_rate": 3.8215622435223595e-05, + "loss": 0.2694, + "step": 11571 + }, + { + "epoch": 1.3722281513103285, + "grad_norm": 0.9498621765567019, + "learning_rate": 3.8213584814477364e-05, + "loss": 0.2492, + "step": 11572 + }, + { + "epoch": 1.3723467330724535, + "grad_norm": 1.1840802267198383, + "learning_rate": 3.8211547071918085e-05, + "loss": 0.2118, + "step": 11573 + }, + { + "epoch": 1.3724653148345785, + "grad_norm": 0.9708518021083982, + "learning_rate": 3.820950920756454e-05, + "loss": 0.2494, + "step": 11574 + }, + { + "epoch": 1.3725838965967034, + "grad_norm": 0.9255620189134793, + "learning_rate": 3.8207471221435524e-05, + "loss": 0.2241, + "step": 11575 + }, + { + "epoch": 1.3727024783588284, + "grad_norm": 1.1768253505733437, + "learning_rate": 3.8205433113549824e-05, + "loss": 0.2146, + "step": 11576 + }, + { + "epoch": 1.3728210601209534, + "grad_norm": 0.8830450319762738, + "learning_rate": 3.820339488392621e-05, + "loss": 0.1895, + "step": 11577 + }, + { + "epoch": 1.3729396418830784, + "grad_norm": 1.6767068833757066, + "learning_rate": 3.82013565325835e-05, + "loss": 0.3324, + "step": 11578 + }, + { + "epoch": 1.3730582236452034, + "grad_norm": 1.136085427530583, + "learning_rate": 3.819931805954047e-05, + "loss": 0.2611, + "step": 11579 + }, + { + "epoch": 1.3731768054073283, + "grad_norm": 1.141521663536434, + "learning_rate": 3.8197279464815906e-05, + "loss": 0.2312, + "step": 11580 + }, + { + "epoch": 1.3732953871694533, + "grad_norm": 1.1129458080629637, + "learning_rate": 3.819524074842861e-05, + "loss": 0.2197, + "step": 11581 + }, + { + "epoch": 1.3734139689315783, + "grad_norm": 1.0712925399265956, + "learning_rate": 3.819320191039737e-05, + "loss": 0.2008, + "step": 11582 + }, + { + "epoch": 1.3735325506937033, + "grad_norm": 0.9896909329715283, + "learning_rate": 3.8191162950741e-05, + "loss": 0.2692, + "step": 11583 + }, + { + "epoch": 1.3736511324558283, + "grad_norm": 1.015216987280327, + "learning_rate": 3.8189123869478276e-05, + "loss": 0.2214, + "step": 11584 + }, + { + "epoch": 1.3737697142179532, + "grad_norm": 0.9429739214065641, + "learning_rate": 3.818708466662801e-05, + "loss": 0.1657, + "step": 11585 + }, + { + "epoch": 1.3738882959800782, + "grad_norm": 1.4891212577670847, + "learning_rate": 3.8185045342208986e-05, + "loss": 0.2929, + "step": 11586 + }, + { + "epoch": 1.3740068777422032, + "grad_norm": 1.5069521992734345, + "learning_rate": 3.818300589624002e-05, + "loss": 0.3735, + "step": 11587 + }, + { + "epoch": 1.3741254595043282, + "grad_norm": 1.2264119528591744, + "learning_rate": 3.81809663287399e-05, + "loss": 0.2318, + "step": 11588 + }, + { + "epoch": 1.3742440412664532, + "grad_norm": 1.0675630651723664, + "learning_rate": 3.817892663972743e-05, + "loss": 0.2338, + "step": 11589 + }, + { + "epoch": 1.3743626230285781, + "grad_norm": 0.9458422756799072, + "learning_rate": 3.817688682922143e-05, + "loss": 0.2484, + "step": 11590 + }, + { + "epoch": 1.3744812047907031, + "grad_norm": 0.9475420898628014, + "learning_rate": 3.817484689724069e-05, + "loss": 0.2186, + "step": 11591 + }, + { + "epoch": 1.374599786552828, + "grad_norm": 1.0410249974497925, + "learning_rate": 3.817280684380401e-05, + "loss": 0.2179, + "step": 11592 + }, + { + "epoch": 1.374718368314953, + "grad_norm": 0.9529085605981507, + "learning_rate": 3.817076666893021e-05, + "loss": 0.2341, + "step": 11593 + }, + { + "epoch": 1.374836950077078, + "grad_norm": 0.9936687355267575, + "learning_rate": 3.816872637263809e-05, + "loss": 0.2282, + "step": 11594 + }, + { + "epoch": 1.374955531839203, + "grad_norm": 1.0346597098881127, + "learning_rate": 3.8166685954946456e-05, + "loss": 0.2294, + "step": 11595 + }, + { + "epoch": 1.375074113601328, + "grad_norm": 1.0148193278832296, + "learning_rate": 3.8164645415874136e-05, + "loss": 0.2002, + "step": 11596 + }, + { + "epoch": 1.3751926953634532, + "grad_norm": 1.1855932846075419, + "learning_rate": 3.816260475543991e-05, + "loss": 0.2413, + "step": 11597 + }, + { + "epoch": 1.375311277125578, + "grad_norm": 0.9187724556456782, + "learning_rate": 3.816056397366263e-05, + "loss": 0.1612, + "step": 11598 + }, + { + "epoch": 1.3754298588877032, + "grad_norm": 1.0141195498137883, + "learning_rate": 3.8158523070561084e-05, + "loss": 0.1624, + "step": 11599 + }, + { + "epoch": 1.375548440649828, + "grad_norm": 0.8879388542556891, + "learning_rate": 3.815648204615408e-05, + "loss": 0.1734, + "step": 11600 + }, + { + "epoch": 1.3756670224119532, + "grad_norm": 0.99433493475632, + "learning_rate": 3.8154440900460456e-05, + "loss": 0.2118, + "step": 11601 + }, + { + "epoch": 1.375785604174078, + "grad_norm": 0.8045154958643767, + "learning_rate": 3.815239963349901e-05, + "loss": 0.1331, + "step": 11602 + }, + { + "epoch": 1.3759041859362031, + "grad_norm": 0.8049675780375013, + "learning_rate": 3.8150358245288576e-05, + "loss": 0.1671, + "step": 11603 + }, + { + "epoch": 1.3760227676983279, + "grad_norm": 1.1362749011551458, + "learning_rate": 3.814831673584796e-05, + "loss": 0.2832, + "step": 11604 + }, + { + "epoch": 1.376141349460453, + "grad_norm": 0.9426112946406048, + "learning_rate": 3.8146275105195986e-05, + "loss": 0.2094, + "step": 11605 + }, + { + "epoch": 1.3762599312225778, + "grad_norm": 1.0271942317580842, + "learning_rate": 3.814423335335148e-05, + "loss": 0.2218, + "step": 11606 + }, + { + "epoch": 1.376378512984703, + "grad_norm": 0.9351871789873166, + "learning_rate": 3.814219148033326e-05, + "loss": 0.2355, + "step": 11607 + }, + { + "epoch": 1.376497094746828, + "grad_norm": 0.8841298083946295, + "learning_rate": 3.8140149486160156e-05, + "loss": 0.2, + "step": 11608 + }, + { + "epoch": 1.376615676508953, + "grad_norm": 1.3034162019656979, + "learning_rate": 3.813810737085098e-05, + "loss": 0.2954, + "step": 11609 + }, + { + "epoch": 1.376734258271078, + "grad_norm": 0.851182922106267, + "learning_rate": 3.813606513442456e-05, + "loss": 0.1462, + "step": 11610 + }, + { + "epoch": 1.376852840033203, + "grad_norm": 1.0758896158256905, + "learning_rate": 3.8134022776899735e-05, + "loss": 0.2417, + "step": 11611 + }, + { + "epoch": 1.376971421795328, + "grad_norm": 0.8121975138912816, + "learning_rate": 3.813198029829532e-05, + "loss": 0.1669, + "step": 11612 + }, + { + "epoch": 1.377090003557453, + "grad_norm": 1.633042659954068, + "learning_rate": 3.812993769863016e-05, + "loss": 0.2659, + "step": 11613 + }, + { + "epoch": 1.377208585319578, + "grad_norm": 1.0667632526822566, + "learning_rate": 3.812789497792307e-05, + "loss": 0.2201, + "step": 11614 + }, + { + "epoch": 1.3773271670817029, + "grad_norm": 1.2622523259623089, + "learning_rate": 3.81258521361929e-05, + "loss": 0.2976, + "step": 11615 + }, + { + "epoch": 1.3774457488438279, + "grad_norm": 1.0675948294172282, + "learning_rate": 3.812380917345845e-05, + "loss": 0.2095, + "step": 11616 + }, + { + "epoch": 1.3775643306059528, + "grad_norm": 0.7717286231292465, + "learning_rate": 3.8121766089738586e-05, + "loss": 0.1719, + "step": 11617 + }, + { + "epoch": 1.3776829123680778, + "grad_norm": 1.109007351821086, + "learning_rate": 3.811972288505212e-05, + "loss": 0.2306, + "step": 11618 + }, + { + "epoch": 1.3778014941302028, + "grad_norm": 1.0634386313111028, + "learning_rate": 3.8117679559417894e-05, + "loss": 0.2265, + "step": 11619 + }, + { + "epoch": 1.3779200758923278, + "grad_norm": 1.7928681841100762, + "learning_rate": 3.8115636112854757e-05, + "loss": 0.3998, + "step": 11620 + }, + { + "epoch": 1.3780386576544528, + "grad_norm": 1.079541693607142, + "learning_rate": 3.811359254538154e-05, + "loss": 0.2202, + "step": 11621 + }, + { + "epoch": 1.3781572394165778, + "grad_norm": 1.2273901826041866, + "learning_rate": 3.8111548857017074e-05, + "loss": 0.2429, + "step": 11622 + }, + { + "epoch": 1.3782758211787027, + "grad_norm": 1.396203445518278, + "learning_rate": 3.810950504778021e-05, + "loss": 0.3112, + "step": 11623 + }, + { + "epoch": 1.3783944029408277, + "grad_norm": 0.9264357221236016, + "learning_rate": 3.8107461117689794e-05, + "loss": 0.2397, + "step": 11624 + }, + { + "epoch": 1.3785129847029527, + "grad_norm": 1.146154163362731, + "learning_rate": 3.810541706676465e-05, + "loss": 0.2221, + "step": 11625 + }, + { + "epoch": 1.3786315664650777, + "grad_norm": 1.1765154749311542, + "learning_rate": 3.8103372895023634e-05, + "loss": 0.2548, + "step": 11626 + }, + { + "epoch": 1.3787501482272027, + "grad_norm": 1.0137171693162335, + "learning_rate": 3.810132860248559e-05, + "loss": 0.2036, + "step": 11627 + }, + { + "epoch": 1.3788687299893276, + "grad_norm": 1.1747356694680524, + "learning_rate": 3.809928418916936e-05, + "loss": 0.2546, + "step": 11628 + }, + { + "epoch": 1.3789873117514526, + "grad_norm": 0.8178394483600226, + "learning_rate": 3.809723965509379e-05, + "loss": 0.171, + "step": 11629 + }, + { + "epoch": 1.3791058935135776, + "grad_norm": 1.0185428391802518, + "learning_rate": 3.8095195000277726e-05, + "loss": 0.239, + "step": 11630 + }, + { + "epoch": 1.3792244752757026, + "grad_norm": 1.518794680485174, + "learning_rate": 3.809315022474003e-05, + "loss": 0.3, + "step": 11631 + }, + { + "epoch": 1.3793430570378276, + "grad_norm": 1.1779802847201954, + "learning_rate": 3.809110532849955e-05, + "loss": 0.2482, + "step": 11632 + }, + { + "epoch": 1.3794616387999525, + "grad_norm": 1.16300185382879, + "learning_rate": 3.8089060311575125e-05, + "loss": 0.2317, + "step": 11633 + }, + { + "epoch": 1.3795802205620775, + "grad_norm": 0.9136232371871255, + "learning_rate": 3.808701517398562e-05, + "loss": 0.1528, + "step": 11634 + }, + { + "epoch": 1.3796988023242025, + "grad_norm": 0.9047472022175441, + "learning_rate": 3.808496991574988e-05, + "loss": 0.2689, + "step": 11635 + }, + { + "epoch": 1.3798173840863275, + "grad_norm": 1.1227952552202538, + "learning_rate": 3.8082924536886765e-05, + "loss": 0.2256, + "step": 11636 + }, + { + "epoch": 1.3799359658484525, + "grad_norm": 1.480725492619134, + "learning_rate": 3.808087903741513e-05, + "loss": 0.3263, + "step": 11637 + }, + { + "epoch": 1.3800545476105774, + "grad_norm": 1.33639984186706, + "learning_rate": 3.8078833417353823e-05, + "loss": 0.2109, + "step": 11638 + }, + { + "epoch": 1.3801731293727024, + "grad_norm": 0.9780719283877892, + "learning_rate": 3.807678767672171e-05, + "loss": 0.218, + "step": 11639 + }, + { + "epoch": 1.3802917111348274, + "grad_norm": 0.9316156148818309, + "learning_rate": 3.807474181553766e-05, + "loss": 0.2025, + "step": 11640 + }, + { + "epoch": 1.3804102928969524, + "grad_norm": 1.4107972915352154, + "learning_rate": 3.8072695833820526e-05, + "loss": 0.2474, + "step": 11641 + }, + { + "epoch": 1.3805288746590774, + "grad_norm": 0.9223849617949521, + "learning_rate": 3.807064973158916e-05, + "loss": 0.1914, + "step": 11642 + }, + { + "epoch": 1.3806474564212023, + "grad_norm": 1.1148120253183535, + "learning_rate": 3.8068603508862434e-05, + "loss": 0.2177, + "step": 11643 + }, + { + "epoch": 1.3807660381833273, + "grad_norm": 1.5213213522216134, + "learning_rate": 3.806655716565921e-05, + "loss": 0.3332, + "step": 11644 + }, + { + "epoch": 1.3808846199454523, + "grad_norm": 0.9529819526746945, + "learning_rate": 3.806451070199835e-05, + "loss": 0.2316, + "step": 11645 + }, + { + "epoch": 1.3810032017075775, + "grad_norm": 1.126099537681736, + "learning_rate": 3.8062464117898724e-05, + "loss": 0.266, + "step": 11646 + }, + { + "epoch": 1.3811217834697023, + "grad_norm": 1.1656469345886902, + "learning_rate": 3.80604174133792e-05, + "loss": 0.2484, + "step": 11647 + }, + { + "epoch": 1.3812403652318275, + "grad_norm": 0.8726092315288855, + "learning_rate": 3.8058370588458636e-05, + "loss": 0.1724, + "step": 11648 + }, + { + "epoch": 1.3813589469939522, + "grad_norm": 1.2099201150480496, + "learning_rate": 3.805632364315591e-05, + "loss": 0.2263, + "step": 11649 + }, + { + "epoch": 1.3814775287560774, + "grad_norm": 2.072644380056955, + "learning_rate": 3.8054276577489886e-05, + "loss": 0.5645, + "step": 11650 + }, + { + "epoch": 1.3815961105182022, + "grad_norm": 2.3265894384322734, + "learning_rate": 3.805222939147945e-05, + "loss": 0.6096, + "step": 11651 + }, + { + "epoch": 1.3817146922803274, + "grad_norm": 0.9084863985400732, + "learning_rate": 3.805018208514347e-05, + "loss": 0.1949, + "step": 11652 + }, + { + "epoch": 1.3818332740424522, + "grad_norm": 1.1455596802559262, + "learning_rate": 3.80481346585008e-05, + "loss": 0.2712, + "step": 11653 + }, + { + "epoch": 1.3819518558045774, + "grad_norm": 1.3016917926864515, + "learning_rate": 3.804608711157034e-05, + "loss": 0.2982, + "step": 11654 + }, + { + "epoch": 1.3820704375667021, + "grad_norm": 1.1178173485500835, + "learning_rate": 3.804403944437095e-05, + "loss": 0.2413, + "step": 11655 + }, + { + "epoch": 1.3821890193288273, + "grad_norm": 1.0399181061615257, + "learning_rate": 3.804199165692151e-05, + "loss": 0.2305, + "step": 11656 + }, + { + "epoch": 1.382307601090952, + "grad_norm": 1.152040362215518, + "learning_rate": 3.8039943749240905e-05, + "loss": 0.3089, + "step": 11657 + }, + { + "epoch": 1.3824261828530773, + "grad_norm": 1.286512207853283, + "learning_rate": 3.8037895721348007e-05, + "loss": 0.2987, + "step": 11658 + }, + { + "epoch": 1.3825447646152023, + "grad_norm": 1.1408209034155599, + "learning_rate": 3.8035847573261695e-05, + "loss": 0.2888, + "step": 11659 + }, + { + "epoch": 1.3826633463773272, + "grad_norm": 1.3056536833291719, + "learning_rate": 3.8033799305000856e-05, + "loss": 0.3922, + "step": 11660 + }, + { + "epoch": 1.3827819281394522, + "grad_norm": 1.0222787632008368, + "learning_rate": 3.803175091658437e-05, + "loss": 0.243, + "step": 11661 + }, + { + "epoch": 1.3829005099015772, + "grad_norm": 0.8423498880318245, + "learning_rate": 3.802970240803112e-05, + "loss": 0.1834, + "step": 11662 + }, + { + "epoch": 1.3830190916637022, + "grad_norm": 0.9798343266481615, + "learning_rate": 3.8027653779359995e-05, + "loss": 0.264, + "step": 11663 + }, + { + "epoch": 1.3831376734258272, + "grad_norm": 1.515512063306586, + "learning_rate": 3.802560503058988e-05, + "loss": 0.4355, + "step": 11664 + }, + { + "epoch": 1.3832562551879521, + "grad_norm": 1.3241622408159397, + "learning_rate": 3.802355616173966e-05, + "loss": 0.2996, + "step": 11665 + }, + { + "epoch": 1.3833748369500771, + "grad_norm": 1.027519187556859, + "learning_rate": 3.802150717282822e-05, + "loss": 0.2413, + "step": 11666 + }, + { + "epoch": 1.383493418712202, + "grad_norm": 1.3940291603035448, + "learning_rate": 3.8019458063874445e-05, + "loss": 0.2797, + "step": 11667 + }, + { + "epoch": 1.383612000474327, + "grad_norm": 1.1145297411824455, + "learning_rate": 3.801740883489724e-05, + "loss": 0.241, + "step": 11668 + }, + { + "epoch": 1.383730582236452, + "grad_norm": 1.9317419340792936, + "learning_rate": 3.801535948591548e-05, + "loss": 0.2815, + "step": 11669 + }, + { + "epoch": 1.383849163998577, + "grad_norm": 1.336011361524933, + "learning_rate": 3.801331001694808e-05, + "loss": 0.3236, + "step": 11670 + }, + { + "epoch": 1.383967745760702, + "grad_norm": 0.8520568292283144, + "learning_rate": 3.801126042801391e-05, + "loss": 0.1993, + "step": 11671 + }, + { + "epoch": 1.384086327522827, + "grad_norm": 1.1248951978316697, + "learning_rate": 3.8009210719131876e-05, + "loss": 0.3045, + "step": 11672 + }, + { + "epoch": 1.384204909284952, + "grad_norm": 0.9817534650911878, + "learning_rate": 3.800716089032088e-05, + "loss": 0.2435, + "step": 11673 + }, + { + "epoch": 1.384323491047077, + "grad_norm": 1.0523897454457578, + "learning_rate": 3.80051109415998e-05, + "loss": 0.2522, + "step": 11674 + }, + { + "epoch": 1.384442072809202, + "grad_norm": 0.950664754110988, + "learning_rate": 3.800306087298755e-05, + "loss": 0.1879, + "step": 11675 + }, + { + "epoch": 1.384560654571327, + "grad_norm": 0.8518874958355376, + "learning_rate": 3.8001010684503023e-05, + "loss": 0.1772, + "step": 11676 + }, + { + "epoch": 1.384679236333452, + "grad_norm": 0.9346029666852868, + "learning_rate": 3.799896037616513e-05, + "loss": 0.2269, + "step": 11677 + }, + { + "epoch": 1.384797818095577, + "grad_norm": 0.9342471304717896, + "learning_rate": 3.799690994799275e-05, + "loss": 0.2379, + "step": 11678 + }, + { + "epoch": 1.3849163998577019, + "grad_norm": 0.7594883705202069, + "learning_rate": 3.79948594000048e-05, + "loss": 0.1857, + "step": 11679 + }, + { + "epoch": 1.3850349816198269, + "grad_norm": 1.1956819616438479, + "learning_rate": 3.7992808732220174e-05, + "loss": 0.2236, + "step": 11680 + }, + { + "epoch": 1.3851535633819518, + "grad_norm": 0.9501790191938247, + "learning_rate": 3.7990757944657795e-05, + "loss": 0.2102, + "step": 11681 + }, + { + "epoch": 1.3852721451440768, + "grad_norm": 1.0479365000232284, + "learning_rate": 3.798870703733656e-05, + "loss": 0.2162, + "step": 11682 + }, + { + "epoch": 1.3853907269062018, + "grad_norm": 0.9181600166156222, + "learning_rate": 3.798665601027537e-05, + "loss": 0.2081, + "step": 11683 + }, + { + "epoch": 1.3855093086683268, + "grad_norm": 1.1384317201913023, + "learning_rate": 3.798460486349314e-05, + "loss": 0.2052, + "step": 11684 + }, + { + "epoch": 1.3856278904304518, + "grad_norm": 1.375226648540397, + "learning_rate": 3.798255359700877e-05, + "loss": 0.3145, + "step": 11685 + }, + { + "epoch": 1.3857464721925767, + "grad_norm": 0.9720490622329074, + "learning_rate": 3.7980502210841184e-05, + "loss": 0.1543, + "step": 11686 + }, + { + "epoch": 1.3858650539547017, + "grad_norm": 1.0974139392926017, + "learning_rate": 3.7978450705009285e-05, + "loss": 0.2259, + "step": 11687 + }, + { + "epoch": 1.3859836357168267, + "grad_norm": 1.1655074874349016, + "learning_rate": 3.797639907953198e-05, + "loss": 0.2282, + "step": 11688 + }, + { + "epoch": 1.3861022174789517, + "grad_norm": 1.1676214107590557, + "learning_rate": 3.7974347334428186e-05, + "loss": 0.2547, + "step": 11689 + }, + { + "epoch": 1.3862207992410767, + "grad_norm": 0.9811833613767394, + "learning_rate": 3.7972295469716824e-05, + "loss": 0.1722, + "step": 11690 + }, + { + "epoch": 1.3863393810032016, + "grad_norm": 0.9217234419740634, + "learning_rate": 3.79702434854168e-05, + "loss": 0.2052, + "step": 11691 + }, + { + "epoch": 1.3864579627653266, + "grad_norm": 1.249622425796674, + "learning_rate": 3.7968191381547044e-05, + "loss": 0.2479, + "step": 11692 + }, + { + "epoch": 1.3865765445274516, + "grad_norm": 0.9470408896613219, + "learning_rate": 3.796613915812647e-05, + "loss": 0.1989, + "step": 11693 + }, + { + "epoch": 1.3866951262895766, + "grad_norm": 1.0287463204310414, + "learning_rate": 3.7964086815173985e-05, + "loss": 0.2309, + "step": 11694 + }, + { + "epoch": 1.3868137080517016, + "grad_norm": 1.2883314885896133, + "learning_rate": 3.796203435270852e-05, + "loss": 0.3235, + "step": 11695 + }, + { + "epoch": 1.3869322898138265, + "grad_norm": 1.1902605012635654, + "learning_rate": 3.795998177074899e-05, + "loss": 0.2782, + "step": 11696 + }, + { + "epoch": 1.3870508715759517, + "grad_norm": 0.8943886225828693, + "learning_rate": 3.795792906931432e-05, + "loss": 0.217, + "step": 11697 + }, + { + "epoch": 1.3871694533380765, + "grad_norm": 1.0353547084222592, + "learning_rate": 3.795587624842344e-05, + "loss": 0.208, + "step": 11698 + }, + { + "epoch": 1.3872880351002017, + "grad_norm": 0.9753218361465616, + "learning_rate": 3.795382330809526e-05, + "loss": 0.2515, + "step": 11699 + }, + { + "epoch": 1.3874066168623265, + "grad_norm": 1.2240788452078855, + "learning_rate": 3.7951770248348714e-05, + "loss": 0.2029, + "step": 11700 + }, + { + "epoch": 1.3875251986244517, + "grad_norm": 0.849879815525532, + "learning_rate": 3.794971706920274e-05, + "loss": 0.1781, + "step": 11701 + }, + { + "epoch": 1.3876437803865764, + "grad_norm": 1.2670248028831297, + "learning_rate": 3.794766377067624e-05, + "loss": 0.2519, + "step": 11702 + }, + { + "epoch": 1.3877623621487016, + "grad_norm": 0.870006188856422, + "learning_rate": 3.794561035278818e-05, + "loss": 0.1735, + "step": 11703 + }, + { + "epoch": 1.3878809439108264, + "grad_norm": 1.1249852039877075, + "learning_rate": 3.794355681555745e-05, + "loss": 0.2285, + "step": 11704 + }, + { + "epoch": 1.3879995256729516, + "grad_norm": 1.236465061605593, + "learning_rate": 3.7941503159003e-05, + "loss": 0.2502, + "step": 11705 + }, + { + "epoch": 1.3881181074350764, + "grad_norm": 1.3198892442342647, + "learning_rate": 3.7939449383143765e-05, + "loss": 0.2455, + "step": 11706 + }, + { + "epoch": 1.3882366891972016, + "grad_norm": 0.8376962412763824, + "learning_rate": 3.793739548799867e-05, + "loss": 0.1525, + "step": 11707 + }, + { + "epoch": 1.3883552709593265, + "grad_norm": 0.8736679623474605, + "learning_rate": 3.793534147358666e-05, + "loss": 0.2194, + "step": 11708 + }, + { + "epoch": 1.3884738527214515, + "grad_norm": 0.9773442219153157, + "learning_rate": 3.793328733992666e-05, + "loss": 0.2265, + "step": 11709 + }, + { + "epoch": 1.3885924344835765, + "grad_norm": 0.9678286249321079, + "learning_rate": 3.793123308703761e-05, + "loss": 0.1702, + "step": 11710 + }, + { + "epoch": 1.3887110162457015, + "grad_norm": 1.3757003915674009, + "learning_rate": 3.792917871493844e-05, + "loss": 0.3329, + "step": 11711 + }, + { + "epoch": 1.3888295980078265, + "grad_norm": 1.307952078222463, + "learning_rate": 3.792712422364812e-05, + "loss": 0.2832, + "step": 11712 + }, + { + "epoch": 1.3889481797699514, + "grad_norm": 1.1528172371530734, + "learning_rate": 3.7925069613185544e-05, + "loss": 0.299, + "step": 11713 + }, + { + "epoch": 1.3890667615320764, + "grad_norm": 1.33519536498042, + "learning_rate": 3.792301488356969e-05, + "loss": 0.2431, + "step": 11714 + }, + { + "epoch": 1.3891853432942014, + "grad_norm": 1.3736881808250359, + "learning_rate": 3.7920960034819474e-05, + "loss": 0.2634, + "step": 11715 + }, + { + "epoch": 1.3893039250563264, + "grad_norm": 0.8250222084118504, + "learning_rate": 3.7918905066953856e-05, + "loss": 0.1841, + "step": 11716 + }, + { + "epoch": 1.3894225068184514, + "grad_norm": 1.1041381911855892, + "learning_rate": 3.791684997999178e-05, + "loss": 0.2795, + "step": 11717 + }, + { + "epoch": 1.3895410885805763, + "grad_norm": 1.04829278744971, + "learning_rate": 3.7914794773952186e-05, + "loss": 0.2013, + "step": 11718 + }, + { + "epoch": 1.3896596703427013, + "grad_norm": 0.9036315199865541, + "learning_rate": 3.7912739448854015e-05, + "loss": 0.211, + "step": 11719 + }, + { + "epoch": 1.3897782521048263, + "grad_norm": 1.2388078387566788, + "learning_rate": 3.791068400471623e-05, + "loss": 0.3346, + "step": 11720 + }, + { + "epoch": 1.3898968338669513, + "grad_norm": 1.0278791585679479, + "learning_rate": 3.790862844155776e-05, + "loss": 0.2372, + "step": 11721 + }, + { + "epoch": 1.3900154156290763, + "grad_norm": 1.1933092327746997, + "learning_rate": 3.7906572759397574e-05, + "loss": 0.2939, + "step": 11722 + }, + { + "epoch": 1.3901339973912012, + "grad_norm": 0.8871064739745295, + "learning_rate": 3.790451695825461e-05, + "loss": 0.1589, + "step": 11723 + }, + { + "epoch": 1.3902525791533262, + "grad_norm": 0.8868960492789336, + "learning_rate": 3.790246103814783e-05, + "loss": 0.1856, + "step": 11724 + }, + { + "epoch": 1.3903711609154512, + "grad_norm": 0.9453466696642369, + "learning_rate": 3.7900404999096176e-05, + "loss": 0.1974, + "step": 11725 + }, + { + "epoch": 1.3904897426775762, + "grad_norm": 1.146836015019331, + "learning_rate": 3.789834884111861e-05, + "loss": 0.319, + "step": 11726 + }, + { + "epoch": 1.3906083244397012, + "grad_norm": 0.7689444107893225, + "learning_rate": 3.789629256423408e-05, + "loss": 0.1597, + "step": 11727 + }, + { + "epoch": 1.3907269062018262, + "grad_norm": 0.8697322990585152, + "learning_rate": 3.789423616846155e-05, + "loss": 0.1964, + "step": 11728 + }, + { + "epoch": 1.3908454879639511, + "grad_norm": 1.4327668193771204, + "learning_rate": 3.7892179653819974e-05, + "loss": 0.2773, + "step": 11729 + }, + { + "epoch": 1.3909640697260761, + "grad_norm": 1.0005031257388581, + "learning_rate": 3.789012302032831e-05, + "loss": 0.2074, + "step": 11730 + }, + { + "epoch": 1.391082651488201, + "grad_norm": 0.8623139045757928, + "learning_rate": 3.788806626800553e-05, + "loss": 0.1771, + "step": 11731 + }, + { + "epoch": 1.391201233250326, + "grad_norm": 1.07298305362667, + "learning_rate": 3.788600939687057e-05, + "loss": 0.2378, + "step": 11732 + }, + { + "epoch": 1.391319815012451, + "grad_norm": 1.1404889360853088, + "learning_rate": 3.788395240694241e-05, + "loss": 0.2385, + "step": 11733 + }, + { + "epoch": 1.391438396774576, + "grad_norm": 1.2103331975844942, + "learning_rate": 3.788189529824e-05, + "loss": 0.2908, + "step": 11734 + }, + { + "epoch": 1.391556978536701, + "grad_norm": 2.7156189389018635, + "learning_rate": 3.787983807078233e-05, + "loss": 0.1935, + "step": 11735 + }, + { + "epoch": 1.391675560298826, + "grad_norm": 1.5896943841687383, + "learning_rate": 3.787778072458833e-05, + "loss": 0.2962, + "step": 11736 + }, + { + "epoch": 1.391794142060951, + "grad_norm": 1.4440622629811657, + "learning_rate": 3.7875723259677e-05, + "loss": 0.3253, + "step": 11737 + }, + { + "epoch": 1.391912723823076, + "grad_norm": 0.8190177231102774, + "learning_rate": 3.787366567606727e-05, + "loss": 0.1758, + "step": 11738 + }, + { + "epoch": 1.392031305585201, + "grad_norm": 1.6331976136890054, + "learning_rate": 3.7871607973778144e-05, + "loss": 0.4214, + "step": 11739 + }, + { + "epoch": 1.392149887347326, + "grad_norm": 1.2860629235938787, + "learning_rate": 3.786955015282857e-05, + "loss": 0.223, + "step": 11740 + }, + { + "epoch": 1.392268469109451, + "grad_norm": 1.1303607993400593, + "learning_rate": 3.786749221323752e-05, + "loss": 0.2812, + "step": 11741 + }, + { + "epoch": 1.3923870508715759, + "grad_norm": 1.3279355384013456, + "learning_rate": 3.786543415502399e-05, + "loss": 0.2755, + "step": 11742 + }, + { + "epoch": 1.3925056326337009, + "grad_norm": 0.9986038301181903, + "learning_rate": 3.7863375978206915e-05, + "loss": 0.2531, + "step": 11743 + }, + { + "epoch": 1.3926242143958258, + "grad_norm": 0.8311163384039943, + "learning_rate": 3.78613176828053e-05, + "loss": 0.1935, + "step": 11744 + }, + { + "epoch": 1.3927427961579508, + "grad_norm": 0.8308046324562159, + "learning_rate": 3.7859259268838106e-05, + "loss": 0.1863, + "step": 11745 + }, + { + "epoch": 1.392861377920076, + "grad_norm": 2.5046081212450813, + "learning_rate": 3.7857200736324305e-05, + "loss": 0.271, + "step": 11746 + }, + { + "epoch": 1.3929799596822008, + "grad_norm": 0.8575847534885975, + "learning_rate": 3.7855142085282887e-05, + "loss": 0.18, + "step": 11747 + }, + { + "epoch": 1.393098541444326, + "grad_norm": 0.8984090271342068, + "learning_rate": 3.785308331573282e-05, + "loss": 0.1848, + "step": 11748 + }, + { + "epoch": 1.3932171232064507, + "grad_norm": 1.2230106267321248, + "learning_rate": 3.785102442769308e-05, + "loss": 0.2375, + "step": 11749 + }, + { + "epoch": 1.393335704968576, + "grad_norm": 1.1171287686649092, + "learning_rate": 3.784896542118266e-05, + "loss": 0.2653, + "step": 11750 + }, + { + "epoch": 1.3934542867307007, + "grad_norm": 1.063427377403751, + "learning_rate": 3.784690629622053e-05, + "loss": 0.206, + "step": 11751 + }, + { + "epoch": 1.393572868492826, + "grad_norm": 0.8617395483586597, + "learning_rate": 3.784484705282568e-05, + "loss": 0.1761, + "step": 11752 + }, + { + "epoch": 1.3936914502549507, + "grad_norm": 1.0298999241052507, + "learning_rate": 3.7842787691017096e-05, + "loss": 0.2664, + "step": 11753 + }, + { + "epoch": 1.3938100320170759, + "grad_norm": 1.0384370806660737, + "learning_rate": 3.7840728210813756e-05, + "loss": 0.2322, + "step": 11754 + }, + { + "epoch": 1.3939286137792006, + "grad_norm": 1.1372328081606993, + "learning_rate": 3.783866861223465e-05, + "loss": 0.2598, + "step": 11755 + }, + { + "epoch": 1.3940471955413258, + "grad_norm": 0.9977061737362231, + "learning_rate": 3.783660889529876e-05, + "loss": 0.1986, + "step": 11756 + }, + { + "epoch": 1.3941657773034506, + "grad_norm": 1.4764942900584554, + "learning_rate": 3.7834549060025084e-05, + "loss": 0.2102, + "step": 11757 + }, + { + "epoch": 1.3942843590655758, + "grad_norm": 1.0221256093147484, + "learning_rate": 3.783248910643259e-05, + "loss": 0.1585, + "step": 11758 + }, + { + "epoch": 1.3944029408277008, + "grad_norm": 0.797291166634366, + "learning_rate": 3.783042903454029e-05, + "loss": 0.1783, + "step": 11759 + }, + { + "epoch": 1.3945215225898258, + "grad_norm": 0.8529992128847337, + "learning_rate": 3.782836884436717e-05, + "loss": 0.1711, + "step": 11760 + }, + { + "epoch": 1.3946401043519507, + "grad_norm": 1.3322098087604564, + "learning_rate": 3.782630853593222e-05, + "loss": 0.3112, + "step": 11761 + }, + { + "epoch": 1.3947586861140757, + "grad_norm": 1.273044098823193, + "learning_rate": 3.782424810925444e-05, + "loss": 0.1832, + "step": 11762 + }, + { + "epoch": 1.3948772678762007, + "grad_norm": 1.6608952704991797, + "learning_rate": 3.782218756435281e-05, + "loss": 0.3393, + "step": 11763 + }, + { + "epoch": 1.3949958496383257, + "grad_norm": 0.9104161438772452, + "learning_rate": 3.7820126901246334e-05, + "loss": 0.1875, + "step": 11764 + }, + { + "epoch": 1.3951144314004507, + "grad_norm": 1.1481563404347932, + "learning_rate": 3.781806611995401e-05, + "loss": 0.2359, + "step": 11765 + }, + { + "epoch": 1.3952330131625756, + "grad_norm": 1.173275331474507, + "learning_rate": 3.781600522049484e-05, + "loss": 0.2571, + "step": 11766 + }, + { + "epoch": 1.3953515949247006, + "grad_norm": 1.1470768540722223, + "learning_rate": 3.781394420288781e-05, + "loss": 0.2518, + "step": 11767 + }, + { + "epoch": 1.3954701766868256, + "grad_norm": 1.1107885629881789, + "learning_rate": 3.7811883067151935e-05, + "loss": 0.2457, + "step": 11768 + }, + { + "epoch": 1.3955887584489506, + "grad_norm": 2.490921613395685, + "learning_rate": 3.78098218133062e-05, + "loss": 0.2939, + "step": 11769 + }, + { + "epoch": 1.3957073402110756, + "grad_norm": 1.0231487679148645, + "learning_rate": 3.780776044136963e-05, + "loss": 0.2085, + "step": 11770 + }, + { + "epoch": 1.3958259219732005, + "grad_norm": 0.9406222292479104, + "learning_rate": 3.78056989513612e-05, + "loss": 0.1853, + "step": 11771 + }, + { + "epoch": 1.3959445037353255, + "grad_norm": 1.0062257216473427, + "learning_rate": 3.780363734329994e-05, + "loss": 0.1847, + "step": 11772 + }, + { + "epoch": 1.3960630854974505, + "grad_norm": 1.3313921377081395, + "learning_rate": 3.780157561720484e-05, + "loss": 0.2355, + "step": 11773 + }, + { + "epoch": 1.3961816672595755, + "grad_norm": 1.011055802173935, + "learning_rate": 3.779951377309492e-05, + "loss": 0.1834, + "step": 11774 + }, + { + "epoch": 1.3963002490217005, + "grad_norm": 1.3306998606902443, + "learning_rate": 3.7797451810989166e-05, + "loss": 0.2434, + "step": 11775 + }, + { + "epoch": 1.3964188307838254, + "grad_norm": 1.0363478591591835, + "learning_rate": 3.7795389730906604e-05, + "loss": 0.2588, + "step": 11776 + }, + { + "epoch": 1.3965374125459504, + "grad_norm": 0.9159301178298277, + "learning_rate": 3.779332753286624e-05, + "loss": 0.2019, + "step": 11777 + }, + { + "epoch": 1.3966559943080754, + "grad_norm": 1.1827818095373892, + "learning_rate": 3.7791265216887085e-05, + "loss": 0.2138, + "step": 11778 + }, + { + "epoch": 1.3967745760702004, + "grad_norm": 0.9246945868409026, + "learning_rate": 3.778920278298814e-05, + "loss": 0.1631, + "step": 11779 + }, + { + "epoch": 1.3968931578323254, + "grad_norm": 0.9405347278233707, + "learning_rate": 3.7787140231188446e-05, + "loss": 0.2274, + "step": 11780 + }, + { + "epoch": 1.3970117395944504, + "grad_norm": 0.9645836219093814, + "learning_rate": 3.7785077561506986e-05, + "loss": 0.2364, + "step": 11781 + }, + { + "epoch": 1.3971303213565753, + "grad_norm": 1.0131436963429135, + "learning_rate": 3.77830147739628e-05, + "loss": 0.2385, + "step": 11782 + }, + { + "epoch": 1.3972489031187003, + "grad_norm": 1.2703098196935425, + "learning_rate": 3.778095186857489e-05, + "loss": 0.2502, + "step": 11783 + }, + { + "epoch": 1.3973674848808253, + "grad_norm": 1.2852045851020282, + "learning_rate": 3.777888884536227e-05, + "loss": 0.2135, + "step": 11784 + }, + { + "epoch": 1.3974860666429503, + "grad_norm": 0.8306602826284123, + "learning_rate": 3.7776825704343975e-05, + "loss": 0.1908, + "step": 11785 + }, + { + "epoch": 1.3976046484050753, + "grad_norm": 1.0203139471756995, + "learning_rate": 3.777476244553901e-05, + "loss": 0.1976, + "step": 11786 + }, + { + "epoch": 1.3977232301672002, + "grad_norm": 1.4870112607633188, + "learning_rate": 3.777269906896641e-05, + "loss": 0.2798, + "step": 11787 + }, + { + "epoch": 1.3978418119293252, + "grad_norm": 1.0975671781888432, + "learning_rate": 3.777063557464517e-05, + "loss": 0.2967, + "step": 11788 + }, + { + "epoch": 1.3979603936914502, + "grad_norm": 1.2504346582059775, + "learning_rate": 3.7768571962594333e-05, + "loss": 0.2741, + "step": 11789 + }, + { + "epoch": 1.3980789754535752, + "grad_norm": 1.3340061862501758, + "learning_rate": 3.776650823283293e-05, + "loss": 0.3024, + "step": 11790 + }, + { + "epoch": 1.3981975572157002, + "grad_norm": 1.047190534672841, + "learning_rate": 3.776444438537997e-05, + "loss": 0.2514, + "step": 11791 + }, + { + "epoch": 1.3983161389778251, + "grad_norm": 0.9404263242038233, + "learning_rate": 3.77623804202545e-05, + "loss": 0.149, + "step": 11792 + }, + { + "epoch": 1.3984347207399501, + "grad_norm": 1.7352635522962636, + "learning_rate": 3.776031633747551e-05, + "loss": 0.301, + "step": 11793 + }, + { + "epoch": 1.398553302502075, + "grad_norm": 1.6666972031718896, + "learning_rate": 3.775825213706207e-05, + "loss": 0.4312, + "step": 11794 + }, + { + "epoch": 1.3986718842642003, + "grad_norm": 0.7878124719931893, + "learning_rate": 3.7756187819033176e-05, + "loss": 0.1575, + "step": 11795 + }, + { + "epoch": 1.398790466026325, + "grad_norm": 1.0766199308433646, + "learning_rate": 3.775412338340788e-05, + "loss": 0.2398, + "step": 11796 + }, + { + "epoch": 1.3989090477884503, + "grad_norm": 1.0052987778744344, + "learning_rate": 3.775205883020521e-05, + "loss": 0.199, + "step": 11797 + }, + { + "epoch": 1.399027629550575, + "grad_norm": 0.8259400668321436, + "learning_rate": 3.7749994159444194e-05, + "loss": 0.1459, + "step": 11798 + }, + { + "epoch": 1.3991462113127002, + "grad_norm": 0.9589804964011711, + "learning_rate": 3.7747929371143855e-05, + "loss": 0.1789, + "step": 11799 + }, + { + "epoch": 1.399264793074825, + "grad_norm": 0.852149364912768, + "learning_rate": 3.774586446532326e-05, + "loss": 0.1561, + "step": 11800 + }, + { + "epoch": 1.3993833748369502, + "grad_norm": 1.4971129761212456, + "learning_rate": 3.77437994420014e-05, + "loss": 0.2937, + "step": 11801 + }, + { + "epoch": 1.399501956599075, + "grad_norm": 0.8262874223727408, + "learning_rate": 3.7741734301197365e-05, + "loss": 0.1406, + "step": 11802 + }, + { + "epoch": 1.3996205383612002, + "grad_norm": 0.680667999385187, + "learning_rate": 3.773966904293014e-05, + "loss": 0.1435, + "step": 11803 + }, + { + "epoch": 1.399739120123325, + "grad_norm": 1.18648017920351, + "learning_rate": 3.77376036672188e-05, + "loss": 0.2545, + "step": 11804 + }, + { + "epoch": 1.3998577018854501, + "grad_norm": 1.0060454719375114, + "learning_rate": 3.7735538174082376e-05, + "loss": 0.1967, + "step": 11805 + }, + { + "epoch": 1.3999762836475749, + "grad_norm": 1.0444755755816888, + "learning_rate": 3.773347256353991e-05, + "loss": 0.2311, + "step": 11806 + }, + { + "epoch": 1.4000948654097, + "grad_norm": 1.1484352785090235, + "learning_rate": 3.773140683561043e-05, + "loss": 0.2349, + "step": 11807 + }, + { + "epoch": 1.400213447171825, + "grad_norm": 0.8217323013419425, + "learning_rate": 3.7729340990312996e-05, + "loss": 0.1687, + "step": 11808 + }, + { + "epoch": 1.40033202893395, + "grad_norm": 1.0959543801613512, + "learning_rate": 3.772727502766665e-05, + "loss": 0.187, + "step": 11809 + }, + { + "epoch": 1.400450610696075, + "grad_norm": 1.0823676514694107, + "learning_rate": 3.7725208947690424e-05, + "loss": 0.2184, + "step": 11810 + }, + { + "epoch": 1.4005691924582, + "grad_norm": 1.044664026383668, + "learning_rate": 3.7723142750403394e-05, + "loss": 0.2033, + "step": 11811 + }, + { + "epoch": 1.400687774220325, + "grad_norm": 1.2886690514722077, + "learning_rate": 3.772107643582459e-05, + "loss": 0.2838, + "step": 11812 + }, + { + "epoch": 1.40080635598245, + "grad_norm": 0.9545228452033405, + "learning_rate": 3.771901000397305e-05, + "loss": 0.2544, + "step": 11813 + }, + { + "epoch": 1.400924937744575, + "grad_norm": 1.4028537881836198, + "learning_rate": 3.7716943454867835e-05, + "loss": 0.3332, + "step": 11814 + }, + { + "epoch": 1.4010435195067, + "grad_norm": 1.7008168368856453, + "learning_rate": 3.7714876788528004e-05, + "loss": 0.3563, + "step": 11815 + }, + { + "epoch": 1.401162101268825, + "grad_norm": 0.9842667966036264, + "learning_rate": 3.77128100049726e-05, + "loss": 0.2259, + "step": 11816 + }, + { + "epoch": 1.4012806830309499, + "grad_norm": 1.5080300995092297, + "learning_rate": 3.771074310422067e-05, + "loss": 0.3236, + "step": 11817 + }, + { + "epoch": 1.4013992647930749, + "grad_norm": 1.1789850316031516, + "learning_rate": 3.770867608629128e-05, + "loss": 0.222, + "step": 11818 + }, + { + "epoch": 1.4015178465551998, + "grad_norm": 1.2054065142076724, + "learning_rate": 3.7706608951203476e-05, + "loss": 0.2333, + "step": 11819 + }, + { + "epoch": 1.4016364283173248, + "grad_norm": 0.8048241079781335, + "learning_rate": 3.770454169897633e-05, + "loss": 0.1812, + "step": 11820 + }, + { + "epoch": 1.4017550100794498, + "grad_norm": 1.12448350324929, + "learning_rate": 3.770247432962888e-05, + "loss": 0.2011, + "step": 11821 + }, + { + "epoch": 1.4018735918415748, + "grad_norm": 0.9458123167953258, + "learning_rate": 3.770040684318019e-05, + "loss": 0.1709, + "step": 11822 + }, + { + "epoch": 1.4019921736036998, + "grad_norm": 1.1186694402959132, + "learning_rate": 3.7698339239649336e-05, + "loss": 0.2494, + "step": 11823 + }, + { + "epoch": 1.4021107553658247, + "grad_norm": 1.4487726670457945, + "learning_rate": 3.7696271519055354e-05, + "loss": 0.3046, + "step": 11824 + }, + { + "epoch": 1.4022293371279497, + "grad_norm": 1.221742077414103, + "learning_rate": 3.769420368141732e-05, + "loss": 0.2571, + "step": 11825 + }, + { + "epoch": 1.4023479188900747, + "grad_norm": 0.8406633998141934, + "learning_rate": 3.76921357267543e-05, + "loss": 0.1916, + "step": 11826 + }, + { + "epoch": 1.4024665006521997, + "grad_norm": 1.039797084956976, + "learning_rate": 3.769006765508535e-05, + "loss": 0.2388, + "step": 11827 + }, + { + "epoch": 1.4025850824143247, + "grad_norm": 0.8710523986047978, + "learning_rate": 3.7687999466429534e-05, + "loss": 0.1779, + "step": 11828 + }, + { + "epoch": 1.4027036641764496, + "grad_norm": 1.0326667033584702, + "learning_rate": 3.768593116080593e-05, + "loss": 0.198, + "step": 11829 + }, + { + "epoch": 1.4028222459385746, + "grad_norm": 0.8118419291513017, + "learning_rate": 3.7683862738233594e-05, + "loss": 0.2045, + "step": 11830 + }, + { + "epoch": 1.4029408277006996, + "grad_norm": 0.9993917635933244, + "learning_rate": 3.768179419873159e-05, + "loss": 0.1749, + "step": 11831 + }, + { + "epoch": 1.4030594094628246, + "grad_norm": 1.0837966681869857, + "learning_rate": 3.767972554231901e-05, + "loss": 0.2317, + "step": 11832 + }, + { + "epoch": 1.4031779912249496, + "grad_norm": 0.8601307548568242, + "learning_rate": 3.7677656769014904e-05, + "loss": 0.161, + "step": 11833 + }, + { + "epoch": 1.4032965729870746, + "grad_norm": 1.1534127262282365, + "learning_rate": 3.767558787883834e-05, + "loss": 0.2261, + "step": 11834 + }, + { + "epoch": 1.4034151547491995, + "grad_norm": 1.5879504687784103, + "learning_rate": 3.7673518871808404e-05, + "loss": 0.3387, + "step": 11835 + }, + { + "epoch": 1.4035337365113245, + "grad_norm": 1.2978821830014682, + "learning_rate": 3.7671449747944174e-05, + "loss": 0.2628, + "step": 11836 + }, + { + "epoch": 1.4036523182734495, + "grad_norm": 0.9078406631796384, + "learning_rate": 3.766938050726471e-05, + "loss": 0.1979, + "step": 11837 + }, + { + "epoch": 1.4037709000355745, + "grad_norm": 1.0414034786146722, + "learning_rate": 3.7667311149789084e-05, + "loss": 0.2022, + "step": 11838 + }, + { + "epoch": 1.4038894817976995, + "grad_norm": 1.0819858225469132, + "learning_rate": 3.766524167553639e-05, + "loss": 0.2153, + "step": 11839 + }, + { + "epoch": 1.4040080635598244, + "grad_norm": 1.0792031994468052, + "learning_rate": 3.76631720845257e-05, + "loss": 0.2576, + "step": 11840 + }, + { + "epoch": 1.4041266453219494, + "grad_norm": 1.425698747719983, + "learning_rate": 3.7661102376776094e-05, + "loss": 0.2905, + "step": 11841 + }, + { + "epoch": 1.4042452270840744, + "grad_norm": 1.1790534461351472, + "learning_rate": 3.765903255230665e-05, + "loss": 0.2384, + "step": 11842 + }, + { + "epoch": 1.4043638088461994, + "grad_norm": 0.9195574251849155, + "learning_rate": 3.765696261113645e-05, + "loss": 0.2024, + "step": 11843 + }, + { + "epoch": 1.4044823906083244, + "grad_norm": 1.5577361842521718, + "learning_rate": 3.765489255328457e-05, + "loss": 0.2465, + "step": 11844 + }, + { + "epoch": 1.4046009723704493, + "grad_norm": 1.0650532456756472, + "learning_rate": 3.765282237877011e-05, + "loss": 0.2096, + "step": 11845 + }, + { + "epoch": 1.4047195541325745, + "grad_norm": 1.3015790703224803, + "learning_rate": 3.765075208761213e-05, + "loss": 0.2567, + "step": 11846 + }, + { + "epoch": 1.4048381358946993, + "grad_norm": 1.0246279394618798, + "learning_rate": 3.764868167982974e-05, + "loss": 0.214, + "step": 11847 + }, + { + "epoch": 1.4049567176568245, + "grad_norm": 1.0740297637658935, + "learning_rate": 3.764661115544201e-05, + "loss": 0.2304, + "step": 11848 + }, + { + "epoch": 1.4050752994189493, + "grad_norm": 1.0723701209678436, + "learning_rate": 3.764454051446804e-05, + "loss": 0.2377, + "step": 11849 + }, + { + "epoch": 1.4051938811810745, + "grad_norm": 1.3871986202051383, + "learning_rate": 3.76424697569269e-05, + "loss": 0.2347, + "step": 11850 + }, + { + "epoch": 1.4053124629431992, + "grad_norm": 0.7979625622890033, + "learning_rate": 3.76403988828377e-05, + "loss": 0.137, + "step": 11851 + }, + { + "epoch": 1.4054310447053244, + "grad_norm": 0.9745408744635045, + "learning_rate": 3.763832789221953e-05, + "loss": 0.2296, + "step": 11852 + }, + { + "epoch": 1.4055496264674492, + "grad_norm": 0.92467853672325, + "learning_rate": 3.7636256785091465e-05, + "loss": 0.213, + "step": 11853 + }, + { + "epoch": 1.4056682082295744, + "grad_norm": 1.1455798346712767, + "learning_rate": 3.7634185561472606e-05, + "loss": 0.2676, + "step": 11854 + }, + { + "epoch": 1.4057867899916991, + "grad_norm": 0.991628055648665, + "learning_rate": 3.7632114221382056e-05, + "loss": 0.2446, + "step": 11855 + }, + { + "epoch": 1.4059053717538244, + "grad_norm": 1.5136196999683604, + "learning_rate": 3.76300427648389e-05, + "loss": 0.3018, + "step": 11856 + }, + { + "epoch": 1.406023953515949, + "grad_norm": 1.1249043015232225, + "learning_rate": 3.762797119186224e-05, + "loss": 0.2798, + "step": 11857 + }, + { + "epoch": 1.4061425352780743, + "grad_norm": 1.1228322416095846, + "learning_rate": 3.7625899502471165e-05, + "loss": 0.2096, + "step": 11858 + }, + { + "epoch": 1.4062611170401993, + "grad_norm": 0.9952026795715397, + "learning_rate": 3.762382769668478e-05, + "loss": 0.1946, + "step": 11859 + }, + { + "epoch": 1.4063796988023243, + "grad_norm": 0.9824810018765867, + "learning_rate": 3.7621755774522194e-05, + "loss": 0.1708, + "step": 11860 + }, + { + "epoch": 1.4064982805644493, + "grad_norm": 0.9032469326634257, + "learning_rate": 3.761968373600249e-05, + "loss": 0.1902, + "step": 11861 + }, + { + "epoch": 1.4066168623265742, + "grad_norm": 1.6392592595213498, + "learning_rate": 3.7617611581144786e-05, + "loss": 0.3493, + "step": 11862 + }, + { + "epoch": 1.4067354440886992, + "grad_norm": 1.1721221996946984, + "learning_rate": 3.761553930996816e-05, + "loss": 0.2327, + "step": 11863 + }, + { + "epoch": 1.4068540258508242, + "grad_norm": 1.0964767356255427, + "learning_rate": 3.761346692249175e-05, + "loss": 0.2188, + "step": 11864 + }, + { + "epoch": 1.4069726076129492, + "grad_norm": 1.0652929925046986, + "learning_rate": 3.761139441873463e-05, + "loss": 0.2284, + "step": 11865 + }, + { + "epoch": 1.4070911893750742, + "grad_norm": 0.8087170443706976, + "learning_rate": 3.760932179871592e-05, + "loss": 0.2004, + "step": 11866 + }, + { + "epoch": 1.4072097711371991, + "grad_norm": 0.8569407398863946, + "learning_rate": 3.760724906245473e-05, + "loss": 0.2134, + "step": 11867 + }, + { + "epoch": 1.4073283528993241, + "grad_norm": 0.9533567064763148, + "learning_rate": 3.7605176209970155e-05, + "loss": 0.2306, + "step": 11868 + }, + { + "epoch": 1.407446934661449, + "grad_norm": 1.2272651058505897, + "learning_rate": 3.760310324128132e-05, + "loss": 0.2475, + "step": 11869 + }, + { + "epoch": 1.407565516423574, + "grad_norm": 1.190124156897787, + "learning_rate": 3.760103015640733e-05, + "loss": 0.2301, + "step": 11870 + }, + { + "epoch": 1.407684098185699, + "grad_norm": 1.4695834910751413, + "learning_rate": 3.7598956955367285e-05, + "loss": 0.2983, + "step": 11871 + }, + { + "epoch": 1.407802679947824, + "grad_norm": 0.9110986177479906, + "learning_rate": 3.759688363818031e-05, + "loss": 0.2242, + "step": 11872 + }, + { + "epoch": 1.407921261709949, + "grad_norm": 1.0757468142947726, + "learning_rate": 3.7594810204865525e-05, + "loss": 0.2252, + "step": 11873 + }, + { + "epoch": 1.408039843472074, + "grad_norm": 1.269907369040871, + "learning_rate": 3.759273665544203e-05, + "loss": 0.2716, + "step": 11874 + }, + { + "epoch": 1.408158425234199, + "grad_norm": 1.5294149746890948, + "learning_rate": 3.7590662989928946e-05, + "loss": 0.4076, + "step": 11875 + }, + { + "epoch": 1.408277006996324, + "grad_norm": 1.3324966677578851, + "learning_rate": 3.758858920834538e-05, + "loss": 0.2784, + "step": 11876 + }, + { + "epoch": 1.408395588758449, + "grad_norm": 0.8091879553026329, + "learning_rate": 3.758651531071046e-05, + "loss": 0.164, + "step": 11877 + }, + { + "epoch": 1.408514170520574, + "grad_norm": 1.0329661980269718, + "learning_rate": 3.758444129704331e-05, + "loss": 0.1912, + "step": 11878 + }, + { + "epoch": 1.408632752282699, + "grad_norm": 1.0661677254857778, + "learning_rate": 3.758236716736304e-05, + "loss": 0.2375, + "step": 11879 + }, + { + "epoch": 1.4087513340448239, + "grad_norm": 1.2188916312947056, + "learning_rate": 3.758029292168877e-05, + "loss": 0.2833, + "step": 11880 + }, + { + "epoch": 1.4088699158069489, + "grad_norm": 0.6169914746171681, + "learning_rate": 3.757821856003963e-05, + "loss": 0.1448, + "step": 11881 + }, + { + "epoch": 1.4089884975690739, + "grad_norm": 1.40751295717724, + "learning_rate": 3.757614408243475e-05, + "loss": 0.3378, + "step": 11882 + }, + { + "epoch": 1.4091070793311988, + "grad_norm": 1.11195650803133, + "learning_rate": 3.757406948889324e-05, + "loss": 0.2343, + "step": 11883 + }, + { + "epoch": 1.4092256610933238, + "grad_norm": 1.140903046700466, + "learning_rate": 3.7571994779434225e-05, + "loss": 0.2003, + "step": 11884 + }, + { + "epoch": 1.4093442428554488, + "grad_norm": 0.9507215302066782, + "learning_rate": 3.7569919954076824e-05, + "loss": 0.2268, + "step": 11885 + }, + { + "epoch": 1.4094628246175738, + "grad_norm": 0.7545854341329731, + "learning_rate": 3.756784501284019e-05, + "loss": 0.1802, + "step": 11886 + }, + { + "epoch": 1.4095814063796988, + "grad_norm": 1.3528067661654315, + "learning_rate": 3.756576995574343e-05, + "loss": 0.2973, + "step": 11887 + }, + { + "epoch": 1.4096999881418237, + "grad_norm": 1.387296608661351, + "learning_rate": 3.756369478280568e-05, + "loss": 0.2782, + "step": 11888 + }, + { + "epoch": 1.4098185699039487, + "grad_norm": 0.9626680342901079, + "learning_rate": 3.7561619494046076e-05, + "loss": 0.203, + "step": 11889 + }, + { + "epoch": 1.4099371516660737, + "grad_norm": 1.0318387809753797, + "learning_rate": 3.755954408948374e-05, + "loss": 0.2162, + "step": 11890 + }, + { + "epoch": 1.4100557334281987, + "grad_norm": 0.9009392023340695, + "learning_rate": 3.755746856913781e-05, + "loss": 0.1966, + "step": 11891 + }, + { + "epoch": 1.4101743151903237, + "grad_norm": 1.247204195633211, + "learning_rate": 3.755539293302742e-05, + "loss": 0.2726, + "step": 11892 + }, + { + "epoch": 1.4102928969524486, + "grad_norm": 1.6774934494591192, + "learning_rate": 3.75533171811717e-05, + "loss": 0.3455, + "step": 11893 + }, + { + "epoch": 1.4104114787145736, + "grad_norm": 1.3083019318388487, + "learning_rate": 3.755124131358979e-05, + "loss": 0.2398, + "step": 11894 + }, + { + "epoch": 1.4105300604766988, + "grad_norm": 1.1568666306791122, + "learning_rate": 3.754916533030083e-05, + "loss": 0.2584, + "step": 11895 + }, + { + "epoch": 1.4106486422388236, + "grad_norm": 0.8178442916921913, + "learning_rate": 3.7547089231323954e-05, + "loss": 0.1885, + "step": 11896 + }, + { + "epoch": 1.4107672240009488, + "grad_norm": 0.9659711051453562, + "learning_rate": 3.754501301667829e-05, + "loss": 0.2397, + "step": 11897 + }, + { + "epoch": 1.4108858057630735, + "grad_norm": 0.8020143368167996, + "learning_rate": 3.7542936686383e-05, + "loss": 0.1823, + "step": 11898 + }, + { + "epoch": 1.4110043875251987, + "grad_norm": 1.3727905450598366, + "learning_rate": 3.754086024045722e-05, + "loss": 0.3669, + "step": 11899 + }, + { + "epoch": 1.4111229692873235, + "grad_norm": 0.9167431816796885, + "learning_rate": 3.7538783678920076e-05, + "loss": 0.1547, + "step": 11900 + }, + { + "epoch": 1.4112415510494487, + "grad_norm": 1.137542204077945, + "learning_rate": 3.753670700179073e-05, + "loss": 0.2106, + "step": 11901 + }, + { + "epoch": 1.4113601328115735, + "grad_norm": 1.2410342187508858, + "learning_rate": 3.7534630209088315e-05, + "loss": 0.2904, + "step": 11902 + }, + { + "epoch": 1.4114787145736987, + "grad_norm": 1.2799913132961949, + "learning_rate": 3.753255330083199e-05, + "loss": 0.2529, + "step": 11903 + }, + { + "epoch": 1.4115972963358234, + "grad_norm": 0.8382685704476928, + "learning_rate": 3.7530476277040886e-05, + "loss": 0.1676, + "step": 11904 + }, + { + "epoch": 1.4117158780979486, + "grad_norm": 1.146862656076266, + "learning_rate": 3.752839913773415e-05, + "loss": 0.2436, + "step": 11905 + }, + { + "epoch": 1.4118344598600734, + "grad_norm": 1.1180565632915234, + "learning_rate": 3.7526321882930947e-05, + "loss": 0.2535, + "step": 11906 + }, + { + "epoch": 1.4119530416221986, + "grad_norm": 1.315060130465517, + "learning_rate": 3.752424451265041e-05, + "loss": 0.2845, + "step": 11907 + }, + { + "epoch": 1.4120716233843236, + "grad_norm": 0.7935997888963029, + "learning_rate": 3.7522167026911695e-05, + "loss": 0.1949, + "step": 11908 + }, + { + "epoch": 1.4121902051464486, + "grad_norm": 1.529640993362518, + "learning_rate": 3.752008942573396e-05, + "loss": 0.2884, + "step": 11909 + }, + { + "epoch": 1.4123087869085735, + "grad_norm": 0.896038850539518, + "learning_rate": 3.7518011709136355e-05, + "loss": 0.1756, + "step": 11910 + }, + { + "epoch": 1.4124273686706985, + "grad_norm": 0.9648162778003362, + "learning_rate": 3.751593387713803e-05, + "loss": 0.2435, + "step": 11911 + }, + { + "epoch": 1.4125459504328235, + "grad_norm": 1.1620885681106663, + "learning_rate": 3.7513855929758146e-05, + "loss": 0.2758, + "step": 11912 + }, + { + "epoch": 1.4126645321949485, + "grad_norm": 1.206341713591499, + "learning_rate": 3.751177786701585e-05, + "loss": 0.288, + "step": 11913 + }, + { + "epoch": 1.4127831139570735, + "grad_norm": 1.139185567593586, + "learning_rate": 3.750969968893032e-05, + "loss": 0.2026, + "step": 11914 + }, + { + "epoch": 1.4129016957191984, + "grad_norm": 0.9162682953899665, + "learning_rate": 3.7507621395520683e-05, + "loss": 0.2023, + "step": 11915 + }, + { + "epoch": 1.4130202774813234, + "grad_norm": 0.9828866805489438, + "learning_rate": 3.750554298680612e-05, + "loss": 0.2539, + "step": 11916 + }, + { + "epoch": 1.4131388592434484, + "grad_norm": 1.2347286086574232, + "learning_rate": 3.7503464462805784e-05, + "loss": 0.2748, + "step": 11917 + }, + { + "epoch": 1.4132574410055734, + "grad_norm": 1.1667994047750778, + "learning_rate": 3.7501385823538834e-05, + "loss": 0.2748, + "step": 11918 + }, + { + "epoch": 1.4133760227676984, + "grad_norm": 1.1016760848721174, + "learning_rate": 3.7499307069024445e-05, + "loss": 0.2101, + "step": 11919 + }, + { + "epoch": 1.4134946045298233, + "grad_norm": 0.8045703681941118, + "learning_rate": 3.7497228199281774e-05, + "loss": 0.2198, + "step": 11920 + }, + { + "epoch": 1.4136131862919483, + "grad_norm": 0.9329928422753266, + "learning_rate": 3.7495149214329985e-05, + "loss": 0.1958, + "step": 11921 + }, + { + "epoch": 1.4137317680540733, + "grad_norm": 1.158455948946292, + "learning_rate": 3.749307011418824e-05, + "loss": 0.2426, + "step": 11922 + }, + { + "epoch": 1.4138503498161983, + "grad_norm": 0.8894028820745146, + "learning_rate": 3.74909908988757e-05, + "loss": 0.2332, + "step": 11923 + }, + { + "epoch": 1.4139689315783233, + "grad_norm": 1.0416727964699128, + "learning_rate": 3.748891156841155e-05, + "loss": 0.2044, + "step": 11924 + }, + { + "epoch": 1.4140875133404482, + "grad_norm": 1.024033634639564, + "learning_rate": 3.7486832122814955e-05, + "loss": 0.1915, + "step": 11925 + }, + { + "epoch": 1.4142060951025732, + "grad_norm": 1.100056857054474, + "learning_rate": 3.748475256210507e-05, + "loss": 0.2121, + "step": 11926 + }, + { + "epoch": 1.4143246768646982, + "grad_norm": 1.1410340384432158, + "learning_rate": 3.748267288630107e-05, + "loss": 0.2648, + "step": 11927 + }, + { + "epoch": 1.4144432586268232, + "grad_norm": 1.1831707448580802, + "learning_rate": 3.748059309542215e-05, + "loss": 0.2625, + "step": 11928 + }, + { + "epoch": 1.4145618403889482, + "grad_norm": 1.2722999210801806, + "learning_rate": 3.747851318948746e-05, + "loss": 0.2148, + "step": 11929 + }, + { + "epoch": 1.4146804221510731, + "grad_norm": 1.3130181242351964, + "learning_rate": 3.747643316851618e-05, + "loss": 0.2251, + "step": 11930 + }, + { + "epoch": 1.4147990039131981, + "grad_norm": 1.5799594332966045, + "learning_rate": 3.747435303252749e-05, + "loss": 0.2117, + "step": 11931 + }, + { + "epoch": 1.414917585675323, + "grad_norm": 1.3294431029406917, + "learning_rate": 3.747227278154055e-05, + "loss": 0.2554, + "step": 11932 + }, + { + "epoch": 1.415036167437448, + "grad_norm": 0.9651666913277549, + "learning_rate": 3.747019241557457e-05, + "loss": 0.2457, + "step": 11933 + }, + { + "epoch": 1.415154749199573, + "grad_norm": 1.0835257400081713, + "learning_rate": 3.7468111934648685e-05, + "loss": 0.284, + "step": 11934 + }, + { + "epoch": 1.415273330961698, + "grad_norm": 1.45307188083344, + "learning_rate": 3.746603133878212e-05, + "loss": 0.2681, + "step": 11935 + }, + { + "epoch": 1.415391912723823, + "grad_norm": 1.1006760954370562, + "learning_rate": 3.746395062799402e-05, + "loss": 0.2074, + "step": 11936 + }, + { + "epoch": 1.415510494485948, + "grad_norm": 1.1766132283014097, + "learning_rate": 3.746186980230357e-05, + "loss": 0.2148, + "step": 11937 + }, + { + "epoch": 1.415629076248073, + "grad_norm": 1.4735271651734851, + "learning_rate": 3.745978886172997e-05, + "loss": 0.2929, + "step": 11938 + }, + { + "epoch": 1.415747658010198, + "grad_norm": 1.1032395719154604, + "learning_rate": 3.7457707806292395e-05, + "loss": 0.207, + "step": 11939 + }, + { + "epoch": 1.415866239772323, + "grad_norm": 1.7197056694613981, + "learning_rate": 3.745562663601004e-05, + "loss": 0.3101, + "step": 11940 + }, + { + "epoch": 1.415984821534448, + "grad_norm": 0.78304786245339, + "learning_rate": 3.745354535090207e-05, + "loss": 0.1747, + "step": 11941 + }, + { + "epoch": 1.416103403296573, + "grad_norm": 1.3011632218647013, + "learning_rate": 3.74514639509877e-05, + "loss": 0.27, + "step": 11942 + }, + { + "epoch": 1.416221985058698, + "grad_norm": 1.132827101835751, + "learning_rate": 3.744938243628608e-05, + "loss": 0.209, + "step": 11943 + }, + { + "epoch": 1.4163405668208229, + "grad_norm": 1.3512940137015497, + "learning_rate": 3.744730080681644e-05, + "loss": 0.2804, + "step": 11944 + }, + { + "epoch": 1.4164591485829479, + "grad_norm": 1.0071277237834186, + "learning_rate": 3.744521906259794e-05, + "loss": 0.2146, + "step": 11945 + }, + { + "epoch": 1.416577730345073, + "grad_norm": 1.0701795011719524, + "learning_rate": 3.744313720364979e-05, + "loss": 0.2152, + "step": 11946 + }, + { + "epoch": 1.4166963121071978, + "grad_norm": 0.920506465424451, + "learning_rate": 3.744105522999116e-05, + "loss": 0.1941, + "step": 11947 + }, + { + "epoch": 1.416814893869323, + "grad_norm": 1.1259767990316916, + "learning_rate": 3.743897314164127e-05, + "loss": 0.1589, + "step": 11948 + }, + { + "epoch": 1.4169334756314478, + "grad_norm": 1.3050659813246257, + "learning_rate": 3.7436890938619286e-05, + "loss": 0.2772, + "step": 11949 + }, + { + "epoch": 1.417052057393573, + "grad_norm": 0.9982786210944002, + "learning_rate": 3.743480862094443e-05, + "loss": 0.2131, + "step": 11950 + }, + { + "epoch": 1.4171706391556977, + "grad_norm": 1.0368172154358368, + "learning_rate": 3.743272618863588e-05, + "loss": 0.1956, + "step": 11951 + }, + { + "epoch": 1.417289220917823, + "grad_norm": 1.1878495825113666, + "learning_rate": 3.743064364171285e-05, + "loss": 0.2009, + "step": 11952 + }, + { + "epoch": 1.4174078026799477, + "grad_norm": 0.9056630896747744, + "learning_rate": 3.742856098019452e-05, + "loss": 0.1921, + "step": 11953 + }, + { + "epoch": 1.417526384442073, + "grad_norm": 1.023166475119506, + "learning_rate": 3.74264782041001e-05, + "loss": 0.203, + "step": 11954 + }, + { + "epoch": 1.4176449662041977, + "grad_norm": 1.2991100820457953, + "learning_rate": 3.7424395313448804e-05, + "loss": 0.3219, + "step": 11955 + }, + { + "epoch": 1.4177635479663229, + "grad_norm": 1.333929919864602, + "learning_rate": 3.74223123082598e-05, + "loss": 0.2886, + "step": 11956 + }, + { + "epoch": 1.4178821297284476, + "grad_norm": 0.7151882655890506, + "learning_rate": 3.7420229188552316e-05, + "loss": 0.13, + "step": 11957 + }, + { + "epoch": 1.4180007114905728, + "grad_norm": 1.114094213487098, + "learning_rate": 3.7418145954345554e-05, + "loss": 0.2773, + "step": 11958 + }, + { + "epoch": 1.4181192932526978, + "grad_norm": 1.032364871530729, + "learning_rate": 3.741606260565871e-05, + "loss": 0.2148, + "step": 11959 + }, + { + "epoch": 1.4182378750148228, + "grad_norm": 1.1149114960402489, + "learning_rate": 3.741397914251099e-05, + "loss": 0.2025, + "step": 11960 + }, + { + "epoch": 1.4183564567769478, + "grad_norm": 1.0570946723796142, + "learning_rate": 3.741189556492162e-05, + "loss": 0.2223, + "step": 11961 + }, + { + "epoch": 1.4184750385390728, + "grad_norm": 0.961860692020639, + "learning_rate": 3.7409811872909776e-05, + "loss": 0.164, + "step": 11962 + }, + { + "epoch": 1.4185936203011977, + "grad_norm": 1.5840539159324694, + "learning_rate": 3.7407728066494696e-05, + "loss": 0.3376, + "step": 11963 + }, + { + "epoch": 1.4187122020633227, + "grad_norm": 1.0104171318173516, + "learning_rate": 3.7405644145695576e-05, + "loss": 0.2121, + "step": 11964 + }, + { + "epoch": 1.4188307838254477, + "grad_norm": 0.9911416726771245, + "learning_rate": 3.740356011053163e-05, + "loss": 0.2044, + "step": 11965 + }, + { + "epoch": 1.4189493655875727, + "grad_norm": 1.1102571499849925, + "learning_rate": 3.740147596102207e-05, + "loss": 0.2204, + "step": 11966 + }, + { + "epoch": 1.4190679473496977, + "grad_norm": 1.2333017217894224, + "learning_rate": 3.7399391697186105e-05, + "loss": 0.2295, + "step": 11967 + }, + { + "epoch": 1.4191865291118226, + "grad_norm": 1.052269815743381, + "learning_rate": 3.739730731904295e-05, + "loss": 0.2141, + "step": 11968 + }, + { + "epoch": 1.4193051108739476, + "grad_norm": 0.8673024693649638, + "learning_rate": 3.7395222826611834e-05, + "loss": 0.2055, + "step": 11969 + }, + { + "epoch": 1.4194236926360726, + "grad_norm": 1.389745331681283, + "learning_rate": 3.739313821991196e-05, + "loss": 0.3222, + "step": 11970 + }, + { + "epoch": 1.4195422743981976, + "grad_norm": 0.9039733242011868, + "learning_rate": 3.739105349896255e-05, + "loss": 0.1988, + "step": 11971 + }, + { + "epoch": 1.4196608561603226, + "grad_norm": 1.4090900881266861, + "learning_rate": 3.738896866378283e-05, + "loss": 0.3531, + "step": 11972 + }, + { + "epoch": 1.4197794379224475, + "grad_norm": 1.0602170145630208, + "learning_rate": 3.7386883714392e-05, + "loss": 0.2274, + "step": 11973 + }, + { + "epoch": 1.4198980196845725, + "grad_norm": 1.011896978753643, + "learning_rate": 3.738479865080929e-05, + "loss": 0.2633, + "step": 11974 + }, + { + "epoch": 1.4200166014466975, + "grad_norm": 0.8951701895706808, + "learning_rate": 3.7382713473053934e-05, + "loss": 0.1722, + "step": 11975 + }, + { + "epoch": 1.4201351832088225, + "grad_norm": 1.6032075863526767, + "learning_rate": 3.7380628181145134e-05, + "loss": 0.3008, + "step": 11976 + }, + { + "epoch": 1.4202537649709475, + "grad_norm": 1.322252080511777, + "learning_rate": 3.7378542775102126e-05, + "loss": 0.2216, + "step": 11977 + }, + { + "epoch": 1.4203723467330724, + "grad_norm": 1.1034763458698946, + "learning_rate": 3.737645725494414e-05, + "loss": 0.2545, + "step": 11978 + }, + { + "epoch": 1.4204909284951974, + "grad_norm": 1.2740354777225764, + "learning_rate": 3.737437162069039e-05, + "loss": 0.2532, + "step": 11979 + }, + { + "epoch": 1.4206095102573224, + "grad_norm": 0.9794190951265678, + "learning_rate": 3.7372285872360114e-05, + "loss": 0.1705, + "step": 11980 + }, + { + "epoch": 1.4207280920194474, + "grad_norm": 0.8449441033256117, + "learning_rate": 3.737020000997252e-05, + "loss": 0.1796, + "step": 11981 + }, + { + "epoch": 1.4208466737815724, + "grad_norm": 1.2959549418745546, + "learning_rate": 3.7368114033546866e-05, + "loss": 0.276, + "step": 11982 + }, + { + "epoch": 1.4209652555436973, + "grad_norm": 0.9812083671125547, + "learning_rate": 3.736602794310237e-05, + "loss": 0.1977, + "step": 11983 + }, + { + "epoch": 1.4210838373058223, + "grad_norm": 1.1658842635744877, + "learning_rate": 3.736394173865825e-05, + "loss": 0.178, + "step": 11984 + }, + { + "epoch": 1.4212024190679473, + "grad_norm": 0.8530627784623562, + "learning_rate": 3.736185542023375e-05, + "loss": 0.1682, + "step": 11985 + }, + { + "epoch": 1.4213210008300723, + "grad_norm": 1.1984105092870918, + "learning_rate": 3.7359768987848096e-05, + "loss": 0.2786, + "step": 11986 + }, + { + "epoch": 1.4214395825921973, + "grad_norm": 0.9211369612815075, + "learning_rate": 3.735768244152054e-05, + "loss": 0.1786, + "step": 11987 + }, + { + "epoch": 1.4215581643543223, + "grad_norm": 1.2911479641659251, + "learning_rate": 3.7355595781270304e-05, + "loss": 0.3608, + "step": 11988 + }, + { + "epoch": 1.4216767461164472, + "grad_norm": 0.8831923492960647, + "learning_rate": 3.735350900711663e-05, + "loss": 0.1867, + "step": 11989 + }, + { + "epoch": 1.4217953278785722, + "grad_norm": 1.2003106211001768, + "learning_rate": 3.735142211907874e-05, + "loss": 0.2373, + "step": 11990 + }, + { + "epoch": 1.4219139096406972, + "grad_norm": 1.0991150502179141, + "learning_rate": 3.7349335117175896e-05, + "loss": 0.2295, + "step": 11991 + }, + { + "epoch": 1.4220324914028222, + "grad_norm": 1.0859898570880155, + "learning_rate": 3.734724800142732e-05, + "loss": 0.2548, + "step": 11992 + }, + { + "epoch": 1.4221510731649472, + "grad_norm": 0.8668081328163169, + "learning_rate": 3.7345160771852254e-05, + "loss": 0.1905, + "step": 11993 + }, + { + "epoch": 1.4222696549270721, + "grad_norm": 0.838933825485151, + "learning_rate": 3.7343073428469954e-05, + "loss": 0.1867, + "step": 11994 + }, + { + "epoch": 1.4223882366891973, + "grad_norm": 1.3825626654733925, + "learning_rate": 3.7340985971299646e-05, + "loss": 0.276, + "step": 11995 + }, + { + "epoch": 1.422506818451322, + "grad_norm": 1.5663639011108836, + "learning_rate": 3.733889840036058e-05, + "loss": 0.355, + "step": 11996 + }, + { + "epoch": 1.4226254002134473, + "grad_norm": 1.3382757298278536, + "learning_rate": 3.7336810715672014e-05, + "loss": 0.2758, + "step": 11997 + }, + { + "epoch": 1.422743981975572, + "grad_norm": 1.1336223022982024, + "learning_rate": 3.7334722917253165e-05, + "loss": 0.2366, + "step": 11998 + }, + { + "epoch": 1.4228625637376973, + "grad_norm": 0.9367522206086788, + "learning_rate": 3.73326350051233e-05, + "loss": 0.176, + "step": 11999 + }, + { + "epoch": 1.422981145499822, + "grad_norm": 0.8218561579344483, + "learning_rate": 3.733054697930167e-05, + "loss": 0.1926, + "step": 12000 + }, + { + "epoch": 1.4230997272619472, + "grad_norm": 0.951255074023938, + "learning_rate": 3.732845883980752e-05, + "loss": 0.174, + "step": 12001 + }, + { + "epoch": 1.423218309024072, + "grad_norm": 1.1343281217573846, + "learning_rate": 3.7326370586660095e-05, + "loss": 0.2671, + "step": 12002 + }, + { + "epoch": 1.4233368907861972, + "grad_norm": 0.8652340403318025, + "learning_rate": 3.732428221987864e-05, + "loss": 0.2146, + "step": 12003 + }, + { + "epoch": 1.423455472548322, + "grad_norm": 1.0475610249036065, + "learning_rate": 3.732219373948243e-05, + "loss": 0.2222, + "step": 12004 + }, + { + "epoch": 1.4235740543104471, + "grad_norm": 1.002841497746369, + "learning_rate": 3.73201051454907e-05, + "loss": 0.2217, + "step": 12005 + }, + { + "epoch": 1.423692636072572, + "grad_norm": 1.2139889917722577, + "learning_rate": 3.7318016437922696e-05, + "loss": 0.2633, + "step": 12006 + }, + { + "epoch": 1.423811217834697, + "grad_norm": 1.0849538686013969, + "learning_rate": 3.7315927616797697e-05, + "loss": 0.2029, + "step": 12007 + }, + { + "epoch": 1.423929799596822, + "grad_norm": 1.1545427513661384, + "learning_rate": 3.731383868213495e-05, + "loss": 0.2125, + "step": 12008 + }, + { + "epoch": 1.424048381358947, + "grad_norm": 1.3358855964240732, + "learning_rate": 3.73117496339537e-05, + "loss": 0.2548, + "step": 12009 + }, + { + "epoch": 1.424166963121072, + "grad_norm": 1.0149872519199645, + "learning_rate": 3.7309660472273225e-05, + "loss": 0.1796, + "step": 12010 + }, + { + "epoch": 1.424285544883197, + "grad_norm": 0.8666149069817426, + "learning_rate": 3.7307571197112775e-05, + "loss": 0.1978, + "step": 12011 + }, + { + "epoch": 1.424404126645322, + "grad_norm": 1.4454027333665467, + "learning_rate": 3.730548180849161e-05, + "loss": 0.2692, + "step": 12012 + }, + { + "epoch": 1.424522708407447, + "grad_norm": 1.3573374758529448, + "learning_rate": 3.7303392306429e-05, + "loss": 0.2425, + "step": 12013 + }, + { + "epoch": 1.424641290169572, + "grad_norm": 0.9218075791379029, + "learning_rate": 3.730130269094418e-05, + "loss": 0.2105, + "step": 12014 + }, + { + "epoch": 1.424759871931697, + "grad_norm": 1.4538753403783078, + "learning_rate": 3.7299212962056454e-05, + "loss": 0.377, + "step": 12015 + }, + { + "epoch": 1.424878453693822, + "grad_norm": 1.227830273325606, + "learning_rate": 3.729712311978506e-05, + "loss": 0.2382, + "step": 12016 + }, + { + "epoch": 1.424997035455947, + "grad_norm": 1.499721348874308, + "learning_rate": 3.729503316414927e-05, + "loss": 0.3678, + "step": 12017 + }, + { + "epoch": 1.425115617218072, + "grad_norm": 0.9635882059339691, + "learning_rate": 3.729294309516835e-05, + "loss": 0.21, + "step": 12018 + }, + { + "epoch": 1.4252341989801969, + "grad_norm": 1.1989818204752054, + "learning_rate": 3.729085291286157e-05, + "loss": 0.2697, + "step": 12019 + }, + { + "epoch": 1.4253527807423219, + "grad_norm": 1.1411600711676084, + "learning_rate": 3.728876261724819e-05, + "loss": 0.2629, + "step": 12020 + }, + { + "epoch": 1.4254713625044468, + "grad_norm": 1.2925891537665914, + "learning_rate": 3.728667220834751e-05, + "loss": 0.2115, + "step": 12021 + }, + { + "epoch": 1.4255899442665718, + "grad_norm": 0.973266107412103, + "learning_rate": 3.728458168617877e-05, + "loss": 0.1993, + "step": 12022 + }, + { + "epoch": 1.4257085260286968, + "grad_norm": 1.5400997653157809, + "learning_rate": 3.728249105076125e-05, + "loss": 0.3209, + "step": 12023 + }, + { + "epoch": 1.4258271077908218, + "grad_norm": 1.389016798979837, + "learning_rate": 3.728040030211421e-05, + "loss": 0.3172, + "step": 12024 + }, + { + "epoch": 1.4259456895529468, + "grad_norm": 0.9016096773864852, + "learning_rate": 3.727830944025695e-05, + "loss": 0.1778, + "step": 12025 + }, + { + "epoch": 1.4260642713150717, + "grad_norm": 1.1155217521462248, + "learning_rate": 3.727621846520874e-05, + "loss": 0.2371, + "step": 12026 + }, + { + "epoch": 1.4261828530771967, + "grad_norm": 1.124666981951748, + "learning_rate": 3.727412737698884e-05, + "loss": 0.2122, + "step": 12027 + }, + { + "epoch": 1.4263014348393217, + "grad_norm": 1.094608374927019, + "learning_rate": 3.7272036175616544e-05, + "loss": 0.2329, + "step": 12028 + }, + { + "epoch": 1.4264200166014467, + "grad_norm": 1.1911518045668308, + "learning_rate": 3.726994486111112e-05, + "loss": 0.2304, + "step": 12029 + }, + { + "epoch": 1.4265385983635717, + "grad_norm": 0.915841192879363, + "learning_rate": 3.726785343349185e-05, + "loss": 0.1748, + "step": 12030 + }, + { + "epoch": 1.4266571801256966, + "grad_norm": 0.9268160013558113, + "learning_rate": 3.726576189277802e-05, + "loss": 0.1761, + "step": 12031 + }, + { + "epoch": 1.4267757618878216, + "grad_norm": 0.871737394083587, + "learning_rate": 3.72636702389889e-05, + "loss": 0.1965, + "step": 12032 + }, + { + "epoch": 1.4268943436499466, + "grad_norm": 1.0077043465372557, + "learning_rate": 3.7261578472143784e-05, + "loss": 0.2044, + "step": 12033 + }, + { + "epoch": 1.4270129254120716, + "grad_norm": 0.893832525571922, + "learning_rate": 3.725948659226195e-05, + "loss": 0.2089, + "step": 12034 + }, + { + "epoch": 1.4271315071741966, + "grad_norm": 1.1716682187387586, + "learning_rate": 3.725739459936268e-05, + "loss": 0.2721, + "step": 12035 + }, + { + "epoch": 1.4272500889363215, + "grad_norm": 1.1607246727973193, + "learning_rate": 3.725530249346526e-05, + "loss": 0.196, + "step": 12036 + }, + { + "epoch": 1.4273686706984465, + "grad_norm": 1.0237261638849915, + "learning_rate": 3.7253210274588984e-05, + "loss": 0.1978, + "step": 12037 + }, + { + "epoch": 1.4274872524605715, + "grad_norm": 1.0387635891096125, + "learning_rate": 3.725111794275313e-05, + "loss": 0.1784, + "step": 12038 + }, + { + "epoch": 1.4276058342226965, + "grad_norm": 1.0431852646773652, + "learning_rate": 3.7249025497977e-05, + "loss": 0.2518, + "step": 12039 + }, + { + "epoch": 1.4277244159848215, + "grad_norm": 0.850245247116313, + "learning_rate": 3.724693294027987e-05, + "loss": 0.1832, + "step": 12040 + }, + { + "epoch": 1.4278429977469465, + "grad_norm": 0.8146333053938954, + "learning_rate": 3.724484026968105e-05, + "loss": 0.1778, + "step": 12041 + }, + { + "epoch": 1.4279615795090714, + "grad_norm": 0.8717952922833572, + "learning_rate": 3.72427474861998e-05, + "loss": 0.1824, + "step": 12042 + }, + { + "epoch": 1.4280801612711964, + "grad_norm": 0.8932636885357316, + "learning_rate": 3.724065458985545e-05, + "loss": 0.2092, + "step": 12043 + }, + { + "epoch": 1.4281987430333214, + "grad_norm": 0.8245061288880984, + "learning_rate": 3.7238561580667255e-05, + "loss": 0.1652, + "step": 12044 + }, + { + "epoch": 1.4283173247954464, + "grad_norm": 0.9249864273581121, + "learning_rate": 3.7236468458654545e-05, + "loss": 0.1654, + "step": 12045 + }, + { + "epoch": 1.4284359065575716, + "grad_norm": 1.1490678478180048, + "learning_rate": 3.723437522383659e-05, + "loss": 0.226, + "step": 12046 + }, + { + "epoch": 1.4285544883196963, + "grad_norm": 0.7477944646145495, + "learning_rate": 3.72322818762327e-05, + "loss": 0.138, + "step": 12047 + }, + { + "epoch": 1.4286730700818215, + "grad_norm": 1.1906030203156808, + "learning_rate": 3.723018841586218e-05, + "loss": 0.2276, + "step": 12048 + }, + { + "epoch": 1.4287916518439463, + "grad_norm": 1.4780923816958342, + "learning_rate": 3.722809484274432e-05, + "loss": 0.2644, + "step": 12049 + }, + { + "epoch": 1.4289102336060715, + "grad_norm": 0.9700998358896614, + "learning_rate": 3.72260011568984e-05, + "loss": 0.229, + "step": 12050 + }, + { + "epoch": 1.4290288153681963, + "grad_norm": 1.137778644924611, + "learning_rate": 3.722390735834377e-05, + "loss": 0.2453, + "step": 12051 + }, + { + "epoch": 1.4291473971303215, + "grad_norm": 1.179072689979875, + "learning_rate": 3.722181344709969e-05, + "loss": 0.2413, + "step": 12052 + }, + { + "epoch": 1.4292659788924462, + "grad_norm": 1.1666626996176908, + "learning_rate": 3.721971942318547e-05, + "loss": 0.2275, + "step": 12053 + }, + { + "epoch": 1.4293845606545714, + "grad_norm": 1.5645182550933716, + "learning_rate": 3.721762528662044e-05, + "loss": 0.2766, + "step": 12054 + }, + { + "epoch": 1.4295031424166962, + "grad_norm": 0.8312756044786797, + "learning_rate": 3.721553103742388e-05, + "loss": 0.1641, + "step": 12055 + }, + { + "epoch": 1.4296217241788214, + "grad_norm": 1.0707763044730485, + "learning_rate": 3.72134366756151e-05, + "loss": 0.1908, + "step": 12056 + }, + { + "epoch": 1.4297403059409464, + "grad_norm": 1.6273476537518772, + "learning_rate": 3.7211342201213404e-05, + "loss": 0.3077, + "step": 12057 + }, + { + "epoch": 1.4298588877030713, + "grad_norm": 0.9757904487851846, + "learning_rate": 3.720924761423812e-05, + "loss": 0.2213, + "step": 12058 + }, + { + "epoch": 1.4299774694651963, + "grad_norm": 1.1808199419205503, + "learning_rate": 3.720715291470854e-05, + "loss": 0.284, + "step": 12059 + }, + { + "epoch": 1.4300960512273213, + "grad_norm": 0.9896720965606722, + "learning_rate": 3.720505810264399e-05, + "loss": 0.2181, + "step": 12060 + }, + { + "epoch": 1.4302146329894463, + "grad_norm": 0.902656262905738, + "learning_rate": 3.7202963178063756e-05, + "loss": 0.196, + "step": 12061 + }, + { + "epoch": 1.4303332147515713, + "grad_norm": 1.5313211884433833, + "learning_rate": 3.7200868140987174e-05, + "loss": 0.3908, + "step": 12062 + }, + { + "epoch": 1.4304517965136962, + "grad_norm": 0.8832586795729839, + "learning_rate": 3.719877299143354e-05, + "loss": 0.1637, + "step": 12063 + }, + { + "epoch": 1.4305703782758212, + "grad_norm": 0.8820369992393825, + "learning_rate": 3.719667772942219e-05, + "loss": 0.1685, + "step": 12064 + }, + { + "epoch": 1.4306889600379462, + "grad_norm": 1.226769331590938, + "learning_rate": 3.719458235497242e-05, + "loss": 0.2894, + "step": 12065 + }, + { + "epoch": 1.4308075418000712, + "grad_norm": 0.941602467711519, + "learning_rate": 3.719248686810356e-05, + "loss": 0.2124, + "step": 12066 + }, + { + "epoch": 1.4309261235621962, + "grad_norm": 1.0679796915360902, + "learning_rate": 3.719039126883492e-05, + "loss": 0.2658, + "step": 12067 + }, + { + "epoch": 1.4310447053243212, + "grad_norm": 0.9874963930411335, + "learning_rate": 3.718829555718581e-05, + "loss": 0.2346, + "step": 12068 + }, + { + "epoch": 1.4311632870864461, + "grad_norm": 1.2077980024789088, + "learning_rate": 3.718619973317558e-05, + "loss": 0.278, + "step": 12069 + }, + { + "epoch": 1.4312818688485711, + "grad_norm": 1.1387238459889275, + "learning_rate": 3.718410379682352e-05, + "loss": 0.2823, + "step": 12070 + }, + { + "epoch": 1.431400450610696, + "grad_norm": 1.1477317644925151, + "learning_rate": 3.7182007748148976e-05, + "loss": 0.2664, + "step": 12071 + }, + { + "epoch": 1.431519032372821, + "grad_norm": 0.9758634059137739, + "learning_rate": 3.717991158717125e-05, + "loss": 0.1765, + "step": 12072 + }, + { + "epoch": 1.431637614134946, + "grad_norm": 1.1430747434393123, + "learning_rate": 3.717781531390968e-05, + "loss": 0.1979, + "step": 12073 + }, + { + "epoch": 1.431756195897071, + "grad_norm": 0.8340528075254353, + "learning_rate": 3.7175718928383584e-05, + "loss": 0.1669, + "step": 12074 + }, + { + "epoch": 1.431874777659196, + "grad_norm": 1.4950489133715994, + "learning_rate": 3.7173622430612285e-05, + "loss": 0.2624, + "step": 12075 + }, + { + "epoch": 1.431993359421321, + "grad_norm": 0.9523535657562556, + "learning_rate": 3.7171525820615124e-05, + "loss": 0.2081, + "step": 12076 + }, + { + "epoch": 1.432111941183446, + "grad_norm": 1.273336048344334, + "learning_rate": 3.7169429098411415e-05, + "loss": 0.2601, + "step": 12077 + }, + { + "epoch": 1.432230522945571, + "grad_norm": 1.3605923416599812, + "learning_rate": 3.7167332264020495e-05, + "loss": 0.2948, + "step": 12078 + }, + { + "epoch": 1.432349104707696, + "grad_norm": 1.3030253022437912, + "learning_rate": 3.716523531746169e-05, + "loss": 0.276, + "step": 12079 + }, + { + "epoch": 1.432467686469821, + "grad_norm": 1.255223187864126, + "learning_rate": 3.7163138258754336e-05, + "loss": 0.2632, + "step": 12080 + }, + { + "epoch": 1.432586268231946, + "grad_norm": 0.8071621326685432, + "learning_rate": 3.7161041087917755e-05, + "loss": 0.171, + "step": 12081 + }, + { + "epoch": 1.4327048499940709, + "grad_norm": 0.912067898598135, + "learning_rate": 3.71589438049713e-05, + "loss": 0.1963, + "step": 12082 + }, + { + "epoch": 1.4328234317561959, + "grad_norm": 0.8013158887921227, + "learning_rate": 3.715684640993429e-05, + "loss": 0.2068, + "step": 12083 + }, + { + "epoch": 1.4329420135183208, + "grad_norm": 1.2304630901654887, + "learning_rate": 3.715474890282606e-05, + "loss": 0.267, + "step": 12084 + }, + { + "epoch": 1.4330605952804458, + "grad_norm": 1.7256209193966836, + "learning_rate": 3.7152651283665954e-05, + "loss": 0.2943, + "step": 12085 + }, + { + "epoch": 1.4331791770425708, + "grad_norm": 0.9879517642608836, + "learning_rate": 3.71505535524733e-05, + "loss": 0.2043, + "step": 12086 + }, + { + "epoch": 1.4332977588046958, + "grad_norm": 0.967355083607919, + "learning_rate": 3.714845570926745e-05, + "loss": 0.175, + "step": 12087 + }, + { + "epoch": 1.4334163405668208, + "grad_norm": 0.9073698796585046, + "learning_rate": 3.714635775406773e-05, + "loss": 0.2001, + "step": 12088 + }, + { + "epoch": 1.4335349223289457, + "grad_norm": 1.2170088208869456, + "learning_rate": 3.714425968689349e-05, + "loss": 0.2062, + "step": 12089 + }, + { + "epoch": 1.4336535040910707, + "grad_norm": 0.8362130324932386, + "learning_rate": 3.714216150776407e-05, + "loss": 0.2138, + "step": 12090 + }, + { + "epoch": 1.4337720858531957, + "grad_norm": 0.9977982798124712, + "learning_rate": 3.71400632166988e-05, + "loss": 0.1902, + "step": 12091 + }, + { + "epoch": 1.4338906676153207, + "grad_norm": 0.9574272309094275, + "learning_rate": 3.7137964813717045e-05, + "loss": 0.1756, + "step": 12092 + }, + { + "epoch": 1.4340092493774457, + "grad_norm": 1.076599308778476, + "learning_rate": 3.7135866298838135e-05, + "loss": 0.2802, + "step": 12093 + }, + { + "epoch": 1.4341278311395707, + "grad_norm": 0.8566292566126845, + "learning_rate": 3.713376767208142e-05, + "loss": 0.182, + "step": 12094 + }, + { + "epoch": 1.4342464129016959, + "grad_norm": 0.8252179342639918, + "learning_rate": 3.7131668933466244e-05, + "loss": 0.161, + "step": 12095 + }, + { + "epoch": 1.4343649946638206, + "grad_norm": 0.9479811908598932, + "learning_rate": 3.712957008301196e-05, + "loss": 0.1987, + "step": 12096 + }, + { + "epoch": 1.4344835764259458, + "grad_norm": 1.4195736003464925, + "learning_rate": 3.712747112073791e-05, + "loss": 0.3094, + "step": 12097 + }, + { + "epoch": 1.4346021581880706, + "grad_norm": 1.0737092730814193, + "learning_rate": 3.712537204666345e-05, + "loss": 0.2181, + "step": 12098 + }, + { + "epoch": 1.4347207399501958, + "grad_norm": 0.9343344956956153, + "learning_rate": 3.712327286080793e-05, + "loss": 0.2673, + "step": 12099 + }, + { + "epoch": 1.4348393217123205, + "grad_norm": 1.0676425421543592, + "learning_rate": 3.71211735631907e-05, + "loss": 0.2236, + "step": 12100 + }, + { + "epoch": 1.4349579034744457, + "grad_norm": 1.0374595375475222, + "learning_rate": 3.711907415383112e-05, + "loss": 0.2179, + "step": 12101 + }, + { + "epoch": 1.4350764852365705, + "grad_norm": 1.440455703086093, + "learning_rate": 3.711697463274853e-05, + "loss": 0.2994, + "step": 12102 + }, + { + "epoch": 1.4351950669986957, + "grad_norm": 0.9486640997205259, + "learning_rate": 3.71148749999623e-05, + "loss": 0.1957, + "step": 12103 + }, + { + "epoch": 1.4353136487608205, + "grad_norm": 1.0335167654517876, + "learning_rate": 3.7112775255491774e-05, + "loss": 0.2364, + "step": 12104 + }, + { + "epoch": 1.4354322305229457, + "grad_norm": 1.1417684487106745, + "learning_rate": 3.711067539935632e-05, + "loss": 0.2824, + "step": 12105 + }, + { + "epoch": 1.4355508122850704, + "grad_norm": 1.1466950780223135, + "learning_rate": 3.710857543157528e-05, + "loss": 0.2931, + "step": 12106 + }, + { + "epoch": 1.4356693940471956, + "grad_norm": 1.099878433542885, + "learning_rate": 3.710647535216802e-05, + "loss": 0.229, + "step": 12107 + }, + { + "epoch": 1.4357879758093206, + "grad_norm": 1.0674591836765452, + "learning_rate": 3.7104375161153915e-05, + "loss": 0.249, + "step": 12108 + }, + { + "epoch": 1.4359065575714456, + "grad_norm": 1.1641382150649549, + "learning_rate": 3.710227485855231e-05, + "loss": 0.2275, + "step": 12109 + }, + { + "epoch": 1.4360251393335706, + "grad_norm": 1.1939079148452676, + "learning_rate": 3.710017444438257e-05, + "loss": 0.2247, + "step": 12110 + }, + { + "epoch": 1.4361437210956955, + "grad_norm": 0.9780877309967877, + "learning_rate": 3.709807391866406e-05, + "loss": 0.2012, + "step": 12111 + }, + { + "epoch": 1.4362623028578205, + "grad_norm": 1.1015832834922257, + "learning_rate": 3.7095973281416144e-05, + "loss": 0.2087, + "step": 12112 + }, + { + "epoch": 1.4363808846199455, + "grad_norm": 0.9589659253628081, + "learning_rate": 3.709387253265819e-05, + "loss": 0.2327, + "step": 12113 + }, + { + "epoch": 1.4364994663820705, + "grad_norm": 1.2413148164399888, + "learning_rate": 3.709177167240957e-05, + "loss": 0.2332, + "step": 12114 + }, + { + "epoch": 1.4366180481441955, + "grad_norm": 1.243872161695892, + "learning_rate": 3.7089670700689625e-05, + "loss": 0.2474, + "step": 12115 + }, + { + "epoch": 1.4367366299063205, + "grad_norm": 0.8438765659915752, + "learning_rate": 3.7087569617517744e-05, + "loss": 0.1705, + "step": 12116 + }, + { + "epoch": 1.4368552116684454, + "grad_norm": 2.5432911230684123, + "learning_rate": 3.7085468422913307e-05, + "loss": 0.2623, + "step": 12117 + }, + { + "epoch": 1.4369737934305704, + "grad_norm": 0.9779322037913933, + "learning_rate": 3.7083367116895664e-05, + "loss": 0.187, + "step": 12118 + }, + { + "epoch": 1.4370923751926954, + "grad_norm": 0.9398953143023279, + "learning_rate": 3.708126569948419e-05, + "loss": 0.1888, + "step": 12119 + }, + { + "epoch": 1.4372109569548204, + "grad_norm": 1.1097102363172018, + "learning_rate": 3.7079164170698264e-05, + "loss": 0.2008, + "step": 12120 + }, + { + "epoch": 1.4373295387169454, + "grad_norm": 1.2990870377951567, + "learning_rate": 3.707706253055726e-05, + "loss": 0.2972, + "step": 12121 + }, + { + "epoch": 1.4374481204790703, + "grad_norm": 0.9153478521513964, + "learning_rate": 3.7074960779080546e-05, + "loss": 0.2001, + "step": 12122 + }, + { + "epoch": 1.4375667022411953, + "grad_norm": 1.1701098423406637, + "learning_rate": 3.707285891628751e-05, + "loss": 0.2556, + "step": 12123 + }, + { + "epoch": 1.4376852840033203, + "grad_norm": 1.104624130232348, + "learning_rate": 3.70707569421975e-05, + "loss": 0.2044, + "step": 12124 + }, + { + "epoch": 1.4378038657654453, + "grad_norm": 0.9976465637280325, + "learning_rate": 3.7068654856829934e-05, + "loss": 0.2025, + "step": 12125 + }, + { + "epoch": 1.4379224475275703, + "grad_norm": 1.9406724457452853, + "learning_rate": 3.7066552660204154e-05, + "loss": 0.4404, + "step": 12126 + }, + { + "epoch": 1.4380410292896952, + "grad_norm": 1.2948387207019898, + "learning_rate": 3.7064450352339564e-05, + "loss": 0.2187, + "step": 12127 + }, + { + "epoch": 1.4381596110518202, + "grad_norm": 1.344198570613, + "learning_rate": 3.706234793325553e-05, + "loss": 0.2131, + "step": 12128 + }, + { + "epoch": 1.4382781928139452, + "grad_norm": 0.9461295052049635, + "learning_rate": 3.706024540297145e-05, + "loss": 0.2024, + "step": 12129 + }, + { + "epoch": 1.4383967745760702, + "grad_norm": 1.4303000697114605, + "learning_rate": 3.705814276150669e-05, + "loss": 0.2768, + "step": 12130 + }, + { + "epoch": 1.4385153563381952, + "grad_norm": 0.9954608709565025, + "learning_rate": 3.705604000888064e-05, + "loss": 0.2413, + "step": 12131 + }, + { + "epoch": 1.4386339381003201, + "grad_norm": 1.4898175901826856, + "learning_rate": 3.7053937145112684e-05, + "loss": 0.3167, + "step": 12132 + }, + { + "epoch": 1.4387525198624451, + "grad_norm": 0.8421325585217315, + "learning_rate": 3.705183417022222e-05, + "loss": 0.1846, + "step": 12133 + }, + { + "epoch": 1.43887110162457, + "grad_norm": 1.6607221510599175, + "learning_rate": 3.7049731084228614e-05, + "loss": 0.4252, + "step": 12134 + }, + { + "epoch": 1.438989683386695, + "grad_norm": 1.5244514240362093, + "learning_rate": 3.704762788715126e-05, + "loss": 0.384, + "step": 12135 + }, + { + "epoch": 1.43910826514882, + "grad_norm": 1.2518813594184361, + "learning_rate": 3.704552457900955e-05, + "loss": 0.2263, + "step": 12136 + }, + { + "epoch": 1.439226846910945, + "grad_norm": 0.9071037063793584, + "learning_rate": 3.704342115982288e-05, + "loss": 0.2178, + "step": 12137 + }, + { + "epoch": 1.43934542867307, + "grad_norm": 1.0339193705580532, + "learning_rate": 3.704131762961064e-05, + "loss": 0.2015, + "step": 12138 + }, + { + "epoch": 1.439464010435195, + "grad_norm": 1.1621741118449682, + "learning_rate": 3.703921398839221e-05, + "loss": 0.2267, + "step": 12139 + }, + { + "epoch": 1.43958259219732, + "grad_norm": 0.8175945400536053, + "learning_rate": 3.7037110236187e-05, + "loss": 0.1566, + "step": 12140 + }, + { + "epoch": 1.439701173959445, + "grad_norm": 0.8640591755079933, + "learning_rate": 3.703500637301438e-05, + "loss": 0.1978, + "step": 12141 + }, + { + "epoch": 1.43981975572157, + "grad_norm": 1.2049961743000681, + "learning_rate": 3.7032902398893776e-05, + "loss": 0.2401, + "step": 12142 + }, + { + "epoch": 1.439938337483695, + "grad_norm": 0.9113250673839405, + "learning_rate": 3.703079831384456e-05, + "loss": 0.1918, + "step": 12143 + }, + { + "epoch": 1.44005691924582, + "grad_norm": 1.1132092973719285, + "learning_rate": 3.702869411788613e-05, + "loss": 0.2279, + "step": 12144 + }, + { + "epoch": 1.440175501007945, + "grad_norm": 0.9315445828953833, + "learning_rate": 3.70265898110379e-05, + "loss": 0.2287, + "step": 12145 + }, + { + "epoch": 1.44029408277007, + "grad_norm": 1.5793747923873895, + "learning_rate": 3.702448539331925e-05, + "loss": 0.3726, + "step": 12146 + }, + { + "epoch": 1.4404126645321949, + "grad_norm": 1.0579605258454343, + "learning_rate": 3.70223808647496e-05, + "loss": 0.2096, + "step": 12147 + }, + { + "epoch": 1.44053124629432, + "grad_norm": 0.8416604090059927, + "learning_rate": 3.702027622534834e-05, + "loss": 0.1974, + "step": 12148 + }, + { + "epoch": 1.4406498280564448, + "grad_norm": 1.6696086177691976, + "learning_rate": 3.701817147513487e-05, + "loss": 0.2514, + "step": 12149 + }, + { + "epoch": 1.44076840981857, + "grad_norm": 1.3720970327870996, + "learning_rate": 3.70160666141286e-05, + "loss": 0.4278, + "step": 12150 + }, + { + "epoch": 1.4408869915806948, + "grad_norm": 1.2912701658261259, + "learning_rate": 3.701396164234893e-05, + "loss": 0.3175, + "step": 12151 + }, + { + "epoch": 1.44100557334282, + "grad_norm": 0.8155265666240815, + "learning_rate": 3.7011856559815266e-05, + "loss": 0.2086, + "step": 12152 + }, + { + "epoch": 1.4411241551049447, + "grad_norm": 1.087376244295926, + "learning_rate": 3.7009751366547015e-05, + "loss": 0.2277, + "step": 12153 + }, + { + "epoch": 1.44124273686707, + "grad_norm": 1.1233840498441974, + "learning_rate": 3.7007646062563575e-05, + "loss": 0.264, + "step": 12154 + }, + { + "epoch": 1.4413613186291947, + "grad_norm": 1.0138217564956953, + "learning_rate": 3.7005540647884374e-05, + "loss": 0.208, + "step": 12155 + }, + { + "epoch": 1.44147990039132, + "grad_norm": 0.9935043427247249, + "learning_rate": 3.7003435122528806e-05, + "loss": 0.2126, + "step": 12156 + }, + { + "epoch": 1.4415984821534449, + "grad_norm": 1.4618013413565982, + "learning_rate": 3.7001329486516284e-05, + "loss": 0.3621, + "step": 12157 + }, + { + "epoch": 1.4417170639155699, + "grad_norm": 1.717423872743809, + "learning_rate": 3.699922373986622e-05, + "loss": 0.3687, + "step": 12158 + }, + { + "epoch": 1.4418356456776948, + "grad_norm": 0.8443764757110154, + "learning_rate": 3.6997117882598036e-05, + "loss": 0.1598, + "step": 12159 + }, + { + "epoch": 1.4419542274398198, + "grad_norm": 1.3657522143309473, + "learning_rate": 3.699501191473113e-05, + "loss": 0.1981, + "step": 12160 + }, + { + "epoch": 1.4420728092019448, + "grad_norm": 1.4337418602248349, + "learning_rate": 3.699290583628493e-05, + "loss": 0.2523, + "step": 12161 + }, + { + "epoch": 1.4421913909640698, + "grad_norm": 1.0364865255841373, + "learning_rate": 3.6990799647278837e-05, + "loss": 0.2144, + "step": 12162 + }, + { + "epoch": 1.4423099727261948, + "grad_norm": 1.7138358209049196, + "learning_rate": 3.698869334773228e-05, + "loss": 0.3402, + "step": 12163 + }, + { + "epoch": 1.4424285544883197, + "grad_norm": 0.9941980350224968, + "learning_rate": 3.698658693766467e-05, + "loss": 0.2237, + "step": 12164 + }, + { + "epoch": 1.4425471362504447, + "grad_norm": 1.649175940162588, + "learning_rate": 3.698448041709543e-05, + "loss": 0.3393, + "step": 12165 + }, + { + "epoch": 1.4426657180125697, + "grad_norm": 0.9749192857771277, + "learning_rate": 3.698237378604397e-05, + "loss": 0.2181, + "step": 12166 + }, + { + "epoch": 1.4427842997746947, + "grad_norm": 1.0291389934138202, + "learning_rate": 3.698026704452972e-05, + "loss": 0.2583, + "step": 12167 + }, + { + "epoch": 1.4429028815368197, + "grad_norm": 2.1383302194127616, + "learning_rate": 3.697816019257211e-05, + "loss": 0.3335, + "step": 12168 + }, + { + "epoch": 1.4430214632989447, + "grad_norm": 1.0626168768887325, + "learning_rate": 3.697605323019054e-05, + "loss": 0.2396, + "step": 12169 + }, + { + "epoch": 1.4431400450610696, + "grad_norm": 1.5120342182606679, + "learning_rate": 3.697394615740445e-05, + "loss": 0.3272, + "step": 12170 + }, + { + "epoch": 1.4432586268231946, + "grad_norm": 0.9601011709659812, + "learning_rate": 3.6971838974233255e-05, + "loss": 0.1828, + "step": 12171 + }, + { + "epoch": 1.4433772085853196, + "grad_norm": 1.1148842238749943, + "learning_rate": 3.696973168069639e-05, + "loss": 0.2987, + "step": 12172 + }, + { + "epoch": 1.4434957903474446, + "grad_norm": 1.2008868772250658, + "learning_rate": 3.6967624276813275e-05, + "loss": 0.2405, + "step": 12173 + }, + { + "epoch": 1.4436143721095696, + "grad_norm": 0.922807572795794, + "learning_rate": 3.696551676260334e-05, + "loss": 0.1468, + "step": 12174 + }, + { + "epoch": 1.4437329538716945, + "grad_norm": 1.0161915066318543, + "learning_rate": 3.696340913808601e-05, + "loss": 0.2165, + "step": 12175 + }, + { + "epoch": 1.4438515356338195, + "grad_norm": 0.9652384465093231, + "learning_rate": 3.6961301403280725e-05, + "loss": 0.2026, + "step": 12176 + }, + { + "epoch": 1.4439701173959445, + "grad_norm": 1.2685258524774872, + "learning_rate": 3.695919355820691e-05, + "loss": 0.2296, + "step": 12177 + }, + { + "epoch": 1.4440886991580695, + "grad_norm": 1.2943352977546778, + "learning_rate": 3.695708560288399e-05, + "loss": 0.2132, + "step": 12178 + }, + { + "epoch": 1.4442072809201945, + "grad_norm": 1.142763323018037, + "learning_rate": 3.69549775373314e-05, + "loss": 0.2621, + "step": 12179 + }, + { + "epoch": 1.4443258626823194, + "grad_norm": 0.8225918950128126, + "learning_rate": 3.695286936156859e-05, + "loss": 0.1497, + "step": 12180 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 1.5620511589821982, + "learning_rate": 3.695076107561497e-05, + "loss": 0.274, + "step": 12181 + }, + { + "epoch": 1.4445630262065694, + "grad_norm": 1.2069869313018584, + "learning_rate": 3.694865267948999e-05, + "loss": 0.3309, + "step": 12182 + }, + { + "epoch": 1.4446816079686944, + "grad_norm": 0.9333597906966883, + "learning_rate": 3.694654417321309e-05, + "loss": 0.1859, + "step": 12183 + }, + { + "epoch": 1.4448001897308194, + "grad_norm": 1.055216022500823, + "learning_rate": 3.69444355568037e-05, + "loss": 0.1933, + "step": 12184 + }, + { + "epoch": 1.4449187714929443, + "grad_norm": 0.9745454730887769, + "learning_rate": 3.694232683028126e-05, + "loss": 0.1879, + "step": 12185 + }, + { + "epoch": 1.4450373532550693, + "grad_norm": 1.512631232716978, + "learning_rate": 3.694021799366521e-05, + "loss": 0.3394, + "step": 12186 + }, + { + "epoch": 1.4451559350171943, + "grad_norm": 0.9395034124201209, + "learning_rate": 3.6938109046975e-05, + "loss": 0.2074, + "step": 12187 + }, + { + "epoch": 1.4452745167793193, + "grad_norm": 0.9503346359768939, + "learning_rate": 3.693599999023006e-05, + "loss": 0.2272, + "step": 12188 + }, + { + "epoch": 1.4453930985414443, + "grad_norm": 1.2195741660527957, + "learning_rate": 3.693389082344984e-05, + "loss": 0.2158, + "step": 12189 + }, + { + "epoch": 1.4455116803035692, + "grad_norm": 1.1719409108533285, + "learning_rate": 3.693178154665377e-05, + "loss": 0.2581, + "step": 12190 + }, + { + "epoch": 1.4456302620656942, + "grad_norm": 0.9129462030887218, + "learning_rate": 3.692967215986132e-05, + "loss": 0.2131, + "step": 12191 + }, + { + "epoch": 1.4457488438278192, + "grad_norm": 1.0619936866245898, + "learning_rate": 3.692756266309191e-05, + "loss": 0.2615, + "step": 12192 + }, + { + "epoch": 1.4458674255899442, + "grad_norm": 0.959478265348533, + "learning_rate": 3.6925453056365e-05, + "loss": 0.1997, + "step": 12193 + }, + { + "epoch": 1.4459860073520692, + "grad_norm": 1.1172310299964452, + "learning_rate": 3.6923343339700046e-05, + "loss": 0.2481, + "step": 12194 + }, + { + "epoch": 1.4461045891141944, + "grad_norm": 0.9074343287218414, + "learning_rate": 3.692123351311648e-05, + "loss": 0.1866, + "step": 12195 + }, + { + "epoch": 1.4462231708763191, + "grad_norm": 1.2495599752376991, + "learning_rate": 3.691912357663376e-05, + "loss": 0.257, + "step": 12196 + }, + { + "epoch": 1.4463417526384443, + "grad_norm": 0.7445929078642342, + "learning_rate": 3.691701353027133e-05, + "loss": 0.1257, + "step": 12197 + }, + { + "epoch": 1.446460334400569, + "grad_norm": 1.1957838421968132, + "learning_rate": 3.691490337404867e-05, + "loss": 0.2839, + "step": 12198 + }, + { + "epoch": 1.4465789161626943, + "grad_norm": 1.464660161213013, + "learning_rate": 3.6912793107985185e-05, + "loss": 0.3276, + "step": 12199 + }, + { + "epoch": 1.446697497924819, + "grad_norm": 1.5558596614619342, + "learning_rate": 3.691068273210038e-05, + "loss": 0.3633, + "step": 12200 + }, + { + "epoch": 1.4468160796869443, + "grad_norm": 0.9478738088155142, + "learning_rate": 3.690857224641367e-05, + "loss": 0.2722, + "step": 12201 + }, + { + "epoch": 1.446934661449069, + "grad_norm": 0.8906842293252789, + "learning_rate": 3.690646165094454e-05, + "loss": 0.1762, + "step": 12202 + }, + { + "epoch": 1.4470532432111942, + "grad_norm": 0.9707756252811277, + "learning_rate": 3.690435094571242e-05, + "loss": 0.2148, + "step": 12203 + }, + { + "epoch": 1.447171824973319, + "grad_norm": 0.9978080648395476, + "learning_rate": 3.69022401307368e-05, + "loss": 0.2074, + "step": 12204 + }, + { + "epoch": 1.4472904067354442, + "grad_norm": 1.1277481395581863, + "learning_rate": 3.690012920603711e-05, + "loss": 0.3017, + "step": 12205 + }, + { + "epoch": 1.447408988497569, + "grad_norm": 1.0921880466961518, + "learning_rate": 3.689801817163282e-05, + "loss": 0.2651, + "step": 12206 + }, + { + "epoch": 1.4475275702596941, + "grad_norm": 0.9383297844217009, + "learning_rate": 3.689590702754339e-05, + "loss": 0.1757, + "step": 12207 + }, + { + "epoch": 1.4476461520218191, + "grad_norm": 1.1426513218377106, + "learning_rate": 3.6893795773788296e-05, + "loss": 0.2813, + "step": 12208 + }, + { + "epoch": 1.447764733783944, + "grad_norm": 1.4067435698825275, + "learning_rate": 3.689168441038698e-05, + "loss": 0.222, + "step": 12209 + }, + { + "epoch": 1.447883315546069, + "grad_norm": 0.9918571098161546, + "learning_rate": 3.6889572937358925e-05, + "loss": 0.2544, + "step": 12210 + }, + { + "epoch": 1.448001897308194, + "grad_norm": 1.0255107367985967, + "learning_rate": 3.6887461354723584e-05, + "loss": 0.1987, + "step": 12211 + }, + { + "epoch": 1.448120479070319, + "grad_norm": 0.8716597002361176, + "learning_rate": 3.688534966250042e-05, + "loss": 0.2007, + "step": 12212 + }, + { + "epoch": 1.448239060832444, + "grad_norm": 1.2248894428232535, + "learning_rate": 3.688323786070892e-05, + "loss": 0.2144, + "step": 12213 + }, + { + "epoch": 1.448357642594569, + "grad_norm": 1.2035141977478596, + "learning_rate": 3.688112594936853e-05, + "loss": 0.2721, + "step": 12214 + }, + { + "epoch": 1.448476224356694, + "grad_norm": 1.1950343904592842, + "learning_rate": 3.687901392849873e-05, + "loss": 0.2582, + "step": 12215 + }, + { + "epoch": 1.448594806118819, + "grad_norm": 0.9655834022501604, + "learning_rate": 3.687690179811899e-05, + "loss": 0.1977, + "step": 12216 + }, + { + "epoch": 1.448713387880944, + "grad_norm": 0.895054909691713, + "learning_rate": 3.687478955824878e-05, + "loss": 0.1688, + "step": 12217 + }, + { + "epoch": 1.448831969643069, + "grad_norm": 0.9687424792982577, + "learning_rate": 3.687267720890757e-05, + "loss": 0.2351, + "step": 12218 + }, + { + "epoch": 1.448950551405194, + "grad_norm": 0.8549193893216772, + "learning_rate": 3.687056475011484e-05, + "loss": 0.2171, + "step": 12219 + }, + { + "epoch": 1.449069133167319, + "grad_norm": 1.5131787628528983, + "learning_rate": 3.6868452181890056e-05, + "loss": 0.3021, + "step": 12220 + }, + { + "epoch": 1.4491877149294439, + "grad_norm": 0.982503211298969, + "learning_rate": 3.68663395042527e-05, + "loss": 0.1963, + "step": 12221 + }, + { + "epoch": 1.4493062966915689, + "grad_norm": 0.9589542043067245, + "learning_rate": 3.686422671722224e-05, + "loss": 0.2101, + "step": 12222 + }, + { + "epoch": 1.4494248784536938, + "grad_norm": 0.9420526271178044, + "learning_rate": 3.686211382081816e-05, + "loss": 0.2201, + "step": 12223 + }, + { + "epoch": 1.4495434602158188, + "grad_norm": 1.023615929317855, + "learning_rate": 3.6860000815059936e-05, + "loss": 0.207, + "step": 12224 + }, + { + "epoch": 1.4496620419779438, + "grad_norm": 1.3800051521562446, + "learning_rate": 3.6857887699967044e-05, + "loss": 0.286, + "step": 12225 + }, + { + "epoch": 1.4497806237400688, + "grad_norm": 1.276240733579864, + "learning_rate": 3.685577447555898e-05, + "loss": 0.3147, + "step": 12226 + }, + { + "epoch": 1.4498992055021938, + "grad_norm": 0.9455233249549477, + "learning_rate": 3.6853661141855206e-05, + "loss": 0.2185, + "step": 12227 + }, + { + "epoch": 1.4500177872643187, + "grad_norm": 1.2079810297883649, + "learning_rate": 3.6851547698875216e-05, + "loss": 0.2528, + "step": 12228 + }, + { + "epoch": 1.4501363690264437, + "grad_norm": 1.1117682427973303, + "learning_rate": 3.6849434146638486e-05, + "loss": 0.2328, + "step": 12229 + }, + { + "epoch": 1.4502549507885687, + "grad_norm": 1.2394147393109731, + "learning_rate": 3.684732048516451e-05, + "loss": 0.2592, + "step": 12230 + }, + { + "epoch": 1.4503735325506937, + "grad_norm": 0.8979086673441969, + "learning_rate": 3.684520671447276e-05, + "loss": 0.1723, + "step": 12231 + }, + { + "epoch": 1.4504921143128187, + "grad_norm": 1.0650155506413088, + "learning_rate": 3.684309283458274e-05, + "loss": 0.257, + "step": 12232 + }, + { + "epoch": 1.4506106960749436, + "grad_norm": 0.6899012513926602, + "learning_rate": 3.6840978845513914e-05, + "loss": 0.1704, + "step": 12233 + }, + { + "epoch": 1.4507292778370686, + "grad_norm": 1.0380791173135553, + "learning_rate": 3.683886474728579e-05, + "loss": 0.2231, + "step": 12234 + }, + { + "epoch": 1.4508478595991936, + "grad_norm": 2.2284520112955897, + "learning_rate": 3.6836750539917845e-05, + "loss": 0.4795, + "step": 12235 + }, + { + "epoch": 1.4509664413613186, + "grad_norm": 1.0644428517785116, + "learning_rate": 3.6834636223429585e-05, + "loss": 0.2043, + "step": 12236 + }, + { + "epoch": 1.4510850231234436, + "grad_norm": 1.1183803323134953, + "learning_rate": 3.6832521797840487e-05, + "loss": 0.2395, + "step": 12237 + }, + { + "epoch": 1.4512036048855685, + "grad_norm": 1.4565152055620314, + "learning_rate": 3.6830407263170045e-05, + "loss": 0.2995, + "step": 12238 + }, + { + "epoch": 1.4513221866476935, + "grad_norm": 1.2251968985606962, + "learning_rate": 3.682829261943776e-05, + "loss": 0.334, + "step": 12239 + }, + { + "epoch": 1.4514407684098185, + "grad_norm": 1.2502312484779226, + "learning_rate": 3.682617786666312e-05, + "loss": 0.2735, + "step": 12240 + }, + { + "epoch": 1.4515593501719435, + "grad_norm": 0.8235032361825881, + "learning_rate": 3.682406300486562e-05, + "loss": 0.2104, + "step": 12241 + }, + { + "epoch": 1.4516779319340685, + "grad_norm": 1.435215667390115, + "learning_rate": 3.6821948034064763e-05, + "loss": 0.3475, + "step": 12242 + }, + { + "epoch": 1.4517965136961934, + "grad_norm": 1.25876237141467, + "learning_rate": 3.681983295428004e-05, + "loss": 0.2934, + "step": 12243 + }, + { + "epoch": 1.4519150954583184, + "grad_norm": 1.0084153803362628, + "learning_rate": 3.6817717765530954e-05, + "loss": 0.2319, + "step": 12244 + }, + { + "epoch": 1.4520336772204434, + "grad_norm": 0.6738907653291393, + "learning_rate": 3.6815602467837e-05, + "loss": 0.1469, + "step": 12245 + }, + { + "epoch": 1.4521522589825686, + "grad_norm": 0.8893729620002367, + "learning_rate": 3.681348706121768e-05, + "loss": 0.1898, + "step": 12246 + }, + { + "epoch": 1.4522708407446934, + "grad_norm": 1.0941492495264937, + "learning_rate": 3.68113715456925e-05, + "loss": 0.2736, + "step": 12247 + }, + { + "epoch": 1.4523894225068186, + "grad_norm": 0.8828198699332169, + "learning_rate": 3.680925592128095e-05, + "loss": 0.2152, + "step": 12248 + }, + { + "epoch": 1.4525080042689433, + "grad_norm": 1.0783744818660763, + "learning_rate": 3.6807140188002545e-05, + "loss": 0.2719, + "step": 12249 + }, + { + "epoch": 1.4526265860310685, + "grad_norm": 1.0956960076314506, + "learning_rate": 3.680502434587679e-05, + "loss": 0.2181, + "step": 12250 + }, + { + "epoch": 1.4527451677931933, + "grad_norm": 0.8558948218197985, + "learning_rate": 3.6802908394923184e-05, + "loss": 0.2197, + "step": 12251 + }, + { + "epoch": 1.4528637495553185, + "grad_norm": 0.9869588028573135, + "learning_rate": 3.680079233516124e-05, + "loss": 0.2144, + "step": 12252 + }, + { + "epoch": 1.4529823313174433, + "grad_norm": 1.1830935856323397, + "learning_rate": 3.679867616661046e-05, + "loss": 0.2112, + "step": 12253 + }, + { + "epoch": 1.4531009130795685, + "grad_norm": 1.1187869552407637, + "learning_rate": 3.679655988929035e-05, + "loss": 0.2699, + "step": 12254 + }, + { + "epoch": 1.4532194948416932, + "grad_norm": 0.9987588997600297, + "learning_rate": 3.679444350322043e-05, + "loss": 0.243, + "step": 12255 + }, + { + "epoch": 1.4533380766038184, + "grad_norm": 1.1012858111109607, + "learning_rate": 3.6792327008420196e-05, + "loss": 0.2859, + "step": 12256 + }, + { + "epoch": 1.4534566583659434, + "grad_norm": 1.0674056585774692, + "learning_rate": 3.679021040490917e-05, + "loss": 0.2445, + "step": 12257 + }, + { + "epoch": 1.4535752401280684, + "grad_norm": 0.8539995164157435, + "learning_rate": 3.678809369270687e-05, + "loss": 0.1604, + "step": 12258 + }, + { + "epoch": 1.4536938218901934, + "grad_norm": 0.9160646618215362, + "learning_rate": 3.6785976871832795e-05, + "loss": 0.1973, + "step": 12259 + }, + { + "epoch": 1.4538124036523183, + "grad_norm": 1.0344879301339405, + "learning_rate": 3.678385994230647e-05, + "loss": 0.2065, + "step": 12260 + }, + { + "epoch": 1.4539309854144433, + "grad_norm": 1.081235154701906, + "learning_rate": 3.6781742904147405e-05, + "loss": 0.1804, + "step": 12261 + }, + { + "epoch": 1.4540495671765683, + "grad_norm": 0.8614960220176746, + "learning_rate": 3.677962575737512e-05, + "loss": 0.1373, + "step": 12262 + }, + { + "epoch": 1.4541681489386933, + "grad_norm": 0.9684639090837732, + "learning_rate": 3.677750850200912e-05, + "loss": 0.172, + "step": 12263 + }, + { + "epoch": 1.4542867307008183, + "grad_norm": 0.8555663981142602, + "learning_rate": 3.677539113806894e-05, + "loss": 0.1567, + "step": 12264 + }, + { + "epoch": 1.4544053124629432, + "grad_norm": 1.0638556910569572, + "learning_rate": 3.67732736655741e-05, + "loss": 0.1807, + "step": 12265 + }, + { + "epoch": 1.4545238942250682, + "grad_norm": 1.1661544619865754, + "learning_rate": 3.677115608454411e-05, + "loss": 0.228, + "step": 12266 + }, + { + "epoch": 1.4546424759871932, + "grad_norm": 1.1994229884248102, + "learning_rate": 3.67690383949985e-05, + "loss": 0.215, + "step": 12267 + }, + { + "epoch": 1.4547610577493182, + "grad_norm": 0.9827556850296882, + "learning_rate": 3.676692059695678e-05, + "loss": 0.2167, + "step": 12268 + }, + { + "epoch": 1.4548796395114432, + "grad_norm": 2.0313707788650004, + "learning_rate": 3.6764802690438486e-05, + "loss": 0.4534, + "step": 12269 + }, + { + "epoch": 1.4549982212735681, + "grad_norm": 1.279833434244065, + "learning_rate": 3.676268467546314e-05, + "loss": 0.2403, + "step": 12270 + }, + { + "epoch": 1.4551168030356931, + "grad_norm": 0.8584872107195795, + "learning_rate": 3.676056655205026e-05, + "loss": 0.1796, + "step": 12271 + }, + { + "epoch": 1.455235384797818, + "grad_norm": 0.8262855633899494, + "learning_rate": 3.6758448320219384e-05, + "loss": 0.2049, + "step": 12272 + }, + { + "epoch": 1.455353966559943, + "grad_norm": 0.745077299437538, + "learning_rate": 3.6756329979990034e-05, + "loss": 0.1832, + "step": 12273 + }, + { + "epoch": 1.455472548322068, + "grad_norm": 1.2322572728066494, + "learning_rate": 3.675421153138173e-05, + "loss": 0.2364, + "step": 12274 + }, + { + "epoch": 1.455591130084193, + "grad_norm": 1.4030297853775844, + "learning_rate": 3.675209297441401e-05, + "loss": 0.2384, + "step": 12275 + }, + { + "epoch": 1.455709711846318, + "grad_norm": 0.8482227892171417, + "learning_rate": 3.674997430910641e-05, + "loss": 0.1692, + "step": 12276 + }, + { + "epoch": 1.455828293608443, + "grad_norm": 1.1332947876105621, + "learning_rate": 3.674785553547846e-05, + "loss": 0.2697, + "step": 12277 + }, + { + "epoch": 1.455946875370568, + "grad_norm": 1.0615378085733242, + "learning_rate": 3.6745736653549685e-05, + "loss": 0.2189, + "step": 12278 + }, + { + "epoch": 1.456065457132693, + "grad_norm": 1.0399396867542103, + "learning_rate": 3.674361766333962e-05, + "loss": 0.2197, + "step": 12279 + }, + { + "epoch": 1.456184038894818, + "grad_norm": 0.878744174154464, + "learning_rate": 3.67414985648678e-05, + "loss": 0.1667, + "step": 12280 + }, + { + "epoch": 1.456302620656943, + "grad_norm": 1.0191861916385463, + "learning_rate": 3.673937935815376e-05, + "loss": 0.173, + "step": 12281 + }, + { + "epoch": 1.456421202419068, + "grad_norm": 0.9447760219365109, + "learning_rate": 3.673726004321704e-05, + "loss": 0.2254, + "step": 12282 + }, + { + "epoch": 1.456539784181193, + "grad_norm": 1.1989098086515755, + "learning_rate": 3.673514062007718e-05, + "loss": 0.2812, + "step": 12283 + }, + { + "epoch": 1.4566583659433179, + "grad_norm": 1.0240422065439412, + "learning_rate": 3.673302108875371e-05, + "loss": 0.2159, + "step": 12284 + }, + { + "epoch": 1.4567769477054429, + "grad_norm": 1.166623398453887, + "learning_rate": 3.673090144926617e-05, + "loss": 0.2418, + "step": 12285 + }, + { + "epoch": 1.4568955294675678, + "grad_norm": 1.202779373678851, + "learning_rate": 3.6728781701634105e-05, + "loss": 0.2003, + "step": 12286 + }, + { + "epoch": 1.4570141112296928, + "grad_norm": 0.9741255441323444, + "learning_rate": 3.672666184587706e-05, + "loss": 0.1938, + "step": 12287 + }, + { + "epoch": 1.4571326929918178, + "grad_norm": 1.0691014345480012, + "learning_rate": 3.672454188201457e-05, + "loss": 0.17, + "step": 12288 + }, + { + "epoch": 1.4572512747539428, + "grad_norm": 1.0022702158888146, + "learning_rate": 3.672242181006618e-05, + "loss": 0.2512, + "step": 12289 + }, + { + "epoch": 1.4573698565160678, + "grad_norm": 0.9228886681320742, + "learning_rate": 3.672030163005145e-05, + "loss": 0.2221, + "step": 12290 + }, + { + "epoch": 1.4574884382781927, + "grad_norm": 0.9365219805134268, + "learning_rate": 3.6718181341989896e-05, + "loss": 0.1697, + "step": 12291 + }, + { + "epoch": 1.4576070200403177, + "grad_norm": 0.9647917952510675, + "learning_rate": 3.671606094590108e-05, + "loss": 0.2032, + "step": 12292 + }, + { + "epoch": 1.4577256018024427, + "grad_norm": 0.9031688426784392, + "learning_rate": 3.671394044180455e-05, + "loss": 0.1359, + "step": 12293 + }, + { + "epoch": 1.4578441835645677, + "grad_norm": 1.0630753365979604, + "learning_rate": 3.671181982971986e-05, + "loss": 0.2046, + "step": 12294 + }, + { + "epoch": 1.4579627653266929, + "grad_norm": 1.2857304351284575, + "learning_rate": 3.6709699109666544e-05, + "loss": 0.2248, + "step": 12295 + }, + { + "epoch": 1.4580813470888176, + "grad_norm": 1.2261079573542604, + "learning_rate": 3.6707578281664165e-05, + "loss": 0.2693, + "step": 12296 + }, + { + "epoch": 1.4581999288509429, + "grad_norm": 1.0895434361857286, + "learning_rate": 3.670545734573228e-05, + "loss": 0.2157, + "step": 12297 + }, + { + "epoch": 1.4583185106130676, + "grad_norm": 1.03110542776726, + "learning_rate": 3.670333630189042e-05, + "loss": 0.2504, + "step": 12298 + }, + { + "epoch": 1.4584370923751928, + "grad_norm": 0.7865946870586522, + "learning_rate": 3.670121515015817e-05, + "loss": 0.1974, + "step": 12299 + }, + { + "epoch": 1.4585556741373176, + "grad_norm": 1.093334756333083, + "learning_rate": 3.669909389055504e-05, + "loss": 0.2367, + "step": 12300 + }, + { + "epoch": 1.4586742558994428, + "grad_norm": 1.266400971550583, + "learning_rate": 3.669697252310063e-05, + "loss": 0.256, + "step": 12301 + }, + { + "epoch": 1.4587928376615675, + "grad_norm": 1.6086711997025454, + "learning_rate": 3.669485104781447e-05, + "loss": 0.2698, + "step": 12302 + }, + { + "epoch": 1.4589114194236927, + "grad_norm": 1.710026487805769, + "learning_rate": 3.6692729464716124e-05, + "loss": 0.3453, + "step": 12303 + }, + { + "epoch": 1.4590300011858175, + "grad_norm": 1.1283672705388401, + "learning_rate": 3.6690607773825146e-05, + "loss": 0.2484, + "step": 12304 + }, + { + "epoch": 1.4591485829479427, + "grad_norm": 1.5585241653931248, + "learning_rate": 3.6688485975161106e-05, + "loss": 0.3343, + "step": 12305 + }, + { + "epoch": 1.4592671647100675, + "grad_norm": 1.1926424195174943, + "learning_rate": 3.668636406874356e-05, + "loss": 0.2748, + "step": 12306 + }, + { + "epoch": 1.4593857464721927, + "grad_norm": 1.6112903562850456, + "learning_rate": 3.6684242054592065e-05, + "loss": 0.3529, + "step": 12307 + }, + { + "epoch": 1.4595043282343176, + "grad_norm": 0.9594574109864177, + "learning_rate": 3.668211993272619e-05, + "loss": 0.1908, + "step": 12308 + }, + { + "epoch": 1.4596229099964426, + "grad_norm": 0.9786564694463649, + "learning_rate": 3.667999770316549e-05, + "loss": 0.2018, + "step": 12309 + }, + { + "epoch": 1.4597414917585676, + "grad_norm": 1.0752999349573253, + "learning_rate": 3.667787536592954e-05, + "loss": 0.2195, + "step": 12310 + }, + { + "epoch": 1.4598600735206926, + "grad_norm": 0.996378416062834, + "learning_rate": 3.6675752921037885e-05, + "loss": 0.1789, + "step": 12311 + }, + { + "epoch": 1.4599786552828176, + "grad_norm": 1.0461935717462898, + "learning_rate": 3.667363036851012e-05, + "loss": 0.2966, + "step": 12312 + }, + { + "epoch": 1.4600972370449425, + "grad_norm": 1.1289552026985015, + "learning_rate": 3.66715077083658e-05, + "loss": 0.261, + "step": 12313 + }, + { + "epoch": 1.4602158188070675, + "grad_norm": 1.081670782459618, + "learning_rate": 3.6669384940624485e-05, + "loss": 0.2674, + "step": 12314 + }, + { + "epoch": 1.4603344005691925, + "grad_norm": 1.0737624611227476, + "learning_rate": 3.666726206530575e-05, + "loss": 0.1915, + "step": 12315 + }, + { + "epoch": 1.4604529823313175, + "grad_norm": 0.7112217217361511, + "learning_rate": 3.666513908242917e-05, + "loss": 0.1848, + "step": 12316 + }, + { + "epoch": 1.4605715640934425, + "grad_norm": 1.3946090787650036, + "learning_rate": 3.6663015992014304e-05, + "loss": 0.2832, + "step": 12317 + }, + { + "epoch": 1.4606901458555674, + "grad_norm": 0.8750720839595114, + "learning_rate": 3.666089279408075e-05, + "loss": 0.1729, + "step": 12318 + }, + { + "epoch": 1.4608087276176924, + "grad_norm": 0.9794133992751233, + "learning_rate": 3.6658769488648046e-05, + "loss": 0.2006, + "step": 12319 + }, + { + "epoch": 1.4609273093798174, + "grad_norm": 1.0574411839047053, + "learning_rate": 3.6656646075735794e-05, + "loss": 0.2228, + "step": 12320 + }, + { + "epoch": 1.4610458911419424, + "grad_norm": 0.9551736295748933, + "learning_rate": 3.6654522555363555e-05, + "loss": 0.2023, + "step": 12321 + }, + { + "epoch": 1.4611644729040674, + "grad_norm": 1.0311334964706225, + "learning_rate": 3.665239892755092e-05, + "loss": 0.2515, + "step": 12322 + }, + { + "epoch": 1.4612830546661923, + "grad_norm": 0.9440407751634907, + "learning_rate": 3.6650275192317444e-05, + "loss": 0.2101, + "step": 12323 + }, + { + "epoch": 1.4614016364283173, + "grad_norm": 1.6382732719086555, + "learning_rate": 3.664815134968272e-05, + "loss": 0.3546, + "step": 12324 + }, + { + "epoch": 1.4615202181904423, + "grad_norm": 0.9855442659404148, + "learning_rate": 3.6646027399666325e-05, + "loss": 0.2102, + "step": 12325 + }, + { + "epoch": 1.4616387999525673, + "grad_norm": 0.9557453989010988, + "learning_rate": 3.6643903342287835e-05, + "loss": 0.1812, + "step": 12326 + }, + { + "epoch": 1.4617573817146923, + "grad_norm": 1.026413728146968, + "learning_rate": 3.6641779177566845e-05, + "loss": 0.1916, + "step": 12327 + }, + { + "epoch": 1.4618759634768173, + "grad_norm": 1.2430076907617083, + "learning_rate": 3.663965490552292e-05, + "loss": 0.2072, + "step": 12328 + }, + { + "epoch": 1.4619945452389422, + "grad_norm": 0.8484890847817763, + "learning_rate": 3.663753052617565e-05, + "loss": 0.1851, + "step": 12329 + }, + { + "epoch": 1.4621131270010672, + "grad_norm": 1.2053906807278507, + "learning_rate": 3.6635406039544615e-05, + "loss": 0.2421, + "step": 12330 + }, + { + "epoch": 1.4622317087631922, + "grad_norm": 0.9865983469896434, + "learning_rate": 3.663328144564942e-05, + "loss": 0.2419, + "step": 12331 + }, + { + "epoch": 1.4623502905253172, + "grad_norm": 1.371945372084337, + "learning_rate": 3.663115674450962e-05, + "loss": 0.3365, + "step": 12332 + }, + { + "epoch": 1.4624688722874422, + "grad_norm": 0.9382855609483034, + "learning_rate": 3.662903193614483e-05, + "loss": 0.2315, + "step": 12333 + }, + { + "epoch": 1.4625874540495671, + "grad_norm": 0.9655609681419519, + "learning_rate": 3.662690702057462e-05, + "loss": 0.1742, + "step": 12334 + }, + { + "epoch": 1.4627060358116921, + "grad_norm": 1.082640250176452, + "learning_rate": 3.662478199781858e-05, + "loss": 0.2733, + "step": 12335 + }, + { + "epoch": 1.462824617573817, + "grad_norm": 1.0180068442918755, + "learning_rate": 3.662265686789631e-05, + "loss": 0.2141, + "step": 12336 + }, + { + "epoch": 1.462943199335942, + "grad_norm": 1.2466113593646266, + "learning_rate": 3.6620531630827406e-05, + "loss": 0.2338, + "step": 12337 + }, + { + "epoch": 1.463061781098067, + "grad_norm": 1.2288990134975681, + "learning_rate": 3.6618406286631434e-05, + "loss": 0.2546, + "step": 12338 + }, + { + "epoch": 1.463180362860192, + "grad_norm": 1.3810247476561341, + "learning_rate": 3.661628083532801e-05, + "loss": 0.253, + "step": 12339 + }, + { + "epoch": 1.463298944622317, + "grad_norm": 1.2576701873601877, + "learning_rate": 3.661415527693672e-05, + "loss": 0.2042, + "step": 12340 + }, + { + "epoch": 1.463417526384442, + "grad_norm": 1.1644194151180964, + "learning_rate": 3.6612029611477164e-05, + "loss": 0.2428, + "step": 12341 + }, + { + "epoch": 1.463536108146567, + "grad_norm": 1.2792061911871222, + "learning_rate": 3.660990383896894e-05, + "loss": 0.2157, + "step": 12342 + }, + { + "epoch": 1.463654689908692, + "grad_norm": 0.9887839319547241, + "learning_rate": 3.6607777959431627e-05, + "loss": 0.2125, + "step": 12343 + }, + { + "epoch": 1.4637732716708172, + "grad_norm": 1.4860402934677306, + "learning_rate": 3.660565197288484e-05, + "loss": 0.3007, + "step": 12344 + }, + { + "epoch": 1.463891853432942, + "grad_norm": 1.1456724446227498, + "learning_rate": 3.660352587934818e-05, + "loss": 0.2405, + "step": 12345 + }, + { + "epoch": 1.4640104351950671, + "grad_norm": 0.9784467126715762, + "learning_rate": 3.660139967884123e-05, + "loss": 0.2057, + "step": 12346 + }, + { + "epoch": 1.4641290169571919, + "grad_norm": 1.320205151320214, + "learning_rate": 3.659927337138361e-05, + "loss": 0.2149, + "step": 12347 + }, + { + "epoch": 1.464247598719317, + "grad_norm": 1.0695481181267392, + "learning_rate": 3.659714695699491e-05, + "loss": 0.1916, + "step": 12348 + }, + { + "epoch": 1.4643661804814418, + "grad_norm": 1.1138435085770009, + "learning_rate": 3.659502043569474e-05, + "loss": 0.2591, + "step": 12349 + }, + { + "epoch": 1.464484762243567, + "grad_norm": 0.9304644851497409, + "learning_rate": 3.65928938075027e-05, + "loss": 0.2156, + "step": 12350 + }, + { + "epoch": 1.4646033440056918, + "grad_norm": 0.8843822234770843, + "learning_rate": 3.659076707243839e-05, + "loss": 0.214, + "step": 12351 + }, + { + "epoch": 1.464721925767817, + "grad_norm": 0.9111461652187726, + "learning_rate": 3.6588640230521426e-05, + "loss": 0.2048, + "step": 12352 + }, + { + "epoch": 1.4648405075299418, + "grad_norm": 1.0683412783813215, + "learning_rate": 3.658651328177141e-05, + "loss": 0.1711, + "step": 12353 + }, + { + "epoch": 1.464959089292067, + "grad_norm": 0.9112118398928412, + "learning_rate": 3.6584386226207945e-05, + "loss": 0.206, + "step": 12354 + }, + { + "epoch": 1.4650776710541917, + "grad_norm": 0.8646674524733492, + "learning_rate": 3.658225906385064e-05, + "loss": 0.1876, + "step": 12355 + }, + { + "epoch": 1.465196252816317, + "grad_norm": 2.1749456004029994, + "learning_rate": 3.658013179471912e-05, + "loss": 0.4169, + "step": 12356 + }, + { + "epoch": 1.465314834578442, + "grad_norm": 1.2208833430020722, + "learning_rate": 3.657800441883298e-05, + "loss": 0.2355, + "step": 12357 + }, + { + "epoch": 1.465433416340567, + "grad_norm": 1.0744460969899985, + "learning_rate": 3.657587693621184e-05, + "loss": 0.2541, + "step": 12358 + }, + { + "epoch": 1.4655519981026919, + "grad_norm": 1.1294849890644532, + "learning_rate": 3.657374934687531e-05, + "loss": 0.2452, + "step": 12359 + }, + { + "epoch": 1.4656705798648169, + "grad_norm": 0.8792809647177741, + "learning_rate": 3.6571621650843005e-05, + "loss": 0.1944, + "step": 12360 + }, + { + "epoch": 1.4657891616269418, + "grad_norm": 0.9786865467535502, + "learning_rate": 3.656949384813454e-05, + "loss": 0.2454, + "step": 12361 + }, + { + "epoch": 1.4659077433890668, + "grad_norm": 1.0466843105316348, + "learning_rate": 3.6567365938769525e-05, + "loss": 0.226, + "step": 12362 + }, + { + "epoch": 1.4660263251511918, + "grad_norm": 0.9066714651993623, + "learning_rate": 3.656523792276758e-05, + "loss": 0.1787, + "step": 12363 + }, + { + "epoch": 1.4661449069133168, + "grad_norm": 0.8569475660476885, + "learning_rate": 3.656310980014832e-05, + "loss": 0.2456, + "step": 12364 + }, + { + "epoch": 1.4662634886754418, + "grad_norm": 0.9131221658855861, + "learning_rate": 3.6560981570931376e-05, + "loss": 0.2111, + "step": 12365 + }, + { + "epoch": 1.4663820704375667, + "grad_norm": 0.7418893424942871, + "learning_rate": 3.655885323513635e-05, + "loss": 0.1748, + "step": 12366 + }, + { + "epoch": 1.4665006521996917, + "grad_norm": 0.9936365346245545, + "learning_rate": 3.655672479278288e-05, + "loss": 0.2099, + "step": 12367 + }, + { + "epoch": 1.4666192339618167, + "grad_norm": 0.9983592452090637, + "learning_rate": 3.655459624389058e-05, + "loss": 0.2267, + "step": 12368 + }, + { + "epoch": 1.4667378157239417, + "grad_norm": 1.1621517879250578, + "learning_rate": 3.655246758847907e-05, + "loss": 0.2041, + "step": 12369 + }, + { + "epoch": 1.4668563974860667, + "grad_norm": 1.1130794696195996, + "learning_rate": 3.655033882656797e-05, + "loss": 0.1886, + "step": 12370 + }, + { + "epoch": 1.4669749792481916, + "grad_norm": 0.8761831765035717, + "learning_rate": 3.654820995817691e-05, + "loss": 0.2056, + "step": 12371 + }, + { + "epoch": 1.4670935610103166, + "grad_norm": 0.8804782275581297, + "learning_rate": 3.6546080983325527e-05, + "loss": 0.1748, + "step": 12372 + }, + { + "epoch": 1.4672121427724416, + "grad_norm": 0.9795086321613173, + "learning_rate": 3.6543951902033426e-05, + "loss": 0.2027, + "step": 12373 + }, + { + "epoch": 1.4673307245345666, + "grad_norm": 1.5240947843516957, + "learning_rate": 3.654182271432024e-05, + "loss": 0.2882, + "step": 12374 + }, + { + "epoch": 1.4674493062966916, + "grad_norm": 1.06657230058152, + "learning_rate": 3.653969342020561e-05, + "loss": 0.2361, + "step": 12375 + }, + { + "epoch": 1.4675678880588165, + "grad_norm": 1.242852902688949, + "learning_rate": 3.653756401970916e-05, + "loss": 0.2716, + "step": 12376 + }, + { + "epoch": 1.4676864698209415, + "grad_norm": 1.348213375793061, + "learning_rate": 3.6535434512850507e-05, + "loss": 0.2682, + "step": 12377 + }, + { + "epoch": 1.4678050515830665, + "grad_norm": 0.8727882389060837, + "learning_rate": 3.653330489964931e-05, + "loss": 0.268, + "step": 12378 + }, + { + "epoch": 1.4679236333451915, + "grad_norm": 1.09671254536131, + "learning_rate": 3.653117518012517e-05, + "loss": 0.2483, + "step": 12379 + }, + { + "epoch": 1.4680422151073165, + "grad_norm": 0.768070156041988, + "learning_rate": 3.6529045354297746e-05, + "loss": 0.2137, + "step": 12380 + }, + { + "epoch": 1.4681607968694415, + "grad_norm": 0.9090849554469114, + "learning_rate": 3.652691542218666e-05, + "loss": 0.2335, + "step": 12381 + }, + { + "epoch": 1.4682793786315664, + "grad_norm": 0.9212650772234733, + "learning_rate": 3.652478538381154e-05, + "loss": 0.1805, + "step": 12382 + }, + { + "epoch": 1.4683979603936914, + "grad_norm": 1.36235647359699, + "learning_rate": 3.652265523919204e-05, + "loss": 0.2686, + "step": 12383 + }, + { + "epoch": 1.4685165421558164, + "grad_norm": 1.3071815422753112, + "learning_rate": 3.652052498834778e-05, + "loss": 0.2551, + "step": 12384 + }, + { + "epoch": 1.4686351239179414, + "grad_norm": 0.9676389372007833, + "learning_rate": 3.6518394631298415e-05, + "loss": 0.2129, + "step": 12385 + }, + { + "epoch": 1.4687537056800664, + "grad_norm": 0.7869503222819934, + "learning_rate": 3.651626416806357e-05, + "loss": 0.1679, + "step": 12386 + }, + { + "epoch": 1.4688722874421913, + "grad_norm": 1.389054288855431, + "learning_rate": 3.6514133598662904e-05, + "loss": 0.3462, + "step": 12387 + }, + { + "epoch": 1.4689908692043163, + "grad_norm": 0.7923394330130631, + "learning_rate": 3.6512002923116026e-05, + "loss": 0.1799, + "step": 12388 + }, + { + "epoch": 1.4691094509664413, + "grad_norm": 0.9483388408560601, + "learning_rate": 3.650987214144262e-05, + "loss": 0.1484, + "step": 12389 + }, + { + "epoch": 1.4692280327285663, + "grad_norm": 0.8931070707398896, + "learning_rate": 3.650774125366229e-05, + "loss": 0.1873, + "step": 12390 + }, + { + "epoch": 1.4693466144906913, + "grad_norm": 1.0986234083476598, + "learning_rate": 3.650561025979471e-05, + "loss": 0.2329, + "step": 12391 + }, + { + "epoch": 1.4694651962528162, + "grad_norm": 1.5051881187709824, + "learning_rate": 3.650347915985951e-05, + "loss": 0.2832, + "step": 12392 + }, + { + "epoch": 1.4695837780149412, + "grad_norm": 1.039350408082825, + "learning_rate": 3.650134795387633e-05, + "loss": 0.2306, + "step": 12393 + }, + { + "epoch": 1.4697023597770662, + "grad_norm": 1.0206816627286972, + "learning_rate": 3.6499216641864825e-05, + "loss": 0.2229, + "step": 12394 + }, + { + "epoch": 1.4698209415391914, + "grad_norm": 1.4503479416744751, + "learning_rate": 3.649708522384465e-05, + "loss": 0.3339, + "step": 12395 + }, + { + "epoch": 1.4699395233013162, + "grad_norm": 1.054467065350489, + "learning_rate": 3.649495369983545e-05, + "loss": 0.229, + "step": 12396 + }, + { + "epoch": 1.4700581050634414, + "grad_norm": 0.8074766829708945, + "learning_rate": 3.649282206985687e-05, + "loss": 0.1415, + "step": 12397 + }, + { + "epoch": 1.4701766868255661, + "grad_norm": 1.3502874934598594, + "learning_rate": 3.649069033392857e-05, + "loss": 0.3511, + "step": 12398 + }, + { + "epoch": 1.4702952685876913, + "grad_norm": 1.4676275054539967, + "learning_rate": 3.6488558492070184e-05, + "loss": 0.3193, + "step": 12399 + }, + { + "epoch": 1.470413850349816, + "grad_norm": 0.9936680885892792, + "learning_rate": 3.648642654430139e-05, + "loss": 0.1672, + "step": 12400 + }, + { + "epoch": 1.4705324321119413, + "grad_norm": 0.9197420923804648, + "learning_rate": 3.648429449064182e-05, + "loss": 0.1663, + "step": 12401 + }, + { + "epoch": 1.470651013874066, + "grad_norm": 1.1302591130788422, + "learning_rate": 3.648216233111114e-05, + "loss": 0.2338, + "step": 12402 + }, + { + "epoch": 1.4707695956361913, + "grad_norm": 0.8516339670153439, + "learning_rate": 3.6480030065728996e-05, + "loss": 0.1557, + "step": 12403 + }, + { + "epoch": 1.470888177398316, + "grad_norm": 1.0503372138008173, + "learning_rate": 3.647789769451506e-05, + "loss": 0.2837, + "step": 12404 + }, + { + "epoch": 1.4710067591604412, + "grad_norm": 1.372206071615678, + "learning_rate": 3.647576521748898e-05, + "loss": 0.3297, + "step": 12405 + }, + { + "epoch": 1.471125340922566, + "grad_norm": 1.238263687132589, + "learning_rate": 3.647363263467043e-05, + "loss": 0.1871, + "step": 12406 + }, + { + "epoch": 1.4712439226846912, + "grad_norm": 1.4605723713583092, + "learning_rate": 3.647149994607904e-05, + "loss": 0.2479, + "step": 12407 + }, + { + "epoch": 1.4713625044468162, + "grad_norm": 1.0635405764908195, + "learning_rate": 3.6469367151734504e-05, + "loss": 0.2066, + "step": 12408 + }, + { + "epoch": 1.4714810862089411, + "grad_norm": 0.9274213856134043, + "learning_rate": 3.646723425165645e-05, + "loss": 0.2299, + "step": 12409 + }, + { + "epoch": 1.4715996679710661, + "grad_norm": 1.0937411147375538, + "learning_rate": 3.646510124586456e-05, + "loss": 0.2441, + "step": 12410 + }, + { + "epoch": 1.471718249733191, + "grad_norm": 0.9969916705772012, + "learning_rate": 3.646296813437851e-05, + "loss": 0.2307, + "step": 12411 + }, + { + "epoch": 1.471836831495316, + "grad_norm": 0.9437460272518766, + "learning_rate": 3.646083491721794e-05, + "loss": 0.192, + "step": 12412 + }, + { + "epoch": 1.471955413257441, + "grad_norm": 0.8676949075905334, + "learning_rate": 3.645870159440253e-05, + "loss": 0.1588, + "step": 12413 + }, + { + "epoch": 1.472073995019566, + "grad_norm": 1.0519203391299206, + "learning_rate": 3.645656816595194e-05, + "loss": 0.2506, + "step": 12414 + }, + { + "epoch": 1.472192576781691, + "grad_norm": 1.2645443384930473, + "learning_rate": 3.645443463188585e-05, + "loss": 0.2917, + "step": 12415 + }, + { + "epoch": 1.472311158543816, + "grad_norm": 1.3525097065132112, + "learning_rate": 3.645230099222391e-05, + "loss": 0.36, + "step": 12416 + }, + { + "epoch": 1.472429740305941, + "grad_norm": 0.754780939932602, + "learning_rate": 3.64501672469858e-05, + "loss": 0.1455, + "step": 12417 + }, + { + "epoch": 1.472548322068066, + "grad_norm": 1.2525221686339192, + "learning_rate": 3.644803339619118e-05, + "loss": 0.2564, + "step": 12418 + }, + { + "epoch": 1.472666903830191, + "grad_norm": 1.0725574806792075, + "learning_rate": 3.644589943985975e-05, + "loss": 0.2402, + "step": 12419 + }, + { + "epoch": 1.472785485592316, + "grad_norm": 1.0028510734988503, + "learning_rate": 3.644376537801115e-05, + "loss": 0.2399, + "step": 12420 + }, + { + "epoch": 1.472904067354441, + "grad_norm": 0.9980838774278437, + "learning_rate": 3.6441631210665075e-05, + "loss": 0.2023, + "step": 12421 + }, + { + "epoch": 1.4730226491165659, + "grad_norm": 1.1304144794476616, + "learning_rate": 3.643949693784118e-05, + "loss": 0.2368, + "step": 12422 + }, + { + "epoch": 1.4731412308786909, + "grad_norm": 1.2179398376159196, + "learning_rate": 3.6437362559559154e-05, + "loss": 0.2107, + "step": 12423 + }, + { + "epoch": 1.4732598126408158, + "grad_norm": 0.9697856249318505, + "learning_rate": 3.643522807583867e-05, + "loss": 0.2347, + "step": 12424 + }, + { + "epoch": 1.4733783944029408, + "grad_norm": 1.297657178624559, + "learning_rate": 3.643309348669941e-05, + "loss": 0.2771, + "step": 12425 + }, + { + "epoch": 1.4734969761650658, + "grad_norm": 1.3128479277369434, + "learning_rate": 3.643095879216105e-05, + "loss": 0.2614, + "step": 12426 + }, + { + "epoch": 1.4736155579271908, + "grad_norm": 1.3318712786624136, + "learning_rate": 3.6428823992243264e-05, + "loss": 0.2586, + "step": 12427 + }, + { + "epoch": 1.4737341396893158, + "grad_norm": 0.98877584666035, + "learning_rate": 3.642668908696574e-05, + "loss": 0.2126, + "step": 12428 + }, + { + "epoch": 1.4738527214514408, + "grad_norm": 0.9073026977961455, + "learning_rate": 3.642455407634815e-05, + "loss": 0.2114, + "step": 12429 + }, + { + "epoch": 1.4739713032135657, + "grad_norm": 0.9348074287867645, + "learning_rate": 3.6422418960410186e-05, + "loss": 0.2542, + "step": 12430 + }, + { + "epoch": 1.4740898849756907, + "grad_norm": 0.8656236461985849, + "learning_rate": 3.6420283739171515e-05, + "loss": 0.194, + "step": 12431 + }, + { + "epoch": 1.4742084667378157, + "grad_norm": 1.0840098412195487, + "learning_rate": 3.641814841265185e-05, + "loss": 0.2518, + "step": 12432 + }, + { + "epoch": 1.4743270484999407, + "grad_norm": 0.9228113613391247, + "learning_rate": 3.6416012980870846e-05, + "loss": 0.2275, + "step": 12433 + }, + { + "epoch": 1.4744456302620657, + "grad_norm": 0.7165140416358798, + "learning_rate": 3.6413877443848196e-05, + "loss": 0.1349, + "step": 12434 + }, + { + "epoch": 1.4745642120241906, + "grad_norm": 0.9864080702187864, + "learning_rate": 3.6411741801603604e-05, + "loss": 0.2014, + "step": 12435 + }, + { + "epoch": 1.4746827937863156, + "grad_norm": 1.1101378494153227, + "learning_rate": 3.6409606054156746e-05, + "loss": 0.2731, + "step": 12436 + }, + { + "epoch": 1.4748013755484406, + "grad_norm": 1.2467531846024609, + "learning_rate": 3.640747020152731e-05, + "loss": 0.2451, + "step": 12437 + }, + { + "epoch": 1.4749199573105656, + "grad_norm": 1.4186220960739688, + "learning_rate": 3.640533424373499e-05, + "loss": 0.2527, + "step": 12438 + }, + { + "epoch": 1.4750385390726906, + "grad_norm": 0.8973670679205755, + "learning_rate": 3.640319818079947e-05, + "loss": 0.153, + "step": 12439 + }, + { + "epoch": 1.4751571208348155, + "grad_norm": 1.1395969539525421, + "learning_rate": 3.640106201274044e-05, + "loss": 0.2302, + "step": 12440 + }, + { + "epoch": 1.4752757025969405, + "grad_norm": 1.112543196174642, + "learning_rate": 3.639892573957761e-05, + "loss": 0.2363, + "step": 12441 + }, + { + "epoch": 1.4753942843590655, + "grad_norm": 0.8945795457940593, + "learning_rate": 3.639678936133066e-05, + "loss": 0.1716, + "step": 12442 + }, + { + "epoch": 1.4755128661211905, + "grad_norm": 0.9250658709926433, + "learning_rate": 3.6394652878019284e-05, + "loss": 0.1678, + "step": 12443 + }, + { + "epoch": 1.4756314478833157, + "grad_norm": 1.279129598508207, + "learning_rate": 3.639251628966318e-05, + "loss": 0.3379, + "step": 12444 + }, + { + "epoch": 1.4757500296454404, + "grad_norm": 1.2440209188224252, + "learning_rate": 3.639037959628206e-05, + "loss": 0.2611, + "step": 12445 + }, + { + "epoch": 1.4758686114075656, + "grad_norm": 0.9385006146202874, + "learning_rate": 3.63882427978956e-05, + "loss": 0.1992, + "step": 12446 + }, + { + "epoch": 1.4759871931696904, + "grad_norm": 1.2267240848160825, + "learning_rate": 3.6386105894523504e-05, + "loss": 0.255, + "step": 12447 + }, + { + "epoch": 1.4761057749318156, + "grad_norm": 1.3324714224050949, + "learning_rate": 3.638396888618547e-05, + "loss": 0.3004, + "step": 12448 + }, + { + "epoch": 1.4762243566939404, + "grad_norm": 0.977868428806666, + "learning_rate": 3.6381831772901216e-05, + "loss": 0.2219, + "step": 12449 + }, + { + "epoch": 1.4763429384560656, + "grad_norm": 1.1041643860449097, + "learning_rate": 3.637969455469042e-05, + "loss": 0.2066, + "step": 12450 + }, + { + "epoch": 1.4764615202181903, + "grad_norm": 0.8463756277359401, + "learning_rate": 3.63775572315728e-05, + "loss": 0.169, + "step": 12451 + }, + { + "epoch": 1.4765801019803155, + "grad_norm": 0.9605706552964786, + "learning_rate": 3.637541980356805e-05, + "loss": 0.1966, + "step": 12452 + }, + { + "epoch": 1.4766986837424403, + "grad_norm": 1.397046646140779, + "learning_rate": 3.637328227069588e-05, + "loss": 0.2858, + "step": 12453 + }, + { + "epoch": 1.4768172655045655, + "grad_norm": 0.9515588338146465, + "learning_rate": 3.6371144632975994e-05, + "loss": 0.1954, + "step": 12454 + }, + { + "epoch": 1.4769358472666902, + "grad_norm": 1.3435691681283766, + "learning_rate": 3.63690068904281e-05, + "loss": 0.334, + "step": 12455 + }, + { + "epoch": 1.4770544290288155, + "grad_norm": 0.8560281340626763, + "learning_rate": 3.6366869043071904e-05, + "loss": 0.1895, + "step": 12456 + }, + { + "epoch": 1.4771730107909404, + "grad_norm": 0.9109579343806663, + "learning_rate": 3.6364731090927116e-05, + "loss": 0.1811, + "step": 12457 + }, + { + "epoch": 1.4772915925530654, + "grad_norm": 1.017585492131578, + "learning_rate": 3.6362593034013446e-05, + "loss": 0.2214, + "step": 12458 + }, + { + "epoch": 1.4774101743151904, + "grad_norm": 1.5212850716694017, + "learning_rate": 3.63604548723506e-05, + "loss": 0.3145, + "step": 12459 + }, + { + "epoch": 1.4775287560773154, + "grad_norm": 0.9918285750594532, + "learning_rate": 3.635831660595829e-05, + "loss": 0.2013, + "step": 12460 + }, + { + "epoch": 1.4776473378394404, + "grad_norm": 0.7132416155399051, + "learning_rate": 3.6356178234856225e-05, + "loss": 0.1882, + "step": 12461 + }, + { + "epoch": 1.4777659196015653, + "grad_norm": 0.8422407419923301, + "learning_rate": 3.635403975906413e-05, + "loss": 0.1648, + "step": 12462 + }, + { + "epoch": 1.4778845013636903, + "grad_norm": 0.9411505505316864, + "learning_rate": 3.6351901178601704e-05, + "loss": 0.2378, + "step": 12463 + }, + { + "epoch": 1.4780030831258153, + "grad_norm": 0.995927740783831, + "learning_rate": 3.634976249348867e-05, + "loss": 0.2302, + "step": 12464 + }, + { + "epoch": 1.4781216648879403, + "grad_norm": 1.021097245088831, + "learning_rate": 3.634762370374475e-05, + "loss": 0.2251, + "step": 12465 + }, + { + "epoch": 1.4782402466500653, + "grad_norm": 0.7171493944627431, + "learning_rate": 3.6345484809389654e-05, + "loss": 0.1612, + "step": 12466 + }, + { + "epoch": 1.4783588284121902, + "grad_norm": 1.2356665077426845, + "learning_rate": 3.6343345810443094e-05, + "loss": 0.2796, + "step": 12467 + }, + { + "epoch": 1.4784774101743152, + "grad_norm": 1.0981080196042237, + "learning_rate": 3.63412067069248e-05, + "loss": 0.2413, + "step": 12468 + }, + { + "epoch": 1.4785959919364402, + "grad_norm": 0.8413112290919859, + "learning_rate": 3.633906749885449e-05, + "loss": 0.177, + "step": 12469 + }, + { + "epoch": 1.4787145736985652, + "grad_norm": 1.4706263723380035, + "learning_rate": 3.6336928186251884e-05, + "loss": 0.2288, + "step": 12470 + }, + { + "epoch": 1.4788331554606902, + "grad_norm": 0.9957279370783017, + "learning_rate": 3.6334788769136704e-05, + "loss": 0.2114, + "step": 12471 + }, + { + "epoch": 1.4789517372228151, + "grad_norm": 1.0455768430363104, + "learning_rate": 3.633264924752866e-05, + "loss": 0.1975, + "step": 12472 + }, + { + "epoch": 1.4790703189849401, + "grad_norm": 0.9269204100881278, + "learning_rate": 3.633050962144749e-05, + "loss": 0.2093, + "step": 12473 + }, + { + "epoch": 1.479188900747065, + "grad_norm": 1.0141735195692683, + "learning_rate": 3.6328369890912924e-05, + "loss": 0.205, + "step": 12474 + }, + { + "epoch": 1.47930748250919, + "grad_norm": 1.028886460352578, + "learning_rate": 3.6326230055944676e-05, + "loss": 0.2141, + "step": 12475 + }, + { + "epoch": 1.479426064271315, + "grad_norm": 0.7967084948134183, + "learning_rate": 3.632409011656247e-05, + "loss": 0.1552, + "step": 12476 + }, + { + "epoch": 1.47954464603344, + "grad_norm": 0.9221619168543299, + "learning_rate": 3.632195007278605e-05, + "loss": 0.1822, + "step": 12477 + }, + { + "epoch": 1.479663227795565, + "grad_norm": 0.9637858330708172, + "learning_rate": 3.6319809924635126e-05, + "loss": 0.2571, + "step": 12478 + }, + { + "epoch": 1.47978180955769, + "grad_norm": 0.942893702002522, + "learning_rate": 3.631766967212944e-05, + "loss": 0.1836, + "step": 12479 + }, + { + "epoch": 1.479900391319815, + "grad_norm": 1.0911865620532917, + "learning_rate": 3.631552931528872e-05, + "loss": 0.2303, + "step": 12480 + }, + { + "epoch": 1.48001897308194, + "grad_norm": 1.1003223012559828, + "learning_rate": 3.6313388854132694e-05, + "loss": 0.2011, + "step": 12481 + }, + { + "epoch": 1.480137554844065, + "grad_norm": 1.0655279993955205, + "learning_rate": 3.631124828868109e-05, + "loss": 0.2657, + "step": 12482 + }, + { + "epoch": 1.48025613660619, + "grad_norm": 0.9992036082307079, + "learning_rate": 3.630910761895365e-05, + "loss": 0.2185, + "step": 12483 + }, + { + "epoch": 1.480374718368315, + "grad_norm": 0.8354344128756752, + "learning_rate": 3.630696684497011e-05, + "loss": 0.2127, + "step": 12484 + }, + { + "epoch": 1.48049330013044, + "grad_norm": 0.9669393571337663, + "learning_rate": 3.630482596675019e-05, + "loss": 0.1637, + "step": 12485 + }, + { + "epoch": 1.4806118818925649, + "grad_norm": 1.2526734700192843, + "learning_rate": 3.6302684984313654e-05, + "loss": 0.2422, + "step": 12486 + }, + { + "epoch": 1.4807304636546899, + "grad_norm": 0.9375729049406398, + "learning_rate": 3.630054389768022e-05, + "loss": 0.1831, + "step": 12487 + }, + { + "epoch": 1.4808490454168148, + "grad_norm": 1.1966893452813174, + "learning_rate": 3.6298402706869625e-05, + "loss": 0.1978, + "step": 12488 + }, + { + "epoch": 1.4809676271789398, + "grad_norm": 0.980544197028932, + "learning_rate": 3.6296261411901606e-05, + "loss": 0.1953, + "step": 12489 + }, + { + "epoch": 1.4810862089410648, + "grad_norm": 1.0221922500099783, + "learning_rate": 3.629412001279592e-05, + "loss": 0.2557, + "step": 12490 + }, + { + "epoch": 1.4812047907031898, + "grad_norm": 1.433298756513002, + "learning_rate": 3.629197850957229e-05, + "loss": 0.2642, + "step": 12491 + }, + { + "epoch": 1.4813233724653148, + "grad_norm": 0.8804183901197368, + "learning_rate": 3.628983690225046e-05, + "loss": 0.2342, + "step": 12492 + }, + { + "epoch": 1.4814419542274397, + "grad_norm": 1.1975600411541043, + "learning_rate": 3.6287695190850185e-05, + "loss": 0.1942, + "step": 12493 + }, + { + "epoch": 1.4815605359895647, + "grad_norm": 1.2119105706971645, + "learning_rate": 3.628555337539121e-05, + "loss": 0.2821, + "step": 12494 + }, + { + "epoch": 1.48167911775169, + "grad_norm": 1.1239705429258462, + "learning_rate": 3.628341145589326e-05, + "loss": 0.2352, + "step": 12495 + }, + { + "epoch": 1.4817976995138147, + "grad_norm": 1.3032473701700407, + "learning_rate": 3.6281269432376096e-05, + "loss": 0.1965, + "step": 12496 + }, + { + "epoch": 1.4819162812759399, + "grad_norm": 1.0261534787718256, + "learning_rate": 3.627912730485947e-05, + "loss": 0.1729, + "step": 12497 + }, + { + "epoch": 1.4820348630380646, + "grad_norm": 1.3036937870174787, + "learning_rate": 3.627698507336311e-05, + "loss": 0.2384, + "step": 12498 + }, + { + "epoch": 1.4821534448001898, + "grad_norm": 1.0885567084075596, + "learning_rate": 3.627484273790678e-05, + "loss": 0.2466, + "step": 12499 + }, + { + "epoch": 1.4822720265623146, + "grad_norm": 0.823122156000076, + "learning_rate": 3.627270029851023e-05, + "loss": 0.1875, + "step": 12500 + }, + { + "epoch": 1.4823906083244398, + "grad_norm": 1.1253713901331053, + "learning_rate": 3.627055775519321e-05, + "loss": 0.2838, + "step": 12501 + }, + { + "epoch": 1.4825091900865646, + "grad_norm": 1.1595265254799287, + "learning_rate": 3.626841510797546e-05, + "loss": 0.2422, + "step": 12502 + }, + { + "epoch": 1.4826277718486898, + "grad_norm": 1.0548102211037107, + "learning_rate": 3.626627235687674e-05, + "loss": 0.2168, + "step": 12503 + }, + { + "epoch": 1.4827463536108145, + "grad_norm": 0.959095612120881, + "learning_rate": 3.62641295019168e-05, + "loss": 0.2089, + "step": 12504 + }, + { + "epoch": 1.4828649353729397, + "grad_norm": 1.2381655666565883, + "learning_rate": 3.626198654311542e-05, + "loss": 0.231, + "step": 12505 + }, + { + "epoch": 1.4829835171350645, + "grad_norm": 1.1733002141339182, + "learning_rate": 3.625984348049232e-05, + "loss": 0.2397, + "step": 12506 + }, + { + "epoch": 1.4831020988971897, + "grad_norm": 0.915637655659585, + "learning_rate": 3.625770031406727e-05, + "loss": 0.1861, + "step": 12507 + }, + { + "epoch": 1.4832206806593147, + "grad_norm": 0.8388298136180768, + "learning_rate": 3.625555704386003e-05, + "loss": 0.207, + "step": 12508 + }, + { + "epoch": 1.4833392624214397, + "grad_norm": 0.8942597666724773, + "learning_rate": 3.625341366989036e-05, + "loss": 0.1869, + "step": 12509 + }, + { + "epoch": 1.4834578441835646, + "grad_norm": 1.0968714669364508, + "learning_rate": 3.625127019217801e-05, + "loss": 0.2551, + "step": 12510 + }, + { + "epoch": 1.4835764259456896, + "grad_norm": 1.5023246366262994, + "learning_rate": 3.624912661074275e-05, + "loss": 0.3157, + "step": 12511 + }, + { + "epoch": 1.4836950077078146, + "grad_norm": 1.0865127044655227, + "learning_rate": 3.624698292560434e-05, + "loss": 0.1946, + "step": 12512 + }, + { + "epoch": 1.4838135894699396, + "grad_norm": 1.025972202866654, + "learning_rate": 3.6244839136782535e-05, + "loss": 0.1731, + "step": 12513 + }, + { + "epoch": 1.4839321712320646, + "grad_norm": 0.7677167243015839, + "learning_rate": 3.62426952442971e-05, + "loss": 0.1831, + "step": 12514 + }, + { + "epoch": 1.4840507529941895, + "grad_norm": 0.9334833474781452, + "learning_rate": 3.6240551248167805e-05, + "loss": 0.17, + "step": 12515 + }, + { + "epoch": 1.4841693347563145, + "grad_norm": 1.1722849860822604, + "learning_rate": 3.623840714841441e-05, + "loss": 0.2345, + "step": 12516 + }, + { + "epoch": 1.4842879165184395, + "grad_norm": 1.0008074501035849, + "learning_rate": 3.623626294505668e-05, + "loss": 0.2064, + "step": 12517 + }, + { + "epoch": 1.4844064982805645, + "grad_norm": 1.3115329619532414, + "learning_rate": 3.6234118638114394e-05, + "loss": 0.2785, + "step": 12518 + }, + { + "epoch": 1.4845250800426895, + "grad_norm": 1.0288710804222867, + "learning_rate": 3.6231974227607304e-05, + "loss": 0.2433, + "step": 12519 + }, + { + "epoch": 1.4846436618048144, + "grad_norm": 0.8558148079478822, + "learning_rate": 3.622982971355519e-05, + "loss": 0.1648, + "step": 12520 + }, + { + "epoch": 1.4847622435669394, + "grad_norm": 0.9391035187728556, + "learning_rate": 3.622768509597781e-05, + "loss": 0.1606, + "step": 12521 + }, + { + "epoch": 1.4848808253290644, + "grad_norm": 0.8539685425556743, + "learning_rate": 3.622554037489494e-05, + "loss": 0.1644, + "step": 12522 + }, + { + "epoch": 1.4849994070911894, + "grad_norm": 0.957305554091712, + "learning_rate": 3.6223395550326355e-05, + "loss": 0.1888, + "step": 12523 + }, + { + "epoch": 1.4851179888533144, + "grad_norm": 1.1046943287936888, + "learning_rate": 3.622125062229184e-05, + "loss": 0.2145, + "step": 12524 + }, + { + "epoch": 1.4852365706154393, + "grad_norm": 0.900683564676804, + "learning_rate": 3.621910559081114e-05, + "loss": 0.2458, + "step": 12525 + }, + { + "epoch": 1.4853551523775643, + "grad_norm": 1.0743443058517064, + "learning_rate": 3.6216960455904045e-05, + "loss": 0.2434, + "step": 12526 + }, + { + "epoch": 1.4854737341396893, + "grad_norm": 1.479247341840166, + "learning_rate": 3.621481521759033e-05, + "loss": 0.2649, + "step": 12527 + }, + { + "epoch": 1.4855923159018143, + "grad_norm": 1.1742132822795739, + "learning_rate": 3.6212669875889776e-05, + "loss": 0.2142, + "step": 12528 + }, + { + "epoch": 1.4857108976639393, + "grad_norm": 1.314623231535018, + "learning_rate": 3.6210524430822154e-05, + "loss": 0.2518, + "step": 12529 + }, + { + "epoch": 1.4858294794260642, + "grad_norm": 1.2055070910916248, + "learning_rate": 3.620837888240724e-05, + "loss": 0.2099, + "step": 12530 + }, + { + "epoch": 1.4859480611881892, + "grad_norm": 1.1833824442862528, + "learning_rate": 3.620623323066483e-05, + "loss": 0.2153, + "step": 12531 + }, + { + "epoch": 1.4860666429503142, + "grad_norm": 1.44896967879138, + "learning_rate": 3.620408747561468e-05, + "loss": 0.2391, + "step": 12532 + }, + { + "epoch": 1.4861852247124392, + "grad_norm": 1.0616965167694405, + "learning_rate": 3.620194161727658e-05, + "loss": 0.2219, + "step": 12533 + }, + { + "epoch": 1.4863038064745642, + "grad_norm": 0.7734856586638722, + "learning_rate": 3.619979565567032e-05, + "loss": 0.1686, + "step": 12534 + }, + { + "epoch": 1.4864223882366892, + "grad_norm": 1.022202046729624, + "learning_rate": 3.619764959081568e-05, + "loss": 0.2188, + "step": 12535 + }, + { + "epoch": 1.4865409699988141, + "grad_norm": 0.9983228780253296, + "learning_rate": 3.6195503422732444e-05, + "loss": 0.1972, + "step": 12536 + }, + { + "epoch": 1.4866595517609391, + "grad_norm": 0.8178831607657975, + "learning_rate": 3.619335715144039e-05, + "loss": 0.1114, + "step": 12537 + }, + { + "epoch": 1.486778133523064, + "grad_norm": 1.1568636960256862, + "learning_rate": 3.619121077695931e-05, + "loss": 0.2328, + "step": 12538 + }, + { + "epoch": 1.486896715285189, + "grad_norm": 0.8298704715985096, + "learning_rate": 3.618906429930899e-05, + "loss": 0.201, + "step": 12539 + }, + { + "epoch": 1.487015297047314, + "grad_norm": 1.4860546768890166, + "learning_rate": 3.618691771850922e-05, + "loss": 0.261, + "step": 12540 + }, + { + "epoch": 1.487133878809439, + "grad_norm": 1.813371708498617, + "learning_rate": 3.618477103457978e-05, + "loss": 0.272, + "step": 12541 + }, + { + "epoch": 1.487252460571564, + "grad_norm": 0.8891833819687782, + "learning_rate": 3.618262424754048e-05, + "loss": 0.17, + "step": 12542 + }, + { + "epoch": 1.487371042333689, + "grad_norm": 0.813837061796493, + "learning_rate": 3.618047735741109e-05, + "loss": 0.1798, + "step": 12543 + }, + { + "epoch": 1.4874896240958142, + "grad_norm": 1.2676088084814578, + "learning_rate": 3.617833036421141e-05, + "loss": 0.2554, + "step": 12544 + }, + { + "epoch": 1.487608205857939, + "grad_norm": 0.976814345912257, + "learning_rate": 3.6176183267961225e-05, + "loss": 0.1982, + "step": 12545 + }, + { + "epoch": 1.4877267876200642, + "grad_norm": 0.8997407735825887, + "learning_rate": 3.617403606868035e-05, + "loss": 0.2005, + "step": 12546 + }, + { + "epoch": 1.487845369382189, + "grad_norm": 0.8424836208026085, + "learning_rate": 3.617188876638855e-05, + "loss": 0.1575, + "step": 12547 + }, + { + "epoch": 1.4879639511443141, + "grad_norm": 0.8718438372426466, + "learning_rate": 3.616974136110565e-05, + "loss": 0.2046, + "step": 12548 + }, + { + "epoch": 1.4880825329064389, + "grad_norm": 0.8797043689288352, + "learning_rate": 3.616759385285142e-05, + "loss": 0.1996, + "step": 12549 + }, + { + "epoch": 1.488201114668564, + "grad_norm": 0.7190997805148828, + "learning_rate": 3.6165446241645676e-05, + "loss": 0.1538, + "step": 12550 + }, + { + "epoch": 1.4883196964306888, + "grad_norm": 0.7927637806908981, + "learning_rate": 3.616329852750821e-05, + "loss": 0.1477, + "step": 12551 + }, + { + "epoch": 1.488438278192814, + "grad_norm": 1.0218591937495884, + "learning_rate": 3.616115071045881e-05, + "loss": 0.2455, + "step": 12552 + }, + { + "epoch": 1.4885568599549388, + "grad_norm": 1.2593884760339633, + "learning_rate": 3.615900279051729e-05, + "loss": 0.2261, + "step": 12553 + }, + { + "epoch": 1.488675441717064, + "grad_norm": 1.1949988764165909, + "learning_rate": 3.615685476770346e-05, + "loss": 0.2802, + "step": 12554 + }, + { + "epoch": 1.4887940234791888, + "grad_norm": 1.4096176486570557, + "learning_rate": 3.615470664203711e-05, + "loss": 0.2924, + "step": 12555 + }, + { + "epoch": 1.488912605241314, + "grad_norm": 1.105267933848277, + "learning_rate": 3.615255841353803e-05, + "loss": 0.191, + "step": 12556 + }, + { + "epoch": 1.489031187003439, + "grad_norm": 1.4288877741501853, + "learning_rate": 3.615041008222605e-05, + "loss": 0.3227, + "step": 12557 + }, + { + "epoch": 1.489149768765564, + "grad_norm": 0.7541964797264543, + "learning_rate": 3.6148261648120955e-05, + "loss": 0.1365, + "step": 12558 + }, + { + "epoch": 1.489268350527689, + "grad_norm": 1.0009212228984603, + "learning_rate": 3.614611311124256e-05, + "loss": 0.19, + "step": 12559 + }, + { + "epoch": 1.489386932289814, + "grad_norm": 0.8351160284616331, + "learning_rate": 3.614396447161067e-05, + "loss": 0.1643, + "step": 12560 + }, + { + "epoch": 1.4895055140519389, + "grad_norm": 0.7502958828860733, + "learning_rate": 3.61418157292451e-05, + "loss": 0.1543, + "step": 12561 + }, + { + "epoch": 1.4896240958140639, + "grad_norm": 0.9808135657690321, + "learning_rate": 3.613966688416565e-05, + "loss": 0.2274, + "step": 12562 + }, + { + "epoch": 1.4897426775761888, + "grad_norm": 1.2620717968628377, + "learning_rate": 3.613751793639212e-05, + "loss": 0.2016, + "step": 12563 + }, + { + "epoch": 1.4898612593383138, + "grad_norm": 0.8889006716876188, + "learning_rate": 3.6135368885944345e-05, + "loss": 0.218, + "step": 12564 + }, + { + "epoch": 1.4899798411004388, + "grad_norm": 1.0098338107825695, + "learning_rate": 3.6133219732842125e-05, + "loss": 0.22, + "step": 12565 + }, + { + "epoch": 1.4900984228625638, + "grad_norm": 0.9522662261250159, + "learning_rate": 3.613107047710526e-05, + "loss": 0.2256, + "step": 12566 + }, + { + "epoch": 1.4902170046246888, + "grad_norm": 1.122116959354855, + "learning_rate": 3.6128921118753575e-05, + "loss": 0.198, + "step": 12567 + }, + { + "epoch": 1.4903355863868137, + "grad_norm": 0.9721874764370407, + "learning_rate": 3.61267716578069e-05, + "loss": 0.2089, + "step": 12568 + }, + { + "epoch": 1.4904541681489387, + "grad_norm": 0.9593089074932214, + "learning_rate": 3.612462209428502e-05, + "loss": 0.1957, + "step": 12569 + }, + { + "epoch": 1.4905727499110637, + "grad_norm": 0.9658133681044773, + "learning_rate": 3.612247242820778e-05, + "loss": 0.1776, + "step": 12570 + }, + { + "epoch": 1.4906913316731887, + "grad_norm": 0.8738658422598973, + "learning_rate": 3.612032265959497e-05, + "loss": 0.1922, + "step": 12571 + }, + { + "epoch": 1.4908099134353137, + "grad_norm": 1.0680258374720017, + "learning_rate": 3.611817278846643e-05, + "loss": 0.2106, + "step": 12572 + }, + { + "epoch": 1.4909284951974386, + "grad_norm": 1.3315986080535107, + "learning_rate": 3.6116022814841966e-05, + "loss": 0.3238, + "step": 12573 + }, + { + "epoch": 1.4910470769595636, + "grad_norm": 0.719450488562133, + "learning_rate": 3.611387273874141e-05, + "loss": 0.1076, + "step": 12574 + }, + { + "epoch": 1.4911656587216886, + "grad_norm": 1.1534912725600186, + "learning_rate": 3.611172256018457e-05, + "loss": 0.2845, + "step": 12575 + }, + { + "epoch": 1.4912842404838136, + "grad_norm": 1.0151010020065765, + "learning_rate": 3.6109572279191276e-05, + "loss": 0.2374, + "step": 12576 + }, + { + "epoch": 1.4914028222459386, + "grad_norm": 0.8969565513403346, + "learning_rate": 3.6107421895781343e-05, + "loss": 0.1494, + "step": 12577 + }, + { + "epoch": 1.4915214040080635, + "grad_norm": 1.0743650770776538, + "learning_rate": 3.6105271409974614e-05, + "loss": 0.1814, + "step": 12578 + }, + { + "epoch": 1.4916399857701885, + "grad_norm": 0.8821877800268692, + "learning_rate": 3.610312082179089e-05, + "loss": 0.181, + "step": 12579 + }, + { + "epoch": 1.4917585675323135, + "grad_norm": 1.0306604561361838, + "learning_rate": 3.610097013125001e-05, + "loss": 0.2158, + "step": 12580 + }, + { + "epoch": 1.4918771492944385, + "grad_norm": 1.0527085906254638, + "learning_rate": 3.60988193383718e-05, + "loss": 0.2456, + "step": 12581 + }, + { + "epoch": 1.4919957310565635, + "grad_norm": 0.8784492303955894, + "learning_rate": 3.609666844317608e-05, + "loss": 0.1543, + "step": 12582 + }, + { + "epoch": 1.4921143128186884, + "grad_norm": 0.8871025299571554, + "learning_rate": 3.6094517445682694e-05, + "loss": 0.1514, + "step": 12583 + }, + { + "epoch": 1.4922328945808134, + "grad_norm": 0.999770377702513, + "learning_rate": 3.609236634591145e-05, + "loss": 0.2195, + "step": 12584 + }, + { + "epoch": 1.4923514763429384, + "grad_norm": 1.3742151018841378, + "learning_rate": 3.60902151438822e-05, + "loss": 0.2844, + "step": 12585 + }, + { + "epoch": 1.4924700581050634, + "grad_norm": 1.0120497572403335, + "learning_rate": 3.6088063839614765e-05, + "loss": 0.2328, + "step": 12586 + }, + { + "epoch": 1.4925886398671884, + "grad_norm": 0.8015996656086325, + "learning_rate": 3.608591243312898e-05, + "loss": 0.1814, + "step": 12587 + }, + { + "epoch": 1.4927072216293134, + "grad_norm": 0.955404143670177, + "learning_rate": 3.6083760924444676e-05, + "loss": 0.16, + "step": 12588 + }, + { + "epoch": 1.4928258033914383, + "grad_norm": 1.0674300081176327, + "learning_rate": 3.608160931358169e-05, + "loss": 0.2335, + "step": 12589 + }, + { + "epoch": 1.4929443851535633, + "grad_norm": 0.7979566644373979, + "learning_rate": 3.6079457600559846e-05, + "loss": 0.1414, + "step": 12590 + }, + { + "epoch": 1.4930629669156883, + "grad_norm": 0.6621274275247155, + "learning_rate": 3.6077305785399e-05, + "loss": 0.1278, + "step": 12591 + }, + { + "epoch": 1.4931815486778133, + "grad_norm": 1.0792476586353321, + "learning_rate": 3.607515386811897e-05, + "loss": 0.1909, + "step": 12592 + }, + { + "epoch": 1.4933001304399383, + "grad_norm": 0.8761876557045626, + "learning_rate": 3.6073001848739605e-05, + "loss": 0.2194, + "step": 12593 + }, + { + "epoch": 1.4934187122020632, + "grad_norm": 1.0318954337543318, + "learning_rate": 3.6070849727280745e-05, + "loss": 0.1785, + "step": 12594 + }, + { + "epoch": 1.4935372939641884, + "grad_norm": 1.6056878064211921, + "learning_rate": 3.606869750376223e-05, + "loss": 0.2929, + "step": 12595 + }, + { + "epoch": 1.4936558757263132, + "grad_norm": 0.9789292456005341, + "learning_rate": 3.606654517820389e-05, + "loss": 0.1861, + "step": 12596 + }, + { + "epoch": 1.4937744574884384, + "grad_norm": 0.9409560999962505, + "learning_rate": 3.606439275062557e-05, + "loss": 0.1856, + "step": 12597 + }, + { + "epoch": 1.4938930392505632, + "grad_norm": 0.8612320423133016, + "learning_rate": 3.606224022104713e-05, + "loss": 0.1566, + "step": 12598 + }, + { + "epoch": 1.4940116210126884, + "grad_norm": 1.092400620800352, + "learning_rate": 3.606008758948839e-05, + "loss": 0.2441, + "step": 12599 + }, + { + "epoch": 1.4941302027748131, + "grad_norm": 1.0630343313069401, + "learning_rate": 3.605793485596921e-05, + "loss": 0.2005, + "step": 12600 + }, + { + "epoch": 1.4942487845369383, + "grad_norm": 0.9923567375773299, + "learning_rate": 3.6055782020509424e-05, + "loss": 0.1876, + "step": 12601 + }, + { + "epoch": 1.494367366299063, + "grad_norm": 0.7733036391998498, + "learning_rate": 3.6053629083128893e-05, + "loss": 0.1132, + "step": 12602 + }, + { + "epoch": 1.4944859480611883, + "grad_norm": 1.1053666390174948, + "learning_rate": 3.605147604384745e-05, + "loss": 0.2224, + "step": 12603 + }, + { + "epoch": 1.494604529823313, + "grad_norm": 0.9426538131127932, + "learning_rate": 3.6049322902684964e-05, + "loss": 0.1985, + "step": 12604 + }, + { + "epoch": 1.4947231115854382, + "grad_norm": 1.4605190144606082, + "learning_rate": 3.6047169659661254e-05, + "loss": 0.3755, + "step": 12605 + }, + { + "epoch": 1.4948416933475632, + "grad_norm": 1.2768797332159922, + "learning_rate": 3.60450163147962e-05, + "loss": 0.2523, + "step": 12606 + }, + { + "epoch": 1.4949602751096882, + "grad_norm": 1.2022141252166887, + "learning_rate": 3.604286286810963e-05, + "loss": 0.2357, + "step": 12607 + }, + { + "epoch": 1.4950788568718132, + "grad_norm": 1.0429718571499795, + "learning_rate": 3.604070931962141e-05, + "loss": 0.2251, + "step": 12608 + }, + { + "epoch": 1.4951974386339382, + "grad_norm": 1.4083343238106147, + "learning_rate": 3.6038555669351396e-05, + "loss": 0.2112, + "step": 12609 + }, + { + "epoch": 1.4953160203960632, + "grad_norm": 0.9725806044719973, + "learning_rate": 3.603640191731942e-05, + "loss": 0.1968, + "step": 12610 + }, + { + "epoch": 1.4954346021581881, + "grad_norm": 1.0557482526853013, + "learning_rate": 3.603424806354536e-05, + "loss": 0.1873, + "step": 12611 + }, + { + "epoch": 1.4955531839203131, + "grad_norm": 1.1002983761386187, + "learning_rate": 3.603209410804906e-05, + "loss": 0.2589, + "step": 12612 + }, + { + "epoch": 1.495671765682438, + "grad_norm": 0.7362958170037772, + "learning_rate": 3.6029940050850384e-05, + "loss": 0.1652, + "step": 12613 + }, + { + "epoch": 1.495790347444563, + "grad_norm": 0.8504087934669863, + "learning_rate": 3.602778589196919e-05, + "loss": 0.1619, + "step": 12614 + }, + { + "epoch": 1.495908929206688, + "grad_norm": 1.4990842461779252, + "learning_rate": 3.602563163142533e-05, + "loss": 0.2575, + "step": 12615 + }, + { + "epoch": 1.496027510968813, + "grad_norm": 1.0550744083288, + "learning_rate": 3.602347726923867e-05, + "loss": 0.1992, + "step": 12616 + }, + { + "epoch": 1.496146092730938, + "grad_norm": 1.0593378710844725, + "learning_rate": 3.602132280542906e-05, + "loss": 0.2507, + "step": 12617 + }, + { + "epoch": 1.496264674493063, + "grad_norm": 0.895963069638438, + "learning_rate": 3.6019168240016375e-05, + "loss": 0.1629, + "step": 12618 + }, + { + "epoch": 1.496383256255188, + "grad_norm": 0.9690363991296089, + "learning_rate": 3.601701357302047e-05, + "loss": 0.2676, + "step": 12619 + }, + { + "epoch": 1.496501838017313, + "grad_norm": 0.7677250131045302, + "learning_rate": 3.601485880446122e-05, + "loss": 0.1577, + "step": 12620 + }, + { + "epoch": 1.496620419779438, + "grad_norm": 1.3064602271140566, + "learning_rate": 3.601270393435846e-05, + "loss": 0.2986, + "step": 12621 + }, + { + "epoch": 1.496739001541563, + "grad_norm": 1.001882508900169, + "learning_rate": 3.6010548962732086e-05, + "loss": 0.2464, + "step": 12622 + }, + { + "epoch": 1.496857583303688, + "grad_norm": 0.8103418335676881, + "learning_rate": 3.600839388960195e-05, + "loss": 0.1733, + "step": 12623 + }, + { + "epoch": 1.4969761650658129, + "grad_norm": 0.7912497892948075, + "learning_rate": 3.6006238714987915e-05, + "loss": 0.1864, + "step": 12624 + }, + { + "epoch": 1.4970947468279379, + "grad_norm": 1.7423952948251757, + "learning_rate": 3.6004083438909864e-05, + "loss": 0.3414, + "step": 12625 + }, + { + "epoch": 1.4972133285900628, + "grad_norm": 0.6971631711840531, + "learning_rate": 3.600192806138766e-05, + "loss": 0.154, + "step": 12626 + }, + { + "epoch": 1.4973319103521878, + "grad_norm": 0.9460583035350895, + "learning_rate": 3.599977258244116e-05, + "loss": 0.2225, + "step": 12627 + }, + { + "epoch": 1.4974504921143128, + "grad_norm": 1.0266047690379483, + "learning_rate": 3.599761700209026e-05, + "loss": 0.2217, + "step": 12628 + }, + { + "epoch": 1.4975690738764378, + "grad_norm": 0.9579189557211468, + "learning_rate": 3.5995461320354804e-05, + "loss": 0.2062, + "step": 12629 + }, + { + "epoch": 1.4976876556385628, + "grad_norm": 0.7504954577766069, + "learning_rate": 3.5993305537254696e-05, + "loss": 0.1429, + "step": 12630 + }, + { + "epoch": 1.4978062374006877, + "grad_norm": 1.0403688730345135, + "learning_rate": 3.599114965280977e-05, + "loss": 0.2947, + "step": 12631 + }, + { + "epoch": 1.4979248191628127, + "grad_norm": 1.2132742867301396, + "learning_rate": 3.598899366703994e-05, + "loss": 0.2891, + "step": 12632 + }, + { + "epoch": 1.4980434009249377, + "grad_norm": 0.9355481903629069, + "learning_rate": 3.598683757996505e-05, + "loss": 0.1918, + "step": 12633 + }, + { + "epoch": 1.4981619826870627, + "grad_norm": 0.903390695598067, + "learning_rate": 3.5984681391605005e-05, + "loss": 0.168, + "step": 12634 + }, + { + "epoch": 1.4982805644491877, + "grad_norm": 0.759154754908187, + "learning_rate": 3.5982525101979655e-05, + "loss": 0.1447, + "step": 12635 + }, + { + "epoch": 1.4983991462113126, + "grad_norm": 1.0178203191224262, + "learning_rate": 3.59803687111089e-05, + "loss": 0.2163, + "step": 12636 + }, + { + "epoch": 1.4985177279734376, + "grad_norm": 0.9523219152631872, + "learning_rate": 3.59782122190126e-05, + "loss": 0.1788, + "step": 12637 + }, + { + "epoch": 1.4986363097355626, + "grad_norm": 1.5725955763603392, + "learning_rate": 3.597605562571066e-05, + "loss": 0.3321, + "step": 12638 + }, + { + "epoch": 1.4987548914976876, + "grad_norm": 0.9666149871688506, + "learning_rate": 3.597389893122294e-05, + "loss": 0.2157, + "step": 12639 + }, + { + "epoch": 1.4988734732598126, + "grad_norm": 1.0269125463620583, + "learning_rate": 3.597174213556932e-05, + "loss": 0.1952, + "step": 12640 + }, + { + "epoch": 1.4989920550219376, + "grad_norm": 0.8218450559601408, + "learning_rate": 3.5969585238769705e-05, + "loss": 0.1669, + "step": 12641 + }, + { + "epoch": 1.4991106367840625, + "grad_norm": 1.1256686136681557, + "learning_rate": 3.596742824084396e-05, + "loss": 0.2388, + "step": 12642 + }, + { + "epoch": 1.4992292185461875, + "grad_norm": 1.1717839995730541, + "learning_rate": 3.596527114181197e-05, + "loss": 0.2529, + "step": 12643 + }, + { + "epoch": 1.4993478003083127, + "grad_norm": 0.9759498373038274, + "learning_rate": 3.596311394169363e-05, + "loss": 0.1605, + "step": 12644 + }, + { + "epoch": 1.4994663820704375, + "grad_norm": 1.006036111038569, + "learning_rate": 3.5960956640508835e-05, + "loss": 0.1941, + "step": 12645 + }, + { + "epoch": 1.4995849638325627, + "grad_norm": 1.2970927149255758, + "learning_rate": 3.595879923827745e-05, + "loss": 0.2054, + "step": 12646 + }, + { + "epoch": 1.4997035455946874, + "grad_norm": 1.2648163766565057, + "learning_rate": 3.595664173501938e-05, + "loss": 0.2483, + "step": 12647 + }, + { + "epoch": 1.4998221273568126, + "grad_norm": 1.1450626413133513, + "learning_rate": 3.595448413075451e-05, + "loss": 0.2324, + "step": 12648 + }, + { + "epoch": 1.4999407091189374, + "grad_norm": 1.3118284458463267, + "learning_rate": 3.595232642550273e-05, + "loss": 0.3055, + "step": 12649 + }, + { + "epoch": 1.5000592908810626, + "grad_norm": 0.7700136496247144, + "learning_rate": 3.595016861928392e-05, + "loss": 0.1843, + "step": 12650 + }, + { + "epoch": 1.5001778726431874, + "grad_norm": 0.8578225116222609, + "learning_rate": 3.594801071211798e-05, + "loss": 0.2396, + "step": 12651 + }, + { + "epoch": 1.5002964544053126, + "grad_norm": 0.8298136567741505, + "learning_rate": 3.594585270402482e-05, + "loss": 0.169, + "step": 12652 + }, + { + "epoch": 1.5004150361674373, + "grad_norm": 1.374648778316164, + "learning_rate": 3.594369459502432e-05, + "loss": 0.2253, + "step": 12653 + }, + { + "epoch": 1.5005336179295625, + "grad_norm": 1.1007666257424544, + "learning_rate": 3.5941536385136374e-05, + "loss": 0.2605, + "step": 12654 + }, + { + "epoch": 1.5006521996916873, + "grad_norm": 1.0056273571291243, + "learning_rate": 3.5939378074380873e-05, + "loss": 0.2513, + "step": 12655 + }, + { + "epoch": 1.5007707814538125, + "grad_norm": 0.9417307630364411, + "learning_rate": 3.593721966277773e-05, + "loss": 0.1983, + "step": 12656 + }, + { + "epoch": 1.5008893632159372, + "grad_norm": 0.8769478066326779, + "learning_rate": 3.593506115034683e-05, + "loss": 0.189, + "step": 12657 + }, + { + "epoch": 1.5010079449780624, + "grad_norm": 1.101941050978377, + "learning_rate": 3.593290253710808e-05, + "loss": 0.2084, + "step": 12658 + }, + { + "epoch": 1.5011265267401872, + "grad_norm": 0.812628127232092, + "learning_rate": 3.593074382308138e-05, + "loss": 0.2231, + "step": 12659 + }, + { + "epoch": 1.5012451085023124, + "grad_norm": 1.0921489888679725, + "learning_rate": 3.592858500828661e-05, + "loss": 0.2112, + "step": 12660 + }, + { + "epoch": 1.5013636902644372, + "grad_norm": 1.1174056829200203, + "learning_rate": 3.59264260927437e-05, + "loss": 0.2331, + "step": 12661 + }, + { + "epoch": 1.5014822720265624, + "grad_norm": 1.1015846066809256, + "learning_rate": 3.5924267076472534e-05, + "loss": 0.2562, + "step": 12662 + }, + { + "epoch": 1.5016008537886874, + "grad_norm": 1.0987097262847525, + "learning_rate": 3.592210795949302e-05, + "loss": 0.1901, + "step": 12663 + }, + { + "epoch": 1.5017194355508123, + "grad_norm": 1.1797912937646826, + "learning_rate": 3.591994874182507e-05, + "loss": 0.2419, + "step": 12664 + }, + { + "epoch": 1.5018380173129373, + "grad_norm": 1.2028485831376787, + "learning_rate": 3.5917789423488576e-05, + "loss": 0.2116, + "step": 12665 + }, + { + "epoch": 1.5019565990750623, + "grad_norm": 1.1185386069226784, + "learning_rate": 3.5915630004503465e-05, + "loss": 0.3004, + "step": 12666 + }, + { + "epoch": 1.5020751808371873, + "grad_norm": 0.8148685135378724, + "learning_rate": 3.591347048488962e-05, + "loss": 0.2608, + "step": 12667 + }, + { + "epoch": 1.5021937625993123, + "grad_norm": 0.9867738434580418, + "learning_rate": 3.5911310864666964e-05, + "loss": 0.176, + "step": 12668 + }, + { + "epoch": 1.5023123443614372, + "grad_norm": 1.3080999690399038, + "learning_rate": 3.5909151143855405e-05, + "loss": 0.3157, + "step": 12669 + }, + { + "epoch": 1.5024309261235622, + "grad_norm": 1.057046321600237, + "learning_rate": 3.590699132247485e-05, + "loss": 0.2015, + "step": 12670 + }, + { + "epoch": 1.5025495078856872, + "grad_norm": 0.9705565590843959, + "learning_rate": 3.5904831400545205e-05, + "loss": 0.193, + "step": 12671 + }, + { + "epoch": 1.5026680896478122, + "grad_norm": 1.5329197079519206, + "learning_rate": 3.590267137808639e-05, + "loss": 0.3042, + "step": 12672 + }, + { + "epoch": 1.5027866714099372, + "grad_norm": 0.8070296028475691, + "learning_rate": 3.590051125511831e-05, + "loss": 0.1445, + "step": 12673 + }, + { + "epoch": 1.5029052531720621, + "grad_norm": 1.6355556467491148, + "learning_rate": 3.589835103166088e-05, + "loss": 0.314, + "step": 12674 + }, + { + "epoch": 1.5030238349341871, + "grad_norm": 0.9362825478460693, + "learning_rate": 3.589619070773403e-05, + "loss": 0.199, + "step": 12675 + }, + { + "epoch": 1.503142416696312, + "grad_norm": 1.3420605522320408, + "learning_rate": 3.589403028335766e-05, + "loss": 0.2499, + "step": 12676 + }, + { + "epoch": 1.503260998458437, + "grad_norm": 0.8937738065772499, + "learning_rate": 3.5891869758551685e-05, + "loss": 0.1513, + "step": 12677 + }, + { + "epoch": 1.503379580220562, + "grad_norm": 0.9729036628692663, + "learning_rate": 3.588970913333604e-05, + "loss": 0.1645, + "step": 12678 + }, + { + "epoch": 1.503498161982687, + "grad_norm": 0.895990560649564, + "learning_rate": 3.588754840773062e-05, + "loss": 0.1911, + "step": 12679 + }, + { + "epoch": 1.503616743744812, + "grad_norm": 0.8486450881295491, + "learning_rate": 3.588538758175535e-05, + "loss": 0.2025, + "step": 12680 + }, + { + "epoch": 1.503735325506937, + "grad_norm": 1.4038210600664502, + "learning_rate": 3.588322665543016e-05, + "loss": 0.2347, + "step": 12681 + }, + { + "epoch": 1.503853907269062, + "grad_norm": 1.0335139311397177, + "learning_rate": 3.588106562877497e-05, + "loss": 0.2504, + "step": 12682 + }, + { + "epoch": 1.503972489031187, + "grad_norm": 0.9613026132621915, + "learning_rate": 3.587890450180969e-05, + "loss": 0.2092, + "step": 12683 + }, + { + "epoch": 1.504091070793312, + "grad_norm": 1.082650148671724, + "learning_rate": 3.587674327455426e-05, + "loss": 0.2239, + "step": 12684 + }, + { + "epoch": 1.504209652555437, + "grad_norm": 1.482892566998799, + "learning_rate": 3.587458194702859e-05, + "loss": 0.3049, + "step": 12685 + }, + { + "epoch": 1.504328234317562, + "grad_norm": 1.3724690523493046, + "learning_rate": 3.587242051925262e-05, + "loss": 0.2959, + "step": 12686 + }, + { + "epoch": 1.504446816079687, + "grad_norm": 1.2107173204261448, + "learning_rate": 3.587025899124625e-05, + "loss": 0.2557, + "step": 12687 + }, + { + "epoch": 1.5045653978418119, + "grad_norm": 1.0057665229375403, + "learning_rate": 3.586809736302943e-05, + "loss": 0.2032, + "step": 12688 + }, + { + "epoch": 1.504683979603937, + "grad_norm": 0.9730366139698524, + "learning_rate": 3.586593563462207e-05, + "loss": 0.1655, + "step": 12689 + }, + { + "epoch": 1.5048025613660618, + "grad_norm": 0.896697560374444, + "learning_rate": 3.586377380604413e-05, + "loss": 0.2311, + "step": 12690 + }, + { + "epoch": 1.504921143128187, + "grad_norm": 0.908972362967689, + "learning_rate": 3.58616118773155e-05, + "loss": 0.1797, + "step": 12691 + }, + { + "epoch": 1.5050397248903118, + "grad_norm": 1.01150434747925, + "learning_rate": 3.585944984845613e-05, + "loss": 0.2576, + "step": 12692 + }, + { + "epoch": 1.505158306652437, + "grad_norm": 1.0485064445565822, + "learning_rate": 3.5857287719485944e-05, + "loss": 0.2131, + "step": 12693 + }, + { + "epoch": 1.5052768884145618, + "grad_norm": 0.9263667666373326, + "learning_rate": 3.585512549042489e-05, + "loss": 0.1919, + "step": 12694 + }, + { + "epoch": 1.505395470176687, + "grad_norm": 0.9327124370108697, + "learning_rate": 3.5852963161292884e-05, + "loss": 0.2211, + "step": 12695 + }, + { + "epoch": 1.5055140519388117, + "grad_norm": 1.1494352599470572, + "learning_rate": 3.585080073210987e-05, + "loss": 0.1641, + "step": 12696 + }, + { + "epoch": 1.505632633700937, + "grad_norm": 0.9219666045613872, + "learning_rate": 3.584863820289578e-05, + "loss": 0.2221, + "step": 12697 + }, + { + "epoch": 1.5057512154630617, + "grad_norm": 0.6243764845791333, + "learning_rate": 3.584647557367055e-05, + "loss": 0.127, + "step": 12698 + }, + { + "epoch": 1.5058697972251869, + "grad_norm": 1.6516312681187, + "learning_rate": 3.5844312844454116e-05, + "loss": 0.3941, + "step": 12699 + }, + { + "epoch": 1.5059883789873116, + "grad_norm": 0.9651082863440835, + "learning_rate": 3.5842150015266404e-05, + "loss": 0.2077, + "step": 12700 + }, + { + "epoch": 1.5061069607494368, + "grad_norm": 0.9648012573550039, + "learning_rate": 3.583998708612738e-05, + "loss": 0.1866, + "step": 12701 + }, + { + "epoch": 1.5062255425115616, + "grad_norm": 1.0559260349317146, + "learning_rate": 3.5837824057056956e-05, + "loss": 0.1836, + "step": 12702 + }, + { + "epoch": 1.5063441242736868, + "grad_norm": 0.9793180490311588, + "learning_rate": 3.583566092807509e-05, + "loss": 0.1857, + "step": 12703 + }, + { + "epoch": 1.5064627060358116, + "grad_norm": 0.9292965210561899, + "learning_rate": 3.5833497699201724e-05, + "loss": 0.2305, + "step": 12704 + }, + { + "epoch": 1.5065812877979368, + "grad_norm": 1.1836461714445512, + "learning_rate": 3.583133437045679e-05, + "loss": 0.2775, + "step": 12705 + }, + { + "epoch": 1.5066998695600615, + "grad_norm": 1.081110013626963, + "learning_rate": 3.582917094186023e-05, + "loss": 0.2016, + "step": 12706 + }, + { + "epoch": 1.5068184513221867, + "grad_norm": 1.123167462431224, + "learning_rate": 3.5827007413432e-05, + "loss": 0.206, + "step": 12707 + }, + { + "epoch": 1.5069370330843115, + "grad_norm": 0.8189275792920854, + "learning_rate": 3.582484378519203e-05, + "loss": 0.1571, + "step": 12708 + }, + { + "epoch": 1.5070556148464367, + "grad_norm": 1.3267763623701139, + "learning_rate": 3.582268005716029e-05, + "loss": 0.2758, + "step": 12709 + }, + { + "epoch": 1.5071741966085614, + "grad_norm": 0.8957207785228737, + "learning_rate": 3.58205162293567e-05, + "loss": 0.2109, + "step": 12710 + }, + { + "epoch": 1.5072927783706866, + "grad_norm": 0.8071825510501123, + "learning_rate": 3.581835230180122e-05, + "loss": 0.174, + "step": 12711 + }, + { + "epoch": 1.5074113601328116, + "grad_norm": 0.9555791255856387, + "learning_rate": 3.5816188274513795e-05, + "loss": 0.1729, + "step": 12712 + }, + { + "epoch": 1.5075299418949366, + "grad_norm": 1.3383101815550953, + "learning_rate": 3.581402414751438e-05, + "loss": 0.3131, + "step": 12713 + }, + { + "epoch": 1.5076485236570616, + "grad_norm": 1.0330580258777353, + "learning_rate": 3.5811859920822923e-05, + "loss": 0.1927, + "step": 12714 + }, + { + "epoch": 1.5077671054191866, + "grad_norm": 1.12046174710847, + "learning_rate": 3.580969559445938e-05, + "loss": 0.2016, + "step": 12715 + }, + { + "epoch": 1.5078856871813116, + "grad_norm": 1.1177620117351186, + "learning_rate": 3.58075311684437e-05, + "loss": 0.2542, + "step": 12716 + }, + { + "epoch": 1.5080042689434365, + "grad_norm": 1.2863836303970093, + "learning_rate": 3.5805366642795836e-05, + "loss": 0.3188, + "step": 12717 + }, + { + "epoch": 1.5081228507055615, + "grad_norm": 0.8484385628616193, + "learning_rate": 3.580320201753574e-05, + "loss": 0.2534, + "step": 12718 + }, + { + "epoch": 1.5082414324676865, + "grad_norm": 1.1733804698709354, + "learning_rate": 3.5801037292683364e-05, + "loss": 0.224, + "step": 12719 + }, + { + "epoch": 1.5083600142298115, + "grad_norm": 1.3316265408263062, + "learning_rate": 3.579887246825868e-05, + "loss": 0.2651, + "step": 12720 + }, + { + "epoch": 1.5084785959919365, + "grad_norm": 0.8221191835053854, + "learning_rate": 3.579670754428163e-05, + "loss": 0.1677, + "step": 12721 + }, + { + "epoch": 1.5085971777540614, + "grad_norm": 1.3228755778591417, + "learning_rate": 3.5794542520772166e-05, + "loss": 0.2092, + "step": 12722 + }, + { + "epoch": 1.5087157595161864, + "grad_norm": 0.8407258942931625, + "learning_rate": 3.5792377397750263e-05, + "loss": 0.1633, + "step": 12723 + }, + { + "epoch": 1.5088343412783114, + "grad_norm": 1.4339249855501721, + "learning_rate": 3.5790212175235885e-05, + "loss": 0.2937, + "step": 12724 + }, + { + "epoch": 1.5089529230404364, + "grad_norm": 0.6985406406124984, + "learning_rate": 3.5788046853248966e-05, + "loss": 0.134, + "step": 12725 + }, + { + "epoch": 1.5090715048025614, + "grad_norm": 0.9221124110335374, + "learning_rate": 3.5785881431809496e-05, + "loss": 0.1712, + "step": 12726 + }, + { + "epoch": 1.5091900865646863, + "grad_norm": 0.9720867769212468, + "learning_rate": 3.5783715910937425e-05, + "loss": 0.2186, + "step": 12727 + }, + { + "epoch": 1.5093086683268113, + "grad_norm": 1.14200566334914, + "learning_rate": 3.5781550290652715e-05, + "loss": 0.1921, + "step": 12728 + }, + { + "epoch": 1.5094272500889363, + "grad_norm": 1.0326654965705222, + "learning_rate": 3.5779384570975336e-05, + "loss": 0.2012, + "step": 12729 + }, + { + "epoch": 1.5095458318510613, + "grad_norm": 1.585427265995397, + "learning_rate": 3.5777218751925245e-05, + "loss": 0.3802, + "step": 12730 + }, + { + "epoch": 1.5096644136131863, + "grad_norm": 1.6031332938147655, + "learning_rate": 3.577505283352241e-05, + "loss": 0.2549, + "step": 12731 + }, + { + "epoch": 1.5097829953753112, + "grad_norm": 1.5074725697006583, + "learning_rate": 3.57728868157868e-05, + "loss": 0.3201, + "step": 12732 + }, + { + "epoch": 1.5099015771374362, + "grad_norm": 1.1468015619528944, + "learning_rate": 3.57707206987384e-05, + "loss": 0.2379, + "step": 12733 + }, + { + "epoch": 1.5100201588995612, + "grad_norm": 0.8796726688261193, + "learning_rate": 3.576855448239715e-05, + "loss": 0.1593, + "step": 12734 + }, + { + "epoch": 1.5101387406616862, + "grad_norm": 0.800440393512957, + "learning_rate": 3.576638816678304e-05, + "loss": 0.1915, + "step": 12735 + }, + { + "epoch": 1.5102573224238112, + "grad_norm": 1.0511677043974554, + "learning_rate": 3.576422175191602e-05, + "loss": 0.21, + "step": 12736 + }, + { + "epoch": 1.5103759041859361, + "grad_norm": 1.0084603302683448, + "learning_rate": 3.576205523781609e-05, + "loss": 0.1821, + "step": 12737 + }, + { + "epoch": 1.5104944859480613, + "grad_norm": 1.0619121275524137, + "learning_rate": 3.57598886245032e-05, + "loss": 0.2232, + "step": 12738 + }, + { + "epoch": 1.510613067710186, + "grad_norm": 0.9403269366216286, + "learning_rate": 3.5757721911997334e-05, + "loss": 0.2026, + "step": 12739 + }, + { + "epoch": 1.5107316494723113, + "grad_norm": 1.3718333260681264, + "learning_rate": 3.5755555100318466e-05, + "loss": 0.2399, + "step": 12740 + }, + { + "epoch": 1.510850231234436, + "grad_norm": 0.8205815229321826, + "learning_rate": 3.575338818948657e-05, + "loss": 0.1642, + "step": 12741 + }, + { + "epoch": 1.5109688129965613, + "grad_norm": 1.3744135586803867, + "learning_rate": 3.5751221179521615e-05, + "loss": 0.2349, + "step": 12742 + }, + { + "epoch": 1.511087394758686, + "grad_norm": 1.1018175961645407, + "learning_rate": 3.574905407044359e-05, + "loss": 0.1886, + "step": 12743 + }, + { + "epoch": 1.5112059765208112, + "grad_norm": 0.7697956211017584, + "learning_rate": 3.574688686227247e-05, + "loss": 0.2052, + "step": 12744 + }, + { + "epoch": 1.511324558282936, + "grad_norm": 1.1286263952416127, + "learning_rate": 3.574471955502822e-05, + "loss": 0.2805, + "step": 12745 + }, + { + "epoch": 1.5114431400450612, + "grad_norm": 1.6411763318216321, + "learning_rate": 3.574255214873085e-05, + "loss": 0.3264, + "step": 12746 + }, + { + "epoch": 1.511561721807186, + "grad_norm": 1.3780513804687262, + "learning_rate": 3.57403846434003e-05, + "loss": 0.265, + "step": 12747 + }, + { + "epoch": 1.5116803035693112, + "grad_norm": 1.080941303019903, + "learning_rate": 3.57382170390566e-05, + "loss": 0.2146, + "step": 12748 + }, + { + "epoch": 1.511798885331436, + "grad_norm": 0.9524227195986534, + "learning_rate": 3.573604933571969e-05, + "loss": 0.2263, + "step": 12749 + }, + { + "epoch": 1.5119174670935611, + "grad_norm": 0.8713312090969385, + "learning_rate": 3.573388153340958e-05, + "loss": 0.1846, + "step": 12750 + }, + { + "epoch": 1.5120360488556859, + "grad_norm": 1.3548831406054143, + "learning_rate": 3.5731713632146236e-05, + "loss": 0.2935, + "step": 12751 + }, + { + "epoch": 1.512154630617811, + "grad_norm": 0.9141544501953268, + "learning_rate": 3.572954563194966e-05, + "loss": 0.188, + "step": 12752 + }, + { + "epoch": 1.5122732123799358, + "grad_norm": 1.4039703771087766, + "learning_rate": 3.5727377532839814e-05, + "loss": 0.2881, + "step": 12753 + }, + { + "epoch": 1.512391794142061, + "grad_norm": 0.9734263237149824, + "learning_rate": 3.572520933483672e-05, + "loss": 0.185, + "step": 12754 + }, + { + "epoch": 1.5125103759041858, + "grad_norm": 1.0575640639264685, + "learning_rate": 3.572304103796034e-05, + "loss": 0.1642, + "step": 12755 + }, + { + "epoch": 1.512628957666311, + "grad_norm": 0.9887121539493257, + "learning_rate": 3.5720872642230676e-05, + "loss": 0.1968, + "step": 12756 + }, + { + "epoch": 1.5127475394284358, + "grad_norm": 1.1430292849467827, + "learning_rate": 3.571870414766772e-05, + "loss": 0.2288, + "step": 12757 + }, + { + "epoch": 1.512866121190561, + "grad_norm": 0.7461829510504894, + "learning_rate": 3.571653555429144e-05, + "loss": 0.1473, + "step": 12758 + }, + { + "epoch": 1.5129847029526857, + "grad_norm": 1.0848394272005284, + "learning_rate": 3.571436686212185e-05, + "loss": 0.1939, + "step": 12759 + }, + { + "epoch": 1.513103284714811, + "grad_norm": 1.0639703071039854, + "learning_rate": 3.571219807117894e-05, + "loss": 0.2179, + "step": 12760 + }, + { + "epoch": 1.5132218664769357, + "grad_norm": 1.0195298847724614, + "learning_rate": 3.571002918148269e-05, + "loss": 0.2185, + "step": 12761 + }, + { + "epoch": 1.5133404482390609, + "grad_norm": 0.9877842918873289, + "learning_rate": 3.570786019305311e-05, + "loss": 0.2119, + "step": 12762 + }, + { + "epoch": 1.5134590300011859, + "grad_norm": 1.0039923058993818, + "learning_rate": 3.5705691105910196e-05, + "loss": 0.1891, + "step": 12763 + }, + { + "epoch": 1.5135776117633108, + "grad_norm": 1.2253095781652132, + "learning_rate": 3.570352192007393e-05, + "loss": 0.2881, + "step": 12764 + }, + { + "epoch": 1.5136961935254358, + "grad_norm": 1.0466714916040571, + "learning_rate": 3.570135263556432e-05, + "loss": 0.2299, + "step": 12765 + }, + { + "epoch": 1.5138147752875608, + "grad_norm": 1.0881499181607956, + "learning_rate": 3.569918325240136e-05, + "loss": 0.1716, + "step": 12766 + }, + { + "epoch": 1.5139333570496858, + "grad_norm": 0.9196742096788857, + "learning_rate": 3.569701377060506e-05, + "loss": 0.184, + "step": 12767 + }, + { + "epoch": 1.5140519388118108, + "grad_norm": 0.9819596299384474, + "learning_rate": 3.56948441901954e-05, + "loss": 0.1722, + "step": 12768 + }, + { + "epoch": 1.5141705205739358, + "grad_norm": 0.9712096266777772, + "learning_rate": 3.56926745111924e-05, + "loss": 0.2058, + "step": 12769 + }, + { + "epoch": 1.5142891023360607, + "grad_norm": 1.2521347303277481, + "learning_rate": 3.5690504733616046e-05, + "loss": 0.234, + "step": 12770 + }, + { + "epoch": 1.5144076840981857, + "grad_norm": 0.8785940606144099, + "learning_rate": 3.568833485748635e-05, + "loss": 0.178, + "step": 12771 + }, + { + "epoch": 1.5145262658603107, + "grad_norm": 0.9244786622288234, + "learning_rate": 3.5686164882823314e-05, + "loss": 0.2428, + "step": 12772 + }, + { + "epoch": 1.5146448476224357, + "grad_norm": 0.9541151069020402, + "learning_rate": 3.5683994809646935e-05, + "loss": 0.1546, + "step": 12773 + }, + { + "epoch": 1.5147634293845607, + "grad_norm": 1.3890041878498254, + "learning_rate": 3.5681824637977234e-05, + "loss": 0.2848, + "step": 12774 + }, + { + "epoch": 1.5148820111466856, + "grad_norm": 1.2850590229296657, + "learning_rate": 3.5679654367834206e-05, + "loss": 0.2439, + "step": 12775 + }, + { + "epoch": 1.5150005929088106, + "grad_norm": 0.8357224549051292, + "learning_rate": 3.567748399923787e-05, + "loss": 0.187, + "step": 12776 + }, + { + "epoch": 1.5151191746709356, + "grad_norm": 0.87767497569854, + "learning_rate": 3.567531353220821e-05, + "loss": 0.1989, + "step": 12777 + }, + { + "epoch": 1.5152377564330606, + "grad_norm": 1.3875731205839268, + "learning_rate": 3.567314296676526e-05, + "loss": 0.3499, + "step": 12778 + }, + { + "epoch": 1.5153563381951856, + "grad_norm": 0.9583110660547284, + "learning_rate": 3.5670972302929016e-05, + "loss": 0.1859, + "step": 12779 + }, + { + "epoch": 1.5154749199573105, + "grad_norm": 0.9119177102885186, + "learning_rate": 3.566880154071949e-05, + "loss": 0.1843, + "step": 12780 + }, + { + "epoch": 1.5155935017194355, + "grad_norm": 0.9146909844251435, + "learning_rate": 3.5666630680156707e-05, + "loss": 0.1467, + "step": 12781 + }, + { + "epoch": 1.5157120834815605, + "grad_norm": 1.376319594047053, + "learning_rate": 3.5664459721260655e-05, + "loss": 0.2491, + "step": 12782 + }, + { + "epoch": 1.5158306652436855, + "grad_norm": 1.1523574897344098, + "learning_rate": 3.566228866405138e-05, + "loss": 0.2368, + "step": 12783 + }, + { + "epoch": 1.5159492470058105, + "grad_norm": 0.9313533398486757, + "learning_rate": 3.5660117508548865e-05, + "loss": 0.1732, + "step": 12784 + }, + { + "epoch": 1.5160678287679354, + "grad_norm": 1.1577927985050922, + "learning_rate": 3.565794625477315e-05, + "loss": 0.2541, + "step": 12785 + }, + { + "epoch": 1.5161864105300604, + "grad_norm": 0.87658166295317, + "learning_rate": 3.565577490274423e-05, + "loss": 0.171, + "step": 12786 + }, + { + "epoch": 1.5163049922921856, + "grad_norm": 1.1116663433160037, + "learning_rate": 3.565360345248213e-05, + "loss": 0.2247, + "step": 12787 + }, + { + "epoch": 1.5164235740543104, + "grad_norm": 0.8389218221492477, + "learning_rate": 3.565143190400688e-05, + "loss": 0.1396, + "step": 12788 + }, + { + "epoch": 1.5165421558164356, + "grad_norm": 0.8041738989978519, + "learning_rate": 3.5649260257338484e-05, + "loss": 0.1862, + "step": 12789 + }, + { + "epoch": 1.5166607375785603, + "grad_norm": 0.7829207015899472, + "learning_rate": 3.5647088512496966e-05, + "loss": 0.155, + "step": 12790 + }, + { + "epoch": 1.5167793193406856, + "grad_norm": 1.1683134558381554, + "learning_rate": 3.564491666950235e-05, + "loss": 0.2808, + "step": 12791 + }, + { + "epoch": 1.5168979011028103, + "grad_norm": 1.005125876390253, + "learning_rate": 3.5642744728374653e-05, + "loss": 0.2282, + "step": 12792 + }, + { + "epoch": 1.5170164828649355, + "grad_norm": 1.2333536440769755, + "learning_rate": 3.564057268913391e-05, + "loss": 0.2006, + "step": 12793 + }, + { + "epoch": 1.5171350646270603, + "grad_norm": 1.2571839384608696, + "learning_rate": 3.563840055180013e-05, + "loss": 0.3047, + "step": 12794 + }, + { + "epoch": 1.5172536463891855, + "grad_norm": 0.8756647070688247, + "learning_rate": 3.563622831639334e-05, + "loss": 0.1727, + "step": 12795 + }, + { + "epoch": 1.5173722281513102, + "grad_norm": 1.1368674819383315, + "learning_rate": 3.563405598293357e-05, + "loss": 0.2427, + "step": 12796 + }, + { + "epoch": 1.5174908099134354, + "grad_norm": 1.0941870249127739, + "learning_rate": 3.5631883551440845e-05, + "loss": 0.2474, + "step": 12797 + }, + { + "epoch": 1.5176093916755602, + "grad_norm": 0.767669798087506, + "learning_rate": 3.562971102193519e-05, + "loss": 0.1733, + "step": 12798 + }, + { + "epoch": 1.5177279734376854, + "grad_norm": 1.1476021758264907, + "learning_rate": 3.562753839443664e-05, + "loss": 0.2104, + "step": 12799 + }, + { + "epoch": 1.5178465551998102, + "grad_norm": 1.6056432700605283, + "learning_rate": 3.562536566896522e-05, + "loss": 0.2675, + "step": 12800 + }, + { + "epoch": 1.5179651369619354, + "grad_norm": 1.332576526945381, + "learning_rate": 3.562319284554094e-05, + "loss": 0.3004, + "step": 12801 + }, + { + "epoch": 1.5180837187240601, + "grad_norm": 0.9714670889405904, + "learning_rate": 3.562101992418386e-05, + "loss": 0.1841, + "step": 12802 + }, + { + "epoch": 1.5182023004861853, + "grad_norm": 1.0212972628416346, + "learning_rate": 3.5618846904914e-05, + "loss": 0.2518, + "step": 12803 + }, + { + "epoch": 1.51832088224831, + "grad_norm": 0.9350185102823155, + "learning_rate": 3.5616673787751395e-05, + "loss": 0.2187, + "step": 12804 + }, + { + "epoch": 1.5184394640104353, + "grad_norm": 0.8978425614766266, + "learning_rate": 3.561450057271608e-05, + "loss": 0.1957, + "step": 12805 + }, + { + "epoch": 1.51855804577256, + "grad_norm": 1.1459662295568913, + "learning_rate": 3.561232725982808e-05, + "loss": 0.2757, + "step": 12806 + }, + { + "epoch": 1.5186766275346852, + "grad_norm": 0.8769609680392908, + "learning_rate": 3.5610153849107444e-05, + "loss": 0.2056, + "step": 12807 + }, + { + "epoch": 1.51879520929681, + "grad_norm": 1.1471329685256457, + "learning_rate": 3.5607980340574195e-05, + "loss": 0.2966, + "step": 12808 + }, + { + "epoch": 1.5189137910589352, + "grad_norm": 0.9833116586805987, + "learning_rate": 3.560580673424837e-05, + "loss": 0.1951, + "step": 12809 + }, + { + "epoch": 1.51903237282106, + "grad_norm": 0.9840205475972659, + "learning_rate": 3.560363303015002e-05, + "loss": 0.2172, + "step": 12810 + }, + { + "epoch": 1.5191509545831852, + "grad_norm": 0.8550160569047015, + "learning_rate": 3.560145922829917e-05, + "loss": 0.1858, + "step": 12811 + }, + { + "epoch": 1.5192695363453101, + "grad_norm": 1.3881691864993104, + "learning_rate": 3.559928532871587e-05, + "loss": 0.2803, + "step": 12812 + }, + { + "epoch": 1.5193881181074351, + "grad_norm": 1.3503220394436788, + "learning_rate": 3.559711133142016e-05, + "loss": 0.3773, + "step": 12813 + }, + { + "epoch": 1.51950669986956, + "grad_norm": 1.0530331822900083, + "learning_rate": 3.559493723643208e-05, + "loss": 0.2241, + "step": 12814 + }, + { + "epoch": 1.519625281631685, + "grad_norm": 1.1546545050975312, + "learning_rate": 3.559276304377167e-05, + "loss": 0.2227, + "step": 12815 + }, + { + "epoch": 1.51974386339381, + "grad_norm": 0.9668690494127918, + "learning_rate": 3.559058875345897e-05, + "loss": 0.2222, + "step": 12816 + }, + { + "epoch": 1.519862445155935, + "grad_norm": 0.8049246541591667, + "learning_rate": 3.5588414365514034e-05, + "loss": 0.219, + "step": 12817 + }, + { + "epoch": 1.51998102691806, + "grad_norm": 0.9535109359849181, + "learning_rate": 3.5586239879956896e-05, + "loss": 0.1934, + "step": 12818 + }, + { + "epoch": 1.520099608680185, + "grad_norm": 1.0605257292318742, + "learning_rate": 3.558406529680761e-05, + "loss": 0.2595, + "step": 12819 + }, + { + "epoch": 1.52021819044231, + "grad_norm": 1.0581513569365546, + "learning_rate": 3.558189061608622e-05, + "loss": 0.2845, + "step": 12820 + }, + { + "epoch": 1.520336772204435, + "grad_norm": 1.2354677684272697, + "learning_rate": 3.557971583781278e-05, + "loss": 0.2212, + "step": 12821 + }, + { + "epoch": 1.52045535396656, + "grad_norm": 1.1671579199233224, + "learning_rate": 3.557754096200733e-05, + "loss": 0.2539, + "step": 12822 + }, + { + "epoch": 1.520573935728685, + "grad_norm": 1.0417040783792737, + "learning_rate": 3.557536598868993e-05, + "loss": 0.1942, + "step": 12823 + }, + { + "epoch": 1.52069251749081, + "grad_norm": 1.4462090076118372, + "learning_rate": 3.5573190917880605e-05, + "loss": 0.2414, + "step": 12824 + }, + { + "epoch": 1.520811099252935, + "grad_norm": 1.2471241349503797, + "learning_rate": 3.557101574959944e-05, + "loss": 0.2038, + "step": 12825 + }, + { + "epoch": 1.5209296810150599, + "grad_norm": 1.0065580335726687, + "learning_rate": 3.5568840483866473e-05, + "loss": 0.1897, + "step": 12826 + }, + { + "epoch": 1.5210482627771849, + "grad_norm": 0.7772645430964024, + "learning_rate": 3.556666512070175e-05, + "loss": 0.1679, + "step": 12827 + }, + { + "epoch": 1.5211668445393098, + "grad_norm": 1.1299965183766798, + "learning_rate": 3.556448966012535e-05, + "loss": 0.2439, + "step": 12828 + }, + { + "epoch": 1.5212854263014348, + "grad_norm": 1.0003181616712027, + "learning_rate": 3.556231410215729e-05, + "loss": 0.1784, + "step": 12829 + }, + { + "epoch": 1.5214040080635598, + "grad_norm": 1.1718410073226182, + "learning_rate": 3.5560138446817655e-05, + "loss": 0.2474, + "step": 12830 + }, + { + "epoch": 1.5215225898256848, + "grad_norm": 0.8634869931508408, + "learning_rate": 3.555796269412649e-05, + "loss": 0.1887, + "step": 12831 + }, + { + "epoch": 1.5216411715878098, + "grad_norm": 0.8455399190518014, + "learning_rate": 3.5555786844103864e-05, + "loss": 0.1929, + "step": 12832 + }, + { + "epoch": 1.5217597533499347, + "grad_norm": 1.0827286405372403, + "learning_rate": 3.555361089676982e-05, + "loss": 0.2919, + "step": 12833 + }, + { + "epoch": 1.5218783351120597, + "grad_norm": 0.9629380632316193, + "learning_rate": 3.5551434852144425e-05, + "loss": 0.2184, + "step": 12834 + }, + { + "epoch": 1.5219969168741847, + "grad_norm": 1.0034264888382383, + "learning_rate": 3.554925871024774e-05, + "loss": 0.2343, + "step": 12835 + }, + { + "epoch": 1.5221154986363097, + "grad_norm": 0.9438581902226297, + "learning_rate": 3.554708247109984e-05, + "loss": 0.1872, + "step": 12836 + }, + { + "epoch": 1.5222340803984347, + "grad_norm": 1.5123536004102422, + "learning_rate": 3.554490613472075e-05, + "loss": 0.3089, + "step": 12837 + }, + { + "epoch": 1.5223526621605599, + "grad_norm": 1.239584567512024, + "learning_rate": 3.5542729701130574e-05, + "loss": 0.2444, + "step": 12838 + }, + { + "epoch": 1.5224712439226846, + "grad_norm": 0.8654599120372531, + "learning_rate": 3.554055317034935e-05, + "loss": 0.1491, + "step": 12839 + }, + { + "epoch": 1.5225898256848098, + "grad_norm": 1.020026004644902, + "learning_rate": 3.553837654239715e-05, + "loss": 0.2134, + "step": 12840 + }, + { + "epoch": 1.5227084074469346, + "grad_norm": 1.2606602599731052, + "learning_rate": 3.553619981729404e-05, + "loss": 0.256, + "step": 12841 + }, + { + "epoch": 1.5228269892090598, + "grad_norm": 1.0907434341978008, + "learning_rate": 3.5534022995060096e-05, + "loss": 0.1792, + "step": 12842 + }, + { + "epoch": 1.5229455709711845, + "grad_norm": 1.3443085921678128, + "learning_rate": 3.553184607571538e-05, + "loss": 0.3125, + "step": 12843 + }, + { + "epoch": 1.5230641527333098, + "grad_norm": 0.9554544165954907, + "learning_rate": 3.552966905927995e-05, + "loss": 0.211, + "step": 12844 + }, + { + "epoch": 1.5231827344954345, + "grad_norm": 1.3357655178312413, + "learning_rate": 3.5527491945773886e-05, + "loss": 0.3065, + "step": 12845 + }, + { + "epoch": 1.5233013162575597, + "grad_norm": 0.9979791480828737, + "learning_rate": 3.5525314735217255e-05, + "loss": 0.1694, + "step": 12846 + }, + { + "epoch": 1.5234198980196845, + "grad_norm": 0.8708678532978099, + "learning_rate": 3.552313742763013e-05, + "loss": 0.2002, + "step": 12847 + }, + { + "epoch": 1.5235384797818097, + "grad_norm": 1.0134185876935402, + "learning_rate": 3.5520960023032586e-05, + "loss": 0.2851, + "step": 12848 + }, + { + "epoch": 1.5236570615439344, + "grad_norm": 1.0077957680855296, + "learning_rate": 3.551878252144469e-05, + "loss": 0.2542, + "step": 12849 + }, + { + "epoch": 1.5237756433060596, + "grad_norm": 1.2626057072433472, + "learning_rate": 3.551660492288651e-05, + "loss": 0.2697, + "step": 12850 + }, + { + "epoch": 1.5238942250681844, + "grad_norm": 0.9001594375719711, + "learning_rate": 3.551442722737814e-05, + "loss": 0.209, + "step": 12851 + }, + { + "epoch": 1.5240128068303096, + "grad_norm": 1.0878311985434648, + "learning_rate": 3.551224943493964e-05, + "loss": 0.1912, + "step": 12852 + }, + { + "epoch": 1.5241313885924344, + "grad_norm": 1.1070184205268812, + "learning_rate": 3.551007154559109e-05, + "loss": 0.2237, + "step": 12853 + }, + { + "epoch": 1.5242499703545596, + "grad_norm": 0.7871784803337842, + "learning_rate": 3.550789355935257e-05, + "loss": 0.1779, + "step": 12854 + }, + { + "epoch": 1.5243685521166843, + "grad_norm": 0.8657013775048242, + "learning_rate": 3.5505715476244155e-05, + "loss": 0.2008, + "step": 12855 + }, + { + "epoch": 1.5244871338788095, + "grad_norm": 1.4275226546374824, + "learning_rate": 3.5503537296285936e-05, + "loss": 0.255, + "step": 12856 + }, + { + "epoch": 1.5246057156409343, + "grad_norm": 0.8925013656513742, + "learning_rate": 3.550135901949797e-05, + "loss": 0.2046, + "step": 12857 + }, + { + "epoch": 1.5247242974030595, + "grad_norm": 1.1836670901313346, + "learning_rate": 3.5499180645900365e-05, + "loss": 0.2601, + "step": 12858 + }, + { + "epoch": 1.5248428791651842, + "grad_norm": 0.8530506506935547, + "learning_rate": 3.5497002175513177e-05, + "loss": 0.1568, + "step": 12859 + }, + { + "epoch": 1.5249614609273094, + "grad_norm": 1.3513767712331632, + "learning_rate": 3.5494823608356505e-05, + "loss": 0.3589, + "step": 12860 + }, + { + "epoch": 1.5250800426894342, + "grad_norm": 0.9661768794815984, + "learning_rate": 3.549264494445042e-05, + "loss": 0.2249, + "step": 12861 + }, + { + "epoch": 1.5251986244515594, + "grad_norm": 0.9202535548331342, + "learning_rate": 3.5490466183815035e-05, + "loss": 0.2286, + "step": 12862 + }, + { + "epoch": 1.5253172062136844, + "grad_norm": 1.0176139725971, + "learning_rate": 3.5488287326470406e-05, + "loss": 0.2181, + "step": 12863 + }, + { + "epoch": 1.5254357879758094, + "grad_norm": 1.623399513861169, + "learning_rate": 3.5486108372436634e-05, + "loss": 0.3192, + "step": 12864 + }, + { + "epoch": 1.5255543697379343, + "grad_norm": 1.1976986769407152, + "learning_rate": 3.5483929321733796e-05, + "loss": 0.2433, + "step": 12865 + }, + { + "epoch": 1.5256729515000593, + "grad_norm": 1.0815221995316286, + "learning_rate": 3.5481750174381986e-05, + "loss": 0.2583, + "step": 12866 + }, + { + "epoch": 1.5257915332621843, + "grad_norm": 0.8038665838138259, + "learning_rate": 3.5479570930401294e-05, + "loss": 0.1452, + "step": 12867 + }, + { + "epoch": 1.5259101150243093, + "grad_norm": 0.6912178897160326, + "learning_rate": 3.547739158981181e-05, + "loss": 0.1438, + "step": 12868 + }, + { + "epoch": 1.5260286967864343, + "grad_norm": 1.2224699931214849, + "learning_rate": 3.547521215263363e-05, + "loss": 0.2463, + "step": 12869 + }, + { + "epoch": 1.5261472785485592, + "grad_norm": 0.9496634324967413, + "learning_rate": 3.5473032618886823e-05, + "loss": 0.2031, + "step": 12870 + }, + { + "epoch": 1.5262658603106842, + "grad_norm": 0.770577797093182, + "learning_rate": 3.5470852988591514e-05, + "loss": 0.1955, + "step": 12871 + }, + { + "epoch": 1.5263844420728092, + "grad_norm": 1.1345003742319786, + "learning_rate": 3.546867326176777e-05, + "loss": 0.1993, + "step": 12872 + }, + { + "epoch": 1.5265030238349342, + "grad_norm": 0.8066786640009451, + "learning_rate": 3.54664934384357e-05, + "loss": 0.1604, + "step": 12873 + }, + { + "epoch": 1.5266216055970592, + "grad_norm": 1.7802855145258445, + "learning_rate": 3.54643135186154e-05, + "loss": 0.2759, + "step": 12874 + }, + { + "epoch": 1.5267401873591842, + "grad_norm": 1.1443853435774451, + "learning_rate": 3.546213350232696e-05, + "loss": 0.302, + "step": 12875 + }, + { + "epoch": 1.5268587691213091, + "grad_norm": 1.0329282463425375, + "learning_rate": 3.5459953389590474e-05, + "loss": 0.2472, + "step": 12876 + }, + { + "epoch": 1.5269773508834341, + "grad_norm": 0.7996133325430156, + "learning_rate": 3.545777318042605e-05, + "loss": 0.1842, + "step": 12877 + }, + { + "epoch": 1.527095932645559, + "grad_norm": 0.8820779180606119, + "learning_rate": 3.5455592874853776e-05, + "loss": 0.1791, + "step": 12878 + }, + { + "epoch": 1.527214514407684, + "grad_norm": 0.7937470157244286, + "learning_rate": 3.5453412472893766e-05, + "loss": 0.1339, + "step": 12879 + }, + { + "epoch": 1.527333096169809, + "grad_norm": 1.0149029778613194, + "learning_rate": 3.54512319745661e-05, + "loss": 0.2346, + "step": 12880 + }, + { + "epoch": 1.527451677931934, + "grad_norm": 1.254795511678671, + "learning_rate": 3.5449051379890894e-05, + "loss": 0.2857, + "step": 12881 + }, + { + "epoch": 1.527570259694059, + "grad_norm": 0.9706799962497036, + "learning_rate": 3.544687068888825e-05, + "loss": 0.1826, + "step": 12882 + }, + { + "epoch": 1.527688841456184, + "grad_norm": 1.1371724695713745, + "learning_rate": 3.5444689901578264e-05, + "loss": 0.2382, + "step": 12883 + }, + { + "epoch": 1.527807423218309, + "grad_norm": 1.226433890498801, + "learning_rate": 3.5442509017981055e-05, + "loss": 0.2632, + "step": 12884 + }, + { + "epoch": 1.527926004980434, + "grad_norm": 0.762443992949207, + "learning_rate": 3.5440328038116716e-05, + "loss": 0.166, + "step": 12885 + }, + { + "epoch": 1.528044586742559, + "grad_norm": 0.9412518669724855, + "learning_rate": 3.543814696200536e-05, + "loss": 0.1808, + "step": 12886 + }, + { + "epoch": 1.5281631685046841, + "grad_norm": 1.200731140271332, + "learning_rate": 3.5435965789667074e-05, + "loss": 0.2495, + "step": 12887 + }, + { + "epoch": 1.528281750266809, + "grad_norm": 0.6075508909893481, + "learning_rate": 3.5433784521121993e-05, + "loss": 0.1306, + "step": 12888 + }, + { + "epoch": 1.528400332028934, + "grad_norm": 0.8812628728460592, + "learning_rate": 3.543160315639021e-05, + "loss": 0.1801, + "step": 12889 + }, + { + "epoch": 1.5285189137910589, + "grad_norm": 0.8385972728755706, + "learning_rate": 3.5429421695491835e-05, + "loss": 0.2116, + "step": 12890 + }, + { + "epoch": 1.528637495553184, + "grad_norm": 1.1739157183971491, + "learning_rate": 3.542724013844698e-05, + "loss": 0.2238, + "step": 12891 + }, + { + "epoch": 1.5287560773153088, + "grad_norm": 1.1223654891712085, + "learning_rate": 3.5425058485275766e-05, + "loss": 0.2437, + "step": 12892 + }, + { + "epoch": 1.528874659077434, + "grad_norm": 1.4932989723036958, + "learning_rate": 3.542287673599829e-05, + "loss": 0.2605, + "step": 12893 + }, + { + "epoch": 1.5289932408395588, + "grad_norm": 1.1281476404386483, + "learning_rate": 3.5420694890634674e-05, + "loss": 0.2735, + "step": 12894 + }, + { + "epoch": 1.529111822601684, + "grad_norm": 1.043464986449808, + "learning_rate": 3.5418512949205035e-05, + "loss": 0.2306, + "step": 12895 + }, + { + "epoch": 1.5292304043638087, + "grad_norm": 0.8648958692593001, + "learning_rate": 3.541633091172947e-05, + "loss": 0.1444, + "step": 12896 + }, + { + "epoch": 1.529348986125934, + "grad_norm": 1.0220171020899242, + "learning_rate": 3.5414148778228124e-05, + "loss": 0.1998, + "step": 12897 + }, + { + "epoch": 1.5294675678880587, + "grad_norm": 1.6114558155377965, + "learning_rate": 3.541196654872109e-05, + "loss": 0.37, + "step": 12898 + }, + { + "epoch": 1.529586149650184, + "grad_norm": 1.164656876250275, + "learning_rate": 3.540978422322849e-05, + "loss": 0.3151, + "step": 12899 + }, + { + "epoch": 1.5297047314123087, + "grad_norm": 0.9120332876626611, + "learning_rate": 3.540760180177044e-05, + "loss": 0.1984, + "step": 12900 + }, + { + "epoch": 1.5298233131744339, + "grad_norm": 1.0810625608759883, + "learning_rate": 3.540541928436707e-05, + "loss": 0.2299, + "step": 12901 + }, + { + "epoch": 1.5299418949365586, + "grad_norm": 1.1524900714041455, + "learning_rate": 3.540323667103849e-05, + "loss": 0.22, + "step": 12902 + }, + { + "epoch": 1.5300604766986838, + "grad_norm": 1.2421686526707942, + "learning_rate": 3.540105396180483e-05, + "loss": 0.2002, + "step": 12903 + }, + { + "epoch": 1.5301790584608086, + "grad_norm": 0.8278686031026299, + "learning_rate": 3.5398871156686206e-05, + "loss": 0.192, + "step": 12904 + }, + { + "epoch": 1.5302976402229338, + "grad_norm": 0.8565528097814316, + "learning_rate": 3.5396688255702747e-05, + "loss": 0.188, + "step": 12905 + }, + { + "epoch": 1.5304162219850586, + "grad_norm": 0.9556960490062892, + "learning_rate": 3.539450525887456e-05, + "loss": 0.2014, + "step": 12906 + }, + { + "epoch": 1.5305348037471838, + "grad_norm": 1.0472701562060664, + "learning_rate": 3.539232216622179e-05, + "loss": 0.1643, + "step": 12907 + }, + { + "epoch": 1.5306533855093085, + "grad_norm": 0.922581938190148, + "learning_rate": 3.539013897776455e-05, + "loss": 0.2216, + "step": 12908 + }, + { + "epoch": 1.5307719672714337, + "grad_norm": 1.1351067817012357, + "learning_rate": 3.5387955693522975e-05, + "loss": 0.3453, + "step": 12909 + }, + { + "epoch": 1.5308905490335585, + "grad_norm": 0.849977797024075, + "learning_rate": 3.5385772313517176e-05, + "loss": 0.1638, + "step": 12910 + }, + { + "epoch": 1.5310091307956837, + "grad_norm": 0.7121233787766494, + "learning_rate": 3.53835888377673e-05, + "loss": 0.1742, + "step": 12911 + }, + { + "epoch": 1.5311277125578087, + "grad_norm": 1.1308953267843558, + "learning_rate": 3.5381405266293464e-05, + "loss": 0.277, + "step": 12912 + }, + { + "epoch": 1.5312462943199336, + "grad_norm": 0.8656292514072345, + "learning_rate": 3.537922159911581e-05, + "loss": 0.1592, + "step": 12913 + }, + { + "epoch": 1.5313648760820586, + "grad_norm": 0.7599069833344445, + "learning_rate": 3.537703783625446e-05, + "loss": 0.1526, + "step": 12914 + }, + { + "epoch": 1.5314834578441836, + "grad_norm": 1.191425852818175, + "learning_rate": 3.537485397772954e-05, + "loss": 0.2135, + "step": 12915 + }, + { + "epoch": 1.5316020396063086, + "grad_norm": 0.6342451830811993, + "learning_rate": 3.537267002356119e-05, + "loss": 0.155, + "step": 12916 + }, + { + "epoch": 1.5317206213684336, + "grad_norm": 1.4591828702066778, + "learning_rate": 3.537048597376954e-05, + "loss": 0.3795, + "step": 12917 + }, + { + "epoch": 1.5318392031305585, + "grad_norm": 1.5562004046372202, + "learning_rate": 3.536830182837474e-05, + "loss": 0.326, + "step": 12918 + }, + { + "epoch": 1.5319577848926835, + "grad_norm": 0.7073277419706369, + "learning_rate": 3.536611758739689e-05, + "loss": 0.1347, + "step": 12919 + }, + { + "epoch": 1.5320763666548085, + "grad_norm": 0.9692130078385965, + "learning_rate": 3.536393325085616e-05, + "loss": 0.2401, + "step": 12920 + }, + { + "epoch": 1.5321949484169335, + "grad_norm": 1.1822539793464235, + "learning_rate": 3.536174881877267e-05, + "loss": 0.2354, + "step": 12921 + }, + { + "epoch": 1.5323135301790585, + "grad_norm": 0.8406792847645952, + "learning_rate": 3.535956429116657e-05, + "loss": 0.1794, + "step": 12922 + }, + { + "epoch": 1.5324321119411835, + "grad_norm": 0.8728744119799386, + "learning_rate": 3.535737966805798e-05, + "loss": 0.2003, + "step": 12923 + }, + { + "epoch": 1.5325506937033084, + "grad_norm": 0.7790032597596278, + "learning_rate": 3.535519494946706e-05, + "loss": 0.1514, + "step": 12924 + }, + { + "epoch": 1.5326692754654334, + "grad_norm": 1.3373013794959874, + "learning_rate": 3.535301013541394e-05, + "loss": 0.3106, + "step": 12925 + }, + { + "epoch": 1.5327878572275584, + "grad_norm": 1.0501721825218668, + "learning_rate": 3.535082522591876e-05, + "loss": 0.2316, + "step": 12926 + }, + { + "epoch": 1.5329064389896834, + "grad_norm": 1.2569406903245595, + "learning_rate": 3.534864022100167e-05, + "loss": 0.2263, + "step": 12927 + }, + { + "epoch": 1.5330250207518084, + "grad_norm": 0.9349563489277629, + "learning_rate": 3.5346455120682796e-05, + "loss": 0.2053, + "step": 12928 + }, + { + "epoch": 1.5331436025139333, + "grad_norm": 1.0345961169469389, + "learning_rate": 3.53442699249823e-05, + "loss": 0.2217, + "step": 12929 + }, + { + "epoch": 1.5332621842760583, + "grad_norm": 0.806763595036304, + "learning_rate": 3.5342084633920316e-05, + "loss": 0.1559, + "step": 12930 + }, + { + "epoch": 1.5333807660381833, + "grad_norm": 0.8119502272694814, + "learning_rate": 3.5339899247517e-05, + "loss": 0.186, + "step": 12931 + }, + { + "epoch": 1.5334993478003083, + "grad_norm": 0.8471037817058749, + "learning_rate": 3.5337713765792494e-05, + "loss": 0.223, + "step": 12932 + }, + { + "epoch": 1.5336179295624333, + "grad_norm": 0.9318811017977353, + "learning_rate": 3.533552818876694e-05, + "loss": 0.2028, + "step": 12933 + }, + { + "epoch": 1.5337365113245582, + "grad_norm": 0.8946494413316326, + "learning_rate": 3.533334251646049e-05, + "loss": 0.1626, + "step": 12934 + }, + { + "epoch": 1.5338550930866832, + "grad_norm": 1.0925514690441913, + "learning_rate": 3.5331156748893295e-05, + "loss": 0.269, + "step": 12935 + }, + { + "epoch": 1.5339736748488082, + "grad_norm": 0.9845897007602804, + "learning_rate": 3.53289708860855e-05, + "loss": 0.2335, + "step": 12936 + }, + { + "epoch": 1.5340922566109332, + "grad_norm": 1.1320380020439396, + "learning_rate": 3.5326784928057264e-05, + "loss": 0.2569, + "step": 12937 + }, + { + "epoch": 1.5342108383730584, + "grad_norm": 1.216845909061438, + "learning_rate": 3.532459887482873e-05, + "loss": 0.2486, + "step": 12938 + }, + { + "epoch": 1.5343294201351831, + "grad_norm": 0.8516156809713886, + "learning_rate": 3.532241272642006e-05, + "loss": 0.2259, + "step": 12939 + }, + { + "epoch": 1.5344480018973083, + "grad_norm": 1.0620111051324834, + "learning_rate": 3.53202264828514e-05, + "loss": 0.258, + "step": 12940 + }, + { + "epoch": 1.534566583659433, + "grad_norm": 1.2582016356972028, + "learning_rate": 3.53180401441429e-05, + "loss": 0.2061, + "step": 12941 + }, + { + "epoch": 1.5346851654215583, + "grad_norm": 0.9712485161430684, + "learning_rate": 3.531585371031474e-05, + "loss": 0.1954, + "step": 12942 + }, + { + "epoch": 1.534803747183683, + "grad_norm": 0.7788363307945733, + "learning_rate": 3.531366718138705e-05, + "loss": 0.1542, + "step": 12943 + }, + { + "epoch": 1.5349223289458083, + "grad_norm": 0.6948123573014953, + "learning_rate": 3.531148055738e-05, + "loss": 0.1678, + "step": 12944 + }, + { + "epoch": 1.535040910707933, + "grad_norm": 0.9358222672327046, + "learning_rate": 3.530929383831374e-05, + "loss": 0.2226, + "step": 12945 + }, + { + "epoch": 1.5351594924700582, + "grad_norm": 0.8553158346798698, + "learning_rate": 3.530710702420843e-05, + "loss": 0.1415, + "step": 12946 + }, + { + "epoch": 1.535278074232183, + "grad_norm": 1.7010113243899436, + "learning_rate": 3.530492011508424e-05, + "loss": 0.4698, + "step": 12947 + }, + { + "epoch": 1.5353966559943082, + "grad_norm": 1.2452001008046305, + "learning_rate": 3.5302733110961316e-05, + "loss": 0.2917, + "step": 12948 + }, + { + "epoch": 1.535515237756433, + "grad_norm": 1.1352727530819346, + "learning_rate": 3.530054601185983e-05, + "loss": 0.2326, + "step": 12949 + }, + { + "epoch": 1.5356338195185582, + "grad_norm": 0.9933986406624238, + "learning_rate": 3.5298358817799935e-05, + "loss": 0.2107, + "step": 12950 + }, + { + "epoch": 1.535752401280683, + "grad_norm": 0.9395524826304277, + "learning_rate": 3.5296171528801805e-05, + "loss": 0.2592, + "step": 12951 + }, + { + "epoch": 1.5358709830428081, + "grad_norm": 0.7801575493655623, + "learning_rate": 3.52939841448856e-05, + "loss": 0.1663, + "step": 12952 + }, + { + "epoch": 1.5359895648049329, + "grad_norm": 0.7122273573564518, + "learning_rate": 3.529179666607149e-05, + "loss": 0.145, + "step": 12953 + }, + { + "epoch": 1.536108146567058, + "grad_norm": 0.8597657971757691, + "learning_rate": 3.5289609092379636e-05, + "loss": 0.1882, + "step": 12954 + }, + { + "epoch": 1.5362267283291828, + "grad_norm": 1.2115345824538846, + "learning_rate": 3.5287421423830195e-05, + "loss": 0.2268, + "step": 12955 + }, + { + "epoch": 1.536345310091308, + "grad_norm": 1.306299420165853, + "learning_rate": 3.528523366044335e-05, + "loss": 0.3155, + "step": 12956 + }, + { + "epoch": 1.5364638918534328, + "grad_norm": 0.7890994178266642, + "learning_rate": 3.528304580223926e-05, + "loss": 0.1449, + "step": 12957 + }, + { + "epoch": 1.536582473615558, + "grad_norm": 1.359055306527004, + "learning_rate": 3.52808578492381e-05, + "loss": 0.2365, + "step": 12958 + }, + { + "epoch": 1.5367010553776828, + "grad_norm": 0.7914279994797084, + "learning_rate": 3.527866980146003e-05, + "loss": 0.1699, + "step": 12959 + }, + { + "epoch": 1.536819637139808, + "grad_norm": 0.9918689589861933, + "learning_rate": 3.5276481658925234e-05, + "loss": 0.1938, + "step": 12960 + }, + { + "epoch": 1.5369382189019327, + "grad_norm": 1.2261810332422252, + "learning_rate": 3.5274293421653885e-05, + "loss": 0.2212, + "step": 12961 + }, + { + "epoch": 1.537056800664058, + "grad_norm": 1.2139540664620825, + "learning_rate": 3.527210508966614e-05, + "loss": 0.2731, + "step": 12962 + }, + { + "epoch": 1.537175382426183, + "grad_norm": 0.8333907474252661, + "learning_rate": 3.526991666298219e-05, + "loss": 0.1638, + "step": 12963 + }, + { + "epoch": 1.5372939641883079, + "grad_norm": 1.1360137792394915, + "learning_rate": 3.5267728141622206e-05, + "loss": 0.2094, + "step": 12964 + }, + { + "epoch": 1.5374125459504329, + "grad_norm": 0.6873009977819795, + "learning_rate": 3.526553952560635e-05, + "loss": 0.1676, + "step": 12965 + }, + { + "epoch": 1.5375311277125578, + "grad_norm": 1.0429011040197398, + "learning_rate": 3.526335081495481e-05, + "loss": 0.2472, + "step": 12966 + }, + { + "epoch": 1.5376497094746828, + "grad_norm": 0.9282481774124294, + "learning_rate": 3.526116200968776e-05, + "loss": 0.1899, + "step": 12967 + }, + { + "epoch": 1.5377682912368078, + "grad_norm": 0.780724859456509, + "learning_rate": 3.5258973109825386e-05, + "loss": 0.2061, + "step": 12968 + }, + { + "epoch": 1.5378868729989328, + "grad_norm": 0.9612424249527737, + "learning_rate": 3.525678411538785e-05, + "loss": 0.1522, + "step": 12969 + }, + { + "epoch": 1.5380054547610578, + "grad_norm": 0.8730359727669729, + "learning_rate": 3.5254595026395355e-05, + "loss": 0.2064, + "step": 12970 + }, + { + "epoch": 1.5381240365231827, + "grad_norm": 0.7827427841143191, + "learning_rate": 3.525240584286806e-05, + "loss": 0.2007, + "step": 12971 + }, + { + "epoch": 1.5382426182853077, + "grad_norm": 0.7031202121711667, + "learning_rate": 3.525021656482616e-05, + "loss": 0.1431, + "step": 12972 + }, + { + "epoch": 1.5383612000474327, + "grad_norm": 1.0316232516244068, + "learning_rate": 3.524802719228983e-05, + "loss": 0.214, + "step": 12973 + }, + { + "epoch": 1.5384797818095577, + "grad_norm": 1.1479799640284831, + "learning_rate": 3.5245837725279254e-05, + "loss": 0.2845, + "step": 12974 + }, + { + "epoch": 1.5385983635716827, + "grad_norm": 1.3939127294167244, + "learning_rate": 3.524364816381463e-05, + "loss": 0.2533, + "step": 12975 + }, + { + "epoch": 1.5387169453338077, + "grad_norm": 1.0577933843877287, + "learning_rate": 3.524145850791612e-05, + "loss": 0.1721, + "step": 12976 + }, + { + "epoch": 1.5388355270959326, + "grad_norm": 1.229558536555313, + "learning_rate": 3.523926875760392e-05, + "loss": 0.2416, + "step": 12977 + }, + { + "epoch": 1.5389541088580576, + "grad_norm": 0.9664361911332459, + "learning_rate": 3.523707891289823e-05, + "loss": 0.2321, + "step": 12978 + }, + { + "epoch": 1.5390726906201826, + "grad_norm": 1.0806548754547245, + "learning_rate": 3.5234888973819215e-05, + "loss": 0.1982, + "step": 12979 + }, + { + "epoch": 1.5391912723823076, + "grad_norm": 1.308227331774289, + "learning_rate": 3.523269894038708e-05, + "loss": 0.2167, + "step": 12980 + }, + { + "epoch": 1.5393098541444326, + "grad_norm": 1.1974719436027046, + "learning_rate": 3.5230508812622e-05, + "loss": 0.2795, + "step": 12981 + }, + { + "epoch": 1.5394284359065575, + "grad_norm": 0.9459011140074187, + "learning_rate": 3.522831859054418e-05, + "loss": 0.1701, + "step": 12982 + }, + { + "epoch": 1.5395470176686825, + "grad_norm": 0.9274187603840137, + "learning_rate": 3.5226128274173806e-05, + "loss": 0.1889, + "step": 12983 + }, + { + "epoch": 1.5396655994308075, + "grad_norm": 1.2103907304938715, + "learning_rate": 3.522393786353108e-05, + "loss": 0.2477, + "step": 12984 + }, + { + "epoch": 1.5397841811929325, + "grad_norm": 1.117379798555609, + "learning_rate": 3.522174735863617e-05, + "loss": 0.2414, + "step": 12985 + }, + { + "epoch": 1.5399027629550575, + "grad_norm": 0.8621323733989461, + "learning_rate": 3.521955675950929e-05, + "loss": 0.1759, + "step": 12986 + }, + { + "epoch": 1.5400213447171827, + "grad_norm": 1.4750240682544027, + "learning_rate": 3.521736606617063e-05, + "loss": 0.312, + "step": 12987 + }, + { + "epoch": 1.5401399264793074, + "grad_norm": 1.0540793117810618, + "learning_rate": 3.521517527864038e-05, + "loss": 0.2664, + "step": 12988 + }, + { + "epoch": 1.5402585082414326, + "grad_norm": 0.8989872899700817, + "learning_rate": 3.5212984396938734e-05, + "loss": 0.1941, + "step": 12989 + }, + { + "epoch": 1.5403770900035574, + "grad_norm": 0.8114822814932272, + "learning_rate": 3.521079342108591e-05, + "loss": 0.1636, + "step": 12990 + }, + { + "epoch": 1.5404956717656826, + "grad_norm": 0.8777280171578562, + "learning_rate": 3.520860235110208e-05, + "loss": 0.1819, + "step": 12991 + }, + { + "epoch": 1.5406142535278073, + "grad_norm": 1.3591510201439414, + "learning_rate": 3.5206411187007466e-05, + "loss": 0.3091, + "step": 12992 + }, + { + "epoch": 1.5407328352899325, + "grad_norm": 1.0674957016481832, + "learning_rate": 3.5204219928822255e-05, + "loss": 0.2943, + "step": 12993 + }, + { + "epoch": 1.5408514170520573, + "grad_norm": 0.9574320138494344, + "learning_rate": 3.520202857656664e-05, + "loss": 0.2005, + "step": 12994 + }, + { + "epoch": 1.5409699988141825, + "grad_norm": 0.5656993264249255, + "learning_rate": 3.519983713026084e-05, + "loss": 0.1503, + "step": 12995 + }, + { + "epoch": 1.5410885805763073, + "grad_norm": 1.125911556772812, + "learning_rate": 3.519764558992505e-05, + "loss": 0.2624, + "step": 12996 + }, + { + "epoch": 1.5412071623384325, + "grad_norm": 1.163397370051345, + "learning_rate": 3.5195453955579464e-05, + "loss": 0.2074, + "step": 12997 + }, + { + "epoch": 1.5413257441005572, + "grad_norm": 1.3938242727766779, + "learning_rate": 3.5193262227244306e-05, + "loss": 0.2804, + "step": 12998 + }, + { + "epoch": 1.5414443258626824, + "grad_norm": 1.005019918323092, + "learning_rate": 3.519107040493976e-05, + "loss": 0.2243, + "step": 12999 + }, + { + "epoch": 1.5415629076248072, + "grad_norm": 0.8967494602600352, + "learning_rate": 3.518887848868605e-05, + "loss": 0.1659, + "step": 13000 + }, + { + "epoch": 1.5416814893869324, + "grad_norm": 1.0798536271305383, + "learning_rate": 3.5186686478503374e-05, + "loss": 0.2908, + "step": 13001 + }, + { + "epoch": 1.5418000711490571, + "grad_norm": 0.7699423639711502, + "learning_rate": 3.518449437441194e-05, + "loss": 0.1861, + "step": 13002 + }, + { + "epoch": 1.5419186529111824, + "grad_norm": 1.16641854329243, + "learning_rate": 3.518230217643195e-05, + "loss": 0.2536, + "step": 13003 + }, + { + "epoch": 1.5420372346733071, + "grad_norm": 0.8133774793515741, + "learning_rate": 3.5180109884583636e-05, + "loss": 0.19, + "step": 13004 + }, + { + "epoch": 1.5421558164354323, + "grad_norm": 0.9697880257622497, + "learning_rate": 3.517791749888718e-05, + "loss": 0.1939, + "step": 13005 + }, + { + "epoch": 1.542274398197557, + "grad_norm": 1.1560843793957765, + "learning_rate": 3.517572501936281e-05, + "loss": 0.2178, + "step": 13006 + }, + { + "epoch": 1.5423929799596823, + "grad_norm": 0.8554450456361307, + "learning_rate": 3.517353244603073e-05, + "loss": 0.1691, + "step": 13007 + }, + { + "epoch": 1.542511561721807, + "grad_norm": 0.6257118671982421, + "learning_rate": 3.517133977891116e-05, + "loss": 0.1558, + "step": 13008 + }, + { + "epoch": 1.5426301434839322, + "grad_norm": 1.1079535477444116, + "learning_rate": 3.516914701802431e-05, + "loss": 0.2849, + "step": 13009 + }, + { + "epoch": 1.542748725246057, + "grad_norm": 0.8969484811979774, + "learning_rate": 3.5166954163390384e-05, + "loss": 0.1859, + "step": 13010 + }, + { + "epoch": 1.5428673070081822, + "grad_norm": 0.7351545488196336, + "learning_rate": 3.516476121502962e-05, + "loss": 0.1363, + "step": 13011 + }, + { + "epoch": 1.5429858887703072, + "grad_norm": 1.2289123927171712, + "learning_rate": 3.516256817296222e-05, + "loss": 0.283, + "step": 13012 + }, + { + "epoch": 1.5431044705324322, + "grad_norm": 0.9515289848502695, + "learning_rate": 3.5160375037208404e-05, + "loss": 0.1787, + "step": 13013 + }, + { + "epoch": 1.5432230522945571, + "grad_norm": 0.9608412896001378, + "learning_rate": 3.5158181807788386e-05, + "loss": 0.2518, + "step": 13014 + }, + { + "epoch": 1.5433416340566821, + "grad_norm": 1.1206935710543817, + "learning_rate": 3.51559884847224e-05, + "loss": 0.2641, + "step": 13015 + }, + { + "epoch": 1.543460215818807, + "grad_norm": 0.9436479665184189, + "learning_rate": 3.515379506803064e-05, + "loss": 0.1884, + "step": 13016 + }, + { + "epoch": 1.543578797580932, + "grad_norm": 1.210087028758465, + "learning_rate": 3.515160155773335e-05, + "loss": 0.2635, + "step": 13017 + }, + { + "epoch": 1.543697379343057, + "grad_norm": 1.1035978405324391, + "learning_rate": 3.514940795385074e-05, + "loss": 0.1807, + "step": 13018 + }, + { + "epoch": 1.543815961105182, + "grad_norm": 0.9566180849829984, + "learning_rate": 3.5147214256403025e-05, + "loss": 0.2049, + "step": 13019 + }, + { + "epoch": 1.543934542867307, + "grad_norm": 0.7715351679186057, + "learning_rate": 3.514502046541045e-05, + "loss": 0.1719, + "step": 13020 + }, + { + "epoch": 1.544053124629432, + "grad_norm": 1.3308531470103715, + "learning_rate": 3.5142826580893226e-05, + "loss": 0.287, + "step": 13021 + }, + { + "epoch": 1.544171706391557, + "grad_norm": 0.8204744601623554, + "learning_rate": 3.514063260287157e-05, + "loss": 0.1989, + "step": 13022 + }, + { + "epoch": 1.544290288153682, + "grad_norm": 0.9854019867619556, + "learning_rate": 3.513843853136573e-05, + "loss": 0.1821, + "step": 13023 + }, + { + "epoch": 1.544408869915807, + "grad_norm": 0.7957680539764332, + "learning_rate": 3.513624436639591e-05, + "loss": 0.1751, + "step": 13024 + }, + { + "epoch": 1.544527451677932, + "grad_norm": 0.9311732494522149, + "learning_rate": 3.5134050107982345e-05, + "loss": 0.1906, + "step": 13025 + }, + { + "epoch": 1.544646033440057, + "grad_norm": 1.039127942363783, + "learning_rate": 3.5131855756145274e-05, + "loss": 0.1978, + "step": 13026 + }, + { + "epoch": 1.544764615202182, + "grad_norm": 0.9410772594522689, + "learning_rate": 3.512966131090492e-05, + "loss": 0.2041, + "step": 13027 + }, + { + "epoch": 1.5448831969643069, + "grad_norm": 0.9802741164736304, + "learning_rate": 3.5127466772281496e-05, + "loss": 0.1742, + "step": 13028 + }, + { + "epoch": 1.5450017787264319, + "grad_norm": 1.3425025108458843, + "learning_rate": 3.512527214029526e-05, + "loss": 0.2392, + "step": 13029 + }, + { + "epoch": 1.5451203604885568, + "grad_norm": 0.8435718835050805, + "learning_rate": 3.512307741496642e-05, + "loss": 0.1448, + "step": 13030 + }, + { + "epoch": 1.5452389422506818, + "grad_norm": 1.0110960655111323, + "learning_rate": 3.512088259631522e-05, + "loss": 0.1433, + "step": 13031 + }, + { + "epoch": 1.5453575240128068, + "grad_norm": 1.1656222313852176, + "learning_rate": 3.511868768436191e-05, + "loss": 0.2208, + "step": 13032 + }, + { + "epoch": 1.5454761057749318, + "grad_norm": 0.8146590854901843, + "learning_rate": 3.51164926791267e-05, + "loss": 0.1678, + "step": 13033 + }, + { + "epoch": 1.5455946875370568, + "grad_norm": 0.8677196349298805, + "learning_rate": 3.511429758062983e-05, + "loss": 0.1798, + "step": 13034 + }, + { + "epoch": 1.5457132692991817, + "grad_norm": 1.2680578365054633, + "learning_rate": 3.511210238889153e-05, + "loss": 0.2402, + "step": 13035 + }, + { + "epoch": 1.5458318510613067, + "grad_norm": 1.0679785096738463, + "learning_rate": 3.510990710393207e-05, + "loss": 0.2132, + "step": 13036 + }, + { + "epoch": 1.5459504328234317, + "grad_norm": 1.21574477350966, + "learning_rate": 3.510771172577164e-05, + "loss": 0.2434, + "step": 13037 + }, + { + "epoch": 1.546069014585557, + "grad_norm": 1.050381199497278, + "learning_rate": 3.510551625443051e-05, + "loss": 0.2208, + "step": 13038 + }, + { + "epoch": 1.5461875963476817, + "grad_norm": 1.0548831510153518, + "learning_rate": 3.510332068992892e-05, + "loss": 0.1855, + "step": 13039 + }, + { + "epoch": 1.5463061781098069, + "grad_norm": 0.9952235029001298, + "learning_rate": 3.51011250322871e-05, + "loss": 0.1878, + "step": 13040 + }, + { + "epoch": 1.5464247598719316, + "grad_norm": 1.7066225297407105, + "learning_rate": 3.509892928152529e-05, + "loss": 0.282, + "step": 13041 + }, + { + "epoch": 1.5465433416340568, + "grad_norm": 0.916588070782745, + "learning_rate": 3.5096733437663736e-05, + "loss": 0.1931, + "step": 13042 + }, + { + "epoch": 1.5466619233961816, + "grad_norm": 1.0023453979422223, + "learning_rate": 3.509453750072268e-05, + "loss": 0.2472, + "step": 13043 + }, + { + "epoch": 1.5467805051583068, + "grad_norm": 0.9595273326570879, + "learning_rate": 3.509234147072238e-05, + "loss": 0.1869, + "step": 13044 + }, + { + "epoch": 1.5468990869204315, + "grad_norm": 0.672928113799073, + "learning_rate": 3.509014534768306e-05, + "loss": 0.1716, + "step": 13045 + }, + { + "epoch": 1.5470176686825567, + "grad_norm": 1.2484328322170228, + "learning_rate": 3.508794913162496e-05, + "loss": 0.2761, + "step": 13046 + }, + { + "epoch": 1.5471362504446815, + "grad_norm": 0.9033157703449846, + "learning_rate": 3.508575282256836e-05, + "loss": 0.1915, + "step": 13047 + }, + { + "epoch": 1.5472548322068067, + "grad_norm": 1.1480952438110354, + "learning_rate": 3.508355642053348e-05, + "loss": 0.234, + "step": 13048 + }, + { + "epoch": 1.5473734139689315, + "grad_norm": 1.0453702842544539, + "learning_rate": 3.508135992554057e-05, + "loss": 0.2071, + "step": 13049 + }, + { + "epoch": 1.5474919957310567, + "grad_norm": 0.9988824550678258, + "learning_rate": 3.5079163337609884e-05, + "loss": 0.1829, + "step": 13050 + }, + { + "epoch": 1.5476105774931814, + "grad_norm": 0.9240896555106584, + "learning_rate": 3.5076966656761677e-05, + "loss": 0.2031, + "step": 13051 + }, + { + "epoch": 1.5477291592553066, + "grad_norm": 1.0949409812057098, + "learning_rate": 3.507476988301619e-05, + "loss": 0.2568, + "step": 13052 + }, + { + "epoch": 1.5478477410174314, + "grad_norm": 0.9198506836159268, + "learning_rate": 3.5072573016393685e-05, + "loss": 0.2146, + "step": 13053 + }, + { + "epoch": 1.5479663227795566, + "grad_norm": 1.0266702088188966, + "learning_rate": 3.5070376056914403e-05, + "loss": 0.237, + "step": 13054 + }, + { + "epoch": 1.5480849045416814, + "grad_norm": 0.7788213817412164, + "learning_rate": 3.50681790045986e-05, + "loss": 0.157, + "step": 13055 + }, + { + "epoch": 1.5482034863038066, + "grad_norm": 1.0017466110114257, + "learning_rate": 3.5065981859466545e-05, + "loss": 0.2031, + "step": 13056 + }, + { + "epoch": 1.5483220680659313, + "grad_norm": 1.4467566178648157, + "learning_rate": 3.5063784621538476e-05, + "loss": 0.3024, + "step": 13057 + }, + { + "epoch": 1.5484406498280565, + "grad_norm": 0.7696994498241706, + "learning_rate": 3.506158729083465e-05, + "loss": 0.1403, + "step": 13058 + }, + { + "epoch": 1.5485592315901813, + "grad_norm": 0.8286890474267681, + "learning_rate": 3.505938986737533e-05, + "loss": 0.1728, + "step": 13059 + }, + { + "epoch": 1.5486778133523065, + "grad_norm": 1.0212572920373388, + "learning_rate": 3.5057192351180766e-05, + "loss": 0.2196, + "step": 13060 + }, + { + "epoch": 1.5487963951144312, + "grad_norm": 0.7996385785179889, + "learning_rate": 3.505499474227123e-05, + "loss": 0.1524, + "step": 13061 + }, + { + "epoch": 1.5489149768765564, + "grad_norm": 0.9279176677819817, + "learning_rate": 3.5052797040666976e-05, + "loss": 0.1993, + "step": 13062 + }, + { + "epoch": 1.5490335586386814, + "grad_norm": 0.917622027986063, + "learning_rate": 3.505059924638825e-05, + "loss": 0.1662, + "step": 13063 + }, + { + "epoch": 1.5491521404008064, + "grad_norm": 1.0395938932733506, + "learning_rate": 3.504840135945533e-05, + "loss": 0.196, + "step": 13064 + }, + { + "epoch": 1.5492707221629314, + "grad_norm": 1.0471978220964535, + "learning_rate": 3.504620337988847e-05, + "loss": 0.2154, + "step": 13065 + }, + { + "epoch": 1.5493893039250564, + "grad_norm": 1.1668586063290236, + "learning_rate": 3.5044005307707926e-05, + "loss": 0.1981, + "step": 13066 + }, + { + "epoch": 1.5495078856871813, + "grad_norm": 1.6821725162158043, + "learning_rate": 3.504180714293398e-05, + "loss": 0.3727, + "step": 13067 + }, + { + "epoch": 1.5496264674493063, + "grad_norm": 1.0776016600845943, + "learning_rate": 3.503960888558688e-05, + "loss": 0.2294, + "step": 13068 + }, + { + "epoch": 1.5497450492114313, + "grad_norm": 1.2868963298416136, + "learning_rate": 3.50374105356869e-05, + "loss": 0.2545, + "step": 13069 + }, + { + "epoch": 1.5498636309735563, + "grad_norm": 1.0441579310427482, + "learning_rate": 3.503521209325431e-05, + "loss": 0.2129, + "step": 13070 + }, + { + "epoch": 1.5499822127356813, + "grad_norm": 0.8406246000742389, + "learning_rate": 3.5033013558309364e-05, + "loss": 0.1921, + "step": 13071 + }, + { + "epoch": 1.5501007944978062, + "grad_norm": 1.4279590781610474, + "learning_rate": 3.503081493087234e-05, + "loss": 0.229, + "step": 13072 + }, + { + "epoch": 1.5502193762599312, + "grad_norm": 0.9657246571399705, + "learning_rate": 3.502861621096349e-05, + "loss": 0.2107, + "step": 13073 + }, + { + "epoch": 1.5503379580220562, + "grad_norm": 0.872634330352901, + "learning_rate": 3.5026417398603114e-05, + "loss": 0.1513, + "step": 13074 + }, + { + "epoch": 1.5504565397841812, + "grad_norm": 0.9887145828471897, + "learning_rate": 3.502421849381146e-05, + "loss": 0.1516, + "step": 13075 + }, + { + "epoch": 1.5505751215463062, + "grad_norm": 0.7951344078562673, + "learning_rate": 3.50220194966088e-05, + "loss": 0.1662, + "step": 13076 + }, + { + "epoch": 1.5506937033084311, + "grad_norm": 0.9924477354321085, + "learning_rate": 3.501982040701541e-05, + "loss": 0.2227, + "step": 13077 + }, + { + "epoch": 1.5508122850705561, + "grad_norm": 1.0684090699813609, + "learning_rate": 3.5017621225051564e-05, + "loss": 0.2752, + "step": 13078 + }, + { + "epoch": 1.550930866832681, + "grad_norm": 0.7275373549364704, + "learning_rate": 3.5015421950737537e-05, + "loss": 0.1963, + "step": 13079 + }, + { + "epoch": 1.551049448594806, + "grad_norm": 1.3470975164693046, + "learning_rate": 3.50132225840936e-05, + "loss": 0.271, + "step": 13080 + }, + { + "epoch": 1.551168030356931, + "grad_norm": 0.9380366284748598, + "learning_rate": 3.501102312514003e-05, + "loss": 0.1907, + "step": 13081 + }, + { + "epoch": 1.551286612119056, + "grad_norm": 0.8075865378197027, + "learning_rate": 3.5008823573897096e-05, + "loss": 0.1921, + "step": 13082 + }, + { + "epoch": 1.551405193881181, + "grad_norm": 1.0404317924871154, + "learning_rate": 3.500662393038509e-05, + "loss": 0.2751, + "step": 13083 + }, + { + "epoch": 1.551523775643306, + "grad_norm": 0.9659705538542429, + "learning_rate": 3.5004424194624284e-05, + "loss": 0.2031, + "step": 13084 + }, + { + "epoch": 1.551642357405431, + "grad_norm": 1.3460301913964523, + "learning_rate": 3.500222436663495e-05, + "loss": 0.1922, + "step": 13085 + }, + { + "epoch": 1.551760939167556, + "grad_norm": 0.929401080154763, + "learning_rate": 3.500002444643738e-05, + "loss": 0.1899, + "step": 13086 + }, + { + "epoch": 1.5518795209296812, + "grad_norm": 0.6572845765107708, + "learning_rate": 3.499782443405184e-05, + "loss": 0.1476, + "step": 13087 + }, + { + "epoch": 1.551998102691806, + "grad_norm": 1.0528895662198148, + "learning_rate": 3.499562432949862e-05, + "loss": 0.2394, + "step": 13088 + }, + { + "epoch": 1.5521166844539311, + "grad_norm": 0.9642450757858113, + "learning_rate": 3.4993424132798e-05, + "loss": 0.2432, + "step": 13089 + }, + { + "epoch": 1.552235266216056, + "grad_norm": 0.9577589298138799, + "learning_rate": 3.499122384397027e-05, + "loss": 0.2211, + "step": 13090 + }, + { + "epoch": 1.552353847978181, + "grad_norm": 0.8359548967670991, + "learning_rate": 3.49890234630357e-05, + "loss": 0.1692, + "step": 13091 + }, + { + "epoch": 1.5524724297403059, + "grad_norm": 1.0742670570751842, + "learning_rate": 3.49868229900146e-05, + "loss": 0.1966, + "step": 13092 + }, + { + "epoch": 1.552591011502431, + "grad_norm": 1.098460007765056, + "learning_rate": 3.4984622424927214e-05, + "loss": 0.266, + "step": 13093 + }, + { + "epoch": 1.5527095932645558, + "grad_norm": 1.001684991231171, + "learning_rate": 3.498242176779387e-05, + "loss": 0.1978, + "step": 13094 + }, + { + "epoch": 1.552828175026681, + "grad_norm": 1.020942017416016, + "learning_rate": 3.498022101863483e-05, + "loss": 0.1689, + "step": 13095 + }, + { + "epoch": 1.5529467567888058, + "grad_norm": 1.0037811351896286, + "learning_rate": 3.497802017747041e-05, + "loss": 0.2373, + "step": 13096 + }, + { + "epoch": 1.553065338550931, + "grad_norm": 1.0030523640356488, + "learning_rate": 3.497581924432086e-05, + "loss": 0.2528, + "step": 13097 + }, + { + "epoch": 1.5531839203130557, + "grad_norm": 0.8545710128825859, + "learning_rate": 3.497361821920649e-05, + "loss": 0.2045, + "step": 13098 + }, + { + "epoch": 1.553302502075181, + "grad_norm": 1.173053381474065, + "learning_rate": 3.4971417102147595e-05, + "loss": 0.2468, + "step": 13099 + }, + { + "epoch": 1.5534210838373057, + "grad_norm": 1.016236958324545, + "learning_rate": 3.496921589316446e-05, + "loss": 0.2783, + "step": 13100 + }, + { + "epoch": 1.553539665599431, + "grad_norm": 0.868498869740926, + "learning_rate": 3.496701459227738e-05, + "loss": 0.1769, + "step": 13101 + }, + { + "epoch": 1.5536582473615557, + "grad_norm": 0.9722466723626487, + "learning_rate": 3.4964813199506654e-05, + "loss": 0.2514, + "step": 13102 + }, + { + "epoch": 1.5537768291236809, + "grad_norm": 0.8977767740813032, + "learning_rate": 3.4962611714872565e-05, + "loss": 0.1713, + "step": 13103 + }, + { + "epoch": 1.5538954108858056, + "grad_norm": 0.6958854050095565, + "learning_rate": 3.496041013839542e-05, + "loss": 0.1233, + "step": 13104 + }, + { + "epoch": 1.5540139926479308, + "grad_norm": 1.0593222338801247, + "learning_rate": 3.49582084700955e-05, + "loss": 0.2065, + "step": 13105 + }, + { + "epoch": 1.5541325744100556, + "grad_norm": 0.9595613580882345, + "learning_rate": 3.4956006709993105e-05, + "loss": 0.198, + "step": 13106 + }, + { + "epoch": 1.5542511561721808, + "grad_norm": 1.017860415584178, + "learning_rate": 3.495380485810855e-05, + "loss": 0.193, + "step": 13107 + }, + { + "epoch": 1.5543697379343056, + "grad_norm": 1.3310186285271595, + "learning_rate": 3.4951602914462114e-05, + "loss": 0.288, + "step": 13108 + }, + { + "epoch": 1.5544883196964308, + "grad_norm": 0.8549752818571824, + "learning_rate": 3.49494008790741e-05, + "loss": 0.1809, + "step": 13109 + }, + { + "epoch": 1.5546069014585555, + "grad_norm": 1.514987428659583, + "learning_rate": 3.4947198751964806e-05, + "loss": 0.3449, + "step": 13110 + }, + { + "epoch": 1.5547254832206807, + "grad_norm": 1.4370681489152513, + "learning_rate": 3.494499653315454e-05, + "loss": 0.231, + "step": 13111 + }, + { + "epoch": 1.5548440649828057, + "grad_norm": 0.8798598073730708, + "learning_rate": 3.4942794222663614e-05, + "loss": 0.1524, + "step": 13112 + }, + { + "epoch": 1.5549626467449307, + "grad_norm": 0.8643654177395045, + "learning_rate": 3.49405918205123e-05, + "loss": 0.1823, + "step": 13113 + }, + { + "epoch": 1.5550812285070557, + "grad_norm": 1.169254076067171, + "learning_rate": 3.493838932672093e-05, + "loss": 0.2577, + "step": 13114 + }, + { + "epoch": 1.5551998102691806, + "grad_norm": 0.9466897195807291, + "learning_rate": 3.49361867413098e-05, + "loss": 0.2144, + "step": 13115 + }, + { + "epoch": 1.5553183920313056, + "grad_norm": 1.1512895021469502, + "learning_rate": 3.493398406429921e-05, + "loss": 0.2144, + "step": 13116 + }, + { + "epoch": 1.5554369737934306, + "grad_norm": 0.870182091702603, + "learning_rate": 3.4931781295709466e-05, + "loss": 0.1885, + "step": 13117 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 0.9162967941398, + "learning_rate": 3.492957843556088e-05, + "loss": 0.1808, + "step": 13118 + }, + { + "epoch": 1.5556741373176806, + "grad_norm": 0.9545320451698327, + "learning_rate": 3.4927375483873756e-05, + "loss": 0.2328, + "step": 13119 + }, + { + "epoch": 1.5557927190798055, + "grad_norm": 0.9387178101470325, + "learning_rate": 3.492517244066841e-05, + "loss": 0.2126, + "step": 13120 + }, + { + "epoch": 1.5559113008419305, + "grad_norm": 0.8285605348662975, + "learning_rate": 3.492296930596514e-05, + "loss": 0.1727, + "step": 13121 + }, + { + "epoch": 1.5560298826040555, + "grad_norm": 1.1411185207194117, + "learning_rate": 3.4920766079784256e-05, + "loss": 0.2736, + "step": 13122 + }, + { + "epoch": 1.5561484643661805, + "grad_norm": 1.4504907902272868, + "learning_rate": 3.491856276214608e-05, + "loss": 0.3513, + "step": 13123 + }, + { + "epoch": 1.5562670461283055, + "grad_norm": 0.9169851624813995, + "learning_rate": 3.491635935307092e-05, + "loss": 0.1745, + "step": 13124 + }, + { + "epoch": 1.5563856278904304, + "grad_norm": 0.7225685037801819, + "learning_rate": 3.4914155852579084e-05, + "loss": 0.1477, + "step": 13125 + }, + { + "epoch": 1.5565042096525554, + "grad_norm": 0.8401533594041928, + "learning_rate": 3.4911952260690885e-05, + "loss": 0.2204, + "step": 13126 + }, + { + "epoch": 1.5566227914146804, + "grad_norm": 0.7677923788791334, + "learning_rate": 3.490974857742665e-05, + "loss": 0.1568, + "step": 13127 + }, + { + "epoch": 1.5567413731768054, + "grad_norm": 0.9133681432459265, + "learning_rate": 3.490754480280668e-05, + "loss": 0.2203, + "step": 13128 + }, + { + "epoch": 1.5568599549389304, + "grad_norm": 0.9382883976517621, + "learning_rate": 3.490534093685129e-05, + "loss": 0.2293, + "step": 13129 + }, + { + "epoch": 1.5569785367010553, + "grad_norm": 0.9171212760494835, + "learning_rate": 3.490313697958081e-05, + "loss": 0.1858, + "step": 13130 + }, + { + "epoch": 1.5570971184631803, + "grad_norm": 0.9558803791481454, + "learning_rate": 3.4900932931015557e-05, + "loss": 0.195, + "step": 13131 + }, + { + "epoch": 1.5572157002253053, + "grad_norm": 0.8143420738406641, + "learning_rate": 3.4898728791175836e-05, + "loss": 0.1277, + "step": 13132 + }, + { + "epoch": 1.5573342819874303, + "grad_norm": 1.0184367700783872, + "learning_rate": 3.4896524560081976e-05, + "loss": 0.2343, + "step": 13133 + }, + { + "epoch": 1.5574528637495553, + "grad_norm": 0.743029945149777, + "learning_rate": 3.48943202377543e-05, + "loss": 0.1549, + "step": 13134 + }, + { + "epoch": 1.5575714455116803, + "grad_norm": 1.0759046626010405, + "learning_rate": 3.4892115824213126e-05, + "loss": 0.2489, + "step": 13135 + }, + { + "epoch": 1.5576900272738052, + "grad_norm": 0.7685458075041345, + "learning_rate": 3.488991131947876e-05, + "loss": 0.1561, + "step": 13136 + }, + { + "epoch": 1.5578086090359302, + "grad_norm": 0.7843781122116337, + "learning_rate": 3.488770672357155e-05, + "loss": 0.1909, + "step": 13137 + }, + { + "epoch": 1.5579271907980554, + "grad_norm": 0.9633345607065357, + "learning_rate": 3.4885502036511805e-05, + "loss": 0.246, + "step": 13138 + }, + { + "epoch": 1.5580457725601802, + "grad_norm": 0.9983931072730781, + "learning_rate": 3.488329725831986e-05, + "loss": 0.2472, + "step": 13139 + }, + { + "epoch": 1.5581643543223054, + "grad_norm": 0.9764359336522147, + "learning_rate": 3.488109238901602e-05, + "loss": 0.202, + "step": 13140 + }, + { + "epoch": 1.5582829360844301, + "grad_norm": 1.365040164823216, + "learning_rate": 3.487888742862064e-05, + "loss": 0.2928, + "step": 13141 + }, + { + "epoch": 1.5584015178465553, + "grad_norm": 0.9783119528862977, + "learning_rate": 3.487668237715403e-05, + "loss": 0.2038, + "step": 13142 + }, + { + "epoch": 1.55852009960868, + "grad_norm": 0.7434150304840806, + "learning_rate": 3.487447723463652e-05, + "loss": 0.1703, + "step": 13143 + }, + { + "epoch": 1.5586386813708053, + "grad_norm": 1.136884150012758, + "learning_rate": 3.4872272001088436e-05, + "loss": 0.2454, + "step": 13144 + }, + { + "epoch": 1.55875726313293, + "grad_norm": 1.030295439461227, + "learning_rate": 3.4870066676530104e-05, + "loss": 0.2101, + "step": 13145 + }, + { + "epoch": 1.5588758448950553, + "grad_norm": 0.9885845766563467, + "learning_rate": 3.486786126098187e-05, + "loss": 0.1579, + "step": 13146 + }, + { + "epoch": 1.55899442665718, + "grad_norm": 0.8308606307170483, + "learning_rate": 3.4865655754464055e-05, + "loss": 0.1489, + "step": 13147 + }, + { + "epoch": 1.5591130084193052, + "grad_norm": 0.9507319579651601, + "learning_rate": 3.4863450156996993e-05, + "loss": 0.1766, + "step": 13148 + }, + { + "epoch": 1.55923159018143, + "grad_norm": 1.4461470309998787, + "learning_rate": 3.4861244468601006e-05, + "loss": 0.2676, + "step": 13149 + }, + { + "epoch": 1.5593501719435552, + "grad_norm": 0.937917364969022, + "learning_rate": 3.485903868929645e-05, + "loss": 0.1783, + "step": 13150 + }, + { + "epoch": 1.55946875370568, + "grad_norm": 0.9345252435868415, + "learning_rate": 3.485683281910364e-05, + "loss": 0.1293, + "step": 13151 + }, + { + "epoch": 1.5595873354678051, + "grad_norm": 0.7953220958005324, + "learning_rate": 3.485462685804292e-05, + "loss": 0.1725, + "step": 13152 + }, + { + "epoch": 1.55970591722993, + "grad_norm": 1.2236064144136873, + "learning_rate": 3.4852420806134625e-05, + "loss": 0.2779, + "step": 13153 + }, + { + "epoch": 1.559824498992055, + "grad_norm": 0.716966711499907, + "learning_rate": 3.4850214663399097e-05, + "loss": 0.1667, + "step": 13154 + }, + { + "epoch": 1.5599430807541799, + "grad_norm": 1.2624678998033978, + "learning_rate": 3.484800842985666e-05, + "loss": 0.2547, + "step": 13155 + }, + { + "epoch": 1.560061662516305, + "grad_norm": 1.4976435318468857, + "learning_rate": 3.484580210552767e-05, + "loss": 0.2893, + "step": 13156 + }, + { + "epoch": 1.5601802442784298, + "grad_norm": 1.3434666687946861, + "learning_rate": 3.484359569043245e-05, + "loss": 0.313, + "step": 13157 + }, + { + "epoch": 1.560298826040555, + "grad_norm": 0.75282790743166, + "learning_rate": 3.4841389184591354e-05, + "loss": 0.1262, + "step": 13158 + }, + { + "epoch": 1.5604174078026798, + "grad_norm": 1.0326077936016833, + "learning_rate": 3.483918258802471e-05, + "loss": 0.2633, + "step": 13159 + }, + { + "epoch": 1.560535989564805, + "grad_norm": 0.8636408224343743, + "learning_rate": 3.483697590075288e-05, + "loss": 0.1766, + "step": 13160 + }, + { + "epoch": 1.56065457132693, + "grad_norm": 0.8689257925996184, + "learning_rate": 3.48347691227962e-05, + "loss": 0.1448, + "step": 13161 + }, + { + "epoch": 1.560773153089055, + "grad_norm": 0.9796690272594073, + "learning_rate": 3.4832562254174996e-05, + "loss": 0.163, + "step": 13162 + }, + { + "epoch": 1.56089173485118, + "grad_norm": 1.2703920853303352, + "learning_rate": 3.4830355294909626e-05, + "loss": 0.2188, + "step": 13163 + }, + { + "epoch": 1.561010316613305, + "grad_norm": 1.019503899981079, + "learning_rate": 3.482814824502044e-05, + "loss": 0.2171, + "step": 13164 + }, + { + "epoch": 1.56112889837543, + "grad_norm": 1.0326705985466231, + "learning_rate": 3.4825941104527776e-05, + "loss": 0.1822, + "step": 13165 + }, + { + "epoch": 1.5612474801375549, + "grad_norm": 0.7451861516693179, + "learning_rate": 3.4823733873451986e-05, + "loss": 0.1498, + "step": 13166 + }, + { + "epoch": 1.5613660618996799, + "grad_norm": 1.029665812120613, + "learning_rate": 3.482152655181341e-05, + "loss": 0.1927, + "step": 13167 + }, + { + "epoch": 1.5614846436618048, + "grad_norm": 0.638219144555679, + "learning_rate": 3.481931913963241e-05, + "loss": 0.1288, + "step": 13168 + }, + { + "epoch": 1.5616032254239298, + "grad_norm": 0.8939337582924006, + "learning_rate": 3.481711163692933e-05, + "loss": 0.1928, + "step": 13169 + }, + { + "epoch": 1.5617218071860548, + "grad_norm": 1.2054224263876296, + "learning_rate": 3.4814904043724514e-05, + "loss": 0.2526, + "step": 13170 + }, + { + "epoch": 1.5618403889481798, + "grad_norm": 0.7357278429336755, + "learning_rate": 3.481269636003832e-05, + "loss": 0.1353, + "step": 13171 + }, + { + "epoch": 1.5619589707103048, + "grad_norm": 1.238856399947145, + "learning_rate": 3.4810488585891106e-05, + "loss": 0.2339, + "step": 13172 + }, + { + "epoch": 1.5620775524724297, + "grad_norm": 1.1983088503602157, + "learning_rate": 3.480828072130321e-05, + "loss": 0.2503, + "step": 13173 + }, + { + "epoch": 1.5621961342345547, + "grad_norm": 1.204405699804078, + "learning_rate": 3.4806072766295e-05, + "loss": 0.2597, + "step": 13174 + }, + { + "epoch": 1.5623147159966797, + "grad_norm": 1.2380819521129187, + "learning_rate": 3.480386472088681e-05, + "loss": 0.3873, + "step": 13175 + }, + { + "epoch": 1.5624332977588047, + "grad_norm": 1.0893692460197961, + "learning_rate": 3.480165658509903e-05, + "loss": 0.2768, + "step": 13176 + }, + { + "epoch": 1.5625518795209297, + "grad_norm": 0.8569574255706042, + "learning_rate": 3.479944835895198e-05, + "loss": 0.1614, + "step": 13177 + }, + { + "epoch": 1.5626704612830546, + "grad_norm": 0.7668269586692791, + "learning_rate": 3.479724004246604e-05, + "loss": 0.169, + "step": 13178 + }, + { + "epoch": 1.5627890430451796, + "grad_norm": 0.6682922608541269, + "learning_rate": 3.479503163566156e-05, + "loss": 0.1472, + "step": 13179 + }, + { + "epoch": 1.5629076248073046, + "grad_norm": 0.9389542021314308, + "learning_rate": 3.4792823138558896e-05, + "loss": 0.1868, + "step": 13180 + }, + { + "epoch": 1.5630262065694296, + "grad_norm": 0.8916742226957423, + "learning_rate": 3.479061455117841e-05, + "loss": 0.2176, + "step": 13181 + }, + { + "epoch": 1.5631447883315546, + "grad_norm": 1.0662468727269692, + "learning_rate": 3.478840587354047e-05, + "loss": 0.188, + "step": 13182 + }, + { + "epoch": 1.5632633700936795, + "grad_norm": 0.8892597472391855, + "learning_rate": 3.478619710566543e-05, + "loss": 0.2375, + "step": 13183 + }, + { + "epoch": 1.5633819518558045, + "grad_norm": 0.7259959465142727, + "learning_rate": 3.4783988247573654e-05, + "loss": 0.1825, + "step": 13184 + }, + { + "epoch": 1.5635005336179295, + "grad_norm": 1.3366955204197444, + "learning_rate": 3.478177929928551e-05, + "loss": 0.3019, + "step": 13185 + }, + { + "epoch": 1.5636191153800545, + "grad_norm": 1.0812665846776774, + "learning_rate": 3.4779570260821346e-05, + "loss": 0.2609, + "step": 13186 + }, + { + "epoch": 1.5637376971421797, + "grad_norm": 1.0869951484186313, + "learning_rate": 3.477736113220154e-05, + "loss": 0.1765, + "step": 13187 + }, + { + "epoch": 1.5638562789043045, + "grad_norm": 1.718947982543437, + "learning_rate": 3.477515191344645e-05, + "loss": 0.4162, + "step": 13188 + }, + { + "epoch": 1.5639748606664297, + "grad_norm": 0.8352294110688225, + "learning_rate": 3.477294260457645e-05, + "loss": 0.2067, + "step": 13189 + }, + { + "epoch": 1.5640934424285544, + "grad_norm": 0.8328858629247802, + "learning_rate": 3.47707332056119e-05, + "loss": 0.17, + "step": 13190 + }, + { + "epoch": 1.5642120241906796, + "grad_norm": 0.920419847545811, + "learning_rate": 3.476852371657318e-05, + "loss": 0.2326, + "step": 13191 + }, + { + "epoch": 1.5643306059528044, + "grad_norm": 0.9075128141393741, + "learning_rate": 3.4766314137480635e-05, + "loss": 0.1937, + "step": 13192 + }, + { + "epoch": 1.5644491877149296, + "grad_norm": 0.8884059638869388, + "learning_rate": 3.476410446835467e-05, + "loss": 0.2124, + "step": 13193 + }, + { + "epoch": 1.5645677694770543, + "grad_norm": 0.7438938334856057, + "learning_rate": 3.4761894709215625e-05, + "loss": 0.1661, + "step": 13194 + }, + { + "epoch": 1.5646863512391795, + "grad_norm": 0.6207332193627888, + "learning_rate": 3.475968486008388e-05, + "loss": 0.1408, + "step": 13195 + }, + { + "epoch": 1.5648049330013043, + "grad_norm": 0.7970662726306347, + "learning_rate": 3.4757474920979814e-05, + "loss": 0.1831, + "step": 13196 + }, + { + "epoch": 1.5649235147634295, + "grad_norm": 1.0948669748978865, + "learning_rate": 3.4755264891923787e-05, + "loss": 0.1935, + "step": 13197 + }, + { + "epoch": 1.5650420965255543, + "grad_norm": 0.9483677777044881, + "learning_rate": 3.4753054772936184e-05, + "loss": 0.1652, + "step": 13198 + }, + { + "epoch": 1.5651606782876795, + "grad_norm": 0.9375258536586146, + "learning_rate": 3.4750844564037376e-05, + "loss": 0.2012, + "step": 13199 + }, + { + "epoch": 1.5652792600498042, + "grad_norm": 1.0808514467618617, + "learning_rate": 3.4748634265247745e-05, + "loss": 0.2269, + "step": 13200 + }, + { + "epoch": 1.5653978418119294, + "grad_norm": 1.5372384586298424, + "learning_rate": 3.4746423876587654e-05, + "loss": 0.3357, + "step": 13201 + }, + { + "epoch": 1.5655164235740542, + "grad_norm": 0.9955527433387563, + "learning_rate": 3.4744213398077486e-05, + "loss": 0.1565, + "step": 13202 + }, + { + "epoch": 1.5656350053361794, + "grad_norm": 1.2086227462878647, + "learning_rate": 3.474200282973762e-05, + "loss": 0.3203, + "step": 13203 + }, + { + "epoch": 1.5657535870983041, + "grad_norm": 1.0048366896537224, + "learning_rate": 3.4739792171588445e-05, + "loss": 0.2073, + "step": 13204 + }, + { + "epoch": 1.5658721688604293, + "grad_norm": 0.9223992955295633, + "learning_rate": 3.473758142365031e-05, + "loss": 0.1863, + "step": 13205 + }, + { + "epoch": 1.565990750622554, + "grad_norm": 1.2426688099356693, + "learning_rate": 3.473537058594363e-05, + "loss": 0.2627, + "step": 13206 + }, + { + "epoch": 1.5661093323846793, + "grad_norm": 1.382349015499398, + "learning_rate": 3.473315965848877e-05, + "loss": 0.2615, + "step": 13207 + }, + { + "epoch": 1.566227914146804, + "grad_norm": 1.0100553360160247, + "learning_rate": 3.473094864130611e-05, + "loss": 0.2707, + "step": 13208 + }, + { + "epoch": 1.5663464959089293, + "grad_norm": 0.9363426345892094, + "learning_rate": 3.472873753441603e-05, + "loss": 0.2228, + "step": 13209 + }, + { + "epoch": 1.566465077671054, + "grad_norm": 0.8672278435320432, + "learning_rate": 3.472652633783893e-05, + "loss": 0.1778, + "step": 13210 + }, + { + "epoch": 1.5665836594331792, + "grad_norm": 1.0135695399107922, + "learning_rate": 3.4724315051595186e-05, + "loss": 0.2467, + "step": 13211 + }, + { + "epoch": 1.5667022411953042, + "grad_norm": 0.8248195741559083, + "learning_rate": 3.472210367570518e-05, + "loss": 0.1452, + "step": 13212 + }, + { + "epoch": 1.5668208229574292, + "grad_norm": 1.0866939203848895, + "learning_rate": 3.47198922101893e-05, + "loss": 0.2249, + "step": 13213 + }, + { + "epoch": 1.5669394047195542, + "grad_norm": 0.7831122653160899, + "learning_rate": 3.4717680655067926e-05, + "loss": 0.1746, + "step": 13214 + }, + { + "epoch": 1.5670579864816792, + "grad_norm": 1.0833003639953835, + "learning_rate": 3.471546901036146e-05, + "loss": 0.2007, + "step": 13215 + }, + { + "epoch": 1.5671765682438041, + "grad_norm": 1.4005742853964025, + "learning_rate": 3.471325727609028e-05, + "loss": 0.2518, + "step": 13216 + }, + { + "epoch": 1.5672951500059291, + "grad_norm": 1.2181356264479462, + "learning_rate": 3.4711045452274784e-05, + "loss": 0.2428, + "step": 13217 + }, + { + "epoch": 1.567413731768054, + "grad_norm": 0.6814681584806378, + "learning_rate": 3.470883353893535e-05, + "loss": 0.149, + "step": 13218 + }, + { + "epoch": 1.567532313530179, + "grad_norm": 0.9608356106472743, + "learning_rate": 3.4706621536092385e-05, + "loss": 0.2174, + "step": 13219 + }, + { + "epoch": 1.567650895292304, + "grad_norm": 0.9581203430609055, + "learning_rate": 3.4704409443766264e-05, + "loss": 0.2342, + "step": 13220 + }, + { + "epoch": 1.567769477054429, + "grad_norm": 1.2738806275813774, + "learning_rate": 3.47021972619774e-05, + "loss": 0.2597, + "step": 13221 + }, + { + "epoch": 1.567888058816554, + "grad_norm": 0.9149980568339752, + "learning_rate": 3.469998499074616e-05, + "loss": 0.1913, + "step": 13222 + }, + { + "epoch": 1.568006640578679, + "grad_norm": 1.079628212083828, + "learning_rate": 3.4697772630092965e-05, + "loss": 0.2206, + "step": 13223 + }, + { + "epoch": 1.568125222340804, + "grad_norm": 1.0741132240037679, + "learning_rate": 3.469556018003819e-05, + "loss": 0.2249, + "step": 13224 + }, + { + "epoch": 1.568243804102929, + "grad_norm": 1.4656309764865003, + "learning_rate": 3.469334764060225e-05, + "loss": 0.2927, + "step": 13225 + }, + { + "epoch": 1.568362385865054, + "grad_norm": 0.8148552502795045, + "learning_rate": 3.4691135011805525e-05, + "loss": 0.1777, + "step": 13226 + }, + { + "epoch": 1.568480967627179, + "grad_norm": 1.351307862582582, + "learning_rate": 3.4688922293668416e-05, + "loss": 0.2968, + "step": 13227 + }, + { + "epoch": 1.568599549389304, + "grad_norm": 0.912162892251187, + "learning_rate": 3.468670948621133e-05, + "loss": 0.2094, + "step": 13228 + }, + { + "epoch": 1.5687181311514289, + "grad_norm": 0.841427170544016, + "learning_rate": 3.468449658945466e-05, + "loss": 0.1485, + "step": 13229 + }, + { + "epoch": 1.5688367129135539, + "grad_norm": 1.206975331036354, + "learning_rate": 3.468228360341881e-05, + "loss": 0.27, + "step": 13230 + }, + { + "epoch": 1.5689552946756788, + "grad_norm": 0.8494602496163988, + "learning_rate": 3.468007052812417e-05, + "loss": 0.1788, + "step": 13231 + }, + { + "epoch": 1.5690738764378038, + "grad_norm": 0.9434594031383579, + "learning_rate": 3.467785736359116e-05, + "loss": 0.2507, + "step": 13232 + }, + { + "epoch": 1.5691924581999288, + "grad_norm": 0.923356628447899, + "learning_rate": 3.467564410984017e-05, + "loss": 0.1933, + "step": 13233 + }, + { + "epoch": 1.5693110399620538, + "grad_norm": 0.6845619841609953, + "learning_rate": 3.4673430766891604e-05, + "loss": 0.1472, + "step": 13234 + }, + { + "epoch": 1.5694296217241788, + "grad_norm": 0.7537206593042752, + "learning_rate": 3.4671217334765867e-05, + "loss": 0.147, + "step": 13235 + }, + { + "epoch": 1.569548203486304, + "grad_norm": 0.8939821243257468, + "learning_rate": 3.466900381348337e-05, + "loss": 0.2187, + "step": 13236 + }, + { + "epoch": 1.5696667852484287, + "grad_norm": 0.7676365612441697, + "learning_rate": 3.466679020306451e-05, + "loss": 0.1811, + "step": 13237 + }, + { + "epoch": 1.569785367010554, + "grad_norm": 0.919819351966761, + "learning_rate": 3.46645765035297e-05, + "loss": 0.2263, + "step": 13238 + }, + { + "epoch": 1.5699039487726787, + "grad_norm": 0.878796048344292, + "learning_rate": 3.466236271489935e-05, + "loss": 0.1863, + "step": 13239 + }, + { + "epoch": 1.570022530534804, + "grad_norm": 0.8706017186377146, + "learning_rate": 3.466014883719386e-05, + "loss": 0.206, + "step": 13240 + }, + { + "epoch": 1.5701411122969287, + "grad_norm": 1.0578735279385667, + "learning_rate": 3.465793487043365e-05, + "loss": 0.2153, + "step": 13241 + }, + { + "epoch": 1.5702596940590539, + "grad_norm": 0.6348067258936739, + "learning_rate": 3.4655720814639116e-05, + "loss": 0.1356, + "step": 13242 + }, + { + "epoch": 1.5703782758211786, + "grad_norm": 1.1682154473928497, + "learning_rate": 3.465350666983068e-05, + "loss": 0.1845, + "step": 13243 + }, + { + "epoch": 1.5704968575833038, + "grad_norm": 0.924577258662925, + "learning_rate": 3.465129243602875e-05, + "loss": 0.22, + "step": 13244 + }, + { + "epoch": 1.5706154393454286, + "grad_norm": 1.022088270546706, + "learning_rate": 3.4649078113253746e-05, + "loss": 0.1672, + "step": 13245 + }, + { + "epoch": 1.5707340211075538, + "grad_norm": 0.8683372750122006, + "learning_rate": 3.4646863701526064e-05, + "loss": 0.21, + "step": 13246 + }, + { + "epoch": 1.5708526028696785, + "grad_norm": 1.0484633275469653, + "learning_rate": 3.464464920086613e-05, + "loss": 0.2181, + "step": 13247 + }, + { + "epoch": 1.5709711846318037, + "grad_norm": 0.8961848419924358, + "learning_rate": 3.464243461129436e-05, + "loss": 0.1625, + "step": 13248 + }, + { + "epoch": 1.5710897663939285, + "grad_norm": 1.3231312574574834, + "learning_rate": 3.464021993283118e-05, + "loss": 0.3024, + "step": 13249 + }, + { + "epoch": 1.5712083481560537, + "grad_norm": 1.1374632186384155, + "learning_rate": 3.463800516549697e-05, + "loss": 0.1914, + "step": 13250 + }, + { + "epoch": 1.5713269299181785, + "grad_norm": 0.8911519714801194, + "learning_rate": 3.463579030931219e-05, + "loss": 0.1726, + "step": 13251 + }, + { + "epoch": 1.5714455116803037, + "grad_norm": 1.0335887920985332, + "learning_rate": 3.463357536429723e-05, + "loss": 0.2237, + "step": 13252 + }, + { + "epoch": 1.5715640934424284, + "grad_norm": 0.6823741749774485, + "learning_rate": 3.4631360330472526e-05, + "loss": 0.1424, + "step": 13253 + }, + { + "epoch": 1.5716826752045536, + "grad_norm": 0.8542246357711445, + "learning_rate": 3.4629145207858484e-05, + "loss": 0.186, + "step": 13254 + }, + { + "epoch": 1.5718012569666784, + "grad_norm": 1.0686970877889392, + "learning_rate": 3.4626929996475524e-05, + "loss": 0.1838, + "step": 13255 + }, + { + "epoch": 1.5719198387288036, + "grad_norm": 1.3792189562540869, + "learning_rate": 3.4624714696344084e-05, + "loss": 0.2749, + "step": 13256 + }, + { + "epoch": 1.5720384204909283, + "grad_norm": 0.8988524140555754, + "learning_rate": 3.462249930748458e-05, + "loss": 0.1558, + "step": 13257 + }, + { + "epoch": 1.5721570022530535, + "grad_norm": 0.9512910008584282, + "learning_rate": 3.4620283829917425e-05, + "loss": 0.1971, + "step": 13258 + }, + { + "epoch": 1.5722755840151783, + "grad_norm": 0.899238095453484, + "learning_rate": 3.461806826366305e-05, + "loss": 0.2091, + "step": 13259 + }, + { + "epoch": 1.5723941657773035, + "grad_norm": 1.1220614907948745, + "learning_rate": 3.461585260874189e-05, + "loss": 0.2079, + "step": 13260 + }, + { + "epoch": 1.5725127475394285, + "grad_norm": 0.9960531939814639, + "learning_rate": 3.461363686517435e-05, + "loss": 0.1867, + "step": 13261 + }, + { + "epoch": 1.5726313293015535, + "grad_norm": 1.0532916340049345, + "learning_rate": 3.461142103298087e-05, + "loss": 0.2352, + "step": 13262 + }, + { + "epoch": 1.5727499110636785, + "grad_norm": 1.0949180794393114, + "learning_rate": 3.460920511218188e-05, + "loss": 0.2, + "step": 13263 + }, + { + "epoch": 1.5728684928258034, + "grad_norm": 0.9889669595306804, + "learning_rate": 3.46069891027978e-05, + "loss": 0.1754, + "step": 13264 + }, + { + "epoch": 1.5729870745879284, + "grad_norm": 1.0037155632001806, + "learning_rate": 3.4604773004849064e-05, + "loss": 0.1705, + "step": 13265 + }, + { + "epoch": 1.5731056563500534, + "grad_norm": 0.7864541694755648, + "learning_rate": 3.460255681835609e-05, + "loss": 0.1821, + "step": 13266 + }, + { + "epoch": 1.5732242381121784, + "grad_norm": 1.0354486698217673, + "learning_rate": 3.4600340543339324e-05, + "loss": 0.2369, + "step": 13267 + }, + { + "epoch": 1.5733428198743034, + "grad_norm": 1.6961051388217772, + "learning_rate": 3.459812417981919e-05, + "loss": 0.2997, + "step": 13268 + }, + { + "epoch": 1.5734614016364283, + "grad_norm": 0.9972658033609357, + "learning_rate": 3.459590772781612e-05, + "loss": 0.1898, + "step": 13269 + }, + { + "epoch": 1.5735799833985533, + "grad_norm": 1.503612441506859, + "learning_rate": 3.459369118735054e-05, + "loss": 0.3582, + "step": 13270 + }, + { + "epoch": 1.5736985651606783, + "grad_norm": 0.9628108585596351, + "learning_rate": 3.4591474558442904e-05, + "loss": 0.2212, + "step": 13271 + }, + { + "epoch": 1.5738171469228033, + "grad_norm": 0.8123474317896026, + "learning_rate": 3.458925784111363e-05, + "loss": 0.1465, + "step": 13272 + }, + { + "epoch": 1.5739357286849283, + "grad_norm": 1.0025928295418158, + "learning_rate": 3.458704103538316e-05, + "loss": 0.2421, + "step": 13273 + }, + { + "epoch": 1.5740543104470532, + "grad_norm": 1.0218360777146323, + "learning_rate": 3.458482414127193e-05, + "loss": 0.2267, + "step": 13274 + }, + { + "epoch": 1.5741728922091782, + "grad_norm": 0.8185532441417953, + "learning_rate": 3.458260715880037e-05, + "loss": 0.1633, + "step": 13275 + }, + { + "epoch": 1.5742914739713032, + "grad_norm": 0.7502598974981589, + "learning_rate": 3.458039008798892e-05, + "loss": 0.1664, + "step": 13276 + }, + { + "epoch": 1.5744100557334282, + "grad_norm": 1.2935494061052137, + "learning_rate": 3.4578172928858035e-05, + "loss": 0.2982, + "step": 13277 + }, + { + "epoch": 1.5745286374955532, + "grad_norm": 1.0735427178719006, + "learning_rate": 3.4575955681428125e-05, + "loss": 0.2577, + "step": 13278 + }, + { + "epoch": 1.5746472192576781, + "grad_norm": 1.0613615314308766, + "learning_rate": 3.4573738345719656e-05, + "loss": 0.1973, + "step": 13279 + }, + { + "epoch": 1.5747658010198031, + "grad_norm": 0.9863939547759781, + "learning_rate": 3.457152092175305e-05, + "loss": 0.2253, + "step": 13280 + }, + { + "epoch": 1.574884382781928, + "grad_norm": 0.8362483842773701, + "learning_rate": 3.4569303409548774e-05, + "loss": 0.1812, + "step": 13281 + }, + { + "epoch": 1.575002964544053, + "grad_norm": 1.0384628995128522, + "learning_rate": 3.456708580912725e-05, + "loss": 0.1984, + "step": 13282 + }, + { + "epoch": 1.575121546306178, + "grad_norm": 1.4233200935262995, + "learning_rate": 3.456486812050892e-05, + "loss": 0.3227, + "step": 13283 + }, + { + "epoch": 1.575240128068303, + "grad_norm": 1.084707712477007, + "learning_rate": 3.4562650343714244e-05, + "loss": 0.2471, + "step": 13284 + }, + { + "epoch": 1.575358709830428, + "grad_norm": 0.8489281058434864, + "learning_rate": 3.456043247876365e-05, + "loss": 0.197, + "step": 13285 + }, + { + "epoch": 1.575477291592553, + "grad_norm": 1.0129519907256181, + "learning_rate": 3.4558214525677594e-05, + "loss": 0.1987, + "step": 13286 + }, + { + "epoch": 1.5755958733546782, + "grad_norm": 0.9847219047584574, + "learning_rate": 3.455599648447653e-05, + "loss": 0.2343, + "step": 13287 + }, + { + "epoch": 1.575714455116803, + "grad_norm": 0.8994012153287565, + "learning_rate": 3.4553778355180885e-05, + "loss": 0.1861, + "step": 13288 + }, + { + "epoch": 1.5758330368789282, + "grad_norm": 1.14669042992534, + "learning_rate": 3.4551560137811116e-05, + "loss": 0.2186, + "step": 13289 + }, + { + "epoch": 1.575951618641053, + "grad_norm": 0.930285195900521, + "learning_rate": 3.454934183238769e-05, + "loss": 0.2044, + "step": 13290 + }, + { + "epoch": 1.5760702004031781, + "grad_norm": 0.9686223065700884, + "learning_rate": 3.454712343893103e-05, + "loss": 0.1804, + "step": 13291 + }, + { + "epoch": 1.576188782165303, + "grad_norm": 0.7800524405634899, + "learning_rate": 3.454490495746161e-05, + "loss": 0.1572, + "step": 13292 + }, + { + "epoch": 1.576307363927428, + "grad_norm": 1.1040861351371538, + "learning_rate": 3.4542686387999866e-05, + "loss": 0.2591, + "step": 13293 + }, + { + "epoch": 1.5764259456895529, + "grad_norm": 1.3582232380129369, + "learning_rate": 3.454046773056626e-05, + "loss": 0.3502, + "step": 13294 + }, + { + "epoch": 1.576544527451678, + "grad_norm": 0.8073558426448947, + "learning_rate": 3.4538248985181235e-05, + "loss": 0.1783, + "step": 13295 + }, + { + "epoch": 1.5766631092138028, + "grad_norm": 0.9133698742469906, + "learning_rate": 3.4536030151865253e-05, + "loss": 0.2173, + "step": 13296 + }, + { + "epoch": 1.576781690975928, + "grad_norm": 1.0192054082841688, + "learning_rate": 3.453381123063876e-05, + "loss": 0.1745, + "step": 13297 + }, + { + "epoch": 1.5769002727380528, + "grad_norm": 1.1051053217174127, + "learning_rate": 3.453159222152223e-05, + "loss": 0.2444, + "step": 13298 + }, + { + "epoch": 1.577018854500178, + "grad_norm": 1.0020184613019196, + "learning_rate": 3.45293731245361e-05, + "loss": 0.2025, + "step": 13299 + }, + { + "epoch": 1.5771374362623027, + "grad_norm": 1.74153199161126, + "learning_rate": 3.4527153939700834e-05, + "loss": 0.4371, + "step": 13300 + }, + { + "epoch": 1.577256018024428, + "grad_norm": 1.1568727434214314, + "learning_rate": 3.45249346670369e-05, + "loss": 0.291, + "step": 13301 + }, + { + "epoch": 1.5773745997865527, + "grad_norm": 0.9104339222614066, + "learning_rate": 3.452271530656474e-05, + "loss": 0.195, + "step": 13302 + }, + { + "epoch": 1.577493181548678, + "grad_norm": 1.4303890678529187, + "learning_rate": 3.452049585830483e-05, + "loss": 0.3091, + "step": 13303 + }, + { + "epoch": 1.5776117633108027, + "grad_norm": 1.005178135890975, + "learning_rate": 3.451827632227762e-05, + "loss": 0.184, + "step": 13304 + }, + { + "epoch": 1.5777303450729279, + "grad_norm": 0.821670243135691, + "learning_rate": 3.451605669850358e-05, + "loss": 0.1988, + "step": 13305 + }, + { + "epoch": 1.5778489268350526, + "grad_norm": 1.0614104731568546, + "learning_rate": 3.4513836987003156e-05, + "loss": 0.2939, + "step": 13306 + }, + { + "epoch": 1.5779675085971778, + "grad_norm": 0.9544639354613433, + "learning_rate": 3.451161718779682e-05, + "loss": 0.1667, + "step": 13307 + }, + { + "epoch": 1.5780860903593026, + "grad_norm": 1.2972007451231395, + "learning_rate": 3.450939730090504e-05, + "loss": 0.238, + "step": 13308 + }, + { + "epoch": 1.5782046721214278, + "grad_norm": 1.0087216092835123, + "learning_rate": 3.4507177326348294e-05, + "loss": 0.2282, + "step": 13309 + }, + { + "epoch": 1.5783232538835525, + "grad_norm": 1.3316043361636618, + "learning_rate": 3.450495726414701e-05, + "loss": 0.2474, + "step": 13310 + }, + { + "epoch": 1.5784418356456777, + "grad_norm": 1.211564722206576, + "learning_rate": 3.450273711432169e-05, + "loss": 0.291, + "step": 13311 + }, + { + "epoch": 1.5785604174078027, + "grad_norm": 0.9468410188742312, + "learning_rate": 3.4500516876892786e-05, + "loss": 0.1936, + "step": 13312 + }, + { + "epoch": 1.5786789991699277, + "grad_norm": 1.167446615391396, + "learning_rate": 3.449829655188076e-05, + "loss": 0.2802, + "step": 13313 + }, + { + "epoch": 1.5787975809320527, + "grad_norm": 1.294948308961233, + "learning_rate": 3.449607613930609e-05, + "loss": 0.2475, + "step": 13314 + }, + { + "epoch": 1.5789161626941777, + "grad_norm": 0.8217348186237058, + "learning_rate": 3.4493855639189245e-05, + "loss": 0.1804, + "step": 13315 + }, + { + "epoch": 1.5790347444563027, + "grad_norm": 1.010223919669672, + "learning_rate": 3.44916350515507e-05, + "loss": 0.2602, + "step": 13316 + }, + { + "epoch": 1.5791533262184276, + "grad_norm": 0.8657899069850123, + "learning_rate": 3.44894143764109e-05, + "loss": 0.1505, + "step": 13317 + }, + { + "epoch": 1.5792719079805526, + "grad_norm": 1.108770242382398, + "learning_rate": 3.448719361379035e-05, + "loss": 0.2745, + "step": 13318 + }, + { + "epoch": 1.5793904897426776, + "grad_norm": 1.0048057300583726, + "learning_rate": 3.44849727637095e-05, + "loss": 0.2436, + "step": 13319 + }, + { + "epoch": 1.5795090715048026, + "grad_norm": 1.0697620830377153, + "learning_rate": 3.448275182618884e-05, + "loss": 0.2907, + "step": 13320 + }, + { + "epoch": 1.5796276532669276, + "grad_norm": 1.2059743996089733, + "learning_rate": 3.4480530801248836e-05, + "loss": 0.2354, + "step": 13321 + }, + { + "epoch": 1.5797462350290525, + "grad_norm": 0.9079730719560163, + "learning_rate": 3.4478309688909964e-05, + "loss": 0.1625, + "step": 13322 + }, + { + "epoch": 1.5798648167911775, + "grad_norm": 1.052646316018626, + "learning_rate": 3.44760884891927e-05, + "loss": 0.2199, + "step": 13323 + }, + { + "epoch": 1.5799833985533025, + "grad_norm": 0.9825469379620707, + "learning_rate": 3.4473867202117516e-05, + "loss": 0.2095, + "step": 13324 + }, + { + "epoch": 1.5801019803154275, + "grad_norm": 0.9539685836756817, + "learning_rate": 3.4471645827704896e-05, + "loss": 0.2405, + "step": 13325 + }, + { + "epoch": 1.5802205620775525, + "grad_norm": 0.9080364714132103, + "learning_rate": 3.4469424365975314e-05, + "loss": 0.2028, + "step": 13326 + }, + { + "epoch": 1.5803391438396774, + "grad_norm": 0.9345755542007149, + "learning_rate": 3.446720281694925e-05, + "loss": 0.1877, + "step": 13327 + }, + { + "epoch": 1.5804577256018024, + "grad_norm": 1.0467507409669483, + "learning_rate": 3.4464981180647195e-05, + "loss": 0.2353, + "step": 13328 + }, + { + "epoch": 1.5805763073639274, + "grad_norm": 0.9369665676654045, + "learning_rate": 3.446275945708961e-05, + "loss": 0.2291, + "step": 13329 + }, + { + "epoch": 1.5806948891260524, + "grad_norm": 0.9209363048604801, + "learning_rate": 3.4460537646296995e-05, + "loss": 0.2054, + "step": 13330 + }, + { + "epoch": 1.5808134708881774, + "grad_norm": 1.0698326913956147, + "learning_rate": 3.445831574828983e-05, + "loss": 0.2159, + "step": 13331 + }, + { + "epoch": 1.5809320526503023, + "grad_norm": 0.8346544027074242, + "learning_rate": 3.445609376308858e-05, + "loss": 0.153, + "step": 13332 + }, + { + "epoch": 1.5810506344124273, + "grad_norm": 1.0998952218916211, + "learning_rate": 3.445387169071375e-05, + "loss": 0.2213, + "step": 13333 + }, + { + "epoch": 1.5811692161745523, + "grad_norm": 1.0121023526846984, + "learning_rate": 3.44516495311858e-05, + "loss": 0.2665, + "step": 13334 + }, + { + "epoch": 1.5812877979366773, + "grad_norm": 0.8552495813760629, + "learning_rate": 3.444942728452525e-05, + "loss": 0.1224, + "step": 13335 + }, + { + "epoch": 1.5814063796988025, + "grad_norm": 1.0577391107780458, + "learning_rate": 3.444720495075256e-05, + "loss": 0.2064, + "step": 13336 + }, + { + "epoch": 1.5815249614609272, + "grad_norm": 0.9715937448671865, + "learning_rate": 3.4444982529888223e-05, + "loss": 0.1841, + "step": 13337 + }, + { + "epoch": 1.5816435432230525, + "grad_norm": 0.7721940569431855, + "learning_rate": 3.444276002195273e-05, + "loss": 0.1621, + "step": 13338 + }, + { + "epoch": 1.5817621249851772, + "grad_norm": 0.9244563477224146, + "learning_rate": 3.444053742696657e-05, + "loss": 0.1722, + "step": 13339 + }, + { + "epoch": 1.5818807067473024, + "grad_norm": 0.983028807556441, + "learning_rate": 3.443831474495024e-05, + "loss": 0.2262, + "step": 13340 + }, + { + "epoch": 1.5819992885094272, + "grad_norm": 0.8704360610124602, + "learning_rate": 3.443609197592421e-05, + "loss": 0.1484, + "step": 13341 + }, + { + "epoch": 1.5821178702715524, + "grad_norm": 1.1446351547807985, + "learning_rate": 3.443386911990899e-05, + "loss": 0.2226, + "step": 13342 + }, + { + "epoch": 1.5822364520336771, + "grad_norm": 1.462917108506428, + "learning_rate": 3.443164617692507e-05, + "loss": 0.2982, + "step": 13343 + }, + { + "epoch": 1.5823550337958023, + "grad_norm": 1.1205743929911003, + "learning_rate": 3.442942314699293e-05, + "loss": 0.2192, + "step": 13344 + }, + { + "epoch": 1.582473615557927, + "grad_norm": 0.8867768709169482, + "learning_rate": 3.442720003013307e-05, + "loss": 0.1794, + "step": 13345 + }, + { + "epoch": 1.5825921973200523, + "grad_norm": 1.0562728951567968, + "learning_rate": 3.4424976826366e-05, + "loss": 0.2269, + "step": 13346 + }, + { + "epoch": 1.582710779082177, + "grad_norm": 1.0822477203869245, + "learning_rate": 3.4422753535712184e-05, + "loss": 0.1959, + "step": 13347 + }, + { + "epoch": 1.5828293608443023, + "grad_norm": 0.9082446485480822, + "learning_rate": 3.442053015819214e-05, + "loss": 0.2015, + "step": 13348 + }, + { + "epoch": 1.582947942606427, + "grad_norm": 0.9880052406302757, + "learning_rate": 3.4418306693826356e-05, + "loss": 0.1994, + "step": 13349 + }, + { + "epoch": 1.5830665243685522, + "grad_norm": 1.3963316724123471, + "learning_rate": 3.4416083142635345e-05, + "loss": 0.2497, + "step": 13350 + }, + { + "epoch": 1.583185106130677, + "grad_norm": 0.8635802677400557, + "learning_rate": 3.441385950463959e-05, + "loss": 0.1953, + "step": 13351 + }, + { + "epoch": 1.5833036878928022, + "grad_norm": 0.9289946387219047, + "learning_rate": 3.44116357798596e-05, + "loss": 0.1987, + "step": 13352 + }, + { + "epoch": 1.583422269654927, + "grad_norm": 1.1551104484389705, + "learning_rate": 3.440941196831586e-05, + "loss": 0.2914, + "step": 13353 + }, + { + "epoch": 1.5835408514170521, + "grad_norm": 2.1667688792437203, + "learning_rate": 3.440718807002888e-05, + "loss": 0.3846, + "step": 13354 + }, + { + "epoch": 1.583659433179177, + "grad_norm": 0.8698224767590405, + "learning_rate": 3.4404964085019165e-05, + "loss": 0.2052, + "step": 13355 + }, + { + "epoch": 1.583778014941302, + "grad_norm": 0.7030163249635428, + "learning_rate": 3.440274001330722e-05, + "loss": 0.1179, + "step": 13356 + }, + { + "epoch": 1.5838965967034269, + "grad_norm": 0.647089124059883, + "learning_rate": 3.440051585491353e-05, + "loss": 0.1341, + "step": 13357 + }, + { + "epoch": 1.584015178465552, + "grad_norm": 0.8730927867278692, + "learning_rate": 3.439829160985862e-05, + "loss": 0.2041, + "step": 13358 + }, + { + "epoch": 1.5841337602276768, + "grad_norm": 0.8827560309735789, + "learning_rate": 3.439606727816299e-05, + "loss": 0.228, + "step": 13359 + }, + { + "epoch": 1.584252341989802, + "grad_norm": 1.0277225446869869, + "learning_rate": 3.439384285984713e-05, + "loss": 0.1994, + "step": 13360 + }, + { + "epoch": 1.584370923751927, + "grad_norm": 0.605952556297319, + "learning_rate": 3.4391618354931566e-05, + "loss": 0.1246, + "step": 13361 + }, + { + "epoch": 1.584489505514052, + "grad_norm": 0.8879551774132002, + "learning_rate": 3.438939376343679e-05, + "loss": 0.2202, + "step": 13362 + }, + { + "epoch": 1.584608087276177, + "grad_norm": 0.7396459139873478, + "learning_rate": 3.438716908538334e-05, + "loss": 0.1455, + "step": 13363 + }, + { + "epoch": 1.584726669038302, + "grad_norm": 0.8874101237925796, + "learning_rate": 3.4384944320791676e-05, + "loss": 0.1907, + "step": 13364 + }, + { + "epoch": 1.584845250800427, + "grad_norm": 0.6460124031562413, + "learning_rate": 3.438271946968235e-05, + "loss": 0.1805, + "step": 13365 + }, + { + "epoch": 1.584963832562552, + "grad_norm": 0.7004648498347694, + "learning_rate": 3.4380494532075846e-05, + "loss": 0.118, + "step": 13366 + }, + { + "epoch": 1.585082414324677, + "grad_norm": 0.8045458290478564, + "learning_rate": 3.437826950799269e-05, + "loss": 0.1881, + "step": 13367 + }, + { + "epoch": 1.5852009960868019, + "grad_norm": 1.028629407033567, + "learning_rate": 3.437604439745339e-05, + "loss": 0.2397, + "step": 13368 + }, + { + "epoch": 1.5853195778489269, + "grad_norm": 0.8680563726536847, + "learning_rate": 3.437381920047846e-05, + "loss": 0.1946, + "step": 13369 + }, + { + "epoch": 1.5854381596110518, + "grad_norm": 0.718379587088822, + "learning_rate": 3.437159391708841e-05, + "loss": 0.1377, + "step": 13370 + }, + { + "epoch": 1.5855567413731768, + "grad_norm": 0.8302742449303914, + "learning_rate": 3.4369368547303755e-05, + "loss": 0.1529, + "step": 13371 + }, + { + "epoch": 1.5856753231353018, + "grad_norm": 1.0157802229798385, + "learning_rate": 3.436714309114502e-05, + "loss": 0.2017, + "step": 13372 + }, + { + "epoch": 1.5857939048974268, + "grad_norm": 0.9788303415881313, + "learning_rate": 3.43649175486327e-05, + "loss": 0.1806, + "step": 13373 + }, + { + "epoch": 1.5859124866595518, + "grad_norm": 0.7677106527705798, + "learning_rate": 3.436269191978733e-05, + "loss": 0.1531, + "step": 13374 + }, + { + "epoch": 1.5860310684216767, + "grad_norm": 0.7724759454521409, + "learning_rate": 3.436046620462942e-05, + "loss": 0.1572, + "step": 13375 + }, + { + "epoch": 1.5861496501838017, + "grad_norm": 0.9546821654830531, + "learning_rate": 3.4358240403179486e-05, + "loss": 0.2209, + "step": 13376 + }, + { + "epoch": 1.5862682319459267, + "grad_norm": 0.899806909733568, + "learning_rate": 3.435601451545806e-05, + "loss": 0.169, + "step": 13377 + }, + { + "epoch": 1.5863868137080517, + "grad_norm": 0.7435280686984017, + "learning_rate": 3.4353788541485656e-05, + "loss": 0.136, + "step": 13378 + }, + { + "epoch": 1.5865053954701767, + "grad_norm": 0.7300145758135365, + "learning_rate": 3.435156248128279e-05, + "loss": 0.1504, + "step": 13379 + }, + { + "epoch": 1.5866239772323016, + "grad_norm": 0.7411773143008036, + "learning_rate": 3.4349336334869975e-05, + "loss": 0.1457, + "step": 13380 + }, + { + "epoch": 1.5867425589944266, + "grad_norm": 0.8018572702237099, + "learning_rate": 3.434711010226775e-05, + "loss": 0.1452, + "step": 13381 + }, + { + "epoch": 1.5868611407565516, + "grad_norm": 1.0333761641893158, + "learning_rate": 3.434488378349664e-05, + "loss": 0.1884, + "step": 13382 + }, + { + "epoch": 1.5869797225186766, + "grad_norm": 0.791992223882882, + "learning_rate": 3.434265737857715e-05, + "loss": 0.1449, + "step": 13383 + }, + { + "epoch": 1.5870983042808016, + "grad_norm": 1.0461127505212595, + "learning_rate": 3.434043088752982e-05, + "loss": 0.1963, + "step": 13384 + }, + { + "epoch": 1.5872168860429265, + "grad_norm": 0.8765481540758108, + "learning_rate": 3.4338204310375166e-05, + "loss": 0.19, + "step": 13385 + }, + { + "epoch": 1.5873354678050515, + "grad_norm": 1.0835498030768387, + "learning_rate": 3.433597764713372e-05, + "loss": 0.2241, + "step": 13386 + }, + { + "epoch": 1.5874540495671767, + "grad_norm": 0.9268936586179801, + "learning_rate": 3.433375089782601e-05, + "loss": 0.2091, + "step": 13387 + }, + { + "epoch": 1.5875726313293015, + "grad_norm": 0.8934072183969943, + "learning_rate": 3.433152406247257e-05, + "loss": 0.1686, + "step": 13388 + }, + { + "epoch": 1.5876912130914267, + "grad_norm": 0.8680618765073916, + "learning_rate": 3.432929714109392e-05, + "loss": 0.1538, + "step": 13389 + }, + { + "epoch": 1.5878097948535514, + "grad_norm": 0.7298674385989, + "learning_rate": 3.4327070133710585e-05, + "loss": 0.162, + "step": 13390 + }, + { + "epoch": 1.5879283766156767, + "grad_norm": 1.0114817463924457, + "learning_rate": 3.43248430403431e-05, + "loss": 0.2169, + "step": 13391 + }, + { + "epoch": 1.5880469583778014, + "grad_norm": 0.8882729638626742, + "learning_rate": 3.4322615861012e-05, + "loss": 0.146, + "step": 13392 + }, + { + "epoch": 1.5881655401399266, + "grad_norm": 1.0533190911166483, + "learning_rate": 3.432038859573782e-05, + "loss": 0.229, + "step": 13393 + }, + { + "epoch": 1.5882841219020514, + "grad_norm": 1.0232536391715625, + "learning_rate": 3.431816124454108e-05, + "loss": 0.2306, + "step": 13394 + }, + { + "epoch": 1.5884027036641766, + "grad_norm": 0.92904722039629, + "learning_rate": 3.431593380744233e-05, + "loss": 0.2545, + "step": 13395 + }, + { + "epoch": 1.5885212854263013, + "grad_norm": 1.1916362459487368, + "learning_rate": 3.4313706284462076e-05, + "loss": 0.2512, + "step": 13396 + }, + { + "epoch": 1.5886398671884265, + "grad_norm": 1.0017399163717118, + "learning_rate": 3.431147867562088e-05, + "loss": 0.1568, + "step": 13397 + }, + { + "epoch": 1.5887584489505513, + "grad_norm": 0.8528515793108605, + "learning_rate": 3.4309250980939265e-05, + "loss": 0.1697, + "step": 13398 + }, + { + "epoch": 1.5888770307126765, + "grad_norm": 0.7803612440108989, + "learning_rate": 3.430702320043777e-05, + "loss": 0.1722, + "step": 13399 + }, + { + "epoch": 1.5889956124748013, + "grad_norm": 1.053364594399141, + "learning_rate": 3.4304795334136944e-05, + "loss": 0.2425, + "step": 13400 + }, + { + "epoch": 1.5891141942369265, + "grad_norm": 1.1655494449414912, + "learning_rate": 3.430256738205731e-05, + "loss": 0.2493, + "step": 13401 + }, + { + "epoch": 1.5892327759990512, + "grad_norm": 0.773027001214362, + "learning_rate": 3.430033934421942e-05, + "loss": 0.1792, + "step": 13402 + }, + { + "epoch": 1.5893513577611764, + "grad_norm": 1.4027875770414469, + "learning_rate": 3.4298111220643795e-05, + "loss": 0.3214, + "step": 13403 + }, + { + "epoch": 1.5894699395233012, + "grad_norm": 0.8732905483940658, + "learning_rate": 3.429588301135099e-05, + "loss": 0.2244, + "step": 13404 + }, + { + "epoch": 1.5895885212854264, + "grad_norm": 1.0140281134591838, + "learning_rate": 3.429365471636155e-05, + "loss": 0.1802, + "step": 13405 + }, + { + "epoch": 1.5897071030475511, + "grad_norm": 0.8376038961745782, + "learning_rate": 3.4291426335696e-05, + "loss": 0.1797, + "step": 13406 + }, + { + "epoch": 1.5898256848096763, + "grad_norm": 0.7847067724968454, + "learning_rate": 3.4289197869374893e-05, + "loss": 0.1538, + "step": 13407 + }, + { + "epoch": 1.589944266571801, + "grad_norm": 0.870727861574082, + "learning_rate": 3.428696931741878e-05, + "loss": 0.1596, + "step": 13408 + }, + { + "epoch": 1.5900628483339263, + "grad_norm": 1.234153619716242, + "learning_rate": 3.4284740679848196e-05, + "loss": 0.2762, + "step": 13409 + }, + { + "epoch": 1.590181430096051, + "grad_norm": 0.6688878169616422, + "learning_rate": 3.428251195668369e-05, + "loss": 0.1419, + "step": 13410 + }, + { + "epoch": 1.5903000118581763, + "grad_norm": 0.9249975276735752, + "learning_rate": 3.428028314794581e-05, + "loss": 0.1565, + "step": 13411 + }, + { + "epoch": 1.5904185936203012, + "grad_norm": 0.762287401996631, + "learning_rate": 3.427805425365509e-05, + "loss": 0.1782, + "step": 13412 + }, + { + "epoch": 1.5905371753824262, + "grad_norm": 1.2499389616185752, + "learning_rate": 3.4275825273832094e-05, + "loss": 0.2483, + "step": 13413 + }, + { + "epoch": 1.5906557571445512, + "grad_norm": 0.8708550399845539, + "learning_rate": 3.427359620849736e-05, + "loss": 0.203, + "step": 13414 + }, + { + "epoch": 1.5907743389066762, + "grad_norm": 0.7624881814559452, + "learning_rate": 3.4271367057671446e-05, + "loss": 0.1556, + "step": 13415 + }, + { + "epoch": 1.5908929206688012, + "grad_norm": 0.8442363471378119, + "learning_rate": 3.426913782137489e-05, + "loss": 0.1682, + "step": 13416 + }, + { + "epoch": 1.5910115024309261, + "grad_norm": 0.8774588720910099, + "learning_rate": 3.4266908499628256e-05, + "loss": 0.2059, + "step": 13417 + }, + { + "epoch": 1.5911300841930511, + "grad_norm": 0.8568865003902818, + "learning_rate": 3.426467909245209e-05, + "loss": 0.1883, + "step": 13418 + }, + { + "epoch": 1.5912486659551761, + "grad_norm": 0.8144081505097701, + "learning_rate": 3.426244959986694e-05, + "loss": 0.2001, + "step": 13419 + }, + { + "epoch": 1.591367247717301, + "grad_norm": 0.9635752089409804, + "learning_rate": 3.426022002189336e-05, + "loss": 0.1557, + "step": 13420 + }, + { + "epoch": 1.591485829479426, + "grad_norm": 1.0312054535044797, + "learning_rate": 3.4257990358551915e-05, + "loss": 0.1959, + "step": 13421 + }, + { + "epoch": 1.591604411241551, + "grad_norm": 1.358734449422405, + "learning_rate": 3.425576060986315e-05, + "loss": 0.2197, + "step": 13422 + }, + { + "epoch": 1.591722993003676, + "grad_norm": 1.0549960932359117, + "learning_rate": 3.425353077584762e-05, + "loss": 0.2315, + "step": 13423 + }, + { + "epoch": 1.591841574765801, + "grad_norm": 0.8907473004491052, + "learning_rate": 3.425130085652588e-05, + "loss": 0.198, + "step": 13424 + }, + { + "epoch": 1.591960156527926, + "grad_norm": 0.7171634655965656, + "learning_rate": 3.424907085191849e-05, + "loss": 0.1486, + "step": 13425 + }, + { + "epoch": 1.592078738290051, + "grad_norm": 1.0058110851528745, + "learning_rate": 3.424684076204602e-05, + "loss": 0.1857, + "step": 13426 + }, + { + "epoch": 1.592197320052176, + "grad_norm": 0.7709835521422479, + "learning_rate": 3.4244610586929005e-05, + "loss": 0.1451, + "step": 13427 + }, + { + "epoch": 1.592315901814301, + "grad_norm": 1.211577512612124, + "learning_rate": 3.424238032658803e-05, + "loss": 0.2601, + "step": 13428 + }, + { + "epoch": 1.592434483576426, + "grad_norm": 1.23038571733898, + "learning_rate": 3.424014998104363e-05, + "loss": 0.2506, + "step": 13429 + }, + { + "epoch": 1.592553065338551, + "grad_norm": 1.054486948122292, + "learning_rate": 3.423791955031638e-05, + "loss": 0.2329, + "step": 13430 + }, + { + "epoch": 1.5926716471006759, + "grad_norm": 1.0133860509390649, + "learning_rate": 3.4235689034426846e-05, + "loss": 0.2138, + "step": 13431 + }, + { + "epoch": 1.5927902288628009, + "grad_norm": 0.9279083996411258, + "learning_rate": 3.4233458433395583e-05, + "loss": 0.1795, + "step": 13432 + }, + { + "epoch": 1.5929088106249258, + "grad_norm": 0.7417602507377317, + "learning_rate": 3.4231227747243154e-05, + "loss": 0.1456, + "step": 13433 + }, + { + "epoch": 1.5930273923870508, + "grad_norm": 0.9333533362981811, + "learning_rate": 3.422899697599013e-05, + "loss": 0.1856, + "step": 13434 + }, + { + "epoch": 1.5931459741491758, + "grad_norm": 0.8759470289790621, + "learning_rate": 3.422676611965706e-05, + "loss": 0.1855, + "step": 13435 + }, + { + "epoch": 1.593264555911301, + "grad_norm": 2.0830181029560806, + "learning_rate": 3.422453517826453e-05, + "loss": 0.458, + "step": 13436 + }, + { + "epoch": 1.5933831376734258, + "grad_norm": 1.1039251080509307, + "learning_rate": 3.42223041518331e-05, + "loss": 0.2105, + "step": 13437 + }, + { + "epoch": 1.593501719435551, + "grad_norm": 1.0372891205115091, + "learning_rate": 3.4220073040383326e-05, + "loss": 0.2162, + "step": 13438 + }, + { + "epoch": 1.5936203011976757, + "grad_norm": 0.6726639053269128, + "learning_rate": 3.421784184393579e-05, + "loss": 0.1443, + "step": 13439 + }, + { + "epoch": 1.593738882959801, + "grad_norm": 0.9883580562241387, + "learning_rate": 3.4215610562511055e-05, + "loss": 0.1731, + "step": 13440 + }, + { + "epoch": 1.5938574647219257, + "grad_norm": 0.9645077641332059, + "learning_rate": 3.421337919612969e-05, + "loss": 0.2153, + "step": 13441 + }, + { + "epoch": 1.593976046484051, + "grad_norm": 0.8677132104822005, + "learning_rate": 3.421114774481227e-05, + "loss": 0.1964, + "step": 13442 + }, + { + "epoch": 1.5940946282461756, + "grad_norm": 0.7137359257680594, + "learning_rate": 3.4208916208579364e-05, + "loss": 0.1626, + "step": 13443 + }, + { + "epoch": 1.5942132100083009, + "grad_norm": 0.8225049000386762, + "learning_rate": 3.4206684587451544e-05, + "loss": 0.1744, + "step": 13444 + }, + { + "epoch": 1.5943317917704256, + "grad_norm": 0.7959343077777657, + "learning_rate": 3.4204452881449376e-05, + "loss": 0.1783, + "step": 13445 + }, + { + "epoch": 1.5944503735325508, + "grad_norm": 0.9577236411316574, + "learning_rate": 3.420222109059345e-05, + "loss": 0.1877, + "step": 13446 + }, + { + "epoch": 1.5945689552946756, + "grad_norm": 0.8448101355027032, + "learning_rate": 3.4199989214904315e-05, + "loss": 0.1523, + "step": 13447 + }, + { + "epoch": 1.5946875370568008, + "grad_norm": 1.1198492697123907, + "learning_rate": 3.419775725440257e-05, + "loss": 0.2755, + "step": 13448 + }, + { + "epoch": 1.5948061188189255, + "grad_norm": 1.091715182507115, + "learning_rate": 3.419552520910878e-05, + "loss": 0.1886, + "step": 13449 + }, + { + "epoch": 1.5949247005810507, + "grad_norm": 0.9712354691051607, + "learning_rate": 3.4193293079043527e-05, + "loss": 0.2328, + "step": 13450 + }, + { + "epoch": 1.5950432823431755, + "grad_norm": 1.0222561548135638, + "learning_rate": 3.419106086422739e-05, + "loss": 0.2195, + "step": 13451 + }, + { + "epoch": 1.5951618641053007, + "grad_norm": 1.1901934150145486, + "learning_rate": 3.418882856468093e-05, + "loss": 0.2125, + "step": 13452 + }, + { + "epoch": 1.5952804458674255, + "grad_norm": 1.1138767100136906, + "learning_rate": 3.418659618042475e-05, + "loss": 0.2788, + "step": 13453 + }, + { + "epoch": 1.5953990276295507, + "grad_norm": 1.0314012795353706, + "learning_rate": 3.418436371147941e-05, + "loss": 0.2213, + "step": 13454 + }, + { + "epoch": 1.5955176093916754, + "grad_norm": 1.215090006536514, + "learning_rate": 3.4182131157865505e-05, + "loss": 0.2651, + "step": 13455 + }, + { + "epoch": 1.5956361911538006, + "grad_norm": 0.9044588232160928, + "learning_rate": 3.41798985196036e-05, + "loss": 0.1585, + "step": 13456 + }, + { + "epoch": 1.5957547729159254, + "grad_norm": 0.9911071138934332, + "learning_rate": 3.41776657967143e-05, + "loss": 0.1637, + "step": 13457 + }, + { + "epoch": 1.5958733546780506, + "grad_norm": 1.383739523690293, + "learning_rate": 3.4175432989218174e-05, + "loss": 0.3735, + "step": 13458 + }, + { + "epoch": 1.5959919364401753, + "grad_norm": 1.017627788576349, + "learning_rate": 3.417320009713581e-05, + "loss": 0.2059, + "step": 13459 + }, + { + "epoch": 1.5961105182023005, + "grad_norm": 1.0468587537867824, + "learning_rate": 3.4170967120487786e-05, + "loss": 0.2226, + "step": 13460 + }, + { + "epoch": 1.5962290999644255, + "grad_norm": 1.0904646146820587, + "learning_rate": 3.416873405929469e-05, + "loss": 0.2277, + "step": 13461 + }, + { + "epoch": 1.5963476817265505, + "grad_norm": 0.9803992787766296, + "learning_rate": 3.416650091357711e-05, + "loss": 0.2202, + "step": 13462 + }, + { + "epoch": 1.5964662634886755, + "grad_norm": 1.1132615052020571, + "learning_rate": 3.416426768335563e-05, + "loss": 0.2641, + "step": 13463 + }, + { + "epoch": 1.5965848452508005, + "grad_norm": 0.9333248472517162, + "learning_rate": 3.4162034368650845e-05, + "loss": 0.2212, + "step": 13464 + }, + { + "epoch": 1.5967034270129254, + "grad_norm": 0.8153775306673816, + "learning_rate": 3.4159800969483335e-05, + "loss": 0.1508, + "step": 13465 + }, + { + "epoch": 1.5968220087750504, + "grad_norm": 0.7916052970318194, + "learning_rate": 3.4157567485873694e-05, + "loss": 0.1638, + "step": 13466 + }, + { + "epoch": 1.5969405905371754, + "grad_norm": 1.4702995686961848, + "learning_rate": 3.415533391784251e-05, + "loss": 0.3761, + "step": 13467 + }, + { + "epoch": 1.5970591722993004, + "grad_norm": 0.7715947997622719, + "learning_rate": 3.415310026541037e-05, + "loss": 0.1627, + "step": 13468 + }, + { + "epoch": 1.5971777540614254, + "grad_norm": 0.9111882430835674, + "learning_rate": 3.4150866528597876e-05, + "loss": 0.1701, + "step": 13469 + }, + { + "epoch": 1.5972963358235504, + "grad_norm": 0.9484199666799671, + "learning_rate": 3.414863270742561e-05, + "loss": 0.2119, + "step": 13470 + }, + { + "epoch": 1.5974149175856753, + "grad_norm": 0.6520649635307016, + "learning_rate": 3.4146398801914175e-05, + "loss": 0.1697, + "step": 13471 + }, + { + "epoch": 1.5975334993478003, + "grad_norm": 1.1388107825911822, + "learning_rate": 3.414416481208416e-05, + "loss": 0.255, + "step": 13472 + }, + { + "epoch": 1.5976520811099253, + "grad_norm": 1.0733449740540182, + "learning_rate": 3.414193073795615e-05, + "loss": 0.3216, + "step": 13473 + }, + { + "epoch": 1.5977706628720503, + "grad_norm": 1.2312673642099778, + "learning_rate": 3.413969657955075e-05, + "loss": 0.3322, + "step": 13474 + }, + { + "epoch": 1.5978892446341753, + "grad_norm": 1.6892843469683152, + "learning_rate": 3.413746233688856e-05, + "loss": 0.3806, + "step": 13475 + }, + { + "epoch": 1.5980078263963002, + "grad_norm": 0.844824059647215, + "learning_rate": 3.413522800999017e-05, + "loss": 0.1902, + "step": 13476 + }, + { + "epoch": 1.5981264081584252, + "grad_norm": 0.7443518837096292, + "learning_rate": 3.413299359887618e-05, + "loss": 0.1335, + "step": 13477 + }, + { + "epoch": 1.5982449899205502, + "grad_norm": 0.9920964237745092, + "learning_rate": 3.413075910356719e-05, + "loss": 0.2419, + "step": 13478 + }, + { + "epoch": 1.5983635716826752, + "grad_norm": 0.7998344107100257, + "learning_rate": 3.4128524524083804e-05, + "loss": 0.1701, + "step": 13479 + }, + { + "epoch": 1.5984821534448002, + "grad_norm": 1.0423128812326956, + "learning_rate": 3.412628986044661e-05, + "loss": 0.2272, + "step": 13480 + }, + { + "epoch": 1.5986007352069251, + "grad_norm": 0.8284482589375506, + "learning_rate": 3.412405511267622e-05, + "loss": 0.1696, + "step": 13481 + }, + { + "epoch": 1.5987193169690501, + "grad_norm": 1.1648303612595803, + "learning_rate": 3.412182028079322e-05, + "loss": 0.2374, + "step": 13482 + }, + { + "epoch": 1.598837898731175, + "grad_norm": 0.9323866982979495, + "learning_rate": 3.4119585364818225e-05, + "loss": 0.188, + "step": 13483 + }, + { + "epoch": 1.5989564804933, + "grad_norm": 0.8688767386398583, + "learning_rate": 3.411735036477184e-05, + "loss": 0.2072, + "step": 13484 + }, + { + "epoch": 1.599075062255425, + "grad_norm": 0.7346797332143312, + "learning_rate": 3.411511528067466e-05, + "loss": 0.1634, + "step": 13485 + }, + { + "epoch": 1.59919364401755, + "grad_norm": 1.0067270733455558, + "learning_rate": 3.41128801125473e-05, + "loss": 0.174, + "step": 13486 + }, + { + "epoch": 1.5993122257796752, + "grad_norm": 0.728021897912261, + "learning_rate": 3.411064486041036e-05, + "loss": 0.1604, + "step": 13487 + }, + { + "epoch": 1.5994308075418, + "grad_norm": 1.1987512727202745, + "learning_rate": 3.410840952428445e-05, + "loss": 0.3049, + "step": 13488 + }, + { + "epoch": 1.5995493893039252, + "grad_norm": 0.8880635718155879, + "learning_rate": 3.4106174104190166e-05, + "loss": 0.2153, + "step": 13489 + }, + { + "epoch": 1.59966797106605, + "grad_norm": 1.1037324772736972, + "learning_rate": 3.410393860014813e-05, + "loss": 0.2513, + "step": 13490 + }, + { + "epoch": 1.5997865528281752, + "grad_norm": 1.3408036567694623, + "learning_rate": 3.4101703012178945e-05, + "loss": 0.2198, + "step": 13491 + }, + { + "epoch": 1.5999051345903, + "grad_norm": 0.8264223886498777, + "learning_rate": 3.4099467340303216e-05, + "loss": 0.1861, + "step": 13492 + }, + { + "epoch": 1.6000237163524251, + "grad_norm": 0.8634868711335765, + "learning_rate": 3.409723158454156e-05, + "loss": 0.1953, + "step": 13493 + }, + { + "epoch": 1.6001422981145499, + "grad_norm": 0.6849392218817861, + "learning_rate": 3.4094995744914585e-05, + "loss": 0.1777, + "step": 13494 + }, + { + "epoch": 1.600260879876675, + "grad_norm": 1.3082730991254246, + "learning_rate": 3.409275982144289e-05, + "loss": 0.2813, + "step": 13495 + }, + { + "epoch": 1.6003794616387998, + "grad_norm": 0.7481041318409017, + "learning_rate": 3.409052381414711e-05, + "loss": 0.209, + "step": 13496 + }, + { + "epoch": 1.600498043400925, + "grad_norm": 0.8823959029580243, + "learning_rate": 3.4088287723047844e-05, + "loss": 0.2048, + "step": 13497 + }, + { + "epoch": 1.6006166251630498, + "grad_norm": 0.9909934434767602, + "learning_rate": 3.408605154816571e-05, + "loss": 0.1911, + "step": 13498 + }, + { + "epoch": 1.600735206925175, + "grad_norm": 0.8885088019093903, + "learning_rate": 3.4083815289521325e-05, + "loss": 0.2094, + "step": 13499 + }, + { + "epoch": 1.6008537886872998, + "grad_norm": 0.8882085146771737, + "learning_rate": 3.40815789471353e-05, + "loss": 0.1583, + "step": 13500 + }, + { + "epoch": 1.600972370449425, + "grad_norm": 0.8863267014828995, + "learning_rate": 3.4079342521028254e-05, + "loss": 0.1834, + "step": 13501 + }, + { + "epoch": 1.6010909522115497, + "grad_norm": 0.8651881886348568, + "learning_rate": 3.407710601122081e-05, + "loss": 0.1813, + "step": 13502 + }, + { + "epoch": 1.601209533973675, + "grad_norm": 0.9892637603701658, + "learning_rate": 3.407486941773358e-05, + "loss": 0.2121, + "step": 13503 + }, + { + "epoch": 1.6013281157357997, + "grad_norm": 0.8872899848967587, + "learning_rate": 3.4072632740587175e-05, + "loss": 0.184, + "step": 13504 + }, + { + "epoch": 1.601446697497925, + "grad_norm": 0.9433550935921263, + "learning_rate": 3.407039597980222e-05, + "loss": 0.1666, + "step": 13505 + }, + { + "epoch": 1.6015652792600497, + "grad_norm": 0.773664110226806, + "learning_rate": 3.406815913539934e-05, + "loss": 0.1653, + "step": 13506 + }, + { + "epoch": 1.6016838610221749, + "grad_norm": 0.9340644596055254, + "learning_rate": 3.406592220739916e-05, + "loss": 0.1926, + "step": 13507 + }, + { + "epoch": 1.6018024427842996, + "grad_norm": 0.7853581707803934, + "learning_rate": 3.4063685195822284e-05, + "loss": 0.1464, + "step": 13508 + }, + { + "epoch": 1.6019210245464248, + "grad_norm": 0.900357697191691, + "learning_rate": 3.406144810068935e-05, + "loss": 0.1928, + "step": 13509 + }, + { + "epoch": 1.6020396063085496, + "grad_norm": 1.3370616188791227, + "learning_rate": 3.405921092202098e-05, + "loss": 0.298, + "step": 13510 + }, + { + "epoch": 1.6021581880706748, + "grad_norm": 1.8641030229243212, + "learning_rate": 3.405697365983779e-05, + "loss": 0.313, + "step": 13511 + }, + { + "epoch": 1.6022767698327998, + "grad_norm": 1.303538397997438, + "learning_rate": 3.405473631416041e-05, + "loss": 0.2473, + "step": 13512 + }, + { + "epoch": 1.6023953515949247, + "grad_norm": 1.355096838909902, + "learning_rate": 3.405249888500947e-05, + "loss": 0.3161, + "step": 13513 + }, + { + "epoch": 1.6025139333570497, + "grad_norm": 1.2352806256103936, + "learning_rate": 3.405026137240558e-05, + "loss": 0.2662, + "step": 13514 + }, + { + "epoch": 1.6026325151191747, + "grad_norm": 0.7402868544623429, + "learning_rate": 3.4048023776369386e-05, + "loss": 0.1536, + "step": 13515 + }, + { + "epoch": 1.6027510968812997, + "grad_norm": 0.7215650905772116, + "learning_rate": 3.404578609692151e-05, + "loss": 0.1594, + "step": 13516 + }, + { + "epoch": 1.6028696786434247, + "grad_norm": 0.7113366695819432, + "learning_rate": 3.4043548334082566e-05, + "loss": 0.1639, + "step": 13517 + }, + { + "epoch": 1.6029882604055496, + "grad_norm": 0.9807514432619729, + "learning_rate": 3.404131048787321e-05, + "loss": 0.2081, + "step": 13518 + }, + { + "epoch": 1.6031068421676746, + "grad_norm": 0.6698792388612282, + "learning_rate": 3.403907255831405e-05, + "loss": 0.1376, + "step": 13519 + }, + { + "epoch": 1.6032254239297996, + "grad_norm": 1.3451319202272793, + "learning_rate": 3.403683454542573e-05, + "loss": 0.2727, + "step": 13520 + }, + { + "epoch": 1.6033440056919246, + "grad_norm": 0.6851321135475673, + "learning_rate": 3.403459644922888e-05, + "loss": 0.1525, + "step": 13521 + }, + { + "epoch": 1.6034625874540496, + "grad_norm": 0.9362250143819911, + "learning_rate": 3.403235826974413e-05, + "loss": 0.241, + "step": 13522 + }, + { + "epoch": 1.6035811692161746, + "grad_norm": 0.8517756020187245, + "learning_rate": 3.403012000699211e-05, + "loss": 0.1622, + "step": 13523 + }, + { + "epoch": 1.6036997509782995, + "grad_norm": 0.8551200432747054, + "learning_rate": 3.402788166099346e-05, + "loss": 0.1997, + "step": 13524 + }, + { + "epoch": 1.6038183327404245, + "grad_norm": 0.6349959821925, + "learning_rate": 3.40256432317688e-05, + "loss": 0.1476, + "step": 13525 + }, + { + "epoch": 1.6039369145025495, + "grad_norm": 1.1472373520985402, + "learning_rate": 3.402340471933879e-05, + "loss": 0.2344, + "step": 13526 + }, + { + "epoch": 1.6040554962646745, + "grad_norm": 0.9845151270809698, + "learning_rate": 3.402116612372405e-05, + "loss": 0.1988, + "step": 13527 + }, + { + "epoch": 1.6041740780267995, + "grad_norm": 1.0368320768717643, + "learning_rate": 3.4018927444945216e-05, + "loss": 0.2257, + "step": 13528 + }, + { + "epoch": 1.6042926597889244, + "grad_norm": 0.906245701353201, + "learning_rate": 3.4016688683022936e-05, + "loss": 0.2469, + "step": 13529 + }, + { + "epoch": 1.6044112415510494, + "grad_norm": 1.1349569860058075, + "learning_rate": 3.401444983797784e-05, + "loss": 0.2529, + "step": 13530 + }, + { + "epoch": 1.6045298233131744, + "grad_norm": 0.8226979241294041, + "learning_rate": 3.4012210909830574e-05, + "loss": 0.16, + "step": 13531 + }, + { + "epoch": 1.6046484050752994, + "grad_norm": 1.7723820025336667, + "learning_rate": 3.400997189860177e-05, + "loss": 0.3181, + "step": 13532 + }, + { + "epoch": 1.6047669868374244, + "grad_norm": 0.9719278599251597, + "learning_rate": 3.400773280431208e-05, + "loss": 0.1983, + "step": 13533 + }, + { + "epoch": 1.6048855685995493, + "grad_norm": 0.7513047400382211, + "learning_rate": 3.4005493626982135e-05, + "loss": 0.1496, + "step": 13534 + }, + { + "epoch": 1.6050041503616743, + "grad_norm": 0.8075197451710521, + "learning_rate": 3.400325436663259e-05, + "loss": 0.1724, + "step": 13535 + }, + { + "epoch": 1.6051227321237995, + "grad_norm": 0.9466565523991356, + "learning_rate": 3.4001015023284065e-05, + "loss": 0.2019, + "step": 13536 + }, + { + "epoch": 1.6052413138859243, + "grad_norm": 1.7013584477514245, + "learning_rate": 3.399877559695723e-05, + "loss": 0.4311, + "step": 13537 + }, + { + "epoch": 1.6053598956480495, + "grad_norm": 0.7545895964515632, + "learning_rate": 3.399653608767272e-05, + "loss": 0.1816, + "step": 13538 + }, + { + "epoch": 1.6054784774101742, + "grad_norm": 1.048935274319745, + "learning_rate": 3.3994296495451184e-05, + "loss": 0.2007, + "step": 13539 + }, + { + "epoch": 1.6055970591722994, + "grad_norm": 0.7711310317899935, + "learning_rate": 3.3992056820313255e-05, + "loss": 0.1751, + "step": 13540 + }, + { + "epoch": 1.6057156409344242, + "grad_norm": 0.8894213924622344, + "learning_rate": 3.3989817062279586e-05, + "loss": 0.2152, + "step": 13541 + }, + { + "epoch": 1.6058342226965494, + "grad_norm": 1.1045790534147182, + "learning_rate": 3.398757722137084e-05, + "loss": 0.2196, + "step": 13542 + }, + { + "epoch": 1.6059528044586742, + "grad_norm": 1.2889404855573487, + "learning_rate": 3.3985337297607644e-05, + "loss": 0.2677, + "step": 13543 + }, + { + "epoch": 1.6060713862207994, + "grad_norm": 1.2664215108022452, + "learning_rate": 3.398309729101066e-05, + "loss": 0.2185, + "step": 13544 + }, + { + "epoch": 1.6061899679829241, + "grad_norm": 0.9043808989222406, + "learning_rate": 3.398085720160053e-05, + "loss": 0.1925, + "step": 13545 + }, + { + "epoch": 1.6063085497450493, + "grad_norm": 0.9116028894114818, + "learning_rate": 3.397861702939791e-05, + "loss": 0.1901, + "step": 13546 + }, + { + "epoch": 1.606427131507174, + "grad_norm": 1.0767153711932262, + "learning_rate": 3.397637677442346e-05, + "loss": 0.2257, + "step": 13547 + }, + { + "epoch": 1.6065457132692993, + "grad_norm": 0.9561191711667552, + "learning_rate": 3.397413643669782e-05, + "loss": 0.193, + "step": 13548 + }, + { + "epoch": 1.606664295031424, + "grad_norm": 1.0633503536870574, + "learning_rate": 3.397189601624164e-05, + "loss": 0.1792, + "step": 13549 + }, + { + "epoch": 1.6067828767935493, + "grad_norm": 0.7909377657456592, + "learning_rate": 3.396965551307559e-05, + "loss": 0.1506, + "step": 13550 + }, + { + "epoch": 1.606901458555674, + "grad_norm": 0.7212099584136074, + "learning_rate": 3.396741492722031e-05, + "loss": 0.1718, + "step": 13551 + }, + { + "epoch": 1.6070200403177992, + "grad_norm": 0.9788415314540474, + "learning_rate": 3.3965174258696466e-05, + "loss": 0.1838, + "step": 13552 + }, + { + "epoch": 1.607138622079924, + "grad_norm": 1.059660054395232, + "learning_rate": 3.39629335075247e-05, + "loss": 0.2037, + "step": 13553 + }, + { + "epoch": 1.6072572038420492, + "grad_norm": 1.2847542563770173, + "learning_rate": 3.396069267372568e-05, + "loss": 0.2758, + "step": 13554 + }, + { + "epoch": 1.607375785604174, + "grad_norm": 1.4677993605180935, + "learning_rate": 3.3958451757320065e-05, + "loss": 0.3552, + "step": 13555 + }, + { + "epoch": 1.6074943673662991, + "grad_norm": 0.9289287616071418, + "learning_rate": 3.395621075832851e-05, + "loss": 0.1592, + "step": 13556 + }, + { + "epoch": 1.607612949128424, + "grad_norm": 1.1886142770109522, + "learning_rate": 3.395396967677168e-05, + "loss": 0.2572, + "step": 13557 + }, + { + "epoch": 1.607731530890549, + "grad_norm": 1.1438909713878742, + "learning_rate": 3.395172851267022e-05, + "loss": 0.2613, + "step": 13558 + }, + { + "epoch": 1.6078501126526739, + "grad_norm": 0.7972614760463648, + "learning_rate": 3.394948726604481e-05, + "loss": 0.1747, + "step": 13559 + }, + { + "epoch": 1.607968694414799, + "grad_norm": 1.1815429855915538, + "learning_rate": 3.3947245936916095e-05, + "loss": 0.2473, + "step": 13560 + }, + { + "epoch": 1.608087276176924, + "grad_norm": 1.0623454070029827, + "learning_rate": 3.3945004525304754e-05, + "loss": 0.1659, + "step": 13561 + }, + { + "epoch": 1.608205857939049, + "grad_norm": 0.872210123151847, + "learning_rate": 3.394276303123143e-05, + "loss": 0.1654, + "step": 13562 + }, + { + "epoch": 1.608324439701174, + "grad_norm": 0.9517262388283974, + "learning_rate": 3.39405214547168e-05, + "loss": 0.2403, + "step": 13563 + }, + { + "epoch": 1.608443021463299, + "grad_norm": 1.1292200053343784, + "learning_rate": 3.393827979578153e-05, + "loss": 0.2162, + "step": 13564 + }, + { + "epoch": 1.608561603225424, + "grad_norm": 0.9521819178241844, + "learning_rate": 3.3936038054446274e-05, + "loss": 0.1972, + "step": 13565 + }, + { + "epoch": 1.608680184987549, + "grad_norm": 0.9734035684609293, + "learning_rate": 3.3933796230731706e-05, + "loss": 0.2032, + "step": 13566 + }, + { + "epoch": 1.608798766749674, + "grad_norm": 0.7637045836710352, + "learning_rate": 3.39315543246585e-05, + "loss": 0.1781, + "step": 13567 + }, + { + "epoch": 1.608917348511799, + "grad_norm": 1.1618996515184463, + "learning_rate": 3.392931233624731e-05, + "loss": 0.2103, + "step": 13568 + }, + { + "epoch": 1.6090359302739239, + "grad_norm": 0.7128452863015635, + "learning_rate": 3.392707026551881e-05, + "loss": 0.1395, + "step": 13569 + }, + { + "epoch": 1.6091545120360489, + "grad_norm": 0.972746411836376, + "learning_rate": 3.392482811249367e-05, + "loss": 0.2109, + "step": 13570 + }, + { + "epoch": 1.6092730937981738, + "grad_norm": 1.1433809053327062, + "learning_rate": 3.3922585877192557e-05, + "loss": 0.2589, + "step": 13571 + }, + { + "epoch": 1.6093916755602988, + "grad_norm": 0.9992032558866402, + "learning_rate": 3.3920343559636144e-05, + "loss": 0.2706, + "step": 13572 + }, + { + "epoch": 1.6095102573224238, + "grad_norm": 0.9181847723638037, + "learning_rate": 3.3918101159845104e-05, + "loss": 0.2265, + "step": 13573 + }, + { + "epoch": 1.6096288390845488, + "grad_norm": 1.250518235347137, + "learning_rate": 3.391585867784011e-05, + "loss": 0.2629, + "step": 13574 + }, + { + "epoch": 1.6097474208466738, + "grad_norm": 1.0989502994979954, + "learning_rate": 3.3913616113641834e-05, + "loss": 0.2549, + "step": 13575 + }, + { + "epoch": 1.6098660026087988, + "grad_norm": 1.1160069003899873, + "learning_rate": 3.391137346727094e-05, + "loss": 0.2489, + "step": 13576 + }, + { + "epoch": 1.6099845843709237, + "grad_norm": 0.7694789611053993, + "learning_rate": 3.3909130738748105e-05, + "loss": 0.14, + "step": 13577 + }, + { + "epoch": 1.6101031661330487, + "grad_norm": 0.9287851855737399, + "learning_rate": 3.390688792809403e-05, + "loss": 0.1954, + "step": 13578 + }, + { + "epoch": 1.6102217478951737, + "grad_norm": 0.758506650861356, + "learning_rate": 3.3904645035329355e-05, + "loss": 0.1318, + "step": 13579 + }, + { + "epoch": 1.6103403296572987, + "grad_norm": 1.2853399164951371, + "learning_rate": 3.390240206047478e-05, + "loss": 0.2498, + "step": 13580 + }, + { + "epoch": 1.6104589114194237, + "grad_norm": 1.0701881418754882, + "learning_rate": 3.390015900355097e-05, + "loss": 0.2254, + "step": 13581 + }, + { + "epoch": 1.6105774931815486, + "grad_norm": 0.9438365196683013, + "learning_rate": 3.389791586457861e-05, + "loss": 0.2015, + "step": 13582 + }, + { + "epoch": 1.6106960749436736, + "grad_norm": 0.5779318370597929, + "learning_rate": 3.389567264357838e-05, + "loss": 0.1472, + "step": 13583 + }, + { + "epoch": 1.6108146567057986, + "grad_norm": 0.985927092584453, + "learning_rate": 3.389342934057095e-05, + "loss": 0.2157, + "step": 13584 + }, + { + "epoch": 1.6109332384679236, + "grad_norm": 1.3364674397206713, + "learning_rate": 3.389118595557701e-05, + "loss": 0.312, + "step": 13585 + }, + { + "epoch": 1.6110518202300486, + "grad_norm": 1.0041316770231627, + "learning_rate": 3.388894248861724e-05, + "loss": 0.2339, + "step": 13586 + }, + { + "epoch": 1.6111704019921738, + "grad_norm": 0.8772251203687723, + "learning_rate": 3.388669893971232e-05, + "loss": 0.1844, + "step": 13587 + }, + { + "epoch": 1.6112889837542985, + "grad_norm": 0.8485816228409283, + "learning_rate": 3.388445530888293e-05, + "loss": 0.2122, + "step": 13588 + }, + { + "epoch": 1.6114075655164237, + "grad_norm": 1.4006400567697253, + "learning_rate": 3.3882211596149766e-05, + "loss": 0.3414, + "step": 13589 + }, + { + "epoch": 1.6115261472785485, + "grad_norm": 0.9761318398621757, + "learning_rate": 3.38799678015335e-05, + "loss": 0.2103, + "step": 13590 + }, + { + "epoch": 1.6116447290406737, + "grad_norm": 1.0944553669525197, + "learning_rate": 3.387772392505482e-05, + "loss": 0.227, + "step": 13591 + }, + { + "epoch": 1.6117633108027984, + "grad_norm": 0.8897190146954418, + "learning_rate": 3.3875479966734404e-05, + "loss": 0.2019, + "step": 13592 + }, + { + "epoch": 1.6118818925649236, + "grad_norm": 1.2168851302369554, + "learning_rate": 3.387323592659296e-05, + "loss": 0.2939, + "step": 13593 + }, + { + "epoch": 1.6120004743270484, + "grad_norm": 0.957771093158159, + "learning_rate": 3.387099180465115e-05, + "loss": 0.2045, + "step": 13594 + }, + { + "epoch": 1.6121190560891736, + "grad_norm": 1.032516204223673, + "learning_rate": 3.386874760092967e-05, + "loss": 0.1863, + "step": 13595 + }, + { + "epoch": 1.6122376378512984, + "grad_norm": 0.9330910551594108, + "learning_rate": 3.3866503315449225e-05, + "loss": 0.1939, + "step": 13596 + }, + { + "epoch": 1.6123562196134236, + "grad_norm": 0.8905323257331389, + "learning_rate": 3.3864258948230486e-05, + "loss": 0.17, + "step": 13597 + }, + { + "epoch": 1.6124748013755483, + "grad_norm": 0.9363631685014732, + "learning_rate": 3.3862014499294156e-05, + "loss": 0.2031, + "step": 13598 + }, + { + "epoch": 1.6125933831376735, + "grad_norm": 0.7559906888099894, + "learning_rate": 3.3859769968660915e-05, + "loss": 0.145, + "step": 13599 + }, + { + "epoch": 1.6127119648997983, + "grad_norm": 0.8632302460261173, + "learning_rate": 3.385752535635146e-05, + "loss": 0.1915, + "step": 13600 + }, + { + "epoch": 1.6128305466619235, + "grad_norm": 0.8458449082827031, + "learning_rate": 3.385528066238648e-05, + "loss": 0.1362, + "step": 13601 + }, + { + "epoch": 1.6129491284240483, + "grad_norm": 1.0572467947020574, + "learning_rate": 3.3853035886786675e-05, + "loss": 0.1928, + "step": 13602 + }, + { + "epoch": 1.6130677101861735, + "grad_norm": 1.1006473571029887, + "learning_rate": 3.385079102957274e-05, + "loss": 0.2142, + "step": 13603 + }, + { + "epoch": 1.6131862919482982, + "grad_norm": 0.9324211747258991, + "learning_rate": 3.384854609076536e-05, + "loss": 0.1968, + "step": 13604 + }, + { + "epoch": 1.6133048737104234, + "grad_norm": 2.5564974793817843, + "learning_rate": 3.384630107038523e-05, + "loss": 0.4576, + "step": 13605 + }, + { + "epoch": 1.6134234554725482, + "grad_norm": 0.9576276124380596, + "learning_rate": 3.384405596845306e-05, + "loss": 0.1624, + "step": 13606 + }, + { + "epoch": 1.6135420372346734, + "grad_norm": 0.7496703243729987, + "learning_rate": 3.384181078498954e-05, + "loss": 0.1593, + "step": 13607 + }, + { + "epoch": 1.6136606189967981, + "grad_norm": 0.9469569600337734, + "learning_rate": 3.383956552001536e-05, + "loss": 0.2052, + "step": 13608 + }, + { + "epoch": 1.6137792007589233, + "grad_norm": 1.0337092768983647, + "learning_rate": 3.383732017355123e-05, + "loss": 0.2379, + "step": 13609 + }, + { + "epoch": 1.613897782521048, + "grad_norm": 1.3485018596516543, + "learning_rate": 3.383507474561785e-05, + "loss": 0.3628, + "step": 13610 + }, + { + "epoch": 1.6140163642831733, + "grad_norm": 1.1649486844957613, + "learning_rate": 3.383282923623591e-05, + "loss": 0.2259, + "step": 13611 + }, + { + "epoch": 1.6141349460452983, + "grad_norm": 1.0235728135006552, + "learning_rate": 3.383058364542611e-05, + "loss": 0.2752, + "step": 13612 + }, + { + "epoch": 1.6142535278074233, + "grad_norm": 1.2034689234573563, + "learning_rate": 3.382833797320917e-05, + "loss": 0.2488, + "step": 13613 + }, + { + "epoch": 1.6143721095695482, + "grad_norm": 0.9116858919611394, + "learning_rate": 3.382609221960576e-05, + "loss": 0.1992, + "step": 13614 + }, + { + "epoch": 1.6144906913316732, + "grad_norm": 1.0119294344840337, + "learning_rate": 3.382384638463662e-05, + "loss": 0.2244, + "step": 13615 + }, + { + "epoch": 1.6146092730937982, + "grad_norm": 0.7971381777427082, + "learning_rate": 3.382160046832243e-05, + "loss": 0.1531, + "step": 13616 + }, + { + "epoch": 1.6147278548559232, + "grad_norm": 0.7946280995673286, + "learning_rate": 3.3819354470683914e-05, + "loss": 0.174, + "step": 13617 + }, + { + "epoch": 1.6148464366180482, + "grad_norm": 0.8876352261563554, + "learning_rate": 3.3817108391741754e-05, + "loss": 0.1513, + "step": 13618 + }, + { + "epoch": 1.6149650183801731, + "grad_norm": 0.9969255590467208, + "learning_rate": 3.381486223151667e-05, + "loss": 0.2378, + "step": 13619 + }, + { + "epoch": 1.6150836001422981, + "grad_norm": 0.9102081034310795, + "learning_rate": 3.3812615990029364e-05, + "loss": 0.2255, + "step": 13620 + }, + { + "epoch": 1.615202181904423, + "grad_norm": 0.9141708707197608, + "learning_rate": 3.3810369667300544e-05, + "loss": 0.1808, + "step": 13621 + }, + { + "epoch": 1.615320763666548, + "grad_norm": 1.0801083309084234, + "learning_rate": 3.380812326335092e-05, + "loss": 0.1811, + "step": 13622 + }, + { + "epoch": 1.615439345428673, + "grad_norm": 0.8539744516917497, + "learning_rate": 3.380587677820121e-05, + "loss": 0.211, + "step": 13623 + }, + { + "epoch": 1.615557927190798, + "grad_norm": 0.8491010028084615, + "learning_rate": 3.38036302118721e-05, + "loss": 0.2068, + "step": 13624 + }, + { + "epoch": 1.615676508952923, + "grad_norm": 1.232066113800596, + "learning_rate": 3.3801383564384326e-05, + "loss": 0.2229, + "step": 13625 + }, + { + "epoch": 1.615795090715048, + "grad_norm": 1.163170310118204, + "learning_rate": 3.379913683575858e-05, + "loss": 0.2366, + "step": 13626 + }, + { + "epoch": 1.615913672477173, + "grad_norm": 0.9876741576608851, + "learning_rate": 3.379689002601558e-05, + "loss": 0.2385, + "step": 13627 + }, + { + "epoch": 1.616032254239298, + "grad_norm": 0.8380318077750708, + "learning_rate": 3.379464313517606e-05, + "loss": 0.1902, + "step": 13628 + }, + { + "epoch": 1.616150836001423, + "grad_norm": 0.9556700246203035, + "learning_rate": 3.3792396163260696e-05, + "loss": 0.1538, + "step": 13629 + }, + { + "epoch": 1.616269417763548, + "grad_norm": 0.9471218654521043, + "learning_rate": 3.379014911029023e-05, + "loss": 0.2147, + "step": 13630 + }, + { + "epoch": 1.616387999525673, + "grad_norm": 0.7154635155848712, + "learning_rate": 3.378790197628537e-05, + "loss": 0.125, + "step": 13631 + }, + { + "epoch": 1.616506581287798, + "grad_norm": 1.4691132549086028, + "learning_rate": 3.378565476126683e-05, + "loss": 0.3429, + "step": 13632 + }, + { + "epoch": 1.6166251630499229, + "grad_norm": 1.1237551907931338, + "learning_rate": 3.378340746525532e-05, + "loss": 0.2205, + "step": 13633 + }, + { + "epoch": 1.6167437448120479, + "grad_norm": 0.6663581814621451, + "learning_rate": 3.378116008827157e-05, + "loss": 0.1489, + "step": 13634 + }, + { + "epoch": 1.6168623265741728, + "grad_norm": 0.6334973709544391, + "learning_rate": 3.377891263033629e-05, + "loss": 0.137, + "step": 13635 + }, + { + "epoch": 1.616980908336298, + "grad_norm": 1.1126421170450689, + "learning_rate": 3.37766650914702e-05, + "loss": 0.2472, + "step": 13636 + }, + { + "epoch": 1.6170994900984228, + "grad_norm": 0.8748109937749801, + "learning_rate": 3.377441747169402e-05, + "loss": 0.2094, + "step": 13637 + }, + { + "epoch": 1.617218071860548, + "grad_norm": 0.938285937442134, + "learning_rate": 3.3772169771028474e-05, + "loss": 0.2149, + "step": 13638 + }, + { + "epoch": 1.6173366536226728, + "grad_norm": 0.915445794994883, + "learning_rate": 3.376992198949428e-05, + "loss": 0.2014, + "step": 13639 + }, + { + "epoch": 1.617455235384798, + "grad_norm": 1.2191013802450434, + "learning_rate": 3.3767674127112155e-05, + "loss": 0.2428, + "step": 13640 + }, + { + "epoch": 1.6175738171469227, + "grad_norm": 1.0622185770943817, + "learning_rate": 3.376542618390283e-05, + "loss": 0.2307, + "step": 13641 + }, + { + "epoch": 1.617692398909048, + "grad_norm": 0.8969996372962068, + "learning_rate": 3.376317815988702e-05, + "loss": 0.1738, + "step": 13642 + }, + { + "epoch": 1.6178109806711727, + "grad_norm": 0.9060639817723778, + "learning_rate": 3.376093005508546e-05, + "loss": 0.2205, + "step": 13643 + }, + { + "epoch": 1.6179295624332979, + "grad_norm": 0.9315818016114056, + "learning_rate": 3.375868186951887e-05, + "loss": 0.1996, + "step": 13644 + }, + { + "epoch": 1.6180481441954226, + "grad_norm": 1.5527058719801659, + "learning_rate": 3.3756433603207965e-05, + "loss": 0.3372, + "step": 13645 + }, + { + "epoch": 1.6181667259575478, + "grad_norm": 0.8499769363408511, + "learning_rate": 3.375418525617348e-05, + "loss": 0.1696, + "step": 13646 + }, + { + "epoch": 1.6182853077196726, + "grad_norm": 0.8235942486906423, + "learning_rate": 3.3751936828436145e-05, + "loss": 0.1687, + "step": 13647 + }, + { + "epoch": 1.6184038894817978, + "grad_norm": 0.8739519026088557, + "learning_rate": 3.374968832001669e-05, + "loss": 0.2089, + "step": 13648 + }, + { + "epoch": 1.6185224712439226, + "grad_norm": 0.9829261735216673, + "learning_rate": 3.3747439730935835e-05, + "loss": 0.1619, + "step": 13649 + }, + { + "epoch": 1.6186410530060478, + "grad_norm": 1.216753712074762, + "learning_rate": 3.374519106121431e-05, + "loss": 0.2623, + "step": 13650 + }, + { + "epoch": 1.6187596347681725, + "grad_norm": 0.9858459730372046, + "learning_rate": 3.3742942310872855e-05, + "loss": 0.1781, + "step": 13651 + }, + { + "epoch": 1.6188782165302977, + "grad_norm": 1.7411001845849785, + "learning_rate": 3.374069347993218e-05, + "loss": 0.3463, + "step": 13652 + }, + { + "epoch": 1.6189967982924225, + "grad_norm": 1.4723357497902192, + "learning_rate": 3.373844456841305e-05, + "loss": 0.3116, + "step": 13653 + }, + { + "epoch": 1.6191153800545477, + "grad_norm": 1.267954642852169, + "learning_rate": 3.373619557633616e-05, + "loss": 0.2037, + "step": 13654 + }, + { + "epoch": 1.6192339618166725, + "grad_norm": 0.871727794910229, + "learning_rate": 3.373394650372226e-05, + "loss": 0.1903, + "step": 13655 + }, + { + "epoch": 1.6193525435787977, + "grad_norm": 0.7374357485001722, + "learning_rate": 3.3731697350592086e-05, + "loss": 0.1556, + "step": 13656 + }, + { + "epoch": 1.6194711253409224, + "grad_norm": 1.1286260384082023, + "learning_rate": 3.372944811696637e-05, + "loss": 0.2725, + "step": 13657 + }, + { + "epoch": 1.6195897071030476, + "grad_norm": 0.9243529784499751, + "learning_rate": 3.372719880286585e-05, + "loss": 0.2114, + "step": 13658 + }, + { + "epoch": 1.6197082888651724, + "grad_norm": 0.8707619944140542, + "learning_rate": 3.372494940831126e-05, + "loss": 0.2119, + "step": 13659 + }, + { + "epoch": 1.6198268706272976, + "grad_norm": 0.8030247233535237, + "learning_rate": 3.372269993332333e-05, + "loss": 0.1473, + "step": 13660 + }, + { + "epoch": 1.6199454523894226, + "grad_norm": 0.9684179055424077, + "learning_rate": 3.3720450377922804e-05, + "loss": 0.191, + "step": 13661 + }, + { + "epoch": 1.6200640341515475, + "grad_norm": 1.139724217614926, + "learning_rate": 3.371820074213042e-05, + "loss": 0.2273, + "step": 13662 + }, + { + "epoch": 1.6201826159136725, + "grad_norm": 1.0053170911613099, + "learning_rate": 3.371595102596692e-05, + "loss": 0.2145, + "step": 13663 + }, + { + "epoch": 1.6203011976757975, + "grad_norm": 1.2872698819670092, + "learning_rate": 3.371370122945304e-05, + "loss": 0.3245, + "step": 13664 + }, + { + "epoch": 1.6204197794379225, + "grad_norm": 0.9084503813914308, + "learning_rate": 3.371145135260951e-05, + "loss": 0.2285, + "step": 13665 + }, + { + "epoch": 1.6205383612000475, + "grad_norm": 1.0601116024385333, + "learning_rate": 3.370920139545709e-05, + "loss": 0.2723, + "step": 13666 + }, + { + "epoch": 1.6206569429621724, + "grad_norm": 1.1194379840375634, + "learning_rate": 3.3706951358016514e-05, + "loss": 0.3247, + "step": 13667 + }, + { + "epoch": 1.6207755247242974, + "grad_norm": 0.8360860860138564, + "learning_rate": 3.3704701240308525e-05, + "loss": 0.1669, + "step": 13668 + }, + { + "epoch": 1.6208941064864224, + "grad_norm": 1.0024386428734928, + "learning_rate": 3.370245104235386e-05, + "loss": 0.2479, + "step": 13669 + }, + { + "epoch": 1.6210126882485474, + "grad_norm": 0.9855184347802448, + "learning_rate": 3.370020076417327e-05, + "loss": 0.1785, + "step": 13670 + }, + { + "epoch": 1.6211312700106724, + "grad_norm": 0.7857733107456513, + "learning_rate": 3.3697950405787496e-05, + "loss": 0.1812, + "step": 13671 + }, + { + "epoch": 1.6212498517727973, + "grad_norm": 1.2717285673427725, + "learning_rate": 3.369569996721729e-05, + "loss": 0.379, + "step": 13672 + }, + { + "epoch": 1.6213684335349223, + "grad_norm": 0.8184735091620444, + "learning_rate": 3.3693449448483385e-05, + "loss": 0.1967, + "step": 13673 + }, + { + "epoch": 1.6214870152970473, + "grad_norm": 1.05557726058533, + "learning_rate": 3.369119884960654e-05, + "loss": 0.2475, + "step": 13674 + }, + { + "epoch": 1.6216055970591723, + "grad_norm": 0.8918837929186254, + "learning_rate": 3.3688948170607506e-05, + "loss": 0.1515, + "step": 13675 + }, + { + "epoch": 1.6217241788212973, + "grad_norm": 1.5871167293069581, + "learning_rate": 3.368669741150702e-05, + "loss": 0.3201, + "step": 13676 + }, + { + "epoch": 1.6218427605834222, + "grad_norm": 0.8295198969817767, + "learning_rate": 3.3684446572325835e-05, + "loss": 0.2048, + "step": 13677 + }, + { + "epoch": 1.6219613423455472, + "grad_norm": 1.0011743620673579, + "learning_rate": 3.3682195653084704e-05, + "loss": 0.1775, + "step": 13678 + }, + { + "epoch": 1.6220799241076722, + "grad_norm": 0.8083765229905293, + "learning_rate": 3.367994465380438e-05, + "loss": 0.14, + "step": 13679 + }, + { + "epoch": 1.6221985058697972, + "grad_norm": 1.1256861256571837, + "learning_rate": 3.3677693574505604e-05, + "loss": 0.2756, + "step": 13680 + }, + { + "epoch": 1.6223170876319222, + "grad_norm": 0.7687276890863579, + "learning_rate": 3.3675442415209135e-05, + "loss": 0.1671, + "step": 13681 + }, + { + "epoch": 1.6224356693940472, + "grad_norm": 1.6201980872883666, + "learning_rate": 3.3673191175935726e-05, + "loss": 0.3887, + "step": 13682 + }, + { + "epoch": 1.6225542511561721, + "grad_norm": 1.09592672004351, + "learning_rate": 3.367093985670613e-05, + "loss": 0.2065, + "step": 13683 + }, + { + "epoch": 1.6226728329182971, + "grad_norm": 0.9426105707307836, + "learning_rate": 3.36686884575411e-05, + "loss": 0.2428, + "step": 13684 + }, + { + "epoch": 1.6227914146804223, + "grad_norm": 0.8571974409893841, + "learning_rate": 3.36664369784614e-05, + "loss": 0.2022, + "step": 13685 + }, + { + "epoch": 1.622909996442547, + "grad_norm": 1.3707536227124213, + "learning_rate": 3.366418541948777e-05, + "loss": 0.3424, + "step": 13686 + }, + { + "epoch": 1.6230285782046723, + "grad_norm": 1.1490551981078874, + "learning_rate": 3.366193378064098e-05, + "loss": 0.244, + "step": 13687 + }, + { + "epoch": 1.623147159966797, + "grad_norm": 0.8777329585583704, + "learning_rate": 3.365968206194178e-05, + "loss": 0.1756, + "step": 13688 + }, + { + "epoch": 1.6232657417289222, + "grad_norm": 0.99789026074068, + "learning_rate": 3.365743026341093e-05, + "loss": 0.1669, + "step": 13689 + }, + { + "epoch": 1.623384323491047, + "grad_norm": 0.9797915397397243, + "learning_rate": 3.3655178385069197e-05, + "loss": 0.2385, + "step": 13690 + }, + { + "epoch": 1.6235029052531722, + "grad_norm": 0.8450567483241379, + "learning_rate": 3.365292642693732e-05, + "loss": 0.1593, + "step": 13691 + }, + { + "epoch": 1.623621487015297, + "grad_norm": 1.1075711241837154, + "learning_rate": 3.365067438903609e-05, + "loss": 0.223, + "step": 13692 + }, + { + "epoch": 1.6237400687774222, + "grad_norm": 0.7620249215726593, + "learning_rate": 3.364842227138624e-05, + "loss": 0.1936, + "step": 13693 + }, + { + "epoch": 1.623858650539547, + "grad_norm": 0.7810793572857115, + "learning_rate": 3.3646170074008536e-05, + "loss": 0.1404, + "step": 13694 + }, + { + "epoch": 1.6239772323016721, + "grad_norm": 1.0271868094004033, + "learning_rate": 3.364391779692375e-05, + "loss": 0.1893, + "step": 13695 + }, + { + "epoch": 1.6240958140637969, + "grad_norm": 1.1432133151637889, + "learning_rate": 3.3641665440152646e-05, + "loss": 0.2429, + "step": 13696 + }, + { + "epoch": 1.624214395825922, + "grad_norm": 0.7382413316272385, + "learning_rate": 3.3639413003715985e-05, + "loss": 0.1888, + "step": 13697 + }, + { + "epoch": 1.6243329775880468, + "grad_norm": 0.9068104451250998, + "learning_rate": 3.3637160487634524e-05, + "loss": 0.1428, + "step": 13698 + }, + { + "epoch": 1.624451559350172, + "grad_norm": 1.2544357807613191, + "learning_rate": 3.3634907891929046e-05, + "loss": 0.2671, + "step": 13699 + }, + { + "epoch": 1.6245701411122968, + "grad_norm": 1.0386748257401675, + "learning_rate": 3.36326552166203e-05, + "loss": 0.2134, + "step": 13700 + }, + { + "epoch": 1.624688722874422, + "grad_norm": 0.7479820648047979, + "learning_rate": 3.363040246172905e-05, + "loss": 0.1492, + "step": 13701 + }, + { + "epoch": 1.6248073046365468, + "grad_norm": 0.908190874567819, + "learning_rate": 3.362814962727608e-05, + "loss": 0.162, + "step": 13702 + }, + { + "epoch": 1.624925886398672, + "grad_norm": 1.085345193169849, + "learning_rate": 3.362589671328216e-05, + "loss": 0.1992, + "step": 13703 + }, + { + "epoch": 1.6250444681607967, + "grad_norm": 1.109566289833208, + "learning_rate": 3.362364371976804e-05, + "loss": 0.2168, + "step": 13704 + }, + { + "epoch": 1.625163049922922, + "grad_norm": 1.7251536466881205, + "learning_rate": 3.3621390646754496e-05, + "loss": 0.3641, + "step": 13705 + }, + { + "epoch": 1.6252816316850467, + "grad_norm": 0.9940243193765376, + "learning_rate": 3.3619137494262304e-05, + "loss": 0.207, + "step": 13706 + }, + { + "epoch": 1.625400213447172, + "grad_norm": 0.9428053595571985, + "learning_rate": 3.361688426231224e-05, + "loss": 0.1904, + "step": 13707 + }, + { + "epoch": 1.6255187952092967, + "grad_norm": 1.3344008206074287, + "learning_rate": 3.3614630950925066e-05, + "loss": 0.2656, + "step": 13708 + }, + { + "epoch": 1.6256373769714219, + "grad_norm": 0.9405180588123065, + "learning_rate": 3.3612377560121564e-05, + "loss": 0.1636, + "step": 13709 + }, + { + "epoch": 1.6257559587335468, + "grad_norm": 1.90621440343506, + "learning_rate": 3.361012408992249e-05, + "loss": 0.3802, + "step": 13710 + }, + { + "epoch": 1.6258745404956718, + "grad_norm": 1.083135816551555, + "learning_rate": 3.3607870540348655e-05, + "loss": 0.1819, + "step": 13711 + }, + { + "epoch": 1.6259931222577968, + "grad_norm": 0.9926448463457774, + "learning_rate": 3.3605616911420786e-05, + "loss": 0.1691, + "step": 13712 + }, + { + "epoch": 1.6261117040199218, + "grad_norm": 1.0127348348637626, + "learning_rate": 3.360336320315968e-05, + "loss": 0.1979, + "step": 13713 + }, + { + "epoch": 1.6262302857820468, + "grad_norm": 1.1247203056989743, + "learning_rate": 3.360110941558613e-05, + "loss": 0.1996, + "step": 13714 + }, + { + "epoch": 1.6263488675441717, + "grad_norm": 1.1884838810851956, + "learning_rate": 3.3598855548720884e-05, + "loss": 0.2213, + "step": 13715 + }, + { + "epoch": 1.6264674493062967, + "grad_norm": 0.6770383965193688, + "learning_rate": 3.359660160258475e-05, + "loss": 0.1158, + "step": 13716 + }, + { + "epoch": 1.6265860310684217, + "grad_norm": 0.7327824011959738, + "learning_rate": 3.359434757719848e-05, + "loss": 0.1525, + "step": 13717 + }, + { + "epoch": 1.6267046128305467, + "grad_norm": 0.7802458980463394, + "learning_rate": 3.359209347258288e-05, + "loss": 0.182, + "step": 13718 + }, + { + "epoch": 1.6268231945926717, + "grad_norm": 0.7671265860638286, + "learning_rate": 3.35898392887587e-05, + "loss": 0.1706, + "step": 13719 + }, + { + "epoch": 1.6269417763547966, + "grad_norm": 0.6861152293135646, + "learning_rate": 3.3587585025746734e-05, + "loss": 0.157, + "step": 13720 + }, + { + "epoch": 1.6270603581169216, + "grad_norm": 0.9985896885354253, + "learning_rate": 3.3585330683567766e-05, + "loss": 0.1979, + "step": 13721 + }, + { + "epoch": 1.6271789398790466, + "grad_norm": 1.589899374330507, + "learning_rate": 3.358307626224259e-05, + "loss": 0.2985, + "step": 13722 + }, + { + "epoch": 1.6272975216411716, + "grad_norm": 1.0333465580108316, + "learning_rate": 3.3580821761791956e-05, + "loss": 0.247, + "step": 13723 + }, + { + "epoch": 1.6274161034032966, + "grad_norm": 1.2963601545261456, + "learning_rate": 3.357856718223668e-05, + "loss": 0.2525, + "step": 13724 + }, + { + "epoch": 1.6275346851654215, + "grad_norm": 0.9499510709899827, + "learning_rate": 3.357631252359753e-05, + "loss": 0.1866, + "step": 13725 + }, + { + "epoch": 1.6276532669275465, + "grad_norm": 0.649623491850891, + "learning_rate": 3.35740577858953e-05, + "loss": 0.1671, + "step": 13726 + }, + { + "epoch": 1.6277718486896715, + "grad_norm": 0.9407060548933799, + "learning_rate": 3.3571802969150766e-05, + "loss": 0.2154, + "step": 13727 + }, + { + "epoch": 1.6278904304517965, + "grad_norm": 1.0169629598431156, + "learning_rate": 3.356954807338473e-05, + "loss": 0.2442, + "step": 13728 + }, + { + "epoch": 1.6280090122139215, + "grad_norm": 1.0747822166390397, + "learning_rate": 3.3567293098617966e-05, + "loss": 0.2695, + "step": 13729 + }, + { + "epoch": 1.6281275939760464, + "grad_norm": 1.1340566883835383, + "learning_rate": 3.356503804487126e-05, + "loss": 0.2189, + "step": 13730 + }, + { + "epoch": 1.6282461757381714, + "grad_norm": 1.380719376809305, + "learning_rate": 3.356278291216541e-05, + "loss": 0.3575, + "step": 13731 + }, + { + "epoch": 1.6283647575002964, + "grad_norm": 1.215477346044068, + "learning_rate": 3.3560527700521196e-05, + "loss": 0.2562, + "step": 13732 + }, + { + "epoch": 1.6284833392624214, + "grad_norm": 1.0944361030348233, + "learning_rate": 3.3558272409959424e-05, + "loss": 0.1915, + "step": 13733 + }, + { + "epoch": 1.6286019210245464, + "grad_norm": 1.60270632031045, + "learning_rate": 3.355601704050086e-05, + "loss": 0.4594, + "step": 13734 + }, + { + "epoch": 1.6287205027866714, + "grad_norm": 1.055078226526997, + "learning_rate": 3.355376159216633e-05, + "loss": 0.2079, + "step": 13735 + }, + { + "epoch": 1.6288390845487966, + "grad_norm": 1.1762846694768332, + "learning_rate": 3.35515060649766e-05, + "loss": 0.1996, + "step": 13736 + }, + { + "epoch": 1.6289576663109213, + "grad_norm": 1.2359519121642237, + "learning_rate": 3.354925045895247e-05, + "loss": 0.2293, + "step": 13737 + }, + { + "epoch": 1.6290762480730465, + "grad_norm": 0.9027008031495793, + "learning_rate": 3.3546994774114726e-05, + "loss": 0.1775, + "step": 13738 + }, + { + "epoch": 1.6291948298351713, + "grad_norm": 0.6659833684438726, + "learning_rate": 3.3544739010484185e-05, + "loss": 0.1611, + "step": 13739 + }, + { + "epoch": 1.6293134115972965, + "grad_norm": 0.8694023271797118, + "learning_rate": 3.354248316808162e-05, + "loss": 0.1757, + "step": 13740 + }, + { + "epoch": 1.6294319933594212, + "grad_norm": 0.7545629066106005, + "learning_rate": 3.354022724692783e-05, + "loss": 0.1917, + "step": 13741 + }, + { + "epoch": 1.6295505751215464, + "grad_norm": 0.9121861799578702, + "learning_rate": 3.3537971247043634e-05, + "loss": 0.1582, + "step": 13742 + }, + { + "epoch": 1.6296691568836712, + "grad_norm": 1.7021391666069037, + "learning_rate": 3.353571516844981e-05, + "loss": 0.3677, + "step": 13743 + }, + { + "epoch": 1.6297877386457964, + "grad_norm": 1.1203927963031186, + "learning_rate": 3.353345901116715e-05, + "loss": 0.2037, + "step": 13744 + }, + { + "epoch": 1.6299063204079212, + "grad_norm": 1.127484860047878, + "learning_rate": 3.3531202775216467e-05, + "loss": 0.2095, + "step": 13745 + }, + { + "epoch": 1.6300249021700464, + "grad_norm": 0.9851948389590602, + "learning_rate": 3.352894646061855e-05, + "loss": 0.1557, + "step": 13746 + }, + { + "epoch": 1.6301434839321711, + "grad_norm": 0.7627398575204261, + "learning_rate": 3.3526690067394215e-05, + "loss": 0.1683, + "step": 13747 + }, + { + "epoch": 1.6302620656942963, + "grad_norm": 1.2710316588812693, + "learning_rate": 3.352443359556425e-05, + "loss": 0.2423, + "step": 13748 + }, + { + "epoch": 1.630380647456421, + "grad_norm": 0.8249804297505521, + "learning_rate": 3.3522177045149464e-05, + "loss": 0.1644, + "step": 13749 + }, + { + "epoch": 1.6304992292185463, + "grad_norm": 0.8665920208066458, + "learning_rate": 3.351992041617065e-05, + "loss": 0.1699, + "step": 13750 + }, + { + "epoch": 1.630617810980671, + "grad_norm": 0.8666714367893166, + "learning_rate": 3.351766370864862e-05, + "loss": 0.1889, + "step": 13751 + }, + { + "epoch": 1.6307363927427962, + "grad_norm": 1.1035748284195952, + "learning_rate": 3.3515406922604174e-05, + "loss": 0.2543, + "step": 13752 + }, + { + "epoch": 1.630854974504921, + "grad_norm": 0.9771533650475324, + "learning_rate": 3.351315005805812e-05, + "loss": 0.2236, + "step": 13753 + }, + { + "epoch": 1.6309735562670462, + "grad_norm": 1.803395780928687, + "learning_rate": 3.351089311503126e-05, + "loss": 0.3571, + "step": 13754 + }, + { + "epoch": 1.631092138029171, + "grad_norm": 0.8719188826237051, + "learning_rate": 3.3508636093544406e-05, + "loss": 0.1757, + "step": 13755 + }, + { + "epoch": 1.6312107197912962, + "grad_norm": 1.1650072272565815, + "learning_rate": 3.3506378993618356e-05, + "loss": 0.2223, + "step": 13756 + }, + { + "epoch": 1.631329301553421, + "grad_norm": 1.0642389061617772, + "learning_rate": 3.350412181527393e-05, + "loss": 0.1976, + "step": 13757 + }, + { + "epoch": 1.6314478833155461, + "grad_norm": 0.8147563179423233, + "learning_rate": 3.3501864558531926e-05, + "loss": 0.1665, + "step": 13758 + }, + { + "epoch": 1.631566465077671, + "grad_norm": 1.534636508181445, + "learning_rate": 3.349960722341315e-05, + "loss": 0.3018, + "step": 13759 + }, + { + "epoch": 1.631685046839796, + "grad_norm": 0.9075494274052961, + "learning_rate": 3.349734980993843e-05, + "loss": 0.1571, + "step": 13760 + }, + { + "epoch": 1.631803628601921, + "grad_norm": 1.4695273850599475, + "learning_rate": 3.349509231812856e-05, + "loss": 0.2688, + "step": 13761 + }, + { + "epoch": 1.631922210364046, + "grad_norm": 0.9756912059370443, + "learning_rate": 3.349283474800435e-05, + "loss": 0.1837, + "step": 13762 + }, + { + "epoch": 1.632040792126171, + "grad_norm": 0.7838181746426034, + "learning_rate": 3.349057709958663e-05, + "loss": 0.1675, + "step": 13763 + }, + { + "epoch": 1.632159373888296, + "grad_norm": 0.8514098426739066, + "learning_rate": 3.3488319372896193e-05, + "loss": 0.1627, + "step": 13764 + }, + { + "epoch": 1.632277955650421, + "grad_norm": 0.7987923359328409, + "learning_rate": 3.3486061567953864e-05, + "loss": 0.1493, + "step": 13765 + }, + { + "epoch": 1.632396537412546, + "grad_norm": 0.8226991177941013, + "learning_rate": 3.348380368478045e-05, + "loss": 0.1626, + "step": 13766 + }, + { + "epoch": 1.632515119174671, + "grad_norm": 0.8390393213660257, + "learning_rate": 3.348154572339677e-05, + "loss": 0.1666, + "step": 13767 + }, + { + "epoch": 1.632633700936796, + "grad_norm": 1.0701953910350404, + "learning_rate": 3.3479287683823645e-05, + "loss": 0.2242, + "step": 13768 + }, + { + "epoch": 1.632752282698921, + "grad_norm": 0.9561159083701002, + "learning_rate": 3.347702956608188e-05, + "loss": 0.2188, + "step": 13769 + }, + { + "epoch": 1.632870864461046, + "grad_norm": 1.1985856348078674, + "learning_rate": 3.34747713701923e-05, + "loss": 0.2357, + "step": 13770 + }, + { + "epoch": 1.6329894462231709, + "grad_norm": 0.592240433646659, + "learning_rate": 3.3472513096175716e-05, + "loss": 0.1137, + "step": 13771 + }, + { + "epoch": 1.6331080279852959, + "grad_norm": 1.3643913516169943, + "learning_rate": 3.3470254744052956e-05, + "loss": 0.3056, + "step": 13772 + }, + { + "epoch": 1.6332266097474208, + "grad_norm": 0.9262575336299044, + "learning_rate": 3.346799631384484e-05, + "loss": 0.1786, + "step": 13773 + }, + { + "epoch": 1.6333451915095458, + "grad_norm": 0.9918169672841541, + "learning_rate": 3.346573780557217e-05, + "loss": 0.2055, + "step": 13774 + }, + { + "epoch": 1.6334637732716708, + "grad_norm": 0.7449651844058982, + "learning_rate": 3.346347921925578e-05, + "loss": 0.1713, + "step": 13775 + }, + { + "epoch": 1.6335823550337958, + "grad_norm": 0.8010099071540902, + "learning_rate": 3.34612205549165e-05, + "loss": 0.1773, + "step": 13776 + }, + { + "epoch": 1.6337009367959208, + "grad_norm": 1.2262407932251258, + "learning_rate": 3.345896181257513e-05, + "loss": 0.2869, + "step": 13777 + }, + { + "epoch": 1.6338195185580457, + "grad_norm": 0.9640538297597379, + "learning_rate": 3.345670299225252e-05, + "loss": 0.2111, + "step": 13778 + }, + { + "epoch": 1.6339381003201707, + "grad_norm": 0.8275757836018959, + "learning_rate": 3.345444409396946e-05, + "loss": 0.1268, + "step": 13779 + }, + { + "epoch": 1.6340566820822957, + "grad_norm": 0.8268227149554533, + "learning_rate": 3.3452185117746806e-05, + "loss": 0.1735, + "step": 13780 + }, + { + "epoch": 1.6341752638444207, + "grad_norm": 0.7760739403861777, + "learning_rate": 3.344992606360536e-05, + "loss": 0.1791, + "step": 13781 + }, + { + "epoch": 1.6342938456065457, + "grad_norm": 0.7381002693773655, + "learning_rate": 3.344766693156598e-05, + "loss": 0.1617, + "step": 13782 + }, + { + "epoch": 1.6344124273686707, + "grad_norm": 0.853393498544644, + "learning_rate": 3.344540772164945e-05, + "loss": 0.161, + "step": 13783 + }, + { + "epoch": 1.6345310091307956, + "grad_norm": 0.9931689474341445, + "learning_rate": 3.344314843387661e-05, + "loss": 0.1631, + "step": 13784 + }, + { + "epoch": 1.6346495908929208, + "grad_norm": 1.0301050951169208, + "learning_rate": 3.344088906826831e-05, + "loss": 0.2505, + "step": 13785 + }, + { + "epoch": 1.6347681726550456, + "grad_norm": 0.6251763303023079, + "learning_rate": 3.343862962484536e-05, + "loss": 0.142, + "step": 13786 + }, + { + "epoch": 1.6348867544171708, + "grad_norm": 0.9737059012586274, + "learning_rate": 3.3436370103628594e-05, + "loss": 0.1803, + "step": 13787 + }, + { + "epoch": 1.6350053361792956, + "grad_norm": 1.1205057269626946, + "learning_rate": 3.343411050463884e-05, + "loss": 0.1687, + "step": 13788 + }, + { + "epoch": 1.6351239179414208, + "grad_norm": 1.0265850994022747, + "learning_rate": 3.343185082789693e-05, + "loss": 0.2144, + "step": 13789 + }, + { + "epoch": 1.6352424997035455, + "grad_norm": 0.9853557178583822, + "learning_rate": 3.3429591073423694e-05, + "loss": 0.1956, + "step": 13790 + }, + { + "epoch": 1.6353610814656707, + "grad_norm": 0.7870502954286949, + "learning_rate": 3.342733124123997e-05, + "loss": 0.1384, + "step": 13791 + }, + { + "epoch": 1.6354796632277955, + "grad_norm": 1.3150673772233914, + "learning_rate": 3.342507133136659e-05, + "loss": 0.2602, + "step": 13792 + }, + { + "epoch": 1.6355982449899207, + "grad_norm": 1.6570515947376168, + "learning_rate": 3.3422811343824366e-05, + "loss": 0.2699, + "step": 13793 + }, + { + "epoch": 1.6357168267520454, + "grad_norm": 0.7780657506532218, + "learning_rate": 3.342055127863416e-05, + "loss": 0.1377, + "step": 13794 + }, + { + "epoch": 1.6358354085141706, + "grad_norm": 0.9047418022178666, + "learning_rate": 3.341829113581681e-05, + "loss": 0.2104, + "step": 13795 + }, + { + "epoch": 1.6359539902762954, + "grad_norm": 0.9812209423711028, + "learning_rate": 3.341603091539312e-05, + "loss": 0.1616, + "step": 13796 + }, + { + "epoch": 1.6360725720384206, + "grad_norm": 0.8370606591498236, + "learning_rate": 3.341377061738395e-05, + "loss": 0.1263, + "step": 13797 + }, + { + "epoch": 1.6361911538005454, + "grad_norm": 1.4463848587658161, + "learning_rate": 3.341151024181014e-05, + "loss": 0.2098, + "step": 13798 + }, + { + "epoch": 1.6363097355626706, + "grad_norm": 0.9745455336227403, + "learning_rate": 3.340924978869251e-05, + "loss": 0.1782, + "step": 13799 + }, + { + "epoch": 1.6364283173247953, + "grad_norm": 1.1556016282354855, + "learning_rate": 3.340698925805192e-05, + "loss": 0.2651, + "step": 13800 + }, + { + "epoch": 1.6365468990869205, + "grad_norm": 1.2875945476240973, + "learning_rate": 3.340472864990919e-05, + "loss": 0.2081, + "step": 13801 + }, + { + "epoch": 1.6366654808490453, + "grad_norm": 1.0189596392781486, + "learning_rate": 3.340246796428517e-05, + "loss": 0.223, + "step": 13802 + }, + { + "epoch": 1.6367840626111705, + "grad_norm": 0.8986468946541689, + "learning_rate": 3.340020720120071e-05, + "loss": 0.1841, + "step": 13803 + }, + { + "epoch": 1.6369026443732952, + "grad_norm": 0.9977372690294354, + "learning_rate": 3.3397946360676624e-05, + "loss": 0.1824, + "step": 13804 + }, + { + "epoch": 1.6370212261354204, + "grad_norm": 1.0007967336062522, + "learning_rate": 3.339568544273377e-05, + "loss": 0.2078, + "step": 13805 + }, + { + "epoch": 1.6371398078975452, + "grad_norm": 0.9573606853946208, + "learning_rate": 3.339342444739301e-05, + "loss": 0.1526, + "step": 13806 + }, + { + "epoch": 1.6372583896596704, + "grad_norm": 0.9080444191161291, + "learning_rate": 3.3391163374675156e-05, + "loss": 0.177, + "step": 13807 + }, + { + "epoch": 1.6373769714217952, + "grad_norm": 1.5448273269789141, + "learning_rate": 3.338890222460107e-05, + "loss": 0.3354, + "step": 13808 + }, + { + "epoch": 1.6374955531839204, + "grad_norm": 1.384779326481159, + "learning_rate": 3.3386640997191586e-05, + "loss": 0.2624, + "step": 13809 + }, + { + "epoch": 1.6376141349460454, + "grad_norm": 0.6840106123021105, + "learning_rate": 3.338437969246757e-05, + "loss": 0.1352, + "step": 13810 + }, + { + "epoch": 1.6377327167081703, + "grad_norm": 0.9562150873660082, + "learning_rate": 3.3382118310449836e-05, + "loss": 0.2099, + "step": 13811 + }, + { + "epoch": 1.6378512984702953, + "grad_norm": 0.9429511500239804, + "learning_rate": 3.3379856851159267e-05, + "loss": 0.1995, + "step": 13812 + }, + { + "epoch": 1.6379698802324203, + "grad_norm": 0.944505014751812, + "learning_rate": 3.337759531461668e-05, + "loss": 0.201, + "step": 13813 + }, + { + "epoch": 1.6380884619945453, + "grad_norm": 0.7830441918315074, + "learning_rate": 3.337533370084295e-05, + "loss": 0.1863, + "step": 13814 + }, + { + "epoch": 1.6382070437566703, + "grad_norm": 1.007904047970829, + "learning_rate": 3.3373072009858905e-05, + "loss": 0.2015, + "step": 13815 + }, + { + "epoch": 1.6383256255187952, + "grad_norm": 0.9659261930176246, + "learning_rate": 3.3370810241685405e-05, + "loss": 0.2136, + "step": 13816 + }, + { + "epoch": 1.6384442072809202, + "grad_norm": 0.8376393800558297, + "learning_rate": 3.3368548396343306e-05, + "loss": 0.1623, + "step": 13817 + }, + { + "epoch": 1.6385627890430452, + "grad_norm": 0.6537707457930788, + "learning_rate": 3.336628647385345e-05, + "loss": 0.1364, + "step": 13818 + }, + { + "epoch": 1.6386813708051702, + "grad_norm": 1.0544648529577976, + "learning_rate": 3.336402447423669e-05, + "loss": 0.17, + "step": 13819 + }, + { + "epoch": 1.6387999525672952, + "grad_norm": 0.9592757490926909, + "learning_rate": 3.336176239751388e-05, + "loss": 0.2087, + "step": 13820 + }, + { + "epoch": 1.6389185343294201, + "grad_norm": 0.9265357364024033, + "learning_rate": 3.3359500243705885e-05, + "loss": 0.1557, + "step": 13821 + }, + { + "epoch": 1.6390371160915451, + "grad_norm": 1.0366254103195818, + "learning_rate": 3.3357238012833535e-05, + "loss": 0.1667, + "step": 13822 + }, + { + "epoch": 1.63915569785367, + "grad_norm": 0.8213776520878997, + "learning_rate": 3.33549757049177e-05, + "loss": 0.2342, + "step": 13823 + }, + { + "epoch": 1.639274279615795, + "grad_norm": 0.9221586260141729, + "learning_rate": 3.335271331997924e-05, + "loss": 0.1246, + "step": 13824 + }, + { + "epoch": 1.63939286137792, + "grad_norm": 1.0265484314393691, + "learning_rate": 3.3350450858039004e-05, + "loss": 0.2647, + "step": 13825 + }, + { + "epoch": 1.639511443140045, + "grad_norm": 1.2220162756962376, + "learning_rate": 3.3348188319117854e-05, + "loss": 0.2595, + "step": 13826 + }, + { + "epoch": 1.63963002490217, + "grad_norm": 0.8455125257904342, + "learning_rate": 3.334592570323664e-05, + "loss": 0.1616, + "step": 13827 + }, + { + "epoch": 1.639748606664295, + "grad_norm": 0.93221857439907, + "learning_rate": 3.334366301041623e-05, + "loss": 0.2053, + "step": 13828 + }, + { + "epoch": 1.63986718842642, + "grad_norm": 0.9065258833235378, + "learning_rate": 3.334140024067748e-05, + "loss": 0.2131, + "step": 13829 + }, + { + "epoch": 1.639985770188545, + "grad_norm": 1.0120427881699554, + "learning_rate": 3.333913739404125e-05, + "loss": 0.232, + "step": 13830 + }, + { + "epoch": 1.64010435195067, + "grad_norm": 0.9816444681386551, + "learning_rate": 3.333687447052839e-05, + "loss": 0.1769, + "step": 13831 + }, + { + "epoch": 1.640222933712795, + "grad_norm": 0.8194731320585147, + "learning_rate": 3.3334611470159774e-05, + "loss": 0.1503, + "step": 13832 + }, + { + "epoch": 1.64034151547492, + "grad_norm": 0.8416103477092939, + "learning_rate": 3.333234839295626e-05, + "loss": 0.1732, + "step": 13833 + }, + { + "epoch": 1.640460097237045, + "grad_norm": 0.9022962059812977, + "learning_rate": 3.333008523893871e-05, + "loss": 0.2191, + "step": 13834 + }, + { + "epoch": 1.6405786789991699, + "grad_norm": 1.2396835675307525, + "learning_rate": 3.3327822008127996e-05, + "loss": 0.2222, + "step": 13835 + }, + { + "epoch": 1.640697260761295, + "grad_norm": 0.8950424490388157, + "learning_rate": 3.332555870054498e-05, + "loss": 0.2092, + "step": 13836 + }, + { + "epoch": 1.6408158425234198, + "grad_norm": 1.570387363913678, + "learning_rate": 3.332329531621051e-05, + "loss": 0.314, + "step": 13837 + }, + { + "epoch": 1.640934424285545, + "grad_norm": 1.2483607279222857, + "learning_rate": 3.3321031855145476e-05, + "loss": 0.195, + "step": 13838 + }, + { + "epoch": 1.6410530060476698, + "grad_norm": 1.2094921960380967, + "learning_rate": 3.331876831737072e-05, + "loss": 0.2015, + "step": 13839 + }, + { + "epoch": 1.641171587809795, + "grad_norm": 1.6582885557639762, + "learning_rate": 3.331650470290712e-05, + "loss": 0.3944, + "step": 13840 + }, + { + "epoch": 1.6412901695719198, + "grad_norm": 0.8531035861818216, + "learning_rate": 3.331424101177556e-05, + "loss": 0.186, + "step": 13841 + }, + { + "epoch": 1.641408751334045, + "grad_norm": 1.0711551570554148, + "learning_rate": 3.3311977243996886e-05, + "loss": 0.2175, + "step": 13842 + }, + { + "epoch": 1.6415273330961697, + "grad_norm": 0.9740725785293818, + "learning_rate": 3.3309713399591967e-05, + "loss": 0.1725, + "step": 13843 + }, + { + "epoch": 1.641645914858295, + "grad_norm": 0.8511476794468771, + "learning_rate": 3.3307449478581685e-05, + "loss": 0.181, + "step": 13844 + }, + { + "epoch": 1.6417644966204197, + "grad_norm": 1.0255554701313219, + "learning_rate": 3.3305185480986914e-05, + "loss": 0.2371, + "step": 13845 + }, + { + "epoch": 1.6418830783825449, + "grad_norm": 1.2344917940272402, + "learning_rate": 3.33029214068285e-05, + "loss": 0.2754, + "step": 13846 + }, + { + "epoch": 1.6420016601446696, + "grad_norm": 1.0236394865855631, + "learning_rate": 3.3300657256127354e-05, + "loss": 0.1823, + "step": 13847 + }, + { + "epoch": 1.6421202419067948, + "grad_norm": 0.6175532627495581, + "learning_rate": 3.3298393028904315e-05, + "loss": 0.1474, + "step": 13848 + }, + { + "epoch": 1.6422388236689196, + "grad_norm": 0.9885405716833187, + "learning_rate": 3.329612872518027e-05, + "loss": 0.221, + "step": 13849 + }, + { + "epoch": 1.6423574054310448, + "grad_norm": 1.013572765660701, + "learning_rate": 3.329386434497609e-05, + "loss": 0.2688, + "step": 13850 + }, + { + "epoch": 1.6424759871931696, + "grad_norm": 0.8421649981453955, + "learning_rate": 3.329159988831266e-05, + "loss": 0.1908, + "step": 13851 + }, + { + "epoch": 1.6425945689552948, + "grad_norm": 0.7017537414852232, + "learning_rate": 3.328933535521084e-05, + "loss": 0.1451, + "step": 13852 + }, + { + "epoch": 1.6427131507174195, + "grad_norm": 0.8711357825753703, + "learning_rate": 3.3287070745691506e-05, + "loss": 0.1816, + "step": 13853 + }, + { + "epoch": 1.6428317324795447, + "grad_norm": 1.0136839398157238, + "learning_rate": 3.328480605977555e-05, + "loss": 0.2081, + "step": 13854 + }, + { + "epoch": 1.6429503142416695, + "grad_norm": 1.1757713093406859, + "learning_rate": 3.328254129748384e-05, + "loss": 0.23, + "step": 13855 + }, + { + "epoch": 1.6430688960037947, + "grad_norm": 0.7302413228220652, + "learning_rate": 3.3280276458837254e-05, + "loss": 0.1519, + "step": 13856 + }, + { + "epoch": 1.6431874777659194, + "grad_norm": 0.8595289396260452, + "learning_rate": 3.327801154385667e-05, + "loss": 0.1945, + "step": 13857 + }, + { + "epoch": 1.6433060595280446, + "grad_norm": 1.0779159300957744, + "learning_rate": 3.327574655256298e-05, + "loss": 0.2641, + "step": 13858 + }, + { + "epoch": 1.6434246412901694, + "grad_norm": 0.9263287691564698, + "learning_rate": 3.3273481484977056e-05, + "loss": 0.1679, + "step": 13859 + }, + { + "epoch": 1.6435432230522946, + "grad_norm": 0.7378900284119562, + "learning_rate": 3.3271216341119776e-05, + "loss": 0.185, + "step": 13860 + }, + { + "epoch": 1.6436618048144196, + "grad_norm": 0.7255506612740806, + "learning_rate": 3.3268951121012015e-05, + "loss": 0.1197, + "step": 13861 + }, + { + "epoch": 1.6437803865765446, + "grad_norm": 1.7373345877610926, + "learning_rate": 3.326668582467468e-05, + "loss": 0.352, + "step": 13862 + }, + { + "epoch": 1.6438989683386696, + "grad_norm": 1.173645121866731, + "learning_rate": 3.326442045212863e-05, + "loss": 0.1948, + "step": 13863 + }, + { + "epoch": 1.6440175501007945, + "grad_norm": 0.7397754490423194, + "learning_rate": 3.326215500339476e-05, + "loss": 0.1418, + "step": 13864 + }, + { + "epoch": 1.6441361318629195, + "grad_norm": 0.8633350511541966, + "learning_rate": 3.3259889478493946e-05, + "loss": 0.1684, + "step": 13865 + }, + { + "epoch": 1.6442547136250445, + "grad_norm": 0.9904134991218374, + "learning_rate": 3.325762387744709e-05, + "loss": 0.1988, + "step": 13866 + }, + { + "epoch": 1.6443732953871695, + "grad_norm": 0.8876527708041332, + "learning_rate": 3.325535820027506e-05, + "loss": 0.2124, + "step": 13867 + }, + { + "epoch": 1.6444918771492945, + "grad_norm": 0.9838041879904412, + "learning_rate": 3.3253092446998754e-05, + "loss": 0.2075, + "step": 13868 + }, + { + "epoch": 1.6446104589114194, + "grad_norm": 0.9377139613408977, + "learning_rate": 3.325082661763905e-05, + "loss": 0.2086, + "step": 13869 + }, + { + "epoch": 1.6447290406735444, + "grad_norm": 0.7483597936105872, + "learning_rate": 3.324856071221685e-05, + "loss": 0.1556, + "step": 13870 + }, + { + "epoch": 1.6448476224356694, + "grad_norm": 0.7995351637515854, + "learning_rate": 3.3246294730753034e-05, + "loss": 0.1476, + "step": 13871 + }, + { + "epoch": 1.6449662041977944, + "grad_norm": 1.2097210158505574, + "learning_rate": 3.3244028673268494e-05, + "loss": 0.2289, + "step": 13872 + }, + { + "epoch": 1.6450847859599194, + "grad_norm": 0.8989324211230567, + "learning_rate": 3.324176253978412e-05, + "loss": 0.1671, + "step": 13873 + }, + { + "epoch": 1.6452033677220443, + "grad_norm": 0.7992434387108824, + "learning_rate": 3.3239496330320794e-05, + "loss": 0.1697, + "step": 13874 + }, + { + "epoch": 1.6453219494841693, + "grad_norm": 1.1259092630612604, + "learning_rate": 3.3237230044899424e-05, + "loss": 0.1992, + "step": 13875 + }, + { + "epoch": 1.6454405312462943, + "grad_norm": 0.9626395937509123, + "learning_rate": 3.3234963683540886e-05, + "loss": 0.2173, + "step": 13876 + }, + { + "epoch": 1.6455591130084193, + "grad_norm": 1.0732032909617721, + "learning_rate": 3.32326972462661e-05, + "loss": 0.1572, + "step": 13877 + }, + { + "epoch": 1.6456776947705443, + "grad_norm": 1.362162956814713, + "learning_rate": 3.323043073309592e-05, + "loss": 0.2987, + "step": 13878 + }, + { + "epoch": 1.6457962765326692, + "grad_norm": 0.9271324243446668, + "learning_rate": 3.322816414405128e-05, + "loss": 0.1462, + "step": 13879 + }, + { + "epoch": 1.6459148582947942, + "grad_norm": 1.1010887660870774, + "learning_rate": 3.322589747915304e-05, + "loss": 0.231, + "step": 13880 + }, + { + "epoch": 1.6460334400569192, + "grad_norm": 0.8538060988663343, + "learning_rate": 3.322363073842212e-05, + "loss": 0.1419, + "step": 13881 + }, + { + "epoch": 1.6461520218190442, + "grad_norm": 0.7558510836119439, + "learning_rate": 3.3221363921879407e-05, + "loss": 0.1497, + "step": 13882 + }, + { + "epoch": 1.6462706035811692, + "grad_norm": 0.8286156253908981, + "learning_rate": 3.32190970295458e-05, + "loss": 0.2019, + "step": 13883 + }, + { + "epoch": 1.6463891853432941, + "grad_norm": 0.8846299946023335, + "learning_rate": 3.32168300614422e-05, + "loss": 0.1769, + "step": 13884 + }, + { + "epoch": 1.6465077671054194, + "grad_norm": 0.8879951630429925, + "learning_rate": 3.32145630175895e-05, + "loss": 0.1919, + "step": 13885 + }, + { + "epoch": 1.646626348867544, + "grad_norm": 1.179404485853633, + "learning_rate": 3.3212295898008596e-05, + "loss": 0.2198, + "step": 13886 + }, + { + "epoch": 1.6467449306296693, + "grad_norm": 1.3038771862252292, + "learning_rate": 3.3210028702720406e-05, + "loss": 0.2653, + "step": 13887 + }, + { + "epoch": 1.646863512391794, + "grad_norm": 1.0434536435574837, + "learning_rate": 3.3207761431745816e-05, + "loss": 0.1882, + "step": 13888 + }, + { + "epoch": 1.6469820941539193, + "grad_norm": 0.9445151194392167, + "learning_rate": 3.3205494085105726e-05, + "loss": 0.1419, + "step": 13889 + }, + { + "epoch": 1.647100675916044, + "grad_norm": 0.8853655191893106, + "learning_rate": 3.3203226662821054e-05, + "loss": 0.2165, + "step": 13890 + }, + { + "epoch": 1.6472192576781692, + "grad_norm": 1.093965427868732, + "learning_rate": 3.3200959164912684e-05, + "loss": 0.2079, + "step": 13891 + }, + { + "epoch": 1.647337839440294, + "grad_norm": 1.1812491269490126, + "learning_rate": 3.319869159140152e-05, + "loss": 0.19, + "step": 13892 + }, + { + "epoch": 1.6474564212024192, + "grad_norm": 0.9746596427718994, + "learning_rate": 3.319642394230848e-05, + "loss": 0.181, + "step": 13893 + }, + { + "epoch": 1.647575002964544, + "grad_norm": 1.0018570942524019, + "learning_rate": 3.319415621765447e-05, + "loss": 0.2129, + "step": 13894 + }, + { + "epoch": 1.6476935847266692, + "grad_norm": 1.087763831119206, + "learning_rate": 3.3191888417460375e-05, + "loss": 0.2117, + "step": 13895 + }, + { + "epoch": 1.647812166488794, + "grad_norm": 1.0934254905375338, + "learning_rate": 3.318962054174712e-05, + "loss": 0.208, + "step": 13896 + }, + { + "epoch": 1.6479307482509191, + "grad_norm": 1.2480213416383779, + "learning_rate": 3.318735259053561e-05, + "loss": 0.2104, + "step": 13897 + }, + { + "epoch": 1.6480493300130439, + "grad_norm": 0.7867062492065029, + "learning_rate": 3.3185084563846744e-05, + "loss": 0.1608, + "step": 13898 + }, + { + "epoch": 1.648167911775169, + "grad_norm": 1.0619738248687995, + "learning_rate": 3.3182816461701445e-05, + "loss": 0.2036, + "step": 13899 + }, + { + "epoch": 1.6482864935372938, + "grad_norm": 1.1132526731570105, + "learning_rate": 3.31805482841206e-05, + "loss": 0.1928, + "step": 13900 + }, + { + "epoch": 1.648405075299419, + "grad_norm": 0.9531157746494681, + "learning_rate": 3.317828003112514e-05, + "loss": 0.1545, + "step": 13901 + }, + { + "epoch": 1.6485236570615438, + "grad_norm": 0.9760233971418915, + "learning_rate": 3.3176011702735964e-05, + "loss": 0.2536, + "step": 13902 + }, + { + "epoch": 1.648642238823669, + "grad_norm": 0.6614259049745874, + "learning_rate": 3.317374329897398e-05, + "loss": 0.1133, + "step": 13903 + }, + { + "epoch": 1.6487608205857938, + "grad_norm": 0.8774068357024186, + "learning_rate": 3.317147481986012e-05, + "loss": 0.1806, + "step": 13904 + }, + { + "epoch": 1.648879402347919, + "grad_norm": 1.0602129186677653, + "learning_rate": 3.316920626541527e-05, + "loss": 0.2315, + "step": 13905 + }, + { + "epoch": 1.6489979841100437, + "grad_norm": 0.861566434195778, + "learning_rate": 3.316693763566036e-05, + "loss": 0.1943, + "step": 13906 + }, + { + "epoch": 1.649116565872169, + "grad_norm": 1.0319345940996132, + "learning_rate": 3.3164668930616305e-05, + "loss": 0.1951, + "step": 13907 + }, + { + "epoch": 1.6492351476342937, + "grad_norm": 1.2568961248349955, + "learning_rate": 3.316240015030401e-05, + "loss": 0.2006, + "step": 13908 + }, + { + "epoch": 1.6493537293964189, + "grad_norm": 0.8317352542631572, + "learning_rate": 3.31601312947444e-05, + "loss": 0.1702, + "step": 13909 + }, + { + "epoch": 1.6494723111585439, + "grad_norm": 1.0025101202750757, + "learning_rate": 3.3157862363958374e-05, + "loss": 0.1952, + "step": 13910 + }, + { + "epoch": 1.6495908929206688, + "grad_norm": 1.071957838355202, + "learning_rate": 3.3155593357966876e-05, + "loss": 0.1609, + "step": 13911 + }, + { + "epoch": 1.6497094746827938, + "grad_norm": 1.2540299184019081, + "learning_rate": 3.315332427679079e-05, + "loss": 0.2944, + "step": 13912 + }, + { + "epoch": 1.6498280564449188, + "grad_norm": 0.9155324824162302, + "learning_rate": 3.3151055120451065e-05, + "loss": 0.1607, + "step": 13913 + }, + { + "epoch": 1.6499466382070438, + "grad_norm": 1.052905732107297, + "learning_rate": 3.31487858889686e-05, + "loss": 0.2767, + "step": 13914 + }, + { + "epoch": 1.6500652199691688, + "grad_norm": 0.9069887080085022, + "learning_rate": 3.3146516582364325e-05, + "loss": 0.1575, + "step": 13915 + }, + { + "epoch": 1.6501838017312938, + "grad_norm": 1.0279514709294884, + "learning_rate": 3.3144247200659166e-05, + "loss": 0.2337, + "step": 13916 + }, + { + "epoch": 1.6503023834934187, + "grad_norm": 1.253040605460322, + "learning_rate": 3.3141977743874024e-05, + "loss": 0.2406, + "step": 13917 + }, + { + "epoch": 1.6504209652555437, + "grad_norm": 0.836327638859467, + "learning_rate": 3.313970821202984e-05, + "loss": 0.1569, + "step": 13918 + }, + { + "epoch": 1.6505395470176687, + "grad_norm": 0.7881465859653758, + "learning_rate": 3.313743860514752e-05, + "loss": 0.2053, + "step": 13919 + }, + { + "epoch": 1.6506581287797937, + "grad_norm": 0.8295457334662909, + "learning_rate": 3.3135168923248e-05, + "loss": 0.1872, + "step": 13920 + }, + { + "epoch": 1.6507767105419187, + "grad_norm": 1.079958605851507, + "learning_rate": 3.31328991663522e-05, + "loss": 0.2146, + "step": 13921 + }, + { + "epoch": 1.6508952923040436, + "grad_norm": 1.3701470165740104, + "learning_rate": 3.3130629334481036e-05, + "loss": 0.2257, + "step": 13922 + }, + { + "epoch": 1.6510138740661686, + "grad_norm": 1.013170279567228, + "learning_rate": 3.312835942765544e-05, + "loss": 0.173, + "step": 13923 + }, + { + "epoch": 1.6511324558282936, + "grad_norm": 1.2634240658001792, + "learning_rate": 3.312608944589635e-05, + "loss": 0.2839, + "step": 13924 + }, + { + "epoch": 1.6512510375904186, + "grad_norm": 0.9102195727003829, + "learning_rate": 3.3123819389224665e-05, + "loss": 0.1459, + "step": 13925 + }, + { + "epoch": 1.6513696193525436, + "grad_norm": 1.1284790178101443, + "learning_rate": 3.3121549257661336e-05, + "loss": 0.2621, + "step": 13926 + }, + { + "epoch": 1.6514882011146685, + "grad_norm": 0.9292023944537605, + "learning_rate": 3.311927905122728e-05, + "loss": 0.196, + "step": 13927 + }, + { + "epoch": 1.6516067828767935, + "grad_norm": 0.6664443219556402, + "learning_rate": 3.3117008769943425e-05, + "loss": 0.1592, + "step": 13928 + }, + { + "epoch": 1.6517253646389185, + "grad_norm": 6.0294919960191615, + "learning_rate": 3.311473841383071e-05, + "loss": 0.2063, + "step": 13929 + }, + { + "epoch": 1.6518439464010435, + "grad_norm": 1.3010297748306767, + "learning_rate": 3.311246798291005e-05, + "loss": 0.2043, + "step": 13930 + }, + { + "epoch": 1.6519625281631685, + "grad_norm": 0.9306122970271944, + "learning_rate": 3.311019747720239e-05, + "loss": 0.2421, + "step": 13931 + }, + { + "epoch": 1.6520811099252934, + "grad_norm": 1.047804717940458, + "learning_rate": 3.310792689672865e-05, + "loss": 0.1757, + "step": 13932 + }, + { + "epoch": 1.6521996916874184, + "grad_norm": 1.4939889967119, + "learning_rate": 3.3105656241509764e-05, + "loss": 0.292, + "step": 13933 + }, + { + "epoch": 1.6523182734495434, + "grad_norm": 0.973963641851158, + "learning_rate": 3.310338551156667e-05, + "loss": 0.2021, + "step": 13934 + }, + { + "epoch": 1.6524368552116684, + "grad_norm": 1.054432373662916, + "learning_rate": 3.310111470692031e-05, + "loss": 0.2785, + "step": 13935 + }, + { + "epoch": 1.6525554369737936, + "grad_norm": 0.7742324127210048, + "learning_rate": 3.309884382759158e-05, + "loss": 0.1595, + "step": 13936 + }, + { + "epoch": 1.6526740187359183, + "grad_norm": 0.9944085031356489, + "learning_rate": 3.309657287360147e-05, + "loss": 0.2137, + "step": 13937 + }, + { + "epoch": 1.6527926004980436, + "grad_norm": 0.9808534758008662, + "learning_rate": 3.309430184497087e-05, + "loss": 0.2062, + "step": 13938 + }, + { + "epoch": 1.6529111822601683, + "grad_norm": 1.0777331654924396, + "learning_rate": 3.309203074172074e-05, + "loss": 0.2154, + "step": 13939 + }, + { + "epoch": 1.6530297640222935, + "grad_norm": 0.7692187432871385, + "learning_rate": 3.3089759563872006e-05, + "loss": 0.1934, + "step": 13940 + }, + { + "epoch": 1.6531483457844183, + "grad_norm": 1.12997910401799, + "learning_rate": 3.308748831144561e-05, + "loss": 0.2487, + "step": 13941 + }, + { + "epoch": 1.6532669275465435, + "grad_norm": 0.8214283849751711, + "learning_rate": 3.308521698446249e-05, + "loss": 0.1642, + "step": 13942 + }, + { + "epoch": 1.6533855093086682, + "grad_norm": 0.8921589143006421, + "learning_rate": 3.308294558294358e-05, + "loss": 0.2188, + "step": 13943 + }, + { + "epoch": 1.6535040910707934, + "grad_norm": 0.7336665077014756, + "learning_rate": 3.3080674106909834e-05, + "loss": 0.1477, + "step": 13944 + }, + { + "epoch": 1.6536226728329182, + "grad_norm": 0.9481238882538805, + "learning_rate": 3.3078402556382174e-05, + "loss": 0.171, + "step": 13945 + }, + { + "epoch": 1.6537412545950434, + "grad_norm": 0.754890465244835, + "learning_rate": 3.307613093138155e-05, + "loss": 0.1866, + "step": 13946 + }, + { + "epoch": 1.6538598363571682, + "grad_norm": 0.9910932132422134, + "learning_rate": 3.30738592319289e-05, + "loss": 0.2198, + "step": 13947 + }, + { + "epoch": 1.6539784181192934, + "grad_norm": 1.3561599483839972, + "learning_rate": 3.307158745804517e-05, + "loss": 0.2425, + "step": 13948 + }, + { + "epoch": 1.6540969998814181, + "grad_norm": 0.8779460468678094, + "learning_rate": 3.3069315609751303e-05, + "loss": 0.1572, + "step": 13949 + }, + { + "epoch": 1.6542155816435433, + "grad_norm": 1.1319792189097133, + "learning_rate": 3.3067043687068246e-05, + "loss": 0.1775, + "step": 13950 + }, + { + "epoch": 1.654334163405668, + "grad_norm": 1.179409261847428, + "learning_rate": 3.3064771690016935e-05, + "loss": 0.1704, + "step": 13951 + }, + { + "epoch": 1.6544527451677933, + "grad_norm": 0.9986249150821348, + "learning_rate": 3.306249961861832e-05, + "loss": 0.2136, + "step": 13952 + }, + { + "epoch": 1.654571326929918, + "grad_norm": 0.9936718673974518, + "learning_rate": 3.306022747289334e-05, + "loss": 0.2153, + "step": 13953 + }, + { + "epoch": 1.6546899086920432, + "grad_norm": 1.6246129254650212, + "learning_rate": 3.305795525286295e-05, + "loss": 0.3253, + "step": 13954 + }, + { + "epoch": 1.654808490454168, + "grad_norm": 1.071571223824016, + "learning_rate": 3.305568295854809e-05, + "loss": 0.1791, + "step": 13955 + }, + { + "epoch": 1.6549270722162932, + "grad_norm": 1.4688008832403654, + "learning_rate": 3.3053410589969715e-05, + "loss": 0.3198, + "step": 13956 + }, + { + "epoch": 1.655045653978418, + "grad_norm": 1.2328457206161871, + "learning_rate": 3.3051138147148776e-05, + "loss": 0.2315, + "step": 13957 + }, + { + "epoch": 1.6551642357405432, + "grad_norm": 0.7404911521293492, + "learning_rate": 3.304886563010621e-05, + "loss": 0.1203, + "step": 13958 + }, + { + "epoch": 1.655282817502668, + "grad_norm": 0.6757725432287234, + "learning_rate": 3.304659303886297e-05, + "loss": 0.124, + "step": 13959 + }, + { + "epoch": 1.6554013992647931, + "grad_norm": 1.2434154201322978, + "learning_rate": 3.3044320373440014e-05, + "loss": 0.2231, + "step": 13960 + }, + { + "epoch": 1.655519981026918, + "grad_norm": 0.9555935940883024, + "learning_rate": 3.304204763385829e-05, + "loss": 0.1673, + "step": 13961 + }, + { + "epoch": 1.655638562789043, + "grad_norm": 0.9219798353201587, + "learning_rate": 3.303977482013875e-05, + "loss": 0.2218, + "step": 13962 + }, + { + "epoch": 1.655757144551168, + "grad_norm": 0.7827503846712562, + "learning_rate": 3.303750193230234e-05, + "loss": 0.1911, + "step": 13963 + }, + { + "epoch": 1.655875726313293, + "grad_norm": 0.8945454948632056, + "learning_rate": 3.303522897037001e-05, + "loss": 0.1562, + "step": 13964 + }, + { + "epoch": 1.655994308075418, + "grad_norm": 1.2501402358300706, + "learning_rate": 3.303295593436274e-05, + "loss": 0.2275, + "step": 13965 + }, + { + "epoch": 1.656112889837543, + "grad_norm": 1.3857601867731115, + "learning_rate": 3.3030682824301456e-05, + "loss": 0.3046, + "step": 13966 + }, + { + "epoch": 1.656231471599668, + "grad_norm": 1.096857910786883, + "learning_rate": 3.3028409640207134e-05, + "loss": 0.2591, + "step": 13967 + }, + { + "epoch": 1.656350053361793, + "grad_norm": 0.8435884150642413, + "learning_rate": 3.3026136382100714e-05, + "loss": 0.197, + "step": 13968 + }, + { + "epoch": 1.656468635123918, + "grad_norm": 0.810873948244756, + "learning_rate": 3.302386305000316e-05, + "loss": 0.1903, + "step": 13969 + }, + { + "epoch": 1.656587216886043, + "grad_norm": 1.031833546838447, + "learning_rate": 3.3021589643935425e-05, + "loss": 0.2231, + "step": 13970 + }, + { + "epoch": 1.656705798648168, + "grad_norm": 0.9016917635958688, + "learning_rate": 3.301931616391847e-05, + "loss": 0.2329, + "step": 13971 + }, + { + "epoch": 1.656824380410293, + "grad_norm": 0.8897711720221393, + "learning_rate": 3.3017042609973254e-05, + "loss": 0.1862, + "step": 13972 + }, + { + "epoch": 1.6569429621724179, + "grad_norm": 0.9614721594772205, + "learning_rate": 3.301476898212074e-05, + "loss": 0.1823, + "step": 13973 + }, + { + "epoch": 1.6570615439345429, + "grad_norm": 0.8045516198981914, + "learning_rate": 3.301249528038188e-05, + "loss": 0.1786, + "step": 13974 + }, + { + "epoch": 1.6571801256966678, + "grad_norm": 0.7321541032294968, + "learning_rate": 3.3010221504777645e-05, + "loss": 0.1475, + "step": 13975 + }, + { + "epoch": 1.6572987074587928, + "grad_norm": 0.9752904026963553, + "learning_rate": 3.300794765532898e-05, + "loss": 0.1838, + "step": 13976 + }, + { + "epoch": 1.6574172892209178, + "grad_norm": 0.8831149656134609, + "learning_rate": 3.300567373205687e-05, + "loss": 0.1598, + "step": 13977 + }, + { + "epoch": 1.6575358709830428, + "grad_norm": 0.9304520869379564, + "learning_rate": 3.3003399734982266e-05, + "loss": 0.2174, + "step": 13978 + }, + { + "epoch": 1.6576544527451678, + "grad_norm": 0.9760102229040017, + "learning_rate": 3.300112566412612e-05, + "loss": 0.1842, + "step": 13979 + }, + { + "epoch": 1.6577730345072927, + "grad_norm": 1.0260226000544366, + "learning_rate": 3.299885151950942e-05, + "loss": 0.2154, + "step": 13980 + }, + { + "epoch": 1.6578916162694177, + "grad_norm": 0.6912876614515638, + "learning_rate": 3.299657730115311e-05, + "loss": 0.1127, + "step": 13981 + }, + { + "epoch": 1.6580101980315427, + "grad_norm": 1.4007288197272427, + "learning_rate": 3.299430300907816e-05, + "loss": 0.272, + "step": 13982 + }, + { + "epoch": 1.6581287797936677, + "grad_norm": 1.102471968746673, + "learning_rate": 3.299202864330554e-05, + "loss": 0.2136, + "step": 13983 + }, + { + "epoch": 1.6582473615557927, + "grad_norm": 0.9912018262943079, + "learning_rate": 3.2989754203856214e-05, + "loss": 0.2286, + "step": 13984 + }, + { + "epoch": 1.6583659433179179, + "grad_norm": 1.1238237244926046, + "learning_rate": 3.2987479690751165e-05, + "loss": 0.1941, + "step": 13985 + }, + { + "epoch": 1.6584845250800426, + "grad_norm": 1.237878218284402, + "learning_rate": 3.2985205104011334e-05, + "loss": 0.2857, + "step": 13986 + }, + { + "epoch": 1.6586031068421678, + "grad_norm": 0.9227144622753768, + "learning_rate": 3.298293044365772e-05, + "loss": 0.1833, + "step": 13987 + }, + { + "epoch": 1.6587216886042926, + "grad_norm": 0.8063986540824569, + "learning_rate": 3.298065570971126e-05, + "loss": 0.1413, + "step": 13988 + }, + { + "epoch": 1.6588402703664178, + "grad_norm": 0.7061612840073647, + "learning_rate": 3.2978380902192954e-05, + "loss": 0.1406, + "step": 13989 + }, + { + "epoch": 1.6589588521285425, + "grad_norm": 0.8651781674269983, + "learning_rate": 3.2976106021123756e-05, + "loss": 0.1396, + "step": 13990 + }, + { + "epoch": 1.6590774338906678, + "grad_norm": 0.945160083250895, + "learning_rate": 3.2973831066524644e-05, + "loss": 0.1787, + "step": 13991 + }, + { + "epoch": 1.6591960156527925, + "grad_norm": 1.2153255575877915, + "learning_rate": 3.297155603841657e-05, + "loss": 0.1961, + "step": 13992 + }, + { + "epoch": 1.6593145974149177, + "grad_norm": 0.9366390195412858, + "learning_rate": 3.296928093682054e-05, + "loss": 0.1276, + "step": 13993 + }, + { + "epoch": 1.6594331791770425, + "grad_norm": 0.9171548108052973, + "learning_rate": 3.296700576175751e-05, + "loss": 0.1534, + "step": 13994 + }, + { + "epoch": 1.6595517609391677, + "grad_norm": 1.1957651083346752, + "learning_rate": 3.296473051324846e-05, + "loss": 0.2406, + "step": 13995 + }, + { + "epoch": 1.6596703427012924, + "grad_norm": 1.1450872386599924, + "learning_rate": 3.2962455191314356e-05, + "loss": 0.2073, + "step": 13996 + }, + { + "epoch": 1.6597889244634176, + "grad_norm": 1.1111336216305703, + "learning_rate": 3.296017979597618e-05, + "loss": 0.248, + "step": 13997 + }, + { + "epoch": 1.6599075062255424, + "grad_norm": 1.1471513246894265, + "learning_rate": 3.2957904327254916e-05, + "loss": 0.2908, + "step": 13998 + }, + { + "epoch": 1.6600260879876676, + "grad_norm": 1.1647333850066206, + "learning_rate": 3.295562878517152e-05, + "loss": 0.1965, + "step": 13999 + }, + { + "epoch": 1.6601446697497924, + "grad_norm": 1.1210908544609235, + "learning_rate": 3.2953353169746995e-05, + "loss": 0.2022, + "step": 14000 + }, + { + "epoch": 1.6602632515119176, + "grad_norm": 0.7742748293930636, + "learning_rate": 3.2951077481002297e-05, + "loss": 0.1506, + "step": 14001 + }, + { + "epoch": 1.6603818332740423, + "grad_norm": 1.3030827505200004, + "learning_rate": 3.294880171895841e-05, + "loss": 0.2479, + "step": 14002 + }, + { + "epoch": 1.6605004150361675, + "grad_norm": 1.2511174718252214, + "learning_rate": 3.2946525883636326e-05, + "loss": 0.1976, + "step": 14003 + }, + { + "epoch": 1.6606189967982923, + "grad_norm": 1.1854587309161957, + "learning_rate": 3.294424997505703e-05, + "loss": 0.2256, + "step": 14004 + }, + { + "epoch": 1.6607375785604175, + "grad_norm": 1.084006499856509, + "learning_rate": 3.294197399324147e-05, + "loss": 0.1883, + "step": 14005 + }, + { + "epoch": 1.6608561603225422, + "grad_norm": 0.7518080216383431, + "learning_rate": 3.293969793821067e-05, + "loss": 0.1554, + "step": 14006 + }, + { + "epoch": 1.6609747420846674, + "grad_norm": 1.1193024949349886, + "learning_rate": 3.2937421809985574e-05, + "loss": 0.2354, + "step": 14007 + }, + { + "epoch": 1.6610933238467922, + "grad_norm": 1.375357562800624, + "learning_rate": 3.2935145608587195e-05, + "loss": 0.2969, + "step": 14008 + }, + { + "epoch": 1.6612119056089174, + "grad_norm": 0.8394376083881863, + "learning_rate": 3.2932869334036495e-05, + "loss": 0.1907, + "step": 14009 + }, + { + "epoch": 1.6613304873710424, + "grad_norm": 1.0081816168949649, + "learning_rate": 3.293059298635447e-05, + "loss": 0.2082, + "step": 14010 + }, + { + "epoch": 1.6614490691331674, + "grad_norm": 1.2124337752982537, + "learning_rate": 3.29283165655621e-05, + "loss": 0.3085, + "step": 14011 + }, + { + "epoch": 1.6615676508952923, + "grad_norm": 0.77580591748616, + "learning_rate": 3.292604007168037e-05, + "loss": 0.1498, + "step": 14012 + }, + { + "epoch": 1.6616862326574173, + "grad_norm": 1.0558924380318635, + "learning_rate": 3.292376350473028e-05, + "loss": 0.1942, + "step": 14013 + }, + { + "epoch": 1.6618048144195423, + "grad_norm": 0.9875345001588826, + "learning_rate": 3.292148686473281e-05, + "loss": 0.2, + "step": 14014 + }, + { + "epoch": 1.6619233961816673, + "grad_norm": 1.0059873770706214, + "learning_rate": 3.2919210151708936e-05, + "loss": 0.2199, + "step": 14015 + }, + { + "epoch": 1.6620419779437923, + "grad_norm": 1.0363584978288096, + "learning_rate": 3.291693336567966e-05, + "loss": 0.1745, + "step": 14016 + }, + { + "epoch": 1.6621605597059173, + "grad_norm": 0.9725676092523261, + "learning_rate": 3.291465650666596e-05, + "loss": 0.175, + "step": 14017 + }, + { + "epoch": 1.6622791414680422, + "grad_norm": 0.7108646561746085, + "learning_rate": 3.2912379574688845e-05, + "loss": 0.1799, + "step": 14018 + }, + { + "epoch": 1.6623977232301672, + "grad_norm": 1.8422253647322555, + "learning_rate": 3.291010256976928e-05, + "loss": 0.32, + "step": 14019 + }, + { + "epoch": 1.6625163049922922, + "grad_norm": 1.050708442150772, + "learning_rate": 3.290782549192828e-05, + "loss": 0.1996, + "step": 14020 + }, + { + "epoch": 1.6626348867544172, + "grad_norm": 1.0269971504693947, + "learning_rate": 3.290554834118682e-05, + "loss": 0.2097, + "step": 14021 + }, + { + "epoch": 1.6627534685165422, + "grad_norm": 0.9094733986112387, + "learning_rate": 3.2903271117565904e-05, + "loss": 0.1511, + "step": 14022 + }, + { + "epoch": 1.6628720502786671, + "grad_norm": 0.9027106115781086, + "learning_rate": 3.2900993821086515e-05, + "loss": 0.1529, + "step": 14023 + }, + { + "epoch": 1.6629906320407921, + "grad_norm": 1.2007047145439325, + "learning_rate": 3.289871645176965e-05, + "loss": 0.2491, + "step": 14024 + }, + { + "epoch": 1.663109213802917, + "grad_norm": 0.9995160436461187, + "learning_rate": 3.289643900963631e-05, + "loss": 0.2228, + "step": 14025 + }, + { + "epoch": 1.663227795565042, + "grad_norm": 1.1945822066059752, + "learning_rate": 3.2894161494707486e-05, + "loss": 0.2722, + "step": 14026 + }, + { + "epoch": 1.663346377327167, + "grad_norm": 0.7054959983999796, + "learning_rate": 3.289188390700417e-05, + "loss": 0.1804, + "step": 14027 + }, + { + "epoch": 1.663464959089292, + "grad_norm": 0.7449623012774739, + "learning_rate": 3.2889606246547365e-05, + "loss": 0.1504, + "step": 14028 + }, + { + "epoch": 1.663583540851417, + "grad_norm": 1.0791147494373272, + "learning_rate": 3.288732851335806e-05, + "loss": 0.2567, + "step": 14029 + }, + { + "epoch": 1.663702122613542, + "grad_norm": 1.0061533141784587, + "learning_rate": 3.288505070745727e-05, + "loss": 0.1643, + "step": 14030 + }, + { + "epoch": 1.663820704375667, + "grad_norm": 0.8504435492354446, + "learning_rate": 3.2882772828865974e-05, + "loss": 0.2181, + "step": 14031 + }, + { + "epoch": 1.663939286137792, + "grad_norm": 0.8424910509921807, + "learning_rate": 3.288049487760517e-05, + "loss": 0.1908, + "step": 14032 + }, + { + "epoch": 1.664057867899917, + "grad_norm": 0.9469789420478224, + "learning_rate": 3.287821685369588e-05, + "loss": 0.2446, + "step": 14033 + }, + { + "epoch": 1.664176449662042, + "grad_norm": 0.9685387017661671, + "learning_rate": 3.287593875715909e-05, + "loss": 0.1874, + "step": 14034 + }, + { + "epoch": 1.664295031424167, + "grad_norm": 1.1279582369341272, + "learning_rate": 3.2873660588015795e-05, + "loss": 0.3037, + "step": 14035 + }, + { + "epoch": 1.664413613186292, + "grad_norm": 0.7078239394088771, + "learning_rate": 3.2871382346287014e-05, + "loss": 0.1499, + "step": 14036 + }, + { + "epoch": 1.6645321949484169, + "grad_norm": 0.9734044500574901, + "learning_rate": 3.286910403199373e-05, + "loss": 0.2126, + "step": 14037 + }, + { + "epoch": 1.664650776710542, + "grad_norm": 0.9562641770239007, + "learning_rate": 3.286682564515697e-05, + "loss": 0.2106, + "step": 14038 + }, + { + "epoch": 1.6647693584726668, + "grad_norm": 1.0259329496286096, + "learning_rate": 3.286454718579772e-05, + "loss": 0.1789, + "step": 14039 + }, + { + "epoch": 1.664887940234792, + "grad_norm": 0.9664653584771096, + "learning_rate": 3.2862268653936985e-05, + "loss": 0.2359, + "step": 14040 + }, + { + "epoch": 1.6650065219969168, + "grad_norm": 0.8144475858200041, + "learning_rate": 3.285999004959578e-05, + "loss": 0.1691, + "step": 14041 + }, + { + "epoch": 1.665125103759042, + "grad_norm": 0.9681912208596729, + "learning_rate": 3.2857711372795096e-05, + "loss": 0.1922, + "step": 14042 + }, + { + "epoch": 1.6652436855211667, + "grad_norm": 1.077963793784443, + "learning_rate": 3.2855432623555953e-05, + "loss": 0.2869, + "step": 14043 + }, + { + "epoch": 1.665362267283292, + "grad_norm": 0.9300883327750298, + "learning_rate": 3.285315380189935e-05, + "loss": 0.1978, + "step": 14044 + }, + { + "epoch": 1.6654808490454167, + "grad_norm": 1.0330585331084008, + "learning_rate": 3.2850874907846304e-05, + "loss": 0.2611, + "step": 14045 + }, + { + "epoch": 1.665599430807542, + "grad_norm": 0.9132854026032966, + "learning_rate": 3.2848595941417824e-05, + "loss": 0.1709, + "step": 14046 + }, + { + "epoch": 1.6657180125696667, + "grad_norm": 0.9910039531375325, + "learning_rate": 3.284631690263491e-05, + "loss": 0.2113, + "step": 14047 + }, + { + "epoch": 1.6658365943317919, + "grad_norm": 0.9111016353181309, + "learning_rate": 3.284403779151857e-05, + "loss": 0.2095, + "step": 14048 + }, + { + "epoch": 1.6659551760939166, + "grad_norm": 0.7491117408664655, + "learning_rate": 3.284175860808983e-05, + "loss": 0.1551, + "step": 14049 + }, + { + "epoch": 1.6660737578560418, + "grad_norm": 1.0108849436105112, + "learning_rate": 3.283947935236969e-05, + "loss": 0.2167, + "step": 14050 + }, + { + "epoch": 1.6661923396181666, + "grad_norm": 0.9382525446004681, + "learning_rate": 3.283720002437916e-05, + "loss": 0.1767, + "step": 14051 + }, + { + "epoch": 1.6663109213802918, + "grad_norm": 0.7602689585317933, + "learning_rate": 3.283492062413925e-05, + "loss": 0.1812, + "step": 14052 + }, + { + "epoch": 1.6664295031424166, + "grad_norm": 0.8092471927942004, + "learning_rate": 3.2832641151670994e-05, + "loss": 0.1502, + "step": 14053 + }, + { + "epoch": 1.6665480849045418, + "grad_norm": 0.822931810401402, + "learning_rate": 3.283036160699538e-05, + "loss": 0.1944, + "step": 14054 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.7661760818879054, + "learning_rate": 3.2828081990133444e-05, + "loss": 0.1475, + "step": 14055 + }, + { + "epoch": 1.6667852484287917, + "grad_norm": 0.9999674802213672, + "learning_rate": 3.2825802301106185e-05, + "loss": 0.2192, + "step": 14056 + }, + { + "epoch": 1.6669038301909165, + "grad_norm": 0.754966047711745, + "learning_rate": 3.282352253993463e-05, + "loss": 0.1975, + "step": 14057 + }, + { + "epoch": 1.6670224119530417, + "grad_norm": 0.8599580067865027, + "learning_rate": 3.282124270663979e-05, + "loss": 0.2323, + "step": 14058 + }, + { + "epoch": 1.6671409937151664, + "grad_norm": 1.067686018496924, + "learning_rate": 3.281896280124267e-05, + "loss": 0.186, + "step": 14059 + }, + { + "epoch": 1.6672595754772916, + "grad_norm": 1.2209231009506163, + "learning_rate": 3.281668282376432e-05, + "loss": 0.2929, + "step": 14060 + }, + { + "epoch": 1.6673781572394166, + "grad_norm": 0.9051758382888907, + "learning_rate": 3.281440277422573e-05, + "loss": 0.164, + "step": 14061 + }, + { + "epoch": 1.6674967390015416, + "grad_norm": 1.0766904751879616, + "learning_rate": 3.281212265264793e-05, + "loss": 0.2585, + "step": 14062 + }, + { + "epoch": 1.6676153207636666, + "grad_norm": 0.780234117263807, + "learning_rate": 3.280984245905194e-05, + "loss": 0.142, + "step": 14063 + }, + { + "epoch": 1.6677339025257916, + "grad_norm": 1.0581617219913209, + "learning_rate": 3.2807562193458785e-05, + "loss": 0.2127, + "step": 14064 + }, + { + "epoch": 1.6678524842879165, + "grad_norm": 1.1934151963385864, + "learning_rate": 3.280528185588947e-05, + "loss": 0.2254, + "step": 14065 + }, + { + "epoch": 1.6679710660500415, + "grad_norm": 0.9479915439032671, + "learning_rate": 3.280300144636504e-05, + "loss": 0.2107, + "step": 14066 + }, + { + "epoch": 1.6680896478121665, + "grad_norm": 1.093439621222816, + "learning_rate": 3.280072096490649e-05, + "loss": 0.1823, + "step": 14067 + }, + { + "epoch": 1.6682082295742915, + "grad_norm": 1.0178109136192943, + "learning_rate": 3.279844041153488e-05, + "loss": 0.2072, + "step": 14068 + }, + { + "epoch": 1.6683268113364165, + "grad_norm": 0.8416063520707616, + "learning_rate": 3.279615978627119e-05, + "loss": 0.1563, + "step": 14069 + }, + { + "epoch": 1.6684453930985415, + "grad_norm": 1.0998783089365505, + "learning_rate": 3.279387908913648e-05, + "loss": 0.2165, + "step": 14070 + }, + { + "epoch": 1.6685639748606664, + "grad_norm": 0.933458806760182, + "learning_rate": 3.279159832015176e-05, + "loss": 0.1542, + "step": 14071 + }, + { + "epoch": 1.6686825566227914, + "grad_norm": 0.8742342098809177, + "learning_rate": 3.2789317479338053e-05, + "loss": 0.1726, + "step": 14072 + }, + { + "epoch": 1.6688011383849164, + "grad_norm": 0.8238007003582317, + "learning_rate": 3.2787036566716394e-05, + "loss": 0.1931, + "step": 14073 + }, + { + "epoch": 1.6689197201470414, + "grad_norm": 0.9090173990545731, + "learning_rate": 3.278475558230781e-05, + "loss": 0.1328, + "step": 14074 + }, + { + "epoch": 1.6690383019091664, + "grad_norm": 0.803679216938461, + "learning_rate": 3.2782474526133325e-05, + "loss": 0.1687, + "step": 14075 + }, + { + "epoch": 1.6691568836712913, + "grad_norm": 0.862978169068937, + "learning_rate": 3.278019339821396e-05, + "loss": 0.2017, + "step": 14076 + }, + { + "epoch": 1.6692754654334163, + "grad_norm": 0.7874041924872629, + "learning_rate": 3.277791219857077e-05, + "loss": 0.1535, + "step": 14077 + }, + { + "epoch": 1.6693940471955413, + "grad_norm": 0.8804084138874005, + "learning_rate": 3.2775630927224755e-05, + "loss": 0.2187, + "step": 14078 + }, + { + "epoch": 1.6695126289576663, + "grad_norm": 1.402796073860145, + "learning_rate": 3.277334958419696e-05, + "loss": 0.3079, + "step": 14079 + }, + { + "epoch": 1.6696312107197913, + "grad_norm": 1.0735602640035617, + "learning_rate": 3.277106816950841e-05, + "loss": 0.1722, + "step": 14080 + }, + { + "epoch": 1.6697497924819162, + "grad_norm": 1.0785366068521374, + "learning_rate": 3.276878668318015e-05, + "loss": 0.1863, + "step": 14081 + }, + { + "epoch": 1.6698683742440412, + "grad_norm": 0.9260016330127745, + "learning_rate": 3.276650512523319e-05, + "loss": 0.1631, + "step": 14082 + }, + { + "epoch": 1.6699869560061662, + "grad_norm": 1.0106573818500837, + "learning_rate": 3.276422349568859e-05, + "loss": 0.2647, + "step": 14083 + }, + { + "epoch": 1.6701055377682912, + "grad_norm": 0.8136110175967916, + "learning_rate": 3.276194179456737e-05, + "loss": 0.1605, + "step": 14084 + }, + { + "epoch": 1.6702241195304164, + "grad_norm": 1.035596284527482, + "learning_rate": 3.275966002189056e-05, + "loss": 0.268, + "step": 14085 + }, + { + "epoch": 1.6703427012925411, + "grad_norm": 0.8568261172542134, + "learning_rate": 3.27573781776792e-05, + "loss": 0.1536, + "step": 14086 + }, + { + "epoch": 1.6704612830546663, + "grad_norm": 0.9218002185505508, + "learning_rate": 3.275509626195433e-05, + "loss": 0.1961, + "step": 14087 + }, + { + "epoch": 1.670579864816791, + "grad_norm": 1.02883912199065, + "learning_rate": 3.2752814274736974e-05, + "loss": 0.2212, + "step": 14088 + }, + { + "epoch": 1.6706984465789163, + "grad_norm": 0.8405282445958019, + "learning_rate": 3.2750532216048186e-05, + "loss": 0.1487, + "step": 14089 + }, + { + "epoch": 1.670817028341041, + "grad_norm": 0.8617758336828976, + "learning_rate": 3.274825008590899e-05, + "loss": 0.1553, + "step": 14090 + }, + { + "epoch": 1.6709356101031663, + "grad_norm": 1.201586643144924, + "learning_rate": 3.2745967884340436e-05, + "loss": 0.2074, + "step": 14091 + }, + { + "epoch": 1.671054191865291, + "grad_norm": 1.0391531079859624, + "learning_rate": 3.274368561136355e-05, + "loss": 0.2546, + "step": 14092 + }, + { + "epoch": 1.6711727736274162, + "grad_norm": 1.0067727608875576, + "learning_rate": 3.274140326699938e-05, + "loss": 0.2038, + "step": 14093 + }, + { + "epoch": 1.671291355389541, + "grad_norm": 1.1899788251996064, + "learning_rate": 3.273912085126897e-05, + "loss": 0.259, + "step": 14094 + }, + { + "epoch": 1.6714099371516662, + "grad_norm": 0.6595253726235208, + "learning_rate": 3.273683836419335e-05, + "loss": 0.1338, + "step": 14095 + }, + { + "epoch": 1.671528518913791, + "grad_norm": 1.4550577310445594, + "learning_rate": 3.2734555805793576e-05, + "loss": 0.3632, + "step": 14096 + }, + { + "epoch": 1.6716471006759162, + "grad_norm": 0.9441584322332947, + "learning_rate": 3.273227317609067e-05, + "loss": 0.1641, + "step": 14097 + }, + { + "epoch": 1.671765682438041, + "grad_norm": 1.1033761033145042, + "learning_rate": 3.272999047510569e-05, + "loss": 0.2344, + "step": 14098 + }, + { + "epoch": 1.6718842642001661, + "grad_norm": 0.8484187932610294, + "learning_rate": 3.272770770285969e-05, + "loss": 0.1878, + "step": 14099 + }, + { + "epoch": 1.6720028459622909, + "grad_norm": 1.5294403845103888, + "learning_rate": 3.272542485937369e-05, + "loss": 0.2918, + "step": 14100 + }, + { + "epoch": 1.672121427724416, + "grad_norm": 0.7881047586536996, + "learning_rate": 3.2723141944668745e-05, + "loss": 0.1873, + "step": 14101 + }, + { + "epoch": 1.6722400094865408, + "grad_norm": 0.9798134213761764, + "learning_rate": 3.27208589587659e-05, + "loss": 0.1952, + "step": 14102 + }, + { + "epoch": 1.672358591248666, + "grad_norm": 1.0722762129819527, + "learning_rate": 3.271857590168622e-05, + "loss": 0.2242, + "step": 14103 + }, + { + "epoch": 1.6724771730107908, + "grad_norm": 0.8452360675905167, + "learning_rate": 3.2716292773450716e-05, + "loss": 0.1909, + "step": 14104 + }, + { + "epoch": 1.672595754772916, + "grad_norm": 0.9752969075065397, + "learning_rate": 3.2714009574080466e-05, + "loss": 0.2126, + "step": 14105 + }, + { + "epoch": 1.6727143365350408, + "grad_norm": 0.8936718927168437, + "learning_rate": 3.27117263035965e-05, + "loss": 0.1906, + "step": 14106 + }, + { + "epoch": 1.672832918297166, + "grad_norm": 1.0757189324651528, + "learning_rate": 3.270944296201989e-05, + "loss": 0.2234, + "step": 14107 + }, + { + "epoch": 1.6729515000592907, + "grad_norm": 0.9210471705925822, + "learning_rate": 3.270715954937165e-05, + "loss": 0.2298, + "step": 14108 + }, + { + "epoch": 1.673070081821416, + "grad_norm": 1.2555156864487558, + "learning_rate": 3.2704876065672863e-05, + "loss": 0.2861, + "step": 14109 + }, + { + "epoch": 1.673188663583541, + "grad_norm": 1.2004612378867496, + "learning_rate": 3.2702592510944564e-05, + "loss": 0.3063, + "step": 14110 + }, + { + "epoch": 1.6733072453456659, + "grad_norm": 1.0713973828293866, + "learning_rate": 3.2700308885207805e-05, + "loss": 0.2012, + "step": 14111 + }, + { + "epoch": 1.6734258271077909, + "grad_norm": 1.2298459137320348, + "learning_rate": 3.269802518848364e-05, + "loss": 0.2409, + "step": 14112 + }, + { + "epoch": 1.6735444088699158, + "grad_norm": 1.0755418078792724, + "learning_rate": 3.269574142079312e-05, + "loss": 0.1994, + "step": 14113 + }, + { + "epoch": 1.6736629906320408, + "grad_norm": 0.6483791926151304, + "learning_rate": 3.269345758215731e-05, + "loss": 0.1514, + "step": 14114 + }, + { + "epoch": 1.6737815723941658, + "grad_norm": 0.7749709539317895, + "learning_rate": 3.269117367259725e-05, + "loss": 0.156, + "step": 14115 + }, + { + "epoch": 1.6739001541562908, + "grad_norm": 0.9911344953508283, + "learning_rate": 3.268888969213401e-05, + "loss": 0.2267, + "step": 14116 + }, + { + "epoch": 1.6740187359184158, + "grad_norm": 0.9786324310635992, + "learning_rate": 3.268660564078862e-05, + "loss": 0.209, + "step": 14117 + }, + { + "epoch": 1.6741373176805407, + "grad_norm": 1.258395568168295, + "learning_rate": 3.268432151858216e-05, + "loss": 0.2943, + "step": 14118 + }, + { + "epoch": 1.6742558994426657, + "grad_norm": 0.8181165294800651, + "learning_rate": 3.268203732553568e-05, + "loss": 0.1475, + "step": 14119 + }, + { + "epoch": 1.6743744812047907, + "grad_norm": 0.8274722114153531, + "learning_rate": 3.2679753061670236e-05, + "loss": 0.1386, + "step": 14120 + }, + { + "epoch": 1.6744930629669157, + "grad_norm": 0.7892305846791084, + "learning_rate": 3.267746872700688e-05, + "loss": 0.1902, + "step": 14121 + }, + { + "epoch": 1.6746116447290407, + "grad_norm": 0.7559416318089667, + "learning_rate": 3.2675184321566685e-05, + "loss": 0.1293, + "step": 14122 + }, + { + "epoch": 1.6747302264911657, + "grad_norm": 0.7789554308566117, + "learning_rate": 3.267289984537069e-05, + "loss": 0.1533, + "step": 14123 + }, + { + "epoch": 1.6748488082532906, + "grad_norm": 1.2197171781912353, + "learning_rate": 3.267061529843998e-05, + "loss": 0.2115, + "step": 14124 + }, + { + "epoch": 1.6749673900154156, + "grad_norm": 1.300556924826775, + "learning_rate": 3.26683306807956e-05, + "loss": 0.211, + "step": 14125 + }, + { + "epoch": 1.6750859717775406, + "grad_norm": 0.7556837887915353, + "learning_rate": 3.2666045992458616e-05, + "loss": 0.1399, + "step": 14126 + }, + { + "epoch": 1.6752045535396656, + "grad_norm": 1.1359203809439524, + "learning_rate": 3.2663761233450084e-05, + "loss": 0.2321, + "step": 14127 + }, + { + "epoch": 1.6753231353017906, + "grad_norm": 1.1151303304695956, + "learning_rate": 3.266147640379108e-05, + "loss": 0.2519, + "step": 14128 + }, + { + "epoch": 1.6754417170639155, + "grad_norm": 1.2163413741797233, + "learning_rate": 3.265919150350265e-05, + "loss": 0.2333, + "step": 14129 + }, + { + "epoch": 1.6755602988260405, + "grad_norm": 0.9090864047874648, + "learning_rate": 3.2656906532605866e-05, + "loss": 0.1996, + "step": 14130 + }, + { + "epoch": 1.6756788805881655, + "grad_norm": 1.2020993430371687, + "learning_rate": 3.26546214911218e-05, + "loss": 0.2381, + "step": 14131 + }, + { + "epoch": 1.6757974623502905, + "grad_norm": 0.7892415251728673, + "learning_rate": 3.265233637907151e-05, + "loss": 0.1522, + "step": 14132 + }, + { + "epoch": 1.6759160441124155, + "grad_norm": 0.995823506887617, + "learning_rate": 3.265005119647606e-05, + "loss": 0.2526, + "step": 14133 + }, + { + "epoch": 1.6760346258745404, + "grad_norm": 0.9835020373502017, + "learning_rate": 3.264776594335652e-05, + "loss": 0.1878, + "step": 14134 + }, + { + "epoch": 1.6761532076366654, + "grad_norm": 1.193188993631261, + "learning_rate": 3.2645480619733956e-05, + "loss": 0.2082, + "step": 14135 + }, + { + "epoch": 1.6762717893987906, + "grad_norm": 0.9553936431114728, + "learning_rate": 3.264319522562944e-05, + "loss": 0.1641, + "step": 14136 + }, + { + "epoch": 1.6763903711609154, + "grad_norm": 1.091447092157217, + "learning_rate": 3.264090976106403e-05, + "loss": 0.2136, + "step": 14137 + }, + { + "epoch": 1.6765089529230406, + "grad_norm": 0.7532120819630432, + "learning_rate": 3.26386242260588e-05, + "loss": 0.1359, + "step": 14138 + }, + { + "epoch": 1.6766275346851653, + "grad_norm": 1.0558583033079587, + "learning_rate": 3.263633862063483e-05, + "loss": 0.2221, + "step": 14139 + }, + { + "epoch": 1.6767461164472905, + "grad_norm": 0.6512982897408541, + "learning_rate": 3.263405294481318e-05, + "loss": 0.1412, + "step": 14140 + }, + { + "epoch": 1.6768646982094153, + "grad_norm": 1.0364525155173465, + "learning_rate": 3.263176719861492e-05, + "loss": 0.2402, + "step": 14141 + }, + { + "epoch": 1.6769832799715405, + "grad_norm": 0.6901414601402055, + "learning_rate": 3.262948138206112e-05, + "loss": 0.1281, + "step": 14142 + }, + { + "epoch": 1.6771018617336653, + "grad_norm": 1.067261426378336, + "learning_rate": 3.262719549517287e-05, + "loss": 0.1818, + "step": 14143 + }, + { + "epoch": 1.6772204434957905, + "grad_norm": 0.789088830750186, + "learning_rate": 3.262490953797123e-05, + "loss": 0.1402, + "step": 14144 + }, + { + "epoch": 1.6773390252579152, + "grad_norm": 0.9581810041186875, + "learning_rate": 3.2622623510477265e-05, + "loss": 0.1515, + "step": 14145 + }, + { + "epoch": 1.6774576070200404, + "grad_norm": 1.2163532282087561, + "learning_rate": 3.2620337412712065e-05, + "loss": 0.2164, + "step": 14146 + }, + { + "epoch": 1.6775761887821652, + "grad_norm": 1.1125405789161962, + "learning_rate": 3.2618051244696697e-05, + "loss": 0.2054, + "step": 14147 + }, + { + "epoch": 1.6776947705442904, + "grad_norm": 1.0790600611890262, + "learning_rate": 3.2615765006452234e-05, + "loss": 0.2149, + "step": 14148 + }, + { + "epoch": 1.6778133523064152, + "grad_norm": 0.8375563333619894, + "learning_rate": 3.2613478697999765e-05, + "loss": 0.2027, + "step": 14149 + }, + { + "epoch": 1.6779319340685404, + "grad_norm": 0.9098866514764675, + "learning_rate": 3.261119231936035e-05, + "loss": 0.1862, + "step": 14150 + }, + { + "epoch": 1.6780505158306651, + "grad_norm": 0.7214579578562321, + "learning_rate": 3.260890587055508e-05, + "loss": 0.1351, + "step": 14151 + }, + { + "epoch": 1.6781690975927903, + "grad_norm": 0.9028742733300292, + "learning_rate": 3.260661935160503e-05, + "loss": 0.1603, + "step": 14152 + }, + { + "epoch": 1.678287679354915, + "grad_norm": 0.9304319115653008, + "learning_rate": 3.2604332762531277e-05, + "loss": 0.2121, + "step": 14153 + }, + { + "epoch": 1.6784062611170403, + "grad_norm": 0.8712306183623889, + "learning_rate": 3.26020461033549e-05, + "loss": 0.1631, + "step": 14154 + }, + { + "epoch": 1.678524842879165, + "grad_norm": 0.9504757854757154, + "learning_rate": 3.259975937409698e-05, + "loss": 0.2441, + "step": 14155 + }, + { + "epoch": 1.6786434246412902, + "grad_norm": 1.1830478041163803, + "learning_rate": 3.25974725747786e-05, + "loss": 0.2154, + "step": 14156 + }, + { + "epoch": 1.678762006403415, + "grad_norm": 0.8945345271530879, + "learning_rate": 3.259518570542084e-05, + "loss": 0.1432, + "step": 14157 + }, + { + "epoch": 1.6788805881655402, + "grad_norm": 1.0629038040533951, + "learning_rate": 3.259289876604477e-05, + "loss": 0.1437, + "step": 14158 + }, + { + "epoch": 1.6789991699276652, + "grad_norm": 0.9884223128565195, + "learning_rate": 3.25906117566715e-05, + "loss": 0.1958, + "step": 14159 + }, + { + "epoch": 1.6791177516897902, + "grad_norm": 0.7352986774061605, + "learning_rate": 3.258832467732209e-05, + "loss": 0.1794, + "step": 14160 + }, + { + "epoch": 1.6792363334519151, + "grad_norm": 1.25992818464309, + "learning_rate": 3.258603752801763e-05, + "loss": 0.239, + "step": 14161 + }, + { + "epoch": 1.6793549152140401, + "grad_norm": 1.1417839935698186, + "learning_rate": 3.258375030877921e-05, + "loss": 0.1809, + "step": 14162 + }, + { + "epoch": 1.679473496976165, + "grad_norm": 1.2533050163876454, + "learning_rate": 3.2581463019627913e-05, + "loss": 0.2986, + "step": 14163 + }, + { + "epoch": 1.67959207873829, + "grad_norm": 0.8302052370296442, + "learning_rate": 3.257917566058482e-05, + "loss": 0.166, + "step": 14164 + }, + { + "epoch": 1.679710660500415, + "grad_norm": 1.0789985666541704, + "learning_rate": 3.257688823167103e-05, + "loss": 0.1756, + "step": 14165 + }, + { + "epoch": 1.67982924226254, + "grad_norm": 1.205627368736549, + "learning_rate": 3.257460073290761e-05, + "loss": 0.2641, + "step": 14166 + }, + { + "epoch": 1.679947824024665, + "grad_norm": 0.789627565387131, + "learning_rate": 3.257231316431567e-05, + "loss": 0.2132, + "step": 14167 + }, + { + "epoch": 1.68006640578679, + "grad_norm": 1.3530929251802595, + "learning_rate": 3.257002552591629e-05, + "loss": 0.2896, + "step": 14168 + }, + { + "epoch": 1.680184987548915, + "grad_norm": 0.8121384603060827, + "learning_rate": 3.2567737817730545e-05, + "loss": 0.1881, + "step": 14169 + }, + { + "epoch": 1.68030356931104, + "grad_norm": 1.0374155659931033, + "learning_rate": 3.256545003977955e-05, + "loss": 0.2054, + "step": 14170 + }, + { + "epoch": 1.680422151073165, + "grad_norm": 1.0425234324717128, + "learning_rate": 3.2563162192084375e-05, + "loss": 0.1843, + "step": 14171 + }, + { + "epoch": 1.68054073283529, + "grad_norm": 0.9081117517270473, + "learning_rate": 3.256087427466612e-05, + "loss": 0.2221, + "step": 14172 + }, + { + "epoch": 1.680659314597415, + "grad_norm": 1.2840197768703425, + "learning_rate": 3.255858628754588e-05, + "loss": 0.2976, + "step": 14173 + }, + { + "epoch": 1.68077789635954, + "grad_norm": 1.3043819292391445, + "learning_rate": 3.2556298230744744e-05, + "loss": 0.2081, + "step": 14174 + }, + { + "epoch": 1.6808964781216649, + "grad_norm": 0.8042531158330805, + "learning_rate": 3.25540101042838e-05, + "loss": 0.1796, + "step": 14175 + }, + { + "epoch": 1.6810150598837899, + "grad_norm": 0.7118691682462734, + "learning_rate": 3.2551721908184145e-05, + "loss": 0.1547, + "step": 14176 + }, + { + "epoch": 1.6811336416459148, + "grad_norm": 0.8172873833619847, + "learning_rate": 3.2549433642466875e-05, + "loss": 0.1618, + "step": 14177 + }, + { + "epoch": 1.6812522234080398, + "grad_norm": 1.2715086516488343, + "learning_rate": 3.2547145307153095e-05, + "loss": 0.2825, + "step": 14178 + }, + { + "epoch": 1.6813708051701648, + "grad_norm": 0.8839443437972022, + "learning_rate": 3.254485690226387e-05, + "loss": 0.2099, + "step": 14179 + }, + { + "epoch": 1.6814893869322898, + "grad_norm": 0.9858074561520345, + "learning_rate": 3.254256842782034e-05, + "loss": 0.1919, + "step": 14180 + }, + { + "epoch": 1.6816079686944148, + "grad_norm": 0.9028810723886407, + "learning_rate": 3.254027988384356e-05, + "loss": 0.1892, + "step": 14181 + }, + { + "epoch": 1.6817265504565397, + "grad_norm": 0.7047917300779597, + "learning_rate": 3.2537991270354656e-05, + "loss": 0.1704, + "step": 14182 + }, + { + "epoch": 1.6818451322186647, + "grad_norm": 1.7462350606544355, + "learning_rate": 3.253570258737471e-05, + "loss": 0.3977, + "step": 14183 + }, + { + "epoch": 1.6819637139807897, + "grad_norm": 0.7661245835241396, + "learning_rate": 3.253341383492483e-05, + "loss": 0.165, + "step": 14184 + }, + { + "epoch": 1.682082295742915, + "grad_norm": 1.0305763873107472, + "learning_rate": 3.253112501302611e-05, + "loss": 0.1906, + "step": 14185 + }, + { + "epoch": 1.6822008775050397, + "grad_norm": 0.7209972495716043, + "learning_rate": 3.252883612169966e-05, + "loss": 0.1948, + "step": 14186 + }, + { + "epoch": 1.6823194592671649, + "grad_norm": 1.0351462087452914, + "learning_rate": 3.2526547160966566e-05, + "loss": 0.1989, + "step": 14187 + }, + { + "epoch": 1.6824380410292896, + "grad_norm": 0.7453146288535434, + "learning_rate": 3.252425813084794e-05, + "loss": 0.1548, + "step": 14188 + }, + { + "epoch": 1.6825566227914148, + "grad_norm": 1.1306396421624252, + "learning_rate": 3.2521969031364885e-05, + "loss": 0.2613, + "step": 14189 + }, + { + "epoch": 1.6826752045535396, + "grad_norm": 0.784324188789529, + "learning_rate": 3.251967986253849e-05, + "loss": 0.1542, + "step": 14190 + }, + { + "epoch": 1.6827937863156648, + "grad_norm": 0.9101702061055479, + "learning_rate": 3.251739062438988e-05, + "loss": 0.2053, + "step": 14191 + }, + { + "epoch": 1.6829123680777895, + "grad_norm": 1.1396151259743459, + "learning_rate": 3.251510131694013e-05, + "loss": 0.231, + "step": 14192 + }, + { + "epoch": 1.6830309498399147, + "grad_norm": 0.682181337058548, + "learning_rate": 3.2512811940210375e-05, + "loss": 0.1308, + "step": 14193 + }, + { + "epoch": 1.6831495316020395, + "grad_norm": 1.121361887541787, + "learning_rate": 3.251052249422171e-05, + "loss": 0.239, + "step": 14194 + }, + { + "epoch": 1.6832681133641647, + "grad_norm": 1.1909209580690958, + "learning_rate": 3.250823297899523e-05, + "loss": 0.2668, + "step": 14195 + }, + { + "epoch": 1.6833866951262895, + "grad_norm": 0.7576429723068779, + "learning_rate": 3.250594339455205e-05, + "loss": 0.1501, + "step": 14196 + }, + { + "epoch": 1.6835052768884147, + "grad_norm": 0.8437984007028245, + "learning_rate": 3.2503653740913275e-05, + "loss": 0.2144, + "step": 14197 + }, + { + "epoch": 1.6836238586505394, + "grad_norm": 1.1785066350433826, + "learning_rate": 3.250136401810001e-05, + "loss": 0.2081, + "step": 14198 + }, + { + "epoch": 1.6837424404126646, + "grad_norm": 1.061128124461869, + "learning_rate": 3.249907422613337e-05, + "loss": 0.1784, + "step": 14199 + }, + { + "epoch": 1.6838610221747894, + "grad_norm": 1.1765533281827658, + "learning_rate": 3.249678436503447e-05, + "loss": 0.2499, + "step": 14200 + }, + { + "epoch": 1.6839796039369146, + "grad_norm": 0.9719483011122969, + "learning_rate": 3.2494494434824406e-05, + "loss": 0.1815, + "step": 14201 + }, + { + "epoch": 1.6840981856990394, + "grad_norm": 0.8360546726943604, + "learning_rate": 3.2492204435524295e-05, + "loss": 0.1786, + "step": 14202 + }, + { + "epoch": 1.6842167674611646, + "grad_norm": 0.9182910962054558, + "learning_rate": 3.248991436715524e-05, + "loss": 0.207, + "step": 14203 + }, + { + "epoch": 1.6843353492232893, + "grad_norm": 1.0308037395543952, + "learning_rate": 3.2487624229738364e-05, + "loss": 0.2325, + "step": 14204 + }, + { + "epoch": 1.6844539309854145, + "grad_norm": 0.7911861630228919, + "learning_rate": 3.248533402329478e-05, + "loss": 0.1398, + "step": 14205 + }, + { + "epoch": 1.6845725127475393, + "grad_norm": 0.6730710562353177, + "learning_rate": 3.248304374784559e-05, + "loss": 0.1278, + "step": 14206 + }, + { + "epoch": 1.6846910945096645, + "grad_norm": 1.1504067899707735, + "learning_rate": 3.248075340341191e-05, + "loss": 0.2332, + "step": 14207 + }, + { + "epoch": 1.6848096762717892, + "grad_norm": 1.0583590845014528, + "learning_rate": 3.247846299001486e-05, + "loss": 0.1892, + "step": 14208 + }, + { + "epoch": 1.6849282580339144, + "grad_norm": 0.9311837878490906, + "learning_rate": 3.2476172507675556e-05, + "loss": 0.1663, + "step": 14209 + }, + { + "epoch": 1.6850468397960394, + "grad_norm": 0.8285293701919322, + "learning_rate": 3.2473881956415105e-05, + "loss": 0.1872, + "step": 14210 + }, + { + "epoch": 1.6851654215581644, + "grad_norm": 1.098688604988998, + "learning_rate": 3.247159133625462e-05, + "loss": 0.2274, + "step": 14211 + }, + { + "epoch": 1.6852840033202894, + "grad_norm": 0.7601927008568096, + "learning_rate": 3.246930064721524e-05, + "loss": 0.1844, + "step": 14212 + }, + { + "epoch": 1.6854025850824144, + "grad_norm": 1.4008895384008968, + "learning_rate": 3.2467009889318056e-05, + "loss": 0.2782, + "step": 14213 + }, + { + "epoch": 1.6855211668445393, + "grad_norm": 1.0837098286888756, + "learning_rate": 3.24647190625842e-05, + "loss": 0.2571, + "step": 14214 + }, + { + "epoch": 1.6856397486066643, + "grad_norm": 0.9045142341655249, + "learning_rate": 3.2462428167034795e-05, + "loss": 0.1177, + "step": 14215 + }, + { + "epoch": 1.6857583303687893, + "grad_norm": 0.8053861254966125, + "learning_rate": 3.246013720269095e-05, + "loss": 0.149, + "step": 14216 + }, + { + "epoch": 1.6858769121309143, + "grad_norm": 0.7185803274574651, + "learning_rate": 3.245784616957379e-05, + "loss": 0.1602, + "step": 14217 + }, + { + "epoch": 1.6859954938930393, + "grad_norm": 1.009403146007683, + "learning_rate": 3.2455555067704426e-05, + "loss": 0.2449, + "step": 14218 + }, + { + "epoch": 1.6861140756551642, + "grad_norm": 0.9145856418686991, + "learning_rate": 3.2453263897103994e-05, + "loss": 0.1796, + "step": 14219 + }, + { + "epoch": 1.6862326574172892, + "grad_norm": 1.0888456522416476, + "learning_rate": 3.2450972657793605e-05, + "loss": 0.2001, + "step": 14220 + }, + { + "epoch": 1.6863512391794142, + "grad_norm": 0.8143565102214538, + "learning_rate": 3.2448681349794386e-05, + "loss": 0.1624, + "step": 14221 + }, + { + "epoch": 1.6864698209415392, + "grad_norm": 0.7383536725781559, + "learning_rate": 3.2446389973127454e-05, + "loss": 0.1408, + "step": 14222 + }, + { + "epoch": 1.6865884027036642, + "grad_norm": 1.0293861580753407, + "learning_rate": 3.244409852781395e-05, + "loss": 0.2156, + "step": 14223 + }, + { + "epoch": 1.6867069844657891, + "grad_norm": 0.9786622848496125, + "learning_rate": 3.244180701387497e-05, + "loss": 0.2176, + "step": 14224 + }, + { + "epoch": 1.6868255662279141, + "grad_norm": 1.097613184227653, + "learning_rate": 3.2439515431331675e-05, + "loss": 0.1995, + "step": 14225 + }, + { + "epoch": 1.6869441479900391, + "grad_norm": 0.7190377244710328, + "learning_rate": 3.2437223780205156e-05, + "loss": 0.1251, + "step": 14226 + }, + { + "epoch": 1.687062729752164, + "grad_norm": 0.9648372815453783, + "learning_rate": 3.243493206051655e-05, + "loss": 0.1772, + "step": 14227 + }, + { + "epoch": 1.687181311514289, + "grad_norm": 0.9917699936904283, + "learning_rate": 3.243264027228701e-05, + "loss": 0.1764, + "step": 14228 + }, + { + "epoch": 1.687299893276414, + "grad_norm": 0.8923621209566635, + "learning_rate": 3.243034841553762e-05, + "loss": 0.1411, + "step": 14229 + }, + { + "epoch": 1.687418475038539, + "grad_norm": 0.827718758083886, + "learning_rate": 3.242805649028953e-05, + "loss": 0.1648, + "step": 14230 + }, + { + "epoch": 1.687537056800664, + "grad_norm": 0.8547815251724015, + "learning_rate": 3.2425764496563875e-05, + "loss": 0.1552, + "step": 14231 + }, + { + "epoch": 1.687655638562789, + "grad_norm": 0.7496940968637957, + "learning_rate": 3.242347243438178e-05, + "loss": 0.1469, + "step": 14232 + }, + { + "epoch": 1.687774220324914, + "grad_norm": 0.8914162938190356, + "learning_rate": 3.242118030376437e-05, + "loss": 0.2186, + "step": 14233 + }, + { + "epoch": 1.6878928020870392, + "grad_norm": 0.8315333114513248, + "learning_rate": 3.241888810473278e-05, + "loss": 0.16, + "step": 14234 + }, + { + "epoch": 1.688011383849164, + "grad_norm": 1.1728224040948239, + "learning_rate": 3.241659583730813e-05, + "loss": 0.2113, + "step": 14235 + }, + { + "epoch": 1.6881299656112891, + "grad_norm": 0.800048859978388, + "learning_rate": 3.241430350151157e-05, + "loss": 0.1556, + "step": 14236 + }, + { + "epoch": 1.688248547373414, + "grad_norm": 0.7857848612107878, + "learning_rate": 3.2412011097364226e-05, + "loss": 0.1501, + "step": 14237 + }, + { + "epoch": 1.688367129135539, + "grad_norm": 1.3479007019800349, + "learning_rate": 3.2409718624887225e-05, + "loss": 0.2792, + "step": 14238 + }, + { + "epoch": 1.6884857108976639, + "grad_norm": 0.957391662178435, + "learning_rate": 3.24074260841017e-05, + "loss": 0.1931, + "step": 14239 + }, + { + "epoch": 1.688604292659789, + "grad_norm": 0.8840508991047685, + "learning_rate": 3.240513347502879e-05, + "loss": 0.2189, + "step": 14240 + }, + { + "epoch": 1.6887228744219138, + "grad_norm": 1.3952749182483883, + "learning_rate": 3.2402840797689636e-05, + "loss": 0.2497, + "step": 14241 + }, + { + "epoch": 1.688841456184039, + "grad_norm": 1.2936862500617867, + "learning_rate": 3.240054805210536e-05, + "loss": 0.2476, + "step": 14242 + }, + { + "epoch": 1.6889600379461638, + "grad_norm": 0.8263270609879297, + "learning_rate": 3.239825523829712e-05, + "loss": 0.1887, + "step": 14243 + }, + { + "epoch": 1.689078619708289, + "grad_norm": 0.8188779133838462, + "learning_rate": 3.239596235628603e-05, + "loss": 0.187, + "step": 14244 + }, + { + "epoch": 1.6891972014704137, + "grad_norm": 0.5534406163418794, + "learning_rate": 3.239366940609324e-05, + "loss": 0.1419, + "step": 14245 + }, + { + "epoch": 1.689315783232539, + "grad_norm": 0.9410360598056432, + "learning_rate": 3.239137638773988e-05, + "loss": 0.1927, + "step": 14246 + }, + { + "epoch": 1.6894343649946637, + "grad_norm": 0.8424012487308424, + "learning_rate": 3.2389083301247096e-05, + "loss": 0.1313, + "step": 14247 + }, + { + "epoch": 1.689552946756789, + "grad_norm": 0.9047070330292294, + "learning_rate": 3.2386790146636025e-05, + "loss": 0.1821, + "step": 14248 + }, + { + "epoch": 1.6896715285189137, + "grad_norm": 0.8681897831646453, + "learning_rate": 3.238449692392781e-05, + "loss": 0.1729, + "step": 14249 + }, + { + "epoch": 1.6897901102810389, + "grad_norm": 0.8356865672771268, + "learning_rate": 3.238220363314358e-05, + "loss": 0.1478, + "step": 14250 + }, + { + "epoch": 1.6899086920431636, + "grad_norm": 0.9128241354425364, + "learning_rate": 3.237991027430449e-05, + "loss": 0.1875, + "step": 14251 + }, + { + "epoch": 1.6900272738052888, + "grad_norm": 0.8379980327344866, + "learning_rate": 3.237761684743168e-05, + "loss": 0.1585, + "step": 14252 + }, + { + "epoch": 1.6901458555674136, + "grad_norm": 0.8404635213215531, + "learning_rate": 3.237532335254629e-05, + "loss": 0.1627, + "step": 14253 + }, + { + "epoch": 1.6902644373295388, + "grad_norm": 1.062552381039293, + "learning_rate": 3.237302978966946e-05, + "loss": 0.1737, + "step": 14254 + }, + { + "epoch": 1.6903830190916636, + "grad_norm": 0.9276685933600548, + "learning_rate": 3.2370736158822333e-05, + "loss": 0.1815, + "step": 14255 + }, + { + "epoch": 1.6905016008537888, + "grad_norm": 0.9658675007510016, + "learning_rate": 3.236844246002607e-05, + "loss": 0.1595, + "step": 14256 + }, + { + "epoch": 1.6906201826159135, + "grad_norm": 0.8506749963960034, + "learning_rate": 3.236614869330179e-05, + "loss": 0.1581, + "step": 14257 + }, + { + "epoch": 1.6907387643780387, + "grad_norm": 0.9771356684623155, + "learning_rate": 3.236385485867066e-05, + "loss": 0.1894, + "step": 14258 + }, + { + "epoch": 1.6908573461401637, + "grad_norm": 0.8969976935039412, + "learning_rate": 3.236156095615381e-05, + "loss": 0.1785, + "step": 14259 + }, + { + "epoch": 1.6909759279022887, + "grad_norm": 0.6489702016350414, + "learning_rate": 3.2359266985772405e-05, + "loss": 0.1575, + "step": 14260 + }, + { + "epoch": 1.6910945096644137, + "grad_norm": 1.7732027576351277, + "learning_rate": 3.2356972947547574e-05, + "loss": 0.3555, + "step": 14261 + }, + { + "epoch": 1.6912130914265386, + "grad_norm": 0.7184736013623478, + "learning_rate": 3.2354678841500484e-05, + "loss": 0.164, + "step": 14262 + }, + { + "epoch": 1.6913316731886636, + "grad_norm": 1.0240644960556071, + "learning_rate": 3.235238466765227e-05, + "loss": 0.2389, + "step": 14263 + }, + { + "epoch": 1.6914502549507886, + "grad_norm": 0.8086787002450174, + "learning_rate": 3.235009042602409e-05, + "loss": 0.1586, + "step": 14264 + }, + { + "epoch": 1.6915688367129136, + "grad_norm": 0.9751905361469323, + "learning_rate": 3.234779611663707e-05, + "loss": 0.1773, + "step": 14265 + }, + { + "epoch": 1.6916874184750386, + "grad_norm": 0.7571782265437204, + "learning_rate": 3.234550173951241e-05, + "loss": 0.1598, + "step": 14266 + }, + { + "epoch": 1.6918060002371635, + "grad_norm": 1.1670019490346188, + "learning_rate": 3.234320729467121e-05, + "loss": 0.2232, + "step": 14267 + }, + { + "epoch": 1.6919245819992885, + "grad_norm": 0.9623292975482933, + "learning_rate": 3.234091278213465e-05, + "loss": 0.206, + "step": 14268 + }, + { + "epoch": 1.6920431637614135, + "grad_norm": 1.0690629285417756, + "learning_rate": 3.233861820192388e-05, + "loss": 0.2591, + "step": 14269 + }, + { + "epoch": 1.6921617455235385, + "grad_norm": 0.7261249934773533, + "learning_rate": 3.233632355406004e-05, + "loss": 0.1827, + "step": 14270 + }, + { + "epoch": 1.6922803272856635, + "grad_norm": 0.9836623615577823, + "learning_rate": 3.2334028838564294e-05, + "loss": 0.2555, + "step": 14271 + }, + { + "epoch": 1.6923989090477884, + "grad_norm": 0.8678522337562645, + "learning_rate": 3.2331734055457795e-05, + "loss": 0.1694, + "step": 14272 + }, + { + "epoch": 1.6925174908099134, + "grad_norm": 0.9216200911527571, + "learning_rate": 3.232943920476171e-05, + "loss": 0.1631, + "step": 14273 + }, + { + "epoch": 1.6926360725720384, + "grad_norm": 0.8767130256723576, + "learning_rate": 3.2327144286497176e-05, + "loss": 0.1835, + "step": 14274 + }, + { + "epoch": 1.6927546543341634, + "grad_norm": 0.8381689183473718, + "learning_rate": 3.232484930068536e-05, + "loss": 0.1934, + "step": 14275 + }, + { + "epoch": 1.6928732360962884, + "grad_norm": 1.1405684477248532, + "learning_rate": 3.232255424734741e-05, + "loss": 0.1738, + "step": 14276 + }, + { + "epoch": 1.6929918178584134, + "grad_norm": 0.857045702255255, + "learning_rate": 3.23202591265045e-05, + "loss": 0.206, + "step": 14277 + }, + { + "epoch": 1.6931103996205383, + "grad_norm": 1.0265298322474004, + "learning_rate": 3.231796393817776e-05, + "loss": 0.1798, + "step": 14278 + }, + { + "epoch": 1.6932289813826633, + "grad_norm": 0.6773863268485648, + "learning_rate": 3.231566868238838e-05, + "loss": 0.1567, + "step": 14279 + }, + { + "epoch": 1.6933475631447883, + "grad_norm": 0.7133562260945336, + "learning_rate": 3.231337335915751e-05, + "loss": 0.1202, + "step": 14280 + }, + { + "epoch": 1.6934661449069133, + "grad_norm": 1.0181678445902773, + "learning_rate": 3.23110779685063e-05, + "loss": 0.2152, + "step": 14281 + }, + { + "epoch": 1.6935847266690383, + "grad_norm": 0.9012381187787735, + "learning_rate": 3.230878251045591e-05, + "loss": 0.1714, + "step": 14282 + }, + { + "epoch": 1.6937033084311632, + "grad_norm": 1.0545445204396877, + "learning_rate": 3.230648698502752e-05, + "loss": 0.2061, + "step": 14283 + }, + { + "epoch": 1.6938218901932882, + "grad_norm": 0.7643397508945838, + "learning_rate": 3.2304191392242266e-05, + "loss": 0.1578, + "step": 14284 + }, + { + "epoch": 1.6939404719554134, + "grad_norm": 0.9102177456013667, + "learning_rate": 3.230189573212133e-05, + "loss": 0.1853, + "step": 14285 + }, + { + "epoch": 1.6940590537175382, + "grad_norm": 0.8300364436745771, + "learning_rate": 3.229960000468587e-05, + "loss": 0.1616, + "step": 14286 + }, + { + "epoch": 1.6941776354796634, + "grad_norm": 1.0084698586782475, + "learning_rate": 3.229730420995705e-05, + "loss": 0.2817, + "step": 14287 + }, + { + "epoch": 1.6942962172417881, + "grad_norm": 0.9636559146502206, + "learning_rate": 3.229500834795604e-05, + "loss": 0.1892, + "step": 14288 + }, + { + "epoch": 1.6944147990039133, + "grad_norm": 1.139425922398314, + "learning_rate": 3.229271241870399e-05, + "loss": 0.2079, + "step": 14289 + }, + { + "epoch": 1.694533380766038, + "grad_norm": 0.7620230616739522, + "learning_rate": 3.2290416422222074e-05, + "loss": 0.1705, + "step": 14290 + }, + { + "epoch": 1.6946519625281633, + "grad_norm": 0.9814617053722875, + "learning_rate": 3.228812035853146e-05, + "loss": 0.1945, + "step": 14291 + }, + { + "epoch": 1.694770544290288, + "grad_norm": 0.7171947072142744, + "learning_rate": 3.228582422765332e-05, + "loss": 0.1257, + "step": 14292 + }, + { + "epoch": 1.6948891260524133, + "grad_norm": 0.950484921752032, + "learning_rate": 3.228352802960881e-05, + "loss": 0.1918, + "step": 14293 + }, + { + "epoch": 1.695007707814538, + "grad_norm": 1.0428456605138074, + "learning_rate": 3.22812317644191e-05, + "loss": 0.237, + "step": 14294 + }, + { + "epoch": 1.6951262895766632, + "grad_norm": 0.9159808356419789, + "learning_rate": 3.227893543210537e-05, + "loss": 0.1978, + "step": 14295 + }, + { + "epoch": 1.695244871338788, + "grad_norm": 0.878480374632727, + "learning_rate": 3.227663903268878e-05, + "loss": 0.1673, + "step": 14296 + }, + { + "epoch": 1.6953634531009132, + "grad_norm": 0.8997148555635859, + "learning_rate": 3.22743425661905e-05, + "loss": 0.1629, + "step": 14297 + }, + { + "epoch": 1.695482034863038, + "grad_norm": 1.3192942288418892, + "learning_rate": 3.227204603263169e-05, + "loss": 0.2537, + "step": 14298 + }, + { + "epoch": 1.6956006166251631, + "grad_norm": 0.7809831542144268, + "learning_rate": 3.226974943203355e-05, + "loss": 0.1708, + "step": 14299 + }, + { + "epoch": 1.695719198387288, + "grad_norm": 1.0755577495876862, + "learning_rate": 3.226745276441723e-05, + "loss": 0.2065, + "step": 14300 + }, + { + "epoch": 1.695837780149413, + "grad_norm": 0.7580451012646419, + "learning_rate": 3.22651560298039e-05, + "loss": 0.1819, + "step": 14301 + }, + { + "epoch": 1.6959563619115379, + "grad_norm": 0.7659848256761227, + "learning_rate": 3.2262859228214746e-05, + "loss": 0.191, + "step": 14302 + }, + { + "epoch": 1.696074943673663, + "grad_norm": 0.9064800590510798, + "learning_rate": 3.2260562359670935e-05, + "loss": 0.1769, + "step": 14303 + }, + { + "epoch": 1.6961935254357878, + "grad_norm": 1.0908998892512758, + "learning_rate": 3.2258265424193646e-05, + "loss": 0.2267, + "step": 14304 + }, + { + "epoch": 1.696312107197913, + "grad_norm": 0.9011951749173837, + "learning_rate": 3.225596842180405e-05, + "loss": 0.1577, + "step": 14305 + }, + { + "epoch": 1.6964306889600378, + "grad_norm": 0.8920911156772733, + "learning_rate": 3.2253671352523315e-05, + "loss": 0.1834, + "step": 14306 + }, + { + "epoch": 1.696549270722163, + "grad_norm": 1.0265449277024281, + "learning_rate": 3.2251374216372645e-05, + "loss": 0.2429, + "step": 14307 + }, + { + "epoch": 1.6966678524842878, + "grad_norm": 1.3410379472545695, + "learning_rate": 3.224907701337317e-05, + "loss": 0.338, + "step": 14308 + }, + { + "epoch": 1.696786434246413, + "grad_norm": 1.1830711028116796, + "learning_rate": 3.2246779743546114e-05, + "loss": 0.2714, + "step": 14309 + }, + { + "epoch": 1.696905016008538, + "grad_norm": 0.8928709999156607, + "learning_rate": 3.2244482406912626e-05, + "loss": 0.2257, + "step": 14310 + }, + { + "epoch": 1.697023597770663, + "grad_norm": 1.125398531543924, + "learning_rate": 3.2242185003493896e-05, + "loss": 0.2409, + "step": 14311 + }, + { + "epoch": 1.697142179532788, + "grad_norm": 1.020751157119398, + "learning_rate": 3.22398875333111e-05, + "loss": 0.1834, + "step": 14312 + }, + { + "epoch": 1.6972607612949129, + "grad_norm": 0.7817806795584271, + "learning_rate": 3.223758999638542e-05, + "loss": 0.1359, + "step": 14313 + }, + { + "epoch": 1.6973793430570379, + "grad_norm": 0.6310987881217248, + "learning_rate": 3.2235292392738045e-05, + "loss": 0.1495, + "step": 14314 + }, + { + "epoch": 1.6974979248191628, + "grad_norm": 0.9521644976480976, + "learning_rate": 3.2232994722390136e-05, + "loss": 0.232, + "step": 14315 + }, + { + "epoch": 1.6976165065812878, + "grad_norm": 1.4881539454423385, + "learning_rate": 3.223069698536289e-05, + "loss": 0.2639, + "step": 14316 + }, + { + "epoch": 1.6977350883434128, + "grad_norm": 1.082460481074212, + "learning_rate": 3.222839918167748e-05, + "loss": 0.235, + "step": 14317 + }, + { + "epoch": 1.6978536701055378, + "grad_norm": 0.9187106016533045, + "learning_rate": 3.2226101311355104e-05, + "loss": 0.1737, + "step": 14318 + }, + { + "epoch": 1.6979722518676628, + "grad_norm": 1.0688307675853228, + "learning_rate": 3.2223803374416924e-05, + "loss": 0.2367, + "step": 14319 + }, + { + "epoch": 1.6980908336297877, + "grad_norm": 0.9170388538497265, + "learning_rate": 3.222150537088413e-05, + "loss": 0.1547, + "step": 14320 + }, + { + "epoch": 1.6982094153919127, + "grad_norm": 0.9391343938354756, + "learning_rate": 3.221920730077793e-05, + "loss": 0.1974, + "step": 14321 + }, + { + "epoch": 1.6983279971540377, + "grad_norm": 0.7923324580800825, + "learning_rate": 3.221690916411948e-05, + "loss": 0.1798, + "step": 14322 + }, + { + "epoch": 1.6984465789161627, + "grad_norm": 0.9338046786413092, + "learning_rate": 3.221461096092998e-05, + "loss": 0.2055, + "step": 14323 + }, + { + "epoch": 1.6985651606782877, + "grad_norm": 0.714069990832568, + "learning_rate": 3.221231269123062e-05, + "loss": 0.1733, + "step": 14324 + }, + { + "epoch": 1.6986837424404126, + "grad_norm": 0.9065108220189639, + "learning_rate": 3.221001435504257e-05, + "loss": 0.2207, + "step": 14325 + }, + { + "epoch": 1.6988023242025376, + "grad_norm": 0.8354394537242456, + "learning_rate": 3.220771595238703e-05, + "loss": 0.1568, + "step": 14326 + }, + { + "epoch": 1.6989209059646626, + "grad_norm": 0.7879303006569716, + "learning_rate": 3.2205417483285193e-05, + "loss": 0.2342, + "step": 14327 + }, + { + "epoch": 1.6990394877267876, + "grad_norm": 0.856687765474829, + "learning_rate": 3.220311894775824e-05, + "loss": 0.1771, + "step": 14328 + }, + { + "epoch": 1.6991580694889126, + "grad_norm": 1.1320120803381037, + "learning_rate": 3.220082034582737e-05, + "loss": 0.2494, + "step": 14329 + }, + { + "epoch": 1.6992766512510376, + "grad_norm": 1.0983544129291527, + "learning_rate": 3.219852167751376e-05, + "loss": 0.2675, + "step": 14330 + }, + { + "epoch": 1.6993952330131625, + "grad_norm": 1.0187832635859149, + "learning_rate": 3.219622294283861e-05, + "loss": 0.2202, + "step": 14331 + }, + { + "epoch": 1.6995138147752875, + "grad_norm": 1.2233941274895865, + "learning_rate": 3.2193924141823106e-05, + "loss": 0.3193, + "step": 14332 + }, + { + "epoch": 1.6996323965374125, + "grad_norm": 0.8376101828071144, + "learning_rate": 3.219162527448845e-05, + "loss": 0.178, + "step": 14333 + }, + { + "epoch": 1.6997509782995377, + "grad_norm": 1.0863554769078119, + "learning_rate": 3.218932634085582e-05, + "loss": 0.2376, + "step": 14334 + }, + { + "epoch": 1.6998695600616625, + "grad_norm": 1.1392973531363515, + "learning_rate": 3.2187027340946424e-05, + "loss": 0.2324, + "step": 14335 + }, + { + "epoch": 1.6999881418237877, + "grad_norm": 1.1596004856360398, + "learning_rate": 3.2184728274781446e-05, + "loss": 0.2397, + "step": 14336 + }, + { + "epoch": 1.7001067235859124, + "grad_norm": 1.0485282450148024, + "learning_rate": 3.218242914238209e-05, + "loss": 0.2537, + "step": 14337 + }, + { + "epoch": 1.7002253053480376, + "grad_norm": 0.9311356830322288, + "learning_rate": 3.218012994376953e-05, + "loss": 0.2216, + "step": 14338 + }, + { + "epoch": 1.7003438871101624, + "grad_norm": 1.112709561758907, + "learning_rate": 3.217783067896498e-05, + "loss": 0.1778, + "step": 14339 + }, + { + "epoch": 1.7004624688722876, + "grad_norm": 1.0298722933919786, + "learning_rate": 3.217553134798964e-05, + "loss": 0.2305, + "step": 14340 + }, + { + "epoch": 1.7005810506344123, + "grad_norm": 0.9383487513465345, + "learning_rate": 3.2173231950864696e-05, + "loss": 0.2033, + "step": 14341 + }, + { + "epoch": 1.7006996323965375, + "grad_norm": 0.8722290617473583, + "learning_rate": 3.217093248761136e-05, + "loss": 0.2334, + "step": 14342 + }, + { + "epoch": 1.7008182141586623, + "grad_norm": 1.1348551372330216, + "learning_rate": 3.216863295825081e-05, + "loss": 0.2602, + "step": 14343 + }, + { + "epoch": 1.7009367959207875, + "grad_norm": 0.7470808335404696, + "learning_rate": 3.2166333362804256e-05, + "loss": 0.1536, + "step": 14344 + }, + { + "epoch": 1.7010553776829123, + "grad_norm": 1.0301171212960625, + "learning_rate": 3.21640337012929e-05, + "loss": 0.1996, + "step": 14345 + }, + { + "epoch": 1.7011739594450375, + "grad_norm": 0.6222124543088158, + "learning_rate": 3.216173397373794e-05, + "loss": 0.1306, + "step": 14346 + }, + { + "epoch": 1.7012925412071622, + "grad_norm": 1.9380448676891942, + "learning_rate": 3.215943418016057e-05, + "loss": 0.4509, + "step": 14347 + }, + { + "epoch": 1.7014111229692874, + "grad_norm": 0.9576820265239561, + "learning_rate": 3.2157134320582e-05, + "loss": 0.228, + "step": 14348 + }, + { + "epoch": 1.7015297047314122, + "grad_norm": 0.7868847850207586, + "learning_rate": 3.215483439502342e-05, + "loss": 0.1628, + "step": 14349 + }, + { + "epoch": 1.7016482864935374, + "grad_norm": 1.0860000611034908, + "learning_rate": 3.2152534403506046e-05, + "loss": 0.2482, + "step": 14350 + }, + { + "epoch": 1.7017668682556621, + "grad_norm": 0.8382223751022105, + "learning_rate": 3.215023434605108e-05, + "loss": 0.1914, + "step": 14351 + }, + { + "epoch": 1.7018854500177873, + "grad_norm": 0.9214465297032294, + "learning_rate": 3.2147934222679714e-05, + "loss": 0.1696, + "step": 14352 + }, + { + "epoch": 1.702004031779912, + "grad_norm": 1.1731679409524944, + "learning_rate": 3.2145634033413165e-05, + "loss": 0.2454, + "step": 14353 + }, + { + "epoch": 1.7021226135420373, + "grad_norm": 1.3920866963807712, + "learning_rate": 3.214333377827263e-05, + "loss": 0.2976, + "step": 14354 + }, + { + "epoch": 1.702241195304162, + "grad_norm": 0.7516369243577307, + "learning_rate": 3.214103345727932e-05, + "loss": 0.1316, + "step": 14355 + }, + { + "epoch": 1.7023597770662873, + "grad_norm": 1.0042020473530435, + "learning_rate": 3.213873307045443e-05, + "loss": 0.2031, + "step": 14356 + }, + { + "epoch": 1.702478358828412, + "grad_norm": 1.2436057194094277, + "learning_rate": 3.213643261781919e-05, + "loss": 0.2655, + "step": 14357 + }, + { + "epoch": 1.7025969405905372, + "grad_norm": 0.9988436050462884, + "learning_rate": 3.2134132099394785e-05, + "loss": 0.2326, + "step": 14358 + }, + { + "epoch": 1.7027155223526622, + "grad_norm": 0.7582132336274057, + "learning_rate": 3.2131831515202424e-05, + "loss": 0.1549, + "step": 14359 + }, + { + "epoch": 1.7028341041147872, + "grad_norm": 0.7838316232042197, + "learning_rate": 3.212953086526333e-05, + "loss": 0.1634, + "step": 14360 + }, + { + "epoch": 1.7029526858769122, + "grad_norm": 0.8156811148349372, + "learning_rate": 3.2127230149598705e-05, + "loss": 0.1974, + "step": 14361 + }, + { + "epoch": 1.7030712676390372, + "grad_norm": 1.0278841209748253, + "learning_rate": 3.212492936822975e-05, + "loss": 0.1902, + "step": 14362 + }, + { + "epoch": 1.7031898494011621, + "grad_norm": 0.9891798769655182, + "learning_rate": 3.21226285211777e-05, + "loss": 0.2114, + "step": 14363 + }, + { + "epoch": 1.7033084311632871, + "grad_norm": 1.0232011285856142, + "learning_rate": 3.212032760846374e-05, + "loss": 0.1523, + "step": 14364 + }, + { + "epoch": 1.703427012925412, + "grad_norm": 1.1735963085960446, + "learning_rate": 3.211802663010909e-05, + "loss": 0.2042, + "step": 14365 + }, + { + "epoch": 1.703545594687537, + "grad_norm": 0.9256364140426385, + "learning_rate": 3.211572558613496e-05, + "loss": 0.1884, + "step": 14366 + }, + { + "epoch": 1.703664176449662, + "grad_norm": 1.1856690785888904, + "learning_rate": 3.211342447656257e-05, + "loss": 0.3231, + "step": 14367 + }, + { + "epoch": 1.703782758211787, + "grad_norm": 0.6223672381479726, + "learning_rate": 3.2111123301413125e-05, + "loss": 0.1332, + "step": 14368 + }, + { + "epoch": 1.703901339973912, + "grad_norm": 0.9280122208304437, + "learning_rate": 3.210882206070784e-05, + "loss": 0.2137, + "step": 14369 + }, + { + "epoch": 1.704019921736037, + "grad_norm": 0.7933607842249649, + "learning_rate": 3.210652075446794e-05, + "loss": 0.148, + "step": 14370 + }, + { + "epoch": 1.704138503498162, + "grad_norm": 0.8177689811325, + "learning_rate": 3.2104219382714634e-05, + "loss": 0.1521, + "step": 14371 + }, + { + "epoch": 1.704257085260287, + "grad_norm": 0.9672172544348444, + "learning_rate": 3.210191794546914e-05, + "loss": 0.1784, + "step": 14372 + }, + { + "epoch": 1.704375667022412, + "grad_norm": 0.8651484599126852, + "learning_rate": 3.2099616442752664e-05, + "loss": 0.2006, + "step": 14373 + }, + { + "epoch": 1.704494248784537, + "grad_norm": 1.0411017717012006, + "learning_rate": 3.2097314874586434e-05, + "loss": 0.1767, + "step": 14374 + }, + { + "epoch": 1.704612830546662, + "grad_norm": 0.8963069927170062, + "learning_rate": 3.209501324099167e-05, + "loss": 0.1779, + "step": 14375 + }, + { + "epoch": 1.7047314123087869, + "grad_norm": 0.8736136327189761, + "learning_rate": 3.209271154198958e-05, + "loss": 0.2011, + "step": 14376 + }, + { + "epoch": 1.7048499940709119, + "grad_norm": 1.3124292852641823, + "learning_rate": 3.2090409777601384e-05, + "loss": 0.2738, + "step": 14377 + }, + { + "epoch": 1.7049685758330368, + "grad_norm": 1.05992843279346, + "learning_rate": 3.208810794784831e-05, + "loss": 0.2229, + "step": 14378 + }, + { + "epoch": 1.7050871575951618, + "grad_norm": 0.981017518873297, + "learning_rate": 3.2085806052751565e-05, + "loss": 0.1746, + "step": 14379 + }, + { + "epoch": 1.7052057393572868, + "grad_norm": 0.9036265900102088, + "learning_rate": 3.208350409233238e-05, + "loss": 0.2209, + "step": 14380 + }, + { + "epoch": 1.7053243211194118, + "grad_norm": 1.1464752497406812, + "learning_rate": 3.208120206661197e-05, + "loss": 0.2591, + "step": 14381 + }, + { + "epoch": 1.7054429028815368, + "grad_norm": 0.7523007703751239, + "learning_rate": 3.207889997561157e-05, + "loss": 0.1714, + "step": 14382 + }, + { + "epoch": 1.7055614846436618, + "grad_norm": 0.6646801582795583, + "learning_rate": 3.207659781935239e-05, + "loss": 0.1582, + "step": 14383 + }, + { + "epoch": 1.7056800664057867, + "grad_norm": 0.724726640756388, + "learning_rate": 3.207429559785565e-05, + "loss": 0.1659, + "step": 14384 + }, + { + "epoch": 1.705798648167912, + "grad_norm": 0.8541630499487631, + "learning_rate": 3.207199331114259e-05, + "loss": 0.1417, + "step": 14385 + }, + { + "epoch": 1.7059172299300367, + "grad_norm": 0.7007196704498605, + "learning_rate": 3.206969095923441e-05, + "loss": 0.1577, + "step": 14386 + }, + { + "epoch": 1.706035811692162, + "grad_norm": 1.1821680770203713, + "learning_rate": 3.206738854215237e-05, + "loss": 0.2503, + "step": 14387 + }, + { + "epoch": 1.7061543934542867, + "grad_norm": 0.9030027573591347, + "learning_rate": 3.206508605991765e-05, + "loss": 0.2073, + "step": 14388 + }, + { + "epoch": 1.7062729752164119, + "grad_norm": 0.7834400114695371, + "learning_rate": 3.206278351255151e-05, + "loss": 0.163, + "step": 14389 + }, + { + "epoch": 1.7063915569785366, + "grad_norm": 0.6964224228006413, + "learning_rate": 3.206048090007517e-05, + "loss": 0.171, + "step": 14390 + }, + { + "epoch": 1.7065101387406618, + "grad_norm": 0.8464832744019939, + "learning_rate": 3.2058178222509854e-05, + "loss": 0.1823, + "step": 14391 + }, + { + "epoch": 1.7066287205027866, + "grad_norm": 0.5881905811237658, + "learning_rate": 3.205587547987678e-05, + "loss": 0.1487, + "step": 14392 + }, + { + "epoch": 1.7067473022649118, + "grad_norm": 0.8637996161147478, + "learning_rate": 3.2053572672197206e-05, + "loss": 0.1172, + "step": 14393 + }, + { + "epoch": 1.7068658840270365, + "grad_norm": 0.897821095796049, + "learning_rate": 3.2051269799492326e-05, + "loss": 0.2323, + "step": 14394 + }, + { + "epoch": 1.7069844657891617, + "grad_norm": 1.3442617969415227, + "learning_rate": 3.2048966861783394e-05, + "loss": 0.233, + "step": 14395 + }, + { + "epoch": 1.7071030475512865, + "grad_norm": 0.7950285480812964, + "learning_rate": 3.2046663859091625e-05, + "loss": 0.1609, + "step": 14396 + }, + { + "epoch": 1.7072216293134117, + "grad_norm": 0.9085797349747294, + "learning_rate": 3.204436079143826e-05, + "loss": 0.2348, + "step": 14397 + }, + { + "epoch": 1.7073402110755365, + "grad_norm": 1.32027297954256, + "learning_rate": 3.2042057658844525e-05, + "loss": 0.217, + "step": 14398 + }, + { + "epoch": 1.7074587928376617, + "grad_norm": 0.7015987263759242, + "learning_rate": 3.2039754461331655e-05, + "loss": 0.1374, + "step": 14399 + }, + { + "epoch": 1.7075773745997864, + "grad_norm": 1.207298092078933, + "learning_rate": 3.203745119892088e-05, + "loss": 0.2366, + "step": 14400 + }, + { + "epoch": 1.7076959563619116, + "grad_norm": 0.5733050438931029, + "learning_rate": 3.2035147871633436e-05, + "loss": 0.1328, + "step": 14401 + }, + { + "epoch": 1.7078145381240364, + "grad_norm": 0.6859523994557106, + "learning_rate": 3.203284447949056e-05, + "loss": 0.1391, + "step": 14402 + }, + { + "epoch": 1.7079331198861616, + "grad_norm": 1.026465653364107, + "learning_rate": 3.2030541022513475e-05, + "loss": 0.2301, + "step": 14403 + }, + { + "epoch": 1.7080517016482863, + "grad_norm": 1.1178728951405832, + "learning_rate": 3.202823750072343e-05, + "loss": 0.1783, + "step": 14404 + }, + { + "epoch": 1.7081702834104115, + "grad_norm": 1.0310775165539028, + "learning_rate": 3.202593391414165e-05, + "loss": 0.2206, + "step": 14405 + }, + { + "epoch": 1.7082888651725363, + "grad_norm": 0.7659303446911929, + "learning_rate": 3.2023630262789375e-05, + "loss": 0.1729, + "step": 14406 + }, + { + "epoch": 1.7084074469346615, + "grad_norm": 0.7804771486671647, + "learning_rate": 3.202132654668784e-05, + "loss": 0.164, + "step": 14407 + }, + { + "epoch": 1.7085260286967863, + "grad_norm": 0.8717018199386842, + "learning_rate": 3.201902276585828e-05, + "loss": 0.1714, + "step": 14408 + }, + { + "epoch": 1.7086446104589115, + "grad_norm": 0.7940625192900783, + "learning_rate": 3.201671892032195e-05, + "loss": 0.1388, + "step": 14409 + }, + { + "epoch": 1.7087631922210365, + "grad_norm": 0.9017368388438781, + "learning_rate": 3.201441501010007e-05, + "loss": 0.2225, + "step": 14410 + }, + { + "epoch": 1.7088817739831614, + "grad_norm": 0.7926553873046902, + "learning_rate": 3.2012111035213885e-05, + "loss": 0.1898, + "step": 14411 + }, + { + "epoch": 1.7090003557452864, + "grad_norm": 1.6557510366067474, + "learning_rate": 3.200980699568463e-05, + "loss": 0.3016, + "step": 14412 + }, + { + "epoch": 1.7091189375074114, + "grad_norm": 1.1992075927114936, + "learning_rate": 3.200750289153356e-05, + "loss": 0.2226, + "step": 14413 + }, + { + "epoch": 1.7092375192695364, + "grad_norm": 0.7734003556793038, + "learning_rate": 3.2005198722781905e-05, + "loss": 0.1768, + "step": 14414 + }, + { + "epoch": 1.7093561010316614, + "grad_norm": 1.3195009908092206, + "learning_rate": 3.200289448945091e-05, + "loss": 0.2834, + "step": 14415 + }, + { + "epoch": 1.7094746827937863, + "grad_norm": 1.012636333412804, + "learning_rate": 3.20005901915618e-05, + "loss": 0.1815, + "step": 14416 + }, + { + "epoch": 1.7095932645559113, + "grad_norm": 1.505726959635743, + "learning_rate": 3.199828582913585e-05, + "loss": 0.3652, + "step": 14417 + }, + { + "epoch": 1.7097118463180363, + "grad_norm": 1.3079484695785732, + "learning_rate": 3.199598140219428e-05, + "loss": 0.2513, + "step": 14418 + }, + { + "epoch": 1.7098304280801613, + "grad_norm": 1.1767643983439344, + "learning_rate": 3.199367691075834e-05, + "loss": 0.2825, + "step": 14419 + }, + { + "epoch": 1.7099490098422863, + "grad_norm": 0.8973611552476327, + "learning_rate": 3.199137235484927e-05, + "loss": 0.1243, + "step": 14420 + }, + { + "epoch": 1.7100675916044112, + "grad_norm": 0.8312354682888484, + "learning_rate": 3.1989067734488335e-05, + "loss": 0.1287, + "step": 14421 + }, + { + "epoch": 1.7101861733665362, + "grad_norm": 0.8103411608119966, + "learning_rate": 3.1986763049696753e-05, + "loss": 0.1335, + "step": 14422 + }, + { + "epoch": 1.7103047551286612, + "grad_norm": 1.199392976776145, + "learning_rate": 3.198445830049579e-05, + "loss": 0.2333, + "step": 14423 + }, + { + "epoch": 1.7104233368907862, + "grad_norm": 0.7904740900447842, + "learning_rate": 3.198215348690668e-05, + "loss": 0.1563, + "step": 14424 + }, + { + "epoch": 1.7105419186529112, + "grad_norm": 0.8613864847459698, + "learning_rate": 3.197984860895069e-05, + "loss": 0.2045, + "step": 14425 + }, + { + "epoch": 1.7106605004150361, + "grad_norm": 0.5740395435457274, + "learning_rate": 3.197754366664904e-05, + "loss": 0.1137, + "step": 14426 + }, + { + "epoch": 1.7107790821771611, + "grad_norm": 0.8255860097000646, + "learning_rate": 3.1975238660022996e-05, + "loss": 0.2004, + "step": 14427 + }, + { + "epoch": 1.710897663939286, + "grad_norm": 1.0083153750957559, + "learning_rate": 3.1972933589093814e-05, + "loss": 0.2223, + "step": 14428 + }, + { + "epoch": 1.711016245701411, + "grad_norm": 0.9552836604904347, + "learning_rate": 3.1970628453882725e-05, + "loss": 0.2089, + "step": 14429 + }, + { + "epoch": 1.711134827463536, + "grad_norm": 1.801131278881435, + "learning_rate": 3.1968323254411e-05, + "loss": 0.454, + "step": 14430 + }, + { + "epoch": 1.711253409225661, + "grad_norm": 0.9183282144579255, + "learning_rate": 3.196601799069987e-05, + "loss": 0.1873, + "step": 14431 + }, + { + "epoch": 1.711371990987786, + "grad_norm": 0.9465792578950817, + "learning_rate": 3.1963712662770606e-05, + "loss": 0.2504, + "step": 14432 + }, + { + "epoch": 1.711490572749911, + "grad_norm": 0.7555496378763414, + "learning_rate": 3.1961407270644436e-05, + "loss": 0.1287, + "step": 14433 + }, + { + "epoch": 1.7116091545120362, + "grad_norm": 0.8323256270271441, + "learning_rate": 3.1959101814342646e-05, + "loss": 0.1891, + "step": 14434 + }, + { + "epoch": 1.711727736274161, + "grad_norm": 1.1165041249613685, + "learning_rate": 3.195679629388646e-05, + "loss": 0.294, + "step": 14435 + }, + { + "epoch": 1.7118463180362862, + "grad_norm": 1.3542579324728636, + "learning_rate": 3.195449070929715e-05, + "loss": 0.2636, + "step": 14436 + }, + { + "epoch": 1.711964899798411, + "grad_norm": 1.3517711025316923, + "learning_rate": 3.195218506059596e-05, + "loss": 0.2875, + "step": 14437 + }, + { + "epoch": 1.7120834815605361, + "grad_norm": 0.834909995254865, + "learning_rate": 3.194987934780415e-05, + "loss": 0.1645, + "step": 14438 + }, + { + "epoch": 1.712202063322661, + "grad_norm": 0.8012395779549538, + "learning_rate": 3.194757357094297e-05, + "loss": 0.1913, + "step": 14439 + }, + { + "epoch": 1.712320645084786, + "grad_norm": 0.79732840573039, + "learning_rate": 3.1945267730033686e-05, + "loss": 0.1733, + "step": 14440 + }, + { + "epoch": 1.7124392268469109, + "grad_norm": 0.8590946993592105, + "learning_rate": 3.194296182509755e-05, + "loss": 0.1909, + "step": 14441 + }, + { + "epoch": 1.712557808609036, + "grad_norm": 1.2009172648701005, + "learning_rate": 3.194065585615581e-05, + "loss": 0.3501, + "step": 14442 + }, + { + "epoch": 1.7126763903711608, + "grad_norm": 1.1693442182592593, + "learning_rate": 3.193834982322975e-05, + "loss": 0.2055, + "step": 14443 + }, + { + "epoch": 1.712794972133286, + "grad_norm": 0.9665331778877507, + "learning_rate": 3.19360437263406e-05, + "loss": 0.2199, + "step": 14444 + }, + { + "epoch": 1.7129135538954108, + "grad_norm": 1.25097070289549, + "learning_rate": 3.1933737565509645e-05, + "loss": 0.2817, + "step": 14445 + }, + { + "epoch": 1.713032135657536, + "grad_norm": 0.7072413272076769, + "learning_rate": 3.1931431340758124e-05, + "loss": 0.1373, + "step": 14446 + }, + { + "epoch": 1.7131507174196607, + "grad_norm": 0.810403515792779, + "learning_rate": 3.192912505210731e-05, + "loss": 0.1488, + "step": 14447 + }, + { + "epoch": 1.713269299181786, + "grad_norm": 0.7917813560033571, + "learning_rate": 3.192681869957845e-05, + "loss": 0.1323, + "step": 14448 + }, + { + "epoch": 1.7133878809439107, + "grad_norm": 0.618844083562031, + "learning_rate": 3.192451228319282e-05, + "loss": 0.1104, + "step": 14449 + }, + { + "epoch": 1.713506462706036, + "grad_norm": 1.4926167421919925, + "learning_rate": 3.192220580297169e-05, + "loss": 0.3628, + "step": 14450 + }, + { + "epoch": 1.7136250444681607, + "grad_norm": 1.1214632623017813, + "learning_rate": 3.19198992589363e-05, + "loss": 0.2879, + "step": 14451 + }, + { + "epoch": 1.7137436262302859, + "grad_norm": 0.7843127695533759, + "learning_rate": 3.191759265110793e-05, + "loss": 0.1737, + "step": 14452 + }, + { + "epoch": 1.7138622079924106, + "grad_norm": 0.7532382931113236, + "learning_rate": 3.191528597950784e-05, + "loss": 0.1836, + "step": 14453 + }, + { + "epoch": 1.7139807897545358, + "grad_norm": 0.956089162159907, + "learning_rate": 3.191297924415729e-05, + "loss": 0.235, + "step": 14454 + }, + { + "epoch": 1.7140993715166606, + "grad_norm": 0.7601254303996268, + "learning_rate": 3.1910672445077544e-05, + "loss": 0.1711, + "step": 14455 + }, + { + "epoch": 1.7142179532787858, + "grad_norm": 0.8127578512115333, + "learning_rate": 3.190836558228988e-05, + "loss": 0.1721, + "step": 14456 + }, + { + "epoch": 1.7143365350409105, + "grad_norm": 0.8721585421318665, + "learning_rate": 3.190605865581555e-05, + "loss": 0.1829, + "step": 14457 + }, + { + "epoch": 1.7144551168030358, + "grad_norm": 0.7232642153583739, + "learning_rate": 3.190375166567584e-05, + "loss": 0.1414, + "step": 14458 + }, + { + "epoch": 1.7145736985651607, + "grad_norm": 0.8396165863250783, + "learning_rate": 3.190144461189199e-05, + "loss": 0.1581, + "step": 14459 + }, + { + "epoch": 1.7146922803272857, + "grad_norm": 0.939144813352631, + "learning_rate": 3.189913749448529e-05, + "loss": 0.2342, + "step": 14460 + }, + { + "epoch": 1.7148108620894107, + "grad_norm": 0.7733284907121141, + "learning_rate": 3.1896830313477e-05, + "loss": 0.2024, + "step": 14461 + }, + { + "epoch": 1.7149294438515357, + "grad_norm": 0.8416801156159839, + "learning_rate": 3.18945230688884e-05, + "loss": 0.1444, + "step": 14462 + }, + { + "epoch": 1.7150480256136607, + "grad_norm": 0.9890691389491307, + "learning_rate": 3.189221576074075e-05, + "loss": 0.1663, + "step": 14463 + }, + { + "epoch": 1.7151666073757856, + "grad_norm": 1.5299653158251565, + "learning_rate": 3.188990838905532e-05, + "loss": 0.3492, + "step": 14464 + }, + { + "epoch": 1.7152851891379106, + "grad_norm": 1.3525288257426356, + "learning_rate": 3.188760095385338e-05, + "loss": 0.2389, + "step": 14465 + }, + { + "epoch": 1.7154037709000356, + "grad_norm": 1.2092095101000393, + "learning_rate": 3.188529345515621e-05, + "loss": 0.3104, + "step": 14466 + }, + { + "epoch": 1.7155223526621606, + "grad_norm": 0.7222490010699802, + "learning_rate": 3.188298589298507e-05, + "loss": 0.1803, + "step": 14467 + }, + { + "epoch": 1.7156409344242856, + "grad_norm": 1.5314844263476861, + "learning_rate": 3.1880678267361255e-05, + "loss": 0.2898, + "step": 14468 + }, + { + "epoch": 1.7157595161864105, + "grad_norm": 1.0589536622103803, + "learning_rate": 3.187837057830601e-05, + "loss": 0.2293, + "step": 14469 + }, + { + "epoch": 1.7158780979485355, + "grad_norm": 0.8692651385264556, + "learning_rate": 3.1876062825840624e-05, + "loss": 0.1834, + "step": 14470 + }, + { + "epoch": 1.7159966797106605, + "grad_norm": 0.8809533546791133, + "learning_rate": 3.1873755009986386e-05, + "loss": 0.208, + "step": 14471 + }, + { + "epoch": 1.7161152614727855, + "grad_norm": 1.0042004019198645, + "learning_rate": 3.187144713076454e-05, + "loss": 0.2313, + "step": 14472 + }, + { + "epoch": 1.7162338432349105, + "grad_norm": 0.9594416094795362, + "learning_rate": 3.186913918819639e-05, + "loss": 0.1747, + "step": 14473 + }, + { + "epoch": 1.7163524249970354, + "grad_norm": 0.7783018548883502, + "learning_rate": 3.186683118230319e-05, + "loss": 0.1678, + "step": 14474 + }, + { + "epoch": 1.7164710067591604, + "grad_norm": 0.6484011055315203, + "learning_rate": 3.1864523113106236e-05, + "loss": 0.135, + "step": 14475 + }, + { + "epoch": 1.7165895885212854, + "grad_norm": 0.8111429776270523, + "learning_rate": 3.186221498062679e-05, + "loss": 0.1995, + "step": 14476 + }, + { + "epoch": 1.7167081702834104, + "grad_norm": 0.9302421863355155, + "learning_rate": 3.185990678488614e-05, + "loss": 0.1929, + "step": 14477 + }, + { + "epoch": 1.7168267520455354, + "grad_norm": 0.793150857126126, + "learning_rate": 3.185759852590556e-05, + "loss": 0.1509, + "step": 14478 + }, + { + "epoch": 1.7169453338076603, + "grad_norm": 0.7944312864803509, + "learning_rate": 3.185529020370633e-05, + "loss": 0.1586, + "step": 14479 + }, + { + "epoch": 1.7170639155697853, + "grad_norm": 0.8599864137641585, + "learning_rate": 3.1852981818309724e-05, + "loss": 0.2101, + "step": 14480 + }, + { + "epoch": 1.7171824973319103, + "grad_norm": 0.8693744679999772, + "learning_rate": 3.185067336973705e-05, + "loss": 0.1665, + "step": 14481 + }, + { + "epoch": 1.7173010790940353, + "grad_norm": 0.8081608916265947, + "learning_rate": 3.184836485800955e-05, + "loss": 0.1766, + "step": 14482 + }, + { + "epoch": 1.7174196608561603, + "grad_norm": 0.9715105916256286, + "learning_rate": 3.184605628314853e-05, + "loss": 0.1701, + "step": 14483 + }, + { + "epoch": 1.7175382426182852, + "grad_norm": 0.9581644864955162, + "learning_rate": 3.184374764517526e-05, + "loss": 0.1882, + "step": 14484 + }, + { + "epoch": 1.7176568243804105, + "grad_norm": 0.8833760651880589, + "learning_rate": 3.1841438944111035e-05, + "loss": 0.2013, + "step": 14485 + }, + { + "epoch": 1.7177754061425352, + "grad_norm": 1.0214604903444913, + "learning_rate": 3.183913017997714e-05, + "loss": 0.2223, + "step": 14486 + }, + { + "epoch": 1.7178939879046604, + "grad_norm": 0.5776017659102184, + "learning_rate": 3.1836821352794844e-05, + "loss": 0.131, + "step": 14487 + }, + { + "epoch": 1.7180125696667852, + "grad_norm": 0.9063835995731884, + "learning_rate": 3.183451246258543e-05, + "loss": 0.177, + "step": 14488 + }, + { + "epoch": 1.7181311514289104, + "grad_norm": 1.3085849540585721, + "learning_rate": 3.18322035093702e-05, + "loss": 0.2757, + "step": 14489 + }, + { + "epoch": 1.7182497331910351, + "grad_norm": 1.0870020520330599, + "learning_rate": 3.1829894493170434e-05, + "loss": 0.2189, + "step": 14490 + }, + { + "epoch": 1.7183683149531603, + "grad_norm": 0.908359803694033, + "learning_rate": 3.1827585414007416e-05, + "loss": 0.1393, + "step": 14491 + }, + { + "epoch": 1.718486896715285, + "grad_norm": 1.038490096805204, + "learning_rate": 3.182527627190244e-05, + "loss": 0.1598, + "step": 14492 + }, + { + "epoch": 1.7186054784774103, + "grad_norm": 1.2932794047699467, + "learning_rate": 3.1822967066876775e-05, + "loss": 0.3022, + "step": 14493 + }, + { + "epoch": 1.718724060239535, + "grad_norm": 1.1627319558844944, + "learning_rate": 3.1820657798951726e-05, + "loss": 0.2563, + "step": 14494 + }, + { + "epoch": 1.7188426420016603, + "grad_norm": 1.2859874581480395, + "learning_rate": 3.1818348468148576e-05, + "loss": 0.2711, + "step": 14495 + }, + { + "epoch": 1.718961223763785, + "grad_norm": 0.8738329261586313, + "learning_rate": 3.1816039074488614e-05, + "loss": 0.1693, + "step": 14496 + }, + { + "epoch": 1.7190798055259102, + "grad_norm": 0.829466313832031, + "learning_rate": 3.181372961799313e-05, + "loss": 0.1599, + "step": 14497 + }, + { + "epoch": 1.719198387288035, + "grad_norm": 0.9080881280498784, + "learning_rate": 3.1811420098683404e-05, + "loss": 0.1888, + "step": 14498 + }, + { + "epoch": 1.7193169690501602, + "grad_norm": 1.2063387813040105, + "learning_rate": 3.180911051658074e-05, + "loss": 0.2997, + "step": 14499 + }, + { + "epoch": 1.719435550812285, + "grad_norm": 0.6674956455369669, + "learning_rate": 3.1806800871706436e-05, + "loss": 0.1575, + "step": 14500 + }, + { + "epoch": 1.7195541325744101, + "grad_norm": 1.2815825218293206, + "learning_rate": 3.180449116408178e-05, + "loss": 0.2943, + "step": 14501 + }, + { + "epoch": 1.719672714336535, + "grad_norm": 0.8348729240341957, + "learning_rate": 3.1802181393728046e-05, + "loss": 0.1606, + "step": 14502 + }, + { + "epoch": 1.71979129609866, + "grad_norm": 0.999547458441492, + "learning_rate": 3.179987156066655e-05, + "loss": 0.2131, + "step": 14503 + }, + { + "epoch": 1.7199098778607849, + "grad_norm": 1.131820309122839, + "learning_rate": 3.1797561664918575e-05, + "loss": 0.2113, + "step": 14504 + }, + { + "epoch": 1.72002845962291, + "grad_norm": 0.8241425445268646, + "learning_rate": 3.179525170650542e-05, + "loss": 0.2499, + "step": 14505 + }, + { + "epoch": 1.7201470413850348, + "grad_norm": 1.0487402773783634, + "learning_rate": 3.1792941685448366e-05, + "loss": 0.2458, + "step": 14506 + }, + { + "epoch": 1.72026562314716, + "grad_norm": 0.9065824519651341, + "learning_rate": 3.1790631601768735e-05, + "loss": 0.2162, + "step": 14507 + }, + { + "epoch": 1.7203842049092848, + "grad_norm": 0.9726148633173768, + "learning_rate": 3.17883214554878e-05, + "loss": 0.1741, + "step": 14508 + }, + { + "epoch": 1.72050278667141, + "grad_norm": 0.8307844591905694, + "learning_rate": 3.178601124662686e-05, + "loss": 0.168, + "step": 14509 + }, + { + "epoch": 1.720621368433535, + "grad_norm": 0.8227844705909498, + "learning_rate": 3.178370097520722e-05, + "loss": 0.1375, + "step": 14510 + }, + { + "epoch": 1.72073995019566, + "grad_norm": 0.7566125071633455, + "learning_rate": 3.178139064125018e-05, + "loss": 0.1541, + "step": 14511 + }, + { + "epoch": 1.720858531957785, + "grad_norm": 0.9510337561407485, + "learning_rate": 3.177908024477703e-05, + "loss": 0.1731, + "step": 14512 + }, + { + "epoch": 1.72097711371991, + "grad_norm": 1.7527500078133933, + "learning_rate": 3.177676978580908e-05, + "loss": 0.3238, + "step": 14513 + }, + { + "epoch": 1.721095695482035, + "grad_norm": 0.7056392366113586, + "learning_rate": 3.1774459264367616e-05, + "loss": 0.1839, + "step": 14514 + }, + { + "epoch": 1.7212142772441599, + "grad_norm": 0.7917041230967323, + "learning_rate": 3.177214868047395e-05, + "loss": 0.1616, + "step": 14515 + }, + { + "epoch": 1.7213328590062849, + "grad_norm": 0.8645588110964616, + "learning_rate": 3.176983803414938e-05, + "loss": 0.1584, + "step": 14516 + }, + { + "epoch": 1.7214514407684098, + "grad_norm": 0.9805061859537454, + "learning_rate": 3.1767527325415195e-05, + "loss": 0.2019, + "step": 14517 + }, + { + "epoch": 1.7215700225305348, + "grad_norm": 1.2901144715957598, + "learning_rate": 3.176521655429271e-05, + "loss": 0.2659, + "step": 14518 + }, + { + "epoch": 1.7216886042926598, + "grad_norm": 0.905454507554192, + "learning_rate": 3.1762905720803226e-05, + "loss": 0.1713, + "step": 14519 + }, + { + "epoch": 1.7218071860547848, + "grad_norm": 0.9696607283145809, + "learning_rate": 3.1760594824968056e-05, + "loss": 0.2167, + "step": 14520 + }, + { + "epoch": 1.7219257678169098, + "grad_norm": 0.9436504721834307, + "learning_rate": 3.1758283866808474e-05, + "loss": 0.2003, + "step": 14521 + }, + { + "epoch": 1.7220443495790347, + "grad_norm": 0.8668647695847328, + "learning_rate": 3.1755972846345816e-05, + "loss": 0.1763, + "step": 14522 + }, + { + "epoch": 1.7221629313411597, + "grad_norm": 0.8201692519258736, + "learning_rate": 3.175366176360136e-05, + "loss": 0.1608, + "step": 14523 + }, + { + "epoch": 1.7222815131032847, + "grad_norm": 0.7972566510047387, + "learning_rate": 3.175135061859644e-05, + "loss": 0.1858, + "step": 14524 + }, + { + "epoch": 1.7224000948654097, + "grad_norm": 1.0115792461850481, + "learning_rate": 3.174903941135233e-05, + "loss": 0.2111, + "step": 14525 + }, + { + "epoch": 1.7225186766275347, + "grad_norm": 1.0401181340598753, + "learning_rate": 3.174672814189037e-05, + "loss": 0.2422, + "step": 14526 + }, + { + "epoch": 1.7226372583896596, + "grad_norm": 1.1008857995270602, + "learning_rate": 3.1744416810231837e-05, + "loss": 0.1973, + "step": 14527 + }, + { + "epoch": 1.7227558401517846, + "grad_norm": 0.7867896563390347, + "learning_rate": 3.174210541639805e-05, + "loss": 0.1782, + "step": 14528 + }, + { + "epoch": 1.7228744219139096, + "grad_norm": 0.6187479793525065, + "learning_rate": 3.1739793960410325e-05, + "loss": 0.0961, + "step": 14529 + }, + { + "epoch": 1.7229930036760346, + "grad_norm": 0.9962805683900869, + "learning_rate": 3.173748244228996e-05, + "loss": 0.1841, + "step": 14530 + }, + { + "epoch": 1.7231115854381596, + "grad_norm": 1.1940362274642846, + "learning_rate": 3.173517086205828e-05, + "loss": 0.2741, + "step": 14531 + }, + { + "epoch": 1.7232301672002845, + "grad_norm": 0.7958229115003665, + "learning_rate": 3.173285921973657e-05, + "loss": 0.1235, + "step": 14532 + }, + { + "epoch": 1.7233487489624095, + "grad_norm": 0.8913708104572627, + "learning_rate": 3.173054751534617e-05, + "loss": 0.1707, + "step": 14533 + }, + { + "epoch": 1.7234673307245347, + "grad_norm": 0.9908137164689471, + "learning_rate": 3.172823574890836e-05, + "loss": 0.2132, + "step": 14534 + }, + { + "epoch": 1.7235859124866595, + "grad_norm": 0.808241532981632, + "learning_rate": 3.1725923920444476e-05, + "loss": 0.1921, + "step": 14535 + }, + { + "epoch": 1.7237044942487847, + "grad_norm": 1.0061510629146675, + "learning_rate": 3.172361202997582e-05, + "loss": 0.1966, + "step": 14536 + }, + { + "epoch": 1.7238230760109094, + "grad_norm": 0.8448355590640095, + "learning_rate": 3.1721300077523705e-05, + "loss": 0.1726, + "step": 14537 + }, + { + "epoch": 1.7239416577730347, + "grad_norm": 0.9487251009104689, + "learning_rate": 3.1718988063109436e-05, + "loss": 0.1685, + "step": 14538 + }, + { + "epoch": 1.7240602395351594, + "grad_norm": 1.189346051791899, + "learning_rate": 3.171667598675435e-05, + "loss": 0.188, + "step": 14539 + }, + { + "epoch": 1.7241788212972846, + "grad_norm": 1.016714922959491, + "learning_rate": 3.171436384847974e-05, + "loss": 0.1669, + "step": 14540 + }, + { + "epoch": 1.7242974030594094, + "grad_norm": 1.2061718095399188, + "learning_rate": 3.171205164830693e-05, + "loss": 0.1919, + "step": 14541 + }, + { + "epoch": 1.7244159848215346, + "grad_norm": 0.9893678822433706, + "learning_rate": 3.170973938625724e-05, + "loss": 0.209, + "step": 14542 + }, + { + "epoch": 1.7245345665836593, + "grad_norm": 0.6586631830792824, + "learning_rate": 3.170742706235198e-05, + "loss": 0.1459, + "step": 14543 + }, + { + "epoch": 1.7246531483457845, + "grad_norm": 0.973762807934994, + "learning_rate": 3.170511467661247e-05, + "loss": 0.183, + "step": 14544 + }, + { + "epoch": 1.7247717301079093, + "grad_norm": 1.3280351125221626, + "learning_rate": 3.170280222906002e-05, + "loss": 0.2568, + "step": 14545 + }, + { + "epoch": 1.7248903118700345, + "grad_norm": 1.004824430485803, + "learning_rate": 3.1700489719715956e-05, + "loss": 0.2022, + "step": 14546 + }, + { + "epoch": 1.7250088936321593, + "grad_norm": 0.8886490918581185, + "learning_rate": 3.1698177148601595e-05, + "loss": 0.2135, + "step": 14547 + }, + { + "epoch": 1.7251274753942845, + "grad_norm": 1.1793397168902264, + "learning_rate": 3.169586451573825e-05, + "loss": 0.2246, + "step": 14548 + }, + { + "epoch": 1.7252460571564092, + "grad_norm": 1.2359827850832896, + "learning_rate": 3.169355182114724e-05, + "loss": 0.2182, + "step": 14549 + }, + { + "epoch": 1.7253646389185344, + "grad_norm": 1.0388691469616662, + "learning_rate": 3.16912390648499e-05, + "loss": 0.1664, + "step": 14550 + }, + { + "epoch": 1.7254832206806592, + "grad_norm": 1.6728604780754945, + "learning_rate": 3.1688926246867534e-05, + "loss": 0.3015, + "step": 14551 + }, + { + "epoch": 1.7256018024427844, + "grad_norm": 0.7428732951354476, + "learning_rate": 3.1686613367221476e-05, + "loss": 0.1739, + "step": 14552 + }, + { + "epoch": 1.7257203842049091, + "grad_norm": 1.0511553399653415, + "learning_rate": 3.1684300425933036e-05, + "loss": 0.2352, + "step": 14553 + }, + { + "epoch": 1.7258389659670343, + "grad_norm": 2.008443789479163, + "learning_rate": 3.168198742302355e-05, + "loss": 0.4831, + "step": 14554 + }, + { + "epoch": 1.725957547729159, + "grad_norm": 1.3110276843950641, + "learning_rate": 3.1679674358514335e-05, + "loss": 0.2863, + "step": 14555 + }, + { + "epoch": 1.7260761294912843, + "grad_norm": 1.0181102225666052, + "learning_rate": 3.1677361232426704e-05, + "loss": 0.1899, + "step": 14556 + }, + { + "epoch": 1.726194711253409, + "grad_norm": 0.6860789077262858, + "learning_rate": 3.1675048044782004e-05, + "loss": 0.1553, + "step": 14557 + }, + { + "epoch": 1.7263132930155343, + "grad_norm": 0.6816219434686372, + "learning_rate": 3.167273479560153e-05, + "loss": 0.1331, + "step": 14558 + }, + { + "epoch": 1.7264318747776592, + "grad_norm": 0.7550833610404455, + "learning_rate": 3.1670421484906635e-05, + "loss": 0.1712, + "step": 14559 + }, + { + "epoch": 1.7265504565397842, + "grad_norm": 0.9961086947219595, + "learning_rate": 3.166810811271863e-05, + "loss": 0.2045, + "step": 14560 + }, + { + "epoch": 1.7266690383019092, + "grad_norm": 0.8420799147957879, + "learning_rate": 3.1665794679058854e-05, + "loss": 0.1726, + "step": 14561 + }, + { + "epoch": 1.7267876200640342, + "grad_norm": 0.7567175492683405, + "learning_rate": 3.1663481183948614e-05, + "loss": 0.1751, + "step": 14562 + }, + { + "epoch": 1.7269062018261592, + "grad_norm": 1.0151720802247088, + "learning_rate": 3.1661167627409255e-05, + "loss": 0.1996, + "step": 14563 + }, + { + "epoch": 1.7270247835882842, + "grad_norm": 0.8140414757305688, + "learning_rate": 3.1658854009462093e-05, + "loss": 0.1773, + "step": 14564 + }, + { + "epoch": 1.7271433653504091, + "grad_norm": 0.9275532328169278, + "learning_rate": 3.165654033012847e-05, + "loss": 0.1851, + "step": 14565 + }, + { + "epoch": 1.7272619471125341, + "grad_norm": 0.8035196969120909, + "learning_rate": 3.16542265894297e-05, + "loss": 0.1629, + "step": 14566 + }, + { + "epoch": 1.727380528874659, + "grad_norm": 0.838801866696642, + "learning_rate": 3.165191278738713e-05, + "loss": 0.2433, + "step": 14567 + }, + { + "epoch": 1.727499110636784, + "grad_norm": 1.1776987855088579, + "learning_rate": 3.164959892402207e-05, + "loss": 0.3074, + "step": 14568 + }, + { + "epoch": 1.727617692398909, + "grad_norm": 1.0205170180036778, + "learning_rate": 3.164728499935587e-05, + "loss": 0.2612, + "step": 14569 + }, + { + "epoch": 1.727736274161034, + "grad_norm": 1.0445462073270133, + "learning_rate": 3.164497101340986e-05, + "loss": 0.2617, + "step": 14570 + }, + { + "epoch": 1.727854855923159, + "grad_norm": 0.7260524627453735, + "learning_rate": 3.1642656966205354e-05, + "loss": 0.1674, + "step": 14571 + }, + { + "epoch": 1.727973437685284, + "grad_norm": 0.830858008551378, + "learning_rate": 3.164034285776371e-05, + "loss": 0.1752, + "step": 14572 + }, + { + "epoch": 1.728092019447409, + "grad_norm": 1.108440877848984, + "learning_rate": 3.163802868810623e-05, + "loss": 0.2058, + "step": 14573 + }, + { + "epoch": 1.728210601209534, + "grad_norm": 0.7151242084311277, + "learning_rate": 3.1635714457254275e-05, + "loss": 0.1831, + "step": 14574 + }, + { + "epoch": 1.728329182971659, + "grad_norm": 0.9413052227877987, + "learning_rate": 3.1633400165229166e-05, + "loss": 0.2308, + "step": 14575 + }, + { + "epoch": 1.728447764733784, + "grad_norm": 0.9464620916904293, + "learning_rate": 3.163108581205225e-05, + "loss": 0.1895, + "step": 14576 + }, + { + "epoch": 1.728566346495909, + "grad_norm": 0.9523520357798898, + "learning_rate": 3.162877139774485e-05, + "loss": 0.1841, + "step": 14577 + }, + { + "epoch": 1.7286849282580339, + "grad_norm": 1.599303402928309, + "learning_rate": 3.16264569223283e-05, + "loss": 0.3194, + "step": 14578 + }, + { + "epoch": 1.7288035100201589, + "grad_norm": 0.7017707138477537, + "learning_rate": 3.162414238582395e-05, + "loss": 0.1688, + "step": 14579 + }, + { + "epoch": 1.7289220917822838, + "grad_norm": 1.060493981671192, + "learning_rate": 3.162182778825313e-05, + "loss": 0.1684, + "step": 14580 + }, + { + "epoch": 1.7290406735444088, + "grad_norm": 0.7601197371626818, + "learning_rate": 3.161951312963718e-05, + "loss": 0.1362, + "step": 14581 + }, + { + "epoch": 1.7291592553065338, + "grad_norm": 1.2950632929470187, + "learning_rate": 3.1617198409997435e-05, + "loss": 0.2586, + "step": 14582 + }, + { + "epoch": 1.7292778370686588, + "grad_norm": 1.0317522655834799, + "learning_rate": 3.161488362935523e-05, + "loss": 0.2065, + "step": 14583 + }, + { + "epoch": 1.7293964188307838, + "grad_norm": 0.6782691345509848, + "learning_rate": 3.161256878773191e-05, + "loss": 0.1371, + "step": 14584 + }, + { + "epoch": 1.729515000592909, + "grad_norm": 0.7985036823169253, + "learning_rate": 3.161025388514882e-05, + "loss": 0.1513, + "step": 14585 + }, + { + "epoch": 1.7296335823550337, + "grad_norm": 0.9648452353997926, + "learning_rate": 3.1607938921627293e-05, + "loss": 0.204, + "step": 14586 + }, + { + "epoch": 1.729752164117159, + "grad_norm": 0.8430465558485014, + "learning_rate": 3.160562389718867e-05, + "loss": 0.2599, + "step": 14587 + }, + { + "epoch": 1.7298707458792837, + "grad_norm": 0.7966728869493312, + "learning_rate": 3.1603308811854305e-05, + "loss": 0.1918, + "step": 14588 + }, + { + "epoch": 1.729989327641409, + "grad_norm": 0.7246191189165064, + "learning_rate": 3.1600993665645516e-05, + "loss": 0.1508, + "step": 14589 + }, + { + "epoch": 1.7301079094035337, + "grad_norm": 0.7071757025152917, + "learning_rate": 3.159867845858367e-05, + "loss": 0.1439, + "step": 14590 + }, + { + "epoch": 1.7302264911656589, + "grad_norm": 1.136315322726186, + "learning_rate": 3.1596363190690095e-05, + "loss": 0.2378, + "step": 14591 + }, + { + "epoch": 1.7303450729277836, + "grad_norm": 1.4774917302485608, + "learning_rate": 3.159404786198614e-05, + "loss": 0.3528, + "step": 14592 + }, + { + "epoch": 1.7304636546899088, + "grad_norm": 1.0654817530331917, + "learning_rate": 3.1591732472493164e-05, + "loss": 0.2021, + "step": 14593 + }, + { + "epoch": 1.7305822364520336, + "grad_norm": 0.9225743927354116, + "learning_rate": 3.158941702223248e-05, + "loss": 0.2455, + "step": 14594 + }, + { + "epoch": 1.7307008182141588, + "grad_norm": 0.7872308672175207, + "learning_rate": 3.158710151122546e-05, + "loss": 0.1379, + "step": 14595 + }, + { + "epoch": 1.7308193999762835, + "grad_norm": 0.8917213221501692, + "learning_rate": 3.158478593949344e-05, + "loss": 0.1863, + "step": 14596 + }, + { + "epoch": 1.7309379817384087, + "grad_norm": 0.7908576061182048, + "learning_rate": 3.158247030705777e-05, + "loss": 0.1464, + "step": 14597 + }, + { + "epoch": 1.7310565635005335, + "grad_norm": 0.9576004296033396, + "learning_rate": 3.158015461393979e-05, + "loss": 0.1866, + "step": 14598 + }, + { + "epoch": 1.7311751452626587, + "grad_norm": 0.6751727429561115, + "learning_rate": 3.157783886016085e-05, + "loss": 0.1171, + "step": 14599 + }, + { + "epoch": 1.7312937270247835, + "grad_norm": 0.7363996852075781, + "learning_rate": 3.157552304574232e-05, + "loss": 0.1479, + "step": 14600 + }, + { + "epoch": 1.7314123087869087, + "grad_norm": 1.0778173419746042, + "learning_rate": 3.1573207170705515e-05, + "loss": 0.2107, + "step": 14601 + }, + { + "epoch": 1.7315308905490334, + "grad_norm": 0.9959831348027576, + "learning_rate": 3.157089123507181e-05, + "loss": 0.2288, + "step": 14602 + }, + { + "epoch": 1.7316494723111586, + "grad_norm": 0.674288171399284, + "learning_rate": 3.156857523886254e-05, + "loss": 0.1527, + "step": 14603 + }, + { + "epoch": 1.7317680540732834, + "grad_norm": 0.9296508603858928, + "learning_rate": 3.1566259182099064e-05, + "loss": 0.2045, + "step": 14604 + }, + { + "epoch": 1.7318866358354086, + "grad_norm": 0.8454764310049978, + "learning_rate": 3.1563943064802725e-05, + "loss": 0.1707, + "step": 14605 + }, + { + "epoch": 1.7320052175975333, + "grad_norm": 0.8484282397264464, + "learning_rate": 3.156162688699489e-05, + "loss": 0.1963, + "step": 14606 + }, + { + "epoch": 1.7321237993596585, + "grad_norm": 0.676255786693846, + "learning_rate": 3.155931064869689e-05, + "loss": 0.1232, + "step": 14607 + }, + { + "epoch": 1.7322423811217833, + "grad_norm": 0.863420367854814, + "learning_rate": 3.15569943499301e-05, + "loss": 0.2294, + "step": 14608 + }, + { + "epoch": 1.7323609628839085, + "grad_norm": 1.1163547090511354, + "learning_rate": 3.155467799071585e-05, + "loss": 0.2566, + "step": 14609 + }, + { + "epoch": 1.7324795446460335, + "grad_norm": 0.8473735338153475, + "learning_rate": 3.155236157107552e-05, + "loss": 0.1706, + "step": 14610 + }, + { + "epoch": 1.7325981264081585, + "grad_norm": 1.1931074992604154, + "learning_rate": 3.1550045091030445e-05, + "loss": 0.3056, + "step": 14611 + }, + { + "epoch": 1.7327167081702834, + "grad_norm": 1.2011965730113097, + "learning_rate": 3.154772855060198e-05, + "loss": 0.2517, + "step": 14612 + }, + { + "epoch": 1.7328352899324084, + "grad_norm": 0.9249255906668935, + "learning_rate": 3.1545411949811506e-05, + "loss": 0.1781, + "step": 14613 + }, + { + "epoch": 1.7329538716945334, + "grad_norm": 0.9204208714281856, + "learning_rate": 3.154309528868035e-05, + "loss": 0.1341, + "step": 14614 + }, + { + "epoch": 1.7330724534566584, + "grad_norm": 0.8442626442464138, + "learning_rate": 3.154077856722988e-05, + "loss": 0.1649, + "step": 14615 + }, + { + "epoch": 1.7331910352187834, + "grad_norm": 0.9538591087898891, + "learning_rate": 3.1538461785481444e-05, + "loss": 0.2245, + "step": 14616 + }, + { + "epoch": 1.7333096169809084, + "grad_norm": 0.9659436755430877, + "learning_rate": 3.153614494345641e-05, + "loss": 0.1975, + "step": 14617 + }, + { + "epoch": 1.7334281987430333, + "grad_norm": 0.855935387391529, + "learning_rate": 3.153382804117614e-05, + "loss": 0.1741, + "step": 14618 + }, + { + "epoch": 1.7335467805051583, + "grad_norm": 1.1605035618214385, + "learning_rate": 3.153151107866199e-05, + "loss": 0.3194, + "step": 14619 + }, + { + "epoch": 1.7336653622672833, + "grad_norm": 0.9518832406339551, + "learning_rate": 3.152919405593532e-05, + "loss": 0.2597, + "step": 14620 + }, + { + "epoch": 1.7337839440294083, + "grad_norm": 0.6711723714983292, + "learning_rate": 3.152687697301748e-05, + "loss": 0.1614, + "step": 14621 + }, + { + "epoch": 1.7339025257915333, + "grad_norm": 0.9607624733729908, + "learning_rate": 3.152455982992984e-05, + "loss": 0.2748, + "step": 14622 + }, + { + "epoch": 1.7340211075536582, + "grad_norm": 1.076818075516996, + "learning_rate": 3.152224262669377e-05, + "loss": 0.217, + "step": 14623 + }, + { + "epoch": 1.7341396893157832, + "grad_norm": 0.7858193273533192, + "learning_rate": 3.1519925363330606e-05, + "loss": 0.1582, + "step": 14624 + }, + { + "epoch": 1.7342582710779082, + "grad_norm": 1.0064975525861173, + "learning_rate": 3.151760803986173e-05, + "loss": 0.2395, + "step": 14625 + }, + { + "epoch": 1.7343768528400332, + "grad_norm": 0.9340366254937932, + "learning_rate": 3.15152906563085e-05, + "loss": 0.1831, + "step": 14626 + }, + { + "epoch": 1.7344954346021582, + "grad_norm": 0.7819281658446631, + "learning_rate": 3.151297321269228e-05, + "loss": 0.1342, + "step": 14627 + }, + { + "epoch": 1.7346140163642831, + "grad_norm": 1.2661173762974, + "learning_rate": 3.151065570903443e-05, + "loss": 0.2252, + "step": 14628 + }, + { + "epoch": 1.7347325981264081, + "grad_norm": 0.9826860408305178, + "learning_rate": 3.150833814535632e-05, + "loss": 0.2036, + "step": 14629 + }, + { + "epoch": 1.734851179888533, + "grad_norm": 0.7638686625338466, + "learning_rate": 3.150602052167931e-05, + "loss": 0.1996, + "step": 14630 + }, + { + "epoch": 1.734969761650658, + "grad_norm": 0.8372998969847668, + "learning_rate": 3.150370283802478e-05, + "loss": 0.1865, + "step": 14631 + }, + { + "epoch": 1.735088343412783, + "grad_norm": 0.8884921928459256, + "learning_rate": 3.150138509441408e-05, + "loss": 0.2263, + "step": 14632 + }, + { + "epoch": 1.735206925174908, + "grad_norm": 0.9680458205053256, + "learning_rate": 3.149906729086858e-05, + "loss": 0.1605, + "step": 14633 + }, + { + "epoch": 1.7353255069370332, + "grad_norm": 1.1690871115166355, + "learning_rate": 3.1496749427409654e-05, + "loss": 0.2408, + "step": 14634 + }, + { + "epoch": 1.735444088699158, + "grad_norm": 1.1263368538043, + "learning_rate": 3.149443150405866e-05, + "loss": 0.2864, + "step": 14635 + }, + { + "epoch": 1.7355626704612832, + "grad_norm": 0.8298346028091645, + "learning_rate": 3.149211352083698e-05, + "loss": 0.1391, + "step": 14636 + }, + { + "epoch": 1.735681252223408, + "grad_norm": 0.9003543713327644, + "learning_rate": 3.1489795477765963e-05, + "loss": 0.2127, + "step": 14637 + }, + { + "epoch": 1.7357998339855332, + "grad_norm": 0.8106678442471635, + "learning_rate": 3.1487477374867e-05, + "loss": 0.1644, + "step": 14638 + }, + { + "epoch": 1.735918415747658, + "grad_norm": 1.0491964902023354, + "learning_rate": 3.148515921216144e-05, + "loss": 0.2362, + "step": 14639 + }, + { + "epoch": 1.7360369975097831, + "grad_norm": 0.803025307255877, + "learning_rate": 3.148284098967067e-05, + "loss": 0.1733, + "step": 14640 + }, + { + "epoch": 1.736155579271908, + "grad_norm": 1.2399545162843797, + "learning_rate": 3.148052270741606e-05, + "loss": 0.3142, + "step": 14641 + }, + { + "epoch": 1.736274161034033, + "grad_norm": 0.9781859532475387, + "learning_rate": 3.1478204365418976e-05, + "loss": 0.2073, + "step": 14642 + }, + { + "epoch": 1.7363927427961579, + "grad_norm": 0.9455098026669628, + "learning_rate": 3.147588596370079e-05, + "loss": 0.1891, + "step": 14643 + }, + { + "epoch": 1.736511324558283, + "grad_norm": 1.0463011930881334, + "learning_rate": 3.147356750228287e-05, + "loss": 0.2418, + "step": 14644 + }, + { + "epoch": 1.7366299063204078, + "grad_norm": 0.9096292332461895, + "learning_rate": 3.147124898118661e-05, + "loss": 0.1994, + "step": 14645 + }, + { + "epoch": 1.736748488082533, + "grad_norm": 1.0838290261818042, + "learning_rate": 3.1468930400433364e-05, + "loss": 0.2262, + "step": 14646 + }, + { + "epoch": 1.7368670698446578, + "grad_norm": 1.0711042929792685, + "learning_rate": 3.1466611760044505e-05, + "loss": 0.2154, + "step": 14647 + }, + { + "epoch": 1.736985651606783, + "grad_norm": 0.7310156044854647, + "learning_rate": 3.146429306004142e-05, + "loss": 0.1525, + "step": 14648 + }, + { + "epoch": 1.7371042333689077, + "grad_norm": 1.1269248614109024, + "learning_rate": 3.146197430044549e-05, + "loss": 0.2381, + "step": 14649 + }, + { + "epoch": 1.737222815131033, + "grad_norm": 0.934010888190883, + "learning_rate": 3.1459655481278066e-05, + "loss": 0.1818, + "step": 14650 + }, + { + "epoch": 1.7373413968931577, + "grad_norm": 0.9602209243940671, + "learning_rate": 3.145733660256055e-05, + "loss": 0.155, + "step": 14651 + }, + { + "epoch": 1.737459978655283, + "grad_norm": 0.9278154860693225, + "learning_rate": 3.14550176643143e-05, + "loss": 0.1763, + "step": 14652 + }, + { + "epoch": 1.7375785604174077, + "grad_norm": 0.9770993884313157, + "learning_rate": 3.145269866656071e-05, + "loss": 0.2057, + "step": 14653 + }, + { + "epoch": 1.7376971421795329, + "grad_norm": 0.9023934822206634, + "learning_rate": 3.145037960932115e-05, + "loss": 0.1559, + "step": 14654 + }, + { + "epoch": 1.7378157239416576, + "grad_norm": 0.682003572272264, + "learning_rate": 3.1448060492617e-05, + "loss": 0.1823, + "step": 14655 + }, + { + "epoch": 1.7379343057037828, + "grad_norm": 1.4923543521820113, + "learning_rate": 3.1445741316469644e-05, + "loss": 0.3213, + "step": 14656 + }, + { + "epoch": 1.7380528874659076, + "grad_norm": 0.9884594838152617, + "learning_rate": 3.144342208090045e-05, + "loss": 0.1809, + "step": 14657 + }, + { + "epoch": 1.7381714692280328, + "grad_norm": 0.800214508745925, + "learning_rate": 3.14411027859308e-05, + "loss": 0.1595, + "step": 14658 + }, + { + "epoch": 1.7382900509901578, + "grad_norm": 0.5996948967104088, + "learning_rate": 3.143878343158209e-05, + "loss": 0.168, + "step": 14659 + }, + { + "epoch": 1.7384086327522827, + "grad_norm": 0.9062228148534073, + "learning_rate": 3.14364640178757e-05, + "loss": 0.1924, + "step": 14660 + }, + { + "epoch": 1.7385272145144077, + "grad_norm": 0.9670159204822516, + "learning_rate": 3.143414454483299e-05, + "loss": 0.1505, + "step": 14661 + }, + { + "epoch": 1.7386457962765327, + "grad_norm": 0.7681103738219432, + "learning_rate": 3.143182501247537e-05, + "loss": 0.1295, + "step": 14662 + }, + { + "epoch": 1.7387643780386577, + "grad_norm": 0.8287071645455922, + "learning_rate": 3.14295054208242e-05, + "loss": 0.1675, + "step": 14663 + }, + { + "epoch": 1.7388829598007827, + "grad_norm": 0.7773036165296797, + "learning_rate": 3.142718576990088e-05, + "loss": 0.1401, + "step": 14664 + }, + { + "epoch": 1.7390015415629076, + "grad_norm": 1.8203964931477934, + "learning_rate": 3.1424866059726785e-05, + "loss": 0.4113, + "step": 14665 + }, + { + "epoch": 1.7391201233250326, + "grad_norm": 0.8889366335839831, + "learning_rate": 3.142254629032331e-05, + "loss": 0.2143, + "step": 14666 + }, + { + "epoch": 1.7392387050871576, + "grad_norm": 0.8328197467821113, + "learning_rate": 3.142022646171183e-05, + "loss": 0.157, + "step": 14667 + }, + { + "epoch": 1.7393572868492826, + "grad_norm": 0.8258561661134728, + "learning_rate": 3.141790657391374e-05, + "loss": 0.1526, + "step": 14668 + }, + { + "epoch": 1.7394758686114076, + "grad_norm": 0.9240301264708763, + "learning_rate": 3.141558662695041e-05, + "loss": 0.165, + "step": 14669 + }, + { + "epoch": 1.7395944503735326, + "grad_norm": 0.9156998262947461, + "learning_rate": 3.141326662084324e-05, + "loss": 0.2143, + "step": 14670 + }, + { + "epoch": 1.7397130321356575, + "grad_norm": 0.5399444281783027, + "learning_rate": 3.141094655561363e-05, + "loss": 0.1053, + "step": 14671 + }, + { + "epoch": 1.7398316138977825, + "grad_norm": 0.7862811982302823, + "learning_rate": 3.140862643128294e-05, + "loss": 0.1598, + "step": 14672 + }, + { + "epoch": 1.7399501956599075, + "grad_norm": 1.154699509546788, + "learning_rate": 3.1406306247872585e-05, + "loss": 0.2476, + "step": 14673 + }, + { + "epoch": 1.7400687774220325, + "grad_norm": 0.8827236670093243, + "learning_rate": 3.140398600540393e-05, + "loss": 0.1817, + "step": 14674 + }, + { + "epoch": 1.7401873591841575, + "grad_norm": 0.9605166983042341, + "learning_rate": 3.1401665703898385e-05, + "loss": 0.179, + "step": 14675 + }, + { + "epoch": 1.7403059409462824, + "grad_norm": 0.9189651617858207, + "learning_rate": 3.139934534337734e-05, + "loss": 0.2002, + "step": 14676 + }, + { + "epoch": 1.7404245227084074, + "grad_norm": 0.8032194573203678, + "learning_rate": 3.139702492386216e-05, + "loss": 0.1679, + "step": 14677 + }, + { + "epoch": 1.7405431044705324, + "grad_norm": 1.3067340320942569, + "learning_rate": 3.1394704445374264e-05, + "loss": 0.2741, + "step": 14678 + }, + { + "epoch": 1.7406616862326574, + "grad_norm": 1.3831913318649027, + "learning_rate": 3.139238390793504e-05, + "loss": 0.3667, + "step": 14679 + }, + { + "epoch": 1.7407802679947824, + "grad_norm": 0.9110327401318978, + "learning_rate": 3.1390063311565865e-05, + "loss": 0.1533, + "step": 14680 + }, + { + "epoch": 1.7408988497569073, + "grad_norm": 0.9316356382908979, + "learning_rate": 3.138774265628815e-05, + "loss": 0.1608, + "step": 14681 + }, + { + "epoch": 1.7410174315190323, + "grad_norm": 0.8554466667344999, + "learning_rate": 3.1385421942123275e-05, + "loss": 0.2001, + "step": 14682 + }, + { + "epoch": 1.7411360132811575, + "grad_norm": 1.1070359285081497, + "learning_rate": 3.138310116909265e-05, + "loss": 0.2224, + "step": 14683 + }, + { + "epoch": 1.7412545950432823, + "grad_norm": 0.8542681652410032, + "learning_rate": 3.138078033721765e-05, + "loss": 0.1676, + "step": 14684 + }, + { + "epoch": 1.7413731768054075, + "grad_norm": 0.9376684157773574, + "learning_rate": 3.137845944651968e-05, + "loss": 0.1995, + "step": 14685 + }, + { + "epoch": 1.7414917585675322, + "grad_norm": 0.8309221645578139, + "learning_rate": 3.137613849702014e-05, + "loss": 0.1598, + "step": 14686 + }, + { + "epoch": 1.7416103403296574, + "grad_norm": 0.7711950793526482, + "learning_rate": 3.137381748874042e-05, + "loss": 0.1608, + "step": 14687 + }, + { + "epoch": 1.7417289220917822, + "grad_norm": 0.9142569069403164, + "learning_rate": 3.137149642170192e-05, + "loss": 0.1697, + "step": 14688 + }, + { + "epoch": 1.7418475038539074, + "grad_norm": 0.7734034215765511, + "learning_rate": 3.1369175295926027e-05, + "loss": 0.1824, + "step": 14689 + }, + { + "epoch": 1.7419660856160322, + "grad_norm": 1.3452277105230492, + "learning_rate": 3.1366854111434164e-05, + "loss": 0.2598, + "step": 14690 + }, + { + "epoch": 1.7420846673781574, + "grad_norm": 0.9950671312883218, + "learning_rate": 3.13645328682477e-05, + "loss": 0.2288, + "step": 14691 + }, + { + "epoch": 1.7422032491402821, + "grad_norm": 0.7882612121448725, + "learning_rate": 3.136221156638806e-05, + "loss": 0.1324, + "step": 14692 + }, + { + "epoch": 1.7423218309024073, + "grad_norm": 0.8620196054126018, + "learning_rate": 3.135989020587663e-05, + "loss": 0.195, + "step": 14693 + }, + { + "epoch": 1.742440412664532, + "grad_norm": 0.9606716670992508, + "learning_rate": 3.1357568786734805e-05, + "loss": 0.1661, + "step": 14694 + }, + { + "epoch": 1.7425589944266573, + "grad_norm": 1.1342449205856742, + "learning_rate": 3.135524730898399e-05, + "loss": 0.2408, + "step": 14695 + }, + { + "epoch": 1.742677576188782, + "grad_norm": 2.6422525428849837, + "learning_rate": 3.135292577264559e-05, + "loss": 0.2807, + "step": 14696 + }, + { + "epoch": 1.7427961579509073, + "grad_norm": 1.616996971615701, + "learning_rate": 3.1350604177741e-05, + "loss": 0.3264, + "step": 14697 + }, + { + "epoch": 1.742914739713032, + "grad_norm": 1.8423757275384607, + "learning_rate": 3.134828252429163e-05, + "loss": 0.4758, + "step": 14698 + }, + { + "epoch": 1.7430333214751572, + "grad_norm": 1.1188241421723581, + "learning_rate": 3.134596081231889e-05, + "loss": 0.1735, + "step": 14699 + }, + { + "epoch": 1.743151903237282, + "grad_norm": 1.0000767243701667, + "learning_rate": 3.134363904184416e-05, + "loss": 0.2055, + "step": 14700 + }, + { + "epoch": 1.7432704849994072, + "grad_norm": 1.1967815762690743, + "learning_rate": 3.1341317212888874e-05, + "loss": 0.3134, + "step": 14701 + }, + { + "epoch": 1.743389066761532, + "grad_norm": 0.6963059326845474, + "learning_rate": 3.133899532547441e-05, + "loss": 0.1605, + "step": 14702 + }, + { + "epoch": 1.7435076485236571, + "grad_norm": 1.6968112237836632, + "learning_rate": 3.133667337962218e-05, + "loss": 0.4621, + "step": 14703 + }, + { + "epoch": 1.743626230285782, + "grad_norm": 0.7482893613933279, + "learning_rate": 3.133435137535359e-05, + "loss": 0.1992, + "step": 14704 + }, + { + "epoch": 1.743744812047907, + "grad_norm": 0.7230953010387683, + "learning_rate": 3.1332029312690056e-05, + "loss": 0.1686, + "step": 14705 + }, + { + "epoch": 1.7438633938100319, + "grad_norm": 0.9106451481290773, + "learning_rate": 3.132970719165297e-05, + "loss": 0.1929, + "step": 14706 + }, + { + "epoch": 1.743981975572157, + "grad_norm": 0.7639473137429754, + "learning_rate": 3.132738501226375e-05, + "loss": 0.1575, + "step": 14707 + }, + { + "epoch": 1.744100557334282, + "grad_norm": 0.8380643505564083, + "learning_rate": 3.13250627745438e-05, + "loss": 0.1975, + "step": 14708 + }, + { + "epoch": 1.744219139096407, + "grad_norm": 0.8443983736718295, + "learning_rate": 3.132274047851452e-05, + "loss": 0.183, + "step": 14709 + }, + { + "epoch": 1.744337720858532, + "grad_norm": 0.9331108825718925, + "learning_rate": 3.132041812419733e-05, + "loss": 0.2171, + "step": 14710 + }, + { + "epoch": 1.744456302620657, + "grad_norm": 0.638278176435028, + "learning_rate": 3.1318095711613643e-05, + "loss": 0.154, + "step": 14711 + }, + { + "epoch": 1.744574884382782, + "grad_norm": 0.7813491811190624, + "learning_rate": 3.131577324078485e-05, + "loss": 0.1864, + "step": 14712 + }, + { + "epoch": 1.744693466144907, + "grad_norm": 0.9461946798100636, + "learning_rate": 3.131345071173238e-05, + "loss": 0.1783, + "step": 14713 + }, + { + "epoch": 1.744812047907032, + "grad_norm": 0.8909720894461108, + "learning_rate": 3.131112812447763e-05, + "loss": 0.156, + "step": 14714 + }, + { + "epoch": 1.744930629669157, + "grad_norm": 0.8328765881558634, + "learning_rate": 3.1308805479042026e-05, + "loss": 0.1633, + "step": 14715 + }, + { + "epoch": 1.7450492114312819, + "grad_norm": 0.9608920141694961, + "learning_rate": 3.1306482775446974e-05, + "loss": 0.1893, + "step": 14716 + }, + { + "epoch": 1.7451677931934069, + "grad_norm": 1.2340694978707625, + "learning_rate": 3.130416001371387e-05, + "loss": 0.2641, + "step": 14717 + }, + { + "epoch": 1.7452863749555318, + "grad_norm": 0.7715223509986534, + "learning_rate": 3.1301837193864144e-05, + "loss": 0.1799, + "step": 14718 + }, + { + "epoch": 1.7454049567176568, + "grad_norm": 0.9640005608566047, + "learning_rate": 3.129951431591921e-05, + "loss": 0.2533, + "step": 14719 + }, + { + "epoch": 1.7455235384797818, + "grad_norm": 0.905653941381957, + "learning_rate": 3.1297191379900495e-05, + "loss": 0.2196, + "step": 14720 + }, + { + "epoch": 1.7456421202419068, + "grad_norm": 0.6405748074877983, + "learning_rate": 3.129486838582938e-05, + "loss": 0.1378, + "step": 14721 + }, + { + "epoch": 1.7457607020040318, + "grad_norm": 1.183329449936503, + "learning_rate": 3.1292545333727304e-05, + "loss": 0.2353, + "step": 14722 + }, + { + "epoch": 1.7458792837661568, + "grad_norm": 1.0219283291141261, + "learning_rate": 3.1290222223615676e-05, + "loss": 0.2589, + "step": 14723 + }, + { + "epoch": 1.7459978655282817, + "grad_norm": 1.0576685873294662, + "learning_rate": 3.1287899055515915e-05, + "loss": 0.2196, + "step": 14724 + }, + { + "epoch": 1.7461164472904067, + "grad_norm": 0.7991294323974482, + "learning_rate": 3.128557582944942e-05, + "loss": 0.1679, + "step": 14725 + }, + { + "epoch": 1.7462350290525317, + "grad_norm": 0.9288195276321649, + "learning_rate": 3.128325254543764e-05, + "loss": 0.1965, + "step": 14726 + }, + { + "epoch": 1.7463536108146567, + "grad_norm": 0.8027063002262543, + "learning_rate": 3.128092920350196e-05, + "loss": 0.1476, + "step": 14727 + }, + { + "epoch": 1.7464721925767817, + "grad_norm": 1.3905991500351387, + "learning_rate": 3.1278605803663826e-05, + "loss": 0.2546, + "step": 14728 + }, + { + "epoch": 1.7465907743389066, + "grad_norm": 1.375260733284016, + "learning_rate": 3.127628234594465e-05, + "loss": 0.3339, + "step": 14729 + }, + { + "epoch": 1.7467093561010316, + "grad_norm": 1.0193104876446104, + "learning_rate": 3.1273958830365844e-05, + "loss": 0.2334, + "step": 14730 + }, + { + "epoch": 1.7468279378631566, + "grad_norm": 0.9776589915037509, + "learning_rate": 3.127163525694884e-05, + "loss": 0.2348, + "step": 14731 + }, + { + "epoch": 1.7469465196252816, + "grad_norm": 0.9696481908786015, + "learning_rate": 3.1269311625715035e-05, + "loss": 0.1854, + "step": 14732 + }, + { + "epoch": 1.7470651013874066, + "grad_norm": 1.0568378729643966, + "learning_rate": 3.126698793668588e-05, + "loss": 0.2669, + "step": 14733 + }, + { + "epoch": 1.7471836831495318, + "grad_norm": 1.126053108898335, + "learning_rate": 3.1264664189882764e-05, + "loss": 0.2103, + "step": 14734 + }, + { + "epoch": 1.7473022649116565, + "grad_norm": 1.0752732976341552, + "learning_rate": 3.1262340385327136e-05, + "loss": 0.2102, + "step": 14735 + }, + { + "epoch": 1.7474208466737817, + "grad_norm": 0.7112875099746112, + "learning_rate": 3.1260016523040405e-05, + "loss": 0.1934, + "step": 14736 + }, + { + "epoch": 1.7475394284359065, + "grad_norm": 1.068287474430743, + "learning_rate": 3.1257692603044e-05, + "loss": 0.1701, + "step": 14737 + }, + { + "epoch": 1.7476580101980317, + "grad_norm": 0.6446506295830842, + "learning_rate": 3.125536862535934e-05, + "loss": 0.1267, + "step": 14738 + }, + { + "epoch": 1.7477765919601564, + "grad_norm": 1.1681980035917268, + "learning_rate": 3.1253044590007854e-05, + "loss": 0.1785, + "step": 14739 + }, + { + "epoch": 1.7478951737222816, + "grad_norm": 0.7095332795504519, + "learning_rate": 3.1250720497010965e-05, + "loss": 0.1563, + "step": 14740 + }, + { + "epoch": 1.7480137554844064, + "grad_norm": 1.1152570389086014, + "learning_rate": 3.1248396346390105e-05, + "loss": 0.2397, + "step": 14741 + }, + { + "epoch": 1.7481323372465316, + "grad_norm": 0.8677553296943814, + "learning_rate": 3.124607213816669e-05, + "loss": 0.1894, + "step": 14742 + }, + { + "epoch": 1.7482509190086564, + "grad_norm": 0.8603168201419683, + "learning_rate": 3.124374787236214e-05, + "loss": 0.1551, + "step": 14743 + }, + { + "epoch": 1.7483695007707816, + "grad_norm": 0.7571817149681475, + "learning_rate": 3.1241423548997905e-05, + "loss": 0.1471, + "step": 14744 + }, + { + "epoch": 1.7484880825329063, + "grad_norm": 0.7195406685536916, + "learning_rate": 3.123909916809539e-05, + "loss": 0.1635, + "step": 14745 + }, + { + "epoch": 1.7486066642950315, + "grad_norm": 1.3518617374469146, + "learning_rate": 3.1236774729676025e-05, + "loss": 0.2909, + "step": 14746 + }, + { + "epoch": 1.7487252460571563, + "grad_norm": 1.1358711523202842, + "learning_rate": 3.1234450233761256e-05, + "loss": 0.2231, + "step": 14747 + }, + { + "epoch": 1.7488438278192815, + "grad_norm": 0.8602899634793333, + "learning_rate": 3.12321256803725e-05, + "loss": 0.1922, + "step": 14748 + }, + { + "epoch": 1.7489624095814063, + "grad_norm": 0.7821568922409815, + "learning_rate": 3.1229801069531186e-05, + "loss": 0.1454, + "step": 14749 + }, + { + "epoch": 1.7490809913435315, + "grad_norm": 0.6876107848519728, + "learning_rate": 3.122747640125875e-05, + "loss": 0.1492, + "step": 14750 + }, + { + "epoch": 1.7491995731056562, + "grad_norm": 0.8074976490003586, + "learning_rate": 3.1225151675576606e-05, + "loss": 0.1924, + "step": 14751 + }, + { + "epoch": 1.7493181548677814, + "grad_norm": 1.0429273827022623, + "learning_rate": 3.122282689250621e-05, + "loss": 0.2313, + "step": 14752 + }, + { + "epoch": 1.7494367366299062, + "grad_norm": 1.2574876430952961, + "learning_rate": 3.1220502052068974e-05, + "loss": 0.2791, + "step": 14753 + }, + { + "epoch": 1.7495553183920314, + "grad_norm": 0.7557862888492397, + "learning_rate": 3.121817715428634e-05, + "loss": 0.1504, + "step": 14754 + }, + { + "epoch": 1.7496739001541561, + "grad_norm": 0.7787964771864523, + "learning_rate": 3.121585219917973e-05, + "loss": 0.1472, + "step": 14755 + }, + { + "epoch": 1.7497924819162813, + "grad_norm": 0.9327345736794969, + "learning_rate": 3.12135271867706e-05, + "loss": 0.1806, + "step": 14756 + }, + { + "epoch": 1.749911063678406, + "grad_norm": 1.1406996299268843, + "learning_rate": 3.121120211708036e-05, + "loss": 0.2737, + "step": 14757 + }, + { + "epoch": 1.7500296454405313, + "grad_norm": 0.8233042585643373, + "learning_rate": 3.120887699013045e-05, + "loss": 0.1744, + "step": 14758 + }, + { + "epoch": 1.7501482272026563, + "grad_norm": 0.7156943581001208, + "learning_rate": 3.1206551805942315e-05, + "loss": 0.1177, + "step": 14759 + }, + { + "epoch": 1.7502668089647813, + "grad_norm": 0.8039082845097842, + "learning_rate": 3.1204226564537376e-05, + "loss": 0.1337, + "step": 14760 + }, + { + "epoch": 1.7503853907269062, + "grad_norm": 0.5788806248357289, + "learning_rate": 3.1201901265937086e-05, + "loss": 0.123, + "step": 14761 + }, + { + "epoch": 1.7505039724890312, + "grad_norm": 0.9380202183654657, + "learning_rate": 3.119957591016286e-05, + "loss": 0.2249, + "step": 14762 + }, + { + "epoch": 1.7506225542511562, + "grad_norm": 0.8686238872008828, + "learning_rate": 3.119725049723616e-05, + "loss": 0.1584, + "step": 14763 + }, + { + "epoch": 1.7507411360132812, + "grad_norm": 1.1543270811281086, + "learning_rate": 3.1194925027178404e-05, + "loss": 0.1993, + "step": 14764 + }, + { + "epoch": 1.7508597177754062, + "grad_norm": 0.7476332772636242, + "learning_rate": 3.119259950001103e-05, + "loss": 0.2021, + "step": 14765 + }, + { + "epoch": 1.7509782995375311, + "grad_norm": 1.2288452085247499, + "learning_rate": 3.1190273915755486e-05, + "loss": 0.3274, + "step": 14766 + }, + { + "epoch": 1.7510968812996561, + "grad_norm": 0.791181531632165, + "learning_rate": 3.118794827443321e-05, + "loss": 0.2077, + "step": 14767 + }, + { + "epoch": 1.751215463061781, + "grad_norm": 0.9159606029800984, + "learning_rate": 3.118562257606563e-05, + "loss": 0.1934, + "step": 14768 + }, + { + "epoch": 1.751334044823906, + "grad_norm": 0.973074791491308, + "learning_rate": 3.118329682067421e-05, + "loss": 0.1658, + "step": 14769 + }, + { + "epoch": 1.751452626586031, + "grad_norm": 0.8867746450918721, + "learning_rate": 3.118097100828037e-05, + "loss": 0.184, + "step": 14770 + }, + { + "epoch": 1.751571208348156, + "grad_norm": 0.9659115836581564, + "learning_rate": 3.1178645138905546e-05, + "loss": 0.2077, + "step": 14771 + }, + { + "epoch": 1.751689790110281, + "grad_norm": 1.1018051489028053, + "learning_rate": 3.1176319212571206e-05, + "loss": 0.2311, + "step": 14772 + }, + { + "epoch": 1.751808371872406, + "grad_norm": 0.9148713969804662, + "learning_rate": 3.117399322929877e-05, + "loss": 0.1677, + "step": 14773 + }, + { + "epoch": 1.751926953634531, + "grad_norm": 1.1134692663359749, + "learning_rate": 3.1171667189109695e-05, + "loss": 0.2657, + "step": 14774 + }, + { + "epoch": 1.752045535396656, + "grad_norm": 1.0058727704719947, + "learning_rate": 3.1169341092025405e-05, + "loss": 0.1817, + "step": 14775 + }, + { + "epoch": 1.752164117158781, + "grad_norm": 0.9396905789555027, + "learning_rate": 3.116701493806736e-05, + "loss": 0.2142, + "step": 14776 + }, + { + "epoch": 1.752282698920906, + "grad_norm": 0.9316735214847013, + "learning_rate": 3.1164688727257e-05, + "loss": 0.1357, + "step": 14777 + }, + { + "epoch": 1.752401280683031, + "grad_norm": 0.8234442536315836, + "learning_rate": 3.1162362459615765e-05, + "loss": 0.1387, + "step": 14778 + }, + { + "epoch": 1.752519862445156, + "grad_norm": 0.9204061486823071, + "learning_rate": 3.116003613516511e-05, + "loss": 0.2236, + "step": 14779 + }, + { + "epoch": 1.7526384442072809, + "grad_norm": 0.7365520341161451, + "learning_rate": 3.1157709753926475e-05, + "loss": 0.1365, + "step": 14780 + }, + { + "epoch": 1.7527570259694059, + "grad_norm": 0.6235580931159266, + "learning_rate": 3.115538331592131e-05, + "loss": 0.1367, + "step": 14781 + }, + { + "epoch": 1.7528756077315308, + "grad_norm": 0.8200355547090945, + "learning_rate": 3.1153056821171064e-05, + "loss": 0.1879, + "step": 14782 + }, + { + "epoch": 1.752994189493656, + "grad_norm": 1.1105314428172732, + "learning_rate": 3.115073026969717e-05, + "loss": 0.2443, + "step": 14783 + }, + { + "epoch": 1.7531127712557808, + "grad_norm": 1.032984283698428, + "learning_rate": 3.1148403661521086e-05, + "loss": 0.2422, + "step": 14784 + }, + { + "epoch": 1.753231353017906, + "grad_norm": 0.9740496894024482, + "learning_rate": 3.114607699666426e-05, + "loss": 0.2253, + "step": 14785 + }, + { + "epoch": 1.7533499347800308, + "grad_norm": 1.0272721348404499, + "learning_rate": 3.1143750275148144e-05, + "loss": 0.2141, + "step": 14786 + }, + { + "epoch": 1.753468516542156, + "grad_norm": 0.9121106254829645, + "learning_rate": 3.114142349699418e-05, + "loss": 0.1756, + "step": 14787 + }, + { + "epoch": 1.7535870983042807, + "grad_norm": 0.8818531638663641, + "learning_rate": 3.113909666222382e-05, + "loss": 0.2013, + "step": 14788 + }, + { + "epoch": 1.753705680066406, + "grad_norm": 0.8193679282455799, + "learning_rate": 3.1136769770858534e-05, + "loss": 0.1804, + "step": 14789 + }, + { + "epoch": 1.7538242618285307, + "grad_norm": 0.6742735831461614, + "learning_rate": 3.1134442822919735e-05, + "loss": 0.1236, + "step": 14790 + }, + { + "epoch": 1.7539428435906559, + "grad_norm": 0.7067647421730557, + "learning_rate": 3.113211581842891e-05, + "loss": 0.1548, + "step": 14791 + }, + { + "epoch": 1.7540614253527806, + "grad_norm": 0.6760108282135261, + "learning_rate": 3.112978875740749e-05, + "loss": 0.1412, + "step": 14792 + }, + { + "epoch": 1.7541800071149058, + "grad_norm": 1.0067637672678211, + "learning_rate": 3.112746163987694e-05, + "loss": 0.1796, + "step": 14793 + }, + { + "epoch": 1.7542985888770306, + "grad_norm": 1.0472344816540369, + "learning_rate": 3.11251344658587e-05, + "loss": 0.2555, + "step": 14794 + }, + { + "epoch": 1.7544171706391558, + "grad_norm": 1.2206057895173847, + "learning_rate": 3.112280723537424e-05, + "loss": 0.1968, + "step": 14795 + }, + { + "epoch": 1.7545357524012806, + "grad_norm": 0.7860388739489768, + "learning_rate": 3.1120479948445e-05, + "loss": 0.2181, + "step": 14796 + }, + { + "epoch": 1.7546543341634058, + "grad_norm": 1.2562880033526425, + "learning_rate": 3.1118152605092445e-05, + "loss": 0.2479, + "step": 14797 + }, + { + "epoch": 1.7547729159255305, + "grad_norm": 0.714937064824541, + "learning_rate": 3.111582520533802e-05, + "loss": 0.1731, + "step": 14798 + }, + { + "epoch": 1.7548914976876557, + "grad_norm": 0.9142893659595972, + "learning_rate": 3.111349774920318e-05, + "loss": 0.1947, + "step": 14799 + }, + { + "epoch": 1.7550100794497805, + "grad_norm": 0.9846530101392512, + "learning_rate": 3.11111702367094e-05, + "loss": 0.1851, + "step": 14800 + }, + { + "epoch": 1.7551286612119057, + "grad_norm": 1.2490529333031106, + "learning_rate": 3.110884266787812e-05, + "loss": 0.2408, + "step": 14801 + }, + { + "epoch": 1.7552472429740305, + "grad_norm": 0.8151878648435398, + "learning_rate": 3.1106515042730805e-05, + "loss": 0.1827, + "step": 14802 + }, + { + "epoch": 1.7553658247361557, + "grad_norm": 0.9334809806449261, + "learning_rate": 3.1104187361288904e-05, + "loss": 0.2064, + "step": 14803 + }, + { + "epoch": 1.7554844064982804, + "grad_norm": 0.8293816320351532, + "learning_rate": 3.110185962357389e-05, + "loss": 0.1723, + "step": 14804 + }, + { + "epoch": 1.7556029882604056, + "grad_norm": 1.2568819103594926, + "learning_rate": 3.1099531829607203e-05, + "loss": 0.2721, + "step": 14805 + }, + { + "epoch": 1.7557215700225304, + "grad_norm": 1.1244477694223052, + "learning_rate": 3.109720397941032e-05, + "loss": 0.2121, + "step": 14806 + }, + { + "epoch": 1.7558401517846556, + "grad_norm": 0.7259247614922956, + "learning_rate": 3.109487607300468e-05, + "loss": 0.1249, + "step": 14807 + }, + { + "epoch": 1.7559587335467806, + "grad_norm": 1.0451422002873032, + "learning_rate": 3.109254811041177e-05, + "loss": 0.1781, + "step": 14808 + }, + { + "epoch": 1.7560773153089055, + "grad_norm": 0.8314172859284606, + "learning_rate": 3.1090220091653035e-05, + "loss": 0.1733, + "step": 14809 + }, + { + "epoch": 1.7561958970710305, + "grad_norm": 1.1016417046454445, + "learning_rate": 3.108789201674994e-05, + "loss": 0.2009, + "step": 14810 + }, + { + "epoch": 1.7563144788331555, + "grad_norm": 0.9438207625630018, + "learning_rate": 3.108556388572394e-05, + "loss": 0.2224, + "step": 14811 + }, + { + "epoch": 1.7564330605952805, + "grad_norm": 1.0515292978023176, + "learning_rate": 3.1083235698596505e-05, + "loss": 0.2187, + "step": 14812 + }, + { + "epoch": 1.7565516423574055, + "grad_norm": 0.930839341522019, + "learning_rate": 3.1080907455389105e-05, + "loss": 0.2361, + "step": 14813 + }, + { + "epoch": 1.7566702241195304, + "grad_norm": 1.2809523209110942, + "learning_rate": 3.107857915612318e-05, + "loss": 0.2432, + "step": 14814 + }, + { + "epoch": 1.7567888058816554, + "grad_norm": 0.7698859945847877, + "learning_rate": 3.107625080082022e-05, + "loss": 0.1254, + "step": 14815 + }, + { + "epoch": 1.7569073876437804, + "grad_norm": 0.8398453376141921, + "learning_rate": 3.107392238950168e-05, + "loss": 0.1705, + "step": 14816 + }, + { + "epoch": 1.7570259694059054, + "grad_norm": 1.3656750425212543, + "learning_rate": 3.107159392218902e-05, + "loss": 0.2861, + "step": 14817 + }, + { + "epoch": 1.7571445511680304, + "grad_norm": 1.0553287706207823, + "learning_rate": 3.106926539890371e-05, + "loss": 0.2001, + "step": 14818 + }, + { + "epoch": 1.7572631329301553, + "grad_norm": 0.7387132463740662, + "learning_rate": 3.1066936819667214e-05, + "loss": 0.1679, + "step": 14819 + }, + { + "epoch": 1.7573817146922803, + "grad_norm": 1.0785606281740425, + "learning_rate": 3.1064608184501e-05, + "loss": 0.2516, + "step": 14820 + }, + { + "epoch": 1.7575002964544053, + "grad_norm": 0.6860230116532628, + "learning_rate": 3.1062279493426546e-05, + "loss": 0.1499, + "step": 14821 + }, + { + "epoch": 1.7576188782165303, + "grad_norm": 1.2550899718069486, + "learning_rate": 3.1059950746465295e-05, + "loss": 0.2505, + "step": 14822 + }, + { + "epoch": 1.7577374599786553, + "grad_norm": 0.6844196692592889, + "learning_rate": 3.105762194363874e-05, + "loss": 0.1729, + "step": 14823 + }, + { + "epoch": 1.7578560417407803, + "grad_norm": 0.7690059080033667, + "learning_rate": 3.105529308496833e-05, + "loss": 0.1652, + "step": 14824 + }, + { + "epoch": 1.7579746235029052, + "grad_norm": 0.9627255035898441, + "learning_rate": 3.105296417047555e-05, + "loss": 0.2157, + "step": 14825 + }, + { + "epoch": 1.7580932052650302, + "grad_norm": 1.1026628267394825, + "learning_rate": 3.105063520018186e-05, + "loss": 0.2269, + "step": 14826 + }, + { + "epoch": 1.7582117870271552, + "grad_norm": 0.9586249589782364, + "learning_rate": 3.104830617410873e-05, + "loss": 0.1996, + "step": 14827 + }, + { + "epoch": 1.7583303687892802, + "grad_norm": 1.3464576578263285, + "learning_rate": 3.104597709227764e-05, + "loss": 0.2908, + "step": 14828 + }, + { + "epoch": 1.7584489505514052, + "grad_norm": 0.758243279189508, + "learning_rate": 3.104364795471005e-05, + "loss": 0.1541, + "step": 14829 + }, + { + "epoch": 1.7585675323135301, + "grad_norm": 0.9140795583679873, + "learning_rate": 3.104131876142744e-05, + "loss": 0.2163, + "step": 14830 + }, + { + "epoch": 1.7586861140756551, + "grad_norm": 0.752817430527757, + "learning_rate": 3.1038989512451276e-05, + "loss": 0.1419, + "step": 14831 + }, + { + "epoch": 1.75880469583778, + "grad_norm": 0.6897713157901155, + "learning_rate": 3.1036660207803045e-05, + "loss": 0.144, + "step": 14832 + }, + { + "epoch": 1.758923277599905, + "grad_norm": 1.020275813633126, + "learning_rate": 3.10343308475042e-05, + "loss": 0.2451, + "step": 14833 + }, + { + "epoch": 1.7590418593620303, + "grad_norm": 1.1055411503175445, + "learning_rate": 3.103200143157623e-05, + "loss": 0.2883, + "step": 14834 + }, + { + "epoch": 1.759160441124155, + "grad_norm": 0.8050548160315656, + "learning_rate": 3.102967196004059e-05, + "loss": 0.1498, + "step": 14835 + }, + { + "epoch": 1.7592790228862802, + "grad_norm": 1.2688358731443792, + "learning_rate": 3.1027342432918785e-05, + "loss": 0.2424, + "step": 14836 + }, + { + "epoch": 1.759397604648405, + "grad_norm": 0.7735939519645442, + "learning_rate": 3.102501285023227e-05, + "loss": 0.177, + "step": 14837 + }, + { + "epoch": 1.7595161864105302, + "grad_norm": 0.7722088482400123, + "learning_rate": 3.102268321200252e-05, + "loss": 0.2002, + "step": 14838 + }, + { + "epoch": 1.759634768172655, + "grad_norm": 0.7144675888642609, + "learning_rate": 3.1020353518251014e-05, + "loss": 0.1572, + "step": 14839 + }, + { + "epoch": 1.7597533499347802, + "grad_norm": 0.9478802615539146, + "learning_rate": 3.101802376899924e-05, + "loss": 0.1799, + "step": 14840 + }, + { + "epoch": 1.759871931696905, + "grad_norm": 0.8745511797730666, + "learning_rate": 3.101569396426866e-05, + "loss": 0.1675, + "step": 14841 + }, + { + "epoch": 1.7599905134590301, + "grad_norm": 1.1451252451607612, + "learning_rate": 3.1013364104080764e-05, + "loss": 0.2262, + "step": 14842 + }, + { + "epoch": 1.7601090952211549, + "grad_norm": 0.8799421430990599, + "learning_rate": 3.101103418845702e-05, + "loss": 0.1882, + "step": 14843 + }, + { + "epoch": 1.76022767698328, + "grad_norm": 0.9975476227910289, + "learning_rate": 3.1008704217418915e-05, + "loss": 0.1673, + "step": 14844 + }, + { + "epoch": 1.7603462587454048, + "grad_norm": 0.9194817215568818, + "learning_rate": 3.100637419098793e-05, + "loss": 0.1644, + "step": 14845 + }, + { + "epoch": 1.76046484050753, + "grad_norm": 1.0648347752798892, + "learning_rate": 3.100404410918553e-05, + "loss": 0.2222, + "step": 14846 + }, + { + "epoch": 1.7605834222696548, + "grad_norm": 0.9026856360847394, + "learning_rate": 3.100171397203321e-05, + "loss": 0.1811, + "step": 14847 + }, + { + "epoch": 1.76070200403178, + "grad_norm": 0.736796544618207, + "learning_rate": 3.099938377955245e-05, + "loss": 0.157, + "step": 14848 + }, + { + "epoch": 1.7608205857939048, + "grad_norm": 0.9164514304698249, + "learning_rate": 3.099705353176473e-05, + "loss": 0.1738, + "step": 14849 + }, + { + "epoch": 1.76093916755603, + "grad_norm": 0.8438788198776679, + "learning_rate": 3.099472322869152e-05, + "loss": 0.1793, + "step": 14850 + }, + { + "epoch": 1.7610577493181547, + "grad_norm": 0.818062052804721, + "learning_rate": 3.099239287035432e-05, + "loss": 0.1471, + "step": 14851 + }, + { + "epoch": 1.76117633108028, + "grad_norm": 1.0953319062713938, + "learning_rate": 3.099006245677461e-05, + "loss": 0.2215, + "step": 14852 + }, + { + "epoch": 1.7612949128424047, + "grad_norm": 0.6900863229846177, + "learning_rate": 3.098773198797387e-05, + "loss": 0.1516, + "step": 14853 + }, + { + "epoch": 1.76141349460453, + "grad_norm": 1.3250150747414815, + "learning_rate": 3.098540146397358e-05, + "loss": 0.2375, + "step": 14854 + }, + { + "epoch": 1.7615320763666547, + "grad_norm": 0.8426236083124936, + "learning_rate": 3.0983070884795224e-05, + "loss": 0.162, + "step": 14855 + }, + { + "epoch": 1.7616506581287799, + "grad_norm": 1.644844554889216, + "learning_rate": 3.098074025046029e-05, + "loss": 0.3373, + "step": 14856 + }, + { + "epoch": 1.7617692398909046, + "grad_norm": 1.0056981877051856, + "learning_rate": 3.097840956099027e-05, + "loss": 0.1874, + "step": 14857 + }, + { + "epoch": 1.7618878216530298, + "grad_norm": 1.8666892052221433, + "learning_rate": 3.097607881640665e-05, + "loss": 0.4126, + "step": 14858 + }, + { + "epoch": 1.7620064034151548, + "grad_norm": 0.8536813796677761, + "learning_rate": 3.097374801673091e-05, + "loss": 0.1517, + "step": 14859 + }, + { + "epoch": 1.7621249851772798, + "grad_norm": 0.8759794903410523, + "learning_rate": 3.0971417161984534e-05, + "loss": 0.1344, + "step": 14860 + }, + { + "epoch": 1.7622435669394048, + "grad_norm": 0.9307098463954941, + "learning_rate": 3.096908625218902e-05, + "loss": 0.2269, + "step": 14861 + }, + { + "epoch": 1.7623621487015297, + "grad_norm": 0.6872345344468768, + "learning_rate": 3.096675528736585e-05, + "loss": 0.1812, + "step": 14862 + }, + { + "epoch": 1.7624807304636547, + "grad_norm": 0.9745679827003817, + "learning_rate": 3.09644242675365e-05, + "loss": 0.1809, + "step": 14863 + }, + { + "epoch": 1.7625993122257797, + "grad_norm": 1.1339729070058702, + "learning_rate": 3.096209319272249e-05, + "loss": 0.2187, + "step": 14864 + }, + { + "epoch": 1.7627178939879047, + "grad_norm": 1.1232845296982497, + "learning_rate": 3.095976206294528e-05, + "loss": 0.2575, + "step": 14865 + }, + { + "epoch": 1.7628364757500297, + "grad_norm": 0.7777146269260895, + "learning_rate": 3.095743087822637e-05, + "loss": 0.1693, + "step": 14866 + }, + { + "epoch": 1.7629550575121546, + "grad_norm": 0.9068219057166435, + "learning_rate": 3.095509963858726e-05, + "loss": 0.2083, + "step": 14867 + }, + { + "epoch": 1.7630736392742796, + "grad_norm": 0.7762841243825908, + "learning_rate": 3.095276834404944e-05, + "loss": 0.1443, + "step": 14868 + }, + { + "epoch": 1.7631922210364046, + "grad_norm": 1.1436813341941612, + "learning_rate": 3.095043699463439e-05, + "loss": 0.2448, + "step": 14869 + }, + { + "epoch": 1.7633108027985296, + "grad_norm": 0.8748980642054183, + "learning_rate": 3.0948105590363604e-05, + "loss": 0.1577, + "step": 14870 + }, + { + "epoch": 1.7634293845606546, + "grad_norm": 0.7238568695665142, + "learning_rate": 3.0945774131258584e-05, + "loss": 0.145, + "step": 14871 + }, + { + "epoch": 1.7635479663227795, + "grad_norm": 1.0429400334298955, + "learning_rate": 3.0943442617340815e-05, + "loss": 0.1963, + "step": 14872 + }, + { + "epoch": 1.7636665480849045, + "grad_norm": 1.0819674123791656, + "learning_rate": 3.094111104863179e-05, + "loss": 0.2487, + "step": 14873 + }, + { + "epoch": 1.7637851298470295, + "grad_norm": 1.027277875988551, + "learning_rate": 3.093877942515301e-05, + "loss": 0.1963, + "step": 14874 + }, + { + "epoch": 1.7639037116091545, + "grad_norm": 0.8512638129905388, + "learning_rate": 3.0936447746925965e-05, + "loss": 0.1716, + "step": 14875 + }, + { + "epoch": 1.7640222933712795, + "grad_norm": 0.9607419498915598, + "learning_rate": 3.093411601397215e-05, + "loss": 0.2385, + "step": 14876 + }, + { + "epoch": 1.7641408751334045, + "grad_norm": 0.6807954200171579, + "learning_rate": 3.0931784226313065e-05, + "loss": 0.1272, + "step": 14877 + }, + { + "epoch": 1.7642594568955294, + "grad_norm": 0.9326963816414402, + "learning_rate": 3.0929452383970204e-05, + "loss": 0.2625, + "step": 14878 + }, + { + "epoch": 1.7643780386576544, + "grad_norm": 0.7705740664861088, + "learning_rate": 3.092712048696507e-05, + "loss": 0.157, + "step": 14879 + }, + { + "epoch": 1.7644966204197794, + "grad_norm": 0.8713794370736185, + "learning_rate": 3.092478853531914e-05, + "loss": 0.1859, + "step": 14880 + }, + { + "epoch": 1.7646152021819044, + "grad_norm": 0.7536909456644649, + "learning_rate": 3.092245652905393e-05, + "loss": 0.2062, + "step": 14881 + }, + { + "epoch": 1.7647337839440294, + "grad_norm": 0.9061926521508114, + "learning_rate": 3.092012446819094e-05, + "loss": 0.1515, + "step": 14882 + }, + { + "epoch": 1.7648523657061546, + "grad_norm": 0.9931740595933588, + "learning_rate": 3.091779235275166e-05, + "loss": 0.1845, + "step": 14883 + }, + { + "epoch": 1.7649709474682793, + "grad_norm": 1.0183512887573098, + "learning_rate": 3.091546018275758e-05, + "loss": 0.1629, + "step": 14884 + }, + { + "epoch": 1.7650895292304045, + "grad_norm": 1.3535742509181703, + "learning_rate": 3.091312795823022e-05, + "loss": 0.3306, + "step": 14885 + }, + { + "epoch": 1.7652081109925293, + "grad_norm": 1.0705997614139597, + "learning_rate": 3.091079567919107e-05, + "loss": 0.25, + "step": 14886 + }, + { + "epoch": 1.7653266927546545, + "grad_norm": 1.4562354521449878, + "learning_rate": 3.0908463345661626e-05, + "loss": 0.3469, + "step": 14887 + }, + { + "epoch": 1.7654452745167792, + "grad_norm": 1.1274859500662144, + "learning_rate": 3.0906130957663406e-05, + "loss": 0.184, + "step": 14888 + }, + { + "epoch": 1.7655638562789044, + "grad_norm": 0.9178547106632408, + "learning_rate": 3.0903798515217895e-05, + "loss": 0.1945, + "step": 14889 + }, + { + "epoch": 1.7656824380410292, + "grad_norm": 0.6894766091951534, + "learning_rate": 3.090146601834661e-05, + "loss": 0.1535, + "step": 14890 + }, + { + "epoch": 1.7658010198031544, + "grad_norm": 0.7460550147882922, + "learning_rate": 3.089913346707104e-05, + "loss": 0.2011, + "step": 14891 + }, + { + "epoch": 1.7659196015652792, + "grad_norm": 1.2387376068597697, + "learning_rate": 3.089680086141269e-05, + "loss": 0.2803, + "step": 14892 + }, + { + "epoch": 1.7660381833274044, + "grad_norm": 1.3093663064585344, + "learning_rate": 3.089446820139307e-05, + "loss": 0.2886, + "step": 14893 + }, + { + "epoch": 1.7661567650895291, + "grad_norm": 0.8905581770664908, + "learning_rate": 3.089213548703368e-05, + "loss": 0.182, + "step": 14894 + }, + { + "epoch": 1.7662753468516543, + "grad_norm": 0.8540266705495438, + "learning_rate": 3.088980271835603e-05, + "loss": 0.1836, + "step": 14895 + }, + { + "epoch": 1.766393928613779, + "grad_norm": 0.7212904008625894, + "learning_rate": 3.0887469895381616e-05, + "loss": 0.1648, + "step": 14896 + }, + { + "epoch": 1.7665125103759043, + "grad_norm": 0.7934357530574591, + "learning_rate": 3.088513701813195e-05, + "loss": 0.1855, + "step": 14897 + }, + { + "epoch": 1.766631092138029, + "grad_norm": 0.7026571016118718, + "learning_rate": 3.088280408662854e-05, + "loss": 0.1334, + "step": 14898 + }, + { + "epoch": 1.7667496739001542, + "grad_norm": 1.099803772713782, + "learning_rate": 3.088047110089289e-05, + "loss": 0.2431, + "step": 14899 + }, + { + "epoch": 1.766868255662279, + "grad_norm": 0.872946891277008, + "learning_rate": 3.08781380609465e-05, + "loss": 0.1782, + "step": 14900 + }, + { + "epoch": 1.7669868374244042, + "grad_norm": 0.7424835877095736, + "learning_rate": 3.08758049668109e-05, + "loss": 0.17, + "step": 14901 + }, + { + "epoch": 1.767105419186529, + "grad_norm": 0.7889994749197983, + "learning_rate": 3.087347181850757e-05, + "loss": 0.1852, + "step": 14902 + }, + { + "epoch": 1.7672240009486542, + "grad_norm": 0.9506042980693984, + "learning_rate": 3.087113861605804e-05, + "loss": 0.2179, + "step": 14903 + }, + { + "epoch": 1.767342582710779, + "grad_norm": 0.7053111609151248, + "learning_rate": 3.086880535948381e-05, + "loss": 0.1705, + "step": 14904 + }, + { + "epoch": 1.7674611644729041, + "grad_norm": 1.0261610365139568, + "learning_rate": 3.0866472048806394e-05, + "loss": 0.209, + "step": 14905 + }, + { + "epoch": 1.767579746235029, + "grad_norm": 0.8434221740293864, + "learning_rate": 3.086413868404729e-05, + "loss": 0.1926, + "step": 14906 + }, + { + "epoch": 1.767698327997154, + "grad_norm": 1.3377230680880616, + "learning_rate": 3.0861805265228035e-05, + "loss": 0.2924, + "step": 14907 + }, + { + "epoch": 1.767816909759279, + "grad_norm": 1.722104145074528, + "learning_rate": 3.085947179237011e-05, + "loss": 0.3928, + "step": 14908 + }, + { + "epoch": 1.767935491521404, + "grad_norm": 0.8086444676962713, + "learning_rate": 3.085713826549505e-05, + "loss": 0.2293, + "step": 14909 + }, + { + "epoch": 1.768054073283529, + "grad_norm": 1.586877750638903, + "learning_rate": 3.085480468462435e-05, + "loss": 0.3532, + "step": 14910 + }, + { + "epoch": 1.768172655045654, + "grad_norm": 1.1244217399819385, + "learning_rate": 3.0852471049779534e-05, + "loss": 0.2448, + "step": 14911 + }, + { + "epoch": 1.768291236807779, + "grad_norm": 1.0249064296568375, + "learning_rate": 3.0850137360982104e-05, + "loss": 0.2498, + "step": 14912 + }, + { + "epoch": 1.768409818569904, + "grad_norm": 0.8252280907053989, + "learning_rate": 3.084780361825359e-05, + "loss": 0.1725, + "step": 14913 + }, + { + "epoch": 1.768528400332029, + "grad_norm": 0.8174667627816152, + "learning_rate": 3.08454698216155e-05, + "loss": 0.17, + "step": 14914 + }, + { + "epoch": 1.768646982094154, + "grad_norm": 0.743417016603677, + "learning_rate": 3.084313597108934e-05, + "loss": 0.1907, + "step": 14915 + }, + { + "epoch": 1.768765563856279, + "grad_norm": 1.4697328983173994, + "learning_rate": 3.084080206669663e-05, + "loss": 0.2824, + "step": 14916 + }, + { + "epoch": 1.768884145618404, + "grad_norm": 0.7494188534559302, + "learning_rate": 3.083846810845889e-05, + "loss": 0.142, + "step": 14917 + }, + { + "epoch": 1.7690027273805289, + "grad_norm": 1.146947017402319, + "learning_rate": 3.083613409639764e-05, + "loss": 0.2644, + "step": 14918 + }, + { + "epoch": 1.7691213091426539, + "grad_norm": 0.8244525448884426, + "learning_rate": 3.083380003053438e-05, + "loss": 0.1455, + "step": 14919 + }, + { + "epoch": 1.7692398909047788, + "grad_norm": 0.8250536612431655, + "learning_rate": 3.083146591089064e-05, + "loss": 0.1432, + "step": 14920 + }, + { + "epoch": 1.7693584726669038, + "grad_norm": 1.5362425789321994, + "learning_rate": 3.082913173748794e-05, + "loss": 0.3025, + "step": 14921 + }, + { + "epoch": 1.7694770544290288, + "grad_norm": 0.6047014533119218, + "learning_rate": 3.08267975103478e-05, + "loss": 0.1183, + "step": 14922 + }, + { + "epoch": 1.7695956361911538, + "grad_norm": 0.9864138487203894, + "learning_rate": 3.082446322949172e-05, + "loss": 0.2414, + "step": 14923 + }, + { + "epoch": 1.7697142179532788, + "grad_norm": 0.7380453223072034, + "learning_rate": 3.082212889494124e-05, + "loss": 0.1589, + "step": 14924 + }, + { + "epoch": 1.7698327997154037, + "grad_norm": 1.133064609588666, + "learning_rate": 3.081979450671787e-05, + "loss": 0.2509, + "step": 14925 + }, + { + "epoch": 1.7699513814775287, + "grad_norm": 0.9825869699262936, + "learning_rate": 3.0817460064843115e-05, + "loss": 0.2453, + "step": 14926 + }, + { + "epoch": 1.7700699632396537, + "grad_norm": 0.8381221991911467, + "learning_rate": 3.0815125569338526e-05, + "loss": 0.1471, + "step": 14927 + }, + { + "epoch": 1.7701885450017787, + "grad_norm": 0.8460066512651663, + "learning_rate": 3.081279102022561e-05, + "loss": 0.1849, + "step": 14928 + }, + { + "epoch": 1.7703071267639037, + "grad_norm": 0.6333603294719242, + "learning_rate": 3.081045641752589e-05, + "loss": 0.1176, + "step": 14929 + }, + { + "epoch": 1.7704257085260287, + "grad_norm": 0.9039309878310735, + "learning_rate": 3.0808121761260885e-05, + "loss": 0.1611, + "step": 14930 + }, + { + "epoch": 1.7705442902881536, + "grad_norm": 0.648047987368884, + "learning_rate": 3.0805787051452115e-05, + "loss": 0.1185, + "step": 14931 + }, + { + "epoch": 1.7706628720502786, + "grad_norm": 1.4091414157375028, + "learning_rate": 3.0803452288121116e-05, + "loss": 0.2722, + "step": 14932 + }, + { + "epoch": 1.7707814538124036, + "grad_norm": 0.7659554995026467, + "learning_rate": 3.08011174712894e-05, + "loss": 0.1684, + "step": 14933 + }, + { + "epoch": 1.7709000355745288, + "grad_norm": 0.773453295120672, + "learning_rate": 3.07987826009785e-05, + "loss": 0.1639, + "step": 14934 + }, + { + "epoch": 1.7710186173366536, + "grad_norm": 1.1035646769564353, + "learning_rate": 3.0796447677209927e-05, + "loss": 0.2582, + "step": 14935 + }, + { + "epoch": 1.7711371990987788, + "grad_norm": 0.9472112866666685, + "learning_rate": 3.079411270000521e-05, + "loss": 0.1885, + "step": 14936 + }, + { + "epoch": 1.7712557808609035, + "grad_norm": 1.0260743711977982, + "learning_rate": 3.0791777669385895e-05, + "loss": 0.2233, + "step": 14937 + }, + { + "epoch": 1.7713743626230287, + "grad_norm": 0.8121638761776354, + "learning_rate": 3.078944258537349e-05, + "loss": 0.1674, + "step": 14938 + }, + { + "epoch": 1.7714929443851535, + "grad_norm": 1.235920609229254, + "learning_rate": 3.078710744798951e-05, + "loss": 0.2424, + "step": 14939 + }, + { + "epoch": 1.7716115261472787, + "grad_norm": 0.5843569087846404, + "learning_rate": 3.078477225725551e-05, + "loss": 0.1221, + "step": 14940 + }, + { + "epoch": 1.7717301079094034, + "grad_norm": 0.7104824937842203, + "learning_rate": 3.0782437013192985e-05, + "loss": 0.1165, + "step": 14941 + }, + { + "epoch": 1.7718486896715286, + "grad_norm": 0.7050943187254827, + "learning_rate": 3.078010171582351e-05, + "loss": 0.1623, + "step": 14942 + }, + { + "epoch": 1.7719672714336534, + "grad_norm": 0.769093795182177, + "learning_rate": 3.077776636516856e-05, + "loss": 0.1526, + "step": 14943 + }, + { + "epoch": 1.7720858531957786, + "grad_norm": 1.5062514343798399, + "learning_rate": 3.07754309612497e-05, + "loss": 0.3496, + "step": 14944 + }, + { + "epoch": 1.7722044349579034, + "grad_norm": 0.6705266041276721, + "learning_rate": 3.077309550408845e-05, + "loss": 0.1375, + "step": 14945 + }, + { + "epoch": 1.7723230167200286, + "grad_norm": 0.75812204408352, + "learning_rate": 3.077075999370633e-05, + "loss": 0.1544, + "step": 14946 + }, + { + "epoch": 1.7724415984821533, + "grad_norm": 0.529950134762947, + "learning_rate": 3.076842443012489e-05, + "loss": 0.0821, + "step": 14947 + }, + { + "epoch": 1.7725601802442785, + "grad_norm": 0.6327788617322649, + "learning_rate": 3.076608881336565e-05, + "loss": 0.1239, + "step": 14948 + }, + { + "epoch": 1.7726787620064033, + "grad_norm": 0.7738752739076232, + "learning_rate": 3.076375314345013e-05, + "loss": 0.1719, + "step": 14949 + }, + { + "epoch": 1.7727973437685285, + "grad_norm": 0.8541319348220874, + "learning_rate": 3.076141742039989e-05, + "loss": 0.1447, + "step": 14950 + }, + { + "epoch": 1.7729159255306532, + "grad_norm": 1.1103026571327383, + "learning_rate": 3.075908164423643e-05, + "loss": 0.2498, + "step": 14951 + }, + { + "epoch": 1.7730345072927784, + "grad_norm": 1.0602905206385898, + "learning_rate": 3.0756745814981315e-05, + "loss": 0.2753, + "step": 14952 + }, + { + "epoch": 1.7731530890549032, + "grad_norm": 0.9444171922395216, + "learning_rate": 3.0754409932656055e-05, + "loss": 0.1999, + "step": 14953 + }, + { + "epoch": 1.7732716708170284, + "grad_norm": 0.7149022722468383, + "learning_rate": 3.075207399728219e-05, + "loss": 0.1363, + "step": 14954 + }, + { + "epoch": 1.7733902525791532, + "grad_norm": 0.7854202989572453, + "learning_rate": 3.0749738008881254e-05, + "loss": 0.1254, + "step": 14955 + }, + { + "epoch": 1.7735088343412784, + "grad_norm": 0.939213903478177, + "learning_rate": 3.074740196747479e-05, + "loss": 0.1941, + "step": 14956 + }, + { + "epoch": 1.7736274161034031, + "grad_norm": 0.7966194573334988, + "learning_rate": 3.0745065873084336e-05, + "loss": 0.1822, + "step": 14957 + }, + { + "epoch": 1.7737459978655283, + "grad_norm": 0.9311032569758605, + "learning_rate": 3.0742729725731404e-05, + "loss": 0.1686, + "step": 14958 + }, + { + "epoch": 1.7738645796276533, + "grad_norm": 0.7449887078203834, + "learning_rate": 3.074039352543756e-05, + "loss": 0.1467, + "step": 14959 + }, + { + "epoch": 1.7739831613897783, + "grad_norm": 1.1204953385657457, + "learning_rate": 3.073805727222432e-05, + "loss": 0.2418, + "step": 14960 + }, + { + "epoch": 1.7741017431519033, + "grad_norm": 0.8153322633095746, + "learning_rate": 3.073572096611323e-05, + "loss": 0.2334, + "step": 14961 + }, + { + "epoch": 1.7742203249140283, + "grad_norm": 0.926695429131681, + "learning_rate": 3.073338460712582e-05, + "loss": 0.1579, + "step": 14962 + }, + { + "epoch": 1.7743389066761532, + "grad_norm": 0.793108352436663, + "learning_rate": 3.073104819528365e-05, + "loss": 0.1728, + "step": 14963 + }, + { + "epoch": 1.7744574884382782, + "grad_norm": 1.0332402582618052, + "learning_rate": 3.072871173060823e-05, + "loss": 0.1847, + "step": 14964 + }, + { + "epoch": 1.7745760702004032, + "grad_norm": 0.7789862081011926, + "learning_rate": 3.0726375213121116e-05, + "loss": 0.1526, + "step": 14965 + }, + { + "epoch": 1.7746946519625282, + "grad_norm": 0.7982639673737543, + "learning_rate": 3.072403864284384e-05, + "loss": 0.205, + "step": 14966 + }, + { + "epoch": 1.7748132337246532, + "grad_norm": 0.9861523152445598, + "learning_rate": 3.072170201979795e-05, + "loss": 0.2242, + "step": 14967 + }, + { + "epoch": 1.7749318154867781, + "grad_norm": 0.7284610270076973, + "learning_rate": 3.071936534400499e-05, + "loss": 0.1317, + "step": 14968 + }, + { + "epoch": 1.7750503972489031, + "grad_norm": 0.9728520346767124, + "learning_rate": 3.071702861548649e-05, + "loss": 0.1993, + "step": 14969 + }, + { + "epoch": 1.775168979011028, + "grad_norm": 0.8971094188345041, + "learning_rate": 3.0714691834263995e-05, + "loss": 0.1976, + "step": 14970 + }, + { + "epoch": 1.775287560773153, + "grad_norm": 0.8033992267568183, + "learning_rate": 3.071235500035906e-05, + "loss": 0.1952, + "step": 14971 + }, + { + "epoch": 1.775406142535278, + "grad_norm": 0.9105917140746532, + "learning_rate": 3.07100181137932e-05, + "loss": 0.1681, + "step": 14972 + }, + { + "epoch": 1.775524724297403, + "grad_norm": 0.8952590319991965, + "learning_rate": 3.070768117458799e-05, + "loss": 0.1764, + "step": 14973 + }, + { + "epoch": 1.775643306059528, + "grad_norm": 0.9202958717461408, + "learning_rate": 3.0705344182764956e-05, + "loss": 0.2099, + "step": 14974 + }, + { + "epoch": 1.775761887821653, + "grad_norm": 0.9213130553559689, + "learning_rate": 3.070300713834564e-05, + "loss": 0.2051, + "step": 14975 + }, + { + "epoch": 1.775880469583778, + "grad_norm": 1.0180509796176664, + "learning_rate": 3.0700670041351586e-05, + "loss": 0.2582, + "step": 14976 + }, + { + "epoch": 1.775999051345903, + "grad_norm": 0.551568828264305, + "learning_rate": 3.0698332891804356e-05, + "loss": 0.1147, + "step": 14977 + }, + { + "epoch": 1.776117633108028, + "grad_norm": 0.8904363811469934, + "learning_rate": 3.069599568972548e-05, + "loss": 0.2116, + "step": 14978 + }, + { + "epoch": 1.776236214870153, + "grad_norm": 0.8851490267346912, + "learning_rate": 3.0693658435136504e-05, + "loss": 0.1751, + "step": 14979 + }, + { + "epoch": 1.776354796632278, + "grad_norm": 1.1680443817644814, + "learning_rate": 3.0691321128058994e-05, + "loss": 0.2197, + "step": 14980 + }, + { + "epoch": 1.776473378394403, + "grad_norm": 0.9532568099802776, + "learning_rate": 3.068898376851447e-05, + "loss": 0.2169, + "step": 14981 + }, + { + "epoch": 1.7765919601565279, + "grad_norm": 0.8198402535109108, + "learning_rate": 3.068664635652449e-05, + "loss": 0.1888, + "step": 14982 + }, + { + "epoch": 1.776710541918653, + "grad_norm": 0.9429668379177729, + "learning_rate": 3.068430889211061e-05, + "loss": 0.188, + "step": 14983 + }, + { + "epoch": 1.7768291236807778, + "grad_norm": 0.7535248557971345, + "learning_rate": 3.068197137529437e-05, + "loss": 0.1439, + "step": 14984 + }, + { + "epoch": 1.776947705442903, + "grad_norm": 1.0537731571207458, + "learning_rate": 3.067963380609732e-05, + "loss": 0.1749, + "step": 14985 + }, + { + "epoch": 1.7770662872050278, + "grad_norm": 0.9216070360814999, + "learning_rate": 3.0677296184541016e-05, + "loss": 0.187, + "step": 14986 + }, + { + "epoch": 1.777184868967153, + "grad_norm": 0.7119286720533268, + "learning_rate": 3.0674958510647004e-05, + "loss": 0.1601, + "step": 14987 + }, + { + "epoch": 1.7773034507292778, + "grad_norm": 0.9395134727814879, + "learning_rate": 3.067262078443683e-05, + "loss": 0.1757, + "step": 14988 + }, + { + "epoch": 1.777422032491403, + "grad_norm": 0.7366965566647228, + "learning_rate": 3.067028300593205e-05, + "loss": 0.1326, + "step": 14989 + }, + { + "epoch": 1.7775406142535277, + "grad_norm": 1.0205203795746545, + "learning_rate": 3.066794517515422e-05, + "loss": 0.1959, + "step": 14990 + }, + { + "epoch": 1.777659196015653, + "grad_norm": 0.6816006946145708, + "learning_rate": 3.066560729212488e-05, + "loss": 0.1276, + "step": 14991 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.6798234913047136, + "learning_rate": 3.066326935686558e-05, + "loss": 0.1377, + "step": 14992 + }, + { + "epoch": 1.7778963595399029, + "grad_norm": 1.2113379719927337, + "learning_rate": 3.066093136939789e-05, + "loss": 0.2466, + "step": 14993 + }, + { + "epoch": 1.7780149413020276, + "grad_norm": 0.7218613853612004, + "learning_rate": 3.065859332974336e-05, + "loss": 0.1771, + "step": 14994 + }, + { + "epoch": 1.7781335230641528, + "grad_norm": 0.8149034277011729, + "learning_rate": 3.065625523792353e-05, + "loss": 0.1707, + "step": 14995 + }, + { + "epoch": 1.7782521048262776, + "grad_norm": 1.584067850863626, + "learning_rate": 3.065391709395996e-05, + "loss": 0.2827, + "step": 14996 + }, + { + "epoch": 1.7783706865884028, + "grad_norm": 0.9126565251704829, + "learning_rate": 3.065157889787422e-05, + "loss": 0.1604, + "step": 14997 + }, + { + "epoch": 1.7784892683505276, + "grad_norm": 1.0352062493238612, + "learning_rate": 3.0649240649687844e-05, + "loss": 0.2165, + "step": 14998 + }, + { + "epoch": 1.7786078501126528, + "grad_norm": 0.8600052620672848, + "learning_rate": 3.0646902349422394e-05, + "loss": 0.1965, + "step": 14999 + }, + { + "epoch": 1.7787264318747775, + "grad_norm": 0.7377764338695952, + "learning_rate": 3.0644563997099444e-05, + "loss": 0.1608, + "step": 15000 + }, + { + "epoch": 1.7788450136369027, + "grad_norm": 1.176408847205642, + "learning_rate": 3.0642225592740515e-05, + "loss": 0.2198, + "step": 15001 + }, + { + "epoch": 1.7789635953990275, + "grad_norm": 1.0422527805153636, + "learning_rate": 3.0639887136367204e-05, + "loss": 0.1901, + "step": 15002 + }, + { + "epoch": 1.7790821771611527, + "grad_norm": 1.3167774756997066, + "learning_rate": 3.063754862800103e-05, + "loss": 0.3092, + "step": 15003 + }, + { + "epoch": 1.7792007589232774, + "grad_norm": 0.8650113684213617, + "learning_rate": 3.063521006766358e-05, + "loss": 0.1668, + "step": 15004 + }, + { + "epoch": 1.7793193406854027, + "grad_norm": 1.045663671482237, + "learning_rate": 3.063287145537641e-05, + "loss": 0.1797, + "step": 15005 + }, + { + "epoch": 1.7794379224475274, + "grad_norm": 0.8360143987195683, + "learning_rate": 3.0630532791161074e-05, + "loss": 0.1275, + "step": 15006 + }, + { + "epoch": 1.7795565042096526, + "grad_norm": 0.7902629330071124, + "learning_rate": 3.062819407503912e-05, + "loss": 0.1997, + "step": 15007 + }, + { + "epoch": 1.7796750859717776, + "grad_norm": 0.954664613654714, + "learning_rate": 3.062585530703213e-05, + "loss": 0.2607, + "step": 15008 + }, + { + "epoch": 1.7797936677339026, + "grad_norm": 0.9263305888728587, + "learning_rate": 3.062351648716164e-05, + "loss": 0.1859, + "step": 15009 + }, + { + "epoch": 1.7799122494960276, + "grad_norm": 0.817246767281114, + "learning_rate": 3.0621177615449235e-05, + "loss": 0.187, + "step": 15010 + }, + { + "epoch": 1.7800308312581525, + "grad_norm": 1.2079901303750278, + "learning_rate": 3.061883869191646e-05, + "loss": 0.2487, + "step": 15011 + }, + { + "epoch": 1.7801494130202775, + "grad_norm": 0.9699270278591883, + "learning_rate": 3.061649971658488e-05, + "loss": 0.1738, + "step": 15012 + }, + { + "epoch": 1.7802679947824025, + "grad_norm": 0.7507735242219331, + "learning_rate": 3.0614160689476064e-05, + "loss": 0.1377, + "step": 15013 + }, + { + "epoch": 1.7803865765445275, + "grad_norm": 1.1141087734442858, + "learning_rate": 3.0611821610611565e-05, + "loss": 0.2101, + "step": 15014 + }, + { + "epoch": 1.7805051583066525, + "grad_norm": 0.6532339637609492, + "learning_rate": 3.060948248001296e-05, + "loss": 0.1807, + "step": 15015 + }, + { + "epoch": 1.7806237400687774, + "grad_norm": 0.5643074455594136, + "learning_rate": 3.0607143297701796e-05, + "loss": 0.1427, + "step": 15016 + }, + { + "epoch": 1.7807423218309024, + "grad_norm": 0.7555458571314678, + "learning_rate": 3.060480406369966e-05, + "loss": 0.1542, + "step": 15017 + }, + { + "epoch": 1.7808609035930274, + "grad_norm": 1.2327339806365896, + "learning_rate": 3.060246477802809e-05, + "loss": 0.2766, + "step": 15018 + }, + { + "epoch": 1.7809794853551524, + "grad_norm": 1.0940902733850615, + "learning_rate": 3.0600125440708675e-05, + "loss": 0.2074, + "step": 15019 + }, + { + "epoch": 1.7810980671172774, + "grad_norm": 1.2063116015688733, + "learning_rate": 3.0597786051762965e-05, + "loss": 0.2749, + "step": 15020 + }, + { + "epoch": 1.7812166488794023, + "grad_norm": 1.062567650995576, + "learning_rate": 3.059544661121254e-05, + "loss": 0.184, + "step": 15021 + }, + { + "epoch": 1.7813352306415273, + "grad_norm": 0.9317571605688232, + "learning_rate": 3.059310711907896e-05, + "loss": 0.2265, + "step": 15022 + }, + { + "epoch": 1.7814538124036523, + "grad_norm": 0.7919411203336657, + "learning_rate": 3.0590767575383784e-05, + "loss": 0.1988, + "step": 15023 + }, + { + "epoch": 1.7815723941657773, + "grad_norm": 1.0198363396393977, + "learning_rate": 3.058842798014858e-05, + "loss": 0.2363, + "step": 15024 + }, + { + "epoch": 1.7816909759279023, + "grad_norm": 1.480774275320996, + "learning_rate": 3.058608833339493e-05, + "loss": 0.2577, + "step": 15025 + }, + { + "epoch": 1.7818095576900272, + "grad_norm": 0.8630360784642233, + "learning_rate": 3.058374863514439e-05, + "loss": 0.1718, + "step": 15026 + }, + { + "epoch": 1.7819281394521522, + "grad_norm": 0.8706196892826681, + "learning_rate": 3.058140888541854e-05, + "loss": 0.1833, + "step": 15027 + }, + { + "epoch": 1.7820467212142772, + "grad_norm": 0.9402048825943531, + "learning_rate": 3.057906908423895e-05, + "loss": 0.202, + "step": 15028 + }, + { + "epoch": 1.7821653029764022, + "grad_norm": 0.8326133698185489, + "learning_rate": 3.0576729231627184e-05, + "loss": 0.2166, + "step": 15029 + }, + { + "epoch": 1.7822838847385272, + "grad_norm": 0.7161868396325797, + "learning_rate": 3.0574389327604805e-05, + "loss": 0.1385, + "step": 15030 + }, + { + "epoch": 1.7824024665006521, + "grad_norm": 1.1933798182262316, + "learning_rate": 3.05720493721934e-05, + "loss": 0.204, + "step": 15031 + }, + { + "epoch": 1.7825210482627771, + "grad_norm": 1.042768934772802, + "learning_rate": 3.056970936541453e-05, + "loss": 0.2543, + "step": 15032 + }, + { + "epoch": 1.782639630024902, + "grad_norm": 0.9144502665515345, + "learning_rate": 3.056736930728977e-05, + "loss": 0.164, + "step": 15033 + }, + { + "epoch": 1.7827582117870273, + "grad_norm": 0.8355971731378653, + "learning_rate": 3.056502919784069e-05, + "loss": 0.195, + "step": 15034 + }, + { + "epoch": 1.782876793549152, + "grad_norm": 0.7211392836998718, + "learning_rate": 3.056268903708886e-05, + "loss": 0.1276, + "step": 15035 + }, + { + "epoch": 1.7829953753112773, + "grad_norm": 0.869305008338028, + "learning_rate": 3.056034882505587e-05, + "loss": 0.2169, + "step": 15036 + }, + { + "epoch": 1.783113957073402, + "grad_norm": 1.1494142636927818, + "learning_rate": 3.055800856176327e-05, + "loss": 0.2094, + "step": 15037 + }, + { + "epoch": 1.7832325388355272, + "grad_norm": 0.7256864512178909, + "learning_rate": 3.055566824723266e-05, + "loss": 0.1495, + "step": 15038 + }, + { + "epoch": 1.783351120597652, + "grad_norm": 0.9934954311585353, + "learning_rate": 3.05533278814856e-05, + "loss": 0.1864, + "step": 15039 + }, + { + "epoch": 1.7834697023597772, + "grad_norm": 0.9194564824181981, + "learning_rate": 3.055098746454366e-05, + "loss": 0.2458, + "step": 15040 + }, + { + "epoch": 1.783588284121902, + "grad_norm": 0.831704591642692, + "learning_rate": 3.054864699642842e-05, + "loss": 0.1453, + "step": 15041 + }, + { + "epoch": 1.7837068658840272, + "grad_norm": 0.9378154091020408, + "learning_rate": 3.054630647716146e-05, + "loss": 0.2276, + "step": 15042 + }, + { + "epoch": 1.783825447646152, + "grad_norm": 0.8962476704165832, + "learning_rate": 3.0543965906764364e-05, + "loss": 0.1842, + "step": 15043 + }, + { + "epoch": 1.7839440294082771, + "grad_norm": 0.9313003693192361, + "learning_rate": 3.0541625285258695e-05, + "loss": 0.194, + "step": 15044 + }, + { + "epoch": 1.7840626111704019, + "grad_norm": 0.7060279050752727, + "learning_rate": 3.0539284612666036e-05, + "loss": 0.1588, + "step": 15045 + }, + { + "epoch": 1.784181192932527, + "grad_norm": 0.6942513710876012, + "learning_rate": 3.053694388900796e-05, + "loss": 0.1285, + "step": 15046 + }, + { + "epoch": 1.7842997746946518, + "grad_norm": 0.8277999247796552, + "learning_rate": 3.0534603114306065e-05, + "loss": 0.154, + "step": 15047 + }, + { + "epoch": 1.784418356456777, + "grad_norm": 0.8362993102099181, + "learning_rate": 3.053226228858191e-05, + "loss": 0.2005, + "step": 15048 + }, + { + "epoch": 1.7845369382189018, + "grad_norm": 0.7007602192444733, + "learning_rate": 3.0529921411857083e-05, + "loss": 0.1432, + "step": 15049 + }, + { + "epoch": 1.784655519981027, + "grad_norm": 0.7728600934580757, + "learning_rate": 3.052758048415316e-05, + "loss": 0.1928, + "step": 15050 + }, + { + "epoch": 1.7847741017431518, + "grad_norm": 1.0406956162897327, + "learning_rate": 3.052523950549172e-05, + "loss": 0.1924, + "step": 15051 + }, + { + "epoch": 1.784892683505277, + "grad_norm": 0.6120556733696315, + "learning_rate": 3.052289847589435e-05, + "loss": 0.1499, + "step": 15052 + }, + { + "epoch": 1.7850112652674017, + "grad_norm": 0.906324946952052, + "learning_rate": 3.052055739538263e-05, + "loss": 0.1948, + "step": 15053 + }, + { + "epoch": 1.785129847029527, + "grad_norm": 1.2520699830206519, + "learning_rate": 3.0518216263978136e-05, + "loss": 0.2368, + "step": 15054 + }, + { + "epoch": 1.7852484287916517, + "grad_norm": 1.0189197408386639, + "learning_rate": 3.0515875081702455e-05, + "loss": 0.2049, + "step": 15055 + }, + { + "epoch": 1.785367010553777, + "grad_norm": 0.9821141041547082, + "learning_rate": 3.051353384857717e-05, + "loss": 0.2048, + "step": 15056 + }, + { + "epoch": 1.7854855923159016, + "grad_norm": 0.6675484179948086, + "learning_rate": 3.051119256462387e-05, + "loss": 0.1505, + "step": 15057 + }, + { + "epoch": 1.7856041740780269, + "grad_norm": 1.0225506906417692, + "learning_rate": 3.0508851229864126e-05, + "loss": 0.2176, + "step": 15058 + }, + { + "epoch": 1.7857227558401518, + "grad_norm": 0.8316844997245436, + "learning_rate": 3.0506509844319536e-05, + "loss": 0.184, + "step": 15059 + }, + { + "epoch": 1.7858413376022768, + "grad_norm": 1.1294858162788888, + "learning_rate": 3.0504168408011675e-05, + "loss": 0.2417, + "step": 15060 + }, + { + "epoch": 1.7859599193644018, + "grad_norm": 0.8891842119211248, + "learning_rate": 3.050182692096213e-05, + "loss": 0.1762, + "step": 15061 + }, + { + "epoch": 1.7860785011265268, + "grad_norm": 1.1480686747320858, + "learning_rate": 3.0499485383192488e-05, + "loss": 0.2262, + "step": 15062 + }, + { + "epoch": 1.7861970828886518, + "grad_norm": 0.7085558198366173, + "learning_rate": 3.0497143794724337e-05, + "loss": 0.1709, + "step": 15063 + }, + { + "epoch": 1.7863156646507767, + "grad_norm": 1.1634498230188466, + "learning_rate": 3.0494802155579255e-05, + "loss": 0.1955, + "step": 15064 + }, + { + "epoch": 1.7864342464129017, + "grad_norm": 0.8670576851963812, + "learning_rate": 3.0492460465778837e-05, + "loss": 0.1618, + "step": 15065 + }, + { + "epoch": 1.7865528281750267, + "grad_norm": 0.8514071422251481, + "learning_rate": 3.0490118725344674e-05, + "loss": 0.1645, + "step": 15066 + }, + { + "epoch": 1.7866714099371517, + "grad_norm": 1.5799228254459239, + "learning_rate": 3.048777693429834e-05, + "loss": 0.3154, + "step": 15067 + }, + { + "epoch": 1.7867899916992767, + "grad_norm": 0.7002940213229736, + "learning_rate": 3.048543509266144e-05, + "loss": 0.1402, + "step": 15068 + }, + { + "epoch": 1.7869085734614016, + "grad_norm": 0.808569163294969, + "learning_rate": 3.0483093200455555e-05, + "loss": 0.1812, + "step": 15069 + }, + { + "epoch": 1.7870271552235266, + "grad_norm": 0.9085508765663662, + "learning_rate": 3.0480751257702267e-05, + "loss": 0.2018, + "step": 15070 + }, + { + "epoch": 1.7871457369856516, + "grad_norm": 1.1590469291362477, + "learning_rate": 3.0478409264423185e-05, + "loss": 0.2492, + "step": 15071 + }, + { + "epoch": 1.7872643187477766, + "grad_norm": 0.8165589159503883, + "learning_rate": 3.047606722063988e-05, + "loss": 0.1915, + "step": 15072 + }, + { + "epoch": 1.7873829005099016, + "grad_norm": 0.7786564104557779, + "learning_rate": 3.0473725126373952e-05, + "loss": 0.1444, + "step": 15073 + }, + { + "epoch": 1.7875014822720265, + "grad_norm": 1.2685882888561644, + "learning_rate": 3.047138298164699e-05, + "loss": 0.2116, + "step": 15074 + }, + { + "epoch": 1.7876200640341515, + "grad_norm": 0.9161617640014321, + "learning_rate": 3.0469040786480587e-05, + "loss": 0.2539, + "step": 15075 + }, + { + "epoch": 1.7877386457962765, + "grad_norm": 0.7121051470008238, + "learning_rate": 3.0466698540896333e-05, + "loss": 0.1494, + "step": 15076 + }, + { + "epoch": 1.7878572275584015, + "grad_norm": 1.3318210647411703, + "learning_rate": 3.0464356244915825e-05, + "loss": 0.219, + "step": 15077 + }, + { + "epoch": 1.7879758093205265, + "grad_norm": 0.9342947771225024, + "learning_rate": 3.0462013898560648e-05, + "loss": 0.2032, + "step": 15078 + }, + { + "epoch": 1.7880943910826514, + "grad_norm": 0.96882676450013, + "learning_rate": 3.045967150185241e-05, + "loss": 0.1979, + "step": 15079 + }, + { + "epoch": 1.7882129728447764, + "grad_norm": 0.7143986377637299, + "learning_rate": 3.0457329054812688e-05, + "loss": 0.149, + "step": 15080 + }, + { + "epoch": 1.7883315546069014, + "grad_norm": 1.0163838122433735, + "learning_rate": 3.045498655746309e-05, + "loss": 0.2226, + "step": 15081 + }, + { + "epoch": 1.7884501363690264, + "grad_norm": 0.7641621925583265, + "learning_rate": 3.04526440098252e-05, + "loss": 0.163, + "step": 15082 + }, + { + "epoch": 1.7885687181311516, + "grad_norm": 0.9102147617446654, + "learning_rate": 3.0450301411920618e-05, + "loss": 0.1707, + "step": 15083 + }, + { + "epoch": 1.7886872998932764, + "grad_norm": 1.130981273940528, + "learning_rate": 3.0447958763770946e-05, + "loss": 0.2366, + "step": 15084 + }, + { + "epoch": 1.7888058816554016, + "grad_norm": 0.9359421468752543, + "learning_rate": 3.0445616065397763e-05, + "loss": 0.2055, + "step": 15085 + }, + { + "epoch": 1.7889244634175263, + "grad_norm": 0.873492877673776, + "learning_rate": 3.0443273316822696e-05, + "loss": 0.2034, + "step": 15086 + }, + { + "epoch": 1.7890430451796515, + "grad_norm": 0.7094584358565091, + "learning_rate": 3.0440930518067313e-05, + "loss": 0.1612, + "step": 15087 + }, + { + "epoch": 1.7891616269417763, + "grad_norm": 0.7015105064174699, + "learning_rate": 3.0438587669153228e-05, + "loss": 0.156, + "step": 15088 + }, + { + "epoch": 1.7892802087039015, + "grad_norm": 1.3203278050531417, + "learning_rate": 3.043624477010203e-05, + "loss": 0.2429, + "step": 15089 + }, + { + "epoch": 1.7893987904660262, + "grad_norm": 1.002352749516082, + "learning_rate": 3.0433901820935323e-05, + "loss": 0.2344, + "step": 15090 + }, + { + "epoch": 1.7895173722281514, + "grad_norm": 1.0472292542025867, + "learning_rate": 3.04315588216747e-05, + "loss": 0.204, + "step": 15091 + }, + { + "epoch": 1.7896359539902762, + "grad_norm": 0.8671128480233045, + "learning_rate": 3.0429215772341773e-05, + "loss": 0.1446, + "step": 15092 + }, + { + "epoch": 1.7897545357524014, + "grad_norm": 0.949239721749239, + "learning_rate": 3.0426872672958127e-05, + "loss": 0.2235, + "step": 15093 + }, + { + "epoch": 1.7898731175145262, + "grad_norm": 0.7749236962079176, + "learning_rate": 3.0424529523545375e-05, + "loss": 0.1497, + "step": 15094 + }, + { + "epoch": 1.7899916992766514, + "grad_norm": 1.3278212978568515, + "learning_rate": 3.0422186324125106e-05, + "loss": 0.2652, + "step": 15095 + }, + { + "epoch": 1.7901102810387761, + "grad_norm": 0.787656950762092, + "learning_rate": 3.041984307471894e-05, + "loss": 0.1946, + "step": 15096 + }, + { + "epoch": 1.7902288628009013, + "grad_norm": 0.8533510174075591, + "learning_rate": 3.0417499775348456e-05, + "loss": 0.2073, + "step": 15097 + }, + { + "epoch": 1.790347444563026, + "grad_norm": 0.8411985974456255, + "learning_rate": 3.0415156426035275e-05, + "loss": 0.1779, + "step": 15098 + }, + { + "epoch": 1.7904660263251513, + "grad_norm": 0.9947739802413572, + "learning_rate": 3.041281302680099e-05, + "loss": 0.2616, + "step": 15099 + }, + { + "epoch": 1.790584608087276, + "grad_norm": 1.0104347502952047, + "learning_rate": 3.04104695776672e-05, + "loss": 0.198, + "step": 15100 + }, + { + "epoch": 1.7907031898494012, + "grad_norm": 0.8749769007912434, + "learning_rate": 3.040812607865553e-05, + "loss": 0.1507, + "step": 15101 + }, + { + "epoch": 1.790821771611526, + "grad_norm": 1.0426841810320806, + "learning_rate": 3.040578252978756e-05, + "loss": 0.189, + "step": 15102 + }, + { + "epoch": 1.7909403533736512, + "grad_norm": 0.8119972545994057, + "learning_rate": 3.0403438931084906e-05, + "loss": 0.2008, + "step": 15103 + }, + { + "epoch": 1.791058935135776, + "grad_norm": 0.9879411518619514, + "learning_rate": 3.0401095282569164e-05, + "loss": 0.1612, + "step": 15104 + }, + { + "epoch": 1.7911775168979012, + "grad_norm": 1.175734808141919, + "learning_rate": 3.039875158426195e-05, + "loss": 0.3046, + "step": 15105 + }, + { + "epoch": 1.791296098660026, + "grad_norm": 0.8854419016125191, + "learning_rate": 3.039640783618487e-05, + "loss": 0.1798, + "step": 15106 + }, + { + "epoch": 1.7914146804221511, + "grad_norm": 1.4170225891725052, + "learning_rate": 3.0394064038359526e-05, + "loss": 0.2838, + "step": 15107 + }, + { + "epoch": 1.791533262184276, + "grad_norm": 1.0318708646502612, + "learning_rate": 3.0391720190807528e-05, + "loss": 0.1907, + "step": 15108 + }, + { + "epoch": 1.791651843946401, + "grad_norm": 0.7220823211156863, + "learning_rate": 3.038937629355048e-05, + "loss": 0.19, + "step": 15109 + }, + { + "epoch": 1.791770425708526, + "grad_norm": 0.9113796520628461, + "learning_rate": 3.0387032346609995e-05, + "loss": 0.1505, + "step": 15110 + }, + { + "epoch": 1.791889007470651, + "grad_norm": 0.8636816079809962, + "learning_rate": 3.0384688350007673e-05, + "loss": 0.1737, + "step": 15111 + }, + { + "epoch": 1.792007589232776, + "grad_norm": 0.684387995312786, + "learning_rate": 3.038234430376513e-05, + "loss": 0.1551, + "step": 15112 + }, + { + "epoch": 1.792126170994901, + "grad_norm": 0.995834711040808, + "learning_rate": 3.038000020790397e-05, + "loss": 0.1999, + "step": 15113 + }, + { + "epoch": 1.792244752757026, + "grad_norm": 0.9854973981995784, + "learning_rate": 3.03776560624458e-05, + "loss": 0.2123, + "step": 15114 + }, + { + "epoch": 1.792363334519151, + "grad_norm": 1.2093926812717746, + "learning_rate": 3.037531186741225e-05, + "loss": 0.2837, + "step": 15115 + }, + { + "epoch": 1.792481916281276, + "grad_norm": 0.7347787174386884, + "learning_rate": 3.037296762282491e-05, + "loss": 0.1628, + "step": 15116 + }, + { + "epoch": 1.792600498043401, + "grad_norm": 0.7990073752812888, + "learning_rate": 3.0370623328705396e-05, + "loss": 0.1594, + "step": 15117 + }, + { + "epoch": 1.792719079805526, + "grad_norm": 0.7580084617810435, + "learning_rate": 3.036827898507532e-05, + "loss": 0.1729, + "step": 15118 + }, + { + "epoch": 1.792837661567651, + "grad_norm": 0.8370990284197574, + "learning_rate": 3.0365934591956297e-05, + "loss": 0.1524, + "step": 15119 + }, + { + "epoch": 1.7929562433297759, + "grad_norm": 0.8575889348462968, + "learning_rate": 3.0363590149369935e-05, + "loss": 0.1648, + "step": 15120 + }, + { + "epoch": 1.7930748250919009, + "grad_norm": 0.7589098387048121, + "learning_rate": 3.036124565733785e-05, + "loss": 0.1481, + "step": 15121 + }, + { + "epoch": 1.7931934068540258, + "grad_norm": 0.7493453031170967, + "learning_rate": 3.035890111588166e-05, + "loss": 0.1805, + "step": 15122 + }, + { + "epoch": 1.7933119886161508, + "grad_norm": 1.1922993449938477, + "learning_rate": 3.0356556525022967e-05, + "loss": 0.2409, + "step": 15123 + }, + { + "epoch": 1.7934305703782758, + "grad_norm": 0.9332618609896659, + "learning_rate": 3.035421188478339e-05, + "loss": 0.1698, + "step": 15124 + }, + { + "epoch": 1.7935491521404008, + "grad_norm": 0.9296148605729271, + "learning_rate": 3.0351867195184546e-05, + "loss": 0.1838, + "step": 15125 + }, + { + "epoch": 1.7936677339025258, + "grad_norm": 0.9865476139977759, + "learning_rate": 3.0349522456248054e-05, + "loss": 0.1866, + "step": 15126 + }, + { + "epoch": 1.7937863156646507, + "grad_norm": 0.6661975462218911, + "learning_rate": 3.034717766799552e-05, + "loss": 0.1232, + "step": 15127 + }, + { + "epoch": 1.7939048974267757, + "grad_norm": 0.6364870741996531, + "learning_rate": 3.0344832830448566e-05, + "loss": 0.1378, + "step": 15128 + }, + { + "epoch": 1.7940234791889007, + "grad_norm": 0.8690302360288122, + "learning_rate": 3.0342487943628812e-05, + "loss": 0.1837, + "step": 15129 + }, + { + "epoch": 1.7941420609510257, + "grad_norm": 1.0647984479267858, + "learning_rate": 3.0340143007557864e-05, + "loss": 0.2676, + "step": 15130 + }, + { + "epoch": 1.7942606427131507, + "grad_norm": 0.7935735954744033, + "learning_rate": 3.0337798022257353e-05, + "loss": 0.1464, + "step": 15131 + }, + { + "epoch": 1.7943792244752759, + "grad_norm": 0.9625630148857761, + "learning_rate": 3.0335452987748887e-05, + "loss": 0.2092, + "step": 15132 + }, + { + "epoch": 1.7944978062374006, + "grad_norm": 0.8680981898868116, + "learning_rate": 3.0333107904054082e-05, + "loss": 0.1884, + "step": 15133 + }, + { + "epoch": 1.7946163879995258, + "grad_norm": 1.022396140066591, + "learning_rate": 3.0330762771194564e-05, + "loss": 0.1762, + "step": 15134 + }, + { + "epoch": 1.7947349697616506, + "grad_norm": 1.0445331361915915, + "learning_rate": 3.0328417589191953e-05, + "loss": 0.1821, + "step": 15135 + }, + { + "epoch": 1.7948535515237758, + "grad_norm": 0.9287804388885226, + "learning_rate": 3.0326072358067865e-05, + "loss": 0.1658, + "step": 15136 + }, + { + "epoch": 1.7949721332859006, + "grad_norm": 0.7848682682898168, + "learning_rate": 3.0323727077843927e-05, + "loss": 0.1365, + "step": 15137 + }, + { + "epoch": 1.7950907150480258, + "grad_norm": 0.9035300127518623, + "learning_rate": 3.0321381748541744e-05, + "loss": 0.1803, + "step": 15138 + }, + { + "epoch": 1.7952092968101505, + "grad_norm": 0.9237393098342, + "learning_rate": 3.0319036370182957e-05, + "loss": 0.2235, + "step": 15139 + }, + { + "epoch": 1.7953278785722757, + "grad_norm": 0.7577108488858775, + "learning_rate": 3.0316690942789168e-05, + "loss": 0.1602, + "step": 15140 + }, + { + "epoch": 1.7954464603344005, + "grad_norm": 0.8902109742225938, + "learning_rate": 3.0314345466382014e-05, + "loss": 0.1374, + "step": 15141 + }, + { + "epoch": 1.7955650420965257, + "grad_norm": 1.2491200537282414, + "learning_rate": 3.031199994098311e-05, + "loss": 0.2703, + "step": 15142 + }, + { + "epoch": 1.7956836238586504, + "grad_norm": 0.8512950645413768, + "learning_rate": 3.0309654366614077e-05, + "loss": 0.2045, + "step": 15143 + }, + { + "epoch": 1.7958022056207756, + "grad_norm": 0.8153526486194095, + "learning_rate": 3.0307308743296543e-05, + "loss": 0.1865, + "step": 15144 + }, + { + "epoch": 1.7959207873829004, + "grad_norm": 0.7205293282425662, + "learning_rate": 3.030496307105213e-05, + "loss": 0.1219, + "step": 15145 + }, + { + "epoch": 1.7960393691450256, + "grad_norm": 0.8432247174457126, + "learning_rate": 3.0302617349902467e-05, + "loss": 0.1537, + "step": 15146 + }, + { + "epoch": 1.7961579509071504, + "grad_norm": 0.9184750924442107, + "learning_rate": 3.0300271579869173e-05, + "loss": 0.1787, + "step": 15147 + }, + { + "epoch": 1.7962765326692756, + "grad_norm": 0.7055807895837582, + "learning_rate": 3.0297925760973876e-05, + "loss": 0.1603, + "step": 15148 + }, + { + "epoch": 1.7963951144314003, + "grad_norm": 0.926546078610508, + "learning_rate": 3.0295579893238196e-05, + "loss": 0.2097, + "step": 15149 + }, + { + "epoch": 1.7965136961935255, + "grad_norm": 0.8381360355131298, + "learning_rate": 3.029323397668377e-05, + "loss": 0.2092, + "step": 15150 + }, + { + "epoch": 1.7966322779556503, + "grad_norm": 0.6224614451050496, + "learning_rate": 3.0290888011332212e-05, + "loss": 0.1095, + "step": 15151 + }, + { + "epoch": 1.7967508597177755, + "grad_norm": 1.0446531147203062, + "learning_rate": 3.0288541997205162e-05, + "loss": 0.2509, + "step": 15152 + }, + { + "epoch": 1.7968694414799002, + "grad_norm": 1.1698832100488008, + "learning_rate": 3.028619593432423e-05, + "loss": 0.2497, + "step": 15153 + }, + { + "epoch": 1.7969880232420254, + "grad_norm": 0.923408046294736, + "learning_rate": 3.0283849822711057e-05, + "loss": 0.1524, + "step": 15154 + }, + { + "epoch": 1.7971066050041502, + "grad_norm": 0.6647565274894796, + "learning_rate": 3.0281503662387266e-05, + "loss": 0.1357, + "step": 15155 + }, + { + "epoch": 1.7972251867662754, + "grad_norm": 1.1012529538844202, + "learning_rate": 3.027915745337449e-05, + "loss": 0.2267, + "step": 15156 + }, + { + "epoch": 1.7973437685284004, + "grad_norm": 1.1055065388469718, + "learning_rate": 3.027681119569436e-05, + "loss": 0.2895, + "step": 15157 + }, + { + "epoch": 1.7974623502905254, + "grad_norm": 1.659422312752869, + "learning_rate": 3.02744648893685e-05, + "loss": 0.3695, + "step": 15158 + }, + { + "epoch": 1.7975809320526503, + "grad_norm": 0.9575122080837081, + "learning_rate": 3.0272118534418542e-05, + "loss": 0.1872, + "step": 15159 + }, + { + "epoch": 1.7976995138147753, + "grad_norm": 0.8921310913126703, + "learning_rate": 3.0269772130866116e-05, + "loss": 0.1713, + "step": 15160 + }, + { + "epoch": 1.7978180955769003, + "grad_norm": 0.891996189661989, + "learning_rate": 3.0267425678732853e-05, + "loss": 0.2029, + "step": 15161 + }, + { + "epoch": 1.7979366773390253, + "grad_norm": 0.9277615167572174, + "learning_rate": 3.0265079178040378e-05, + "loss": 0.202, + "step": 15162 + }, + { + "epoch": 1.7980552591011503, + "grad_norm": 0.7906371024841173, + "learning_rate": 3.0262732628810336e-05, + "loss": 0.175, + "step": 15163 + }, + { + "epoch": 1.7981738408632753, + "grad_norm": 1.2059077480745255, + "learning_rate": 3.0260386031064348e-05, + "loss": 0.3199, + "step": 15164 + }, + { + "epoch": 1.7982924226254002, + "grad_norm": 1.196806725936194, + "learning_rate": 3.025803938482406e-05, + "loss": 0.1791, + "step": 15165 + }, + { + "epoch": 1.7984110043875252, + "grad_norm": 0.8649698282977638, + "learning_rate": 3.025569269011109e-05, + "loss": 0.1758, + "step": 15166 + }, + { + "epoch": 1.7985295861496502, + "grad_norm": 0.8237529866245534, + "learning_rate": 3.025334594694708e-05, + "loss": 0.1727, + "step": 15167 + }, + { + "epoch": 1.7986481679117752, + "grad_norm": 0.8799517695554747, + "learning_rate": 3.0250999155353665e-05, + "loss": 0.1801, + "step": 15168 + }, + { + "epoch": 1.7987667496739002, + "grad_norm": 0.8653762478060637, + "learning_rate": 3.0248652315352476e-05, + "loss": 0.1531, + "step": 15169 + }, + { + "epoch": 1.7988853314360251, + "grad_norm": 0.9687217116318256, + "learning_rate": 3.024630542696515e-05, + "loss": 0.1865, + "step": 15170 + }, + { + "epoch": 1.7990039131981501, + "grad_norm": 0.9614879824581637, + "learning_rate": 3.0243958490213314e-05, + "loss": 0.1723, + "step": 15171 + }, + { + "epoch": 1.799122494960275, + "grad_norm": 0.8152498587171324, + "learning_rate": 3.0241611505118617e-05, + "loss": 0.146, + "step": 15172 + }, + { + "epoch": 1.7992410767224, + "grad_norm": 0.7503507808764435, + "learning_rate": 3.023926447170269e-05, + "loss": 0.1786, + "step": 15173 + }, + { + "epoch": 1.799359658484525, + "grad_norm": 0.9909166457345013, + "learning_rate": 3.0236917389987164e-05, + "loss": 0.1819, + "step": 15174 + }, + { + "epoch": 1.79947824024665, + "grad_norm": 0.6490971410775556, + "learning_rate": 3.023457025999368e-05, + "loss": 0.1364, + "step": 15175 + }, + { + "epoch": 1.799596822008775, + "grad_norm": 0.6271221231773376, + "learning_rate": 3.0232223081743895e-05, + "loss": 0.1285, + "step": 15176 + }, + { + "epoch": 1.7997154037709, + "grad_norm": 1.1269145907481626, + "learning_rate": 3.0229875855259414e-05, + "loss": 0.318, + "step": 15177 + }, + { + "epoch": 1.799833985533025, + "grad_norm": 0.7080285625080588, + "learning_rate": 3.02275285805619e-05, + "loss": 0.1437, + "step": 15178 + }, + { + "epoch": 1.79995256729515, + "grad_norm": 1.195582649079599, + "learning_rate": 3.0225181257672974e-05, + "loss": 0.1973, + "step": 15179 + }, + { + "epoch": 1.800071149057275, + "grad_norm": 0.7056648506827501, + "learning_rate": 3.0222833886614287e-05, + "loss": 0.1537, + "step": 15180 + }, + { + "epoch": 1.8001897308194, + "grad_norm": 0.7634380181371664, + "learning_rate": 3.0220486467407476e-05, + "loss": 0.1679, + "step": 15181 + }, + { + "epoch": 1.800308312581525, + "grad_norm": 0.8305258626733053, + "learning_rate": 3.0218139000074184e-05, + "loss": 0.2006, + "step": 15182 + }, + { + "epoch": 1.80042689434365, + "grad_norm": 0.9614430946074176, + "learning_rate": 3.0215791484636045e-05, + "loss": 0.2417, + "step": 15183 + }, + { + "epoch": 1.8005454761057749, + "grad_norm": 0.8289136051339754, + "learning_rate": 3.021344392111471e-05, + "loss": 0.2345, + "step": 15184 + }, + { + "epoch": 1.8006640578679, + "grad_norm": 0.9665032960955334, + "learning_rate": 3.021109630953181e-05, + "loss": 0.1767, + "step": 15185 + }, + { + "epoch": 1.8007826396300248, + "grad_norm": 1.0381549283258653, + "learning_rate": 3.0208748649908992e-05, + "loss": 0.2341, + "step": 15186 + }, + { + "epoch": 1.80090122139215, + "grad_norm": 1.0135276992509468, + "learning_rate": 3.0206400942267903e-05, + "loss": 0.1753, + "step": 15187 + }, + { + "epoch": 1.8010198031542748, + "grad_norm": 0.9851074408076888, + "learning_rate": 3.0204053186630184e-05, + "loss": 0.2157, + "step": 15188 + }, + { + "epoch": 1.8011383849164, + "grad_norm": 0.7069709081805873, + "learning_rate": 3.0201705383017477e-05, + "loss": 0.1486, + "step": 15189 + }, + { + "epoch": 1.8012569666785248, + "grad_norm": 1.058078552829204, + "learning_rate": 3.0199357531451415e-05, + "loss": 0.216, + "step": 15190 + }, + { + "epoch": 1.80137554844065, + "grad_norm": 0.7255260183921318, + "learning_rate": 3.019700963195366e-05, + "loss": 0.1623, + "step": 15191 + }, + { + "epoch": 1.8014941302027747, + "grad_norm": 1.5467491054195392, + "learning_rate": 3.019466168454585e-05, + "loss": 0.3077, + "step": 15192 + }, + { + "epoch": 1.8016127119649, + "grad_norm": 0.5831057638334622, + "learning_rate": 3.0192313689249623e-05, + "loss": 0.131, + "step": 15193 + }, + { + "epoch": 1.8017312937270247, + "grad_norm": 1.1086214986819922, + "learning_rate": 3.0189965646086634e-05, + "loss": 0.2681, + "step": 15194 + }, + { + "epoch": 1.8018498754891499, + "grad_norm": 0.6323700057636477, + "learning_rate": 3.0187617555078527e-05, + "loss": 0.1433, + "step": 15195 + }, + { + "epoch": 1.8019684572512746, + "grad_norm": 1.3728689354747305, + "learning_rate": 3.0185269416246948e-05, + "loss": 0.2884, + "step": 15196 + }, + { + "epoch": 1.8020870390133998, + "grad_norm": 0.7284811163355518, + "learning_rate": 3.018292122961355e-05, + "loss": 0.1647, + "step": 15197 + }, + { + "epoch": 1.8022056207755246, + "grad_norm": 0.6014879788591126, + "learning_rate": 3.0180572995199962e-05, + "loss": 0.1184, + "step": 15198 + }, + { + "epoch": 1.8023242025376498, + "grad_norm": 1.009939388979859, + "learning_rate": 3.0178224713027847e-05, + "loss": 0.175, + "step": 15199 + }, + { + "epoch": 1.8024427842997746, + "grad_norm": 1.1736012838801961, + "learning_rate": 3.0175876383118855e-05, + "loss": 0.2469, + "step": 15200 + }, + { + "epoch": 1.8025613660618998, + "grad_norm": 1.0809549657714579, + "learning_rate": 3.0173528005494623e-05, + "loss": 0.3018, + "step": 15201 + }, + { + "epoch": 1.8026799478240245, + "grad_norm": 0.8707304119369763, + "learning_rate": 3.0171179580176816e-05, + "loss": 0.2143, + "step": 15202 + }, + { + "epoch": 1.8027985295861497, + "grad_norm": 0.7763017402391296, + "learning_rate": 3.016883110718707e-05, + "loss": 0.2098, + "step": 15203 + }, + { + "epoch": 1.8029171113482745, + "grad_norm": 1.0676342060227528, + "learning_rate": 3.0166482586547033e-05, + "loss": 0.2159, + "step": 15204 + }, + { + "epoch": 1.8030356931103997, + "grad_norm": 1.3177373353827886, + "learning_rate": 3.0164134018278366e-05, + "loss": 0.2086, + "step": 15205 + }, + { + "epoch": 1.8031542748725244, + "grad_norm": 0.8231907594151363, + "learning_rate": 3.0161785402402726e-05, + "loss": 0.1829, + "step": 15206 + }, + { + "epoch": 1.8032728566346496, + "grad_norm": 0.6775343531058204, + "learning_rate": 3.0159436738941744e-05, + "loss": 0.1595, + "step": 15207 + }, + { + "epoch": 1.8033914383967746, + "grad_norm": 1.1345164424781191, + "learning_rate": 3.0157088027917085e-05, + "loss": 0.1845, + "step": 15208 + }, + { + "epoch": 1.8035100201588996, + "grad_norm": 0.8007594495605799, + "learning_rate": 3.0154739269350397e-05, + "loss": 0.1715, + "step": 15209 + }, + { + "epoch": 1.8036286019210246, + "grad_norm": 1.2253192155462074, + "learning_rate": 3.015239046326334e-05, + "loss": 0.3129, + "step": 15210 + }, + { + "epoch": 1.8037471836831496, + "grad_norm": 0.44685152774035564, + "learning_rate": 3.0150041609677554e-05, + "loss": 0.1181, + "step": 15211 + }, + { + "epoch": 1.8038657654452745, + "grad_norm": 0.7157588618011987, + "learning_rate": 3.01476927086147e-05, + "loss": 0.1851, + "step": 15212 + }, + { + "epoch": 1.8039843472073995, + "grad_norm": 0.8767484621703151, + "learning_rate": 3.0145343760096435e-05, + "loss": 0.1753, + "step": 15213 + }, + { + "epoch": 1.8041029289695245, + "grad_norm": 0.6807568994789905, + "learning_rate": 3.0142994764144407e-05, + "loss": 0.1452, + "step": 15214 + }, + { + "epoch": 1.8042215107316495, + "grad_norm": 0.7805003746792329, + "learning_rate": 3.014064572078028e-05, + "loss": 0.1433, + "step": 15215 + }, + { + "epoch": 1.8043400924937745, + "grad_norm": 0.8650929419677594, + "learning_rate": 3.0138296630025696e-05, + "loss": 0.1771, + "step": 15216 + }, + { + "epoch": 1.8044586742558995, + "grad_norm": 0.7228113999420612, + "learning_rate": 3.013594749190232e-05, + "loss": 0.1505, + "step": 15217 + }, + { + "epoch": 1.8045772560180244, + "grad_norm": 0.7615932269820929, + "learning_rate": 3.013359830643181e-05, + "loss": 0.1465, + "step": 15218 + }, + { + "epoch": 1.8046958377801494, + "grad_norm": 1.3587083966242381, + "learning_rate": 3.013124907363582e-05, + "loss": 0.2333, + "step": 15219 + }, + { + "epoch": 1.8048144195422744, + "grad_norm": 0.5269649522379545, + "learning_rate": 3.0128899793536003e-05, + "loss": 0.1409, + "step": 15220 + }, + { + "epoch": 1.8049330013043994, + "grad_norm": 0.6832051578571429, + "learning_rate": 3.012655046615402e-05, + "loss": 0.1356, + "step": 15221 + }, + { + "epoch": 1.8050515830665244, + "grad_norm": 1.1215541056903264, + "learning_rate": 3.0124201091511527e-05, + "loss": 0.2105, + "step": 15222 + }, + { + "epoch": 1.8051701648286493, + "grad_norm": 1.0986712059210137, + "learning_rate": 3.012185166963018e-05, + "loss": 0.2195, + "step": 15223 + }, + { + "epoch": 1.8052887465907743, + "grad_norm": 0.7034890540046537, + "learning_rate": 3.0119502200531646e-05, + "loss": 0.147, + "step": 15224 + }, + { + "epoch": 1.8054073283528993, + "grad_norm": 1.347226414579973, + "learning_rate": 3.011715268423758e-05, + "loss": 0.2905, + "step": 15225 + }, + { + "epoch": 1.8055259101150243, + "grad_norm": 0.7704153796071466, + "learning_rate": 3.0114803120769642e-05, + "loss": 0.1809, + "step": 15226 + }, + { + "epoch": 1.8056444918771493, + "grad_norm": 0.7338520567410853, + "learning_rate": 3.0112453510149484e-05, + "loss": 0.1574, + "step": 15227 + }, + { + "epoch": 1.8057630736392742, + "grad_norm": 0.907874191340384, + "learning_rate": 3.0110103852398786e-05, + "loss": 0.1843, + "step": 15228 + }, + { + "epoch": 1.8058816554013992, + "grad_norm": 0.9088841774887669, + "learning_rate": 3.0107754147539185e-05, + "loss": 0.2221, + "step": 15229 + }, + { + "epoch": 1.8060002371635242, + "grad_norm": 0.7609009955089608, + "learning_rate": 3.010540439559236e-05, + "loss": 0.177, + "step": 15230 + }, + { + "epoch": 1.8061188189256492, + "grad_norm": 1.4646084576475629, + "learning_rate": 3.0103054596579973e-05, + "loss": 0.3375, + "step": 15231 + }, + { + "epoch": 1.8062374006877744, + "grad_norm": 0.9045036234033605, + "learning_rate": 3.010070475052368e-05, + "loss": 0.1942, + "step": 15232 + }, + { + "epoch": 1.8063559824498991, + "grad_norm": 0.7640047218126231, + "learning_rate": 3.009835485744513e-05, + "loss": 0.152, + "step": 15233 + }, + { + "epoch": 1.8064745642120243, + "grad_norm": 0.9173896470133076, + "learning_rate": 3.0096004917366012e-05, + "loss": 0.1898, + "step": 15234 + }, + { + "epoch": 1.806593145974149, + "grad_norm": 1.043075417077305, + "learning_rate": 3.009365493030797e-05, + "loss": 0.16, + "step": 15235 + }, + { + "epoch": 1.8067117277362743, + "grad_norm": 0.7275174754493908, + "learning_rate": 3.0091304896292682e-05, + "loss": 0.2044, + "step": 15236 + }, + { + "epoch": 1.806830309498399, + "grad_norm": 0.909334831950319, + "learning_rate": 3.0088954815341803e-05, + "loss": 0.1923, + "step": 15237 + }, + { + "epoch": 1.8069488912605243, + "grad_norm": 0.6629246171655451, + "learning_rate": 3.0086604687477004e-05, + "loss": 0.147, + "step": 15238 + }, + { + "epoch": 1.807067473022649, + "grad_norm": 1.1190756020745454, + "learning_rate": 3.0084254512719946e-05, + "loss": 0.2283, + "step": 15239 + }, + { + "epoch": 1.8071860547847742, + "grad_norm": 1.114967319532472, + "learning_rate": 3.00819042910923e-05, + "loss": 0.2024, + "step": 15240 + }, + { + "epoch": 1.807304636546899, + "grad_norm": 1.6740702817604287, + "learning_rate": 3.0079554022615725e-05, + "loss": 0.3497, + "step": 15241 + }, + { + "epoch": 1.8074232183090242, + "grad_norm": 0.6966267142243856, + "learning_rate": 3.0077203707311887e-05, + "loss": 0.1313, + "step": 15242 + }, + { + "epoch": 1.807541800071149, + "grad_norm": 0.6850710284267119, + "learning_rate": 3.0074853345202457e-05, + "loss": 0.1373, + "step": 15243 + }, + { + "epoch": 1.8076603818332742, + "grad_norm": 0.6888011907686001, + "learning_rate": 3.0072502936309104e-05, + "loss": 0.1354, + "step": 15244 + }, + { + "epoch": 1.807778963595399, + "grad_norm": 0.9073493498749794, + "learning_rate": 3.0070152480653502e-05, + "loss": 0.1837, + "step": 15245 + }, + { + "epoch": 1.8078975453575241, + "grad_norm": 0.8809591885366445, + "learning_rate": 3.0067801978257304e-05, + "loss": 0.167, + "step": 15246 + }, + { + "epoch": 1.8080161271196489, + "grad_norm": 1.0447136164065292, + "learning_rate": 3.006545142914219e-05, + "loss": 0.1936, + "step": 15247 + }, + { + "epoch": 1.808134708881774, + "grad_norm": 0.9471722626731461, + "learning_rate": 3.006310083332982e-05, + "loss": 0.1992, + "step": 15248 + }, + { + "epoch": 1.8082532906438988, + "grad_norm": 0.871852098025646, + "learning_rate": 3.0060750190841875e-05, + "loss": 0.1733, + "step": 15249 + }, + { + "epoch": 1.808371872406024, + "grad_norm": 0.7433856102331952, + "learning_rate": 3.0058399501700014e-05, + "loss": 0.1886, + "step": 15250 + }, + { + "epoch": 1.8084904541681488, + "grad_norm": 0.8056981082973372, + "learning_rate": 3.005604876592592e-05, + "loss": 0.1617, + "step": 15251 + }, + { + "epoch": 1.808609035930274, + "grad_norm": 0.8563069023566553, + "learning_rate": 3.005369798354125e-05, + "loss": 0.1679, + "step": 15252 + }, + { + "epoch": 1.8087276176923988, + "grad_norm": 0.675958477332458, + "learning_rate": 3.0051347154567684e-05, + "loss": 0.1134, + "step": 15253 + }, + { + "epoch": 1.808846199454524, + "grad_norm": 0.9784727745964439, + "learning_rate": 3.0048996279026885e-05, + "loss": 0.2005, + "step": 15254 + }, + { + "epoch": 1.8089647812166487, + "grad_norm": 0.763599462231589, + "learning_rate": 3.004664535694054e-05, + "loss": 0.1435, + "step": 15255 + }, + { + "epoch": 1.809083362978774, + "grad_norm": 1.0231670583924364, + "learning_rate": 3.004429438833031e-05, + "loss": 0.1974, + "step": 15256 + }, + { + "epoch": 1.809201944740899, + "grad_norm": 1.1335851175381495, + "learning_rate": 3.0041943373217874e-05, + "loss": 0.2226, + "step": 15257 + }, + { + "epoch": 1.8093205265030239, + "grad_norm": 0.9297806030470269, + "learning_rate": 3.0039592311624903e-05, + "loss": 0.154, + "step": 15258 + }, + { + "epoch": 1.8094391082651489, + "grad_norm": 1.325758308319614, + "learning_rate": 3.003724120357307e-05, + "loss": 0.2998, + "step": 15259 + }, + { + "epoch": 1.8095576900272738, + "grad_norm": 1.0361172507235572, + "learning_rate": 3.0034890049084054e-05, + "loss": 0.2395, + "step": 15260 + }, + { + "epoch": 1.8096762717893988, + "grad_norm": 0.8143665501022673, + "learning_rate": 3.0032538848179513e-05, + "loss": 0.1723, + "step": 15261 + }, + { + "epoch": 1.8097948535515238, + "grad_norm": 1.5104711784066196, + "learning_rate": 3.0030187600881145e-05, + "loss": 0.2959, + "step": 15262 + }, + { + "epoch": 1.8099134353136488, + "grad_norm": 0.7184342013057864, + "learning_rate": 3.002783630721061e-05, + "loss": 0.1632, + "step": 15263 + }, + { + "epoch": 1.8100320170757738, + "grad_norm": 0.9669895671358301, + "learning_rate": 3.0025484967189597e-05, + "loss": 0.1812, + "step": 15264 + }, + { + "epoch": 1.8101505988378987, + "grad_norm": 0.9439078425332885, + "learning_rate": 3.0023133580839773e-05, + "loss": 0.2075, + "step": 15265 + }, + { + "epoch": 1.8102691806000237, + "grad_norm": 0.7995536605362623, + "learning_rate": 3.002078214818282e-05, + "loss": 0.1618, + "step": 15266 + }, + { + "epoch": 1.8103877623621487, + "grad_norm": 0.895161977697565, + "learning_rate": 3.0018430669240406e-05, + "loss": 0.1823, + "step": 15267 + }, + { + "epoch": 1.8105063441242737, + "grad_norm": 0.773872719254621, + "learning_rate": 3.0016079144034227e-05, + "loss": 0.1946, + "step": 15268 + }, + { + "epoch": 1.8106249258863987, + "grad_norm": 0.8960827594249422, + "learning_rate": 3.001372757258594e-05, + "loss": 0.2036, + "step": 15269 + }, + { + "epoch": 1.8107435076485237, + "grad_norm": 0.68428085502591, + "learning_rate": 3.0011375954917227e-05, + "loss": 0.1773, + "step": 15270 + }, + { + "epoch": 1.8108620894106486, + "grad_norm": 0.8103033257681806, + "learning_rate": 3.000902429104978e-05, + "loss": 0.1728, + "step": 15271 + }, + { + "epoch": 1.8109806711727736, + "grad_norm": 0.807279193208687, + "learning_rate": 3.0006672581005274e-05, + "loss": 0.1736, + "step": 15272 + }, + { + "epoch": 1.8110992529348986, + "grad_norm": 0.6801779311640868, + "learning_rate": 3.000432082480538e-05, + "loss": 0.1524, + "step": 15273 + }, + { + "epoch": 1.8112178346970236, + "grad_norm": 1.0171118677664284, + "learning_rate": 3.0001969022471788e-05, + "loss": 0.1955, + "step": 15274 + }, + { + "epoch": 1.8113364164591486, + "grad_norm": 1.0212226607741433, + "learning_rate": 2.9999617174026178e-05, + "loss": 0.1894, + "step": 15275 + }, + { + "epoch": 1.8114549982212735, + "grad_norm": 0.7080880806476614, + "learning_rate": 2.9997265279490226e-05, + "loss": 0.1538, + "step": 15276 + }, + { + "epoch": 1.8115735799833985, + "grad_norm": 1.0457971179203098, + "learning_rate": 2.9994913338885623e-05, + "loss": 0.2108, + "step": 15277 + }, + { + "epoch": 1.8116921617455235, + "grad_norm": 1.4724577619725654, + "learning_rate": 2.9992561352234035e-05, + "loss": 0.3365, + "step": 15278 + }, + { + "epoch": 1.8118107435076485, + "grad_norm": 0.9014810510951258, + "learning_rate": 2.9990209319557156e-05, + "loss": 0.1571, + "step": 15279 + }, + { + "epoch": 1.8119293252697735, + "grad_norm": 0.7977892961064299, + "learning_rate": 2.9987857240876665e-05, + "loss": 0.1601, + "step": 15280 + }, + { + "epoch": 1.8120479070318984, + "grad_norm": 0.6278846627918053, + "learning_rate": 2.9985505116214252e-05, + "loss": 0.1319, + "step": 15281 + }, + { + "epoch": 1.8121664887940234, + "grad_norm": 0.8030058293635072, + "learning_rate": 2.998315294559159e-05, + "loss": 0.1915, + "step": 15282 + }, + { + "epoch": 1.8122850705561486, + "grad_norm": 0.8733656039367683, + "learning_rate": 2.9980800729030374e-05, + "loss": 0.2132, + "step": 15283 + }, + { + "epoch": 1.8124036523182734, + "grad_norm": 0.6327343413469493, + "learning_rate": 2.997844846655227e-05, + "loss": 0.1633, + "step": 15284 + }, + { + "epoch": 1.8125222340803986, + "grad_norm": 0.8320824111930306, + "learning_rate": 2.997609615817899e-05, + "loss": 0.1744, + "step": 15285 + }, + { + "epoch": 1.8126408158425233, + "grad_norm": 0.8340511143449201, + "learning_rate": 2.9973743803932203e-05, + "loss": 0.1963, + "step": 15286 + }, + { + "epoch": 1.8127593976046485, + "grad_norm": 0.5920607831492545, + "learning_rate": 2.9971391403833594e-05, + "loss": 0.1255, + "step": 15287 + }, + { + "epoch": 1.8128779793667733, + "grad_norm": 1.221464799820948, + "learning_rate": 2.9969038957904862e-05, + "loss": 0.2349, + "step": 15288 + }, + { + "epoch": 1.8129965611288985, + "grad_norm": 0.9977423505166422, + "learning_rate": 2.996668646616767e-05, + "loss": 0.2046, + "step": 15289 + }, + { + "epoch": 1.8131151428910233, + "grad_norm": 0.8799603782398817, + "learning_rate": 2.996433392864373e-05, + "loss": 0.2242, + "step": 15290 + }, + { + "epoch": 1.8132337246531485, + "grad_norm": 0.7008405922026111, + "learning_rate": 2.9961981345354713e-05, + "loss": 0.1273, + "step": 15291 + }, + { + "epoch": 1.8133523064152732, + "grad_norm": 0.7205679662758091, + "learning_rate": 2.9959628716322313e-05, + "loss": 0.1388, + "step": 15292 + }, + { + "epoch": 1.8134708881773984, + "grad_norm": 0.847835832058915, + "learning_rate": 2.9957276041568216e-05, + "loss": 0.2185, + "step": 15293 + }, + { + "epoch": 1.8135894699395232, + "grad_norm": 0.7688026160026602, + "learning_rate": 2.995492332111412e-05, + "loss": 0.1399, + "step": 15294 + }, + { + "epoch": 1.8137080517016484, + "grad_norm": 0.6920954988987542, + "learning_rate": 2.9952570554981703e-05, + "loss": 0.127, + "step": 15295 + }, + { + "epoch": 1.8138266334637732, + "grad_norm": 1.4581016470038146, + "learning_rate": 2.9950217743192664e-05, + "loss": 0.3385, + "step": 15296 + }, + { + "epoch": 1.8139452152258984, + "grad_norm": 1.2058478634898417, + "learning_rate": 2.9947864885768685e-05, + "loss": 0.2031, + "step": 15297 + }, + { + "epoch": 1.8140637969880231, + "grad_norm": 0.9085337558727927, + "learning_rate": 2.9945511982731457e-05, + "loss": 0.2007, + "step": 15298 + }, + { + "epoch": 1.8141823787501483, + "grad_norm": 1.0978107649282072, + "learning_rate": 2.9943159034102674e-05, + "loss": 0.2131, + "step": 15299 + }, + { + "epoch": 1.814300960512273, + "grad_norm": 1.0042024865730301, + "learning_rate": 2.9940806039904023e-05, + "loss": 0.1805, + "step": 15300 + }, + { + "epoch": 1.8144195422743983, + "grad_norm": 0.9854329195411209, + "learning_rate": 2.993845300015721e-05, + "loss": 0.1599, + "step": 15301 + }, + { + "epoch": 1.814538124036523, + "grad_norm": 0.8826217392959612, + "learning_rate": 2.99360999148839e-05, + "loss": 0.1916, + "step": 15302 + }, + { + "epoch": 1.8146567057986482, + "grad_norm": 1.0175564574012719, + "learning_rate": 2.993374678410581e-05, + "loss": 0.1997, + "step": 15303 + }, + { + "epoch": 1.814775287560773, + "grad_norm": 0.7939593342986472, + "learning_rate": 2.9931393607844623e-05, + "loss": 0.1635, + "step": 15304 + }, + { + "epoch": 1.8148938693228982, + "grad_norm": 0.9637371710870302, + "learning_rate": 2.9929040386122036e-05, + "loss": 0.2258, + "step": 15305 + }, + { + "epoch": 1.815012451085023, + "grad_norm": 1.2994510930677314, + "learning_rate": 2.992668711895974e-05, + "loss": 0.2524, + "step": 15306 + }, + { + "epoch": 1.8151310328471482, + "grad_norm": 0.873924200708601, + "learning_rate": 2.9924333806379433e-05, + "loss": 0.1638, + "step": 15307 + }, + { + "epoch": 1.8152496146092731, + "grad_norm": 0.8610701663439282, + "learning_rate": 2.9921980448402808e-05, + "loss": 0.174, + "step": 15308 + }, + { + "epoch": 1.8153681963713981, + "grad_norm": 0.8927793464123126, + "learning_rate": 2.9919627045051556e-05, + "loss": 0.2224, + "step": 15309 + }, + { + "epoch": 1.815486778133523, + "grad_norm": 0.7261982214172353, + "learning_rate": 2.991727359634738e-05, + "loss": 0.1432, + "step": 15310 + }, + { + "epoch": 1.815605359895648, + "grad_norm": 1.0124862913008617, + "learning_rate": 2.9914920102311966e-05, + "loss": 0.2074, + "step": 15311 + }, + { + "epoch": 1.815723941657773, + "grad_norm": 0.7830927192172253, + "learning_rate": 2.9912566562967014e-05, + "loss": 0.1335, + "step": 15312 + }, + { + "epoch": 1.815842523419898, + "grad_norm": 0.790795564121555, + "learning_rate": 2.9910212978334228e-05, + "loss": 0.196, + "step": 15313 + }, + { + "epoch": 1.815961105182023, + "grad_norm": 0.8555548766887383, + "learning_rate": 2.9907859348435297e-05, + "loss": 0.142, + "step": 15314 + }, + { + "epoch": 1.816079686944148, + "grad_norm": 0.8732728532257412, + "learning_rate": 2.9905505673291918e-05, + "loss": 0.2006, + "step": 15315 + }, + { + "epoch": 1.816198268706273, + "grad_norm": 0.8584631181511472, + "learning_rate": 2.9903151952925802e-05, + "loss": 0.143, + "step": 15316 + }, + { + "epoch": 1.816316850468398, + "grad_norm": 0.6221715423461472, + "learning_rate": 2.9900798187358624e-05, + "loss": 0.1491, + "step": 15317 + }, + { + "epoch": 1.816435432230523, + "grad_norm": 0.5700332378664487, + "learning_rate": 2.989844437661211e-05, + "loss": 0.1004, + "step": 15318 + }, + { + "epoch": 1.816554013992648, + "grad_norm": 0.8404444240931636, + "learning_rate": 2.9896090520707937e-05, + "loss": 0.1246, + "step": 15319 + }, + { + "epoch": 1.816672595754773, + "grad_norm": 0.7925946751066655, + "learning_rate": 2.9893736619667817e-05, + "loss": 0.1502, + "step": 15320 + }, + { + "epoch": 1.816791177516898, + "grad_norm": 0.7908484804618506, + "learning_rate": 2.9891382673513448e-05, + "loss": 0.1416, + "step": 15321 + }, + { + "epoch": 1.8169097592790229, + "grad_norm": 1.175105213501486, + "learning_rate": 2.9889028682266524e-05, + "loss": 0.2769, + "step": 15322 + }, + { + "epoch": 1.8170283410411479, + "grad_norm": 0.7574366576969697, + "learning_rate": 2.9886674645948747e-05, + "loss": 0.1647, + "step": 15323 + }, + { + "epoch": 1.8171469228032728, + "grad_norm": 0.9539844189983758, + "learning_rate": 2.9884320564581835e-05, + "loss": 0.1971, + "step": 15324 + }, + { + "epoch": 1.8172655045653978, + "grad_norm": 0.6507432565438186, + "learning_rate": 2.9881966438187464e-05, + "loss": 0.1636, + "step": 15325 + }, + { + "epoch": 1.8173840863275228, + "grad_norm": 0.8978130799492937, + "learning_rate": 2.987961226678736e-05, + "loss": 0.1719, + "step": 15326 + }, + { + "epoch": 1.8175026680896478, + "grad_norm": 0.7478027979128965, + "learning_rate": 2.9877258050403212e-05, + "loss": 0.1524, + "step": 15327 + }, + { + "epoch": 1.8176212498517728, + "grad_norm": 0.6838669586010478, + "learning_rate": 2.987490378905672e-05, + "loss": 0.1771, + "step": 15328 + }, + { + "epoch": 1.8177398316138977, + "grad_norm": 0.9781849500248863, + "learning_rate": 2.9872549482769603e-05, + "loss": 0.228, + "step": 15329 + }, + { + "epoch": 1.8178584133760227, + "grad_norm": 1.0809597274270244, + "learning_rate": 2.9870195131563543e-05, + "loss": 0.1681, + "step": 15330 + }, + { + "epoch": 1.8179769951381477, + "grad_norm": 0.922626577611113, + "learning_rate": 2.9867840735460268e-05, + "loss": 0.1583, + "step": 15331 + }, + { + "epoch": 1.818095576900273, + "grad_norm": 1.2450145250132303, + "learning_rate": 2.986548629448146e-05, + "loss": 0.2921, + "step": 15332 + }, + { + "epoch": 1.8182141586623977, + "grad_norm": 0.792291964885216, + "learning_rate": 2.9863131808648836e-05, + "loss": 0.1943, + "step": 15333 + }, + { + "epoch": 1.8183327404245229, + "grad_norm": 1.3286886472692427, + "learning_rate": 2.98607772779841e-05, + "loss": 0.2372, + "step": 15334 + }, + { + "epoch": 1.8184513221866476, + "grad_norm": 0.7995073577085069, + "learning_rate": 2.985842270250897e-05, + "loss": 0.1448, + "step": 15335 + }, + { + "epoch": 1.8185699039487728, + "grad_norm": 0.9507616649199002, + "learning_rate": 2.985606808224513e-05, + "loss": 0.2083, + "step": 15336 + }, + { + "epoch": 1.8186884857108976, + "grad_norm": 1.0793503203095618, + "learning_rate": 2.9853713417214302e-05, + "loss": 0.1791, + "step": 15337 + }, + { + "epoch": 1.8188070674730228, + "grad_norm": 1.1863959696293285, + "learning_rate": 2.9851358707438182e-05, + "loss": 0.2076, + "step": 15338 + }, + { + "epoch": 1.8189256492351475, + "grad_norm": 1.2078118999498182, + "learning_rate": 2.9849003952938493e-05, + "loss": 0.2511, + "step": 15339 + }, + { + "epoch": 1.8190442309972727, + "grad_norm": 1.099621275013443, + "learning_rate": 2.9846649153736928e-05, + "loss": 0.2736, + "step": 15340 + }, + { + "epoch": 1.8191628127593975, + "grad_norm": 0.9767867432903887, + "learning_rate": 2.9844294309855198e-05, + "loss": 0.1962, + "step": 15341 + }, + { + "epoch": 1.8192813945215227, + "grad_norm": 0.9469528405618699, + "learning_rate": 2.9841939421315018e-05, + "loss": 0.2216, + "step": 15342 + }, + { + "epoch": 1.8193999762836475, + "grad_norm": 0.996312693064748, + "learning_rate": 2.983958448813809e-05, + "loss": 0.1728, + "step": 15343 + }, + { + "epoch": 1.8195185580457727, + "grad_norm": 1.0476534758728546, + "learning_rate": 2.983722951034614e-05, + "loss": 0.2146, + "step": 15344 + }, + { + "epoch": 1.8196371398078974, + "grad_norm": 1.152401842740534, + "learning_rate": 2.983487448796085e-05, + "loss": 0.2646, + "step": 15345 + }, + { + "epoch": 1.8197557215700226, + "grad_norm": 1.1457099663554928, + "learning_rate": 2.9832519421003962e-05, + "loss": 0.2418, + "step": 15346 + }, + { + "epoch": 1.8198743033321474, + "grad_norm": 0.9267420333995193, + "learning_rate": 2.983016430949716e-05, + "loss": 0.2161, + "step": 15347 + }, + { + "epoch": 1.8199928850942726, + "grad_norm": 0.9771873361154638, + "learning_rate": 2.982780915346217e-05, + "loss": 0.2135, + "step": 15348 + }, + { + "epoch": 1.8201114668563974, + "grad_norm": 1.0812969769250833, + "learning_rate": 2.98254539529207e-05, + "loss": 0.2441, + "step": 15349 + }, + { + "epoch": 1.8202300486185226, + "grad_norm": 0.9291653234283628, + "learning_rate": 2.9823098707894464e-05, + "loss": 0.1879, + "step": 15350 + }, + { + "epoch": 1.8203486303806473, + "grad_norm": 0.859331319835636, + "learning_rate": 2.9820743418405167e-05, + "loss": 0.1765, + "step": 15351 + }, + { + "epoch": 1.8204672121427725, + "grad_norm": 1.0588709013525603, + "learning_rate": 2.981838808447453e-05, + "loss": 0.2166, + "step": 15352 + }, + { + "epoch": 1.8205857939048973, + "grad_norm": 1.3089852941335953, + "learning_rate": 2.9816032706124264e-05, + "loss": 0.3057, + "step": 15353 + }, + { + "epoch": 1.8207043756670225, + "grad_norm": 1.0230032065837968, + "learning_rate": 2.9813677283376085e-05, + "loss": 0.2446, + "step": 15354 + }, + { + "epoch": 1.8208229574291472, + "grad_norm": 1.0036418454899414, + "learning_rate": 2.9811321816251697e-05, + "loss": 0.2351, + "step": 15355 + }, + { + "epoch": 1.8209415391912724, + "grad_norm": 0.7038436788421636, + "learning_rate": 2.9808966304772825e-05, + "loss": 0.1338, + "step": 15356 + }, + { + "epoch": 1.8210601209533974, + "grad_norm": 1.0770720631206359, + "learning_rate": 2.9806610748961183e-05, + "loss": 0.2132, + "step": 15357 + }, + { + "epoch": 1.8211787027155224, + "grad_norm": 0.7061355869679413, + "learning_rate": 2.9804255148838477e-05, + "loss": 0.1634, + "step": 15358 + }, + { + "epoch": 1.8212972844776474, + "grad_norm": 0.9026271738817263, + "learning_rate": 2.9801899504426444e-05, + "loss": 0.1593, + "step": 15359 + }, + { + "epoch": 1.8214158662397724, + "grad_norm": 0.9925600749233368, + "learning_rate": 2.9799543815746778e-05, + "loss": 0.1838, + "step": 15360 + }, + { + "epoch": 1.8215344480018973, + "grad_norm": 0.9236421000218497, + "learning_rate": 2.9797188082821205e-05, + "loss": 0.1995, + "step": 15361 + }, + { + "epoch": 1.8216530297640223, + "grad_norm": 0.8052098522377797, + "learning_rate": 2.9794832305671438e-05, + "loss": 0.1996, + "step": 15362 + }, + { + "epoch": 1.8217716115261473, + "grad_norm": 0.921372956876502, + "learning_rate": 2.9792476484319193e-05, + "loss": 0.2015, + "step": 15363 + }, + { + "epoch": 1.8218901932882723, + "grad_norm": 0.8483875686515787, + "learning_rate": 2.9790120618786194e-05, + "loss": 0.1435, + "step": 15364 + }, + { + "epoch": 1.8220087750503973, + "grad_norm": 0.6689870042009372, + "learning_rate": 2.978776470909416e-05, + "loss": 0.1421, + "step": 15365 + }, + { + "epoch": 1.8221273568125222, + "grad_norm": 0.7620896863115509, + "learning_rate": 2.978540875526481e-05, + "loss": 0.2035, + "step": 15366 + }, + { + "epoch": 1.8222459385746472, + "grad_norm": 0.7871718360389401, + "learning_rate": 2.9783052757319857e-05, + "loss": 0.1729, + "step": 15367 + }, + { + "epoch": 1.8223645203367722, + "grad_norm": 0.9348674533291015, + "learning_rate": 2.978069671528102e-05, + "loss": 0.1845, + "step": 15368 + }, + { + "epoch": 1.8224831020988972, + "grad_norm": 1.1764721518207681, + "learning_rate": 2.9778340629170027e-05, + "loss": 0.2088, + "step": 15369 + }, + { + "epoch": 1.8226016838610222, + "grad_norm": 0.7500456399926081, + "learning_rate": 2.9775984499008587e-05, + "loss": 0.1525, + "step": 15370 + }, + { + "epoch": 1.8227202656231472, + "grad_norm": 1.0043056354417137, + "learning_rate": 2.9773628324818425e-05, + "loss": 0.2123, + "step": 15371 + }, + { + "epoch": 1.8228388473852721, + "grad_norm": 0.9965126162440716, + "learning_rate": 2.9771272106621266e-05, + "loss": 0.2186, + "step": 15372 + }, + { + "epoch": 1.8229574291473971, + "grad_norm": 0.9466537864894152, + "learning_rate": 2.9768915844438834e-05, + "loss": 0.2412, + "step": 15373 + }, + { + "epoch": 1.823076010909522, + "grad_norm": 0.9685451430255317, + "learning_rate": 2.976655953829284e-05, + "loss": 0.1883, + "step": 15374 + }, + { + "epoch": 1.823194592671647, + "grad_norm": 0.976212434472658, + "learning_rate": 2.9764203188205016e-05, + "loss": 0.1611, + "step": 15375 + }, + { + "epoch": 1.823313174433772, + "grad_norm": 0.7993033054934024, + "learning_rate": 2.9761846794197085e-05, + "loss": 0.1639, + "step": 15376 + }, + { + "epoch": 1.823431756195897, + "grad_norm": 0.644423317884611, + "learning_rate": 2.975949035629076e-05, + "loss": 0.1584, + "step": 15377 + }, + { + "epoch": 1.823550337958022, + "grad_norm": 0.6361957467653376, + "learning_rate": 2.975713387450777e-05, + "loss": 0.1663, + "step": 15378 + }, + { + "epoch": 1.823668919720147, + "grad_norm": 0.8083390700507445, + "learning_rate": 2.9754777348869837e-05, + "loss": 0.1758, + "step": 15379 + }, + { + "epoch": 1.823787501482272, + "grad_norm": 1.0009482580330868, + "learning_rate": 2.97524207793987e-05, + "loss": 0.1971, + "step": 15380 + }, + { + "epoch": 1.823906083244397, + "grad_norm": 0.7749559525252164, + "learning_rate": 2.975006416611606e-05, + "loss": 0.1654, + "step": 15381 + }, + { + "epoch": 1.824024665006522, + "grad_norm": 1.1788922314814314, + "learning_rate": 2.9747707509043653e-05, + "loss": 0.2284, + "step": 15382 + }, + { + "epoch": 1.8241432467686471, + "grad_norm": 0.6472482042543826, + "learning_rate": 2.9745350808203203e-05, + "loss": 0.1458, + "step": 15383 + }, + { + "epoch": 1.824261828530772, + "grad_norm": 0.7378467345717279, + "learning_rate": 2.9742994063616447e-05, + "loss": 0.1184, + "step": 15384 + }, + { + "epoch": 1.824380410292897, + "grad_norm": 0.8306176972984514, + "learning_rate": 2.9740637275305094e-05, + "loss": 0.1609, + "step": 15385 + }, + { + "epoch": 1.8244989920550219, + "grad_norm": 0.9863432290638695, + "learning_rate": 2.973828044329088e-05, + "loss": 0.1549, + "step": 15386 + }, + { + "epoch": 1.824617573817147, + "grad_norm": 0.676357064815752, + "learning_rate": 2.9735923567595535e-05, + "loss": 0.1681, + "step": 15387 + }, + { + "epoch": 1.8247361555792718, + "grad_norm": 1.2228425595127967, + "learning_rate": 2.973356664824078e-05, + "loss": 0.3159, + "step": 15388 + }, + { + "epoch": 1.824854737341397, + "grad_norm": 1.103012663765399, + "learning_rate": 2.9731209685248345e-05, + "loss": 0.2055, + "step": 15389 + }, + { + "epoch": 1.8249733191035218, + "grad_norm": 0.8298451632690624, + "learning_rate": 2.9728852678639956e-05, + "loss": 0.1671, + "step": 15390 + }, + { + "epoch": 1.825091900865647, + "grad_norm": 0.49605046826054927, + "learning_rate": 2.9726495628437352e-05, + "loss": 0.0993, + "step": 15391 + }, + { + "epoch": 1.8252104826277717, + "grad_norm": 0.7322244791007606, + "learning_rate": 2.9724138534662243e-05, + "loss": 0.1571, + "step": 15392 + }, + { + "epoch": 1.825329064389897, + "grad_norm": 0.8681517052027562, + "learning_rate": 2.9721781397336378e-05, + "loss": 0.2187, + "step": 15393 + }, + { + "epoch": 1.8254476461520217, + "grad_norm": 0.6724897213208547, + "learning_rate": 2.9719424216481473e-05, + "loss": 0.1381, + "step": 15394 + }, + { + "epoch": 1.825566227914147, + "grad_norm": 0.8831776764121378, + "learning_rate": 2.971706699211927e-05, + "loss": 0.1749, + "step": 15395 + }, + { + "epoch": 1.8256848096762717, + "grad_norm": 0.9895974137765967, + "learning_rate": 2.971470972427149e-05, + "loss": 0.1499, + "step": 15396 + }, + { + "epoch": 1.8258033914383969, + "grad_norm": 0.869542004534008, + "learning_rate": 2.971235241295987e-05, + "loss": 0.1793, + "step": 15397 + }, + { + "epoch": 1.8259219732005216, + "grad_norm": 0.5760851819033119, + "learning_rate": 2.970999505820614e-05, + "loss": 0.141, + "step": 15398 + }, + { + "epoch": 1.8260405549626468, + "grad_norm": 0.8605444798430084, + "learning_rate": 2.9707637660032028e-05, + "loss": 0.1847, + "step": 15399 + }, + { + "epoch": 1.8261591367247716, + "grad_norm": 0.9596186165912983, + "learning_rate": 2.9705280218459274e-05, + "loss": 0.192, + "step": 15400 + }, + { + "epoch": 1.8262777184868968, + "grad_norm": 0.7495967277511423, + "learning_rate": 2.9702922733509604e-05, + "loss": 0.1691, + "step": 15401 + }, + { + "epoch": 1.8263963002490216, + "grad_norm": 1.1038455169199468, + "learning_rate": 2.970056520520475e-05, + "loss": 0.1809, + "step": 15402 + }, + { + "epoch": 1.8265148820111468, + "grad_norm": 0.9378227198402072, + "learning_rate": 2.969820763356645e-05, + "loss": 0.2196, + "step": 15403 + }, + { + "epoch": 1.8266334637732715, + "grad_norm": 1.2861399332835486, + "learning_rate": 2.9695850018616444e-05, + "loss": 0.2389, + "step": 15404 + }, + { + "epoch": 1.8267520455353967, + "grad_norm": 1.3444652415728569, + "learning_rate": 2.969349236037645e-05, + "loss": 0.2065, + "step": 15405 + }, + { + "epoch": 1.8268706272975215, + "grad_norm": 1.0241699260694561, + "learning_rate": 2.969113465886822e-05, + "loss": 0.194, + "step": 15406 + }, + { + "epoch": 1.8269892090596467, + "grad_norm": 0.8412949540116269, + "learning_rate": 2.9688776914113475e-05, + "loss": 0.2137, + "step": 15407 + }, + { + "epoch": 1.8271077908217717, + "grad_norm": 1.0514082042408757, + "learning_rate": 2.9686419126133963e-05, + "loss": 0.2093, + "step": 15408 + }, + { + "epoch": 1.8272263725838966, + "grad_norm": 0.7454526604520645, + "learning_rate": 2.968406129495141e-05, + "loss": 0.1343, + "step": 15409 + }, + { + "epoch": 1.8273449543460216, + "grad_norm": 0.8808717866449423, + "learning_rate": 2.9681703420587558e-05, + "loss": 0.1497, + "step": 15410 + }, + { + "epoch": 1.8274635361081466, + "grad_norm": 1.0360812975152616, + "learning_rate": 2.967934550306414e-05, + "loss": 0.2041, + "step": 15411 + }, + { + "epoch": 1.8275821178702716, + "grad_norm": 1.009231966050227, + "learning_rate": 2.967698754240289e-05, + "loss": 0.2066, + "step": 15412 + }, + { + "epoch": 1.8277006996323966, + "grad_norm": 0.7924510974596026, + "learning_rate": 2.9674629538625552e-05, + "loss": 0.1625, + "step": 15413 + }, + { + "epoch": 1.8278192813945215, + "grad_norm": 1.1483330575940787, + "learning_rate": 2.9672271491753862e-05, + "loss": 0.2367, + "step": 15414 + }, + { + "epoch": 1.8279378631566465, + "grad_norm": 0.9054021801137345, + "learning_rate": 2.966991340180956e-05, + "loss": 0.1423, + "step": 15415 + }, + { + "epoch": 1.8280564449187715, + "grad_norm": 1.0487710652400293, + "learning_rate": 2.9667555268814384e-05, + "loss": 0.2099, + "step": 15416 + }, + { + "epoch": 1.8281750266808965, + "grad_norm": 0.7264887400782071, + "learning_rate": 2.9665197092790074e-05, + "loss": 0.1622, + "step": 15417 + }, + { + "epoch": 1.8282936084430215, + "grad_norm": 0.9587722773048629, + "learning_rate": 2.9662838873758365e-05, + "loss": 0.237, + "step": 15418 + }, + { + "epoch": 1.8284121902051464, + "grad_norm": 0.8290794377845487, + "learning_rate": 2.9660480611741005e-05, + "loss": 0.1273, + "step": 15419 + }, + { + "epoch": 1.8285307719672714, + "grad_norm": 0.9029289325573021, + "learning_rate": 2.965812230675972e-05, + "loss": 0.1825, + "step": 15420 + }, + { + "epoch": 1.8286493537293964, + "grad_norm": 0.9126311854963453, + "learning_rate": 2.9655763958836262e-05, + "loss": 0.166, + "step": 15421 + }, + { + "epoch": 1.8287679354915214, + "grad_norm": 0.7776262613214269, + "learning_rate": 2.9653405567992366e-05, + "loss": 0.1064, + "step": 15422 + }, + { + "epoch": 1.8288865172536464, + "grad_norm": 0.8288257053584829, + "learning_rate": 2.965104713424979e-05, + "loss": 0.1915, + "step": 15423 + }, + { + "epoch": 1.8290050990157714, + "grad_norm": 0.871096126916273, + "learning_rate": 2.964868865763025e-05, + "loss": 0.1743, + "step": 15424 + }, + { + "epoch": 1.8291236807778963, + "grad_norm": 0.7337421050263578, + "learning_rate": 2.964633013815551e-05, + "loss": 0.1283, + "step": 15425 + }, + { + "epoch": 1.8292422625400213, + "grad_norm": 0.729404296390423, + "learning_rate": 2.9643971575847296e-05, + "loss": 0.1614, + "step": 15426 + }, + { + "epoch": 1.8293608443021463, + "grad_norm": 0.7908881932901992, + "learning_rate": 2.9641612970727368e-05, + "loss": 0.1573, + "step": 15427 + }, + { + "epoch": 1.8294794260642713, + "grad_norm": 0.80973418750176, + "learning_rate": 2.963925432281745e-05, + "loss": 0.1555, + "step": 15428 + }, + { + "epoch": 1.8295980078263963, + "grad_norm": 0.8351829055069806, + "learning_rate": 2.96368956321393e-05, + "loss": 0.1708, + "step": 15429 + }, + { + "epoch": 1.8297165895885212, + "grad_norm": 0.9051833998081217, + "learning_rate": 2.9634536898714667e-05, + "loss": 0.1881, + "step": 15430 + }, + { + "epoch": 1.8298351713506462, + "grad_norm": 1.3100036022578894, + "learning_rate": 2.9632178122565275e-05, + "loss": 0.2785, + "step": 15431 + }, + { + "epoch": 1.8299537531127714, + "grad_norm": 0.6186873760369141, + "learning_rate": 2.962981930371288e-05, + "loss": 0.1428, + "step": 15432 + }, + { + "epoch": 1.8300723348748962, + "grad_norm": 0.7272614353718987, + "learning_rate": 2.9627460442179233e-05, + "loss": 0.1452, + "step": 15433 + }, + { + "epoch": 1.8301909166370214, + "grad_norm": 1.0258556749257077, + "learning_rate": 2.9625101537986083e-05, + "loss": 0.1805, + "step": 15434 + }, + { + "epoch": 1.8303094983991461, + "grad_norm": 0.9667898621556686, + "learning_rate": 2.962274259115516e-05, + "loss": 0.1819, + "step": 15435 + }, + { + "epoch": 1.8304280801612713, + "grad_norm": 0.7271642203731177, + "learning_rate": 2.9620383601708223e-05, + "loss": 0.1364, + "step": 15436 + }, + { + "epoch": 1.830546661923396, + "grad_norm": 0.9296663800919625, + "learning_rate": 2.9618024569667014e-05, + "loss": 0.1839, + "step": 15437 + }, + { + "epoch": 1.8306652436855213, + "grad_norm": 0.8926007251429378, + "learning_rate": 2.961566549505328e-05, + "loss": 0.186, + "step": 15438 + }, + { + "epoch": 1.830783825447646, + "grad_norm": 0.6924331508317774, + "learning_rate": 2.9613306377888773e-05, + "loss": 0.1574, + "step": 15439 + }, + { + "epoch": 1.8309024072097713, + "grad_norm": 0.9997789716572151, + "learning_rate": 2.961094721819524e-05, + "loss": 0.1878, + "step": 15440 + }, + { + "epoch": 1.831020988971896, + "grad_norm": 1.0130366320506008, + "learning_rate": 2.9608588015994422e-05, + "loss": 0.2387, + "step": 15441 + }, + { + "epoch": 1.8311395707340212, + "grad_norm": 0.832014787651683, + "learning_rate": 2.960622877130807e-05, + "loss": 0.1572, + "step": 15442 + }, + { + "epoch": 1.831258152496146, + "grad_norm": 0.687106216687091, + "learning_rate": 2.9603869484157942e-05, + "loss": 0.1374, + "step": 15443 + }, + { + "epoch": 1.8313767342582712, + "grad_norm": 0.5982241800490347, + "learning_rate": 2.960151015456578e-05, + "loss": 0.1268, + "step": 15444 + }, + { + "epoch": 1.831495316020396, + "grad_norm": 0.8131698123129604, + "learning_rate": 2.9599150782553346e-05, + "loss": 0.1361, + "step": 15445 + }, + { + "epoch": 1.8316138977825211, + "grad_norm": 0.9452819807511228, + "learning_rate": 2.9596791368142372e-05, + "loss": 0.2196, + "step": 15446 + }, + { + "epoch": 1.831732479544646, + "grad_norm": 0.6721914521285541, + "learning_rate": 2.959443191135463e-05, + "loss": 0.1234, + "step": 15447 + }, + { + "epoch": 1.8318510613067711, + "grad_norm": 0.8309236120153571, + "learning_rate": 2.959207241221185e-05, + "loss": 0.1637, + "step": 15448 + }, + { + "epoch": 1.8319696430688959, + "grad_norm": 0.9556203360061042, + "learning_rate": 2.9589712870735804e-05, + "loss": 0.1369, + "step": 15449 + }, + { + "epoch": 1.832088224831021, + "grad_norm": 0.6941384659877499, + "learning_rate": 2.9587353286948222e-05, + "loss": 0.1231, + "step": 15450 + }, + { + "epoch": 1.8322068065931458, + "grad_norm": 0.6300492024424782, + "learning_rate": 2.9584993660870873e-05, + "loss": 0.1032, + "step": 15451 + }, + { + "epoch": 1.832325388355271, + "grad_norm": 0.8123198997757365, + "learning_rate": 2.9582633992525503e-05, + "loss": 0.1382, + "step": 15452 + }, + { + "epoch": 1.8324439701173958, + "grad_norm": 1.1008156604058594, + "learning_rate": 2.9580274281933874e-05, + "loss": 0.1857, + "step": 15453 + }, + { + "epoch": 1.832562551879521, + "grad_norm": 0.706450983837067, + "learning_rate": 2.9577914529117723e-05, + "loss": 0.1143, + "step": 15454 + }, + { + "epoch": 1.8326811336416458, + "grad_norm": 0.787339343938937, + "learning_rate": 2.957555473409882e-05, + "loss": 0.1338, + "step": 15455 + }, + { + "epoch": 1.832799715403771, + "grad_norm": 0.689838339152342, + "learning_rate": 2.9573194896898916e-05, + "loss": 0.1503, + "step": 15456 + }, + { + "epoch": 1.832918297165896, + "grad_norm": 0.8316656353624343, + "learning_rate": 2.9570835017539756e-05, + "loss": 0.1697, + "step": 15457 + }, + { + "epoch": 1.833036878928021, + "grad_norm": 0.8896455023816116, + "learning_rate": 2.9568475096043102e-05, + "loss": 0.1559, + "step": 15458 + }, + { + "epoch": 1.833155460690146, + "grad_norm": 0.6900176125576237, + "learning_rate": 2.9566115132430717e-05, + "loss": 0.1584, + "step": 15459 + }, + { + "epoch": 1.8332740424522709, + "grad_norm": 0.9899384929721701, + "learning_rate": 2.9563755126724345e-05, + "loss": 0.2518, + "step": 15460 + }, + { + "epoch": 1.8333926242143959, + "grad_norm": 1.000555084827324, + "learning_rate": 2.956139507894575e-05, + "loss": 0.2252, + "step": 15461 + }, + { + "epoch": 1.8335112059765208, + "grad_norm": 0.6994404393411617, + "learning_rate": 2.955903498911668e-05, + "loss": 0.16, + "step": 15462 + }, + { + "epoch": 1.8336297877386458, + "grad_norm": 1.0091171766989921, + "learning_rate": 2.9556674857258898e-05, + "loss": 0.2171, + "step": 15463 + }, + { + "epoch": 1.8337483695007708, + "grad_norm": 0.8129425636466915, + "learning_rate": 2.955431468339417e-05, + "loss": 0.1497, + "step": 15464 + }, + { + "epoch": 1.8338669512628958, + "grad_norm": 0.8011607802714994, + "learning_rate": 2.9551954467544242e-05, + "loss": 0.1302, + "step": 15465 + }, + { + "epoch": 1.8339855330250208, + "grad_norm": 1.4572126385665942, + "learning_rate": 2.9549594209730873e-05, + "loss": 0.3423, + "step": 15466 + }, + { + "epoch": 1.8341041147871457, + "grad_norm": 0.7780451977717283, + "learning_rate": 2.9547233909975824e-05, + "loss": 0.1729, + "step": 15467 + }, + { + "epoch": 1.8342226965492707, + "grad_norm": 0.8336083946032258, + "learning_rate": 2.9544873568300864e-05, + "loss": 0.1739, + "step": 15468 + }, + { + "epoch": 1.8343412783113957, + "grad_norm": 0.7265278259147545, + "learning_rate": 2.954251318472773e-05, + "loss": 0.1716, + "step": 15469 + }, + { + "epoch": 1.8344598600735207, + "grad_norm": 0.8752985539861614, + "learning_rate": 2.95401527592782e-05, + "loss": 0.174, + "step": 15470 + }, + { + "epoch": 1.8345784418356457, + "grad_norm": 0.878311609204327, + "learning_rate": 2.953779229197403e-05, + "loss": 0.151, + "step": 15471 + }, + { + "epoch": 1.8346970235977706, + "grad_norm": 0.785063578793951, + "learning_rate": 2.9535431782836974e-05, + "loss": 0.1409, + "step": 15472 + }, + { + "epoch": 1.8348156053598956, + "grad_norm": 0.6331385197087557, + "learning_rate": 2.953307123188881e-05, + "loss": 0.12, + "step": 15473 + }, + { + "epoch": 1.8349341871220206, + "grad_norm": 0.8597036336228239, + "learning_rate": 2.9530710639151278e-05, + "loss": 0.1727, + "step": 15474 + }, + { + "epoch": 1.8350527688841456, + "grad_norm": 1.073054088069077, + "learning_rate": 2.952835000464616e-05, + "loss": 0.1857, + "step": 15475 + }, + { + "epoch": 1.8351713506462706, + "grad_norm": 0.7755037909941674, + "learning_rate": 2.9525989328395197e-05, + "loss": 0.1701, + "step": 15476 + }, + { + "epoch": 1.8352899324083956, + "grad_norm": 0.6702205079578166, + "learning_rate": 2.952362861042017e-05, + "loss": 0.186, + "step": 15477 + }, + { + "epoch": 1.8354085141705205, + "grad_norm": 0.9094125965903891, + "learning_rate": 2.9521267850742834e-05, + "loss": 0.1797, + "step": 15478 + }, + { + "epoch": 1.8355270959326455, + "grad_norm": 0.7106060224626918, + "learning_rate": 2.9518907049384954e-05, + "loss": 0.1545, + "step": 15479 + }, + { + "epoch": 1.8356456776947705, + "grad_norm": 0.8622245866271262, + "learning_rate": 2.9516546206368285e-05, + "loss": 0.1572, + "step": 15480 + }, + { + "epoch": 1.8357642594568955, + "grad_norm": 0.6596835544119195, + "learning_rate": 2.9514185321714597e-05, + "loss": 0.1706, + "step": 15481 + }, + { + "epoch": 1.8358828412190205, + "grad_norm": 0.7560687931572966, + "learning_rate": 2.9511824395445663e-05, + "loss": 0.141, + "step": 15482 + }, + { + "epoch": 1.8360014229811457, + "grad_norm": 0.8890556943735853, + "learning_rate": 2.9509463427583246e-05, + "loss": 0.1313, + "step": 15483 + }, + { + "epoch": 1.8361200047432704, + "grad_norm": 1.119155717449911, + "learning_rate": 2.9507102418149096e-05, + "loss": 0.2614, + "step": 15484 + }, + { + "epoch": 1.8362385865053956, + "grad_norm": 0.6456642786562725, + "learning_rate": 2.950474136716499e-05, + "loss": 0.16, + "step": 15485 + }, + { + "epoch": 1.8363571682675204, + "grad_norm": 0.8557236347113389, + "learning_rate": 2.9502380274652696e-05, + "loss": 0.1846, + "step": 15486 + }, + { + "epoch": 1.8364757500296456, + "grad_norm": 0.8007346154416894, + "learning_rate": 2.9500019140633972e-05, + "loss": 0.1382, + "step": 15487 + }, + { + "epoch": 1.8365943317917703, + "grad_norm": 0.7492700198155701, + "learning_rate": 2.9497657965130597e-05, + "loss": 0.124, + "step": 15488 + }, + { + "epoch": 1.8367129135538955, + "grad_norm": 0.8095612631701082, + "learning_rate": 2.9495296748164332e-05, + "loss": 0.1623, + "step": 15489 + }, + { + "epoch": 1.8368314953160203, + "grad_norm": 0.9093624521336063, + "learning_rate": 2.949293548975694e-05, + "loss": 0.1888, + "step": 15490 + }, + { + "epoch": 1.8369500770781455, + "grad_norm": 1.1065149132694878, + "learning_rate": 2.949057418993019e-05, + "loss": 0.2164, + "step": 15491 + }, + { + "epoch": 1.8370686588402703, + "grad_norm": 0.8759685985459098, + "learning_rate": 2.9488212848705854e-05, + "loss": 0.18, + "step": 15492 + }, + { + "epoch": 1.8371872406023955, + "grad_norm": 1.1971151638825674, + "learning_rate": 2.9485851466105698e-05, + "loss": 0.2482, + "step": 15493 + }, + { + "epoch": 1.8373058223645202, + "grad_norm": 0.8391011156757014, + "learning_rate": 2.9483490042151496e-05, + "loss": 0.1037, + "step": 15494 + }, + { + "epoch": 1.8374244041266454, + "grad_norm": 2.0083428293393215, + "learning_rate": 2.9481128576865007e-05, + "loss": 0.3277, + "step": 15495 + }, + { + "epoch": 1.8375429858887702, + "grad_norm": 0.9209035404187014, + "learning_rate": 2.9478767070268016e-05, + "loss": 0.1841, + "step": 15496 + }, + { + "epoch": 1.8376615676508954, + "grad_norm": 0.9996791848066113, + "learning_rate": 2.947640552238228e-05, + "loss": 0.2278, + "step": 15497 + }, + { + "epoch": 1.8377801494130201, + "grad_norm": 0.6732401862127024, + "learning_rate": 2.947404393322958e-05, + "loss": 0.137, + "step": 15498 + }, + { + "epoch": 1.8378987311751454, + "grad_norm": 1.1084664256288934, + "learning_rate": 2.9471682302831672e-05, + "loss": 0.2691, + "step": 15499 + }, + { + "epoch": 1.83801731293727, + "grad_norm": 0.6482816576645161, + "learning_rate": 2.946932063121034e-05, + "loss": 0.1328, + "step": 15500 + }, + { + "epoch": 1.8381358946993953, + "grad_norm": 1.1227081780109371, + "learning_rate": 2.9466958918387354e-05, + "loss": 0.3145, + "step": 15501 + }, + { + "epoch": 1.83825447646152, + "grad_norm": 0.8758403831599904, + "learning_rate": 2.9464597164384485e-05, + "loss": 0.1942, + "step": 15502 + }, + { + "epoch": 1.8383730582236453, + "grad_norm": 1.0425075898135845, + "learning_rate": 2.9462235369223508e-05, + "loss": 0.1917, + "step": 15503 + }, + { + "epoch": 1.83849163998577, + "grad_norm": 0.630705654923278, + "learning_rate": 2.9459873532926186e-05, + "loss": 0.1221, + "step": 15504 + }, + { + "epoch": 1.8386102217478952, + "grad_norm": 0.9083342730679429, + "learning_rate": 2.9457511655514302e-05, + "loss": 0.1499, + "step": 15505 + }, + { + "epoch": 1.83872880351002, + "grad_norm": 0.664302784165351, + "learning_rate": 2.9455149737009628e-05, + "loss": 0.1401, + "step": 15506 + }, + { + "epoch": 1.8388473852721452, + "grad_norm": 0.7678042926863818, + "learning_rate": 2.9452787777433942e-05, + "loss": 0.1809, + "step": 15507 + }, + { + "epoch": 1.8389659670342702, + "grad_norm": 1.3778987524572128, + "learning_rate": 2.9450425776809003e-05, + "loss": 0.1862, + "step": 15508 + }, + { + "epoch": 1.8390845487963952, + "grad_norm": 0.6208772653689606, + "learning_rate": 2.9448063735156604e-05, + "loss": 0.1447, + "step": 15509 + }, + { + "epoch": 1.8392031305585201, + "grad_norm": 0.7850130714689275, + "learning_rate": 2.9445701652498503e-05, + "loss": 0.1949, + "step": 15510 + }, + { + "epoch": 1.8393217123206451, + "grad_norm": 0.8433228397821364, + "learning_rate": 2.944333952885649e-05, + "loss": 0.1658, + "step": 15511 + }, + { + "epoch": 1.83944029408277, + "grad_norm": 0.6878163139496245, + "learning_rate": 2.944097736425233e-05, + "loss": 0.1354, + "step": 15512 + }, + { + "epoch": 1.839558875844895, + "grad_norm": 0.704040443752837, + "learning_rate": 2.9438615158707812e-05, + "loss": 0.1534, + "step": 15513 + }, + { + "epoch": 1.83967745760702, + "grad_norm": 0.6676015212422032, + "learning_rate": 2.9436252912244706e-05, + "loss": 0.1264, + "step": 15514 + }, + { + "epoch": 1.839796039369145, + "grad_norm": 0.883115234643828, + "learning_rate": 2.943389062488478e-05, + "loss": 0.1725, + "step": 15515 + }, + { + "epoch": 1.83991462113127, + "grad_norm": 1.0172208142199568, + "learning_rate": 2.9431528296649828e-05, + "loss": 0.193, + "step": 15516 + }, + { + "epoch": 1.840033202893395, + "grad_norm": 0.8328197516015264, + "learning_rate": 2.9429165927561613e-05, + "loss": 0.2467, + "step": 15517 + }, + { + "epoch": 1.84015178465552, + "grad_norm": 0.8117451887952097, + "learning_rate": 2.9426803517641928e-05, + "loss": 0.142, + "step": 15518 + }, + { + "epoch": 1.840270366417645, + "grad_norm": 0.8397542825645542, + "learning_rate": 2.9424441066912534e-05, + "loss": 0.2458, + "step": 15519 + }, + { + "epoch": 1.84038894817977, + "grad_norm": 1.0078051282401075, + "learning_rate": 2.9422078575395225e-05, + "loss": 0.203, + "step": 15520 + }, + { + "epoch": 1.840507529941895, + "grad_norm": 0.5420193010594105, + "learning_rate": 2.9419716043111767e-05, + "loss": 0.1014, + "step": 15521 + }, + { + "epoch": 1.84062611170402, + "grad_norm": 1.0907618659174765, + "learning_rate": 2.9417353470083952e-05, + "loss": 0.222, + "step": 15522 + }, + { + "epoch": 1.8407446934661449, + "grad_norm": 1.0563309552253528, + "learning_rate": 2.9414990856333553e-05, + "loss": 0.2206, + "step": 15523 + }, + { + "epoch": 1.8408632752282699, + "grad_norm": 0.8458022975930581, + "learning_rate": 2.9412628201882358e-05, + "loss": 0.2163, + "step": 15524 + }, + { + "epoch": 1.8409818569903948, + "grad_norm": 0.6600922478012725, + "learning_rate": 2.9410265506752134e-05, + "loss": 0.1147, + "step": 15525 + }, + { + "epoch": 1.8411004387525198, + "grad_norm": 1.0199171113854102, + "learning_rate": 2.9407902770964678e-05, + "loss": 0.1825, + "step": 15526 + }, + { + "epoch": 1.8412190205146448, + "grad_norm": 1.103219083830544, + "learning_rate": 2.9405539994541758e-05, + "loss": 0.2176, + "step": 15527 + }, + { + "epoch": 1.8413376022767698, + "grad_norm": 1.2827932091432572, + "learning_rate": 2.940317717750516e-05, + "loss": 0.2402, + "step": 15528 + }, + { + "epoch": 1.8414561840388948, + "grad_norm": 0.8721853492738934, + "learning_rate": 2.9400814319876673e-05, + "loss": 0.165, + "step": 15529 + }, + { + "epoch": 1.8415747658010198, + "grad_norm": 1.122134696750533, + "learning_rate": 2.9398451421678076e-05, + "loss": 0.297, + "step": 15530 + }, + { + "epoch": 1.8416933475631447, + "grad_norm": 0.9829570932765097, + "learning_rate": 2.9396088482931144e-05, + "loss": 0.2062, + "step": 15531 + }, + { + "epoch": 1.84181192932527, + "grad_norm": 0.8042683453300616, + "learning_rate": 2.9393725503657664e-05, + "loss": 0.1666, + "step": 15532 + }, + { + "epoch": 1.8419305110873947, + "grad_norm": 0.9202558249894406, + "learning_rate": 2.9391362483879437e-05, + "loss": 0.1548, + "step": 15533 + }, + { + "epoch": 1.84204909284952, + "grad_norm": 0.8454260548450594, + "learning_rate": 2.9388999423618223e-05, + "loss": 0.2052, + "step": 15534 + }, + { + "epoch": 1.8421676746116447, + "grad_norm": 0.8925478968597303, + "learning_rate": 2.9386636322895823e-05, + "loss": 0.2294, + "step": 15535 + }, + { + "epoch": 1.8422862563737699, + "grad_norm": 0.9002653005814084, + "learning_rate": 2.938427318173401e-05, + "loss": 0.1629, + "step": 15536 + }, + { + "epoch": 1.8424048381358946, + "grad_norm": 0.6915961148266494, + "learning_rate": 2.9381910000154578e-05, + "loss": 0.144, + "step": 15537 + }, + { + "epoch": 1.8425234198980198, + "grad_norm": 0.6818877006349963, + "learning_rate": 2.9379546778179307e-05, + "loss": 0.1228, + "step": 15538 + }, + { + "epoch": 1.8426420016601446, + "grad_norm": 1.473406741035067, + "learning_rate": 2.9377183515829987e-05, + "loss": 0.3391, + "step": 15539 + }, + { + "epoch": 1.8427605834222698, + "grad_norm": 0.6705623357485878, + "learning_rate": 2.93748202131284e-05, + "loss": 0.1404, + "step": 15540 + }, + { + "epoch": 1.8428791651843945, + "grad_norm": 0.9203295163860467, + "learning_rate": 2.9372456870096333e-05, + "loss": 0.1858, + "step": 15541 + }, + { + "epoch": 1.8429977469465197, + "grad_norm": 0.9613777185043043, + "learning_rate": 2.937009348675558e-05, + "loss": 0.2122, + "step": 15542 + }, + { + "epoch": 1.8431163287086445, + "grad_norm": 0.6184935914394286, + "learning_rate": 2.9367730063127918e-05, + "loss": 0.1102, + "step": 15543 + }, + { + "epoch": 1.8432349104707697, + "grad_norm": 0.7539112068200723, + "learning_rate": 2.936536659923515e-05, + "loss": 0.1457, + "step": 15544 + }, + { + "epoch": 1.8433534922328945, + "grad_norm": 1.183561803533386, + "learning_rate": 2.936300309509905e-05, + "loss": 0.2121, + "step": 15545 + }, + { + "epoch": 1.8434720739950197, + "grad_norm": 0.9456086609690579, + "learning_rate": 2.9360639550741413e-05, + "loss": 0.1912, + "step": 15546 + }, + { + "epoch": 1.8435906557571444, + "grad_norm": 0.7899376814372949, + "learning_rate": 2.935827596618403e-05, + "loss": 0.1598, + "step": 15547 + }, + { + "epoch": 1.8437092375192696, + "grad_norm": 1.0370541990541948, + "learning_rate": 2.9355912341448682e-05, + "loss": 0.2376, + "step": 15548 + }, + { + "epoch": 1.8438278192813944, + "grad_norm": 0.8290271023304456, + "learning_rate": 2.9353548676557162e-05, + "loss": 0.1998, + "step": 15549 + }, + { + "epoch": 1.8439464010435196, + "grad_norm": 1.4183854732448595, + "learning_rate": 2.9351184971531263e-05, + "loss": 0.3062, + "step": 15550 + }, + { + "epoch": 1.8440649828056443, + "grad_norm": 1.0709302491114647, + "learning_rate": 2.9348821226392774e-05, + "loss": 0.1995, + "step": 15551 + }, + { + "epoch": 1.8441835645677696, + "grad_norm": 0.836206134439578, + "learning_rate": 2.9346457441163494e-05, + "loss": 0.2091, + "step": 15552 + }, + { + "epoch": 1.8443021463298943, + "grad_norm": 0.8987637128691102, + "learning_rate": 2.9344093615865198e-05, + "loss": 0.2017, + "step": 15553 + }, + { + "epoch": 1.8444207280920195, + "grad_norm": 0.7705290022867171, + "learning_rate": 2.93417297505197e-05, + "loss": 0.1475, + "step": 15554 + }, + { + "epoch": 1.8445393098541443, + "grad_norm": 0.8308075049951436, + "learning_rate": 2.933936584514876e-05, + "loss": 0.1606, + "step": 15555 + }, + { + "epoch": 1.8446578916162695, + "grad_norm": 0.7248935586388102, + "learning_rate": 2.9337001899774192e-05, + "loss": 0.1857, + "step": 15556 + }, + { + "epoch": 1.8447764733783945, + "grad_norm": 0.8031174152080248, + "learning_rate": 2.9334637914417796e-05, + "loss": 0.1593, + "step": 15557 + }, + { + "epoch": 1.8448950551405194, + "grad_norm": 0.5641743331417466, + "learning_rate": 2.9332273889101343e-05, + "loss": 0.1277, + "step": 15558 + }, + { + "epoch": 1.8450136369026444, + "grad_norm": 0.5708075724398769, + "learning_rate": 2.932990982384664e-05, + "loss": 0.1466, + "step": 15559 + }, + { + "epoch": 1.8451322186647694, + "grad_norm": 0.9241076569195626, + "learning_rate": 2.9327545718675475e-05, + "loss": 0.1991, + "step": 15560 + }, + { + "epoch": 1.8452508004268944, + "grad_norm": 1.0026829470406493, + "learning_rate": 2.932518157360965e-05, + "loss": 0.1558, + "step": 15561 + }, + { + "epoch": 1.8453693821890194, + "grad_norm": 0.8586297986464995, + "learning_rate": 2.9322817388670947e-05, + "loss": 0.1321, + "step": 15562 + }, + { + "epoch": 1.8454879639511443, + "grad_norm": 0.8255254271120334, + "learning_rate": 2.932045316388118e-05, + "loss": 0.1903, + "step": 15563 + }, + { + "epoch": 1.8456065457132693, + "grad_norm": 1.2823175562279303, + "learning_rate": 2.9318088899262124e-05, + "loss": 0.2436, + "step": 15564 + }, + { + "epoch": 1.8457251274753943, + "grad_norm": 0.874445783958458, + "learning_rate": 2.9315724594835593e-05, + "loss": 0.1928, + "step": 15565 + }, + { + "epoch": 1.8458437092375193, + "grad_norm": 0.8545078057303036, + "learning_rate": 2.9313360250623368e-05, + "loss": 0.2112, + "step": 15566 + }, + { + "epoch": 1.8459622909996443, + "grad_norm": 1.0224309740962911, + "learning_rate": 2.931099586664725e-05, + "loss": 0.1961, + "step": 15567 + }, + { + "epoch": 1.8460808727617692, + "grad_norm": 1.0297019893046673, + "learning_rate": 2.9308631442929036e-05, + "loss": 0.2588, + "step": 15568 + }, + { + "epoch": 1.8461994545238942, + "grad_norm": 1.7061857389486814, + "learning_rate": 2.930626697949053e-05, + "loss": 0.2817, + "step": 15569 + }, + { + "epoch": 1.8463180362860192, + "grad_norm": 0.8999758072998301, + "learning_rate": 2.9303902476353516e-05, + "loss": 0.1853, + "step": 15570 + }, + { + "epoch": 1.8464366180481442, + "grad_norm": 0.781073619641064, + "learning_rate": 2.93015379335398e-05, + "loss": 0.1344, + "step": 15571 + }, + { + "epoch": 1.8465551998102692, + "grad_norm": 0.7498439022285138, + "learning_rate": 2.929917335107118e-05, + "loss": 0.1365, + "step": 15572 + }, + { + "epoch": 1.8466737815723941, + "grad_norm": 0.9505779648776173, + "learning_rate": 2.929680872896945e-05, + "loss": 0.2285, + "step": 15573 + }, + { + "epoch": 1.8467923633345191, + "grad_norm": 0.8382397437900913, + "learning_rate": 2.929444406725642e-05, + "loss": 0.1491, + "step": 15574 + }, + { + "epoch": 1.846910945096644, + "grad_norm": 0.8415434501040656, + "learning_rate": 2.929207936595388e-05, + "loss": 0.1704, + "step": 15575 + }, + { + "epoch": 1.847029526858769, + "grad_norm": 0.9083594701187442, + "learning_rate": 2.928971462508363e-05, + "loss": 0.1749, + "step": 15576 + }, + { + "epoch": 1.847148108620894, + "grad_norm": 1.1491848036634968, + "learning_rate": 2.9287349844667473e-05, + "loss": 0.2666, + "step": 15577 + }, + { + "epoch": 1.847266690383019, + "grad_norm": 0.6493716535398631, + "learning_rate": 2.9284985024727206e-05, + "loss": 0.1155, + "step": 15578 + }, + { + "epoch": 1.847385272145144, + "grad_norm": 0.9759288863422841, + "learning_rate": 2.928262016528463e-05, + "loss": 0.2059, + "step": 15579 + }, + { + "epoch": 1.847503853907269, + "grad_norm": 0.7124178859063552, + "learning_rate": 2.928025526636155e-05, + "loss": 0.1488, + "step": 15580 + }, + { + "epoch": 1.847622435669394, + "grad_norm": 1.940151202141396, + "learning_rate": 2.9277890327979762e-05, + "loss": 0.4144, + "step": 15581 + }, + { + "epoch": 1.847741017431519, + "grad_norm": 1.4792745672100487, + "learning_rate": 2.9275525350161082e-05, + "loss": 0.2656, + "step": 15582 + }, + { + "epoch": 1.8478595991936442, + "grad_norm": 0.7330227774173688, + "learning_rate": 2.9273160332927286e-05, + "loss": 0.1487, + "step": 15583 + }, + { + "epoch": 1.847978180955769, + "grad_norm": 1.333876655140558, + "learning_rate": 2.9270795276300207e-05, + "loss": 0.2877, + "step": 15584 + }, + { + "epoch": 1.8480967627178941, + "grad_norm": 0.6386843792573899, + "learning_rate": 2.926843018030162e-05, + "loss": 0.1453, + "step": 15585 + }, + { + "epoch": 1.848215344480019, + "grad_norm": 0.5259683041838589, + "learning_rate": 2.9266065044953345e-05, + "loss": 0.1009, + "step": 15586 + }, + { + "epoch": 1.848333926242144, + "grad_norm": 0.9004323502534072, + "learning_rate": 2.9263699870277184e-05, + "loss": 0.224, + "step": 15587 + }, + { + "epoch": 1.8484525080042689, + "grad_norm": 1.4306150656678245, + "learning_rate": 2.9261334656294938e-05, + "loss": 0.2986, + "step": 15588 + }, + { + "epoch": 1.848571089766394, + "grad_norm": 0.7368854490568835, + "learning_rate": 2.9258969403028414e-05, + "loss": 0.185, + "step": 15589 + }, + { + "epoch": 1.8486896715285188, + "grad_norm": 1.025749766666629, + "learning_rate": 2.9256604110499413e-05, + "loss": 0.19, + "step": 15590 + }, + { + "epoch": 1.848808253290644, + "grad_norm": 0.7292812815899912, + "learning_rate": 2.9254238778729738e-05, + "loss": 0.1619, + "step": 15591 + }, + { + "epoch": 1.8489268350527688, + "grad_norm": 0.7071279221656349, + "learning_rate": 2.92518734077412e-05, + "loss": 0.1417, + "step": 15592 + }, + { + "epoch": 1.849045416814894, + "grad_norm": 0.7197525768949439, + "learning_rate": 2.924950799755561e-05, + "loss": 0.1615, + "step": 15593 + }, + { + "epoch": 1.8491639985770187, + "grad_norm": 0.8378307988576199, + "learning_rate": 2.9247142548194755e-05, + "loss": 0.1844, + "step": 15594 + }, + { + "epoch": 1.849282580339144, + "grad_norm": 1.3233558238150476, + "learning_rate": 2.9244777059680466e-05, + "loss": 0.2613, + "step": 15595 + }, + { + "epoch": 1.8494011621012687, + "grad_norm": 0.9012908047189936, + "learning_rate": 2.924241153203453e-05, + "loss": 0.2078, + "step": 15596 + }, + { + "epoch": 1.849519743863394, + "grad_norm": 0.5147452961093335, + "learning_rate": 2.924004596527877e-05, + "loss": 0.129, + "step": 15597 + }, + { + "epoch": 1.8496383256255187, + "grad_norm": 0.8858035348823747, + "learning_rate": 2.9237680359434984e-05, + "loss": 0.2661, + "step": 15598 + }, + { + "epoch": 1.8497569073876439, + "grad_norm": 0.6828880680424807, + "learning_rate": 2.9235314714524976e-05, + "loss": 0.166, + "step": 15599 + }, + { + "epoch": 1.8498754891497686, + "grad_norm": 1.1935208257339225, + "learning_rate": 2.9232949030570554e-05, + "loss": 0.2568, + "step": 15600 + }, + { + "epoch": 1.8499940709118938, + "grad_norm": 0.8898511995929292, + "learning_rate": 2.923058330759354e-05, + "loss": 0.1856, + "step": 15601 + }, + { + "epoch": 1.8501126526740186, + "grad_norm": 0.8124758221647069, + "learning_rate": 2.9228217545615743e-05, + "loss": 0.1444, + "step": 15602 + }, + { + "epoch": 1.8502312344361438, + "grad_norm": 0.8316478480568147, + "learning_rate": 2.9225851744658956e-05, + "loss": 0.2147, + "step": 15603 + }, + { + "epoch": 1.8503498161982685, + "grad_norm": 0.8019987553462476, + "learning_rate": 2.9223485904745006e-05, + "loss": 0.1753, + "step": 15604 + }, + { + "epoch": 1.8504683979603938, + "grad_norm": 0.7157762226264879, + "learning_rate": 2.9221120025895686e-05, + "loss": 0.1503, + "step": 15605 + }, + { + "epoch": 1.8505869797225185, + "grad_norm": 0.8714854020859208, + "learning_rate": 2.9218754108132822e-05, + "loss": 0.1978, + "step": 15606 + }, + { + "epoch": 1.8507055614846437, + "grad_norm": 1.103012406649438, + "learning_rate": 2.9216388151478213e-05, + "loss": 0.2639, + "step": 15607 + }, + { + "epoch": 1.8508241432467687, + "grad_norm": 0.8915620592165444, + "learning_rate": 2.9214022155953686e-05, + "loss": 0.191, + "step": 15608 + }, + { + "epoch": 1.8509427250088937, + "grad_norm": 1.5023907277175206, + "learning_rate": 2.9211656121581032e-05, + "loss": 0.265, + "step": 15609 + }, + { + "epoch": 1.8510613067710187, + "grad_norm": 0.7089361084194097, + "learning_rate": 2.9209290048382076e-05, + "loss": 0.1453, + "step": 15610 + }, + { + "epoch": 1.8511798885331436, + "grad_norm": 0.870043087568511, + "learning_rate": 2.9206923936378626e-05, + "loss": 0.1755, + "step": 15611 + }, + { + "epoch": 1.8512984702952686, + "grad_norm": 0.5965664842488005, + "learning_rate": 2.92045577855925e-05, + "loss": 0.1208, + "step": 15612 + }, + { + "epoch": 1.8514170520573936, + "grad_norm": 0.9279671591344625, + "learning_rate": 2.9202191596045503e-05, + "loss": 0.18, + "step": 15613 + }, + { + "epoch": 1.8515356338195186, + "grad_norm": 1.3482982801317842, + "learning_rate": 2.9199825367759452e-05, + "loss": 0.3088, + "step": 15614 + }, + { + "epoch": 1.8516542155816436, + "grad_norm": 1.0507404848474609, + "learning_rate": 2.919745910075617e-05, + "loss": 0.2782, + "step": 15615 + }, + { + "epoch": 1.8517727973437685, + "grad_norm": 0.9869963160485663, + "learning_rate": 2.9195092795057456e-05, + "loss": 0.2413, + "step": 15616 + }, + { + "epoch": 1.8518913791058935, + "grad_norm": 1.016546673477003, + "learning_rate": 2.9192726450685137e-05, + "loss": 0.2429, + "step": 15617 + }, + { + "epoch": 1.8520099608680185, + "grad_norm": 1.4008981084399015, + "learning_rate": 2.919036006766101e-05, + "loss": 0.3219, + "step": 15618 + }, + { + "epoch": 1.8521285426301435, + "grad_norm": 0.6283334707237043, + "learning_rate": 2.9187993646006913e-05, + "loss": 0.1806, + "step": 15619 + }, + { + "epoch": 1.8522471243922685, + "grad_norm": 0.9422028146771613, + "learning_rate": 2.918562718574464e-05, + "loss": 0.1768, + "step": 15620 + }, + { + "epoch": 1.8523657061543934, + "grad_norm": 1.07751274275259, + "learning_rate": 2.9183260686896025e-05, + "loss": 0.2334, + "step": 15621 + }, + { + "epoch": 1.8524842879165184, + "grad_norm": 0.7046371530438592, + "learning_rate": 2.9180894149482867e-05, + "loss": 0.1419, + "step": 15622 + }, + { + "epoch": 1.8526028696786434, + "grad_norm": 0.7243516592360328, + "learning_rate": 2.9178527573527004e-05, + "loss": 0.1689, + "step": 15623 + }, + { + "epoch": 1.8527214514407684, + "grad_norm": 0.871906904811501, + "learning_rate": 2.9176160959050236e-05, + "loss": 0.2386, + "step": 15624 + }, + { + "epoch": 1.8528400332028934, + "grad_norm": 1.3728896169990192, + "learning_rate": 2.9173794306074385e-05, + "loss": 0.3783, + "step": 15625 + }, + { + "epoch": 1.8529586149650183, + "grad_norm": 0.7782717284178692, + "learning_rate": 2.9171427614621266e-05, + "loss": 0.1511, + "step": 15626 + }, + { + "epoch": 1.8530771967271433, + "grad_norm": 0.9904610181942246, + "learning_rate": 2.9169060884712705e-05, + "loss": 0.2055, + "step": 15627 + }, + { + "epoch": 1.8531957784892683, + "grad_norm": 0.6846036994155176, + "learning_rate": 2.916669411637051e-05, + "loss": 0.1487, + "step": 15628 + }, + { + "epoch": 1.8533143602513933, + "grad_norm": 0.8130760677623585, + "learning_rate": 2.91643273096165e-05, + "loss": 0.1452, + "step": 15629 + }, + { + "epoch": 1.8534329420135183, + "grad_norm": 0.766813565558471, + "learning_rate": 2.916196046447251e-05, + "loss": 0.1518, + "step": 15630 + }, + { + "epoch": 1.8535515237756433, + "grad_norm": 0.7273536122778759, + "learning_rate": 2.9159593580960337e-05, + "loss": 0.1637, + "step": 15631 + }, + { + "epoch": 1.8536701055377685, + "grad_norm": 0.6366873424029477, + "learning_rate": 2.9157226659101822e-05, + "loss": 0.1361, + "step": 15632 + }, + { + "epoch": 1.8537886872998932, + "grad_norm": 0.8260746777707846, + "learning_rate": 2.9154859698918773e-05, + "loss": 0.202, + "step": 15633 + }, + { + "epoch": 1.8539072690620184, + "grad_norm": 0.8328156127031066, + "learning_rate": 2.9152492700433015e-05, + "loss": 0.177, + "step": 15634 + }, + { + "epoch": 1.8540258508241432, + "grad_norm": 0.8609523421584799, + "learning_rate": 2.9150125663666363e-05, + "loss": 0.1577, + "step": 15635 + }, + { + "epoch": 1.8541444325862684, + "grad_norm": 0.789895179635797, + "learning_rate": 2.9147758588640638e-05, + "loss": 0.1648, + "step": 15636 + }, + { + "epoch": 1.8542630143483931, + "grad_norm": 0.9408002739541972, + "learning_rate": 2.914539147537767e-05, + "loss": 0.1616, + "step": 15637 + }, + { + "epoch": 1.8543815961105183, + "grad_norm": 0.7373062096743352, + "learning_rate": 2.914302432389928e-05, + "loss": 0.1554, + "step": 15638 + }, + { + "epoch": 1.854500177872643, + "grad_norm": 0.982429227946715, + "learning_rate": 2.9140657134227278e-05, + "loss": 0.1523, + "step": 15639 + }, + { + "epoch": 1.8546187596347683, + "grad_norm": 0.8881578475008414, + "learning_rate": 2.9138289906383497e-05, + "loss": 0.2302, + "step": 15640 + }, + { + "epoch": 1.854737341396893, + "grad_norm": 0.8005171672667237, + "learning_rate": 2.9135922640389756e-05, + "loss": 0.1867, + "step": 15641 + }, + { + "epoch": 1.8548559231590183, + "grad_norm": 0.6910683775076653, + "learning_rate": 2.913355533626788e-05, + "loss": 0.162, + "step": 15642 + }, + { + "epoch": 1.854974504921143, + "grad_norm": 0.8798792327105698, + "learning_rate": 2.91311879940397e-05, + "loss": 0.1914, + "step": 15643 + }, + { + "epoch": 1.8550930866832682, + "grad_norm": 1.1819467065681852, + "learning_rate": 2.9128820613727025e-05, + "loss": 0.2476, + "step": 15644 + }, + { + "epoch": 1.855211668445393, + "grad_norm": 0.8273042250409575, + "learning_rate": 2.9126453195351694e-05, + "loss": 0.1672, + "step": 15645 + }, + { + "epoch": 1.8553302502075182, + "grad_norm": 1.0607297314171167, + "learning_rate": 2.912408573893552e-05, + "loss": 0.2329, + "step": 15646 + }, + { + "epoch": 1.855448831969643, + "grad_norm": 1.7986048301015167, + "learning_rate": 2.9121718244500335e-05, + "loss": 0.3663, + "step": 15647 + }, + { + "epoch": 1.8555674137317681, + "grad_norm": 0.8043641982440064, + "learning_rate": 2.9119350712067955e-05, + "loss": 0.1533, + "step": 15648 + }, + { + "epoch": 1.855685995493893, + "grad_norm": 0.9367515341028413, + "learning_rate": 2.9116983141660226e-05, + "loss": 0.1623, + "step": 15649 + }, + { + "epoch": 1.855804577256018, + "grad_norm": 0.644915371918487, + "learning_rate": 2.911461553329895e-05, + "loss": 0.1624, + "step": 15650 + }, + { + "epoch": 1.8559231590181429, + "grad_norm": 0.8575947820497104, + "learning_rate": 2.9112247887005968e-05, + "loss": 0.1819, + "step": 15651 + }, + { + "epoch": 1.856041740780268, + "grad_norm": 0.9230339500987473, + "learning_rate": 2.9109880202803095e-05, + "loss": 0.1956, + "step": 15652 + }, + { + "epoch": 1.8561603225423928, + "grad_norm": 0.8614010196402185, + "learning_rate": 2.9107512480712185e-05, + "loss": 0.2006, + "step": 15653 + }, + { + "epoch": 1.856278904304518, + "grad_norm": 0.9003924831140376, + "learning_rate": 2.910514472075503e-05, + "loss": 0.1394, + "step": 15654 + }, + { + "epoch": 1.8563974860666428, + "grad_norm": 1.1020308361792919, + "learning_rate": 2.910277692295349e-05, + "loss": 0.1606, + "step": 15655 + }, + { + "epoch": 1.856516067828768, + "grad_norm": 0.8904009821810605, + "learning_rate": 2.9100409087329363e-05, + "loss": 0.1629, + "step": 15656 + }, + { + "epoch": 1.856634649590893, + "grad_norm": 0.820372429711094, + "learning_rate": 2.9098041213904497e-05, + "loss": 0.171, + "step": 15657 + }, + { + "epoch": 1.856753231353018, + "grad_norm": 0.9399088111972209, + "learning_rate": 2.909567330270072e-05, + "loss": 0.168, + "step": 15658 + }, + { + "epoch": 1.856871813115143, + "grad_norm": 0.9272690330266089, + "learning_rate": 2.9093305353739853e-05, + "loss": 0.2163, + "step": 15659 + }, + { + "epoch": 1.856990394877268, + "grad_norm": 0.9729458100674276, + "learning_rate": 2.9090937367043734e-05, + "loss": 0.1511, + "step": 15660 + }, + { + "epoch": 1.857108976639393, + "grad_norm": 0.9285184514541763, + "learning_rate": 2.9088569342634185e-05, + "loss": 0.226, + "step": 15661 + }, + { + "epoch": 1.8572275584015179, + "grad_norm": 0.6976503687526869, + "learning_rate": 2.908620128053305e-05, + "loss": 0.1508, + "step": 15662 + }, + { + "epoch": 1.8573461401636429, + "grad_norm": 1.0018706443896965, + "learning_rate": 2.9083833180762138e-05, + "loss": 0.1921, + "step": 15663 + }, + { + "epoch": 1.8574647219257678, + "grad_norm": 0.9253091323930899, + "learning_rate": 2.9081465043343303e-05, + "loss": 0.2026, + "step": 15664 + }, + { + "epoch": 1.8575833036878928, + "grad_norm": 0.8241201571412, + "learning_rate": 2.9079096868298356e-05, + "loss": 0.1614, + "step": 15665 + }, + { + "epoch": 1.8577018854500178, + "grad_norm": 1.001766706979655, + "learning_rate": 2.9076728655649145e-05, + "loss": 0.2049, + "step": 15666 + }, + { + "epoch": 1.8578204672121428, + "grad_norm": 0.7216001944845267, + "learning_rate": 2.9074360405417488e-05, + "loss": 0.1498, + "step": 15667 + }, + { + "epoch": 1.8579390489742678, + "grad_norm": 1.1248656522948117, + "learning_rate": 2.9071992117625236e-05, + "loss": 0.2133, + "step": 15668 + }, + { + "epoch": 1.8580576307363927, + "grad_norm": 0.8359184327256771, + "learning_rate": 2.90696237922942e-05, + "loss": 0.1611, + "step": 15669 + }, + { + "epoch": 1.8581762124985177, + "grad_norm": 0.6594707286038638, + "learning_rate": 2.906725542944622e-05, + "loss": 0.1196, + "step": 15670 + }, + { + "epoch": 1.8582947942606427, + "grad_norm": 0.8706476299053684, + "learning_rate": 2.906488702910314e-05, + "loss": 0.156, + "step": 15671 + }, + { + "epoch": 1.8584133760227677, + "grad_norm": 0.9809505457370777, + "learning_rate": 2.906251859128678e-05, + "loss": 0.1839, + "step": 15672 + }, + { + "epoch": 1.8585319577848927, + "grad_norm": 0.908385687254952, + "learning_rate": 2.9060150116018987e-05, + "loss": 0.2052, + "step": 15673 + }, + { + "epoch": 1.8586505395470176, + "grad_norm": 0.6403852497813047, + "learning_rate": 2.905778160332158e-05, + "loss": 0.1558, + "step": 15674 + }, + { + "epoch": 1.8587691213091426, + "grad_norm": 0.6910058246520515, + "learning_rate": 2.905541305321642e-05, + "loss": 0.1422, + "step": 15675 + }, + { + "epoch": 1.8588877030712676, + "grad_norm": 0.868689309602238, + "learning_rate": 2.905304446572531e-05, + "loss": 0.1967, + "step": 15676 + }, + { + "epoch": 1.8590062848333926, + "grad_norm": 0.7812059251581103, + "learning_rate": 2.9050675840870106e-05, + "loss": 0.1892, + "step": 15677 + }, + { + "epoch": 1.8591248665955176, + "grad_norm": 0.9801019398123298, + "learning_rate": 2.9048307178672635e-05, + "loss": 0.1834, + "step": 15678 + }, + { + "epoch": 1.8592434483576425, + "grad_norm": 0.8751198766746738, + "learning_rate": 2.9045938479154734e-05, + "loss": 0.1898, + "step": 15679 + }, + { + "epoch": 1.8593620301197675, + "grad_norm": 0.6745627940207227, + "learning_rate": 2.9043569742338238e-05, + "loss": 0.1197, + "step": 15680 + }, + { + "epoch": 1.8594806118818927, + "grad_norm": 0.8375523037175205, + "learning_rate": 2.9041200968245e-05, + "loss": 0.1328, + "step": 15681 + }, + { + "epoch": 1.8595991936440175, + "grad_norm": 1.3329359252954864, + "learning_rate": 2.903883215689684e-05, + "loss": 0.3215, + "step": 15682 + }, + { + "epoch": 1.8597177754061427, + "grad_norm": 1.2509264056929286, + "learning_rate": 2.90364633083156e-05, + "loss": 0.2856, + "step": 15683 + }, + { + "epoch": 1.8598363571682675, + "grad_norm": 1.014680619244333, + "learning_rate": 2.9034094422523118e-05, + "loss": 0.2064, + "step": 15684 + }, + { + "epoch": 1.8599549389303927, + "grad_norm": 0.7746250640363895, + "learning_rate": 2.9031725499541228e-05, + "loss": 0.2197, + "step": 15685 + }, + { + "epoch": 1.8600735206925174, + "grad_norm": 0.9214808425410601, + "learning_rate": 2.9029356539391777e-05, + "loss": 0.1604, + "step": 15686 + }, + { + "epoch": 1.8601921024546426, + "grad_norm": 0.6913923478278179, + "learning_rate": 2.90269875420966e-05, + "loss": 0.1411, + "step": 15687 + }, + { + "epoch": 1.8603106842167674, + "grad_norm": 0.7744038931820316, + "learning_rate": 2.9024618507677538e-05, + "loss": 0.2111, + "step": 15688 + }, + { + "epoch": 1.8604292659788926, + "grad_norm": 1.0254662211981798, + "learning_rate": 2.9022249436156423e-05, + "loss": 0.2093, + "step": 15689 + }, + { + "epoch": 1.8605478477410173, + "grad_norm": 0.7516137964496158, + "learning_rate": 2.9019880327555103e-05, + "loss": 0.1231, + "step": 15690 + }, + { + "epoch": 1.8606664295031425, + "grad_norm": 0.5374938668771707, + "learning_rate": 2.9017511181895417e-05, + "loss": 0.1286, + "step": 15691 + }, + { + "epoch": 1.8607850112652673, + "grad_norm": 1.3718887838689324, + "learning_rate": 2.901514199919921e-05, + "loss": 0.2562, + "step": 15692 + }, + { + "epoch": 1.8609035930273925, + "grad_norm": 0.895108497796498, + "learning_rate": 2.9012772779488305e-05, + "loss": 0.1863, + "step": 15693 + }, + { + "epoch": 1.8610221747895173, + "grad_norm": 0.8744748401003857, + "learning_rate": 2.901040352278457e-05, + "loss": 0.1445, + "step": 15694 + }, + { + "epoch": 1.8611407565516425, + "grad_norm": 0.5645227287826177, + "learning_rate": 2.900803422910982e-05, + "loss": 0.1169, + "step": 15695 + }, + { + "epoch": 1.8612593383137672, + "grad_norm": 0.6337871798591447, + "learning_rate": 2.9005664898485917e-05, + "loss": 0.1115, + "step": 15696 + }, + { + "epoch": 1.8613779200758924, + "grad_norm": 0.8049843822911771, + "learning_rate": 2.9003295530934692e-05, + "loss": 0.1927, + "step": 15697 + }, + { + "epoch": 1.8614965018380172, + "grad_norm": 0.837988393959526, + "learning_rate": 2.9000926126477994e-05, + "loss": 0.1527, + "step": 15698 + }, + { + "epoch": 1.8616150836001424, + "grad_norm": 1.7878125229895176, + "learning_rate": 2.8998556685137663e-05, + "loss": 0.3658, + "step": 15699 + }, + { + "epoch": 1.8617336653622671, + "grad_norm": 0.8275902150069597, + "learning_rate": 2.899618720693554e-05, + "loss": 0.1851, + "step": 15700 + }, + { + "epoch": 1.8618522471243923, + "grad_norm": 0.8515986436068896, + "learning_rate": 2.8993817691893466e-05, + "loss": 0.1772, + "step": 15701 + }, + { + "epoch": 1.861970828886517, + "grad_norm": 0.812208020719093, + "learning_rate": 2.8991448140033296e-05, + "loss": 0.1915, + "step": 15702 + }, + { + "epoch": 1.8620894106486423, + "grad_norm": 0.7899743116550093, + "learning_rate": 2.8989078551376876e-05, + "loss": 0.1834, + "step": 15703 + }, + { + "epoch": 1.862207992410767, + "grad_norm": 0.6632674680948493, + "learning_rate": 2.8986708925946033e-05, + "loss": 0.1714, + "step": 15704 + }, + { + "epoch": 1.8623265741728923, + "grad_norm": 0.7421330041548343, + "learning_rate": 2.8984339263762634e-05, + "loss": 0.1437, + "step": 15705 + }, + { + "epoch": 1.8624451559350172, + "grad_norm": 0.8749952754176814, + "learning_rate": 2.8981969564848498e-05, + "loss": 0.1962, + "step": 15706 + }, + { + "epoch": 1.8625637376971422, + "grad_norm": 0.7052780358153463, + "learning_rate": 2.8979599829225503e-05, + "loss": 0.1338, + "step": 15707 + }, + { + "epoch": 1.8626823194592672, + "grad_norm": 0.9547332606122376, + "learning_rate": 2.897723005691546e-05, + "loss": 0.1595, + "step": 15708 + }, + { + "epoch": 1.8628009012213922, + "grad_norm": 0.6765995914432513, + "learning_rate": 2.8974860247940245e-05, + "loss": 0.1529, + "step": 15709 + }, + { + "epoch": 1.8629194829835172, + "grad_norm": 0.5745832210648041, + "learning_rate": 2.8972490402321683e-05, + "loss": 0.1342, + "step": 15710 + }, + { + "epoch": 1.8630380647456422, + "grad_norm": 0.9591877553947494, + "learning_rate": 2.897012052008164e-05, + "loss": 0.2211, + "step": 15711 + }, + { + "epoch": 1.8631566465077671, + "grad_norm": 0.8565683979125435, + "learning_rate": 2.896775060124195e-05, + "loss": 0.171, + "step": 15712 + }, + { + "epoch": 1.8632752282698921, + "grad_norm": 1.0795536365000349, + "learning_rate": 2.896538064582447e-05, + "loss": 0.2273, + "step": 15713 + }, + { + "epoch": 1.863393810032017, + "grad_norm": 1.002563274499882, + "learning_rate": 2.8963010653851035e-05, + "loss": 0.2383, + "step": 15714 + }, + { + "epoch": 1.863512391794142, + "grad_norm": 0.8637176975892262, + "learning_rate": 2.8960640625343505e-05, + "loss": 0.1448, + "step": 15715 + }, + { + "epoch": 1.863630973556267, + "grad_norm": 0.9866310733151303, + "learning_rate": 2.895827056032373e-05, + "loss": 0.2423, + "step": 15716 + }, + { + "epoch": 1.863749555318392, + "grad_norm": 0.8554819326118541, + "learning_rate": 2.895590045881355e-05, + "loss": 0.2256, + "step": 15717 + }, + { + "epoch": 1.863868137080517, + "grad_norm": 0.8037109261312879, + "learning_rate": 2.8953530320834822e-05, + "loss": 0.2161, + "step": 15718 + }, + { + "epoch": 1.863986718842642, + "grad_norm": 0.6474137052303837, + "learning_rate": 2.895116014640939e-05, + "loss": 0.1268, + "step": 15719 + }, + { + "epoch": 1.864105300604767, + "grad_norm": 0.8587270776136704, + "learning_rate": 2.8948789935559108e-05, + "loss": 0.2308, + "step": 15720 + }, + { + "epoch": 1.864223882366892, + "grad_norm": 0.8182373802362489, + "learning_rate": 2.8946419688305813e-05, + "loss": 0.1443, + "step": 15721 + }, + { + "epoch": 1.864342464129017, + "grad_norm": 1.3265198984237285, + "learning_rate": 2.8944049404671387e-05, + "loss": 0.297, + "step": 15722 + }, + { + "epoch": 1.864461045891142, + "grad_norm": 0.9118667041548154, + "learning_rate": 2.8941679084677654e-05, + "loss": 0.1955, + "step": 15723 + }, + { + "epoch": 1.864579627653267, + "grad_norm": 1.2622831460584076, + "learning_rate": 2.8939308728346475e-05, + "loss": 0.2508, + "step": 15724 + }, + { + "epoch": 1.8646982094153919, + "grad_norm": 0.9324726997806493, + "learning_rate": 2.8936938335699694e-05, + "loss": 0.2141, + "step": 15725 + }, + { + "epoch": 1.8648167911775169, + "grad_norm": 1.0928001754902494, + "learning_rate": 2.893456790675918e-05, + "loss": 0.197, + "step": 15726 + }, + { + "epoch": 1.8649353729396418, + "grad_norm": 0.8952926398432173, + "learning_rate": 2.8932197441546765e-05, + "loss": 0.1814, + "step": 15727 + }, + { + "epoch": 1.8650539547017668, + "grad_norm": 0.8758312594635707, + "learning_rate": 2.8929826940084314e-05, + "loss": 0.1482, + "step": 15728 + }, + { + "epoch": 1.8651725364638918, + "grad_norm": 1.2181686553786264, + "learning_rate": 2.8927456402393678e-05, + "loss": 0.2946, + "step": 15729 + }, + { + "epoch": 1.8652911182260168, + "grad_norm": 1.4131469436617585, + "learning_rate": 2.8925085828496706e-05, + "loss": 0.2779, + "step": 15730 + }, + { + "epoch": 1.8654096999881418, + "grad_norm": 1.0694390902186148, + "learning_rate": 2.8922715218415263e-05, + "loss": 0.1894, + "step": 15731 + }, + { + "epoch": 1.865528281750267, + "grad_norm": 0.6836333681877979, + "learning_rate": 2.892034457217119e-05, + "loss": 0.1541, + "step": 15732 + }, + { + "epoch": 1.8656468635123917, + "grad_norm": 0.8596774037946419, + "learning_rate": 2.891797388978636e-05, + "loss": 0.2096, + "step": 15733 + }, + { + "epoch": 1.865765445274517, + "grad_norm": 0.8250029415054085, + "learning_rate": 2.89156031712826e-05, + "loss": 0.1802, + "step": 15734 + }, + { + "epoch": 1.8658840270366417, + "grad_norm": 0.678079837059266, + "learning_rate": 2.8913232416681796e-05, + "loss": 0.1479, + "step": 15735 + }, + { + "epoch": 1.866002608798767, + "grad_norm": 0.809114400221667, + "learning_rate": 2.8910861626005776e-05, + "loss": 0.2033, + "step": 15736 + }, + { + "epoch": 1.8661211905608917, + "grad_norm": 0.6163417785646943, + "learning_rate": 2.8908490799276412e-05, + "loss": 0.1571, + "step": 15737 + }, + { + "epoch": 1.8662397723230169, + "grad_norm": 0.6840862693957209, + "learning_rate": 2.8906119936515554e-05, + "loss": 0.1481, + "step": 15738 + }, + { + "epoch": 1.8663583540851416, + "grad_norm": 0.8938975462306731, + "learning_rate": 2.890374903774506e-05, + "loss": 0.1463, + "step": 15739 + }, + { + "epoch": 1.8664769358472668, + "grad_norm": 0.9504721647322899, + "learning_rate": 2.8901378102986783e-05, + "loss": 0.233, + "step": 15740 + }, + { + "epoch": 1.8665955176093916, + "grad_norm": 0.7146363370393196, + "learning_rate": 2.889900713226259e-05, + "loss": 0.1504, + "step": 15741 + }, + { + "epoch": 1.8667140993715168, + "grad_norm": 1.0396518166153694, + "learning_rate": 2.8896636125594328e-05, + "loss": 0.259, + "step": 15742 + }, + { + "epoch": 1.8668326811336415, + "grad_norm": 0.9360046472568816, + "learning_rate": 2.8894265083003862e-05, + "loss": 0.2038, + "step": 15743 + }, + { + "epoch": 1.8669512628957667, + "grad_norm": 0.7872630127277365, + "learning_rate": 2.8891894004513053e-05, + "loss": 0.1399, + "step": 15744 + }, + { + "epoch": 1.8670698446578915, + "grad_norm": 1.0436326735132802, + "learning_rate": 2.8889522890143745e-05, + "loss": 0.2547, + "step": 15745 + }, + { + "epoch": 1.8671884264200167, + "grad_norm": 0.8509003437879862, + "learning_rate": 2.888715173991781e-05, + "loss": 0.1644, + "step": 15746 + }, + { + "epoch": 1.8673070081821415, + "grad_norm": 0.8684813648614095, + "learning_rate": 2.8884780553857104e-05, + "loss": 0.1927, + "step": 15747 + }, + { + "epoch": 1.8674255899442667, + "grad_norm": 0.7796511634699426, + "learning_rate": 2.8882409331983486e-05, + "loss": 0.1896, + "step": 15748 + }, + { + "epoch": 1.8675441717063914, + "grad_norm": 0.6767253517150421, + "learning_rate": 2.888003807431881e-05, + "loss": 0.1375, + "step": 15749 + }, + { + "epoch": 1.8676627534685166, + "grad_norm": 0.7616364986488229, + "learning_rate": 2.887766678088494e-05, + "loss": 0.157, + "step": 15750 + }, + { + "epoch": 1.8677813352306414, + "grad_norm": 0.547953528078479, + "learning_rate": 2.887529545170374e-05, + "loss": 0.1233, + "step": 15751 + }, + { + "epoch": 1.8678999169927666, + "grad_norm": 0.7030006715050804, + "learning_rate": 2.887292408679707e-05, + "loss": 0.1275, + "step": 15752 + }, + { + "epoch": 1.8680184987548913, + "grad_norm": 1.0234857295299398, + "learning_rate": 2.8870552686186784e-05, + "loss": 0.2364, + "step": 15753 + }, + { + "epoch": 1.8681370805170165, + "grad_norm": 0.7723577842499634, + "learning_rate": 2.8868181249894754e-05, + "loss": 0.1461, + "step": 15754 + }, + { + "epoch": 1.8682556622791413, + "grad_norm": 1.081796725484585, + "learning_rate": 2.8865809777942833e-05, + "loss": 0.2771, + "step": 15755 + }, + { + "epoch": 1.8683742440412665, + "grad_norm": 0.6623159770796955, + "learning_rate": 2.8863438270352892e-05, + "loss": 0.1409, + "step": 15756 + }, + { + "epoch": 1.8684928258033915, + "grad_norm": 0.9049185357901101, + "learning_rate": 2.8861066727146778e-05, + "loss": 0.1942, + "step": 15757 + }, + { + "epoch": 1.8686114075655165, + "grad_norm": 0.8300228381872246, + "learning_rate": 2.885869514834637e-05, + "loss": 0.1734, + "step": 15758 + }, + { + "epoch": 1.8687299893276414, + "grad_norm": 0.5910055447030439, + "learning_rate": 2.8856323533973524e-05, + "loss": 0.1207, + "step": 15759 + }, + { + "epoch": 1.8688485710897664, + "grad_norm": 0.8429615327450076, + "learning_rate": 2.88539518840501e-05, + "loss": 0.1679, + "step": 15760 + }, + { + "epoch": 1.8689671528518914, + "grad_norm": 0.9171764223833185, + "learning_rate": 2.8851580198597973e-05, + "loss": 0.1818, + "step": 15761 + }, + { + "epoch": 1.8690857346140164, + "grad_norm": 0.8901020469916581, + "learning_rate": 2.8849208477638996e-05, + "loss": 0.1819, + "step": 15762 + }, + { + "epoch": 1.8692043163761414, + "grad_norm": 0.49486570768482574, + "learning_rate": 2.884683672119504e-05, + "loss": 0.1031, + "step": 15763 + }, + { + "epoch": 1.8693228981382664, + "grad_norm": 0.6198386329817547, + "learning_rate": 2.884446492928796e-05, + "loss": 0.1285, + "step": 15764 + }, + { + "epoch": 1.8694414799003913, + "grad_norm": 0.9465585582843161, + "learning_rate": 2.8842093101939636e-05, + "loss": 0.2233, + "step": 15765 + }, + { + "epoch": 1.8695600616625163, + "grad_norm": 1.0089277903918226, + "learning_rate": 2.8839721239171918e-05, + "loss": 0.2366, + "step": 15766 + }, + { + "epoch": 1.8696786434246413, + "grad_norm": 0.9047673179759875, + "learning_rate": 2.8837349341006686e-05, + "loss": 0.1688, + "step": 15767 + }, + { + "epoch": 1.8697972251867663, + "grad_norm": 0.7831040885358498, + "learning_rate": 2.883497740746579e-05, + "loss": 0.1562, + "step": 15768 + }, + { + "epoch": 1.8699158069488913, + "grad_norm": 1.1423677225678779, + "learning_rate": 2.8832605438571108e-05, + "loss": 0.2168, + "step": 15769 + }, + { + "epoch": 1.8700343887110162, + "grad_norm": 1.1470798716965198, + "learning_rate": 2.8830233434344507e-05, + "loss": 0.2409, + "step": 15770 + }, + { + "epoch": 1.8701529704731412, + "grad_norm": 0.6515268178095849, + "learning_rate": 2.8827861394807843e-05, + "loss": 0.1295, + "step": 15771 + }, + { + "epoch": 1.8702715522352662, + "grad_norm": 0.9883520431756633, + "learning_rate": 2.8825489319983e-05, + "loss": 0.1725, + "step": 15772 + }, + { + "epoch": 1.8703901339973912, + "grad_norm": 1.3011487402953013, + "learning_rate": 2.8823117209891832e-05, + "loss": 0.2724, + "step": 15773 + }, + { + "epoch": 1.8705087157595162, + "grad_norm": 0.6957918847063239, + "learning_rate": 2.8820745064556216e-05, + "loss": 0.1197, + "step": 15774 + }, + { + "epoch": 1.8706272975216411, + "grad_norm": 0.7959840492919338, + "learning_rate": 2.8818372883998012e-05, + "loss": 0.2011, + "step": 15775 + }, + { + "epoch": 1.8707458792837661, + "grad_norm": 0.7658993926237069, + "learning_rate": 2.8816000668239095e-05, + "loss": 0.1614, + "step": 15776 + }, + { + "epoch": 1.870864461045891, + "grad_norm": 1.0084203696055216, + "learning_rate": 2.881362841730133e-05, + "loss": 0.2218, + "step": 15777 + }, + { + "epoch": 1.870983042808016, + "grad_norm": 1.136394541686231, + "learning_rate": 2.881125613120659e-05, + "loss": 0.243, + "step": 15778 + }, + { + "epoch": 1.871101624570141, + "grad_norm": 0.8303080198541085, + "learning_rate": 2.8808883809976734e-05, + "loss": 0.1496, + "step": 15779 + }, + { + "epoch": 1.871220206332266, + "grad_norm": 1.0694878797020637, + "learning_rate": 2.8806511453633644e-05, + "loss": 0.1996, + "step": 15780 + }, + { + "epoch": 1.8713387880943912, + "grad_norm": 0.8846714010112036, + "learning_rate": 2.8804139062199185e-05, + "loss": 0.263, + "step": 15781 + }, + { + "epoch": 1.871457369856516, + "grad_norm": 0.8180850329633103, + "learning_rate": 2.8801766635695238e-05, + "loss": 0.2213, + "step": 15782 + }, + { + "epoch": 1.8715759516186412, + "grad_norm": 0.7483747987442596, + "learning_rate": 2.8799394174143653e-05, + "loss": 0.138, + "step": 15783 + }, + { + "epoch": 1.871694533380766, + "grad_norm": 0.8033560879254849, + "learning_rate": 2.8797021677566323e-05, + "loss": 0.192, + "step": 15784 + }, + { + "epoch": 1.8718131151428912, + "grad_norm": 0.9286437354733176, + "learning_rate": 2.8794649145985096e-05, + "loss": 0.2106, + "step": 15785 + }, + { + "epoch": 1.871931696905016, + "grad_norm": 0.8309171293093917, + "learning_rate": 2.8792276579421866e-05, + "loss": 0.2542, + "step": 15786 + }, + { + "epoch": 1.8720502786671411, + "grad_norm": 0.6702850643355743, + "learning_rate": 2.8789903977898503e-05, + "loss": 0.1754, + "step": 15787 + }, + { + "epoch": 1.872168860429266, + "grad_norm": 0.8628348642666399, + "learning_rate": 2.8787531341436857e-05, + "loss": 0.1468, + "step": 15788 + }, + { + "epoch": 1.872287442191391, + "grad_norm": 1.3118133866539863, + "learning_rate": 2.8785158670058825e-05, + "loss": 0.2987, + "step": 15789 + }, + { + "epoch": 1.8724060239535159, + "grad_norm": 0.8733196993050373, + "learning_rate": 2.878278596378627e-05, + "loss": 0.1701, + "step": 15790 + }, + { + "epoch": 1.872524605715641, + "grad_norm": 1.127254408058649, + "learning_rate": 2.8780413222641074e-05, + "loss": 0.2604, + "step": 15791 + }, + { + "epoch": 1.8726431874777658, + "grad_norm": 0.7803722910197074, + "learning_rate": 2.8778040446645095e-05, + "loss": 0.1487, + "step": 15792 + }, + { + "epoch": 1.872761769239891, + "grad_norm": 0.7392848108693871, + "learning_rate": 2.8775667635820226e-05, + "loss": 0.1392, + "step": 15793 + }, + { + "epoch": 1.8728803510020158, + "grad_norm": 0.8377128178776392, + "learning_rate": 2.877329479018832e-05, + "loss": 0.1317, + "step": 15794 + }, + { + "epoch": 1.872998932764141, + "grad_norm": 0.6956347449923469, + "learning_rate": 2.8770921909771275e-05, + "loss": 0.191, + "step": 15795 + }, + { + "epoch": 1.8731175145262657, + "grad_norm": 0.8955426961979964, + "learning_rate": 2.8768548994590944e-05, + "loss": 0.1359, + "step": 15796 + }, + { + "epoch": 1.873236096288391, + "grad_norm": 0.620209676470644, + "learning_rate": 2.8766176044669218e-05, + "loss": 0.1216, + "step": 15797 + }, + { + "epoch": 1.8733546780505157, + "grad_norm": 0.7384927971220349, + "learning_rate": 2.8763803060027967e-05, + "loss": 0.1339, + "step": 15798 + }, + { + "epoch": 1.873473259812641, + "grad_norm": 1.2793953070138269, + "learning_rate": 2.8761430040689065e-05, + "loss": 0.2337, + "step": 15799 + }, + { + "epoch": 1.8735918415747657, + "grad_norm": 1.0879184836800282, + "learning_rate": 2.8759056986674382e-05, + "loss": 0.2385, + "step": 15800 + }, + { + "epoch": 1.8737104233368909, + "grad_norm": 1.1391263099350006, + "learning_rate": 2.8756683898005817e-05, + "loss": 0.2152, + "step": 15801 + }, + { + "epoch": 1.8738290050990156, + "grad_norm": 0.7595798212423558, + "learning_rate": 2.8754310774705228e-05, + "loss": 0.1319, + "step": 15802 + }, + { + "epoch": 1.8739475868611408, + "grad_norm": 0.9215425161435666, + "learning_rate": 2.8751937616794494e-05, + "loss": 0.1973, + "step": 15803 + }, + { + "epoch": 1.8740661686232656, + "grad_norm": 0.7113222638492109, + "learning_rate": 2.8749564424295505e-05, + "loss": 0.12, + "step": 15804 + }, + { + "epoch": 1.8741847503853908, + "grad_norm": 0.6872569213757911, + "learning_rate": 2.874719119723012e-05, + "loss": 0.1036, + "step": 15805 + }, + { + "epoch": 1.8743033321475158, + "grad_norm": 0.7909412229742584, + "learning_rate": 2.874481793562024e-05, + "loss": 0.1484, + "step": 15806 + }, + { + "epoch": 1.8744219139096407, + "grad_norm": 1.3991784629185209, + "learning_rate": 2.8742444639487722e-05, + "loss": 0.3713, + "step": 15807 + }, + { + "epoch": 1.8745404956717657, + "grad_norm": 0.5811833054537823, + "learning_rate": 2.8740071308854454e-05, + "loss": 0.1286, + "step": 15808 + }, + { + "epoch": 1.8746590774338907, + "grad_norm": 0.6316879586848315, + "learning_rate": 2.8737697943742313e-05, + "loss": 0.1546, + "step": 15809 + }, + { + "epoch": 1.8747776591960157, + "grad_norm": 1.057315311457953, + "learning_rate": 2.8735324544173188e-05, + "loss": 0.2356, + "step": 15810 + }, + { + "epoch": 1.8748962409581407, + "grad_norm": 0.795547436469296, + "learning_rate": 2.8732951110168944e-05, + "loss": 0.1591, + "step": 15811 + }, + { + "epoch": 1.8750148227202657, + "grad_norm": 0.9101019986594284, + "learning_rate": 2.8730577641751476e-05, + "loss": 0.2094, + "step": 15812 + }, + { + "epoch": 1.8751334044823906, + "grad_norm": 1.431042614289283, + "learning_rate": 2.8728204138942648e-05, + "loss": 0.3572, + "step": 15813 + }, + { + "epoch": 1.8752519862445156, + "grad_norm": 0.723854330976813, + "learning_rate": 2.8725830601764353e-05, + "loss": 0.1613, + "step": 15814 + }, + { + "epoch": 1.8753705680066406, + "grad_norm": 1.0490044559103697, + "learning_rate": 2.8723457030238473e-05, + "loss": 0.2424, + "step": 15815 + }, + { + "epoch": 1.8754891497687656, + "grad_norm": 0.9200026264472219, + "learning_rate": 2.872108342438688e-05, + "loss": 0.1486, + "step": 15816 + }, + { + "epoch": 1.8756077315308906, + "grad_norm": 0.8777700234210734, + "learning_rate": 2.8718709784231463e-05, + "loss": 0.142, + "step": 15817 + }, + { + "epoch": 1.8757263132930155, + "grad_norm": 1.0240132424454647, + "learning_rate": 2.87163361097941e-05, + "loss": 0.2031, + "step": 15818 + }, + { + "epoch": 1.8758448950551405, + "grad_norm": 0.5447338092567117, + "learning_rate": 2.8713962401096676e-05, + "loss": 0.1402, + "step": 15819 + }, + { + "epoch": 1.8759634768172655, + "grad_norm": 0.7489182593861344, + "learning_rate": 2.871158865816107e-05, + "loss": 0.1982, + "step": 15820 + }, + { + "epoch": 1.8760820585793905, + "grad_norm": 0.7011363253405474, + "learning_rate": 2.8709214881009178e-05, + "loss": 0.1326, + "step": 15821 + }, + { + "epoch": 1.8762006403415155, + "grad_norm": 1.0498203882457546, + "learning_rate": 2.8706841069662866e-05, + "loss": 0.2037, + "step": 15822 + }, + { + "epoch": 1.8763192221036404, + "grad_norm": 0.8399437081432923, + "learning_rate": 2.8704467224144026e-05, + "loss": 0.1521, + "step": 15823 + }, + { + "epoch": 1.8764378038657654, + "grad_norm": 0.9219637173389194, + "learning_rate": 2.870209334447454e-05, + "loss": 0.1953, + "step": 15824 + }, + { + "epoch": 1.8765563856278904, + "grad_norm": 0.7250677225635698, + "learning_rate": 2.86997194306763e-05, + "loss": 0.1235, + "step": 15825 + }, + { + "epoch": 1.8766749673900154, + "grad_norm": 0.8444827082249448, + "learning_rate": 2.8697345482771175e-05, + "loss": 0.1553, + "step": 15826 + }, + { + "epoch": 1.8767935491521404, + "grad_norm": 1.149716102343109, + "learning_rate": 2.8694971500781064e-05, + "loss": 0.2062, + "step": 15827 + }, + { + "epoch": 1.8769121309142653, + "grad_norm": 1.0112287717605917, + "learning_rate": 2.8692597484727847e-05, + "loss": 0.1754, + "step": 15828 + }, + { + "epoch": 1.8770307126763903, + "grad_norm": 0.9034522041821953, + "learning_rate": 2.86902234346334e-05, + "loss": 0.2108, + "step": 15829 + }, + { + "epoch": 1.8771492944385153, + "grad_norm": 0.7191857540201086, + "learning_rate": 2.8687849350519626e-05, + "loss": 0.147, + "step": 15830 + }, + { + "epoch": 1.8772678762006403, + "grad_norm": 0.8464233473673629, + "learning_rate": 2.86854752324084e-05, + "loss": 0.1819, + "step": 15831 + }, + { + "epoch": 1.8773864579627655, + "grad_norm": 0.9532910702993551, + "learning_rate": 2.8683101080321616e-05, + "loss": 0.1986, + "step": 15832 + }, + { + "epoch": 1.8775050397248902, + "grad_norm": 1.2533274050416605, + "learning_rate": 2.8680726894281152e-05, + "loss": 0.3698, + "step": 15833 + }, + { + "epoch": 1.8776236214870154, + "grad_norm": 0.8918494308597169, + "learning_rate": 2.8678352674308908e-05, + "loss": 0.1861, + "step": 15834 + }, + { + "epoch": 1.8777422032491402, + "grad_norm": 0.7399863162993228, + "learning_rate": 2.8675978420426753e-05, + "loss": 0.1346, + "step": 15835 + }, + { + "epoch": 1.8778607850112654, + "grad_norm": 0.7281018595813784, + "learning_rate": 2.8673604132656595e-05, + "loss": 0.181, + "step": 15836 + }, + { + "epoch": 1.8779793667733902, + "grad_norm": 0.7512920586800717, + "learning_rate": 2.86712298110203e-05, + "loss": 0.1375, + "step": 15837 + }, + { + "epoch": 1.8780979485355154, + "grad_norm": 0.6531525289679379, + "learning_rate": 2.8668855455539773e-05, + "loss": 0.1252, + "step": 15838 + }, + { + "epoch": 1.8782165302976401, + "grad_norm": 0.6870066980524422, + "learning_rate": 2.86664810662369e-05, + "loss": 0.1408, + "step": 15839 + }, + { + "epoch": 1.8783351120597653, + "grad_norm": 0.8085418275842104, + "learning_rate": 2.866410664313357e-05, + "loss": 0.1677, + "step": 15840 + }, + { + "epoch": 1.87845369382189, + "grad_norm": 0.6505484659622596, + "learning_rate": 2.866173218625166e-05, + "loss": 0.137, + "step": 15841 + }, + { + "epoch": 1.8785722755840153, + "grad_norm": 0.7438816055418921, + "learning_rate": 2.8659357695613076e-05, + "loss": 0.1593, + "step": 15842 + }, + { + "epoch": 1.87869085734614, + "grad_norm": 0.637284376388411, + "learning_rate": 2.8656983171239705e-05, + "loss": 0.1574, + "step": 15843 + }, + { + "epoch": 1.8788094391082653, + "grad_norm": 0.6428622878417586, + "learning_rate": 2.865460861315343e-05, + "loss": 0.1611, + "step": 15844 + }, + { + "epoch": 1.87892802087039, + "grad_norm": 0.7193178189529267, + "learning_rate": 2.8652234021376147e-05, + "loss": 0.1335, + "step": 15845 + }, + { + "epoch": 1.8790466026325152, + "grad_norm": 1.0333904552913857, + "learning_rate": 2.864985939592974e-05, + "loss": 0.2223, + "step": 15846 + }, + { + "epoch": 1.87916518439464, + "grad_norm": 0.7839281583015862, + "learning_rate": 2.864748473683611e-05, + "loss": 0.2335, + "step": 15847 + }, + { + "epoch": 1.8792837661567652, + "grad_norm": 0.8217527134566849, + "learning_rate": 2.864511004411714e-05, + "loss": 0.1678, + "step": 15848 + }, + { + "epoch": 1.87940234791889, + "grad_norm": 0.8490106958153745, + "learning_rate": 2.864273531779473e-05, + "loss": 0.1593, + "step": 15849 + }, + { + "epoch": 1.8795209296810151, + "grad_norm": 1.1899800683976633, + "learning_rate": 2.864036055789076e-05, + "loss": 0.268, + "step": 15850 + }, + { + "epoch": 1.87963951144314, + "grad_norm": 1.0204586214900662, + "learning_rate": 2.8637985764427138e-05, + "loss": 0.1938, + "step": 15851 + }, + { + "epoch": 1.879758093205265, + "grad_norm": 1.0863306914526316, + "learning_rate": 2.8635610937425743e-05, + "loss": 0.2479, + "step": 15852 + }, + { + "epoch": 1.8798766749673899, + "grad_norm": 1.005897280542907, + "learning_rate": 2.8633236076908475e-05, + "loss": 0.2022, + "step": 15853 + }, + { + "epoch": 1.879995256729515, + "grad_norm": 1.4397962790345318, + "learning_rate": 2.863086118289723e-05, + "loss": 0.3559, + "step": 15854 + }, + { + "epoch": 1.8801138384916398, + "grad_norm": 0.9109673127382304, + "learning_rate": 2.8628486255413895e-05, + "loss": 0.1868, + "step": 15855 + }, + { + "epoch": 1.880232420253765, + "grad_norm": 0.7250079848461775, + "learning_rate": 2.8626111294480362e-05, + "loss": 0.1733, + "step": 15856 + }, + { + "epoch": 1.88035100201589, + "grad_norm": 0.8702375394145215, + "learning_rate": 2.8623736300118532e-05, + "loss": 0.1633, + "step": 15857 + }, + { + "epoch": 1.880469583778015, + "grad_norm": 0.8460350651137697, + "learning_rate": 2.8621361272350295e-05, + "loss": 0.1626, + "step": 15858 + }, + { + "epoch": 1.88058816554014, + "grad_norm": 0.803103142835801, + "learning_rate": 2.8618986211197545e-05, + "loss": 0.1575, + "step": 15859 + }, + { + "epoch": 1.880706747302265, + "grad_norm": 0.6411971892017744, + "learning_rate": 2.861661111668219e-05, + "loss": 0.1559, + "step": 15860 + }, + { + "epoch": 1.88082532906439, + "grad_norm": 0.7238511811597965, + "learning_rate": 2.861423598882611e-05, + "loss": 0.1957, + "step": 15861 + }, + { + "epoch": 1.880943910826515, + "grad_norm": 0.923464860317249, + "learning_rate": 2.861186082765121e-05, + "loss": 0.1773, + "step": 15862 + }, + { + "epoch": 1.88106249258864, + "grad_norm": 0.7339977026754959, + "learning_rate": 2.8609485633179378e-05, + "loss": 0.1919, + "step": 15863 + }, + { + "epoch": 1.8811810743507649, + "grad_norm": 1.2446982130460174, + "learning_rate": 2.860711040543252e-05, + "loss": 0.2924, + "step": 15864 + }, + { + "epoch": 1.8812996561128899, + "grad_norm": 0.9480167786009551, + "learning_rate": 2.860473514443252e-05, + "loss": 0.2013, + "step": 15865 + }, + { + "epoch": 1.8814182378750148, + "grad_norm": 0.8041291829935788, + "learning_rate": 2.8602359850201288e-05, + "loss": 0.14, + "step": 15866 + }, + { + "epoch": 1.8815368196371398, + "grad_norm": 0.8451106182743051, + "learning_rate": 2.859998452276071e-05, + "loss": 0.1783, + "step": 15867 + }, + { + "epoch": 1.8816554013992648, + "grad_norm": 0.7277035876620315, + "learning_rate": 2.8597609162132694e-05, + "loss": 0.1817, + "step": 15868 + }, + { + "epoch": 1.8817739831613898, + "grad_norm": 0.8260323323414726, + "learning_rate": 2.8595233768339125e-05, + "loss": 0.1541, + "step": 15869 + }, + { + "epoch": 1.8818925649235148, + "grad_norm": 0.8453823535577939, + "learning_rate": 2.8592858341401922e-05, + "loss": 0.174, + "step": 15870 + }, + { + "epoch": 1.8820111466856397, + "grad_norm": 0.7986907549273424, + "learning_rate": 2.8590482881342963e-05, + "loss": 0.165, + "step": 15871 + }, + { + "epoch": 1.8821297284477647, + "grad_norm": 0.7616902978251614, + "learning_rate": 2.8588107388184155e-05, + "loss": 0.1701, + "step": 15872 + }, + { + "epoch": 1.8822483102098897, + "grad_norm": 0.6637644287942615, + "learning_rate": 2.85857318619474e-05, + "loss": 0.1475, + "step": 15873 + }, + { + "epoch": 1.8823668919720147, + "grad_norm": 0.7767490851495146, + "learning_rate": 2.8583356302654597e-05, + "loss": 0.1935, + "step": 15874 + }, + { + "epoch": 1.8824854737341397, + "grad_norm": 0.8769871254092575, + "learning_rate": 2.858098071032764e-05, + "loss": 0.181, + "step": 15875 + }, + { + "epoch": 1.8826040554962646, + "grad_norm": 0.8128251098503485, + "learning_rate": 2.857860508498843e-05, + "loss": 0.1643, + "step": 15876 + }, + { + "epoch": 1.8827226372583896, + "grad_norm": 2.1104847835798375, + "learning_rate": 2.8576229426658873e-05, + "loss": 0.4531, + "step": 15877 + }, + { + "epoch": 1.8828412190205146, + "grad_norm": 1.1747407170671436, + "learning_rate": 2.857385373536086e-05, + "loss": 0.2605, + "step": 15878 + }, + { + "epoch": 1.8829598007826396, + "grad_norm": 1.1611295695000332, + "learning_rate": 2.8571478011116303e-05, + "loss": 0.2616, + "step": 15879 + }, + { + "epoch": 1.8830783825447646, + "grad_norm": 0.633385296717795, + "learning_rate": 2.8569102253947094e-05, + "loss": 0.1255, + "step": 15880 + }, + { + "epoch": 1.8831969643068898, + "grad_norm": 0.7620284747839683, + "learning_rate": 2.8566726463875147e-05, + "loss": 0.1826, + "step": 15881 + }, + { + "epoch": 1.8833155460690145, + "grad_norm": 0.817773627716119, + "learning_rate": 2.856435064092235e-05, + "loss": 0.1932, + "step": 15882 + }, + { + "epoch": 1.8834341278311397, + "grad_norm": 0.789585754436894, + "learning_rate": 2.856197478511061e-05, + "loss": 0.1783, + "step": 15883 + }, + { + "epoch": 1.8835527095932645, + "grad_norm": 0.8667252616256016, + "learning_rate": 2.8559598896461837e-05, + "loss": 0.2143, + "step": 15884 + }, + { + "epoch": 1.8836712913553897, + "grad_norm": 0.9317505078209753, + "learning_rate": 2.8557222974997923e-05, + "loss": 0.2333, + "step": 15885 + }, + { + "epoch": 1.8837898731175144, + "grad_norm": 0.7069756825527386, + "learning_rate": 2.8554847020740776e-05, + "loss": 0.1955, + "step": 15886 + }, + { + "epoch": 1.8839084548796396, + "grad_norm": 0.8742708423228294, + "learning_rate": 2.8552471033712296e-05, + "loss": 0.1693, + "step": 15887 + }, + { + "epoch": 1.8840270366417644, + "grad_norm": 0.6809683033547083, + "learning_rate": 2.8550095013934386e-05, + "loss": 0.1607, + "step": 15888 + }, + { + "epoch": 1.8841456184038896, + "grad_norm": 0.9329416283888812, + "learning_rate": 2.854771896142896e-05, + "loss": 0.1548, + "step": 15889 + }, + { + "epoch": 1.8842642001660144, + "grad_norm": 0.8729116458764042, + "learning_rate": 2.854534287621792e-05, + "loss": 0.1865, + "step": 15890 + }, + { + "epoch": 1.8843827819281396, + "grad_norm": 0.8183274914108097, + "learning_rate": 2.8542966758323154e-05, + "loss": 0.1331, + "step": 15891 + }, + { + "epoch": 1.8845013636902643, + "grad_norm": 0.6617296450470821, + "learning_rate": 2.8540590607766592e-05, + "loss": 0.1662, + "step": 15892 + }, + { + "epoch": 1.8846199454523895, + "grad_norm": 0.9259566154390383, + "learning_rate": 2.853821442457012e-05, + "loss": 0.1696, + "step": 15893 + }, + { + "epoch": 1.8847385272145143, + "grad_norm": 0.762991301989314, + "learning_rate": 2.853583820875565e-05, + "loss": 0.138, + "step": 15894 + }, + { + "epoch": 1.8848571089766395, + "grad_norm": 0.6017013025675343, + "learning_rate": 2.853346196034509e-05, + "loss": 0.1263, + "step": 15895 + }, + { + "epoch": 1.8849756907387643, + "grad_norm": 1.4625274894902123, + "learning_rate": 2.8531085679360344e-05, + "loss": 0.2973, + "step": 15896 + }, + { + "epoch": 1.8850942725008895, + "grad_norm": 0.7515366610833764, + "learning_rate": 2.8528709365823313e-05, + "loss": 0.1634, + "step": 15897 + }, + { + "epoch": 1.8852128542630142, + "grad_norm": 0.8133817268513323, + "learning_rate": 2.8526333019755907e-05, + "loss": 0.1764, + "step": 15898 + }, + { + "epoch": 1.8853314360251394, + "grad_norm": 0.6771959429321358, + "learning_rate": 2.8523956641180037e-05, + "loss": 0.1677, + "step": 15899 + }, + { + "epoch": 1.8854500177872642, + "grad_norm": 0.6940516396894323, + "learning_rate": 2.852158023011761e-05, + "loss": 0.1468, + "step": 15900 + }, + { + "epoch": 1.8855685995493894, + "grad_norm": 0.7254744611844434, + "learning_rate": 2.8519203786590537e-05, + "loss": 0.1798, + "step": 15901 + }, + { + "epoch": 1.8856871813115141, + "grad_norm": 0.621228437857134, + "learning_rate": 2.8516827310620716e-05, + "loss": 0.1474, + "step": 15902 + }, + { + "epoch": 1.8858057630736393, + "grad_norm": 0.9662905380425313, + "learning_rate": 2.8514450802230057e-05, + "loss": 0.197, + "step": 15903 + }, + { + "epoch": 1.885924344835764, + "grad_norm": 1.4066582355604234, + "learning_rate": 2.8512074261440473e-05, + "loss": 0.3084, + "step": 15904 + }, + { + "epoch": 1.8860429265978893, + "grad_norm": 0.7810612479948721, + "learning_rate": 2.8509697688273878e-05, + "loss": 0.1443, + "step": 15905 + }, + { + "epoch": 1.8861615083600143, + "grad_norm": 0.920597089608908, + "learning_rate": 2.8507321082752163e-05, + "loss": 0.2234, + "step": 15906 + }, + { + "epoch": 1.8862800901221393, + "grad_norm": 0.926437852844144, + "learning_rate": 2.8504944444897258e-05, + "loss": 0.2257, + "step": 15907 + }, + { + "epoch": 1.8863986718842642, + "grad_norm": 1.0186248551516073, + "learning_rate": 2.850256777473106e-05, + "loss": 0.1741, + "step": 15908 + }, + { + "epoch": 1.8865172536463892, + "grad_norm": 1.026625390061044, + "learning_rate": 2.850019107227548e-05, + "loss": 0.2385, + "step": 15909 + }, + { + "epoch": 1.8866358354085142, + "grad_norm": 0.7178550748443047, + "learning_rate": 2.8497814337552426e-05, + "loss": 0.1613, + "step": 15910 + }, + { + "epoch": 1.8867544171706392, + "grad_norm": 1.2247029725639094, + "learning_rate": 2.8495437570583822e-05, + "loss": 0.2535, + "step": 15911 + }, + { + "epoch": 1.8868729989327642, + "grad_norm": 0.7130525817751445, + "learning_rate": 2.8493060771391567e-05, + "loss": 0.137, + "step": 15912 + }, + { + "epoch": 1.8869915806948891, + "grad_norm": 1.266562472512775, + "learning_rate": 2.8490683939997576e-05, + "loss": 0.3248, + "step": 15913 + }, + { + "epoch": 1.8871101624570141, + "grad_norm": 0.6758051391560961, + "learning_rate": 2.848830707642376e-05, + "loss": 0.1927, + "step": 15914 + }, + { + "epoch": 1.887228744219139, + "grad_norm": 0.7817099356362078, + "learning_rate": 2.8485930180692026e-05, + "loss": 0.1616, + "step": 15915 + }, + { + "epoch": 1.887347325981264, + "grad_norm": 0.6016896803533841, + "learning_rate": 2.8483553252824296e-05, + "loss": 0.1138, + "step": 15916 + }, + { + "epoch": 1.887465907743389, + "grad_norm": 1.089766600350401, + "learning_rate": 2.848117629284247e-05, + "loss": 0.2139, + "step": 15917 + }, + { + "epoch": 1.887584489505514, + "grad_norm": 0.8549196696737729, + "learning_rate": 2.847879930076847e-05, + "loss": 0.1576, + "step": 15918 + }, + { + "epoch": 1.887703071267639, + "grad_norm": 1.0030523897681358, + "learning_rate": 2.847642227662421e-05, + "loss": 0.1476, + "step": 15919 + }, + { + "epoch": 1.887821653029764, + "grad_norm": 0.9284807810343103, + "learning_rate": 2.84740452204316e-05, + "loss": 0.2019, + "step": 15920 + }, + { + "epoch": 1.887940234791889, + "grad_norm": 0.6662691672320165, + "learning_rate": 2.847166813221255e-05, + "loss": 0.1313, + "step": 15921 + }, + { + "epoch": 1.888058816554014, + "grad_norm": 0.7507297378417578, + "learning_rate": 2.846929101198898e-05, + "loss": 0.1334, + "step": 15922 + }, + { + "epoch": 1.888177398316139, + "grad_norm": 0.8617770205284616, + "learning_rate": 2.84669138597828e-05, + "loss": 0.185, + "step": 15923 + }, + { + "epoch": 1.888295980078264, + "grad_norm": 0.7449545921079807, + "learning_rate": 2.846453667561593e-05, + "loss": 0.2035, + "step": 15924 + }, + { + "epoch": 1.888414561840389, + "grad_norm": 0.7908801289333157, + "learning_rate": 2.8462159459510273e-05, + "loss": 0.17, + "step": 15925 + }, + { + "epoch": 1.888533143602514, + "grad_norm": 0.7203711478671512, + "learning_rate": 2.8459782211487758e-05, + "loss": 0.1826, + "step": 15926 + }, + { + "epoch": 1.8886517253646389, + "grad_norm": 0.9813860666403983, + "learning_rate": 2.8457404931570285e-05, + "loss": 0.1906, + "step": 15927 + }, + { + "epoch": 1.8887703071267639, + "grad_norm": 0.7170804898980425, + "learning_rate": 2.845502761977978e-05, + "loss": 0.1801, + "step": 15928 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.5859171323721092, + "learning_rate": 2.845265027613816e-05, + "loss": 0.1633, + "step": 15929 + }, + { + "epoch": 1.8890074706510138, + "grad_norm": 0.9110420036918104, + "learning_rate": 2.8450272900667335e-05, + "loss": 0.1825, + "step": 15930 + }, + { + "epoch": 1.8891260524131388, + "grad_norm": 0.7261056264354866, + "learning_rate": 2.8447895493389233e-05, + "loss": 0.1787, + "step": 15931 + }, + { + "epoch": 1.889244634175264, + "grad_norm": 0.9258251772076243, + "learning_rate": 2.8445518054325752e-05, + "loss": 0.2418, + "step": 15932 + }, + { + "epoch": 1.8893632159373888, + "grad_norm": 0.8227350935006212, + "learning_rate": 2.8443140583498827e-05, + "loss": 0.1646, + "step": 15933 + }, + { + "epoch": 1.889481797699514, + "grad_norm": 0.8609373484457465, + "learning_rate": 2.8440763080930365e-05, + "loss": 0.2271, + "step": 15934 + }, + { + "epoch": 1.8896003794616387, + "grad_norm": 0.5488175149450569, + "learning_rate": 2.84383855466423e-05, + "loss": 0.0874, + "step": 15935 + }, + { + "epoch": 1.889718961223764, + "grad_norm": 0.9039248763888512, + "learning_rate": 2.843600798065652e-05, + "loss": 0.1912, + "step": 15936 + }, + { + "epoch": 1.8898375429858887, + "grad_norm": 1.0113086047395727, + "learning_rate": 2.8433630382994963e-05, + "loss": 0.1845, + "step": 15937 + }, + { + "epoch": 1.8899561247480139, + "grad_norm": 0.898896484791771, + "learning_rate": 2.8431252753679544e-05, + "loss": 0.1886, + "step": 15938 + }, + { + "epoch": 1.8900747065101386, + "grad_norm": 0.936266119465778, + "learning_rate": 2.8428875092732188e-05, + "loss": 0.2138, + "step": 15939 + }, + { + "epoch": 1.8901932882722638, + "grad_norm": 0.7482476850792282, + "learning_rate": 2.8426497400174802e-05, + "loss": 0.1751, + "step": 15940 + }, + { + "epoch": 1.8903118700343886, + "grad_norm": 0.7719530087828458, + "learning_rate": 2.842411967602932e-05, + "loss": 0.1551, + "step": 15941 + }, + { + "epoch": 1.8904304517965138, + "grad_norm": 0.6949046764361371, + "learning_rate": 2.8421741920317646e-05, + "loss": 0.1272, + "step": 15942 + }, + { + "epoch": 1.8905490335586386, + "grad_norm": 0.8440646701526194, + "learning_rate": 2.841936413306171e-05, + "loss": 0.1855, + "step": 15943 + }, + { + "epoch": 1.8906676153207638, + "grad_norm": 0.7172208503566945, + "learning_rate": 2.8416986314283434e-05, + "loss": 0.1425, + "step": 15944 + }, + { + "epoch": 1.8907861970828885, + "grad_norm": 0.7168880161380836, + "learning_rate": 2.8414608464004722e-05, + "loss": 0.1568, + "step": 15945 + }, + { + "epoch": 1.8909047788450137, + "grad_norm": 0.8351415017095588, + "learning_rate": 2.8412230582247525e-05, + "loss": 0.1702, + "step": 15946 + }, + { + "epoch": 1.8910233606071385, + "grad_norm": 0.9548518545989755, + "learning_rate": 2.8409852669033733e-05, + "loss": 0.2157, + "step": 15947 + }, + { + "epoch": 1.8911419423692637, + "grad_norm": 0.9038200586059147, + "learning_rate": 2.840747472438528e-05, + "loss": 0.1827, + "step": 15948 + }, + { + "epoch": 1.8912605241313885, + "grad_norm": 0.8807098750210066, + "learning_rate": 2.840509674832409e-05, + "loss": 0.1811, + "step": 15949 + }, + { + "epoch": 1.8913791058935137, + "grad_norm": 0.8628302054950924, + "learning_rate": 2.8402718740872093e-05, + "loss": 0.205, + "step": 15950 + }, + { + "epoch": 1.8914976876556384, + "grad_norm": 1.5663684429354032, + "learning_rate": 2.8400340702051192e-05, + "loss": 0.3297, + "step": 15951 + }, + { + "epoch": 1.8916162694177636, + "grad_norm": 1.1562137038263798, + "learning_rate": 2.839796263188333e-05, + "loss": 0.2213, + "step": 15952 + }, + { + "epoch": 1.8917348511798884, + "grad_norm": 0.8544874324691788, + "learning_rate": 2.8395584530390408e-05, + "loss": 0.1937, + "step": 15953 + }, + { + "epoch": 1.8918534329420136, + "grad_norm": 0.711607487327528, + "learning_rate": 2.839320639759437e-05, + "loss": 0.1496, + "step": 15954 + }, + { + "epoch": 1.8919720147041383, + "grad_norm": 1.049772365948029, + "learning_rate": 2.839082823351712e-05, + "loss": 0.2261, + "step": 15955 + }, + { + "epoch": 1.8920905964662635, + "grad_norm": 0.8264796534433995, + "learning_rate": 2.8388450038180604e-05, + "loss": 0.2026, + "step": 15956 + }, + { + "epoch": 1.8922091782283885, + "grad_norm": 0.9864084744574206, + "learning_rate": 2.8386071811606722e-05, + "loss": 0.2139, + "step": 15957 + }, + { + "epoch": 1.8923277599905135, + "grad_norm": 0.8968602318636496, + "learning_rate": 2.8383693553817414e-05, + "loss": 0.1876, + "step": 15958 + }, + { + "epoch": 1.8924463417526385, + "grad_norm": 0.7645094403579633, + "learning_rate": 2.8381315264834602e-05, + "loss": 0.1686, + "step": 15959 + }, + { + "epoch": 1.8925649235147635, + "grad_norm": 1.4766509873499134, + "learning_rate": 2.8378936944680205e-05, + "loss": 0.2581, + "step": 15960 + }, + { + "epoch": 1.8926835052768884, + "grad_norm": 1.8381490804590244, + "learning_rate": 2.8376558593376162e-05, + "loss": 0.3022, + "step": 15961 + }, + { + "epoch": 1.8928020870390134, + "grad_norm": 0.8841167458257453, + "learning_rate": 2.837418021094438e-05, + "loss": 0.1906, + "step": 15962 + }, + { + "epoch": 1.8929206688011384, + "grad_norm": 0.891420899134094, + "learning_rate": 2.8371801797406806e-05, + "loss": 0.2777, + "step": 15963 + }, + { + "epoch": 1.8930392505632634, + "grad_norm": 0.8881108727466321, + "learning_rate": 2.836942335278534e-05, + "loss": 0.2192, + "step": 15964 + }, + { + "epoch": 1.8931578323253884, + "grad_norm": 0.6629967210902918, + "learning_rate": 2.8367044877101934e-05, + "loss": 0.1414, + "step": 15965 + }, + { + "epoch": 1.8932764140875133, + "grad_norm": 0.8083390958531154, + "learning_rate": 2.8364666370378502e-05, + "loss": 0.1563, + "step": 15966 + }, + { + "epoch": 1.8933949958496383, + "grad_norm": 0.8255997458947091, + "learning_rate": 2.8362287832636964e-05, + "loss": 0.1721, + "step": 15967 + }, + { + "epoch": 1.8935135776117633, + "grad_norm": 1.1673362603067714, + "learning_rate": 2.8359909263899258e-05, + "loss": 0.1988, + "step": 15968 + }, + { + "epoch": 1.8936321593738883, + "grad_norm": 0.501122125091316, + "learning_rate": 2.8357530664187316e-05, + "loss": 0.105, + "step": 15969 + }, + { + "epoch": 1.8937507411360133, + "grad_norm": 0.8538961393301169, + "learning_rate": 2.8355152033523048e-05, + "loss": 0.2216, + "step": 15970 + }, + { + "epoch": 1.8938693228981383, + "grad_norm": 0.8679002423036752, + "learning_rate": 2.83527733719284e-05, + "loss": 0.1749, + "step": 15971 + }, + { + "epoch": 1.8939879046602632, + "grad_norm": 0.9136354346642834, + "learning_rate": 2.8350394679425292e-05, + "loss": 0.1551, + "step": 15972 + }, + { + "epoch": 1.8941064864223882, + "grad_norm": 0.7028766001817698, + "learning_rate": 2.834801595603565e-05, + "loss": 0.1305, + "step": 15973 + }, + { + "epoch": 1.8942250681845132, + "grad_norm": 0.5602754839121322, + "learning_rate": 2.8345637201781415e-05, + "loss": 0.1077, + "step": 15974 + }, + { + "epoch": 1.8943436499466382, + "grad_norm": 0.9522264462847185, + "learning_rate": 2.8343258416684498e-05, + "loss": 0.171, + "step": 15975 + }, + { + "epoch": 1.8944622317087632, + "grad_norm": 0.7910135684669101, + "learning_rate": 2.8340879600766847e-05, + "loss": 0.1925, + "step": 15976 + }, + { + "epoch": 1.8945808134708881, + "grad_norm": 0.8222129006091862, + "learning_rate": 2.8338500754050373e-05, + "loss": 0.1896, + "step": 15977 + }, + { + "epoch": 1.8946993952330131, + "grad_norm": 0.8210951664670892, + "learning_rate": 2.833612187655702e-05, + "loss": 0.1704, + "step": 15978 + }, + { + "epoch": 1.894817976995138, + "grad_norm": 0.7233371948722629, + "learning_rate": 2.8333742968308713e-05, + "loss": 0.1233, + "step": 15979 + }, + { + "epoch": 1.894936558757263, + "grad_norm": 0.6901952476435219, + "learning_rate": 2.833136402932739e-05, + "loss": 0.1377, + "step": 15980 + }, + { + "epoch": 1.8950551405193883, + "grad_norm": 0.6007473416266343, + "learning_rate": 2.832898505963497e-05, + "loss": 0.1199, + "step": 15981 + }, + { + "epoch": 1.895173722281513, + "grad_norm": 0.6342216059157425, + "learning_rate": 2.8326606059253398e-05, + "loss": 0.1425, + "step": 15982 + }, + { + "epoch": 1.8952923040436382, + "grad_norm": 0.8188516369177005, + "learning_rate": 2.8324227028204587e-05, + "loss": 0.2201, + "step": 15983 + }, + { + "epoch": 1.895410885805763, + "grad_norm": 0.7978125267332222, + "learning_rate": 2.832184796651049e-05, + "loss": 0.178, + "step": 15984 + }, + { + "epoch": 1.8955294675678882, + "grad_norm": 0.9694068910362328, + "learning_rate": 2.8319468874193016e-05, + "loss": 0.243, + "step": 15985 + }, + { + "epoch": 1.895648049330013, + "grad_norm": 0.7810894317955301, + "learning_rate": 2.8317089751274118e-05, + "loss": 0.146, + "step": 15986 + }, + { + "epoch": 1.8957666310921382, + "grad_norm": 0.773685022053031, + "learning_rate": 2.831471059777571e-05, + "loss": 0.142, + "step": 15987 + }, + { + "epoch": 1.895885212854263, + "grad_norm": 1.0319921775921364, + "learning_rate": 2.8312331413719746e-05, + "loss": 0.26, + "step": 15988 + }, + { + "epoch": 1.8960037946163881, + "grad_norm": 0.8716093468606806, + "learning_rate": 2.830995219912815e-05, + "loss": 0.1692, + "step": 15989 + }, + { + "epoch": 1.8961223763785129, + "grad_norm": 0.7674415364921217, + "learning_rate": 2.8307572954022844e-05, + "loss": 0.1288, + "step": 15990 + }, + { + "epoch": 1.896240958140638, + "grad_norm": 0.8893190144538159, + "learning_rate": 2.830519367842578e-05, + "loss": 0.1292, + "step": 15991 + }, + { + "epoch": 1.8963595399027628, + "grad_norm": 0.7911529282978075, + "learning_rate": 2.8302814372358876e-05, + "loss": 0.1566, + "step": 15992 + }, + { + "epoch": 1.896478121664888, + "grad_norm": 0.8400210912787021, + "learning_rate": 2.830043503584408e-05, + "loss": 0.1769, + "step": 15993 + }, + { + "epoch": 1.8965967034270128, + "grad_norm": 1.088465900908281, + "learning_rate": 2.8298055668903317e-05, + "loss": 0.2214, + "step": 15994 + }, + { + "epoch": 1.896715285189138, + "grad_norm": 0.7110240727868526, + "learning_rate": 2.829567627155853e-05, + "loss": 0.1636, + "step": 15995 + }, + { + "epoch": 1.8968338669512628, + "grad_norm": 0.7466832915856868, + "learning_rate": 2.8293296843831642e-05, + "loss": 0.1739, + "step": 15996 + }, + { + "epoch": 1.896952448713388, + "grad_norm": 1.0567876665679286, + "learning_rate": 2.8290917385744598e-05, + "loss": 0.2308, + "step": 15997 + }, + { + "epoch": 1.8970710304755127, + "grad_norm": 1.0959639892085768, + "learning_rate": 2.8288537897319328e-05, + "loss": 0.2597, + "step": 15998 + }, + { + "epoch": 1.897189612237638, + "grad_norm": 0.9944621634276215, + "learning_rate": 2.8286158378577776e-05, + "loss": 0.1995, + "step": 15999 + }, + { + "epoch": 1.8973081939997627, + "grad_norm": 0.6940648908477219, + "learning_rate": 2.8283778829541874e-05, + "loss": 0.1359, + "step": 16000 + }, + { + "epoch": 1.897426775761888, + "grad_norm": 0.8136907410973065, + "learning_rate": 2.8281399250233554e-05, + "loss": 0.1656, + "step": 16001 + }, + { + "epoch": 1.8975453575240127, + "grad_norm": 0.8845120568813197, + "learning_rate": 2.827901964067476e-05, + "loss": 0.1882, + "step": 16002 + }, + { + "epoch": 1.8976639392861379, + "grad_norm": 0.7977548385600857, + "learning_rate": 2.8276640000887422e-05, + "loss": 0.1366, + "step": 16003 + }, + { + "epoch": 1.8977825210482626, + "grad_norm": 0.7709988463076939, + "learning_rate": 2.8274260330893487e-05, + "loss": 0.176, + "step": 16004 + }, + { + "epoch": 1.8979011028103878, + "grad_norm": 0.8027915723846025, + "learning_rate": 2.8271880630714886e-05, + "loss": 0.1612, + "step": 16005 + }, + { + "epoch": 1.8980196845725128, + "grad_norm": 0.9014290223264383, + "learning_rate": 2.8269500900373557e-05, + "loss": 0.1791, + "step": 16006 + }, + { + "epoch": 1.8981382663346378, + "grad_norm": 0.682384806357561, + "learning_rate": 2.826712113989144e-05, + "loss": 0.117, + "step": 16007 + }, + { + "epoch": 1.8982568480967628, + "grad_norm": 1.0227507624447483, + "learning_rate": 2.8264741349290464e-05, + "loss": 0.2043, + "step": 16008 + }, + { + "epoch": 1.8983754298588877, + "grad_norm": 0.7678783514368877, + "learning_rate": 2.8262361528592583e-05, + "loss": 0.123, + "step": 16009 + }, + { + "epoch": 1.8984940116210127, + "grad_norm": 1.021820293845396, + "learning_rate": 2.825998167781973e-05, + "loss": 0.2025, + "step": 16010 + }, + { + "epoch": 1.8986125933831377, + "grad_norm": 0.8464141163006766, + "learning_rate": 2.8257601796993844e-05, + "loss": 0.2006, + "step": 16011 + }, + { + "epoch": 1.8987311751452627, + "grad_norm": 0.7293195641664636, + "learning_rate": 2.825522188613686e-05, + "loss": 0.1325, + "step": 16012 + }, + { + "epoch": 1.8988497569073877, + "grad_norm": 1.1563178343750835, + "learning_rate": 2.825284194527073e-05, + "loss": 0.2007, + "step": 16013 + }, + { + "epoch": 1.8989683386695126, + "grad_norm": 0.9231869470982893, + "learning_rate": 2.8250461974417375e-05, + "loss": 0.1718, + "step": 16014 + }, + { + "epoch": 1.8990869204316376, + "grad_norm": 0.8526594602045873, + "learning_rate": 2.824808197359876e-05, + "loss": 0.1674, + "step": 16015 + }, + { + "epoch": 1.8992055021937626, + "grad_norm": 1.8119774370952464, + "learning_rate": 2.82457019428368e-05, + "loss": 0.3478, + "step": 16016 + }, + { + "epoch": 1.8993240839558876, + "grad_norm": 1.0251312429323536, + "learning_rate": 2.824332188215345e-05, + "loss": 0.2506, + "step": 16017 + }, + { + "epoch": 1.8994426657180126, + "grad_norm": 0.9863246662577184, + "learning_rate": 2.824094179157065e-05, + "loss": 0.2059, + "step": 16018 + }, + { + "epoch": 1.8995612474801375, + "grad_norm": 0.985841786567784, + "learning_rate": 2.8238561671110355e-05, + "loss": 0.2147, + "step": 16019 + }, + { + "epoch": 1.8996798292422625, + "grad_norm": 0.9425934850450475, + "learning_rate": 2.8236181520794474e-05, + "loss": 0.127, + "step": 16020 + }, + { + "epoch": 1.8997984110043875, + "grad_norm": 1.2991209508884507, + "learning_rate": 2.823380134064498e-05, + "loss": 0.2661, + "step": 16021 + }, + { + "epoch": 1.8999169927665125, + "grad_norm": 1.0221231307519234, + "learning_rate": 2.8231421130683793e-05, + "loss": 0.2121, + "step": 16022 + }, + { + "epoch": 1.9000355745286375, + "grad_norm": 0.9122685059724239, + "learning_rate": 2.822904089093288e-05, + "loss": 0.2232, + "step": 16023 + }, + { + "epoch": 1.9001541562907625, + "grad_norm": 0.8463973568758093, + "learning_rate": 2.8226660621414157e-05, + "loss": 0.1934, + "step": 16024 + }, + { + "epoch": 1.9002727380528874, + "grad_norm": 0.8344088744620402, + "learning_rate": 2.8224280322149587e-05, + "loss": 0.1617, + "step": 16025 + }, + { + "epoch": 1.9003913198150124, + "grad_norm": 0.7691163974752263, + "learning_rate": 2.8221899993161098e-05, + "loss": 0.1466, + "step": 16026 + }, + { + "epoch": 1.9005099015771374, + "grad_norm": 1.2060567657194745, + "learning_rate": 2.8219519634470648e-05, + "loss": 0.208, + "step": 16027 + }, + { + "epoch": 1.9006284833392624, + "grad_norm": 0.8847725284929038, + "learning_rate": 2.8217139246100173e-05, + "loss": 0.2074, + "step": 16028 + }, + { + "epoch": 1.9007470651013874, + "grad_norm": 0.6719701690384398, + "learning_rate": 2.8214758828071618e-05, + "loss": 0.1475, + "step": 16029 + }, + { + "epoch": 1.9008656468635123, + "grad_norm": 0.706349929732018, + "learning_rate": 2.8212378380406933e-05, + "loss": 0.1444, + "step": 16030 + }, + { + "epoch": 1.9009842286256373, + "grad_norm": 1.3667963834118002, + "learning_rate": 2.8209997903128054e-05, + "loss": 0.3831, + "step": 16031 + }, + { + "epoch": 1.9011028103877625, + "grad_norm": 0.9340477194046695, + "learning_rate": 2.820761739625694e-05, + "loss": 0.2149, + "step": 16032 + }, + { + "epoch": 1.9012213921498873, + "grad_norm": 0.6139532108089062, + "learning_rate": 2.820523685981552e-05, + "loss": 0.1402, + "step": 16033 + }, + { + "epoch": 1.9013399739120125, + "grad_norm": 0.6580216803654312, + "learning_rate": 2.8202856293825748e-05, + "loss": 0.1361, + "step": 16034 + }, + { + "epoch": 1.9014585556741372, + "grad_norm": 0.8214783306192367, + "learning_rate": 2.8200475698309564e-05, + "loss": 0.1701, + "step": 16035 + }, + { + "epoch": 1.9015771374362624, + "grad_norm": 0.8063753830782938, + "learning_rate": 2.8198095073288926e-05, + "loss": 0.1715, + "step": 16036 + }, + { + "epoch": 1.9016957191983872, + "grad_norm": 1.0482580823956247, + "learning_rate": 2.8195714418785768e-05, + "loss": 0.1931, + "step": 16037 + }, + { + "epoch": 1.9018143009605124, + "grad_norm": 0.7934667790270736, + "learning_rate": 2.819333373482204e-05, + "loss": 0.186, + "step": 16038 + }, + { + "epoch": 1.9019328827226372, + "grad_norm": 1.00374053109999, + "learning_rate": 2.8190953021419698e-05, + "loss": 0.2018, + "step": 16039 + }, + { + "epoch": 1.9020514644847624, + "grad_norm": 1.2277980024021808, + "learning_rate": 2.8188572278600676e-05, + "loss": 0.3597, + "step": 16040 + }, + { + "epoch": 1.9021700462468871, + "grad_norm": 0.7845901191466269, + "learning_rate": 2.8186191506386935e-05, + "loss": 0.1651, + "step": 16041 + }, + { + "epoch": 1.9022886280090123, + "grad_norm": 0.6994670210377145, + "learning_rate": 2.818381070480041e-05, + "loss": 0.1503, + "step": 16042 + }, + { + "epoch": 1.902407209771137, + "grad_norm": 1.158949383402621, + "learning_rate": 2.818142987386305e-05, + "loss": 0.2605, + "step": 16043 + }, + { + "epoch": 1.9025257915332623, + "grad_norm": 0.9952893277227186, + "learning_rate": 2.8179049013596813e-05, + "loss": 0.2312, + "step": 16044 + }, + { + "epoch": 1.902644373295387, + "grad_norm": 0.9643211549192695, + "learning_rate": 2.8176668124023648e-05, + "loss": 0.1559, + "step": 16045 + }, + { + "epoch": 1.9027629550575123, + "grad_norm": 1.0830202112699108, + "learning_rate": 2.8174287205165495e-05, + "loss": 0.2738, + "step": 16046 + }, + { + "epoch": 1.902881536819637, + "grad_norm": 0.962180061913482, + "learning_rate": 2.81719062570443e-05, + "loss": 0.2471, + "step": 16047 + }, + { + "epoch": 1.9030001185817622, + "grad_norm": 0.7085835975243565, + "learning_rate": 2.8169525279682023e-05, + "loss": 0.1436, + "step": 16048 + }, + { + "epoch": 1.903118700343887, + "grad_norm": 0.9629448029979497, + "learning_rate": 2.816714427310061e-05, + "loss": 0.2127, + "step": 16049 + }, + { + "epoch": 1.9032372821060122, + "grad_norm": 1.08611614687514, + "learning_rate": 2.816476323732201e-05, + "loss": 0.2478, + "step": 16050 + }, + { + "epoch": 1.903355863868137, + "grad_norm": 1.270560170537835, + "learning_rate": 2.816238217236818e-05, + "loss": 0.2813, + "step": 16051 + }, + { + "epoch": 1.9034744456302621, + "grad_norm": 0.8123001145976401, + "learning_rate": 2.8160001078261054e-05, + "loss": 0.1807, + "step": 16052 + }, + { + "epoch": 1.903593027392387, + "grad_norm": 0.6630240310509419, + "learning_rate": 2.8157619955022606e-05, + "loss": 0.1215, + "step": 16053 + }, + { + "epoch": 1.903711609154512, + "grad_norm": 0.7756802943523601, + "learning_rate": 2.8155238802674765e-05, + "loss": 0.1846, + "step": 16054 + }, + { + "epoch": 1.9038301909166369, + "grad_norm": 0.7942594299651337, + "learning_rate": 2.815285762123949e-05, + "loss": 0.1562, + "step": 16055 + }, + { + "epoch": 1.903948772678762, + "grad_norm": 0.9065131807304972, + "learning_rate": 2.815047641073874e-05, + "loss": 0.2393, + "step": 16056 + }, + { + "epoch": 1.904067354440887, + "grad_norm": 0.8672367138018566, + "learning_rate": 2.8148095171194454e-05, + "loss": 0.1748, + "step": 16057 + }, + { + "epoch": 1.904185936203012, + "grad_norm": 0.7971496403043409, + "learning_rate": 2.8145713902628595e-05, + "loss": 0.1531, + "step": 16058 + }, + { + "epoch": 1.904304517965137, + "grad_norm": 0.5124821729717439, + "learning_rate": 2.8143332605063105e-05, + "loss": 0.1179, + "step": 16059 + }, + { + "epoch": 1.904423099727262, + "grad_norm": 0.7001395036273097, + "learning_rate": 2.8140951278519957e-05, + "loss": 0.1463, + "step": 16060 + }, + { + "epoch": 1.904541681489387, + "grad_norm": 0.7959357453403154, + "learning_rate": 2.813856992302108e-05, + "loss": 0.1908, + "step": 16061 + }, + { + "epoch": 1.904660263251512, + "grad_norm": 0.8065231863058255, + "learning_rate": 2.8136188538588438e-05, + "loss": 0.1445, + "step": 16062 + }, + { + "epoch": 1.904778845013637, + "grad_norm": 0.6411136853133865, + "learning_rate": 2.8133807125243983e-05, + "loss": 0.1423, + "step": 16063 + }, + { + "epoch": 1.904897426775762, + "grad_norm": 1.0400106420792206, + "learning_rate": 2.8131425683009677e-05, + "loss": 0.2795, + "step": 16064 + }, + { + "epoch": 1.9050160085378869, + "grad_norm": 0.9285993439934166, + "learning_rate": 2.812904421190746e-05, + "loss": 0.1903, + "step": 16065 + }, + { + "epoch": 1.9051345903000119, + "grad_norm": 1.0556325688688668, + "learning_rate": 2.8126662711959286e-05, + "loss": 0.2429, + "step": 16066 + }, + { + "epoch": 1.9052531720621368, + "grad_norm": 0.8016027049542219, + "learning_rate": 2.8124281183187123e-05, + "loss": 0.1694, + "step": 16067 + }, + { + "epoch": 1.9053717538242618, + "grad_norm": 0.8668186736621011, + "learning_rate": 2.8121899625612917e-05, + "loss": 0.212, + "step": 16068 + }, + { + "epoch": 1.9054903355863868, + "grad_norm": 1.0784058509316448, + "learning_rate": 2.8119518039258625e-05, + "loss": 0.3117, + "step": 16069 + }, + { + "epoch": 1.9056089173485118, + "grad_norm": 0.6697063949978745, + "learning_rate": 2.8117136424146206e-05, + "loss": 0.1682, + "step": 16070 + }, + { + "epoch": 1.9057274991106368, + "grad_norm": 0.7343773518141312, + "learning_rate": 2.8114754780297608e-05, + "loss": 0.1586, + "step": 16071 + }, + { + "epoch": 1.9058460808727617, + "grad_norm": 0.8926253964800768, + "learning_rate": 2.811237310773479e-05, + "loss": 0.1592, + "step": 16072 + }, + { + "epoch": 1.9059646626348867, + "grad_norm": 0.8531225239816421, + "learning_rate": 2.8109991406479713e-05, + "loss": 0.2018, + "step": 16073 + }, + { + "epoch": 1.9060832443970117, + "grad_norm": 0.9085552246567482, + "learning_rate": 2.8107609676554325e-05, + "loss": 0.1931, + "step": 16074 + }, + { + "epoch": 1.9062018261591367, + "grad_norm": 0.8172230014717179, + "learning_rate": 2.8105227917980587e-05, + "loss": 0.2137, + "step": 16075 + }, + { + "epoch": 1.9063204079212617, + "grad_norm": 0.8790515706738277, + "learning_rate": 2.8102846130780458e-05, + "loss": 0.1706, + "step": 16076 + }, + { + "epoch": 1.9064389896833867, + "grad_norm": 1.0129113430702423, + "learning_rate": 2.8100464314975887e-05, + "loss": 0.2222, + "step": 16077 + }, + { + "epoch": 1.9065575714455116, + "grad_norm": 0.8923724634131934, + "learning_rate": 2.8098082470588838e-05, + "loss": 0.193, + "step": 16078 + }, + { + "epoch": 1.9066761532076366, + "grad_norm": 0.9312261026954777, + "learning_rate": 2.8095700597641273e-05, + "loss": 0.2639, + "step": 16079 + }, + { + "epoch": 1.9067947349697616, + "grad_norm": 0.8689916871828128, + "learning_rate": 2.809331869615514e-05, + "loss": 0.177, + "step": 16080 + }, + { + "epoch": 1.9069133167318868, + "grad_norm": 0.9137513793053651, + "learning_rate": 2.8090936766152408e-05, + "loss": 0.203, + "step": 16081 + }, + { + "epoch": 1.9070318984940116, + "grad_norm": 1.2199647432285512, + "learning_rate": 2.808855480765502e-05, + "loss": 0.2628, + "step": 16082 + }, + { + "epoch": 1.9071504802561368, + "grad_norm": 0.933837232059193, + "learning_rate": 2.8086172820684953e-05, + "loss": 0.1937, + "step": 16083 + }, + { + "epoch": 1.9072690620182615, + "grad_norm": 1.5565611947789477, + "learning_rate": 2.8083790805264153e-05, + "loss": 0.382, + "step": 16084 + }, + { + "epoch": 1.9073876437803867, + "grad_norm": 1.8211068949077909, + "learning_rate": 2.808140876141458e-05, + "loss": 0.217, + "step": 16085 + }, + { + "epoch": 1.9075062255425115, + "grad_norm": 1.0398461009835385, + "learning_rate": 2.8079026689158204e-05, + "loss": 0.16, + "step": 16086 + }, + { + "epoch": 1.9076248073046367, + "grad_norm": 0.890194574819876, + "learning_rate": 2.8076644588516976e-05, + "loss": 0.2144, + "step": 16087 + }, + { + "epoch": 1.9077433890667614, + "grad_norm": 0.6436168850439142, + "learning_rate": 2.8074262459512852e-05, + "loss": 0.1569, + "step": 16088 + }, + { + "epoch": 1.9078619708288866, + "grad_norm": 0.8199967753319074, + "learning_rate": 2.8071880302167803e-05, + "loss": 0.1946, + "step": 16089 + }, + { + "epoch": 1.9079805525910114, + "grad_norm": 0.9516529419926232, + "learning_rate": 2.8069498116503785e-05, + "loss": 0.2559, + "step": 16090 + }, + { + "epoch": 1.9080991343531366, + "grad_norm": 0.8184937588329757, + "learning_rate": 2.8067115902542755e-05, + "loss": 0.2049, + "step": 16091 + }, + { + "epoch": 1.9082177161152614, + "grad_norm": 0.8665811718101377, + "learning_rate": 2.806473366030668e-05, + "loss": 0.1907, + "step": 16092 + }, + { + "epoch": 1.9083362978773866, + "grad_norm": 0.7865269074420737, + "learning_rate": 2.8062351389817522e-05, + "loss": 0.157, + "step": 16093 + }, + { + "epoch": 1.9084548796395113, + "grad_norm": 0.8422021674817751, + "learning_rate": 2.805996909109724e-05, + "loss": 0.1768, + "step": 16094 + }, + { + "epoch": 1.9085734614016365, + "grad_norm": 0.625207645975702, + "learning_rate": 2.8057586764167787e-05, + "loss": 0.1418, + "step": 16095 + }, + { + "epoch": 1.9086920431637613, + "grad_norm": 1.7619033950165237, + "learning_rate": 2.8055204409051133e-05, + "loss": 0.3946, + "step": 16096 + }, + { + "epoch": 1.9088106249258865, + "grad_norm": 0.8587232766951006, + "learning_rate": 2.8052822025769243e-05, + "loss": 0.1947, + "step": 16097 + }, + { + "epoch": 1.9089292066880112, + "grad_norm": 0.6993821636809319, + "learning_rate": 2.8050439614344088e-05, + "loss": 0.1408, + "step": 16098 + }, + { + "epoch": 1.9090477884501365, + "grad_norm": 0.7958806874270875, + "learning_rate": 2.804805717479761e-05, + "loss": 0.1337, + "step": 16099 + }, + { + "epoch": 1.9091663702122612, + "grad_norm": 1.1804482701163845, + "learning_rate": 2.8045674707151782e-05, + "loss": 0.2458, + "step": 16100 + }, + { + "epoch": 1.9092849519743864, + "grad_norm": 1.2110464888748813, + "learning_rate": 2.8043292211428573e-05, + "loss": 0.2419, + "step": 16101 + }, + { + "epoch": 1.9094035337365112, + "grad_norm": 0.5511865009157908, + "learning_rate": 2.804090968764994e-05, + "loss": 0.1245, + "step": 16102 + }, + { + "epoch": 1.9095221154986364, + "grad_norm": 0.8541879030060875, + "learning_rate": 2.803852713583785e-05, + "loss": 0.185, + "step": 16103 + }, + { + "epoch": 1.9096406972607611, + "grad_norm": 0.778221182153507, + "learning_rate": 2.8036144556014264e-05, + "loss": 0.1511, + "step": 16104 + }, + { + "epoch": 1.9097592790228863, + "grad_norm": 0.48146535588377526, + "learning_rate": 2.8033761948201152e-05, + "loss": 0.1201, + "step": 16105 + }, + { + "epoch": 1.9098778607850113, + "grad_norm": 0.7438181213533918, + "learning_rate": 2.803137931242047e-05, + "loss": 0.1298, + "step": 16106 + }, + { + "epoch": 1.9099964425471363, + "grad_norm": 0.6145567434063876, + "learning_rate": 2.802899664869419e-05, + "loss": 0.1278, + "step": 16107 + }, + { + "epoch": 1.9101150243092613, + "grad_norm": 0.7711216492703693, + "learning_rate": 2.8026613957044274e-05, + "loss": 0.186, + "step": 16108 + }, + { + "epoch": 1.9102336060713863, + "grad_norm": 0.7507813491788091, + "learning_rate": 2.8024231237492693e-05, + "loss": 0.1742, + "step": 16109 + }, + { + "epoch": 1.9103521878335112, + "grad_norm": 1.1401612682928808, + "learning_rate": 2.8021848490061403e-05, + "loss": 0.2296, + "step": 16110 + }, + { + "epoch": 1.9104707695956362, + "grad_norm": 0.8996888377989521, + "learning_rate": 2.801946571477238e-05, + "loss": 0.1982, + "step": 16111 + }, + { + "epoch": 1.9105893513577612, + "grad_norm": 1.1027614513473663, + "learning_rate": 2.8017082911647586e-05, + "loss": 0.2377, + "step": 16112 + }, + { + "epoch": 1.9107079331198862, + "grad_norm": 0.8095909958891714, + "learning_rate": 2.801470008070899e-05, + "loss": 0.1646, + "step": 16113 + }, + { + "epoch": 1.9108265148820112, + "grad_norm": 1.234732318365986, + "learning_rate": 2.8012317221978546e-05, + "loss": 0.2455, + "step": 16114 + }, + { + "epoch": 1.9109450966441361, + "grad_norm": 0.8875182616985962, + "learning_rate": 2.8009934335478238e-05, + "loss": 0.165, + "step": 16115 + }, + { + "epoch": 1.9110636784062611, + "grad_norm": 1.042240090624925, + "learning_rate": 2.800755142123003e-05, + "loss": 0.2305, + "step": 16116 + }, + { + "epoch": 1.911182260168386, + "grad_norm": 0.9209188015542018, + "learning_rate": 2.8005168479255876e-05, + "loss": 0.1726, + "step": 16117 + }, + { + "epoch": 1.911300841930511, + "grad_norm": 0.861622845146542, + "learning_rate": 2.8002785509577757e-05, + "loss": 0.1931, + "step": 16118 + }, + { + "epoch": 1.911419423692636, + "grad_norm": 0.7272874492035702, + "learning_rate": 2.8000402512217638e-05, + "loss": 0.176, + "step": 16119 + }, + { + "epoch": 1.911538005454761, + "grad_norm": 1.0042781627306168, + "learning_rate": 2.7998019487197492e-05, + "loss": 0.218, + "step": 16120 + }, + { + "epoch": 1.911656587216886, + "grad_norm": 0.8086556569412204, + "learning_rate": 2.799563643453928e-05, + "loss": 0.1775, + "step": 16121 + }, + { + "epoch": 1.911775168979011, + "grad_norm": 0.7291045710169852, + "learning_rate": 2.7993253354264974e-05, + "loss": 0.1632, + "step": 16122 + }, + { + "epoch": 1.911893750741136, + "grad_norm": 1.139694594633981, + "learning_rate": 2.799087024639654e-05, + "loss": 0.262, + "step": 16123 + }, + { + "epoch": 1.912012332503261, + "grad_norm": 0.8419922375736456, + "learning_rate": 2.798848711095596e-05, + "loss": 0.1626, + "step": 16124 + }, + { + "epoch": 1.912130914265386, + "grad_norm": 0.7207823124881841, + "learning_rate": 2.798610394796518e-05, + "loss": 0.1514, + "step": 16125 + }, + { + "epoch": 1.912249496027511, + "grad_norm": 0.6956671379823878, + "learning_rate": 2.7983720757446185e-05, + "loss": 0.1333, + "step": 16126 + }, + { + "epoch": 1.912368077789636, + "grad_norm": 1.2361873183822614, + "learning_rate": 2.798133753942095e-05, + "loss": 0.2139, + "step": 16127 + }, + { + "epoch": 1.912486659551761, + "grad_norm": 0.8577699022485038, + "learning_rate": 2.797895429391143e-05, + "loss": 0.1488, + "step": 16128 + }, + { + "epoch": 1.9126052413138859, + "grad_norm": 1.391498602024598, + "learning_rate": 2.7976571020939616e-05, + "loss": 0.2373, + "step": 16129 + }, + { + "epoch": 1.912723823076011, + "grad_norm": 0.6319294542090831, + "learning_rate": 2.797418772052746e-05, + "loss": 0.1185, + "step": 16130 + }, + { + "epoch": 1.9128424048381358, + "grad_norm": 0.9270571237828035, + "learning_rate": 2.797180439269695e-05, + "loss": 0.1995, + "step": 16131 + }, + { + "epoch": 1.912960986600261, + "grad_norm": 0.8273926286504675, + "learning_rate": 2.7969421037470035e-05, + "loss": 0.1624, + "step": 16132 + }, + { + "epoch": 1.9130795683623858, + "grad_norm": 0.8004937178560668, + "learning_rate": 2.796703765486871e-05, + "loss": 0.1331, + "step": 16133 + }, + { + "epoch": 1.913198150124511, + "grad_norm": 0.9043294698160846, + "learning_rate": 2.796465424491493e-05, + "loss": 0.1962, + "step": 16134 + }, + { + "epoch": 1.9133167318866358, + "grad_norm": 1.0080642095058048, + "learning_rate": 2.7962270807630675e-05, + "loss": 0.1884, + "step": 16135 + }, + { + "epoch": 1.913435313648761, + "grad_norm": 1.0460051698117656, + "learning_rate": 2.795988734303791e-05, + "loss": 0.1669, + "step": 16136 + }, + { + "epoch": 1.9135538954108857, + "grad_norm": 0.7882870146105752, + "learning_rate": 2.7957503851158617e-05, + "loss": 0.1595, + "step": 16137 + }, + { + "epoch": 1.913672477173011, + "grad_norm": 0.7678024661410756, + "learning_rate": 2.7955120332014768e-05, + "loss": 0.1526, + "step": 16138 + }, + { + "epoch": 1.9137910589351357, + "grad_norm": 0.7940783805265607, + "learning_rate": 2.7952736785628337e-05, + "loss": 0.1555, + "step": 16139 + }, + { + "epoch": 1.9139096406972609, + "grad_norm": 0.775496185015277, + "learning_rate": 2.7950353212021285e-05, + "loss": 0.1746, + "step": 16140 + }, + { + "epoch": 1.9140282224593856, + "grad_norm": 0.9983417367474882, + "learning_rate": 2.7947969611215603e-05, + "loss": 0.2126, + "step": 16141 + }, + { + "epoch": 1.9141468042215108, + "grad_norm": 1.4932645257275636, + "learning_rate": 2.7945585983233248e-05, + "loss": 0.2831, + "step": 16142 + }, + { + "epoch": 1.9142653859836356, + "grad_norm": 0.7621200842308553, + "learning_rate": 2.7943202328096202e-05, + "loss": 0.1521, + "step": 16143 + }, + { + "epoch": 1.9143839677457608, + "grad_norm": 0.7917148857111221, + "learning_rate": 2.7940818645826445e-05, + "loss": 0.16, + "step": 16144 + }, + { + "epoch": 1.9145025495078856, + "grad_norm": 0.8528245200908727, + "learning_rate": 2.7938434936445945e-05, + "loss": 0.1503, + "step": 16145 + }, + { + "epoch": 1.9146211312700108, + "grad_norm": 0.8898581154357232, + "learning_rate": 2.7936051199976677e-05, + "loss": 0.1662, + "step": 16146 + }, + { + "epoch": 1.9147397130321355, + "grad_norm": 0.843507441673002, + "learning_rate": 2.7933667436440614e-05, + "loss": 0.1982, + "step": 16147 + }, + { + "epoch": 1.9148582947942607, + "grad_norm": 1.2043005590326452, + "learning_rate": 2.793128364585974e-05, + "loss": 0.2122, + "step": 16148 + }, + { + "epoch": 1.9149768765563855, + "grad_norm": 0.7479489636339709, + "learning_rate": 2.7928899828256023e-05, + "loss": 0.1456, + "step": 16149 + }, + { + "epoch": 1.9150954583185107, + "grad_norm": 0.8522259276881368, + "learning_rate": 2.7926515983651446e-05, + "loss": 0.1571, + "step": 16150 + }, + { + "epoch": 1.9152140400806354, + "grad_norm": 0.742339807552376, + "learning_rate": 2.7924132112067975e-05, + "loss": 0.1338, + "step": 16151 + }, + { + "epoch": 1.9153326218427607, + "grad_norm": 0.5601343493911843, + "learning_rate": 2.79217482135276e-05, + "loss": 0.1209, + "step": 16152 + }, + { + "epoch": 1.9154512036048854, + "grad_norm": 0.8721545934293266, + "learning_rate": 2.791936428805228e-05, + "loss": 0.1909, + "step": 16153 + }, + { + "epoch": 1.9155697853670106, + "grad_norm": 1.2990035213005926, + "learning_rate": 2.7916980335664004e-05, + "loss": 0.2637, + "step": 16154 + }, + { + "epoch": 1.9156883671291356, + "grad_norm": 1.1676615106970518, + "learning_rate": 2.7914596356384746e-05, + "loss": 0.288, + "step": 16155 + }, + { + "epoch": 1.9158069488912606, + "grad_norm": 0.8729485062290261, + "learning_rate": 2.7912212350236482e-05, + "loss": 0.1356, + "step": 16156 + }, + { + "epoch": 1.9159255306533856, + "grad_norm": 0.8488522997522174, + "learning_rate": 2.7909828317241193e-05, + "loss": 0.2164, + "step": 16157 + }, + { + "epoch": 1.9160441124155105, + "grad_norm": 0.8463766653486496, + "learning_rate": 2.7907444257420855e-05, + "loss": 0.1466, + "step": 16158 + }, + { + "epoch": 1.9161626941776355, + "grad_norm": 0.7733224450732947, + "learning_rate": 2.7905060170797447e-05, + "loss": 0.1648, + "step": 16159 + }, + { + "epoch": 1.9162812759397605, + "grad_norm": 0.7455826895422575, + "learning_rate": 2.7902676057392947e-05, + "loss": 0.1582, + "step": 16160 + }, + { + "epoch": 1.9163998577018855, + "grad_norm": 1.004755159301484, + "learning_rate": 2.790029191722934e-05, + "loss": 0.1945, + "step": 16161 + }, + { + "epoch": 1.9165184394640105, + "grad_norm": 0.8997574366034471, + "learning_rate": 2.789790775032859e-05, + "loss": 0.2093, + "step": 16162 + }, + { + "epoch": 1.9166370212261354, + "grad_norm": 0.584404851309574, + "learning_rate": 2.7895523556712683e-05, + "loss": 0.122, + "step": 16163 + }, + { + "epoch": 1.9167556029882604, + "grad_norm": 1.1121063598050815, + "learning_rate": 2.78931393364036e-05, + "loss": 0.1659, + "step": 16164 + }, + { + "epoch": 1.9168741847503854, + "grad_norm": 0.8318585382515318, + "learning_rate": 2.7890755089423326e-05, + "loss": 0.1671, + "step": 16165 + }, + { + "epoch": 1.9169927665125104, + "grad_norm": 0.8429016001991769, + "learning_rate": 2.788837081579383e-05, + "loss": 0.1914, + "step": 16166 + }, + { + "epoch": 1.9171113482746354, + "grad_norm": 0.90819520035403, + "learning_rate": 2.7885986515537095e-05, + "loss": 0.1969, + "step": 16167 + }, + { + "epoch": 1.9172299300367603, + "grad_norm": 0.7719036826497917, + "learning_rate": 2.7883602188675106e-05, + "loss": 0.1331, + "step": 16168 + }, + { + "epoch": 1.9173485117988853, + "grad_norm": 0.7943196418705168, + "learning_rate": 2.7881217835229844e-05, + "loss": 0.1693, + "step": 16169 + }, + { + "epoch": 1.9174670935610103, + "grad_norm": 1.296083152250887, + "learning_rate": 2.787883345522328e-05, + "loss": 0.3394, + "step": 16170 + }, + { + "epoch": 1.9175856753231353, + "grad_norm": 0.8288673859703468, + "learning_rate": 2.7876449048677405e-05, + "loss": 0.1611, + "step": 16171 + }, + { + "epoch": 1.9177042570852603, + "grad_norm": 1.1990628487852457, + "learning_rate": 2.7874064615614204e-05, + "loss": 0.3308, + "step": 16172 + }, + { + "epoch": 1.9178228388473852, + "grad_norm": 1.0815767046385982, + "learning_rate": 2.787168015605564e-05, + "loss": 0.2013, + "step": 16173 + }, + { + "epoch": 1.9179414206095102, + "grad_norm": 0.9651252540753816, + "learning_rate": 2.7869295670023716e-05, + "loss": 0.2164, + "step": 16174 + }, + { + "epoch": 1.9180600023716352, + "grad_norm": 0.9239879289641345, + "learning_rate": 2.78669111575404e-05, + "loss": 0.2066, + "step": 16175 + }, + { + "epoch": 1.9181785841337602, + "grad_norm": 0.6745882885422091, + "learning_rate": 2.786452661862768e-05, + "loss": 0.1188, + "step": 16176 + }, + { + "epoch": 1.9182971658958852, + "grad_norm": 0.904872701022246, + "learning_rate": 2.7862142053307534e-05, + "loss": 0.2105, + "step": 16177 + }, + { + "epoch": 1.9184157476580102, + "grad_norm": 0.8668962811703257, + "learning_rate": 2.7859757461601948e-05, + "loss": 0.1646, + "step": 16178 + }, + { + "epoch": 1.9185343294201351, + "grad_norm": 1.2469100481072897, + "learning_rate": 2.7857372843532908e-05, + "loss": 0.2368, + "step": 16179 + }, + { + "epoch": 1.9186529111822601, + "grad_norm": 0.8283680587361831, + "learning_rate": 2.78549881991224e-05, + "loss": 0.1554, + "step": 16180 + }, + { + "epoch": 1.9187714929443853, + "grad_norm": 0.5693712086414155, + "learning_rate": 2.785260352839239e-05, + "loss": 0.099, + "step": 16181 + }, + { + "epoch": 1.91889007470651, + "grad_norm": 0.7993986978775055, + "learning_rate": 2.7850218831364883e-05, + "loss": 0.1769, + "step": 16182 + }, + { + "epoch": 1.9190086564686353, + "grad_norm": 0.8110727253418308, + "learning_rate": 2.784783410806185e-05, + "loss": 0.1218, + "step": 16183 + }, + { + "epoch": 1.91912723823076, + "grad_norm": 1.1593327078399485, + "learning_rate": 2.7845449358505282e-05, + "loss": 0.2755, + "step": 16184 + }, + { + "epoch": 1.9192458199928852, + "grad_norm": 0.6967215943401495, + "learning_rate": 2.7843064582717156e-05, + "loss": 0.1197, + "step": 16185 + }, + { + "epoch": 1.91936440175501, + "grad_norm": 0.6688461795638877, + "learning_rate": 2.7840679780719458e-05, + "loss": 0.169, + "step": 16186 + }, + { + "epoch": 1.9194829835171352, + "grad_norm": 0.9282752426252121, + "learning_rate": 2.7838294952534178e-05, + "loss": 0.1875, + "step": 16187 + }, + { + "epoch": 1.91960156527926, + "grad_norm": 1.2104754215557056, + "learning_rate": 2.78359100981833e-05, + "loss": 0.2421, + "step": 16188 + }, + { + "epoch": 1.9197201470413852, + "grad_norm": 0.6172802592106619, + "learning_rate": 2.783352521768881e-05, + "loss": 0.1522, + "step": 16189 + }, + { + "epoch": 1.91983872880351, + "grad_norm": 0.823135150093449, + "learning_rate": 2.783114031107269e-05, + "loss": 0.1952, + "step": 16190 + }, + { + "epoch": 1.9199573105656351, + "grad_norm": 0.9681224463951311, + "learning_rate": 2.7828755378356936e-05, + "loss": 0.1735, + "step": 16191 + }, + { + "epoch": 1.9200758923277599, + "grad_norm": 0.8974201362752957, + "learning_rate": 2.782637041956352e-05, + "loss": 0.1991, + "step": 16192 + }, + { + "epoch": 1.920194474089885, + "grad_norm": 1.0214410813564399, + "learning_rate": 2.7823985434714434e-05, + "loss": 0.2207, + "step": 16193 + }, + { + "epoch": 1.9203130558520098, + "grad_norm": 0.9555008758797051, + "learning_rate": 2.7821600423831663e-05, + "loss": 0.1818, + "step": 16194 + }, + { + "epoch": 1.920431637614135, + "grad_norm": 0.8925409387494988, + "learning_rate": 2.7819215386937197e-05, + "loss": 0.1422, + "step": 16195 + }, + { + "epoch": 1.9205502193762598, + "grad_norm": 0.711487173754342, + "learning_rate": 2.7816830324053023e-05, + "loss": 0.123, + "step": 16196 + }, + { + "epoch": 1.920668801138385, + "grad_norm": 0.9109872756109001, + "learning_rate": 2.781444523520113e-05, + "loss": 0.1688, + "step": 16197 + }, + { + "epoch": 1.9207873829005098, + "grad_norm": 0.8727054658235036, + "learning_rate": 2.7812060120403506e-05, + "loss": 0.2047, + "step": 16198 + }, + { + "epoch": 1.920905964662635, + "grad_norm": 0.8524400671780192, + "learning_rate": 2.7809674979682133e-05, + "loss": 0.1639, + "step": 16199 + }, + { + "epoch": 1.9210245464247597, + "grad_norm": 0.8243196618860221, + "learning_rate": 2.7807289813059002e-05, + "loss": 0.1829, + "step": 16200 + }, + { + "epoch": 1.921143128186885, + "grad_norm": 1.1584271819529106, + "learning_rate": 2.7804904620556093e-05, + "loss": 0.3051, + "step": 16201 + }, + { + "epoch": 1.9212617099490097, + "grad_norm": 0.908924212849637, + "learning_rate": 2.7802519402195416e-05, + "loss": 0.203, + "step": 16202 + }, + { + "epoch": 1.921380291711135, + "grad_norm": 0.6854458430119058, + "learning_rate": 2.780013415799894e-05, + "loss": 0.1246, + "step": 16203 + }, + { + "epoch": 1.9214988734732596, + "grad_norm": 1.2569344739811312, + "learning_rate": 2.779774888798867e-05, + "loss": 0.2272, + "step": 16204 + }, + { + "epoch": 1.9216174552353849, + "grad_norm": 0.8902749432834192, + "learning_rate": 2.7795363592186575e-05, + "loss": 0.1718, + "step": 16205 + }, + { + "epoch": 1.9217360369975098, + "grad_norm": 0.8104250291679126, + "learning_rate": 2.7792978270614656e-05, + "loss": 0.1681, + "step": 16206 + }, + { + "epoch": 1.9218546187596348, + "grad_norm": 0.8557527707274022, + "learning_rate": 2.77905929232949e-05, + "loss": 0.1548, + "step": 16207 + }, + { + "epoch": 1.9219732005217598, + "grad_norm": 0.7209200266438465, + "learning_rate": 2.778820755024931e-05, + "loss": 0.1817, + "step": 16208 + }, + { + "epoch": 1.9220917822838848, + "grad_norm": 0.6200105035109764, + "learning_rate": 2.7785822151499857e-05, + "loss": 0.1178, + "step": 16209 + }, + { + "epoch": 1.9222103640460098, + "grad_norm": 0.91659404515795, + "learning_rate": 2.7783436727068546e-05, + "loss": 0.2384, + "step": 16210 + }, + { + "epoch": 1.9223289458081347, + "grad_norm": 1.0217718651794356, + "learning_rate": 2.778105127697736e-05, + "loss": 0.179, + "step": 16211 + }, + { + "epoch": 1.9224475275702597, + "grad_norm": 0.7813068808972369, + "learning_rate": 2.7778665801248292e-05, + "loss": 0.2198, + "step": 16212 + }, + { + "epoch": 1.9225661093323847, + "grad_norm": 0.625999128828941, + "learning_rate": 2.777628029990333e-05, + "loss": 0.1519, + "step": 16213 + }, + { + "epoch": 1.9226846910945097, + "grad_norm": 0.9189648251183812, + "learning_rate": 2.7773894772964465e-05, + "loss": 0.2408, + "step": 16214 + }, + { + "epoch": 1.9228032728566347, + "grad_norm": 0.9841323240862208, + "learning_rate": 2.7771509220453702e-05, + "loss": 0.2086, + "step": 16215 + }, + { + "epoch": 1.9229218546187596, + "grad_norm": 0.8462774581570619, + "learning_rate": 2.7769123642393014e-05, + "loss": 0.2172, + "step": 16216 + }, + { + "epoch": 1.9230404363808846, + "grad_norm": 0.9331086031740888, + "learning_rate": 2.7766738038804402e-05, + "loss": 0.1985, + "step": 16217 + }, + { + "epoch": 1.9231590181430096, + "grad_norm": 1.0057817776703024, + "learning_rate": 2.776435240970986e-05, + "loss": 0.2721, + "step": 16218 + }, + { + "epoch": 1.9232775999051346, + "grad_norm": 0.8457802998666191, + "learning_rate": 2.776196675513138e-05, + "loss": 0.1626, + "step": 16219 + }, + { + "epoch": 1.9233961816672596, + "grad_norm": 0.6258755762034788, + "learning_rate": 2.775958107509095e-05, + "loss": 0.1316, + "step": 16220 + }, + { + "epoch": 1.9235147634293845, + "grad_norm": 0.8279269491512927, + "learning_rate": 2.7757195369610574e-05, + "loss": 0.152, + "step": 16221 + }, + { + "epoch": 1.9236333451915095, + "grad_norm": 0.7823081863674901, + "learning_rate": 2.7754809638712233e-05, + "loss": 0.1693, + "step": 16222 + }, + { + "epoch": 1.9237519269536345, + "grad_norm": 0.7832562949403702, + "learning_rate": 2.7752423882417927e-05, + "loss": 0.159, + "step": 16223 + }, + { + "epoch": 1.9238705087157595, + "grad_norm": 0.7446655525425439, + "learning_rate": 2.7750038100749644e-05, + "loss": 0.1868, + "step": 16224 + }, + { + "epoch": 1.9239890904778845, + "grad_norm": 0.824811696438495, + "learning_rate": 2.774765229372938e-05, + "loss": 0.1411, + "step": 16225 + }, + { + "epoch": 1.9241076722400094, + "grad_norm": 1.0288401744065663, + "learning_rate": 2.7745266461379138e-05, + "loss": 0.1877, + "step": 16226 + }, + { + "epoch": 1.9242262540021344, + "grad_norm": 2.2810718985038356, + "learning_rate": 2.7742880603720904e-05, + "loss": 0.3693, + "step": 16227 + }, + { + "epoch": 1.9243448357642594, + "grad_norm": 0.9043600279489694, + "learning_rate": 2.7740494720776673e-05, + "loss": 0.2176, + "step": 16228 + }, + { + "epoch": 1.9244634175263844, + "grad_norm": 0.7822689191303283, + "learning_rate": 2.773810881256844e-05, + "loss": 0.1741, + "step": 16229 + }, + { + "epoch": 1.9245819992885096, + "grad_norm": 0.6759252306295748, + "learning_rate": 2.773572287911821e-05, + "loss": 0.1609, + "step": 16230 + }, + { + "epoch": 1.9247005810506344, + "grad_norm": 0.8943259057526366, + "learning_rate": 2.7733336920447955e-05, + "loss": 0.2505, + "step": 16231 + }, + { + "epoch": 1.9248191628127596, + "grad_norm": 0.7325996518281909, + "learning_rate": 2.7730950936579698e-05, + "loss": 0.1261, + "step": 16232 + }, + { + "epoch": 1.9249377445748843, + "grad_norm": 0.780282457224162, + "learning_rate": 2.7728564927535415e-05, + "loss": 0.1474, + "step": 16233 + }, + { + "epoch": 1.9250563263370095, + "grad_norm": 0.7213809666798303, + "learning_rate": 2.7726178893337117e-05, + "loss": 0.1254, + "step": 16234 + }, + { + "epoch": 1.9251749080991343, + "grad_norm": 0.6811780071561927, + "learning_rate": 2.7723792834006785e-05, + "loss": 0.1463, + "step": 16235 + }, + { + "epoch": 1.9252934898612595, + "grad_norm": 1.2038537021479374, + "learning_rate": 2.7721406749566426e-05, + "loss": 0.2581, + "step": 16236 + }, + { + "epoch": 1.9254120716233842, + "grad_norm": 1.0023742592863671, + "learning_rate": 2.771902064003803e-05, + "loss": 0.1929, + "step": 16237 + }, + { + "epoch": 1.9255306533855094, + "grad_norm": 0.7024916222771815, + "learning_rate": 2.7716634505443606e-05, + "loss": 0.1254, + "step": 16238 + }, + { + "epoch": 1.9256492351476342, + "grad_norm": 1.3766061571718644, + "learning_rate": 2.771424834580514e-05, + "loss": 0.2775, + "step": 16239 + }, + { + "epoch": 1.9257678169097594, + "grad_norm": 0.9992837074834383, + "learning_rate": 2.7711862161144638e-05, + "loss": 0.2431, + "step": 16240 + }, + { + "epoch": 1.9258863986718842, + "grad_norm": 1.1121760910769325, + "learning_rate": 2.770947595148408e-05, + "loss": 0.2449, + "step": 16241 + }, + { + "epoch": 1.9260049804340094, + "grad_norm": 0.7659246361713892, + "learning_rate": 2.7707089716845485e-05, + "loss": 0.147, + "step": 16242 + }, + { + "epoch": 1.9261235621961341, + "grad_norm": 0.9633508059454885, + "learning_rate": 2.7704703457250842e-05, + "loss": 0.231, + "step": 16243 + }, + { + "epoch": 1.9262421439582593, + "grad_norm": 0.8816967274749952, + "learning_rate": 2.770231717272214e-05, + "loss": 0.133, + "step": 16244 + }, + { + "epoch": 1.926360725720384, + "grad_norm": 0.7608586527519601, + "learning_rate": 2.7699930863281405e-05, + "loss": 0.189, + "step": 16245 + }, + { + "epoch": 1.9264793074825093, + "grad_norm": 1.229595330220173, + "learning_rate": 2.7697544528950602e-05, + "loss": 0.2963, + "step": 16246 + }, + { + "epoch": 1.926597889244634, + "grad_norm": 0.6485218449705971, + "learning_rate": 2.769515816975175e-05, + "loss": 0.1451, + "step": 16247 + }, + { + "epoch": 1.9267164710067592, + "grad_norm": 0.7388957648250517, + "learning_rate": 2.769277178570685e-05, + "loss": 0.153, + "step": 16248 + }, + { + "epoch": 1.926835052768884, + "grad_norm": 1.2232775068761184, + "learning_rate": 2.7690385376837895e-05, + "loss": 0.2992, + "step": 16249 + }, + { + "epoch": 1.9269536345310092, + "grad_norm": 1.4506231048980174, + "learning_rate": 2.768799894316688e-05, + "loss": 0.2552, + "step": 16250 + }, + { + "epoch": 1.927072216293134, + "grad_norm": 0.9179888122521316, + "learning_rate": 2.768561248471582e-05, + "loss": 0.215, + "step": 16251 + }, + { + "epoch": 1.9271907980552592, + "grad_norm": 0.8323944848942463, + "learning_rate": 2.7683226001506703e-05, + "loss": 0.1726, + "step": 16252 + }, + { + "epoch": 1.927309379817384, + "grad_norm": 0.584966044025122, + "learning_rate": 2.768083949356154e-05, + "loss": 0.134, + "step": 16253 + }, + { + "epoch": 1.9274279615795091, + "grad_norm": 0.5383850519328199, + "learning_rate": 2.7678452960902312e-05, + "loss": 0.1309, + "step": 16254 + }, + { + "epoch": 1.927546543341634, + "grad_norm": 0.7630141026978897, + "learning_rate": 2.767606640355103e-05, + "loss": 0.1489, + "step": 16255 + }, + { + "epoch": 1.927665125103759, + "grad_norm": 0.6906362974429978, + "learning_rate": 2.76736798215297e-05, + "loss": 0.137, + "step": 16256 + }, + { + "epoch": 1.927783706865884, + "grad_norm": 0.6907790982016173, + "learning_rate": 2.7671293214860327e-05, + "loss": 0.1554, + "step": 16257 + }, + { + "epoch": 1.927902288628009, + "grad_norm": 0.6156634311355497, + "learning_rate": 2.7668906583564906e-05, + "loss": 0.1452, + "step": 16258 + }, + { + "epoch": 1.928020870390134, + "grad_norm": 0.7227098232628193, + "learning_rate": 2.7666519927665434e-05, + "loss": 0.1679, + "step": 16259 + }, + { + "epoch": 1.928139452152259, + "grad_norm": 1.0442215320130193, + "learning_rate": 2.7664133247183928e-05, + "loss": 0.164, + "step": 16260 + }, + { + "epoch": 1.928258033914384, + "grad_norm": 1.0350632296924778, + "learning_rate": 2.766174654214237e-05, + "loss": 0.2158, + "step": 16261 + }, + { + "epoch": 1.928376615676509, + "grad_norm": 0.7349667567042034, + "learning_rate": 2.7659359812562778e-05, + "loss": 0.1535, + "step": 16262 + }, + { + "epoch": 1.928495197438634, + "grad_norm": 0.8860017394076909, + "learning_rate": 2.765697305846715e-05, + "loss": 0.1502, + "step": 16263 + }, + { + "epoch": 1.928613779200759, + "grad_norm": 0.9147608470009988, + "learning_rate": 2.7654586279877487e-05, + "loss": 0.1662, + "step": 16264 + }, + { + "epoch": 1.928732360962884, + "grad_norm": 0.7024595547643817, + "learning_rate": 2.7652199476815795e-05, + "loss": 0.135, + "step": 16265 + }, + { + "epoch": 1.928850942725009, + "grad_norm": 0.9633473708628293, + "learning_rate": 2.7649812649304073e-05, + "loss": 0.1842, + "step": 16266 + }, + { + "epoch": 1.9289695244871339, + "grad_norm": 0.9024210858686416, + "learning_rate": 2.7647425797364325e-05, + "loss": 0.2349, + "step": 16267 + }, + { + "epoch": 1.9290881062492589, + "grad_norm": 0.6877292794218387, + "learning_rate": 2.764503892101857e-05, + "loss": 0.1005, + "step": 16268 + }, + { + "epoch": 1.9292066880113838, + "grad_norm": 0.6653825674517339, + "learning_rate": 2.7642652020288785e-05, + "loss": 0.0959, + "step": 16269 + }, + { + "epoch": 1.9293252697735088, + "grad_norm": 0.6476084560822362, + "learning_rate": 2.7640265095196997e-05, + "loss": 0.1598, + "step": 16270 + }, + { + "epoch": 1.9294438515356338, + "grad_norm": 0.9167282191087206, + "learning_rate": 2.7637878145765207e-05, + "loss": 0.1586, + "step": 16271 + }, + { + "epoch": 1.9295624332977588, + "grad_norm": 0.7935845040873356, + "learning_rate": 2.7635491172015406e-05, + "loss": 0.1277, + "step": 16272 + }, + { + "epoch": 1.9296810150598838, + "grad_norm": 0.7973014115427465, + "learning_rate": 2.7633104173969616e-05, + "loss": 0.1635, + "step": 16273 + }, + { + "epoch": 1.9297995968220087, + "grad_norm": 0.859827517265893, + "learning_rate": 2.7630717151649827e-05, + "loss": 0.1893, + "step": 16274 + }, + { + "epoch": 1.9299181785841337, + "grad_norm": 0.8105651419958483, + "learning_rate": 2.7628330105078055e-05, + "loss": 0.1934, + "step": 16275 + }, + { + "epoch": 1.9300367603462587, + "grad_norm": 1.1439449091997902, + "learning_rate": 2.7625943034276296e-05, + "loss": 0.2314, + "step": 16276 + }, + { + "epoch": 1.9301553421083837, + "grad_norm": 0.7478391658524234, + "learning_rate": 2.7623555939266576e-05, + "loss": 0.1687, + "step": 16277 + }, + { + "epoch": 1.9302739238705087, + "grad_norm": 0.7457222992299454, + "learning_rate": 2.7621168820070874e-05, + "loss": 0.1612, + "step": 16278 + }, + { + "epoch": 1.9303925056326336, + "grad_norm": 0.7934664126178257, + "learning_rate": 2.761878167671122e-05, + "loss": 0.145, + "step": 16279 + }, + { + "epoch": 1.9305110873947586, + "grad_norm": 0.6970576012944704, + "learning_rate": 2.761639450920961e-05, + "loss": 0.1423, + "step": 16280 + }, + { + "epoch": 1.9306296691568838, + "grad_norm": 0.8957741567654607, + "learning_rate": 2.7614007317588047e-05, + "loss": 0.1959, + "step": 16281 + }, + { + "epoch": 1.9307482509190086, + "grad_norm": 1.1135977631533982, + "learning_rate": 2.761162010186854e-05, + "loss": 0.2122, + "step": 16282 + }, + { + "epoch": 1.9308668326811338, + "grad_norm": 0.8975288745863577, + "learning_rate": 2.7609232862073102e-05, + "loss": 0.1304, + "step": 16283 + }, + { + "epoch": 1.9309854144432586, + "grad_norm": 0.5208941876843337, + "learning_rate": 2.7606845598223735e-05, + "loss": 0.1063, + "step": 16284 + }, + { + "epoch": 1.9311039962053838, + "grad_norm": 0.8103647574368822, + "learning_rate": 2.760445831034245e-05, + "loss": 0.1761, + "step": 16285 + }, + { + "epoch": 1.9312225779675085, + "grad_norm": 0.8965377710269227, + "learning_rate": 2.7602070998451253e-05, + "loss": 0.2241, + "step": 16286 + }, + { + "epoch": 1.9313411597296337, + "grad_norm": 0.9799503981826664, + "learning_rate": 2.7599683662572146e-05, + "loss": 0.2059, + "step": 16287 + }, + { + "epoch": 1.9314597414917585, + "grad_norm": 0.915503872099355, + "learning_rate": 2.7597296302727156e-05, + "loss": 0.1826, + "step": 16288 + }, + { + "epoch": 1.9315783232538837, + "grad_norm": 0.7785380510675531, + "learning_rate": 2.7594908918938266e-05, + "loss": 0.1673, + "step": 16289 + }, + { + "epoch": 1.9316969050160084, + "grad_norm": 0.8819746465338293, + "learning_rate": 2.759252151122751e-05, + "loss": 0.1671, + "step": 16290 + }, + { + "epoch": 1.9318154867781336, + "grad_norm": 0.7871891322657373, + "learning_rate": 2.7590134079616874e-05, + "loss": 0.1519, + "step": 16291 + }, + { + "epoch": 1.9319340685402584, + "grad_norm": 1.0679534426282065, + "learning_rate": 2.7587746624128386e-05, + "loss": 0.2357, + "step": 16292 + }, + { + "epoch": 1.9320526503023836, + "grad_norm": 0.7089512577965378, + "learning_rate": 2.758535914478404e-05, + "loss": 0.1529, + "step": 16293 + }, + { + "epoch": 1.9321712320645084, + "grad_norm": 0.6609916788809255, + "learning_rate": 2.7582971641605866e-05, + "loss": 0.1268, + "step": 16294 + }, + { + "epoch": 1.9322898138266336, + "grad_norm": 0.9383666713250818, + "learning_rate": 2.7580584114615848e-05, + "loss": 0.2337, + "step": 16295 + }, + { + "epoch": 1.9324083955887583, + "grad_norm": 0.7061799334726571, + "learning_rate": 2.757819656383601e-05, + "loss": 0.1436, + "step": 16296 + }, + { + "epoch": 1.9325269773508835, + "grad_norm": 0.6585925590992556, + "learning_rate": 2.7575808989288366e-05, + "loss": 0.1569, + "step": 16297 + }, + { + "epoch": 1.9326455591130083, + "grad_norm": 0.943007915627132, + "learning_rate": 2.7573421390994918e-05, + "loss": 0.2347, + "step": 16298 + }, + { + "epoch": 1.9327641408751335, + "grad_norm": 0.9839469387985489, + "learning_rate": 2.7571033768977683e-05, + "loss": 0.2285, + "step": 16299 + }, + { + "epoch": 1.9328827226372582, + "grad_norm": 0.9785911075820312, + "learning_rate": 2.7568646123258667e-05, + "loss": 0.169, + "step": 16300 + }, + { + "epoch": 1.9330013043993834, + "grad_norm": 0.817502052879532, + "learning_rate": 2.756625845385989e-05, + "loss": 0.1595, + "step": 16301 + }, + { + "epoch": 1.9331198861615082, + "grad_norm": 0.8998275141317014, + "learning_rate": 2.756387076080335e-05, + "loss": 0.2235, + "step": 16302 + }, + { + "epoch": 1.9332384679236334, + "grad_norm": 0.7354614494511376, + "learning_rate": 2.7561483044111074e-05, + "loss": 0.1375, + "step": 16303 + }, + { + "epoch": 1.9333570496857582, + "grad_norm": 0.8520295644481909, + "learning_rate": 2.7559095303805055e-05, + "loss": 0.1849, + "step": 16304 + }, + { + "epoch": 1.9334756314478834, + "grad_norm": 0.6540354108118182, + "learning_rate": 2.7556707539907317e-05, + "loss": 0.1353, + "step": 16305 + }, + { + "epoch": 1.9335942132100084, + "grad_norm": 0.9418615238307455, + "learning_rate": 2.7554319752439872e-05, + "loss": 0.1738, + "step": 16306 + }, + { + "epoch": 1.9337127949721333, + "grad_norm": 1.0894504445429882, + "learning_rate": 2.755193194142474e-05, + "loss": 0.2056, + "step": 16307 + }, + { + "epoch": 1.9338313767342583, + "grad_norm": 0.6648006297971638, + "learning_rate": 2.7549544106883918e-05, + "loss": 0.1707, + "step": 16308 + }, + { + "epoch": 1.9339499584963833, + "grad_norm": 0.7231447155144718, + "learning_rate": 2.7547156248839427e-05, + "loss": 0.1509, + "step": 16309 + }, + { + "epoch": 1.9340685402585083, + "grad_norm": 0.8173416581334517, + "learning_rate": 2.754476836731328e-05, + "loss": 0.1494, + "step": 16310 + }, + { + "epoch": 1.9341871220206333, + "grad_norm": 1.6449647292361331, + "learning_rate": 2.7542380462327487e-05, + "loss": 0.3053, + "step": 16311 + }, + { + "epoch": 1.9343057037827582, + "grad_norm": 1.0173777606578023, + "learning_rate": 2.7539992533904062e-05, + "loss": 0.1936, + "step": 16312 + }, + { + "epoch": 1.9344242855448832, + "grad_norm": 0.7012588044654365, + "learning_rate": 2.7537604582065025e-05, + "loss": 0.1641, + "step": 16313 + }, + { + "epoch": 1.9345428673070082, + "grad_norm": 0.5724858468951916, + "learning_rate": 2.753521660683238e-05, + "loss": 0.1414, + "step": 16314 + }, + { + "epoch": 1.9346614490691332, + "grad_norm": 0.8835845925655845, + "learning_rate": 2.753282860822815e-05, + "loss": 0.1482, + "step": 16315 + }, + { + "epoch": 1.9347800308312582, + "grad_norm": 0.8625225129654933, + "learning_rate": 2.753044058627434e-05, + "loss": 0.1614, + "step": 16316 + }, + { + "epoch": 1.9348986125933831, + "grad_norm": 0.8309879299801034, + "learning_rate": 2.7528052540992978e-05, + "loss": 0.1926, + "step": 16317 + }, + { + "epoch": 1.9350171943555081, + "grad_norm": 0.7142398825488963, + "learning_rate": 2.752566447240607e-05, + "loss": 0.1607, + "step": 16318 + }, + { + "epoch": 1.935135776117633, + "grad_norm": 0.9482058059125964, + "learning_rate": 2.7523276380535636e-05, + "loss": 0.2098, + "step": 16319 + }, + { + "epoch": 1.935254357879758, + "grad_norm": 0.759187084389351, + "learning_rate": 2.752088826540369e-05, + "loss": 0.153, + "step": 16320 + }, + { + "epoch": 1.935372939641883, + "grad_norm": 0.6458785358564313, + "learning_rate": 2.7518500127032236e-05, + "loss": 0.1448, + "step": 16321 + }, + { + "epoch": 1.935491521404008, + "grad_norm": 0.728189357666095, + "learning_rate": 2.7516111965443307e-05, + "loss": 0.1681, + "step": 16322 + }, + { + "epoch": 1.935610103166133, + "grad_norm": 1.8419132328993884, + "learning_rate": 2.7513723780658907e-05, + "loss": 0.4463, + "step": 16323 + }, + { + "epoch": 1.935728684928258, + "grad_norm": 0.8725323124622913, + "learning_rate": 2.7511335572701057e-05, + "loss": 0.1631, + "step": 16324 + }, + { + "epoch": 1.935847266690383, + "grad_norm": 0.7931191431772464, + "learning_rate": 2.7508947341591768e-05, + "loss": 0.1502, + "step": 16325 + }, + { + "epoch": 1.935965848452508, + "grad_norm": 0.9144589016597358, + "learning_rate": 2.7506559087353074e-05, + "loss": 0.185, + "step": 16326 + }, + { + "epoch": 1.936084430214633, + "grad_norm": 0.9093104363529504, + "learning_rate": 2.750417081000697e-05, + "loss": 0.2361, + "step": 16327 + }, + { + "epoch": 1.936203011976758, + "grad_norm": 0.7648873632389199, + "learning_rate": 2.750178250957548e-05, + "loss": 0.179, + "step": 16328 + }, + { + "epoch": 1.936321593738883, + "grad_norm": 0.8858035491669537, + "learning_rate": 2.749939418608063e-05, + "loss": 0.1852, + "step": 16329 + }, + { + "epoch": 1.936440175501008, + "grad_norm": 1.0528135687094582, + "learning_rate": 2.7497005839544422e-05, + "loss": 0.2213, + "step": 16330 + }, + { + "epoch": 1.9365587572631329, + "grad_norm": 0.7139425373034957, + "learning_rate": 2.749461746998889e-05, + "loss": 0.1541, + "step": 16331 + }, + { + "epoch": 1.936677339025258, + "grad_norm": 0.9323818551502069, + "learning_rate": 2.7492229077436037e-05, + "loss": 0.1848, + "step": 16332 + }, + { + "epoch": 1.9367959207873828, + "grad_norm": 0.5688576485391438, + "learning_rate": 2.7489840661907895e-05, + "loss": 0.149, + "step": 16333 + }, + { + "epoch": 1.936914502549508, + "grad_norm": 1.3937011772171348, + "learning_rate": 2.7487452223426474e-05, + "loss": 0.2975, + "step": 16334 + }, + { + "epoch": 1.9370330843116328, + "grad_norm": 0.8599078377533141, + "learning_rate": 2.748506376201379e-05, + "loss": 0.1914, + "step": 16335 + }, + { + "epoch": 1.937151666073758, + "grad_norm": 0.7272034967430947, + "learning_rate": 2.7482675277691866e-05, + "loss": 0.1618, + "step": 16336 + }, + { + "epoch": 1.9372702478358828, + "grad_norm": 0.6125430315886855, + "learning_rate": 2.7480286770482728e-05, + "loss": 0.1284, + "step": 16337 + }, + { + "epoch": 1.937388829598008, + "grad_norm": 0.9041990513005814, + "learning_rate": 2.747789824040838e-05, + "loss": 0.2463, + "step": 16338 + }, + { + "epoch": 1.9375074113601327, + "grad_norm": 0.6705350577483852, + "learning_rate": 2.7475509687490852e-05, + "loss": 0.159, + "step": 16339 + }, + { + "epoch": 1.937625993122258, + "grad_norm": 0.797215186118629, + "learning_rate": 2.7473121111752154e-05, + "loss": 0.2283, + "step": 16340 + }, + { + "epoch": 1.9377445748843827, + "grad_norm": 0.8459022043963801, + "learning_rate": 2.747073251321432e-05, + "loss": 0.2047, + "step": 16341 + }, + { + "epoch": 1.9378631566465079, + "grad_norm": 0.7823586784015667, + "learning_rate": 2.7468343891899358e-05, + "loss": 0.1895, + "step": 16342 + }, + { + "epoch": 1.9379817384086326, + "grad_norm": 0.7764249086962549, + "learning_rate": 2.746595524782929e-05, + "loss": 0.1712, + "step": 16343 + }, + { + "epoch": 1.9381003201707578, + "grad_norm": 0.6068356236344893, + "learning_rate": 2.7463566581026146e-05, + "loss": 0.1383, + "step": 16344 + }, + { + "epoch": 1.9382189019328826, + "grad_norm": 0.6820415381653931, + "learning_rate": 2.7461177891511936e-05, + "loss": 0.1263, + "step": 16345 + }, + { + "epoch": 1.9383374836950078, + "grad_norm": 0.8096432485707196, + "learning_rate": 2.745878917930868e-05, + "loss": 0.1641, + "step": 16346 + }, + { + "epoch": 1.9384560654571326, + "grad_norm": 0.6511095450026053, + "learning_rate": 2.74564004444384e-05, + "loss": 0.1313, + "step": 16347 + }, + { + "epoch": 1.9385746472192578, + "grad_norm": 0.6772779065713262, + "learning_rate": 2.7454011686923132e-05, + "loss": 0.1762, + "step": 16348 + }, + { + "epoch": 1.9386932289813825, + "grad_norm": 0.6253354929339672, + "learning_rate": 2.745162290678488e-05, + "loss": 0.1172, + "step": 16349 + }, + { + "epoch": 1.9388118107435077, + "grad_norm": 0.7082450791529888, + "learning_rate": 2.7449234104045673e-05, + "loss": 0.1386, + "step": 16350 + }, + { + "epoch": 1.9389303925056325, + "grad_norm": 1.1146440664974027, + "learning_rate": 2.7446845278727524e-05, + "loss": 0.1855, + "step": 16351 + }, + { + "epoch": 1.9390489742677577, + "grad_norm": 0.7530766642796596, + "learning_rate": 2.744445643085247e-05, + "loss": 0.167, + "step": 16352 + }, + { + "epoch": 1.9391675560298824, + "grad_norm": 0.8024650389847597, + "learning_rate": 2.744206756044252e-05, + "loss": 0.1425, + "step": 16353 + }, + { + "epoch": 1.9392861377920076, + "grad_norm": 0.731304097706486, + "learning_rate": 2.7439678667519704e-05, + "loss": 0.1403, + "step": 16354 + }, + { + "epoch": 1.9394047195541326, + "grad_norm": 0.8143238735796712, + "learning_rate": 2.743728975210604e-05, + "loss": 0.1612, + "step": 16355 + }, + { + "epoch": 1.9395233013162576, + "grad_norm": 0.9743668475176798, + "learning_rate": 2.7434900814223557e-05, + "loss": 0.1774, + "step": 16356 + }, + { + "epoch": 1.9396418830783826, + "grad_norm": 0.9130799474075061, + "learning_rate": 2.7432511853894272e-05, + "loss": 0.1582, + "step": 16357 + }, + { + "epoch": 1.9397604648405076, + "grad_norm": 0.7128480324089543, + "learning_rate": 2.7430122871140207e-05, + "loss": 0.1285, + "step": 16358 + }, + { + "epoch": 1.9398790466026326, + "grad_norm": 0.6083390742163582, + "learning_rate": 2.74277338659834e-05, + "loss": 0.1417, + "step": 16359 + }, + { + "epoch": 1.9399976283647575, + "grad_norm": 0.6018195734062577, + "learning_rate": 2.7425344838445853e-05, + "loss": 0.1315, + "step": 16360 + }, + { + "epoch": 1.9401162101268825, + "grad_norm": 0.7611954288820711, + "learning_rate": 2.742295578854961e-05, + "loss": 0.1466, + "step": 16361 + }, + { + "epoch": 1.9402347918890075, + "grad_norm": 0.6788752366781177, + "learning_rate": 2.742056671631668e-05, + "loss": 0.1486, + "step": 16362 + }, + { + "epoch": 1.9403533736511325, + "grad_norm": 0.785319084045297, + "learning_rate": 2.7418177621769097e-05, + "loss": 0.2194, + "step": 16363 + }, + { + "epoch": 1.9404719554132575, + "grad_norm": 0.9991907507158211, + "learning_rate": 2.741578850492888e-05, + "loss": 0.199, + "step": 16364 + }, + { + "epoch": 1.9405905371753824, + "grad_norm": 0.7381457393076227, + "learning_rate": 2.7413399365818048e-05, + "loss": 0.1919, + "step": 16365 + }, + { + "epoch": 1.9407091189375074, + "grad_norm": 0.8117235577213168, + "learning_rate": 2.741101020445864e-05, + "loss": 0.134, + "step": 16366 + }, + { + "epoch": 1.9408277006996324, + "grad_norm": 0.9458775134085682, + "learning_rate": 2.7408621020872675e-05, + "loss": 0.183, + "step": 16367 + }, + { + "epoch": 1.9409462824617574, + "grad_norm": 1.058519450650208, + "learning_rate": 2.7406231815082177e-05, + "loss": 0.2543, + "step": 16368 + }, + { + "epoch": 1.9410648642238824, + "grad_norm": 0.6233877642148292, + "learning_rate": 2.7403842587109173e-05, + "loss": 0.1255, + "step": 16369 + }, + { + "epoch": 1.9411834459860073, + "grad_norm": 0.7173998967989395, + "learning_rate": 2.740145333697569e-05, + "loss": 0.1979, + "step": 16370 + }, + { + "epoch": 1.9413020277481323, + "grad_norm": 0.9296352913349236, + "learning_rate": 2.7399064064703744e-05, + "loss": 0.1831, + "step": 16371 + }, + { + "epoch": 1.9414206095102573, + "grad_norm": 0.8824401959389832, + "learning_rate": 2.7396674770315378e-05, + "loss": 0.1788, + "step": 16372 + }, + { + "epoch": 1.9415391912723823, + "grad_norm": 0.7398227688437886, + "learning_rate": 2.7394285453832608e-05, + "loss": 0.1543, + "step": 16373 + }, + { + "epoch": 1.9416577730345073, + "grad_norm": 0.7402187303636013, + "learning_rate": 2.7391896115277464e-05, + "loss": 0.141, + "step": 16374 + }, + { + "epoch": 1.9417763547966322, + "grad_norm": 0.7574858219285326, + "learning_rate": 2.738950675467197e-05, + "loss": 0.1437, + "step": 16375 + }, + { + "epoch": 1.9418949365587572, + "grad_norm": 0.746793427846761, + "learning_rate": 2.7387117372038145e-05, + "loss": 0.1776, + "step": 16376 + }, + { + "epoch": 1.9420135183208822, + "grad_norm": 0.8147989342957679, + "learning_rate": 2.7384727967398026e-05, + "loss": 0.1737, + "step": 16377 + }, + { + "epoch": 1.9421321000830072, + "grad_norm": 1.143061005927861, + "learning_rate": 2.738233854077365e-05, + "loss": 0.262, + "step": 16378 + }, + { + "epoch": 1.9422506818451322, + "grad_norm": 0.9291367642948983, + "learning_rate": 2.737994909218703e-05, + "loss": 0.1923, + "step": 16379 + }, + { + "epoch": 1.9423692636072571, + "grad_norm": 0.9030873210704913, + "learning_rate": 2.73775596216602e-05, + "loss": 0.2449, + "step": 16380 + }, + { + "epoch": 1.9424878453693823, + "grad_norm": 1.0841538832186994, + "learning_rate": 2.7375170129215177e-05, + "loss": 0.1972, + "step": 16381 + }, + { + "epoch": 1.942606427131507, + "grad_norm": 0.7069322663339878, + "learning_rate": 2.7372780614874012e-05, + "loss": 0.1707, + "step": 16382 + }, + { + "epoch": 1.9427250088936323, + "grad_norm": 0.8454857553028744, + "learning_rate": 2.737039107865871e-05, + "loss": 0.1504, + "step": 16383 + }, + { + "epoch": 1.942843590655757, + "grad_norm": 0.8624521796313256, + "learning_rate": 2.7368001520591307e-05, + "loss": 0.1926, + "step": 16384 + }, + { + "epoch": 1.9429621724178823, + "grad_norm": 1.292689629658044, + "learning_rate": 2.7365611940693837e-05, + "loss": 0.16, + "step": 16385 + }, + { + "epoch": 1.943080754180007, + "grad_norm": 0.7811231430400088, + "learning_rate": 2.736322233898832e-05, + "loss": 0.1778, + "step": 16386 + }, + { + "epoch": 1.9431993359421322, + "grad_norm": 0.5827745965660086, + "learning_rate": 2.7360832715496803e-05, + "loss": 0.1215, + "step": 16387 + }, + { + "epoch": 1.943317917704257, + "grad_norm": 0.9308467819178561, + "learning_rate": 2.7358443070241298e-05, + "loss": 0.2123, + "step": 16388 + }, + { + "epoch": 1.9434364994663822, + "grad_norm": 0.9395802342889222, + "learning_rate": 2.7356053403243843e-05, + "loss": 0.2117, + "step": 16389 + }, + { + "epoch": 1.943555081228507, + "grad_norm": 1.6406697478601848, + "learning_rate": 2.7353663714526458e-05, + "loss": 0.3047, + "step": 16390 + }, + { + "epoch": 1.9436736629906322, + "grad_norm": 0.5318932204266572, + "learning_rate": 2.7351274004111182e-05, + "loss": 0.1398, + "step": 16391 + }, + { + "epoch": 1.943792244752757, + "grad_norm": 0.7738884716323468, + "learning_rate": 2.734888427202004e-05, + "loss": 0.1594, + "step": 16392 + }, + { + "epoch": 1.9439108265148821, + "grad_norm": 1.0457802250038821, + "learning_rate": 2.734649451827508e-05, + "loss": 0.2158, + "step": 16393 + }, + { + "epoch": 1.9440294082770069, + "grad_norm": 0.6743533589752824, + "learning_rate": 2.7344104742898302e-05, + "loss": 0.1269, + "step": 16394 + }, + { + "epoch": 1.944147990039132, + "grad_norm": 0.7698440110325474, + "learning_rate": 2.7341714945911756e-05, + "loss": 0.1598, + "step": 16395 + }, + { + "epoch": 1.9442665718012568, + "grad_norm": 0.7582315730944709, + "learning_rate": 2.7339325127337474e-05, + "loss": 0.1571, + "step": 16396 + }, + { + "epoch": 1.944385153563382, + "grad_norm": 0.8019494556277882, + "learning_rate": 2.733693528719748e-05, + "loss": 0.1554, + "step": 16397 + }, + { + "epoch": 1.9445037353255068, + "grad_norm": 0.8127280077126118, + "learning_rate": 2.7334545425513808e-05, + "loss": 0.1673, + "step": 16398 + }, + { + "epoch": 1.944622317087632, + "grad_norm": 1.210307901176505, + "learning_rate": 2.7332155542308497e-05, + "loss": 0.2264, + "step": 16399 + }, + { + "epoch": 1.9447408988497568, + "grad_norm": 0.8485233655157708, + "learning_rate": 2.7329765637603565e-05, + "loss": 0.2118, + "step": 16400 + }, + { + "epoch": 1.944859480611882, + "grad_norm": 0.9978498701023479, + "learning_rate": 2.7327375711421048e-05, + "loss": 0.17, + "step": 16401 + }, + { + "epoch": 1.9449780623740067, + "grad_norm": 0.8669581288470144, + "learning_rate": 2.732498576378299e-05, + "loss": 0.1411, + "step": 16402 + }, + { + "epoch": 1.945096644136132, + "grad_norm": 0.9519768887299245, + "learning_rate": 2.73225957947114e-05, + "loss": 0.2241, + "step": 16403 + }, + { + "epoch": 1.9452152258982567, + "grad_norm": 0.8321379612408843, + "learning_rate": 2.7320205804228337e-05, + "loss": 0.1779, + "step": 16404 + }, + { + "epoch": 1.9453338076603819, + "grad_norm": 1.1471754378922117, + "learning_rate": 2.7317815792355817e-05, + "loss": 0.2112, + "step": 16405 + }, + { + "epoch": 1.9454523894225069, + "grad_norm": 0.7565426420030024, + "learning_rate": 2.7315425759115885e-05, + "loss": 0.1563, + "step": 16406 + }, + { + "epoch": 1.9455709711846318, + "grad_norm": 0.8318162796998314, + "learning_rate": 2.731303570453056e-05, + "loss": 0.1345, + "step": 16407 + }, + { + "epoch": 1.9456895529467568, + "grad_norm": 0.7572990844022272, + "learning_rate": 2.7310645628621882e-05, + "loss": 0.1688, + "step": 16408 + }, + { + "epoch": 1.9458081347088818, + "grad_norm": 0.7020033291596982, + "learning_rate": 2.7308255531411887e-05, + "loss": 0.1214, + "step": 16409 + }, + { + "epoch": 1.9459267164710068, + "grad_norm": 0.892332692321887, + "learning_rate": 2.730586541292261e-05, + "loss": 0.23, + "step": 16410 + }, + { + "epoch": 1.9460452982331318, + "grad_norm": 0.6313933593535302, + "learning_rate": 2.7303475273176076e-05, + "loss": 0.1336, + "step": 16411 + }, + { + "epoch": 1.9461638799952568, + "grad_norm": 0.7454138663587897, + "learning_rate": 2.730108511219433e-05, + "loss": 0.1461, + "step": 16412 + }, + { + "epoch": 1.9462824617573817, + "grad_norm": 0.7386650471190996, + "learning_rate": 2.7298694929999398e-05, + "loss": 0.1684, + "step": 16413 + }, + { + "epoch": 1.9464010435195067, + "grad_norm": 0.8298127099209364, + "learning_rate": 2.7296304726613315e-05, + "loss": 0.1926, + "step": 16414 + }, + { + "epoch": 1.9465196252816317, + "grad_norm": 0.7079296657829692, + "learning_rate": 2.7293914502058117e-05, + "loss": 0.1352, + "step": 16415 + }, + { + "epoch": 1.9466382070437567, + "grad_norm": 0.5399515796729163, + "learning_rate": 2.729152425635584e-05, + "loss": 0.1086, + "step": 16416 + }, + { + "epoch": 1.9467567888058817, + "grad_norm": 0.6757614422334314, + "learning_rate": 2.7289133989528527e-05, + "loss": 0.119, + "step": 16417 + }, + { + "epoch": 1.9468753705680066, + "grad_norm": 0.6453150928410119, + "learning_rate": 2.72867437015982e-05, + "loss": 0.1583, + "step": 16418 + }, + { + "epoch": 1.9469939523301316, + "grad_norm": 0.834680827946555, + "learning_rate": 2.7284353392586904e-05, + "loss": 0.2113, + "step": 16419 + }, + { + "epoch": 1.9471125340922566, + "grad_norm": 0.9171516836566599, + "learning_rate": 2.728196306251667e-05, + "loss": 0.1557, + "step": 16420 + }, + { + "epoch": 1.9472311158543816, + "grad_norm": 0.852395521703101, + "learning_rate": 2.7279572711409534e-05, + "loss": 0.2354, + "step": 16421 + }, + { + "epoch": 1.9473496976165066, + "grad_norm": 0.8619263914951475, + "learning_rate": 2.7277182339287532e-05, + "loss": 0.1801, + "step": 16422 + }, + { + "epoch": 1.9474682793786315, + "grad_norm": 0.6426768880075667, + "learning_rate": 2.7274791946172706e-05, + "loss": 0.1053, + "step": 16423 + }, + { + "epoch": 1.9475868611407565, + "grad_norm": 0.8903693580422422, + "learning_rate": 2.7272401532087083e-05, + "loss": 0.1941, + "step": 16424 + }, + { + "epoch": 1.9477054429028815, + "grad_norm": 1.024760152382083, + "learning_rate": 2.72700110970527e-05, + "loss": 0.21, + "step": 16425 + }, + { + "epoch": 1.9478240246650065, + "grad_norm": 0.8347049789891208, + "learning_rate": 2.7267620641091602e-05, + "loss": 0.1545, + "step": 16426 + }, + { + "epoch": 1.9479426064271315, + "grad_norm": 0.8741021836107608, + "learning_rate": 2.726523016422583e-05, + "loss": 0.1484, + "step": 16427 + }, + { + "epoch": 1.9480611881892564, + "grad_norm": 0.9589723680434501, + "learning_rate": 2.7262839666477408e-05, + "loss": 0.172, + "step": 16428 + }, + { + "epoch": 1.9481797699513814, + "grad_norm": 0.6827656070098324, + "learning_rate": 2.7260449147868373e-05, + "loss": 0.148, + "step": 16429 + }, + { + "epoch": 1.9482983517135066, + "grad_norm": 0.8912472186789356, + "learning_rate": 2.725805860842078e-05, + "loss": 0.2131, + "step": 16430 + }, + { + "epoch": 1.9484169334756314, + "grad_norm": 0.6651441908962812, + "learning_rate": 2.7255668048156645e-05, + "loss": 0.1537, + "step": 16431 + }, + { + "epoch": 1.9485355152377566, + "grad_norm": 0.6448027717060855, + "learning_rate": 2.7253277467098026e-05, + "loss": 0.1155, + "step": 16432 + }, + { + "epoch": 1.9486540969998813, + "grad_norm": 0.8262713551052182, + "learning_rate": 2.725088686526695e-05, + "loss": 0.1538, + "step": 16433 + }, + { + "epoch": 1.9487726787620065, + "grad_norm": 0.9869174673776087, + "learning_rate": 2.724849624268545e-05, + "loss": 0.1977, + "step": 16434 + }, + { + "epoch": 1.9488912605241313, + "grad_norm": 0.6694721216147048, + "learning_rate": 2.7246105599375577e-05, + "loss": 0.1101, + "step": 16435 + }, + { + "epoch": 1.9490098422862565, + "grad_norm": 0.921490367336916, + "learning_rate": 2.724371493535937e-05, + "loss": 0.2207, + "step": 16436 + }, + { + "epoch": 1.9491284240483813, + "grad_norm": 0.9203854769799883, + "learning_rate": 2.7241324250658857e-05, + "loss": 0.1955, + "step": 16437 + }, + { + "epoch": 1.9492470058105065, + "grad_norm": 1.5312205098289042, + "learning_rate": 2.7238933545296086e-05, + "loss": 0.318, + "step": 16438 + }, + { + "epoch": 1.9493655875726312, + "grad_norm": 0.8208175879634465, + "learning_rate": 2.7236542819293087e-05, + "loss": 0.1808, + "step": 16439 + }, + { + "epoch": 1.9494841693347564, + "grad_norm": 0.6464959689423282, + "learning_rate": 2.7234152072671913e-05, + "loss": 0.1136, + "step": 16440 + }, + { + "epoch": 1.9496027510968812, + "grad_norm": 0.7349024878143514, + "learning_rate": 2.7231761305454595e-05, + "loss": 0.1563, + "step": 16441 + }, + { + "epoch": 1.9497213328590064, + "grad_norm": 1.2693008848118739, + "learning_rate": 2.722937051766317e-05, + "loss": 0.2746, + "step": 16442 + }, + { + "epoch": 1.9498399146211312, + "grad_norm": 1.0828850243065455, + "learning_rate": 2.722697970931969e-05, + "loss": 0.1803, + "step": 16443 + }, + { + "epoch": 1.9499584963832564, + "grad_norm": 0.7064120793781952, + "learning_rate": 2.722458888044618e-05, + "loss": 0.1357, + "step": 16444 + }, + { + "epoch": 1.9500770781453811, + "grad_norm": 0.7279213440228596, + "learning_rate": 2.722219803106469e-05, + "loss": 0.1697, + "step": 16445 + }, + { + "epoch": 1.9501956599075063, + "grad_norm": 0.9960093666776008, + "learning_rate": 2.7219807161197257e-05, + "loss": 0.2173, + "step": 16446 + }, + { + "epoch": 1.950314241669631, + "grad_norm": 0.9060928349742707, + "learning_rate": 2.7217416270865932e-05, + "loss": 0.1584, + "step": 16447 + }, + { + "epoch": 1.9504328234317563, + "grad_norm": 0.4904323795811234, + "learning_rate": 2.7215025360092745e-05, + "loss": 0.1099, + "step": 16448 + }, + { + "epoch": 1.950551405193881, + "grad_norm": 0.8074845600241674, + "learning_rate": 2.721263442889974e-05, + "loss": 0.139, + "step": 16449 + }, + { + "epoch": 1.9506699869560062, + "grad_norm": 1.105407215114659, + "learning_rate": 2.721024347730896e-05, + "loss": 0.2251, + "step": 16450 + }, + { + "epoch": 1.950788568718131, + "grad_norm": 1.448192083026777, + "learning_rate": 2.7207852505342444e-05, + "loss": 0.3364, + "step": 16451 + }, + { + "epoch": 1.9509071504802562, + "grad_norm": 0.5202596946909429, + "learning_rate": 2.7205461513022234e-05, + "loss": 0.1123, + "step": 16452 + }, + { + "epoch": 1.951025732242381, + "grad_norm": 0.8631740220265125, + "learning_rate": 2.7203070500370377e-05, + "loss": 0.1988, + "step": 16453 + }, + { + "epoch": 1.9511443140045062, + "grad_norm": 0.7214218569144734, + "learning_rate": 2.7200679467408906e-05, + "loss": 0.1705, + "step": 16454 + }, + { + "epoch": 1.9512628957666311, + "grad_norm": 0.8686523800743666, + "learning_rate": 2.7198288414159877e-05, + "loss": 0.1515, + "step": 16455 + }, + { + "epoch": 1.9513814775287561, + "grad_norm": 0.5706630343584121, + "learning_rate": 2.7195897340645315e-05, + "loss": 0.1244, + "step": 16456 + }, + { + "epoch": 1.951500059290881, + "grad_norm": 0.7454630893346157, + "learning_rate": 2.7193506246887278e-05, + "loss": 0.1682, + "step": 16457 + }, + { + "epoch": 1.951618641053006, + "grad_norm": 1.0342558395440251, + "learning_rate": 2.7191115132907807e-05, + "loss": 0.2257, + "step": 16458 + }, + { + "epoch": 1.951737222815131, + "grad_norm": 0.5128291243973956, + "learning_rate": 2.718872399872893e-05, + "loss": 0.1211, + "step": 16459 + }, + { + "epoch": 1.951855804577256, + "grad_norm": 0.5849978130548785, + "learning_rate": 2.7186332844372713e-05, + "loss": 0.1367, + "step": 16460 + }, + { + "epoch": 1.951974386339381, + "grad_norm": 0.6735204258300458, + "learning_rate": 2.7183941669861184e-05, + "loss": 0.193, + "step": 16461 + }, + { + "epoch": 1.952092968101506, + "grad_norm": 0.8738642127558361, + "learning_rate": 2.7181550475216394e-05, + "loss": 0.2495, + "step": 16462 + }, + { + "epoch": 1.952211549863631, + "grad_norm": 0.7788518849161493, + "learning_rate": 2.7179159260460378e-05, + "loss": 0.1376, + "step": 16463 + }, + { + "epoch": 1.952330131625756, + "grad_norm": 0.7397904468829547, + "learning_rate": 2.717676802561519e-05, + "loss": 0.1355, + "step": 16464 + }, + { + "epoch": 1.952448713387881, + "grad_norm": 0.8840622622493839, + "learning_rate": 2.7174376770702864e-05, + "loss": 0.2313, + "step": 16465 + }, + { + "epoch": 1.952567295150006, + "grad_norm": 0.8594010060670205, + "learning_rate": 2.7171985495745462e-05, + "loss": 0.1903, + "step": 16466 + }, + { + "epoch": 1.952685876912131, + "grad_norm": 0.647702481633997, + "learning_rate": 2.7169594200765008e-05, + "loss": 0.1585, + "step": 16467 + }, + { + "epoch": 1.952804458674256, + "grad_norm": 0.66641278504275, + "learning_rate": 2.716720288578356e-05, + "loss": 0.1344, + "step": 16468 + }, + { + "epoch": 1.9529230404363809, + "grad_norm": 0.8289383608158912, + "learning_rate": 2.7164811550823155e-05, + "loss": 0.1814, + "step": 16469 + }, + { + "epoch": 1.9530416221985059, + "grad_norm": 0.7672959345547058, + "learning_rate": 2.7162420195905847e-05, + "loss": 0.1555, + "step": 16470 + }, + { + "epoch": 1.9531602039606308, + "grad_norm": 0.912891594539876, + "learning_rate": 2.7160028821053675e-05, + "loss": 0.1629, + "step": 16471 + }, + { + "epoch": 1.9532787857227558, + "grad_norm": 0.7409091767764244, + "learning_rate": 2.7157637426288684e-05, + "loss": 0.1461, + "step": 16472 + }, + { + "epoch": 1.9533973674848808, + "grad_norm": 0.608858308766183, + "learning_rate": 2.7155246011632923e-05, + "loss": 0.1342, + "step": 16473 + }, + { + "epoch": 1.9535159492470058, + "grad_norm": 0.6836665160066846, + "learning_rate": 2.715285457710844e-05, + "loss": 0.1333, + "step": 16474 + }, + { + "epoch": 1.9536345310091308, + "grad_norm": 0.571941947554955, + "learning_rate": 2.715046312273727e-05, + "loss": 0.1095, + "step": 16475 + }, + { + "epoch": 1.9537531127712557, + "grad_norm": 0.8985937513376654, + "learning_rate": 2.7148071648541468e-05, + "loss": 0.1465, + "step": 16476 + }, + { + "epoch": 1.9538716945333807, + "grad_norm": 0.6740294221562099, + "learning_rate": 2.7145680154543084e-05, + "loss": 0.1269, + "step": 16477 + }, + { + "epoch": 1.9539902762955057, + "grad_norm": 1.3407414813262402, + "learning_rate": 2.7143288640764164e-05, + "loss": 0.2524, + "step": 16478 + }, + { + "epoch": 1.9541088580576307, + "grad_norm": 0.9449406028083756, + "learning_rate": 2.714089710722675e-05, + "loss": 0.196, + "step": 16479 + }, + { + "epoch": 1.9542274398197557, + "grad_norm": 1.0323321655916324, + "learning_rate": 2.713850555395288e-05, + "loss": 0.2091, + "step": 16480 + }, + { + "epoch": 1.9543460215818809, + "grad_norm": 0.6060339758311111, + "learning_rate": 2.7136113980964623e-05, + "loss": 0.1299, + "step": 16481 + }, + { + "epoch": 1.9544646033440056, + "grad_norm": 0.8322419769211384, + "learning_rate": 2.7133722388284004e-05, + "loss": 0.1522, + "step": 16482 + }, + { + "epoch": 1.9545831851061308, + "grad_norm": 1.0592457962418715, + "learning_rate": 2.713133077593309e-05, + "loss": 0.2363, + "step": 16483 + }, + { + "epoch": 1.9547017668682556, + "grad_norm": 0.8432522120667064, + "learning_rate": 2.7128939143933914e-05, + "loss": 0.1895, + "step": 16484 + }, + { + "epoch": 1.9548203486303808, + "grad_norm": 0.9619094737938466, + "learning_rate": 2.7126547492308536e-05, + "loss": 0.189, + "step": 16485 + }, + { + "epoch": 1.9549389303925055, + "grad_norm": 0.6759590331290899, + "learning_rate": 2.712415582107899e-05, + "loss": 0.1328, + "step": 16486 + }, + { + "epoch": 1.9550575121546307, + "grad_norm": 0.6619705200930935, + "learning_rate": 2.712176413026733e-05, + "loss": 0.176, + "step": 16487 + }, + { + "epoch": 1.9551760939167555, + "grad_norm": 0.8593371676791806, + "learning_rate": 2.7119372419895618e-05, + "loss": 0.1782, + "step": 16488 + }, + { + "epoch": 1.9552946756788807, + "grad_norm": 0.9361338359161707, + "learning_rate": 2.7116980689985883e-05, + "loss": 0.1528, + "step": 16489 + }, + { + "epoch": 1.9554132574410055, + "grad_norm": 1.0437144047949376, + "learning_rate": 2.7114588940560187e-05, + "loss": 0.2044, + "step": 16490 + }, + { + "epoch": 1.9555318392031307, + "grad_norm": 0.7949693411656087, + "learning_rate": 2.7112197171640574e-05, + "loss": 0.1815, + "step": 16491 + }, + { + "epoch": 1.9556504209652554, + "grad_norm": 1.42201322754865, + "learning_rate": 2.7109805383249093e-05, + "loss": 0.3236, + "step": 16492 + }, + { + "epoch": 1.9557690027273806, + "grad_norm": 0.7451534271756807, + "learning_rate": 2.710741357540779e-05, + "loss": 0.1728, + "step": 16493 + }, + { + "epoch": 1.9558875844895054, + "grad_norm": 0.7328613903464013, + "learning_rate": 2.7105021748138716e-05, + "loss": 0.1875, + "step": 16494 + }, + { + "epoch": 1.9560061662516306, + "grad_norm": 0.7306657605252423, + "learning_rate": 2.7102629901463928e-05, + "loss": 0.1169, + "step": 16495 + }, + { + "epoch": 1.9561247480137554, + "grad_norm": 1.559609201058611, + "learning_rate": 2.710023803540547e-05, + "loss": 0.1666, + "step": 16496 + }, + { + "epoch": 1.9562433297758806, + "grad_norm": 0.969975314832474, + "learning_rate": 2.709784614998539e-05, + "loss": 0.2072, + "step": 16497 + }, + { + "epoch": 1.9563619115380053, + "grad_norm": 1.227099851526805, + "learning_rate": 2.7095454245225748e-05, + "loss": 0.1995, + "step": 16498 + }, + { + "epoch": 1.9564804933001305, + "grad_norm": 0.6888542266660498, + "learning_rate": 2.7093062321148584e-05, + "loss": 0.1166, + "step": 16499 + }, + { + "epoch": 1.9565990750622553, + "grad_norm": 0.7439167846939587, + "learning_rate": 2.7090670377775945e-05, + "loss": 0.1793, + "step": 16500 + }, + { + "epoch": 1.9567176568243805, + "grad_norm": 0.7160934404120536, + "learning_rate": 2.7088278415129898e-05, + "loss": 0.1752, + "step": 16501 + }, + { + "epoch": 1.9568362385865052, + "grad_norm": 0.9201499955829006, + "learning_rate": 2.7085886433232476e-05, + "loss": 0.1644, + "step": 16502 + }, + { + "epoch": 1.9569548203486304, + "grad_norm": 0.6842413313526383, + "learning_rate": 2.708349443210575e-05, + "loss": 0.1165, + "step": 16503 + }, + { + "epoch": 1.9570734021107552, + "grad_norm": 0.8758838169318328, + "learning_rate": 2.708110241177175e-05, + "loss": 0.1831, + "step": 16504 + }, + { + "epoch": 1.9571919838728804, + "grad_norm": 0.993473493092241, + "learning_rate": 2.707871037225255e-05, + "loss": 0.1487, + "step": 16505 + }, + { + "epoch": 1.9573105656350054, + "grad_norm": 0.6877421075801853, + "learning_rate": 2.7076318313570177e-05, + "loss": 0.1556, + "step": 16506 + }, + { + "epoch": 1.9574291473971304, + "grad_norm": 0.9461478852080826, + "learning_rate": 2.70739262357467e-05, + "loss": 0.2483, + "step": 16507 + }, + { + "epoch": 1.9575477291592553, + "grad_norm": 1.2431777479351123, + "learning_rate": 2.7071534138804166e-05, + "loss": 0.2419, + "step": 16508 + }, + { + "epoch": 1.9576663109213803, + "grad_norm": 0.7443905324229713, + "learning_rate": 2.7069142022764633e-05, + "loss": 0.1421, + "step": 16509 + }, + { + "epoch": 1.9577848926835053, + "grad_norm": 0.8744163205151868, + "learning_rate": 2.706674988765014e-05, + "loss": 0.1749, + "step": 16510 + }, + { + "epoch": 1.9579034744456303, + "grad_norm": 0.7428399460025149, + "learning_rate": 2.7064357733482753e-05, + "loss": 0.1359, + "step": 16511 + }, + { + "epoch": 1.9580220562077553, + "grad_norm": 0.7244064007065121, + "learning_rate": 2.7061965560284512e-05, + "loss": 0.1796, + "step": 16512 + }, + { + "epoch": 1.9581406379698802, + "grad_norm": 0.798240407107549, + "learning_rate": 2.7059573368077483e-05, + "loss": 0.1732, + "step": 16513 + }, + { + "epoch": 1.9582592197320052, + "grad_norm": 0.5644818622359752, + "learning_rate": 2.705718115688371e-05, + "loss": 0.117, + "step": 16514 + }, + { + "epoch": 1.9583778014941302, + "grad_norm": 0.5744605797116448, + "learning_rate": 2.705478892672525e-05, + "loss": 0.1323, + "step": 16515 + }, + { + "epoch": 1.9584963832562552, + "grad_norm": 0.6663017560383973, + "learning_rate": 2.705239667762416e-05, + "loss": 0.128, + "step": 16516 + }, + { + "epoch": 1.9586149650183802, + "grad_norm": 0.8718954404925776, + "learning_rate": 2.705000440960248e-05, + "loss": 0.1687, + "step": 16517 + }, + { + "epoch": 1.9587335467805052, + "grad_norm": 0.7067106272465382, + "learning_rate": 2.7047612122682288e-05, + "loss": 0.1647, + "step": 16518 + }, + { + "epoch": 1.9588521285426301, + "grad_norm": 0.9135172075373489, + "learning_rate": 2.7045219816885613e-05, + "loss": 0.1944, + "step": 16519 + }, + { + "epoch": 1.9589707103047551, + "grad_norm": 0.915151370210491, + "learning_rate": 2.7042827492234524e-05, + "loss": 0.1546, + "step": 16520 + }, + { + "epoch": 1.95908929206688, + "grad_norm": 0.7479211474349213, + "learning_rate": 2.7040435148751063e-05, + "loss": 0.1691, + "step": 16521 + }, + { + "epoch": 1.959207873829005, + "grad_norm": 1.101924943870033, + "learning_rate": 2.70380427864573e-05, + "loss": 0.2427, + "step": 16522 + }, + { + "epoch": 1.95932645559113, + "grad_norm": 0.8908154424730587, + "learning_rate": 2.703565040537528e-05, + "loss": 0.1813, + "step": 16523 + }, + { + "epoch": 1.959445037353255, + "grad_norm": 0.9872005773986313, + "learning_rate": 2.7033258005527055e-05, + "loss": 0.2132, + "step": 16524 + }, + { + "epoch": 1.95956361911538, + "grad_norm": 0.6075991326308783, + "learning_rate": 2.7030865586934682e-05, + "loss": 0.1335, + "step": 16525 + }, + { + "epoch": 1.959682200877505, + "grad_norm": 0.8755033145280114, + "learning_rate": 2.7028473149620232e-05, + "loss": 0.1684, + "step": 16526 + }, + { + "epoch": 1.95980078263963, + "grad_norm": 0.8681917005486719, + "learning_rate": 2.7026080693605737e-05, + "loss": 0.1375, + "step": 16527 + }, + { + "epoch": 1.959919364401755, + "grad_norm": 1.0427891547465393, + "learning_rate": 2.7023688218913267e-05, + "loss": 0.1972, + "step": 16528 + }, + { + "epoch": 1.96003794616388, + "grad_norm": 0.8029077765828267, + "learning_rate": 2.702129572556487e-05, + "loss": 0.1649, + "step": 16529 + }, + { + "epoch": 1.9601565279260051, + "grad_norm": 0.7498292623449471, + "learning_rate": 2.7018903213582608e-05, + "loss": 0.1561, + "step": 16530 + }, + { + "epoch": 1.96027510968813, + "grad_norm": 0.8697453273812679, + "learning_rate": 2.7016510682988532e-05, + "loss": 0.1774, + "step": 16531 + }, + { + "epoch": 1.960393691450255, + "grad_norm": 1.04718749242534, + "learning_rate": 2.70141181338047e-05, + "loss": 0.19, + "step": 16532 + }, + { + "epoch": 1.9605122732123799, + "grad_norm": 0.8236271222683497, + "learning_rate": 2.701172556605317e-05, + "loss": 0.165, + "step": 16533 + }, + { + "epoch": 1.960630854974505, + "grad_norm": 0.7674643083116511, + "learning_rate": 2.700933297975599e-05, + "loss": 0.1257, + "step": 16534 + }, + { + "epoch": 1.9607494367366298, + "grad_norm": 1.1139014985542883, + "learning_rate": 2.7006940374935234e-05, + "loss": 0.2641, + "step": 16535 + }, + { + "epoch": 1.960868018498755, + "grad_norm": 0.8124603543357449, + "learning_rate": 2.7004547751612945e-05, + "loss": 0.2222, + "step": 16536 + }, + { + "epoch": 1.9609866002608798, + "grad_norm": 0.8525668558538714, + "learning_rate": 2.700215510981119e-05, + "loss": 0.1926, + "step": 16537 + }, + { + "epoch": 1.961105182023005, + "grad_norm": 0.5998973795969763, + "learning_rate": 2.6999762449552015e-05, + "loss": 0.126, + "step": 16538 + }, + { + "epoch": 1.9612237637851297, + "grad_norm": 0.7160057616007139, + "learning_rate": 2.6997369770857484e-05, + "loss": 0.1546, + "step": 16539 + }, + { + "epoch": 1.961342345547255, + "grad_norm": 0.6356856040048676, + "learning_rate": 2.6994977073749643e-05, + "loss": 0.1406, + "step": 16540 + }, + { + "epoch": 1.9614609273093797, + "grad_norm": 0.5780222789842443, + "learning_rate": 2.6992584358250572e-05, + "loss": 0.1408, + "step": 16541 + }, + { + "epoch": 1.961579509071505, + "grad_norm": 0.7222747002664324, + "learning_rate": 2.699019162438231e-05, + "loss": 0.1978, + "step": 16542 + }, + { + "epoch": 1.9616980908336297, + "grad_norm": 0.6221404230146084, + "learning_rate": 2.6987798872166926e-05, + "loss": 0.1652, + "step": 16543 + }, + { + "epoch": 1.9618166725957549, + "grad_norm": 0.7532997175047124, + "learning_rate": 2.698540610162647e-05, + "loss": 0.1481, + "step": 16544 + }, + { + "epoch": 1.9619352543578796, + "grad_norm": 0.7394093738772775, + "learning_rate": 2.6983013312783007e-05, + "loss": 0.1726, + "step": 16545 + }, + { + "epoch": 1.9620538361200048, + "grad_norm": 1.204200053517064, + "learning_rate": 2.6980620505658593e-05, + "loss": 0.2692, + "step": 16546 + }, + { + "epoch": 1.9621724178821296, + "grad_norm": 0.7859001873335504, + "learning_rate": 2.6978227680275286e-05, + "loss": 0.1542, + "step": 16547 + }, + { + "epoch": 1.9622909996442548, + "grad_norm": 0.8481919099150347, + "learning_rate": 2.6975834836655143e-05, + "loss": 0.1879, + "step": 16548 + }, + { + "epoch": 1.9624095814063796, + "grad_norm": 0.7239149716241182, + "learning_rate": 2.6973441974820228e-05, + "loss": 0.1597, + "step": 16549 + }, + { + "epoch": 1.9625281631685048, + "grad_norm": 0.7367964903229638, + "learning_rate": 2.6971049094792606e-05, + "loss": 0.1723, + "step": 16550 + }, + { + "epoch": 1.9626467449306295, + "grad_norm": 1.113211773473028, + "learning_rate": 2.6968656196594315e-05, + "loss": 0.1592, + "step": 16551 + }, + { + "epoch": 1.9627653266927547, + "grad_norm": 1.1251086793883442, + "learning_rate": 2.696626328024744e-05, + "loss": 0.2209, + "step": 16552 + }, + { + "epoch": 1.9628839084548795, + "grad_norm": 0.7078810502497369, + "learning_rate": 2.6963870345774024e-05, + "loss": 0.1134, + "step": 16553 + }, + { + "epoch": 1.9630024902170047, + "grad_norm": 0.9566022699671289, + "learning_rate": 2.6961477393196126e-05, + "loss": 0.188, + "step": 16554 + }, + { + "epoch": 1.9631210719791297, + "grad_norm": 0.9886913003032017, + "learning_rate": 2.6959084422535813e-05, + "loss": 0.2063, + "step": 16555 + }, + { + "epoch": 1.9632396537412546, + "grad_norm": 1.6578754562000646, + "learning_rate": 2.6956691433815152e-05, + "loss": 0.3195, + "step": 16556 + }, + { + "epoch": 1.9633582355033796, + "grad_norm": 0.8228138678375121, + "learning_rate": 2.6954298427056192e-05, + "loss": 0.1979, + "step": 16557 + }, + { + "epoch": 1.9634768172655046, + "grad_norm": 1.1795256960888452, + "learning_rate": 2.695190540228099e-05, + "loss": 0.2476, + "step": 16558 + }, + { + "epoch": 1.9635953990276296, + "grad_norm": 0.777647402648751, + "learning_rate": 2.6949512359511625e-05, + "loss": 0.1687, + "step": 16559 + }, + { + "epoch": 1.9637139807897546, + "grad_norm": 0.7948791011403281, + "learning_rate": 2.694711929877014e-05, + "loss": 0.1345, + "step": 16560 + }, + { + "epoch": 1.9638325625518795, + "grad_norm": 0.9385023832416008, + "learning_rate": 2.69447262200786e-05, + "loss": 0.2127, + "step": 16561 + }, + { + "epoch": 1.9639511443140045, + "grad_norm": 1.455590392159929, + "learning_rate": 2.6942333123459075e-05, + "loss": 0.2126, + "step": 16562 + }, + { + "epoch": 1.9640697260761295, + "grad_norm": 0.9423590928010162, + "learning_rate": 2.6939940008933618e-05, + "loss": 0.1776, + "step": 16563 + }, + { + "epoch": 1.9641883078382545, + "grad_norm": 0.7276505991346319, + "learning_rate": 2.693754687652429e-05, + "loss": 0.1112, + "step": 16564 + }, + { + "epoch": 1.9643068896003795, + "grad_norm": 0.7415338058040091, + "learning_rate": 2.6935153726253165e-05, + "loss": 0.1578, + "step": 16565 + }, + { + "epoch": 1.9644254713625044, + "grad_norm": 0.5990731921463058, + "learning_rate": 2.693276055814229e-05, + "loss": 0.1395, + "step": 16566 + }, + { + "epoch": 1.9645440531246294, + "grad_norm": 1.2622729789311924, + "learning_rate": 2.6930367372213733e-05, + "loss": 0.2209, + "step": 16567 + }, + { + "epoch": 1.9646626348867544, + "grad_norm": 0.8498867807188308, + "learning_rate": 2.6927974168489557e-05, + "loss": 0.1882, + "step": 16568 + }, + { + "epoch": 1.9647812166488794, + "grad_norm": 0.6309768493962936, + "learning_rate": 2.6925580946991824e-05, + "loss": 0.1371, + "step": 16569 + }, + { + "epoch": 1.9648997984110044, + "grad_norm": 0.9190544658811112, + "learning_rate": 2.692318770774259e-05, + "loss": 0.1749, + "step": 16570 + }, + { + "epoch": 1.9650183801731294, + "grad_norm": 0.9969878792619532, + "learning_rate": 2.692079445076393e-05, + "loss": 0.2143, + "step": 16571 + }, + { + "epoch": 1.9651369619352543, + "grad_norm": 0.7589432066192396, + "learning_rate": 2.6918401176077895e-05, + "loss": 0.1814, + "step": 16572 + }, + { + "epoch": 1.9652555436973793, + "grad_norm": 1.1533558753682775, + "learning_rate": 2.6916007883706557e-05, + "loss": 0.2298, + "step": 16573 + }, + { + "epoch": 1.9653741254595043, + "grad_norm": 0.8380815348800528, + "learning_rate": 2.6913614573671974e-05, + "loss": 0.1813, + "step": 16574 + }, + { + "epoch": 1.9654927072216293, + "grad_norm": 0.7731089431054058, + "learning_rate": 2.691122124599621e-05, + "loss": 0.1348, + "step": 16575 + }, + { + "epoch": 1.9656112889837543, + "grad_norm": 0.7326949943866915, + "learning_rate": 2.690882790070134e-05, + "loss": 0.1589, + "step": 16576 + }, + { + "epoch": 1.9657298707458792, + "grad_norm": 0.7076043935041055, + "learning_rate": 2.6906434537809404e-05, + "loss": 0.1704, + "step": 16577 + }, + { + "epoch": 1.9658484525080042, + "grad_norm": 0.7057399487194216, + "learning_rate": 2.690404115734249e-05, + "loss": 0.1299, + "step": 16578 + }, + { + "epoch": 1.9659670342701292, + "grad_norm": 0.9317370924309083, + "learning_rate": 2.6901647759322647e-05, + "loss": 0.2007, + "step": 16579 + }, + { + "epoch": 1.9660856160322542, + "grad_norm": 0.8550707742518132, + "learning_rate": 2.6899254343771946e-05, + "loss": 0.1646, + "step": 16580 + }, + { + "epoch": 1.9662041977943794, + "grad_norm": 0.893691361398224, + "learning_rate": 2.6896860910712444e-05, + "loss": 0.1785, + "step": 16581 + }, + { + "epoch": 1.9663227795565041, + "grad_norm": 0.8177559930284295, + "learning_rate": 2.689446746016621e-05, + "loss": 0.1859, + "step": 16582 + }, + { + "epoch": 1.9664413613186293, + "grad_norm": 0.8184391173826555, + "learning_rate": 2.6892073992155315e-05, + "loss": 0.1474, + "step": 16583 + }, + { + "epoch": 1.966559943080754, + "grad_norm": 0.7646538600748064, + "learning_rate": 2.688968050670182e-05, + "loss": 0.1465, + "step": 16584 + }, + { + "epoch": 1.9666785248428793, + "grad_norm": 1.3640930798883208, + "learning_rate": 2.6887287003827776e-05, + "loss": 0.1493, + "step": 16585 + }, + { + "epoch": 1.966797106605004, + "grad_norm": 0.7868113372509428, + "learning_rate": 2.6884893483555268e-05, + "loss": 0.1492, + "step": 16586 + }, + { + "epoch": 1.9669156883671293, + "grad_norm": 0.8051001913681641, + "learning_rate": 2.6882499945906357e-05, + "loss": 0.1751, + "step": 16587 + }, + { + "epoch": 1.967034270129254, + "grad_norm": 0.7135540599428525, + "learning_rate": 2.68801063909031e-05, + "loss": 0.1441, + "step": 16588 + }, + { + "epoch": 1.9671528518913792, + "grad_norm": 1.1296237550123442, + "learning_rate": 2.6877712818567573e-05, + "loss": 0.2837, + "step": 16589 + }, + { + "epoch": 1.967271433653504, + "grad_norm": 0.7342182561587253, + "learning_rate": 2.687531922892183e-05, + "loss": 0.1653, + "step": 16590 + }, + { + "epoch": 1.9673900154156292, + "grad_norm": 0.6979601421588587, + "learning_rate": 2.6872925621987948e-05, + "loss": 0.1165, + "step": 16591 + }, + { + "epoch": 1.967508597177754, + "grad_norm": 0.8695124212484039, + "learning_rate": 2.6870531997787984e-05, + "loss": 0.2135, + "step": 16592 + }, + { + "epoch": 1.9676271789398792, + "grad_norm": 0.8106851159846735, + "learning_rate": 2.6868138356344008e-05, + "loss": 0.1524, + "step": 16593 + }, + { + "epoch": 1.967745760702004, + "grad_norm": 1.1054388525684375, + "learning_rate": 2.686574469767809e-05, + "loss": 0.2279, + "step": 16594 + }, + { + "epoch": 1.9678643424641291, + "grad_norm": 0.5482725628502011, + "learning_rate": 2.6863351021812298e-05, + "loss": 0.121, + "step": 16595 + }, + { + "epoch": 1.9679829242262539, + "grad_norm": 0.7923884220987686, + "learning_rate": 2.6860957328768688e-05, + "loss": 0.1712, + "step": 16596 + }, + { + "epoch": 1.968101505988379, + "grad_norm": 0.6333308931433818, + "learning_rate": 2.6858563618569336e-05, + "loss": 0.1277, + "step": 16597 + }, + { + "epoch": 1.9682200877505038, + "grad_norm": 0.7231753197549791, + "learning_rate": 2.6856169891236304e-05, + "loss": 0.1353, + "step": 16598 + }, + { + "epoch": 1.968338669512629, + "grad_norm": 0.7132663422154958, + "learning_rate": 2.685377614679167e-05, + "loss": 0.1408, + "step": 16599 + }, + { + "epoch": 1.9684572512747538, + "grad_norm": 1.5301320865223567, + "learning_rate": 2.6851382385257485e-05, + "loss": 0.3425, + "step": 16600 + }, + { + "epoch": 1.968575833036879, + "grad_norm": 0.7808880347204206, + "learning_rate": 2.684898860665583e-05, + "loss": 0.1096, + "step": 16601 + }, + { + "epoch": 1.9686944147990038, + "grad_norm": 0.6058943629175368, + "learning_rate": 2.6846594811008763e-05, + "loss": 0.122, + "step": 16602 + }, + { + "epoch": 1.968812996561129, + "grad_norm": 0.747390341729377, + "learning_rate": 2.6844200998338357e-05, + "loss": 0.1502, + "step": 16603 + }, + { + "epoch": 1.9689315783232537, + "grad_norm": 0.8704839991326623, + "learning_rate": 2.6841807168666676e-05, + "loss": 0.1814, + "step": 16604 + }, + { + "epoch": 1.969050160085379, + "grad_norm": 0.770487641127278, + "learning_rate": 2.6839413322015795e-05, + "loss": 0.1868, + "step": 16605 + }, + { + "epoch": 1.969168741847504, + "grad_norm": 0.913773267039488, + "learning_rate": 2.6837019458407786e-05, + "loss": 0.2069, + "step": 16606 + }, + { + "epoch": 1.9692873236096289, + "grad_norm": 0.5921731462919477, + "learning_rate": 2.68346255778647e-05, + "loss": 0.1349, + "step": 16607 + }, + { + "epoch": 1.9694059053717539, + "grad_norm": 1.0182525409164622, + "learning_rate": 2.6832231680408626e-05, + "loss": 0.2173, + "step": 16608 + }, + { + "epoch": 1.9695244871338788, + "grad_norm": 0.9393029778679773, + "learning_rate": 2.682983776606161e-05, + "loss": 0.1424, + "step": 16609 + }, + { + "epoch": 1.9696430688960038, + "grad_norm": 0.9608571217602713, + "learning_rate": 2.6827443834845743e-05, + "loss": 0.216, + "step": 16610 + }, + { + "epoch": 1.9697616506581288, + "grad_norm": 0.6596452777394715, + "learning_rate": 2.682504988678308e-05, + "loss": 0.1319, + "step": 16611 + }, + { + "epoch": 1.9698802324202538, + "grad_norm": 0.9606461461726469, + "learning_rate": 2.6822655921895695e-05, + "loss": 0.204, + "step": 16612 + }, + { + "epoch": 1.9699988141823788, + "grad_norm": 1.1176293037247935, + "learning_rate": 2.682026194020566e-05, + "loss": 0.1453, + "step": 16613 + }, + { + "epoch": 1.9701173959445037, + "grad_norm": 1.3917511399702827, + "learning_rate": 2.681786794173504e-05, + "loss": 0.3473, + "step": 16614 + }, + { + "epoch": 1.9702359777066287, + "grad_norm": 1.1862650280459863, + "learning_rate": 2.6815473926505912e-05, + "loss": 0.2844, + "step": 16615 + }, + { + "epoch": 1.9703545594687537, + "grad_norm": 0.7199899265324935, + "learning_rate": 2.6813079894540333e-05, + "loss": 0.1474, + "step": 16616 + }, + { + "epoch": 1.9704731412308787, + "grad_norm": 0.9958322698673664, + "learning_rate": 2.681068584586039e-05, + "loss": 0.2013, + "step": 16617 + }, + { + "epoch": 1.9705917229930037, + "grad_norm": 1.0000753191581726, + "learning_rate": 2.6808291780488136e-05, + "loss": 0.2339, + "step": 16618 + }, + { + "epoch": 1.9707103047551287, + "grad_norm": 0.6604234604344458, + "learning_rate": 2.6805897698445654e-05, + "loss": 0.1409, + "step": 16619 + }, + { + "epoch": 1.9708288865172536, + "grad_norm": 0.6505033432578616, + "learning_rate": 2.6803503599755007e-05, + "loss": 0.1346, + "step": 16620 + }, + { + "epoch": 1.9709474682793786, + "grad_norm": 0.6745983159506549, + "learning_rate": 2.6801109484438276e-05, + "loss": 0.1378, + "step": 16621 + }, + { + "epoch": 1.9710660500415036, + "grad_norm": 0.6211787126136665, + "learning_rate": 2.679871535251751e-05, + "loss": 0.1328, + "step": 16622 + }, + { + "epoch": 1.9711846318036286, + "grad_norm": 0.5894084860811243, + "learning_rate": 2.67963212040148e-05, + "loss": 0.1274, + "step": 16623 + }, + { + "epoch": 1.9713032135657536, + "grad_norm": 0.45540326031448286, + "learning_rate": 2.679392703895221e-05, + "loss": 0.1071, + "step": 16624 + }, + { + "epoch": 1.9714217953278785, + "grad_norm": 0.528735025964823, + "learning_rate": 2.679153285735182e-05, + "loss": 0.0822, + "step": 16625 + }, + { + "epoch": 1.9715403770900035, + "grad_norm": 0.5924799605818535, + "learning_rate": 2.678913865923569e-05, + "loss": 0.1269, + "step": 16626 + }, + { + "epoch": 1.9716589588521285, + "grad_norm": 1.1076430098550323, + "learning_rate": 2.67867444446259e-05, + "loss": 0.2439, + "step": 16627 + }, + { + "epoch": 1.9717775406142535, + "grad_norm": 0.6810233459505042, + "learning_rate": 2.6784350213544508e-05, + "loss": 0.1358, + "step": 16628 + }, + { + "epoch": 1.9718961223763785, + "grad_norm": 0.7923239032533719, + "learning_rate": 2.6781955966013594e-05, + "loss": 0.1832, + "step": 16629 + }, + { + "epoch": 1.9720147041385037, + "grad_norm": 0.8211981450119527, + "learning_rate": 2.6779561702055246e-05, + "loss": 0.1811, + "step": 16630 + }, + { + "epoch": 1.9721332859006284, + "grad_norm": 0.9893360357653131, + "learning_rate": 2.6777167421691505e-05, + "loss": 0.2093, + "step": 16631 + }, + { + "epoch": 1.9722518676627536, + "grad_norm": 0.775755598045236, + "learning_rate": 2.6774773124944467e-05, + "loss": 0.1438, + "step": 16632 + }, + { + "epoch": 1.9723704494248784, + "grad_norm": 0.9098315965299378, + "learning_rate": 2.6772378811836195e-05, + "loss": 0.1941, + "step": 16633 + }, + { + "epoch": 1.9724890311870036, + "grad_norm": 0.8753262439117093, + "learning_rate": 2.6769984482388766e-05, + "loss": 0.2103, + "step": 16634 + }, + { + "epoch": 1.9726076129491283, + "grad_norm": 0.766947615716256, + "learning_rate": 2.676759013662425e-05, + "loss": 0.1964, + "step": 16635 + }, + { + "epoch": 1.9727261947112535, + "grad_norm": 0.8379974278734846, + "learning_rate": 2.676519577456472e-05, + "loss": 0.1614, + "step": 16636 + }, + { + "epoch": 1.9728447764733783, + "grad_norm": 1.0037501682092764, + "learning_rate": 2.676280139623225e-05, + "loss": 0.2116, + "step": 16637 + }, + { + "epoch": 1.9729633582355035, + "grad_norm": 1.1657522941883365, + "learning_rate": 2.6760407001648917e-05, + "loss": 0.1887, + "step": 16638 + }, + { + "epoch": 1.9730819399976283, + "grad_norm": 0.5948959504948274, + "learning_rate": 2.6758012590836783e-05, + "loss": 0.1377, + "step": 16639 + }, + { + "epoch": 1.9732005217597535, + "grad_norm": 0.7371245321180657, + "learning_rate": 2.6755618163817935e-05, + "loss": 0.1346, + "step": 16640 + }, + { + "epoch": 1.9733191035218782, + "grad_norm": 0.7608633923570466, + "learning_rate": 2.675322372061444e-05, + "loss": 0.1517, + "step": 16641 + }, + { + "epoch": 1.9734376852840034, + "grad_norm": 0.57595914596734, + "learning_rate": 2.6750829261248367e-05, + "loss": 0.1172, + "step": 16642 + }, + { + "epoch": 1.9735562670461282, + "grad_norm": 0.9958976191040615, + "learning_rate": 2.6748434785741795e-05, + "loss": 0.2094, + "step": 16643 + }, + { + "epoch": 1.9736748488082534, + "grad_norm": 0.816306266084852, + "learning_rate": 2.67460402941168e-05, + "loss": 0.1763, + "step": 16644 + }, + { + "epoch": 1.9737934305703781, + "grad_norm": 0.6759700751891955, + "learning_rate": 2.6743645786395453e-05, + "loss": 0.1752, + "step": 16645 + }, + { + "epoch": 1.9739120123325034, + "grad_norm": 1.099885530803151, + "learning_rate": 2.6741251262599838e-05, + "loss": 0.2141, + "step": 16646 + }, + { + "epoch": 1.974030594094628, + "grad_norm": 1.458129218322792, + "learning_rate": 2.6738856722752016e-05, + "loss": 0.3347, + "step": 16647 + }, + { + "epoch": 1.9741491758567533, + "grad_norm": 0.7135795023003078, + "learning_rate": 2.6736462166874066e-05, + "loss": 0.1468, + "step": 16648 + }, + { + "epoch": 1.974267757618878, + "grad_norm": 0.8896366335706559, + "learning_rate": 2.673406759498807e-05, + "loss": 0.2069, + "step": 16649 + }, + { + "epoch": 1.9743863393810033, + "grad_norm": 0.6851255761746561, + "learning_rate": 2.6731673007116093e-05, + "loss": 0.1307, + "step": 16650 + }, + { + "epoch": 1.974504921143128, + "grad_norm": 1.3551676690043897, + "learning_rate": 2.6729278403280218e-05, + "loss": 0.3068, + "step": 16651 + }, + { + "epoch": 1.9746235029052532, + "grad_norm": 0.6844252931122223, + "learning_rate": 2.6726883783502508e-05, + "loss": 0.1395, + "step": 16652 + }, + { + "epoch": 1.974742084667378, + "grad_norm": 1.0680400745883831, + "learning_rate": 2.6724489147805048e-05, + "loss": 0.2328, + "step": 16653 + }, + { + "epoch": 1.9748606664295032, + "grad_norm": 0.7780248936626137, + "learning_rate": 2.6722094496209922e-05, + "loss": 0.1426, + "step": 16654 + }, + { + "epoch": 1.9749792481916282, + "grad_norm": 1.011471855154614, + "learning_rate": 2.6719699828739192e-05, + "loss": 0.2384, + "step": 16655 + }, + { + "epoch": 1.9750978299537532, + "grad_norm": 0.7177761672341957, + "learning_rate": 2.6717305145414933e-05, + "loss": 0.1519, + "step": 16656 + }, + { + "epoch": 1.9752164117158781, + "grad_norm": 0.9548696664340259, + "learning_rate": 2.6714910446259235e-05, + "loss": 0.1843, + "step": 16657 + }, + { + "epoch": 1.9753349934780031, + "grad_norm": 0.7472940646774819, + "learning_rate": 2.6712515731294162e-05, + "loss": 0.1479, + "step": 16658 + }, + { + "epoch": 1.975453575240128, + "grad_norm": 0.9167677457225722, + "learning_rate": 2.6710121000541794e-05, + "loss": 0.1424, + "step": 16659 + }, + { + "epoch": 1.975572157002253, + "grad_norm": 1.1592182238513966, + "learning_rate": 2.670772625402421e-05, + "loss": 0.208, + "step": 16660 + }, + { + "epoch": 1.975690738764378, + "grad_norm": 0.8469375779706562, + "learning_rate": 2.670533149176348e-05, + "loss": 0.1454, + "step": 16661 + }, + { + "epoch": 1.975809320526503, + "grad_norm": 1.6156986994132243, + "learning_rate": 2.6702936713781685e-05, + "loss": 0.397, + "step": 16662 + }, + { + "epoch": 1.975927902288628, + "grad_norm": 0.6248122879127262, + "learning_rate": 2.67005419201009e-05, + "loss": 0.1474, + "step": 16663 + }, + { + "epoch": 1.976046484050753, + "grad_norm": 1.0555234426866862, + "learning_rate": 2.669814711074321e-05, + "loss": 0.1999, + "step": 16664 + }, + { + "epoch": 1.976165065812878, + "grad_norm": 0.6192123487913402, + "learning_rate": 2.669575228573068e-05, + "loss": 0.1698, + "step": 16665 + }, + { + "epoch": 1.976283647575003, + "grad_norm": 0.9184004553738836, + "learning_rate": 2.6693357445085405e-05, + "loss": 0.1859, + "step": 16666 + }, + { + "epoch": 1.976402229337128, + "grad_norm": 0.9250854446826734, + "learning_rate": 2.669096258882944e-05, + "loss": 0.1998, + "step": 16667 + }, + { + "epoch": 1.976520811099253, + "grad_norm": 0.7010979532184618, + "learning_rate": 2.668856771698488e-05, + "loss": 0.1576, + "step": 16668 + }, + { + "epoch": 1.976639392861378, + "grad_norm": 0.8668648912875988, + "learning_rate": 2.6686172829573784e-05, + "loss": 0.2092, + "step": 16669 + }, + { + "epoch": 1.976757974623503, + "grad_norm": 0.5868756746951749, + "learning_rate": 2.668377792661826e-05, + "loss": 0.123, + "step": 16670 + }, + { + "epoch": 1.9768765563856279, + "grad_norm": 0.6900566349989559, + "learning_rate": 2.6681383008140354e-05, + "loss": 0.1619, + "step": 16671 + }, + { + "epoch": 1.9769951381477529, + "grad_norm": 0.8663930510722719, + "learning_rate": 2.6678988074162163e-05, + "loss": 0.1467, + "step": 16672 + }, + { + "epoch": 1.9771137199098778, + "grad_norm": 0.7889850593638359, + "learning_rate": 2.6676593124705756e-05, + "loss": 0.2148, + "step": 16673 + }, + { + "epoch": 1.9772323016720028, + "grad_norm": 1.9090091591682792, + "learning_rate": 2.6674198159793224e-05, + "loss": 0.5439, + "step": 16674 + }, + { + "epoch": 1.9773508834341278, + "grad_norm": 0.8372788030231518, + "learning_rate": 2.6671803179446636e-05, + "loss": 0.1266, + "step": 16675 + }, + { + "epoch": 1.9774694651962528, + "grad_norm": 0.46492497165881647, + "learning_rate": 2.666940818368807e-05, + "loss": 0.0924, + "step": 16676 + }, + { + "epoch": 1.9775880469583778, + "grad_norm": 0.8313752080510048, + "learning_rate": 2.6667013172539618e-05, + "loss": 0.1873, + "step": 16677 + }, + { + "epoch": 1.9777066287205027, + "grad_norm": 0.5282648218052124, + "learning_rate": 2.6664618146023335e-05, + "loss": 0.1276, + "step": 16678 + }, + { + "epoch": 1.977825210482628, + "grad_norm": 0.5522505640707696, + "learning_rate": 2.6662223104161325e-05, + "loss": 0.1538, + "step": 16679 + }, + { + "epoch": 1.9779437922447527, + "grad_norm": 0.9997401554329624, + "learning_rate": 2.665982804697565e-05, + "loss": 0.2783, + "step": 16680 + }, + { + "epoch": 1.978062374006878, + "grad_norm": 0.8770707032339808, + "learning_rate": 2.66574329744884e-05, + "loss": 0.1952, + "step": 16681 + }, + { + "epoch": 1.9781809557690027, + "grad_norm": 0.7103568032807374, + "learning_rate": 2.6655037886721644e-05, + "loss": 0.1194, + "step": 16682 + }, + { + "epoch": 1.9782995375311279, + "grad_norm": 0.7330920004095182, + "learning_rate": 2.6652642783697473e-05, + "loss": 0.1752, + "step": 16683 + }, + { + "epoch": 1.9784181192932526, + "grad_norm": 0.9647894609612958, + "learning_rate": 2.6650247665437957e-05, + "loss": 0.2465, + "step": 16684 + }, + { + "epoch": 1.9785367010553778, + "grad_norm": 0.8466384348148461, + "learning_rate": 2.664785253196519e-05, + "loss": 0.1348, + "step": 16685 + }, + { + "epoch": 1.9786552828175026, + "grad_norm": 0.573554056213586, + "learning_rate": 2.6645457383301236e-05, + "loss": 0.1364, + "step": 16686 + }, + { + "epoch": 1.9787738645796278, + "grad_norm": 0.9738737101743017, + "learning_rate": 2.6643062219468183e-05, + "loss": 0.1947, + "step": 16687 + }, + { + "epoch": 1.9788924463417525, + "grad_norm": 0.6607049594721701, + "learning_rate": 2.6640667040488114e-05, + "loss": 0.1467, + "step": 16688 + }, + { + "epoch": 1.9790110281038777, + "grad_norm": 1.3189548496625905, + "learning_rate": 2.6638271846383106e-05, + "loss": 0.2606, + "step": 16689 + }, + { + "epoch": 1.9791296098660025, + "grad_norm": 0.9043728980504382, + "learning_rate": 2.6635876637175244e-05, + "loss": 0.162, + "step": 16690 + }, + { + "epoch": 1.9792481916281277, + "grad_norm": 0.6802738945856501, + "learning_rate": 2.6633481412886603e-05, + "loss": 0.1571, + "step": 16691 + }, + { + "epoch": 1.9793667733902525, + "grad_norm": 0.7087497496167907, + "learning_rate": 2.663108617353926e-05, + "loss": 0.1599, + "step": 16692 + }, + { + "epoch": 1.9794853551523777, + "grad_norm": 0.6635957519543633, + "learning_rate": 2.6628690919155306e-05, + "loss": 0.1519, + "step": 16693 + }, + { + "epoch": 1.9796039369145024, + "grad_norm": 0.6707326282663121, + "learning_rate": 2.6626295649756828e-05, + "loss": 0.1253, + "step": 16694 + }, + { + "epoch": 1.9797225186766276, + "grad_norm": 0.6698186404622073, + "learning_rate": 2.662390036536589e-05, + "loss": 0.139, + "step": 16695 + }, + { + "epoch": 1.9798411004387524, + "grad_norm": 0.6547856019955827, + "learning_rate": 2.6621505066004588e-05, + "loss": 0.1539, + "step": 16696 + }, + { + "epoch": 1.9799596822008776, + "grad_norm": 0.8160741165788528, + "learning_rate": 2.6619109751694986e-05, + "loss": 0.175, + "step": 16697 + }, + { + "epoch": 1.9800782639630023, + "grad_norm": 1.0784914545002389, + "learning_rate": 2.6616714422459194e-05, + "loss": 0.2734, + "step": 16698 + }, + { + "epoch": 1.9801968457251276, + "grad_norm": 0.9699277990435193, + "learning_rate": 2.661431907831926e-05, + "loss": 0.1807, + "step": 16699 + }, + { + "epoch": 1.9803154274872523, + "grad_norm": 0.930059262531024, + "learning_rate": 2.6611923719297294e-05, + "loss": 0.2015, + "step": 16700 + }, + { + "epoch": 1.9804340092493775, + "grad_norm": 0.9172297108729774, + "learning_rate": 2.660952834541537e-05, + "loss": 0.1686, + "step": 16701 + }, + { + "epoch": 1.9805525910115023, + "grad_norm": 0.6775636866482642, + "learning_rate": 2.6607132956695564e-05, + "loss": 0.1067, + "step": 16702 + }, + { + "epoch": 1.9806711727736275, + "grad_norm": 1.1195619735932778, + "learning_rate": 2.660473755315996e-05, + "loss": 0.2005, + "step": 16703 + }, + { + "epoch": 1.9807897545357525, + "grad_norm": 0.8679999364343632, + "learning_rate": 2.6602342134830643e-05, + "loss": 0.1573, + "step": 16704 + }, + { + "epoch": 1.9809083362978774, + "grad_norm": 0.9817329181210529, + "learning_rate": 2.6599946701729705e-05, + "loss": 0.1898, + "step": 16705 + }, + { + "epoch": 1.9810269180600024, + "grad_norm": 0.8267311017415286, + "learning_rate": 2.6597551253879217e-05, + "loss": 0.1755, + "step": 16706 + }, + { + "epoch": 1.9811454998221274, + "grad_norm": 0.8654103324490628, + "learning_rate": 2.659515579130127e-05, + "loss": 0.206, + "step": 16707 + }, + { + "epoch": 1.9812640815842524, + "grad_norm": 0.7785756671938064, + "learning_rate": 2.6592760314017927e-05, + "loss": 0.167, + "step": 16708 + }, + { + "epoch": 1.9813826633463774, + "grad_norm": 0.8413506827211478, + "learning_rate": 2.6590364822051307e-05, + "loss": 0.1687, + "step": 16709 + }, + { + "epoch": 1.9815012451085023, + "grad_norm": 0.621049138470212, + "learning_rate": 2.658796931542346e-05, + "loss": 0.1533, + "step": 16710 + }, + { + "epoch": 1.9816198268706273, + "grad_norm": 0.7495459686299473, + "learning_rate": 2.6585573794156486e-05, + "loss": 0.1443, + "step": 16711 + }, + { + "epoch": 1.9817384086327523, + "grad_norm": 0.7067267317084622, + "learning_rate": 2.6583178258272468e-05, + "loss": 0.1342, + "step": 16712 + }, + { + "epoch": 1.9818569903948773, + "grad_norm": 1.3565716088559836, + "learning_rate": 2.6580782707793482e-05, + "loss": 0.357, + "step": 16713 + }, + { + "epoch": 1.9819755721570023, + "grad_norm": 0.5408800511761266, + "learning_rate": 2.6578387142741624e-05, + "loss": 0.1434, + "step": 16714 + }, + { + "epoch": 1.9820941539191272, + "grad_norm": 0.8036031141519953, + "learning_rate": 2.657599156313897e-05, + "loss": 0.2262, + "step": 16715 + }, + { + "epoch": 1.9822127356812522, + "grad_norm": 0.6415761772398542, + "learning_rate": 2.6573595969007608e-05, + "loss": 0.1327, + "step": 16716 + }, + { + "epoch": 1.9823313174433772, + "grad_norm": 0.9084812600486402, + "learning_rate": 2.657120036036962e-05, + "loss": 0.1658, + "step": 16717 + }, + { + "epoch": 1.9824498992055022, + "grad_norm": 1.1051348667694987, + "learning_rate": 2.6568804737247092e-05, + "loss": 0.2528, + "step": 16718 + }, + { + "epoch": 1.9825684809676272, + "grad_norm": 0.9144892479263558, + "learning_rate": 2.6566409099662108e-05, + "loss": 0.2006, + "step": 16719 + }, + { + "epoch": 1.9826870627297521, + "grad_norm": 0.9294517785993252, + "learning_rate": 2.656401344763676e-05, + "loss": 0.2476, + "step": 16720 + }, + { + "epoch": 1.9828056444918771, + "grad_norm": 0.9004442135683733, + "learning_rate": 2.6561617781193117e-05, + "loss": 0.2015, + "step": 16721 + }, + { + "epoch": 1.982924226254002, + "grad_norm": 0.836427409790765, + "learning_rate": 2.655922210035327e-05, + "loss": 0.2028, + "step": 16722 + }, + { + "epoch": 1.983042808016127, + "grad_norm": 0.8465077163748829, + "learning_rate": 2.6556826405139312e-05, + "loss": 0.1779, + "step": 16723 + }, + { + "epoch": 1.983161389778252, + "grad_norm": 0.6951682222892629, + "learning_rate": 2.6554430695573322e-05, + "loss": 0.1461, + "step": 16724 + }, + { + "epoch": 1.983279971540377, + "grad_norm": 0.7643168491952845, + "learning_rate": 2.6552034971677385e-05, + "loss": 0.1325, + "step": 16725 + }, + { + "epoch": 1.983398553302502, + "grad_norm": 0.7036208344148681, + "learning_rate": 2.65496392334736e-05, + "loss": 0.1529, + "step": 16726 + }, + { + "epoch": 1.983517135064627, + "grad_norm": 0.5317387093638678, + "learning_rate": 2.654724348098403e-05, + "loss": 0.1169, + "step": 16727 + }, + { + "epoch": 1.983635716826752, + "grad_norm": 0.5651873467426337, + "learning_rate": 2.654484771423078e-05, + "loss": 0.1214, + "step": 16728 + }, + { + "epoch": 1.983754298588877, + "grad_norm": 0.9177841002068887, + "learning_rate": 2.6542451933235922e-05, + "loss": 0.2057, + "step": 16729 + }, + { + "epoch": 1.9838728803510022, + "grad_norm": 0.7029486835647333, + "learning_rate": 2.6540056138021552e-05, + "loss": 0.118, + "step": 16730 + }, + { + "epoch": 1.983991462113127, + "grad_norm": 0.8445857589650815, + "learning_rate": 2.6537660328609758e-05, + "loss": 0.1724, + "step": 16731 + }, + { + "epoch": 1.9841100438752521, + "grad_norm": 0.49511693399145434, + "learning_rate": 2.653526450502261e-05, + "loss": 0.1082, + "step": 16732 + }, + { + "epoch": 1.984228625637377, + "grad_norm": 0.6545419195598318, + "learning_rate": 2.6532868667282212e-05, + "loss": 0.1406, + "step": 16733 + }, + { + "epoch": 1.984347207399502, + "grad_norm": 0.769774988578381, + "learning_rate": 2.6530472815410645e-05, + "loss": 0.1907, + "step": 16734 + }, + { + "epoch": 1.9844657891616269, + "grad_norm": 0.9789345068205707, + "learning_rate": 2.6528076949429997e-05, + "loss": 0.1695, + "step": 16735 + }, + { + "epoch": 1.984584370923752, + "grad_norm": 1.041920882999858, + "learning_rate": 2.6525681069362353e-05, + "loss": 0.2516, + "step": 16736 + }, + { + "epoch": 1.9847029526858768, + "grad_norm": 0.8938804942486096, + "learning_rate": 2.65232851752298e-05, + "loss": 0.1909, + "step": 16737 + }, + { + "epoch": 1.984821534448002, + "grad_norm": 1.070014751526564, + "learning_rate": 2.6520889267054423e-05, + "loss": 0.2043, + "step": 16738 + }, + { + "epoch": 1.9849401162101268, + "grad_norm": 0.8285521653925805, + "learning_rate": 2.6518493344858315e-05, + "loss": 0.1471, + "step": 16739 + }, + { + "epoch": 1.985058697972252, + "grad_norm": 0.9111993487237223, + "learning_rate": 2.6516097408663555e-05, + "loss": 0.195, + "step": 16740 + }, + { + "epoch": 1.9851772797343767, + "grad_norm": 0.8691044034809952, + "learning_rate": 2.6513701458492242e-05, + "loss": 0.1655, + "step": 16741 + }, + { + "epoch": 1.985295861496502, + "grad_norm": 1.0407043061123809, + "learning_rate": 2.651130549436645e-05, + "loss": 0.2036, + "step": 16742 + }, + { + "epoch": 1.9854144432586267, + "grad_norm": 0.9962973514006571, + "learning_rate": 2.6508909516308277e-05, + "loss": 0.1782, + "step": 16743 + }, + { + "epoch": 1.985533025020752, + "grad_norm": 0.7027074787503084, + "learning_rate": 2.6506513524339815e-05, + "loss": 0.176, + "step": 16744 + }, + { + "epoch": 1.9856516067828767, + "grad_norm": 1.0024690650913353, + "learning_rate": 2.6504117518483147e-05, + "loss": 0.171, + "step": 16745 + }, + { + "epoch": 1.9857701885450019, + "grad_norm": 0.7566022858552286, + "learning_rate": 2.6501721498760357e-05, + "loss": 0.1771, + "step": 16746 + }, + { + "epoch": 1.9858887703071266, + "grad_norm": 0.8582618050864335, + "learning_rate": 2.649932546519353e-05, + "loss": 0.1593, + "step": 16747 + }, + { + "epoch": 1.9860073520692518, + "grad_norm": 0.8294361221501095, + "learning_rate": 2.649692941780477e-05, + "loss": 0.1328, + "step": 16748 + }, + { + "epoch": 1.9861259338313766, + "grad_norm": 0.97134547629697, + "learning_rate": 2.6494533356616153e-05, + "loss": 0.2288, + "step": 16749 + }, + { + "epoch": 1.9862445155935018, + "grad_norm": 0.7516701005184288, + "learning_rate": 2.649213728164977e-05, + "loss": 0.1419, + "step": 16750 + }, + { + "epoch": 1.9863630973556266, + "grad_norm": 0.9127637391173027, + "learning_rate": 2.6489741192927716e-05, + "loss": 0.1651, + "step": 16751 + }, + { + "epoch": 1.9864816791177518, + "grad_norm": 0.710579801350816, + "learning_rate": 2.6487345090472066e-05, + "loss": 0.1652, + "step": 16752 + }, + { + "epoch": 1.9866002608798765, + "grad_norm": 0.8368760428768354, + "learning_rate": 2.648494897430492e-05, + "loss": 0.1638, + "step": 16753 + }, + { + "epoch": 1.9867188426420017, + "grad_norm": 0.7544859279349941, + "learning_rate": 2.648255284444837e-05, + "loss": 0.1778, + "step": 16754 + }, + { + "epoch": 1.9868374244041267, + "grad_norm": 0.8200142245103672, + "learning_rate": 2.6480156700924498e-05, + "loss": 0.1566, + "step": 16755 + }, + { + "epoch": 1.9869560061662517, + "grad_norm": 0.6479656451314105, + "learning_rate": 2.64777605437554e-05, + "loss": 0.1188, + "step": 16756 + }, + { + "epoch": 1.9870745879283767, + "grad_norm": 0.6750447965807376, + "learning_rate": 2.6475364372963158e-05, + "loss": 0.1318, + "step": 16757 + }, + { + "epoch": 1.9871931696905016, + "grad_norm": 0.8597994116642423, + "learning_rate": 2.6472968188569864e-05, + "loss": 0.1965, + "step": 16758 + }, + { + "epoch": 1.9873117514526266, + "grad_norm": 0.8670267389509636, + "learning_rate": 2.6470571990597616e-05, + "loss": 0.1975, + "step": 16759 + }, + { + "epoch": 1.9874303332147516, + "grad_norm": 0.9843776937705451, + "learning_rate": 2.6468175779068494e-05, + "loss": 0.2128, + "step": 16760 + }, + { + "epoch": 1.9875489149768766, + "grad_norm": 0.8408952980226283, + "learning_rate": 2.6465779554004595e-05, + "loss": 0.1272, + "step": 16761 + }, + { + "epoch": 1.9876674967390016, + "grad_norm": 0.830125877533524, + "learning_rate": 2.6463383315427997e-05, + "loss": 0.1891, + "step": 16762 + }, + { + "epoch": 1.9877860785011265, + "grad_norm": 1.4026057165130161, + "learning_rate": 2.6460987063360803e-05, + "loss": 0.2902, + "step": 16763 + }, + { + "epoch": 1.9879046602632515, + "grad_norm": 0.5481902966749679, + "learning_rate": 2.64585907978251e-05, + "loss": 0.1136, + "step": 16764 + }, + { + "epoch": 1.9880232420253765, + "grad_norm": 1.0644283803307553, + "learning_rate": 2.6456194518842987e-05, + "loss": 0.2492, + "step": 16765 + }, + { + "epoch": 1.9881418237875015, + "grad_norm": 1.044711391541753, + "learning_rate": 2.6453798226436534e-05, + "loss": 0.1963, + "step": 16766 + }, + { + "epoch": 1.9882604055496265, + "grad_norm": 0.8084993382768519, + "learning_rate": 2.6451401920627854e-05, + "loss": 0.1626, + "step": 16767 + }, + { + "epoch": 1.9883789873117514, + "grad_norm": 0.5437533695924358, + "learning_rate": 2.644900560143902e-05, + "loss": 0.1099, + "step": 16768 + }, + { + "epoch": 1.9884975690738764, + "grad_norm": 0.6515448604857998, + "learning_rate": 2.644660926889214e-05, + "loss": 0.1169, + "step": 16769 + }, + { + "epoch": 1.9886161508360014, + "grad_norm": 0.8858863734503022, + "learning_rate": 2.6444212923009288e-05, + "loss": 0.1546, + "step": 16770 + }, + { + "epoch": 1.9887347325981264, + "grad_norm": 0.6699893289368085, + "learning_rate": 2.6441816563812565e-05, + "loss": 0.1718, + "step": 16771 + }, + { + "epoch": 1.9888533143602514, + "grad_norm": 0.6022967993330868, + "learning_rate": 2.6439420191324066e-05, + "loss": 0.164, + "step": 16772 + }, + { + "epoch": 1.9889718961223763, + "grad_norm": 1.1165596664933077, + "learning_rate": 2.643702380556587e-05, + "loss": 0.2574, + "step": 16773 + }, + { + "epoch": 1.9890904778845013, + "grad_norm": 0.8387274703863548, + "learning_rate": 2.6434627406560087e-05, + "loss": 0.1762, + "step": 16774 + }, + { + "epoch": 1.9892090596466263, + "grad_norm": 1.0249678575923673, + "learning_rate": 2.6432230994328795e-05, + "loss": 0.2283, + "step": 16775 + }, + { + "epoch": 1.9893276414087513, + "grad_norm": 0.8030606994305773, + "learning_rate": 2.6429834568894092e-05, + "loss": 0.1825, + "step": 16776 + }, + { + "epoch": 1.9894462231708763, + "grad_norm": 0.8409218269301227, + "learning_rate": 2.642743813027806e-05, + "loss": 0.2188, + "step": 16777 + }, + { + "epoch": 1.9895648049330013, + "grad_norm": 0.8218039141698346, + "learning_rate": 2.6425041678502804e-05, + "loss": 0.1856, + "step": 16778 + }, + { + "epoch": 1.9896833866951265, + "grad_norm": 0.6246852684401207, + "learning_rate": 2.642264521359041e-05, + "loss": 0.1206, + "step": 16779 + }, + { + "epoch": 1.9898019684572512, + "grad_norm": 0.916595214589751, + "learning_rate": 2.6420248735562974e-05, + "loss": 0.2092, + "step": 16780 + }, + { + "epoch": 1.9899205502193764, + "grad_norm": 0.7445435063982053, + "learning_rate": 2.641785224444258e-05, + "loss": 0.1499, + "step": 16781 + }, + { + "epoch": 1.9900391319815012, + "grad_norm": 0.6528769747193705, + "learning_rate": 2.6415455740251334e-05, + "loss": 0.1655, + "step": 16782 + }, + { + "epoch": 1.9901577137436264, + "grad_norm": 0.6637937587192689, + "learning_rate": 2.6413059223011318e-05, + "loss": 0.1371, + "step": 16783 + }, + { + "epoch": 1.9902762955057511, + "grad_norm": 0.6584263811286387, + "learning_rate": 2.6410662692744633e-05, + "loss": 0.1299, + "step": 16784 + }, + { + "epoch": 1.9903948772678763, + "grad_norm": 0.9684162862073348, + "learning_rate": 2.640826614947336e-05, + "loss": 0.1996, + "step": 16785 + }, + { + "epoch": 1.990513459030001, + "grad_norm": 0.6312743360700395, + "learning_rate": 2.6405869593219602e-05, + "loss": 0.115, + "step": 16786 + }, + { + "epoch": 1.9906320407921263, + "grad_norm": 1.0174965277611054, + "learning_rate": 2.640347302400546e-05, + "loss": 0.1971, + "step": 16787 + }, + { + "epoch": 1.990750622554251, + "grad_norm": 0.9600383270110412, + "learning_rate": 2.6401076441853007e-05, + "loss": 0.2581, + "step": 16788 + }, + { + "epoch": 1.9908692043163763, + "grad_norm": 0.896703965894882, + "learning_rate": 2.639867984678435e-05, + "loss": 0.1738, + "step": 16789 + }, + { + "epoch": 1.990987786078501, + "grad_norm": 0.7427486514169015, + "learning_rate": 2.6396283238821583e-05, + "loss": 0.1297, + "step": 16790 + }, + { + "epoch": 1.9911063678406262, + "grad_norm": 0.9751250788630055, + "learning_rate": 2.6393886617986795e-05, + "loss": 0.1704, + "step": 16791 + }, + { + "epoch": 1.991224949602751, + "grad_norm": 0.6430364087319996, + "learning_rate": 2.6391489984302076e-05, + "loss": 0.1297, + "step": 16792 + }, + { + "epoch": 1.9913435313648762, + "grad_norm": 1.0239276229655427, + "learning_rate": 2.6389093337789538e-05, + "loss": 0.213, + "step": 16793 + }, + { + "epoch": 1.991462113127001, + "grad_norm": 0.5981605111391587, + "learning_rate": 2.638669667847125e-05, + "loss": 0.1282, + "step": 16794 + }, + { + "epoch": 1.9915806948891261, + "grad_norm": 1.2506493868436903, + "learning_rate": 2.6384300006369333e-05, + "loss": 0.18, + "step": 16795 + }, + { + "epoch": 1.991699276651251, + "grad_norm": 0.8255312838389638, + "learning_rate": 2.6381903321505857e-05, + "loss": 0.1564, + "step": 16796 + }, + { + "epoch": 1.991817858413376, + "grad_norm": 0.8669727747430396, + "learning_rate": 2.6379506623902934e-05, + "loss": 0.1546, + "step": 16797 + }, + { + "epoch": 1.9919364401755009, + "grad_norm": 0.7777618462447186, + "learning_rate": 2.6377109913582642e-05, + "loss": 0.1612, + "step": 16798 + }, + { + "epoch": 1.992055021937626, + "grad_norm": 0.6665875064749528, + "learning_rate": 2.6374713190567098e-05, + "loss": 0.1309, + "step": 16799 + }, + { + "epoch": 1.9921736036997508, + "grad_norm": 0.9883374136627054, + "learning_rate": 2.6372316454878372e-05, + "loss": 0.2091, + "step": 16800 + }, + { + "epoch": 1.992292185461876, + "grad_norm": 0.6975006252181103, + "learning_rate": 2.6369919706538576e-05, + "loss": 0.1274, + "step": 16801 + }, + { + "epoch": 1.9924107672240008, + "grad_norm": 0.7313476820301791, + "learning_rate": 2.6367522945569796e-05, + "loss": 0.1633, + "step": 16802 + }, + { + "epoch": 1.992529348986126, + "grad_norm": 0.7172105782415245, + "learning_rate": 2.6365126171994137e-05, + "loss": 0.1096, + "step": 16803 + }, + { + "epoch": 1.992647930748251, + "grad_norm": 0.8174314120476643, + "learning_rate": 2.636272938583369e-05, + "loss": 0.1554, + "step": 16804 + }, + { + "epoch": 1.992766512510376, + "grad_norm": 0.8524341529360219, + "learning_rate": 2.6360332587110543e-05, + "loss": 0.1424, + "step": 16805 + }, + { + "epoch": 1.992885094272501, + "grad_norm": 0.7567588635717409, + "learning_rate": 2.6357935775846804e-05, + "loss": 0.1751, + "step": 16806 + }, + { + "epoch": 1.993003676034626, + "grad_norm": 0.9131409944394483, + "learning_rate": 2.6355538952064558e-05, + "loss": 0.2371, + "step": 16807 + }, + { + "epoch": 1.993122257796751, + "grad_norm": 0.5373783287418633, + "learning_rate": 2.6353142115785912e-05, + "loss": 0.1026, + "step": 16808 + }, + { + "epoch": 1.9932408395588759, + "grad_norm": 1.0629390420497449, + "learning_rate": 2.6350745267032945e-05, + "loss": 0.2258, + "step": 16809 + }, + { + "epoch": 1.9933594213210009, + "grad_norm": 0.6825397849098519, + "learning_rate": 2.634834840582777e-05, + "loss": 0.1352, + "step": 16810 + }, + { + "epoch": 1.9934780030831258, + "grad_norm": 0.8612489445615479, + "learning_rate": 2.6345951532192475e-05, + "loss": 0.1648, + "step": 16811 + }, + { + "epoch": 1.9935965848452508, + "grad_norm": 1.0186199978027857, + "learning_rate": 2.6343554646149154e-05, + "loss": 0.1284, + "step": 16812 + }, + { + "epoch": 1.9937151666073758, + "grad_norm": 0.8198833322047672, + "learning_rate": 2.6341157747719903e-05, + "loss": 0.1736, + "step": 16813 + }, + { + "epoch": 1.9938337483695008, + "grad_norm": 0.874558041527294, + "learning_rate": 2.6338760836926834e-05, + "loss": 0.1833, + "step": 16814 + }, + { + "epoch": 1.9939523301316258, + "grad_norm": 1.2924128910941703, + "learning_rate": 2.633636391379202e-05, + "loss": 0.2445, + "step": 16815 + }, + { + "epoch": 1.9940709118937507, + "grad_norm": 0.5635895581138842, + "learning_rate": 2.6333966978337575e-05, + "loss": 0.1214, + "step": 16816 + }, + { + "epoch": 1.9941894936558757, + "grad_norm": 0.9045732882162189, + "learning_rate": 2.6331570030585596e-05, + "loss": 0.229, + "step": 16817 + }, + { + "epoch": 1.9943080754180007, + "grad_norm": 0.8586422930364269, + "learning_rate": 2.6329173070558168e-05, + "loss": 0.1753, + "step": 16818 + }, + { + "epoch": 1.9944266571801257, + "grad_norm": 0.7327277010546452, + "learning_rate": 2.6326776098277395e-05, + "loss": 0.1553, + "step": 16819 + }, + { + "epoch": 1.9945452389422507, + "grad_norm": 0.8561549109880168, + "learning_rate": 2.6324379113765375e-05, + "loss": 0.2134, + "step": 16820 + }, + { + "epoch": 1.9946638207043756, + "grad_norm": 0.8751228099666745, + "learning_rate": 2.6321982117044196e-05, + "loss": 0.1844, + "step": 16821 + }, + { + "epoch": 1.9947824024665006, + "grad_norm": 0.9225967994311773, + "learning_rate": 2.631958510813597e-05, + "loss": 0.2057, + "step": 16822 + }, + { + "epoch": 1.9949009842286256, + "grad_norm": 0.8110724734236764, + "learning_rate": 2.631718808706279e-05, + "loss": 0.1717, + "step": 16823 + }, + { + "epoch": 1.9950195659907506, + "grad_norm": 0.7455972795830244, + "learning_rate": 2.6314791053846745e-05, + "loss": 0.1639, + "step": 16824 + }, + { + "epoch": 1.9951381477528756, + "grad_norm": 0.7686965587823957, + "learning_rate": 2.6312394008509943e-05, + "loss": 0.1843, + "step": 16825 + }, + { + "epoch": 1.9952567295150005, + "grad_norm": 0.9241554586892446, + "learning_rate": 2.6309996951074478e-05, + "loss": 0.1487, + "step": 16826 + }, + { + "epoch": 1.9953753112771255, + "grad_norm": 0.6156150184021076, + "learning_rate": 2.630759988156245e-05, + "loss": 0.1388, + "step": 16827 + }, + { + "epoch": 1.9954938930392505, + "grad_norm": 1.176632745761282, + "learning_rate": 2.630520279999595e-05, + "loss": 0.2238, + "step": 16828 + }, + { + "epoch": 1.9956124748013755, + "grad_norm": 1.2480846180744192, + "learning_rate": 2.6302805706397077e-05, + "loss": 0.2164, + "step": 16829 + }, + { + "epoch": 1.9957310565635007, + "grad_norm": 0.6406532505794443, + "learning_rate": 2.6300408600787942e-05, + "loss": 0.1167, + "step": 16830 + }, + { + "epoch": 1.9958496383256255, + "grad_norm": 0.8651108721580609, + "learning_rate": 2.629801148319062e-05, + "loss": 0.2578, + "step": 16831 + }, + { + "epoch": 1.9959682200877507, + "grad_norm": 0.8490404979505096, + "learning_rate": 2.6295614353627234e-05, + "loss": 0.1792, + "step": 16832 + }, + { + "epoch": 1.9960868018498754, + "grad_norm": 0.9466059656605132, + "learning_rate": 2.629321721211987e-05, + "loss": 0.1887, + "step": 16833 + }, + { + "epoch": 1.9962053836120006, + "grad_norm": 1.0355705925665357, + "learning_rate": 2.6290820058690636e-05, + "loss": 0.2428, + "step": 16834 + }, + { + "epoch": 1.9963239653741254, + "grad_norm": 0.6678457007275491, + "learning_rate": 2.6288422893361615e-05, + "loss": 0.1475, + "step": 16835 + }, + { + "epoch": 1.9964425471362506, + "grad_norm": 0.555195645967404, + "learning_rate": 2.6286025716154922e-05, + "loss": 0.1228, + "step": 16836 + }, + { + "epoch": 1.9965611288983753, + "grad_norm": 0.7579055650607659, + "learning_rate": 2.6283628527092642e-05, + "loss": 0.1927, + "step": 16837 + }, + { + "epoch": 1.9966797106605005, + "grad_norm": 0.6966314226540752, + "learning_rate": 2.6281231326196887e-05, + "loss": 0.1407, + "step": 16838 + }, + { + "epoch": 1.9967982924226253, + "grad_norm": 0.9971030956119412, + "learning_rate": 2.6278834113489746e-05, + "loss": 0.1888, + "step": 16839 + }, + { + "epoch": 1.9969168741847505, + "grad_norm": 0.8288479408966057, + "learning_rate": 2.6276436888993326e-05, + "loss": 0.1863, + "step": 16840 + }, + { + "epoch": 1.9970354559468753, + "grad_norm": 1.033365528780065, + "learning_rate": 2.6274039652729725e-05, + "loss": 0.2234, + "step": 16841 + }, + { + "epoch": 1.9971540377090005, + "grad_norm": 0.8360276960406765, + "learning_rate": 2.6271642404721035e-05, + "loss": 0.1538, + "step": 16842 + }, + { + "epoch": 1.9972726194711252, + "grad_norm": 0.863976590954877, + "learning_rate": 2.6269245144989363e-05, + "loss": 0.1523, + "step": 16843 + }, + { + "epoch": 1.9973912012332504, + "grad_norm": 0.929357078077887, + "learning_rate": 2.6266847873556804e-05, + "loss": 0.2322, + "step": 16844 + }, + { + "epoch": 1.9975097829953752, + "grad_norm": 1.1130274515089205, + "learning_rate": 2.6264450590445467e-05, + "loss": 0.2291, + "step": 16845 + }, + { + "epoch": 1.9976283647575004, + "grad_norm": 0.6776861365466567, + "learning_rate": 2.6262053295677445e-05, + "loss": 0.1178, + "step": 16846 + }, + { + "epoch": 1.9977469465196251, + "grad_norm": 0.6428839327571725, + "learning_rate": 2.625965598927484e-05, + "loss": 0.1338, + "step": 16847 + }, + { + "epoch": 1.9978655282817503, + "grad_norm": 0.9888267107554723, + "learning_rate": 2.625725867125975e-05, + "loss": 0.1968, + "step": 16848 + }, + { + "epoch": 1.997984110043875, + "grad_norm": 0.7202458671209022, + "learning_rate": 2.625486134165428e-05, + "loss": 0.1482, + "step": 16849 + }, + { + "epoch": 1.9981026918060003, + "grad_norm": 0.8207856398553659, + "learning_rate": 2.625246400048052e-05, + "loss": 0.1817, + "step": 16850 + }, + { + "epoch": 1.998221273568125, + "grad_norm": 0.8386813664870308, + "learning_rate": 2.625006664776058e-05, + "loss": 0.1406, + "step": 16851 + }, + { + "epoch": 1.9983398553302503, + "grad_norm": 0.6168524400517565, + "learning_rate": 2.624766928351656e-05, + "loss": 0.1356, + "step": 16852 + }, + { + "epoch": 1.998458437092375, + "grad_norm": 0.8324910929050228, + "learning_rate": 2.6245271907770564e-05, + "loss": 0.1608, + "step": 16853 + }, + { + "epoch": 1.9985770188545002, + "grad_norm": 1.035864251144, + "learning_rate": 2.6242874520544682e-05, + "loss": 0.2238, + "step": 16854 + }, + { + "epoch": 1.9986956006166252, + "grad_norm": 0.8609641547090066, + "learning_rate": 2.6240477121861025e-05, + "loss": 0.1411, + "step": 16855 + }, + { + "epoch": 1.9988141823787502, + "grad_norm": 1.045482775372267, + "learning_rate": 2.6238079711741685e-05, + "loss": 0.2008, + "step": 16856 + }, + { + "epoch": 1.9989327641408752, + "grad_norm": 0.869227913995403, + "learning_rate": 2.6235682290208773e-05, + "loss": 0.1744, + "step": 16857 + }, + { + "epoch": 1.9990513459030002, + "grad_norm": 0.7130851428563042, + "learning_rate": 2.6233284857284385e-05, + "loss": 0.1363, + "step": 16858 + }, + { + "epoch": 1.9991699276651251, + "grad_norm": 0.831475924730594, + "learning_rate": 2.6230887412990616e-05, + "loss": 0.1552, + "step": 16859 + }, + { + "epoch": 1.9992885094272501, + "grad_norm": 0.7833658932336717, + "learning_rate": 2.6228489957349584e-05, + "loss": 0.1667, + "step": 16860 + }, + { + "epoch": 1.999407091189375, + "grad_norm": 0.9088244395006139, + "learning_rate": 2.6226092490383376e-05, + "loss": 0.1663, + "step": 16861 + }, + { + "epoch": 1.9995256729515, + "grad_norm": 0.9796285776540565, + "learning_rate": 2.6223695012114096e-05, + "loss": 0.1813, + "step": 16862 + }, + { + "epoch": 1.999644254713625, + "grad_norm": 0.9920832083473846, + "learning_rate": 2.6221297522563852e-05, + "loss": 0.1768, + "step": 16863 + }, + { + "epoch": 1.99976283647575, + "grad_norm": 0.611802599086474, + "learning_rate": 2.6218900021754744e-05, + "loss": 0.1417, + "step": 16864 + }, + { + "epoch": 1.999881418237875, + "grad_norm": 0.9250190679278198, + "learning_rate": 2.6216502509708874e-05, + "loss": 0.1755, + "step": 16865 + }, + { + "epoch": 2.0, + "grad_norm": 0.8563955982635383, + "learning_rate": 2.621410498644834e-05, + "loss": 0.1644, + "step": 16866 + }, + { + "epoch": 2.000118581762125, + "grad_norm": 0.6961978689603201, + "learning_rate": 2.621170745199525e-05, + "loss": 0.1311, + "step": 16867 + }, + { + "epoch": 2.00023716352425, + "grad_norm": 0.745372156249083, + "learning_rate": 2.62093099063717e-05, + "loss": 0.0948, + "step": 16868 + }, + { + "epoch": 2.000355745286375, + "grad_norm": 0.6387425794802617, + "learning_rate": 2.6206912349599792e-05, + "loss": 0.0953, + "step": 16869 + }, + { + "epoch": 2.0004743270485, + "grad_norm": 0.5243466703832472, + "learning_rate": 2.6204514781701633e-05, + "loss": 0.122, + "step": 16870 + }, + { + "epoch": 2.000592908810625, + "grad_norm": 0.504872226471732, + "learning_rate": 2.620211720269933e-05, + "loss": 0.0822, + "step": 16871 + }, + { + "epoch": 2.00071149057275, + "grad_norm": 0.7193061337083732, + "learning_rate": 2.6199719612614974e-05, + "loss": 0.1169, + "step": 16872 + }, + { + "epoch": 2.000830072334875, + "grad_norm": 0.5317301181910219, + "learning_rate": 2.619732201147068e-05, + "loss": 0.0748, + "step": 16873 + }, + { + "epoch": 2.000948654097, + "grad_norm": 0.7171824160879245, + "learning_rate": 2.6194924399288544e-05, + "loss": 0.1115, + "step": 16874 + }, + { + "epoch": 2.001067235859125, + "grad_norm": 0.6838937274892781, + "learning_rate": 2.6192526776090677e-05, + "loss": 0.1172, + "step": 16875 + }, + { + "epoch": 2.00118581762125, + "grad_norm": 0.5644426353016092, + "learning_rate": 2.6190129141899165e-05, + "loss": 0.0834, + "step": 16876 + }, + { + "epoch": 2.001304399383375, + "grad_norm": 0.8755914648187667, + "learning_rate": 2.6187731496736127e-05, + "loss": 0.1384, + "step": 16877 + }, + { + "epoch": 2.0014229811454998, + "grad_norm": 0.7667935085120312, + "learning_rate": 2.618533384062366e-05, + "loss": 0.1501, + "step": 16878 + }, + { + "epoch": 2.001541562907625, + "grad_norm": 0.6808975428029741, + "learning_rate": 2.6182936173583873e-05, + "loss": 0.093, + "step": 16879 + }, + { + "epoch": 2.0016601446697497, + "grad_norm": 0.8897320784869379, + "learning_rate": 2.6180538495638858e-05, + "loss": 0.1433, + "step": 16880 + }, + { + "epoch": 2.001778726431875, + "grad_norm": 0.899711926367931, + "learning_rate": 2.6178140806810726e-05, + "loss": 0.105, + "step": 16881 + }, + { + "epoch": 2.0018973081939997, + "grad_norm": 0.5447082510947037, + "learning_rate": 2.6175743107121575e-05, + "loss": 0.1056, + "step": 16882 + }, + { + "epoch": 2.002015889956125, + "grad_norm": 0.5040033953458318, + "learning_rate": 2.617334539659353e-05, + "loss": 0.0726, + "step": 16883 + }, + { + "epoch": 2.0021344717182497, + "grad_norm": 0.5740791764465832, + "learning_rate": 2.617094767524867e-05, + "loss": 0.1124, + "step": 16884 + }, + { + "epoch": 2.002253053480375, + "grad_norm": 0.7866093668182068, + "learning_rate": 2.6168549943109112e-05, + "loss": 0.1329, + "step": 16885 + }, + { + "epoch": 2.0023716352424996, + "grad_norm": 0.7653327687452609, + "learning_rate": 2.6166152200196948e-05, + "loss": 0.1159, + "step": 16886 + }, + { + "epoch": 2.002490217004625, + "grad_norm": 0.901510355386172, + "learning_rate": 2.6163754446534295e-05, + "loss": 0.1092, + "step": 16887 + }, + { + "epoch": 2.0026087987667496, + "grad_norm": 0.5326249617430503, + "learning_rate": 2.6161356682143262e-05, + "loss": 0.0815, + "step": 16888 + }, + { + "epoch": 2.002727380528875, + "grad_norm": 0.9342038160847689, + "learning_rate": 2.6158958907045933e-05, + "loss": 0.128, + "step": 16889 + }, + { + "epoch": 2.0028459622909995, + "grad_norm": 0.6225871461584435, + "learning_rate": 2.6156561121264435e-05, + "loss": 0.0757, + "step": 16890 + }, + { + "epoch": 2.0029645440531247, + "grad_norm": 0.7912308725801209, + "learning_rate": 2.6154163324820852e-05, + "loss": 0.1172, + "step": 16891 + }, + { + "epoch": 2.0030831258152495, + "grad_norm": 0.7221587995587107, + "learning_rate": 2.61517655177373e-05, + "loss": 0.1044, + "step": 16892 + }, + { + "epoch": 2.0032017075773747, + "grad_norm": 0.8139702789420203, + "learning_rate": 2.6149367700035882e-05, + "loss": 0.1459, + "step": 16893 + }, + { + "epoch": 2.0033202893394995, + "grad_norm": 0.6528844354137825, + "learning_rate": 2.6146969871738707e-05, + "loss": 0.1059, + "step": 16894 + }, + { + "epoch": 2.0034388711016247, + "grad_norm": 0.9168180641870578, + "learning_rate": 2.614457203286787e-05, + "loss": 0.127, + "step": 16895 + }, + { + "epoch": 2.0035574528637494, + "grad_norm": 0.7380234195757243, + "learning_rate": 2.6142174183445494e-05, + "loss": 0.1016, + "step": 16896 + }, + { + "epoch": 2.0036760346258746, + "grad_norm": 0.7544639467536766, + "learning_rate": 2.6139776323493657e-05, + "loss": 0.0839, + "step": 16897 + }, + { + "epoch": 2.0037946163879994, + "grad_norm": 0.785235094021379, + "learning_rate": 2.6137378453034496e-05, + "loss": 0.1063, + "step": 16898 + }, + { + "epoch": 2.0039131981501246, + "grad_norm": 0.7398946334525268, + "learning_rate": 2.613498057209009e-05, + "loss": 0.1144, + "step": 16899 + }, + { + "epoch": 2.0040317799122493, + "grad_norm": 1.2593451157567548, + "learning_rate": 2.6132582680682556e-05, + "loss": 0.121, + "step": 16900 + }, + { + "epoch": 2.0041503616743745, + "grad_norm": 0.8177921352759894, + "learning_rate": 2.6130184778834e-05, + "loss": 0.1401, + "step": 16901 + }, + { + "epoch": 2.0042689434364993, + "grad_norm": 0.7695475676350472, + "learning_rate": 2.6127786866566522e-05, + "loss": 0.1127, + "step": 16902 + }, + { + "epoch": 2.0043875251986245, + "grad_norm": 0.8334307628914533, + "learning_rate": 2.612538894390224e-05, + "loss": 0.1146, + "step": 16903 + }, + { + "epoch": 2.0045061069607493, + "grad_norm": 0.70994232412504, + "learning_rate": 2.612299101086325e-05, + "loss": 0.1155, + "step": 16904 + }, + { + "epoch": 2.0046246887228745, + "grad_norm": 0.7284738355394564, + "learning_rate": 2.612059306747166e-05, + "loss": 0.0919, + "step": 16905 + }, + { + "epoch": 2.0047432704849992, + "grad_norm": 0.7416262431404086, + "learning_rate": 2.6118195113749572e-05, + "loss": 0.0885, + "step": 16906 + }, + { + "epoch": 2.0048618522471244, + "grad_norm": 0.5834293879172184, + "learning_rate": 2.6115797149719108e-05, + "loss": 0.0807, + "step": 16907 + }, + { + "epoch": 2.004980434009249, + "grad_norm": 0.8234780990837411, + "learning_rate": 2.6113399175402346e-05, + "loss": 0.1029, + "step": 16908 + }, + { + "epoch": 2.0050990157713744, + "grad_norm": 0.7454549156477106, + "learning_rate": 2.6111001190821423e-05, + "loss": 0.099, + "step": 16909 + }, + { + "epoch": 2.005217597533499, + "grad_norm": 0.5295786043925863, + "learning_rate": 2.610860319599842e-05, + "loss": 0.0662, + "step": 16910 + }, + { + "epoch": 2.0053361792956244, + "grad_norm": 0.6682096145104655, + "learning_rate": 2.610620519095546e-05, + "loss": 0.0942, + "step": 16911 + }, + { + "epoch": 2.005454761057749, + "grad_norm": 0.7204573119520556, + "learning_rate": 2.6103807175714645e-05, + "loss": 0.1077, + "step": 16912 + }, + { + "epoch": 2.0055733428198743, + "grad_norm": 0.593220691626613, + "learning_rate": 2.6101409150298084e-05, + "loss": 0.0782, + "step": 16913 + }, + { + "epoch": 2.005691924581999, + "grad_norm": 0.6255777257618824, + "learning_rate": 2.6099011114727874e-05, + "loss": 0.0927, + "step": 16914 + }, + { + "epoch": 2.0058105063441243, + "grad_norm": 0.6348903183774963, + "learning_rate": 2.6096613069026134e-05, + "loss": 0.0684, + "step": 16915 + }, + { + "epoch": 2.0059290881062495, + "grad_norm": 1.0836262035527398, + "learning_rate": 2.6094215013214967e-05, + "loss": 0.1491, + "step": 16916 + }, + { + "epoch": 2.0060476698683742, + "grad_norm": 0.8388435802924391, + "learning_rate": 2.6091816947316478e-05, + "loss": 0.126, + "step": 16917 + }, + { + "epoch": 2.0061662516304994, + "grad_norm": 0.8930856505456013, + "learning_rate": 2.6089418871352778e-05, + "loss": 0.1287, + "step": 16918 + }, + { + "epoch": 2.006284833392624, + "grad_norm": 0.5575754587405908, + "learning_rate": 2.6087020785345968e-05, + "loss": 0.1017, + "step": 16919 + }, + { + "epoch": 2.0064034151547494, + "grad_norm": 0.786389780303627, + "learning_rate": 2.608462268931816e-05, + "loss": 0.1187, + "step": 16920 + }, + { + "epoch": 2.006521996916874, + "grad_norm": 0.9317462419717074, + "learning_rate": 2.608222458329146e-05, + "loss": 0.1352, + "step": 16921 + }, + { + "epoch": 2.0066405786789994, + "grad_norm": 0.5318646374341366, + "learning_rate": 2.6079826467287984e-05, + "loss": 0.0925, + "step": 16922 + }, + { + "epoch": 2.006759160441124, + "grad_norm": 0.5517292711088767, + "learning_rate": 2.6077428341329824e-05, + "loss": 0.066, + "step": 16923 + }, + { + "epoch": 2.0068777422032493, + "grad_norm": 0.6548734587211145, + "learning_rate": 2.60750302054391e-05, + "loss": 0.0814, + "step": 16924 + }, + { + "epoch": 2.006996323965374, + "grad_norm": 0.5459847375615196, + "learning_rate": 2.6072632059637915e-05, + "loss": 0.075, + "step": 16925 + }, + { + "epoch": 2.0071149057274993, + "grad_norm": 0.8304234699798361, + "learning_rate": 2.6070233903948378e-05, + "loss": 0.1118, + "step": 16926 + }, + { + "epoch": 2.007233487489624, + "grad_norm": 0.7391247659650374, + "learning_rate": 2.6067835738392593e-05, + "loss": 0.083, + "step": 16927 + }, + { + "epoch": 2.0073520692517492, + "grad_norm": 0.7700225731492324, + "learning_rate": 2.6065437562992677e-05, + "loss": 0.1019, + "step": 16928 + }, + { + "epoch": 2.007470651013874, + "grad_norm": 0.6944742249104041, + "learning_rate": 2.6063039377770732e-05, + "loss": 0.1049, + "step": 16929 + }, + { + "epoch": 2.007589232775999, + "grad_norm": 0.7206789976277662, + "learning_rate": 2.6060641182748864e-05, + "loss": 0.0959, + "step": 16930 + }, + { + "epoch": 2.007707814538124, + "grad_norm": 1.0297303843752335, + "learning_rate": 2.6058242977949183e-05, + "loss": 0.1272, + "step": 16931 + }, + { + "epoch": 2.007826396300249, + "grad_norm": 0.8692985466088227, + "learning_rate": 2.6055844763393806e-05, + "loss": 0.1252, + "step": 16932 + }, + { + "epoch": 2.007944978062374, + "grad_norm": 0.8752688196611741, + "learning_rate": 2.6053446539104832e-05, + "loss": 0.1217, + "step": 16933 + }, + { + "epoch": 2.008063559824499, + "grad_norm": 0.6180421802264305, + "learning_rate": 2.6051048305104374e-05, + "loss": 0.0787, + "step": 16934 + }, + { + "epoch": 2.008182141586624, + "grad_norm": 0.6783292623190534, + "learning_rate": 2.6048650061414544e-05, + "loss": 0.0888, + "step": 16935 + }, + { + "epoch": 2.008300723348749, + "grad_norm": 0.726376165528946, + "learning_rate": 2.6046251808057435e-05, + "loss": 0.1078, + "step": 16936 + }, + { + "epoch": 2.008419305110874, + "grad_norm": 0.8334341221253349, + "learning_rate": 2.604385354505518e-05, + "loss": 0.1425, + "step": 16937 + }, + { + "epoch": 2.008537886872999, + "grad_norm": 0.7396686224603697, + "learning_rate": 2.6041455272429867e-05, + "loss": 0.121, + "step": 16938 + }, + { + "epoch": 2.008656468635124, + "grad_norm": 0.9477300834051182, + "learning_rate": 2.603905699020362e-05, + "loss": 0.1234, + "step": 16939 + }, + { + "epoch": 2.008775050397249, + "grad_norm": 0.5973177473525001, + "learning_rate": 2.6036658698398536e-05, + "loss": 0.0595, + "step": 16940 + }, + { + "epoch": 2.0088936321593738, + "grad_norm": 0.6643940249242061, + "learning_rate": 2.603426039703673e-05, + "loss": 0.1175, + "step": 16941 + }, + { + "epoch": 2.009012213921499, + "grad_norm": 0.8267686158779364, + "learning_rate": 2.6031862086140306e-05, + "loss": 0.0987, + "step": 16942 + }, + { + "epoch": 2.0091307956836237, + "grad_norm": 0.5168702090147972, + "learning_rate": 2.602946376573139e-05, + "loss": 0.0735, + "step": 16943 + }, + { + "epoch": 2.009249377445749, + "grad_norm": 0.7053517735392276, + "learning_rate": 2.6027065435832076e-05, + "loss": 0.0885, + "step": 16944 + }, + { + "epoch": 2.0093679592078737, + "grad_norm": 0.8633472622962132, + "learning_rate": 2.6024667096464477e-05, + "loss": 0.1258, + "step": 16945 + }, + { + "epoch": 2.009486540969999, + "grad_norm": 0.6426944626628582, + "learning_rate": 2.6022268747650706e-05, + "loss": 0.0908, + "step": 16946 + }, + { + "epoch": 2.0096051227321237, + "grad_norm": 0.6513382780759567, + "learning_rate": 2.601987038941287e-05, + "loss": 0.1095, + "step": 16947 + }, + { + "epoch": 2.009723704494249, + "grad_norm": 0.8472548328547815, + "learning_rate": 2.6017472021773083e-05, + "loss": 0.1098, + "step": 16948 + }, + { + "epoch": 2.0098422862563736, + "grad_norm": 0.6882126867928771, + "learning_rate": 2.6015073644753447e-05, + "loss": 0.0974, + "step": 16949 + }, + { + "epoch": 2.009960868018499, + "grad_norm": 0.6144364098396409, + "learning_rate": 2.6012675258376078e-05, + "loss": 0.0861, + "step": 16950 + }, + { + "epoch": 2.0100794497806236, + "grad_norm": 0.9937533218659163, + "learning_rate": 2.601027686266308e-05, + "loss": 0.1544, + "step": 16951 + }, + { + "epoch": 2.010198031542749, + "grad_norm": 0.8312559883003887, + "learning_rate": 2.6007878457636582e-05, + "loss": 0.138, + "step": 16952 + }, + { + "epoch": 2.0103166133048735, + "grad_norm": 0.7000010387459265, + "learning_rate": 2.600548004331867e-05, + "loss": 0.1106, + "step": 16953 + }, + { + "epoch": 2.0104351950669987, + "grad_norm": 0.6835837669410336, + "learning_rate": 2.6003081619731466e-05, + "loss": 0.0978, + "step": 16954 + }, + { + "epoch": 2.0105537768291235, + "grad_norm": 0.8152892936270512, + "learning_rate": 2.6000683186897078e-05, + "loss": 0.0927, + "step": 16955 + }, + { + "epoch": 2.0106723585912487, + "grad_norm": 0.6634205507938633, + "learning_rate": 2.5998284744837627e-05, + "loss": 0.1013, + "step": 16956 + }, + { + "epoch": 2.0107909403533735, + "grad_norm": 0.835426075139618, + "learning_rate": 2.5995886293575207e-05, + "loss": 0.1144, + "step": 16957 + }, + { + "epoch": 2.0109095221154987, + "grad_norm": 0.8987520586653677, + "learning_rate": 2.5993487833131936e-05, + "loss": 0.0876, + "step": 16958 + }, + { + "epoch": 2.0110281038776234, + "grad_norm": 0.7296205630944472, + "learning_rate": 2.599108936352993e-05, + "loss": 0.1204, + "step": 16959 + }, + { + "epoch": 2.0111466856397486, + "grad_norm": 0.6054068570723142, + "learning_rate": 2.5988690884791287e-05, + "loss": 0.1027, + "step": 16960 + }, + { + "epoch": 2.0112652674018734, + "grad_norm": 0.8810517763727529, + "learning_rate": 2.5986292396938132e-05, + "loss": 0.1293, + "step": 16961 + }, + { + "epoch": 2.0113838491639986, + "grad_norm": 0.6499481785379851, + "learning_rate": 2.5983893899992572e-05, + "loss": 0.086, + "step": 16962 + }, + { + "epoch": 2.0115024309261234, + "grad_norm": 0.8824483877288267, + "learning_rate": 2.598149539397672e-05, + "loss": 0.0988, + "step": 16963 + }, + { + "epoch": 2.0116210126882486, + "grad_norm": 0.6579386180120341, + "learning_rate": 2.5979096878912675e-05, + "loss": 0.1026, + "step": 16964 + }, + { + "epoch": 2.0117395944503733, + "grad_norm": 1.032103425834114, + "learning_rate": 2.5976698354822565e-05, + "loss": 0.1414, + "step": 16965 + }, + { + "epoch": 2.0118581762124985, + "grad_norm": 0.8918200846715295, + "learning_rate": 2.5974299821728492e-05, + "loss": 0.1189, + "step": 16966 + }, + { + "epoch": 2.0119767579746237, + "grad_norm": 0.8254702687106407, + "learning_rate": 2.5971901279652573e-05, + "loss": 0.143, + "step": 16967 + }, + { + "epoch": 2.0120953397367485, + "grad_norm": 0.7809869838898381, + "learning_rate": 2.5969502728616906e-05, + "loss": 0.1094, + "step": 16968 + }, + { + "epoch": 2.0122139214988737, + "grad_norm": 0.6433097795537194, + "learning_rate": 2.5967104168643625e-05, + "loss": 0.1079, + "step": 16969 + }, + { + "epoch": 2.0123325032609984, + "grad_norm": 0.8662513051917791, + "learning_rate": 2.5964705599754817e-05, + "loss": 0.1097, + "step": 16970 + }, + { + "epoch": 2.0124510850231236, + "grad_norm": 0.7389450364307818, + "learning_rate": 2.596230702197261e-05, + "loss": 0.1205, + "step": 16971 + }, + { + "epoch": 2.0125696667852484, + "grad_norm": 0.6250607579765874, + "learning_rate": 2.5959908435319113e-05, + "loss": 0.081, + "step": 16972 + }, + { + "epoch": 2.0126882485473736, + "grad_norm": 0.48101555982249194, + "learning_rate": 2.595750983981644e-05, + "loss": 0.0702, + "step": 16973 + }, + { + "epoch": 2.0128068303094984, + "grad_norm": 0.7505440314580127, + "learning_rate": 2.59551112354867e-05, + "loss": 0.1214, + "step": 16974 + }, + { + "epoch": 2.0129254120716236, + "grad_norm": 0.7699049564552406, + "learning_rate": 2.5952712622352006e-05, + "loss": 0.1284, + "step": 16975 + }, + { + "epoch": 2.0130439938337483, + "grad_norm": 0.9778149577013496, + "learning_rate": 2.5950314000434473e-05, + "loss": 0.1248, + "step": 16976 + }, + { + "epoch": 2.0131625755958735, + "grad_norm": 0.8255202814870506, + "learning_rate": 2.5947915369756198e-05, + "loss": 0.106, + "step": 16977 + }, + { + "epoch": 2.0132811573579983, + "grad_norm": 0.6821848788801772, + "learning_rate": 2.594551673033932e-05, + "loss": 0.0858, + "step": 16978 + }, + { + "epoch": 2.0133997391201235, + "grad_norm": 0.572912726212391, + "learning_rate": 2.594311808220593e-05, + "loss": 0.0941, + "step": 16979 + }, + { + "epoch": 2.0135183208822482, + "grad_norm": 0.8513278352284833, + "learning_rate": 2.5940719425378146e-05, + "loss": 0.1154, + "step": 16980 + }, + { + "epoch": 2.0136369026443734, + "grad_norm": 0.8342271752063198, + "learning_rate": 2.5938320759878082e-05, + "loss": 0.1003, + "step": 16981 + }, + { + "epoch": 2.013755484406498, + "grad_norm": 0.9680128147651842, + "learning_rate": 2.593592208572786e-05, + "loss": 0.1181, + "step": 16982 + }, + { + "epoch": 2.0138740661686234, + "grad_norm": 0.7527110966688894, + "learning_rate": 2.5933523402949576e-05, + "loss": 0.1241, + "step": 16983 + }, + { + "epoch": 2.013992647930748, + "grad_norm": 0.6050118490138805, + "learning_rate": 2.5931124711565357e-05, + "loss": 0.0773, + "step": 16984 + }, + { + "epoch": 2.0141112296928734, + "grad_norm": 0.6675047260071221, + "learning_rate": 2.5928726011597303e-05, + "loss": 0.086, + "step": 16985 + }, + { + "epoch": 2.014229811454998, + "grad_norm": 0.809397439483032, + "learning_rate": 2.5926327303067538e-05, + "loss": 0.0952, + "step": 16986 + }, + { + "epoch": 2.0143483932171233, + "grad_norm": 0.9165082921960497, + "learning_rate": 2.5923928585998177e-05, + "loss": 0.0967, + "step": 16987 + }, + { + "epoch": 2.014466974979248, + "grad_norm": 0.9305151506498842, + "learning_rate": 2.5921529860411316e-05, + "loss": 0.1387, + "step": 16988 + }, + { + "epoch": 2.0145855567413733, + "grad_norm": 0.6480431659725122, + "learning_rate": 2.5919131126329088e-05, + "loss": 0.0941, + "step": 16989 + }, + { + "epoch": 2.014704138503498, + "grad_norm": 0.6974491318931529, + "learning_rate": 2.5916732383773596e-05, + "loss": 0.1011, + "step": 16990 + }, + { + "epoch": 2.0148227202656233, + "grad_norm": 0.6662945695628625, + "learning_rate": 2.591433363276695e-05, + "loss": 0.0805, + "step": 16991 + }, + { + "epoch": 2.014941302027748, + "grad_norm": 0.8279482362281937, + "learning_rate": 2.5911934873331272e-05, + "loss": 0.1164, + "step": 16992 + }, + { + "epoch": 2.015059883789873, + "grad_norm": 1.060580728359988, + "learning_rate": 2.590953610548868e-05, + "loss": 0.1066, + "step": 16993 + }, + { + "epoch": 2.015178465551998, + "grad_norm": 0.8758924257011432, + "learning_rate": 2.5907137329261272e-05, + "loss": 0.1034, + "step": 16994 + }, + { + "epoch": 2.015297047314123, + "grad_norm": 0.7941194629503767, + "learning_rate": 2.5904738544671174e-05, + "loss": 0.1102, + "step": 16995 + }, + { + "epoch": 2.015415629076248, + "grad_norm": 0.8765848638423117, + "learning_rate": 2.5902339751740495e-05, + "loss": 0.1073, + "step": 16996 + }, + { + "epoch": 2.015534210838373, + "grad_norm": 0.7156224532582327, + "learning_rate": 2.5899940950491352e-05, + "loss": 0.0937, + "step": 16997 + }, + { + "epoch": 2.015652792600498, + "grad_norm": 0.6402336079564108, + "learning_rate": 2.5897542140945858e-05, + "loss": 0.0933, + "step": 16998 + }, + { + "epoch": 2.015771374362623, + "grad_norm": 0.763940236974313, + "learning_rate": 2.589514332312612e-05, + "loss": 0.1127, + "step": 16999 + }, + { + "epoch": 2.015889956124748, + "grad_norm": 0.8704257632058873, + "learning_rate": 2.5892744497054262e-05, + "loss": 0.1213, + "step": 17000 + }, + { + "epoch": 2.016008537886873, + "grad_norm": 0.565320939896494, + "learning_rate": 2.589034566275239e-05, + "loss": 0.0746, + "step": 17001 + }, + { + "epoch": 2.016127119648998, + "grad_norm": 0.8070066555212387, + "learning_rate": 2.5887946820242633e-05, + "loss": 0.1045, + "step": 17002 + }, + { + "epoch": 2.016245701411123, + "grad_norm": 0.9749089117483943, + "learning_rate": 2.5885547969547085e-05, + "loss": 0.1444, + "step": 17003 + }, + { + "epoch": 2.016364283173248, + "grad_norm": 0.8377268095765578, + "learning_rate": 2.5883149110687877e-05, + "loss": 0.1057, + "step": 17004 + }, + { + "epoch": 2.016482864935373, + "grad_norm": 0.6322534696494635, + "learning_rate": 2.5880750243687114e-05, + "loss": 0.0834, + "step": 17005 + }, + { + "epoch": 2.0166014466974977, + "grad_norm": 1.477683817095557, + "learning_rate": 2.5878351368566918e-05, + "loss": 0.1374, + "step": 17006 + }, + { + "epoch": 2.016720028459623, + "grad_norm": 0.6282295733433491, + "learning_rate": 2.587595248534939e-05, + "loss": 0.077, + "step": 17007 + }, + { + "epoch": 2.0168386102217477, + "grad_norm": 0.9184628926302872, + "learning_rate": 2.5873553594056665e-05, + "loss": 0.1066, + "step": 17008 + }, + { + "epoch": 2.016957191983873, + "grad_norm": 0.7572574993974743, + "learning_rate": 2.587115469471084e-05, + "loss": 0.0878, + "step": 17009 + }, + { + "epoch": 2.0170757737459977, + "grad_norm": 0.6563275916124504, + "learning_rate": 2.5868755787334044e-05, + "loss": 0.0867, + "step": 17010 + }, + { + "epoch": 2.017194355508123, + "grad_norm": 0.6450573351207184, + "learning_rate": 2.5866356871948377e-05, + "loss": 0.1021, + "step": 17011 + }, + { + "epoch": 2.0173129372702476, + "grad_norm": 0.7782744407286468, + "learning_rate": 2.586395794857597e-05, + "loss": 0.0953, + "step": 17012 + }, + { + "epoch": 2.017431519032373, + "grad_norm": 0.6505224118571163, + "learning_rate": 2.5861559017238924e-05, + "loss": 0.0935, + "step": 17013 + }, + { + "epoch": 2.0175501007944976, + "grad_norm": 0.9775341457861001, + "learning_rate": 2.5859160077959366e-05, + "loss": 0.1087, + "step": 17014 + }, + { + "epoch": 2.017668682556623, + "grad_norm": 0.7507399204993604, + "learning_rate": 2.58567611307594e-05, + "loss": 0.118, + "step": 17015 + }, + { + "epoch": 2.0177872643187476, + "grad_norm": 0.802512166205142, + "learning_rate": 2.5854362175661145e-05, + "loss": 0.0956, + "step": 17016 + }, + { + "epoch": 2.0179058460808728, + "grad_norm": 0.8483026718060154, + "learning_rate": 2.5851963212686725e-05, + "loss": 0.1259, + "step": 17017 + }, + { + "epoch": 2.018024427842998, + "grad_norm": 0.5599421421210834, + "learning_rate": 2.5849564241858243e-05, + "loss": 0.0843, + "step": 17018 + }, + { + "epoch": 2.0181430096051227, + "grad_norm": 0.8836350504547238, + "learning_rate": 2.5847165263197833e-05, + "loss": 0.1218, + "step": 17019 + }, + { + "epoch": 2.018261591367248, + "grad_norm": 0.7979396013056426, + "learning_rate": 2.5844766276727582e-05, + "loss": 0.104, + "step": 17020 + }, + { + "epoch": 2.0183801731293727, + "grad_norm": 0.7283250772029296, + "learning_rate": 2.5842367282469626e-05, + "loss": 0.1064, + "step": 17021 + }, + { + "epoch": 2.018498754891498, + "grad_norm": 0.7416022670645755, + "learning_rate": 2.5839968280446082e-05, + "loss": 0.0947, + "step": 17022 + }, + { + "epoch": 2.0186173366536226, + "grad_norm": 0.7418149138658839, + "learning_rate": 2.5837569270679058e-05, + "loss": 0.0916, + "step": 17023 + }, + { + "epoch": 2.018735918415748, + "grad_norm": 0.7180702601576594, + "learning_rate": 2.583517025319067e-05, + "loss": 0.0911, + "step": 17024 + }, + { + "epoch": 2.0188545001778726, + "grad_norm": 0.9536080717815969, + "learning_rate": 2.5832771228003043e-05, + "loss": 0.1259, + "step": 17025 + }, + { + "epoch": 2.018973081939998, + "grad_norm": 1.2895852461760804, + "learning_rate": 2.583037219513828e-05, + "loss": 0.1497, + "step": 17026 + }, + { + "epoch": 2.0190916637021226, + "grad_norm": 0.5890402150356825, + "learning_rate": 2.5827973154618512e-05, + "loss": 0.0622, + "step": 17027 + }, + { + "epoch": 2.0192102454642478, + "grad_norm": 0.8736120384860733, + "learning_rate": 2.582557410646584e-05, + "loss": 0.1066, + "step": 17028 + }, + { + "epoch": 2.0193288272263725, + "grad_norm": 0.6284803573145943, + "learning_rate": 2.582317505070238e-05, + "loss": 0.0836, + "step": 17029 + }, + { + "epoch": 2.0194474089884977, + "grad_norm": 0.8646189838002291, + "learning_rate": 2.5820775987350264e-05, + "loss": 0.1192, + "step": 17030 + }, + { + "epoch": 2.0195659907506225, + "grad_norm": 1.1344278692567753, + "learning_rate": 2.5818376916431598e-05, + "loss": 0.1298, + "step": 17031 + }, + { + "epoch": 2.0196845725127477, + "grad_norm": 0.5802589463332644, + "learning_rate": 2.5815977837968504e-05, + "loss": 0.0871, + "step": 17032 + }, + { + "epoch": 2.0198031542748724, + "grad_norm": 0.561298136400072, + "learning_rate": 2.581357875198309e-05, + "loss": 0.0801, + "step": 17033 + }, + { + "epoch": 2.0199217360369977, + "grad_norm": 0.7154329905210174, + "learning_rate": 2.5811179658497488e-05, + "loss": 0.1078, + "step": 17034 + }, + { + "epoch": 2.0200403177991224, + "grad_norm": 1.1173325290434155, + "learning_rate": 2.580878055753379e-05, + "loss": 0.1233, + "step": 17035 + }, + { + "epoch": 2.0201588995612476, + "grad_norm": 0.7587198296802913, + "learning_rate": 2.5806381449114137e-05, + "loss": 0.1036, + "step": 17036 + }, + { + "epoch": 2.0202774813233724, + "grad_norm": 0.7454429412774327, + "learning_rate": 2.580398233326063e-05, + "loss": 0.0895, + "step": 17037 + }, + { + "epoch": 2.0203960630854976, + "grad_norm": 0.9363438058079177, + "learning_rate": 2.58015832099954e-05, + "loss": 0.1122, + "step": 17038 + }, + { + "epoch": 2.0205146448476223, + "grad_norm": 0.6651623728141232, + "learning_rate": 2.5799184079340544e-05, + "loss": 0.1084, + "step": 17039 + }, + { + "epoch": 2.0206332266097475, + "grad_norm": 0.6002753393589734, + "learning_rate": 2.5796784941318197e-05, + "loss": 0.0917, + "step": 17040 + }, + { + "epoch": 2.0207518083718723, + "grad_norm": 0.981664567124827, + "learning_rate": 2.5794385795950464e-05, + "loss": 0.1069, + "step": 17041 + }, + { + "epoch": 2.0208703901339975, + "grad_norm": 0.6437059364630277, + "learning_rate": 2.579198664325948e-05, + "loss": 0.0824, + "step": 17042 + }, + { + "epoch": 2.0209889718961223, + "grad_norm": 0.8412214987496069, + "learning_rate": 2.5789587483267342e-05, + "loss": 0.1041, + "step": 17043 + }, + { + "epoch": 2.0211075536582475, + "grad_norm": 0.7276524318429696, + "learning_rate": 2.5787188315996175e-05, + "loss": 0.0888, + "step": 17044 + }, + { + "epoch": 2.021226135420372, + "grad_norm": 0.695936480006316, + "learning_rate": 2.5784789141468103e-05, + "loss": 0.1058, + "step": 17045 + }, + { + "epoch": 2.0213447171824974, + "grad_norm": 0.5626316256599394, + "learning_rate": 2.578238995970523e-05, + "loss": 0.0762, + "step": 17046 + }, + { + "epoch": 2.021463298944622, + "grad_norm": 0.6230322931026746, + "learning_rate": 2.5779990770729695e-05, + "loss": 0.0755, + "step": 17047 + }, + { + "epoch": 2.0215818807067474, + "grad_norm": 0.6651746926003953, + "learning_rate": 2.5777591574563587e-05, + "loss": 0.0838, + "step": 17048 + }, + { + "epoch": 2.021700462468872, + "grad_norm": 0.6452315594034443, + "learning_rate": 2.577519237122904e-05, + "loss": 0.0989, + "step": 17049 + }, + { + "epoch": 2.0218190442309973, + "grad_norm": 0.9349372324677323, + "learning_rate": 2.577279316074817e-05, + "loss": 0.0991, + "step": 17050 + }, + { + "epoch": 2.021937625993122, + "grad_norm": 0.8716240620074638, + "learning_rate": 2.5770393943143107e-05, + "loss": 0.1086, + "step": 17051 + }, + { + "epoch": 2.0220562077552473, + "grad_norm": 0.8973891395558506, + "learning_rate": 2.576799471843594e-05, + "loss": 0.0961, + "step": 17052 + }, + { + "epoch": 2.022174789517372, + "grad_norm": 0.7805505614285784, + "learning_rate": 2.5765595486648813e-05, + "loss": 0.0946, + "step": 17053 + }, + { + "epoch": 2.0222933712794973, + "grad_norm": 0.8853996509020174, + "learning_rate": 2.5763196247803832e-05, + "loss": 0.1239, + "step": 17054 + }, + { + "epoch": 2.022411953041622, + "grad_norm": 0.8497478953595783, + "learning_rate": 2.576079700192312e-05, + "loss": 0.1131, + "step": 17055 + }, + { + "epoch": 2.0225305348037472, + "grad_norm": 0.7257950660912152, + "learning_rate": 2.575839774902879e-05, + "loss": 0.0909, + "step": 17056 + }, + { + "epoch": 2.022649116565872, + "grad_norm": 0.6268970624268199, + "learning_rate": 2.5755998489142967e-05, + "loss": 0.0958, + "step": 17057 + }, + { + "epoch": 2.022767698327997, + "grad_norm": 0.9753271143104818, + "learning_rate": 2.5753599222287762e-05, + "loss": 0.1125, + "step": 17058 + }, + { + "epoch": 2.022886280090122, + "grad_norm": 0.7515901901542646, + "learning_rate": 2.5751199948485287e-05, + "loss": 0.0867, + "step": 17059 + }, + { + "epoch": 2.023004861852247, + "grad_norm": 0.9194005373700264, + "learning_rate": 2.574880066775768e-05, + "loss": 0.0927, + "step": 17060 + }, + { + "epoch": 2.023123443614372, + "grad_norm": 0.6945759128399535, + "learning_rate": 2.5746401380127046e-05, + "loss": 0.0592, + "step": 17061 + }, + { + "epoch": 2.023242025376497, + "grad_norm": 0.7696287877206168, + "learning_rate": 2.574400208561551e-05, + "loss": 0.1148, + "step": 17062 + }, + { + "epoch": 2.023360607138622, + "grad_norm": 0.7532940315124429, + "learning_rate": 2.5741602784245183e-05, + "loss": 0.11, + "step": 17063 + }, + { + "epoch": 2.023479188900747, + "grad_norm": 0.6937599794131871, + "learning_rate": 2.5739203476038192e-05, + "loss": 0.073, + "step": 17064 + }, + { + "epoch": 2.023597770662872, + "grad_norm": 0.7791104663188599, + "learning_rate": 2.573680416101665e-05, + "loss": 0.1074, + "step": 17065 + }, + { + "epoch": 2.023716352424997, + "grad_norm": 0.9057167020333378, + "learning_rate": 2.573440483920268e-05, + "loss": 0.1016, + "step": 17066 + }, + { + "epoch": 2.0238349341871222, + "grad_norm": 0.6316059368218953, + "learning_rate": 2.573200551061839e-05, + "loss": 0.0809, + "step": 17067 + }, + { + "epoch": 2.023953515949247, + "grad_norm": 0.7822805528735209, + "learning_rate": 2.5729606175285914e-05, + "loss": 0.0861, + "step": 17068 + }, + { + "epoch": 2.024072097711372, + "grad_norm": 0.74660995038701, + "learning_rate": 2.5727206833227357e-05, + "loss": 0.1041, + "step": 17069 + }, + { + "epoch": 2.024190679473497, + "grad_norm": 0.8356690181588445, + "learning_rate": 2.5724807484464845e-05, + "loss": 0.1059, + "step": 17070 + }, + { + "epoch": 2.024309261235622, + "grad_norm": 0.5339107080348244, + "learning_rate": 2.5722408129020504e-05, + "loss": 0.0587, + "step": 17071 + }, + { + "epoch": 2.024427842997747, + "grad_norm": 0.9804231617734867, + "learning_rate": 2.5720008766916437e-05, + "loss": 0.1427, + "step": 17072 + }, + { + "epoch": 2.024546424759872, + "grad_norm": 0.9604653151741845, + "learning_rate": 2.571760939817478e-05, + "loss": 0.1233, + "step": 17073 + }, + { + "epoch": 2.024665006521997, + "grad_norm": 0.6884276691468444, + "learning_rate": 2.5715210022817643e-05, + "loss": 0.1042, + "step": 17074 + }, + { + "epoch": 2.024783588284122, + "grad_norm": 0.9060697744579413, + "learning_rate": 2.5712810640867147e-05, + "loss": 0.1141, + "step": 17075 + }, + { + "epoch": 2.024902170046247, + "grad_norm": 0.7057465923283026, + "learning_rate": 2.5710411252345407e-05, + "loss": 0.0938, + "step": 17076 + }, + { + "epoch": 2.025020751808372, + "grad_norm": 0.6883731034488817, + "learning_rate": 2.570801185727455e-05, + "loss": 0.0869, + "step": 17077 + }, + { + "epoch": 2.025139333570497, + "grad_norm": 1.186062895007446, + "learning_rate": 2.570561245567669e-05, + "loss": 0.163, + "step": 17078 + }, + { + "epoch": 2.025257915332622, + "grad_norm": 1.1543371666190576, + "learning_rate": 2.5703213047573948e-05, + "loss": 0.1467, + "step": 17079 + }, + { + "epoch": 2.0253764970947468, + "grad_norm": 0.6572502179547971, + "learning_rate": 2.570081363298844e-05, + "loss": 0.083, + "step": 17080 + }, + { + "epoch": 2.025495078856872, + "grad_norm": 0.7886250622203809, + "learning_rate": 2.56984142119423e-05, + "loss": 0.0935, + "step": 17081 + }, + { + "epoch": 2.0256136606189967, + "grad_norm": 0.994791917373475, + "learning_rate": 2.5696014784457634e-05, + "loss": 0.1332, + "step": 17082 + }, + { + "epoch": 2.025732242381122, + "grad_norm": 0.5648207338701364, + "learning_rate": 2.5693615350556564e-05, + "loss": 0.0603, + "step": 17083 + }, + { + "epoch": 2.0258508241432467, + "grad_norm": 0.7229275810734652, + "learning_rate": 2.5691215910261206e-05, + "loss": 0.0928, + "step": 17084 + }, + { + "epoch": 2.025969405905372, + "grad_norm": 0.8276013725490831, + "learning_rate": 2.5688816463593696e-05, + "loss": 0.1175, + "step": 17085 + }, + { + "epoch": 2.0260879876674966, + "grad_norm": 0.8444169687651416, + "learning_rate": 2.5686417010576136e-05, + "loss": 0.1238, + "step": 17086 + }, + { + "epoch": 2.026206569429622, + "grad_norm": 0.7475774983235246, + "learning_rate": 2.5684017551230648e-05, + "loss": 0.0842, + "step": 17087 + }, + { + "epoch": 2.0263251511917466, + "grad_norm": 0.5182330941711295, + "learning_rate": 2.5681618085579368e-05, + "loss": 0.0663, + "step": 17088 + }, + { + "epoch": 2.026443732953872, + "grad_norm": 0.6347464389758741, + "learning_rate": 2.5679218613644395e-05, + "loss": 0.1043, + "step": 17089 + }, + { + "epoch": 2.0265623147159966, + "grad_norm": 0.7578994837101011, + "learning_rate": 2.567681913544786e-05, + "loss": 0.0842, + "step": 17090 + }, + { + "epoch": 2.0266808964781218, + "grad_norm": 0.6320586104307965, + "learning_rate": 2.567441965101189e-05, + "loss": 0.0746, + "step": 17091 + }, + { + "epoch": 2.0267994782402465, + "grad_norm": 0.8724461601409338, + "learning_rate": 2.5672020160358596e-05, + "loss": 0.1172, + "step": 17092 + }, + { + "epoch": 2.0269180600023717, + "grad_norm": 1.2592429030621068, + "learning_rate": 2.5669620663510097e-05, + "loss": 0.1544, + "step": 17093 + }, + { + "epoch": 2.0270366417644965, + "grad_norm": 0.7668679154467282, + "learning_rate": 2.566722116048852e-05, + "loss": 0.1044, + "step": 17094 + }, + { + "epoch": 2.0271552235266217, + "grad_norm": 0.559205659210123, + "learning_rate": 2.5664821651315972e-05, + "loss": 0.0798, + "step": 17095 + }, + { + "epoch": 2.0272738052887465, + "grad_norm": 0.7220715382530938, + "learning_rate": 2.5662422136014596e-05, + "loss": 0.1012, + "step": 17096 + }, + { + "epoch": 2.0273923870508717, + "grad_norm": 0.7657363291980417, + "learning_rate": 2.56600226146065e-05, + "loss": 0.0993, + "step": 17097 + }, + { + "epoch": 2.0275109688129964, + "grad_norm": 0.6784183079101309, + "learning_rate": 2.56576230871138e-05, + "loss": 0.0932, + "step": 17098 + }, + { + "epoch": 2.0276295505751216, + "grad_norm": 0.6030045868708649, + "learning_rate": 2.565522355355862e-05, + "loss": 0.0652, + "step": 17099 + }, + { + "epoch": 2.0277481323372464, + "grad_norm": 0.9313354223738955, + "learning_rate": 2.5652824013963082e-05, + "loss": 0.1054, + "step": 17100 + }, + { + "epoch": 2.0278667140993716, + "grad_norm": 0.7352117462217475, + "learning_rate": 2.565042446834931e-05, + "loss": 0.0764, + "step": 17101 + }, + { + "epoch": 2.0279852958614963, + "grad_norm": 1.0323174354291909, + "learning_rate": 2.5648024916739417e-05, + "loss": 0.1216, + "step": 17102 + }, + { + "epoch": 2.0281038776236215, + "grad_norm": 0.967034565508246, + "learning_rate": 2.5645625359155538e-05, + "loss": 0.1168, + "step": 17103 + }, + { + "epoch": 2.0282224593857463, + "grad_norm": 0.6200292497675817, + "learning_rate": 2.564322579561978e-05, + "loss": 0.0936, + "step": 17104 + }, + { + "epoch": 2.0283410411478715, + "grad_norm": 0.9083567889595318, + "learning_rate": 2.5640826226154275e-05, + "loss": 0.1175, + "step": 17105 + }, + { + "epoch": 2.0284596229099963, + "grad_norm": 0.5955925553708699, + "learning_rate": 2.563842665078113e-05, + "loss": 0.0917, + "step": 17106 + }, + { + "epoch": 2.0285782046721215, + "grad_norm": 0.9420046345779765, + "learning_rate": 2.563602706952248e-05, + "loss": 0.1374, + "step": 17107 + }, + { + "epoch": 2.0286967864342462, + "grad_norm": 0.8942214771122347, + "learning_rate": 2.563362748240043e-05, + "loss": 0.1136, + "step": 17108 + }, + { + "epoch": 2.0288153681963714, + "grad_norm": 0.7427830062636183, + "learning_rate": 2.563122788943712e-05, + "loss": 0.0828, + "step": 17109 + }, + { + "epoch": 2.028933949958496, + "grad_norm": 0.8469970320479016, + "learning_rate": 2.562882829065466e-05, + "loss": 0.0907, + "step": 17110 + }, + { + "epoch": 2.0290525317206214, + "grad_norm": 0.7356860877597104, + "learning_rate": 2.5626428686075175e-05, + "loss": 0.1006, + "step": 17111 + }, + { + "epoch": 2.029171113482746, + "grad_norm": 0.6072532689891982, + "learning_rate": 2.5624029075720785e-05, + "loss": 0.0976, + "step": 17112 + }, + { + "epoch": 2.0292896952448713, + "grad_norm": 0.8215295268604407, + "learning_rate": 2.5621629459613618e-05, + "loss": 0.0934, + "step": 17113 + }, + { + "epoch": 2.029408277006996, + "grad_norm": 1.0828270828400735, + "learning_rate": 2.561922983777578e-05, + "loss": 0.1445, + "step": 17114 + }, + { + "epoch": 2.0295268587691213, + "grad_norm": 0.7043023402986945, + "learning_rate": 2.5616830210229407e-05, + "loss": 0.1019, + "step": 17115 + }, + { + "epoch": 2.0296454405312465, + "grad_norm": 0.6252632262102591, + "learning_rate": 2.5614430576996623e-05, + "loss": 0.0991, + "step": 17116 + }, + { + "epoch": 2.0297640222933713, + "grad_norm": 0.5759798820493216, + "learning_rate": 2.5612030938099536e-05, + "loss": 0.0827, + "step": 17117 + }, + { + "epoch": 2.0298826040554965, + "grad_norm": 0.7912524568474645, + "learning_rate": 2.5609631293560275e-05, + "loss": 0.1095, + "step": 17118 + }, + { + "epoch": 2.0300011858176212, + "grad_norm": 0.6802165957015239, + "learning_rate": 2.5607231643400957e-05, + "loss": 0.1006, + "step": 17119 + }, + { + "epoch": 2.0301197675797464, + "grad_norm": 0.8040302901074956, + "learning_rate": 2.5604831987643714e-05, + "loss": 0.1141, + "step": 17120 + }, + { + "epoch": 2.030238349341871, + "grad_norm": 0.8420660064069628, + "learning_rate": 2.5602432326310656e-05, + "loss": 0.1258, + "step": 17121 + }, + { + "epoch": 2.0303569311039964, + "grad_norm": 0.5430076816338275, + "learning_rate": 2.5600032659423918e-05, + "loss": 0.0672, + "step": 17122 + }, + { + "epoch": 2.030475512866121, + "grad_norm": 0.803751424029278, + "learning_rate": 2.5597632987005604e-05, + "loss": 0.0996, + "step": 17123 + }, + { + "epoch": 2.0305940946282464, + "grad_norm": 0.9154898645706518, + "learning_rate": 2.559523330907786e-05, + "loss": 0.1164, + "step": 17124 + }, + { + "epoch": 2.030712676390371, + "grad_norm": 0.8258304849405087, + "learning_rate": 2.559283362566278e-05, + "loss": 0.1213, + "step": 17125 + }, + { + "epoch": 2.0308312581524963, + "grad_norm": 0.8546657257625709, + "learning_rate": 2.5590433936782514e-05, + "loss": 0.1054, + "step": 17126 + }, + { + "epoch": 2.030949839914621, + "grad_norm": 0.7237655126132667, + "learning_rate": 2.5588034242459162e-05, + "loss": 0.0914, + "step": 17127 + }, + { + "epoch": 2.0310684216767463, + "grad_norm": 0.9373188261096604, + "learning_rate": 2.558563454271486e-05, + "loss": 0.1359, + "step": 17128 + }, + { + "epoch": 2.031187003438871, + "grad_norm": 0.7212723887970613, + "learning_rate": 2.5583234837571724e-05, + "loss": 0.0868, + "step": 17129 + }, + { + "epoch": 2.0313055852009962, + "grad_norm": 0.697681672936011, + "learning_rate": 2.558083512705188e-05, + "loss": 0.0935, + "step": 17130 + }, + { + "epoch": 2.031424166963121, + "grad_norm": 0.856077545768736, + "learning_rate": 2.5578435411177448e-05, + "loss": 0.1098, + "step": 17131 + }, + { + "epoch": 2.031542748725246, + "grad_norm": 0.711742310593575, + "learning_rate": 2.5576035689970545e-05, + "loss": 0.0902, + "step": 17132 + }, + { + "epoch": 2.031661330487371, + "grad_norm": 0.7216678140190265, + "learning_rate": 2.5573635963453314e-05, + "loss": 0.1119, + "step": 17133 + }, + { + "epoch": 2.031779912249496, + "grad_norm": 0.6582327566660553, + "learning_rate": 2.557123623164785e-05, + "loss": 0.0823, + "step": 17134 + }, + { + "epoch": 2.031898494011621, + "grad_norm": 0.7436828718539309, + "learning_rate": 2.5568836494576294e-05, + "loss": 0.0794, + "step": 17135 + }, + { + "epoch": 2.032017075773746, + "grad_norm": 0.45305465666756656, + "learning_rate": 2.556643675226076e-05, + "loss": 0.0664, + "step": 17136 + }, + { + "epoch": 2.032135657535871, + "grad_norm": 0.8837176322523292, + "learning_rate": 2.556403700472338e-05, + "loss": 0.1107, + "step": 17137 + }, + { + "epoch": 2.032254239297996, + "grad_norm": 0.8724355849660478, + "learning_rate": 2.5561637251986258e-05, + "loss": 0.1027, + "step": 17138 + }, + { + "epoch": 2.032372821060121, + "grad_norm": 1.0413564499154007, + "learning_rate": 2.5559237494071535e-05, + "loss": 0.1266, + "step": 17139 + }, + { + "epoch": 2.032491402822246, + "grad_norm": 0.7421550628781671, + "learning_rate": 2.5556837731001326e-05, + "loss": 0.0967, + "step": 17140 + }, + { + "epoch": 2.032609984584371, + "grad_norm": 0.7064835562595498, + "learning_rate": 2.555443796279777e-05, + "loss": 0.0809, + "step": 17141 + }, + { + "epoch": 2.032728566346496, + "grad_norm": 1.0000569459316584, + "learning_rate": 2.5552038189482957e-05, + "loss": 0.1331, + "step": 17142 + }, + { + "epoch": 2.0328471481086208, + "grad_norm": 0.6163324208097111, + "learning_rate": 2.5549638411079042e-05, + "loss": 0.0924, + "step": 17143 + }, + { + "epoch": 2.032965729870746, + "grad_norm": 0.560162115620009, + "learning_rate": 2.5547238627608126e-05, + "loss": 0.0656, + "step": 17144 + }, + { + "epoch": 2.0330843116328707, + "grad_norm": 0.8187552082806973, + "learning_rate": 2.5544838839092346e-05, + "loss": 0.1132, + "step": 17145 + }, + { + "epoch": 2.033202893394996, + "grad_norm": 0.8143407419419377, + "learning_rate": 2.5542439045553823e-05, + "loss": 0.099, + "step": 17146 + }, + { + "epoch": 2.0333214751571207, + "grad_norm": 0.9975604018190043, + "learning_rate": 2.5540039247014668e-05, + "loss": 0.1116, + "step": 17147 + }, + { + "epoch": 2.033440056919246, + "grad_norm": 0.89708968692309, + "learning_rate": 2.5537639443497023e-05, + "loss": 0.1135, + "step": 17148 + }, + { + "epoch": 2.0335586386813707, + "grad_norm": 0.7374388256173335, + "learning_rate": 2.5535239635022995e-05, + "loss": 0.0895, + "step": 17149 + }, + { + "epoch": 2.033677220443496, + "grad_norm": 0.5945262275342809, + "learning_rate": 2.553283982161472e-05, + "loss": 0.0959, + "step": 17150 + }, + { + "epoch": 2.0337958022056206, + "grad_norm": 0.5949568784428877, + "learning_rate": 2.5530440003294302e-05, + "loss": 0.1014, + "step": 17151 + }, + { + "epoch": 2.033914383967746, + "grad_norm": 1.2070718924110628, + "learning_rate": 2.5528040180083897e-05, + "loss": 0.1002, + "step": 17152 + }, + { + "epoch": 2.0340329657298706, + "grad_norm": 0.5481130166178332, + "learning_rate": 2.5525640352005593e-05, + "loss": 0.0851, + "step": 17153 + }, + { + "epoch": 2.034151547491996, + "grad_norm": 0.6650548282519411, + "learning_rate": 2.552324051908154e-05, + "loss": 0.0921, + "step": 17154 + }, + { + "epoch": 2.0342701292541205, + "grad_norm": 0.6579508545418454, + "learning_rate": 2.5520840681333846e-05, + "loss": 0.0668, + "step": 17155 + }, + { + "epoch": 2.0343887110162457, + "grad_norm": 0.7726965330373178, + "learning_rate": 2.551844083878464e-05, + "loss": 0.108, + "step": 17156 + }, + { + "epoch": 2.0345072927783705, + "grad_norm": 0.5769942489225839, + "learning_rate": 2.5516040991456046e-05, + "loss": 0.0715, + "step": 17157 + }, + { + "epoch": 2.0346258745404957, + "grad_norm": 0.7019123335956361, + "learning_rate": 2.5513641139370182e-05, + "loss": 0.0989, + "step": 17158 + }, + { + "epoch": 2.0347444563026205, + "grad_norm": 0.899457040598042, + "learning_rate": 2.551124128254918e-05, + "loss": 0.1282, + "step": 17159 + }, + { + "epoch": 2.0348630380647457, + "grad_norm": 0.6136690650020217, + "learning_rate": 2.5508841421015163e-05, + "loss": 0.0731, + "step": 17160 + }, + { + "epoch": 2.0349816198268704, + "grad_norm": 0.7111649250780857, + "learning_rate": 2.550644155479025e-05, + "loss": 0.0895, + "step": 17161 + }, + { + "epoch": 2.0351002015889956, + "grad_norm": 0.9837016002678934, + "learning_rate": 2.5504041683896567e-05, + "loss": 0.1444, + "step": 17162 + }, + { + "epoch": 2.0352187833511204, + "grad_norm": 0.8094281431542377, + "learning_rate": 2.5501641808356243e-05, + "loss": 0.1238, + "step": 17163 + }, + { + "epoch": 2.0353373651132456, + "grad_norm": 0.8933273521801496, + "learning_rate": 2.5499241928191386e-05, + "loss": 0.1082, + "step": 17164 + }, + { + "epoch": 2.035455946875371, + "grad_norm": 0.8627677110838211, + "learning_rate": 2.549684204342414e-05, + "loss": 0.0961, + "step": 17165 + }, + { + "epoch": 2.0355745286374956, + "grad_norm": 0.937447163237741, + "learning_rate": 2.5494442154076614e-05, + "loss": 0.1265, + "step": 17166 + }, + { + "epoch": 2.0356931103996208, + "grad_norm": 1.1245233445450844, + "learning_rate": 2.5492042260170944e-05, + "loss": 0.173, + "step": 17167 + }, + { + "epoch": 2.0358116921617455, + "grad_norm": 1.2233203299505409, + "learning_rate": 2.5489642361729244e-05, + "loss": 0.1115, + "step": 17168 + }, + { + "epoch": 2.0359302739238707, + "grad_norm": 0.6846928329742754, + "learning_rate": 2.5487242458773634e-05, + "loss": 0.1007, + "step": 17169 + }, + { + "epoch": 2.0360488556859955, + "grad_norm": 0.77680738697427, + "learning_rate": 2.5484842551326256e-05, + "loss": 0.1348, + "step": 17170 + }, + { + "epoch": 2.0361674374481207, + "grad_norm": 0.7328886391236876, + "learning_rate": 2.548244263940922e-05, + "loss": 0.116, + "step": 17171 + }, + { + "epoch": 2.0362860192102454, + "grad_norm": 0.4123414097727911, + "learning_rate": 2.5480042723044656e-05, + "loss": 0.0618, + "step": 17172 + }, + { + "epoch": 2.0364046009723706, + "grad_norm": 0.9044851453408387, + "learning_rate": 2.5477642802254686e-05, + "loss": 0.106, + "step": 17173 + }, + { + "epoch": 2.0365231827344954, + "grad_norm": 1.0090451154892517, + "learning_rate": 2.547524287706144e-05, + "loss": 0.1077, + "step": 17174 + }, + { + "epoch": 2.0366417644966206, + "grad_norm": 0.8476140649731685, + "learning_rate": 2.547284294748703e-05, + "loss": 0.0804, + "step": 17175 + }, + { + "epoch": 2.0367603462587454, + "grad_norm": 1.2644333767774454, + "learning_rate": 2.5470443013553598e-05, + "loss": 0.1589, + "step": 17176 + }, + { + "epoch": 2.0368789280208706, + "grad_norm": 0.7963339222909953, + "learning_rate": 2.5468043075283243e-05, + "loss": 0.0838, + "step": 17177 + }, + { + "epoch": 2.0369975097829953, + "grad_norm": 0.7740753392343336, + "learning_rate": 2.5465643132698113e-05, + "loss": 0.1304, + "step": 17178 + }, + { + "epoch": 2.0371160915451205, + "grad_norm": 0.574792918004117, + "learning_rate": 2.5463243185820324e-05, + "loss": 0.0595, + "step": 17179 + }, + { + "epoch": 2.0372346733072453, + "grad_norm": 0.7161064856029704, + "learning_rate": 2.5460843234672006e-05, + "loss": 0.0688, + "step": 17180 + }, + { + "epoch": 2.0373532550693705, + "grad_norm": 0.6910893052919906, + "learning_rate": 2.5458443279275268e-05, + "loss": 0.0979, + "step": 17181 + }, + { + "epoch": 2.0374718368314952, + "grad_norm": 0.8132331231290478, + "learning_rate": 2.5456043319652256e-05, + "loss": 0.1155, + "step": 17182 + }, + { + "epoch": 2.0375904185936204, + "grad_norm": 0.7534945017637467, + "learning_rate": 2.545364335582508e-05, + "loss": 0.0905, + "step": 17183 + }, + { + "epoch": 2.037709000355745, + "grad_norm": 0.7840895937700697, + "learning_rate": 2.545124338781587e-05, + "loss": 0.0935, + "step": 17184 + }, + { + "epoch": 2.0378275821178704, + "grad_norm": 0.5410301749022228, + "learning_rate": 2.5448843415646745e-05, + "loss": 0.0644, + "step": 17185 + }, + { + "epoch": 2.037946163879995, + "grad_norm": 0.7203645657491574, + "learning_rate": 2.544644343933984e-05, + "loss": 0.0733, + "step": 17186 + }, + { + "epoch": 2.0380647456421204, + "grad_norm": 1.4618114902460237, + "learning_rate": 2.5444043458917267e-05, + "loss": 0.0759, + "step": 17187 + }, + { + "epoch": 2.038183327404245, + "grad_norm": 0.5680300174097448, + "learning_rate": 2.5441643474401156e-05, + "loss": 0.083, + "step": 17188 + }, + { + "epoch": 2.0383019091663703, + "grad_norm": 0.73499607337744, + "learning_rate": 2.5439243485813636e-05, + "loss": 0.0937, + "step": 17189 + }, + { + "epoch": 2.038420490928495, + "grad_norm": 0.6183150148114197, + "learning_rate": 2.543684349317683e-05, + "loss": 0.1038, + "step": 17190 + }, + { + "epoch": 2.0385390726906203, + "grad_norm": 0.7717527362636167, + "learning_rate": 2.543444349651287e-05, + "loss": 0.095, + "step": 17191 + }, + { + "epoch": 2.038657654452745, + "grad_norm": 0.6274099893015204, + "learning_rate": 2.5432043495843866e-05, + "loss": 0.0859, + "step": 17192 + }, + { + "epoch": 2.0387762362148703, + "grad_norm": 0.5591399586915927, + "learning_rate": 2.5429643491191955e-05, + "loss": 0.0826, + "step": 17193 + }, + { + "epoch": 2.038894817976995, + "grad_norm": 0.7108724385944604, + "learning_rate": 2.5427243482579255e-05, + "loss": 0.0962, + "step": 17194 + }, + { + "epoch": 2.03901339973912, + "grad_norm": 0.738325371324134, + "learning_rate": 2.54248434700279e-05, + "loss": 0.1019, + "step": 17195 + }, + { + "epoch": 2.039131981501245, + "grad_norm": 0.7696069687778948, + "learning_rate": 2.542244345356e-05, + "loss": 0.1049, + "step": 17196 + }, + { + "epoch": 2.03925056326337, + "grad_norm": 0.9183024538949882, + "learning_rate": 2.54200434331977e-05, + "loss": 0.1367, + "step": 17197 + }, + { + "epoch": 2.039369145025495, + "grad_norm": 0.8919507929824677, + "learning_rate": 2.5417643408963106e-05, + "loss": 0.1224, + "step": 17198 + }, + { + "epoch": 2.03948772678762, + "grad_norm": 0.8155695047315147, + "learning_rate": 2.541524338087835e-05, + "loss": 0.1043, + "step": 17199 + }, + { + "epoch": 2.039606308549745, + "grad_norm": 0.9296601033692349, + "learning_rate": 2.5412843348965566e-05, + "loss": 0.1354, + "step": 17200 + }, + { + "epoch": 2.03972489031187, + "grad_norm": 0.6131784579344348, + "learning_rate": 2.541044331324687e-05, + "loss": 0.0731, + "step": 17201 + }, + { + "epoch": 2.039843472073995, + "grad_norm": 0.5760572333107017, + "learning_rate": 2.540804327374439e-05, + "loss": 0.0745, + "step": 17202 + }, + { + "epoch": 2.03996205383612, + "grad_norm": 0.8100182908553717, + "learning_rate": 2.5405643230480247e-05, + "loss": 0.1027, + "step": 17203 + }, + { + "epoch": 2.040080635598245, + "grad_norm": 0.9540105050339577, + "learning_rate": 2.540324318347658e-05, + "loss": 0.1067, + "step": 17204 + }, + { + "epoch": 2.04019921736037, + "grad_norm": 0.5488115776568724, + "learning_rate": 2.5400843132755498e-05, + "loss": 0.0654, + "step": 17205 + }, + { + "epoch": 2.040317799122495, + "grad_norm": 0.7577541249916874, + "learning_rate": 2.539844307833914e-05, + "loss": 0.1181, + "step": 17206 + }, + { + "epoch": 2.04043638088462, + "grad_norm": 0.7088967797834963, + "learning_rate": 2.539604302024962e-05, + "loss": 0.0877, + "step": 17207 + }, + { + "epoch": 2.0405549626467447, + "grad_norm": 1.05132792706795, + "learning_rate": 2.539364295850907e-05, + "loss": 0.1641, + "step": 17208 + }, + { + "epoch": 2.04067354440887, + "grad_norm": 0.8111291263289154, + "learning_rate": 2.5391242893139617e-05, + "loss": 0.1215, + "step": 17209 + }, + { + "epoch": 2.0407921261709947, + "grad_norm": 0.8870372342542985, + "learning_rate": 2.5388842824163384e-05, + "loss": 0.0896, + "step": 17210 + }, + { + "epoch": 2.04091070793312, + "grad_norm": 0.8114259758363742, + "learning_rate": 2.538644275160249e-05, + "loss": 0.1219, + "step": 17211 + }, + { + "epoch": 2.0410292896952447, + "grad_norm": 0.7929628907671117, + "learning_rate": 2.538404267547908e-05, + "loss": 0.0932, + "step": 17212 + }, + { + "epoch": 2.04114787145737, + "grad_norm": 0.8667705238107806, + "learning_rate": 2.538164259581526e-05, + "loss": 0.1275, + "step": 17213 + }, + { + "epoch": 2.0412664532194946, + "grad_norm": 0.8673871174423586, + "learning_rate": 2.5379242512633167e-05, + "loss": 0.095, + "step": 17214 + }, + { + "epoch": 2.04138503498162, + "grad_norm": 0.9381284563251776, + "learning_rate": 2.5376842425954918e-05, + "loss": 0.1235, + "step": 17215 + }, + { + "epoch": 2.0415036167437446, + "grad_norm": 0.8355597687483332, + "learning_rate": 2.5374442335802645e-05, + "loss": 0.0951, + "step": 17216 + }, + { + "epoch": 2.04162219850587, + "grad_norm": 0.8024581956754345, + "learning_rate": 2.537204224219848e-05, + "loss": 0.0981, + "step": 17217 + }, + { + "epoch": 2.041740780267995, + "grad_norm": 0.5734878581488076, + "learning_rate": 2.536964214516453e-05, + "loss": 0.1069, + "step": 17218 + }, + { + "epoch": 2.0418593620301198, + "grad_norm": 0.4953034237645289, + "learning_rate": 2.5367242044722938e-05, + "loss": 0.0751, + "step": 17219 + }, + { + "epoch": 2.041977943792245, + "grad_norm": 0.7903399546548958, + "learning_rate": 2.5364841940895823e-05, + "loss": 0.0878, + "step": 17220 + }, + { + "epoch": 2.0420965255543697, + "grad_norm": 0.9503575303070972, + "learning_rate": 2.536244183370532e-05, + "loss": 0.1178, + "step": 17221 + }, + { + "epoch": 2.042215107316495, + "grad_norm": 0.6148463090628069, + "learning_rate": 2.536004172317354e-05, + "loss": 0.0849, + "step": 17222 + }, + { + "epoch": 2.0423336890786197, + "grad_norm": 0.8389076791776786, + "learning_rate": 2.5357641609322625e-05, + "loss": 0.0946, + "step": 17223 + }, + { + "epoch": 2.042452270840745, + "grad_norm": 0.8170796702119986, + "learning_rate": 2.5355241492174687e-05, + "loss": 0.1244, + "step": 17224 + }, + { + "epoch": 2.0425708526028696, + "grad_norm": 0.5144558116630039, + "learning_rate": 2.5352841371751855e-05, + "loss": 0.062, + "step": 17225 + }, + { + "epoch": 2.042689434364995, + "grad_norm": 0.8269906688445805, + "learning_rate": 2.535044124807626e-05, + "loss": 0.135, + "step": 17226 + }, + { + "epoch": 2.0428080161271196, + "grad_norm": 0.6930781588355727, + "learning_rate": 2.5348041121170036e-05, + "loss": 0.0858, + "step": 17227 + }, + { + "epoch": 2.042926597889245, + "grad_norm": 0.8007216634349194, + "learning_rate": 2.534564099105529e-05, + "loss": 0.0996, + "step": 17228 + }, + { + "epoch": 2.0430451796513696, + "grad_norm": 0.8381035860951902, + "learning_rate": 2.534324085775416e-05, + "loss": 0.104, + "step": 17229 + }, + { + "epoch": 2.0431637614134948, + "grad_norm": 0.9335938603043002, + "learning_rate": 2.5340840721288767e-05, + "loss": 0.1302, + "step": 17230 + }, + { + "epoch": 2.0432823431756195, + "grad_norm": 0.8966432668567896, + "learning_rate": 2.533844058168124e-05, + "loss": 0.1387, + "step": 17231 + }, + { + "epoch": 2.0434009249377447, + "grad_norm": 0.7738285089363771, + "learning_rate": 2.5336040438953712e-05, + "loss": 0.0996, + "step": 17232 + }, + { + "epoch": 2.0435195066998695, + "grad_norm": 0.9371198833730369, + "learning_rate": 2.53336402931283e-05, + "loss": 0.1135, + "step": 17233 + }, + { + "epoch": 2.0436380884619947, + "grad_norm": 0.7201626254657149, + "learning_rate": 2.5331240144227138e-05, + "loss": 0.0974, + "step": 17234 + }, + { + "epoch": 2.0437566702241194, + "grad_norm": 0.7445293250278847, + "learning_rate": 2.5328839992272342e-05, + "loss": 0.0987, + "step": 17235 + }, + { + "epoch": 2.0438752519862446, + "grad_norm": 0.8162464115155144, + "learning_rate": 2.5326439837286053e-05, + "loss": 0.088, + "step": 17236 + }, + { + "epoch": 2.0439938337483694, + "grad_norm": 0.9822702970230561, + "learning_rate": 2.5324039679290378e-05, + "loss": 0.1235, + "step": 17237 + }, + { + "epoch": 2.0441124155104946, + "grad_norm": 0.8237309809111716, + "learning_rate": 2.532163951830746e-05, + "loss": 0.087, + "step": 17238 + }, + { + "epoch": 2.0442309972726194, + "grad_norm": 0.710659450243611, + "learning_rate": 2.5319239354359415e-05, + "loss": 0.0978, + "step": 17239 + }, + { + "epoch": 2.0443495790347446, + "grad_norm": 0.8297034296333, + "learning_rate": 2.5316839187468383e-05, + "loss": 0.0965, + "step": 17240 + }, + { + "epoch": 2.0444681607968693, + "grad_norm": 0.6933196632210565, + "learning_rate": 2.5314439017656476e-05, + "loss": 0.0879, + "step": 17241 + }, + { + "epoch": 2.0445867425589945, + "grad_norm": 0.6467023829003546, + "learning_rate": 2.5312038844945836e-05, + "loss": 0.0741, + "step": 17242 + }, + { + "epoch": 2.0447053243211193, + "grad_norm": 0.7794495041392944, + "learning_rate": 2.530963866935857e-05, + "loss": 0.1094, + "step": 17243 + }, + { + "epoch": 2.0448239060832445, + "grad_norm": 0.5531254766616353, + "learning_rate": 2.5307238490916818e-05, + "loss": 0.0704, + "step": 17244 + }, + { + "epoch": 2.0449424878453693, + "grad_norm": 0.9129993805151213, + "learning_rate": 2.530483830964271e-05, + "loss": 0.113, + "step": 17245 + }, + { + "epoch": 2.0450610696074945, + "grad_norm": 0.5672802099551186, + "learning_rate": 2.5302438125558354e-05, + "loss": 0.0807, + "step": 17246 + }, + { + "epoch": 2.045179651369619, + "grad_norm": 0.6827076468992048, + "learning_rate": 2.53000379386859e-05, + "loss": 0.0801, + "step": 17247 + }, + { + "epoch": 2.0452982331317444, + "grad_norm": 0.8603059305043104, + "learning_rate": 2.529763774904746e-05, + "loss": 0.0987, + "step": 17248 + }, + { + "epoch": 2.045416814893869, + "grad_norm": 1.1044416377059272, + "learning_rate": 2.5295237556665163e-05, + "loss": 0.1361, + "step": 17249 + }, + { + "epoch": 2.0455353966559944, + "grad_norm": 0.5844603037224718, + "learning_rate": 2.529283736156114e-05, + "loss": 0.0604, + "step": 17250 + }, + { + "epoch": 2.045653978418119, + "grad_norm": 0.7444316748625215, + "learning_rate": 2.529043716375752e-05, + "loss": 0.0861, + "step": 17251 + }, + { + "epoch": 2.0457725601802443, + "grad_norm": 1.0653554613999525, + "learning_rate": 2.528803696327642e-05, + "loss": 0.147, + "step": 17252 + }, + { + "epoch": 2.045891141942369, + "grad_norm": 0.6378941383447395, + "learning_rate": 2.5285636760139973e-05, + "loss": 0.0681, + "step": 17253 + }, + { + "epoch": 2.0460097237044943, + "grad_norm": 0.6628789154445944, + "learning_rate": 2.5283236554370308e-05, + "loss": 0.0919, + "step": 17254 + }, + { + "epoch": 2.046128305466619, + "grad_norm": 0.6387138604432548, + "learning_rate": 2.528083634598955e-05, + "loss": 0.086, + "step": 17255 + }, + { + "epoch": 2.0462468872287443, + "grad_norm": 0.9159851150955914, + "learning_rate": 2.5278436135019824e-05, + "loss": 0.1419, + "step": 17256 + }, + { + "epoch": 2.046365468990869, + "grad_norm": 0.9577667548978641, + "learning_rate": 2.5276035921483253e-05, + "loss": 0.1441, + "step": 17257 + }, + { + "epoch": 2.046484050752994, + "grad_norm": 0.8545859831779087, + "learning_rate": 2.527363570540197e-05, + "loss": 0.1128, + "step": 17258 + }, + { + "epoch": 2.046602632515119, + "grad_norm": 0.9779277245345986, + "learning_rate": 2.5271235486798107e-05, + "loss": 0.1311, + "step": 17259 + }, + { + "epoch": 2.046721214277244, + "grad_norm": 0.6590918317056579, + "learning_rate": 2.5268835265693785e-05, + "loss": 0.1172, + "step": 17260 + }, + { + "epoch": 2.046839796039369, + "grad_norm": 0.7050818995517045, + "learning_rate": 2.5266435042111132e-05, + "loss": 0.0845, + "step": 17261 + }, + { + "epoch": 2.046958377801494, + "grad_norm": 0.6686723449738298, + "learning_rate": 2.5264034816072274e-05, + "loss": 0.1, + "step": 17262 + }, + { + "epoch": 2.047076959563619, + "grad_norm": 0.8261229084007741, + "learning_rate": 2.526163458759933e-05, + "loss": 0.1193, + "step": 17263 + }, + { + "epoch": 2.047195541325744, + "grad_norm": 0.8283100946640986, + "learning_rate": 2.525923435671445e-05, + "loss": 0.1139, + "step": 17264 + }, + { + "epoch": 2.047314123087869, + "grad_norm": 0.4235627386627182, + "learning_rate": 2.525683412343974e-05, + "loss": 0.0619, + "step": 17265 + }, + { + "epoch": 2.047432704849994, + "grad_norm": 0.6684450782990227, + "learning_rate": 2.5254433887797336e-05, + "loss": 0.1073, + "step": 17266 + }, + { + "epoch": 2.0475512866121193, + "grad_norm": 0.8844674247488972, + "learning_rate": 2.525203364980936e-05, + "loss": 0.1068, + "step": 17267 + }, + { + "epoch": 2.047669868374244, + "grad_norm": 0.6548932855823383, + "learning_rate": 2.5249633409497946e-05, + "loss": 0.0659, + "step": 17268 + }, + { + "epoch": 2.0477884501363692, + "grad_norm": 0.806759899845544, + "learning_rate": 2.5247233166885214e-05, + "loss": 0.1016, + "step": 17269 + }, + { + "epoch": 2.047907031898494, + "grad_norm": 0.6849290243785915, + "learning_rate": 2.52448329219933e-05, + "loss": 0.1067, + "step": 17270 + }, + { + "epoch": 2.048025613660619, + "grad_norm": 0.6678225180566302, + "learning_rate": 2.5242432674844324e-05, + "loss": 0.0833, + "step": 17271 + }, + { + "epoch": 2.048144195422744, + "grad_norm": 0.6709001309126991, + "learning_rate": 2.5240032425460414e-05, + "loss": 0.0975, + "step": 17272 + }, + { + "epoch": 2.048262777184869, + "grad_norm": 0.9820974719667647, + "learning_rate": 2.5237632173863705e-05, + "loss": 0.1282, + "step": 17273 + }, + { + "epoch": 2.048381358946994, + "grad_norm": 0.8551823750784705, + "learning_rate": 2.523523192007632e-05, + "loss": 0.1002, + "step": 17274 + }, + { + "epoch": 2.048499940709119, + "grad_norm": 0.8527961322160159, + "learning_rate": 2.5232831664120382e-05, + "loss": 0.1173, + "step": 17275 + }, + { + "epoch": 2.048618522471244, + "grad_norm": 0.5650993200792589, + "learning_rate": 2.523043140601802e-05, + "loss": 0.089, + "step": 17276 + }, + { + "epoch": 2.048737104233369, + "grad_norm": 0.8539906279720711, + "learning_rate": 2.522803114579137e-05, + "loss": 0.1489, + "step": 17277 + }, + { + "epoch": 2.048855685995494, + "grad_norm": 0.9337977110155599, + "learning_rate": 2.5225630883462543e-05, + "loss": 0.122, + "step": 17278 + }, + { + "epoch": 2.048974267757619, + "grad_norm": 1.232632667251937, + "learning_rate": 2.522323061905368e-05, + "loss": 0.1568, + "step": 17279 + }, + { + "epoch": 2.049092849519744, + "grad_norm": 0.6523927614928772, + "learning_rate": 2.52208303525869e-05, + "loss": 0.1005, + "step": 17280 + }, + { + "epoch": 2.049211431281869, + "grad_norm": 0.5378886319855706, + "learning_rate": 2.5218430084084345e-05, + "loss": 0.0697, + "step": 17281 + }, + { + "epoch": 2.0493300130439938, + "grad_norm": 0.6918385525423768, + "learning_rate": 2.5216029813568122e-05, + "loss": 0.0782, + "step": 17282 + }, + { + "epoch": 2.049448594806119, + "grad_norm": 0.7226585950906734, + "learning_rate": 2.5213629541060375e-05, + "loss": 0.0772, + "step": 17283 + }, + { + "epoch": 2.0495671765682437, + "grad_norm": 0.5853777654080909, + "learning_rate": 2.5211229266583226e-05, + "loss": 0.0806, + "step": 17284 + }, + { + "epoch": 2.049685758330369, + "grad_norm": 0.801103574635759, + "learning_rate": 2.52088289901588e-05, + "loss": 0.0991, + "step": 17285 + }, + { + "epoch": 2.0498043400924937, + "grad_norm": 0.9565901047513836, + "learning_rate": 2.5206428711809226e-05, + "loss": 0.1492, + "step": 17286 + }, + { + "epoch": 2.049922921854619, + "grad_norm": 0.5594351624436538, + "learning_rate": 2.520402843155663e-05, + "loss": 0.0613, + "step": 17287 + }, + { + "epoch": 2.0500415036167436, + "grad_norm": 0.5615061941742623, + "learning_rate": 2.5201628149423147e-05, + "loss": 0.081, + "step": 17288 + }, + { + "epoch": 2.050160085378869, + "grad_norm": 0.49190214833464163, + "learning_rate": 2.519922786543089e-05, + "loss": 0.0778, + "step": 17289 + }, + { + "epoch": 2.0502786671409936, + "grad_norm": 0.5361735252613351, + "learning_rate": 2.5196827579602013e-05, + "loss": 0.0759, + "step": 17290 + }, + { + "epoch": 2.050397248903119, + "grad_norm": 0.8431625119314979, + "learning_rate": 2.5194427291958617e-05, + "loss": 0.1066, + "step": 17291 + }, + { + "epoch": 2.0505158306652436, + "grad_norm": 0.814619379733209, + "learning_rate": 2.5192027002522843e-05, + "loss": 0.1017, + "step": 17292 + }, + { + "epoch": 2.0506344124273688, + "grad_norm": 1.0009646459090147, + "learning_rate": 2.518962671131681e-05, + "loss": 0.1482, + "step": 17293 + }, + { + "epoch": 2.0507529941894935, + "grad_norm": 0.6488660960195891, + "learning_rate": 2.5187226418362664e-05, + "loss": 0.073, + "step": 17294 + }, + { + "epoch": 2.0508715759516187, + "grad_norm": 0.859331075777397, + "learning_rate": 2.518482612368251e-05, + "loss": 0.12, + "step": 17295 + }, + { + "epoch": 2.0509901577137435, + "grad_norm": 0.7769438307464144, + "learning_rate": 2.518242582729849e-05, + "loss": 0.089, + "step": 17296 + }, + { + "epoch": 2.0511087394758687, + "grad_norm": 1.1180904894558377, + "learning_rate": 2.518002552923272e-05, + "loss": 0.1472, + "step": 17297 + }, + { + "epoch": 2.0512273212379935, + "grad_norm": 0.6975673398570656, + "learning_rate": 2.517762522950734e-05, + "loss": 0.0917, + "step": 17298 + }, + { + "epoch": 2.0513459030001187, + "grad_norm": 0.5700265467605818, + "learning_rate": 2.5175224928144468e-05, + "loss": 0.0911, + "step": 17299 + }, + { + "epoch": 2.0514644847622434, + "grad_norm": 0.9473984891778904, + "learning_rate": 2.5172824625166246e-05, + "loss": 0.1152, + "step": 17300 + }, + { + "epoch": 2.0515830665243686, + "grad_norm": 0.7007925952210723, + "learning_rate": 2.517042432059479e-05, + "loss": 0.0752, + "step": 17301 + }, + { + "epoch": 2.0517016482864934, + "grad_norm": 0.7857191563515793, + "learning_rate": 2.5168024014452224e-05, + "loss": 0.1068, + "step": 17302 + }, + { + "epoch": 2.0518202300486186, + "grad_norm": 0.9585678909029525, + "learning_rate": 2.5165623706760694e-05, + "loss": 0.148, + "step": 17303 + }, + { + "epoch": 2.0519388118107433, + "grad_norm": 0.5379678442702583, + "learning_rate": 2.5163223397542306e-05, + "loss": 0.0696, + "step": 17304 + }, + { + "epoch": 2.0520573935728685, + "grad_norm": 0.6434673449976782, + "learning_rate": 2.5160823086819208e-05, + "loss": 0.0801, + "step": 17305 + }, + { + "epoch": 2.0521759753349933, + "grad_norm": 0.8145734542412243, + "learning_rate": 2.5158422774613517e-05, + "loss": 0.1067, + "step": 17306 + }, + { + "epoch": 2.0522945570971185, + "grad_norm": 1.0703879973038493, + "learning_rate": 2.5156022460947355e-05, + "loss": 0.155, + "step": 17307 + }, + { + "epoch": 2.0524131388592433, + "grad_norm": 0.9127719219159255, + "learning_rate": 2.5153622145842857e-05, + "loss": 0.1136, + "step": 17308 + }, + { + "epoch": 2.0525317206213685, + "grad_norm": 0.6317510982479302, + "learning_rate": 2.5151221829322157e-05, + "loss": 0.0778, + "step": 17309 + }, + { + "epoch": 2.052650302383493, + "grad_norm": 0.7562078681946065, + "learning_rate": 2.5148821511407377e-05, + "loss": 0.0926, + "step": 17310 + }, + { + "epoch": 2.0527688841456184, + "grad_norm": 0.7701297182479242, + "learning_rate": 2.5146421192120646e-05, + "loss": 0.0974, + "step": 17311 + }, + { + "epoch": 2.052887465907743, + "grad_norm": 0.8802155301391068, + "learning_rate": 2.5144020871484088e-05, + "loss": 0.1247, + "step": 17312 + }, + { + "epoch": 2.0530060476698684, + "grad_norm": 0.6551511174288259, + "learning_rate": 2.514162054951984e-05, + "loss": 0.106, + "step": 17313 + }, + { + "epoch": 2.053124629431993, + "grad_norm": 0.6006635753676851, + "learning_rate": 2.5139220226250015e-05, + "loss": 0.0859, + "step": 17314 + }, + { + "epoch": 2.0532432111941183, + "grad_norm": 0.8559248001461366, + "learning_rate": 2.5136819901696755e-05, + "loss": 0.114, + "step": 17315 + }, + { + "epoch": 2.0533617929562435, + "grad_norm": 0.9144379690630021, + "learning_rate": 2.5134419575882183e-05, + "loss": 0.1159, + "step": 17316 + }, + { + "epoch": 2.0534803747183683, + "grad_norm": 0.4681361720810994, + "learning_rate": 2.5132019248828425e-05, + "loss": 0.07, + "step": 17317 + }, + { + "epoch": 2.0535989564804935, + "grad_norm": 0.6647163999155792, + "learning_rate": 2.5129618920557613e-05, + "loss": 0.0977, + "step": 17318 + }, + { + "epoch": 2.0537175382426183, + "grad_norm": 0.7878900383569605, + "learning_rate": 2.512721859109187e-05, + "loss": 0.0962, + "step": 17319 + }, + { + "epoch": 2.0538361200047435, + "grad_norm": 0.8051633033110276, + "learning_rate": 2.512481826045334e-05, + "loss": 0.1328, + "step": 17320 + }, + { + "epoch": 2.0539547017668682, + "grad_norm": 0.6162299887870892, + "learning_rate": 2.5122417928664126e-05, + "loss": 0.0628, + "step": 17321 + }, + { + "epoch": 2.0540732835289934, + "grad_norm": 0.664237532245905, + "learning_rate": 2.512001759574638e-05, + "loss": 0.087, + "step": 17322 + }, + { + "epoch": 2.054191865291118, + "grad_norm": 0.5538695362114293, + "learning_rate": 2.5117617261722208e-05, + "loss": 0.0685, + "step": 17323 + }, + { + "epoch": 2.0543104470532434, + "grad_norm": 0.8698840115791621, + "learning_rate": 2.511521692661376e-05, + "loss": 0.1145, + "step": 17324 + }, + { + "epoch": 2.054429028815368, + "grad_norm": 0.8154775192280642, + "learning_rate": 2.5112816590443145e-05, + "loss": 0.0933, + "step": 17325 + }, + { + "epoch": 2.0545476105774934, + "grad_norm": 0.479542029380308, + "learning_rate": 2.5110416253232504e-05, + "loss": 0.0683, + "step": 17326 + }, + { + "epoch": 2.054666192339618, + "grad_norm": 0.49458804676511914, + "learning_rate": 2.510801591500395e-05, + "loss": 0.05, + "step": 17327 + }, + { + "epoch": 2.0547847741017433, + "grad_norm": 0.6743539508358545, + "learning_rate": 2.5105615575779628e-05, + "loss": 0.1075, + "step": 17328 + }, + { + "epoch": 2.054903355863868, + "grad_norm": 0.6242414086413426, + "learning_rate": 2.5103215235581663e-05, + "loss": 0.0853, + "step": 17329 + }, + { + "epoch": 2.0550219376259933, + "grad_norm": 0.8067809867257045, + "learning_rate": 2.5100814894432173e-05, + "loss": 0.1129, + "step": 17330 + }, + { + "epoch": 2.055140519388118, + "grad_norm": 0.7586488835444153, + "learning_rate": 2.5098414552353304e-05, + "loss": 0.1026, + "step": 17331 + }, + { + "epoch": 2.0552591011502432, + "grad_norm": 0.6696189680738625, + "learning_rate": 2.5096014209367168e-05, + "loss": 0.0817, + "step": 17332 + }, + { + "epoch": 2.055377682912368, + "grad_norm": 0.7791020247650454, + "learning_rate": 2.5093613865495903e-05, + "loss": 0.1002, + "step": 17333 + }, + { + "epoch": 2.055496264674493, + "grad_norm": 0.9544249166240728, + "learning_rate": 2.5091213520761626e-05, + "loss": 0.1087, + "step": 17334 + }, + { + "epoch": 2.055614846436618, + "grad_norm": 0.7384603064975516, + "learning_rate": 2.5088813175186476e-05, + "loss": 0.0881, + "step": 17335 + }, + { + "epoch": 2.055733428198743, + "grad_norm": 0.5349458299250663, + "learning_rate": 2.5086412828792578e-05, + "loss": 0.087, + "step": 17336 + }, + { + "epoch": 2.055852009960868, + "grad_norm": 0.7139839381947889, + "learning_rate": 2.5084012481602055e-05, + "loss": 0.0806, + "step": 17337 + }, + { + "epoch": 2.055970591722993, + "grad_norm": 0.741966354295771, + "learning_rate": 2.508161213363704e-05, + "loss": 0.082, + "step": 17338 + }, + { + "epoch": 2.056089173485118, + "grad_norm": 1.0294325898716359, + "learning_rate": 2.5079211784919664e-05, + "loss": 0.1051, + "step": 17339 + }, + { + "epoch": 2.056207755247243, + "grad_norm": 0.5012361050374275, + "learning_rate": 2.5076811435472053e-05, + "loss": 0.0673, + "step": 17340 + }, + { + "epoch": 2.056326337009368, + "grad_norm": 0.4966263252115782, + "learning_rate": 2.5074411085316342e-05, + "loss": 0.0714, + "step": 17341 + }, + { + "epoch": 2.056444918771493, + "grad_norm": 0.6145999541600821, + "learning_rate": 2.507201073447464e-05, + "loss": 0.0899, + "step": 17342 + }, + { + "epoch": 2.056563500533618, + "grad_norm": 0.7172226531210161, + "learning_rate": 2.5069610382969093e-05, + "loss": 0.1026, + "step": 17343 + }, + { + "epoch": 2.056682082295743, + "grad_norm": 0.9305722033819583, + "learning_rate": 2.5067210030821818e-05, + "loss": 0.1323, + "step": 17344 + }, + { + "epoch": 2.0568006640578678, + "grad_norm": 0.7016495056442461, + "learning_rate": 2.5064809678054955e-05, + "loss": 0.0896, + "step": 17345 + }, + { + "epoch": 2.056919245819993, + "grad_norm": 0.49300254050396214, + "learning_rate": 2.5062409324690626e-05, + "loss": 0.0665, + "step": 17346 + }, + { + "epoch": 2.0570378275821177, + "grad_norm": 0.6162774312940101, + "learning_rate": 2.5060008970750958e-05, + "loss": 0.0809, + "step": 17347 + }, + { + "epoch": 2.057156409344243, + "grad_norm": 0.6707944989800667, + "learning_rate": 2.5057608616258076e-05, + "loss": 0.085, + "step": 17348 + }, + { + "epoch": 2.0572749911063677, + "grad_norm": 0.6836190295525338, + "learning_rate": 2.505520826123412e-05, + "loss": 0.0957, + "step": 17349 + }, + { + "epoch": 2.057393572868493, + "grad_norm": 0.6130067806482371, + "learning_rate": 2.5052807905701215e-05, + "loss": 0.0777, + "step": 17350 + }, + { + "epoch": 2.0575121546306177, + "grad_norm": 0.620184341828502, + "learning_rate": 2.5050407549681475e-05, + "loss": 0.0844, + "step": 17351 + }, + { + "epoch": 2.057630736392743, + "grad_norm": 0.6615369907875109, + "learning_rate": 2.504800719319705e-05, + "loss": 0.0933, + "step": 17352 + }, + { + "epoch": 2.0577493181548676, + "grad_norm": 0.9509091231322642, + "learning_rate": 2.504560683627005e-05, + "loss": 0.1069, + "step": 17353 + }, + { + "epoch": 2.057867899916993, + "grad_norm": 0.7606201870355929, + "learning_rate": 2.5043206478922614e-05, + "loss": 0.0948, + "step": 17354 + }, + { + "epoch": 2.0579864816791176, + "grad_norm": 0.7926225459572829, + "learning_rate": 2.5040806121176868e-05, + "loss": 0.0984, + "step": 17355 + }, + { + "epoch": 2.0581050634412428, + "grad_norm": 0.694434994328793, + "learning_rate": 2.503840576305494e-05, + "loss": 0.081, + "step": 17356 + }, + { + "epoch": 2.0582236452033675, + "grad_norm": 0.9263722873944029, + "learning_rate": 2.5036005404578954e-05, + "loss": 0.1568, + "step": 17357 + }, + { + "epoch": 2.0583422269654927, + "grad_norm": 0.7824805916775245, + "learning_rate": 2.5033605045771046e-05, + "loss": 0.0773, + "step": 17358 + }, + { + "epoch": 2.0584608087276175, + "grad_norm": 0.9976794330936313, + "learning_rate": 2.5031204686653336e-05, + "loss": 0.0978, + "step": 17359 + }, + { + "epoch": 2.0585793904897427, + "grad_norm": 0.8639758336376184, + "learning_rate": 2.5028804327247958e-05, + "loss": 0.1177, + "step": 17360 + }, + { + "epoch": 2.0586979722518675, + "grad_norm": 1.0204906541131034, + "learning_rate": 2.5026403967577045e-05, + "loss": 0.1393, + "step": 17361 + }, + { + "epoch": 2.0588165540139927, + "grad_norm": 0.794047996862685, + "learning_rate": 2.5024003607662715e-05, + "loss": 0.097, + "step": 17362 + }, + { + "epoch": 2.0589351357761174, + "grad_norm": 0.6230972933056133, + "learning_rate": 2.5021603247527103e-05, + "loss": 0.0773, + "step": 17363 + }, + { + "epoch": 2.0590537175382426, + "grad_norm": 0.5779396167671657, + "learning_rate": 2.5019202887192332e-05, + "loss": 0.0679, + "step": 17364 + }, + { + "epoch": 2.059172299300368, + "grad_norm": 0.7188071480834007, + "learning_rate": 2.5016802526680543e-05, + "loss": 0.081, + "step": 17365 + }, + { + "epoch": 2.0592908810624926, + "grad_norm": 0.7175663178486109, + "learning_rate": 2.501440216601385e-05, + "loss": 0.0809, + "step": 17366 + }, + { + "epoch": 2.059409462824618, + "grad_norm": 0.8167147709468672, + "learning_rate": 2.501200180521438e-05, + "loss": 0.0952, + "step": 17367 + }, + { + "epoch": 2.0595280445867425, + "grad_norm": 0.5601203037887833, + "learning_rate": 2.5009601444304276e-05, + "loss": 0.0714, + "step": 17368 + }, + { + "epoch": 2.0596466263488677, + "grad_norm": 0.710404668345531, + "learning_rate": 2.500720108330566e-05, + "loss": 0.0804, + "step": 17369 + }, + { + "epoch": 2.0597652081109925, + "grad_norm": 1.1915573272871223, + "learning_rate": 2.5004800722240652e-05, + "loss": 0.1329, + "step": 17370 + }, + { + "epoch": 2.0598837898731177, + "grad_norm": 0.9346327366477544, + "learning_rate": 2.5002400361131396e-05, + "loss": 0.1136, + "step": 17371 + }, + { + "epoch": 2.0600023716352425, + "grad_norm": 0.9218079671991417, + "learning_rate": 2.5e-05, + "loss": 0.0922, + "step": 17372 + }, + { + "epoch": 2.0601209533973677, + "grad_norm": 0.8079216936192907, + "learning_rate": 2.4997599638868617e-05, + "loss": 0.1009, + "step": 17373 + }, + { + "epoch": 2.0602395351594924, + "grad_norm": 0.7058243040490104, + "learning_rate": 2.4995199277759347e-05, + "loss": 0.1105, + "step": 17374 + }, + { + "epoch": 2.0603581169216176, + "grad_norm": 0.9013244553079822, + "learning_rate": 2.4992798916694346e-05, + "loss": 0.1285, + "step": 17375 + }, + { + "epoch": 2.0604766986837424, + "grad_norm": 0.7544294568680402, + "learning_rate": 2.4990398555695733e-05, + "loss": 0.0955, + "step": 17376 + }, + { + "epoch": 2.0605952804458676, + "grad_norm": 0.9910509765308024, + "learning_rate": 2.498799819478562e-05, + "loss": 0.1374, + "step": 17377 + }, + { + "epoch": 2.0607138622079924, + "grad_norm": 0.9116811334241266, + "learning_rate": 2.4985597833986158e-05, + "loss": 0.125, + "step": 17378 + }, + { + "epoch": 2.0608324439701176, + "grad_norm": 0.9496698813014962, + "learning_rate": 2.498319747331946e-05, + "loss": 0.1116, + "step": 17379 + }, + { + "epoch": 2.0609510257322423, + "grad_norm": 0.6033501593675298, + "learning_rate": 2.498079711280767e-05, + "loss": 0.0757, + "step": 17380 + }, + { + "epoch": 2.0610696074943675, + "grad_norm": 0.9742039958132089, + "learning_rate": 2.4978396752472903e-05, + "loss": 0.1261, + "step": 17381 + }, + { + "epoch": 2.0611881892564923, + "grad_norm": 0.7052985673542551, + "learning_rate": 2.4975996392337288e-05, + "loss": 0.0849, + "step": 17382 + }, + { + "epoch": 2.0613067710186175, + "grad_norm": 0.6331210219547435, + "learning_rate": 2.497359603242296e-05, + "loss": 0.085, + "step": 17383 + }, + { + "epoch": 2.0614253527807422, + "grad_norm": 0.7613102774878522, + "learning_rate": 2.4971195672752048e-05, + "loss": 0.08, + "step": 17384 + }, + { + "epoch": 2.0615439345428674, + "grad_norm": 0.5814165903207715, + "learning_rate": 2.4968795313346673e-05, + "loss": 0.0821, + "step": 17385 + }, + { + "epoch": 2.061662516304992, + "grad_norm": 0.7968028624284378, + "learning_rate": 2.496639495422896e-05, + "loss": 0.0857, + "step": 17386 + }, + { + "epoch": 2.0617810980671174, + "grad_norm": 0.7280287603778376, + "learning_rate": 2.4963994595421052e-05, + "loss": 0.0871, + "step": 17387 + }, + { + "epoch": 2.061899679829242, + "grad_norm": 0.7575432961342443, + "learning_rate": 2.496159423694507e-05, + "loss": 0.107, + "step": 17388 + }, + { + "epoch": 2.0620182615913674, + "grad_norm": 0.7819633502139661, + "learning_rate": 2.495919387882313e-05, + "loss": 0.094, + "step": 17389 + }, + { + "epoch": 2.062136843353492, + "grad_norm": 0.7573236215913882, + "learning_rate": 2.495679352107739e-05, + "loss": 0.1011, + "step": 17390 + }, + { + "epoch": 2.0622554251156173, + "grad_norm": 0.6168926173444154, + "learning_rate": 2.4954393163729955e-05, + "loss": 0.0814, + "step": 17391 + }, + { + "epoch": 2.062374006877742, + "grad_norm": 0.8145914435734596, + "learning_rate": 2.495199280680296e-05, + "loss": 0.1167, + "step": 17392 + }, + { + "epoch": 2.0624925886398673, + "grad_norm": 0.5177892389260967, + "learning_rate": 2.4949592450318525e-05, + "loss": 0.0672, + "step": 17393 + }, + { + "epoch": 2.062611170401992, + "grad_norm": 1.0768878183172357, + "learning_rate": 2.494719209429879e-05, + "loss": 0.1814, + "step": 17394 + }, + { + "epoch": 2.0627297521641172, + "grad_norm": 0.832796037165638, + "learning_rate": 2.4944791738765885e-05, + "loss": 0.1148, + "step": 17395 + }, + { + "epoch": 2.062848333926242, + "grad_norm": 0.8008599774313413, + "learning_rate": 2.4942391383741926e-05, + "loss": 0.1067, + "step": 17396 + }, + { + "epoch": 2.062966915688367, + "grad_norm": 0.7117029905504332, + "learning_rate": 2.4939991029249048e-05, + "loss": 0.1067, + "step": 17397 + }, + { + "epoch": 2.063085497450492, + "grad_norm": 0.9848679158363012, + "learning_rate": 2.4937590675309376e-05, + "loss": 0.1291, + "step": 17398 + }, + { + "epoch": 2.063204079212617, + "grad_norm": 0.9473192993649446, + "learning_rate": 2.493519032194505e-05, + "loss": 0.125, + "step": 17399 + }, + { + "epoch": 2.063322660974742, + "grad_norm": 0.9863792740506593, + "learning_rate": 2.4932789969178188e-05, + "loss": 0.1251, + "step": 17400 + }, + { + "epoch": 2.063441242736867, + "grad_norm": 0.9422638928282245, + "learning_rate": 2.4930389617030913e-05, + "loss": 0.1061, + "step": 17401 + }, + { + "epoch": 2.063559824498992, + "grad_norm": 0.6489765338447396, + "learning_rate": 2.4927989265525367e-05, + "loss": 0.1, + "step": 17402 + }, + { + "epoch": 2.063678406261117, + "grad_norm": 0.7555565256337547, + "learning_rate": 2.4925588914683674e-05, + "loss": 0.0924, + "step": 17403 + }, + { + "epoch": 2.063796988023242, + "grad_norm": 0.67499499680026, + "learning_rate": 2.4923188564527946e-05, + "loss": 0.0775, + "step": 17404 + }, + { + "epoch": 2.063915569785367, + "grad_norm": 0.9226940720124173, + "learning_rate": 2.4920788215080338e-05, + "loss": 0.1205, + "step": 17405 + }, + { + "epoch": 2.064034151547492, + "grad_norm": 0.815871920096329, + "learning_rate": 2.4918387866362967e-05, + "loss": 0.1041, + "step": 17406 + }, + { + "epoch": 2.064152733309617, + "grad_norm": 1.0368687441616573, + "learning_rate": 2.491598751839795e-05, + "loss": 0.1386, + "step": 17407 + }, + { + "epoch": 2.0642713150717418, + "grad_norm": 0.5477907425189817, + "learning_rate": 2.491358717120743e-05, + "loss": 0.064, + "step": 17408 + }, + { + "epoch": 2.064389896833867, + "grad_norm": 0.8273362174699883, + "learning_rate": 2.491118682481353e-05, + "loss": 0.0981, + "step": 17409 + }, + { + "epoch": 2.0645084785959917, + "grad_norm": 1.1202345412987194, + "learning_rate": 2.490878647923838e-05, + "loss": 0.1095, + "step": 17410 + }, + { + "epoch": 2.064627060358117, + "grad_norm": 1.0357873266219841, + "learning_rate": 2.490638613450411e-05, + "loss": 0.1479, + "step": 17411 + }, + { + "epoch": 2.0647456421202417, + "grad_norm": 1.0007571879469142, + "learning_rate": 2.4903985790632835e-05, + "loss": 0.1194, + "step": 17412 + }, + { + "epoch": 2.064864223882367, + "grad_norm": 0.8617522199577885, + "learning_rate": 2.49015854476467e-05, + "loss": 0.108, + "step": 17413 + }, + { + "epoch": 2.064982805644492, + "grad_norm": 0.5471503747883969, + "learning_rate": 2.489918510556783e-05, + "loss": 0.0795, + "step": 17414 + }, + { + "epoch": 2.065101387406617, + "grad_norm": 0.5893531229275928, + "learning_rate": 2.489678476441834e-05, + "loss": 0.0681, + "step": 17415 + }, + { + "epoch": 2.0652199691687416, + "grad_norm": 0.8561924053142149, + "learning_rate": 2.4894384424220375e-05, + "loss": 0.1375, + "step": 17416 + }, + { + "epoch": 2.065338550930867, + "grad_norm": 0.6637277093717949, + "learning_rate": 2.489198408499605e-05, + "loss": 0.0928, + "step": 17417 + }, + { + "epoch": 2.065457132692992, + "grad_norm": 0.825625850667534, + "learning_rate": 2.488958374676751e-05, + "loss": 0.0984, + "step": 17418 + }, + { + "epoch": 2.065575714455117, + "grad_norm": 0.7856345974055992, + "learning_rate": 2.4887183409556857e-05, + "loss": 0.1423, + "step": 17419 + }, + { + "epoch": 2.065694296217242, + "grad_norm": 0.8775316451716293, + "learning_rate": 2.4884783073386246e-05, + "loss": 0.1143, + "step": 17420 + }, + { + "epoch": 2.0658128779793667, + "grad_norm": 1.2032394781552735, + "learning_rate": 2.4882382738277794e-05, + "loss": 0.1487, + "step": 17421 + }, + { + "epoch": 2.065931459741492, + "grad_norm": 0.8158620796001086, + "learning_rate": 2.4879982404253633e-05, + "loss": 0.105, + "step": 17422 + }, + { + "epoch": 2.0660500415036167, + "grad_norm": 0.7724698444628536, + "learning_rate": 2.4877582071335873e-05, + "loss": 0.1023, + "step": 17423 + }, + { + "epoch": 2.066168623265742, + "grad_norm": 0.6189519374648508, + "learning_rate": 2.4875181739546665e-05, + "loss": 0.1072, + "step": 17424 + }, + { + "epoch": 2.0662872050278667, + "grad_norm": 0.8364203404272675, + "learning_rate": 2.487278140890813e-05, + "loss": 0.1049, + "step": 17425 + }, + { + "epoch": 2.066405786789992, + "grad_norm": 1.0718750502652417, + "learning_rate": 2.487038107944239e-05, + "loss": 0.1435, + "step": 17426 + }, + { + "epoch": 2.0665243685521166, + "grad_norm": 0.591789614784563, + "learning_rate": 2.4867980751171577e-05, + "loss": 0.0706, + "step": 17427 + }, + { + "epoch": 2.066642950314242, + "grad_norm": 0.8023197446764379, + "learning_rate": 2.486558042411782e-05, + "loss": 0.1025, + "step": 17428 + }, + { + "epoch": 2.0667615320763666, + "grad_norm": 0.6470044403810371, + "learning_rate": 2.486318009830325e-05, + "loss": 0.1073, + "step": 17429 + }, + { + "epoch": 2.066880113838492, + "grad_norm": 0.46605170592685263, + "learning_rate": 2.4860779773749984e-05, + "loss": 0.0694, + "step": 17430 + }, + { + "epoch": 2.0669986956006166, + "grad_norm": 0.7506070100844899, + "learning_rate": 2.4858379450480167e-05, + "loss": 0.0769, + "step": 17431 + }, + { + "epoch": 2.0671172773627418, + "grad_norm": 0.718483224984212, + "learning_rate": 2.4855979128515914e-05, + "loss": 0.0819, + "step": 17432 + }, + { + "epoch": 2.0672358591248665, + "grad_norm": 0.5167825363655733, + "learning_rate": 2.4853578807879363e-05, + "loss": 0.0873, + "step": 17433 + }, + { + "epoch": 2.0673544408869917, + "grad_norm": 0.693918161626258, + "learning_rate": 2.4851178488592622e-05, + "loss": 0.0942, + "step": 17434 + }, + { + "epoch": 2.0674730226491165, + "grad_norm": 0.5991142259960728, + "learning_rate": 2.4848778170677845e-05, + "loss": 0.0946, + "step": 17435 + }, + { + "epoch": 2.0675916044112417, + "grad_norm": 0.6589735329084676, + "learning_rate": 2.4846377854157145e-05, + "loss": 0.0955, + "step": 17436 + }, + { + "epoch": 2.0677101861733664, + "grad_norm": 0.7673235348964575, + "learning_rate": 2.484397753905265e-05, + "loss": 0.1143, + "step": 17437 + }, + { + "epoch": 2.0678287679354916, + "grad_norm": 0.5566386486894151, + "learning_rate": 2.4841577225386492e-05, + "loss": 0.0742, + "step": 17438 + }, + { + "epoch": 2.0679473496976164, + "grad_norm": 1.006724832717608, + "learning_rate": 2.4839176913180795e-05, + "loss": 0.1075, + "step": 17439 + }, + { + "epoch": 2.0680659314597416, + "grad_norm": 0.9521801682840229, + "learning_rate": 2.4836776602457696e-05, + "loss": 0.1407, + "step": 17440 + }, + { + "epoch": 2.0681845132218664, + "grad_norm": 0.5503396239170611, + "learning_rate": 2.4834376293239318e-05, + "loss": 0.0752, + "step": 17441 + }, + { + "epoch": 2.0683030949839916, + "grad_norm": 0.7356941114633565, + "learning_rate": 2.4831975985547775e-05, + "loss": 0.0992, + "step": 17442 + }, + { + "epoch": 2.0684216767461163, + "grad_norm": 0.9029228739961515, + "learning_rate": 2.4829575679405217e-05, + "loss": 0.135, + "step": 17443 + }, + { + "epoch": 2.0685402585082415, + "grad_norm": 0.6779259424724365, + "learning_rate": 2.4827175374833763e-05, + "loss": 0.0944, + "step": 17444 + }, + { + "epoch": 2.0686588402703663, + "grad_norm": 0.7128935352287706, + "learning_rate": 2.482477507185553e-05, + "loss": 0.105, + "step": 17445 + }, + { + "epoch": 2.0687774220324915, + "grad_norm": 0.7644405180561832, + "learning_rate": 2.482237477049267e-05, + "loss": 0.1005, + "step": 17446 + }, + { + "epoch": 2.0688960037946162, + "grad_norm": 1.0111958693075733, + "learning_rate": 2.4819974470767283e-05, + "loss": 0.1292, + "step": 17447 + }, + { + "epoch": 2.0690145855567414, + "grad_norm": 1.100901571546219, + "learning_rate": 2.4817574172701524e-05, + "loss": 0.1695, + "step": 17448 + }, + { + "epoch": 2.069133167318866, + "grad_norm": 0.6627063151389971, + "learning_rate": 2.4815173876317494e-05, + "loss": 0.109, + "step": 17449 + }, + { + "epoch": 2.0692517490809914, + "grad_norm": 0.6846387311082963, + "learning_rate": 2.4812773581637345e-05, + "loss": 0.0781, + "step": 17450 + }, + { + "epoch": 2.069370330843116, + "grad_norm": 0.7456614756363709, + "learning_rate": 2.4810373288683192e-05, + "loss": 0.0884, + "step": 17451 + }, + { + "epoch": 2.0694889126052414, + "grad_norm": 0.7755630180188738, + "learning_rate": 2.480797299747717e-05, + "loss": 0.1154, + "step": 17452 + }, + { + "epoch": 2.069607494367366, + "grad_norm": 0.6800636511163054, + "learning_rate": 2.4805572708041385e-05, + "loss": 0.1016, + "step": 17453 + }, + { + "epoch": 2.0697260761294913, + "grad_norm": 0.7674928930303733, + "learning_rate": 2.4803172420397993e-05, + "loss": 0.1169, + "step": 17454 + }, + { + "epoch": 2.069844657891616, + "grad_norm": 0.7542241209138679, + "learning_rate": 2.4800772134569112e-05, + "loss": 0.0821, + "step": 17455 + }, + { + "epoch": 2.0699632396537413, + "grad_norm": 0.7062613411967146, + "learning_rate": 2.4798371850576862e-05, + "loss": 0.0909, + "step": 17456 + }, + { + "epoch": 2.070081821415866, + "grad_norm": 0.8002659449382707, + "learning_rate": 2.4795971568443376e-05, + "loss": 0.1174, + "step": 17457 + }, + { + "epoch": 2.0702004031779913, + "grad_norm": 0.731067998027003, + "learning_rate": 2.479357128819078e-05, + "loss": 0.087, + "step": 17458 + }, + { + "epoch": 2.070318984940116, + "grad_norm": 0.672228602592148, + "learning_rate": 2.479117100984121e-05, + "loss": 0.0971, + "step": 17459 + }, + { + "epoch": 2.070437566702241, + "grad_norm": 1.1704731102944372, + "learning_rate": 2.4788770733416776e-05, + "loss": 0.1502, + "step": 17460 + }, + { + "epoch": 2.070556148464366, + "grad_norm": 0.6870449740869414, + "learning_rate": 2.478637045893963e-05, + "loss": 0.0958, + "step": 17461 + }, + { + "epoch": 2.070674730226491, + "grad_norm": 0.8634317731259946, + "learning_rate": 2.4783970186431884e-05, + "loss": 0.132, + "step": 17462 + }, + { + "epoch": 2.070793311988616, + "grad_norm": 0.6619572377933647, + "learning_rate": 2.478156991591567e-05, + "loss": 0.1125, + "step": 17463 + }, + { + "epoch": 2.070911893750741, + "grad_norm": 0.6098575742332693, + "learning_rate": 2.47791696474131e-05, + "loss": 0.0903, + "step": 17464 + }, + { + "epoch": 2.071030475512866, + "grad_norm": 0.5977225952807884, + "learning_rate": 2.477676938094633e-05, + "loss": 0.0861, + "step": 17465 + }, + { + "epoch": 2.071149057274991, + "grad_norm": 0.9308505459956217, + "learning_rate": 2.4774369116537462e-05, + "loss": 0.1169, + "step": 17466 + }, + { + "epoch": 2.0712676390371163, + "grad_norm": 0.6537130849144013, + "learning_rate": 2.4771968854208643e-05, + "loss": 0.0859, + "step": 17467 + }, + { + "epoch": 2.071386220799241, + "grad_norm": 0.7001951201599079, + "learning_rate": 2.476956859398198e-05, + "loss": 0.0782, + "step": 17468 + }, + { + "epoch": 2.0715048025613663, + "grad_norm": 0.8151969136124458, + "learning_rate": 2.4767168335879624e-05, + "loss": 0.1302, + "step": 17469 + }, + { + "epoch": 2.071623384323491, + "grad_norm": 0.5906058042568211, + "learning_rate": 2.476476807992369e-05, + "loss": 0.0727, + "step": 17470 + }, + { + "epoch": 2.0717419660856162, + "grad_norm": 0.5502265845891462, + "learning_rate": 2.4762367826136304e-05, + "loss": 0.0765, + "step": 17471 + }, + { + "epoch": 2.071860547847741, + "grad_norm": 0.6540609997372239, + "learning_rate": 2.475996757453958e-05, + "loss": 0.0833, + "step": 17472 + }, + { + "epoch": 2.071979129609866, + "grad_norm": 0.6107237720493016, + "learning_rate": 2.4757567325155682e-05, + "loss": 0.0858, + "step": 17473 + }, + { + "epoch": 2.072097711371991, + "grad_norm": 0.7826467514719663, + "learning_rate": 2.475516707800671e-05, + "loss": 0.115, + "step": 17474 + }, + { + "epoch": 2.072216293134116, + "grad_norm": 0.834847133865399, + "learning_rate": 2.4752766833114788e-05, + "loss": 0.0989, + "step": 17475 + }, + { + "epoch": 2.072334874896241, + "grad_norm": 0.9637651027812393, + "learning_rate": 2.475036659050206e-05, + "loss": 0.1282, + "step": 17476 + }, + { + "epoch": 2.072453456658366, + "grad_norm": 0.824813591040381, + "learning_rate": 2.4747966350190642e-05, + "loss": 0.1007, + "step": 17477 + }, + { + "epoch": 2.072572038420491, + "grad_norm": 0.8512803186351394, + "learning_rate": 2.4745566112202673e-05, + "loss": 0.1124, + "step": 17478 + }, + { + "epoch": 2.072690620182616, + "grad_norm": 0.835513229090426, + "learning_rate": 2.4743165876560263e-05, + "loss": 0.1357, + "step": 17479 + }, + { + "epoch": 2.072809201944741, + "grad_norm": 0.7230408011134642, + "learning_rate": 2.4740765643285555e-05, + "loss": 0.0845, + "step": 17480 + }, + { + "epoch": 2.072927783706866, + "grad_norm": 0.5292730228554473, + "learning_rate": 2.473836541240067e-05, + "loss": 0.068, + "step": 17481 + }, + { + "epoch": 2.073046365468991, + "grad_norm": 0.851011873802425, + "learning_rate": 2.473596518392774e-05, + "loss": 0.1234, + "step": 17482 + }, + { + "epoch": 2.073164947231116, + "grad_norm": 0.6998948592881266, + "learning_rate": 2.473356495788887e-05, + "loss": 0.1116, + "step": 17483 + }, + { + "epoch": 2.0732835289932408, + "grad_norm": 0.7823131013689462, + "learning_rate": 2.473116473430622e-05, + "loss": 0.0957, + "step": 17484 + }, + { + "epoch": 2.073402110755366, + "grad_norm": 0.7262117156711655, + "learning_rate": 2.47287645132019e-05, + "loss": 0.1059, + "step": 17485 + }, + { + "epoch": 2.0735206925174907, + "grad_norm": 0.7446288195084892, + "learning_rate": 2.4726364294598034e-05, + "loss": 0.0771, + "step": 17486 + }, + { + "epoch": 2.073639274279616, + "grad_norm": 0.8686893097541203, + "learning_rate": 2.472396407851675e-05, + "loss": 0.1046, + "step": 17487 + }, + { + "epoch": 2.0737578560417407, + "grad_norm": 0.9251466581273083, + "learning_rate": 2.4721563864980182e-05, + "loss": 0.0998, + "step": 17488 + }, + { + "epoch": 2.073876437803866, + "grad_norm": 0.6214377853897842, + "learning_rate": 2.471916365401046e-05, + "loss": 0.0902, + "step": 17489 + }, + { + "epoch": 2.0739950195659906, + "grad_norm": 0.6254542291000516, + "learning_rate": 2.4716763445629694e-05, + "loss": 0.0855, + "step": 17490 + }, + { + "epoch": 2.074113601328116, + "grad_norm": 0.7523832362464642, + "learning_rate": 2.471436323986003e-05, + "loss": 0.0972, + "step": 17491 + }, + { + "epoch": 2.0742321830902406, + "grad_norm": 0.8089277586336766, + "learning_rate": 2.4711963036723586e-05, + "loss": 0.1077, + "step": 17492 + }, + { + "epoch": 2.074350764852366, + "grad_norm": 0.9745265771555571, + "learning_rate": 2.470956283624249e-05, + "loss": 0.1404, + "step": 17493 + }, + { + "epoch": 2.0744693466144906, + "grad_norm": 0.523514167998119, + "learning_rate": 2.4707162638438862e-05, + "loss": 0.0707, + "step": 17494 + }, + { + "epoch": 2.0745879283766158, + "grad_norm": 0.6697298216260636, + "learning_rate": 2.4704762443334843e-05, + "loss": 0.0992, + "step": 17495 + }, + { + "epoch": 2.0747065101387405, + "grad_norm": 1.0885762035123567, + "learning_rate": 2.4702362250952547e-05, + "loss": 0.1761, + "step": 17496 + }, + { + "epoch": 2.0748250919008657, + "grad_norm": 0.5550552897508173, + "learning_rate": 2.469996206131411e-05, + "loss": 0.0735, + "step": 17497 + }, + { + "epoch": 2.0749436736629905, + "grad_norm": 0.763012749970266, + "learning_rate": 2.4697561874441645e-05, + "loss": 0.109, + "step": 17498 + }, + { + "epoch": 2.0750622554251157, + "grad_norm": 0.7762135285534536, + "learning_rate": 2.46951616903573e-05, + "loss": 0.0876, + "step": 17499 + }, + { + "epoch": 2.0751808371872404, + "grad_norm": 1.0242974179155813, + "learning_rate": 2.469276150908319e-05, + "loss": 0.1279, + "step": 17500 + }, + { + "epoch": 2.0752994189493656, + "grad_norm": 0.6581940153226955, + "learning_rate": 2.4690361330641432e-05, + "loss": 0.0738, + "step": 17501 + }, + { + "epoch": 2.0754180007114904, + "grad_norm": 0.8103622334349971, + "learning_rate": 2.4687961155054173e-05, + "loss": 0.1313, + "step": 17502 + }, + { + "epoch": 2.0755365824736156, + "grad_norm": 0.5619542655968248, + "learning_rate": 2.4685560982343527e-05, + "loss": 0.075, + "step": 17503 + }, + { + "epoch": 2.0756551642357404, + "grad_norm": 0.6337911668808532, + "learning_rate": 2.4683160812531627e-05, + "loss": 0.0765, + "step": 17504 + }, + { + "epoch": 2.0757737459978656, + "grad_norm": 0.9845624363669623, + "learning_rate": 2.4680760645640584e-05, + "loss": 0.125, + "step": 17505 + }, + { + "epoch": 2.0758923277599903, + "grad_norm": 0.9167232944212368, + "learning_rate": 2.4678360481692548e-05, + "loss": 0.1088, + "step": 17506 + }, + { + "epoch": 2.0760109095221155, + "grad_norm": 0.7797191591109154, + "learning_rate": 2.4675960320709628e-05, + "loss": 0.1256, + "step": 17507 + }, + { + "epoch": 2.0761294912842403, + "grad_norm": 0.5345357508808961, + "learning_rate": 2.467356016271396e-05, + "loss": 0.0727, + "step": 17508 + }, + { + "epoch": 2.0762480730463655, + "grad_norm": 0.5948296596698751, + "learning_rate": 2.467116000772766e-05, + "loss": 0.0777, + "step": 17509 + }, + { + "epoch": 2.0763666548084903, + "grad_norm": 0.6811774285130657, + "learning_rate": 2.4668759855772868e-05, + "loss": 0.0913, + "step": 17510 + }, + { + "epoch": 2.0764852365706155, + "grad_norm": 0.6720119257941368, + "learning_rate": 2.4666359706871704e-05, + "loss": 0.0971, + "step": 17511 + }, + { + "epoch": 2.07660381833274, + "grad_norm": 0.5958859309678184, + "learning_rate": 2.46639595610463e-05, + "loss": 0.0774, + "step": 17512 + }, + { + "epoch": 2.0767224000948654, + "grad_norm": 0.716237341038888, + "learning_rate": 2.4661559418318757e-05, + "loss": 0.0852, + "step": 17513 + }, + { + "epoch": 2.07684098185699, + "grad_norm": 1.0105586561385984, + "learning_rate": 2.465915927871124e-05, + "loss": 0.1247, + "step": 17514 + }, + { + "epoch": 2.0769595636191154, + "grad_norm": 0.8492776749552114, + "learning_rate": 2.4656759142245846e-05, + "loss": 0.1133, + "step": 17515 + }, + { + "epoch": 2.0770781453812406, + "grad_norm": 0.8272961600384933, + "learning_rate": 2.4654359008944716e-05, + "loss": 0.1258, + "step": 17516 + }, + { + "epoch": 2.0771967271433653, + "grad_norm": 0.49471177109418485, + "learning_rate": 2.465195887882997e-05, + "loss": 0.0817, + "step": 17517 + }, + { + "epoch": 2.0773153089054905, + "grad_norm": 1.1683998290003208, + "learning_rate": 2.464955875192374e-05, + "loss": 0.1547, + "step": 17518 + }, + { + "epoch": 2.0774338906676153, + "grad_norm": 0.8682636509523496, + "learning_rate": 2.464715862824815e-05, + "loss": 0.1078, + "step": 17519 + }, + { + "epoch": 2.0775524724297405, + "grad_norm": 0.7976375414808635, + "learning_rate": 2.4644758507825315e-05, + "loss": 0.1153, + "step": 17520 + }, + { + "epoch": 2.0776710541918653, + "grad_norm": 0.6301286004191206, + "learning_rate": 2.464235839067738e-05, + "loss": 0.1054, + "step": 17521 + }, + { + "epoch": 2.0777896359539905, + "grad_norm": 0.6273064868142915, + "learning_rate": 2.4639958276826467e-05, + "loss": 0.0907, + "step": 17522 + }, + { + "epoch": 2.0779082177161152, + "grad_norm": 0.9805440735070937, + "learning_rate": 2.463755816629469e-05, + "loss": 0.1059, + "step": 17523 + }, + { + "epoch": 2.0780267994782404, + "grad_norm": 1.1339877135164396, + "learning_rate": 2.4635158059104176e-05, + "loss": 0.1565, + "step": 17524 + }, + { + "epoch": 2.078145381240365, + "grad_norm": 0.8687485916670098, + "learning_rate": 2.4632757955277068e-05, + "loss": 0.1053, + "step": 17525 + }, + { + "epoch": 2.0782639630024904, + "grad_norm": 0.748006965290861, + "learning_rate": 2.4630357854835472e-05, + "loss": 0.1049, + "step": 17526 + }, + { + "epoch": 2.078382544764615, + "grad_norm": 0.9377184127845389, + "learning_rate": 2.4627957757801534e-05, + "loss": 0.1417, + "step": 17527 + }, + { + "epoch": 2.0785011265267404, + "grad_norm": 0.607561803405347, + "learning_rate": 2.4625557664197354e-05, + "loss": 0.0806, + "step": 17528 + }, + { + "epoch": 2.078619708288865, + "grad_norm": 0.8719246470156837, + "learning_rate": 2.4623157574045085e-05, + "loss": 0.1199, + "step": 17529 + }, + { + "epoch": 2.0787382900509903, + "grad_norm": 0.7114110731677584, + "learning_rate": 2.4620757487366843e-05, + "loss": 0.0866, + "step": 17530 + }, + { + "epoch": 2.078856871813115, + "grad_norm": 0.7098816482982149, + "learning_rate": 2.4618357404184743e-05, + "loss": 0.1083, + "step": 17531 + }, + { + "epoch": 2.0789754535752403, + "grad_norm": 0.5962859518727143, + "learning_rate": 2.4615957324520926e-05, + "loss": 0.0672, + "step": 17532 + }, + { + "epoch": 2.079094035337365, + "grad_norm": 0.5557378634183437, + "learning_rate": 2.461355724839751e-05, + "loss": 0.0752, + "step": 17533 + }, + { + "epoch": 2.0792126170994902, + "grad_norm": 0.7211975040859164, + "learning_rate": 2.4611157175836628e-05, + "loss": 0.0955, + "step": 17534 + }, + { + "epoch": 2.079331198861615, + "grad_norm": 0.6506114814105282, + "learning_rate": 2.4608757106860385e-05, + "loss": 0.0882, + "step": 17535 + }, + { + "epoch": 2.07944978062374, + "grad_norm": 0.7084767112120013, + "learning_rate": 2.4606357041490934e-05, + "loss": 0.0768, + "step": 17536 + }, + { + "epoch": 2.079568362385865, + "grad_norm": 0.8177746122898744, + "learning_rate": 2.4603956979750382e-05, + "loss": 0.1166, + "step": 17537 + }, + { + "epoch": 2.07968694414799, + "grad_norm": 0.6994291750083701, + "learning_rate": 2.460155692166087e-05, + "loss": 0.1, + "step": 17538 + }, + { + "epoch": 2.079805525910115, + "grad_norm": 0.901683699258696, + "learning_rate": 2.4599156867244498e-05, + "loss": 0.1528, + "step": 17539 + }, + { + "epoch": 2.07992410767224, + "grad_norm": 0.8341325783778095, + "learning_rate": 2.4596756816523425e-05, + "loss": 0.124, + "step": 17540 + }, + { + "epoch": 2.080042689434365, + "grad_norm": 0.9309733265333537, + "learning_rate": 2.4594356769519755e-05, + "loss": 0.1228, + "step": 17541 + }, + { + "epoch": 2.08016127119649, + "grad_norm": 0.8198339604041968, + "learning_rate": 2.459195672625562e-05, + "loss": 0.0907, + "step": 17542 + }, + { + "epoch": 2.080279852958615, + "grad_norm": 0.8877427765828633, + "learning_rate": 2.4589556686753133e-05, + "loss": 0.1109, + "step": 17543 + }, + { + "epoch": 2.08039843472074, + "grad_norm": 0.8152105320858147, + "learning_rate": 2.458715665103444e-05, + "loss": 0.076, + "step": 17544 + }, + { + "epoch": 2.080517016482865, + "grad_norm": 0.728634593349909, + "learning_rate": 2.4584756619121655e-05, + "loss": 0.1062, + "step": 17545 + }, + { + "epoch": 2.08063559824499, + "grad_norm": 1.0929498662265413, + "learning_rate": 2.4582356591036897e-05, + "loss": 0.1311, + "step": 17546 + }, + { + "epoch": 2.0807541800071148, + "grad_norm": 0.5988253003652865, + "learning_rate": 2.4579956566802307e-05, + "loss": 0.092, + "step": 17547 + }, + { + "epoch": 2.08087276176924, + "grad_norm": 0.6624654254923319, + "learning_rate": 2.4577556546440002e-05, + "loss": 0.092, + "step": 17548 + }, + { + "epoch": 2.0809913435313647, + "grad_norm": 0.8083842956592264, + "learning_rate": 2.457515652997211e-05, + "loss": 0.1041, + "step": 17549 + }, + { + "epoch": 2.08110992529349, + "grad_norm": 0.7751165072453795, + "learning_rate": 2.4572756517420744e-05, + "loss": 0.104, + "step": 17550 + }, + { + "epoch": 2.0812285070556147, + "grad_norm": 0.7158893203522532, + "learning_rate": 2.4570356508808047e-05, + "loss": 0.1028, + "step": 17551 + }, + { + "epoch": 2.08134708881774, + "grad_norm": 0.8784870868788169, + "learning_rate": 2.456795650415614e-05, + "loss": 0.1286, + "step": 17552 + }, + { + "epoch": 2.0814656705798646, + "grad_norm": 0.8973327844649757, + "learning_rate": 2.456555650348714e-05, + "loss": 0.1206, + "step": 17553 + }, + { + "epoch": 2.08158425234199, + "grad_norm": 0.9457314639356276, + "learning_rate": 2.456315650682317e-05, + "loss": 0.1191, + "step": 17554 + }, + { + "epoch": 2.0817028341041146, + "grad_norm": 0.6646022752950235, + "learning_rate": 2.4560756514186367e-05, + "loss": 0.1201, + "step": 17555 + }, + { + "epoch": 2.08182141586624, + "grad_norm": 1.0104121969263449, + "learning_rate": 2.455835652559885e-05, + "loss": 0.1462, + "step": 17556 + }, + { + "epoch": 2.0819399976283646, + "grad_norm": 0.7189441623391012, + "learning_rate": 2.4555956541082746e-05, + "loss": 0.0987, + "step": 17557 + }, + { + "epoch": 2.0820585793904898, + "grad_norm": 0.706220697546971, + "learning_rate": 2.455355656066017e-05, + "loss": 0.089, + "step": 17558 + }, + { + "epoch": 2.0821771611526145, + "grad_norm": 0.776011133866015, + "learning_rate": 2.4551156584353264e-05, + "loss": 0.1209, + "step": 17559 + }, + { + "epoch": 2.0822957429147397, + "grad_norm": 0.682417833366189, + "learning_rate": 2.4548756612184142e-05, + "loss": 0.1045, + "step": 17560 + }, + { + "epoch": 2.0824143246768645, + "grad_norm": 0.7746598914605438, + "learning_rate": 2.454635664417492e-05, + "loss": 0.1372, + "step": 17561 + }, + { + "epoch": 2.0825329064389897, + "grad_norm": 0.9844360354269304, + "learning_rate": 2.454395668034775e-05, + "loss": 0.1273, + "step": 17562 + }, + { + "epoch": 2.0826514882011145, + "grad_norm": 0.639976117616033, + "learning_rate": 2.4541556720724734e-05, + "loss": 0.0872, + "step": 17563 + }, + { + "epoch": 2.0827700699632397, + "grad_norm": 0.8419100723886338, + "learning_rate": 2.4539156765328007e-05, + "loss": 0.1303, + "step": 17564 + }, + { + "epoch": 2.082888651725365, + "grad_norm": 0.7230804309098228, + "learning_rate": 2.4536756814179675e-05, + "loss": 0.1101, + "step": 17565 + }, + { + "epoch": 2.0830072334874896, + "grad_norm": 1.0140153116664934, + "learning_rate": 2.453435686730189e-05, + "loss": 0.1764, + "step": 17566 + }, + { + "epoch": 2.083125815249615, + "grad_norm": 0.7102831110964155, + "learning_rate": 2.453195692471676e-05, + "loss": 0.1067, + "step": 17567 + }, + { + "epoch": 2.0832443970117396, + "grad_norm": 0.7996194201078745, + "learning_rate": 2.4529556986446418e-05, + "loss": 0.1293, + "step": 17568 + }, + { + "epoch": 2.083362978773865, + "grad_norm": 0.8108775112488361, + "learning_rate": 2.452715705251297e-05, + "loss": 0.1309, + "step": 17569 + }, + { + "epoch": 2.0834815605359895, + "grad_norm": 0.44894440894597726, + "learning_rate": 2.4524757122938567e-05, + "loss": 0.0538, + "step": 17570 + }, + { + "epoch": 2.0836001422981147, + "grad_norm": 1.0092731107540156, + "learning_rate": 2.452235719774532e-05, + "loss": 0.1328, + "step": 17571 + }, + { + "epoch": 2.0837187240602395, + "grad_norm": 0.76235051126539, + "learning_rate": 2.4519957276955354e-05, + "loss": 0.0952, + "step": 17572 + }, + { + "epoch": 2.0838373058223647, + "grad_norm": 0.4905061957884288, + "learning_rate": 2.4517557360590782e-05, + "loss": 0.0672, + "step": 17573 + }, + { + "epoch": 2.0839558875844895, + "grad_norm": 0.7663141212353435, + "learning_rate": 2.451515744867375e-05, + "loss": 0.1084, + "step": 17574 + }, + { + "epoch": 2.0840744693466147, + "grad_norm": 0.9391908667103068, + "learning_rate": 2.4512757541226368e-05, + "loss": 0.1126, + "step": 17575 + }, + { + "epoch": 2.0841930511087394, + "grad_norm": 1.0305139257393674, + "learning_rate": 2.4510357638270765e-05, + "loss": 0.1433, + "step": 17576 + }, + { + "epoch": 2.0843116328708646, + "grad_norm": 0.6285902237507764, + "learning_rate": 2.450795773982906e-05, + "loss": 0.0741, + "step": 17577 + }, + { + "epoch": 2.0844302146329894, + "grad_norm": 0.6943018202493522, + "learning_rate": 2.4505557845923392e-05, + "loss": 0.0779, + "step": 17578 + }, + { + "epoch": 2.0845487963951146, + "grad_norm": 0.8117865140706843, + "learning_rate": 2.450315795657587e-05, + "loss": 0.1091, + "step": 17579 + }, + { + "epoch": 2.0846673781572393, + "grad_norm": 0.9721600932731582, + "learning_rate": 2.4500758071808613e-05, + "loss": 0.1134, + "step": 17580 + }, + { + "epoch": 2.0847859599193646, + "grad_norm": 0.5661264548447491, + "learning_rate": 2.4498358191643763e-05, + "loss": 0.0867, + "step": 17581 + }, + { + "epoch": 2.0849045416814893, + "grad_norm": 0.8157213331216359, + "learning_rate": 2.449595831610344e-05, + "loss": 0.1255, + "step": 17582 + }, + { + "epoch": 2.0850231234436145, + "grad_norm": 0.9796931462223702, + "learning_rate": 2.449355844520976e-05, + "loss": 0.152, + "step": 17583 + }, + { + "epoch": 2.0851417052057393, + "grad_norm": 0.6666504916994459, + "learning_rate": 2.449115857898484e-05, + "loss": 0.1118, + "step": 17584 + }, + { + "epoch": 2.0852602869678645, + "grad_norm": 0.5124568468240628, + "learning_rate": 2.4488758717450825e-05, + "loss": 0.0601, + "step": 17585 + }, + { + "epoch": 2.0853788687299892, + "grad_norm": 0.6822491178229534, + "learning_rate": 2.448635886062982e-05, + "loss": 0.1087, + "step": 17586 + }, + { + "epoch": 2.0854974504921144, + "grad_norm": 0.6910763818323861, + "learning_rate": 2.448395900854396e-05, + "loss": 0.1028, + "step": 17587 + }, + { + "epoch": 2.085616032254239, + "grad_norm": 0.73425255902695, + "learning_rate": 2.4481559161215365e-05, + "loss": 0.0971, + "step": 17588 + }, + { + "epoch": 2.0857346140163644, + "grad_norm": 0.9889233631623555, + "learning_rate": 2.447915931866616e-05, + "loss": 0.0901, + "step": 17589 + }, + { + "epoch": 2.085853195778489, + "grad_norm": 0.8577216428977207, + "learning_rate": 2.447675948091847e-05, + "loss": 0.0895, + "step": 17590 + }, + { + "epoch": 2.0859717775406144, + "grad_norm": 0.9107646601297731, + "learning_rate": 2.4474359647994406e-05, + "loss": 0.1075, + "step": 17591 + }, + { + "epoch": 2.086090359302739, + "grad_norm": 0.8538064049803596, + "learning_rate": 2.4471959819916112e-05, + "loss": 0.1172, + "step": 17592 + }, + { + "epoch": 2.0862089410648643, + "grad_norm": 0.6550809201450599, + "learning_rate": 2.44695599967057e-05, + "loss": 0.0876, + "step": 17593 + }, + { + "epoch": 2.086327522826989, + "grad_norm": 0.8152713960218338, + "learning_rate": 2.446716017838529e-05, + "loss": 0.1126, + "step": 17594 + }, + { + "epoch": 2.0864461045891143, + "grad_norm": 0.9861425749236238, + "learning_rate": 2.446476036497701e-05, + "loss": 0.1404, + "step": 17595 + }, + { + "epoch": 2.086564686351239, + "grad_norm": 1.1338932156629615, + "learning_rate": 2.446236055650298e-05, + "loss": 0.1517, + "step": 17596 + }, + { + "epoch": 2.0866832681133642, + "grad_norm": 0.9200931438714708, + "learning_rate": 2.4459960752985338e-05, + "loss": 0.1108, + "step": 17597 + }, + { + "epoch": 2.086801849875489, + "grad_norm": 0.6969253686826722, + "learning_rate": 2.445756095444619e-05, + "loss": 0.0888, + "step": 17598 + }, + { + "epoch": 2.086920431637614, + "grad_norm": 1.0325795429643554, + "learning_rate": 2.4455161160907657e-05, + "loss": 0.1233, + "step": 17599 + }, + { + "epoch": 2.087039013399739, + "grad_norm": 0.6377420179786789, + "learning_rate": 2.4452761372391876e-05, + "loss": 0.1006, + "step": 17600 + }, + { + "epoch": 2.087157595161864, + "grad_norm": 0.7711336253125705, + "learning_rate": 2.445036158892097e-05, + "loss": 0.0961, + "step": 17601 + }, + { + "epoch": 2.087276176923989, + "grad_norm": 1.0033727868915332, + "learning_rate": 2.4447961810517042e-05, + "loss": 0.1009, + "step": 17602 + }, + { + "epoch": 2.087394758686114, + "grad_norm": 0.9466353534957586, + "learning_rate": 2.444556203720224e-05, + "loss": 0.1295, + "step": 17603 + }, + { + "epoch": 2.087513340448239, + "grad_norm": 0.6396654816264715, + "learning_rate": 2.4443162268998676e-05, + "loss": 0.0959, + "step": 17604 + }, + { + "epoch": 2.087631922210364, + "grad_norm": 0.7717400642353882, + "learning_rate": 2.444076250592847e-05, + "loss": 0.0956, + "step": 17605 + }, + { + "epoch": 2.087750503972489, + "grad_norm": 0.5085232405399873, + "learning_rate": 2.4438362748013748e-05, + "loss": 0.0859, + "step": 17606 + }, + { + "epoch": 2.087869085734614, + "grad_norm": 0.787164704662243, + "learning_rate": 2.443596299527663e-05, + "loss": 0.1239, + "step": 17607 + }, + { + "epoch": 2.087987667496739, + "grad_norm": 0.7226753304852985, + "learning_rate": 2.4433563247739246e-05, + "loss": 0.1049, + "step": 17608 + }, + { + "epoch": 2.088106249258864, + "grad_norm": 0.9304430633718083, + "learning_rate": 2.443116350542372e-05, + "loss": 0.1356, + "step": 17609 + }, + { + "epoch": 2.0882248310209888, + "grad_norm": 0.6855210883060936, + "learning_rate": 2.4428763768352153e-05, + "loss": 0.0931, + "step": 17610 + }, + { + "epoch": 2.088343412783114, + "grad_norm": 0.8787914443213503, + "learning_rate": 2.4426364036546695e-05, + "loss": 0.0993, + "step": 17611 + }, + { + "epoch": 2.0884619945452387, + "grad_norm": 0.7068518287368422, + "learning_rate": 2.4423964310029458e-05, + "loss": 0.1021, + "step": 17612 + }, + { + "epoch": 2.088580576307364, + "grad_norm": 0.6504642396711622, + "learning_rate": 2.442156458882256e-05, + "loss": 0.0841, + "step": 17613 + }, + { + "epoch": 2.088699158069489, + "grad_norm": 0.7433557604737919, + "learning_rate": 2.4419164872948123e-05, + "loss": 0.0924, + "step": 17614 + }, + { + "epoch": 2.088817739831614, + "grad_norm": 0.709053783611437, + "learning_rate": 2.4416765162428282e-05, + "loss": 0.0837, + "step": 17615 + }, + { + "epoch": 2.0889363215937387, + "grad_norm": 0.8398086788614837, + "learning_rate": 2.4414365457285145e-05, + "loss": 0.1131, + "step": 17616 + }, + { + "epoch": 2.089054903355864, + "grad_norm": 0.5883471183797552, + "learning_rate": 2.441196575754084e-05, + "loss": 0.0918, + "step": 17617 + }, + { + "epoch": 2.089173485117989, + "grad_norm": 0.7704184128060104, + "learning_rate": 2.440956606321749e-05, + "loss": 0.1151, + "step": 17618 + }, + { + "epoch": 2.089292066880114, + "grad_norm": 0.8413856651734203, + "learning_rate": 2.440716637433722e-05, + "loss": 0.106, + "step": 17619 + }, + { + "epoch": 2.089410648642239, + "grad_norm": 0.790596230558701, + "learning_rate": 2.4404766690922154e-05, + "loss": 0.1127, + "step": 17620 + }, + { + "epoch": 2.089529230404364, + "grad_norm": 0.9671456935997226, + "learning_rate": 2.4402367012994395e-05, + "loss": 0.1066, + "step": 17621 + }, + { + "epoch": 2.089647812166489, + "grad_norm": 0.7863601350384377, + "learning_rate": 2.439996734057609e-05, + "loss": 0.1147, + "step": 17622 + }, + { + "epoch": 2.0897663939286137, + "grad_norm": 0.5609428646860544, + "learning_rate": 2.4397567673689353e-05, + "loss": 0.0748, + "step": 17623 + }, + { + "epoch": 2.089884975690739, + "grad_norm": 0.9615468157565555, + "learning_rate": 2.4395168012356295e-05, + "loss": 0.1134, + "step": 17624 + }, + { + "epoch": 2.0900035574528637, + "grad_norm": 0.7620471522326553, + "learning_rate": 2.4392768356599045e-05, + "loss": 0.1067, + "step": 17625 + }, + { + "epoch": 2.090122139214989, + "grad_norm": 0.8240886647210064, + "learning_rate": 2.439036870643973e-05, + "loss": 0.1135, + "step": 17626 + }, + { + "epoch": 2.0902407209771137, + "grad_norm": 0.7106948296162857, + "learning_rate": 2.4387969061900473e-05, + "loss": 0.101, + "step": 17627 + }, + { + "epoch": 2.090359302739239, + "grad_norm": 0.7312884247017787, + "learning_rate": 2.4385569423003386e-05, + "loss": 0.1045, + "step": 17628 + }, + { + "epoch": 2.0904778845013636, + "grad_norm": 0.4792017999600944, + "learning_rate": 2.4383169789770592e-05, + "loss": 0.0582, + "step": 17629 + }, + { + "epoch": 2.090596466263489, + "grad_norm": 0.7092388227997387, + "learning_rate": 2.438077016222422e-05, + "loss": 0.1083, + "step": 17630 + }, + { + "epoch": 2.0907150480256136, + "grad_norm": 0.7503088423747168, + "learning_rate": 2.437837054038639e-05, + "loss": 0.0917, + "step": 17631 + }, + { + "epoch": 2.090833629787739, + "grad_norm": 0.8897422412715482, + "learning_rate": 2.437597092427921e-05, + "loss": 0.1282, + "step": 17632 + }, + { + "epoch": 2.0909522115498635, + "grad_norm": 0.9277302510988272, + "learning_rate": 2.4373571313924827e-05, + "loss": 0.1002, + "step": 17633 + }, + { + "epoch": 2.0910707933119888, + "grad_norm": 0.8077985333348756, + "learning_rate": 2.437117170934535e-05, + "loss": 0.1221, + "step": 17634 + }, + { + "epoch": 2.0911893750741135, + "grad_norm": 1.229855344940687, + "learning_rate": 2.436877211056289e-05, + "loss": 0.1641, + "step": 17635 + }, + { + "epoch": 2.0913079568362387, + "grad_norm": 0.9469704948747244, + "learning_rate": 2.4366372517599572e-05, + "loss": 0.1299, + "step": 17636 + }, + { + "epoch": 2.0914265385983635, + "grad_norm": 1.0463558536333686, + "learning_rate": 2.4363972930477527e-05, + "loss": 0.1188, + "step": 17637 + }, + { + "epoch": 2.0915451203604887, + "grad_norm": 0.7691373061105046, + "learning_rate": 2.4361573349218877e-05, + "loss": 0.0787, + "step": 17638 + }, + { + "epoch": 2.0916637021226134, + "grad_norm": 0.609574976790093, + "learning_rate": 2.4359173773845738e-05, + "loss": 0.0814, + "step": 17639 + }, + { + "epoch": 2.0917822838847386, + "grad_norm": 0.5768814457326507, + "learning_rate": 2.435677420438022e-05, + "loss": 0.0829, + "step": 17640 + }, + { + "epoch": 2.0919008656468634, + "grad_norm": 0.7862685009967889, + "learning_rate": 2.4354374640844468e-05, + "loss": 0.0965, + "step": 17641 + }, + { + "epoch": 2.0920194474089886, + "grad_norm": 0.7819893957648906, + "learning_rate": 2.4351975083260586e-05, + "loss": 0.0734, + "step": 17642 + }, + { + "epoch": 2.0921380291711134, + "grad_norm": 0.9640960034349493, + "learning_rate": 2.43495755316507e-05, + "loss": 0.1122, + "step": 17643 + }, + { + "epoch": 2.0922566109332386, + "grad_norm": 0.7052640202782057, + "learning_rate": 2.434717598603692e-05, + "loss": 0.0789, + "step": 17644 + }, + { + "epoch": 2.0923751926953633, + "grad_norm": 0.9684501802268194, + "learning_rate": 2.4344776446441385e-05, + "loss": 0.106, + "step": 17645 + }, + { + "epoch": 2.0924937744574885, + "grad_norm": 0.5948129398398712, + "learning_rate": 2.4342376912886212e-05, + "loss": 0.0912, + "step": 17646 + }, + { + "epoch": 2.0926123562196133, + "grad_norm": 0.9755656220530663, + "learning_rate": 2.4339977385393504e-05, + "loss": 0.1321, + "step": 17647 + }, + { + "epoch": 2.0927309379817385, + "grad_norm": 0.5399435344790019, + "learning_rate": 2.4337577863985406e-05, + "loss": 0.0838, + "step": 17648 + }, + { + "epoch": 2.0928495197438632, + "grad_norm": 0.7913035294045291, + "learning_rate": 2.433517834868403e-05, + "loss": 0.1345, + "step": 17649 + }, + { + "epoch": 2.0929681015059884, + "grad_norm": 0.8849693920752884, + "learning_rate": 2.4332778839511493e-05, + "loss": 0.1213, + "step": 17650 + }, + { + "epoch": 2.093086683268113, + "grad_norm": 0.7564928031194272, + "learning_rate": 2.4330379336489906e-05, + "loss": 0.1037, + "step": 17651 + }, + { + "epoch": 2.0932052650302384, + "grad_norm": 0.9379148021536517, + "learning_rate": 2.4327979839641413e-05, + "loss": 0.1338, + "step": 17652 + }, + { + "epoch": 2.093323846792363, + "grad_norm": 0.7643884004661348, + "learning_rate": 2.432558034898812e-05, + "loss": 0.0951, + "step": 17653 + }, + { + "epoch": 2.0934424285544884, + "grad_norm": 0.6303801348250948, + "learning_rate": 2.4323180864552143e-05, + "loss": 0.0919, + "step": 17654 + }, + { + "epoch": 2.093561010316613, + "grad_norm": 0.8447747999939689, + "learning_rate": 2.432078138635561e-05, + "loss": 0.1166, + "step": 17655 + }, + { + "epoch": 2.0936795920787383, + "grad_norm": 0.7994587036998436, + "learning_rate": 2.4318381914420638e-05, + "loss": 0.0979, + "step": 17656 + }, + { + "epoch": 2.093798173840863, + "grad_norm": 0.7793360224265851, + "learning_rate": 2.4315982448769355e-05, + "loss": 0.1032, + "step": 17657 + }, + { + "epoch": 2.0939167556029883, + "grad_norm": 0.8471945027335093, + "learning_rate": 2.4313582989423876e-05, + "loss": 0.1015, + "step": 17658 + }, + { + "epoch": 2.094035337365113, + "grad_norm": 0.6599988661901656, + "learning_rate": 2.431118353640631e-05, + "loss": 0.077, + "step": 17659 + }, + { + "epoch": 2.0941539191272383, + "grad_norm": 0.940169421718236, + "learning_rate": 2.4308784089738797e-05, + "loss": 0.1446, + "step": 17660 + }, + { + "epoch": 2.094272500889363, + "grad_norm": 1.0324906038422277, + "learning_rate": 2.430638464944345e-05, + "loss": 0.1417, + "step": 17661 + }, + { + "epoch": 2.094391082651488, + "grad_norm": 0.6776772147789699, + "learning_rate": 2.430398521554237e-05, + "loss": 0.0849, + "step": 17662 + }, + { + "epoch": 2.0945096644136134, + "grad_norm": 0.5057363333814702, + "learning_rate": 2.4301585788057703e-05, + "loss": 0.0695, + "step": 17663 + }, + { + "epoch": 2.094628246175738, + "grad_norm": 0.7274548497094527, + "learning_rate": 2.429918636701156e-05, + "loss": 0.0936, + "step": 17664 + }, + { + "epoch": 2.094746827937863, + "grad_norm": 0.6623585599965763, + "learning_rate": 2.4296786952426058e-05, + "loss": 0.1082, + "step": 17665 + }, + { + "epoch": 2.094865409699988, + "grad_norm": 0.7355729436029271, + "learning_rate": 2.4294387544323315e-05, + "loss": 0.0804, + "step": 17666 + }, + { + "epoch": 2.0949839914621133, + "grad_norm": 0.7126942093966403, + "learning_rate": 2.4291988142725452e-05, + "loss": 0.0874, + "step": 17667 + }, + { + "epoch": 2.095102573224238, + "grad_norm": 0.656426506460523, + "learning_rate": 2.42895887476546e-05, + "loss": 0.11, + "step": 17668 + }, + { + "epoch": 2.0952211549863633, + "grad_norm": 0.7540731303594375, + "learning_rate": 2.4287189359132866e-05, + "loss": 0.1033, + "step": 17669 + }, + { + "epoch": 2.095339736748488, + "grad_norm": 0.6316538331453777, + "learning_rate": 2.428478997718236e-05, + "loss": 0.0957, + "step": 17670 + }, + { + "epoch": 2.0954583185106133, + "grad_norm": 0.5933356186523511, + "learning_rate": 2.4282390601825226e-05, + "loss": 0.0841, + "step": 17671 + }, + { + "epoch": 2.095576900272738, + "grad_norm": 0.9463812725042277, + "learning_rate": 2.427999123308357e-05, + "loss": 0.131, + "step": 17672 + }, + { + "epoch": 2.095695482034863, + "grad_norm": 0.5598758932119056, + "learning_rate": 2.42775918709795e-05, + "loss": 0.0764, + "step": 17673 + }, + { + "epoch": 2.095814063796988, + "grad_norm": 1.0899660941760327, + "learning_rate": 2.4275192515535157e-05, + "loss": 0.1444, + "step": 17674 + }, + { + "epoch": 2.095932645559113, + "grad_norm": 0.803957916565864, + "learning_rate": 2.427279316677265e-05, + "loss": 0.0992, + "step": 17675 + }, + { + "epoch": 2.096051227321238, + "grad_norm": 0.5023693268536069, + "learning_rate": 2.4270393824714098e-05, + "loss": 0.0584, + "step": 17676 + }, + { + "epoch": 2.096169809083363, + "grad_norm": 0.8173363570260191, + "learning_rate": 2.426799448938161e-05, + "loss": 0.1106, + "step": 17677 + }, + { + "epoch": 2.096288390845488, + "grad_norm": 0.8118597190162863, + "learning_rate": 2.4265595160797326e-05, + "loss": 0.1144, + "step": 17678 + }, + { + "epoch": 2.096406972607613, + "grad_norm": 0.673899532209695, + "learning_rate": 2.4263195838983357e-05, + "loss": 0.087, + "step": 17679 + }, + { + "epoch": 2.096525554369738, + "grad_norm": 0.5679188514884848, + "learning_rate": 2.4260796523961817e-05, + "loss": 0.0689, + "step": 17680 + }, + { + "epoch": 2.096644136131863, + "grad_norm": 0.8101429492097422, + "learning_rate": 2.4258397215754816e-05, + "loss": 0.1177, + "step": 17681 + }, + { + "epoch": 2.096762717893988, + "grad_norm": 0.6217274515965322, + "learning_rate": 2.4255997914384496e-05, + "loss": 0.11, + "step": 17682 + }, + { + "epoch": 2.096881299656113, + "grad_norm": 0.725234668879478, + "learning_rate": 2.425359861987296e-05, + "loss": 0.0879, + "step": 17683 + }, + { + "epoch": 2.096999881418238, + "grad_norm": 0.8828961395564068, + "learning_rate": 2.4251199332242324e-05, + "loss": 0.098, + "step": 17684 + }, + { + "epoch": 2.097118463180363, + "grad_norm": 0.822838990991526, + "learning_rate": 2.4248800051514715e-05, + "loss": 0.1135, + "step": 17685 + }, + { + "epoch": 2.0972370449424877, + "grad_norm": 0.6420122210369841, + "learning_rate": 2.4246400777712247e-05, + "loss": 0.0962, + "step": 17686 + }, + { + "epoch": 2.097355626704613, + "grad_norm": 0.7427072417549657, + "learning_rate": 2.4244001510857042e-05, + "loss": 0.1083, + "step": 17687 + }, + { + "epoch": 2.0974742084667377, + "grad_norm": 0.9026830898643112, + "learning_rate": 2.424160225097121e-05, + "loss": 0.1075, + "step": 17688 + }, + { + "epoch": 2.097592790228863, + "grad_norm": 0.5616716478218573, + "learning_rate": 2.423920299807688e-05, + "loss": 0.066, + "step": 17689 + }, + { + "epoch": 2.0977113719909877, + "grad_norm": 0.6909623787362744, + "learning_rate": 2.423680375219617e-05, + "loss": 0.0942, + "step": 17690 + }, + { + "epoch": 2.097829953753113, + "grad_norm": 1.105110785788971, + "learning_rate": 2.4234404513351193e-05, + "loss": 0.13, + "step": 17691 + }, + { + "epoch": 2.0979485355152376, + "grad_norm": 0.7114622165563125, + "learning_rate": 2.4232005281564056e-05, + "loss": 0.0896, + "step": 17692 + }, + { + "epoch": 2.098067117277363, + "grad_norm": 0.8710466610109995, + "learning_rate": 2.4229606056856903e-05, + "loss": 0.118, + "step": 17693 + }, + { + "epoch": 2.0981856990394876, + "grad_norm": 0.6630048752358645, + "learning_rate": 2.4227206839251835e-05, + "loss": 0.0885, + "step": 17694 + }, + { + "epoch": 2.098304280801613, + "grad_norm": 0.7039339312468885, + "learning_rate": 2.4224807628770966e-05, + "loss": 0.0941, + "step": 17695 + }, + { + "epoch": 2.0984228625637376, + "grad_norm": 0.6497710909082565, + "learning_rate": 2.422240842543642e-05, + "loss": 0.0791, + "step": 17696 + }, + { + "epoch": 2.0985414443258628, + "grad_norm": 0.6425150647218325, + "learning_rate": 2.4220009229270314e-05, + "loss": 0.101, + "step": 17697 + }, + { + "epoch": 2.0986600260879875, + "grad_norm": 1.1405118261016693, + "learning_rate": 2.421761004029477e-05, + "loss": 0.1425, + "step": 17698 + }, + { + "epoch": 2.0987786078501127, + "grad_norm": 1.0265088551832222, + "learning_rate": 2.4215210858531906e-05, + "loss": 0.144, + "step": 17699 + }, + { + "epoch": 2.0988971896122375, + "grad_norm": 0.5761372645506845, + "learning_rate": 2.4212811684003824e-05, + "loss": 0.0715, + "step": 17700 + }, + { + "epoch": 2.0990157713743627, + "grad_norm": 0.7844613410716006, + "learning_rate": 2.4210412516732664e-05, + "loss": 0.1275, + "step": 17701 + }, + { + "epoch": 2.0991343531364874, + "grad_norm": 0.8224679819545181, + "learning_rate": 2.420801335674053e-05, + "loss": 0.0961, + "step": 17702 + }, + { + "epoch": 2.0992529348986126, + "grad_norm": 1.0322039740802258, + "learning_rate": 2.420561420404953e-05, + "loss": 0.1547, + "step": 17703 + }, + { + "epoch": 2.0993715166607374, + "grad_norm": 1.0556636487851627, + "learning_rate": 2.420321505868181e-05, + "loss": 0.1026, + "step": 17704 + }, + { + "epoch": 2.0994900984228626, + "grad_norm": 0.9495438803878967, + "learning_rate": 2.4200815920659458e-05, + "loss": 0.1355, + "step": 17705 + }, + { + "epoch": 2.0996086801849874, + "grad_norm": 0.4896309146710866, + "learning_rate": 2.4198416790004613e-05, + "loss": 0.0702, + "step": 17706 + }, + { + "epoch": 2.0997272619471126, + "grad_norm": 0.7753817724988226, + "learning_rate": 2.4196017666739372e-05, + "loss": 0.1062, + "step": 17707 + }, + { + "epoch": 2.0998458437092373, + "grad_norm": 0.7134151182700721, + "learning_rate": 2.419361855088587e-05, + "loss": 0.094, + "step": 17708 + }, + { + "epoch": 2.0999644254713625, + "grad_norm": 0.764586588293776, + "learning_rate": 2.4191219442466215e-05, + "loss": 0.1168, + "step": 17709 + }, + { + "epoch": 2.1000830072334873, + "grad_norm": 0.7634305650064045, + "learning_rate": 2.4188820341502528e-05, + "loss": 0.1187, + "step": 17710 + }, + { + "epoch": 2.1002015889956125, + "grad_norm": 0.6006375721324791, + "learning_rate": 2.4186421248016912e-05, + "loss": 0.0535, + "step": 17711 + }, + { + "epoch": 2.1003201707577372, + "grad_norm": 0.5843804994008797, + "learning_rate": 2.4184022162031502e-05, + "loss": 0.0782, + "step": 17712 + }, + { + "epoch": 2.1004387525198625, + "grad_norm": 0.6280050420341007, + "learning_rate": 2.4181623083568408e-05, + "loss": 0.0858, + "step": 17713 + }, + { + "epoch": 2.100557334281987, + "grad_norm": 0.9841011423399921, + "learning_rate": 2.4179224012649742e-05, + "loss": 0.1247, + "step": 17714 + }, + { + "epoch": 2.1006759160441124, + "grad_norm": 0.895230359301384, + "learning_rate": 2.4176824949297625e-05, + "loss": 0.106, + "step": 17715 + }, + { + "epoch": 2.1007944978062376, + "grad_norm": 0.6112718533517404, + "learning_rate": 2.417442589353417e-05, + "loss": 0.0845, + "step": 17716 + }, + { + "epoch": 2.1009130795683624, + "grad_norm": 0.8897589530194006, + "learning_rate": 2.41720268453815e-05, + "loss": 0.1458, + "step": 17717 + }, + { + "epoch": 2.1010316613304876, + "grad_norm": 0.8103495354626276, + "learning_rate": 2.4169627804861718e-05, + "loss": 0.1111, + "step": 17718 + }, + { + "epoch": 2.1011502430926123, + "grad_norm": 1.0141944510279146, + "learning_rate": 2.4167228771996963e-05, + "loss": 0.1064, + "step": 17719 + }, + { + "epoch": 2.1012688248547375, + "grad_norm": 0.9902496044941537, + "learning_rate": 2.4164829746809334e-05, + "loss": 0.1201, + "step": 17720 + }, + { + "epoch": 2.1013874066168623, + "grad_norm": 0.714358630911429, + "learning_rate": 2.4162430729320954e-05, + "loss": 0.1092, + "step": 17721 + }, + { + "epoch": 2.1015059883789875, + "grad_norm": 0.8795321749846019, + "learning_rate": 2.416003171955392e-05, + "loss": 0.1142, + "step": 17722 + }, + { + "epoch": 2.1016245701411123, + "grad_norm": 0.6559569863929873, + "learning_rate": 2.4157632717530376e-05, + "loss": 0.0912, + "step": 17723 + }, + { + "epoch": 2.1017431519032375, + "grad_norm": 0.9232786588239772, + "learning_rate": 2.415523372327242e-05, + "loss": 0.1106, + "step": 17724 + }, + { + "epoch": 2.101861733665362, + "grad_norm": 0.7764534046428707, + "learning_rate": 2.4152834736802183e-05, + "loss": 0.1084, + "step": 17725 + }, + { + "epoch": 2.1019803154274874, + "grad_norm": 0.6564453037413477, + "learning_rate": 2.4150435758141753e-05, + "loss": 0.1101, + "step": 17726 + }, + { + "epoch": 2.102098897189612, + "grad_norm": 0.8713781453181823, + "learning_rate": 2.414803678731328e-05, + "loss": 0.091, + "step": 17727 + }, + { + "epoch": 2.1022174789517374, + "grad_norm": 0.7758198426616747, + "learning_rate": 2.414563782433886e-05, + "loss": 0.1197, + "step": 17728 + }, + { + "epoch": 2.102336060713862, + "grad_norm": 0.5355027713253647, + "learning_rate": 2.4143238869240613e-05, + "loss": 0.0732, + "step": 17729 + }, + { + "epoch": 2.1024546424759873, + "grad_norm": 0.6464899771096846, + "learning_rate": 2.4140839922040643e-05, + "loss": 0.0857, + "step": 17730 + }, + { + "epoch": 2.102573224238112, + "grad_norm": 0.6803607006983631, + "learning_rate": 2.4138440982761085e-05, + "loss": 0.0892, + "step": 17731 + }, + { + "epoch": 2.1026918060002373, + "grad_norm": 1.0174868329402416, + "learning_rate": 2.4136042051424043e-05, + "loss": 0.1619, + "step": 17732 + }, + { + "epoch": 2.102810387762362, + "grad_norm": 0.7332631413163705, + "learning_rate": 2.4133643128051625e-05, + "loss": 0.0899, + "step": 17733 + }, + { + "epoch": 2.1029289695244873, + "grad_norm": 0.6066182471939238, + "learning_rate": 2.4131244212665965e-05, + "loss": 0.0782, + "step": 17734 + }, + { + "epoch": 2.103047551286612, + "grad_norm": 0.6900735547932704, + "learning_rate": 2.412884530528916e-05, + "loss": 0.0866, + "step": 17735 + }, + { + "epoch": 2.1031661330487372, + "grad_norm": 0.7094590637101457, + "learning_rate": 2.412644640594334e-05, + "loss": 0.0971, + "step": 17736 + }, + { + "epoch": 2.103284714810862, + "grad_norm": 0.7562324873939084, + "learning_rate": 2.4124047514650605e-05, + "loss": 0.0988, + "step": 17737 + }, + { + "epoch": 2.103403296572987, + "grad_norm": 0.8553330262311986, + "learning_rate": 2.4121648631433088e-05, + "loss": 0.1404, + "step": 17738 + }, + { + "epoch": 2.103521878335112, + "grad_norm": 0.7982551859644568, + "learning_rate": 2.411924975631289e-05, + "loss": 0.0965, + "step": 17739 + }, + { + "epoch": 2.103640460097237, + "grad_norm": 0.7902658656827232, + "learning_rate": 2.4116850889312135e-05, + "loss": 0.1056, + "step": 17740 + }, + { + "epoch": 2.103759041859362, + "grad_norm": 0.6339656275267481, + "learning_rate": 2.4114452030452918e-05, + "loss": 0.0878, + "step": 17741 + }, + { + "epoch": 2.103877623621487, + "grad_norm": 0.9634434942422359, + "learning_rate": 2.4112053179757377e-05, + "loss": 0.1187, + "step": 17742 + }, + { + "epoch": 2.103996205383612, + "grad_norm": 0.5700181400267402, + "learning_rate": 2.4109654337247616e-05, + "loss": 0.0724, + "step": 17743 + }, + { + "epoch": 2.104114787145737, + "grad_norm": 0.739067136967367, + "learning_rate": 2.410725550294574e-05, + "loss": 0.1165, + "step": 17744 + }, + { + "epoch": 2.104233368907862, + "grad_norm": 0.6761849926806303, + "learning_rate": 2.4104856676873884e-05, + "loss": 0.0889, + "step": 17745 + }, + { + "epoch": 2.104351950669987, + "grad_norm": 0.913632075367155, + "learning_rate": 2.410245785905415e-05, + "loss": 0.1075, + "step": 17746 + }, + { + "epoch": 2.104470532432112, + "grad_norm": 0.8617547109181376, + "learning_rate": 2.4100059049508657e-05, + "loss": 0.1147, + "step": 17747 + }, + { + "epoch": 2.104589114194237, + "grad_norm": 0.5983151200427411, + "learning_rate": 2.4097660248259504e-05, + "loss": 0.0943, + "step": 17748 + }, + { + "epoch": 2.1047076959563618, + "grad_norm": 0.6052797747410538, + "learning_rate": 2.409526145532883e-05, + "loss": 0.0751, + "step": 17749 + }, + { + "epoch": 2.104826277718487, + "grad_norm": 0.7319196233587663, + "learning_rate": 2.4092862670738734e-05, + "loss": 0.1074, + "step": 17750 + }, + { + "epoch": 2.1049448594806117, + "grad_norm": 0.7967879666773058, + "learning_rate": 2.4090463894511334e-05, + "loss": 0.1053, + "step": 17751 + }, + { + "epoch": 2.105063441242737, + "grad_norm": 0.8520511852485684, + "learning_rate": 2.4088065126668727e-05, + "loss": 0.0782, + "step": 17752 + }, + { + "epoch": 2.1051820230048617, + "grad_norm": 0.6620147064784508, + "learning_rate": 2.4085666367233058e-05, + "loss": 0.0961, + "step": 17753 + }, + { + "epoch": 2.105300604766987, + "grad_norm": 1.0810729059881963, + "learning_rate": 2.4083267616226413e-05, + "loss": 0.137, + "step": 17754 + }, + { + "epoch": 2.1054191865291116, + "grad_norm": 1.1459368994425814, + "learning_rate": 2.408086887367092e-05, + "loss": 0.1566, + "step": 17755 + }, + { + "epoch": 2.105537768291237, + "grad_norm": 0.689470151129963, + "learning_rate": 2.4078470139588686e-05, + "loss": 0.1001, + "step": 17756 + }, + { + "epoch": 2.1056563500533616, + "grad_norm": 0.8174044487007872, + "learning_rate": 2.4076071414001833e-05, + "loss": 0.0878, + "step": 17757 + }, + { + "epoch": 2.105774931815487, + "grad_norm": 0.8120325738384929, + "learning_rate": 2.4073672696932468e-05, + "loss": 0.1054, + "step": 17758 + }, + { + "epoch": 2.1058935135776116, + "grad_norm": 0.564762649992703, + "learning_rate": 2.4071273988402696e-05, + "loss": 0.0732, + "step": 17759 + }, + { + "epoch": 2.1060120953397368, + "grad_norm": 0.7006981982366348, + "learning_rate": 2.406887528843465e-05, + "loss": 0.108, + "step": 17760 + }, + { + "epoch": 2.1061306771018615, + "grad_norm": 0.7524271781285821, + "learning_rate": 2.406647659705043e-05, + "loss": 0.1124, + "step": 17761 + }, + { + "epoch": 2.1062492588639867, + "grad_norm": 0.5506039912980265, + "learning_rate": 2.406407791427215e-05, + "loss": 0.0895, + "step": 17762 + }, + { + "epoch": 2.1063678406261115, + "grad_norm": 0.6240073246551237, + "learning_rate": 2.4061679240121917e-05, + "loss": 0.095, + "step": 17763 + }, + { + "epoch": 2.1064864223882367, + "grad_norm": 0.6299683548672852, + "learning_rate": 2.405928057462186e-05, + "loss": 0.0943, + "step": 17764 + }, + { + "epoch": 2.106605004150362, + "grad_norm": 0.7578312100084077, + "learning_rate": 2.4056881917794078e-05, + "loss": 0.1007, + "step": 17765 + }, + { + "epoch": 2.1067235859124867, + "grad_norm": 0.9161755012684288, + "learning_rate": 2.4054483269660692e-05, + "loss": 0.1161, + "step": 17766 + }, + { + "epoch": 2.106842167674612, + "grad_norm": 0.7163363375820996, + "learning_rate": 2.4052084630243798e-05, + "loss": 0.1202, + "step": 17767 + }, + { + "epoch": 2.1069607494367366, + "grad_norm": 0.7294383753574907, + "learning_rate": 2.4049685999565536e-05, + "loss": 0.0956, + "step": 17768 + }, + { + "epoch": 2.107079331198862, + "grad_norm": 0.8449570676369849, + "learning_rate": 2.4047287377648e-05, + "loss": 0.0901, + "step": 17769 + }, + { + "epoch": 2.1071979129609866, + "grad_norm": 0.7827613058503096, + "learning_rate": 2.4044888764513308e-05, + "loss": 0.1193, + "step": 17770 + }, + { + "epoch": 2.1073164947231118, + "grad_norm": 0.8530923936744655, + "learning_rate": 2.404249016018356e-05, + "loss": 0.105, + "step": 17771 + }, + { + "epoch": 2.1074350764852365, + "grad_norm": 0.649209621160767, + "learning_rate": 2.4040091564680893e-05, + "loss": 0.0922, + "step": 17772 + }, + { + "epoch": 2.1075536582473617, + "grad_norm": 0.6070922340893752, + "learning_rate": 2.4037692978027392e-05, + "loss": 0.0769, + "step": 17773 + }, + { + "epoch": 2.1076722400094865, + "grad_norm": 0.6733284583004968, + "learning_rate": 2.4035294400245186e-05, + "loss": 0.0849, + "step": 17774 + }, + { + "epoch": 2.1077908217716117, + "grad_norm": 0.9105189636820087, + "learning_rate": 2.4032895831356385e-05, + "loss": 0.1208, + "step": 17775 + }, + { + "epoch": 2.1079094035337365, + "grad_norm": 0.49679875286117675, + "learning_rate": 2.40304972713831e-05, + "loss": 0.0565, + "step": 17776 + }, + { + "epoch": 2.1080279852958617, + "grad_norm": 0.656112978245929, + "learning_rate": 2.4028098720347443e-05, + "loss": 0.0874, + "step": 17777 + }, + { + "epoch": 2.1081465670579864, + "grad_norm": 0.5960129927328174, + "learning_rate": 2.402570017827151e-05, + "loss": 0.0665, + "step": 17778 + }, + { + "epoch": 2.1082651488201116, + "grad_norm": 0.8050447560936875, + "learning_rate": 2.402330164517744e-05, + "loss": 0.0967, + "step": 17779 + }, + { + "epoch": 2.1083837305822364, + "grad_norm": 0.7507278560063959, + "learning_rate": 2.4020903121087328e-05, + "loss": 0.1006, + "step": 17780 + }, + { + "epoch": 2.1085023123443616, + "grad_norm": 0.949728181147529, + "learning_rate": 2.4018504606023293e-05, + "loss": 0.1231, + "step": 17781 + }, + { + "epoch": 2.1086208941064863, + "grad_norm": 0.6076752731126464, + "learning_rate": 2.401610610000743e-05, + "loss": 0.0805, + "step": 17782 + }, + { + "epoch": 2.1087394758686115, + "grad_norm": 0.6930679748124102, + "learning_rate": 2.401370760306187e-05, + "loss": 0.0955, + "step": 17783 + }, + { + "epoch": 2.1088580576307363, + "grad_norm": 0.5573099705270429, + "learning_rate": 2.4011309115208715e-05, + "loss": 0.0776, + "step": 17784 + }, + { + "epoch": 2.1089766393928615, + "grad_norm": 0.7150654906236121, + "learning_rate": 2.400891063647008e-05, + "loss": 0.0925, + "step": 17785 + }, + { + "epoch": 2.1090952211549863, + "grad_norm": 0.8899017393344478, + "learning_rate": 2.4006512166868063e-05, + "loss": 0.1196, + "step": 17786 + }, + { + "epoch": 2.1092138029171115, + "grad_norm": 0.6412620519044003, + "learning_rate": 2.40041137064248e-05, + "loss": 0.0839, + "step": 17787 + }, + { + "epoch": 2.1093323846792362, + "grad_norm": 0.7191295068563591, + "learning_rate": 2.4001715255162386e-05, + "loss": 0.0738, + "step": 17788 + }, + { + "epoch": 2.1094509664413614, + "grad_norm": 0.7765443185627715, + "learning_rate": 2.399931681310292e-05, + "loss": 0.0991, + "step": 17789 + }, + { + "epoch": 2.109569548203486, + "grad_norm": 0.9567393272746687, + "learning_rate": 2.3996918380268536e-05, + "loss": 0.1305, + "step": 17790 + }, + { + "epoch": 2.1096881299656114, + "grad_norm": 0.6842986500556484, + "learning_rate": 2.399451995668134e-05, + "loss": 0.095, + "step": 17791 + }, + { + "epoch": 2.109806711727736, + "grad_norm": 0.7381934445125651, + "learning_rate": 2.3992121542363434e-05, + "loss": 0.0837, + "step": 17792 + }, + { + "epoch": 2.1099252934898614, + "grad_norm": 0.8771190703715759, + "learning_rate": 2.3989723137336917e-05, + "loss": 0.0807, + "step": 17793 + }, + { + "epoch": 2.110043875251986, + "grad_norm": 0.6530270771993045, + "learning_rate": 2.3987324741623928e-05, + "loss": 0.0899, + "step": 17794 + }, + { + "epoch": 2.1101624570141113, + "grad_norm": 0.6525910681525993, + "learning_rate": 2.3984926355246556e-05, + "loss": 0.0788, + "step": 17795 + }, + { + "epoch": 2.110281038776236, + "grad_norm": 0.8675742517335266, + "learning_rate": 2.3982527978226926e-05, + "loss": 0.1149, + "step": 17796 + }, + { + "epoch": 2.1103996205383613, + "grad_norm": 0.754610113296398, + "learning_rate": 2.398012961058713e-05, + "loss": 0.0969, + "step": 17797 + }, + { + "epoch": 2.110518202300486, + "grad_norm": 0.5747259637751972, + "learning_rate": 2.3977731252349296e-05, + "loss": 0.077, + "step": 17798 + }, + { + "epoch": 2.1106367840626112, + "grad_norm": 0.5626709239382442, + "learning_rate": 2.397533290353553e-05, + "loss": 0.0737, + "step": 17799 + }, + { + "epoch": 2.110755365824736, + "grad_norm": 0.8426468869969377, + "learning_rate": 2.3972934564167933e-05, + "loss": 0.1017, + "step": 17800 + }, + { + "epoch": 2.110873947586861, + "grad_norm": 1.4873802594728036, + "learning_rate": 2.3970536234268614e-05, + "loss": 0.1275, + "step": 17801 + }, + { + "epoch": 2.110992529348986, + "grad_norm": 0.6011422419712529, + "learning_rate": 2.39681379138597e-05, + "loss": 0.0853, + "step": 17802 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 0.7728765701838523, + "learning_rate": 2.3965739602963277e-05, + "loss": 0.1058, + "step": 17803 + }, + { + "epoch": 2.111229692873236, + "grad_norm": 1.1056663562313997, + "learning_rate": 2.3963341301601473e-05, + "loss": 0.1427, + "step": 17804 + }, + { + "epoch": 2.111348274635361, + "grad_norm": 0.6811363902703893, + "learning_rate": 2.3960943009796384e-05, + "loss": 0.0869, + "step": 17805 + }, + { + "epoch": 2.111466856397486, + "grad_norm": 0.9640801532611685, + "learning_rate": 2.395854472757014e-05, + "loss": 0.1238, + "step": 17806 + }, + { + "epoch": 2.111585438159611, + "grad_norm": 0.9107438050565287, + "learning_rate": 2.395614645494483e-05, + "loss": 0.1267, + "step": 17807 + }, + { + "epoch": 2.111704019921736, + "grad_norm": 0.9701612775025429, + "learning_rate": 2.3953748191942564e-05, + "loss": 0.0983, + "step": 17808 + }, + { + "epoch": 2.111822601683861, + "grad_norm": 0.8783902491264383, + "learning_rate": 2.3951349938585462e-05, + "loss": 0.1346, + "step": 17809 + }, + { + "epoch": 2.111941183445986, + "grad_norm": 0.650268689490047, + "learning_rate": 2.394895169489563e-05, + "loss": 0.0828, + "step": 17810 + }, + { + "epoch": 2.112059765208111, + "grad_norm": 0.7597270165914234, + "learning_rate": 2.3946553460895177e-05, + "loss": 0.0939, + "step": 17811 + }, + { + "epoch": 2.1121783469702358, + "grad_norm": 0.6929831137685498, + "learning_rate": 2.3944155236606196e-05, + "loss": 0.1053, + "step": 17812 + }, + { + "epoch": 2.112296928732361, + "grad_norm": 0.5949188911808995, + "learning_rate": 2.394175702205082e-05, + "loss": 0.084, + "step": 17813 + }, + { + "epoch": 2.112415510494486, + "grad_norm": 0.7279869069527176, + "learning_rate": 2.3939358817251142e-05, + "loss": 0.0807, + "step": 17814 + }, + { + "epoch": 2.112534092256611, + "grad_norm": 0.7574817326237568, + "learning_rate": 2.3936960622229284e-05, + "loss": 0.088, + "step": 17815 + }, + { + "epoch": 2.112652674018736, + "grad_norm": 0.7849555715671664, + "learning_rate": 2.3934562437007326e-05, + "loss": 0.0952, + "step": 17816 + }, + { + "epoch": 2.112771255780861, + "grad_norm": 0.804884572087379, + "learning_rate": 2.3932164261607412e-05, + "loss": 0.0908, + "step": 17817 + }, + { + "epoch": 2.112889837542986, + "grad_norm": 0.7482047034339723, + "learning_rate": 2.3929766096051635e-05, + "loss": 0.0916, + "step": 17818 + }, + { + "epoch": 2.113008419305111, + "grad_norm": 0.8927369943301499, + "learning_rate": 2.3927367940362087e-05, + "loss": 0.121, + "step": 17819 + }, + { + "epoch": 2.113127001067236, + "grad_norm": 0.6726783278809668, + "learning_rate": 2.3924969794560907e-05, + "loss": 0.1036, + "step": 17820 + }, + { + "epoch": 2.113245582829361, + "grad_norm": 0.8691615581385709, + "learning_rate": 2.3922571658670185e-05, + "loss": 0.1261, + "step": 17821 + }, + { + "epoch": 2.113364164591486, + "grad_norm": 0.7641283096014578, + "learning_rate": 2.3920173532712032e-05, + "loss": 0.1046, + "step": 17822 + }, + { + "epoch": 2.1134827463536108, + "grad_norm": 0.7431156182840443, + "learning_rate": 2.391777541670854e-05, + "loss": 0.1227, + "step": 17823 + }, + { + "epoch": 2.113601328115736, + "grad_norm": 0.9504905934445264, + "learning_rate": 2.3915377310681848e-05, + "loss": 0.1015, + "step": 17824 + }, + { + "epoch": 2.1137199098778607, + "grad_norm": 1.0267345234592098, + "learning_rate": 2.3912979214654038e-05, + "loss": 0.1461, + "step": 17825 + }, + { + "epoch": 2.113838491639986, + "grad_norm": 0.5827311428096316, + "learning_rate": 2.3910581128647234e-05, + "loss": 0.0721, + "step": 17826 + }, + { + "epoch": 2.1139570734021107, + "grad_norm": 0.8043094400104702, + "learning_rate": 2.390818305268352e-05, + "loss": 0.1129, + "step": 17827 + }, + { + "epoch": 2.114075655164236, + "grad_norm": 0.6764780388616012, + "learning_rate": 2.390578498678504e-05, + "loss": 0.0878, + "step": 17828 + }, + { + "epoch": 2.1141942369263607, + "grad_norm": 0.8778304084853604, + "learning_rate": 2.3903386930973872e-05, + "loss": 0.1199, + "step": 17829 + }, + { + "epoch": 2.114312818688486, + "grad_norm": 0.774064690962849, + "learning_rate": 2.3900988885272125e-05, + "loss": 0.1026, + "step": 17830 + }, + { + "epoch": 2.1144314004506106, + "grad_norm": 0.7959558239054422, + "learning_rate": 2.3898590849701926e-05, + "loss": 0.1072, + "step": 17831 + }, + { + "epoch": 2.114549982212736, + "grad_norm": 0.7663234810541174, + "learning_rate": 2.3896192824285364e-05, + "loss": 0.1087, + "step": 17832 + }, + { + "epoch": 2.1146685639748606, + "grad_norm": 1.1154225383463556, + "learning_rate": 2.3893794809044545e-05, + "loss": 0.126, + "step": 17833 + }, + { + "epoch": 2.114787145736986, + "grad_norm": 0.5991087705834621, + "learning_rate": 2.3891396804001585e-05, + "loss": 0.0875, + "step": 17834 + }, + { + "epoch": 2.1149057274991105, + "grad_norm": 1.0291719577881016, + "learning_rate": 2.3888998809178583e-05, + "loss": 0.1368, + "step": 17835 + }, + { + "epoch": 2.1150243092612357, + "grad_norm": 0.7016637997683266, + "learning_rate": 2.3886600824597656e-05, + "loss": 0.078, + "step": 17836 + }, + { + "epoch": 2.1151428910233605, + "grad_norm": 0.8377366007230417, + "learning_rate": 2.3884202850280908e-05, + "loss": 0.096, + "step": 17837 + }, + { + "epoch": 2.1152614727854857, + "grad_norm": 0.9003806095364464, + "learning_rate": 2.3881804886250424e-05, + "loss": 0.1167, + "step": 17838 + }, + { + "epoch": 2.1153800545476105, + "grad_norm": 0.6832788106182055, + "learning_rate": 2.3879406932528343e-05, + "loss": 0.0916, + "step": 17839 + }, + { + "epoch": 2.1154986363097357, + "grad_norm": 0.7336494287862443, + "learning_rate": 2.3877008989136755e-05, + "loss": 0.0847, + "step": 17840 + }, + { + "epoch": 2.1156172180718604, + "grad_norm": 0.6568314242449593, + "learning_rate": 2.387461105609777e-05, + "loss": 0.0975, + "step": 17841 + }, + { + "epoch": 2.1157357998339856, + "grad_norm": 0.7543059831922114, + "learning_rate": 2.3872213133433473e-05, + "loss": 0.1173, + "step": 17842 + }, + { + "epoch": 2.1158543815961104, + "grad_norm": 0.6734646176869874, + "learning_rate": 2.3869815221166006e-05, + "loss": 0.0792, + "step": 17843 + }, + { + "epoch": 2.1159729633582356, + "grad_norm": 0.8542798479702134, + "learning_rate": 2.3867417319317446e-05, + "loss": 0.11, + "step": 17844 + }, + { + "epoch": 2.1160915451203604, + "grad_norm": 0.7171418302867631, + "learning_rate": 2.3865019427909913e-05, + "loss": 0.085, + "step": 17845 + }, + { + "epoch": 2.1162101268824856, + "grad_norm": 1.1179012240997335, + "learning_rate": 2.3862621546965507e-05, + "loss": 0.136, + "step": 17846 + }, + { + "epoch": 2.1163287086446103, + "grad_norm": 0.7261333401535891, + "learning_rate": 2.3860223676506345e-05, + "loss": 0.1077, + "step": 17847 + }, + { + "epoch": 2.1164472904067355, + "grad_norm": 0.7971453221368986, + "learning_rate": 2.385782581655452e-05, + "loss": 0.0886, + "step": 17848 + }, + { + "epoch": 2.1165658721688603, + "grad_norm": 0.7218070388129837, + "learning_rate": 2.385542796713213e-05, + "loss": 0.0858, + "step": 17849 + }, + { + "epoch": 2.1166844539309855, + "grad_norm": 0.8154031966444868, + "learning_rate": 2.38530301282613e-05, + "loss": 0.0945, + "step": 17850 + }, + { + "epoch": 2.1168030356931102, + "grad_norm": 0.8123157591488545, + "learning_rate": 2.3850632299964127e-05, + "loss": 0.0997, + "step": 17851 + }, + { + "epoch": 2.1169216174552354, + "grad_norm": 0.7283823367334491, + "learning_rate": 2.3848234482262707e-05, + "loss": 0.1031, + "step": 17852 + }, + { + "epoch": 2.11704019921736, + "grad_norm": 0.7688741882437939, + "learning_rate": 2.3845836675179157e-05, + "loss": 0.1035, + "step": 17853 + }, + { + "epoch": 2.1171587809794854, + "grad_norm": 0.7757209817474158, + "learning_rate": 2.3843438878735574e-05, + "loss": 0.0872, + "step": 17854 + }, + { + "epoch": 2.11727736274161, + "grad_norm": 1.0053517108391412, + "learning_rate": 2.3841041092954073e-05, + "loss": 0.128, + "step": 17855 + }, + { + "epoch": 2.1173959445037354, + "grad_norm": 0.5602785490077342, + "learning_rate": 2.383864331785675e-05, + "loss": 0.0681, + "step": 17856 + }, + { + "epoch": 2.11751452626586, + "grad_norm": 0.6369077188256455, + "learning_rate": 2.3836245553465704e-05, + "loss": 0.0813, + "step": 17857 + }, + { + "epoch": 2.1176331080279853, + "grad_norm": 1.0777930882531748, + "learning_rate": 2.3833847799803055e-05, + "loss": 0.0964, + "step": 17858 + }, + { + "epoch": 2.11775168979011, + "grad_norm": 0.8869858106586543, + "learning_rate": 2.38314500568909e-05, + "loss": 0.1119, + "step": 17859 + }, + { + "epoch": 2.1178702715522353, + "grad_norm": 0.7220445621097537, + "learning_rate": 2.382905232475133e-05, + "loss": 0.1118, + "step": 17860 + }, + { + "epoch": 2.11798885331436, + "grad_norm": 0.7857301983006661, + "learning_rate": 2.382665460340648e-05, + "loss": 0.0915, + "step": 17861 + }, + { + "epoch": 2.1181074350764852, + "grad_norm": 0.6964569165588829, + "learning_rate": 2.3824256892878427e-05, + "loss": 0.0991, + "step": 17862 + }, + { + "epoch": 2.1182260168386104, + "grad_norm": 0.9674335713992094, + "learning_rate": 2.3821859193189283e-05, + "loss": 0.1091, + "step": 17863 + }, + { + "epoch": 2.118344598600735, + "grad_norm": 0.7203533898037802, + "learning_rate": 2.381946150436115e-05, + "loss": 0.0895, + "step": 17864 + }, + { + "epoch": 2.11846318036286, + "grad_norm": 0.9986072305664891, + "learning_rate": 2.3817063826416136e-05, + "loss": 0.1142, + "step": 17865 + }, + { + "epoch": 2.118581762124985, + "grad_norm": 0.6821687776719181, + "learning_rate": 2.381466615937635e-05, + "loss": 0.0853, + "step": 17866 + }, + { + "epoch": 2.1187003438871104, + "grad_norm": 0.7594488813347222, + "learning_rate": 2.3812268503263882e-05, + "loss": 0.0941, + "step": 17867 + }, + { + "epoch": 2.118818925649235, + "grad_norm": 0.6745054914564077, + "learning_rate": 2.3809870858100837e-05, + "loss": 0.0906, + "step": 17868 + }, + { + "epoch": 2.1189375074113603, + "grad_norm": 0.849617017878505, + "learning_rate": 2.3807473223909333e-05, + "loss": 0.1077, + "step": 17869 + }, + { + "epoch": 2.119056089173485, + "grad_norm": 0.6570250190706467, + "learning_rate": 2.380507560071146e-05, + "loss": 0.0815, + "step": 17870 + }, + { + "epoch": 2.1191746709356103, + "grad_norm": 0.6007178961558507, + "learning_rate": 2.380267798852933e-05, + "loss": 0.0816, + "step": 17871 + }, + { + "epoch": 2.119293252697735, + "grad_norm": 0.8433353265681695, + "learning_rate": 2.3800280387385025e-05, + "loss": 0.1133, + "step": 17872 + }, + { + "epoch": 2.1194118344598603, + "grad_norm": 0.8588940674706943, + "learning_rate": 2.3797882797300677e-05, + "loss": 0.1045, + "step": 17873 + }, + { + "epoch": 2.119530416221985, + "grad_norm": 0.6562080936684265, + "learning_rate": 2.379548521829837e-05, + "loss": 0.0731, + "step": 17874 + }, + { + "epoch": 2.11964899798411, + "grad_norm": 0.7884617964159583, + "learning_rate": 2.379308765040021e-05, + "loss": 0.1307, + "step": 17875 + }, + { + "epoch": 2.119767579746235, + "grad_norm": 0.7615858302390826, + "learning_rate": 2.3790690093628303e-05, + "loss": 0.0924, + "step": 17876 + }, + { + "epoch": 2.11988616150836, + "grad_norm": 0.910756655411388, + "learning_rate": 2.378829254800476e-05, + "loss": 0.1197, + "step": 17877 + }, + { + "epoch": 2.120004743270485, + "grad_norm": 0.7193684266250825, + "learning_rate": 2.378589501355167e-05, + "loss": 0.1011, + "step": 17878 + }, + { + "epoch": 2.12012332503261, + "grad_norm": 0.7481093992944469, + "learning_rate": 2.378349749029113e-05, + "loss": 0.1094, + "step": 17879 + }, + { + "epoch": 2.120241906794735, + "grad_norm": 0.5664161023390765, + "learning_rate": 2.3781099978245258e-05, + "loss": 0.0658, + "step": 17880 + }, + { + "epoch": 2.12036048855686, + "grad_norm": 0.6822564240447349, + "learning_rate": 2.3778702477436154e-05, + "loss": 0.0955, + "step": 17881 + }, + { + "epoch": 2.120479070318985, + "grad_norm": 0.7159845270892833, + "learning_rate": 2.377630498788591e-05, + "loss": 0.0902, + "step": 17882 + }, + { + "epoch": 2.12059765208111, + "grad_norm": 1.6363004739500218, + "learning_rate": 2.377390750961663e-05, + "loss": 0.1003, + "step": 17883 + }, + { + "epoch": 2.120716233843235, + "grad_norm": 0.7508743339591013, + "learning_rate": 2.3771510042650418e-05, + "loss": 0.0952, + "step": 17884 + }, + { + "epoch": 2.12083481560536, + "grad_norm": 0.7135938435593754, + "learning_rate": 2.3769112587009386e-05, + "loss": 0.0825, + "step": 17885 + }, + { + "epoch": 2.120953397367485, + "grad_norm": 0.5581287429631879, + "learning_rate": 2.3766715142715628e-05, + "loss": 0.0768, + "step": 17886 + }, + { + "epoch": 2.12107197912961, + "grad_norm": 0.7126186112110405, + "learning_rate": 2.376431770979123e-05, + "loss": 0.0877, + "step": 17887 + }, + { + "epoch": 2.1211905608917347, + "grad_norm": 0.5887446013855328, + "learning_rate": 2.376192028825832e-05, + "loss": 0.0856, + "step": 17888 + }, + { + "epoch": 2.12130914265386, + "grad_norm": 0.6599783319784512, + "learning_rate": 2.3759522878138987e-05, + "loss": 0.0813, + "step": 17889 + }, + { + "epoch": 2.1214277244159847, + "grad_norm": 0.8098462461135668, + "learning_rate": 2.375712547945532e-05, + "loss": 0.1184, + "step": 17890 + }, + { + "epoch": 2.12154630617811, + "grad_norm": 0.865996869142336, + "learning_rate": 2.375472809222944e-05, + "loss": 0.1065, + "step": 17891 + }, + { + "epoch": 2.1216648879402347, + "grad_norm": 0.7620143848912132, + "learning_rate": 2.3752330716483444e-05, + "loss": 0.1223, + "step": 17892 + }, + { + "epoch": 2.12178346970236, + "grad_norm": 0.7704642623591977, + "learning_rate": 2.374993335223942e-05, + "loss": 0.1101, + "step": 17893 + }, + { + "epoch": 2.1219020514644846, + "grad_norm": 0.6848403174587552, + "learning_rate": 2.3747535999519486e-05, + "loss": 0.0997, + "step": 17894 + }, + { + "epoch": 2.12202063322661, + "grad_norm": 0.5677025861701702, + "learning_rate": 2.3745138658345724e-05, + "loss": 0.066, + "step": 17895 + }, + { + "epoch": 2.1221392149887346, + "grad_norm": 0.9629647904451156, + "learning_rate": 2.3742741328740257e-05, + "loss": 0.1569, + "step": 17896 + }, + { + "epoch": 2.12225779675086, + "grad_norm": 0.7185129710325748, + "learning_rate": 2.3740344010725173e-05, + "loss": 0.0859, + "step": 17897 + }, + { + "epoch": 2.1223763785129846, + "grad_norm": 1.0197775429680558, + "learning_rate": 2.3737946704322557e-05, + "loss": 0.1644, + "step": 17898 + }, + { + "epoch": 2.1224949602751098, + "grad_norm": 0.705329123506078, + "learning_rate": 2.3735549409554536e-05, + "loss": 0.1002, + "step": 17899 + }, + { + "epoch": 2.1226135420372345, + "grad_norm": 0.5138364659795601, + "learning_rate": 2.3733152126443202e-05, + "loss": 0.0703, + "step": 17900 + }, + { + "epoch": 2.1227321237993597, + "grad_norm": 0.6311912081913832, + "learning_rate": 2.3730754855010646e-05, + "loss": 0.0683, + "step": 17901 + }, + { + "epoch": 2.1228507055614845, + "grad_norm": 0.6396745038508417, + "learning_rate": 2.372835759527897e-05, + "loss": 0.0823, + "step": 17902 + }, + { + "epoch": 2.1229692873236097, + "grad_norm": 0.6500550560464794, + "learning_rate": 2.372596034727028e-05, + "loss": 0.0904, + "step": 17903 + }, + { + "epoch": 2.1230878690857344, + "grad_norm": 0.8524321668201746, + "learning_rate": 2.372356311100668e-05, + "loss": 0.1262, + "step": 17904 + }, + { + "epoch": 2.1232064508478596, + "grad_norm": 0.9614391567077992, + "learning_rate": 2.3721165886510253e-05, + "loss": 0.1506, + "step": 17905 + }, + { + "epoch": 2.1233250326099844, + "grad_norm": 0.7279463263710606, + "learning_rate": 2.3718768673803115e-05, + "loss": 0.1005, + "step": 17906 + }, + { + "epoch": 2.1234436143721096, + "grad_norm": 0.7149919487680201, + "learning_rate": 2.371637147290736e-05, + "loss": 0.0795, + "step": 17907 + }, + { + "epoch": 2.1235621961342344, + "grad_norm": 0.598201347048442, + "learning_rate": 2.371397428384509e-05, + "loss": 0.077, + "step": 17908 + }, + { + "epoch": 2.1236807778963596, + "grad_norm": 1.0522016189169252, + "learning_rate": 2.3711577106638384e-05, + "loss": 0.1241, + "step": 17909 + }, + { + "epoch": 2.1237993596584843, + "grad_norm": 0.9406466612416545, + "learning_rate": 2.3709179941309374e-05, + "loss": 0.1334, + "step": 17910 + }, + { + "epoch": 2.1239179414206095, + "grad_norm": 0.5648109508057189, + "learning_rate": 2.3706782787880136e-05, + "loss": 0.0765, + "step": 17911 + }, + { + "epoch": 2.1240365231827343, + "grad_norm": 0.5519386263388136, + "learning_rate": 2.3704385646372772e-05, + "loss": 0.0597, + "step": 17912 + }, + { + "epoch": 2.1241551049448595, + "grad_norm": 0.7673184273601047, + "learning_rate": 2.3701988516809382e-05, + "loss": 0.0894, + "step": 17913 + }, + { + "epoch": 2.1242736867069842, + "grad_norm": 0.7424059576452738, + "learning_rate": 2.3699591399212067e-05, + "loss": 0.114, + "step": 17914 + }, + { + "epoch": 2.1243922684691094, + "grad_norm": 1.0996489305939712, + "learning_rate": 2.3697194293602932e-05, + "loss": 0.144, + "step": 17915 + }, + { + "epoch": 2.1245108502312346, + "grad_norm": 0.8487199506375223, + "learning_rate": 2.3694797200004053e-05, + "loss": 0.1244, + "step": 17916 + }, + { + "epoch": 2.1246294319933594, + "grad_norm": 0.5109911550126984, + "learning_rate": 2.3692400118437558e-05, + "loss": 0.0685, + "step": 17917 + }, + { + "epoch": 2.1247480137554846, + "grad_norm": 0.6238089767423283, + "learning_rate": 2.369000304892553e-05, + "loss": 0.101, + "step": 17918 + }, + { + "epoch": 2.1248665955176094, + "grad_norm": 0.8983865031878597, + "learning_rate": 2.3687605991490066e-05, + "loss": 0.1153, + "step": 17919 + }, + { + "epoch": 2.1249851772797346, + "grad_norm": 0.7457922643400969, + "learning_rate": 2.3685208946153254e-05, + "loss": 0.0908, + "step": 17920 + }, + { + "epoch": 2.1251037590418593, + "grad_norm": 1.1064800435150053, + "learning_rate": 2.3682811912937216e-05, + "loss": 0.1693, + "step": 17921 + }, + { + "epoch": 2.1252223408039845, + "grad_norm": 0.818937815399172, + "learning_rate": 2.3680414891864036e-05, + "loss": 0.0959, + "step": 17922 + }, + { + "epoch": 2.1253409225661093, + "grad_norm": 0.635415820200434, + "learning_rate": 2.3678017882955807e-05, + "loss": 0.0827, + "step": 17923 + }, + { + "epoch": 2.1254595043282345, + "grad_norm": 0.7315028646011373, + "learning_rate": 2.367562088623463e-05, + "loss": 0.0966, + "step": 17924 + }, + { + "epoch": 2.1255780860903593, + "grad_norm": 0.8212071829887027, + "learning_rate": 2.3673223901722607e-05, + "loss": 0.11, + "step": 17925 + }, + { + "epoch": 2.1256966678524845, + "grad_norm": 0.9939855279184496, + "learning_rate": 2.3670826929441838e-05, + "loss": 0.1105, + "step": 17926 + }, + { + "epoch": 2.125815249614609, + "grad_norm": 0.65075426924166, + "learning_rate": 2.3668429969414417e-05, + "loss": 0.105, + "step": 17927 + }, + { + "epoch": 2.1259338313767344, + "grad_norm": 0.9565522567237696, + "learning_rate": 2.3666033021662424e-05, + "loss": 0.1291, + "step": 17928 + }, + { + "epoch": 2.126052413138859, + "grad_norm": 0.8484159646406912, + "learning_rate": 2.366363608620798e-05, + "loss": 0.1014, + "step": 17929 + }, + { + "epoch": 2.1261709949009844, + "grad_norm": 0.8887452471984288, + "learning_rate": 2.3661239163073178e-05, + "loss": 0.1022, + "step": 17930 + }, + { + "epoch": 2.126289576663109, + "grad_norm": 0.8137404230117737, + "learning_rate": 2.3658842252280096e-05, + "loss": 0.0982, + "step": 17931 + }, + { + "epoch": 2.1264081584252343, + "grad_norm": 0.7173937793087729, + "learning_rate": 2.3656445353850852e-05, + "loss": 0.0942, + "step": 17932 + }, + { + "epoch": 2.126526740187359, + "grad_norm": 0.690157214057542, + "learning_rate": 2.3654048467807534e-05, + "loss": 0.0934, + "step": 17933 + }, + { + "epoch": 2.1266453219494843, + "grad_norm": 0.882030278353602, + "learning_rate": 2.365165159417224e-05, + "loss": 0.119, + "step": 17934 + }, + { + "epoch": 2.126763903711609, + "grad_norm": 0.7572209603852741, + "learning_rate": 2.3649254732967054e-05, + "loss": 0.1006, + "step": 17935 + }, + { + "epoch": 2.1268824854737343, + "grad_norm": 0.6535837997732545, + "learning_rate": 2.3646857884214097e-05, + "loss": 0.0779, + "step": 17936 + }, + { + "epoch": 2.127001067235859, + "grad_norm": 0.7266925742073694, + "learning_rate": 2.3644461047935448e-05, + "loss": 0.1019, + "step": 17937 + }, + { + "epoch": 2.1271196489979842, + "grad_norm": 0.8041646614861662, + "learning_rate": 2.3642064224153205e-05, + "loss": 0.0993, + "step": 17938 + }, + { + "epoch": 2.127238230760109, + "grad_norm": 0.8064137905588544, + "learning_rate": 2.363966741288946e-05, + "loss": 0.1184, + "step": 17939 + }, + { + "epoch": 2.127356812522234, + "grad_norm": 0.8567733802826183, + "learning_rate": 2.3637270614166317e-05, + "loss": 0.1115, + "step": 17940 + }, + { + "epoch": 2.127475394284359, + "grad_norm": 0.8030423582508885, + "learning_rate": 2.3634873828005872e-05, + "loss": 0.1137, + "step": 17941 + }, + { + "epoch": 2.127593976046484, + "grad_norm": 0.8405577757415722, + "learning_rate": 2.3632477054430207e-05, + "loss": 0.1207, + "step": 17942 + }, + { + "epoch": 2.127712557808609, + "grad_norm": 0.5808405231914165, + "learning_rate": 2.363008029346143e-05, + "loss": 0.0774, + "step": 17943 + }, + { + "epoch": 2.127831139570734, + "grad_norm": 0.6695355889903517, + "learning_rate": 2.362768354512163e-05, + "loss": 0.0841, + "step": 17944 + }, + { + "epoch": 2.127949721332859, + "grad_norm": 2.1794959808835808, + "learning_rate": 2.3625286809432915e-05, + "loss": 0.1505, + "step": 17945 + }, + { + "epoch": 2.128068303094984, + "grad_norm": 0.6918775036607938, + "learning_rate": 2.3622890086417357e-05, + "loss": 0.1014, + "step": 17946 + }, + { + "epoch": 2.128186884857109, + "grad_norm": 0.6931507101542764, + "learning_rate": 2.362049337609707e-05, + "loss": 0.098, + "step": 17947 + }, + { + "epoch": 2.128305466619234, + "grad_norm": 0.9732830820519827, + "learning_rate": 2.361809667849415e-05, + "loss": 0.1299, + "step": 17948 + }, + { + "epoch": 2.128424048381359, + "grad_norm": 0.7738195800671515, + "learning_rate": 2.361569999363068e-05, + "loss": 0.0868, + "step": 17949 + }, + { + "epoch": 2.128542630143484, + "grad_norm": 0.9156763357034299, + "learning_rate": 2.3613303321528744e-05, + "loss": 0.1398, + "step": 17950 + }, + { + "epoch": 2.1286612119056088, + "grad_norm": 0.7769478455894022, + "learning_rate": 2.3610906662210468e-05, + "loss": 0.1057, + "step": 17951 + }, + { + "epoch": 2.128779793667734, + "grad_norm": 0.7314998546887915, + "learning_rate": 2.3608510015697926e-05, + "loss": 0.091, + "step": 17952 + }, + { + "epoch": 2.1288983754298587, + "grad_norm": 0.929189689029976, + "learning_rate": 2.360611338201321e-05, + "loss": 0.1517, + "step": 17953 + }, + { + "epoch": 2.129016957191984, + "grad_norm": 0.8416267406235514, + "learning_rate": 2.3603716761178422e-05, + "loss": 0.1257, + "step": 17954 + }, + { + "epoch": 2.1291355389541087, + "grad_norm": 0.5313698852381401, + "learning_rate": 2.360132015321565e-05, + "loss": 0.055, + "step": 17955 + }, + { + "epoch": 2.129254120716234, + "grad_norm": 0.6489855520999889, + "learning_rate": 2.3598923558147e-05, + "loss": 0.0924, + "step": 17956 + }, + { + "epoch": 2.1293727024783586, + "grad_norm": 0.9645600636905274, + "learning_rate": 2.3596526975994555e-05, + "loss": 0.1255, + "step": 17957 + }, + { + "epoch": 2.129491284240484, + "grad_norm": 0.7505318286402709, + "learning_rate": 2.3594130406780397e-05, + "loss": 0.1029, + "step": 17958 + }, + { + "epoch": 2.1296098660026086, + "grad_norm": 0.776437155397659, + "learning_rate": 2.3591733850526647e-05, + "loss": 0.0983, + "step": 17959 + }, + { + "epoch": 2.129728447764734, + "grad_norm": 0.616023850659311, + "learning_rate": 2.358933730725538e-05, + "loss": 0.0831, + "step": 17960 + }, + { + "epoch": 2.129847029526859, + "grad_norm": 0.7128132904446529, + "learning_rate": 2.3586940776988685e-05, + "loss": 0.0952, + "step": 17961 + }, + { + "epoch": 2.1299656112889838, + "grad_norm": 0.9777230223252029, + "learning_rate": 2.3584544259748672e-05, + "loss": 0.0972, + "step": 17962 + }, + { + "epoch": 2.1300841930511085, + "grad_norm": 0.7560065532091526, + "learning_rate": 2.3582147755557422e-05, + "loss": 0.092, + "step": 17963 + }, + { + "epoch": 2.1302027748132337, + "grad_norm": 0.6113105681588883, + "learning_rate": 2.3579751264437035e-05, + "loss": 0.0765, + "step": 17964 + }, + { + "epoch": 2.130321356575359, + "grad_norm": 0.7285933518414346, + "learning_rate": 2.357735478640959e-05, + "loss": 0.1046, + "step": 17965 + }, + { + "epoch": 2.1304399383374837, + "grad_norm": 0.7796491007600945, + "learning_rate": 2.3574958321497202e-05, + "loss": 0.0943, + "step": 17966 + }, + { + "epoch": 2.130558520099609, + "grad_norm": 0.8515913640850464, + "learning_rate": 2.3572561869721946e-05, + "loss": 0.1023, + "step": 17967 + }, + { + "epoch": 2.1306771018617336, + "grad_norm": 0.7416116674353541, + "learning_rate": 2.3570165431105924e-05, + "loss": 0.1123, + "step": 17968 + }, + { + "epoch": 2.130795683623859, + "grad_norm": 0.7089334420762649, + "learning_rate": 2.3567769005671208e-05, + "loss": 0.1013, + "step": 17969 + }, + { + "epoch": 2.1309142653859836, + "grad_norm": 0.5332351112684102, + "learning_rate": 2.356537259343992e-05, + "loss": 0.0721, + "step": 17970 + }, + { + "epoch": 2.131032847148109, + "grad_norm": 0.6229097554779223, + "learning_rate": 2.3562976194434132e-05, + "loss": 0.0855, + "step": 17971 + }, + { + "epoch": 2.1311514289102336, + "grad_norm": 0.85019901906268, + "learning_rate": 2.3560579808675943e-05, + "loss": 0.1048, + "step": 17972 + }, + { + "epoch": 2.1312700106723588, + "grad_norm": 0.7237192884265834, + "learning_rate": 2.355818343618744e-05, + "loss": 0.1084, + "step": 17973 + }, + { + "epoch": 2.1313885924344835, + "grad_norm": 1.310789594927696, + "learning_rate": 2.3555787076990714e-05, + "loss": 0.1256, + "step": 17974 + }, + { + "epoch": 2.1315071741966087, + "grad_norm": 0.7189161756262489, + "learning_rate": 2.3553390731107872e-05, + "loss": 0.0841, + "step": 17975 + }, + { + "epoch": 2.1316257559587335, + "grad_norm": 0.7904979370066046, + "learning_rate": 2.3550994398560978e-05, + "loss": 0.1135, + "step": 17976 + }, + { + "epoch": 2.1317443377208587, + "grad_norm": 0.8374737107257041, + "learning_rate": 2.3548598079372152e-05, + "loss": 0.0875, + "step": 17977 + }, + { + "epoch": 2.1318629194829835, + "grad_norm": 1.041560735994387, + "learning_rate": 2.354620177356347e-05, + "loss": 0.1418, + "step": 17978 + }, + { + "epoch": 2.1319815012451087, + "grad_norm": 0.650547995376575, + "learning_rate": 2.354380548115703e-05, + "loss": 0.1067, + "step": 17979 + }, + { + "epoch": 2.1321000830072334, + "grad_norm": 0.6291266139420923, + "learning_rate": 2.35414092021749e-05, + "loss": 0.0943, + "step": 17980 + }, + { + "epoch": 2.1322186647693586, + "grad_norm": 0.8001775550464129, + "learning_rate": 2.3539012936639203e-05, + "loss": 0.1197, + "step": 17981 + }, + { + "epoch": 2.1323372465314834, + "grad_norm": 0.7459076322198885, + "learning_rate": 2.353661668457201e-05, + "loss": 0.1127, + "step": 17982 + }, + { + "epoch": 2.1324558282936086, + "grad_norm": 0.6126482173539698, + "learning_rate": 2.353422044599542e-05, + "loss": 0.0685, + "step": 17983 + }, + { + "epoch": 2.1325744100557333, + "grad_norm": 0.520303660516012, + "learning_rate": 2.353182422093151e-05, + "loss": 0.0803, + "step": 17984 + }, + { + "epoch": 2.1326929918178585, + "grad_norm": 0.969541353150509, + "learning_rate": 2.352942800940239e-05, + "loss": 0.1411, + "step": 17985 + }, + { + "epoch": 2.1328115735799833, + "grad_norm": 0.6226639599073154, + "learning_rate": 2.352703181143014e-05, + "loss": 0.0767, + "step": 17986 + }, + { + "epoch": 2.1329301553421085, + "grad_norm": 1.111531910988383, + "learning_rate": 2.352463562703684e-05, + "loss": 0.1597, + "step": 17987 + }, + { + "epoch": 2.1330487371042333, + "grad_norm": 0.6898020862188439, + "learning_rate": 2.3522239456244603e-05, + "loss": 0.1002, + "step": 17988 + }, + { + "epoch": 2.1331673188663585, + "grad_norm": 0.9035760661420732, + "learning_rate": 2.3519843299075508e-05, + "loss": 0.1013, + "step": 17989 + }, + { + "epoch": 2.1332859006284832, + "grad_norm": 0.8177121380055178, + "learning_rate": 2.3517447155551642e-05, + "loss": 0.1188, + "step": 17990 + }, + { + "epoch": 2.1334044823906084, + "grad_norm": 0.6883478418987434, + "learning_rate": 2.351505102569508e-05, + "loss": 0.1062, + "step": 17991 + }, + { + "epoch": 2.133523064152733, + "grad_norm": 0.8619965867122521, + "learning_rate": 2.3512654909527943e-05, + "loss": 0.1064, + "step": 17992 + }, + { + "epoch": 2.1336416459148584, + "grad_norm": 0.8933626276913544, + "learning_rate": 2.3510258807072294e-05, + "loss": 0.115, + "step": 17993 + }, + { + "epoch": 2.133760227676983, + "grad_norm": 0.6714201189075433, + "learning_rate": 2.3507862718350237e-05, + "loss": 0.0846, + "step": 17994 + }, + { + "epoch": 2.1338788094391083, + "grad_norm": 1.1744051070120978, + "learning_rate": 2.350546664338385e-05, + "loss": 0.1466, + "step": 17995 + }, + { + "epoch": 2.133997391201233, + "grad_norm": 0.7642441401724089, + "learning_rate": 2.3503070582195235e-05, + "loss": 0.1487, + "step": 17996 + }, + { + "epoch": 2.1341159729633583, + "grad_norm": 0.7529487793584818, + "learning_rate": 2.3500674534806475e-05, + "loss": 0.1013, + "step": 17997 + }, + { + "epoch": 2.134234554725483, + "grad_norm": 0.637900511054051, + "learning_rate": 2.3498278501239653e-05, + "loss": 0.0829, + "step": 17998 + }, + { + "epoch": 2.1343531364876083, + "grad_norm": 0.6729323309704297, + "learning_rate": 2.3495882481516855e-05, + "loss": 0.0946, + "step": 17999 + }, + { + "epoch": 2.134471718249733, + "grad_norm": 0.6712618665477792, + "learning_rate": 2.3493486475660187e-05, + "loss": 0.0957, + "step": 18000 + }, + { + "epoch": 2.1345903000118582, + "grad_norm": 0.7157267011133359, + "learning_rate": 2.3491090483691726e-05, + "loss": 0.0885, + "step": 18001 + }, + { + "epoch": 2.134708881773983, + "grad_norm": 0.8037486863320477, + "learning_rate": 2.3488694505633548e-05, + "loss": 0.1017, + "step": 18002 + }, + { + "epoch": 2.134827463536108, + "grad_norm": 0.5924802316743294, + "learning_rate": 2.3486298541507764e-05, + "loss": 0.0734, + "step": 18003 + }, + { + "epoch": 2.134946045298233, + "grad_norm": 1.0066599779909167, + "learning_rate": 2.3483902591336448e-05, + "loss": 0.132, + "step": 18004 + }, + { + "epoch": 2.135064627060358, + "grad_norm": 0.9566497703070097, + "learning_rate": 2.3481506655141697e-05, + "loss": 0.1722, + "step": 18005 + }, + { + "epoch": 2.135183208822483, + "grad_norm": 0.6279313542518444, + "learning_rate": 2.347911073294558e-05, + "loss": 0.0869, + "step": 18006 + }, + { + "epoch": 2.135301790584608, + "grad_norm": 0.8070261913629467, + "learning_rate": 2.3476714824770208e-05, + "loss": 0.121, + "step": 18007 + }, + { + "epoch": 2.135420372346733, + "grad_norm": 0.7292099898304665, + "learning_rate": 2.3474318930637656e-05, + "loss": 0.1111, + "step": 18008 + }, + { + "epoch": 2.135538954108858, + "grad_norm": 0.6116872377730583, + "learning_rate": 2.3471923050570013e-05, + "loss": 0.0647, + "step": 18009 + }, + { + "epoch": 2.135657535870983, + "grad_norm": 0.7905847848814176, + "learning_rate": 2.3469527184589357e-05, + "loss": 0.1084, + "step": 18010 + }, + { + "epoch": 2.135776117633108, + "grad_norm": 0.5529919037535964, + "learning_rate": 2.3467131332717793e-05, + "loss": 0.0677, + "step": 18011 + }, + { + "epoch": 2.135894699395233, + "grad_norm": 0.6439759243633505, + "learning_rate": 2.3464735494977392e-05, + "loss": 0.0927, + "step": 18012 + }, + { + "epoch": 2.136013281157358, + "grad_norm": 0.8702695590386543, + "learning_rate": 2.3462339671390255e-05, + "loss": 0.0895, + "step": 18013 + }, + { + "epoch": 2.136131862919483, + "grad_norm": 0.7412421377280523, + "learning_rate": 2.3459943861978447e-05, + "loss": 0.1066, + "step": 18014 + }, + { + "epoch": 2.136250444681608, + "grad_norm": 0.777798178681783, + "learning_rate": 2.345754806676408e-05, + "loss": 0.1057, + "step": 18015 + }, + { + "epoch": 2.1363690264437327, + "grad_norm": 0.9237364942404307, + "learning_rate": 2.345515228576923e-05, + "loss": 0.1342, + "step": 18016 + }, + { + "epoch": 2.136487608205858, + "grad_norm": 0.6106865786879563, + "learning_rate": 2.3452756519015965e-05, + "loss": 0.0837, + "step": 18017 + }, + { + "epoch": 2.136606189967983, + "grad_norm": 1.0693416307912376, + "learning_rate": 2.3450360766526405e-05, + "loss": 0.1498, + "step": 18018 + }, + { + "epoch": 2.136724771730108, + "grad_norm": 0.8146444400841573, + "learning_rate": 2.3447965028322617e-05, + "loss": 0.1066, + "step": 18019 + }, + { + "epoch": 2.136843353492233, + "grad_norm": 0.6368735070380152, + "learning_rate": 2.3445569304426687e-05, + "loss": 0.0834, + "step": 18020 + }, + { + "epoch": 2.136961935254358, + "grad_norm": 1.0317235321642375, + "learning_rate": 2.344317359486069e-05, + "loss": 0.139, + "step": 18021 + }, + { + "epoch": 2.137080517016483, + "grad_norm": 0.7518010711989375, + "learning_rate": 2.3440777899646736e-05, + "loss": 0.1381, + "step": 18022 + }, + { + "epoch": 2.137199098778608, + "grad_norm": 0.7584581441437331, + "learning_rate": 2.3438382218806892e-05, + "loss": 0.1088, + "step": 18023 + }, + { + "epoch": 2.137317680540733, + "grad_norm": 0.5213458827296431, + "learning_rate": 2.3435986552363255e-05, + "loss": 0.0727, + "step": 18024 + }, + { + "epoch": 2.1374362623028578, + "grad_norm": 0.9111548483046347, + "learning_rate": 2.343359090033789e-05, + "loss": 0.1081, + "step": 18025 + }, + { + "epoch": 2.137554844064983, + "grad_norm": 0.7391686080405271, + "learning_rate": 2.343119526275291e-05, + "loss": 0.119, + "step": 18026 + }, + { + "epoch": 2.1376734258271077, + "grad_norm": 0.5957188262381526, + "learning_rate": 2.3428799639630383e-05, + "loss": 0.0877, + "step": 18027 + }, + { + "epoch": 2.137792007589233, + "grad_norm": 0.6083177768801656, + "learning_rate": 2.3426404030992398e-05, + "loss": 0.0793, + "step": 18028 + }, + { + "epoch": 2.1379105893513577, + "grad_norm": 0.6342198883171318, + "learning_rate": 2.342400843686103e-05, + "loss": 0.0893, + "step": 18029 + }, + { + "epoch": 2.138029171113483, + "grad_norm": 0.599692922688417, + "learning_rate": 2.342161285725838e-05, + "loss": 0.0851, + "step": 18030 + }, + { + "epoch": 2.1381477528756077, + "grad_norm": 0.6498707769072296, + "learning_rate": 2.3419217292206523e-05, + "loss": 0.1022, + "step": 18031 + }, + { + "epoch": 2.138266334637733, + "grad_norm": 0.5250549935838396, + "learning_rate": 2.3416821741727534e-05, + "loss": 0.0675, + "step": 18032 + }, + { + "epoch": 2.1383849163998576, + "grad_norm": 0.5661442777501516, + "learning_rate": 2.3414426205843516e-05, + "loss": 0.0796, + "step": 18033 + }, + { + "epoch": 2.138503498161983, + "grad_norm": 0.8912426879759916, + "learning_rate": 2.3412030684576545e-05, + "loss": 0.1303, + "step": 18034 + }, + { + "epoch": 2.1386220799241076, + "grad_norm": 0.5285611167927607, + "learning_rate": 2.340963517794871e-05, + "loss": 0.0803, + "step": 18035 + }, + { + "epoch": 2.138740661686233, + "grad_norm": 0.7971832071440378, + "learning_rate": 2.340723968598207e-05, + "loss": 0.097, + "step": 18036 + }, + { + "epoch": 2.1388592434483575, + "grad_norm": 0.7886784610044397, + "learning_rate": 2.340484420869874e-05, + "loss": 0.1118, + "step": 18037 + }, + { + "epoch": 2.1389778252104827, + "grad_norm": 0.8325417157712541, + "learning_rate": 2.3402448746120792e-05, + "loss": 0.0998, + "step": 18038 + }, + { + "epoch": 2.1390964069726075, + "grad_norm": 0.8673012561978783, + "learning_rate": 2.3400053298270304e-05, + "loss": 0.1455, + "step": 18039 + }, + { + "epoch": 2.1392149887347327, + "grad_norm": 0.5721425723167158, + "learning_rate": 2.3397657865169356e-05, + "loss": 0.0842, + "step": 18040 + }, + { + "epoch": 2.1393335704968575, + "grad_norm": 0.7189776576445861, + "learning_rate": 2.3395262446840046e-05, + "loss": 0.0978, + "step": 18041 + }, + { + "epoch": 2.1394521522589827, + "grad_norm": 0.7867094070803604, + "learning_rate": 2.339286704330444e-05, + "loss": 0.0909, + "step": 18042 + }, + { + "epoch": 2.1395707340211074, + "grad_norm": 0.7332408463141352, + "learning_rate": 2.3390471654584642e-05, + "loss": 0.0835, + "step": 18043 + }, + { + "epoch": 2.1396893157832326, + "grad_norm": 1.3659974291512087, + "learning_rate": 2.3388076280702705e-05, + "loss": 0.1838, + "step": 18044 + }, + { + "epoch": 2.1398078975453574, + "grad_norm": 0.6900796712354379, + "learning_rate": 2.3385680921680742e-05, + "loss": 0.0716, + "step": 18045 + }, + { + "epoch": 2.1399264793074826, + "grad_norm": 0.6047944990875459, + "learning_rate": 2.338328557754082e-05, + "loss": 0.0822, + "step": 18046 + }, + { + "epoch": 2.1400450610696073, + "grad_norm": 0.5733761154197873, + "learning_rate": 2.3380890248305013e-05, + "loss": 0.0928, + "step": 18047 + }, + { + "epoch": 2.1401636428317325, + "grad_norm": 0.4893372739568274, + "learning_rate": 2.337849493399542e-05, + "loss": 0.0707, + "step": 18048 + }, + { + "epoch": 2.1402822245938573, + "grad_norm": 0.6264341399466726, + "learning_rate": 2.337609963463412e-05, + "loss": 0.0828, + "step": 18049 + }, + { + "epoch": 2.1404008063559825, + "grad_norm": 0.940528791794186, + "learning_rate": 2.3373704350243184e-05, + "loss": 0.1354, + "step": 18050 + }, + { + "epoch": 2.1405193881181073, + "grad_norm": 0.7472558180252735, + "learning_rate": 2.3371309080844693e-05, + "loss": 0.114, + "step": 18051 + }, + { + "epoch": 2.1406379698802325, + "grad_norm": 0.9147306334718411, + "learning_rate": 2.3368913826460744e-05, + "loss": 0.1285, + "step": 18052 + }, + { + "epoch": 2.1407565516423572, + "grad_norm": 0.8213501537537489, + "learning_rate": 2.3366518587113406e-05, + "loss": 0.116, + "step": 18053 + }, + { + "epoch": 2.1408751334044824, + "grad_norm": 0.5972004334156824, + "learning_rate": 2.3364123362824766e-05, + "loss": 0.0865, + "step": 18054 + }, + { + "epoch": 2.140993715166607, + "grad_norm": 0.7123993315244432, + "learning_rate": 2.3361728153616893e-05, + "loss": 0.0991, + "step": 18055 + }, + { + "epoch": 2.1411122969287324, + "grad_norm": 0.5830591414604122, + "learning_rate": 2.3359332959511888e-05, + "loss": 0.0747, + "step": 18056 + }, + { + "epoch": 2.141230878690857, + "grad_norm": 0.6714233130580632, + "learning_rate": 2.3356937780531823e-05, + "loss": 0.0816, + "step": 18057 + }, + { + "epoch": 2.1413494604529824, + "grad_norm": 0.8444375103388471, + "learning_rate": 2.3354542616698776e-05, + "loss": 0.0985, + "step": 18058 + }, + { + "epoch": 2.141468042215107, + "grad_norm": 0.597477490866996, + "learning_rate": 2.335214746803482e-05, + "loss": 0.0768, + "step": 18059 + }, + { + "epoch": 2.1415866239772323, + "grad_norm": 0.6547117945798402, + "learning_rate": 2.3349752334562046e-05, + "loss": 0.0739, + "step": 18060 + }, + { + "epoch": 2.141705205739357, + "grad_norm": 0.8075592259822908, + "learning_rate": 2.3347357216302533e-05, + "loss": 0.1068, + "step": 18061 + }, + { + "epoch": 2.1418237875014823, + "grad_norm": 0.7554655506309944, + "learning_rate": 2.3344962113278358e-05, + "loss": 0.0961, + "step": 18062 + }, + { + "epoch": 2.1419423692636075, + "grad_norm": 0.8016755043386448, + "learning_rate": 2.3342567025511605e-05, + "loss": 0.1092, + "step": 18063 + }, + { + "epoch": 2.1420609510257322, + "grad_norm": 1.015132411498408, + "learning_rate": 2.3340171953024356e-05, + "loss": 0.1177, + "step": 18064 + }, + { + "epoch": 2.142179532787857, + "grad_norm": 0.6061933401261164, + "learning_rate": 2.3337776895838687e-05, + "loss": 0.0887, + "step": 18065 + }, + { + "epoch": 2.142298114549982, + "grad_norm": 0.7898899825485469, + "learning_rate": 2.333538185397666e-05, + "loss": 0.0934, + "step": 18066 + }, + { + "epoch": 2.1424166963121074, + "grad_norm": 0.7577674260773044, + "learning_rate": 2.333298682746039e-05, + "loss": 0.0918, + "step": 18067 + }, + { + "epoch": 2.142535278074232, + "grad_norm": 0.9888382287923655, + "learning_rate": 2.3330591816311935e-05, + "loss": 0.1434, + "step": 18068 + }, + { + "epoch": 2.1426538598363574, + "grad_norm": 0.7023257267084432, + "learning_rate": 2.3328196820553373e-05, + "loss": 0.079, + "step": 18069 + }, + { + "epoch": 2.142772441598482, + "grad_norm": 0.8175957307719977, + "learning_rate": 2.3325801840206775e-05, + "loss": 0.0889, + "step": 18070 + }, + { + "epoch": 2.1428910233606073, + "grad_norm": 1.0274324968056765, + "learning_rate": 2.3323406875294246e-05, + "loss": 0.1313, + "step": 18071 + }, + { + "epoch": 2.143009605122732, + "grad_norm": 0.7463302568746567, + "learning_rate": 2.3321011925837843e-05, + "loss": 0.106, + "step": 18072 + }, + { + "epoch": 2.1431281868848573, + "grad_norm": 0.8828530244476475, + "learning_rate": 2.331861699185965e-05, + "loss": 0.1477, + "step": 18073 + }, + { + "epoch": 2.143246768646982, + "grad_norm": 0.9532853768012814, + "learning_rate": 2.3316222073381746e-05, + "loss": 0.1392, + "step": 18074 + }, + { + "epoch": 2.1433653504091073, + "grad_norm": 1.1508237242233292, + "learning_rate": 2.3313827170426218e-05, + "loss": 0.1747, + "step": 18075 + }, + { + "epoch": 2.143483932171232, + "grad_norm": 0.9052430644114066, + "learning_rate": 2.3311432283015134e-05, + "loss": 0.0929, + "step": 18076 + }, + { + "epoch": 2.143602513933357, + "grad_norm": 0.8316858898207852, + "learning_rate": 2.3309037411170563e-05, + "loss": 0.1027, + "step": 18077 + }, + { + "epoch": 2.143721095695482, + "grad_norm": 0.9304349672916578, + "learning_rate": 2.3306642554914604e-05, + "loss": 0.1432, + "step": 18078 + }, + { + "epoch": 2.143839677457607, + "grad_norm": 1.089490464534018, + "learning_rate": 2.3304247714269323e-05, + "loss": 0.0961, + "step": 18079 + }, + { + "epoch": 2.143958259219732, + "grad_norm": 0.6502530677779542, + "learning_rate": 2.33018528892568e-05, + "loss": 0.0824, + "step": 18080 + }, + { + "epoch": 2.144076840981857, + "grad_norm": 0.8596455476877474, + "learning_rate": 2.32994580798991e-05, + "loss": 0.1074, + "step": 18081 + }, + { + "epoch": 2.144195422743982, + "grad_norm": 0.6108617024143107, + "learning_rate": 2.329706328621832e-05, + "loss": 0.0741, + "step": 18082 + }, + { + "epoch": 2.144314004506107, + "grad_norm": 0.8077240730508112, + "learning_rate": 2.3294668508236524e-05, + "loss": 0.1306, + "step": 18083 + }, + { + "epoch": 2.144432586268232, + "grad_norm": 0.7494475487936177, + "learning_rate": 2.32922737459758e-05, + "loss": 0.0888, + "step": 18084 + }, + { + "epoch": 2.144551168030357, + "grad_norm": 0.8099613123334597, + "learning_rate": 2.3289878999458205e-05, + "loss": 0.093, + "step": 18085 + }, + { + "epoch": 2.144669749792482, + "grad_norm": 0.7005478689144264, + "learning_rate": 2.328748426870584e-05, + "loss": 0.12, + "step": 18086 + }, + { + "epoch": 2.144788331554607, + "grad_norm": 0.810246500005227, + "learning_rate": 2.3285089553740774e-05, + "loss": 0.0904, + "step": 18087 + }, + { + "epoch": 2.144906913316732, + "grad_norm": 1.4308581752294731, + "learning_rate": 2.3282694854585063e-05, + "loss": 0.1587, + "step": 18088 + }, + { + "epoch": 2.145025495078857, + "grad_norm": 0.6971963964331938, + "learning_rate": 2.3280300171260814e-05, + "loss": 0.0844, + "step": 18089 + }, + { + "epoch": 2.1451440768409817, + "grad_norm": 0.5584215659312805, + "learning_rate": 2.3277905503790087e-05, + "loss": 0.0776, + "step": 18090 + }, + { + "epoch": 2.145262658603107, + "grad_norm": 0.9591231617530674, + "learning_rate": 2.3275510852194954e-05, + "loss": 0.1376, + "step": 18091 + }, + { + "epoch": 2.1453812403652317, + "grad_norm": 0.8188823046807663, + "learning_rate": 2.3273116216497498e-05, + "loss": 0.1275, + "step": 18092 + }, + { + "epoch": 2.145499822127357, + "grad_norm": 0.730830718115018, + "learning_rate": 2.327072159671979e-05, + "loss": 0.103, + "step": 18093 + }, + { + "epoch": 2.1456184038894817, + "grad_norm": 0.9227901666081288, + "learning_rate": 2.3268326992883916e-05, + "loss": 0.1177, + "step": 18094 + }, + { + "epoch": 2.145736985651607, + "grad_norm": 0.6474206183261723, + "learning_rate": 2.3265932405011942e-05, + "loss": 0.0869, + "step": 18095 + }, + { + "epoch": 2.1458555674137316, + "grad_norm": 0.912011367935835, + "learning_rate": 2.3263537833125933e-05, + "loss": 0.1216, + "step": 18096 + }, + { + "epoch": 2.145974149175857, + "grad_norm": 0.965707705707891, + "learning_rate": 2.326114327724799e-05, + "loss": 0.0973, + "step": 18097 + }, + { + "epoch": 2.1460927309379816, + "grad_norm": 1.2442381816799404, + "learning_rate": 2.325874873740017e-05, + "loss": 0.1152, + "step": 18098 + }, + { + "epoch": 2.146211312700107, + "grad_norm": 0.9894280903397933, + "learning_rate": 2.3256354213604552e-05, + "loss": 0.1453, + "step": 18099 + }, + { + "epoch": 2.1463298944622315, + "grad_norm": 0.7078210763688166, + "learning_rate": 2.3253959705883203e-05, + "loss": 0.1028, + "step": 18100 + }, + { + "epoch": 2.1464484762243567, + "grad_norm": 0.8743458849957061, + "learning_rate": 2.325156521425821e-05, + "loss": 0.119, + "step": 18101 + }, + { + "epoch": 2.1465670579864815, + "grad_norm": 0.8211973595833604, + "learning_rate": 2.3249170738751642e-05, + "loss": 0.098, + "step": 18102 + }, + { + "epoch": 2.1466856397486067, + "grad_norm": 0.5701343810452626, + "learning_rate": 2.3246776279385568e-05, + "loss": 0.0685, + "step": 18103 + }, + { + "epoch": 2.1468042215107315, + "grad_norm": 0.49077395329758944, + "learning_rate": 2.3244381836182068e-05, + "loss": 0.07, + "step": 18104 + }, + { + "epoch": 2.1469228032728567, + "grad_norm": 0.6856528907769682, + "learning_rate": 2.3241987409163222e-05, + "loss": 0.0912, + "step": 18105 + }, + { + "epoch": 2.1470413850349814, + "grad_norm": 0.6731806465364306, + "learning_rate": 2.3239592998351092e-05, + "loss": 0.0816, + "step": 18106 + }, + { + "epoch": 2.1471599667971066, + "grad_norm": 0.9041999764862111, + "learning_rate": 2.3237198603767747e-05, + "loss": 0.1359, + "step": 18107 + }, + { + "epoch": 2.1472785485592314, + "grad_norm": 0.6715426001344005, + "learning_rate": 2.3234804225435282e-05, + "loss": 0.0835, + "step": 18108 + }, + { + "epoch": 2.1473971303213566, + "grad_norm": 0.4019711711116074, + "learning_rate": 2.3232409863375756e-05, + "loss": 0.0627, + "step": 18109 + }, + { + "epoch": 2.1475157120834814, + "grad_norm": 0.6717644933305216, + "learning_rate": 2.323001551761124e-05, + "loss": 0.0908, + "step": 18110 + }, + { + "epoch": 2.1476342938456066, + "grad_norm": 0.6852354148582213, + "learning_rate": 2.3227621188163807e-05, + "loss": 0.0975, + "step": 18111 + }, + { + "epoch": 2.1477528756077318, + "grad_norm": 0.6076907563551447, + "learning_rate": 2.3225226875055535e-05, + "loss": 0.0937, + "step": 18112 + }, + { + "epoch": 2.1478714573698565, + "grad_norm": 0.6103847015376641, + "learning_rate": 2.32228325783085e-05, + "loss": 0.0946, + "step": 18113 + }, + { + "epoch": 2.1479900391319813, + "grad_norm": 0.6414170256554548, + "learning_rate": 2.322043829794477e-05, + "loss": 0.0854, + "step": 18114 + }, + { + "epoch": 2.1481086208941065, + "grad_norm": 0.8497505343088174, + "learning_rate": 2.32180440339864e-05, + "loss": 0.1215, + "step": 18115 + }, + { + "epoch": 2.1482272026562317, + "grad_norm": 0.7877573202653888, + "learning_rate": 2.3215649786455498e-05, + "loss": 0.1275, + "step": 18116 + }, + { + "epoch": 2.1483457844183564, + "grad_norm": 0.6801568465767223, + "learning_rate": 2.3213255555374113e-05, + "loss": 0.0793, + "step": 18117 + }, + { + "epoch": 2.1484643661804816, + "grad_norm": 0.7955775847791441, + "learning_rate": 2.321086134076431e-05, + "loss": 0.1221, + "step": 18118 + }, + { + "epoch": 2.1485829479426064, + "grad_norm": 0.64617084693515, + "learning_rate": 2.3208467142648185e-05, + "loss": 0.0895, + "step": 18119 + }, + { + "epoch": 2.1487015297047316, + "grad_norm": 0.673220362222063, + "learning_rate": 2.320607296104779e-05, + "loss": 0.0853, + "step": 18120 + }, + { + "epoch": 2.1488201114668564, + "grad_norm": 0.7276050932085466, + "learning_rate": 2.3203678795985205e-05, + "loss": 0.1144, + "step": 18121 + }, + { + "epoch": 2.1489386932289816, + "grad_norm": 0.7485867798028866, + "learning_rate": 2.3201284647482492e-05, + "loss": 0.1071, + "step": 18122 + }, + { + "epoch": 2.1490572749911063, + "grad_norm": 0.988604949364844, + "learning_rate": 2.3198890515561733e-05, + "loss": 0.1082, + "step": 18123 + }, + { + "epoch": 2.1491758567532315, + "grad_norm": 0.7639925713735145, + "learning_rate": 2.3196496400245e-05, + "loss": 0.0803, + "step": 18124 + }, + { + "epoch": 2.1492944385153563, + "grad_norm": 0.9519986941569366, + "learning_rate": 2.3194102301554355e-05, + "loss": 0.1442, + "step": 18125 + }, + { + "epoch": 2.1494130202774815, + "grad_norm": 0.5932752594665959, + "learning_rate": 2.3191708219511867e-05, + "loss": 0.0761, + "step": 18126 + }, + { + "epoch": 2.1495316020396062, + "grad_norm": 0.5227746733073987, + "learning_rate": 2.3189314154139617e-05, + "loss": 0.0616, + "step": 18127 + }, + { + "epoch": 2.1496501838017315, + "grad_norm": 0.6815486317430457, + "learning_rate": 2.3186920105459672e-05, + "loss": 0.1018, + "step": 18128 + }, + { + "epoch": 2.149768765563856, + "grad_norm": 0.6575320457562552, + "learning_rate": 2.31845260734941e-05, + "loss": 0.0871, + "step": 18129 + }, + { + "epoch": 2.1498873473259814, + "grad_norm": 0.9138209980568756, + "learning_rate": 2.318213205826496e-05, + "loss": 0.1271, + "step": 18130 + }, + { + "epoch": 2.150005929088106, + "grad_norm": 0.8297299890809935, + "learning_rate": 2.3179738059794344e-05, + "loss": 0.1002, + "step": 18131 + }, + { + "epoch": 2.1501245108502314, + "grad_norm": 0.8180974031582056, + "learning_rate": 2.3177344078104308e-05, + "loss": 0.1201, + "step": 18132 + }, + { + "epoch": 2.150243092612356, + "grad_norm": 0.6370147294690185, + "learning_rate": 2.317495011321692e-05, + "loss": 0.0972, + "step": 18133 + }, + { + "epoch": 2.1503616743744813, + "grad_norm": 0.631388197838621, + "learning_rate": 2.317255616515426e-05, + "loss": 0.0914, + "step": 18134 + }, + { + "epoch": 2.150480256136606, + "grad_norm": 0.6340892292528569, + "learning_rate": 2.3170162233938394e-05, + "loss": 0.0831, + "step": 18135 + }, + { + "epoch": 2.1505988378987313, + "grad_norm": 0.6328260781577894, + "learning_rate": 2.3167768319591387e-05, + "loss": 0.0809, + "step": 18136 + }, + { + "epoch": 2.150717419660856, + "grad_norm": 0.8528781529900971, + "learning_rate": 2.31653744221353e-05, + "loss": 0.0807, + "step": 18137 + }, + { + "epoch": 2.1508360014229813, + "grad_norm": 0.8794962657783777, + "learning_rate": 2.3162980541592223e-05, + "loss": 0.1358, + "step": 18138 + }, + { + "epoch": 2.150954583185106, + "grad_norm": 0.5774211025513339, + "learning_rate": 2.3160586677984207e-05, + "loss": 0.0684, + "step": 18139 + }, + { + "epoch": 2.151073164947231, + "grad_norm": 0.6738072930774789, + "learning_rate": 2.3158192831333327e-05, + "loss": 0.0787, + "step": 18140 + }, + { + "epoch": 2.151191746709356, + "grad_norm": 0.6485456332558643, + "learning_rate": 2.3155799001661648e-05, + "loss": 0.0773, + "step": 18141 + }, + { + "epoch": 2.151310328471481, + "grad_norm": 0.7391262890084076, + "learning_rate": 2.315340518899124e-05, + "loss": 0.0846, + "step": 18142 + }, + { + "epoch": 2.151428910233606, + "grad_norm": 0.9779840652770714, + "learning_rate": 2.315101139334418e-05, + "loss": 0.1284, + "step": 18143 + }, + { + "epoch": 2.151547491995731, + "grad_norm": 0.7025012195214992, + "learning_rate": 2.3148617614742524e-05, + "loss": 0.0958, + "step": 18144 + }, + { + "epoch": 2.151666073757856, + "grad_norm": 0.9971336498004546, + "learning_rate": 2.3146223853208334e-05, + "loss": 0.1327, + "step": 18145 + }, + { + "epoch": 2.151784655519981, + "grad_norm": 0.5671534828665069, + "learning_rate": 2.3143830108763698e-05, + "loss": 0.0798, + "step": 18146 + }, + { + "epoch": 2.151903237282106, + "grad_norm": 0.8716908413898118, + "learning_rate": 2.3141436381430673e-05, + "loss": 0.1022, + "step": 18147 + }, + { + "epoch": 2.152021819044231, + "grad_norm": 0.46130784405367475, + "learning_rate": 2.313904267123131e-05, + "loss": 0.0727, + "step": 18148 + }, + { + "epoch": 2.152140400806356, + "grad_norm": 0.6351613759543203, + "learning_rate": 2.3136648978187708e-05, + "loss": 0.0765, + "step": 18149 + }, + { + "epoch": 2.152258982568481, + "grad_norm": 0.7888419177928546, + "learning_rate": 2.3134255302321915e-05, + "loss": 0.1222, + "step": 18150 + }, + { + "epoch": 2.152377564330606, + "grad_norm": 0.6741672998405454, + "learning_rate": 2.3131861643655998e-05, + "loss": 0.0783, + "step": 18151 + }, + { + "epoch": 2.152496146092731, + "grad_norm": 0.8988194486721663, + "learning_rate": 2.312946800221202e-05, + "loss": 0.1362, + "step": 18152 + }, + { + "epoch": 2.1526147278548557, + "grad_norm": 1.3386723568240328, + "learning_rate": 2.3127074378012058e-05, + "loss": 0.1307, + "step": 18153 + }, + { + "epoch": 2.152733309616981, + "grad_norm": 0.6641771760248938, + "learning_rate": 2.3124680771078176e-05, + "loss": 0.0939, + "step": 18154 + }, + { + "epoch": 2.1528518913791057, + "grad_norm": 0.8418614838862601, + "learning_rate": 2.312228718143244e-05, + "loss": 0.1174, + "step": 18155 + }, + { + "epoch": 2.152970473141231, + "grad_norm": 1.0587160378128124, + "learning_rate": 2.31198936090969e-05, + "loss": 0.137, + "step": 18156 + }, + { + "epoch": 2.1530890549033557, + "grad_norm": 0.7854008753925822, + "learning_rate": 2.3117500054093645e-05, + "loss": 0.1021, + "step": 18157 + }, + { + "epoch": 2.153207636665481, + "grad_norm": 0.7781030683517973, + "learning_rate": 2.3115106516444738e-05, + "loss": 0.123, + "step": 18158 + }, + { + "epoch": 2.1533262184276056, + "grad_norm": 0.8259606510106414, + "learning_rate": 2.311271299617222e-05, + "loss": 0.0962, + "step": 18159 + }, + { + "epoch": 2.153444800189731, + "grad_norm": 0.736209319071738, + "learning_rate": 2.311031949329819e-05, + "loss": 0.1014, + "step": 18160 + }, + { + "epoch": 2.153563381951856, + "grad_norm": 0.6972753163739709, + "learning_rate": 2.310792600784469e-05, + "loss": 0.0962, + "step": 18161 + }, + { + "epoch": 2.153681963713981, + "grad_norm": 0.7402370264979539, + "learning_rate": 2.310553253983379e-05, + "loss": 0.1091, + "step": 18162 + }, + { + "epoch": 2.1538005454761056, + "grad_norm": 0.9732503963958173, + "learning_rate": 2.3103139089287558e-05, + "loss": 0.1533, + "step": 18163 + }, + { + "epoch": 2.1539191272382308, + "grad_norm": 0.8138120912286035, + "learning_rate": 2.3100745656228056e-05, + "loss": 0.0912, + "step": 18164 + }, + { + "epoch": 2.154037709000356, + "grad_norm": 0.7029214470128695, + "learning_rate": 2.309835224067736e-05, + "loss": 0.1042, + "step": 18165 + }, + { + "epoch": 2.1541562907624807, + "grad_norm": 0.7728986354093755, + "learning_rate": 2.309595884265752e-05, + "loss": 0.1125, + "step": 18166 + }, + { + "epoch": 2.154274872524606, + "grad_norm": 1.0640671295149697, + "learning_rate": 2.3093565462190595e-05, + "loss": 0.1431, + "step": 18167 + }, + { + "epoch": 2.1543934542867307, + "grad_norm": 0.5772411816749196, + "learning_rate": 2.3091172099298665e-05, + "loss": 0.0656, + "step": 18168 + }, + { + "epoch": 2.154512036048856, + "grad_norm": 0.6896418390996412, + "learning_rate": 2.308877875400379e-05, + "loss": 0.077, + "step": 18169 + }, + { + "epoch": 2.1546306178109806, + "grad_norm": 0.7808587379063265, + "learning_rate": 2.3086385426328032e-05, + "loss": 0.0943, + "step": 18170 + }, + { + "epoch": 2.154749199573106, + "grad_norm": 0.8667898192465658, + "learning_rate": 2.308399211629345e-05, + "loss": 0.1355, + "step": 18171 + }, + { + "epoch": 2.1548677813352306, + "grad_norm": 0.6953066606399103, + "learning_rate": 2.3081598823922108e-05, + "loss": 0.1084, + "step": 18172 + }, + { + "epoch": 2.154986363097356, + "grad_norm": 0.8821050999682643, + "learning_rate": 2.307920554923608e-05, + "loss": 0.1098, + "step": 18173 + }, + { + "epoch": 2.1551049448594806, + "grad_norm": 0.6392239423028726, + "learning_rate": 2.307681229225741e-05, + "loss": 0.0772, + "step": 18174 + }, + { + "epoch": 2.1552235266216058, + "grad_norm": 0.9246335450249412, + "learning_rate": 2.3074419053008185e-05, + "loss": 0.1416, + "step": 18175 + }, + { + "epoch": 2.1553421083837305, + "grad_norm": 0.8330116734809548, + "learning_rate": 2.3072025831510452e-05, + "loss": 0.1143, + "step": 18176 + }, + { + "epoch": 2.1554606901458557, + "grad_norm": 0.718101012555311, + "learning_rate": 2.306963262778628e-05, + "loss": 0.0796, + "step": 18177 + }, + { + "epoch": 2.1555792719079805, + "grad_norm": 0.7573833062131254, + "learning_rate": 2.3067239441857713e-05, + "loss": 0.1117, + "step": 18178 + }, + { + "epoch": 2.1556978536701057, + "grad_norm": 0.8199884217452375, + "learning_rate": 2.3064846273746844e-05, + "loss": 0.1119, + "step": 18179 + }, + { + "epoch": 2.1558164354322304, + "grad_norm": 0.8031634785029379, + "learning_rate": 2.3062453123475712e-05, + "loss": 0.1374, + "step": 18180 + }, + { + "epoch": 2.1559350171943557, + "grad_norm": 0.8377048822939261, + "learning_rate": 2.3060059991066388e-05, + "loss": 0.1273, + "step": 18181 + }, + { + "epoch": 2.1560535989564804, + "grad_norm": 0.6836474477348978, + "learning_rate": 2.3057666876540927e-05, + "loss": 0.0996, + "step": 18182 + }, + { + "epoch": 2.1561721807186056, + "grad_norm": 0.5191959010832761, + "learning_rate": 2.30552737799214e-05, + "loss": 0.0669, + "step": 18183 + }, + { + "epoch": 2.1562907624807304, + "grad_norm": 0.5220909854671199, + "learning_rate": 2.305288070122987e-05, + "loss": 0.0853, + "step": 18184 + }, + { + "epoch": 2.1564093442428556, + "grad_norm": 0.5908015739492222, + "learning_rate": 2.3050487640488388e-05, + "loss": 0.0842, + "step": 18185 + }, + { + "epoch": 2.1565279260049803, + "grad_norm": 0.8052188534461072, + "learning_rate": 2.304809459771901e-05, + "loss": 0.1144, + "step": 18186 + }, + { + "epoch": 2.1566465077671055, + "grad_norm": 0.6946834553344882, + "learning_rate": 2.3045701572943817e-05, + "loss": 0.0962, + "step": 18187 + }, + { + "epoch": 2.1567650895292303, + "grad_norm": 0.4703431196004679, + "learning_rate": 2.304330856618486e-05, + "loss": 0.0668, + "step": 18188 + }, + { + "epoch": 2.1568836712913555, + "grad_norm": 0.8822679208091541, + "learning_rate": 2.3040915577464183e-05, + "loss": 0.1261, + "step": 18189 + }, + { + "epoch": 2.1570022530534803, + "grad_norm": 0.5484524810777485, + "learning_rate": 2.303852260680388e-05, + "loss": 0.0804, + "step": 18190 + }, + { + "epoch": 2.1571208348156055, + "grad_norm": 0.6806465226347443, + "learning_rate": 2.3036129654225982e-05, + "loss": 0.0848, + "step": 18191 + }, + { + "epoch": 2.15723941657773, + "grad_norm": 0.7515767610424349, + "learning_rate": 2.303373671975257e-05, + "loss": 0.107, + "step": 18192 + }, + { + "epoch": 2.1573579983398554, + "grad_norm": 0.8765387331930589, + "learning_rate": 2.303134380340568e-05, + "loss": 0.1297, + "step": 18193 + }, + { + "epoch": 2.15747658010198, + "grad_norm": 0.7120820531707, + "learning_rate": 2.30289509052074e-05, + "loss": 0.1031, + "step": 18194 + }, + { + "epoch": 2.1575951618641054, + "grad_norm": 0.8313599065207818, + "learning_rate": 2.3026558025179775e-05, + "loss": 0.0992, + "step": 18195 + }, + { + "epoch": 2.15771374362623, + "grad_norm": 1.0520444854702482, + "learning_rate": 2.3024165163344862e-05, + "loss": 0.1458, + "step": 18196 + }, + { + "epoch": 2.1578323253883553, + "grad_norm": 0.6369963534844891, + "learning_rate": 2.3021772319724716e-05, + "loss": 0.0992, + "step": 18197 + }, + { + "epoch": 2.15795090715048, + "grad_norm": 1.2679832026652074, + "learning_rate": 2.3019379494341412e-05, + "loss": 0.1128, + "step": 18198 + }, + { + "epoch": 2.1580694889126053, + "grad_norm": 0.87661997533867, + "learning_rate": 2.3016986687217002e-05, + "loss": 0.0942, + "step": 18199 + }, + { + "epoch": 2.15818807067473, + "grad_norm": 0.9937268469813308, + "learning_rate": 2.3014593898373534e-05, + "loss": 0.1095, + "step": 18200 + }, + { + "epoch": 2.1583066524368553, + "grad_norm": 0.708482344464271, + "learning_rate": 2.301220112783308e-05, + "loss": 0.0863, + "step": 18201 + }, + { + "epoch": 2.15842523419898, + "grad_norm": 0.949234488936392, + "learning_rate": 2.3009808375617692e-05, + "loss": 0.1126, + "step": 18202 + }, + { + "epoch": 2.1585438159611052, + "grad_norm": 0.5821814492676864, + "learning_rate": 2.3007415641749437e-05, + "loss": 0.0796, + "step": 18203 + }, + { + "epoch": 2.15866239772323, + "grad_norm": 0.5739562806168383, + "learning_rate": 2.3005022926250353e-05, + "loss": 0.0838, + "step": 18204 + }, + { + "epoch": 2.158780979485355, + "grad_norm": 0.6339446001154138, + "learning_rate": 2.3002630229142525e-05, + "loss": 0.0876, + "step": 18205 + }, + { + "epoch": 2.15889956124748, + "grad_norm": 0.5748548478942015, + "learning_rate": 2.3000237550447995e-05, + "loss": 0.0659, + "step": 18206 + }, + { + "epoch": 2.159018143009605, + "grad_norm": 0.7259764121124648, + "learning_rate": 2.299784489018882e-05, + "loss": 0.1092, + "step": 18207 + }, + { + "epoch": 2.15913672477173, + "grad_norm": 0.7915851391422019, + "learning_rate": 2.2995452248387054e-05, + "loss": 0.1002, + "step": 18208 + }, + { + "epoch": 2.159255306533855, + "grad_norm": 0.6583993269000132, + "learning_rate": 2.2993059625064768e-05, + "loss": 0.0858, + "step": 18209 + }, + { + "epoch": 2.15937388829598, + "grad_norm": 0.5447839740172392, + "learning_rate": 2.2990667020244013e-05, + "loss": 0.0768, + "step": 18210 + }, + { + "epoch": 2.159492470058105, + "grad_norm": 0.8844225903989047, + "learning_rate": 2.2988274433946838e-05, + "loss": 0.0922, + "step": 18211 + }, + { + "epoch": 2.15961105182023, + "grad_norm": 0.9061308563486061, + "learning_rate": 2.2985881866195307e-05, + "loss": 0.1221, + "step": 18212 + }, + { + "epoch": 2.159729633582355, + "grad_norm": 0.5727541154692406, + "learning_rate": 2.2983489317011473e-05, + "loss": 0.0721, + "step": 18213 + }, + { + "epoch": 2.1598482153444802, + "grad_norm": 0.779447172881068, + "learning_rate": 2.29810967864174e-05, + "loss": 0.1152, + "step": 18214 + }, + { + "epoch": 2.159966797106605, + "grad_norm": 0.7560684136078012, + "learning_rate": 2.297870427443514e-05, + "loss": 0.1104, + "step": 18215 + }, + { + "epoch": 2.1600853788687298, + "grad_norm": 0.8064054902796883, + "learning_rate": 2.2976311781086735e-05, + "loss": 0.1076, + "step": 18216 + }, + { + "epoch": 2.160203960630855, + "grad_norm": 0.8791936436689805, + "learning_rate": 2.297391930639427e-05, + "loss": 0.0933, + "step": 18217 + }, + { + "epoch": 2.16032254239298, + "grad_norm": 0.9778196724452377, + "learning_rate": 2.297152685037978e-05, + "loss": 0.1386, + "step": 18218 + }, + { + "epoch": 2.160441124155105, + "grad_norm": 1.1474200461570863, + "learning_rate": 2.2969134413065313e-05, + "loss": 0.1442, + "step": 18219 + }, + { + "epoch": 2.16055970591723, + "grad_norm": 0.6206208781347465, + "learning_rate": 2.296674199447295e-05, + "loss": 0.0854, + "step": 18220 + }, + { + "epoch": 2.160678287679355, + "grad_norm": 0.7846071374243185, + "learning_rate": 2.2964349594624726e-05, + "loss": 0.0843, + "step": 18221 + }, + { + "epoch": 2.16079686944148, + "grad_norm": 0.5699595273521295, + "learning_rate": 2.296195721354271e-05, + "loss": 0.0751, + "step": 18222 + }, + { + "epoch": 2.160915451203605, + "grad_norm": 2.6637669414925957, + "learning_rate": 2.2959564851248936e-05, + "loss": 0.1343, + "step": 18223 + }, + { + "epoch": 2.16103403296573, + "grad_norm": 0.5510407334097618, + "learning_rate": 2.2957172507765482e-05, + "loss": 0.0759, + "step": 18224 + }, + { + "epoch": 2.161152614727855, + "grad_norm": 1.104142927478328, + "learning_rate": 2.2954780183114392e-05, + "loss": 0.1436, + "step": 18225 + }, + { + "epoch": 2.16127119648998, + "grad_norm": 1.1549746325924886, + "learning_rate": 2.2952387877317724e-05, + "loss": 0.104, + "step": 18226 + }, + { + "epoch": 2.1613897782521048, + "grad_norm": 0.9521459397836233, + "learning_rate": 2.2949995590397518e-05, + "loss": 0.1217, + "step": 18227 + }, + { + "epoch": 2.16150836001423, + "grad_norm": 1.1029606918239991, + "learning_rate": 2.2947603322375846e-05, + "loss": 0.1699, + "step": 18228 + }, + { + "epoch": 2.1616269417763547, + "grad_norm": 0.7516645562852932, + "learning_rate": 2.294521107327476e-05, + "loss": 0.1009, + "step": 18229 + }, + { + "epoch": 2.16174552353848, + "grad_norm": 0.5804239178780992, + "learning_rate": 2.2942818843116297e-05, + "loss": 0.0647, + "step": 18230 + }, + { + "epoch": 2.1618641053006047, + "grad_norm": 0.6407003116576069, + "learning_rate": 2.2940426631922523e-05, + "loss": 0.1037, + "step": 18231 + }, + { + "epoch": 2.16198268706273, + "grad_norm": 0.6636363062692989, + "learning_rate": 2.293803443971549e-05, + "loss": 0.0875, + "step": 18232 + }, + { + "epoch": 2.1621012688248546, + "grad_norm": 0.7118411591031854, + "learning_rate": 2.2935642266517256e-05, + "loss": 0.1005, + "step": 18233 + }, + { + "epoch": 2.16221985058698, + "grad_norm": 0.8723383342226, + "learning_rate": 2.293325011234986e-05, + "loss": 0.1181, + "step": 18234 + }, + { + "epoch": 2.1623384323491046, + "grad_norm": 0.9404470103079484, + "learning_rate": 2.2930857977235372e-05, + "loss": 0.1239, + "step": 18235 + }, + { + "epoch": 2.16245701411123, + "grad_norm": 0.7508830457331587, + "learning_rate": 2.292846586119584e-05, + "loss": 0.0903, + "step": 18236 + }, + { + "epoch": 2.1625755958733546, + "grad_norm": 0.7981798034775671, + "learning_rate": 2.292607376425331e-05, + "loss": 0.1048, + "step": 18237 + }, + { + "epoch": 2.1626941776354798, + "grad_norm": 0.6357129292179071, + "learning_rate": 2.2923681686429825e-05, + "loss": 0.1041, + "step": 18238 + }, + { + "epoch": 2.1628127593976045, + "grad_norm": 0.9339322282307543, + "learning_rate": 2.292128962774746e-05, + "loss": 0.1286, + "step": 18239 + }, + { + "epoch": 2.1629313411597297, + "grad_norm": 0.5900079585744792, + "learning_rate": 2.291889758822825e-05, + "loss": 0.0777, + "step": 18240 + }, + { + "epoch": 2.1630499229218545, + "grad_norm": 0.6057186965395314, + "learning_rate": 2.291650556789426e-05, + "loss": 0.0851, + "step": 18241 + }, + { + "epoch": 2.1631685046839797, + "grad_norm": 1.0299354487485834, + "learning_rate": 2.291411356676752e-05, + "loss": 0.1511, + "step": 18242 + }, + { + "epoch": 2.1632870864461045, + "grad_norm": 0.6098408007362349, + "learning_rate": 2.2911721584870108e-05, + "loss": 0.0794, + "step": 18243 + }, + { + "epoch": 2.1634056682082297, + "grad_norm": 0.7852178840167741, + "learning_rate": 2.290932962222406e-05, + "loss": 0.1009, + "step": 18244 + }, + { + "epoch": 2.1635242499703544, + "grad_norm": 0.6059217794429567, + "learning_rate": 2.290693767885142e-05, + "loss": 0.0812, + "step": 18245 + }, + { + "epoch": 2.1636428317324796, + "grad_norm": 0.501371366623731, + "learning_rate": 2.2904545754774258e-05, + "loss": 0.0708, + "step": 18246 + }, + { + "epoch": 2.1637614134946044, + "grad_norm": 1.0136155232677024, + "learning_rate": 2.2902153850014616e-05, + "loss": 0.1086, + "step": 18247 + }, + { + "epoch": 2.1638799952567296, + "grad_norm": 1.0416178785885206, + "learning_rate": 2.289976196459454e-05, + "loss": 0.1484, + "step": 18248 + }, + { + "epoch": 2.1639985770188543, + "grad_norm": 0.6014384038024505, + "learning_rate": 2.2897370098536075e-05, + "loss": 0.0988, + "step": 18249 + }, + { + "epoch": 2.1641171587809795, + "grad_norm": 0.6771301118220179, + "learning_rate": 2.2894978251861286e-05, + "loss": 0.1024, + "step": 18250 + }, + { + "epoch": 2.1642357405431043, + "grad_norm": 0.6364682743103897, + "learning_rate": 2.2892586424592216e-05, + "loss": 0.1135, + "step": 18251 + }, + { + "epoch": 2.1643543223052295, + "grad_norm": 0.7864691796077026, + "learning_rate": 2.2890194616750916e-05, + "loss": 0.1307, + "step": 18252 + }, + { + "epoch": 2.1644729040673543, + "grad_norm": 0.7929437817317556, + "learning_rate": 2.288780282835943e-05, + "loss": 0.1184, + "step": 18253 + }, + { + "epoch": 2.1645914858294795, + "grad_norm": 0.7035995773669962, + "learning_rate": 2.2885411059439815e-05, + "loss": 0.1095, + "step": 18254 + }, + { + "epoch": 2.1647100675916042, + "grad_norm": 0.9533224956704623, + "learning_rate": 2.288301931001412e-05, + "loss": 0.1338, + "step": 18255 + }, + { + "epoch": 2.1648286493537294, + "grad_norm": 0.6429047166376552, + "learning_rate": 2.288062758010439e-05, + "loss": 0.0707, + "step": 18256 + }, + { + "epoch": 2.164947231115854, + "grad_norm": 0.9607835011856475, + "learning_rate": 2.2878235869732668e-05, + "loss": 0.1284, + "step": 18257 + }, + { + "epoch": 2.1650658128779794, + "grad_norm": 0.6514257882479402, + "learning_rate": 2.2875844178921015e-05, + "loss": 0.0776, + "step": 18258 + }, + { + "epoch": 2.165184394640104, + "grad_norm": 0.473091441549393, + "learning_rate": 2.287345250769148e-05, + "loss": 0.0642, + "step": 18259 + }, + { + "epoch": 2.1653029764022294, + "grad_norm": 0.598712124473958, + "learning_rate": 2.287106085606609e-05, + "loss": 0.0756, + "step": 18260 + }, + { + "epoch": 2.165421558164354, + "grad_norm": 0.48076209454749275, + "learning_rate": 2.286866922406692e-05, + "loss": 0.0618, + "step": 18261 + }, + { + "epoch": 2.1655401399264793, + "grad_norm": 0.6597757168699985, + "learning_rate": 2.2866277611716e-05, + "loss": 0.0909, + "step": 18262 + }, + { + "epoch": 2.1656587216886045, + "grad_norm": 0.9629040082152795, + "learning_rate": 2.286388601903539e-05, + "loss": 0.1445, + "step": 18263 + }, + { + "epoch": 2.1657773034507293, + "grad_norm": 0.9807162126865389, + "learning_rate": 2.2861494446047118e-05, + "loss": 0.133, + "step": 18264 + }, + { + "epoch": 2.165895885212854, + "grad_norm": 0.7777359188908318, + "learning_rate": 2.285910289277326e-05, + "loss": 0.093, + "step": 18265 + }, + { + "epoch": 2.1660144669749792, + "grad_norm": 1.0218242770480346, + "learning_rate": 2.2856711359235846e-05, + "loss": 0.1566, + "step": 18266 + }, + { + "epoch": 2.1661330487371044, + "grad_norm": 0.8204177483641968, + "learning_rate": 2.285431984545692e-05, + "loss": 0.1263, + "step": 18267 + }, + { + "epoch": 2.166251630499229, + "grad_norm": 0.622232035290334, + "learning_rate": 2.285192835145853e-05, + "loss": 0.0781, + "step": 18268 + }, + { + "epoch": 2.1663702122613544, + "grad_norm": 0.641021750534238, + "learning_rate": 2.2849536877262737e-05, + "loss": 0.0777, + "step": 18269 + }, + { + "epoch": 2.166488794023479, + "grad_norm": 0.6968676421006453, + "learning_rate": 2.284714542289157e-05, + "loss": 0.0744, + "step": 18270 + }, + { + "epoch": 2.1666073757856044, + "grad_norm": 0.7141336918824486, + "learning_rate": 2.2844753988367086e-05, + "loss": 0.1059, + "step": 18271 + }, + { + "epoch": 2.166725957547729, + "grad_norm": 0.8536177265001599, + "learning_rate": 2.284236257371132e-05, + "loss": 0.0962, + "step": 18272 + }, + { + "epoch": 2.1668445393098543, + "grad_norm": 0.5859319508345553, + "learning_rate": 2.283997117894633e-05, + "loss": 0.0915, + "step": 18273 + }, + { + "epoch": 2.166963121071979, + "grad_norm": 0.82883504866387, + "learning_rate": 2.2837579804094163e-05, + "loss": 0.1011, + "step": 18274 + }, + { + "epoch": 2.1670817028341043, + "grad_norm": 0.6979317134617556, + "learning_rate": 2.2835188449176844e-05, + "loss": 0.0818, + "step": 18275 + }, + { + "epoch": 2.167200284596229, + "grad_norm": 0.6957083314978458, + "learning_rate": 2.2832797114216445e-05, + "loss": 0.0683, + "step": 18276 + }, + { + "epoch": 2.1673188663583542, + "grad_norm": 0.7200992650012682, + "learning_rate": 2.2830405799235e-05, + "loss": 0.101, + "step": 18277 + }, + { + "epoch": 2.167437448120479, + "grad_norm": 1.193263642257512, + "learning_rate": 2.2828014504254554e-05, + "loss": 0.1607, + "step": 18278 + }, + { + "epoch": 2.167556029882604, + "grad_norm": 0.7075843010557745, + "learning_rate": 2.2825623229297135e-05, + "loss": 0.1239, + "step": 18279 + }, + { + "epoch": 2.167674611644729, + "grad_norm": 0.4375121011368977, + "learning_rate": 2.282323197438482e-05, + "loss": 0.0534, + "step": 18280 + }, + { + "epoch": 2.167793193406854, + "grad_norm": 0.8140769040455942, + "learning_rate": 2.2820840739539624e-05, + "loss": 0.1029, + "step": 18281 + }, + { + "epoch": 2.167911775168979, + "grad_norm": 0.6677092388348671, + "learning_rate": 2.2818449524783615e-05, + "loss": 0.0822, + "step": 18282 + }, + { + "epoch": 2.168030356931104, + "grad_norm": 0.7395631073800455, + "learning_rate": 2.2816058330138815e-05, + "loss": 0.0986, + "step": 18283 + }, + { + "epoch": 2.168148938693229, + "grad_norm": 0.7817260724117859, + "learning_rate": 2.281366715562729e-05, + "loss": 0.121, + "step": 18284 + }, + { + "epoch": 2.168267520455354, + "grad_norm": 1.1057630530401805, + "learning_rate": 2.2811276001271072e-05, + "loss": 0.132, + "step": 18285 + }, + { + "epoch": 2.168386102217479, + "grad_norm": 0.9439367959186609, + "learning_rate": 2.2808884867092206e-05, + "loss": 0.133, + "step": 18286 + }, + { + "epoch": 2.168504683979604, + "grad_norm": 0.9664546902477019, + "learning_rate": 2.2806493753112725e-05, + "loss": 0.1055, + "step": 18287 + }, + { + "epoch": 2.168623265741729, + "grad_norm": 0.6744569868047293, + "learning_rate": 2.280410265935469e-05, + "loss": 0.081, + "step": 18288 + }, + { + "epoch": 2.168741847503854, + "grad_norm": 0.9152277351924556, + "learning_rate": 2.2801711585840135e-05, + "loss": 0.1388, + "step": 18289 + }, + { + "epoch": 2.1688604292659788, + "grad_norm": 0.9080784985736691, + "learning_rate": 2.2799320532591097e-05, + "loss": 0.1149, + "step": 18290 + }, + { + "epoch": 2.168979011028104, + "grad_norm": 0.8869363878699742, + "learning_rate": 2.279692949962963e-05, + "loss": 0.128, + "step": 18291 + }, + { + "epoch": 2.1690975927902287, + "grad_norm": 1.1724835846878408, + "learning_rate": 2.279453848697777e-05, + "loss": 0.117, + "step": 18292 + }, + { + "epoch": 2.169216174552354, + "grad_norm": 0.5833403144601471, + "learning_rate": 2.2792147494657565e-05, + "loss": 0.078, + "step": 18293 + }, + { + "epoch": 2.1693347563144787, + "grad_norm": 0.760803300490983, + "learning_rate": 2.278975652269104e-05, + "loss": 0.1193, + "step": 18294 + }, + { + "epoch": 2.169453338076604, + "grad_norm": 0.5947458443103202, + "learning_rate": 2.2787365571100264e-05, + "loss": 0.0744, + "step": 18295 + }, + { + "epoch": 2.1695719198387287, + "grad_norm": 0.8576205515718955, + "learning_rate": 2.278497463990726e-05, + "loss": 0.0934, + "step": 18296 + }, + { + "epoch": 2.169690501600854, + "grad_norm": 0.6336413528338436, + "learning_rate": 2.2782583729134077e-05, + "loss": 0.0885, + "step": 18297 + }, + { + "epoch": 2.1698090833629786, + "grad_norm": 0.6694818081251378, + "learning_rate": 2.2780192838802742e-05, + "loss": 0.0805, + "step": 18298 + }, + { + "epoch": 2.169927665125104, + "grad_norm": 1.0330677456949513, + "learning_rate": 2.2777801968935317e-05, + "loss": 0.1083, + "step": 18299 + }, + { + "epoch": 2.1700462468872286, + "grad_norm": 0.8102788226463821, + "learning_rate": 2.2775411119553827e-05, + "loss": 0.1148, + "step": 18300 + }, + { + "epoch": 2.170164828649354, + "grad_norm": 0.9252918618323722, + "learning_rate": 2.2773020290680323e-05, + "loss": 0.1258, + "step": 18301 + }, + { + "epoch": 2.1702834104114785, + "grad_norm": 0.5699441198542027, + "learning_rate": 2.277062948233683e-05, + "loss": 0.0903, + "step": 18302 + }, + { + "epoch": 2.1704019921736037, + "grad_norm": 0.7166337252277443, + "learning_rate": 2.276823869454541e-05, + "loss": 0.1102, + "step": 18303 + }, + { + "epoch": 2.1705205739357285, + "grad_norm": 0.753154963573039, + "learning_rate": 2.2765847927328096e-05, + "loss": 0.1249, + "step": 18304 + }, + { + "epoch": 2.1706391556978537, + "grad_norm": 0.6752992691138443, + "learning_rate": 2.2763457180706912e-05, + "loss": 0.0984, + "step": 18305 + }, + { + "epoch": 2.1707577374599785, + "grad_norm": 0.8514397172089723, + "learning_rate": 2.276106645470392e-05, + "loss": 0.1133, + "step": 18306 + }, + { + "epoch": 2.1708763192221037, + "grad_norm": 0.722478801393599, + "learning_rate": 2.2758675749341152e-05, + "loss": 0.087, + "step": 18307 + }, + { + "epoch": 2.1709949009842284, + "grad_norm": 0.7176812619953669, + "learning_rate": 2.275628506464064e-05, + "loss": 0.1044, + "step": 18308 + }, + { + "epoch": 2.1711134827463536, + "grad_norm": 0.8417819659851523, + "learning_rate": 2.275389440062442e-05, + "loss": 0.1163, + "step": 18309 + }, + { + "epoch": 2.1712320645084784, + "grad_norm": 0.8737524081600735, + "learning_rate": 2.2751503757314552e-05, + "loss": 0.1173, + "step": 18310 + }, + { + "epoch": 2.1713506462706036, + "grad_norm": 0.7922417472269576, + "learning_rate": 2.2749113134733056e-05, + "loss": 0.1173, + "step": 18311 + }, + { + "epoch": 2.171469228032729, + "grad_norm": 0.9723759271753204, + "learning_rate": 2.274672253290198e-05, + "loss": 0.1004, + "step": 18312 + }, + { + "epoch": 2.1715878097948536, + "grad_norm": 0.8434536425209765, + "learning_rate": 2.274433195184335e-05, + "loss": 0.1049, + "step": 18313 + }, + { + "epoch": 2.1717063915569783, + "grad_norm": 0.5518200635381321, + "learning_rate": 2.2741941391579224e-05, + "loss": 0.078, + "step": 18314 + }, + { + "epoch": 2.1718249733191035, + "grad_norm": 0.8200728928633869, + "learning_rate": 2.273955085213163e-05, + "loss": 0.0899, + "step": 18315 + }, + { + "epoch": 2.1719435550812287, + "grad_norm": 0.5701326390560173, + "learning_rate": 2.2737160333522595e-05, + "loss": 0.0655, + "step": 18316 + }, + { + "epoch": 2.1720621368433535, + "grad_norm": 0.6631931948465511, + "learning_rate": 2.2734769835774175e-05, + "loss": 0.098, + "step": 18317 + }, + { + "epoch": 2.1721807186054787, + "grad_norm": 0.8217616601624214, + "learning_rate": 2.27323793589084e-05, + "loss": 0.1081, + "step": 18318 + }, + { + "epoch": 2.1722993003676034, + "grad_norm": 0.9520943155516961, + "learning_rate": 2.27299889029473e-05, + "loss": 0.1415, + "step": 18319 + }, + { + "epoch": 2.1724178821297286, + "grad_norm": 0.6464089155509882, + "learning_rate": 2.2727598467912927e-05, + "loss": 0.0945, + "step": 18320 + }, + { + "epoch": 2.1725364638918534, + "grad_norm": 0.8745565880353616, + "learning_rate": 2.27252080538273e-05, + "loss": 0.1164, + "step": 18321 + }, + { + "epoch": 2.1726550456539786, + "grad_norm": 0.7824845950562695, + "learning_rate": 2.2722817660712474e-05, + "loss": 0.1106, + "step": 18322 + }, + { + "epoch": 2.1727736274161034, + "grad_norm": 0.7511035302105583, + "learning_rate": 2.2720427288590475e-05, + "loss": 0.1129, + "step": 18323 + }, + { + "epoch": 2.1728922091782286, + "grad_norm": 1.0341494843139387, + "learning_rate": 2.271803693748333e-05, + "loss": 0.1049, + "step": 18324 + }, + { + "epoch": 2.1730107909403533, + "grad_norm": 0.4129142072829262, + "learning_rate": 2.27156466074131e-05, + "loss": 0.0539, + "step": 18325 + }, + { + "epoch": 2.1731293727024785, + "grad_norm": 0.9221415722164602, + "learning_rate": 2.2713256298401804e-05, + "loss": 0.1393, + "step": 18326 + }, + { + "epoch": 2.1732479544646033, + "grad_norm": 1.0565010273792987, + "learning_rate": 2.2710866010471485e-05, + "loss": 0.1546, + "step": 18327 + }, + { + "epoch": 2.1733665362267285, + "grad_norm": 0.5626137480955598, + "learning_rate": 2.270847574364416e-05, + "loss": 0.0796, + "step": 18328 + }, + { + "epoch": 2.1734851179888532, + "grad_norm": 0.7049042079273979, + "learning_rate": 2.270608549794189e-05, + "loss": 0.0953, + "step": 18329 + }, + { + "epoch": 2.1736036997509784, + "grad_norm": 0.8316444250734987, + "learning_rate": 2.2703695273386694e-05, + "loss": 0.1203, + "step": 18330 + }, + { + "epoch": 2.173722281513103, + "grad_norm": 0.8819118556347035, + "learning_rate": 2.2701305070000608e-05, + "loss": 0.1049, + "step": 18331 + }, + { + "epoch": 2.1738408632752284, + "grad_norm": 0.8606751771318868, + "learning_rate": 2.2698914887805677e-05, + "loss": 0.1334, + "step": 18332 + }, + { + "epoch": 2.173959445037353, + "grad_norm": 1.0430466488178458, + "learning_rate": 2.269652472682393e-05, + "loss": 0.134, + "step": 18333 + }, + { + "epoch": 2.1740780267994784, + "grad_norm": 1.1507572723655977, + "learning_rate": 2.26941345870774e-05, + "loss": 0.1388, + "step": 18334 + }, + { + "epoch": 2.174196608561603, + "grad_norm": 0.7096575564288057, + "learning_rate": 2.2691744468588112e-05, + "loss": 0.0978, + "step": 18335 + }, + { + "epoch": 2.1743151903237283, + "grad_norm": 0.6959499286390484, + "learning_rate": 2.268935437137812e-05, + "loss": 0.1017, + "step": 18336 + }, + { + "epoch": 2.174433772085853, + "grad_norm": 0.6089125546184191, + "learning_rate": 2.268696429546945e-05, + "loss": 0.0829, + "step": 18337 + }, + { + "epoch": 2.1745523538479783, + "grad_norm": 0.7151441485909925, + "learning_rate": 2.2684574240884127e-05, + "loss": 0.088, + "step": 18338 + }, + { + "epoch": 2.174670935610103, + "grad_norm": 3.003834054967566, + "learning_rate": 2.268218420764418e-05, + "loss": 0.1092, + "step": 18339 + }, + { + "epoch": 2.1747895173722283, + "grad_norm": 0.7515848285662812, + "learning_rate": 2.267979419577167e-05, + "loss": 0.1215, + "step": 18340 + }, + { + "epoch": 2.174908099134353, + "grad_norm": 0.625468584565516, + "learning_rate": 2.26774042052886e-05, + "loss": 0.1052, + "step": 18341 + }, + { + "epoch": 2.175026680896478, + "grad_norm": 0.9036370395316549, + "learning_rate": 2.2675014236217025e-05, + "loss": 0.1339, + "step": 18342 + }, + { + "epoch": 2.175145262658603, + "grad_norm": 0.7140695346848739, + "learning_rate": 2.2672624288578954e-05, + "loss": 0.0917, + "step": 18343 + }, + { + "epoch": 2.175263844420728, + "grad_norm": 0.7588220458319599, + "learning_rate": 2.267023436239644e-05, + "loss": 0.0932, + "step": 18344 + }, + { + "epoch": 2.175382426182853, + "grad_norm": 0.624413949780555, + "learning_rate": 2.2667844457691516e-05, + "loss": 0.0846, + "step": 18345 + }, + { + "epoch": 2.175501007944978, + "grad_norm": 0.49367819901287535, + "learning_rate": 2.266545457448619e-05, + "loss": 0.0709, + "step": 18346 + }, + { + "epoch": 2.175619589707103, + "grad_norm": 0.8211234904676614, + "learning_rate": 2.2663064712802523e-05, + "loss": 0.1002, + "step": 18347 + }, + { + "epoch": 2.175738171469228, + "grad_norm": 0.679032546004311, + "learning_rate": 2.2660674872662535e-05, + "loss": 0.0969, + "step": 18348 + }, + { + "epoch": 2.175856753231353, + "grad_norm": 0.8518795654572343, + "learning_rate": 2.265828505408825e-05, + "loss": 0.1209, + "step": 18349 + }, + { + "epoch": 2.175975334993478, + "grad_norm": 0.5803799459416905, + "learning_rate": 2.2655895257101704e-05, + "loss": 0.0774, + "step": 18350 + }, + { + "epoch": 2.176093916755603, + "grad_norm": 0.5721590550082291, + "learning_rate": 2.265350548172493e-05, + "loss": 0.0652, + "step": 18351 + }, + { + "epoch": 2.176212498517728, + "grad_norm": 0.6892649532482126, + "learning_rate": 2.265111572797996e-05, + "loss": 0.0915, + "step": 18352 + }, + { + "epoch": 2.176331080279853, + "grad_norm": 0.9691564054566336, + "learning_rate": 2.2648725995888824e-05, + "loss": 0.1115, + "step": 18353 + }, + { + "epoch": 2.176449662041978, + "grad_norm": 0.7303810162106675, + "learning_rate": 2.2646336285473544e-05, + "loss": 0.0842, + "step": 18354 + }, + { + "epoch": 2.1765682438041027, + "grad_norm": 0.6674254352084464, + "learning_rate": 2.2643946596756166e-05, + "loss": 0.0918, + "step": 18355 + }, + { + "epoch": 2.176686825566228, + "grad_norm": 0.7818826798810806, + "learning_rate": 2.264155692975871e-05, + "loss": 0.096, + "step": 18356 + }, + { + "epoch": 2.1768054073283527, + "grad_norm": 0.8140951146278154, + "learning_rate": 2.263916728450321e-05, + "loss": 0.1164, + "step": 18357 + }, + { + "epoch": 2.176923989090478, + "grad_norm": 0.61765318242083, + "learning_rate": 2.2636777661011678e-05, + "loss": 0.0934, + "step": 18358 + }, + { + "epoch": 2.1770425708526027, + "grad_norm": 0.7568926575318811, + "learning_rate": 2.2634388059306172e-05, + "loss": 0.1169, + "step": 18359 + }, + { + "epoch": 2.177161152614728, + "grad_norm": 0.9143073318977126, + "learning_rate": 2.26319984794087e-05, + "loss": 0.1456, + "step": 18360 + }, + { + "epoch": 2.177279734376853, + "grad_norm": 0.7416939863210084, + "learning_rate": 2.2629608921341297e-05, + "loss": 0.0873, + "step": 18361 + }, + { + "epoch": 2.177398316138978, + "grad_norm": 0.864125560380093, + "learning_rate": 2.2627219385125994e-05, + "loss": 0.1145, + "step": 18362 + }, + { + "epoch": 2.1775168979011026, + "grad_norm": 0.8401173559800118, + "learning_rate": 2.2624829870784825e-05, + "loss": 0.1287, + "step": 18363 + }, + { + "epoch": 2.177635479663228, + "grad_norm": 0.5757634253735214, + "learning_rate": 2.2622440378339814e-05, + "loss": 0.0707, + "step": 18364 + }, + { + "epoch": 2.177754061425353, + "grad_norm": 0.7346383461819286, + "learning_rate": 2.262005090781297e-05, + "loss": 0.1124, + "step": 18365 + }, + { + "epoch": 2.1778726431874778, + "grad_norm": 0.8092716789497358, + "learning_rate": 2.261766145922635e-05, + "loss": 0.1051, + "step": 18366 + }, + { + "epoch": 2.177991224949603, + "grad_norm": 0.5883538777524814, + "learning_rate": 2.2615272032601976e-05, + "loss": 0.0863, + "step": 18367 + }, + { + "epoch": 2.1781098067117277, + "grad_norm": 0.796243580203159, + "learning_rate": 2.261288262796186e-05, + "loss": 0.1114, + "step": 18368 + }, + { + "epoch": 2.178228388473853, + "grad_norm": 0.866030785134124, + "learning_rate": 2.261049324532804e-05, + "loss": 0.1257, + "step": 18369 + }, + { + "epoch": 2.1783469702359777, + "grad_norm": 0.632412797306405, + "learning_rate": 2.260810388472254e-05, + "loss": 0.0979, + "step": 18370 + }, + { + "epoch": 2.178465551998103, + "grad_norm": 0.8436535197050752, + "learning_rate": 2.2605714546167398e-05, + "loss": 0.1001, + "step": 18371 + }, + { + "epoch": 2.1785841337602276, + "grad_norm": 0.6733828937384795, + "learning_rate": 2.260332522968463e-05, + "loss": 0.1044, + "step": 18372 + }, + { + "epoch": 2.178702715522353, + "grad_norm": 0.7718749800912684, + "learning_rate": 2.2600935935296252e-05, + "loss": 0.0969, + "step": 18373 + }, + { + "epoch": 2.1788212972844776, + "grad_norm": 0.7443402946307769, + "learning_rate": 2.2598546663024316e-05, + "loss": 0.1064, + "step": 18374 + }, + { + "epoch": 2.178939879046603, + "grad_norm": 0.44122894960372244, + "learning_rate": 2.2596157412890833e-05, + "loss": 0.0707, + "step": 18375 + }, + { + "epoch": 2.1790584608087276, + "grad_norm": 0.6022952125115137, + "learning_rate": 2.2593768184917826e-05, + "loss": 0.0748, + "step": 18376 + }, + { + "epoch": 2.1791770425708528, + "grad_norm": 0.8551091655642724, + "learning_rate": 2.2591378979127327e-05, + "loss": 0.137, + "step": 18377 + }, + { + "epoch": 2.1792956243329775, + "grad_norm": 0.6321826786671534, + "learning_rate": 2.2588989795541364e-05, + "loss": 0.0676, + "step": 18378 + }, + { + "epoch": 2.1794142060951027, + "grad_norm": 0.9418576956569064, + "learning_rate": 2.2586600634181955e-05, + "loss": 0.1291, + "step": 18379 + }, + { + "epoch": 2.1795327878572275, + "grad_norm": 0.6218494200558723, + "learning_rate": 2.2584211495071127e-05, + "loss": 0.0894, + "step": 18380 + }, + { + "epoch": 2.1796513696193527, + "grad_norm": 1.6103034691789562, + "learning_rate": 2.258182237823091e-05, + "loss": 0.1591, + "step": 18381 + }, + { + "epoch": 2.1797699513814774, + "grad_norm": 0.717856525605982, + "learning_rate": 2.2579433283683328e-05, + "loss": 0.1064, + "step": 18382 + }, + { + "epoch": 2.1798885331436026, + "grad_norm": 0.5781432021685317, + "learning_rate": 2.25770442114504e-05, + "loss": 0.0887, + "step": 18383 + }, + { + "epoch": 2.1800071149057274, + "grad_norm": 0.6810072582568776, + "learning_rate": 2.2574655161554146e-05, + "loss": 0.0829, + "step": 18384 + }, + { + "epoch": 2.1801256966678526, + "grad_norm": 0.5796746374307297, + "learning_rate": 2.2572266134016605e-05, + "loss": 0.0748, + "step": 18385 + }, + { + "epoch": 2.1802442784299774, + "grad_norm": 0.7252697066664271, + "learning_rate": 2.2569877128859796e-05, + "loss": 0.0837, + "step": 18386 + }, + { + "epoch": 2.1803628601921026, + "grad_norm": 0.6936429972178799, + "learning_rate": 2.256748814610574e-05, + "loss": 0.0926, + "step": 18387 + }, + { + "epoch": 2.1804814419542273, + "grad_norm": 0.6408206579981919, + "learning_rate": 2.256509918577645e-05, + "loss": 0.0845, + "step": 18388 + }, + { + "epoch": 2.1806000237163525, + "grad_norm": 0.8111486459116686, + "learning_rate": 2.2562710247893967e-05, + "loss": 0.1062, + "step": 18389 + }, + { + "epoch": 2.1807186054784773, + "grad_norm": 0.8114353587718467, + "learning_rate": 2.2560321332480302e-05, + "loss": 0.103, + "step": 18390 + }, + { + "epoch": 2.1808371872406025, + "grad_norm": 1.0135205173648256, + "learning_rate": 2.2557932439557482e-05, + "loss": 0.1337, + "step": 18391 + }, + { + "epoch": 2.1809557690027273, + "grad_norm": 0.6727239415536918, + "learning_rate": 2.2555543569147535e-05, + "loss": 0.1008, + "step": 18392 + }, + { + "epoch": 2.1810743507648525, + "grad_norm": 0.6940193055648642, + "learning_rate": 2.255315472127248e-05, + "loss": 0.0998, + "step": 18393 + }, + { + "epoch": 2.181192932526977, + "grad_norm": 0.7916506435322717, + "learning_rate": 2.255076589595434e-05, + "loss": 0.121, + "step": 18394 + }, + { + "epoch": 2.1813115142891024, + "grad_norm": 0.8581349122837684, + "learning_rate": 2.254837709321512e-05, + "loss": 0.1223, + "step": 18395 + }, + { + "epoch": 2.181430096051227, + "grad_norm": 0.5744465272118806, + "learning_rate": 2.2545988313076873e-05, + "loss": 0.0782, + "step": 18396 + }, + { + "epoch": 2.1815486778133524, + "grad_norm": 0.6944368069181848, + "learning_rate": 2.2543599555561605e-05, + "loss": 0.0901, + "step": 18397 + }, + { + "epoch": 2.181667259575477, + "grad_norm": 0.8708443380600385, + "learning_rate": 2.2541210820691328e-05, + "loss": 0.1155, + "step": 18398 + }, + { + "epoch": 2.1817858413376023, + "grad_norm": 0.5016240411038565, + "learning_rate": 2.2538822108488073e-05, + "loss": 0.0708, + "step": 18399 + }, + { + "epoch": 2.181904423099727, + "grad_norm": 0.9561088761444287, + "learning_rate": 2.2536433418973856e-05, + "loss": 0.1169, + "step": 18400 + }, + { + "epoch": 2.1820230048618523, + "grad_norm": 1.156739113795515, + "learning_rate": 2.2534044752170714e-05, + "loss": 0.1472, + "step": 18401 + }, + { + "epoch": 2.182141586623977, + "grad_norm": 0.5885252429001123, + "learning_rate": 2.253165610810064e-05, + "loss": 0.0731, + "step": 18402 + }, + { + "epoch": 2.1822601683861023, + "grad_norm": 0.8219878705210989, + "learning_rate": 2.2529267486785682e-05, + "loss": 0.1269, + "step": 18403 + }, + { + "epoch": 2.182378750148227, + "grad_norm": 0.6607280206530093, + "learning_rate": 2.252687888824785e-05, + "loss": 0.092, + "step": 18404 + }, + { + "epoch": 2.1824973319103522, + "grad_norm": 0.958121881826483, + "learning_rate": 2.252449031250916e-05, + "loss": 0.1376, + "step": 18405 + }, + { + "epoch": 2.182615913672477, + "grad_norm": 0.682000712620283, + "learning_rate": 2.2522101759591624e-05, + "loss": 0.077, + "step": 18406 + }, + { + "epoch": 2.182734495434602, + "grad_norm": 0.6361306425914606, + "learning_rate": 2.251971322951728e-05, + "loss": 0.0906, + "step": 18407 + }, + { + "epoch": 2.182853077196727, + "grad_norm": 0.8364681159743217, + "learning_rate": 2.2517324722308143e-05, + "loss": 0.1297, + "step": 18408 + }, + { + "epoch": 2.182971658958852, + "grad_norm": 0.8008265498746963, + "learning_rate": 2.2514936237986218e-05, + "loss": 0.1167, + "step": 18409 + }, + { + "epoch": 2.1830902407209773, + "grad_norm": 0.7846279771798135, + "learning_rate": 2.2512547776573535e-05, + "loss": 0.1175, + "step": 18410 + }, + { + "epoch": 2.183208822483102, + "grad_norm": 0.5780426779490427, + "learning_rate": 2.2510159338092108e-05, + "loss": 0.0739, + "step": 18411 + }, + { + "epoch": 2.183327404245227, + "grad_norm": 0.7598249805558362, + "learning_rate": 2.2507770922563966e-05, + "loss": 0.1014, + "step": 18412 + }, + { + "epoch": 2.183445986007352, + "grad_norm": 0.6995332081899455, + "learning_rate": 2.250538253001112e-05, + "loss": 0.0952, + "step": 18413 + }, + { + "epoch": 2.1835645677694773, + "grad_norm": 0.7459147098463338, + "learning_rate": 2.250299416045558e-05, + "loss": 0.0935, + "step": 18414 + }, + { + "epoch": 2.183683149531602, + "grad_norm": 0.9633422322286446, + "learning_rate": 2.250060581391938e-05, + "loss": 0.125, + "step": 18415 + }, + { + "epoch": 2.183801731293727, + "grad_norm": 1.0553145512838658, + "learning_rate": 2.249821749042453e-05, + "loss": 0.1494, + "step": 18416 + }, + { + "epoch": 2.183920313055852, + "grad_norm": 0.6919259422274493, + "learning_rate": 2.2495829189993032e-05, + "loss": 0.0957, + "step": 18417 + }, + { + "epoch": 2.184038894817977, + "grad_norm": 0.5981364680504196, + "learning_rate": 2.2493440912646935e-05, + "loss": 0.0818, + "step": 18418 + }, + { + "epoch": 2.184157476580102, + "grad_norm": 0.6521651346338427, + "learning_rate": 2.2491052658408235e-05, + "loss": 0.0888, + "step": 18419 + }, + { + "epoch": 2.184276058342227, + "grad_norm": 0.5605763378526037, + "learning_rate": 2.248866442729895e-05, + "loss": 0.078, + "step": 18420 + }, + { + "epoch": 2.184394640104352, + "grad_norm": 0.5865603058262729, + "learning_rate": 2.24862762193411e-05, + "loss": 0.083, + "step": 18421 + }, + { + "epoch": 2.184513221866477, + "grad_norm": 0.9324292117579831, + "learning_rate": 2.24838880345567e-05, + "loss": 0.1278, + "step": 18422 + }, + { + "epoch": 2.184631803628602, + "grad_norm": 0.5569855986859769, + "learning_rate": 2.248149987296777e-05, + "loss": 0.0854, + "step": 18423 + }, + { + "epoch": 2.184750385390727, + "grad_norm": 0.7840562013418243, + "learning_rate": 2.2479111734596323e-05, + "loss": 0.1206, + "step": 18424 + }, + { + "epoch": 2.184868967152852, + "grad_norm": 0.8125754311456357, + "learning_rate": 2.2476723619464367e-05, + "loss": 0.1045, + "step": 18425 + }, + { + "epoch": 2.184987548914977, + "grad_norm": 0.7263410690845682, + "learning_rate": 2.247433552759393e-05, + "loss": 0.093, + "step": 18426 + }, + { + "epoch": 2.185106130677102, + "grad_norm": 0.6530092881140867, + "learning_rate": 2.2471947459007028e-05, + "loss": 0.0993, + "step": 18427 + }, + { + "epoch": 2.185224712439227, + "grad_norm": 0.5746708668182857, + "learning_rate": 2.2469559413725664e-05, + "loss": 0.0884, + "step": 18428 + }, + { + "epoch": 2.1853432942013518, + "grad_norm": 1.0667740435359643, + "learning_rate": 2.246717139177186e-05, + "loss": 0.1448, + "step": 18429 + }, + { + "epoch": 2.185461875963477, + "grad_norm": 0.7855047756584274, + "learning_rate": 2.2464783393167624e-05, + "loss": 0.1275, + "step": 18430 + }, + { + "epoch": 2.1855804577256017, + "grad_norm": 0.5478857324114155, + "learning_rate": 2.2462395417934987e-05, + "loss": 0.0696, + "step": 18431 + }, + { + "epoch": 2.185699039487727, + "grad_norm": 0.7546049923186586, + "learning_rate": 2.246000746609594e-05, + "loss": 0.1018, + "step": 18432 + }, + { + "epoch": 2.1858176212498517, + "grad_norm": 0.6674927981661097, + "learning_rate": 2.245761953767252e-05, + "loss": 0.0877, + "step": 18433 + }, + { + "epoch": 2.185936203011977, + "grad_norm": 0.5170105384388965, + "learning_rate": 2.245523163268673e-05, + "loss": 0.0758, + "step": 18434 + }, + { + "epoch": 2.1860547847741016, + "grad_norm": 0.9407099424580031, + "learning_rate": 2.2452843751160585e-05, + "loss": 0.15, + "step": 18435 + }, + { + "epoch": 2.186173366536227, + "grad_norm": 1.1189742144371786, + "learning_rate": 2.2450455893116085e-05, + "loss": 0.133, + "step": 18436 + }, + { + "epoch": 2.1862919482983516, + "grad_norm": 0.5132011815421205, + "learning_rate": 2.2448068058575268e-05, + "loss": 0.0611, + "step": 18437 + }, + { + "epoch": 2.186410530060477, + "grad_norm": 0.48964948950489356, + "learning_rate": 2.244568024756013e-05, + "loss": 0.0678, + "step": 18438 + }, + { + "epoch": 2.1865291118226016, + "grad_norm": 0.7074923457498284, + "learning_rate": 2.2443292460092685e-05, + "loss": 0.1117, + "step": 18439 + }, + { + "epoch": 2.1866476935847268, + "grad_norm": 0.5985010694243094, + "learning_rate": 2.244090469619495e-05, + "loss": 0.0601, + "step": 18440 + }, + { + "epoch": 2.1867662753468515, + "grad_norm": 0.6273669030015802, + "learning_rate": 2.243851695588893e-05, + "loss": 0.1091, + "step": 18441 + }, + { + "epoch": 2.1868848571089767, + "grad_norm": 0.6008041323131077, + "learning_rate": 2.2436129239196656e-05, + "loss": 0.0939, + "step": 18442 + }, + { + "epoch": 2.1870034388711015, + "grad_norm": 0.6174083399242325, + "learning_rate": 2.2433741546140123e-05, + "loss": 0.1119, + "step": 18443 + }, + { + "epoch": 2.1871220206332267, + "grad_norm": 0.5840861335942009, + "learning_rate": 2.2431353876741332e-05, + "loss": 0.0932, + "step": 18444 + }, + { + "epoch": 2.1872406023953515, + "grad_norm": 0.6382506427265218, + "learning_rate": 2.2428966231022323e-05, + "loss": 0.0912, + "step": 18445 + }, + { + "epoch": 2.1873591841574767, + "grad_norm": 0.8222890368025033, + "learning_rate": 2.2426578609005088e-05, + "loss": 0.0916, + "step": 18446 + }, + { + "epoch": 2.1874777659196014, + "grad_norm": 0.5734600593615354, + "learning_rate": 2.2424191010711636e-05, + "loss": 0.085, + "step": 18447 + }, + { + "epoch": 2.1875963476817266, + "grad_norm": 0.596340578378995, + "learning_rate": 2.2421803436163998e-05, + "loss": 0.0837, + "step": 18448 + }, + { + "epoch": 2.1877149294438514, + "grad_norm": 1.0084540696461124, + "learning_rate": 2.2419415885384158e-05, + "loss": 0.113, + "step": 18449 + }, + { + "epoch": 2.1878335112059766, + "grad_norm": 0.6722281774865035, + "learning_rate": 2.241702835839415e-05, + "loss": 0.089, + "step": 18450 + }, + { + "epoch": 2.1879520929681013, + "grad_norm": 0.7528960985132602, + "learning_rate": 2.2414640855215956e-05, + "loss": 0.1007, + "step": 18451 + }, + { + "epoch": 2.1880706747302265, + "grad_norm": 0.5037195078723284, + "learning_rate": 2.241225337587162e-05, + "loss": 0.0648, + "step": 18452 + }, + { + "epoch": 2.1881892564923513, + "grad_norm": 0.8574874749033047, + "learning_rate": 2.240986592038313e-05, + "loss": 0.1445, + "step": 18453 + }, + { + "epoch": 2.1883078382544765, + "grad_norm": 1.0094148512797658, + "learning_rate": 2.2407478488772503e-05, + "loss": 0.1021, + "step": 18454 + }, + { + "epoch": 2.1884264200166013, + "grad_norm": 0.8530636284562935, + "learning_rate": 2.2405091081061733e-05, + "loss": 0.0886, + "step": 18455 + }, + { + "epoch": 2.1885450017787265, + "grad_norm": 0.8561293182107678, + "learning_rate": 2.2402703697272853e-05, + "loss": 0.1057, + "step": 18456 + }, + { + "epoch": 2.188663583540851, + "grad_norm": 0.675044492595026, + "learning_rate": 2.240031633742786e-05, + "loss": 0.101, + "step": 18457 + }, + { + "epoch": 2.1887821653029764, + "grad_norm": 0.9778476741242377, + "learning_rate": 2.2397929001548756e-05, + "loss": 0.1187, + "step": 18458 + }, + { + "epoch": 2.188900747065101, + "grad_norm": 0.5090457736613733, + "learning_rate": 2.2395541689657558e-05, + "loss": 0.0589, + "step": 18459 + }, + { + "epoch": 2.1890193288272264, + "grad_norm": 0.5474901828642386, + "learning_rate": 2.2393154401776267e-05, + "loss": 0.0896, + "step": 18460 + }, + { + "epoch": 2.189137910589351, + "grad_norm": 0.6353467834164056, + "learning_rate": 2.2390767137926904e-05, + "loss": 0.0926, + "step": 18461 + }, + { + "epoch": 2.1892564923514763, + "grad_norm": 0.8884683516963249, + "learning_rate": 2.238837989813146e-05, + "loss": 0.1159, + "step": 18462 + }, + { + "epoch": 2.1893750741136015, + "grad_norm": 0.8250137811405346, + "learning_rate": 2.238599268241196e-05, + "loss": 0.1205, + "step": 18463 + }, + { + "epoch": 2.1894936558757263, + "grad_norm": 0.6623926876237836, + "learning_rate": 2.23836054907904e-05, + "loss": 0.0898, + "step": 18464 + }, + { + "epoch": 2.189612237637851, + "grad_norm": 0.6715827609717537, + "learning_rate": 2.2381218323288787e-05, + "loss": 0.0821, + "step": 18465 + }, + { + "epoch": 2.1897308193999763, + "grad_norm": 0.6881309542800916, + "learning_rate": 2.2378831179929125e-05, + "loss": 0.0852, + "step": 18466 + }, + { + "epoch": 2.1898494011621015, + "grad_norm": 0.7839381071951821, + "learning_rate": 2.2376444060733433e-05, + "loss": 0.0819, + "step": 18467 + }, + { + "epoch": 2.1899679829242262, + "grad_norm": 0.7753718805237698, + "learning_rate": 2.237405696572371e-05, + "loss": 0.1071, + "step": 18468 + }, + { + "epoch": 2.1900865646863514, + "grad_norm": 0.823825977896918, + "learning_rate": 2.2371669894921954e-05, + "loss": 0.1264, + "step": 18469 + }, + { + "epoch": 2.190205146448476, + "grad_norm": 0.5974648359718348, + "learning_rate": 2.2369282848350182e-05, + "loss": 0.085, + "step": 18470 + }, + { + "epoch": 2.1903237282106014, + "grad_norm": 0.6543164570813214, + "learning_rate": 2.2366895826030393e-05, + "loss": 0.0841, + "step": 18471 + }, + { + "epoch": 2.190442309972726, + "grad_norm": 1.181393737484423, + "learning_rate": 2.23645088279846e-05, + "loss": 0.116, + "step": 18472 + }, + { + "epoch": 2.1905608917348514, + "grad_norm": 1.079893083388744, + "learning_rate": 2.236212185423481e-05, + "loss": 0.1349, + "step": 18473 + }, + { + "epoch": 2.190679473496976, + "grad_norm": 0.6615243863550132, + "learning_rate": 2.2359734904803005e-05, + "loss": 0.0759, + "step": 18474 + }, + { + "epoch": 2.1907980552591013, + "grad_norm": 0.4717051269460087, + "learning_rate": 2.2357347979711218e-05, + "loss": 0.0551, + "step": 18475 + }, + { + "epoch": 2.190916637021226, + "grad_norm": 0.9057013742077762, + "learning_rate": 2.2354961078981447e-05, + "loss": 0.0869, + "step": 18476 + }, + { + "epoch": 2.1910352187833513, + "grad_norm": 0.760753499100508, + "learning_rate": 2.2352574202635674e-05, + "loss": 0.0957, + "step": 18477 + }, + { + "epoch": 2.191153800545476, + "grad_norm": 0.6368180482369633, + "learning_rate": 2.2350187350695932e-05, + "loss": 0.0795, + "step": 18478 + }, + { + "epoch": 2.1912723823076012, + "grad_norm": 0.8281067604284843, + "learning_rate": 2.234780052318421e-05, + "loss": 0.1143, + "step": 18479 + }, + { + "epoch": 2.191390964069726, + "grad_norm": 0.7883761229061127, + "learning_rate": 2.2345413720122523e-05, + "loss": 0.1271, + "step": 18480 + }, + { + "epoch": 2.191509545831851, + "grad_norm": 0.8190226298975014, + "learning_rate": 2.2343026941532853e-05, + "loss": 0.1176, + "step": 18481 + }, + { + "epoch": 2.191628127593976, + "grad_norm": 0.49293805780579264, + "learning_rate": 2.2340640187437224e-05, + "loss": 0.0785, + "step": 18482 + }, + { + "epoch": 2.191746709356101, + "grad_norm": 0.6268657248331693, + "learning_rate": 2.2338253457857636e-05, + "loss": 0.0904, + "step": 18483 + }, + { + "epoch": 2.191865291118226, + "grad_norm": 1.2354411864211547, + "learning_rate": 2.2335866752816084e-05, + "loss": 0.1678, + "step": 18484 + }, + { + "epoch": 2.191983872880351, + "grad_norm": 0.8590678768656969, + "learning_rate": 2.2333480072334565e-05, + "loss": 0.1328, + "step": 18485 + }, + { + "epoch": 2.192102454642476, + "grad_norm": 0.5508647975535224, + "learning_rate": 2.2331093416435096e-05, + "loss": 0.081, + "step": 18486 + }, + { + "epoch": 2.192221036404601, + "grad_norm": 0.6217181643038421, + "learning_rate": 2.232870678513968e-05, + "loss": 0.0893, + "step": 18487 + }, + { + "epoch": 2.192339618166726, + "grad_norm": 0.5724887270324105, + "learning_rate": 2.2326320178470294e-05, + "loss": 0.0742, + "step": 18488 + }, + { + "epoch": 2.192458199928851, + "grad_norm": 0.9439234280840562, + "learning_rate": 2.232393359644897e-05, + "loss": 0.1246, + "step": 18489 + }, + { + "epoch": 2.192576781690976, + "grad_norm": 0.9992920410286019, + "learning_rate": 2.2321547039097694e-05, + "loss": 0.1319, + "step": 18490 + }, + { + "epoch": 2.192695363453101, + "grad_norm": 0.5904562611430167, + "learning_rate": 2.2319160506438474e-05, + "loss": 0.0882, + "step": 18491 + }, + { + "epoch": 2.1928139452152258, + "grad_norm": 0.57287236357984, + "learning_rate": 2.2316773998493296e-05, + "loss": 0.0878, + "step": 18492 + }, + { + "epoch": 2.192932526977351, + "grad_norm": 0.7128153048590048, + "learning_rate": 2.231438751528418e-05, + "loss": 0.1141, + "step": 18493 + }, + { + "epoch": 2.1930511087394757, + "grad_norm": 0.653657657716474, + "learning_rate": 2.231200105683312e-05, + "loss": 0.0924, + "step": 18494 + }, + { + "epoch": 2.193169690501601, + "grad_norm": 0.6582609781713341, + "learning_rate": 2.2309614623162115e-05, + "loss": 0.0862, + "step": 18495 + }, + { + "epoch": 2.1932882722637257, + "grad_norm": 0.8900287232145305, + "learning_rate": 2.230722821429315e-05, + "loss": 0.1263, + "step": 18496 + }, + { + "epoch": 2.193406854025851, + "grad_norm": 0.7548047350126029, + "learning_rate": 2.2304841830248253e-05, + "loss": 0.101, + "step": 18497 + }, + { + "epoch": 2.1935254357879757, + "grad_norm": 0.8875815058046704, + "learning_rate": 2.2302455471049403e-05, + "loss": 0.1011, + "step": 18498 + }, + { + "epoch": 2.193644017550101, + "grad_norm": 0.5062227000328744, + "learning_rate": 2.230006913671861e-05, + "loss": 0.0616, + "step": 18499 + }, + { + "epoch": 2.1937625993122256, + "grad_norm": 0.6396079713520615, + "learning_rate": 2.2297682827277858e-05, + "loss": 0.0928, + "step": 18500 + }, + { + "epoch": 2.193881181074351, + "grad_norm": 1.184536068596578, + "learning_rate": 2.2295296542749167e-05, + "loss": 0.1243, + "step": 18501 + }, + { + "epoch": 2.1939997628364756, + "grad_norm": 0.7714183539614202, + "learning_rate": 2.2292910283154524e-05, + "loss": 0.0963, + "step": 18502 + }, + { + "epoch": 2.194118344598601, + "grad_norm": 0.6098704111105191, + "learning_rate": 2.229052404851592e-05, + "loss": 0.0834, + "step": 18503 + }, + { + "epoch": 2.1942369263607255, + "grad_norm": 0.5464479276897053, + "learning_rate": 2.228813783885537e-05, + "loss": 0.0673, + "step": 18504 + }, + { + "epoch": 2.1943555081228507, + "grad_norm": 0.7853634212661972, + "learning_rate": 2.2285751654194867e-05, + "loss": 0.0991, + "step": 18505 + }, + { + "epoch": 2.1944740898849755, + "grad_norm": 0.6374050492009214, + "learning_rate": 2.2283365494556404e-05, + "loss": 0.0792, + "step": 18506 + }, + { + "epoch": 2.1945926716471007, + "grad_norm": 0.780897838208035, + "learning_rate": 2.2280979359961967e-05, + "loss": 0.0951, + "step": 18507 + }, + { + "epoch": 2.1947112534092255, + "grad_norm": 0.858538377033894, + "learning_rate": 2.227859325043358e-05, + "loss": 0.1287, + "step": 18508 + }, + { + "epoch": 2.1948298351713507, + "grad_norm": 0.6647020688707223, + "learning_rate": 2.2276207165993217e-05, + "loss": 0.0692, + "step": 18509 + }, + { + "epoch": 2.1949484169334754, + "grad_norm": 0.7807601421241424, + "learning_rate": 2.2273821106662892e-05, + "loss": 0.0902, + "step": 18510 + }, + { + "epoch": 2.1950669986956006, + "grad_norm": 0.858675262934243, + "learning_rate": 2.2271435072464584e-05, + "loss": 0.0934, + "step": 18511 + }, + { + "epoch": 2.195185580457726, + "grad_norm": 0.8200956669682851, + "learning_rate": 2.2269049063420305e-05, + "loss": 0.1145, + "step": 18512 + }, + { + "epoch": 2.1953041622198506, + "grad_norm": 1.035577099384347, + "learning_rate": 2.2266663079552047e-05, + "loss": 0.1273, + "step": 18513 + }, + { + "epoch": 2.1954227439819753, + "grad_norm": 0.732997696023518, + "learning_rate": 2.2264277120881803e-05, + "loss": 0.0906, + "step": 18514 + }, + { + "epoch": 2.1955413257441005, + "grad_norm": 0.6850650766663792, + "learning_rate": 2.2261891187431562e-05, + "loss": 0.1142, + "step": 18515 + }, + { + "epoch": 2.1956599075062257, + "grad_norm": 0.8176970851743066, + "learning_rate": 2.2259505279223333e-05, + "loss": 0.0962, + "step": 18516 + }, + { + "epoch": 2.1957784892683505, + "grad_norm": 0.8054752461664582, + "learning_rate": 2.2257119396279105e-05, + "loss": 0.1084, + "step": 18517 + }, + { + "epoch": 2.1958970710304757, + "grad_norm": 0.553423809719829, + "learning_rate": 2.2254733538620864e-05, + "loss": 0.0691, + "step": 18518 + }, + { + "epoch": 2.1960156527926005, + "grad_norm": 0.6709360335210435, + "learning_rate": 2.2252347706270622e-05, + "loss": 0.0909, + "step": 18519 + }, + { + "epoch": 2.1961342345547257, + "grad_norm": 0.6160422823644436, + "learning_rate": 2.224996189925036e-05, + "loss": 0.0726, + "step": 18520 + }, + { + "epoch": 2.1962528163168504, + "grad_norm": 0.6556075571607048, + "learning_rate": 2.2247576117582083e-05, + "loss": 0.0804, + "step": 18521 + }, + { + "epoch": 2.1963713980789756, + "grad_norm": 0.8650940461960032, + "learning_rate": 2.224519036128777e-05, + "loss": 0.0992, + "step": 18522 + }, + { + "epoch": 2.1964899798411004, + "grad_norm": 0.6203374370418595, + "learning_rate": 2.224280463038943e-05, + "loss": 0.0802, + "step": 18523 + }, + { + "epoch": 2.1966085616032256, + "grad_norm": 0.786557174691255, + "learning_rate": 2.2240418924909055e-05, + "loss": 0.1099, + "step": 18524 + }, + { + "epoch": 2.1967271433653504, + "grad_norm": 0.853142201818019, + "learning_rate": 2.223803324486863e-05, + "loss": 0.1017, + "step": 18525 + }, + { + "epoch": 2.1968457251274756, + "grad_norm": 1.0522875220294179, + "learning_rate": 2.223564759029014e-05, + "loss": 0.1418, + "step": 18526 + }, + { + "epoch": 2.1969643068896003, + "grad_norm": 0.6809372935979422, + "learning_rate": 2.2233261961195603e-05, + "loss": 0.0922, + "step": 18527 + }, + { + "epoch": 2.1970828886517255, + "grad_norm": 0.6505065750432619, + "learning_rate": 2.2230876357606992e-05, + "loss": 0.0722, + "step": 18528 + }, + { + "epoch": 2.1972014704138503, + "grad_norm": 0.8094962424326148, + "learning_rate": 2.222849077954631e-05, + "loss": 0.1164, + "step": 18529 + }, + { + "epoch": 2.1973200521759755, + "grad_norm": 0.9082711875351711, + "learning_rate": 2.222610522703553e-05, + "loss": 0.1137, + "step": 18530 + }, + { + "epoch": 2.1974386339381002, + "grad_norm": 0.7536310593385196, + "learning_rate": 2.2223719700096675e-05, + "loss": 0.0867, + "step": 18531 + }, + { + "epoch": 2.1975572157002254, + "grad_norm": 0.6776671801795742, + "learning_rate": 2.222133419875172e-05, + "loss": 0.0995, + "step": 18532 + }, + { + "epoch": 2.19767579746235, + "grad_norm": 1.0260202813762966, + "learning_rate": 2.221894872302264e-05, + "loss": 0.1205, + "step": 18533 + }, + { + "epoch": 2.1977943792244754, + "grad_norm": 0.8453640607351962, + "learning_rate": 2.2216563272931457e-05, + "loss": 0.1093, + "step": 18534 + }, + { + "epoch": 2.1979129609866, + "grad_norm": 0.9434136525317276, + "learning_rate": 2.221417784850015e-05, + "loss": 0.0934, + "step": 18535 + }, + { + "epoch": 2.1980315427487254, + "grad_norm": 0.714320708593729, + "learning_rate": 2.22117924497507e-05, + "loss": 0.0902, + "step": 18536 + }, + { + "epoch": 2.19815012451085, + "grad_norm": 0.5878713275571802, + "learning_rate": 2.22094070767051e-05, + "loss": 0.0805, + "step": 18537 + }, + { + "epoch": 2.1982687062729753, + "grad_norm": 0.626199835045335, + "learning_rate": 2.2207021729385353e-05, + "loss": 0.083, + "step": 18538 + }, + { + "epoch": 2.1983872880351, + "grad_norm": 0.6031177979564564, + "learning_rate": 2.2204636407813434e-05, + "loss": 0.0799, + "step": 18539 + }, + { + "epoch": 2.1985058697972253, + "grad_norm": 0.5918239560552421, + "learning_rate": 2.2202251112011347e-05, + "loss": 0.0781, + "step": 18540 + }, + { + "epoch": 2.19862445155935, + "grad_norm": 0.5385284521300315, + "learning_rate": 2.219986584200106e-05, + "loss": 0.0783, + "step": 18541 + }, + { + "epoch": 2.1987430333214752, + "grad_norm": 0.8289809387521189, + "learning_rate": 2.2197480597804586e-05, + "loss": 0.1263, + "step": 18542 + }, + { + "epoch": 2.1988616150836, + "grad_norm": 0.8465947648863564, + "learning_rate": 2.219509537944391e-05, + "loss": 0.0911, + "step": 18543 + }, + { + "epoch": 2.198980196845725, + "grad_norm": 0.9051284399314428, + "learning_rate": 2.2192710186941014e-05, + "loss": 0.1244, + "step": 18544 + }, + { + "epoch": 2.19909877860785, + "grad_norm": 0.5645343672141971, + "learning_rate": 2.2190325020317872e-05, + "loss": 0.0909, + "step": 18545 + }, + { + "epoch": 2.199217360369975, + "grad_norm": 0.7067184591053518, + "learning_rate": 2.2187939879596504e-05, + "loss": 0.1012, + "step": 18546 + }, + { + "epoch": 2.1993359421321, + "grad_norm": 0.9157719499821433, + "learning_rate": 2.2185554764798876e-05, + "loss": 0.117, + "step": 18547 + }, + { + "epoch": 2.199454523894225, + "grad_norm": 1.2119930891830306, + "learning_rate": 2.2183169675946976e-05, + "loss": 0.1369, + "step": 18548 + }, + { + "epoch": 2.19957310565635, + "grad_norm": 1.0840501471801565, + "learning_rate": 2.218078461306281e-05, + "loss": 0.1488, + "step": 18549 + }, + { + "epoch": 2.199691687418475, + "grad_norm": 1.0280660258688932, + "learning_rate": 2.217839957616834e-05, + "loss": 0.1423, + "step": 18550 + }, + { + "epoch": 2.1998102691806, + "grad_norm": 0.5007376855546856, + "learning_rate": 2.2176014565285578e-05, + "loss": 0.0629, + "step": 18551 + }, + { + "epoch": 2.199928850942725, + "grad_norm": 1.1936499174749404, + "learning_rate": 2.2173629580436483e-05, + "loss": 0.1402, + "step": 18552 + }, + { + "epoch": 2.20004743270485, + "grad_norm": 0.6690989685537171, + "learning_rate": 2.2171244621643073e-05, + "loss": 0.1, + "step": 18553 + }, + { + "epoch": 2.200166014466975, + "grad_norm": 0.784882037108575, + "learning_rate": 2.2168859688927314e-05, + "loss": 0.0798, + "step": 18554 + }, + { + "epoch": 2.2002845962290998, + "grad_norm": 0.6684540464323727, + "learning_rate": 2.21664747823112e-05, + "loss": 0.0855, + "step": 18555 + }, + { + "epoch": 2.200403177991225, + "grad_norm": 0.7413071325703999, + "learning_rate": 2.21640899018167e-05, + "loss": 0.0969, + "step": 18556 + }, + { + "epoch": 2.2005217597533497, + "grad_norm": 0.8361374705201728, + "learning_rate": 2.2161705047465828e-05, + "loss": 0.1091, + "step": 18557 + }, + { + "epoch": 2.200640341515475, + "grad_norm": 0.8041811971504875, + "learning_rate": 2.2159320219280548e-05, + "loss": 0.0976, + "step": 18558 + }, + { + "epoch": 2.2007589232775997, + "grad_norm": 1.0524303402299942, + "learning_rate": 2.215693541728285e-05, + "loss": 0.1337, + "step": 18559 + }, + { + "epoch": 2.200877505039725, + "grad_norm": 0.6680201243633164, + "learning_rate": 2.2154550641494724e-05, + "loss": 0.0886, + "step": 18560 + }, + { + "epoch": 2.20099608680185, + "grad_norm": 0.6501557660392016, + "learning_rate": 2.2152165891938157e-05, + "loss": 0.0827, + "step": 18561 + }, + { + "epoch": 2.201114668563975, + "grad_norm": 0.6146163813522737, + "learning_rate": 2.2149781168635126e-05, + "loss": 0.0934, + "step": 18562 + }, + { + "epoch": 2.2012332503260996, + "grad_norm": 0.8826705546658759, + "learning_rate": 2.2147396471607606e-05, + "loss": 0.1202, + "step": 18563 + }, + { + "epoch": 2.201351832088225, + "grad_norm": 0.9852576266034967, + "learning_rate": 2.2145011800877606e-05, + "loss": 0.1175, + "step": 18564 + }, + { + "epoch": 2.20147041385035, + "grad_norm": 0.9513530019885685, + "learning_rate": 2.2142627156467098e-05, + "loss": 0.1302, + "step": 18565 + }, + { + "epoch": 2.201588995612475, + "grad_norm": 0.8654444352781222, + "learning_rate": 2.2140242538398058e-05, + "loss": 0.1266, + "step": 18566 + }, + { + "epoch": 2.2017075773746, + "grad_norm": 0.5434129379293668, + "learning_rate": 2.2137857946692468e-05, + "loss": 0.0753, + "step": 18567 + }, + { + "epoch": 2.2018261591367247, + "grad_norm": 0.5480297654142707, + "learning_rate": 2.213547338137233e-05, + "loss": 0.0639, + "step": 18568 + }, + { + "epoch": 2.20194474089885, + "grad_norm": 1.0351814034498477, + "learning_rate": 2.2133088842459607e-05, + "loss": 0.1388, + "step": 18569 + }, + { + "epoch": 2.2020633226609747, + "grad_norm": 0.71055519406069, + "learning_rate": 2.2130704329976293e-05, + "loss": 0.0943, + "step": 18570 + }, + { + "epoch": 2.2021819044231, + "grad_norm": 0.9193631799444636, + "learning_rate": 2.212831984394436e-05, + "loss": 0.1487, + "step": 18571 + }, + { + "epoch": 2.2023004861852247, + "grad_norm": 0.5067931795845955, + "learning_rate": 2.2125935384385805e-05, + "loss": 0.0572, + "step": 18572 + }, + { + "epoch": 2.20241906794735, + "grad_norm": 0.7943488855387889, + "learning_rate": 2.21235509513226e-05, + "loss": 0.1133, + "step": 18573 + }, + { + "epoch": 2.2025376497094746, + "grad_norm": 0.6930737956642986, + "learning_rate": 2.2121166544776715e-05, + "loss": 0.1048, + "step": 18574 + }, + { + "epoch": 2.2026562314716, + "grad_norm": 1.0697652505712003, + "learning_rate": 2.211878216477016e-05, + "loss": 0.1427, + "step": 18575 + }, + { + "epoch": 2.2027748132337246, + "grad_norm": 0.7925548364088736, + "learning_rate": 2.21163978113249e-05, + "loss": 0.0976, + "step": 18576 + }, + { + "epoch": 2.20289339499585, + "grad_norm": 0.6274133625874148, + "learning_rate": 2.2114013484462907e-05, + "loss": 0.0988, + "step": 18577 + }, + { + "epoch": 2.2030119767579746, + "grad_norm": 0.7632437822820491, + "learning_rate": 2.2111629184206176e-05, + "loss": 0.1309, + "step": 18578 + }, + { + "epoch": 2.2031305585200998, + "grad_norm": 0.7631345567142237, + "learning_rate": 2.210924491057668e-05, + "loss": 0.1263, + "step": 18579 + }, + { + "epoch": 2.2032491402822245, + "grad_norm": 1.3662623194326764, + "learning_rate": 2.2106860663596403e-05, + "loss": 0.157, + "step": 18580 + }, + { + "epoch": 2.2033677220443497, + "grad_norm": 0.7038165548268492, + "learning_rate": 2.2104476443287326e-05, + "loss": 0.0907, + "step": 18581 + }, + { + "epoch": 2.2034863038064745, + "grad_norm": 0.8349491913234616, + "learning_rate": 2.2102092249671412e-05, + "loss": 0.0767, + "step": 18582 + }, + { + "epoch": 2.2036048855685997, + "grad_norm": 0.8946905256324299, + "learning_rate": 2.2099708082770667e-05, + "loss": 0.1168, + "step": 18583 + }, + { + "epoch": 2.2037234673307244, + "grad_norm": 0.44266671844258443, + "learning_rate": 2.2097323942607055e-05, + "loss": 0.0691, + "step": 18584 + }, + { + "epoch": 2.2038420490928496, + "grad_norm": 0.6523131661658357, + "learning_rate": 2.209493982920256e-05, + "loss": 0.0627, + "step": 18585 + }, + { + "epoch": 2.2039606308549744, + "grad_norm": 0.5016930785632562, + "learning_rate": 2.2092555742579148e-05, + "loss": 0.0771, + "step": 18586 + }, + { + "epoch": 2.2040792126170996, + "grad_norm": 0.5978781784812954, + "learning_rate": 2.2090171682758813e-05, + "loss": 0.0814, + "step": 18587 + }, + { + "epoch": 2.2041977943792244, + "grad_norm": 0.8955751112518372, + "learning_rate": 2.208778764976352e-05, + "loss": 0.1425, + "step": 18588 + }, + { + "epoch": 2.2043163761413496, + "grad_norm": 0.7723725234874677, + "learning_rate": 2.2085403643615256e-05, + "loss": 0.0796, + "step": 18589 + }, + { + "epoch": 2.2044349579034743, + "grad_norm": 0.6198159230380986, + "learning_rate": 2.2083019664335998e-05, + "loss": 0.0926, + "step": 18590 + }, + { + "epoch": 2.2045535396655995, + "grad_norm": 0.756360778800563, + "learning_rate": 2.2080635711947725e-05, + "loss": 0.0956, + "step": 18591 + }, + { + "epoch": 2.2046721214277243, + "grad_norm": 0.7996900093697139, + "learning_rate": 2.2078251786472413e-05, + "loss": 0.1095, + "step": 18592 + }, + { + "epoch": 2.2047907031898495, + "grad_norm": 0.45190679371499526, + "learning_rate": 2.207586788793202e-05, + "loss": 0.0668, + "step": 18593 + }, + { + "epoch": 2.2049092849519742, + "grad_norm": 0.665975726164127, + "learning_rate": 2.207348401634856e-05, + "loss": 0.0962, + "step": 18594 + }, + { + "epoch": 2.2050278667140994, + "grad_norm": 0.5858734212135126, + "learning_rate": 2.207110017174398e-05, + "loss": 0.0808, + "step": 18595 + }, + { + "epoch": 2.205146448476224, + "grad_norm": 0.4829699188710921, + "learning_rate": 2.206871635414027e-05, + "loss": 0.0618, + "step": 18596 + }, + { + "epoch": 2.2052650302383494, + "grad_norm": 0.7795387758030494, + "learning_rate": 2.2066332563559385e-05, + "loss": 0.0969, + "step": 18597 + }, + { + "epoch": 2.205383612000474, + "grad_norm": 0.7947729608033122, + "learning_rate": 2.2063948800023332e-05, + "loss": 0.108, + "step": 18598 + }, + { + "epoch": 2.2055021937625994, + "grad_norm": 0.7497829229789136, + "learning_rate": 2.2061565063554064e-05, + "loss": 0.1023, + "step": 18599 + }, + { + "epoch": 2.205620775524724, + "grad_norm": 0.6595083176035033, + "learning_rate": 2.2059181354173564e-05, + "loss": 0.0858, + "step": 18600 + }, + { + "epoch": 2.2057393572868493, + "grad_norm": 0.6879356036744354, + "learning_rate": 2.2056797671903797e-05, + "loss": 0.0911, + "step": 18601 + }, + { + "epoch": 2.205857939048974, + "grad_norm": 0.5811243878593294, + "learning_rate": 2.2054414016766758e-05, + "loss": 0.0897, + "step": 18602 + }, + { + "epoch": 2.2059765208110993, + "grad_norm": 0.8096793251905643, + "learning_rate": 2.2052030388784413e-05, + "loss": 0.1127, + "step": 18603 + }, + { + "epoch": 2.206095102573224, + "grad_norm": 0.7451417839162476, + "learning_rate": 2.2049646787978717e-05, + "loss": 0.0958, + "step": 18604 + }, + { + "epoch": 2.2062136843353493, + "grad_norm": 0.670796870891284, + "learning_rate": 2.2047263214371673e-05, + "loss": 0.1041, + "step": 18605 + }, + { + "epoch": 2.206332266097474, + "grad_norm": 0.7376735863330911, + "learning_rate": 2.2044879667985238e-05, + "loss": 0.0954, + "step": 18606 + }, + { + "epoch": 2.206450847859599, + "grad_norm": 0.8633651388194945, + "learning_rate": 2.204249614884139e-05, + "loss": 0.1302, + "step": 18607 + }, + { + "epoch": 2.206569429621724, + "grad_norm": 0.7518356842963364, + "learning_rate": 2.2040112656962093e-05, + "loss": 0.1029, + "step": 18608 + }, + { + "epoch": 2.206688011383849, + "grad_norm": 0.8492067734332795, + "learning_rate": 2.203772919236933e-05, + "loss": 0.1234, + "step": 18609 + }, + { + "epoch": 2.2068065931459744, + "grad_norm": 0.7183766620489905, + "learning_rate": 2.203534575508508e-05, + "loss": 0.0836, + "step": 18610 + }, + { + "epoch": 2.206925174908099, + "grad_norm": 0.974540143537155, + "learning_rate": 2.2032962345131303e-05, + "loss": 0.1102, + "step": 18611 + }, + { + "epoch": 2.207043756670224, + "grad_norm": 1.0038836971111686, + "learning_rate": 2.2030578962529964e-05, + "loss": 0.1035, + "step": 18612 + }, + { + "epoch": 2.207162338432349, + "grad_norm": 0.644306625108896, + "learning_rate": 2.202819560730306e-05, + "loss": 0.0822, + "step": 18613 + }, + { + "epoch": 2.2072809201944743, + "grad_norm": 1.2027020384967633, + "learning_rate": 2.2025812279472546e-05, + "loss": 0.1395, + "step": 18614 + }, + { + "epoch": 2.207399501956599, + "grad_norm": 1.0594622702598857, + "learning_rate": 2.2023428979060396e-05, + "loss": 0.1305, + "step": 18615 + }, + { + "epoch": 2.2075180837187243, + "grad_norm": 0.6584386444540188, + "learning_rate": 2.2021045706088567e-05, + "loss": 0.0919, + "step": 18616 + }, + { + "epoch": 2.207636665480849, + "grad_norm": 0.9100718979565664, + "learning_rate": 2.201866246057906e-05, + "loss": 0.1136, + "step": 18617 + }, + { + "epoch": 2.2077552472429742, + "grad_norm": 0.7153970729503815, + "learning_rate": 2.2016279242553818e-05, + "loss": 0.0959, + "step": 18618 + }, + { + "epoch": 2.207873829005099, + "grad_norm": 0.7279621761995054, + "learning_rate": 2.2013896052034825e-05, + "loss": 0.1148, + "step": 18619 + }, + { + "epoch": 2.207992410767224, + "grad_norm": 0.870027704700297, + "learning_rate": 2.201151288904405e-05, + "loss": 0.0857, + "step": 18620 + }, + { + "epoch": 2.208110992529349, + "grad_norm": 0.8337689045850897, + "learning_rate": 2.2009129753603463e-05, + "loss": 0.0928, + "step": 18621 + }, + { + "epoch": 2.208229574291474, + "grad_norm": 0.6440783583648777, + "learning_rate": 2.2006746645735035e-05, + "loss": 0.1206, + "step": 18622 + }, + { + "epoch": 2.208348156053599, + "grad_norm": 0.6891570753357061, + "learning_rate": 2.200436356546072e-05, + "loss": 0.0984, + "step": 18623 + }, + { + "epoch": 2.208466737815724, + "grad_norm": 1.0249860097179124, + "learning_rate": 2.2001980512802514e-05, + "loss": 0.1741, + "step": 18624 + }, + { + "epoch": 2.208585319577849, + "grad_norm": 0.6903485651092206, + "learning_rate": 2.1999597487782368e-05, + "loss": 0.0875, + "step": 18625 + }, + { + "epoch": 2.208703901339974, + "grad_norm": 0.6062910246557257, + "learning_rate": 2.199721449042225e-05, + "loss": 0.0915, + "step": 18626 + }, + { + "epoch": 2.208822483102099, + "grad_norm": 0.936409676322443, + "learning_rate": 2.199483152074413e-05, + "loss": 0.1506, + "step": 18627 + }, + { + "epoch": 2.208941064864224, + "grad_norm": 0.6947070103436033, + "learning_rate": 2.1992448578769977e-05, + "loss": 0.1154, + "step": 18628 + }, + { + "epoch": 2.209059646626349, + "grad_norm": 0.7530375633730423, + "learning_rate": 2.199006566452177e-05, + "loss": 0.1077, + "step": 18629 + }, + { + "epoch": 2.209178228388474, + "grad_norm": 1.0009365088898239, + "learning_rate": 2.1987682778021463e-05, + "loss": 0.1236, + "step": 18630 + }, + { + "epoch": 2.2092968101505988, + "grad_norm": 0.5542601465160594, + "learning_rate": 2.198529991929102e-05, + "loss": 0.068, + "step": 18631 + }, + { + "epoch": 2.209415391912724, + "grad_norm": 0.7528113402301521, + "learning_rate": 2.1982917088352424e-05, + "loss": 0.0988, + "step": 18632 + }, + { + "epoch": 2.2095339736748487, + "grad_norm": 0.5881278347818135, + "learning_rate": 2.198053428522763e-05, + "loss": 0.0926, + "step": 18633 + }, + { + "epoch": 2.209652555436974, + "grad_norm": 0.6880838367554584, + "learning_rate": 2.1978151509938596e-05, + "loss": 0.1027, + "step": 18634 + }, + { + "epoch": 2.2097711371990987, + "grad_norm": 0.8814054904354307, + "learning_rate": 2.1975768762507312e-05, + "loss": 0.1039, + "step": 18635 + }, + { + "epoch": 2.209889718961224, + "grad_norm": 0.8794332308575439, + "learning_rate": 2.197338604295573e-05, + "loss": 0.1296, + "step": 18636 + }, + { + "epoch": 2.2100083007233486, + "grad_norm": 0.6604809888978591, + "learning_rate": 2.1971003351305816e-05, + "loss": 0.1183, + "step": 18637 + }, + { + "epoch": 2.210126882485474, + "grad_norm": 0.6830949084814016, + "learning_rate": 2.1968620687579533e-05, + "loss": 0.0938, + "step": 18638 + }, + { + "epoch": 2.2102454642475986, + "grad_norm": 0.7034628813258055, + "learning_rate": 2.196623805179885e-05, + "loss": 0.105, + "step": 18639 + }, + { + "epoch": 2.210364046009724, + "grad_norm": 0.9532083007683606, + "learning_rate": 2.196385544398574e-05, + "loss": 0.1424, + "step": 18640 + }, + { + "epoch": 2.2104826277718486, + "grad_norm": 0.8156634050913079, + "learning_rate": 2.196147286416216e-05, + "loss": 0.0878, + "step": 18641 + }, + { + "epoch": 2.2106012095339738, + "grad_norm": 0.8729415376990854, + "learning_rate": 2.195909031235006e-05, + "loss": 0.1203, + "step": 18642 + }, + { + "epoch": 2.2107197912960985, + "grad_norm": 0.5770768557473204, + "learning_rate": 2.1956707788571433e-05, + "loss": 0.0794, + "step": 18643 + }, + { + "epoch": 2.2108383730582237, + "grad_norm": 0.9712460024882714, + "learning_rate": 2.1954325292848224e-05, + "loss": 0.1368, + "step": 18644 + }, + { + "epoch": 2.2109569548203485, + "grad_norm": 0.787624591616509, + "learning_rate": 2.195194282520239e-05, + "loss": 0.0997, + "step": 18645 + }, + { + "epoch": 2.2110755365824737, + "grad_norm": 0.6452216279782554, + "learning_rate": 2.194956038565592e-05, + "loss": 0.0984, + "step": 18646 + }, + { + "epoch": 2.2111941183445984, + "grad_norm": 0.7059312389006707, + "learning_rate": 2.194717797423076e-05, + "loss": 0.0973, + "step": 18647 + }, + { + "epoch": 2.2113127001067236, + "grad_norm": 0.627076525728745, + "learning_rate": 2.1944795590948873e-05, + "loss": 0.0884, + "step": 18648 + }, + { + "epoch": 2.2114312818688484, + "grad_norm": 0.6011650331397197, + "learning_rate": 2.1942413235832222e-05, + "loss": 0.0755, + "step": 18649 + }, + { + "epoch": 2.2115498636309736, + "grad_norm": 0.8795694101074933, + "learning_rate": 2.194003090890277e-05, + "loss": 0.1183, + "step": 18650 + }, + { + "epoch": 2.2116684453930984, + "grad_norm": 1.0597435514606912, + "learning_rate": 2.1937648610182487e-05, + "loss": 0.1607, + "step": 18651 + }, + { + "epoch": 2.2117870271552236, + "grad_norm": 0.7917411486357385, + "learning_rate": 2.1935266339693328e-05, + "loss": 0.1237, + "step": 18652 + }, + { + "epoch": 2.2119056089173483, + "grad_norm": 0.8914149438945368, + "learning_rate": 2.1932884097457247e-05, + "loss": 0.0945, + "step": 18653 + }, + { + "epoch": 2.2120241906794735, + "grad_norm": 0.6407896648716348, + "learning_rate": 2.193050188349622e-05, + "loss": 0.1078, + "step": 18654 + }, + { + "epoch": 2.2121427724415983, + "grad_norm": 0.6021451242435333, + "learning_rate": 2.1928119697832206e-05, + "loss": 0.1067, + "step": 18655 + }, + { + "epoch": 2.2122613542037235, + "grad_norm": 0.7070466979194842, + "learning_rate": 2.1925737540487153e-05, + "loss": 0.081, + "step": 18656 + }, + { + "epoch": 2.2123799359658483, + "grad_norm": 0.5853492223139105, + "learning_rate": 2.1923355411483033e-05, + "loss": 0.0786, + "step": 18657 + }, + { + "epoch": 2.2124985177279735, + "grad_norm": 0.7853003727267024, + "learning_rate": 2.1920973310841798e-05, + "loss": 0.1064, + "step": 18658 + }, + { + "epoch": 2.212617099490098, + "grad_norm": 0.7082067726618405, + "learning_rate": 2.1918591238585423e-05, + "loss": 0.0844, + "step": 18659 + }, + { + "epoch": 2.2127356812522234, + "grad_norm": 0.6728041224700175, + "learning_rate": 2.1916209194735846e-05, + "loss": 0.0897, + "step": 18660 + }, + { + "epoch": 2.212854263014348, + "grad_norm": 0.6432716893785607, + "learning_rate": 2.191382717931505e-05, + "loss": 0.0825, + "step": 18661 + }, + { + "epoch": 2.2129728447764734, + "grad_norm": 0.7051747978325846, + "learning_rate": 2.1911445192344985e-05, + "loss": 0.0983, + "step": 18662 + }, + { + "epoch": 2.2130914265385986, + "grad_norm": 0.6713429748288177, + "learning_rate": 2.1909063233847605e-05, + "loss": 0.1123, + "step": 18663 + }, + { + "epoch": 2.2132100083007233, + "grad_norm": 0.8600886502656668, + "learning_rate": 2.190668130384486e-05, + "loss": 0.1237, + "step": 18664 + }, + { + "epoch": 2.213328590062848, + "grad_norm": 0.7151052667107956, + "learning_rate": 2.1904299402358733e-05, + "loss": 0.0885, + "step": 18665 + }, + { + "epoch": 2.2134471718249733, + "grad_norm": 0.6618300311391481, + "learning_rate": 2.1901917529411168e-05, + "loss": 0.1092, + "step": 18666 + }, + { + "epoch": 2.2135657535870985, + "grad_norm": 0.7116766422676849, + "learning_rate": 2.1899535685024118e-05, + "loss": 0.0816, + "step": 18667 + }, + { + "epoch": 2.2136843353492233, + "grad_norm": 0.687131103742838, + "learning_rate": 2.189715386921955e-05, + "loss": 0.0921, + "step": 18668 + }, + { + "epoch": 2.2138029171113485, + "grad_norm": 0.6379089671428497, + "learning_rate": 2.1894772082019416e-05, + "loss": 0.0854, + "step": 18669 + }, + { + "epoch": 2.2139214988734732, + "grad_norm": 0.7488806146928201, + "learning_rate": 2.1892390323445684e-05, + "loss": 0.1102, + "step": 18670 + }, + { + "epoch": 2.2140400806355984, + "grad_norm": 0.3744134510030876, + "learning_rate": 2.1890008593520297e-05, + "loss": 0.0476, + "step": 18671 + }, + { + "epoch": 2.214158662397723, + "grad_norm": 0.8765545168514761, + "learning_rate": 2.1887626892265213e-05, + "loss": 0.141, + "step": 18672 + }, + { + "epoch": 2.2142772441598484, + "grad_norm": 0.6520933403408776, + "learning_rate": 2.1885245219702398e-05, + "loss": 0.0685, + "step": 18673 + }, + { + "epoch": 2.214395825921973, + "grad_norm": 0.8071999468964205, + "learning_rate": 2.18828635758538e-05, + "loss": 0.1089, + "step": 18674 + }, + { + "epoch": 2.2145144076840984, + "grad_norm": 0.8515546905045722, + "learning_rate": 2.1880481960741374e-05, + "loss": 0.1157, + "step": 18675 + }, + { + "epoch": 2.214632989446223, + "grad_norm": 0.6034643129478984, + "learning_rate": 2.1878100374387085e-05, + "loss": 0.0784, + "step": 18676 + }, + { + "epoch": 2.2147515712083483, + "grad_norm": 0.6186813312770109, + "learning_rate": 2.1875718816812886e-05, + "loss": 0.0735, + "step": 18677 + }, + { + "epoch": 2.214870152970473, + "grad_norm": 1.0821210241338188, + "learning_rate": 2.1873337288040717e-05, + "loss": 0.1885, + "step": 18678 + }, + { + "epoch": 2.2149887347325983, + "grad_norm": 0.6850738906615209, + "learning_rate": 2.1870955788092547e-05, + "loss": 0.1102, + "step": 18679 + }, + { + "epoch": 2.215107316494723, + "grad_norm": 0.7692298389014989, + "learning_rate": 2.1868574316990332e-05, + "loss": 0.1102, + "step": 18680 + }, + { + "epoch": 2.2152258982568482, + "grad_norm": 0.7942619554401589, + "learning_rate": 2.186619287475602e-05, + "loss": 0.1335, + "step": 18681 + }, + { + "epoch": 2.215344480018973, + "grad_norm": 0.7533815851106666, + "learning_rate": 2.186381146141157e-05, + "loss": 0.0972, + "step": 18682 + }, + { + "epoch": 2.215463061781098, + "grad_norm": 0.6283102927787916, + "learning_rate": 2.1861430076978923e-05, + "loss": 0.0755, + "step": 18683 + }, + { + "epoch": 2.215581643543223, + "grad_norm": 0.8301645002043025, + "learning_rate": 2.1859048721480052e-05, + "loss": 0.119, + "step": 18684 + }, + { + "epoch": 2.215700225305348, + "grad_norm": 0.8977144414912559, + "learning_rate": 2.1856667394936898e-05, + "loss": 0.1471, + "step": 18685 + }, + { + "epoch": 2.215818807067473, + "grad_norm": 0.8741160018895833, + "learning_rate": 2.185428609737141e-05, + "loss": 0.1314, + "step": 18686 + }, + { + "epoch": 2.215937388829598, + "grad_norm": 0.9612728434343771, + "learning_rate": 2.185190482880555e-05, + "loss": 0.1361, + "step": 18687 + }, + { + "epoch": 2.216055970591723, + "grad_norm": 0.8580653727279521, + "learning_rate": 2.1849523589261266e-05, + "loss": 0.1311, + "step": 18688 + }, + { + "epoch": 2.216174552353848, + "grad_norm": 0.9588323801052889, + "learning_rate": 2.1847142378760514e-05, + "loss": 0.1302, + "step": 18689 + }, + { + "epoch": 2.216293134115973, + "grad_norm": 0.6198404001980826, + "learning_rate": 2.1844761197325237e-05, + "loss": 0.0927, + "step": 18690 + }, + { + "epoch": 2.216411715878098, + "grad_norm": 0.5784735277613501, + "learning_rate": 2.18423800449774e-05, + "loss": 0.0788, + "step": 18691 + }, + { + "epoch": 2.216530297640223, + "grad_norm": 1.0121844892101148, + "learning_rate": 2.1839998921738948e-05, + "loss": 0.139, + "step": 18692 + }, + { + "epoch": 2.216648879402348, + "grad_norm": 0.8568426153757488, + "learning_rate": 2.1837617827631833e-05, + "loss": 0.1223, + "step": 18693 + }, + { + "epoch": 2.2167674611644728, + "grad_norm": 0.8927015304356817, + "learning_rate": 2.183523676267799e-05, + "loss": 0.0987, + "step": 18694 + }, + { + "epoch": 2.216886042926598, + "grad_norm": 0.6804630945180242, + "learning_rate": 2.1832855726899392e-05, + "loss": 0.1159, + "step": 18695 + }, + { + "epoch": 2.2170046246887227, + "grad_norm": 0.8287177261518044, + "learning_rate": 2.1830474720317983e-05, + "loss": 0.1027, + "step": 18696 + }, + { + "epoch": 2.217123206450848, + "grad_norm": 1.0614487307462328, + "learning_rate": 2.1828093742955705e-05, + "loss": 0.1204, + "step": 18697 + }, + { + "epoch": 2.2172417882129727, + "grad_norm": 0.6141940127027554, + "learning_rate": 2.1825712794834514e-05, + "loss": 0.0762, + "step": 18698 + }, + { + "epoch": 2.217360369975098, + "grad_norm": 0.6426103008100169, + "learning_rate": 2.1823331875976354e-05, + "loss": 0.1018, + "step": 18699 + }, + { + "epoch": 2.2174789517372226, + "grad_norm": 0.7452552849171687, + "learning_rate": 2.182095098640319e-05, + "loss": 0.0952, + "step": 18700 + }, + { + "epoch": 2.217597533499348, + "grad_norm": 0.790238901613885, + "learning_rate": 2.1818570126136954e-05, + "loss": 0.1026, + "step": 18701 + }, + { + "epoch": 2.2177161152614726, + "grad_norm": 0.9266693321909398, + "learning_rate": 2.1816189295199594e-05, + "loss": 0.144, + "step": 18702 + }, + { + "epoch": 2.217834697023598, + "grad_norm": 0.7009742291438549, + "learning_rate": 2.181380849361307e-05, + "loss": 0.098, + "step": 18703 + }, + { + "epoch": 2.2179532787857226, + "grad_norm": 0.5725876141914784, + "learning_rate": 2.181142772139933e-05, + "loss": 0.0792, + "step": 18704 + }, + { + "epoch": 2.2180718605478478, + "grad_norm": 0.6208478932717707, + "learning_rate": 2.1809046978580305e-05, + "loss": 0.0677, + "step": 18705 + }, + { + "epoch": 2.2181904423099725, + "grad_norm": 0.5393205769434588, + "learning_rate": 2.180666626517796e-05, + "loss": 0.0784, + "step": 18706 + }, + { + "epoch": 2.2183090240720977, + "grad_norm": 0.6827932661282602, + "learning_rate": 2.1804285581214235e-05, + "loss": 0.1237, + "step": 18707 + }, + { + "epoch": 2.2184276058342225, + "grad_norm": 0.7334561509085107, + "learning_rate": 2.1801904926711083e-05, + "loss": 0.1245, + "step": 18708 + }, + { + "epoch": 2.2185461875963477, + "grad_norm": 0.9663215358778052, + "learning_rate": 2.179952430169043e-05, + "loss": 0.1421, + "step": 18709 + }, + { + "epoch": 2.2186647693584725, + "grad_norm": 0.6344197290360618, + "learning_rate": 2.1797143706174254e-05, + "loss": 0.0822, + "step": 18710 + }, + { + "epoch": 2.2187833511205977, + "grad_norm": 0.7638435880135139, + "learning_rate": 2.1794763140184487e-05, + "loss": 0.1113, + "step": 18711 + }, + { + "epoch": 2.218901932882723, + "grad_norm": 0.6012733108718435, + "learning_rate": 2.179238260374307e-05, + "loss": 0.089, + "step": 18712 + }, + { + "epoch": 2.2190205146448476, + "grad_norm": 0.6539711158768791, + "learning_rate": 2.1790002096871945e-05, + "loss": 0.0603, + "step": 18713 + }, + { + "epoch": 2.2191390964069724, + "grad_norm": 0.7488235439497176, + "learning_rate": 2.178762161959307e-05, + "loss": 0.1014, + "step": 18714 + }, + { + "epoch": 2.2192576781690976, + "grad_norm": 0.9571688597791002, + "learning_rate": 2.1785241171928388e-05, + "loss": 0.1471, + "step": 18715 + }, + { + "epoch": 2.219376259931223, + "grad_norm": 0.5724504931816764, + "learning_rate": 2.1782860753899833e-05, + "loss": 0.0714, + "step": 18716 + }, + { + "epoch": 2.2194948416933475, + "grad_norm": 0.5679356555553766, + "learning_rate": 2.1780480365529358e-05, + "loss": 0.0765, + "step": 18717 + }, + { + "epoch": 2.2196134234554727, + "grad_norm": 0.5857518003283306, + "learning_rate": 2.1778100006838904e-05, + "loss": 0.0936, + "step": 18718 + }, + { + "epoch": 2.2197320052175975, + "grad_norm": 0.8120237484943815, + "learning_rate": 2.1775719677850426e-05, + "loss": 0.108, + "step": 18719 + }, + { + "epoch": 2.2198505869797227, + "grad_norm": 0.7136030016253614, + "learning_rate": 2.1773339378585845e-05, + "loss": 0.0973, + "step": 18720 + }, + { + "epoch": 2.2199691687418475, + "grad_norm": 0.6244744878349615, + "learning_rate": 2.177095910906713e-05, + "loss": 0.0776, + "step": 18721 + }, + { + "epoch": 2.2200877505039727, + "grad_norm": 0.5949523161318707, + "learning_rate": 2.176857886931621e-05, + "loss": 0.0724, + "step": 18722 + }, + { + "epoch": 2.2202063322660974, + "grad_norm": 0.625403615098908, + "learning_rate": 2.176619865935503e-05, + "loss": 0.0994, + "step": 18723 + }, + { + "epoch": 2.2203249140282226, + "grad_norm": 0.7921867517076254, + "learning_rate": 2.1763818479205525e-05, + "loss": 0.0853, + "step": 18724 + }, + { + "epoch": 2.2204434957903474, + "grad_norm": 0.9848042325838526, + "learning_rate": 2.1761438328889655e-05, + "loss": 0.133, + "step": 18725 + }, + { + "epoch": 2.2205620775524726, + "grad_norm": 0.6968242006894193, + "learning_rate": 2.1759058208429355e-05, + "loss": 0.08, + "step": 18726 + }, + { + "epoch": 2.2206806593145973, + "grad_norm": 0.6351210625642078, + "learning_rate": 2.1756678117846553e-05, + "loss": 0.0838, + "step": 18727 + }, + { + "epoch": 2.2207992410767226, + "grad_norm": 0.7519041942911971, + "learning_rate": 2.17542980571632e-05, + "loss": 0.0933, + "step": 18728 + }, + { + "epoch": 2.2209178228388473, + "grad_norm": 0.5309643113493044, + "learning_rate": 2.1751918026401248e-05, + "loss": 0.0884, + "step": 18729 + }, + { + "epoch": 2.2210364046009725, + "grad_norm": 0.5706025259925694, + "learning_rate": 2.1749538025582627e-05, + "loss": 0.0932, + "step": 18730 + }, + { + "epoch": 2.2211549863630973, + "grad_norm": 0.6736978131525626, + "learning_rate": 2.174715805472927e-05, + "loss": 0.0828, + "step": 18731 + }, + { + "epoch": 2.2212735681252225, + "grad_norm": 0.8550123104962464, + "learning_rate": 2.174477811386314e-05, + "loss": 0.1158, + "step": 18732 + }, + { + "epoch": 2.2213921498873472, + "grad_norm": 0.8533094435047427, + "learning_rate": 2.1742398203006162e-05, + "loss": 0.1158, + "step": 18733 + }, + { + "epoch": 2.2215107316494724, + "grad_norm": 0.6019263942042418, + "learning_rate": 2.174001832218028e-05, + "loss": 0.081, + "step": 18734 + }, + { + "epoch": 2.221629313411597, + "grad_norm": 0.7087622543174938, + "learning_rate": 2.173763847140742e-05, + "loss": 0.1098, + "step": 18735 + }, + { + "epoch": 2.2217478951737224, + "grad_norm": 0.855204827978792, + "learning_rate": 2.1735258650709538e-05, + "loss": 0.1096, + "step": 18736 + }, + { + "epoch": 2.221866476935847, + "grad_norm": 0.672713353971956, + "learning_rate": 2.1732878860108567e-05, + "loss": 0.0819, + "step": 18737 + }, + { + "epoch": 2.2219850586979724, + "grad_norm": 0.6423223416370696, + "learning_rate": 2.1730499099626452e-05, + "loss": 0.0963, + "step": 18738 + }, + { + "epoch": 2.222103640460097, + "grad_norm": 0.8017845214030946, + "learning_rate": 2.1728119369285116e-05, + "loss": 0.1171, + "step": 18739 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.5484568966715241, + "learning_rate": 2.1725739669106515e-05, + "loss": 0.0821, + "step": 18740 + }, + { + "epoch": 2.222340803984347, + "grad_norm": 0.8276292428236063, + "learning_rate": 2.1723359999112584e-05, + "loss": 0.1047, + "step": 18741 + }, + { + "epoch": 2.2224593857464723, + "grad_norm": 0.8199129944311886, + "learning_rate": 2.172098035932525e-05, + "loss": 0.1044, + "step": 18742 + }, + { + "epoch": 2.222577967508597, + "grad_norm": 0.7741103604108954, + "learning_rate": 2.171860074976645e-05, + "loss": 0.1167, + "step": 18743 + }, + { + "epoch": 2.2226965492707222, + "grad_norm": 0.9296993249199452, + "learning_rate": 2.1716221170458135e-05, + "loss": 0.1029, + "step": 18744 + }, + { + "epoch": 2.222815131032847, + "grad_norm": 0.6322449514740297, + "learning_rate": 2.1713841621422236e-05, + "loss": 0.0829, + "step": 18745 + }, + { + "epoch": 2.222933712794972, + "grad_norm": 0.7561955579018941, + "learning_rate": 2.1711462102680674e-05, + "loss": 0.096, + "step": 18746 + }, + { + "epoch": 2.223052294557097, + "grad_norm": 0.7086477882702891, + "learning_rate": 2.170908261425541e-05, + "loss": 0.0989, + "step": 18747 + }, + { + "epoch": 2.223170876319222, + "grad_norm": 0.7641369952199866, + "learning_rate": 2.1706703156168364e-05, + "loss": 0.1088, + "step": 18748 + }, + { + "epoch": 2.223289458081347, + "grad_norm": 0.8338466259243604, + "learning_rate": 2.1704323728441485e-05, + "loss": 0.0993, + "step": 18749 + }, + { + "epoch": 2.223408039843472, + "grad_norm": 0.6418169743060544, + "learning_rate": 2.1701944331096685e-05, + "loss": 0.0998, + "step": 18750 + }, + { + "epoch": 2.223526621605597, + "grad_norm": 0.7896333584908579, + "learning_rate": 2.1699564964155926e-05, + "loss": 0.112, + "step": 18751 + }, + { + "epoch": 2.223645203367722, + "grad_norm": 0.5436595342286243, + "learning_rate": 2.169718562764113e-05, + "loss": 0.0754, + "step": 18752 + }, + { + "epoch": 2.223763785129847, + "grad_norm": 0.9491578386402872, + "learning_rate": 2.1694806321574233e-05, + "loss": 0.1597, + "step": 18753 + }, + { + "epoch": 2.223882366891972, + "grad_norm": 0.7068064879579861, + "learning_rate": 2.1692427045977158e-05, + "loss": 0.107, + "step": 18754 + }, + { + "epoch": 2.224000948654097, + "grad_norm": 0.6234575465611063, + "learning_rate": 2.169004780087186e-05, + "loss": 0.0774, + "step": 18755 + }, + { + "epoch": 2.224119530416222, + "grad_norm": 1.0342195399836753, + "learning_rate": 2.1687668586280263e-05, + "loss": 0.1083, + "step": 18756 + }, + { + "epoch": 2.2242381121783468, + "grad_norm": 0.6785739803533407, + "learning_rate": 2.1685289402224292e-05, + "loss": 0.0872, + "step": 18757 + }, + { + "epoch": 2.224356693940472, + "grad_norm": 0.8490115567075731, + "learning_rate": 2.1682910248725885e-05, + "loss": 0.101, + "step": 18758 + }, + { + "epoch": 2.2244752757025967, + "grad_norm": 0.6927880533800067, + "learning_rate": 2.168053112580699e-05, + "loss": 0.0987, + "step": 18759 + }, + { + "epoch": 2.224593857464722, + "grad_norm": 0.5264706318452989, + "learning_rate": 2.1678152033489525e-05, + "loss": 0.0845, + "step": 18760 + }, + { + "epoch": 2.224712439226847, + "grad_norm": 0.7547232201709428, + "learning_rate": 2.1675772971795412e-05, + "loss": 0.1234, + "step": 18761 + }, + { + "epoch": 2.224831020988972, + "grad_norm": 0.5581102517391483, + "learning_rate": 2.1673393940746608e-05, + "loss": 0.0712, + "step": 18762 + }, + { + "epoch": 2.2249496027510967, + "grad_norm": 0.9319771514438503, + "learning_rate": 2.1671014940365036e-05, + "loss": 0.1636, + "step": 18763 + }, + { + "epoch": 2.225068184513222, + "grad_norm": 0.8595773018880462, + "learning_rate": 2.1668635970672622e-05, + "loss": 0.1056, + "step": 18764 + }, + { + "epoch": 2.225186766275347, + "grad_norm": 0.7048883200320013, + "learning_rate": 2.1666257031691286e-05, + "loss": 0.1132, + "step": 18765 + }, + { + "epoch": 2.225305348037472, + "grad_norm": 0.8996344855583275, + "learning_rate": 2.1663878123442984e-05, + "loss": 0.1061, + "step": 18766 + }, + { + "epoch": 2.225423929799597, + "grad_norm": 0.8797930047428949, + "learning_rate": 2.166149924594963e-05, + "loss": 0.1033, + "step": 18767 + }, + { + "epoch": 2.225542511561722, + "grad_norm": 0.603600491775206, + "learning_rate": 2.1659120399233166e-05, + "loss": 0.0944, + "step": 18768 + }, + { + "epoch": 2.225661093323847, + "grad_norm": 0.7087980287441159, + "learning_rate": 2.16567415833155e-05, + "loss": 0.0818, + "step": 18769 + }, + { + "epoch": 2.2257796750859717, + "grad_norm": 0.8365635078548959, + "learning_rate": 2.165436279821859e-05, + "loss": 0.1226, + "step": 18770 + }, + { + "epoch": 2.225898256848097, + "grad_norm": 0.6541044937624289, + "learning_rate": 2.1651984043964356e-05, + "loss": 0.0826, + "step": 18771 + }, + { + "epoch": 2.2260168386102217, + "grad_norm": 0.5660494797102648, + "learning_rate": 2.1649605320574717e-05, + "loss": 0.0721, + "step": 18772 + }, + { + "epoch": 2.226135420372347, + "grad_norm": 0.6366593759643671, + "learning_rate": 2.1647226628071603e-05, + "loss": 0.0908, + "step": 18773 + }, + { + "epoch": 2.2262540021344717, + "grad_norm": 0.7773936632106131, + "learning_rate": 2.1644847966476955e-05, + "loss": 0.1007, + "step": 18774 + }, + { + "epoch": 2.226372583896597, + "grad_norm": 0.7805968138256604, + "learning_rate": 2.1642469335812697e-05, + "loss": 0.0819, + "step": 18775 + }, + { + "epoch": 2.2264911656587216, + "grad_norm": 0.7487688021158309, + "learning_rate": 2.1640090736100744e-05, + "loss": 0.1095, + "step": 18776 + }, + { + "epoch": 2.226609747420847, + "grad_norm": 0.7873243018571832, + "learning_rate": 2.1637712167363042e-05, + "loss": 0.0948, + "step": 18777 + }, + { + "epoch": 2.2267283291829716, + "grad_norm": 0.5833673606835681, + "learning_rate": 2.1635333629621507e-05, + "loss": 0.0867, + "step": 18778 + }, + { + "epoch": 2.226846910945097, + "grad_norm": 0.7736650311829509, + "learning_rate": 2.1632955122898075e-05, + "loss": 0.0986, + "step": 18779 + }, + { + "epoch": 2.2269654927072216, + "grad_norm": 0.6152818324151181, + "learning_rate": 2.1630576647214657e-05, + "loss": 0.0838, + "step": 18780 + }, + { + "epoch": 2.2270840744693468, + "grad_norm": 0.693872274101368, + "learning_rate": 2.1628198202593203e-05, + "loss": 0.083, + "step": 18781 + }, + { + "epoch": 2.2272026562314715, + "grad_norm": 0.8169361671615917, + "learning_rate": 2.1625819789055623e-05, + "loss": 0.1012, + "step": 18782 + }, + { + "epoch": 2.2273212379935967, + "grad_norm": 0.6628543288112296, + "learning_rate": 2.162344140662385e-05, + "loss": 0.0642, + "step": 18783 + }, + { + "epoch": 2.2274398197557215, + "grad_norm": 0.6511974274077093, + "learning_rate": 2.1621063055319797e-05, + "loss": 0.084, + "step": 18784 + }, + { + "epoch": 2.2275584015178467, + "grad_norm": 0.6247431062310503, + "learning_rate": 2.1618684735165407e-05, + "loss": 0.0893, + "step": 18785 + }, + { + "epoch": 2.2276769832799714, + "grad_norm": 0.609766894539109, + "learning_rate": 2.1616306446182592e-05, + "loss": 0.0756, + "step": 18786 + }, + { + "epoch": 2.2277955650420966, + "grad_norm": 0.5695849437434747, + "learning_rate": 2.161392818839329e-05, + "loss": 0.0749, + "step": 18787 + }, + { + "epoch": 2.2279141468042214, + "grad_norm": 0.787348347238981, + "learning_rate": 2.1611549961819405e-05, + "loss": 0.0951, + "step": 18788 + }, + { + "epoch": 2.2280327285663466, + "grad_norm": 0.6905358469519165, + "learning_rate": 2.1609171766482882e-05, + "loss": 0.0795, + "step": 18789 + }, + { + "epoch": 2.2281513103284714, + "grad_norm": 1.0354238292423859, + "learning_rate": 2.1606793602405643e-05, + "loss": 0.125, + "step": 18790 + }, + { + "epoch": 2.2282698920905966, + "grad_norm": 0.8314321206602734, + "learning_rate": 2.160441546960959e-05, + "loss": 0.1337, + "step": 18791 + }, + { + "epoch": 2.2283884738527213, + "grad_norm": 1.1733671730858974, + "learning_rate": 2.160203736811668e-05, + "loss": 0.1607, + "step": 18792 + }, + { + "epoch": 2.2285070556148465, + "grad_norm": 0.8593392276768198, + "learning_rate": 2.1599659297948813e-05, + "loss": 0.1098, + "step": 18793 + }, + { + "epoch": 2.2286256373769713, + "grad_norm": 0.9183073058252581, + "learning_rate": 2.159728125912792e-05, + "loss": 0.1323, + "step": 18794 + }, + { + "epoch": 2.2287442191390965, + "grad_norm": 0.7059799461980566, + "learning_rate": 2.159490325167591e-05, + "loss": 0.107, + "step": 18795 + }, + { + "epoch": 2.2288628009012212, + "grad_norm": 0.8024018376737095, + "learning_rate": 2.1592525275614725e-05, + "loss": 0.1165, + "step": 18796 + }, + { + "epoch": 2.2289813826633464, + "grad_norm": 0.7223505266758906, + "learning_rate": 2.1590147330966273e-05, + "loss": 0.1147, + "step": 18797 + }, + { + "epoch": 2.229099964425471, + "grad_norm": 0.6282650319680421, + "learning_rate": 2.158776941775249e-05, + "loss": 0.0672, + "step": 18798 + }, + { + "epoch": 2.2292185461875964, + "grad_norm": 0.6831447982654134, + "learning_rate": 2.1585391535995277e-05, + "loss": 0.0955, + "step": 18799 + }, + { + "epoch": 2.229337127949721, + "grad_norm": 0.7273066142755623, + "learning_rate": 2.1583013685716575e-05, + "loss": 0.0838, + "step": 18800 + }, + { + "epoch": 2.2294557097118464, + "grad_norm": 0.7426825357542508, + "learning_rate": 2.1580635866938296e-05, + "loss": 0.0924, + "step": 18801 + }, + { + "epoch": 2.229574291473971, + "grad_norm": 0.7319850654352308, + "learning_rate": 2.1578258079682366e-05, + "loss": 0.1012, + "step": 18802 + }, + { + "epoch": 2.2296928732360963, + "grad_norm": 0.5214485258605044, + "learning_rate": 2.1575880323970688e-05, + "loss": 0.0756, + "step": 18803 + }, + { + "epoch": 2.229811454998221, + "grad_norm": 0.6081553509926184, + "learning_rate": 2.15735025998252e-05, + "loss": 0.0854, + "step": 18804 + }, + { + "epoch": 2.2299300367603463, + "grad_norm": 0.8328187106150088, + "learning_rate": 2.157112490726782e-05, + "loss": 0.1084, + "step": 18805 + }, + { + "epoch": 2.230048618522471, + "grad_norm": 0.6132669417220075, + "learning_rate": 2.1568747246320455e-05, + "loss": 0.0813, + "step": 18806 + }, + { + "epoch": 2.2301672002845963, + "grad_norm": 0.7922557391904104, + "learning_rate": 2.1566369617005046e-05, + "loss": 0.1045, + "step": 18807 + }, + { + "epoch": 2.230285782046721, + "grad_norm": 0.7801383735906623, + "learning_rate": 2.156399201934349e-05, + "loss": 0.1239, + "step": 18808 + }, + { + "epoch": 2.230404363808846, + "grad_norm": 0.7151482338731723, + "learning_rate": 2.1561614453357714e-05, + "loss": 0.0854, + "step": 18809 + }, + { + "epoch": 2.2305229455709714, + "grad_norm": 0.9638759466985427, + "learning_rate": 2.155923691906963e-05, + "loss": 0.1122, + "step": 18810 + }, + { + "epoch": 2.230641527333096, + "grad_norm": 0.7750739485000125, + "learning_rate": 2.1556859416501175e-05, + "loss": 0.1076, + "step": 18811 + }, + { + "epoch": 2.230760109095221, + "grad_norm": 0.8145145509739613, + "learning_rate": 2.155448194567425e-05, + "loss": 0.1302, + "step": 18812 + }, + { + "epoch": 2.230878690857346, + "grad_norm": 0.9073695748759717, + "learning_rate": 2.155210450661078e-05, + "loss": 0.1367, + "step": 18813 + }, + { + "epoch": 2.2309972726194713, + "grad_norm": 1.1816394577260945, + "learning_rate": 2.1549727099332664e-05, + "loss": 0.1566, + "step": 18814 + }, + { + "epoch": 2.231115854381596, + "grad_norm": 0.8331976078740441, + "learning_rate": 2.1547349723861848e-05, + "loss": 0.1104, + "step": 18815 + }, + { + "epoch": 2.2312344361437213, + "grad_norm": 0.6937543285825699, + "learning_rate": 2.1544972380220223e-05, + "loss": 0.1014, + "step": 18816 + }, + { + "epoch": 2.231353017905846, + "grad_norm": 0.7294410537668036, + "learning_rate": 2.154259506842972e-05, + "loss": 0.0954, + "step": 18817 + }, + { + "epoch": 2.2314715996679713, + "grad_norm": 0.5338344444228786, + "learning_rate": 2.154021778851225e-05, + "loss": 0.0793, + "step": 18818 + }, + { + "epoch": 2.231590181430096, + "grad_norm": 0.9035805176108189, + "learning_rate": 2.1537840540489733e-05, + "loss": 0.1282, + "step": 18819 + }, + { + "epoch": 2.2317087631922212, + "grad_norm": 0.7050253398578582, + "learning_rate": 2.1535463324384084e-05, + "loss": 0.1095, + "step": 18820 + }, + { + "epoch": 2.231827344954346, + "grad_norm": 0.9032088897333036, + "learning_rate": 2.15330861402172e-05, + "loss": 0.1387, + "step": 18821 + }, + { + "epoch": 2.231945926716471, + "grad_norm": 0.6553096511908723, + "learning_rate": 2.1530708988011023e-05, + "loss": 0.095, + "step": 18822 + }, + { + "epoch": 2.232064508478596, + "grad_norm": 0.8031440439437122, + "learning_rate": 2.1528331867787453e-05, + "loss": 0.1144, + "step": 18823 + }, + { + "epoch": 2.232183090240721, + "grad_norm": 0.7182569189548866, + "learning_rate": 2.152595477956841e-05, + "loss": 0.0861, + "step": 18824 + }, + { + "epoch": 2.232301672002846, + "grad_norm": 0.6915433891118755, + "learning_rate": 2.152357772337579e-05, + "loss": 0.0833, + "step": 18825 + }, + { + "epoch": 2.232420253764971, + "grad_norm": 0.7490665367998338, + "learning_rate": 2.1521200699231532e-05, + "loss": 0.0943, + "step": 18826 + }, + { + "epoch": 2.232538835527096, + "grad_norm": 1.2242729249336664, + "learning_rate": 2.1518823707157532e-05, + "loss": 0.1634, + "step": 18827 + }, + { + "epoch": 2.232657417289221, + "grad_norm": 1.133322763656763, + "learning_rate": 2.1516446747175713e-05, + "loss": 0.1498, + "step": 18828 + }, + { + "epoch": 2.232775999051346, + "grad_norm": 0.9999142839799646, + "learning_rate": 2.1514069819307976e-05, + "loss": 0.1121, + "step": 18829 + }, + { + "epoch": 2.232894580813471, + "grad_norm": 0.8495200033038777, + "learning_rate": 2.1511692923576246e-05, + "loss": 0.1115, + "step": 18830 + }, + { + "epoch": 2.233013162575596, + "grad_norm": 0.662889091394239, + "learning_rate": 2.1509316060002433e-05, + "loss": 0.0977, + "step": 18831 + }, + { + "epoch": 2.233131744337721, + "grad_norm": 0.8262139496396805, + "learning_rate": 2.150693922860843e-05, + "loss": 0.1093, + "step": 18832 + }, + { + "epoch": 2.2332503260998458, + "grad_norm": 0.8990809309299874, + "learning_rate": 2.150456242941618e-05, + "loss": 0.0928, + "step": 18833 + }, + { + "epoch": 2.233368907861971, + "grad_norm": 0.9379962875256497, + "learning_rate": 2.1502185662447576e-05, + "loss": 0.1235, + "step": 18834 + }, + { + "epoch": 2.2334874896240957, + "grad_norm": 0.7531394175953882, + "learning_rate": 2.149980892772453e-05, + "loss": 0.1111, + "step": 18835 + }, + { + "epoch": 2.233606071386221, + "grad_norm": 0.7997052703584738, + "learning_rate": 2.1497432225268947e-05, + "loss": 0.0936, + "step": 18836 + }, + { + "epoch": 2.2337246531483457, + "grad_norm": 0.7761639998283222, + "learning_rate": 2.1495055555102748e-05, + "loss": 0.1171, + "step": 18837 + }, + { + "epoch": 2.233843234910471, + "grad_norm": 0.6841340464595771, + "learning_rate": 2.149267891724784e-05, + "loss": 0.0957, + "step": 18838 + }, + { + "epoch": 2.2339618166725956, + "grad_norm": 0.7271188037776125, + "learning_rate": 2.1490302311726135e-05, + "loss": 0.113, + "step": 18839 + }, + { + "epoch": 2.234080398434721, + "grad_norm": 0.5344064706108158, + "learning_rate": 2.1487925738559526e-05, + "loss": 0.0826, + "step": 18840 + }, + { + "epoch": 2.2341989801968456, + "grad_norm": 0.755979095229751, + "learning_rate": 2.1485549197769945e-05, + "loss": 0.0979, + "step": 18841 + }, + { + "epoch": 2.234317561958971, + "grad_norm": 0.7890720888992585, + "learning_rate": 2.1483172689379294e-05, + "loss": 0.1094, + "step": 18842 + }, + { + "epoch": 2.2344361437210956, + "grad_norm": 0.7907479320712044, + "learning_rate": 2.1480796213409475e-05, + "loss": 0.1014, + "step": 18843 + }, + { + "epoch": 2.2345547254832208, + "grad_norm": 0.6358448794271879, + "learning_rate": 2.147841976988239e-05, + "loss": 0.084, + "step": 18844 + }, + { + "epoch": 2.2346733072453455, + "grad_norm": 0.7735682252406759, + "learning_rate": 2.1476043358819965e-05, + "loss": 0.1144, + "step": 18845 + }, + { + "epoch": 2.2347918890074707, + "grad_norm": 0.875111801265609, + "learning_rate": 2.14736669802441e-05, + "loss": 0.0965, + "step": 18846 + }, + { + "epoch": 2.2349104707695955, + "grad_norm": 0.7379134265865817, + "learning_rate": 2.1471290634176693e-05, + "loss": 0.0984, + "step": 18847 + }, + { + "epoch": 2.2350290525317207, + "grad_norm": 0.6718381116612439, + "learning_rate": 2.1468914320639662e-05, + "loss": 0.0946, + "step": 18848 + }, + { + "epoch": 2.2351476342938454, + "grad_norm": 0.7589504529752553, + "learning_rate": 2.1466538039654917e-05, + "loss": 0.0947, + "step": 18849 + }, + { + "epoch": 2.2352662160559706, + "grad_norm": 0.6800429576995427, + "learning_rate": 2.146416179124436e-05, + "loss": 0.1116, + "step": 18850 + }, + { + "epoch": 2.2353847978180954, + "grad_norm": 0.7833938571961641, + "learning_rate": 2.1461785575429882e-05, + "loss": 0.1013, + "step": 18851 + }, + { + "epoch": 2.2355033795802206, + "grad_norm": 0.5454432007354336, + "learning_rate": 2.1459409392233414e-05, + "loss": 0.0746, + "step": 18852 + }, + { + "epoch": 2.2356219613423454, + "grad_norm": 0.6861710359905958, + "learning_rate": 2.1457033241676848e-05, + "loss": 0.0745, + "step": 18853 + }, + { + "epoch": 2.2357405431044706, + "grad_norm": 0.6380638803104269, + "learning_rate": 2.1454657123782094e-05, + "loss": 0.0828, + "step": 18854 + }, + { + "epoch": 2.2358591248665953, + "grad_norm": 0.6411095310624352, + "learning_rate": 2.145228103857104e-05, + "loss": 0.088, + "step": 18855 + }, + { + "epoch": 2.2359777066287205, + "grad_norm": 0.8277616572126475, + "learning_rate": 2.1449904986065617e-05, + "loss": 0.1116, + "step": 18856 + }, + { + "epoch": 2.2360962883908453, + "grad_norm": 0.8441712303381849, + "learning_rate": 2.144752896628771e-05, + "loss": 0.092, + "step": 18857 + }, + { + "epoch": 2.2362148701529705, + "grad_norm": 0.8235665871619438, + "learning_rate": 2.1445152979259237e-05, + "loss": 0.1193, + "step": 18858 + }, + { + "epoch": 2.2363334519150957, + "grad_norm": 0.747603940495877, + "learning_rate": 2.144277702500208e-05, + "loss": 0.0924, + "step": 18859 + }, + { + "epoch": 2.2364520336772205, + "grad_norm": 0.6985403316308801, + "learning_rate": 2.1440401103538172e-05, + "loss": 0.1115, + "step": 18860 + }, + { + "epoch": 2.236570615439345, + "grad_norm": 0.5831015556023748, + "learning_rate": 2.1438025214889395e-05, + "loss": 0.0683, + "step": 18861 + }, + { + "epoch": 2.2366891972014704, + "grad_norm": 0.7646415199579337, + "learning_rate": 2.143564935907765e-05, + "loss": 0.1064, + "step": 18862 + }, + { + "epoch": 2.2368077789635956, + "grad_norm": 0.9318470984950791, + "learning_rate": 2.143327353612486e-05, + "loss": 0.1177, + "step": 18863 + }, + { + "epoch": 2.2369263607257204, + "grad_norm": 0.7586036405627115, + "learning_rate": 2.1430897746052912e-05, + "loss": 0.0942, + "step": 18864 + }, + { + "epoch": 2.237044942487845, + "grad_norm": 0.8410941835501364, + "learning_rate": 2.1428521988883706e-05, + "loss": 0.1103, + "step": 18865 + }, + { + "epoch": 2.2371635242499703, + "grad_norm": 0.5670953140084716, + "learning_rate": 2.1426146264639145e-05, + "loss": 0.0661, + "step": 18866 + }, + { + "epoch": 2.2372821060120955, + "grad_norm": 0.6849696905241878, + "learning_rate": 2.1423770573341133e-05, + "loss": 0.1142, + "step": 18867 + }, + { + "epoch": 2.2374006877742203, + "grad_norm": 1.0178732364438488, + "learning_rate": 2.1421394915011578e-05, + "loss": 0.1404, + "step": 18868 + }, + { + "epoch": 2.2375192695363455, + "grad_norm": 0.7417392836522756, + "learning_rate": 2.141901928967237e-05, + "loss": 0.1105, + "step": 18869 + }, + { + "epoch": 2.2376378512984703, + "grad_norm": 0.8306179505585711, + "learning_rate": 2.1416643697345405e-05, + "loss": 0.0786, + "step": 18870 + }, + { + "epoch": 2.2377564330605955, + "grad_norm": 0.8543598082115276, + "learning_rate": 2.1414268138052603e-05, + "loss": 0.0845, + "step": 18871 + }, + { + "epoch": 2.23787501482272, + "grad_norm": 0.6520212588362649, + "learning_rate": 2.141189261181585e-05, + "loss": 0.1006, + "step": 18872 + }, + { + "epoch": 2.2379935965848454, + "grad_norm": 0.5386387311043194, + "learning_rate": 2.1409517118657046e-05, + "loss": 0.0788, + "step": 18873 + }, + { + "epoch": 2.23811217834697, + "grad_norm": 0.6834238086064753, + "learning_rate": 2.1407141658598084e-05, + "loss": 0.0774, + "step": 18874 + }, + { + "epoch": 2.2382307601090954, + "grad_norm": 0.7645859343333278, + "learning_rate": 2.1404766231660878e-05, + "loss": 0.0951, + "step": 18875 + }, + { + "epoch": 2.23834934187122, + "grad_norm": 0.7661819972371129, + "learning_rate": 2.1402390837867315e-05, + "loss": 0.1138, + "step": 18876 + }, + { + "epoch": 2.2384679236333453, + "grad_norm": 0.6632013097506498, + "learning_rate": 2.1400015477239294e-05, + "loss": 0.1125, + "step": 18877 + }, + { + "epoch": 2.23858650539547, + "grad_norm": 0.7099746074185574, + "learning_rate": 2.1397640149798718e-05, + "loss": 0.0879, + "step": 18878 + }, + { + "epoch": 2.2387050871575953, + "grad_norm": 0.6229539869285571, + "learning_rate": 2.139526485556749e-05, + "loss": 0.1106, + "step": 18879 + }, + { + "epoch": 2.23882366891972, + "grad_norm": 0.7134464996813806, + "learning_rate": 2.1392889594567496e-05, + "loss": 0.1187, + "step": 18880 + }, + { + "epoch": 2.2389422506818453, + "grad_norm": 0.6717632038325828, + "learning_rate": 2.1390514366820624e-05, + "loss": 0.102, + "step": 18881 + }, + { + "epoch": 2.23906083244397, + "grad_norm": 0.5955593377091798, + "learning_rate": 2.13881391723488e-05, + "loss": 0.083, + "step": 18882 + }, + { + "epoch": 2.2391794142060952, + "grad_norm": 0.8472397525940243, + "learning_rate": 2.13857640111739e-05, + "loss": 0.1122, + "step": 18883 + }, + { + "epoch": 2.23929799596822, + "grad_norm": 0.9843788050704898, + "learning_rate": 2.138338888331782e-05, + "loss": 0.1483, + "step": 18884 + }, + { + "epoch": 2.239416577730345, + "grad_norm": 0.7358397207627863, + "learning_rate": 2.1381013788802454e-05, + "loss": 0.1018, + "step": 18885 + }, + { + "epoch": 2.23953515949247, + "grad_norm": 0.6440591621804126, + "learning_rate": 2.1378638727649714e-05, + "loss": 0.0997, + "step": 18886 + }, + { + "epoch": 2.239653741254595, + "grad_norm": 0.7018834654611592, + "learning_rate": 2.1376263699881474e-05, + "loss": 0.0827, + "step": 18887 + }, + { + "epoch": 2.23977232301672, + "grad_norm": 0.5952155952241978, + "learning_rate": 2.1373888705519643e-05, + "loss": 0.0968, + "step": 18888 + }, + { + "epoch": 2.239890904778845, + "grad_norm": 0.6913547250129303, + "learning_rate": 2.137151374458611e-05, + "loss": 0.0896, + "step": 18889 + }, + { + "epoch": 2.24000948654097, + "grad_norm": 0.6219516172228303, + "learning_rate": 2.1369138817102776e-05, + "loss": 0.0857, + "step": 18890 + }, + { + "epoch": 2.240128068303095, + "grad_norm": 0.6545288669349081, + "learning_rate": 2.136676392309153e-05, + "loss": 0.0909, + "step": 18891 + }, + { + "epoch": 2.24024665006522, + "grad_norm": 0.6263711047690557, + "learning_rate": 2.136438906257426e-05, + "loss": 0.088, + "step": 18892 + }, + { + "epoch": 2.240365231827345, + "grad_norm": 0.7130280433219129, + "learning_rate": 2.1362014235572868e-05, + "loss": 0.1028, + "step": 18893 + }, + { + "epoch": 2.24048381358947, + "grad_norm": 0.7952716289697666, + "learning_rate": 2.1359639442109245e-05, + "loss": 0.0793, + "step": 18894 + }, + { + "epoch": 2.240602395351595, + "grad_norm": 0.8678019998458418, + "learning_rate": 2.1357264682205277e-05, + "loss": 0.1181, + "step": 18895 + }, + { + "epoch": 2.2407209771137198, + "grad_norm": 0.9001229098397029, + "learning_rate": 2.1354889955882862e-05, + "loss": 0.1153, + "step": 18896 + }, + { + "epoch": 2.240839558875845, + "grad_norm": 0.599454135667106, + "learning_rate": 2.135251526316389e-05, + "loss": 0.084, + "step": 18897 + }, + { + "epoch": 2.2409581406379697, + "grad_norm": 1.3688572755857746, + "learning_rate": 2.1350140604070264e-05, + "loss": 0.243, + "step": 18898 + }, + { + "epoch": 2.241076722400095, + "grad_norm": 0.8259674590773197, + "learning_rate": 2.1347765978623863e-05, + "loss": 0.1063, + "step": 18899 + }, + { + "epoch": 2.2411953041622197, + "grad_norm": 0.6359413094814956, + "learning_rate": 2.1345391386846574e-05, + "loss": 0.0935, + "step": 18900 + }, + { + "epoch": 2.241313885924345, + "grad_norm": 0.70845256581606, + "learning_rate": 2.13430168287603e-05, + "loss": 0.085, + "step": 18901 + }, + { + "epoch": 2.2414324676864696, + "grad_norm": 1.0382199812724466, + "learning_rate": 2.134064230438693e-05, + "loss": 0.1304, + "step": 18902 + }, + { + "epoch": 2.241551049448595, + "grad_norm": 0.8277205927541627, + "learning_rate": 2.1338267813748335e-05, + "loss": 0.0987, + "step": 18903 + }, + { + "epoch": 2.2416696312107196, + "grad_norm": 0.8075284445579336, + "learning_rate": 2.1335893356866437e-05, + "loss": 0.1025, + "step": 18904 + }, + { + "epoch": 2.241788212972845, + "grad_norm": 0.8756098676671197, + "learning_rate": 2.1333518933763108e-05, + "loss": 0.1143, + "step": 18905 + }, + { + "epoch": 2.2419067947349696, + "grad_norm": 0.8035422139112733, + "learning_rate": 2.133114454446023e-05, + "loss": 0.0828, + "step": 18906 + }, + { + "epoch": 2.2420253764970948, + "grad_norm": 0.9547125371853618, + "learning_rate": 2.13287701889797e-05, + "loss": 0.1474, + "step": 18907 + }, + { + "epoch": 2.2421439582592195, + "grad_norm": 0.48853763498941616, + "learning_rate": 2.1326395867343414e-05, + "loss": 0.075, + "step": 18908 + }, + { + "epoch": 2.2422625400213447, + "grad_norm": 0.7224447801368307, + "learning_rate": 2.132402157957325e-05, + "loss": 0.0956, + "step": 18909 + }, + { + "epoch": 2.2423811217834695, + "grad_norm": 0.47093722074855326, + "learning_rate": 2.1321647325691105e-05, + "loss": 0.0737, + "step": 18910 + }, + { + "epoch": 2.2424997035455947, + "grad_norm": 0.6743478127427879, + "learning_rate": 2.1319273105718847e-05, + "loss": 0.0899, + "step": 18911 + }, + { + "epoch": 2.24261828530772, + "grad_norm": 0.69067812164709, + "learning_rate": 2.131689891967839e-05, + "loss": 0.0889, + "step": 18912 + }, + { + "epoch": 2.2427368670698447, + "grad_norm": 0.7939170693137828, + "learning_rate": 2.1314524767591607e-05, + "loss": 0.1069, + "step": 18913 + }, + { + "epoch": 2.2428554488319694, + "grad_norm": 0.8261261696780542, + "learning_rate": 2.131215064948038e-05, + "loss": 0.1189, + "step": 18914 + }, + { + "epoch": 2.2429740305940946, + "grad_norm": 0.6709396937455359, + "learning_rate": 2.1309776565366603e-05, + "loss": 0.0775, + "step": 18915 + }, + { + "epoch": 2.24309261235622, + "grad_norm": 0.8673974138933519, + "learning_rate": 2.1307402515272162e-05, + "loss": 0.1319, + "step": 18916 + }, + { + "epoch": 2.2432111941183446, + "grad_norm": 1.022985165412336, + "learning_rate": 2.1305028499218946e-05, + "loss": 0.1237, + "step": 18917 + }, + { + "epoch": 2.24332977588047, + "grad_norm": 0.6265655882377251, + "learning_rate": 2.1302654517228827e-05, + "loss": 0.0963, + "step": 18918 + }, + { + "epoch": 2.2434483576425945, + "grad_norm": 0.807205876697594, + "learning_rate": 2.1300280569323707e-05, + "loss": 0.1129, + "step": 18919 + }, + { + "epoch": 2.2435669394047197, + "grad_norm": 1.1017051998758185, + "learning_rate": 2.1297906655525464e-05, + "loss": 0.1503, + "step": 18920 + }, + { + "epoch": 2.2436855211668445, + "grad_norm": 0.7684624117905947, + "learning_rate": 2.1295532775855983e-05, + "loss": 0.1096, + "step": 18921 + }, + { + "epoch": 2.2438041029289697, + "grad_norm": 0.5204420018695394, + "learning_rate": 2.1293158930337136e-05, + "loss": 0.08, + "step": 18922 + }, + { + "epoch": 2.2439226846910945, + "grad_norm": 1.0760012758866255, + "learning_rate": 2.129078511899083e-05, + "loss": 0.1941, + "step": 18923 + }, + { + "epoch": 2.2440412664532197, + "grad_norm": 0.5774032878535313, + "learning_rate": 2.1288411341838934e-05, + "loss": 0.0731, + "step": 18924 + }, + { + "epoch": 2.2441598482153444, + "grad_norm": 0.7014611983506263, + "learning_rate": 2.128603759890333e-05, + "loss": 0.1102, + "step": 18925 + }, + { + "epoch": 2.2442784299774696, + "grad_norm": 0.7083999328470206, + "learning_rate": 2.1283663890205905e-05, + "loss": 0.0918, + "step": 18926 + }, + { + "epoch": 2.2443970117395944, + "grad_norm": 0.5775466347580386, + "learning_rate": 2.128129021576854e-05, + "loss": 0.0846, + "step": 18927 + }, + { + "epoch": 2.2445155935017196, + "grad_norm": 0.5489275259153773, + "learning_rate": 2.127891657561313e-05, + "loss": 0.0874, + "step": 18928 + }, + { + "epoch": 2.2446341752638443, + "grad_norm": 0.7412139072646254, + "learning_rate": 2.127654296976154e-05, + "loss": 0.0916, + "step": 18929 + }, + { + "epoch": 2.2447527570259695, + "grad_norm": 0.6859437068761141, + "learning_rate": 2.127416939823565e-05, + "loss": 0.1018, + "step": 18930 + }, + { + "epoch": 2.2448713387880943, + "grad_norm": 0.7469130187199577, + "learning_rate": 2.1271795861057358e-05, + "loss": 0.1051, + "step": 18931 + }, + { + "epoch": 2.2449899205502195, + "grad_norm": 0.8873685255282718, + "learning_rate": 2.1269422358248536e-05, + "loss": 0.1223, + "step": 18932 + }, + { + "epoch": 2.2451085023123443, + "grad_norm": 0.9610368627359948, + "learning_rate": 2.1267048889831055e-05, + "loss": 0.1028, + "step": 18933 + }, + { + "epoch": 2.2452270840744695, + "grad_norm": 0.5981277406930261, + "learning_rate": 2.1264675455826818e-05, + "loss": 0.0922, + "step": 18934 + }, + { + "epoch": 2.2453456658365942, + "grad_norm": 0.6826640567441972, + "learning_rate": 2.1262302056257692e-05, + "loss": 0.108, + "step": 18935 + }, + { + "epoch": 2.2454642475987194, + "grad_norm": 0.6461657561206243, + "learning_rate": 2.1259928691145552e-05, + "loss": 0.0999, + "step": 18936 + }, + { + "epoch": 2.245582829360844, + "grad_norm": 0.8322745827744034, + "learning_rate": 2.125755536051228e-05, + "loss": 0.1181, + "step": 18937 + }, + { + "epoch": 2.2457014111229694, + "grad_norm": 0.9613591938960737, + "learning_rate": 2.1255182064379763e-05, + "loss": 0.1336, + "step": 18938 + }, + { + "epoch": 2.245819992885094, + "grad_norm": 0.8883616601047762, + "learning_rate": 2.125280880276988e-05, + "loss": 0.089, + "step": 18939 + }, + { + "epoch": 2.2459385746472194, + "grad_norm": 0.563186814226707, + "learning_rate": 2.1250435575704504e-05, + "loss": 0.0893, + "step": 18940 + }, + { + "epoch": 2.246057156409344, + "grad_norm": 0.6873457316351242, + "learning_rate": 2.12480623832055e-05, + "loss": 0.0844, + "step": 18941 + }, + { + "epoch": 2.2461757381714693, + "grad_norm": 0.7606682796626696, + "learning_rate": 2.1245689225294778e-05, + "loss": 0.1063, + "step": 18942 + }, + { + "epoch": 2.246294319933594, + "grad_norm": 0.7517825449773925, + "learning_rate": 2.1243316101994192e-05, + "loss": 0.1097, + "step": 18943 + }, + { + "epoch": 2.2464129016957193, + "grad_norm": 0.699587381222274, + "learning_rate": 2.124094301332562e-05, + "loss": 0.0892, + "step": 18944 + }, + { + "epoch": 2.246531483457844, + "grad_norm": 0.7711457725122731, + "learning_rate": 2.1238569959310944e-05, + "loss": 0.1074, + "step": 18945 + }, + { + "epoch": 2.2466500652199692, + "grad_norm": 0.7756336214677487, + "learning_rate": 2.1236196939972042e-05, + "loss": 0.1097, + "step": 18946 + }, + { + "epoch": 2.246768646982094, + "grad_norm": 0.8654655698138872, + "learning_rate": 2.123382395533079e-05, + "loss": 0.1148, + "step": 18947 + }, + { + "epoch": 2.246887228744219, + "grad_norm": 1.1143090698948723, + "learning_rate": 2.1231451005409055e-05, + "loss": 0.1679, + "step": 18948 + }, + { + "epoch": 2.247005810506344, + "grad_norm": 0.7503619400576503, + "learning_rate": 2.122907809022873e-05, + "loss": 0.1159, + "step": 18949 + }, + { + "epoch": 2.247124392268469, + "grad_norm": 0.5276034584792226, + "learning_rate": 2.1226705209811686e-05, + "loss": 0.0754, + "step": 18950 + }, + { + "epoch": 2.247242974030594, + "grad_norm": 0.6056072309389757, + "learning_rate": 2.122433236417979e-05, + "loss": 0.0681, + "step": 18951 + }, + { + "epoch": 2.247361555792719, + "grad_norm": 0.8157470344104641, + "learning_rate": 2.1221959553354904e-05, + "loss": 0.1008, + "step": 18952 + }, + { + "epoch": 2.247480137554844, + "grad_norm": 0.9269273638921808, + "learning_rate": 2.1219586777358932e-05, + "loss": 0.1545, + "step": 18953 + }, + { + "epoch": 2.247598719316969, + "grad_norm": 0.43468208959259647, + "learning_rate": 2.1217214036213734e-05, + "loss": 0.0601, + "step": 18954 + }, + { + "epoch": 2.247717301079094, + "grad_norm": 0.87788732550073, + "learning_rate": 2.121484132994118e-05, + "loss": 0.0789, + "step": 18955 + }, + { + "epoch": 2.247835882841219, + "grad_norm": 1.0342873096727407, + "learning_rate": 2.1212468658563146e-05, + "loss": 0.0945, + "step": 18956 + }, + { + "epoch": 2.247954464603344, + "grad_norm": 0.45864633357910395, + "learning_rate": 2.1210096022101507e-05, + "loss": 0.0602, + "step": 18957 + }, + { + "epoch": 2.248073046365469, + "grad_norm": 0.7505465678781328, + "learning_rate": 2.1207723420578136e-05, + "loss": 0.095, + "step": 18958 + }, + { + "epoch": 2.2481916281275938, + "grad_norm": 0.5777824008204963, + "learning_rate": 2.120535085401491e-05, + "loss": 0.0759, + "step": 18959 + }, + { + "epoch": 2.248310209889719, + "grad_norm": 0.5415957483720728, + "learning_rate": 2.1202978322433683e-05, + "loss": 0.0789, + "step": 18960 + }, + { + "epoch": 2.248428791651844, + "grad_norm": 0.744181198473498, + "learning_rate": 2.120060582585635e-05, + "loss": 0.0787, + "step": 18961 + }, + { + "epoch": 2.248547373413969, + "grad_norm": 0.6184984687924054, + "learning_rate": 2.1198233364304774e-05, + "loss": 0.0875, + "step": 18962 + }, + { + "epoch": 2.2486659551760937, + "grad_norm": 0.9848627018416894, + "learning_rate": 2.1195860937800817e-05, + "loss": 0.1276, + "step": 18963 + }, + { + "epoch": 2.248784536938219, + "grad_norm": 0.6739047177239939, + "learning_rate": 2.119348854636636e-05, + "loss": 0.0723, + "step": 18964 + }, + { + "epoch": 2.248903118700344, + "grad_norm": 0.5664251003846466, + "learning_rate": 2.1191116190023268e-05, + "loss": 0.078, + "step": 18965 + }, + { + "epoch": 2.249021700462469, + "grad_norm": 0.6816252722221686, + "learning_rate": 2.1188743868793424e-05, + "loss": 0.0867, + "step": 18966 + }, + { + "epoch": 2.249140282224594, + "grad_norm": 0.8814507342848475, + "learning_rate": 2.1186371582698672e-05, + "loss": 0.1254, + "step": 18967 + }, + { + "epoch": 2.249258863986719, + "grad_norm": 0.7701764114053672, + "learning_rate": 2.1183999331760908e-05, + "loss": 0.0853, + "step": 18968 + }, + { + "epoch": 2.249377445748844, + "grad_norm": 0.8526434642784664, + "learning_rate": 2.1181627116001997e-05, + "loss": 0.1219, + "step": 18969 + }, + { + "epoch": 2.2494960275109688, + "grad_norm": 0.5323647228391825, + "learning_rate": 2.1179254935443793e-05, + "loss": 0.0594, + "step": 18970 + }, + { + "epoch": 2.249614609273094, + "grad_norm": 0.8569539658646298, + "learning_rate": 2.1176882790108167e-05, + "loss": 0.1509, + "step": 18971 + }, + { + "epoch": 2.2497331910352187, + "grad_norm": 0.658287323596999, + "learning_rate": 2.1174510680017003e-05, + "loss": 0.1064, + "step": 18972 + }, + { + "epoch": 2.249851772797344, + "grad_norm": 0.7188078596127473, + "learning_rate": 2.117213860519216e-05, + "loss": 0.0781, + "step": 18973 + }, + { + "epoch": 2.2499703545594687, + "grad_norm": 0.6536073128512936, + "learning_rate": 2.1169766565655496e-05, + "loss": 0.0818, + "step": 18974 + }, + { + "epoch": 2.250088936321594, + "grad_norm": 0.5811204961292361, + "learning_rate": 2.1167394561428898e-05, + "loss": 0.0623, + "step": 18975 + }, + { + "epoch": 2.2502075180837187, + "grad_norm": 0.9489249789868841, + "learning_rate": 2.1165022592534212e-05, + "loss": 0.1291, + "step": 18976 + }, + { + "epoch": 2.250326099845844, + "grad_norm": 0.6894935899382619, + "learning_rate": 2.1162650658993323e-05, + "loss": 0.0872, + "step": 18977 + }, + { + "epoch": 2.2504446816079686, + "grad_norm": 0.9497802059269165, + "learning_rate": 2.116027876082808e-05, + "loss": 0.1596, + "step": 18978 + }, + { + "epoch": 2.250563263370094, + "grad_norm": 0.5695756401563482, + "learning_rate": 2.115790689806037e-05, + "loss": 0.0776, + "step": 18979 + }, + { + "epoch": 2.2506818451322186, + "grad_norm": 0.773402356391573, + "learning_rate": 2.1155535070712042e-05, + "loss": 0.1206, + "step": 18980 + }, + { + "epoch": 2.250800426894344, + "grad_norm": 0.6024278135369523, + "learning_rate": 2.1153163278804972e-05, + "loss": 0.0781, + "step": 18981 + }, + { + "epoch": 2.2509190086564685, + "grad_norm": 0.7188108336265119, + "learning_rate": 2.1150791522361003e-05, + "loss": 0.0939, + "step": 18982 + }, + { + "epoch": 2.2510375904185937, + "grad_norm": 0.6320656859745113, + "learning_rate": 2.114841980140203e-05, + "loss": 0.0955, + "step": 18983 + }, + { + "epoch": 2.2511561721807185, + "grad_norm": 0.5944094817620409, + "learning_rate": 2.1146048115949905e-05, + "loss": 0.0792, + "step": 18984 + }, + { + "epoch": 2.2512747539428437, + "grad_norm": 0.8517744893713042, + "learning_rate": 2.1143676466026482e-05, + "loss": 0.1172, + "step": 18985 + }, + { + "epoch": 2.2513933357049685, + "grad_norm": 1.3934710464685394, + "learning_rate": 2.1141304851653635e-05, + "loss": 0.1572, + "step": 18986 + }, + { + "epoch": 2.2515119174670937, + "grad_norm": 0.7979234048921818, + "learning_rate": 2.1138933272853224e-05, + "loss": 0.1066, + "step": 18987 + }, + { + "epoch": 2.2516304992292184, + "grad_norm": 0.9094071662287244, + "learning_rate": 2.113656172964712e-05, + "loss": 0.1265, + "step": 18988 + }, + { + "epoch": 2.2517490809913436, + "grad_norm": 0.7743089794025334, + "learning_rate": 2.1134190222057166e-05, + "loss": 0.1006, + "step": 18989 + }, + { + "epoch": 2.2518676627534684, + "grad_norm": 0.7411886064193347, + "learning_rate": 2.113181875010525e-05, + "loss": 0.1234, + "step": 18990 + }, + { + "epoch": 2.2519862445155936, + "grad_norm": 0.8205624181656376, + "learning_rate": 2.112944731381322e-05, + "loss": 0.1017, + "step": 18991 + }, + { + "epoch": 2.2521048262777184, + "grad_norm": 0.9378917442216863, + "learning_rate": 2.1127075913202942e-05, + "loss": 0.1244, + "step": 18992 + }, + { + "epoch": 2.2522234080398436, + "grad_norm": 0.6419689555143541, + "learning_rate": 2.112470454829626e-05, + "loss": 0.0778, + "step": 18993 + }, + { + "epoch": 2.2523419898019683, + "grad_norm": 0.9297988497950536, + "learning_rate": 2.1122333219115063e-05, + "loss": 0.117, + "step": 18994 + }, + { + "epoch": 2.2524605715640935, + "grad_norm": 0.6609726618094524, + "learning_rate": 2.1119961925681198e-05, + "loss": 0.0956, + "step": 18995 + }, + { + "epoch": 2.2525791533262183, + "grad_norm": 0.935873103125508, + "learning_rate": 2.1117590668016527e-05, + "loss": 0.1222, + "step": 18996 + }, + { + "epoch": 2.2526977350883435, + "grad_norm": 0.65742140806973, + "learning_rate": 2.1115219446142898e-05, + "loss": 0.0887, + "step": 18997 + }, + { + "epoch": 2.2528163168504682, + "grad_norm": 0.9869060373985011, + "learning_rate": 2.1112848260082195e-05, + "loss": 0.1335, + "step": 18998 + }, + { + "epoch": 2.2529348986125934, + "grad_norm": 0.7048185737884956, + "learning_rate": 2.111047710985626e-05, + "loss": 0.0869, + "step": 18999 + }, + { + "epoch": 2.253053480374718, + "grad_norm": 0.8199998674400801, + "learning_rate": 2.110810599548696e-05, + "loss": 0.1015, + "step": 19000 + }, + { + "epoch": 2.2531720621368434, + "grad_norm": 0.6994936897709783, + "learning_rate": 2.1105734916996137e-05, + "loss": 0.0765, + "step": 19001 + }, + { + "epoch": 2.253290643898968, + "grad_norm": 0.7611550129695417, + "learning_rate": 2.1103363874405675e-05, + "loss": 0.0943, + "step": 19002 + }, + { + "epoch": 2.2534092256610934, + "grad_norm": 0.8930682456829255, + "learning_rate": 2.110099286773742e-05, + "loss": 0.1411, + "step": 19003 + }, + { + "epoch": 2.253527807423218, + "grad_norm": 0.7074982654728084, + "learning_rate": 2.109862189701322e-05, + "loss": 0.0889, + "step": 19004 + }, + { + "epoch": 2.2536463891853433, + "grad_norm": 0.5736377549670725, + "learning_rate": 2.1096250962254947e-05, + "loss": 0.0702, + "step": 19005 + }, + { + "epoch": 2.2537649709474685, + "grad_norm": 0.922325299402054, + "learning_rate": 2.1093880063484452e-05, + "loss": 0.1302, + "step": 19006 + }, + { + "epoch": 2.2538835527095933, + "grad_norm": 0.939569258804825, + "learning_rate": 2.1091509200723597e-05, + "loss": 0.1074, + "step": 19007 + }, + { + "epoch": 2.254002134471718, + "grad_norm": 0.9180041148578808, + "learning_rate": 2.1089138373994223e-05, + "loss": 0.1106, + "step": 19008 + }, + { + "epoch": 2.2541207162338432, + "grad_norm": 1.1532584644064932, + "learning_rate": 2.1086767583318213e-05, + "loss": 0.1548, + "step": 19009 + }, + { + "epoch": 2.2542392979959684, + "grad_norm": 0.7597800533335355, + "learning_rate": 2.10843968287174e-05, + "loss": 0.1046, + "step": 19010 + }, + { + "epoch": 2.254357879758093, + "grad_norm": 0.6806702321968584, + "learning_rate": 2.1082026110213654e-05, + "loss": 0.0827, + "step": 19011 + }, + { + "epoch": 2.254476461520218, + "grad_norm": 0.6461291521130037, + "learning_rate": 2.1079655427828807e-05, + "loss": 0.0922, + "step": 19012 + }, + { + "epoch": 2.254595043282343, + "grad_norm": 0.6973348020273231, + "learning_rate": 2.1077284781584743e-05, + "loss": 0.103, + "step": 19013 + }, + { + "epoch": 2.2547136250444684, + "grad_norm": 0.7130398927111192, + "learning_rate": 2.10749141715033e-05, + "loss": 0.0944, + "step": 19014 + }, + { + "epoch": 2.254832206806593, + "grad_norm": 0.8552868569840796, + "learning_rate": 2.107254359760633e-05, + "loss": 0.118, + "step": 19015 + }, + { + "epoch": 2.254950788568718, + "grad_norm": 0.6076907873620738, + "learning_rate": 2.1070173059915692e-05, + "loss": 0.0888, + "step": 19016 + }, + { + "epoch": 2.255069370330843, + "grad_norm": 1.0245930457541086, + "learning_rate": 2.106780255845324e-05, + "loss": 0.1536, + "step": 19017 + }, + { + "epoch": 2.2551879520929683, + "grad_norm": 0.6787786062576756, + "learning_rate": 2.1065432093240832e-05, + "loss": 0.084, + "step": 19018 + }, + { + "epoch": 2.255306533855093, + "grad_norm": 0.7891600963867963, + "learning_rate": 2.1063061664300305e-05, + "loss": 0.1107, + "step": 19019 + }, + { + "epoch": 2.2554251156172183, + "grad_norm": 0.6550424597603444, + "learning_rate": 2.106069127165353e-05, + "loss": 0.0768, + "step": 19020 + }, + { + "epoch": 2.255543697379343, + "grad_norm": 0.8972838959045875, + "learning_rate": 2.1058320915322356e-05, + "loss": 0.1442, + "step": 19021 + }, + { + "epoch": 2.255662279141468, + "grad_norm": 0.655910352025063, + "learning_rate": 2.1055950595328626e-05, + "loss": 0.0761, + "step": 19022 + }, + { + "epoch": 2.255780860903593, + "grad_norm": 0.5576248863794625, + "learning_rate": 2.1053580311694182e-05, + "loss": 0.0858, + "step": 19023 + }, + { + "epoch": 2.255899442665718, + "grad_norm": 0.6186647790286841, + "learning_rate": 2.10512100644409e-05, + "loss": 0.08, + "step": 19024 + }, + { + "epoch": 2.256018024427843, + "grad_norm": 0.7670979376099206, + "learning_rate": 2.1048839853590616e-05, + "loss": 0.1076, + "step": 19025 + }, + { + "epoch": 2.256136606189968, + "grad_norm": 1.0035868970443778, + "learning_rate": 2.1046469679165187e-05, + "loss": 0.1506, + "step": 19026 + }, + { + "epoch": 2.256255187952093, + "grad_norm": 0.7957271525735949, + "learning_rate": 2.1044099541186448e-05, + "loss": 0.094, + "step": 19027 + }, + { + "epoch": 2.256373769714218, + "grad_norm": 0.5165895253654987, + "learning_rate": 2.104172943967627e-05, + "loss": 0.0895, + "step": 19028 + }, + { + "epoch": 2.256492351476343, + "grad_norm": 0.9626625646082806, + "learning_rate": 2.1039359374656498e-05, + "loss": 0.1686, + "step": 19029 + }, + { + "epoch": 2.256610933238468, + "grad_norm": 1.0655772423066538, + "learning_rate": 2.103698934614897e-05, + "loss": 0.135, + "step": 19030 + }, + { + "epoch": 2.256729515000593, + "grad_norm": 0.7642193498387415, + "learning_rate": 2.1034619354175533e-05, + "loss": 0.1002, + "step": 19031 + }, + { + "epoch": 2.256848096762718, + "grad_norm": 0.606739170951713, + "learning_rate": 2.1032249398758055e-05, + "loss": 0.0802, + "step": 19032 + }, + { + "epoch": 2.256966678524843, + "grad_norm": 0.7962680248922646, + "learning_rate": 2.1029879479918367e-05, + "loss": 0.1229, + "step": 19033 + }, + { + "epoch": 2.257085260286968, + "grad_norm": 0.7406737158630667, + "learning_rate": 2.1027509597678316e-05, + "loss": 0.0813, + "step": 19034 + }, + { + "epoch": 2.2572038420490927, + "grad_norm": 0.7273602578594168, + "learning_rate": 2.1025139752059764e-05, + "loss": 0.0862, + "step": 19035 + }, + { + "epoch": 2.257322423811218, + "grad_norm": 0.7650585843566732, + "learning_rate": 2.1022769943084542e-05, + "loss": 0.0979, + "step": 19036 + }, + { + "epoch": 2.2574410055733427, + "grad_norm": 0.6316738482405768, + "learning_rate": 2.1020400170774513e-05, + "loss": 0.0877, + "step": 19037 + }, + { + "epoch": 2.257559587335468, + "grad_norm": 0.952254904244572, + "learning_rate": 2.1018030435151498e-05, + "loss": 0.1347, + "step": 19038 + }, + { + "epoch": 2.2576781690975927, + "grad_norm": 0.6579637189630273, + "learning_rate": 2.1015660736237375e-05, + "loss": 0.0941, + "step": 19039 + }, + { + "epoch": 2.257796750859718, + "grad_norm": 1.0251592602927104, + "learning_rate": 2.1013291074053973e-05, + "loss": 0.0992, + "step": 19040 + }, + { + "epoch": 2.2579153326218426, + "grad_norm": 1.0283246201044156, + "learning_rate": 2.1010921448623136e-05, + "loss": 0.1544, + "step": 19041 + }, + { + "epoch": 2.258033914383968, + "grad_norm": 0.6158578581907558, + "learning_rate": 2.1008551859966703e-05, + "loss": 0.0915, + "step": 19042 + }, + { + "epoch": 2.2581524961460926, + "grad_norm": 0.7697647652226256, + "learning_rate": 2.1006182308106537e-05, + "loss": 0.1067, + "step": 19043 + }, + { + "epoch": 2.258271077908218, + "grad_norm": 0.6181553755303844, + "learning_rate": 2.100381279306447e-05, + "loss": 0.0806, + "step": 19044 + }, + { + "epoch": 2.2583896596703426, + "grad_norm": 1.042292609688342, + "learning_rate": 2.1001443314862353e-05, + "loss": 0.0997, + "step": 19045 + }, + { + "epoch": 2.2585082414324678, + "grad_norm": 0.7338877948864893, + "learning_rate": 2.099907387352201e-05, + "loss": 0.0757, + "step": 19046 + }, + { + "epoch": 2.2586268231945925, + "grad_norm": 0.7240121964295021, + "learning_rate": 2.0996704469065314e-05, + "loss": 0.0966, + "step": 19047 + }, + { + "epoch": 2.2587454049567177, + "grad_norm": 0.7766283705011711, + "learning_rate": 2.0994335101514096e-05, + "loss": 0.1068, + "step": 19048 + }, + { + "epoch": 2.2588639867188425, + "grad_norm": 0.6592380113778299, + "learning_rate": 2.0991965770890183e-05, + "loss": 0.1046, + "step": 19049 + }, + { + "epoch": 2.2589825684809677, + "grad_norm": 0.6454739250026689, + "learning_rate": 2.098959647721544e-05, + "loss": 0.0796, + "step": 19050 + }, + { + "epoch": 2.2591011502430924, + "grad_norm": 0.9012522270691243, + "learning_rate": 2.0987227220511697e-05, + "loss": 0.1293, + "step": 19051 + }, + { + "epoch": 2.2592197320052176, + "grad_norm": 0.4596711365357253, + "learning_rate": 2.0984858000800807e-05, + "loss": 0.0812, + "step": 19052 + }, + { + "epoch": 2.2593383137673424, + "grad_norm": 0.4683238466676581, + "learning_rate": 2.0982488818104585e-05, + "loss": 0.0779, + "step": 19053 + }, + { + "epoch": 2.2594568955294676, + "grad_norm": 0.7869509545470769, + "learning_rate": 2.09801196724449e-05, + "loss": 0.1043, + "step": 19054 + }, + { + "epoch": 2.2595754772915924, + "grad_norm": 0.7440629485843135, + "learning_rate": 2.097775056384358e-05, + "loss": 0.0857, + "step": 19055 + }, + { + "epoch": 2.2596940590537176, + "grad_norm": 0.8414342628452722, + "learning_rate": 2.097538149232247e-05, + "loss": 0.1007, + "step": 19056 + }, + { + "epoch": 2.2598126408158423, + "grad_norm": 0.8388670576628786, + "learning_rate": 2.0973012457903403e-05, + "loss": 0.1226, + "step": 19057 + }, + { + "epoch": 2.2599312225779675, + "grad_norm": 0.7898747186670583, + "learning_rate": 2.0970643460608226e-05, + "loss": 0.0993, + "step": 19058 + }, + { + "epoch": 2.2600498043400927, + "grad_norm": 1.1226706497174557, + "learning_rate": 2.0968274500458777e-05, + "loss": 0.1356, + "step": 19059 + }, + { + "epoch": 2.2601683861022175, + "grad_norm": 0.9875345533594934, + "learning_rate": 2.0965905577476884e-05, + "loss": 0.0896, + "step": 19060 + }, + { + "epoch": 2.2602869678643422, + "grad_norm": 1.026584840820942, + "learning_rate": 2.0963536691684403e-05, + "loss": 0.1735, + "step": 19061 + }, + { + "epoch": 2.2604055496264674, + "grad_norm": 0.7927098021734523, + "learning_rate": 2.096116784310317e-05, + "loss": 0.0987, + "step": 19062 + }, + { + "epoch": 2.2605241313885927, + "grad_norm": 0.7396541246673995, + "learning_rate": 2.095879903175501e-05, + "loss": 0.108, + "step": 19063 + }, + { + "epoch": 2.2606427131507174, + "grad_norm": 0.6483771730637219, + "learning_rate": 2.0956430257661758e-05, + "loss": 0.1089, + "step": 19064 + }, + { + "epoch": 2.260761294912842, + "grad_norm": 0.9145706638181258, + "learning_rate": 2.0954061520845272e-05, + "loss": 0.1064, + "step": 19065 + }, + { + "epoch": 2.2608798766749674, + "grad_norm": 0.668566093775409, + "learning_rate": 2.0951692821327374e-05, + "loss": 0.1073, + "step": 19066 + }, + { + "epoch": 2.2609984584370926, + "grad_norm": 0.7740892295396714, + "learning_rate": 2.0949324159129906e-05, + "loss": 0.1031, + "step": 19067 + }, + { + "epoch": 2.2611170401992173, + "grad_norm": 0.8365310244954411, + "learning_rate": 2.0946955534274693e-05, + "loss": 0.1008, + "step": 19068 + }, + { + "epoch": 2.2612356219613425, + "grad_norm": 0.8645992290608445, + "learning_rate": 2.094458694678359e-05, + "loss": 0.1582, + "step": 19069 + }, + { + "epoch": 2.2613542037234673, + "grad_norm": 0.7138004982365895, + "learning_rate": 2.094221839667842e-05, + "loss": 0.1217, + "step": 19070 + }, + { + "epoch": 2.2614727854855925, + "grad_norm": 0.868578394640825, + "learning_rate": 2.0939849883981026e-05, + "loss": 0.1254, + "step": 19071 + }, + { + "epoch": 2.2615913672477173, + "grad_norm": 0.9099219663972437, + "learning_rate": 2.0937481408713223e-05, + "loss": 0.1481, + "step": 19072 + }, + { + "epoch": 2.2617099490098425, + "grad_norm": 0.7932171843159301, + "learning_rate": 2.093511297089687e-05, + "loss": 0.127, + "step": 19073 + }, + { + "epoch": 2.261828530771967, + "grad_norm": 0.7842286088072226, + "learning_rate": 2.0932744570553788e-05, + "loss": 0.1126, + "step": 19074 + }, + { + "epoch": 2.2619471125340924, + "grad_norm": 0.575303979689616, + "learning_rate": 2.093037620770581e-05, + "loss": 0.0795, + "step": 19075 + }, + { + "epoch": 2.262065694296217, + "grad_norm": 0.8784181574128842, + "learning_rate": 2.0928007882374773e-05, + "loss": 0.1125, + "step": 19076 + }, + { + "epoch": 2.2621842760583424, + "grad_norm": 0.8829314731541245, + "learning_rate": 2.0925639594582518e-05, + "loss": 0.0955, + "step": 19077 + }, + { + "epoch": 2.262302857820467, + "grad_norm": 0.6880560266211093, + "learning_rate": 2.0923271344350864e-05, + "loss": 0.095, + "step": 19078 + }, + { + "epoch": 2.2624214395825923, + "grad_norm": 0.6011007565898318, + "learning_rate": 2.0920903131701643e-05, + "loss": 0.0806, + "step": 19079 + }, + { + "epoch": 2.262540021344717, + "grad_norm": 0.6759378949841988, + "learning_rate": 2.0918534956656706e-05, + "loss": 0.0805, + "step": 19080 + }, + { + "epoch": 2.2626586031068423, + "grad_norm": 0.9134641190271963, + "learning_rate": 2.0916166819237868e-05, + "loss": 0.1219, + "step": 19081 + }, + { + "epoch": 2.262777184868967, + "grad_norm": 0.745725361069267, + "learning_rate": 2.0913798719466964e-05, + "loss": 0.1062, + "step": 19082 + }, + { + "epoch": 2.2628957666310923, + "grad_norm": 0.8631063978182343, + "learning_rate": 2.0911430657365817e-05, + "loss": 0.1086, + "step": 19083 + }, + { + "epoch": 2.263014348393217, + "grad_norm": 0.9911692893228753, + "learning_rate": 2.090906263295627e-05, + "loss": 0.1281, + "step": 19084 + }, + { + "epoch": 2.2631329301553422, + "grad_norm": 0.7184018255579484, + "learning_rate": 2.090669464626015e-05, + "loss": 0.1151, + "step": 19085 + }, + { + "epoch": 2.263251511917467, + "grad_norm": 0.7343871339205411, + "learning_rate": 2.090432669729929e-05, + "loss": 0.1046, + "step": 19086 + }, + { + "epoch": 2.263370093679592, + "grad_norm": 0.8572090628943982, + "learning_rate": 2.0901958786095502e-05, + "loss": 0.0816, + "step": 19087 + }, + { + "epoch": 2.263488675441717, + "grad_norm": 0.693983961405414, + "learning_rate": 2.089959091267064e-05, + "loss": 0.106, + "step": 19088 + }, + { + "epoch": 2.263607257203842, + "grad_norm": 0.5577624248096062, + "learning_rate": 2.0897223077046523e-05, + "loss": 0.0635, + "step": 19089 + }, + { + "epoch": 2.263725838965967, + "grad_norm": 0.707950048752636, + "learning_rate": 2.089485527924497e-05, + "loss": 0.0885, + "step": 19090 + }, + { + "epoch": 2.263844420728092, + "grad_norm": 0.7196720926490557, + "learning_rate": 2.0892487519287824e-05, + "loss": 0.0667, + "step": 19091 + }, + { + "epoch": 2.263963002490217, + "grad_norm": 0.49920917630014433, + "learning_rate": 2.0890119797196907e-05, + "loss": 0.0849, + "step": 19092 + }, + { + "epoch": 2.264081584252342, + "grad_norm": 1.106861393532934, + "learning_rate": 2.088775211299404e-05, + "loss": 0.1167, + "step": 19093 + }, + { + "epoch": 2.264200166014467, + "grad_norm": 0.7692171960499528, + "learning_rate": 2.0885384466701054e-05, + "loss": 0.0771, + "step": 19094 + }, + { + "epoch": 2.264318747776592, + "grad_norm": 0.5477082543112483, + "learning_rate": 2.0883016858339784e-05, + "loss": 0.0718, + "step": 19095 + }, + { + "epoch": 2.264437329538717, + "grad_norm": 0.6522196919614662, + "learning_rate": 2.0880649287932047e-05, + "loss": 0.09, + "step": 19096 + }, + { + "epoch": 2.264555911300842, + "grad_norm": 0.7575052826367621, + "learning_rate": 2.0878281755499678e-05, + "loss": 0.0822, + "step": 19097 + }, + { + "epoch": 2.2646744930629668, + "grad_norm": 1.168862424715898, + "learning_rate": 2.087591426106448e-05, + "loss": 0.1643, + "step": 19098 + }, + { + "epoch": 2.264793074825092, + "grad_norm": 0.5143522750725421, + "learning_rate": 2.0873546804648315e-05, + "loss": 0.0771, + "step": 19099 + }, + { + "epoch": 2.2649116565872167, + "grad_norm": 0.6769987132346699, + "learning_rate": 2.087117938627298e-05, + "loss": 0.1105, + "step": 19100 + }, + { + "epoch": 2.265030238349342, + "grad_norm": 0.7612085677940538, + "learning_rate": 2.0868812005960315e-05, + "loss": 0.0928, + "step": 19101 + }, + { + "epoch": 2.2651488201114667, + "grad_norm": 0.8811852539020466, + "learning_rate": 2.086644466373212e-05, + "loss": 0.1137, + "step": 19102 + }, + { + "epoch": 2.265267401873592, + "grad_norm": 0.8991620600214749, + "learning_rate": 2.086407735961025e-05, + "loss": 0.1381, + "step": 19103 + }, + { + "epoch": 2.2653859836357166, + "grad_norm": 0.6234836585536662, + "learning_rate": 2.086171009361651e-05, + "loss": 0.0803, + "step": 19104 + }, + { + "epoch": 2.265504565397842, + "grad_norm": 0.8655072984032574, + "learning_rate": 2.0859342865772728e-05, + "loss": 0.1294, + "step": 19105 + }, + { + "epoch": 2.2656231471599666, + "grad_norm": 0.9447337810983776, + "learning_rate": 2.0856975676100725e-05, + "loss": 0.1468, + "step": 19106 + }, + { + "epoch": 2.265741728922092, + "grad_norm": 0.6283615638531427, + "learning_rate": 2.0854608524622335e-05, + "loss": 0.095, + "step": 19107 + }, + { + "epoch": 2.265860310684217, + "grad_norm": 0.9440495461649917, + "learning_rate": 2.0852241411359368e-05, + "loss": 0.1498, + "step": 19108 + }, + { + "epoch": 2.2659788924463418, + "grad_norm": 0.5549439840029914, + "learning_rate": 2.084987433633364e-05, + "loss": 0.0726, + "step": 19109 + }, + { + "epoch": 2.2660974742084665, + "grad_norm": 0.8581652515466366, + "learning_rate": 2.084750729956699e-05, + "loss": 0.1187, + "step": 19110 + }, + { + "epoch": 2.2662160559705917, + "grad_norm": 0.7293111225892883, + "learning_rate": 2.0845140301081233e-05, + "loss": 0.1016, + "step": 19111 + }, + { + "epoch": 2.266334637732717, + "grad_norm": 0.5274874859525202, + "learning_rate": 2.0842773340898187e-05, + "loss": 0.0751, + "step": 19112 + }, + { + "epoch": 2.2664532194948417, + "grad_norm": 0.6097803077004579, + "learning_rate": 2.0840406419039658e-05, + "loss": 0.101, + "step": 19113 + }, + { + "epoch": 2.2665718012569664, + "grad_norm": 0.9683941023586264, + "learning_rate": 2.0838039535527497e-05, + "loss": 0.1565, + "step": 19114 + }, + { + "epoch": 2.2666903830190916, + "grad_norm": 0.6659318938527254, + "learning_rate": 2.0835672690383502e-05, + "loss": 0.102, + "step": 19115 + }, + { + "epoch": 2.266808964781217, + "grad_norm": 0.7680071577096189, + "learning_rate": 2.08333058836295e-05, + "loss": 0.1063, + "step": 19116 + }, + { + "epoch": 2.2669275465433416, + "grad_norm": 0.7882555745910979, + "learning_rate": 2.08309391152873e-05, + "loss": 0.1136, + "step": 19117 + }, + { + "epoch": 2.267046128305467, + "grad_norm": 0.9056377476821434, + "learning_rate": 2.082857238537874e-05, + "loss": 0.0789, + "step": 19118 + }, + { + "epoch": 2.2671647100675916, + "grad_norm": 0.6227740299275926, + "learning_rate": 2.0826205693925627e-05, + "loss": 0.0926, + "step": 19119 + }, + { + "epoch": 2.2672832918297168, + "grad_norm": 0.8109330737614423, + "learning_rate": 2.0823839040949766e-05, + "loss": 0.1262, + "step": 19120 + }, + { + "epoch": 2.2674018735918415, + "grad_norm": 0.7189962678063827, + "learning_rate": 2.0821472426473002e-05, + "loss": 0.1185, + "step": 19121 + }, + { + "epoch": 2.2675204553539667, + "grad_norm": 0.5495082647782953, + "learning_rate": 2.0819105850517138e-05, + "loss": 0.066, + "step": 19122 + }, + { + "epoch": 2.2676390371160915, + "grad_norm": 1.0778053192978372, + "learning_rate": 2.0816739313103984e-05, + "loss": 0.1367, + "step": 19123 + }, + { + "epoch": 2.2677576188782167, + "grad_norm": 0.6989521845873901, + "learning_rate": 2.0814372814255363e-05, + "loss": 0.0669, + "step": 19124 + }, + { + "epoch": 2.2678762006403415, + "grad_norm": 0.6015121421934836, + "learning_rate": 2.0812006353993093e-05, + "loss": 0.0862, + "step": 19125 + }, + { + "epoch": 2.2679947824024667, + "grad_norm": 0.6043537025565169, + "learning_rate": 2.0809639932338996e-05, + "loss": 0.0821, + "step": 19126 + }, + { + "epoch": 2.2681133641645914, + "grad_norm": 1.0556055703481757, + "learning_rate": 2.0807273549314876e-05, + "loss": 0.1227, + "step": 19127 + }, + { + "epoch": 2.2682319459267166, + "grad_norm": 0.8704670377491764, + "learning_rate": 2.0804907204942546e-05, + "loss": 0.0935, + "step": 19128 + }, + { + "epoch": 2.2683505276888414, + "grad_norm": 1.1887979124386518, + "learning_rate": 2.0802540899243833e-05, + "loss": 0.1266, + "step": 19129 + }, + { + "epoch": 2.2684691094509666, + "grad_norm": 0.989170728318912, + "learning_rate": 2.080017463224055e-05, + "loss": 0.1307, + "step": 19130 + }, + { + "epoch": 2.2685876912130913, + "grad_norm": 0.6001166540020825, + "learning_rate": 2.0797808403954493e-05, + "loss": 0.0629, + "step": 19131 + }, + { + "epoch": 2.2687062729752165, + "grad_norm": 0.6420886296824327, + "learning_rate": 2.0795442214407504e-05, + "loss": 0.1075, + "step": 19132 + }, + { + "epoch": 2.2688248547373413, + "grad_norm": 0.7854034694422092, + "learning_rate": 2.079307606362138e-05, + "loss": 0.1031, + "step": 19133 + }, + { + "epoch": 2.2689434364994665, + "grad_norm": 0.7270524835318557, + "learning_rate": 2.079070995161793e-05, + "loss": 0.0875, + "step": 19134 + }, + { + "epoch": 2.2690620182615913, + "grad_norm": 0.572177861388817, + "learning_rate": 2.0788343878418974e-05, + "loss": 0.084, + "step": 19135 + }, + { + "epoch": 2.2691806000237165, + "grad_norm": 0.696969399506367, + "learning_rate": 2.078597784404632e-05, + "loss": 0.1199, + "step": 19136 + }, + { + "epoch": 2.2692991817858412, + "grad_norm": 0.6045318934330434, + "learning_rate": 2.078361184852179e-05, + "loss": 0.095, + "step": 19137 + }, + { + "epoch": 2.2694177635479664, + "grad_norm": 0.9184305027267113, + "learning_rate": 2.0781245891867187e-05, + "loss": 0.1079, + "step": 19138 + }, + { + "epoch": 2.269536345310091, + "grad_norm": 0.7114197178721431, + "learning_rate": 2.0778879974104316e-05, + "loss": 0.1101, + "step": 19139 + }, + { + "epoch": 2.2696549270722164, + "grad_norm": 0.7448772891291007, + "learning_rate": 2.0776514095255e-05, + "loss": 0.1135, + "step": 19140 + }, + { + "epoch": 2.269773508834341, + "grad_norm": 0.590216387124899, + "learning_rate": 2.077414825534105e-05, + "loss": 0.0951, + "step": 19141 + }, + { + "epoch": 2.2698920905964663, + "grad_norm": 1.0972146848824706, + "learning_rate": 2.077178245438427e-05, + "loss": 0.1327, + "step": 19142 + }, + { + "epoch": 2.270010672358591, + "grad_norm": 0.7801052223675795, + "learning_rate": 2.0769416692406458e-05, + "loss": 0.119, + "step": 19143 + }, + { + "epoch": 2.2701292541207163, + "grad_norm": 0.6102100495112817, + "learning_rate": 2.076705096942945e-05, + "loss": 0.0799, + "step": 19144 + }, + { + "epoch": 2.270247835882841, + "grad_norm": 0.8144869785783221, + "learning_rate": 2.0764685285475033e-05, + "loss": 0.0985, + "step": 19145 + }, + { + "epoch": 2.2703664176449663, + "grad_norm": 0.9294382159091734, + "learning_rate": 2.0762319640565026e-05, + "loss": 0.1171, + "step": 19146 + }, + { + "epoch": 2.270484999407091, + "grad_norm": 0.6845415434817347, + "learning_rate": 2.0759954034721234e-05, + "loss": 0.1084, + "step": 19147 + }, + { + "epoch": 2.2706035811692162, + "grad_norm": 0.8043958799401203, + "learning_rate": 2.0757588467965473e-05, + "loss": 0.1289, + "step": 19148 + }, + { + "epoch": 2.270722162931341, + "grad_norm": 0.6855092378067706, + "learning_rate": 2.0755222940319543e-05, + "loss": 0.0915, + "step": 19149 + }, + { + "epoch": 2.270840744693466, + "grad_norm": 0.7516895479295843, + "learning_rate": 2.075285745180524e-05, + "loss": 0.0814, + "step": 19150 + }, + { + "epoch": 2.270959326455591, + "grad_norm": 0.5843487961458899, + "learning_rate": 2.0750492002444397e-05, + "loss": 0.095, + "step": 19151 + }, + { + "epoch": 2.271077908217716, + "grad_norm": 0.5066814285290991, + "learning_rate": 2.0748126592258803e-05, + "loss": 0.088, + "step": 19152 + }, + { + "epoch": 2.271196489979841, + "grad_norm": 0.7381264945390102, + "learning_rate": 2.0745761221270265e-05, + "loss": 0.0843, + "step": 19153 + }, + { + "epoch": 2.271315071741966, + "grad_norm": 0.9141150732020663, + "learning_rate": 2.0743395889500596e-05, + "loss": 0.0861, + "step": 19154 + }, + { + "epoch": 2.271433653504091, + "grad_norm": 0.6384595902996227, + "learning_rate": 2.0741030596971588e-05, + "loss": 0.0827, + "step": 19155 + }, + { + "epoch": 2.271552235266216, + "grad_norm": 0.43844523236452704, + "learning_rate": 2.0738665343705065e-05, + "loss": 0.0522, + "step": 19156 + }, + { + "epoch": 2.2716708170283413, + "grad_norm": 0.6146275540249703, + "learning_rate": 2.0736300129722825e-05, + "loss": 0.098, + "step": 19157 + }, + { + "epoch": 2.271789398790466, + "grad_norm": 0.8722438198342779, + "learning_rate": 2.0733934955046654e-05, + "loss": 0.1303, + "step": 19158 + }, + { + "epoch": 2.271907980552591, + "grad_norm": 0.8031863990557682, + "learning_rate": 2.0731569819698386e-05, + "loss": 0.1073, + "step": 19159 + }, + { + "epoch": 2.272026562314716, + "grad_norm": 0.8578489118593859, + "learning_rate": 2.0729204723699806e-05, + "loss": 0.1074, + "step": 19160 + }, + { + "epoch": 2.272145144076841, + "grad_norm": 0.6040864819919964, + "learning_rate": 2.0726839667072713e-05, + "loss": 0.0972, + "step": 19161 + }, + { + "epoch": 2.272263725838966, + "grad_norm": 0.7849055302776805, + "learning_rate": 2.0724474649838927e-05, + "loss": 0.1223, + "step": 19162 + }, + { + "epoch": 2.2723823076010907, + "grad_norm": 0.9380154149544941, + "learning_rate": 2.0722109672020244e-05, + "loss": 0.1143, + "step": 19163 + }, + { + "epoch": 2.272500889363216, + "grad_norm": 0.7530387064391912, + "learning_rate": 2.071974473363846e-05, + "loss": 0.1106, + "step": 19164 + }, + { + "epoch": 2.272619471125341, + "grad_norm": 0.9638100293743602, + "learning_rate": 2.0717379834715372e-05, + "loss": 0.1171, + "step": 19165 + }, + { + "epoch": 2.272738052887466, + "grad_norm": 0.7199473937380547, + "learning_rate": 2.07150149752728e-05, + "loss": 0.0878, + "step": 19166 + }, + { + "epoch": 2.2728566346495906, + "grad_norm": 0.9664043426847073, + "learning_rate": 2.0712650155332536e-05, + "loss": 0.1541, + "step": 19167 + }, + { + "epoch": 2.272975216411716, + "grad_norm": 0.6536860207000298, + "learning_rate": 2.071028537491638e-05, + "loss": 0.0805, + "step": 19168 + }, + { + "epoch": 2.273093798173841, + "grad_norm": 0.6196023717080401, + "learning_rate": 2.0707920634046123e-05, + "loss": 0.1104, + "step": 19169 + }, + { + "epoch": 2.273212379935966, + "grad_norm": 0.6870146726198311, + "learning_rate": 2.0705555932743585e-05, + "loss": 0.1208, + "step": 19170 + }, + { + "epoch": 2.273330961698091, + "grad_norm": 0.7198773077565136, + "learning_rate": 2.0703191271030557e-05, + "loss": 0.0915, + "step": 19171 + }, + { + "epoch": 2.2734495434602158, + "grad_norm": 1.0679228434207566, + "learning_rate": 2.070082664892883e-05, + "loss": 0.1479, + "step": 19172 + }, + { + "epoch": 2.273568125222341, + "grad_norm": 1.0592283324171519, + "learning_rate": 2.0698462066460205e-05, + "loss": 0.1377, + "step": 19173 + }, + { + "epoch": 2.2736867069844657, + "grad_norm": 0.7582371079136339, + "learning_rate": 2.0696097523646486e-05, + "loss": 0.1211, + "step": 19174 + }, + { + "epoch": 2.273805288746591, + "grad_norm": 0.6788624568265205, + "learning_rate": 2.069373302050948e-05, + "loss": 0.101, + "step": 19175 + }, + { + "epoch": 2.2739238705087157, + "grad_norm": 0.7367816515739971, + "learning_rate": 2.0691368557070963e-05, + "loss": 0.101, + "step": 19176 + }, + { + "epoch": 2.274042452270841, + "grad_norm": 0.770807780621233, + "learning_rate": 2.0689004133352753e-05, + "loss": 0.1177, + "step": 19177 + }, + { + "epoch": 2.2741610340329657, + "grad_norm": 0.7042518494516887, + "learning_rate": 2.068663974937664e-05, + "loss": 0.1175, + "step": 19178 + }, + { + "epoch": 2.274279615795091, + "grad_norm": 0.9957904278291896, + "learning_rate": 2.0684275405164417e-05, + "loss": 0.1229, + "step": 19179 + }, + { + "epoch": 2.2743981975572156, + "grad_norm": 1.0795763455747813, + "learning_rate": 2.0681911100737872e-05, + "loss": 0.1324, + "step": 19180 + }, + { + "epoch": 2.274516779319341, + "grad_norm": 0.5879968540482619, + "learning_rate": 2.0679546836118823e-05, + "loss": 0.0696, + "step": 19181 + }, + { + "epoch": 2.2746353610814656, + "grad_norm": 0.6620131923351383, + "learning_rate": 2.0677182611329055e-05, + "loss": 0.091, + "step": 19182 + }, + { + "epoch": 2.274753942843591, + "grad_norm": 1.1363020310966965, + "learning_rate": 2.0674818426390358e-05, + "loss": 0.1419, + "step": 19183 + }, + { + "epoch": 2.2748725246057155, + "grad_norm": 0.981368017768397, + "learning_rate": 2.067245428132453e-05, + "loss": 0.1236, + "step": 19184 + }, + { + "epoch": 2.2749911063678407, + "grad_norm": 0.4903574304812896, + "learning_rate": 2.0670090176153366e-05, + "loss": 0.0696, + "step": 19185 + }, + { + "epoch": 2.2751096881299655, + "grad_norm": 0.7975328607968287, + "learning_rate": 2.0667726110898666e-05, + "loss": 0.1076, + "step": 19186 + }, + { + "epoch": 2.2752282698920907, + "grad_norm": 0.9385414474839877, + "learning_rate": 2.066536208558222e-05, + "loss": 0.1153, + "step": 19187 + }, + { + "epoch": 2.2753468516542155, + "grad_norm": 0.7840708111736276, + "learning_rate": 2.0662998100225807e-05, + "loss": 0.1009, + "step": 19188 + }, + { + "epoch": 2.2754654334163407, + "grad_norm": 0.8631761612995749, + "learning_rate": 2.0660634154851246e-05, + "loss": 0.1083, + "step": 19189 + }, + { + "epoch": 2.2755840151784654, + "grad_norm": 0.5418115093445542, + "learning_rate": 2.0658270249480317e-05, + "loss": 0.066, + "step": 19190 + }, + { + "epoch": 2.2757025969405906, + "grad_norm": 0.9712095193287779, + "learning_rate": 2.0655906384134798e-05, + "loss": 0.1427, + "step": 19191 + }, + { + "epoch": 2.2758211787027154, + "grad_norm": 0.678167924504207, + "learning_rate": 2.065354255883651e-05, + "loss": 0.0832, + "step": 19192 + }, + { + "epoch": 2.2759397604648406, + "grad_norm": 0.8492762201729729, + "learning_rate": 2.065117877360723e-05, + "loss": 0.1097, + "step": 19193 + }, + { + "epoch": 2.2760583422269653, + "grad_norm": 0.7790942117272454, + "learning_rate": 2.064881502846874e-05, + "loss": 0.1101, + "step": 19194 + }, + { + "epoch": 2.2761769239890906, + "grad_norm": 0.8656173543514246, + "learning_rate": 2.064645132344284e-05, + "loss": 0.1405, + "step": 19195 + }, + { + "epoch": 2.2762955057512153, + "grad_norm": 0.9234769251165286, + "learning_rate": 2.0644087658551324e-05, + "loss": 0.1337, + "step": 19196 + }, + { + "epoch": 2.2764140875133405, + "grad_norm": 0.7738830792429456, + "learning_rate": 2.064172403381598e-05, + "loss": 0.1081, + "step": 19197 + }, + { + "epoch": 2.2765326692754653, + "grad_norm": 1.0871535124931713, + "learning_rate": 2.0639360449258596e-05, + "loss": 0.1401, + "step": 19198 + }, + { + "epoch": 2.2766512510375905, + "grad_norm": 0.903751093774981, + "learning_rate": 2.0636996904900952e-05, + "loss": 0.1002, + "step": 19199 + }, + { + "epoch": 2.2767698327997152, + "grad_norm": 0.5384939581806427, + "learning_rate": 2.0634633400764854e-05, + "loss": 0.0785, + "step": 19200 + }, + { + "epoch": 2.2768884145618404, + "grad_norm": 0.9254094748195427, + "learning_rate": 2.0632269936872088e-05, + "loss": 0.1178, + "step": 19201 + }, + { + "epoch": 2.277006996323965, + "grad_norm": 0.5715720700546214, + "learning_rate": 2.062990651324443e-05, + "loss": 0.0679, + "step": 19202 + }, + { + "epoch": 2.2771255780860904, + "grad_norm": 1.0364274290674282, + "learning_rate": 2.062754312990367e-05, + "loss": 0.1112, + "step": 19203 + }, + { + "epoch": 2.277244159848215, + "grad_norm": 0.8890384015769935, + "learning_rate": 2.062517978687161e-05, + "loss": 0.1043, + "step": 19204 + }, + { + "epoch": 2.2773627416103404, + "grad_norm": 0.6717203309836065, + "learning_rate": 2.0622816484170025e-05, + "loss": 0.1167, + "step": 19205 + }, + { + "epoch": 2.2774813233724656, + "grad_norm": 0.7854720806665783, + "learning_rate": 2.0620453221820695e-05, + "loss": 0.1019, + "step": 19206 + }, + { + "epoch": 2.2775999051345903, + "grad_norm": 0.8158414368332162, + "learning_rate": 2.0618089999845428e-05, + "loss": 0.1008, + "step": 19207 + }, + { + "epoch": 2.277718486896715, + "grad_norm": 0.5918990403733447, + "learning_rate": 2.0615726818265995e-05, + "loss": 0.0797, + "step": 19208 + }, + { + "epoch": 2.2778370686588403, + "grad_norm": 0.919444833081891, + "learning_rate": 2.0613363677104186e-05, + "loss": 0.1426, + "step": 19209 + }, + { + "epoch": 2.2779556504209655, + "grad_norm": 0.7731467893785997, + "learning_rate": 2.0611000576381776e-05, + "loss": 0.0932, + "step": 19210 + }, + { + "epoch": 2.2780742321830902, + "grad_norm": 0.4644003754960123, + "learning_rate": 2.060863751612057e-05, + "loss": 0.0638, + "step": 19211 + }, + { + "epoch": 2.278192813945215, + "grad_norm": 0.7621204685505636, + "learning_rate": 2.060627449634234e-05, + "loss": 0.1216, + "step": 19212 + }, + { + "epoch": 2.27831139570734, + "grad_norm": 0.8176993436914342, + "learning_rate": 2.0603911517068865e-05, + "loss": 0.1065, + "step": 19213 + }, + { + "epoch": 2.2784299774694654, + "grad_norm": 0.6179857489135397, + "learning_rate": 2.0601548578321933e-05, + "loss": 0.0793, + "step": 19214 + }, + { + "epoch": 2.27854855923159, + "grad_norm": 0.70709518542041, + "learning_rate": 2.059918568012333e-05, + "loss": 0.1144, + "step": 19215 + }, + { + "epoch": 2.278667140993715, + "grad_norm": 0.7315247181251626, + "learning_rate": 2.0596822822494842e-05, + "loss": 0.107, + "step": 19216 + }, + { + "epoch": 2.27878572275584, + "grad_norm": 0.6430198661897033, + "learning_rate": 2.0594460005458244e-05, + "loss": 0.0886, + "step": 19217 + }, + { + "epoch": 2.2789043045179653, + "grad_norm": 0.5441313523345018, + "learning_rate": 2.0592097229035328e-05, + "loss": 0.0677, + "step": 19218 + }, + { + "epoch": 2.27902288628009, + "grad_norm": 0.428320364919597, + "learning_rate": 2.0589734493247868e-05, + "loss": 0.0651, + "step": 19219 + }, + { + "epoch": 2.2791414680422153, + "grad_norm": 0.9984555750978432, + "learning_rate": 2.0587371798117655e-05, + "loss": 0.1417, + "step": 19220 + }, + { + "epoch": 2.27926004980434, + "grad_norm": 0.8599782707807125, + "learning_rate": 2.058500914366645e-05, + "loss": 0.1158, + "step": 19221 + }, + { + "epoch": 2.2793786315664653, + "grad_norm": 0.5969748465952649, + "learning_rate": 2.0582646529916054e-05, + "loss": 0.0793, + "step": 19222 + }, + { + "epoch": 2.27949721332859, + "grad_norm": 0.6762920655474683, + "learning_rate": 2.0580283956888235e-05, + "loss": 0.0803, + "step": 19223 + }, + { + "epoch": 2.279615795090715, + "grad_norm": 0.7560625730594209, + "learning_rate": 2.0577921424604785e-05, + "loss": 0.1105, + "step": 19224 + }, + { + "epoch": 2.27973437685284, + "grad_norm": 0.5690085973618046, + "learning_rate": 2.0575558933087465e-05, + "loss": 0.0734, + "step": 19225 + }, + { + "epoch": 2.279852958614965, + "grad_norm": 0.7529499840960499, + "learning_rate": 2.0573196482358078e-05, + "loss": 0.0998, + "step": 19226 + }, + { + "epoch": 2.27997154037709, + "grad_norm": 0.7849298058165083, + "learning_rate": 2.0570834072438392e-05, + "loss": 0.0957, + "step": 19227 + }, + { + "epoch": 2.280090122139215, + "grad_norm": 0.6419833273953517, + "learning_rate": 2.0568471703350185e-05, + "loss": 0.1042, + "step": 19228 + }, + { + "epoch": 2.28020870390134, + "grad_norm": 0.7213281164189219, + "learning_rate": 2.056610937511522e-05, + "loss": 0.0941, + "step": 19229 + }, + { + "epoch": 2.280327285663465, + "grad_norm": 0.5418800154535591, + "learning_rate": 2.0563747087755303e-05, + "loss": 0.056, + "step": 19230 + }, + { + "epoch": 2.28044586742559, + "grad_norm": 0.8654167316475169, + "learning_rate": 2.0561384841292197e-05, + "loss": 0.129, + "step": 19231 + }, + { + "epoch": 2.280564449187715, + "grad_norm": 0.8173990140024149, + "learning_rate": 2.0559022635747668e-05, + "loss": 0.091, + "step": 19232 + }, + { + "epoch": 2.28068303094984, + "grad_norm": 0.6383706107186635, + "learning_rate": 2.0556660471143517e-05, + "loss": 0.0945, + "step": 19233 + }, + { + "epoch": 2.280801612711965, + "grad_norm": 0.7524865298328898, + "learning_rate": 2.05542983475015e-05, + "loss": 0.1071, + "step": 19234 + }, + { + "epoch": 2.28092019447409, + "grad_norm": 0.7289841140821686, + "learning_rate": 2.055193626484341e-05, + "loss": 0.0647, + "step": 19235 + }, + { + "epoch": 2.281038776236215, + "grad_norm": 0.483883460375925, + "learning_rate": 2.0549574223191e-05, + "loss": 0.0621, + "step": 19236 + }, + { + "epoch": 2.2811573579983397, + "grad_norm": 0.565199374624868, + "learning_rate": 2.0547212222566064e-05, + "loss": 0.0663, + "step": 19237 + }, + { + "epoch": 2.281275939760465, + "grad_norm": 0.6496429590072367, + "learning_rate": 2.0544850262990378e-05, + "loss": 0.0799, + "step": 19238 + }, + { + "epoch": 2.2813945215225897, + "grad_norm": 0.6630005017747211, + "learning_rate": 2.0542488344485707e-05, + "loss": 0.093, + "step": 19239 + }, + { + "epoch": 2.281513103284715, + "grad_norm": 0.7739857551173044, + "learning_rate": 2.0540126467073813e-05, + "loss": 0.1034, + "step": 19240 + }, + { + "epoch": 2.2816316850468397, + "grad_norm": 0.7056168520138631, + "learning_rate": 2.0537764630776498e-05, + "loss": 0.0913, + "step": 19241 + }, + { + "epoch": 2.281750266808965, + "grad_norm": 0.6191492266153039, + "learning_rate": 2.053540283561552e-05, + "loss": 0.0888, + "step": 19242 + }, + { + "epoch": 2.2818688485710896, + "grad_norm": 0.6685073692921839, + "learning_rate": 2.053304108161265e-05, + "loss": 0.1011, + "step": 19243 + }, + { + "epoch": 2.281987430333215, + "grad_norm": 0.9857392715620021, + "learning_rate": 2.0530679368789664e-05, + "loss": 0.1737, + "step": 19244 + }, + { + "epoch": 2.2821060120953396, + "grad_norm": 0.9757437265838953, + "learning_rate": 2.052831769716833e-05, + "loss": 0.1146, + "step": 19245 + }, + { + "epoch": 2.282224593857465, + "grad_norm": 0.9218974068761951, + "learning_rate": 2.0525956066770434e-05, + "loss": 0.1081, + "step": 19246 + }, + { + "epoch": 2.2823431756195895, + "grad_norm": 0.7765133370072175, + "learning_rate": 2.052359447761772e-05, + "loss": 0.1046, + "step": 19247 + }, + { + "epoch": 2.2824617573817148, + "grad_norm": 1.00015118792085, + "learning_rate": 2.0521232929731986e-05, + "loss": 0.1225, + "step": 19248 + }, + { + "epoch": 2.2825803391438395, + "grad_norm": 0.7542421383165739, + "learning_rate": 2.0518871423135e-05, + "loss": 0.094, + "step": 19249 + }, + { + "epoch": 2.2826989209059647, + "grad_norm": 1.0468288432918036, + "learning_rate": 2.0516509957848517e-05, + "loss": 0.1299, + "step": 19250 + }, + { + "epoch": 2.2828175026680895, + "grad_norm": 0.859189482224707, + "learning_rate": 2.0514148533894305e-05, + "loss": 0.1314, + "step": 19251 + }, + { + "epoch": 2.2829360844302147, + "grad_norm": 0.9334741588463504, + "learning_rate": 2.0511787151294155e-05, + "loss": 0.1444, + "step": 19252 + }, + { + "epoch": 2.2830546661923394, + "grad_norm": 0.6578772791881029, + "learning_rate": 2.0509425810069814e-05, + "loss": 0.0689, + "step": 19253 + }, + { + "epoch": 2.2831732479544646, + "grad_norm": 0.8696439214720704, + "learning_rate": 2.050706451024307e-05, + "loss": 0.1188, + "step": 19254 + }, + { + "epoch": 2.28329182971659, + "grad_norm": 0.8260382843255265, + "learning_rate": 2.050470325183567e-05, + "loss": 0.1054, + "step": 19255 + }, + { + "epoch": 2.2834104114787146, + "grad_norm": 1.1360567544359537, + "learning_rate": 2.0502342034869406e-05, + "loss": 0.1767, + "step": 19256 + }, + { + "epoch": 2.2835289932408394, + "grad_norm": 0.7724005631678209, + "learning_rate": 2.049998085936603e-05, + "loss": 0.1049, + "step": 19257 + }, + { + "epoch": 2.2836475750029646, + "grad_norm": 0.6743902607956366, + "learning_rate": 2.0497619725347314e-05, + "loss": 0.0887, + "step": 19258 + }, + { + "epoch": 2.2837661567650898, + "grad_norm": 0.6187374524484708, + "learning_rate": 2.0495258632835014e-05, + "loss": 0.0664, + "step": 19259 + }, + { + "epoch": 2.2838847385272145, + "grad_norm": 0.7085611442316739, + "learning_rate": 2.0492897581850913e-05, + "loss": 0.0699, + "step": 19260 + }, + { + "epoch": 2.2840033202893393, + "grad_norm": 0.7403015439595959, + "learning_rate": 2.049053657241677e-05, + "loss": 0.1, + "step": 19261 + }, + { + "epoch": 2.2841219020514645, + "grad_norm": 0.6464808052800475, + "learning_rate": 2.048817560455434e-05, + "loss": 0.0755, + "step": 19262 + }, + { + "epoch": 2.2842404838135897, + "grad_norm": 0.7622344207413221, + "learning_rate": 2.0485814678285405e-05, + "loss": 0.1159, + "step": 19263 + }, + { + "epoch": 2.2843590655757144, + "grad_norm": 0.5761554846875863, + "learning_rate": 2.048345379363172e-05, + "loss": 0.0862, + "step": 19264 + }, + { + "epoch": 2.284477647337839, + "grad_norm": 0.6642076079911737, + "learning_rate": 2.048109295061506e-05, + "loss": 0.0978, + "step": 19265 + }, + { + "epoch": 2.2845962290999644, + "grad_norm": 0.8310738674783811, + "learning_rate": 2.0478732149257168e-05, + "loss": 0.1238, + "step": 19266 + }, + { + "epoch": 2.2847148108620896, + "grad_norm": 1.0314956372654764, + "learning_rate": 2.0476371389579834e-05, + "loss": 0.1751, + "step": 19267 + }, + { + "epoch": 2.2848333926242144, + "grad_norm": 0.5996143291581897, + "learning_rate": 2.0474010671604805e-05, + "loss": 0.0673, + "step": 19268 + }, + { + "epoch": 2.2849519743863396, + "grad_norm": 0.8882320823984076, + "learning_rate": 2.0471649995353854e-05, + "loss": 0.1252, + "step": 19269 + }, + { + "epoch": 2.2850705561484643, + "grad_norm": 0.5457057234219289, + "learning_rate": 2.046928936084872e-05, + "loss": 0.0729, + "step": 19270 + }, + { + "epoch": 2.2851891379105895, + "grad_norm": 0.794058981324232, + "learning_rate": 2.0466928768111197e-05, + "loss": 0.1007, + "step": 19271 + }, + { + "epoch": 2.2853077196727143, + "grad_norm": 0.6705227257694566, + "learning_rate": 2.046456821716303e-05, + "loss": 0.0879, + "step": 19272 + }, + { + "epoch": 2.2854263014348395, + "grad_norm": 0.614133904929565, + "learning_rate": 2.0462207708025975e-05, + "loss": 0.0862, + "step": 19273 + }, + { + "epoch": 2.2855448831969642, + "grad_norm": 0.794170195682687, + "learning_rate": 2.0459847240721805e-05, + "loss": 0.1047, + "step": 19274 + }, + { + "epoch": 2.2856634649590895, + "grad_norm": 0.7859942709335062, + "learning_rate": 2.045748681527227e-05, + "loss": 0.0988, + "step": 19275 + }, + { + "epoch": 2.285782046721214, + "grad_norm": 0.8247297894699633, + "learning_rate": 2.045512643169915e-05, + "loss": 0.1052, + "step": 19276 + }, + { + "epoch": 2.2859006284833394, + "grad_norm": 0.835556872723295, + "learning_rate": 2.045276609002417e-05, + "loss": 0.132, + "step": 19277 + }, + { + "epoch": 2.286019210245464, + "grad_norm": 0.5603548678961798, + "learning_rate": 2.045040579026913e-05, + "loss": 0.0792, + "step": 19278 + }, + { + "epoch": 2.2861377920075894, + "grad_norm": 0.5783226199260886, + "learning_rate": 2.0448045532455767e-05, + "loss": 0.0863, + "step": 19279 + }, + { + "epoch": 2.286256373769714, + "grad_norm": 0.5599909355644187, + "learning_rate": 2.044568531660584e-05, + "loss": 0.0757, + "step": 19280 + }, + { + "epoch": 2.2863749555318393, + "grad_norm": 0.7511329034126231, + "learning_rate": 2.04433251427411e-05, + "loss": 0.0939, + "step": 19281 + }, + { + "epoch": 2.286493537293964, + "grad_norm": 0.5933164546793431, + "learning_rate": 2.0440965010883327e-05, + "loss": 0.0948, + "step": 19282 + }, + { + "epoch": 2.2866121190560893, + "grad_norm": 0.7132617980035278, + "learning_rate": 2.0438604921054257e-05, + "loss": 0.0867, + "step": 19283 + }, + { + "epoch": 2.286730700818214, + "grad_norm": 0.6911315952089401, + "learning_rate": 2.0436244873275664e-05, + "loss": 0.0968, + "step": 19284 + }, + { + "epoch": 2.2868492825803393, + "grad_norm": 0.8220070590437693, + "learning_rate": 2.0433884867569285e-05, + "loss": 0.1102, + "step": 19285 + }, + { + "epoch": 2.286967864342464, + "grad_norm": 0.7317384016344366, + "learning_rate": 2.04315249039569e-05, + "loss": 0.1085, + "step": 19286 + }, + { + "epoch": 2.287086446104589, + "grad_norm": 0.6905703741230856, + "learning_rate": 2.0429164982460253e-05, + "loss": 0.0727, + "step": 19287 + }, + { + "epoch": 2.287205027866714, + "grad_norm": 0.6690898808587948, + "learning_rate": 2.04268051031011e-05, + "loss": 0.1058, + "step": 19288 + }, + { + "epoch": 2.287323609628839, + "grad_norm": 0.8076604350454905, + "learning_rate": 2.0424445265901183e-05, + "loss": 0.1148, + "step": 19289 + }, + { + "epoch": 2.287442191390964, + "grad_norm": 0.8817845378262333, + "learning_rate": 2.0422085470882283e-05, + "loss": 0.124, + "step": 19290 + }, + { + "epoch": 2.287560773153089, + "grad_norm": 0.5578120811937483, + "learning_rate": 2.0419725718066142e-05, + "loss": 0.0768, + "step": 19291 + }, + { + "epoch": 2.287679354915214, + "grad_norm": 0.757299570701759, + "learning_rate": 2.04173660074745e-05, + "loss": 0.0894, + "step": 19292 + }, + { + "epoch": 2.287797936677339, + "grad_norm": 0.7164330194254496, + "learning_rate": 2.0415006339129132e-05, + "loss": 0.1019, + "step": 19293 + }, + { + "epoch": 2.287916518439464, + "grad_norm": 0.5777314068762575, + "learning_rate": 2.041264671305178e-05, + "loss": 0.0767, + "step": 19294 + }, + { + "epoch": 2.288035100201589, + "grad_norm": 0.46176258401088255, + "learning_rate": 2.041028712926421e-05, + "loss": 0.0645, + "step": 19295 + }, + { + "epoch": 2.288153681963714, + "grad_norm": 0.7006649148980835, + "learning_rate": 2.0407927587788148e-05, + "loss": 0.0755, + "step": 19296 + }, + { + "epoch": 2.288272263725839, + "grad_norm": 0.6438573255623036, + "learning_rate": 2.0405568088645375e-05, + "loss": 0.0852, + "step": 19297 + }, + { + "epoch": 2.288390845487964, + "grad_norm": 0.7736884705678021, + "learning_rate": 2.040320863185763e-05, + "loss": 0.111, + "step": 19298 + }, + { + "epoch": 2.288509427250089, + "grad_norm": 0.6824398736791147, + "learning_rate": 2.0400849217446663e-05, + "loss": 0.0953, + "step": 19299 + }, + { + "epoch": 2.2886280090122137, + "grad_norm": 0.7062430605090256, + "learning_rate": 2.0398489845434218e-05, + "loss": 0.1057, + "step": 19300 + }, + { + "epoch": 2.288746590774339, + "grad_norm": 0.7130453320070889, + "learning_rate": 2.0396130515842064e-05, + "loss": 0.0848, + "step": 19301 + }, + { + "epoch": 2.2888651725364637, + "grad_norm": 0.9635573734868425, + "learning_rate": 2.0393771228691935e-05, + "loss": 0.1383, + "step": 19302 + }, + { + "epoch": 2.288983754298589, + "grad_norm": 0.7093140359420517, + "learning_rate": 2.0391411984005587e-05, + "loss": 0.1215, + "step": 19303 + }, + { + "epoch": 2.2891023360607137, + "grad_norm": 0.7187973817434513, + "learning_rate": 2.038905278180477e-05, + "loss": 0.0952, + "step": 19304 + }, + { + "epoch": 2.289220917822839, + "grad_norm": 0.5674224206215714, + "learning_rate": 2.0386693622111236e-05, + "loss": 0.0793, + "step": 19305 + }, + { + "epoch": 2.2893394995849636, + "grad_norm": 0.9287261589838464, + "learning_rate": 2.038433450494673e-05, + "loss": 0.1031, + "step": 19306 + }, + { + "epoch": 2.289458081347089, + "grad_norm": 0.681139716228886, + "learning_rate": 2.038197543033299e-05, + "loss": 0.1086, + "step": 19307 + }, + { + "epoch": 2.289576663109214, + "grad_norm": 0.6809644867244221, + "learning_rate": 2.0379616398291783e-05, + "loss": 0.1124, + "step": 19308 + }, + { + "epoch": 2.289695244871339, + "grad_norm": 0.4533257922626541, + "learning_rate": 2.0377257408844848e-05, + "loss": 0.0554, + "step": 19309 + }, + { + "epoch": 2.2898138266334636, + "grad_norm": 1.1568026967715022, + "learning_rate": 2.0374898462013926e-05, + "loss": 0.1213, + "step": 19310 + }, + { + "epoch": 2.2899324083955888, + "grad_norm": 0.6118144235826306, + "learning_rate": 2.0372539557820763e-05, + "loss": 0.0699, + "step": 19311 + }, + { + "epoch": 2.290050990157714, + "grad_norm": 0.4871498127907941, + "learning_rate": 2.0370180696287123e-05, + "loss": 0.0702, + "step": 19312 + }, + { + "epoch": 2.2901695719198387, + "grad_norm": 0.6767891766358276, + "learning_rate": 2.036782187743473e-05, + "loss": 0.1042, + "step": 19313 + }, + { + "epoch": 2.2902881536819635, + "grad_norm": 0.9115089920770655, + "learning_rate": 2.036546310128535e-05, + "loss": 0.1102, + "step": 19314 + }, + { + "epoch": 2.2904067354440887, + "grad_norm": 0.7640385535612699, + "learning_rate": 2.03631043678607e-05, + "loss": 0.1021, + "step": 19315 + }, + { + "epoch": 2.290525317206214, + "grad_norm": 1.0970011399877608, + "learning_rate": 2.0360745677182553e-05, + "loss": 0.1494, + "step": 19316 + }, + { + "epoch": 2.2906438989683386, + "grad_norm": 0.568876312567954, + "learning_rate": 2.0358387029272645e-05, + "loss": 0.0893, + "step": 19317 + }, + { + "epoch": 2.290762480730464, + "grad_norm": 1.3684796768263194, + "learning_rate": 2.0356028424152703e-05, + "loss": 0.0907, + "step": 19318 + }, + { + "epoch": 2.2908810624925886, + "grad_norm": 0.618732165248974, + "learning_rate": 2.0353669861844497e-05, + "loss": 0.099, + "step": 19319 + }, + { + "epoch": 2.290999644254714, + "grad_norm": 0.6925458573927483, + "learning_rate": 2.0351311342369755e-05, + "loss": 0.1048, + "step": 19320 + }, + { + "epoch": 2.2911182260168386, + "grad_norm": 0.6906037527765264, + "learning_rate": 2.0348952865750224e-05, + "loss": 0.0869, + "step": 19321 + }, + { + "epoch": 2.2912368077789638, + "grad_norm": 1.0020190904136268, + "learning_rate": 2.0346594432007633e-05, + "loss": 0.1454, + "step": 19322 + }, + { + "epoch": 2.2913553895410885, + "grad_norm": 0.8391588965852007, + "learning_rate": 2.0344236041163744e-05, + "loss": 0.1058, + "step": 19323 + }, + { + "epoch": 2.2914739713032137, + "grad_norm": 0.796058085391224, + "learning_rate": 2.0341877693240285e-05, + "loss": 0.0982, + "step": 19324 + }, + { + "epoch": 2.2915925530653385, + "grad_norm": 0.5253463575448818, + "learning_rate": 2.0339519388259008e-05, + "loss": 0.0782, + "step": 19325 + }, + { + "epoch": 2.2917111348274637, + "grad_norm": 0.569674246338537, + "learning_rate": 2.0337161126241637e-05, + "loss": 0.0788, + "step": 19326 + }, + { + "epoch": 2.2918297165895885, + "grad_norm": 0.6708297197753342, + "learning_rate": 2.0334802907209928e-05, + "loss": 0.086, + "step": 19327 + }, + { + "epoch": 2.2919482983517137, + "grad_norm": 0.8561609047259156, + "learning_rate": 2.0332444731185622e-05, + "loss": 0.1026, + "step": 19328 + }, + { + "epoch": 2.2920668801138384, + "grad_norm": 0.6976312464079919, + "learning_rate": 2.0330086598190448e-05, + "loss": 0.1081, + "step": 19329 + }, + { + "epoch": 2.2921854618759636, + "grad_norm": 0.6358461910508553, + "learning_rate": 2.0327728508246137e-05, + "loss": 0.0847, + "step": 19330 + }, + { + "epoch": 2.2923040436380884, + "grad_norm": 0.46136079307948147, + "learning_rate": 2.0325370461374454e-05, + "loss": 0.073, + "step": 19331 + }, + { + "epoch": 2.2924226254002136, + "grad_norm": 0.521142676788854, + "learning_rate": 2.0323012457597117e-05, + "loss": 0.074, + "step": 19332 + }, + { + "epoch": 2.2925412071623383, + "grad_norm": 1.0133053034094206, + "learning_rate": 2.0320654496935867e-05, + "loss": 0.1414, + "step": 19333 + }, + { + "epoch": 2.2926597889244635, + "grad_norm": 0.7068490434045177, + "learning_rate": 2.0318296579412448e-05, + "loss": 0.1054, + "step": 19334 + }, + { + "epoch": 2.2927783706865883, + "grad_norm": 0.7662328500064516, + "learning_rate": 2.0315938705048596e-05, + "loss": 0.1159, + "step": 19335 + }, + { + "epoch": 2.2928969524487135, + "grad_norm": 0.9891864842675068, + "learning_rate": 2.0313580873866046e-05, + "loss": 0.0987, + "step": 19336 + }, + { + "epoch": 2.2930155342108383, + "grad_norm": 0.715012608535461, + "learning_rate": 2.0311223085886524e-05, + "loss": 0.0791, + "step": 19337 + }, + { + "epoch": 2.2931341159729635, + "grad_norm": 0.5511383636788786, + "learning_rate": 2.030886534113178e-05, + "loss": 0.0792, + "step": 19338 + }, + { + "epoch": 2.293252697735088, + "grad_norm": 0.7023454532013924, + "learning_rate": 2.0306507639623556e-05, + "loss": 0.0876, + "step": 19339 + }, + { + "epoch": 2.2933712794972134, + "grad_norm": 0.7764746978587386, + "learning_rate": 2.030414998138357e-05, + "loss": 0.1156, + "step": 19340 + }, + { + "epoch": 2.293489861259338, + "grad_norm": 0.9529682454048283, + "learning_rate": 2.030179236643355e-05, + "loss": 0.1301, + "step": 19341 + }, + { + "epoch": 2.2936084430214634, + "grad_norm": 0.7430733999273693, + "learning_rate": 2.0299434794795257e-05, + "loss": 0.0812, + "step": 19342 + }, + { + "epoch": 2.293727024783588, + "grad_norm": 0.7215794310711059, + "learning_rate": 2.0297077266490405e-05, + "loss": 0.1109, + "step": 19343 + }, + { + "epoch": 2.2938456065457133, + "grad_norm": 0.5390012830071894, + "learning_rate": 2.029471978154074e-05, + "loss": 0.0809, + "step": 19344 + }, + { + "epoch": 2.293964188307838, + "grad_norm": 0.6825101403959933, + "learning_rate": 2.0292362339967975e-05, + "loss": 0.0968, + "step": 19345 + }, + { + "epoch": 2.2940827700699633, + "grad_norm": 0.7646323010000067, + "learning_rate": 2.0290004941793865e-05, + "loss": 0.1191, + "step": 19346 + }, + { + "epoch": 2.294201351832088, + "grad_norm": 0.7360976645823127, + "learning_rate": 2.028764758704014e-05, + "loss": 0.1162, + "step": 19347 + }, + { + "epoch": 2.2943199335942133, + "grad_norm": 0.5735217346581379, + "learning_rate": 2.028529027572851e-05, + "loss": 0.079, + "step": 19348 + }, + { + "epoch": 2.294438515356338, + "grad_norm": 0.5705181140148529, + "learning_rate": 2.0282933007880736e-05, + "loss": 0.071, + "step": 19349 + }, + { + "epoch": 2.2945570971184632, + "grad_norm": 0.7607344717948376, + "learning_rate": 2.0280575783518533e-05, + "loss": 0.1298, + "step": 19350 + }, + { + "epoch": 2.294675678880588, + "grad_norm": 0.7055793617239015, + "learning_rate": 2.0278218602663628e-05, + "loss": 0.0923, + "step": 19351 + }, + { + "epoch": 2.294794260642713, + "grad_norm": 0.6478123711846586, + "learning_rate": 2.027586146533776e-05, + "loss": 0.0924, + "step": 19352 + }, + { + "epoch": 2.294912842404838, + "grad_norm": 0.8516234965282263, + "learning_rate": 2.0273504371562653e-05, + "loss": 0.1169, + "step": 19353 + }, + { + "epoch": 2.295031424166963, + "grad_norm": 0.7441171406620491, + "learning_rate": 2.027114732136005e-05, + "loss": 0.1105, + "step": 19354 + }, + { + "epoch": 2.295150005929088, + "grad_norm": 0.7032662265203407, + "learning_rate": 2.0268790314751664e-05, + "loss": 0.091, + "step": 19355 + }, + { + "epoch": 2.295268587691213, + "grad_norm": 0.8590446707047361, + "learning_rate": 2.0266433351759224e-05, + "loss": 0.1009, + "step": 19356 + }, + { + "epoch": 2.2953871694533383, + "grad_norm": 0.6889212627535122, + "learning_rate": 2.026407643240447e-05, + "loss": 0.0851, + "step": 19357 + }, + { + "epoch": 2.295505751215463, + "grad_norm": 0.721944027318995, + "learning_rate": 2.0261719556709126e-05, + "loss": 0.1031, + "step": 19358 + }, + { + "epoch": 2.295624332977588, + "grad_norm": 1.0515953115492114, + "learning_rate": 2.0259362724694915e-05, + "loss": 0.1522, + "step": 19359 + }, + { + "epoch": 2.295742914739713, + "grad_norm": 0.6652391608242114, + "learning_rate": 2.025700593638356e-05, + "loss": 0.1073, + "step": 19360 + }, + { + "epoch": 2.2958614965018382, + "grad_norm": 0.7777475958803388, + "learning_rate": 2.0254649191796802e-05, + "loss": 0.1035, + "step": 19361 + }, + { + "epoch": 2.295980078263963, + "grad_norm": 0.7669133282714261, + "learning_rate": 2.0252292490956353e-05, + "loss": 0.12, + "step": 19362 + }, + { + "epoch": 2.2960986600260878, + "grad_norm": 0.5629293244182242, + "learning_rate": 2.024993583388395e-05, + "loss": 0.0824, + "step": 19363 + }, + { + "epoch": 2.296217241788213, + "grad_norm": 0.7960076991474768, + "learning_rate": 2.024757922060131e-05, + "loss": 0.0958, + "step": 19364 + }, + { + "epoch": 2.296335823550338, + "grad_norm": 0.5642739092275387, + "learning_rate": 2.0245222651130165e-05, + "loss": 0.0908, + "step": 19365 + }, + { + "epoch": 2.296454405312463, + "grad_norm": 0.8405023551154287, + "learning_rate": 2.0242866125492237e-05, + "loss": 0.1397, + "step": 19366 + }, + { + "epoch": 2.2965729870745877, + "grad_norm": 0.6988088310085141, + "learning_rate": 2.0240509643709243e-05, + "loss": 0.0845, + "step": 19367 + }, + { + "epoch": 2.296691568836713, + "grad_norm": 0.8135788810703107, + "learning_rate": 2.0238153205802925e-05, + "loss": 0.1169, + "step": 19368 + }, + { + "epoch": 2.296810150598838, + "grad_norm": 0.5269513662238885, + "learning_rate": 2.0235796811794987e-05, + "loss": 0.0777, + "step": 19369 + }, + { + "epoch": 2.296928732360963, + "grad_norm": 0.6847598748246339, + "learning_rate": 2.0233440461707168e-05, + "loss": 0.0897, + "step": 19370 + }, + { + "epoch": 2.297047314123088, + "grad_norm": 0.5184045271664632, + "learning_rate": 2.0231084155561168e-05, + "loss": 0.0827, + "step": 19371 + }, + { + "epoch": 2.297165895885213, + "grad_norm": 0.7976048528659898, + "learning_rate": 2.0228727893378737e-05, + "loss": 0.105, + "step": 19372 + }, + { + "epoch": 2.297284477647338, + "grad_norm": 1.0686901238373245, + "learning_rate": 2.0226371675181577e-05, + "loss": 0.1253, + "step": 19373 + }, + { + "epoch": 2.2974030594094628, + "grad_norm": 0.5685958190872293, + "learning_rate": 2.0224015500991422e-05, + "loss": 0.0818, + "step": 19374 + }, + { + "epoch": 2.297521641171588, + "grad_norm": 0.7821769426985752, + "learning_rate": 2.0221659370829975e-05, + "loss": 0.1105, + "step": 19375 + }, + { + "epoch": 2.2976402229337127, + "grad_norm": 0.7013230082452497, + "learning_rate": 2.0219303284718985e-05, + "loss": 0.0728, + "step": 19376 + }, + { + "epoch": 2.297758804695838, + "grad_norm": 0.7131972188536402, + "learning_rate": 2.0216947242680152e-05, + "loss": 0.07, + "step": 19377 + }, + { + "epoch": 2.2978773864579627, + "grad_norm": 0.555032406529683, + "learning_rate": 2.021459124473519e-05, + "loss": 0.0899, + "step": 19378 + }, + { + "epoch": 2.297995968220088, + "grad_norm": 0.809217341117534, + "learning_rate": 2.021223529090584e-05, + "loss": 0.1239, + "step": 19379 + }, + { + "epoch": 2.2981145499822127, + "grad_norm": 0.889689873959065, + "learning_rate": 2.020987938121381e-05, + "loss": 0.1335, + "step": 19380 + }, + { + "epoch": 2.298233131744338, + "grad_norm": 0.6196681059220045, + "learning_rate": 2.0207523515680813e-05, + "loss": 0.0943, + "step": 19381 + }, + { + "epoch": 2.2983517135064626, + "grad_norm": 0.5229136358161144, + "learning_rate": 2.020516769432857e-05, + "loss": 0.0564, + "step": 19382 + }, + { + "epoch": 2.298470295268588, + "grad_norm": 0.8261892219418496, + "learning_rate": 2.02028119171788e-05, + "loss": 0.0965, + "step": 19383 + }, + { + "epoch": 2.2985888770307126, + "grad_norm": 0.5549836804729648, + "learning_rate": 2.020045618425323e-05, + "loss": 0.0924, + "step": 19384 + }, + { + "epoch": 2.2987074587928378, + "grad_norm": 0.6017332266395271, + "learning_rate": 2.019810049557357e-05, + "loss": 0.0898, + "step": 19385 + }, + { + "epoch": 2.2988260405549625, + "grad_norm": 0.6453493911874947, + "learning_rate": 2.019574485116152e-05, + "loss": 0.0995, + "step": 19386 + }, + { + "epoch": 2.2989446223170877, + "grad_norm": 0.8825344014456329, + "learning_rate": 2.0193389251038823e-05, + "loss": 0.0846, + "step": 19387 + }, + { + "epoch": 2.2990632040792125, + "grad_norm": 0.6262424115062848, + "learning_rate": 2.019103369522718e-05, + "loss": 0.0888, + "step": 19388 + }, + { + "epoch": 2.2991817858413377, + "grad_norm": 0.816148150850977, + "learning_rate": 2.0188678183748306e-05, + "loss": 0.131, + "step": 19389 + }, + { + "epoch": 2.2993003676034625, + "grad_norm": 0.7321381054505017, + "learning_rate": 2.0186322716623924e-05, + "loss": 0.1103, + "step": 19390 + }, + { + "epoch": 2.2994189493655877, + "grad_norm": 0.4310172926841919, + "learning_rate": 2.0183967293875745e-05, + "loss": 0.0635, + "step": 19391 + }, + { + "epoch": 2.2995375311277124, + "grad_norm": 0.5464152998705958, + "learning_rate": 2.0181611915525475e-05, + "loss": 0.0839, + "step": 19392 + }, + { + "epoch": 2.2996561128898376, + "grad_norm": 0.6433442539813817, + "learning_rate": 2.0179256581594836e-05, + "loss": 0.0782, + "step": 19393 + }, + { + "epoch": 2.2997746946519624, + "grad_norm": 0.623256417167043, + "learning_rate": 2.0176901292105542e-05, + "loss": 0.098, + "step": 19394 + }, + { + "epoch": 2.2998932764140876, + "grad_norm": 0.7777457103637672, + "learning_rate": 2.0174546047079305e-05, + "loss": 0.0926, + "step": 19395 + }, + { + "epoch": 2.3000118581762123, + "grad_norm": 0.6584474517268538, + "learning_rate": 2.0172190846537835e-05, + "loss": 0.0827, + "step": 19396 + }, + { + "epoch": 2.3001304399383375, + "grad_norm": 0.7838229857700109, + "learning_rate": 2.0169835690502838e-05, + "loss": 0.1172, + "step": 19397 + }, + { + "epoch": 2.3002490217004623, + "grad_norm": 0.5541647393464497, + "learning_rate": 2.0167480578996044e-05, + "loss": 0.0788, + "step": 19398 + }, + { + "epoch": 2.3003676034625875, + "grad_norm": 0.8564196077741129, + "learning_rate": 2.016512551203915e-05, + "loss": 0.1234, + "step": 19399 + }, + { + "epoch": 2.3004861852247123, + "grad_norm": 0.48585531127661735, + "learning_rate": 2.0162770489653874e-05, + "loss": 0.0599, + "step": 19400 + }, + { + "epoch": 2.3006047669868375, + "grad_norm": 0.7946132021560789, + "learning_rate": 2.0160415511861907e-05, + "loss": 0.1265, + "step": 19401 + }, + { + "epoch": 2.3007233487489622, + "grad_norm": 0.5250935147066745, + "learning_rate": 2.0158060578684988e-05, + "loss": 0.0773, + "step": 19402 + }, + { + "epoch": 2.3008419305110874, + "grad_norm": 0.6876997593857951, + "learning_rate": 2.0155705690144808e-05, + "loss": 0.0837, + "step": 19403 + }, + { + "epoch": 2.300960512273212, + "grad_norm": 0.8175745394616659, + "learning_rate": 2.015335084626308e-05, + "loss": 0.1044, + "step": 19404 + }, + { + "epoch": 2.3010790940353374, + "grad_norm": 0.77155422653049, + "learning_rate": 2.0150996047061513e-05, + "loss": 0.1158, + "step": 19405 + }, + { + "epoch": 2.3011976757974626, + "grad_norm": 0.6701007967267799, + "learning_rate": 2.014864129256182e-05, + "loss": 0.0889, + "step": 19406 + }, + { + "epoch": 2.3013162575595874, + "grad_norm": 0.7100427568870292, + "learning_rate": 2.0146286582785707e-05, + "loss": 0.122, + "step": 19407 + }, + { + "epoch": 2.301434839321712, + "grad_norm": 1.0020926084534914, + "learning_rate": 2.0143931917754872e-05, + "loss": 0.1162, + "step": 19408 + }, + { + "epoch": 2.3015534210838373, + "grad_norm": 0.6957996751124674, + "learning_rate": 2.0141577297491037e-05, + "loss": 0.1004, + "step": 19409 + }, + { + "epoch": 2.3016720028459625, + "grad_norm": 1.0593699126365925, + "learning_rate": 2.01392227220159e-05, + "loss": 0.1523, + "step": 19410 + }, + { + "epoch": 2.3017905846080873, + "grad_norm": 0.7729142198656017, + "learning_rate": 2.013686819135117e-05, + "loss": 0.088, + "step": 19411 + }, + { + "epoch": 2.301909166370212, + "grad_norm": 0.7652837867341464, + "learning_rate": 2.0134513705518547e-05, + "loss": 0.0834, + "step": 19412 + }, + { + "epoch": 2.3020277481323372, + "grad_norm": 0.5286088537099453, + "learning_rate": 2.013215926453974e-05, + "loss": 0.0806, + "step": 19413 + }, + { + "epoch": 2.3021463298944624, + "grad_norm": 0.8927823380437614, + "learning_rate": 2.0129804868436463e-05, + "loss": 0.12, + "step": 19414 + }, + { + "epoch": 2.302264911656587, + "grad_norm": 0.7729137957577683, + "learning_rate": 2.0127450517230413e-05, + "loss": 0.1176, + "step": 19415 + }, + { + "epoch": 2.302383493418712, + "grad_norm": 0.5047947033245339, + "learning_rate": 2.012509621094328e-05, + "loss": 0.0637, + "step": 19416 + }, + { + "epoch": 2.302502075180837, + "grad_norm": 0.5972869187641996, + "learning_rate": 2.0122741949596797e-05, + "loss": 0.0836, + "step": 19417 + }, + { + "epoch": 2.3026206569429624, + "grad_norm": 0.5153627445155109, + "learning_rate": 2.0120387733212647e-05, + "loss": 0.0709, + "step": 19418 + }, + { + "epoch": 2.302739238705087, + "grad_norm": 0.5242397418858895, + "learning_rate": 2.0118033561812532e-05, + "loss": 0.0817, + "step": 19419 + }, + { + "epoch": 2.3028578204672123, + "grad_norm": 1.0243478575828784, + "learning_rate": 2.011567943541817e-05, + "loss": 0.1539, + "step": 19420 + }, + { + "epoch": 2.302976402229337, + "grad_norm": 0.6450694231047359, + "learning_rate": 2.011332535405126e-05, + "loss": 0.0842, + "step": 19421 + }, + { + "epoch": 2.3030949839914623, + "grad_norm": 0.5625627551443518, + "learning_rate": 2.0110971317733485e-05, + "loss": 0.0744, + "step": 19422 + }, + { + "epoch": 2.303213565753587, + "grad_norm": 1.1015133644573898, + "learning_rate": 2.010861732648656e-05, + "loss": 0.147, + "step": 19423 + }, + { + "epoch": 2.3033321475157122, + "grad_norm": 0.8166247211058721, + "learning_rate": 2.0106263380332186e-05, + "loss": 0.1348, + "step": 19424 + }, + { + "epoch": 2.303450729277837, + "grad_norm": 0.7481777340703266, + "learning_rate": 2.010390947929207e-05, + "loss": 0.0935, + "step": 19425 + }, + { + "epoch": 2.303569311039962, + "grad_norm": 0.656004992376425, + "learning_rate": 2.0101555623387902e-05, + "loss": 0.102, + "step": 19426 + }, + { + "epoch": 2.303687892802087, + "grad_norm": 0.7369271646673531, + "learning_rate": 2.0099201812641375e-05, + "loss": 0.0983, + "step": 19427 + }, + { + "epoch": 2.303806474564212, + "grad_norm": 0.751544507553673, + "learning_rate": 2.0096848047074207e-05, + "loss": 0.0907, + "step": 19428 + }, + { + "epoch": 2.303925056326337, + "grad_norm": 0.8796345881474645, + "learning_rate": 2.0094494326708088e-05, + "loss": 0.1373, + "step": 19429 + }, + { + "epoch": 2.304043638088462, + "grad_norm": 0.6482901845082302, + "learning_rate": 2.0092140651564712e-05, + "loss": 0.0839, + "step": 19430 + }, + { + "epoch": 2.304162219850587, + "grad_norm": 0.8263887947010895, + "learning_rate": 2.0089787021665778e-05, + "loss": 0.1266, + "step": 19431 + }, + { + "epoch": 2.304280801612712, + "grad_norm": 0.6051037436955741, + "learning_rate": 2.008743343703299e-05, + "loss": 0.0843, + "step": 19432 + }, + { + "epoch": 2.304399383374837, + "grad_norm": 0.6036957418789926, + "learning_rate": 2.0085079897688043e-05, + "loss": 0.0675, + "step": 19433 + }, + { + "epoch": 2.304517965136962, + "grad_norm": 0.7402950804968166, + "learning_rate": 2.008272640365262e-05, + "loss": 0.0852, + "step": 19434 + }, + { + "epoch": 2.304636546899087, + "grad_norm": 0.8162108572827044, + "learning_rate": 2.0080372954948446e-05, + "loss": 0.1047, + "step": 19435 + }, + { + "epoch": 2.304755128661212, + "grad_norm": 0.7433492840701581, + "learning_rate": 2.0078019551597198e-05, + "loss": 0.0862, + "step": 19436 + }, + { + "epoch": 2.3048737104233368, + "grad_norm": 0.6105505317058032, + "learning_rate": 2.0075666193620573e-05, + "loss": 0.0712, + "step": 19437 + }, + { + "epoch": 2.304992292185462, + "grad_norm": 0.7660922974364301, + "learning_rate": 2.0073312881040255e-05, + "loss": 0.0944, + "step": 19438 + }, + { + "epoch": 2.3051108739475867, + "grad_norm": 0.821975394552258, + "learning_rate": 2.0070959613877966e-05, + "loss": 0.0989, + "step": 19439 + }, + { + "epoch": 2.305229455709712, + "grad_norm": 0.8277305675160203, + "learning_rate": 2.0068606392155383e-05, + "loss": 0.1086, + "step": 19440 + }, + { + "epoch": 2.3053480374718367, + "grad_norm": 0.7460435929851807, + "learning_rate": 2.0066253215894196e-05, + "loss": 0.1258, + "step": 19441 + }, + { + "epoch": 2.305466619233962, + "grad_norm": 0.9109004690029164, + "learning_rate": 2.0063900085116104e-05, + "loss": 0.0999, + "step": 19442 + }, + { + "epoch": 2.3055852009960867, + "grad_norm": 0.47888685411980325, + "learning_rate": 2.00615469998428e-05, + "loss": 0.0726, + "step": 19443 + }, + { + "epoch": 2.305703782758212, + "grad_norm": 0.7052707637708844, + "learning_rate": 2.0059193960095986e-05, + "loss": 0.0749, + "step": 19444 + }, + { + "epoch": 2.3058223645203366, + "grad_norm": 0.8710302350129289, + "learning_rate": 2.005684096589734e-05, + "loss": 0.0895, + "step": 19445 + }, + { + "epoch": 2.305940946282462, + "grad_norm": 0.6981314003104832, + "learning_rate": 2.0054488017268545e-05, + "loss": 0.0773, + "step": 19446 + }, + { + "epoch": 2.3060595280445866, + "grad_norm": 0.8288291128569834, + "learning_rate": 2.0052135114231324e-05, + "loss": 0.1007, + "step": 19447 + }, + { + "epoch": 2.306178109806712, + "grad_norm": 0.9429552203417806, + "learning_rate": 2.004978225680735e-05, + "loss": 0.1384, + "step": 19448 + }, + { + "epoch": 2.3062966915688365, + "grad_norm": 0.5643267167430157, + "learning_rate": 2.0047429445018296e-05, + "loss": 0.0789, + "step": 19449 + }, + { + "epoch": 2.3064152733309617, + "grad_norm": 0.635706988858197, + "learning_rate": 2.0045076678885882e-05, + "loss": 0.08, + "step": 19450 + }, + { + "epoch": 2.3065338550930865, + "grad_norm": 0.8312213620431713, + "learning_rate": 2.0042723958431786e-05, + "loss": 0.1007, + "step": 19451 + }, + { + "epoch": 2.3066524368552117, + "grad_norm": 0.6120616342568947, + "learning_rate": 2.0040371283677693e-05, + "loss": 0.0875, + "step": 19452 + }, + { + "epoch": 2.3067710186173365, + "grad_norm": 0.6450283327682592, + "learning_rate": 2.003801865464529e-05, + "loss": 0.0834, + "step": 19453 + }, + { + "epoch": 2.3068896003794617, + "grad_norm": 0.8945899152545437, + "learning_rate": 2.0035666071356274e-05, + "loss": 0.1149, + "step": 19454 + }, + { + "epoch": 2.307008182141587, + "grad_norm": 0.6910211025401297, + "learning_rate": 2.0033313533832336e-05, + "loss": 0.0984, + "step": 19455 + }, + { + "epoch": 2.3071267639037116, + "grad_norm": 1.2470675877366493, + "learning_rate": 2.0030961042095154e-05, + "loss": 0.1984, + "step": 19456 + }, + { + "epoch": 2.3072453456658364, + "grad_norm": 0.651441747875577, + "learning_rate": 2.0028608596166405e-05, + "loss": 0.0751, + "step": 19457 + }, + { + "epoch": 2.3073639274279616, + "grad_norm": 0.5766297427246903, + "learning_rate": 2.0026256196067803e-05, + "loss": 0.086, + "step": 19458 + }, + { + "epoch": 2.307482509190087, + "grad_norm": 0.7825650915137339, + "learning_rate": 2.002390384182102e-05, + "loss": 0.1012, + "step": 19459 + }, + { + "epoch": 2.3076010909522116, + "grad_norm": 0.5179230206807838, + "learning_rate": 2.0021551533447724e-05, + "loss": 0.0582, + "step": 19460 + }, + { + "epoch": 2.3077196727143363, + "grad_norm": 0.8421475564730062, + "learning_rate": 2.0019199270969635e-05, + "loss": 0.1182, + "step": 19461 + }, + { + "epoch": 2.3078382544764615, + "grad_norm": 0.7663389612902954, + "learning_rate": 2.0016847054408412e-05, + "loss": 0.1021, + "step": 19462 + }, + { + "epoch": 2.3079568362385867, + "grad_norm": 0.7094299579522881, + "learning_rate": 2.0014494883785757e-05, + "loss": 0.087, + "step": 19463 + }, + { + "epoch": 2.3080754180007115, + "grad_norm": 1.0277254292117632, + "learning_rate": 2.0012142759123334e-05, + "loss": 0.1384, + "step": 19464 + }, + { + "epoch": 2.3081939997628362, + "grad_norm": 0.6419057075382641, + "learning_rate": 2.0009790680442846e-05, + "loss": 0.1063, + "step": 19465 + }, + { + "epoch": 2.3083125815249614, + "grad_norm": 0.591407046282619, + "learning_rate": 2.000743864776597e-05, + "loss": 0.0925, + "step": 19466 + }, + { + "epoch": 2.3084311632870866, + "grad_norm": 0.6204018941866264, + "learning_rate": 2.000508666111439e-05, + "loss": 0.0793, + "step": 19467 + }, + { + "epoch": 2.3085497450492114, + "grad_norm": 0.5998293286823643, + "learning_rate": 2.0002734720509776e-05, + "loss": 0.084, + "step": 19468 + }, + { + "epoch": 2.3086683268113366, + "grad_norm": 0.8942062966524026, + "learning_rate": 2.0000382825973828e-05, + "loss": 0.0921, + "step": 19469 + }, + { + "epoch": 2.3087869085734614, + "grad_norm": 1.147351945375658, + "learning_rate": 1.9998030977528218e-05, + "loss": 0.1823, + "step": 19470 + }, + { + "epoch": 2.3089054903355866, + "grad_norm": 0.6449766986417694, + "learning_rate": 1.9995679175194625e-05, + "loss": 0.0921, + "step": 19471 + }, + { + "epoch": 2.3090240720977113, + "grad_norm": 0.6412417895789546, + "learning_rate": 1.999332741899473e-05, + "loss": 0.0851, + "step": 19472 + }, + { + "epoch": 2.3091426538598365, + "grad_norm": 0.43901846870254496, + "learning_rate": 1.9990975708950223e-05, + "loss": 0.0585, + "step": 19473 + }, + { + "epoch": 2.3092612356219613, + "grad_norm": 0.6871945908823685, + "learning_rate": 1.9988624045082776e-05, + "loss": 0.1112, + "step": 19474 + }, + { + "epoch": 2.3093798173840865, + "grad_norm": 0.903528615036626, + "learning_rate": 1.9986272427414064e-05, + "loss": 0.1077, + "step": 19475 + }, + { + "epoch": 2.3094983991462112, + "grad_norm": 0.6870463688072702, + "learning_rate": 1.9983920855965782e-05, + "loss": 0.0811, + "step": 19476 + }, + { + "epoch": 2.3096169809083364, + "grad_norm": 0.6726087995500419, + "learning_rate": 1.9981569330759597e-05, + "loss": 0.1008, + "step": 19477 + }, + { + "epoch": 2.309735562670461, + "grad_norm": 0.5944697005781283, + "learning_rate": 1.997921785181719e-05, + "loss": 0.0802, + "step": 19478 + }, + { + "epoch": 2.3098541444325864, + "grad_norm": 0.7383816364442289, + "learning_rate": 1.9976866419160226e-05, + "loss": 0.0846, + "step": 19479 + }, + { + "epoch": 2.309972726194711, + "grad_norm": 0.5449047745015513, + "learning_rate": 1.9974515032810405e-05, + "loss": 0.068, + "step": 19480 + }, + { + "epoch": 2.3100913079568364, + "grad_norm": 0.7118929265370622, + "learning_rate": 1.9972163692789393e-05, + "loss": 0.1415, + "step": 19481 + }, + { + "epoch": 2.310209889718961, + "grad_norm": 0.6633309988168661, + "learning_rate": 1.9969812399118858e-05, + "loss": 0.0845, + "step": 19482 + }, + { + "epoch": 2.3103284714810863, + "grad_norm": 0.44887097952780264, + "learning_rate": 1.996746115182049e-05, + "loss": 0.0661, + "step": 19483 + }, + { + "epoch": 2.310447053243211, + "grad_norm": 0.48956358426121327, + "learning_rate": 1.9965109950915956e-05, + "loss": 0.071, + "step": 19484 + }, + { + "epoch": 2.3105656350053363, + "grad_norm": 0.8634225340731697, + "learning_rate": 1.996275879642694e-05, + "loss": 0.1245, + "step": 19485 + }, + { + "epoch": 2.310684216767461, + "grad_norm": 0.676481389162232, + "learning_rate": 1.9960407688375106e-05, + "loss": 0.0835, + "step": 19486 + }, + { + "epoch": 2.3108027985295863, + "grad_norm": 1.0712256586226838, + "learning_rate": 1.995805662678213e-05, + "loss": 0.1262, + "step": 19487 + }, + { + "epoch": 2.310921380291711, + "grad_norm": 0.7892003741933462, + "learning_rate": 1.9955705611669694e-05, + "loss": 0.0918, + "step": 19488 + }, + { + "epoch": 2.311039962053836, + "grad_norm": 0.9584174381622288, + "learning_rate": 1.995335464305947e-05, + "loss": 0.0969, + "step": 19489 + }, + { + "epoch": 2.311158543815961, + "grad_norm": 0.6658673835401892, + "learning_rate": 1.9951003720973117e-05, + "loss": 0.0812, + "step": 19490 + }, + { + "epoch": 2.311277125578086, + "grad_norm": 0.5894626174079831, + "learning_rate": 1.9948652845432326e-05, + "loss": 0.0849, + "step": 19491 + }, + { + "epoch": 2.311395707340211, + "grad_norm": 0.5959504993073242, + "learning_rate": 1.9946302016458756e-05, + "loss": 0.1015, + "step": 19492 + }, + { + "epoch": 2.311514289102336, + "grad_norm": 0.731612608034272, + "learning_rate": 1.9943951234074092e-05, + "loss": 0.0945, + "step": 19493 + }, + { + "epoch": 2.311632870864461, + "grad_norm": 0.5142237613350996, + "learning_rate": 1.9941600498299985e-05, + "loss": 0.0745, + "step": 19494 + }, + { + "epoch": 2.311751452626586, + "grad_norm": 0.6311016213909606, + "learning_rate": 1.993924980915813e-05, + "loss": 0.0836, + "step": 19495 + }, + { + "epoch": 2.311870034388711, + "grad_norm": 1.2131841800505143, + "learning_rate": 1.9936899166670188e-05, + "loss": 0.1118, + "step": 19496 + }, + { + "epoch": 2.311988616150836, + "grad_norm": 1.2261626087689872, + "learning_rate": 1.9934548570857823e-05, + "loss": 0.1698, + "step": 19497 + }, + { + "epoch": 2.312107197912961, + "grad_norm": 0.709818491577816, + "learning_rate": 1.99321980217427e-05, + "loss": 0.099, + "step": 19498 + }, + { + "epoch": 2.312225779675086, + "grad_norm": 1.19255819735611, + "learning_rate": 1.9929847519346507e-05, + "loss": 0.1361, + "step": 19499 + }, + { + "epoch": 2.312344361437211, + "grad_norm": 0.7024706404644304, + "learning_rate": 1.99274970636909e-05, + "loss": 0.0837, + "step": 19500 + }, + { + "epoch": 2.312462943199336, + "grad_norm": 1.0499534068280045, + "learning_rate": 1.992514665479755e-05, + "loss": 0.1274, + "step": 19501 + }, + { + "epoch": 2.3125815249614607, + "grad_norm": 0.64253994103003, + "learning_rate": 1.9922796292688116e-05, + "loss": 0.1061, + "step": 19502 + }, + { + "epoch": 2.312700106723586, + "grad_norm": 0.8490238846701407, + "learning_rate": 1.9920445977384284e-05, + "loss": 0.1202, + "step": 19503 + }, + { + "epoch": 2.3128186884857107, + "grad_norm": 0.512750930417355, + "learning_rate": 1.991809570890771e-05, + "loss": 0.0813, + "step": 19504 + }, + { + "epoch": 2.312937270247836, + "grad_norm": 0.4276462797389517, + "learning_rate": 1.9915745487280053e-05, + "loss": 0.0593, + "step": 19505 + }, + { + "epoch": 2.3130558520099607, + "grad_norm": 0.8384790174158365, + "learning_rate": 1.9913395312523e-05, + "loss": 0.1281, + "step": 19506 + }, + { + "epoch": 2.313174433772086, + "grad_norm": 1.1063189528515829, + "learning_rate": 1.99110451846582e-05, + "loss": 0.1278, + "step": 19507 + }, + { + "epoch": 2.313293015534211, + "grad_norm": 0.8046009780998737, + "learning_rate": 1.9908695103707327e-05, + "loss": 0.1046, + "step": 19508 + }, + { + "epoch": 2.313411597296336, + "grad_norm": 0.447946703767494, + "learning_rate": 1.990634506969203e-05, + "loss": 0.0722, + "step": 19509 + }, + { + "epoch": 2.3135301790584606, + "grad_norm": 0.7444092759381513, + "learning_rate": 1.9903995082633997e-05, + "loss": 0.0922, + "step": 19510 + }, + { + "epoch": 2.313648760820586, + "grad_norm": 0.9879504724016865, + "learning_rate": 1.990164514255487e-05, + "loss": 0.1272, + "step": 19511 + }, + { + "epoch": 2.313767342582711, + "grad_norm": 0.6344211996443779, + "learning_rate": 1.9899295249476337e-05, + "loss": 0.0951, + "step": 19512 + }, + { + "epoch": 2.3138859243448358, + "grad_norm": 0.5900483101671112, + "learning_rate": 1.989694540342003e-05, + "loss": 0.0767, + "step": 19513 + }, + { + "epoch": 2.3140045061069605, + "grad_norm": 0.5377512623883821, + "learning_rate": 1.989459560440764e-05, + "loss": 0.0534, + "step": 19514 + }, + { + "epoch": 2.3141230878690857, + "grad_norm": 0.7103578712142256, + "learning_rate": 1.9892245852460818e-05, + "loss": 0.0918, + "step": 19515 + }, + { + "epoch": 2.314241669631211, + "grad_norm": 0.8039556893095237, + "learning_rate": 1.9889896147601227e-05, + "loss": 0.095, + "step": 19516 + }, + { + "epoch": 2.3143602513933357, + "grad_norm": 0.7845241147483656, + "learning_rate": 1.9887546489850515e-05, + "loss": 0.1065, + "step": 19517 + }, + { + "epoch": 2.314478833155461, + "grad_norm": 0.8441020564562346, + "learning_rate": 1.9885196879230367e-05, + "loss": 0.1262, + "step": 19518 + }, + { + "epoch": 2.3145974149175856, + "grad_norm": 0.8188597771379673, + "learning_rate": 1.988284731576243e-05, + "loss": 0.0847, + "step": 19519 + }, + { + "epoch": 2.314715996679711, + "grad_norm": 0.5769080881484961, + "learning_rate": 1.9880497799468357e-05, + "loss": 0.0719, + "step": 19520 + }, + { + "epoch": 2.3148345784418356, + "grad_norm": 0.9425391382660115, + "learning_rate": 1.9878148330369826e-05, + "loss": 0.1239, + "step": 19521 + }, + { + "epoch": 2.314953160203961, + "grad_norm": 0.6535475198077045, + "learning_rate": 1.987579890848848e-05, + "loss": 0.0764, + "step": 19522 + }, + { + "epoch": 2.3150717419660856, + "grad_norm": 0.6450343843113554, + "learning_rate": 1.987344953384599e-05, + "loss": 0.0978, + "step": 19523 + }, + { + "epoch": 2.3151903237282108, + "grad_norm": 0.6999815375798448, + "learning_rate": 1.9871100206464e-05, + "loss": 0.102, + "step": 19524 + }, + { + "epoch": 2.3153089054903355, + "grad_norm": 0.9815367797233678, + "learning_rate": 1.9868750926364185e-05, + "loss": 0.0998, + "step": 19525 + }, + { + "epoch": 2.3154274872524607, + "grad_norm": 1.1408641534879014, + "learning_rate": 1.9866401693568195e-05, + "loss": 0.1292, + "step": 19526 + }, + { + "epoch": 2.3155460690145855, + "grad_norm": 0.8075052508368438, + "learning_rate": 1.9864052508097684e-05, + "loss": 0.0804, + "step": 19527 + }, + { + "epoch": 2.3156646507767107, + "grad_norm": 0.8112218612023014, + "learning_rate": 1.9861703369974306e-05, + "loss": 0.0784, + "step": 19528 + }, + { + "epoch": 2.3157832325388354, + "grad_norm": 1.040371198222734, + "learning_rate": 1.9859354279219728e-05, + "loss": 0.1204, + "step": 19529 + }, + { + "epoch": 2.3159018143009606, + "grad_norm": 1.0551748277403288, + "learning_rate": 1.98570052358556e-05, + "loss": 0.1203, + "step": 19530 + }, + { + "epoch": 2.3160203960630854, + "grad_norm": 0.7034080004619069, + "learning_rate": 1.9854656239903574e-05, + "loss": 0.0783, + "step": 19531 + }, + { + "epoch": 2.3161389778252106, + "grad_norm": 0.7760391319543551, + "learning_rate": 1.9852307291385306e-05, + "loss": 0.0825, + "step": 19532 + }, + { + "epoch": 2.3162575595873354, + "grad_norm": 0.812389387933463, + "learning_rate": 1.984995839032245e-05, + "loss": 0.106, + "step": 19533 + }, + { + "epoch": 2.3163761413494606, + "grad_norm": 1.1819771022325625, + "learning_rate": 1.984760953673667e-05, + "loss": 0.1921, + "step": 19534 + }, + { + "epoch": 2.3164947231115853, + "grad_norm": 0.6942867081654149, + "learning_rate": 1.9845260730649602e-05, + "loss": 0.0933, + "step": 19535 + }, + { + "epoch": 2.3166133048737105, + "grad_norm": 0.6423297314158021, + "learning_rate": 1.984291197208292e-05, + "loss": 0.0657, + "step": 19536 + }, + { + "epoch": 2.3167318866358353, + "grad_norm": 1.3269219865232027, + "learning_rate": 1.9840563261058265e-05, + "loss": 0.1905, + "step": 19537 + }, + { + "epoch": 2.3168504683979605, + "grad_norm": 0.8632993360116618, + "learning_rate": 1.9838214597597287e-05, + "loss": 0.0887, + "step": 19538 + }, + { + "epoch": 2.3169690501600853, + "grad_norm": 0.6089540378495435, + "learning_rate": 1.9835865981721633e-05, + "loss": 0.0844, + "step": 19539 + }, + { + "epoch": 2.3170876319222105, + "grad_norm": 0.7190937806357365, + "learning_rate": 1.983351741345297e-05, + "loss": 0.0986, + "step": 19540 + }, + { + "epoch": 2.317206213684335, + "grad_norm": 0.7009237675292992, + "learning_rate": 1.9831168892812937e-05, + "loss": 0.0973, + "step": 19541 + }, + { + "epoch": 2.3173247954464604, + "grad_norm": 0.7459161027002865, + "learning_rate": 1.9828820419823197e-05, + "loss": 0.0993, + "step": 19542 + }, + { + "epoch": 2.317443377208585, + "grad_norm": 0.8311516293764057, + "learning_rate": 1.9826471994505376e-05, + "loss": 0.107, + "step": 19543 + }, + { + "epoch": 2.3175619589707104, + "grad_norm": 0.707807650933055, + "learning_rate": 1.982412361688115e-05, + "loss": 0.1113, + "step": 19544 + }, + { + "epoch": 2.317680540732835, + "grad_norm": 0.7639304983171717, + "learning_rate": 1.982177528697216e-05, + "loss": 0.0995, + "step": 19545 + }, + { + "epoch": 2.3177991224949603, + "grad_norm": 0.6340470964965855, + "learning_rate": 1.981942700480004e-05, + "loss": 0.0734, + "step": 19546 + }, + { + "epoch": 2.317917704257085, + "grad_norm": 0.7944096592372047, + "learning_rate": 1.981707877038646e-05, + "loss": 0.1063, + "step": 19547 + }, + { + "epoch": 2.3180362860192103, + "grad_norm": 0.6045125509221941, + "learning_rate": 1.9814730583753058e-05, + "loss": 0.0722, + "step": 19548 + }, + { + "epoch": 2.318154867781335, + "grad_norm": 0.853854824177437, + "learning_rate": 1.9812382444921482e-05, + "loss": 0.0878, + "step": 19549 + }, + { + "epoch": 2.3182734495434603, + "grad_norm": 0.5882713606520377, + "learning_rate": 1.981003435391337e-05, + "loss": 0.0833, + "step": 19550 + }, + { + "epoch": 2.318392031305585, + "grad_norm": 0.6006016191154062, + "learning_rate": 1.9807686310750383e-05, + "loss": 0.0914, + "step": 19551 + }, + { + "epoch": 2.3185106130677102, + "grad_norm": 0.6351846623092647, + "learning_rate": 1.9805338315454157e-05, + "loss": 0.0894, + "step": 19552 + }, + { + "epoch": 2.318629194829835, + "grad_norm": 0.6013585188866459, + "learning_rate": 1.980299036804635e-05, + "loss": 0.0671, + "step": 19553 + }, + { + "epoch": 2.31874777659196, + "grad_norm": 0.7852954857359877, + "learning_rate": 1.9800642468548587e-05, + "loss": 0.1047, + "step": 19554 + }, + { + "epoch": 2.318866358354085, + "grad_norm": 0.9450697516809748, + "learning_rate": 1.9798294616982533e-05, + "loss": 0.106, + "step": 19555 + }, + { + "epoch": 2.31898494011621, + "grad_norm": 0.8812428539388657, + "learning_rate": 1.9795946813369826e-05, + "loss": 0.1348, + "step": 19556 + }, + { + "epoch": 2.3191035218783353, + "grad_norm": 0.9691708442322058, + "learning_rate": 1.9793599057732102e-05, + "loss": 0.1502, + "step": 19557 + }, + { + "epoch": 2.31922210364046, + "grad_norm": 0.9950858566419218, + "learning_rate": 1.979125135009101e-05, + "loss": 0.1181, + "step": 19558 + }, + { + "epoch": 2.319340685402585, + "grad_norm": 0.7778314961196723, + "learning_rate": 1.97889036904682e-05, + "loss": 0.1314, + "step": 19559 + }, + { + "epoch": 2.31945926716471, + "grad_norm": 1.0055452006417973, + "learning_rate": 1.9786556078885297e-05, + "loss": 0.1705, + "step": 19560 + }, + { + "epoch": 2.3195778489268353, + "grad_norm": 0.541504628145367, + "learning_rate": 1.9784208515363957e-05, + "loss": 0.0793, + "step": 19561 + }, + { + "epoch": 2.31969643068896, + "grad_norm": 0.6604952245766792, + "learning_rate": 1.9781860999925818e-05, + "loss": 0.1047, + "step": 19562 + }, + { + "epoch": 2.319815012451085, + "grad_norm": 0.6218952823831445, + "learning_rate": 1.9779513532592526e-05, + "loss": 0.0838, + "step": 19563 + }, + { + "epoch": 2.31993359421321, + "grad_norm": 0.6726587400585863, + "learning_rate": 1.9777166113385722e-05, + "loss": 0.0897, + "step": 19564 + }, + { + "epoch": 2.320052175975335, + "grad_norm": 0.7331279651465841, + "learning_rate": 1.977481874232703e-05, + "loss": 0.1069, + "step": 19565 + }, + { + "epoch": 2.32017075773746, + "grad_norm": 0.7522635274520322, + "learning_rate": 1.977247141943811e-05, + "loss": 0.103, + "step": 19566 + }, + { + "epoch": 2.320289339499585, + "grad_norm": 1.0909727646435528, + "learning_rate": 1.977012414474059e-05, + "loss": 0.1239, + "step": 19567 + }, + { + "epoch": 2.32040792126171, + "grad_norm": 0.9385352698033184, + "learning_rate": 1.976777691825612e-05, + "loss": 0.1139, + "step": 19568 + }, + { + "epoch": 2.320526503023835, + "grad_norm": 0.6316216311924798, + "learning_rate": 1.9765429740006314e-05, + "loss": 0.0795, + "step": 19569 + }, + { + "epoch": 2.32064508478596, + "grad_norm": 0.7070766735745233, + "learning_rate": 1.976308261001284e-05, + "loss": 0.1063, + "step": 19570 + }, + { + "epoch": 2.320763666548085, + "grad_norm": 0.651622763099539, + "learning_rate": 1.9760735528297317e-05, + "loss": 0.0878, + "step": 19571 + }, + { + "epoch": 2.32088224831021, + "grad_norm": 0.7984324501934332, + "learning_rate": 1.9758388494881392e-05, + "loss": 0.1148, + "step": 19572 + }, + { + "epoch": 2.321000830072335, + "grad_norm": 0.8143879075953584, + "learning_rate": 1.975604150978669e-05, + "loss": 0.1058, + "step": 19573 + }, + { + "epoch": 2.32111941183446, + "grad_norm": 0.6316095463962412, + "learning_rate": 1.975369457303486e-05, + "loss": 0.0737, + "step": 19574 + }, + { + "epoch": 2.321237993596585, + "grad_norm": 0.6648258564808079, + "learning_rate": 1.9751347684647533e-05, + "loss": 0.075, + "step": 19575 + }, + { + "epoch": 2.3213565753587098, + "grad_norm": 0.618418146743983, + "learning_rate": 1.9749000844646337e-05, + "loss": 0.1091, + "step": 19576 + }, + { + "epoch": 2.321475157120835, + "grad_norm": 0.5645326368080992, + "learning_rate": 1.9746654053052922e-05, + "loss": 0.0663, + "step": 19577 + }, + { + "epoch": 2.3215937388829597, + "grad_norm": 0.9819120024066299, + "learning_rate": 1.9744307309888914e-05, + "loss": 0.1465, + "step": 19578 + }, + { + "epoch": 2.321712320645085, + "grad_norm": 0.6507165978905625, + "learning_rate": 1.9741960615175952e-05, + "loss": 0.1037, + "step": 19579 + }, + { + "epoch": 2.3218309024072097, + "grad_norm": 0.5382783820634356, + "learning_rate": 1.973961396893565e-05, + "loss": 0.0815, + "step": 19580 + }, + { + "epoch": 2.321949484169335, + "grad_norm": 0.6600320733273624, + "learning_rate": 1.9737267371189673e-05, + "loss": 0.0713, + "step": 19581 + }, + { + "epoch": 2.3220680659314596, + "grad_norm": 0.9607757608438607, + "learning_rate": 1.9734920821959625e-05, + "loss": 0.1592, + "step": 19582 + }, + { + "epoch": 2.322186647693585, + "grad_norm": 0.8226844498073277, + "learning_rate": 1.973257432126716e-05, + "loss": 0.1218, + "step": 19583 + }, + { + "epoch": 2.3223052294557096, + "grad_norm": 0.9862022264668764, + "learning_rate": 1.9730227869133887e-05, + "loss": 0.1227, + "step": 19584 + }, + { + "epoch": 2.322423811217835, + "grad_norm": 0.4100522886855501, + "learning_rate": 1.9727881465581464e-05, + "loss": 0.0637, + "step": 19585 + }, + { + "epoch": 2.3225423929799596, + "grad_norm": 0.6752619136160796, + "learning_rate": 1.9725535110631507e-05, + "loss": 0.115, + "step": 19586 + }, + { + "epoch": 2.3226609747420848, + "grad_norm": 0.8271875797107203, + "learning_rate": 1.972318880430565e-05, + "loss": 0.1032, + "step": 19587 + }, + { + "epoch": 2.3227795565042095, + "grad_norm": 0.44179272657253565, + "learning_rate": 1.972084254662551e-05, + "loss": 0.0731, + "step": 19588 + }, + { + "epoch": 2.3228981382663347, + "grad_norm": 0.6720423138232527, + "learning_rate": 1.9718496337612737e-05, + "loss": 0.0864, + "step": 19589 + }, + { + "epoch": 2.3230167200284595, + "grad_norm": 0.6489550988701074, + "learning_rate": 1.971615017728895e-05, + "loss": 0.0856, + "step": 19590 + }, + { + "epoch": 2.3231353017905847, + "grad_norm": 0.754656323958397, + "learning_rate": 1.9713804065675775e-05, + "loss": 0.0964, + "step": 19591 + }, + { + "epoch": 2.3232538835527095, + "grad_norm": 0.6024092987410897, + "learning_rate": 1.9711458002794844e-05, + "loss": 0.098, + "step": 19592 + }, + { + "epoch": 2.3233724653148347, + "grad_norm": 0.9505005202994333, + "learning_rate": 1.9709111988667794e-05, + "loss": 0.1371, + "step": 19593 + }, + { + "epoch": 2.3234910470769594, + "grad_norm": 0.5984828937950394, + "learning_rate": 1.9706766023316238e-05, + "loss": 0.0894, + "step": 19594 + }, + { + "epoch": 2.3236096288390846, + "grad_norm": 1.201781102426866, + "learning_rate": 1.9704420106761803e-05, + "loss": 0.1407, + "step": 19595 + }, + { + "epoch": 2.3237282106012094, + "grad_norm": 0.5586530100470325, + "learning_rate": 1.970207423902613e-05, + "loss": 0.0962, + "step": 19596 + }, + { + "epoch": 2.3238467923633346, + "grad_norm": 0.8388997351133879, + "learning_rate": 1.9699728420130833e-05, + "loss": 0.1178, + "step": 19597 + }, + { + "epoch": 2.3239653741254593, + "grad_norm": 0.6799894863401414, + "learning_rate": 1.9697382650097543e-05, + "loss": 0.0713, + "step": 19598 + }, + { + "epoch": 2.3240839558875845, + "grad_norm": 0.734462881266322, + "learning_rate": 1.9695036928947873e-05, + "loss": 0.109, + "step": 19599 + }, + { + "epoch": 2.3242025376497093, + "grad_norm": 0.8916128202521879, + "learning_rate": 1.9692691256703463e-05, + "loss": 0.1293, + "step": 19600 + }, + { + "epoch": 2.3243211194118345, + "grad_norm": 0.5509371283389083, + "learning_rate": 1.969034563338593e-05, + "loss": 0.0956, + "step": 19601 + }, + { + "epoch": 2.3244397011739593, + "grad_norm": 0.691293193765222, + "learning_rate": 1.9688000059016904e-05, + "loss": 0.0867, + "step": 19602 + }, + { + "epoch": 2.3245582829360845, + "grad_norm": 0.5227752425732899, + "learning_rate": 1.968565453361799e-05, + "loss": 0.0781, + "step": 19603 + }, + { + "epoch": 2.3246768646982092, + "grad_norm": 0.7314106253762587, + "learning_rate": 1.9683309057210838e-05, + "loss": 0.0911, + "step": 19604 + }, + { + "epoch": 2.3247954464603344, + "grad_norm": 0.7422039010714777, + "learning_rate": 1.9680963629817056e-05, + "loss": 0.088, + "step": 19605 + }, + { + "epoch": 2.3249140282224596, + "grad_norm": 0.5386597718884868, + "learning_rate": 1.9678618251458255e-05, + "loss": 0.08, + "step": 19606 + }, + { + "epoch": 2.3250326099845844, + "grad_norm": 0.9260304925510692, + "learning_rate": 1.967627292215608e-05, + "loss": 0.1453, + "step": 19607 + }, + { + "epoch": 2.325151191746709, + "grad_norm": 0.6727558626817153, + "learning_rate": 1.9673927641932137e-05, + "loss": 0.0932, + "step": 19608 + }, + { + "epoch": 2.3252697735088343, + "grad_norm": 0.8060927853960526, + "learning_rate": 1.9671582410808056e-05, + "loss": 0.1048, + "step": 19609 + }, + { + "epoch": 2.3253883552709596, + "grad_norm": 0.7238883464192284, + "learning_rate": 1.9669237228805438e-05, + "loss": 0.082, + "step": 19610 + }, + { + "epoch": 2.3255069370330843, + "grad_norm": 0.7003090385631497, + "learning_rate": 1.9666892095945924e-05, + "loss": 0.083, + "step": 19611 + }, + { + "epoch": 2.325625518795209, + "grad_norm": 0.8737299823262829, + "learning_rate": 1.9664547012251122e-05, + "loss": 0.1111, + "step": 19612 + }, + { + "epoch": 2.3257441005573343, + "grad_norm": 0.7012433779526921, + "learning_rate": 1.966220197774266e-05, + "loss": 0.114, + "step": 19613 + }, + { + "epoch": 2.3258626823194595, + "grad_norm": 0.7569027016007822, + "learning_rate": 1.9659856992442135e-05, + "loss": 0.0978, + "step": 19614 + }, + { + "epoch": 2.3259812640815842, + "grad_norm": 0.9514026626406528, + "learning_rate": 1.9657512056371193e-05, + "loss": 0.1157, + "step": 19615 + }, + { + "epoch": 2.326099845843709, + "grad_norm": 1.2791249210001723, + "learning_rate": 1.9655167169551443e-05, + "loss": 0.1262, + "step": 19616 + }, + { + "epoch": 2.326218427605834, + "grad_norm": 0.8003484174904755, + "learning_rate": 1.965282233200449e-05, + "loss": 0.0937, + "step": 19617 + }, + { + "epoch": 2.3263370093679594, + "grad_norm": 1.0696019656946385, + "learning_rate": 1.965047754375195e-05, + "loss": 0.123, + "step": 19618 + }, + { + "epoch": 2.326455591130084, + "grad_norm": 0.465430307793162, + "learning_rate": 1.964813280481546e-05, + "loss": 0.0748, + "step": 19619 + }, + { + "epoch": 2.3265741728922094, + "grad_norm": 0.5700660865085883, + "learning_rate": 1.9645788115216614e-05, + "loss": 0.0931, + "step": 19620 + }, + { + "epoch": 2.326692754654334, + "grad_norm": 0.6136573610865601, + "learning_rate": 1.964344347497704e-05, + "loss": 0.0935, + "step": 19621 + }, + { + "epoch": 2.3268113364164593, + "grad_norm": 0.623024883853895, + "learning_rate": 1.9641098884118345e-05, + "loss": 0.0765, + "step": 19622 + }, + { + "epoch": 2.326929918178584, + "grad_norm": 0.8577286996179616, + "learning_rate": 1.963875434266215e-05, + "loss": 0.1103, + "step": 19623 + }, + { + "epoch": 2.3270484999407093, + "grad_norm": 0.7425164041624043, + "learning_rate": 1.963640985063007e-05, + "loss": 0.1037, + "step": 19624 + }, + { + "epoch": 2.327167081702834, + "grad_norm": 0.6984518078082514, + "learning_rate": 1.9634065408043706e-05, + "loss": 0.095, + "step": 19625 + }, + { + "epoch": 2.3272856634649592, + "grad_norm": 0.7388524919515016, + "learning_rate": 1.9631721014924684e-05, + "loss": 0.11, + "step": 19626 + }, + { + "epoch": 2.327404245227084, + "grad_norm": 0.6444896473936335, + "learning_rate": 1.9629376671294614e-05, + "loss": 0.0849, + "step": 19627 + }, + { + "epoch": 2.327522826989209, + "grad_norm": 0.9125596871168674, + "learning_rate": 1.9627032377175102e-05, + "loss": 0.127, + "step": 19628 + }, + { + "epoch": 2.327641408751334, + "grad_norm": 0.6091693328749415, + "learning_rate": 1.9624688132587754e-05, + "loss": 0.0737, + "step": 19629 + }, + { + "epoch": 2.327759990513459, + "grad_norm": 0.4682559415356481, + "learning_rate": 1.96223439375542e-05, + "loss": 0.061, + "step": 19630 + }, + { + "epoch": 2.327878572275584, + "grad_norm": 0.7655616942254497, + "learning_rate": 1.9619999792096036e-05, + "loss": 0.0879, + "step": 19631 + }, + { + "epoch": 2.327997154037709, + "grad_norm": 0.6616630555046017, + "learning_rate": 1.9617655696234876e-05, + "loss": 0.107, + "step": 19632 + }, + { + "epoch": 2.328115735799834, + "grad_norm": 1.103396690661148, + "learning_rate": 1.961531164999233e-05, + "loss": 0.2043, + "step": 19633 + }, + { + "epoch": 2.328234317561959, + "grad_norm": 0.7656191004064901, + "learning_rate": 1.9612967653390014e-05, + "loss": 0.0997, + "step": 19634 + }, + { + "epoch": 2.328352899324084, + "grad_norm": 0.7725198295403711, + "learning_rate": 1.9610623706449528e-05, + "loss": 0.1036, + "step": 19635 + }, + { + "epoch": 2.328471481086209, + "grad_norm": 0.6879006644903779, + "learning_rate": 1.9608279809192475e-05, + "loss": 0.0698, + "step": 19636 + }, + { + "epoch": 2.328590062848334, + "grad_norm": 0.6136471258780314, + "learning_rate": 1.9605935961640476e-05, + "loss": 0.0834, + "step": 19637 + }, + { + "epoch": 2.328708644610459, + "grad_norm": 0.751937081289528, + "learning_rate": 1.9603592163815137e-05, + "loss": 0.1065, + "step": 19638 + }, + { + "epoch": 2.3288272263725838, + "grad_norm": 0.5815452714196307, + "learning_rate": 1.9601248415738054e-05, + "loss": 0.0944, + "step": 19639 + }, + { + "epoch": 2.328945808134709, + "grad_norm": 0.9153912070454847, + "learning_rate": 1.9598904717430842e-05, + "loss": 0.1436, + "step": 19640 + }, + { + "epoch": 2.3290643898968337, + "grad_norm": 0.7490436463238564, + "learning_rate": 1.9596561068915103e-05, + "loss": 0.1169, + "step": 19641 + }, + { + "epoch": 2.329182971658959, + "grad_norm": 0.716307560085018, + "learning_rate": 1.959421747021245e-05, + "loss": 0.0911, + "step": 19642 + }, + { + "epoch": 2.3293015534210837, + "grad_norm": 0.5933023454175723, + "learning_rate": 1.9591873921344483e-05, + "loss": 0.084, + "step": 19643 + }, + { + "epoch": 2.329420135183209, + "grad_norm": 0.7205943000590765, + "learning_rate": 1.9589530422332797e-05, + "loss": 0.1094, + "step": 19644 + }, + { + "epoch": 2.3295387169453337, + "grad_norm": 0.6682982771411663, + "learning_rate": 1.958718697319902e-05, + "loss": 0.1025, + "step": 19645 + }, + { + "epoch": 2.329657298707459, + "grad_norm": 0.9048478487967007, + "learning_rate": 1.9584843573964735e-05, + "loss": 0.1116, + "step": 19646 + }, + { + "epoch": 2.3297758804695836, + "grad_norm": 0.7393912062264846, + "learning_rate": 1.9582500224651543e-05, + "loss": 0.1067, + "step": 19647 + }, + { + "epoch": 2.329894462231709, + "grad_norm": 0.8093916872243289, + "learning_rate": 1.9580156925281067e-05, + "loss": 0.1233, + "step": 19648 + }, + { + "epoch": 2.3300130439938336, + "grad_norm": 0.4508682903165856, + "learning_rate": 1.9577813675874897e-05, + "loss": 0.0592, + "step": 19649 + }, + { + "epoch": 2.330131625755959, + "grad_norm": 0.8673545628162307, + "learning_rate": 1.9575470476454634e-05, + "loss": 0.0999, + "step": 19650 + }, + { + "epoch": 2.3302502075180835, + "grad_norm": 0.6487028212588282, + "learning_rate": 1.9573127327041875e-05, + "loss": 0.0951, + "step": 19651 + }, + { + "epoch": 2.3303687892802087, + "grad_norm": 0.8081895582119544, + "learning_rate": 1.9570784227658233e-05, + "loss": 0.1246, + "step": 19652 + }, + { + "epoch": 2.3304873710423335, + "grad_norm": 0.8685687986685234, + "learning_rate": 1.9568441178325304e-05, + "loss": 0.0946, + "step": 19653 + }, + { + "epoch": 2.3306059528044587, + "grad_norm": 1.308034299292033, + "learning_rate": 1.9566098179064686e-05, + "loss": 0.1625, + "step": 19654 + }, + { + "epoch": 2.330724534566584, + "grad_norm": 0.5762780804481167, + "learning_rate": 1.9563755229897973e-05, + "loss": 0.0978, + "step": 19655 + }, + { + "epoch": 2.3308431163287087, + "grad_norm": 0.6615979346505892, + "learning_rate": 1.9561412330846778e-05, + "loss": 0.0827, + "step": 19656 + }, + { + "epoch": 2.3309616980908334, + "grad_norm": 0.8439458520280403, + "learning_rate": 1.9559069481932696e-05, + "loss": 0.1164, + "step": 19657 + }, + { + "epoch": 2.3310802798529586, + "grad_norm": 0.6022588170968688, + "learning_rate": 1.9556726683177317e-05, + "loss": 0.0834, + "step": 19658 + }, + { + "epoch": 2.331198861615084, + "grad_norm": 0.7771491188197321, + "learning_rate": 1.9554383934602233e-05, + "loss": 0.0942, + "step": 19659 + }, + { + "epoch": 2.3313174433772086, + "grad_norm": 0.661049339849892, + "learning_rate": 1.9552041236229063e-05, + "loss": 0.0868, + "step": 19660 + }, + { + "epoch": 2.3314360251393333, + "grad_norm": 0.7660036541731323, + "learning_rate": 1.9549698588079385e-05, + "loss": 0.0995, + "step": 19661 + }, + { + "epoch": 2.3315546069014585, + "grad_norm": 0.546759546632222, + "learning_rate": 1.9547355990174802e-05, + "loss": 0.0888, + "step": 19662 + }, + { + "epoch": 2.3316731886635838, + "grad_norm": 0.6276902736150629, + "learning_rate": 1.9545013442536912e-05, + "loss": 0.0668, + "step": 19663 + }, + { + "epoch": 2.3317917704257085, + "grad_norm": 0.5222905331930124, + "learning_rate": 1.9542670945187318e-05, + "loss": 0.0673, + "step": 19664 + }, + { + "epoch": 2.3319103521878333, + "grad_norm": 0.6095844730792853, + "learning_rate": 1.95403284981476e-05, + "loss": 0.0756, + "step": 19665 + }, + { + "epoch": 2.3320289339499585, + "grad_norm": 0.8351580165020239, + "learning_rate": 1.953798610143935e-05, + "loss": 0.1133, + "step": 19666 + }, + { + "epoch": 2.3321475157120837, + "grad_norm": 0.9468603894207163, + "learning_rate": 1.953564375508418e-05, + "loss": 0.1199, + "step": 19667 + }, + { + "epoch": 2.3322660974742084, + "grad_norm": 0.7985714177422902, + "learning_rate": 1.9533301459103676e-05, + "loss": 0.1079, + "step": 19668 + }, + { + "epoch": 2.3323846792363336, + "grad_norm": 0.9678303791845877, + "learning_rate": 1.953095921351942e-05, + "loss": 0.1162, + "step": 19669 + }, + { + "epoch": 2.3325032609984584, + "grad_norm": 0.5527903826354482, + "learning_rate": 1.9528617018353016e-05, + "loss": 0.0737, + "step": 19670 + }, + { + "epoch": 2.3326218427605836, + "grad_norm": 1.062955385549872, + "learning_rate": 1.952627487362605e-05, + "loss": 0.1211, + "step": 19671 + }, + { + "epoch": 2.3327404245227084, + "grad_norm": 0.6674053674517046, + "learning_rate": 1.9523932779360126e-05, + "loss": 0.099, + "step": 19672 + }, + { + "epoch": 2.3328590062848336, + "grad_norm": 0.9679840305458146, + "learning_rate": 1.9521590735576824e-05, + "loss": 0.1375, + "step": 19673 + }, + { + "epoch": 2.3329775880469583, + "grad_norm": 0.7046136398564593, + "learning_rate": 1.951924874229773e-05, + "loss": 0.0792, + "step": 19674 + }, + { + "epoch": 2.3330961698090835, + "grad_norm": 0.6265409137721989, + "learning_rate": 1.951690679954445e-05, + "loss": 0.1119, + "step": 19675 + }, + { + "epoch": 2.3332147515712083, + "grad_norm": 0.7679439433878709, + "learning_rate": 1.951456490733857e-05, + "loss": 0.0928, + "step": 19676 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.6046103274596862, + "learning_rate": 1.9512223065701658e-05, + "loss": 0.1045, + "step": 19677 + }, + { + "epoch": 2.3334519150954582, + "grad_norm": 0.5452435691395431, + "learning_rate": 1.9509881274655335e-05, + "loss": 0.0649, + "step": 19678 + }, + { + "epoch": 2.3335704968575834, + "grad_norm": 0.7194761730955362, + "learning_rate": 1.950753953422117e-05, + "loss": 0.1072, + "step": 19679 + }, + { + "epoch": 2.333689078619708, + "grad_norm": 0.7047804862838191, + "learning_rate": 1.950519784442075e-05, + "loss": 0.0791, + "step": 19680 + }, + { + "epoch": 2.3338076603818334, + "grad_norm": 0.6057165150684416, + "learning_rate": 1.9502856205275672e-05, + "loss": 0.0824, + "step": 19681 + }, + { + "epoch": 2.333926242143958, + "grad_norm": 0.6418007828358009, + "learning_rate": 1.9500514616807518e-05, + "loss": 0.0741, + "step": 19682 + }, + { + "epoch": 2.3340448239060834, + "grad_norm": 0.5978403637457337, + "learning_rate": 1.9498173079037876e-05, + "loss": 0.076, + "step": 19683 + }, + { + "epoch": 2.334163405668208, + "grad_norm": 0.5283480307554714, + "learning_rate": 1.9495831591988334e-05, + "loss": 0.0744, + "step": 19684 + }, + { + "epoch": 2.3342819874303333, + "grad_norm": 0.9003501111134286, + "learning_rate": 1.9493490155680466e-05, + "loss": 0.1163, + "step": 19685 + }, + { + "epoch": 2.334400569192458, + "grad_norm": 0.5050102995885705, + "learning_rate": 1.9491148770135876e-05, + "loss": 0.0671, + "step": 19686 + }, + { + "epoch": 2.3345191509545833, + "grad_norm": 0.6631994156165885, + "learning_rate": 1.948880743537614e-05, + "loss": 0.0883, + "step": 19687 + }, + { + "epoch": 2.334637732716708, + "grad_norm": 0.9061859891115267, + "learning_rate": 1.9486466151422834e-05, + "loss": 0.1279, + "step": 19688 + }, + { + "epoch": 2.3347563144788333, + "grad_norm": 0.5468868359407588, + "learning_rate": 1.9484124918297547e-05, + "loss": 0.0733, + "step": 19689 + }, + { + "epoch": 2.334874896240958, + "grad_norm": 0.8104339970184976, + "learning_rate": 1.948178373602187e-05, + "loss": 0.1272, + "step": 19690 + }, + { + "epoch": 2.334993478003083, + "grad_norm": 0.6025015903654741, + "learning_rate": 1.9479442604617383e-05, + "loss": 0.0749, + "step": 19691 + }, + { + "epoch": 2.335112059765208, + "grad_norm": 0.5440059701386678, + "learning_rate": 1.947710152410565e-05, + "loss": 0.079, + "step": 19692 + }, + { + "epoch": 2.335230641527333, + "grad_norm": 0.7751655072712493, + "learning_rate": 1.9474760494508286e-05, + "loss": 0.1283, + "step": 19693 + }, + { + "epoch": 2.335349223289458, + "grad_norm": 0.7328838140024185, + "learning_rate": 1.947241951584685e-05, + "loss": 0.1161, + "step": 19694 + }, + { + "epoch": 2.335467805051583, + "grad_norm": 0.7167600048414362, + "learning_rate": 1.947007858814293e-05, + "loss": 0.1063, + "step": 19695 + }, + { + "epoch": 2.335586386813708, + "grad_norm": 0.7071499132477514, + "learning_rate": 1.9467737711418095e-05, + "loss": 0.0953, + "step": 19696 + }, + { + "epoch": 2.335704968575833, + "grad_norm": 0.6963646487515647, + "learning_rate": 1.946539688569394e-05, + "loss": 0.0826, + "step": 19697 + }, + { + "epoch": 2.335823550337958, + "grad_norm": 0.7230030399307877, + "learning_rate": 1.9463056110992043e-05, + "loss": 0.1187, + "step": 19698 + }, + { + "epoch": 2.335942132100083, + "grad_norm": 0.9302146387773629, + "learning_rate": 1.9460715387333973e-05, + "loss": 0.1226, + "step": 19699 + }, + { + "epoch": 2.336060713862208, + "grad_norm": 0.8262518044903985, + "learning_rate": 1.945837471474131e-05, + "loss": 0.0937, + "step": 19700 + }, + { + "epoch": 2.336179295624333, + "grad_norm": 0.885883779918424, + "learning_rate": 1.945603409323564e-05, + "loss": 0.0853, + "step": 19701 + }, + { + "epoch": 2.3362978773864578, + "grad_norm": 0.8798300985808893, + "learning_rate": 1.945369352283854e-05, + "loss": 0.0969, + "step": 19702 + }, + { + "epoch": 2.336416459148583, + "grad_norm": 0.852929187049192, + "learning_rate": 1.945135300357159e-05, + "loss": 0.1103, + "step": 19703 + }, + { + "epoch": 2.3365350409107077, + "grad_norm": 0.6397432125221462, + "learning_rate": 1.9449012535456346e-05, + "loss": 0.0923, + "step": 19704 + }, + { + "epoch": 2.336653622672833, + "grad_norm": 0.7438897731870475, + "learning_rate": 1.9446672118514415e-05, + "loss": 0.0993, + "step": 19705 + }, + { + "epoch": 2.3367722044349577, + "grad_norm": 0.45016352253737263, + "learning_rate": 1.944433175276735e-05, + "loss": 0.0681, + "step": 19706 + }, + { + "epoch": 2.336890786197083, + "grad_norm": 0.6005121500929034, + "learning_rate": 1.9441991438236727e-05, + "loss": 0.0969, + "step": 19707 + }, + { + "epoch": 2.337009367959208, + "grad_norm": 0.4964019039508338, + "learning_rate": 1.9439651174944137e-05, + "loss": 0.0706, + "step": 19708 + }, + { + "epoch": 2.337127949721333, + "grad_norm": 0.9093817777558677, + "learning_rate": 1.9437310962911146e-05, + "loss": 0.1329, + "step": 19709 + }, + { + "epoch": 2.3372465314834576, + "grad_norm": 1.19056677509281, + "learning_rate": 1.9434970802159318e-05, + "loss": 0.1382, + "step": 19710 + }, + { + "epoch": 2.337365113245583, + "grad_norm": 0.6385920399928435, + "learning_rate": 1.943263069271024e-05, + "loss": 0.0883, + "step": 19711 + }, + { + "epoch": 2.337483695007708, + "grad_norm": 0.9542566440145908, + "learning_rate": 1.9430290634585475e-05, + "loss": 0.1387, + "step": 19712 + }, + { + "epoch": 2.337602276769833, + "grad_norm": 1.0145455332029072, + "learning_rate": 1.9427950627806608e-05, + "loss": 0.128, + "step": 19713 + }, + { + "epoch": 2.3377208585319575, + "grad_norm": 0.6339858608827293, + "learning_rate": 1.94256106723952e-05, + "loss": 0.0784, + "step": 19714 + }, + { + "epoch": 2.3378394402940827, + "grad_norm": 0.6498993438421286, + "learning_rate": 1.942327076837282e-05, + "loss": 0.105, + "step": 19715 + }, + { + "epoch": 2.337958022056208, + "grad_norm": 0.7012573692346179, + "learning_rate": 1.9420930915761053e-05, + "loss": 0.1016, + "step": 19716 + }, + { + "epoch": 2.3380766038183327, + "grad_norm": 0.5535169045110334, + "learning_rate": 1.941859111458146e-05, + "loss": 0.0691, + "step": 19717 + }, + { + "epoch": 2.338195185580458, + "grad_norm": 0.8002325088268389, + "learning_rate": 1.94162513648556e-05, + "loss": 0.1378, + "step": 19718 + }, + { + "epoch": 2.3383137673425827, + "grad_norm": 0.6086876027604352, + "learning_rate": 1.9413911666605074e-05, + "loss": 0.0844, + "step": 19719 + }, + { + "epoch": 2.338432349104708, + "grad_norm": 0.7849635112057574, + "learning_rate": 1.941157201985142e-05, + "loss": 0.135, + "step": 19720 + }, + { + "epoch": 2.3385509308668326, + "grad_norm": 0.932555263183837, + "learning_rate": 1.9409232424616232e-05, + "loss": 0.1545, + "step": 19721 + }, + { + "epoch": 2.338669512628958, + "grad_norm": 0.8794876046105843, + "learning_rate": 1.9406892880921045e-05, + "loss": 0.1197, + "step": 19722 + }, + { + "epoch": 2.3387880943910826, + "grad_norm": 0.5943152842185945, + "learning_rate": 1.9404553388787464e-05, + "loss": 0.0685, + "step": 19723 + }, + { + "epoch": 2.338906676153208, + "grad_norm": 0.8238051825266502, + "learning_rate": 1.9402213948237034e-05, + "loss": 0.1166, + "step": 19724 + }, + { + "epoch": 2.3390252579153326, + "grad_norm": 0.784311353500338, + "learning_rate": 1.939987455929133e-05, + "loss": 0.1021, + "step": 19725 + }, + { + "epoch": 2.3391438396774578, + "grad_norm": 0.8048938194411933, + "learning_rate": 1.9397535221971907e-05, + "loss": 0.1095, + "step": 19726 + }, + { + "epoch": 2.3392624214395825, + "grad_norm": 0.859953989584921, + "learning_rate": 1.9395195936300347e-05, + "loss": 0.1189, + "step": 19727 + }, + { + "epoch": 2.3393810032017077, + "grad_norm": 0.5923798689844896, + "learning_rate": 1.9392856702298207e-05, + "loss": 0.0905, + "step": 19728 + }, + { + "epoch": 2.3394995849638325, + "grad_norm": 0.5321311125332476, + "learning_rate": 1.9390517519987048e-05, + "loss": 0.0754, + "step": 19729 + }, + { + "epoch": 2.3396181667259577, + "grad_norm": 0.6100689098342109, + "learning_rate": 1.938817838938844e-05, + "loss": 0.0929, + "step": 19730 + }, + { + "epoch": 2.3397367484880824, + "grad_norm": 0.5591611123443301, + "learning_rate": 1.9385839310523942e-05, + "loss": 0.0638, + "step": 19731 + }, + { + "epoch": 2.3398553302502076, + "grad_norm": 0.645484209315893, + "learning_rate": 1.938350028341513e-05, + "loss": 0.102, + "step": 19732 + }, + { + "epoch": 2.3399739120123324, + "grad_norm": 0.7001111475243336, + "learning_rate": 1.9381161308083545e-05, + "loss": 0.0969, + "step": 19733 + }, + { + "epoch": 2.3400924937744576, + "grad_norm": 0.5013442041598899, + "learning_rate": 1.9378822384550774e-05, + "loss": 0.0751, + "step": 19734 + }, + { + "epoch": 2.3402110755365824, + "grad_norm": 0.8394260205917882, + "learning_rate": 1.9376483512838368e-05, + "loss": 0.1273, + "step": 19735 + }, + { + "epoch": 2.3403296572987076, + "grad_norm": 0.5343040232702672, + "learning_rate": 1.9374144692967885e-05, + "loss": 0.0664, + "step": 19736 + }, + { + "epoch": 2.3404482390608323, + "grad_norm": 0.5831301773840358, + "learning_rate": 1.937180592496088e-05, + "loss": 0.0967, + "step": 19737 + }, + { + "epoch": 2.3405668208229575, + "grad_norm": 0.7213953099544432, + "learning_rate": 1.9369467208838936e-05, + "loss": 0.1004, + "step": 19738 + }, + { + "epoch": 2.3406854025850823, + "grad_norm": 1.1032907756567942, + "learning_rate": 1.9367128544623597e-05, + "loss": 0.1707, + "step": 19739 + }, + { + "epoch": 2.3408039843472075, + "grad_norm": 0.6261687788453586, + "learning_rate": 1.936478993233642e-05, + "loss": 0.0891, + "step": 19740 + }, + { + "epoch": 2.3409225661093322, + "grad_norm": 0.644738856512835, + "learning_rate": 1.936245137199897e-05, + "loss": 0.0839, + "step": 19741 + }, + { + "epoch": 2.3410411478714575, + "grad_norm": 0.7120428435143825, + "learning_rate": 1.9360112863632808e-05, + "loss": 0.0944, + "step": 19742 + }, + { + "epoch": 2.341159729633582, + "grad_norm": 0.9127382456328282, + "learning_rate": 1.9357774407259487e-05, + "loss": 0.1404, + "step": 19743 + }, + { + "epoch": 2.3412783113957074, + "grad_norm": 0.5518385402521992, + "learning_rate": 1.9355436002900572e-05, + "loss": 0.0633, + "step": 19744 + }, + { + "epoch": 2.341396893157832, + "grad_norm": 0.609748355163441, + "learning_rate": 1.9353097650577605e-05, + "loss": 0.0817, + "step": 19745 + }, + { + "epoch": 2.3415154749199574, + "grad_norm": 0.6471446153383712, + "learning_rate": 1.9350759350312165e-05, + "loss": 0.0804, + "step": 19746 + }, + { + "epoch": 2.341634056682082, + "grad_norm": 0.813520647194993, + "learning_rate": 1.9348421102125794e-05, + "loss": 0.1242, + "step": 19747 + }, + { + "epoch": 2.3417526384442073, + "grad_norm": 0.9546953262479119, + "learning_rate": 1.934608290604004e-05, + "loss": 0.1617, + "step": 19748 + }, + { + "epoch": 2.341871220206332, + "grad_norm": 0.5825445619721515, + "learning_rate": 1.9343744762076477e-05, + "loss": 0.0672, + "step": 19749 + }, + { + "epoch": 2.3419898019684573, + "grad_norm": 0.5514580577342542, + "learning_rate": 1.9341406670256647e-05, + "loss": 0.0748, + "step": 19750 + }, + { + "epoch": 2.342108383730582, + "grad_norm": 0.547867145504884, + "learning_rate": 1.9339068630602112e-05, + "loss": 0.0883, + "step": 19751 + }, + { + "epoch": 2.3422269654927073, + "grad_norm": 0.6029595041376784, + "learning_rate": 1.9336730643134417e-05, + "loss": 0.0713, + "step": 19752 + }, + { + "epoch": 2.342345547254832, + "grad_norm": 0.5367020330942217, + "learning_rate": 1.9334392707875128e-05, + "loss": 0.0804, + "step": 19753 + }, + { + "epoch": 2.342464129016957, + "grad_norm": 0.738647823957988, + "learning_rate": 1.933205482484579e-05, + "loss": 0.0923, + "step": 19754 + }, + { + "epoch": 2.342582710779082, + "grad_norm": 0.662668803125056, + "learning_rate": 1.932971699406796e-05, + "loss": 0.0729, + "step": 19755 + }, + { + "epoch": 2.342701292541207, + "grad_norm": 0.7973397700803287, + "learning_rate": 1.932737921556317e-05, + "loss": 0.0827, + "step": 19756 + }, + { + "epoch": 2.3428198743033324, + "grad_norm": 0.6275708490014971, + "learning_rate": 1.9325041489352998e-05, + "loss": 0.0724, + "step": 19757 + }, + { + "epoch": 2.342938456065457, + "grad_norm": 0.47540294258275795, + "learning_rate": 1.932270381545899e-05, + "loss": 0.0572, + "step": 19758 + }, + { + "epoch": 2.343057037827582, + "grad_norm": 0.9093494813092565, + "learning_rate": 1.932036619390268e-05, + "loss": 0.1197, + "step": 19759 + }, + { + "epoch": 2.343175619589707, + "grad_norm": 0.5195766936470755, + "learning_rate": 1.931802862470563e-05, + "loss": 0.0536, + "step": 19760 + }, + { + "epoch": 2.3432942013518323, + "grad_norm": 0.9041600548697046, + "learning_rate": 1.9315691107889393e-05, + "loss": 0.1294, + "step": 19761 + }, + { + "epoch": 2.343412783113957, + "grad_norm": 0.6909154171215438, + "learning_rate": 1.9313353643475513e-05, + "loss": 0.106, + "step": 19762 + }, + { + "epoch": 2.343531364876082, + "grad_norm": 0.8259139049507424, + "learning_rate": 1.9311016231485528e-05, + "loss": 0.1373, + "step": 19763 + }, + { + "epoch": 2.343649946638207, + "grad_norm": 0.945278727795582, + "learning_rate": 1.930867887194101e-05, + "loss": 0.1414, + "step": 19764 + }, + { + "epoch": 2.3437685284003322, + "grad_norm": 0.6710708204029491, + "learning_rate": 1.9306341564863495e-05, + "loss": 0.0958, + "step": 19765 + }, + { + "epoch": 2.343887110162457, + "grad_norm": 0.846903222593842, + "learning_rate": 1.930400431027453e-05, + "loss": 0.1085, + "step": 19766 + }, + { + "epoch": 2.344005691924582, + "grad_norm": 0.7893763158241576, + "learning_rate": 1.9301667108195647e-05, + "loss": 0.0969, + "step": 19767 + }, + { + "epoch": 2.344124273686707, + "grad_norm": 0.9502660844568719, + "learning_rate": 1.9299329958648416e-05, + "loss": 0.1233, + "step": 19768 + }, + { + "epoch": 2.344242855448832, + "grad_norm": 0.7930684079201124, + "learning_rate": 1.9296992861654365e-05, + "loss": 0.1049, + "step": 19769 + }, + { + "epoch": 2.344361437210957, + "grad_norm": 0.7117666331400104, + "learning_rate": 1.9294655817235054e-05, + "loss": 0.0942, + "step": 19770 + }, + { + "epoch": 2.344480018973082, + "grad_norm": 0.5352042000066355, + "learning_rate": 1.9292318825412012e-05, + "loss": 0.067, + "step": 19771 + }, + { + "epoch": 2.344598600735207, + "grad_norm": 0.732324123265562, + "learning_rate": 1.92899818862068e-05, + "loss": 0.1188, + "step": 19772 + }, + { + "epoch": 2.344717182497332, + "grad_norm": 0.8942760579746863, + "learning_rate": 1.928764499964095e-05, + "loss": 0.0925, + "step": 19773 + }, + { + "epoch": 2.344835764259457, + "grad_norm": 0.8730917955815471, + "learning_rate": 1.928530816573601e-05, + "loss": 0.1165, + "step": 19774 + }, + { + "epoch": 2.344954346021582, + "grad_norm": 0.6454134179398539, + "learning_rate": 1.928297138451351e-05, + "loss": 0.0952, + "step": 19775 + }, + { + "epoch": 2.345072927783707, + "grad_norm": 1.0236629590657687, + "learning_rate": 1.9280634655995016e-05, + "loss": 0.143, + "step": 19776 + }, + { + "epoch": 2.345191509545832, + "grad_norm": 0.5561109857553563, + "learning_rate": 1.9278297980202054e-05, + "loss": 0.0645, + "step": 19777 + }, + { + "epoch": 2.3453100913079568, + "grad_norm": 0.7143221421012023, + "learning_rate": 1.9275961357156158e-05, + "loss": 0.0915, + "step": 19778 + }, + { + "epoch": 2.345428673070082, + "grad_norm": 0.7362716418379507, + "learning_rate": 1.927362478687889e-05, + "loss": 0.1066, + "step": 19779 + }, + { + "epoch": 2.3455472548322067, + "grad_norm": 0.5862018638967907, + "learning_rate": 1.9271288269391775e-05, + "loss": 0.0812, + "step": 19780 + }, + { + "epoch": 2.345665836594332, + "grad_norm": 0.759167104771286, + "learning_rate": 1.926895180471636e-05, + "loss": 0.0819, + "step": 19781 + }, + { + "epoch": 2.3457844183564567, + "grad_norm": 0.6340702208384057, + "learning_rate": 1.9266615392874176e-05, + "loss": 0.0865, + "step": 19782 + }, + { + "epoch": 2.345903000118582, + "grad_norm": 0.9414069799285542, + "learning_rate": 1.9264279033886773e-05, + "loss": 0.1086, + "step": 19783 + }, + { + "epoch": 2.3460215818807066, + "grad_norm": 0.6111695038203748, + "learning_rate": 1.9261942727775686e-05, + "loss": 0.0773, + "step": 19784 + }, + { + "epoch": 2.346140163642832, + "grad_norm": 0.7739393484556246, + "learning_rate": 1.925960647456245e-05, + "loss": 0.1039, + "step": 19785 + }, + { + "epoch": 2.3462587454049566, + "grad_norm": 0.8277164227650491, + "learning_rate": 1.9257270274268595e-05, + "loss": 0.1195, + "step": 19786 + }, + { + "epoch": 2.346377327167082, + "grad_norm": 0.7543015721018771, + "learning_rate": 1.9254934126915674e-05, + "loss": 0.1224, + "step": 19787 + }, + { + "epoch": 2.3464959089292066, + "grad_norm": 0.6314558329946334, + "learning_rate": 1.9252598032525212e-05, + "loss": 0.0779, + "step": 19788 + }, + { + "epoch": 2.3466144906913318, + "grad_norm": 0.5411889510724461, + "learning_rate": 1.925026199111874e-05, + "loss": 0.0761, + "step": 19789 + }, + { + "epoch": 2.3467330724534565, + "grad_norm": 0.5929264616699161, + "learning_rate": 1.9247926002717814e-05, + "loss": 0.0798, + "step": 19790 + }, + { + "epoch": 2.3468516542155817, + "grad_norm": 0.638449023004693, + "learning_rate": 1.924559006734395e-05, + "loss": 0.0724, + "step": 19791 + }, + { + "epoch": 2.3469702359777065, + "grad_norm": 0.6722579479339007, + "learning_rate": 1.9243254185018694e-05, + "loss": 0.1076, + "step": 19792 + }, + { + "epoch": 2.3470888177398317, + "grad_norm": 0.6275003425221145, + "learning_rate": 1.9240918355763565e-05, + "loss": 0.0822, + "step": 19793 + }, + { + "epoch": 2.3472073995019564, + "grad_norm": 0.8466869530758483, + "learning_rate": 1.923858257960012e-05, + "loss": 0.1189, + "step": 19794 + }, + { + "epoch": 2.3473259812640817, + "grad_norm": 0.6323141594548182, + "learning_rate": 1.9236246856549874e-05, + "loss": 0.0909, + "step": 19795 + }, + { + "epoch": 2.3474445630262064, + "grad_norm": 0.8831156396962699, + "learning_rate": 1.9233911186634364e-05, + "loss": 0.1338, + "step": 19796 + }, + { + "epoch": 2.3475631447883316, + "grad_norm": 0.5984376097872689, + "learning_rate": 1.9231575569875116e-05, + "loss": 0.0673, + "step": 19797 + }, + { + "epoch": 2.3476817265504564, + "grad_norm": 0.9409244213173198, + "learning_rate": 1.9229240006293673e-05, + "loss": 0.1048, + "step": 19798 + }, + { + "epoch": 2.3478003083125816, + "grad_norm": 0.7626834185464162, + "learning_rate": 1.9226904495911558e-05, + "loss": 0.1179, + "step": 19799 + }, + { + "epoch": 2.3479188900747063, + "grad_norm": 0.6421182570507952, + "learning_rate": 1.9224569038750308e-05, + "loss": 0.0967, + "step": 19800 + }, + { + "epoch": 2.3480374718368315, + "grad_norm": 0.8978157066620671, + "learning_rate": 1.922223363483144e-05, + "loss": 0.1261, + "step": 19801 + }, + { + "epoch": 2.3481560535989563, + "grad_norm": 0.7233825952185878, + "learning_rate": 1.92198982841765e-05, + "loss": 0.094, + "step": 19802 + }, + { + "epoch": 2.3482746353610815, + "grad_norm": 0.6737318182095349, + "learning_rate": 1.9217562986807014e-05, + "loss": 0.103, + "step": 19803 + }, + { + "epoch": 2.3483932171232063, + "grad_norm": 0.7747020458422441, + "learning_rate": 1.9215227742744495e-05, + "loss": 0.0921, + "step": 19804 + }, + { + "epoch": 2.3485117988853315, + "grad_norm": 1.1471690218904318, + "learning_rate": 1.9212892552010493e-05, + "loss": 0.1526, + "step": 19805 + }, + { + "epoch": 2.3486303806474567, + "grad_norm": 1.0830756717501433, + "learning_rate": 1.9210557414626522e-05, + "loss": 0.1654, + "step": 19806 + }, + { + "epoch": 2.3487489624095814, + "grad_norm": 0.7177847027961992, + "learning_rate": 1.9208222330614117e-05, + "loss": 0.1086, + "step": 19807 + }, + { + "epoch": 2.348867544171706, + "grad_norm": 0.8701836263846132, + "learning_rate": 1.9205887299994783e-05, + "loss": 0.1142, + "step": 19808 + }, + { + "epoch": 2.3489861259338314, + "grad_norm": 0.5826198252567217, + "learning_rate": 1.920355232279008e-05, + "loss": 0.0864, + "step": 19809 + }, + { + "epoch": 2.3491047076959566, + "grad_norm": 0.6251157946987146, + "learning_rate": 1.920121739902151e-05, + "loss": 0.0827, + "step": 19810 + }, + { + "epoch": 2.3492232894580813, + "grad_norm": 0.7922254825335069, + "learning_rate": 1.9198882528710608e-05, + "loss": 0.136, + "step": 19811 + }, + { + "epoch": 2.349341871220206, + "grad_norm": 0.7256369643690842, + "learning_rate": 1.9196547711878883e-05, + "loss": 0.1194, + "step": 19812 + }, + { + "epoch": 2.3494604529823313, + "grad_norm": 0.5793194977530085, + "learning_rate": 1.9194212948547884e-05, + "loss": 0.0879, + "step": 19813 + }, + { + "epoch": 2.3495790347444565, + "grad_norm": 0.548692633728102, + "learning_rate": 1.919187823873912e-05, + "loss": 0.0775, + "step": 19814 + }, + { + "epoch": 2.3496976165065813, + "grad_norm": 0.6754916291897857, + "learning_rate": 1.9189543582474123e-05, + "loss": 0.1035, + "step": 19815 + }, + { + "epoch": 2.349816198268706, + "grad_norm": 0.7762238902797676, + "learning_rate": 1.918720897977439e-05, + "loss": 0.1012, + "step": 19816 + }, + { + "epoch": 2.3499347800308312, + "grad_norm": 0.6663830316645823, + "learning_rate": 1.918487443066148e-05, + "loss": 0.0795, + "step": 19817 + }, + { + "epoch": 2.3500533617929564, + "grad_norm": 0.7375851298626243, + "learning_rate": 1.9182539935156884e-05, + "loss": 0.1027, + "step": 19818 + }, + { + "epoch": 2.350171943555081, + "grad_norm": 0.705449094417613, + "learning_rate": 1.9180205493282143e-05, + "loss": 0.0747, + "step": 19819 + }, + { + "epoch": 2.3502905253172064, + "grad_norm": 0.8659654096583117, + "learning_rate": 1.9177871105058766e-05, + "loss": 0.1162, + "step": 19820 + }, + { + "epoch": 2.350409107079331, + "grad_norm": 0.6892590392475275, + "learning_rate": 1.9175536770508286e-05, + "loss": 0.1073, + "step": 19821 + }, + { + "epoch": 2.3505276888414564, + "grad_norm": 0.5399384648640292, + "learning_rate": 1.917320248965221e-05, + "loss": 0.0741, + "step": 19822 + }, + { + "epoch": 2.350646270603581, + "grad_norm": 0.7297137154513963, + "learning_rate": 1.9170868262512057e-05, + "loss": 0.1, + "step": 19823 + }, + { + "epoch": 2.3507648523657063, + "grad_norm": 0.6302455281561586, + "learning_rate": 1.9168534089109358e-05, + "loss": 0.086, + "step": 19824 + }, + { + "epoch": 2.350883434127831, + "grad_norm": 0.8790209207206711, + "learning_rate": 1.9166199969465627e-05, + "loss": 0.1324, + "step": 19825 + }, + { + "epoch": 2.3510020158899563, + "grad_norm": 0.7489139956891488, + "learning_rate": 1.9163865903602374e-05, + "loss": 0.0976, + "step": 19826 + }, + { + "epoch": 2.351120597652081, + "grad_norm": 0.7562583080799781, + "learning_rate": 1.916153189154111e-05, + "loss": 0.1228, + "step": 19827 + }, + { + "epoch": 2.3512391794142062, + "grad_norm": 0.8174630110142237, + "learning_rate": 1.9159197933303374e-05, + "loss": 0.1127, + "step": 19828 + }, + { + "epoch": 2.351357761176331, + "grad_norm": 0.7082858625607088, + "learning_rate": 1.9156864028910665e-05, + "loss": 0.104, + "step": 19829 + }, + { + "epoch": 2.351476342938456, + "grad_norm": 0.7274738513368273, + "learning_rate": 1.915453017838451e-05, + "loss": 0.1214, + "step": 19830 + }, + { + "epoch": 2.351594924700581, + "grad_norm": 0.726896386416814, + "learning_rate": 1.915219638174641e-05, + "loss": 0.0984, + "step": 19831 + }, + { + "epoch": 2.351713506462706, + "grad_norm": 0.631438451060847, + "learning_rate": 1.9149862639017895e-05, + "loss": 0.0825, + "step": 19832 + }, + { + "epoch": 2.351832088224831, + "grad_norm": 0.6246939140584256, + "learning_rate": 1.914752895022048e-05, + "loss": 0.0885, + "step": 19833 + }, + { + "epoch": 2.351950669986956, + "grad_norm": 0.7910329437392917, + "learning_rate": 1.9145195315375652e-05, + "loss": 0.0886, + "step": 19834 + }, + { + "epoch": 2.352069251749081, + "grad_norm": 0.7895658861988475, + "learning_rate": 1.9142861734504958e-05, + "loss": 0.1256, + "step": 19835 + }, + { + "epoch": 2.352187833511206, + "grad_norm": 1.054074322324468, + "learning_rate": 1.9140528207629897e-05, + "loss": 0.1358, + "step": 19836 + }, + { + "epoch": 2.352306415273331, + "grad_norm": 0.8596446818299028, + "learning_rate": 1.913819473477198e-05, + "loss": 0.101, + "step": 19837 + }, + { + "epoch": 2.352424997035456, + "grad_norm": 0.6682199950058394, + "learning_rate": 1.913586131595271e-05, + "loss": 0.1105, + "step": 19838 + }, + { + "epoch": 2.352543578797581, + "grad_norm": 0.8712253936077597, + "learning_rate": 1.9133527951193615e-05, + "loss": 0.1157, + "step": 19839 + }, + { + "epoch": 2.352662160559706, + "grad_norm": 1.3984441475866805, + "learning_rate": 1.9131194640516192e-05, + "loss": 0.2518, + "step": 19840 + }, + { + "epoch": 2.3527807423218308, + "grad_norm": 0.6505983405250783, + "learning_rate": 1.9128861383941965e-05, + "loss": 0.0825, + "step": 19841 + }, + { + "epoch": 2.352899324083956, + "grad_norm": 0.8278307276182898, + "learning_rate": 1.9126528181492425e-05, + "loss": 0.1069, + "step": 19842 + }, + { + "epoch": 2.3530179058460807, + "grad_norm": 0.9723643652949461, + "learning_rate": 1.9124195033189108e-05, + "loss": 0.1197, + "step": 19843 + }, + { + "epoch": 2.353136487608206, + "grad_norm": 1.2826229192036434, + "learning_rate": 1.9121861939053503e-05, + "loss": 0.1549, + "step": 19844 + }, + { + "epoch": 2.3532550693703307, + "grad_norm": 0.5026035557759794, + "learning_rate": 1.9119528899107122e-05, + "loss": 0.0873, + "step": 19845 + }, + { + "epoch": 2.353373651132456, + "grad_norm": 0.6775614818952257, + "learning_rate": 1.9117195913371467e-05, + "loss": 0.0801, + "step": 19846 + }, + { + "epoch": 2.3534922328945806, + "grad_norm": 0.5644852513466332, + "learning_rate": 1.9114862981868058e-05, + "loss": 0.0859, + "step": 19847 + }, + { + "epoch": 2.353610814656706, + "grad_norm": 0.5274703675187851, + "learning_rate": 1.9112530104618394e-05, + "loss": 0.066, + "step": 19848 + }, + { + "epoch": 2.3537293964188306, + "grad_norm": 0.6975473551743911, + "learning_rate": 1.9110197281643978e-05, + "loss": 0.0922, + "step": 19849 + }, + { + "epoch": 2.353847978180956, + "grad_norm": 0.9034120340807436, + "learning_rate": 1.9107864512966323e-05, + "loss": 0.1101, + "step": 19850 + }, + { + "epoch": 2.3539665599430806, + "grad_norm": 0.5871157366074711, + "learning_rate": 1.910553179860694e-05, + "loss": 0.0643, + "step": 19851 + }, + { + "epoch": 2.3540851417052058, + "grad_norm": 0.669264411511396, + "learning_rate": 1.910319913858732e-05, + "loss": 0.093, + "step": 19852 + }, + { + "epoch": 2.3542037234673305, + "grad_norm": 0.5815072177925263, + "learning_rate": 1.9100866532928965e-05, + "loss": 0.0661, + "step": 19853 + }, + { + "epoch": 2.3543223052294557, + "grad_norm": 0.9551699858858056, + "learning_rate": 1.9098533981653395e-05, + "loss": 0.1476, + "step": 19854 + }, + { + "epoch": 2.354440886991581, + "grad_norm": 0.7399535789505438, + "learning_rate": 1.9096201484782107e-05, + "loss": 0.1143, + "step": 19855 + }, + { + "epoch": 2.3545594687537057, + "grad_norm": 0.6501666067495723, + "learning_rate": 1.9093869042336603e-05, + "loss": 0.0966, + "step": 19856 + }, + { + "epoch": 2.3546780505158305, + "grad_norm": 0.7693990446451955, + "learning_rate": 1.909153665433837e-05, + "loss": 0.1361, + "step": 19857 + }, + { + "epoch": 2.3547966322779557, + "grad_norm": 0.7379651079016216, + "learning_rate": 1.908920432080894e-05, + "loss": 0.0992, + "step": 19858 + }, + { + "epoch": 2.354915214040081, + "grad_norm": 0.8159564627233838, + "learning_rate": 1.908687204176979e-05, + "loss": 0.096, + "step": 19859 + }, + { + "epoch": 2.3550337958022056, + "grad_norm": 0.6483743967761181, + "learning_rate": 1.908453981724243e-05, + "loss": 0.0905, + "step": 19860 + }, + { + "epoch": 2.3551523775643304, + "grad_norm": 0.7336448548410783, + "learning_rate": 1.9082207647248352e-05, + "loss": 0.0902, + "step": 19861 + }, + { + "epoch": 2.3552709593264556, + "grad_norm": 0.9597187172715144, + "learning_rate": 1.9079875531809072e-05, + "loss": 0.1036, + "step": 19862 + }, + { + "epoch": 2.355389541088581, + "grad_norm": 0.8663429031859886, + "learning_rate": 1.9077543470946076e-05, + "loss": 0.1202, + "step": 19863 + }, + { + "epoch": 2.3555081228507055, + "grad_norm": 0.8742438180109534, + "learning_rate": 1.9075211464680862e-05, + "loss": 0.1135, + "step": 19864 + }, + { + "epoch": 2.3556267046128303, + "grad_norm": 1.0394826477405401, + "learning_rate": 1.907287951303494e-05, + "loss": 0.1149, + "step": 19865 + }, + { + "epoch": 2.3557452863749555, + "grad_norm": 0.7646873060114979, + "learning_rate": 1.9070547616029802e-05, + "loss": 0.1123, + "step": 19866 + }, + { + "epoch": 2.3558638681370807, + "grad_norm": 0.6409418979347721, + "learning_rate": 1.9068215773686944e-05, + "loss": 0.0842, + "step": 19867 + }, + { + "epoch": 2.3559824498992055, + "grad_norm": 0.562437137355907, + "learning_rate": 1.9065883986027852e-05, + "loss": 0.0786, + "step": 19868 + }, + { + "epoch": 2.3561010316613307, + "grad_norm": 0.6335742049633799, + "learning_rate": 1.9063552253074044e-05, + "loss": 0.078, + "step": 19869 + }, + { + "epoch": 2.3562196134234554, + "grad_norm": 0.7030561559825409, + "learning_rate": 1.9061220574846998e-05, + "loss": 0.1105, + "step": 19870 + }, + { + "epoch": 2.3563381951855806, + "grad_norm": 0.9221840403760874, + "learning_rate": 1.905888895136822e-05, + "loss": 0.1432, + "step": 19871 + }, + { + "epoch": 2.3564567769477054, + "grad_norm": 0.7220588075103385, + "learning_rate": 1.905655738265919e-05, + "loss": 0.1082, + "step": 19872 + }, + { + "epoch": 2.3565753587098306, + "grad_norm": 0.7144417130030801, + "learning_rate": 1.9054225868741426e-05, + "loss": 0.104, + "step": 19873 + }, + { + "epoch": 2.3566939404719554, + "grad_norm": 0.623889755175121, + "learning_rate": 1.9051894409636406e-05, + "loss": 0.0789, + "step": 19874 + }, + { + "epoch": 2.3568125222340806, + "grad_norm": 0.6617505360199163, + "learning_rate": 1.9049563005365615e-05, + "loss": 0.0692, + "step": 19875 + }, + { + "epoch": 2.3569311039962053, + "grad_norm": 0.9281244400222286, + "learning_rate": 1.904723165595057e-05, + "loss": 0.1109, + "step": 19876 + }, + { + "epoch": 2.3570496857583305, + "grad_norm": 0.5593013847280809, + "learning_rate": 1.9044900361412744e-05, + "loss": 0.0618, + "step": 19877 + }, + { + "epoch": 2.3571682675204553, + "grad_norm": 0.7553037364350648, + "learning_rate": 1.904256912177363e-05, + "loss": 0.1101, + "step": 19878 + }, + { + "epoch": 2.3572868492825805, + "grad_norm": 0.8935580767507609, + "learning_rate": 1.9040237937054724e-05, + "loss": 0.1257, + "step": 19879 + }, + { + "epoch": 2.3574054310447052, + "grad_norm": 0.6175700671350106, + "learning_rate": 1.9037906807277516e-05, + "loss": 0.087, + "step": 19880 + }, + { + "epoch": 2.3575240128068304, + "grad_norm": 0.586312170015034, + "learning_rate": 1.90355757324635e-05, + "loss": 0.0933, + "step": 19881 + }, + { + "epoch": 2.357642594568955, + "grad_norm": 0.5806703324516157, + "learning_rate": 1.9033244712634165e-05, + "loss": 0.0781, + "step": 19882 + }, + { + "epoch": 2.3577611763310804, + "grad_norm": 0.9564470972189384, + "learning_rate": 1.9030913747810986e-05, + "loss": 0.1083, + "step": 19883 + }, + { + "epoch": 2.357879758093205, + "grad_norm": 0.7116920857010154, + "learning_rate": 1.9028582838015472e-05, + "loss": 0.0813, + "step": 19884 + }, + { + "epoch": 2.3579983398553304, + "grad_norm": 1.0127890265485777, + "learning_rate": 1.90262519832691e-05, + "loss": 0.1051, + "step": 19885 + }, + { + "epoch": 2.358116921617455, + "grad_norm": 0.7004707639386736, + "learning_rate": 1.902392118359336e-05, + "loss": 0.0866, + "step": 19886 + }, + { + "epoch": 2.3582355033795803, + "grad_norm": 0.7386486494071466, + "learning_rate": 1.902159043900973e-05, + "loss": 0.109, + "step": 19887 + }, + { + "epoch": 2.358354085141705, + "grad_norm": 0.9755352516217708, + "learning_rate": 1.901925974953971e-05, + "loss": 0.1321, + "step": 19888 + }, + { + "epoch": 2.3584726669038303, + "grad_norm": 0.7269959368807667, + "learning_rate": 1.9016929115204782e-05, + "loss": 0.1041, + "step": 19889 + }, + { + "epoch": 2.358591248665955, + "grad_norm": 1.124987155974015, + "learning_rate": 1.901459853602643e-05, + "loss": 0.1433, + "step": 19890 + }, + { + "epoch": 2.3587098304280802, + "grad_norm": 0.7173994222167089, + "learning_rate": 1.9012268012026137e-05, + "loss": 0.0789, + "step": 19891 + }, + { + "epoch": 2.358828412190205, + "grad_norm": 1.1139010944296883, + "learning_rate": 1.9009937543225394e-05, + "loss": 0.1503, + "step": 19892 + }, + { + "epoch": 2.35894699395233, + "grad_norm": 0.8242291058444831, + "learning_rate": 1.9007607129645684e-05, + "loss": 0.1034, + "step": 19893 + }, + { + "epoch": 2.359065575714455, + "grad_norm": 0.9676347375556623, + "learning_rate": 1.9005276771308476e-05, + "loss": 0.1114, + "step": 19894 + }, + { + "epoch": 2.35918415747658, + "grad_norm": 0.9390613014127689, + "learning_rate": 1.9002946468235276e-05, + "loss": 0.1384, + "step": 19895 + }, + { + "epoch": 2.359302739238705, + "grad_norm": 0.6495182431153013, + "learning_rate": 1.9000616220447558e-05, + "loss": 0.1042, + "step": 19896 + }, + { + "epoch": 2.35942132100083, + "grad_norm": 0.7104690312019263, + "learning_rate": 1.8998286027966794e-05, + "loss": 0.0958, + "step": 19897 + }, + { + "epoch": 2.359539902762955, + "grad_norm": 0.5783462146022365, + "learning_rate": 1.8995955890814474e-05, + "loss": 0.0882, + "step": 19898 + }, + { + "epoch": 2.35965848452508, + "grad_norm": 0.6487935321917006, + "learning_rate": 1.8993625809012077e-05, + "loss": 0.0907, + "step": 19899 + }, + { + "epoch": 2.359777066287205, + "grad_norm": 0.7661267038295545, + "learning_rate": 1.899129578258109e-05, + "loss": 0.0838, + "step": 19900 + }, + { + "epoch": 2.35989564804933, + "grad_norm": 0.5843039470981641, + "learning_rate": 1.8988965811542987e-05, + "loss": 0.0601, + "step": 19901 + }, + { + "epoch": 2.360014229811455, + "grad_norm": 0.5158724456391116, + "learning_rate": 1.8986635895919238e-05, + "loss": 0.0677, + "step": 19902 + }, + { + "epoch": 2.36013281157358, + "grad_norm": 0.7526477058401673, + "learning_rate": 1.8984306035731343e-05, + "loss": 0.1034, + "step": 19903 + }, + { + "epoch": 2.360251393335705, + "grad_norm": 0.6892706222282654, + "learning_rate": 1.898197623100077e-05, + "loss": 0.0905, + "step": 19904 + }, + { + "epoch": 2.36036997509783, + "grad_norm": 0.7696994708635592, + "learning_rate": 1.8979646481748982e-05, + "loss": 0.0899, + "step": 19905 + }, + { + "epoch": 2.3604885568599547, + "grad_norm": 0.9435518134021565, + "learning_rate": 1.8977316787997484e-05, + "loss": 0.107, + "step": 19906 + }, + { + "epoch": 2.36060713862208, + "grad_norm": 0.7430218755818324, + "learning_rate": 1.8974987149767742e-05, + "loss": 0.1123, + "step": 19907 + }, + { + "epoch": 2.360725720384205, + "grad_norm": 0.7339742647781969, + "learning_rate": 1.8972657567081224e-05, + "loss": 0.107, + "step": 19908 + }, + { + "epoch": 2.36084430214633, + "grad_norm": 0.5741792078095238, + "learning_rate": 1.897032803995941e-05, + "loss": 0.0621, + "step": 19909 + }, + { + "epoch": 2.3609628839084547, + "grad_norm": 0.7839605335493782, + "learning_rate": 1.8967998568423777e-05, + "loss": 0.0934, + "step": 19910 + }, + { + "epoch": 2.36108146567058, + "grad_norm": 0.8353336038019893, + "learning_rate": 1.8965669152495806e-05, + "loss": 0.1314, + "step": 19911 + }, + { + "epoch": 2.361200047432705, + "grad_norm": 0.45598454802048627, + "learning_rate": 1.8963339792196968e-05, + "loss": 0.0662, + "step": 19912 + }, + { + "epoch": 2.36131862919483, + "grad_norm": 0.6740446257299825, + "learning_rate": 1.8961010487548723e-05, + "loss": 0.1068, + "step": 19913 + }, + { + "epoch": 2.3614372109569546, + "grad_norm": 0.8494219236681743, + "learning_rate": 1.8958681238572566e-05, + "loss": 0.13, + "step": 19914 + }, + { + "epoch": 2.36155579271908, + "grad_norm": 0.6728658091237214, + "learning_rate": 1.8956352045289956e-05, + "loss": 0.0863, + "step": 19915 + }, + { + "epoch": 2.361674374481205, + "grad_norm": 0.610201917025246, + "learning_rate": 1.8954022907722375e-05, + "loss": 0.0855, + "step": 19916 + }, + { + "epoch": 2.3617929562433297, + "grad_norm": 0.8165771099835846, + "learning_rate": 1.8951693825891277e-05, + "loss": 0.1132, + "step": 19917 + }, + { + "epoch": 2.361911538005455, + "grad_norm": 0.8269950070467138, + "learning_rate": 1.8949364799818152e-05, + "loss": 0.1029, + "step": 19918 + }, + { + "epoch": 2.3620301197675797, + "grad_norm": 0.8217942785014746, + "learning_rate": 1.894703582952446e-05, + "loss": 0.0843, + "step": 19919 + }, + { + "epoch": 2.362148701529705, + "grad_norm": 0.4771207143828343, + "learning_rate": 1.8944706915031673e-05, + "loss": 0.0779, + "step": 19920 + }, + { + "epoch": 2.3622672832918297, + "grad_norm": 0.6878069183292139, + "learning_rate": 1.8942378056361267e-05, + "loss": 0.0894, + "step": 19921 + }, + { + "epoch": 2.362385865053955, + "grad_norm": 0.7508953956717498, + "learning_rate": 1.894004925353471e-05, + "loss": 0.1135, + "step": 19922 + }, + { + "epoch": 2.3625044468160796, + "grad_norm": 0.7423991021993497, + "learning_rate": 1.893772050657347e-05, + "loss": 0.0757, + "step": 19923 + }, + { + "epoch": 2.362623028578205, + "grad_norm": 0.5857239107523646, + "learning_rate": 1.8935391815499e-05, + "loss": 0.0703, + "step": 19924 + }, + { + "epoch": 2.3627416103403296, + "grad_norm": 0.7825219143585538, + "learning_rate": 1.893306318033279e-05, + "loss": 0.112, + "step": 19925 + }, + { + "epoch": 2.362860192102455, + "grad_norm": 0.5749701813711697, + "learning_rate": 1.8930734601096302e-05, + "loss": 0.0773, + "step": 19926 + }, + { + "epoch": 2.3629787738645796, + "grad_norm": 0.750079780636554, + "learning_rate": 1.892840607781099e-05, + "loss": 0.0961, + "step": 19927 + }, + { + "epoch": 2.3630973556267048, + "grad_norm": 0.8772427984184082, + "learning_rate": 1.8926077610498328e-05, + "loss": 0.1434, + "step": 19928 + }, + { + "epoch": 2.3632159373888295, + "grad_norm": 0.6890904001253891, + "learning_rate": 1.892374919917978e-05, + "loss": 0.0946, + "step": 19929 + }, + { + "epoch": 2.3633345191509547, + "grad_norm": 0.7430910933224466, + "learning_rate": 1.8921420843876824e-05, + "loss": 0.1019, + "step": 19930 + }, + { + "epoch": 2.3634531009130795, + "grad_norm": 0.6121896308552848, + "learning_rate": 1.891909254461091e-05, + "loss": 0.0691, + "step": 19931 + }, + { + "epoch": 2.3635716826752047, + "grad_norm": 0.9919160581002853, + "learning_rate": 1.8916764301403498e-05, + "loss": 0.1491, + "step": 19932 + }, + { + "epoch": 2.3636902644373294, + "grad_norm": 0.7910497561177082, + "learning_rate": 1.891443611427607e-05, + "loss": 0.1142, + "step": 19933 + }, + { + "epoch": 2.3638088461994546, + "grad_norm": 0.8722898224173303, + "learning_rate": 1.8912107983250073e-05, + "loss": 0.1028, + "step": 19934 + }, + { + "epoch": 2.3639274279615794, + "grad_norm": 0.7264868666385337, + "learning_rate": 1.890977990834697e-05, + "loss": 0.0969, + "step": 19935 + }, + { + "epoch": 2.3640460097237046, + "grad_norm": 0.8822827991262268, + "learning_rate": 1.8907451889588236e-05, + "loss": 0.1029, + "step": 19936 + }, + { + "epoch": 2.3641645914858294, + "grad_norm": 0.5721121777530824, + "learning_rate": 1.8905123926995323e-05, + "loss": 0.092, + "step": 19937 + }, + { + "epoch": 2.3642831732479546, + "grad_norm": 1.2508364482372387, + "learning_rate": 1.890279602058969e-05, + "loss": 0.1622, + "step": 19938 + }, + { + "epoch": 2.3644017550100793, + "grad_norm": 0.7250648440542052, + "learning_rate": 1.8900468170392806e-05, + "loss": 0.1054, + "step": 19939 + }, + { + "epoch": 2.3645203367722045, + "grad_norm": 0.9006123805228402, + "learning_rate": 1.8898140376426118e-05, + "loss": 0.1269, + "step": 19940 + }, + { + "epoch": 2.3646389185343293, + "grad_norm": 0.7791109758853495, + "learning_rate": 1.8895812638711102e-05, + "loss": 0.113, + "step": 19941 + }, + { + "epoch": 2.3647575002964545, + "grad_norm": 0.7831450902538151, + "learning_rate": 1.8893484957269207e-05, + "loss": 0.1053, + "step": 19942 + }, + { + "epoch": 2.3648760820585792, + "grad_norm": 0.8503348803455865, + "learning_rate": 1.889115733212188e-05, + "loss": 0.109, + "step": 19943 + }, + { + "epoch": 2.3649946638207044, + "grad_norm": 0.6761577097189633, + "learning_rate": 1.8888829763290605e-05, + "loss": 0.0801, + "step": 19944 + }, + { + "epoch": 2.365113245582829, + "grad_norm": 0.6971905259958479, + "learning_rate": 1.888650225079682e-05, + "loss": 0.1074, + "step": 19945 + }, + { + "epoch": 2.3652318273449544, + "grad_norm": 0.7663806181912163, + "learning_rate": 1.888417479466199e-05, + "loss": 0.1111, + "step": 19946 + }, + { + "epoch": 2.365350409107079, + "grad_norm": 0.8295301504607903, + "learning_rate": 1.8881847394907564e-05, + "loss": 0.1129, + "step": 19947 + }, + { + "epoch": 2.3654689908692044, + "grad_norm": 0.6733879538178824, + "learning_rate": 1.8879520051555005e-05, + "loss": 0.1107, + "step": 19948 + }, + { + "epoch": 2.365587572631329, + "grad_norm": 0.9754037879394877, + "learning_rate": 1.8877192764625767e-05, + "loss": 0.1304, + "step": 19949 + }, + { + "epoch": 2.3657061543934543, + "grad_norm": 0.6060083266017028, + "learning_rate": 1.8874865534141296e-05, + "loss": 0.091, + "step": 19950 + }, + { + "epoch": 2.365824736155579, + "grad_norm": 0.7782708785301196, + "learning_rate": 1.8872538360123064e-05, + "loss": 0.1197, + "step": 19951 + }, + { + "epoch": 2.3659433179177043, + "grad_norm": 0.6181480422484958, + "learning_rate": 1.8870211242592514e-05, + "loss": 0.099, + "step": 19952 + }, + { + "epoch": 2.366061899679829, + "grad_norm": 0.6494849007931369, + "learning_rate": 1.88678841815711e-05, + "loss": 0.0725, + "step": 19953 + }, + { + "epoch": 2.3661804814419543, + "grad_norm": 0.9626175748259966, + "learning_rate": 1.886555717708026e-05, + "loss": 0.1358, + "step": 19954 + }, + { + "epoch": 2.366299063204079, + "grad_norm": 0.907583915773801, + "learning_rate": 1.8863230229141475e-05, + "loss": 0.1463, + "step": 19955 + }, + { + "epoch": 2.366417644966204, + "grad_norm": 0.6635840868681778, + "learning_rate": 1.8860903337776183e-05, + "loss": 0.1219, + "step": 19956 + }, + { + "epoch": 2.3665362267283294, + "grad_norm": 0.6226300110420255, + "learning_rate": 1.8858576503005827e-05, + "loss": 0.0837, + "step": 19957 + }, + { + "epoch": 2.366654808490454, + "grad_norm": 0.7989249326444906, + "learning_rate": 1.885624972485186e-05, + "loss": 0.1125, + "step": 19958 + }, + { + "epoch": 2.366773390252579, + "grad_norm": 0.8603938491010702, + "learning_rate": 1.8853923003335743e-05, + "loss": 0.1051, + "step": 19959 + }, + { + "epoch": 2.366891972014704, + "grad_norm": 0.5146153452773083, + "learning_rate": 1.8851596338478923e-05, + "loss": 0.0843, + "step": 19960 + }, + { + "epoch": 2.3670105537768293, + "grad_norm": 0.7405172860890489, + "learning_rate": 1.8849269730302833e-05, + "loss": 0.0979, + "step": 19961 + }, + { + "epoch": 2.367129135538954, + "grad_norm": 0.6124937029173426, + "learning_rate": 1.8846943178828945e-05, + "loss": 0.0981, + "step": 19962 + }, + { + "epoch": 2.367247717301079, + "grad_norm": 0.775286751039157, + "learning_rate": 1.8844616684078695e-05, + "loss": 0.1241, + "step": 19963 + }, + { + "epoch": 2.367366299063204, + "grad_norm": 0.7923354704395553, + "learning_rate": 1.884229024607353e-05, + "loss": 0.1004, + "step": 19964 + }, + { + "epoch": 2.3674848808253293, + "grad_norm": 0.4930555783615494, + "learning_rate": 1.8839963864834888e-05, + "loss": 0.0853, + "step": 19965 + }, + { + "epoch": 2.367603462587454, + "grad_norm": 0.5344725032971978, + "learning_rate": 1.8837637540384237e-05, + "loss": 0.077, + "step": 19966 + }, + { + "epoch": 2.3677220443495792, + "grad_norm": 0.7722218934976957, + "learning_rate": 1.883531127274301e-05, + "loss": 0.0956, + "step": 19967 + }, + { + "epoch": 2.367840626111704, + "grad_norm": 0.5752518975705441, + "learning_rate": 1.8832985061932647e-05, + "loss": 0.0822, + "step": 19968 + }, + { + "epoch": 2.367959207873829, + "grad_norm": 0.5175758134146375, + "learning_rate": 1.88306589079746e-05, + "loss": 0.0672, + "step": 19969 + }, + { + "epoch": 2.368077789635954, + "grad_norm": 0.6570221986468157, + "learning_rate": 1.8828332810890314e-05, + "loss": 0.1005, + "step": 19970 + }, + { + "epoch": 2.368196371398079, + "grad_norm": 0.611769797492702, + "learning_rate": 1.8826006770701234e-05, + "loss": 0.0766, + "step": 19971 + }, + { + "epoch": 2.368314953160204, + "grad_norm": 0.5617197751643379, + "learning_rate": 1.8823680787428803e-05, + "loss": 0.0755, + "step": 19972 + }, + { + "epoch": 2.368433534922329, + "grad_norm": 0.7877788952640095, + "learning_rate": 1.882135486109445e-05, + "loss": 0.1043, + "step": 19973 + }, + { + "epoch": 2.368552116684454, + "grad_norm": 0.9514509479428287, + "learning_rate": 1.881902899171964e-05, + "loss": 0.121, + "step": 19974 + }, + { + "epoch": 2.368670698446579, + "grad_norm": 1.0356955371518892, + "learning_rate": 1.88167031793258e-05, + "loss": 0.0901, + "step": 19975 + }, + { + "epoch": 2.368789280208704, + "grad_norm": 0.8150220276802356, + "learning_rate": 1.8814377423934363e-05, + "loss": 0.1124, + "step": 19976 + }, + { + "epoch": 2.368907861970829, + "grad_norm": 0.8076262178988202, + "learning_rate": 1.8812051725566798e-05, + "loss": 0.1193, + "step": 19977 + }, + { + "epoch": 2.369026443732954, + "grad_norm": 0.7509835040646926, + "learning_rate": 1.8809726084244516e-05, + "loss": 0.104, + "step": 19978 + }, + { + "epoch": 2.369145025495079, + "grad_norm": 0.46853320952499655, + "learning_rate": 1.880740049998898e-05, + "loss": 0.059, + "step": 19979 + }, + { + "epoch": 2.3692636072572038, + "grad_norm": 0.9098227811272487, + "learning_rate": 1.88050749728216e-05, + "loss": 0.1448, + "step": 19980 + }, + { + "epoch": 2.369382189019329, + "grad_norm": 0.5427449191678437, + "learning_rate": 1.8802749502763846e-05, + "loss": 0.0734, + "step": 19981 + }, + { + "epoch": 2.3695007707814537, + "grad_norm": 0.4884119532013986, + "learning_rate": 1.8800424089837144e-05, + "loss": 0.0688, + "step": 19982 + }, + { + "epoch": 2.369619352543579, + "grad_norm": 0.8124719565471314, + "learning_rate": 1.8798098734062926e-05, + "loss": 0.1323, + "step": 19983 + }, + { + "epoch": 2.3697379343057037, + "grad_norm": 0.6369246503169449, + "learning_rate": 1.8795773435462623e-05, + "loss": 0.0851, + "step": 19984 + }, + { + "epoch": 2.369856516067829, + "grad_norm": 0.7712196186113499, + "learning_rate": 1.879344819405769e-05, + "loss": 0.1618, + "step": 19985 + }, + { + "epoch": 2.3699750978299536, + "grad_norm": 0.7075111478268126, + "learning_rate": 1.8791123009869557e-05, + "loss": 0.0873, + "step": 19986 + }, + { + "epoch": 2.370093679592079, + "grad_norm": 0.6642668229008403, + "learning_rate": 1.878879788291965e-05, + "loss": 0.0973, + "step": 19987 + }, + { + "epoch": 2.3702122613542036, + "grad_norm": 0.7724202527018885, + "learning_rate": 1.8786472813229408e-05, + "loss": 0.0854, + "step": 19988 + }, + { + "epoch": 2.370330843116329, + "grad_norm": 0.6479160696632843, + "learning_rate": 1.8784147800820267e-05, + "loss": 0.0813, + "step": 19989 + }, + { + "epoch": 2.3704494248784536, + "grad_norm": 0.5513009510332562, + "learning_rate": 1.878182284571367e-05, + "loss": 0.0868, + "step": 19990 + }, + { + "epoch": 2.3705680066405788, + "grad_norm": 0.4597233408655208, + "learning_rate": 1.8779497947931028e-05, + "loss": 0.0607, + "step": 19991 + }, + { + "epoch": 2.3706865884027035, + "grad_norm": 0.6797456376956849, + "learning_rate": 1.8777173107493793e-05, + "loss": 0.0865, + "step": 19992 + }, + { + "epoch": 2.3708051701648287, + "grad_norm": 0.6627717403628137, + "learning_rate": 1.8774848324423397e-05, + "loss": 0.0996, + "step": 19993 + }, + { + "epoch": 2.3709237519269535, + "grad_norm": 0.4259367205894566, + "learning_rate": 1.8772523598741264e-05, + "loss": 0.0698, + "step": 19994 + }, + { + "epoch": 2.3710423336890787, + "grad_norm": 0.7483280847053476, + "learning_rate": 1.8770198930468816e-05, + "loss": 0.0812, + "step": 19995 + }, + { + "epoch": 2.3711609154512034, + "grad_norm": 0.8425786695993075, + "learning_rate": 1.8767874319627506e-05, + "loss": 0.1266, + "step": 19996 + }, + { + "epoch": 2.3712794972133286, + "grad_norm": 0.6904726069700745, + "learning_rate": 1.8765549766238753e-05, + "loss": 0.0985, + "step": 19997 + }, + { + "epoch": 2.3713980789754534, + "grad_norm": 0.7439175748092133, + "learning_rate": 1.8763225270323977e-05, + "loss": 0.0903, + "step": 19998 + }, + { + "epoch": 2.3715166607375786, + "grad_norm": 0.6079903772508615, + "learning_rate": 1.876090083190462e-05, + "loss": 0.1002, + "step": 19999 + }, + { + "epoch": 2.3716352424997034, + "grad_norm": 0.5023677329147301, + "learning_rate": 1.87585764510021e-05, + "loss": 0.0686, + "step": 20000 + }, + { + "epoch": 2.3717538242618286, + "grad_norm": 0.6833155940781851, + "learning_rate": 1.8756252127637864e-05, + "loss": 0.1008, + "step": 20001 + }, + { + "epoch": 2.3718724060239533, + "grad_norm": 0.6187885239704627, + "learning_rate": 1.8753927861833324e-05, + "loss": 0.1041, + "step": 20002 + }, + { + "epoch": 2.3719909877860785, + "grad_norm": 0.7107883772514625, + "learning_rate": 1.8751603653609897e-05, + "loss": 0.115, + "step": 20003 + }, + { + "epoch": 2.3721095695482033, + "grad_norm": 0.7619485316360244, + "learning_rate": 1.8749279502989038e-05, + "loss": 0.1139, + "step": 20004 + }, + { + "epoch": 2.3722281513103285, + "grad_norm": 0.6603883485460309, + "learning_rate": 1.874695540999215e-05, + "loss": 0.096, + "step": 20005 + }, + { + "epoch": 2.3723467330724537, + "grad_norm": 0.7199818399211823, + "learning_rate": 1.8744631374640658e-05, + "loss": 0.1054, + "step": 20006 + }, + { + "epoch": 2.3724653148345785, + "grad_norm": 0.5125007184360268, + "learning_rate": 1.8742307396956005e-05, + "loss": 0.0682, + "step": 20007 + }, + { + "epoch": 2.372583896596703, + "grad_norm": 0.8531313601882597, + "learning_rate": 1.87399834769596e-05, + "loss": 0.137, + "step": 20008 + }, + { + "epoch": 2.3727024783588284, + "grad_norm": 0.8474209415371391, + "learning_rate": 1.8737659614672877e-05, + "loss": 0.1373, + "step": 20009 + }, + { + "epoch": 2.3728210601209536, + "grad_norm": 0.6745005106913177, + "learning_rate": 1.8735335810117238e-05, + "loss": 0.0788, + "step": 20010 + }, + { + "epoch": 2.3729396418830784, + "grad_norm": 0.9650746860494496, + "learning_rate": 1.8733012063314132e-05, + "loss": 0.1137, + "step": 20011 + }, + { + "epoch": 2.373058223645203, + "grad_norm": 0.6071916149519339, + "learning_rate": 1.873068837428497e-05, + "loss": 0.0817, + "step": 20012 + }, + { + "epoch": 2.3731768054073283, + "grad_norm": 0.6490907511035657, + "learning_rate": 1.8728364743051176e-05, + "loss": 0.1016, + "step": 20013 + }, + { + "epoch": 2.3732953871694535, + "grad_norm": 0.618994327912844, + "learning_rate": 1.8726041169634152e-05, + "loss": 0.0996, + "step": 20014 + }, + { + "epoch": 2.3734139689315783, + "grad_norm": 0.7522258510340007, + "learning_rate": 1.872371765405535e-05, + "loss": 0.11, + "step": 20015 + }, + { + "epoch": 2.3735325506937035, + "grad_norm": 0.7696799759593597, + "learning_rate": 1.8721394196336173e-05, + "loss": 0.0949, + "step": 20016 + }, + { + "epoch": 2.3736511324558283, + "grad_norm": 0.8883080679971082, + "learning_rate": 1.871907079649804e-05, + "loss": 0.125, + "step": 20017 + }, + { + "epoch": 2.3737697142179535, + "grad_norm": 0.6872628592380868, + "learning_rate": 1.871674745456237e-05, + "loss": 0.1013, + "step": 20018 + }, + { + "epoch": 2.3738882959800782, + "grad_norm": 0.7921745092339323, + "learning_rate": 1.8714424170550583e-05, + "loss": 0.1244, + "step": 20019 + }, + { + "epoch": 2.3740068777422034, + "grad_norm": 0.7793545829117099, + "learning_rate": 1.87121009444841e-05, + "loss": 0.0878, + "step": 20020 + }, + { + "epoch": 2.374125459504328, + "grad_norm": 0.8684786676302539, + "learning_rate": 1.870977777638433e-05, + "loss": 0.1199, + "step": 20021 + }, + { + "epoch": 2.3742440412664534, + "grad_norm": 0.6407039448041054, + "learning_rate": 1.8707454666272702e-05, + "loss": 0.0742, + "step": 20022 + }, + { + "epoch": 2.374362623028578, + "grad_norm": 0.4610225208020165, + "learning_rate": 1.870513161417063e-05, + "loss": 0.0629, + "step": 20023 + }, + { + "epoch": 2.3744812047907033, + "grad_norm": 0.7673068371138112, + "learning_rate": 1.870280862009952e-05, + "loss": 0.1029, + "step": 20024 + }, + { + "epoch": 2.374599786552828, + "grad_norm": 1.0252714015606006, + "learning_rate": 1.8700485684080783e-05, + "loss": 0.1533, + "step": 20025 + }, + { + "epoch": 2.3747183683149533, + "grad_norm": 0.45361727481259223, + "learning_rate": 1.8698162806135855e-05, + "loss": 0.0745, + "step": 20026 + }, + { + "epoch": 2.374836950077078, + "grad_norm": 0.7798144307738056, + "learning_rate": 1.8695839986286134e-05, + "loss": 0.1067, + "step": 20027 + }, + { + "epoch": 2.3749555318392033, + "grad_norm": 0.7710058113181061, + "learning_rate": 1.8693517224553042e-05, + "loss": 0.1056, + "step": 20028 + }, + { + "epoch": 2.375074113601328, + "grad_norm": 0.9960954006215434, + "learning_rate": 1.8691194520957977e-05, + "loss": 0.1222, + "step": 20029 + }, + { + "epoch": 2.3751926953634532, + "grad_norm": 0.8253639992451972, + "learning_rate": 1.868887187552237e-05, + "loss": 0.1041, + "step": 20030 + }, + { + "epoch": 2.375311277125578, + "grad_norm": 0.5603990856004272, + "learning_rate": 1.868654928826763e-05, + "loss": 0.0811, + "step": 20031 + }, + { + "epoch": 2.375429858887703, + "grad_norm": 0.7315301939088297, + "learning_rate": 1.8684226759215147e-05, + "loss": 0.0941, + "step": 20032 + }, + { + "epoch": 2.375548440649828, + "grad_norm": 0.6786204561647634, + "learning_rate": 1.8681904288386366e-05, + "loss": 0.0872, + "step": 20033 + }, + { + "epoch": 2.375667022411953, + "grad_norm": 0.5846913753469047, + "learning_rate": 1.8679581875802673e-05, + "loss": 0.0891, + "step": 20034 + }, + { + "epoch": 2.375785604174078, + "grad_norm": 0.8852418012716938, + "learning_rate": 1.867725952148549e-05, + "loss": 0.1002, + "step": 20035 + }, + { + "epoch": 2.375904185936203, + "grad_norm": 0.5460127583227511, + "learning_rate": 1.8674937225456207e-05, + "loss": 0.0797, + "step": 20036 + }, + { + "epoch": 2.376022767698328, + "grad_norm": 0.7141657086561574, + "learning_rate": 1.8672614987736256e-05, + "loss": 0.0756, + "step": 20037 + }, + { + "epoch": 2.376141349460453, + "grad_norm": 0.6768208917454803, + "learning_rate": 1.8670292808347033e-05, + "loss": 0.0904, + "step": 20038 + }, + { + "epoch": 2.376259931222578, + "grad_norm": 0.6809921578826068, + "learning_rate": 1.8667970687309953e-05, + "loss": 0.1285, + "step": 20039 + }, + { + "epoch": 2.376378512984703, + "grad_norm": 0.5420894548196419, + "learning_rate": 1.8665648624646406e-05, + "loss": 0.0645, + "step": 20040 + }, + { + "epoch": 2.376497094746828, + "grad_norm": 0.7304202582181731, + "learning_rate": 1.8663326620377826e-05, + "loss": 0.1126, + "step": 20041 + }, + { + "epoch": 2.376615676508953, + "grad_norm": 0.6503655707257104, + "learning_rate": 1.86610046745256e-05, + "loss": 0.0821, + "step": 20042 + }, + { + "epoch": 2.3767342582710778, + "grad_norm": 0.6129648842904731, + "learning_rate": 1.865868278711114e-05, + "loss": 0.0705, + "step": 20043 + }, + { + "epoch": 2.376852840033203, + "grad_norm": 0.5945242063665328, + "learning_rate": 1.8656360958155834e-05, + "loss": 0.092, + "step": 20044 + }, + { + "epoch": 2.3769714217953277, + "grad_norm": 0.4965356396674816, + "learning_rate": 1.8654039187681117e-05, + "loss": 0.0709, + "step": 20045 + }, + { + "epoch": 2.377090003557453, + "grad_norm": 0.7081069488984314, + "learning_rate": 1.865171747570837e-05, + "loss": 0.0757, + "step": 20046 + }, + { + "epoch": 2.3772085853195777, + "grad_norm": 0.824380649449185, + "learning_rate": 1.8649395822258996e-05, + "loss": 0.115, + "step": 20047 + }, + { + "epoch": 2.377327167081703, + "grad_norm": 0.7254372060162212, + "learning_rate": 1.8647074227354416e-05, + "loss": 0.0995, + "step": 20048 + }, + { + "epoch": 2.3774457488438276, + "grad_norm": 0.9314634499926518, + "learning_rate": 1.8644752691016014e-05, + "loss": 0.1129, + "step": 20049 + }, + { + "epoch": 2.377564330605953, + "grad_norm": 0.671753283320527, + "learning_rate": 1.8642431213265208e-05, + "loss": 0.0881, + "step": 20050 + }, + { + "epoch": 2.3776829123680776, + "grad_norm": 1.1805712281715173, + "learning_rate": 1.864010979412338e-05, + "loss": 0.1617, + "step": 20051 + }, + { + "epoch": 2.377801494130203, + "grad_norm": 0.8086816647047935, + "learning_rate": 1.8637788433611948e-05, + "loss": 0.1072, + "step": 20052 + }, + { + "epoch": 2.3779200758923276, + "grad_norm": 0.9277793351333308, + "learning_rate": 1.8635467131752303e-05, + "loss": 0.123, + "step": 20053 + }, + { + "epoch": 2.3780386576544528, + "grad_norm": 0.936608419437902, + "learning_rate": 1.8633145888565852e-05, + "loss": 0.1445, + "step": 20054 + }, + { + "epoch": 2.378157239416578, + "grad_norm": 0.6109312966984188, + "learning_rate": 1.8630824704073972e-05, + "loss": 0.081, + "step": 20055 + }, + { + "epoch": 2.3782758211787027, + "grad_norm": 0.6592282112881449, + "learning_rate": 1.862850357829809e-05, + "loss": 0.0877, + "step": 20056 + }, + { + "epoch": 2.3783944029408275, + "grad_norm": 0.6752436648790341, + "learning_rate": 1.862618251125959e-05, + "loss": 0.0819, + "step": 20057 + }, + { + "epoch": 2.3785129847029527, + "grad_norm": 1.0437211360078944, + "learning_rate": 1.8623861502979873e-05, + "loss": 0.1431, + "step": 20058 + }, + { + "epoch": 2.378631566465078, + "grad_norm": 0.6676588929151782, + "learning_rate": 1.862154055348032e-05, + "loss": 0.0908, + "step": 20059 + }, + { + "epoch": 2.3787501482272027, + "grad_norm": 0.5702793301403799, + "learning_rate": 1.8619219662782354e-05, + "loss": 0.0693, + "step": 20060 + }, + { + "epoch": 2.3788687299893274, + "grad_norm": 0.7336422261197367, + "learning_rate": 1.8616898830907363e-05, + "loss": 0.1041, + "step": 20061 + }, + { + "epoch": 2.3789873117514526, + "grad_norm": 0.41698981607663144, + "learning_rate": 1.8614578057876724e-05, + "loss": 0.0696, + "step": 20062 + }, + { + "epoch": 2.379105893513578, + "grad_norm": 0.8740310957967917, + "learning_rate": 1.8612257343711855e-05, + "loss": 0.0984, + "step": 20063 + }, + { + "epoch": 2.3792244752757026, + "grad_norm": 0.6751166075426867, + "learning_rate": 1.8609936688434144e-05, + "loss": 0.0925, + "step": 20064 + }, + { + "epoch": 2.3793430570378273, + "grad_norm": 0.7952344454621917, + "learning_rate": 1.8607616092064975e-05, + "loss": 0.1003, + "step": 20065 + }, + { + "epoch": 2.3794616387999525, + "grad_norm": 1.154005013267378, + "learning_rate": 1.8605295554625738e-05, + "loss": 0.1292, + "step": 20066 + }, + { + "epoch": 2.3795802205620777, + "grad_norm": 1.0289451292869782, + "learning_rate": 1.8602975076137846e-05, + "loss": 0.1543, + "step": 20067 + }, + { + "epoch": 2.3796988023242025, + "grad_norm": 0.7788258309354711, + "learning_rate": 1.8600654656622672e-05, + "loss": 0.0962, + "step": 20068 + }, + { + "epoch": 2.3798173840863277, + "grad_norm": 0.6317958394057012, + "learning_rate": 1.859833429610162e-05, + "loss": 0.0963, + "step": 20069 + }, + { + "epoch": 2.3799359658484525, + "grad_norm": 0.6946637552595233, + "learning_rate": 1.8596013994596066e-05, + "loss": 0.087, + "step": 20070 + }, + { + "epoch": 2.3800545476105777, + "grad_norm": 0.8769487903987615, + "learning_rate": 1.8593693752127424e-05, + "loss": 0.1369, + "step": 20071 + }, + { + "epoch": 2.3801731293727024, + "grad_norm": 0.9252076348023556, + "learning_rate": 1.8591373568717063e-05, + "loss": 0.1562, + "step": 20072 + }, + { + "epoch": 2.3802917111348276, + "grad_norm": 0.6480870293235775, + "learning_rate": 1.8589053444386386e-05, + "loss": 0.0985, + "step": 20073 + }, + { + "epoch": 2.3804102928969524, + "grad_norm": 0.846661432174298, + "learning_rate": 1.8586733379156758e-05, + "loss": 0.1321, + "step": 20074 + }, + { + "epoch": 2.3805288746590776, + "grad_norm": 0.7394596556416534, + "learning_rate": 1.8584413373049597e-05, + "loss": 0.114, + "step": 20075 + }, + { + "epoch": 2.3806474564212023, + "grad_norm": 1.0010875859540005, + "learning_rate": 1.8582093426086277e-05, + "loss": 0.1298, + "step": 20076 + }, + { + "epoch": 2.3807660381833275, + "grad_norm": 0.7308638998002007, + "learning_rate": 1.8579773538288175e-05, + "loss": 0.0967, + "step": 20077 + }, + { + "epoch": 2.3808846199454523, + "grad_norm": 0.714591273044114, + "learning_rate": 1.8577453709676693e-05, + "loss": 0.0989, + "step": 20078 + }, + { + "epoch": 2.3810032017075775, + "grad_norm": 0.7902299195135827, + "learning_rate": 1.8575133940273214e-05, + "loss": 0.1005, + "step": 20079 + }, + { + "epoch": 2.3811217834697023, + "grad_norm": 0.9537961211179414, + "learning_rate": 1.8572814230099127e-05, + "loss": 0.1363, + "step": 20080 + }, + { + "epoch": 2.3812403652318275, + "grad_norm": 0.5100236929041791, + "learning_rate": 1.8570494579175798e-05, + "loss": 0.0636, + "step": 20081 + }, + { + "epoch": 2.3813589469939522, + "grad_norm": 0.9179879961126634, + "learning_rate": 1.8568174987524636e-05, + "loss": 0.1027, + "step": 20082 + }, + { + "epoch": 2.3814775287560774, + "grad_norm": 0.7645832329311963, + "learning_rate": 1.8565855455167013e-05, + "loss": 0.0884, + "step": 20083 + }, + { + "epoch": 2.381596110518202, + "grad_norm": 0.6962922121066605, + "learning_rate": 1.8563535982124313e-05, + "loss": 0.1011, + "step": 20084 + }, + { + "epoch": 2.3817146922803274, + "grad_norm": 0.7865002424786374, + "learning_rate": 1.8561216568417907e-05, + "loss": 0.0939, + "step": 20085 + }, + { + "epoch": 2.381833274042452, + "grad_norm": 0.7489400939392612, + "learning_rate": 1.8558897214069198e-05, + "loss": 0.1028, + "step": 20086 + }, + { + "epoch": 2.3819518558045774, + "grad_norm": 0.5382695584284976, + "learning_rate": 1.8556577919099556e-05, + "loss": 0.073, + "step": 20087 + }, + { + "epoch": 2.382070437566702, + "grad_norm": 0.7400212902354133, + "learning_rate": 1.855425868353037e-05, + "loss": 0.0956, + "step": 20088 + }, + { + "epoch": 2.3821890193288273, + "grad_norm": 0.49158317711938265, + "learning_rate": 1.8551939507383e-05, + "loss": 0.0621, + "step": 20089 + }, + { + "epoch": 2.382307601090952, + "grad_norm": 0.6803653029980141, + "learning_rate": 1.854962039067885e-05, + "loss": 0.0862, + "step": 20090 + }, + { + "epoch": 2.3824261828530773, + "grad_norm": 0.606567152581531, + "learning_rate": 1.8547301333439297e-05, + "loss": 0.0797, + "step": 20091 + }, + { + "epoch": 2.382544764615202, + "grad_norm": 0.6799834855839283, + "learning_rate": 1.8544982335685696e-05, + "loss": 0.0927, + "step": 20092 + }, + { + "epoch": 2.3826633463773272, + "grad_norm": 0.6452006948029182, + "learning_rate": 1.8542663397439456e-05, + "loss": 0.1055, + "step": 20093 + }, + { + "epoch": 2.382781928139452, + "grad_norm": 0.715877539285693, + "learning_rate": 1.8540344518721943e-05, + "loss": 0.1044, + "step": 20094 + }, + { + "epoch": 2.382900509901577, + "grad_norm": 0.6146348137781142, + "learning_rate": 1.8538025699554526e-05, + "loss": 0.0773, + "step": 20095 + }, + { + "epoch": 2.383019091663702, + "grad_norm": 0.7561514380298094, + "learning_rate": 1.853570693995858e-05, + "loss": 0.1019, + "step": 20096 + }, + { + "epoch": 2.383137673425827, + "grad_norm": 0.8490290911103311, + "learning_rate": 1.85333882399555e-05, + "loss": 0.0785, + "step": 20097 + }, + { + "epoch": 2.383256255187952, + "grad_norm": 0.6031138654592563, + "learning_rate": 1.853106959956665e-05, + "loss": 0.0729, + "step": 20098 + }, + { + "epoch": 2.383374836950077, + "grad_norm": 0.5083160176213388, + "learning_rate": 1.8528751018813402e-05, + "loss": 0.0597, + "step": 20099 + }, + { + "epoch": 2.383493418712202, + "grad_norm": 0.5828003995040271, + "learning_rate": 1.852643249771713e-05, + "loss": 0.086, + "step": 20100 + }, + { + "epoch": 2.383612000474327, + "grad_norm": 0.8730734543627151, + "learning_rate": 1.8524114036299217e-05, + "loss": 0.1217, + "step": 20101 + }, + { + "epoch": 2.383730582236452, + "grad_norm": 0.8862285314833737, + "learning_rate": 1.8521795634581034e-05, + "loss": 0.1263, + "step": 20102 + }, + { + "epoch": 2.383849163998577, + "grad_norm": 0.547920223294931, + "learning_rate": 1.8519477292583952e-05, + "loss": 0.07, + "step": 20103 + }, + { + "epoch": 2.3839677457607023, + "grad_norm": 0.5784012338015486, + "learning_rate": 1.8517159010329334e-05, + "loss": 0.0858, + "step": 20104 + }, + { + "epoch": 2.384086327522827, + "grad_norm": 0.6928142497916311, + "learning_rate": 1.8514840787838565e-05, + "loss": 0.0909, + "step": 20105 + }, + { + "epoch": 2.3842049092849518, + "grad_norm": 0.7270575658989316, + "learning_rate": 1.8512522625133012e-05, + "loss": 0.1023, + "step": 20106 + }, + { + "epoch": 2.384323491047077, + "grad_norm": 0.7665738236075885, + "learning_rate": 1.851020452223404e-05, + "loss": 0.0845, + "step": 20107 + }, + { + "epoch": 2.384442072809202, + "grad_norm": 0.571308994529564, + "learning_rate": 1.850788647916303e-05, + "loss": 0.0697, + "step": 20108 + }, + { + "epoch": 2.384560654571327, + "grad_norm": 1.0036755923705976, + "learning_rate": 1.8505568495941345e-05, + "loss": 0.1415, + "step": 20109 + }, + { + "epoch": 2.3846792363334517, + "grad_norm": 0.6579777939156916, + "learning_rate": 1.850325057259035e-05, + "loss": 0.0845, + "step": 20110 + }, + { + "epoch": 2.384797818095577, + "grad_norm": 0.7550888607147503, + "learning_rate": 1.8500932709131418e-05, + "loss": 0.1041, + "step": 20111 + }, + { + "epoch": 2.384916399857702, + "grad_norm": 0.7802694622606636, + "learning_rate": 1.8498614905585924e-05, + "loss": 0.1114, + "step": 20112 + }, + { + "epoch": 2.385034981619827, + "grad_norm": 0.8830622364728647, + "learning_rate": 1.8496297161975227e-05, + "loss": 0.148, + "step": 20113 + }, + { + "epoch": 2.3851535633819516, + "grad_norm": 0.9390721423182212, + "learning_rate": 1.8493979478320693e-05, + "loss": 0.1088, + "step": 20114 + }, + { + "epoch": 2.385272145144077, + "grad_norm": 0.7759337102611471, + "learning_rate": 1.8491661854643682e-05, + "loss": 0.1073, + "step": 20115 + }, + { + "epoch": 2.385390726906202, + "grad_norm": 0.633293861692498, + "learning_rate": 1.8489344290965578e-05, + "loss": 0.094, + "step": 20116 + }, + { + "epoch": 2.3855093086683268, + "grad_norm": 1.069678331844954, + "learning_rate": 1.8487026787307728e-05, + "loss": 0.1248, + "step": 20117 + }, + { + "epoch": 2.385627890430452, + "grad_norm": 0.8635212812352847, + "learning_rate": 1.848470934369151e-05, + "loss": 0.1101, + "step": 20118 + }, + { + "epoch": 2.3857464721925767, + "grad_norm": 0.6179964084225661, + "learning_rate": 1.8482391960138276e-05, + "loss": 0.0898, + "step": 20119 + }, + { + "epoch": 2.385865053954702, + "grad_norm": 0.6034043302271764, + "learning_rate": 1.8480074636669403e-05, + "loss": 0.0848, + "step": 20120 + }, + { + "epoch": 2.3859836357168267, + "grad_norm": 0.7443436211880056, + "learning_rate": 1.8477757373306248e-05, + "loss": 0.0879, + "step": 20121 + }, + { + "epoch": 2.386102217478952, + "grad_norm": 0.7370646218346185, + "learning_rate": 1.847544017007016e-05, + "loss": 0.1, + "step": 20122 + }, + { + "epoch": 2.3862207992410767, + "grad_norm": 0.76969332414105, + "learning_rate": 1.8473123026982527e-05, + "loss": 0.0908, + "step": 20123 + }, + { + "epoch": 2.386339381003202, + "grad_norm": 0.8140302772481701, + "learning_rate": 1.847080594406469e-05, + "loss": 0.1097, + "step": 20124 + }, + { + "epoch": 2.3864579627653266, + "grad_norm": 0.7572864845930481, + "learning_rate": 1.8468488921338016e-05, + "loss": 0.1133, + "step": 20125 + }, + { + "epoch": 2.386576544527452, + "grad_norm": 0.6819373369272999, + "learning_rate": 1.8466171958823858e-05, + "loss": 0.0957, + "step": 20126 + }, + { + "epoch": 2.3866951262895766, + "grad_norm": 0.7114992538091188, + "learning_rate": 1.846385505654359e-05, + "loss": 0.1, + "step": 20127 + }, + { + "epoch": 2.386813708051702, + "grad_norm": 0.8720320332960965, + "learning_rate": 1.8461538214518558e-05, + "loss": 0.1153, + "step": 20128 + }, + { + "epoch": 2.3869322898138265, + "grad_norm": 0.5325852206775019, + "learning_rate": 1.8459221432770134e-05, + "loss": 0.084, + "step": 20129 + }, + { + "epoch": 2.3870508715759517, + "grad_norm": 0.8559706578497095, + "learning_rate": 1.8456904711319655e-05, + "loss": 0.1091, + "step": 20130 + }, + { + "epoch": 2.3871694533380765, + "grad_norm": 0.8900713036895793, + "learning_rate": 1.84545880501885e-05, + "loss": 0.1096, + "step": 20131 + }, + { + "epoch": 2.3872880351002017, + "grad_norm": 0.7227316517801932, + "learning_rate": 1.8452271449398016e-05, + "loss": 0.0876, + "step": 20132 + }, + { + "epoch": 2.3874066168623265, + "grad_norm": 0.6601361169528078, + "learning_rate": 1.8449954908969554e-05, + "loss": 0.0884, + "step": 20133 + }, + { + "epoch": 2.3875251986244517, + "grad_norm": 0.7769074136119168, + "learning_rate": 1.8447638428924484e-05, + "loss": 0.1076, + "step": 20134 + }, + { + "epoch": 2.3876437803865764, + "grad_norm": 0.7546398331725148, + "learning_rate": 1.8445322009284153e-05, + "loss": 0.0726, + "step": 20135 + }, + { + "epoch": 2.3877623621487016, + "grad_norm": 0.7113727449694359, + "learning_rate": 1.844300565006991e-05, + "loss": 0.1104, + "step": 20136 + }, + { + "epoch": 2.3878809439108264, + "grad_norm": 0.6342722037012878, + "learning_rate": 1.844068935130311e-05, + "loss": 0.075, + "step": 20137 + }, + { + "epoch": 2.3879995256729516, + "grad_norm": 0.8984795034638305, + "learning_rate": 1.8438373113005114e-05, + "loss": 0.0923, + "step": 20138 + }, + { + "epoch": 2.3881181074350764, + "grad_norm": 0.6127748659194642, + "learning_rate": 1.8436056935197277e-05, + "loss": 0.0672, + "step": 20139 + }, + { + "epoch": 2.3882366891972016, + "grad_norm": 0.8484250993793856, + "learning_rate": 1.8433740817900945e-05, + "loss": 0.1018, + "step": 20140 + }, + { + "epoch": 2.3883552709593263, + "grad_norm": 0.7660280322166166, + "learning_rate": 1.843142476113746e-05, + "loss": 0.117, + "step": 20141 + }, + { + "epoch": 2.3884738527214515, + "grad_norm": 0.9378746180522559, + "learning_rate": 1.8429108764928195e-05, + "loss": 0.1417, + "step": 20142 + }, + { + "epoch": 2.3885924344835763, + "grad_norm": 0.7199514271976312, + "learning_rate": 1.842679282929449e-05, + "loss": 0.098, + "step": 20143 + }, + { + "epoch": 2.3887110162457015, + "grad_norm": 0.6814133727203312, + "learning_rate": 1.8424476954257697e-05, + "loss": 0.0888, + "step": 20144 + }, + { + "epoch": 2.3888295980078262, + "grad_norm": 0.6722072537453876, + "learning_rate": 1.8422161139839147e-05, + "loss": 0.1106, + "step": 20145 + }, + { + "epoch": 2.3889481797699514, + "grad_norm": 0.9524938679253532, + "learning_rate": 1.841984538606022e-05, + "loss": 0.1293, + "step": 20146 + }, + { + "epoch": 2.389066761532076, + "grad_norm": 0.7070416198910837, + "learning_rate": 1.841752969294224e-05, + "loss": 0.1159, + "step": 20147 + }, + { + "epoch": 2.3891853432942014, + "grad_norm": 0.7464184682673104, + "learning_rate": 1.8415214060506566e-05, + "loss": 0.091, + "step": 20148 + }, + { + "epoch": 2.389303925056326, + "grad_norm": 0.6283072751324671, + "learning_rate": 1.8412898488774544e-05, + "loss": 0.0866, + "step": 20149 + }, + { + "epoch": 2.3894225068184514, + "grad_norm": 0.5021124399646364, + "learning_rate": 1.8410582977767526e-05, + "loss": 0.0641, + "step": 20150 + }, + { + "epoch": 2.389541088580576, + "grad_norm": 0.6896894174073899, + "learning_rate": 1.8408267527506852e-05, + "loss": 0.1122, + "step": 20151 + }, + { + "epoch": 2.3896596703427013, + "grad_norm": 0.8128090666106595, + "learning_rate": 1.8405952138013856e-05, + "loss": 0.1066, + "step": 20152 + }, + { + "epoch": 2.389778252104826, + "grad_norm": 0.6807739981026396, + "learning_rate": 1.8403636809309908e-05, + "loss": 0.0846, + "step": 20153 + }, + { + "epoch": 2.3898968338669513, + "grad_norm": 0.6607749850341189, + "learning_rate": 1.8401321541416338e-05, + "loss": 0.0895, + "step": 20154 + }, + { + "epoch": 2.390015415629076, + "grad_norm": 1.0606973886080773, + "learning_rate": 1.8399006334354487e-05, + "loss": 0.1386, + "step": 20155 + }, + { + "epoch": 2.3901339973912012, + "grad_norm": 0.5479365974117776, + "learning_rate": 1.8396691188145704e-05, + "loss": 0.0654, + "step": 20156 + }, + { + "epoch": 2.3902525791533265, + "grad_norm": 1.1454007980736631, + "learning_rate": 1.8394376102811327e-05, + "loss": 0.165, + "step": 20157 + }, + { + "epoch": 2.390371160915451, + "grad_norm": 0.7050448707201571, + "learning_rate": 1.8392061078372712e-05, + "loss": 0.1016, + "step": 20158 + }, + { + "epoch": 2.390489742677576, + "grad_norm": 0.733314181156609, + "learning_rate": 1.8389746114851186e-05, + "loss": 0.1158, + "step": 20159 + }, + { + "epoch": 2.390608324439701, + "grad_norm": 0.7058487377720404, + "learning_rate": 1.8387431212268087e-05, + "loss": 0.0818, + "step": 20160 + }, + { + "epoch": 2.3907269062018264, + "grad_norm": 0.8188609207654292, + "learning_rate": 1.8385116370644777e-05, + "loss": 0.1267, + "step": 20161 + }, + { + "epoch": 2.390845487963951, + "grad_norm": 0.7790449431345251, + "learning_rate": 1.8382801590002577e-05, + "loss": 0.1062, + "step": 20162 + }, + { + "epoch": 2.390964069726076, + "grad_norm": 0.7622456388919014, + "learning_rate": 1.8380486870362823e-05, + "loss": 0.0921, + "step": 20163 + }, + { + "epoch": 2.391082651488201, + "grad_norm": 0.6218911706626589, + "learning_rate": 1.8378172211746876e-05, + "loss": 0.0874, + "step": 20164 + }, + { + "epoch": 2.3912012332503263, + "grad_norm": 0.8106494735629298, + "learning_rate": 1.8375857614176055e-05, + "loss": 0.0989, + "step": 20165 + }, + { + "epoch": 2.391319815012451, + "grad_norm": 0.8403933660181092, + "learning_rate": 1.8373543077671705e-05, + "loss": 0.0999, + "step": 20166 + }, + { + "epoch": 2.3914383967745763, + "grad_norm": 0.8096917198541487, + "learning_rate": 1.8371228602255158e-05, + "loss": 0.1061, + "step": 20167 + }, + { + "epoch": 2.391556978536701, + "grad_norm": 0.8539334635748179, + "learning_rate": 1.8368914187947756e-05, + "loss": 0.1041, + "step": 20168 + }, + { + "epoch": 2.391675560298826, + "grad_norm": 0.6872406873429642, + "learning_rate": 1.8366599834770836e-05, + "loss": 0.1005, + "step": 20169 + }, + { + "epoch": 2.391794142060951, + "grad_norm": 0.6998399206428075, + "learning_rate": 1.8364285542745735e-05, + "loss": 0.0907, + "step": 20170 + }, + { + "epoch": 2.391912723823076, + "grad_norm": 0.7878944488121976, + "learning_rate": 1.8361971311893773e-05, + "loss": 0.1133, + "step": 20171 + }, + { + "epoch": 2.392031305585201, + "grad_norm": 0.8912595980965038, + "learning_rate": 1.8359657142236302e-05, + "loss": 0.1126, + "step": 20172 + }, + { + "epoch": 2.392149887347326, + "grad_norm": 0.6450963853434746, + "learning_rate": 1.8357343033794655e-05, + "loss": 0.0815, + "step": 20173 + }, + { + "epoch": 2.392268469109451, + "grad_norm": 0.7305521201649157, + "learning_rate": 1.8355028986590156e-05, + "loss": 0.105, + "step": 20174 + }, + { + "epoch": 2.392387050871576, + "grad_norm": 0.9125933819244482, + "learning_rate": 1.835271500064413e-05, + "loss": 0.1096, + "step": 20175 + }, + { + "epoch": 2.392505632633701, + "grad_norm": 0.4820707428904465, + "learning_rate": 1.8350401075977933e-05, + "loss": 0.069, + "step": 20176 + }, + { + "epoch": 2.392624214395826, + "grad_norm": 0.8702795776093413, + "learning_rate": 1.834808721261288e-05, + "loss": 0.1304, + "step": 20177 + }, + { + "epoch": 2.392742796157951, + "grad_norm": 0.852161136214693, + "learning_rate": 1.83457734105703e-05, + "loss": 0.0977, + "step": 20178 + }, + { + "epoch": 2.392861377920076, + "grad_norm": 0.6986128126588067, + "learning_rate": 1.8343459669871533e-05, + "loss": 0.0888, + "step": 20179 + }, + { + "epoch": 2.392979959682201, + "grad_norm": 0.5752810955632451, + "learning_rate": 1.8341145990537912e-05, + "loss": 0.0728, + "step": 20180 + }, + { + "epoch": 2.393098541444326, + "grad_norm": 1.079436702713155, + "learning_rate": 1.8338832372590754e-05, + "loss": 0.1676, + "step": 20181 + }, + { + "epoch": 2.3932171232064507, + "grad_norm": 0.7425203041594929, + "learning_rate": 1.8336518816051385e-05, + "loss": 0.096, + "step": 20182 + }, + { + "epoch": 2.393335704968576, + "grad_norm": 0.7616232576644861, + "learning_rate": 1.833420532094115e-05, + "loss": 0.1142, + "step": 20183 + }, + { + "epoch": 2.3934542867307007, + "grad_norm": 0.6964163997229822, + "learning_rate": 1.8331891887281373e-05, + "loss": 0.0931, + "step": 20184 + }, + { + "epoch": 2.393572868492826, + "grad_norm": 0.6110957423807838, + "learning_rate": 1.8329578515093367e-05, + "loss": 0.0821, + "step": 20185 + }, + { + "epoch": 2.3936914502549507, + "grad_norm": 0.8454678092906918, + "learning_rate": 1.832726520439847e-05, + "loss": 0.118, + "step": 20186 + }, + { + "epoch": 2.393810032017076, + "grad_norm": 0.8189225310664142, + "learning_rate": 1.8324951955218002e-05, + "loss": 0.1067, + "step": 20187 + }, + { + "epoch": 2.3939286137792006, + "grad_norm": 0.5771304777291457, + "learning_rate": 1.83226387675733e-05, + "loss": 0.0698, + "step": 20188 + }, + { + "epoch": 2.394047195541326, + "grad_norm": 0.6142344193327101, + "learning_rate": 1.8320325641485678e-05, + "loss": 0.1012, + "step": 20189 + }, + { + "epoch": 2.3941657773034506, + "grad_norm": 0.6959814582365136, + "learning_rate": 1.831801257697645e-05, + "loss": 0.0746, + "step": 20190 + }, + { + "epoch": 2.394284359065576, + "grad_norm": 0.9340565941007684, + "learning_rate": 1.8315699574066967e-05, + "loss": 0.1338, + "step": 20191 + }, + { + "epoch": 2.3944029408277006, + "grad_norm": 0.720978300244913, + "learning_rate": 1.8313386632778533e-05, + "loss": 0.0976, + "step": 20192 + }, + { + "epoch": 2.3945215225898258, + "grad_norm": 0.8015433482908784, + "learning_rate": 1.8311073753132468e-05, + "loss": 0.1069, + "step": 20193 + }, + { + "epoch": 2.3946401043519505, + "grad_norm": 0.9327580341810934, + "learning_rate": 1.8308760935150107e-05, + "loss": 0.131, + "step": 20194 + }, + { + "epoch": 2.3947586861140757, + "grad_norm": 0.9035398903111694, + "learning_rate": 1.8306448178852768e-05, + "loss": 0.1395, + "step": 20195 + }, + { + "epoch": 2.3948772678762005, + "grad_norm": 0.8906762321184629, + "learning_rate": 1.8304135484261762e-05, + "loss": 0.1255, + "step": 20196 + }, + { + "epoch": 2.3949958496383257, + "grad_norm": 0.6518388451696009, + "learning_rate": 1.8301822851398414e-05, + "loss": 0.0974, + "step": 20197 + }, + { + "epoch": 2.3951144314004504, + "grad_norm": 0.7914938004378994, + "learning_rate": 1.829951028028405e-05, + "loss": 0.1105, + "step": 20198 + }, + { + "epoch": 2.3952330131625756, + "grad_norm": 0.5831025011964127, + "learning_rate": 1.8297197770939986e-05, + "loss": 0.0865, + "step": 20199 + }, + { + "epoch": 2.3953515949247004, + "grad_norm": 0.6483507161309409, + "learning_rate": 1.829488532338754e-05, + "loss": 0.1092, + "step": 20200 + }, + { + "epoch": 2.3954701766868256, + "grad_norm": 0.7720660789455756, + "learning_rate": 1.829257293764802e-05, + "loss": 0.108, + "step": 20201 + }, + { + "epoch": 2.3955887584489504, + "grad_norm": 0.608105796589993, + "learning_rate": 1.829026061374276e-05, + "loss": 0.0695, + "step": 20202 + }, + { + "epoch": 2.3957073402110756, + "grad_norm": 0.6322678578394265, + "learning_rate": 1.828794835169307e-05, + "loss": 0.0904, + "step": 20203 + }, + { + "epoch": 2.3958259219732003, + "grad_norm": 0.5383261242506496, + "learning_rate": 1.8285636151520254e-05, + "loss": 0.0875, + "step": 20204 + }, + { + "epoch": 2.3959445037353255, + "grad_norm": 0.608096844237224, + "learning_rate": 1.828332401324565e-05, + "loss": 0.0673, + "step": 20205 + }, + { + "epoch": 2.3960630854974507, + "grad_norm": 0.6930222564636366, + "learning_rate": 1.8281011936890566e-05, + "loss": 0.0992, + "step": 20206 + }, + { + "epoch": 2.3961816672595755, + "grad_norm": 0.9193628072341307, + "learning_rate": 1.8278699922476304e-05, + "loss": 0.1314, + "step": 20207 + }, + { + "epoch": 2.3963002490217002, + "grad_norm": 1.2028738665327825, + "learning_rate": 1.8276387970024186e-05, + "loss": 0.1753, + "step": 20208 + }, + { + "epoch": 2.3964188307838254, + "grad_norm": 1.2614226362823981, + "learning_rate": 1.8274076079555526e-05, + "loss": 0.0881, + "step": 20209 + }, + { + "epoch": 2.3965374125459507, + "grad_norm": 0.9380052023510161, + "learning_rate": 1.8271764251091643e-05, + "loss": 0.1376, + "step": 20210 + }, + { + "epoch": 2.3966559943080754, + "grad_norm": 0.5306159867315311, + "learning_rate": 1.8269452484653842e-05, + "loss": 0.0669, + "step": 20211 + }, + { + "epoch": 2.3967745760702, + "grad_norm": 0.7831651722768345, + "learning_rate": 1.8267140780263427e-05, + "loss": 0.0973, + "step": 20212 + }, + { + "epoch": 2.3968931578323254, + "grad_norm": 0.9719080156719679, + "learning_rate": 1.8264829137941725e-05, + "loss": 0.1298, + "step": 20213 + }, + { + "epoch": 2.3970117395944506, + "grad_norm": 0.5944239320543169, + "learning_rate": 1.826251755771004e-05, + "loss": 0.0892, + "step": 20214 + }, + { + "epoch": 2.3971303213565753, + "grad_norm": 0.6146054451290544, + "learning_rate": 1.8260206039589678e-05, + "loss": 0.0729, + "step": 20215 + }, + { + "epoch": 2.3972489031187005, + "grad_norm": 0.6886656749621559, + "learning_rate": 1.825789458360195e-05, + "loss": 0.0646, + "step": 20216 + }, + { + "epoch": 2.3973674848808253, + "grad_norm": 0.6470472843153998, + "learning_rate": 1.8255583189768166e-05, + "loss": 0.0694, + "step": 20217 + }, + { + "epoch": 2.3974860666429505, + "grad_norm": 0.7780576826309862, + "learning_rate": 1.8253271858109642e-05, + "loss": 0.1146, + "step": 20218 + }, + { + "epoch": 2.3976046484050753, + "grad_norm": 0.5343725792360327, + "learning_rate": 1.8250960588647663e-05, + "loss": 0.0899, + "step": 20219 + }, + { + "epoch": 2.3977232301672005, + "grad_norm": 0.42819331639618413, + "learning_rate": 1.8248649381403565e-05, + "loss": 0.0657, + "step": 20220 + }, + { + "epoch": 2.397841811929325, + "grad_norm": 0.5697335403239753, + "learning_rate": 1.824633823639864e-05, + "loss": 0.0963, + "step": 20221 + }, + { + "epoch": 2.3979603936914504, + "grad_norm": 0.8334343324950962, + "learning_rate": 1.8244027153654196e-05, + "loss": 0.1495, + "step": 20222 + }, + { + "epoch": 2.398078975453575, + "grad_norm": 0.6392587644905395, + "learning_rate": 1.8241716133191522e-05, + "loss": 0.0838, + "step": 20223 + }, + { + "epoch": 2.3981975572157004, + "grad_norm": 0.7595413064720618, + "learning_rate": 1.8239405175031953e-05, + "loss": 0.0953, + "step": 20224 + }, + { + "epoch": 2.398316138977825, + "grad_norm": 0.5863932203033548, + "learning_rate": 1.8237094279196776e-05, + "loss": 0.0962, + "step": 20225 + }, + { + "epoch": 2.3984347207399503, + "grad_norm": 0.9070849128170728, + "learning_rate": 1.8234783445707294e-05, + "loss": 0.1509, + "step": 20226 + }, + { + "epoch": 2.398553302502075, + "grad_norm": 0.5121760376522431, + "learning_rate": 1.8232472674584804e-05, + "loss": 0.073, + "step": 20227 + }, + { + "epoch": 2.3986718842642003, + "grad_norm": 0.7619461116100906, + "learning_rate": 1.8230161965850626e-05, + "loss": 0.1249, + "step": 20228 + }, + { + "epoch": 2.398790466026325, + "grad_norm": 0.7895902224923023, + "learning_rate": 1.8227851319526057e-05, + "loss": 0.1039, + "step": 20229 + }, + { + "epoch": 2.3989090477884503, + "grad_norm": 0.6155987870310066, + "learning_rate": 1.822554073563239e-05, + "loss": 0.105, + "step": 20230 + }, + { + "epoch": 2.399027629550575, + "grad_norm": 0.6428010846888255, + "learning_rate": 1.8223230214190923e-05, + "loss": 0.0956, + "step": 20231 + }, + { + "epoch": 2.3991462113127002, + "grad_norm": 0.5540420363552103, + "learning_rate": 1.822091975522297e-05, + "loss": 0.0602, + "step": 20232 + }, + { + "epoch": 2.399264793074825, + "grad_norm": 0.7667660368351557, + "learning_rate": 1.8218609358749827e-05, + "loss": 0.09, + "step": 20233 + }, + { + "epoch": 2.39938337483695, + "grad_norm": 0.7680139255877487, + "learning_rate": 1.821629902479278e-05, + "loss": 0.0906, + "step": 20234 + }, + { + "epoch": 2.399501956599075, + "grad_norm": 0.7585755502828114, + "learning_rate": 1.8213988753373146e-05, + "loss": 0.0969, + "step": 20235 + }, + { + "epoch": 2.3996205383612, + "grad_norm": 0.71150019056051, + "learning_rate": 1.821167854451221e-05, + "loss": 0.0817, + "step": 20236 + }, + { + "epoch": 2.399739120123325, + "grad_norm": 0.6523919549730853, + "learning_rate": 1.8209368398231278e-05, + "loss": 0.0838, + "step": 20237 + }, + { + "epoch": 2.39985770188545, + "grad_norm": 0.5185640564691125, + "learning_rate": 1.820705831455163e-05, + "loss": 0.0608, + "step": 20238 + }, + { + "epoch": 2.399976283647575, + "grad_norm": 0.9012230812888348, + "learning_rate": 1.820474829349459e-05, + "loss": 0.1518, + "step": 20239 + }, + { + "epoch": 2.4000948654097, + "grad_norm": 0.8628843375187636, + "learning_rate": 1.820243833508143e-05, + "loss": 0.0947, + "step": 20240 + }, + { + "epoch": 2.400213447171825, + "grad_norm": 0.7848530143345541, + "learning_rate": 1.8200128439333457e-05, + "loss": 0.1137, + "step": 20241 + }, + { + "epoch": 2.40033202893395, + "grad_norm": 0.395766160262144, + "learning_rate": 1.8197818606271953e-05, + "loss": 0.0534, + "step": 20242 + }, + { + "epoch": 2.400450610696075, + "grad_norm": 0.7554848909622699, + "learning_rate": 1.819550883591823e-05, + "loss": 0.0953, + "step": 20243 + }, + { + "epoch": 2.4005691924582, + "grad_norm": 0.5696078136193001, + "learning_rate": 1.819319912829357e-05, + "loss": 0.0783, + "step": 20244 + }, + { + "epoch": 2.4006877742203248, + "grad_norm": 0.8611194799947217, + "learning_rate": 1.819088948341926e-05, + "loss": 0.109, + "step": 20245 + }, + { + "epoch": 2.40080635598245, + "grad_norm": 0.808389714861101, + "learning_rate": 1.81885799013166e-05, + "loss": 0.1086, + "step": 20246 + }, + { + "epoch": 2.4009249377445747, + "grad_norm": 0.9093187969641613, + "learning_rate": 1.8186270382006882e-05, + "loss": 0.1309, + "step": 20247 + }, + { + "epoch": 2.4010435195067, + "grad_norm": 0.6545186402923558, + "learning_rate": 1.81839609255114e-05, + "loss": 0.0952, + "step": 20248 + }, + { + "epoch": 2.4011621012688247, + "grad_norm": 0.4448987227150256, + "learning_rate": 1.818165153185143e-05, + "loss": 0.0697, + "step": 20249 + }, + { + "epoch": 2.40128068303095, + "grad_norm": 0.6612335969842862, + "learning_rate": 1.817934220104828e-05, + "loss": 0.105, + "step": 20250 + }, + { + "epoch": 2.4013992647930746, + "grad_norm": 0.6239384863150024, + "learning_rate": 1.8177032933123235e-05, + "loss": 0.0811, + "step": 20251 + }, + { + "epoch": 2.4015178465552, + "grad_norm": 0.7528620249478584, + "learning_rate": 1.8174723728097575e-05, + "loss": 0.0889, + "step": 20252 + }, + { + "epoch": 2.4016364283173246, + "grad_norm": 0.8704089126696227, + "learning_rate": 1.8172414585992583e-05, + "loss": 0.1259, + "step": 20253 + }, + { + "epoch": 2.40175501007945, + "grad_norm": 0.8046961026245355, + "learning_rate": 1.8170105506829565e-05, + "loss": 0.1042, + "step": 20254 + }, + { + "epoch": 2.401873591841575, + "grad_norm": 0.6143015146758154, + "learning_rate": 1.8167796490629803e-05, + "loss": 0.0867, + "step": 20255 + }, + { + "epoch": 2.4019921736036998, + "grad_norm": 0.451299354282776, + "learning_rate": 1.816548753741457e-05, + "loss": 0.0576, + "step": 20256 + }, + { + "epoch": 2.4021107553658245, + "grad_norm": 0.6569404228745713, + "learning_rate": 1.8163178647205165e-05, + "loss": 0.0865, + "step": 20257 + }, + { + "epoch": 2.4022293371279497, + "grad_norm": 0.8051712221091767, + "learning_rate": 1.816086982002287e-05, + "loss": 0.0843, + "step": 20258 + }, + { + "epoch": 2.402347918890075, + "grad_norm": 0.9204449173894176, + "learning_rate": 1.8158561055888968e-05, + "loss": 0.1215, + "step": 20259 + }, + { + "epoch": 2.4024665006521997, + "grad_norm": 0.6151187472691754, + "learning_rate": 1.8156252354824747e-05, + "loss": 0.0808, + "step": 20260 + }, + { + "epoch": 2.4025850824143244, + "grad_norm": 0.6195039663614463, + "learning_rate": 1.8153943716851474e-05, + "loss": 0.0937, + "step": 20261 + }, + { + "epoch": 2.4027036641764496, + "grad_norm": 0.8179649549813086, + "learning_rate": 1.8151635141990457e-05, + "loss": 0.1129, + "step": 20262 + }, + { + "epoch": 2.402822245938575, + "grad_norm": 0.5767144441753485, + "learning_rate": 1.8149326630262966e-05, + "loss": 0.0569, + "step": 20263 + }, + { + "epoch": 2.4029408277006996, + "grad_norm": 0.862229218455208, + "learning_rate": 1.814701818169027e-05, + "loss": 0.1202, + "step": 20264 + }, + { + "epoch": 2.4030594094628244, + "grad_norm": 0.7929866410648235, + "learning_rate": 1.814470979629368e-05, + "loss": 0.117, + "step": 20265 + }, + { + "epoch": 2.4031779912249496, + "grad_norm": 0.7246571599252207, + "learning_rate": 1.8142401474094448e-05, + "loss": 0.0956, + "step": 20266 + }, + { + "epoch": 2.4032965729870748, + "grad_norm": 0.8485486527499116, + "learning_rate": 1.814009321511387e-05, + "loss": 0.1348, + "step": 20267 + }, + { + "epoch": 2.4034151547491995, + "grad_norm": 0.7095158743623139, + "learning_rate": 1.8137785019373212e-05, + "loss": 0.0962, + "step": 20268 + }, + { + "epoch": 2.4035337365113247, + "grad_norm": 0.7311967612706811, + "learning_rate": 1.813547688689377e-05, + "loss": 0.1142, + "step": 20269 + }, + { + "epoch": 2.4036523182734495, + "grad_norm": 0.889555435554713, + "learning_rate": 1.8133168817696815e-05, + "loss": 0.1197, + "step": 20270 + }, + { + "epoch": 2.4037709000355747, + "grad_norm": 0.7499216462521076, + "learning_rate": 1.8130860811803624e-05, + "loss": 0.1083, + "step": 20271 + }, + { + "epoch": 2.4038894817976995, + "grad_norm": 0.4865696765007904, + "learning_rate": 1.812855286923546e-05, + "loss": 0.0691, + "step": 20272 + }, + { + "epoch": 2.4040080635598247, + "grad_norm": 0.7936154334613709, + "learning_rate": 1.8126244990013623e-05, + "loss": 0.1218, + "step": 20273 + }, + { + "epoch": 2.4041266453219494, + "grad_norm": 0.49068779666559714, + "learning_rate": 1.812393717415938e-05, + "loss": 0.0714, + "step": 20274 + }, + { + "epoch": 2.4042452270840746, + "grad_norm": 0.5555146549581277, + "learning_rate": 1.8121629421693996e-05, + "loss": 0.0668, + "step": 20275 + }, + { + "epoch": 2.4043638088461994, + "grad_norm": 0.46195251749620236, + "learning_rate": 1.8119321732638754e-05, + "loss": 0.0653, + "step": 20276 + }, + { + "epoch": 2.4044823906083246, + "grad_norm": 0.5497777448237029, + "learning_rate": 1.811701410701493e-05, + "loss": 0.062, + "step": 20277 + }, + { + "epoch": 2.4046009723704493, + "grad_norm": 0.9721381634825974, + "learning_rate": 1.81147065448438e-05, + "loss": 0.1451, + "step": 20278 + }, + { + "epoch": 2.4047195541325745, + "grad_norm": 1.205398533873662, + "learning_rate": 1.8112399046146623e-05, + "loss": 0.156, + "step": 20279 + }, + { + "epoch": 2.4048381358946993, + "grad_norm": 0.7470030294455692, + "learning_rate": 1.811009161094469e-05, + "loss": 0.0946, + "step": 20280 + }, + { + "epoch": 2.4049567176568245, + "grad_norm": 0.6455839224126199, + "learning_rate": 1.810778423925926e-05, + "loss": 0.089, + "step": 20281 + }, + { + "epoch": 2.4050752994189493, + "grad_norm": 0.5713770791967856, + "learning_rate": 1.8105476931111612e-05, + "loss": 0.0932, + "step": 20282 + }, + { + "epoch": 2.4051938811810745, + "grad_norm": 0.7432208347462781, + "learning_rate": 1.8103169686523e-05, + "loss": 0.0812, + "step": 20283 + }, + { + "epoch": 2.4053124629431992, + "grad_norm": 1.1168451610322907, + "learning_rate": 1.8100862505514715e-05, + "loss": 0.1337, + "step": 20284 + }, + { + "epoch": 2.4054310447053244, + "grad_norm": 0.7296503454066819, + "learning_rate": 1.8098555388108016e-05, + "loss": 0.0824, + "step": 20285 + }, + { + "epoch": 2.405549626467449, + "grad_norm": 1.0157457081905075, + "learning_rate": 1.8096248334324175e-05, + "loss": 0.1222, + "step": 20286 + }, + { + "epoch": 2.4056682082295744, + "grad_norm": 0.5496706056616437, + "learning_rate": 1.809394134418445e-05, + "loss": 0.0773, + "step": 20287 + }, + { + "epoch": 2.405786789991699, + "grad_norm": 0.9326010145920656, + "learning_rate": 1.8091634417710128e-05, + "loss": 0.1277, + "step": 20288 + }, + { + "epoch": 2.4059053717538244, + "grad_norm": 0.5526212424774384, + "learning_rate": 1.8089327554922465e-05, + "loss": 0.0738, + "step": 20289 + }, + { + "epoch": 2.406023953515949, + "grad_norm": 0.832048241824183, + "learning_rate": 1.8087020755842715e-05, + "loss": 0.1142, + "step": 20290 + }, + { + "epoch": 2.4061425352780743, + "grad_norm": 0.7039024507934304, + "learning_rate": 1.8084714020492165e-05, + "loss": 0.0744, + "step": 20291 + }, + { + "epoch": 2.406261117040199, + "grad_norm": 0.8461714939991171, + "learning_rate": 1.808240734889208e-05, + "loss": 0.1027, + "step": 20292 + }, + { + "epoch": 2.4063796988023243, + "grad_norm": 0.664733199278252, + "learning_rate": 1.8080100741063708e-05, + "loss": 0.0984, + "step": 20293 + }, + { + "epoch": 2.406498280564449, + "grad_norm": 0.5455124217545693, + "learning_rate": 1.8077794197028315e-05, + "loss": 0.0897, + "step": 20294 + }, + { + "epoch": 2.4066168623265742, + "grad_norm": 0.6665608524809564, + "learning_rate": 1.8075487716807176e-05, + "loss": 0.0962, + "step": 20295 + }, + { + "epoch": 2.406735444088699, + "grad_norm": 0.7377360885691243, + "learning_rate": 1.807318130042155e-05, + "loss": 0.0819, + "step": 20296 + }, + { + "epoch": 2.406854025850824, + "grad_norm": 0.5665908120815473, + "learning_rate": 1.8070874947892703e-05, + "loss": 0.0731, + "step": 20297 + }, + { + "epoch": 2.406972607612949, + "grad_norm": 0.44184085929687356, + "learning_rate": 1.8068568659241878e-05, + "loss": 0.0558, + "step": 20298 + }, + { + "epoch": 2.407091189375074, + "grad_norm": 0.728125369270018, + "learning_rate": 1.806626243449036e-05, + "loss": 0.0915, + "step": 20299 + }, + { + "epoch": 2.407209771137199, + "grad_norm": 0.5531013929079277, + "learning_rate": 1.8063956273659406e-05, + "loss": 0.0792, + "step": 20300 + }, + { + "epoch": 2.407328352899324, + "grad_norm": 1.0796998474877508, + "learning_rate": 1.806165017677026e-05, + "loss": 0.1309, + "step": 20301 + }, + { + "epoch": 2.407446934661449, + "grad_norm": 0.6900631384691506, + "learning_rate": 1.805934414384419e-05, + "loss": 0.0896, + "step": 20302 + }, + { + "epoch": 2.407565516423574, + "grad_norm": 0.5823068681293946, + "learning_rate": 1.8057038174902462e-05, + "loss": 0.093, + "step": 20303 + }, + { + "epoch": 2.4076840981856993, + "grad_norm": 0.7142500215869957, + "learning_rate": 1.805473226996633e-05, + "loss": 0.0836, + "step": 20304 + }, + { + "epoch": 2.407802679947824, + "grad_norm": 0.6384902967100908, + "learning_rate": 1.8052426429057036e-05, + "loss": 0.1004, + "step": 20305 + }, + { + "epoch": 2.407921261709949, + "grad_norm": 0.7859206484258509, + "learning_rate": 1.8050120652195864e-05, + "loss": 0.1007, + "step": 20306 + }, + { + "epoch": 2.408039843472074, + "grad_norm": 0.6133774415717719, + "learning_rate": 1.8047814939404047e-05, + "loss": 0.0709, + "step": 20307 + }, + { + "epoch": 2.408158425234199, + "grad_norm": 0.7922114813952226, + "learning_rate": 1.8045509290702863e-05, + "loss": 0.1136, + "step": 20308 + }, + { + "epoch": 2.408277006996324, + "grad_norm": 0.6222953194908614, + "learning_rate": 1.8043203706113538e-05, + "loss": 0.0952, + "step": 20309 + }, + { + "epoch": 2.4083955887584487, + "grad_norm": 0.7691824160959038, + "learning_rate": 1.804089818565736e-05, + "loss": 0.1142, + "step": 20310 + }, + { + "epoch": 2.408514170520574, + "grad_norm": 0.7043058293773815, + "learning_rate": 1.8038592729355563e-05, + "loss": 0.083, + "step": 20311 + }, + { + "epoch": 2.408632752282699, + "grad_norm": 0.9097585254286246, + "learning_rate": 1.8036287337229407e-05, + "loss": 0.1158, + "step": 20312 + }, + { + "epoch": 2.408751334044824, + "grad_norm": 0.7029099799155933, + "learning_rate": 1.803398200930013e-05, + "loss": 0.0804, + "step": 20313 + }, + { + "epoch": 2.4088699158069486, + "grad_norm": 0.641806891222451, + "learning_rate": 1.803167674558901e-05, + "loss": 0.0884, + "step": 20314 + }, + { + "epoch": 2.408988497569074, + "grad_norm": 0.5998150379625741, + "learning_rate": 1.8029371546117274e-05, + "loss": 0.0913, + "step": 20315 + }, + { + "epoch": 2.409107079331199, + "grad_norm": 0.47997620134672975, + "learning_rate": 1.8027066410906195e-05, + "loss": 0.0512, + "step": 20316 + }, + { + "epoch": 2.409225661093324, + "grad_norm": 0.7131111061415353, + "learning_rate": 1.8024761339977e-05, + "loss": 0.0933, + "step": 20317 + }, + { + "epoch": 2.409344242855449, + "grad_norm": 0.8215834974774233, + "learning_rate": 1.8022456333350964e-05, + "loss": 0.1027, + "step": 20318 + }, + { + "epoch": 2.4094628246175738, + "grad_norm": 0.685318405419804, + "learning_rate": 1.802015139104932e-05, + "loss": 0.0828, + "step": 20319 + }, + { + "epoch": 2.409581406379699, + "grad_norm": 0.691091199619396, + "learning_rate": 1.8017846513093316e-05, + "loss": 0.0781, + "step": 20320 + }, + { + "epoch": 2.4096999881418237, + "grad_norm": 0.6891089505263236, + "learning_rate": 1.8015541699504215e-05, + "loss": 0.0764, + "step": 20321 + }, + { + "epoch": 2.409818569903949, + "grad_norm": 0.79587171835386, + "learning_rate": 1.8013236950303252e-05, + "loss": 0.1178, + "step": 20322 + }, + { + "epoch": 2.4099371516660737, + "grad_norm": 0.8249123459010869, + "learning_rate": 1.8010932265511678e-05, + "loss": 0.0771, + "step": 20323 + }, + { + "epoch": 2.410055733428199, + "grad_norm": 0.5841417314090225, + "learning_rate": 1.8008627645150728e-05, + "loss": 0.0835, + "step": 20324 + }, + { + "epoch": 2.4101743151903237, + "grad_norm": 0.8205867546376542, + "learning_rate": 1.8006323089241667e-05, + "loss": 0.1062, + "step": 20325 + }, + { + "epoch": 2.410292896952449, + "grad_norm": 0.6887145093461642, + "learning_rate": 1.800401859780573e-05, + "loss": 0.0783, + "step": 20326 + }, + { + "epoch": 2.4104114787145736, + "grad_norm": 1.1034546289688203, + "learning_rate": 1.800171417086416e-05, + "loss": 0.1503, + "step": 20327 + }, + { + "epoch": 2.410530060476699, + "grad_norm": 0.5540826900876453, + "learning_rate": 1.7999409808438193e-05, + "loss": 0.0754, + "step": 20328 + }, + { + "epoch": 2.4106486422388236, + "grad_norm": 0.8487158204589229, + "learning_rate": 1.79971055105491e-05, + "loss": 0.1134, + "step": 20329 + }, + { + "epoch": 2.410767224000949, + "grad_norm": 1.4232341533663748, + "learning_rate": 1.7994801277218104e-05, + "loss": 0.1521, + "step": 20330 + }, + { + "epoch": 2.4108858057630735, + "grad_norm": 0.6553119424182533, + "learning_rate": 1.799249710846645e-05, + "loss": 0.0847, + "step": 20331 + }, + { + "epoch": 2.4110043875251987, + "grad_norm": 0.7234114381848568, + "learning_rate": 1.799019300431537e-05, + "loss": 0.0934, + "step": 20332 + }, + { + "epoch": 2.4111229692873235, + "grad_norm": 0.5889199244837947, + "learning_rate": 1.798788896478612e-05, + "loss": 0.0697, + "step": 20333 + }, + { + "epoch": 2.4112415510494487, + "grad_norm": 0.6658008510574517, + "learning_rate": 1.798558498989994e-05, + "loss": 0.0909, + "step": 20334 + }, + { + "epoch": 2.4113601328115735, + "grad_norm": 0.5523731498855026, + "learning_rate": 1.798328107967805e-05, + "loss": 0.07, + "step": 20335 + }, + { + "epoch": 2.4114787145736987, + "grad_norm": 0.842829892505244, + "learning_rate": 1.7980977234141723e-05, + "loss": 0.0907, + "step": 20336 + }, + { + "epoch": 2.4115972963358234, + "grad_norm": 0.7290979108624341, + "learning_rate": 1.7978673453312163e-05, + "loss": 0.1044, + "step": 20337 + }, + { + "epoch": 2.4117158780979486, + "grad_norm": 0.8632725002381179, + "learning_rate": 1.7976369737210634e-05, + "loss": 0.0785, + "step": 20338 + }, + { + "epoch": 2.4118344598600734, + "grad_norm": 0.877089042745193, + "learning_rate": 1.7974066085858354e-05, + "loss": 0.129, + "step": 20339 + }, + { + "epoch": 2.4119530416221986, + "grad_norm": 0.9413547980998684, + "learning_rate": 1.7971762499276577e-05, + "loss": 0.1345, + "step": 20340 + }, + { + "epoch": 2.4120716233843233, + "grad_norm": 0.6403470033201398, + "learning_rate": 1.796945897748653e-05, + "loss": 0.1115, + "step": 20341 + }, + { + "epoch": 2.4121902051464486, + "grad_norm": 0.68543951834747, + "learning_rate": 1.7967155520509454e-05, + "loss": 0.1042, + "step": 20342 + }, + { + "epoch": 2.4123087869085733, + "grad_norm": 0.721709613048537, + "learning_rate": 1.7964852128366566e-05, + "loss": 0.1137, + "step": 20343 + }, + { + "epoch": 2.4124273686706985, + "grad_norm": 0.7114750916648644, + "learning_rate": 1.7962548801079127e-05, + "loss": 0.0806, + "step": 20344 + }, + { + "epoch": 2.4125459504328233, + "grad_norm": 0.7116440082954338, + "learning_rate": 1.796024553866835e-05, + "loss": 0.105, + "step": 20345 + }, + { + "epoch": 2.4126645321949485, + "grad_norm": 0.6440824914983375, + "learning_rate": 1.795794234115548e-05, + "loss": 0.0795, + "step": 20346 + }, + { + "epoch": 2.4127831139570732, + "grad_norm": 0.7659315024717536, + "learning_rate": 1.7955639208561743e-05, + "loss": 0.1256, + "step": 20347 + }, + { + "epoch": 2.4129016957191984, + "grad_norm": 1.1032284105133852, + "learning_rate": 1.7953336140908378e-05, + "loss": 0.1564, + "step": 20348 + }, + { + "epoch": 2.413020277481323, + "grad_norm": 0.9906175694673709, + "learning_rate": 1.7951033138216615e-05, + "loss": 0.1369, + "step": 20349 + }, + { + "epoch": 2.4131388592434484, + "grad_norm": 0.4968447384434471, + "learning_rate": 1.7948730200507673e-05, + "loss": 0.0719, + "step": 20350 + }, + { + "epoch": 2.413257441005573, + "grad_norm": 0.6794086311751871, + "learning_rate": 1.79464273278028e-05, + "loss": 0.103, + "step": 20351 + }, + { + "epoch": 2.4133760227676984, + "grad_norm": 0.6852265704928223, + "learning_rate": 1.794412452012322e-05, + "loss": 0.0947, + "step": 20352 + }, + { + "epoch": 2.4134946045298236, + "grad_norm": 0.7242891244087203, + "learning_rate": 1.794182177749016e-05, + "loss": 0.1058, + "step": 20353 + }, + { + "epoch": 2.4136131862919483, + "grad_norm": 0.8661195022626802, + "learning_rate": 1.7939519099924834e-05, + "loss": 0.0897, + "step": 20354 + }, + { + "epoch": 2.413731768054073, + "grad_norm": 0.8544060886565612, + "learning_rate": 1.7937216487448493e-05, + "loss": 0.1093, + "step": 20355 + }, + { + "epoch": 2.4138503498161983, + "grad_norm": 0.763056819559556, + "learning_rate": 1.7934913940082355e-05, + "loss": 0.0823, + "step": 20356 + }, + { + "epoch": 2.4139689315783235, + "grad_norm": 0.5577270620659378, + "learning_rate": 1.793261145784765e-05, + "loss": 0.0645, + "step": 20357 + }, + { + "epoch": 2.4140875133404482, + "grad_norm": 0.8426332042798265, + "learning_rate": 1.7930309040765586e-05, + "loss": 0.0959, + "step": 20358 + }, + { + "epoch": 2.414206095102573, + "grad_norm": 0.9039182241560071, + "learning_rate": 1.792800668885742e-05, + "loss": 0.1041, + "step": 20359 + }, + { + "epoch": 2.414324676864698, + "grad_norm": 0.5275264617815396, + "learning_rate": 1.7925704402144356e-05, + "loss": 0.0767, + "step": 20360 + }, + { + "epoch": 2.4144432586268234, + "grad_norm": 0.4930812196812944, + "learning_rate": 1.7923402180647615e-05, + "loss": 0.0857, + "step": 20361 + }, + { + "epoch": 2.414561840388948, + "grad_norm": 0.6781076066669152, + "learning_rate": 1.7921100024388434e-05, + "loss": 0.0791, + "step": 20362 + }, + { + "epoch": 2.414680422151073, + "grad_norm": 0.6943628233344732, + "learning_rate": 1.7918797933388035e-05, + "loss": 0.1052, + "step": 20363 + }, + { + "epoch": 2.414799003913198, + "grad_norm": 0.7777562089045689, + "learning_rate": 1.7916495907667628e-05, + "loss": 0.104, + "step": 20364 + }, + { + "epoch": 2.4149175856753233, + "grad_norm": 0.9008085768548023, + "learning_rate": 1.7914193947248444e-05, + "loss": 0.1238, + "step": 20365 + }, + { + "epoch": 2.415036167437448, + "grad_norm": 0.6889801752137291, + "learning_rate": 1.7911892052151695e-05, + "loss": 0.108, + "step": 20366 + }, + { + "epoch": 2.4151547491995733, + "grad_norm": 0.6335280096932957, + "learning_rate": 1.7909590222398625e-05, + "loss": 0.0872, + "step": 20367 + }, + { + "epoch": 2.415273330961698, + "grad_norm": 0.6755504505486465, + "learning_rate": 1.790728845801043e-05, + "loss": 0.0895, + "step": 20368 + }, + { + "epoch": 2.4153919127238233, + "grad_norm": 0.589022565588989, + "learning_rate": 1.7904986759008335e-05, + "loss": 0.0826, + "step": 20369 + }, + { + "epoch": 2.415510494485948, + "grad_norm": 1.0294705222915697, + "learning_rate": 1.7902685125413565e-05, + "loss": 0.1218, + "step": 20370 + }, + { + "epoch": 2.415629076248073, + "grad_norm": 0.6921235899973825, + "learning_rate": 1.7900383557247342e-05, + "loss": 0.0929, + "step": 20371 + }, + { + "epoch": 2.415747658010198, + "grad_norm": 0.6715519429136312, + "learning_rate": 1.789808205453087e-05, + "loss": 0.0685, + "step": 20372 + }, + { + "epoch": 2.415866239772323, + "grad_norm": 0.7600351953488707, + "learning_rate": 1.7895780617285365e-05, + "loss": 0.1299, + "step": 20373 + }, + { + "epoch": 2.415984821534448, + "grad_norm": 0.7894240052347854, + "learning_rate": 1.7893479245532063e-05, + "loss": 0.1247, + "step": 20374 + }, + { + "epoch": 2.416103403296573, + "grad_norm": 1.0226253125109874, + "learning_rate": 1.7891177939292158e-05, + "loss": 0.1343, + "step": 20375 + }, + { + "epoch": 2.416221985058698, + "grad_norm": 0.710551923654119, + "learning_rate": 1.7888876698586878e-05, + "loss": 0.113, + "step": 20376 + }, + { + "epoch": 2.416340566820823, + "grad_norm": 0.6688259011651109, + "learning_rate": 1.7886575523437434e-05, + "loss": 0.0836, + "step": 20377 + }, + { + "epoch": 2.416459148582948, + "grad_norm": 0.42604752737069523, + "learning_rate": 1.7884274413865045e-05, + "loss": 0.056, + "step": 20378 + }, + { + "epoch": 2.416577730345073, + "grad_norm": 0.5550142514747911, + "learning_rate": 1.788197336989092e-05, + "loss": 0.0887, + "step": 20379 + }, + { + "epoch": 2.416696312107198, + "grad_norm": 0.4296732299265416, + "learning_rate": 1.7879672391536265e-05, + "loss": 0.0591, + "step": 20380 + }, + { + "epoch": 2.416814893869323, + "grad_norm": 0.7010178675539561, + "learning_rate": 1.7877371478822307e-05, + "loss": 0.0969, + "step": 20381 + }, + { + "epoch": 2.416933475631448, + "grad_norm": 0.7438365413252666, + "learning_rate": 1.787507063177025e-05, + "loss": 0.1063, + "step": 20382 + }, + { + "epoch": 2.417052057393573, + "grad_norm": 0.7171978055889504, + "learning_rate": 1.7872769850401304e-05, + "loss": 0.1006, + "step": 20383 + }, + { + "epoch": 2.4171706391556977, + "grad_norm": 0.6490827457707088, + "learning_rate": 1.7870469134736667e-05, + "loss": 0.0819, + "step": 20384 + }, + { + "epoch": 2.417289220917823, + "grad_norm": 0.6544473572133686, + "learning_rate": 1.7868168484797575e-05, + "loss": 0.0819, + "step": 20385 + }, + { + "epoch": 2.4174078026799477, + "grad_norm": 0.8495328525011101, + "learning_rate": 1.786586790060522e-05, + "loss": 0.1459, + "step": 20386 + }, + { + "epoch": 2.417526384442073, + "grad_norm": 0.7150343998791518, + "learning_rate": 1.7863567382180818e-05, + "loss": 0.0945, + "step": 20387 + }, + { + "epoch": 2.4176449662041977, + "grad_norm": 0.7351382920192172, + "learning_rate": 1.7861266929545562e-05, + "loss": 0.1095, + "step": 20388 + }, + { + "epoch": 2.417763547966323, + "grad_norm": 0.7400104880740804, + "learning_rate": 1.7858966542720684e-05, + "loss": 0.078, + "step": 20389 + }, + { + "epoch": 2.4178821297284476, + "grad_norm": 0.7961161399815287, + "learning_rate": 1.7856666221727376e-05, + "loss": 0.0862, + "step": 20390 + }, + { + "epoch": 2.418000711490573, + "grad_norm": 0.6925610593302615, + "learning_rate": 1.7854365966586834e-05, + "loss": 0.0818, + "step": 20391 + }, + { + "epoch": 2.4181192932526976, + "grad_norm": 0.650273287667905, + "learning_rate": 1.7852065777320288e-05, + "loss": 0.0999, + "step": 20392 + }, + { + "epoch": 2.418237875014823, + "grad_norm": 0.8336432049026081, + "learning_rate": 1.784976565394893e-05, + "loss": 0.1372, + "step": 20393 + }, + { + "epoch": 2.4183564567769475, + "grad_norm": 0.5410252484916681, + "learning_rate": 1.7847465596493957e-05, + "loss": 0.0783, + "step": 20394 + }, + { + "epoch": 2.4184750385390728, + "grad_norm": 0.9507257596046369, + "learning_rate": 1.7845165604976583e-05, + "loss": 0.1223, + "step": 20395 + }, + { + "epoch": 2.4185936203011975, + "grad_norm": 0.510126933971923, + "learning_rate": 1.7842865679418008e-05, + "loss": 0.0662, + "step": 20396 + }, + { + "epoch": 2.4187122020633227, + "grad_norm": 0.8270115937971692, + "learning_rate": 1.784056581983944e-05, + "loss": 0.1383, + "step": 20397 + }, + { + "epoch": 2.4188307838254475, + "grad_norm": 0.9823908801673128, + "learning_rate": 1.7838266026262072e-05, + "loss": 0.136, + "step": 20398 + }, + { + "epoch": 2.4189493655875727, + "grad_norm": 0.6114626084487321, + "learning_rate": 1.78359662987071e-05, + "loss": 0.0786, + "step": 20399 + }, + { + "epoch": 2.4190679473496974, + "grad_norm": 0.5774529712456424, + "learning_rate": 1.7833666637195746e-05, + "loss": 0.075, + "step": 20400 + }, + { + "epoch": 2.4191865291118226, + "grad_norm": 0.701019160058869, + "learning_rate": 1.7831367041749197e-05, + "loss": 0.1063, + "step": 20401 + }, + { + "epoch": 2.4193051108739474, + "grad_norm": 0.81391867151255, + "learning_rate": 1.7829067512388653e-05, + "loss": 0.104, + "step": 20402 + }, + { + "epoch": 2.4194236926360726, + "grad_norm": 0.63205301895864, + "learning_rate": 1.78267680491353e-05, + "loss": 0.0773, + "step": 20403 + }, + { + "epoch": 2.4195422743981974, + "grad_norm": 0.6703247130382289, + "learning_rate": 1.7824468652010365e-05, + "loss": 0.0861, + "step": 20404 + }, + { + "epoch": 2.4196608561603226, + "grad_norm": 0.959164178360623, + "learning_rate": 1.782216932103502e-05, + "loss": 0.1269, + "step": 20405 + }, + { + "epoch": 2.4197794379224478, + "grad_norm": 0.6592435798864569, + "learning_rate": 1.7819870056230475e-05, + "loss": 0.0994, + "step": 20406 + }, + { + "epoch": 2.4198980196845725, + "grad_norm": 0.7560067435760086, + "learning_rate": 1.781757085761792e-05, + "loss": 0.0895, + "step": 20407 + }, + { + "epoch": 2.4200166014466973, + "grad_norm": 0.536022058205427, + "learning_rate": 1.7815271725218563e-05, + "loss": 0.0671, + "step": 20408 + }, + { + "epoch": 2.4201351832088225, + "grad_norm": 0.3719572601311168, + "learning_rate": 1.7812972659053585e-05, + "loss": 0.0566, + "step": 20409 + }, + { + "epoch": 2.4202537649709477, + "grad_norm": 0.4961670031828941, + "learning_rate": 1.7810673659144182e-05, + "loss": 0.0639, + "step": 20410 + }, + { + "epoch": 2.4203723467330724, + "grad_norm": 0.5275788035966217, + "learning_rate": 1.7808374725511556e-05, + "loss": 0.0691, + "step": 20411 + }, + { + "epoch": 2.420490928495197, + "grad_norm": 0.9006486932335451, + "learning_rate": 1.7806075858176903e-05, + "loss": 0.1316, + "step": 20412 + }, + { + "epoch": 2.4206095102573224, + "grad_norm": 0.7974682847932089, + "learning_rate": 1.7803777057161396e-05, + "loss": 0.1226, + "step": 20413 + }, + { + "epoch": 2.4207280920194476, + "grad_norm": 0.5429000692507847, + "learning_rate": 1.780147832248624e-05, + "loss": 0.0699, + "step": 20414 + }, + { + "epoch": 2.4208466737815724, + "grad_norm": 0.837050318534204, + "learning_rate": 1.7799179654172632e-05, + "loss": 0.1157, + "step": 20415 + }, + { + "epoch": 2.4209652555436976, + "grad_norm": 0.869746107241255, + "learning_rate": 1.779688105224176e-05, + "loss": 0.1337, + "step": 20416 + }, + { + "epoch": 2.4210838373058223, + "grad_norm": 0.9459285242669578, + "learning_rate": 1.7794582516714812e-05, + "loss": 0.1177, + "step": 20417 + }, + { + "epoch": 2.4212024190679475, + "grad_norm": 1.2760387717024477, + "learning_rate": 1.7792284047612967e-05, + "loss": 0.1526, + "step": 20418 + }, + { + "epoch": 2.4213210008300723, + "grad_norm": 0.6554226744419813, + "learning_rate": 1.7789985644957435e-05, + "loss": 0.081, + "step": 20419 + }, + { + "epoch": 2.4214395825921975, + "grad_norm": 0.6248936978811822, + "learning_rate": 1.7787687308769395e-05, + "loss": 0.1108, + "step": 20420 + }, + { + "epoch": 2.4215581643543223, + "grad_norm": 0.6061225149734762, + "learning_rate": 1.7785389039070022e-05, + "loss": 0.0865, + "step": 20421 + }, + { + "epoch": 2.4216767461164475, + "grad_norm": 0.5961794824075508, + "learning_rate": 1.7783090835880524e-05, + "loss": 0.0655, + "step": 20422 + }, + { + "epoch": 2.421795327878572, + "grad_norm": 0.5790243447593155, + "learning_rate": 1.778079269922208e-05, + "loss": 0.0857, + "step": 20423 + }, + { + "epoch": 2.4219139096406974, + "grad_norm": 0.7490582063957336, + "learning_rate": 1.777849462911587e-05, + "loss": 0.1062, + "step": 20424 + }, + { + "epoch": 2.422032491402822, + "grad_norm": 0.7254507227861589, + "learning_rate": 1.777619662558308e-05, + "loss": 0.1081, + "step": 20425 + }, + { + "epoch": 2.4221510731649474, + "grad_norm": 0.716258434765107, + "learning_rate": 1.7773898688644902e-05, + "loss": 0.0937, + "step": 20426 + }, + { + "epoch": 2.422269654927072, + "grad_norm": 1.0323222081850865, + "learning_rate": 1.7771600818322523e-05, + "loss": 0.1547, + "step": 20427 + }, + { + "epoch": 2.4223882366891973, + "grad_norm": 1.001040233467527, + "learning_rate": 1.7769303014637117e-05, + "loss": 0.1148, + "step": 20428 + }, + { + "epoch": 2.422506818451322, + "grad_norm": 0.7394296218509101, + "learning_rate": 1.7767005277609863e-05, + "loss": 0.0876, + "step": 20429 + }, + { + "epoch": 2.4226254002134473, + "grad_norm": 0.7419737569404186, + "learning_rate": 1.776470760726196e-05, + "loss": 0.0987, + "step": 20430 + }, + { + "epoch": 2.422743981975572, + "grad_norm": 0.8988042889422243, + "learning_rate": 1.7762410003614583e-05, + "loss": 0.1217, + "step": 20431 + }, + { + "epoch": 2.4228625637376973, + "grad_norm": 0.6520278300568709, + "learning_rate": 1.7760112466688905e-05, + "loss": 0.0924, + "step": 20432 + }, + { + "epoch": 2.422981145499822, + "grad_norm": 0.5610814896983861, + "learning_rate": 1.7757814996506107e-05, + "loss": 0.0676, + "step": 20433 + }, + { + "epoch": 2.4230997272619472, + "grad_norm": 0.7734763896543018, + "learning_rate": 1.775551759308738e-05, + "loss": 0.122, + "step": 20434 + }, + { + "epoch": 2.423218309024072, + "grad_norm": 0.6767679885070514, + "learning_rate": 1.7753220256453895e-05, + "loss": 0.0949, + "step": 20435 + }, + { + "epoch": 2.423336890786197, + "grad_norm": 0.5738823060329283, + "learning_rate": 1.7750922986626826e-05, + "loss": 0.0798, + "step": 20436 + }, + { + "epoch": 2.423455472548322, + "grad_norm": 0.7380471497712895, + "learning_rate": 1.7748625783627364e-05, + "loss": 0.0846, + "step": 20437 + }, + { + "epoch": 2.423574054310447, + "grad_norm": 0.46494998978635366, + "learning_rate": 1.7746328647476684e-05, + "loss": 0.0575, + "step": 20438 + }, + { + "epoch": 2.423692636072572, + "grad_norm": 1.014720570954471, + "learning_rate": 1.7744031578195962e-05, + "loss": 0.1361, + "step": 20439 + }, + { + "epoch": 2.423811217834697, + "grad_norm": 0.5878083072812218, + "learning_rate": 1.7741734575806356e-05, + "loss": 0.0823, + "step": 20440 + }, + { + "epoch": 2.423929799596822, + "grad_norm": 0.7053203856125371, + "learning_rate": 1.7739437640329067e-05, + "loss": 0.0882, + "step": 20441 + }, + { + "epoch": 2.424048381358947, + "grad_norm": 0.5040214809223516, + "learning_rate": 1.773714077178526e-05, + "loss": 0.0653, + "step": 20442 + }, + { + "epoch": 2.424166963121072, + "grad_norm": 0.5864368943995412, + "learning_rate": 1.7734843970196108e-05, + "loss": 0.0846, + "step": 20443 + }, + { + "epoch": 2.424285544883197, + "grad_norm": 0.5665577715240618, + "learning_rate": 1.7732547235582782e-05, + "loss": 0.0675, + "step": 20444 + }, + { + "epoch": 2.424404126645322, + "grad_norm": 0.9231916136463244, + "learning_rate": 1.7730250567966456e-05, + "loss": 0.1022, + "step": 20445 + }, + { + "epoch": 2.424522708407447, + "grad_norm": 0.6333689132726998, + "learning_rate": 1.7727953967368314e-05, + "loss": 0.0825, + "step": 20446 + }, + { + "epoch": 2.4246412901695718, + "grad_norm": 0.8066715878000948, + "learning_rate": 1.7725657433809507e-05, + "loss": 0.1134, + "step": 20447 + }, + { + "epoch": 2.424759871931697, + "grad_norm": 0.9953450639603674, + "learning_rate": 1.7723360967311227e-05, + "loss": 0.15, + "step": 20448 + }, + { + "epoch": 2.4248784536938217, + "grad_norm": 0.6018688472092898, + "learning_rate": 1.772106456789464e-05, + "loss": 0.0949, + "step": 20449 + }, + { + "epoch": 2.424997035455947, + "grad_norm": 0.5345112968172905, + "learning_rate": 1.7718768235580906e-05, + "loss": 0.0672, + "step": 20450 + }, + { + "epoch": 2.4251156172180717, + "grad_norm": 0.8309287743637392, + "learning_rate": 1.771647197039119e-05, + "loss": 0.1042, + "step": 20451 + }, + { + "epoch": 2.425234198980197, + "grad_norm": 0.8281976748814304, + "learning_rate": 1.7714175772346687e-05, + "loss": 0.1002, + "step": 20452 + }, + { + "epoch": 2.4253527807423216, + "grad_norm": 0.8822818981024819, + "learning_rate": 1.7711879641468546e-05, + "loss": 0.1465, + "step": 20453 + }, + { + "epoch": 2.425471362504447, + "grad_norm": 0.5486536342817294, + "learning_rate": 1.7709583577777932e-05, + "loss": 0.0728, + "step": 20454 + }, + { + "epoch": 2.425589944266572, + "grad_norm": 0.6644891326080791, + "learning_rate": 1.7707287581296018e-05, + "loss": 0.0889, + "step": 20455 + }, + { + "epoch": 2.425708526028697, + "grad_norm": 0.8015843491333265, + "learning_rate": 1.7704991652043967e-05, + "loss": 0.0921, + "step": 20456 + }, + { + "epoch": 2.4258271077908216, + "grad_norm": 1.0718291054538691, + "learning_rate": 1.7702695790042957e-05, + "loss": 0.1599, + "step": 20457 + }, + { + "epoch": 2.4259456895529468, + "grad_norm": 0.5103521097160069, + "learning_rate": 1.7700399995314137e-05, + "loss": 0.071, + "step": 20458 + }, + { + "epoch": 2.426064271315072, + "grad_norm": 0.6056492201556413, + "learning_rate": 1.769810426787867e-05, + "loss": 0.0839, + "step": 20459 + }, + { + "epoch": 2.4261828530771967, + "grad_norm": 0.6689255210713124, + "learning_rate": 1.769580860775774e-05, + "loss": 0.0749, + "step": 20460 + }, + { + "epoch": 2.4263014348393215, + "grad_norm": 0.7484535945669374, + "learning_rate": 1.7693513014972495e-05, + "loss": 0.116, + "step": 20461 + }, + { + "epoch": 2.4264200166014467, + "grad_norm": 0.802917210262137, + "learning_rate": 1.7691217489544092e-05, + "loss": 0.1292, + "step": 20462 + }, + { + "epoch": 2.426538598363572, + "grad_norm": 0.6574222088049687, + "learning_rate": 1.768892203149371e-05, + "loss": 0.0821, + "step": 20463 + }, + { + "epoch": 2.4266571801256966, + "grad_norm": 0.42480374180660097, + "learning_rate": 1.76866266408425e-05, + "loss": 0.0601, + "step": 20464 + }, + { + "epoch": 2.426775761887822, + "grad_norm": 0.8407009588273316, + "learning_rate": 1.7684331317611623e-05, + "loss": 0.0972, + "step": 20465 + }, + { + "epoch": 2.4268943436499466, + "grad_norm": 0.4296580665971884, + "learning_rate": 1.7682036061822237e-05, + "loss": 0.0708, + "step": 20466 + }, + { + "epoch": 2.427012925412072, + "grad_norm": 0.6027649520813165, + "learning_rate": 1.7679740873495506e-05, + "loss": 0.0793, + "step": 20467 + }, + { + "epoch": 2.4271315071741966, + "grad_norm": 0.7801262814863572, + "learning_rate": 1.7677445752652593e-05, + "loss": 0.0975, + "step": 20468 + }, + { + "epoch": 2.4272500889363218, + "grad_norm": 0.8135428835759627, + "learning_rate": 1.767515069931465e-05, + "loss": 0.1385, + "step": 20469 + }, + { + "epoch": 2.4273686706984465, + "grad_norm": 0.7244333243267554, + "learning_rate": 1.7672855713502823e-05, + "loss": 0.0979, + "step": 20470 + }, + { + "epoch": 2.4274872524605717, + "grad_norm": 0.8135420065650913, + "learning_rate": 1.7670560795238296e-05, + "loss": 0.0737, + "step": 20471 + }, + { + "epoch": 2.4276058342226965, + "grad_norm": 0.5129902178920598, + "learning_rate": 1.7668265944542207e-05, + "loss": 0.0624, + "step": 20472 + }, + { + "epoch": 2.4277244159848217, + "grad_norm": 0.8828272208090617, + "learning_rate": 1.7665971161435712e-05, + "loss": 0.1125, + "step": 20473 + }, + { + "epoch": 2.4278429977469465, + "grad_norm": 0.5847147172305629, + "learning_rate": 1.7663676445939966e-05, + "loss": 0.0834, + "step": 20474 + }, + { + "epoch": 2.4279615795090717, + "grad_norm": 0.6890617088399857, + "learning_rate": 1.7661381798076128e-05, + "loss": 0.0738, + "step": 20475 + }, + { + "epoch": 2.4280801612711964, + "grad_norm": 0.5123387749273092, + "learning_rate": 1.7659087217865357e-05, + "loss": 0.0614, + "step": 20476 + }, + { + "epoch": 2.4281987430333216, + "grad_norm": 0.6846570578830027, + "learning_rate": 1.765679270532879e-05, + "loss": 0.0995, + "step": 20477 + }, + { + "epoch": 2.4283173247954464, + "grad_norm": 0.5804695092191787, + "learning_rate": 1.76544982604876e-05, + "loss": 0.0859, + "step": 20478 + }, + { + "epoch": 2.4284359065575716, + "grad_norm": 0.4520663363328892, + "learning_rate": 1.7652203883362927e-05, + "loss": 0.0708, + "step": 20479 + }, + { + "epoch": 2.4285544883196963, + "grad_norm": 0.9911754141011777, + "learning_rate": 1.7649909573975925e-05, + "loss": 0.1221, + "step": 20480 + }, + { + "epoch": 2.4286730700818215, + "grad_norm": 0.5980720432339427, + "learning_rate": 1.7647615332347734e-05, + "loss": 0.091, + "step": 20481 + }, + { + "epoch": 2.4287916518439463, + "grad_norm": 0.61412097770083, + "learning_rate": 1.764532115849952e-05, + "loss": 0.0723, + "step": 20482 + }, + { + "epoch": 2.4289102336060715, + "grad_norm": 0.9661384126435733, + "learning_rate": 1.7643027052452428e-05, + "loss": 0.127, + "step": 20483 + }, + { + "epoch": 2.4290288153681963, + "grad_norm": 0.909030466057467, + "learning_rate": 1.76407330142276e-05, + "loss": 0.1314, + "step": 20484 + }, + { + "epoch": 2.4291473971303215, + "grad_norm": 0.7729020310182478, + "learning_rate": 1.763843904384619e-05, + "loss": 0.1083, + "step": 20485 + }, + { + "epoch": 2.429265978892446, + "grad_norm": 0.5057066437562823, + "learning_rate": 1.763614514132934e-05, + "loss": 0.0889, + "step": 20486 + }, + { + "epoch": 2.4293845606545714, + "grad_norm": 0.8305859150582953, + "learning_rate": 1.7633851306698213e-05, + "loss": 0.0848, + "step": 20487 + }, + { + "epoch": 2.429503142416696, + "grad_norm": 0.6251824861875199, + "learning_rate": 1.763155753997394e-05, + "loss": 0.0789, + "step": 20488 + }, + { + "epoch": 2.4296217241788214, + "grad_norm": 0.7973353018506113, + "learning_rate": 1.7629263841177662e-05, + "loss": 0.0783, + "step": 20489 + }, + { + "epoch": 2.429740305940946, + "grad_norm": 0.8737040074358955, + "learning_rate": 1.7626970210330546e-05, + "loss": 0.1137, + "step": 20490 + }, + { + "epoch": 2.4298588877030713, + "grad_norm": 0.848734323375219, + "learning_rate": 1.7624676647453718e-05, + "loss": 0.113, + "step": 20491 + }, + { + "epoch": 2.429977469465196, + "grad_norm": 0.8180492089311855, + "learning_rate": 1.762238315256832e-05, + "loss": 0.1131, + "step": 20492 + }, + { + "epoch": 2.4300960512273213, + "grad_norm": 0.6015019708668995, + "learning_rate": 1.762008972569551e-05, + "loss": 0.0621, + "step": 20493 + }, + { + "epoch": 2.430214632989446, + "grad_norm": 0.9486168885582937, + "learning_rate": 1.7617796366856422e-05, + "loss": 0.1049, + "step": 20494 + }, + { + "epoch": 2.4303332147515713, + "grad_norm": 0.6805806427266702, + "learning_rate": 1.7615503076072198e-05, + "loss": 0.1174, + "step": 20495 + }, + { + "epoch": 2.430451796513696, + "grad_norm": 0.6817426364293354, + "learning_rate": 1.7613209853363974e-05, + "loss": 0.0755, + "step": 20496 + }, + { + "epoch": 2.4305703782758212, + "grad_norm": 0.576965789304253, + "learning_rate": 1.7610916698752906e-05, + "loss": 0.098, + "step": 20497 + }, + { + "epoch": 2.430688960037946, + "grad_norm": 1.0476580197457392, + "learning_rate": 1.7608623612260126e-05, + "loss": 0.1469, + "step": 20498 + }, + { + "epoch": 2.430807541800071, + "grad_norm": 0.750791201144994, + "learning_rate": 1.7606330593906773e-05, + "loss": 0.0788, + "step": 20499 + }, + { + "epoch": 2.430926123562196, + "grad_norm": 0.7888014268210428, + "learning_rate": 1.760403764371397e-05, + "loss": 0.0959, + "step": 20500 + }, + { + "epoch": 2.431044705324321, + "grad_norm": 0.5323138821912876, + "learning_rate": 1.7601744761702887e-05, + "loss": 0.0921, + "step": 20501 + }, + { + "epoch": 2.431163287086446, + "grad_norm": 0.6290232211007444, + "learning_rate": 1.759945194789464e-05, + "loss": 0.0774, + "step": 20502 + }, + { + "epoch": 2.431281868848571, + "grad_norm": 0.5787941775963371, + "learning_rate": 1.759715920231037e-05, + "loss": 0.0794, + "step": 20503 + }, + { + "epoch": 2.4314004506106963, + "grad_norm": 0.7025650212228934, + "learning_rate": 1.7594866524971214e-05, + "loss": 0.0894, + "step": 20504 + }, + { + "epoch": 2.431519032372821, + "grad_norm": 0.6856215901444034, + "learning_rate": 1.7592573915898302e-05, + "loss": 0.0939, + "step": 20505 + }, + { + "epoch": 2.431637614134946, + "grad_norm": 0.741660921528089, + "learning_rate": 1.7590281375112787e-05, + "loss": 0.1008, + "step": 20506 + }, + { + "epoch": 2.431756195897071, + "grad_norm": 0.4849562069104137, + "learning_rate": 1.7587988902635777e-05, + "loss": 0.0723, + "step": 20507 + }, + { + "epoch": 2.4318747776591962, + "grad_norm": 0.4608767050801377, + "learning_rate": 1.758569649848843e-05, + "loss": 0.0609, + "step": 20508 + }, + { + "epoch": 2.431993359421321, + "grad_norm": 0.7666960686737548, + "learning_rate": 1.7583404162691874e-05, + "loss": 0.1115, + "step": 20509 + }, + { + "epoch": 2.4321119411834458, + "grad_norm": 0.6653772629164727, + "learning_rate": 1.7581111895267232e-05, + "loss": 0.0944, + "step": 20510 + }, + { + "epoch": 2.432230522945571, + "grad_norm": 0.5733993081499409, + "learning_rate": 1.757881969623563e-05, + "loss": 0.086, + "step": 20511 + }, + { + "epoch": 2.432349104707696, + "grad_norm": 0.6918144156262968, + "learning_rate": 1.757652756561822e-05, + "loss": 0.1013, + "step": 20512 + }, + { + "epoch": 2.432467686469821, + "grad_norm": 0.7400317373177974, + "learning_rate": 1.757423550343613e-05, + "loss": 0.0991, + "step": 20513 + }, + { + "epoch": 2.4325862682319457, + "grad_norm": 0.7823815261385417, + "learning_rate": 1.757194350971047e-05, + "loss": 0.1197, + "step": 20514 + }, + { + "epoch": 2.432704849994071, + "grad_norm": 0.5187802642625817, + "learning_rate": 1.7569651584462385e-05, + "loss": 0.0696, + "step": 20515 + }, + { + "epoch": 2.432823431756196, + "grad_norm": 0.656669837566061, + "learning_rate": 1.7567359727713e-05, + "loss": 0.1069, + "step": 20516 + }, + { + "epoch": 2.432942013518321, + "grad_norm": 0.688817851343126, + "learning_rate": 1.7565067939483448e-05, + "loss": 0.0875, + "step": 20517 + }, + { + "epoch": 2.433060595280446, + "grad_norm": 0.8741790709733156, + "learning_rate": 1.7562776219794853e-05, + "loss": 0.1056, + "step": 20518 + }, + { + "epoch": 2.433179177042571, + "grad_norm": 0.7489178695141648, + "learning_rate": 1.7560484568668334e-05, + "loss": 0.1067, + "step": 20519 + }, + { + "epoch": 2.433297758804696, + "grad_norm": 0.5786204640239, + "learning_rate": 1.755819298612503e-05, + "loss": 0.0617, + "step": 20520 + }, + { + "epoch": 2.4334163405668208, + "grad_norm": 0.49346179493446485, + "learning_rate": 1.7555901472186064e-05, + "loss": 0.0649, + "step": 20521 + }, + { + "epoch": 2.433534922328946, + "grad_norm": 0.7167050619812333, + "learning_rate": 1.7553610026872545e-05, + "loss": 0.0904, + "step": 20522 + }, + { + "epoch": 2.4336535040910707, + "grad_norm": 1.0807429972725695, + "learning_rate": 1.7551318650205623e-05, + "loss": 0.1276, + "step": 20523 + }, + { + "epoch": 2.433772085853196, + "grad_norm": 1.0062568187412955, + "learning_rate": 1.7549027342206398e-05, + "loss": 0.1017, + "step": 20524 + }, + { + "epoch": 2.4338906676153207, + "grad_norm": 0.822369275188511, + "learning_rate": 1.7546736102896015e-05, + "loss": 0.0751, + "step": 20525 + }, + { + "epoch": 2.434009249377446, + "grad_norm": 0.7792357420657303, + "learning_rate": 1.7544444932295573e-05, + "loss": 0.0866, + "step": 20526 + }, + { + "epoch": 2.4341278311395707, + "grad_norm": 0.6984320947151539, + "learning_rate": 1.7542153830426217e-05, + "loss": 0.0876, + "step": 20527 + }, + { + "epoch": 2.434246412901696, + "grad_norm": 0.8503929164411931, + "learning_rate": 1.7539862797309058e-05, + "loss": 0.1072, + "step": 20528 + }, + { + "epoch": 2.4343649946638206, + "grad_norm": 0.6294120927796336, + "learning_rate": 1.7537571832965218e-05, + "loss": 0.0661, + "step": 20529 + }, + { + "epoch": 2.434483576425946, + "grad_norm": 0.9407869390453607, + "learning_rate": 1.7535280937415798e-05, + "loss": 0.1302, + "step": 20530 + }, + { + "epoch": 2.4346021581880706, + "grad_norm": 0.5431809596140349, + "learning_rate": 1.7532990110681947e-05, + "loss": 0.074, + "step": 20531 + }, + { + "epoch": 2.434720739950196, + "grad_norm": 0.8229096006936102, + "learning_rate": 1.7530699352784773e-05, + "loss": 0.1034, + "step": 20532 + }, + { + "epoch": 2.4348393217123205, + "grad_norm": 1.1503698958257544, + "learning_rate": 1.7528408663745377e-05, + "loss": 0.1546, + "step": 20533 + }, + { + "epoch": 2.4349579034744457, + "grad_norm": 0.8134469088032726, + "learning_rate": 1.7526118043584904e-05, + "loss": 0.1209, + "step": 20534 + }, + { + "epoch": 2.4350764852365705, + "grad_norm": 0.9644072780464764, + "learning_rate": 1.752382749232445e-05, + "loss": 0.1571, + "step": 20535 + }, + { + "epoch": 2.4351950669986957, + "grad_norm": 0.9621637097537685, + "learning_rate": 1.7521537009985146e-05, + "loss": 0.116, + "step": 20536 + }, + { + "epoch": 2.4353136487608205, + "grad_norm": 0.4411337036976172, + "learning_rate": 1.751924659658809e-05, + "loss": 0.0675, + "step": 20537 + }, + { + "epoch": 2.4354322305229457, + "grad_norm": 0.60748575790993, + "learning_rate": 1.7516956252154415e-05, + "loss": 0.0871, + "step": 20538 + }, + { + "epoch": 2.4355508122850704, + "grad_norm": 0.3796155094161408, + "learning_rate": 1.751466597670523e-05, + "loss": 0.0529, + "step": 20539 + }, + { + "epoch": 2.4356693940471956, + "grad_norm": 0.8656278199888006, + "learning_rate": 1.7512375770261645e-05, + "loss": 0.1186, + "step": 20540 + }, + { + "epoch": 2.4357879758093204, + "grad_norm": 0.6476835203426963, + "learning_rate": 1.7510085632844762e-05, + "loss": 0.0884, + "step": 20541 + }, + { + "epoch": 2.4359065575714456, + "grad_norm": 0.6457363257656736, + "learning_rate": 1.7507795564475714e-05, + "loss": 0.0696, + "step": 20542 + }, + { + "epoch": 2.4360251393335703, + "grad_norm": 0.7323743649166528, + "learning_rate": 1.75055055651756e-05, + "loss": 0.1081, + "step": 20543 + }, + { + "epoch": 2.4361437210956955, + "grad_norm": 0.733474936875559, + "learning_rate": 1.750321563496554e-05, + "loss": 0.0808, + "step": 20544 + }, + { + "epoch": 2.4362623028578203, + "grad_norm": 0.5113663111178807, + "learning_rate": 1.7500925773866626e-05, + "loss": 0.0652, + "step": 20545 + }, + { + "epoch": 2.4363808846199455, + "grad_norm": 0.6876968259736314, + "learning_rate": 1.749863598189999e-05, + "loss": 0.0966, + "step": 20546 + }, + { + "epoch": 2.4364994663820703, + "grad_norm": 0.6425109730468664, + "learning_rate": 1.7496346259086734e-05, + "loss": 0.0814, + "step": 20547 + }, + { + "epoch": 2.4366180481441955, + "grad_norm": 0.692502053040945, + "learning_rate": 1.7494056605447955e-05, + "loss": 0.1036, + "step": 20548 + }, + { + "epoch": 2.4367366299063202, + "grad_norm": 0.7954179455385406, + "learning_rate": 1.7491767021004778e-05, + "loss": 0.1196, + "step": 20549 + }, + { + "epoch": 2.4368552116684454, + "grad_norm": 0.7454411856236889, + "learning_rate": 1.74894775057783e-05, + "loss": 0.0917, + "step": 20550 + }, + { + "epoch": 2.43697379343057, + "grad_norm": 0.574152852879181, + "learning_rate": 1.7487188059789635e-05, + "loss": 0.0765, + "step": 20551 + }, + { + "epoch": 2.4370923751926954, + "grad_norm": 0.6320028906352021, + "learning_rate": 1.7484898683059868e-05, + "loss": 0.0946, + "step": 20552 + }, + { + "epoch": 2.4372109569548206, + "grad_norm": 0.8191241413469318, + "learning_rate": 1.7482609375610132e-05, + "loss": 0.1379, + "step": 20553 + }, + { + "epoch": 2.4373295387169454, + "grad_norm": 0.9545684611458045, + "learning_rate": 1.7480320137461508e-05, + "loss": 0.1284, + "step": 20554 + }, + { + "epoch": 2.43744812047907, + "grad_norm": 0.7373664593727889, + "learning_rate": 1.747803096863512e-05, + "loss": 0.0897, + "step": 20555 + }, + { + "epoch": 2.4375667022411953, + "grad_norm": 0.6350567357986692, + "learning_rate": 1.7475741869152056e-05, + "loss": 0.0776, + "step": 20556 + }, + { + "epoch": 2.4376852840033205, + "grad_norm": 0.694073078652184, + "learning_rate": 1.7473452839033433e-05, + "loss": 0.0864, + "step": 20557 + }, + { + "epoch": 2.4378038657654453, + "grad_norm": 0.6783527151499834, + "learning_rate": 1.7471163878300344e-05, + "loss": 0.0917, + "step": 20558 + }, + { + "epoch": 2.43792244752757, + "grad_norm": 0.724953337734948, + "learning_rate": 1.7468874986973893e-05, + "loss": 0.0878, + "step": 20559 + }, + { + "epoch": 2.4380410292896952, + "grad_norm": 0.9561467363894501, + "learning_rate": 1.7466586165075173e-05, + "loss": 0.1182, + "step": 20560 + }, + { + "epoch": 2.4381596110518204, + "grad_norm": 0.8194012346468562, + "learning_rate": 1.7464297412625293e-05, + "loss": 0.1325, + "step": 20561 + }, + { + "epoch": 2.438278192813945, + "grad_norm": 0.5521671270851761, + "learning_rate": 1.7462008729645353e-05, + "loss": 0.0687, + "step": 20562 + }, + { + "epoch": 2.43839677457607, + "grad_norm": 0.965076548062945, + "learning_rate": 1.7459720116156443e-05, + "loss": 0.1453, + "step": 20563 + }, + { + "epoch": 2.438515356338195, + "grad_norm": 0.6882994900578039, + "learning_rate": 1.745743157217967e-05, + "loss": 0.1061, + "step": 20564 + }, + { + "epoch": 2.4386339381003204, + "grad_norm": 0.6510997544278733, + "learning_rate": 1.7455143097736126e-05, + "loss": 0.0907, + "step": 20565 + }, + { + "epoch": 2.438752519862445, + "grad_norm": 0.9685684366849682, + "learning_rate": 1.7452854692846918e-05, + "loss": 0.1175, + "step": 20566 + }, + { + "epoch": 2.4388711016245703, + "grad_norm": 0.7505913100916941, + "learning_rate": 1.745056635753312e-05, + "loss": 0.1029, + "step": 20567 + }, + { + "epoch": 2.438989683386695, + "grad_norm": 0.7138473802661519, + "learning_rate": 1.7448278091815858e-05, + "loss": 0.1007, + "step": 20568 + }, + { + "epoch": 2.4391082651488203, + "grad_norm": 0.8301486893118868, + "learning_rate": 1.744598989571621e-05, + "loss": 0.141, + "step": 20569 + }, + { + "epoch": 2.439226846910945, + "grad_norm": 0.9714382462273952, + "learning_rate": 1.744370176925527e-05, + "loss": 0.1164, + "step": 20570 + }, + { + "epoch": 2.4393454286730702, + "grad_norm": 0.9530837484299615, + "learning_rate": 1.7441413712454124e-05, + "loss": 0.0903, + "step": 20571 + }, + { + "epoch": 2.439464010435195, + "grad_norm": 0.4806813767034375, + "learning_rate": 1.7439125725333885e-05, + "loss": 0.0778, + "step": 20572 + }, + { + "epoch": 2.43958259219732, + "grad_norm": 0.4636226355894945, + "learning_rate": 1.7436837807915627e-05, + "loss": 0.0748, + "step": 20573 + }, + { + "epoch": 2.439701173959445, + "grad_norm": 0.7838810637228693, + "learning_rate": 1.743454996022046e-05, + "loss": 0.1138, + "step": 20574 + }, + { + "epoch": 2.43981975572157, + "grad_norm": 0.5349520643310081, + "learning_rate": 1.7432262182269454e-05, + "loss": 0.0806, + "step": 20575 + }, + { + "epoch": 2.439938337483695, + "grad_norm": 0.6204787594560555, + "learning_rate": 1.7429974474083717e-05, + "loss": 0.0944, + "step": 20576 + }, + { + "epoch": 2.44005691924582, + "grad_norm": 0.4864649398804172, + "learning_rate": 1.7427686835684336e-05, + "loss": 0.0744, + "step": 20577 + }, + { + "epoch": 2.440175501007945, + "grad_norm": 0.8285263800624789, + "learning_rate": 1.7425399267092383e-05, + "loss": 0.1042, + "step": 20578 + }, + { + "epoch": 2.44029408277007, + "grad_norm": 0.45467112433212414, + "learning_rate": 1.7423111768328976e-05, + "loss": 0.0535, + "step": 20579 + }, + { + "epoch": 2.440412664532195, + "grad_norm": 0.5045304192691862, + "learning_rate": 1.7420824339415182e-05, + "loss": 0.063, + "step": 20580 + }, + { + "epoch": 2.44053124629432, + "grad_norm": 0.6872786500252062, + "learning_rate": 1.7418536980372096e-05, + "loss": 0.0994, + "step": 20581 + }, + { + "epoch": 2.440649828056445, + "grad_norm": 0.756411679807771, + "learning_rate": 1.7416249691220795e-05, + "loss": 0.0913, + "step": 20582 + }, + { + "epoch": 2.44076840981857, + "grad_norm": 1.0215751956453878, + "learning_rate": 1.7413962471982377e-05, + "loss": 0.1168, + "step": 20583 + }, + { + "epoch": 2.4408869915806948, + "grad_norm": 1.09017099306803, + "learning_rate": 1.7411675322677916e-05, + "loss": 0.1204, + "step": 20584 + }, + { + "epoch": 2.44100557334282, + "grad_norm": 0.9983386427865891, + "learning_rate": 1.740938824332851e-05, + "loss": 0.1346, + "step": 20585 + }, + { + "epoch": 2.4411241551049447, + "grad_norm": 0.7741768644450451, + "learning_rate": 1.7407101233955224e-05, + "loss": 0.0913, + "step": 20586 + }, + { + "epoch": 2.44124273686707, + "grad_norm": 0.8552167980676849, + "learning_rate": 1.7404814294579168e-05, + "loss": 0.1068, + "step": 20587 + }, + { + "epoch": 2.4413613186291947, + "grad_norm": 0.7288657026876763, + "learning_rate": 1.740252742522141e-05, + "loss": 0.0799, + "step": 20588 + }, + { + "epoch": 2.44147990039132, + "grad_norm": 0.5147913576744787, + "learning_rate": 1.740024062590303e-05, + "loss": 0.0765, + "step": 20589 + }, + { + "epoch": 2.4415984821534447, + "grad_norm": 0.8236188261629301, + "learning_rate": 1.7397953896645103e-05, + "loss": 0.1191, + "step": 20590 + }, + { + "epoch": 2.44171706391557, + "grad_norm": 0.7285016085895992, + "learning_rate": 1.7395667237468733e-05, + "loss": 0.1045, + "step": 20591 + }, + { + "epoch": 2.4418356456776946, + "grad_norm": 0.8460424537233397, + "learning_rate": 1.739338064839498e-05, + "loss": 0.1027, + "step": 20592 + }, + { + "epoch": 2.44195422743982, + "grad_norm": 0.6394736315218411, + "learning_rate": 1.739109412944492e-05, + "loss": 0.0882, + "step": 20593 + }, + { + "epoch": 2.4420728092019446, + "grad_norm": 0.6872383614108158, + "learning_rate": 1.7388807680639655e-05, + "loss": 0.0952, + "step": 20594 + }, + { + "epoch": 2.44219139096407, + "grad_norm": 0.7518910190054401, + "learning_rate": 1.738652130200024e-05, + "loss": 0.1235, + "step": 20595 + }, + { + "epoch": 2.4423099727261945, + "grad_norm": 1.1416730695277892, + "learning_rate": 1.738423499354777e-05, + "loss": 0.1209, + "step": 20596 + }, + { + "epoch": 2.4424285544883197, + "grad_norm": 0.6893737552999919, + "learning_rate": 1.7381948755303306e-05, + "loss": 0.0786, + "step": 20597 + }, + { + "epoch": 2.4425471362504445, + "grad_norm": 0.7130118751058202, + "learning_rate": 1.737966258728794e-05, + "loss": 0.1089, + "step": 20598 + }, + { + "epoch": 2.4426657180125697, + "grad_norm": 0.6488331081266713, + "learning_rate": 1.737737648952274e-05, + "loss": 0.0988, + "step": 20599 + }, + { + "epoch": 2.4427842997746945, + "grad_norm": 0.8242305527172274, + "learning_rate": 1.7375090462028787e-05, + "loss": 0.1227, + "step": 20600 + }, + { + "epoch": 2.4429028815368197, + "grad_norm": 0.6852341620811824, + "learning_rate": 1.7372804504827132e-05, + "loss": 0.1078, + "step": 20601 + }, + { + "epoch": 2.4430214632989444, + "grad_norm": 0.4990574898999918, + "learning_rate": 1.737051861793888e-05, + "loss": 0.0582, + "step": 20602 + }, + { + "epoch": 2.4431400450610696, + "grad_norm": 0.650274925896552, + "learning_rate": 1.7368232801385086e-05, + "loss": 0.1148, + "step": 20603 + }, + { + "epoch": 2.4432586268231944, + "grad_norm": 0.6379775134057803, + "learning_rate": 1.7365947055186827e-05, + "loss": 0.0792, + "step": 20604 + }, + { + "epoch": 2.4433772085853196, + "grad_norm": 1.0406083264737265, + "learning_rate": 1.7363661379365174e-05, + "loss": 0.1179, + "step": 20605 + }, + { + "epoch": 2.443495790347445, + "grad_norm": 0.48944762221820415, + "learning_rate": 1.73613757739412e-05, + "loss": 0.0628, + "step": 20606 + }, + { + "epoch": 2.4436143721095696, + "grad_norm": 0.6507145661079988, + "learning_rate": 1.735909023893598e-05, + "loss": 0.086, + "step": 20607 + }, + { + "epoch": 2.4437329538716943, + "grad_norm": 0.7364688983701472, + "learning_rate": 1.7356804774370565e-05, + "loss": 0.0924, + "step": 20608 + }, + { + "epoch": 2.4438515356338195, + "grad_norm": 0.805729154334331, + "learning_rate": 1.735451938026605e-05, + "loss": 0.1342, + "step": 20609 + }, + { + "epoch": 2.4439701173959447, + "grad_norm": 0.928480013438251, + "learning_rate": 1.7352234056643488e-05, + "loss": 0.1338, + "step": 20610 + }, + { + "epoch": 2.4440886991580695, + "grad_norm": 0.9266314716924047, + "learning_rate": 1.734994880352395e-05, + "loss": 0.1294, + "step": 20611 + }, + { + "epoch": 2.4442072809201942, + "grad_norm": 0.5453040668964139, + "learning_rate": 1.7347663620928495e-05, + "loss": 0.0722, + "step": 20612 + }, + { + "epoch": 2.4443258626823194, + "grad_norm": 0.8332706076822329, + "learning_rate": 1.7345378508878206e-05, + "loss": 0.1402, + "step": 20613 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.8407623236069776, + "learning_rate": 1.7343093467394133e-05, + "loss": 0.1106, + "step": 20614 + }, + { + "epoch": 2.4445630262065694, + "grad_norm": 0.6323529820958378, + "learning_rate": 1.7340808496497358e-05, + "loss": 0.09, + "step": 20615 + }, + { + "epoch": 2.4446816079686946, + "grad_norm": 0.6042538023486035, + "learning_rate": 1.733852359620892e-05, + "loss": 0.0795, + "step": 20616 + }, + { + "epoch": 2.4448001897308194, + "grad_norm": 0.7231274228070389, + "learning_rate": 1.733623876654992e-05, + "loss": 0.0901, + "step": 20617 + }, + { + "epoch": 2.4449187714929446, + "grad_norm": 0.5799592583649538, + "learning_rate": 1.7333954007541393e-05, + "loss": 0.0709, + "step": 20618 + }, + { + "epoch": 2.4450373532550693, + "grad_norm": 0.7638795715868576, + "learning_rate": 1.73316693192044e-05, + "loss": 0.1257, + "step": 20619 + }, + { + "epoch": 2.4451559350171945, + "grad_norm": 0.5556023119603624, + "learning_rate": 1.7329384701560023e-05, + "loss": 0.0796, + "step": 20620 + }, + { + "epoch": 2.4452745167793193, + "grad_norm": 0.7641301363635972, + "learning_rate": 1.7327100154629312e-05, + "loss": 0.1167, + "step": 20621 + }, + { + "epoch": 2.4453930985414445, + "grad_norm": 1.0633401766103183, + "learning_rate": 1.7324815678433324e-05, + "loss": 0.1134, + "step": 20622 + }, + { + "epoch": 2.4455116803035692, + "grad_norm": 0.8603164762713452, + "learning_rate": 1.7322531272993125e-05, + "loss": 0.1225, + "step": 20623 + }, + { + "epoch": 2.4456302620656944, + "grad_norm": 0.8806634849411635, + "learning_rate": 1.732024693832977e-05, + "loss": 0.0989, + "step": 20624 + }, + { + "epoch": 2.445748843827819, + "grad_norm": 0.6601042775963958, + "learning_rate": 1.7317962674464328e-05, + "loss": 0.0948, + "step": 20625 + }, + { + "epoch": 2.4458674255899444, + "grad_norm": 0.5277571994543706, + "learning_rate": 1.7315678481417848e-05, + "loss": 0.0722, + "step": 20626 + }, + { + "epoch": 2.445986007352069, + "grad_norm": 0.6957240548938082, + "learning_rate": 1.7313394359211382e-05, + "loss": 0.1074, + "step": 20627 + }, + { + "epoch": 2.4461045891141944, + "grad_norm": 0.7079549865104884, + "learning_rate": 1.7311110307866002e-05, + "loss": 0.0872, + "step": 20628 + }, + { + "epoch": 2.446223170876319, + "grad_norm": 0.682266693292401, + "learning_rate": 1.7308826327402755e-05, + "loss": 0.0804, + "step": 20629 + }, + { + "epoch": 2.4463417526384443, + "grad_norm": 0.8045841707327871, + "learning_rate": 1.7306542417842704e-05, + "loss": 0.1145, + "step": 20630 + }, + { + "epoch": 2.446460334400569, + "grad_norm": 0.6111302203005289, + "learning_rate": 1.730425857920688e-05, + "loss": 0.096, + "step": 20631 + }, + { + "epoch": 2.4465789161626943, + "grad_norm": 0.6780752106039678, + "learning_rate": 1.7301974811516368e-05, + "loss": 0.087, + "step": 20632 + }, + { + "epoch": 2.446697497924819, + "grad_norm": 0.5410577719577452, + "learning_rate": 1.7299691114792204e-05, + "loss": 0.0757, + "step": 20633 + }, + { + "epoch": 2.4468160796869443, + "grad_norm": 0.7660098513061135, + "learning_rate": 1.7297407489055445e-05, + "loss": 0.0803, + "step": 20634 + }, + { + "epoch": 2.446934661449069, + "grad_norm": 0.6969447459121716, + "learning_rate": 1.729512393432714e-05, + "loss": 0.089, + "step": 20635 + }, + { + "epoch": 2.447053243211194, + "grad_norm": 0.5030663985695636, + "learning_rate": 1.7292840450628353e-05, + "loss": 0.0726, + "step": 20636 + }, + { + "epoch": 2.447171824973319, + "grad_norm": 0.959201114598613, + "learning_rate": 1.7290557037980126e-05, + "loss": 0.123, + "step": 20637 + }, + { + "epoch": 2.447290406735444, + "grad_norm": 0.841351637113285, + "learning_rate": 1.7288273696403494e-05, + "loss": 0.13, + "step": 20638 + }, + { + "epoch": 2.447408988497569, + "grad_norm": 0.7420667933554135, + "learning_rate": 1.7285990425919536e-05, + "loss": 0.1147, + "step": 20639 + }, + { + "epoch": 2.447527570259694, + "grad_norm": 0.666337515076891, + "learning_rate": 1.728370722654929e-05, + "loss": 0.1003, + "step": 20640 + }, + { + "epoch": 2.447646152021819, + "grad_norm": 0.6292501415945561, + "learning_rate": 1.7281424098313795e-05, + "loss": 0.078, + "step": 20641 + }, + { + "epoch": 2.447764733783944, + "grad_norm": 0.6043837541234782, + "learning_rate": 1.7279141041234097e-05, + "loss": 0.071, + "step": 20642 + }, + { + "epoch": 2.447883315546069, + "grad_norm": 0.6909210683816364, + "learning_rate": 1.727685805533126e-05, + "loss": 0.1009, + "step": 20643 + }, + { + "epoch": 2.448001897308194, + "grad_norm": 0.545552949209228, + "learning_rate": 1.7274575140626318e-05, + "loss": 0.0698, + "step": 20644 + }, + { + "epoch": 2.448120479070319, + "grad_norm": 0.8094543423372235, + "learning_rate": 1.7272292297140322e-05, + "loss": 0.0913, + "step": 20645 + }, + { + "epoch": 2.448239060832444, + "grad_norm": 0.8169079042288704, + "learning_rate": 1.7270009524894303e-05, + "loss": 0.0967, + "step": 20646 + }, + { + "epoch": 2.448357642594569, + "grad_norm": 0.7202884643729462, + "learning_rate": 1.7267726823909333e-05, + "loss": 0.0992, + "step": 20647 + }, + { + "epoch": 2.448476224356694, + "grad_norm": 0.6731865720893271, + "learning_rate": 1.7265444194206436e-05, + "loss": 0.0918, + "step": 20648 + }, + { + "epoch": 2.4485948061188187, + "grad_norm": 0.5727225174476387, + "learning_rate": 1.726316163580665e-05, + "loss": 0.0636, + "step": 20649 + }, + { + "epoch": 2.448713387880944, + "grad_norm": 0.6190379343259996, + "learning_rate": 1.7260879148731036e-05, + "loss": 0.0763, + "step": 20650 + }, + { + "epoch": 2.4488319696430687, + "grad_norm": 0.6585955819196773, + "learning_rate": 1.7258596733000625e-05, + "loss": 0.0925, + "step": 20651 + }, + { + "epoch": 2.448950551405194, + "grad_norm": 0.7032723098575442, + "learning_rate": 1.7256314388636454e-05, + "loss": 0.0737, + "step": 20652 + }, + { + "epoch": 2.4490691331673187, + "grad_norm": 0.7510088915659506, + "learning_rate": 1.725403211565957e-05, + "loss": 0.1061, + "step": 20653 + }, + { + "epoch": 2.449187714929444, + "grad_norm": 0.48384843116692244, + "learning_rate": 1.7251749914091008e-05, + "loss": 0.0895, + "step": 20654 + }, + { + "epoch": 2.449306296691569, + "grad_norm": 0.5672097329875889, + "learning_rate": 1.724946778395182e-05, + "loss": 0.0791, + "step": 20655 + }, + { + "epoch": 2.449424878453694, + "grad_norm": 0.7311053300686302, + "learning_rate": 1.7247185725263032e-05, + "loss": 0.0919, + "step": 20656 + }, + { + "epoch": 2.4495434602158186, + "grad_norm": 0.6491733702485836, + "learning_rate": 1.7244903738045675e-05, + "loss": 0.0838, + "step": 20657 + }, + { + "epoch": 2.449662041977944, + "grad_norm": 0.6016747332409366, + "learning_rate": 1.7242621822320802e-05, + "loss": 0.075, + "step": 20658 + }, + { + "epoch": 2.449780623740069, + "grad_norm": 0.45238387191227286, + "learning_rate": 1.7240339978109448e-05, + "loss": 0.0649, + "step": 20659 + }, + { + "epoch": 2.4498992055021938, + "grad_norm": 0.5458136648623211, + "learning_rate": 1.7238058205432644e-05, + "loss": 0.0735, + "step": 20660 + }, + { + "epoch": 2.4500177872643185, + "grad_norm": 0.6241162072549651, + "learning_rate": 1.7235776504311412e-05, + "loss": 0.0905, + "step": 20661 + }, + { + "epoch": 2.4501363690264437, + "grad_norm": 0.5172845980238713, + "learning_rate": 1.723349487476681e-05, + "loss": 0.0778, + "step": 20662 + }, + { + "epoch": 2.450254950788569, + "grad_norm": 0.7687110034584835, + "learning_rate": 1.7231213316819854e-05, + "loss": 0.1529, + "step": 20663 + }, + { + "epoch": 2.4503735325506937, + "grad_norm": 1.027448874490979, + "learning_rate": 1.722893183049159e-05, + "loss": 0.1371, + "step": 20664 + }, + { + "epoch": 2.450492114312819, + "grad_norm": 0.829382107827657, + "learning_rate": 1.722665041580304e-05, + "loss": 0.0923, + "step": 20665 + }, + { + "epoch": 2.4506106960749436, + "grad_norm": 0.6521298261420704, + "learning_rate": 1.722436907277525e-05, + "loss": 0.0981, + "step": 20666 + }, + { + "epoch": 2.450729277837069, + "grad_norm": 0.6739328104033177, + "learning_rate": 1.7222087801429242e-05, + "loss": 0.0826, + "step": 20667 + }, + { + "epoch": 2.4508478595991936, + "grad_norm": 0.6710648357440095, + "learning_rate": 1.7219806601786033e-05, + "loss": 0.0746, + "step": 20668 + }, + { + "epoch": 2.450966441361319, + "grad_norm": 0.5415076284243004, + "learning_rate": 1.721752547386668e-05, + "loss": 0.0744, + "step": 20669 + }, + { + "epoch": 2.4510850231234436, + "grad_norm": 0.9078999517583739, + "learning_rate": 1.72152444176922e-05, + "loss": 0.1229, + "step": 20670 + }, + { + "epoch": 2.4512036048855688, + "grad_norm": 0.7716367283922257, + "learning_rate": 1.721296343328361e-05, + "loss": 0.1037, + "step": 20671 + }, + { + "epoch": 2.4513221866476935, + "grad_norm": 0.6293445284241302, + "learning_rate": 1.721068252066195e-05, + "loss": 0.0863, + "step": 20672 + }, + { + "epoch": 2.4514407684098187, + "grad_norm": 0.5510232692174492, + "learning_rate": 1.7208401679848246e-05, + "loss": 0.0903, + "step": 20673 + }, + { + "epoch": 2.4515593501719435, + "grad_norm": 0.7636611264392492, + "learning_rate": 1.7206120910863527e-05, + "loss": 0.0729, + "step": 20674 + }, + { + "epoch": 2.4516779319340687, + "grad_norm": 0.6618452472523966, + "learning_rate": 1.7203840213728817e-05, + "loss": 0.0896, + "step": 20675 + }, + { + "epoch": 2.4517965136961934, + "grad_norm": 0.7589325594404099, + "learning_rate": 1.720155958846513e-05, + "loss": 0.0952, + "step": 20676 + }, + { + "epoch": 2.4519150954583186, + "grad_norm": 1.225435835852238, + "learning_rate": 1.7199279035093513e-05, + "loss": 0.1987, + "step": 20677 + }, + { + "epoch": 2.4520336772204434, + "grad_norm": 0.5960674518193757, + "learning_rate": 1.7196998553634974e-05, + "loss": 0.087, + "step": 20678 + }, + { + "epoch": 2.4521522589825686, + "grad_norm": 0.8508331027558069, + "learning_rate": 1.719471814411053e-05, + "loss": 0.1155, + "step": 20679 + }, + { + "epoch": 2.4522708407446934, + "grad_norm": 0.5399054810817019, + "learning_rate": 1.7192437806541224e-05, + "loss": 0.0748, + "step": 20680 + }, + { + "epoch": 2.4523894225068186, + "grad_norm": 0.778903838751472, + "learning_rate": 1.7190157540948067e-05, + "loss": 0.1036, + "step": 20681 + }, + { + "epoch": 2.4525080042689433, + "grad_norm": 0.7838523505778842, + "learning_rate": 1.7187877347352073e-05, + "loss": 0.0946, + "step": 20682 + }, + { + "epoch": 2.4526265860310685, + "grad_norm": 0.4658373898373317, + "learning_rate": 1.7185597225774276e-05, + "loss": 0.0549, + "step": 20683 + }, + { + "epoch": 2.4527451677931933, + "grad_norm": 0.8030462255876556, + "learning_rate": 1.7183317176235686e-05, + "loss": 0.1209, + "step": 20684 + }, + { + "epoch": 2.4528637495553185, + "grad_norm": 0.725086099998036, + "learning_rate": 1.718103719875733e-05, + "loss": 0.1069, + "step": 20685 + }, + { + "epoch": 2.4529823313174433, + "grad_norm": 0.5637888978602948, + "learning_rate": 1.7178757293360227e-05, + "loss": 0.078, + "step": 20686 + }, + { + "epoch": 2.4531009130795685, + "grad_norm": 0.9298620444114336, + "learning_rate": 1.7176477460065377e-05, + "loss": 0.1109, + "step": 20687 + }, + { + "epoch": 2.453219494841693, + "grad_norm": 0.6242542919415361, + "learning_rate": 1.717419769889382e-05, + "loss": 0.0816, + "step": 20688 + }, + { + "epoch": 2.4533380766038184, + "grad_norm": 0.810179826049842, + "learning_rate": 1.7171918009866565e-05, + "loss": 0.1125, + "step": 20689 + }, + { + "epoch": 2.453456658365943, + "grad_norm": 0.6804402373606038, + "learning_rate": 1.7169638393004615e-05, + "loss": 0.0833, + "step": 20690 + }, + { + "epoch": 2.4535752401280684, + "grad_norm": 1.0331740420827813, + "learning_rate": 1.7167358848329012e-05, + "loss": 0.1428, + "step": 20691 + }, + { + "epoch": 2.453693821890193, + "grad_norm": 0.806913346781553, + "learning_rate": 1.7165079375860755e-05, + "loss": 0.0995, + "step": 20692 + }, + { + "epoch": 2.4538124036523183, + "grad_norm": 0.6699461398483338, + "learning_rate": 1.7162799975620848e-05, + "loss": 0.0857, + "step": 20693 + }, + { + "epoch": 2.453930985414443, + "grad_norm": 0.8506504365843588, + "learning_rate": 1.716052064763032e-05, + "loss": 0.1066, + "step": 20694 + }, + { + "epoch": 2.4540495671765683, + "grad_norm": 0.7536798769216085, + "learning_rate": 1.7158241391910174e-05, + "loss": 0.0886, + "step": 20695 + }, + { + "epoch": 2.454168148938693, + "grad_norm": 0.6007489884333291, + "learning_rate": 1.715596220848143e-05, + "loss": 0.0891, + "step": 20696 + }, + { + "epoch": 2.4542867307008183, + "grad_norm": 0.6970013884414722, + "learning_rate": 1.7153683097365097e-05, + "loss": 0.0884, + "step": 20697 + }, + { + "epoch": 2.454405312462943, + "grad_norm": 0.7896184770952923, + "learning_rate": 1.715140405858218e-05, + "loss": 0.1114, + "step": 20698 + }, + { + "epoch": 2.4545238942250682, + "grad_norm": 0.5907643080150148, + "learning_rate": 1.7149125092153695e-05, + "loss": 0.0533, + "step": 20699 + }, + { + "epoch": 2.454642475987193, + "grad_norm": 0.838773836410216, + "learning_rate": 1.7146846198100653e-05, + "loss": 0.1141, + "step": 20700 + }, + { + "epoch": 2.454761057749318, + "grad_norm": 0.8718974771349209, + "learning_rate": 1.7144567376444052e-05, + "loss": 0.0988, + "step": 20701 + }, + { + "epoch": 2.454879639511443, + "grad_norm": 1.0893218566129321, + "learning_rate": 1.714228862720491e-05, + "loss": 0.1552, + "step": 20702 + }, + { + "epoch": 2.454998221273568, + "grad_norm": 0.8373746260281485, + "learning_rate": 1.7140009950404228e-05, + "loss": 0.1312, + "step": 20703 + }, + { + "epoch": 2.4551168030356934, + "grad_norm": 1.0272575885875217, + "learning_rate": 1.7137731346063024e-05, + "loss": 0.1395, + "step": 20704 + }, + { + "epoch": 2.455235384797818, + "grad_norm": 0.7064733980300377, + "learning_rate": 1.713545281420228e-05, + "loss": 0.084, + "step": 20705 + }, + { + "epoch": 2.455353966559943, + "grad_norm": 0.48064404090402707, + "learning_rate": 1.7133174354843033e-05, + "loss": 0.074, + "step": 20706 + }, + { + "epoch": 2.455472548322068, + "grad_norm": 0.983688647780766, + "learning_rate": 1.7130895968006266e-05, + "loss": 0.1494, + "step": 20707 + }, + { + "epoch": 2.4555911300841933, + "grad_norm": 0.5682623779642088, + "learning_rate": 1.7128617653712995e-05, + "loss": 0.0894, + "step": 20708 + }, + { + "epoch": 2.455709711846318, + "grad_norm": 0.92987816949919, + "learning_rate": 1.71263394119842e-05, + "loss": 0.1369, + "step": 20709 + }, + { + "epoch": 2.455828293608443, + "grad_norm": 1.114728635468689, + "learning_rate": 1.7124061242840915e-05, + "loss": 0.1171, + "step": 20710 + }, + { + "epoch": 2.455946875370568, + "grad_norm": 0.75004729641078, + "learning_rate": 1.7121783146304128e-05, + "loss": 0.1105, + "step": 20711 + }, + { + "epoch": 2.456065457132693, + "grad_norm": 0.7090513495845456, + "learning_rate": 1.711950512239483e-05, + "loss": 0.0957, + "step": 20712 + }, + { + "epoch": 2.456184038894818, + "grad_norm": 0.5673411041180091, + "learning_rate": 1.7117227171134032e-05, + "loss": 0.0843, + "step": 20713 + }, + { + "epoch": 2.4563026206569427, + "grad_norm": 0.7534423195457624, + "learning_rate": 1.7114949292542735e-05, + "loss": 0.1053, + "step": 20714 + }, + { + "epoch": 2.456421202419068, + "grad_norm": 0.6386549525004825, + "learning_rate": 1.711267148664194e-05, + "loss": 0.0931, + "step": 20715 + }, + { + "epoch": 2.456539784181193, + "grad_norm": 0.8179496571374062, + "learning_rate": 1.711039375345264e-05, + "loss": 0.1125, + "step": 20716 + }, + { + "epoch": 2.456658365943318, + "grad_norm": 0.590395900467978, + "learning_rate": 1.710811609299583e-05, + "loss": 0.1005, + "step": 20717 + }, + { + "epoch": 2.456776947705443, + "grad_norm": 0.43932923915635047, + "learning_rate": 1.710583850529252e-05, + "loss": 0.0643, + "step": 20718 + }, + { + "epoch": 2.456895529467568, + "grad_norm": 0.6971032617480657, + "learning_rate": 1.7103560990363696e-05, + "loss": 0.0955, + "step": 20719 + }, + { + "epoch": 2.457014111229693, + "grad_norm": 0.4761993358266804, + "learning_rate": 1.7101283548230347e-05, + "loss": 0.0694, + "step": 20720 + }, + { + "epoch": 2.457132692991818, + "grad_norm": 0.5690947823584435, + "learning_rate": 1.709900617891349e-05, + "loss": 0.0732, + "step": 20721 + }, + { + "epoch": 2.457251274753943, + "grad_norm": 0.6194894518129641, + "learning_rate": 1.7096728882434105e-05, + "loss": 0.0796, + "step": 20722 + }, + { + "epoch": 2.4573698565160678, + "grad_norm": 0.5692458567364709, + "learning_rate": 1.7094451658813183e-05, + "loss": 0.0793, + "step": 20723 + }, + { + "epoch": 2.457488438278193, + "grad_norm": 0.6118870772239535, + "learning_rate": 1.7092174508071725e-05, + "loss": 0.1035, + "step": 20724 + }, + { + "epoch": 2.4576070200403177, + "grad_norm": 0.7833486133996022, + "learning_rate": 1.7089897430230716e-05, + "loss": 0.1085, + "step": 20725 + }, + { + "epoch": 2.457725601802443, + "grad_norm": 0.7123978399525839, + "learning_rate": 1.7087620425311164e-05, + "loss": 0.1068, + "step": 20726 + }, + { + "epoch": 2.4578441835645677, + "grad_norm": 0.5545706573656838, + "learning_rate": 1.7085343493334043e-05, + "loss": 0.0853, + "step": 20727 + }, + { + "epoch": 2.457962765326693, + "grad_norm": 0.69181583901534, + "learning_rate": 1.7083066634320344e-05, + "loss": 0.0935, + "step": 20728 + }, + { + "epoch": 2.4580813470888176, + "grad_norm": 0.6367548304137194, + "learning_rate": 1.7080789848291066e-05, + "loss": 0.0869, + "step": 20729 + }, + { + "epoch": 2.458199928850943, + "grad_norm": 0.6287653160504295, + "learning_rate": 1.70785131352672e-05, + "loss": 0.0842, + "step": 20730 + }, + { + "epoch": 2.4583185106130676, + "grad_norm": 0.4747450642971999, + "learning_rate": 1.7076236495269725e-05, + "loss": 0.0786, + "step": 20731 + }, + { + "epoch": 2.458437092375193, + "grad_norm": 0.4435758192105646, + "learning_rate": 1.7073959928319627e-05, + "loss": 0.0621, + "step": 20732 + }, + { + "epoch": 2.4585556741373176, + "grad_norm": 0.8595675776563956, + "learning_rate": 1.70716834344379e-05, + "loss": 0.1254, + "step": 20733 + }, + { + "epoch": 2.4586742558994428, + "grad_norm": 0.5049559956172341, + "learning_rate": 1.706940701364554e-05, + "loss": 0.0705, + "step": 20734 + }, + { + "epoch": 2.4587928376615675, + "grad_norm": 0.5755230054484719, + "learning_rate": 1.706713066596351e-05, + "loss": 0.1004, + "step": 20735 + }, + { + "epoch": 2.4589114194236927, + "grad_norm": 0.928125280869315, + "learning_rate": 1.7064854391412814e-05, + "loss": 0.101, + "step": 20736 + }, + { + "epoch": 2.4590300011858175, + "grad_norm": 0.8798653726755944, + "learning_rate": 1.7062578190014435e-05, + "loss": 0.1032, + "step": 20737 + }, + { + "epoch": 2.4591485829479427, + "grad_norm": 0.7391205434172601, + "learning_rate": 1.7060302061789345e-05, + "loss": 0.1158, + "step": 20738 + }, + { + "epoch": 2.4592671647100675, + "grad_norm": 0.6241218180898043, + "learning_rate": 1.7058026006758526e-05, + "loss": 0.0816, + "step": 20739 + }, + { + "epoch": 2.4593857464721927, + "grad_norm": 0.9129901370021706, + "learning_rate": 1.705575002494298e-05, + "loss": 0.1269, + "step": 20740 + }, + { + "epoch": 2.4595043282343174, + "grad_norm": 0.7853614965000809, + "learning_rate": 1.7053474116363676e-05, + "loss": 0.1131, + "step": 20741 + }, + { + "epoch": 2.4596229099964426, + "grad_norm": 0.6805143473698542, + "learning_rate": 1.7051198281041593e-05, + "loss": 0.0821, + "step": 20742 + }, + { + "epoch": 2.4597414917585674, + "grad_norm": 0.6933708523138653, + "learning_rate": 1.7048922518997713e-05, + "loss": 0.0891, + "step": 20743 + }, + { + "epoch": 2.4598600735206926, + "grad_norm": 0.8001773645776676, + "learning_rate": 1.7046646830253017e-05, + "loss": 0.1288, + "step": 20744 + }, + { + "epoch": 2.4599786552828173, + "grad_norm": 1.1201929494605851, + "learning_rate": 1.7044371214828487e-05, + "loss": 0.1192, + "step": 20745 + }, + { + "epoch": 2.4600972370449425, + "grad_norm": 0.9153553407158945, + "learning_rate": 1.70420956727451e-05, + "loss": 0.1244, + "step": 20746 + }, + { + "epoch": 2.4602158188070673, + "grad_norm": 0.7389488851208383, + "learning_rate": 1.703982020402382e-05, + "loss": 0.1127, + "step": 20747 + }, + { + "epoch": 2.4603344005691925, + "grad_norm": 1.1523753301387447, + "learning_rate": 1.7037544808685653e-05, + "loss": 0.1479, + "step": 20748 + }, + { + "epoch": 2.4604529823313173, + "grad_norm": 0.6671207068770151, + "learning_rate": 1.7035269486751554e-05, + "loss": 0.0994, + "step": 20749 + }, + { + "epoch": 2.4605715640934425, + "grad_norm": 0.8258596313317011, + "learning_rate": 1.7032994238242492e-05, + "loss": 0.0902, + "step": 20750 + }, + { + "epoch": 2.4606901458555672, + "grad_norm": 0.799154387052367, + "learning_rate": 1.7030719063179464e-05, + "loss": 0.1457, + "step": 20751 + }, + { + "epoch": 2.4608087276176924, + "grad_norm": 0.6779122176254897, + "learning_rate": 1.702844396158343e-05, + "loss": 0.0999, + "step": 20752 + }, + { + "epoch": 2.4609273093798176, + "grad_norm": 0.9056489379806298, + "learning_rate": 1.7026168933475372e-05, + "loss": 0.105, + "step": 20753 + }, + { + "epoch": 2.4610458911419424, + "grad_norm": 0.5827249285188854, + "learning_rate": 1.702389397887625e-05, + "loss": 0.088, + "step": 20754 + }, + { + "epoch": 2.461164472904067, + "grad_norm": 0.6170775452447961, + "learning_rate": 1.7021619097807052e-05, + "loss": 0.1038, + "step": 20755 + }, + { + "epoch": 2.4612830546661923, + "grad_norm": 0.7667511235444905, + "learning_rate": 1.7019344290288744e-05, + "loss": 0.1067, + "step": 20756 + }, + { + "epoch": 2.4614016364283176, + "grad_norm": 0.6275994452482672, + "learning_rate": 1.7017069556342294e-05, + "loss": 0.0716, + "step": 20757 + }, + { + "epoch": 2.4615202181904423, + "grad_norm": 0.7299637542168926, + "learning_rate": 1.7014794895988662e-05, + "loss": 0.1157, + "step": 20758 + }, + { + "epoch": 2.461638799952567, + "grad_norm": 0.6221920224885246, + "learning_rate": 1.701252030924884e-05, + "loss": 0.0588, + "step": 20759 + }, + { + "epoch": 2.4617573817146923, + "grad_norm": 0.6430015647547059, + "learning_rate": 1.7010245796143785e-05, + "loss": 0.0886, + "step": 20760 + }, + { + "epoch": 2.4618759634768175, + "grad_norm": 0.6116693458371941, + "learning_rate": 1.7007971356694466e-05, + "loss": 0.0763, + "step": 20761 + }, + { + "epoch": 2.4619945452389422, + "grad_norm": 0.566986709936935, + "learning_rate": 1.700569699092185e-05, + "loss": 0.0763, + "step": 20762 + }, + { + "epoch": 2.462113127001067, + "grad_norm": 1.1885859149895452, + "learning_rate": 1.70034226988469e-05, + "loss": 0.1636, + "step": 20763 + }, + { + "epoch": 2.462231708763192, + "grad_norm": 0.7382774916257784, + "learning_rate": 1.7001148480490593e-05, + "loss": 0.1049, + "step": 20764 + }, + { + "epoch": 2.4623502905253174, + "grad_norm": 0.7122169619600097, + "learning_rate": 1.6998874335873876e-05, + "loss": 0.1031, + "step": 20765 + }, + { + "epoch": 2.462468872287442, + "grad_norm": 0.8568723934270331, + "learning_rate": 1.699660026501774e-05, + "loss": 0.1362, + "step": 20766 + }, + { + "epoch": 2.4625874540495674, + "grad_norm": 0.9129848078809151, + "learning_rate": 1.6994326267943134e-05, + "loss": 0.1301, + "step": 20767 + }, + { + "epoch": 2.462706035811692, + "grad_norm": 1.004682991546434, + "learning_rate": 1.6992052344671023e-05, + "loss": 0.1407, + "step": 20768 + }, + { + "epoch": 2.4628246175738173, + "grad_norm": 0.5274869455322904, + "learning_rate": 1.6989778495222357e-05, + "loss": 0.0768, + "step": 20769 + }, + { + "epoch": 2.462943199335942, + "grad_norm": 0.981989817381269, + "learning_rate": 1.6987504719618122e-05, + "loss": 0.1373, + "step": 20770 + }, + { + "epoch": 2.4630617810980673, + "grad_norm": 1.6364125518893593, + "learning_rate": 1.698523101787927e-05, + "loss": 0.192, + "step": 20771 + }, + { + "epoch": 2.463180362860192, + "grad_norm": 0.7375696201122908, + "learning_rate": 1.6982957390026748e-05, + "loss": 0.089, + "step": 20772 + }, + { + "epoch": 2.4632989446223172, + "grad_norm": 0.6379653717319274, + "learning_rate": 1.6980683836081535e-05, + "loss": 0.098, + "step": 20773 + }, + { + "epoch": 2.463417526384442, + "grad_norm": 0.823009970682833, + "learning_rate": 1.697841035606458e-05, + "loss": 0.138, + "step": 20774 + }, + { + "epoch": 2.463536108146567, + "grad_norm": 0.43252702956795147, + "learning_rate": 1.6976136949996853e-05, + "loss": 0.0577, + "step": 20775 + }, + { + "epoch": 2.463654689908692, + "grad_norm": 0.5038853493531708, + "learning_rate": 1.697386361789929e-05, + "loss": 0.077, + "step": 20776 + }, + { + "epoch": 2.463773271670817, + "grad_norm": 0.60103834317059, + "learning_rate": 1.697159035979287e-05, + "loss": 0.0886, + "step": 20777 + }, + { + "epoch": 2.463891853432942, + "grad_norm": 0.5737636061698748, + "learning_rate": 1.6969317175698547e-05, + "loss": 0.0888, + "step": 20778 + }, + { + "epoch": 2.464010435195067, + "grad_norm": 0.870653423814872, + "learning_rate": 1.6967044065637273e-05, + "loss": 0.1389, + "step": 20779 + }, + { + "epoch": 2.464129016957192, + "grad_norm": 0.6397697773338261, + "learning_rate": 1.6964771029629986e-05, + "loss": 0.0704, + "step": 20780 + }, + { + "epoch": 2.464247598719317, + "grad_norm": 0.5074488480020862, + "learning_rate": 1.696249806769767e-05, + "loss": 0.0754, + "step": 20781 + }, + { + "epoch": 2.464366180481442, + "grad_norm": 0.7673985569711886, + "learning_rate": 1.6960225179861262e-05, + "loss": 0.121, + "step": 20782 + }, + { + "epoch": 2.464484762243567, + "grad_norm": 0.8646260904109409, + "learning_rate": 1.695795236614172e-05, + "loss": 0.105, + "step": 20783 + }, + { + "epoch": 2.464603344005692, + "grad_norm": 0.6509168327452073, + "learning_rate": 1.6955679626559985e-05, + "loss": 0.1061, + "step": 20784 + }, + { + "epoch": 2.464721925767817, + "grad_norm": 1.3666039418112053, + "learning_rate": 1.6953406961137032e-05, + "loss": 0.1688, + "step": 20785 + }, + { + "epoch": 2.4648405075299418, + "grad_norm": 0.5179677156640112, + "learning_rate": 1.69511343698938e-05, + "loss": 0.0746, + "step": 20786 + }, + { + "epoch": 2.464959089292067, + "grad_norm": 0.5558608098946262, + "learning_rate": 1.694886185285124e-05, + "loss": 0.076, + "step": 20787 + }, + { + "epoch": 2.4650776710541917, + "grad_norm": 0.4255606961514756, + "learning_rate": 1.6946589410030287e-05, + "loss": 0.0597, + "step": 20788 + }, + { + "epoch": 2.465196252816317, + "grad_norm": 0.7737026815826021, + "learning_rate": 1.6944317041451915e-05, + "loss": 0.1137, + "step": 20789 + }, + { + "epoch": 2.4653148345784417, + "grad_norm": 0.5396480267058706, + "learning_rate": 1.694204474713706e-05, + "loss": 0.0665, + "step": 20790 + }, + { + "epoch": 2.465433416340567, + "grad_norm": 0.6876677017619486, + "learning_rate": 1.6939772527106665e-05, + "loss": 0.108, + "step": 20791 + }, + { + "epoch": 2.4655519981026917, + "grad_norm": 0.7176614857676281, + "learning_rate": 1.6937500381381692e-05, + "loss": 0.0985, + "step": 20792 + }, + { + "epoch": 2.465670579864817, + "grad_norm": 0.733196446799048, + "learning_rate": 1.693522830998307e-05, + "loss": 0.1015, + "step": 20793 + }, + { + "epoch": 2.4657891616269416, + "grad_norm": 0.9722975490572883, + "learning_rate": 1.6932956312931764e-05, + "loss": 0.114, + "step": 20794 + }, + { + "epoch": 2.465907743389067, + "grad_norm": 0.72007172237148, + "learning_rate": 1.6930684390248696e-05, + "loss": 0.1273, + "step": 20795 + }, + { + "epoch": 2.4660263251511916, + "grad_norm": 0.9439747019329725, + "learning_rate": 1.692841254195483e-05, + "loss": 0.1231, + "step": 20796 + }, + { + "epoch": 2.466144906913317, + "grad_norm": 0.839343208694002, + "learning_rate": 1.6926140768071106e-05, + "loss": 0.1138, + "step": 20797 + }, + { + "epoch": 2.4662634886754415, + "grad_norm": 0.6414732619751775, + "learning_rate": 1.692386906861846e-05, + "loss": 0.1076, + "step": 20798 + }, + { + "epoch": 2.4663820704375667, + "grad_norm": 0.7197324642693047, + "learning_rate": 1.6921597443617832e-05, + "loss": 0.0836, + "step": 20799 + }, + { + "epoch": 2.4665006521996915, + "grad_norm": 0.7101516697473397, + "learning_rate": 1.6919325893090175e-05, + "loss": 0.095, + "step": 20800 + }, + { + "epoch": 2.4666192339618167, + "grad_norm": 0.8001347536685708, + "learning_rate": 1.6917054417056423e-05, + "loss": 0.1194, + "step": 20801 + }, + { + "epoch": 2.466737815723942, + "grad_norm": 0.5281789561741611, + "learning_rate": 1.6914783015537518e-05, + "loss": 0.0807, + "step": 20802 + }, + { + "epoch": 2.4668563974860667, + "grad_norm": 0.8253117695585921, + "learning_rate": 1.691251168855439e-05, + "loss": 0.1292, + "step": 20803 + }, + { + "epoch": 2.4669749792481914, + "grad_norm": 0.6951253769962257, + "learning_rate": 1.6910240436127997e-05, + "loss": 0.1087, + "step": 20804 + }, + { + "epoch": 2.4670935610103166, + "grad_norm": 0.8507626889453094, + "learning_rate": 1.6907969258279265e-05, + "loss": 0.1304, + "step": 20805 + }, + { + "epoch": 2.467212142772442, + "grad_norm": 0.47753121639335266, + "learning_rate": 1.6905698155029124e-05, + "loss": 0.0684, + "step": 20806 + }, + { + "epoch": 2.4673307245345666, + "grad_norm": 1.33201830589515, + "learning_rate": 1.6903427126398534e-05, + "loss": 0.1835, + "step": 20807 + }, + { + "epoch": 2.4674493062966913, + "grad_norm": 0.5364184670646888, + "learning_rate": 1.6901156172408416e-05, + "loss": 0.0619, + "step": 20808 + }, + { + "epoch": 2.4675678880588165, + "grad_norm": 0.5538779292713681, + "learning_rate": 1.689888529307971e-05, + "loss": 0.0839, + "step": 20809 + }, + { + "epoch": 2.4676864698209418, + "grad_norm": 0.6816349757690612, + "learning_rate": 1.6896614488433328e-05, + "loss": 0.0847, + "step": 20810 + }, + { + "epoch": 2.4678050515830665, + "grad_norm": 0.874855711035919, + "learning_rate": 1.6894343758490238e-05, + "loss": 0.1089, + "step": 20811 + }, + { + "epoch": 2.4679236333451913, + "grad_norm": 0.8386167168719807, + "learning_rate": 1.6892073103271355e-05, + "loss": 0.1102, + "step": 20812 + }, + { + "epoch": 2.4680422151073165, + "grad_norm": 0.6675902130738254, + "learning_rate": 1.688980252279762e-05, + "loss": 0.0845, + "step": 20813 + }, + { + "epoch": 2.4681607968694417, + "grad_norm": 0.59146941613212, + "learning_rate": 1.6887532017089948e-05, + "loss": 0.0695, + "step": 20814 + }, + { + "epoch": 2.4682793786315664, + "grad_norm": 0.742236030931941, + "learning_rate": 1.6885261586169296e-05, + "loss": 0.102, + "step": 20815 + }, + { + "epoch": 2.4683979603936916, + "grad_norm": 0.5222529052743768, + "learning_rate": 1.688299123005658e-05, + "loss": 0.0633, + "step": 20816 + }, + { + "epoch": 2.4685165421558164, + "grad_norm": 0.7314251504842825, + "learning_rate": 1.688072094877273e-05, + "loss": 0.1058, + "step": 20817 + }, + { + "epoch": 2.4686351239179416, + "grad_norm": 0.5015893350329986, + "learning_rate": 1.6878450742338666e-05, + "loss": 0.072, + "step": 20818 + }, + { + "epoch": 2.4687537056800664, + "grad_norm": 0.5642978848452547, + "learning_rate": 1.6876180610775338e-05, + "loss": 0.0842, + "step": 20819 + }, + { + "epoch": 2.4688722874421916, + "grad_norm": 0.7908886411978082, + "learning_rate": 1.6873910554103664e-05, + "loss": 0.1044, + "step": 20820 + }, + { + "epoch": 2.4689908692043163, + "grad_norm": 0.5625421057479723, + "learning_rate": 1.687164057234456e-05, + "loss": 0.0667, + "step": 20821 + }, + { + "epoch": 2.4691094509664415, + "grad_norm": 0.6304259620510066, + "learning_rate": 1.686937066551897e-05, + "loss": 0.0876, + "step": 20822 + }, + { + "epoch": 2.4692280327285663, + "grad_norm": 0.7795828747700618, + "learning_rate": 1.6867100833647807e-05, + "loss": 0.0939, + "step": 20823 + }, + { + "epoch": 2.4693466144906915, + "grad_norm": 0.8188389853966487, + "learning_rate": 1.6864831076752006e-05, + "loss": 0.1132, + "step": 20824 + }, + { + "epoch": 2.4694651962528162, + "grad_norm": 0.5247779674095279, + "learning_rate": 1.686256139485248e-05, + "loss": 0.0582, + "step": 20825 + }, + { + "epoch": 2.4695837780149414, + "grad_norm": 0.7492646728498968, + "learning_rate": 1.6860291787970164e-05, + "loss": 0.1026, + "step": 20826 + }, + { + "epoch": 2.469702359777066, + "grad_norm": 0.659325577657611, + "learning_rate": 1.685802225612598e-05, + "loss": 0.0914, + "step": 20827 + }, + { + "epoch": 2.4698209415391914, + "grad_norm": 0.9625294419921993, + "learning_rate": 1.6855752799340846e-05, + "loss": 0.1369, + "step": 20828 + }, + { + "epoch": 2.469939523301316, + "grad_norm": 0.6760899523224889, + "learning_rate": 1.6853483417635674e-05, + "loss": 0.0792, + "step": 20829 + }, + { + "epoch": 2.4700581050634414, + "grad_norm": 0.4786214984169984, + "learning_rate": 1.6851214111031403e-05, + "loss": 0.054, + "step": 20830 + }, + { + "epoch": 2.470176686825566, + "grad_norm": 0.8780740686007096, + "learning_rate": 1.684894487954894e-05, + "loss": 0.1465, + "step": 20831 + }, + { + "epoch": 2.4702952685876913, + "grad_norm": 0.9258599112082196, + "learning_rate": 1.6846675723209216e-05, + "loss": 0.1309, + "step": 20832 + }, + { + "epoch": 2.470413850349816, + "grad_norm": 0.418278182923433, + "learning_rate": 1.6844406642033133e-05, + "loss": 0.0692, + "step": 20833 + }, + { + "epoch": 2.4705324321119413, + "grad_norm": 0.44993278633825823, + "learning_rate": 1.684213763604163e-05, + "loss": 0.0532, + "step": 20834 + }, + { + "epoch": 2.470651013874066, + "grad_norm": 0.6645901154689963, + "learning_rate": 1.6839868705255614e-05, + "loss": 0.0891, + "step": 20835 + }, + { + "epoch": 2.4707695956361913, + "grad_norm": 1.0365730572358296, + "learning_rate": 1.6837599849695995e-05, + "loss": 0.1032, + "step": 20836 + }, + { + "epoch": 2.470888177398316, + "grad_norm": 0.587135642422548, + "learning_rate": 1.6835331069383704e-05, + "loss": 0.0736, + "step": 20837 + }, + { + "epoch": 2.471006759160441, + "grad_norm": 0.73036808403641, + "learning_rate": 1.6833062364339648e-05, + "loss": 0.0855, + "step": 20838 + }, + { + "epoch": 2.471125340922566, + "grad_norm": 0.8579687972394711, + "learning_rate": 1.6830793734584742e-05, + "loss": 0.1123, + "step": 20839 + }, + { + "epoch": 2.471243922684691, + "grad_norm": 0.5133076166621489, + "learning_rate": 1.6828525180139888e-05, + "loss": 0.0568, + "step": 20840 + }, + { + "epoch": 2.471362504446816, + "grad_norm": 0.7116661095306224, + "learning_rate": 1.682625670102602e-05, + "loss": 0.0965, + "step": 20841 + }, + { + "epoch": 2.471481086208941, + "grad_norm": 0.4673459356829709, + "learning_rate": 1.6823988297264042e-05, + "loss": 0.0665, + "step": 20842 + }, + { + "epoch": 2.471599667971066, + "grad_norm": 0.5896110873083569, + "learning_rate": 1.6821719968874873e-05, + "loss": 0.0813, + "step": 20843 + }, + { + "epoch": 2.471718249733191, + "grad_norm": 0.5183638055753489, + "learning_rate": 1.68194517158794e-05, + "loss": 0.0759, + "step": 20844 + }, + { + "epoch": 2.471836831495316, + "grad_norm": 0.9030521975474598, + "learning_rate": 1.6817183538298565e-05, + "loss": 0.1475, + "step": 20845 + }, + { + "epoch": 2.471955413257441, + "grad_norm": 0.6263288392695359, + "learning_rate": 1.681491543615326e-05, + "loss": 0.0774, + "step": 20846 + }, + { + "epoch": 2.472073995019566, + "grad_norm": 0.7477516937382337, + "learning_rate": 1.6812647409464398e-05, + "loss": 0.0965, + "step": 20847 + }, + { + "epoch": 2.472192576781691, + "grad_norm": 0.5724601862420308, + "learning_rate": 1.6810379458252878e-05, + "loss": 0.0731, + "step": 20848 + }, + { + "epoch": 2.472311158543816, + "grad_norm": 0.8359756643915937, + "learning_rate": 1.6808111582539628e-05, + "loss": 0.0873, + "step": 20849 + }, + { + "epoch": 2.472429740305941, + "grad_norm": 0.5309654285608767, + "learning_rate": 1.6805843782345544e-05, + "loss": 0.0819, + "step": 20850 + }, + { + "epoch": 2.4725483220680657, + "grad_norm": 0.5490688889326046, + "learning_rate": 1.6803576057691516e-05, + "loss": 0.0657, + "step": 20851 + }, + { + "epoch": 2.472666903830191, + "grad_norm": 0.63263180598042, + "learning_rate": 1.6801308408598482e-05, + "loss": 0.0944, + "step": 20852 + }, + { + "epoch": 2.4727854855923157, + "grad_norm": 0.9599170512523392, + "learning_rate": 1.6799040835087325e-05, + "loss": 0.1342, + "step": 20853 + }, + { + "epoch": 2.472904067354441, + "grad_norm": 0.5064674152518135, + "learning_rate": 1.6796773337178955e-05, + "loss": 0.0607, + "step": 20854 + }, + { + "epoch": 2.473022649116566, + "grad_norm": 1.1686459350117822, + "learning_rate": 1.679450591489427e-05, + "loss": 0.172, + "step": 20855 + }, + { + "epoch": 2.473141230878691, + "grad_norm": 1.0091980741443507, + "learning_rate": 1.6792238568254186e-05, + "loss": 0.1215, + "step": 20856 + }, + { + "epoch": 2.4732598126408156, + "grad_norm": 0.6320187378887063, + "learning_rate": 1.67899712972796e-05, + "loss": 0.0856, + "step": 20857 + }, + { + "epoch": 2.473378394402941, + "grad_norm": 0.8091061130061451, + "learning_rate": 1.678770410199141e-05, + "loss": 0.0863, + "step": 20858 + }, + { + "epoch": 2.473496976165066, + "grad_norm": 0.8853092726514739, + "learning_rate": 1.678543698241051e-05, + "loss": 0.1121, + "step": 20859 + }, + { + "epoch": 2.473615557927191, + "grad_norm": 0.8236733445708486, + "learning_rate": 1.6783169938557812e-05, + "loss": 0.0953, + "step": 20860 + }, + { + "epoch": 2.4737341396893155, + "grad_norm": 0.7030607368041392, + "learning_rate": 1.678090297045421e-05, + "loss": 0.0916, + "step": 20861 + }, + { + "epoch": 2.4738527214514408, + "grad_norm": 0.6666951533446697, + "learning_rate": 1.67786360781206e-05, + "loss": 0.0987, + "step": 20862 + }, + { + "epoch": 2.473971303213566, + "grad_norm": 0.8857836874015975, + "learning_rate": 1.6776369261577885e-05, + "loss": 0.1154, + "step": 20863 + }, + { + "epoch": 2.4740898849756907, + "grad_norm": 0.7493463685740321, + "learning_rate": 1.6774102520846968e-05, + "loss": 0.1085, + "step": 20864 + }, + { + "epoch": 2.474208466737816, + "grad_norm": 0.4527035276209304, + "learning_rate": 1.6771835855948738e-05, + "loss": 0.0535, + "step": 20865 + }, + { + "epoch": 2.4743270484999407, + "grad_norm": 0.7629990952780102, + "learning_rate": 1.6769569266904078e-05, + "loss": 0.1346, + "step": 20866 + }, + { + "epoch": 2.474445630262066, + "grad_norm": 1.0294989187828374, + "learning_rate": 1.676730275373391e-05, + "loss": 0.1171, + "step": 20867 + }, + { + "epoch": 2.4745642120241906, + "grad_norm": 0.6702732317329356, + "learning_rate": 1.6765036316459113e-05, + "loss": 0.0665, + "step": 20868 + }, + { + "epoch": 2.474682793786316, + "grad_norm": 0.8388952650128475, + "learning_rate": 1.6762769955100585e-05, + "loss": 0.0887, + "step": 20869 + }, + { + "epoch": 2.4748013755484406, + "grad_norm": 0.5945641814413383, + "learning_rate": 1.6760503669679205e-05, + "loss": 0.0937, + "step": 20870 + }, + { + "epoch": 2.474919957310566, + "grad_norm": 0.5439257491835755, + "learning_rate": 1.675823746021589e-05, + "loss": 0.0745, + "step": 20871 + }, + { + "epoch": 2.4750385390726906, + "grad_norm": 0.8361707445319233, + "learning_rate": 1.6755971326731512e-05, + "loss": 0.0992, + "step": 20872 + }, + { + "epoch": 2.4751571208348158, + "grad_norm": 0.5613437874430875, + "learning_rate": 1.675370526924697e-05, + "loss": 0.066, + "step": 20873 + }, + { + "epoch": 2.4752757025969405, + "grad_norm": 0.5137499077786829, + "learning_rate": 1.675143928778315e-05, + "loss": 0.0681, + "step": 20874 + }, + { + "epoch": 2.4753942843590657, + "grad_norm": 0.6730086576044848, + "learning_rate": 1.674917338236095e-05, + "loss": 0.0812, + "step": 20875 + }, + { + "epoch": 2.4755128661211905, + "grad_norm": 1.0388758307379284, + "learning_rate": 1.6746907553001258e-05, + "loss": 0.1472, + "step": 20876 + }, + { + "epoch": 2.4756314478833157, + "grad_norm": 0.5251882127768448, + "learning_rate": 1.6744641799724942e-05, + "loss": 0.0804, + "step": 20877 + }, + { + "epoch": 2.4757500296454404, + "grad_norm": 0.5857378395106151, + "learning_rate": 1.6742376122552923e-05, + "loss": 0.0915, + "step": 20878 + }, + { + "epoch": 2.4758686114075656, + "grad_norm": 0.6276790742194166, + "learning_rate": 1.6740110521506063e-05, + "loss": 0.0823, + "step": 20879 + }, + { + "epoch": 2.4759871931696904, + "grad_norm": 0.5708576823823224, + "learning_rate": 1.6737844996605252e-05, + "loss": 0.0905, + "step": 20880 + }, + { + "epoch": 2.4761057749318156, + "grad_norm": 0.5879974987925588, + "learning_rate": 1.6735579547871378e-05, + "loss": 0.0709, + "step": 20881 + }, + { + "epoch": 2.4762243566939404, + "grad_norm": 0.8166135196007918, + "learning_rate": 1.6733314175325327e-05, + "loss": 0.1281, + "step": 20882 + }, + { + "epoch": 2.4763429384560656, + "grad_norm": 1.01793250411701, + "learning_rate": 1.6731048878987988e-05, + "loss": 0.1437, + "step": 20883 + }, + { + "epoch": 2.4764615202181903, + "grad_norm": 0.6000151118382209, + "learning_rate": 1.6728783658880236e-05, + "loss": 0.0777, + "step": 20884 + }, + { + "epoch": 2.4765801019803155, + "grad_norm": 0.5661926944313093, + "learning_rate": 1.6726518515022946e-05, + "loss": 0.086, + "step": 20885 + }, + { + "epoch": 2.4766986837424403, + "grad_norm": 0.9086423712169093, + "learning_rate": 1.6724253447437023e-05, + "loss": 0.1042, + "step": 20886 + }, + { + "epoch": 2.4768172655045655, + "grad_norm": 0.7487607421652654, + "learning_rate": 1.672198845614333e-05, + "loss": 0.0993, + "step": 20887 + }, + { + "epoch": 2.4769358472666902, + "grad_norm": 0.594224825883056, + "learning_rate": 1.6719723541162755e-05, + "loss": 0.0908, + "step": 20888 + }, + { + "epoch": 2.4770544290288155, + "grad_norm": 0.6166587855565865, + "learning_rate": 1.6717458702516166e-05, + "loss": 0.0799, + "step": 20889 + }, + { + "epoch": 2.47717301079094, + "grad_norm": 0.5861612401130948, + "learning_rate": 1.6715193940224457e-05, + "loss": 0.0969, + "step": 20890 + }, + { + "epoch": 2.4772915925530654, + "grad_norm": 0.4830215559146203, + "learning_rate": 1.67129292543085e-05, + "loss": 0.0654, + "step": 20891 + }, + { + "epoch": 2.47741017431519, + "grad_norm": 0.527961540532863, + "learning_rate": 1.671066464478917e-05, + "loss": 0.0667, + "step": 20892 + }, + { + "epoch": 2.4775287560773154, + "grad_norm": 0.5203924182673586, + "learning_rate": 1.6708400111687346e-05, + "loss": 0.061, + "step": 20893 + }, + { + "epoch": 2.47764733783944, + "grad_norm": 0.8834460911290144, + "learning_rate": 1.6706135655023912e-05, + "loss": 0.1159, + "step": 20894 + }, + { + "epoch": 2.4777659196015653, + "grad_norm": 0.6784038947702673, + "learning_rate": 1.6703871274819737e-05, + "loss": 0.0849, + "step": 20895 + }, + { + "epoch": 2.47788450136369, + "grad_norm": 0.6192179507101, + "learning_rate": 1.6701606971095687e-05, + "loss": 0.0977, + "step": 20896 + }, + { + "epoch": 2.4780030831258153, + "grad_norm": 0.5783644581521827, + "learning_rate": 1.6699342743872655e-05, + "loss": 0.0714, + "step": 20897 + }, + { + "epoch": 2.47812166488794, + "grad_norm": 0.7346089541084143, + "learning_rate": 1.66970785931715e-05, + "loss": 0.116, + "step": 20898 + }, + { + "epoch": 2.4782402466500653, + "grad_norm": 0.6916119273759727, + "learning_rate": 1.6694814519013102e-05, + "loss": 0.0917, + "step": 20899 + }, + { + "epoch": 2.47835882841219, + "grad_norm": 0.5770411178297564, + "learning_rate": 1.6692550521418317e-05, + "loss": 0.0721, + "step": 20900 + }, + { + "epoch": 2.478477410174315, + "grad_norm": 0.632840244238328, + "learning_rate": 1.669028660040804e-05, + "loss": 0.0851, + "step": 20901 + }, + { + "epoch": 2.47859599193644, + "grad_norm": 0.5075395247053768, + "learning_rate": 1.6688022756003123e-05, + "loss": 0.0742, + "step": 20902 + }, + { + "epoch": 2.478714573698565, + "grad_norm": 0.7468849260735413, + "learning_rate": 1.6685758988224453e-05, + "loss": 0.1225, + "step": 20903 + }, + { + "epoch": 2.4788331554606904, + "grad_norm": 0.7409211705218411, + "learning_rate": 1.6683495297092876e-05, + "loss": 0.1123, + "step": 20904 + }, + { + "epoch": 2.478951737222815, + "grad_norm": 0.7113152381720811, + "learning_rate": 1.6681231682629287e-05, + "loss": 0.0827, + "step": 20905 + }, + { + "epoch": 2.47907031898494, + "grad_norm": 0.6905949020336172, + "learning_rate": 1.667896814485454e-05, + "loss": 0.0797, + "step": 20906 + }, + { + "epoch": 2.479188900747065, + "grad_norm": 0.5787937602614343, + "learning_rate": 1.667670468378949e-05, + "loss": 0.0794, + "step": 20907 + }, + { + "epoch": 2.4793074825091903, + "grad_norm": 0.6054909455882728, + "learning_rate": 1.667444129945503e-05, + "loss": 0.0951, + "step": 20908 + }, + { + "epoch": 2.479426064271315, + "grad_norm": 0.505244236828614, + "learning_rate": 1.6672177991872007e-05, + "loss": 0.0649, + "step": 20909 + }, + { + "epoch": 2.47954464603344, + "grad_norm": 0.7433867246391286, + "learning_rate": 1.6669914761061288e-05, + "loss": 0.1021, + "step": 20910 + }, + { + "epoch": 2.479663227795565, + "grad_norm": 0.6960733448430833, + "learning_rate": 1.666765160704374e-05, + "loss": 0.0837, + "step": 20911 + }, + { + "epoch": 2.4797818095576902, + "grad_norm": 0.657769913239249, + "learning_rate": 1.6665388529840225e-05, + "loss": 0.0928, + "step": 20912 + }, + { + "epoch": 2.479900391319815, + "grad_norm": 0.7013208499142256, + "learning_rate": 1.666312552947162e-05, + "loss": 0.0941, + "step": 20913 + }, + { + "epoch": 2.4800189730819397, + "grad_norm": 0.7322826566240648, + "learning_rate": 1.6660862605958766e-05, + "loss": 0.0942, + "step": 20914 + }, + { + "epoch": 2.480137554844065, + "grad_norm": 0.8629534244016511, + "learning_rate": 1.6658599759322524e-05, + "loss": 0.1141, + "step": 20915 + }, + { + "epoch": 2.48025613660619, + "grad_norm": 0.7162965006787277, + "learning_rate": 1.6656336989583775e-05, + "loss": 0.0923, + "step": 20916 + }, + { + "epoch": 2.480374718368315, + "grad_norm": 0.7931900673066015, + "learning_rate": 1.6654074296763366e-05, + "loss": 0.1217, + "step": 20917 + }, + { + "epoch": 2.48049330013044, + "grad_norm": 0.8875287626150337, + "learning_rate": 1.6651811680882158e-05, + "loss": 0.0912, + "step": 20918 + }, + { + "epoch": 2.480611881892565, + "grad_norm": 0.7192129266069622, + "learning_rate": 1.6649549141960995e-05, + "loss": 0.0895, + "step": 20919 + }, + { + "epoch": 2.48073046365469, + "grad_norm": 0.5772411023556344, + "learning_rate": 1.6647286680020765e-05, + "loss": 0.069, + "step": 20920 + }, + { + "epoch": 2.480849045416815, + "grad_norm": 1.2966894282815704, + "learning_rate": 1.66450242950823e-05, + "loss": 0.1865, + "step": 20921 + }, + { + "epoch": 2.48096762717894, + "grad_norm": 0.6656895757611063, + "learning_rate": 1.664276198716647e-05, + "loss": 0.0819, + "step": 20922 + }, + { + "epoch": 2.481086208941065, + "grad_norm": 0.7166193321589911, + "learning_rate": 1.6640499756294124e-05, + "loss": 0.1119, + "step": 20923 + }, + { + "epoch": 2.48120479070319, + "grad_norm": 0.46124509979925676, + "learning_rate": 1.6638237602486123e-05, + "loss": 0.0621, + "step": 20924 + }, + { + "epoch": 2.4813233724653148, + "grad_norm": 0.6874137745423312, + "learning_rate": 1.663597552576332e-05, + "loss": 0.0836, + "step": 20925 + }, + { + "epoch": 2.48144195422744, + "grad_norm": 0.8654707313955415, + "learning_rate": 1.6633713526146552e-05, + "loss": 0.1146, + "step": 20926 + }, + { + "epoch": 2.4815605359895647, + "grad_norm": 0.7531655452112359, + "learning_rate": 1.6631451603656696e-05, + "loss": 0.1217, + "step": 20927 + }, + { + "epoch": 2.48167911775169, + "grad_norm": 0.9923874710229172, + "learning_rate": 1.6629189758314598e-05, + "loss": 0.1462, + "step": 20928 + }, + { + "epoch": 2.4817976995138147, + "grad_norm": 0.8159198518300302, + "learning_rate": 1.6626927990141107e-05, + "loss": 0.104, + "step": 20929 + }, + { + "epoch": 2.48191628127594, + "grad_norm": 0.39740758692740447, + "learning_rate": 1.6624666299157055e-05, + "loss": 0.0499, + "step": 20930 + }, + { + "epoch": 2.4820348630380646, + "grad_norm": 0.5903897924439601, + "learning_rate": 1.6622404685383326e-05, + "loss": 0.0765, + "step": 20931 + }, + { + "epoch": 2.48215344480019, + "grad_norm": 0.7859192777542752, + "learning_rate": 1.6620143148840743e-05, + "loss": 0.1177, + "step": 20932 + }, + { + "epoch": 2.4822720265623146, + "grad_norm": 0.7632807089580872, + "learning_rate": 1.6617881689550163e-05, + "loss": 0.1091, + "step": 20933 + }, + { + "epoch": 2.48239060832444, + "grad_norm": 0.7561981229300224, + "learning_rate": 1.6615620307532437e-05, + "loss": 0.0872, + "step": 20934 + }, + { + "epoch": 2.4825091900865646, + "grad_norm": 0.7965787867353483, + "learning_rate": 1.6613359002808416e-05, + "loss": 0.0919, + "step": 20935 + }, + { + "epoch": 2.4826277718486898, + "grad_norm": 0.6757902610536951, + "learning_rate": 1.6611097775398943e-05, + "loss": 0.0944, + "step": 20936 + }, + { + "epoch": 2.4827463536108145, + "grad_norm": 0.5083696099669677, + "learning_rate": 1.6608836625324846e-05, + "loss": 0.0842, + "step": 20937 + }, + { + "epoch": 2.4828649353729397, + "grad_norm": 0.9906304326052547, + "learning_rate": 1.6606575552606998e-05, + "loss": 0.1336, + "step": 20938 + }, + { + "epoch": 2.4829835171350645, + "grad_norm": 0.5215708039581911, + "learning_rate": 1.6604314557266233e-05, + "loss": 0.071, + "step": 20939 + }, + { + "epoch": 2.4831020988971897, + "grad_norm": 0.8223940491869038, + "learning_rate": 1.6602053639323385e-05, + "loss": 0.1286, + "step": 20940 + }, + { + "epoch": 2.4832206806593144, + "grad_norm": 0.8003510933586482, + "learning_rate": 1.65997927987993e-05, + "loss": 0.0887, + "step": 20941 + }, + { + "epoch": 2.4833392624214397, + "grad_norm": 0.7793940417115367, + "learning_rate": 1.659753203571483e-05, + "loss": 0.1091, + "step": 20942 + }, + { + "epoch": 2.4834578441835644, + "grad_norm": 0.6130098144960732, + "learning_rate": 1.6595271350090817e-05, + "loss": 0.0815, + "step": 20943 + }, + { + "epoch": 2.4835764259456896, + "grad_norm": 0.5664847305352829, + "learning_rate": 1.659301074194809e-05, + "loss": 0.0783, + "step": 20944 + }, + { + "epoch": 2.4836950077078144, + "grad_norm": 0.7097231409368371, + "learning_rate": 1.6590750211307487e-05, + "loss": 0.0973, + "step": 20945 + }, + { + "epoch": 2.4838135894699396, + "grad_norm": 0.9354876555123837, + "learning_rate": 1.658848975818987e-05, + "loss": 0.1341, + "step": 20946 + }, + { + "epoch": 2.4839321712320643, + "grad_norm": 0.5934335198534626, + "learning_rate": 1.6586229382616054e-05, + "loss": 0.071, + "step": 20947 + }, + { + "epoch": 2.4840507529941895, + "grad_norm": 0.88385131474436, + "learning_rate": 1.6583969084606883e-05, + "loss": 0.1079, + "step": 20948 + }, + { + "epoch": 2.4841693347563143, + "grad_norm": 0.7613334979374042, + "learning_rate": 1.6581708864183202e-05, + "loss": 0.1218, + "step": 20949 + }, + { + "epoch": 2.4842879165184395, + "grad_norm": 0.9421212087182589, + "learning_rate": 1.6579448721365842e-05, + "loss": 0.1313, + "step": 20950 + }, + { + "epoch": 2.4844064982805643, + "grad_norm": 0.6488531968603838, + "learning_rate": 1.6577188656175636e-05, + "loss": 0.075, + "step": 20951 + }, + { + "epoch": 2.4845250800426895, + "grad_norm": 0.5357455069988216, + "learning_rate": 1.6574928668633422e-05, + "loss": 0.0817, + "step": 20952 + }, + { + "epoch": 2.4846436618048147, + "grad_norm": 0.6503647907513621, + "learning_rate": 1.6572668758760033e-05, + "loss": 0.0996, + "step": 20953 + }, + { + "epoch": 2.4847622435669394, + "grad_norm": 0.6420818844364695, + "learning_rate": 1.657040892657631e-05, + "loss": 0.0855, + "step": 20954 + }, + { + "epoch": 2.484880825329064, + "grad_norm": 0.7605492243152511, + "learning_rate": 1.6568149172103077e-05, + "loss": 0.0917, + "step": 20955 + }, + { + "epoch": 2.4849994070911894, + "grad_norm": 0.47617936737169475, + "learning_rate": 1.6565889495361158e-05, + "loss": 0.0649, + "step": 20956 + }, + { + "epoch": 2.4851179888533146, + "grad_norm": 0.37976325081509865, + "learning_rate": 1.656362989637141e-05, + "loss": 0.0565, + "step": 20957 + }, + { + "epoch": 2.4852365706154393, + "grad_norm": 0.5427047367810001, + "learning_rate": 1.6561370375154648e-05, + "loss": 0.0633, + "step": 20958 + }, + { + "epoch": 2.485355152377564, + "grad_norm": 0.6911774052469812, + "learning_rate": 1.6559110931731698e-05, + "loss": 0.1105, + "step": 20959 + }, + { + "epoch": 2.4854737341396893, + "grad_norm": 0.6763546002861934, + "learning_rate": 1.6556851566123388e-05, + "loss": 0.1038, + "step": 20960 + }, + { + "epoch": 2.4855923159018145, + "grad_norm": 0.7472788979962444, + "learning_rate": 1.655459227835056e-05, + "loss": 0.0944, + "step": 20961 + }, + { + "epoch": 2.4857108976639393, + "grad_norm": 0.696215834188691, + "learning_rate": 1.655233306843404e-05, + "loss": 0.1207, + "step": 20962 + }, + { + "epoch": 2.485829479426064, + "grad_norm": 0.9048574467671708, + "learning_rate": 1.6550073936394634e-05, + "loss": 0.1323, + "step": 20963 + }, + { + "epoch": 2.4859480611881892, + "grad_norm": 0.5097156039690814, + "learning_rate": 1.65478148822532e-05, + "loss": 0.0778, + "step": 20964 + }, + { + "epoch": 2.4860666429503144, + "grad_norm": 0.7942265007997724, + "learning_rate": 1.6545555906030547e-05, + "loss": 0.0939, + "step": 20965 + }, + { + "epoch": 2.486185224712439, + "grad_norm": 0.8946468106744507, + "learning_rate": 1.65432970077475e-05, + "loss": 0.1422, + "step": 20966 + }, + { + "epoch": 2.4863038064745644, + "grad_norm": 0.6750826983195275, + "learning_rate": 1.6541038187424875e-05, + "loss": 0.0794, + "step": 20967 + }, + { + "epoch": 2.486422388236689, + "grad_norm": 0.5975614703106592, + "learning_rate": 1.653877944508351e-05, + "loss": 0.0661, + "step": 20968 + }, + { + "epoch": 2.4865409699988144, + "grad_norm": 0.6815471556242623, + "learning_rate": 1.6536520780744228e-05, + "loss": 0.0964, + "step": 20969 + }, + { + "epoch": 2.486659551760939, + "grad_norm": 0.8540552157673297, + "learning_rate": 1.6534262194427842e-05, + "loss": 0.0908, + "step": 20970 + }, + { + "epoch": 2.4867781335230643, + "grad_norm": 0.42788080757395663, + "learning_rate": 1.653200368615517e-05, + "loss": 0.0632, + "step": 20971 + }, + { + "epoch": 2.486896715285189, + "grad_norm": 0.7668171583071989, + "learning_rate": 1.6529745255947046e-05, + "loss": 0.1092, + "step": 20972 + }, + { + "epoch": 2.4870152970473143, + "grad_norm": 0.6664006723519624, + "learning_rate": 1.652748690382429e-05, + "loss": 0.0874, + "step": 20973 + }, + { + "epoch": 2.487133878809439, + "grad_norm": 0.4329012765958965, + "learning_rate": 1.652522862980771e-05, + "loss": 0.0775, + "step": 20974 + }, + { + "epoch": 2.4872524605715642, + "grad_norm": 1.0088065068974856, + "learning_rate": 1.6522970433918122e-05, + "loss": 0.1187, + "step": 20975 + }, + { + "epoch": 2.487371042333689, + "grad_norm": 0.8337394514819793, + "learning_rate": 1.652071231617636e-05, + "loss": 0.1302, + "step": 20976 + }, + { + "epoch": 2.487489624095814, + "grad_norm": 0.6270559694773218, + "learning_rate": 1.6518454276603236e-05, + "loss": 0.0822, + "step": 20977 + }, + { + "epoch": 2.487608205857939, + "grad_norm": 0.9116914622003164, + "learning_rate": 1.6516196315219552e-05, + "loss": 0.1203, + "step": 20978 + }, + { + "epoch": 2.487726787620064, + "grad_norm": 0.5898512638481359, + "learning_rate": 1.651393843204614e-05, + "loss": 0.1004, + "step": 20979 + }, + { + "epoch": 2.487845369382189, + "grad_norm": 0.8227093328341436, + "learning_rate": 1.6511680627103816e-05, + "loss": 0.1056, + "step": 20980 + }, + { + "epoch": 2.487963951144314, + "grad_norm": 0.7380753205886903, + "learning_rate": 1.6509422900413375e-05, + "loss": 0.0932, + "step": 20981 + }, + { + "epoch": 2.488082532906439, + "grad_norm": 0.6816959465422353, + "learning_rate": 1.650716525199565e-05, + "loss": 0.1157, + "step": 20982 + }, + { + "epoch": 2.488201114668564, + "grad_norm": 0.5790035443115789, + "learning_rate": 1.6504907681871444e-05, + "loss": 0.0771, + "step": 20983 + }, + { + "epoch": 2.488319696430689, + "grad_norm": 0.9090626175679626, + "learning_rate": 1.6502650190061576e-05, + "loss": 0.1337, + "step": 20984 + }, + { + "epoch": 2.488438278192814, + "grad_norm": 0.8202928200551096, + "learning_rate": 1.6500392776586854e-05, + "loss": 0.0955, + "step": 20985 + }, + { + "epoch": 2.488556859954939, + "grad_norm": 0.57657790025274, + "learning_rate": 1.6498135441468076e-05, + "loss": 0.0872, + "step": 20986 + }, + { + "epoch": 2.488675441717064, + "grad_norm": 0.6535433848161041, + "learning_rate": 1.6495878184726077e-05, + "loss": 0.0934, + "step": 20987 + }, + { + "epoch": 2.4887940234791888, + "grad_norm": 0.6682790318445135, + "learning_rate": 1.649362100638165e-05, + "loss": 0.0824, + "step": 20988 + }, + { + "epoch": 2.488912605241314, + "grad_norm": 0.590494952523208, + "learning_rate": 1.64913639064556e-05, + "loss": 0.0648, + "step": 20989 + }, + { + "epoch": 2.4890311870034387, + "grad_norm": 0.5219055670054221, + "learning_rate": 1.6489106884968745e-05, + "loss": 0.0778, + "step": 20990 + }, + { + "epoch": 2.489149768765564, + "grad_norm": 0.9277046891009498, + "learning_rate": 1.6486849941941885e-05, + "loss": 0.136, + "step": 20991 + }, + { + "epoch": 2.4892683505276887, + "grad_norm": 0.7934382714301292, + "learning_rate": 1.6484593077395832e-05, + "loss": 0.0843, + "step": 20992 + }, + { + "epoch": 2.489386932289814, + "grad_norm": 1.1096030669620274, + "learning_rate": 1.648233629135138e-05, + "loss": 0.1613, + "step": 20993 + }, + { + "epoch": 2.4895055140519387, + "grad_norm": 0.9015432757463602, + "learning_rate": 1.6480079583829356e-05, + "loss": 0.1054, + "step": 20994 + }, + { + "epoch": 2.489624095814064, + "grad_norm": 0.8349292984166051, + "learning_rate": 1.6477822954850548e-05, + "loss": 0.071, + "step": 20995 + }, + { + "epoch": 2.4897426775761886, + "grad_norm": 0.9883349893246602, + "learning_rate": 1.647556640443576e-05, + "loss": 0.1251, + "step": 20996 + }, + { + "epoch": 2.489861259338314, + "grad_norm": 0.8285140231457055, + "learning_rate": 1.647330993260579e-05, + "loss": 0.1245, + "step": 20997 + }, + { + "epoch": 2.4899798411004386, + "grad_norm": 0.752604353976522, + "learning_rate": 1.6471053539381454e-05, + "loss": 0.1213, + "step": 20998 + }, + { + "epoch": 2.4900984228625638, + "grad_norm": 0.7210798247126474, + "learning_rate": 1.6468797224783543e-05, + "loss": 0.1058, + "step": 20999 + }, + { + "epoch": 2.4902170046246885, + "grad_norm": 0.7166485219379403, + "learning_rate": 1.646654098883286e-05, + "loss": 0.1034, + "step": 21000 + }, + { + "epoch": 2.4903355863868137, + "grad_norm": 0.6165038951169779, + "learning_rate": 1.6464284831550202e-05, + "loss": 0.078, + "step": 21001 + }, + { + "epoch": 2.490454168148939, + "grad_norm": 1.1492604397342356, + "learning_rate": 1.646202875295637e-05, + "loss": 0.1568, + "step": 21002 + }, + { + "epoch": 2.4905727499110637, + "grad_norm": 0.7022925631336464, + "learning_rate": 1.645977275307217e-05, + "loss": 0.0816, + "step": 21003 + }, + { + "epoch": 2.4906913316731885, + "grad_norm": 0.5788172433099744, + "learning_rate": 1.645751683191839e-05, + "loss": 0.0842, + "step": 21004 + }, + { + "epoch": 2.4908099134353137, + "grad_norm": 0.6369284621193753, + "learning_rate": 1.645526098951582e-05, + "loss": 0.0854, + "step": 21005 + }, + { + "epoch": 2.490928495197439, + "grad_norm": 0.5218012636557446, + "learning_rate": 1.6453005225885276e-05, + "loss": 0.0722, + "step": 21006 + }, + { + "epoch": 2.4910470769595636, + "grad_norm": 0.5099624490551737, + "learning_rate": 1.6450749541047546e-05, + "loss": 0.0897, + "step": 21007 + }, + { + "epoch": 2.4911656587216884, + "grad_norm": 0.7414125441011692, + "learning_rate": 1.644849393502341e-05, + "loss": 0.1024, + "step": 21008 + }, + { + "epoch": 2.4912842404838136, + "grad_norm": 0.6426714720391214, + "learning_rate": 1.6446238407833678e-05, + "loss": 0.0814, + "step": 21009 + }, + { + "epoch": 2.491402822245939, + "grad_norm": 1.177101796614947, + "learning_rate": 1.6443982959499137e-05, + "loss": 0.1364, + "step": 21010 + }, + { + "epoch": 2.4915214040080635, + "grad_norm": 0.8369995287616867, + "learning_rate": 1.6441727590040586e-05, + "loss": 0.0995, + "step": 21011 + }, + { + "epoch": 2.4916399857701883, + "grad_norm": 0.6911979989012658, + "learning_rate": 1.6439472299478803e-05, + "loss": 0.0854, + "step": 21012 + }, + { + "epoch": 2.4917585675323135, + "grad_norm": 0.7541301444725492, + "learning_rate": 1.6437217087834598e-05, + "loss": 0.0962, + "step": 21013 + }, + { + "epoch": 2.4918771492944387, + "grad_norm": 0.6268166704635806, + "learning_rate": 1.643496195512875e-05, + "loss": 0.086, + "step": 21014 + }, + { + "epoch": 2.4919957310565635, + "grad_norm": 0.5415278267112105, + "learning_rate": 1.643270690138205e-05, + "loss": 0.0847, + "step": 21015 + }, + { + "epoch": 2.4921143128186887, + "grad_norm": 0.7128709100363025, + "learning_rate": 1.6430451926615275e-05, + "loss": 0.1086, + "step": 21016 + }, + { + "epoch": 2.4922328945808134, + "grad_norm": 0.6830142959178864, + "learning_rate": 1.6428197030849236e-05, + "loss": 0.1033, + "step": 21017 + }, + { + "epoch": 2.4923514763429386, + "grad_norm": 0.5625407940099288, + "learning_rate": 1.6425942214104707e-05, + "loss": 0.0985, + "step": 21018 + }, + { + "epoch": 2.4924700581050634, + "grad_norm": 0.7122861556638524, + "learning_rate": 1.6423687476402468e-05, + "loss": 0.1037, + "step": 21019 + }, + { + "epoch": 2.4925886398671886, + "grad_norm": 0.7527540324239157, + "learning_rate": 1.6421432817763323e-05, + "loss": 0.0967, + "step": 21020 + }, + { + "epoch": 2.4927072216293134, + "grad_norm": 0.5801564456535857, + "learning_rate": 1.6419178238208043e-05, + "loss": 0.098, + "step": 21021 + }, + { + "epoch": 2.4928258033914386, + "grad_norm": 0.7579988005053704, + "learning_rate": 1.641692373775742e-05, + "loss": 0.0993, + "step": 21022 + }, + { + "epoch": 2.4929443851535633, + "grad_norm": 0.6612872194761049, + "learning_rate": 1.641466931643223e-05, + "loss": 0.093, + "step": 21023 + }, + { + "epoch": 2.4930629669156885, + "grad_norm": 0.45438235603645605, + "learning_rate": 1.641241497425327e-05, + "loss": 0.0748, + "step": 21024 + }, + { + "epoch": 2.4931815486778133, + "grad_norm": 0.6688133350667054, + "learning_rate": 1.6410160711241312e-05, + "loss": 0.0989, + "step": 21025 + }, + { + "epoch": 2.4933001304399385, + "grad_norm": 0.6313075290433969, + "learning_rate": 1.640790652741714e-05, + "loss": 0.0835, + "step": 21026 + }, + { + "epoch": 2.4934187122020632, + "grad_norm": 0.8674709343223904, + "learning_rate": 1.640565242280152e-05, + "loss": 0.1222, + "step": 21027 + }, + { + "epoch": 2.4935372939641884, + "grad_norm": 0.6333832834710779, + "learning_rate": 1.6403398397415256e-05, + "loss": 0.0781, + "step": 21028 + }, + { + "epoch": 2.493655875726313, + "grad_norm": 0.6297365831626656, + "learning_rate": 1.640114445127912e-05, + "loss": 0.084, + "step": 21029 + }, + { + "epoch": 2.4937744574884384, + "grad_norm": 0.500447357258479, + "learning_rate": 1.6398890584413878e-05, + "loss": 0.0584, + "step": 21030 + }, + { + "epoch": 2.493893039250563, + "grad_norm": 0.6678671549588854, + "learning_rate": 1.639663679684032e-05, + "loss": 0.0983, + "step": 21031 + }, + { + "epoch": 2.4940116210126884, + "grad_norm": 0.5505614993383509, + "learning_rate": 1.639438308857922e-05, + "loss": 0.0676, + "step": 21032 + }, + { + "epoch": 2.494130202774813, + "grad_norm": 0.8532831111387837, + "learning_rate": 1.639212945965136e-05, + "loss": 0.1202, + "step": 21033 + }, + { + "epoch": 2.4942487845369383, + "grad_norm": 0.8146634902145333, + "learning_rate": 1.63898759100775e-05, + "loss": 0.1057, + "step": 21034 + }, + { + "epoch": 2.494367366299063, + "grad_norm": 0.7831359742311974, + "learning_rate": 1.6387622439878442e-05, + "loss": 0.1011, + "step": 21035 + }, + { + "epoch": 2.4944859480611883, + "grad_norm": 0.5707830189779725, + "learning_rate": 1.638536904907494e-05, + "loss": 0.0805, + "step": 21036 + }, + { + "epoch": 2.494604529823313, + "grad_norm": 0.4423528038086672, + "learning_rate": 1.6383115737687767e-05, + "loss": 0.0697, + "step": 21037 + }, + { + "epoch": 2.4947231115854382, + "grad_norm": 0.9445001071695924, + "learning_rate": 1.6380862505737695e-05, + "loss": 0.1245, + "step": 21038 + }, + { + "epoch": 2.494841693347563, + "grad_norm": 0.841551249513039, + "learning_rate": 1.637860935324551e-05, + "loss": 0.1179, + "step": 21039 + }, + { + "epoch": 2.494960275109688, + "grad_norm": 0.5425900343138955, + "learning_rate": 1.6376356280231968e-05, + "loss": 0.0817, + "step": 21040 + }, + { + "epoch": 2.495078856871813, + "grad_norm": 0.5798785672795183, + "learning_rate": 1.6374103286717852e-05, + "loss": 0.0976, + "step": 21041 + }, + { + "epoch": 2.495197438633938, + "grad_norm": 0.5878476819163321, + "learning_rate": 1.6371850372723917e-05, + "loss": 0.089, + "step": 21042 + }, + { + "epoch": 2.495316020396063, + "grad_norm": 0.6358450239154725, + "learning_rate": 1.636959753827095e-05, + "loss": 0.1019, + "step": 21043 + }, + { + "epoch": 2.495434602158188, + "grad_norm": 0.33361724864436526, + "learning_rate": 1.636734478337971e-05, + "loss": 0.0507, + "step": 21044 + }, + { + "epoch": 2.495553183920313, + "grad_norm": 0.5190870336727907, + "learning_rate": 1.6365092108070967e-05, + "loss": 0.0787, + "step": 21045 + }, + { + "epoch": 2.495671765682438, + "grad_norm": 1.0654419884861914, + "learning_rate": 1.6362839512365475e-05, + "loss": 0.1403, + "step": 21046 + }, + { + "epoch": 2.495790347444563, + "grad_norm": 0.6703725500474584, + "learning_rate": 1.636058699628402e-05, + "loss": 0.0865, + "step": 21047 + }, + { + "epoch": 2.495908929206688, + "grad_norm": 0.6616198330154293, + "learning_rate": 1.635833455984736e-05, + "loss": 0.0634, + "step": 21048 + }, + { + "epoch": 2.496027510968813, + "grad_norm": 0.6748966282606551, + "learning_rate": 1.6356082203076246e-05, + "loss": 0.0947, + "step": 21049 + }, + { + "epoch": 2.496146092730938, + "grad_norm": 0.7050391832256626, + "learning_rate": 1.6353829925991467e-05, + "loss": 0.1053, + "step": 21050 + }, + { + "epoch": 2.4962646744930628, + "grad_norm": 0.6031446126341878, + "learning_rate": 1.6351577728613765e-05, + "loss": 0.0786, + "step": 21051 + }, + { + "epoch": 2.496383256255188, + "grad_norm": 0.9357567648945577, + "learning_rate": 1.634932561096392e-05, + "loss": 0.14, + "step": 21052 + }, + { + "epoch": 2.4965018380173127, + "grad_norm": 0.44262326959543896, + "learning_rate": 1.6347073573062672e-05, + "loss": 0.0677, + "step": 21053 + }, + { + "epoch": 2.496620419779438, + "grad_norm": 0.8741395677259681, + "learning_rate": 1.6344821614930806e-05, + "loss": 0.1107, + "step": 21054 + }, + { + "epoch": 2.496739001541563, + "grad_norm": 0.38073980503737137, + "learning_rate": 1.634256973658907e-05, + "loss": 0.0612, + "step": 21055 + }, + { + "epoch": 2.496857583303688, + "grad_norm": 0.7601296368103896, + "learning_rate": 1.6340317938058225e-05, + "loss": 0.0946, + "step": 21056 + }, + { + "epoch": 2.4969761650658127, + "grad_norm": 0.5080509930198253, + "learning_rate": 1.633806621935902e-05, + "loss": 0.068, + "step": 21057 + }, + { + "epoch": 2.497094746827938, + "grad_norm": 0.4078579120013963, + "learning_rate": 1.6335814580512233e-05, + "loss": 0.0597, + "step": 21058 + }, + { + "epoch": 2.497213328590063, + "grad_norm": 0.7001256215314647, + "learning_rate": 1.6333563021538612e-05, + "loss": 0.0908, + "step": 21059 + }, + { + "epoch": 2.497331910352188, + "grad_norm": 0.6005160247782392, + "learning_rate": 1.6331311542458904e-05, + "loss": 0.0814, + "step": 21060 + }, + { + "epoch": 2.4974504921143126, + "grad_norm": 0.8590810676546597, + "learning_rate": 1.632906014329387e-05, + "loss": 0.1146, + "step": 21061 + }, + { + "epoch": 2.497569073876438, + "grad_norm": 0.598194631996934, + "learning_rate": 1.6326808824064276e-05, + "loss": 0.0994, + "step": 21062 + }, + { + "epoch": 2.497687655638563, + "grad_norm": 0.5895626496027416, + "learning_rate": 1.632455758479087e-05, + "loss": 0.0584, + "step": 21063 + }, + { + "epoch": 2.4978062374006877, + "grad_norm": 0.5571637429098188, + "learning_rate": 1.6322306425494398e-05, + "loss": 0.0696, + "step": 21064 + }, + { + "epoch": 2.497924819162813, + "grad_norm": 1.194417885821483, + "learning_rate": 1.632005534619563e-05, + "loss": 0.1339, + "step": 21065 + }, + { + "epoch": 2.4980434009249377, + "grad_norm": 0.7092644330854693, + "learning_rate": 1.6317804346915302e-05, + "loss": 0.0949, + "step": 21066 + }, + { + "epoch": 2.498161982687063, + "grad_norm": 0.672649230103092, + "learning_rate": 1.6315553427674174e-05, + "loss": 0.0884, + "step": 21067 + }, + { + "epoch": 2.4982805644491877, + "grad_norm": 0.6369340069997936, + "learning_rate": 1.6313302588492983e-05, + "loss": 0.0769, + "step": 21068 + }, + { + "epoch": 2.498399146211313, + "grad_norm": 0.8208171643275337, + "learning_rate": 1.63110518293925e-05, + "loss": 0.1358, + "step": 21069 + }, + { + "epoch": 2.4985177279734376, + "grad_norm": 0.5886825724371734, + "learning_rate": 1.630880115039346e-05, + "loss": 0.1013, + "step": 21070 + }, + { + "epoch": 2.498636309735563, + "grad_norm": 0.5215580416763855, + "learning_rate": 1.630655055151662e-05, + "loss": 0.0529, + "step": 21071 + }, + { + "epoch": 2.4987548914976876, + "grad_norm": 0.7391223397330811, + "learning_rate": 1.6304300032782715e-05, + "loss": 0.093, + "step": 21072 + }, + { + "epoch": 2.498873473259813, + "grad_norm": 0.49089771203862625, + "learning_rate": 1.630204959421251e-05, + "loss": 0.0766, + "step": 21073 + }, + { + "epoch": 2.4989920550219376, + "grad_norm": 0.8058096311132605, + "learning_rate": 1.629979923582674e-05, + "loss": 0.1332, + "step": 21074 + }, + { + "epoch": 2.4991106367840628, + "grad_norm": 0.9004524895138818, + "learning_rate": 1.6297548957646152e-05, + "loss": 0.121, + "step": 21075 + }, + { + "epoch": 2.4992292185461875, + "grad_norm": 0.9395667005202466, + "learning_rate": 1.629529875969148e-05, + "loss": 0.1451, + "step": 21076 + }, + { + "epoch": 2.4993478003083127, + "grad_norm": 0.7594174723515573, + "learning_rate": 1.6293048641983492e-05, + "loss": 0.1021, + "step": 21077 + }, + { + "epoch": 2.4994663820704375, + "grad_norm": 0.7551149106994824, + "learning_rate": 1.6290798604542916e-05, + "loss": 0.0959, + "step": 21078 + }, + { + "epoch": 2.4995849638325627, + "grad_norm": 0.9602651456545053, + "learning_rate": 1.628854864739049e-05, + "loss": 0.1496, + "step": 21079 + }, + { + "epoch": 2.4997035455946874, + "grad_norm": 0.6952490883077984, + "learning_rate": 1.6286298770546968e-05, + "loss": 0.0778, + "step": 21080 + }, + { + "epoch": 2.4998221273568126, + "grad_norm": 0.7586346124935794, + "learning_rate": 1.6284048974033083e-05, + "loss": 0.0949, + "step": 21081 + }, + { + "epoch": 2.4999407091189374, + "grad_norm": 0.4286714203589519, + "learning_rate": 1.6281799257869583e-05, + "loss": 0.0576, + "step": 21082 + }, + { + "epoch": 2.5000592908810626, + "grad_norm": 0.6033000594068344, + "learning_rate": 1.6279549622077195e-05, + "loss": 0.0702, + "step": 21083 + }, + { + "epoch": 2.5001778726431874, + "grad_norm": 0.7281039306164327, + "learning_rate": 1.6277300066676676e-05, + "loss": 0.111, + "step": 21084 + }, + { + "epoch": 2.5002964544053126, + "grad_norm": 0.48487470244200453, + "learning_rate": 1.6275050591688752e-05, + "loss": 0.0687, + "step": 21085 + }, + { + "epoch": 2.5004150361674373, + "grad_norm": 0.6558655928395012, + "learning_rate": 1.6272801197134163e-05, + "loss": 0.0981, + "step": 21086 + }, + { + "epoch": 2.5005336179295625, + "grad_norm": 0.5486047494231322, + "learning_rate": 1.6270551883033632e-05, + "loss": 0.0594, + "step": 21087 + }, + { + "epoch": 2.5006521996916873, + "grad_norm": 0.7297009101147324, + "learning_rate": 1.626830264940792e-05, + "loss": 0.0774, + "step": 21088 + }, + { + "epoch": 2.5007707814538125, + "grad_norm": 0.7381310700437813, + "learning_rate": 1.6266053496277744e-05, + "loss": 0.1074, + "step": 21089 + }, + { + "epoch": 2.5008893632159372, + "grad_norm": 0.7611058171077287, + "learning_rate": 1.6263804423663853e-05, + "loss": 0.0828, + "step": 21090 + }, + { + "epoch": 2.5010079449780624, + "grad_norm": 0.6721245629414353, + "learning_rate": 1.626155543158696e-05, + "loss": 0.0866, + "step": 21091 + }, + { + "epoch": 2.501126526740187, + "grad_norm": 0.6720205933846405, + "learning_rate": 1.625930652006782e-05, + "loss": 0.0786, + "step": 21092 + }, + { + "epoch": 2.5012451085023124, + "grad_norm": 0.5942359043274482, + "learning_rate": 1.6257057689127157e-05, + "loss": 0.0845, + "step": 21093 + }, + { + "epoch": 2.501363690264437, + "grad_norm": 0.7796736102857782, + "learning_rate": 1.6254808938785688e-05, + "loss": 0.1096, + "step": 21094 + }, + { + "epoch": 2.5014822720265624, + "grad_norm": 0.6023745000357921, + "learning_rate": 1.6252560269064168e-05, + "loss": 0.0909, + "step": 21095 + }, + { + "epoch": 2.5016008537886876, + "grad_norm": 0.5270114302487215, + "learning_rate": 1.6250311679983314e-05, + "loss": 0.0771, + "step": 21096 + }, + { + "epoch": 2.5017194355508123, + "grad_norm": 0.6368728934491102, + "learning_rate": 1.624806317156386e-05, + "loss": 0.1035, + "step": 21097 + }, + { + "epoch": 2.501838017312937, + "grad_norm": 0.7554421930450529, + "learning_rate": 1.624581474382652e-05, + "loss": 0.1031, + "step": 21098 + }, + { + "epoch": 2.5019565990750623, + "grad_norm": 0.7338338767631697, + "learning_rate": 1.6243566396792044e-05, + "loss": 0.1392, + "step": 21099 + }, + { + "epoch": 2.5020751808371875, + "grad_norm": 0.8630656316923839, + "learning_rate": 1.624131813048114e-05, + "loss": 0.1342, + "step": 21100 + }, + { + "epoch": 2.5021937625993123, + "grad_norm": 0.4285471761069529, + "learning_rate": 1.6239069944914547e-05, + "loss": 0.0518, + "step": 21101 + }, + { + "epoch": 2.502312344361437, + "grad_norm": 0.5858206773555166, + "learning_rate": 1.6236821840112977e-05, + "loss": 0.0768, + "step": 21102 + }, + { + "epoch": 2.502430926123562, + "grad_norm": 1.0179343089741784, + "learning_rate": 1.6234573816097174e-05, + "loss": 0.1331, + "step": 21103 + }, + { + "epoch": 2.5025495078856874, + "grad_norm": 0.7640512597837301, + "learning_rate": 1.623232587288785e-05, + "loss": 0.0897, + "step": 21104 + }, + { + "epoch": 2.502668089647812, + "grad_norm": 0.587564883460058, + "learning_rate": 1.623007801050572e-05, + "loss": 0.0763, + "step": 21105 + }, + { + "epoch": 2.502786671409937, + "grad_norm": 0.7818188416904707, + "learning_rate": 1.622783022897153e-05, + "loss": 0.1023, + "step": 21106 + }, + { + "epoch": 2.502905253172062, + "grad_norm": 0.7199714788891135, + "learning_rate": 1.622558252830599e-05, + "loss": 0.0813, + "step": 21107 + }, + { + "epoch": 2.5030238349341873, + "grad_norm": 0.6392257724187865, + "learning_rate": 1.622333490852981e-05, + "loss": 0.0831, + "step": 21108 + }, + { + "epoch": 2.503142416696312, + "grad_norm": 0.5904942136453676, + "learning_rate": 1.6221087369663715e-05, + "loss": 0.1005, + "step": 21109 + }, + { + "epoch": 2.503260998458437, + "grad_norm": 0.5831743352152686, + "learning_rate": 1.621883991172844e-05, + "loss": 0.0944, + "step": 21110 + }, + { + "epoch": 2.503379580220562, + "grad_norm": 0.542619296881233, + "learning_rate": 1.6216592534744684e-05, + "loss": 0.0685, + "step": 21111 + }, + { + "epoch": 2.5034981619826873, + "grad_norm": 0.6617774696689404, + "learning_rate": 1.621434523873318e-05, + "loss": 0.1018, + "step": 21112 + }, + { + "epoch": 2.503616743744812, + "grad_norm": 1.0481507196537825, + "learning_rate": 1.6212098023714634e-05, + "loss": 0.1692, + "step": 21113 + }, + { + "epoch": 2.503735325506937, + "grad_norm": 0.643732658667772, + "learning_rate": 1.6209850889709773e-05, + "loss": 0.0895, + "step": 21114 + }, + { + "epoch": 2.503853907269062, + "grad_norm": 0.5982056636137397, + "learning_rate": 1.620760383673931e-05, + "loss": 0.1002, + "step": 21115 + }, + { + "epoch": 2.503972489031187, + "grad_norm": 0.7658680497746043, + "learning_rate": 1.6205356864823955e-05, + "loss": 0.0792, + "step": 21116 + }, + { + "epoch": 2.504091070793312, + "grad_norm": 0.8868926006883749, + "learning_rate": 1.6203109973984414e-05, + "loss": 0.1231, + "step": 21117 + }, + { + "epoch": 2.5042096525554367, + "grad_norm": 0.842258850795142, + "learning_rate": 1.6200863164241425e-05, + "loss": 0.1021, + "step": 21118 + }, + { + "epoch": 2.504328234317562, + "grad_norm": 0.9746483719451734, + "learning_rate": 1.6198616435615683e-05, + "loss": 0.1147, + "step": 21119 + }, + { + "epoch": 2.504446816079687, + "grad_norm": 0.9152385740714258, + "learning_rate": 1.6196369788127903e-05, + "loss": 0.1246, + "step": 21120 + }, + { + "epoch": 2.504565397841812, + "grad_norm": 0.5728330921181314, + "learning_rate": 1.6194123221798797e-05, + "loss": 0.0568, + "step": 21121 + }, + { + "epoch": 2.504683979603937, + "grad_norm": 0.6517795301602478, + "learning_rate": 1.619187673664908e-05, + "loss": 0.0977, + "step": 21122 + }, + { + "epoch": 2.504802561366062, + "grad_norm": 1.0361559447042468, + "learning_rate": 1.618963033269946e-05, + "loss": 0.1061, + "step": 21123 + }, + { + "epoch": 2.504921143128187, + "grad_norm": 0.582555806287187, + "learning_rate": 1.6187384009970638e-05, + "loss": 0.0721, + "step": 21124 + }, + { + "epoch": 2.505039724890312, + "grad_norm": 0.6181166870481027, + "learning_rate": 1.6185137768483337e-05, + "loss": 0.1168, + "step": 21125 + }, + { + "epoch": 2.505158306652437, + "grad_norm": 0.6739163209054891, + "learning_rate": 1.618289160825825e-05, + "loss": 0.0764, + "step": 21126 + }, + { + "epoch": 2.5052768884145618, + "grad_norm": 0.5839819608437394, + "learning_rate": 1.6180645529316095e-05, + "loss": 0.0787, + "step": 21127 + }, + { + "epoch": 2.505395470176687, + "grad_norm": 0.7607065161068104, + "learning_rate": 1.6178399531677565e-05, + "loss": 0.1007, + "step": 21128 + }, + { + "epoch": 2.5055140519388117, + "grad_norm": 0.8262306818744597, + "learning_rate": 1.6176153615363382e-05, + "loss": 0.1113, + "step": 21129 + }, + { + "epoch": 2.505632633700937, + "grad_norm": 0.5636130975301148, + "learning_rate": 1.6173907780394236e-05, + "loss": 0.0671, + "step": 21130 + }, + { + "epoch": 2.5057512154630617, + "grad_norm": 0.9521062750254975, + "learning_rate": 1.617166202679084e-05, + "loss": 0.1437, + "step": 21131 + }, + { + "epoch": 2.505869797225187, + "grad_norm": 0.6627753308162271, + "learning_rate": 1.6169416354573892e-05, + "loss": 0.1045, + "step": 21132 + }, + { + "epoch": 2.5059883789873116, + "grad_norm": 0.5764820735013353, + "learning_rate": 1.61671707637641e-05, + "loss": 0.0755, + "step": 21133 + }, + { + "epoch": 2.506106960749437, + "grad_norm": 0.7572754219825468, + "learning_rate": 1.616492525438216e-05, + "loss": 0.1218, + "step": 21134 + }, + { + "epoch": 2.5062255425115616, + "grad_norm": 0.8408150035696037, + "learning_rate": 1.6162679826448772e-05, + "loss": 0.1109, + "step": 21135 + }, + { + "epoch": 2.506344124273687, + "grad_norm": 0.5757024579243644, + "learning_rate": 1.616043447998464e-05, + "loss": 0.0697, + "step": 21136 + }, + { + "epoch": 2.5064627060358116, + "grad_norm": 0.7653203109199602, + "learning_rate": 1.615818921501047e-05, + "loss": 0.137, + "step": 21137 + }, + { + "epoch": 2.5065812877979368, + "grad_norm": 0.6397537663865878, + "learning_rate": 1.6155944031546945e-05, + "loss": 0.0996, + "step": 21138 + }, + { + "epoch": 2.5066998695600615, + "grad_norm": 1.1691253996130948, + "learning_rate": 1.6153698929614776e-05, + "loss": 0.1325, + "step": 21139 + }, + { + "epoch": 2.5068184513221867, + "grad_norm": 0.4747747925562896, + "learning_rate": 1.615145390923465e-05, + "loss": 0.0668, + "step": 21140 + }, + { + "epoch": 2.5069370330843115, + "grad_norm": 0.6915127267555807, + "learning_rate": 1.614920897042727e-05, + "loss": 0.1155, + "step": 21141 + }, + { + "epoch": 2.5070556148464367, + "grad_norm": 0.5877855421865675, + "learning_rate": 1.614696411321333e-05, + "loss": 0.0766, + "step": 21142 + }, + { + "epoch": 2.5071741966085614, + "grad_norm": 0.9368142808347628, + "learning_rate": 1.614471933761352e-05, + "loss": 0.1295, + "step": 21143 + }, + { + "epoch": 2.5072927783706866, + "grad_norm": 0.5588919312477221, + "learning_rate": 1.6142474643648548e-05, + "loss": 0.095, + "step": 21144 + }, + { + "epoch": 2.507411360132812, + "grad_norm": 0.7437785462582569, + "learning_rate": 1.6140230031339095e-05, + "loss": 0.1087, + "step": 21145 + }, + { + "epoch": 2.5075299418949366, + "grad_norm": 1.1440651886030582, + "learning_rate": 1.6137985500705854e-05, + "loss": 0.1259, + "step": 21146 + }, + { + "epoch": 2.5076485236570614, + "grad_norm": 0.5100359729476346, + "learning_rate": 1.6135741051769516e-05, + "loss": 0.061, + "step": 21147 + }, + { + "epoch": 2.5077671054191866, + "grad_norm": 0.9561436716459476, + "learning_rate": 1.613349668455078e-05, + "loss": 0.1458, + "step": 21148 + }, + { + "epoch": 2.5078856871813118, + "grad_norm": 0.8346914258509308, + "learning_rate": 1.6131252399070328e-05, + "loss": 0.1313, + "step": 21149 + }, + { + "epoch": 2.5080042689434365, + "grad_norm": 1.0793388479332586, + "learning_rate": 1.6129008195348853e-05, + "loss": 0.0951, + "step": 21150 + }, + { + "epoch": 2.5081228507055613, + "grad_norm": 0.6006742998901674, + "learning_rate": 1.6126764073407048e-05, + "loss": 0.0747, + "step": 21151 + }, + { + "epoch": 2.5082414324676865, + "grad_norm": 0.6422206330645762, + "learning_rate": 1.61245200332656e-05, + "loss": 0.0862, + "step": 21152 + }, + { + "epoch": 2.5083600142298117, + "grad_norm": 0.6937765464670296, + "learning_rate": 1.612227607494519e-05, + "loss": 0.0711, + "step": 21153 + }, + { + "epoch": 2.5084785959919365, + "grad_norm": 0.8102706779057159, + "learning_rate": 1.6120032198466507e-05, + "loss": 0.1356, + "step": 21154 + }, + { + "epoch": 2.508597177754061, + "grad_norm": 0.6922246944778802, + "learning_rate": 1.611778840385024e-05, + "loss": 0.0792, + "step": 21155 + }, + { + "epoch": 2.5087157595161864, + "grad_norm": 0.7808227063432985, + "learning_rate": 1.6115544691117074e-05, + "loss": 0.1033, + "step": 21156 + }, + { + "epoch": 2.5088343412783116, + "grad_norm": 0.5979220493151917, + "learning_rate": 1.611330106028769e-05, + "loss": 0.079, + "step": 21157 + }, + { + "epoch": 2.5089529230404364, + "grad_norm": 0.8131332024416453, + "learning_rate": 1.6111057511382764e-05, + "loss": 0.1123, + "step": 21158 + }, + { + "epoch": 2.509071504802561, + "grad_norm": 0.7257758163689607, + "learning_rate": 1.6108814044422994e-05, + "loss": 0.1009, + "step": 21159 + }, + { + "epoch": 2.5091900865646863, + "grad_norm": 0.6929750379220999, + "learning_rate": 1.6106570659429055e-05, + "loss": 0.0975, + "step": 21160 + }, + { + "epoch": 2.5093086683268115, + "grad_norm": 0.5700697341794875, + "learning_rate": 1.6104327356421633e-05, + "loss": 0.0699, + "step": 21161 + }, + { + "epoch": 2.5094272500889363, + "grad_norm": 0.6928684983813714, + "learning_rate": 1.610208413542139e-05, + "loss": 0.0774, + "step": 21162 + }, + { + "epoch": 2.509545831851061, + "grad_norm": 0.7731091151700584, + "learning_rate": 1.6099840996449035e-05, + "loss": 0.1206, + "step": 21163 + }, + { + "epoch": 2.5096644136131863, + "grad_norm": 0.5599173950107457, + "learning_rate": 1.609759793952523e-05, + "loss": 0.067, + "step": 21164 + }, + { + "epoch": 2.5097829953753115, + "grad_norm": 0.7412282651517007, + "learning_rate": 1.6095354964670644e-05, + "loss": 0.1114, + "step": 21165 + }, + { + "epoch": 2.5099015771374362, + "grad_norm": 0.571506374677571, + "learning_rate": 1.6093112071905982e-05, + "loss": 0.0881, + "step": 21166 + }, + { + "epoch": 2.510020158899561, + "grad_norm": 0.6368275125936826, + "learning_rate": 1.6090869261251894e-05, + "loss": 0.0897, + "step": 21167 + }, + { + "epoch": 2.510138740661686, + "grad_norm": 1.0584719650990955, + "learning_rate": 1.6088626532729068e-05, + "loss": 0.1598, + "step": 21168 + }, + { + "epoch": 2.5102573224238114, + "grad_norm": 0.5799771825190118, + "learning_rate": 1.6086383886358176e-05, + "loss": 0.0847, + "step": 21169 + }, + { + "epoch": 2.510375904185936, + "grad_norm": 0.6510756646644285, + "learning_rate": 1.6084141322159894e-05, + "loss": 0.0648, + "step": 21170 + }, + { + "epoch": 2.5104944859480613, + "grad_norm": 0.6339639709063764, + "learning_rate": 1.60818988401549e-05, + "loss": 0.0878, + "step": 21171 + }, + { + "epoch": 2.510613067710186, + "grad_norm": 0.8567401309828432, + "learning_rate": 1.6079656440363865e-05, + "loss": 0.1366, + "step": 21172 + }, + { + "epoch": 2.5107316494723113, + "grad_norm": 0.9866758779082196, + "learning_rate": 1.6077414122807442e-05, + "loss": 0.1294, + "step": 21173 + }, + { + "epoch": 2.510850231234436, + "grad_norm": 1.007889576951197, + "learning_rate": 1.6075171887506335e-05, + "loss": 0.1185, + "step": 21174 + }, + { + "epoch": 2.5109688129965613, + "grad_norm": 0.7373451638446665, + "learning_rate": 1.60729297344812e-05, + "loss": 0.0975, + "step": 21175 + }, + { + "epoch": 2.511087394758686, + "grad_norm": 0.6009333135575639, + "learning_rate": 1.6070687663752693e-05, + "loss": 0.0857, + "step": 21176 + }, + { + "epoch": 2.5112059765208112, + "grad_norm": 0.8012535845621486, + "learning_rate": 1.606844567534151e-05, + "loss": 0.0638, + "step": 21177 + }, + { + "epoch": 2.511324558282936, + "grad_norm": 0.6619486379249423, + "learning_rate": 1.60662037692683e-05, + "loss": 0.0957, + "step": 21178 + }, + { + "epoch": 2.511443140045061, + "grad_norm": 0.6822073365194059, + "learning_rate": 1.6063961945553732e-05, + "loss": 0.1087, + "step": 21179 + }, + { + "epoch": 2.511561721807186, + "grad_norm": 0.7428110053192709, + "learning_rate": 1.606172020421848e-05, + "loss": 0.1075, + "step": 21180 + }, + { + "epoch": 2.511680303569311, + "grad_norm": 0.5176230749773819, + "learning_rate": 1.6059478545283203e-05, + "loss": 0.0679, + "step": 21181 + }, + { + "epoch": 2.511798885331436, + "grad_norm": 0.563625431628325, + "learning_rate": 1.6057236968768574e-05, + "loss": 0.063, + "step": 21182 + }, + { + "epoch": 2.511917467093561, + "grad_norm": 0.8606204972811653, + "learning_rate": 1.605499547469526e-05, + "loss": 0.1132, + "step": 21183 + }, + { + "epoch": 2.512036048855686, + "grad_norm": 0.7822027362067526, + "learning_rate": 1.60527540630839e-05, + "loss": 0.0991, + "step": 21184 + }, + { + "epoch": 2.512154630617811, + "grad_norm": 0.5612508037681575, + "learning_rate": 1.605051273395519e-05, + "loss": 0.0756, + "step": 21185 + }, + { + "epoch": 2.512273212379936, + "grad_norm": 0.7891987253655633, + "learning_rate": 1.6048271487329782e-05, + "loss": 0.0924, + "step": 21186 + }, + { + "epoch": 2.512391794142061, + "grad_norm": 0.5989264886433577, + "learning_rate": 1.6046030323228332e-05, + "loss": 0.0712, + "step": 21187 + }, + { + "epoch": 2.512510375904186, + "grad_norm": 0.7573045985997359, + "learning_rate": 1.604378924167149e-05, + "loss": 0.1066, + "step": 21188 + }, + { + "epoch": 2.512628957666311, + "grad_norm": 1.0946505912221336, + "learning_rate": 1.6041548242679937e-05, + "loss": 0.1464, + "step": 21189 + }, + { + "epoch": 2.5127475394284358, + "grad_norm": 0.6098427773513393, + "learning_rate": 1.6039307326274323e-05, + "loss": 0.0514, + "step": 21190 + }, + { + "epoch": 2.512866121190561, + "grad_norm": 0.6510119936805168, + "learning_rate": 1.6037066492475306e-05, + "loss": 0.0671, + "step": 21191 + }, + { + "epoch": 2.5129847029526857, + "grad_norm": 0.46791017399413354, + "learning_rate": 1.603482574130354e-05, + "loss": 0.0649, + "step": 21192 + }, + { + "epoch": 2.513103284714811, + "grad_norm": 0.6618851519477186, + "learning_rate": 1.60325850727797e-05, + "loss": 0.1012, + "step": 21193 + }, + { + "epoch": 2.5132218664769357, + "grad_norm": 0.7260795218145009, + "learning_rate": 1.603034448692442e-05, + "loss": 0.093, + "step": 21194 + }, + { + "epoch": 2.513340448239061, + "grad_norm": 0.4446402415870884, + "learning_rate": 1.602810398375836e-05, + "loss": 0.0632, + "step": 21195 + }, + { + "epoch": 2.5134590300011856, + "grad_norm": 0.5490139517931548, + "learning_rate": 1.602586356330219e-05, + "loss": 0.087, + "step": 21196 + }, + { + "epoch": 2.513577611763311, + "grad_norm": 0.9974464193836134, + "learning_rate": 1.602362322557655e-05, + "loss": 0.1256, + "step": 21197 + }, + { + "epoch": 2.513696193525436, + "grad_norm": 0.7183247002678562, + "learning_rate": 1.6021382970602094e-05, + "loss": 0.0888, + "step": 21198 + }, + { + "epoch": 2.513814775287561, + "grad_norm": 0.6905717781448057, + "learning_rate": 1.6019142798399472e-05, + "loss": 0.0926, + "step": 21199 + }, + { + "epoch": 2.5139333570496856, + "grad_norm": 0.6922766969248829, + "learning_rate": 1.6016902708989346e-05, + "loss": 0.1103, + "step": 21200 + }, + { + "epoch": 2.5140519388118108, + "grad_norm": 0.532485902784654, + "learning_rate": 1.6014662702392365e-05, + "loss": 0.0627, + "step": 21201 + }, + { + "epoch": 2.514170520573936, + "grad_norm": 0.7207496130375991, + "learning_rate": 1.601242277862917e-05, + "loss": 0.1012, + "step": 21202 + }, + { + "epoch": 2.5142891023360607, + "grad_norm": 0.8036522880946295, + "learning_rate": 1.6010182937720413e-05, + "loss": 0.0833, + "step": 21203 + }, + { + "epoch": 2.5144076840981855, + "grad_norm": 0.5365242018451132, + "learning_rate": 1.600794317968675e-05, + "loss": 0.0455, + "step": 21204 + }, + { + "epoch": 2.5145262658603107, + "grad_norm": 0.8245399410927114, + "learning_rate": 1.600570350454883e-05, + "loss": 0.1202, + "step": 21205 + }, + { + "epoch": 2.514644847622436, + "grad_norm": 0.9347200521136813, + "learning_rate": 1.600346391232728e-05, + "loss": 0.1076, + "step": 21206 + }, + { + "epoch": 2.5147634293845607, + "grad_norm": 0.4728885134283516, + "learning_rate": 1.600122440304277e-05, + "loss": 0.075, + "step": 21207 + }, + { + "epoch": 2.5148820111466854, + "grad_norm": 0.4142569114603341, + "learning_rate": 1.5998984976715934e-05, + "loss": 0.0664, + "step": 21208 + }, + { + "epoch": 2.5150005929088106, + "grad_norm": 0.6603189676975086, + "learning_rate": 1.599674563336742e-05, + "loss": 0.1087, + "step": 21209 + }, + { + "epoch": 2.515119174670936, + "grad_norm": 0.7157895741410386, + "learning_rate": 1.5994506373017867e-05, + "loss": 0.0883, + "step": 21210 + }, + { + "epoch": 2.5152377564330606, + "grad_norm": 0.8958167539895541, + "learning_rate": 1.599226719568792e-05, + "loss": 0.1462, + "step": 21211 + }, + { + "epoch": 2.5153563381951853, + "grad_norm": 0.6297402675771617, + "learning_rate": 1.5990028101398234e-05, + "loss": 0.1036, + "step": 21212 + }, + { + "epoch": 2.5154749199573105, + "grad_norm": 0.6449638904939751, + "learning_rate": 1.5987789090169435e-05, + "loss": 0.0942, + "step": 21213 + }, + { + "epoch": 2.5155935017194357, + "grad_norm": 0.6226921231345522, + "learning_rate": 1.598555016202216e-05, + "loss": 0.0963, + "step": 21214 + }, + { + "epoch": 2.5157120834815605, + "grad_norm": 0.6936114071149169, + "learning_rate": 1.5983311316977073e-05, + "loss": 0.072, + "step": 21215 + }, + { + "epoch": 2.5158306652436853, + "grad_norm": 0.7546230822107276, + "learning_rate": 1.5981072555054794e-05, + "loss": 0.1191, + "step": 21216 + }, + { + "epoch": 2.5159492470058105, + "grad_norm": 0.6952587993939495, + "learning_rate": 1.597883387627596e-05, + "loss": 0.0884, + "step": 21217 + }, + { + "epoch": 2.5160678287679357, + "grad_norm": 0.5856828432582977, + "learning_rate": 1.5976595280661218e-05, + "loss": 0.0621, + "step": 21218 + }, + { + "epoch": 2.5161864105300604, + "grad_norm": 0.6747654401282737, + "learning_rate": 1.5974356768231202e-05, + "loss": 0.1107, + "step": 21219 + }, + { + "epoch": 2.5163049922921856, + "grad_norm": 0.8071487310929476, + "learning_rate": 1.5972118339006555e-05, + "loss": 0.1145, + "step": 21220 + }, + { + "epoch": 2.5164235740543104, + "grad_norm": 0.5376883883596805, + "learning_rate": 1.5969879993007895e-05, + "loss": 0.069, + "step": 21221 + }, + { + "epoch": 2.5165421558164356, + "grad_norm": 0.8495537943063647, + "learning_rate": 1.5967641730255877e-05, + "loss": 0.1097, + "step": 21222 + }, + { + "epoch": 2.5166607375785603, + "grad_norm": 0.6993414598176287, + "learning_rate": 1.5965403550771125e-05, + "loss": 0.0989, + "step": 21223 + }, + { + "epoch": 2.5167793193406856, + "grad_norm": 0.979090746635705, + "learning_rate": 1.5963165454574276e-05, + "loss": 0.116, + "step": 21224 + }, + { + "epoch": 2.5168979011028103, + "grad_norm": 0.6425007592096162, + "learning_rate": 1.596092744168595e-05, + "loss": 0.083, + "step": 21225 + }, + { + "epoch": 2.5170164828649355, + "grad_norm": 1.4340654342075105, + "learning_rate": 1.5958689512126796e-05, + "loss": 0.1496, + "step": 21226 + }, + { + "epoch": 2.5171350646270603, + "grad_norm": 0.8443594787466172, + "learning_rate": 1.5956451665917437e-05, + "loss": 0.1346, + "step": 21227 + }, + { + "epoch": 2.5172536463891855, + "grad_norm": 0.6472856763117454, + "learning_rate": 1.59542139030785e-05, + "loss": 0.09, + "step": 21228 + }, + { + "epoch": 2.5173722281513102, + "grad_norm": 0.6663024515116961, + "learning_rate": 1.595197622363062e-05, + "loss": 0.0877, + "step": 21229 + }, + { + "epoch": 2.5174908099134354, + "grad_norm": 0.7530457177489921, + "learning_rate": 1.5949738627594424e-05, + "loss": 0.0844, + "step": 21230 + }, + { + "epoch": 2.51760939167556, + "grad_norm": 0.6769323080684769, + "learning_rate": 1.5947501114990542e-05, + "loss": 0.103, + "step": 21231 + }, + { + "epoch": 2.5177279734376854, + "grad_norm": 0.6126467305943148, + "learning_rate": 1.59452636858396e-05, + "loss": 0.0909, + "step": 21232 + }, + { + "epoch": 2.51784655519981, + "grad_norm": 0.8163258232955648, + "learning_rate": 1.5943026340162215e-05, + "loss": 0.1119, + "step": 21233 + }, + { + "epoch": 2.5179651369619354, + "grad_norm": 0.614448428275557, + "learning_rate": 1.594078907797903e-05, + "loss": 0.0803, + "step": 21234 + }, + { + "epoch": 2.51808371872406, + "grad_norm": 0.511285322680352, + "learning_rate": 1.5938551899310655e-05, + "loss": 0.0541, + "step": 21235 + }, + { + "epoch": 2.5182023004861853, + "grad_norm": 0.5824805455676073, + "learning_rate": 1.5936314804177715e-05, + "loss": 0.087, + "step": 21236 + }, + { + "epoch": 2.51832088224831, + "grad_norm": 0.6113144790880425, + "learning_rate": 1.593407779260085e-05, + "loss": 0.0836, + "step": 21237 + }, + { + "epoch": 2.5184394640104353, + "grad_norm": 0.6897680667976167, + "learning_rate": 1.5931840864600666e-05, + "loss": 0.0895, + "step": 21238 + }, + { + "epoch": 2.51855804577256, + "grad_norm": 0.7698529461466672, + "learning_rate": 1.5929604020197787e-05, + "loss": 0.0846, + "step": 21239 + }, + { + "epoch": 2.5186766275346852, + "grad_norm": 0.6485024022592171, + "learning_rate": 1.592736725941283e-05, + "loss": 0.0976, + "step": 21240 + }, + { + "epoch": 2.51879520929681, + "grad_norm": 0.9602080900584585, + "learning_rate": 1.592513058226643e-05, + "loss": 0.1429, + "step": 21241 + }, + { + "epoch": 2.518913791058935, + "grad_norm": 0.7968558645579773, + "learning_rate": 1.59228939887792e-05, + "loss": 0.1218, + "step": 21242 + }, + { + "epoch": 2.51903237282106, + "grad_norm": 0.7842297243723687, + "learning_rate": 1.5920657478971752e-05, + "loss": 0.1117, + "step": 21243 + }, + { + "epoch": 2.519150954583185, + "grad_norm": 0.7184039579375429, + "learning_rate": 1.59184210528647e-05, + "loss": 0.0876, + "step": 21244 + }, + { + "epoch": 2.51926953634531, + "grad_norm": 0.432472229028866, + "learning_rate": 1.5916184710478678e-05, + "loss": 0.0554, + "step": 21245 + }, + { + "epoch": 2.519388118107435, + "grad_norm": 0.687183045116382, + "learning_rate": 1.5913948451834298e-05, + "loss": 0.085, + "step": 21246 + }, + { + "epoch": 2.5195066998695603, + "grad_norm": 0.5864287301129602, + "learning_rate": 1.5911712276952162e-05, + "loss": 0.0698, + "step": 21247 + }, + { + "epoch": 2.519625281631685, + "grad_norm": 0.6846620742459917, + "learning_rate": 1.5909476185852898e-05, + "loss": 0.1071, + "step": 21248 + }, + { + "epoch": 2.51974386339381, + "grad_norm": 0.6260072587031013, + "learning_rate": 1.5907240178557115e-05, + "loss": 0.0814, + "step": 21249 + }, + { + "epoch": 2.519862445155935, + "grad_norm": 0.7705515979468751, + "learning_rate": 1.590500425508543e-05, + "loss": 0.1016, + "step": 21250 + }, + { + "epoch": 2.5199810269180603, + "grad_norm": 0.6867039540103161, + "learning_rate": 1.5902768415458448e-05, + "loss": 0.0915, + "step": 21251 + }, + { + "epoch": 2.520099608680185, + "grad_norm": 0.4893199393934805, + "learning_rate": 1.5900532659696787e-05, + "loss": 0.0665, + "step": 21252 + }, + { + "epoch": 2.5202181904423098, + "grad_norm": 0.527711742334573, + "learning_rate": 1.5898296987821064e-05, + "loss": 0.0678, + "step": 21253 + }, + { + "epoch": 2.520336772204435, + "grad_norm": 0.5925465391598634, + "learning_rate": 1.589606139985188e-05, + "loss": 0.0891, + "step": 21254 + }, + { + "epoch": 2.52045535396656, + "grad_norm": 0.7313935304750119, + "learning_rate": 1.5893825895809833e-05, + "loss": 0.0978, + "step": 21255 + }, + { + "epoch": 2.520573935728685, + "grad_norm": 0.5381858404958506, + "learning_rate": 1.5891590475715558e-05, + "loss": 0.0762, + "step": 21256 + }, + { + "epoch": 2.5206925174908097, + "grad_norm": 1.0878183173839218, + "learning_rate": 1.5889355139589645e-05, + "loss": 0.0989, + "step": 21257 + }, + { + "epoch": 2.520811099252935, + "grad_norm": 0.6887122326538243, + "learning_rate": 1.58871198874527e-05, + "loss": 0.0852, + "step": 21258 + }, + { + "epoch": 2.52092968101506, + "grad_norm": 0.6866559063405675, + "learning_rate": 1.588488471932534e-05, + "loss": 0.0768, + "step": 21259 + }, + { + "epoch": 2.521048262777185, + "grad_norm": 0.6488916116757518, + "learning_rate": 1.5882649635228164e-05, + "loss": 0.0948, + "step": 21260 + }, + { + "epoch": 2.5211668445393096, + "grad_norm": 0.5172522362637477, + "learning_rate": 1.5880414635181777e-05, + "loss": 0.0823, + "step": 21261 + }, + { + "epoch": 2.521285426301435, + "grad_norm": 0.6948933453001559, + "learning_rate": 1.587817971920678e-05, + "loss": 0.1039, + "step": 21262 + }, + { + "epoch": 2.52140400806356, + "grad_norm": 0.8030353967306548, + "learning_rate": 1.587594488732379e-05, + "loss": 0.1027, + "step": 21263 + }, + { + "epoch": 2.521522589825685, + "grad_norm": 0.5790485137881716, + "learning_rate": 1.58737101395534e-05, + "loss": 0.0767, + "step": 21264 + }, + { + "epoch": 2.5216411715878095, + "grad_norm": 0.8458766697666417, + "learning_rate": 1.587147547591621e-05, + "loss": 0.1408, + "step": 21265 + }, + { + "epoch": 2.5217597533499347, + "grad_norm": 0.4800373873828188, + "learning_rate": 1.5869240896432806e-05, + "loss": 0.0728, + "step": 21266 + }, + { + "epoch": 2.52187833511206, + "grad_norm": 0.5450342010884944, + "learning_rate": 1.586700640112382e-05, + "loss": 0.0691, + "step": 21267 + }, + { + "epoch": 2.5219969168741847, + "grad_norm": 0.795968038598675, + "learning_rate": 1.586477199000983e-05, + "loss": 0.089, + "step": 21268 + }, + { + "epoch": 2.5221154986363095, + "grad_norm": 0.7026515821204286, + "learning_rate": 1.5862537663111443e-05, + "loss": 0.0966, + "step": 21269 + }, + { + "epoch": 2.5222340803984347, + "grad_norm": 1.0283008232737523, + "learning_rate": 1.5860303420449245e-05, + "loss": 0.1082, + "step": 21270 + }, + { + "epoch": 2.52235266216056, + "grad_norm": 0.7113579075457984, + "learning_rate": 1.5858069262043854e-05, + "loss": 0.101, + "step": 21271 + }, + { + "epoch": 2.5224712439226846, + "grad_norm": 0.6153374128872715, + "learning_rate": 1.5855835187915852e-05, + "loss": 0.0778, + "step": 21272 + }, + { + "epoch": 2.52258982568481, + "grad_norm": 0.7571337036008207, + "learning_rate": 1.5853601198085834e-05, + "loss": 0.0953, + "step": 21273 + }, + { + "epoch": 2.5227084074469346, + "grad_norm": 0.8499453637619396, + "learning_rate": 1.585136729257439e-05, + "loss": 0.0763, + "step": 21274 + }, + { + "epoch": 2.52282698920906, + "grad_norm": 0.48813810173494915, + "learning_rate": 1.5849133471402127e-05, + "loss": 0.0674, + "step": 21275 + }, + { + "epoch": 2.5229455709711845, + "grad_norm": 0.798607820942903, + "learning_rate": 1.5846899734589638e-05, + "loss": 0.0914, + "step": 21276 + }, + { + "epoch": 2.5230641527333098, + "grad_norm": 0.8220704567490645, + "learning_rate": 1.5844666082157494e-05, + "loss": 0.079, + "step": 21277 + }, + { + "epoch": 2.5231827344954345, + "grad_norm": 1.2074088504968805, + "learning_rate": 1.5842432514126316e-05, + "loss": 0.1424, + "step": 21278 + }, + { + "epoch": 2.5233013162575597, + "grad_norm": 0.8473358634742717, + "learning_rate": 1.584019903051667e-05, + "loss": 0.1022, + "step": 21279 + }, + { + "epoch": 2.5234198980196845, + "grad_norm": 0.7860802042912769, + "learning_rate": 1.5837965631349164e-05, + "loss": 0.105, + "step": 21280 + }, + { + "epoch": 2.5235384797818097, + "grad_norm": 0.8088072671402129, + "learning_rate": 1.583573231664437e-05, + "loss": 0.1275, + "step": 21281 + }, + { + "epoch": 2.5236570615439344, + "grad_norm": 0.6148884886052473, + "learning_rate": 1.5833499086422894e-05, + "loss": 0.0904, + "step": 21282 + }, + { + "epoch": 2.5237756433060596, + "grad_norm": 0.49898405079213337, + "learning_rate": 1.583126594070532e-05, + "loss": 0.0675, + "step": 21283 + }, + { + "epoch": 2.5238942250681844, + "grad_norm": 0.8358561825814387, + "learning_rate": 1.582903287951223e-05, + "loss": 0.1006, + "step": 21284 + }, + { + "epoch": 2.5240128068303096, + "grad_norm": 0.580590977791898, + "learning_rate": 1.58267999028642e-05, + "loss": 0.091, + "step": 21285 + }, + { + "epoch": 2.5241313885924344, + "grad_norm": 0.7144230402814509, + "learning_rate": 1.5824567010781832e-05, + "loss": 0.0896, + "step": 21286 + }, + { + "epoch": 2.5242499703545596, + "grad_norm": 0.6599775877907417, + "learning_rate": 1.5822334203285704e-05, + "loss": 0.093, + "step": 21287 + }, + { + "epoch": 2.5243685521166843, + "grad_norm": 0.6735555424812282, + "learning_rate": 1.58201014803964e-05, + "loss": 0.0799, + "step": 21288 + }, + { + "epoch": 2.5244871338788095, + "grad_norm": 0.6469368809132241, + "learning_rate": 1.5817868842134504e-05, + "loss": 0.1072, + "step": 21289 + }, + { + "epoch": 2.5246057156409343, + "grad_norm": 0.655316105510702, + "learning_rate": 1.5815636288520594e-05, + "loss": 0.079, + "step": 21290 + }, + { + "epoch": 2.5247242974030595, + "grad_norm": 0.6535783033483098, + "learning_rate": 1.581340381957526e-05, + "loss": 0.0854, + "step": 21291 + }, + { + "epoch": 2.5248428791651842, + "grad_norm": 0.666211953830134, + "learning_rate": 1.5811171435319067e-05, + "loss": 0.0915, + "step": 21292 + }, + { + "epoch": 2.5249614609273094, + "grad_norm": 0.781464983928972, + "learning_rate": 1.580893913577262e-05, + "loss": 0.108, + "step": 21293 + }, + { + "epoch": 2.525080042689434, + "grad_norm": 0.6152789615941434, + "learning_rate": 1.5806706920956476e-05, + "loss": 0.0847, + "step": 21294 + }, + { + "epoch": 2.5251986244515594, + "grad_norm": 0.5779655423595016, + "learning_rate": 1.5804474790891227e-05, + "loss": 0.0862, + "step": 21295 + }, + { + "epoch": 2.5253172062136846, + "grad_norm": 0.5749657128377035, + "learning_rate": 1.5802242745597432e-05, + "loss": 0.0864, + "step": 21296 + }, + { + "epoch": 2.5254357879758094, + "grad_norm": 0.9151627899529122, + "learning_rate": 1.580001078509569e-05, + "loss": 0.1068, + "step": 21297 + }, + { + "epoch": 2.525554369737934, + "grad_norm": 0.740132708607028, + "learning_rate": 1.579777890940656e-05, + "loss": 0.1184, + "step": 21298 + }, + { + "epoch": 2.5256729515000593, + "grad_norm": 0.6812029282939753, + "learning_rate": 1.579554711855063e-05, + "loss": 0.0957, + "step": 21299 + }, + { + "epoch": 2.5257915332621845, + "grad_norm": 0.6280226773195359, + "learning_rate": 1.579331541254846e-05, + "loss": 0.0779, + "step": 21300 + }, + { + "epoch": 2.5259101150243093, + "grad_norm": 0.819494004408467, + "learning_rate": 1.5791083791420642e-05, + "loss": 0.1158, + "step": 21301 + }, + { + "epoch": 2.526028696786434, + "grad_norm": 0.789435546291929, + "learning_rate": 1.5788852255187736e-05, + "loss": 0.0826, + "step": 21302 + }, + { + "epoch": 2.5261472785485592, + "grad_norm": 0.7825214458603302, + "learning_rate": 1.5786620803870317e-05, + "loss": 0.0809, + "step": 21303 + }, + { + "epoch": 2.5262658603106845, + "grad_norm": 0.5654859634549121, + "learning_rate": 1.5784389437488947e-05, + "loss": 0.0725, + "step": 21304 + }, + { + "epoch": 2.526384442072809, + "grad_norm": 0.61824833713517, + "learning_rate": 1.5782158156064216e-05, + "loss": 0.0892, + "step": 21305 + }, + { + "epoch": 2.526503023834934, + "grad_norm": 0.5881089797525876, + "learning_rate": 1.5779926959616683e-05, + "loss": 0.0824, + "step": 21306 + }, + { + "epoch": 2.526621605597059, + "grad_norm": 0.6388794775630645, + "learning_rate": 1.5777695848166907e-05, + "loss": 0.0771, + "step": 21307 + }, + { + "epoch": 2.5267401873591844, + "grad_norm": 0.8025774161591848, + "learning_rate": 1.5775464821735475e-05, + "loss": 0.1343, + "step": 21308 + }, + { + "epoch": 2.526858769121309, + "grad_norm": 0.5742635003862505, + "learning_rate": 1.5773233880342942e-05, + "loss": 0.0787, + "step": 21309 + }, + { + "epoch": 2.526977350883434, + "grad_norm": 0.6401381837169694, + "learning_rate": 1.5771003024009884e-05, + "loss": 0.0746, + "step": 21310 + }, + { + "epoch": 2.527095932645559, + "grad_norm": 0.5758265508949326, + "learning_rate": 1.5768772252756848e-05, + "loss": 0.0752, + "step": 21311 + }, + { + "epoch": 2.5272145144076843, + "grad_norm": 0.4727605809320031, + "learning_rate": 1.576654156660442e-05, + "loss": 0.0614, + "step": 21312 + }, + { + "epoch": 2.527333096169809, + "grad_norm": 1.135875076729461, + "learning_rate": 1.576431096557316e-05, + "loss": 0.1525, + "step": 21313 + }, + { + "epoch": 2.527451677931934, + "grad_norm": 0.9188057472627202, + "learning_rate": 1.5762080449683627e-05, + "loss": 0.128, + "step": 21314 + }, + { + "epoch": 2.527570259694059, + "grad_norm": 0.8312820094796594, + "learning_rate": 1.5759850018956375e-05, + "loss": 0.1296, + "step": 21315 + }, + { + "epoch": 2.527688841456184, + "grad_norm": 1.1083637333029084, + "learning_rate": 1.5757619673411984e-05, + "loss": 0.136, + "step": 21316 + }, + { + "epoch": 2.527807423218309, + "grad_norm": 0.8689547664020171, + "learning_rate": 1.5755389413071004e-05, + "loss": 0.131, + "step": 21317 + }, + { + "epoch": 2.5279260049804337, + "grad_norm": 0.6218064871013912, + "learning_rate": 1.575315923795399e-05, + "loss": 0.0796, + "step": 21318 + }, + { + "epoch": 2.528044586742559, + "grad_norm": 1.075657374751013, + "learning_rate": 1.575092914808151e-05, + "loss": 0.14, + "step": 21319 + }, + { + "epoch": 2.528163168504684, + "grad_norm": 0.8143566622234711, + "learning_rate": 1.5748699143474125e-05, + "loss": 0.1162, + "step": 21320 + }, + { + "epoch": 2.528281750266809, + "grad_norm": 0.6309696135830213, + "learning_rate": 1.5746469224152393e-05, + "loss": 0.0898, + "step": 21321 + }, + { + "epoch": 2.528400332028934, + "grad_norm": 0.8991769490985351, + "learning_rate": 1.5744239390136854e-05, + "loss": 0.0948, + "step": 21322 + }, + { + "epoch": 2.528518913791059, + "grad_norm": 0.675833658404797, + "learning_rate": 1.574200964144809e-05, + "loss": 0.0652, + "step": 21323 + }, + { + "epoch": 2.528637495553184, + "grad_norm": 0.9999458834740371, + "learning_rate": 1.5739779978106644e-05, + "loss": 0.1419, + "step": 21324 + }, + { + "epoch": 2.528756077315309, + "grad_norm": 0.6585057640300015, + "learning_rate": 1.5737550400133072e-05, + "loss": 0.0919, + "step": 21325 + }, + { + "epoch": 2.528874659077434, + "grad_norm": 0.563245916762423, + "learning_rate": 1.5735320907547918e-05, + "loss": 0.0684, + "step": 21326 + }, + { + "epoch": 2.528993240839559, + "grad_norm": 0.49375253271008424, + "learning_rate": 1.573309150037175e-05, + "loss": 0.067, + "step": 21327 + }, + { + "epoch": 2.529111822601684, + "grad_norm": 0.5974908024149669, + "learning_rate": 1.573086217862511e-05, + "loss": 0.0772, + "step": 21328 + }, + { + "epoch": 2.5292304043638087, + "grad_norm": 0.6794303625722746, + "learning_rate": 1.5728632942328563e-05, + "loss": 0.0891, + "step": 21329 + }, + { + "epoch": 2.529348986125934, + "grad_norm": 0.5602299732057155, + "learning_rate": 1.572640379150264e-05, + "loss": 0.0649, + "step": 21330 + }, + { + "epoch": 2.5294675678880587, + "grad_norm": 0.6162324998859603, + "learning_rate": 1.5724174726167912e-05, + "loss": 0.0971, + "step": 21331 + }, + { + "epoch": 2.529586149650184, + "grad_norm": 0.6382535469280032, + "learning_rate": 1.5721945746344917e-05, + "loss": 0.0917, + "step": 21332 + }, + { + "epoch": 2.5297047314123087, + "grad_norm": 0.5277946082473173, + "learning_rate": 1.5719716852054204e-05, + "loss": 0.0678, + "step": 21333 + }, + { + "epoch": 2.529823313174434, + "grad_norm": 0.6672909301635114, + "learning_rate": 1.571748804331631e-05, + "loss": 0.0739, + "step": 21334 + }, + { + "epoch": 2.5299418949365586, + "grad_norm": 0.7452947120658344, + "learning_rate": 1.5715259320151806e-05, + "loss": 0.097, + "step": 21335 + }, + { + "epoch": 2.530060476698684, + "grad_norm": 0.5493272464846725, + "learning_rate": 1.5713030682581225e-05, + "loss": 0.0672, + "step": 21336 + }, + { + "epoch": 2.5301790584608086, + "grad_norm": 0.6562297082760508, + "learning_rate": 1.5710802130625102e-05, + "loss": 0.0782, + "step": 21337 + }, + { + "epoch": 2.530297640222934, + "grad_norm": 0.8142567015662449, + "learning_rate": 1.5708573664304004e-05, + "loss": 0.1055, + "step": 21338 + }, + { + "epoch": 2.5304162219850586, + "grad_norm": 0.6832243852624851, + "learning_rate": 1.5706345283638458e-05, + "loss": 0.0966, + "step": 21339 + }, + { + "epoch": 2.5305348037471838, + "grad_norm": 0.5776624940880218, + "learning_rate": 1.5704116988649014e-05, + "loss": 0.0855, + "step": 21340 + }, + { + "epoch": 2.5306533855093085, + "grad_norm": 0.45085309349237535, + "learning_rate": 1.5701888779356204e-05, + "loss": 0.0722, + "step": 21341 + }, + { + "epoch": 2.5307719672714337, + "grad_norm": 0.4581341049199007, + "learning_rate": 1.5699660655780584e-05, + "loss": 0.0712, + "step": 21342 + }, + { + "epoch": 2.5308905490335585, + "grad_norm": 0.8217490533547537, + "learning_rate": 1.5697432617942692e-05, + "loss": 0.1139, + "step": 21343 + }, + { + "epoch": 2.5310091307956837, + "grad_norm": 0.5671148754417874, + "learning_rate": 1.5695204665863065e-05, + "loss": 0.0738, + "step": 21344 + }, + { + "epoch": 2.531127712557809, + "grad_norm": 0.6787102124359455, + "learning_rate": 1.5692976799562227e-05, + "loss": 0.0891, + "step": 21345 + }, + { + "epoch": 2.5312462943199336, + "grad_norm": 0.5643183433788461, + "learning_rate": 1.569074901906074e-05, + "loss": 0.073, + "step": 21346 + }, + { + "epoch": 2.5313648760820584, + "grad_norm": 0.9145787226535894, + "learning_rate": 1.568852132437913e-05, + "loss": 0.1468, + "step": 21347 + }, + { + "epoch": 2.5314834578441836, + "grad_norm": 0.9637523565972025, + "learning_rate": 1.568629371553793e-05, + "loss": 0.1546, + "step": 21348 + }, + { + "epoch": 2.531602039606309, + "grad_norm": 0.575715205523021, + "learning_rate": 1.5684066192557685e-05, + "loss": 0.0878, + "step": 21349 + }, + { + "epoch": 2.5317206213684336, + "grad_norm": 0.6033988695582727, + "learning_rate": 1.5681838755458928e-05, + "loss": 0.0843, + "step": 21350 + }, + { + "epoch": 2.5318392031305583, + "grad_norm": 0.5954625349158761, + "learning_rate": 1.567961140426219e-05, + "loss": 0.0783, + "step": 21351 + }, + { + "epoch": 2.5319577848926835, + "grad_norm": 0.5964618890167728, + "learning_rate": 1.5677384138988e-05, + "loss": 0.0932, + "step": 21352 + }, + { + "epoch": 2.5320763666548087, + "grad_norm": 0.671647332768306, + "learning_rate": 1.56751569596569e-05, + "loss": 0.1, + "step": 21353 + }, + { + "epoch": 2.5321949484169335, + "grad_norm": 0.679007150663439, + "learning_rate": 1.567292986628942e-05, + "loss": 0.1071, + "step": 21354 + }, + { + "epoch": 2.5323135301790582, + "grad_norm": 0.5295665330085858, + "learning_rate": 1.5670702858906094e-05, + "loss": 0.0747, + "step": 21355 + }, + { + "epoch": 2.5324321119411835, + "grad_norm": 0.5475992943520677, + "learning_rate": 1.5668475937527432e-05, + "loss": 0.0741, + "step": 21356 + }, + { + "epoch": 2.5325506937033087, + "grad_norm": 0.7221759018154783, + "learning_rate": 1.566624910217399e-05, + "loss": 0.0975, + "step": 21357 + }, + { + "epoch": 2.5326692754654334, + "grad_norm": 0.7057870014221922, + "learning_rate": 1.566402235286628e-05, + "loss": 0.1027, + "step": 21358 + }, + { + "epoch": 2.532787857227558, + "grad_norm": 0.6032345886881225, + "learning_rate": 1.5661795689624843e-05, + "loss": 0.0728, + "step": 21359 + }, + { + "epoch": 2.5329064389896834, + "grad_norm": 0.7550559841281823, + "learning_rate": 1.5659569112470183e-05, + "loss": 0.1097, + "step": 21360 + }, + { + "epoch": 2.5330250207518086, + "grad_norm": 1.0084981879530714, + "learning_rate": 1.565734262142286e-05, + "loss": 0.1146, + "step": 21361 + }, + { + "epoch": 2.5331436025139333, + "grad_norm": 0.9713821551938056, + "learning_rate": 1.5655116216503375e-05, + "loss": 0.154, + "step": 21362 + }, + { + "epoch": 2.533262184276058, + "grad_norm": 0.7035524880434478, + "learning_rate": 1.565288989773225e-05, + "loss": 0.0904, + "step": 21363 + }, + { + "epoch": 2.5333807660381833, + "grad_norm": 0.5443701854309239, + "learning_rate": 1.565066366513003e-05, + "loss": 0.077, + "step": 21364 + }, + { + "epoch": 2.5334993478003085, + "grad_norm": 0.5578310551512045, + "learning_rate": 1.5648437518717223e-05, + "loss": 0.0858, + "step": 21365 + }, + { + "epoch": 2.5336179295624333, + "grad_norm": 1.0640638221568757, + "learning_rate": 1.564621145851436e-05, + "loss": 0.1337, + "step": 21366 + }, + { + "epoch": 2.533736511324558, + "grad_norm": 0.8377480761565995, + "learning_rate": 1.564398548454194e-05, + "loss": 0.1426, + "step": 21367 + }, + { + "epoch": 2.533855093086683, + "grad_norm": 0.5361796297558525, + "learning_rate": 1.5641759596820517e-05, + "loss": 0.0835, + "step": 21368 + }, + { + "epoch": 2.5339736748488084, + "grad_norm": 0.9823883848553014, + "learning_rate": 1.5639533795370584e-05, + "loss": 0.1114, + "step": 21369 + }, + { + "epoch": 2.534092256610933, + "grad_norm": 0.5313059740724674, + "learning_rate": 1.5637308080212677e-05, + "loss": 0.0795, + "step": 21370 + }, + { + "epoch": 2.5342108383730584, + "grad_norm": 0.7764688797809753, + "learning_rate": 1.56350824513673e-05, + "loss": 0.1257, + "step": 21371 + }, + { + "epoch": 2.534329420135183, + "grad_norm": 0.6919005308959864, + "learning_rate": 1.563285690885499e-05, + "loss": 0.0828, + "step": 21372 + }, + { + "epoch": 2.5344480018973083, + "grad_norm": 0.544993903860499, + "learning_rate": 1.563063145269625e-05, + "loss": 0.0736, + "step": 21373 + }, + { + "epoch": 2.534566583659433, + "grad_norm": 0.5405379773721206, + "learning_rate": 1.56284060829116e-05, + "loss": 0.0854, + "step": 21374 + }, + { + "epoch": 2.5346851654215583, + "grad_norm": 0.6989543644469031, + "learning_rate": 1.5626180799521546e-05, + "loss": 0.0908, + "step": 21375 + }, + { + "epoch": 2.534803747183683, + "grad_norm": 0.7815316312278002, + "learning_rate": 1.5623955602546613e-05, + "loss": 0.1219, + "step": 21376 + }, + { + "epoch": 2.5349223289458083, + "grad_norm": 0.5144071957960994, + "learning_rate": 1.5621730492007312e-05, + "loss": 0.0814, + "step": 21377 + }, + { + "epoch": 2.535040910707933, + "grad_norm": 0.6321964247914796, + "learning_rate": 1.5619505467924157e-05, + "loss": 0.0996, + "step": 21378 + }, + { + "epoch": 2.5351594924700582, + "grad_norm": 0.6857003285264767, + "learning_rate": 1.5617280530317654e-05, + "loss": 0.0919, + "step": 21379 + }, + { + "epoch": 2.535278074232183, + "grad_norm": 0.4153169763339736, + "learning_rate": 1.5615055679208327e-05, + "loss": 0.0521, + "step": 21380 + }, + { + "epoch": 2.535396655994308, + "grad_norm": 0.8701194181280495, + "learning_rate": 1.561283091461668e-05, + "loss": 0.117, + "step": 21381 + }, + { + "epoch": 2.535515237756433, + "grad_norm": 1.1011608010275402, + "learning_rate": 1.5610606236563203e-05, + "loss": 0.1225, + "step": 21382 + }, + { + "epoch": 2.535633819518558, + "grad_norm": 0.62425900057351, + "learning_rate": 1.5608381645068437e-05, + "loss": 0.0721, + "step": 21383 + }, + { + "epoch": 2.535752401280683, + "grad_norm": 0.5114191792089512, + "learning_rate": 1.5606157140152876e-05, + "loss": 0.0679, + "step": 21384 + }, + { + "epoch": 2.535870983042808, + "grad_norm": 0.493276936517542, + "learning_rate": 1.5603932721837025e-05, + "loss": 0.0611, + "step": 21385 + }, + { + "epoch": 2.535989564804933, + "grad_norm": 0.7496334077889462, + "learning_rate": 1.5601708390141385e-05, + "loss": 0.0957, + "step": 21386 + }, + { + "epoch": 2.536108146567058, + "grad_norm": 0.8543792878882609, + "learning_rate": 1.5599484145086473e-05, + "loss": 0.1426, + "step": 21387 + }, + { + "epoch": 2.536226728329183, + "grad_norm": 0.5702387280409568, + "learning_rate": 1.559725998669279e-05, + "loss": 0.0759, + "step": 21388 + }, + { + "epoch": 2.536345310091308, + "grad_norm": 0.6277846728364588, + "learning_rate": 1.5595035914980844e-05, + "loss": 0.0996, + "step": 21389 + }, + { + "epoch": 2.536463891853433, + "grad_norm": 0.5883480082640337, + "learning_rate": 1.559281192997112e-05, + "loss": 0.0775, + "step": 21390 + }, + { + "epoch": 2.536582473615558, + "grad_norm": 0.6124578368288347, + "learning_rate": 1.559058803168415e-05, + "loss": 0.0988, + "step": 21391 + }, + { + "epoch": 2.5367010553776828, + "grad_norm": 0.5528245083018609, + "learning_rate": 1.5588364220140417e-05, + "loss": 0.0956, + "step": 21392 + }, + { + "epoch": 2.536819637139808, + "grad_norm": 0.5001961901196859, + "learning_rate": 1.558614049536041e-05, + "loss": 0.0686, + "step": 21393 + }, + { + "epoch": 2.5369382189019327, + "grad_norm": 0.5972809955739095, + "learning_rate": 1.558391685736466e-05, + "loss": 0.0774, + "step": 21394 + }, + { + "epoch": 2.537056800664058, + "grad_norm": 0.6669966395763532, + "learning_rate": 1.5581693306173646e-05, + "loss": 0.1055, + "step": 21395 + }, + { + "epoch": 2.5371753824261827, + "grad_norm": 0.713907845185839, + "learning_rate": 1.5579469841807866e-05, + "loss": 0.0996, + "step": 21396 + }, + { + "epoch": 2.537293964188308, + "grad_norm": 0.46105404314145265, + "learning_rate": 1.5577246464287825e-05, + "loss": 0.0557, + "step": 21397 + }, + { + "epoch": 2.537412545950433, + "grad_norm": 0.5420500525651486, + "learning_rate": 1.557502317363401e-05, + "loss": 0.0748, + "step": 21398 + }, + { + "epoch": 2.537531127712558, + "grad_norm": 0.9311110346310092, + "learning_rate": 1.5572799969866936e-05, + "loss": 0.1051, + "step": 21399 + }, + { + "epoch": 2.5376497094746826, + "grad_norm": 0.4657393539884365, + "learning_rate": 1.557057685300708e-05, + "loss": 0.069, + "step": 21400 + }, + { + "epoch": 2.537768291236808, + "grad_norm": 0.8134063879912088, + "learning_rate": 1.556835382307494e-05, + "loss": 0.1028, + "step": 21401 + }, + { + "epoch": 2.537886872998933, + "grad_norm": 0.9631775927323866, + "learning_rate": 1.5566130880091012e-05, + "loss": 0.1201, + "step": 21402 + }, + { + "epoch": 2.5380054547610578, + "grad_norm": 0.7152505155982368, + "learning_rate": 1.5563908024075797e-05, + "loss": 0.0933, + "step": 21403 + }, + { + "epoch": 2.5381240365231825, + "grad_norm": 0.7994502238370568, + "learning_rate": 1.5561685255049775e-05, + "loss": 0.1227, + "step": 21404 + }, + { + "epoch": 2.5382426182853077, + "grad_norm": 1.0339864605388562, + "learning_rate": 1.555946257303343e-05, + "loss": 0.1428, + "step": 21405 + }, + { + "epoch": 2.538361200047433, + "grad_norm": 0.7414231040154973, + "learning_rate": 1.5557239978047274e-05, + "loss": 0.0751, + "step": 21406 + }, + { + "epoch": 2.5384797818095577, + "grad_norm": 0.9259074107430612, + "learning_rate": 1.5555017470111782e-05, + "loss": 0.1588, + "step": 21407 + }, + { + "epoch": 2.5385983635716824, + "grad_norm": 0.7932805087623734, + "learning_rate": 1.555279504924745e-05, + "loss": 0.0987, + "step": 21408 + }, + { + "epoch": 2.5387169453338077, + "grad_norm": 0.8855268781604224, + "learning_rate": 1.5550572715474754e-05, + "loss": 0.1438, + "step": 21409 + }, + { + "epoch": 2.538835527095933, + "grad_norm": 0.9026828476576046, + "learning_rate": 1.5548350468814198e-05, + "loss": 0.1254, + "step": 21410 + }, + { + "epoch": 2.5389541088580576, + "grad_norm": 0.9209641152021051, + "learning_rate": 1.5546128309286264e-05, + "loss": 0.0957, + "step": 21411 + }, + { + "epoch": 2.5390726906201824, + "grad_norm": 0.758288466936238, + "learning_rate": 1.5543906236911424e-05, + "loss": 0.1083, + "step": 21412 + }, + { + "epoch": 2.5391912723823076, + "grad_norm": 0.48787556222178047, + "learning_rate": 1.5541684251710182e-05, + "loss": 0.062, + "step": 21413 + }, + { + "epoch": 2.5393098541444328, + "grad_norm": 0.7094957488320865, + "learning_rate": 1.553946235370301e-05, + "loss": 0.1158, + "step": 21414 + }, + { + "epoch": 2.5394284359065575, + "grad_norm": 0.7280227640118505, + "learning_rate": 1.5537240542910396e-05, + "loss": 0.0916, + "step": 21415 + }, + { + "epoch": 2.5395470176686823, + "grad_norm": 0.6456939860591353, + "learning_rate": 1.553501881935281e-05, + "loss": 0.0911, + "step": 21416 + }, + { + "epoch": 2.5396655994308075, + "grad_norm": 0.6817602383441491, + "learning_rate": 1.5532797183050752e-05, + "loss": 0.1093, + "step": 21417 + }, + { + "epoch": 2.5397841811929327, + "grad_norm": 0.8174861987319114, + "learning_rate": 1.553057563402469e-05, + "loss": 0.1081, + "step": 21418 + }, + { + "epoch": 2.5399027629550575, + "grad_norm": 0.5869234126782907, + "learning_rate": 1.5528354172295114e-05, + "loss": 0.0744, + "step": 21419 + }, + { + "epoch": 2.5400213447171827, + "grad_norm": 0.6361858583171346, + "learning_rate": 1.5526132797882486e-05, + "loss": 0.0651, + "step": 21420 + }, + { + "epoch": 2.5401399264793074, + "grad_norm": 0.5402051189483245, + "learning_rate": 1.552391151080731e-05, + "loss": 0.0761, + "step": 21421 + }, + { + "epoch": 2.5402585082414326, + "grad_norm": 0.6763458230637077, + "learning_rate": 1.5521690311090048e-05, + "loss": 0.1024, + "step": 21422 + }, + { + "epoch": 2.5403770900035574, + "grad_norm": 0.7096811962777245, + "learning_rate": 1.5519469198751166e-05, + "loss": 0.0933, + "step": 21423 + }, + { + "epoch": 2.5404956717656826, + "grad_norm": 0.9749822315615653, + "learning_rate": 1.5517248173811163e-05, + "loss": 0.1364, + "step": 21424 + }, + { + "epoch": 2.5406142535278073, + "grad_norm": 0.5835655812161914, + "learning_rate": 1.5515027236290503e-05, + "loss": 0.0977, + "step": 21425 + }, + { + "epoch": 2.5407328352899325, + "grad_norm": 0.5919165998638725, + "learning_rate": 1.5512806386209658e-05, + "loss": 0.0911, + "step": 21426 + }, + { + "epoch": 2.5408514170520573, + "grad_norm": 0.662203739211671, + "learning_rate": 1.55105856235891e-05, + "loss": 0.1059, + "step": 21427 + }, + { + "epoch": 2.5409699988141825, + "grad_norm": 0.5889288540780524, + "learning_rate": 1.550836494844931e-05, + "loss": 0.0782, + "step": 21428 + }, + { + "epoch": 2.5410885805763073, + "grad_norm": 0.5725810396661768, + "learning_rate": 1.550614436081076e-05, + "loss": 0.0763, + "step": 21429 + }, + { + "epoch": 2.5412071623384325, + "grad_norm": 0.6387576091701502, + "learning_rate": 1.5503923860693917e-05, + "loss": 0.1007, + "step": 21430 + }, + { + "epoch": 2.5413257441005572, + "grad_norm": 0.7429378169660595, + "learning_rate": 1.550170344811924e-05, + "loss": 0.1034, + "step": 21431 + }, + { + "epoch": 2.5414443258626824, + "grad_norm": 0.6366780073179603, + "learning_rate": 1.549948312310722e-05, + "loss": 0.1024, + "step": 21432 + }, + { + "epoch": 2.541562907624807, + "grad_norm": 0.9647917424638454, + "learning_rate": 1.5497262885678315e-05, + "loss": 0.1033, + "step": 21433 + }, + { + "epoch": 2.5416814893869324, + "grad_norm": 0.558587641076497, + "learning_rate": 1.5495042735852983e-05, + "loss": 0.0784, + "step": 21434 + }, + { + "epoch": 2.541800071149057, + "grad_norm": 0.5147839030478643, + "learning_rate": 1.5492822673651712e-05, + "loss": 0.0755, + "step": 21435 + }, + { + "epoch": 2.5419186529111824, + "grad_norm": 0.5052937601392505, + "learning_rate": 1.549060269909496e-05, + "loss": 0.0734, + "step": 21436 + }, + { + "epoch": 2.542037234673307, + "grad_norm": 0.6818045785775417, + "learning_rate": 1.548838281220318e-05, + "loss": 0.0879, + "step": 21437 + }, + { + "epoch": 2.5421558164354323, + "grad_norm": 0.6562784640422374, + "learning_rate": 1.548616301299685e-05, + "loss": 0.0879, + "step": 21438 + }, + { + "epoch": 2.542274398197557, + "grad_norm": 1.0696565845285737, + "learning_rate": 1.5483943301496428e-05, + "loss": 0.1381, + "step": 21439 + }, + { + "epoch": 2.5423929799596823, + "grad_norm": 0.6283801722724008, + "learning_rate": 1.548172367772239e-05, + "loss": 0.0894, + "step": 21440 + }, + { + "epoch": 2.542511561721807, + "grad_norm": 0.843453889689084, + "learning_rate": 1.547950414169518e-05, + "loss": 0.1094, + "step": 21441 + }, + { + "epoch": 2.5426301434839322, + "grad_norm": 1.1521712014606824, + "learning_rate": 1.5477284693435257e-05, + "loss": 0.1656, + "step": 21442 + }, + { + "epoch": 2.542748725246057, + "grad_norm": 0.45660154182440227, + "learning_rate": 1.5475065332963106e-05, + "loss": 0.0822, + "step": 21443 + }, + { + "epoch": 2.542867307008182, + "grad_norm": 0.7739344402713508, + "learning_rate": 1.5472846060299172e-05, + "loss": 0.0998, + "step": 21444 + }, + { + "epoch": 2.542985888770307, + "grad_norm": 0.6289935742838848, + "learning_rate": 1.547062687546391e-05, + "loss": 0.0859, + "step": 21445 + }, + { + "epoch": 2.543104470532432, + "grad_norm": 0.644146346376722, + "learning_rate": 1.5468407778477778e-05, + "loss": 0.094, + "step": 21446 + }, + { + "epoch": 2.5432230522945574, + "grad_norm": 0.9789036341628862, + "learning_rate": 1.5466188769361244e-05, + "loss": 0.1321, + "step": 21447 + }, + { + "epoch": 2.543341634056682, + "grad_norm": 0.8966140991627146, + "learning_rate": 1.5463969848134756e-05, + "loss": 0.1209, + "step": 21448 + }, + { + "epoch": 2.543460215818807, + "grad_norm": 0.6114925988228893, + "learning_rate": 1.5461751014818775e-05, + "loss": 0.0679, + "step": 21449 + }, + { + "epoch": 2.543578797580932, + "grad_norm": 0.5033737544656481, + "learning_rate": 1.5459532269433748e-05, + "loss": 0.061, + "step": 21450 + }, + { + "epoch": 2.5436973793430573, + "grad_norm": 0.6233589067309314, + "learning_rate": 1.545731361200014e-05, + "loss": 0.0851, + "step": 21451 + }, + { + "epoch": 2.543815961105182, + "grad_norm": 0.524864858780048, + "learning_rate": 1.5455095042538398e-05, + "loss": 0.0695, + "step": 21452 + }, + { + "epoch": 2.543934542867307, + "grad_norm": 0.6301121761405246, + "learning_rate": 1.545287656106897e-05, + "loss": 0.1054, + "step": 21453 + }, + { + "epoch": 2.544053124629432, + "grad_norm": 0.7239488665118252, + "learning_rate": 1.5450658167612316e-05, + "loss": 0.1002, + "step": 21454 + }, + { + "epoch": 2.544171706391557, + "grad_norm": 0.8667762040345961, + "learning_rate": 1.5448439862188883e-05, + "loss": 0.1026, + "step": 21455 + }, + { + "epoch": 2.544290288153682, + "grad_norm": 0.7148979238998129, + "learning_rate": 1.5446221644819124e-05, + "loss": 0.093, + "step": 21456 + }, + { + "epoch": 2.5444088699158067, + "grad_norm": 0.5987976989961494, + "learning_rate": 1.544400351552348e-05, + "loss": 0.0864, + "step": 21457 + }, + { + "epoch": 2.544527451677932, + "grad_norm": 0.8563346723131567, + "learning_rate": 1.5441785474322405e-05, + "loss": 0.1164, + "step": 21458 + }, + { + "epoch": 2.544646033440057, + "grad_norm": 0.6076988941050389, + "learning_rate": 1.5439567521236358e-05, + "loss": 0.0709, + "step": 21459 + }, + { + "epoch": 2.544764615202182, + "grad_norm": 0.48474161477327726, + "learning_rate": 1.5437349656285772e-05, + "loss": 0.0779, + "step": 21460 + }, + { + "epoch": 2.5448831969643066, + "grad_norm": 0.6433545354337432, + "learning_rate": 1.5435131879491083e-05, + "loss": 0.0803, + "step": 21461 + }, + { + "epoch": 2.545001778726432, + "grad_norm": 0.38199027031267835, + "learning_rate": 1.5432914190872757e-05, + "loss": 0.0453, + "step": 21462 + }, + { + "epoch": 2.545120360488557, + "grad_norm": 0.7789196129436371, + "learning_rate": 1.5430696590451235e-05, + "loss": 0.0753, + "step": 21463 + }, + { + "epoch": 2.545238942250682, + "grad_norm": 0.8602037024729732, + "learning_rate": 1.5428479078246944e-05, + "loss": 0.0913, + "step": 21464 + }, + { + "epoch": 2.5453575240128066, + "grad_norm": 0.6951208287350019, + "learning_rate": 1.542626165428035e-05, + "loss": 0.1, + "step": 21465 + }, + { + "epoch": 2.5454761057749318, + "grad_norm": 0.7912608544861333, + "learning_rate": 1.5424044318571877e-05, + "loss": 0.1153, + "step": 21466 + }, + { + "epoch": 2.545594687537057, + "grad_norm": 1.3073424004298193, + "learning_rate": 1.5421827071141975e-05, + "loss": 0.1364, + "step": 21467 + }, + { + "epoch": 2.5457132692991817, + "grad_norm": 0.5463785718416431, + "learning_rate": 1.541960991201108e-05, + "loss": 0.0915, + "step": 21468 + }, + { + "epoch": 2.5458318510613065, + "grad_norm": 0.5173562358048313, + "learning_rate": 1.5417392841199632e-05, + "loss": 0.0584, + "step": 21469 + }, + { + "epoch": 2.5459504328234317, + "grad_norm": 1.1440123620017242, + "learning_rate": 1.5415175858728076e-05, + "loss": 0.1247, + "step": 21470 + }, + { + "epoch": 2.546069014585557, + "grad_norm": 0.5773117610614737, + "learning_rate": 1.5412958964616846e-05, + "loss": 0.0834, + "step": 21471 + }, + { + "epoch": 2.5461875963476817, + "grad_norm": 0.5004584238614178, + "learning_rate": 1.541074215888637e-05, + "loss": 0.0656, + "step": 21472 + }, + { + "epoch": 2.546306178109807, + "grad_norm": 0.4580511828382621, + "learning_rate": 1.5408525441557098e-05, + "loss": 0.0666, + "step": 21473 + }, + { + "epoch": 2.5464247598719316, + "grad_norm": 0.6678188521107324, + "learning_rate": 1.540630881264946e-05, + "loss": 0.0827, + "step": 21474 + }, + { + "epoch": 2.546543341634057, + "grad_norm": 0.9138976732676265, + "learning_rate": 1.540409227218389e-05, + "loss": 0.1249, + "step": 21475 + }, + { + "epoch": 2.5466619233961816, + "grad_norm": 0.499510770625477, + "learning_rate": 1.5401875820180818e-05, + "loss": 0.0755, + "step": 21476 + }, + { + "epoch": 2.546780505158307, + "grad_norm": 0.6844799929605587, + "learning_rate": 1.5399659456660682e-05, + "loss": 0.0847, + "step": 21477 + }, + { + "epoch": 2.5468990869204315, + "grad_norm": 0.7893369095330867, + "learning_rate": 1.5397443181643915e-05, + "loss": 0.1134, + "step": 21478 + }, + { + "epoch": 2.5470176686825567, + "grad_norm": 0.6559199531640769, + "learning_rate": 1.539522699515094e-05, + "loss": 0.0902, + "step": 21479 + }, + { + "epoch": 2.5471362504446815, + "grad_norm": 0.5614910780279838, + "learning_rate": 1.5393010897202204e-05, + "loss": 0.0683, + "step": 21480 + }, + { + "epoch": 2.5472548322068067, + "grad_norm": 0.7058706454508963, + "learning_rate": 1.5390794887818124e-05, + "loss": 0.0839, + "step": 21481 + }, + { + "epoch": 2.5473734139689315, + "grad_norm": 0.6620980111975824, + "learning_rate": 1.5388578967019135e-05, + "loss": 0.1026, + "step": 21482 + }, + { + "epoch": 2.5474919957310567, + "grad_norm": 0.7335341191367418, + "learning_rate": 1.5386363134825647e-05, + "loss": 0.0899, + "step": 21483 + }, + { + "epoch": 2.5476105774931814, + "grad_norm": 0.5748404841842454, + "learning_rate": 1.5384147391258117e-05, + "loss": 0.0666, + "step": 21484 + }, + { + "epoch": 2.5477291592553066, + "grad_norm": 0.956147980821703, + "learning_rate": 1.5381931736336953e-05, + "loss": 0.1263, + "step": 21485 + }, + { + "epoch": 2.5478477410174314, + "grad_norm": 0.6493586498898836, + "learning_rate": 1.537971617008258e-05, + "loss": 0.0819, + "step": 21486 + }, + { + "epoch": 2.5479663227795566, + "grad_norm": 0.5636166612439507, + "learning_rate": 1.5377500692515428e-05, + "loss": 0.0787, + "step": 21487 + }, + { + "epoch": 2.5480849045416814, + "grad_norm": 0.6431838170284558, + "learning_rate": 1.537528530365592e-05, + "loss": 0.0763, + "step": 21488 + }, + { + "epoch": 2.5482034863038066, + "grad_norm": 0.7746290430475276, + "learning_rate": 1.5373070003524482e-05, + "loss": 0.0946, + "step": 21489 + }, + { + "epoch": 2.5483220680659313, + "grad_norm": 0.7160616558953101, + "learning_rate": 1.5370854792141532e-05, + "loss": 0.0871, + "step": 21490 + }, + { + "epoch": 2.5484406498280565, + "grad_norm": 0.5184570047247066, + "learning_rate": 1.5368639669527483e-05, + "loss": 0.0783, + "step": 21491 + }, + { + "epoch": 2.5485592315901813, + "grad_norm": 0.8687831063111374, + "learning_rate": 1.5366424635702775e-05, + "loss": 0.103, + "step": 21492 + }, + { + "epoch": 2.5486778133523065, + "grad_norm": 0.524412475977883, + "learning_rate": 1.536420969068782e-05, + "loss": 0.0667, + "step": 21493 + }, + { + "epoch": 2.5487963951144312, + "grad_norm": 1.1075373021684356, + "learning_rate": 1.536199483450303e-05, + "loss": 0.1765, + "step": 21494 + }, + { + "epoch": 2.5489149768765564, + "grad_norm": 0.5227129131950715, + "learning_rate": 1.5359780067168832e-05, + "loss": 0.0766, + "step": 21495 + }, + { + "epoch": 2.5490335586386816, + "grad_norm": 0.46669929946426936, + "learning_rate": 1.535756538870564e-05, + "loss": 0.063, + "step": 21496 + }, + { + "epoch": 2.5491521404008064, + "grad_norm": 0.9826810859167289, + "learning_rate": 1.5355350799133873e-05, + "loss": 0.1073, + "step": 21497 + }, + { + "epoch": 2.549270722162931, + "grad_norm": 0.549185901708805, + "learning_rate": 1.535313629847394e-05, + "loss": 0.0775, + "step": 21498 + }, + { + "epoch": 2.5493893039250564, + "grad_norm": 0.5628427233800181, + "learning_rate": 1.535092188674626e-05, + "loss": 0.066, + "step": 21499 + }, + { + "epoch": 2.5495078856871816, + "grad_norm": 0.5671079796931414, + "learning_rate": 1.534870756397125e-05, + "loss": 0.083, + "step": 21500 + }, + { + "epoch": 2.5496264674493063, + "grad_norm": 0.600349483300442, + "learning_rate": 1.5346493330169327e-05, + "loss": 0.0864, + "step": 21501 + }, + { + "epoch": 2.549745049211431, + "grad_norm": 0.9406665216622622, + "learning_rate": 1.5344279185360883e-05, + "loss": 0.1381, + "step": 21502 + }, + { + "epoch": 2.5498636309735563, + "grad_norm": 0.7734731884328327, + "learning_rate": 1.534206512956636e-05, + "loss": 0.0941, + "step": 21503 + }, + { + "epoch": 2.5499822127356815, + "grad_norm": 0.4973297547910033, + "learning_rate": 1.5339851162806147e-05, + "loss": 0.0677, + "step": 21504 + }, + { + "epoch": 2.5501007944978062, + "grad_norm": 0.5435677813430415, + "learning_rate": 1.5337637285100653e-05, + "loss": 0.0783, + "step": 21505 + }, + { + "epoch": 2.550219376259931, + "grad_norm": 0.8005652512421672, + "learning_rate": 1.5335423496470302e-05, + "loss": 0.096, + "step": 21506 + }, + { + "epoch": 2.550337958022056, + "grad_norm": 0.6899938943839169, + "learning_rate": 1.5333209796935494e-05, + "loss": 0.0875, + "step": 21507 + }, + { + "epoch": 2.5504565397841814, + "grad_norm": 0.6756843285357619, + "learning_rate": 1.533099618651664e-05, + "loss": 0.0827, + "step": 21508 + }, + { + "epoch": 2.550575121546306, + "grad_norm": 0.5848270113345713, + "learning_rate": 1.5328782665234132e-05, + "loss": 0.0771, + "step": 21509 + }, + { + "epoch": 2.550693703308431, + "grad_norm": 0.7539631275597056, + "learning_rate": 1.5326569233108402e-05, + "loss": 0.0928, + "step": 21510 + }, + { + "epoch": 2.550812285070556, + "grad_norm": 0.7484594525889698, + "learning_rate": 1.5324355890159842e-05, + "loss": 0.0958, + "step": 21511 + }, + { + "epoch": 2.5509308668326813, + "grad_norm": 1.4495184427474699, + "learning_rate": 1.5322142636408855e-05, + "loss": 0.1372, + "step": 21512 + }, + { + "epoch": 2.551049448594806, + "grad_norm": 0.4575057816101567, + "learning_rate": 1.5319929471875832e-05, + "loss": 0.0721, + "step": 21513 + }, + { + "epoch": 2.551168030356931, + "grad_norm": 0.6731440760318099, + "learning_rate": 1.53177163965812e-05, + "loss": 0.0861, + "step": 21514 + }, + { + "epoch": 2.551286612119056, + "grad_norm": 0.9305714541728859, + "learning_rate": 1.531550341054535e-05, + "loss": 0.1016, + "step": 21515 + }, + { + "epoch": 2.5514051938811813, + "grad_norm": 0.6599011524908213, + "learning_rate": 1.531329051378868e-05, + "loss": 0.0834, + "step": 21516 + }, + { + "epoch": 2.551523775643306, + "grad_norm": 0.6787023871294922, + "learning_rate": 1.531107770633159e-05, + "loss": 0.1017, + "step": 21517 + }, + { + "epoch": 2.5516423574054308, + "grad_norm": 0.49962345968484945, + "learning_rate": 1.530886498819448e-05, + "loss": 0.0678, + "step": 21518 + }, + { + "epoch": 2.551760939167556, + "grad_norm": 0.5638828639428904, + "learning_rate": 1.530665235939776e-05, + "loss": 0.0683, + "step": 21519 + }, + { + "epoch": 2.551879520929681, + "grad_norm": 0.6929171020453118, + "learning_rate": 1.5304439819961807e-05, + "loss": 0.1081, + "step": 21520 + }, + { + "epoch": 2.551998102691806, + "grad_norm": 0.5746706264906336, + "learning_rate": 1.530222736990704e-05, + "loss": 0.0686, + "step": 21521 + }, + { + "epoch": 2.552116684453931, + "grad_norm": 0.6234282627737747, + "learning_rate": 1.5300015009253842e-05, + "loss": 0.0825, + "step": 21522 + }, + { + "epoch": 2.552235266216056, + "grad_norm": 0.615533535388, + "learning_rate": 1.5297802738022616e-05, + "loss": 0.0889, + "step": 21523 + }, + { + "epoch": 2.552353847978181, + "grad_norm": 0.4376768892716821, + "learning_rate": 1.5295590556233735e-05, + "loss": 0.0586, + "step": 21524 + }, + { + "epoch": 2.552472429740306, + "grad_norm": 0.856561450005793, + "learning_rate": 1.5293378463907624e-05, + "loss": 0.1424, + "step": 21525 + }, + { + "epoch": 2.552591011502431, + "grad_norm": 0.6097963834290009, + "learning_rate": 1.529116646106466e-05, + "loss": 0.0993, + "step": 21526 + }, + { + "epoch": 2.552709593264556, + "grad_norm": 0.529370514111545, + "learning_rate": 1.5288954547725225e-05, + "loss": 0.0626, + "step": 21527 + }, + { + "epoch": 2.552828175026681, + "grad_norm": 0.5202038813083469, + "learning_rate": 1.5286742723909724e-05, + "loss": 0.0791, + "step": 21528 + }, + { + "epoch": 2.552946756788806, + "grad_norm": 0.4763423986612556, + "learning_rate": 1.5284530989638545e-05, + "loss": 0.0634, + "step": 21529 + }, + { + "epoch": 2.553065338550931, + "grad_norm": 1.1495245062671344, + "learning_rate": 1.528231934493208e-05, + "loss": 0.1227, + "step": 21530 + }, + { + "epoch": 2.5531839203130557, + "grad_norm": 0.7378088365454362, + "learning_rate": 1.5280107789810717e-05, + "loss": 0.1023, + "step": 21531 + }, + { + "epoch": 2.553302502075181, + "grad_norm": 0.7000303954421144, + "learning_rate": 1.5277896324294828e-05, + "loss": 0.0864, + "step": 21532 + }, + { + "epoch": 2.5534210838373057, + "grad_norm": 0.6317022533748192, + "learning_rate": 1.5275684948404823e-05, + "loss": 0.0903, + "step": 21533 + }, + { + "epoch": 2.553539665599431, + "grad_norm": 0.6750427146372631, + "learning_rate": 1.5273473662161076e-05, + "loss": 0.0905, + "step": 21534 + }, + { + "epoch": 2.5536582473615557, + "grad_norm": 0.801380869294865, + "learning_rate": 1.527126246558397e-05, + "loss": 0.0872, + "step": 21535 + }, + { + "epoch": 2.553776829123681, + "grad_norm": 0.4344270657209867, + "learning_rate": 1.5269051358693898e-05, + "loss": 0.0639, + "step": 21536 + }, + { + "epoch": 2.5538954108858056, + "grad_norm": 0.8117982343733693, + "learning_rate": 1.5266840341511233e-05, + "loss": 0.1041, + "step": 21537 + }, + { + "epoch": 2.554013992647931, + "grad_norm": 0.747367452759796, + "learning_rate": 1.5264629414056375e-05, + "loss": 0.1041, + "step": 21538 + }, + { + "epoch": 2.5541325744100556, + "grad_norm": 0.6489958032485993, + "learning_rate": 1.5262418576349685e-05, + "loss": 0.0632, + "step": 21539 + }, + { + "epoch": 2.554251156172181, + "grad_norm": 0.5033593392146642, + "learning_rate": 1.5260207828411564e-05, + "loss": 0.086, + "step": 21540 + }, + { + "epoch": 2.5543697379343056, + "grad_norm": 0.5547911340090377, + "learning_rate": 1.5257997170262383e-05, + "loss": 0.0791, + "step": 21541 + }, + { + "epoch": 2.5544883196964308, + "grad_norm": 0.817274504252948, + "learning_rate": 1.5255786601922523e-05, + "loss": 0.1085, + "step": 21542 + }, + { + "epoch": 2.5546069014585555, + "grad_norm": 0.48362533508957767, + "learning_rate": 1.525357612341235e-05, + "loss": 0.0756, + "step": 21543 + }, + { + "epoch": 2.5547254832206807, + "grad_norm": 0.4849448539412303, + "learning_rate": 1.5251365734752262e-05, + "loss": 0.073, + "step": 21544 + }, + { + "epoch": 2.554844064982806, + "grad_norm": 0.6699229826309621, + "learning_rate": 1.5249155435962632e-05, + "loss": 0.0793, + "step": 21545 + }, + { + "epoch": 2.5549626467449307, + "grad_norm": 0.9567447350571209, + "learning_rate": 1.5246945227063818e-05, + "loss": 0.1037, + "step": 21546 + }, + { + "epoch": 2.5550812285070554, + "grad_norm": 0.7191741995384846, + "learning_rate": 1.5244735108076216e-05, + "loss": 0.0727, + "step": 21547 + }, + { + "epoch": 2.5551998102691806, + "grad_norm": 0.7371644389664014, + "learning_rate": 1.5242525079020192e-05, + "loss": 0.0979, + "step": 21548 + }, + { + "epoch": 2.555318392031306, + "grad_norm": 0.45144490793855985, + "learning_rate": 1.5240315139916128e-05, + "loss": 0.0508, + "step": 21549 + }, + { + "epoch": 2.5554369737934306, + "grad_norm": 0.5353253455466163, + "learning_rate": 1.5238105290784377e-05, + "loss": 0.0663, + "step": 21550 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 0.6751711314775343, + "learning_rate": 1.5235895531645336e-05, + "loss": 0.1007, + "step": 21551 + }, + { + "epoch": 2.5556741373176806, + "grad_norm": 0.6436186272622699, + "learning_rate": 1.5233685862519364e-05, + "loss": 0.0837, + "step": 21552 + }, + { + "epoch": 2.5557927190798058, + "grad_norm": 0.5608886882365024, + "learning_rate": 1.5231476283426832e-05, + "loss": 0.0879, + "step": 21553 + }, + { + "epoch": 2.5559113008419305, + "grad_norm": 0.71393162187896, + "learning_rate": 1.5229266794388103e-05, + "loss": 0.1135, + "step": 21554 + }, + { + "epoch": 2.5560298826040553, + "grad_norm": 0.5518071095610267, + "learning_rate": 1.5227057395423558e-05, + "loss": 0.0845, + "step": 21555 + }, + { + "epoch": 2.5561484643661805, + "grad_norm": 0.8540694201828075, + "learning_rate": 1.5224848086553555e-05, + "loss": 0.1272, + "step": 21556 + }, + { + "epoch": 2.5562670461283057, + "grad_norm": 0.7786594827888234, + "learning_rate": 1.5222638867798473e-05, + "loss": 0.0891, + "step": 21557 + }, + { + "epoch": 2.5563856278904304, + "grad_norm": 0.8172627050172733, + "learning_rate": 1.522042973917866e-05, + "loss": 0.1143, + "step": 21558 + }, + { + "epoch": 2.556504209652555, + "grad_norm": 1.0522725207218766, + "learning_rate": 1.5218220700714503e-05, + "loss": 0.1301, + "step": 21559 + }, + { + "epoch": 2.5566227914146804, + "grad_norm": 0.6218741935041552, + "learning_rate": 1.5216011752426355e-05, + "loss": 0.0869, + "step": 21560 + }, + { + "epoch": 2.5567413731768056, + "grad_norm": 0.6670783372953689, + "learning_rate": 1.5213802894334578e-05, + "loss": 0.0885, + "step": 21561 + }, + { + "epoch": 2.5568599549389304, + "grad_norm": 0.8533189327687157, + "learning_rate": 1.5211594126459533e-05, + "loss": 0.0964, + "step": 21562 + }, + { + "epoch": 2.556978536701055, + "grad_norm": 0.47046287925409436, + "learning_rate": 1.5209385448821592e-05, + "loss": 0.061, + "step": 21563 + }, + { + "epoch": 2.5570971184631803, + "grad_norm": 0.7877904224778446, + "learning_rate": 1.5207176861441113e-05, + "loss": 0.1373, + "step": 21564 + }, + { + "epoch": 2.5572157002253055, + "grad_norm": 0.5656624633846677, + "learning_rate": 1.5204968364338446e-05, + "loss": 0.082, + "step": 21565 + }, + { + "epoch": 2.5573342819874303, + "grad_norm": 0.5718776034419393, + "learning_rate": 1.5202759957533968e-05, + "loss": 0.0789, + "step": 21566 + }, + { + "epoch": 2.557452863749555, + "grad_norm": 0.802213385299035, + "learning_rate": 1.5200551641048022e-05, + "loss": 0.1035, + "step": 21567 + }, + { + "epoch": 2.5575714455116803, + "grad_norm": 0.5474683267121736, + "learning_rate": 1.519834341490098e-05, + "loss": 0.0757, + "step": 21568 + }, + { + "epoch": 2.5576900272738055, + "grad_norm": 0.6456820387436741, + "learning_rate": 1.5196135279113183e-05, + "loss": 0.0914, + "step": 21569 + }, + { + "epoch": 2.55780860903593, + "grad_norm": 0.49256358254202215, + "learning_rate": 1.5193927233705007e-05, + "loss": 0.0609, + "step": 21570 + }, + { + "epoch": 2.5579271907980554, + "grad_norm": 0.5554933670644486, + "learning_rate": 1.5191719278696795e-05, + "loss": 0.0877, + "step": 21571 + }, + { + "epoch": 2.55804577256018, + "grad_norm": 0.4525755948239112, + "learning_rate": 1.5189511414108903e-05, + "loss": 0.0681, + "step": 21572 + }, + { + "epoch": 2.5581643543223054, + "grad_norm": 0.9855266267066317, + "learning_rate": 1.5187303639961678e-05, + "loss": 0.1278, + "step": 21573 + }, + { + "epoch": 2.55828293608443, + "grad_norm": 0.6287277273350552, + "learning_rate": 1.5185095956275489e-05, + "loss": 0.0874, + "step": 21574 + }, + { + "epoch": 2.5584015178465553, + "grad_norm": 0.8867984031848025, + "learning_rate": 1.518288836307068e-05, + "loss": 0.138, + "step": 21575 + }, + { + "epoch": 2.55852009960868, + "grad_norm": 0.7649660844028009, + "learning_rate": 1.5180680860367594e-05, + "loss": 0.1038, + "step": 21576 + }, + { + "epoch": 2.5586386813708053, + "grad_norm": 0.5521774311611686, + "learning_rate": 1.5178473448186593e-05, + "loss": 0.0716, + "step": 21577 + }, + { + "epoch": 2.55875726313293, + "grad_norm": 0.7502675002995498, + "learning_rate": 1.517626612654802e-05, + "loss": 0.1062, + "step": 21578 + }, + { + "epoch": 2.5588758448950553, + "grad_norm": 0.7428038468458193, + "learning_rate": 1.5174058895472232e-05, + "loss": 0.0983, + "step": 21579 + }, + { + "epoch": 2.55899442665718, + "grad_norm": 0.47134660894857666, + "learning_rate": 1.5171851754979564e-05, + "loss": 0.0556, + "step": 21580 + }, + { + "epoch": 2.5591130084193052, + "grad_norm": 0.4452400363851594, + "learning_rate": 1.5169644705090378e-05, + "loss": 0.0595, + "step": 21581 + }, + { + "epoch": 2.55923159018143, + "grad_norm": 0.8087707627688798, + "learning_rate": 1.5167437745825013e-05, + "loss": 0.0928, + "step": 21582 + }, + { + "epoch": 2.559350171943555, + "grad_norm": 0.9393552633573798, + "learning_rate": 1.5165230877203817e-05, + "loss": 0.1254, + "step": 21583 + }, + { + "epoch": 2.55946875370568, + "grad_norm": 0.75587454669452, + "learning_rate": 1.5163024099247119e-05, + "loss": 0.1183, + "step": 21584 + }, + { + "epoch": 2.559587335467805, + "grad_norm": 0.8666801441090518, + "learning_rate": 1.5160817411975286e-05, + "loss": 0.0925, + "step": 21585 + }, + { + "epoch": 2.55970591722993, + "grad_norm": 0.508309015728604, + "learning_rate": 1.515861081540865e-05, + "loss": 0.0602, + "step": 21586 + }, + { + "epoch": 2.559824498992055, + "grad_norm": 0.5378086757683902, + "learning_rate": 1.5156404309567558e-05, + "loss": 0.0886, + "step": 21587 + }, + { + "epoch": 2.55994308075418, + "grad_norm": 0.8886868812189218, + "learning_rate": 1.5154197894472333e-05, + "loss": 0.1111, + "step": 21588 + }, + { + "epoch": 2.560061662516305, + "grad_norm": 0.6058173741040422, + "learning_rate": 1.5151991570143342e-05, + "loss": 0.0842, + "step": 21589 + }, + { + "epoch": 2.56018024427843, + "grad_norm": 0.5930147362267632, + "learning_rate": 1.5149785336600913e-05, + "loss": 0.0719, + "step": 21590 + }, + { + "epoch": 2.560298826040555, + "grad_norm": 0.7174754996131719, + "learning_rate": 1.5147579193865374e-05, + "loss": 0.0915, + "step": 21591 + }, + { + "epoch": 2.56041740780268, + "grad_norm": 0.7454602225265462, + "learning_rate": 1.5145373141957084e-05, + "loss": 0.1141, + "step": 21592 + }, + { + "epoch": 2.560535989564805, + "grad_norm": 0.7526547527083474, + "learning_rate": 1.5143167180896367e-05, + "loss": 0.1239, + "step": 21593 + }, + { + "epoch": 2.56065457132693, + "grad_norm": 0.60874423813648, + "learning_rate": 1.5140961310703562e-05, + "loss": 0.0827, + "step": 21594 + }, + { + "epoch": 2.560773153089055, + "grad_norm": 0.7418962254164355, + "learning_rate": 1.5138755531398996e-05, + "loss": 0.09, + "step": 21595 + }, + { + "epoch": 2.5608917348511797, + "grad_norm": 0.5761304807459101, + "learning_rate": 1.5136549843003017e-05, + "loss": 0.072, + "step": 21596 + }, + { + "epoch": 2.561010316613305, + "grad_norm": 0.5189049548131666, + "learning_rate": 1.5134344245535953e-05, + "loss": 0.0703, + "step": 21597 + }, + { + "epoch": 2.56112889837543, + "grad_norm": 0.7133618923613796, + "learning_rate": 1.5132138739018136e-05, + "loss": 0.0897, + "step": 21598 + }, + { + "epoch": 2.561247480137555, + "grad_norm": 0.5076156837825103, + "learning_rate": 1.5129933323469895e-05, + "loss": 0.0627, + "step": 21599 + }, + { + "epoch": 2.5613660618996796, + "grad_norm": 0.4840020260479851, + "learning_rate": 1.5127727998911572e-05, + "loss": 0.0668, + "step": 21600 + }, + { + "epoch": 2.561484643661805, + "grad_norm": 0.583807374840249, + "learning_rate": 1.5125522765363492e-05, + "loss": 0.078, + "step": 21601 + }, + { + "epoch": 2.56160322542393, + "grad_norm": 0.7137320991535598, + "learning_rate": 1.512331762284598e-05, + "loss": 0.0922, + "step": 21602 + }, + { + "epoch": 2.561721807186055, + "grad_norm": 0.7249578771701454, + "learning_rate": 1.5121112571379362e-05, + "loss": 0.1105, + "step": 21603 + }, + { + "epoch": 2.5618403889481796, + "grad_norm": 0.4693733682727568, + "learning_rate": 1.5118907610983978e-05, + "loss": 0.0649, + "step": 21604 + }, + { + "epoch": 2.5619589707103048, + "grad_norm": 0.8654648798780028, + "learning_rate": 1.511670274168015e-05, + "loss": 0.0888, + "step": 21605 + }, + { + "epoch": 2.56207755247243, + "grad_norm": 0.41193765234083096, + "learning_rate": 1.5114497963488198e-05, + "loss": 0.0667, + "step": 21606 + }, + { + "epoch": 2.5621961342345547, + "grad_norm": 0.7793662448730261, + "learning_rate": 1.5112293276428452e-05, + "loss": 0.1249, + "step": 21607 + }, + { + "epoch": 2.5623147159966795, + "grad_norm": 0.741628239750056, + "learning_rate": 1.5110088680521245e-05, + "loss": 0.1322, + "step": 21608 + }, + { + "epoch": 2.5624332977588047, + "grad_norm": 0.694177962820042, + "learning_rate": 1.510788417578689e-05, + "loss": 0.0814, + "step": 21609 + }, + { + "epoch": 2.56255187952093, + "grad_norm": 0.5854668993509184, + "learning_rate": 1.5105679762245706e-05, + "loss": 0.0923, + "step": 21610 + }, + { + "epoch": 2.5626704612830546, + "grad_norm": 0.5948354501133889, + "learning_rate": 1.5103475439918027e-05, + "loss": 0.0742, + "step": 21611 + }, + { + "epoch": 2.5627890430451794, + "grad_norm": 1.0591606325544272, + "learning_rate": 1.5101271208824168e-05, + "loss": 0.1376, + "step": 21612 + }, + { + "epoch": 2.5629076248073046, + "grad_norm": 0.6427837584769859, + "learning_rate": 1.5099067068984454e-05, + "loss": 0.087, + "step": 21613 + }, + { + "epoch": 2.56302620656943, + "grad_norm": 0.9129566004494236, + "learning_rate": 1.509686302041919e-05, + "loss": 0.0492, + "step": 21614 + }, + { + "epoch": 2.5631447883315546, + "grad_norm": 0.7529971953263135, + "learning_rate": 1.5094659063148714e-05, + "loss": 0.0957, + "step": 21615 + }, + { + "epoch": 2.5632633700936793, + "grad_norm": 0.8433332771390282, + "learning_rate": 1.5092455197193328e-05, + "loss": 0.1018, + "step": 21616 + }, + { + "epoch": 2.5633819518558045, + "grad_norm": 0.8250739196398834, + "learning_rate": 1.509025142257336e-05, + "loss": 0.1262, + "step": 21617 + }, + { + "epoch": 2.5635005336179297, + "grad_norm": 0.6312412098319896, + "learning_rate": 1.5088047739309114e-05, + "loss": 0.0713, + "step": 21618 + }, + { + "epoch": 2.5636191153800545, + "grad_norm": 0.8555915671576594, + "learning_rate": 1.5085844147420925e-05, + "loss": 0.1051, + "step": 21619 + }, + { + "epoch": 2.5637376971421797, + "grad_norm": 0.5956735273817276, + "learning_rate": 1.508364064692909e-05, + "loss": 0.0891, + "step": 21620 + }, + { + "epoch": 2.5638562789043045, + "grad_norm": 0.6142613232801595, + "learning_rate": 1.5081437237853921e-05, + "loss": 0.0614, + "step": 21621 + }, + { + "epoch": 2.5639748606664297, + "grad_norm": 0.5079918294752009, + "learning_rate": 1.5079233920215747e-05, + "loss": 0.0665, + "step": 21622 + }, + { + "epoch": 2.5640934424285544, + "grad_norm": 0.5813681051246512, + "learning_rate": 1.5077030694034872e-05, + "loss": 0.0669, + "step": 21623 + }, + { + "epoch": 2.5642120241906796, + "grad_norm": 0.574592859422881, + "learning_rate": 1.5074827559331606e-05, + "loss": 0.0832, + "step": 21624 + }, + { + "epoch": 2.5643306059528044, + "grad_norm": 0.6781371765067258, + "learning_rate": 1.5072624516126246e-05, + "loss": 0.0771, + "step": 21625 + }, + { + "epoch": 2.5644491877149296, + "grad_norm": 0.7410987414276756, + "learning_rate": 1.5070421564439125e-05, + "loss": 0.1174, + "step": 21626 + }, + { + "epoch": 2.5645677694770543, + "grad_norm": 0.5340045796101812, + "learning_rate": 1.5068218704290538e-05, + "loss": 0.0921, + "step": 21627 + }, + { + "epoch": 2.5646863512391795, + "grad_norm": 0.7193624165644557, + "learning_rate": 1.50660159357008e-05, + "loss": 0.0999, + "step": 21628 + }, + { + "epoch": 2.5648049330013043, + "grad_norm": 0.7135751848431026, + "learning_rate": 1.5063813258690202e-05, + "loss": 0.0892, + "step": 21629 + }, + { + "epoch": 2.5649235147634295, + "grad_norm": 0.7245288276012181, + "learning_rate": 1.506161067327907e-05, + "loss": 0.101, + "step": 21630 + }, + { + "epoch": 2.5650420965255543, + "grad_norm": 0.6345234794903442, + "learning_rate": 1.5059408179487702e-05, + "loss": 0.1099, + "step": 21631 + }, + { + "epoch": 2.5651606782876795, + "grad_norm": 0.5118128633552554, + "learning_rate": 1.50572057773364e-05, + "loss": 0.0704, + "step": 21632 + }, + { + "epoch": 2.565279260049804, + "grad_norm": 0.6808180570674003, + "learning_rate": 1.5055003466845457e-05, + "loss": 0.1105, + "step": 21633 + }, + { + "epoch": 2.5653978418119294, + "grad_norm": 0.5972296059501417, + "learning_rate": 1.50528012480352e-05, + "loss": 0.0837, + "step": 21634 + }, + { + "epoch": 2.565516423574054, + "grad_norm": 0.6392264241060398, + "learning_rate": 1.5050599120925912e-05, + "loss": 0.0889, + "step": 21635 + }, + { + "epoch": 2.5656350053361794, + "grad_norm": 0.8560960296856371, + "learning_rate": 1.5048397085537897e-05, + "loss": 0.139, + "step": 21636 + }, + { + "epoch": 2.565753587098304, + "grad_norm": 0.8589151127290491, + "learning_rate": 1.5046195141891456e-05, + "loss": 0.1189, + "step": 21637 + }, + { + "epoch": 2.5658721688604293, + "grad_norm": 0.8223494462556574, + "learning_rate": 1.5043993290006898e-05, + "loss": 0.1181, + "step": 21638 + }, + { + "epoch": 2.565990750622554, + "grad_norm": 0.7833019578219851, + "learning_rate": 1.5041791529904514e-05, + "loss": 0.0938, + "step": 21639 + }, + { + "epoch": 2.5661093323846793, + "grad_norm": 0.3442889724275852, + "learning_rate": 1.5039589861604586e-05, + "loss": 0.0444, + "step": 21640 + }, + { + "epoch": 2.566227914146804, + "grad_norm": 0.5260364287013648, + "learning_rate": 1.5037388285127441e-05, + "loss": 0.0576, + "step": 21641 + }, + { + "epoch": 2.5663464959089293, + "grad_norm": 0.8145095618803387, + "learning_rate": 1.5035186800493354e-05, + "loss": 0.1141, + "step": 21642 + }, + { + "epoch": 2.566465077671054, + "grad_norm": 0.7382935330868678, + "learning_rate": 1.5032985407722627e-05, + "loss": 0.0963, + "step": 21643 + }, + { + "epoch": 2.5665836594331792, + "grad_norm": 0.8710180035016022, + "learning_rate": 1.503078410683554e-05, + "loss": 0.094, + "step": 21644 + }, + { + "epoch": 2.566702241195304, + "grad_norm": 0.6271282012444919, + "learning_rate": 1.5028582897852411e-05, + "loss": 0.0901, + "step": 21645 + }, + { + "epoch": 2.566820822957429, + "grad_norm": 0.5889286618631938, + "learning_rate": 1.5026381780793514e-05, + "loss": 0.0642, + "step": 21646 + }, + { + "epoch": 2.5669394047195544, + "grad_norm": 0.5161417142322506, + "learning_rate": 1.5024180755679151e-05, + "loss": 0.0667, + "step": 21647 + }, + { + "epoch": 2.567057986481679, + "grad_norm": 0.8150782924770895, + "learning_rate": 1.5021979822529602e-05, + "loss": 0.1117, + "step": 21648 + }, + { + "epoch": 2.567176568243804, + "grad_norm": 0.586412658057895, + "learning_rate": 1.5019778981365167e-05, + "loss": 0.077, + "step": 21649 + }, + { + "epoch": 2.567295150005929, + "grad_norm": 0.5585231956063005, + "learning_rate": 1.5017578232206137e-05, + "loss": 0.0749, + "step": 21650 + }, + { + "epoch": 2.5674137317680543, + "grad_norm": 0.534343747127927, + "learning_rate": 1.5015377575072781e-05, + "loss": 0.0703, + "step": 21651 + }, + { + "epoch": 2.567532313530179, + "grad_norm": 0.700382611785473, + "learning_rate": 1.5013177009985413e-05, + "loss": 0.1035, + "step": 21652 + }, + { + "epoch": 2.567650895292304, + "grad_norm": 0.7812694210297213, + "learning_rate": 1.5010976536964302e-05, + "loss": 0.1252, + "step": 21653 + }, + { + "epoch": 2.567769477054429, + "grad_norm": 1.0388591769338569, + "learning_rate": 1.500877615602974e-05, + "loss": 0.1151, + "step": 21654 + }, + { + "epoch": 2.5678880588165542, + "grad_norm": 1.112199254626833, + "learning_rate": 1.5006575867201999e-05, + "loss": 0.1386, + "step": 21655 + }, + { + "epoch": 2.568006640578679, + "grad_norm": 0.5689346978709972, + "learning_rate": 1.5004375670501386e-05, + "loss": 0.0766, + "step": 21656 + }, + { + "epoch": 2.5681252223408038, + "grad_norm": 0.6953000980744428, + "learning_rate": 1.5002175565948167e-05, + "loss": 0.0962, + "step": 21657 + }, + { + "epoch": 2.568243804102929, + "grad_norm": 0.8379539004057394, + "learning_rate": 1.4999975553562632e-05, + "loss": 0.1195, + "step": 21658 + }, + { + "epoch": 2.568362385865054, + "grad_norm": 0.9785013225130647, + "learning_rate": 1.499777563336505e-05, + "loss": 0.1804, + "step": 21659 + }, + { + "epoch": 2.568480967627179, + "grad_norm": 0.6822317431587638, + "learning_rate": 1.4995575805375722e-05, + "loss": 0.0795, + "step": 21660 + }, + { + "epoch": 2.5685995493893037, + "grad_norm": 0.6715105470330339, + "learning_rate": 1.4993376069614918e-05, + "loss": 0.0907, + "step": 21661 + }, + { + "epoch": 2.568718131151429, + "grad_norm": 0.6433745939860942, + "learning_rate": 1.499117642610291e-05, + "loss": 0.0761, + "step": 21662 + }, + { + "epoch": 2.568836712913554, + "grad_norm": 0.5570140828278738, + "learning_rate": 1.4988976874859978e-05, + "loss": 0.0788, + "step": 21663 + }, + { + "epoch": 2.568955294675679, + "grad_norm": 0.4424933858333607, + "learning_rate": 1.4986777415906412e-05, + "loss": 0.065, + "step": 21664 + }, + { + "epoch": 2.5690738764378036, + "grad_norm": 0.6706752807625477, + "learning_rate": 1.4984578049262471e-05, + "loss": 0.0967, + "step": 21665 + }, + { + "epoch": 2.569192458199929, + "grad_norm": 0.715232374386694, + "learning_rate": 1.4982378774948442e-05, + "loss": 0.089, + "step": 21666 + }, + { + "epoch": 2.569311039962054, + "grad_norm": 0.571970307782617, + "learning_rate": 1.4980179592984592e-05, + "loss": 0.0876, + "step": 21667 + }, + { + "epoch": 2.5694296217241788, + "grad_norm": 0.8050458993280805, + "learning_rate": 1.4977980503391206e-05, + "loss": 0.133, + "step": 21668 + }, + { + "epoch": 2.569548203486304, + "grad_norm": 0.603219641744991, + "learning_rate": 1.4975781506188552e-05, + "loss": 0.0847, + "step": 21669 + }, + { + "epoch": 2.5696667852484287, + "grad_norm": 0.634634800643695, + "learning_rate": 1.4973582601396888e-05, + "loss": 0.104, + "step": 21670 + }, + { + "epoch": 2.569785367010554, + "grad_norm": 0.6254110365704031, + "learning_rate": 1.4971383789036509e-05, + "loss": 0.0611, + "step": 21671 + }, + { + "epoch": 2.5699039487726787, + "grad_norm": 0.7127538325373887, + "learning_rate": 1.4969185069127673e-05, + "loss": 0.1078, + "step": 21672 + }, + { + "epoch": 2.570022530534804, + "grad_norm": 0.718140475145398, + "learning_rate": 1.4966986441690647e-05, + "loss": 0.1202, + "step": 21673 + }, + { + "epoch": 2.5701411122969287, + "grad_norm": 0.6017448558045624, + "learning_rate": 1.4964787906745698e-05, + "loss": 0.0603, + "step": 21674 + }, + { + "epoch": 2.570259694059054, + "grad_norm": 0.6273593483489913, + "learning_rate": 1.4962589464313104e-05, + "loss": 0.101, + "step": 21675 + }, + { + "epoch": 2.5703782758211786, + "grad_norm": 0.5915141731842348, + "learning_rate": 1.4960391114413125e-05, + "loss": 0.0935, + "step": 21676 + }, + { + "epoch": 2.570496857583304, + "grad_norm": 0.5232904734667644, + "learning_rate": 1.4958192857066025e-05, + "loss": 0.074, + "step": 21677 + }, + { + "epoch": 2.5706154393454286, + "grad_norm": 0.5897842086567708, + "learning_rate": 1.4955994692292075e-05, + "loss": 0.0794, + "step": 21678 + }, + { + "epoch": 2.570734021107554, + "grad_norm": 0.762754589899015, + "learning_rate": 1.495379662011154e-05, + "loss": 0.0966, + "step": 21679 + }, + { + "epoch": 2.5708526028696785, + "grad_norm": 0.7491599304886862, + "learning_rate": 1.4951598640544684e-05, + "loss": 0.1103, + "step": 21680 + }, + { + "epoch": 2.5709711846318037, + "grad_norm": 0.4826249215308687, + "learning_rate": 1.4949400753611754e-05, + "loss": 0.0727, + "step": 21681 + }, + { + "epoch": 2.5710897663939285, + "grad_norm": 0.7302745939731571, + "learning_rate": 1.4947202959333035e-05, + "loss": 0.0881, + "step": 21682 + }, + { + "epoch": 2.5712083481560537, + "grad_norm": 0.6602552869319273, + "learning_rate": 1.4945005257728779e-05, + "loss": 0.0939, + "step": 21683 + }, + { + "epoch": 2.5713269299181785, + "grad_norm": 0.9686713077241917, + "learning_rate": 1.4942807648819234e-05, + "loss": 0.1315, + "step": 21684 + }, + { + "epoch": 2.5714455116803037, + "grad_norm": 0.8359340336348573, + "learning_rate": 1.4940610132624675e-05, + "loss": 0.0926, + "step": 21685 + }, + { + "epoch": 2.5715640934424284, + "grad_norm": 0.9400227972403319, + "learning_rate": 1.493841270916535e-05, + "loss": 0.1263, + "step": 21686 + }, + { + "epoch": 2.5716826752045536, + "grad_norm": 0.6051028704995187, + "learning_rate": 1.4936215378461533e-05, + "loss": 0.0982, + "step": 21687 + }, + { + "epoch": 2.5718012569666784, + "grad_norm": 0.6434527448922712, + "learning_rate": 1.4934018140533463e-05, + "loss": 0.0768, + "step": 21688 + }, + { + "epoch": 2.5719198387288036, + "grad_norm": 0.9486424879845745, + "learning_rate": 1.4931820995401396e-05, + "loss": 0.1384, + "step": 21689 + }, + { + "epoch": 2.5720384204909283, + "grad_norm": 0.4808848246263055, + "learning_rate": 1.4929623943085602e-05, + "loss": 0.0482, + "step": 21690 + }, + { + "epoch": 2.5721570022530535, + "grad_norm": 0.626028306888111, + "learning_rate": 1.4927426983606324e-05, + "loss": 0.0791, + "step": 21691 + }, + { + "epoch": 2.5722755840151783, + "grad_norm": 0.8022772004509159, + "learning_rate": 1.4925230116983808e-05, + "loss": 0.1161, + "step": 21692 + }, + { + "epoch": 2.5723941657773035, + "grad_norm": 0.6211677418502322, + "learning_rate": 1.492303334323833e-05, + "loss": 0.0819, + "step": 21693 + }, + { + "epoch": 2.5725127475394283, + "grad_norm": 0.832106339813787, + "learning_rate": 1.4920836662390122e-05, + "loss": 0.1432, + "step": 21694 + }, + { + "epoch": 2.5726313293015535, + "grad_norm": 0.8088366121898353, + "learning_rate": 1.4918640074459438e-05, + "loss": 0.0974, + "step": 21695 + }, + { + "epoch": 2.5727499110636787, + "grad_norm": 0.6281647132201728, + "learning_rate": 1.4916443579466526e-05, + "loss": 0.0648, + "step": 21696 + }, + { + "epoch": 2.5728684928258034, + "grad_norm": 0.7048129593271925, + "learning_rate": 1.4914247177431644e-05, + "loss": 0.0796, + "step": 21697 + }, + { + "epoch": 2.572987074587928, + "grad_norm": 0.8221617397142938, + "learning_rate": 1.4912050868375037e-05, + "loss": 0.1266, + "step": 21698 + }, + { + "epoch": 2.5731056563500534, + "grad_norm": 0.6775433624893877, + "learning_rate": 1.4909854652316954e-05, + "loss": 0.0911, + "step": 21699 + }, + { + "epoch": 2.5732242381121786, + "grad_norm": 0.49907579507674843, + "learning_rate": 1.4907658529277624e-05, + "loss": 0.0691, + "step": 21700 + }, + { + "epoch": 2.5733428198743034, + "grad_norm": 0.6474134553311868, + "learning_rate": 1.4905462499277317e-05, + "loss": 0.0968, + "step": 21701 + }, + { + "epoch": 2.573461401636428, + "grad_norm": 0.9669609733775465, + "learning_rate": 1.4903266562336272e-05, + "loss": 0.1267, + "step": 21702 + }, + { + "epoch": 2.5735799833985533, + "grad_norm": 0.4397791900064576, + "learning_rate": 1.4901070718474718e-05, + "loss": 0.0643, + "step": 21703 + }, + { + "epoch": 2.5736985651606785, + "grad_norm": 0.5288441900915787, + "learning_rate": 1.4898874967712905e-05, + "loss": 0.0826, + "step": 21704 + }, + { + "epoch": 2.5738171469228033, + "grad_norm": 0.7284008350540951, + "learning_rate": 1.4896679310071088e-05, + "loss": 0.1005, + "step": 21705 + }, + { + "epoch": 2.573935728684928, + "grad_norm": 0.592819740511121, + "learning_rate": 1.4894483745569493e-05, + "loss": 0.0953, + "step": 21706 + }, + { + "epoch": 2.5740543104470532, + "grad_norm": 0.6556629916516582, + "learning_rate": 1.489228827422836e-05, + "loss": 0.0878, + "step": 21707 + }, + { + "epoch": 2.5741728922091784, + "grad_norm": 0.5240090667409226, + "learning_rate": 1.4890092896067942e-05, + "loss": 0.0613, + "step": 21708 + }, + { + "epoch": 2.574291473971303, + "grad_norm": 0.6005623963731114, + "learning_rate": 1.488789761110847e-05, + "loss": 0.0886, + "step": 21709 + }, + { + "epoch": 2.574410055733428, + "grad_norm": 0.7668724731485362, + "learning_rate": 1.4885702419370184e-05, + "loss": 0.1032, + "step": 21710 + }, + { + "epoch": 2.574528637495553, + "grad_norm": 0.45812941161944915, + "learning_rate": 1.4883507320873307e-05, + "loss": 0.0667, + "step": 21711 + }, + { + "epoch": 2.5746472192576784, + "grad_norm": 0.627933372827679, + "learning_rate": 1.4881312315638097e-05, + "loss": 0.0893, + "step": 21712 + }, + { + "epoch": 2.574765801019803, + "grad_norm": 0.7326715772764358, + "learning_rate": 1.487911740368478e-05, + "loss": 0.1003, + "step": 21713 + }, + { + "epoch": 2.574884382781928, + "grad_norm": 0.8933160365353854, + "learning_rate": 1.4876922585033586e-05, + "loss": 0.1265, + "step": 21714 + }, + { + "epoch": 2.575002964544053, + "grad_norm": 0.637647193138884, + "learning_rate": 1.487472785970475e-05, + "loss": 0.0909, + "step": 21715 + }, + { + "epoch": 2.5751215463061783, + "grad_norm": 0.6437862655288752, + "learning_rate": 1.4872533227718505e-05, + "loss": 0.0819, + "step": 21716 + }, + { + "epoch": 2.575240128068303, + "grad_norm": 0.8071747944800559, + "learning_rate": 1.4870338689095092e-05, + "loss": 0.117, + "step": 21717 + }, + { + "epoch": 2.575358709830428, + "grad_norm": 0.7382188023295648, + "learning_rate": 1.4868144243854737e-05, + "loss": 0.0949, + "step": 21718 + }, + { + "epoch": 2.575477291592553, + "grad_norm": 0.5366161517309066, + "learning_rate": 1.4865949892017653e-05, + "loss": 0.0696, + "step": 21719 + }, + { + "epoch": 2.575595873354678, + "grad_norm": 1.0310473634157802, + "learning_rate": 1.4863755633604092e-05, + "loss": 0.1164, + "step": 21720 + }, + { + "epoch": 2.575714455116803, + "grad_norm": 0.6363163874472662, + "learning_rate": 1.486156146863428e-05, + "loss": 0.0781, + "step": 21721 + }, + { + "epoch": 2.575833036878928, + "grad_norm": 0.6138661206552454, + "learning_rate": 1.4859367397128426e-05, + "loss": 0.0598, + "step": 21722 + }, + { + "epoch": 2.575951618641053, + "grad_norm": 0.756478703299259, + "learning_rate": 1.485717341910678e-05, + "loss": 0.1134, + "step": 21723 + }, + { + "epoch": 2.576070200403178, + "grad_norm": 1.0028471528134524, + "learning_rate": 1.4854979534589558e-05, + "loss": 0.1066, + "step": 21724 + }, + { + "epoch": 2.576188782165303, + "grad_norm": 0.5717410821576738, + "learning_rate": 1.4852785743596975e-05, + "loss": 0.089, + "step": 21725 + }, + { + "epoch": 2.576307363927428, + "grad_norm": 0.9690371048057043, + "learning_rate": 1.485059204614927e-05, + "loss": 0.1084, + "step": 21726 + }, + { + "epoch": 2.576425945689553, + "grad_norm": 0.6103300996159365, + "learning_rate": 1.4848398442266655e-05, + "loss": 0.083, + "step": 21727 + }, + { + "epoch": 2.576544527451678, + "grad_norm": 0.6205994997492682, + "learning_rate": 1.4846204931969365e-05, + "loss": 0.0789, + "step": 21728 + }, + { + "epoch": 2.576663109213803, + "grad_norm": 0.4673166897481148, + "learning_rate": 1.4844011515277615e-05, + "loss": 0.0761, + "step": 21729 + }, + { + "epoch": 2.576781690975928, + "grad_norm": 0.6963422986929692, + "learning_rate": 1.4841818192211615e-05, + "loss": 0.1045, + "step": 21730 + }, + { + "epoch": 2.5769002727380528, + "grad_norm": 0.7129043592672168, + "learning_rate": 1.48396249627916e-05, + "loss": 0.0974, + "step": 21731 + }, + { + "epoch": 2.577018854500178, + "grad_norm": 0.7254610499177668, + "learning_rate": 1.4837431827037787e-05, + "loss": 0.1094, + "step": 21732 + }, + { + "epoch": 2.5771374362623027, + "grad_norm": 1.0300367884628725, + "learning_rate": 1.4835238784970385e-05, + "loss": 0.1289, + "step": 21733 + }, + { + "epoch": 2.577256018024428, + "grad_norm": 0.7956197751969057, + "learning_rate": 1.4833045836609615e-05, + "loss": 0.1152, + "step": 21734 + }, + { + "epoch": 2.5773745997865527, + "grad_norm": 0.7486623024657607, + "learning_rate": 1.4830852981975698e-05, + "loss": 0.1124, + "step": 21735 + }, + { + "epoch": 2.577493181548678, + "grad_norm": 0.8153017552791842, + "learning_rate": 1.482866022108885e-05, + "loss": 0.1124, + "step": 21736 + }, + { + "epoch": 2.5776117633108027, + "grad_norm": 0.803876540296278, + "learning_rate": 1.482646755396927e-05, + "loss": 0.1129, + "step": 21737 + }, + { + "epoch": 2.577730345072928, + "grad_norm": 0.7702943442520692, + "learning_rate": 1.4824274980637196e-05, + "loss": 0.0856, + "step": 21738 + }, + { + "epoch": 2.5778489268350526, + "grad_norm": 0.632991507822899, + "learning_rate": 1.4822082501112827e-05, + "loss": 0.087, + "step": 21739 + }, + { + "epoch": 2.577967508597178, + "grad_norm": 0.5636498931522826, + "learning_rate": 1.481989011541638e-05, + "loss": 0.0487, + "step": 21740 + }, + { + "epoch": 2.5780860903593026, + "grad_norm": 0.7629009147613559, + "learning_rate": 1.4817697823568046e-05, + "loss": 0.0906, + "step": 21741 + }, + { + "epoch": 2.578204672121428, + "grad_norm": 0.49488157014563944, + "learning_rate": 1.4815505625588066e-05, + "loss": 0.0635, + "step": 21742 + }, + { + "epoch": 2.5783232538835525, + "grad_norm": 0.7100886911264174, + "learning_rate": 1.4813313521496634e-05, + "loss": 0.0802, + "step": 21743 + }, + { + "epoch": 2.5784418356456777, + "grad_norm": 0.7590768286038645, + "learning_rate": 1.4811121511313953e-05, + "loss": 0.0911, + "step": 21744 + }, + { + "epoch": 2.578560417407803, + "grad_norm": 0.7382663624817576, + "learning_rate": 1.480892959506024e-05, + "loss": 0.102, + "step": 21745 + }, + { + "epoch": 2.5786789991699277, + "grad_norm": 0.6040406091153081, + "learning_rate": 1.4806737772755696e-05, + "loss": 0.0731, + "step": 21746 + }, + { + "epoch": 2.5787975809320525, + "grad_norm": 0.8258329369519675, + "learning_rate": 1.4804546044420537e-05, + "loss": 0.0801, + "step": 21747 + }, + { + "epoch": 2.5789161626941777, + "grad_norm": 0.7248536502234016, + "learning_rate": 1.4802354410074962e-05, + "loss": 0.1146, + "step": 21748 + }, + { + "epoch": 2.579034744456303, + "grad_norm": 0.6576359662583103, + "learning_rate": 1.4800162869739162e-05, + "loss": 0.0749, + "step": 21749 + }, + { + "epoch": 2.5791533262184276, + "grad_norm": 0.7558702461824902, + "learning_rate": 1.4797971423433365e-05, + "loss": 0.1161, + "step": 21750 + }, + { + "epoch": 2.5792719079805524, + "grad_norm": 0.6610476409448179, + "learning_rate": 1.4795780071177756e-05, + "loss": 0.0873, + "step": 21751 + }, + { + "epoch": 2.5793904897426776, + "grad_norm": 0.7472351946302279, + "learning_rate": 1.4793588812992537e-05, + "loss": 0.1094, + "step": 21752 + }, + { + "epoch": 2.579509071504803, + "grad_norm": 0.5528221623751591, + "learning_rate": 1.479139764889792e-05, + "loss": 0.0826, + "step": 21753 + }, + { + "epoch": 2.5796276532669276, + "grad_norm": 0.844263479575088, + "learning_rate": 1.4789206578914099e-05, + "loss": 0.0923, + "step": 21754 + }, + { + "epoch": 2.5797462350290523, + "grad_norm": 0.711774993356527, + "learning_rate": 1.4787015603061267e-05, + "loss": 0.0861, + "step": 21755 + }, + { + "epoch": 2.5798648167911775, + "grad_norm": 1.0121975611636214, + "learning_rate": 1.4784824721359628e-05, + "loss": 0.1379, + "step": 21756 + }, + { + "epoch": 2.5799833985533027, + "grad_norm": 0.7712917282481041, + "learning_rate": 1.4782633933829376e-05, + "loss": 0.1088, + "step": 21757 + }, + { + "epoch": 2.5801019803154275, + "grad_norm": 0.6238586333041432, + "learning_rate": 1.4780443240490719e-05, + "loss": 0.069, + "step": 21758 + }, + { + "epoch": 2.5802205620775522, + "grad_norm": 0.9102398468088011, + "learning_rate": 1.4778252641363838e-05, + "loss": 0.1331, + "step": 21759 + }, + { + "epoch": 2.5803391438396774, + "grad_norm": 0.6586803994014576, + "learning_rate": 1.4776062136468927e-05, + "loss": 0.0944, + "step": 21760 + }, + { + "epoch": 2.5804577256018026, + "grad_norm": 0.5530106649977531, + "learning_rate": 1.4773871725826193e-05, + "loss": 0.0805, + "step": 21761 + }, + { + "epoch": 2.5805763073639274, + "grad_norm": 0.6930010774635833, + "learning_rate": 1.4771681409455822e-05, + "loss": 0.09, + "step": 21762 + }, + { + "epoch": 2.580694889126052, + "grad_norm": 0.7679093656184596, + "learning_rate": 1.4769491187377998e-05, + "loss": 0.1136, + "step": 21763 + }, + { + "epoch": 2.5808134708881774, + "grad_norm": 0.8251742900100529, + "learning_rate": 1.4767301059612928e-05, + "loss": 0.1047, + "step": 21764 + }, + { + "epoch": 2.5809320526503026, + "grad_norm": 0.5934908917119558, + "learning_rate": 1.476511102618079e-05, + "loss": 0.0817, + "step": 21765 + }, + { + "epoch": 2.5810506344124273, + "grad_norm": 0.6269676968937224, + "learning_rate": 1.4762921087101783e-05, + "loss": 0.0583, + "step": 21766 + }, + { + "epoch": 2.581169216174552, + "grad_norm": 0.7640254956383136, + "learning_rate": 1.4760731242396078e-05, + "loss": 0.1321, + "step": 21767 + }, + { + "epoch": 2.5812877979366773, + "grad_norm": 0.520672830137325, + "learning_rate": 1.4758541492083885e-05, + "loss": 0.0699, + "step": 21768 + }, + { + "epoch": 2.5814063796988025, + "grad_norm": 0.734013270822722, + "learning_rate": 1.4756351836185382e-05, + "loss": 0.0998, + "step": 21769 + }, + { + "epoch": 2.5815249614609272, + "grad_norm": 1.1158770542942729, + "learning_rate": 1.4754162274720752e-05, + "loss": 0.1385, + "step": 21770 + }, + { + "epoch": 2.5816435432230525, + "grad_norm": 0.5093019277391224, + "learning_rate": 1.4751972807710173e-05, + "loss": 0.0785, + "step": 21771 + }, + { + "epoch": 2.581762124985177, + "grad_norm": 0.43308484435515665, + "learning_rate": 1.4749783435173847e-05, + "loss": 0.056, + "step": 21772 + }, + { + "epoch": 2.5818807067473024, + "grad_norm": 0.46946366163170616, + "learning_rate": 1.4747594157131946e-05, + "loss": 0.0758, + "step": 21773 + }, + { + "epoch": 2.581999288509427, + "grad_norm": 0.5771162567949781, + "learning_rate": 1.4745404973604654e-05, + "loss": 0.0923, + "step": 21774 + }, + { + "epoch": 2.5821178702715524, + "grad_norm": 0.7133420577650004, + "learning_rate": 1.474321588461215e-05, + "loss": 0.1246, + "step": 21775 + }, + { + "epoch": 2.582236452033677, + "grad_norm": 0.5933405029968767, + "learning_rate": 1.4741026890174617e-05, + "loss": 0.0761, + "step": 21776 + }, + { + "epoch": 2.5823550337958023, + "grad_norm": 0.6123507631179006, + "learning_rate": 1.4738837990312243e-05, + "loss": 0.0697, + "step": 21777 + }, + { + "epoch": 2.582473615557927, + "grad_norm": 0.6265104381339655, + "learning_rate": 1.4736649185045187e-05, + "loss": 0.0849, + "step": 21778 + }, + { + "epoch": 2.5825921973200523, + "grad_norm": 0.37618166268537195, + "learning_rate": 1.4734460474393651e-05, + "loss": 0.0569, + "step": 21779 + }, + { + "epoch": 2.582710779082177, + "grad_norm": 0.6856120405857276, + "learning_rate": 1.4732271858377802e-05, + "loss": 0.0947, + "step": 21780 + }, + { + "epoch": 2.5828293608443023, + "grad_norm": 0.8209717751762402, + "learning_rate": 1.4730083337017816e-05, + "loss": 0.1033, + "step": 21781 + }, + { + "epoch": 2.582947942606427, + "grad_norm": 0.9848514228787356, + "learning_rate": 1.4727894910333856e-05, + "loss": 0.1373, + "step": 21782 + }, + { + "epoch": 2.583066524368552, + "grad_norm": 0.43828576542117875, + "learning_rate": 1.472570657834612e-05, + "loss": 0.0496, + "step": 21783 + }, + { + "epoch": 2.583185106130677, + "grad_norm": 0.5504615960044469, + "learning_rate": 1.4723518341074772e-05, + "loss": 0.0782, + "step": 21784 + }, + { + "epoch": 2.583303687892802, + "grad_norm": 0.6105902236571252, + "learning_rate": 1.4721330198539974e-05, + "loss": 0.0899, + "step": 21785 + }, + { + "epoch": 2.583422269654927, + "grad_norm": 0.40449620787696483, + "learning_rate": 1.4719142150761906e-05, + "loss": 0.0482, + "step": 21786 + }, + { + "epoch": 2.583540851417052, + "grad_norm": 0.7286299777791738, + "learning_rate": 1.4716954197760744e-05, + "loss": 0.0998, + "step": 21787 + }, + { + "epoch": 2.583659433179177, + "grad_norm": 0.7210767039189184, + "learning_rate": 1.4714766339556657e-05, + "loss": 0.1189, + "step": 21788 + }, + { + "epoch": 2.583778014941302, + "grad_norm": 0.608788752144289, + "learning_rate": 1.4712578576169814e-05, + "loss": 0.0656, + "step": 21789 + }, + { + "epoch": 2.583896596703427, + "grad_norm": 0.738700770320291, + "learning_rate": 1.4710390907620369e-05, + "loss": 0.1046, + "step": 21790 + }, + { + "epoch": 2.584015178465552, + "grad_norm": 0.6679379122298252, + "learning_rate": 1.4708203333928513e-05, + "loss": 0.0996, + "step": 21791 + }, + { + "epoch": 2.584133760227677, + "grad_norm": 0.7505490537157985, + "learning_rate": 1.4706015855114403e-05, + "loss": 0.0978, + "step": 21792 + }, + { + "epoch": 2.584252341989802, + "grad_norm": 0.7773780075034072, + "learning_rate": 1.4703828471198189e-05, + "loss": 0.0992, + "step": 21793 + }, + { + "epoch": 2.5843709237519272, + "grad_norm": 0.7646436838756954, + "learning_rate": 1.4701641182200065e-05, + "loss": 0.0987, + "step": 21794 + }, + { + "epoch": 2.584489505514052, + "grad_norm": 0.7512534631943374, + "learning_rate": 1.4699453988140175e-05, + "loss": 0.1076, + "step": 21795 + }, + { + "epoch": 2.5846080872761767, + "grad_norm": 0.8153579790707216, + "learning_rate": 1.4697266889038691e-05, + "loss": 0.1065, + "step": 21796 + }, + { + "epoch": 2.584726669038302, + "grad_norm": 0.5307333347330933, + "learning_rate": 1.4695079884915764e-05, + "loss": 0.0864, + "step": 21797 + }, + { + "epoch": 2.584845250800427, + "grad_norm": 0.7130223298217921, + "learning_rate": 1.4692892975791572e-05, + "loss": 0.0939, + "step": 21798 + }, + { + "epoch": 2.584963832562552, + "grad_norm": 0.5583135524854778, + "learning_rate": 1.4690706161686268e-05, + "loss": 0.0871, + "step": 21799 + }, + { + "epoch": 2.5850824143246767, + "grad_norm": 0.5411204290376191, + "learning_rate": 1.4688519442620013e-05, + "loss": 0.0754, + "step": 21800 + }, + { + "epoch": 2.585200996086802, + "grad_norm": 0.6494975197136568, + "learning_rate": 1.4686332818612952e-05, + "loss": 0.0782, + "step": 21801 + }, + { + "epoch": 2.585319577848927, + "grad_norm": 0.9038696210095611, + "learning_rate": 1.4684146289685263e-05, + "loss": 0.1249, + "step": 21802 + }, + { + "epoch": 2.585438159611052, + "grad_norm": 1.1416510069112726, + "learning_rate": 1.4681959855857096e-05, + "loss": 0.128, + "step": 21803 + }, + { + "epoch": 2.5855567413731766, + "grad_norm": 0.9404993983329561, + "learning_rate": 1.4679773517148604e-05, + "loss": 0.1348, + "step": 21804 + }, + { + "epoch": 2.585675323135302, + "grad_norm": 0.9993269070376036, + "learning_rate": 1.4677587273579945e-05, + "loss": 0.1207, + "step": 21805 + }, + { + "epoch": 2.585793904897427, + "grad_norm": 0.6192944844229286, + "learning_rate": 1.4675401125171272e-05, + "loss": 0.0861, + "step": 21806 + }, + { + "epoch": 2.5859124866595518, + "grad_norm": 0.5660927503825468, + "learning_rate": 1.4673215071942741e-05, + "loss": 0.0758, + "step": 21807 + }, + { + "epoch": 2.5860310684216765, + "grad_norm": 1.2649721246894226, + "learning_rate": 1.4671029113914498e-05, + "loss": 0.1343, + "step": 21808 + }, + { + "epoch": 2.5861496501838017, + "grad_norm": 0.95789899710996, + "learning_rate": 1.4668843251106712e-05, + "loss": 0.1455, + "step": 21809 + }, + { + "epoch": 2.586268231945927, + "grad_norm": 0.957963158790284, + "learning_rate": 1.4666657483539517e-05, + "loss": 0.1194, + "step": 21810 + }, + { + "epoch": 2.5863868137080517, + "grad_norm": 0.4739566167416007, + "learning_rate": 1.4664471811233072e-05, + "loss": 0.0555, + "step": 21811 + }, + { + "epoch": 2.5865053954701764, + "grad_norm": 0.6124419189555691, + "learning_rate": 1.4662286234207512e-05, + "loss": 0.0802, + "step": 21812 + }, + { + "epoch": 2.5866239772323016, + "grad_norm": 0.6426601552212815, + "learning_rate": 1.4660100752483005e-05, + "loss": 0.0998, + "step": 21813 + }, + { + "epoch": 2.586742558994427, + "grad_norm": 0.6171044235575566, + "learning_rate": 1.4657915366079683e-05, + "loss": 0.0734, + "step": 21814 + }, + { + "epoch": 2.5868611407565516, + "grad_norm": 0.5918732571316935, + "learning_rate": 1.465573007501771e-05, + "loss": 0.0892, + "step": 21815 + }, + { + "epoch": 2.5869797225186764, + "grad_norm": 0.764862203424092, + "learning_rate": 1.4653544879317205e-05, + "loss": 0.1097, + "step": 21816 + }, + { + "epoch": 2.5870983042808016, + "grad_norm": 0.9086965655406026, + "learning_rate": 1.4651359778998339e-05, + "loss": 0.1113, + "step": 21817 + }, + { + "epoch": 2.5872168860429268, + "grad_norm": 0.6261062487085839, + "learning_rate": 1.4649174774081248e-05, + "loss": 0.0826, + "step": 21818 + }, + { + "epoch": 2.5873354678050515, + "grad_norm": 0.6867252640893505, + "learning_rate": 1.4646989864586069e-05, + "loss": 0.1049, + "step": 21819 + }, + { + "epoch": 2.5874540495671767, + "grad_norm": 0.7009093891775063, + "learning_rate": 1.4644805050532942e-05, + "loss": 0.0919, + "step": 21820 + }, + { + "epoch": 2.5875726313293015, + "grad_norm": 0.5692399928648344, + "learning_rate": 1.464262033194202e-05, + "loss": 0.0825, + "step": 21821 + }, + { + "epoch": 2.5876912130914267, + "grad_norm": 0.5275658929207139, + "learning_rate": 1.4640435708833442e-05, + "loss": 0.0716, + "step": 21822 + }, + { + "epoch": 2.5878097948535514, + "grad_norm": 0.5343377701628462, + "learning_rate": 1.463825118122733e-05, + "loss": 0.0585, + "step": 21823 + }, + { + "epoch": 2.5879283766156767, + "grad_norm": 1.1802523494584924, + "learning_rate": 1.4636066749143846e-05, + "loss": 0.1263, + "step": 21824 + }, + { + "epoch": 2.5880469583778014, + "grad_norm": 0.5975004820123287, + "learning_rate": 1.4633882412603111e-05, + "loss": 0.0748, + "step": 21825 + }, + { + "epoch": 2.5881655401399266, + "grad_norm": 0.6344168775799386, + "learning_rate": 1.4631698171625277e-05, + "loss": 0.0947, + "step": 21826 + }, + { + "epoch": 2.5882841219020514, + "grad_norm": 1.0378387312758124, + "learning_rate": 1.462951402623046e-05, + "loss": 0.1354, + "step": 21827 + }, + { + "epoch": 2.5884027036641766, + "grad_norm": 0.9206497938666299, + "learning_rate": 1.4627329976438813e-05, + "loss": 0.1093, + "step": 21828 + }, + { + "epoch": 2.5885212854263013, + "grad_norm": 0.7282452157289558, + "learning_rate": 1.4625146022270469e-05, + "loss": 0.0845, + "step": 21829 + }, + { + "epoch": 2.5886398671884265, + "grad_norm": 0.573627220875112, + "learning_rate": 1.4622962163745555e-05, + "loss": 0.08, + "step": 21830 + }, + { + "epoch": 2.5887584489505513, + "grad_norm": 0.56032907593689, + "learning_rate": 1.4620778400884194e-05, + "loss": 0.0753, + "step": 21831 + }, + { + "epoch": 2.5888770307126765, + "grad_norm": 0.6626105855416736, + "learning_rate": 1.4618594733706537e-05, + "loss": 0.1026, + "step": 21832 + }, + { + "epoch": 2.5889956124748013, + "grad_norm": 0.5769811184071237, + "learning_rate": 1.4616411162232708e-05, + "loss": 0.0692, + "step": 21833 + }, + { + "epoch": 2.5891141942369265, + "grad_norm": 0.682182347965938, + "learning_rate": 1.4614227686482823e-05, + "loss": 0.1032, + "step": 21834 + }, + { + "epoch": 2.589232775999051, + "grad_norm": 0.6434721186352659, + "learning_rate": 1.4612044306477036e-05, + "loss": 0.0879, + "step": 21835 + }, + { + "epoch": 2.5893513577611764, + "grad_norm": 0.42842258121234456, + "learning_rate": 1.4609861022235456e-05, + "loss": 0.061, + "step": 21836 + }, + { + "epoch": 2.589469939523301, + "grad_norm": 0.700828992025767, + "learning_rate": 1.4607677833778216e-05, + "loss": 0.0632, + "step": 21837 + }, + { + "epoch": 2.5895885212854264, + "grad_norm": 0.6219394189778532, + "learning_rate": 1.4605494741125436e-05, + "loss": 0.0782, + "step": 21838 + }, + { + "epoch": 2.589707103047551, + "grad_norm": 0.5452188839732415, + "learning_rate": 1.460331174429726e-05, + "loss": 0.0722, + "step": 21839 + }, + { + "epoch": 2.5898256848096763, + "grad_norm": 0.6314283683952631, + "learning_rate": 1.46011288433138e-05, + "loss": 0.0883, + "step": 21840 + }, + { + "epoch": 2.589944266571801, + "grad_norm": 0.6479156039933638, + "learning_rate": 1.4598946038195176e-05, + "loss": 0.0902, + "step": 21841 + }, + { + "epoch": 2.5900628483339263, + "grad_norm": 0.9375475878399752, + "learning_rate": 1.459676332896151e-05, + "loss": 0.0879, + "step": 21842 + }, + { + "epoch": 2.590181430096051, + "grad_norm": 0.814710060310782, + "learning_rate": 1.4594580715632932e-05, + "loss": 0.1133, + "step": 21843 + }, + { + "epoch": 2.5903000118581763, + "grad_norm": 0.490232984578286, + "learning_rate": 1.4592398198229568e-05, + "loss": 0.0629, + "step": 21844 + }, + { + "epoch": 2.590418593620301, + "grad_norm": 0.9649545853254412, + "learning_rate": 1.4590215776771521e-05, + "loss": 0.1204, + "step": 21845 + }, + { + "epoch": 2.5905371753824262, + "grad_norm": 0.7925282690876486, + "learning_rate": 1.458803345127892e-05, + "loss": 0.1012, + "step": 21846 + }, + { + "epoch": 2.5906557571445514, + "grad_norm": 0.6233523372015507, + "learning_rate": 1.4585851221771885e-05, + "loss": 0.075, + "step": 21847 + }, + { + "epoch": 2.590774338906676, + "grad_norm": 1.196658129667339, + "learning_rate": 1.4583669088270536e-05, + "loss": 0.1883, + "step": 21848 + }, + { + "epoch": 2.590892920668801, + "grad_norm": 0.6656796660015559, + "learning_rate": 1.4581487050794967e-05, + "loss": 0.0649, + "step": 21849 + }, + { + "epoch": 2.591011502430926, + "grad_norm": 0.6513694688295485, + "learning_rate": 1.4579305109365331e-05, + "loss": 0.0853, + "step": 21850 + }, + { + "epoch": 2.5911300841930514, + "grad_norm": 0.8462903823806432, + "learning_rate": 1.4577123264001714e-05, + "loss": 0.128, + "step": 21851 + }, + { + "epoch": 2.591248665955176, + "grad_norm": 0.5190570143188311, + "learning_rate": 1.4574941514724238e-05, + "loss": 0.0723, + "step": 21852 + }, + { + "epoch": 2.591367247717301, + "grad_norm": 0.8465063643099384, + "learning_rate": 1.457275986155302e-05, + "loss": 0.1374, + "step": 21853 + }, + { + "epoch": 2.591485829479426, + "grad_norm": 0.6180715958759728, + "learning_rate": 1.4570578304508164e-05, + "loss": 0.0788, + "step": 21854 + }, + { + "epoch": 2.5916044112415513, + "grad_norm": 0.6756996986007032, + "learning_rate": 1.45683968436098e-05, + "loss": 0.1118, + "step": 21855 + }, + { + "epoch": 2.591722993003676, + "grad_norm": 1.0225187041059296, + "learning_rate": 1.4566215478878014e-05, + "loss": 0.1284, + "step": 21856 + }, + { + "epoch": 2.591841574765801, + "grad_norm": 0.6338084487485439, + "learning_rate": 1.4564034210332927e-05, + "loss": 0.085, + "step": 21857 + }, + { + "epoch": 2.591960156527926, + "grad_norm": 0.5701002716434888, + "learning_rate": 1.4561853037994649e-05, + "loss": 0.081, + "step": 21858 + }, + { + "epoch": 2.592078738290051, + "grad_norm": 0.6656742626160084, + "learning_rate": 1.4559671961883286e-05, + "loss": 0.1166, + "step": 21859 + }, + { + "epoch": 2.592197320052176, + "grad_norm": 0.6514052021529805, + "learning_rate": 1.455749098201895e-05, + "loss": 0.0824, + "step": 21860 + }, + { + "epoch": 2.5923159018143007, + "grad_norm": 0.7769790292038351, + "learning_rate": 1.4555310098421727e-05, + "loss": 0.0983, + "step": 21861 + }, + { + "epoch": 2.592434483576426, + "grad_norm": 0.8690896641840874, + "learning_rate": 1.4553129311111755e-05, + "loss": 0.1322, + "step": 21862 + }, + { + "epoch": 2.592553065338551, + "grad_norm": 1.0845065946585886, + "learning_rate": 1.4550948620109109e-05, + "loss": 0.1411, + "step": 21863 + }, + { + "epoch": 2.592671647100676, + "grad_norm": 0.4678598888864147, + "learning_rate": 1.4548768025433904e-05, + "loss": 0.0776, + "step": 21864 + }, + { + "epoch": 2.5927902288628006, + "grad_norm": 0.629996477674819, + "learning_rate": 1.4546587527106242e-05, + "loss": 0.0842, + "step": 21865 + }, + { + "epoch": 2.592908810624926, + "grad_norm": 0.7044284102880053, + "learning_rate": 1.4544407125146235e-05, + "loss": 0.0969, + "step": 21866 + }, + { + "epoch": 2.593027392387051, + "grad_norm": 0.6649477169412196, + "learning_rate": 1.4542226819573963e-05, + "loss": 0.0949, + "step": 21867 + }, + { + "epoch": 2.593145974149176, + "grad_norm": 0.6932447908965876, + "learning_rate": 1.454004661040953e-05, + "loss": 0.084, + "step": 21868 + }, + { + "epoch": 2.593264555911301, + "grad_norm": 0.7597309862884968, + "learning_rate": 1.4537866497673047e-05, + "loss": 0.0785, + "step": 21869 + }, + { + "epoch": 2.5933831376734258, + "grad_norm": 0.6749460699391026, + "learning_rate": 1.4535686481384603e-05, + "loss": 0.0869, + "step": 21870 + }, + { + "epoch": 2.593501719435551, + "grad_norm": 0.9001985482542, + "learning_rate": 1.4533506561564306e-05, + "loss": 0.1514, + "step": 21871 + }, + { + "epoch": 2.5936203011976757, + "grad_norm": 0.9418552040751123, + "learning_rate": 1.4531326738232226e-05, + "loss": 0.1279, + "step": 21872 + }, + { + "epoch": 2.593738882959801, + "grad_norm": 1.0785766334677316, + "learning_rate": 1.4529147011408495e-05, + "loss": 0.128, + "step": 21873 + }, + { + "epoch": 2.5938574647219257, + "grad_norm": 0.6825180286492357, + "learning_rate": 1.4526967381113177e-05, + "loss": 0.0955, + "step": 21874 + }, + { + "epoch": 2.593976046484051, + "grad_norm": 0.5887334917047614, + "learning_rate": 1.4524787847366388e-05, + "loss": 0.0671, + "step": 21875 + }, + { + "epoch": 2.5940946282461756, + "grad_norm": 0.5228853760508094, + "learning_rate": 1.4522608410188187e-05, + "loss": 0.0713, + "step": 21876 + }, + { + "epoch": 2.594213210008301, + "grad_norm": 0.49636602861551615, + "learning_rate": 1.4520429069598712e-05, + "loss": 0.0709, + "step": 21877 + }, + { + "epoch": 2.5943317917704256, + "grad_norm": 0.798987557848694, + "learning_rate": 1.4518249825618018e-05, + "loss": 0.0952, + "step": 21878 + }, + { + "epoch": 2.594450373532551, + "grad_norm": 0.521073936744459, + "learning_rate": 1.4516070678266208e-05, + "loss": 0.0668, + "step": 21879 + }, + { + "epoch": 2.5945689552946756, + "grad_norm": 0.5462403865759258, + "learning_rate": 1.451389162756337e-05, + "loss": 0.0666, + "step": 21880 + }, + { + "epoch": 2.5946875370568008, + "grad_norm": 0.47564381643212805, + "learning_rate": 1.4511712673529593e-05, + "loss": 0.0693, + "step": 21881 + }, + { + "epoch": 2.5948061188189255, + "grad_norm": 0.6613749938766706, + "learning_rate": 1.4509533816184972e-05, + "loss": 0.0791, + "step": 21882 + }, + { + "epoch": 2.5949247005810507, + "grad_norm": 0.7535750999404823, + "learning_rate": 1.4507355055549565e-05, + "loss": 0.1036, + "step": 21883 + }, + { + "epoch": 2.5950432823431755, + "grad_norm": 0.7030252475088511, + "learning_rate": 1.4505176391643499e-05, + "loss": 0.115, + "step": 21884 + }, + { + "epoch": 2.5951618641053007, + "grad_norm": 0.7226411234683512, + "learning_rate": 1.4502997824486828e-05, + "loss": 0.108, + "step": 21885 + }, + { + "epoch": 2.5952804458674255, + "grad_norm": 0.630112700636151, + "learning_rate": 1.4500819354099654e-05, + "loss": 0.0764, + "step": 21886 + }, + { + "epoch": 2.5953990276295507, + "grad_norm": 0.5329018974252739, + "learning_rate": 1.4498640980502026e-05, + "loss": 0.066, + "step": 21887 + }, + { + "epoch": 2.5955176093916754, + "grad_norm": 0.751671932464693, + "learning_rate": 1.4496462703714075e-05, + "loss": 0.126, + "step": 21888 + }, + { + "epoch": 2.5956361911538006, + "grad_norm": 0.7610935510092147, + "learning_rate": 1.4494284523755847e-05, + "loss": 0.0896, + "step": 21889 + }, + { + "epoch": 2.5957547729159254, + "grad_norm": 0.5189908719553425, + "learning_rate": 1.4492106440647435e-05, + "loss": 0.078, + "step": 21890 + }, + { + "epoch": 2.5958733546780506, + "grad_norm": 1.0857623308431144, + "learning_rate": 1.4489928454408914e-05, + "loss": 0.1327, + "step": 21891 + }, + { + "epoch": 2.5959919364401753, + "grad_norm": 0.8368192280853721, + "learning_rate": 1.4487750565060362e-05, + "loss": 0.0766, + "step": 21892 + }, + { + "epoch": 2.5961105182023005, + "grad_norm": 0.7211322831577193, + "learning_rate": 1.448557277262187e-05, + "loss": 0.0693, + "step": 21893 + }, + { + "epoch": 2.5962290999644253, + "grad_norm": 0.5444049031318132, + "learning_rate": 1.4483395077113485e-05, + "loss": 0.0759, + "step": 21894 + }, + { + "epoch": 2.5963476817265505, + "grad_norm": 1.0041425237287758, + "learning_rate": 1.448121747855532e-05, + "loss": 0.1321, + "step": 21895 + }, + { + "epoch": 2.5964662634886757, + "grad_norm": 0.817323441106573, + "learning_rate": 1.447903997696742e-05, + "loss": 0.1067, + "step": 21896 + }, + { + "epoch": 2.5965848452508005, + "grad_norm": 0.9637536957499933, + "learning_rate": 1.4476862572369881e-05, + "loss": 0.1474, + "step": 21897 + }, + { + "epoch": 2.5967034270129252, + "grad_norm": 0.6044587505601111, + "learning_rate": 1.4474685264782744e-05, + "loss": 0.0946, + "step": 21898 + }, + { + "epoch": 2.5968220087750504, + "grad_norm": 0.44860507084239276, + "learning_rate": 1.4472508054226125e-05, + "loss": 0.061, + "step": 21899 + }, + { + "epoch": 2.5969405905371756, + "grad_norm": 0.5950359425072482, + "learning_rate": 1.447033094072006e-05, + "loss": 0.0815, + "step": 21900 + }, + { + "epoch": 2.5970591722993004, + "grad_norm": 0.6608943024351142, + "learning_rate": 1.4468153924284627e-05, + "loss": 0.1082, + "step": 21901 + }, + { + "epoch": 2.597177754061425, + "grad_norm": 0.5513874937950206, + "learning_rate": 1.4465977004939907e-05, + "loss": 0.0452, + "step": 21902 + }, + { + "epoch": 2.5972963358235504, + "grad_norm": 0.4959952385383308, + "learning_rate": 1.4463800182705956e-05, + "loss": 0.0686, + "step": 21903 + }, + { + "epoch": 2.5974149175856756, + "grad_norm": 0.7476107540597091, + "learning_rate": 1.446162345760286e-05, + "loss": 0.0864, + "step": 21904 + }, + { + "epoch": 2.5975334993478003, + "grad_norm": 0.81857276337261, + "learning_rate": 1.4459446829650664e-05, + "loss": 0.1284, + "step": 21905 + }, + { + "epoch": 2.597652081109925, + "grad_norm": 0.6722180335653745, + "learning_rate": 1.4457270298869439e-05, + "loss": 0.094, + "step": 21906 + }, + { + "epoch": 2.5977706628720503, + "grad_norm": 0.7211016340671558, + "learning_rate": 1.4455093865279254e-05, + "loss": 0.1055, + "step": 21907 + }, + { + "epoch": 2.5978892446341755, + "grad_norm": 0.8658192503839995, + "learning_rate": 1.4452917528900173e-05, + "loss": 0.1221, + "step": 21908 + }, + { + "epoch": 2.5980078263963002, + "grad_norm": 0.5935646706945945, + "learning_rate": 1.4450741289752259e-05, + "loss": 0.0887, + "step": 21909 + }, + { + "epoch": 2.598126408158425, + "grad_norm": 0.6251300078424941, + "learning_rate": 1.4448565147855574e-05, + "loss": 0.0661, + "step": 21910 + }, + { + "epoch": 2.59824498992055, + "grad_norm": 0.7540830159370536, + "learning_rate": 1.444638910323019e-05, + "loss": 0.101, + "step": 21911 + }, + { + "epoch": 2.5983635716826754, + "grad_norm": 0.6999104356863861, + "learning_rate": 1.4444213155896147e-05, + "loss": 0.0946, + "step": 21912 + }, + { + "epoch": 2.5984821534448, + "grad_norm": 0.4537776944361022, + "learning_rate": 1.4442037305873512e-05, + "loss": 0.0581, + "step": 21913 + }, + { + "epoch": 2.598600735206925, + "grad_norm": 0.47561548433033324, + "learning_rate": 1.443986155318235e-05, + "loss": 0.0566, + "step": 21914 + }, + { + "epoch": 2.59871931696905, + "grad_norm": 0.7002137181979591, + "learning_rate": 1.4437685897842718e-05, + "loss": 0.1095, + "step": 21915 + }, + { + "epoch": 2.5988378987311753, + "grad_norm": 0.7600646277041309, + "learning_rate": 1.4435510339874664e-05, + "loss": 0.107, + "step": 21916 + }, + { + "epoch": 2.5989564804933, + "grad_norm": 0.8393710554470967, + "learning_rate": 1.4433334879298249e-05, + "loss": 0.1169, + "step": 21917 + }, + { + "epoch": 2.599075062255425, + "grad_norm": 0.8423453564560103, + "learning_rate": 1.443115951613353e-05, + "loss": 0.0952, + "step": 21918 + }, + { + "epoch": 2.59919364401755, + "grad_norm": 0.7607358007857821, + "learning_rate": 1.4428984250400559e-05, + "loss": 0.0922, + "step": 21919 + }, + { + "epoch": 2.5993122257796752, + "grad_norm": 0.7766283969782733, + "learning_rate": 1.4426809082119392e-05, + "loss": 0.1161, + "step": 21920 + }, + { + "epoch": 2.5994308075418, + "grad_norm": 0.6408704656399453, + "learning_rate": 1.4424634011310079e-05, + "loss": 0.0984, + "step": 21921 + }, + { + "epoch": 2.599549389303925, + "grad_norm": 0.644643684277401, + "learning_rate": 1.4422459037992681e-05, + "loss": 0.0889, + "step": 21922 + }, + { + "epoch": 2.59966797106605, + "grad_norm": 0.7646363635114254, + "learning_rate": 1.4420284162187229e-05, + "loss": 0.1104, + "step": 21923 + }, + { + "epoch": 2.599786552828175, + "grad_norm": 0.46875542066733955, + "learning_rate": 1.441810938391378e-05, + "loss": 0.0625, + "step": 21924 + }, + { + "epoch": 2.5999051345903, + "grad_norm": 0.8271353749623191, + "learning_rate": 1.441593470319239e-05, + "loss": 0.0931, + "step": 21925 + }, + { + "epoch": 2.600023716352425, + "grad_norm": 0.6143227556634457, + "learning_rate": 1.4413760120043115e-05, + "loss": 0.0681, + "step": 21926 + }, + { + "epoch": 2.60014229811455, + "grad_norm": 0.5652176239646199, + "learning_rate": 1.441158563448598e-05, + "loss": 0.0598, + "step": 21927 + }, + { + "epoch": 2.600260879876675, + "grad_norm": 0.7572079283942202, + "learning_rate": 1.4409411246541038e-05, + "loss": 0.0815, + "step": 21928 + }, + { + "epoch": 2.6003794616388, + "grad_norm": 0.5840458806668437, + "learning_rate": 1.4407236956228337e-05, + "loss": 0.0672, + "step": 21929 + }, + { + "epoch": 2.600498043400925, + "grad_norm": 0.5487847173046987, + "learning_rate": 1.4405062763567923e-05, + "loss": 0.078, + "step": 21930 + }, + { + "epoch": 2.60061662516305, + "grad_norm": 0.7255678808337844, + "learning_rate": 1.4402888668579847e-05, + "loss": 0.0787, + "step": 21931 + }, + { + "epoch": 2.600735206925175, + "grad_norm": 0.5757746528575257, + "learning_rate": 1.4400714671284123e-05, + "loss": 0.0563, + "step": 21932 + }, + { + "epoch": 2.6008537886872998, + "grad_norm": 0.6369798685970102, + "learning_rate": 1.4398540771700833e-05, + "loss": 0.0876, + "step": 21933 + }, + { + "epoch": 2.600972370449425, + "grad_norm": 1.1328039052713466, + "learning_rate": 1.4396366969849987e-05, + "loss": 0.1532, + "step": 21934 + }, + { + "epoch": 2.6010909522115497, + "grad_norm": 0.5510183329627586, + "learning_rate": 1.4394193265751631e-05, + "loss": 0.0809, + "step": 21935 + }, + { + "epoch": 2.601209533973675, + "grad_norm": 0.6855714177168133, + "learning_rate": 1.4392019659425813e-05, + "loss": 0.0882, + "step": 21936 + }, + { + "epoch": 2.6013281157357997, + "grad_norm": 0.4818063336760342, + "learning_rate": 1.4389846150892572e-05, + "loss": 0.067, + "step": 21937 + }, + { + "epoch": 2.601446697497925, + "grad_norm": 0.4500511656433213, + "learning_rate": 1.4387672740171926e-05, + "loss": 0.0506, + "step": 21938 + }, + { + "epoch": 2.6015652792600497, + "grad_norm": 0.565833215314141, + "learning_rate": 1.4385499427283927e-05, + "loss": 0.0745, + "step": 21939 + }, + { + "epoch": 2.601683861022175, + "grad_norm": 0.7891899105527502, + "learning_rate": 1.4383326212248607e-05, + "loss": 0.1173, + "step": 21940 + }, + { + "epoch": 2.6018024427842996, + "grad_norm": 0.4390213237851913, + "learning_rate": 1.4381153095086e-05, + "loss": 0.0516, + "step": 21941 + }, + { + "epoch": 2.601921024546425, + "grad_norm": 0.832882930714744, + "learning_rate": 1.4378980075816152e-05, + "loss": 0.1084, + "step": 21942 + }, + { + "epoch": 2.6020396063085496, + "grad_norm": 0.6157465995086525, + "learning_rate": 1.4376807154459057e-05, + "loss": 0.0561, + "step": 21943 + }, + { + "epoch": 2.602158188070675, + "grad_norm": 1.0238998496398375, + "learning_rate": 1.4374634331034798e-05, + "loss": 0.1251, + "step": 21944 + }, + { + "epoch": 2.6022767698328, + "grad_norm": 0.43801504308296424, + "learning_rate": 1.437246160556337e-05, + "loss": 0.056, + "step": 21945 + }, + { + "epoch": 2.6023953515949247, + "grad_norm": 0.6480094903932844, + "learning_rate": 1.4370288978064822e-05, + "loss": 0.098, + "step": 21946 + }, + { + "epoch": 2.6025139333570495, + "grad_norm": 0.8342263616727594, + "learning_rate": 1.4368116448559154e-05, + "loss": 0.127, + "step": 21947 + }, + { + "epoch": 2.6026325151191747, + "grad_norm": 0.7280111786339267, + "learning_rate": 1.4365944017066438e-05, + "loss": 0.096, + "step": 21948 + }, + { + "epoch": 2.6027510968813, + "grad_norm": 0.7756930917084097, + "learning_rate": 1.4363771683606666e-05, + "loss": 0.0889, + "step": 21949 + }, + { + "epoch": 2.6028696786434247, + "grad_norm": 0.4282781177434303, + "learning_rate": 1.4361599448199875e-05, + "loss": 0.063, + "step": 21950 + }, + { + "epoch": 2.6029882604055494, + "grad_norm": 1.00980511020672, + "learning_rate": 1.4359427310866095e-05, + "loss": 0.085, + "step": 21951 + }, + { + "epoch": 2.6031068421676746, + "grad_norm": 0.7939143629600948, + "learning_rate": 1.4357255271625344e-05, + "loss": 0.1146, + "step": 21952 + }, + { + "epoch": 2.6032254239298, + "grad_norm": 0.6210729608485229, + "learning_rate": 1.435508333049766e-05, + "loss": 0.0652, + "step": 21953 + }, + { + "epoch": 2.6033440056919246, + "grad_norm": 0.45851687091808857, + "learning_rate": 1.4352911487503033e-05, + "loss": 0.0539, + "step": 21954 + }, + { + "epoch": 2.6034625874540493, + "grad_norm": 0.8487725312404782, + "learning_rate": 1.4350739742661523e-05, + "loss": 0.108, + "step": 21955 + }, + { + "epoch": 2.6035811692161746, + "grad_norm": 0.571299010340367, + "learning_rate": 1.4348568095993128e-05, + "loss": 0.0728, + "step": 21956 + }, + { + "epoch": 2.6036997509782998, + "grad_norm": 0.6429748982808718, + "learning_rate": 1.4346396547517872e-05, + "loss": 0.0901, + "step": 21957 + }, + { + "epoch": 2.6038183327404245, + "grad_norm": 0.6871948818640818, + "learning_rate": 1.4344225097255775e-05, + "loss": 0.0884, + "step": 21958 + }, + { + "epoch": 2.6039369145025493, + "grad_norm": 0.5465864760181628, + "learning_rate": 1.4342053745226857e-05, + "loss": 0.0668, + "step": 21959 + }, + { + "epoch": 2.6040554962646745, + "grad_norm": 0.7065958871326251, + "learning_rate": 1.433988249145114e-05, + "loss": 0.1228, + "step": 21960 + }, + { + "epoch": 2.6041740780267997, + "grad_norm": 0.7608516475750442, + "learning_rate": 1.4337711335948629e-05, + "loss": 0.1064, + "step": 21961 + }, + { + "epoch": 2.6042926597889244, + "grad_norm": 0.6477986701724795, + "learning_rate": 1.4335540278739342e-05, + "loss": 0.069, + "step": 21962 + }, + { + "epoch": 2.604411241551049, + "grad_norm": 0.7973911132692165, + "learning_rate": 1.4333369319843298e-05, + "loss": 0.1099, + "step": 21963 + }, + { + "epoch": 2.6045298233131744, + "grad_norm": 0.43892974169749177, + "learning_rate": 1.4331198459280514e-05, + "loss": 0.0731, + "step": 21964 + }, + { + "epoch": 2.6046484050752996, + "grad_norm": 0.47568498593935515, + "learning_rate": 1.4329027697070982e-05, + "loss": 0.0636, + "step": 21965 + }, + { + "epoch": 2.6047669868374244, + "grad_norm": 0.6652970359918451, + "learning_rate": 1.4326857033234748e-05, + "loss": 0.0782, + "step": 21966 + }, + { + "epoch": 2.604885568599549, + "grad_norm": 0.6876341742636708, + "learning_rate": 1.4324686467791792e-05, + "loss": 0.1046, + "step": 21967 + }, + { + "epoch": 2.6050041503616743, + "grad_norm": 0.7700344042803501, + "learning_rate": 1.4322516000762137e-05, + "loss": 0.1185, + "step": 21968 + }, + { + "epoch": 2.6051227321237995, + "grad_norm": 0.4763278630152302, + "learning_rate": 1.4320345632165793e-05, + "loss": 0.0683, + "step": 21969 + }, + { + "epoch": 2.6052413138859243, + "grad_norm": 0.8046687184194397, + "learning_rate": 1.4318175362022762e-05, + "loss": 0.1074, + "step": 21970 + }, + { + "epoch": 2.6053598956480495, + "grad_norm": 0.6572968857727476, + "learning_rate": 1.4316005190353069e-05, + "loss": 0.0802, + "step": 21971 + }, + { + "epoch": 2.6054784774101742, + "grad_norm": 0.6869110047228827, + "learning_rate": 1.4313835117176694e-05, + "loss": 0.0885, + "step": 21972 + }, + { + "epoch": 2.6055970591722994, + "grad_norm": 0.7370054033158944, + "learning_rate": 1.4311665142513656e-05, + "loss": 0.0914, + "step": 21973 + }, + { + "epoch": 2.605715640934424, + "grad_norm": 0.7933736332902336, + "learning_rate": 1.4309495266383958e-05, + "loss": 0.114, + "step": 21974 + }, + { + "epoch": 2.6058342226965494, + "grad_norm": 0.7626327331658775, + "learning_rate": 1.4307325488807616e-05, + "loss": 0.091, + "step": 21975 + }, + { + "epoch": 2.605952804458674, + "grad_norm": 0.6335595301907293, + "learning_rate": 1.4305155809804608e-05, + "loss": 0.076, + "step": 21976 + }, + { + "epoch": 2.6060713862207994, + "grad_norm": 0.6520711288707952, + "learning_rate": 1.4302986229394949e-05, + "loss": 0.1014, + "step": 21977 + }, + { + "epoch": 2.606189967982924, + "grad_norm": 0.6074123041009495, + "learning_rate": 1.4300816747598639e-05, + "loss": 0.0824, + "step": 21978 + }, + { + "epoch": 2.6063085497450493, + "grad_norm": 1.0066834251479257, + "learning_rate": 1.429864736443568e-05, + "loss": 0.1294, + "step": 21979 + }, + { + "epoch": 2.606427131507174, + "grad_norm": 0.6652064684411911, + "learning_rate": 1.4296478079926067e-05, + "loss": 0.107, + "step": 21980 + }, + { + "epoch": 2.6065457132692993, + "grad_norm": 0.7353842707089959, + "learning_rate": 1.4294308894089803e-05, + "loss": 0.1129, + "step": 21981 + }, + { + "epoch": 2.606664295031424, + "grad_norm": 0.6634663998500046, + "learning_rate": 1.4292139806946894e-05, + "loss": 0.0752, + "step": 21982 + }, + { + "epoch": 2.6067828767935493, + "grad_norm": 0.6868012436688146, + "learning_rate": 1.4289970818517312e-05, + "loss": 0.1075, + "step": 21983 + }, + { + "epoch": 2.606901458555674, + "grad_norm": 0.863441761624392, + "learning_rate": 1.4287801928821068e-05, + "loss": 0.1009, + "step": 21984 + }, + { + "epoch": 2.607020040317799, + "grad_norm": 0.7747307251873706, + "learning_rate": 1.4285633137878152e-05, + "loss": 0.1081, + "step": 21985 + }, + { + "epoch": 2.607138622079924, + "grad_norm": 0.7454826861950284, + "learning_rate": 1.4283464445708572e-05, + "loss": 0.088, + "step": 21986 + }, + { + "epoch": 2.607257203842049, + "grad_norm": 0.6552469762355371, + "learning_rate": 1.4281295852332297e-05, + "loss": 0.081, + "step": 21987 + }, + { + "epoch": 2.607375785604174, + "grad_norm": 0.9688757896919075, + "learning_rate": 1.4279127357769328e-05, + "loss": 0.1298, + "step": 21988 + }, + { + "epoch": 2.607494367366299, + "grad_norm": 0.6318960274674293, + "learning_rate": 1.427695896203966e-05, + "loss": 0.1037, + "step": 21989 + }, + { + "epoch": 2.607612949128424, + "grad_norm": 0.7072657134780546, + "learning_rate": 1.4274790665163281e-05, + "loss": 0.0971, + "step": 21990 + }, + { + "epoch": 2.607731530890549, + "grad_norm": 0.4987905580284949, + "learning_rate": 1.427262246716019e-05, + "loss": 0.0686, + "step": 21991 + }, + { + "epoch": 2.607850112652674, + "grad_norm": 0.6960246929704086, + "learning_rate": 1.4270454368050345e-05, + "loss": 0.0993, + "step": 21992 + }, + { + "epoch": 2.607968694414799, + "grad_norm": 0.858453407148519, + "learning_rate": 1.4268286367853773e-05, + "loss": 0.1089, + "step": 21993 + }, + { + "epoch": 2.6080872761769243, + "grad_norm": 0.6508930480242765, + "learning_rate": 1.4266118466590433e-05, + "loss": 0.1034, + "step": 21994 + }, + { + "epoch": 2.608205857939049, + "grad_norm": 1.0798617016953551, + "learning_rate": 1.4263950664280314e-05, + "loss": 0.1197, + "step": 21995 + }, + { + "epoch": 2.608324439701174, + "grad_norm": 0.6551486599253281, + "learning_rate": 1.4261782960943407e-05, + "loss": 0.0878, + "step": 21996 + }, + { + "epoch": 2.608443021463299, + "grad_norm": 0.3718019515094267, + "learning_rate": 1.4259615356599704e-05, + "loss": 0.0534, + "step": 21997 + }, + { + "epoch": 2.608561603225424, + "grad_norm": 0.8029631478372832, + "learning_rate": 1.4257447851269163e-05, + "loss": 0.1027, + "step": 21998 + }, + { + "epoch": 2.608680184987549, + "grad_norm": 0.608005073678563, + "learning_rate": 1.425528044497178e-05, + "loss": 0.0692, + "step": 21999 + }, + { + "epoch": 2.6087987667496737, + "grad_norm": 0.5077167697640207, + "learning_rate": 1.4253113137727538e-05, + "loss": 0.0853, + "step": 22000 + }, + { + "epoch": 2.608917348511799, + "grad_norm": 0.6519510863812491, + "learning_rate": 1.4250945929556414e-05, + "loss": 0.0933, + "step": 22001 + }, + { + "epoch": 2.609035930273924, + "grad_norm": 0.7530685576257208, + "learning_rate": 1.4248778820478395e-05, + "loss": 0.1075, + "step": 22002 + }, + { + "epoch": 2.609154512036049, + "grad_norm": 0.6632582899756148, + "learning_rate": 1.4246611810513432e-05, + "loss": 0.1006, + "step": 22003 + }, + { + "epoch": 2.6092730937981736, + "grad_norm": 0.8692552724377568, + "learning_rate": 1.4244444899681541e-05, + "loss": 0.1206, + "step": 22004 + }, + { + "epoch": 2.609391675560299, + "grad_norm": 0.37423051511025546, + "learning_rate": 1.424227808800267e-05, + "loss": 0.0512, + "step": 22005 + }, + { + "epoch": 2.609510257322424, + "grad_norm": 0.5827624088149662, + "learning_rate": 1.4240111375496803e-05, + "loss": 0.0781, + "step": 22006 + }, + { + "epoch": 2.609628839084549, + "grad_norm": 0.8349939495767131, + "learning_rate": 1.4237944762183914e-05, + "loss": 0.0949, + "step": 22007 + }, + { + "epoch": 2.6097474208466735, + "grad_norm": 0.5017979559421278, + "learning_rate": 1.4235778248083975e-05, + "loss": 0.0825, + "step": 22008 + }, + { + "epoch": 2.6098660026087988, + "grad_norm": 0.6201845822195293, + "learning_rate": 1.4233611833216975e-05, + "loss": 0.0868, + "step": 22009 + }, + { + "epoch": 2.609984584370924, + "grad_norm": 0.6589119723773808, + "learning_rate": 1.4231445517602849e-05, + "loss": 0.1032, + "step": 22010 + }, + { + "epoch": 2.6101031661330487, + "grad_norm": 0.8378868432393067, + "learning_rate": 1.4229279301261608e-05, + "loss": 0.1373, + "step": 22011 + }, + { + "epoch": 2.6102217478951735, + "grad_norm": 0.6032792385627211, + "learning_rate": 1.4227113184213198e-05, + "loss": 0.1029, + "step": 22012 + }, + { + "epoch": 2.6103403296572987, + "grad_norm": 1.206624222905882, + "learning_rate": 1.4224947166477597e-05, + "loss": 0.178, + "step": 22013 + }, + { + "epoch": 2.610458911419424, + "grad_norm": 0.6933874492398453, + "learning_rate": 1.4222781248074758e-05, + "loss": 0.1075, + "step": 22014 + }, + { + "epoch": 2.6105774931815486, + "grad_norm": 0.9234596145340672, + "learning_rate": 1.4220615429024675e-05, + "loss": 0.127, + "step": 22015 + }, + { + "epoch": 2.6106960749436734, + "grad_norm": 0.3973387888002447, + "learning_rate": 1.4218449709347293e-05, + "loss": 0.0613, + "step": 22016 + }, + { + "epoch": 2.6108146567057986, + "grad_norm": 0.696097475169207, + "learning_rate": 1.4216284089062581e-05, + "loss": 0.0808, + "step": 22017 + }, + { + "epoch": 2.610933238467924, + "grad_norm": 0.5859736349945868, + "learning_rate": 1.4214118568190505e-05, + "loss": 0.0906, + "step": 22018 + }, + { + "epoch": 2.6110518202300486, + "grad_norm": 0.659072906313902, + "learning_rate": 1.4211953146751028e-05, + "loss": 0.0685, + "step": 22019 + }, + { + "epoch": 2.6111704019921738, + "grad_norm": 0.7545274015083119, + "learning_rate": 1.4209787824764126e-05, + "loss": 0.1192, + "step": 22020 + }, + { + "epoch": 2.6112889837542985, + "grad_norm": 0.8126813519890249, + "learning_rate": 1.4207622602249732e-05, + "loss": 0.135, + "step": 22021 + }, + { + "epoch": 2.6114075655164237, + "grad_norm": 0.7879401780304492, + "learning_rate": 1.4205457479227839e-05, + "loss": 0.1172, + "step": 22022 + }, + { + "epoch": 2.6115261472785485, + "grad_norm": 0.8057281858446868, + "learning_rate": 1.420329245571838e-05, + "loss": 0.0952, + "step": 22023 + }, + { + "epoch": 2.6116447290406737, + "grad_norm": 0.5951317139414, + "learning_rate": 1.4201127531741337e-05, + "loss": 0.0715, + "step": 22024 + }, + { + "epoch": 2.6117633108027984, + "grad_norm": 0.636747818244709, + "learning_rate": 1.4198962707316635e-05, + "loss": 0.0661, + "step": 22025 + }, + { + "epoch": 2.6118818925649236, + "grad_norm": 0.6460736343122466, + "learning_rate": 1.4196797982464272e-05, + "loss": 0.097, + "step": 22026 + }, + { + "epoch": 2.6120004743270484, + "grad_norm": 0.6414859164814994, + "learning_rate": 1.4194633357204173e-05, + "loss": 0.0926, + "step": 22027 + }, + { + "epoch": 2.6121190560891736, + "grad_norm": 0.5986514588944696, + "learning_rate": 1.4192468831556304e-05, + "loss": 0.0885, + "step": 22028 + }, + { + "epoch": 2.6122376378512984, + "grad_norm": 0.4950773419880529, + "learning_rate": 1.419030440554062e-05, + "loss": 0.0834, + "step": 22029 + }, + { + "epoch": 2.6123562196134236, + "grad_norm": 0.45262106066950664, + "learning_rate": 1.4188140079177072e-05, + "loss": 0.0666, + "step": 22030 + }, + { + "epoch": 2.6124748013755483, + "grad_norm": 0.5863136558830888, + "learning_rate": 1.4185975852485628e-05, + "loss": 0.0759, + "step": 22031 + }, + { + "epoch": 2.6125933831376735, + "grad_norm": 0.5438509320253987, + "learning_rate": 1.4183811725486213e-05, + "loss": 0.0798, + "step": 22032 + }, + { + "epoch": 2.6127119648997983, + "grad_norm": 0.8348074449100498, + "learning_rate": 1.4181647698198791e-05, + "loss": 0.1278, + "step": 22033 + }, + { + "epoch": 2.6128305466619235, + "grad_norm": 0.6310382253071126, + "learning_rate": 1.4179483770643309e-05, + "loss": 0.0892, + "step": 22034 + }, + { + "epoch": 2.6129491284240483, + "grad_norm": 0.5378481003319695, + "learning_rate": 1.4177319942839728e-05, + "loss": 0.084, + "step": 22035 + }, + { + "epoch": 2.6130677101861735, + "grad_norm": 0.6085625805121494, + "learning_rate": 1.4175156214807966e-05, + "loss": 0.094, + "step": 22036 + }, + { + "epoch": 2.613186291948298, + "grad_norm": 0.586867649946835, + "learning_rate": 1.4172992586568013e-05, + "loss": 0.0762, + "step": 22037 + }, + { + "epoch": 2.6133048737104234, + "grad_norm": 0.8887690859236409, + "learning_rate": 1.4170829058139775e-05, + "loss": 0.1368, + "step": 22038 + }, + { + "epoch": 2.613423455472548, + "grad_norm": 0.45566621924585266, + "learning_rate": 1.4168665629543218e-05, + "loss": 0.0696, + "step": 22039 + }, + { + "epoch": 2.6135420372346734, + "grad_norm": 0.7466442645651531, + "learning_rate": 1.4166502300798279e-05, + "loss": 0.0941, + "step": 22040 + }, + { + "epoch": 2.613660618996798, + "grad_norm": 1.3313633301940182, + "learning_rate": 1.4164339071924906e-05, + "loss": 0.1287, + "step": 22041 + }, + { + "epoch": 2.6137792007589233, + "grad_norm": 0.33963971941990795, + "learning_rate": 1.4162175942943048e-05, + "loss": 0.0413, + "step": 22042 + }, + { + "epoch": 2.613897782521048, + "grad_norm": 0.9437286618295434, + "learning_rate": 1.4160012913872628e-05, + "loss": 0.1296, + "step": 22043 + }, + { + "epoch": 2.6140163642831733, + "grad_norm": 0.7776980464062251, + "learning_rate": 1.4157849984733595e-05, + "loss": 0.105, + "step": 22044 + }, + { + "epoch": 2.614134946045298, + "grad_norm": 0.49808928722597695, + "learning_rate": 1.4155687155545893e-05, + "loss": 0.0691, + "step": 22045 + }, + { + "epoch": 2.6142535278074233, + "grad_norm": 0.6025463951437224, + "learning_rate": 1.4153524426329461e-05, + "loss": 0.0878, + "step": 22046 + }, + { + "epoch": 2.6143721095695485, + "grad_norm": 0.43384403483705175, + "learning_rate": 1.4151361797104229e-05, + "loss": 0.0649, + "step": 22047 + }, + { + "epoch": 2.6144906913316732, + "grad_norm": 0.3993971393882321, + "learning_rate": 1.4149199267890133e-05, + "loss": 0.0467, + "step": 22048 + }, + { + "epoch": 2.614609273093798, + "grad_norm": 0.7310734113482398, + "learning_rate": 1.414703683870712e-05, + "loss": 0.1037, + "step": 22049 + }, + { + "epoch": 2.614727854855923, + "grad_norm": 0.42887847621311836, + "learning_rate": 1.4144874509575112e-05, + "loss": 0.0526, + "step": 22050 + }, + { + "epoch": 2.6148464366180484, + "grad_norm": 0.6403932077826392, + "learning_rate": 1.4142712280514053e-05, + "loss": 0.0968, + "step": 22051 + }, + { + "epoch": 2.614965018380173, + "grad_norm": 0.6661826163351403, + "learning_rate": 1.4140550151543874e-05, + "loss": 0.0984, + "step": 22052 + }, + { + "epoch": 2.615083600142298, + "grad_norm": 0.4466440038848482, + "learning_rate": 1.4138388122684515e-05, + "loss": 0.0503, + "step": 22053 + }, + { + "epoch": 2.615202181904423, + "grad_norm": 0.9054708551669358, + "learning_rate": 1.4136226193955888e-05, + "loss": 0.1356, + "step": 22054 + }, + { + "epoch": 2.6153207636665483, + "grad_norm": 0.670468585587432, + "learning_rate": 1.413406436537793e-05, + "loss": 0.1093, + "step": 22055 + }, + { + "epoch": 2.615439345428673, + "grad_norm": 0.557879932887071, + "learning_rate": 1.4131902636970575e-05, + "loss": 0.0617, + "step": 22056 + }, + { + "epoch": 2.615557927190798, + "grad_norm": 0.6936947291936187, + "learning_rate": 1.4129741008753755e-05, + "loss": 0.0976, + "step": 22057 + }, + { + "epoch": 2.615676508952923, + "grad_norm": 0.8842025207013137, + "learning_rate": 1.41275794807474e-05, + "loss": 0.1049, + "step": 22058 + }, + { + "epoch": 2.6157950907150482, + "grad_norm": 0.49721401181884883, + "learning_rate": 1.4125418052971407e-05, + "loss": 0.0676, + "step": 22059 + }, + { + "epoch": 2.615913672477173, + "grad_norm": 0.9241849166727963, + "learning_rate": 1.4123256725445747e-05, + "loss": 0.1305, + "step": 22060 + }, + { + "epoch": 2.6160322542392977, + "grad_norm": 0.6968014938969033, + "learning_rate": 1.4121095498190313e-05, + "loss": 0.1008, + "step": 22061 + }, + { + "epoch": 2.616150836001423, + "grad_norm": 0.4150478859714025, + "learning_rate": 1.4118934371225045e-05, + "loss": 0.0575, + "step": 22062 + }, + { + "epoch": 2.616269417763548, + "grad_norm": 0.5677525816627891, + "learning_rate": 1.411677334456984e-05, + "loss": 0.0838, + "step": 22063 + }, + { + "epoch": 2.616387999525673, + "grad_norm": 0.5824702041921683, + "learning_rate": 1.411461241824466e-05, + "loss": 0.0689, + "step": 22064 + }, + { + "epoch": 2.6165065812877977, + "grad_norm": 0.6662346225076204, + "learning_rate": 1.4112451592269393e-05, + "loss": 0.0856, + "step": 22065 + }, + { + "epoch": 2.616625163049923, + "grad_norm": 0.725868937504414, + "learning_rate": 1.4110290866663973e-05, + "loss": 0.0944, + "step": 22066 + }, + { + "epoch": 2.616743744812048, + "grad_norm": 0.8628552457425012, + "learning_rate": 1.4108130241448314e-05, + "loss": 0.1119, + "step": 22067 + }, + { + "epoch": 2.616862326574173, + "grad_norm": 0.8290815162911118, + "learning_rate": 1.4105969716642342e-05, + "loss": 0.1126, + "step": 22068 + }, + { + "epoch": 2.616980908336298, + "grad_norm": 0.45724158994630126, + "learning_rate": 1.4103809292265979e-05, + "loss": 0.0624, + "step": 22069 + }, + { + "epoch": 2.617099490098423, + "grad_norm": 0.8173260661929446, + "learning_rate": 1.410164896833911e-05, + "loss": 0.12, + "step": 22070 + }, + { + "epoch": 2.617218071860548, + "grad_norm": 0.5891285068889657, + "learning_rate": 1.4099488744881695e-05, + "loss": 0.0793, + "step": 22071 + }, + { + "epoch": 2.6173366536226728, + "grad_norm": 0.9957060998999003, + "learning_rate": 1.4097328621913616e-05, + "loss": 0.0987, + "step": 22072 + }, + { + "epoch": 2.617455235384798, + "grad_norm": 0.8345897874678252, + "learning_rate": 1.4095168599454808e-05, + "loss": 0.1216, + "step": 22073 + }, + { + "epoch": 2.6175738171469227, + "grad_norm": 0.8425111960352636, + "learning_rate": 1.4093008677525155e-05, + "loss": 0.1122, + "step": 22074 + }, + { + "epoch": 2.617692398909048, + "grad_norm": 0.7685647131298633, + "learning_rate": 1.4090848856144603e-05, + "loss": 0.1128, + "step": 22075 + }, + { + "epoch": 2.6178109806711727, + "grad_norm": 1.187519512795012, + "learning_rate": 1.4088689135333039e-05, + "loss": 0.1396, + "step": 22076 + }, + { + "epoch": 2.617929562433298, + "grad_norm": 0.6053539248092762, + "learning_rate": 1.408652951511038e-05, + "loss": 0.0605, + "step": 22077 + }, + { + "epoch": 2.6180481441954226, + "grad_norm": 0.6977890849943897, + "learning_rate": 1.4084369995496537e-05, + "loss": 0.1021, + "step": 22078 + }, + { + "epoch": 2.618166725957548, + "grad_norm": 0.7414447879185294, + "learning_rate": 1.4082210576511418e-05, + "loss": 0.103, + "step": 22079 + }, + { + "epoch": 2.6182853077196726, + "grad_norm": 0.7797282385648604, + "learning_rate": 1.4080051258174936e-05, + "loss": 0.0934, + "step": 22080 + }, + { + "epoch": 2.618403889481798, + "grad_norm": 0.4645408528289416, + "learning_rate": 1.4077892040506973e-05, + "loss": 0.0644, + "step": 22081 + }, + { + "epoch": 2.6185224712439226, + "grad_norm": 0.695107456979117, + "learning_rate": 1.4075732923527474e-05, + "loss": 0.0915, + "step": 22082 + }, + { + "epoch": 2.6186410530060478, + "grad_norm": 0.933684612310306, + "learning_rate": 1.4073573907256305e-05, + "loss": 0.1398, + "step": 22083 + }, + { + "epoch": 2.6187596347681725, + "grad_norm": 0.5130376230187196, + "learning_rate": 1.40714149917134e-05, + "loss": 0.0699, + "step": 22084 + }, + { + "epoch": 2.6188782165302977, + "grad_norm": 0.7611352087072213, + "learning_rate": 1.4069256176918627e-05, + "loss": 0.1071, + "step": 22085 + }, + { + "epoch": 2.6189967982924225, + "grad_norm": 0.4084123432899418, + "learning_rate": 1.4067097462891926e-05, + "loss": 0.064, + "step": 22086 + }, + { + "epoch": 2.6191153800545477, + "grad_norm": 0.5315349710766702, + "learning_rate": 1.406493884965317e-05, + "loss": 0.0659, + "step": 22087 + }, + { + "epoch": 2.6192339618166725, + "grad_norm": 0.9267917092551091, + "learning_rate": 1.406278033722227e-05, + "loss": 0.1384, + "step": 22088 + }, + { + "epoch": 2.6193525435787977, + "grad_norm": 0.674446128634552, + "learning_rate": 1.4060621925619122e-05, + "loss": 0.0866, + "step": 22089 + }, + { + "epoch": 2.6194711253409224, + "grad_norm": 0.7215348650471252, + "learning_rate": 1.4058463614863627e-05, + "loss": 0.0997, + "step": 22090 + }, + { + "epoch": 2.6195897071030476, + "grad_norm": 0.4896739959868906, + "learning_rate": 1.405630540497569e-05, + "loss": 0.0707, + "step": 22091 + }, + { + "epoch": 2.6197082888651724, + "grad_norm": 1.2320983608961948, + "learning_rate": 1.4054147295975174e-05, + "loss": 0.122, + "step": 22092 + }, + { + "epoch": 2.6198268706272976, + "grad_norm": 0.7910570985542978, + "learning_rate": 1.4051989287882016e-05, + "loss": 0.1068, + "step": 22093 + }, + { + "epoch": 2.6199454523894223, + "grad_norm": 0.5944947965236823, + "learning_rate": 1.4049831380716089e-05, + "loss": 0.0765, + "step": 22094 + }, + { + "epoch": 2.6200640341515475, + "grad_norm": 0.7900748295899002, + "learning_rate": 1.4047673574497292e-05, + "loss": 0.1025, + "step": 22095 + }, + { + "epoch": 2.6201826159136727, + "grad_norm": 0.8484444588310283, + "learning_rate": 1.4045515869245496e-05, + "loss": 0.1282, + "step": 22096 + }, + { + "epoch": 2.6203011976757975, + "grad_norm": 0.5091275260517677, + "learning_rate": 1.404335826498063e-05, + "loss": 0.078, + "step": 22097 + }, + { + "epoch": 2.6204197794379223, + "grad_norm": 0.5256236901404228, + "learning_rate": 1.4041200761722556e-05, + "loss": 0.0582, + "step": 22098 + }, + { + "epoch": 2.6205383612000475, + "grad_norm": 0.9043511999296037, + "learning_rate": 1.4039043359491172e-05, + "loss": 0.1259, + "step": 22099 + }, + { + "epoch": 2.6206569429621727, + "grad_norm": 0.54798289071529, + "learning_rate": 1.4036886058306364e-05, + "loss": 0.0815, + "step": 22100 + }, + { + "epoch": 2.6207755247242974, + "grad_norm": 0.6936639019885543, + "learning_rate": 1.4034728858188029e-05, + "loss": 0.0915, + "step": 22101 + }, + { + "epoch": 2.620894106486422, + "grad_norm": 0.45931446433298817, + "learning_rate": 1.4032571759156051e-05, + "loss": 0.0682, + "step": 22102 + }, + { + "epoch": 2.6210126882485474, + "grad_norm": 0.9087489195381161, + "learning_rate": 1.4030414761230304e-05, + "loss": 0.1403, + "step": 22103 + }, + { + "epoch": 2.6211312700106726, + "grad_norm": 0.5179112105506419, + "learning_rate": 1.4028257864430682e-05, + "loss": 0.0675, + "step": 22104 + }, + { + "epoch": 2.6212498517727973, + "grad_norm": 0.6644394434462044, + "learning_rate": 1.402610106877707e-05, + "loss": 0.1117, + "step": 22105 + }, + { + "epoch": 2.621368433534922, + "grad_norm": 0.7344412545105654, + "learning_rate": 1.4023944374289358e-05, + "loss": 0.0993, + "step": 22106 + }, + { + "epoch": 2.6214870152970473, + "grad_norm": 0.7893790326316028, + "learning_rate": 1.40217877809874e-05, + "loss": 0.104, + "step": 22107 + }, + { + "epoch": 2.6216055970591725, + "grad_norm": 0.4613142734995136, + "learning_rate": 1.401963128889111e-05, + "loss": 0.0616, + "step": 22108 + }, + { + "epoch": 2.6217241788212973, + "grad_norm": 0.44862839606781835, + "learning_rate": 1.401747489802035e-05, + "loss": 0.0582, + "step": 22109 + }, + { + "epoch": 2.621842760583422, + "grad_norm": 0.9686913155559255, + "learning_rate": 1.4015318608395006e-05, + "loss": 0.1282, + "step": 22110 + }, + { + "epoch": 2.6219613423455472, + "grad_norm": 0.6818754974363416, + "learning_rate": 1.4013162420034947e-05, + "loss": 0.0846, + "step": 22111 + }, + { + "epoch": 2.6220799241076724, + "grad_norm": 0.6098538300646913, + "learning_rate": 1.4011006332960064e-05, + "loss": 0.0886, + "step": 22112 + }, + { + "epoch": 2.622198505869797, + "grad_norm": 0.7095376129649297, + "learning_rate": 1.4008850347190234e-05, + "loss": 0.1306, + "step": 22113 + }, + { + "epoch": 2.622317087631922, + "grad_norm": 0.7763266087302444, + "learning_rate": 1.4006694462745318e-05, + "loss": 0.0853, + "step": 22114 + }, + { + "epoch": 2.622435669394047, + "grad_norm": 0.8839987414679412, + "learning_rate": 1.4004538679645197e-05, + "loss": 0.1295, + "step": 22115 + }, + { + "epoch": 2.6225542511561724, + "grad_norm": 0.6589532805745818, + "learning_rate": 1.4002382997909743e-05, + "loss": 0.0891, + "step": 22116 + }, + { + "epoch": 2.622672832918297, + "grad_norm": 0.5212228359633704, + "learning_rate": 1.4000227417558834e-05, + "loss": 0.0751, + "step": 22117 + }, + { + "epoch": 2.6227914146804223, + "grad_norm": 0.5788874289351714, + "learning_rate": 1.399807193861235e-05, + "loss": 0.0665, + "step": 22118 + }, + { + "epoch": 2.622909996442547, + "grad_norm": 0.8174172432377894, + "learning_rate": 1.399591656109013e-05, + "loss": 0.0866, + "step": 22119 + }, + { + "epoch": 2.6230285782046723, + "grad_norm": 0.626944515945624, + "learning_rate": 1.3993761285012085e-05, + "loss": 0.0898, + "step": 22120 + }, + { + "epoch": 2.623147159966797, + "grad_norm": 0.41718093694748926, + "learning_rate": 1.3991606110398056e-05, + "loss": 0.0575, + "step": 22121 + }, + { + "epoch": 2.6232657417289222, + "grad_norm": 0.78267180733908, + "learning_rate": 1.3989451037267918e-05, + "loss": 0.0943, + "step": 22122 + }, + { + "epoch": 2.623384323491047, + "grad_norm": 0.7604390336073532, + "learning_rate": 1.398729606564154e-05, + "loss": 0.0914, + "step": 22123 + }, + { + "epoch": 2.623502905253172, + "grad_norm": 0.7439342274590975, + "learning_rate": 1.3985141195538797e-05, + "loss": 0.0838, + "step": 22124 + }, + { + "epoch": 2.623621487015297, + "grad_norm": 0.9279847640735366, + "learning_rate": 1.3982986426979534e-05, + "loss": 0.1451, + "step": 22125 + }, + { + "epoch": 2.623740068777422, + "grad_norm": 0.6122406797529125, + "learning_rate": 1.3980831759983625e-05, + "loss": 0.0955, + "step": 22126 + }, + { + "epoch": 2.623858650539547, + "grad_norm": 0.443038232159116, + "learning_rate": 1.3978677194570938e-05, + "loss": 0.0642, + "step": 22127 + }, + { + "epoch": 2.623977232301672, + "grad_norm": 0.8851124787130357, + "learning_rate": 1.3976522730761332e-05, + "loss": 0.1, + "step": 22128 + }, + { + "epoch": 2.624095814063797, + "grad_norm": 0.9078638729991346, + "learning_rate": 1.3974368368574678e-05, + "loss": 0.1036, + "step": 22129 + }, + { + "epoch": 2.624214395825922, + "grad_norm": 0.9007282628903428, + "learning_rate": 1.3972214108030807e-05, + "loss": 0.1337, + "step": 22130 + }, + { + "epoch": 2.624332977588047, + "grad_norm": 0.4920367164653129, + "learning_rate": 1.3970059949149616e-05, + "loss": 0.0673, + "step": 22131 + }, + { + "epoch": 2.624451559350172, + "grad_norm": 0.5486185623107377, + "learning_rate": 1.396790589195094e-05, + "loss": 0.0721, + "step": 22132 + }, + { + "epoch": 2.624570141112297, + "grad_norm": 0.6857830241260149, + "learning_rate": 1.3965751936454651e-05, + "loss": 0.1077, + "step": 22133 + }, + { + "epoch": 2.624688722874422, + "grad_norm": 0.9104489564791624, + "learning_rate": 1.396359808268058e-05, + "loss": 0.1235, + "step": 22134 + }, + { + "epoch": 2.6248073046365468, + "grad_norm": 0.7978423069121005, + "learning_rate": 1.396144433064862e-05, + "loss": 0.0873, + "step": 22135 + }, + { + "epoch": 2.624925886398672, + "grad_norm": 0.7950951662726007, + "learning_rate": 1.3959290680378594e-05, + "loss": 0.1041, + "step": 22136 + }, + { + "epoch": 2.6250444681607967, + "grad_norm": 0.9318946152938719, + "learning_rate": 1.3957137131890374e-05, + "loss": 0.1166, + "step": 22137 + }, + { + "epoch": 2.625163049922922, + "grad_norm": 0.6637333748308374, + "learning_rate": 1.3954983685203807e-05, + "loss": 0.0875, + "step": 22138 + }, + { + "epoch": 2.6252816316850467, + "grad_norm": 0.7707681898836856, + "learning_rate": 1.3952830340338746e-05, + "loss": 0.1061, + "step": 22139 + }, + { + "epoch": 2.625400213447172, + "grad_norm": 0.6198413964884286, + "learning_rate": 1.3950677097315052e-05, + "loss": 0.0877, + "step": 22140 + }, + { + "epoch": 2.6255187952092967, + "grad_norm": 0.9037560291642205, + "learning_rate": 1.3948523956152543e-05, + "loss": 0.1028, + "step": 22141 + }, + { + "epoch": 2.625637376971422, + "grad_norm": 0.507952937411963, + "learning_rate": 1.394637091687111e-05, + "loss": 0.0603, + "step": 22142 + }, + { + "epoch": 2.6257559587335466, + "grad_norm": 0.6026737851084257, + "learning_rate": 1.3944217979490576e-05, + "loss": 0.0795, + "step": 22143 + }, + { + "epoch": 2.625874540495672, + "grad_norm": 0.9326740997783389, + "learning_rate": 1.3942065144030803e-05, + "loss": 0.1014, + "step": 22144 + }, + { + "epoch": 2.625993122257797, + "grad_norm": 0.7672550111175824, + "learning_rate": 1.3939912410511612e-05, + "loss": 0.0817, + "step": 22145 + }, + { + "epoch": 2.6261117040199218, + "grad_norm": 0.5568911894386099, + "learning_rate": 1.3937759778952883e-05, + "loss": 0.0805, + "step": 22146 + }, + { + "epoch": 2.6262302857820465, + "grad_norm": 0.6049450965403951, + "learning_rate": 1.3935607249374433e-05, + "loss": 0.0885, + "step": 22147 + }, + { + "epoch": 2.6263488675441717, + "grad_norm": 0.6264098961009751, + "learning_rate": 1.3933454821796118e-05, + "loss": 0.0935, + "step": 22148 + }, + { + "epoch": 2.626467449306297, + "grad_norm": 0.5574360863286957, + "learning_rate": 1.393130249623778e-05, + "loss": 0.0739, + "step": 22149 + }, + { + "epoch": 2.6265860310684217, + "grad_norm": 0.5697681261421522, + "learning_rate": 1.3929150272719254e-05, + "loss": 0.0717, + "step": 22150 + }, + { + "epoch": 2.6267046128305465, + "grad_norm": 0.4319333482136354, + "learning_rate": 1.3926998151260401e-05, + "loss": 0.0571, + "step": 22151 + }, + { + "epoch": 2.6268231945926717, + "grad_norm": 0.4302085906469913, + "learning_rate": 1.3924846131881028e-05, + "loss": 0.0639, + "step": 22152 + }, + { + "epoch": 2.626941776354797, + "grad_norm": 0.9720234561848611, + "learning_rate": 1.392269421460101e-05, + "loss": 0.117, + "step": 22153 + }, + { + "epoch": 2.6270603581169216, + "grad_norm": 0.7329467712992083, + "learning_rate": 1.3920542399440157e-05, + "loss": 0.1065, + "step": 22154 + }, + { + "epoch": 2.6271789398790464, + "grad_norm": 0.7764919990311401, + "learning_rate": 1.3918390686418326e-05, + "loss": 0.101, + "step": 22155 + }, + { + "epoch": 2.6272975216411716, + "grad_norm": 0.7912887565834839, + "learning_rate": 1.3916239075555326e-05, + "loss": 0.0917, + "step": 22156 + }, + { + "epoch": 2.627416103403297, + "grad_norm": 0.5709022099476014, + "learning_rate": 1.3914087566871029e-05, + "loss": 0.0857, + "step": 22157 + }, + { + "epoch": 2.6275346851654215, + "grad_norm": 0.7058198661260656, + "learning_rate": 1.3911936160385241e-05, + "loss": 0.0857, + "step": 22158 + }, + { + "epoch": 2.6276532669275463, + "grad_norm": 0.5201187386982704, + "learning_rate": 1.3909784856117802e-05, + "loss": 0.0738, + "step": 22159 + }, + { + "epoch": 2.6277718486896715, + "grad_norm": 0.5128811486622034, + "learning_rate": 1.3907633654088548e-05, + "loss": 0.0797, + "step": 22160 + }, + { + "epoch": 2.6278904304517967, + "grad_norm": 0.59374815571535, + "learning_rate": 1.3905482554317312e-05, + "loss": 0.0796, + "step": 22161 + }, + { + "epoch": 2.6280090122139215, + "grad_norm": 0.7997204202614849, + "learning_rate": 1.390333155682393e-05, + "loss": 0.1151, + "step": 22162 + }, + { + "epoch": 2.6281275939760462, + "grad_norm": 0.7630002781916789, + "learning_rate": 1.39011806616282e-05, + "loss": 0.1117, + "step": 22163 + }, + { + "epoch": 2.6282461757381714, + "grad_norm": 0.9696008582421144, + "learning_rate": 1.3899029868749996e-05, + "loss": 0.131, + "step": 22164 + }, + { + "epoch": 2.6283647575002966, + "grad_norm": 0.5855431068055443, + "learning_rate": 1.3896879178209115e-05, + "loss": 0.0925, + "step": 22165 + }, + { + "epoch": 2.6284833392624214, + "grad_norm": 0.4867148816912166, + "learning_rate": 1.3894728590025393e-05, + "loss": 0.0743, + "step": 22166 + }, + { + "epoch": 2.628601921024546, + "grad_norm": 0.8491947220708185, + "learning_rate": 1.3892578104218657e-05, + "loss": 0.1087, + "step": 22167 + }, + { + "epoch": 2.6287205027866714, + "grad_norm": 0.818668786175448, + "learning_rate": 1.3890427720808725e-05, + "loss": 0.1066, + "step": 22168 + }, + { + "epoch": 2.6288390845487966, + "grad_norm": 0.5280692603776892, + "learning_rate": 1.388827743981544e-05, + "loss": 0.0757, + "step": 22169 + }, + { + "epoch": 2.6289576663109213, + "grad_norm": 0.6912859236574017, + "learning_rate": 1.3886127261258596e-05, + "loss": 0.0954, + "step": 22170 + }, + { + "epoch": 2.6290762480730465, + "grad_norm": 0.7848262301246707, + "learning_rate": 1.3883977185158036e-05, + "loss": 0.094, + "step": 22171 + }, + { + "epoch": 2.6291948298351713, + "grad_norm": 0.42876699920622513, + "learning_rate": 1.3881827211533572e-05, + "loss": 0.0516, + "step": 22172 + }, + { + "epoch": 2.6293134115972965, + "grad_norm": 0.6051052049434806, + "learning_rate": 1.3879677340405037e-05, + "loss": 0.0784, + "step": 22173 + }, + { + "epoch": 2.6294319933594212, + "grad_norm": 0.6948694751839898, + "learning_rate": 1.3877527571792231e-05, + "loss": 0.1071, + "step": 22174 + }, + { + "epoch": 2.6295505751215464, + "grad_norm": 0.8161680983241022, + "learning_rate": 1.3875377905714981e-05, + "loss": 0.1233, + "step": 22175 + }, + { + "epoch": 2.629669156883671, + "grad_norm": 0.6224221909119257, + "learning_rate": 1.3873228342193106e-05, + "loss": 0.0936, + "step": 22176 + }, + { + "epoch": 2.6297877386457964, + "grad_norm": 0.5173701176813528, + "learning_rate": 1.3871078881246419e-05, + "loss": 0.0555, + "step": 22177 + }, + { + "epoch": 2.629906320407921, + "grad_norm": 0.7928521031219999, + "learning_rate": 1.386892952289474e-05, + "loss": 0.108, + "step": 22178 + }, + { + "epoch": 2.6300249021700464, + "grad_norm": 0.6814001058646414, + "learning_rate": 1.3866780267157877e-05, + "loss": 0.0707, + "step": 22179 + }, + { + "epoch": 2.630143483932171, + "grad_norm": 0.4088224181624444, + "learning_rate": 1.3864631114055659e-05, + "loss": 0.0564, + "step": 22180 + }, + { + "epoch": 2.6302620656942963, + "grad_norm": 0.9204195170568712, + "learning_rate": 1.3862482063607879e-05, + "loss": 0.1423, + "step": 22181 + }, + { + "epoch": 2.630380647456421, + "grad_norm": 0.5431104443226505, + "learning_rate": 1.3860333115834356e-05, + "loss": 0.0706, + "step": 22182 + }, + { + "epoch": 2.6304992292185463, + "grad_norm": 0.40779525572753234, + "learning_rate": 1.3858184270754903e-05, + "loss": 0.0643, + "step": 22183 + }, + { + "epoch": 2.630617810980671, + "grad_norm": 0.9517822844298176, + "learning_rate": 1.3856035528389335e-05, + "loss": 0.1234, + "step": 22184 + }, + { + "epoch": 2.6307363927427962, + "grad_norm": 0.7141798361680881, + "learning_rate": 1.3853886888757444e-05, + "loss": 0.0745, + "step": 22185 + }, + { + "epoch": 2.630854974504921, + "grad_norm": 0.5722959320038491, + "learning_rate": 1.385173835187905e-05, + "loss": 0.084, + "step": 22186 + }, + { + "epoch": 2.630973556267046, + "grad_norm": 0.7240467350803335, + "learning_rate": 1.3849589917773956e-05, + "loss": 0.0933, + "step": 22187 + }, + { + "epoch": 2.631092138029171, + "grad_norm": 0.5359581401396568, + "learning_rate": 1.3847441586461968e-05, + "loss": 0.0766, + "step": 22188 + }, + { + "epoch": 2.631210719791296, + "grad_norm": 0.6082267547919531, + "learning_rate": 1.3845293357962905e-05, + "loss": 0.085, + "step": 22189 + }, + { + "epoch": 2.631329301553421, + "grad_norm": 0.928895810540593, + "learning_rate": 1.3843145232296537e-05, + "loss": 0.0921, + "step": 22190 + }, + { + "epoch": 2.631447883315546, + "grad_norm": 0.779867159925717, + "learning_rate": 1.3840997209482708e-05, + "loss": 0.1042, + "step": 22191 + }, + { + "epoch": 2.631566465077671, + "grad_norm": 0.7592112976324823, + "learning_rate": 1.3838849289541192e-05, + "loss": 0.094, + "step": 22192 + }, + { + "epoch": 2.631685046839796, + "grad_norm": 0.7860439131070778, + "learning_rate": 1.3836701472491797e-05, + "loss": 0.0843, + "step": 22193 + }, + { + "epoch": 2.6318036286019213, + "grad_norm": 0.6402353985840931, + "learning_rate": 1.3834553758354326e-05, + "loss": 0.0994, + "step": 22194 + }, + { + "epoch": 2.631922210364046, + "grad_norm": 0.46959561649001574, + "learning_rate": 1.3832406147148588e-05, + "loss": 0.0637, + "step": 22195 + }, + { + "epoch": 2.632040792126171, + "grad_norm": 0.5018801484360986, + "learning_rate": 1.383025863889436e-05, + "loss": 0.057, + "step": 22196 + }, + { + "epoch": 2.632159373888296, + "grad_norm": 0.6945522281298866, + "learning_rate": 1.3828111233611452e-05, + "loss": 0.0748, + "step": 22197 + }, + { + "epoch": 2.632277955650421, + "grad_norm": 0.7304319376483229, + "learning_rate": 1.3825963931319658e-05, + "loss": 0.1061, + "step": 22198 + }, + { + "epoch": 2.632396537412546, + "grad_norm": 0.5171804165439179, + "learning_rate": 1.3823816732038772e-05, + "loss": 0.0903, + "step": 22199 + }, + { + "epoch": 2.6325151191746707, + "grad_norm": 0.6019883484958491, + "learning_rate": 1.3821669635788604e-05, + "loss": 0.0697, + "step": 22200 + }, + { + "epoch": 2.632633700936796, + "grad_norm": 0.6753107879008461, + "learning_rate": 1.3819522642588912e-05, + "loss": 0.0897, + "step": 22201 + }, + { + "epoch": 2.632752282698921, + "grad_norm": 0.6199375590575888, + "learning_rate": 1.3817375752459527e-05, + "loss": 0.0915, + "step": 22202 + }, + { + "epoch": 2.632870864461046, + "grad_norm": 1.032341743781071, + "learning_rate": 1.3815228965420218e-05, + "loss": 0.1307, + "step": 22203 + }, + { + "epoch": 2.6329894462231707, + "grad_norm": 0.634497929145572, + "learning_rate": 1.3813082281490794e-05, + "loss": 0.0906, + "step": 22204 + }, + { + "epoch": 2.633108027985296, + "grad_norm": 0.6915668285963579, + "learning_rate": 1.3810935700691013e-05, + "loss": 0.0876, + "step": 22205 + }, + { + "epoch": 2.633226609747421, + "grad_norm": 0.9311953356826111, + "learning_rate": 1.3808789223040703e-05, + "loss": 0.1026, + "step": 22206 + }, + { + "epoch": 2.633345191509546, + "grad_norm": 0.8039110211361633, + "learning_rate": 1.380664284855962e-05, + "loss": 0.0805, + "step": 22207 + }, + { + "epoch": 2.6334637732716706, + "grad_norm": 1.2485535100646803, + "learning_rate": 1.3804496577267567e-05, + "loss": 0.1522, + "step": 22208 + }, + { + "epoch": 2.633582355033796, + "grad_norm": 0.8587375744360274, + "learning_rate": 1.3802350409184326e-05, + "loss": 0.138, + "step": 22209 + }, + { + "epoch": 2.633700936795921, + "grad_norm": 0.5124076038757015, + "learning_rate": 1.380020434432968e-05, + "loss": 0.0742, + "step": 22210 + }, + { + "epoch": 2.6338195185580457, + "grad_norm": 0.6849855549893764, + "learning_rate": 1.3798058382723427e-05, + "loss": 0.0976, + "step": 22211 + }, + { + "epoch": 2.6339381003201705, + "grad_norm": 0.5959526373017247, + "learning_rate": 1.3795912524385323e-05, + "loss": 0.0928, + "step": 22212 + }, + { + "epoch": 2.6340566820822957, + "grad_norm": 0.6581394563203158, + "learning_rate": 1.3793766769335185e-05, + "loss": 0.1033, + "step": 22213 + }, + { + "epoch": 2.634175263844421, + "grad_norm": 0.7360942860469871, + "learning_rate": 1.3791621117592762e-05, + "loss": 0.0975, + "step": 22214 + }, + { + "epoch": 2.6342938456065457, + "grad_norm": 0.5996263231738402, + "learning_rate": 1.378947556917785e-05, + "loss": 0.0953, + "step": 22215 + }, + { + "epoch": 2.6344124273686704, + "grad_norm": 0.6257862628555317, + "learning_rate": 1.3787330124110227e-05, + "loss": 0.0647, + "step": 22216 + }, + { + "epoch": 2.6345310091307956, + "grad_norm": 0.47243843537866426, + "learning_rate": 1.3785184782409666e-05, + "loss": 0.0776, + "step": 22217 + }, + { + "epoch": 2.634649590892921, + "grad_norm": 0.7809582307063082, + "learning_rate": 1.3783039544095962e-05, + "loss": 0.1134, + "step": 22218 + }, + { + "epoch": 2.6347681726550456, + "grad_norm": 0.5640445780646849, + "learning_rate": 1.3780894409188868e-05, + "loss": 0.0686, + "step": 22219 + }, + { + "epoch": 2.634886754417171, + "grad_norm": 0.5487077626956448, + "learning_rate": 1.3778749377708173e-05, + "loss": 0.0807, + "step": 22220 + }, + { + "epoch": 2.6350053361792956, + "grad_norm": 0.6614289505482247, + "learning_rate": 1.3776604449673641e-05, + "loss": 0.0863, + "step": 22221 + }, + { + "epoch": 2.6351239179414208, + "grad_norm": 0.8117087400258742, + "learning_rate": 1.3774459625105068e-05, + "loss": 0.1121, + "step": 22222 + }, + { + "epoch": 2.6352424997035455, + "grad_norm": 0.6731307564216018, + "learning_rate": 1.3772314904022193e-05, + "loss": 0.0831, + "step": 22223 + }, + { + "epoch": 2.6353610814656707, + "grad_norm": 0.3668819362489026, + "learning_rate": 1.3770170286444822e-05, + "loss": 0.0456, + "step": 22224 + }, + { + "epoch": 2.6354796632277955, + "grad_norm": 0.6231658107247141, + "learning_rate": 1.3768025772392701e-05, + "loss": 0.0664, + "step": 22225 + }, + { + "epoch": 2.6355982449899207, + "grad_norm": 0.8109993158603936, + "learning_rate": 1.376588136188561e-05, + "loss": 0.1143, + "step": 22226 + }, + { + "epoch": 2.6357168267520454, + "grad_norm": 0.6039973736083312, + "learning_rate": 1.3763737054943315e-05, + "loss": 0.0867, + "step": 22227 + }, + { + "epoch": 2.6358354085141706, + "grad_norm": 0.879719824785237, + "learning_rate": 1.3761592851585589e-05, + "loss": 0.1121, + "step": 22228 + }, + { + "epoch": 2.6359539902762954, + "grad_norm": 0.6499741106417386, + "learning_rate": 1.3759448751832204e-05, + "loss": 0.0945, + "step": 22229 + }, + { + "epoch": 2.6360725720384206, + "grad_norm": 0.468690792165345, + "learning_rate": 1.3757304755702904e-05, + "loss": 0.0679, + "step": 22230 + }, + { + "epoch": 2.6361911538005454, + "grad_norm": 0.6297925689834445, + "learning_rate": 1.3755160863217472e-05, + "loss": 0.0987, + "step": 22231 + }, + { + "epoch": 2.6363097355626706, + "grad_norm": 0.6773955963661832, + "learning_rate": 1.3753017074395666e-05, + "loss": 0.0931, + "step": 22232 + }, + { + "epoch": 2.6364283173247953, + "grad_norm": 1.989505281096847, + "learning_rate": 1.3750873389257262e-05, + "loss": 0.0777, + "step": 22233 + }, + { + "epoch": 2.6365468990869205, + "grad_norm": 0.9214782140601401, + "learning_rate": 1.3748729807821997e-05, + "loss": 0.1094, + "step": 22234 + }, + { + "epoch": 2.6366654808490453, + "grad_norm": 0.5907870698673934, + "learning_rate": 1.3746586330109651e-05, + "loss": 0.0773, + "step": 22235 + }, + { + "epoch": 2.6367840626111705, + "grad_norm": 0.8660722457766189, + "learning_rate": 1.3744442956139974e-05, + "loss": 0.1238, + "step": 22236 + }, + { + "epoch": 2.6369026443732952, + "grad_norm": 0.6017584808035219, + "learning_rate": 1.3742299685932733e-05, + "loss": 0.0842, + "step": 22237 + }, + { + "epoch": 2.6370212261354204, + "grad_norm": 0.9342749784505846, + "learning_rate": 1.3740156519507686e-05, + "loss": 0.124, + "step": 22238 + }, + { + "epoch": 2.637139807897545, + "grad_norm": 0.7779451954055261, + "learning_rate": 1.3738013456884585e-05, + "loss": 0.1117, + "step": 22239 + }, + { + "epoch": 2.6372583896596704, + "grad_norm": 0.6230090202151877, + "learning_rate": 1.3735870498083198e-05, + "loss": 0.1053, + "step": 22240 + }, + { + "epoch": 2.637376971421795, + "grad_norm": 0.6084111073666847, + "learning_rate": 1.3733727643123264e-05, + "loss": 0.0766, + "step": 22241 + }, + { + "epoch": 2.6374955531839204, + "grad_norm": 0.6266536198399347, + "learning_rate": 1.3731584892024546e-05, + "loss": 0.0867, + "step": 22242 + }, + { + "epoch": 2.6376141349460456, + "grad_norm": 0.7869616432309493, + "learning_rate": 1.3729442244806797e-05, + "loss": 0.1064, + "step": 22243 + }, + { + "epoch": 2.6377327167081703, + "grad_norm": 0.7946330747531307, + "learning_rate": 1.3727299701489782e-05, + "loss": 0.1305, + "step": 22244 + }, + { + "epoch": 2.637851298470295, + "grad_norm": 0.6661820777026175, + "learning_rate": 1.3725157262093225e-05, + "loss": 0.0933, + "step": 22245 + }, + { + "epoch": 2.6379698802324203, + "grad_norm": 0.7921410157074745, + "learning_rate": 1.3723014926636896e-05, + "loss": 0.0851, + "step": 22246 + }, + { + "epoch": 2.6380884619945455, + "grad_norm": 0.7455175831710226, + "learning_rate": 1.372087269514054e-05, + "loss": 0.1184, + "step": 22247 + }, + { + "epoch": 2.6382070437566703, + "grad_norm": 0.8625832954143843, + "learning_rate": 1.3718730567623905e-05, + "loss": 0.1319, + "step": 22248 + }, + { + "epoch": 2.638325625518795, + "grad_norm": 0.48004858106230847, + "learning_rate": 1.371658854410674e-05, + "loss": 0.0624, + "step": 22249 + }, + { + "epoch": 2.63844420728092, + "grad_norm": 0.7348558660078384, + "learning_rate": 1.3714446624608793e-05, + "loss": 0.0953, + "step": 22250 + }, + { + "epoch": 2.6385627890430454, + "grad_norm": 0.56136359535958, + "learning_rate": 1.3712304809149817e-05, + "loss": 0.093, + "step": 22251 + }, + { + "epoch": 2.63868137080517, + "grad_norm": 0.895492094261596, + "learning_rate": 1.3710163097749542e-05, + "loss": 0.1314, + "step": 22252 + }, + { + "epoch": 2.638799952567295, + "grad_norm": 0.9034569066076977, + "learning_rate": 1.370802149042772e-05, + "loss": 0.0991, + "step": 22253 + }, + { + "epoch": 2.63891853432942, + "grad_norm": 0.6497242170569917, + "learning_rate": 1.3705879987204087e-05, + "loss": 0.0741, + "step": 22254 + }, + { + "epoch": 2.6390371160915453, + "grad_norm": 1.0478984109730474, + "learning_rate": 1.3703738588098402e-05, + "loss": 0.1002, + "step": 22255 + }, + { + "epoch": 2.63915569785367, + "grad_norm": 0.5367270213450344, + "learning_rate": 1.370159729313039e-05, + "loss": 0.0771, + "step": 22256 + }, + { + "epoch": 2.639274279615795, + "grad_norm": 0.9875703313956785, + "learning_rate": 1.369945610231979e-05, + "loss": 0.1258, + "step": 22257 + }, + { + "epoch": 2.63939286137792, + "grad_norm": 0.8600354591529581, + "learning_rate": 1.3697315015686352e-05, + "loss": 0.124, + "step": 22258 + }, + { + "epoch": 2.6395114431400453, + "grad_norm": 0.8646319852760859, + "learning_rate": 1.3695174033249805e-05, + "loss": 0.1329, + "step": 22259 + }, + { + "epoch": 2.63963002490217, + "grad_norm": 0.8220237193832501, + "learning_rate": 1.3693033155029902e-05, + "loss": 0.0953, + "step": 22260 + }, + { + "epoch": 2.639748606664295, + "grad_norm": 0.8368729448048801, + "learning_rate": 1.369089238104635e-05, + "loss": 0.1075, + "step": 22261 + }, + { + "epoch": 2.63986718842642, + "grad_norm": 0.5183473037701828, + "learning_rate": 1.368875171131892e-05, + "loss": 0.0665, + "step": 22262 + }, + { + "epoch": 2.639985770188545, + "grad_norm": 0.7966170747165653, + "learning_rate": 1.3686611145867317e-05, + "loss": 0.1227, + "step": 22263 + }, + { + "epoch": 2.64010435195067, + "grad_norm": 0.727170424746305, + "learning_rate": 1.3684470684711287e-05, + "loss": 0.0983, + "step": 22264 + }, + { + "epoch": 2.6402229337127947, + "grad_norm": 0.8324062679521113, + "learning_rate": 1.3682330327870563e-05, + "loss": 0.1304, + "step": 22265 + }, + { + "epoch": 2.64034151547492, + "grad_norm": 0.9396823368751351, + "learning_rate": 1.3680190075364874e-05, + "loss": 0.1108, + "step": 22266 + }, + { + "epoch": 2.640460097237045, + "grad_norm": 0.6011808344617622, + "learning_rate": 1.367804992721396e-05, + "loss": 0.0824, + "step": 22267 + }, + { + "epoch": 2.64057867899917, + "grad_norm": 0.7957646558450844, + "learning_rate": 1.367590988343752e-05, + "loss": 0.0813, + "step": 22268 + }, + { + "epoch": 2.640697260761295, + "grad_norm": 0.5921617195058195, + "learning_rate": 1.3673769944055331e-05, + "loss": 0.0867, + "step": 22269 + }, + { + "epoch": 2.64081584252342, + "grad_norm": 0.7783224661630448, + "learning_rate": 1.367163010908708e-05, + "loss": 0.1186, + "step": 22270 + }, + { + "epoch": 2.640934424285545, + "grad_norm": 1.1540002146973427, + "learning_rate": 1.3669490378552515e-05, + "loss": 0.1079, + "step": 22271 + }, + { + "epoch": 2.64105300604767, + "grad_norm": 0.5870514841936023, + "learning_rate": 1.3667350752471336e-05, + "loss": 0.0788, + "step": 22272 + }, + { + "epoch": 2.641171587809795, + "grad_norm": 0.7290364567207867, + "learning_rate": 1.3665211230863308e-05, + "loss": 0.0919, + "step": 22273 + }, + { + "epoch": 2.6412901695719198, + "grad_norm": 0.675075329340873, + "learning_rate": 1.3663071813748124e-05, + "loss": 0.1063, + "step": 22274 + }, + { + "epoch": 2.641408751334045, + "grad_norm": 0.5998197254765072, + "learning_rate": 1.3660932501145513e-05, + "loss": 0.1003, + "step": 22275 + }, + { + "epoch": 2.6415273330961697, + "grad_norm": 0.7342591438569843, + "learning_rate": 1.3658793293075198e-05, + "loss": 0.102, + "step": 22276 + }, + { + "epoch": 2.641645914858295, + "grad_norm": 0.5144864754169727, + "learning_rate": 1.3656654189556904e-05, + "loss": 0.0685, + "step": 22277 + }, + { + "epoch": 2.6417644966204197, + "grad_norm": 0.47388077657754435, + "learning_rate": 1.3654515190610356e-05, + "loss": 0.0658, + "step": 22278 + }, + { + "epoch": 2.641883078382545, + "grad_norm": 0.7867178336679954, + "learning_rate": 1.3652376296255248e-05, + "loss": 0.1044, + "step": 22279 + }, + { + "epoch": 2.6420016601446696, + "grad_norm": 1.1234680239382326, + "learning_rate": 1.3650237506511331e-05, + "loss": 0.1212, + "step": 22280 + }, + { + "epoch": 2.642120241906795, + "grad_norm": 0.7253028319182525, + "learning_rate": 1.36480988213983e-05, + "loss": 0.0917, + "step": 22281 + }, + { + "epoch": 2.6422388236689196, + "grad_norm": 0.47562949616306605, + "learning_rate": 1.3645960240935888e-05, + "loss": 0.0636, + "step": 22282 + }, + { + "epoch": 2.642357405431045, + "grad_norm": 0.8924424319818337, + "learning_rate": 1.3643821765143778e-05, + "loss": 0.1356, + "step": 22283 + }, + { + "epoch": 2.6424759871931696, + "grad_norm": 0.6699982850162018, + "learning_rate": 1.3641683394041721e-05, + "loss": 0.0817, + "step": 22284 + }, + { + "epoch": 2.6425945689552948, + "grad_norm": 0.6539467015682556, + "learning_rate": 1.3639545127649412e-05, + "loss": 0.0931, + "step": 22285 + }, + { + "epoch": 2.6427131507174195, + "grad_norm": 0.5854449441297692, + "learning_rate": 1.363740696598656e-05, + "loss": 0.0806, + "step": 22286 + }, + { + "epoch": 2.6428317324795447, + "grad_norm": 0.6490480132122486, + "learning_rate": 1.3635268909072885e-05, + "loss": 0.0795, + "step": 22287 + }, + { + "epoch": 2.6429503142416695, + "grad_norm": 0.5114373031357491, + "learning_rate": 1.3633130956928095e-05, + "loss": 0.0784, + "step": 22288 + }, + { + "epoch": 2.6430688960037947, + "grad_norm": 0.4542647398573746, + "learning_rate": 1.3630993109571904e-05, + "loss": 0.0655, + "step": 22289 + }, + { + "epoch": 2.6431874777659194, + "grad_norm": 0.8029890594360549, + "learning_rate": 1.362885536702401e-05, + "loss": 0.0958, + "step": 22290 + }, + { + "epoch": 2.6433060595280446, + "grad_norm": 0.7794406605723889, + "learning_rate": 1.3626717729304122e-05, + "loss": 0.1112, + "step": 22291 + }, + { + "epoch": 2.6434246412901694, + "grad_norm": 0.5792831289746059, + "learning_rate": 1.3624580196431952e-05, + "loss": 0.0885, + "step": 22292 + }, + { + "epoch": 2.6435432230522946, + "grad_norm": 0.6434121572359125, + "learning_rate": 1.3622442768427212e-05, + "loss": 0.0957, + "step": 22293 + }, + { + "epoch": 2.6436618048144194, + "grad_norm": 0.5996318668310618, + "learning_rate": 1.3620305445309579e-05, + "loss": 0.0911, + "step": 22294 + }, + { + "epoch": 2.6437803865765446, + "grad_norm": 0.6436668690642515, + "learning_rate": 1.3618168227098793e-05, + "loss": 0.0971, + "step": 22295 + }, + { + "epoch": 2.6438989683386698, + "grad_norm": 0.7780706115429668, + "learning_rate": 1.3616031113814531e-05, + "loss": 0.0882, + "step": 22296 + }, + { + "epoch": 2.6440175501007945, + "grad_norm": 0.9640570783466451, + "learning_rate": 1.3613894105476499e-05, + "loss": 0.1247, + "step": 22297 + }, + { + "epoch": 2.6441361318629193, + "grad_norm": 0.6478577770377933, + "learning_rate": 1.3611757202104402e-05, + "loss": 0.0822, + "step": 22298 + }, + { + "epoch": 2.6442547136250445, + "grad_norm": 0.9016167297335723, + "learning_rate": 1.3609620403717943e-05, + "loss": 0.0997, + "step": 22299 + }, + { + "epoch": 2.6443732953871697, + "grad_norm": 0.6116366166334369, + "learning_rate": 1.3607483710336822e-05, + "loss": 0.1077, + "step": 22300 + }, + { + "epoch": 2.6444918771492945, + "grad_norm": 0.36713593671397315, + "learning_rate": 1.3605347121980722e-05, + "loss": 0.0521, + "step": 22301 + }, + { + "epoch": 2.644610458911419, + "grad_norm": 0.5869737888249048, + "learning_rate": 1.3603210638669345e-05, + "loss": 0.0611, + "step": 22302 + }, + { + "epoch": 2.6447290406735444, + "grad_norm": 0.63658818258785, + "learning_rate": 1.3601074260422392e-05, + "loss": 0.0906, + "step": 22303 + }, + { + "epoch": 2.6448476224356696, + "grad_norm": 0.5740372226669672, + "learning_rate": 1.359893798725957e-05, + "loss": 0.089, + "step": 22304 + }, + { + "epoch": 2.6449662041977944, + "grad_norm": 0.6041451585720002, + "learning_rate": 1.3596801819200543e-05, + "loss": 0.0865, + "step": 22305 + }, + { + "epoch": 2.645084785959919, + "grad_norm": 0.5438856934024773, + "learning_rate": 1.3594665756265023e-05, + "loss": 0.0814, + "step": 22306 + }, + { + "epoch": 2.6452033677220443, + "grad_norm": 0.7237948726754726, + "learning_rate": 1.3592529798472698e-05, + "loss": 0.1006, + "step": 22307 + }, + { + "epoch": 2.6453219494841695, + "grad_norm": 0.674215258116859, + "learning_rate": 1.3590393945843258e-05, + "loss": 0.0857, + "step": 22308 + }, + { + "epoch": 2.6454405312462943, + "grad_norm": 0.7095410638021892, + "learning_rate": 1.3588258198396397e-05, + "loss": 0.0928, + "step": 22309 + }, + { + "epoch": 2.645559113008419, + "grad_norm": 0.8122148492561208, + "learning_rate": 1.3586122556151798e-05, + "loss": 0.1067, + "step": 22310 + }, + { + "epoch": 2.6456776947705443, + "grad_norm": 0.7861276537211432, + "learning_rate": 1.3583987019129165e-05, + "loss": 0.1017, + "step": 22311 + }, + { + "epoch": 2.6457962765326695, + "grad_norm": 0.7081552448995176, + "learning_rate": 1.358185158734816e-05, + "loss": 0.1161, + "step": 22312 + }, + { + "epoch": 2.6459148582947942, + "grad_norm": 0.7094526681358254, + "learning_rate": 1.3579716260828484e-05, + "loss": 0.0907, + "step": 22313 + }, + { + "epoch": 2.646033440056919, + "grad_norm": 0.7921679972750866, + "learning_rate": 1.357758103958982e-05, + "loss": 0.1051, + "step": 22314 + }, + { + "epoch": 2.646152021819044, + "grad_norm": 0.7950575013185712, + "learning_rate": 1.3575445923651862e-05, + "loss": 0.0957, + "step": 22315 + }, + { + "epoch": 2.6462706035811694, + "grad_norm": 0.6263597402939008, + "learning_rate": 1.3573310913034271e-05, + "loss": 0.0743, + "step": 22316 + }, + { + "epoch": 2.646389185343294, + "grad_norm": 0.8282487508292655, + "learning_rate": 1.3571176007756742e-05, + "loss": 0.123, + "step": 22317 + }, + { + "epoch": 2.6465077671054194, + "grad_norm": 0.6958813844603023, + "learning_rate": 1.3569041207838957e-05, + "loss": 0.0873, + "step": 22318 + }, + { + "epoch": 2.646626348867544, + "grad_norm": 0.4343599679462023, + "learning_rate": 1.3566906513300589e-05, + "loss": 0.0597, + "step": 22319 + }, + { + "epoch": 2.6467449306296693, + "grad_norm": 0.8063947657026559, + "learning_rate": 1.3564771924161338e-05, + "loss": 0.0878, + "step": 22320 + }, + { + "epoch": 2.646863512391794, + "grad_norm": 0.6377965969254634, + "learning_rate": 1.3562637440440845e-05, + "loss": 0.0709, + "step": 22321 + }, + { + "epoch": 2.6469820941539193, + "grad_norm": 0.3919075055630714, + "learning_rate": 1.356050306215883e-05, + "loss": 0.0588, + "step": 22322 + }, + { + "epoch": 2.647100675916044, + "grad_norm": 1.0921658829163259, + "learning_rate": 1.3558368789334938e-05, + "loss": 0.1157, + "step": 22323 + }, + { + "epoch": 2.6472192576781692, + "grad_norm": 0.4130211399963169, + "learning_rate": 1.3556234621988855e-05, + "loss": 0.0677, + "step": 22324 + }, + { + "epoch": 2.647337839440294, + "grad_norm": 0.9645555641139941, + "learning_rate": 1.3554100560140257e-05, + "loss": 0.1261, + "step": 22325 + }, + { + "epoch": 2.647456421202419, + "grad_norm": 0.6407070660300077, + "learning_rate": 1.3551966603808814e-05, + "loss": 0.0866, + "step": 22326 + }, + { + "epoch": 2.647575002964544, + "grad_norm": 0.6467561613296895, + "learning_rate": 1.3549832753014214e-05, + "loss": 0.0836, + "step": 22327 + }, + { + "epoch": 2.647693584726669, + "grad_norm": 0.7507721832121049, + "learning_rate": 1.3547699007776087e-05, + "loss": 0.0927, + "step": 22328 + }, + { + "epoch": 2.647812166488794, + "grad_norm": 0.6981454797136427, + "learning_rate": 1.354556536811416e-05, + "loss": 0.0751, + "step": 22329 + }, + { + "epoch": 2.647930748250919, + "grad_norm": 0.6396209055442964, + "learning_rate": 1.3543431834048059e-05, + "loss": 0.0787, + "step": 22330 + }, + { + "epoch": 2.648049330013044, + "grad_norm": 0.6698337930888734, + "learning_rate": 1.354129840559748e-05, + "loss": 0.0755, + "step": 22331 + }, + { + "epoch": 2.648167911775169, + "grad_norm": 0.4895550014089284, + "learning_rate": 1.3539165082782056e-05, + "loss": 0.0539, + "step": 22332 + }, + { + "epoch": 2.648286493537294, + "grad_norm": 0.6551944507654328, + "learning_rate": 1.3537031865621496e-05, + "loss": 0.0767, + "step": 22333 + }, + { + "epoch": 2.648405075299419, + "grad_norm": 1.0978412298902758, + "learning_rate": 1.3534898754135438e-05, + "loss": 0.1638, + "step": 22334 + }, + { + "epoch": 2.648523657061544, + "grad_norm": 0.6332610492926669, + "learning_rate": 1.353276574834355e-05, + "loss": 0.0756, + "step": 22335 + }, + { + "epoch": 2.648642238823669, + "grad_norm": 1.3321234019303214, + "learning_rate": 1.3530632848265507e-05, + "loss": 0.2136, + "step": 22336 + }, + { + "epoch": 2.6487608205857938, + "grad_norm": 0.7376251067049799, + "learning_rate": 1.3528500053920956e-05, + "loss": 0.0993, + "step": 22337 + }, + { + "epoch": 2.648879402347919, + "grad_norm": 0.7791418049799559, + "learning_rate": 1.3526367365329584e-05, + "loss": 0.0948, + "step": 22338 + }, + { + "epoch": 2.6489979841100437, + "grad_norm": 0.9809228865410576, + "learning_rate": 1.3524234782511012e-05, + "loss": 0.1547, + "step": 22339 + }, + { + "epoch": 2.649116565872169, + "grad_norm": 0.7692211597074591, + "learning_rate": 1.352210230548494e-05, + "loss": 0.1038, + "step": 22340 + }, + { + "epoch": 2.6492351476342937, + "grad_norm": 0.9892910020846395, + "learning_rate": 1.3519969934271005e-05, + "loss": 0.1106, + "step": 22341 + }, + { + "epoch": 2.649353729396419, + "grad_norm": 0.8182760507425587, + "learning_rate": 1.3517837668888875e-05, + "loss": 0.0824, + "step": 22342 + }, + { + "epoch": 2.6494723111585436, + "grad_norm": 0.8209116020975719, + "learning_rate": 1.3515705509358184e-05, + "loss": 0.1034, + "step": 22343 + }, + { + "epoch": 2.649590892920669, + "grad_norm": 0.6000567136805243, + "learning_rate": 1.3513573455698625e-05, + "loss": 0.0822, + "step": 22344 + }, + { + "epoch": 2.649709474682794, + "grad_norm": 0.50546255287739, + "learning_rate": 1.351144150792982e-05, + "loss": 0.0899, + "step": 22345 + }, + { + "epoch": 2.649828056444919, + "grad_norm": 0.8474080599220745, + "learning_rate": 1.350930966607144e-05, + "loss": 0.1483, + "step": 22346 + }, + { + "epoch": 2.6499466382070436, + "grad_norm": 0.8581963893717277, + "learning_rate": 1.3507177930143133e-05, + "loss": 0.0991, + "step": 22347 + }, + { + "epoch": 2.6500652199691688, + "grad_norm": 0.41770074459340734, + "learning_rate": 1.350504630016455e-05, + "loss": 0.0636, + "step": 22348 + }, + { + "epoch": 2.650183801731294, + "grad_norm": 0.7560780022103756, + "learning_rate": 1.3502914776155357e-05, + "loss": 0.0898, + "step": 22349 + }, + { + "epoch": 2.6503023834934187, + "grad_norm": 0.4551065211893728, + "learning_rate": 1.3500783358135169e-05, + "loss": 0.0582, + "step": 22350 + }, + { + "epoch": 2.6504209652555435, + "grad_norm": 0.33019807383573796, + "learning_rate": 1.3498652046123677e-05, + "loss": 0.0462, + "step": 22351 + }, + { + "epoch": 2.6505395470176687, + "grad_norm": 0.7470003301305436, + "learning_rate": 1.3496520840140503e-05, + "loss": 0.0878, + "step": 22352 + }, + { + "epoch": 2.650658128779794, + "grad_norm": 0.5667743981405591, + "learning_rate": 1.3494389740205305e-05, + "loss": 0.0816, + "step": 22353 + }, + { + "epoch": 2.6507767105419187, + "grad_norm": 0.6766170544527722, + "learning_rate": 1.3492258746337707e-05, + "loss": 0.0851, + "step": 22354 + }, + { + "epoch": 2.6508952923040434, + "grad_norm": 0.7017844760563791, + "learning_rate": 1.3490127858557395e-05, + "loss": 0.0916, + "step": 22355 + }, + { + "epoch": 2.6510138740661686, + "grad_norm": 0.5333119200856681, + "learning_rate": 1.3487997076883973e-05, + "loss": 0.0692, + "step": 22356 + }, + { + "epoch": 2.651132455828294, + "grad_norm": 0.7051426181411415, + "learning_rate": 1.3485866401337105e-05, + "loss": 0.0736, + "step": 22357 + }, + { + "epoch": 2.6512510375904186, + "grad_norm": 0.7627332659122888, + "learning_rate": 1.3483735831936428e-05, + "loss": 0.1035, + "step": 22358 + }, + { + "epoch": 2.6513696193525433, + "grad_norm": 0.6363993954861621, + "learning_rate": 1.3481605368701589e-05, + "loss": 0.0841, + "step": 22359 + }, + { + "epoch": 2.6514882011146685, + "grad_norm": 0.7875903348717576, + "learning_rate": 1.3479475011652226e-05, + "loss": 0.0977, + "step": 22360 + }, + { + "epoch": 2.6516067828767937, + "grad_norm": 0.52000868083316, + "learning_rate": 1.3477344760807969e-05, + "loss": 0.0677, + "step": 22361 + }, + { + "epoch": 2.6517253646389185, + "grad_norm": 1.0859185698372573, + "learning_rate": 1.3475214616188467e-05, + "loss": 0.1401, + "step": 22362 + }, + { + "epoch": 2.6518439464010433, + "grad_norm": 0.6384404439853918, + "learning_rate": 1.347308457781335e-05, + "loss": 0.089, + "step": 22363 + }, + { + "epoch": 2.6519625281631685, + "grad_norm": 0.7256074690987778, + "learning_rate": 1.3470954645702268e-05, + "loss": 0.1007, + "step": 22364 + }, + { + "epoch": 2.6520811099252937, + "grad_norm": 0.5080146590907124, + "learning_rate": 1.346882481987483e-05, + "loss": 0.0694, + "step": 22365 + }, + { + "epoch": 2.6521996916874184, + "grad_norm": 0.6011597563145563, + "learning_rate": 1.3466695100350701e-05, + "loss": 0.0861, + "step": 22366 + }, + { + "epoch": 2.652318273449543, + "grad_norm": 0.6221114842936272, + "learning_rate": 1.3464565487149494e-05, + "loss": 0.0867, + "step": 22367 + }, + { + "epoch": 2.6524368552116684, + "grad_norm": 0.6179259806795024, + "learning_rate": 1.3462435980290846e-05, + "loss": 0.0838, + "step": 22368 + }, + { + "epoch": 2.6525554369737936, + "grad_norm": 0.5770580095983447, + "learning_rate": 1.346030657979439e-05, + "loss": 0.0653, + "step": 22369 + }, + { + "epoch": 2.6526740187359183, + "grad_norm": 0.8552710866281239, + "learning_rate": 1.3458177285679757e-05, + "loss": 0.1268, + "step": 22370 + }, + { + "epoch": 2.6527926004980436, + "grad_norm": 0.574505951890669, + "learning_rate": 1.3456048097966586e-05, + "loss": 0.0696, + "step": 22371 + }, + { + "epoch": 2.6529111822601683, + "grad_norm": 1.06526329208451, + "learning_rate": 1.3453919016674484e-05, + "loss": 0.1104, + "step": 22372 + }, + { + "epoch": 2.6530297640222935, + "grad_norm": 0.5721819413653225, + "learning_rate": 1.3451790041823092e-05, + "loss": 0.0732, + "step": 22373 + }, + { + "epoch": 2.6531483457844183, + "grad_norm": 0.5500039485919671, + "learning_rate": 1.3449661173432032e-05, + "loss": 0.08, + "step": 22374 + }, + { + "epoch": 2.6532669275465435, + "grad_norm": 0.6181097080996742, + "learning_rate": 1.3447532411520936e-05, + "loss": 0.0863, + "step": 22375 + }, + { + "epoch": 2.6533855093086682, + "grad_norm": 0.6465968489713814, + "learning_rate": 1.3445403756109432e-05, + "loss": 0.1112, + "step": 22376 + }, + { + "epoch": 2.6535040910707934, + "grad_norm": 0.49092561788914174, + "learning_rate": 1.3443275207217116e-05, + "loss": 0.0807, + "step": 22377 + }, + { + "epoch": 2.653622672832918, + "grad_norm": 0.6354866601071564, + "learning_rate": 1.344114676486365e-05, + "loss": 0.0943, + "step": 22378 + }, + { + "epoch": 2.6537412545950434, + "grad_norm": 0.6157762999720815, + "learning_rate": 1.3439018429068628e-05, + "loss": 0.069, + "step": 22379 + }, + { + "epoch": 2.653859836357168, + "grad_norm": 0.8414706823319743, + "learning_rate": 1.3436890199851679e-05, + "loss": 0.1085, + "step": 22380 + }, + { + "epoch": 2.6539784181192934, + "grad_norm": 0.5408184497298884, + "learning_rate": 1.3434762077232422e-05, + "loss": 0.0803, + "step": 22381 + }, + { + "epoch": 2.654096999881418, + "grad_norm": 1.1708259067745304, + "learning_rate": 1.3432634061230488e-05, + "loss": 0.1183, + "step": 22382 + }, + { + "epoch": 2.6542155816435433, + "grad_norm": 0.685648906271257, + "learning_rate": 1.3430506151865468e-05, + "loss": 0.1043, + "step": 22383 + }, + { + "epoch": 2.654334163405668, + "grad_norm": 0.675117747536794, + "learning_rate": 1.3428378349157001e-05, + "loss": 0.0787, + "step": 22384 + }, + { + "epoch": 2.6544527451677933, + "grad_norm": 0.6994008783631318, + "learning_rate": 1.3426250653124692e-05, + "loss": 0.0908, + "step": 22385 + }, + { + "epoch": 2.654571326929918, + "grad_norm": 0.744327650009034, + "learning_rate": 1.3424123063788158e-05, + "loss": 0.1125, + "step": 22386 + }, + { + "epoch": 2.6546899086920432, + "grad_norm": 0.5687776890252535, + "learning_rate": 1.3421995581167026e-05, + "loss": 0.0716, + "step": 22387 + }, + { + "epoch": 2.654808490454168, + "grad_norm": 0.5040096790515542, + "learning_rate": 1.3419868205280878e-05, + "loss": 0.0642, + "step": 22388 + }, + { + "epoch": 2.654927072216293, + "grad_norm": 1.031997632640187, + "learning_rate": 1.3417740936149361e-05, + "loss": 0.1266, + "step": 22389 + }, + { + "epoch": 2.655045653978418, + "grad_norm": 1.1194900511058625, + "learning_rate": 1.3415613773792063e-05, + "loss": 0.1299, + "step": 22390 + }, + { + "epoch": 2.655164235740543, + "grad_norm": 0.7851439051294935, + "learning_rate": 1.3413486718228607e-05, + "loss": 0.1197, + "step": 22391 + }, + { + "epoch": 2.655282817502668, + "grad_norm": 0.5419916566019246, + "learning_rate": 1.341135976947858e-05, + "loss": 0.0692, + "step": 22392 + }, + { + "epoch": 2.655401399264793, + "grad_norm": 0.7931895039277034, + "learning_rate": 1.3409232927561621e-05, + "loss": 0.0963, + "step": 22393 + }, + { + "epoch": 2.6555199810269183, + "grad_norm": 0.6451054345977111, + "learning_rate": 1.340710619249731e-05, + "loss": 0.0939, + "step": 22394 + }, + { + "epoch": 2.655638562789043, + "grad_norm": 0.5193334533098175, + "learning_rate": 1.3404979564305268e-05, + "loss": 0.0636, + "step": 22395 + }, + { + "epoch": 2.655757144551168, + "grad_norm": 0.7961317690483893, + "learning_rate": 1.3402853043005092e-05, + "loss": 0.0981, + "step": 22396 + }, + { + "epoch": 2.655875726313293, + "grad_norm": 0.7414180385071996, + "learning_rate": 1.3400726628616393e-05, + "loss": 0.0975, + "step": 22397 + }, + { + "epoch": 2.6559943080754183, + "grad_norm": 0.6277473095766107, + "learning_rate": 1.3398600321158777e-05, + "loss": 0.0839, + "step": 22398 + }, + { + "epoch": 2.656112889837543, + "grad_norm": 0.5362848009075686, + "learning_rate": 1.339647412065182e-05, + "loss": 0.0719, + "step": 22399 + }, + { + "epoch": 2.6562314715996678, + "grad_norm": 0.9166106352708544, + "learning_rate": 1.3394348027115164e-05, + "loss": 0.1541, + "step": 22400 + }, + { + "epoch": 2.656350053361793, + "grad_norm": 0.6350178024530285, + "learning_rate": 1.3392222040568376e-05, + "loss": 0.0811, + "step": 22401 + }, + { + "epoch": 2.656468635123918, + "grad_norm": 0.4913492914267452, + "learning_rate": 1.339009616103108e-05, + "loss": 0.0824, + "step": 22402 + }, + { + "epoch": 2.656587216886043, + "grad_norm": 0.866264080041984, + "learning_rate": 1.3387970388522836e-05, + "loss": 0.1221, + "step": 22403 + }, + { + "epoch": 2.6567057986481677, + "grad_norm": 0.8272121721870792, + "learning_rate": 1.3385844723063284e-05, + "loss": 0.1068, + "step": 22404 + }, + { + "epoch": 2.656824380410293, + "grad_norm": 0.49376756988135784, + "learning_rate": 1.3383719164671998e-05, + "loss": 0.0641, + "step": 22405 + }, + { + "epoch": 2.656942962172418, + "grad_norm": 0.9391437955072227, + "learning_rate": 1.3381593713368573e-05, + "loss": 0.1691, + "step": 22406 + }, + { + "epoch": 2.657061543934543, + "grad_norm": 0.6842462248478755, + "learning_rate": 1.3379468369172604e-05, + "loss": 0.1062, + "step": 22407 + }, + { + "epoch": 2.6571801256966676, + "grad_norm": 0.8790591306427025, + "learning_rate": 1.337734313210369e-05, + "loss": 0.1168, + "step": 22408 + }, + { + "epoch": 2.657298707458793, + "grad_norm": 0.8816985108213671, + "learning_rate": 1.3375218002181428e-05, + "loss": 0.1451, + "step": 22409 + }, + { + "epoch": 2.657417289220918, + "grad_norm": 0.8291035499835709, + "learning_rate": 1.3373092979425383e-05, + "loss": 0.1266, + "step": 22410 + }, + { + "epoch": 2.657535870983043, + "grad_norm": 0.5785898123081427, + "learning_rate": 1.337096806385518e-05, + "loss": 0.0792, + "step": 22411 + }, + { + "epoch": 2.6576544527451675, + "grad_norm": 0.7566743200168137, + "learning_rate": 1.3368843255490383e-05, + "loss": 0.0975, + "step": 22412 + }, + { + "epoch": 2.6577730345072927, + "grad_norm": 0.7058434871760788, + "learning_rate": 1.3366718554350597e-05, + "loss": 0.1029, + "step": 22413 + }, + { + "epoch": 2.657891616269418, + "grad_norm": 0.7050785335660046, + "learning_rate": 1.336459396045538e-05, + "loss": 0.1063, + "step": 22414 + }, + { + "epoch": 2.6580101980315427, + "grad_norm": 0.8487139551582903, + "learning_rate": 1.3362469473824357e-05, + "loss": 0.0783, + "step": 22415 + }, + { + "epoch": 2.6581287797936675, + "grad_norm": 0.7371115687277869, + "learning_rate": 1.3360345094477089e-05, + "loss": 0.0953, + "step": 22416 + }, + { + "epoch": 2.6582473615557927, + "grad_norm": 0.4243011362685989, + "learning_rate": 1.3358220822433162e-05, + "loss": 0.0612, + "step": 22417 + }, + { + "epoch": 2.658365943317918, + "grad_norm": 0.6226591017653567, + "learning_rate": 1.3356096657712164e-05, + "loss": 0.0874, + "step": 22418 + }, + { + "epoch": 2.6584845250800426, + "grad_norm": 0.5508245675434108, + "learning_rate": 1.3353972600333678e-05, + "loss": 0.0807, + "step": 22419 + }, + { + "epoch": 2.658603106842168, + "grad_norm": 0.6145860560080613, + "learning_rate": 1.3351848650317288e-05, + "loss": 0.0762, + "step": 22420 + }, + { + "epoch": 2.6587216886042926, + "grad_norm": 0.6701534941552142, + "learning_rate": 1.3349724807682557e-05, + "loss": 0.0778, + "step": 22421 + }, + { + "epoch": 2.658840270366418, + "grad_norm": 0.5298993742711228, + "learning_rate": 1.3347601072449093e-05, + "loss": 0.08, + "step": 22422 + }, + { + "epoch": 2.6589588521285425, + "grad_norm": 0.5049555068860782, + "learning_rate": 1.3345477444636447e-05, + "loss": 0.0586, + "step": 22423 + }, + { + "epoch": 2.6590774338906678, + "grad_norm": 0.5849987727013093, + "learning_rate": 1.3343353924264207e-05, + "loss": 0.0848, + "step": 22424 + }, + { + "epoch": 2.6591960156527925, + "grad_norm": 0.767606881256208, + "learning_rate": 1.3341230511351955e-05, + "loss": 0.1221, + "step": 22425 + }, + { + "epoch": 2.6593145974149177, + "grad_norm": 0.7089199647771987, + "learning_rate": 1.3339107205919254e-05, + "loss": 0.098, + "step": 22426 + }, + { + "epoch": 2.6594331791770425, + "grad_norm": 0.639793393905938, + "learning_rate": 1.3336984007985697e-05, + "loss": 0.0992, + "step": 22427 + }, + { + "epoch": 2.6595517609391677, + "grad_norm": 0.820550869140673, + "learning_rate": 1.3334860917570835e-05, + "loss": 0.1093, + "step": 22428 + }, + { + "epoch": 2.6596703427012924, + "grad_norm": 0.6118879663495935, + "learning_rate": 1.333273793469425e-05, + "loss": 0.0869, + "step": 22429 + }, + { + "epoch": 2.6597889244634176, + "grad_norm": 0.7445666144948621, + "learning_rate": 1.3330615059375517e-05, + "loss": 0.0984, + "step": 22430 + }, + { + "epoch": 2.6599075062255424, + "grad_norm": 0.5791254705942529, + "learning_rate": 1.3328492291634212e-05, + "loss": 0.0881, + "step": 22431 + }, + { + "epoch": 2.6600260879876676, + "grad_norm": 0.77210292159417, + "learning_rate": 1.3326369631489885e-05, + "loss": 0.1236, + "step": 22432 + }, + { + "epoch": 2.6601446697497924, + "grad_norm": 0.7583768914574568, + "learning_rate": 1.3324247078962116e-05, + "loss": 0.0979, + "step": 22433 + }, + { + "epoch": 2.6602632515119176, + "grad_norm": 0.5189143690408851, + "learning_rate": 1.3322124634070472e-05, + "loss": 0.0718, + "step": 22434 + }, + { + "epoch": 2.6603818332740423, + "grad_norm": 0.6575158284779892, + "learning_rate": 1.3320002296834514e-05, + "loss": 0.0806, + "step": 22435 + }, + { + "epoch": 2.6605004150361675, + "grad_norm": 0.6763895436385835, + "learning_rate": 1.3317880067273813e-05, + "loss": 0.0823, + "step": 22436 + }, + { + "epoch": 2.6606189967982923, + "grad_norm": 0.5254427939897763, + "learning_rate": 1.3315757945407936e-05, + "loss": 0.0823, + "step": 22437 + }, + { + "epoch": 2.6607375785604175, + "grad_norm": 0.6345636980210823, + "learning_rate": 1.3313635931256448e-05, + "loss": 0.0896, + "step": 22438 + }, + { + "epoch": 2.6608561603225422, + "grad_norm": 0.8052778163050947, + "learning_rate": 1.33115140248389e-05, + "loss": 0.1045, + "step": 22439 + }, + { + "epoch": 2.6609747420846674, + "grad_norm": 0.6046268823599645, + "learning_rate": 1.3309392226174858e-05, + "loss": 0.0895, + "step": 22440 + }, + { + "epoch": 2.661093323846792, + "grad_norm": 0.5507189618283719, + "learning_rate": 1.3307270535283884e-05, + "loss": 0.0771, + "step": 22441 + }, + { + "epoch": 2.6612119056089174, + "grad_norm": 0.7285015518336657, + "learning_rate": 1.3305148952185546e-05, + "loss": 0.1041, + "step": 22442 + }, + { + "epoch": 2.6613304873710426, + "grad_norm": 0.4472958029172166, + "learning_rate": 1.3303027476899382e-05, + "loss": 0.0605, + "step": 22443 + }, + { + "epoch": 2.6614490691331674, + "grad_norm": 0.8214893793757132, + "learning_rate": 1.3300906109444963e-05, + "loss": 0.1338, + "step": 22444 + }, + { + "epoch": 2.661567650895292, + "grad_norm": 0.6330771129637435, + "learning_rate": 1.3298784849841844e-05, + "loss": 0.077, + "step": 22445 + }, + { + "epoch": 2.6616862326574173, + "grad_norm": 0.7155178879551087, + "learning_rate": 1.329666369810958e-05, + "loss": 0.0874, + "step": 22446 + }, + { + "epoch": 2.6618048144195425, + "grad_norm": 0.6127504524333259, + "learning_rate": 1.3294542654267733e-05, + "loss": 0.0857, + "step": 22447 + }, + { + "epoch": 2.6619233961816673, + "grad_norm": 0.6303653149845752, + "learning_rate": 1.329242171833583e-05, + "loss": 0.0979, + "step": 22448 + }, + { + "epoch": 2.662041977943792, + "grad_norm": 0.4694227884877666, + "learning_rate": 1.329030089033346e-05, + "loss": 0.0523, + "step": 22449 + }, + { + "epoch": 2.6621605597059173, + "grad_norm": 0.6562839955459945, + "learning_rate": 1.3288180170280146e-05, + "loss": 0.0846, + "step": 22450 + }, + { + "epoch": 2.6622791414680425, + "grad_norm": 0.5217468090903937, + "learning_rate": 1.3286059558195452e-05, + "loss": 0.0725, + "step": 22451 + }, + { + "epoch": 2.662397723230167, + "grad_norm": 0.47437416605155425, + "learning_rate": 1.328393905409892e-05, + "loss": 0.0624, + "step": 22452 + }, + { + "epoch": 2.662516304992292, + "grad_norm": 0.5591201995061557, + "learning_rate": 1.328181865801012e-05, + "loss": 0.0705, + "step": 22453 + }, + { + "epoch": 2.662634886754417, + "grad_norm": 0.7126168649876842, + "learning_rate": 1.3279698369948567e-05, + "loss": 0.0973, + "step": 22454 + }, + { + "epoch": 2.6627534685165424, + "grad_norm": 0.4576086896012243, + "learning_rate": 1.3277578189933823e-05, + "loss": 0.0537, + "step": 22455 + }, + { + "epoch": 2.662872050278667, + "grad_norm": 0.6372267520825832, + "learning_rate": 1.3275458117985434e-05, + "loss": 0.1053, + "step": 22456 + }, + { + "epoch": 2.662990632040792, + "grad_norm": 0.6351668193232108, + "learning_rate": 1.3273338154122943e-05, + "loss": 0.0917, + "step": 22457 + }, + { + "epoch": 2.663109213802917, + "grad_norm": 0.7353219157655141, + "learning_rate": 1.3271218298365904e-05, + "loss": 0.0716, + "step": 22458 + }, + { + "epoch": 2.6632277955650423, + "grad_norm": 0.7327992426092528, + "learning_rate": 1.3269098550733827e-05, + "loss": 0.1142, + "step": 22459 + }, + { + "epoch": 2.663346377327167, + "grad_norm": 0.6932339488996301, + "learning_rate": 1.32669789112463e-05, + "loss": 0.092, + "step": 22460 + }, + { + "epoch": 2.663464959089292, + "grad_norm": 0.796770140145577, + "learning_rate": 1.3264859379922829e-05, + "loss": 0.0857, + "step": 22461 + }, + { + "epoch": 2.663583540851417, + "grad_norm": 0.7647214537783893, + "learning_rate": 1.3262739956782972e-05, + "loss": 0.0922, + "step": 22462 + }, + { + "epoch": 2.6637021226135422, + "grad_norm": 0.6486676284477038, + "learning_rate": 1.3260620641846242e-05, + "loss": 0.0907, + "step": 22463 + }, + { + "epoch": 2.663820704375667, + "grad_norm": 0.7440429510528012, + "learning_rate": 1.3258501435132212e-05, + "loss": 0.1033, + "step": 22464 + }, + { + "epoch": 2.6639392861377917, + "grad_norm": 0.5578018343913153, + "learning_rate": 1.3256382336660389e-05, + "loss": 0.0822, + "step": 22465 + }, + { + "epoch": 2.664057867899917, + "grad_norm": 0.6530233572361547, + "learning_rate": 1.3254263346450324e-05, + "loss": 0.0996, + "step": 22466 + }, + { + "epoch": 2.664176449662042, + "grad_norm": 0.7232963825095917, + "learning_rate": 1.3252144464521544e-05, + "loss": 0.1004, + "step": 22467 + }, + { + "epoch": 2.664295031424167, + "grad_norm": 0.9456924179014553, + "learning_rate": 1.3250025690893586e-05, + "loss": 0.0963, + "step": 22468 + }, + { + "epoch": 2.664413613186292, + "grad_norm": 0.7340584602581961, + "learning_rate": 1.3247907025585992e-05, + "loss": 0.1315, + "step": 22469 + }, + { + "epoch": 2.664532194948417, + "grad_norm": 0.6321511422862255, + "learning_rate": 1.3245788468618268e-05, + "loss": 0.0929, + "step": 22470 + }, + { + "epoch": 2.664650776710542, + "grad_norm": 0.8406107973248735, + "learning_rate": 1.3243670020009977e-05, + "loss": 0.1105, + "step": 22471 + }, + { + "epoch": 2.664769358472667, + "grad_norm": 0.5081360674079848, + "learning_rate": 1.3241551679780623e-05, + "loss": 0.0674, + "step": 22472 + }, + { + "epoch": 2.664887940234792, + "grad_norm": 0.6016269363229536, + "learning_rate": 1.3239433447949743e-05, + "loss": 0.089, + "step": 22473 + }, + { + "epoch": 2.665006521996917, + "grad_norm": 0.6594904072670792, + "learning_rate": 1.3237315324536864e-05, + "loss": 0.0874, + "step": 22474 + }, + { + "epoch": 2.665125103759042, + "grad_norm": 0.6949524360961182, + "learning_rate": 1.3235197309561514e-05, + "loss": 0.0885, + "step": 22475 + }, + { + "epoch": 2.6652436855211667, + "grad_norm": 0.5588776909669847, + "learning_rate": 1.3233079403043224e-05, + "loss": 0.0708, + "step": 22476 + }, + { + "epoch": 2.665362267283292, + "grad_norm": 0.6816993975273699, + "learning_rate": 1.3230961605001506e-05, + "loss": 0.0696, + "step": 22477 + }, + { + "epoch": 2.6654808490454167, + "grad_norm": 0.4278146003401951, + "learning_rate": 1.3228843915455893e-05, + "loss": 0.0631, + "step": 22478 + }, + { + "epoch": 2.665599430807542, + "grad_norm": 0.9180550725982773, + "learning_rate": 1.3226726334425902e-05, + "loss": 0.1285, + "step": 22479 + }, + { + "epoch": 2.6657180125696667, + "grad_norm": 0.46025989845458337, + "learning_rate": 1.3224608861931065e-05, + "loss": 0.066, + "step": 22480 + }, + { + "epoch": 2.665836594331792, + "grad_norm": 0.535353733242164, + "learning_rate": 1.3222491497990874e-05, + "loss": 0.0679, + "step": 22481 + }, + { + "epoch": 2.6659551760939166, + "grad_norm": 0.6458543993607475, + "learning_rate": 1.3220374242624892e-05, + "loss": 0.0982, + "step": 22482 + }, + { + "epoch": 2.666073757856042, + "grad_norm": 0.7549070358604119, + "learning_rate": 1.3218257095852604e-05, + "loss": 0.0952, + "step": 22483 + }, + { + "epoch": 2.6661923396181666, + "grad_norm": 0.5995009616947008, + "learning_rate": 1.3216140057693535e-05, + "loss": 0.0797, + "step": 22484 + }, + { + "epoch": 2.666310921380292, + "grad_norm": 0.5749846485756692, + "learning_rate": 1.3214023128167209e-05, + "loss": 0.0855, + "step": 22485 + }, + { + "epoch": 2.6664295031424166, + "grad_norm": 0.8126639111252257, + "learning_rate": 1.321190630729313e-05, + "loss": 0.123, + "step": 22486 + }, + { + "epoch": 2.6665480849045418, + "grad_norm": 0.8080673185783322, + "learning_rate": 1.3209789595090832e-05, + "loss": 0.0917, + "step": 22487 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.5728512842470257, + "learning_rate": 1.3207672991579808e-05, + "loss": 0.0749, + "step": 22488 + }, + { + "epoch": 2.6667852484287917, + "grad_norm": 0.7411836756777824, + "learning_rate": 1.3205556496779578e-05, + "loss": 0.1361, + "step": 22489 + }, + { + "epoch": 2.6669038301909165, + "grad_norm": 0.6652720118189394, + "learning_rate": 1.3203440110709653e-05, + "loss": 0.0876, + "step": 22490 + }, + { + "epoch": 2.6670224119530417, + "grad_norm": 0.748383246525158, + "learning_rate": 1.3201323833389557e-05, + "loss": 0.1025, + "step": 22491 + }, + { + "epoch": 2.6671409937151664, + "grad_norm": 0.5960270656536343, + "learning_rate": 1.3199207664838762e-05, + "loss": 0.0745, + "step": 22492 + }, + { + "epoch": 2.6672595754772916, + "grad_norm": 0.5908379279142165, + "learning_rate": 1.3197091605076822e-05, + "loss": 0.074, + "step": 22493 + }, + { + "epoch": 2.6673781572394164, + "grad_norm": 0.5869325862795102, + "learning_rate": 1.3194975654123214e-05, + "loss": 0.0549, + "step": 22494 + }, + { + "epoch": 2.6674967390015416, + "grad_norm": 0.707571119830174, + "learning_rate": 1.3192859811997457e-05, + "loss": 0.0734, + "step": 22495 + }, + { + "epoch": 2.667615320763667, + "grad_norm": 0.6723050965212071, + "learning_rate": 1.3190744078719052e-05, + "loss": 0.065, + "step": 22496 + }, + { + "epoch": 2.6677339025257916, + "grad_norm": 0.5054257466112098, + "learning_rate": 1.3188628454307506e-05, + "loss": 0.0682, + "step": 22497 + }, + { + "epoch": 2.6678524842879163, + "grad_norm": 0.838571202296284, + "learning_rate": 1.318651293878233e-05, + "loss": 0.112, + "step": 22498 + }, + { + "epoch": 2.6679710660500415, + "grad_norm": 0.604181781400712, + "learning_rate": 1.3184397532163007e-05, + "loss": 0.109, + "step": 22499 + }, + { + "epoch": 2.6680896478121667, + "grad_norm": 0.6084951192188945, + "learning_rate": 1.3182282234469052e-05, + "loss": 0.0708, + "step": 22500 + }, + { + "epoch": 2.6682082295742915, + "grad_norm": 0.7899293029752712, + "learning_rate": 1.318016704571996e-05, + "loss": 0.116, + "step": 22501 + }, + { + "epoch": 2.6683268113364162, + "grad_norm": 0.6259195271962176, + "learning_rate": 1.3178051965935246e-05, + "loss": 0.073, + "step": 22502 + }, + { + "epoch": 2.6684453930985415, + "grad_norm": 0.7578006500054932, + "learning_rate": 1.3175936995134385e-05, + "loss": 0.1026, + "step": 22503 + }, + { + "epoch": 2.6685639748606667, + "grad_norm": 0.4403115222826238, + "learning_rate": 1.3173822133336888e-05, + "loss": 0.0578, + "step": 22504 + }, + { + "epoch": 2.6686825566227914, + "grad_norm": 0.7322169197610137, + "learning_rate": 1.3171707380562246e-05, + "loss": 0.1094, + "step": 22505 + }, + { + "epoch": 2.668801138384916, + "grad_norm": 1.0375836559357705, + "learning_rate": 1.3169592736829956e-05, + "loss": 0.129, + "step": 22506 + }, + { + "epoch": 2.6689197201470414, + "grad_norm": 0.5876456592696075, + "learning_rate": 1.3167478202159516e-05, + "loss": 0.0791, + "step": 22507 + }, + { + "epoch": 2.6690383019091666, + "grad_norm": 0.7263845168104083, + "learning_rate": 1.3165363776570416e-05, + "loss": 0.0998, + "step": 22508 + }, + { + "epoch": 2.6691568836712913, + "grad_norm": 0.6081879447095457, + "learning_rate": 1.3163249460082161e-05, + "loss": 0.0893, + "step": 22509 + }, + { + "epoch": 2.669275465433416, + "grad_norm": 0.47166798478382943, + "learning_rate": 1.3161135252714219e-05, + "loss": 0.0637, + "step": 22510 + }, + { + "epoch": 2.6693940471955413, + "grad_norm": 0.7297959040572417, + "learning_rate": 1.315902115448609e-05, + "loss": 0.0889, + "step": 22511 + }, + { + "epoch": 2.6695126289576665, + "grad_norm": 0.8003942491403716, + "learning_rate": 1.3156907165417271e-05, + "loss": 0.1198, + "step": 22512 + }, + { + "epoch": 2.6696312107197913, + "grad_norm": 0.6343017785350753, + "learning_rate": 1.315479328552725e-05, + "loss": 0.0879, + "step": 22513 + }, + { + "epoch": 2.669749792481916, + "grad_norm": 0.6811776572516606, + "learning_rate": 1.31526795148355e-05, + "loss": 0.074, + "step": 22514 + }, + { + "epoch": 2.669868374244041, + "grad_norm": 0.6716160865651479, + "learning_rate": 1.3150565853361518e-05, + "loss": 0.0964, + "step": 22515 + }, + { + "epoch": 2.6699869560061664, + "grad_norm": 0.6078943109542597, + "learning_rate": 1.3148452301124788e-05, + "loss": 0.0827, + "step": 22516 + }, + { + "epoch": 2.670105537768291, + "grad_norm": 0.9435191383194063, + "learning_rate": 1.3146338858144797e-05, + "loss": 0.1322, + "step": 22517 + }, + { + "epoch": 2.6702241195304164, + "grad_norm": 0.5962735599829234, + "learning_rate": 1.3144225524441034e-05, + "loss": 0.0998, + "step": 22518 + }, + { + "epoch": 2.670342701292541, + "grad_norm": 0.4798409382001057, + "learning_rate": 1.314211230003295e-05, + "loss": 0.0796, + "step": 22519 + }, + { + "epoch": 2.6704612830546663, + "grad_norm": 0.6233764832367209, + "learning_rate": 1.3139999184940071e-05, + "loss": 0.1041, + "step": 22520 + }, + { + "epoch": 2.670579864816791, + "grad_norm": 0.8137069297344217, + "learning_rate": 1.313788617918185e-05, + "loss": 0.0958, + "step": 22521 + }, + { + "epoch": 2.6706984465789163, + "grad_norm": 0.5213185373021973, + "learning_rate": 1.3135773282777768e-05, + "loss": 0.076, + "step": 22522 + }, + { + "epoch": 2.670817028341041, + "grad_norm": 0.5148815844000691, + "learning_rate": 1.3133660495747308e-05, + "loss": 0.0644, + "step": 22523 + }, + { + "epoch": 2.6709356101031663, + "grad_norm": 0.6690004749598457, + "learning_rate": 1.3131547818109947e-05, + "loss": 0.1112, + "step": 22524 + }, + { + "epoch": 2.671054191865291, + "grad_norm": 0.695653300800375, + "learning_rate": 1.3129435249885173e-05, + "loss": 0.0808, + "step": 22525 + }, + { + "epoch": 2.6711727736274162, + "grad_norm": 0.6767937819232538, + "learning_rate": 1.3127322791092427e-05, + "loss": 0.0655, + "step": 22526 + }, + { + "epoch": 2.671291355389541, + "grad_norm": 0.4652638734154, + "learning_rate": 1.3125210441751223e-05, + "loss": 0.0639, + "step": 22527 + }, + { + "epoch": 2.671409937151666, + "grad_norm": 0.8142550874721474, + "learning_rate": 1.3123098201881012e-05, + "loss": 0.0946, + "step": 22528 + }, + { + "epoch": 2.671528518913791, + "grad_norm": 0.9159056501010985, + "learning_rate": 1.3120986071501279e-05, + "loss": 0.1287, + "step": 22529 + }, + { + "epoch": 2.671647100675916, + "grad_norm": 0.9383881151837395, + "learning_rate": 1.3118874050631469e-05, + "loss": 0.1001, + "step": 22530 + }, + { + "epoch": 2.671765682438041, + "grad_norm": 0.6829298237272435, + "learning_rate": 1.3116762139291089e-05, + "loss": 0.0762, + "step": 22531 + }, + { + "epoch": 2.671884264200166, + "grad_norm": 0.885322432647032, + "learning_rate": 1.3114650337499578e-05, + "loss": 0.1118, + "step": 22532 + }, + { + "epoch": 2.672002845962291, + "grad_norm": 0.63959464851416, + "learning_rate": 1.311253864527642e-05, + "loss": 0.0858, + "step": 22533 + }, + { + "epoch": 2.672121427724416, + "grad_norm": 0.7319527723998631, + "learning_rate": 1.3110427062641078e-05, + "loss": 0.1007, + "step": 22534 + }, + { + "epoch": 2.672240009486541, + "grad_norm": 0.553495636622783, + "learning_rate": 1.3108315589613018e-05, + "loss": 0.0861, + "step": 22535 + }, + { + "epoch": 2.672358591248666, + "grad_norm": 0.8097738547380178, + "learning_rate": 1.3106204226211713e-05, + "loss": 0.1055, + "step": 22536 + }, + { + "epoch": 2.672477173010791, + "grad_norm": 0.7025386572783162, + "learning_rate": 1.3104092972456602e-05, + "loss": 0.0853, + "step": 22537 + }, + { + "epoch": 2.672595754772916, + "grad_norm": 0.8928333583174839, + "learning_rate": 1.3101981828367186e-05, + "loss": 0.1138, + "step": 22538 + }, + { + "epoch": 2.6727143365350408, + "grad_norm": 0.5722382818014681, + "learning_rate": 1.3099870793962899e-05, + "loss": 0.0701, + "step": 22539 + }, + { + "epoch": 2.672832918297166, + "grad_norm": 0.6801978174545812, + "learning_rate": 1.3097759869263215e-05, + "loss": 0.0868, + "step": 22540 + }, + { + "epoch": 2.6729515000592907, + "grad_norm": 1.1737511240022422, + "learning_rate": 1.3095649054287573e-05, + "loss": 0.1465, + "step": 22541 + }, + { + "epoch": 2.673070081821416, + "grad_norm": 0.7844933381075219, + "learning_rate": 1.309353834905547e-05, + "loss": 0.0825, + "step": 22542 + }, + { + "epoch": 2.6731886635835407, + "grad_norm": 0.7559803969098862, + "learning_rate": 1.3091427753586333e-05, + "loss": 0.0845, + "step": 22543 + }, + { + "epoch": 2.673307245345666, + "grad_norm": 0.5093099934713377, + "learning_rate": 1.3089317267899626e-05, + "loss": 0.0786, + "step": 22544 + }, + { + "epoch": 2.673425827107791, + "grad_norm": 0.5257702430103471, + "learning_rate": 1.3087206892014809e-05, + "loss": 0.0612, + "step": 22545 + }, + { + "epoch": 2.673544408869916, + "grad_norm": 0.9904086250391138, + "learning_rate": 1.3085096625951338e-05, + "loss": 0.1271, + "step": 22546 + }, + { + "epoch": 2.6736629906320406, + "grad_norm": 0.3660725551266328, + "learning_rate": 1.308298646972867e-05, + "loss": 0.0469, + "step": 22547 + }, + { + "epoch": 2.673781572394166, + "grad_norm": 0.6686395017488083, + "learning_rate": 1.3080876423366249e-05, + "loss": 0.1062, + "step": 22548 + }, + { + "epoch": 2.673900154156291, + "grad_norm": 0.6969277428866165, + "learning_rate": 1.3078766486883526e-05, + "loss": 0.1009, + "step": 22549 + }, + { + "epoch": 2.6740187359184158, + "grad_norm": 0.5945445320917959, + "learning_rate": 1.3076656660299963e-05, + "loss": 0.0715, + "step": 22550 + }, + { + "epoch": 2.6741373176805405, + "grad_norm": 0.6617036234482043, + "learning_rate": 1.3074546943635008e-05, + "loss": 0.0975, + "step": 22551 + }, + { + "epoch": 2.6742558994426657, + "grad_norm": 0.8432212403150774, + "learning_rate": 1.3072437336908092e-05, + "loss": 0.1337, + "step": 22552 + }, + { + "epoch": 2.674374481204791, + "grad_norm": 0.5998808945628443, + "learning_rate": 1.3070327840138693e-05, + "loss": 0.0776, + "step": 22553 + }, + { + "epoch": 2.6744930629669157, + "grad_norm": 1.3045267082035905, + "learning_rate": 1.3068218453346234e-05, + "loss": 0.1809, + "step": 22554 + }, + { + "epoch": 2.6746116447290404, + "grad_norm": 0.5472729700189259, + "learning_rate": 1.3066109176550167e-05, + "loss": 0.0749, + "step": 22555 + }, + { + "epoch": 2.6747302264911657, + "grad_norm": 0.41719885440746557, + "learning_rate": 1.3064000009769945e-05, + "loss": 0.0551, + "step": 22556 + }, + { + "epoch": 2.674848808253291, + "grad_norm": 0.608588556688199, + "learning_rate": 1.3061890953025e-05, + "loss": 0.0851, + "step": 22557 + }, + { + "epoch": 2.6749673900154156, + "grad_norm": 0.7481995998797529, + "learning_rate": 1.3059782006334792e-05, + "loss": 0.084, + "step": 22558 + }, + { + "epoch": 2.6750859717775404, + "grad_norm": 0.6063700621325764, + "learning_rate": 1.3057673169718743e-05, + "loss": 0.0754, + "step": 22559 + }, + { + "epoch": 2.6752045535396656, + "grad_norm": 0.768959103070248, + "learning_rate": 1.3055564443196302e-05, + "loss": 0.1229, + "step": 22560 + }, + { + "epoch": 2.6753231353017908, + "grad_norm": 0.46113714805090067, + "learning_rate": 1.3053455826786908e-05, + "loss": 0.0621, + "step": 22561 + }, + { + "epoch": 2.6754417170639155, + "grad_norm": 0.5976749485633504, + "learning_rate": 1.3051347320510015e-05, + "loss": 0.0845, + "step": 22562 + }, + { + "epoch": 2.6755602988260403, + "grad_norm": 0.7582802416129806, + "learning_rate": 1.3049238924385037e-05, + "loss": 0.1071, + "step": 22563 + }, + { + "epoch": 2.6756788805881655, + "grad_norm": 0.539339396528167, + "learning_rate": 1.304713063843142e-05, + "loss": 0.0472, + "step": 22564 + }, + { + "epoch": 2.6757974623502907, + "grad_norm": 0.5365091344307127, + "learning_rate": 1.30450224626686e-05, + "loss": 0.0775, + "step": 22565 + }, + { + "epoch": 2.6759160441124155, + "grad_norm": 0.5852040160349374, + "learning_rate": 1.3042914397116014e-05, + "loss": 0.0657, + "step": 22566 + }, + { + "epoch": 2.67603462587454, + "grad_norm": 0.6230579094285195, + "learning_rate": 1.3040806441793097e-05, + "loss": 0.0888, + "step": 22567 + }, + { + "epoch": 2.6761532076366654, + "grad_norm": 0.6008179650082505, + "learning_rate": 1.3038698596719274e-05, + "loss": 0.1001, + "step": 22568 + }, + { + "epoch": 2.6762717893987906, + "grad_norm": 0.7495241102454784, + "learning_rate": 1.3036590861913994e-05, + "loss": 0.1131, + "step": 22569 + }, + { + "epoch": 2.6763903711609154, + "grad_norm": 0.5708030110920733, + "learning_rate": 1.3034483237396666e-05, + "loss": 0.0961, + "step": 22570 + }, + { + "epoch": 2.6765089529230406, + "grad_norm": 0.6023630725712099, + "learning_rate": 1.303237572318673e-05, + "loss": 0.0884, + "step": 22571 + }, + { + "epoch": 2.6766275346851653, + "grad_norm": 0.6670894635291013, + "learning_rate": 1.3030268319303612e-05, + "loss": 0.0797, + "step": 22572 + }, + { + "epoch": 2.6767461164472905, + "grad_norm": 0.5445366737897201, + "learning_rate": 1.3028161025766756e-05, + "loss": 0.0856, + "step": 22573 + }, + { + "epoch": 2.6768646982094153, + "grad_norm": 0.5786607568376005, + "learning_rate": 1.3026053842595562e-05, + "loss": 0.0868, + "step": 22574 + }, + { + "epoch": 2.6769832799715405, + "grad_norm": 0.9557764025136658, + "learning_rate": 1.302394676980947e-05, + "loss": 0.1377, + "step": 22575 + }, + { + "epoch": 2.6771018617336653, + "grad_norm": 0.7755827801835828, + "learning_rate": 1.3021839807427899e-05, + "loss": 0.1031, + "step": 22576 + }, + { + "epoch": 2.6772204434957905, + "grad_norm": 0.7154862803918858, + "learning_rate": 1.3019732955470279e-05, + "loss": 0.0992, + "step": 22577 + }, + { + "epoch": 2.6773390252579152, + "grad_norm": 0.7804949838433117, + "learning_rate": 1.301762621395603e-05, + "loss": 0.102, + "step": 22578 + }, + { + "epoch": 2.6774576070200404, + "grad_norm": 0.8546001355114005, + "learning_rate": 1.3015519582904572e-05, + "loss": 0.1059, + "step": 22579 + }, + { + "epoch": 2.677576188782165, + "grad_norm": 0.6024873667861805, + "learning_rate": 1.3013413062335339e-05, + "loss": 0.078, + "step": 22580 + }, + { + "epoch": 2.6776947705442904, + "grad_norm": 0.6989664533450602, + "learning_rate": 1.3011306652267726e-05, + "loss": 0.0983, + "step": 22581 + }, + { + "epoch": 2.677813352306415, + "grad_norm": 0.7872324917177467, + "learning_rate": 1.3009200352721168e-05, + "loss": 0.1124, + "step": 22582 + }, + { + "epoch": 2.6779319340685404, + "grad_norm": 0.4457466582979142, + "learning_rate": 1.3007094163715075e-05, + "loss": 0.061, + "step": 22583 + }, + { + "epoch": 2.678050515830665, + "grad_norm": 0.4454620976672655, + "learning_rate": 1.3004988085268868e-05, + "loss": 0.0638, + "step": 22584 + }, + { + "epoch": 2.6781690975927903, + "grad_norm": 0.7642290496076534, + "learning_rate": 1.3002882117401971e-05, + "loss": 0.0996, + "step": 22585 + }, + { + "epoch": 2.678287679354915, + "grad_norm": 0.7793633558365591, + "learning_rate": 1.3000776260133773e-05, + "loss": 0.1126, + "step": 22586 + }, + { + "epoch": 2.6784062611170403, + "grad_norm": 0.8598984686876435, + "learning_rate": 1.2998670513483718e-05, + "loss": 0.1451, + "step": 22587 + }, + { + "epoch": 2.678524842879165, + "grad_norm": 0.4226059163639984, + "learning_rate": 1.2996564877471198e-05, + "loss": 0.0531, + "step": 22588 + }, + { + "epoch": 2.6786434246412902, + "grad_norm": 0.6558943411705176, + "learning_rate": 1.299445935211564e-05, + "loss": 0.0821, + "step": 22589 + }, + { + "epoch": 2.678762006403415, + "grad_norm": 0.7877057358038038, + "learning_rate": 1.2992353937436424e-05, + "loss": 0.1128, + "step": 22590 + }, + { + "epoch": 2.67888058816554, + "grad_norm": 0.515969814248458, + "learning_rate": 1.2990248633452998e-05, + "loss": 0.0736, + "step": 22591 + }, + { + "epoch": 2.678999169927665, + "grad_norm": 0.5560232823728928, + "learning_rate": 1.2988143440184743e-05, + "loss": 0.0775, + "step": 22592 + }, + { + "epoch": 2.67911775168979, + "grad_norm": 0.8928168315379594, + "learning_rate": 1.2986038357651076e-05, + "loss": 0.12, + "step": 22593 + }, + { + "epoch": 2.6792363334519154, + "grad_norm": 0.8959685394457125, + "learning_rate": 1.2983933385871405e-05, + "loss": 0.1154, + "step": 22594 + }, + { + "epoch": 2.67935491521404, + "grad_norm": 0.7332237947999771, + "learning_rate": 1.2981828524865131e-05, + "loss": 0.0928, + "step": 22595 + }, + { + "epoch": 2.679473496976165, + "grad_norm": 0.6430865194443881, + "learning_rate": 1.297972377465167e-05, + "loss": 0.0977, + "step": 22596 + }, + { + "epoch": 2.67959207873829, + "grad_norm": 0.7872516924675793, + "learning_rate": 1.2977619135250395e-05, + "loss": 0.0986, + "step": 22597 + }, + { + "epoch": 2.6797106605004153, + "grad_norm": 0.8465221659676099, + "learning_rate": 1.297551460668075e-05, + "loss": 0.1018, + "step": 22598 + }, + { + "epoch": 2.67982924226254, + "grad_norm": 0.6132168467964908, + "learning_rate": 1.2973410188962104e-05, + "loss": 0.0834, + "step": 22599 + }, + { + "epoch": 2.679947824024665, + "grad_norm": 0.55155907354466, + "learning_rate": 1.2971305882113882e-05, + "loss": 0.0724, + "step": 22600 + }, + { + "epoch": 2.68006640578679, + "grad_norm": 0.602773298926713, + "learning_rate": 1.2969201686155446e-05, + "loss": 0.0752, + "step": 22601 + }, + { + "epoch": 2.680184987548915, + "grad_norm": 0.6772003995382532, + "learning_rate": 1.2967097601106237e-05, + "loss": 0.0931, + "step": 22602 + }, + { + "epoch": 2.68030356931104, + "grad_norm": 0.5013828927436841, + "learning_rate": 1.2964993626985622e-05, + "loss": 0.0597, + "step": 22603 + }, + { + "epoch": 2.6804221510731647, + "grad_norm": 0.6927189242159728, + "learning_rate": 1.2962889763813008e-05, + "loss": 0.0876, + "step": 22604 + }, + { + "epoch": 2.68054073283529, + "grad_norm": 0.9444979662087531, + "learning_rate": 1.296078601160779e-05, + "loss": 0.08, + "step": 22605 + }, + { + "epoch": 2.680659314597415, + "grad_norm": 0.5263653131016401, + "learning_rate": 1.2958682370389364e-05, + "loss": 0.0652, + "step": 22606 + }, + { + "epoch": 2.68077789635954, + "grad_norm": 0.8634956589013477, + "learning_rate": 1.2956578840177126e-05, + "loss": 0.1248, + "step": 22607 + }, + { + "epoch": 2.6808964781216647, + "grad_norm": 0.8224833484780887, + "learning_rate": 1.2954475420990441e-05, + "loss": 0.0978, + "step": 22608 + }, + { + "epoch": 2.68101505988379, + "grad_norm": 0.681683332656767, + "learning_rate": 1.2952372112848748e-05, + "loss": 0.1007, + "step": 22609 + }, + { + "epoch": 2.681133641645915, + "grad_norm": 0.8029408613082729, + "learning_rate": 1.2950268915771397e-05, + "loss": 0.1204, + "step": 22610 + }, + { + "epoch": 2.68125222340804, + "grad_norm": 0.5668308634950144, + "learning_rate": 1.2948165829777797e-05, + "loss": 0.1, + "step": 22611 + }, + { + "epoch": 2.6813708051701646, + "grad_norm": 0.840276803549313, + "learning_rate": 1.2946062854887313e-05, + "loss": 0.1084, + "step": 22612 + }, + { + "epoch": 2.6814893869322898, + "grad_norm": 0.9922874734572971, + "learning_rate": 1.294395999111937e-05, + "loss": 0.1487, + "step": 22613 + }, + { + "epoch": 2.681607968694415, + "grad_norm": 0.7732838906662184, + "learning_rate": 1.2941857238493319e-05, + "loss": 0.1048, + "step": 22614 + }, + { + "epoch": 2.6817265504565397, + "grad_norm": 0.5312101629266369, + "learning_rate": 1.2939754597028559e-05, + "loss": 0.0846, + "step": 22615 + }, + { + "epoch": 2.6818451322186645, + "grad_norm": 0.6340044117603783, + "learning_rate": 1.2937652066744472e-05, + "loss": 0.0802, + "step": 22616 + }, + { + "epoch": 2.6819637139807897, + "grad_norm": 0.5590043966177853, + "learning_rate": 1.2935549647660439e-05, + "loss": 0.0642, + "step": 22617 + }, + { + "epoch": 2.682082295742915, + "grad_norm": 0.6326294674493468, + "learning_rate": 1.2933447339795853e-05, + "loss": 0.0976, + "step": 22618 + }, + { + "epoch": 2.6822008775050397, + "grad_norm": 0.772358510742775, + "learning_rate": 1.2931345143170077e-05, + "loss": 0.1104, + "step": 22619 + }, + { + "epoch": 2.682319459267165, + "grad_norm": 0.7177861821910948, + "learning_rate": 1.2929243057802501e-05, + "loss": 0.1115, + "step": 22620 + }, + { + "epoch": 2.6824380410292896, + "grad_norm": 0.6279231858499384, + "learning_rate": 1.2927141083712502e-05, + "loss": 0.077, + "step": 22621 + }, + { + "epoch": 2.682556622791415, + "grad_norm": 0.6998931109227527, + "learning_rate": 1.2925039220919466e-05, + "loss": 0.0981, + "step": 22622 + }, + { + "epoch": 2.6826752045535396, + "grad_norm": 0.48002255840685587, + "learning_rate": 1.2922937469442741e-05, + "loss": 0.0535, + "step": 22623 + }, + { + "epoch": 2.682793786315665, + "grad_norm": 0.8856892828373653, + "learning_rate": 1.2920835829301741e-05, + "loss": 0.0932, + "step": 22624 + }, + { + "epoch": 2.6829123680777895, + "grad_norm": 0.8538375115299279, + "learning_rate": 1.2918734300515816e-05, + "loss": 0.1186, + "step": 22625 + }, + { + "epoch": 2.6830309498399147, + "grad_norm": 0.611275326158649, + "learning_rate": 1.2916632883104344e-05, + "loss": 0.0856, + "step": 22626 + }, + { + "epoch": 2.6831495316020395, + "grad_norm": 0.35883720298190264, + "learning_rate": 1.2914531577086697e-05, + "loss": 0.0495, + "step": 22627 + }, + { + "epoch": 2.6832681133641647, + "grad_norm": 0.6751086444940376, + "learning_rate": 1.2912430382482251e-05, + "loss": 0.0954, + "step": 22628 + }, + { + "epoch": 2.6833866951262895, + "grad_norm": 0.7734813449562702, + "learning_rate": 1.2910329299310383e-05, + "loss": 0.1102, + "step": 22629 + }, + { + "epoch": 2.6835052768884147, + "grad_norm": 0.6642451811057386, + "learning_rate": 1.2908228327590444e-05, + "loss": 0.0865, + "step": 22630 + }, + { + "epoch": 2.6836238586505394, + "grad_norm": 0.7830406115771968, + "learning_rate": 1.2906127467341813e-05, + "loss": 0.1012, + "step": 22631 + }, + { + "epoch": 2.6837424404126646, + "grad_norm": 0.7516967905983076, + "learning_rate": 1.2904026718583857e-05, + "loss": 0.0877, + "step": 22632 + }, + { + "epoch": 2.6838610221747894, + "grad_norm": 0.779702776837603, + "learning_rate": 1.2901926081335941e-05, + "loss": 0.1191, + "step": 22633 + }, + { + "epoch": 2.6839796039369146, + "grad_norm": 0.644065032080313, + "learning_rate": 1.289982555561744e-05, + "loss": 0.0907, + "step": 22634 + }, + { + "epoch": 2.6840981856990394, + "grad_norm": 0.8191456447255977, + "learning_rate": 1.2897725141447686e-05, + "loss": 0.1254, + "step": 22635 + }, + { + "epoch": 2.6842167674611646, + "grad_norm": 0.7682378688115873, + "learning_rate": 1.289562483884609e-05, + "loss": 0.107, + "step": 22636 + }, + { + "epoch": 2.6843353492232893, + "grad_norm": 0.4961955207599558, + "learning_rate": 1.2893524647831977e-05, + "loss": 0.076, + "step": 22637 + }, + { + "epoch": 2.6844539309854145, + "grad_norm": 0.6084416927062515, + "learning_rate": 1.2891424568424726e-05, + "loss": 0.0783, + "step": 22638 + }, + { + "epoch": 2.6845725127475393, + "grad_norm": 0.7895022820155893, + "learning_rate": 1.2889324600643688e-05, + "loss": 0.1059, + "step": 22639 + }, + { + "epoch": 2.6846910945096645, + "grad_norm": 0.5394857424793932, + "learning_rate": 1.2887224744508233e-05, + "loss": 0.073, + "step": 22640 + }, + { + "epoch": 2.6848096762717892, + "grad_norm": 1.0965841259979192, + "learning_rate": 1.2885125000037706e-05, + "loss": 0.1649, + "step": 22641 + }, + { + "epoch": 2.6849282580339144, + "grad_norm": 0.49983519568062734, + "learning_rate": 1.288302536725147e-05, + "loss": 0.0682, + "step": 22642 + }, + { + "epoch": 2.6850468397960396, + "grad_norm": 0.6748702996729742, + "learning_rate": 1.288092584616888e-05, + "loss": 0.0855, + "step": 22643 + }, + { + "epoch": 2.6851654215581644, + "grad_norm": 0.561522219066556, + "learning_rate": 1.2878826436809294e-05, + "loss": 0.0655, + "step": 22644 + }, + { + "epoch": 2.685284003320289, + "grad_norm": 0.5190045685327433, + "learning_rate": 1.2876727139192074e-05, + "loss": 0.0577, + "step": 22645 + }, + { + "epoch": 2.6854025850824144, + "grad_norm": 0.803480557222526, + "learning_rate": 1.2874627953336544e-05, + "loss": 0.1119, + "step": 22646 + }, + { + "epoch": 2.6855211668445396, + "grad_norm": 0.9388068697901837, + "learning_rate": 1.2872528879262091e-05, + "loss": 0.1428, + "step": 22647 + }, + { + "epoch": 2.6856397486066643, + "grad_norm": 0.8278226041566398, + "learning_rate": 1.2870429916988042e-05, + "loss": 0.1131, + "step": 22648 + }, + { + "epoch": 2.685758330368789, + "grad_norm": 0.8435568621108217, + "learning_rate": 1.2868331066533757e-05, + "loss": 0.1138, + "step": 22649 + }, + { + "epoch": 2.6858769121309143, + "grad_norm": 0.5384964873037766, + "learning_rate": 1.2866232327918582e-05, + "loss": 0.0742, + "step": 22650 + }, + { + "epoch": 2.6859954938930395, + "grad_norm": 0.6297222039348218, + "learning_rate": 1.2864133701161873e-05, + "loss": 0.0717, + "step": 22651 + }, + { + "epoch": 2.6861140756551642, + "grad_norm": 0.8162208279300454, + "learning_rate": 1.2862035186282961e-05, + "loss": 0.1102, + "step": 22652 + }, + { + "epoch": 2.686232657417289, + "grad_norm": 1.0896833710125664, + "learning_rate": 1.2859936783301201e-05, + "loss": 0.1168, + "step": 22653 + }, + { + "epoch": 2.686351239179414, + "grad_norm": 0.8442768283272248, + "learning_rate": 1.2857838492235938e-05, + "loss": 0.1249, + "step": 22654 + }, + { + "epoch": 2.6864698209415394, + "grad_norm": 0.9050368229979442, + "learning_rate": 1.2855740313106512e-05, + "loss": 0.1393, + "step": 22655 + }, + { + "epoch": 2.686588402703664, + "grad_norm": 0.6635551297519555, + "learning_rate": 1.2853642245932279e-05, + "loss": 0.0836, + "step": 22656 + }, + { + "epoch": 2.686706984465789, + "grad_norm": 0.6029486728710902, + "learning_rate": 1.2851544290732547e-05, + "loss": 0.0796, + "step": 22657 + }, + { + "epoch": 2.686825566227914, + "grad_norm": 0.6238188519703463, + "learning_rate": 1.2849446447526703e-05, + "loss": 0.0943, + "step": 22658 + }, + { + "epoch": 2.6869441479900393, + "grad_norm": 0.820610606373146, + "learning_rate": 1.2847348716334052e-05, + "loss": 0.1144, + "step": 22659 + }, + { + "epoch": 2.687062729752164, + "grad_norm": 0.6650553414997744, + "learning_rate": 1.2845251097173949e-05, + "loss": 0.0922, + "step": 22660 + }, + { + "epoch": 2.687181311514289, + "grad_norm": 0.6166728237903555, + "learning_rate": 1.284315359006571e-05, + "loss": 0.0786, + "step": 22661 + }, + { + "epoch": 2.687299893276414, + "grad_norm": 0.9116481490605151, + "learning_rate": 1.2841056195028709e-05, + "loss": 0.0947, + "step": 22662 + }, + { + "epoch": 2.6874184750385393, + "grad_norm": 0.7257384742951588, + "learning_rate": 1.2838958912082244e-05, + "loss": 0.1017, + "step": 22663 + }, + { + "epoch": 2.687537056800664, + "grad_norm": 0.6000004375240212, + "learning_rate": 1.283686174124567e-05, + "loss": 0.0801, + "step": 22664 + }, + { + "epoch": 2.6876556385627888, + "grad_norm": 0.5659495189000521, + "learning_rate": 1.2834764682538313e-05, + "loss": 0.0814, + "step": 22665 + }, + { + "epoch": 2.687774220324914, + "grad_norm": 0.8945074949274806, + "learning_rate": 1.283266773597951e-05, + "loss": 0.1105, + "step": 22666 + }, + { + "epoch": 2.687892802087039, + "grad_norm": 0.7233521829452693, + "learning_rate": 1.2830570901588596e-05, + "loss": 0.0814, + "step": 22667 + }, + { + "epoch": 2.688011383849164, + "grad_norm": 0.5628142672995153, + "learning_rate": 1.2828474179384875e-05, + "loss": 0.0647, + "step": 22668 + }, + { + "epoch": 2.688129965611289, + "grad_norm": 0.9517943618211849, + "learning_rate": 1.282637756938772e-05, + "loss": 0.1237, + "step": 22669 + }, + { + "epoch": 2.688248547373414, + "grad_norm": 0.7883951748852276, + "learning_rate": 1.2824281071616424e-05, + "loss": 0.1114, + "step": 22670 + }, + { + "epoch": 2.688367129135539, + "grad_norm": 0.5898198134822347, + "learning_rate": 1.2822184686090333e-05, + "loss": 0.0862, + "step": 22671 + }, + { + "epoch": 2.688485710897664, + "grad_norm": 0.4455873516508463, + "learning_rate": 1.282008841282875e-05, + "loss": 0.0639, + "step": 22672 + }, + { + "epoch": 2.688604292659789, + "grad_norm": 0.7240027923832637, + "learning_rate": 1.2817992251851035e-05, + "loss": 0.0986, + "step": 22673 + }, + { + "epoch": 2.688722874421914, + "grad_norm": 0.8634321375389595, + "learning_rate": 1.2815896203176481e-05, + "loss": 0.114, + "step": 22674 + }, + { + "epoch": 2.688841456184039, + "grad_norm": 0.5883618816805731, + "learning_rate": 1.2813800266824424e-05, + "loss": 0.0897, + "step": 22675 + }, + { + "epoch": 2.688960037946164, + "grad_norm": 0.4709347078044427, + "learning_rate": 1.2811704442814185e-05, + "loss": 0.0598, + "step": 22676 + }, + { + "epoch": 2.689078619708289, + "grad_norm": 0.5210881954664954, + "learning_rate": 1.2809608731165085e-05, + "loss": 0.0773, + "step": 22677 + }, + { + "epoch": 2.6891972014704137, + "grad_norm": 0.6967156372828484, + "learning_rate": 1.2807513131896453e-05, + "loss": 0.0721, + "step": 22678 + }, + { + "epoch": 2.689315783232539, + "grad_norm": 0.7184347620096857, + "learning_rate": 1.280541764502758e-05, + "loss": 0.0929, + "step": 22679 + }, + { + "epoch": 2.6894343649946637, + "grad_norm": 0.5242849811702933, + "learning_rate": 1.2803322270577816e-05, + "loss": 0.0786, + "step": 22680 + }, + { + "epoch": 2.689552946756789, + "grad_norm": 0.5292645674866578, + "learning_rate": 1.2801227008566464e-05, + "loss": 0.0745, + "step": 22681 + }, + { + "epoch": 2.6896715285189137, + "grad_norm": 0.8444688514139957, + "learning_rate": 1.2799131859012831e-05, + "loss": 0.1115, + "step": 22682 + }, + { + "epoch": 2.689790110281039, + "grad_norm": 0.9573062524650158, + "learning_rate": 1.2797036821936248e-05, + "loss": 0.1202, + "step": 22683 + }, + { + "epoch": 2.6899086920431636, + "grad_norm": 0.5923501326680854, + "learning_rate": 1.2794941897356017e-05, + "loss": 0.0754, + "step": 22684 + }, + { + "epoch": 2.690027273805289, + "grad_norm": 0.7005881614422458, + "learning_rate": 1.2792847085291465e-05, + "loss": 0.0802, + "step": 22685 + }, + { + "epoch": 2.6901458555674136, + "grad_norm": 0.5866115123047189, + "learning_rate": 1.2790752385761884e-05, + "loss": 0.0699, + "step": 22686 + }, + { + "epoch": 2.690264437329539, + "grad_norm": 0.5634834566192848, + "learning_rate": 1.2788657798786593e-05, + "loss": 0.0692, + "step": 22687 + }, + { + "epoch": 2.6903830190916636, + "grad_norm": 0.5539177053486987, + "learning_rate": 1.2786563324384904e-05, + "loss": 0.0858, + "step": 22688 + }, + { + "epoch": 2.6905016008537888, + "grad_norm": 0.5547846843556626, + "learning_rate": 1.2784468962576136e-05, + "loss": 0.0628, + "step": 22689 + }, + { + "epoch": 2.6906201826159135, + "grad_norm": 1.225163933453412, + "learning_rate": 1.278237471337957e-05, + "loss": 0.1326, + "step": 22690 + }, + { + "epoch": 2.6907387643780387, + "grad_norm": 0.647136713290715, + "learning_rate": 1.278028057681453e-05, + "loss": 0.093, + "step": 22691 + }, + { + "epoch": 2.690857346140164, + "grad_norm": 0.8336036597636207, + "learning_rate": 1.2778186552900318e-05, + "loss": 0.1272, + "step": 22692 + }, + { + "epoch": 2.6909759279022887, + "grad_norm": 0.7059278712312694, + "learning_rate": 1.2776092641656234e-05, + "loss": 0.0778, + "step": 22693 + }, + { + "epoch": 2.6910945096644134, + "grad_norm": 0.5486106623814532, + "learning_rate": 1.2773998843101592e-05, + "loss": 0.0675, + "step": 22694 + }, + { + "epoch": 2.6912130914265386, + "grad_norm": 0.7799844212557053, + "learning_rate": 1.2771905157255688e-05, + "loss": 0.0908, + "step": 22695 + }, + { + "epoch": 2.691331673188664, + "grad_norm": 0.6331366498284883, + "learning_rate": 1.2769811584137832e-05, + "loss": 0.0815, + "step": 22696 + }, + { + "epoch": 2.6914502549507886, + "grad_norm": 0.5732080735812091, + "learning_rate": 1.2767718123767303e-05, + "loss": 0.0808, + "step": 22697 + }, + { + "epoch": 2.6915688367129134, + "grad_norm": 0.6494235169326608, + "learning_rate": 1.2765624776163416e-05, + "loss": 0.105, + "step": 22698 + }, + { + "epoch": 2.6916874184750386, + "grad_norm": 0.5322120167540427, + "learning_rate": 1.2763531541345464e-05, + "loss": 0.0815, + "step": 22699 + }, + { + "epoch": 2.6918060002371638, + "grad_norm": 1.0682987176848144, + "learning_rate": 1.2761438419332754e-05, + "loss": 0.1448, + "step": 22700 + }, + { + "epoch": 2.6919245819992885, + "grad_norm": 0.532037379812489, + "learning_rate": 1.2759345410144566e-05, + "loss": 0.0727, + "step": 22701 + }, + { + "epoch": 2.6920431637614133, + "grad_norm": 0.5858687410497273, + "learning_rate": 1.2757252513800205e-05, + "loss": 0.0885, + "step": 22702 + }, + { + "epoch": 2.6921617455235385, + "grad_norm": 0.6708315590432451, + "learning_rate": 1.275515973031896e-05, + "loss": 0.0891, + "step": 22703 + }, + { + "epoch": 2.6922803272856637, + "grad_norm": 0.6465279679674759, + "learning_rate": 1.2753067059720125e-05, + "loss": 0.0863, + "step": 22704 + }, + { + "epoch": 2.6923989090477884, + "grad_norm": 0.8461458061323175, + "learning_rate": 1.2750974502023005e-05, + "loss": 0.0892, + "step": 22705 + }, + { + "epoch": 2.692517490809913, + "grad_norm": 0.6884100616377786, + "learning_rate": 1.2748882057246864e-05, + "loss": 0.0929, + "step": 22706 + }, + { + "epoch": 2.6926360725720384, + "grad_norm": 1.0122048509222914, + "learning_rate": 1.2746789725411023e-05, + "loss": 0.133, + "step": 22707 + }, + { + "epoch": 2.6927546543341636, + "grad_norm": 0.7799189200738704, + "learning_rate": 1.2744697506534744e-05, + "loss": 0.081, + "step": 22708 + }, + { + "epoch": 2.6928732360962884, + "grad_norm": 0.5806361554165879, + "learning_rate": 1.2742605400637325e-05, + "loss": 0.0722, + "step": 22709 + }, + { + "epoch": 2.692991817858413, + "grad_norm": 0.8439791872339972, + "learning_rate": 1.2740513407738059e-05, + "loss": 0.1294, + "step": 22710 + }, + { + "epoch": 2.6931103996205383, + "grad_norm": 0.6892327064446699, + "learning_rate": 1.273842152785623e-05, + "loss": 0.0969, + "step": 22711 + }, + { + "epoch": 2.6932289813826635, + "grad_norm": 0.7442483493005982, + "learning_rate": 1.273632976101111e-05, + "loss": 0.1036, + "step": 22712 + }, + { + "epoch": 2.6933475631447883, + "grad_norm": 0.605087677901092, + "learning_rate": 1.273423810722199e-05, + "loss": 0.0883, + "step": 22713 + }, + { + "epoch": 2.693466144906913, + "grad_norm": 0.6296173101571032, + "learning_rate": 1.2732146566508155e-05, + "loss": 0.0841, + "step": 22714 + }, + { + "epoch": 2.6935847266690383, + "grad_norm": 0.6553535504718296, + "learning_rate": 1.2730055138888886e-05, + "loss": 0.09, + "step": 22715 + }, + { + "epoch": 2.6937033084311635, + "grad_norm": 0.7019030850484651, + "learning_rate": 1.2727963824383469e-05, + "loss": 0.079, + "step": 22716 + }, + { + "epoch": 2.693821890193288, + "grad_norm": 0.5357386368162855, + "learning_rate": 1.2725872623011159e-05, + "loss": 0.0876, + "step": 22717 + }, + { + "epoch": 2.6939404719554134, + "grad_norm": 0.7198348065927875, + "learning_rate": 1.2723781534791268e-05, + "loss": 0.108, + "step": 22718 + }, + { + "epoch": 2.694059053717538, + "grad_norm": 0.6465112287929389, + "learning_rate": 1.2721690559743053e-05, + "loss": 0.0825, + "step": 22719 + }, + { + "epoch": 2.6941776354796634, + "grad_norm": 0.705006458967942, + "learning_rate": 1.2719599697885798e-05, + "loss": 0.1031, + "step": 22720 + }, + { + "epoch": 2.694296217241788, + "grad_norm": 0.6324114384865103, + "learning_rate": 1.271750894923876e-05, + "loss": 0.087, + "step": 22721 + }, + { + "epoch": 2.6944147990039133, + "grad_norm": 0.7740025820539483, + "learning_rate": 1.2715418313821246e-05, + "loss": 0.0911, + "step": 22722 + }, + { + "epoch": 2.694533380766038, + "grad_norm": 0.670932296731335, + "learning_rate": 1.2713327791652501e-05, + "loss": 0.0938, + "step": 22723 + }, + { + "epoch": 2.6946519625281633, + "grad_norm": 0.6416768760788588, + "learning_rate": 1.2711237382751806e-05, + "loss": 0.0861, + "step": 22724 + }, + { + "epoch": 2.694770544290288, + "grad_norm": 0.9209888406094003, + "learning_rate": 1.2709147087138435e-05, + "loss": 0.1158, + "step": 22725 + }, + { + "epoch": 2.6948891260524133, + "grad_norm": 0.7313138612174371, + "learning_rate": 1.2707056904831652e-05, + "loss": 0.0923, + "step": 22726 + }, + { + "epoch": 2.695007707814538, + "grad_norm": 0.5958644377380091, + "learning_rate": 1.270496683585074e-05, + "loss": 0.0774, + "step": 22727 + }, + { + "epoch": 2.6951262895766632, + "grad_norm": 0.6597566816130558, + "learning_rate": 1.2702876880214939e-05, + "loss": 0.0771, + "step": 22728 + }, + { + "epoch": 2.695244871338788, + "grad_norm": 0.6699954876689014, + "learning_rate": 1.2700787037943554e-05, + "loss": 0.0992, + "step": 22729 + }, + { + "epoch": 2.695363453100913, + "grad_norm": 0.793372103057005, + "learning_rate": 1.2698697309055818e-05, + "loss": 0.1213, + "step": 22730 + }, + { + "epoch": 2.695482034863038, + "grad_norm": 0.6519483671296115, + "learning_rate": 1.2696607693571012e-05, + "loss": 0.0904, + "step": 22731 + }, + { + "epoch": 2.695600616625163, + "grad_norm": 0.4441598468101067, + "learning_rate": 1.2694518191508392e-05, + "loss": 0.0683, + "step": 22732 + }, + { + "epoch": 2.695719198387288, + "grad_norm": 0.7165968269370394, + "learning_rate": 1.2692428802887224e-05, + "loss": 0.1055, + "step": 22733 + }, + { + "epoch": 2.695837780149413, + "grad_norm": 0.5324137991379294, + "learning_rate": 1.269033952772678e-05, + "loss": 0.07, + "step": 22734 + }, + { + "epoch": 2.695956361911538, + "grad_norm": 0.6247029104998374, + "learning_rate": 1.2688250366046293e-05, + "loss": 0.0793, + "step": 22735 + }, + { + "epoch": 2.696074943673663, + "grad_norm": 0.5387210650355754, + "learning_rate": 1.2686161317865055e-05, + "loss": 0.0586, + "step": 22736 + }, + { + "epoch": 2.696193525435788, + "grad_norm": 0.5029929192715705, + "learning_rate": 1.2684072383202306e-05, + "loss": 0.0707, + "step": 22737 + }, + { + "epoch": 2.696312107197913, + "grad_norm": 0.5135868078229175, + "learning_rate": 1.268198356207731e-05, + "loss": 0.072, + "step": 22738 + }, + { + "epoch": 2.696430688960038, + "grad_norm": 0.5423198595512981, + "learning_rate": 1.2679894854509306e-05, + "loss": 0.0846, + "step": 22739 + }, + { + "epoch": 2.696549270722163, + "grad_norm": 0.7709149546288857, + "learning_rate": 1.267780626051758e-05, + "loss": 0.071, + "step": 22740 + }, + { + "epoch": 2.6966678524842878, + "grad_norm": 0.7321824371871846, + "learning_rate": 1.267571778012136e-05, + "loss": 0.0909, + "step": 22741 + }, + { + "epoch": 2.696786434246413, + "grad_norm": 0.8413501086362907, + "learning_rate": 1.2673629413339911e-05, + "loss": 0.136, + "step": 22742 + }, + { + "epoch": 2.6969050160085377, + "grad_norm": 0.5979879019243964, + "learning_rate": 1.2671541160192481e-05, + "loss": 0.0674, + "step": 22743 + }, + { + "epoch": 2.697023597770663, + "grad_norm": 1.0112677797376557, + "learning_rate": 1.2669453020698327e-05, + "loss": 0.1616, + "step": 22744 + }, + { + "epoch": 2.697142179532788, + "grad_norm": 0.44633091094511135, + "learning_rate": 1.2667364994876702e-05, + "loss": 0.0774, + "step": 22745 + }, + { + "epoch": 2.697260761294913, + "grad_norm": 0.5995920200690452, + "learning_rate": 1.2665277082746841e-05, + "loss": 0.0905, + "step": 22746 + }, + { + "epoch": 2.6973793430570376, + "grad_norm": 0.6545687583819564, + "learning_rate": 1.2663189284327997e-05, + "loss": 0.0893, + "step": 22747 + }, + { + "epoch": 2.697497924819163, + "grad_norm": 0.6522568688850834, + "learning_rate": 1.2661101599639419e-05, + "loss": 0.0892, + "step": 22748 + }, + { + "epoch": 2.697616506581288, + "grad_norm": 0.78077782629118, + "learning_rate": 1.2659014028700367e-05, + "loss": 0.0941, + "step": 22749 + }, + { + "epoch": 2.697735088343413, + "grad_norm": 0.4313899223225727, + "learning_rate": 1.2656926571530048e-05, + "loss": 0.0565, + "step": 22750 + }, + { + "epoch": 2.6978536701055376, + "grad_norm": 0.5804578202992583, + "learning_rate": 1.2654839228147753e-05, + "loss": 0.0844, + "step": 22751 + }, + { + "epoch": 2.6979722518676628, + "grad_norm": 0.7257741640901009, + "learning_rate": 1.265275199857269e-05, + "loss": 0.0878, + "step": 22752 + }, + { + "epoch": 2.698090833629788, + "grad_norm": 0.780080404762754, + "learning_rate": 1.2650664882824115e-05, + "loss": 0.1059, + "step": 22753 + }, + { + "epoch": 2.6982094153919127, + "grad_norm": 0.6713721973552561, + "learning_rate": 1.2648577880921262e-05, + "loss": 0.0925, + "step": 22754 + }, + { + "epoch": 2.6983279971540375, + "grad_norm": 0.8257228921985101, + "learning_rate": 1.2646490992883375e-05, + "loss": 0.1034, + "step": 22755 + }, + { + "epoch": 2.6984465789161627, + "grad_norm": 0.7776384631622199, + "learning_rate": 1.2644404218729706e-05, + "loss": 0.0988, + "step": 22756 + }, + { + "epoch": 2.698565160678288, + "grad_norm": 0.47418332836791444, + "learning_rate": 1.2642317558479467e-05, + "loss": 0.0675, + "step": 22757 + }, + { + "epoch": 2.6986837424404126, + "grad_norm": 0.6842871709767068, + "learning_rate": 1.2640231012151903e-05, + "loss": 0.0882, + "step": 22758 + }, + { + "epoch": 2.6988023242025374, + "grad_norm": 0.629514492385526, + "learning_rate": 1.2638144579766253e-05, + "loss": 0.084, + "step": 22759 + }, + { + "epoch": 2.6989209059646626, + "grad_norm": 0.8248015747684961, + "learning_rate": 1.2636058261341766e-05, + "loss": 0.1371, + "step": 22760 + }, + { + "epoch": 2.699039487726788, + "grad_norm": 0.7165385736909052, + "learning_rate": 1.2633972056897647e-05, + "loss": 0.1006, + "step": 22761 + }, + { + "epoch": 2.6991580694889126, + "grad_norm": 0.40186081802343737, + "learning_rate": 1.2631885966453138e-05, + "loss": 0.0613, + "step": 22762 + }, + { + "epoch": 2.6992766512510373, + "grad_norm": 0.8079884618888045, + "learning_rate": 1.262979999002748e-05, + "loss": 0.1193, + "step": 22763 + }, + { + "epoch": 2.6993952330131625, + "grad_norm": 0.7911981050110795, + "learning_rate": 1.2627714127639894e-05, + "loss": 0.1062, + "step": 22764 + }, + { + "epoch": 2.6995138147752877, + "grad_norm": 0.6034663853816326, + "learning_rate": 1.2625628379309612e-05, + "loss": 0.0793, + "step": 22765 + }, + { + "epoch": 2.6996323965374125, + "grad_norm": 0.7721710474435802, + "learning_rate": 1.262354274505586e-05, + "loss": 0.0927, + "step": 22766 + }, + { + "epoch": 2.6997509782995377, + "grad_norm": 0.539145852289442, + "learning_rate": 1.2621457224897878e-05, + "loss": 0.0802, + "step": 22767 + }, + { + "epoch": 2.6998695600616625, + "grad_norm": 0.8787092567582525, + "learning_rate": 1.2619371818854872e-05, + "loss": 0.0867, + "step": 22768 + }, + { + "epoch": 2.6999881418237877, + "grad_norm": 0.9219153252362043, + "learning_rate": 1.2617286526946076e-05, + "loss": 0.1202, + "step": 22769 + }, + { + "epoch": 2.7001067235859124, + "grad_norm": 0.5162954680785747, + "learning_rate": 1.2615201349190712e-05, + "loss": 0.06, + "step": 22770 + }, + { + "epoch": 2.7002253053480376, + "grad_norm": 0.5366795977500038, + "learning_rate": 1.2613116285608012e-05, + "loss": 0.0769, + "step": 22771 + }, + { + "epoch": 2.7003438871101624, + "grad_norm": 0.604692381514184, + "learning_rate": 1.2611031336217182e-05, + "loss": 0.0851, + "step": 22772 + }, + { + "epoch": 2.7004624688722876, + "grad_norm": 0.8246092472578473, + "learning_rate": 1.2608946501037455e-05, + "loss": 0.1235, + "step": 22773 + }, + { + "epoch": 2.7005810506344123, + "grad_norm": 0.5313543586962263, + "learning_rate": 1.2606861780088042e-05, + "loss": 0.0608, + "step": 22774 + }, + { + "epoch": 2.7006996323965375, + "grad_norm": 0.8630990604154145, + "learning_rate": 1.2604777173388167e-05, + "loss": 0.098, + "step": 22775 + }, + { + "epoch": 2.7008182141586623, + "grad_norm": 0.5979208730013187, + "learning_rate": 1.2602692680957056e-05, + "loss": 0.0679, + "step": 22776 + }, + { + "epoch": 2.7009367959207875, + "grad_norm": 0.8501698939811153, + "learning_rate": 1.2600608302813895e-05, + "loss": 0.1038, + "step": 22777 + }, + { + "epoch": 2.7010553776829123, + "grad_norm": 0.6146874683107714, + "learning_rate": 1.2598524038977944e-05, + "loss": 0.0941, + "step": 22778 + }, + { + "epoch": 2.7011739594450375, + "grad_norm": 0.534531662759171, + "learning_rate": 1.2596439889468382e-05, + "loss": 0.0792, + "step": 22779 + }, + { + "epoch": 2.7012925412071622, + "grad_norm": 0.6439750482548712, + "learning_rate": 1.259435585430443e-05, + "loss": 0.0912, + "step": 22780 + }, + { + "epoch": 2.7014111229692874, + "grad_norm": 0.6977257382973537, + "learning_rate": 1.2592271933505306e-05, + "loss": 0.1085, + "step": 22781 + }, + { + "epoch": 2.701529704731412, + "grad_norm": 0.6413768927050115, + "learning_rate": 1.2590188127090222e-05, + "loss": 0.068, + "step": 22782 + }, + { + "epoch": 2.7016482864935374, + "grad_norm": 0.7698311302591774, + "learning_rate": 1.2588104435078395e-05, + "loss": 0.1081, + "step": 22783 + }, + { + "epoch": 2.701766868255662, + "grad_norm": 1.0268962135063155, + "learning_rate": 1.2586020857489004e-05, + "loss": 0.1082, + "step": 22784 + }, + { + "epoch": 2.7018854500177873, + "grad_norm": 0.6953674355330706, + "learning_rate": 1.2583937394341297e-05, + "loss": 0.0683, + "step": 22785 + }, + { + "epoch": 2.702004031779912, + "grad_norm": 0.6257758384148192, + "learning_rate": 1.258185404565445e-05, + "loss": 0.1004, + "step": 22786 + }, + { + "epoch": 2.7021226135420373, + "grad_norm": 0.5432076205125019, + "learning_rate": 1.2579770811447695e-05, + "loss": 0.0666, + "step": 22787 + }, + { + "epoch": 2.702241195304162, + "grad_norm": 0.9111465561298466, + "learning_rate": 1.2577687691740201e-05, + "loss": 0.0829, + "step": 22788 + }, + { + "epoch": 2.7023597770662873, + "grad_norm": 0.5810821822972202, + "learning_rate": 1.2575604686551212e-05, + "loss": 0.0825, + "step": 22789 + }, + { + "epoch": 2.702478358828412, + "grad_norm": 0.7561677902546714, + "learning_rate": 1.2573521795899901e-05, + "loss": 0.1037, + "step": 22790 + }, + { + "epoch": 2.7025969405905372, + "grad_norm": 0.9266115007644398, + "learning_rate": 1.2571439019805482e-05, + "loss": 0.1069, + "step": 22791 + }, + { + "epoch": 2.702715522352662, + "grad_norm": 0.9985679763559371, + "learning_rate": 1.2569356358287154e-05, + "loss": 0.1311, + "step": 22792 + }, + { + "epoch": 2.702834104114787, + "grad_norm": 0.6372499202762824, + "learning_rate": 1.2567273811364116e-05, + "loss": 0.1019, + "step": 22793 + }, + { + "epoch": 2.7029526858769124, + "grad_norm": 0.711136879616522, + "learning_rate": 1.256519137905558e-05, + "loss": 0.0996, + "step": 22794 + }, + { + "epoch": 2.703071267639037, + "grad_norm": 0.9298979532337991, + "learning_rate": 1.2563109061380713e-05, + "loss": 0.0797, + "step": 22795 + }, + { + "epoch": 2.703189849401162, + "grad_norm": 0.6653092481378606, + "learning_rate": 1.2561026858358743e-05, + "loss": 0.0825, + "step": 22796 + }, + { + "epoch": 2.703308431163287, + "grad_norm": 0.7647847020085917, + "learning_rate": 1.2558944770008846e-05, + "loss": 0.1134, + "step": 22797 + }, + { + "epoch": 2.7034270129254123, + "grad_norm": 0.8613889167254154, + "learning_rate": 1.255686279635023e-05, + "loss": 0.1146, + "step": 22798 + }, + { + "epoch": 2.703545594687537, + "grad_norm": 0.5455728001927529, + "learning_rate": 1.2554780937402061e-05, + "loss": 0.064, + "step": 22799 + }, + { + "epoch": 2.703664176449662, + "grad_norm": 0.5428192454065235, + "learning_rate": 1.255269919318357e-05, + "loss": 0.0917, + "step": 22800 + }, + { + "epoch": 2.703782758211787, + "grad_norm": 0.4657333647306136, + "learning_rate": 1.2550617563713918e-05, + "loss": 0.0814, + "step": 22801 + }, + { + "epoch": 2.7039013399739122, + "grad_norm": 0.5949642419644904, + "learning_rate": 1.2548536049012308e-05, + "loss": 0.0974, + "step": 22802 + }, + { + "epoch": 2.704019921736037, + "grad_norm": 0.8039710204079231, + "learning_rate": 1.2546454649097924e-05, + "loss": 0.111, + "step": 22803 + }, + { + "epoch": 2.7041385034981618, + "grad_norm": 0.8566026578523163, + "learning_rate": 1.2544373363989958e-05, + "loss": 0.0755, + "step": 22804 + }, + { + "epoch": 2.704257085260287, + "grad_norm": 0.43534226114137814, + "learning_rate": 1.2542292193707606e-05, + "loss": 0.0685, + "step": 22805 + }, + { + "epoch": 2.704375667022412, + "grad_norm": 0.7520580456728888, + "learning_rate": 1.254021113827003e-05, + "loss": 0.0916, + "step": 22806 + }, + { + "epoch": 2.704494248784537, + "grad_norm": 0.6236481238895565, + "learning_rate": 1.2538130197696433e-05, + "loss": 0.0762, + "step": 22807 + }, + { + "epoch": 2.7046128305466617, + "grad_norm": 0.5363956374060455, + "learning_rate": 1.2536049372005992e-05, + "loss": 0.0772, + "step": 22808 + }, + { + "epoch": 2.704731412308787, + "grad_norm": 0.7105527627905014, + "learning_rate": 1.2533968661217898e-05, + "loss": 0.0895, + "step": 22809 + }, + { + "epoch": 2.704849994070912, + "grad_norm": 0.6473717854007198, + "learning_rate": 1.253188806535131e-05, + "loss": 0.102, + "step": 22810 + }, + { + "epoch": 2.704968575833037, + "grad_norm": 0.8088086016206997, + "learning_rate": 1.2529807584425446e-05, + "loss": 0.1016, + "step": 22811 + }, + { + "epoch": 2.7050871575951616, + "grad_norm": 0.5877096205178001, + "learning_rate": 1.252772721845945e-05, + "loss": 0.0726, + "step": 22812 + }, + { + "epoch": 2.705205739357287, + "grad_norm": 0.5343936207138081, + "learning_rate": 1.2525646967472515e-05, + "loss": 0.0795, + "step": 22813 + }, + { + "epoch": 2.705324321119412, + "grad_norm": 0.5345260523556793, + "learning_rate": 1.2523566831483821e-05, + "loss": 0.0794, + "step": 22814 + }, + { + "epoch": 2.7054429028815368, + "grad_norm": 0.5281301911982569, + "learning_rate": 1.252148681051254e-05, + "loss": 0.0575, + "step": 22815 + }, + { + "epoch": 2.7055614846436615, + "grad_norm": 0.38248873104317355, + "learning_rate": 1.2519406904577857e-05, + "loss": 0.0528, + "step": 22816 + }, + { + "epoch": 2.7056800664057867, + "grad_norm": 0.7372460918791258, + "learning_rate": 1.2517327113698928e-05, + "loss": 0.0937, + "step": 22817 + }, + { + "epoch": 2.705798648167912, + "grad_norm": 0.4920837893911828, + "learning_rate": 1.2515247437894936e-05, + "loss": 0.056, + "step": 22818 + }, + { + "epoch": 2.7059172299300367, + "grad_norm": 0.5566417864746563, + "learning_rate": 1.2513167877185052e-05, + "loss": 0.0909, + "step": 22819 + }, + { + "epoch": 2.706035811692162, + "grad_norm": 0.6779354736080779, + "learning_rate": 1.251108843158846e-05, + "loss": 0.0982, + "step": 22820 + }, + { + "epoch": 2.7061543934542867, + "grad_norm": 0.6219419636692997, + "learning_rate": 1.2509009101124297e-05, + "loss": 0.0784, + "step": 22821 + }, + { + "epoch": 2.706272975216412, + "grad_norm": 0.6898093255830678, + "learning_rate": 1.2506929885811772e-05, + "loss": 0.0781, + "step": 22822 + }, + { + "epoch": 2.7063915569785366, + "grad_norm": 0.6287696468078497, + "learning_rate": 1.2504850785670024e-05, + "loss": 0.0723, + "step": 22823 + }, + { + "epoch": 2.706510138740662, + "grad_norm": 0.6932862393740953, + "learning_rate": 1.2502771800718226e-05, + "loss": 0.0951, + "step": 22824 + }, + { + "epoch": 2.7066287205027866, + "grad_norm": 0.8136746180222879, + "learning_rate": 1.2500692930975552e-05, + "loss": 0.0991, + "step": 22825 + }, + { + "epoch": 2.706747302264912, + "grad_norm": 0.8253373568847392, + "learning_rate": 1.249861417646116e-05, + "loss": 0.1037, + "step": 22826 + }, + { + "epoch": 2.7068658840270365, + "grad_norm": 0.5860322379114561, + "learning_rate": 1.2496535537194223e-05, + "loss": 0.0845, + "step": 22827 + }, + { + "epoch": 2.7069844657891617, + "grad_norm": 0.524337309319647, + "learning_rate": 1.249445701319389e-05, + "loss": 0.0762, + "step": 22828 + }, + { + "epoch": 2.7071030475512865, + "grad_norm": 0.5089274671411005, + "learning_rate": 1.2492378604479324e-05, + "loss": 0.0599, + "step": 22829 + }, + { + "epoch": 2.7072216293134117, + "grad_norm": 0.745548622668968, + "learning_rate": 1.2490300311069688e-05, + "loss": 0.0924, + "step": 22830 + }, + { + "epoch": 2.7073402110755365, + "grad_norm": 0.6682834502891294, + "learning_rate": 1.2488222132984156e-05, + "loss": 0.087, + "step": 22831 + }, + { + "epoch": 2.7074587928376617, + "grad_norm": 1.3366672230134253, + "learning_rate": 1.2486144070241862e-05, + "loss": 0.134, + "step": 22832 + }, + { + "epoch": 2.7075773745997864, + "grad_norm": 0.7167203849734176, + "learning_rate": 1.2484066122861973e-05, + "loss": 0.1013, + "step": 22833 + }, + { + "epoch": 2.7076959563619116, + "grad_norm": 0.5799490051558938, + "learning_rate": 1.248198829086365e-05, + "loss": 0.0757, + "step": 22834 + }, + { + "epoch": 2.7078145381240364, + "grad_norm": 0.8193958791129795, + "learning_rate": 1.247991057426604e-05, + "loss": 0.1011, + "step": 22835 + }, + { + "epoch": 2.7079331198861616, + "grad_norm": 0.6417244978648117, + "learning_rate": 1.2477832973088304e-05, + "loss": 0.0816, + "step": 22836 + }, + { + "epoch": 2.7080517016482863, + "grad_norm": 0.7637656844310203, + "learning_rate": 1.2475755487349592e-05, + "loss": 0.0961, + "step": 22837 + }, + { + "epoch": 2.7081702834104115, + "grad_norm": 0.6340956905357933, + "learning_rate": 1.2473678117069066e-05, + "loss": 0.0808, + "step": 22838 + }, + { + "epoch": 2.7082888651725363, + "grad_norm": 0.528598087849251, + "learning_rate": 1.2471600862265857e-05, + "loss": 0.0722, + "step": 22839 + }, + { + "epoch": 2.7084074469346615, + "grad_norm": 0.6226957755527607, + "learning_rate": 1.2469523722959123e-05, + "loss": 0.101, + "step": 22840 + }, + { + "epoch": 2.7085260286967863, + "grad_norm": 0.6195094990534684, + "learning_rate": 1.2467446699168014e-05, + "loss": 0.082, + "step": 22841 + }, + { + "epoch": 2.7086446104589115, + "grad_norm": 0.5194022368243967, + "learning_rate": 1.2465369790911683e-05, + "loss": 0.0708, + "step": 22842 + }, + { + "epoch": 2.7087631922210367, + "grad_norm": 1.0808221294973022, + "learning_rate": 1.2463292998209275e-05, + "loss": 0.1339, + "step": 22843 + }, + { + "epoch": 2.7088817739831614, + "grad_norm": 0.6483559974085983, + "learning_rate": 1.2461216321079916e-05, + "loss": 0.0819, + "step": 22844 + }, + { + "epoch": 2.709000355745286, + "grad_norm": 0.6803266141989134, + "learning_rate": 1.2459139759542788e-05, + "loss": 0.0856, + "step": 22845 + }, + { + "epoch": 2.7091189375074114, + "grad_norm": 0.5553743934385044, + "learning_rate": 1.2457063313616998e-05, + "loss": 0.0735, + "step": 22846 + }, + { + "epoch": 2.7092375192695366, + "grad_norm": 0.3830516995931219, + "learning_rate": 1.2454986983321718e-05, + "loss": 0.0511, + "step": 22847 + }, + { + "epoch": 2.7093561010316614, + "grad_norm": 0.5828969364784844, + "learning_rate": 1.2452910768676052e-05, + "loss": 0.0664, + "step": 22848 + }, + { + "epoch": 2.709474682793786, + "grad_norm": 0.7675105431035145, + "learning_rate": 1.2450834669699182e-05, + "loss": 0.0972, + "step": 22849 + }, + { + "epoch": 2.7095932645559113, + "grad_norm": 0.7275462209317104, + "learning_rate": 1.2448758686410217e-05, + "loss": 0.0994, + "step": 22850 + }, + { + "epoch": 2.7097118463180365, + "grad_norm": 0.7439111423053639, + "learning_rate": 1.2446682818828307e-05, + "loss": 0.0846, + "step": 22851 + }, + { + "epoch": 2.7098304280801613, + "grad_norm": 0.45570641600368145, + "learning_rate": 1.2444607066972586e-05, + "loss": 0.0549, + "step": 22852 + }, + { + "epoch": 2.709949009842286, + "grad_norm": 0.6852969500117976, + "learning_rate": 1.2442531430862192e-05, + "loss": 0.1005, + "step": 22853 + }, + { + "epoch": 2.7100675916044112, + "grad_norm": 0.6139432802002925, + "learning_rate": 1.2440455910516272e-05, + "loss": 0.0947, + "step": 22854 + }, + { + "epoch": 2.7101861733665364, + "grad_norm": 0.5154886521750516, + "learning_rate": 1.2438380505953923e-05, + "loss": 0.0765, + "step": 22855 + }, + { + "epoch": 2.710304755128661, + "grad_norm": 1.0804847376774664, + "learning_rate": 1.2436305217194325e-05, + "loss": 0.1165, + "step": 22856 + }, + { + "epoch": 2.710423336890786, + "grad_norm": 0.4790650613935944, + "learning_rate": 1.2434230044256573e-05, + "loss": 0.066, + "step": 22857 + }, + { + "epoch": 2.710541918652911, + "grad_norm": 0.6431504673910016, + "learning_rate": 1.2432154987159822e-05, + "loss": 0.1057, + "step": 22858 + }, + { + "epoch": 2.7106605004150364, + "grad_norm": 0.7911571245499318, + "learning_rate": 1.243008004592317e-05, + "loss": 0.0869, + "step": 22859 + }, + { + "epoch": 2.710779082177161, + "grad_norm": 0.4676069502455532, + "learning_rate": 1.2428005220565786e-05, + "loss": 0.0552, + "step": 22860 + }, + { + "epoch": 2.710897663939286, + "grad_norm": 0.5779695153533326, + "learning_rate": 1.2425930511106768e-05, + "loss": 0.0803, + "step": 22861 + }, + { + "epoch": 2.711016245701411, + "grad_norm": 0.7941448274811136, + "learning_rate": 1.2423855917565252e-05, + "loss": 0.099, + "step": 22862 + }, + { + "epoch": 2.7111348274635363, + "grad_norm": 0.5819781146105081, + "learning_rate": 1.2421781439960361e-05, + "loss": 0.0739, + "step": 22863 + }, + { + "epoch": 2.711253409225661, + "grad_norm": 1.1310453644458134, + "learning_rate": 1.2419707078311221e-05, + "loss": 0.1213, + "step": 22864 + }, + { + "epoch": 2.711371990987786, + "grad_norm": 0.6569073664634694, + "learning_rate": 1.2417632832636965e-05, + "loss": 0.0942, + "step": 22865 + }, + { + "epoch": 2.711490572749911, + "grad_norm": 0.6389891217903565, + "learning_rate": 1.2415558702956684e-05, + "loss": 0.065, + "step": 22866 + }, + { + "epoch": 2.711609154512036, + "grad_norm": 0.7497562909086473, + "learning_rate": 1.241348468928954e-05, + "loss": 0.0938, + "step": 22867 + }, + { + "epoch": 2.711727736274161, + "grad_norm": 1.027733759207597, + "learning_rate": 1.2411410791654623e-05, + "loss": 0.1499, + "step": 22868 + }, + { + "epoch": 2.711846318036286, + "grad_norm": 0.9601614101352067, + "learning_rate": 1.240933701007107e-05, + "loss": 0.1114, + "step": 22869 + }, + { + "epoch": 2.711964899798411, + "grad_norm": 0.5291827578599926, + "learning_rate": 1.2407263344557973e-05, + "loss": 0.0697, + "step": 22870 + }, + { + "epoch": 2.712083481560536, + "grad_norm": 0.6359025234610513, + "learning_rate": 1.2405189795134484e-05, + "loss": 0.057, + "step": 22871 + }, + { + "epoch": 2.712202063322661, + "grad_norm": 0.6765666765117524, + "learning_rate": 1.2403116361819692e-05, + "loss": 0.0996, + "step": 22872 + }, + { + "epoch": 2.712320645084786, + "grad_norm": 0.3995658060435283, + "learning_rate": 1.2401043044632719e-05, + "loss": 0.0551, + "step": 22873 + }, + { + "epoch": 2.712439226846911, + "grad_norm": 0.606911174856811, + "learning_rate": 1.2398969843592675e-05, + "loss": 0.0857, + "step": 22874 + }, + { + "epoch": 2.712557808609036, + "grad_norm": 0.757569506779207, + "learning_rate": 1.2396896758718682e-05, + "loss": 0.0933, + "step": 22875 + }, + { + "epoch": 2.712676390371161, + "grad_norm": 0.795232083441143, + "learning_rate": 1.2394823790029852e-05, + "loss": 0.1212, + "step": 22876 + }, + { + "epoch": 2.712794972133286, + "grad_norm": 0.5590888744987811, + "learning_rate": 1.2392750937545283e-05, + "loss": 0.0783, + "step": 22877 + }, + { + "epoch": 2.712913553895411, + "grad_norm": 0.5571002738970988, + "learning_rate": 1.2390678201284086e-05, + "loss": 0.0902, + "step": 22878 + }, + { + "epoch": 2.713032135657536, + "grad_norm": 0.42121118529149043, + "learning_rate": 1.2388605581265378e-05, + "loss": 0.062, + "step": 22879 + }, + { + "epoch": 2.7131507174196607, + "grad_norm": 1.2514692085427344, + "learning_rate": 1.238653307750827e-05, + "loss": 0.1403, + "step": 22880 + }, + { + "epoch": 2.713269299181786, + "grad_norm": 0.44872690709384033, + "learning_rate": 1.2384460690031838e-05, + "loss": 0.0694, + "step": 22881 + }, + { + "epoch": 2.7133878809439107, + "grad_norm": 0.7391091424429785, + "learning_rate": 1.2382388418855228e-05, + "loss": 0.11, + "step": 22882 + }, + { + "epoch": 2.713506462706036, + "grad_norm": 0.5008868516462135, + "learning_rate": 1.2380316263997516e-05, + "loss": 0.0536, + "step": 22883 + }, + { + "epoch": 2.7136250444681607, + "grad_norm": 0.7121144848834023, + "learning_rate": 1.237824422547781e-05, + "loss": 0.086, + "step": 22884 + }, + { + "epoch": 2.713743626230286, + "grad_norm": 0.7763758695451651, + "learning_rate": 1.2376172303315218e-05, + "loss": 0.1057, + "step": 22885 + }, + { + "epoch": 2.7138622079924106, + "grad_norm": 0.7939523250731322, + "learning_rate": 1.2374100497528834e-05, + "loss": 0.1044, + "step": 22886 + }, + { + "epoch": 2.713980789754536, + "grad_norm": 0.5841660899277804, + "learning_rate": 1.2372028808137773e-05, + "loss": 0.0744, + "step": 22887 + }, + { + "epoch": 2.7140993715166606, + "grad_norm": 0.7781785347689407, + "learning_rate": 1.236995723516111e-05, + "loss": 0.1092, + "step": 22888 + }, + { + "epoch": 2.714217953278786, + "grad_norm": 0.7427718372259811, + "learning_rate": 1.2367885778617952e-05, + "loss": 0.1028, + "step": 22889 + }, + { + "epoch": 2.7143365350409105, + "grad_norm": 0.5181631342207078, + "learning_rate": 1.2365814438527398e-05, + "loss": 0.0749, + "step": 22890 + }, + { + "epoch": 2.7144551168030358, + "grad_norm": 0.642453691545869, + "learning_rate": 1.2363743214908541e-05, + "loss": 0.0919, + "step": 22891 + }, + { + "epoch": 2.714573698565161, + "grad_norm": 0.7863588227433153, + "learning_rate": 1.2361672107780486e-05, + "loss": 0.1007, + "step": 22892 + }, + { + "epoch": 2.7146922803272857, + "grad_norm": 0.6028335792968061, + "learning_rate": 1.2359601117162297e-05, + "loss": 0.0956, + "step": 22893 + }, + { + "epoch": 2.7148108620894105, + "grad_norm": 0.5925335163666156, + "learning_rate": 1.2357530243073104e-05, + "loss": 0.0836, + "step": 22894 + }, + { + "epoch": 2.7149294438515357, + "grad_norm": 0.7302883222805482, + "learning_rate": 1.235545948553197e-05, + "loss": 0.105, + "step": 22895 + }, + { + "epoch": 2.715048025613661, + "grad_norm": 0.7888032928748052, + "learning_rate": 1.2353388844557991e-05, + "loss": 0.1105, + "step": 22896 + }, + { + "epoch": 2.7151666073757856, + "grad_norm": 0.7068486638993073, + "learning_rate": 1.2351318320170264e-05, + "loss": 0.0915, + "step": 22897 + }, + { + "epoch": 2.7152851891379104, + "grad_norm": 0.6591111343257902, + "learning_rate": 1.2349247912387875e-05, + "loss": 0.0952, + "step": 22898 + }, + { + "epoch": 2.7154037709000356, + "grad_norm": 0.5914160611862005, + "learning_rate": 1.23471776212299e-05, + "loss": 0.0751, + "step": 22899 + }, + { + "epoch": 2.715522352662161, + "grad_norm": 0.8146634746121066, + "learning_rate": 1.2345107446715431e-05, + "loss": 0.1119, + "step": 22900 + }, + { + "epoch": 2.7156409344242856, + "grad_norm": 0.6507851599835665, + "learning_rate": 1.2343037388863554e-05, + "loss": 0.0872, + "step": 22901 + }, + { + "epoch": 2.7157595161864103, + "grad_norm": 0.8532501043364852, + "learning_rate": 1.234096744769335e-05, + "loss": 0.1272, + "step": 22902 + }, + { + "epoch": 2.7158780979485355, + "grad_norm": 0.7670833482097494, + "learning_rate": 1.2338897623223913e-05, + "loss": 0.0913, + "step": 22903 + }, + { + "epoch": 2.7159966797106607, + "grad_norm": 0.6626462206022, + "learning_rate": 1.2336827915474294e-05, + "loss": 0.0951, + "step": 22904 + }, + { + "epoch": 2.7161152614727855, + "grad_norm": 0.5553441865098555, + "learning_rate": 1.2334758324463613e-05, + "loss": 0.0683, + "step": 22905 + }, + { + "epoch": 2.7162338432349102, + "grad_norm": 0.8137276025278782, + "learning_rate": 1.2332688850210919e-05, + "loss": 0.1137, + "step": 22906 + }, + { + "epoch": 2.7163524249970354, + "grad_norm": 0.8935785305606706, + "learning_rate": 1.2330619492735298e-05, + "loss": 0.117, + "step": 22907 + }, + { + "epoch": 2.7164710067591606, + "grad_norm": 0.5685596922071227, + "learning_rate": 1.2328550252055832e-05, + "loss": 0.0699, + "step": 22908 + }, + { + "epoch": 2.7165895885212854, + "grad_norm": 0.6272957739245556, + "learning_rate": 1.2326481128191602e-05, + "loss": 0.0868, + "step": 22909 + }, + { + "epoch": 2.71670817028341, + "grad_norm": 0.8150409959715319, + "learning_rate": 1.2324412121161666e-05, + "loss": 0.1297, + "step": 22910 + }, + { + "epoch": 2.7168267520455354, + "grad_norm": 0.5964121092698188, + "learning_rate": 1.2322343230985105e-05, + "loss": 0.0893, + "step": 22911 + }, + { + "epoch": 2.7169453338076606, + "grad_norm": 0.6549637088891821, + "learning_rate": 1.2320274457680997e-05, + "loss": 0.0797, + "step": 22912 + }, + { + "epoch": 2.7170639155697853, + "grad_norm": 0.7739316640841268, + "learning_rate": 1.2318205801268406e-05, + "loss": 0.1061, + "step": 22913 + }, + { + "epoch": 2.71718249733191, + "grad_norm": 0.6937517887758134, + "learning_rate": 1.2316137261766417e-05, + "loss": 0.0709, + "step": 22914 + }, + { + "epoch": 2.7173010790940353, + "grad_norm": 0.6251265210023421, + "learning_rate": 1.2314068839194068e-05, + "loss": 0.0888, + "step": 22915 + }, + { + "epoch": 2.7174196608561605, + "grad_norm": 0.5231777501634355, + "learning_rate": 1.231200053357047e-05, + "loss": 0.0507, + "step": 22916 + }, + { + "epoch": 2.7175382426182852, + "grad_norm": 0.5576241221954318, + "learning_rate": 1.2309932344914653e-05, + "loss": 0.0782, + "step": 22917 + }, + { + "epoch": 2.7176568243804105, + "grad_norm": 0.5789261546374365, + "learning_rate": 1.2307864273245711e-05, + "loss": 0.079, + "step": 22918 + }, + { + "epoch": 2.717775406142535, + "grad_norm": 0.7551630447256571, + "learning_rate": 1.230579631858268e-05, + "loss": 0.0999, + "step": 22919 + }, + { + "epoch": 2.7178939879046604, + "grad_norm": 0.5544152847895917, + "learning_rate": 1.2303728480944654e-05, + "loss": 0.0813, + "step": 22920 + }, + { + "epoch": 2.718012569666785, + "grad_norm": 0.6496513763096124, + "learning_rate": 1.2301660760350675e-05, + "loss": 0.0784, + "step": 22921 + }, + { + "epoch": 2.7181311514289104, + "grad_norm": 0.9357046161191285, + "learning_rate": 1.2299593156819813e-05, + "loss": 0.139, + "step": 22922 + }, + { + "epoch": 2.718249733191035, + "grad_norm": 0.5730038134998272, + "learning_rate": 1.2297525670371126e-05, + "loss": 0.073, + "step": 22923 + }, + { + "epoch": 2.7183683149531603, + "grad_norm": 0.9311479146732368, + "learning_rate": 1.2295458301023678e-05, + "loss": 0.1219, + "step": 22924 + }, + { + "epoch": 2.718486896715285, + "grad_norm": 0.5698268273328686, + "learning_rate": 1.2293391048796532e-05, + "loss": 0.0799, + "step": 22925 + }, + { + "epoch": 2.7186054784774103, + "grad_norm": 0.4898443704107063, + "learning_rate": 1.2291323913708721e-05, + "loss": 0.0605, + "step": 22926 + }, + { + "epoch": 2.718724060239535, + "grad_norm": 0.6742819900756006, + "learning_rate": 1.2289256895779338e-05, + "loss": 0.0769, + "step": 22927 + }, + { + "epoch": 2.7188426420016603, + "grad_norm": 0.589151909344817, + "learning_rate": 1.228718999502741e-05, + "loss": 0.0905, + "step": 22928 + }, + { + "epoch": 2.718961223763785, + "grad_norm": 1.051182786152531, + "learning_rate": 1.228512321147201e-05, + "loss": 0.1307, + "step": 22929 + }, + { + "epoch": 2.71907980552591, + "grad_norm": 0.6474150770960652, + "learning_rate": 1.2283056545132162e-05, + "loss": 0.09, + "step": 22930 + }, + { + "epoch": 2.719198387288035, + "grad_norm": 0.7042881382502459, + "learning_rate": 1.2280989996026959e-05, + "loss": 0.102, + "step": 22931 + }, + { + "epoch": 2.71931696905016, + "grad_norm": 0.6365743310249772, + "learning_rate": 1.227892356417542e-05, + "loss": 0.0761, + "step": 22932 + }, + { + "epoch": 2.719435550812285, + "grad_norm": 0.6095895588828529, + "learning_rate": 1.227685724959661e-05, + "loss": 0.0727, + "step": 22933 + }, + { + "epoch": 2.71955413257441, + "grad_norm": 0.6404726307090183, + "learning_rate": 1.227479105230957e-05, + "loss": 0.0903, + "step": 22934 + }, + { + "epoch": 2.719672714336535, + "grad_norm": 0.8766688161337733, + "learning_rate": 1.2272724972333351e-05, + "loss": 0.1041, + "step": 22935 + }, + { + "epoch": 2.71979129609866, + "grad_norm": 0.7115574059841383, + "learning_rate": 1.2270659009687013e-05, + "loss": 0.0786, + "step": 22936 + }, + { + "epoch": 2.719909877860785, + "grad_norm": 0.7211259117694715, + "learning_rate": 1.226859316438957e-05, + "loss": 0.0658, + "step": 22937 + }, + { + "epoch": 2.72002845962291, + "grad_norm": 0.555475925214798, + "learning_rate": 1.2266527436460103e-05, + "loss": 0.0914, + "step": 22938 + }, + { + "epoch": 2.720147041385035, + "grad_norm": 0.8727343575824346, + "learning_rate": 1.2264461825917631e-05, + "loss": 0.1227, + "step": 22939 + }, + { + "epoch": 2.72026562314716, + "grad_norm": 0.8608031661201022, + "learning_rate": 1.22623963327812e-05, + "loss": 0.1124, + "step": 22940 + }, + { + "epoch": 2.720384204909285, + "grad_norm": 0.5814307430265034, + "learning_rate": 1.2260330957069858e-05, + "loss": 0.0757, + "step": 22941 + }, + { + "epoch": 2.72050278667141, + "grad_norm": 0.8749405716247962, + "learning_rate": 1.2258265698802642e-05, + "loss": 0.1058, + "step": 22942 + }, + { + "epoch": 2.7206213684335347, + "grad_norm": 0.7214975260385246, + "learning_rate": 1.2256200557998602e-05, + "loss": 0.0897, + "step": 22943 + }, + { + "epoch": 2.72073995019566, + "grad_norm": 0.7801033000360772, + "learning_rate": 1.2254135534676754e-05, + "loss": 0.0994, + "step": 22944 + }, + { + "epoch": 2.720858531957785, + "grad_norm": 0.8147078009297245, + "learning_rate": 1.2252070628856144e-05, + "loss": 0.085, + "step": 22945 + }, + { + "epoch": 2.72097711371991, + "grad_norm": 0.5090636442528014, + "learning_rate": 1.2250005840555812e-05, + "loss": 0.0746, + "step": 22946 + }, + { + "epoch": 2.7210956954820347, + "grad_norm": 0.5462060154449243, + "learning_rate": 1.2247941169794802e-05, + "loss": 0.068, + "step": 22947 + }, + { + "epoch": 2.72121427724416, + "grad_norm": 0.9559319204735842, + "learning_rate": 1.2245876616592125e-05, + "loss": 0.1005, + "step": 22948 + }, + { + "epoch": 2.721332859006285, + "grad_norm": 0.5983504109964133, + "learning_rate": 1.224381218096683e-05, + "loss": 0.0742, + "step": 22949 + }, + { + "epoch": 2.72145144076841, + "grad_norm": 0.48325227330931014, + "learning_rate": 1.2241747862937938e-05, + "loss": 0.0586, + "step": 22950 + }, + { + "epoch": 2.7215700225305346, + "grad_norm": 0.473241154412509, + "learning_rate": 1.223968366252449e-05, + "loss": 0.0535, + "step": 22951 + }, + { + "epoch": 2.72168860429266, + "grad_norm": 0.7958948518400485, + "learning_rate": 1.2237619579745507e-05, + "loss": 0.1135, + "step": 22952 + }, + { + "epoch": 2.721807186054785, + "grad_norm": 0.6465389206606225, + "learning_rate": 1.2235555614620026e-05, + "loss": 0.0752, + "step": 22953 + }, + { + "epoch": 2.7219257678169098, + "grad_norm": 0.8227220919767011, + "learning_rate": 1.2233491767167077e-05, + "loss": 0.1148, + "step": 22954 + }, + { + "epoch": 2.7220443495790345, + "grad_norm": 0.7601743433837387, + "learning_rate": 1.2231428037405667e-05, + "loss": 0.1097, + "step": 22955 + }, + { + "epoch": 2.7221629313411597, + "grad_norm": 0.5132660428422361, + "learning_rate": 1.2229364425354833e-05, + "loss": 0.0664, + "step": 22956 + }, + { + "epoch": 2.722281513103285, + "grad_norm": 0.5713599142219753, + "learning_rate": 1.22273009310336e-05, + "loss": 0.0725, + "step": 22957 + }, + { + "epoch": 2.7224000948654097, + "grad_norm": 0.6711265567764158, + "learning_rate": 1.2225237554461e-05, + "loss": 0.1056, + "step": 22958 + }, + { + "epoch": 2.7225186766275344, + "grad_norm": 0.5295158168166484, + "learning_rate": 1.2223174295656034e-05, + "loss": 0.0854, + "step": 22959 + }, + { + "epoch": 2.7226372583896596, + "grad_norm": 0.8623088183931276, + "learning_rate": 1.2221111154637732e-05, + "loss": 0.0954, + "step": 22960 + }, + { + "epoch": 2.722755840151785, + "grad_norm": 0.6591783976960001, + "learning_rate": 1.2219048131425114e-05, + "loss": 0.084, + "step": 22961 + }, + { + "epoch": 2.7228744219139096, + "grad_norm": 0.6213267491700256, + "learning_rate": 1.2216985226037204e-05, + "loss": 0.0875, + "step": 22962 + }, + { + "epoch": 2.7229930036760344, + "grad_norm": 0.7077599907359853, + "learning_rate": 1.2214922438493018e-05, + "loss": 0.1052, + "step": 22963 + }, + { + "epoch": 2.7231115854381596, + "grad_norm": 0.6929788518300855, + "learning_rate": 1.2212859768811553e-05, + "loss": 0.0846, + "step": 22964 + }, + { + "epoch": 2.7232301672002848, + "grad_norm": 0.6059340304424424, + "learning_rate": 1.221079721701186e-05, + "loss": 0.0731, + "step": 22965 + }, + { + "epoch": 2.7233487489624095, + "grad_norm": 0.7268879586979806, + "learning_rate": 1.220873478311292e-05, + "loss": 0.0855, + "step": 22966 + }, + { + "epoch": 2.7234673307245347, + "grad_norm": 0.8298752264632223, + "learning_rate": 1.2206672467133764e-05, + "loss": 0.1296, + "step": 22967 + }, + { + "epoch": 2.7235859124866595, + "grad_norm": 0.6129033833567423, + "learning_rate": 1.2204610269093398e-05, + "loss": 0.0868, + "step": 22968 + }, + { + "epoch": 2.7237044942487847, + "grad_norm": 0.5479818366575606, + "learning_rate": 1.2202548189010845e-05, + "loss": 0.0962, + "step": 22969 + }, + { + "epoch": 2.7238230760109094, + "grad_norm": 0.6745242272804527, + "learning_rate": 1.2200486226905095e-05, + "loss": 0.0949, + "step": 22970 + }, + { + "epoch": 2.7239416577730347, + "grad_norm": 1.0468308692588777, + "learning_rate": 1.2198424382795165e-05, + "loss": 0.1243, + "step": 22971 + }, + { + "epoch": 2.7240602395351594, + "grad_norm": 0.5038063669118098, + "learning_rate": 1.2196362656700063e-05, + "loss": 0.0639, + "step": 22972 + }, + { + "epoch": 2.7241788212972846, + "grad_norm": 0.9219349432508519, + "learning_rate": 1.2194301048638797e-05, + "loss": 0.1328, + "step": 22973 + }, + { + "epoch": 2.7242974030594094, + "grad_norm": 0.47930546309200295, + "learning_rate": 1.2192239558630384e-05, + "loss": 0.0715, + "step": 22974 + }, + { + "epoch": 2.7244159848215346, + "grad_norm": 0.6044382576946699, + "learning_rate": 1.2190178186693795e-05, + "loss": 0.0853, + "step": 22975 + }, + { + "epoch": 2.7245345665836593, + "grad_norm": 0.7549361905507401, + "learning_rate": 1.2188116932848073e-05, + "loss": 0.0935, + "step": 22976 + }, + { + "epoch": 2.7246531483457845, + "grad_norm": 0.9508866153140477, + "learning_rate": 1.2186055797112195e-05, + "loss": 0.1139, + "step": 22977 + }, + { + "epoch": 2.7247717301079093, + "grad_norm": 0.6026728214768886, + "learning_rate": 1.2183994779505167e-05, + "loss": 0.0846, + "step": 22978 + }, + { + "epoch": 2.7248903118700345, + "grad_norm": 0.6876996675529145, + "learning_rate": 1.2181933880045993e-05, + "loss": 0.1101, + "step": 22979 + }, + { + "epoch": 2.7250088936321593, + "grad_norm": 0.6821289071574927, + "learning_rate": 1.2179873098753677e-05, + "loss": 0.0821, + "step": 22980 + }, + { + "epoch": 2.7251274753942845, + "grad_norm": 0.6347486877983024, + "learning_rate": 1.2177812435647202e-05, + "loss": 0.0781, + "step": 22981 + }, + { + "epoch": 2.725246057156409, + "grad_norm": 0.4456925117931907, + "learning_rate": 1.217575189074557e-05, + "loss": 0.0748, + "step": 22982 + }, + { + "epoch": 2.7253646389185344, + "grad_norm": 0.6404976014436174, + "learning_rate": 1.2173691464067785e-05, + "loss": 0.075, + "step": 22983 + }, + { + "epoch": 2.725483220680659, + "grad_norm": 0.36819921769251374, + "learning_rate": 1.2171631155632831e-05, + "loss": 0.057, + "step": 22984 + }, + { + "epoch": 2.7256018024427844, + "grad_norm": 0.478207688459665, + "learning_rate": 1.216957096545972e-05, + "loss": 0.071, + "step": 22985 + }, + { + "epoch": 2.725720384204909, + "grad_norm": 0.6696619256353288, + "learning_rate": 1.2167510893567408e-05, + "loss": 0.0992, + "step": 22986 + }, + { + "epoch": 2.7258389659670343, + "grad_norm": 0.5475511365655622, + "learning_rate": 1.2165450939974932e-05, + "loss": 0.075, + "step": 22987 + }, + { + "epoch": 2.725957547729159, + "grad_norm": 0.6321117164572155, + "learning_rate": 1.2163391104701246e-05, + "loss": 0.0748, + "step": 22988 + }, + { + "epoch": 2.7260761294912843, + "grad_norm": 0.5534564881299621, + "learning_rate": 1.2161331387765357e-05, + "loss": 0.0827, + "step": 22989 + }, + { + "epoch": 2.726194711253409, + "grad_norm": 0.747047164302015, + "learning_rate": 1.2159271789186246e-05, + "loss": 0.1026, + "step": 22990 + }, + { + "epoch": 2.7263132930155343, + "grad_norm": 1.1855773213314587, + "learning_rate": 1.2157212308982905e-05, + "loss": 0.1041, + "step": 22991 + }, + { + "epoch": 2.726431874777659, + "grad_norm": 0.8821099941717395, + "learning_rate": 1.2155152947174326e-05, + "loss": 0.1179, + "step": 22992 + }, + { + "epoch": 2.7265504565397842, + "grad_norm": 0.6404524431331026, + "learning_rate": 1.2153093703779464e-05, + "loss": 0.0892, + "step": 22993 + }, + { + "epoch": 2.7266690383019094, + "grad_norm": 0.5864135795204584, + "learning_rate": 1.2151034578817346e-05, + "loss": 0.0636, + "step": 22994 + }, + { + "epoch": 2.726787620064034, + "grad_norm": 0.6296097805809004, + "learning_rate": 1.2148975572306923e-05, + "loss": 0.0759, + "step": 22995 + }, + { + "epoch": 2.726906201826159, + "grad_norm": 0.6341058600515478, + "learning_rate": 1.2146916684267195e-05, + "loss": 0.0882, + "step": 22996 + }, + { + "epoch": 2.727024783588284, + "grad_norm": 0.7592245410810152, + "learning_rate": 1.2144857914717118e-05, + "loss": 0.1002, + "step": 22997 + }, + { + "epoch": 2.7271433653504094, + "grad_norm": 0.8660526213212911, + "learning_rate": 1.21427992636757e-05, + "loss": 0.1143, + "step": 22998 + }, + { + "epoch": 2.727261947112534, + "grad_norm": 0.5230701864853842, + "learning_rate": 1.2140740731161901e-05, + "loss": 0.0553, + "step": 22999 + }, + { + "epoch": 2.727380528874659, + "grad_norm": 0.6310807056751019, + "learning_rate": 1.2138682317194703e-05, + "loss": 0.0941, + "step": 23000 + }, + { + "epoch": 2.727499110636784, + "grad_norm": 0.566106268524832, + "learning_rate": 1.2136624021793081e-05, + "loss": 0.0848, + "step": 23001 + }, + { + "epoch": 2.7276176923989093, + "grad_norm": 0.48327421856590974, + "learning_rate": 1.2134565844976012e-05, + "loss": 0.0651, + "step": 23002 + }, + { + "epoch": 2.727736274161034, + "grad_norm": 0.6914048958118376, + "learning_rate": 1.2132507786762479e-05, + "loss": 0.0666, + "step": 23003 + }, + { + "epoch": 2.727854855923159, + "grad_norm": 0.7063237220247371, + "learning_rate": 1.2130449847171434e-05, + "loss": 0.0849, + "step": 23004 + }, + { + "epoch": 2.727973437685284, + "grad_norm": 0.49816492366509024, + "learning_rate": 1.2128392026221864e-05, + "loss": 0.0757, + "step": 23005 + }, + { + "epoch": 2.728092019447409, + "grad_norm": 0.7099997615020643, + "learning_rate": 1.212633432393273e-05, + "loss": 0.0942, + "step": 23006 + }, + { + "epoch": 2.728210601209534, + "grad_norm": 0.437460881815232, + "learning_rate": 1.2124276740323018e-05, + "loss": 0.0554, + "step": 23007 + }, + { + "epoch": 2.7283291829716587, + "grad_norm": 0.6665224268613121, + "learning_rate": 1.2122219275411668e-05, + "loss": 0.0948, + "step": 23008 + }, + { + "epoch": 2.728447764733784, + "grad_norm": 0.5990805378530335, + "learning_rate": 1.2120161929217682e-05, + "loss": 0.0766, + "step": 23009 + }, + { + "epoch": 2.728566346495909, + "grad_norm": 0.8142979052419979, + "learning_rate": 1.211810470176e-05, + "loss": 0.0972, + "step": 23010 + }, + { + "epoch": 2.728684928258034, + "grad_norm": 0.5632124498516123, + "learning_rate": 1.2116047593057595e-05, + "loss": 0.0718, + "step": 23011 + }, + { + "epoch": 2.7288035100201586, + "grad_norm": 0.6851194088061068, + "learning_rate": 1.2113990603129433e-05, + "loss": 0.0889, + "step": 23012 + }, + { + "epoch": 2.728922091782284, + "grad_norm": 0.7011190813113061, + "learning_rate": 1.2111933731994474e-05, + "loss": 0.0831, + "step": 23013 + }, + { + "epoch": 2.729040673544409, + "grad_norm": 0.5905789315041149, + "learning_rate": 1.2109876979671692e-05, + "loss": 0.0795, + "step": 23014 + }, + { + "epoch": 2.729159255306534, + "grad_norm": 0.5630293888629859, + "learning_rate": 1.2107820346180029e-05, + "loss": 0.0572, + "step": 23015 + }, + { + "epoch": 2.7292778370686586, + "grad_norm": 0.793781195176863, + "learning_rate": 1.2105763831538452e-05, + "loss": 0.1024, + "step": 23016 + }, + { + "epoch": 2.7293964188307838, + "grad_norm": 0.5686305606129101, + "learning_rate": 1.210370743576592e-05, + "loss": 0.0743, + "step": 23017 + }, + { + "epoch": 2.729515000592909, + "grad_norm": 0.773305812413292, + "learning_rate": 1.2101651158881403e-05, + "loss": 0.109, + "step": 23018 + }, + { + "epoch": 2.7296335823550337, + "grad_norm": 0.603966970166552, + "learning_rate": 1.2099595000903831e-05, + "loss": 0.0788, + "step": 23019 + }, + { + "epoch": 2.729752164117159, + "grad_norm": 0.5430200385638231, + "learning_rate": 1.2097538961852178e-05, + "loss": 0.0781, + "step": 23020 + }, + { + "epoch": 2.7298707458792837, + "grad_norm": 0.7850105371320054, + "learning_rate": 1.2095483041745392e-05, + "loss": 0.0914, + "step": 23021 + }, + { + "epoch": 2.729989327641409, + "grad_norm": 0.606624188160079, + "learning_rate": 1.2093427240602428e-05, + "loss": 0.0803, + "step": 23022 + }, + { + "epoch": 2.7301079094035337, + "grad_norm": 0.558199476300812, + "learning_rate": 1.2091371558442239e-05, + "loss": 0.0724, + "step": 23023 + }, + { + "epoch": 2.730226491165659, + "grad_norm": 0.8903959528100039, + "learning_rate": 1.208931599528377e-05, + "loss": 0.0997, + "step": 23024 + }, + { + "epoch": 2.7303450729277836, + "grad_norm": 0.9391698108208114, + "learning_rate": 1.208726055114599e-05, + "loss": 0.1003, + "step": 23025 + }, + { + "epoch": 2.730463654689909, + "grad_norm": 0.666286808384473, + "learning_rate": 1.2085205226047821e-05, + "loss": 0.1058, + "step": 23026 + }, + { + "epoch": 2.7305822364520336, + "grad_norm": 0.6966513590385467, + "learning_rate": 1.2083150020008224e-05, + "loss": 0.0848, + "step": 23027 + }, + { + "epoch": 2.7307008182141588, + "grad_norm": 0.7123579547412119, + "learning_rate": 1.2081094933046142e-05, + "loss": 0.089, + "step": 23028 + }, + { + "epoch": 2.7308193999762835, + "grad_norm": 0.9696084299500916, + "learning_rate": 1.2079039965180532e-05, + "loss": 0.1078, + "step": 23029 + }, + { + "epoch": 2.7309379817384087, + "grad_norm": 0.599851910423277, + "learning_rate": 1.2076985116430322e-05, + "loss": 0.061, + "step": 23030 + }, + { + "epoch": 2.7310565635005335, + "grad_norm": 0.621765409942884, + "learning_rate": 1.2074930386814462e-05, + "loss": 0.0795, + "step": 23031 + }, + { + "epoch": 2.7311751452626587, + "grad_norm": 0.6423753605194666, + "learning_rate": 1.207287577635189e-05, + "loss": 0.0872, + "step": 23032 + }, + { + "epoch": 2.7312937270247835, + "grad_norm": 0.9276133822050114, + "learning_rate": 1.2070821285061556e-05, + "loss": 0.1205, + "step": 23033 + }, + { + "epoch": 2.7314123087869087, + "grad_norm": 0.8250449166814429, + "learning_rate": 1.2068766912962403e-05, + "loss": 0.1237, + "step": 23034 + }, + { + "epoch": 2.7315308905490334, + "grad_norm": 0.8117054239868996, + "learning_rate": 1.2066712660073342e-05, + "loss": 0.0883, + "step": 23035 + }, + { + "epoch": 2.7316494723111586, + "grad_norm": 0.6038419496388142, + "learning_rate": 1.206465852641335e-05, + "loss": 0.0944, + "step": 23036 + }, + { + "epoch": 2.7317680540732834, + "grad_norm": 0.7233984034982177, + "learning_rate": 1.2062604512001333e-05, + "loss": 0.0891, + "step": 23037 + }, + { + "epoch": 2.7318866358354086, + "grad_norm": 0.7061106040435764, + "learning_rate": 1.2060550616856239e-05, + "loss": 0.1057, + "step": 23038 + }, + { + "epoch": 2.7320052175975333, + "grad_norm": 0.7410352113852099, + "learning_rate": 1.2058496840997e-05, + "loss": 0.1081, + "step": 23039 + }, + { + "epoch": 2.7321237993596585, + "grad_norm": 1.1220993536532746, + "learning_rate": 1.2056443184442562e-05, + "loss": 0.1135, + "step": 23040 + }, + { + "epoch": 2.7322423811217833, + "grad_norm": 0.800671894653105, + "learning_rate": 1.2054389647211833e-05, + "loss": 0.0977, + "step": 23041 + }, + { + "epoch": 2.7323609628839085, + "grad_norm": 0.6906868146628204, + "learning_rate": 1.2052336229323757e-05, + "loss": 0.0974, + "step": 23042 + }, + { + "epoch": 2.7324795446460337, + "grad_norm": 0.46752021440072034, + "learning_rate": 1.2050282930797263e-05, + "loss": 0.0546, + "step": 23043 + }, + { + "epoch": 2.7325981264081585, + "grad_norm": 0.49228973651040475, + "learning_rate": 1.2048229751651283e-05, + "loss": 0.082, + "step": 23044 + }, + { + "epoch": 2.7327167081702832, + "grad_norm": 0.6415345423724852, + "learning_rate": 1.204617669190475e-05, + "loss": 0.0821, + "step": 23045 + }, + { + "epoch": 2.7328352899324084, + "grad_norm": 0.8040138238428041, + "learning_rate": 1.2044123751576564e-05, + "loss": 0.1142, + "step": 23046 + }, + { + "epoch": 2.7329538716945336, + "grad_norm": 0.8646681610341688, + "learning_rate": 1.204207093068569e-05, + "loss": 0.1028, + "step": 23047 + }, + { + "epoch": 2.7330724534566584, + "grad_norm": 0.7452872888712717, + "learning_rate": 1.2040018229251018e-05, + "loss": 0.0917, + "step": 23048 + }, + { + "epoch": 2.733191035218783, + "grad_norm": 0.4911370938882959, + "learning_rate": 1.2037965647291489e-05, + "loss": 0.0525, + "step": 23049 + }, + { + "epoch": 2.7333096169809084, + "grad_norm": 0.4734322563755914, + "learning_rate": 1.203591318482602e-05, + "loss": 0.0687, + "step": 23050 + }, + { + "epoch": 2.7334281987430336, + "grad_norm": 0.8037009097054694, + "learning_rate": 1.2033860841873536e-05, + "loss": 0.1168, + "step": 23051 + }, + { + "epoch": 2.7335467805051583, + "grad_norm": 0.7116718912830884, + "learning_rate": 1.2031808618452964e-05, + "loss": 0.0965, + "step": 23052 + }, + { + "epoch": 2.733665362267283, + "grad_norm": 0.6802849407263313, + "learning_rate": 1.2029756514583192e-05, + "loss": 0.0779, + "step": 23053 + }, + { + "epoch": 2.7337839440294083, + "grad_norm": 0.6929793486052311, + "learning_rate": 1.2027704530283182e-05, + "loss": 0.0879, + "step": 23054 + }, + { + "epoch": 2.7339025257915335, + "grad_norm": 0.6863774602146308, + "learning_rate": 1.2025652665571818e-05, + "loss": 0.0865, + "step": 23055 + }, + { + "epoch": 2.7340211075536582, + "grad_norm": 0.5342201968684808, + "learning_rate": 1.2023600920468033e-05, + "loss": 0.062, + "step": 23056 + }, + { + "epoch": 2.734139689315783, + "grad_norm": 0.5743407078724405, + "learning_rate": 1.2021549294990719e-05, + "loss": 0.0716, + "step": 23057 + }, + { + "epoch": 2.734258271077908, + "grad_norm": 0.42395862756474906, + "learning_rate": 1.2019497789158826e-05, + "loss": 0.0604, + "step": 23058 + }, + { + "epoch": 2.7343768528400334, + "grad_norm": 0.48382394584177574, + "learning_rate": 1.2017446402991234e-05, + "loss": 0.0559, + "step": 23059 + }, + { + "epoch": 2.734495434602158, + "grad_norm": 0.9854129165798985, + "learning_rate": 1.2015395136506866e-05, + "loss": 0.1044, + "step": 23060 + }, + { + "epoch": 2.734614016364283, + "grad_norm": 0.6087149266921947, + "learning_rate": 1.2013343989724629e-05, + "loss": 0.0823, + "step": 23061 + }, + { + "epoch": 2.734732598126408, + "grad_norm": 1.0226208223974493, + "learning_rate": 1.201129296266344e-05, + "loss": 0.1123, + "step": 23062 + }, + { + "epoch": 2.7348511798885333, + "grad_norm": 0.6840903715628531, + "learning_rate": 1.2009242055342209e-05, + "loss": 0.0913, + "step": 23063 + }, + { + "epoch": 2.734969761650658, + "grad_norm": 0.7285194041642314, + "learning_rate": 1.2007191267779816e-05, + "loss": 0.0959, + "step": 23064 + }, + { + "epoch": 2.735088343412783, + "grad_norm": 0.48133704334331157, + "learning_rate": 1.2005140599995208e-05, + "loss": 0.066, + "step": 23065 + }, + { + "epoch": 2.735206925174908, + "grad_norm": 0.5202112242778785, + "learning_rate": 1.2003090052007257e-05, + "loss": 0.0681, + "step": 23066 + }, + { + "epoch": 2.7353255069370332, + "grad_norm": 0.8080572442892797, + "learning_rate": 1.200103962383489e-05, + "loss": 0.1201, + "step": 23067 + }, + { + "epoch": 2.735444088699158, + "grad_norm": 0.9905204382721391, + "learning_rate": 1.1998989315496976e-05, + "loss": 0.125, + "step": 23068 + }, + { + "epoch": 2.735562670461283, + "grad_norm": 0.7716289025041229, + "learning_rate": 1.1996939127012457e-05, + "loss": 0.0973, + "step": 23069 + }, + { + "epoch": 2.735681252223408, + "grad_norm": 0.6124046805506348, + "learning_rate": 1.1994889058400205e-05, + "loss": 0.0808, + "step": 23070 + }, + { + "epoch": 2.735799833985533, + "grad_norm": 0.7977043548730671, + "learning_rate": 1.1992839109679127e-05, + "loss": 0.0994, + "step": 23071 + }, + { + "epoch": 2.735918415747658, + "grad_norm": 0.7910628357926021, + "learning_rate": 1.1990789280868123e-05, + "loss": 0.0894, + "step": 23072 + }, + { + "epoch": 2.736036997509783, + "grad_norm": 0.5127537355202552, + "learning_rate": 1.1988739571986088e-05, + "loss": 0.0634, + "step": 23073 + }, + { + "epoch": 2.736155579271908, + "grad_norm": 0.48736373436842617, + "learning_rate": 1.1986689983051927e-05, + "loss": 0.0545, + "step": 23074 + }, + { + "epoch": 2.736274161034033, + "grad_norm": 0.8247651361643328, + "learning_rate": 1.198464051408452e-05, + "loss": 0.1021, + "step": 23075 + }, + { + "epoch": 2.736392742796158, + "grad_norm": 0.5244339755518762, + "learning_rate": 1.1982591165102764e-05, + "loss": 0.0821, + "step": 23076 + }, + { + "epoch": 2.736511324558283, + "grad_norm": 0.660831383078059, + "learning_rate": 1.1980541936125555e-05, + "loss": 0.1009, + "step": 23077 + }, + { + "epoch": 2.736629906320408, + "grad_norm": 0.9934536527998663, + "learning_rate": 1.1978492827171794e-05, + "loss": 0.1238, + "step": 23078 + }, + { + "epoch": 2.736748488082533, + "grad_norm": 0.5950771479451896, + "learning_rate": 1.1976443838260343e-05, + "loss": 0.0927, + "step": 23079 + }, + { + "epoch": 2.7368670698446578, + "grad_norm": 0.8181555584629641, + "learning_rate": 1.1974394969410127e-05, + "loss": 0.0961, + "step": 23080 + }, + { + "epoch": 2.736985651606783, + "grad_norm": 0.5690534119184455, + "learning_rate": 1.197234622064001e-05, + "loss": 0.0757, + "step": 23081 + }, + { + "epoch": 2.7371042333689077, + "grad_norm": 0.7247676031073997, + "learning_rate": 1.1970297591968882e-05, + "loss": 0.0967, + "step": 23082 + }, + { + "epoch": 2.737222815131033, + "grad_norm": 0.9502016901087762, + "learning_rate": 1.1968249083415631e-05, + "loss": 0.1096, + "step": 23083 + }, + { + "epoch": 2.7373413968931577, + "grad_norm": 0.744277339112612, + "learning_rate": 1.1966200694999147e-05, + "loss": 0.0847, + "step": 23084 + }, + { + "epoch": 2.737459978655283, + "grad_norm": 0.515226601155056, + "learning_rate": 1.1964152426738314e-05, + "loss": 0.0693, + "step": 23085 + }, + { + "epoch": 2.7375785604174077, + "grad_norm": 0.7051696137683684, + "learning_rate": 1.1962104278652003e-05, + "loss": 0.0747, + "step": 23086 + }, + { + "epoch": 2.737697142179533, + "grad_norm": 0.8923779953863118, + "learning_rate": 1.1960056250759105e-05, + "loss": 0.1254, + "step": 23087 + }, + { + "epoch": 2.7378157239416576, + "grad_norm": 0.7258364245242953, + "learning_rate": 1.1958008343078495e-05, + "loss": 0.0997, + "step": 23088 + }, + { + "epoch": 2.737934305703783, + "grad_norm": 0.6471806930467116, + "learning_rate": 1.1955960555629064e-05, + "loss": 0.0757, + "step": 23089 + }, + { + "epoch": 2.7380528874659076, + "grad_norm": 0.5663824680111791, + "learning_rate": 1.1953912888429672e-05, + "loss": 0.0782, + "step": 23090 + }, + { + "epoch": 2.738171469228033, + "grad_norm": 0.8869348306532913, + "learning_rate": 1.1951865341499204e-05, + "loss": 0.1372, + "step": 23091 + }, + { + "epoch": 2.738290050990158, + "grad_norm": 0.8174850902507066, + "learning_rate": 1.1949817914856539e-05, + "loss": 0.1, + "step": 23092 + }, + { + "epoch": 2.7384086327522827, + "grad_norm": 0.5503972159664063, + "learning_rate": 1.194777060852055e-05, + "loss": 0.0657, + "step": 23093 + }, + { + "epoch": 2.7385272145144075, + "grad_norm": 0.6758769634335257, + "learning_rate": 1.1945723422510108e-05, + "loss": 0.0886, + "step": 23094 + }, + { + "epoch": 2.7386457962765327, + "grad_norm": 0.891715021946282, + "learning_rate": 1.1943676356844088e-05, + "loss": 0.1325, + "step": 23095 + }, + { + "epoch": 2.738764378038658, + "grad_norm": 0.5254079648953677, + "learning_rate": 1.1941629411541371e-05, + "loss": 0.0595, + "step": 23096 + }, + { + "epoch": 2.7388829598007827, + "grad_norm": 0.7877805937817046, + "learning_rate": 1.193958258662081e-05, + "loss": 0.1152, + "step": 23097 + }, + { + "epoch": 2.7390015415629074, + "grad_norm": 0.4346954446194328, + "learning_rate": 1.1937535882101281e-05, + "loss": 0.0559, + "step": 23098 + }, + { + "epoch": 2.7391201233250326, + "grad_norm": 0.7460255317026866, + "learning_rate": 1.193548929800165e-05, + "loss": 0.0803, + "step": 23099 + }, + { + "epoch": 2.739238705087158, + "grad_norm": 0.5830992596601138, + "learning_rate": 1.1933442834340792e-05, + "loss": 0.0776, + "step": 23100 + }, + { + "epoch": 2.7393572868492826, + "grad_norm": 0.739242161601661, + "learning_rate": 1.1931396491137575e-05, + "loss": 0.0871, + "step": 23101 + }, + { + "epoch": 2.7394758686114073, + "grad_norm": 0.6334974330719747, + "learning_rate": 1.1929350268410836e-05, + "loss": 0.0752, + "step": 23102 + }, + { + "epoch": 2.7395944503735326, + "grad_norm": 0.7683917073286749, + "learning_rate": 1.1927304166179478e-05, + "loss": 0.1055, + "step": 23103 + }, + { + "epoch": 2.7397130321356578, + "grad_norm": 1.2435054073679588, + "learning_rate": 1.1925258184462337e-05, + "loss": 0.1359, + "step": 23104 + }, + { + "epoch": 2.7398316138977825, + "grad_norm": 0.7257584780848799, + "learning_rate": 1.1923212323278294e-05, + "loss": 0.0822, + "step": 23105 + }, + { + "epoch": 2.7399501956599073, + "grad_norm": 0.6298042578372426, + "learning_rate": 1.1921166582646176e-05, + "loss": 0.0823, + "step": 23106 + }, + { + "epoch": 2.7400687774220325, + "grad_norm": 1.1187060070318102, + "learning_rate": 1.1919120962584881e-05, + "loss": 0.1402, + "step": 23107 + }, + { + "epoch": 2.7401873591841577, + "grad_norm": 0.5684241901149921, + "learning_rate": 1.1917075463113243e-05, + "loss": 0.0704, + "step": 23108 + }, + { + "epoch": 2.7403059409462824, + "grad_norm": 1.0314117372453524, + "learning_rate": 1.1915030084250126e-05, + "loss": 0.1439, + "step": 23109 + }, + { + "epoch": 2.740424522708407, + "grad_norm": 1.0292443061038474, + "learning_rate": 1.1912984826014384e-05, + "loss": 0.126, + "step": 23110 + }, + { + "epoch": 2.7405431044705324, + "grad_norm": 0.6915565714184718, + "learning_rate": 1.1910939688424874e-05, + "loss": 0.0972, + "step": 23111 + }, + { + "epoch": 2.7406616862326576, + "grad_norm": 0.543985283469277, + "learning_rate": 1.1908894671500459e-05, + "loss": 0.0897, + "step": 23112 + }, + { + "epoch": 2.7407802679947824, + "grad_norm": 0.4910810802485434, + "learning_rate": 1.1906849775259963e-05, + "loss": 0.0728, + "step": 23113 + }, + { + "epoch": 2.740898849756907, + "grad_norm": 0.4819163360907256, + "learning_rate": 1.1904804999722275e-05, + "loss": 0.0606, + "step": 23114 + }, + { + "epoch": 2.7410174315190323, + "grad_norm": 0.6985967397273753, + "learning_rate": 1.1902760344906217e-05, + "loss": 0.1188, + "step": 23115 + }, + { + "epoch": 2.7411360132811575, + "grad_norm": 0.6328884678863625, + "learning_rate": 1.1900715810830656e-05, + "loss": 0.0741, + "step": 23116 + }, + { + "epoch": 2.7412545950432823, + "grad_norm": 0.5386810087423353, + "learning_rate": 1.1898671397514416e-05, + "loss": 0.0793, + "step": 23117 + }, + { + "epoch": 2.7413731768054075, + "grad_norm": 0.5980652631625646, + "learning_rate": 1.1896627104976377e-05, + "loss": 0.0736, + "step": 23118 + }, + { + "epoch": 2.7414917585675322, + "grad_norm": 0.4614697445183384, + "learning_rate": 1.1894582933235357e-05, + "loss": 0.0643, + "step": 23119 + }, + { + "epoch": 2.7416103403296574, + "grad_norm": 0.7200859552413702, + "learning_rate": 1.1892538882310213e-05, + "loss": 0.0766, + "step": 23120 + }, + { + "epoch": 2.741728922091782, + "grad_norm": 0.6907502118815246, + "learning_rate": 1.1890494952219788e-05, + "loss": 0.114, + "step": 23121 + }, + { + "epoch": 2.7418475038539074, + "grad_norm": 0.865598098409033, + "learning_rate": 1.188845114298292e-05, + "loss": 0.1391, + "step": 23122 + }, + { + "epoch": 2.741966085616032, + "grad_norm": 0.7194538319221677, + "learning_rate": 1.1886407454618467e-05, + "loss": 0.1162, + "step": 23123 + }, + { + "epoch": 2.7420846673781574, + "grad_norm": 0.7105125018141333, + "learning_rate": 1.1884363887145236e-05, + "loss": 0.093, + "step": 23124 + }, + { + "epoch": 2.742203249140282, + "grad_norm": 0.6800941052590278, + "learning_rate": 1.1882320440582107e-05, + "loss": 0.0882, + "step": 23125 + }, + { + "epoch": 2.7423218309024073, + "grad_norm": 0.46367496179140644, + "learning_rate": 1.1880277114947886e-05, + "loss": 0.0612, + "step": 23126 + }, + { + "epoch": 2.742440412664532, + "grad_norm": 0.7881347228507966, + "learning_rate": 1.187823391026143e-05, + "loss": 0.0977, + "step": 23127 + }, + { + "epoch": 2.7425589944266573, + "grad_norm": 0.6917065781051245, + "learning_rate": 1.187619082654155e-05, + "loss": 0.0828, + "step": 23128 + }, + { + "epoch": 2.742677576188782, + "grad_norm": 0.6608229653215935, + "learning_rate": 1.1874147863807114e-05, + "loss": 0.0895, + "step": 23129 + }, + { + "epoch": 2.7427961579509073, + "grad_norm": 1.2206853768861883, + "learning_rate": 1.1872105022076933e-05, + "loss": 0.1279, + "step": 23130 + }, + { + "epoch": 2.742914739713032, + "grad_norm": 0.6197965758551146, + "learning_rate": 1.1870062301369841e-05, + "loss": 0.0783, + "step": 23131 + }, + { + "epoch": 2.743033321475157, + "grad_norm": 0.7143669259807663, + "learning_rate": 1.1868019701704675e-05, + "loss": 0.0932, + "step": 23132 + }, + { + "epoch": 2.743151903237282, + "grad_norm": 0.7399615047909145, + "learning_rate": 1.1865977223100264e-05, + "loss": 0.108, + "step": 23133 + }, + { + "epoch": 2.743270484999407, + "grad_norm": 0.8559245154917009, + "learning_rate": 1.1863934865575447e-05, + "loss": 0.1033, + "step": 23134 + }, + { + "epoch": 2.743389066761532, + "grad_norm": 0.572563723591045, + "learning_rate": 1.1861892629149033e-05, + "loss": 0.0672, + "step": 23135 + }, + { + "epoch": 2.743507648523657, + "grad_norm": 0.5574742855253119, + "learning_rate": 1.1859850513839857e-05, + "loss": 0.0769, + "step": 23136 + }, + { + "epoch": 2.743626230285782, + "grad_norm": 0.5932449919886581, + "learning_rate": 1.1857808519666743e-05, + "loss": 0.0839, + "step": 23137 + }, + { + "epoch": 2.743744812047907, + "grad_norm": 0.642047499816333, + "learning_rate": 1.1855766646648531e-05, + "loss": 0.0849, + "step": 23138 + }, + { + "epoch": 2.743863393810032, + "grad_norm": 0.7430136713736295, + "learning_rate": 1.1853724894804013e-05, + "loss": 0.0724, + "step": 23139 + }, + { + "epoch": 2.743981975572157, + "grad_norm": 0.8194555100206072, + "learning_rate": 1.185168326415205e-05, + "loss": 0.1138, + "step": 23140 + }, + { + "epoch": 2.7441005573342823, + "grad_norm": 0.6262802348038256, + "learning_rate": 1.1849641754711433e-05, + "loss": 0.0775, + "step": 23141 + }, + { + "epoch": 2.744219139096407, + "grad_norm": 0.7193131787260711, + "learning_rate": 1.1847600366500993e-05, + "loss": 0.1066, + "step": 23142 + }, + { + "epoch": 2.744337720858532, + "grad_norm": 0.5716881353120598, + "learning_rate": 1.184555909953955e-05, + "loss": 0.0746, + "step": 23143 + }, + { + "epoch": 2.744456302620657, + "grad_norm": 0.43631985943222434, + "learning_rate": 1.184351795384592e-05, + "loss": 0.0519, + "step": 23144 + }, + { + "epoch": 2.744574884382782, + "grad_norm": 0.7082124563675943, + "learning_rate": 1.184147692943893e-05, + "loss": 0.0601, + "step": 23145 + }, + { + "epoch": 2.744693466144907, + "grad_norm": 0.8330704892147364, + "learning_rate": 1.183943602633738e-05, + "loss": 0.1082, + "step": 23146 + }, + { + "epoch": 2.7448120479070317, + "grad_norm": 0.6986217569706727, + "learning_rate": 1.183739524456009e-05, + "loss": 0.0934, + "step": 23147 + }, + { + "epoch": 2.744930629669157, + "grad_norm": 0.8732521909319247, + "learning_rate": 1.1835354584125873e-05, + "loss": 0.1007, + "step": 23148 + }, + { + "epoch": 2.745049211431282, + "grad_norm": 0.763817611371851, + "learning_rate": 1.1833314045053546e-05, + "loss": 0.1201, + "step": 23149 + }, + { + "epoch": 2.745167793193407, + "grad_norm": 0.7646920651004684, + "learning_rate": 1.1831273627361914e-05, + "loss": 0.108, + "step": 23150 + }, + { + "epoch": 2.7452863749555316, + "grad_norm": 0.7726998346658454, + "learning_rate": 1.1829233331069792e-05, + "loss": 0.0814, + "step": 23151 + }, + { + "epoch": 2.745404956717657, + "grad_norm": 0.7290121863596605, + "learning_rate": 1.1827193156195998e-05, + "loss": 0.0872, + "step": 23152 + }, + { + "epoch": 2.745523538479782, + "grad_norm": 0.714872884854006, + "learning_rate": 1.1825153102759321e-05, + "loss": 0.1008, + "step": 23153 + }, + { + "epoch": 2.745642120241907, + "grad_norm": 0.5539275144104406, + "learning_rate": 1.1823113170778574e-05, + "loss": 0.0881, + "step": 23154 + }, + { + "epoch": 2.7457607020040316, + "grad_norm": 0.6873414189376511, + "learning_rate": 1.1821073360272564e-05, + "loss": 0.0812, + "step": 23155 + }, + { + "epoch": 2.7458792837661568, + "grad_norm": 0.45798000962287394, + "learning_rate": 1.181903367126011e-05, + "loss": 0.0581, + "step": 23156 + }, + { + "epoch": 2.745997865528282, + "grad_norm": 0.6099623557091703, + "learning_rate": 1.181699410375999e-05, + "loss": 0.0648, + "step": 23157 + }, + { + "epoch": 2.7461164472904067, + "grad_norm": 0.7226081787052613, + "learning_rate": 1.1814954657791018e-05, + "loss": 0.0908, + "step": 23158 + }, + { + "epoch": 2.7462350290525315, + "grad_norm": 0.8675639387170122, + "learning_rate": 1.1812915333371996e-05, + "loss": 0.1147, + "step": 23159 + }, + { + "epoch": 2.7463536108146567, + "grad_norm": 0.5375888567709732, + "learning_rate": 1.1810876130521726e-05, + "loss": 0.0667, + "step": 23160 + }, + { + "epoch": 2.746472192576782, + "grad_norm": 0.6832110038975256, + "learning_rate": 1.1808837049259009e-05, + "loss": 0.0881, + "step": 23161 + }, + { + "epoch": 2.7465907743389066, + "grad_norm": 0.5205632312676047, + "learning_rate": 1.1806798089602623e-05, + "loss": 0.0549, + "step": 23162 + }, + { + "epoch": 2.7467093561010314, + "grad_norm": 0.6139078980951905, + "learning_rate": 1.1804759251571398e-05, + "loss": 0.0911, + "step": 23163 + }, + { + "epoch": 2.7468279378631566, + "grad_norm": 0.7016036313385798, + "learning_rate": 1.1802720535184103e-05, + "loss": 0.115, + "step": 23164 + }, + { + "epoch": 2.746946519625282, + "grad_norm": 0.6829602416788452, + "learning_rate": 1.180068194045954e-05, + "loss": 0.0855, + "step": 23165 + }, + { + "epoch": 2.7470651013874066, + "grad_norm": 0.6060883927442322, + "learning_rate": 1.1798643467416504e-05, + "loss": 0.0688, + "step": 23166 + }, + { + "epoch": 2.7471836831495318, + "grad_norm": 0.6672183704070178, + "learning_rate": 1.1796605116073794e-05, + "loss": 0.0891, + "step": 23167 + }, + { + "epoch": 2.7473022649116565, + "grad_norm": 0.9331675915869254, + "learning_rate": 1.1794566886450187e-05, + "loss": 0.1274, + "step": 23168 + }, + { + "epoch": 2.7474208466737817, + "grad_norm": 0.6574574314680772, + "learning_rate": 1.179252877856448e-05, + "loss": 0.0855, + "step": 23169 + }, + { + "epoch": 2.7475394284359065, + "grad_norm": 0.5074808629826048, + "learning_rate": 1.1790490792435461e-05, + "loss": 0.0807, + "step": 23170 + }, + { + "epoch": 2.7476580101980317, + "grad_norm": 0.8329929925104862, + "learning_rate": 1.1788452928081916e-05, + "loss": 0.1043, + "step": 23171 + }, + { + "epoch": 2.7477765919601564, + "grad_norm": 0.564710608666219, + "learning_rate": 1.1786415185522645e-05, + "loss": 0.0709, + "step": 23172 + }, + { + "epoch": 2.7478951737222816, + "grad_norm": 0.5932547642121235, + "learning_rate": 1.1784377564776406e-05, + "loss": 0.0871, + "step": 23173 + }, + { + "epoch": 2.7480137554844064, + "grad_norm": 0.5824500130420875, + "learning_rate": 1.1782340065862016e-05, + "loss": 0.0883, + "step": 23174 + }, + { + "epoch": 2.7481323372465316, + "grad_norm": 0.8767838113870643, + "learning_rate": 1.1780302688798231e-05, + "loss": 0.1109, + "step": 23175 + }, + { + "epoch": 2.7482509190086564, + "grad_norm": 0.536105045307961, + "learning_rate": 1.1778265433603858e-05, + "loss": 0.0807, + "step": 23176 + }, + { + "epoch": 2.7483695007707816, + "grad_norm": 0.5534242662881256, + "learning_rate": 1.1776228300297643e-05, + "loss": 0.0811, + "step": 23177 + }, + { + "epoch": 2.7484880825329063, + "grad_norm": 0.6707116299125645, + "learning_rate": 1.1774191288898403e-05, + "loss": 0.0948, + "step": 23178 + }, + { + "epoch": 2.7486066642950315, + "grad_norm": 0.6036276969488735, + "learning_rate": 1.1772154399424895e-05, + "loss": 0.0719, + "step": 23179 + }, + { + "epoch": 2.7487252460571563, + "grad_norm": 0.6773470820402991, + "learning_rate": 1.1770117631895902e-05, + "loss": 0.0864, + "step": 23180 + }, + { + "epoch": 2.7488438278192815, + "grad_norm": 0.7423429118236902, + "learning_rate": 1.17680809863302e-05, + "loss": 0.1111, + "step": 23181 + }, + { + "epoch": 2.7489624095814063, + "grad_norm": 0.8310268445102894, + "learning_rate": 1.1766044462746565e-05, + "loss": 0.1026, + "step": 23182 + }, + { + "epoch": 2.7490809913435315, + "grad_norm": 0.6858817708633038, + "learning_rate": 1.1764008061163781e-05, + "loss": 0.1016, + "step": 23183 + }, + { + "epoch": 2.749199573105656, + "grad_norm": 0.5477690041742157, + "learning_rate": 1.1761971781600595e-05, + "loss": 0.066, + "step": 23184 + }, + { + "epoch": 2.7493181548677814, + "grad_norm": 0.5640038618196758, + "learning_rate": 1.1759935624075813e-05, + "loss": 0.0707, + "step": 23185 + }, + { + "epoch": 2.749436736629906, + "grad_norm": 0.7474787906030458, + "learning_rate": 1.175789958860818e-05, + "loss": 0.108, + "step": 23186 + }, + { + "epoch": 2.7495553183920314, + "grad_norm": 0.911031840995823, + "learning_rate": 1.175586367521648e-05, + "loss": 0.1244, + "step": 23187 + }, + { + "epoch": 2.749673900154156, + "grad_norm": 0.4824626093640524, + "learning_rate": 1.175382788391946e-05, + "loss": 0.066, + "step": 23188 + }, + { + "epoch": 2.7497924819162813, + "grad_norm": 0.7701567431069881, + "learning_rate": 1.1751792214735921e-05, + "loss": 0.0835, + "step": 23189 + }, + { + "epoch": 2.749911063678406, + "grad_norm": 0.7239493623504701, + "learning_rate": 1.1749756667684605e-05, + "loss": 0.1096, + "step": 23190 + }, + { + "epoch": 2.7500296454405313, + "grad_norm": 0.4710844094835971, + "learning_rate": 1.1747721242784281e-05, + "loss": 0.0649, + "step": 23191 + }, + { + "epoch": 2.750148227202656, + "grad_norm": 0.5233701442149173, + "learning_rate": 1.1745685940053718e-05, + "loss": 0.0689, + "step": 23192 + }, + { + "epoch": 2.7502668089647813, + "grad_norm": 0.8504072954685145, + "learning_rate": 1.1743650759511676e-05, + "loss": 0.1119, + "step": 23193 + }, + { + "epoch": 2.7503853907269065, + "grad_norm": 0.5532022153350937, + "learning_rate": 1.1741615701176929e-05, + "loss": 0.0903, + "step": 23194 + }, + { + "epoch": 2.7505039724890312, + "grad_norm": 0.7374505109362389, + "learning_rate": 1.1739580765068205e-05, + "loss": 0.0691, + "step": 23195 + }, + { + "epoch": 2.750622554251156, + "grad_norm": 0.7906816537403468, + "learning_rate": 1.1737545951204307e-05, + "loss": 0.0772, + "step": 23196 + }, + { + "epoch": 2.750741136013281, + "grad_norm": 0.6615695759315355, + "learning_rate": 1.173551125960396e-05, + "loss": 0.0769, + "step": 23197 + }, + { + "epoch": 2.7508597177754064, + "grad_norm": 0.6220267347376586, + "learning_rate": 1.1733476690285936e-05, + "loss": 0.0704, + "step": 23198 + }, + { + "epoch": 2.750978299537531, + "grad_norm": 0.8156369029073308, + "learning_rate": 1.1731442243268986e-05, + "loss": 0.1299, + "step": 23199 + }, + { + "epoch": 2.751096881299656, + "grad_norm": 0.7291976561187722, + "learning_rate": 1.1729407918571868e-05, + "loss": 0.1055, + "step": 23200 + }, + { + "epoch": 2.751215463061781, + "grad_norm": 0.7628092324390173, + "learning_rate": 1.1727373716213347e-05, + "loss": 0.0648, + "step": 23201 + }, + { + "epoch": 2.7513340448239063, + "grad_norm": 0.47274566437729115, + "learning_rate": 1.1725339636212152e-05, + "loss": 0.0668, + "step": 23202 + }, + { + "epoch": 2.751452626586031, + "grad_norm": 0.4557146895878627, + "learning_rate": 1.172330567858705e-05, + "loss": 0.0528, + "step": 23203 + }, + { + "epoch": 2.751571208348156, + "grad_norm": 0.781567521938458, + "learning_rate": 1.1721271843356785e-05, + "loss": 0.0794, + "step": 23204 + }, + { + "epoch": 2.751689790110281, + "grad_norm": 0.5324589047348279, + "learning_rate": 1.1719238130540122e-05, + "loss": 0.0625, + "step": 23205 + }, + { + "epoch": 2.7518083718724062, + "grad_norm": 1.0243788622800671, + "learning_rate": 1.1717204540155788e-05, + "loss": 0.152, + "step": 23206 + }, + { + "epoch": 2.751926953634531, + "grad_norm": 0.5710371615873754, + "learning_rate": 1.1715171072222539e-05, + "loss": 0.0721, + "step": 23207 + }, + { + "epoch": 2.7520455353966558, + "grad_norm": 0.668131498094448, + "learning_rate": 1.1713137726759121e-05, + "loss": 0.0691, + "step": 23208 + }, + { + "epoch": 2.752164117158781, + "grad_norm": 0.7643715407016686, + "learning_rate": 1.1711104503784282e-05, + "loss": 0.1096, + "step": 23209 + }, + { + "epoch": 2.752282698920906, + "grad_norm": 0.9220865768194336, + "learning_rate": 1.1709071403316762e-05, + "loss": 0.1271, + "step": 23210 + }, + { + "epoch": 2.752401280683031, + "grad_norm": 0.5811655236168525, + "learning_rate": 1.1707038425375308e-05, + "loss": 0.0607, + "step": 23211 + }, + { + "epoch": 2.7525198624451557, + "grad_norm": 0.5055479625183099, + "learning_rate": 1.1705005569978664e-05, + "loss": 0.0589, + "step": 23212 + }, + { + "epoch": 2.752638444207281, + "grad_norm": 0.5834324561900491, + "learning_rate": 1.170297283714556e-05, + "loss": 0.0864, + "step": 23213 + }, + { + "epoch": 2.752757025969406, + "grad_norm": 0.9530297161050078, + "learning_rate": 1.1700940226894739e-05, + "loss": 0.1005, + "step": 23214 + }, + { + "epoch": 2.752875607731531, + "grad_norm": 0.4900741662942976, + "learning_rate": 1.1698907739244938e-05, + "loss": 0.054, + "step": 23215 + }, + { + "epoch": 2.752994189493656, + "grad_norm": 0.9030241336553417, + "learning_rate": 1.1696875374214907e-05, + "loss": 0.1471, + "step": 23216 + }, + { + "epoch": 2.753112771255781, + "grad_norm": 0.9769466530893934, + "learning_rate": 1.1694843131823366e-05, + "loss": 0.1186, + "step": 23217 + }, + { + "epoch": 2.753231353017906, + "grad_norm": 0.9068042400304376, + "learning_rate": 1.1692811012089053e-05, + "loss": 0.1256, + "step": 23218 + }, + { + "epoch": 2.7533499347800308, + "grad_norm": 0.6259703570407561, + "learning_rate": 1.1690779015030704e-05, + "loss": 0.0761, + "step": 23219 + }, + { + "epoch": 2.753468516542156, + "grad_norm": 0.6396594215431457, + "learning_rate": 1.168874714066705e-05, + "loss": 0.0886, + "step": 23220 + }, + { + "epoch": 2.7535870983042807, + "grad_norm": 0.8706575368164459, + "learning_rate": 1.1686715389016826e-05, + "loss": 0.1354, + "step": 23221 + }, + { + "epoch": 2.753705680066406, + "grad_norm": 0.6308450130044098, + "learning_rate": 1.1684683760098761e-05, + "loss": 0.087, + "step": 23222 + }, + { + "epoch": 2.7538242618285307, + "grad_norm": 0.4752296567049809, + "learning_rate": 1.1682652253931592e-05, + "loss": 0.0565, + "step": 23223 + }, + { + "epoch": 2.753942843590656, + "grad_norm": 0.5079119976685699, + "learning_rate": 1.1680620870534028e-05, + "loss": 0.0697, + "step": 23224 + }, + { + "epoch": 2.7540614253527806, + "grad_norm": 0.7120180024650957, + "learning_rate": 1.1678589609924809e-05, + "loss": 0.0953, + "step": 23225 + }, + { + "epoch": 2.754180007114906, + "grad_norm": 0.7641696036898306, + "learning_rate": 1.1676558472122654e-05, + "loss": 0.1204, + "step": 23226 + }, + { + "epoch": 2.7542985888770306, + "grad_norm": 0.7266781344738421, + "learning_rate": 1.1674527457146303e-05, + "loss": 0.1124, + "step": 23227 + }, + { + "epoch": 2.754417170639156, + "grad_norm": 0.5153508626326616, + "learning_rate": 1.167249656501446e-05, + "loss": 0.0652, + "step": 23228 + }, + { + "epoch": 2.7545357524012806, + "grad_norm": 0.8319796714487494, + "learning_rate": 1.1670465795745855e-05, + "loss": 0.0945, + "step": 23229 + }, + { + "epoch": 2.7546543341634058, + "grad_norm": 0.5574868875753586, + "learning_rate": 1.1668435149359207e-05, + "loss": 0.0702, + "step": 23230 + }, + { + "epoch": 2.7547729159255305, + "grad_norm": 0.9530105664030789, + "learning_rate": 1.1666404625873243e-05, + "loss": 0.1125, + "step": 23231 + }, + { + "epoch": 2.7548914976876557, + "grad_norm": 1.2054929310270381, + "learning_rate": 1.1664374225306681e-05, + "loss": 0.1461, + "step": 23232 + }, + { + "epoch": 2.7550100794497805, + "grad_norm": 0.6333081002597546, + "learning_rate": 1.1662343947678223e-05, + "loss": 0.0947, + "step": 23233 + }, + { + "epoch": 2.7551286612119057, + "grad_norm": 0.5226178631345191, + "learning_rate": 1.1660313793006614e-05, + "loss": 0.0679, + "step": 23234 + }, + { + "epoch": 2.7552472429740305, + "grad_norm": 0.4992619557426473, + "learning_rate": 1.1658283761310543e-05, + "loss": 0.0724, + "step": 23235 + }, + { + "epoch": 2.7553658247361557, + "grad_norm": 0.7398892847942955, + "learning_rate": 1.1656253852608737e-05, + "loss": 0.1298, + "step": 23236 + }, + { + "epoch": 2.7554844064982804, + "grad_norm": 0.566883579239692, + "learning_rate": 1.1654224066919906e-05, + "loss": 0.0817, + "step": 23237 + }, + { + "epoch": 2.7556029882604056, + "grad_norm": 0.8076735117892134, + "learning_rate": 1.1652194404262773e-05, + "loss": 0.1101, + "step": 23238 + }, + { + "epoch": 2.7557215700225304, + "grad_norm": 0.5089694423602504, + "learning_rate": 1.1650164864656032e-05, + "loss": 0.0666, + "step": 23239 + }, + { + "epoch": 2.7558401517846556, + "grad_norm": 0.656702611614249, + "learning_rate": 1.1648135448118397e-05, + "loss": 0.0825, + "step": 23240 + }, + { + "epoch": 2.7559587335467803, + "grad_norm": 0.842515503464508, + "learning_rate": 1.1646106154668582e-05, + "loss": 0.1331, + "step": 23241 + }, + { + "epoch": 2.7560773153089055, + "grad_norm": 0.8428625635548418, + "learning_rate": 1.1644076984325292e-05, + "loss": 0.1056, + "step": 23242 + }, + { + "epoch": 2.7561958970710307, + "grad_norm": 0.6562214446557147, + "learning_rate": 1.1642047937107246e-05, + "loss": 0.0826, + "step": 23243 + }, + { + "epoch": 2.7563144788331555, + "grad_norm": 0.47945732905923144, + "learning_rate": 1.1640019013033115e-05, + "loss": 0.0749, + "step": 23244 + }, + { + "epoch": 2.7564330605952803, + "grad_norm": 0.5305519751948833, + "learning_rate": 1.1637990212121646e-05, + "loss": 0.0635, + "step": 23245 + }, + { + "epoch": 2.7565516423574055, + "grad_norm": 0.6613964189324458, + "learning_rate": 1.1635961534391513e-05, + "loss": 0.0978, + "step": 23246 + }, + { + "epoch": 2.7566702241195307, + "grad_norm": 0.6249804084678224, + "learning_rate": 1.1633932979861425e-05, + "loss": 0.0781, + "step": 23247 + }, + { + "epoch": 2.7567888058816554, + "grad_norm": 0.5188163252537769, + "learning_rate": 1.1631904548550086e-05, + "loss": 0.0649, + "step": 23248 + }, + { + "epoch": 2.75690738764378, + "grad_norm": 0.6278208479737726, + "learning_rate": 1.1629876240476192e-05, + "loss": 0.083, + "step": 23249 + }, + { + "epoch": 2.7570259694059054, + "grad_norm": 0.6053085938702235, + "learning_rate": 1.1627848055658456e-05, + "loss": 0.0788, + "step": 23250 + }, + { + "epoch": 2.7571445511680306, + "grad_norm": 0.6059622991678569, + "learning_rate": 1.1625819994115541e-05, + "loss": 0.0735, + "step": 23251 + }, + { + "epoch": 2.7572631329301553, + "grad_norm": 0.5126108803591934, + "learning_rate": 1.1623792055866184e-05, + "loss": 0.0668, + "step": 23252 + }, + { + "epoch": 2.75738171469228, + "grad_norm": 0.9577015470226847, + "learning_rate": 1.1621764240929053e-05, + "loss": 0.1346, + "step": 23253 + }, + { + "epoch": 2.7575002964544053, + "grad_norm": 0.7054508274891346, + "learning_rate": 1.161973654932286e-05, + "loss": 0.1132, + "step": 23254 + }, + { + "epoch": 2.7576188782165305, + "grad_norm": 0.9454499525837586, + "learning_rate": 1.161770898106627e-05, + "loss": 0.1141, + "step": 23255 + }, + { + "epoch": 2.7577374599786553, + "grad_norm": 0.8639526446855226, + "learning_rate": 1.1615681536178011e-05, + "loss": 0.0979, + "step": 23256 + }, + { + "epoch": 2.75785604174078, + "grad_norm": 0.6646357053917555, + "learning_rate": 1.1613654214676744e-05, + "loss": 0.108, + "step": 23257 + }, + { + "epoch": 2.7579746235029052, + "grad_norm": 0.6994130065185173, + "learning_rate": 1.1611627016581173e-05, + "loss": 0.0949, + "step": 23258 + }, + { + "epoch": 2.7580932052650304, + "grad_norm": 0.7370143765838, + "learning_rate": 1.1609599941909979e-05, + "loss": 0.0913, + "step": 23259 + }, + { + "epoch": 2.758211787027155, + "grad_norm": 0.8145597353659629, + "learning_rate": 1.1607572990681857e-05, + "loss": 0.0884, + "step": 23260 + }, + { + "epoch": 2.75833036878928, + "grad_norm": 0.755683606584451, + "learning_rate": 1.1605546162915495e-05, + "loss": 0.0914, + "step": 23261 + }, + { + "epoch": 2.758448950551405, + "grad_norm": 0.7982427033390126, + "learning_rate": 1.1603519458629564e-05, + "loss": 0.1218, + "step": 23262 + }, + { + "epoch": 2.7585675323135304, + "grad_norm": 0.5197582612355064, + "learning_rate": 1.1601492877842756e-05, + "loss": 0.0739, + "step": 23263 + }, + { + "epoch": 2.758686114075655, + "grad_norm": 0.7584675540523226, + "learning_rate": 1.1599466420573754e-05, + "loss": 0.0935, + "step": 23264 + }, + { + "epoch": 2.75880469583778, + "grad_norm": 0.5798880276573742, + "learning_rate": 1.159744008684125e-05, + "loss": 0.0784, + "step": 23265 + }, + { + "epoch": 2.758923277599905, + "grad_norm": 0.7099682157881472, + "learning_rate": 1.159541387666389e-05, + "loss": 0.1022, + "step": 23266 + }, + { + "epoch": 2.7590418593620303, + "grad_norm": 0.5297049571085045, + "learning_rate": 1.1593387790060397e-05, + "loss": 0.0689, + "step": 23267 + }, + { + "epoch": 2.759160441124155, + "grad_norm": 0.7394542871229782, + "learning_rate": 1.1591361827049418e-05, + "loss": 0.0786, + "step": 23268 + }, + { + "epoch": 2.7592790228862802, + "grad_norm": 0.9031389693201954, + "learning_rate": 1.158933598764964e-05, + "loss": 0.1152, + "step": 23269 + }, + { + "epoch": 2.759397604648405, + "grad_norm": 0.6602983535695344, + "learning_rate": 1.1587310271879742e-05, + "loss": 0.0675, + "step": 23270 + }, + { + "epoch": 2.75951618641053, + "grad_norm": 0.7099046392761678, + "learning_rate": 1.1585284679758393e-05, + "loss": 0.0852, + "step": 23271 + }, + { + "epoch": 2.759634768172655, + "grad_norm": 0.7838198491590664, + "learning_rate": 1.1583259211304279e-05, + "loss": 0.0998, + "step": 23272 + }, + { + "epoch": 2.75975334993478, + "grad_norm": 0.9412534715529464, + "learning_rate": 1.1581233866536054e-05, + "loss": 0.1364, + "step": 23273 + }, + { + "epoch": 2.759871931696905, + "grad_norm": 0.4234409832166605, + "learning_rate": 1.1579208645472397e-05, + "loss": 0.0547, + "step": 23274 + }, + { + "epoch": 2.75999051345903, + "grad_norm": 0.7878448620175719, + "learning_rate": 1.1577183548131978e-05, + "loss": 0.1055, + "step": 23275 + }, + { + "epoch": 2.760109095221155, + "grad_norm": 0.42337009067927306, + "learning_rate": 1.1575158574533475e-05, + "loss": 0.0446, + "step": 23276 + }, + { + "epoch": 2.76022767698328, + "grad_norm": 1.0373711734965738, + "learning_rate": 1.157313372469554e-05, + "loss": 0.143, + "step": 23277 + }, + { + "epoch": 2.760346258745405, + "grad_norm": 0.5745957793321691, + "learning_rate": 1.1571108998636846e-05, + "loss": 0.0776, + "step": 23278 + }, + { + "epoch": 2.76046484050753, + "grad_norm": 0.9687250682583013, + "learning_rate": 1.156908439637606e-05, + "loss": 0.1109, + "step": 23279 + }, + { + "epoch": 2.760583422269655, + "grad_norm": 0.7308304182036192, + "learning_rate": 1.1567059917931844e-05, + "loss": 0.0984, + "step": 23280 + }, + { + "epoch": 2.76070200403178, + "grad_norm": 0.5997574067741324, + "learning_rate": 1.1565035563322866e-05, + "loss": 0.0737, + "step": 23281 + }, + { + "epoch": 2.7608205857939048, + "grad_norm": 0.5432995121155878, + "learning_rate": 1.1563011332567782e-05, + "loss": 0.0736, + "step": 23282 + }, + { + "epoch": 2.76093916755603, + "grad_norm": 0.5460655131407403, + "learning_rate": 1.1560987225685265e-05, + "loss": 0.0663, + "step": 23283 + }, + { + "epoch": 2.7610577493181547, + "grad_norm": 0.5020434905573673, + "learning_rate": 1.155896324269396e-05, + "loss": 0.0688, + "step": 23284 + }, + { + "epoch": 2.76117633108028, + "grad_norm": 0.6367110578217066, + "learning_rate": 1.155693938361253e-05, + "loss": 0.0774, + "step": 23285 + }, + { + "epoch": 2.7612949128424047, + "grad_norm": 0.9362888084039903, + "learning_rate": 1.1554915648459633e-05, + "loss": 0.1176, + "step": 23286 + }, + { + "epoch": 2.76141349460453, + "grad_norm": 0.6931854928125081, + "learning_rate": 1.1552892037253934e-05, + "loss": 0.1019, + "step": 23287 + }, + { + "epoch": 2.7615320763666547, + "grad_norm": 0.6396179856485513, + "learning_rate": 1.1550868550014074e-05, + "loss": 0.0835, + "step": 23288 + }, + { + "epoch": 2.76165065812878, + "grad_norm": 0.5402306976064243, + "learning_rate": 1.1548845186758713e-05, + "loss": 0.072, + "step": 23289 + }, + { + "epoch": 2.7617692398909046, + "grad_norm": 1.0526982260744158, + "learning_rate": 1.1546821947506506e-05, + "loss": 0.1412, + "step": 23290 + }, + { + "epoch": 2.76188782165303, + "grad_norm": 0.5665635218866482, + "learning_rate": 1.15447988322761e-05, + "loss": 0.0777, + "step": 23291 + }, + { + "epoch": 2.762006403415155, + "grad_norm": 0.7918626368213728, + "learning_rate": 1.154277584108616e-05, + "loss": 0.1171, + "step": 23292 + }, + { + "epoch": 2.76212498517728, + "grad_norm": 0.46444171727530853, + "learning_rate": 1.1540752973955305e-05, + "loss": 0.064, + "step": 23293 + }, + { + "epoch": 2.7622435669394045, + "grad_norm": 0.8922312189161457, + "learning_rate": 1.1538730230902222e-05, + "loss": 0.0995, + "step": 23294 + }, + { + "epoch": 2.7623621487015297, + "grad_norm": 0.8093222293804998, + "learning_rate": 1.1536707611945529e-05, + "loss": 0.1219, + "step": 23295 + }, + { + "epoch": 2.762480730463655, + "grad_norm": 0.7851358563393909, + "learning_rate": 1.1534685117103883e-05, + "loss": 0.105, + "step": 23296 + }, + { + "epoch": 2.7625993122257797, + "grad_norm": 0.4576093595737123, + "learning_rate": 1.1532662746395928e-05, + "loss": 0.0576, + "step": 23297 + }, + { + "epoch": 2.7627178939879045, + "grad_norm": 0.5444537439459317, + "learning_rate": 1.1530640499840317e-05, + "loss": 0.0656, + "step": 23298 + }, + { + "epoch": 2.7628364757500297, + "grad_norm": 0.6582817680778555, + "learning_rate": 1.1528618377455675e-05, + "loss": 0.0821, + "step": 23299 + }, + { + "epoch": 2.762955057512155, + "grad_norm": 0.505184720603796, + "learning_rate": 1.1526596379260649e-05, + "loss": 0.0612, + "step": 23300 + }, + { + "epoch": 2.7630736392742796, + "grad_norm": 0.794154853980335, + "learning_rate": 1.1524574505273883e-05, + "loss": 0.1029, + "step": 23301 + }, + { + "epoch": 2.7631922210364044, + "grad_norm": 0.539451222815244, + "learning_rate": 1.1522552755514016e-05, + "loss": 0.0764, + "step": 23302 + }, + { + "epoch": 2.7633108027985296, + "grad_norm": 0.6895413125412634, + "learning_rate": 1.1520531129999695e-05, + "loss": 0.0894, + "step": 23303 + }, + { + "epoch": 2.763429384560655, + "grad_norm": 0.4813650381435795, + "learning_rate": 1.1518509628749529e-05, + "loss": 0.0669, + "step": 23304 + }, + { + "epoch": 2.7635479663227795, + "grad_norm": 0.4856189387824942, + "learning_rate": 1.151648825178219e-05, + "loss": 0.0589, + "step": 23305 + }, + { + "epoch": 2.7636665480849043, + "grad_norm": 0.6221462204652989, + "learning_rate": 1.1514466999116286e-05, + "loss": 0.0815, + "step": 23306 + }, + { + "epoch": 2.7637851298470295, + "grad_norm": 0.5302939001522381, + "learning_rate": 1.1512445870770458e-05, + "loss": 0.0573, + "step": 23307 + }, + { + "epoch": 2.7639037116091547, + "grad_norm": 0.6888417807278513, + "learning_rate": 1.1510424866763341e-05, + "loss": 0.0871, + "step": 23308 + }, + { + "epoch": 2.7640222933712795, + "grad_norm": 0.6251860628045091, + "learning_rate": 1.1508403987113563e-05, + "loss": 0.0801, + "step": 23309 + }, + { + "epoch": 2.7641408751334042, + "grad_norm": 0.5388388846968905, + "learning_rate": 1.1506383231839767e-05, + "loss": 0.0798, + "step": 23310 + }, + { + "epoch": 2.7642594568955294, + "grad_norm": 0.7559227987560897, + "learning_rate": 1.1504362600960552e-05, + "loss": 0.1076, + "step": 23311 + }, + { + "epoch": 2.7643780386576546, + "grad_norm": 0.548984212307179, + "learning_rate": 1.1502342094494584e-05, + "loss": 0.0776, + "step": 23312 + }, + { + "epoch": 2.7644966204197794, + "grad_norm": 0.653059990482883, + "learning_rate": 1.1500321712460457e-05, + "loss": 0.0854, + "step": 23313 + }, + { + "epoch": 2.764615202181904, + "grad_norm": 0.7876343906844039, + "learning_rate": 1.1498301454876822e-05, + "loss": 0.1079, + "step": 23314 + }, + { + "epoch": 2.7647337839440294, + "grad_norm": 0.7448892793079768, + "learning_rate": 1.149628132176227e-05, + "loss": 0.0708, + "step": 23315 + }, + { + "epoch": 2.7648523657061546, + "grad_norm": 0.6658886928423825, + "learning_rate": 1.1494261313135466e-05, + "loss": 0.08, + "step": 23316 + }, + { + "epoch": 2.7649709474682793, + "grad_norm": 0.7739774515176077, + "learning_rate": 1.1492241429014999e-05, + "loss": 0.0762, + "step": 23317 + }, + { + "epoch": 2.7650895292304045, + "grad_norm": 0.5776154862076927, + "learning_rate": 1.1490221669419501e-05, + "loss": 0.0767, + "step": 23318 + }, + { + "epoch": 2.7652081109925293, + "grad_norm": 0.46537944833319633, + "learning_rate": 1.1488202034367596e-05, + "loss": 0.0506, + "step": 23319 + }, + { + "epoch": 2.7653266927546545, + "grad_norm": 0.5448760824536825, + "learning_rate": 1.1486182523877895e-05, + "loss": 0.0792, + "step": 23320 + }, + { + "epoch": 2.7654452745167792, + "grad_norm": 0.4462326028866552, + "learning_rate": 1.148416313796903e-05, + "loss": 0.0537, + "step": 23321 + }, + { + "epoch": 2.7655638562789044, + "grad_norm": 0.5705133966073183, + "learning_rate": 1.148214387665959e-05, + "loss": 0.0779, + "step": 23322 + }, + { + "epoch": 2.765682438041029, + "grad_norm": 0.7417066776767419, + "learning_rate": 1.1480124739968223e-05, + "loss": 0.1124, + "step": 23323 + }, + { + "epoch": 2.7658010198031544, + "grad_norm": 0.6634549195528942, + "learning_rate": 1.1478105727913518e-05, + "loss": 0.0726, + "step": 23324 + }, + { + "epoch": 2.765919601565279, + "grad_norm": 0.6394946677835075, + "learning_rate": 1.1476086840514107e-05, + "loss": 0.0792, + "step": 23325 + }, + { + "epoch": 2.7660381833274044, + "grad_norm": 0.8017176771985788, + "learning_rate": 1.1474068077788572e-05, + "loss": 0.0872, + "step": 23326 + }, + { + "epoch": 2.766156765089529, + "grad_norm": 0.7098987646759262, + "learning_rate": 1.1472049439755561e-05, + "loss": 0.1023, + "step": 23327 + }, + { + "epoch": 2.7662753468516543, + "grad_norm": 0.9067473373385154, + "learning_rate": 1.1470030926433659e-05, + "loss": 0.0843, + "step": 23328 + }, + { + "epoch": 2.766393928613779, + "grad_norm": 0.46563506577301267, + "learning_rate": 1.1468012537841477e-05, + "loss": 0.0719, + "step": 23329 + }, + { + "epoch": 2.7665125103759043, + "grad_norm": 0.37837519169115247, + "learning_rate": 1.1465994273997623e-05, + "loss": 0.0541, + "step": 23330 + }, + { + "epoch": 2.766631092138029, + "grad_norm": 0.5709801762622949, + "learning_rate": 1.1463976134920709e-05, + "loss": 0.0687, + "step": 23331 + }, + { + "epoch": 2.7667496739001542, + "grad_norm": 0.7055184252950649, + "learning_rate": 1.1461958120629347e-05, + "loss": 0.0903, + "step": 23332 + }, + { + "epoch": 2.766868255662279, + "grad_norm": 0.4641426089532821, + "learning_rate": 1.1459940231142116e-05, + "loss": 0.0603, + "step": 23333 + }, + { + "epoch": 2.766986837424404, + "grad_norm": 0.6595292511284725, + "learning_rate": 1.1457922466477633e-05, + "loss": 0.1007, + "step": 23334 + }, + { + "epoch": 2.767105419186529, + "grad_norm": 0.47117515132971005, + "learning_rate": 1.1455904826654501e-05, + "loss": 0.064, + "step": 23335 + }, + { + "epoch": 2.767224000948654, + "grad_norm": 0.9393848472851971, + "learning_rate": 1.1453887311691325e-05, + "loss": 0.123, + "step": 23336 + }, + { + "epoch": 2.767342582710779, + "grad_norm": 0.737654421065937, + "learning_rate": 1.1451869921606678e-05, + "loss": 0.094, + "step": 23337 + }, + { + "epoch": 2.767461164472904, + "grad_norm": 0.5741057997960162, + "learning_rate": 1.1449852656419196e-05, + "loss": 0.0717, + "step": 23338 + }, + { + "epoch": 2.767579746235029, + "grad_norm": 0.8468171731757752, + "learning_rate": 1.1447835516147446e-05, + "loss": 0.1237, + "step": 23339 + }, + { + "epoch": 2.767698327997154, + "grad_norm": 0.5350642536662792, + "learning_rate": 1.1445818500810035e-05, + "loss": 0.0825, + "step": 23340 + }, + { + "epoch": 2.7678169097592793, + "grad_norm": 0.8982257950633143, + "learning_rate": 1.1443801610425555e-05, + "loss": 0.1249, + "step": 23341 + }, + { + "epoch": 2.767935491521404, + "grad_norm": 0.5855653357882245, + "learning_rate": 1.1441784845012602e-05, + "loss": 0.0889, + "step": 23342 + }, + { + "epoch": 2.768054073283529, + "grad_norm": 0.7692341243865607, + "learning_rate": 1.1439768204589774e-05, + "loss": 0.1054, + "step": 23343 + }, + { + "epoch": 2.768172655045654, + "grad_norm": 0.5728546993457404, + "learning_rate": 1.1437751689175647e-05, + "loss": 0.0778, + "step": 23344 + }, + { + "epoch": 2.768291236807779, + "grad_norm": 0.6192168893400326, + "learning_rate": 1.1435735298788816e-05, + "loss": 0.0788, + "step": 23345 + }, + { + "epoch": 2.768409818569904, + "grad_norm": 0.7936938617041035, + "learning_rate": 1.1433719033447873e-05, + "loss": 0.1044, + "step": 23346 + }, + { + "epoch": 2.7685284003320287, + "grad_norm": 0.547924720873421, + "learning_rate": 1.1431702893171414e-05, + "loss": 0.088, + "step": 23347 + }, + { + "epoch": 2.768646982094154, + "grad_norm": 0.5610784004624985, + "learning_rate": 1.1429686877978005e-05, + "loss": 0.0779, + "step": 23348 + }, + { + "epoch": 2.768765563856279, + "grad_norm": 0.7195750973540213, + "learning_rate": 1.1427670987886244e-05, + "loss": 0.0957, + "step": 23349 + }, + { + "epoch": 2.768884145618404, + "grad_norm": 0.7515808115424233, + "learning_rate": 1.1425655222914713e-05, + "loss": 0.1134, + "step": 23350 + }, + { + "epoch": 2.7690027273805287, + "grad_norm": 0.48379055752324496, + "learning_rate": 1.1423639583081994e-05, + "loss": 0.0604, + "step": 23351 + }, + { + "epoch": 2.769121309142654, + "grad_norm": 0.5486073344076978, + "learning_rate": 1.1421624068406669e-05, + "loss": 0.0827, + "step": 23352 + }, + { + "epoch": 2.769239890904779, + "grad_norm": 0.7772881082960681, + "learning_rate": 1.141960867890732e-05, + "loss": 0.0937, + "step": 23353 + }, + { + "epoch": 2.769358472666904, + "grad_norm": 0.6981563216572481, + "learning_rate": 1.1417593414602534e-05, + "loss": 0.1012, + "step": 23354 + }, + { + "epoch": 2.7694770544290286, + "grad_norm": 0.49997338205292047, + "learning_rate": 1.1415578275510874e-05, + "loss": 0.0743, + "step": 23355 + }, + { + "epoch": 2.769595636191154, + "grad_norm": 0.5883682065953693, + "learning_rate": 1.1413563261650923e-05, + "loss": 0.0716, + "step": 23356 + }, + { + "epoch": 2.769714217953279, + "grad_norm": 0.5370651452180656, + "learning_rate": 1.1411548373041256e-05, + "loss": 0.0814, + "step": 23357 + }, + { + "epoch": 2.7698327997154037, + "grad_norm": 0.6621706147524129, + "learning_rate": 1.1409533609700451e-05, + "loss": 0.0871, + "step": 23358 + }, + { + "epoch": 2.7699513814775285, + "grad_norm": 0.6409495134611329, + "learning_rate": 1.140751897164709e-05, + "loss": 0.0904, + "step": 23359 + }, + { + "epoch": 2.7700699632396537, + "grad_norm": 0.53667660135656, + "learning_rate": 1.1405504458899719e-05, + "loss": 0.0662, + "step": 23360 + }, + { + "epoch": 2.770188545001779, + "grad_norm": 0.7542789881243879, + "learning_rate": 1.140349007147694e-05, + "loss": 0.1142, + "step": 23361 + }, + { + "epoch": 2.7703071267639037, + "grad_norm": 0.7099627898056768, + "learning_rate": 1.1401475809397306e-05, + "loss": 0.0799, + "step": 23362 + }, + { + "epoch": 2.7704257085260284, + "grad_norm": 0.7916353591707399, + "learning_rate": 1.1399461672679396e-05, + "loss": 0.1121, + "step": 23363 + }, + { + "epoch": 2.7705442902881536, + "grad_norm": 0.45507026604928325, + "learning_rate": 1.139744766134175e-05, + "loss": 0.0651, + "step": 23364 + }, + { + "epoch": 2.770662872050279, + "grad_norm": 0.6668814071840288, + "learning_rate": 1.1395433775402977e-05, + "loss": 0.1073, + "step": 23365 + }, + { + "epoch": 2.7707814538124036, + "grad_norm": 0.8745505513566292, + "learning_rate": 1.1393420014881615e-05, + "loss": 0.1252, + "step": 23366 + }, + { + "epoch": 2.770900035574529, + "grad_norm": 0.7363926793999217, + "learning_rate": 1.1391406379796232e-05, + "loss": 0.0992, + "step": 23367 + }, + { + "epoch": 2.7710186173366536, + "grad_norm": 0.4161016289198815, + "learning_rate": 1.1389392870165395e-05, + "loss": 0.055, + "step": 23368 + }, + { + "epoch": 2.7711371990987788, + "grad_norm": 0.6919787471634709, + "learning_rate": 1.1387379486007662e-05, + "loss": 0.0868, + "step": 23369 + }, + { + "epoch": 2.7712557808609035, + "grad_norm": 0.5464815170727892, + "learning_rate": 1.138536622734161e-05, + "loss": 0.0948, + "step": 23370 + }, + { + "epoch": 2.7713743626230287, + "grad_norm": 0.7340054691771282, + "learning_rate": 1.1383353094185767e-05, + "loss": 0.0903, + "step": 23371 + }, + { + "epoch": 2.7714929443851535, + "grad_norm": 0.5296227315584351, + "learning_rate": 1.1381340086558729e-05, + "loss": 0.0773, + "step": 23372 + }, + { + "epoch": 2.7716115261472787, + "grad_norm": 0.945135670142948, + "learning_rate": 1.1379327204479026e-05, + "loss": 0.0852, + "step": 23373 + }, + { + "epoch": 2.7717301079094034, + "grad_norm": 0.7888145318470273, + "learning_rate": 1.1377314447965231e-05, + "loss": 0.0974, + "step": 23374 + }, + { + "epoch": 2.7718486896715286, + "grad_norm": 0.5861396644734556, + "learning_rate": 1.1375301817035877e-05, + "loss": 0.0727, + "step": 23375 + }, + { + "epoch": 2.7719672714336534, + "grad_norm": 0.6379101022446391, + "learning_rate": 1.1373289311709548e-05, + "loss": 0.0964, + "step": 23376 + }, + { + "epoch": 2.7720858531957786, + "grad_norm": 0.5497181388919203, + "learning_rate": 1.1371276932004774e-05, + "loss": 0.0575, + "step": 23377 + }, + { + "epoch": 2.7722044349579034, + "grad_norm": 0.6854865097937454, + "learning_rate": 1.1369264677940112e-05, + "loss": 0.0988, + "step": 23378 + }, + { + "epoch": 2.7723230167200286, + "grad_norm": 0.44659065601487286, + "learning_rate": 1.1367252549534116e-05, + "loss": 0.054, + "step": 23379 + }, + { + "epoch": 2.7724415984821533, + "grad_norm": 0.7234591094438547, + "learning_rate": 1.1365240546805331e-05, + "loss": 0.0996, + "step": 23380 + }, + { + "epoch": 2.7725601802442785, + "grad_norm": 0.6941569146083866, + "learning_rate": 1.1363228669772321e-05, + "loss": 0.0829, + "step": 23381 + }, + { + "epoch": 2.7726787620064033, + "grad_norm": 0.742955521816391, + "learning_rate": 1.13612169184536e-05, + "loss": 0.1014, + "step": 23382 + }, + { + "epoch": 2.7727973437685285, + "grad_norm": 0.6934556287992936, + "learning_rate": 1.1359205292867754e-05, + "loss": 0.0893, + "step": 23383 + }, + { + "epoch": 2.7729159255306532, + "grad_norm": 0.5126996984529428, + "learning_rate": 1.1357193793033297e-05, + "loss": 0.0689, + "step": 23384 + }, + { + "epoch": 2.7730345072927784, + "grad_norm": 0.49324541001856353, + "learning_rate": 1.1355182418968791e-05, + "loss": 0.0656, + "step": 23385 + }, + { + "epoch": 2.773153089054903, + "grad_norm": 0.5782235802757719, + "learning_rate": 1.1353171170692755e-05, + "loss": 0.0779, + "step": 23386 + }, + { + "epoch": 2.7732716708170284, + "grad_norm": 0.7313779843211881, + "learning_rate": 1.1351160048223764e-05, + "loss": 0.1065, + "step": 23387 + }, + { + "epoch": 2.773390252579153, + "grad_norm": 0.6747690931764576, + "learning_rate": 1.1349149051580329e-05, + "loss": 0.0718, + "step": 23388 + }, + { + "epoch": 2.7735088343412784, + "grad_norm": 0.7624767170465404, + "learning_rate": 1.1347138180781003e-05, + "loss": 0.114, + "step": 23389 + }, + { + "epoch": 2.773627416103403, + "grad_norm": 0.6396540037061718, + "learning_rate": 1.134512743584432e-05, + "loss": 0.0806, + "step": 23390 + }, + { + "epoch": 2.7737459978655283, + "grad_norm": 0.6363896486785939, + "learning_rate": 1.1343116816788815e-05, + "loss": 0.0821, + "step": 23391 + }, + { + "epoch": 2.773864579627653, + "grad_norm": 0.6699315860091096, + "learning_rate": 1.1341106323633036e-05, + "loss": 0.0678, + "step": 23392 + }, + { + "epoch": 2.7739831613897783, + "grad_norm": 0.5829622522369377, + "learning_rate": 1.1339095956395488e-05, + "loss": 0.0722, + "step": 23393 + }, + { + "epoch": 2.7741017431519035, + "grad_norm": 0.5885598194121422, + "learning_rate": 1.1337085715094743e-05, + "loss": 0.0897, + "step": 23394 + }, + { + "epoch": 2.7742203249140283, + "grad_norm": 0.6903262979015492, + "learning_rate": 1.1335075599749304e-05, + "loss": 0.1006, + "step": 23395 + }, + { + "epoch": 2.774338906676153, + "grad_norm": 0.9352921979957686, + "learning_rate": 1.133306561037772e-05, + "loss": 0.1449, + "step": 23396 + }, + { + "epoch": 2.774457488438278, + "grad_norm": 0.6504432537003031, + "learning_rate": 1.1331055746998493e-05, + "loss": 0.0712, + "step": 23397 + }, + { + "epoch": 2.7745760702004034, + "grad_norm": 0.5477667416647569, + "learning_rate": 1.1329046009630187e-05, + "loss": 0.0576, + "step": 23398 + }, + { + "epoch": 2.774694651962528, + "grad_norm": 0.6496991031399796, + "learning_rate": 1.1327036398291304e-05, + "loss": 0.1025, + "step": 23399 + }, + { + "epoch": 2.774813233724653, + "grad_norm": 0.4666977099580574, + "learning_rate": 1.1325026913000378e-05, + "loss": 0.0729, + "step": 23400 + }, + { + "epoch": 2.774931815486778, + "grad_norm": 0.5293185400135155, + "learning_rate": 1.1323017553775936e-05, + "loss": 0.0857, + "step": 23401 + }, + { + "epoch": 2.7750503972489033, + "grad_norm": 0.48627569457089326, + "learning_rate": 1.13210083206365e-05, + "loss": 0.0737, + "step": 23402 + }, + { + "epoch": 2.775168979011028, + "grad_norm": 0.45805963640638914, + "learning_rate": 1.1318999213600603e-05, + "loss": 0.0604, + "step": 23403 + }, + { + "epoch": 2.775287560773153, + "grad_norm": 0.5417810624275166, + "learning_rate": 1.1316990232686745e-05, + "loss": 0.0704, + "step": 23404 + }, + { + "epoch": 2.775406142535278, + "grad_norm": 0.6853309247786881, + "learning_rate": 1.1314981377913459e-05, + "loss": 0.1141, + "step": 23405 + }, + { + "epoch": 2.7755247242974033, + "grad_norm": 0.5338295154476604, + "learning_rate": 1.1312972649299264e-05, + "loss": 0.0614, + "step": 23406 + }, + { + "epoch": 2.775643306059528, + "grad_norm": 0.6712073322349924, + "learning_rate": 1.1310964046862676e-05, + "loss": 0.0668, + "step": 23407 + }, + { + "epoch": 2.775761887821653, + "grad_norm": 0.7405397344719932, + "learning_rate": 1.1308955570622212e-05, + "loss": 0.0939, + "step": 23408 + }, + { + "epoch": 2.775880469583778, + "grad_norm": 0.8364662860923527, + "learning_rate": 1.1306947220596389e-05, + "loss": 0.123, + "step": 23409 + }, + { + "epoch": 2.775999051345903, + "grad_norm": 0.7164374863204429, + "learning_rate": 1.1304938996803732e-05, + "loss": 0.0898, + "step": 23410 + }, + { + "epoch": 2.776117633108028, + "grad_norm": 0.4272130960707243, + "learning_rate": 1.1302930899262734e-05, + "loss": 0.048, + "step": 23411 + }, + { + "epoch": 2.7762362148701527, + "grad_norm": 0.6537685553592821, + "learning_rate": 1.1300922927991913e-05, + "loss": 0.0911, + "step": 23412 + }, + { + "epoch": 2.776354796632278, + "grad_norm": 0.7987374489929583, + "learning_rate": 1.1298915083009787e-05, + "loss": 0.0878, + "step": 23413 + }, + { + "epoch": 2.776473378394403, + "grad_norm": 0.6359388984102033, + "learning_rate": 1.1296907364334872e-05, + "loss": 0.0582, + "step": 23414 + }, + { + "epoch": 2.776591960156528, + "grad_norm": 0.6306017588761372, + "learning_rate": 1.1294899771985657e-05, + "loss": 0.0839, + "step": 23415 + }, + { + "epoch": 2.776710541918653, + "grad_norm": 0.5303771852182746, + "learning_rate": 1.129289230598066e-05, + "loss": 0.0851, + "step": 23416 + }, + { + "epoch": 2.776829123680778, + "grad_norm": 0.5262721251144085, + "learning_rate": 1.1290884966338388e-05, + "loss": 0.0799, + "step": 23417 + }, + { + "epoch": 2.776947705442903, + "grad_norm": 0.7592412015378753, + "learning_rate": 1.1288877753077346e-05, + "loss": 0.0819, + "step": 23418 + }, + { + "epoch": 2.777066287205028, + "grad_norm": 0.6289334901514505, + "learning_rate": 1.1286870666216043e-05, + "loss": 0.0847, + "step": 23419 + }, + { + "epoch": 2.777184868967153, + "grad_norm": 0.6161980919296172, + "learning_rate": 1.1284863705772961e-05, + "loss": 0.0774, + "step": 23420 + }, + { + "epoch": 2.7773034507292778, + "grad_norm": 0.4469107934819624, + "learning_rate": 1.1282856871766634e-05, + "loss": 0.0668, + "step": 23421 + }, + { + "epoch": 2.777422032491403, + "grad_norm": 0.7742595820781518, + "learning_rate": 1.1280850164215537e-05, + "loss": 0.0972, + "step": 23422 + }, + { + "epoch": 2.7775406142535277, + "grad_norm": 0.8023681302508702, + "learning_rate": 1.127884358313818e-05, + "loss": 0.1154, + "step": 23423 + }, + { + "epoch": 2.777659196015653, + "grad_norm": 0.4821847835482715, + "learning_rate": 1.1276837128553055e-05, + "loss": 0.0566, + "step": 23424 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 0.532604842604301, + "learning_rate": 1.1274830800478674e-05, + "loss": 0.079, + "step": 23425 + }, + { + "epoch": 2.777896359539903, + "grad_norm": 0.6810571639025529, + "learning_rate": 1.1272824598933515e-05, + "loss": 0.0976, + "step": 23426 + }, + { + "epoch": 2.7780149413020276, + "grad_norm": 0.470918066238956, + "learning_rate": 1.1270818523936078e-05, + "loss": 0.0541, + "step": 23427 + }, + { + "epoch": 2.778133523064153, + "grad_norm": 0.8312389584111695, + "learning_rate": 1.1268812575504858e-05, + "loss": 0.0964, + "step": 23428 + }, + { + "epoch": 2.7782521048262776, + "grad_norm": 0.570237066460678, + "learning_rate": 1.126680675365835e-05, + "loss": 0.0843, + "step": 23429 + }, + { + "epoch": 2.778370686588403, + "grad_norm": 0.6971174895239831, + "learning_rate": 1.1264801058415048e-05, + "loss": 0.1085, + "step": 23430 + }, + { + "epoch": 2.7784892683505276, + "grad_norm": 1.0652720386557615, + "learning_rate": 1.126279548979342e-05, + "loss": 0.118, + "step": 23431 + }, + { + "epoch": 2.7786078501126528, + "grad_norm": 0.6056262957213158, + "learning_rate": 1.1260790047811993e-05, + "loss": 0.0859, + "step": 23432 + }, + { + "epoch": 2.7787264318747775, + "grad_norm": 0.6108381834806963, + "learning_rate": 1.1258784732489221e-05, + "loss": 0.0787, + "step": 23433 + }, + { + "epoch": 2.7788450136369027, + "grad_norm": 0.6663302154018853, + "learning_rate": 1.1256779543843616e-05, + "loss": 0.0931, + "step": 23434 + }, + { + "epoch": 2.7789635953990275, + "grad_norm": 0.5494370359363738, + "learning_rate": 1.1254774481893632e-05, + "loss": 0.0886, + "step": 23435 + }, + { + "epoch": 2.7790821771611527, + "grad_norm": 0.6283477671677368, + "learning_rate": 1.1252769546657788e-05, + "loss": 0.0751, + "step": 23436 + }, + { + "epoch": 2.7792007589232774, + "grad_norm": 0.7279076356755718, + "learning_rate": 1.1250764738154543e-05, + "loss": 0.0639, + "step": 23437 + }, + { + "epoch": 2.7793193406854027, + "grad_norm": 0.845724506704306, + "learning_rate": 1.1248760056402386e-05, + "loss": 0.1077, + "step": 23438 + }, + { + "epoch": 2.7794379224475274, + "grad_norm": 0.603552704210284, + "learning_rate": 1.1246755501419797e-05, + "loss": 0.075, + "step": 23439 + }, + { + "epoch": 2.7795565042096526, + "grad_norm": 0.610258779230439, + "learning_rate": 1.1244751073225257e-05, + "loss": 0.0915, + "step": 23440 + }, + { + "epoch": 2.7796750859717774, + "grad_norm": 0.6081004399159734, + "learning_rate": 1.1242746771837256e-05, + "loss": 0.067, + "step": 23441 + }, + { + "epoch": 2.7797936677339026, + "grad_norm": 0.6791452516700297, + "learning_rate": 1.124074259727424e-05, + "loss": 0.0981, + "step": 23442 + }, + { + "epoch": 2.7799122494960278, + "grad_norm": 0.8201242855706209, + "learning_rate": 1.1238738549554722e-05, + "loss": 0.102, + "step": 23443 + }, + { + "epoch": 2.7800308312581525, + "grad_norm": 0.588990882440281, + "learning_rate": 1.123673462869715e-05, + "loss": 0.0841, + "step": 23444 + }, + { + "epoch": 2.7801494130202773, + "grad_norm": 0.556993754250878, + "learning_rate": 1.123473083472002e-05, + "loss": 0.0659, + "step": 23445 + }, + { + "epoch": 2.7802679947824025, + "grad_norm": 0.5045315666076647, + "learning_rate": 1.1232727167641768e-05, + "loss": 0.0558, + "step": 23446 + }, + { + "epoch": 2.7803865765445277, + "grad_norm": 0.6534574266477436, + "learning_rate": 1.123072362748091e-05, + "loss": 0.0711, + "step": 23447 + }, + { + "epoch": 2.7805051583066525, + "grad_norm": 0.7167052592650225, + "learning_rate": 1.1228720214255886e-05, + "loss": 0.085, + "step": 23448 + }, + { + "epoch": 2.780623740068777, + "grad_norm": 0.612888435768226, + "learning_rate": 1.1226716927985176e-05, + "loss": 0.0753, + "step": 23449 + }, + { + "epoch": 2.7807423218309024, + "grad_norm": 0.7784823921815903, + "learning_rate": 1.1224713768687243e-05, + "loss": 0.0748, + "step": 23450 + }, + { + "epoch": 2.7808609035930276, + "grad_norm": 0.5162271459894991, + "learning_rate": 1.1222710736380557e-05, + "loss": 0.0598, + "step": 23451 + }, + { + "epoch": 2.7809794853551524, + "grad_norm": 0.7549309166560052, + "learning_rate": 1.1220707831083593e-05, + "loss": 0.0727, + "step": 23452 + }, + { + "epoch": 2.781098067117277, + "grad_norm": 0.6835079449399556, + "learning_rate": 1.1218705052814788e-05, + "loss": 0.0864, + "step": 23453 + }, + { + "epoch": 2.7812166488794023, + "grad_norm": 0.9241988238358865, + "learning_rate": 1.1216702401592638e-05, + "loss": 0.0947, + "step": 23454 + }, + { + "epoch": 2.7813352306415275, + "grad_norm": 0.8975612506561685, + "learning_rate": 1.1214699877435584e-05, + "loss": 0.1087, + "step": 23455 + }, + { + "epoch": 2.7814538124036523, + "grad_norm": 0.49290726169435695, + "learning_rate": 1.1212697480362092e-05, + "loss": 0.0576, + "step": 23456 + }, + { + "epoch": 2.781572394165777, + "grad_norm": 0.5147423779811976, + "learning_rate": 1.121069521039062e-05, + "loss": 0.0726, + "step": 23457 + }, + { + "epoch": 2.7816909759279023, + "grad_norm": 0.4940548261934171, + "learning_rate": 1.120869306753963e-05, + "loss": 0.0632, + "step": 23458 + }, + { + "epoch": 2.7818095576900275, + "grad_norm": 0.9873955003405185, + "learning_rate": 1.1206691051827587e-05, + "loss": 0.1027, + "step": 23459 + }, + { + "epoch": 2.7819281394521522, + "grad_norm": 0.8376053718581348, + "learning_rate": 1.120468916327293e-05, + "loss": 0.1098, + "step": 23460 + }, + { + "epoch": 2.782046721214277, + "grad_norm": 0.6619117672145138, + "learning_rate": 1.120268740189412e-05, + "loss": 0.0973, + "step": 23461 + }, + { + "epoch": 2.782165302976402, + "grad_norm": 0.8134380274847055, + "learning_rate": 1.1200685767709613e-05, + "loss": 0.0945, + "step": 23462 + }, + { + "epoch": 2.7822838847385274, + "grad_norm": 0.5811536555515214, + "learning_rate": 1.119868426073787e-05, + "loss": 0.0682, + "step": 23463 + }, + { + "epoch": 2.782402466500652, + "grad_norm": 0.6035045089928263, + "learning_rate": 1.1196682880997328e-05, + "loss": 0.0597, + "step": 23464 + }, + { + "epoch": 2.782521048262777, + "grad_norm": 0.626328831919556, + "learning_rate": 1.119468162850644e-05, + "loss": 0.0879, + "step": 23465 + }, + { + "epoch": 2.782639630024902, + "grad_norm": 0.6240672075564561, + "learning_rate": 1.119268050328366e-05, + "loss": 0.074, + "step": 23466 + }, + { + "epoch": 2.7827582117870273, + "grad_norm": 0.803402551626901, + "learning_rate": 1.1190679505347433e-05, + "loss": 0.0961, + "step": 23467 + }, + { + "epoch": 2.782876793549152, + "grad_norm": 0.5287979135709724, + "learning_rate": 1.1188678634716207e-05, + "loss": 0.0815, + "step": 23468 + }, + { + "epoch": 2.7829953753112773, + "grad_norm": 0.568364163332609, + "learning_rate": 1.1186677891408425e-05, + "loss": 0.0811, + "step": 23469 + }, + { + "epoch": 2.783113957073402, + "grad_norm": 0.6304454981956065, + "learning_rate": 1.1184677275442548e-05, + "loss": 0.0709, + "step": 23470 + }, + { + "epoch": 2.7832325388355272, + "grad_norm": 0.5699722596827271, + "learning_rate": 1.1182676786836994e-05, + "loss": 0.0723, + "step": 23471 + }, + { + "epoch": 2.783351120597652, + "grad_norm": 0.5971293391273854, + "learning_rate": 1.1180676425610214e-05, + "loss": 0.0766, + "step": 23472 + }, + { + "epoch": 2.783469702359777, + "grad_norm": 0.6066639618925744, + "learning_rate": 1.1178676191780651e-05, + "loss": 0.0962, + "step": 23473 + }, + { + "epoch": 2.783588284121902, + "grad_norm": 0.9015036971421962, + "learning_rate": 1.1176676085366757e-05, + "loss": 0.1021, + "step": 23474 + }, + { + "epoch": 2.783706865884027, + "grad_norm": 0.6717263611140653, + "learning_rate": 1.1174676106386948e-05, + "loss": 0.0885, + "step": 23475 + }, + { + "epoch": 2.783825447646152, + "grad_norm": 0.9319969793067343, + "learning_rate": 1.1172676254859671e-05, + "loss": 0.1398, + "step": 23476 + }, + { + "epoch": 2.783944029408277, + "grad_norm": 0.5164587545153685, + "learning_rate": 1.1170676530803361e-05, + "loss": 0.0651, + "step": 23477 + }, + { + "epoch": 2.784062611170402, + "grad_norm": 1.011169895731756, + "learning_rate": 1.1168676934236458e-05, + "loss": 0.1251, + "step": 23478 + }, + { + "epoch": 2.784181192932527, + "grad_norm": 0.7119900996439639, + "learning_rate": 1.1166677465177389e-05, + "loss": 0.1025, + "step": 23479 + }, + { + "epoch": 2.784299774694652, + "grad_norm": 0.5501909472383811, + "learning_rate": 1.1164678123644592e-05, + "loss": 0.0741, + "step": 23480 + }, + { + "epoch": 2.784418356456777, + "grad_norm": 0.631576968311274, + "learning_rate": 1.1162678909656508e-05, + "loss": 0.0676, + "step": 23481 + }, + { + "epoch": 2.784536938218902, + "grad_norm": 0.6539996602266178, + "learning_rate": 1.1160679823231542e-05, + "loss": 0.0965, + "step": 23482 + }, + { + "epoch": 2.784655519981027, + "grad_norm": 0.740754414704883, + "learning_rate": 1.1158680864388141e-05, + "loss": 0.0989, + "step": 23483 + }, + { + "epoch": 2.7847741017431518, + "grad_norm": 0.7634668640623319, + "learning_rate": 1.115668203314473e-05, + "loss": 0.091, + "step": 23484 + }, + { + "epoch": 2.784892683505277, + "grad_norm": 0.706710783243423, + "learning_rate": 1.115468332951974e-05, + "loss": 0.1147, + "step": 23485 + }, + { + "epoch": 2.7850112652674017, + "grad_norm": 0.7672791685751229, + "learning_rate": 1.1152684753531586e-05, + "loss": 0.1093, + "step": 23486 + }, + { + "epoch": 2.785129847029527, + "grad_norm": 0.6207090623249102, + "learning_rate": 1.1150686305198697e-05, + "loss": 0.0814, + "step": 23487 + }, + { + "epoch": 2.7852484287916517, + "grad_norm": 0.9007226900579376, + "learning_rate": 1.1148687984539497e-05, + "loss": 0.1005, + "step": 23488 + }, + { + "epoch": 2.785367010553777, + "grad_norm": 0.6141684648072834, + "learning_rate": 1.1146689791572407e-05, + "loss": 0.062, + "step": 23489 + }, + { + "epoch": 2.7854855923159016, + "grad_norm": 0.6623831922357807, + "learning_rate": 1.1144691726315857e-05, + "loss": 0.0857, + "step": 23490 + }, + { + "epoch": 2.785604174078027, + "grad_norm": 0.5284856008294956, + "learning_rate": 1.1142693788788241e-05, + "loss": 0.0677, + "step": 23491 + }, + { + "epoch": 2.785722755840152, + "grad_norm": 0.7588908160261999, + "learning_rate": 1.1140695979008017e-05, + "loss": 0.1018, + "step": 23492 + }, + { + "epoch": 2.785841337602277, + "grad_norm": 0.8162243606879179, + "learning_rate": 1.1138698296993568e-05, + "loss": 0.0994, + "step": 23493 + }, + { + "epoch": 2.7859599193644016, + "grad_norm": 0.6466143413237161, + "learning_rate": 1.1136700742763327e-05, + "loss": 0.0723, + "step": 23494 + }, + { + "epoch": 2.7860785011265268, + "grad_norm": 0.8059849747205549, + "learning_rate": 1.1134703316335701e-05, + "loss": 0.103, + "step": 23495 + }, + { + "epoch": 2.786197082888652, + "grad_norm": 0.8718699518958477, + "learning_rate": 1.1132706017729117e-05, + "loss": 0.0979, + "step": 23496 + }, + { + "epoch": 2.7863156646507767, + "grad_norm": 0.6083779735028215, + "learning_rate": 1.1130708846961971e-05, + "loss": 0.0705, + "step": 23497 + }, + { + "epoch": 2.7864342464129015, + "grad_norm": 0.4795914345781358, + "learning_rate": 1.112871180405268e-05, + "loss": 0.0576, + "step": 23498 + }, + { + "epoch": 2.7865528281750267, + "grad_norm": 0.544732258130227, + "learning_rate": 1.1126714889019657e-05, + "loss": 0.0658, + "step": 23499 + }, + { + "epoch": 2.786671409937152, + "grad_norm": 0.5130214911146745, + "learning_rate": 1.1124718101881309e-05, + "loss": 0.0745, + "step": 23500 + }, + { + "epoch": 2.7867899916992767, + "grad_norm": 0.7444856011993779, + "learning_rate": 1.1122721442656056e-05, + "loss": 0.0981, + "step": 23501 + }, + { + "epoch": 2.7869085734614014, + "grad_norm": 0.7026096382957676, + "learning_rate": 1.1120724911362276e-05, + "loss": 0.0997, + "step": 23502 + }, + { + "epoch": 2.7870271552235266, + "grad_norm": 0.759724003948782, + "learning_rate": 1.1118728508018409e-05, + "loss": 0.1022, + "step": 23503 + }, + { + "epoch": 2.787145736985652, + "grad_norm": 0.6056908082648312, + "learning_rate": 1.1116732232642834e-05, + "loss": 0.0926, + "step": 23504 + }, + { + "epoch": 2.7872643187477766, + "grad_norm": 0.6478154617932798, + "learning_rate": 1.1114736085253974e-05, + "loss": 0.0985, + "step": 23505 + }, + { + "epoch": 2.7873829005099013, + "grad_norm": 0.8882022131112246, + "learning_rate": 1.11127400658702e-05, + "loss": 0.1038, + "step": 23506 + }, + { + "epoch": 2.7875014822720265, + "grad_norm": 0.6982842121914995, + "learning_rate": 1.1110744174509952e-05, + "loss": 0.0866, + "step": 23507 + }, + { + "epoch": 2.7876200640341517, + "grad_norm": 0.6068082672123419, + "learning_rate": 1.11087484111916e-05, + "loss": 0.076, + "step": 23508 + }, + { + "epoch": 2.7877386457962765, + "grad_norm": 0.6080240347483469, + "learning_rate": 1.1106752775933546e-05, + "loss": 0.0774, + "step": 23509 + }, + { + "epoch": 2.7878572275584013, + "grad_norm": 0.5441174528217985, + "learning_rate": 1.110475726875421e-05, + "loss": 0.0698, + "step": 23510 + }, + { + "epoch": 2.7879758093205265, + "grad_norm": 0.7805890240472266, + "learning_rate": 1.1102761889671964e-05, + "loss": 0.1019, + "step": 23511 + }, + { + "epoch": 2.7880943910826517, + "grad_norm": 0.8389724546742463, + "learning_rate": 1.1100766638705224e-05, + "loss": 0.1116, + "step": 23512 + }, + { + "epoch": 2.7882129728447764, + "grad_norm": 0.5910590015139133, + "learning_rate": 1.109877151587235e-05, + "loss": 0.0833, + "step": 23513 + }, + { + "epoch": 2.788331554606901, + "grad_norm": 0.4197031037741578, + "learning_rate": 1.1096776521191774e-05, + "loss": 0.0558, + "step": 23514 + }, + { + "epoch": 2.7884501363690264, + "grad_norm": 0.9747193304602154, + "learning_rate": 1.109478165468186e-05, + "loss": 0.1412, + "step": 23515 + }, + { + "epoch": 2.7885687181311516, + "grad_norm": 0.4045914668731407, + "learning_rate": 1.1092786916361008e-05, + "loss": 0.0554, + "step": 23516 + }, + { + "epoch": 2.7886872998932764, + "grad_norm": 0.48530372395231597, + "learning_rate": 1.1090792306247608e-05, + "loss": 0.0719, + "step": 23517 + }, + { + "epoch": 2.7888058816554016, + "grad_norm": 0.41533769820696853, + "learning_rate": 1.1088797824360045e-05, + "loss": 0.0543, + "step": 23518 + }, + { + "epoch": 2.7889244634175263, + "grad_norm": 0.6039825110559885, + "learning_rate": 1.1086803470716714e-05, + "loss": 0.0822, + "step": 23519 + }, + { + "epoch": 2.7890430451796515, + "grad_norm": 0.7124490149936346, + "learning_rate": 1.1084809245335987e-05, + "loss": 0.0816, + "step": 23520 + }, + { + "epoch": 2.7891616269417763, + "grad_norm": 0.604238080209942, + "learning_rate": 1.1082815148236256e-05, + "loss": 0.0844, + "step": 23521 + }, + { + "epoch": 2.7892802087039015, + "grad_norm": 0.8323349019145171, + "learning_rate": 1.10808211794359e-05, + "loss": 0.1195, + "step": 23522 + }, + { + "epoch": 2.7893987904660262, + "grad_norm": 0.7277775408674639, + "learning_rate": 1.1078827338953312e-05, + "loss": 0.1068, + "step": 23523 + }, + { + "epoch": 2.7895173722281514, + "grad_norm": 0.7522493043752245, + "learning_rate": 1.1076833626806852e-05, + "loss": 0.1025, + "step": 23524 + }, + { + "epoch": 2.789635953990276, + "grad_norm": 0.6877997757117645, + "learning_rate": 1.1074840043014925e-05, + "loss": 0.0736, + "step": 23525 + }, + { + "epoch": 2.7897545357524014, + "grad_norm": 1.0432445555293344, + "learning_rate": 1.107284658759589e-05, + "loss": 0.1326, + "step": 23526 + }, + { + "epoch": 2.789873117514526, + "grad_norm": 0.7764411556315272, + "learning_rate": 1.1070853260568132e-05, + "loss": 0.1168, + "step": 23527 + }, + { + "epoch": 2.7899916992766514, + "grad_norm": 0.5950715740178205, + "learning_rate": 1.1068860061950024e-05, + "loss": 0.0946, + "step": 23528 + }, + { + "epoch": 2.790110281038776, + "grad_norm": 0.685383707559297, + "learning_rate": 1.1066866991759942e-05, + "loss": 0.087, + "step": 23529 + }, + { + "epoch": 2.7902288628009013, + "grad_norm": 0.5991045350442441, + "learning_rate": 1.1064874050016273e-05, + "loss": 0.0813, + "step": 23530 + }, + { + "epoch": 2.790347444563026, + "grad_norm": 0.5754770989093753, + "learning_rate": 1.1062881236737362e-05, + "loss": 0.0699, + "step": 23531 + }, + { + "epoch": 2.7904660263251513, + "grad_norm": 0.48396684103025783, + "learning_rate": 1.10608885519416e-05, + "loss": 0.0707, + "step": 23532 + }, + { + "epoch": 2.790584608087276, + "grad_norm": 0.8696506287986616, + "learning_rate": 1.1058895995647351e-05, + "loss": 0.1413, + "step": 23533 + }, + { + "epoch": 2.7907031898494012, + "grad_norm": 0.6833433879985161, + "learning_rate": 1.1056903567872994e-05, + "loss": 0.1126, + "step": 23534 + }, + { + "epoch": 2.790821771611526, + "grad_norm": 0.7613598321371872, + "learning_rate": 1.1054911268636879e-05, + "loss": 0.0952, + "step": 23535 + }, + { + "epoch": 2.790940353373651, + "grad_norm": 0.45074584691929903, + "learning_rate": 1.105291909795738e-05, + "loss": 0.0641, + "step": 23536 + }, + { + "epoch": 2.791058935135776, + "grad_norm": 0.6696657520349137, + "learning_rate": 1.1050927055852866e-05, + "loss": 0.0872, + "step": 23537 + }, + { + "epoch": 2.791177516897901, + "grad_norm": 0.5449393193319935, + "learning_rate": 1.1048935142341699e-05, + "loss": 0.0668, + "step": 23538 + }, + { + "epoch": 2.791296098660026, + "grad_norm": 0.5756803640747002, + "learning_rate": 1.104694335744224e-05, + "loss": 0.0761, + "step": 23539 + }, + { + "epoch": 2.791414680422151, + "grad_norm": 0.6634937730556476, + "learning_rate": 1.1044951701172857e-05, + "loss": 0.0941, + "step": 23540 + }, + { + "epoch": 2.7915332621842763, + "grad_norm": 1.1593905083911784, + "learning_rate": 1.104296017355191e-05, + "loss": 0.1585, + "step": 23541 + }, + { + "epoch": 2.791651843946401, + "grad_norm": 0.6893990941596838, + "learning_rate": 1.1040968774597751e-05, + "loss": 0.0986, + "step": 23542 + }, + { + "epoch": 2.791770425708526, + "grad_norm": 0.7681533497539526, + "learning_rate": 1.103897750432874e-05, + "loss": 0.0442, + "step": 23543 + }, + { + "epoch": 2.791889007470651, + "grad_norm": 0.6622474756981004, + "learning_rate": 1.1036986362763235e-05, + "loss": 0.093, + "step": 23544 + }, + { + "epoch": 2.7920075892327763, + "grad_norm": 0.47892105116481704, + "learning_rate": 1.1034995349919605e-05, + "loss": 0.0694, + "step": 23545 + }, + { + "epoch": 2.792126170994901, + "grad_norm": 0.9685702425222598, + "learning_rate": 1.1033004465816183e-05, + "loss": 0.0817, + "step": 23546 + }, + { + "epoch": 2.7922447527570258, + "grad_norm": 0.4998250766863483, + "learning_rate": 1.103101371047133e-05, + "loss": 0.0756, + "step": 23547 + }, + { + "epoch": 2.792363334519151, + "grad_norm": 0.7568335489336373, + "learning_rate": 1.1029023083903403e-05, + "loss": 0.0867, + "step": 23548 + }, + { + "epoch": 2.792481916281276, + "grad_norm": 1.030694847835742, + "learning_rate": 1.102703258613075e-05, + "loss": 0.0885, + "step": 23549 + }, + { + "epoch": 2.792600498043401, + "grad_norm": 0.49283598608618645, + "learning_rate": 1.1025042217171721e-05, + "loss": 0.063, + "step": 23550 + }, + { + "epoch": 2.7927190798055257, + "grad_norm": 0.6788894010468353, + "learning_rate": 1.1023051977044668e-05, + "loss": 0.0856, + "step": 23551 + }, + { + "epoch": 2.792837661567651, + "grad_norm": 0.5259770853323518, + "learning_rate": 1.1021061865767942e-05, + "loss": 0.0651, + "step": 23552 + }, + { + "epoch": 2.792956243329776, + "grad_norm": 0.5938451334235852, + "learning_rate": 1.1019071883359878e-05, + "loss": 0.093, + "step": 23553 + }, + { + "epoch": 2.793074825091901, + "grad_norm": 0.7103605172423068, + "learning_rate": 1.1017082029838823e-05, + "loss": 0.0728, + "step": 23554 + }, + { + "epoch": 2.7931934068540256, + "grad_norm": 0.6241433449296186, + "learning_rate": 1.1015092305223126e-05, + "loss": 0.0821, + "step": 23555 + }, + { + "epoch": 2.793311988616151, + "grad_norm": 0.453685268953769, + "learning_rate": 1.101310270953114e-05, + "loss": 0.058, + "step": 23556 + }, + { + "epoch": 2.793430570378276, + "grad_norm": 0.47895426416608106, + "learning_rate": 1.1011113242781185e-05, + "loss": 0.0606, + "step": 23557 + }, + { + "epoch": 2.793549152140401, + "grad_norm": 0.7398800661829196, + "learning_rate": 1.1009123904991608e-05, + "loss": 0.0674, + "step": 23558 + }, + { + "epoch": 2.7936677339025255, + "grad_norm": 0.5174059228752153, + "learning_rate": 1.1007134696180757e-05, + "loss": 0.0679, + "step": 23559 + }, + { + "epoch": 2.7937863156646507, + "grad_norm": 0.45226774767713107, + "learning_rate": 1.100514561636696e-05, + "loss": 0.0548, + "step": 23560 + }, + { + "epoch": 2.793904897426776, + "grad_norm": 0.5764169491599761, + "learning_rate": 1.1003156665568572e-05, + "loss": 0.0664, + "step": 23561 + }, + { + "epoch": 2.7940234791889007, + "grad_norm": 0.6634990468146553, + "learning_rate": 1.1001167843803894e-05, + "loss": 0.0817, + "step": 23562 + }, + { + "epoch": 2.7941420609510255, + "grad_norm": 0.5752715199390365, + "learning_rate": 1.0999179151091305e-05, + "loss": 0.0835, + "step": 23563 + }, + { + "epoch": 2.7942606427131507, + "grad_norm": 0.834645710220044, + "learning_rate": 1.0997190587449102e-05, + "loss": 0.1037, + "step": 23564 + }, + { + "epoch": 2.794379224475276, + "grad_norm": 0.7632071919303824, + "learning_rate": 1.0995202152895632e-05, + "loss": 0.1083, + "step": 23565 + }, + { + "epoch": 2.7944978062374006, + "grad_norm": 0.6047487044114402, + "learning_rate": 1.0993213847449224e-05, + "loss": 0.0845, + "step": 23566 + }, + { + "epoch": 2.794616387999526, + "grad_norm": 0.6353127815825889, + "learning_rate": 1.0991225671128208e-05, + "loss": 0.0716, + "step": 23567 + }, + { + "epoch": 2.7947349697616506, + "grad_norm": 0.6840139581753084, + "learning_rate": 1.0989237623950921e-05, + "loss": 0.0928, + "step": 23568 + }, + { + "epoch": 2.794853551523776, + "grad_norm": 0.6532244770047617, + "learning_rate": 1.0987249705935662e-05, + "loss": 0.0937, + "step": 23569 + }, + { + "epoch": 2.7949721332859006, + "grad_norm": 0.4526396342341503, + "learning_rate": 1.09852619171008e-05, + "loss": 0.0471, + "step": 23570 + }, + { + "epoch": 2.7950907150480258, + "grad_norm": 0.600143701939048, + "learning_rate": 1.0983274257464627e-05, + "loss": 0.0857, + "step": 23571 + }, + { + "epoch": 2.7952092968101505, + "grad_norm": 0.5626780487701015, + "learning_rate": 1.0981286727045483e-05, + "loss": 0.0713, + "step": 23572 + }, + { + "epoch": 2.7953278785722757, + "grad_norm": 0.5735390034668544, + "learning_rate": 1.0979299325861666e-05, + "loss": 0.101, + "step": 23573 + }, + { + "epoch": 2.7954464603344005, + "grad_norm": 0.8654833869667998, + "learning_rate": 1.0977312053931538e-05, + "loss": 0.0967, + "step": 23574 + }, + { + "epoch": 2.7955650420965257, + "grad_norm": 0.60700024543871, + "learning_rate": 1.0975324911273385e-05, + "loss": 0.0883, + "step": 23575 + }, + { + "epoch": 2.7956836238586504, + "grad_norm": 0.664407111953768, + "learning_rate": 1.097333789790554e-05, + "loss": 0.073, + "step": 23576 + }, + { + "epoch": 2.7958022056207756, + "grad_norm": 0.8148204547437383, + "learning_rate": 1.0971351013846318e-05, + "loss": 0.1005, + "step": 23577 + }, + { + "epoch": 2.7959207873829004, + "grad_norm": 0.5630361685147656, + "learning_rate": 1.0969364259114036e-05, + "loss": 0.0685, + "step": 23578 + }, + { + "epoch": 2.7960393691450256, + "grad_norm": 0.6000887371011749, + "learning_rate": 1.0967377633727022e-05, + "loss": 0.093, + "step": 23579 + }, + { + "epoch": 2.7961579509071504, + "grad_norm": 0.6316920483191582, + "learning_rate": 1.0965391137703556e-05, + "loss": 0.0887, + "step": 23580 + }, + { + "epoch": 2.7962765326692756, + "grad_norm": 0.7489408322441435, + "learning_rate": 1.0963404771061995e-05, + "loss": 0.0645, + "step": 23581 + }, + { + "epoch": 2.7963951144314003, + "grad_norm": 0.6167259089654593, + "learning_rate": 1.096141853382062e-05, + "loss": 0.0835, + "step": 23582 + }, + { + "epoch": 2.7965136961935255, + "grad_norm": 0.4634641862568996, + "learning_rate": 1.0959432425997756e-05, + "loss": 0.0567, + "step": 23583 + }, + { + "epoch": 2.7966322779556503, + "grad_norm": 0.6331279864321742, + "learning_rate": 1.0957446447611692e-05, + "loss": 0.098, + "step": 23584 + }, + { + "epoch": 2.7967508597177755, + "grad_norm": 0.7926022811994479, + "learning_rate": 1.0955460598680769e-05, + "loss": 0.0702, + "step": 23585 + }, + { + "epoch": 2.7968694414799002, + "grad_norm": 0.7439577962878062, + "learning_rate": 1.0953474879223269e-05, + "loss": 0.1055, + "step": 23586 + }, + { + "epoch": 2.7969880232420254, + "grad_norm": 0.6557615754516626, + "learning_rate": 1.0951489289257504e-05, + "loss": 0.0972, + "step": 23587 + }, + { + "epoch": 2.79710660500415, + "grad_norm": 0.5916170958664307, + "learning_rate": 1.0949503828801782e-05, + "loss": 0.0682, + "step": 23588 + }, + { + "epoch": 2.7972251867662754, + "grad_norm": 0.628392664328486, + "learning_rate": 1.0947518497874403e-05, + "loss": 0.0914, + "step": 23589 + }, + { + "epoch": 2.7973437685284006, + "grad_norm": 0.6861520989128417, + "learning_rate": 1.0945533296493682e-05, + "loss": 0.0993, + "step": 23590 + }, + { + "epoch": 2.7974623502905254, + "grad_norm": 1.0573612459060657, + "learning_rate": 1.0943548224677899e-05, + "loss": 0.1076, + "step": 23591 + }, + { + "epoch": 2.79758093205265, + "grad_norm": 0.5608808816785612, + "learning_rate": 1.0941563282445367e-05, + "loss": 0.0707, + "step": 23592 + }, + { + "epoch": 2.7976995138147753, + "grad_norm": 0.9196731104998727, + "learning_rate": 1.0939578469814379e-05, + "loss": 0.0999, + "step": 23593 + }, + { + "epoch": 2.7978180955769005, + "grad_norm": 0.6340230909119936, + "learning_rate": 1.0937593786803244e-05, + "loss": 0.0763, + "step": 23594 + }, + { + "epoch": 2.7979366773390253, + "grad_norm": 0.6263445126227181, + "learning_rate": 1.0935609233430232e-05, + "loss": 0.0761, + "step": 23595 + }, + { + "epoch": 2.79805525910115, + "grad_norm": 0.5518522071872508, + "learning_rate": 1.0933624809713677e-05, + "loss": 0.06, + "step": 23596 + }, + { + "epoch": 2.7981738408632753, + "grad_norm": 0.6174772632976541, + "learning_rate": 1.093164051567184e-05, + "loss": 0.074, + "step": 23597 + }, + { + "epoch": 2.7982924226254005, + "grad_norm": 0.686974972683224, + "learning_rate": 1.0929656351323025e-05, + "loss": 0.099, + "step": 23598 + }, + { + "epoch": 2.798411004387525, + "grad_norm": 0.6473302143599269, + "learning_rate": 1.0927672316685528e-05, + "loss": 0.0915, + "step": 23599 + }, + { + "epoch": 2.79852958614965, + "grad_norm": 0.9277406707585905, + "learning_rate": 1.0925688411777632e-05, + "loss": 0.1291, + "step": 23600 + }, + { + "epoch": 2.798648167911775, + "grad_norm": 0.6156580063429085, + "learning_rate": 1.092370463661764e-05, + "loss": 0.0868, + "step": 23601 + }, + { + "epoch": 2.7987667496739004, + "grad_norm": 0.60269906904038, + "learning_rate": 1.0921720991223822e-05, + "loss": 0.0904, + "step": 23602 + }, + { + "epoch": 2.798885331436025, + "grad_norm": 0.5436340204633221, + "learning_rate": 1.0919737475614473e-05, + "loss": 0.0575, + "step": 23603 + }, + { + "epoch": 2.79900391319815, + "grad_norm": 0.8232487731096371, + "learning_rate": 1.0917754089807874e-05, + "loss": 0.0897, + "step": 23604 + }, + { + "epoch": 2.799122494960275, + "grad_norm": 0.6605622063574442, + "learning_rate": 1.0915770833822326e-05, + "loss": 0.0766, + "step": 23605 + }, + { + "epoch": 2.7992410767224003, + "grad_norm": 0.646541173752105, + "learning_rate": 1.091378770767609e-05, + "loss": 0.0858, + "step": 23606 + }, + { + "epoch": 2.799359658484525, + "grad_norm": 0.840584740131326, + "learning_rate": 1.0911804711387458e-05, + "loss": 0.1008, + "step": 23607 + }, + { + "epoch": 2.79947824024665, + "grad_norm": 0.8095230039828296, + "learning_rate": 1.0909821844974707e-05, + "loss": 0.0764, + "step": 23608 + }, + { + "epoch": 2.799596822008775, + "grad_norm": 0.8178376840722514, + "learning_rate": 1.0907839108456126e-05, + "loss": 0.1029, + "step": 23609 + }, + { + "epoch": 2.7997154037709002, + "grad_norm": 0.5239948635349123, + "learning_rate": 1.0905856501849982e-05, + "loss": 0.0701, + "step": 23610 + }, + { + "epoch": 2.799833985533025, + "grad_norm": 0.7574676074051285, + "learning_rate": 1.0903874025174559e-05, + "loss": 0.0941, + "step": 23611 + }, + { + "epoch": 2.7999525672951497, + "grad_norm": 0.7261333076311026, + "learning_rate": 1.0901891678448144e-05, + "loss": 0.0873, + "step": 23612 + }, + { + "epoch": 2.800071149057275, + "grad_norm": 0.8540561736683205, + "learning_rate": 1.0899909461688988e-05, + "loss": 0.1128, + "step": 23613 + }, + { + "epoch": 2.8001897308194, + "grad_norm": 0.5872833913699418, + "learning_rate": 1.0897927374915377e-05, + "loss": 0.0697, + "step": 23614 + }, + { + "epoch": 2.800308312581525, + "grad_norm": 0.8646428183632873, + "learning_rate": 1.0895945418145582e-05, + "loss": 0.0831, + "step": 23615 + }, + { + "epoch": 2.80042689434365, + "grad_norm": 0.8210148451816569, + "learning_rate": 1.0893963591397873e-05, + "loss": 0.1126, + "step": 23616 + }, + { + "epoch": 2.800545476105775, + "grad_norm": 0.5910364236606048, + "learning_rate": 1.0891981894690534e-05, + "loss": 0.0764, + "step": 23617 + }, + { + "epoch": 2.8006640578679, + "grad_norm": 0.5781147982745879, + "learning_rate": 1.0890000328041799e-05, + "loss": 0.0674, + "step": 23618 + }, + { + "epoch": 2.800782639630025, + "grad_norm": 1.0973523377642078, + "learning_rate": 1.088801889146998e-05, + "loss": 0.1484, + "step": 23619 + }, + { + "epoch": 2.80090122139215, + "grad_norm": 0.7213777531717688, + "learning_rate": 1.0886037584993312e-05, + "loss": 0.1115, + "step": 23620 + }, + { + "epoch": 2.801019803154275, + "grad_norm": 0.4645794690112459, + "learning_rate": 1.0884056408630077e-05, + "loss": 0.0691, + "step": 23621 + }, + { + "epoch": 2.8011383849164, + "grad_norm": 0.6373183183657428, + "learning_rate": 1.0882075362398517e-05, + "loss": 0.1019, + "step": 23622 + }, + { + "epoch": 2.8012569666785248, + "grad_norm": 0.7552400103226458, + "learning_rate": 1.0880094446316926e-05, + "loss": 0.117, + "step": 23623 + }, + { + "epoch": 2.80137554844065, + "grad_norm": 0.6650974731140802, + "learning_rate": 1.0878113660403538e-05, + "loss": 0.0886, + "step": 23624 + }, + { + "epoch": 2.8014941302027747, + "grad_norm": 0.5845377845118885, + "learning_rate": 1.0876133004676628e-05, + "loss": 0.068, + "step": 23625 + }, + { + "epoch": 2.8016127119649, + "grad_norm": 0.5533974719294145, + "learning_rate": 1.0874152479154451e-05, + "loss": 0.0555, + "step": 23626 + }, + { + "epoch": 2.8017312937270247, + "grad_norm": 0.6863687411654897, + "learning_rate": 1.0872172083855265e-05, + "loss": 0.1093, + "step": 23627 + }, + { + "epoch": 2.80184987548915, + "grad_norm": 0.7972273560809497, + "learning_rate": 1.0870191818797338e-05, + "loss": 0.0983, + "step": 23628 + }, + { + "epoch": 2.8019684572512746, + "grad_norm": 0.5375576909235069, + "learning_rate": 1.0868211683998894e-05, + "loss": 0.0723, + "step": 23629 + }, + { + "epoch": 2.8020870390134, + "grad_norm": 0.958711652080157, + "learning_rate": 1.086623167947823e-05, + "loss": 0.1349, + "step": 23630 + }, + { + "epoch": 2.8022056207755246, + "grad_norm": 0.6656647183436327, + "learning_rate": 1.086425180525357e-05, + "loss": 0.0861, + "step": 23631 + }, + { + "epoch": 2.80232420253765, + "grad_norm": 0.5669532564067225, + "learning_rate": 1.0862272061343179e-05, + "loss": 0.0888, + "step": 23632 + }, + { + "epoch": 2.8024427842997746, + "grad_norm": 0.7488855712734561, + "learning_rate": 1.0860292447765289e-05, + "loss": 0.1063, + "step": 23633 + }, + { + "epoch": 2.8025613660618998, + "grad_norm": 0.9721107395624091, + "learning_rate": 1.0858312964538178e-05, + "loss": 0.1284, + "step": 23634 + }, + { + "epoch": 2.8026799478240245, + "grad_norm": 0.5707992436210468, + "learning_rate": 1.0856333611680072e-05, + "loss": 0.1, + "step": 23635 + }, + { + "epoch": 2.8027985295861497, + "grad_norm": 0.6267984360587457, + "learning_rate": 1.0854354389209227e-05, + "loss": 0.0828, + "step": 23636 + }, + { + "epoch": 2.8029171113482745, + "grad_norm": 0.520678087724199, + "learning_rate": 1.0852375297143886e-05, + "loss": 0.0754, + "step": 23637 + }, + { + "epoch": 2.8030356931103997, + "grad_norm": 0.7343831221722324, + "learning_rate": 1.0850396335502298e-05, + "loss": 0.1014, + "step": 23638 + }, + { + "epoch": 2.8031542748725244, + "grad_norm": 0.8139350718119551, + "learning_rate": 1.0848417504302711e-05, + "loss": 0.1265, + "step": 23639 + }, + { + "epoch": 2.8032728566346496, + "grad_norm": 0.5949760801230392, + "learning_rate": 1.0846438803563344e-05, + "loss": 0.0992, + "step": 23640 + }, + { + "epoch": 2.8033914383967744, + "grad_norm": 0.5471308642825482, + "learning_rate": 1.0844460233302472e-05, + "loss": 0.0689, + "step": 23641 + }, + { + "epoch": 2.8035100201588996, + "grad_norm": 0.6449651449761193, + "learning_rate": 1.0842481793538308e-05, + "loss": 0.0956, + "step": 23642 + }, + { + "epoch": 2.803628601921025, + "grad_norm": 1.0445468472449098, + "learning_rate": 1.0840503484289113e-05, + "loss": 0.1158, + "step": 23643 + }, + { + "epoch": 2.8037471836831496, + "grad_norm": 0.4654745456286987, + "learning_rate": 1.0838525305573093e-05, + "loss": 0.0746, + "step": 23644 + }, + { + "epoch": 2.8038657654452743, + "grad_norm": 0.774971584590526, + "learning_rate": 1.0836547257408522e-05, + "loss": 0.1064, + "step": 23645 + }, + { + "epoch": 2.8039843472073995, + "grad_norm": 0.527577052282642, + "learning_rate": 1.0834569339813608e-05, + "loss": 0.0703, + "step": 23646 + }, + { + "epoch": 2.8041029289695247, + "grad_norm": 0.8828495131035237, + "learning_rate": 1.0832591552806592e-05, + "loss": 0.1296, + "step": 23647 + }, + { + "epoch": 2.8042215107316495, + "grad_norm": 0.72114630096771, + "learning_rate": 1.0830613896405712e-05, + "loss": 0.0959, + "step": 23648 + }, + { + "epoch": 2.8043400924937743, + "grad_norm": 0.8251797441412064, + "learning_rate": 1.0828636370629195e-05, + "loss": 0.1173, + "step": 23649 + }, + { + "epoch": 2.8044586742558995, + "grad_norm": 0.5989928091688499, + "learning_rate": 1.082665897549528e-05, + "loss": 0.0944, + "step": 23650 + }, + { + "epoch": 2.8045772560180247, + "grad_norm": 0.718926104974931, + "learning_rate": 1.0824681711022173e-05, + "loss": 0.1131, + "step": 23651 + }, + { + "epoch": 2.8046958377801494, + "grad_norm": 0.8010455910014552, + "learning_rate": 1.0822704577228132e-05, + "loss": 0.1136, + "step": 23652 + }, + { + "epoch": 2.804814419542274, + "grad_norm": 0.6147366708652549, + "learning_rate": 1.0820727574131365e-05, + "loss": 0.0977, + "step": 23653 + }, + { + "epoch": 2.8049330013043994, + "grad_norm": 0.5099098582494804, + "learning_rate": 1.0818750701750107e-05, + "loss": 0.0518, + "step": 23654 + }, + { + "epoch": 2.8050515830665246, + "grad_norm": 0.8255545181186152, + "learning_rate": 1.0816773960102561e-05, + "loss": 0.1248, + "step": 23655 + }, + { + "epoch": 2.8051701648286493, + "grad_norm": 0.7403607768533896, + "learning_rate": 1.0814797349206987e-05, + "loss": 0.1004, + "step": 23656 + }, + { + "epoch": 2.805288746590774, + "grad_norm": 1.3397210573473675, + "learning_rate": 1.0812820869081575e-05, + "loss": 0.193, + "step": 23657 + }, + { + "epoch": 2.8054073283528993, + "grad_norm": 0.5807243093037502, + "learning_rate": 1.0810844519744556e-05, + "loss": 0.0764, + "step": 23658 + }, + { + "epoch": 2.8055259101150245, + "grad_norm": 0.5527536720039181, + "learning_rate": 1.080886830121415e-05, + "loss": 0.0803, + "step": 23659 + }, + { + "epoch": 2.8056444918771493, + "grad_norm": 1.0568281757681117, + "learning_rate": 1.080689221350858e-05, + "loss": 0.1413, + "step": 23660 + }, + { + "epoch": 2.805763073639274, + "grad_norm": 0.5718982701524848, + "learning_rate": 1.0804916256646066e-05, + "loss": 0.0812, + "step": 23661 + }, + { + "epoch": 2.805881655401399, + "grad_norm": 0.6653637133842821, + "learning_rate": 1.0802940430644805e-05, + "loss": 0.0958, + "step": 23662 + }, + { + "epoch": 2.8060002371635244, + "grad_norm": 0.5082234330637397, + "learning_rate": 1.0800964735523029e-05, + "loss": 0.0751, + "step": 23663 + }, + { + "epoch": 2.806118818925649, + "grad_norm": 0.6055972845365272, + "learning_rate": 1.0798989171298943e-05, + "loss": 0.0735, + "step": 23664 + }, + { + "epoch": 2.8062374006877744, + "grad_norm": 0.8371353562794941, + "learning_rate": 1.0797013737990763e-05, + "loss": 0.0996, + "step": 23665 + }, + { + "epoch": 2.806355982449899, + "grad_norm": 0.5262735412047529, + "learning_rate": 1.07950384356167e-05, + "loss": 0.0728, + "step": 23666 + }, + { + "epoch": 2.8064745642120243, + "grad_norm": 0.6005316053736811, + "learning_rate": 1.0793063264194964e-05, + "loss": 0.1016, + "step": 23667 + }, + { + "epoch": 2.806593145974149, + "grad_norm": 0.7639400924158317, + "learning_rate": 1.079108822374377e-05, + "loss": 0.0925, + "step": 23668 + }, + { + "epoch": 2.8067117277362743, + "grad_norm": 0.527271588928279, + "learning_rate": 1.078911331428131e-05, + "loss": 0.0694, + "step": 23669 + }, + { + "epoch": 2.806830309498399, + "grad_norm": 0.5480928865730844, + "learning_rate": 1.07871385358258e-05, + "loss": 0.071, + "step": 23670 + }, + { + "epoch": 2.8069488912605243, + "grad_norm": 0.6586045119824193, + "learning_rate": 1.0785163888395442e-05, + "loss": 0.0891, + "step": 23671 + }, + { + "epoch": 2.807067473022649, + "grad_norm": 0.7215217175697418, + "learning_rate": 1.0783189372008451e-05, + "loss": 0.1007, + "step": 23672 + }, + { + "epoch": 2.8071860547847742, + "grad_norm": 0.782133800898967, + "learning_rate": 1.0781214986683011e-05, + "loss": 0.0987, + "step": 23673 + }, + { + "epoch": 2.807304636546899, + "grad_norm": 0.8002464605416594, + "learning_rate": 1.0779240732437334e-05, + "loss": 0.1017, + "step": 23674 + }, + { + "epoch": 2.807423218309024, + "grad_norm": 0.532112045824138, + "learning_rate": 1.0777266609289618e-05, + "loss": 0.0605, + "step": 23675 + }, + { + "epoch": 2.807541800071149, + "grad_norm": 0.6816275583863695, + "learning_rate": 1.0775292617258062e-05, + "loss": 0.0827, + "step": 23676 + }, + { + "epoch": 2.807660381833274, + "grad_norm": 0.47870374116648556, + "learning_rate": 1.0773318756360872e-05, + "loss": 0.0599, + "step": 23677 + }, + { + "epoch": 2.807778963595399, + "grad_norm": 0.8903808901058717, + "learning_rate": 1.0771345026616223e-05, + "loss": 0.1193, + "step": 23678 + }, + { + "epoch": 2.807897545357524, + "grad_norm": 0.3533418788836065, + "learning_rate": 1.0769371428042341e-05, + "loss": 0.0434, + "step": 23679 + }, + { + "epoch": 2.808016127119649, + "grad_norm": 0.5469935267712775, + "learning_rate": 1.0767397960657394e-05, + "loss": 0.0646, + "step": 23680 + }, + { + "epoch": 2.808134708881774, + "grad_norm": 0.49994491125418605, + "learning_rate": 1.0765424624479587e-05, + "loss": 0.0572, + "step": 23681 + }, + { + "epoch": 2.808253290643899, + "grad_norm": 0.5009666179191757, + "learning_rate": 1.0763451419527106e-05, + "loss": 0.076, + "step": 23682 + }, + { + "epoch": 2.808371872406024, + "grad_norm": 0.5893973054839963, + "learning_rate": 1.0761478345818157e-05, + "loss": 0.0632, + "step": 23683 + }, + { + "epoch": 2.808490454168149, + "grad_norm": 0.8267550007647957, + "learning_rate": 1.0759505403370906e-05, + "loss": 0.0931, + "step": 23684 + }, + { + "epoch": 2.808609035930274, + "grad_norm": 0.6121810270319018, + "learning_rate": 1.0757532592203553e-05, + "loss": 0.0746, + "step": 23685 + }, + { + "epoch": 2.8087276176923988, + "grad_norm": 0.44152799289364647, + "learning_rate": 1.0755559912334284e-05, + "loss": 0.0578, + "step": 23686 + }, + { + "epoch": 2.808846199454524, + "grad_norm": 0.5083699151601409, + "learning_rate": 1.0753587363781287e-05, + "loss": 0.0784, + "step": 23687 + }, + { + "epoch": 2.8089647812166487, + "grad_norm": 0.9296603632191489, + "learning_rate": 1.075161494656275e-05, + "loss": 0.1084, + "step": 23688 + }, + { + "epoch": 2.809083362978774, + "grad_norm": 0.545922447355714, + "learning_rate": 1.0749642660696835e-05, + "loss": 0.0875, + "step": 23689 + }, + { + "epoch": 2.8092019447408987, + "grad_norm": 0.4859196231138172, + "learning_rate": 1.0747670506201756e-05, + "loss": 0.071, + "step": 23690 + }, + { + "epoch": 2.809320526503024, + "grad_norm": 0.5356678804310886, + "learning_rate": 1.074569848309567e-05, + "loss": 0.0761, + "step": 23691 + }, + { + "epoch": 2.809439108265149, + "grad_norm": 0.4278726637330649, + "learning_rate": 1.0743726591396774e-05, + "loss": 0.0505, + "step": 23692 + }, + { + "epoch": 2.809557690027274, + "grad_norm": 0.6961396451125965, + "learning_rate": 1.0741754831123216e-05, + "loss": 0.1013, + "step": 23693 + }, + { + "epoch": 2.8096762717893986, + "grad_norm": 0.565999601370747, + "learning_rate": 1.0739783202293213e-05, + "loss": 0.0859, + "step": 23694 + }, + { + "epoch": 2.809794853551524, + "grad_norm": 0.5365043177921548, + "learning_rate": 1.0737811704924913e-05, + "loss": 0.0926, + "step": 23695 + }, + { + "epoch": 2.809913435313649, + "grad_norm": 0.6093967209390238, + "learning_rate": 1.0735840339036502e-05, + "loss": 0.0827, + "step": 23696 + }, + { + "epoch": 2.8100320170757738, + "grad_norm": 0.35227049625576856, + "learning_rate": 1.0733869104646146e-05, + "loss": 0.0506, + "step": 23697 + }, + { + "epoch": 2.8101505988378985, + "grad_norm": 0.6514808812989781, + "learning_rate": 1.0731898001772028e-05, + "loss": 0.0828, + "step": 23698 + }, + { + "epoch": 2.8102691806000237, + "grad_norm": 0.6883742556187223, + "learning_rate": 1.072992703043232e-05, + "loss": 0.1041, + "step": 23699 + }, + { + "epoch": 2.810387762362149, + "grad_norm": 0.6151782912030235, + "learning_rate": 1.0727956190645166e-05, + "loss": 0.0695, + "step": 23700 + }, + { + "epoch": 2.8105063441242737, + "grad_norm": 0.5914069983155515, + "learning_rate": 1.0725985482428776e-05, + "loss": 0.085, + "step": 23701 + }, + { + "epoch": 2.8106249258863985, + "grad_norm": 0.8725170034629668, + "learning_rate": 1.0724014905801285e-05, + "loss": 0.0879, + "step": 23702 + }, + { + "epoch": 2.8107435076485237, + "grad_norm": 0.5385494694886028, + "learning_rate": 1.0722044460780877e-05, + "loss": 0.0776, + "step": 23703 + }, + { + "epoch": 2.810862089410649, + "grad_norm": 0.5054365529942443, + "learning_rate": 1.0720074147385698e-05, + "loss": 0.0638, + "step": 23704 + }, + { + "epoch": 2.8109806711727736, + "grad_norm": 0.5635664815923033, + "learning_rate": 1.0718103965633938e-05, + "loss": 0.068, + "step": 23705 + }, + { + "epoch": 2.8110992529348984, + "grad_norm": 0.8202549329097416, + "learning_rate": 1.0716133915543738e-05, + "loss": 0.1034, + "step": 23706 + }, + { + "epoch": 2.8112178346970236, + "grad_norm": 0.549204177260597, + "learning_rate": 1.0714163997133268e-05, + "loss": 0.0847, + "step": 23707 + }, + { + "epoch": 2.811336416459149, + "grad_norm": 0.5479892463444132, + "learning_rate": 1.0712194210420684e-05, + "loss": 0.0835, + "step": 23708 + }, + { + "epoch": 2.8114549982212735, + "grad_norm": 0.37380656411235713, + "learning_rate": 1.071022455542415e-05, + "loss": 0.0526, + "step": 23709 + }, + { + "epoch": 2.8115735799833983, + "grad_norm": 0.5256768672382078, + "learning_rate": 1.0708255032161835e-05, + "loss": 0.0592, + "step": 23710 + }, + { + "epoch": 2.8116921617455235, + "grad_norm": 0.6411771931255557, + "learning_rate": 1.0706285640651862e-05, + "loss": 0.1009, + "step": 23711 + }, + { + "epoch": 2.8118107435076487, + "grad_norm": 0.5526306259615674, + "learning_rate": 1.0704316380912425e-05, + "loss": 0.075, + "step": 23712 + }, + { + "epoch": 2.8119293252697735, + "grad_norm": 0.559063765625672, + "learning_rate": 1.070234725296165e-05, + "loss": 0.0782, + "step": 23713 + }, + { + "epoch": 2.812047907031898, + "grad_norm": 0.5172482279314841, + "learning_rate": 1.0700378256817703e-05, + "loss": 0.0614, + "step": 23714 + }, + { + "epoch": 2.8121664887940234, + "grad_norm": 0.6180587409171935, + "learning_rate": 1.0698409392498732e-05, + "loss": 0.077, + "step": 23715 + }, + { + "epoch": 2.8122850705561486, + "grad_norm": 0.35900193770008004, + "learning_rate": 1.0696440660022891e-05, + "loss": 0.045, + "step": 23716 + }, + { + "epoch": 2.8124036523182734, + "grad_norm": 0.5969804957578965, + "learning_rate": 1.0694472059408334e-05, + "loss": 0.0981, + "step": 23717 + }, + { + "epoch": 2.8125222340803986, + "grad_norm": 0.5532537311891125, + "learning_rate": 1.0692503590673192e-05, + "loss": 0.0729, + "step": 23718 + }, + { + "epoch": 2.8126408158425233, + "grad_norm": 0.47378444978458045, + "learning_rate": 1.0690535253835623e-05, + "loss": 0.0577, + "step": 23719 + }, + { + "epoch": 2.8127593976046485, + "grad_norm": 0.9832786362975712, + "learning_rate": 1.0688567048913772e-05, + "loss": 0.1194, + "step": 23720 + }, + { + "epoch": 2.8128779793667733, + "grad_norm": 0.39003626401532515, + "learning_rate": 1.0686598975925791e-05, + "loss": 0.0451, + "step": 23721 + }, + { + "epoch": 2.8129965611288985, + "grad_norm": 0.6310116300166011, + "learning_rate": 1.0684631034889798e-05, + "loss": 0.0852, + "step": 23722 + }, + { + "epoch": 2.8131151428910233, + "grad_norm": 0.5792917084545122, + "learning_rate": 1.0682663225823968e-05, + "loss": 0.0822, + "step": 23723 + }, + { + "epoch": 2.8132337246531485, + "grad_norm": 0.8808333066704965, + "learning_rate": 1.068069554874642e-05, + "loss": 0.1282, + "step": 23724 + }, + { + "epoch": 2.8133523064152732, + "grad_norm": 0.5500673379993618, + "learning_rate": 1.0678728003675298e-05, + "loss": 0.0643, + "step": 23725 + }, + { + "epoch": 2.8134708881773984, + "grad_norm": 0.5986926879055103, + "learning_rate": 1.0676760590628742e-05, + "loss": 0.0808, + "step": 23726 + }, + { + "epoch": 2.813589469939523, + "grad_norm": 0.6058344137615969, + "learning_rate": 1.0674793309624888e-05, + "loss": 0.0811, + "step": 23727 + }, + { + "epoch": 2.8137080517016484, + "grad_norm": 0.4916683179863628, + "learning_rate": 1.0672826160681884e-05, + "loss": 0.0679, + "step": 23728 + }, + { + "epoch": 2.813826633463773, + "grad_norm": 1.0005823995978569, + "learning_rate": 1.0670859143817844e-05, + "loss": 0.1334, + "step": 23729 + }, + { + "epoch": 2.8139452152258984, + "grad_norm": 0.6988423241344452, + "learning_rate": 1.066889225905091e-05, + "loss": 0.0736, + "step": 23730 + }, + { + "epoch": 2.814063796988023, + "grad_norm": 0.4136759900734558, + "learning_rate": 1.0666925506399216e-05, + "loss": 0.0652, + "step": 23731 + }, + { + "epoch": 2.8141823787501483, + "grad_norm": 0.4236904524496318, + "learning_rate": 1.0664958885880903e-05, + "loss": 0.0536, + "step": 23732 + }, + { + "epoch": 2.814300960512273, + "grad_norm": 0.6226513308721846, + "learning_rate": 1.066299239751408e-05, + "loss": 0.0711, + "step": 23733 + }, + { + "epoch": 2.8144195422743983, + "grad_norm": 0.8388667841518095, + "learning_rate": 1.0661026041316884e-05, + "loss": 0.1094, + "step": 23734 + }, + { + "epoch": 2.814538124036523, + "grad_norm": 0.7167530924560225, + "learning_rate": 1.0659059817307448e-05, + "loss": 0.0815, + "step": 23735 + }, + { + "epoch": 2.8146567057986482, + "grad_norm": 0.6720485987187659, + "learning_rate": 1.0657093725503895e-05, + "loss": 0.0788, + "step": 23736 + }, + { + "epoch": 2.814775287560773, + "grad_norm": 0.7593678462661502, + "learning_rate": 1.0655127765924345e-05, + "loss": 0.1075, + "step": 23737 + }, + { + "epoch": 2.814893869322898, + "grad_norm": 0.5897087396301391, + "learning_rate": 1.0653161938586927e-05, + "loss": 0.0705, + "step": 23738 + }, + { + "epoch": 2.815012451085023, + "grad_norm": 0.8341244532461679, + "learning_rate": 1.0651196243509776e-05, + "loss": 0.0901, + "step": 23739 + }, + { + "epoch": 2.815131032847148, + "grad_norm": 0.5407186851701997, + "learning_rate": 1.0649230680710986e-05, + "loss": 0.0659, + "step": 23740 + }, + { + "epoch": 2.8152496146092734, + "grad_norm": 0.8161043796340677, + "learning_rate": 1.0647265250208694e-05, + "loss": 0.0983, + "step": 23741 + }, + { + "epoch": 2.815368196371398, + "grad_norm": 0.7140660059971243, + "learning_rate": 1.0645299952021014e-05, + "loss": 0.104, + "step": 23742 + }, + { + "epoch": 2.815486778133523, + "grad_norm": 0.49087150512896993, + "learning_rate": 1.0643334786166076e-05, + "loss": 0.0681, + "step": 23743 + }, + { + "epoch": 2.815605359895648, + "grad_norm": 0.8356188787850176, + "learning_rate": 1.0641369752661976e-05, + "loss": 0.115, + "step": 23744 + }, + { + "epoch": 2.8157239416577733, + "grad_norm": 0.7876714712439317, + "learning_rate": 1.0639404851526841e-05, + "loss": 0.1027, + "step": 23745 + }, + { + "epoch": 2.815842523419898, + "grad_norm": 0.9126757999756093, + "learning_rate": 1.063744008277878e-05, + "loss": 0.0735, + "step": 23746 + }, + { + "epoch": 2.815961105182023, + "grad_norm": 0.6564123676919913, + "learning_rate": 1.0635475446435911e-05, + "loss": 0.0608, + "step": 23747 + }, + { + "epoch": 2.816079686944148, + "grad_norm": 0.6920715466905459, + "learning_rate": 1.0633510942516354e-05, + "loss": 0.0727, + "step": 23748 + }, + { + "epoch": 2.816198268706273, + "grad_norm": 0.9903694015906713, + "learning_rate": 1.0631546571038187e-05, + "loss": 0.1059, + "step": 23749 + }, + { + "epoch": 2.816316850468398, + "grad_norm": 0.46701950955176713, + "learning_rate": 1.0629582332019561e-05, + "loss": 0.0689, + "step": 23750 + }, + { + "epoch": 2.8164354322305227, + "grad_norm": 1.0189882738286231, + "learning_rate": 1.0627618225478555e-05, + "loss": 0.147, + "step": 23751 + }, + { + "epoch": 2.816554013992648, + "grad_norm": 0.47254083289986, + "learning_rate": 1.0625654251433284e-05, + "loss": 0.0509, + "step": 23752 + }, + { + "epoch": 2.816672595754773, + "grad_norm": 0.6479723132800653, + "learning_rate": 1.0623690409901854e-05, + "loss": 0.0662, + "step": 23753 + }, + { + "epoch": 2.816791177516898, + "grad_norm": 0.5373318702632125, + "learning_rate": 1.0621726700902376e-05, + "loss": 0.0763, + "step": 23754 + }, + { + "epoch": 2.8169097592790227, + "grad_norm": 0.610831593268313, + "learning_rate": 1.0619763124452938e-05, + "loss": 0.076, + "step": 23755 + }, + { + "epoch": 2.817028341041148, + "grad_norm": 0.532981798318027, + "learning_rate": 1.0617799680571653e-05, + "loss": 0.0822, + "step": 23756 + }, + { + "epoch": 2.817146922803273, + "grad_norm": 0.7629447644766998, + "learning_rate": 1.0615836369276613e-05, + "loss": 0.0761, + "step": 23757 + }, + { + "epoch": 2.817265504565398, + "grad_norm": 1.2951250239924097, + "learning_rate": 1.0613873190585926e-05, + "loss": 0.1426, + "step": 23758 + }, + { + "epoch": 2.8173840863275226, + "grad_norm": 0.8801474157955219, + "learning_rate": 1.0611910144517695e-05, + "loss": 0.1342, + "step": 23759 + }, + { + "epoch": 2.8175026680896478, + "grad_norm": 0.6659869296050666, + "learning_rate": 1.0609947231089992e-05, + "loss": 0.0903, + "step": 23760 + }, + { + "epoch": 2.817621249851773, + "grad_norm": 0.6315948346896855, + "learning_rate": 1.0607984450320945e-05, + "loss": 0.0718, + "step": 23761 + }, + { + "epoch": 2.8177398316138977, + "grad_norm": 0.722767127922329, + "learning_rate": 1.0606021802228625e-05, + "loss": 0.0922, + "step": 23762 + }, + { + "epoch": 2.8178584133760225, + "grad_norm": 0.46189811612105286, + "learning_rate": 1.0604059286831141e-05, + "loss": 0.065, + "step": 23763 + }, + { + "epoch": 2.8179769951381477, + "grad_norm": 0.8960723411924778, + "learning_rate": 1.060209690414656e-05, + "loss": 0.1283, + "step": 23764 + }, + { + "epoch": 2.818095576900273, + "grad_norm": 0.8298899237661, + "learning_rate": 1.0600134654193008e-05, + "loss": 0.1093, + "step": 23765 + }, + { + "epoch": 2.8182141586623977, + "grad_norm": 0.5024519822168046, + "learning_rate": 1.0598172536988546e-05, + "loss": 0.0542, + "step": 23766 + }, + { + "epoch": 2.818332740424523, + "grad_norm": 0.7122989078983483, + "learning_rate": 1.0596210552551272e-05, + "loss": 0.079, + "step": 23767 + }, + { + "epoch": 2.8184513221866476, + "grad_norm": 0.6833068626755471, + "learning_rate": 1.0594248700899276e-05, + "loss": 0.0988, + "step": 23768 + }, + { + "epoch": 2.818569903948773, + "grad_norm": 0.5497625494926425, + "learning_rate": 1.059228698205064e-05, + "loss": 0.0716, + "step": 23769 + }, + { + "epoch": 2.8186884857108976, + "grad_norm": 0.8164413689888779, + "learning_rate": 1.0590325396023459e-05, + "loss": 0.1008, + "step": 23770 + }, + { + "epoch": 2.818807067473023, + "grad_norm": 0.4995471207546836, + "learning_rate": 1.0588363942835791e-05, + "loss": 0.0662, + "step": 23771 + }, + { + "epoch": 2.8189256492351475, + "grad_norm": 0.5841296649727478, + "learning_rate": 1.0586402622505751e-05, + "loss": 0.0754, + "step": 23772 + }, + { + "epoch": 2.8190442309972727, + "grad_norm": 0.8284505782431335, + "learning_rate": 1.0584441435051396e-05, + "loss": 0.0979, + "step": 23773 + }, + { + "epoch": 2.8191628127593975, + "grad_norm": 0.4201820683835193, + "learning_rate": 1.0582480380490811e-05, + "loss": 0.0457, + "step": 23774 + }, + { + "epoch": 2.8192813945215227, + "grad_norm": 0.5995308780864204, + "learning_rate": 1.0580519458842078e-05, + "loss": 0.0782, + "step": 23775 + }, + { + "epoch": 2.8193999762836475, + "grad_norm": 0.6404690073452419, + "learning_rate": 1.0578558670123276e-05, + "loss": 0.0842, + "step": 23776 + }, + { + "epoch": 2.8195185580457727, + "grad_norm": 0.45887211212098394, + "learning_rate": 1.0576598014352485e-05, + "loss": 0.063, + "step": 23777 + }, + { + "epoch": 2.8196371398078974, + "grad_norm": 0.69404967733088, + "learning_rate": 1.0574637491547763e-05, + "loss": 0.0889, + "step": 23778 + }, + { + "epoch": 2.8197557215700226, + "grad_norm": 0.9101310672331558, + "learning_rate": 1.0572677101727197e-05, + "loss": 0.1268, + "step": 23779 + }, + { + "epoch": 2.8198743033321474, + "grad_norm": 0.7960243419549252, + "learning_rate": 1.0570716844908854e-05, + "loss": 0.1077, + "step": 23780 + }, + { + "epoch": 2.8199928850942726, + "grad_norm": 0.4596722628108303, + "learning_rate": 1.0568756721110814e-05, + "loss": 0.0646, + "step": 23781 + }, + { + "epoch": 2.8201114668563974, + "grad_norm": 0.4834225185285542, + "learning_rate": 1.0566796730351126e-05, + "loss": 0.0636, + "step": 23782 + }, + { + "epoch": 2.8202300486185226, + "grad_norm": 0.441909699851818, + "learning_rate": 1.056483687264789e-05, + "loss": 0.0565, + "step": 23783 + }, + { + "epoch": 2.8203486303806473, + "grad_norm": 0.5692772178731074, + "learning_rate": 1.0562877148019149e-05, + "loss": 0.0619, + "step": 23784 + }, + { + "epoch": 2.8204672121427725, + "grad_norm": 0.47716008799432597, + "learning_rate": 1.0560917556482975e-05, + "loss": 0.058, + "step": 23785 + }, + { + "epoch": 2.8205857939048973, + "grad_norm": 0.6725489606538804, + "learning_rate": 1.0558958098057433e-05, + "loss": 0.0782, + "step": 23786 + }, + { + "epoch": 2.8207043756670225, + "grad_norm": 0.5935417824119965, + "learning_rate": 1.055699877276059e-05, + "loss": 0.0893, + "step": 23787 + }, + { + "epoch": 2.8208229574291472, + "grad_norm": 0.7032940318462935, + "learning_rate": 1.0555039580610517e-05, + "loss": 0.0917, + "step": 23788 + }, + { + "epoch": 2.8209415391912724, + "grad_norm": 0.7048237433454274, + "learning_rate": 1.0553080521625255e-05, + "loss": 0.1096, + "step": 23789 + }, + { + "epoch": 2.8210601209533976, + "grad_norm": 0.7495004363018791, + "learning_rate": 1.0551121595822877e-05, + "loss": 0.1188, + "step": 23790 + }, + { + "epoch": 2.8211787027155224, + "grad_norm": 0.9930260415469246, + "learning_rate": 1.0549162803221439e-05, + "loss": 0.127, + "step": 23791 + }, + { + "epoch": 2.821297284477647, + "grad_norm": 0.6977639732683725, + "learning_rate": 1.0547204143839007e-05, + "loss": 0.1238, + "step": 23792 + }, + { + "epoch": 2.8214158662397724, + "grad_norm": 0.8620960084532441, + "learning_rate": 1.054524561769362e-05, + "loss": 0.0999, + "step": 23793 + }, + { + "epoch": 2.8215344480018976, + "grad_norm": 0.586802277715516, + "learning_rate": 1.0543287224803344e-05, + "loss": 0.0915, + "step": 23794 + }, + { + "epoch": 2.8216530297640223, + "grad_norm": 0.8592279159023308, + "learning_rate": 1.054132896518623e-05, + "loss": 0.1107, + "step": 23795 + }, + { + "epoch": 2.821771611526147, + "grad_norm": 0.9457237402408418, + "learning_rate": 1.0539370838860335e-05, + "loss": 0.1073, + "step": 23796 + }, + { + "epoch": 2.8218901932882723, + "grad_norm": 0.6204454510924319, + "learning_rate": 1.0537412845843708e-05, + "loss": 0.082, + "step": 23797 + }, + { + "epoch": 2.8220087750503975, + "grad_norm": 0.40056061428103235, + "learning_rate": 1.0535454986154397e-05, + "loss": 0.0525, + "step": 23798 + }, + { + "epoch": 2.8221273568125222, + "grad_norm": 0.616289393127539, + "learning_rate": 1.0533497259810465e-05, + "loss": 0.0669, + "step": 23799 + }, + { + "epoch": 2.822245938574647, + "grad_norm": 0.5516059698116464, + "learning_rate": 1.0531539666829938e-05, + "loss": 0.0701, + "step": 23800 + }, + { + "epoch": 2.822364520336772, + "grad_norm": 0.8789717829046028, + "learning_rate": 1.0529582207230874e-05, + "loss": 0.1123, + "step": 23801 + }, + { + "epoch": 2.8224831020988974, + "grad_norm": 0.494937314698148, + "learning_rate": 1.0527624881031315e-05, + "loss": 0.0514, + "step": 23802 + }, + { + "epoch": 2.822601683861022, + "grad_norm": 0.7000342922817735, + "learning_rate": 1.052566768824932e-05, + "loss": 0.0925, + "step": 23803 + }, + { + "epoch": 2.822720265623147, + "grad_norm": 0.6169464473824983, + "learning_rate": 1.0523710628902905e-05, + "loss": 0.0652, + "step": 23804 + }, + { + "epoch": 2.822838847385272, + "grad_norm": 0.8127582116055794, + "learning_rate": 1.0521753703010129e-05, + "loss": 0.0916, + "step": 23805 + }, + { + "epoch": 2.8229574291473973, + "grad_norm": 0.6205732562841709, + "learning_rate": 1.051979691058903e-05, + "loss": 0.0739, + "step": 23806 + }, + { + "epoch": 2.823076010909522, + "grad_norm": 0.5609390435810635, + "learning_rate": 1.0517840251657646e-05, + "loss": 0.0884, + "step": 23807 + }, + { + "epoch": 2.823194592671647, + "grad_norm": 0.6738246328870717, + "learning_rate": 1.0515883726234017e-05, + "loss": 0.0992, + "step": 23808 + }, + { + "epoch": 2.823313174433772, + "grad_norm": 0.5118812142146344, + "learning_rate": 1.0513927334336176e-05, + "loss": 0.0597, + "step": 23809 + }, + { + "epoch": 2.8234317561958973, + "grad_norm": 0.8427540923165535, + "learning_rate": 1.0511971075982172e-05, + "loss": 0.1026, + "step": 23810 + }, + { + "epoch": 2.823550337958022, + "grad_norm": 0.6894978833955688, + "learning_rate": 1.0510014951190016e-05, + "loss": 0.0792, + "step": 23811 + }, + { + "epoch": 2.8236689197201468, + "grad_norm": 0.9822146270141293, + "learning_rate": 1.0508058959977757e-05, + "loss": 0.1185, + "step": 23812 + }, + { + "epoch": 2.823787501482272, + "grad_norm": 0.80181772410509, + "learning_rate": 1.0506103102363424e-05, + "loss": 0.1085, + "step": 23813 + }, + { + "epoch": 2.823906083244397, + "grad_norm": 0.7185586803661643, + "learning_rate": 1.0504147378365054e-05, + "loss": 0.1051, + "step": 23814 + }, + { + "epoch": 2.824024665006522, + "grad_norm": 0.6696530326421929, + "learning_rate": 1.050219178800066e-05, + "loss": 0.0794, + "step": 23815 + }, + { + "epoch": 2.824143246768647, + "grad_norm": 0.8464557869025621, + "learning_rate": 1.0500236331288279e-05, + "loss": 0.0989, + "step": 23816 + }, + { + "epoch": 2.824261828530772, + "grad_norm": 0.6995856007810035, + "learning_rate": 1.0498281008245939e-05, + "loss": 0.0876, + "step": 23817 + }, + { + "epoch": 2.824380410292897, + "grad_norm": 0.6368290508054114, + "learning_rate": 1.0496325818891665e-05, + "loss": 0.0872, + "step": 23818 + }, + { + "epoch": 2.824498992055022, + "grad_norm": 0.6169550566606034, + "learning_rate": 1.0494370763243488e-05, + "loss": 0.0889, + "step": 23819 + }, + { + "epoch": 2.824617573817147, + "grad_norm": 0.619076235715699, + "learning_rate": 1.049241584131941e-05, + "loss": 0.0688, + "step": 23820 + }, + { + "epoch": 2.824736155579272, + "grad_norm": 0.7520098744252404, + "learning_rate": 1.0490461053137484e-05, + "loss": 0.0851, + "step": 23821 + }, + { + "epoch": 2.824854737341397, + "grad_norm": 0.5608413253767294, + "learning_rate": 1.0488506398715706e-05, + "loss": 0.0902, + "step": 23822 + }, + { + "epoch": 2.824973319103522, + "grad_norm": 0.6450508415494578, + "learning_rate": 1.0486551878072102e-05, + "loss": 0.0855, + "step": 23823 + }, + { + "epoch": 2.825091900865647, + "grad_norm": 0.9286545652755016, + "learning_rate": 1.0484597491224692e-05, + "loss": 0.1021, + "step": 23824 + }, + { + "epoch": 2.8252104826277717, + "grad_norm": 0.8498673646080401, + "learning_rate": 1.0482643238191495e-05, + "loss": 0.1145, + "step": 23825 + }, + { + "epoch": 2.825329064389897, + "grad_norm": 0.8883344213149525, + "learning_rate": 1.0480689118990534e-05, + "loss": 0.1127, + "step": 23826 + }, + { + "epoch": 2.8254476461520217, + "grad_norm": 0.7525265827856628, + "learning_rate": 1.0478735133639797e-05, + "loss": 0.09, + "step": 23827 + }, + { + "epoch": 2.825566227914147, + "grad_norm": 0.6367724667081325, + "learning_rate": 1.047678128215733e-05, + "loss": 0.0672, + "step": 23828 + }, + { + "epoch": 2.8256848096762717, + "grad_norm": 0.7502386189321952, + "learning_rate": 1.0474827564561124e-05, + "loss": 0.0894, + "step": 23829 + }, + { + "epoch": 2.825803391438397, + "grad_norm": 0.6622554884924331, + "learning_rate": 1.0472873980869205e-05, + "loss": 0.0991, + "step": 23830 + }, + { + "epoch": 2.8259219732005216, + "grad_norm": 0.591359000566455, + "learning_rate": 1.0470920531099554e-05, + "loss": 0.0794, + "step": 23831 + }, + { + "epoch": 2.826040554962647, + "grad_norm": 0.3076503251348632, + "learning_rate": 1.0468967215270217e-05, + "loss": 0.0435, + "step": 23832 + }, + { + "epoch": 2.8261591367247716, + "grad_norm": 1.114059832626202, + "learning_rate": 1.0467014033399175e-05, + "loss": 0.1425, + "step": 23833 + }, + { + "epoch": 2.826277718486897, + "grad_norm": 0.44655987779462797, + "learning_rate": 1.0465060985504441e-05, + "loss": 0.0601, + "step": 23834 + }, + { + "epoch": 2.8263963002490216, + "grad_norm": 0.4834944469500781, + "learning_rate": 1.0463108071604022e-05, + "loss": 0.0599, + "step": 23835 + }, + { + "epoch": 2.8265148820111468, + "grad_norm": 0.3989599496591911, + "learning_rate": 1.046115529171592e-05, + "loss": 0.0619, + "step": 23836 + }, + { + "epoch": 2.8266334637732715, + "grad_norm": 1.1756868264574083, + "learning_rate": 1.0459202645858147e-05, + "loss": 0.1007, + "step": 23837 + }, + { + "epoch": 2.8267520455353967, + "grad_norm": 0.6045675676316454, + "learning_rate": 1.0457250134048673e-05, + "loss": 0.0756, + "step": 23838 + }, + { + "epoch": 2.8268706272975215, + "grad_norm": 0.47017971613045006, + "learning_rate": 1.0455297756305544e-05, + "loss": 0.0704, + "step": 23839 + }, + { + "epoch": 2.8269892090596467, + "grad_norm": 0.6654461003755512, + "learning_rate": 1.045334551264672e-05, + "loss": 0.0697, + "step": 23840 + }, + { + "epoch": 2.8271077908217714, + "grad_norm": 1.0354167798014329, + "learning_rate": 1.045139340309022e-05, + "loss": 0.1403, + "step": 23841 + }, + { + "epoch": 2.8272263725838966, + "grad_norm": 0.568594961029302, + "learning_rate": 1.0449441427654017e-05, + "loss": 0.0834, + "step": 23842 + }, + { + "epoch": 2.827344954346022, + "grad_norm": 0.8309336289684611, + "learning_rate": 1.0447489586356138e-05, + "loss": 0.1019, + "step": 23843 + }, + { + "epoch": 2.8274635361081466, + "grad_norm": 0.6274543938531099, + "learning_rate": 1.0445537879214551e-05, + "loss": 0.0716, + "step": 23844 + }, + { + "epoch": 2.8275821178702714, + "grad_norm": 0.657835015360919, + "learning_rate": 1.0443586306247255e-05, + "loss": 0.0839, + "step": 23845 + }, + { + "epoch": 2.8277006996323966, + "grad_norm": 0.5293494906357262, + "learning_rate": 1.0441634867472243e-05, + "loss": 0.0814, + "step": 23846 + }, + { + "epoch": 2.8278192813945218, + "grad_norm": 0.46574178139668837, + "learning_rate": 1.043968356290751e-05, + "loss": 0.0615, + "step": 23847 + }, + { + "epoch": 2.8279378631566465, + "grad_norm": 0.7286369463435364, + "learning_rate": 1.0437732392571042e-05, + "loss": 0.0824, + "step": 23848 + }, + { + "epoch": 2.8280564449187713, + "grad_norm": 0.5752693349648198, + "learning_rate": 1.043578135648082e-05, + "loss": 0.0738, + "step": 23849 + }, + { + "epoch": 2.8281750266808965, + "grad_norm": 0.6043140416185935, + "learning_rate": 1.043383045465483e-05, + "loss": 0.0676, + "step": 23850 + }, + { + "epoch": 2.8282936084430217, + "grad_norm": 0.47594748481235505, + "learning_rate": 1.0431879687111065e-05, + "loss": 0.0571, + "step": 23851 + }, + { + "epoch": 2.8284121902051464, + "grad_norm": 0.6635051706046304, + "learning_rate": 1.042992905386751e-05, + "loss": 0.0818, + "step": 23852 + }, + { + "epoch": 2.828530771967271, + "grad_norm": 0.5951863375140738, + "learning_rate": 1.0427978554942125e-05, + "loss": 0.0741, + "step": 23853 + }, + { + "epoch": 2.8286493537293964, + "grad_norm": 0.9616685634313061, + "learning_rate": 1.042602819035293e-05, + "loss": 0.1182, + "step": 23854 + }, + { + "epoch": 2.8287679354915216, + "grad_norm": 0.7105290180572675, + "learning_rate": 1.042407796011787e-05, + "loss": 0.0881, + "step": 23855 + }, + { + "epoch": 2.8288865172536464, + "grad_norm": 0.6122452345503079, + "learning_rate": 1.0422127864254938e-05, + "loss": 0.0879, + "step": 23856 + }, + { + "epoch": 2.829005099015771, + "grad_norm": 0.5758624624548543, + "learning_rate": 1.042017790278211e-05, + "loss": 0.0723, + "step": 23857 + }, + { + "epoch": 2.8291236807778963, + "grad_norm": 0.8791302746185502, + "learning_rate": 1.0418228075717366e-05, + "loss": 0.0932, + "step": 23858 + }, + { + "epoch": 2.8292422625400215, + "grad_norm": 0.9570143430106155, + "learning_rate": 1.041627838307868e-05, + "loss": 0.1061, + "step": 23859 + }, + { + "epoch": 2.8293608443021463, + "grad_norm": 0.654801565816076, + "learning_rate": 1.0414328824884021e-05, + "loss": 0.0871, + "step": 23860 + }, + { + "epoch": 2.829479426064271, + "grad_norm": 0.9440362281955649, + "learning_rate": 1.0412379401151363e-05, + "loss": 0.1184, + "step": 23861 + }, + { + "epoch": 2.8295980078263963, + "grad_norm": 0.5993550330116183, + "learning_rate": 1.0410430111898678e-05, + "loss": 0.0732, + "step": 23862 + }, + { + "epoch": 2.8297165895885215, + "grad_norm": 0.7161942488625683, + "learning_rate": 1.0408480957143943e-05, + "loss": 0.0775, + "step": 23863 + }, + { + "epoch": 2.829835171350646, + "grad_norm": 0.8174603046694873, + "learning_rate": 1.0406531936905114e-05, + "loss": 0.1148, + "step": 23864 + }, + { + "epoch": 2.8299537531127714, + "grad_norm": 0.5752944660450884, + "learning_rate": 1.0404583051200164e-05, + "loss": 0.0732, + "step": 23865 + }, + { + "epoch": 2.830072334874896, + "grad_norm": 0.5389350007159895, + "learning_rate": 1.040263430004706e-05, + "loss": 0.0845, + "step": 23866 + }, + { + "epoch": 2.8301909166370214, + "grad_norm": 0.5715876327420878, + "learning_rate": 1.0400685683463768e-05, + "loss": 0.0838, + "step": 23867 + }, + { + "epoch": 2.830309498399146, + "grad_norm": 0.6762715475690781, + "learning_rate": 1.039873720146825e-05, + "loss": 0.0899, + "step": 23868 + }, + { + "epoch": 2.8304280801612713, + "grad_norm": 0.7409993526630868, + "learning_rate": 1.0396788854078471e-05, + "loss": 0.0946, + "step": 23869 + }, + { + "epoch": 2.830546661923396, + "grad_norm": 0.831702385202434, + "learning_rate": 1.0394840641312397e-05, + "loss": 0.1216, + "step": 23870 + }, + { + "epoch": 2.8306652436855213, + "grad_norm": 0.42206352145507037, + "learning_rate": 1.0392892563187978e-05, + "loss": 0.0544, + "step": 23871 + }, + { + "epoch": 2.830783825447646, + "grad_norm": 0.8437299822058303, + "learning_rate": 1.0390944619723175e-05, + "loss": 0.0816, + "step": 23872 + }, + { + "epoch": 2.8309024072097713, + "grad_norm": 0.8507037642354829, + "learning_rate": 1.0388996810935948e-05, + "loss": 0.1274, + "step": 23873 + }, + { + "epoch": 2.831020988971896, + "grad_norm": 0.6708538851759137, + "learning_rate": 1.038704913684425e-05, + "loss": 0.0829, + "step": 23874 + }, + { + "epoch": 2.8311395707340212, + "grad_norm": 0.9315242396404548, + "learning_rate": 1.0385101597466054e-05, + "loss": 0.1083, + "step": 23875 + }, + { + "epoch": 2.831258152496146, + "grad_norm": 0.8169635478804586, + "learning_rate": 1.0383154192819278e-05, + "loss": 0.1144, + "step": 23876 + }, + { + "epoch": 2.831376734258271, + "grad_norm": 0.5341460281162749, + "learning_rate": 1.0381206922921915e-05, + "loss": 0.0657, + "step": 23877 + }, + { + "epoch": 2.831495316020396, + "grad_norm": 0.6208984587473538, + "learning_rate": 1.0379259787791887e-05, + "loss": 0.0681, + "step": 23878 + }, + { + "epoch": 2.831613897782521, + "grad_norm": 0.5643034136665415, + "learning_rate": 1.0377312787447157e-05, + "loss": 0.0743, + "step": 23879 + }, + { + "epoch": 2.831732479544646, + "grad_norm": 0.5953667677463591, + "learning_rate": 1.0375365921905669e-05, + "loss": 0.0928, + "step": 23880 + }, + { + "epoch": 2.831851061306771, + "grad_norm": 0.6858551655467482, + "learning_rate": 1.0373419191185385e-05, + "loss": 0.0797, + "step": 23881 + }, + { + "epoch": 2.831969643068896, + "grad_norm": 0.8084799155212737, + "learning_rate": 1.037147259530423e-05, + "loss": 0.1146, + "step": 23882 + }, + { + "epoch": 2.832088224831021, + "grad_norm": 0.45192986787898004, + "learning_rate": 1.0369526134280156e-05, + "loss": 0.0603, + "step": 23883 + }, + { + "epoch": 2.832206806593146, + "grad_norm": 0.7441417646847788, + "learning_rate": 1.0367579808131112e-05, + "loss": 0.0949, + "step": 23884 + }, + { + "epoch": 2.832325388355271, + "grad_norm": 0.5514670166810305, + "learning_rate": 1.0365633616875039e-05, + "loss": 0.0875, + "step": 23885 + }, + { + "epoch": 2.832443970117396, + "grad_norm": 0.6724699922309714, + "learning_rate": 1.0363687560529889e-05, + "loss": 0.0882, + "step": 23886 + }, + { + "epoch": 2.832562551879521, + "grad_norm": 0.5255535645941433, + "learning_rate": 1.036174163911357e-05, + "loss": 0.0677, + "step": 23887 + }, + { + "epoch": 2.8326811336416458, + "grad_norm": 0.514037029291515, + "learning_rate": 1.0359795852644063e-05, + "loss": 0.061, + "step": 23888 + }, + { + "epoch": 2.832799715403771, + "grad_norm": 0.5272120367132883, + "learning_rate": 1.0357850201139274e-05, + "loss": 0.0737, + "step": 23889 + }, + { + "epoch": 2.8329182971658957, + "grad_norm": 0.637955102389592, + "learning_rate": 1.0355904684617162e-05, + "loss": 0.089, + "step": 23890 + }, + { + "epoch": 2.833036878928021, + "grad_norm": 0.7253321648041352, + "learning_rate": 1.0353959303095631e-05, + "loss": 0.0988, + "step": 23891 + }, + { + "epoch": 2.833155460690146, + "grad_norm": 0.4974976978143321, + "learning_rate": 1.0352014056592655e-05, + "loss": 0.0519, + "step": 23892 + }, + { + "epoch": 2.833274042452271, + "grad_norm": 0.6558966894672918, + "learning_rate": 1.0350068945126137e-05, + "loss": 0.0967, + "step": 23893 + }, + { + "epoch": 2.8333926242143956, + "grad_norm": 0.5273862134161543, + "learning_rate": 1.0348123968714019e-05, + "loss": 0.0773, + "step": 23894 + }, + { + "epoch": 2.833511205976521, + "grad_norm": 0.6460019524701236, + "learning_rate": 1.0346179127374228e-05, + "loss": 0.0799, + "step": 23895 + }, + { + "epoch": 2.833629787738646, + "grad_norm": 0.39327055331106797, + "learning_rate": 1.03442344211247e-05, + "loss": 0.0486, + "step": 23896 + }, + { + "epoch": 2.833748369500771, + "grad_norm": 0.5812844941222087, + "learning_rate": 1.0342289849983364e-05, + "loss": 0.0832, + "step": 23897 + }, + { + "epoch": 2.8338669512628956, + "grad_norm": 0.7535222003266849, + "learning_rate": 1.0340345413968125e-05, + "loss": 0.0858, + "step": 23898 + }, + { + "epoch": 2.8339855330250208, + "grad_norm": 0.6758957107772973, + "learning_rate": 1.033840111309694e-05, + "loss": 0.0831, + "step": 23899 + }, + { + "epoch": 2.834104114787146, + "grad_norm": 0.7164260731239147, + "learning_rate": 1.0336456947387713e-05, + "loss": 0.0946, + "step": 23900 + }, + { + "epoch": 2.8342226965492707, + "grad_norm": 0.7079710433109707, + "learning_rate": 1.0334512916858378e-05, + "loss": 0.1083, + "step": 23901 + }, + { + "epoch": 2.8343412783113955, + "grad_norm": 0.6393998190984466, + "learning_rate": 1.0332569021526831e-05, + "loss": 0.0803, + "step": 23902 + }, + { + "epoch": 2.8344598600735207, + "grad_norm": 0.4744838088730723, + "learning_rate": 1.0330625261411028e-05, + "loss": 0.0699, + "step": 23903 + }, + { + "epoch": 2.834578441835646, + "grad_norm": 1.1858049455653026, + "learning_rate": 1.0328681636528864e-05, + "loss": 0.1474, + "step": 23904 + }, + { + "epoch": 2.8346970235977706, + "grad_norm": 0.8535376442582071, + "learning_rate": 1.0326738146898266e-05, + "loss": 0.1151, + "step": 23905 + }, + { + "epoch": 2.8348156053598954, + "grad_norm": 0.6263769286788688, + "learning_rate": 1.0324794792537146e-05, + "loss": 0.0987, + "step": 23906 + }, + { + "epoch": 2.8349341871220206, + "grad_norm": 0.5796860149290024, + "learning_rate": 1.0322851573463425e-05, + "loss": 0.0699, + "step": 23907 + }, + { + "epoch": 2.835052768884146, + "grad_norm": 0.5089261828270222, + "learning_rate": 1.0320908489695019e-05, + "loss": 0.0782, + "step": 23908 + }, + { + "epoch": 2.8351713506462706, + "grad_norm": 0.5969611375505518, + "learning_rate": 1.0318965541249823e-05, + "loss": 0.0629, + "step": 23909 + }, + { + "epoch": 2.8352899324083953, + "grad_norm": 0.48131237264155025, + "learning_rate": 1.0317022728145776e-05, + "loss": 0.0653, + "step": 23910 + }, + { + "epoch": 2.8354085141705205, + "grad_norm": 0.7549415555820216, + "learning_rate": 1.0315080050400767e-05, + "loss": 0.1028, + "step": 23911 + }, + { + "epoch": 2.8355270959326457, + "grad_norm": 0.6088014840042304, + "learning_rate": 1.0313137508032719e-05, + "loss": 0.0829, + "step": 23912 + }, + { + "epoch": 2.8356456776947705, + "grad_norm": 0.6381406253988157, + "learning_rate": 1.0311195101059515e-05, + "loss": 0.1005, + "step": 23913 + }, + { + "epoch": 2.8357642594568953, + "grad_norm": 0.8612422583145198, + "learning_rate": 1.03092528294991e-05, + "loss": 0.1367, + "step": 23914 + }, + { + "epoch": 2.8358828412190205, + "grad_norm": 0.3828778965057416, + "learning_rate": 1.0307310693369349e-05, + "loss": 0.048, + "step": 23915 + }, + { + "epoch": 2.8360014229811457, + "grad_norm": 0.8292028013196637, + "learning_rate": 1.0305368692688174e-05, + "loss": 0.1068, + "step": 23916 + }, + { + "epoch": 2.8361200047432704, + "grad_norm": 0.7441650198310613, + "learning_rate": 1.030342682747348e-05, + "loss": 0.0889, + "step": 23917 + }, + { + "epoch": 2.8362385865053956, + "grad_norm": 0.6611667484493016, + "learning_rate": 1.0301485097743172e-05, + "loss": 0.0823, + "step": 23918 + }, + { + "epoch": 2.8363571682675204, + "grad_norm": 0.680203172577958, + "learning_rate": 1.0299543503515152e-05, + "loss": 0.1043, + "step": 23919 + }, + { + "epoch": 2.8364757500296456, + "grad_norm": 0.7839093428030047, + "learning_rate": 1.029760204480731e-05, + "loss": 0.125, + "step": 23920 + }, + { + "epoch": 2.8365943317917703, + "grad_norm": 0.6266849957507009, + "learning_rate": 1.0295660721637544e-05, + "loss": 0.0865, + "step": 23921 + }, + { + "epoch": 2.8367129135538955, + "grad_norm": 0.48774326787941236, + "learning_rate": 1.0293719534023754e-05, + "loss": 0.0665, + "step": 23922 + }, + { + "epoch": 2.8368314953160203, + "grad_norm": 0.6157500529971958, + "learning_rate": 1.0291778481983838e-05, + "loss": 0.0793, + "step": 23923 + }, + { + "epoch": 2.8369500770781455, + "grad_norm": 0.5380836387268747, + "learning_rate": 1.0289837565535686e-05, + "loss": 0.0738, + "step": 23924 + }, + { + "epoch": 2.8370686588402703, + "grad_norm": 0.7066844349741134, + "learning_rate": 1.0287896784697193e-05, + "loss": 0.0613, + "step": 23925 + }, + { + "epoch": 2.8371872406023955, + "grad_norm": 0.4448555396475084, + "learning_rate": 1.0285956139486261e-05, + "loss": 0.0567, + "step": 23926 + }, + { + "epoch": 2.8373058223645202, + "grad_norm": 0.502845026498411, + "learning_rate": 1.0284015629920762e-05, + "loss": 0.0711, + "step": 23927 + }, + { + "epoch": 2.8374244041266454, + "grad_norm": 0.9194352149490268, + "learning_rate": 1.028207525601859e-05, + "loss": 0.1157, + "step": 23928 + }, + { + "epoch": 2.83754298588877, + "grad_norm": 0.8148818434841729, + "learning_rate": 1.0280135017797637e-05, + "loss": 0.1016, + "step": 23929 + }, + { + "epoch": 2.8376615676508954, + "grad_norm": 0.44804442941584, + "learning_rate": 1.02781949152758e-05, + "loss": 0.0551, + "step": 23930 + }, + { + "epoch": 2.83778014941302, + "grad_norm": 0.8932978188866842, + "learning_rate": 1.0276254948470939e-05, + "loss": 0.0941, + "step": 23931 + }, + { + "epoch": 2.8378987311751454, + "grad_norm": 0.5539383469381187, + "learning_rate": 1.0274315117400954e-05, + "loss": 0.0683, + "step": 23932 + }, + { + "epoch": 2.83801731293727, + "grad_norm": 0.47841123887840625, + "learning_rate": 1.0272375422083728e-05, + "loss": 0.0625, + "step": 23933 + }, + { + "epoch": 2.8381358946993953, + "grad_norm": 0.6250223569734594, + "learning_rate": 1.0270435862537136e-05, + "loss": 0.0868, + "step": 23934 + }, + { + "epoch": 2.83825447646152, + "grad_norm": 0.6386409162286979, + "learning_rate": 1.0268496438779075e-05, + "loss": 0.087, + "step": 23935 + }, + { + "epoch": 2.8383730582236453, + "grad_norm": 0.7736975855041484, + "learning_rate": 1.0266557150827394e-05, + "loss": 0.0948, + "step": 23936 + }, + { + "epoch": 2.83849163998577, + "grad_norm": 0.5697347286669494, + "learning_rate": 1.0264617998700005e-05, + "loss": 0.0732, + "step": 23937 + }, + { + "epoch": 2.8386102217478952, + "grad_norm": 0.53808592094386, + "learning_rate": 1.0262678982414761e-05, + "loss": 0.074, + "step": 23938 + }, + { + "epoch": 2.83872880351002, + "grad_norm": 0.47576149678951407, + "learning_rate": 1.0260740101989544e-05, + "loss": 0.0639, + "step": 23939 + }, + { + "epoch": 2.838847385272145, + "grad_norm": 0.5040723987323494, + "learning_rate": 1.0258801357442227e-05, + "loss": 0.0603, + "step": 23940 + }, + { + "epoch": 2.8389659670342704, + "grad_norm": 0.7002232679513348, + "learning_rate": 1.0256862748790696e-05, + "loss": 0.0768, + "step": 23941 + }, + { + "epoch": 2.839084548796395, + "grad_norm": 0.5346125102183429, + "learning_rate": 1.0254924276052803e-05, + "loss": 0.075, + "step": 23942 + }, + { + "epoch": 2.83920313055852, + "grad_norm": 0.7389926323608602, + "learning_rate": 1.0252985939246423e-05, + "loss": 0.1082, + "step": 23943 + }, + { + "epoch": 2.839321712320645, + "grad_norm": 0.878076152691137, + "learning_rate": 1.025104773838943e-05, + "loss": 0.0915, + "step": 23944 + }, + { + "epoch": 2.8394402940827703, + "grad_norm": 0.3806550559911846, + "learning_rate": 1.024910967349969e-05, + "loss": 0.0461, + "step": 23945 + }, + { + "epoch": 2.839558875844895, + "grad_norm": 0.47838596822270063, + "learning_rate": 1.0247171744595083e-05, + "loss": 0.0664, + "step": 23946 + }, + { + "epoch": 2.83967745760702, + "grad_norm": 0.5014663678948579, + "learning_rate": 1.024523395169344e-05, + "loss": 0.0662, + "step": 23947 + }, + { + "epoch": 2.839796039369145, + "grad_norm": 0.5569509765815179, + "learning_rate": 1.0243296294812665e-05, + "loss": 0.0719, + "step": 23948 + }, + { + "epoch": 2.8399146211312702, + "grad_norm": 0.4149761871504751, + "learning_rate": 1.0241358773970594e-05, + "loss": 0.0486, + "step": 23949 + }, + { + "epoch": 2.840033202893395, + "grad_norm": 0.91323786596332, + "learning_rate": 1.0239421389185105e-05, + "loss": 0.1125, + "step": 23950 + }, + { + "epoch": 2.8401517846555198, + "grad_norm": 0.5223051657930876, + "learning_rate": 1.0237484140474032e-05, + "loss": 0.0618, + "step": 23951 + }, + { + "epoch": 2.840270366417645, + "grad_norm": 0.6121187230327332, + "learning_rate": 1.0235547027855272e-05, + "loss": 0.0642, + "step": 23952 + }, + { + "epoch": 2.84038894817977, + "grad_norm": 0.9551302415132396, + "learning_rate": 1.0233610051346651e-05, + "loss": 0.1016, + "step": 23953 + }, + { + "epoch": 2.840507529941895, + "grad_norm": 0.532602163258562, + "learning_rate": 1.023167321096604e-05, + "loss": 0.0743, + "step": 23954 + }, + { + "epoch": 2.8406261117040197, + "grad_norm": 0.5411833206582843, + "learning_rate": 1.022973650673129e-05, + "loss": 0.0686, + "step": 23955 + }, + { + "epoch": 2.840744693466145, + "grad_norm": 0.46044813890557656, + "learning_rate": 1.022779993866026e-05, + "loss": 0.0653, + "step": 23956 + }, + { + "epoch": 2.84086327522827, + "grad_norm": 0.4707832520305943, + "learning_rate": 1.0225863506770805e-05, + "loss": 0.049, + "step": 23957 + }, + { + "epoch": 2.840981856990395, + "grad_norm": 0.7903645244101684, + "learning_rate": 1.0223927211080755e-05, + "loss": 0.1019, + "step": 23958 + }, + { + "epoch": 2.8411004387525196, + "grad_norm": 0.7070056403405238, + "learning_rate": 1.0221991051607996e-05, + "loss": 0.0989, + "step": 23959 + }, + { + "epoch": 2.841219020514645, + "grad_norm": 0.5838097140353159, + "learning_rate": 1.0220055028370346e-05, + "loss": 0.0734, + "step": 23960 + }, + { + "epoch": 2.84133760227677, + "grad_norm": 0.8197295434400714, + "learning_rate": 1.0218119141385674e-05, + "loss": 0.1043, + "step": 23961 + }, + { + "epoch": 2.8414561840388948, + "grad_norm": 0.5832662602142006, + "learning_rate": 1.02161833906718e-05, + "loss": 0.0673, + "step": 23962 + }, + { + "epoch": 2.8415747658010195, + "grad_norm": 0.6250810776504498, + "learning_rate": 1.0214247776246602e-05, + "loss": 0.0651, + "step": 23963 + }, + { + "epoch": 2.8416933475631447, + "grad_norm": 0.7453008462835679, + "learning_rate": 1.0212312298127901e-05, + "loss": 0.1017, + "step": 23964 + }, + { + "epoch": 2.84181192932527, + "grad_norm": 0.8207908539945641, + "learning_rate": 1.0210376956333544e-05, + "loss": 0.0949, + "step": 23965 + }, + { + "epoch": 2.8419305110873947, + "grad_norm": 0.5801251463945364, + "learning_rate": 1.0208441750881378e-05, + "loss": 0.0744, + "step": 23966 + }, + { + "epoch": 2.84204909284952, + "grad_norm": 0.5280818153748221, + "learning_rate": 1.0206506681789241e-05, + "loss": 0.0722, + "step": 23967 + }, + { + "epoch": 2.8421676746116447, + "grad_norm": 0.6006366351068599, + "learning_rate": 1.0204571749074979e-05, + "loss": 0.064, + "step": 23968 + }, + { + "epoch": 2.84228625637377, + "grad_norm": 0.6410921227420314, + "learning_rate": 1.0202636952756403e-05, + "loss": 0.0923, + "step": 23969 + }, + { + "epoch": 2.8424048381358946, + "grad_norm": 0.45675159469623716, + "learning_rate": 1.0200702292851388e-05, + "loss": 0.0535, + "step": 23970 + }, + { + "epoch": 2.84252341989802, + "grad_norm": 0.4798903718979398, + "learning_rate": 1.019876776937774e-05, + "loss": 0.064, + "step": 23971 + }, + { + "epoch": 2.8426420016601446, + "grad_norm": 0.8584794018575118, + "learning_rate": 1.0196833382353304e-05, + "loss": 0.1069, + "step": 23972 + }, + { + "epoch": 2.84276058342227, + "grad_norm": 0.6174608486911981, + "learning_rate": 1.019489913179591e-05, + "loss": 0.0699, + "step": 23973 + }, + { + "epoch": 2.8428791651843945, + "grad_norm": 0.5334232897111003, + "learning_rate": 1.0192965017723392e-05, + "loss": 0.0667, + "step": 23974 + }, + { + "epoch": 2.8429977469465197, + "grad_norm": 0.4602354000315843, + "learning_rate": 1.0191031040153588e-05, + "loss": 0.066, + "step": 23975 + }, + { + "epoch": 2.8431163287086445, + "grad_norm": 0.5889784314633367, + "learning_rate": 1.0189097199104305e-05, + "loss": 0.0726, + "step": 23976 + }, + { + "epoch": 2.8432349104707697, + "grad_norm": 0.8190642171121745, + "learning_rate": 1.0187163494593388e-05, + "loss": 0.1093, + "step": 23977 + }, + { + "epoch": 2.8433534922328945, + "grad_norm": 0.7146812654919443, + "learning_rate": 1.0185229926638657e-05, + "loss": 0.0931, + "step": 23978 + }, + { + "epoch": 2.8434720739950197, + "grad_norm": 0.635223554146891, + "learning_rate": 1.0183296495257949e-05, + "loss": 0.0665, + "step": 23979 + }, + { + "epoch": 2.8435906557571444, + "grad_norm": 0.9419932088698812, + "learning_rate": 1.018136320046906e-05, + "loss": 0.1189, + "step": 23980 + }, + { + "epoch": 2.8437092375192696, + "grad_norm": 0.3987769658698771, + "learning_rate": 1.0179430042289848e-05, + "loss": 0.0555, + "step": 23981 + }, + { + "epoch": 2.8438278192813944, + "grad_norm": 0.6070125107854947, + "learning_rate": 1.0177497020738105e-05, + "loss": 0.0787, + "step": 23982 + }, + { + "epoch": 2.8439464010435196, + "grad_norm": 0.6748796027164554, + "learning_rate": 1.0175564135831667e-05, + "loss": 0.0957, + "step": 23983 + }, + { + "epoch": 2.8440649828056443, + "grad_norm": 0.5518636857473262, + "learning_rate": 1.0173631387588348e-05, + "loss": 0.0587, + "step": 23984 + }, + { + "epoch": 2.8441835645677696, + "grad_norm": 1.1040740114174112, + "learning_rate": 1.0171698776025964e-05, + "loss": 0.1576, + "step": 23985 + }, + { + "epoch": 2.8443021463298943, + "grad_norm": 0.4997275720569531, + "learning_rate": 1.0169766301162345e-05, + "loss": 0.0617, + "step": 23986 + }, + { + "epoch": 2.8444207280920195, + "grad_norm": 0.6386900406121399, + "learning_rate": 1.0167833963015286e-05, + "loss": 0.0969, + "step": 23987 + }, + { + "epoch": 2.8445393098541443, + "grad_norm": 0.5844213818357292, + "learning_rate": 1.0165901761602608e-05, + "loss": 0.079, + "step": 23988 + }, + { + "epoch": 2.8446578916162695, + "grad_norm": 0.5508786141029828, + "learning_rate": 1.0163969696942124e-05, + "loss": 0.0714, + "step": 23989 + }, + { + "epoch": 2.8447764733783947, + "grad_norm": 0.9247235193753938, + "learning_rate": 1.0162037769051658e-05, + "loss": 0.1466, + "step": 23990 + }, + { + "epoch": 2.8448950551405194, + "grad_norm": 0.7796931044349732, + "learning_rate": 1.0160105977948997e-05, + "loss": 0.0914, + "step": 23991 + }, + { + "epoch": 2.845013636902644, + "grad_norm": 0.6409777536421586, + "learning_rate": 1.0158174323651961e-05, + "loss": 0.0807, + "step": 23992 + }, + { + "epoch": 2.8451322186647694, + "grad_norm": 0.5449501319610986, + "learning_rate": 1.0156242806178359e-05, + "loss": 0.0712, + "step": 23993 + }, + { + "epoch": 2.8452508004268946, + "grad_norm": 0.5568931224460306, + "learning_rate": 1.0154311425545995e-05, + "loss": 0.0836, + "step": 23994 + }, + { + "epoch": 2.8453693821890194, + "grad_norm": 0.6567089239148143, + "learning_rate": 1.0152380181772676e-05, + "loss": 0.0793, + "step": 23995 + }, + { + "epoch": 2.845487963951144, + "grad_norm": 0.5041530108061812, + "learning_rate": 1.0150449074876202e-05, + "loss": 0.0618, + "step": 23996 + }, + { + "epoch": 2.8456065457132693, + "grad_norm": 0.5465825745389942, + "learning_rate": 1.0148518104874389e-05, + "loss": 0.081, + "step": 23997 + }, + { + "epoch": 2.8457251274753945, + "grad_norm": 0.7246228507703061, + "learning_rate": 1.0146587271785015e-05, + "loss": 0.1046, + "step": 23998 + }, + { + "epoch": 2.8458437092375193, + "grad_norm": 0.5051293376784369, + "learning_rate": 1.0144656575625894e-05, + "loss": 0.0597, + "step": 23999 + }, + { + "epoch": 2.845962290999644, + "grad_norm": 1.013033743988622, + "learning_rate": 1.0142726016414822e-05, + "loss": 0.1077, + "step": 24000 + }, + { + "epoch": 2.8460808727617692, + "grad_norm": 0.6252876471345057, + "learning_rate": 1.0140795594169608e-05, + "loss": 0.0861, + "step": 24001 + }, + { + "epoch": 2.8461994545238944, + "grad_norm": 0.6675624777836318, + "learning_rate": 1.0138865308908025e-05, + "loss": 0.0803, + "step": 24002 + }, + { + "epoch": 2.846318036286019, + "grad_norm": 0.591505920674996, + "learning_rate": 1.0136935160647881e-05, + "loss": 0.0786, + "step": 24003 + }, + { + "epoch": 2.846436618048144, + "grad_norm": 0.8704919388292923, + "learning_rate": 1.013500514940697e-05, + "loss": 0.123, + "step": 24004 + }, + { + "epoch": 2.846555199810269, + "grad_norm": 1.072829563947362, + "learning_rate": 1.013307527520308e-05, + "loss": 0.129, + "step": 24005 + }, + { + "epoch": 2.8466737815723944, + "grad_norm": 0.45592659702893057, + "learning_rate": 1.0131145538054015e-05, + "loss": 0.0614, + "step": 24006 + }, + { + "epoch": 2.846792363334519, + "grad_norm": 0.8146596288953463, + "learning_rate": 1.012921593797754e-05, + "loss": 0.1135, + "step": 24007 + }, + { + "epoch": 2.846910945096644, + "grad_norm": 0.7852036697079973, + "learning_rate": 1.0127286474991474e-05, + "loss": 0.1001, + "step": 24008 + }, + { + "epoch": 2.847029526858769, + "grad_norm": 0.4974874526252747, + "learning_rate": 1.012535714911358e-05, + "loss": 0.058, + "step": 24009 + }, + { + "epoch": 2.8471481086208943, + "grad_norm": 0.5246418325998012, + "learning_rate": 1.012342796036165e-05, + "loss": 0.0758, + "step": 24010 + }, + { + "epoch": 2.847266690383019, + "grad_norm": 0.9840976015619735, + "learning_rate": 1.0121498908753474e-05, + "loss": 0.1272, + "step": 24011 + }, + { + "epoch": 2.847385272145144, + "grad_norm": 0.7023104327309884, + "learning_rate": 1.0119569994306841e-05, + "loss": 0.093, + "step": 24012 + }, + { + "epoch": 2.847503853907269, + "grad_norm": 0.6782208223548045, + "learning_rate": 1.0117641217039514e-05, + "loss": 0.0942, + "step": 24013 + }, + { + "epoch": 2.847622435669394, + "grad_norm": 0.7229920448765793, + "learning_rate": 1.0115712576969289e-05, + "loss": 0.0949, + "step": 24014 + }, + { + "epoch": 2.847741017431519, + "grad_norm": 0.724058759429099, + "learning_rate": 1.0113784074113939e-05, + "loss": 0.0913, + "step": 24015 + }, + { + "epoch": 2.847859599193644, + "grad_norm": 0.8651456597557529, + "learning_rate": 1.0111855708491245e-05, + "loss": 0.1216, + "step": 24016 + }, + { + "epoch": 2.847978180955769, + "grad_norm": 0.9045692971106987, + "learning_rate": 1.0109927480118995e-05, + "loss": 0.1294, + "step": 24017 + }, + { + "epoch": 2.848096762717894, + "grad_norm": 0.6262334308613537, + "learning_rate": 1.0107999389014936e-05, + "loss": 0.0889, + "step": 24018 + }, + { + "epoch": 2.848215344480019, + "grad_norm": 0.49963605947624196, + "learning_rate": 1.0106071435196875e-05, + "loss": 0.0639, + "step": 24019 + }, + { + "epoch": 2.848333926242144, + "grad_norm": 0.88741196038654, + "learning_rate": 1.0104143618682568e-05, + "loss": 0.1151, + "step": 24020 + }, + { + "epoch": 2.848452508004269, + "grad_norm": 0.6858899487686075, + "learning_rate": 1.0102215939489796e-05, + "loss": 0.0839, + "step": 24021 + }, + { + "epoch": 2.848571089766394, + "grad_norm": 0.737741739323823, + "learning_rate": 1.0100288397636306e-05, + "loss": 0.0962, + "step": 24022 + }, + { + "epoch": 2.848689671528519, + "grad_norm": 0.6530872379917853, + "learning_rate": 1.0098360993139907e-05, + "loss": 0.082, + "step": 24023 + }, + { + "epoch": 2.848808253290644, + "grad_norm": 0.4291425515920265, + "learning_rate": 1.0096433726018333e-05, + "loss": 0.0675, + "step": 24024 + }, + { + "epoch": 2.848926835052769, + "grad_norm": 0.5678797444325326, + "learning_rate": 1.0094506596289366e-05, + "loss": 0.0743, + "step": 24025 + }, + { + "epoch": 2.849045416814894, + "grad_norm": 0.7339713701644687, + "learning_rate": 1.009257960397077e-05, + "loss": 0.0989, + "step": 24026 + }, + { + "epoch": 2.8491639985770187, + "grad_norm": 1.1796523123389735, + "learning_rate": 1.0090652749080309e-05, + "loss": 0.1405, + "step": 24027 + }, + { + "epoch": 2.849282580339144, + "grad_norm": 0.7144227104054262, + "learning_rate": 1.0088726031635756e-05, + "loss": 0.0831, + "step": 24028 + }, + { + "epoch": 2.8494011621012687, + "grad_norm": 0.5869205453047858, + "learning_rate": 1.0086799451654846e-05, + "loss": 0.0878, + "step": 24029 + }, + { + "epoch": 2.849519743863394, + "grad_norm": 0.7539710582129604, + "learning_rate": 1.0084873009155374e-05, + "loss": 0.0827, + "step": 24030 + }, + { + "epoch": 2.8496383256255187, + "grad_norm": 0.6055103163663869, + "learning_rate": 1.0082946704155074e-05, + "loss": 0.0724, + "step": 24031 + }, + { + "epoch": 2.849756907387644, + "grad_norm": 0.5407502291173129, + "learning_rate": 1.0081020536671715e-05, + "loss": 0.0762, + "step": 24032 + }, + { + "epoch": 2.8498754891497686, + "grad_norm": 0.767563288299795, + "learning_rate": 1.007909450672305e-05, + "loss": 0.0997, + "step": 24033 + }, + { + "epoch": 2.849994070911894, + "grad_norm": 0.7702980568138958, + "learning_rate": 1.0077168614326835e-05, + "loss": 0.0922, + "step": 24034 + }, + { + "epoch": 2.8501126526740186, + "grad_norm": 0.5734351076762108, + "learning_rate": 1.0075242859500836e-05, + "loss": 0.0858, + "step": 24035 + }, + { + "epoch": 2.850231234436144, + "grad_norm": 0.6826821688434936, + "learning_rate": 1.0073317242262787e-05, + "loss": 0.0744, + "step": 24036 + }, + { + "epoch": 2.8503498161982685, + "grad_norm": 0.639258115580154, + "learning_rate": 1.007139176263045e-05, + "loss": 0.0848, + "step": 24037 + }, + { + "epoch": 2.8504683979603938, + "grad_norm": 0.4412251948176937, + "learning_rate": 1.0069466420621574e-05, + "loss": 0.0692, + "step": 24038 + }, + { + "epoch": 2.8505869797225185, + "grad_norm": 0.6581493807730258, + "learning_rate": 1.0067541216253915e-05, + "loss": 0.0897, + "step": 24039 + }, + { + "epoch": 2.8507055614846437, + "grad_norm": 0.6539868849079093, + "learning_rate": 1.0065616149545198e-05, + "loss": 0.1076, + "step": 24040 + }, + { + "epoch": 2.8508241432467685, + "grad_norm": 0.5440916203342356, + "learning_rate": 1.0063691220513205e-05, + "loss": 0.0666, + "step": 24041 + }, + { + "epoch": 2.8509427250088937, + "grad_norm": 0.5529182073065049, + "learning_rate": 1.0061766429175654e-05, + "loss": 0.0824, + "step": 24042 + }, + { + "epoch": 2.851061306771019, + "grad_norm": 0.46698807293926997, + "learning_rate": 1.0059841775550297e-05, + "loss": 0.057, + "step": 24043 + }, + { + "epoch": 2.8511798885331436, + "grad_norm": 0.4642022540943324, + "learning_rate": 1.0057917259654878e-05, + "loss": 0.0608, + "step": 24044 + }, + { + "epoch": 2.8512984702952684, + "grad_norm": 0.5802511482244278, + "learning_rate": 1.005599288150714e-05, + "loss": 0.0749, + "step": 24045 + }, + { + "epoch": 2.8514170520573936, + "grad_norm": 0.645199288577071, + "learning_rate": 1.0054068641124832e-05, + "loss": 0.0992, + "step": 24046 + }, + { + "epoch": 2.851535633819519, + "grad_norm": 0.5213131212094146, + "learning_rate": 1.005214453852567e-05, + "loss": 0.0666, + "step": 24047 + }, + { + "epoch": 2.8516542155816436, + "grad_norm": 0.936511753514155, + "learning_rate": 1.005022057372741e-05, + "loss": 0.1134, + "step": 24048 + }, + { + "epoch": 2.8517727973437683, + "grad_norm": 0.7262641717955587, + "learning_rate": 1.004829674674778e-05, + "loss": 0.0994, + "step": 24049 + }, + { + "epoch": 2.8518913791058935, + "grad_norm": 0.588541036809643, + "learning_rate": 1.0046373057604527e-05, + "loss": 0.0825, + "step": 24050 + }, + { + "epoch": 2.8520099608680187, + "grad_norm": 0.7462423491197995, + "learning_rate": 1.0044449506315363e-05, + "loss": 0.1273, + "step": 24051 + }, + { + "epoch": 2.8521285426301435, + "grad_norm": 0.9774372263095188, + "learning_rate": 1.0042526092898049e-05, + "loss": 0.1412, + "step": 24052 + }, + { + "epoch": 2.8522471243922682, + "grad_norm": 0.7189558083380084, + "learning_rate": 1.0040602817370298e-05, + "loss": 0.0854, + "step": 24053 + }, + { + "epoch": 2.8523657061543934, + "grad_norm": 0.5383710059016054, + "learning_rate": 1.003867967974984e-05, + "loss": 0.0587, + "step": 24054 + }, + { + "epoch": 2.8524842879165186, + "grad_norm": 0.5805936810253558, + "learning_rate": 1.0036756680054413e-05, + "loss": 0.0756, + "step": 24055 + }, + { + "epoch": 2.8526028696786434, + "grad_norm": 0.7105309346826402, + "learning_rate": 1.0034833818301737e-05, + "loss": 0.0999, + "step": 24056 + }, + { + "epoch": 2.852721451440768, + "grad_norm": 0.4838395474129487, + "learning_rate": 1.0032911094509551e-05, + "loss": 0.0636, + "step": 24057 + }, + { + "epoch": 2.8528400332028934, + "grad_norm": 0.9611134887800568, + "learning_rate": 1.0030988508695564e-05, + "loss": 0.1232, + "step": 24058 + }, + { + "epoch": 2.8529586149650186, + "grad_norm": 1.0876229636779122, + "learning_rate": 1.0029066060877504e-05, + "loss": 0.1657, + "step": 24059 + }, + { + "epoch": 2.8530771967271433, + "grad_norm": 0.5620251391804836, + "learning_rate": 1.0027143751073098e-05, + "loss": 0.0792, + "step": 24060 + }, + { + "epoch": 2.853195778489268, + "grad_norm": 0.6594157439714008, + "learning_rate": 1.0025221579300074e-05, + "loss": 0.0789, + "step": 24061 + }, + { + "epoch": 2.8533143602513933, + "grad_norm": 0.805588777414057, + "learning_rate": 1.0023299545576134e-05, + "loss": 0.1242, + "step": 24062 + }, + { + "epoch": 2.8534329420135185, + "grad_norm": 0.6899835819368925, + "learning_rate": 1.0021377649919008e-05, + "loss": 0.0856, + "step": 24063 + }, + { + "epoch": 2.8535515237756433, + "grad_norm": 0.45644309736601407, + "learning_rate": 1.0019455892346413e-05, + "loss": 0.0624, + "step": 24064 + }, + { + "epoch": 2.8536701055377685, + "grad_norm": 0.46881098713939434, + "learning_rate": 1.0017534272876062e-05, + "loss": 0.0623, + "step": 24065 + }, + { + "epoch": 2.853788687299893, + "grad_norm": 0.6268016992172027, + "learning_rate": 1.0015612791525673e-05, + "loss": 0.0765, + "step": 24066 + }, + { + "epoch": 2.8539072690620184, + "grad_norm": 0.5547002108196517, + "learning_rate": 1.0013691448312959e-05, + "loss": 0.0815, + "step": 24067 + }, + { + "epoch": 2.854025850824143, + "grad_norm": 0.553010036006762, + "learning_rate": 1.0011770243255642e-05, + "loss": 0.0872, + "step": 24068 + }, + { + "epoch": 2.8541444325862684, + "grad_norm": 0.5390776528284686, + "learning_rate": 1.0009849176371414e-05, + "loss": 0.0699, + "step": 24069 + }, + { + "epoch": 2.854263014348393, + "grad_norm": 0.4408615171981366, + "learning_rate": 1.0007928247677995e-05, + "loss": 0.0614, + "step": 24070 + }, + { + "epoch": 2.8543815961105183, + "grad_norm": 0.6683513615230656, + "learning_rate": 1.0006007457193092e-05, + "loss": 0.0993, + "step": 24071 + }, + { + "epoch": 2.854500177872643, + "grad_norm": 0.587524095250223, + "learning_rate": 1.0004086804934424e-05, + "loss": 0.0907, + "step": 24072 + }, + { + "epoch": 2.8546187596347683, + "grad_norm": 0.6783730755996218, + "learning_rate": 1.0002166290919675e-05, + "loss": 0.0926, + "step": 24073 + }, + { + "epoch": 2.854737341396893, + "grad_norm": 0.7050639611483905, + "learning_rate": 1.0000245915166564e-05, + "loss": 0.0945, + "step": 24074 + }, + { + "epoch": 2.8548559231590183, + "grad_norm": 0.6070280996927345, + "learning_rate": 9.998325677692791e-06, + "loss": 0.0806, + "step": 24075 + }, + { + "epoch": 2.854974504921143, + "grad_norm": 0.8333095311983663, + "learning_rate": 9.996405578516058e-06, + "loss": 0.1145, + "step": 24076 + }, + { + "epoch": 2.855093086683268, + "grad_norm": 0.6835317186527146, + "learning_rate": 9.994485617654075e-06, + "loss": 0.0873, + "step": 24077 + }, + { + "epoch": 2.855211668445393, + "grad_norm": 0.6006241097191253, + "learning_rate": 9.99256579512452e-06, + "loss": 0.0983, + "step": 24078 + }, + { + "epoch": 2.855330250207518, + "grad_norm": 0.47838598274090155, + "learning_rate": 9.990646110945117e-06, + "loss": 0.0633, + "step": 24079 + }, + { + "epoch": 2.855448831969643, + "grad_norm": 0.5752210214622134, + "learning_rate": 9.988726565133546e-06, + "loss": 0.0798, + "step": 24080 + }, + { + "epoch": 2.855567413731768, + "grad_norm": 0.900043518090903, + "learning_rate": 9.986807157707509e-06, + "loss": 0.0975, + "step": 24081 + }, + { + "epoch": 2.855685995493893, + "grad_norm": 0.544361985345438, + "learning_rate": 9.984887888684696e-06, + "loss": 0.0575, + "step": 24082 + }, + { + "epoch": 2.855804577256018, + "grad_norm": 0.5776394134567036, + "learning_rate": 9.982968758082809e-06, + "loss": 0.0786, + "step": 24083 + }, + { + "epoch": 2.855923159018143, + "grad_norm": 0.9940426045873706, + "learning_rate": 9.98104976591954e-06, + "loss": 0.1133, + "step": 24084 + }, + { + "epoch": 2.856041740780268, + "grad_norm": 0.6800152074460611, + "learning_rate": 9.979130912212556e-06, + "loss": 0.1, + "step": 24085 + }, + { + "epoch": 2.856160322542393, + "grad_norm": 0.5203857377003935, + "learning_rate": 9.977212196979585e-06, + "loss": 0.0736, + "step": 24086 + }, + { + "epoch": 2.856278904304518, + "grad_norm": 0.6724024176305442, + "learning_rate": 9.975293620238285e-06, + "loss": 0.0871, + "step": 24087 + }, + { + "epoch": 2.856397486066643, + "grad_norm": 0.5745557535206329, + "learning_rate": 9.973375182006364e-06, + "loss": 0.0726, + "step": 24088 + }, + { + "epoch": 2.856516067828768, + "grad_norm": 0.5115962196092065, + "learning_rate": 9.97145688230148e-06, + "loss": 0.0591, + "step": 24089 + }, + { + "epoch": 2.8566346495908927, + "grad_norm": 0.6939954466163035, + "learning_rate": 9.96953872114135e-06, + "loss": 0.0703, + "step": 24090 + }, + { + "epoch": 2.856753231353018, + "grad_norm": 0.5445136242860684, + "learning_rate": 9.967620698543634e-06, + "loss": 0.0718, + "step": 24091 + }, + { + "epoch": 2.856871813115143, + "grad_norm": 0.6766243697119227, + "learning_rate": 9.96570281452602e-06, + "loss": 0.0941, + "step": 24092 + }, + { + "epoch": 2.856990394877268, + "grad_norm": 0.7034276253037391, + "learning_rate": 9.963785069106191e-06, + "loss": 0.0933, + "step": 24093 + }, + { + "epoch": 2.8571089766393927, + "grad_norm": 0.870105787808026, + "learning_rate": 9.961867462301821e-06, + "loss": 0.1121, + "step": 24094 + }, + { + "epoch": 2.857227558401518, + "grad_norm": 0.5322807393106799, + "learning_rate": 9.959949994130605e-06, + "loss": 0.0606, + "step": 24095 + }, + { + "epoch": 2.857346140163643, + "grad_norm": 0.5025495942108675, + "learning_rate": 9.95803266461019e-06, + "loss": 0.059, + "step": 24096 + }, + { + "epoch": 2.857464721925768, + "grad_norm": 0.6905193755966604, + "learning_rate": 9.956115473758282e-06, + "loss": 0.0924, + "step": 24097 + }, + { + "epoch": 2.8575833036878926, + "grad_norm": 0.6302929367797483, + "learning_rate": 9.954198421592536e-06, + "loss": 0.0691, + "step": 24098 + }, + { + "epoch": 2.857701885450018, + "grad_norm": 0.8236918958825613, + "learning_rate": 9.952281508130638e-06, + "loss": 0.1109, + "step": 24099 + }, + { + "epoch": 2.857820467212143, + "grad_norm": 0.6540744397939461, + "learning_rate": 9.950364733390234e-06, + "loss": 0.089, + "step": 24100 + }, + { + "epoch": 2.8579390489742678, + "grad_norm": 0.5840022805973916, + "learning_rate": 9.948448097389027e-06, + "loss": 0.0825, + "step": 24101 + }, + { + "epoch": 2.8580576307363925, + "grad_norm": 0.5266410749202782, + "learning_rate": 9.946531600144665e-06, + "loss": 0.0521, + "step": 24102 + }, + { + "epoch": 2.8581762124985177, + "grad_norm": 0.7215864697066492, + "learning_rate": 9.944615241674821e-06, + "loss": 0.0979, + "step": 24103 + }, + { + "epoch": 2.858294794260643, + "grad_norm": 0.472997274458136, + "learning_rate": 9.942699021997162e-06, + "loss": 0.0526, + "step": 24104 + }, + { + "epoch": 2.8584133760227677, + "grad_norm": 0.7190395628033189, + "learning_rate": 9.940782941129357e-06, + "loss": 0.0973, + "step": 24105 + }, + { + "epoch": 2.8585319577848924, + "grad_norm": 0.8189768188516319, + "learning_rate": 9.938866999089072e-06, + "loss": 0.0756, + "step": 24106 + }, + { + "epoch": 2.8586505395470176, + "grad_norm": 0.6920278613100245, + "learning_rate": 9.936951195893954e-06, + "loss": 0.0935, + "step": 24107 + }, + { + "epoch": 2.858769121309143, + "grad_norm": 0.627683203899578, + "learning_rate": 9.935035531561677e-06, + "loss": 0.0835, + "step": 24108 + }, + { + "epoch": 2.8588877030712676, + "grad_norm": 0.6116784892471467, + "learning_rate": 9.933120006109897e-06, + "loss": 0.07, + "step": 24109 + }, + { + "epoch": 2.8590062848333924, + "grad_norm": 0.5863321968987643, + "learning_rate": 9.931204619556283e-06, + "loss": 0.0783, + "step": 24110 + }, + { + "epoch": 2.8591248665955176, + "grad_norm": 0.5347013619020896, + "learning_rate": 9.929289371918469e-06, + "loss": 0.0864, + "step": 24111 + }, + { + "epoch": 2.8592434483576428, + "grad_norm": 0.7888552231601472, + "learning_rate": 9.927374263214143e-06, + "loss": 0.0967, + "step": 24112 + }, + { + "epoch": 2.8593620301197675, + "grad_norm": 0.4908226659789212, + "learning_rate": 9.925459293460932e-06, + "loss": 0.0644, + "step": 24113 + }, + { + "epoch": 2.8594806118818927, + "grad_norm": 0.5444555198228115, + "learning_rate": 9.923544462676502e-06, + "loss": 0.0776, + "step": 24114 + }, + { + "epoch": 2.8595991936440175, + "grad_norm": 0.8212604064559308, + "learning_rate": 9.921629770878501e-06, + "loss": 0.1364, + "step": 24115 + }, + { + "epoch": 2.8597177754061427, + "grad_norm": 0.6529842507512087, + "learning_rate": 9.919715218084589e-06, + "loss": 0.0836, + "step": 24116 + }, + { + "epoch": 2.8598363571682675, + "grad_norm": 0.45999510917549863, + "learning_rate": 9.917800804312415e-06, + "loss": 0.0708, + "step": 24117 + }, + { + "epoch": 2.8599549389303927, + "grad_norm": 0.72678739822603, + "learning_rate": 9.915886529579616e-06, + "loss": 0.0856, + "step": 24118 + }, + { + "epoch": 2.8600735206925174, + "grad_norm": 0.7290533739588461, + "learning_rate": 9.913972393903841e-06, + "loss": 0.0874, + "step": 24119 + }, + { + "epoch": 2.8601921024546426, + "grad_norm": 0.7505935750769345, + "learning_rate": 9.912058397302745e-06, + "loss": 0.0886, + "step": 24120 + }, + { + "epoch": 2.8603106842167674, + "grad_norm": 0.8393869887417525, + "learning_rate": 9.910144539793977e-06, + "loss": 0.0927, + "step": 24121 + }, + { + "epoch": 2.8604292659788926, + "grad_norm": 0.4922425166239362, + "learning_rate": 9.908230821395154e-06, + "loss": 0.0665, + "step": 24122 + }, + { + "epoch": 2.8605478477410173, + "grad_norm": 0.7595122923551864, + "learning_rate": 9.906317242123953e-06, + "loss": 0.1002, + "step": 24123 + }, + { + "epoch": 2.8606664295031425, + "grad_norm": 0.5075036383948484, + "learning_rate": 9.90440380199799e-06, + "loss": 0.0627, + "step": 24124 + }, + { + "epoch": 2.8607850112652673, + "grad_norm": 0.7203883462977506, + "learning_rate": 9.902490501034914e-06, + "loss": 0.0972, + "step": 24125 + }, + { + "epoch": 2.8609035930273925, + "grad_norm": 0.5655566016494068, + "learning_rate": 9.900577339252359e-06, + "loss": 0.077, + "step": 24126 + }, + { + "epoch": 2.8610221747895173, + "grad_norm": 0.7027515442456616, + "learning_rate": 9.898664316667965e-06, + "loss": 0.0893, + "step": 24127 + }, + { + "epoch": 2.8611407565516425, + "grad_norm": 0.5293370893572891, + "learning_rate": 9.896751433299378e-06, + "loss": 0.0662, + "step": 24128 + }, + { + "epoch": 2.861259338313767, + "grad_norm": 0.5579814209907156, + "learning_rate": 9.894838689164213e-06, + "loss": 0.0728, + "step": 24129 + }, + { + "epoch": 2.8613779200758924, + "grad_norm": 0.7438631228855059, + "learning_rate": 9.89292608428011e-06, + "loss": 0.1066, + "step": 24130 + }, + { + "epoch": 2.861496501838017, + "grad_norm": 0.6972769604714664, + "learning_rate": 9.891013618664704e-06, + "loss": 0.0831, + "step": 24131 + }, + { + "epoch": 2.8616150836001424, + "grad_norm": 0.6154698726966229, + "learning_rate": 9.889101292335627e-06, + "loss": 0.0898, + "step": 24132 + }, + { + "epoch": 2.861733665362267, + "grad_norm": 0.5197741248370578, + "learning_rate": 9.88718910531051e-06, + "loss": 0.0762, + "step": 24133 + }, + { + "epoch": 2.8618522471243923, + "grad_norm": 0.5411774734615187, + "learning_rate": 9.885277057606962e-06, + "loss": 0.0732, + "step": 24134 + }, + { + "epoch": 2.861970828886517, + "grad_norm": 0.6867543019534519, + "learning_rate": 9.883365149242643e-06, + "loss": 0.0823, + "step": 24135 + }, + { + "epoch": 2.8620894106486423, + "grad_norm": 0.7224479588777071, + "learning_rate": 9.88145338023515e-06, + "loss": 0.0887, + "step": 24136 + }, + { + "epoch": 2.862207992410767, + "grad_norm": 0.6127206574345438, + "learning_rate": 9.879541750602117e-06, + "loss": 0.0868, + "step": 24137 + }, + { + "epoch": 2.8623265741728923, + "grad_norm": 0.8258416260857121, + "learning_rate": 9.877630260361167e-06, + "loss": 0.1043, + "step": 24138 + }, + { + "epoch": 2.862445155935017, + "grad_norm": 0.41295100837272497, + "learning_rate": 9.875718909529932e-06, + "loss": 0.061, + "step": 24139 + }, + { + "epoch": 2.8625637376971422, + "grad_norm": 0.5763640543657235, + "learning_rate": 9.873807698126015e-06, + "loss": 0.0775, + "step": 24140 + }, + { + "epoch": 2.8626823194592674, + "grad_norm": 0.8319656379056423, + "learning_rate": 9.871896626167038e-06, + "loss": 0.111, + "step": 24141 + }, + { + "epoch": 2.862800901221392, + "grad_norm": 0.5829408189802396, + "learning_rate": 9.869985693670625e-06, + "loss": 0.0757, + "step": 24142 + }, + { + "epoch": 2.862919482983517, + "grad_norm": 0.7330988159043462, + "learning_rate": 9.86807490065439e-06, + "loss": 0.1094, + "step": 24143 + }, + { + "epoch": 2.863038064745642, + "grad_norm": 0.591522891975986, + "learning_rate": 9.866164247135956e-06, + "loss": 0.0807, + "step": 24144 + }, + { + "epoch": 2.8631566465077674, + "grad_norm": 0.6887971950132535, + "learning_rate": 9.864253733132914e-06, + "loss": 0.0823, + "step": 24145 + }, + { + "epoch": 2.863275228269892, + "grad_norm": 0.878060688949358, + "learning_rate": 9.862343358662906e-06, + "loss": 0.1019, + "step": 24146 + }, + { + "epoch": 2.863393810032017, + "grad_norm": 0.45940666048763645, + "learning_rate": 9.860433123743523e-06, + "loss": 0.0653, + "step": 24147 + }, + { + "epoch": 2.863512391794142, + "grad_norm": 0.5160483812944235, + "learning_rate": 9.858523028392389e-06, + "loss": 0.0669, + "step": 24148 + }, + { + "epoch": 2.8636309735562673, + "grad_norm": 0.8016611304573242, + "learning_rate": 9.856613072627089e-06, + "loss": 0.0954, + "step": 24149 + }, + { + "epoch": 2.863749555318392, + "grad_norm": 0.46734790323244957, + "learning_rate": 9.854703256465261e-06, + "loss": 0.0579, + "step": 24150 + }, + { + "epoch": 2.863868137080517, + "grad_norm": 0.5924612066591324, + "learning_rate": 9.852793579924491e-06, + "loss": 0.0665, + "step": 24151 + }, + { + "epoch": 2.863986718842642, + "grad_norm": 0.4906981567333014, + "learning_rate": 9.850884043022388e-06, + "loss": 0.0748, + "step": 24152 + }, + { + "epoch": 2.864105300604767, + "grad_norm": 0.8395429240787958, + "learning_rate": 9.848974645776558e-06, + "loss": 0.1043, + "step": 24153 + }, + { + "epoch": 2.864223882366892, + "grad_norm": 0.6203977610021565, + "learning_rate": 9.847065388204602e-06, + "loss": 0.0751, + "step": 24154 + }, + { + "epoch": 2.8643424641290167, + "grad_norm": 0.563054215878696, + "learning_rate": 9.84515627032413e-06, + "loss": 0.0658, + "step": 24155 + }, + { + "epoch": 2.864461045891142, + "grad_norm": 0.7329314685790989, + "learning_rate": 9.843247292152713e-06, + "loss": 0.0932, + "step": 24156 + }, + { + "epoch": 2.864579627653267, + "grad_norm": 0.7162830799777715, + "learning_rate": 9.841338453707991e-06, + "loss": 0.0805, + "step": 24157 + }, + { + "epoch": 2.864698209415392, + "grad_norm": 0.6619546485922194, + "learning_rate": 9.839429755007529e-06, + "loss": 0.0902, + "step": 24158 + }, + { + "epoch": 2.8648167911775166, + "grad_norm": 0.6019981536524102, + "learning_rate": 9.837521196068941e-06, + "loss": 0.0801, + "step": 24159 + }, + { + "epoch": 2.864935372939642, + "grad_norm": 1.3354716538420044, + "learning_rate": 9.8356127769098e-06, + "loss": 0.1242, + "step": 24160 + }, + { + "epoch": 2.865053954701767, + "grad_norm": 0.528098300707396, + "learning_rate": 9.83370449754773e-06, + "loss": 0.0591, + "step": 24161 + }, + { + "epoch": 2.865172536463892, + "grad_norm": 0.7062508933255351, + "learning_rate": 9.831796358000295e-06, + "loss": 0.0802, + "step": 24162 + }, + { + "epoch": 2.8652911182260166, + "grad_norm": 0.6272868692815767, + "learning_rate": 9.8298883582851e-06, + "loss": 0.0742, + "step": 24163 + }, + { + "epoch": 2.8654096999881418, + "grad_norm": 0.779456836915705, + "learning_rate": 9.827980498419728e-06, + "loss": 0.1133, + "step": 24164 + }, + { + "epoch": 2.865528281750267, + "grad_norm": 0.6877473827268551, + "learning_rate": 9.826072778421775e-06, + "loss": 0.1088, + "step": 24165 + }, + { + "epoch": 2.8656468635123917, + "grad_norm": 0.5161945747652237, + "learning_rate": 9.82416519830883e-06, + "loss": 0.0617, + "step": 24166 + }, + { + "epoch": 2.865765445274517, + "grad_norm": 0.5342933817939847, + "learning_rate": 9.822257758098455e-06, + "loss": 0.0804, + "step": 24167 + }, + { + "epoch": 2.8658840270366417, + "grad_norm": 0.5075231321549839, + "learning_rate": 9.820350457808267e-06, + "loss": 0.0708, + "step": 24168 + }, + { + "epoch": 2.866002608798767, + "grad_norm": 0.6821945359546759, + "learning_rate": 9.818443297455826e-06, + "loss": 0.0991, + "step": 24169 + }, + { + "epoch": 2.8661211905608917, + "grad_norm": 0.5964713272752493, + "learning_rate": 9.81653627705873e-06, + "loss": 0.0869, + "step": 24170 + }, + { + "epoch": 2.866239772323017, + "grad_norm": 0.39636887767446827, + "learning_rate": 9.814629396634533e-06, + "loss": 0.0468, + "step": 24171 + }, + { + "epoch": 2.8663583540851416, + "grad_norm": 0.5142410930077496, + "learning_rate": 9.812722656200848e-06, + "loss": 0.0587, + "step": 24172 + }, + { + "epoch": 2.866476935847267, + "grad_norm": 0.6769233836843491, + "learning_rate": 9.810816055775229e-06, + "loss": 0.0809, + "step": 24173 + }, + { + "epoch": 2.8665955176093916, + "grad_norm": 0.7872511670108744, + "learning_rate": 9.808909595375257e-06, + "loss": 0.1061, + "step": 24174 + }, + { + "epoch": 2.8667140993715168, + "grad_norm": 0.7073616494099164, + "learning_rate": 9.807003275018514e-06, + "loss": 0.0774, + "step": 24175 + }, + { + "epoch": 2.8668326811336415, + "grad_norm": 0.6899284004077019, + "learning_rate": 9.805097094722568e-06, + "loss": 0.0976, + "step": 24176 + }, + { + "epoch": 2.8669512628957667, + "grad_norm": 0.5636209663560173, + "learning_rate": 9.803191054505001e-06, + "loss": 0.0597, + "step": 24177 + }, + { + "epoch": 2.8670698446578915, + "grad_norm": 0.7398726141762736, + "learning_rate": 9.801285154383369e-06, + "loss": 0.0788, + "step": 24178 + }, + { + "epoch": 2.8671884264200167, + "grad_norm": 0.5375363201156016, + "learning_rate": 9.79937939437525e-06, + "loss": 0.0673, + "step": 24179 + }, + { + "epoch": 2.8673070081821415, + "grad_norm": 0.7847001558418119, + "learning_rate": 9.797473774498211e-06, + "loss": 0.0688, + "step": 24180 + }, + { + "epoch": 2.8674255899442667, + "grad_norm": 0.552469476610959, + "learning_rate": 9.795568294769824e-06, + "loss": 0.06, + "step": 24181 + }, + { + "epoch": 2.8675441717063914, + "grad_norm": 0.6879393989474678, + "learning_rate": 9.793662955207652e-06, + "loss": 0.1035, + "step": 24182 + }, + { + "epoch": 2.8676627534685166, + "grad_norm": 0.7356531789601378, + "learning_rate": 9.791757755829258e-06, + "loss": 0.0774, + "step": 24183 + }, + { + "epoch": 2.8677813352306414, + "grad_norm": 0.3888901731747142, + "learning_rate": 9.789852696652222e-06, + "loss": 0.0508, + "step": 24184 + }, + { + "epoch": 2.8678999169927666, + "grad_norm": 0.6452590511718309, + "learning_rate": 9.787947777694078e-06, + "loss": 0.0788, + "step": 24185 + }, + { + "epoch": 2.8680184987548913, + "grad_norm": 1.135816214373962, + "learning_rate": 9.786042998972403e-06, + "loss": 0.1229, + "step": 24186 + }, + { + "epoch": 2.8681370805170165, + "grad_norm": 1.0299609973952193, + "learning_rate": 9.784138360504755e-06, + "loss": 0.1426, + "step": 24187 + }, + { + "epoch": 2.8682556622791413, + "grad_norm": 0.3276638673588135, + "learning_rate": 9.782233862308702e-06, + "loss": 0.0462, + "step": 24188 + }, + { + "epoch": 2.8683742440412665, + "grad_norm": 0.6239452468036728, + "learning_rate": 9.780329504401784e-06, + "loss": 0.0884, + "step": 24189 + }, + { + "epoch": 2.8684928258033917, + "grad_norm": 0.768757553044192, + "learning_rate": 9.778425286801559e-06, + "loss": 0.0777, + "step": 24190 + }, + { + "epoch": 2.8686114075655165, + "grad_norm": 0.5023621844557291, + "learning_rate": 9.776521209525591e-06, + "loss": 0.0682, + "step": 24191 + }, + { + "epoch": 2.8687299893276412, + "grad_norm": 0.6889094945301277, + "learning_rate": 9.77461727259143e-06, + "loss": 0.0925, + "step": 24192 + }, + { + "epoch": 2.8688485710897664, + "grad_norm": 0.47658093930170725, + "learning_rate": 9.772713476016634e-06, + "loss": 0.0577, + "step": 24193 + }, + { + "epoch": 2.8689671528518916, + "grad_norm": 0.6511563636131903, + "learning_rate": 9.77080981981873e-06, + "loss": 0.0796, + "step": 24194 + }, + { + "epoch": 2.8690857346140164, + "grad_norm": 0.5335144197232538, + "learning_rate": 9.768906304015302e-06, + "loss": 0.0654, + "step": 24195 + }, + { + "epoch": 2.869204316376141, + "grad_norm": 0.5247833053169224, + "learning_rate": 9.767002928623869e-06, + "loss": 0.0551, + "step": 24196 + }, + { + "epoch": 2.8693228981382664, + "grad_norm": 0.6295754351022136, + "learning_rate": 9.765099693661992e-06, + "loss": 0.075, + "step": 24197 + }, + { + "epoch": 2.8694414799003916, + "grad_norm": 1.0360730755988647, + "learning_rate": 9.763196599147213e-06, + "loss": 0.1245, + "step": 24198 + }, + { + "epoch": 2.8695600616625163, + "grad_norm": 0.7554778394187468, + "learning_rate": 9.761293645097086e-06, + "loss": 0.1003, + "step": 24199 + }, + { + "epoch": 2.869678643424641, + "grad_norm": 0.5609791151265551, + "learning_rate": 9.759390831529133e-06, + "loss": 0.0835, + "step": 24200 + }, + { + "epoch": 2.8697972251867663, + "grad_norm": 0.6515306874371858, + "learning_rate": 9.75748815846091e-06, + "loss": 0.0823, + "step": 24201 + }, + { + "epoch": 2.8699158069488915, + "grad_norm": 1.1717534479229756, + "learning_rate": 9.755585625909954e-06, + "loss": 0.1427, + "step": 24202 + }, + { + "epoch": 2.8700343887110162, + "grad_norm": 0.799308400401133, + "learning_rate": 9.753683233893804e-06, + "loss": 0.1155, + "step": 24203 + }, + { + "epoch": 2.870152970473141, + "grad_norm": 0.5787850358701255, + "learning_rate": 9.75178098243001e-06, + "loss": 0.0736, + "step": 24204 + }, + { + "epoch": 2.870271552235266, + "grad_norm": 0.5243238719548187, + "learning_rate": 9.749878871536075e-06, + "loss": 0.054, + "step": 24205 + }, + { + "epoch": 2.8703901339973914, + "grad_norm": 0.6992097746622686, + "learning_rate": 9.747976901229574e-06, + "loss": 0.0874, + "step": 24206 + }, + { + "epoch": 2.870508715759516, + "grad_norm": 0.7289295105558224, + "learning_rate": 9.746075071528015e-06, + "loss": 0.0743, + "step": 24207 + }, + { + "epoch": 2.870627297521641, + "grad_norm": 0.6110072550274349, + "learning_rate": 9.74417338244894e-06, + "loss": 0.0971, + "step": 24208 + }, + { + "epoch": 2.870745879283766, + "grad_norm": 0.6756233460067339, + "learning_rate": 9.742271834009876e-06, + "loss": 0.0862, + "step": 24209 + }, + { + "epoch": 2.8708644610458913, + "grad_norm": 0.852735528693052, + "learning_rate": 9.740370426228365e-06, + "loss": 0.087, + "step": 24210 + }, + { + "epoch": 2.870983042808016, + "grad_norm": 1.0557114743708866, + "learning_rate": 9.738469159121919e-06, + "loss": 0.0951, + "step": 24211 + }, + { + "epoch": 2.871101624570141, + "grad_norm": 0.5549613963946007, + "learning_rate": 9.736568032708069e-06, + "loss": 0.0604, + "step": 24212 + }, + { + "epoch": 2.871220206332266, + "grad_norm": 0.7256480436091368, + "learning_rate": 9.734667047004348e-06, + "loss": 0.1023, + "step": 24213 + }, + { + "epoch": 2.8713387880943912, + "grad_norm": 0.7684136289164121, + "learning_rate": 9.732766202028274e-06, + "loss": 0.0937, + "step": 24214 + }, + { + "epoch": 2.871457369856516, + "grad_norm": 0.5058961086721484, + "learning_rate": 9.730865497797383e-06, + "loss": 0.0615, + "step": 24215 + }, + { + "epoch": 2.871575951618641, + "grad_norm": 0.6211323305895127, + "learning_rate": 9.728964934329172e-06, + "loss": 0.0762, + "step": 24216 + }, + { + "epoch": 2.871694533380766, + "grad_norm": 0.691732063313816, + "learning_rate": 9.727064511641196e-06, + "loss": 0.0687, + "step": 24217 + }, + { + "epoch": 2.871813115142891, + "grad_norm": 0.5507133735317322, + "learning_rate": 9.725164229750944e-06, + "loss": 0.0713, + "step": 24218 + }, + { + "epoch": 2.871931696905016, + "grad_norm": 0.7059652037919698, + "learning_rate": 9.723264088675958e-06, + "loss": 0.0906, + "step": 24219 + }, + { + "epoch": 2.872050278667141, + "grad_norm": 0.7363184233280686, + "learning_rate": 9.721364088433726e-06, + "loss": 0.0757, + "step": 24220 + }, + { + "epoch": 2.872168860429266, + "grad_norm": 0.6279751575752014, + "learning_rate": 9.719464229041797e-06, + "loss": 0.0826, + "step": 24221 + }, + { + "epoch": 2.872287442191391, + "grad_norm": 0.5912707635045874, + "learning_rate": 9.717564510517663e-06, + "loss": 0.0877, + "step": 24222 + }, + { + "epoch": 2.872406023953516, + "grad_norm": 0.6195516759483557, + "learning_rate": 9.715664932878843e-06, + "loss": 0.0654, + "step": 24223 + }, + { + "epoch": 2.872524605715641, + "grad_norm": 0.6550115045857947, + "learning_rate": 9.713765496142848e-06, + "loss": 0.0745, + "step": 24224 + }, + { + "epoch": 2.872643187477766, + "grad_norm": 0.8442237532600653, + "learning_rate": 9.711866200327191e-06, + "loss": 0.1111, + "step": 24225 + }, + { + "epoch": 2.872761769239891, + "grad_norm": 0.65274440783701, + "learning_rate": 9.709967045449389e-06, + "loss": 0.0959, + "step": 24226 + }, + { + "epoch": 2.8728803510020158, + "grad_norm": 0.7752579566320228, + "learning_rate": 9.708068031526924e-06, + "loss": 0.0975, + "step": 24227 + }, + { + "epoch": 2.872998932764141, + "grad_norm": 0.6790512852613181, + "learning_rate": 9.706169158577336e-06, + "loss": 0.0713, + "step": 24228 + }, + { + "epoch": 2.8731175145262657, + "grad_norm": 0.5813458041156797, + "learning_rate": 9.704270426618104e-06, + "loss": 0.0744, + "step": 24229 + }, + { + "epoch": 2.873236096288391, + "grad_norm": 0.7050430949026748, + "learning_rate": 9.702371835666752e-06, + "loss": 0.0969, + "step": 24230 + }, + { + "epoch": 2.8733546780505157, + "grad_norm": 0.5910347614818323, + "learning_rate": 9.700473385740755e-06, + "loss": 0.064, + "step": 24231 + }, + { + "epoch": 2.873473259812641, + "grad_norm": 0.7084251010724286, + "learning_rate": 9.69857507685765e-06, + "loss": 0.099, + "step": 24232 + }, + { + "epoch": 2.8735918415747657, + "grad_norm": 0.8008555896583675, + "learning_rate": 9.696676909034908e-06, + "loss": 0.1072, + "step": 24233 + }, + { + "epoch": 2.873710423336891, + "grad_norm": 0.7908743116862521, + "learning_rate": 9.69477888229004e-06, + "loss": 0.0958, + "step": 24234 + }, + { + "epoch": 2.8738290050990156, + "grad_norm": 0.8679892894189345, + "learning_rate": 9.692880996640543e-06, + "loss": 0.1282, + "step": 24235 + }, + { + "epoch": 2.873947586861141, + "grad_norm": 0.4896899143328698, + "learning_rate": 9.690983252103911e-06, + "loss": 0.0486, + "step": 24236 + }, + { + "epoch": 2.8740661686232656, + "grad_norm": 0.6202836652514163, + "learning_rate": 9.689085648697652e-06, + "loss": 0.0623, + "step": 24237 + }, + { + "epoch": 2.874184750385391, + "grad_norm": 0.6240916399230694, + "learning_rate": 9.687188186439228e-06, + "loss": 0.0615, + "step": 24238 + }, + { + "epoch": 2.874303332147516, + "grad_norm": 1.0507771281008194, + "learning_rate": 9.685290865346167e-06, + "loss": 0.1447, + "step": 24239 + }, + { + "epoch": 2.8744219139096407, + "grad_norm": 1.0493172667611728, + "learning_rate": 9.683393685435938e-06, + "loss": 0.1647, + "step": 24240 + }, + { + "epoch": 2.8745404956717655, + "grad_norm": 0.7032908979148609, + "learning_rate": 9.681496646726035e-06, + "loss": 0.0713, + "step": 24241 + }, + { + "epoch": 2.8746590774338907, + "grad_norm": 0.6211974839778929, + "learning_rate": 9.67959974923395e-06, + "loss": 0.0676, + "step": 24242 + }, + { + "epoch": 2.874777659196016, + "grad_norm": 0.9286183393906605, + "learning_rate": 9.677702992977167e-06, + "loss": 0.1301, + "step": 24243 + }, + { + "epoch": 2.8748962409581407, + "grad_norm": 0.6564083052557635, + "learning_rate": 9.675806377973182e-06, + "loss": 0.0759, + "step": 24244 + }, + { + "epoch": 2.8750148227202654, + "grad_norm": 0.7126416815352716, + "learning_rate": 9.673909904239462e-06, + "loss": 0.0895, + "step": 24245 + }, + { + "epoch": 2.8751334044823906, + "grad_norm": 0.7814312603006369, + "learning_rate": 9.672013571793495e-06, + "loss": 0.109, + "step": 24246 + }, + { + "epoch": 2.875251986244516, + "grad_norm": 0.6601452561629403, + "learning_rate": 9.670117380652772e-06, + "loss": 0.0713, + "step": 24247 + }, + { + "epoch": 2.8753705680066406, + "grad_norm": 0.7404996475492971, + "learning_rate": 9.668221330834773e-06, + "loss": 0.0938, + "step": 24248 + }, + { + "epoch": 2.8754891497687654, + "grad_norm": 0.5020782178472621, + "learning_rate": 9.666325422356964e-06, + "loss": 0.0729, + "step": 24249 + }, + { + "epoch": 2.8756077315308906, + "grad_norm": 0.5470769177643736, + "learning_rate": 9.664429655236834e-06, + "loss": 0.0721, + "step": 24250 + }, + { + "epoch": 2.8757263132930158, + "grad_norm": 0.7011819035943143, + "learning_rate": 9.662534029491855e-06, + "loss": 0.0719, + "step": 24251 + }, + { + "epoch": 2.8758448950551405, + "grad_norm": 0.743819551670039, + "learning_rate": 9.660638545139503e-06, + "loss": 0.0855, + "step": 24252 + }, + { + "epoch": 2.8759634768172653, + "grad_norm": 0.5105483348904337, + "learning_rate": 9.658743202197255e-06, + "loss": 0.0717, + "step": 24253 + }, + { + "epoch": 2.8760820585793905, + "grad_norm": 0.7445082294471929, + "learning_rate": 9.65684800068258e-06, + "loss": 0.1096, + "step": 24254 + }, + { + "epoch": 2.8762006403415157, + "grad_norm": 0.6236451756179221, + "learning_rate": 9.654952940612963e-06, + "loss": 0.0747, + "step": 24255 + }, + { + "epoch": 2.8763192221036404, + "grad_norm": 0.8399149448630175, + "learning_rate": 9.65305802200585e-06, + "loss": 0.1041, + "step": 24256 + }, + { + "epoch": 2.876437803865765, + "grad_norm": 0.4691900116132239, + "learning_rate": 9.651163244878725e-06, + "loss": 0.0689, + "step": 24257 + }, + { + "epoch": 2.8765563856278904, + "grad_norm": 0.9469582836846846, + "learning_rate": 9.649268609249054e-06, + "loss": 0.1328, + "step": 24258 + }, + { + "epoch": 2.8766749673900156, + "grad_norm": 0.30424479844658225, + "learning_rate": 9.647374115134308e-06, + "loss": 0.0393, + "step": 24259 + }, + { + "epoch": 2.8767935491521404, + "grad_norm": 0.7159213694620831, + "learning_rate": 9.64547976255194e-06, + "loss": 0.0874, + "step": 24260 + }, + { + "epoch": 2.876912130914265, + "grad_norm": 0.883905694526706, + "learning_rate": 9.643585551519418e-06, + "loss": 0.1161, + "step": 24261 + }, + { + "epoch": 2.8770307126763903, + "grad_norm": 0.5368610184400805, + "learning_rate": 9.641691482054206e-06, + "loss": 0.0688, + "step": 24262 + }, + { + "epoch": 2.8771492944385155, + "grad_norm": 0.5542012326357308, + "learning_rate": 9.639797554173766e-06, + "loss": 0.0842, + "step": 24263 + }, + { + "epoch": 2.8772678762006403, + "grad_norm": 0.651932772253232, + "learning_rate": 9.637903767895565e-06, + "loss": 0.0886, + "step": 24264 + }, + { + "epoch": 2.8773864579627655, + "grad_norm": 0.571638769768261, + "learning_rate": 9.636010123237032e-06, + "loss": 0.0652, + "step": 24265 + }, + { + "epoch": 2.8775050397248902, + "grad_norm": 0.6271560597552994, + "learning_rate": 9.634116620215666e-06, + "loss": 0.0888, + "step": 24266 + }, + { + "epoch": 2.8776236214870154, + "grad_norm": 0.8220789995158057, + "learning_rate": 9.632223258848893e-06, + "loss": 0.1025, + "step": 24267 + }, + { + "epoch": 2.87774220324914, + "grad_norm": 0.5442702548851737, + "learning_rate": 9.630330039154178e-06, + "loss": 0.0655, + "step": 24268 + }, + { + "epoch": 2.8778607850112654, + "grad_norm": 0.5916092438203944, + "learning_rate": 9.62843696114897e-06, + "loss": 0.1005, + "step": 24269 + }, + { + "epoch": 2.87797936677339, + "grad_norm": 0.8307093301484013, + "learning_rate": 9.626544024850734e-06, + "loss": 0.0757, + "step": 24270 + }, + { + "epoch": 2.8780979485355154, + "grad_norm": 0.6475851807582721, + "learning_rate": 9.624651230276898e-06, + "loss": 0.0835, + "step": 24271 + }, + { + "epoch": 2.87821653029764, + "grad_norm": 0.8109741880769133, + "learning_rate": 9.622758577444926e-06, + "loss": 0.1023, + "step": 24272 + }, + { + "epoch": 2.8783351120597653, + "grad_norm": 0.6072218860582641, + "learning_rate": 9.620866066372262e-06, + "loss": 0.0798, + "step": 24273 + }, + { + "epoch": 2.87845369382189, + "grad_norm": 0.7162524325385096, + "learning_rate": 9.618973697076354e-06, + "loss": 0.0889, + "step": 24274 + }, + { + "epoch": 2.8785722755840153, + "grad_norm": 0.6694888694249848, + "learning_rate": 9.617081469574654e-06, + "loss": 0.0799, + "step": 24275 + }, + { + "epoch": 2.87869085734614, + "grad_norm": 0.4976827352934302, + "learning_rate": 9.615189383884585e-06, + "loss": 0.0707, + "step": 24276 + }, + { + "epoch": 2.8788094391082653, + "grad_norm": 0.5147898004075495, + "learning_rate": 9.61329744002362e-06, + "loss": 0.0629, + "step": 24277 + }, + { + "epoch": 2.87892802087039, + "grad_norm": 0.5186259091007422, + "learning_rate": 9.611405638009175e-06, + "loss": 0.0582, + "step": 24278 + }, + { + "epoch": 2.879046602632515, + "grad_norm": 1.2573994037102276, + "learning_rate": 9.609513977858706e-06, + "loss": 0.1794, + "step": 24279 + }, + { + "epoch": 2.87916518439464, + "grad_norm": 0.5021859571886245, + "learning_rate": 9.60762245958963e-06, + "loss": 0.0619, + "step": 24280 + }, + { + "epoch": 2.879283766156765, + "grad_norm": 1.400308497093427, + "learning_rate": 9.605731083219417e-06, + "loss": 0.2042, + "step": 24281 + }, + { + "epoch": 2.87940234791889, + "grad_norm": 0.473108058551573, + "learning_rate": 9.603839848765478e-06, + "loss": 0.0657, + "step": 24282 + }, + { + "epoch": 2.879520929681015, + "grad_norm": 0.651655342293578, + "learning_rate": 9.601948756245252e-06, + "loss": 0.0973, + "step": 24283 + }, + { + "epoch": 2.87963951144314, + "grad_norm": 0.6277993771346898, + "learning_rate": 9.60005780567618e-06, + "loss": 0.0773, + "step": 24284 + }, + { + "epoch": 2.879758093205265, + "grad_norm": 0.8343031469878129, + "learning_rate": 9.598166997075689e-06, + "loss": 0.1039, + "step": 24285 + }, + { + "epoch": 2.87987667496739, + "grad_norm": 0.35427524001773364, + "learning_rate": 9.596276330461218e-06, + "loss": 0.0447, + "step": 24286 + }, + { + "epoch": 2.879995256729515, + "grad_norm": 0.6367429152813913, + "learning_rate": 9.594385805850176e-06, + "loss": 0.0748, + "step": 24287 + }, + { + "epoch": 2.88011383849164, + "grad_norm": 0.607633358897898, + "learning_rate": 9.59249542326002e-06, + "loss": 0.0753, + "step": 24288 + }, + { + "epoch": 2.880232420253765, + "grad_norm": 0.457062550278196, + "learning_rate": 9.590605182708154e-06, + "loss": 0.0683, + "step": 24289 + }, + { + "epoch": 2.88035100201589, + "grad_norm": 0.7130041110145274, + "learning_rate": 9.588715084212013e-06, + "loss": 0.086, + "step": 24290 + }, + { + "epoch": 2.880469583778015, + "grad_norm": 0.5899431248598694, + "learning_rate": 9.586825127789018e-06, + "loss": 0.1014, + "step": 24291 + }, + { + "epoch": 2.88058816554014, + "grad_norm": 0.7749651337610757, + "learning_rate": 9.584935313456597e-06, + "loss": 0.0823, + "step": 24292 + }, + { + "epoch": 2.880706747302265, + "grad_norm": 0.5390546920835869, + "learning_rate": 9.583045641232177e-06, + "loss": 0.0849, + "step": 24293 + }, + { + "epoch": 2.8808253290643897, + "grad_norm": 0.8213482905186275, + "learning_rate": 9.581156111133152e-06, + "loss": 0.0982, + "step": 24294 + }, + { + "epoch": 2.880943910826515, + "grad_norm": 0.38084843519449324, + "learning_rate": 9.57926672317698e-06, + "loss": 0.0461, + "step": 24295 + }, + { + "epoch": 2.88106249258864, + "grad_norm": 0.32892522805714086, + "learning_rate": 9.57737747738105e-06, + "loss": 0.044, + "step": 24296 + }, + { + "epoch": 2.881181074350765, + "grad_norm": 1.030705792772188, + "learning_rate": 9.575488373762792e-06, + "loss": 0.1671, + "step": 24297 + }, + { + "epoch": 2.8812996561128896, + "grad_norm": 0.7312915059466485, + "learning_rate": 9.573599412339601e-06, + "loss": 0.0985, + "step": 24298 + }, + { + "epoch": 2.881418237875015, + "grad_norm": 1.2159444127439145, + "learning_rate": 9.571710593128927e-06, + "loss": 0.1631, + "step": 24299 + }, + { + "epoch": 2.88153681963714, + "grad_norm": 0.7091925852817951, + "learning_rate": 9.569821916148151e-06, + "loss": 0.0736, + "step": 24300 + }, + { + "epoch": 2.881655401399265, + "grad_norm": 0.5066843154013346, + "learning_rate": 9.567933381414693e-06, + "loss": 0.0802, + "step": 24301 + }, + { + "epoch": 2.8817739831613896, + "grad_norm": 0.6595801879542789, + "learning_rate": 9.566044988945965e-06, + "loss": 0.0856, + "step": 24302 + }, + { + "epoch": 2.8818925649235148, + "grad_norm": 0.7929348121526864, + "learning_rate": 9.564156738759377e-06, + "loss": 0.1112, + "step": 24303 + }, + { + "epoch": 2.88201114668564, + "grad_norm": 0.5705927076229116, + "learning_rate": 9.562268630872346e-06, + "loss": 0.0795, + "step": 24304 + }, + { + "epoch": 2.8821297284477647, + "grad_norm": 0.49959660355620317, + "learning_rate": 9.560380665302257e-06, + "loss": 0.0865, + "step": 24305 + }, + { + "epoch": 2.8822483102098895, + "grad_norm": 0.7246100209029499, + "learning_rate": 9.558492842066524e-06, + "loss": 0.1191, + "step": 24306 + }, + { + "epoch": 2.8823668919720147, + "grad_norm": 0.7052810103819313, + "learning_rate": 9.556605161182553e-06, + "loss": 0.0976, + "step": 24307 + }, + { + "epoch": 2.88248547373414, + "grad_norm": 0.6458475840558362, + "learning_rate": 9.554717622667753e-06, + "loss": 0.0994, + "step": 24308 + }, + { + "epoch": 2.8826040554962646, + "grad_norm": 0.8083785960744031, + "learning_rate": 9.552830226539497e-06, + "loss": 0.1109, + "step": 24309 + }, + { + "epoch": 2.8827226372583894, + "grad_norm": 0.6024628619776521, + "learning_rate": 9.550942972815224e-06, + "loss": 0.0969, + "step": 24310 + }, + { + "epoch": 2.8828412190205146, + "grad_norm": 0.6016017033418634, + "learning_rate": 9.549055861512301e-06, + "loss": 0.0553, + "step": 24311 + }, + { + "epoch": 2.88295980078264, + "grad_norm": 0.46792761993089366, + "learning_rate": 9.547168892648136e-06, + "loss": 0.0683, + "step": 24312 + }, + { + "epoch": 2.8830783825447646, + "grad_norm": 0.6286620608057498, + "learning_rate": 9.545282066240124e-06, + "loss": 0.0698, + "step": 24313 + }, + { + "epoch": 2.8831969643068898, + "grad_norm": 0.5573485234706418, + "learning_rate": 9.543395382305662e-06, + "loss": 0.0767, + "step": 24314 + }, + { + "epoch": 2.8833155460690145, + "grad_norm": 0.9970155970632395, + "learning_rate": 9.541508840862148e-06, + "loss": 0.1278, + "step": 24315 + }, + { + "epoch": 2.8834341278311397, + "grad_norm": 0.4818983498260057, + "learning_rate": 9.539622441926958e-06, + "loss": 0.0533, + "step": 24316 + }, + { + "epoch": 2.8835527095932645, + "grad_norm": 0.7118962562777247, + "learning_rate": 9.537736185517487e-06, + "loss": 0.105, + "step": 24317 + }, + { + "epoch": 2.8836712913553897, + "grad_norm": 0.6571654755791823, + "learning_rate": 9.535850071651128e-06, + "loss": 0.1164, + "step": 24318 + }, + { + "epoch": 2.8837898731175144, + "grad_norm": 0.460909089646429, + "learning_rate": 9.53396410034528e-06, + "loss": 0.0663, + "step": 24319 + }, + { + "epoch": 2.8839084548796396, + "grad_norm": 0.7985261908461723, + "learning_rate": 9.532078271617306e-06, + "loss": 0.1331, + "step": 24320 + }, + { + "epoch": 2.8840270366417644, + "grad_norm": 0.5848277592018569, + "learning_rate": 9.530192585484605e-06, + "loss": 0.0744, + "step": 24321 + }, + { + "epoch": 2.8841456184038896, + "grad_norm": 0.7726277018571783, + "learning_rate": 9.528307041964554e-06, + "loss": 0.0937, + "step": 24322 + }, + { + "epoch": 2.8842642001660144, + "grad_norm": 0.4837956339012145, + "learning_rate": 9.52642164107454e-06, + "loss": 0.0643, + "step": 24323 + }, + { + "epoch": 2.8843827819281396, + "grad_norm": 0.9547178154947382, + "learning_rate": 9.524536382831947e-06, + "loss": 0.1308, + "step": 24324 + }, + { + "epoch": 2.8845013636902643, + "grad_norm": 1.1941500946742083, + "learning_rate": 9.522651267254149e-06, + "loss": 0.1545, + "step": 24325 + }, + { + "epoch": 2.8846199454523895, + "grad_norm": 0.6520079715274173, + "learning_rate": 9.520766294358536e-06, + "loss": 0.0693, + "step": 24326 + }, + { + "epoch": 2.8847385272145143, + "grad_norm": 0.6380869887452477, + "learning_rate": 9.518881464162465e-06, + "loss": 0.0871, + "step": 24327 + }, + { + "epoch": 2.8848571089766395, + "grad_norm": 0.5960683698329208, + "learning_rate": 9.516996776683327e-06, + "loss": 0.0762, + "step": 24328 + }, + { + "epoch": 2.8849756907387643, + "grad_norm": 0.5504683897225934, + "learning_rate": 9.51511223193849e-06, + "loss": 0.0845, + "step": 24329 + }, + { + "epoch": 2.8850942725008895, + "grad_norm": 0.6632650347049234, + "learning_rate": 9.513227829945334e-06, + "loss": 0.0935, + "step": 24330 + }, + { + "epoch": 2.885212854263014, + "grad_norm": 0.6582849291494628, + "learning_rate": 9.511343570721224e-06, + "loss": 0.0937, + "step": 24331 + }, + { + "epoch": 2.8853314360251394, + "grad_norm": 0.7250658749730947, + "learning_rate": 9.509459454283528e-06, + "loss": 0.0876, + "step": 24332 + }, + { + "epoch": 2.885450017787264, + "grad_norm": 0.528162271667388, + "learning_rate": 9.507575480649622e-06, + "loss": 0.08, + "step": 24333 + }, + { + "epoch": 2.8855685995493894, + "grad_norm": 0.4600510851168668, + "learning_rate": 9.505691649836874e-06, + "loss": 0.068, + "step": 24334 + }, + { + "epoch": 2.885687181311514, + "grad_norm": 0.4902374348484885, + "learning_rate": 9.503807961862654e-06, + "loss": 0.0788, + "step": 24335 + }, + { + "epoch": 2.8858057630736393, + "grad_norm": 0.6449234255649149, + "learning_rate": 9.501924416744307e-06, + "loss": 0.0923, + "step": 24336 + }, + { + "epoch": 2.885924344835764, + "grad_norm": 0.4577834070195004, + "learning_rate": 9.500041014499228e-06, + "loss": 0.0637, + "step": 24337 + }, + { + "epoch": 2.8860429265978893, + "grad_norm": 0.6372520971907949, + "learning_rate": 9.498157755144754e-06, + "loss": 0.0928, + "step": 24338 + }, + { + "epoch": 2.886161508360014, + "grad_norm": 0.3987895651814518, + "learning_rate": 9.496274638698258e-06, + "loss": 0.0639, + "step": 24339 + }, + { + "epoch": 2.8862800901221393, + "grad_norm": 0.6125099830307607, + "learning_rate": 9.494391665177097e-06, + "loss": 0.0731, + "step": 24340 + }, + { + "epoch": 2.8863986718842645, + "grad_norm": 0.9000922855399754, + "learning_rate": 9.492508834598632e-06, + "loss": 0.1002, + "step": 24341 + }, + { + "epoch": 2.8865172536463892, + "grad_norm": 0.5085479389263218, + "learning_rate": 9.490626146980225e-06, + "loss": 0.0758, + "step": 24342 + }, + { + "epoch": 2.886635835408514, + "grad_norm": 0.6628863006756498, + "learning_rate": 9.488743602339212e-06, + "loss": 0.0872, + "step": 24343 + }, + { + "epoch": 2.886754417170639, + "grad_norm": 0.776946515265349, + "learning_rate": 9.486861200692975e-06, + "loss": 0.1047, + "step": 24344 + }, + { + "epoch": 2.8868729989327644, + "grad_norm": 0.5057864731365002, + "learning_rate": 9.484978942058847e-06, + "loss": 0.0736, + "step": 24345 + }, + { + "epoch": 2.886991580694889, + "grad_norm": 0.5957754597677981, + "learning_rate": 9.483096826454199e-06, + "loss": 0.0531, + "step": 24346 + }, + { + "epoch": 2.887110162457014, + "grad_norm": 0.4897874723116628, + "learning_rate": 9.48121485389635e-06, + "loss": 0.0629, + "step": 24347 + }, + { + "epoch": 2.887228744219139, + "grad_norm": 0.6195878889144102, + "learning_rate": 9.479333024402687e-06, + "loss": 0.0722, + "step": 24348 + }, + { + "epoch": 2.8873473259812643, + "grad_norm": 0.8733937948817143, + "learning_rate": 9.477451337990533e-06, + "loss": 0.0936, + "step": 24349 + }, + { + "epoch": 2.887465907743389, + "grad_norm": 0.48232472070766663, + "learning_rate": 9.47556979467724e-06, + "loss": 0.0542, + "step": 24350 + }, + { + "epoch": 2.887584489505514, + "grad_norm": 0.5846769818011252, + "learning_rate": 9.473688394480163e-06, + "loss": 0.0596, + "step": 24351 + }, + { + "epoch": 2.887703071267639, + "grad_norm": 0.5356619947184457, + "learning_rate": 9.471807137416632e-06, + "loss": 0.069, + "step": 24352 + }, + { + "epoch": 2.8878216530297642, + "grad_norm": 0.6640556554907017, + "learning_rate": 9.46992602350401e-06, + "loss": 0.0945, + "step": 24353 + }, + { + "epoch": 2.887940234791889, + "grad_norm": 0.7550799709031142, + "learning_rate": 9.468045052759608e-06, + "loss": 0.112, + "step": 24354 + }, + { + "epoch": 2.8880588165540138, + "grad_norm": 0.6794419882504484, + "learning_rate": 9.4661642252008e-06, + "loss": 0.0872, + "step": 24355 + }, + { + "epoch": 2.888177398316139, + "grad_norm": 0.9101402422581034, + "learning_rate": 9.464283540844901e-06, + "loss": 0.1321, + "step": 24356 + }, + { + "epoch": 2.888295980078264, + "grad_norm": 0.9359801237695862, + "learning_rate": 9.462402999709265e-06, + "loss": 0.109, + "step": 24357 + }, + { + "epoch": 2.888414561840389, + "grad_norm": 0.803380049218197, + "learning_rate": 9.460522601811204e-06, + "loss": 0.0919, + "step": 24358 + }, + { + "epoch": 2.8885331436025137, + "grad_norm": 0.7390849212019065, + "learning_rate": 9.458642347168084e-06, + "loss": 0.0774, + "step": 24359 + }, + { + "epoch": 2.888651725364639, + "grad_norm": 0.5635627339414034, + "learning_rate": 9.456762235797218e-06, + "loss": 0.086, + "step": 24360 + }, + { + "epoch": 2.888770307126764, + "grad_norm": 0.6485746113511717, + "learning_rate": 9.454882267715943e-06, + "loss": 0.0997, + "step": 24361 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.7720477936678297, + "learning_rate": 9.453002442941588e-06, + "loss": 0.1006, + "step": 24362 + }, + { + "epoch": 2.8890074706510136, + "grad_norm": 0.5502124225757064, + "learning_rate": 9.451122761491488e-06, + "loss": 0.0723, + "step": 24363 + }, + { + "epoch": 2.889126052413139, + "grad_norm": 1.1458870845690898, + "learning_rate": 9.449243223382975e-06, + "loss": 0.1311, + "step": 24364 + }, + { + "epoch": 2.889244634175264, + "grad_norm": 0.3807697516126871, + "learning_rate": 9.447363828633363e-06, + "loss": 0.0539, + "step": 24365 + }, + { + "epoch": 2.8893632159373888, + "grad_norm": 0.4034703162545005, + "learning_rate": 9.445484577259983e-06, + "loss": 0.0547, + "step": 24366 + }, + { + "epoch": 2.889481797699514, + "grad_norm": 0.9891553278309152, + "learning_rate": 9.443605469280164e-06, + "loss": 0.0882, + "step": 24367 + }, + { + "epoch": 2.8896003794616387, + "grad_norm": 0.6525119841265319, + "learning_rate": 9.441726504711232e-06, + "loss": 0.1, + "step": 24368 + }, + { + "epoch": 2.889718961223764, + "grad_norm": 0.5659834154733822, + "learning_rate": 9.439847683570489e-06, + "loss": 0.0734, + "step": 24369 + }, + { + "epoch": 2.8898375429858887, + "grad_norm": 0.8162560992255016, + "learning_rate": 9.437969005875283e-06, + "loss": 0.0949, + "step": 24370 + }, + { + "epoch": 2.889956124748014, + "grad_norm": 0.6962717405158366, + "learning_rate": 9.43609047164291e-06, + "loss": 0.1025, + "step": 24371 + }, + { + "epoch": 2.8900747065101386, + "grad_norm": 0.42315393074092034, + "learning_rate": 9.434212080890701e-06, + "loss": 0.0573, + "step": 24372 + }, + { + "epoch": 2.890193288272264, + "grad_norm": 0.7971086082345452, + "learning_rate": 9.432333833635967e-06, + "loss": 0.0994, + "step": 24373 + }, + { + "epoch": 2.8903118700343886, + "grad_norm": 0.65848409628581, + "learning_rate": 9.430455729896023e-06, + "loss": 0.0787, + "step": 24374 + }, + { + "epoch": 2.890430451796514, + "grad_norm": 0.5216415343184689, + "learning_rate": 9.428577769688196e-06, + "loss": 0.0656, + "step": 24375 + }, + { + "epoch": 2.8905490335586386, + "grad_norm": 0.6942812947054995, + "learning_rate": 9.426699953029774e-06, + "loss": 0.09, + "step": 24376 + }, + { + "epoch": 2.8906676153207638, + "grad_norm": 0.7042348763294157, + "learning_rate": 9.424822279938086e-06, + "loss": 0.0824, + "step": 24377 + }, + { + "epoch": 2.8907861970828885, + "grad_norm": 0.7851473135587312, + "learning_rate": 9.422944750430435e-06, + "loss": 0.101, + "step": 24378 + }, + { + "epoch": 2.8909047788450137, + "grad_norm": 0.4234522563269217, + "learning_rate": 9.42106736452414e-06, + "loss": 0.0588, + "step": 24379 + }, + { + "epoch": 2.8910233606071385, + "grad_norm": 0.6876657046413623, + "learning_rate": 9.419190122236482e-06, + "loss": 0.0896, + "step": 24380 + }, + { + "epoch": 2.8911419423692637, + "grad_norm": 0.6504259445395477, + "learning_rate": 9.417313023584803e-06, + "loss": 0.0816, + "step": 24381 + }, + { + "epoch": 2.8912605241313885, + "grad_norm": 0.7393012652867903, + "learning_rate": 9.41543606858638e-06, + "loss": 0.1044, + "step": 24382 + }, + { + "epoch": 2.8913791058935137, + "grad_norm": 0.502014777861103, + "learning_rate": 9.413559257258524e-06, + "loss": 0.0644, + "step": 24383 + }, + { + "epoch": 2.8914976876556384, + "grad_norm": 0.6868119770883993, + "learning_rate": 9.411682589618543e-06, + "loss": 0.0756, + "step": 24384 + }, + { + "epoch": 2.8916162694177636, + "grad_norm": 0.5558671708511199, + "learning_rate": 9.40980606568373e-06, + "loss": 0.0747, + "step": 24385 + }, + { + "epoch": 2.8917348511798884, + "grad_norm": 0.5726825057178048, + "learning_rate": 9.407929685471395e-06, + "loss": 0.0891, + "step": 24386 + }, + { + "epoch": 2.8918534329420136, + "grad_norm": 0.6441412159571398, + "learning_rate": 9.406053448998819e-06, + "loss": 0.0852, + "step": 24387 + }, + { + "epoch": 2.8919720147041383, + "grad_norm": 0.6762293130620526, + "learning_rate": 9.404177356283309e-06, + "loss": 0.0689, + "step": 24388 + }, + { + "epoch": 2.8920905964662635, + "grad_norm": 0.6945957574846151, + "learning_rate": 9.40230140734216e-06, + "loss": 0.1039, + "step": 24389 + }, + { + "epoch": 2.8922091782283887, + "grad_norm": 0.7994336800333349, + "learning_rate": 9.400425602192662e-06, + "loss": 0.0877, + "step": 24390 + }, + { + "epoch": 2.8923277599905135, + "grad_norm": 0.7044501523660955, + "learning_rate": 9.39854994085212e-06, + "loss": 0.0936, + "step": 24391 + }, + { + "epoch": 2.8924463417526383, + "grad_norm": 0.7111586515637429, + "learning_rate": 9.396674423337799e-06, + "loss": 0.0928, + "step": 24392 + }, + { + "epoch": 2.8925649235147635, + "grad_norm": 0.9183795624038582, + "learning_rate": 9.394799049667021e-06, + "loss": 0.0935, + "step": 24393 + }, + { + "epoch": 2.8926835052768887, + "grad_norm": 0.5158507866371822, + "learning_rate": 9.392923819857055e-06, + "loss": 0.0784, + "step": 24394 + }, + { + "epoch": 2.8928020870390134, + "grad_norm": 0.6117648310510746, + "learning_rate": 9.39104873392519e-06, + "loss": 0.0909, + "step": 24395 + }, + { + "epoch": 2.892920668801138, + "grad_norm": 0.601908795627444, + "learning_rate": 9.389173791888715e-06, + "loss": 0.076, + "step": 24396 + }, + { + "epoch": 2.8930392505632634, + "grad_norm": 0.5880765924974992, + "learning_rate": 9.387298993764922e-06, + "loss": 0.0702, + "step": 24397 + }, + { + "epoch": 2.8931578323253886, + "grad_norm": 0.7512907466270561, + "learning_rate": 9.385424339571081e-06, + "loss": 0.1018, + "step": 24398 + }, + { + "epoch": 2.8932764140875133, + "grad_norm": 0.8946032357995368, + "learning_rate": 9.383549829324478e-06, + "loss": 0.1084, + "step": 24399 + }, + { + "epoch": 2.893394995849638, + "grad_norm": 0.5402999645496737, + "learning_rate": 9.381675463042394e-06, + "loss": 0.0705, + "step": 24400 + }, + { + "epoch": 2.8935135776117633, + "grad_norm": 0.8624302092110202, + "learning_rate": 9.37980124074211e-06, + "loss": 0.1123, + "step": 24401 + }, + { + "epoch": 2.8936321593738885, + "grad_norm": 0.5860202833026167, + "learning_rate": 9.377927162440917e-06, + "loss": 0.0664, + "step": 24402 + }, + { + "epoch": 2.8937507411360133, + "grad_norm": 0.5715373941525547, + "learning_rate": 9.376053228156056e-06, + "loss": 0.0807, + "step": 24403 + }, + { + "epoch": 2.893869322898138, + "grad_norm": 0.5323641539691534, + "learning_rate": 9.374179437904846e-06, + "loss": 0.0573, + "step": 24404 + }, + { + "epoch": 2.8939879046602632, + "grad_norm": 0.8318316935757017, + "learning_rate": 9.372305791704527e-06, + "loss": 0.1016, + "step": 24405 + }, + { + "epoch": 2.8941064864223884, + "grad_norm": 0.6011276712961015, + "learning_rate": 9.370432289572398e-06, + "loss": 0.078, + "step": 24406 + }, + { + "epoch": 2.894225068184513, + "grad_norm": 0.43058510779488085, + "learning_rate": 9.368558931525696e-06, + "loss": 0.072, + "step": 24407 + }, + { + "epoch": 2.894343649946638, + "grad_norm": 0.7320039763255602, + "learning_rate": 9.36668571758173e-06, + "loss": 0.0969, + "step": 24408 + }, + { + "epoch": 2.894462231708763, + "grad_norm": 0.6817685973738036, + "learning_rate": 9.364812647757743e-06, + "loss": 0.0812, + "step": 24409 + }, + { + "epoch": 2.8945808134708884, + "grad_norm": 0.576598086303818, + "learning_rate": 9.36293972207101e-06, + "loss": 0.0671, + "step": 24410 + }, + { + "epoch": 2.894699395233013, + "grad_norm": 0.3730749033286328, + "learning_rate": 9.3610669405388e-06, + "loss": 0.0493, + "step": 24411 + }, + { + "epoch": 2.894817976995138, + "grad_norm": 0.451824469195757, + "learning_rate": 9.359194303178371e-06, + "loss": 0.0548, + "step": 24412 + }, + { + "epoch": 2.894936558757263, + "grad_norm": 1.133622254817231, + "learning_rate": 9.357321810007e-06, + "loss": 0.1346, + "step": 24413 + }, + { + "epoch": 2.8950551405193883, + "grad_norm": 0.3305987360203168, + "learning_rate": 9.355449461041923e-06, + "loss": 0.0381, + "step": 24414 + }, + { + "epoch": 2.895173722281513, + "grad_norm": 0.36311354093017156, + "learning_rate": 9.353577256300435e-06, + "loss": 0.0426, + "step": 24415 + }, + { + "epoch": 2.8952923040436382, + "grad_norm": 0.8278988080684128, + "learning_rate": 9.351705195799771e-06, + "loss": 0.1084, + "step": 24416 + }, + { + "epoch": 2.895410885805763, + "grad_norm": 0.6267428306899921, + "learning_rate": 9.349833279557202e-06, + "loss": 0.082, + "step": 24417 + }, + { + "epoch": 2.895529467567888, + "grad_norm": 1.1037011704479522, + "learning_rate": 9.347961507589961e-06, + "loss": 0.1001, + "step": 24418 + }, + { + "epoch": 2.895648049330013, + "grad_norm": 0.7263607865456848, + "learning_rate": 9.34608987991534e-06, + "loss": 0.0855, + "step": 24419 + }, + { + "epoch": 2.895766631092138, + "grad_norm": 0.6786409577478659, + "learning_rate": 9.344218396550563e-06, + "loss": 0.0746, + "step": 24420 + }, + { + "epoch": 2.895885212854263, + "grad_norm": 0.6822850777241748, + "learning_rate": 9.342347057512896e-06, + "loss": 0.0854, + "step": 24421 + }, + { + "epoch": 2.896003794616388, + "grad_norm": 0.6351803439046501, + "learning_rate": 9.34047586281959e-06, + "loss": 0.0739, + "step": 24422 + }, + { + "epoch": 2.896122376378513, + "grad_norm": 0.4921996806161358, + "learning_rate": 9.33860481248789e-06, + "loss": 0.0575, + "step": 24423 + }, + { + "epoch": 2.896240958140638, + "grad_norm": 0.7005099242206372, + "learning_rate": 9.336733906535059e-06, + "loss": 0.0818, + "step": 24424 + }, + { + "epoch": 2.896359539902763, + "grad_norm": 0.7423659402073587, + "learning_rate": 9.334863144978318e-06, + "loss": 0.0861, + "step": 24425 + }, + { + "epoch": 2.896478121664888, + "grad_norm": 0.3840682519044292, + "learning_rate": 9.332992527834944e-06, + "loss": 0.0531, + "step": 24426 + }, + { + "epoch": 2.896596703427013, + "grad_norm": 0.7369431238156041, + "learning_rate": 9.331122055122158e-06, + "loss": 0.0633, + "step": 24427 + }, + { + "epoch": 2.896715285189138, + "grad_norm": 0.7283524528976126, + "learning_rate": 9.329251726857222e-06, + "loss": 0.0974, + "step": 24428 + }, + { + "epoch": 2.8968338669512628, + "grad_norm": 0.5694119997508854, + "learning_rate": 9.327381543057349e-06, + "loss": 0.0766, + "step": 24429 + }, + { + "epoch": 2.896952448713388, + "grad_norm": 1.098741576332052, + "learning_rate": 9.325511503739817e-06, + "loss": 0.1002, + "step": 24430 + }, + { + "epoch": 2.8970710304755127, + "grad_norm": 0.5077075480278486, + "learning_rate": 9.32364160892184e-06, + "loss": 0.0584, + "step": 24431 + }, + { + "epoch": 2.897189612237638, + "grad_norm": 0.7167714328443007, + "learning_rate": 9.321771858620665e-06, + "loss": 0.1003, + "step": 24432 + }, + { + "epoch": 2.8973081939997627, + "grad_norm": 0.7894765224753961, + "learning_rate": 9.319902252853527e-06, + "loss": 0.1041, + "step": 24433 + }, + { + "epoch": 2.897426775761888, + "grad_norm": 0.725368344593293, + "learning_rate": 9.31803279163766e-06, + "loss": 0.0925, + "step": 24434 + }, + { + "epoch": 2.8975453575240127, + "grad_norm": 0.8385608624953019, + "learning_rate": 9.316163474990311e-06, + "loss": 0.1088, + "step": 24435 + }, + { + "epoch": 2.897663939286138, + "grad_norm": 0.7699757125301655, + "learning_rate": 9.314294302928691e-06, + "loss": 0.0917, + "step": 24436 + }, + { + "epoch": 2.8977825210482626, + "grad_norm": 0.6906869065211082, + "learning_rate": 9.312425275470043e-06, + "loss": 0.1035, + "step": 24437 + }, + { + "epoch": 2.897901102810388, + "grad_norm": 0.8013822598569222, + "learning_rate": 9.310556392631597e-06, + "loss": 0.0961, + "step": 24438 + }, + { + "epoch": 2.898019684572513, + "grad_norm": 0.8903038952759228, + "learning_rate": 9.308687654430585e-06, + "loss": 0.1272, + "step": 24439 + }, + { + "epoch": 2.898138266334638, + "grad_norm": 0.7303464823845696, + "learning_rate": 9.306819060884226e-06, + "loss": 0.0758, + "step": 24440 + }, + { + "epoch": 2.8982568480967625, + "grad_norm": 0.59755484331057, + "learning_rate": 9.304950612009753e-06, + "loss": 0.0918, + "step": 24441 + }, + { + "epoch": 2.8983754298588877, + "grad_norm": 0.6232029480684115, + "learning_rate": 9.303082307824395e-06, + "loss": 0.0947, + "step": 24442 + }, + { + "epoch": 2.898494011621013, + "grad_norm": 0.7703395507076388, + "learning_rate": 9.301214148345364e-06, + "loss": 0.0967, + "step": 24443 + }, + { + "epoch": 2.8986125933831377, + "grad_norm": 0.933209401348064, + "learning_rate": 9.299346133589886e-06, + "loss": 0.1114, + "step": 24444 + }, + { + "epoch": 2.8987311751452625, + "grad_norm": 0.6357070257844913, + "learning_rate": 9.297478263575184e-06, + "loss": 0.0796, + "step": 24445 + }, + { + "epoch": 2.8988497569073877, + "grad_norm": 0.3550172406324586, + "learning_rate": 9.295610538318486e-06, + "loss": 0.0417, + "step": 24446 + }, + { + "epoch": 2.898968338669513, + "grad_norm": 0.6243039435477831, + "learning_rate": 9.29374295783699e-06, + "loss": 0.0797, + "step": 24447 + }, + { + "epoch": 2.8990869204316376, + "grad_norm": 0.5108711405090849, + "learning_rate": 9.291875522147924e-06, + "loss": 0.0426, + "step": 24448 + }, + { + "epoch": 2.8992055021937624, + "grad_norm": 0.4678394379535922, + "learning_rate": 9.290008231268505e-06, + "loss": 0.0477, + "step": 24449 + }, + { + "epoch": 2.8993240839558876, + "grad_norm": 0.7694579120462469, + "learning_rate": 9.288141085215943e-06, + "loss": 0.0981, + "step": 24450 + }, + { + "epoch": 2.899442665718013, + "grad_norm": 0.9523959120077785, + "learning_rate": 9.286274084007454e-06, + "loss": 0.1234, + "step": 24451 + }, + { + "epoch": 2.8995612474801375, + "grad_norm": 0.705285322885866, + "learning_rate": 9.28440722766025e-06, + "loss": 0.0769, + "step": 24452 + }, + { + "epoch": 2.8996798292422623, + "grad_norm": 0.6984090073632909, + "learning_rate": 9.282540516191545e-06, + "loss": 0.114, + "step": 24453 + }, + { + "epoch": 2.8997984110043875, + "grad_norm": 0.8073916135253997, + "learning_rate": 9.280673949618534e-06, + "loss": 0.1105, + "step": 24454 + }, + { + "epoch": 2.8999169927665127, + "grad_norm": 0.6445166714780363, + "learning_rate": 9.278807527958435e-06, + "loss": 0.0839, + "step": 24455 + }, + { + "epoch": 2.9000355745286375, + "grad_norm": 0.6656506292748096, + "learning_rate": 9.276941251228452e-06, + "loss": 0.0686, + "step": 24456 + }, + { + "epoch": 2.9001541562907622, + "grad_norm": 0.5051717528775664, + "learning_rate": 9.275075119445798e-06, + "loss": 0.0516, + "step": 24457 + }, + { + "epoch": 2.9002727380528874, + "grad_norm": 0.4461358401353634, + "learning_rate": 9.27320913262766e-06, + "loss": 0.0654, + "step": 24458 + }, + { + "epoch": 2.9003913198150126, + "grad_norm": 0.9245493698019928, + "learning_rate": 9.271343290791249e-06, + "loss": 0.1175, + "step": 24459 + }, + { + "epoch": 2.9005099015771374, + "grad_norm": 0.6063487620495317, + "learning_rate": 9.269477593953762e-06, + "loss": 0.0825, + "step": 24460 + }, + { + "epoch": 2.900628483339262, + "grad_norm": 0.6254729943483786, + "learning_rate": 9.267612042132403e-06, + "loss": 0.0759, + "step": 24461 + }, + { + "epoch": 2.9007470651013874, + "grad_norm": 0.8160922059710519, + "learning_rate": 9.26574663534438e-06, + "loss": 0.1162, + "step": 24462 + }, + { + "epoch": 2.9008656468635126, + "grad_norm": 0.47777382987558253, + "learning_rate": 9.263881373606859e-06, + "loss": 0.0627, + "step": 24463 + }, + { + "epoch": 2.9009842286256373, + "grad_norm": 0.5868199287943977, + "learning_rate": 9.26201625693707e-06, + "loss": 0.0717, + "step": 24464 + }, + { + "epoch": 2.9011028103877625, + "grad_norm": 0.6381514042180989, + "learning_rate": 9.260151285352187e-06, + "loss": 0.0562, + "step": 24465 + }, + { + "epoch": 2.9012213921498873, + "grad_norm": 0.7438409307350594, + "learning_rate": 9.258286458869405e-06, + "loss": 0.094, + "step": 24466 + }, + { + "epoch": 2.9013399739120125, + "grad_norm": 0.6820733851374595, + "learning_rate": 9.256421777505919e-06, + "loss": 0.0845, + "step": 24467 + }, + { + "epoch": 2.9014585556741372, + "grad_norm": 0.5114040887237391, + "learning_rate": 9.254557241278928e-06, + "loss": 0.0645, + "step": 24468 + }, + { + "epoch": 2.9015771374362624, + "grad_norm": 0.7137435960198472, + "learning_rate": 9.252692850205599e-06, + "loss": 0.1033, + "step": 24469 + }, + { + "epoch": 2.901695719198387, + "grad_norm": 0.7255936348872871, + "learning_rate": 9.250828604303133e-06, + "loss": 0.0897, + "step": 24470 + }, + { + "epoch": 2.9018143009605124, + "grad_norm": 0.6479313906339416, + "learning_rate": 9.248964503588715e-06, + "loss": 0.099, + "step": 24471 + }, + { + "epoch": 2.901932882722637, + "grad_norm": 0.5478125446699276, + "learning_rate": 9.24710054807953e-06, + "loss": 0.0894, + "step": 24472 + }, + { + "epoch": 2.9020514644847624, + "grad_norm": 0.767984061716492, + "learning_rate": 9.245236737792768e-06, + "loss": 0.0915, + "step": 24473 + }, + { + "epoch": 2.902170046246887, + "grad_norm": 0.9377884915823741, + "learning_rate": 9.243373072745585e-06, + "loss": 0.1196, + "step": 24474 + }, + { + "epoch": 2.9022886280090123, + "grad_norm": 0.6489635688087836, + "learning_rate": 9.2415095529552e-06, + "loss": 0.0896, + "step": 24475 + }, + { + "epoch": 2.902407209771137, + "grad_norm": 0.6365565667463203, + "learning_rate": 9.23964617843876e-06, + "loss": 0.0963, + "step": 24476 + }, + { + "epoch": 2.9025257915332623, + "grad_norm": 0.6661926398774086, + "learning_rate": 9.237782949213466e-06, + "loss": 0.0961, + "step": 24477 + }, + { + "epoch": 2.902644373295387, + "grad_norm": 0.7194923628748076, + "learning_rate": 9.23591986529647e-06, + "loss": 0.1103, + "step": 24478 + }, + { + "epoch": 2.9027629550575123, + "grad_norm": 0.9918467559368086, + "learning_rate": 9.234056926704976e-06, + "loss": 0.1236, + "step": 24479 + }, + { + "epoch": 2.902881536819637, + "grad_norm": 0.5835459686085515, + "learning_rate": 9.232194133456134e-06, + "loss": 0.0724, + "step": 24480 + }, + { + "epoch": 2.903000118581762, + "grad_norm": 0.5508011278913136, + "learning_rate": 9.230331485567131e-06, + "loss": 0.0694, + "step": 24481 + }, + { + "epoch": 2.903118700343887, + "grad_norm": 0.7816489417372173, + "learning_rate": 9.22846898305513e-06, + "loss": 0.0951, + "step": 24482 + }, + { + "epoch": 2.903237282106012, + "grad_norm": 0.5933175231711821, + "learning_rate": 9.226606625937306e-06, + "loss": 0.0791, + "step": 24483 + }, + { + "epoch": 2.903355863868137, + "grad_norm": 0.5982353723259934, + "learning_rate": 9.224744414230831e-06, + "loss": 0.087, + "step": 24484 + }, + { + "epoch": 2.903474445630262, + "grad_norm": 0.4518604811592768, + "learning_rate": 9.222882347952854e-06, + "loss": 0.0639, + "step": 24485 + }, + { + "epoch": 2.903593027392387, + "grad_norm": 0.8069695632948581, + "learning_rate": 9.221020427120575e-06, + "loss": 0.1073, + "step": 24486 + }, + { + "epoch": 2.903711609154512, + "grad_norm": 0.8973973889046186, + "learning_rate": 9.219158651751126e-06, + "loss": 0.0906, + "step": 24487 + }, + { + "epoch": 2.903830190916637, + "grad_norm": 0.926409502691092, + "learning_rate": 9.21729702186169e-06, + "loss": 0.1166, + "step": 24488 + }, + { + "epoch": 2.903948772678762, + "grad_norm": 0.6029451563648202, + "learning_rate": 9.215435537469406e-06, + "loss": 0.066, + "step": 24489 + }, + { + "epoch": 2.904067354440887, + "grad_norm": 0.5686746230181288, + "learning_rate": 9.213574198591465e-06, + "loss": 0.0661, + "step": 24490 + }, + { + "epoch": 2.904185936203012, + "grad_norm": 0.6417171781409, + "learning_rate": 9.211713005245003e-06, + "loss": 0.1014, + "step": 24491 + }, + { + "epoch": 2.9043045179651372, + "grad_norm": 0.7428777712043617, + "learning_rate": 9.209851957447187e-06, + "loss": 0.0774, + "step": 24492 + }, + { + "epoch": 2.904423099727262, + "grad_norm": 0.8235882852695118, + "learning_rate": 9.207991055215175e-06, + "loss": 0.117, + "step": 24493 + }, + { + "epoch": 2.9045416814893867, + "grad_norm": 0.5725987878980657, + "learning_rate": 9.206130298566117e-06, + "loss": 0.0778, + "step": 24494 + }, + { + "epoch": 2.904660263251512, + "grad_norm": 0.5383689519344482, + "learning_rate": 9.204269687517181e-06, + "loss": 0.0663, + "step": 24495 + }, + { + "epoch": 2.904778845013637, + "grad_norm": 0.5083346150311777, + "learning_rate": 9.202409222085493e-06, + "loss": 0.0661, + "step": 24496 + }, + { + "epoch": 2.904897426775762, + "grad_norm": 0.7417845490488046, + "learning_rate": 9.200548902288231e-06, + "loss": 0.1042, + "step": 24497 + }, + { + "epoch": 2.9050160085378867, + "grad_norm": 0.7414906795176864, + "learning_rate": 9.19868872814253e-06, + "loss": 0.0968, + "step": 24498 + }, + { + "epoch": 2.905134590300012, + "grad_norm": 0.8376180114083874, + "learning_rate": 9.196828699665538e-06, + "loss": 0.1204, + "step": 24499 + }, + { + "epoch": 2.905253172062137, + "grad_norm": 0.7097875189676165, + "learning_rate": 9.19496881687441e-06, + "loss": 0.0755, + "step": 24500 + }, + { + "epoch": 2.905371753824262, + "grad_norm": 0.4913583882469641, + "learning_rate": 9.193109079786285e-06, + "loss": 0.0628, + "step": 24501 + }, + { + "epoch": 2.9054903355863866, + "grad_norm": 0.47536910532375665, + "learning_rate": 9.191249488418322e-06, + "loss": 0.0659, + "step": 24502 + }, + { + "epoch": 2.905608917348512, + "grad_norm": 0.7220503525035675, + "learning_rate": 9.189390042787644e-06, + "loss": 0.117, + "step": 24503 + }, + { + "epoch": 2.905727499110637, + "grad_norm": 0.5324155276899389, + "learning_rate": 9.187530742911399e-06, + "loss": 0.0709, + "step": 24504 + }, + { + "epoch": 2.9058460808727617, + "grad_norm": 0.7622600092772447, + "learning_rate": 9.18567158880673e-06, + "loss": 0.0814, + "step": 24505 + }, + { + "epoch": 2.9059646626348865, + "grad_norm": 0.8023893866944384, + "learning_rate": 9.183812580490786e-06, + "loss": 0.087, + "step": 24506 + }, + { + "epoch": 2.9060832443970117, + "grad_norm": 0.6054624450717911, + "learning_rate": 9.181953717980687e-06, + "loss": 0.0791, + "step": 24507 + }, + { + "epoch": 2.906201826159137, + "grad_norm": 0.786739984315748, + "learning_rate": 9.180095001293575e-06, + "loss": 0.1054, + "step": 24508 + }, + { + "epoch": 2.9063204079212617, + "grad_norm": 1.0980185522615622, + "learning_rate": 9.17823643044659e-06, + "loss": 0.1262, + "step": 24509 + }, + { + "epoch": 2.9064389896833864, + "grad_norm": 0.6681452202792318, + "learning_rate": 9.176378005456865e-06, + "loss": 0.0604, + "step": 24510 + }, + { + "epoch": 2.9065575714455116, + "grad_norm": 0.6578010767055299, + "learning_rate": 9.174519726341527e-06, + "loss": 0.0826, + "step": 24511 + }, + { + "epoch": 2.906676153207637, + "grad_norm": 0.3885663889487954, + "learning_rate": 9.172661593117712e-06, + "loss": 0.0522, + "step": 24512 + }, + { + "epoch": 2.9067947349697616, + "grad_norm": 0.800850309857716, + "learning_rate": 9.170803605802555e-06, + "loss": 0.086, + "step": 24513 + }, + { + "epoch": 2.906913316731887, + "grad_norm": 0.5484241021453575, + "learning_rate": 9.16894576441317e-06, + "loss": 0.0689, + "step": 24514 + }, + { + "epoch": 2.9070318984940116, + "grad_norm": 0.6259210803973991, + "learning_rate": 9.167088068966694e-06, + "loss": 0.0688, + "step": 24515 + }, + { + "epoch": 2.9071504802561368, + "grad_norm": 0.8748246710085603, + "learning_rate": 9.165230519480247e-06, + "loss": 0.0807, + "step": 24516 + }, + { + "epoch": 2.9072690620182615, + "grad_norm": 0.48554359081651755, + "learning_rate": 9.16337311597097e-06, + "loss": 0.0424, + "step": 24517 + }, + { + "epoch": 2.9073876437803867, + "grad_norm": 0.995207417158025, + "learning_rate": 9.16151585845596e-06, + "loss": 0.1192, + "step": 24518 + }, + { + "epoch": 2.9075062255425115, + "grad_norm": 0.5110789422574848, + "learning_rate": 9.159658746952352e-06, + "loss": 0.0619, + "step": 24519 + }, + { + "epoch": 2.9076248073046367, + "grad_norm": 0.526599794379962, + "learning_rate": 9.157801781477266e-06, + "loss": 0.0687, + "step": 24520 + }, + { + "epoch": 2.9077433890667614, + "grad_norm": 0.7276440937991232, + "learning_rate": 9.155944962047821e-06, + "loss": 0.0821, + "step": 24521 + }, + { + "epoch": 2.9078619708288866, + "grad_norm": 0.5814665249325005, + "learning_rate": 9.154088288681142e-06, + "loss": 0.0678, + "step": 24522 + }, + { + "epoch": 2.9079805525910114, + "grad_norm": 0.8252102173866787, + "learning_rate": 9.152231761394323e-06, + "loss": 0.1182, + "step": 24523 + }, + { + "epoch": 2.9080991343531366, + "grad_norm": 0.5587723778567326, + "learning_rate": 9.150375380204507e-06, + "loss": 0.0843, + "step": 24524 + }, + { + "epoch": 2.9082177161152614, + "grad_norm": 0.48529664469034295, + "learning_rate": 9.148519145128786e-06, + "loss": 0.0573, + "step": 24525 + }, + { + "epoch": 2.9083362978773866, + "grad_norm": 0.4851058338169155, + "learning_rate": 9.146663056184282e-06, + "loss": 0.0655, + "step": 24526 + }, + { + "epoch": 2.9084548796395113, + "grad_norm": 0.7755412264867338, + "learning_rate": 9.144807113388101e-06, + "loss": 0.1089, + "step": 24527 + }, + { + "epoch": 2.9085734614016365, + "grad_norm": 0.7449878408653902, + "learning_rate": 9.142951316757367e-06, + "loss": 0.1039, + "step": 24528 + }, + { + "epoch": 2.9086920431637613, + "grad_norm": 1.0050036259990787, + "learning_rate": 9.141095666309165e-06, + "loss": 0.1068, + "step": 24529 + }, + { + "epoch": 2.9088106249258865, + "grad_norm": 0.5594837307832465, + "learning_rate": 9.139240162060617e-06, + "loss": 0.075, + "step": 24530 + }, + { + "epoch": 2.9089292066880112, + "grad_norm": 0.6296518455637233, + "learning_rate": 9.137384804028823e-06, + "loss": 0.0724, + "step": 24531 + }, + { + "epoch": 2.9090477884501365, + "grad_norm": 0.7503108575886691, + "learning_rate": 9.13552959223089e-06, + "loss": 0.0748, + "step": 24532 + }, + { + "epoch": 2.909166370212261, + "grad_norm": 0.5293340427527681, + "learning_rate": 9.133674526683927e-06, + "loss": 0.071, + "step": 24533 + }, + { + "epoch": 2.9092849519743864, + "grad_norm": 0.9763628532769871, + "learning_rate": 9.131819607405012e-06, + "loss": 0.1304, + "step": 24534 + }, + { + "epoch": 2.909403533736511, + "grad_norm": 0.824233451163118, + "learning_rate": 9.129964834411281e-06, + "loss": 0.0667, + "step": 24535 + }, + { + "epoch": 2.9095221154986364, + "grad_norm": 0.6235406387976102, + "learning_rate": 9.128110207719801e-06, + "loss": 0.0802, + "step": 24536 + }, + { + "epoch": 2.909640697260761, + "grad_norm": 0.4718102426395717, + "learning_rate": 9.126255727347683e-06, + "loss": 0.0693, + "step": 24537 + }, + { + "epoch": 2.9097592790228863, + "grad_norm": 0.5091859969568697, + "learning_rate": 9.12440139331202e-06, + "loss": 0.0465, + "step": 24538 + }, + { + "epoch": 2.909877860785011, + "grad_norm": 0.7585168609901312, + "learning_rate": 9.12254720562992e-06, + "loss": 0.0785, + "step": 24539 + }, + { + "epoch": 2.9099964425471363, + "grad_norm": 1.0817770864806353, + "learning_rate": 9.120693164318452e-06, + "loss": 0.1564, + "step": 24540 + }, + { + "epoch": 2.9101150243092615, + "grad_norm": 0.5578094042262853, + "learning_rate": 9.118839269394722e-06, + "loss": 0.0726, + "step": 24541 + }, + { + "epoch": 2.9102336060713863, + "grad_norm": 0.7299095670134237, + "learning_rate": 9.116985520875821e-06, + "loss": 0.075, + "step": 24542 + }, + { + "epoch": 2.910352187833511, + "grad_norm": 0.9523248345298998, + "learning_rate": 9.115131918778835e-06, + "loss": 0.1027, + "step": 24543 + }, + { + "epoch": 2.910470769595636, + "grad_norm": 0.8507416103598583, + "learning_rate": 9.11327846312086e-06, + "loss": 0.1034, + "step": 24544 + }, + { + "epoch": 2.9105893513577614, + "grad_norm": 0.5802364976262676, + "learning_rate": 9.111425153918962e-06, + "loss": 0.0726, + "step": 24545 + }, + { + "epoch": 2.910707933119886, + "grad_norm": 0.596298680937823, + "learning_rate": 9.109571991190254e-06, + "loss": 0.0893, + "step": 24546 + }, + { + "epoch": 2.910826514882011, + "grad_norm": 0.551890142620337, + "learning_rate": 9.1077189749518e-06, + "loss": 0.0654, + "step": 24547 + }, + { + "epoch": 2.910945096644136, + "grad_norm": 0.5866829205007619, + "learning_rate": 9.10586610522069e-06, + "loss": 0.0832, + "step": 24548 + }, + { + "epoch": 2.9110636784062613, + "grad_norm": 0.5478386057304628, + "learning_rate": 9.104013382014e-06, + "loss": 0.0798, + "step": 24549 + }, + { + "epoch": 2.911182260168386, + "grad_norm": 0.35101596617253017, + "learning_rate": 9.102160805348817e-06, + "loss": 0.0426, + "step": 24550 + }, + { + "epoch": 2.911300841930511, + "grad_norm": 0.724655782747847, + "learning_rate": 9.100308375242225e-06, + "loss": 0.073, + "step": 24551 + }, + { + "epoch": 2.911419423692636, + "grad_norm": 0.6054737068192145, + "learning_rate": 9.098456091711275e-06, + "loss": 0.0683, + "step": 24552 + }, + { + "epoch": 2.9115380054547613, + "grad_norm": 0.5593408177447576, + "learning_rate": 9.096603954773075e-06, + "loss": 0.0668, + "step": 24553 + }, + { + "epoch": 2.911656587216886, + "grad_norm": 0.6647969582815879, + "learning_rate": 9.094751964444676e-06, + "loss": 0.0935, + "step": 24554 + }, + { + "epoch": 2.911775168979011, + "grad_norm": 0.945188173537514, + "learning_rate": 9.09290012074317e-06, + "loss": 0.1139, + "step": 24555 + }, + { + "epoch": 2.911893750741136, + "grad_norm": 0.5777541736012495, + "learning_rate": 9.0910484236856e-06, + "loss": 0.0785, + "step": 24556 + }, + { + "epoch": 2.912012332503261, + "grad_norm": 0.6449401310609532, + "learning_rate": 9.089196873289071e-06, + "loss": 0.0934, + "step": 24557 + }, + { + "epoch": 2.912130914265386, + "grad_norm": 0.8397985425881849, + "learning_rate": 9.087345469570629e-06, + "loss": 0.0864, + "step": 24558 + }, + { + "epoch": 2.9122494960275107, + "grad_norm": 0.7867180067438102, + "learning_rate": 9.085494212547346e-06, + "loss": 0.1244, + "step": 24559 + }, + { + "epoch": 2.912368077789636, + "grad_norm": 0.6550225224911509, + "learning_rate": 9.083643102236294e-06, + "loss": 0.0696, + "step": 24560 + }, + { + "epoch": 2.912486659551761, + "grad_norm": 0.7065264655061265, + "learning_rate": 9.081792138654533e-06, + "loss": 0.0969, + "step": 24561 + }, + { + "epoch": 2.912605241313886, + "grad_norm": 0.5905222804629374, + "learning_rate": 9.079941321819135e-06, + "loss": 0.0954, + "step": 24562 + }, + { + "epoch": 2.912723823076011, + "grad_norm": 0.3338167647359463, + "learning_rate": 9.078090651747151e-06, + "loss": 0.0432, + "step": 24563 + }, + { + "epoch": 2.912842404838136, + "grad_norm": 0.5704439065255432, + "learning_rate": 9.076240128455644e-06, + "loss": 0.0766, + "step": 24564 + }, + { + "epoch": 2.912960986600261, + "grad_norm": 0.5826241571123815, + "learning_rate": 9.074389751961677e-06, + "loss": 0.0895, + "step": 24565 + }, + { + "epoch": 2.913079568362386, + "grad_norm": 0.9833514042613896, + "learning_rate": 9.072539522282314e-06, + "loss": 0.1309, + "step": 24566 + }, + { + "epoch": 2.913198150124511, + "grad_norm": 0.7677964088990966, + "learning_rate": 9.070689439434587e-06, + "loss": 0.1017, + "step": 24567 + }, + { + "epoch": 2.9133167318866358, + "grad_norm": 0.715549241582121, + "learning_rate": 9.06883950343559e-06, + "loss": 0.0798, + "step": 24568 + }, + { + "epoch": 2.913435313648761, + "grad_norm": 0.48223449142172586, + "learning_rate": 9.066989714302345e-06, + "loss": 0.0594, + "step": 24569 + }, + { + "epoch": 2.9135538954108857, + "grad_norm": 0.8447312785872844, + "learning_rate": 9.06514007205192e-06, + "loss": 0.0853, + "step": 24570 + }, + { + "epoch": 2.913672477173011, + "grad_norm": 0.6170664255268716, + "learning_rate": 9.063290576701362e-06, + "loss": 0.0821, + "step": 24571 + }, + { + "epoch": 2.9137910589351357, + "grad_norm": 0.5266074273398179, + "learning_rate": 9.06144122826772e-06, + "loss": 0.0628, + "step": 24572 + }, + { + "epoch": 2.913909640697261, + "grad_norm": 0.8998910404972797, + "learning_rate": 9.059592026768054e-06, + "loss": 0.1018, + "step": 24573 + }, + { + "epoch": 2.9140282224593856, + "grad_norm": 0.7679503312634754, + "learning_rate": 9.057742972219397e-06, + "loss": 0.0987, + "step": 24574 + }, + { + "epoch": 2.914146804221511, + "grad_norm": 0.8251374611790814, + "learning_rate": 9.055894064638795e-06, + "loss": 0.1047, + "step": 24575 + }, + { + "epoch": 2.9142653859836356, + "grad_norm": 0.7020711741097312, + "learning_rate": 9.0540453040433e-06, + "loss": 0.0885, + "step": 24576 + }, + { + "epoch": 2.914383967745761, + "grad_norm": 0.4692628626252707, + "learning_rate": 9.052196690449963e-06, + "loss": 0.0612, + "step": 24577 + }, + { + "epoch": 2.9145025495078856, + "grad_norm": 0.4992200649538469, + "learning_rate": 9.050348223875804e-06, + "loss": 0.0584, + "step": 24578 + }, + { + "epoch": 2.9146211312700108, + "grad_norm": 0.8157333203874256, + "learning_rate": 9.04849990433788e-06, + "loss": 0.0956, + "step": 24579 + }, + { + "epoch": 2.9147397130321355, + "grad_norm": 0.7884160770057916, + "learning_rate": 9.046651731853223e-06, + "loss": 0.0969, + "step": 24580 + }, + { + "epoch": 2.9148582947942607, + "grad_norm": 0.7720295596017132, + "learning_rate": 9.044803706438873e-06, + "loss": 0.0865, + "step": 24581 + }, + { + "epoch": 2.9149768765563855, + "grad_norm": 0.5977832833181875, + "learning_rate": 9.042955828111868e-06, + "loss": 0.082, + "step": 24582 + }, + { + "epoch": 2.9150954583185107, + "grad_norm": 0.5264287908229192, + "learning_rate": 9.041108096889242e-06, + "loss": 0.0588, + "step": 24583 + }, + { + "epoch": 2.9152140400806354, + "grad_norm": 0.5750669073661085, + "learning_rate": 9.03926051278804e-06, + "loss": 0.0676, + "step": 24584 + }, + { + "epoch": 2.9153326218427607, + "grad_norm": 0.7687242926380207, + "learning_rate": 9.037413075825275e-06, + "loss": 0.0709, + "step": 24585 + }, + { + "epoch": 2.9154512036048854, + "grad_norm": 0.9981743438269084, + "learning_rate": 9.035565786017986e-06, + "loss": 0.1376, + "step": 24586 + }, + { + "epoch": 2.9155697853670106, + "grad_norm": 0.6868287604211158, + "learning_rate": 9.033718643383201e-06, + "loss": 0.0954, + "step": 24587 + }, + { + "epoch": 2.9156883671291354, + "grad_norm": 0.7652729826443756, + "learning_rate": 9.031871647937961e-06, + "loss": 0.0961, + "step": 24588 + }, + { + "epoch": 2.9158069488912606, + "grad_norm": 0.7800252313588222, + "learning_rate": 9.030024799699276e-06, + "loss": 0.0659, + "step": 24589 + }, + { + "epoch": 2.9159255306533858, + "grad_norm": 0.3879649306763293, + "learning_rate": 9.02817809868418e-06, + "loss": 0.0509, + "step": 24590 + }, + { + "epoch": 2.9160441124155105, + "grad_norm": 1.1559761245524864, + "learning_rate": 9.026331544909692e-06, + "loss": 0.1745, + "step": 24591 + }, + { + "epoch": 2.9161626941776353, + "grad_norm": 0.539229000152414, + "learning_rate": 9.024485138392841e-06, + "loss": 0.0707, + "step": 24592 + }, + { + "epoch": 2.9162812759397605, + "grad_norm": 0.5103741350237563, + "learning_rate": 9.022638879150656e-06, + "loss": 0.0546, + "step": 24593 + }, + { + "epoch": 2.9163998577018857, + "grad_norm": 0.903861819156669, + "learning_rate": 9.020792767200129e-06, + "loss": 0.1348, + "step": 24594 + }, + { + "epoch": 2.9165184394640105, + "grad_norm": 0.49291135759426813, + "learning_rate": 9.018946802558315e-06, + "loss": 0.0728, + "step": 24595 + }, + { + "epoch": 2.916637021226135, + "grad_norm": 0.7130782729136605, + "learning_rate": 9.017100985242207e-06, + "loss": 0.1008, + "step": 24596 + }, + { + "epoch": 2.9167556029882604, + "grad_norm": 0.717943051571022, + "learning_rate": 9.015255315268828e-06, + "loss": 0.1056, + "step": 24597 + }, + { + "epoch": 2.9168741847503856, + "grad_norm": 0.44462226229293017, + "learning_rate": 9.013409792655193e-06, + "loss": 0.0512, + "step": 24598 + }, + { + "epoch": 2.9169927665125104, + "grad_norm": 0.5893542222583511, + "learning_rate": 9.011564417418314e-06, + "loss": 0.066, + "step": 24599 + }, + { + "epoch": 2.917111348274635, + "grad_norm": 0.6074012851065281, + "learning_rate": 9.009719189575213e-06, + "loss": 0.0933, + "step": 24600 + }, + { + "epoch": 2.9172299300367603, + "grad_norm": 0.9547761506371666, + "learning_rate": 9.00787410914288e-06, + "loss": 0.1265, + "step": 24601 + }, + { + "epoch": 2.9173485117988855, + "grad_norm": 0.594829012684177, + "learning_rate": 9.00602917613835e-06, + "loss": 0.0655, + "step": 24602 + }, + { + "epoch": 2.9174670935610103, + "grad_norm": 0.8087484301969218, + "learning_rate": 9.00418439057861e-06, + "loss": 0.129, + "step": 24603 + }, + { + "epoch": 2.917585675323135, + "grad_norm": 0.4930951070831003, + "learning_rate": 9.002339752480684e-06, + "loss": 0.0709, + "step": 24604 + }, + { + "epoch": 2.9177042570852603, + "grad_norm": 0.4632219428155486, + "learning_rate": 9.00049526186155e-06, + "loss": 0.0583, + "step": 24605 + }, + { + "epoch": 2.9178228388473855, + "grad_norm": 0.44180306454807344, + "learning_rate": 8.998650918738246e-06, + "loss": 0.0504, + "step": 24606 + }, + { + "epoch": 2.9179414206095102, + "grad_norm": 0.9191200044034793, + "learning_rate": 8.996806723127751e-06, + "loss": 0.1065, + "step": 24607 + }, + { + "epoch": 2.918060002371635, + "grad_norm": 0.5340725977944338, + "learning_rate": 8.994962675047075e-06, + "loss": 0.0742, + "step": 24608 + }, + { + "epoch": 2.91817858413376, + "grad_norm": 0.7451228832175886, + "learning_rate": 8.993118774513215e-06, + "loss": 0.1164, + "step": 24609 + }, + { + "epoch": 2.9182971658958854, + "grad_norm": 0.6164350011484476, + "learning_rate": 8.991275021543171e-06, + "loss": 0.0694, + "step": 24610 + }, + { + "epoch": 2.91841574765801, + "grad_norm": 0.6182222022102911, + "learning_rate": 8.989431416153949e-06, + "loss": 0.0914, + "step": 24611 + }, + { + "epoch": 2.918534329420135, + "grad_norm": 0.6285152632443216, + "learning_rate": 8.987587958362517e-06, + "loss": 0.0856, + "step": 24612 + }, + { + "epoch": 2.91865291118226, + "grad_norm": 0.5228464056474311, + "learning_rate": 8.98574464818591e-06, + "loss": 0.0635, + "step": 24613 + }, + { + "epoch": 2.9187714929443853, + "grad_norm": 0.5144794743148586, + "learning_rate": 8.983901485641085e-06, + "loss": 0.0637, + "step": 24614 + }, + { + "epoch": 2.91889007470651, + "grad_norm": 0.7200460024503357, + "learning_rate": 8.98205847074506e-06, + "loss": 0.0976, + "step": 24615 + }, + { + "epoch": 2.9190086564686353, + "grad_norm": 0.7285023560210849, + "learning_rate": 8.980215603514797e-06, + "loss": 0.1047, + "step": 24616 + }, + { + "epoch": 2.91912723823076, + "grad_norm": 0.5647822893070963, + "learning_rate": 8.978372883967314e-06, + "loss": 0.0817, + "step": 24617 + }, + { + "epoch": 2.9192458199928852, + "grad_norm": 0.6688771391323443, + "learning_rate": 8.976530312119583e-06, + "loss": 0.0767, + "step": 24618 + }, + { + "epoch": 2.91936440175501, + "grad_norm": 0.6487078201441365, + "learning_rate": 8.97468788798859e-06, + "loss": 0.0798, + "step": 24619 + }, + { + "epoch": 2.919482983517135, + "grad_norm": 0.6702780148704641, + "learning_rate": 8.972845611591323e-06, + "loss": 0.0873, + "step": 24620 + }, + { + "epoch": 2.91960156527926, + "grad_norm": 0.7905632255481346, + "learning_rate": 8.971003482944767e-06, + "loss": 0.0867, + "step": 24621 + }, + { + "epoch": 2.919720147041385, + "grad_norm": 0.46321215629215806, + "learning_rate": 8.969161502065912e-06, + "loss": 0.0624, + "step": 24622 + }, + { + "epoch": 2.91983872880351, + "grad_norm": 0.8309863826109576, + "learning_rate": 8.96731966897171e-06, + "loss": 0.1168, + "step": 24623 + }, + { + "epoch": 2.919957310565635, + "grad_norm": 0.6098529348179421, + "learning_rate": 8.96547798367918e-06, + "loss": 0.07, + "step": 24624 + }, + { + "epoch": 2.92007589232776, + "grad_norm": 0.6649431780774924, + "learning_rate": 8.96363644620527e-06, + "loss": 0.1016, + "step": 24625 + }, + { + "epoch": 2.920194474089885, + "grad_norm": 0.5719213255371928, + "learning_rate": 8.961795056566974e-06, + "loss": 0.077, + "step": 24626 + }, + { + "epoch": 2.92031305585201, + "grad_norm": 0.809087186234105, + "learning_rate": 8.959953814781247e-06, + "loss": 0.0875, + "step": 24627 + }, + { + "epoch": 2.920431637614135, + "grad_norm": 0.668102695289529, + "learning_rate": 8.958112720865089e-06, + "loss": 0.0875, + "step": 24628 + }, + { + "epoch": 2.92055021937626, + "grad_norm": 0.6743606174459467, + "learning_rate": 8.956271774835454e-06, + "loss": 0.0892, + "step": 24629 + }, + { + "epoch": 2.920668801138385, + "grad_norm": 0.6593847209950313, + "learning_rate": 8.954430976709318e-06, + "loss": 0.0885, + "step": 24630 + }, + { + "epoch": 2.9207873829005098, + "grad_norm": 0.46436674774844977, + "learning_rate": 8.952590326503655e-06, + "loss": 0.0504, + "step": 24631 + }, + { + "epoch": 2.920905964662635, + "grad_norm": 0.5396144921450765, + "learning_rate": 8.950749824235428e-06, + "loss": 0.0581, + "step": 24632 + }, + { + "epoch": 2.9210245464247597, + "grad_norm": 0.6275604963658415, + "learning_rate": 8.948909469921615e-06, + "loss": 0.0774, + "step": 24633 + }, + { + "epoch": 2.921143128186885, + "grad_norm": 0.7750265312496867, + "learning_rate": 8.947069263579163e-06, + "loss": 0.109, + "step": 24634 + }, + { + "epoch": 2.9212617099490097, + "grad_norm": 0.6428000810637188, + "learning_rate": 8.945229205225051e-06, + "loss": 0.0786, + "step": 24635 + }, + { + "epoch": 2.921380291711135, + "grad_norm": 0.6487431389471783, + "learning_rate": 8.943389294876236e-06, + "loss": 0.0684, + "step": 24636 + }, + { + "epoch": 2.9214988734732596, + "grad_norm": 0.498225467813398, + "learning_rate": 8.941549532549692e-06, + "loss": 0.0725, + "step": 24637 + }, + { + "epoch": 2.921617455235385, + "grad_norm": 0.396240331672947, + "learning_rate": 8.939709918262351e-06, + "loss": 0.046, + "step": 24638 + }, + { + "epoch": 2.92173603699751, + "grad_norm": 0.5892018995995384, + "learning_rate": 8.937870452031205e-06, + "loss": 0.0692, + "step": 24639 + }, + { + "epoch": 2.921854618759635, + "grad_norm": 0.7042846627434335, + "learning_rate": 8.93603113387319e-06, + "loss": 0.0976, + "step": 24640 + }, + { + "epoch": 2.9219732005217596, + "grad_norm": 0.6233061696148056, + "learning_rate": 8.934191963805272e-06, + "loss": 0.0812, + "step": 24641 + }, + { + "epoch": 2.9220917822838848, + "grad_norm": 0.8579335618270972, + "learning_rate": 8.9323529418444e-06, + "loss": 0.1245, + "step": 24642 + }, + { + "epoch": 2.92221036404601, + "grad_norm": 1.0854301179500851, + "learning_rate": 8.930514068007529e-06, + "loss": 0.1414, + "step": 24643 + }, + { + "epoch": 2.9223289458081347, + "grad_norm": 0.5006394364394183, + "learning_rate": 8.928675342311625e-06, + "loss": 0.0643, + "step": 24644 + }, + { + "epoch": 2.9224475275702595, + "grad_norm": 0.648214586092759, + "learning_rate": 8.926836764773613e-06, + "loss": 0.0889, + "step": 24645 + }, + { + "epoch": 2.9225661093323847, + "grad_norm": 0.4485280451013365, + "learning_rate": 8.924998335410459e-06, + "loss": 0.0571, + "step": 24646 + }, + { + "epoch": 2.92268469109451, + "grad_norm": 0.7171626571866129, + "learning_rate": 8.923160054239108e-06, + "loss": 0.1179, + "step": 24647 + }, + { + "epoch": 2.9228032728566347, + "grad_norm": 0.8261920462896355, + "learning_rate": 8.921321921276504e-06, + "loss": 0.1277, + "step": 24648 + }, + { + "epoch": 2.9229218546187594, + "grad_norm": 0.9247288030825495, + "learning_rate": 8.919483936539608e-06, + "loss": 0.1039, + "step": 24649 + }, + { + "epoch": 2.9230404363808846, + "grad_norm": 0.5691949632843076, + "learning_rate": 8.91764610004533e-06, + "loss": 0.0651, + "step": 24650 + }, + { + "epoch": 2.92315901814301, + "grad_norm": 0.4807495894415854, + "learning_rate": 8.91580841181065e-06, + "loss": 0.057, + "step": 24651 + }, + { + "epoch": 2.9232775999051346, + "grad_norm": 0.5744179337683247, + "learning_rate": 8.913970871852487e-06, + "loss": 0.0938, + "step": 24652 + }, + { + "epoch": 2.9233961816672593, + "grad_norm": 0.9250710475703056, + "learning_rate": 8.912133480187785e-06, + "loss": 0.1025, + "step": 24653 + }, + { + "epoch": 2.9235147634293845, + "grad_norm": 0.6167917133282643, + "learning_rate": 8.910296236833484e-06, + "loss": 0.0955, + "step": 24654 + }, + { + "epoch": 2.9236333451915097, + "grad_norm": 0.7048539954028515, + "learning_rate": 8.90845914180653e-06, + "loss": 0.0875, + "step": 24655 + }, + { + "epoch": 2.9237519269536345, + "grad_norm": 0.7231351514781331, + "learning_rate": 8.906622195123842e-06, + "loss": 0.1038, + "step": 24656 + }, + { + "epoch": 2.9238705087157593, + "grad_norm": 0.6390732490459617, + "learning_rate": 8.904785396802363e-06, + "loss": 0.0744, + "step": 24657 + }, + { + "epoch": 2.9239890904778845, + "grad_norm": 0.6566273075315932, + "learning_rate": 8.902948746859025e-06, + "loss": 0.0814, + "step": 24658 + }, + { + "epoch": 2.9241076722400097, + "grad_norm": 0.5699865364524855, + "learning_rate": 8.901112245310758e-06, + "loss": 0.0561, + "step": 24659 + }, + { + "epoch": 2.9242262540021344, + "grad_norm": 0.5475287301645801, + "learning_rate": 8.899275892174506e-06, + "loss": 0.0545, + "step": 24660 + }, + { + "epoch": 2.924344835764259, + "grad_norm": 0.5687601253948326, + "learning_rate": 8.897439687467165e-06, + "loss": 0.0754, + "step": 24661 + }, + { + "epoch": 2.9244634175263844, + "grad_norm": 0.5485067429610113, + "learning_rate": 8.895603631205705e-06, + "loss": 0.0735, + "step": 24662 + }, + { + "epoch": 2.9245819992885096, + "grad_norm": 0.7721686334340689, + "learning_rate": 8.89376772340702e-06, + "loss": 0.1086, + "step": 24663 + }, + { + "epoch": 2.9247005810506344, + "grad_norm": 0.7807047624704917, + "learning_rate": 8.891931964088054e-06, + "loss": 0.1341, + "step": 24664 + }, + { + "epoch": 2.9248191628127596, + "grad_norm": 0.8467109116724829, + "learning_rate": 8.89009635326571e-06, + "loss": 0.0929, + "step": 24665 + }, + { + "epoch": 2.9249377445748843, + "grad_norm": 0.41194673130605264, + "learning_rate": 8.888260890956935e-06, + "loss": 0.047, + "step": 24666 + }, + { + "epoch": 2.9250563263370095, + "grad_norm": 0.9455020209947617, + "learning_rate": 8.886425577178628e-06, + "loss": 0.1286, + "step": 24667 + }, + { + "epoch": 2.9251749080991343, + "grad_norm": 0.6685832469525708, + "learning_rate": 8.884590411947719e-06, + "loss": 0.0924, + "step": 24668 + }, + { + "epoch": 2.9252934898612595, + "grad_norm": 0.7490620127521881, + "learning_rate": 8.882755395281123e-06, + "loss": 0.0862, + "step": 24669 + }, + { + "epoch": 2.9254120716233842, + "grad_norm": 0.46771905294981925, + "learning_rate": 8.88092052719576e-06, + "loss": 0.0633, + "step": 24670 + }, + { + "epoch": 2.9255306533855094, + "grad_norm": 0.697712878110552, + "learning_rate": 8.879085807708552e-06, + "loss": 0.0864, + "step": 24671 + }, + { + "epoch": 2.925649235147634, + "grad_norm": 0.6061346917250952, + "learning_rate": 8.877251236836383e-06, + "loss": 0.0845, + "step": 24672 + }, + { + "epoch": 2.9257678169097594, + "grad_norm": 0.5909073209392788, + "learning_rate": 8.875416814596207e-06, + "loss": 0.0689, + "step": 24673 + }, + { + "epoch": 2.925886398671884, + "grad_norm": 0.5452377077395657, + "learning_rate": 8.873582541004907e-06, + "loss": 0.0807, + "step": 24674 + }, + { + "epoch": 2.9260049804340094, + "grad_norm": 0.7187729996279948, + "learning_rate": 8.871748416079406e-06, + "loss": 0.1081, + "step": 24675 + }, + { + "epoch": 2.926123562196134, + "grad_norm": 0.7911401264971634, + "learning_rate": 8.869914439836594e-06, + "loss": 0.0838, + "step": 24676 + }, + { + "epoch": 2.9262421439582593, + "grad_norm": 0.4966693046276024, + "learning_rate": 8.868080612293403e-06, + "loss": 0.0664, + "step": 24677 + }, + { + "epoch": 2.926360725720384, + "grad_norm": 0.815528489649856, + "learning_rate": 8.866246933466721e-06, + "loss": 0.0985, + "step": 24678 + }, + { + "epoch": 2.9264793074825093, + "grad_norm": 1.0855030725468082, + "learning_rate": 8.864413403373456e-06, + "loss": 0.1283, + "step": 24679 + }, + { + "epoch": 2.926597889244634, + "grad_norm": 0.5622127189312047, + "learning_rate": 8.862580022030515e-06, + "loss": 0.0701, + "step": 24680 + }, + { + "epoch": 2.9267164710067592, + "grad_norm": 0.7776329305306384, + "learning_rate": 8.860746789454796e-06, + "loss": 0.0958, + "step": 24681 + }, + { + "epoch": 2.926835052768884, + "grad_norm": 0.5031012913141516, + "learning_rate": 8.858913705663207e-06, + "loss": 0.0617, + "step": 24682 + }, + { + "epoch": 2.926953634531009, + "grad_norm": 0.6847242554563422, + "learning_rate": 8.857080770672626e-06, + "loss": 0.0734, + "step": 24683 + }, + { + "epoch": 2.927072216293134, + "grad_norm": 0.6169857162620137, + "learning_rate": 8.85524798449998e-06, + "loss": 0.0784, + "step": 24684 + }, + { + "epoch": 2.927190798055259, + "grad_norm": 0.42632011916537993, + "learning_rate": 8.853415347162142e-06, + "loss": 0.0539, + "step": 24685 + }, + { + "epoch": 2.927309379817384, + "grad_norm": 0.6859775026656922, + "learning_rate": 8.851582858676022e-06, + "loss": 0.0738, + "step": 24686 + }, + { + "epoch": 2.927427961579509, + "grad_norm": 0.6105624407216185, + "learning_rate": 8.84975051905849e-06, + "loss": 0.0816, + "step": 24687 + }, + { + "epoch": 2.9275465433416343, + "grad_norm": 0.6316565231513708, + "learning_rate": 8.847918328326469e-06, + "loss": 0.0887, + "step": 24688 + }, + { + "epoch": 2.927665125103759, + "grad_norm": 0.5190690438104787, + "learning_rate": 8.846086286496824e-06, + "loss": 0.0738, + "step": 24689 + }, + { + "epoch": 2.927783706865884, + "grad_norm": 0.5598389706474172, + "learning_rate": 8.844254393586456e-06, + "loss": 0.0564, + "step": 24690 + }, + { + "epoch": 2.927902288628009, + "grad_norm": 0.560566394850453, + "learning_rate": 8.842422649612253e-06, + "loss": 0.072, + "step": 24691 + }, + { + "epoch": 2.9280208703901343, + "grad_norm": 0.6223906236228309, + "learning_rate": 8.840591054591096e-06, + "loss": 0.0701, + "step": 24692 + }, + { + "epoch": 2.928139452152259, + "grad_norm": 0.7250588018316637, + "learning_rate": 8.838759608539884e-06, + "loss": 0.0817, + "step": 24693 + }, + { + "epoch": 2.9282580339143838, + "grad_norm": 0.7143226237190697, + "learning_rate": 8.836928311475473e-06, + "loss": 0.0845, + "step": 24694 + }, + { + "epoch": 2.928376615676509, + "grad_norm": 0.6380522300206152, + "learning_rate": 8.83509716341478e-06, + "loss": 0.0812, + "step": 24695 + }, + { + "epoch": 2.928495197438634, + "grad_norm": 0.4294251711641108, + "learning_rate": 8.833266164374657e-06, + "loss": 0.0602, + "step": 24696 + }, + { + "epoch": 2.928613779200759, + "grad_norm": 0.5587718264127084, + "learning_rate": 8.831435314371996e-06, + "loss": 0.0904, + "step": 24697 + }, + { + "epoch": 2.9287323609628837, + "grad_norm": 0.9896487192926706, + "learning_rate": 8.829604613423678e-06, + "loss": 0.0817, + "step": 24698 + }, + { + "epoch": 2.928850942725009, + "grad_norm": 0.3720439336818933, + "learning_rate": 8.827774061546574e-06, + "loss": 0.05, + "step": 24699 + }, + { + "epoch": 2.928969524487134, + "grad_norm": 0.6630687419879981, + "learning_rate": 8.825943658757568e-06, + "loss": 0.0934, + "step": 24700 + }, + { + "epoch": 2.929088106249259, + "grad_norm": 0.40713828424587184, + "learning_rate": 8.82411340507352e-06, + "loss": 0.0566, + "step": 24701 + }, + { + "epoch": 2.9292066880113836, + "grad_norm": 0.5028429455011845, + "learning_rate": 8.822283300511312e-06, + "loss": 0.0676, + "step": 24702 + }, + { + "epoch": 2.929325269773509, + "grad_norm": 0.6802922654237165, + "learning_rate": 8.820453345087809e-06, + "loss": 0.0901, + "step": 24703 + }, + { + "epoch": 2.929443851535634, + "grad_norm": 0.6203841947370131, + "learning_rate": 8.818623538819898e-06, + "loss": 0.073, + "step": 24704 + }, + { + "epoch": 2.929562433297759, + "grad_norm": 0.8628518246160247, + "learning_rate": 8.816793881724424e-06, + "loss": 0.0977, + "step": 24705 + }, + { + "epoch": 2.9296810150598835, + "grad_norm": 0.773125234113313, + "learning_rate": 8.814964373818268e-06, + "loss": 0.1172, + "step": 24706 + }, + { + "epoch": 2.9297995968220087, + "grad_norm": 0.6782259658374168, + "learning_rate": 8.813135015118292e-06, + "loss": 0.0616, + "step": 24707 + }, + { + "epoch": 2.929918178584134, + "grad_norm": 0.6149852643136238, + "learning_rate": 8.811305805641363e-06, + "loss": 0.0624, + "step": 24708 + }, + { + "epoch": 2.9300367603462587, + "grad_norm": 0.5758848304287975, + "learning_rate": 8.80947674540434e-06, + "loss": 0.0883, + "step": 24709 + }, + { + "epoch": 2.9301553421083835, + "grad_norm": 0.5535755407820191, + "learning_rate": 8.807647834424088e-06, + "loss": 0.0676, + "step": 24710 + }, + { + "epoch": 2.9302739238705087, + "grad_norm": 0.5384370086560454, + "learning_rate": 8.805819072717475e-06, + "loss": 0.0702, + "step": 24711 + }, + { + "epoch": 2.930392505632634, + "grad_norm": 1.0593883243901645, + "learning_rate": 8.803990460301343e-06, + "loss": 0.1308, + "step": 24712 + }, + { + "epoch": 2.9305110873947586, + "grad_norm": 0.6221016689316223, + "learning_rate": 8.80216199719256e-06, + "loss": 0.0854, + "step": 24713 + }, + { + "epoch": 2.930629669156884, + "grad_norm": 0.4723742873395775, + "learning_rate": 8.800333683407977e-06, + "loss": 0.076, + "step": 24714 + }, + { + "epoch": 2.9307482509190086, + "grad_norm": 0.8703270975149389, + "learning_rate": 8.798505518964465e-06, + "loss": 0.1187, + "step": 24715 + }, + { + "epoch": 2.930866832681134, + "grad_norm": 0.5447046087274953, + "learning_rate": 8.79667750387885e-06, + "loss": 0.0684, + "step": 24716 + }, + { + "epoch": 2.9309854144432586, + "grad_norm": 0.5589663524525419, + "learning_rate": 8.794849638168004e-06, + "loss": 0.0617, + "step": 24717 + }, + { + "epoch": 2.9311039962053838, + "grad_norm": 0.8464465475164042, + "learning_rate": 8.793021921848772e-06, + "loss": 0.0978, + "step": 24718 + }, + { + "epoch": 2.9312225779675085, + "grad_norm": 0.9922520043326963, + "learning_rate": 8.791194354937998e-06, + "loss": 0.1329, + "step": 24719 + }, + { + "epoch": 2.9313411597296337, + "grad_norm": 0.8459845493191571, + "learning_rate": 8.78936693745255e-06, + "loss": 0.0998, + "step": 24720 + }, + { + "epoch": 2.9314597414917585, + "grad_norm": 0.8323812875289722, + "learning_rate": 8.787539669409239e-06, + "loss": 0.0964, + "step": 24721 + }, + { + "epoch": 2.9315783232538837, + "grad_norm": 0.4620373146061505, + "learning_rate": 8.78571255082495e-06, + "loss": 0.0615, + "step": 24722 + }, + { + "epoch": 2.9316969050160084, + "grad_norm": 0.717080625708546, + "learning_rate": 8.7838855817165e-06, + "loss": 0.0979, + "step": 24723 + }, + { + "epoch": 2.9318154867781336, + "grad_norm": 0.5040615255725238, + "learning_rate": 8.782058762100739e-06, + "loss": 0.0571, + "step": 24724 + }, + { + "epoch": 2.9319340685402584, + "grad_norm": 0.6417192795877278, + "learning_rate": 8.780232091994509e-06, + "loss": 0.0903, + "step": 24725 + }, + { + "epoch": 2.9320526503023836, + "grad_norm": 0.7360877623724098, + "learning_rate": 8.778405571414658e-06, + "loss": 0.0772, + "step": 24726 + }, + { + "epoch": 2.9321712320645084, + "grad_norm": 0.5935160281793074, + "learning_rate": 8.776579200378004e-06, + "loss": 0.0845, + "step": 24727 + }, + { + "epoch": 2.9322898138266336, + "grad_norm": 0.5516238640933109, + "learning_rate": 8.7747529789014e-06, + "loss": 0.0802, + "step": 24728 + }, + { + "epoch": 2.9324083955887583, + "grad_norm": 0.6396126585044817, + "learning_rate": 8.772926907001672e-06, + "loss": 0.0606, + "step": 24729 + }, + { + "epoch": 2.9325269773508835, + "grad_norm": 0.7513802395381658, + "learning_rate": 8.771100984695662e-06, + "loss": 0.0876, + "step": 24730 + }, + { + "epoch": 2.9326455591130083, + "grad_norm": 0.7429696194609956, + "learning_rate": 8.769275212000208e-06, + "loss": 0.1014, + "step": 24731 + }, + { + "epoch": 2.9327641408751335, + "grad_norm": 0.7586738873501908, + "learning_rate": 8.767449588932114e-06, + "loss": 0.0968, + "step": 24732 + }, + { + "epoch": 2.9328827226372582, + "grad_norm": 1.2887219140965822, + "learning_rate": 8.765624115508247e-06, + "loss": 0.1333, + "step": 24733 + }, + { + "epoch": 2.9330013043993834, + "grad_norm": 0.8100491864330978, + "learning_rate": 8.763798791745411e-06, + "loss": 0.1077, + "step": 24734 + }, + { + "epoch": 2.933119886161508, + "grad_norm": 0.5373006659545942, + "learning_rate": 8.761973617660446e-06, + "loss": 0.0622, + "step": 24735 + }, + { + "epoch": 2.9332384679236334, + "grad_norm": 0.5633486661377247, + "learning_rate": 8.760148593270156e-06, + "loss": 0.07, + "step": 24736 + }, + { + "epoch": 2.933357049685758, + "grad_norm": 0.7064455730295925, + "learning_rate": 8.758323718591399e-06, + "loss": 0.1137, + "step": 24737 + }, + { + "epoch": 2.9334756314478834, + "grad_norm": 0.5849761869605637, + "learning_rate": 8.756498993640972e-06, + "loss": 0.087, + "step": 24738 + }, + { + "epoch": 2.933594213210008, + "grad_norm": 0.709219773385666, + "learning_rate": 8.754674418435704e-06, + "loss": 0.0652, + "step": 24739 + }, + { + "epoch": 2.9337127949721333, + "grad_norm": 0.783231466224613, + "learning_rate": 8.752849992992417e-06, + "loss": 0.1143, + "step": 24740 + }, + { + "epoch": 2.9338313767342585, + "grad_norm": 0.5357709462577418, + "learning_rate": 8.751025717327929e-06, + "loss": 0.0658, + "step": 24741 + }, + { + "epoch": 2.9339499584963833, + "grad_norm": 1.1876952322933052, + "learning_rate": 8.749201591459069e-06, + "loss": 0.1096, + "step": 24742 + }, + { + "epoch": 2.934068540258508, + "grad_norm": 0.5775612374245657, + "learning_rate": 8.747377615402624e-06, + "loss": 0.0695, + "step": 24743 + }, + { + "epoch": 2.9341871220206333, + "grad_norm": 0.7962539643400423, + "learning_rate": 8.745553789175443e-06, + "loss": 0.1055, + "step": 24744 + }, + { + "epoch": 2.9343057037827585, + "grad_norm": 0.5370744660576445, + "learning_rate": 8.743730112794315e-06, + "loss": 0.0602, + "step": 24745 + }, + { + "epoch": 2.934424285544883, + "grad_norm": 0.5256012041210142, + "learning_rate": 8.741906586276068e-06, + "loss": 0.073, + "step": 24746 + }, + { + "epoch": 2.934542867307008, + "grad_norm": 0.6077567697521994, + "learning_rate": 8.740083209637493e-06, + "loss": 0.0584, + "step": 24747 + }, + { + "epoch": 2.934661449069133, + "grad_norm": 0.768853600281149, + "learning_rate": 8.738259982895425e-06, + "loss": 0.1042, + "step": 24748 + }, + { + "epoch": 2.9347800308312584, + "grad_norm": 0.7186041625939387, + "learning_rate": 8.73643690606665e-06, + "loss": 0.0858, + "step": 24749 + }, + { + "epoch": 2.934898612593383, + "grad_norm": 0.5656160271605689, + "learning_rate": 8.734613979167985e-06, + "loss": 0.0798, + "step": 24750 + }, + { + "epoch": 2.935017194355508, + "grad_norm": 0.6221128715674417, + "learning_rate": 8.732791202216232e-06, + "loss": 0.0822, + "step": 24751 + }, + { + "epoch": 2.935135776117633, + "grad_norm": 0.6447154099687978, + "learning_rate": 8.730968575228193e-06, + "loss": 0.0839, + "step": 24752 + }, + { + "epoch": 2.9352543578797583, + "grad_norm": 0.6771676591759492, + "learning_rate": 8.729146098220687e-06, + "loss": 0.0977, + "step": 24753 + }, + { + "epoch": 2.935372939641883, + "grad_norm": 0.41102605502643474, + "learning_rate": 8.72732377121048e-06, + "loss": 0.0514, + "step": 24754 + }, + { + "epoch": 2.935491521404008, + "grad_norm": 0.5856953485085992, + "learning_rate": 8.725501594214413e-06, + "loss": 0.0684, + "step": 24755 + }, + { + "epoch": 2.935610103166133, + "grad_norm": 0.47818693549072966, + "learning_rate": 8.723679567249254e-06, + "loss": 0.064, + "step": 24756 + }, + { + "epoch": 2.9357286849282582, + "grad_norm": 1.0708643362904045, + "learning_rate": 8.72185769033181e-06, + "loss": 0.1324, + "step": 24757 + }, + { + "epoch": 2.935847266690383, + "grad_norm": 0.5928517530064702, + "learning_rate": 8.720035963478878e-06, + "loss": 0.0867, + "step": 24758 + }, + { + "epoch": 2.9359658484525077, + "grad_norm": 0.6483791990100886, + "learning_rate": 8.71821438670725e-06, + "loss": 0.0915, + "step": 24759 + }, + { + "epoch": 2.936084430214633, + "grad_norm": 0.8310517485902916, + "learning_rate": 8.716392960033729e-06, + "loss": 0.0914, + "step": 24760 + }, + { + "epoch": 2.936203011976758, + "grad_norm": 0.5965338021885986, + "learning_rate": 8.714571683475083e-06, + "loss": 0.0733, + "step": 24761 + }, + { + "epoch": 2.936321593738883, + "grad_norm": 0.48540391515062387, + "learning_rate": 8.712750557048122e-06, + "loss": 0.0746, + "step": 24762 + }, + { + "epoch": 2.936440175501008, + "grad_norm": 0.9832935264964617, + "learning_rate": 8.710929580769625e-06, + "loss": 0.1345, + "step": 24763 + }, + { + "epoch": 2.936558757263133, + "grad_norm": 0.4145882546448261, + "learning_rate": 8.709108754656392e-06, + "loss": 0.0441, + "step": 24764 + }, + { + "epoch": 2.936677339025258, + "grad_norm": 0.5593425107200681, + "learning_rate": 8.70728807872519e-06, + "loss": 0.0793, + "step": 24765 + }, + { + "epoch": 2.936795920787383, + "grad_norm": 0.6079767405943625, + "learning_rate": 8.70546755299281e-06, + "loss": 0.0913, + "step": 24766 + }, + { + "epoch": 2.936914502549508, + "grad_norm": 0.7112365716887155, + "learning_rate": 8.70364717747604e-06, + "loss": 0.0906, + "step": 24767 + }, + { + "epoch": 2.937033084311633, + "grad_norm": 0.6082529972350551, + "learning_rate": 8.70182695219166e-06, + "loss": 0.0866, + "step": 24768 + }, + { + "epoch": 2.937151666073758, + "grad_norm": 0.8958845124015028, + "learning_rate": 8.700006877156447e-06, + "loss": 0.1041, + "step": 24769 + }, + { + "epoch": 2.9372702478358828, + "grad_norm": 0.7080701050598175, + "learning_rate": 8.698186952387186e-06, + "loss": 0.0963, + "step": 24770 + }, + { + "epoch": 2.937388829598008, + "grad_norm": 0.5351035425892591, + "learning_rate": 8.696367177900654e-06, + "loss": 0.0831, + "step": 24771 + }, + { + "epoch": 2.9375074113601327, + "grad_norm": 0.5778009730022323, + "learning_rate": 8.69454755371362e-06, + "loss": 0.0794, + "step": 24772 + }, + { + "epoch": 2.937625993122258, + "grad_norm": 0.8391491650282211, + "learning_rate": 8.69272807984286e-06, + "loss": 0.089, + "step": 24773 + }, + { + "epoch": 2.9377445748843827, + "grad_norm": 0.6850317050622876, + "learning_rate": 8.69090875630515e-06, + "loss": 0.0943, + "step": 24774 + }, + { + "epoch": 2.937863156646508, + "grad_norm": 0.7564830617307748, + "learning_rate": 8.689089583117269e-06, + "loss": 0.0919, + "step": 24775 + }, + { + "epoch": 2.9379817384086326, + "grad_norm": 0.5077279713063442, + "learning_rate": 8.687270560295974e-06, + "loss": 0.0539, + "step": 24776 + }, + { + "epoch": 2.938100320170758, + "grad_norm": 0.528191085574374, + "learning_rate": 8.68545168785804e-06, + "loss": 0.0772, + "step": 24777 + }, + { + "epoch": 2.9382189019328826, + "grad_norm": 1.131480232924144, + "learning_rate": 8.683632965820235e-06, + "loss": 0.0941, + "step": 24778 + }, + { + "epoch": 2.938337483695008, + "grad_norm": 0.5230578114063742, + "learning_rate": 8.681814394199325e-06, + "loss": 0.0725, + "step": 24779 + }, + { + "epoch": 2.9384560654571326, + "grad_norm": 0.8413845362559406, + "learning_rate": 8.679995973012076e-06, + "loss": 0.0939, + "step": 24780 + }, + { + "epoch": 2.9385746472192578, + "grad_norm": 0.6645354165832876, + "learning_rate": 8.67817770227525e-06, + "loss": 0.0764, + "step": 24781 + }, + { + "epoch": 2.9386932289813825, + "grad_norm": 0.6830753717994534, + "learning_rate": 8.676359582005622e-06, + "loss": 0.0936, + "step": 24782 + }, + { + "epoch": 2.9388118107435077, + "grad_norm": 0.7895109603097671, + "learning_rate": 8.67454161221993e-06, + "loss": 0.0856, + "step": 24783 + }, + { + "epoch": 2.9389303925056325, + "grad_norm": 0.7710936825768432, + "learning_rate": 8.672723792934946e-06, + "loss": 0.1071, + "step": 24784 + }, + { + "epoch": 2.9390489742677577, + "grad_norm": 0.5486363583566113, + "learning_rate": 8.670906124167427e-06, + "loss": 0.0606, + "step": 24785 + }, + { + "epoch": 2.9391675560298824, + "grad_norm": 0.729783791320936, + "learning_rate": 8.669088605934137e-06, + "loss": 0.1025, + "step": 24786 + }, + { + "epoch": 2.9392861377920076, + "grad_norm": 0.695958022958059, + "learning_rate": 8.667271238251814e-06, + "loss": 0.0877, + "step": 24787 + }, + { + "epoch": 2.9394047195541324, + "grad_norm": 0.5908772374661878, + "learning_rate": 8.665454021137226e-06, + "loss": 0.0798, + "step": 24788 + }, + { + "epoch": 2.9395233013162576, + "grad_norm": 0.7300094154407167, + "learning_rate": 8.663636954607119e-06, + "loss": 0.103, + "step": 24789 + }, + { + "epoch": 2.939641883078383, + "grad_norm": 0.40577617231242286, + "learning_rate": 8.661820038678245e-06, + "loss": 0.0603, + "step": 24790 + }, + { + "epoch": 2.9397604648405076, + "grad_norm": 0.5689876723912375, + "learning_rate": 8.660003273367365e-06, + "loss": 0.0622, + "step": 24791 + }, + { + "epoch": 2.9398790466026323, + "grad_norm": 0.9421611514933969, + "learning_rate": 8.6581866586912e-06, + "loss": 0.124, + "step": 24792 + }, + { + "epoch": 2.9399976283647575, + "grad_norm": 0.4122304771566546, + "learning_rate": 8.65637019466653e-06, + "loss": 0.0429, + "step": 24793 + }, + { + "epoch": 2.9401162101268827, + "grad_norm": 0.5904227874452045, + "learning_rate": 8.654553881310078e-06, + "loss": 0.0662, + "step": 24794 + }, + { + "epoch": 2.9402347918890075, + "grad_norm": 0.476765823747926, + "learning_rate": 8.652737718638596e-06, + "loss": 0.0706, + "step": 24795 + }, + { + "epoch": 2.9403533736511323, + "grad_norm": 0.5587955606299259, + "learning_rate": 8.650921706668823e-06, + "loss": 0.0812, + "step": 24796 + }, + { + "epoch": 2.9404719554132575, + "grad_norm": 0.695489738978365, + "learning_rate": 8.649105845417513e-06, + "loss": 0.0965, + "step": 24797 + }, + { + "epoch": 2.9405905371753827, + "grad_norm": 0.5661081060809391, + "learning_rate": 8.64729013490139e-06, + "loss": 0.0713, + "step": 24798 + }, + { + "epoch": 2.9407091189375074, + "grad_norm": 0.7597726475816727, + "learning_rate": 8.645474575137197e-06, + "loss": 0.1235, + "step": 24799 + }, + { + "epoch": 2.940827700699632, + "grad_norm": 0.37006713561369814, + "learning_rate": 8.643659166141673e-06, + "loss": 0.0445, + "step": 24800 + }, + { + "epoch": 2.9409462824617574, + "grad_norm": 0.66183928496454, + "learning_rate": 8.641843907931555e-06, + "loss": 0.0802, + "step": 24801 + }, + { + "epoch": 2.9410648642238826, + "grad_norm": 0.883969700142438, + "learning_rate": 8.640028800523583e-06, + "loss": 0.1116, + "step": 24802 + }, + { + "epoch": 2.9411834459860073, + "grad_norm": 0.5527947764433717, + "learning_rate": 8.638213843934467e-06, + "loss": 0.0796, + "step": 24803 + }, + { + "epoch": 2.941302027748132, + "grad_norm": 0.8099259027376194, + "learning_rate": 8.636399038180975e-06, + "loss": 0.1081, + "step": 24804 + }, + { + "epoch": 2.9414206095102573, + "grad_norm": 0.6464849850683712, + "learning_rate": 8.634584383279806e-06, + "loss": 0.0832, + "step": 24805 + }, + { + "epoch": 2.9415391912723825, + "grad_norm": 0.5934857759688033, + "learning_rate": 8.6327698792477e-06, + "loss": 0.0838, + "step": 24806 + }, + { + "epoch": 2.9416577730345073, + "grad_norm": 0.633325724755356, + "learning_rate": 8.630955526101389e-06, + "loss": 0.1012, + "step": 24807 + }, + { + "epoch": 2.941776354796632, + "grad_norm": 0.5836177236633525, + "learning_rate": 8.629141323857593e-06, + "loss": 0.0651, + "step": 24808 + }, + { + "epoch": 2.9418949365587572, + "grad_norm": 0.531091823877163, + "learning_rate": 8.627327272533046e-06, + "loss": 0.0773, + "step": 24809 + }, + { + "epoch": 2.9420135183208824, + "grad_norm": 0.8416932371869764, + "learning_rate": 8.625513372144447e-06, + "loss": 0.1096, + "step": 24810 + }, + { + "epoch": 2.942132100083007, + "grad_norm": 0.543199966491379, + "learning_rate": 8.623699622708554e-06, + "loss": 0.065, + "step": 24811 + }, + { + "epoch": 2.942250681845132, + "grad_norm": 0.4810339741080473, + "learning_rate": 8.621886024242057e-06, + "loss": 0.0567, + "step": 24812 + }, + { + "epoch": 2.942369263607257, + "grad_norm": 1.03391179959168, + "learning_rate": 8.620072576761697e-06, + "loss": 0.0842, + "step": 24813 + }, + { + "epoch": 2.9424878453693823, + "grad_norm": 0.43167789310052, + "learning_rate": 8.618259280284164e-06, + "loss": 0.0592, + "step": 24814 + }, + { + "epoch": 2.942606427131507, + "grad_norm": 0.84607828112058, + "learning_rate": 8.616446134826209e-06, + "loss": 0.0976, + "step": 24815 + }, + { + "epoch": 2.9427250088936323, + "grad_norm": 0.5566094102298988, + "learning_rate": 8.614633140404521e-06, + "loss": 0.0763, + "step": 24816 + }, + { + "epoch": 2.942843590655757, + "grad_norm": 0.5321887873103681, + "learning_rate": 8.612820297035823e-06, + "loss": 0.0608, + "step": 24817 + }, + { + "epoch": 2.9429621724178823, + "grad_norm": 0.7088996926898826, + "learning_rate": 8.611007604736827e-06, + "loss": 0.0669, + "step": 24818 + }, + { + "epoch": 2.943080754180007, + "grad_norm": 0.8924546912679007, + "learning_rate": 8.609195063524242e-06, + "loss": 0.1047, + "step": 24819 + }, + { + "epoch": 2.9431993359421322, + "grad_norm": 0.47075146627214687, + "learning_rate": 8.607382673414787e-06, + "loss": 0.0798, + "step": 24820 + }, + { + "epoch": 2.943317917704257, + "grad_norm": 0.6462048173919328, + "learning_rate": 8.605570434425156e-06, + "loss": 0.0724, + "step": 24821 + }, + { + "epoch": 2.943436499466382, + "grad_norm": 0.8569473381666879, + "learning_rate": 8.60375834657206e-06, + "loss": 0.1308, + "step": 24822 + }, + { + "epoch": 2.943555081228507, + "grad_norm": 1.0918654396600556, + "learning_rate": 8.601946409872205e-06, + "loss": 0.0977, + "step": 24823 + }, + { + "epoch": 2.943673662990632, + "grad_norm": 0.8436090383775363, + "learning_rate": 8.600134624342304e-06, + "loss": 0.0796, + "step": 24824 + }, + { + "epoch": 2.943792244752757, + "grad_norm": 0.64977172343791, + "learning_rate": 8.598322989999036e-06, + "loss": 0.1004, + "step": 24825 + }, + { + "epoch": 2.943910826514882, + "grad_norm": 0.5252524752845684, + "learning_rate": 8.596511506859132e-06, + "loss": 0.0629, + "step": 24826 + }, + { + "epoch": 2.944029408277007, + "grad_norm": 0.5367878055079518, + "learning_rate": 8.594700174939269e-06, + "loss": 0.0806, + "step": 24827 + }, + { + "epoch": 2.944147990039132, + "grad_norm": 0.638582383622751, + "learning_rate": 8.592888994256151e-06, + "loss": 0.0733, + "step": 24828 + }, + { + "epoch": 2.944266571801257, + "grad_norm": 0.7826894977517451, + "learning_rate": 8.591077964826477e-06, + "loss": 0.1023, + "step": 24829 + }, + { + "epoch": 2.944385153563382, + "grad_norm": 0.7345088143916557, + "learning_rate": 8.589267086666945e-06, + "loss": 0.102, + "step": 24830 + }, + { + "epoch": 2.944503735325507, + "grad_norm": 0.8711993271787554, + "learning_rate": 8.587456359794253e-06, + "loss": 0.1183, + "step": 24831 + }, + { + "epoch": 2.944622317087632, + "grad_norm": 0.45433350623435903, + "learning_rate": 8.585645784225077e-06, + "loss": 0.0747, + "step": 24832 + }, + { + "epoch": 2.9447408988497568, + "grad_norm": 0.7533104116620911, + "learning_rate": 8.583835359976119e-06, + "loss": 0.0795, + "step": 24833 + }, + { + "epoch": 2.944859480611882, + "grad_norm": 0.6116656994096187, + "learning_rate": 8.582025087064069e-06, + "loss": 0.0797, + "step": 24834 + }, + { + "epoch": 2.9449780623740067, + "grad_norm": 0.44621516219507434, + "learning_rate": 8.580214965505622e-06, + "loss": 0.0592, + "step": 24835 + }, + { + "epoch": 2.945096644136132, + "grad_norm": 0.5661043462759373, + "learning_rate": 8.578404995317452e-06, + "loss": 0.0868, + "step": 24836 + }, + { + "epoch": 2.9452152258982567, + "grad_norm": 0.9192700460723962, + "learning_rate": 8.57659517651625e-06, + "loss": 0.1147, + "step": 24837 + }, + { + "epoch": 2.945333807660382, + "grad_norm": 0.7366552196281656, + "learning_rate": 8.5747855091187e-06, + "loss": 0.104, + "step": 24838 + }, + { + "epoch": 2.945452389422507, + "grad_norm": 1.1217396287282013, + "learning_rate": 8.572975993141483e-06, + "loss": 0.1206, + "step": 24839 + }, + { + "epoch": 2.945570971184632, + "grad_norm": 0.6685521317026742, + "learning_rate": 8.571166628601287e-06, + "loss": 0.0805, + "step": 24840 + }, + { + "epoch": 2.9456895529467566, + "grad_norm": 0.7118684838264259, + "learning_rate": 8.569357415514786e-06, + "loss": 0.1027, + "step": 24841 + }, + { + "epoch": 2.945808134708882, + "grad_norm": 0.9054536084023223, + "learning_rate": 8.567548353898668e-06, + "loss": 0.1147, + "step": 24842 + }, + { + "epoch": 2.945926716471007, + "grad_norm": 0.5126747133043137, + "learning_rate": 8.565739443769599e-06, + "loss": 0.06, + "step": 24843 + }, + { + "epoch": 2.9460452982331318, + "grad_norm": 0.48421645178380696, + "learning_rate": 8.563930685144258e-06, + "loss": 0.0654, + "step": 24844 + }, + { + "epoch": 2.9461638799952565, + "grad_norm": 0.8550750406716907, + "learning_rate": 8.56212207803932e-06, + "loss": 0.1133, + "step": 24845 + }, + { + "epoch": 2.9462824617573817, + "grad_norm": 0.398746977071758, + "learning_rate": 8.560313622471466e-06, + "loss": 0.0518, + "step": 24846 + }, + { + "epoch": 2.946401043519507, + "grad_norm": 0.6700064202730145, + "learning_rate": 8.558505318457356e-06, + "loss": 0.0908, + "step": 24847 + }, + { + "epoch": 2.9465196252816317, + "grad_norm": 0.5268597144375241, + "learning_rate": 8.556697166013664e-06, + "loss": 0.0753, + "step": 24848 + }, + { + "epoch": 2.9466382070437565, + "grad_norm": 0.8061635432075565, + "learning_rate": 8.554889165157057e-06, + "loss": 0.0977, + "step": 24849 + }, + { + "epoch": 2.9467567888058817, + "grad_norm": 0.5352903107254531, + "learning_rate": 8.553081315904208e-06, + "loss": 0.0682, + "step": 24850 + }, + { + "epoch": 2.946875370568007, + "grad_norm": 0.8506801156107616, + "learning_rate": 8.551273618271788e-06, + "loss": 0.1117, + "step": 24851 + }, + { + "epoch": 2.9469939523301316, + "grad_norm": 0.5927901605876238, + "learning_rate": 8.549466072276441e-06, + "loss": 0.0779, + "step": 24852 + }, + { + "epoch": 2.9471125340922564, + "grad_norm": 0.7321066979785279, + "learning_rate": 8.547658677934855e-06, + "loss": 0.0903, + "step": 24853 + }, + { + "epoch": 2.9472311158543816, + "grad_norm": 0.41059949918497873, + "learning_rate": 8.545851435263677e-06, + "loss": 0.0528, + "step": 24854 + }, + { + "epoch": 2.947349697616507, + "grad_norm": 0.7077606295676836, + "learning_rate": 8.54404434427957e-06, + "loss": 0.088, + "step": 24855 + }, + { + "epoch": 2.9474682793786315, + "grad_norm": 0.7797914032803686, + "learning_rate": 8.542237404999196e-06, + "loss": 0.1112, + "step": 24856 + }, + { + "epoch": 2.9475868611407563, + "grad_norm": 0.5395873961467965, + "learning_rate": 8.540430617439208e-06, + "loss": 0.0634, + "step": 24857 + }, + { + "epoch": 2.9477054429028815, + "grad_norm": 0.6185022815217405, + "learning_rate": 8.538623981616275e-06, + "loss": 0.088, + "step": 24858 + }, + { + "epoch": 2.9478240246650067, + "grad_norm": 0.6757455335825635, + "learning_rate": 8.536817497547029e-06, + "loss": 0.0935, + "step": 24859 + }, + { + "epoch": 2.9479426064271315, + "grad_norm": 0.9048632900337387, + "learning_rate": 8.53501116524815e-06, + "loss": 0.1021, + "step": 24860 + }, + { + "epoch": 2.948061188189256, + "grad_norm": 0.9357293604857116, + "learning_rate": 8.53320498473627e-06, + "loss": 0.1207, + "step": 24861 + }, + { + "epoch": 2.9481797699513814, + "grad_norm": 0.6373912866305842, + "learning_rate": 8.531398956028055e-06, + "loss": 0.0909, + "step": 24862 + }, + { + "epoch": 2.9482983517135066, + "grad_norm": 0.7187456551582418, + "learning_rate": 8.529593079140133e-06, + "loss": 0.0907, + "step": 24863 + }, + { + "epoch": 2.9484169334756314, + "grad_norm": 0.7052476654526412, + "learning_rate": 8.527787354089179e-06, + "loss": 0.0791, + "step": 24864 + }, + { + "epoch": 2.9485355152377566, + "grad_norm": 0.8560693790522845, + "learning_rate": 8.525981780891818e-06, + "loss": 0.1299, + "step": 24865 + }, + { + "epoch": 2.9486540969998813, + "grad_norm": 0.7665981581159069, + "learning_rate": 8.524176359564703e-06, + "loss": 0.0986, + "step": 24866 + }, + { + "epoch": 2.9487726787620065, + "grad_norm": 0.6707623420171622, + "learning_rate": 8.522371090124479e-06, + "loss": 0.0781, + "step": 24867 + }, + { + "epoch": 2.9488912605241313, + "grad_norm": 0.35517605935763413, + "learning_rate": 8.520565972587788e-06, + "loss": 0.0459, + "step": 24868 + }, + { + "epoch": 2.9490098422862565, + "grad_norm": 0.6893995382469758, + "learning_rate": 8.518761006971276e-06, + "loss": 0.0753, + "step": 24869 + }, + { + "epoch": 2.9491284240483813, + "grad_norm": 0.6206184171640068, + "learning_rate": 8.516956193291565e-06, + "loss": 0.0591, + "step": 24870 + }, + { + "epoch": 2.9492470058105065, + "grad_norm": 0.5875098371817383, + "learning_rate": 8.515151531565316e-06, + "loss": 0.0862, + "step": 24871 + }, + { + "epoch": 2.9493655875726312, + "grad_norm": 0.48817298508389634, + "learning_rate": 8.51334702180915e-06, + "loss": 0.0792, + "step": 24872 + }, + { + "epoch": 2.9494841693347564, + "grad_norm": 0.5163145161420353, + "learning_rate": 8.511542664039715e-06, + "loss": 0.0753, + "step": 24873 + }, + { + "epoch": 2.949602751096881, + "grad_norm": 0.43760987512852734, + "learning_rate": 8.50973845827362e-06, + "loss": 0.0609, + "step": 24874 + }, + { + "epoch": 2.9497213328590064, + "grad_norm": 0.6036459884231009, + "learning_rate": 8.507934404527531e-06, + "loss": 0.0858, + "step": 24875 + }, + { + "epoch": 2.949839914621131, + "grad_norm": 0.9735970517292327, + "learning_rate": 8.506130502818055e-06, + "loss": 0.1042, + "step": 24876 + }, + { + "epoch": 2.9499584963832564, + "grad_norm": 0.7573242904364428, + "learning_rate": 8.50432675316183e-06, + "loss": 0.1069, + "step": 24877 + }, + { + "epoch": 2.950077078145381, + "grad_norm": 0.614379295989985, + "learning_rate": 8.502523155575485e-06, + "loss": 0.0901, + "step": 24878 + }, + { + "epoch": 2.9501956599075063, + "grad_norm": 0.6069737465775981, + "learning_rate": 8.500719710075645e-06, + "loss": 0.0791, + "step": 24879 + }, + { + "epoch": 2.950314241669631, + "grad_norm": 0.7546342542907178, + "learning_rate": 8.498916416678942e-06, + "loss": 0.1143, + "step": 24880 + }, + { + "epoch": 2.9504328234317563, + "grad_norm": 0.6171893307501221, + "learning_rate": 8.497113275401983e-06, + "loss": 0.0706, + "step": 24881 + }, + { + "epoch": 2.950551405193881, + "grad_norm": 0.5549327958481122, + "learning_rate": 8.495310286261416e-06, + "loss": 0.0771, + "step": 24882 + }, + { + "epoch": 2.9506699869560062, + "grad_norm": 0.48456935229215914, + "learning_rate": 8.493507449273841e-06, + "loss": 0.0723, + "step": 24883 + }, + { + "epoch": 2.950788568718131, + "grad_norm": 0.6980285587077166, + "learning_rate": 8.491704764455893e-06, + "loss": 0.0798, + "step": 24884 + }, + { + "epoch": 2.950907150480256, + "grad_norm": 0.9105776784779269, + "learning_rate": 8.489902231824168e-06, + "loss": 0.1383, + "step": 24885 + }, + { + "epoch": 2.951025732242381, + "grad_norm": 0.7045000905132064, + "learning_rate": 8.488099851395314e-06, + "loss": 0.1152, + "step": 24886 + }, + { + "epoch": 2.951144314004506, + "grad_norm": 0.44307057672015726, + "learning_rate": 8.486297623185926e-06, + "loss": 0.063, + "step": 24887 + }, + { + "epoch": 2.9512628957666314, + "grad_norm": 0.8150608713537312, + "learning_rate": 8.48449554721262e-06, + "loss": 0.1255, + "step": 24888 + }, + { + "epoch": 2.951381477528756, + "grad_norm": 0.450778306241808, + "learning_rate": 8.482693623492013e-06, + "loss": 0.0563, + "step": 24889 + }, + { + "epoch": 2.951500059290881, + "grad_norm": 0.40262196696530844, + "learning_rate": 8.480891852040717e-06, + "loss": 0.0562, + "step": 24890 + }, + { + "epoch": 2.951618641053006, + "grad_norm": 0.5475768222472176, + "learning_rate": 8.47909023287535e-06, + "loss": 0.0685, + "step": 24891 + }, + { + "epoch": 2.9517372228151313, + "grad_norm": 0.6165750851533892, + "learning_rate": 8.4772887660125e-06, + "loss": 0.0706, + "step": 24892 + }, + { + "epoch": 2.951855804577256, + "grad_norm": 0.6231212714169482, + "learning_rate": 8.47548745146879e-06, + "loss": 0.0749, + "step": 24893 + }, + { + "epoch": 2.951974386339381, + "grad_norm": 0.40342469706944756, + "learning_rate": 8.473686289260821e-06, + "loss": 0.0424, + "step": 24894 + }, + { + "epoch": 2.952092968101506, + "grad_norm": 0.6520423772733575, + "learning_rate": 8.471885279405207e-06, + "loss": 0.0765, + "step": 24895 + }, + { + "epoch": 2.952211549863631, + "grad_norm": 0.6804896637432355, + "learning_rate": 8.470084421918522e-06, + "loss": 0.0764, + "step": 24896 + }, + { + "epoch": 2.952330131625756, + "grad_norm": 0.5279512521683731, + "learning_rate": 8.46828371681741e-06, + "loss": 0.0637, + "step": 24897 + }, + { + "epoch": 2.9524487133878807, + "grad_norm": 0.6622766670859754, + "learning_rate": 8.466483164118439e-06, + "loss": 0.0848, + "step": 24898 + }, + { + "epoch": 2.952567295150006, + "grad_norm": 0.7534715583473958, + "learning_rate": 8.46468276383822e-06, + "loss": 0.0989, + "step": 24899 + }, + { + "epoch": 2.952685876912131, + "grad_norm": 0.4014553904105658, + "learning_rate": 8.462882515993348e-06, + "loss": 0.0545, + "step": 24900 + }, + { + "epoch": 2.952804458674256, + "grad_norm": 0.41021199532896985, + "learning_rate": 8.461082420600419e-06, + "loss": 0.0556, + "step": 24901 + }, + { + "epoch": 2.9529230404363807, + "grad_norm": 0.5031025696954625, + "learning_rate": 8.459282477676036e-06, + "loss": 0.0648, + "step": 24902 + }, + { + "epoch": 2.953041622198506, + "grad_norm": 0.5224439758505441, + "learning_rate": 8.45748268723678e-06, + "loss": 0.0688, + "step": 24903 + }, + { + "epoch": 2.953160203960631, + "grad_norm": 0.6120297824600056, + "learning_rate": 8.455683049299245e-06, + "loss": 0.084, + "step": 24904 + }, + { + "epoch": 2.953278785722756, + "grad_norm": 0.7840301593991882, + "learning_rate": 8.453883563880027e-06, + "loss": 0.0927, + "step": 24905 + }, + { + "epoch": 2.9533973674848806, + "grad_norm": 0.664680639161485, + "learning_rate": 8.45208423099571e-06, + "loss": 0.0858, + "step": 24906 + }, + { + "epoch": 2.953515949247006, + "grad_norm": 0.4663220041705974, + "learning_rate": 8.450285050662896e-06, + "loss": 0.0611, + "step": 24907 + }, + { + "epoch": 2.953634531009131, + "grad_norm": 0.5758439353293299, + "learning_rate": 8.448486022898139e-06, + "loss": 0.0746, + "step": 24908 + }, + { + "epoch": 2.9537531127712557, + "grad_norm": 0.6130956577972677, + "learning_rate": 8.446687147718063e-06, + "loss": 0.0744, + "step": 24909 + }, + { + "epoch": 2.9538716945333805, + "grad_norm": 0.6976046985249674, + "learning_rate": 8.444888425139223e-06, + "loss": 0.0789, + "step": 24910 + }, + { + "epoch": 2.9539902762955057, + "grad_norm": 0.5626273116748953, + "learning_rate": 8.44308985517821e-06, + "loss": 0.0636, + "step": 24911 + }, + { + "epoch": 2.954108858057631, + "grad_norm": 0.46582125879776565, + "learning_rate": 8.441291437851605e-06, + "loss": 0.0575, + "step": 24912 + }, + { + "epoch": 2.9542274398197557, + "grad_norm": 0.974592152147461, + "learning_rate": 8.439493173175996e-06, + "loss": 0.1461, + "step": 24913 + }, + { + "epoch": 2.954346021581881, + "grad_norm": 0.6057572292138327, + "learning_rate": 8.437695061167944e-06, + "loss": 0.0574, + "step": 24914 + }, + { + "epoch": 2.9544646033440056, + "grad_norm": 0.7322178267994706, + "learning_rate": 8.435897101844035e-06, + "loss": 0.0784, + "step": 24915 + }, + { + "epoch": 2.954583185106131, + "grad_norm": 0.7639605283537837, + "learning_rate": 8.43409929522084e-06, + "loss": 0.0948, + "step": 24916 + }, + { + "epoch": 2.9547017668682556, + "grad_norm": 0.6881922942485581, + "learning_rate": 8.432301641314935e-06, + "loss": 0.0772, + "step": 24917 + }, + { + "epoch": 2.954820348630381, + "grad_norm": 0.6065526348839341, + "learning_rate": 8.430504140142903e-06, + "loss": 0.0834, + "step": 24918 + }, + { + "epoch": 2.9549389303925055, + "grad_norm": 0.5705579041980235, + "learning_rate": 8.428706791721283e-06, + "loss": 0.0535, + "step": 24919 + }, + { + "epoch": 2.9550575121546307, + "grad_norm": 0.4436107608015805, + "learning_rate": 8.426909596066687e-06, + "loss": 0.0661, + "step": 24920 + }, + { + "epoch": 2.9551760939167555, + "grad_norm": 0.5276316933132184, + "learning_rate": 8.425112553195649e-06, + "loss": 0.0656, + "step": 24921 + }, + { + "epoch": 2.9552946756788807, + "grad_norm": 0.7230092524933076, + "learning_rate": 8.42331566312476e-06, + "loss": 0.0973, + "step": 24922 + }, + { + "epoch": 2.9554132574410055, + "grad_norm": 0.658192532938016, + "learning_rate": 8.421518925870552e-06, + "loss": 0.0942, + "step": 24923 + }, + { + "epoch": 2.9555318392031307, + "grad_norm": 0.7096681203841102, + "learning_rate": 8.419722341449628e-06, + "loss": 0.0851, + "step": 24924 + }, + { + "epoch": 2.9556504209652554, + "grad_norm": 0.615011693064617, + "learning_rate": 8.417925909878524e-06, + "loss": 0.0661, + "step": 24925 + }, + { + "epoch": 2.9557690027273806, + "grad_norm": 0.9388965619283623, + "learning_rate": 8.41612963117381e-06, + "loss": 0.0986, + "step": 24926 + }, + { + "epoch": 2.9558875844895054, + "grad_norm": 0.5065306430750611, + "learning_rate": 8.414333505352042e-06, + "loss": 0.0695, + "step": 24927 + }, + { + "epoch": 2.9560061662516306, + "grad_norm": 0.45424222150541815, + "learning_rate": 8.412537532429782e-06, + "loss": 0.0556, + "step": 24928 + }, + { + "epoch": 2.9561247480137554, + "grad_norm": 1.1487314347013853, + "learning_rate": 8.410741712423596e-06, + "loss": 0.1431, + "step": 24929 + }, + { + "epoch": 2.9562433297758806, + "grad_norm": 0.8276240532007881, + "learning_rate": 8.408946045350006e-06, + "loss": 0.1235, + "step": 24930 + }, + { + "epoch": 2.9563619115380053, + "grad_norm": 0.6186766952924879, + "learning_rate": 8.407150531225608e-06, + "loss": 0.0792, + "step": 24931 + }, + { + "epoch": 2.9564804933001305, + "grad_norm": 0.6409551939919713, + "learning_rate": 8.405355170066926e-06, + "loss": 0.0874, + "step": 24932 + }, + { + "epoch": 2.9565990750622553, + "grad_norm": 0.6432513905771361, + "learning_rate": 8.403559961890528e-06, + "loss": 0.0705, + "step": 24933 + }, + { + "epoch": 2.9567176568243805, + "grad_norm": 0.8751046585981382, + "learning_rate": 8.401764906712937e-06, + "loss": 0.1084, + "step": 24934 + }, + { + "epoch": 2.9568362385865052, + "grad_norm": 0.8442325381182473, + "learning_rate": 8.399970004550736e-06, + "loss": 0.1236, + "step": 24935 + }, + { + "epoch": 2.9569548203486304, + "grad_norm": 1.0964589189566087, + "learning_rate": 8.398175255420448e-06, + "loss": 0.095, + "step": 24936 + }, + { + "epoch": 2.957073402110755, + "grad_norm": 0.587066634022615, + "learning_rate": 8.396380659338627e-06, + "loss": 0.0738, + "step": 24937 + }, + { + "epoch": 2.9571919838728804, + "grad_norm": 0.8372759296058759, + "learning_rate": 8.394586216321814e-06, + "loss": 0.0823, + "step": 24938 + }, + { + "epoch": 2.957310565635005, + "grad_norm": 0.6830286638055436, + "learning_rate": 8.392791926386554e-06, + "loss": 0.0767, + "step": 24939 + }, + { + "epoch": 2.9574291473971304, + "grad_norm": 0.6784927743323133, + "learning_rate": 8.390997789549395e-06, + "loss": 0.0678, + "step": 24940 + }, + { + "epoch": 2.9575477291592556, + "grad_norm": 0.6682048388701711, + "learning_rate": 8.389203805826853e-06, + "loss": 0.0833, + "step": 24941 + }, + { + "epoch": 2.9576663109213803, + "grad_norm": 0.9032408096846829, + "learning_rate": 8.3874099752355e-06, + "loss": 0.1319, + "step": 24942 + }, + { + "epoch": 2.957784892683505, + "grad_norm": 0.5744825164087858, + "learning_rate": 8.385616297791846e-06, + "loss": 0.0753, + "step": 24943 + }, + { + "epoch": 2.9579034744456303, + "grad_norm": 1.066853948100581, + "learning_rate": 8.383822773512442e-06, + "loss": 0.1414, + "step": 24944 + }, + { + "epoch": 2.9580220562077555, + "grad_norm": 0.7057981815665716, + "learning_rate": 8.382029402413801e-06, + "loss": 0.099, + "step": 24945 + }, + { + "epoch": 2.9581406379698802, + "grad_norm": 0.5536071748795899, + "learning_rate": 8.380236184512489e-06, + "loss": 0.0704, + "step": 24946 + }, + { + "epoch": 2.958259219732005, + "grad_norm": 0.6924956092841382, + "learning_rate": 8.378443119825011e-06, + "loss": 0.0762, + "step": 24947 + }, + { + "epoch": 2.95837780149413, + "grad_norm": 0.8730313336863538, + "learning_rate": 8.3766502083679e-06, + "loss": 0.1284, + "step": 24948 + }, + { + "epoch": 2.9584963832562554, + "grad_norm": 0.6459489174827514, + "learning_rate": 8.374857450157693e-06, + "loss": 0.0868, + "step": 24949 + }, + { + "epoch": 2.95861496501838, + "grad_norm": 0.3820607941439928, + "learning_rate": 8.373064845210913e-06, + "loss": 0.0404, + "step": 24950 + }, + { + "epoch": 2.958733546780505, + "grad_norm": 0.4814439350680984, + "learning_rate": 8.371272393544093e-06, + "loss": 0.0518, + "step": 24951 + }, + { + "epoch": 2.95885212854263, + "grad_norm": 0.6389993808306158, + "learning_rate": 8.369480095173735e-06, + "loss": 0.0835, + "step": 24952 + }, + { + "epoch": 2.9589707103047553, + "grad_norm": 1.1415060595763102, + "learning_rate": 8.36768795011639e-06, + "loss": 0.1438, + "step": 24953 + }, + { + "epoch": 2.95908929206688, + "grad_norm": 0.7000315933670301, + "learning_rate": 8.365895958388561e-06, + "loss": 0.0913, + "step": 24954 + }, + { + "epoch": 2.959207873829005, + "grad_norm": 0.566761377535573, + "learning_rate": 8.36410412000678e-06, + "loss": 0.0638, + "step": 24955 + }, + { + "epoch": 2.95932645559113, + "grad_norm": 0.5980539438693783, + "learning_rate": 8.362312434987543e-06, + "loss": 0.0839, + "step": 24956 + }, + { + "epoch": 2.9594450373532553, + "grad_norm": 0.6694450929582195, + "learning_rate": 8.3605209033474e-06, + "loss": 0.0787, + "step": 24957 + }, + { + "epoch": 2.95956361911538, + "grad_norm": 0.523309094179477, + "learning_rate": 8.35872952510284e-06, + "loss": 0.072, + "step": 24958 + }, + { + "epoch": 2.9596822008775048, + "grad_norm": 0.6343074793551762, + "learning_rate": 8.356938300270386e-06, + "loss": 0.0862, + "step": 24959 + }, + { + "epoch": 2.95980078263963, + "grad_norm": 0.5588065474297204, + "learning_rate": 8.355147228866555e-06, + "loss": 0.0727, + "step": 24960 + }, + { + "epoch": 2.959919364401755, + "grad_norm": 0.7615503096170427, + "learning_rate": 8.353356310907853e-06, + "loss": 0.0846, + "step": 24961 + }, + { + "epoch": 2.96003794616388, + "grad_norm": 0.5055747846184072, + "learning_rate": 8.3515655464108e-06, + "loss": 0.0539, + "step": 24962 + }, + { + "epoch": 2.960156527926005, + "grad_norm": 0.6113934538800171, + "learning_rate": 8.34977493539189e-06, + "loss": 0.0708, + "step": 24963 + }, + { + "epoch": 2.96027510968813, + "grad_norm": 0.523179979814414, + "learning_rate": 8.347984477867637e-06, + "loss": 0.0601, + "step": 24964 + }, + { + "epoch": 2.960393691450255, + "grad_norm": 0.7051673993579949, + "learning_rate": 8.34619417385455e-06, + "loss": 0.0889, + "step": 24965 + }, + { + "epoch": 2.96051227321238, + "grad_norm": 0.7551440718466316, + "learning_rate": 8.344404023369126e-06, + "loss": 0.1023, + "step": 24966 + }, + { + "epoch": 2.960630854974505, + "grad_norm": 0.6617571194315479, + "learning_rate": 8.342614026427876e-06, + "loss": 0.0886, + "step": 24967 + }, + { + "epoch": 2.96074943673663, + "grad_norm": 0.8119947604468143, + "learning_rate": 8.340824183047293e-06, + "loss": 0.0934, + "step": 24968 + }, + { + "epoch": 2.960868018498755, + "grad_norm": 0.6300027997815346, + "learning_rate": 8.339034493243894e-06, + "loss": 0.0715, + "step": 24969 + }, + { + "epoch": 2.96098660026088, + "grad_norm": 0.5744852813496583, + "learning_rate": 8.337244957034157e-06, + "loss": 0.0646, + "step": 24970 + }, + { + "epoch": 2.961105182023005, + "grad_norm": 0.5950266103382224, + "learning_rate": 8.335455574434589e-06, + "loss": 0.0747, + "step": 24971 + }, + { + "epoch": 2.9612237637851297, + "grad_norm": 0.7031356589860767, + "learning_rate": 8.333666345461686e-06, + "loss": 0.0886, + "step": 24972 + }, + { + "epoch": 2.961342345547255, + "grad_norm": 0.6283254057240496, + "learning_rate": 8.331877270131949e-06, + "loss": 0.077, + "step": 24973 + }, + { + "epoch": 2.9614609273093797, + "grad_norm": 0.7839815011265179, + "learning_rate": 8.330088348461854e-06, + "loss": 0.0947, + "step": 24974 + }, + { + "epoch": 2.961579509071505, + "grad_norm": 0.556575998804133, + "learning_rate": 8.328299580467905e-06, + "loss": 0.0733, + "step": 24975 + }, + { + "epoch": 2.9616980908336297, + "grad_norm": 0.7212489717964854, + "learning_rate": 8.326510966166588e-06, + "loss": 0.09, + "step": 24976 + }, + { + "epoch": 2.961816672595755, + "grad_norm": 0.616468115921763, + "learning_rate": 8.324722505574392e-06, + "loss": 0.0781, + "step": 24977 + }, + { + "epoch": 2.9619352543578796, + "grad_norm": 0.7464546269954274, + "learning_rate": 8.322934198707818e-06, + "loss": 0.1033, + "step": 24978 + }, + { + "epoch": 2.962053836120005, + "grad_norm": 0.7326200635554021, + "learning_rate": 8.32114604558332e-06, + "loss": 0.1105, + "step": 24979 + }, + { + "epoch": 2.9621724178821296, + "grad_norm": 0.6174065793461089, + "learning_rate": 8.319358046217421e-06, + "loss": 0.092, + "step": 24980 + }, + { + "epoch": 2.962290999644255, + "grad_norm": 0.825138387094861, + "learning_rate": 8.317570200626577e-06, + "loss": 0.1058, + "step": 24981 + }, + { + "epoch": 2.9624095814063796, + "grad_norm": 0.76689809894808, + "learning_rate": 8.315782508827277e-06, + "loss": 0.09, + "step": 24982 + }, + { + "epoch": 2.9625281631685048, + "grad_norm": 0.6810528099339241, + "learning_rate": 8.313994970836007e-06, + "loss": 0.1004, + "step": 24983 + }, + { + "epoch": 2.9626467449306295, + "grad_norm": 0.595290978873251, + "learning_rate": 8.312207586669247e-06, + "loss": 0.0802, + "step": 24984 + }, + { + "epoch": 2.9627653266927547, + "grad_norm": 0.6317833699046512, + "learning_rate": 8.31042035634346e-06, + "loss": 0.0878, + "step": 24985 + }, + { + "epoch": 2.9628839084548795, + "grad_norm": 0.4705830497135391, + "learning_rate": 8.308633279875131e-06, + "loss": 0.0753, + "step": 24986 + }, + { + "epoch": 2.9630024902170047, + "grad_norm": 0.6474715211364932, + "learning_rate": 8.306846357280734e-06, + "loss": 0.0928, + "step": 24987 + }, + { + "epoch": 2.9631210719791294, + "grad_norm": 0.5600371833194989, + "learning_rate": 8.305059588576746e-06, + "loss": 0.0871, + "step": 24988 + }, + { + "epoch": 2.9632396537412546, + "grad_norm": 0.6482965053701433, + "learning_rate": 8.303272973779645e-06, + "loss": 0.098, + "step": 24989 + }, + { + "epoch": 2.96335823550338, + "grad_norm": 0.7784142344313768, + "learning_rate": 8.301486512905873e-06, + "loss": 0.1271, + "step": 24990 + }, + { + "epoch": 2.9634768172655046, + "grad_norm": 0.37296140211835693, + "learning_rate": 8.299700205971936e-06, + "loss": 0.0545, + "step": 24991 + }, + { + "epoch": 2.9635953990276294, + "grad_norm": 0.7429509722716806, + "learning_rate": 8.297914052994275e-06, + "loss": 0.0812, + "step": 24992 + }, + { + "epoch": 2.9637139807897546, + "grad_norm": 0.5310847473421103, + "learning_rate": 8.296128053989372e-06, + "loss": 0.0757, + "step": 24993 + }, + { + "epoch": 2.9638325625518798, + "grad_norm": 0.5524271113748292, + "learning_rate": 8.294342208973671e-06, + "loss": 0.0868, + "step": 24994 + }, + { + "epoch": 2.9639511443140045, + "grad_norm": 0.6420537004001788, + "learning_rate": 8.292556517963661e-06, + "loss": 0.0827, + "step": 24995 + }, + { + "epoch": 2.9640697260761293, + "grad_norm": 0.7446449060933787, + "learning_rate": 8.29077098097579e-06, + "loss": 0.09, + "step": 24996 + }, + { + "epoch": 2.9641883078382545, + "grad_norm": 0.620192476012595, + "learning_rate": 8.288985598026517e-06, + "loss": 0.0907, + "step": 24997 + }, + { + "epoch": 2.9643068896003797, + "grad_norm": 0.5121201546230418, + "learning_rate": 8.287200369132302e-06, + "loss": 0.0622, + "step": 24998 + }, + { + "epoch": 2.9644254713625044, + "grad_norm": 0.5149099330101119, + "learning_rate": 8.285415294309608e-06, + "loss": 0.0689, + "step": 24999 + }, + { + "epoch": 2.964544053124629, + "grad_norm": 0.6606328405294786, + "learning_rate": 8.283630373574896e-06, + "loss": 0.0828, + "step": 25000 + }, + { + "epoch": 2.9646626348867544, + "grad_norm": 0.7175444026779345, + "learning_rate": 8.281845606944596e-06, + "loss": 0.0812, + "step": 25001 + }, + { + "epoch": 2.9647812166488796, + "grad_norm": 0.5784303993634138, + "learning_rate": 8.280060994435196e-06, + "loss": 0.0859, + "step": 25002 + }, + { + "epoch": 2.9648997984110044, + "grad_norm": 0.5397351333180957, + "learning_rate": 8.278276536063123e-06, + "loss": 0.076, + "step": 25003 + }, + { + "epoch": 2.965018380173129, + "grad_norm": 0.5455669767193746, + "learning_rate": 8.27649223184484e-06, + "loss": 0.0583, + "step": 25004 + }, + { + "epoch": 2.9651369619352543, + "grad_norm": 0.7795770411864749, + "learning_rate": 8.274708081796775e-06, + "loss": 0.1262, + "step": 25005 + }, + { + "epoch": 2.9652555436973795, + "grad_norm": 0.5786239306539307, + "learning_rate": 8.27292408593541e-06, + "loss": 0.0755, + "step": 25006 + }, + { + "epoch": 2.9653741254595043, + "grad_norm": 0.6720398280172609, + "learning_rate": 8.271140244277164e-06, + "loss": 0.0742, + "step": 25007 + }, + { + "epoch": 2.965492707221629, + "grad_norm": 0.6616950366586247, + "learning_rate": 8.26935655683849e-06, + "loss": 0.099, + "step": 25008 + }, + { + "epoch": 2.9656112889837543, + "grad_norm": 0.5150329522093849, + "learning_rate": 8.267573023635832e-06, + "loss": 0.0566, + "step": 25009 + }, + { + "epoch": 2.9657298707458795, + "grad_norm": 0.4846534428684467, + "learning_rate": 8.26578964468563e-06, + "loss": 0.0587, + "step": 25010 + }, + { + "epoch": 2.965848452508004, + "grad_norm": 0.6917207924186327, + "learning_rate": 8.264006420004338e-06, + "loss": 0.076, + "step": 25011 + }, + { + "epoch": 2.965967034270129, + "grad_norm": 0.586488191059885, + "learning_rate": 8.262223349608366e-06, + "loss": 0.0671, + "step": 25012 + }, + { + "epoch": 2.966085616032254, + "grad_norm": 0.8621787675289279, + "learning_rate": 8.260440433514189e-06, + "loss": 0.1044, + "step": 25013 + }, + { + "epoch": 2.9662041977943794, + "grad_norm": 1.151435090955448, + "learning_rate": 8.258657671738212e-06, + "loss": 0.1375, + "step": 25014 + }, + { + "epoch": 2.966322779556504, + "grad_norm": 0.8155259989003408, + "learning_rate": 8.256875064296882e-06, + "loss": 0.1142, + "step": 25015 + }, + { + "epoch": 2.9664413613186293, + "grad_norm": 0.4618325501879452, + "learning_rate": 8.255092611206633e-06, + "loss": 0.0503, + "step": 25016 + }, + { + "epoch": 2.966559943080754, + "grad_norm": 0.5129818091889444, + "learning_rate": 8.253310312483897e-06, + "loss": 0.0682, + "step": 25017 + }, + { + "epoch": 2.9666785248428793, + "grad_norm": 0.6090363915865067, + "learning_rate": 8.251528168145109e-06, + "loss": 0.0837, + "step": 25018 + }, + { + "epoch": 2.966797106605004, + "grad_norm": 0.6858547679689723, + "learning_rate": 8.249746178206688e-06, + "loss": 0.0814, + "step": 25019 + }, + { + "epoch": 2.9669156883671293, + "grad_norm": 0.6904840052808182, + "learning_rate": 8.247964342685066e-06, + "loss": 0.0893, + "step": 25020 + }, + { + "epoch": 2.967034270129254, + "grad_norm": 0.7925494334207361, + "learning_rate": 8.246182661596669e-06, + "loss": 0.1103, + "step": 25021 + }, + { + "epoch": 2.9671528518913792, + "grad_norm": 1.0122343222670969, + "learning_rate": 8.244401134957932e-06, + "loss": 0.1413, + "step": 25022 + }, + { + "epoch": 2.967271433653504, + "grad_norm": 0.6578290043626758, + "learning_rate": 8.242619762785253e-06, + "loss": 0.0793, + "step": 25023 + }, + { + "epoch": 2.967390015415629, + "grad_norm": 0.5304652616567476, + "learning_rate": 8.240838545095087e-06, + "loss": 0.0798, + "step": 25024 + }, + { + "epoch": 2.967508597177754, + "grad_norm": 0.47449869860404453, + "learning_rate": 8.239057481903828e-06, + "loss": 0.0637, + "step": 25025 + }, + { + "epoch": 2.967627178939879, + "grad_norm": 0.6700878788742385, + "learning_rate": 8.237276573227906e-06, + "loss": 0.0843, + "step": 25026 + }, + { + "epoch": 2.967745760702004, + "grad_norm": 0.682191247952562, + "learning_rate": 8.23549581908374e-06, + "loss": 0.0883, + "step": 25027 + }, + { + "epoch": 2.967864342464129, + "grad_norm": 0.5458445562386511, + "learning_rate": 8.233715219487744e-06, + "loss": 0.0726, + "step": 25028 + }, + { + "epoch": 2.967982924226254, + "grad_norm": 0.6453179608862155, + "learning_rate": 8.23193477445634e-06, + "loss": 0.0942, + "step": 25029 + }, + { + "epoch": 2.968101505988379, + "grad_norm": 0.6551343656216135, + "learning_rate": 8.230154484005931e-06, + "loss": 0.0751, + "step": 25030 + }, + { + "epoch": 2.968220087750504, + "grad_norm": 0.6788283655715286, + "learning_rate": 8.228374348152928e-06, + "loss": 0.0892, + "step": 25031 + }, + { + "epoch": 2.968338669512629, + "grad_norm": 0.6485794714592112, + "learning_rate": 8.226594366913751e-06, + "loss": 0.0804, + "step": 25032 + }, + { + "epoch": 2.968457251274754, + "grad_norm": 0.628419345756267, + "learning_rate": 8.22481454030481e-06, + "loss": 0.0858, + "step": 25033 + }, + { + "epoch": 2.968575833036879, + "grad_norm": 0.8395659290745587, + "learning_rate": 8.223034868342503e-06, + "loss": 0.1034, + "step": 25034 + }, + { + "epoch": 2.9686944147990038, + "grad_norm": 0.4864948025161855, + "learning_rate": 8.22125535104324e-06, + "loss": 0.0577, + "step": 25035 + }, + { + "epoch": 2.968812996561129, + "grad_norm": 0.31370958234907537, + "learning_rate": 8.219475988423425e-06, + "loss": 0.044, + "step": 25036 + }, + { + "epoch": 2.9689315783232537, + "grad_norm": 0.5497591518060113, + "learning_rate": 8.217696780499465e-06, + "loss": 0.0711, + "step": 25037 + }, + { + "epoch": 2.969050160085379, + "grad_norm": 0.7402247613853963, + "learning_rate": 8.21591772728776e-06, + "loss": 0.0824, + "step": 25038 + }, + { + "epoch": 2.969168741847504, + "grad_norm": 0.8431266282939593, + "learning_rate": 8.214138828804712e-06, + "loss": 0.0946, + "step": 25039 + }, + { + "epoch": 2.969287323609629, + "grad_norm": 0.43937806776982236, + "learning_rate": 8.212360085066728e-06, + "loss": 0.0594, + "step": 25040 + }, + { + "epoch": 2.9694059053717536, + "grad_norm": 0.6895547736020192, + "learning_rate": 8.210581496090192e-06, + "loss": 0.0926, + "step": 25041 + }, + { + "epoch": 2.969524487133879, + "grad_norm": 0.8912721197965394, + "learning_rate": 8.208803061891505e-06, + "loss": 0.0992, + "step": 25042 + }, + { + "epoch": 2.969643068896004, + "grad_norm": 0.5135453498500496, + "learning_rate": 8.20702478248706e-06, + "loss": 0.057, + "step": 25043 + }, + { + "epoch": 2.969761650658129, + "grad_norm": 1.037444693549099, + "learning_rate": 8.205246657893265e-06, + "loss": 0.1179, + "step": 25044 + }, + { + "epoch": 2.9698802324202536, + "grad_norm": 0.9024078168717531, + "learning_rate": 8.203468688126493e-06, + "loss": 0.0812, + "step": 25045 + }, + { + "epoch": 2.9699988141823788, + "grad_norm": 0.5554014741807811, + "learning_rate": 8.201690873203139e-06, + "loss": 0.0688, + "step": 25046 + }, + { + "epoch": 2.970117395944504, + "grad_norm": 0.6714481478236857, + "learning_rate": 8.199913213139598e-06, + "loss": 0.0921, + "step": 25047 + }, + { + "epoch": 2.9702359777066287, + "grad_norm": 0.588650698050504, + "learning_rate": 8.198135707952256e-06, + "loss": 0.0811, + "step": 25048 + }, + { + "epoch": 2.9703545594687535, + "grad_norm": 0.6701419682103539, + "learning_rate": 8.196358357657506e-06, + "loss": 0.0934, + "step": 25049 + }, + { + "epoch": 2.9704731412308787, + "grad_norm": 0.8381190804391415, + "learning_rate": 8.194581162271708e-06, + "loss": 0.0985, + "step": 25050 + }, + { + "epoch": 2.970591722993004, + "grad_norm": 0.8113337098540379, + "learning_rate": 8.192804121811285e-06, + "loss": 0.0866, + "step": 25051 + }, + { + "epoch": 2.9707103047551287, + "grad_norm": 0.6823460880288441, + "learning_rate": 8.191027236292584e-06, + "loss": 0.0913, + "step": 25052 + }, + { + "epoch": 2.9708288865172534, + "grad_norm": 0.7223242128255606, + "learning_rate": 8.189250505732002e-06, + "loss": 0.0822, + "step": 25053 + }, + { + "epoch": 2.9709474682793786, + "grad_norm": 0.5341774446927711, + "learning_rate": 8.187473930145914e-06, + "loss": 0.0695, + "step": 25054 + }, + { + "epoch": 2.971066050041504, + "grad_norm": 0.7374500866007058, + "learning_rate": 8.18569750955071e-06, + "loss": 0.0835, + "step": 25055 + }, + { + "epoch": 2.9711846318036286, + "grad_norm": 0.5536378343154398, + "learning_rate": 8.183921243962747e-06, + "loss": 0.0516, + "step": 25056 + }, + { + "epoch": 2.9713032135657533, + "grad_norm": 0.6965738086151192, + "learning_rate": 8.182145133398408e-06, + "loss": 0.092, + "step": 25057 + }, + { + "epoch": 2.9714217953278785, + "grad_norm": 1.1716439432172778, + "learning_rate": 8.18036917787407e-06, + "loss": 0.1129, + "step": 25058 + }, + { + "epoch": 2.9715403770900037, + "grad_norm": 0.45483086163496295, + "learning_rate": 8.1785933774061e-06, + "loss": 0.0525, + "step": 25059 + }, + { + "epoch": 2.9716589588521285, + "grad_norm": 0.5092475800589117, + "learning_rate": 8.17681773201088e-06, + "loss": 0.0704, + "step": 25060 + }, + { + "epoch": 2.9717775406142533, + "grad_norm": 0.6717739268805931, + "learning_rate": 8.175042241704753e-06, + "loss": 0.0915, + "step": 25061 + }, + { + "epoch": 2.9718961223763785, + "grad_norm": 0.6178427753006543, + "learning_rate": 8.173266906504124e-06, + "loss": 0.0829, + "step": 25062 + }, + { + "epoch": 2.9720147041385037, + "grad_norm": 0.7315092287547226, + "learning_rate": 8.171491726425329e-06, + "loss": 0.0759, + "step": 25063 + }, + { + "epoch": 2.9721332859006284, + "grad_norm": 0.5244750610410481, + "learning_rate": 8.169716701484744e-06, + "loss": 0.0589, + "step": 25064 + }, + { + "epoch": 2.9722518676627536, + "grad_norm": 0.5256088894048527, + "learning_rate": 8.167941831698733e-06, + "loss": 0.073, + "step": 25065 + }, + { + "epoch": 2.9723704494248784, + "grad_norm": 0.5570578480607858, + "learning_rate": 8.166167117083656e-06, + "loss": 0.0704, + "step": 25066 + }, + { + "epoch": 2.9724890311870036, + "grad_norm": 0.7344410582613705, + "learning_rate": 8.164392557655887e-06, + "loss": 0.0919, + "step": 25067 + }, + { + "epoch": 2.9726076129491283, + "grad_norm": 0.6438672764973817, + "learning_rate": 8.162618153431753e-06, + "loss": 0.0884, + "step": 25068 + }, + { + "epoch": 2.9727261947112535, + "grad_norm": 0.6042705331948534, + "learning_rate": 8.160843904427652e-06, + "loss": 0.0724, + "step": 25069 + }, + { + "epoch": 2.9728447764733783, + "grad_norm": 0.5890480272489989, + "learning_rate": 8.159069810659909e-06, + "loss": 0.0828, + "step": 25070 + }, + { + "epoch": 2.9729633582355035, + "grad_norm": 0.4379957252162962, + "learning_rate": 8.157295872144901e-06, + "loss": 0.0599, + "step": 25071 + }, + { + "epoch": 2.9730819399976283, + "grad_norm": 0.4819715993467981, + "learning_rate": 8.155522088898954e-06, + "loss": 0.0627, + "step": 25072 + }, + { + "epoch": 2.9732005217597535, + "grad_norm": 0.537008111018313, + "learning_rate": 8.153748460938454e-06, + "loss": 0.0798, + "step": 25073 + }, + { + "epoch": 2.9733191035218782, + "grad_norm": 1.0088238905509268, + "learning_rate": 8.151974988279728e-06, + "loss": 0.1404, + "step": 25074 + }, + { + "epoch": 2.9734376852840034, + "grad_norm": 1.0400447928137853, + "learning_rate": 8.15020167093913e-06, + "loss": 0.0955, + "step": 25075 + }, + { + "epoch": 2.973556267046128, + "grad_norm": 0.9964000230263935, + "learning_rate": 8.148428508933012e-06, + "loss": 0.1431, + "step": 25076 + }, + { + "epoch": 2.9736748488082534, + "grad_norm": 0.7717813645370776, + "learning_rate": 8.146655502277717e-06, + "loss": 0.1109, + "step": 25077 + }, + { + "epoch": 2.973793430570378, + "grad_norm": 0.9096200466330429, + "learning_rate": 8.144882650989599e-06, + "loss": 0.0935, + "step": 25078 + }, + { + "epoch": 2.9739120123325034, + "grad_norm": 0.6179439713576736, + "learning_rate": 8.143109955084988e-06, + "loss": 0.0716, + "step": 25079 + }, + { + "epoch": 2.974030594094628, + "grad_norm": 0.6080656266707402, + "learning_rate": 8.141337414580231e-06, + "loss": 0.0588, + "step": 25080 + }, + { + "epoch": 2.9741491758567533, + "grad_norm": 0.8512911391406075, + "learning_rate": 8.13956502949167e-06, + "loss": 0.1092, + "step": 25081 + }, + { + "epoch": 2.974267757618878, + "grad_norm": 0.7962475539373741, + "learning_rate": 8.13779279983565e-06, + "loss": 0.1053, + "step": 25082 + }, + { + "epoch": 2.9743863393810033, + "grad_norm": 0.8183231892099131, + "learning_rate": 8.136020725628487e-06, + "loss": 0.0845, + "step": 25083 + }, + { + "epoch": 2.974504921143128, + "grad_norm": 0.626896227545927, + "learning_rate": 8.134248806886549e-06, + "loss": 0.0738, + "step": 25084 + }, + { + "epoch": 2.9746235029052532, + "grad_norm": 0.7393767906963233, + "learning_rate": 8.132477043626147e-06, + "loss": 0.1124, + "step": 25085 + }, + { + "epoch": 2.974742084667378, + "grad_norm": 0.48056985660529977, + "learning_rate": 8.130705435863622e-06, + "loss": 0.0503, + "step": 25086 + }, + { + "epoch": 2.974860666429503, + "grad_norm": 0.5775181147132066, + "learning_rate": 8.128933983615309e-06, + "loss": 0.0785, + "step": 25087 + }, + { + "epoch": 2.9749792481916284, + "grad_norm": 0.7809476031598717, + "learning_rate": 8.127162686897533e-06, + "loss": 0.0883, + "step": 25088 + }, + { + "epoch": 2.975097829953753, + "grad_norm": 0.8324888258418185, + "learning_rate": 8.125391545726635e-06, + "loss": 0.1128, + "step": 25089 + }, + { + "epoch": 2.975216411715878, + "grad_norm": 0.487183895950958, + "learning_rate": 8.123620560118928e-06, + "loss": 0.0745, + "step": 25090 + }, + { + "epoch": 2.975334993478003, + "grad_norm": 0.5206158406887362, + "learning_rate": 8.121849730090741e-06, + "loss": 0.0805, + "step": 25091 + }, + { + "epoch": 2.9754535752401283, + "grad_norm": 0.6666742379371215, + "learning_rate": 8.120079055658402e-06, + "loss": 0.0664, + "step": 25092 + }, + { + "epoch": 2.975572157002253, + "grad_norm": 0.8207468404070949, + "learning_rate": 8.118308536838245e-06, + "loss": 0.0803, + "step": 25093 + }, + { + "epoch": 2.975690738764378, + "grad_norm": 0.7380469103614871, + "learning_rate": 8.116538173646574e-06, + "loss": 0.0754, + "step": 25094 + }, + { + "epoch": 2.975809320526503, + "grad_norm": 0.7258183859010993, + "learning_rate": 8.114767966099715e-06, + "loss": 0.0897, + "step": 25095 + }, + { + "epoch": 2.9759279022886282, + "grad_norm": 0.8576498891058388, + "learning_rate": 8.11299791421399e-06, + "loss": 0.1104, + "step": 25096 + }, + { + "epoch": 2.976046484050753, + "grad_norm": 0.5385527161379939, + "learning_rate": 8.111228018005718e-06, + "loss": 0.0779, + "step": 25097 + }, + { + "epoch": 2.9761650658128778, + "grad_norm": 0.6299972618197482, + "learning_rate": 8.109458277491212e-06, + "loss": 0.0679, + "step": 25098 + }, + { + "epoch": 2.976283647575003, + "grad_norm": 0.6564966006296298, + "learning_rate": 8.107688692686786e-06, + "loss": 0.0991, + "step": 25099 + }, + { + "epoch": 2.976402229337128, + "grad_norm": 1.033689108991281, + "learning_rate": 8.105919263608766e-06, + "loss": 0.1136, + "step": 25100 + }, + { + "epoch": 2.976520811099253, + "grad_norm": 1.2916528159799927, + "learning_rate": 8.104149990273444e-06, + "loss": 0.1072, + "step": 25101 + }, + { + "epoch": 2.9766393928613777, + "grad_norm": 0.49294503347397245, + "learning_rate": 8.10238087269714e-06, + "loss": 0.0548, + "step": 25102 + }, + { + "epoch": 2.976757974623503, + "grad_norm": 0.6955175169004751, + "learning_rate": 8.100611910896164e-06, + "loss": 0.0983, + "step": 25103 + }, + { + "epoch": 2.976876556385628, + "grad_norm": 0.7190641452825438, + "learning_rate": 8.098843104886833e-06, + "loss": 0.0832, + "step": 25104 + }, + { + "epoch": 2.976995138147753, + "grad_norm": 0.5776247333443606, + "learning_rate": 8.09707445468543e-06, + "loss": 0.0726, + "step": 25105 + }, + { + "epoch": 2.9771137199098776, + "grad_norm": 0.6973471171558073, + "learning_rate": 8.095305960308278e-06, + "loss": 0.1049, + "step": 25106 + }, + { + "epoch": 2.977232301672003, + "grad_norm": 0.5798014429969458, + "learning_rate": 8.093537621771671e-06, + "loss": 0.0644, + "step": 25107 + }, + { + "epoch": 2.977350883434128, + "grad_norm": 0.5008700981837714, + "learning_rate": 8.091769439091917e-06, + "loss": 0.0597, + "step": 25108 + }, + { + "epoch": 2.9774694651962528, + "grad_norm": 0.6633031513686832, + "learning_rate": 8.090001412285315e-06, + "loss": 0.087, + "step": 25109 + }, + { + "epoch": 2.9775880469583775, + "grad_norm": 0.5376947845189397, + "learning_rate": 8.088233541368162e-06, + "loss": 0.0636, + "step": 25110 + }, + { + "epoch": 2.9777066287205027, + "grad_norm": 1.2052986175006224, + "learning_rate": 8.086465826356765e-06, + "loss": 0.1145, + "step": 25111 + }, + { + "epoch": 2.977825210482628, + "grad_norm": 0.7310029290668759, + "learning_rate": 8.084698267267408e-06, + "loss": 0.1103, + "step": 25112 + }, + { + "epoch": 2.9779437922447527, + "grad_norm": 0.538592009460005, + "learning_rate": 8.082930864116384e-06, + "loss": 0.0693, + "step": 25113 + }, + { + "epoch": 2.978062374006878, + "grad_norm": 0.536763732153074, + "learning_rate": 8.081163616919996e-06, + "loss": 0.074, + "step": 25114 + }, + { + "epoch": 2.9781809557690027, + "grad_norm": 0.482183290540528, + "learning_rate": 8.079396525694532e-06, + "loss": 0.0472, + "step": 25115 + }, + { + "epoch": 2.978299537531128, + "grad_norm": 0.4944550722359037, + "learning_rate": 8.07762959045629e-06, + "loss": 0.0732, + "step": 25116 + }, + { + "epoch": 2.9784181192932526, + "grad_norm": 0.9080878338910515, + "learning_rate": 8.075862811221537e-06, + "loss": 0.0944, + "step": 25117 + }, + { + "epoch": 2.978536701055378, + "grad_norm": 0.893106016034601, + "learning_rate": 8.074096188006591e-06, + "loss": 0.124, + "step": 25118 + }, + { + "epoch": 2.9786552828175026, + "grad_norm": 0.7048207009191976, + "learning_rate": 8.072329720827711e-06, + "loss": 0.0844, + "step": 25119 + }, + { + "epoch": 2.978773864579628, + "grad_norm": 0.5159845284960578, + "learning_rate": 8.070563409701204e-06, + "loss": 0.0709, + "step": 25120 + }, + { + "epoch": 2.9788924463417525, + "grad_norm": 0.9420771758714879, + "learning_rate": 8.068797254643327e-06, + "loss": 0.1432, + "step": 25121 + }, + { + "epoch": 2.9790110281038777, + "grad_norm": 0.9578213414064427, + "learning_rate": 8.06703125567039e-06, + "loss": 0.116, + "step": 25122 + }, + { + "epoch": 2.9791296098660025, + "grad_norm": 0.6979383618934383, + "learning_rate": 8.065265412798654e-06, + "loss": 0.0977, + "step": 25123 + }, + { + "epoch": 2.9792481916281277, + "grad_norm": 0.4313835608468267, + "learning_rate": 8.063499726044405e-06, + "loss": 0.0673, + "step": 25124 + }, + { + "epoch": 2.9793667733902525, + "grad_norm": 0.6086859259313473, + "learning_rate": 8.061734195423917e-06, + "loss": 0.0719, + "step": 25125 + }, + { + "epoch": 2.9794853551523777, + "grad_norm": 0.5549573498375592, + "learning_rate": 8.05996882095347e-06, + "loss": 0.0604, + "step": 25126 + }, + { + "epoch": 2.9796039369145024, + "grad_norm": 0.5883752638881121, + "learning_rate": 8.058203602649344e-06, + "loss": 0.0802, + "step": 25127 + }, + { + "epoch": 2.9797225186766276, + "grad_norm": 0.6569459361251455, + "learning_rate": 8.056438540527792e-06, + "loss": 0.0724, + "step": 25128 + }, + { + "epoch": 2.9798411004387524, + "grad_norm": 0.49845489120230535, + "learning_rate": 8.054673634605114e-06, + "loss": 0.0793, + "step": 25129 + }, + { + "epoch": 2.9799596822008776, + "grad_norm": 0.621818348069744, + "learning_rate": 8.052908884897555e-06, + "loss": 0.0792, + "step": 25130 + }, + { + "epoch": 2.9800782639630023, + "grad_norm": 0.6550864695381045, + "learning_rate": 8.051144291421406e-06, + "loss": 0.0865, + "step": 25131 + }, + { + "epoch": 2.9801968457251276, + "grad_norm": 0.5804156557730891, + "learning_rate": 8.049379854192904e-06, + "loss": 0.0634, + "step": 25132 + }, + { + "epoch": 2.9803154274872523, + "grad_norm": 0.6735563436682396, + "learning_rate": 8.047615573228351e-06, + "loss": 0.0664, + "step": 25133 + }, + { + "epoch": 2.9804340092493775, + "grad_norm": 0.6203687160229402, + "learning_rate": 8.045851448543986e-06, + "loss": 0.0639, + "step": 25134 + }, + { + "epoch": 2.9805525910115023, + "grad_norm": 0.8118766900975812, + "learning_rate": 8.044087480156079e-06, + "loss": 0.0817, + "step": 25135 + }, + { + "epoch": 2.9806711727736275, + "grad_norm": 0.8372651659800823, + "learning_rate": 8.042323668080892e-06, + "loss": 0.1058, + "step": 25136 + }, + { + "epoch": 2.9807897545357527, + "grad_norm": 0.8849844320122865, + "learning_rate": 8.040560012334688e-06, + "loss": 0.1027, + "step": 25137 + }, + { + "epoch": 2.9809083362978774, + "grad_norm": 0.4896335176102449, + "learning_rate": 8.03879651293373e-06, + "loss": 0.0555, + "step": 25138 + }, + { + "epoch": 2.981026918060002, + "grad_norm": 0.6437804681545011, + "learning_rate": 8.037033169894253e-06, + "loss": 0.0741, + "step": 25139 + }, + { + "epoch": 2.9811454998221274, + "grad_norm": 0.3382461594304886, + "learning_rate": 8.035269983232546e-06, + "loss": 0.0481, + "step": 25140 + }, + { + "epoch": 2.9812640815842526, + "grad_norm": 1.1197995087708081, + "learning_rate": 8.03350695296484e-06, + "loss": 0.1401, + "step": 25141 + }, + { + "epoch": 2.9813826633463774, + "grad_norm": 0.6532482121885097, + "learning_rate": 8.031744079107397e-06, + "loss": 0.08, + "step": 25142 + }, + { + "epoch": 2.981501245108502, + "grad_norm": 0.9340862915422692, + "learning_rate": 8.029981361676456e-06, + "loss": 0.1304, + "step": 25143 + }, + { + "epoch": 2.9816198268706273, + "grad_norm": 0.5424286419443398, + "learning_rate": 8.02821880068829e-06, + "loss": 0.0758, + "step": 25144 + }, + { + "epoch": 2.9817384086327525, + "grad_norm": 0.5204117364670016, + "learning_rate": 8.026456396159124e-06, + "loss": 0.0607, + "step": 25145 + }, + { + "epoch": 2.9818569903948773, + "grad_norm": 0.8658795909123477, + "learning_rate": 8.024694148105217e-06, + "loss": 0.101, + "step": 25146 + }, + { + "epoch": 2.981975572157002, + "grad_norm": 0.8915524426930114, + "learning_rate": 8.022932056542815e-06, + "loss": 0.1191, + "step": 25147 + }, + { + "epoch": 2.9820941539191272, + "grad_norm": 0.6900858814950592, + "learning_rate": 8.021170121488159e-06, + "loss": 0.0851, + "step": 25148 + }, + { + "epoch": 2.9822127356812524, + "grad_norm": 0.5877577741465561, + "learning_rate": 8.019408342957504e-06, + "loss": 0.074, + "step": 25149 + }, + { + "epoch": 2.982331317443377, + "grad_norm": 0.8464906746191311, + "learning_rate": 8.01764672096707e-06, + "loss": 0.0911, + "step": 25150 + }, + { + "epoch": 2.982449899205502, + "grad_norm": 0.8797362246964182, + "learning_rate": 8.01588525553311e-06, + "loss": 0.1213, + "step": 25151 + }, + { + "epoch": 2.982568480967627, + "grad_norm": 0.7726497814496257, + "learning_rate": 8.014123946671862e-06, + "loss": 0.1316, + "step": 25152 + }, + { + "epoch": 2.9826870627297524, + "grad_norm": 0.6008900014386737, + "learning_rate": 8.012362794399566e-06, + "loss": 0.0838, + "step": 25153 + }, + { + "epoch": 2.982805644491877, + "grad_norm": 0.5225933285169287, + "learning_rate": 8.010601798732439e-06, + "loss": 0.0704, + "step": 25154 + }, + { + "epoch": 2.982924226254002, + "grad_norm": 0.6901202357523094, + "learning_rate": 8.008840959686747e-06, + "loss": 0.0904, + "step": 25155 + }, + { + "epoch": 2.983042808016127, + "grad_norm": 0.5416255035636441, + "learning_rate": 8.007080277278697e-06, + "loss": 0.0845, + "step": 25156 + }, + { + "epoch": 2.9831613897782523, + "grad_norm": 0.6390979658380151, + "learning_rate": 8.005319751524529e-06, + "loss": 0.0659, + "step": 25157 + }, + { + "epoch": 2.983279971540377, + "grad_norm": 0.6161378148701206, + "learning_rate": 8.003559382440473e-06, + "loss": 0.0776, + "step": 25158 + }, + { + "epoch": 2.983398553302502, + "grad_norm": 0.6733098218050515, + "learning_rate": 8.001799170042756e-06, + "loss": 0.0815, + "step": 25159 + }, + { + "epoch": 2.983517135064627, + "grad_norm": 0.5885467724007992, + "learning_rate": 8.000039114347613e-06, + "loss": 0.0722, + "step": 25160 + }, + { + "epoch": 2.983635716826752, + "grad_norm": 0.9101713520277888, + "learning_rate": 7.998279215371258e-06, + "loss": 0.119, + "step": 25161 + }, + { + "epoch": 2.983754298588877, + "grad_norm": 0.8201354298683057, + "learning_rate": 7.996519473129915e-06, + "loss": 0.1159, + "step": 25162 + }, + { + "epoch": 2.983872880351002, + "grad_norm": 0.9108977375950748, + "learning_rate": 7.994759887639816e-06, + "loss": 0.111, + "step": 25163 + }, + { + "epoch": 2.983991462113127, + "grad_norm": 0.6955988415098615, + "learning_rate": 7.993000458917175e-06, + "loss": 0.1001, + "step": 25164 + }, + { + "epoch": 2.984110043875252, + "grad_norm": 0.773114947746799, + "learning_rate": 7.991241186978221e-06, + "loss": 0.0986, + "step": 25165 + }, + { + "epoch": 2.984228625637377, + "grad_norm": 0.6509424296130524, + "learning_rate": 7.989482071839152e-06, + "loss": 0.085, + "step": 25166 + }, + { + "epoch": 2.984347207399502, + "grad_norm": 0.729516847836406, + "learning_rate": 7.987723113516216e-06, + "loss": 0.074, + "step": 25167 + }, + { + "epoch": 2.984465789161627, + "grad_norm": 0.6435366613240379, + "learning_rate": 7.9859643120256e-06, + "loss": 0.0836, + "step": 25168 + }, + { + "epoch": 2.984584370923752, + "grad_norm": 0.5272618926144214, + "learning_rate": 7.984205667383531e-06, + "loss": 0.0788, + "step": 25169 + }, + { + "epoch": 2.984702952685877, + "grad_norm": 0.7230919285159274, + "learning_rate": 7.98244717960622e-06, + "loss": 0.0786, + "step": 25170 + }, + { + "epoch": 2.984821534448002, + "grad_norm": 0.6038744499148276, + "learning_rate": 7.980688848709886e-06, + "loss": 0.0779, + "step": 25171 + }, + { + "epoch": 2.984940116210127, + "grad_norm": 0.5505003389398774, + "learning_rate": 7.978930674710719e-06, + "loss": 0.0645, + "step": 25172 + }, + { + "epoch": 2.985058697972252, + "grad_norm": 0.7174649530263422, + "learning_rate": 7.97717265762494e-06, + "loss": 0.0978, + "step": 25173 + }, + { + "epoch": 2.9851772797343767, + "grad_norm": 0.5693701235233923, + "learning_rate": 7.975414797468755e-06, + "loss": 0.0655, + "step": 25174 + }, + { + "epoch": 2.985295861496502, + "grad_norm": 0.7920706566572723, + "learning_rate": 7.973657094258369e-06, + "loss": 0.0809, + "step": 25175 + }, + { + "epoch": 2.9854144432586267, + "grad_norm": 0.7223894520757056, + "learning_rate": 7.971899548009994e-06, + "loss": 0.0866, + "step": 25176 + }, + { + "epoch": 2.985533025020752, + "grad_norm": 0.7473784714485121, + "learning_rate": 7.970142158739807e-06, + "loss": 0.0977, + "step": 25177 + }, + { + "epoch": 2.9856516067828767, + "grad_norm": 0.5033799143451254, + "learning_rate": 7.968384926464042e-06, + "loss": 0.0651, + "step": 25178 + }, + { + "epoch": 2.985770188545002, + "grad_norm": 0.48212426373074635, + "learning_rate": 7.966627851198874e-06, + "loss": 0.052, + "step": 25179 + }, + { + "epoch": 2.9858887703071266, + "grad_norm": 0.5246198623595583, + "learning_rate": 7.96487093296052e-06, + "loss": 0.0799, + "step": 25180 + }, + { + "epoch": 2.986007352069252, + "grad_norm": 0.46927330092265485, + "learning_rate": 7.963114171765146e-06, + "loss": 0.0484, + "step": 25181 + }, + { + "epoch": 2.9861259338313766, + "grad_norm": 0.7116452570151323, + "learning_rate": 7.961357567628986e-06, + "loss": 0.0725, + "step": 25182 + }, + { + "epoch": 2.986244515593502, + "grad_norm": 0.6278910140924329, + "learning_rate": 7.959601120568208e-06, + "loss": 0.0677, + "step": 25183 + }, + { + "epoch": 2.9863630973556266, + "grad_norm": 0.5622990782732475, + "learning_rate": 7.95784483059901e-06, + "loss": 0.0657, + "step": 25184 + }, + { + "epoch": 2.9864816791177518, + "grad_norm": 0.9172211387297396, + "learning_rate": 7.956088697737582e-06, + "loss": 0.112, + "step": 25185 + }, + { + "epoch": 2.9866002608798765, + "grad_norm": 0.5107751513274198, + "learning_rate": 7.954332722000119e-06, + "loss": 0.0708, + "step": 25186 + }, + { + "epoch": 2.9867188426420017, + "grad_norm": 1.0070136947658836, + "learning_rate": 7.952576903402812e-06, + "loss": 0.1242, + "step": 25187 + }, + { + "epoch": 2.9868374244041265, + "grad_norm": 0.5611844529217258, + "learning_rate": 7.950821241961825e-06, + "loss": 0.0576, + "step": 25188 + }, + { + "epoch": 2.9869560061662517, + "grad_norm": 0.7137708571220374, + "learning_rate": 7.949065737693376e-06, + "loss": 0.1051, + "step": 25189 + }, + { + "epoch": 2.987074587928377, + "grad_norm": 0.7926122991065726, + "learning_rate": 7.947310390613621e-06, + "loss": 0.0964, + "step": 25190 + }, + { + "epoch": 2.9871931696905016, + "grad_norm": 0.5689281747864622, + "learning_rate": 7.945555200738764e-06, + "loss": 0.0581, + "step": 25191 + }, + { + "epoch": 2.9873117514526264, + "grad_norm": 0.5423210326666267, + "learning_rate": 7.943800168084956e-06, + "loss": 0.0672, + "step": 25192 + }, + { + "epoch": 2.9874303332147516, + "grad_norm": 0.6022002742544891, + "learning_rate": 7.942045292668412e-06, + "loss": 0.0795, + "step": 25193 + }, + { + "epoch": 2.987548914976877, + "grad_norm": 0.7056863704625274, + "learning_rate": 7.940290574505286e-06, + "loss": 0.0796, + "step": 25194 + }, + { + "epoch": 2.9876674967390016, + "grad_norm": 0.620413613110901, + "learning_rate": 7.938536013611759e-06, + "loss": 0.0829, + "step": 25195 + }, + { + "epoch": 2.9877860785011263, + "grad_norm": 0.4261206926600032, + "learning_rate": 7.936781610004007e-06, + "loss": 0.0451, + "step": 25196 + }, + { + "epoch": 2.9879046602632515, + "grad_norm": 0.6858231657706492, + "learning_rate": 7.935027363698206e-06, + "loss": 0.0893, + "step": 25197 + }, + { + "epoch": 2.9880232420253767, + "grad_norm": 0.4686473568475326, + "learning_rate": 7.933273274710534e-06, + "loss": 0.0635, + "step": 25198 + }, + { + "epoch": 2.9881418237875015, + "grad_norm": 0.4551520150346174, + "learning_rate": 7.931519343057136e-06, + "loss": 0.0636, + "step": 25199 + }, + { + "epoch": 2.9882604055496262, + "grad_norm": 0.8589423160306479, + "learning_rate": 7.929765568754219e-06, + "loss": 0.0943, + "step": 25200 + }, + { + "epoch": 2.9883789873117514, + "grad_norm": 0.9696076472123668, + "learning_rate": 7.92801195181792e-06, + "loss": 0.1374, + "step": 25201 + }, + { + "epoch": 2.9884975690738766, + "grad_norm": 0.4102760201324566, + "learning_rate": 7.926258492264425e-06, + "loss": 0.0454, + "step": 25202 + }, + { + "epoch": 2.9886161508360014, + "grad_norm": 0.6632497242275227, + "learning_rate": 7.924505190109871e-06, + "loss": 0.0819, + "step": 25203 + }, + { + "epoch": 2.988734732598126, + "grad_norm": 0.8474446853171365, + "learning_rate": 7.922752045370458e-06, + "loss": 0.1036, + "step": 25204 + }, + { + "epoch": 2.9888533143602514, + "grad_norm": 0.6992645165523089, + "learning_rate": 7.92099905806232e-06, + "loss": 0.0861, + "step": 25205 + }, + { + "epoch": 2.9889718961223766, + "grad_norm": 0.7600498511882009, + "learning_rate": 7.919246228201626e-06, + "loss": 0.1059, + "step": 25206 + }, + { + "epoch": 2.9890904778845013, + "grad_norm": 0.7897153132120887, + "learning_rate": 7.917493555804539e-06, + "loss": 0.0901, + "step": 25207 + }, + { + "epoch": 2.989209059646626, + "grad_norm": 0.9524295993778669, + "learning_rate": 7.915741040887212e-06, + "loss": 0.1182, + "step": 25208 + }, + { + "epoch": 2.9893276414087513, + "grad_norm": 1.1557778904548075, + "learning_rate": 7.913988683465811e-06, + "loss": 0.1113, + "step": 25209 + }, + { + "epoch": 2.9894462231708765, + "grad_norm": 0.7065524279376227, + "learning_rate": 7.912236483556465e-06, + "loss": 0.084, + "step": 25210 + }, + { + "epoch": 2.9895648049330013, + "grad_norm": 0.5736492023046409, + "learning_rate": 7.910484441175362e-06, + "loss": 0.0585, + "step": 25211 + }, + { + "epoch": 2.9896833866951265, + "grad_norm": 0.8911983281743301, + "learning_rate": 7.908732556338628e-06, + "loss": 0.1275, + "step": 25212 + }, + { + "epoch": 2.989801968457251, + "grad_norm": 0.831958483125377, + "learning_rate": 7.906980829062428e-06, + "loss": 0.1184, + "step": 25213 + }, + { + "epoch": 2.9899205502193764, + "grad_norm": 0.819593132040254, + "learning_rate": 7.905229259362887e-06, + "loss": 0.0827, + "step": 25214 + }, + { + "epoch": 2.990039131981501, + "grad_norm": 0.6787546838148077, + "learning_rate": 7.903477847256185e-06, + "loss": 0.0934, + "step": 25215 + }, + { + "epoch": 2.9901577137436264, + "grad_norm": 0.5760385754448866, + "learning_rate": 7.901726592758446e-06, + "loss": 0.0837, + "step": 25216 + }, + { + "epoch": 2.990276295505751, + "grad_norm": 0.5717104254658266, + "learning_rate": 7.899975495885819e-06, + "loss": 0.0576, + "step": 25217 + }, + { + "epoch": 2.9903948772678763, + "grad_norm": 0.6747365930417875, + "learning_rate": 7.89822455665445e-06, + "loss": 0.0681, + "step": 25218 + }, + { + "epoch": 2.990513459030001, + "grad_norm": 0.8124179686344388, + "learning_rate": 7.896473775080476e-06, + "loss": 0.1168, + "step": 25219 + }, + { + "epoch": 2.9906320407921263, + "grad_norm": 0.6586963205968985, + "learning_rate": 7.894723151180054e-06, + "loss": 0.0659, + "step": 25220 + }, + { + "epoch": 2.990750622554251, + "grad_norm": 0.598642011208212, + "learning_rate": 7.892972684969294e-06, + "loss": 0.0749, + "step": 25221 + }, + { + "epoch": 2.9908692043163763, + "grad_norm": 0.6574014956272434, + "learning_rate": 7.891222376464353e-06, + "loss": 0.0767, + "step": 25222 + }, + { + "epoch": 2.990987786078501, + "grad_norm": 0.6631876099562936, + "learning_rate": 7.889472225681357e-06, + "loss": 0.0822, + "step": 25223 + }, + { + "epoch": 2.9911063678406262, + "grad_norm": 0.656952448763852, + "learning_rate": 7.88772223263645e-06, + "loss": 0.0858, + "step": 25224 + }, + { + "epoch": 2.991224949602751, + "grad_norm": 0.8077323341683675, + "learning_rate": 7.885972397345756e-06, + "loss": 0.0961, + "step": 25225 + }, + { + "epoch": 2.991343531364876, + "grad_norm": 0.5285834188856238, + "learning_rate": 7.88422271982541e-06, + "loss": 0.0707, + "step": 25226 + }, + { + "epoch": 2.991462113127001, + "grad_norm": 0.7982717677192989, + "learning_rate": 7.88247320009155e-06, + "loss": 0.1157, + "step": 25227 + }, + { + "epoch": 2.991580694889126, + "grad_norm": 0.6520966914470229, + "learning_rate": 7.88072383816029e-06, + "loss": 0.0818, + "step": 25228 + }, + { + "epoch": 2.991699276651251, + "grad_norm": 0.6322162696009809, + "learning_rate": 7.878974634047759e-06, + "loss": 0.0757, + "step": 25229 + }, + { + "epoch": 2.991817858413376, + "grad_norm": 0.4765633115323194, + "learning_rate": 7.87722558777009e-06, + "loss": 0.0726, + "step": 25230 + }, + { + "epoch": 2.991936440175501, + "grad_norm": 0.6050242119454192, + "learning_rate": 7.875476699343412e-06, + "loss": 0.0883, + "step": 25231 + }, + { + "epoch": 2.992055021937626, + "grad_norm": 0.8694844534459029, + "learning_rate": 7.873727968783831e-06, + "loss": 0.0859, + "step": 25232 + }, + { + "epoch": 2.992173603699751, + "grad_norm": 0.521512058807829, + "learning_rate": 7.871979396107476e-06, + "loss": 0.061, + "step": 25233 + }, + { + "epoch": 2.992292185461876, + "grad_norm": 0.7138826160336067, + "learning_rate": 7.870230981330468e-06, + "loss": 0.088, + "step": 25234 + }, + { + "epoch": 2.992410767224001, + "grad_norm": 0.9295340591340745, + "learning_rate": 7.868482724468923e-06, + "loss": 0.1061, + "step": 25235 + }, + { + "epoch": 2.992529348986126, + "grad_norm": 0.5549676758937078, + "learning_rate": 7.866734625538966e-06, + "loss": 0.0622, + "step": 25236 + }, + { + "epoch": 2.9926479307482508, + "grad_norm": 0.4951886607993743, + "learning_rate": 7.864986684556692e-06, + "loss": 0.071, + "step": 25237 + }, + { + "epoch": 2.992766512510376, + "grad_norm": 0.5509325861015106, + "learning_rate": 7.863238901538245e-06, + "loss": 0.0777, + "step": 25238 + }, + { + "epoch": 2.992885094272501, + "grad_norm": 0.6070510956704995, + "learning_rate": 7.86149127649971e-06, + "loss": 0.068, + "step": 25239 + }, + { + "epoch": 2.993003676034626, + "grad_norm": 0.5233767665289992, + "learning_rate": 7.859743809457212e-06, + "loss": 0.0545, + "step": 25240 + }, + { + "epoch": 2.9931222577967507, + "grad_norm": 0.5705709742330571, + "learning_rate": 7.857996500426856e-06, + "loss": 0.0699, + "step": 25241 + }, + { + "epoch": 2.993240839558876, + "grad_norm": 0.7184691218353527, + "learning_rate": 7.856249349424757e-06, + "loss": 0.1195, + "step": 25242 + }, + { + "epoch": 2.993359421321001, + "grad_norm": 0.8460818116815699, + "learning_rate": 7.85450235646701e-06, + "loss": 0.1306, + "step": 25243 + }, + { + "epoch": 2.993478003083126, + "grad_norm": 0.6231702991214165, + "learning_rate": 7.852755521569727e-06, + "loss": 0.0799, + "step": 25244 + }, + { + "epoch": 2.9935965848452506, + "grad_norm": 0.5269989492860602, + "learning_rate": 7.851008844749011e-06, + "loss": 0.0728, + "step": 25245 + }, + { + "epoch": 2.993715166607376, + "grad_norm": 0.5950432786858968, + "learning_rate": 7.849262326020963e-06, + "loss": 0.0715, + "step": 25246 + }, + { + "epoch": 2.993833748369501, + "grad_norm": 0.7145967738703982, + "learning_rate": 7.847515965401692e-06, + "loss": 0.0982, + "step": 25247 + }, + { + "epoch": 2.9939523301316258, + "grad_norm": 0.815379042412025, + "learning_rate": 7.845769762907274e-06, + "loss": 0.101, + "step": 25248 + }, + { + "epoch": 2.9940709118937505, + "grad_norm": 0.741114986648285, + "learning_rate": 7.844023718553839e-06, + "loss": 0.0917, + "step": 25249 + }, + { + "epoch": 2.9941894936558757, + "grad_norm": 0.690947211112258, + "learning_rate": 7.842277832357461e-06, + "loss": 0.0886, + "step": 25250 + }, + { + "epoch": 2.994308075418001, + "grad_norm": 0.8754880215846245, + "learning_rate": 7.840532104334247e-06, + "loss": 0.1234, + "step": 25251 + }, + { + "epoch": 2.9944266571801257, + "grad_norm": 0.7607310659269182, + "learning_rate": 7.83878653450027e-06, + "loss": 0.0824, + "step": 25252 + }, + { + "epoch": 2.9945452389422504, + "grad_norm": 0.7239485446626567, + "learning_rate": 7.837041122871652e-06, + "loss": 0.1, + "step": 25253 + }, + { + "epoch": 2.9946638207043756, + "grad_norm": 1.0981227518580587, + "learning_rate": 7.835295869464462e-06, + "loss": 0.1331, + "step": 25254 + }, + { + "epoch": 2.994782402466501, + "grad_norm": 0.8828140819771635, + "learning_rate": 7.833550774294793e-06, + "loss": 0.1142, + "step": 25255 + }, + { + "epoch": 2.9949009842286256, + "grad_norm": 0.8095807662199901, + "learning_rate": 7.831805837378736e-06, + "loss": 0.1085, + "step": 25256 + }, + { + "epoch": 2.9950195659907504, + "grad_norm": 0.6413380262594447, + "learning_rate": 7.830061058732377e-06, + "loss": 0.0806, + "step": 25257 + }, + { + "epoch": 2.9951381477528756, + "grad_norm": 0.5935886533654066, + "learning_rate": 7.828316438371805e-06, + "loss": 0.0797, + "step": 25258 + }, + { + "epoch": 2.9952567295150008, + "grad_norm": 0.9397436959122861, + "learning_rate": 7.826571976313085e-06, + "loss": 0.1404, + "step": 25259 + }, + { + "epoch": 2.9953753112771255, + "grad_norm": 0.6745401105418741, + "learning_rate": 7.824827672572326e-06, + "loss": 0.1099, + "step": 25260 + }, + { + "epoch": 2.9954938930392503, + "grad_norm": 0.6330757498653369, + "learning_rate": 7.82308352716559e-06, + "loss": 0.0894, + "step": 25261 + }, + { + "epoch": 2.9956124748013755, + "grad_norm": 0.6003050689912874, + "learning_rate": 7.821339540108962e-06, + "loss": 0.0808, + "step": 25262 + }, + { + "epoch": 2.9957310565635007, + "grad_norm": 0.7722834072031718, + "learning_rate": 7.819595711418507e-06, + "loss": 0.0668, + "step": 25263 + }, + { + "epoch": 2.9958496383256255, + "grad_norm": 0.5536692675252685, + "learning_rate": 7.817852041110324e-06, + "loss": 0.0713, + "step": 25264 + }, + { + "epoch": 2.9959682200877507, + "grad_norm": 0.8908127539836134, + "learning_rate": 7.81610852920047e-06, + "loss": 0.1049, + "step": 25265 + }, + { + "epoch": 2.9960868018498754, + "grad_norm": 0.5425158439026335, + "learning_rate": 7.814365175705022e-06, + "loss": 0.0613, + "step": 25266 + }, + { + "epoch": 2.9962053836120006, + "grad_norm": 0.7505593778187494, + "learning_rate": 7.81262198064005e-06, + "loss": 0.0934, + "step": 25267 + }, + { + "epoch": 2.9963239653741254, + "grad_norm": 0.6088132107322531, + "learning_rate": 7.81087894402163e-06, + "loss": 0.0727, + "step": 25268 + }, + { + "epoch": 2.9964425471362506, + "grad_norm": 0.6069165277321645, + "learning_rate": 7.809136065865836e-06, + "loss": 0.0835, + "step": 25269 + }, + { + "epoch": 2.9965611288983753, + "grad_norm": 0.5711286678982278, + "learning_rate": 7.807393346188706e-06, + "loss": 0.0877, + "step": 25270 + }, + { + "epoch": 2.9966797106605005, + "grad_norm": 1.189273296662366, + "learning_rate": 7.805650785006346e-06, + "loss": 0.1082, + "step": 25271 + }, + { + "epoch": 2.9967982924226253, + "grad_norm": 0.4869226958481434, + "learning_rate": 7.803908382334792e-06, + "loss": 0.0639, + "step": 25272 + }, + { + "epoch": 2.9969168741847505, + "grad_norm": 1.0287821596449727, + "learning_rate": 7.802166138190117e-06, + "loss": 0.1257, + "step": 25273 + }, + { + "epoch": 2.9970354559468753, + "grad_norm": 0.5355696746484441, + "learning_rate": 7.800424052588379e-06, + "loss": 0.0671, + "step": 25274 + }, + { + "epoch": 2.9971540377090005, + "grad_norm": 0.5875351473657155, + "learning_rate": 7.798682125545642e-06, + "loss": 0.0711, + "step": 25275 + }, + { + "epoch": 2.997272619471125, + "grad_norm": 0.5942348404748906, + "learning_rate": 7.796940357077966e-06, + "loss": 0.0759, + "step": 25276 + }, + { + "epoch": 2.9973912012332504, + "grad_norm": 0.4826426034613959, + "learning_rate": 7.7951987472014e-06, + "loss": 0.0549, + "step": 25277 + }, + { + "epoch": 2.997509782995375, + "grad_norm": 0.4229056151173487, + "learning_rate": 7.793457295932002e-06, + "loss": 0.054, + "step": 25278 + }, + { + "epoch": 2.9976283647575004, + "grad_norm": 0.7746374040415275, + "learning_rate": 7.79171600328583e-06, + "loss": 0.0824, + "step": 25279 + }, + { + "epoch": 2.997746946519625, + "grad_norm": 0.5730604645236836, + "learning_rate": 7.789974869278938e-06, + "loss": 0.0672, + "step": 25280 + }, + { + "epoch": 2.9978655282817503, + "grad_norm": 0.7358408537569285, + "learning_rate": 7.78823389392736e-06, + "loss": 0.1132, + "step": 25281 + }, + { + "epoch": 2.997984110043875, + "grad_norm": 0.4748562226311641, + "learning_rate": 7.786493077247174e-06, + "loss": 0.058, + "step": 25282 + }, + { + "epoch": 2.9981026918060003, + "grad_norm": 0.7299297033352722, + "learning_rate": 7.784752419254406e-06, + "loss": 0.088, + "step": 25283 + }, + { + "epoch": 2.998221273568125, + "grad_norm": 0.790749497579351, + "learning_rate": 7.783011919965108e-06, + "loss": 0.1043, + "step": 25284 + }, + { + "epoch": 2.9983398553302503, + "grad_norm": 0.5135826227828129, + "learning_rate": 7.78127157939533e-06, + "loss": 0.0872, + "step": 25285 + }, + { + "epoch": 2.998458437092375, + "grad_norm": 0.4881535141370342, + "learning_rate": 7.779531397561113e-06, + "loss": 0.054, + "step": 25286 + }, + { + "epoch": 2.9985770188545002, + "grad_norm": 0.8641967477189936, + "learning_rate": 7.777791374478505e-06, + "loss": 0.1046, + "step": 25287 + }, + { + "epoch": 2.9986956006166254, + "grad_norm": 0.39636011073817334, + "learning_rate": 7.776051510163534e-06, + "loss": 0.0448, + "step": 25288 + }, + { + "epoch": 2.99881418237875, + "grad_norm": 0.7625473208992781, + "learning_rate": 7.774311804632248e-06, + "loss": 0.1118, + "step": 25289 + }, + { + "epoch": 2.998932764140875, + "grad_norm": 0.5668778963932375, + "learning_rate": 7.772572257900684e-06, + "loss": 0.0687, + "step": 25290 + }, + { + "epoch": 2.999051345903, + "grad_norm": 0.8637773032695936, + "learning_rate": 7.770832869984885e-06, + "loss": 0.1227, + "step": 25291 + }, + { + "epoch": 2.9991699276651254, + "grad_norm": 0.6867367231646458, + "learning_rate": 7.76909364090087e-06, + "loss": 0.0735, + "step": 25292 + }, + { + "epoch": 2.99928850942725, + "grad_norm": 0.37403637025196546, + "learning_rate": 7.767354570664684e-06, + "loss": 0.0518, + "step": 25293 + }, + { + "epoch": 2.999407091189375, + "grad_norm": 0.6623590080242845, + "learning_rate": 7.765615659292355e-06, + "loss": 0.0716, + "step": 25294 + }, + { + "epoch": 2.9995256729515, + "grad_norm": 0.7604372295553489, + "learning_rate": 7.763876906799917e-06, + "loss": 0.0896, + "step": 25295 + }, + { + "epoch": 2.9996442547136253, + "grad_norm": 0.6685344707824025, + "learning_rate": 7.762138313203396e-06, + "loss": 0.0658, + "step": 25296 + }, + { + "epoch": 2.99976283647575, + "grad_norm": 0.583796315175131, + "learning_rate": 7.76039987851882e-06, + "loss": 0.0627, + "step": 25297 + }, + { + "epoch": 2.999881418237875, + "grad_norm": 0.7340032238519414, + "learning_rate": 7.758661602762227e-06, + "loss": 0.0837, + "step": 25298 + }, + { + "epoch": 3.0, + "grad_norm": 0.6339409897348451, + "learning_rate": 7.756923485949624e-06, + "loss": 0.0847, + "step": 25299 + }, + { + "epoch": 3.000118581762125, + "grad_norm": 0.27818727890102973, + "learning_rate": 7.755185528097039e-06, + "loss": 0.021, + "step": 25300 + }, + { + "epoch": 3.00023716352425, + "grad_norm": 0.35217358455840414, + "learning_rate": 7.753447729220498e-06, + "loss": 0.0347, + "step": 25301 + }, + { + "epoch": 3.000355745286375, + "grad_norm": 0.27283024111008736, + "learning_rate": 7.751710089336028e-06, + "loss": 0.0244, + "step": 25302 + }, + { + "epoch": 3.0004743270485, + "grad_norm": 0.43092502941457056, + "learning_rate": 7.749972608459633e-06, + "loss": 0.0262, + "step": 25303 + }, + { + "epoch": 3.000592908810625, + "grad_norm": 0.7553318211907485, + "learning_rate": 7.748235286607334e-06, + "loss": 0.053, + "step": 25304 + }, + { + "epoch": 3.00071149057275, + "grad_norm": 0.36197013319953925, + "learning_rate": 7.746498123795152e-06, + "loss": 0.0305, + "step": 25305 + }, + { + "epoch": 3.000830072334875, + "grad_norm": 0.3202295039619893, + "learning_rate": 7.744761120039098e-06, + "loss": 0.034, + "step": 25306 + }, + { + "epoch": 3.000948654097, + "grad_norm": 0.48235980957807045, + "learning_rate": 7.743024275355196e-06, + "loss": 0.0558, + "step": 25307 + }, + { + "epoch": 3.001067235859125, + "grad_norm": 0.3056866212466977, + "learning_rate": 7.741287589759433e-06, + "loss": 0.0284, + "step": 25308 + }, + { + "epoch": 3.00118581762125, + "grad_norm": 0.5736939229410981, + "learning_rate": 7.73955106326785e-06, + "loss": 0.0585, + "step": 25309 + }, + { + "epoch": 3.001304399383375, + "grad_norm": 0.5367376690072162, + "learning_rate": 7.73781469589643e-06, + "loss": 0.0422, + "step": 25310 + }, + { + "epoch": 3.0014229811454998, + "grad_norm": 0.31583627608957515, + "learning_rate": 7.736078487661195e-06, + "loss": 0.0372, + "step": 25311 + }, + { + "epoch": 3.001541562907625, + "grad_norm": 0.551794205818285, + "learning_rate": 7.73434243857814e-06, + "loss": 0.0591, + "step": 25312 + }, + { + "epoch": 3.0016601446697497, + "grad_norm": 0.38367492631767275, + "learning_rate": 7.732606548663287e-06, + "loss": 0.0326, + "step": 25313 + }, + { + "epoch": 3.001778726431875, + "grad_norm": 0.4613275293292097, + "learning_rate": 7.730870817932617e-06, + "loss": 0.0383, + "step": 25314 + }, + { + "epoch": 3.0018973081939997, + "grad_norm": 0.5358568882722229, + "learning_rate": 7.729135246402145e-06, + "loss": 0.0634, + "step": 25315 + }, + { + "epoch": 3.002015889956125, + "grad_norm": 0.42210067041232824, + "learning_rate": 7.727399834087862e-06, + "loss": 0.0408, + "step": 25316 + }, + { + "epoch": 3.0021344717182497, + "grad_norm": 0.3145389464558861, + "learning_rate": 7.725664581005773e-06, + "loss": 0.0237, + "step": 25317 + }, + { + "epoch": 3.002253053480375, + "grad_norm": 0.2520551295018644, + "learning_rate": 7.723929487171882e-06, + "loss": 0.0172, + "step": 25318 + }, + { + "epoch": 3.0023716352424996, + "grad_norm": 0.37448281255490334, + "learning_rate": 7.722194552602161e-06, + "loss": 0.0389, + "step": 25319 + }, + { + "epoch": 3.002490217004625, + "grad_norm": 0.46808233269923394, + "learning_rate": 7.720459777312633e-06, + "loss": 0.0441, + "step": 25320 + }, + { + "epoch": 3.0026087987667496, + "grad_norm": 0.34471910179115867, + "learning_rate": 7.718725161319271e-06, + "loss": 0.0244, + "step": 25321 + }, + { + "epoch": 3.002727380528875, + "grad_norm": 0.35315788093451483, + "learning_rate": 7.71699070463807e-06, + "loss": 0.028, + "step": 25322 + }, + { + "epoch": 3.0028459622909995, + "grad_norm": 0.46816812381872286, + "learning_rate": 7.715256407285019e-06, + "loss": 0.0428, + "step": 25323 + }, + { + "epoch": 3.0029645440531247, + "grad_norm": 0.5268345562761996, + "learning_rate": 7.713522269276107e-06, + "loss": 0.0413, + "step": 25324 + }, + { + "epoch": 3.0030831258152495, + "grad_norm": 0.36311358433956514, + "learning_rate": 7.711788290627333e-06, + "loss": 0.036, + "step": 25325 + }, + { + "epoch": 3.0032017075773747, + "grad_norm": 0.4759122861182023, + "learning_rate": 7.710054471354655e-06, + "loss": 0.034, + "step": 25326 + }, + { + "epoch": 3.0033202893394995, + "grad_norm": 0.3904688167882179, + "learning_rate": 7.708320811474087e-06, + "loss": 0.0339, + "step": 25327 + }, + { + "epoch": 3.0034388711016247, + "grad_norm": 0.4223150674943986, + "learning_rate": 7.706587311001587e-06, + "loss": 0.033, + "step": 25328 + }, + { + "epoch": 3.0035574528637494, + "grad_norm": 0.36237905522696834, + "learning_rate": 7.704853969953155e-06, + "loss": 0.0348, + "step": 25329 + }, + { + "epoch": 3.0036760346258746, + "grad_norm": 0.4359669304129758, + "learning_rate": 7.703120788344745e-06, + "loss": 0.0349, + "step": 25330 + }, + { + "epoch": 3.0037946163879994, + "grad_norm": 0.3435525988853921, + "learning_rate": 7.701387766192367e-06, + "loss": 0.0257, + "step": 25331 + }, + { + "epoch": 3.0039131981501246, + "grad_norm": 0.6073826643054211, + "learning_rate": 7.699654903511972e-06, + "loss": 0.0459, + "step": 25332 + }, + { + "epoch": 3.0040317799122493, + "grad_norm": 0.4289672442904183, + "learning_rate": 7.697922200319543e-06, + "loss": 0.0367, + "step": 25333 + }, + { + "epoch": 3.0041503616743745, + "grad_norm": 0.8193486560506394, + "learning_rate": 7.696189656631056e-06, + "loss": 0.0604, + "step": 25334 + }, + { + "epoch": 3.0042689434364993, + "grad_norm": 0.503661824178371, + "learning_rate": 7.694457272462479e-06, + "loss": 0.0549, + "step": 25335 + }, + { + "epoch": 3.0043875251986245, + "grad_norm": 0.5903578466248951, + "learning_rate": 7.692725047829794e-06, + "loss": 0.0469, + "step": 25336 + }, + { + "epoch": 3.0045061069607493, + "grad_norm": 0.46457097160583094, + "learning_rate": 7.690992982748951e-06, + "loss": 0.0306, + "step": 25337 + }, + { + "epoch": 3.0046246887228745, + "grad_norm": 0.20876976530492458, + "learning_rate": 7.68926107723593e-06, + "loss": 0.0185, + "step": 25338 + }, + { + "epoch": 3.0047432704849992, + "grad_norm": 0.4331729389407821, + "learning_rate": 7.68752933130669e-06, + "loss": 0.0337, + "step": 25339 + }, + { + "epoch": 3.0048618522471244, + "grad_norm": 0.31807159279801295, + "learning_rate": 7.68579774497721e-06, + "loss": 0.0265, + "step": 25340 + }, + { + "epoch": 3.004980434009249, + "grad_norm": 0.42754621474455795, + "learning_rate": 7.684066318263425e-06, + "loss": 0.0242, + "step": 25341 + }, + { + "epoch": 3.0050990157713744, + "grad_norm": 0.26186124757045237, + "learning_rate": 7.682335051181332e-06, + "loss": 0.0193, + "step": 25342 + }, + { + "epoch": 3.005217597533499, + "grad_norm": 0.5245283633670818, + "learning_rate": 7.680603943746866e-06, + "loss": 0.0257, + "step": 25343 + }, + { + "epoch": 3.0053361792956244, + "grad_norm": 0.29416918952890303, + "learning_rate": 7.678872995975992e-06, + "loss": 0.025, + "step": 25344 + }, + { + "epoch": 3.005454761057749, + "grad_norm": 0.3433107662747983, + "learning_rate": 7.677142207884668e-06, + "loss": 0.0256, + "step": 25345 + }, + { + "epoch": 3.0055733428198743, + "grad_norm": 0.5501222903932953, + "learning_rate": 7.67541157948885e-06, + "loss": 0.0405, + "step": 25346 + }, + { + "epoch": 3.005691924581999, + "grad_norm": 0.2768194733225938, + "learning_rate": 7.673681110804504e-06, + "loss": 0.0231, + "step": 25347 + }, + { + "epoch": 3.0058105063441243, + "grad_norm": 0.2941069138209204, + "learning_rate": 7.671950801847558e-06, + "loss": 0.0253, + "step": 25348 + }, + { + "epoch": 3.0059290881062495, + "grad_norm": 0.6416164977314439, + "learning_rate": 7.670220652633981e-06, + "loss": 0.0427, + "step": 25349 + }, + { + "epoch": 3.0060476698683742, + "grad_norm": 0.470782626272427, + "learning_rate": 7.668490663179715e-06, + "loss": 0.0349, + "step": 25350 + }, + { + "epoch": 3.0061662516304994, + "grad_norm": 0.5461681129762601, + "learning_rate": 7.666760833500722e-06, + "loss": 0.0427, + "step": 25351 + }, + { + "epoch": 3.006284833392624, + "grad_norm": 0.4361119557985114, + "learning_rate": 7.665031163612921e-06, + "loss": 0.0277, + "step": 25352 + }, + { + "epoch": 3.0064034151547494, + "grad_norm": 0.4890419971828236, + "learning_rate": 7.663301653532293e-06, + "loss": 0.0321, + "step": 25353 + }, + { + "epoch": 3.006521996916874, + "grad_norm": 0.5029343081983291, + "learning_rate": 7.661572303274756e-06, + "loss": 0.0265, + "step": 25354 + }, + { + "epoch": 3.0066405786789994, + "grad_norm": 0.44630892821797713, + "learning_rate": 7.659843112856258e-06, + "loss": 0.0367, + "step": 25355 + }, + { + "epoch": 3.006759160441124, + "grad_norm": 0.47680585725105595, + "learning_rate": 7.658114082292744e-06, + "loss": 0.0289, + "step": 25356 + }, + { + "epoch": 3.0068777422032493, + "grad_norm": 0.3759872146988128, + "learning_rate": 7.656385211600154e-06, + "loss": 0.0308, + "step": 25357 + }, + { + "epoch": 3.006996323965374, + "grad_norm": 0.44452476696037296, + "learning_rate": 7.65465650079443e-06, + "loss": 0.0411, + "step": 25358 + }, + { + "epoch": 3.0071149057274993, + "grad_norm": 0.4222140817141822, + "learning_rate": 7.652927949891495e-06, + "loss": 0.0411, + "step": 25359 + }, + { + "epoch": 3.007233487489624, + "grad_norm": 0.4472191310101325, + "learning_rate": 7.651199558907293e-06, + "loss": 0.0241, + "step": 25360 + }, + { + "epoch": 3.0073520692517492, + "grad_norm": 0.5802810185708355, + "learning_rate": 7.649471327857754e-06, + "loss": 0.0471, + "step": 25361 + }, + { + "epoch": 3.007470651013874, + "grad_norm": 0.6064955604777108, + "learning_rate": 7.647743256758824e-06, + "loss": 0.0303, + "step": 25362 + }, + { + "epoch": 3.007589232775999, + "grad_norm": 0.36335979433933546, + "learning_rate": 7.646015345626414e-06, + "loss": 0.029, + "step": 25363 + }, + { + "epoch": 3.007707814538124, + "grad_norm": 0.2531722069430395, + "learning_rate": 7.644287594476459e-06, + "loss": 0.0155, + "step": 25364 + }, + { + "epoch": 3.007826396300249, + "grad_norm": 0.46042782393622717, + "learning_rate": 7.642560003324892e-06, + "loss": 0.0315, + "step": 25365 + }, + { + "epoch": 3.007944978062374, + "grad_norm": 0.3109669690984699, + "learning_rate": 7.640832572187636e-06, + "loss": 0.0165, + "step": 25366 + }, + { + "epoch": 3.008063559824499, + "grad_norm": 0.5457014048185769, + "learning_rate": 7.639105301080615e-06, + "loss": 0.0299, + "step": 25367 + }, + { + "epoch": 3.008182141586624, + "grad_norm": 0.3474581289206686, + "learning_rate": 7.637378190019757e-06, + "loss": 0.0248, + "step": 25368 + }, + { + "epoch": 3.008300723348749, + "grad_norm": 0.42491363989754444, + "learning_rate": 7.635651239020986e-06, + "loss": 0.0241, + "step": 25369 + }, + { + "epoch": 3.008419305110874, + "grad_norm": 0.3587769260082179, + "learning_rate": 7.63392444810021e-06, + "loss": 0.0286, + "step": 25370 + }, + { + "epoch": 3.008537886872999, + "grad_norm": 0.5104000025690947, + "learning_rate": 7.632197817273354e-06, + "loss": 0.0319, + "step": 25371 + }, + { + "epoch": 3.008656468635124, + "grad_norm": 0.5650411135422209, + "learning_rate": 7.630471346556337e-06, + "loss": 0.0419, + "step": 25372 + }, + { + "epoch": 3.008775050397249, + "grad_norm": 0.3713623111155855, + "learning_rate": 7.628745035965073e-06, + "loss": 0.023, + "step": 25373 + }, + { + "epoch": 3.0088936321593738, + "grad_norm": 0.5907039796316705, + "learning_rate": 7.627018885515489e-06, + "loss": 0.0381, + "step": 25374 + }, + { + "epoch": 3.009012213921499, + "grad_norm": 0.37882622170220515, + "learning_rate": 7.625292895223468e-06, + "loss": 0.0211, + "step": 25375 + }, + { + "epoch": 3.0091307956836237, + "grad_norm": 0.39308977428435743, + "learning_rate": 7.6235670651049554e-06, + "loss": 0.0266, + "step": 25376 + }, + { + "epoch": 3.009249377445749, + "grad_norm": 0.5816389466022209, + "learning_rate": 7.6218413951758414e-06, + "loss": 0.0366, + "step": 25377 + }, + { + "epoch": 3.0093679592078737, + "grad_norm": 0.505493144867611, + "learning_rate": 7.6201158854520446e-06, + "loss": 0.0353, + "step": 25378 + }, + { + "epoch": 3.009486540969999, + "grad_norm": 0.43790797403121146, + "learning_rate": 7.618390535949452e-06, + "loss": 0.0337, + "step": 25379 + }, + { + "epoch": 3.0096051227321237, + "grad_norm": 0.6844038288853503, + "learning_rate": 7.616665346683999e-06, + "loss": 0.0408, + "step": 25380 + }, + { + "epoch": 3.009723704494249, + "grad_norm": 0.5432711097006784, + "learning_rate": 7.614940317671568e-06, + "loss": 0.0291, + "step": 25381 + }, + { + "epoch": 3.0098422862563736, + "grad_norm": 0.3267560731312228, + "learning_rate": 7.6132154489280656e-06, + "loss": 0.0244, + "step": 25382 + }, + { + "epoch": 3.009960868018499, + "grad_norm": 0.531546862791762, + "learning_rate": 7.611490740469398e-06, + "loss": 0.0244, + "step": 25383 + }, + { + "epoch": 3.0100794497806236, + "grad_norm": 0.6619541563963426, + "learning_rate": 7.609766192311463e-06, + "loss": 0.0385, + "step": 25384 + }, + { + "epoch": 3.010198031542749, + "grad_norm": 0.519515379490031, + "learning_rate": 7.6080418044701646e-06, + "loss": 0.033, + "step": 25385 + }, + { + "epoch": 3.0103166133048735, + "grad_norm": 0.5581110928982194, + "learning_rate": 7.606317576961378e-06, + "loss": 0.0408, + "step": 25386 + }, + { + "epoch": 3.0104351950669987, + "grad_norm": 0.5830136000335294, + "learning_rate": 7.604593509801031e-06, + "loss": 0.0225, + "step": 25387 + }, + { + "epoch": 3.0105537768291235, + "grad_norm": 0.5932803466550683, + "learning_rate": 7.602869603004989e-06, + "loss": 0.039, + "step": 25388 + }, + { + "epoch": 3.0106723585912487, + "grad_norm": 0.3653226217483757, + "learning_rate": 7.601145856589168e-06, + "loss": 0.0205, + "step": 25389 + }, + { + "epoch": 3.0107909403533735, + "grad_norm": 0.5335073654554573, + "learning_rate": 7.599422270569426e-06, + "loss": 0.0439, + "step": 25390 + }, + { + "epoch": 3.0109095221154987, + "grad_norm": 0.501314571655461, + "learning_rate": 7.59769884496169e-06, + "loss": 0.0313, + "step": 25391 + }, + { + "epoch": 3.0110281038776234, + "grad_norm": 0.6311929209988606, + "learning_rate": 7.595975579781825e-06, + "loss": 0.0293, + "step": 25392 + }, + { + "epoch": 3.0111466856397486, + "grad_norm": 0.5608401543106221, + "learning_rate": 7.594252475045721e-06, + "loss": 0.0332, + "step": 25393 + }, + { + "epoch": 3.0112652674018734, + "grad_norm": 0.7724999945553648, + "learning_rate": 7.5925295307692625e-06, + "loss": 0.0441, + "step": 25394 + }, + { + "epoch": 3.0113838491639986, + "grad_norm": 0.3729481528411709, + "learning_rate": 7.59080674696834e-06, + "loss": 0.0239, + "step": 25395 + }, + { + "epoch": 3.0115024309261234, + "grad_norm": 0.42519706082795705, + "learning_rate": 7.589084123658835e-06, + "loss": 0.0244, + "step": 25396 + }, + { + "epoch": 3.0116210126882486, + "grad_norm": 0.4523489840687546, + "learning_rate": 7.587361660856609e-06, + "loss": 0.0384, + "step": 25397 + }, + { + "epoch": 3.0117395944503733, + "grad_norm": 0.5731833973772033, + "learning_rate": 7.585639358577573e-06, + "loss": 0.0377, + "step": 25398 + }, + { + "epoch": 3.0118581762124985, + "grad_norm": 0.45622100060080334, + "learning_rate": 7.583917216837577e-06, + "loss": 0.0236, + "step": 25399 + }, + { + "epoch": 3.0119767579746237, + "grad_norm": 0.33163745275135703, + "learning_rate": 7.5821952356525175e-06, + "loss": 0.0151, + "step": 25400 + }, + { + "epoch": 3.0120953397367485, + "grad_norm": 0.32951217013573225, + "learning_rate": 7.580473415038241e-06, + "loss": 0.016, + "step": 25401 + }, + { + "epoch": 3.0122139214988737, + "grad_norm": 0.43511688077262295, + "learning_rate": 7.578751755010655e-06, + "loss": 0.0307, + "step": 25402 + }, + { + "epoch": 3.0123325032609984, + "grad_norm": 0.6094388201504408, + "learning_rate": 7.577030255585607e-06, + "loss": 0.0321, + "step": 25403 + }, + { + "epoch": 3.0124510850231236, + "grad_norm": 0.34180785539117436, + "learning_rate": 7.575308916778973e-06, + "loss": 0.0213, + "step": 25404 + }, + { + "epoch": 3.0125696667852484, + "grad_norm": 0.5030362628932578, + "learning_rate": 7.573587738606625e-06, + "loss": 0.0348, + "step": 25405 + }, + { + "epoch": 3.0126882485473736, + "grad_norm": 0.3843773073384051, + "learning_rate": 7.571866721084428e-06, + "loss": 0.0223, + "step": 25406 + }, + { + "epoch": 3.0128068303094984, + "grad_norm": 0.3393651335151554, + "learning_rate": 7.5701458642282534e-06, + "loss": 0.0138, + "step": 25407 + }, + { + "epoch": 3.0129254120716236, + "grad_norm": 0.4249248547583195, + "learning_rate": 7.568425168053955e-06, + "loss": 0.0192, + "step": 25408 + }, + { + "epoch": 3.0130439938337483, + "grad_norm": 0.4269149796282927, + "learning_rate": 7.566704632577401e-06, + "loss": 0.0291, + "step": 25409 + }, + { + "epoch": 3.0131625755958735, + "grad_norm": 0.9291809820174342, + "learning_rate": 7.5649842578144505e-06, + "loss": 0.0408, + "step": 25410 + }, + { + "epoch": 3.0132811573579983, + "grad_norm": 0.7037282114644994, + "learning_rate": 7.563264043780974e-06, + "loss": 0.026, + "step": 25411 + }, + { + "epoch": 3.0133997391201235, + "grad_norm": 0.6498371476007979, + "learning_rate": 7.561543990492803e-06, + "loss": 0.049, + "step": 25412 + }, + { + "epoch": 3.0135183208822482, + "grad_norm": 0.5460178927172625, + "learning_rate": 7.559824097965829e-06, + "loss": 0.0416, + "step": 25413 + }, + { + "epoch": 3.0136369026443734, + "grad_norm": 0.4557348617868993, + "learning_rate": 7.558104366215882e-06, + "loss": 0.0349, + "step": 25414 + }, + { + "epoch": 3.013755484406498, + "grad_norm": 0.5160595846758902, + "learning_rate": 7.5563847952588255e-06, + "loss": 0.0283, + "step": 25415 + }, + { + "epoch": 3.0138740661686234, + "grad_norm": 0.6561190285564303, + "learning_rate": 7.554665385110507e-06, + "loss": 0.0373, + "step": 25416 + }, + { + "epoch": 3.013992647930748, + "grad_norm": 0.3665816233604149, + "learning_rate": 7.552946135786784e-06, + "loss": 0.0299, + "step": 25417 + }, + { + "epoch": 3.0141112296928734, + "grad_norm": 0.5901339208803061, + "learning_rate": 7.551227047303511e-06, + "loss": 0.0361, + "step": 25418 + }, + { + "epoch": 3.014229811454998, + "grad_norm": 0.7071965063824553, + "learning_rate": 7.549508119676518e-06, + "loss": 0.0428, + "step": 25419 + }, + { + "epoch": 3.0143483932171233, + "grad_norm": 0.5965694740222681, + "learning_rate": 7.547789352921661e-06, + "loss": 0.0456, + "step": 25420 + }, + { + "epoch": 3.014466974979248, + "grad_norm": 0.33008458886896436, + "learning_rate": 7.546070747054784e-06, + "loss": 0.0192, + "step": 25421 + }, + { + "epoch": 3.0145855567413733, + "grad_norm": 0.6834306122913407, + "learning_rate": 7.544352302091731e-06, + "loss": 0.0317, + "step": 25422 + }, + { + "epoch": 3.014704138503498, + "grad_norm": 0.4077458094104322, + "learning_rate": 7.542634018048355e-06, + "loss": 0.0233, + "step": 25423 + }, + { + "epoch": 3.0148227202656233, + "grad_norm": 0.2889129402649483, + "learning_rate": 7.540915894940467e-06, + "loss": 0.014, + "step": 25424 + }, + { + "epoch": 3.014941302027748, + "grad_norm": 0.4146099568217442, + "learning_rate": 7.539197932783942e-06, + "loss": 0.0232, + "step": 25425 + }, + { + "epoch": 3.015059883789873, + "grad_norm": 0.4852870317192876, + "learning_rate": 7.537480131594593e-06, + "loss": 0.0237, + "step": 25426 + }, + { + "epoch": 3.015178465551998, + "grad_norm": 0.5029454665275506, + "learning_rate": 7.5357624913882624e-06, + "loss": 0.0305, + "step": 25427 + }, + { + "epoch": 3.015297047314123, + "grad_norm": 0.46594863195527386, + "learning_rate": 7.534045012180785e-06, + "loss": 0.0298, + "step": 25428 + }, + { + "epoch": 3.015415629076248, + "grad_norm": 0.627448694880681, + "learning_rate": 7.532327693988003e-06, + "loss": 0.0268, + "step": 25429 + }, + { + "epoch": 3.015534210838373, + "grad_norm": 0.4082427994379912, + "learning_rate": 7.530610536825733e-06, + "loss": 0.0246, + "step": 25430 + }, + { + "epoch": 3.015652792600498, + "grad_norm": 0.5804619848273482, + "learning_rate": 7.528893540709808e-06, + "loss": 0.0336, + "step": 25431 + }, + { + "epoch": 3.015771374362623, + "grad_norm": 0.38520930168703293, + "learning_rate": 7.527176705656064e-06, + "loss": 0.0178, + "step": 25432 + }, + { + "epoch": 3.015889956124748, + "grad_norm": 0.46814477530008647, + "learning_rate": 7.5254600316803205e-06, + "loss": 0.0244, + "step": 25433 + }, + { + "epoch": 3.016008537886873, + "grad_norm": 0.647671379254069, + "learning_rate": 7.523743518798418e-06, + "loss": 0.0317, + "step": 25434 + }, + { + "epoch": 3.016127119648998, + "grad_norm": 0.42928796524742885, + "learning_rate": 7.52202716702615e-06, + "loss": 0.0235, + "step": 25435 + }, + { + "epoch": 3.016245701411123, + "grad_norm": 0.8138572936687117, + "learning_rate": 7.520310976379377e-06, + "loss": 0.0365, + "step": 25436 + }, + { + "epoch": 3.016364283173248, + "grad_norm": 0.5580902746399836, + "learning_rate": 7.5185949468738945e-06, + "loss": 0.0351, + "step": 25437 + }, + { + "epoch": 3.016482864935373, + "grad_norm": 0.44280617287973545, + "learning_rate": 7.516879078525527e-06, + "loss": 0.0262, + "step": 25438 + }, + { + "epoch": 3.0166014466974977, + "grad_norm": 0.723328104005943, + "learning_rate": 7.5151633713500955e-06, + "loss": 0.0423, + "step": 25439 + }, + { + "epoch": 3.016720028459623, + "grad_norm": 0.3132665552190127, + "learning_rate": 7.513447825363426e-06, + "loss": 0.0177, + "step": 25440 + }, + { + "epoch": 3.0168386102217477, + "grad_norm": 0.44284573465599325, + "learning_rate": 7.511732440581315e-06, + "loss": 0.025, + "step": 25441 + }, + { + "epoch": 3.016957191983873, + "grad_norm": 0.5274997634394825, + "learning_rate": 7.510017217019585e-06, + "loss": 0.0354, + "step": 25442 + }, + { + "epoch": 3.0170757737459977, + "grad_norm": 0.4956010103901101, + "learning_rate": 7.508302154694049e-06, + "loss": 0.0223, + "step": 25443 + }, + { + "epoch": 3.017194355508123, + "grad_norm": 0.3588426228123138, + "learning_rate": 7.50658725362052e-06, + "loss": 0.0232, + "step": 25444 + }, + { + "epoch": 3.0173129372702476, + "grad_norm": 0.5731811465014716, + "learning_rate": 7.504872513814809e-06, + "loss": 0.0364, + "step": 25445 + }, + { + "epoch": 3.017431519032373, + "grad_norm": 0.5455588406732829, + "learning_rate": 7.503157935292707e-06, + "loss": 0.0185, + "step": 25446 + }, + { + "epoch": 3.0175501007944976, + "grad_norm": 0.5028066201597671, + "learning_rate": 7.501443518070045e-06, + "loss": 0.0267, + "step": 25447 + }, + { + "epoch": 3.017668682556623, + "grad_norm": 0.3065145814931162, + "learning_rate": 7.499729262162611e-06, + "loss": 0.0114, + "step": 25448 + }, + { + "epoch": 3.0177872643187476, + "grad_norm": 0.5697332413740279, + "learning_rate": 7.498015167586217e-06, + "loss": 0.0267, + "step": 25449 + }, + { + "epoch": 3.0179058460808728, + "grad_norm": 0.4971967592931499, + "learning_rate": 7.496301234356648e-06, + "loss": 0.0249, + "step": 25450 + }, + { + "epoch": 3.018024427842998, + "grad_norm": 0.5962900140790216, + "learning_rate": 7.494587462489733e-06, + "loss": 0.0311, + "step": 25451 + }, + { + "epoch": 3.0181430096051227, + "grad_norm": 0.5537780303139198, + "learning_rate": 7.492873852001245e-06, + "loss": 0.0335, + "step": 25452 + }, + { + "epoch": 3.018261591367248, + "grad_norm": 0.4272505611817636, + "learning_rate": 7.491160402906994e-06, + "loss": 0.031, + "step": 25453 + }, + { + "epoch": 3.0183801731293727, + "grad_norm": 0.39656564031427666, + "learning_rate": 7.489447115222773e-06, + "loss": 0.0268, + "step": 25454 + }, + { + "epoch": 3.018498754891498, + "grad_norm": 0.3603879682030226, + "learning_rate": 7.487733988964374e-06, + "loss": 0.0216, + "step": 25455 + }, + { + "epoch": 3.0186173366536226, + "grad_norm": 0.6655804326461775, + "learning_rate": 7.486021024147602e-06, + "loss": 0.0282, + "step": 25456 + }, + { + "epoch": 3.018735918415748, + "grad_norm": 0.2865516950031116, + "learning_rate": 7.4843082207882245e-06, + "loss": 0.0133, + "step": 25457 + }, + { + "epoch": 3.0188545001778726, + "grad_norm": 0.5542351168382047, + "learning_rate": 7.4825955789020604e-06, + "loss": 0.0265, + "step": 25458 + }, + { + "epoch": 3.018973081939998, + "grad_norm": 0.5509018829834003, + "learning_rate": 7.480883098504876e-06, + "loss": 0.0227, + "step": 25459 + }, + { + "epoch": 3.0190916637021226, + "grad_norm": 0.46691607209142694, + "learning_rate": 7.479170779612476e-06, + "loss": 0.0239, + "step": 25460 + }, + { + "epoch": 3.0192102454642478, + "grad_norm": 0.49743638913125854, + "learning_rate": 7.477458622240618e-06, + "loss": 0.0186, + "step": 25461 + }, + { + "epoch": 3.0193288272263725, + "grad_norm": 0.539179207255222, + "learning_rate": 7.475746626405122e-06, + "loss": 0.0372, + "step": 25462 + }, + { + "epoch": 3.0194474089884977, + "grad_norm": 0.5734367213765001, + "learning_rate": 7.474034792121742e-06, + "loss": 0.0278, + "step": 25463 + }, + { + "epoch": 3.0195659907506225, + "grad_norm": 0.44320008915197917, + "learning_rate": 7.472323119406272e-06, + "loss": 0.0235, + "step": 25464 + }, + { + "epoch": 3.0196845725127477, + "grad_norm": 0.543820303240899, + "learning_rate": 7.470611608274486e-06, + "loss": 0.0279, + "step": 25465 + }, + { + "epoch": 3.0198031542748724, + "grad_norm": 0.3452124879199205, + "learning_rate": 7.468900258742167e-06, + "loss": 0.0206, + "step": 25466 + }, + { + "epoch": 3.0199217360369977, + "grad_norm": 0.6951963660685124, + "learning_rate": 7.4671890708250954e-06, + "loss": 0.0311, + "step": 25467 + }, + { + "epoch": 3.0200403177991224, + "grad_norm": 0.3528550892162062, + "learning_rate": 7.465478044539026e-06, + "loss": 0.0204, + "step": 25468 + }, + { + "epoch": 3.0201588995612476, + "grad_norm": 0.34822199429530587, + "learning_rate": 7.463767179899764e-06, + "loss": 0.0201, + "step": 25469 + }, + { + "epoch": 3.0202774813233724, + "grad_norm": 0.5058397970584119, + "learning_rate": 7.462056476923054e-06, + "loss": 0.024, + "step": 25470 + }, + { + "epoch": 3.0203960630854976, + "grad_norm": 0.6135611477534872, + "learning_rate": 7.460345935624685e-06, + "loss": 0.0297, + "step": 25471 + }, + { + "epoch": 3.0205146448476223, + "grad_norm": 0.407701367865082, + "learning_rate": 7.458635556020405e-06, + "loss": 0.0206, + "step": 25472 + }, + { + "epoch": 3.0206332266097475, + "grad_norm": 0.5893773354722381, + "learning_rate": 7.456925338126009e-06, + "loss": 0.0366, + "step": 25473 + }, + { + "epoch": 3.0207518083718723, + "grad_norm": 0.40468513353752417, + "learning_rate": 7.45521528195724e-06, + "loss": 0.0237, + "step": 25474 + }, + { + "epoch": 3.0208703901339975, + "grad_norm": 0.506267146506876, + "learning_rate": 7.453505387529869e-06, + "loss": 0.0234, + "step": 25475 + }, + { + "epoch": 3.0209889718961223, + "grad_norm": 0.46690684453358017, + "learning_rate": 7.451795654859664e-06, + "loss": 0.0211, + "step": 25476 + }, + { + "epoch": 3.0211075536582475, + "grad_norm": 0.38917389295522664, + "learning_rate": 7.450086083962385e-06, + "loss": 0.019, + "step": 25477 + }, + { + "epoch": 3.021226135420372, + "grad_norm": 0.9520255297834674, + "learning_rate": 7.448376674853796e-06, + "loss": 0.0567, + "step": 25478 + }, + { + "epoch": 3.0213447171824974, + "grad_norm": 0.6491049203808418, + "learning_rate": 7.446667427549645e-06, + "loss": 0.0348, + "step": 25479 + }, + { + "epoch": 3.021463298944622, + "grad_norm": 0.5401192613479493, + "learning_rate": 7.444958342065694e-06, + "loss": 0.0348, + "step": 25480 + }, + { + "epoch": 3.0215818807067474, + "grad_norm": 0.3978183287733996, + "learning_rate": 7.4432494184176995e-06, + "loss": 0.0255, + "step": 25481 + }, + { + "epoch": 3.021700462468872, + "grad_norm": 0.6458013935391205, + "learning_rate": 7.441540656621418e-06, + "loss": 0.0371, + "step": 25482 + }, + { + "epoch": 3.0218190442309973, + "grad_norm": 0.46580433101473323, + "learning_rate": 7.4398320566926e-06, + "loss": 0.0211, + "step": 25483 + }, + { + "epoch": 3.021937625993122, + "grad_norm": 0.5672087461617801, + "learning_rate": 7.438123618646994e-06, + "loss": 0.0287, + "step": 25484 + }, + { + "epoch": 3.0220562077552473, + "grad_norm": 0.23006123234189121, + "learning_rate": 7.436415342500363e-06, + "loss": 0.0141, + "step": 25485 + }, + { + "epoch": 3.022174789517372, + "grad_norm": 0.4669247616101962, + "learning_rate": 7.434707228268434e-06, + "loss": 0.0193, + "step": 25486 + }, + { + "epoch": 3.0222933712794973, + "grad_norm": 0.5390706610589175, + "learning_rate": 7.4329992759669676e-06, + "loss": 0.0285, + "step": 25487 + }, + { + "epoch": 3.022411953041622, + "grad_norm": 0.45281783674892206, + "learning_rate": 7.4312914856117025e-06, + "loss": 0.0199, + "step": 25488 + }, + { + "epoch": 3.0225305348037472, + "grad_norm": 0.6043914276354383, + "learning_rate": 7.429583857218395e-06, + "loss": 0.0311, + "step": 25489 + }, + { + "epoch": 3.022649116565872, + "grad_norm": 0.4681167595521451, + "learning_rate": 7.427876390802771e-06, + "loss": 0.0242, + "step": 25490 + }, + { + "epoch": 3.022767698327997, + "grad_norm": 0.5553074656442504, + "learning_rate": 7.426169086380577e-06, + "loss": 0.0229, + "step": 25491 + }, + { + "epoch": 3.022886280090122, + "grad_norm": 0.4984089144414229, + "learning_rate": 7.424461943967556e-06, + "loss": 0.0242, + "step": 25492 + }, + { + "epoch": 3.023004861852247, + "grad_norm": 0.3980506446070038, + "learning_rate": 7.42275496357944e-06, + "loss": 0.0202, + "step": 25493 + }, + { + "epoch": 3.023123443614372, + "grad_norm": 0.4487572316534197, + "learning_rate": 7.421048145231979e-06, + "loss": 0.0223, + "step": 25494 + }, + { + "epoch": 3.023242025376497, + "grad_norm": 0.4599336378042825, + "learning_rate": 7.419341488940882e-06, + "loss": 0.0191, + "step": 25495 + }, + { + "epoch": 3.023360607138622, + "grad_norm": 0.6497118454685568, + "learning_rate": 7.417634994721911e-06, + "loss": 0.0306, + "step": 25496 + }, + { + "epoch": 3.023479188900747, + "grad_norm": 0.6240516743373182, + "learning_rate": 7.415928662590776e-06, + "loss": 0.0341, + "step": 25497 + }, + { + "epoch": 3.023597770662872, + "grad_norm": 0.6396505858844798, + "learning_rate": 7.414222492563219e-06, + "loss": 0.03, + "step": 25498 + }, + { + "epoch": 3.023716352424997, + "grad_norm": 0.49692055649460204, + "learning_rate": 7.412516484654963e-06, + "loss": 0.0251, + "step": 25499 + }, + { + "epoch": 3.0238349341871222, + "grad_norm": 0.38333478252429115, + "learning_rate": 7.410810638881746e-06, + "loss": 0.0221, + "step": 25500 + }, + { + "epoch": 3.023953515949247, + "grad_norm": 0.5292921259006801, + "learning_rate": 7.40910495525928e-06, + "loss": 0.0292, + "step": 25501 + }, + { + "epoch": 3.024072097711372, + "grad_norm": 0.7714009184534717, + "learning_rate": 7.407399433803294e-06, + "loss": 0.0394, + "step": 25502 + }, + { + "epoch": 3.024190679473497, + "grad_norm": 0.4984785947562804, + "learning_rate": 7.4056940745295095e-06, + "loss": 0.0287, + "step": 25503 + }, + { + "epoch": 3.024309261235622, + "grad_norm": 0.3162949668172879, + "learning_rate": 7.403988877453652e-06, + "loss": 0.0192, + "step": 25504 + }, + { + "epoch": 3.024427842997747, + "grad_norm": 0.39241984280731035, + "learning_rate": 7.402283842591448e-06, + "loss": 0.0215, + "step": 25505 + }, + { + "epoch": 3.024546424759872, + "grad_norm": 0.4095016754355888, + "learning_rate": 7.400578969958588e-06, + "loss": 0.0217, + "step": 25506 + }, + { + "epoch": 3.024665006521997, + "grad_norm": 0.5149680771165915, + "learning_rate": 7.398874259570826e-06, + "loss": 0.0338, + "step": 25507 + }, + { + "epoch": 3.024783588284122, + "grad_norm": 0.3419044233024707, + "learning_rate": 7.397169711443852e-06, + "loss": 0.016, + "step": 25508 + }, + { + "epoch": 3.024902170046247, + "grad_norm": 0.682506261179491, + "learning_rate": 7.395465325593395e-06, + "loss": 0.0396, + "step": 25509 + }, + { + "epoch": 3.025020751808372, + "grad_norm": 0.693481942071081, + "learning_rate": 7.393761102035143e-06, + "loss": 0.0564, + "step": 25510 + }, + { + "epoch": 3.025139333570497, + "grad_norm": 0.8032710234003713, + "learning_rate": 7.392057040784842e-06, + "loss": 0.0346, + "step": 25511 + }, + { + "epoch": 3.025257915332622, + "grad_norm": 0.3852059827950563, + "learning_rate": 7.390353141858172e-06, + "loss": 0.0197, + "step": 25512 + }, + { + "epoch": 3.0253764970947468, + "grad_norm": 0.64323019124524, + "learning_rate": 7.3886494052708534e-06, + "loss": 0.0315, + "step": 25513 + }, + { + "epoch": 3.025495078856872, + "grad_norm": 0.4890779997762815, + "learning_rate": 7.3869458310385886e-06, + "loss": 0.0234, + "step": 25514 + }, + { + "epoch": 3.0256136606189967, + "grad_norm": 0.40555980939821606, + "learning_rate": 7.385242419177086e-06, + "loss": 0.0273, + "step": 25515 + }, + { + "epoch": 3.025732242381122, + "grad_norm": 0.44748269893771897, + "learning_rate": 7.3835391697020554e-06, + "loss": 0.0321, + "step": 25516 + }, + { + "epoch": 3.0258508241432467, + "grad_norm": 0.4395189246817603, + "learning_rate": 7.381836082629176e-06, + "loss": 0.0242, + "step": 25517 + }, + { + "epoch": 3.025969405905372, + "grad_norm": 0.6776546167948023, + "learning_rate": 7.380133157974178e-06, + "loss": 0.0296, + "step": 25518 + }, + { + "epoch": 3.0260879876674966, + "grad_norm": 0.2925616888456235, + "learning_rate": 7.378430395752736e-06, + "loss": 0.0137, + "step": 25519 + }, + { + "epoch": 3.026206569429622, + "grad_norm": 0.4032751310903437, + "learning_rate": 7.376727795980568e-06, + "loss": 0.0182, + "step": 25520 + }, + { + "epoch": 3.0263251511917466, + "grad_norm": 0.47261942660661505, + "learning_rate": 7.37502535867334e-06, + "loss": 0.0297, + "step": 25521 + }, + { + "epoch": 3.026443732953872, + "grad_norm": 0.3904915060181876, + "learning_rate": 7.373323083846781e-06, + "loss": 0.0218, + "step": 25522 + }, + { + "epoch": 3.0265623147159966, + "grad_norm": 0.4027048592966785, + "learning_rate": 7.371620971516563e-06, + "loss": 0.0248, + "step": 25523 + }, + { + "epoch": 3.0266808964781218, + "grad_norm": 0.7468072613979407, + "learning_rate": 7.369919021698379e-06, + "loss": 0.0299, + "step": 25524 + }, + { + "epoch": 3.0267994782402465, + "grad_norm": 0.9586876626137988, + "learning_rate": 7.368217234407923e-06, + "loss": 0.0348, + "step": 25525 + }, + { + "epoch": 3.0269180600023717, + "grad_norm": 0.6120987427317511, + "learning_rate": 7.366515609660885e-06, + "loss": 0.031, + "step": 25526 + }, + { + "epoch": 3.0270366417644965, + "grad_norm": 0.5589865555166775, + "learning_rate": 7.3648141474729535e-06, + "loss": 0.0316, + "step": 25527 + }, + { + "epoch": 3.0271552235266217, + "grad_norm": 0.5701176187389578, + "learning_rate": 7.3631128478597964e-06, + "loss": 0.0274, + "step": 25528 + }, + { + "epoch": 3.0272738052887465, + "grad_norm": 0.3921430539882623, + "learning_rate": 7.361411710837127e-06, + "loss": 0.0226, + "step": 25529 + }, + { + "epoch": 3.0273923870508717, + "grad_norm": 0.7863232964293633, + "learning_rate": 7.359710736420603e-06, + "loss": 0.0398, + "step": 25530 + }, + { + "epoch": 3.0275109688129964, + "grad_norm": 0.41143809000056625, + "learning_rate": 7.358009924625914e-06, + "loss": 0.0217, + "step": 25531 + }, + { + "epoch": 3.0276295505751216, + "grad_norm": 0.6986414696695976, + "learning_rate": 7.35630927546874e-06, + "loss": 0.0351, + "step": 25532 + }, + { + "epoch": 3.0277481323372464, + "grad_norm": 0.35532763353248, + "learning_rate": 7.354608788964756e-06, + "loss": 0.0155, + "step": 25533 + }, + { + "epoch": 3.0278667140993716, + "grad_norm": 0.6338524528490971, + "learning_rate": 7.352908465129651e-06, + "loss": 0.0333, + "step": 25534 + }, + { + "epoch": 3.0279852958614963, + "grad_norm": 0.5969539212927654, + "learning_rate": 7.351208303979082e-06, + "loss": 0.0324, + "step": 25535 + }, + { + "epoch": 3.0281038776236215, + "grad_norm": 0.7062939375641806, + "learning_rate": 7.3495083055287295e-06, + "loss": 0.0466, + "step": 25536 + }, + { + "epoch": 3.0282224593857463, + "grad_norm": 0.6272453158278343, + "learning_rate": 7.347808469794265e-06, + "loss": 0.0433, + "step": 25537 + }, + { + "epoch": 3.0283410411478715, + "grad_norm": 0.4463708884863511, + "learning_rate": 7.34610879679137e-06, + "loss": 0.0233, + "step": 25538 + }, + { + "epoch": 3.0284596229099963, + "grad_norm": 0.5352132734665388, + "learning_rate": 7.344409286535686e-06, + "loss": 0.0295, + "step": 25539 + }, + { + "epoch": 3.0285782046721215, + "grad_norm": 0.44037882877644996, + "learning_rate": 7.342709939042913e-06, + "loss": 0.0234, + "step": 25540 + }, + { + "epoch": 3.0286967864342462, + "grad_norm": 0.3895129966720545, + "learning_rate": 7.341010754328695e-06, + "loss": 0.0227, + "step": 25541 + }, + { + "epoch": 3.0288153681963714, + "grad_norm": 0.513295188817292, + "learning_rate": 7.339311732408702e-06, + "loss": 0.0251, + "step": 25542 + }, + { + "epoch": 3.028933949958496, + "grad_norm": 0.5039763841980448, + "learning_rate": 7.3376128732986e-06, + "loss": 0.0275, + "step": 25543 + }, + { + "epoch": 3.0290525317206214, + "grad_norm": 0.7012567311969867, + "learning_rate": 7.335914177014044e-06, + "loss": 0.0505, + "step": 25544 + }, + { + "epoch": 3.029171113482746, + "grad_norm": 0.2812554849044171, + "learning_rate": 7.334215643570711e-06, + "loss": 0.0181, + "step": 25545 + }, + { + "epoch": 3.0292896952448713, + "grad_norm": 0.3976174913121928, + "learning_rate": 7.332517272984235e-06, + "loss": 0.0185, + "step": 25546 + }, + { + "epoch": 3.029408277006996, + "grad_norm": 0.3518922581715751, + "learning_rate": 7.330819065270286e-06, + "loss": 0.019, + "step": 25547 + }, + { + "epoch": 3.0295268587691213, + "grad_norm": 0.6428847040917856, + "learning_rate": 7.3291210204445156e-06, + "loss": 0.037, + "step": 25548 + }, + { + "epoch": 3.0296454405312465, + "grad_norm": 0.37447333136607486, + "learning_rate": 7.327423138522591e-06, + "loss": 0.0198, + "step": 25549 + }, + { + "epoch": 3.0297640222933713, + "grad_norm": 0.7737848156820663, + "learning_rate": 7.3257254195201434e-06, + "loss": 0.0454, + "step": 25550 + }, + { + "epoch": 3.0298826040554965, + "grad_norm": 0.5464934604177705, + "learning_rate": 7.3240278634528345e-06, + "loss": 0.0229, + "step": 25551 + }, + { + "epoch": 3.0300011858176212, + "grad_norm": 0.4770666812463229, + "learning_rate": 7.3223304703363135e-06, + "loss": 0.0283, + "step": 25552 + }, + { + "epoch": 3.0301197675797464, + "grad_norm": 0.3675954774533955, + "learning_rate": 7.320633240186228e-06, + "loss": 0.0214, + "step": 25553 + }, + { + "epoch": 3.030238349341871, + "grad_norm": 0.4034747377498527, + "learning_rate": 7.318936173018223e-06, + "loss": 0.0192, + "step": 25554 + }, + { + "epoch": 3.0303569311039964, + "grad_norm": 0.4266273690376254, + "learning_rate": 7.317239268847945e-06, + "loss": 0.0234, + "step": 25555 + }, + { + "epoch": 3.030475512866121, + "grad_norm": 1.0435956703356837, + "learning_rate": 7.315542527691047e-06, + "loss": 0.0562, + "step": 25556 + }, + { + "epoch": 3.0305940946282464, + "grad_norm": 0.4538907691931695, + "learning_rate": 7.31384594956315e-06, + "loss": 0.0299, + "step": 25557 + }, + { + "epoch": 3.030712676390371, + "grad_norm": 0.35267055838517186, + "learning_rate": 7.312149534479906e-06, + "loss": 0.0181, + "step": 25558 + }, + { + "epoch": 3.0308312581524963, + "grad_norm": 0.4189697192965319, + "learning_rate": 7.3104532824569525e-06, + "loss": 0.0254, + "step": 25559 + }, + { + "epoch": 3.030949839914621, + "grad_norm": 0.678255412314605, + "learning_rate": 7.308757193509935e-06, + "loss": 0.03, + "step": 25560 + }, + { + "epoch": 3.0310684216767463, + "grad_norm": 0.48959123466019977, + "learning_rate": 7.3070612676544745e-06, + "loss": 0.0253, + "step": 25561 + }, + { + "epoch": 3.031187003438871, + "grad_norm": 0.39231716559149554, + "learning_rate": 7.305365504906214e-06, + "loss": 0.0209, + "step": 25562 + }, + { + "epoch": 3.0313055852009962, + "grad_norm": 0.5500229471722172, + "learning_rate": 7.3036699052807826e-06, + "loss": 0.0275, + "step": 25563 + }, + { + "epoch": 3.031424166963121, + "grad_norm": 0.5094475728346692, + "learning_rate": 7.301974468793815e-06, + "loss": 0.0243, + "step": 25564 + }, + { + "epoch": 3.031542748725246, + "grad_norm": 0.5467733286973911, + "learning_rate": 7.300279195460949e-06, + "loss": 0.0288, + "step": 25565 + }, + { + "epoch": 3.031661330487371, + "grad_norm": 0.46525147750966833, + "learning_rate": 7.298584085297785e-06, + "loss": 0.0304, + "step": 25566 + }, + { + "epoch": 3.031779912249496, + "grad_norm": 0.5371142021514349, + "learning_rate": 7.296889138319988e-06, + "loss": 0.0334, + "step": 25567 + }, + { + "epoch": 3.031898494011621, + "grad_norm": 0.36629876386139965, + "learning_rate": 7.295194354543155e-06, + "loss": 0.0176, + "step": 25568 + }, + { + "epoch": 3.032017075773746, + "grad_norm": 0.4126924088140684, + "learning_rate": 7.293499733982919e-06, + "loss": 0.0166, + "step": 25569 + }, + { + "epoch": 3.032135657535871, + "grad_norm": 0.5572112703695168, + "learning_rate": 7.291805276654903e-06, + "loss": 0.0415, + "step": 25570 + }, + { + "epoch": 3.032254239297996, + "grad_norm": 0.5998509453378863, + "learning_rate": 7.290110982574732e-06, + "loss": 0.0303, + "step": 25571 + }, + { + "epoch": 3.032372821060121, + "grad_norm": 0.41347250991397716, + "learning_rate": 7.288416851758017e-06, + "loss": 0.0233, + "step": 25572 + }, + { + "epoch": 3.032491402822246, + "grad_norm": 0.7657516581507618, + "learning_rate": 7.286722884220376e-06, + "loss": 0.0288, + "step": 25573 + }, + { + "epoch": 3.032609984584371, + "grad_norm": 0.6774773212085557, + "learning_rate": 7.285029079977432e-06, + "loss": 0.0432, + "step": 25574 + }, + { + "epoch": 3.032728566346496, + "grad_norm": 0.46274927790559184, + "learning_rate": 7.283335439044792e-06, + "loss": 0.0361, + "step": 25575 + }, + { + "epoch": 3.0328471481086208, + "grad_norm": 0.5144320941415619, + "learning_rate": 7.2816419614380845e-06, + "loss": 0.0359, + "step": 25576 + }, + { + "epoch": 3.032965729870746, + "grad_norm": 0.42129238380731465, + "learning_rate": 7.279948647172893e-06, + "loss": 0.0247, + "step": 25577 + }, + { + "epoch": 3.0330843116328707, + "grad_norm": 0.45349360367703695, + "learning_rate": 7.278255496264863e-06, + "loss": 0.0238, + "step": 25578 + }, + { + "epoch": 3.033202893394996, + "grad_norm": 0.4589787893878855, + "learning_rate": 7.276562508729576e-06, + "loss": 0.0273, + "step": 25579 + }, + { + "epoch": 3.0333214751571207, + "grad_norm": 0.5910752154588953, + "learning_rate": 7.2748696845826496e-06, + "loss": 0.0321, + "step": 25580 + }, + { + "epoch": 3.033440056919246, + "grad_norm": 0.2870225492228098, + "learning_rate": 7.273177023839686e-06, + "loss": 0.0191, + "step": 25581 + }, + { + "epoch": 3.0335586386813707, + "grad_norm": 0.5123756153332801, + "learning_rate": 7.271484526516295e-06, + "loss": 0.023, + "step": 25582 + }, + { + "epoch": 3.033677220443496, + "grad_norm": 0.5223266134086794, + "learning_rate": 7.269792192628083e-06, + "loss": 0.0278, + "step": 25583 + }, + { + "epoch": 3.0337958022056206, + "grad_norm": 0.5712989809555142, + "learning_rate": 7.26810002219063e-06, + "loss": 0.0363, + "step": 25584 + }, + { + "epoch": 3.033914383967746, + "grad_norm": 0.6214327098928413, + "learning_rate": 7.2664080152195666e-06, + "loss": 0.0308, + "step": 25585 + }, + { + "epoch": 3.0340329657298706, + "grad_norm": 0.6214361828820734, + "learning_rate": 7.264716171730468e-06, + "loss": 0.0252, + "step": 25586 + }, + { + "epoch": 3.034151547491996, + "grad_norm": 0.6398108586930592, + "learning_rate": 7.263024491738943e-06, + "loss": 0.0335, + "step": 25587 + }, + { + "epoch": 3.0342701292541205, + "grad_norm": 0.559035698901699, + "learning_rate": 7.261332975260568e-06, + "loss": 0.0289, + "step": 25588 + }, + { + "epoch": 3.0343887110162457, + "grad_norm": 0.27383549985859873, + "learning_rate": 7.259641622310964e-06, + "loss": 0.0139, + "step": 25589 + }, + { + "epoch": 3.0345072927783705, + "grad_norm": 0.9987368935432386, + "learning_rate": 7.257950432905702e-06, + "loss": 0.0452, + "step": 25590 + }, + { + "epoch": 3.0346258745404957, + "grad_norm": 0.6360451708718425, + "learning_rate": 7.256259407060384e-06, + "loss": 0.0404, + "step": 25591 + }, + { + "epoch": 3.0347444563026205, + "grad_norm": 0.611533712873175, + "learning_rate": 7.254568544790591e-06, + "loss": 0.0356, + "step": 25592 + }, + { + "epoch": 3.0348630380647457, + "grad_norm": 0.3582516646364771, + "learning_rate": 7.252877846111916e-06, + "loss": 0.017, + "step": 25593 + }, + { + "epoch": 3.0349816198268704, + "grad_norm": 0.5173962866184124, + "learning_rate": 7.251187311039953e-06, + "loss": 0.0262, + "step": 25594 + }, + { + "epoch": 3.0351002015889956, + "grad_norm": 0.6078580629437235, + "learning_rate": 7.249496939590264e-06, + "loss": 0.0337, + "step": 25595 + }, + { + "epoch": 3.0352187833511204, + "grad_norm": 1.1519081016826567, + "learning_rate": 7.247806731778459e-06, + "loss": 0.0565, + "step": 25596 + }, + { + "epoch": 3.0353373651132456, + "grad_norm": 0.2680548843384091, + "learning_rate": 7.2461166876201e-06, + "loss": 0.0144, + "step": 25597 + }, + { + "epoch": 3.035455946875371, + "grad_norm": 0.3589625551513458, + "learning_rate": 7.244426807130783e-06, + "loss": 0.0189, + "step": 25598 + }, + { + "epoch": 3.0355745286374956, + "grad_norm": 0.5832286055227365, + "learning_rate": 7.24273709032606e-06, + "loss": 0.0324, + "step": 25599 + }, + { + "epoch": 3.0356931103996208, + "grad_norm": 0.5794606940623375, + "learning_rate": 7.24104753722154e-06, + "loss": 0.0264, + "step": 25600 + }, + { + "epoch": 3.0358116921617455, + "grad_norm": 0.4667255672821096, + "learning_rate": 7.239358147832781e-06, + "loss": 0.0194, + "step": 25601 + }, + { + "epoch": 3.0359302739238707, + "grad_norm": 0.5811785598811245, + "learning_rate": 7.237668922175356e-06, + "loss": 0.0358, + "step": 25602 + }, + { + "epoch": 3.0360488556859955, + "grad_norm": 0.8183454791301821, + "learning_rate": 7.2359798602648435e-06, + "loss": 0.0463, + "step": 25603 + }, + { + "epoch": 3.0361674374481207, + "grad_norm": 0.5252439107218659, + "learning_rate": 7.234290962116813e-06, + "loss": 0.0256, + "step": 25604 + }, + { + "epoch": 3.0362860192102454, + "grad_norm": 0.538037475437534, + "learning_rate": 7.232602227746843e-06, + "loss": 0.0233, + "step": 25605 + }, + { + "epoch": 3.0364046009723706, + "grad_norm": 0.47562896272659233, + "learning_rate": 7.230913657170482e-06, + "loss": 0.0254, + "step": 25606 + }, + { + "epoch": 3.0365231827344954, + "grad_norm": 0.5927918563434063, + "learning_rate": 7.22922525040331e-06, + "loss": 0.0255, + "step": 25607 + }, + { + "epoch": 3.0366417644966206, + "grad_norm": 0.48024653702667325, + "learning_rate": 7.227537007460888e-06, + "loss": 0.0272, + "step": 25608 + }, + { + "epoch": 3.0367603462587454, + "grad_norm": 0.5013196079392649, + "learning_rate": 7.225848928358789e-06, + "loss": 0.0312, + "step": 25609 + }, + { + "epoch": 3.0368789280208706, + "grad_norm": 0.3844052038060899, + "learning_rate": 7.224161013112551e-06, + "loss": 0.0207, + "step": 25610 + }, + { + "epoch": 3.0369975097829953, + "grad_norm": 0.29359167870184716, + "learning_rate": 7.222473261737767e-06, + "loss": 0.0132, + "step": 25611 + }, + { + "epoch": 3.0371160915451205, + "grad_norm": 0.4355699414648827, + "learning_rate": 7.2207856742499695e-06, + "loss": 0.0257, + "step": 25612 + }, + { + "epoch": 3.0372346733072453, + "grad_norm": 0.628120709711594, + "learning_rate": 7.219098250664727e-06, + "loss": 0.0276, + "step": 25613 + }, + { + "epoch": 3.0373532550693705, + "grad_norm": 0.40491959263471494, + "learning_rate": 7.217410990997592e-06, + "loss": 0.0235, + "step": 25614 + }, + { + "epoch": 3.0374718368314952, + "grad_norm": 0.46840720876292546, + "learning_rate": 7.215723895264123e-06, + "loss": 0.0226, + "step": 25615 + }, + { + "epoch": 3.0375904185936204, + "grad_norm": 0.4422022466468966, + "learning_rate": 7.214036963479878e-06, + "loss": 0.021, + "step": 25616 + }, + { + "epoch": 3.037709000355745, + "grad_norm": 0.5858133075757084, + "learning_rate": 7.212350195660397e-06, + "loss": 0.0215, + "step": 25617 + }, + { + "epoch": 3.0378275821178704, + "grad_norm": 0.3569306643013583, + "learning_rate": 7.2106635918212315e-06, + "loss": 0.0194, + "step": 25618 + }, + { + "epoch": 3.037946163879995, + "grad_norm": 0.5904604738461098, + "learning_rate": 7.208977151977933e-06, + "loss": 0.0315, + "step": 25619 + }, + { + "epoch": 3.0380647456421204, + "grad_norm": 0.6580836070541726, + "learning_rate": 7.207290876146058e-06, + "loss": 0.0292, + "step": 25620 + }, + { + "epoch": 3.038183327404245, + "grad_norm": 0.5284492394292898, + "learning_rate": 7.205604764341134e-06, + "loss": 0.0243, + "step": 25621 + }, + { + "epoch": 3.0383019091663703, + "grad_norm": 0.47705860229871694, + "learning_rate": 7.203918816578712e-06, + "loss": 0.0296, + "step": 25622 + }, + { + "epoch": 3.038420490928495, + "grad_norm": 0.7113063512925267, + "learning_rate": 7.202233032874337e-06, + "loss": 0.0369, + "step": 25623 + }, + { + "epoch": 3.0385390726906203, + "grad_norm": 0.4558707316789305, + "learning_rate": 7.200547413243547e-06, + "loss": 0.0235, + "step": 25624 + }, + { + "epoch": 3.038657654452745, + "grad_norm": 0.7429285270477268, + "learning_rate": 7.198861957701883e-06, + "loss": 0.0466, + "step": 25625 + }, + { + "epoch": 3.0387762362148703, + "grad_norm": 0.38822655752650936, + "learning_rate": 7.197176666264882e-06, + "loss": 0.0227, + "step": 25626 + }, + { + "epoch": 3.038894817976995, + "grad_norm": 0.7322494642077816, + "learning_rate": 7.19549153894809e-06, + "loss": 0.0266, + "step": 25627 + }, + { + "epoch": 3.03901339973912, + "grad_norm": 0.6333058803843093, + "learning_rate": 7.193806575767023e-06, + "loss": 0.0441, + "step": 25628 + }, + { + "epoch": 3.039131981501245, + "grad_norm": 0.5950155366955171, + "learning_rate": 7.1921217767372265e-06, + "loss": 0.034, + "step": 25629 + }, + { + "epoch": 3.03925056326337, + "grad_norm": 0.6807040934951091, + "learning_rate": 7.190437141874229e-06, + "loss": 0.0381, + "step": 25630 + }, + { + "epoch": 3.039369145025495, + "grad_norm": 0.48061921916731054, + "learning_rate": 7.18875267119356e-06, + "loss": 0.0261, + "step": 25631 + }, + { + "epoch": 3.03948772678762, + "grad_norm": 0.5642687452334069, + "learning_rate": 7.187068364710758e-06, + "loss": 0.0233, + "step": 25632 + }, + { + "epoch": 3.039606308549745, + "grad_norm": 0.33438174643318763, + "learning_rate": 7.185384222441327e-06, + "loss": 0.0228, + "step": 25633 + }, + { + "epoch": 3.03972489031187, + "grad_norm": 0.7844095089304419, + "learning_rate": 7.183700244400824e-06, + "loss": 0.044, + "step": 25634 + }, + { + "epoch": 3.039843472073995, + "grad_norm": 0.9068355884035788, + "learning_rate": 7.182016430604749e-06, + "loss": 0.0389, + "step": 25635 + }, + { + "epoch": 3.03996205383612, + "grad_norm": 1.0305473409151047, + "learning_rate": 7.180332781068638e-06, + "loss": 0.0415, + "step": 25636 + }, + { + "epoch": 3.040080635598245, + "grad_norm": 0.4780893321237964, + "learning_rate": 7.178649295807993e-06, + "loss": 0.0353, + "step": 25637 + }, + { + "epoch": 3.04019921736037, + "grad_norm": 0.6278386441443284, + "learning_rate": 7.176965974838365e-06, + "loss": 0.0206, + "step": 25638 + }, + { + "epoch": 3.040317799122495, + "grad_norm": 0.33118724491277135, + "learning_rate": 7.175282818175244e-06, + "loss": 0.0169, + "step": 25639 + }, + { + "epoch": 3.04043638088462, + "grad_norm": 0.7962534475520948, + "learning_rate": 7.17359982583416e-06, + "loss": 0.0301, + "step": 25640 + }, + { + "epoch": 3.0405549626467447, + "grad_norm": 0.5965613816626361, + "learning_rate": 7.171916997830621e-06, + "loss": 0.0267, + "step": 25641 + }, + { + "epoch": 3.04067354440887, + "grad_norm": 0.47351196431390985, + "learning_rate": 7.170234334180151e-06, + "loss": 0.0266, + "step": 25642 + }, + { + "epoch": 3.0407921261709947, + "grad_norm": 0.7394377050117784, + "learning_rate": 7.16855183489826e-06, + "loss": 0.0343, + "step": 25643 + }, + { + "epoch": 3.04091070793312, + "grad_norm": 0.3860249704907154, + "learning_rate": 7.1668695000004394e-06, + "loss": 0.0212, + "step": 25644 + }, + { + "epoch": 3.0410292896952447, + "grad_norm": 0.6495034400487649, + "learning_rate": 7.165187329502229e-06, + "loss": 0.0405, + "step": 25645 + }, + { + "epoch": 3.04114787145737, + "grad_norm": 0.4841623476857207, + "learning_rate": 7.1635053234191145e-06, + "loss": 0.0224, + "step": 25646 + }, + { + "epoch": 3.0412664532194946, + "grad_norm": 0.4468764228883938, + "learning_rate": 7.161823481766616e-06, + "loss": 0.0261, + "step": 25647 + }, + { + "epoch": 3.04138503498162, + "grad_norm": 0.42925567022030775, + "learning_rate": 7.160141804560214e-06, + "loss": 0.0217, + "step": 25648 + }, + { + "epoch": 3.0415036167437446, + "grad_norm": 0.41470415343212974, + "learning_rate": 7.158460291815444e-06, + "loss": 0.0243, + "step": 25649 + }, + { + "epoch": 3.04162219850587, + "grad_norm": 0.38713412142802606, + "learning_rate": 7.156778943547784e-06, + "loss": 0.0182, + "step": 25650 + }, + { + "epoch": 3.041740780267995, + "grad_norm": 0.4204874935196305, + "learning_rate": 7.155097759772742e-06, + "loss": 0.0237, + "step": 25651 + }, + { + "epoch": 3.0418593620301198, + "grad_norm": 0.494062708097531, + "learning_rate": 7.1534167405058135e-06, + "loss": 0.0177, + "step": 25652 + }, + { + "epoch": 3.041977943792245, + "grad_norm": 0.5859597347144514, + "learning_rate": 7.1517358857624975e-06, + "loss": 0.0313, + "step": 25653 + }, + { + "epoch": 3.0420965255543697, + "grad_norm": 0.4789678932838485, + "learning_rate": 7.1500551955583e-06, + "loss": 0.0212, + "step": 25654 + }, + { + "epoch": 3.042215107316495, + "grad_norm": 0.5971098139122184, + "learning_rate": 7.148374669908686e-06, + "loss": 0.035, + "step": 25655 + }, + { + "epoch": 3.0423336890786197, + "grad_norm": 0.28853330880716554, + "learning_rate": 7.146694308829186e-06, + "loss": 0.0138, + "step": 25656 + }, + { + "epoch": 3.042452270840745, + "grad_norm": 0.4588879212859405, + "learning_rate": 7.1450141123352604e-06, + "loss": 0.0297, + "step": 25657 + }, + { + "epoch": 3.0425708526028696, + "grad_norm": 0.6459429196558409, + "learning_rate": 7.143334080442418e-06, + "loss": 0.0316, + "step": 25658 + }, + { + "epoch": 3.042689434364995, + "grad_norm": 0.8413861988211172, + "learning_rate": 7.141654213166121e-06, + "loss": 0.055, + "step": 25659 + }, + { + "epoch": 3.0428080161271196, + "grad_norm": 0.5371077179959387, + "learning_rate": 7.13997451052189e-06, + "loss": 0.0294, + "step": 25660 + }, + { + "epoch": 3.042926597889245, + "grad_norm": 0.4895304295407183, + "learning_rate": 7.1382949725251866e-06, + "loss": 0.0334, + "step": 25661 + }, + { + "epoch": 3.0430451796513696, + "grad_norm": 0.3788876305901472, + "learning_rate": 7.1366155991914954e-06, + "loss": 0.0187, + "step": 25662 + }, + { + "epoch": 3.0431637614134948, + "grad_norm": 0.9246530848047887, + "learning_rate": 7.134936390536307e-06, + "loss": 0.0438, + "step": 25663 + }, + { + "epoch": 3.0432823431756195, + "grad_norm": 0.6251586299501516, + "learning_rate": 7.133257346575095e-06, + "loss": 0.0428, + "step": 25664 + }, + { + "epoch": 3.0434009249377447, + "grad_norm": 0.34853761414836393, + "learning_rate": 7.131578467323352e-06, + "loss": 0.0169, + "step": 25665 + }, + { + "epoch": 3.0435195066998695, + "grad_norm": 0.6259589930523619, + "learning_rate": 7.129899752796532e-06, + "loss": 0.0304, + "step": 25666 + }, + { + "epoch": 3.0436380884619947, + "grad_norm": 0.3609650028266282, + "learning_rate": 7.128221203010124e-06, + "loss": 0.0164, + "step": 25667 + }, + { + "epoch": 3.0437566702241194, + "grad_norm": 0.761655446487549, + "learning_rate": 7.1265428179796005e-06, + "loss": 0.0419, + "step": 25668 + }, + { + "epoch": 3.0438752519862446, + "grad_norm": 0.41876560421631387, + "learning_rate": 7.124864597720443e-06, + "loss": 0.0183, + "step": 25669 + }, + { + "epoch": 3.0439938337483694, + "grad_norm": 0.45494253828785697, + "learning_rate": 7.123186542248097e-06, + "loss": 0.0211, + "step": 25670 + }, + { + "epoch": 3.0441124155104946, + "grad_norm": 0.5332898704893814, + "learning_rate": 7.121508651578066e-06, + "loss": 0.0274, + "step": 25671 + }, + { + "epoch": 3.0442309972726194, + "grad_norm": 0.7185109375180969, + "learning_rate": 7.119830925725793e-06, + "loss": 0.0336, + "step": 25672 + }, + { + "epoch": 3.0443495790347446, + "grad_norm": 0.7519227798063824, + "learning_rate": 7.118153364706753e-06, + "loss": 0.0434, + "step": 25673 + }, + { + "epoch": 3.0444681607968693, + "grad_norm": 0.5716813863219283, + "learning_rate": 7.1164759685364095e-06, + "loss": 0.032, + "step": 25674 + }, + { + "epoch": 3.0445867425589945, + "grad_norm": 0.5518543688276104, + "learning_rate": 7.11479873723023e-06, + "loss": 0.0305, + "step": 25675 + }, + { + "epoch": 3.0447053243211193, + "grad_norm": 0.4662502434311144, + "learning_rate": 7.11312167080368e-06, + "loss": 0.0246, + "step": 25676 + }, + { + "epoch": 3.0448239060832445, + "grad_norm": 0.3986982511856967, + "learning_rate": 7.1114447692722045e-06, + "loss": 0.0202, + "step": 25677 + }, + { + "epoch": 3.0449424878453693, + "grad_norm": 0.5059119381185545, + "learning_rate": 7.109768032651274e-06, + "loss": 0.0226, + "step": 25678 + }, + { + "epoch": 3.0450610696074945, + "grad_norm": 0.31519470986580544, + "learning_rate": 7.108091460956342e-06, + "loss": 0.0194, + "step": 25679 + }, + { + "epoch": 3.045179651369619, + "grad_norm": 0.46108403407734533, + "learning_rate": 7.1064150542028754e-06, + "loss": 0.0207, + "step": 25680 + }, + { + "epoch": 3.0452982331317444, + "grad_norm": 0.6638182047951712, + "learning_rate": 7.104738812406303e-06, + "loss": 0.0337, + "step": 25681 + }, + { + "epoch": 3.045416814893869, + "grad_norm": 0.4737633590561371, + "learning_rate": 7.103062735582111e-06, + "loss": 0.0268, + "step": 25682 + }, + { + "epoch": 3.0455353966559944, + "grad_norm": 0.6502132978790345, + "learning_rate": 7.101386823745726e-06, + "loss": 0.0425, + "step": 25683 + }, + { + "epoch": 3.045653978418119, + "grad_norm": 0.7390798809762883, + "learning_rate": 7.099711076912604e-06, + "loss": 0.0479, + "step": 25684 + }, + { + "epoch": 3.0457725601802443, + "grad_norm": 0.5982526814922612, + "learning_rate": 7.098035495098196e-06, + "loss": 0.0322, + "step": 25685 + }, + { + "epoch": 3.045891141942369, + "grad_norm": 0.5172581241521865, + "learning_rate": 7.096360078317949e-06, + "loss": 0.023, + "step": 25686 + }, + { + "epoch": 3.0460097237044943, + "grad_norm": 0.8477402082824489, + "learning_rate": 7.094684826587314e-06, + "loss": 0.052, + "step": 25687 + }, + { + "epoch": 3.046128305466619, + "grad_norm": 0.6491577025051294, + "learning_rate": 7.093009739921721e-06, + "loss": 0.0401, + "step": 25688 + }, + { + "epoch": 3.0462468872287443, + "grad_norm": 0.40525738123351823, + "learning_rate": 7.091334818336618e-06, + "loss": 0.0183, + "step": 25689 + }, + { + "epoch": 3.046365468990869, + "grad_norm": 0.8061535335764177, + "learning_rate": 7.089660061847447e-06, + "loss": 0.0404, + "step": 25690 + }, + { + "epoch": 3.046484050752994, + "grad_norm": 0.47579761689992567, + "learning_rate": 7.087985470469649e-06, + "loss": 0.0203, + "step": 25691 + }, + { + "epoch": 3.046602632515119, + "grad_norm": 0.6117218637725415, + "learning_rate": 7.086311044218666e-06, + "loss": 0.0325, + "step": 25692 + }, + { + "epoch": 3.046721214277244, + "grad_norm": 0.3542142086518, + "learning_rate": 7.08463678310991e-06, + "loss": 0.0162, + "step": 25693 + }, + { + "epoch": 3.046839796039369, + "grad_norm": 0.5798577308061234, + "learning_rate": 7.082962687158853e-06, + "loss": 0.0311, + "step": 25694 + }, + { + "epoch": 3.046958377801494, + "grad_norm": 0.49872348877988787, + "learning_rate": 7.0812887563808994e-06, + "loss": 0.0271, + "step": 25695 + }, + { + "epoch": 3.047076959563619, + "grad_norm": 0.3739921578097023, + "learning_rate": 7.079614990791492e-06, + "loss": 0.0175, + "step": 25696 + }, + { + "epoch": 3.047195541325744, + "grad_norm": 0.4226222232699488, + "learning_rate": 7.077941390406057e-06, + "loss": 0.0206, + "step": 25697 + }, + { + "epoch": 3.047314123087869, + "grad_norm": 0.6009363403687604, + "learning_rate": 7.076267955240032e-06, + "loss": 0.0442, + "step": 25698 + }, + { + "epoch": 3.047432704849994, + "grad_norm": 0.5989263077232984, + "learning_rate": 7.07459468530883e-06, + "loss": 0.0275, + "step": 25699 + }, + { + "epoch": 3.0475512866121193, + "grad_norm": 0.521859375519805, + "learning_rate": 7.072921580627884e-06, + "loss": 0.0288, + "step": 25700 + }, + { + "epoch": 3.047669868374244, + "grad_norm": 0.8005361329865595, + "learning_rate": 7.071248641212616e-06, + "loss": 0.0482, + "step": 25701 + }, + { + "epoch": 3.0477884501363692, + "grad_norm": 0.42749046281241404, + "learning_rate": 7.069575867078451e-06, + "loss": 0.0291, + "step": 25702 + }, + { + "epoch": 3.047907031898494, + "grad_norm": 0.3656719829091871, + "learning_rate": 7.067903258240815e-06, + "loss": 0.0196, + "step": 25703 + }, + { + "epoch": 3.048025613660619, + "grad_norm": 0.5069086389646187, + "learning_rate": 7.066230814715108e-06, + "loss": 0.026, + "step": 25704 + }, + { + "epoch": 3.048144195422744, + "grad_norm": 0.4673833707724831, + "learning_rate": 7.064558536516774e-06, + "loss": 0.0238, + "step": 25705 + }, + { + "epoch": 3.048262777184869, + "grad_norm": 0.3814666583443058, + "learning_rate": 7.062886423661211e-06, + "loss": 0.0224, + "step": 25706 + }, + { + "epoch": 3.048381358946994, + "grad_norm": 0.6554430138251789, + "learning_rate": 7.0612144761638465e-06, + "loss": 0.0347, + "step": 25707 + }, + { + "epoch": 3.048499940709119, + "grad_norm": 0.43196383761449986, + "learning_rate": 7.05954269404007e-06, + "loss": 0.0282, + "step": 25708 + }, + { + "epoch": 3.048618522471244, + "grad_norm": 0.428741652526595, + "learning_rate": 7.0578710773053255e-06, + "loss": 0.0285, + "step": 25709 + }, + { + "epoch": 3.048737104233369, + "grad_norm": 0.4870265560148238, + "learning_rate": 7.056199625974999e-06, + "loss": 0.024, + "step": 25710 + }, + { + "epoch": 3.048855685995494, + "grad_norm": 0.43395603041345515, + "learning_rate": 7.054528340064512e-06, + "loss": 0.024, + "step": 25711 + }, + { + "epoch": 3.048974267757619, + "grad_norm": 0.7441802764430697, + "learning_rate": 7.052857219589262e-06, + "loss": 0.031, + "step": 25712 + }, + { + "epoch": 3.049092849519744, + "grad_norm": 0.40220964181567764, + "learning_rate": 7.051186264564663e-06, + "loss": 0.0206, + "step": 25713 + }, + { + "epoch": 3.049211431281869, + "grad_norm": 0.5984120999457562, + "learning_rate": 7.049515475006124e-06, + "loss": 0.0467, + "step": 25714 + }, + { + "epoch": 3.0493300130439938, + "grad_norm": 0.4316491936542684, + "learning_rate": 7.047844850929025e-06, + "loss": 0.0301, + "step": 25715 + }, + { + "epoch": 3.049448594806119, + "grad_norm": 0.6744264712630901, + "learning_rate": 7.046174392348798e-06, + "loss": 0.0376, + "step": 25716 + }, + { + "epoch": 3.0495671765682437, + "grad_norm": 0.3040634034093401, + "learning_rate": 7.044504099280816e-06, + "loss": 0.0177, + "step": 25717 + }, + { + "epoch": 3.049685758330369, + "grad_norm": 0.5858214965749343, + "learning_rate": 7.042833971740498e-06, + "loss": 0.0342, + "step": 25718 + }, + { + "epoch": 3.0498043400924937, + "grad_norm": 0.5895239442776448, + "learning_rate": 7.0411640097432145e-06, + "loss": 0.0221, + "step": 25719 + }, + { + "epoch": 3.049922921854619, + "grad_norm": 0.4729320149091077, + "learning_rate": 7.03949421330439e-06, + "loss": 0.0226, + "step": 25720 + }, + { + "epoch": 3.0500415036167436, + "grad_norm": 0.7000018816006867, + "learning_rate": 7.037824582439398e-06, + "loss": 0.0238, + "step": 25721 + }, + { + "epoch": 3.050160085378869, + "grad_norm": 0.8953740458699684, + "learning_rate": 7.036155117163637e-06, + "loss": 0.0523, + "step": 25722 + }, + { + "epoch": 3.0502786671409936, + "grad_norm": 0.4719396906997645, + "learning_rate": 7.034485817492498e-06, + "loss": 0.0329, + "step": 25723 + }, + { + "epoch": 3.050397248903119, + "grad_norm": 0.7087747650690799, + "learning_rate": 7.032816683441365e-06, + "loss": 0.0359, + "step": 25724 + }, + { + "epoch": 3.0505158306652436, + "grad_norm": 0.3070650430381829, + "learning_rate": 7.03114771502564e-06, + "loss": 0.0137, + "step": 25725 + }, + { + "epoch": 3.0506344124273688, + "grad_norm": 0.6384980543805574, + "learning_rate": 7.0294789122606845e-06, + "loss": 0.0335, + "step": 25726 + }, + { + "epoch": 3.0507529941894935, + "grad_norm": 0.46817391892082816, + "learning_rate": 7.027810275161909e-06, + "loss": 0.0279, + "step": 25727 + }, + { + "epoch": 3.0508715759516187, + "grad_norm": 0.3960503637737531, + "learning_rate": 7.026141803744676e-06, + "loss": 0.0217, + "step": 25728 + }, + { + "epoch": 3.0509901577137435, + "grad_norm": 0.6453767347375313, + "learning_rate": 7.024473498024384e-06, + "loss": 0.0434, + "step": 25729 + }, + { + "epoch": 3.0511087394758687, + "grad_norm": 0.4620642846709467, + "learning_rate": 7.022805358016388e-06, + "loss": 0.0202, + "step": 25730 + }, + { + "epoch": 3.0512273212379935, + "grad_norm": 0.6660174823081271, + "learning_rate": 7.0211373837360955e-06, + "loss": 0.0403, + "step": 25731 + }, + { + "epoch": 3.0513459030001187, + "grad_norm": 0.3305803551715751, + "learning_rate": 7.019469575198862e-06, + "loss": 0.0207, + "step": 25732 + }, + { + "epoch": 3.0514644847622434, + "grad_norm": 0.4749702304505118, + "learning_rate": 7.017801932420068e-06, + "loss": 0.0189, + "step": 25733 + }, + { + "epoch": 3.0515830665243686, + "grad_norm": 0.679584337044462, + "learning_rate": 7.016134455415091e-06, + "loss": 0.0284, + "step": 25734 + }, + { + "epoch": 3.0517016482864934, + "grad_norm": 0.5547645119487171, + "learning_rate": 7.014467144199302e-06, + "loss": 0.0251, + "step": 25735 + }, + { + "epoch": 3.0518202300486186, + "grad_norm": 0.5072465378202821, + "learning_rate": 7.012799998788078e-06, + "loss": 0.0204, + "step": 25736 + }, + { + "epoch": 3.0519388118107433, + "grad_norm": 0.782930441826833, + "learning_rate": 7.011133019196769e-06, + "loss": 0.0399, + "step": 25737 + }, + { + "epoch": 3.0520573935728685, + "grad_norm": 0.5044744566223648, + "learning_rate": 7.009466205440759e-06, + "loss": 0.0253, + "step": 25738 + }, + { + "epoch": 3.0521759753349933, + "grad_norm": 1.0103959765221462, + "learning_rate": 7.007799557535408e-06, + "loss": 0.055, + "step": 25739 + }, + { + "epoch": 3.0522945570971185, + "grad_norm": 0.4586430689230486, + "learning_rate": 7.00613307549608e-06, + "loss": 0.0231, + "step": 25740 + }, + { + "epoch": 3.0524131388592433, + "grad_norm": 0.5228517753026451, + "learning_rate": 7.0044667593381405e-06, + "loss": 0.0192, + "step": 25741 + }, + { + "epoch": 3.0525317206213685, + "grad_norm": 0.7544336612109552, + "learning_rate": 7.002800609076951e-06, + "loss": 0.043, + "step": 25742 + }, + { + "epoch": 3.052650302383493, + "grad_norm": 0.7253538468794315, + "learning_rate": 7.001134624727876e-06, + "loss": 0.022, + "step": 25743 + }, + { + "epoch": 3.0527688841456184, + "grad_norm": 0.42820433679385855, + "learning_rate": 6.999468806306261e-06, + "loss": 0.0169, + "step": 25744 + }, + { + "epoch": 3.052887465907743, + "grad_norm": 0.68101123444321, + "learning_rate": 6.99780315382747e-06, + "loss": 0.0362, + "step": 25745 + }, + { + "epoch": 3.0530060476698684, + "grad_norm": 0.5631980186546279, + "learning_rate": 6.99613766730686e-06, + "loss": 0.0275, + "step": 25746 + }, + { + "epoch": 3.053124629431993, + "grad_norm": 0.33209260441437727, + "learning_rate": 6.994472346759787e-06, + "loss": 0.0148, + "step": 25747 + }, + { + "epoch": 3.0532432111941183, + "grad_norm": 0.6085522096962065, + "learning_rate": 6.992807192201595e-06, + "loss": 0.046, + "step": 25748 + }, + { + "epoch": 3.0533617929562435, + "grad_norm": 0.4694148274477506, + "learning_rate": 6.991142203647635e-06, + "loss": 0.0305, + "step": 25749 + }, + { + "epoch": 3.0534803747183683, + "grad_norm": 0.43401706184643485, + "learning_rate": 6.989477381113263e-06, + "loss": 0.0222, + "step": 25750 + }, + { + "epoch": 3.0535989564804935, + "grad_norm": 0.5959435707723132, + "learning_rate": 6.987812724613824e-06, + "loss": 0.0302, + "step": 25751 + }, + { + "epoch": 3.0537175382426183, + "grad_norm": 0.7734547484918611, + "learning_rate": 6.986148234164669e-06, + "loss": 0.0377, + "step": 25752 + }, + { + "epoch": 3.0538361200047435, + "grad_norm": 0.5260675031197611, + "learning_rate": 6.9844839097811225e-06, + "loss": 0.0288, + "step": 25753 + }, + { + "epoch": 3.0539547017668682, + "grad_norm": 0.7853168920531363, + "learning_rate": 6.982819751478559e-06, + "loss": 0.0276, + "step": 25754 + }, + { + "epoch": 3.0540732835289934, + "grad_norm": 0.5514933068128056, + "learning_rate": 6.981155759272293e-06, + "loss": 0.0269, + "step": 25755 + }, + { + "epoch": 3.054191865291118, + "grad_norm": 0.3780481239459865, + "learning_rate": 6.979491933177676e-06, + "loss": 0.0226, + "step": 25756 + }, + { + "epoch": 3.0543104470532434, + "grad_norm": 1.0214054934816537, + "learning_rate": 6.977828273210046e-06, + "loss": 0.0305, + "step": 25757 + }, + { + "epoch": 3.054429028815368, + "grad_norm": 0.4436511779882615, + "learning_rate": 6.976164779384747e-06, + "loss": 0.0267, + "step": 25758 + }, + { + "epoch": 3.0545476105774934, + "grad_norm": 0.38840819588017933, + "learning_rate": 6.974501451717097e-06, + "loss": 0.0237, + "step": 25759 + }, + { + "epoch": 3.054666192339618, + "grad_norm": 0.5090109943146182, + "learning_rate": 6.972838290222441e-06, + "loss": 0.0309, + "step": 25760 + }, + { + "epoch": 3.0547847741017433, + "grad_norm": 0.7082551377553794, + "learning_rate": 6.97117529491611e-06, + "loss": 0.0378, + "step": 25761 + }, + { + "epoch": 3.054903355863868, + "grad_norm": 0.6171946508172774, + "learning_rate": 6.9695124658134326e-06, + "loss": 0.0338, + "step": 25762 + }, + { + "epoch": 3.0550219376259933, + "grad_norm": 0.544922771491039, + "learning_rate": 6.967849802929749e-06, + "loss": 0.0293, + "step": 25763 + }, + { + "epoch": 3.055140519388118, + "grad_norm": 0.5255244655327902, + "learning_rate": 6.966187306280361e-06, + "loss": 0.0313, + "step": 25764 + }, + { + "epoch": 3.0552591011502432, + "grad_norm": 0.6548627026355068, + "learning_rate": 6.964524975880627e-06, + "loss": 0.031, + "step": 25765 + }, + { + "epoch": 3.055377682912368, + "grad_norm": 0.4451188408935132, + "learning_rate": 6.962862811745849e-06, + "loss": 0.0239, + "step": 25766 + }, + { + "epoch": 3.055496264674493, + "grad_norm": 0.4355291471565756, + "learning_rate": 6.961200813891358e-06, + "loss": 0.024, + "step": 25767 + }, + { + "epoch": 3.055614846436618, + "grad_norm": 0.466256907231154, + "learning_rate": 6.9595389823324725e-06, + "loss": 0.0301, + "step": 25768 + }, + { + "epoch": 3.055733428198743, + "grad_norm": 0.38401684296510374, + "learning_rate": 6.957877317084524e-06, + "loss": 0.0189, + "step": 25769 + }, + { + "epoch": 3.055852009960868, + "grad_norm": 0.5919805023892334, + "learning_rate": 6.956215818162814e-06, + "loss": 0.0235, + "step": 25770 + }, + { + "epoch": 3.055970591722993, + "grad_norm": 0.37052701137189514, + "learning_rate": 6.954554485582671e-06, + "loss": 0.0215, + "step": 25771 + }, + { + "epoch": 3.056089173485118, + "grad_norm": 0.45827125056667717, + "learning_rate": 6.952893319359402e-06, + "loss": 0.0221, + "step": 25772 + }, + { + "epoch": 3.056207755247243, + "grad_norm": 0.49035394111056885, + "learning_rate": 6.951232319508327e-06, + "loss": 0.0207, + "step": 25773 + }, + { + "epoch": 3.056326337009368, + "grad_norm": 0.719291418845679, + "learning_rate": 6.9495714860447645e-06, + "loss": 0.0339, + "step": 25774 + }, + { + "epoch": 3.056444918771493, + "grad_norm": 0.568831438226503, + "learning_rate": 6.947910818984005e-06, + "loss": 0.0415, + "step": 25775 + }, + { + "epoch": 3.056563500533618, + "grad_norm": 0.5568137102820796, + "learning_rate": 6.946250318341385e-06, + "loss": 0.0278, + "step": 25776 + }, + { + "epoch": 3.056682082295743, + "grad_norm": 0.35188762122798484, + "learning_rate": 6.9445899841321884e-06, + "loss": 0.016, + "step": 25777 + }, + { + "epoch": 3.0568006640578678, + "grad_norm": 0.34386535001292284, + "learning_rate": 6.9429298163717405e-06, + "loss": 0.015, + "step": 25778 + }, + { + "epoch": 3.056919245819993, + "grad_norm": 0.5714147326626452, + "learning_rate": 6.941269815075322e-06, + "loss": 0.0301, + "step": 25779 + }, + { + "epoch": 3.0570378275821177, + "grad_norm": 0.5411813796549171, + "learning_rate": 6.939609980258266e-06, + "loss": 0.0272, + "step": 25780 + }, + { + "epoch": 3.057156409344243, + "grad_norm": 0.607024756412698, + "learning_rate": 6.9379503119358505e-06, + "loss": 0.0266, + "step": 25781 + }, + { + "epoch": 3.0572749911063677, + "grad_norm": 0.4330951119300781, + "learning_rate": 6.936290810123383e-06, + "loss": 0.0253, + "step": 25782 + }, + { + "epoch": 3.057393572868493, + "grad_norm": 0.5591926917613065, + "learning_rate": 6.934631474836165e-06, + "loss": 0.0259, + "step": 25783 + }, + { + "epoch": 3.0575121546306177, + "grad_norm": 0.5674151046967072, + "learning_rate": 6.932972306089491e-06, + "loss": 0.0213, + "step": 25784 + }, + { + "epoch": 3.057630736392743, + "grad_norm": 0.6008829734802543, + "learning_rate": 6.931313303898662e-06, + "loss": 0.0214, + "step": 25785 + }, + { + "epoch": 3.0577493181548676, + "grad_norm": 0.5691489084671257, + "learning_rate": 6.929654468278956e-06, + "loss": 0.0271, + "step": 25786 + }, + { + "epoch": 3.057867899916993, + "grad_norm": 0.45693182015021594, + "learning_rate": 6.9279957992456905e-06, + "loss": 0.0305, + "step": 25787 + }, + { + "epoch": 3.0579864816791176, + "grad_norm": 0.5430877288018532, + "learning_rate": 6.926337296814134e-06, + "loss": 0.0296, + "step": 25788 + }, + { + "epoch": 3.0581050634412428, + "grad_norm": 0.6787097291722429, + "learning_rate": 6.9246789609995834e-06, + "loss": 0.0451, + "step": 25789 + }, + { + "epoch": 3.0582236452033675, + "grad_norm": 0.37606153381369456, + "learning_rate": 6.923020791817328e-06, + "loss": 0.0204, + "step": 25790 + }, + { + "epoch": 3.0583422269654927, + "grad_norm": 0.8022909081912585, + "learning_rate": 6.921362789282654e-06, + "loss": 0.0461, + "step": 25791 + }, + { + "epoch": 3.0584608087276175, + "grad_norm": 0.47664877278163165, + "learning_rate": 6.919704953410852e-06, + "loss": 0.0249, + "step": 25792 + }, + { + "epoch": 3.0585793904897427, + "grad_norm": 0.4774977809235436, + "learning_rate": 6.918047284217194e-06, + "loss": 0.0264, + "step": 25793 + }, + { + "epoch": 3.0586979722518675, + "grad_norm": 0.32735948785500757, + "learning_rate": 6.916389781716964e-06, + "loss": 0.0159, + "step": 25794 + }, + { + "epoch": 3.0588165540139927, + "grad_norm": 0.7060606687930043, + "learning_rate": 6.914732445925445e-06, + "loss": 0.0444, + "step": 25795 + }, + { + "epoch": 3.0589351357761174, + "grad_norm": 0.4268015467240367, + "learning_rate": 6.913075276857922e-06, + "loss": 0.0209, + "step": 25796 + }, + { + "epoch": 3.0590537175382426, + "grad_norm": 0.6203091626563271, + "learning_rate": 6.911418274529652e-06, + "loss": 0.0317, + "step": 25797 + }, + { + "epoch": 3.059172299300368, + "grad_norm": 0.4838970179708307, + "learning_rate": 6.909761438955939e-06, + "loss": 0.0246, + "step": 25798 + }, + { + "epoch": 3.0592908810624926, + "grad_norm": 0.7978607484712558, + "learning_rate": 6.908104770152032e-06, + "loss": 0.0382, + "step": 25799 + }, + { + "epoch": 3.059409462824618, + "grad_norm": 0.5507959003195486, + "learning_rate": 6.906448268133214e-06, + "loss": 0.0246, + "step": 25800 + }, + { + "epoch": 3.0595280445867425, + "grad_norm": 0.5857655908451135, + "learning_rate": 6.904791932914759e-06, + "loss": 0.0252, + "step": 25801 + }, + { + "epoch": 3.0596466263488677, + "grad_norm": 1.1184656339710428, + "learning_rate": 6.903135764511928e-06, + "loss": 0.0517, + "step": 25802 + }, + { + "epoch": 3.0597652081109925, + "grad_norm": 0.4865340630688182, + "learning_rate": 6.901479762940002e-06, + "loss": 0.0235, + "step": 25803 + }, + { + "epoch": 3.0598837898731177, + "grad_norm": 0.6597280550054939, + "learning_rate": 6.899823928214233e-06, + "loss": 0.0337, + "step": 25804 + }, + { + "epoch": 3.0600023716352425, + "grad_norm": 0.4900448895763137, + "learning_rate": 6.898168260349888e-06, + "loss": 0.0279, + "step": 25805 + }, + { + "epoch": 3.0601209533973677, + "grad_norm": 0.3485483320855055, + "learning_rate": 6.896512759362236e-06, + "loss": 0.0182, + "step": 25806 + }, + { + "epoch": 3.0602395351594924, + "grad_norm": 0.5881307469099415, + "learning_rate": 6.894857425266543e-06, + "loss": 0.0384, + "step": 25807 + }, + { + "epoch": 3.0603581169216176, + "grad_norm": 0.40650897000557606, + "learning_rate": 6.893202258078057e-06, + "loss": 0.0215, + "step": 25808 + }, + { + "epoch": 3.0604766986837424, + "grad_norm": 0.36216889734036656, + "learning_rate": 6.891547257812042e-06, + "loss": 0.0183, + "step": 25809 + }, + { + "epoch": 3.0605952804458676, + "grad_norm": 0.3756832185446683, + "learning_rate": 6.889892424483754e-06, + "loss": 0.0151, + "step": 25810 + }, + { + "epoch": 3.0607138622079924, + "grad_norm": 0.42426682647770025, + "learning_rate": 6.8882377581084485e-06, + "loss": 0.0265, + "step": 25811 + }, + { + "epoch": 3.0608324439701176, + "grad_norm": 0.4794668841413802, + "learning_rate": 6.886583258701382e-06, + "loss": 0.0212, + "step": 25812 + }, + { + "epoch": 3.0609510257322423, + "grad_norm": 0.4846359900405756, + "learning_rate": 6.884928926277806e-06, + "loss": 0.0255, + "step": 25813 + }, + { + "epoch": 3.0610696074943675, + "grad_norm": 0.4309565563567072, + "learning_rate": 6.883274760852979e-06, + "loss": 0.0199, + "step": 25814 + }, + { + "epoch": 3.0611881892564923, + "grad_norm": 0.5380942434254913, + "learning_rate": 6.881620762442134e-06, + "loss": 0.034, + "step": 25815 + }, + { + "epoch": 3.0613067710186175, + "grad_norm": 0.5093015654017277, + "learning_rate": 6.879966931060527e-06, + "loss": 0.0263, + "step": 25816 + }, + { + "epoch": 3.0614253527807422, + "grad_norm": 0.42525135528409663, + "learning_rate": 6.878313266723407e-06, + "loss": 0.0216, + "step": 25817 + }, + { + "epoch": 3.0615439345428674, + "grad_norm": 0.5517435099452616, + "learning_rate": 6.876659769446023e-06, + "loss": 0.0252, + "step": 25818 + }, + { + "epoch": 3.061662516304992, + "grad_norm": 0.9779500253573901, + "learning_rate": 6.875006439243603e-06, + "loss": 0.0617, + "step": 25819 + }, + { + "epoch": 3.0617810980671174, + "grad_norm": 0.5819855514862882, + "learning_rate": 6.873353276131397e-06, + "loss": 0.038, + "step": 25820 + }, + { + "epoch": 3.061899679829242, + "grad_norm": 0.47633736339051425, + "learning_rate": 6.871700280124646e-06, + "loss": 0.0191, + "step": 25821 + }, + { + "epoch": 3.0620182615913674, + "grad_norm": 0.6353145236188867, + "learning_rate": 6.8700474512385894e-06, + "loss": 0.0329, + "step": 25822 + }, + { + "epoch": 3.062136843353492, + "grad_norm": 0.42993114419843786, + "learning_rate": 6.868394789488469e-06, + "loss": 0.0148, + "step": 25823 + }, + { + "epoch": 3.0622554251156173, + "grad_norm": 0.49620444191909197, + "learning_rate": 6.8667422948895e-06, + "loss": 0.0255, + "step": 25824 + }, + { + "epoch": 3.062374006877742, + "grad_norm": 0.7351365217697372, + "learning_rate": 6.865089967456945e-06, + "loss": 0.0455, + "step": 25825 + }, + { + "epoch": 3.0624925886398673, + "grad_norm": 0.6639661622594325, + "learning_rate": 6.8634378072060135e-06, + "loss": 0.0414, + "step": 25826 + }, + { + "epoch": 3.062611170401992, + "grad_norm": 0.6958009562140369, + "learning_rate": 6.861785814151947e-06, + "loss": 0.0389, + "step": 25827 + }, + { + "epoch": 3.0627297521641172, + "grad_norm": 0.6615669799424448, + "learning_rate": 6.8601339883099715e-06, + "loss": 0.0372, + "step": 25828 + }, + { + "epoch": 3.062848333926242, + "grad_norm": 0.423052211877331, + "learning_rate": 6.858482329695324e-06, + "loss": 0.019, + "step": 25829 + }, + { + "epoch": 3.062966915688367, + "grad_norm": 0.5918559671098086, + "learning_rate": 6.856830838323214e-06, + "loss": 0.0299, + "step": 25830 + }, + { + "epoch": 3.063085497450492, + "grad_norm": 0.7177482042210066, + "learning_rate": 6.855179514208876e-06, + "loss": 0.0307, + "step": 25831 + }, + { + "epoch": 3.063204079212617, + "grad_norm": 0.7414089321718169, + "learning_rate": 6.8535283573675335e-06, + "loss": 0.0318, + "step": 25832 + }, + { + "epoch": 3.063322660974742, + "grad_norm": 0.42536487059598616, + "learning_rate": 6.8518773678144045e-06, + "loss": 0.0248, + "step": 25833 + }, + { + "epoch": 3.063441242736867, + "grad_norm": 0.9257222276313929, + "learning_rate": 6.8502265455647195e-06, + "loss": 0.039, + "step": 25834 + }, + { + "epoch": 3.063559824498992, + "grad_norm": 0.7426316904198073, + "learning_rate": 6.848575890633674e-06, + "loss": 0.0296, + "step": 25835 + }, + { + "epoch": 3.063678406261117, + "grad_norm": 0.5042090417220441, + "learning_rate": 6.846925403036517e-06, + "loss": 0.0308, + "step": 25836 + }, + { + "epoch": 3.063796988023242, + "grad_norm": 0.6049909276258166, + "learning_rate": 6.845275082788438e-06, + "loss": 0.0291, + "step": 25837 + }, + { + "epoch": 3.063915569785367, + "grad_norm": 0.6169826222956333, + "learning_rate": 6.84362492990466e-06, + "loss": 0.0295, + "step": 25838 + }, + { + "epoch": 3.064034151547492, + "grad_norm": 0.5844787027949316, + "learning_rate": 6.841974944400395e-06, + "loss": 0.0324, + "step": 25839 + }, + { + "epoch": 3.064152733309617, + "grad_norm": 0.6078471162182267, + "learning_rate": 6.840325126290856e-06, + "loss": 0.0346, + "step": 25840 + }, + { + "epoch": 3.0642713150717418, + "grad_norm": 0.5051496136165976, + "learning_rate": 6.838675475591256e-06, + "loss": 0.026, + "step": 25841 + }, + { + "epoch": 3.064389896833867, + "grad_norm": 0.5062291150583061, + "learning_rate": 6.837025992316784e-06, + "loss": 0.0225, + "step": 25842 + }, + { + "epoch": 3.0645084785959917, + "grad_norm": 0.43092542008669, + "learning_rate": 6.835376676482672e-06, + "loss": 0.0229, + "step": 25843 + }, + { + "epoch": 3.064627060358117, + "grad_norm": 0.4431354798462679, + "learning_rate": 6.833727528104106e-06, + "loss": 0.0218, + "step": 25844 + }, + { + "epoch": 3.0647456421202417, + "grad_norm": 0.639235035690723, + "learning_rate": 6.832078547196302e-06, + "loss": 0.0285, + "step": 25845 + }, + { + "epoch": 3.064864223882367, + "grad_norm": 0.915774658134227, + "learning_rate": 6.8304297337744406e-06, + "loss": 0.0479, + "step": 25846 + }, + { + "epoch": 3.064982805644492, + "grad_norm": 0.557791260046144, + "learning_rate": 6.828781087853753e-06, + "loss": 0.0343, + "step": 25847 + }, + { + "epoch": 3.065101387406617, + "grad_norm": 0.5048933344371328, + "learning_rate": 6.8271326094494105e-06, + "loss": 0.0272, + "step": 25848 + }, + { + "epoch": 3.0652199691687416, + "grad_norm": 0.5339461983290035, + "learning_rate": 6.825484298576621e-06, + "loss": 0.0315, + "step": 25849 + }, + { + "epoch": 3.065338550930867, + "grad_norm": 0.48765067927091527, + "learning_rate": 6.823836155250579e-06, + "loss": 0.0162, + "step": 25850 + }, + { + "epoch": 3.065457132692992, + "grad_norm": 0.7186481360532898, + "learning_rate": 6.822188179486477e-06, + "loss": 0.0415, + "step": 25851 + }, + { + "epoch": 3.065575714455117, + "grad_norm": 0.49809011870022457, + "learning_rate": 6.8205403712995195e-06, + "loss": 0.0301, + "step": 25852 + }, + { + "epoch": 3.065694296217242, + "grad_norm": 0.45164947745730993, + "learning_rate": 6.818892730704871e-06, + "loss": 0.0211, + "step": 25853 + }, + { + "epoch": 3.0658128779793667, + "grad_norm": 0.5880306453973525, + "learning_rate": 6.81724525771775e-06, + "loss": 0.0376, + "step": 25854 + }, + { + "epoch": 3.065931459741492, + "grad_norm": 0.44198185050182615, + "learning_rate": 6.815597952353323e-06, + "loss": 0.016, + "step": 25855 + }, + { + "epoch": 3.0660500415036167, + "grad_norm": 0.46574299328846847, + "learning_rate": 6.813950814626793e-06, + "loss": 0.0264, + "step": 25856 + }, + { + "epoch": 3.066168623265742, + "grad_norm": 0.49415000395660835, + "learning_rate": 6.812303844553319e-06, + "loss": 0.0188, + "step": 25857 + }, + { + "epoch": 3.0662872050278667, + "grad_norm": 0.8737964975475574, + "learning_rate": 6.8106570421481135e-06, + "loss": 0.0421, + "step": 25858 + }, + { + "epoch": 3.066405786789992, + "grad_norm": 0.7576365459302427, + "learning_rate": 6.809010407426342e-06, + "loss": 0.0339, + "step": 25859 + }, + { + "epoch": 3.0665243685521166, + "grad_norm": 0.5442203235355177, + "learning_rate": 6.807363940403183e-06, + "loss": 0.0295, + "step": 25860 + }, + { + "epoch": 3.066642950314242, + "grad_norm": 0.5790749516284069, + "learning_rate": 6.80571764109382e-06, + "loss": 0.0306, + "step": 25861 + }, + { + "epoch": 3.0667615320763666, + "grad_norm": 0.604843369059809, + "learning_rate": 6.804071509513432e-06, + "loss": 0.0368, + "step": 25862 + }, + { + "epoch": 3.066880113838492, + "grad_norm": 0.3822210337800399, + "learning_rate": 6.802425545677196e-06, + "loss": 0.0207, + "step": 25863 + }, + { + "epoch": 3.0669986956006166, + "grad_norm": 0.48186421846021993, + "learning_rate": 6.800779749600275e-06, + "loss": 0.0303, + "step": 25864 + }, + { + "epoch": 3.0671172773627418, + "grad_norm": 0.33738222726826406, + "learning_rate": 6.799134121297846e-06, + "loss": 0.0166, + "step": 25865 + }, + { + "epoch": 3.0672358591248665, + "grad_norm": 0.4726914493630969, + "learning_rate": 6.7974886607850815e-06, + "loss": 0.0255, + "step": 25866 + }, + { + "epoch": 3.0673544408869917, + "grad_norm": 0.425367214463027, + "learning_rate": 6.795843368077156e-06, + "loss": 0.0215, + "step": 25867 + }, + { + "epoch": 3.0674730226491165, + "grad_norm": 0.40042809549233377, + "learning_rate": 6.79419824318922e-06, + "loss": 0.0223, + "step": 25868 + }, + { + "epoch": 3.0675916044112417, + "grad_norm": 0.5356514329832545, + "learning_rate": 6.792553286136463e-06, + "loss": 0.029, + "step": 25869 + }, + { + "epoch": 3.0677101861733664, + "grad_norm": 0.21396593341027106, + "learning_rate": 6.790908496934032e-06, + "loss": 0.0091, + "step": 25870 + }, + { + "epoch": 3.0678287679354916, + "grad_norm": 0.5688104277766369, + "learning_rate": 6.789263875597094e-06, + "loss": 0.034, + "step": 25871 + }, + { + "epoch": 3.0679473496976164, + "grad_norm": 0.5458570857043821, + "learning_rate": 6.787619422140812e-06, + "loss": 0.0291, + "step": 25872 + }, + { + "epoch": 3.0680659314597416, + "grad_norm": 0.32012408333736764, + "learning_rate": 6.785975136580344e-06, + "loss": 0.0177, + "step": 25873 + }, + { + "epoch": 3.0681845132218664, + "grad_norm": 0.4727060662961032, + "learning_rate": 6.784331018930856e-06, + "loss": 0.0193, + "step": 25874 + }, + { + "epoch": 3.0683030949839916, + "grad_norm": 0.6464418134210175, + "learning_rate": 6.782687069207494e-06, + "loss": 0.0306, + "step": 25875 + }, + { + "epoch": 3.0684216767461163, + "grad_norm": 0.5787655379751475, + "learning_rate": 6.781043287425418e-06, + "loss": 0.0299, + "step": 25876 + }, + { + "epoch": 3.0685402585082415, + "grad_norm": 0.3658910380590534, + "learning_rate": 6.779399673599779e-06, + "loss": 0.0161, + "step": 25877 + }, + { + "epoch": 3.0686588402703663, + "grad_norm": 0.5293957086092779, + "learning_rate": 6.777756227745741e-06, + "loss": 0.0343, + "step": 25878 + }, + { + "epoch": 3.0687774220324915, + "grad_norm": 0.5562311714225917, + "learning_rate": 6.7761129498784355e-06, + "loss": 0.0259, + "step": 25879 + }, + { + "epoch": 3.0688960037946162, + "grad_norm": 0.6656504298263599, + "learning_rate": 6.774469840013023e-06, + "loss": 0.0463, + "step": 25880 + }, + { + "epoch": 3.0690145855567414, + "grad_norm": 0.5021857512383711, + "learning_rate": 6.772826898164647e-06, + "loss": 0.027, + "step": 25881 + }, + { + "epoch": 3.069133167318866, + "grad_norm": 0.5302041301119917, + "learning_rate": 6.771184124348457e-06, + "loss": 0.0294, + "step": 25882 + }, + { + "epoch": 3.0692517490809914, + "grad_norm": 0.7264072103536825, + "learning_rate": 6.769541518579594e-06, + "loss": 0.0397, + "step": 25883 + }, + { + "epoch": 3.069370330843116, + "grad_norm": 0.7163393501755547, + "learning_rate": 6.767899080873202e-06, + "loss": 0.0369, + "step": 25884 + }, + { + "epoch": 3.0694889126052414, + "grad_norm": 0.6041071071269902, + "learning_rate": 6.766256811244434e-06, + "loss": 0.0257, + "step": 25885 + }, + { + "epoch": 3.069607494367366, + "grad_norm": 0.6160479698774085, + "learning_rate": 6.764614709708409e-06, + "loss": 0.0327, + "step": 25886 + }, + { + "epoch": 3.0697260761294913, + "grad_norm": 0.7277326951784558, + "learning_rate": 6.7629727762802745e-06, + "loss": 0.0395, + "step": 25887 + }, + { + "epoch": 3.069844657891616, + "grad_norm": 0.5503031520352925, + "learning_rate": 6.761331010975167e-06, + "loss": 0.0274, + "step": 25888 + }, + { + "epoch": 3.0699632396537413, + "grad_norm": 0.539842881824107, + "learning_rate": 6.7596894138082205e-06, + "loss": 0.0335, + "step": 25889 + }, + { + "epoch": 3.070081821415866, + "grad_norm": 0.3794459420552934, + "learning_rate": 6.7580479847945795e-06, + "loss": 0.0165, + "step": 25890 + }, + { + "epoch": 3.0702004031779913, + "grad_norm": 0.433417860380022, + "learning_rate": 6.756406723949351e-06, + "loss": 0.0214, + "step": 25891 + }, + { + "epoch": 3.070318984940116, + "grad_norm": 0.4504631558960605, + "learning_rate": 6.754765631287696e-06, + "loss": 0.018, + "step": 25892 + }, + { + "epoch": 3.070437566702241, + "grad_norm": 0.45073397316565667, + "learning_rate": 6.753124706824721e-06, + "loss": 0.0241, + "step": 25893 + }, + { + "epoch": 3.070556148464366, + "grad_norm": 0.5830727702341724, + "learning_rate": 6.751483950575566e-06, + "loss": 0.025, + "step": 25894 + }, + { + "epoch": 3.070674730226491, + "grad_norm": 0.5957239029171869, + "learning_rate": 6.749843362555339e-06, + "loss": 0.0302, + "step": 25895 + }, + { + "epoch": 3.070793311988616, + "grad_norm": 0.565179982150426, + "learning_rate": 6.748202942779189e-06, + "loss": 0.0286, + "step": 25896 + }, + { + "epoch": 3.070911893750741, + "grad_norm": 0.5249380185163606, + "learning_rate": 6.746562691262218e-06, + "loss": 0.0287, + "step": 25897 + }, + { + "epoch": 3.071030475512866, + "grad_norm": 0.6442152819803039, + "learning_rate": 6.744922608019557e-06, + "loss": 0.0335, + "step": 25898 + }, + { + "epoch": 3.071149057274991, + "grad_norm": 0.5158825609294077, + "learning_rate": 6.743282693066322e-06, + "loss": 0.0323, + "step": 25899 + }, + { + "epoch": 3.0712676390371163, + "grad_norm": 0.7804451032898571, + "learning_rate": 6.741642946417634e-06, + "loss": 0.039, + "step": 25900 + }, + { + "epoch": 3.071386220799241, + "grad_norm": 0.331957877931185, + "learning_rate": 6.740003368088615e-06, + "loss": 0.0155, + "step": 25901 + }, + { + "epoch": 3.0715048025613663, + "grad_norm": 0.5732318369015555, + "learning_rate": 6.73836395809436e-06, + "loss": 0.0369, + "step": 25902 + }, + { + "epoch": 3.071623384323491, + "grad_norm": 0.5571725632372766, + "learning_rate": 6.736724716450007e-06, + "loss": 0.0229, + "step": 25903 + }, + { + "epoch": 3.0717419660856162, + "grad_norm": 0.6046886985022194, + "learning_rate": 6.735085643170652e-06, + "loss": 0.0246, + "step": 25904 + }, + { + "epoch": 3.071860547847741, + "grad_norm": 0.6789798219781968, + "learning_rate": 6.733446738271415e-06, + "loss": 0.0268, + "step": 25905 + }, + { + "epoch": 3.071979129609866, + "grad_norm": 0.4999417015898718, + "learning_rate": 6.731808001767384e-06, + "loss": 0.0263, + "step": 25906 + }, + { + "epoch": 3.072097711371991, + "grad_norm": 0.44104058731017104, + "learning_rate": 6.730169433673697e-06, + "loss": 0.0216, + "step": 25907 + }, + { + "epoch": 3.072216293134116, + "grad_norm": 0.47351955317250677, + "learning_rate": 6.728531034005436e-06, + "loss": 0.0209, + "step": 25908 + }, + { + "epoch": 3.072334874896241, + "grad_norm": 0.5262492090055991, + "learning_rate": 6.7268928027777125e-06, + "loss": 0.0342, + "step": 25909 + }, + { + "epoch": 3.072453456658366, + "grad_norm": 0.4591880761273801, + "learning_rate": 6.725254740005632e-06, + "loss": 0.0322, + "step": 25910 + }, + { + "epoch": 3.072572038420491, + "grad_norm": 0.6395357646933344, + "learning_rate": 6.72361684570429e-06, + "loss": 0.0268, + "step": 25911 + }, + { + "epoch": 3.072690620182616, + "grad_norm": 0.60017315930188, + "learning_rate": 6.721979119888796e-06, + "loss": 0.0294, + "step": 25912 + }, + { + "epoch": 3.072809201944741, + "grad_norm": 0.45477161757698537, + "learning_rate": 6.720341562574229e-06, + "loss": 0.0172, + "step": 25913 + }, + { + "epoch": 3.072927783706866, + "grad_norm": 0.5052483364994256, + "learning_rate": 6.718704173775708e-06, + "loss": 0.0276, + "step": 25914 + }, + { + "epoch": 3.073046365468991, + "grad_norm": 0.5143902325757742, + "learning_rate": 6.717066953508311e-06, + "loss": 0.0251, + "step": 25915 + }, + { + "epoch": 3.073164947231116, + "grad_norm": 0.5186563203319153, + "learning_rate": 6.715429901787143e-06, + "loss": 0.0307, + "step": 25916 + }, + { + "epoch": 3.0732835289932408, + "grad_norm": 0.5447876411100923, + "learning_rate": 6.7137930186272735e-06, + "loss": 0.0272, + "step": 25917 + }, + { + "epoch": 3.073402110755366, + "grad_norm": 0.37968710528348865, + "learning_rate": 6.712156304043826e-06, + "loss": 0.0221, + "step": 25918 + }, + { + "epoch": 3.0735206925174907, + "grad_norm": 0.4072395098457812, + "learning_rate": 6.7105197580518616e-06, + "loss": 0.0271, + "step": 25919 + }, + { + "epoch": 3.073639274279616, + "grad_norm": 0.5667311751760737, + "learning_rate": 6.708883380666478e-06, + "loss": 0.0386, + "step": 25920 + }, + { + "epoch": 3.0737578560417407, + "grad_norm": 0.46432565369581397, + "learning_rate": 6.707247171902761e-06, + "loss": 0.025, + "step": 25921 + }, + { + "epoch": 3.073876437803866, + "grad_norm": 0.948709046899295, + "learning_rate": 6.705611131775791e-06, + "loss": 0.0684, + "step": 25922 + }, + { + "epoch": 3.0739950195659906, + "grad_norm": 0.7328721994866092, + "learning_rate": 6.70397526030066e-06, + "loss": 0.0203, + "step": 25923 + }, + { + "epoch": 3.074113601328116, + "grad_norm": 0.7437448381255534, + "learning_rate": 6.702339557492426e-06, + "loss": 0.033, + "step": 25924 + }, + { + "epoch": 3.0742321830902406, + "grad_norm": 0.2928563639643633, + "learning_rate": 6.7007040233662e-06, + "loss": 0.0158, + "step": 25925 + }, + { + "epoch": 3.074350764852366, + "grad_norm": 0.4004930120455854, + "learning_rate": 6.699068657937033e-06, + "loss": 0.0223, + "step": 25926 + }, + { + "epoch": 3.0744693466144906, + "grad_norm": 0.35397344748753046, + "learning_rate": 6.697433461220021e-06, + "loss": 0.0194, + "step": 25927 + }, + { + "epoch": 3.0745879283766158, + "grad_norm": 0.6660547468923519, + "learning_rate": 6.69579843323021e-06, + "loss": 0.0451, + "step": 25928 + }, + { + "epoch": 3.0747065101387405, + "grad_norm": 0.3983931657422914, + "learning_rate": 6.694163573982709e-06, + "loss": 0.0159, + "step": 25929 + }, + { + "epoch": 3.0748250919008657, + "grad_norm": 0.7248030601621892, + "learning_rate": 6.692528883492563e-06, + "loss": 0.043, + "step": 25930 + }, + { + "epoch": 3.0749436736629905, + "grad_norm": 0.29697685819447456, + "learning_rate": 6.69089436177485e-06, + "loss": 0.0169, + "step": 25931 + }, + { + "epoch": 3.0750622554251157, + "grad_norm": 0.47128071031494084, + "learning_rate": 6.689260008844642e-06, + "loss": 0.0245, + "step": 25932 + }, + { + "epoch": 3.0751808371872404, + "grad_norm": 0.5045354158244515, + "learning_rate": 6.687625824716998e-06, + "loss": 0.0296, + "step": 25933 + }, + { + "epoch": 3.0752994189493656, + "grad_norm": 0.47521115053413315, + "learning_rate": 6.685991809407e-06, + "loss": 0.0267, + "step": 25934 + }, + { + "epoch": 3.0754180007114904, + "grad_norm": 0.5427210469182296, + "learning_rate": 6.684357962929688e-06, + "loss": 0.0284, + "step": 25935 + }, + { + "epoch": 3.0755365824736156, + "grad_norm": 0.4680366271340309, + "learning_rate": 6.682724285300138e-06, + "loss": 0.0256, + "step": 25936 + }, + { + "epoch": 3.0756551642357404, + "grad_norm": 0.5073478344346518, + "learning_rate": 6.681090776533405e-06, + "loss": 0.0185, + "step": 25937 + }, + { + "epoch": 3.0757737459978656, + "grad_norm": 0.707258533804839, + "learning_rate": 6.679457436644562e-06, + "loss": 0.0481, + "step": 25938 + }, + { + "epoch": 3.0758923277599903, + "grad_norm": 0.5655719063524077, + "learning_rate": 6.6778242656486384e-06, + "loss": 0.0356, + "step": 25939 + }, + { + "epoch": 3.0760109095221155, + "grad_norm": 0.3338516124342359, + "learning_rate": 6.676191263560721e-06, + "loss": 0.0197, + "step": 25940 + }, + { + "epoch": 3.0761294912842403, + "grad_norm": 0.7008272852628471, + "learning_rate": 6.6745584303958445e-06, + "loss": 0.0385, + "step": 25941 + }, + { + "epoch": 3.0762480730463655, + "grad_norm": 0.5813366625513711, + "learning_rate": 6.672925766169067e-06, + "loss": 0.0336, + "step": 25942 + }, + { + "epoch": 3.0763666548084903, + "grad_norm": 0.33185583763258386, + "learning_rate": 6.671293270895437e-06, + "loss": 0.0183, + "step": 25943 + }, + { + "epoch": 3.0764852365706155, + "grad_norm": 0.5645713815139736, + "learning_rate": 6.6696609445900114e-06, + "loss": 0.0276, + "step": 25944 + }, + { + "epoch": 3.07660381833274, + "grad_norm": 0.6200641791451986, + "learning_rate": 6.6680287872678385e-06, + "loss": 0.033, + "step": 25945 + }, + { + "epoch": 3.0767224000948654, + "grad_norm": 0.5217965316192931, + "learning_rate": 6.666396798943952e-06, + "loss": 0.0313, + "step": 25946 + }, + { + "epoch": 3.07684098185699, + "grad_norm": 0.44747791484559546, + "learning_rate": 6.664764979633406e-06, + "loss": 0.0277, + "step": 25947 + }, + { + "epoch": 3.0769595636191154, + "grad_norm": 0.605239895008461, + "learning_rate": 6.663133329351242e-06, + "loss": 0.0334, + "step": 25948 + }, + { + "epoch": 3.0770781453812406, + "grad_norm": 0.3575997759549617, + "learning_rate": 6.661501848112503e-06, + "loss": 0.0149, + "step": 25949 + }, + { + "epoch": 3.0771967271433653, + "grad_norm": 0.5300955645364865, + "learning_rate": 6.659870535932236e-06, + "loss": 0.0256, + "step": 25950 + }, + { + "epoch": 3.0773153089054905, + "grad_norm": 0.4404148374999319, + "learning_rate": 6.658239392825458e-06, + "loss": 0.0251, + "step": 25951 + }, + { + "epoch": 3.0774338906676153, + "grad_norm": 0.454633415464408, + "learning_rate": 6.6566084188072366e-06, + "loss": 0.0254, + "step": 25952 + }, + { + "epoch": 3.0775524724297405, + "grad_norm": 0.4190149943582766, + "learning_rate": 6.654977613892582e-06, + "loss": 0.0218, + "step": 25953 + }, + { + "epoch": 3.0776710541918653, + "grad_norm": 0.5675263024464763, + "learning_rate": 6.653346978096539e-06, + "loss": 0.0234, + "step": 25954 + }, + { + "epoch": 3.0777896359539905, + "grad_norm": 0.4828041754269332, + "learning_rate": 6.651716511434139e-06, + "loss": 0.0388, + "step": 25955 + }, + { + "epoch": 3.0779082177161152, + "grad_norm": 0.7056754517121647, + "learning_rate": 6.65008621392042e-06, + "loss": 0.03, + "step": 25956 + }, + { + "epoch": 3.0780267994782404, + "grad_norm": 0.7171374270320224, + "learning_rate": 6.648456085570395e-06, + "loss": 0.0353, + "step": 25957 + }, + { + "epoch": 3.078145381240365, + "grad_norm": 0.48563659580290375, + "learning_rate": 6.6468261263991e-06, + "loss": 0.0208, + "step": 25958 + }, + { + "epoch": 3.0782639630024904, + "grad_norm": 0.5130931344452003, + "learning_rate": 6.645196336421564e-06, + "loss": 0.0259, + "step": 25959 + }, + { + "epoch": 3.078382544764615, + "grad_norm": 0.3749734399854593, + "learning_rate": 6.643566715652811e-06, + "loss": 0.0243, + "step": 25960 + }, + { + "epoch": 3.0785011265267404, + "grad_norm": 0.48067106995466385, + "learning_rate": 6.641937264107867e-06, + "loss": 0.0275, + "step": 25961 + }, + { + "epoch": 3.078619708288865, + "grad_norm": 0.40427296122172973, + "learning_rate": 6.640307981801735e-06, + "loss": 0.0216, + "step": 25962 + }, + { + "epoch": 3.0787382900509903, + "grad_norm": 0.5519668107864291, + "learning_rate": 6.638678868749465e-06, + "loss": 0.0359, + "step": 25963 + }, + { + "epoch": 3.078856871813115, + "grad_norm": 0.4789663619443339, + "learning_rate": 6.637049924966052e-06, + "loss": 0.0277, + "step": 25964 + }, + { + "epoch": 3.0789754535752403, + "grad_norm": 0.8152120851929396, + "learning_rate": 6.635421150466528e-06, + "loss": 0.0418, + "step": 25965 + }, + { + "epoch": 3.079094035337365, + "grad_norm": 0.48097894293493126, + "learning_rate": 6.633792545265888e-06, + "loss": 0.0213, + "step": 25966 + }, + { + "epoch": 3.0792126170994902, + "grad_norm": 0.879799524835373, + "learning_rate": 6.632164109379172e-06, + "loss": 0.0538, + "step": 25967 + }, + { + "epoch": 3.079331198861615, + "grad_norm": 0.4976087320992294, + "learning_rate": 6.63053584282137e-06, + "loss": 0.0226, + "step": 25968 + }, + { + "epoch": 3.07944978062374, + "grad_norm": 0.5124747792340648, + "learning_rate": 6.628907745607502e-06, + "loss": 0.0335, + "step": 25969 + }, + { + "epoch": 3.079568362385865, + "grad_norm": 0.6426188538542209, + "learning_rate": 6.627279817752577e-06, + "loss": 0.0343, + "step": 25970 + }, + { + "epoch": 3.07968694414799, + "grad_norm": 0.5484701421463061, + "learning_rate": 6.6256520592716e-06, + "loss": 0.0253, + "step": 25971 + }, + { + "epoch": 3.079805525910115, + "grad_norm": 0.6540499677653276, + "learning_rate": 6.624024470179591e-06, + "loss": 0.0323, + "step": 25972 + }, + { + "epoch": 3.07992410767224, + "grad_norm": 0.5288305571702047, + "learning_rate": 6.622397050491522e-06, + "loss": 0.017, + "step": 25973 + }, + { + "epoch": 3.080042689434365, + "grad_norm": 0.4656192330169103, + "learning_rate": 6.620769800222434e-06, + "loss": 0.0274, + "step": 25974 + }, + { + "epoch": 3.08016127119649, + "grad_norm": 0.58610074477054, + "learning_rate": 6.619142719387303e-06, + "loss": 0.0287, + "step": 25975 + }, + { + "epoch": 3.080279852958615, + "grad_norm": 0.5001379414832273, + "learning_rate": 6.617515808001143e-06, + "loss": 0.0343, + "step": 25976 + }, + { + "epoch": 3.08039843472074, + "grad_norm": 0.4225576013570139, + "learning_rate": 6.615889066078929e-06, + "loss": 0.025, + "step": 25977 + }, + { + "epoch": 3.080517016482865, + "grad_norm": 0.7017133954642204, + "learning_rate": 6.6142624936356895e-06, + "loss": 0.031, + "step": 25978 + }, + { + "epoch": 3.08063559824499, + "grad_norm": 0.3715763857088239, + "learning_rate": 6.612636090686397e-06, + "loss": 0.0224, + "step": 25979 + }, + { + "epoch": 3.0807541800071148, + "grad_norm": 0.5764439137282856, + "learning_rate": 6.611009857246051e-06, + "loss": 0.0282, + "step": 25980 + }, + { + "epoch": 3.08087276176924, + "grad_norm": 0.534889794221993, + "learning_rate": 6.609383793329646e-06, + "loss": 0.0375, + "step": 25981 + }, + { + "epoch": 3.0809913435313647, + "grad_norm": 0.5414781577732999, + "learning_rate": 6.607757898952166e-06, + "loss": 0.033, + "step": 25982 + }, + { + "epoch": 3.08110992529349, + "grad_norm": 0.4242410781553771, + "learning_rate": 6.606132174128615e-06, + "loss": 0.0174, + "step": 25983 + }, + { + "epoch": 3.0812285070556147, + "grad_norm": 0.7079930818340444, + "learning_rate": 6.604506618873954e-06, + "loss": 0.0314, + "step": 25984 + }, + { + "epoch": 3.08134708881774, + "grad_norm": 0.45773454886821563, + "learning_rate": 6.602881233203198e-06, + "loss": 0.0317, + "step": 25985 + }, + { + "epoch": 3.0814656705798646, + "grad_norm": 0.7240846407864003, + "learning_rate": 6.60125601713131e-06, + "loss": 0.0375, + "step": 25986 + }, + { + "epoch": 3.08158425234199, + "grad_norm": 0.31766103810571283, + "learning_rate": 6.599630970673288e-06, + "loss": 0.0161, + "step": 25987 + }, + { + "epoch": 3.0817028341041146, + "grad_norm": 0.3293169853157706, + "learning_rate": 6.598006093844086e-06, + "loss": 0.0174, + "step": 25988 + }, + { + "epoch": 3.08182141586624, + "grad_norm": 0.6611321989966826, + "learning_rate": 6.596381386658721e-06, + "loss": 0.0327, + "step": 25989 + }, + { + "epoch": 3.0819399976283646, + "grad_norm": 0.48905176181743343, + "learning_rate": 6.594756849132142e-06, + "loss": 0.0197, + "step": 25990 + }, + { + "epoch": 3.0820585793904898, + "grad_norm": 0.7123950419147906, + "learning_rate": 6.5931324812793345e-06, + "loss": 0.0407, + "step": 25991 + }, + { + "epoch": 3.0821771611526145, + "grad_norm": 0.6289870448804806, + "learning_rate": 6.591508283115274e-06, + "loss": 0.0377, + "step": 25992 + }, + { + "epoch": 3.0822957429147397, + "grad_norm": 0.4311515109705671, + "learning_rate": 6.589884254654932e-06, + "loss": 0.0262, + "step": 25993 + }, + { + "epoch": 3.0824143246768645, + "grad_norm": 0.8784319018096395, + "learning_rate": 6.588260395913293e-06, + "loss": 0.051, + "step": 25994 + }, + { + "epoch": 3.0825329064389897, + "grad_norm": 0.49729826456178705, + "learning_rate": 6.586636706905303e-06, + "loss": 0.0231, + "step": 25995 + }, + { + "epoch": 3.0826514882011145, + "grad_norm": 0.717239423927012, + "learning_rate": 6.585013187645944e-06, + "loss": 0.0459, + "step": 25996 + }, + { + "epoch": 3.0827700699632397, + "grad_norm": 0.565218427395357, + "learning_rate": 6.583389838150181e-06, + "loss": 0.0242, + "step": 25997 + }, + { + "epoch": 3.082888651725365, + "grad_norm": 0.7997870949802006, + "learning_rate": 6.581766658432981e-06, + "loss": 0.0499, + "step": 25998 + }, + { + "epoch": 3.0830072334874896, + "grad_norm": 0.4324894770740627, + "learning_rate": 6.580143648509307e-06, + "loss": 0.0231, + "step": 25999 + }, + { + "epoch": 3.083125815249615, + "grad_norm": 0.6268098730406804, + "learning_rate": 6.57852080839412e-06, + "loss": 0.0299, + "step": 26000 + }, + { + "epoch": 3.0832443970117396, + "grad_norm": 0.4833575874483491, + "learning_rate": 6.576898138102386e-06, + "loss": 0.0248, + "step": 26001 + }, + { + "epoch": 3.083362978773865, + "grad_norm": 0.4922517181288967, + "learning_rate": 6.575275637649056e-06, + "loss": 0.025, + "step": 26002 + }, + { + "epoch": 3.0834815605359895, + "grad_norm": 0.7073501529210374, + "learning_rate": 6.573653307049088e-06, + "loss": 0.0293, + "step": 26003 + }, + { + "epoch": 3.0836001422981147, + "grad_norm": 0.6919452446827044, + "learning_rate": 6.572031146317443e-06, + "loss": 0.0318, + "step": 26004 + }, + { + "epoch": 3.0837187240602395, + "grad_norm": 0.6100910706523739, + "learning_rate": 6.570409155469076e-06, + "loss": 0.0369, + "step": 26005 + }, + { + "epoch": 3.0838373058223647, + "grad_norm": 0.3088219258351878, + "learning_rate": 6.568787334518934e-06, + "loss": 0.0161, + "step": 26006 + }, + { + "epoch": 3.0839558875844895, + "grad_norm": 0.7093426330780543, + "learning_rate": 6.567165683481968e-06, + "loss": 0.0503, + "step": 26007 + }, + { + "epoch": 3.0840744693466147, + "grad_norm": 0.6919160496538017, + "learning_rate": 6.565544202373133e-06, + "loss": 0.0335, + "step": 26008 + }, + { + "epoch": 3.0841930511087394, + "grad_norm": 0.5167383835095553, + "learning_rate": 6.5639228912073715e-06, + "loss": 0.0273, + "step": 26009 + }, + { + "epoch": 3.0843116328708646, + "grad_norm": 0.6214526479294019, + "learning_rate": 6.562301749999636e-06, + "loss": 0.0261, + "step": 26010 + }, + { + "epoch": 3.0844302146329894, + "grad_norm": 0.44531902237942017, + "learning_rate": 6.560680778764866e-06, + "loss": 0.0134, + "step": 26011 + }, + { + "epoch": 3.0845487963951146, + "grad_norm": 0.4863516764959551, + "learning_rate": 6.559059977518017e-06, + "loss": 0.0274, + "step": 26012 + }, + { + "epoch": 3.0846673781572393, + "grad_norm": 0.7104606924491721, + "learning_rate": 6.557439346274014e-06, + "loss": 0.0281, + "step": 26013 + }, + { + "epoch": 3.0847859599193646, + "grad_norm": 0.333014670373954, + "learning_rate": 6.555818885047804e-06, + "loss": 0.0155, + "step": 26014 + }, + { + "epoch": 3.0849045416814893, + "grad_norm": 0.436665404637552, + "learning_rate": 6.554198593854324e-06, + "loss": 0.0218, + "step": 26015 + }, + { + "epoch": 3.0850231234436145, + "grad_norm": 0.3905988278179602, + "learning_rate": 6.552578472708523e-06, + "loss": 0.0169, + "step": 26016 + }, + { + "epoch": 3.0851417052057393, + "grad_norm": 0.5594439728415496, + "learning_rate": 6.55095852162532e-06, + "loss": 0.0328, + "step": 26017 + }, + { + "epoch": 3.0852602869678645, + "grad_norm": 0.6152464330328824, + "learning_rate": 6.549338740619654e-06, + "loss": 0.03, + "step": 26018 + }, + { + "epoch": 3.0853788687299892, + "grad_norm": 0.3864452028102754, + "learning_rate": 6.547719129706459e-06, + "loss": 0.022, + "step": 26019 + }, + { + "epoch": 3.0854974504921144, + "grad_norm": 0.5161766484415163, + "learning_rate": 6.546099688900667e-06, + "loss": 0.0215, + "step": 26020 + }, + { + "epoch": 3.085616032254239, + "grad_norm": 0.6192341118768159, + "learning_rate": 6.544480418217214e-06, + "loss": 0.0396, + "step": 26021 + }, + { + "epoch": 3.0857346140163644, + "grad_norm": 0.4950366101394761, + "learning_rate": 6.542861317671003e-06, + "loss": 0.0253, + "step": 26022 + }, + { + "epoch": 3.085853195778489, + "grad_norm": 0.4434046054761947, + "learning_rate": 6.541242387276994e-06, + "loss": 0.0232, + "step": 26023 + }, + { + "epoch": 3.0859717775406144, + "grad_norm": 0.7428801192215463, + "learning_rate": 6.5396236270500845e-06, + "loss": 0.0346, + "step": 26024 + }, + { + "epoch": 3.086090359302739, + "grad_norm": 0.5405437648916125, + "learning_rate": 6.5380050370052095e-06, + "loss": 0.0339, + "step": 26025 + }, + { + "epoch": 3.0862089410648643, + "grad_norm": 0.6394170193230403, + "learning_rate": 6.536386617157289e-06, + "loss": 0.0447, + "step": 26026 + }, + { + "epoch": 3.086327522826989, + "grad_norm": 0.4450824035829191, + "learning_rate": 6.534768367521249e-06, + "loss": 0.0236, + "step": 26027 + }, + { + "epoch": 3.0864461045891143, + "grad_norm": 0.3928210764250835, + "learning_rate": 6.533150288111992e-06, + "loss": 0.0123, + "step": 26028 + }, + { + "epoch": 3.086564686351239, + "grad_norm": 0.7628377051536036, + "learning_rate": 6.531532378944446e-06, + "loss": 0.0538, + "step": 26029 + }, + { + "epoch": 3.0866832681133642, + "grad_norm": 0.476141190421601, + "learning_rate": 6.529914640033524e-06, + "loss": 0.0237, + "step": 26030 + }, + { + "epoch": 3.086801849875489, + "grad_norm": 0.3992736288780532, + "learning_rate": 6.528297071394138e-06, + "loss": 0.0224, + "step": 26031 + }, + { + "epoch": 3.086920431637614, + "grad_norm": 0.39158701755428715, + "learning_rate": 6.526679673041211e-06, + "loss": 0.02, + "step": 26032 + }, + { + "epoch": 3.087039013399739, + "grad_norm": 0.4995681483269818, + "learning_rate": 6.525062444989627e-06, + "loss": 0.0309, + "step": 26033 + }, + { + "epoch": 3.087157595161864, + "grad_norm": 0.47622242371285595, + "learning_rate": 6.523445387254326e-06, + "loss": 0.024, + "step": 26034 + }, + { + "epoch": 3.087276176923989, + "grad_norm": 0.4822889715952465, + "learning_rate": 6.521828499850197e-06, + "loss": 0.0218, + "step": 26035 + }, + { + "epoch": 3.087394758686114, + "grad_norm": 0.3947165075191613, + "learning_rate": 6.520211782792154e-06, + "loss": 0.0249, + "step": 26036 + }, + { + "epoch": 3.087513340448239, + "grad_norm": 0.7401827631733047, + "learning_rate": 6.518595236095084e-06, + "loss": 0.0245, + "step": 26037 + }, + { + "epoch": 3.087631922210364, + "grad_norm": 0.5925232133644974, + "learning_rate": 6.516978859773917e-06, + "loss": 0.0249, + "step": 26038 + }, + { + "epoch": 3.087750503972489, + "grad_norm": 0.5927016230377626, + "learning_rate": 6.515362653843532e-06, + "loss": 0.0317, + "step": 26039 + }, + { + "epoch": 3.087869085734614, + "grad_norm": 0.53365322631287, + "learning_rate": 6.513746618318836e-06, + "loss": 0.0293, + "step": 26040 + }, + { + "epoch": 3.087987667496739, + "grad_norm": 0.4297130706179819, + "learning_rate": 6.512130753214726e-06, + "loss": 0.0259, + "step": 26041 + }, + { + "epoch": 3.088106249258864, + "grad_norm": 0.6590294629012833, + "learning_rate": 6.510515058546099e-06, + "loss": 0.0315, + "step": 26042 + }, + { + "epoch": 3.0882248310209888, + "grad_norm": 0.3969071013258093, + "learning_rate": 6.508899534327858e-06, + "loss": 0.0223, + "step": 26043 + }, + { + "epoch": 3.088343412783114, + "grad_norm": 0.6890257396124768, + "learning_rate": 6.507284180574874e-06, + "loss": 0.0328, + "step": 26044 + }, + { + "epoch": 3.0884619945452387, + "grad_norm": 0.39502478958012394, + "learning_rate": 6.505668997302067e-06, + "loss": 0.0244, + "step": 26045 + }, + { + "epoch": 3.088580576307364, + "grad_norm": 0.49492961933499946, + "learning_rate": 6.504053984524305e-06, + "loss": 0.0229, + "step": 26046 + }, + { + "epoch": 3.088699158069489, + "grad_norm": 0.5379565912596361, + "learning_rate": 6.502439142256484e-06, + "loss": 0.0267, + "step": 26047 + }, + { + "epoch": 3.088817739831614, + "grad_norm": 0.4276569010260428, + "learning_rate": 6.500824470513492e-06, + "loss": 0.0218, + "step": 26048 + }, + { + "epoch": 3.0889363215937387, + "grad_norm": 0.7430565957769454, + "learning_rate": 6.49920996931021e-06, + "loss": 0.0306, + "step": 26049 + }, + { + "epoch": 3.089054903355864, + "grad_norm": 0.40178050499033524, + "learning_rate": 6.497595638661535e-06, + "loss": 0.0243, + "step": 26050 + }, + { + "epoch": 3.089173485117989, + "grad_norm": 0.3567049333889741, + "learning_rate": 6.49598147858233e-06, + "loss": 0.0215, + "step": 26051 + }, + { + "epoch": 3.089292066880114, + "grad_norm": 0.48087754570985825, + "learning_rate": 6.4943674890874885e-06, + "loss": 0.025, + "step": 26052 + }, + { + "epoch": 3.089410648642239, + "grad_norm": 0.46757723379112104, + "learning_rate": 6.4927536701918815e-06, + "loss": 0.0261, + "step": 26053 + }, + { + "epoch": 3.089529230404364, + "grad_norm": 0.7440723165720353, + "learning_rate": 6.491140021910399e-06, + "loss": 0.0297, + "step": 26054 + }, + { + "epoch": 3.089647812166489, + "grad_norm": 0.4924604747150375, + "learning_rate": 6.489526544257893e-06, + "loss": 0.0326, + "step": 26055 + }, + { + "epoch": 3.0897663939286137, + "grad_norm": 0.6617313019587737, + "learning_rate": 6.4879132372492714e-06, + "loss": 0.0395, + "step": 26056 + }, + { + "epoch": 3.089884975690739, + "grad_norm": 0.5708943102887274, + "learning_rate": 6.486300100899379e-06, + "loss": 0.0317, + "step": 26057 + }, + { + "epoch": 3.0900035574528637, + "grad_norm": 0.5943811059048867, + "learning_rate": 6.4846871352230945e-06, + "loss": 0.0299, + "step": 26058 + }, + { + "epoch": 3.090122139214989, + "grad_norm": 0.6235873308204609, + "learning_rate": 6.4830743402352925e-06, + "loss": 0.0356, + "step": 26059 + }, + { + "epoch": 3.0902407209771137, + "grad_norm": 0.6517355605667484, + "learning_rate": 6.4814617159508375e-06, + "loss": 0.0327, + "step": 26060 + }, + { + "epoch": 3.090359302739239, + "grad_norm": 0.6607459360935296, + "learning_rate": 6.479849262384605e-06, + "loss": 0.0293, + "step": 26061 + }, + { + "epoch": 3.0904778845013636, + "grad_norm": 0.34996653675546363, + "learning_rate": 6.478236979551441e-06, + "loss": 0.0184, + "step": 26062 + }, + { + "epoch": 3.090596466263489, + "grad_norm": 0.5253416480052093, + "learning_rate": 6.476624867466222e-06, + "loss": 0.0407, + "step": 26063 + }, + { + "epoch": 3.0907150480256136, + "grad_norm": 0.46496221764541534, + "learning_rate": 6.475012926143806e-06, + "loss": 0.024, + "step": 26064 + }, + { + "epoch": 3.090833629787739, + "grad_norm": 0.6791902826723922, + "learning_rate": 6.47340115559906e-06, + "loss": 0.0359, + "step": 26065 + }, + { + "epoch": 3.0909522115498635, + "grad_norm": 0.435439009503598, + "learning_rate": 6.47178955584683e-06, + "loss": 0.023, + "step": 26066 + }, + { + "epoch": 3.0910707933119888, + "grad_norm": 0.4099788133572411, + "learning_rate": 6.47017812690198e-06, + "loss": 0.02, + "step": 26067 + }, + { + "epoch": 3.0911893750741135, + "grad_norm": 0.46143631459436923, + "learning_rate": 6.468566868779366e-06, + "loss": 0.0196, + "step": 26068 + }, + { + "epoch": 3.0913079568362387, + "grad_norm": 0.5754176118179641, + "learning_rate": 6.4669557814938385e-06, + "loss": 0.0259, + "step": 26069 + }, + { + "epoch": 3.0914265385983635, + "grad_norm": 0.35391677572652575, + "learning_rate": 6.465344865060252e-06, + "loss": 0.0165, + "step": 26070 + }, + { + "epoch": 3.0915451203604887, + "grad_norm": 0.4467724976243358, + "learning_rate": 6.463734119493459e-06, + "loss": 0.025, + "step": 26071 + }, + { + "epoch": 3.0916637021226134, + "grad_norm": 0.34964913588438584, + "learning_rate": 6.462123544808313e-06, + "loss": 0.0184, + "step": 26072 + }, + { + "epoch": 3.0917822838847386, + "grad_norm": 0.6823587938764629, + "learning_rate": 6.460513141019647e-06, + "loss": 0.0322, + "step": 26073 + }, + { + "epoch": 3.0919008656468634, + "grad_norm": 0.74573373887155, + "learning_rate": 6.458902908142317e-06, + "loss": 0.039, + "step": 26074 + }, + { + "epoch": 3.0920194474089886, + "grad_norm": 0.40816573136992246, + "learning_rate": 6.457292846191165e-06, + "loss": 0.0208, + "step": 26075 + }, + { + "epoch": 3.0921380291711134, + "grad_norm": 0.6587267117361991, + "learning_rate": 6.455682955181041e-06, + "loss": 0.0362, + "step": 26076 + }, + { + "epoch": 3.0922566109332386, + "grad_norm": 0.8188778750560314, + "learning_rate": 6.454073235126773e-06, + "loss": 0.0568, + "step": 26077 + }, + { + "epoch": 3.0923751926953633, + "grad_norm": 0.652906726667297, + "learning_rate": 6.452463686043208e-06, + "loss": 0.0324, + "step": 26078 + }, + { + "epoch": 3.0924937744574885, + "grad_norm": 0.48402914168977423, + "learning_rate": 6.450854307945181e-06, + "loss": 0.0258, + "step": 26079 + }, + { + "epoch": 3.0926123562196133, + "grad_norm": 0.8139689697659419, + "learning_rate": 6.449245100847534e-06, + "loss": 0.0398, + "step": 26080 + }, + { + "epoch": 3.0927309379817385, + "grad_norm": 0.5386061958055983, + "learning_rate": 6.447636064765103e-06, + "loss": 0.0278, + "step": 26081 + }, + { + "epoch": 3.0928495197438632, + "grad_norm": 0.8552742057520043, + "learning_rate": 6.4460271997127036e-06, + "loss": 0.0411, + "step": 26082 + }, + { + "epoch": 3.0929681015059884, + "grad_norm": 0.5608867718886952, + "learning_rate": 6.444418505705197e-06, + "loss": 0.0247, + "step": 26083 + }, + { + "epoch": 3.093086683268113, + "grad_norm": 0.4739128024999165, + "learning_rate": 6.442809982757389e-06, + "loss": 0.0252, + "step": 26084 + }, + { + "epoch": 3.0932052650302384, + "grad_norm": 0.7288042162777812, + "learning_rate": 6.4412016308841165e-06, + "loss": 0.0348, + "step": 26085 + }, + { + "epoch": 3.093323846792363, + "grad_norm": 0.5094538540042433, + "learning_rate": 6.4395934501002064e-06, + "loss": 0.026, + "step": 26086 + }, + { + "epoch": 3.0934424285544884, + "grad_norm": 0.465996766650947, + "learning_rate": 6.437985440420491e-06, + "loss": 0.025, + "step": 26087 + }, + { + "epoch": 3.093561010316613, + "grad_norm": 0.517471552499677, + "learning_rate": 6.436377601859784e-06, + "loss": 0.0332, + "step": 26088 + }, + { + "epoch": 3.0936795920787383, + "grad_norm": 0.5555464629530564, + "learning_rate": 6.434769934432908e-06, + "loss": 0.0272, + "step": 26089 + }, + { + "epoch": 3.093798173840863, + "grad_norm": 0.5764550458155953, + "learning_rate": 6.433162438154686e-06, + "loss": 0.0329, + "step": 26090 + }, + { + "epoch": 3.0939167556029883, + "grad_norm": 0.6175872489853209, + "learning_rate": 6.4315551130399415e-06, + "loss": 0.0397, + "step": 26091 + }, + { + "epoch": 3.094035337365113, + "grad_norm": 0.575816710613068, + "learning_rate": 6.429947959103494e-06, + "loss": 0.0237, + "step": 26092 + }, + { + "epoch": 3.0941539191272383, + "grad_norm": 0.6492289104129336, + "learning_rate": 6.428340976360139e-06, + "loss": 0.0293, + "step": 26093 + }, + { + "epoch": 3.094272500889363, + "grad_norm": 0.6265035535082848, + "learning_rate": 6.426734164824722e-06, + "loss": 0.0304, + "step": 26094 + }, + { + "epoch": 3.094391082651488, + "grad_norm": 0.4358127569980879, + "learning_rate": 6.4251275245120314e-06, + "loss": 0.0214, + "step": 26095 + }, + { + "epoch": 3.0945096644136134, + "grad_norm": 0.5054357215257627, + "learning_rate": 6.423521055436887e-06, + "loss": 0.0196, + "step": 26096 + }, + { + "epoch": 3.094628246175738, + "grad_norm": 0.5745817211277335, + "learning_rate": 6.4219147576141e-06, + "loss": 0.0319, + "step": 26097 + }, + { + "epoch": 3.094746827937863, + "grad_norm": 0.3563434163305405, + "learning_rate": 6.420308631058477e-06, + "loss": 0.0125, + "step": 26098 + }, + { + "epoch": 3.094865409699988, + "grad_norm": 0.7071026399693929, + "learning_rate": 6.4187026757848325e-06, + "loss": 0.0319, + "step": 26099 + }, + { + "epoch": 3.0949839914621133, + "grad_norm": 0.7004558454359763, + "learning_rate": 6.417096891807947e-06, + "loss": 0.0353, + "step": 26100 + }, + { + "epoch": 3.095102573224238, + "grad_norm": 0.7292360374990524, + "learning_rate": 6.415491279142655e-06, + "loss": 0.0413, + "step": 26101 + }, + { + "epoch": 3.0952211549863633, + "grad_norm": 0.7670420474174356, + "learning_rate": 6.413885837803738e-06, + "loss": 0.0445, + "step": 26102 + }, + { + "epoch": 3.095339736748488, + "grad_norm": 0.4713770252526953, + "learning_rate": 6.412280567806006e-06, + "loss": 0.0271, + "step": 26103 + }, + { + "epoch": 3.0954583185106133, + "grad_norm": 0.4099736588742806, + "learning_rate": 6.41067546916424e-06, + "loss": 0.0187, + "step": 26104 + }, + { + "epoch": 3.095576900272738, + "grad_norm": 0.3314777783927386, + "learning_rate": 6.4090705418932665e-06, + "loss": 0.0205, + "step": 26105 + }, + { + "epoch": 3.095695482034863, + "grad_norm": 0.4781650328230959, + "learning_rate": 6.407465786007857e-06, + "loss": 0.0204, + "step": 26106 + }, + { + "epoch": 3.095814063796988, + "grad_norm": 0.6459632347641338, + "learning_rate": 6.405861201522812e-06, + "loss": 0.0515, + "step": 26107 + }, + { + "epoch": 3.095932645559113, + "grad_norm": 0.6143532582629664, + "learning_rate": 6.404256788452928e-06, + "loss": 0.0295, + "step": 26108 + }, + { + "epoch": 3.096051227321238, + "grad_norm": 0.37624946306459617, + "learning_rate": 6.40265254681299e-06, + "loss": 0.0167, + "step": 26109 + }, + { + "epoch": 3.096169809083363, + "grad_norm": 0.47942195552259165, + "learning_rate": 6.401048476617799e-06, + "loss": 0.0311, + "step": 26110 + }, + { + "epoch": 3.096288390845488, + "grad_norm": 0.46734469051664285, + "learning_rate": 6.399444577882116e-06, + "loss": 0.0248, + "step": 26111 + }, + { + "epoch": 3.096406972607613, + "grad_norm": 0.4174875944450662, + "learning_rate": 6.397840850620762e-06, + "loss": 0.0166, + "step": 26112 + }, + { + "epoch": 3.096525554369738, + "grad_norm": 0.7021774639070125, + "learning_rate": 6.396237294848495e-06, + "loss": 0.0368, + "step": 26113 + }, + { + "epoch": 3.096644136131863, + "grad_norm": 0.5650737757798722, + "learning_rate": 6.394633910580117e-06, + "loss": 0.0347, + "step": 26114 + }, + { + "epoch": 3.096762717893988, + "grad_norm": 0.42941695748394354, + "learning_rate": 6.39303069783038e-06, + "loss": 0.0249, + "step": 26115 + }, + { + "epoch": 3.096881299656113, + "grad_norm": 0.3815517075895673, + "learning_rate": 6.391427656614099e-06, + "loss": 0.0227, + "step": 26116 + }, + { + "epoch": 3.096999881418238, + "grad_norm": 0.42865949722174695, + "learning_rate": 6.38982478694603e-06, + "loss": 0.0237, + "step": 26117 + }, + { + "epoch": 3.097118463180363, + "grad_norm": 0.6148091098283479, + "learning_rate": 6.3882220888409535e-06, + "loss": 0.0352, + "step": 26118 + }, + { + "epoch": 3.0972370449424877, + "grad_norm": 0.9776604886746922, + "learning_rate": 6.386619562313645e-06, + "loss": 0.0252, + "step": 26119 + }, + { + "epoch": 3.097355626704613, + "grad_norm": 0.5186746240473057, + "learning_rate": 6.385017207378882e-06, + "loss": 0.0289, + "step": 26120 + }, + { + "epoch": 3.0974742084667377, + "grad_norm": 0.35456380628504547, + "learning_rate": 6.383415024051437e-06, + "loss": 0.017, + "step": 26121 + }, + { + "epoch": 3.097592790228863, + "grad_norm": 0.4721376671801722, + "learning_rate": 6.381813012346072e-06, + "loss": 0.0187, + "step": 26122 + }, + { + "epoch": 3.0977113719909877, + "grad_norm": 0.500040686280097, + "learning_rate": 6.380211172277559e-06, + "loss": 0.0206, + "step": 26123 + }, + { + "epoch": 3.097829953753113, + "grad_norm": 0.5783981179098923, + "learning_rate": 6.378609503860664e-06, + "loss": 0.0257, + "step": 26124 + }, + { + "epoch": 3.0979485355152376, + "grad_norm": 0.718243938403343, + "learning_rate": 6.3770080071101634e-06, + "loss": 0.032, + "step": 26125 + }, + { + "epoch": 3.098067117277363, + "grad_norm": 0.46007860692600205, + "learning_rate": 6.375406682040797e-06, + "loss": 0.0259, + "step": 26126 + }, + { + "epoch": 3.0981856990394876, + "grad_norm": 0.2682857679339162, + "learning_rate": 6.3738055286673565e-06, + "loss": 0.013, + "step": 26127 + }, + { + "epoch": 3.098304280801613, + "grad_norm": 0.5876134087215134, + "learning_rate": 6.372204547004582e-06, + "loss": 0.0393, + "step": 26128 + }, + { + "epoch": 3.0984228625637376, + "grad_norm": 0.3860610685957886, + "learning_rate": 6.370603737067237e-06, + "loss": 0.023, + "step": 26129 + }, + { + "epoch": 3.0985414443258628, + "grad_norm": 0.5742058485515639, + "learning_rate": 6.36900309887008e-06, + "loss": 0.0201, + "step": 26130 + }, + { + "epoch": 3.0986600260879875, + "grad_norm": 0.5404164781061723, + "learning_rate": 6.3674026324278715e-06, + "loss": 0.0353, + "step": 26131 + }, + { + "epoch": 3.0987786078501127, + "grad_norm": 0.6375048028861412, + "learning_rate": 6.365802337755364e-06, + "loss": 0.0339, + "step": 26132 + }, + { + "epoch": 3.0988971896122375, + "grad_norm": 0.4719265436184998, + "learning_rate": 6.364202214867304e-06, + "loss": 0.0369, + "step": 26133 + }, + { + "epoch": 3.0990157713743627, + "grad_norm": 0.5649501941399752, + "learning_rate": 6.362602263778447e-06, + "loss": 0.0301, + "step": 26134 + }, + { + "epoch": 3.0991343531364874, + "grad_norm": 0.5069996707479983, + "learning_rate": 6.361002484503542e-06, + "loss": 0.0229, + "step": 26135 + }, + { + "epoch": 3.0992529348986126, + "grad_norm": 0.4811692753328815, + "learning_rate": 6.359402877057344e-06, + "loss": 0.0253, + "step": 26136 + }, + { + "epoch": 3.0993715166607374, + "grad_norm": 0.4527373067829245, + "learning_rate": 6.357803441454585e-06, + "loss": 0.0214, + "step": 26137 + }, + { + "epoch": 3.0994900984228626, + "grad_norm": 0.543717208047477, + "learning_rate": 6.356204177710021e-06, + "loss": 0.0256, + "step": 26138 + }, + { + "epoch": 3.0996086801849874, + "grad_norm": 0.29689793949334753, + "learning_rate": 6.354605085838389e-06, + "loss": 0.015, + "step": 26139 + }, + { + "epoch": 3.0997272619471126, + "grad_norm": 0.4255895091790392, + "learning_rate": 6.353006165854433e-06, + "loss": 0.0256, + "step": 26140 + }, + { + "epoch": 3.0998458437092373, + "grad_norm": 0.4924913820572335, + "learning_rate": 6.351407417772895e-06, + "loss": 0.023, + "step": 26141 + }, + { + "epoch": 3.0999644254713625, + "grad_norm": 0.2611360846122386, + "learning_rate": 6.349808841608512e-06, + "loss": 0.0132, + "step": 26142 + }, + { + "epoch": 3.1000830072334873, + "grad_norm": 0.6231538052656527, + "learning_rate": 6.3482104373760285e-06, + "loss": 0.0268, + "step": 26143 + }, + { + "epoch": 3.1002015889956125, + "grad_norm": 0.5581988894052708, + "learning_rate": 6.346612205090166e-06, + "loss": 0.0343, + "step": 26144 + }, + { + "epoch": 3.1003201707577372, + "grad_norm": 0.467943269195353, + "learning_rate": 6.345014144765665e-06, + "loss": 0.0246, + "step": 26145 + }, + { + "epoch": 3.1004387525198625, + "grad_norm": 0.39490299166866516, + "learning_rate": 6.343416256417256e-06, + "loss": 0.0216, + "step": 26146 + }, + { + "epoch": 3.100557334281987, + "grad_norm": 0.40526310950422917, + "learning_rate": 6.341818540059672e-06, + "loss": 0.0189, + "step": 26147 + }, + { + "epoch": 3.1006759160441124, + "grad_norm": 0.9096834403711268, + "learning_rate": 6.340220995707646e-06, + "loss": 0.0494, + "step": 26148 + }, + { + "epoch": 3.1007944978062376, + "grad_norm": 0.7194759164932298, + "learning_rate": 6.338623623375886e-06, + "loss": 0.0315, + "step": 26149 + }, + { + "epoch": 3.1009130795683624, + "grad_norm": 0.9069395760537126, + "learning_rate": 6.337026423079148e-06, + "loss": 0.0345, + "step": 26150 + }, + { + "epoch": 3.1010316613304876, + "grad_norm": 0.2881014049473789, + "learning_rate": 6.3354293948321345e-06, + "loss": 0.0153, + "step": 26151 + }, + { + "epoch": 3.1011502430926123, + "grad_norm": 0.412915766422724, + "learning_rate": 6.333832538649578e-06, + "loss": 0.0199, + "step": 26152 + }, + { + "epoch": 3.1012688248547375, + "grad_norm": 0.7464201383240343, + "learning_rate": 6.332235854546184e-06, + "loss": 0.0355, + "step": 26153 + }, + { + "epoch": 3.1013874066168623, + "grad_norm": 0.4052856316928445, + "learning_rate": 6.330639342536696e-06, + "loss": 0.0219, + "step": 26154 + }, + { + "epoch": 3.1015059883789875, + "grad_norm": 0.46503804348169514, + "learning_rate": 6.329043002635812e-06, + "loss": 0.021, + "step": 26155 + }, + { + "epoch": 3.1016245701411123, + "grad_norm": 0.6772933924026678, + "learning_rate": 6.327446834858259e-06, + "loss": 0.0301, + "step": 26156 + }, + { + "epoch": 3.1017431519032375, + "grad_norm": 0.4369348388843227, + "learning_rate": 6.3258508392187445e-06, + "loss": 0.0206, + "step": 26157 + }, + { + "epoch": 3.101861733665362, + "grad_norm": 0.4568157068083222, + "learning_rate": 6.324255015731986e-06, + "loss": 0.0272, + "step": 26158 + }, + { + "epoch": 3.1019803154274874, + "grad_norm": 0.7929178633046083, + "learning_rate": 6.322659364412703e-06, + "loss": 0.044, + "step": 26159 + }, + { + "epoch": 3.102098897189612, + "grad_norm": 0.5468994257523234, + "learning_rate": 6.321063885275583e-06, + "loss": 0.0265, + "step": 26160 + }, + { + "epoch": 3.1022174789517374, + "grad_norm": 0.3919872481637659, + "learning_rate": 6.319468578335361e-06, + "loss": 0.0197, + "step": 26161 + }, + { + "epoch": 3.102336060713862, + "grad_norm": 0.5259115291583194, + "learning_rate": 6.317873443606726e-06, + "loss": 0.0293, + "step": 26162 + }, + { + "epoch": 3.1024546424759873, + "grad_norm": 0.45732702986419904, + "learning_rate": 6.316278481104393e-06, + "loss": 0.0198, + "step": 26163 + }, + { + "epoch": 3.102573224238112, + "grad_norm": 0.49014440440116797, + "learning_rate": 6.314683690843048e-06, + "loss": 0.0294, + "step": 26164 + }, + { + "epoch": 3.1026918060002373, + "grad_norm": 0.6621076537610031, + "learning_rate": 6.313089072837419e-06, + "loss": 0.0485, + "step": 26165 + }, + { + "epoch": 3.102810387762362, + "grad_norm": 0.4798318940313289, + "learning_rate": 6.311494627102188e-06, + "loss": 0.0167, + "step": 26166 + }, + { + "epoch": 3.1029289695244873, + "grad_norm": 0.7725896287506294, + "learning_rate": 6.309900353652056e-06, + "loss": 0.0272, + "step": 26167 + }, + { + "epoch": 3.103047551286612, + "grad_norm": 0.4707758511473212, + "learning_rate": 6.308306252501728e-06, + "loss": 0.0237, + "step": 26168 + }, + { + "epoch": 3.1031661330487372, + "grad_norm": 0.47674213614228134, + "learning_rate": 6.3067123236658924e-06, + "loss": 0.0214, + "step": 26169 + }, + { + "epoch": 3.103284714810862, + "grad_norm": 0.4040571008650443, + "learning_rate": 6.305118567159254e-06, + "loss": 0.015, + "step": 26170 + }, + { + "epoch": 3.103403296572987, + "grad_norm": 0.5013733217426917, + "learning_rate": 6.303524982996481e-06, + "loss": 0.0211, + "step": 26171 + }, + { + "epoch": 3.103521878335112, + "grad_norm": 0.5708457959477325, + "learning_rate": 6.3019315711922985e-06, + "loss": 0.0297, + "step": 26172 + }, + { + "epoch": 3.103640460097237, + "grad_norm": 0.4303491387378306, + "learning_rate": 6.300338331761368e-06, + "loss": 0.0257, + "step": 26173 + }, + { + "epoch": 3.103759041859362, + "grad_norm": 0.5712810381636688, + "learning_rate": 6.298745264718395e-06, + "loss": 0.0238, + "step": 26174 + }, + { + "epoch": 3.103877623621487, + "grad_norm": 0.6337820052975662, + "learning_rate": 6.297152370078044e-06, + "loss": 0.0252, + "step": 26175 + }, + { + "epoch": 3.103996205383612, + "grad_norm": 0.3697059540460847, + "learning_rate": 6.295559647855026e-06, + "loss": 0.023, + "step": 26176 + }, + { + "epoch": 3.104114787145737, + "grad_norm": 0.3406037064297466, + "learning_rate": 6.2939670980640044e-06, + "loss": 0.0166, + "step": 26177 + }, + { + "epoch": 3.104233368907862, + "grad_norm": 0.3465394481968439, + "learning_rate": 6.292374720719668e-06, + "loss": 0.0151, + "step": 26178 + }, + { + "epoch": 3.104351950669987, + "grad_norm": 0.5471383595162395, + "learning_rate": 6.290782515836693e-06, + "loss": 0.0253, + "step": 26179 + }, + { + "epoch": 3.104470532432112, + "grad_norm": 0.5977196001023531, + "learning_rate": 6.2891904834297635e-06, + "loss": 0.034, + "step": 26180 + }, + { + "epoch": 3.104589114194237, + "grad_norm": 0.322087546047683, + "learning_rate": 6.287598623513561e-06, + "loss": 0.0157, + "step": 26181 + }, + { + "epoch": 3.1047076959563618, + "grad_norm": 0.515620873152897, + "learning_rate": 6.286006936102737e-06, + "loss": 0.0241, + "step": 26182 + }, + { + "epoch": 3.104826277718487, + "grad_norm": 1.105780492860416, + "learning_rate": 6.284415421211995e-06, + "loss": 0.0373, + "step": 26183 + }, + { + "epoch": 3.1049448594806117, + "grad_norm": 0.6671472947546507, + "learning_rate": 6.282824078855984e-06, + "loss": 0.0366, + "step": 26184 + }, + { + "epoch": 3.105063441242737, + "grad_norm": 0.3657149944095711, + "learning_rate": 6.28123290904939e-06, + "loss": 0.0148, + "step": 26185 + }, + { + "epoch": 3.1051820230048617, + "grad_norm": 0.4677163757661089, + "learning_rate": 6.279641911806861e-06, + "loss": 0.0263, + "step": 26186 + }, + { + "epoch": 3.105300604766987, + "grad_norm": 0.6634729342814972, + "learning_rate": 6.278051087143089e-06, + "loss": 0.0358, + "step": 26187 + }, + { + "epoch": 3.1054191865291116, + "grad_norm": 0.6650557226389416, + "learning_rate": 6.2764604350727226e-06, + "loss": 0.0269, + "step": 26188 + }, + { + "epoch": 3.105537768291237, + "grad_norm": 0.5158551401085014, + "learning_rate": 6.27486995561043e-06, + "loss": 0.0277, + "step": 26189 + }, + { + "epoch": 3.1056563500533616, + "grad_norm": 0.3636550865915899, + "learning_rate": 6.273279648770874e-06, + "loss": 0.0126, + "step": 26190 + }, + { + "epoch": 3.105774931815487, + "grad_norm": 0.548194025131478, + "learning_rate": 6.271689514568715e-06, + "loss": 0.027, + "step": 26191 + }, + { + "epoch": 3.1058935135776116, + "grad_norm": 0.7543154328423592, + "learning_rate": 6.270099553018621e-06, + "loss": 0.0362, + "step": 26192 + }, + { + "epoch": 3.1060120953397368, + "grad_norm": 0.8585245432987427, + "learning_rate": 6.268509764135233e-06, + "loss": 0.0384, + "step": 26193 + }, + { + "epoch": 3.1061306771018615, + "grad_norm": 0.5438979068703628, + "learning_rate": 6.266920147933214e-06, + "loss": 0.0361, + "step": 26194 + }, + { + "epoch": 3.1062492588639867, + "grad_norm": 0.47608983359420165, + "learning_rate": 6.265330704427219e-06, + "loss": 0.0221, + "step": 26195 + }, + { + "epoch": 3.1063678406261115, + "grad_norm": 0.5851248257257475, + "learning_rate": 6.263741433631912e-06, + "loss": 0.0334, + "step": 26196 + }, + { + "epoch": 3.1064864223882367, + "grad_norm": 0.7180837176882194, + "learning_rate": 6.2621523355619136e-06, + "loss": 0.0403, + "step": 26197 + }, + { + "epoch": 3.106605004150362, + "grad_norm": 0.5668140545060149, + "learning_rate": 6.260563410231909e-06, + "loss": 0.0258, + "step": 26198 + }, + { + "epoch": 3.1067235859124867, + "grad_norm": 0.49279006628514976, + "learning_rate": 6.258974657656525e-06, + "loss": 0.0239, + "step": 26199 + }, + { + "epoch": 3.106842167674612, + "grad_norm": 0.3602721722995767, + "learning_rate": 6.257386077850411e-06, + "loss": 0.017, + "step": 26200 + }, + { + "epoch": 3.1069607494367366, + "grad_norm": 0.43464324767864015, + "learning_rate": 6.255797670828215e-06, + "loss": 0.014, + "step": 26201 + }, + { + "epoch": 3.107079331198862, + "grad_norm": 0.7187527548542603, + "learning_rate": 6.2542094366045775e-06, + "loss": 0.031, + "step": 26202 + }, + { + "epoch": 3.1071979129609866, + "grad_norm": 0.5209400554885527, + "learning_rate": 6.252621375194148e-06, + "loss": 0.0224, + "step": 26203 + }, + { + "epoch": 3.1073164947231118, + "grad_norm": 0.8959580971168821, + "learning_rate": 6.251033486611554e-06, + "loss": 0.0462, + "step": 26204 + }, + { + "epoch": 3.1074350764852365, + "grad_norm": 0.6117897944961294, + "learning_rate": 6.249445770871437e-06, + "loss": 0.0331, + "step": 26205 + }, + { + "epoch": 3.1075536582473617, + "grad_norm": 0.4864062189881036, + "learning_rate": 6.24785822798844e-06, + "loss": 0.0277, + "step": 26206 + }, + { + "epoch": 3.1076722400094865, + "grad_norm": 0.34375536332310835, + "learning_rate": 6.246270857977193e-06, + "loss": 0.017, + "step": 26207 + }, + { + "epoch": 3.1077908217716117, + "grad_norm": 0.5515455651915314, + "learning_rate": 6.244683660852341e-06, + "loss": 0.024, + "step": 26208 + }, + { + "epoch": 3.1079094035337365, + "grad_norm": 0.6770005272404406, + "learning_rate": 6.24309663662849e-06, + "loss": 0.0316, + "step": 26209 + }, + { + "epoch": 3.1080279852958617, + "grad_norm": 0.8336155106196668, + "learning_rate": 6.241509785320298e-06, + "loss": 0.0392, + "step": 26210 + }, + { + "epoch": 3.1081465670579864, + "grad_norm": 0.8887607302091973, + "learning_rate": 6.23992310694238e-06, + "loss": 0.0313, + "step": 26211 + }, + { + "epoch": 3.1082651488201116, + "grad_norm": 0.30569133403223414, + "learning_rate": 6.238336601509365e-06, + "loss": 0.0123, + "step": 26212 + }, + { + "epoch": 3.1083837305822364, + "grad_norm": 0.6165676113326908, + "learning_rate": 6.2367502690358795e-06, + "loss": 0.0269, + "step": 26213 + }, + { + "epoch": 3.1085023123443616, + "grad_norm": 0.6718343721116631, + "learning_rate": 6.235164109536554e-06, + "loss": 0.0311, + "step": 26214 + }, + { + "epoch": 3.1086208941064863, + "grad_norm": 0.35416484315472335, + "learning_rate": 6.233578123025996e-06, + "loss": 0.0208, + "step": 26215 + }, + { + "epoch": 3.1087394758686115, + "grad_norm": 0.49332196230191394, + "learning_rate": 6.231992309518836e-06, + "loss": 0.0272, + "step": 26216 + }, + { + "epoch": 3.1088580576307363, + "grad_norm": 0.5688896801795142, + "learning_rate": 6.230406669029693e-06, + "loss": 0.0313, + "step": 26217 + }, + { + "epoch": 3.1089766393928615, + "grad_norm": 0.616522881474446, + "learning_rate": 6.228821201573182e-06, + "loss": 0.0201, + "step": 26218 + }, + { + "epoch": 3.1090952211549863, + "grad_norm": 0.63616730972599, + "learning_rate": 6.227235907163928e-06, + "loss": 0.0365, + "step": 26219 + }, + { + "epoch": 3.1092138029171115, + "grad_norm": 0.4397305539556511, + "learning_rate": 6.225650785816525e-06, + "loss": 0.0252, + "step": 26220 + }, + { + "epoch": 3.1093323846792362, + "grad_norm": 0.46680021410540623, + "learning_rate": 6.2240658375456145e-06, + "loss": 0.0197, + "step": 26221 + }, + { + "epoch": 3.1094509664413614, + "grad_norm": 0.568702417637025, + "learning_rate": 6.222481062365784e-06, + "loss": 0.0341, + "step": 26222 + }, + { + "epoch": 3.109569548203486, + "grad_norm": 0.502489025253057, + "learning_rate": 6.220896460291658e-06, + "loss": 0.0126, + "step": 26223 + }, + { + "epoch": 3.1096881299656114, + "grad_norm": 0.45140662597873343, + "learning_rate": 6.219312031337823e-06, + "loss": 0.0278, + "step": 26224 + }, + { + "epoch": 3.109806711727736, + "grad_norm": 0.7083775000050564, + "learning_rate": 6.21772777551892e-06, + "loss": 0.0288, + "step": 26225 + }, + { + "epoch": 3.1099252934898614, + "grad_norm": 0.6953384466977269, + "learning_rate": 6.216143692849522e-06, + "loss": 0.0383, + "step": 26226 + }, + { + "epoch": 3.110043875251986, + "grad_norm": 0.37717068588820113, + "learning_rate": 6.214559783344248e-06, + "loss": 0.0302, + "step": 26227 + }, + { + "epoch": 3.1101624570141113, + "grad_norm": 0.6486995332270011, + "learning_rate": 6.212976047017693e-06, + "loss": 0.027, + "step": 26228 + }, + { + "epoch": 3.110281038776236, + "grad_norm": 0.43221895613542033, + "learning_rate": 6.211392483884465e-06, + "loss": 0.0257, + "step": 26229 + }, + { + "epoch": 3.1103996205383613, + "grad_norm": 0.4420670967039843, + "learning_rate": 6.209809093959162e-06, + "loss": 0.0257, + "step": 26230 + }, + { + "epoch": 3.110518202300486, + "grad_norm": 0.5846597177599898, + "learning_rate": 6.208225877256365e-06, + "loss": 0.0245, + "step": 26231 + }, + { + "epoch": 3.1106367840626112, + "grad_norm": 0.4132947153591283, + "learning_rate": 6.206642833790697e-06, + "loss": 0.0178, + "step": 26232 + }, + { + "epoch": 3.110755365824736, + "grad_norm": 0.7255629429753506, + "learning_rate": 6.205059963576726e-06, + "loss": 0.0409, + "step": 26233 + }, + { + "epoch": 3.110873947586861, + "grad_norm": 0.45664943772320615, + "learning_rate": 6.203477266629063e-06, + "loss": 0.0309, + "step": 26234 + }, + { + "epoch": 3.110992529348986, + "grad_norm": 0.4277818756474455, + "learning_rate": 6.201894742962272e-06, + "loss": 0.0237, + "step": 26235 + }, + { + "epoch": 3.111111111111111, + "grad_norm": 0.4685305222137804, + "learning_rate": 6.200312392590979e-06, + "loss": 0.029, + "step": 26236 + }, + { + "epoch": 3.111229692873236, + "grad_norm": 0.7582771968955652, + "learning_rate": 6.198730215529741e-06, + "loss": 0.045, + "step": 26237 + }, + { + "epoch": 3.111348274635361, + "grad_norm": 0.42218489555420885, + "learning_rate": 6.197148211793158e-06, + "loss": 0.0291, + "step": 26238 + }, + { + "epoch": 3.111466856397486, + "grad_norm": 0.7115387031444904, + "learning_rate": 6.1955663813958095e-06, + "loss": 0.0354, + "step": 26239 + }, + { + "epoch": 3.111585438159611, + "grad_norm": 0.5283888267842933, + "learning_rate": 6.193984724352278e-06, + "loss": 0.026, + "step": 26240 + }, + { + "epoch": 3.111704019921736, + "grad_norm": 0.7192516683336835, + "learning_rate": 6.1924032406771555e-06, + "loss": 0.0283, + "step": 26241 + }, + { + "epoch": 3.111822601683861, + "grad_norm": 0.7288191672369203, + "learning_rate": 6.190821930384993e-06, + "loss": 0.0341, + "step": 26242 + }, + { + "epoch": 3.111941183445986, + "grad_norm": 0.46128648150005486, + "learning_rate": 6.189240793490406e-06, + "loss": 0.0197, + "step": 26243 + }, + { + "epoch": 3.112059765208111, + "grad_norm": 0.3410948768404213, + "learning_rate": 6.187659830007944e-06, + "loss": 0.0182, + "step": 26244 + }, + { + "epoch": 3.1121783469702358, + "grad_norm": 0.5240195228257548, + "learning_rate": 6.1860790399521945e-06, + "loss": 0.0246, + "step": 26245 + }, + { + "epoch": 3.112296928732361, + "grad_norm": 0.4001617304394909, + "learning_rate": 6.184498423337712e-06, + "loss": 0.0195, + "step": 26246 + }, + { + "epoch": 3.112415510494486, + "grad_norm": 0.727974674543846, + "learning_rate": 6.182917980179095e-06, + "loss": 0.0542, + "step": 26247 + }, + { + "epoch": 3.112534092256611, + "grad_norm": 0.3889790685633258, + "learning_rate": 6.181337710490892e-06, + "loss": 0.0222, + "step": 26248 + }, + { + "epoch": 3.112652674018736, + "grad_norm": 0.5382050914405286, + "learning_rate": 6.179757614287676e-06, + "loss": 0.0286, + "step": 26249 + }, + { + "epoch": 3.112771255780861, + "grad_norm": 0.5093180945173544, + "learning_rate": 6.178177691584019e-06, + "loss": 0.0299, + "step": 26250 + }, + { + "epoch": 3.112889837542986, + "grad_norm": 0.7609135769509627, + "learning_rate": 6.1765979423944825e-06, + "loss": 0.0493, + "step": 26251 + }, + { + "epoch": 3.113008419305111, + "grad_norm": 0.5791319427426144, + "learning_rate": 6.175018366733637e-06, + "loss": 0.0262, + "step": 26252 + }, + { + "epoch": 3.113127001067236, + "grad_norm": 0.631420043736167, + "learning_rate": 6.173438964616021e-06, + "loss": 0.0386, + "step": 26253 + }, + { + "epoch": 3.113245582829361, + "grad_norm": 1.081654847200261, + "learning_rate": 6.171859736056229e-06, + "loss": 0.052, + "step": 26254 + }, + { + "epoch": 3.113364164591486, + "grad_norm": 0.3129093976209349, + "learning_rate": 6.170280681068793e-06, + "loss": 0.0159, + "step": 26255 + }, + { + "epoch": 3.1134827463536108, + "grad_norm": 0.40100901051509386, + "learning_rate": 6.168701799668278e-06, + "loss": 0.0219, + "step": 26256 + }, + { + "epoch": 3.113601328115736, + "grad_norm": 0.6646319086608931, + "learning_rate": 6.167123091869239e-06, + "loss": 0.0366, + "step": 26257 + }, + { + "epoch": 3.1137199098778607, + "grad_norm": 0.5138584603742744, + "learning_rate": 6.165544557686234e-06, + "loss": 0.0265, + "step": 26258 + }, + { + "epoch": 3.113838491639986, + "grad_norm": 0.4781233901173742, + "learning_rate": 6.163966197133817e-06, + "loss": 0.0229, + "step": 26259 + }, + { + "epoch": 3.1139570734021107, + "grad_norm": 0.823658527505835, + "learning_rate": 6.1623880102265245e-06, + "loss": 0.0495, + "step": 26260 + }, + { + "epoch": 3.114075655164236, + "grad_norm": 0.37239294454884825, + "learning_rate": 6.160809996978917e-06, + "loss": 0.0212, + "step": 26261 + }, + { + "epoch": 3.1141942369263607, + "grad_norm": 0.33186510048859447, + "learning_rate": 6.15923215740554e-06, + "loss": 0.0179, + "step": 26262 + }, + { + "epoch": 3.114312818688486, + "grad_norm": 0.7190116307334551, + "learning_rate": 6.157654491520942e-06, + "loss": 0.0362, + "step": 26263 + }, + { + "epoch": 3.1144314004506106, + "grad_norm": 0.48536041226647314, + "learning_rate": 6.156076999339658e-06, + "loss": 0.0295, + "step": 26264 + }, + { + "epoch": 3.114549982212736, + "grad_norm": 0.6586722441334608, + "learning_rate": 6.1544996808762384e-06, + "loss": 0.0269, + "step": 26265 + }, + { + "epoch": 3.1146685639748606, + "grad_norm": 0.3992533423413259, + "learning_rate": 6.15292253614522e-06, + "loss": 0.0203, + "step": 26266 + }, + { + "epoch": 3.114787145736986, + "grad_norm": 0.4031307533358063, + "learning_rate": 6.151345565161143e-06, + "loss": 0.0164, + "step": 26267 + }, + { + "epoch": 3.1149057274991105, + "grad_norm": 0.4459740798021636, + "learning_rate": 6.149768767938546e-06, + "loss": 0.0256, + "step": 26268 + }, + { + "epoch": 3.1150243092612357, + "grad_norm": 0.45403022301777946, + "learning_rate": 6.148192144491965e-06, + "loss": 0.0298, + "step": 26269 + }, + { + "epoch": 3.1151428910233605, + "grad_norm": 0.4360993414821474, + "learning_rate": 6.1466156948359446e-06, + "loss": 0.0283, + "step": 26270 + }, + { + "epoch": 3.1152614727854857, + "grad_norm": 0.5202946078847035, + "learning_rate": 6.1450394189849995e-06, + "loss": 0.0324, + "step": 26271 + }, + { + "epoch": 3.1153800545476105, + "grad_norm": 0.5347746934446267, + "learning_rate": 6.143463316953671e-06, + "loss": 0.0293, + "step": 26272 + }, + { + "epoch": 3.1154986363097357, + "grad_norm": 0.545493000013581, + "learning_rate": 6.141887388756484e-06, + "loss": 0.0315, + "step": 26273 + }, + { + "epoch": 3.1156172180718604, + "grad_norm": 0.4168115978082355, + "learning_rate": 6.14031163440798e-06, + "loss": 0.0211, + "step": 26274 + }, + { + "epoch": 3.1157357998339856, + "grad_norm": 0.7299160151343137, + "learning_rate": 6.138736053922667e-06, + "loss": 0.0344, + "step": 26275 + }, + { + "epoch": 3.1158543815961104, + "grad_norm": 0.46450530358740605, + "learning_rate": 6.13716064731508e-06, + "loss": 0.0297, + "step": 26276 + }, + { + "epoch": 3.1159729633582356, + "grad_norm": 0.5160852309218061, + "learning_rate": 6.1355854145997374e-06, + "loss": 0.0216, + "step": 26277 + }, + { + "epoch": 3.1160915451203604, + "grad_norm": 0.4153077112997744, + "learning_rate": 6.134010355791167e-06, + "loss": 0.0232, + "step": 26278 + }, + { + "epoch": 3.1162101268824856, + "grad_norm": 0.4378318319357323, + "learning_rate": 6.132435470903891e-06, + "loss": 0.0245, + "step": 26279 + }, + { + "epoch": 3.1163287086446103, + "grad_norm": 0.48532728153617644, + "learning_rate": 6.130860759952409e-06, + "loss": 0.0274, + "step": 26280 + }, + { + "epoch": 3.1164472904067355, + "grad_norm": 0.5672209230944391, + "learning_rate": 6.129286222951269e-06, + "loss": 0.0286, + "step": 26281 + }, + { + "epoch": 3.1165658721688603, + "grad_norm": 0.5534314541471204, + "learning_rate": 6.127711859914959e-06, + "loss": 0.023, + "step": 26282 + }, + { + "epoch": 3.1166844539309855, + "grad_norm": 0.5959560136849181, + "learning_rate": 6.126137670858004e-06, + "loss": 0.025, + "step": 26283 + }, + { + "epoch": 3.1168030356931102, + "grad_norm": 0.4647694886206378, + "learning_rate": 6.124563655794916e-06, + "loss": 0.0237, + "step": 26284 + }, + { + "epoch": 3.1169216174552354, + "grad_norm": 0.4483179311976623, + "learning_rate": 6.1229898147402096e-06, + "loss": 0.0201, + "step": 26285 + }, + { + "epoch": 3.11704019921736, + "grad_norm": 0.38058502336118705, + "learning_rate": 6.121416147708384e-06, + "loss": 0.0194, + "step": 26286 + }, + { + "epoch": 3.1171587809794854, + "grad_norm": 0.42267102794530037, + "learning_rate": 6.119842654713951e-06, + "loss": 0.02, + "step": 26287 + }, + { + "epoch": 3.11727736274161, + "grad_norm": 0.46499943024481627, + "learning_rate": 6.118269335771415e-06, + "loss": 0.0285, + "step": 26288 + }, + { + "epoch": 3.1173959445037354, + "grad_norm": 0.5938377226165401, + "learning_rate": 6.116696190895279e-06, + "loss": 0.0352, + "step": 26289 + }, + { + "epoch": 3.11751452626586, + "grad_norm": 0.25061663200722223, + "learning_rate": 6.115123220100058e-06, + "loss": 0.0138, + "step": 26290 + }, + { + "epoch": 3.1176331080279853, + "grad_norm": 0.5165507533506261, + "learning_rate": 6.11355042340023e-06, + "loss": 0.0362, + "step": 26291 + }, + { + "epoch": 3.11775168979011, + "grad_norm": 0.7371341139299494, + "learning_rate": 6.1119778008103175e-06, + "loss": 0.0532, + "step": 26292 + }, + { + "epoch": 3.1178702715522353, + "grad_norm": 0.8152258301619669, + "learning_rate": 6.110405352344803e-06, + "loss": 0.0438, + "step": 26293 + }, + { + "epoch": 3.11798885331436, + "grad_norm": 0.7877495621241245, + "learning_rate": 6.108833078018195e-06, + "loss": 0.0487, + "step": 26294 + }, + { + "epoch": 3.1181074350764852, + "grad_norm": 0.48826246453853916, + "learning_rate": 6.107260977844964e-06, + "loss": 0.0248, + "step": 26295 + }, + { + "epoch": 3.1182260168386104, + "grad_norm": 0.595456657684731, + "learning_rate": 6.105689051839633e-06, + "loss": 0.0291, + "step": 26296 + }, + { + "epoch": 3.118344598600735, + "grad_norm": 0.3909451340086784, + "learning_rate": 6.104117300016673e-06, + "loss": 0.0302, + "step": 26297 + }, + { + "epoch": 3.11846318036286, + "grad_norm": 0.5127090371223674, + "learning_rate": 6.102545722390579e-06, + "loss": 0.0273, + "step": 26298 + }, + { + "epoch": 3.118581762124985, + "grad_norm": 0.43141772523821204, + "learning_rate": 6.100974318975841e-06, + "loss": 0.0225, + "step": 26299 + }, + { + "epoch": 3.1187003438871104, + "grad_norm": 0.6828681656112371, + "learning_rate": 6.099403089786945e-06, + "loss": 0.0356, + "step": 26300 + }, + { + "epoch": 3.118818925649235, + "grad_norm": 0.5918741701517921, + "learning_rate": 6.09783203483838e-06, + "loss": 0.0422, + "step": 26301 + }, + { + "epoch": 3.1189375074113603, + "grad_norm": 0.5052372997534127, + "learning_rate": 6.096261154144611e-06, + "loss": 0.0282, + "step": 26302 + }, + { + "epoch": 3.119056089173485, + "grad_norm": 0.4074068395070854, + "learning_rate": 6.0946904477201474e-06, + "loss": 0.0205, + "step": 26303 + }, + { + "epoch": 3.1191746709356103, + "grad_norm": 0.45291976869728684, + "learning_rate": 6.093119915579448e-06, + "loss": 0.0237, + "step": 26304 + }, + { + "epoch": 3.119293252697735, + "grad_norm": 0.4058642600599523, + "learning_rate": 6.0915495577369976e-06, + "loss": 0.0216, + "step": 26305 + }, + { + "epoch": 3.1194118344598603, + "grad_norm": 0.48196768105082105, + "learning_rate": 6.0899793742072716e-06, + "loss": 0.03, + "step": 26306 + }, + { + "epoch": 3.119530416221985, + "grad_norm": 0.47160646293205244, + "learning_rate": 6.088409365004746e-06, + "loss": 0.0262, + "step": 26307 + }, + { + "epoch": 3.11964899798411, + "grad_norm": 0.5809473993530636, + "learning_rate": 6.086839530143907e-06, + "loss": 0.0271, + "step": 26308 + }, + { + "epoch": 3.119767579746235, + "grad_norm": 0.6103514826344902, + "learning_rate": 6.085269869639204e-06, + "loss": 0.0327, + "step": 26309 + }, + { + "epoch": 3.11988616150836, + "grad_norm": 0.4650399805277606, + "learning_rate": 6.083700383505117e-06, + "loss": 0.0198, + "step": 26310 + }, + { + "epoch": 3.120004743270485, + "grad_norm": 0.515006356325317, + "learning_rate": 6.082131071756117e-06, + "loss": 0.0284, + "step": 26311 + }, + { + "epoch": 3.12012332503261, + "grad_norm": 0.5000140479252765, + "learning_rate": 6.080561934406678e-06, + "loss": 0.0272, + "step": 26312 + }, + { + "epoch": 3.120241906794735, + "grad_norm": 0.3930970358327909, + "learning_rate": 6.078992971471242e-06, + "loss": 0.0227, + "step": 26313 + }, + { + "epoch": 3.12036048855686, + "grad_norm": 0.5412777648050726, + "learning_rate": 6.077424182964306e-06, + "loss": 0.0313, + "step": 26314 + }, + { + "epoch": 3.120479070318985, + "grad_norm": 0.5190185716822356, + "learning_rate": 6.075855568900304e-06, + "loss": 0.0338, + "step": 26315 + }, + { + "epoch": 3.12059765208111, + "grad_norm": 0.32161030933237994, + "learning_rate": 6.074287129293707e-06, + "loss": 0.013, + "step": 26316 + }, + { + "epoch": 3.120716233843235, + "grad_norm": 0.5576879644492603, + "learning_rate": 6.072718864158977e-06, + "loss": 0.0287, + "step": 26317 + }, + { + "epoch": 3.12083481560536, + "grad_norm": 0.5006503810964035, + "learning_rate": 6.071150773510567e-06, + "loss": 0.0288, + "step": 26318 + }, + { + "epoch": 3.120953397367485, + "grad_norm": 0.507611208082678, + "learning_rate": 6.06958285736294e-06, + "loss": 0.0295, + "step": 26319 + }, + { + "epoch": 3.12107197912961, + "grad_norm": 0.46810844406434715, + "learning_rate": 6.0680151157305406e-06, + "loss": 0.0188, + "step": 26320 + }, + { + "epoch": 3.1211905608917347, + "grad_norm": 0.4144453655671915, + "learning_rate": 6.066447548627827e-06, + "loss": 0.0242, + "step": 26321 + }, + { + "epoch": 3.12130914265386, + "grad_norm": 0.613335183349102, + "learning_rate": 6.064880156069247e-06, + "loss": 0.0309, + "step": 26322 + }, + { + "epoch": 3.1214277244159847, + "grad_norm": 0.609281565917231, + "learning_rate": 6.0633129380692595e-06, + "loss": 0.0369, + "step": 26323 + }, + { + "epoch": 3.12154630617811, + "grad_norm": 0.4342602315160974, + "learning_rate": 6.061745894642298e-06, + "loss": 0.0243, + "step": 26324 + }, + { + "epoch": 3.1216648879402347, + "grad_norm": 0.3951429952582306, + "learning_rate": 6.0601790258028175e-06, + "loss": 0.0309, + "step": 26325 + }, + { + "epoch": 3.12178346970236, + "grad_norm": 0.4382296969511378, + "learning_rate": 6.058612331565258e-06, + "loss": 0.0211, + "step": 26326 + }, + { + "epoch": 3.1219020514644846, + "grad_norm": 0.4928037609163024, + "learning_rate": 6.057045811944067e-06, + "loss": 0.0347, + "step": 26327 + }, + { + "epoch": 3.12202063322661, + "grad_norm": 0.7214476577076309, + "learning_rate": 6.055479466953684e-06, + "loss": 0.0434, + "step": 26328 + }, + { + "epoch": 3.1221392149887346, + "grad_norm": 0.6170254593204545, + "learning_rate": 6.053913296608551e-06, + "loss": 0.0342, + "step": 26329 + }, + { + "epoch": 3.12225779675086, + "grad_norm": 0.5866828875534099, + "learning_rate": 6.052347300923109e-06, + "loss": 0.0318, + "step": 26330 + }, + { + "epoch": 3.1223763785129846, + "grad_norm": 0.48473290033556543, + "learning_rate": 6.050781479911783e-06, + "loss": 0.0255, + "step": 26331 + }, + { + "epoch": 3.1224949602751098, + "grad_norm": 0.5607967479719365, + "learning_rate": 6.049215833589015e-06, + "loss": 0.0265, + "step": 26332 + }, + { + "epoch": 3.1226135420372345, + "grad_norm": 0.48070067262679655, + "learning_rate": 6.047650361969237e-06, + "loss": 0.0237, + "step": 26333 + }, + { + "epoch": 3.1227321237993597, + "grad_norm": 0.4869936767607306, + "learning_rate": 6.046085065066889e-06, + "loss": 0.03, + "step": 26334 + }, + { + "epoch": 3.1228507055614845, + "grad_norm": 0.40442386869805635, + "learning_rate": 6.044519942896387e-06, + "loss": 0.0169, + "step": 26335 + }, + { + "epoch": 3.1229692873236097, + "grad_norm": 0.9445767047642402, + "learning_rate": 6.0429549954721666e-06, + "loss": 0.062, + "step": 26336 + }, + { + "epoch": 3.1230878690857344, + "grad_norm": 0.7502960983523899, + "learning_rate": 6.0413902228086545e-06, + "loss": 0.0351, + "step": 26337 + }, + { + "epoch": 3.1232064508478596, + "grad_norm": 0.6056312213643601, + "learning_rate": 6.039825624920276e-06, + "loss": 0.0344, + "step": 26338 + }, + { + "epoch": 3.1233250326099844, + "grad_norm": 0.6716946265459287, + "learning_rate": 6.038261201821455e-06, + "loss": 0.0263, + "step": 26339 + }, + { + "epoch": 3.1234436143721096, + "grad_norm": 0.5957391892203963, + "learning_rate": 6.0366969535266135e-06, + "loss": 0.0353, + "step": 26340 + }, + { + "epoch": 3.1235621961342344, + "grad_norm": 0.44570344106230136, + "learning_rate": 6.035132880050179e-06, + "loss": 0.0227, + "step": 26341 + }, + { + "epoch": 3.1236807778963596, + "grad_norm": 0.4075217306155211, + "learning_rate": 6.033568981406554e-06, + "loss": 0.0239, + "step": 26342 + }, + { + "epoch": 3.1237993596584843, + "grad_norm": 0.7316214611352528, + "learning_rate": 6.0320052576101686e-06, + "loss": 0.0393, + "step": 26343 + }, + { + "epoch": 3.1239179414206095, + "grad_norm": 0.4998675873110259, + "learning_rate": 6.03044170867543e-06, + "loss": 0.0235, + "step": 26344 + }, + { + "epoch": 3.1240365231827343, + "grad_norm": 0.8009544908385927, + "learning_rate": 6.028878334616767e-06, + "loss": 0.0486, + "step": 26345 + }, + { + "epoch": 3.1241551049448595, + "grad_norm": 0.48724126620947783, + "learning_rate": 6.0273151354485745e-06, + "loss": 0.0378, + "step": 26346 + }, + { + "epoch": 3.1242736867069842, + "grad_norm": 0.5946988404712186, + "learning_rate": 6.02575211118527e-06, + "loss": 0.0394, + "step": 26347 + }, + { + "epoch": 3.1243922684691094, + "grad_norm": 0.7605585468068908, + "learning_rate": 6.024189261841262e-06, + "loss": 0.0391, + "step": 26348 + }, + { + "epoch": 3.1245108502312346, + "grad_norm": 0.4573221715754893, + "learning_rate": 6.02262658743096e-06, + "loss": 0.0286, + "step": 26349 + }, + { + "epoch": 3.1246294319933594, + "grad_norm": 0.46191099144268455, + "learning_rate": 6.021064087968778e-06, + "loss": 0.0206, + "step": 26350 + }, + { + "epoch": 3.1247480137554846, + "grad_norm": 0.4737641159367521, + "learning_rate": 6.019501763469096e-06, + "loss": 0.0261, + "step": 26351 + }, + { + "epoch": 3.1248665955176094, + "grad_norm": 0.6951175687898896, + "learning_rate": 6.017939613946347e-06, + "loss": 0.041, + "step": 26352 + }, + { + "epoch": 3.1249851772797346, + "grad_norm": 0.47867307850830426, + "learning_rate": 6.016377639414911e-06, + "loss": 0.0379, + "step": 26353 + }, + { + "epoch": 3.1251037590418593, + "grad_norm": 0.5491373313112263, + "learning_rate": 6.014815839889193e-06, + "loss": 0.0261, + "step": 26354 + }, + { + "epoch": 3.1252223408039845, + "grad_norm": 0.4554406616685739, + "learning_rate": 6.01325421538359e-06, + "loss": 0.0249, + "step": 26355 + }, + { + "epoch": 3.1253409225661093, + "grad_norm": 0.4227603963653256, + "learning_rate": 6.011692765912502e-06, + "loss": 0.029, + "step": 26356 + }, + { + "epoch": 3.1254595043282345, + "grad_norm": 0.6049835976340525, + "learning_rate": 6.0101314914903315e-06, + "loss": 0.0276, + "step": 26357 + }, + { + "epoch": 3.1255780860903593, + "grad_norm": 0.8690625408649898, + "learning_rate": 6.008570392131443e-06, + "loss": 0.0407, + "step": 26358 + }, + { + "epoch": 3.1256966678524845, + "grad_norm": 0.4016350991813743, + "learning_rate": 6.0070094678502635e-06, + "loss": 0.0175, + "step": 26359 + }, + { + "epoch": 3.125815249614609, + "grad_norm": 0.6427182839279686, + "learning_rate": 6.005448718661158e-06, + "loss": 0.0409, + "step": 26360 + }, + { + "epoch": 3.1259338313767344, + "grad_norm": 0.40102628240276084, + "learning_rate": 6.003888144578529e-06, + "loss": 0.0213, + "step": 26361 + }, + { + "epoch": 3.126052413138859, + "grad_norm": 0.5061796594992519, + "learning_rate": 6.002327745616743e-06, + "loss": 0.0253, + "step": 26362 + }, + { + "epoch": 3.1261709949009844, + "grad_norm": 0.8603491806947539, + "learning_rate": 6.000767521790215e-06, + "loss": 0.0383, + "step": 26363 + }, + { + "epoch": 3.126289576663109, + "grad_norm": 0.49096725115672774, + "learning_rate": 5.999207473113302e-06, + "loss": 0.0313, + "step": 26364 + }, + { + "epoch": 3.1264081584252343, + "grad_norm": 0.8236129254157163, + "learning_rate": 5.997647599600398e-06, + "loss": 0.0479, + "step": 26365 + }, + { + "epoch": 3.126526740187359, + "grad_norm": 0.5101912292947374, + "learning_rate": 5.996087901265879e-06, + "loss": 0.0226, + "step": 26366 + }, + { + "epoch": 3.1266453219494843, + "grad_norm": 0.4943259352041767, + "learning_rate": 5.994528378124126e-06, + "loss": 0.0305, + "step": 26367 + }, + { + "epoch": 3.126763903711609, + "grad_norm": 0.371631003130342, + "learning_rate": 5.992969030189524e-06, + "loss": 0.0265, + "step": 26368 + }, + { + "epoch": 3.1268824854737343, + "grad_norm": 0.36844481772271553, + "learning_rate": 5.9914098574764234e-06, + "loss": 0.015, + "step": 26369 + }, + { + "epoch": 3.127001067235859, + "grad_norm": 0.7779838178229455, + "learning_rate": 5.989850859999227e-06, + "loss": 0.0487, + "step": 26370 + }, + { + "epoch": 3.1271196489979842, + "grad_norm": 0.43866744934416524, + "learning_rate": 5.988292037772289e-06, + "loss": 0.0181, + "step": 26371 + }, + { + "epoch": 3.127238230760109, + "grad_norm": 0.4595146021177555, + "learning_rate": 5.986733390809993e-06, + "loss": 0.021, + "step": 26372 + }, + { + "epoch": 3.127356812522234, + "grad_norm": 0.6045776803566405, + "learning_rate": 5.985174919126682e-06, + "loss": 0.031, + "step": 26373 + }, + { + "epoch": 3.127475394284359, + "grad_norm": 0.7736068139302532, + "learning_rate": 5.983616622736757e-06, + "loss": 0.0347, + "step": 26374 + }, + { + "epoch": 3.127593976046484, + "grad_norm": 0.7934318659164906, + "learning_rate": 5.982058501654561e-06, + "loss": 0.0475, + "step": 26375 + }, + { + "epoch": 3.127712557808609, + "grad_norm": 0.5069609624416779, + "learning_rate": 5.9805005558944636e-06, + "loss": 0.0245, + "step": 26376 + }, + { + "epoch": 3.127831139570734, + "grad_norm": 0.7257143168598176, + "learning_rate": 5.978942785470826e-06, + "loss": 0.0468, + "step": 26377 + }, + { + "epoch": 3.127949721332859, + "grad_norm": 0.41502856633980395, + "learning_rate": 5.977385190398014e-06, + "loss": 0.0148, + "step": 26378 + }, + { + "epoch": 3.128068303094984, + "grad_norm": 0.47333048523842425, + "learning_rate": 5.975827770690387e-06, + "loss": 0.0214, + "step": 26379 + }, + { + "epoch": 3.128186884857109, + "grad_norm": 0.4828907615134411, + "learning_rate": 5.9742705263622935e-06, + "loss": 0.0304, + "step": 26380 + }, + { + "epoch": 3.128305466619234, + "grad_norm": 0.3927308010571228, + "learning_rate": 5.972713457428098e-06, + "loss": 0.0198, + "step": 26381 + }, + { + "epoch": 3.128424048381359, + "grad_norm": 0.4779933849400146, + "learning_rate": 5.9711565639021475e-06, + "loss": 0.0208, + "step": 26382 + }, + { + "epoch": 3.128542630143484, + "grad_norm": 0.6865227068016373, + "learning_rate": 5.969599845798807e-06, + "loss": 0.0405, + "step": 26383 + }, + { + "epoch": 3.1286612119056088, + "grad_norm": 0.5915808760547434, + "learning_rate": 5.9680433031324035e-06, + "loss": 0.0298, + "step": 26384 + }, + { + "epoch": 3.128779793667734, + "grad_norm": 0.48361589072970923, + "learning_rate": 5.966486935917321e-06, + "loss": 0.0281, + "step": 26385 + }, + { + "epoch": 3.1288983754298587, + "grad_norm": 0.3941513020193435, + "learning_rate": 5.964930744167877e-06, + "loss": 0.0193, + "step": 26386 + }, + { + "epoch": 3.129016957191984, + "grad_norm": 0.3585370821688364, + "learning_rate": 5.96337472789843e-06, + "loss": 0.0151, + "step": 26387 + }, + { + "epoch": 3.1291355389541087, + "grad_norm": 0.47346362897325456, + "learning_rate": 5.9618188871233244e-06, + "loss": 0.0322, + "step": 26388 + }, + { + "epoch": 3.129254120716234, + "grad_norm": 0.6131846849259461, + "learning_rate": 5.9602632218569015e-06, + "loss": 0.043, + "step": 26389 + }, + { + "epoch": 3.1293727024783586, + "grad_norm": 0.7949724378229978, + "learning_rate": 5.958707732113513e-06, + "loss": 0.0327, + "step": 26390 + }, + { + "epoch": 3.129491284240484, + "grad_norm": 0.3767653565407148, + "learning_rate": 5.95715241790748e-06, + "loss": 0.0168, + "step": 26391 + }, + { + "epoch": 3.1296098660026086, + "grad_norm": 0.2752933865805243, + "learning_rate": 5.95559727925315e-06, + "loss": 0.0161, + "step": 26392 + }, + { + "epoch": 3.129728447764734, + "grad_norm": 0.4417689417423417, + "learning_rate": 5.954042316164862e-06, + "loss": 0.0213, + "step": 26393 + }, + { + "epoch": 3.129847029526859, + "grad_norm": 0.5887560914003729, + "learning_rate": 5.952487528656953e-06, + "loss": 0.0211, + "step": 26394 + }, + { + "epoch": 3.1299656112889838, + "grad_norm": 0.493439560677845, + "learning_rate": 5.950932916743743e-06, + "loss": 0.0361, + "step": 26395 + }, + { + "epoch": 3.1300841930511085, + "grad_norm": 0.7784391929764075, + "learning_rate": 5.9493784804395745e-06, + "loss": 0.0364, + "step": 26396 + }, + { + "epoch": 3.1302027748132337, + "grad_norm": 0.5293853444452674, + "learning_rate": 5.947824219758774e-06, + "loss": 0.0297, + "step": 26397 + }, + { + "epoch": 3.130321356575359, + "grad_norm": 0.8962762795334234, + "learning_rate": 5.9462701347156704e-06, + "loss": 0.0429, + "step": 26398 + }, + { + "epoch": 3.1304399383374837, + "grad_norm": 0.47727229477196587, + "learning_rate": 5.944716225324592e-06, + "loss": 0.0216, + "step": 26399 + }, + { + "epoch": 3.130558520099609, + "grad_norm": 0.32726285505453734, + "learning_rate": 5.943162491599863e-06, + "loss": 0.016, + "step": 26400 + }, + { + "epoch": 3.1306771018617336, + "grad_norm": 0.31048007736542066, + "learning_rate": 5.941608933555812e-06, + "loss": 0.0261, + "step": 26401 + }, + { + "epoch": 3.130795683623859, + "grad_norm": 0.5514542281103719, + "learning_rate": 5.940055551206749e-06, + "loss": 0.0303, + "step": 26402 + }, + { + "epoch": 3.1309142653859836, + "grad_norm": 0.7477712363324263, + "learning_rate": 5.938502344567004e-06, + "loss": 0.0403, + "step": 26403 + }, + { + "epoch": 3.131032847148109, + "grad_norm": 0.7823471342572753, + "learning_rate": 5.936949313650888e-06, + "loss": 0.0357, + "step": 26404 + }, + { + "epoch": 3.1311514289102336, + "grad_norm": 0.579467259852118, + "learning_rate": 5.935396458472734e-06, + "loss": 0.029, + "step": 26405 + }, + { + "epoch": 3.1312700106723588, + "grad_norm": 0.6721024470669525, + "learning_rate": 5.933843779046835e-06, + "loss": 0.0279, + "step": 26406 + }, + { + "epoch": 3.1313885924344835, + "grad_norm": 0.5884805015347351, + "learning_rate": 5.932291275387519e-06, + "loss": 0.0313, + "step": 26407 + }, + { + "epoch": 3.1315071741966087, + "grad_norm": 0.44325785899973896, + "learning_rate": 5.930738947509093e-06, + "loss": 0.0218, + "step": 26408 + }, + { + "epoch": 3.1316257559587335, + "grad_norm": 0.6866505128404966, + "learning_rate": 5.9291867954258725e-06, + "loss": 0.0385, + "step": 26409 + }, + { + "epoch": 3.1317443377208587, + "grad_norm": 0.45584616241746495, + "learning_rate": 5.9276348191521695e-06, + "loss": 0.0203, + "step": 26410 + }, + { + "epoch": 3.1318629194829835, + "grad_norm": 0.37365316392737813, + "learning_rate": 5.926083018702269e-06, + "loss": 0.0155, + "step": 26411 + }, + { + "epoch": 3.1319815012451087, + "grad_norm": 0.5125858513314979, + "learning_rate": 5.92453139409051e-06, + "loss": 0.0212, + "step": 26412 + }, + { + "epoch": 3.1321000830072334, + "grad_norm": 0.5250983188841336, + "learning_rate": 5.92297994533117e-06, + "loss": 0.0282, + "step": 26413 + }, + { + "epoch": 3.1322186647693586, + "grad_norm": 0.5038561471555543, + "learning_rate": 5.9214286724385644e-06, + "loss": 0.0233, + "step": 26414 + }, + { + "epoch": 3.1323372465314834, + "grad_norm": 0.6067244182628121, + "learning_rate": 5.919877575426988e-06, + "loss": 0.031, + "step": 26415 + }, + { + "epoch": 3.1324558282936086, + "grad_norm": 0.6994347652606668, + "learning_rate": 5.918326654310743e-06, + "loss": 0.0286, + "step": 26416 + }, + { + "epoch": 3.1325744100557333, + "grad_norm": 0.44977341647341645, + "learning_rate": 5.9167759091041355e-06, + "loss": 0.0212, + "step": 26417 + }, + { + "epoch": 3.1326929918178585, + "grad_norm": 0.8705929139586636, + "learning_rate": 5.915225339821437e-06, + "loss": 0.0334, + "step": 26418 + }, + { + "epoch": 3.1328115735799833, + "grad_norm": 0.6184187365000753, + "learning_rate": 5.913674946476971e-06, + "loss": 0.0336, + "step": 26419 + }, + { + "epoch": 3.1329301553421085, + "grad_norm": 0.5119581090748316, + "learning_rate": 5.9121247290850115e-06, + "loss": 0.0309, + "step": 26420 + }, + { + "epoch": 3.1330487371042333, + "grad_norm": 0.4235545784500126, + "learning_rate": 5.9105746876598615e-06, + "loss": 0.02, + "step": 26421 + }, + { + "epoch": 3.1331673188663585, + "grad_norm": 0.3658310155542368, + "learning_rate": 5.909024822215789e-06, + "loss": 0.0155, + "step": 26422 + }, + { + "epoch": 3.1332859006284832, + "grad_norm": 0.47707851159812303, + "learning_rate": 5.907475132767113e-06, + "loss": 0.0191, + "step": 26423 + }, + { + "epoch": 3.1334044823906084, + "grad_norm": 0.8203498612815477, + "learning_rate": 5.905925619328095e-06, + "loss": 0.0481, + "step": 26424 + }, + { + "epoch": 3.133523064152733, + "grad_norm": 0.7090082445993743, + "learning_rate": 5.9043762819130266e-06, + "loss": 0.0461, + "step": 26425 + }, + { + "epoch": 3.1336416459148584, + "grad_norm": 0.5728713166606242, + "learning_rate": 5.9028271205361945e-06, + "loss": 0.0237, + "step": 26426 + }, + { + "epoch": 3.133760227676983, + "grad_norm": 0.623498335929264, + "learning_rate": 5.9012781352118765e-06, + "loss": 0.0403, + "step": 26427 + }, + { + "epoch": 3.1338788094391083, + "grad_norm": 0.569968443263714, + "learning_rate": 5.899729325954362e-06, + "loss": 0.0359, + "step": 26428 + }, + { + "epoch": 3.133997391201233, + "grad_norm": 0.5205932755447436, + "learning_rate": 5.898180692777908e-06, + "loss": 0.0283, + "step": 26429 + }, + { + "epoch": 3.1341159729633583, + "grad_norm": 0.46263736641812175, + "learning_rate": 5.896632235696814e-06, + "loss": 0.0223, + "step": 26430 + }, + { + "epoch": 3.134234554725483, + "grad_norm": 0.5598539815368102, + "learning_rate": 5.895083954725339e-06, + "loss": 0.0237, + "step": 26431 + }, + { + "epoch": 3.1343531364876083, + "grad_norm": 0.5680516967838447, + "learning_rate": 5.893535849877771e-06, + "loss": 0.0232, + "step": 26432 + }, + { + "epoch": 3.134471718249733, + "grad_norm": 0.5254584326820175, + "learning_rate": 5.891987921168357e-06, + "loss": 0.0215, + "step": 26433 + }, + { + "epoch": 3.1345903000118582, + "grad_norm": 0.6897241536801275, + "learning_rate": 5.890440168611397e-06, + "loss": 0.0289, + "step": 26434 + }, + { + "epoch": 3.134708881773983, + "grad_norm": 0.4470248122040568, + "learning_rate": 5.888892592221137e-06, + "loss": 0.0198, + "step": 26435 + }, + { + "epoch": 3.134827463536108, + "grad_norm": 0.32402684436571594, + "learning_rate": 5.887345192011853e-06, + "loss": 0.018, + "step": 26436 + }, + { + "epoch": 3.134946045298233, + "grad_norm": 0.6100102390395041, + "learning_rate": 5.885797967997808e-06, + "loss": 0.0408, + "step": 26437 + }, + { + "epoch": 3.135064627060358, + "grad_norm": 0.5965415193464118, + "learning_rate": 5.884250920193265e-06, + "loss": 0.0282, + "step": 26438 + }, + { + "epoch": 3.135183208822483, + "grad_norm": 0.4431857966213763, + "learning_rate": 5.882704048612497e-06, + "loss": 0.0168, + "step": 26439 + }, + { + "epoch": 3.135301790584608, + "grad_norm": 0.799073517910963, + "learning_rate": 5.88115735326974e-06, + "loss": 0.0472, + "step": 26440 + }, + { + "epoch": 3.135420372346733, + "grad_norm": 0.4816932517660804, + "learning_rate": 5.87961083417928e-06, + "loss": 0.0241, + "step": 26441 + }, + { + "epoch": 3.135538954108858, + "grad_norm": 0.2928295585116497, + "learning_rate": 5.878064491355354e-06, + "loss": 0.0122, + "step": 26442 + }, + { + "epoch": 3.135657535870983, + "grad_norm": 0.3198009853728703, + "learning_rate": 5.876518324812233e-06, + "loss": 0.014, + "step": 26443 + }, + { + "epoch": 3.135776117633108, + "grad_norm": 0.559712962574276, + "learning_rate": 5.874972334564146e-06, + "loss": 0.025, + "step": 26444 + }, + { + "epoch": 3.135894699395233, + "grad_norm": 0.6365757290384054, + "learning_rate": 5.873426520625377e-06, + "loss": 0.0305, + "step": 26445 + }, + { + "epoch": 3.136013281157358, + "grad_norm": 0.5776621995043156, + "learning_rate": 5.871880883010156e-06, + "loss": 0.0326, + "step": 26446 + }, + { + "epoch": 3.136131862919483, + "grad_norm": 0.25001734108606954, + "learning_rate": 5.8703354217327335e-06, + "loss": 0.0164, + "step": 26447 + }, + { + "epoch": 3.136250444681608, + "grad_norm": 0.8194171706129216, + "learning_rate": 5.8687901368073616e-06, + "loss": 0.0364, + "step": 26448 + }, + { + "epoch": 3.1363690264437327, + "grad_norm": 0.30881677104782224, + "learning_rate": 5.867245028248283e-06, + "loss": 0.0127, + "step": 26449 + }, + { + "epoch": 3.136487608205858, + "grad_norm": 0.522278803746144, + "learning_rate": 5.865700096069751e-06, + "loss": 0.0286, + "step": 26450 + }, + { + "epoch": 3.136606189967983, + "grad_norm": 0.4973938982393696, + "learning_rate": 5.864155340285993e-06, + "loss": 0.0252, + "step": 26451 + }, + { + "epoch": 3.136724771730108, + "grad_norm": 0.5151207150795732, + "learning_rate": 5.862610760911258e-06, + "loss": 0.0249, + "step": 26452 + }, + { + "epoch": 3.136843353492233, + "grad_norm": 0.5230333977043645, + "learning_rate": 5.861066357959783e-06, + "loss": 0.0225, + "step": 26453 + }, + { + "epoch": 3.136961935254358, + "grad_norm": 0.5236756429436434, + "learning_rate": 5.859522131445813e-06, + "loss": 0.0248, + "step": 26454 + }, + { + "epoch": 3.137080517016483, + "grad_norm": 0.43740192076927503, + "learning_rate": 5.857978081383564e-06, + "loss": 0.0172, + "step": 26455 + }, + { + "epoch": 3.137199098778608, + "grad_norm": 0.802520009947024, + "learning_rate": 5.856434207787296e-06, + "loss": 0.0401, + "step": 26456 + }, + { + "epoch": 3.137317680540733, + "grad_norm": 0.5800233754308818, + "learning_rate": 5.854890510671224e-06, + "loss": 0.0212, + "step": 26457 + }, + { + "epoch": 3.1374362623028578, + "grad_norm": 0.35403084677039753, + "learning_rate": 5.853346990049583e-06, + "loss": 0.0134, + "step": 26458 + }, + { + "epoch": 3.137554844064983, + "grad_norm": 0.5606625572563435, + "learning_rate": 5.851803645936604e-06, + "loss": 0.0295, + "step": 26459 + }, + { + "epoch": 3.1376734258271077, + "grad_norm": 0.5941022795162523, + "learning_rate": 5.850260478346514e-06, + "loss": 0.0332, + "step": 26460 + }, + { + "epoch": 3.137792007589233, + "grad_norm": 0.5123755014698302, + "learning_rate": 5.848717487293548e-06, + "loss": 0.032, + "step": 26461 + }, + { + "epoch": 3.1379105893513577, + "grad_norm": 0.9467003144816032, + "learning_rate": 5.847174672791913e-06, + "loss": 0.0507, + "step": 26462 + }, + { + "epoch": 3.138029171113483, + "grad_norm": 0.3678269012625654, + "learning_rate": 5.845632034855844e-06, + "loss": 0.0184, + "step": 26463 + }, + { + "epoch": 3.1381477528756077, + "grad_norm": 0.38417938121235634, + "learning_rate": 5.844089573499556e-06, + "loss": 0.0169, + "step": 26464 + }, + { + "epoch": 3.138266334637733, + "grad_norm": 0.46682660694710315, + "learning_rate": 5.842547288737271e-06, + "loss": 0.0244, + "step": 26465 + }, + { + "epoch": 3.1383849163998576, + "grad_norm": 0.49705350705706053, + "learning_rate": 5.841005180583217e-06, + "loss": 0.0271, + "step": 26466 + }, + { + "epoch": 3.138503498161983, + "grad_norm": 0.49426167298939766, + "learning_rate": 5.839463249051586e-06, + "loss": 0.0188, + "step": 26467 + }, + { + "epoch": 3.1386220799241076, + "grad_norm": 0.9147184875531823, + "learning_rate": 5.8379214941566205e-06, + "loss": 0.0395, + "step": 26468 + }, + { + "epoch": 3.138740661686233, + "grad_norm": 0.5753097905519559, + "learning_rate": 5.836379915912515e-06, + "loss": 0.0268, + "step": 26469 + }, + { + "epoch": 3.1388592434483575, + "grad_norm": 0.36535928606265883, + "learning_rate": 5.834838514333488e-06, + "loss": 0.021, + "step": 26470 + }, + { + "epoch": 3.1389778252104827, + "grad_norm": 0.6847106876046791, + "learning_rate": 5.833297289433745e-06, + "loss": 0.0288, + "step": 26471 + }, + { + "epoch": 3.1390964069726075, + "grad_norm": 0.7769879549766886, + "learning_rate": 5.831756241227507e-06, + "loss": 0.0439, + "step": 26472 + }, + { + "epoch": 3.1392149887347327, + "grad_norm": 0.3987841890029213, + "learning_rate": 5.8302153697289615e-06, + "loss": 0.016, + "step": 26473 + }, + { + "epoch": 3.1393335704968575, + "grad_norm": 0.3551076401751938, + "learning_rate": 5.8286746749523226e-06, + "loss": 0.0264, + "step": 26474 + }, + { + "epoch": 3.1394521522589827, + "grad_norm": 0.7492744707264353, + "learning_rate": 5.827134156911796e-06, + "loss": 0.0388, + "step": 26475 + }, + { + "epoch": 3.1395707340211074, + "grad_norm": 0.7684243553827571, + "learning_rate": 5.82559381562158e-06, + "loss": 0.0371, + "step": 26476 + }, + { + "epoch": 3.1396893157832326, + "grad_norm": 0.34510580256249623, + "learning_rate": 5.824053651095881e-06, + "loss": 0.0178, + "step": 26477 + }, + { + "epoch": 3.1398078975453574, + "grad_norm": 0.5870244371204735, + "learning_rate": 5.82251366334888e-06, + "loss": 0.0356, + "step": 26478 + }, + { + "epoch": 3.1399264793074826, + "grad_norm": 0.7246354449158723, + "learning_rate": 5.820973852394801e-06, + "loss": 0.0337, + "step": 26479 + }, + { + "epoch": 3.1400450610696073, + "grad_norm": 0.5408743572799807, + "learning_rate": 5.8194342182478155e-06, + "loss": 0.0259, + "step": 26480 + }, + { + "epoch": 3.1401636428317325, + "grad_norm": 1.083729733510719, + "learning_rate": 5.817894760922135e-06, + "loss": 0.0508, + "step": 26481 + }, + { + "epoch": 3.1402822245938573, + "grad_norm": 0.5086976505850244, + "learning_rate": 5.816355480431926e-06, + "loss": 0.0279, + "step": 26482 + }, + { + "epoch": 3.1404008063559825, + "grad_norm": 0.6513428722790767, + "learning_rate": 5.81481637679141e-06, + "loss": 0.0257, + "step": 26483 + }, + { + "epoch": 3.1405193881181073, + "grad_norm": 0.524775943595944, + "learning_rate": 5.813277450014754e-06, + "loss": 0.0268, + "step": 26484 + }, + { + "epoch": 3.1406379698802325, + "grad_norm": 0.5549800699734857, + "learning_rate": 5.811738700116151e-06, + "loss": 0.0334, + "step": 26485 + }, + { + "epoch": 3.1407565516423572, + "grad_norm": 0.5536522248793825, + "learning_rate": 5.810200127109786e-06, + "loss": 0.0363, + "step": 26486 + }, + { + "epoch": 3.1408751334044824, + "grad_norm": 0.8982560559618565, + "learning_rate": 5.808661731009843e-06, + "loss": 0.0623, + "step": 26487 + }, + { + "epoch": 3.140993715166607, + "grad_norm": 1.006771120041674, + "learning_rate": 5.807123511830514e-06, + "loss": 0.0607, + "step": 26488 + }, + { + "epoch": 3.1411122969287324, + "grad_norm": 0.6439602654526553, + "learning_rate": 5.805585469585956e-06, + "loss": 0.0279, + "step": 26489 + }, + { + "epoch": 3.141230878690857, + "grad_norm": 0.5232060820862042, + "learning_rate": 5.8040476042903774e-06, + "loss": 0.028, + "step": 26490 + }, + { + "epoch": 3.1413494604529824, + "grad_norm": 0.49938830392681344, + "learning_rate": 5.802509915957932e-06, + "loss": 0.0291, + "step": 26491 + }, + { + "epoch": 3.141468042215107, + "grad_norm": 0.5155873274323398, + "learning_rate": 5.800972404602808e-06, + "loss": 0.0204, + "step": 26492 + }, + { + "epoch": 3.1415866239772323, + "grad_norm": 0.5754756773878571, + "learning_rate": 5.799435070239165e-06, + "loss": 0.0275, + "step": 26493 + }, + { + "epoch": 3.141705205739357, + "grad_norm": 0.4611119720889473, + "learning_rate": 5.797897912881198e-06, + "loss": 0.0218, + "step": 26494 + }, + { + "epoch": 3.1418237875014823, + "grad_norm": 0.4184032661806354, + "learning_rate": 5.796360932543058e-06, + "loss": 0.0184, + "step": 26495 + }, + { + "epoch": 3.1419423692636075, + "grad_norm": 0.5644947445114421, + "learning_rate": 5.794824129238921e-06, + "loss": 0.0223, + "step": 26496 + }, + { + "epoch": 3.1420609510257322, + "grad_norm": 0.4977074214246076, + "learning_rate": 5.793287502982955e-06, + "loss": 0.0305, + "step": 26497 + }, + { + "epoch": 3.142179532787857, + "grad_norm": 0.3833348030377763, + "learning_rate": 5.791751053789324e-06, + "loss": 0.0219, + "step": 26498 + }, + { + "epoch": 3.142298114549982, + "grad_norm": 0.492182054527954, + "learning_rate": 5.790214781672201e-06, + "loss": 0.0196, + "step": 26499 + }, + { + "epoch": 3.1424166963121074, + "grad_norm": 0.6002204341957319, + "learning_rate": 5.788678686645729e-06, + "loss": 0.0207, + "step": 26500 + }, + { + "epoch": 3.142535278074232, + "grad_norm": 0.6327196377616867, + "learning_rate": 5.787142768724094e-06, + "loss": 0.0345, + "step": 26501 + }, + { + "epoch": 3.1426538598363574, + "grad_norm": 0.5646726849599687, + "learning_rate": 5.785607027921436e-06, + "loss": 0.0267, + "step": 26502 + }, + { + "epoch": 3.142772441598482, + "grad_norm": 0.5522083773191634, + "learning_rate": 5.7840714642519254e-06, + "loss": 0.0258, + "step": 26503 + }, + { + "epoch": 3.1428910233606073, + "grad_norm": 0.44312626503577884, + "learning_rate": 5.782536077729697e-06, + "loss": 0.0211, + "step": 26504 + }, + { + "epoch": 3.143009605122732, + "grad_norm": 0.5922276562849648, + "learning_rate": 5.781000868368932e-06, + "loss": 0.0324, + "step": 26505 + }, + { + "epoch": 3.1431281868848573, + "grad_norm": 0.5774722123487237, + "learning_rate": 5.779465836183768e-06, + "loss": 0.0331, + "step": 26506 + }, + { + "epoch": 3.143246768646982, + "grad_norm": 0.5334660386945912, + "learning_rate": 5.777930981188356e-06, + "loss": 0.0198, + "step": 26507 + }, + { + "epoch": 3.1433653504091073, + "grad_norm": 0.4237401786488765, + "learning_rate": 5.776396303396853e-06, + "loss": 0.0219, + "step": 26508 + }, + { + "epoch": 3.143483932171232, + "grad_norm": 0.3782750539849547, + "learning_rate": 5.774861802823398e-06, + "loss": 0.016, + "step": 26509 + }, + { + "epoch": 3.143602513933357, + "grad_norm": 0.5964203984234823, + "learning_rate": 5.773327479482152e-06, + "loss": 0.0213, + "step": 26510 + }, + { + "epoch": 3.143721095695482, + "grad_norm": 0.4701924915399486, + "learning_rate": 5.771793333387232e-06, + "loss": 0.0228, + "step": 26511 + }, + { + "epoch": 3.143839677457607, + "grad_norm": 0.6454118133131835, + "learning_rate": 5.770259364552816e-06, + "loss": 0.0273, + "step": 26512 + }, + { + "epoch": 3.143958259219732, + "grad_norm": 0.552411031097624, + "learning_rate": 5.7687255729930194e-06, + "loss": 0.0273, + "step": 26513 + }, + { + "epoch": 3.144076840981857, + "grad_norm": 0.524483328863811, + "learning_rate": 5.7671919587219875e-06, + "loss": 0.0181, + "step": 26514 + }, + { + "epoch": 3.144195422743982, + "grad_norm": 0.4530422848865083, + "learning_rate": 5.765658521753864e-06, + "loss": 0.028, + "step": 26515 + }, + { + "epoch": 3.144314004506107, + "grad_norm": 0.5909972867375151, + "learning_rate": 5.764125262102782e-06, + "loss": 0.024, + "step": 26516 + }, + { + "epoch": 3.144432586268232, + "grad_norm": 0.6490524107886798, + "learning_rate": 5.762592179782883e-06, + "loss": 0.0244, + "step": 26517 + }, + { + "epoch": 3.144551168030357, + "grad_norm": 0.6284204537498008, + "learning_rate": 5.761059274808286e-06, + "loss": 0.0336, + "step": 26518 + }, + { + "epoch": 3.144669749792482, + "grad_norm": 0.60442366777456, + "learning_rate": 5.759526547193131e-06, + "loss": 0.027, + "step": 26519 + }, + { + "epoch": 3.144788331554607, + "grad_norm": 0.4308110041276725, + "learning_rate": 5.757993996951547e-06, + "loss": 0.0206, + "step": 26520 + }, + { + "epoch": 3.144906913316732, + "grad_norm": 0.32672195390668596, + "learning_rate": 5.756461624097667e-06, + "loss": 0.0165, + "step": 26521 + }, + { + "epoch": 3.145025495078857, + "grad_norm": 0.6177497124058408, + "learning_rate": 5.754929428645609e-06, + "loss": 0.0263, + "step": 26522 + }, + { + "epoch": 3.1451440768409817, + "grad_norm": 0.3755258770553064, + "learning_rate": 5.753397410609498e-06, + "loss": 0.0244, + "step": 26523 + }, + { + "epoch": 3.145262658603107, + "grad_norm": 0.5002124135303487, + "learning_rate": 5.7518655700034646e-06, + "loss": 0.0322, + "step": 26524 + }, + { + "epoch": 3.1453812403652317, + "grad_norm": 0.7858284166251983, + "learning_rate": 5.750333906841626e-06, + "loss": 0.0299, + "step": 26525 + }, + { + "epoch": 3.145499822127357, + "grad_norm": 0.7236651107708738, + "learning_rate": 5.748802421138103e-06, + "loss": 0.0434, + "step": 26526 + }, + { + "epoch": 3.1456184038894817, + "grad_norm": 0.954658505183258, + "learning_rate": 5.747271112907016e-06, + "loss": 0.0385, + "step": 26527 + }, + { + "epoch": 3.145736985651607, + "grad_norm": 0.6379611975864471, + "learning_rate": 5.745739982162485e-06, + "loss": 0.0386, + "step": 26528 + }, + { + "epoch": 3.1458555674137316, + "grad_norm": 0.6190511002061756, + "learning_rate": 5.7442090289186124e-06, + "loss": 0.026, + "step": 26529 + }, + { + "epoch": 3.145974149175857, + "grad_norm": 0.6827590386808182, + "learning_rate": 5.742678253189521e-06, + "loss": 0.0371, + "step": 26530 + }, + { + "epoch": 3.1460927309379816, + "grad_norm": 0.45200367410830866, + "learning_rate": 5.74114765498932e-06, + "loss": 0.0234, + "step": 26531 + }, + { + "epoch": 3.146211312700107, + "grad_norm": 0.6796563803710274, + "learning_rate": 5.7396172343321315e-06, + "loss": 0.0277, + "step": 26532 + }, + { + "epoch": 3.1463298944622315, + "grad_norm": 0.46064238859130574, + "learning_rate": 5.738086991232045e-06, + "loss": 0.0172, + "step": 26533 + }, + { + "epoch": 3.1464484762243567, + "grad_norm": 0.5218751141072743, + "learning_rate": 5.7365569257031735e-06, + "loss": 0.0235, + "step": 26534 + }, + { + "epoch": 3.1465670579864815, + "grad_norm": 0.4013512672705241, + "learning_rate": 5.7350270377596275e-06, + "loss": 0.0183, + "step": 26535 + }, + { + "epoch": 3.1466856397486067, + "grad_norm": 0.6946502655787471, + "learning_rate": 5.733497327415505e-06, + "loss": 0.0279, + "step": 26536 + }, + { + "epoch": 3.1468042215107315, + "grad_norm": 0.5394938520723539, + "learning_rate": 5.7319677946849195e-06, + "loss": 0.0304, + "step": 26537 + }, + { + "epoch": 3.1469228032728567, + "grad_norm": 0.7056571504280278, + "learning_rate": 5.730438439581948e-06, + "loss": 0.0322, + "step": 26538 + }, + { + "epoch": 3.1470413850349814, + "grad_norm": 0.48864677955113806, + "learning_rate": 5.728909262120721e-06, + "loss": 0.0153, + "step": 26539 + }, + { + "epoch": 3.1471599667971066, + "grad_norm": 0.6448594654091826, + "learning_rate": 5.72738026231531e-06, + "loss": 0.0288, + "step": 26540 + }, + { + "epoch": 3.1472785485592314, + "grad_norm": 0.40891234158686096, + "learning_rate": 5.725851440179819e-06, + "loss": 0.0191, + "step": 26541 + }, + { + "epoch": 3.1473971303213566, + "grad_norm": 0.3742260613568129, + "learning_rate": 5.724322795728343e-06, + "loss": 0.0183, + "step": 26542 + }, + { + "epoch": 3.1475157120834814, + "grad_norm": 0.4555088845215266, + "learning_rate": 5.722794328974978e-06, + "loss": 0.0232, + "step": 26543 + }, + { + "epoch": 3.1476342938456066, + "grad_norm": 1.0298972948721947, + "learning_rate": 5.721266039933806e-06, + "loss": 0.0526, + "step": 26544 + }, + { + "epoch": 3.1477528756077318, + "grad_norm": 0.34464262379438, + "learning_rate": 5.719737928618918e-06, + "loss": 0.0152, + "step": 26545 + }, + { + "epoch": 3.1478714573698565, + "grad_norm": 0.31128886138342626, + "learning_rate": 5.7182099950444045e-06, + "loss": 0.0209, + "step": 26546 + }, + { + "epoch": 3.1479900391319813, + "grad_norm": 0.6376810950066052, + "learning_rate": 5.71668223922435e-06, + "loss": 0.0441, + "step": 26547 + }, + { + "epoch": 3.1481086208941065, + "grad_norm": 0.6372475864418388, + "learning_rate": 5.715154661172844e-06, + "loss": 0.0299, + "step": 26548 + }, + { + "epoch": 3.1482272026562317, + "grad_norm": 0.7366146216610789, + "learning_rate": 5.713627260903954e-06, + "loss": 0.0431, + "step": 26549 + }, + { + "epoch": 3.1483457844183564, + "grad_norm": 0.3576252486601014, + "learning_rate": 5.712100038431778e-06, + "loss": 0.0142, + "step": 26550 + }, + { + "epoch": 3.1484643661804816, + "grad_norm": 0.6693882808168642, + "learning_rate": 5.710572993770383e-06, + "loss": 0.0372, + "step": 26551 + }, + { + "epoch": 3.1485829479426064, + "grad_norm": 0.6454864410202077, + "learning_rate": 5.7090461269338595e-06, + "loss": 0.04, + "step": 26552 + }, + { + "epoch": 3.1487015297047316, + "grad_norm": 0.40647217274426994, + "learning_rate": 5.707519437936257e-06, + "loss": 0.0188, + "step": 26553 + }, + { + "epoch": 3.1488201114668564, + "grad_norm": 0.4958807839197245, + "learning_rate": 5.705992926791684e-06, + "loss": 0.0171, + "step": 26554 + }, + { + "epoch": 3.1489386932289816, + "grad_norm": 0.4987326682964474, + "learning_rate": 5.7044665935141876e-06, + "loss": 0.0251, + "step": 26555 + }, + { + "epoch": 3.1490572749911063, + "grad_norm": 0.5462990981653086, + "learning_rate": 5.702940438117849e-06, + "loss": 0.0203, + "step": 26556 + }, + { + "epoch": 3.1491758567532315, + "grad_norm": 0.48780061276303704, + "learning_rate": 5.701414460616735e-06, + "loss": 0.0233, + "step": 26557 + }, + { + "epoch": 3.1492944385153563, + "grad_norm": 0.4951421467599058, + "learning_rate": 5.699888661024916e-06, + "loss": 0.0255, + "step": 26558 + }, + { + "epoch": 3.1494130202774815, + "grad_norm": 0.8189411315742582, + "learning_rate": 5.698363039356461e-06, + "loss": 0.0428, + "step": 26559 + }, + { + "epoch": 3.1495316020396062, + "grad_norm": 0.6380509961310558, + "learning_rate": 5.696837595625415e-06, + "loss": 0.0289, + "step": 26560 + }, + { + "epoch": 3.1496501838017315, + "grad_norm": 0.6512545278836288, + "learning_rate": 5.69531232984587e-06, + "loss": 0.0285, + "step": 26561 + }, + { + "epoch": 3.149768765563856, + "grad_norm": 0.3440223846764162, + "learning_rate": 5.693787242031868e-06, + "loss": 0.0163, + "step": 26562 + }, + { + "epoch": 3.1498873473259814, + "grad_norm": 0.40498095197337325, + "learning_rate": 5.69226233219747e-06, + "loss": 0.0219, + "step": 26563 + }, + { + "epoch": 3.150005929088106, + "grad_norm": 0.4011267816332027, + "learning_rate": 5.690737600356735e-06, + "loss": 0.0176, + "step": 26564 + }, + { + "epoch": 3.1501245108502314, + "grad_norm": 0.46662903999268157, + "learning_rate": 5.689213046523725e-06, + "loss": 0.0215, + "step": 26565 + }, + { + "epoch": 3.150243092612356, + "grad_norm": 0.9363607701605602, + "learning_rate": 5.687688670712493e-06, + "loss": 0.0386, + "step": 26566 + }, + { + "epoch": 3.1503616743744813, + "grad_norm": 0.6887110046460401, + "learning_rate": 5.686164472937086e-06, + "loss": 0.0234, + "step": 26567 + }, + { + "epoch": 3.150480256136606, + "grad_norm": 0.4885368418439986, + "learning_rate": 5.6846404532115565e-06, + "loss": 0.0319, + "step": 26568 + }, + { + "epoch": 3.1505988378987313, + "grad_norm": 0.6528168409109995, + "learning_rate": 5.683116611549955e-06, + "loss": 0.0326, + "step": 26569 + }, + { + "epoch": 3.150717419660856, + "grad_norm": 0.8395616197642036, + "learning_rate": 5.681592947966338e-06, + "loss": 0.0411, + "step": 26570 + }, + { + "epoch": 3.1508360014229813, + "grad_norm": 0.5861161698914285, + "learning_rate": 5.680069462474733e-06, + "loss": 0.025, + "step": 26571 + }, + { + "epoch": 3.150954583185106, + "grad_norm": 0.3887593293051977, + "learning_rate": 5.678546155089207e-06, + "loss": 0.0187, + "step": 26572 + }, + { + "epoch": 3.151073164947231, + "grad_norm": 0.23024903190683405, + "learning_rate": 5.677023025823785e-06, + "loss": 0.014, + "step": 26573 + }, + { + "epoch": 3.151191746709356, + "grad_norm": 0.47631072378532624, + "learning_rate": 5.6755000746925165e-06, + "loss": 0.027, + "step": 26574 + }, + { + "epoch": 3.151310328471481, + "grad_norm": 0.6769521497072645, + "learning_rate": 5.6739773017094375e-06, + "loss": 0.031, + "step": 26575 + }, + { + "epoch": 3.151428910233606, + "grad_norm": 0.5895592546007068, + "learning_rate": 5.672454706888591e-06, + "loss": 0.0291, + "step": 26576 + }, + { + "epoch": 3.151547491995731, + "grad_norm": 0.5016624943202945, + "learning_rate": 5.6709322902440186e-06, + "loss": 0.0241, + "step": 26577 + }, + { + "epoch": 3.151666073757856, + "grad_norm": 0.4795681599393951, + "learning_rate": 5.669410051789739e-06, + "loss": 0.022, + "step": 26578 + }, + { + "epoch": 3.151784655519981, + "grad_norm": 0.37123960081839974, + "learning_rate": 5.667887991539794e-06, + "loss": 0.024, + "step": 26579 + }, + { + "epoch": 3.151903237282106, + "grad_norm": 0.5478626072401132, + "learning_rate": 5.6663661095082175e-06, + "loss": 0.0245, + "step": 26580 + }, + { + "epoch": 3.152021819044231, + "grad_norm": 0.6797552609797196, + "learning_rate": 5.664844405709041e-06, + "loss": 0.0315, + "step": 26581 + }, + { + "epoch": 3.152140400806356, + "grad_norm": 0.5169363666498958, + "learning_rate": 5.663322880156277e-06, + "loss": 0.0258, + "step": 26582 + }, + { + "epoch": 3.152258982568481, + "grad_norm": 0.5983167696969309, + "learning_rate": 5.661801532863978e-06, + "loss": 0.0246, + "step": 26583 + }, + { + "epoch": 3.152377564330606, + "grad_norm": 0.45995688807837276, + "learning_rate": 5.660280363846146e-06, + "loss": 0.0271, + "step": 26584 + }, + { + "epoch": 3.152496146092731, + "grad_norm": 0.5916461285047279, + "learning_rate": 5.6587593731168125e-06, + "loss": 0.0319, + "step": 26585 + }, + { + "epoch": 3.1526147278548557, + "grad_norm": 0.44402319585487193, + "learning_rate": 5.657238560690001e-06, + "loss": 0.0193, + "step": 26586 + }, + { + "epoch": 3.152733309616981, + "grad_norm": 0.4831016992576706, + "learning_rate": 5.655717926579731e-06, + "loss": 0.0239, + "step": 26587 + }, + { + "epoch": 3.1528518913791057, + "grad_norm": 0.5122591679597002, + "learning_rate": 5.654197470800026e-06, + "loss": 0.0247, + "step": 26588 + }, + { + "epoch": 3.152970473141231, + "grad_norm": 0.4522386655295397, + "learning_rate": 5.6526771933648915e-06, + "loss": 0.0165, + "step": 26589 + }, + { + "epoch": 3.1530890549033557, + "grad_norm": 0.863066393736565, + "learning_rate": 5.651157094288345e-06, + "loss": 0.0446, + "step": 26590 + }, + { + "epoch": 3.153207636665481, + "grad_norm": 0.886404180688407, + "learning_rate": 5.649637173584407e-06, + "loss": 0.0608, + "step": 26591 + }, + { + "epoch": 3.1533262184276056, + "grad_norm": 0.639482269552278, + "learning_rate": 5.6481174312670885e-06, + "loss": 0.0303, + "step": 26592 + }, + { + "epoch": 3.153444800189731, + "grad_norm": 0.7825256477438343, + "learning_rate": 5.646597867350392e-06, + "loss": 0.0366, + "step": 26593 + }, + { + "epoch": 3.153563381951856, + "grad_norm": 0.6222978926406758, + "learning_rate": 5.645078481848329e-06, + "loss": 0.0322, + "step": 26594 + }, + { + "epoch": 3.153681963713981, + "grad_norm": 0.5722274146757342, + "learning_rate": 5.64355927477491e-06, + "loss": 0.0328, + "step": 26595 + }, + { + "epoch": 3.1538005454761056, + "grad_norm": 0.5014712175888643, + "learning_rate": 5.642040246144137e-06, + "loss": 0.0236, + "step": 26596 + }, + { + "epoch": 3.1539191272382308, + "grad_norm": 0.6811818094397059, + "learning_rate": 5.640521395970014e-06, + "loss": 0.0497, + "step": 26597 + }, + { + "epoch": 3.154037709000356, + "grad_norm": 0.3537557709195875, + "learning_rate": 5.6390027242665425e-06, + "loss": 0.0171, + "step": 26598 + }, + { + "epoch": 3.1541562907624807, + "grad_norm": 0.5690250867398374, + "learning_rate": 5.637484231047732e-06, + "loss": 0.0334, + "step": 26599 + }, + { + "epoch": 3.154274872524606, + "grad_norm": 0.30618952500065916, + "learning_rate": 5.635965916327565e-06, + "loss": 0.0218, + "step": 26600 + }, + { + "epoch": 3.1543934542867307, + "grad_norm": 0.3962223201012897, + "learning_rate": 5.634447780120047e-06, + "loss": 0.0253, + "step": 26601 + }, + { + "epoch": 3.154512036048856, + "grad_norm": 0.7390115206593836, + "learning_rate": 5.6329298224391754e-06, + "loss": 0.045, + "step": 26602 + }, + { + "epoch": 3.1546306178109806, + "grad_norm": 0.8415808425312654, + "learning_rate": 5.631412043298945e-06, + "loss": 0.0429, + "step": 26603 + }, + { + "epoch": 3.154749199573106, + "grad_norm": 0.45609098772567436, + "learning_rate": 5.629894442713341e-06, + "loss": 0.0244, + "step": 26604 + }, + { + "epoch": 3.1548677813352306, + "grad_norm": 0.6407850438187326, + "learning_rate": 5.6283770206963545e-06, + "loss": 0.0318, + "step": 26605 + }, + { + "epoch": 3.154986363097356, + "grad_norm": 0.5197665351046238, + "learning_rate": 5.626859777261975e-06, + "loss": 0.0216, + "step": 26606 + }, + { + "epoch": 3.1551049448594806, + "grad_norm": 0.44382483187713045, + "learning_rate": 5.625342712424195e-06, + "loss": 0.0222, + "step": 26607 + }, + { + "epoch": 3.1552235266216058, + "grad_norm": 0.44687303323880107, + "learning_rate": 5.623825826197002e-06, + "loss": 0.0229, + "step": 26608 + }, + { + "epoch": 3.1553421083837305, + "grad_norm": 0.6016046586231091, + "learning_rate": 5.62230911859436e-06, + "loss": 0.0322, + "step": 26609 + }, + { + "epoch": 3.1554606901458557, + "grad_norm": 0.4525455067817098, + "learning_rate": 5.620792589630278e-06, + "loss": 0.0277, + "step": 26610 + }, + { + "epoch": 3.1555792719079805, + "grad_norm": 0.664016422136047, + "learning_rate": 5.619276239318719e-06, + "loss": 0.0337, + "step": 26611 + }, + { + "epoch": 3.1556978536701057, + "grad_norm": 0.4769741349010641, + "learning_rate": 5.617760067673666e-06, + "loss": 0.0212, + "step": 26612 + }, + { + "epoch": 3.1558164354322304, + "grad_norm": 0.24428607418664183, + "learning_rate": 5.616244074709098e-06, + "loss": 0.0112, + "step": 26613 + }, + { + "epoch": 3.1559350171943557, + "grad_norm": 0.43167705189003325, + "learning_rate": 5.6147282604389895e-06, + "loss": 0.0241, + "step": 26614 + }, + { + "epoch": 3.1560535989564804, + "grad_norm": 0.6038021328274408, + "learning_rate": 5.61321262487732e-06, + "loss": 0.0322, + "step": 26615 + }, + { + "epoch": 3.1561721807186056, + "grad_norm": 0.40571367420814874, + "learning_rate": 5.6116971680380435e-06, + "loss": 0.02, + "step": 26616 + }, + { + "epoch": 3.1562907624807304, + "grad_norm": 0.42774192457661264, + "learning_rate": 5.610181889935159e-06, + "loss": 0.0308, + "step": 26617 + }, + { + "epoch": 3.1564093442428556, + "grad_norm": 0.7603706727904274, + "learning_rate": 5.6086667905826115e-06, + "loss": 0.0446, + "step": 26618 + }, + { + "epoch": 3.1565279260049803, + "grad_norm": 0.39898155567516974, + "learning_rate": 5.6071518699943846e-06, + "loss": 0.0179, + "step": 26619 + }, + { + "epoch": 3.1566465077671055, + "grad_norm": 0.6306825913867191, + "learning_rate": 5.605637128184424e-06, + "loss": 0.0371, + "step": 26620 + }, + { + "epoch": 3.1567650895292303, + "grad_norm": 0.6076442742104416, + "learning_rate": 5.604122565166717e-06, + "loss": 0.027, + "step": 26621 + }, + { + "epoch": 3.1568836712913555, + "grad_norm": 0.4557759464683438, + "learning_rate": 5.60260818095521e-06, + "loss": 0.0227, + "step": 26622 + }, + { + "epoch": 3.1570022530534803, + "grad_norm": 0.3119016819665012, + "learning_rate": 5.601093975563868e-06, + "loss": 0.0119, + "step": 26623 + }, + { + "epoch": 3.1571208348156055, + "grad_norm": 0.5644064562534722, + "learning_rate": 5.599579949006651e-06, + "loss": 0.0198, + "step": 26624 + }, + { + "epoch": 3.15723941657773, + "grad_norm": 0.7195544666409116, + "learning_rate": 5.5980661012975165e-06, + "loss": 0.0409, + "step": 26625 + }, + { + "epoch": 3.1573579983398554, + "grad_norm": 0.5560744065102661, + "learning_rate": 5.596552432450428e-06, + "loss": 0.0266, + "step": 26626 + }, + { + "epoch": 3.15747658010198, + "grad_norm": 0.3694347643354028, + "learning_rate": 5.59503894247932e-06, + "loss": 0.0166, + "step": 26627 + }, + { + "epoch": 3.1575951618641054, + "grad_norm": 0.3895285804571392, + "learning_rate": 5.593525631398167e-06, + "loss": 0.019, + "step": 26628 + }, + { + "epoch": 3.15771374362623, + "grad_norm": 0.49707156261795765, + "learning_rate": 5.592012499220906e-06, + "loss": 0.0152, + "step": 26629 + }, + { + "epoch": 3.1578323253883553, + "grad_norm": 0.5476513881473059, + "learning_rate": 5.590499545961495e-06, + "loss": 0.0292, + "step": 26630 + }, + { + "epoch": 3.15795090715048, + "grad_norm": 0.4434607461536729, + "learning_rate": 5.588986771633864e-06, + "loss": 0.0247, + "step": 26631 + }, + { + "epoch": 3.1580694889126053, + "grad_norm": 0.39486660494170345, + "learning_rate": 5.587474176251984e-06, + "loss": 0.0198, + "step": 26632 + }, + { + "epoch": 3.15818807067473, + "grad_norm": 0.7158026753544494, + "learning_rate": 5.58596175982978e-06, + "loss": 0.0357, + "step": 26633 + }, + { + "epoch": 3.1583066524368553, + "grad_norm": 0.35456394169791267, + "learning_rate": 5.584449522381205e-06, + "loss": 0.0143, + "step": 26634 + }, + { + "epoch": 3.15842523419898, + "grad_norm": 0.7763390793083799, + "learning_rate": 5.582937463920193e-06, + "loss": 0.0425, + "step": 26635 + }, + { + "epoch": 3.1585438159611052, + "grad_norm": 0.6020293867749708, + "learning_rate": 5.581425584460687e-06, + "loss": 0.0316, + "step": 26636 + }, + { + "epoch": 3.15866239772323, + "grad_norm": 0.5066444049013469, + "learning_rate": 5.579913884016633e-06, + "loss": 0.0234, + "step": 26637 + }, + { + "epoch": 3.158780979485355, + "grad_norm": 0.5129731182383555, + "learning_rate": 5.5784023626019504e-06, + "loss": 0.0284, + "step": 26638 + }, + { + "epoch": 3.15889956124748, + "grad_norm": 0.41223949646486846, + "learning_rate": 5.576891020230582e-06, + "loss": 0.0185, + "step": 26639 + }, + { + "epoch": 3.159018143009605, + "grad_norm": 0.31086884832271366, + "learning_rate": 5.57537985691646e-06, + "loss": 0.014, + "step": 26640 + }, + { + "epoch": 3.15913672477173, + "grad_norm": 0.495380124354331, + "learning_rate": 5.573868872673524e-06, + "loss": 0.0224, + "step": 26641 + }, + { + "epoch": 3.159255306533855, + "grad_norm": 0.6532597895721629, + "learning_rate": 5.5723580675156786e-06, + "loss": 0.0287, + "step": 26642 + }, + { + "epoch": 3.15937388829598, + "grad_norm": 0.7877203254114351, + "learning_rate": 5.570847441456883e-06, + "loss": 0.0379, + "step": 26643 + }, + { + "epoch": 3.159492470058105, + "grad_norm": 0.37896503831854017, + "learning_rate": 5.569336994511043e-06, + "loss": 0.0157, + "step": 26644 + }, + { + "epoch": 3.15961105182023, + "grad_norm": 0.5480394174531984, + "learning_rate": 5.567826726692088e-06, + "loss": 0.0264, + "step": 26645 + }, + { + "epoch": 3.159729633582355, + "grad_norm": 0.610554383340205, + "learning_rate": 5.566316638013941e-06, + "loss": 0.0268, + "step": 26646 + }, + { + "epoch": 3.1598482153444802, + "grad_norm": 0.5985654750563624, + "learning_rate": 5.5648067284905205e-06, + "loss": 0.0324, + "step": 26647 + }, + { + "epoch": 3.159966797106605, + "grad_norm": 0.9368413839201747, + "learning_rate": 5.563296998135758e-06, + "loss": 0.0484, + "step": 26648 + }, + { + "epoch": 3.1600853788687298, + "grad_norm": 0.5897524142164748, + "learning_rate": 5.561787446963557e-06, + "loss": 0.0288, + "step": 26649 + }, + { + "epoch": 3.160203960630855, + "grad_norm": 0.4997943625897073, + "learning_rate": 5.560278074987838e-06, + "loss": 0.0261, + "step": 26650 + }, + { + "epoch": 3.16032254239298, + "grad_norm": 0.33964160769384344, + "learning_rate": 5.558768882222518e-06, + "loss": 0.0191, + "step": 26651 + }, + { + "epoch": 3.160441124155105, + "grad_norm": 0.5867920944600887, + "learning_rate": 5.557259868681513e-06, + "loss": 0.0328, + "step": 26652 + }, + { + "epoch": 3.16055970591723, + "grad_norm": 1.2876167773105969, + "learning_rate": 5.555751034378723e-06, + "loss": 0.0261, + "step": 26653 + }, + { + "epoch": 3.160678287679355, + "grad_norm": 0.5914519963827372, + "learning_rate": 5.554242379328065e-06, + "loss": 0.0292, + "step": 26654 + }, + { + "epoch": 3.16079686944148, + "grad_norm": 0.47826560871631996, + "learning_rate": 5.552733903543447e-06, + "loss": 0.022, + "step": 26655 + }, + { + "epoch": 3.160915451203605, + "grad_norm": 0.5591702200099782, + "learning_rate": 5.5512256070387735e-06, + "loss": 0.0218, + "step": 26656 + }, + { + "epoch": 3.16103403296573, + "grad_norm": 0.5618164289533202, + "learning_rate": 5.549717489827952e-06, + "loss": 0.0315, + "step": 26657 + }, + { + "epoch": 3.161152614727855, + "grad_norm": 1.372807177882171, + "learning_rate": 5.548209551924882e-06, + "loss": 0.0677, + "step": 26658 + }, + { + "epoch": 3.16127119648998, + "grad_norm": 0.43490764991400105, + "learning_rate": 5.5467017933434725e-06, + "loss": 0.0218, + "step": 26659 + }, + { + "epoch": 3.1613897782521048, + "grad_norm": 0.6329157173893086, + "learning_rate": 5.545194214097612e-06, + "loss": 0.0295, + "step": 26660 + }, + { + "epoch": 3.16150836001423, + "grad_norm": 0.4793956041476161, + "learning_rate": 5.543686814201204e-06, + "loss": 0.0176, + "step": 26661 + }, + { + "epoch": 3.1616269417763547, + "grad_norm": 0.8133051954632533, + "learning_rate": 5.542179593668142e-06, + "loss": 0.0395, + "step": 26662 + }, + { + "epoch": 3.16174552353848, + "grad_norm": 0.3922801346389591, + "learning_rate": 5.540672552512335e-06, + "loss": 0.0177, + "step": 26663 + }, + { + "epoch": 3.1618641053006047, + "grad_norm": 0.8189906547589164, + "learning_rate": 5.539165690747655e-06, + "loss": 0.0347, + "step": 26664 + }, + { + "epoch": 3.16198268706273, + "grad_norm": 0.8532087012400474, + "learning_rate": 5.537659008388002e-06, + "loss": 0.0495, + "step": 26665 + }, + { + "epoch": 3.1621012688248546, + "grad_norm": 0.6355937775994145, + "learning_rate": 5.536152505447268e-06, + "loss": 0.0335, + "step": 26666 + }, + { + "epoch": 3.16221985058698, + "grad_norm": 0.4679963914209739, + "learning_rate": 5.53464618193934e-06, + "loss": 0.026, + "step": 26667 + }, + { + "epoch": 3.1623384323491046, + "grad_norm": 0.3046907697715899, + "learning_rate": 5.533140037878104e-06, + "loss": 0.0158, + "step": 26668 + }, + { + "epoch": 3.16245701411123, + "grad_norm": 0.6167404360071604, + "learning_rate": 5.531634073277442e-06, + "loss": 0.0303, + "step": 26669 + }, + { + "epoch": 3.1625755958733546, + "grad_norm": 0.581131604725843, + "learning_rate": 5.530128288151251e-06, + "loss": 0.0245, + "step": 26670 + }, + { + "epoch": 3.1626941776354798, + "grad_norm": 0.49421846995523716, + "learning_rate": 5.528622682513393e-06, + "loss": 0.0242, + "step": 26671 + }, + { + "epoch": 3.1628127593976045, + "grad_norm": 0.6346374649248514, + "learning_rate": 5.527117256377756e-06, + "loss": 0.0409, + "step": 26672 + }, + { + "epoch": 3.1629313411597297, + "grad_norm": 0.5369635809030873, + "learning_rate": 5.5256120097582215e-06, + "loss": 0.0176, + "step": 26673 + }, + { + "epoch": 3.1630499229218545, + "grad_norm": 0.5053882107652605, + "learning_rate": 5.524106942668661e-06, + "loss": 0.0298, + "step": 26674 + }, + { + "epoch": 3.1631685046839797, + "grad_norm": 0.5193856002686691, + "learning_rate": 5.522602055122958e-06, + "loss": 0.0235, + "step": 26675 + }, + { + "epoch": 3.1632870864461045, + "grad_norm": 0.2787619779247167, + "learning_rate": 5.5210973471349656e-06, + "loss": 0.0133, + "step": 26676 + }, + { + "epoch": 3.1634056682082297, + "grad_norm": 0.5416771010394478, + "learning_rate": 5.519592818718583e-06, + "loss": 0.0227, + "step": 26677 + }, + { + "epoch": 3.1635242499703544, + "grad_norm": 0.45513253190118896, + "learning_rate": 5.518088469887661e-06, + "loss": 0.0204, + "step": 26678 + }, + { + "epoch": 3.1636428317324796, + "grad_norm": 0.7058362929988333, + "learning_rate": 5.516584300656077e-06, + "loss": 0.0444, + "step": 26679 + }, + { + "epoch": 3.1637614134946044, + "grad_norm": 0.6633681150523794, + "learning_rate": 5.515080311037682e-06, + "loss": 0.0342, + "step": 26680 + }, + { + "epoch": 3.1638799952567296, + "grad_norm": 0.4698969827194819, + "learning_rate": 5.513576501046366e-06, + "loss": 0.0203, + "step": 26681 + }, + { + "epoch": 3.1639985770188543, + "grad_norm": 0.37103222759795124, + "learning_rate": 5.5120728706959724e-06, + "loss": 0.0225, + "step": 26682 + }, + { + "epoch": 3.1641171587809795, + "grad_norm": 0.6556792537597127, + "learning_rate": 5.510569420000367e-06, + "loss": 0.0353, + "step": 26683 + }, + { + "epoch": 3.1642357405431043, + "grad_norm": 0.5246302529082818, + "learning_rate": 5.509066148973413e-06, + "loss": 0.0364, + "step": 26684 + }, + { + "epoch": 3.1643543223052295, + "grad_norm": 0.5120428286771012, + "learning_rate": 5.50756305762897e-06, + "loss": 0.0243, + "step": 26685 + }, + { + "epoch": 3.1644729040673543, + "grad_norm": 0.6400210556389612, + "learning_rate": 5.506060145980896e-06, + "loss": 0.0422, + "step": 26686 + }, + { + "epoch": 3.1645914858294795, + "grad_norm": 0.7028948818814551, + "learning_rate": 5.504557414043032e-06, + "loss": 0.0379, + "step": 26687 + }, + { + "epoch": 3.1647100675916042, + "grad_norm": 0.5320965549280915, + "learning_rate": 5.5030548618292515e-06, + "loss": 0.0317, + "step": 26688 + }, + { + "epoch": 3.1648286493537294, + "grad_norm": 0.38345873417187376, + "learning_rate": 5.501552489353392e-06, + "loss": 0.0258, + "step": 26689 + }, + { + "epoch": 3.164947231115854, + "grad_norm": 0.5988139009626915, + "learning_rate": 5.500050296629317e-06, + "loss": 0.0308, + "step": 26690 + }, + { + "epoch": 3.1650658128779794, + "grad_norm": 0.527700164358762, + "learning_rate": 5.498548283670848e-06, + "loss": 0.0281, + "step": 26691 + }, + { + "epoch": 3.165184394640104, + "grad_norm": 0.45917698734887175, + "learning_rate": 5.497046450491866e-06, + "loss": 0.0186, + "step": 26692 + }, + { + "epoch": 3.1653029764022294, + "grad_norm": 0.3961475707044245, + "learning_rate": 5.495544797106192e-06, + "loss": 0.019, + "step": 26693 + }, + { + "epoch": 3.165421558164354, + "grad_norm": 0.43082807047418076, + "learning_rate": 5.494043323527675e-06, + "loss": 0.0281, + "step": 26694 + }, + { + "epoch": 3.1655401399264793, + "grad_norm": 0.4875196624021285, + "learning_rate": 5.4925420297701616e-06, + "loss": 0.0244, + "step": 26695 + }, + { + "epoch": 3.1656587216886045, + "grad_norm": 0.5695649295574221, + "learning_rate": 5.491040915847487e-06, + "loss": 0.0301, + "step": 26696 + }, + { + "epoch": 3.1657773034507293, + "grad_norm": 0.29331902382863034, + "learning_rate": 5.4895399817735e-06, + "loss": 0.011, + "step": 26697 + }, + { + "epoch": 3.165895885212854, + "grad_norm": 0.5610909233469809, + "learning_rate": 5.4880392275620164e-06, + "loss": 0.0275, + "step": 26698 + }, + { + "epoch": 3.1660144669749792, + "grad_norm": 0.3878037471302639, + "learning_rate": 5.486538653226894e-06, + "loss": 0.0261, + "step": 26699 + }, + { + "epoch": 3.1661330487371044, + "grad_norm": 0.6906787482529037, + "learning_rate": 5.4850382587819525e-06, + "loss": 0.0329, + "step": 26700 + }, + { + "epoch": 3.166251630499229, + "grad_norm": 0.4454800388023638, + "learning_rate": 5.483538044241035e-06, + "loss": 0.0235, + "step": 26701 + }, + { + "epoch": 3.1663702122613544, + "grad_norm": 0.7748776986314141, + "learning_rate": 5.482038009617949e-06, + "loss": 0.0442, + "step": 26702 + }, + { + "epoch": 3.166488794023479, + "grad_norm": 0.5263954710983711, + "learning_rate": 5.480538154926551e-06, + "loss": 0.0298, + "step": 26703 + }, + { + "epoch": 3.1666073757856044, + "grad_norm": 0.4256685301779993, + "learning_rate": 5.479038480180649e-06, + "loss": 0.0197, + "step": 26704 + }, + { + "epoch": 3.166725957547729, + "grad_norm": 0.6694135167410876, + "learning_rate": 5.4775389853940735e-06, + "loss": 0.0441, + "step": 26705 + }, + { + "epoch": 3.1668445393098543, + "grad_norm": 0.7857330373306312, + "learning_rate": 5.476039670580646e-06, + "loss": 0.0512, + "step": 26706 + }, + { + "epoch": 3.166963121071979, + "grad_norm": 0.36020711887452855, + "learning_rate": 5.474540535754194e-06, + "loss": 0.0163, + "step": 26707 + }, + { + "epoch": 3.1670817028341043, + "grad_norm": 0.46759409713390915, + "learning_rate": 5.4730415809285425e-06, + "loss": 0.0274, + "step": 26708 + }, + { + "epoch": 3.167200284596229, + "grad_norm": 0.7367709031179004, + "learning_rate": 5.471542806117494e-06, + "loss": 0.0395, + "step": 26709 + }, + { + "epoch": 3.1673188663583542, + "grad_norm": 0.3291176863433781, + "learning_rate": 5.470044211334871e-06, + "loss": 0.0169, + "step": 26710 + }, + { + "epoch": 3.167437448120479, + "grad_norm": 1.0399312302954402, + "learning_rate": 5.468545796594493e-06, + "loss": 0.047, + "step": 26711 + }, + { + "epoch": 3.167556029882604, + "grad_norm": 0.49275819577584107, + "learning_rate": 5.46704756191018e-06, + "loss": 0.0225, + "step": 26712 + }, + { + "epoch": 3.167674611644729, + "grad_norm": 0.5455624506497216, + "learning_rate": 5.46554950729572e-06, + "loss": 0.0202, + "step": 26713 + }, + { + "epoch": 3.167793193406854, + "grad_norm": 0.5702892253792198, + "learning_rate": 5.464051632764955e-06, + "loss": 0.04, + "step": 26714 + }, + { + "epoch": 3.167911775168979, + "grad_norm": 0.5225814026508266, + "learning_rate": 5.462553938331666e-06, + "loss": 0.0333, + "step": 26715 + }, + { + "epoch": 3.168030356931104, + "grad_norm": 0.5089805022195287, + "learning_rate": 5.461056424009675e-06, + "loss": 0.0262, + "step": 26716 + }, + { + "epoch": 3.168148938693229, + "grad_norm": 0.854781909004644, + "learning_rate": 5.459559089812783e-06, + "loss": 0.0453, + "step": 26717 + }, + { + "epoch": 3.168267520455354, + "grad_norm": 0.5736952935338923, + "learning_rate": 5.458061935754794e-06, + "loss": 0.0312, + "step": 26718 + }, + { + "epoch": 3.168386102217479, + "grad_norm": 0.49248841529545195, + "learning_rate": 5.456564961849517e-06, + "loss": 0.029, + "step": 26719 + }, + { + "epoch": 3.168504683979604, + "grad_norm": 0.43418511108965685, + "learning_rate": 5.455068168110735e-06, + "loss": 0.0148, + "step": 26720 + }, + { + "epoch": 3.168623265741729, + "grad_norm": 0.48311070922969684, + "learning_rate": 5.453571554552262e-06, + "loss": 0.022, + "step": 26721 + }, + { + "epoch": 3.168741847503854, + "grad_norm": 0.5955857962872563, + "learning_rate": 5.4520751211878886e-06, + "loss": 0.0294, + "step": 26722 + }, + { + "epoch": 3.1688604292659788, + "grad_norm": 0.401166068841645, + "learning_rate": 5.450578868031409e-06, + "loss": 0.0174, + "step": 26723 + }, + { + "epoch": 3.168979011028104, + "grad_norm": 0.564569226103879, + "learning_rate": 5.449082795096627e-06, + "loss": 0.0249, + "step": 26724 + }, + { + "epoch": 3.1690975927902287, + "grad_norm": 0.6738634091322023, + "learning_rate": 5.447586902397314e-06, + "loss": 0.0375, + "step": 26725 + }, + { + "epoch": 3.169216174552354, + "grad_norm": 0.38784789238391076, + "learning_rate": 5.4460911899472864e-06, + "loss": 0.0205, + "step": 26726 + }, + { + "epoch": 3.1693347563144787, + "grad_norm": 0.7138085552075453, + "learning_rate": 5.444595657760312e-06, + "loss": 0.0344, + "step": 26727 + }, + { + "epoch": 3.169453338076604, + "grad_norm": 0.4621935581536481, + "learning_rate": 5.443100305850185e-06, + "loss": 0.0242, + "step": 26728 + }, + { + "epoch": 3.1695719198387287, + "grad_norm": 0.7939860276844112, + "learning_rate": 5.441605134230693e-06, + "loss": 0.0494, + "step": 26729 + }, + { + "epoch": 3.169690501600854, + "grad_norm": 0.8543588480729267, + "learning_rate": 5.440110142915622e-06, + "loss": 0.0363, + "step": 26730 + }, + { + "epoch": 3.1698090833629786, + "grad_norm": 0.7898565018637371, + "learning_rate": 5.438615331918745e-06, + "loss": 0.0391, + "step": 26731 + }, + { + "epoch": 3.169927665125104, + "grad_norm": 0.34098792414729207, + "learning_rate": 5.437120701253845e-06, + "loss": 0.0198, + "step": 26732 + }, + { + "epoch": 3.1700462468872286, + "grad_norm": 0.5485708613383049, + "learning_rate": 5.435626250934703e-06, + "loss": 0.0275, + "step": 26733 + }, + { + "epoch": 3.170164828649354, + "grad_norm": 0.481115096659025, + "learning_rate": 5.4341319809750964e-06, + "loss": 0.0332, + "step": 26734 + }, + { + "epoch": 3.1702834104114785, + "grad_norm": 0.6878561817478288, + "learning_rate": 5.432637891388806e-06, + "loss": 0.0383, + "step": 26735 + }, + { + "epoch": 3.1704019921736037, + "grad_norm": 0.4079535202290307, + "learning_rate": 5.431143982189585e-06, + "loss": 0.0172, + "step": 26736 + }, + { + "epoch": 3.1705205739357285, + "grad_norm": 0.4214816620644817, + "learning_rate": 5.429650253391233e-06, + "loss": 0.026, + "step": 26737 + }, + { + "epoch": 3.1706391556978537, + "grad_norm": 0.6137695776477892, + "learning_rate": 5.4281567050075e-06, + "loss": 0.0368, + "step": 26738 + }, + { + "epoch": 3.1707577374599785, + "grad_norm": 0.78725275220998, + "learning_rate": 5.426663337052162e-06, + "loss": 0.0334, + "step": 26739 + }, + { + "epoch": 3.1708763192221037, + "grad_norm": 0.4399612354286195, + "learning_rate": 5.425170149538986e-06, + "loss": 0.0231, + "step": 26740 + }, + { + "epoch": 3.1709949009842284, + "grad_norm": 0.5910032812961574, + "learning_rate": 5.4236771424817455e-06, + "loss": 0.0271, + "step": 26741 + }, + { + "epoch": 3.1711134827463536, + "grad_norm": 0.6578898526940677, + "learning_rate": 5.422184315894185e-06, + "loss": 0.032, + "step": 26742 + }, + { + "epoch": 3.1712320645084784, + "grad_norm": 0.6214806636989694, + "learning_rate": 5.420691669790079e-06, + "loss": 0.0319, + "step": 26743 + }, + { + "epoch": 3.1713506462706036, + "grad_norm": 0.3947991240829268, + "learning_rate": 5.419199204183187e-06, + "loss": 0.018, + "step": 26744 + }, + { + "epoch": 3.171469228032729, + "grad_norm": 0.34455790541452275, + "learning_rate": 5.417706919087265e-06, + "loss": 0.0182, + "step": 26745 + }, + { + "epoch": 3.1715878097948536, + "grad_norm": 0.5638888866163443, + "learning_rate": 5.41621481451608e-06, + "loss": 0.035, + "step": 26746 + }, + { + "epoch": 3.1717063915569783, + "grad_norm": 0.5817569221000787, + "learning_rate": 5.414722890483364e-06, + "loss": 0.0243, + "step": 26747 + }, + { + "epoch": 3.1718249733191035, + "grad_norm": 0.43688663114223547, + "learning_rate": 5.413231147002903e-06, + "loss": 0.0217, + "step": 26748 + }, + { + "epoch": 3.1719435550812287, + "grad_norm": 0.5309896156379215, + "learning_rate": 5.411739584088421e-06, + "loss": 0.029, + "step": 26749 + }, + { + "epoch": 3.1720621368433535, + "grad_norm": 0.48447605829924373, + "learning_rate": 5.41024820175369e-06, + "loss": 0.027, + "step": 26750 + }, + { + "epoch": 3.1721807186054787, + "grad_norm": 0.30425748007982434, + "learning_rate": 5.408757000012432e-06, + "loss": 0.0136, + "step": 26751 + }, + { + "epoch": 3.1722993003676034, + "grad_norm": 0.8706417153069349, + "learning_rate": 5.407265978878423e-06, + "loss": 0.0633, + "step": 26752 + }, + { + "epoch": 3.1724178821297286, + "grad_norm": 0.711793280719569, + "learning_rate": 5.405775138365391e-06, + "loss": 0.032, + "step": 26753 + }, + { + "epoch": 3.1725364638918534, + "grad_norm": 0.6767505318807576, + "learning_rate": 5.404284478487082e-06, + "loss": 0.0281, + "step": 26754 + }, + { + "epoch": 3.1726550456539786, + "grad_norm": 0.6276445223452962, + "learning_rate": 5.4027939992572415e-06, + "loss": 0.0386, + "step": 26755 + }, + { + "epoch": 3.1727736274161034, + "grad_norm": 0.5104551509351485, + "learning_rate": 5.401303700689608e-06, + "loss": 0.0315, + "step": 26756 + }, + { + "epoch": 3.1728922091782286, + "grad_norm": 0.783167499475459, + "learning_rate": 5.399813582797928e-06, + "loss": 0.0369, + "step": 26757 + }, + { + "epoch": 3.1730107909403533, + "grad_norm": 0.5055425303238021, + "learning_rate": 5.398323645595918e-06, + "loss": 0.0258, + "step": 26758 + }, + { + "epoch": 3.1731293727024785, + "grad_norm": 0.528385705826086, + "learning_rate": 5.396833889097341e-06, + "loss": 0.0285, + "step": 26759 + }, + { + "epoch": 3.1732479544646033, + "grad_norm": 0.6380171405945844, + "learning_rate": 5.3953443133159085e-06, + "loss": 0.0276, + "step": 26760 + }, + { + "epoch": 3.1733665362267285, + "grad_norm": 0.8326696812288806, + "learning_rate": 5.393854918265367e-06, + "loss": 0.0373, + "step": 26761 + }, + { + "epoch": 3.1734851179888532, + "grad_norm": 0.5059928927794698, + "learning_rate": 5.392365703959429e-06, + "loss": 0.0267, + "step": 26762 + }, + { + "epoch": 3.1736036997509784, + "grad_norm": 0.412125306577279, + "learning_rate": 5.390876670411848e-06, + "loss": 0.0173, + "step": 26763 + }, + { + "epoch": 3.173722281513103, + "grad_norm": 0.6338115349567435, + "learning_rate": 5.389387817636329e-06, + "loss": 0.0263, + "step": 26764 + }, + { + "epoch": 3.1738408632752284, + "grad_norm": 0.5826166912721881, + "learning_rate": 5.3878991456466075e-06, + "loss": 0.0258, + "step": 26765 + }, + { + "epoch": 3.173959445037353, + "grad_norm": 0.6729858767563561, + "learning_rate": 5.386410654456406e-06, + "loss": 0.0344, + "step": 26766 + }, + { + "epoch": 3.1740780267994784, + "grad_norm": 0.5270856821897392, + "learning_rate": 5.384922344079446e-06, + "loss": 0.0296, + "step": 26767 + }, + { + "epoch": 3.174196608561603, + "grad_norm": 0.46378438198269584, + "learning_rate": 5.383434214529456e-06, + "loss": 0.0261, + "step": 26768 + }, + { + "epoch": 3.1743151903237283, + "grad_norm": 0.870864313638614, + "learning_rate": 5.381946265820134e-06, + "loss": 0.0478, + "step": 26769 + }, + { + "epoch": 3.174433772085853, + "grad_norm": 0.651864068768727, + "learning_rate": 5.380458497965221e-06, + "loss": 0.04, + "step": 26770 + }, + { + "epoch": 3.1745523538479783, + "grad_norm": 0.692815191892806, + "learning_rate": 5.378970910978417e-06, + "loss": 0.0361, + "step": 26771 + }, + { + "epoch": 3.174670935610103, + "grad_norm": 0.47676657747842716, + "learning_rate": 5.37748350487344e-06, + "loss": 0.0192, + "step": 26772 + }, + { + "epoch": 3.1747895173722283, + "grad_norm": 0.48527014032001664, + "learning_rate": 5.375996279664003e-06, + "loss": 0.0295, + "step": 26773 + }, + { + "epoch": 3.174908099134353, + "grad_norm": 0.2384127238127367, + "learning_rate": 5.374509235363815e-06, + "loss": 0.0116, + "step": 26774 + }, + { + "epoch": 3.175026680896478, + "grad_norm": 0.5821865983844606, + "learning_rate": 5.373022371986591e-06, + "loss": 0.0271, + "step": 26775 + }, + { + "epoch": 3.175145262658603, + "grad_norm": 0.4950805157951881, + "learning_rate": 5.371535689546028e-06, + "loss": 0.0263, + "step": 26776 + }, + { + "epoch": 3.175263844420728, + "grad_norm": 0.4251689163878971, + "learning_rate": 5.370049188055834e-06, + "loss": 0.0159, + "step": 26777 + }, + { + "epoch": 3.175382426182853, + "grad_norm": 0.7167405557750278, + "learning_rate": 5.3685628675297164e-06, + "loss": 0.0277, + "step": 26778 + }, + { + "epoch": 3.175501007944978, + "grad_norm": 0.599474579689702, + "learning_rate": 5.367076727981382e-06, + "loss": 0.0287, + "step": 26779 + }, + { + "epoch": 3.175619589707103, + "grad_norm": 0.6315400415551827, + "learning_rate": 5.365590769424517e-06, + "loss": 0.0346, + "step": 26780 + }, + { + "epoch": 3.175738171469228, + "grad_norm": 0.5872893752072457, + "learning_rate": 5.3641049918728285e-06, + "loss": 0.032, + "step": 26781 + }, + { + "epoch": 3.175856753231353, + "grad_norm": 0.5339187719664221, + "learning_rate": 5.362619395340013e-06, + "loss": 0.0287, + "step": 26782 + }, + { + "epoch": 3.175975334993478, + "grad_norm": 0.3552311541599536, + "learning_rate": 5.361133979839764e-06, + "loss": 0.0133, + "step": 26783 + }, + { + "epoch": 3.176093916755603, + "grad_norm": 0.5186729839076002, + "learning_rate": 5.359648745385778e-06, + "loss": 0.0231, + "step": 26784 + }, + { + "epoch": 3.176212498517728, + "grad_norm": 0.5225114148599227, + "learning_rate": 5.358163691991746e-06, + "loss": 0.0274, + "step": 26785 + }, + { + "epoch": 3.176331080279853, + "grad_norm": 0.5108205334355377, + "learning_rate": 5.356678819671368e-06, + "loss": 0.0283, + "step": 26786 + }, + { + "epoch": 3.176449662041978, + "grad_norm": 0.7275248216514436, + "learning_rate": 5.355194128438312e-06, + "loss": 0.0306, + "step": 26787 + }, + { + "epoch": 3.1765682438041027, + "grad_norm": 0.43844123997498297, + "learning_rate": 5.353709618306277e-06, + "loss": 0.0237, + "step": 26788 + }, + { + "epoch": 3.176686825566228, + "grad_norm": 0.28223519563035665, + "learning_rate": 5.352225289288948e-06, + "loss": 0.0141, + "step": 26789 + }, + { + "epoch": 3.1768054073283527, + "grad_norm": 0.4396327739376631, + "learning_rate": 5.350741141400018e-06, + "loss": 0.0184, + "step": 26790 + }, + { + "epoch": 3.176923989090478, + "grad_norm": 0.5099486453534019, + "learning_rate": 5.349257174653147e-06, + "loss": 0.0288, + "step": 26791 + }, + { + "epoch": 3.1770425708526027, + "grad_norm": 0.5993588671905306, + "learning_rate": 5.3477733890620305e-06, + "loss": 0.0323, + "step": 26792 + }, + { + "epoch": 3.177161152614728, + "grad_norm": 0.7938445013696611, + "learning_rate": 5.3462897846403435e-06, + "loss": 0.029, + "step": 26793 + }, + { + "epoch": 3.177279734376853, + "grad_norm": 0.5865005496649374, + "learning_rate": 5.3448063614017636e-06, + "loss": 0.0254, + "step": 26794 + }, + { + "epoch": 3.177398316138978, + "grad_norm": 0.3818087169213731, + "learning_rate": 5.3433231193599744e-06, + "loss": 0.0266, + "step": 26795 + }, + { + "epoch": 3.1775168979011026, + "grad_norm": 0.5594325596336476, + "learning_rate": 5.341840058528627e-06, + "loss": 0.0302, + "step": 26796 + }, + { + "epoch": 3.177635479663228, + "grad_norm": 0.5802394438463239, + "learning_rate": 5.340357178921421e-06, + "loss": 0.0299, + "step": 26797 + }, + { + "epoch": 3.177754061425353, + "grad_norm": 0.5279920513652755, + "learning_rate": 5.338874480552006e-06, + "loss": 0.0305, + "step": 26798 + }, + { + "epoch": 3.1778726431874778, + "grad_norm": 0.47305673537198467, + "learning_rate": 5.33739196343406e-06, + "loss": 0.0296, + "step": 26799 + }, + { + "epoch": 3.177991224949603, + "grad_norm": 0.4474453899512623, + "learning_rate": 5.335909627581248e-06, + "loss": 0.0194, + "step": 26800 + }, + { + "epoch": 3.1781098067117277, + "grad_norm": 0.7296312190459509, + "learning_rate": 5.334427473007242e-06, + "loss": 0.0279, + "step": 26801 + }, + { + "epoch": 3.178228388473853, + "grad_norm": 1.0048302875385127, + "learning_rate": 5.3329454997256914e-06, + "loss": 0.0633, + "step": 26802 + }, + { + "epoch": 3.1783469702359777, + "grad_norm": 0.4337947930426528, + "learning_rate": 5.331463707750267e-06, + "loss": 0.0217, + "step": 26803 + }, + { + "epoch": 3.178465551998103, + "grad_norm": 0.5181409822665912, + "learning_rate": 5.329982097094627e-06, + "loss": 0.0256, + "step": 26804 + }, + { + "epoch": 3.1785841337602276, + "grad_norm": 0.9807587412101548, + "learning_rate": 5.328500667772432e-06, + "loss": 0.02, + "step": 26805 + }, + { + "epoch": 3.178702715522353, + "grad_norm": 0.27950838349928675, + "learning_rate": 5.327019419797344e-06, + "loss": 0.0125, + "step": 26806 + }, + { + "epoch": 3.1788212972844776, + "grad_norm": 0.37850986303017686, + "learning_rate": 5.325538353182999e-06, + "loss": 0.0212, + "step": 26807 + }, + { + "epoch": 3.178939879046603, + "grad_norm": 0.7206390767464694, + "learning_rate": 5.32405746794308e-06, + "loss": 0.0477, + "step": 26808 + }, + { + "epoch": 3.1790584608087276, + "grad_norm": 0.6863932849820379, + "learning_rate": 5.322576764091214e-06, + "loss": 0.0345, + "step": 26809 + }, + { + "epoch": 3.1791770425708528, + "grad_norm": 0.6605511570906201, + "learning_rate": 5.321096241641066e-06, + "loss": 0.0369, + "step": 26810 + }, + { + "epoch": 3.1792956243329775, + "grad_norm": 0.7328284820131797, + "learning_rate": 5.319615900606268e-06, + "loss": 0.0388, + "step": 26811 + }, + { + "epoch": 3.1794142060951027, + "grad_norm": 0.7076397884328195, + "learning_rate": 5.318135741000488e-06, + "loss": 0.0363, + "step": 26812 + }, + { + "epoch": 3.1795327878572275, + "grad_norm": 0.689626470306249, + "learning_rate": 5.316655762837355e-06, + "loss": 0.0443, + "step": 26813 + }, + { + "epoch": 3.1796513696193527, + "grad_norm": 0.39094048085489175, + "learning_rate": 5.31517596613052e-06, + "loss": 0.0238, + "step": 26814 + }, + { + "epoch": 3.1797699513814774, + "grad_norm": 0.3773723682130621, + "learning_rate": 5.313696350893624e-06, + "loss": 0.0261, + "step": 26815 + }, + { + "epoch": 3.1798885331436026, + "grad_norm": 0.7033633002365893, + "learning_rate": 5.312216917140306e-06, + "loss": 0.0481, + "step": 26816 + }, + { + "epoch": 3.1800071149057274, + "grad_norm": 0.5500596841137808, + "learning_rate": 5.31073766488421e-06, + "loss": 0.0276, + "step": 26817 + }, + { + "epoch": 3.1801256966678526, + "grad_norm": 0.7472721728101827, + "learning_rate": 5.309258594138955e-06, + "loss": 0.0417, + "step": 26818 + }, + { + "epoch": 3.1802442784299774, + "grad_norm": 0.24367490078301382, + "learning_rate": 5.307779704918206e-06, + "loss": 0.0114, + "step": 26819 + }, + { + "epoch": 3.1803628601921026, + "grad_norm": 0.5434923152316425, + "learning_rate": 5.30630099723557e-06, + "loss": 0.0284, + "step": 26820 + }, + { + "epoch": 3.1804814419542273, + "grad_norm": 0.4574806081281509, + "learning_rate": 5.304822471104689e-06, + "loss": 0.0244, + "step": 26821 + }, + { + "epoch": 3.1806000237163525, + "grad_norm": 0.7202678387298678, + "learning_rate": 5.303344126539195e-06, + "loss": 0.0313, + "step": 26822 + }, + { + "epoch": 3.1807186054784773, + "grad_norm": 0.5166203110422887, + "learning_rate": 5.301865963552713e-06, + "loss": 0.0362, + "step": 26823 + }, + { + "epoch": 3.1808371872406025, + "grad_norm": 0.638123706666866, + "learning_rate": 5.3003879821588804e-06, + "loss": 0.0358, + "step": 26824 + }, + { + "epoch": 3.1809557690027273, + "grad_norm": 0.7691235530570324, + "learning_rate": 5.298910182371297e-06, + "loss": 0.0289, + "step": 26825 + }, + { + "epoch": 3.1810743507648525, + "grad_norm": 0.4029409742305586, + "learning_rate": 5.297432564203617e-06, + "loss": 0.0192, + "step": 26826 + }, + { + "epoch": 3.181192932526977, + "grad_norm": 0.6299706603324136, + "learning_rate": 5.295955127669442e-06, + "loss": 0.0321, + "step": 26827 + }, + { + "epoch": 3.1813115142891024, + "grad_norm": 0.6449479160731105, + "learning_rate": 5.294477872782405e-06, + "loss": 0.0341, + "step": 26828 + }, + { + "epoch": 3.181430096051227, + "grad_norm": 0.3931527033012658, + "learning_rate": 5.2930007995561026e-06, + "loss": 0.0124, + "step": 26829 + }, + { + "epoch": 3.1815486778133524, + "grad_norm": 0.5888279715254057, + "learning_rate": 5.291523908004181e-06, + "loss": 0.0252, + "step": 26830 + }, + { + "epoch": 3.181667259575477, + "grad_norm": 0.5147081623279509, + "learning_rate": 5.290047198140233e-06, + "loss": 0.0336, + "step": 26831 + }, + { + "epoch": 3.1817858413376023, + "grad_norm": 0.4088419844042816, + "learning_rate": 5.28857066997788e-06, + "loss": 0.0222, + "step": 26832 + }, + { + "epoch": 3.181904423099727, + "grad_norm": 0.5598879424699281, + "learning_rate": 5.287094323530736e-06, + "loss": 0.0263, + "step": 26833 + }, + { + "epoch": 3.1820230048618523, + "grad_norm": 0.34939301068446005, + "learning_rate": 5.285618158812405e-06, + "loss": 0.0162, + "step": 26834 + }, + { + "epoch": 3.182141586623977, + "grad_norm": 0.6924403396130859, + "learning_rate": 5.2841421758365096e-06, + "loss": 0.0371, + "step": 26835 + }, + { + "epoch": 3.1822601683861023, + "grad_norm": 0.6677149912649722, + "learning_rate": 5.282666374616638e-06, + "loss": 0.0365, + "step": 26836 + }, + { + "epoch": 3.182378750148227, + "grad_norm": 0.3554379760659778, + "learning_rate": 5.281190755166401e-06, + "loss": 0.0145, + "step": 26837 + }, + { + "epoch": 3.1824973319103522, + "grad_norm": 0.4363064642971755, + "learning_rate": 5.279715317499409e-06, + "loss": 0.0228, + "step": 26838 + }, + { + "epoch": 3.182615913672477, + "grad_norm": 0.5314700989520961, + "learning_rate": 5.278240061629263e-06, + "loss": 0.0259, + "step": 26839 + }, + { + "epoch": 3.182734495434602, + "grad_norm": 0.6523889440008109, + "learning_rate": 5.276764987569546e-06, + "loss": 0.0296, + "step": 26840 + }, + { + "epoch": 3.182853077196727, + "grad_norm": 0.5062895369129786, + "learning_rate": 5.275290095333882e-06, + "loss": 0.0259, + "step": 26841 + }, + { + "epoch": 3.182971658958852, + "grad_norm": 0.4792294278546509, + "learning_rate": 5.273815384935851e-06, + "loss": 0.0205, + "step": 26842 + }, + { + "epoch": 3.1830902407209773, + "grad_norm": 0.5541322949990094, + "learning_rate": 5.272340856389052e-06, + "loss": 0.0297, + "step": 26843 + }, + { + "epoch": 3.183208822483102, + "grad_norm": 0.45610817757689254, + "learning_rate": 5.270866509707076e-06, + "loss": 0.0302, + "step": 26844 + }, + { + "epoch": 3.183327404245227, + "grad_norm": 0.5102327367347915, + "learning_rate": 5.269392344903521e-06, + "loss": 0.0206, + "step": 26845 + }, + { + "epoch": 3.183445986007352, + "grad_norm": 0.6441262173664005, + "learning_rate": 5.267918361991977e-06, + "loss": 0.035, + "step": 26846 + }, + { + "epoch": 3.1835645677694773, + "grad_norm": 0.663606786625218, + "learning_rate": 5.2664445609860225e-06, + "loss": 0.033, + "step": 26847 + }, + { + "epoch": 3.183683149531602, + "grad_norm": 0.4426758501637859, + "learning_rate": 5.2649709418992525e-06, + "loss": 0.0208, + "step": 26848 + }, + { + "epoch": 3.183801731293727, + "grad_norm": 0.37986531636850274, + "learning_rate": 5.263497504745246e-06, + "loss": 0.0208, + "step": 26849 + }, + { + "epoch": 3.183920313055852, + "grad_norm": 0.3593113126318941, + "learning_rate": 5.262024249537601e-06, + "loss": 0.0177, + "step": 26850 + }, + { + "epoch": 3.184038894817977, + "grad_norm": 0.5883108326470412, + "learning_rate": 5.260551176289879e-06, + "loss": 0.0305, + "step": 26851 + }, + { + "epoch": 3.184157476580102, + "grad_norm": 0.5020708291046785, + "learning_rate": 5.259078285015668e-06, + "loss": 0.0223, + "step": 26852 + }, + { + "epoch": 3.184276058342227, + "grad_norm": 0.6172789996855184, + "learning_rate": 5.25760557572855e-06, + "loss": 0.0306, + "step": 26853 + }, + { + "epoch": 3.184394640104352, + "grad_norm": 0.49382882496351843, + "learning_rate": 5.256133048442097e-06, + "loss": 0.0193, + "step": 26854 + }, + { + "epoch": 3.184513221866477, + "grad_norm": 0.3687136612839422, + "learning_rate": 5.254660703169886e-06, + "loss": 0.0182, + "step": 26855 + }, + { + "epoch": 3.184631803628602, + "grad_norm": 0.3495508239636011, + "learning_rate": 5.253188539925491e-06, + "loss": 0.0128, + "step": 26856 + }, + { + "epoch": 3.184750385390727, + "grad_norm": 0.46449517700966914, + "learning_rate": 5.251716558722486e-06, + "loss": 0.0218, + "step": 26857 + }, + { + "epoch": 3.184868967152852, + "grad_norm": 0.33572881035942825, + "learning_rate": 5.250244759574435e-06, + "loss": 0.0142, + "step": 26858 + }, + { + "epoch": 3.184987548914977, + "grad_norm": 0.5132430921719489, + "learning_rate": 5.248773142494906e-06, + "loss": 0.0265, + "step": 26859 + }, + { + "epoch": 3.185106130677102, + "grad_norm": 0.5954197946549163, + "learning_rate": 5.247301707497468e-06, + "loss": 0.0308, + "step": 26860 + }, + { + "epoch": 3.185224712439227, + "grad_norm": 0.4879108104150854, + "learning_rate": 5.245830454595693e-06, + "loss": 0.0268, + "step": 26861 + }, + { + "epoch": 3.1853432942013518, + "grad_norm": 0.49730229513760893, + "learning_rate": 5.244359383803133e-06, + "loss": 0.0185, + "step": 26862 + }, + { + "epoch": 3.185461875963477, + "grad_norm": 0.41433594843682797, + "learning_rate": 5.24288849513335e-06, + "loss": 0.017, + "step": 26863 + }, + { + "epoch": 3.1855804577256017, + "grad_norm": 0.46623146862940057, + "learning_rate": 5.241417788599909e-06, + "loss": 0.0221, + "step": 26864 + }, + { + "epoch": 3.185699039487727, + "grad_norm": 0.5698166875020524, + "learning_rate": 5.239947264216366e-06, + "loss": 0.0247, + "step": 26865 + }, + { + "epoch": 3.1858176212498517, + "grad_norm": 0.22618105242619485, + "learning_rate": 5.238476921996282e-06, + "loss": 0.0082, + "step": 26866 + }, + { + "epoch": 3.185936203011977, + "grad_norm": 0.712252788864357, + "learning_rate": 5.237006761953198e-06, + "loss": 0.0228, + "step": 26867 + }, + { + "epoch": 3.1860547847741016, + "grad_norm": 0.4714340938192217, + "learning_rate": 5.235536784100689e-06, + "loss": 0.0271, + "step": 26868 + }, + { + "epoch": 3.186173366536227, + "grad_norm": 0.6783402657536916, + "learning_rate": 5.234066988452288e-06, + "loss": 0.0396, + "step": 26869 + }, + { + "epoch": 3.1862919482983516, + "grad_norm": 0.8315144517770839, + "learning_rate": 5.232597375021548e-06, + "loss": 0.0317, + "step": 26870 + }, + { + "epoch": 3.186410530060477, + "grad_norm": 0.7220648204853435, + "learning_rate": 5.231127943822023e-06, + "loss": 0.0372, + "step": 26871 + }, + { + "epoch": 3.1865291118226016, + "grad_norm": 0.44113899681960206, + "learning_rate": 5.229658694867254e-06, + "loss": 0.0195, + "step": 26872 + }, + { + "epoch": 3.1866476935847268, + "grad_norm": 0.351324277541486, + "learning_rate": 5.228189628170796e-06, + "loss": 0.0211, + "step": 26873 + }, + { + "epoch": 3.1867662753468515, + "grad_norm": 0.34591846390019915, + "learning_rate": 5.226720743746169e-06, + "loss": 0.0197, + "step": 26874 + }, + { + "epoch": 3.1868848571089767, + "grad_norm": 0.47834410517266174, + "learning_rate": 5.225252041606943e-06, + "loss": 0.0195, + "step": 26875 + }, + { + "epoch": 3.1870034388711015, + "grad_norm": 0.5567583130931552, + "learning_rate": 5.223783521766637e-06, + "loss": 0.0381, + "step": 26876 + }, + { + "epoch": 3.1871220206332267, + "grad_norm": 0.590401183149789, + "learning_rate": 5.222315184238805e-06, + "loss": 0.0325, + "step": 26877 + }, + { + "epoch": 3.1872406023953515, + "grad_norm": 0.4935294640730042, + "learning_rate": 5.220847029036957e-06, + "loss": 0.0273, + "step": 26878 + }, + { + "epoch": 3.1873591841574767, + "grad_norm": 0.7959633571533623, + "learning_rate": 5.219379056174659e-06, + "loss": 0.0267, + "step": 26879 + }, + { + "epoch": 3.1874777659196014, + "grad_norm": 0.3919850445173988, + "learning_rate": 5.217911265665423e-06, + "loss": 0.0236, + "step": 26880 + }, + { + "epoch": 3.1875963476817266, + "grad_norm": 0.44386047966804726, + "learning_rate": 5.216443657522788e-06, + "loss": 0.0224, + "step": 26881 + }, + { + "epoch": 3.1877149294438514, + "grad_norm": 0.48728790945666384, + "learning_rate": 5.214976231760282e-06, + "loss": 0.0267, + "step": 26882 + }, + { + "epoch": 3.1878335112059766, + "grad_norm": 0.6534744615939063, + "learning_rate": 5.21350898839143e-06, + "loss": 0.0397, + "step": 26883 + }, + { + "epoch": 3.1879520929681013, + "grad_norm": 0.631548320934242, + "learning_rate": 5.212041927429773e-06, + "loss": 0.0407, + "step": 26884 + }, + { + "epoch": 3.1880706747302265, + "grad_norm": 0.6741993654369586, + "learning_rate": 5.210575048888808e-06, + "loss": 0.0284, + "step": 26885 + }, + { + "epoch": 3.1881892564923513, + "grad_norm": 0.42135164548227744, + "learning_rate": 5.2091083527820855e-06, + "loss": 0.027, + "step": 26886 + }, + { + "epoch": 3.1883078382544765, + "grad_norm": 0.4167114539793162, + "learning_rate": 5.207641839123109e-06, + "loss": 0.021, + "step": 26887 + }, + { + "epoch": 3.1884264200166013, + "grad_norm": 0.5585956950253819, + "learning_rate": 5.206175507925412e-06, + "loss": 0.0332, + "step": 26888 + }, + { + "epoch": 3.1885450017787265, + "grad_norm": 0.3798935372401742, + "learning_rate": 5.204709359202489e-06, + "loss": 0.0213, + "step": 26889 + }, + { + "epoch": 3.188663583540851, + "grad_norm": 0.3852500064502099, + "learning_rate": 5.203243392967885e-06, + "loss": 0.0176, + "step": 26890 + }, + { + "epoch": 3.1887821653029764, + "grad_norm": 0.40524215163086674, + "learning_rate": 5.201777609235095e-06, + "loss": 0.0213, + "step": 26891 + }, + { + "epoch": 3.188900747065101, + "grad_norm": 0.3874925616092312, + "learning_rate": 5.200312008017636e-06, + "loss": 0.0185, + "step": 26892 + }, + { + "epoch": 3.1890193288272264, + "grad_norm": 0.488996036762367, + "learning_rate": 5.19884658932902e-06, + "loss": 0.023, + "step": 26893 + }, + { + "epoch": 3.189137910589351, + "grad_norm": 0.4602820510744636, + "learning_rate": 5.197381353182757e-06, + "loss": 0.0194, + "step": 26894 + }, + { + "epoch": 3.1892564923514763, + "grad_norm": 0.6089486670454884, + "learning_rate": 5.19591629959236e-06, + "loss": 0.0366, + "step": 26895 + }, + { + "epoch": 3.1893750741136015, + "grad_norm": 0.5130614591717626, + "learning_rate": 5.194451428571326e-06, + "loss": 0.0225, + "step": 26896 + }, + { + "epoch": 3.1894936558757263, + "grad_norm": 0.8288422824931505, + "learning_rate": 5.19298674013316e-06, + "loss": 0.0446, + "step": 26897 + }, + { + "epoch": 3.189612237637851, + "grad_norm": 0.43701231129706497, + "learning_rate": 5.1915222342913685e-06, + "loss": 0.0246, + "step": 26898 + }, + { + "epoch": 3.1897308193999763, + "grad_norm": 0.4929255422692809, + "learning_rate": 5.190057911059457e-06, + "loss": 0.0223, + "step": 26899 + }, + { + "epoch": 3.1898494011621015, + "grad_norm": 0.565366128086729, + "learning_rate": 5.188593770450906e-06, + "loss": 0.0286, + "step": 26900 + }, + { + "epoch": 3.1899679829242262, + "grad_norm": 0.633630582106647, + "learning_rate": 5.1871298124792385e-06, + "loss": 0.0301, + "step": 26901 + }, + { + "epoch": 3.1900865646863514, + "grad_norm": 0.9457000334095375, + "learning_rate": 5.185666037157932e-06, + "loss": 0.0644, + "step": 26902 + }, + { + "epoch": 3.190205146448476, + "grad_norm": 0.7453050979299765, + "learning_rate": 5.184202444500486e-06, + "loss": 0.0342, + "step": 26903 + }, + { + "epoch": 3.1903237282106014, + "grad_norm": 0.4326760769074383, + "learning_rate": 5.182739034520395e-06, + "loss": 0.0168, + "step": 26904 + }, + { + "epoch": 3.190442309972726, + "grad_norm": 0.4672052559254063, + "learning_rate": 5.1812758072311475e-06, + "loss": 0.0214, + "step": 26905 + }, + { + "epoch": 3.1905608917348514, + "grad_norm": 0.6210463267289279, + "learning_rate": 5.1798127626462415e-06, + "loss": 0.0298, + "step": 26906 + }, + { + "epoch": 3.190679473496976, + "grad_norm": 0.4613150462225525, + "learning_rate": 5.17834990077915e-06, + "loss": 0.0215, + "step": 26907 + }, + { + "epoch": 3.1907980552591013, + "grad_norm": 0.7078582233017807, + "learning_rate": 5.176887221643364e-06, + "loss": 0.0332, + "step": 26908 + }, + { + "epoch": 3.190916637021226, + "grad_norm": 0.6968138130032312, + "learning_rate": 5.17542472525237e-06, + "loss": 0.0339, + "step": 26909 + }, + { + "epoch": 3.1910352187833513, + "grad_norm": 0.7380974991422068, + "learning_rate": 5.173962411619656e-06, + "loss": 0.0603, + "step": 26910 + }, + { + "epoch": 3.191153800545476, + "grad_norm": 0.5399279622744187, + "learning_rate": 5.172500280758685e-06, + "loss": 0.0271, + "step": 26911 + }, + { + "epoch": 3.1912723823076012, + "grad_norm": 0.5230416023677075, + "learning_rate": 5.171038332682959e-06, + "loss": 0.0198, + "step": 26912 + }, + { + "epoch": 3.191390964069726, + "grad_norm": 0.4267788106189349, + "learning_rate": 5.169576567405937e-06, + "loss": 0.0192, + "step": 26913 + }, + { + "epoch": 3.191509545831851, + "grad_norm": 0.5444334376043003, + "learning_rate": 5.1681149849411e-06, + "loss": 0.03, + "step": 26914 + }, + { + "epoch": 3.191628127593976, + "grad_norm": 0.6605983668055896, + "learning_rate": 5.166653585301925e-06, + "loss": 0.046, + "step": 26915 + }, + { + "epoch": 3.191746709356101, + "grad_norm": 0.5804601109006878, + "learning_rate": 5.1651923685018835e-06, + "loss": 0.0238, + "step": 26916 + }, + { + "epoch": 3.191865291118226, + "grad_norm": 0.5506490675674369, + "learning_rate": 5.1637313345544506e-06, + "loss": 0.0349, + "step": 26917 + }, + { + "epoch": 3.191983872880351, + "grad_norm": 0.2737267421503641, + "learning_rate": 5.1622704834730834e-06, + "loss": 0.0135, + "step": 26918 + }, + { + "epoch": 3.192102454642476, + "grad_norm": 0.7075414080767747, + "learning_rate": 5.160809815271256e-06, + "loss": 0.0313, + "step": 26919 + }, + { + "epoch": 3.192221036404601, + "grad_norm": 0.435815790185961, + "learning_rate": 5.159349329962435e-06, + "loss": 0.022, + "step": 26920 + }, + { + "epoch": 3.192339618166726, + "grad_norm": 0.6664140898913483, + "learning_rate": 5.1578890275600865e-06, + "loss": 0.0345, + "step": 26921 + }, + { + "epoch": 3.192458199928851, + "grad_norm": 0.5166507247554195, + "learning_rate": 5.156428908077665e-06, + "loss": 0.0284, + "step": 26922 + }, + { + "epoch": 3.192576781690976, + "grad_norm": 0.4568992427265952, + "learning_rate": 5.154968971528634e-06, + "loss": 0.025, + "step": 26923 + }, + { + "epoch": 3.192695363453101, + "grad_norm": 0.4249494605353448, + "learning_rate": 5.153509217926453e-06, + "loss": 0.0183, + "step": 26924 + }, + { + "epoch": 3.1928139452152258, + "grad_norm": 0.3654665189903101, + "learning_rate": 5.152049647284582e-06, + "loss": 0.0212, + "step": 26925 + }, + { + "epoch": 3.192932526977351, + "grad_norm": 0.5554830484481221, + "learning_rate": 5.150590259616472e-06, + "loss": 0.0386, + "step": 26926 + }, + { + "epoch": 3.1930511087394757, + "grad_norm": 0.5777776823105643, + "learning_rate": 5.14913105493558e-06, + "loss": 0.0272, + "step": 26927 + }, + { + "epoch": 3.193169690501601, + "grad_norm": 0.3645540197811924, + "learning_rate": 5.147672033255363e-06, + "loss": 0.019, + "step": 26928 + }, + { + "epoch": 3.1932882722637257, + "grad_norm": 0.5371166395928136, + "learning_rate": 5.146213194589258e-06, + "loss": 0.0223, + "step": 26929 + }, + { + "epoch": 3.193406854025851, + "grad_norm": 0.49485726358195337, + "learning_rate": 5.144754538950722e-06, + "loss": 0.0211, + "step": 26930 + }, + { + "epoch": 3.1935254357879757, + "grad_norm": 0.6303027171134735, + "learning_rate": 5.143296066353201e-06, + "loss": 0.037, + "step": 26931 + }, + { + "epoch": 3.193644017550101, + "grad_norm": 0.6376935049765313, + "learning_rate": 5.141837776810141e-06, + "loss": 0.0303, + "step": 26932 + }, + { + "epoch": 3.1937625993122256, + "grad_norm": 0.8571658843404708, + "learning_rate": 5.140379670334988e-06, + "loss": 0.0412, + "step": 26933 + }, + { + "epoch": 3.193881181074351, + "grad_norm": 0.5557093152771418, + "learning_rate": 5.138921746941172e-06, + "loss": 0.0317, + "step": 26934 + }, + { + "epoch": 3.1939997628364756, + "grad_norm": 0.5338415654053801, + "learning_rate": 5.137464006642154e-06, + "loss": 0.0289, + "step": 26935 + }, + { + "epoch": 3.194118344598601, + "grad_norm": 0.49504335197646526, + "learning_rate": 5.1360064494513555e-06, + "loss": 0.0233, + "step": 26936 + }, + { + "epoch": 3.1942369263607255, + "grad_norm": 0.593200567055769, + "learning_rate": 5.134549075382222e-06, + "loss": 0.0226, + "step": 26937 + }, + { + "epoch": 3.1943555081228507, + "grad_norm": 0.757324279441108, + "learning_rate": 5.133091884448174e-06, + "loss": 0.037, + "step": 26938 + }, + { + "epoch": 3.1944740898849755, + "grad_norm": 0.47938179582051904, + "learning_rate": 5.1316348766626695e-06, + "loss": 0.0305, + "step": 26939 + }, + { + "epoch": 3.1945926716471007, + "grad_norm": 0.7188923463162126, + "learning_rate": 5.130178052039123e-06, + "loss": 0.0312, + "step": 26940 + }, + { + "epoch": 3.1947112534092255, + "grad_norm": 0.531042918874453, + "learning_rate": 5.128721410590967e-06, + "loss": 0.0226, + "step": 26941 + }, + { + "epoch": 3.1948298351713507, + "grad_norm": 0.9329678800153057, + "learning_rate": 5.127264952331631e-06, + "loss": 0.0271, + "step": 26942 + }, + { + "epoch": 3.1949484169334754, + "grad_norm": 0.7363641898866483, + "learning_rate": 5.125808677274543e-06, + "loss": 0.0274, + "step": 26943 + }, + { + "epoch": 3.1950669986956006, + "grad_norm": 0.4205734845729374, + "learning_rate": 5.124352585433134e-06, + "loss": 0.0278, + "step": 26944 + }, + { + "epoch": 3.195185580457726, + "grad_norm": 0.819016515921391, + "learning_rate": 5.122896676820807e-06, + "loss": 0.0358, + "step": 26945 + }, + { + "epoch": 3.1953041622198506, + "grad_norm": 0.5645696568523688, + "learning_rate": 5.121440951451015e-06, + "loss": 0.0339, + "step": 26946 + }, + { + "epoch": 3.1954227439819753, + "grad_norm": 0.587494219987295, + "learning_rate": 5.11998540933715e-06, + "loss": 0.033, + "step": 26947 + }, + { + "epoch": 3.1955413257441005, + "grad_norm": 0.8491334166511576, + "learning_rate": 5.1185300504926485e-06, + "loss": 0.0486, + "step": 26948 + }, + { + "epoch": 3.1956599075062257, + "grad_norm": 0.5721603121715274, + "learning_rate": 5.117074874930908e-06, + "loss": 0.0335, + "step": 26949 + }, + { + "epoch": 3.1957784892683505, + "grad_norm": 0.9624704081770229, + "learning_rate": 5.115619882665365e-06, + "loss": 0.0658, + "step": 26950 + }, + { + "epoch": 3.1958970710304757, + "grad_norm": 0.5620849120622815, + "learning_rate": 5.1141650737094205e-06, + "loss": 0.0216, + "step": 26951 + }, + { + "epoch": 3.1960156527926005, + "grad_norm": 0.6688371887508554, + "learning_rate": 5.112710448076485e-06, + "loss": 0.028, + "step": 26952 + }, + { + "epoch": 3.1961342345547257, + "grad_norm": 0.47348511612817334, + "learning_rate": 5.111256005779974e-06, + "loss": 0.0285, + "step": 26953 + }, + { + "epoch": 3.1962528163168504, + "grad_norm": 0.6652151528020434, + "learning_rate": 5.109801746833292e-06, + "loss": 0.0428, + "step": 26954 + }, + { + "epoch": 3.1963713980789756, + "grad_norm": 0.5249441914903855, + "learning_rate": 5.108347671249855e-06, + "loss": 0.0187, + "step": 26955 + }, + { + "epoch": 3.1964899798411004, + "grad_norm": 0.5799846680153278, + "learning_rate": 5.106893779043045e-06, + "loss": 0.0285, + "step": 26956 + }, + { + "epoch": 3.1966085616032256, + "grad_norm": 0.6993994776526516, + "learning_rate": 5.105440070226294e-06, + "loss": 0.0371, + "step": 26957 + }, + { + "epoch": 3.1967271433653504, + "grad_norm": 0.5021421296473735, + "learning_rate": 5.103986544812983e-06, + "loss": 0.0264, + "step": 26958 + }, + { + "epoch": 3.1968457251274756, + "grad_norm": 0.4171662770103438, + "learning_rate": 5.102533202816523e-06, + "loss": 0.0201, + "step": 26959 + }, + { + "epoch": 3.1969643068896003, + "grad_norm": 0.4653848871200171, + "learning_rate": 5.101080044250295e-06, + "loss": 0.0285, + "step": 26960 + }, + { + "epoch": 3.1970828886517255, + "grad_norm": 0.366187067557477, + "learning_rate": 5.099627069127719e-06, + "loss": 0.0228, + "step": 26961 + }, + { + "epoch": 3.1972014704138503, + "grad_norm": 0.5357662143837045, + "learning_rate": 5.098174277462173e-06, + "loss": 0.0258, + "step": 26962 + }, + { + "epoch": 3.1973200521759755, + "grad_norm": 0.445187534544748, + "learning_rate": 5.096721669267055e-06, + "loss": 0.0239, + "step": 26963 + }, + { + "epoch": 3.1974386339381002, + "grad_norm": 0.6798602734259284, + "learning_rate": 5.095269244555756e-06, + "loss": 0.0301, + "step": 26964 + }, + { + "epoch": 3.1975572157002254, + "grad_norm": 0.3319861509842143, + "learning_rate": 5.093817003341664e-06, + "loss": 0.0179, + "step": 26965 + }, + { + "epoch": 3.19767579746235, + "grad_norm": 0.7620064736457987, + "learning_rate": 5.092364945638178e-06, + "loss": 0.0396, + "step": 26966 + }, + { + "epoch": 3.1977943792244754, + "grad_norm": 0.6782925350114105, + "learning_rate": 5.090913071458667e-06, + "loss": 0.0401, + "step": 26967 + }, + { + "epoch": 3.1979129609866, + "grad_norm": 0.3936527582674498, + "learning_rate": 5.089461380816524e-06, + "loss": 0.0217, + "step": 26968 + }, + { + "epoch": 3.1980315427487254, + "grad_norm": 0.4108785335325245, + "learning_rate": 5.088009873725128e-06, + "loss": 0.0302, + "step": 26969 + }, + { + "epoch": 3.19815012451085, + "grad_norm": 0.5556151159635605, + "learning_rate": 5.086558550197873e-06, + "loss": 0.0259, + "step": 26970 + }, + { + "epoch": 3.1982687062729753, + "grad_norm": 0.39165405415067356, + "learning_rate": 5.0851074102481165e-06, + "loss": 0.0163, + "step": 26971 + }, + { + "epoch": 3.1983872880351, + "grad_norm": 0.6823119537551505, + "learning_rate": 5.083656453889257e-06, + "loss": 0.0284, + "step": 26972 + }, + { + "epoch": 3.1985058697972253, + "grad_norm": 0.4942851588830452, + "learning_rate": 5.0822056811346604e-06, + "loss": 0.0371, + "step": 26973 + }, + { + "epoch": 3.19862445155935, + "grad_norm": 0.5242442853084224, + "learning_rate": 5.080755091997699e-06, + "loss": 0.0246, + "step": 26974 + }, + { + "epoch": 3.1987430333214752, + "grad_norm": 0.9727699985113757, + "learning_rate": 5.079304686491751e-06, + "loss": 0.0498, + "step": 26975 + }, + { + "epoch": 3.1988616150836, + "grad_norm": 0.6081117222061302, + "learning_rate": 5.077854464630186e-06, + "loss": 0.0261, + "step": 26976 + }, + { + "epoch": 3.198980196845725, + "grad_norm": 0.5412970283348593, + "learning_rate": 5.076404426426376e-06, + "loss": 0.0232, + "step": 26977 + }, + { + "epoch": 3.19909877860785, + "grad_norm": 0.501996230416171, + "learning_rate": 5.074954571893681e-06, + "loss": 0.024, + "step": 26978 + }, + { + "epoch": 3.199217360369975, + "grad_norm": 0.3473407292522982, + "learning_rate": 5.073504901045473e-06, + "loss": 0.0159, + "step": 26979 + }, + { + "epoch": 3.1993359421321, + "grad_norm": 0.32571162881402876, + "learning_rate": 5.072055413895111e-06, + "loss": 0.019, + "step": 26980 + }, + { + "epoch": 3.199454523894225, + "grad_norm": 0.5457594387112908, + "learning_rate": 5.0706061104559635e-06, + "loss": 0.0204, + "step": 26981 + }, + { + "epoch": 3.19957310565635, + "grad_norm": 0.5557206569432737, + "learning_rate": 5.069156990741394e-06, + "loss": 0.0287, + "step": 26982 + }, + { + "epoch": 3.199691687418475, + "grad_norm": 0.46074883778845904, + "learning_rate": 5.0677080547647416e-06, + "loss": 0.0187, + "step": 26983 + }, + { + "epoch": 3.1998102691806, + "grad_norm": 0.6070037521275514, + "learning_rate": 5.066259302539395e-06, + "loss": 0.0288, + "step": 26984 + }, + { + "epoch": 3.199928850942725, + "grad_norm": 0.958927570581507, + "learning_rate": 5.064810734078681e-06, + "loss": 0.047, + "step": 26985 + }, + { + "epoch": 3.20004743270485, + "grad_norm": 0.44476879634917704, + "learning_rate": 5.0633623493959695e-06, + "loss": 0.0137, + "step": 26986 + }, + { + "epoch": 3.200166014466975, + "grad_norm": 0.9451667499560445, + "learning_rate": 5.0619141485046096e-06, + "loss": 0.0498, + "step": 26987 + }, + { + "epoch": 3.2002845962290998, + "grad_norm": 0.43749996325044727, + "learning_rate": 5.06046613141796e-06, + "loss": 0.0199, + "step": 26988 + }, + { + "epoch": 3.200403177991225, + "grad_norm": 0.48543583744740887, + "learning_rate": 5.059018298149351e-06, + "loss": 0.0232, + "step": 26989 + }, + { + "epoch": 3.2005217597533497, + "grad_norm": 0.4681283058829727, + "learning_rate": 5.057570648712143e-06, + "loss": 0.0206, + "step": 26990 + }, + { + "epoch": 3.200640341515475, + "grad_norm": 0.39888778038165357, + "learning_rate": 5.056123183119676e-06, + "loss": 0.0173, + "step": 26991 + }, + { + "epoch": 3.2007589232775997, + "grad_norm": 0.41691228848472983, + "learning_rate": 5.054675901385297e-06, + "loss": 0.0212, + "step": 26992 + }, + { + "epoch": 3.200877505039725, + "grad_norm": 0.5189003098316355, + "learning_rate": 5.053228803522356e-06, + "loss": 0.0304, + "step": 26993 + }, + { + "epoch": 3.20099608680185, + "grad_norm": 0.5883890898892518, + "learning_rate": 5.051781889544169e-06, + "loss": 0.0274, + "step": 26994 + }, + { + "epoch": 3.201114668563975, + "grad_norm": 0.46807797485358543, + "learning_rate": 5.050335159464106e-06, + "loss": 0.0293, + "step": 26995 + }, + { + "epoch": 3.2012332503260996, + "grad_norm": 0.5441720170603063, + "learning_rate": 5.0488886132954825e-06, + "loss": 0.0331, + "step": 26996 + }, + { + "epoch": 3.201351832088225, + "grad_norm": 0.42801238799822366, + "learning_rate": 5.047442251051637e-06, + "loss": 0.0216, + "step": 26997 + }, + { + "epoch": 3.20147041385035, + "grad_norm": 0.46029273728171827, + "learning_rate": 5.045996072745912e-06, + "loss": 0.0313, + "step": 26998 + }, + { + "epoch": 3.201588995612475, + "grad_norm": 0.4618799528244907, + "learning_rate": 5.044550078391635e-06, + "loss": 0.0263, + "step": 26999 + }, + { + "epoch": 3.2017075773746, + "grad_norm": 0.4225154990092172, + "learning_rate": 5.043104268002133e-06, + "loss": 0.0266, + "step": 27000 + }, + { + "epoch": 3.2018261591367247, + "grad_norm": 0.49078660720147144, + "learning_rate": 5.041658641590735e-06, + "loss": 0.0224, + "step": 27001 + }, + { + "epoch": 3.20194474089885, + "grad_norm": 0.46418812320321884, + "learning_rate": 5.040213199170771e-06, + "loss": 0.0248, + "step": 27002 + }, + { + "epoch": 3.2020633226609747, + "grad_norm": 0.5874413973202138, + "learning_rate": 5.0387679407555645e-06, + "loss": 0.0323, + "step": 27003 + }, + { + "epoch": 3.2021819044231, + "grad_norm": 0.44344644418150325, + "learning_rate": 5.037322866358446e-06, + "loss": 0.0256, + "step": 27004 + }, + { + "epoch": 3.2023004861852247, + "grad_norm": 0.4924068526311927, + "learning_rate": 5.035877975992717e-06, + "loss": 0.0303, + "step": 27005 + }, + { + "epoch": 3.20241906794735, + "grad_norm": 0.5700032887492784, + "learning_rate": 5.034433269671723e-06, + "loss": 0.0271, + "step": 27006 + }, + { + "epoch": 3.2025376497094746, + "grad_norm": 0.5998223131560486, + "learning_rate": 5.032988747408768e-06, + "loss": 0.0342, + "step": 27007 + }, + { + "epoch": 3.2026562314716, + "grad_norm": 0.5514272677471899, + "learning_rate": 5.031544409217176e-06, + "loss": 0.0247, + "step": 27008 + }, + { + "epoch": 3.2027748132337246, + "grad_norm": 0.607516964120448, + "learning_rate": 5.030100255110246e-06, + "loss": 0.0263, + "step": 27009 + }, + { + "epoch": 3.20289339499585, + "grad_norm": 0.4976230338614118, + "learning_rate": 5.028656285101313e-06, + "loss": 0.0236, + "step": 27010 + }, + { + "epoch": 3.2030119767579746, + "grad_norm": 0.5984066332696342, + "learning_rate": 5.027212499203676e-06, + "loss": 0.0364, + "step": 27011 + }, + { + "epoch": 3.2031305585200998, + "grad_norm": 0.8242423879729986, + "learning_rate": 5.025768897430644e-06, + "loss": 0.0406, + "step": 27012 + }, + { + "epoch": 3.2032491402822245, + "grad_norm": 0.43333348996469395, + "learning_rate": 5.02432547979553e-06, + "loss": 0.0153, + "step": 27013 + }, + { + "epoch": 3.2033677220443497, + "grad_norm": 0.6039307882506398, + "learning_rate": 5.022882246311641e-06, + "loss": 0.0424, + "step": 27014 + }, + { + "epoch": 3.2034863038064745, + "grad_norm": 0.9679023078825638, + "learning_rate": 5.021439196992283e-06, + "loss": 0.047, + "step": 27015 + }, + { + "epoch": 3.2036048855685997, + "grad_norm": 0.6667951644951909, + "learning_rate": 5.019996331850746e-06, + "loss": 0.0335, + "step": 27016 + }, + { + "epoch": 3.2037234673307244, + "grad_norm": 0.7390087243550055, + "learning_rate": 5.0185536509003536e-06, + "loss": 0.0357, + "step": 27017 + }, + { + "epoch": 3.2038420490928496, + "grad_norm": 0.7018625511454313, + "learning_rate": 5.017111154154389e-06, + "loss": 0.0394, + "step": 27018 + }, + { + "epoch": 3.2039606308549744, + "grad_norm": 0.49231765453931814, + "learning_rate": 5.0156688416261615e-06, + "loss": 0.0319, + "step": 27019 + }, + { + "epoch": 3.2040792126170996, + "grad_norm": 0.5099516512560945, + "learning_rate": 5.014226713328945e-06, + "loss": 0.0255, + "step": 27020 + }, + { + "epoch": 3.2041977943792244, + "grad_norm": 0.8101994007336396, + "learning_rate": 5.012784769276066e-06, + "loss": 0.0526, + "step": 27021 + }, + { + "epoch": 3.2043163761413496, + "grad_norm": 0.37430980102540684, + "learning_rate": 5.011343009480793e-06, + "loss": 0.0167, + "step": 27022 + }, + { + "epoch": 3.2044349579034743, + "grad_norm": 0.47601551850807333, + "learning_rate": 5.009901433956427e-06, + "loss": 0.0291, + "step": 27023 + }, + { + "epoch": 3.2045535396655995, + "grad_norm": 1.2641074905364464, + "learning_rate": 5.008460042716254e-06, + "loss": 0.0733, + "step": 27024 + }, + { + "epoch": 3.2046721214277243, + "grad_norm": 0.4581452423669284, + "learning_rate": 5.007018835773567e-06, + "loss": 0.0289, + "step": 27025 + }, + { + "epoch": 3.2047907031898495, + "grad_norm": 0.6501507776870904, + "learning_rate": 5.005577813141654e-06, + "loss": 0.0291, + "step": 27026 + }, + { + "epoch": 3.2049092849519742, + "grad_norm": 0.6948443016552884, + "learning_rate": 5.004136974833782e-06, + "loss": 0.0342, + "step": 27027 + }, + { + "epoch": 3.2050278667140994, + "grad_norm": 0.8696228859462399, + "learning_rate": 5.002696320863259e-06, + "loss": 0.0478, + "step": 27028 + }, + { + "epoch": 3.205146448476224, + "grad_norm": 0.6465157202831386, + "learning_rate": 5.00125585124335e-06, + "loss": 0.0331, + "step": 27029 + }, + { + "epoch": 3.2052650302383494, + "grad_norm": 0.7477856646787003, + "learning_rate": 4.999815565987334e-06, + "loss": 0.0463, + "step": 27030 + }, + { + "epoch": 3.205383612000474, + "grad_norm": 0.6122153423727835, + "learning_rate": 4.998375465108496e-06, + "loss": 0.0345, + "step": 27031 + }, + { + "epoch": 3.2055021937625994, + "grad_norm": 0.574014058150516, + "learning_rate": 4.996935548620108e-06, + "loss": 0.0199, + "step": 27032 + }, + { + "epoch": 3.205620775524724, + "grad_norm": 0.7814319621461882, + "learning_rate": 4.99549581653545e-06, + "loss": 0.0379, + "step": 27033 + }, + { + "epoch": 3.2057393572868493, + "grad_norm": 0.45107671511856645, + "learning_rate": 4.994056268867783e-06, + "loss": 0.0183, + "step": 27034 + }, + { + "epoch": 3.205857939048974, + "grad_norm": 0.739923069996488, + "learning_rate": 4.992616905630385e-06, + "loss": 0.0331, + "step": 27035 + }, + { + "epoch": 3.2059765208110993, + "grad_norm": 0.5714840892985907, + "learning_rate": 4.991177726836524e-06, + "loss": 0.0217, + "step": 27036 + }, + { + "epoch": 3.206095102573224, + "grad_norm": 0.5405641356418535, + "learning_rate": 4.989738732499474e-06, + "loss": 0.0371, + "step": 27037 + }, + { + "epoch": 3.2062136843353493, + "grad_norm": 0.5890050645721773, + "learning_rate": 4.9882999226324885e-06, + "loss": 0.0295, + "step": 27038 + }, + { + "epoch": 3.206332266097474, + "grad_norm": 0.7399834596570002, + "learning_rate": 4.986861297248837e-06, + "loss": 0.0377, + "step": 27039 + }, + { + "epoch": 3.206450847859599, + "grad_norm": 0.7275216183452793, + "learning_rate": 4.985422856361785e-06, + "loss": 0.0422, + "step": 27040 + }, + { + "epoch": 3.206569429621724, + "grad_norm": 0.32120667345484194, + "learning_rate": 4.98398459998459e-06, + "loss": 0.0196, + "step": 27041 + }, + { + "epoch": 3.206688011383849, + "grad_norm": 0.4771235024221319, + "learning_rate": 4.98254652813051e-06, + "loss": 0.0194, + "step": 27042 + }, + { + "epoch": 3.2068065931459744, + "grad_norm": 0.651345734592974, + "learning_rate": 4.9811086408128065e-06, + "loss": 0.0266, + "step": 27043 + }, + { + "epoch": 3.206925174908099, + "grad_norm": 0.803997498964712, + "learning_rate": 4.979670938044736e-06, + "loss": 0.0505, + "step": 27044 + }, + { + "epoch": 3.207043756670224, + "grad_norm": 0.4982464681226307, + "learning_rate": 4.9782334198395454e-06, + "loss": 0.017, + "step": 27045 + }, + { + "epoch": 3.207162338432349, + "grad_norm": 0.6368611994785668, + "learning_rate": 4.976796086210489e-06, + "loss": 0.036, + "step": 27046 + }, + { + "epoch": 3.2072809201944743, + "grad_norm": 0.7811527215216876, + "learning_rate": 4.975358937170818e-06, + "loss": 0.0349, + "step": 27047 + }, + { + "epoch": 3.207399501956599, + "grad_norm": 0.4002245382166025, + "learning_rate": 4.9739219727337885e-06, + "loss": 0.0217, + "step": 27048 + }, + { + "epoch": 3.2075180837187243, + "grad_norm": 0.4443350364265798, + "learning_rate": 4.972485192912637e-06, + "loss": 0.0263, + "step": 27049 + }, + { + "epoch": 3.207636665480849, + "grad_norm": 0.60458807625833, + "learning_rate": 4.971048597720609e-06, + "loss": 0.0222, + "step": 27050 + }, + { + "epoch": 3.2077552472429742, + "grad_norm": 0.6688705494664458, + "learning_rate": 4.969612187170955e-06, + "loss": 0.0499, + "step": 27051 + }, + { + "epoch": 3.207873829005099, + "grad_norm": 0.3736367763607001, + "learning_rate": 4.968175961276911e-06, + "loss": 0.0199, + "step": 27052 + }, + { + "epoch": 3.207992410767224, + "grad_norm": 0.3708767055072503, + "learning_rate": 4.966739920051728e-06, + "loss": 0.0153, + "step": 27053 + }, + { + "epoch": 3.208110992529349, + "grad_norm": 1.005678773778029, + "learning_rate": 4.965304063508622e-06, + "loss": 0.0356, + "step": 27054 + }, + { + "epoch": 3.208229574291474, + "grad_norm": 0.377417483074164, + "learning_rate": 4.963868391660859e-06, + "loss": 0.0189, + "step": 27055 + }, + { + "epoch": 3.208348156053599, + "grad_norm": 0.3347756174342757, + "learning_rate": 4.962432904521652e-06, + "loss": 0.0151, + "step": 27056 + }, + { + "epoch": 3.208466737815724, + "grad_norm": 0.4269748662688113, + "learning_rate": 4.960997602104242e-06, + "loss": 0.0181, + "step": 27057 + }, + { + "epoch": 3.208585319577849, + "grad_norm": 0.7544825183747258, + "learning_rate": 4.959562484421862e-06, + "loss": 0.043, + "step": 27058 + }, + { + "epoch": 3.208703901339974, + "grad_norm": 0.7040985803380599, + "learning_rate": 4.958127551487746e-06, + "loss": 0.0326, + "step": 27059 + }, + { + "epoch": 3.208822483102099, + "grad_norm": 0.3385875643611163, + "learning_rate": 4.95669280331511e-06, + "loss": 0.0193, + "step": 27060 + }, + { + "epoch": 3.208941064864224, + "grad_norm": 0.3186687415153728, + "learning_rate": 4.955258239917187e-06, + "loss": 0.0131, + "step": 27061 + }, + { + "epoch": 3.209059646626349, + "grad_norm": 0.5627789663838764, + "learning_rate": 4.953823861307205e-06, + "loss": 0.0329, + "step": 27062 + }, + { + "epoch": 3.209178228388474, + "grad_norm": 0.6553090016138743, + "learning_rate": 4.952389667498383e-06, + "loss": 0.0301, + "step": 27063 + }, + { + "epoch": 3.2092968101505988, + "grad_norm": 0.8261983408702537, + "learning_rate": 4.950955658503953e-06, + "loss": 0.0481, + "step": 27064 + }, + { + "epoch": 3.209415391912724, + "grad_norm": 0.6844416030563497, + "learning_rate": 4.9495218343371115e-06, + "loss": 0.039, + "step": 27065 + }, + { + "epoch": 3.2095339736748487, + "grad_norm": 0.4254018947038711, + "learning_rate": 4.948088195011103e-06, + "loss": 0.0188, + "step": 27066 + }, + { + "epoch": 3.209652555436974, + "grad_norm": 0.42440294512836524, + "learning_rate": 4.946654740539128e-06, + "loss": 0.0233, + "step": 27067 + }, + { + "epoch": 3.2097711371990987, + "grad_norm": 0.5836880152690598, + "learning_rate": 4.9452214709344025e-06, + "loss": 0.0221, + "step": 27068 + }, + { + "epoch": 3.209889718961224, + "grad_norm": 0.5532553111630646, + "learning_rate": 4.943788386210144e-06, + "loss": 0.0233, + "step": 27069 + }, + { + "epoch": 3.2100083007233486, + "grad_norm": 0.7099197626351222, + "learning_rate": 4.942355486379568e-06, + "loss": 0.0292, + "step": 27070 + }, + { + "epoch": 3.210126882485474, + "grad_norm": 0.6750075678336807, + "learning_rate": 4.940922771455872e-06, + "loss": 0.0326, + "step": 27071 + }, + { + "epoch": 3.2102454642475986, + "grad_norm": 0.24672741959556543, + "learning_rate": 4.9394902414522685e-06, + "loss": 0.0095, + "step": 27072 + }, + { + "epoch": 3.210364046009724, + "grad_norm": 1.0587347708000714, + "learning_rate": 4.938057896381968e-06, + "loss": 0.0393, + "step": 27073 + }, + { + "epoch": 3.2104826277718486, + "grad_norm": 0.3589549981023637, + "learning_rate": 4.936625736258171e-06, + "loss": 0.0179, + "step": 27074 + }, + { + "epoch": 3.2106012095339738, + "grad_norm": 0.9341626036407942, + "learning_rate": 4.935193761094087e-06, + "loss": 0.0519, + "step": 27075 + }, + { + "epoch": 3.2107197912960985, + "grad_norm": 0.4606809112924022, + "learning_rate": 4.933761970902897e-06, + "loss": 0.0229, + "step": 27076 + }, + { + "epoch": 3.2108383730582237, + "grad_norm": 0.6385392694709144, + "learning_rate": 4.932330365697832e-06, + "loss": 0.0297, + "step": 27077 + }, + { + "epoch": 3.2109569548203485, + "grad_norm": 0.46613393651723734, + "learning_rate": 4.930898945492063e-06, + "loss": 0.0256, + "step": 27078 + }, + { + "epoch": 3.2110755365824737, + "grad_norm": 0.5412300386164081, + "learning_rate": 4.929467710298796e-06, + "loss": 0.0288, + "step": 27079 + }, + { + "epoch": 3.2111941183445984, + "grad_norm": 0.6170897691184155, + "learning_rate": 4.9280366601312245e-06, + "loss": 0.0221, + "step": 27080 + }, + { + "epoch": 3.2113127001067236, + "grad_norm": 0.3608064221916736, + "learning_rate": 4.92660579500254e-06, + "loss": 0.0181, + "step": 27081 + }, + { + "epoch": 3.2114312818688484, + "grad_norm": 0.4280773022820683, + "learning_rate": 4.9251751149259426e-06, + "loss": 0.0179, + "step": 27082 + }, + { + "epoch": 3.2115498636309736, + "grad_norm": 0.5708351776761729, + "learning_rate": 4.9237446199146e-06, + "loss": 0.0314, + "step": 27083 + }, + { + "epoch": 3.2116684453930984, + "grad_norm": 0.5309402318453341, + "learning_rate": 4.922314309981727e-06, + "loss": 0.0292, + "step": 27084 + }, + { + "epoch": 3.2117870271552236, + "grad_norm": 0.6746003432433283, + "learning_rate": 4.920884185140487e-06, + "loss": 0.0403, + "step": 27085 + }, + { + "epoch": 3.2119056089173483, + "grad_norm": 0.49827690586654494, + "learning_rate": 4.919454245404079e-06, + "loss": 0.0235, + "step": 27086 + }, + { + "epoch": 3.2120241906794735, + "grad_norm": 0.5211201699153272, + "learning_rate": 4.918024490785664e-06, + "loss": 0.0244, + "step": 27087 + }, + { + "epoch": 3.2121427724415983, + "grad_norm": 0.7221696912377594, + "learning_rate": 4.9165949212984505e-06, + "loss": 0.0395, + "step": 27088 + }, + { + "epoch": 3.2122613542037235, + "grad_norm": 0.5111766467096094, + "learning_rate": 4.915165536955599e-06, + "loss": 0.0307, + "step": 27089 + }, + { + "epoch": 3.2123799359658483, + "grad_norm": 0.4668384807154018, + "learning_rate": 4.913736337770292e-06, + "loss": 0.0231, + "step": 27090 + }, + { + "epoch": 3.2124985177279735, + "grad_norm": 0.4502471405867554, + "learning_rate": 4.912307323755702e-06, + "loss": 0.0166, + "step": 27091 + }, + { + "epoch": 3.212617099490098, + "grad_norm": 0.4790266769230135, + "learning_rate": 4.910878494925008e-06, + "loss": 0.0302, + "step": 27092 + }, + { + "epoch": 3.2127356812522234, + "grad_norm": 0.47904577771318585, + "learning_rate": 4.909449851291384e-06, + "loss": 0.0226, + "step": 27093 + }, + { + "epoch": 3.212854263014348, + "grad_norm": 0.5950838707099914, + "learning_rate": 4.908021392867989e-06, + "loss": 0.0222, + "step": 27094 + }, + { + "epoch": 3.2129728447764734, + "grad_norm": 0.595833036685722, + "learning_rate": 4.906593119668001e-06, + "loss": 0.0352, + "step": 27095 + }, + { + "epoch": 3.2130914265385986, + "grad_norm": 0.6218192035972047, + "learning_rate": 4.905165031704584e-06, + "loss": 0.0461, + "step": 27096 + }, + { + "epoch": 3.2132100083007233, + "grad_norm": 0.5048711979939895, + "learning_rate": 4.903737128990907e-06, + "loss": 0.0271, + "step": 27097 + }, + { + "epoch": 3.213328590062848, + "grad_norm": 0.7317280229295835, + "learning_rate": 4.902309411540118e-06, + "loss": 0.0451, + "step": 27098 + }, + { + "epoch": 3.2134471718249733, + "grad_norm": 0.6032619348672169, + "learning_rate": 4.900881879365404e-06, + "loss": 0.0302, + "step": 27099 + }, + { + "epoch": 3.2135657535870985, + "grad_norm": 0.3661808598821854, + "learning_rate": 4.899454532479908e-06, + "loss": 0.0149, + "step": 27100 + }, + { + "epoch": 3.2136843353492233, + "grad_norm": 0.5011063889781777, + "learning_rate": 4.898027370896788e-06, + "loss": 0.0265, + "step": 27101 + }, + { + "epoch": 3.2138029171113485, + "grad_norm": 0.6382682593000024, + "learning_rate": 4.8966003946292085e-06, + "loss": 0.0369, + "step": 27102 + }, + { + "epoch": 3.2139214988734732, + "grad_norm": 0.7181157597680852, + "learning_rate": 4.89517360369032e-06, + "loss": 0.0373, + "step": 27103 + }, + { + "epoch": 3.2140400806355984, + "grad_norm": 0.706273821929243, + "learning_rate": 4.893746998093282e-06, + "loss": 0.0372, + "step": 27104 + }, + { + "epoch": 3.214158662397723, + "grad_norm": 0.3861241538474286, + "learning_rate": 4.8923205778512335e-06, + "loss": 0.0235, + "step": 27105 + }, + { + "epoch": 3.2142772441598484, + "grad_norm": 0.5401576288422283, + "learning_rate": 4.890894342977334e-06, + "loss": 0.03, + "step": 27106 + }, + { + "epoch": 3.214395825921973, + "grad_norm": 0.4343890969821429, + "learning_rate": 4.889468293484728e-06, + "loss": 0.0302, + "step": 27107 + }, + { + "epoch": 3.2145144076840984, + "grad_norm": 0.46471599184794277, + "learning_rate": 4.8880424293865715e-06, + "loss": 0.0183, + "step": 27108 + }, + { + "epoch": 3.214632989446223, + "grad_norm": 0.4021538951870774, + "learning_rate": 4.886616750695991e-06, + "loss": 0.0172, + "step": 27109 + }, + { + "epoch": 3.2147515712083483, + "grad_norm": 0.5637703410099437, + "learning_rate": 4.885191257426142e-06, + "loss": 0.034, + "step": 27110 + }, + { + "epoch": 3.214870152970473, + "grad_norm": 0.4348594010007127, + "learning_rate": 4.883765949590163e-06, + "loss": 0.0253, + "step": 27111 + }, + { + "epoch": 3.2149887347325983, + "grad_norm": 0.34667848526763956, + "learning_rate": 4.882340827201193e-06, + "loss": 0.0191, + "step": 27112 + }, + { + "epoch": 3.215107316494723, + "grad_norm": 0.5809350441624331, + "learning_rate": 4.880915890272372e-06, + "loss": 0.0261, + "step": 27113 + }, + { + "epoch": 3.2152258982568482, + "grad_norm": 0.9528832511135107, + "learning_rate": 4.879491138816833e-06, + "loss": 0.0442, + "step": 27114 + }, + { + "epoch": 3.215344480018973, + "grad_norm": 0.3868807965045889, + "learning_rate": 4.87806657284772e-06, + "loss": 0.0114, + "step": 27115 + }, + { + "epoch": 3.215463061781098, + "grad_norm": 0.43152103354340143, + "learning_rate": 4.87664219237815e-06, + "loss": 0.0229, + "step": 27116 + }, + { + "epoch": 3.215581643543223, + "grad_norm": 0.6712397869178642, + "learning_rate": 4.875217997421264e-06, + "loss": 0.0439, + "step": 27117 + }, + { + "epoch": 3.215700225305348, + "grad_norm": 0.362022499431157, + "learning_rate": 4.8737939879901884e-06, + "loss": 0.0212, + "step": 27118 + }, + { + "epoch": 3.215818807067473, + "grad_norm": 0.326404774477491, + "learning_rate": 4.872370164098058e-06, + "loss": 0.019, + "step": 27119 + }, + { + "epoch": 3.215937388829598, + "grad_norm": 0.46732394697785645, + "learning_rate": 4.870946525757988e-06, + "loss": 0.0182, + "step": 27120 + }, + { + "epoch": 3.216055970591723, + "grad_norm": 0.6140216446254606, + "learning_rate": 4.869523072983107e-06, + "loss": 0.0476, + "step": 27121 + }, + { + "epoch": 3.216174552353848, + "grad_norm": 0.6480079462522715, + "learning_rate": 4.868099805786535e-06, + "loss": 0.0341, + "step": 27122 + }, + { + "epoch": 3.216293134115973, + "grad_norm": 0.43391969400473984, + "learning_rate": 4.866676724181396e-06, + "loss": 0.021, + "step": 27123 + }, + { + "epoch": 3.216411715878098, + "grad_norm": 0.5521883071652235, + "learning_rate": 4.865253828180816e-06, + "loss": 0.0281, + "step": 27124 + }, + { + "epoch": 3.216530297640223, + "grad_norm": 0.4393152817037006, + "learning_rate": 4.863831117797893e-06, + "loss": 0.0286, + "step": 27125 + }, + { + "epoch": 3.216648879402348, + "grad_norm": 0.5039165618084218, + "learning_rate": 4.862408593045764e-06, + "loss": 0.028, + "step": 27126 + }, + { + "epoch": 3.2167674611644728, + "grad_norm": 0.9214251305383692, + "learning_rate": 4.86098625393753e-06, + "loss": 0.0526, + "step": 27127 + }, + { + "epoch": 3.216886042926598, + "grad_norm": 0.373713198876367, + "learning_rate": 4.859564100486305e-06, + "loss": 0.0176, + "step": 27128 + }, + { + "epoch": 3.2170046246887227, + "grad_norm": 0.5630233497742907, + "learning_rate": 4.858142132705204e-06, + "loss": 0.0243, + "step": 27129 + }, + { + "epoch": 3.217123206450848, + "grad_norm": 0.4484121259477251, + "learning_rate": 4.856720350607335e-06, + "loss": 0.0167, + "step": 27130 + }, + { + "epoch": 3.2172417882129727, + "grad_norm": 0.4132359966213061, + "learning_rate": 4.8552987542058005e-06, + "loss": 0.0245, + "step": 27131 + }, + { + "epoch": 3.217360369975098, + "grad_norm": 0.3698949788257355, + "learning_rate": 4.853877343513708e-06, + "loss": 0.0177, + "step": 27132 + }, + { + "epoch": 3.2174789517372226, + "grad_norm": 0.5029102724569371, + "learning_rate": 4.85245611854416e-06, + "loss": 0.0224, + "step": 27133 + }, + { + "epoch": 3.217597533499348, + "grad_norm": 0.4030284089438012, + "learning_rate": 4.851035079310259e-06, + "loss": 0.0285, + "step": 27134 + }, + { + "epoch": 3.2177161152614726, + "grad_norm": 0.47646274945417577, + "learning_rate": 4.849614225825116e-06, + "loss": 0.0262, + "step": 27135 + }, + { + "epoch": 3.217834697023598, + "grad_norm": 0.4590026839089797, + "learning_rate": 4.848193558101804e-06, + "loss": 0.0284, + "step": 27136 + }, + { + "epoch": 3.2179532787857226, + "grad_norm": 0.6170639358475326, + "learning_rate": 4.846773076153449e-06, + "loss": 0.0225, + "step": 27137 + }, + { + "epoch": 3.2180718605478478, + "grad_norm": 0.5422598124536067, + "learning_rate": 4.8453527799931285e-06, + "loss": 0.0279, + "step": 27138 + }, + { + "epoch": 3.2181904423099725, + "grad_norm": 0.7069774808154977, + "learning_rate": 4.843932669633938e-06, + "loss": 0.0222, + "step": 27139 + }, + { + "epoch": 3.2183090240720977, + "grad_norm": 0.5965760294677922, + "learning_rate": 4.842512745088973e-06, + "loss": 0.0365, + "step": 27140 + }, + { + "epoch": 3.2184276058342225, + "grad_norm": 0.2759040607079294, + "learning_rate": 4.841093006371319e-06, + "loss": 0.0111, + "step": 27141 + }, + { + "epoch": 3.2185461875963477, + "grad_norm": 0.40280837084055854, + "learning_rate": 4.839673453494076e-06, + "loss": 0.0189, + "step": 27142 + }, + { + "epoch": 3.2186647693584725, + "grad_norm": 0.36165063826515925, + "learning_rate": 4.838254086470306e-06, + "loss": 0.017, + "step": 27143 + }, + { + "epoch": 3.2187833511205977, + "grad_norm": 0.43430132713101016, + "learning_rate": 4.836834905313126e-06, + "loss": 0.0196, + "step": 27144 + }, + { + "epoch": 3.218901932882723, + "grad_norm": 0.5135293199952029, + "learning_rate": 4.835415910035593e-06, + "loss": 0.0231, + "step": 27145 + }, + { + "epoch": 3.2190205146448476, + "grad_norm": 0.3014963065494346, + "learning_rate": 4.833997100650806e-06, + "loss": 0.015, + "step": 27146 + }, + { + "epoch": 3.2191390964069724, + "grad_norm": 0.3996442367640868, + "learning_rate": 4.832578477171823e-06, + "loss": 0.0222, + "step": 27147 + }, + { + "epoch": 3.2192576781690976, + "grad_norm": 0.3951939941393159, + "learning_rate": 4.831160039611748e-06, + "loss": 0.0211, + "step": 27148 + }, + { + "epoch": 3.219376259931223, + "grad_norm": 0.3872800449136801, + "learning_rate": 4.829741787983638e-06, + "loss": 0.0196, + "step": 27149 + }, + { + "epoch": 3.2194948416933475, + "grad_norm": 0.45373269610860995, + "learning_rate": 4.828323722300573e-06, + "loss": 0.02, + "step": 27150 + }, + { + "epoch": 3.2196134234554727, + "grad_norm": 0.5716222343559536, + "learning_rate": 4.826905842575629e-06, + "loss": 0.0306, + "step": 27151 + }, + { + "epoch": 3.2197320052175975, + "grad_norm": 0.846795912393875, + "learning_rate": 4.825488148821877e-06, + "loss": 0.0595, + "step": 27152 + }, + { + "epoch": 3.2198505869797227, + "grad_norm": 0.4850570304323358, + "learning_rate": 4.824070641052389e-06, + "loss": 0.0252, + "step": 27153 + }, + { + "epoch": 3.2199691687418475, + "grad_norm": 8.9272579282371, + "learning_rate": 4.822653319280216e-06, + "loss": 0.0186, + "step": 27154 + }, + { + "epoch": 3.2200877505039727, + "grad_norm": 0.47755910618699005, + "learning_rate": 4.821236183518449e-06, + "loss": 0.0279, + "step": 27155 + }, + { + "epoch": 3.2202063322660974, + "grad_norm": 0.4100536017443718, + "learning_rate": 4.819819233780135e-06, + "loss": 0.0161, + "step": 27156 + }, + { + "epoch": 3.2203249140282226, + "grad_norm": 0.4427425726145863, + "learning_rate": 4.818402470078346e-06, + "loss": 0.0255, + "step": 27157 + }, + { + "epoch": 3.2204434957903474, + "grad_norm": 0.4636609009081708, + "learning_rate": 4.8169858924261235e-06, + "loss": 0.0302, + "step": 27158 + }, + { + "epoch": 3.2205620775524726, + "grad_norm": 0.26778667652964355, + "learning_rate": 4.8155695008365535e-06, + "loss": 0.011, + "step": 27159 + }, + { + "epoch": 3.2206806593145973, + "grad_norm": 0.6961199130508007, + "learning_rate": 4.8141532953226764e-06, + "loss": 0.0399, + "step": 27160 + }, + { + "epoch": 3.2207992410767226, + "grad_norm": 0.6196551936558504, + "learning_rate": 4.812737275897553e-06, + "loss": 0.0317, + "step": 27161 + }, + { + "epoch": 3.2209178228388473, + "grad_norm": 0.4883468496797669, + "learning_rate": 4.811321442574235e-06, + "loss": 0.0281, + "step": 27162 + }, + { + "epoch": 3.2210364046009725, + "grad_norm": 0.43617583432439855, + "learning_rate": 4.809905795365777e-06, + "loss": 0.0244, + "step": 27163 + }, + { + "epoch": 3.2211549863630973, + "grad_norm": 0.35113986524343976, + "learning_rate": 4.808490334285237e-06, + "loss": 0.0179, + "step": 27164 + }, + { + "epoch": 3.2212735681252225, + "grad_norm": 0.3626224811718582, + "learning_rate": 4.807075059345647e-06, + "loss": 0.0156, + "step": 27165 + }, + { + "epoch": 3.2213921498873472, + "grad_norm": 0.7076252079584505, + "learning_rate": 4.805659970560061e-06, + "loss": 0.0424, + "step": 27166 + }, + { + "epoch": 3.2215107316494724, + "grad_norm": 0.7130614216583722, + "learning_rate": 4.804245067941529e-06, + "loss": 0.0333, + "step": 27167 + }, + { + "epoch": 3.221629313411597, + "grad_norm": 0.41116083335624093, + "learning_rate": 4.802830351503098e-06, + "loss": 0.0247, + "step": 27168 + }, + { + "epoch": 3.2217478951737224, + "grad_norm": 0.5468125883755182, + "learning_rate": 4.80141582125779e-06, + "loss": 0.022, + "step": 27169 + }, + { + "epoch": 3.221866476935847, + "grad_norm": 0.5473328506678831, + "learning_rate": 4.800001477218674e-06, + "loss": 0.0264, + "step": 27170 + }, + { + "epoch": 3.2219850586979724, + "grad_norm": 0.6068111300145196, + "learning_rate": 4.798587319398765e-06, + "loss": 0.0282, + "step": 27171 + }, + { + "epoch": 3.222103640460097, + "grad_norm": 0.38567903666739445, + "learning_rate": 4.79717334781111e-06, + "loss": 0.0164, + "step": 27172 + }, + { + "epoch": 3.2222222222222223, + "grad_norm": 0.7611366238625927, + "learning_rate": 4.795759562468741e-06, + "loss": 0.035, + "step": 27173 + }, + { + "epoch": 3.222340803984347, + "grad_norm": 0.6287737515709998, + "learning_rate": 4.794345963384692e-06, + "loss": 0.0192, + "step": 27174 + }, + { + "epoch": 3.2224593857464723, + "grad_norm": 0.6116212450539225, + "learning_rate": 4.792932550572002e-06, + "loss": 0.0242, + "step": 27175 + }, + { + "epoch": 3.222577967508597, + "grad_norm": 0.46821756156886396, + "learning_rate": 4.791519324043689e-06, + "loss": 0.0225, + "step": 27176 + }, + { + "epoch": 3.2226965492707222, + "grad_norm": 0.5022430533376327, + "learning_rate": 4.790106283812787e-06, + "loss": 0.0243, + "step": 27177 + }, + { + "epoch": 3.222815131032847, + "grad_norm": 0.3838372176504816, + "learning_rate": 4.7886934298923214e-06, + "loss": 0.022, + "step": 27178 + }, + { + "epoch": 3.222933712794972, + "grad_norm": 0.6696419063302662, + "learning_rate": 4.787280762295323e-06, + "loss": 0.0332, + "step": 27179 + }, + { + "epoch": 3.223052294557097, + "grad_norm": 0.3607563745955057, + "learning_rate": 4.785868281034805e-06, + "loss": 0.0146, + "step": 27180 + }, + { + "epoch": 3.223170876319222, + "grad_norm": 0.48588530841065447, + "learning_rate": 4.784455986123793e-06, + "loss": 0.0273, + "step": 27181 + }, + { + "epoch": 3.223289458081347, + "grad_norm": 0.6021044113496843, + "learning_rate": 4.783043877575305e-06, + "loss": 0.0285, + "step": 27182 + }, + { + "epoch": 3.223408039843472, + "grad_norm": 0.492233230842435, + "learning_rate": 4.781631955402363e-06, + "loss": 0.0217, + "step": 27183 + }, + { + "epoch": 3.223526621605597, + "grad_norm": 0.4813082936505619, + "learning_rate": 4.7802202196179785e-06, + "loss": 0.0278, + "step": 27184 + }, + { + "epoch": 3.223645203367722, + "grad_norm": 0.3972934109123442, + "learning_rate": 4.778808670235169e-06, + "loss": 0.0173, + "step": 27185 + }, + { + "epoch": 3.223763785129847, + "grad_norm": 0.5287712987925662, + "learning_rate": 4.777397307266953e-06, + "loss": 0.0233, + "step": 27186 + }, + { + "epoch": 3.223882366891972, + "grad_norm": 0.6304803889652346, + "learning_rate": 4.7759861307263316e-06, + "loss": 0.0363, + "step": 27187 + }, + { + "epoch": 3.224000948654097, + "grad_norm": 0.4729973360915809, + "learning_rate": 4.7745751406263165e-06, + "loss": 0.0249, + "step": 27188 + }, + { + "epoch": 3.224119530416222, + "grad_norm": 0.4966293707801062, + "learning_rate": 4.773164336979916e-06, + "loss": 0.0293, + "step": 27189 + }, + { + "epoch": 3.2242381121783468, + "grad_norm": 0.5559308855874671, + "learning_rate": 4.771753719800137e-06, + "loss": 0.0206, + "step": 27190 + }, + { + "epoch": 3.224356693940472, + "grad_norm": 0.6513599687497477, + "learning_rate": 4.770343289099988e-06, + "loss": 0.0274, + "step": 27191 + }, + { + "epoch": 3.2244752757025967, + "grad_norm": 0.4374826064104485, + "learning_rate": 4.7689330448924545e-06, + "loss": 0.0257, + "step": 27192 + }, + { + "epoch": 3.224593857464722, + "grad_norm": 0.34141180696292306, + "learning_rate": 4.767522987190562e-06, + "loss": 0.016, + "step": 27193 + }, + { + "epoch": 3.224712439226847, + "grad_norm": 0.41551493681557905, + "learning_rate": 4.766113116007293e-06, + "loss": 0.0171, + "step": 27194 + }, + { + "epoch": 3.224831020988972, + "grad_norm": 0.579757331846841, + "learning_rate": 4.764703431355652e-06, + "loss": 0.0269, + "step": 27195 + }, + { + "epoch": 3.2249496027510967, + "grad_norm": 0.699520498545478, + "learning_rate": 4.763293933248619e-06, + "loss": 0.0375, + "step": 27196 + }, + { + "epoch": 3.225068184513222, + "grad_norm": 0.5656277662874142, + "learning_rate": 4.761884621699214e-06, + "loss": 0.0331, + "step": 27197 + }, + { + "epoch": 3.225186766275347, + "grad_norm": 0.7478985043410501, + "learning_rate": 4.760475496720407e-06, + "loss": 0.0378, + "step": 27198 + }, + { + "epoch": 3.225305348037472, + "grad_norm": 0.4541358333010349, + "learning_rate": 4.759066558325198e-06, + "loss": 0.0144, + "step": 27199 + }, + { + "epoch": 3.225423929799597, + "grad_norm": 0.5908917204278737, + "learning_rate": 4.757657806526575e-06, + "loss": 0.0285, + "step": 27200 + }, + { + "epoch": 3.225542511561722, + "grad_norm": 0.4268808018888554, + "learning_rate": 4.756249241337524e-06, + "loss": 0.0273, + "step": 27201 + }, + { + "epoch": 3.225661093323847, + "grad_norm": 0.5174343959506252, + "learning_rate": 4.754840862771035e-06, + "loss": 0.0302, + "step": 27202 + }, + { + "epoch": 3.2257796750859717, + "grad_norm": 0.4488819410253685, + "learning_rate": 4.753432670840075e-06, + "loss": 0.0274, + "step": 27203 + }, + { + "epoch": 3.225898256848097, + "grad_norm": 0.7268380559881276, + "learning_rate": 4.75202466555765e-06, + "loss": 0.0317, + "step": 27204 + }, + { + "epoch": 3.2260168386102217, + "grad_norm": 0.5164663497540717, + "learning_rate": 4.750616846936723e-06, + "loss": 0.0192, + "step": 27205 + }, + { + "epoch": 3.226135420372347, + "grad_norm": 0.5448167221231031, + "learning_rate": 4.749209214990283e-06, + "loss": 0.0215, + "step": 27206 + }, + { + "epoch": 3.2262540021344717, + "grad_norm": 0.5155822948114486, + "learning_rate": 4.747801769731289e-06, + "loss": 0.0244, + "step": 27207 + }, + { + "epoch": 3.226372583896597, + "grad_norm": 0.5333707520888621, + "learning_rate": 4.746394511172739e-06, + "loss": 0.0321, + "step": 27208 + }, + { + "epoch": 3.2264911656587216, + "grad_norm": 0.6479083009218652, + "learning_rate": 4.744987439327589e-06, + "loss": 0.0375, + "step": 27209 + }, + { + "epoch": 3.226609747420847, + "grad_norm": 0.4860643838989823, + "learning_rate": 4.743580554208815e-06, + "loss": 0.019, + "step": 27210 + }, + { + "epoch": 3.2267283291829716, + "grad_norm": 0.41854442424160354, + "learning_rate": 4.742173855829391e-06, + "loss": 0.0168, + "step": 27211 + }, + { + "epoch": 3.226846910945097, + "grad_norm": 0.4299372891995034, + "learning_rate": 4.740767344202282e-06, + "loss": 0.0275, + "step": 27212 + }, + { + "epoch": 3.2269654927072216, + "grad_norm": 0.7581268589265074, + "learning_rate": 4.739361019340461e-06, + "loss": 0.0378, + "step": 27213 + }, + { + "epoch": 3.2270840744693468, + "grad_norm": 0.4823390743884749, + "learning_rate": 4.73795488125687e-06, + "loss": 0.0276, + "step": 27214 + }, + { + "epoch": 3.2272026562314715, + "grad_norm": 0.5149344518590976, + "learning_rate": 4.736548929964505e-06, + "loss": 0.0249, + "step": 27215 + }, + { + "epoch": 3.2273212379935967, + "grad_norm": 0.6881579566905157, + "learning_rate": 4.735143165476302e-06, + "loss": 0.0247, + "step": 27216 + }, + { + "epoch": 3.2274398197557215, + "grad_norm": 0.7089730601168054, + "learning_rate": 4.733737587805235e-06, + "loss": 0.0398, + "step": 27217 + }, + { + "epoch": 3.2275584015178467, + "grad_norm": 0.5598952929888358, + "learning_rate": 4.732332196964243e-06, + "loss": 0.0208, + "step": 27218 + }, + { + "epoch": 3.2276769832799714, + "grad_norm": 0.6215132880781084, + "learning_rate": 4.730926992966306e-06, + "loss": 0.0331, + "step": 27219 + }, + { + "epoch": 3.2277955650420966, + "grad_norm": 0.6885247839542904, + "learning_rate": 4.729521975824361e-06, + "loss": 0.038, + "step": 27220 + }, + { + "epoch": 3.2279141468042214, + "grad_norm": 0.8884847774825126, + "learning_rate": 4.728117145551364e-06, + "loss": 0.0454, + "step": 27221 + }, + { + "epoch": 3.2280327285663466, + "grad_norm": 0.4914453375744442, + "learning_rate": 4.726712502160271e-06, + "loss": 0.0324, + "step": 27222 + }, + { + "epoch": 3.2281513103284714, + "grad_norm": 0.3346655981494968, + "learning_rate": 4.725308045664026e-06, + "loss": 0.0163, + "step": 27223 + }, + { + "epoch": 3.2282698920905966, + "grad_norm": 0.5529492898912853, + "learning_rate": 4.723903776075586e-06, + "loss": 0.0276, + "step": 27224 + }, + { + "epoch": 3.2283884738527213, + "grad_norm": 0.743271418933701, + "learning_rate": 4.722499693407883e-06, + "loss": 0.0362, + "step": 27225 + }, + { + "epoch": 3.2285070556148465, + "grad_norm": 0.7114762183776034, + "learning_rate": 4.721095797673866e-06, + "loss": 0.0309, + "step": 27226 + }, + { + "epoch": 3.2286256373769713, + "grad_norm": 0.35137482278333276, + "learning_rate": 4.719692088886477e-06, + "loss": 0.0159, + "step": 27227 + }, + { + "epoch": 3.2287442191390965, + "grad_norm": 0.4726193434232054, + "learning_rate": 4.7182885670586654e-06, + "loss": 0.0295, + "step": 27228 + }, + { + "epoch": 3.2288628009012212, + "grad_norm": 0.4085488101353301, + "learning_rate": 4.71688523220335e-06, + "loss": 0.0219, + "step": 27229 + }, + { + "epoch": 3.2289813826633464, + "grad_norm": 0.3365209878045844, + "learning_rate": 4.715482084333492e-06, + "loss": 0.0156, + "step": 27230 + }, + { + "epoch": 3.229099964425471, + "grad_norm": 0.3277080834762484, + "learning_rate": 4.7140791234620066e-06, + "loss": 0.0133, + "step": 27231 + }, + { + "epoch": 3.2292185461875964, + "grad_norm": 0.5837757441141516, + "learning_rate": 4.712676349601838e-06, + "loss": 0.0173, + "step": 27232 + }, + { + "epoch": 3.229337127949721, + "grad_norm": 1.3201098765321355, + "learning_rate": 4.711273762765914e-06, + "loss": 0.0469, + "step": 27233 + }, + { + "epoch": 3.2294557097118464, + "grad_norm": 0.43680062058148034, + "learning_rate": 4.7098713629671665e-06, + "loss": 0.0177, + "step": 27234 + }, + { + "epoch": 3.229574291473971, + "grad_norm": 0.42391958258498086, + "learning_rate": 4.708469150218531e-06, + "loss": 0.0256, + "step": 27235 + }, + { + "epoch": 3.2296928732360963, + "grad_norm": 0.49478193726192476, + "learning_rate": 4.707067124532918e-06, + "loss": 0.019, + "step": 27236 + }, + { + "epoch": 3.229811454998221, + "grad_norm": 0.4059518878359518, + "learning_rate": 4.705665285923263e-06, + "loss": 0.0188, + "step": 27237 + }, + { + "epoch": 3.2299300367603463, + "grad_norm": 0.30795073622635466, + "learning_rate": 4.704263634402489e-06, + "loss": 0.0162, + "step": 27238 + }, + { + "epoch": 3.230048618522471, + "grad_norm": 0.6678603193228485, + "learning_rate": 4.702862169983513e-06, + "loss": 0.0268, + "step": 27239 + }, + { + "epoch": 3.2301672002845963, + "grad_norm": 0.3355299364201921, + "learning_rate": 4.70146089267926e-06, + "loss": 0.0164, + "step": 27240 + }, + { + "epoch": 3.230285782046721, + "grad_norm": 0.73707987109389, + "learning_rate": 4.700059802502646e-06, + "loss": 0.0422, + "step": 27241 + }, + { + "epoch": 3.230404363808846, + "grad_norm": 0.7557841445410667, + "learning_rate": 4.698658899466593e-06, + "loss": 0.0361, + "step": 27242 + }, + { + "epoch": 3.2305229455709714, + "grad_norm": 0.6338994056629211, + "learning_rate": 4.6972581835840025e-06, + "loss": 0.0367, + "step": 27243 + }, + { + "epoch": 3.230641527333096, + "grad_norm": 0.6031666777382958, + "learning_rate": 4.695857654867794e-06, + "loss": 0.033, + "step": 27244 + }, + { + "epoch": 3.230760109095221, + "grad_norm": 1.0973068580788656, + "learning_rate": 4.69445731333088e-06, + "loss": 0.0859, + "step": 27245 + }, + { + "epoch": 3.230878690857346, + "grad_norm": 0.8569519777573233, + "learning_rate": 4.693057158986177e-06, + "loss": 0.041, + "step": 27246 + }, + { + "epoch": 3.2309972726194713, + "grad_norm": 0.5398864517867409, + "learning_rate": 4.691657191846577e-06, + "loss": 0.0276, + "step": 27247 + }, + { + "epoch": 3.231115854381596, + "grad_norm": 0.5714824562462999, + "learning_rate": 4.690257411924995e-06, + "loss": 0.0307, + "step": 27248 + }, + { + "epoch": 3.2312344361437213, + "grad_norm": 0.47702098789470504, + "learning_rate": 4.688857819234336e-06, + "loss": 0.0242, + "step": 27249 + }, + { + "epoch": 3.231353017905846, + "grad_norm": 0.4413578491803499, + "learning_rate": 4.687458413787499e-06, + "loss": 0.0244, + "step": 27250 + }, + { + "epoch": 3.2314715996679713, + "grad_norm": 0.54266868645611, + "learning_rate": 4.686059195597392e-06, + "loss": 0.0292, + "step": 27251 + }, + { + "epoch": 3.231590181430096, + "grad_norm": 0.6224659353968609, + "learning_rate": 4.684660164676896e-06, + "loss": 0.023, + "step": 27252 + }, + { + "epoch": 3.2317087631922212, + "grad_norm": 0.40294754485153766, + "learning_rate": 4.683261321038934e-06, + "loss": 0.0182, + "step": 27253 + }, + { + "epoch": 3.231827344954346, + "grad_norm": 0.8028299641802137, + "learning_rate": 4.681862664696382e-06, + "loss": 0.049, + "step": 27254 + }, + { + "epoch": 3.231945926716471, + "grad_norm": 0.5154538989852842, + "learning_rate": 4.68046419566214e-06, + "loss": 0.0209, + "step": 27255 + }, + { + "epoch": 3.232064508478596, + "grad_norm": 0.5608194042542419, + "learning_rate": 4.679065913949101e-06, + "loss": 0.0239, + "step": 27256 + }, + { + "epoch": 3.232183090240721, + "grad_norm": 0.5925360250346736, + "learning_rate": 4.677667819570164e-06, + "loss": 0.0291, + "step": 27257 + }, + { + "epoch": 3.232301672002846, + "grad_norm": 0.3434359106587691, + "learning_rate": 4.6762699125382e-06, + "loss": 0.0192, + "step": 27258 + }, + { + "epoch": 3.232420253764971, + "grad_norm": 0.6951223406458682, + "learning_rate": 4.674872192866106e-06, + "loss": 0.0342, + "step": 27259 + }, + { + "epoch": 3.232538835527096, + "grad_norm": 0.4116545053717428, + "learning_rate": 4.673474660566765e-06, + "loss": 0.0173, + "step": 27260 + }, + { + "epoch": 3.232657417289221, + "grad_norm": 0.38910162068966403, + "learning_rate": 4.672077315653062e-06, + "loss": 0.0177, + "step": 27261 + }, + { + "epoch": 3.232775999051346, + "grad_norm": 0.7106965114560032, + "learning_rate": 4.670680158137885e-06, + "loss": 0.0501, + "step": 27262 + }, + { + "epoch": 3.232894580813471, + "grad_norm": 0.5129262747663298, + "learning_rate": 4.669283188034093e-06, + "loss": 0.0249, + "step": 27263 + }, + { + "epoch": 3.233013162575596, + "grad_norm": 0.6184323269722121, + "learning_rate": 4.667886405354594e-06, + "loss": 0.0418, + "step": 27264 + }, + { + "epoch": 3.233131744337721, + "grad_norm": 0.6341111404147647, + "learning_rate": 4.666489810112243e-06, + "loss": 0.0313, + "step": 27265 + }, + { + "epoch": 3.2332503260998458, + "grad_norm": 0.6056428846621508, + "learning_rate": 4.665093402319925e-06, + "loss": 0.0328, + "step": 27266 + }, + { + "epoch": 3.233368907861971, + "grad_norm": 0.8269813691422123, + "learning_rate": 4.663697181990501e-06, + "loss": 0.0344, + "step": 27267 + }, + { + "epoch": 3.2334874896240957, + "grad_norm": 0.42553873655514707, + "learning_rate": 4.66230114913686e-06, + "loss": 0.0282, + "step": 27268 + }, + { + "epoch": 3.233606071386221, + "grad_norm": 0.9230575845232698, + "learning_rate": 4.660905303771859e-06, + "loss": 0.0417, + "step": 27269 + }, + { + "epoch": 3.2337246531483457, + "grad_norm": 0.39540061546662736, + "learning_rate": 4.659509645908367e-06, + "loss": 0.0197, + "step": 27270 + }, + { + "epoch": 3.233843234910471, + "grad_norm": 0.8169935928637869, + "learning_rate": 4.658114175559256e-06, + "loss": 0.0586, + "step": 27271 + }, + { + "epoch": 3.2339618166725956, + "grad_norm": 0.5817890802245697, + "learning_rate": 4.656718892737383e-06, + "loss": 0.0236, + "step": 27272 + }, + { + "epoch": 3.234080398434721, + "grad_norm": 0.5273790105393465, + "learning_rate": 4.655323797455624e-06, + "loss": 0.0245, + "step": 27273 + }, + { + "epoch": 3.2341989801968456, + "grad_norm": 0.5843239134491043, + "learning_rate": 4.653928889726819e-06, + "loss": 0.0284, + "step": 27274 + }, + { + "epoch": 3.234317561958971, + "grad_norm": 0.4182482136212162, + "learning_rate": 4.652534169563852e-06, + "loss": 0.0262, + "step": 27275 + }, + { + "epoch": 3.2344361437210956, + "grad_norm": 0.5094863693057446, + "learning_rate": 4.651139636979563e-06, + "loss": 0.0232, + "step": 27276 + }, + { + "epoch": 3.2345547254832208, + "grad_norm": 0.3152121986058886, + "learning_rate": 4.649745291986818e-06, + "loss": 0.014, + "step": 27277 + }, + { + "epoch": 3.2346733072453455, + "grad_norm": 0.7273817856099339, + "learning_rate": 4.648351134598455e-06, + "loss": 0.0247, + "step": 27278 + }, + { + "epoch": 3.2347918890074707, + "grad_norm": 0.33676065867603083, + "learning_rate": 4.646957164827348e-06, + "loss": 0.0153, + "step": 27279 + }, + { + "epoch": 3.2349104707695955, + "grad_norm": 0.3596976961187567, + "learning_rate": 4.6455633826863326e-06, + "loss": 0.0119, + "step": 27280 + }, + { + "epoch": 3.2350290525317207, + "grad_norm": 0.5222262561324078, + "learning_rate": 4.644169788188263e-06, + "loss": 0.0272, + "step": 27281 + }, + { + "epoch": 3.2351476342938454, + "grad_norm": 0.7135090957657931, + "learning_rate": 4.642776381345987e-06, + "loss": 0.0377, + "step": 27282 + }, + { + "epoch": 3.2352662160559706, + "grad_norm": 0.401714610730151, + "learning_rate": 4.641383162172347e-06, + "loss": 0.0143, + "step": 27283 + }, + { + "epoch": 3.2353847978180954, + "grad_norm": 0.46142739701940166, + "learning_rate": 4.639990130680197e-06, + "loss": 0.0216, + "step": 27284 + }, + { + "epoch": 3.2355033795802206, + "grad_norm": 0.2994213547780573, + "learning_rate": 4.638597286882354e-06, + "loss": 0.017, + "step": 27285 + }, + { + "epoch": 3.2356219613423454, + "grad_norm": 0.5836146720479091, + "learning_rate": 4.6372046307916915e-06, + "loss": 0.0306, + "step": 27286 + }, + { + "epoch": 3.2357405431044706, + "grad_norm": 0.6575671371037373, + "learning_rate": 4.635812162421024e-06, + "loss": 0.0305, + "step": 27287 + }, + { + "epoch": 3.2358591248665953, + "grad_norm": 0.5715480045699568, + "learning_rate": 4.634419881783198e-06, + "loss": 0.0307, + "step": 27288 + }, + { + "epoch": 3.2359777066287205, + "grad_norm": 0.3954912849445684, + "learning_rate": 4.633027788891045e-06, + "loss": 0.0191, + "step": 27289 + }, + { + "epoch": 3.2360962883908453, + "grad_norm": 0.4873538680900388, + "learning_rate": 4.631635883757401e-06, + "loss": 0.0256, + "step": 27290 + }, + { + "epoch": 3.2362148701529705, + "grad_norm": 0.3966020594275511, + "learning_rate": 4.630244166395103e-06, + "loss": 0.0147, + "step": 27291 + }, + { + "epoch": 3.2363334519150957, + "grad_norm": 0.40710454337668217, + "learning_rate": 4.628852636816969e-06, + "loss": 0.0209, + "step": 27292 + }, + { + "epoch": 3.2364520336772205, + "grad_norm": 0.5272838559903921, + "learning_rate": 4.627461295035835e-06, + "loss": 0.0227, + "step": 27293 + }, + { + "epoch": 3.236570615439345, + "grad_norm": 0.806128464936843, + "learning_rate": 4.62607014106452e-06, + "loss": 0.0504, + "step": 27294 + }, + { + "epoch": 3.2366891972014704, + "grad_norm": 0.476165796579598, + "learning_rate": 4.624679174915864e-06, + "loss": 0.0241, + "step": 27295 + }, + { + "epoch": 3.2368077789635956, + "grad_norm": 0.4484873541252045, + "learning_rate": 4.623288396602673e-06, + "loss": 0.0216, + "step": 27296 + }, + { + "epoch": 3.2369263607257204, + "grad_norm": 0.4577438470960431, + "learning_rate": 4.621897806137776e-06, + "loss": 0.0211, + "step": 27297 + }, + { + "epoch": 3.237044942487845, + "grad_norm": 0.4445869190818124, + "learning_rate": 4.6205074035339915e-06, + "loss": 0.0245, + "step": 27298 + }, + { + "epoch": 3.2371635242499703, + "grad_norm": 0.6016173955031497, + "learning_rate": 4.619117188804137e-06, + "loss": 0.0278, + "step": 27299 + }, + { + "epoch": 3.2372821060120955, + "grad_norm": 0.5010630120937563, + "learning_rate": 4.6177271619610305e-06, + "loss": 0.0283, + "step": 27300 + }, + { + "epoch": 3.2374006877742203, + "grad_norm": 0.3388603095386597, + "learning_rate": 4.616337323017484e-06, + "loss": 0.0169, + "step": 27301 + }, + { + "epoch": 3.2375192695363455, + "grad_norm": 0.6033642240666882, + "learning_rate": 4.614947671986319e-06, + "loss": 0.0345, + "step": 27302 + }, + { + "epoch": 3.2376378512984703, + "grad_norm": 0.47106580384936847, + "learning_rate": 4.613558208880331e-06, + "loss": 0.0229, + "step": 27303 + }, + { + "epoch": 3.2377564330605955, + "grad_norm": 0.5632848076030716, + "learning_rate": 4.612168933712338e-06, + "loss": 0.0296, + "step": 27304 + }, + { + "epoch": 3.23787501482272, + "grad_norm": 0.533806084203638, + "learning_rate": 4.610779846495145e-06, + "loss": 0.0271, + "step": 27305 + }, + { + "epoch": 3.2379935965848454, + "grad_norm": 0.8057453971406728, + "learning_rate": 4.609390947241565e-06, + "loss": 0.0479, + "step": 27306 + }, + { + "epoch": 3.23811217834697, + "grad_norm": 1.1889861441133327, + "learning_rate": 4.608002235964392e-06, + "loss": 0.0742, + "step": 27307 + }, + { + "epoch": 3.2382307601090954, + "grad_norm": 0.3645307030764082, + "learning_rate": 4.60661371267643e-06, + "loss": 0.0196, + "step": 27308 + }, + { + "epoch": 3.23834934187122, + "grad_norm": 0.6487430304160718, + "learning_rate": 4.605225377390482e-06, + "loss": 0.0257, + "step": 27309 + }, + { + "epoch": 3.2384679236333453, + "grad_norm": 0.5909204472800735, + "learning_rate": 4.603837230119346e-06, + "loss": 0.033, + "step": 27310 + }, + { + "epoch": 3.23858650539547, + "grad_norm": 0.6163002612858551, + "learning_rate": 4.60244927087582e-06, + "loss": 0.0321, + "step": 27311 + }, + { + "epoch": 3.2387050871575953, + "grad_norm": 0.4958752890752176, + "learning_rate": 4.601061499672698e-06, + "loss": 0.0252, + "step": 27312 + }, + { + "epoch": 3.23882366891972, + "grad_norm": 0.322413057849458, + "learning_rate": 4.599673916522781e-06, + "loss": 0.0118, + "step": 27313 + }, + { + "epoch": 3.2389422506818453, + "grad_norm": 0.5794683131841242, + "learning_rate": 4.598286521438846e-06, + "loss": 0.0295, + "step": 27314 + }, + { + "epoch": 3.23906083244397, + "grad_norm": 0.3254165354660108, + "learning_rate": 4.596899314433695e-06, + "loss": 0.018, + "step": 27315 + }, + { + "epoch": 3.2391794142060952, + "grad_norm": 0.5201244613664262, + "learning_rate": 4.5955122955201076e-06, + "loss": 0.0298, + "step": 27316 + }, + { + "epoch": 3.23929799596822, + "grad_norm": 0.8370913953662976, + "learning_rate": 4.594125464710886e-06, + "loss": 0.0512, + "step": 27317 + }, + { + "epoch": 3.239416577730345, + "grad_norm": 0.4324646141044177, + "learning_rate": 4.592738822018794e-06, + "loss": 0.0277, + "step": 27318 + }, + { + "epoch": 3.23953515949247, + "grad_norm": 0.49818732220943585, + "learning_rate": 4.591352367456628e-06, + "loss": 0.0381, + "step": 27319 + }, + { + "epoch": 3.239653741254595, + "grad_norm": 0.6096855063138534, + "learning_rate": 4.589966101037163e-06, + "loss": 0.0321, + "step": 27320 + }, + { + "epoch": 3.23977232301672, + "grad_norm": 0.4185235780663556, + "learning_rate": 4.588580022773184e-06, + "loss": 0.023, + "step": 27321 + }, + { + "epoch": 3.239890904778845, + "grad_norm": 0.7117572112539658, + "learning_rate": 4.587194132677475e-06, + "loss": 0.0423, + "step": 27322 + }, + { + "epoch": 3.24000948654097, + "grad_norm": 0.4198776867915564, + "learning_rate": 4.585808430762789e-06, + "loss": 0.0196, + "step": 27323 + }, + { + "epoch": 3.240128068303095, + "grad_norm": 0.49236310478793366, + "learning_rate": 4.58442291704193e-06, + "loss": 0.0218, + "step": 27324 + }, + { + "epoch": 3.24024665006522, + "grad_norm": 0.720286817161152, + "learning_rate": 4.58303759152765e-06, + "loss": 0.0264, + "step": 27325 + }, + { + "epoch": 3.240365231827345, + "grad_norm": 0.32826045755656735, + "learning_rate": 4.581652454232727e-06, + "loss": 0.0111, + "step": 27326 + }, + { + "epoch": 3.24048381358947, + "grad_norm": 0.48291498865046056, + "learning_rate": 4.580267505169928e-06, + "loss": 0.0175, + "step": 27327 + }, + { + "epoch": 3.240602395351595, + "grad_norm": 0.28362267793892687, + "learning_rate": 4.578882744352028e-06, + "loss": 0.0133, + "step": 27328 + }, + { + "epoch": 3.2407209771137198, + "grad_norm": 0.5583341798906812, + "learning_rate": 4.577498171791783e-06, + "loss": 0.0235, + "step": 27329 + }, + { + "epoch": 3.240839558875845, + "grad_norm": 0.4750956381021061, + "learning_rate": 4.576113787501962e-06, + "loss": 0.0237, + "step": 27330 + }, + { + "epoch": 3.2409581406379697, + "grad_norm": 0.5637804753962082, + "learning_rate": 4.574729591495324e-06, + "loss": 0.0317, + "step": 27331 + }, + { + "epoch": 3.241076722400095, + "grad_norm": 0.43710229275976853, + "learning_rate": 4.573345583784633e-06, + "loss": 0.0266, + "step": 27332 + }, + { + "epoch": 3.2411953041622197, + "grad_norm": 0.6682334126310312, + "learning_rate": 4.571961764382654e-06, + "loss": 0.0284, + "step": 27333 + }, + { + "epoch": 3.241313885924345, + "grad_norm": 0.7042824345659573, + "learning_rate": 4.570578133302122e-06, + "loss": 0.0475, + "step": 27334 + }, + { + "epoch": 3.2414324676864696, + "grad_norm": 0.5996761122512324, + "learning_rate": 4.569194690555822e-06, + "loss": 0.0273, + "step": 27335 + }, + { + "epoch": 3.241551049448595, + "grad_norm": 0.5098890458949079, + "learning_rate": 4.567811436156485e-06, + "loss": 0.0273, + "step": 27336 + }, + { + "epoch": 3.2416696312107196, + "grad_norm": 0.6238107744147662, + "learning_rate": 4.566428370116874e-06, + "loss": 0.0263, + "step": 27337 + }, + { + "epoch": 3.241788212972845, + "grad_norm": 0.46000537076243697, + "learning_rate": 4.565045492449735e-06, + "loss": 0.0187, + "step": 27338 + }, + { + "epoch": 3.2419067947349696, + "grad_norm": 0.6176173352627707, + "learning_rate": 4.563662803167815e-06, + "loss": 0.0349, + "step": 27339 + }, + { + "epoch": 3.2420253764970948, + "grad_norm": 0.6634864913685125, + "learning_rate": 4.562280302283872e-06, + "loss": 0.0376, + "step": 27340 + }, + { + "epoch": 3.2421439582592195, + "grad_norm": 0.6307902602259513, + "learning_rate": 4.560897989810628e-06, + "loss": 0.0389, + "step": 27341 + }, + { + "epoch": 3.2422625400213447, + "grad_norm": 0.4696488837713808, + "learning_rate": 4.559515865760855e-06, + "loss": 0.0345, + "step": 27342 + }, + { + "epoch": 3.2423811217834695, + "grad_norm": 0.5894769902239515, + "learning_rate": 4.558133930147273e-06, + "loss": 0.0391, + "step": 27343 + }, + { + "epoch": 3.2424997035455947, + "grad_norm": 0.38806531732752153, + "learning_rate": 4.556752182982635e-06, + "loss": 0.0248, + "step": 27344 + }, + { + "epoch": 3.24261828530772, + "grad_norm": 0.5184335037237602, + "learning_rate": 4.555370624279659e-06, + "loss": 0.026, + "step": 27345 + }, + { + "epoch": 3.2427368670698447, + "grad_norm": 0.4683361526217257, + "learning_rate": 4.553989254051108e-06, + "loss": 0.0222, + "step": 27346 + }, + { + "epoch": 3.2428554488319694, + "grad_norm": 0.633745625212085, + "learning_rate": 4.5526080723097e-06, + "loss": 0.0369, + "step": 27347 + }, + { + "epoch": 3.2429740305940946, + "grad_norm": 0.7835557348044226, + "learning_rate": 4.551227079068169e-06, + "loss": 0.0262, + "step": 27348 + }, + { + "epoch": 3.24309261235622, + "grad_norm": 0.7625867256495316, + "learning_rate": 4.549846274339248e-06, + "loss": 0.0385, + "step": 27349 + }, + { + "epoch": 3.2432111941183446, + "grad_norm": 0.4233362697504844, + "learning_rate": 4.548465658135667e-06, + "loss": 0.0241, + "step": 27350 + }, + { + "epoch": 3.24332977588047, + "grad_norm": 0.7692601095033128, + "learning_rate": 4.5470852304701575e-06, + "loss": 0.0353, + "step": 27351 + }, + { + "epoch": 3.2434483576425945, + "grad_norm": 0.6918448639330976, + "learning_rate": 4.545704991355437e-06, + "loss": 0.0534, + "step": 27352 + }, + { + "epoch": 3.2435669394047197, + "grad_norm": 0.5677232084239886, + "learning_rate": 4.544324940804234e-06, + "loss": 0.0347, + "step": 27353 + }, + { + "epoch": 3.2436855211668445, + "grad_norm": 0.8165486150843095, + "learning_rate": 4.542945078829272e-06, + "loss": 0.0398, + "step": 27354 + }, + { + "epoch": 3.2438041029289697, + "grad_norm": 0.6523069173206353, + "learning_rate": 4.541565405443274e-06, + "loss": 0.0268, + "step": 27355 + }, + { + "epoch": 3.2439226846910945, + "grad_norm": 0.6369354615631665, + "learning_rate": 4.540185920658943e-06, + "loss": 0.0301, + "step": 27356 + }, + { + "epoch": 3.2440412664532197, + "grad_norm": 0.6356614147764452, + "learning_rate": 4.5388066244890226e-06, + "loss": 0.0362, + "step": 27357 + }, + { + "epoch": 3.2441598482153444, + "grad_norm": 0.37369812507215877, + "learning_rate": 4.537427516946208e-06, + "loss": 0.0162, + "step": 27358 + }, + { + "epoch": 3.2442784299774696, + "grad_norm": 0.5741404796001948, + "learning_rate": 4.536048598043216e-06, + "loss": 0.0252, + "step": 27359 + }, + { + "epoch": 3.2443970117395944, + "grad_norm": 0.7944198066295567, + "learning_rate": 4.534669867792763e-06, + "loss": 0.0418, + "step": 27360 + }, + { + "epoch": 3.2445155935017196, + "grad_norm": 0.32090548395512947, + "learning_rate": 4.533291326207559e-06, + "loss": 0.0139, + "step": 27361 + }, + { + "epoch": 3.2446341752638443, + "grad_norm": 0.42480184792404996, + "learning_rate": 4.531912973300315e-06, + "loss": 0.0173, + "step": 27362 + }, + { + "epoch": 3.2447527570259695, + "grad_norm": 0.4380770192852842, + "learning_rate": 4.530534809083728e-06, + "loss": 0.0292, + "step": 27363 + }, + { + "epoch": 3.2448713387880943, + "grad_norm": 0.6234087563659148, + "learning_rate": 4.529156833570508e-06, + "loss": 0.0222, + "step": 27364 + }, + { + "epoch": 3.2449899205502195, + "grad_norm": 0.6664669358081164, + "learning_rate": 4.5277790467733585e-06, + "loss": 0.0298, + "step": 27365 + }, + { + "epoch": 3.2451085023123443, + "grad_norm": 0.31894014727263564, + "learning_rate": 4.526401448704992e-06, + "loss": 0.0209, + "step": 27366 + }, + { + "epoch": 3.2452270840744695, + "grad_norm": 0.7590152596678472, + "learning_rate": 4.525024039378087e-06, + "loss": 0.0346, + "step": 27367 + }, + { + "epoch": 3.2453456658365942, + "grad_norm": 0.565794962137157, + "learning_rate": 4.523646818805355e-06, + "loss": 0.0246, + "step": 27368 + }, + { + "epoch": 3.2454642475987194, + "grad_norm": 0.5977421911653837, + "learning_rate": 4.522269786999489e-06, + "loss": 0.0255, + "step": 27369 + }, + { + "epoch": 3.245582829360844, + "grad_norm": 0.48433850931694944, + "learning_rate": 4.520892943973182e-06, + "loss": 0.0313, + "step": 27370 + }, + { + "epoch": 3.2457014111229694, + "grad_norm": 0.582899132317779, + "learning_rate": 4.519516289739132e-06, + "loss": 0.0281, + "step": 27371 + }, + { + "epoch": 3.245819992885094, + "grad_norm": 0.5435044509677875, + "learning_rate": 4.518139824310025e-06, + "loss": 0.0309, + "step": 27372 + }, + { + "epoch": 3.2459385746472194, + "grad_norm": 1.0326268253664608, + "learning_rate": 4.516763547698558e-06, + "loss": 0.0366, + "step": 27373 + }, + { + "epoch": 3.246057156409344, + "grad_norm": 0.298111224145929, + "learning_rate": 4.515387459917408e-06, + "loss": 0.0183, + "step": 27374 + }, + { + "epoch": 3.2461757381714693, + "grad_norm": 0.5111312494474423, + "learning_rate": 4.514011560979267e-06, + "loss": 0.0221, + "step": 27375 + }, + { + "epoch": 3.246294319933594, + "grad_norm": 0.5804552797863295, + "learning_rate": 4.512635850896816e-06, + "loss": 0.0329, + "step": 27376 + }, + { + "epoch": 3.2464129016957193, + "grad_norm": 0.5174700618264639, + "learning_rate": 4.511260329682746e-06, + "loss": 0.0242, + "step": 27377 + }, + { + "epoch": 3.246531483457844, + "grad_norm": 0.534838604389928, + "learning_rate": 4.5098849973497224e-06, + "loss": 0.0295, + "step": 27378 + }, + { + "epoch": 3.2466500652199692, + "grad_norm": 0.5900219704656952, + "learning_rate": 4.5085098539104335e-06, + "loss": 0.0321, + "step": 27379 + }, + { + "epoch": 3.246768646982094, + "grad_norm": 0.5724684638199294, + "learning_rate": 4.507134899377557e-06, + "loss": 0.0269, + "step": 27380 + }, + { + "epoch": 3.246887228744219, + "grad_norm": 0.41337218026176337, + "learning_rate": 4.505760133763767e-06, + "loss": 0.017, + "step": 27381 + }, + { + "epoch": 3.247005810506344, + "grad_norm": 0.38169045347208264, + "learning_rate": 4.504385557081742e-06, + "loss": 0.0225, + "step": 27382 + }, + { + "epoch": 3.247124392268469, + "grad_norm": 0.5941108042095287, + "learning_rate": 4.503011169344138e-06, + "loss": 0.0281, + "step": 27383 + }, + { + "epoch": 3.247242974030594, + "grad_norm": 0.38735980539473686, + "learning_rate": 4.501636970563644e-06, + "loss": 0.0212, + "step": 27384 + }, + { + "epoch": 3.247361555792719, + "grad_norm": 0.774212854758011, + "learning_rate": 4.5002629607529185e-06, + "loss": 0.0218, + "step": 27385 + }, + { + "epoch": 3.247480137554844, + "grad_norm": 0.6301239610651268, + "learning_rate": 4.498889139924628e-06, + "loss": 0.036, + "step": 27386 + }, + { + "epoch": 3.247598719316969, + "grad_norm": 0.2706665194422037, + "learning_rate": 4.49751550809144e-06, + "loss": 0.0154, + "step": 27387 + }, + { + "epoch": 3.247717301079094, + "grad_norm": 0.9746812033701433, + "learning_rate": 4.496142065266024e-06, + "loss": 0.0369, + "step": 27388 + }, + { + "epoch": 3.247835882841219, + "grad_norm": 0.3449711372556873, + "learning_rate": 4.494768811461028e-06, + "loss": 0.0198, + "step": 27389 + }, + { + "epoch": 3.247954464603344, + "grad_norm": 0.5336246444327859, + "learning_rate": 4.493395746689116e-06, + "loss": 0.0286, + "step": 27390 + }, + { + "epoch": 3.248073046365469, + "grad_norm": 0.473151080437943, + "learning_rate": 4.492022870962953e-06, + "loss": 0.0227, + "step": 27391 + }, + { + "epoch": 3.2481916281275938, + "grad_norm": 0.48718309663628, + "learning_rate": 4.4906501842951865e-06, + "loss": 0.0193, + "step": 27392 + }, + { + "epoch": 3.248310209889719, + "grad_norm": 0.40622114310778723, + "learning_rate": 4.489277686698485e-06, + "loss": 0.0219, + "step": 27393 + }, + { + "epoch": 3.248428791651844, + "grad_norm": 0.9225840947491843, + "learning_rate": 4.487905378185475e-06, + "loss": 0.0614, + "step": 27394 + }, + { + "epoch": 3.248547373413969, + "grad_norm": 0.6879308011312439, + "learning_rate": 4.486533258768838e-06, + "loss": 0.0324, + "step": 27395 + }, + { + "epoch": 3.2486659551760937, + "grad_norm": 0.43505714360855335, + "learning_rate": 4.4851613284612045e-06, + "loss": 0.022, + "step": 27396 + }, + { + "epoch": 3.248784536938219, + "grad_norm": 0.808959178777857, + "learning_rate": 4.483789587275225e-06, + "loss": 0.0436, + "step": 27397 + }, + { + "epoch": 3.248903118700344, + "grad_norm": 0.4437904808254502, + "learning_rate": 4.4824180352235455e-06, + "loss": 0.0219, + "step": 27398 + }, + { + "epoch": 3.249021700462469, + "grad_norm": 0.35209507786537586, + "learning_rate": 4.481046672318815e-06, + "loss": 0.0202, + "step": 27399 + }, + { + "epoch": 3.249140282224594, + "grad_norm": 0.5583964971034012, + "learning_rate": 4.479675498573676e-06, + "loss": 0.0307, + "step": 27400 + }, + { + "epoch": 3.249258863986719, + "grad_norm": 0.643103234627052, + "learning_rate": 4.478304514000753e-06, + "loss": 0.0345, + "step": 27401 + }, + { + "epoch": 3.249377445748844, + "grad_norm": 0.39678451373725615, + "learning_rate": 4.476933718612708e-06, + "loss": 0.0246, + "step": 27402 + }, + { + "epoch": 3.2494960275109688, + "grad_norm": 0.6052258564524259, + "learning_rate": 4.475563112422163e-06, + "loss": 0.0353, + "step": 27403 + }, + { + "epoch": 3.249614609273094, + "grad_norm": 0.4866823179577112, + "learning_rate": 4.474192695441765e-06, + "loss": 0.0212, + "step": 27404 + }, + { + "epoch": 3.2497331910352187, + "grad_norm": 0.4665782514470352, + "learning_rate": 4.472822467684123e-06, + "loss": 0.0182, + "step": 27405 + }, + { + "epoch": 3.249851772797344, + "grad_norm": 0.5149479347549677, + "learning_rate": 4.4714524291619015e-06, + "loss": 0.0258, + "step": 27406 + }, + { + "epoch": 3.2499703545594687, + "grad_norm": 0.6652262508351136, + "learning_rate": 4.470082579887708e-06, + "loss": 0.0354, + "step": 27407 + }, + { + "epoch": 3.250088936321594, + "grad_norm": 0.9807370166508604, + "learning_rate": 4.468712919874177e-06, + "loss": 0.0443, + "step": 27408 + }, + { + "epoch": 3.2502075180837187, + "grad_norm": 0.5383983770005903, + "learning_rate": 4.467343449133937e-06, + "loss": 0.0281, + "step": 27409 + }, + { + "epoch": 3.250326099845844, + "grad_norm": 0.4319105292999387, + "learning_rate": 4.465974167679609e-06, + "loss": 0.0245, + "step": 27410 + }, + { + "epoch": 3.2504446816079686, + "grad_norm": 0.4315315469292918, + "learning_rate": 4.464605075523828e-06, + "loss": 0.0194, + "step": 27411 + }, + { + "epoch": 3.250563263370094, + "grad_norm": 0.5511434818096692, + "learning_rate": 4.463236172679192e-06, + "loss": 0.0257, + "step": 27412 + }, + { + "epoch": 3.2506818451322186, + "grad_norm": 0.8480545764501413, + "learning_rate": 4.461867459158348e-06, + "loss": 0.0361, + "step": 27413 + }, + { + "epoch": 3.250800426894344, + "grad_norm": 0.5942261056552267, + "learning_rate": 4.460498934973892e-06, + "loss": 0.0238, + "step": 27414 + }, + { + "epoch": 3.2509190086564685, + "grad_norm": 0.4146816983189422, + "learning_rate": 4.459130600138459e-06, + "loss": 0.0215, + "step": 27415 + }, + { + "epoch": 3.2510375904185937, + "grad_norm": 0.5043980642688954, + "learning_rate": 4.457762454664638e-06, + "loss": 0.0257, + "step": 27416 + }, + { + "epoch": 3.2511561721807185, + "grad_norm": 0.6914993442939822, + "learning_rate": 4.456394498565067e-06, + "loss": 0.0235, + "step": 27417 + }, + { + "epoch": 3.2512747539428437, + "grad_norm": 0.3282884167784285, + "learning_rate": 4.455026731852344e-06, + "loss": 0.0178, + "step": 27418 + }, + { + "epoch": 3.2513933357049685, + "grad_norm": 0.8944123763727835, + "learning_rate": 4.453659154539077e-06, + "loss": 0.0479, + "step": 27419 + }, + { + "epoch": 3.2515119174670937, + "grad_norm": 0.6623683644306712, + "learning_rate": 4.452291766637881e-06, + "loss": 0.0358, + "step": 27420 + }, + { + "epoch": 3.2516304992292184, + "grad_norm": 0.5509851969184071, + "learning_rate": 4.450924568161355e-06, + "loss": 0.0241, + "step": 27421 + }, + { + "epoch": 3.2517490809913436, + "grad_norm": 0.6253450639534796, + "learning_rate": 4.449557559122111e-06, + "loss": 0.027, + "step": 27422 + }, + { + "epoch": 3.2518676627534684, + "grad_norm": 0.9448586315133146, + "learning_rate": 4.448190739532743e-06, + "loss": 0.0455, + "step": 27423 + }, + { + "epoch": 3.2519862445155936, + "grad_norm": 0.6949222107265272, + "learning_rate": 4.44682410940585e-06, + "loss": 0.0341, + "step": 27424 + }, + { + "epoch": 3.2521048262777184, + "grad_norm": 0.42859376115520054, + "learning_rate": 4.445457668754038e-06, + "loss": 0.0186, + "step": 27425 + }, + { + "epoch": 3.2522234080398436, + "grad_norm": 0.41494245480329633, + "learning_rate": 4.444091417589904e-06, + "loss": 0.0289, + "step": 27426 + }, + { + "epoch": 3.2523419898019683, + "grad_norm": 0.41923482180920324, + "learning_rate": 4.442725355926031e-06, + "loss": 0.022, + "step": 27427 + }, + { + "epoch": 3.2524605715640935, + "grad_norm": 0.8585326740464695, + "learning_rate": 4.441359483775032e-06, + "loss": 0.0379, + "step": 27428 + }, + { + "epoch": 3.2525791533262183, + "grad_norm": 0.624664669597386, + "learning_rate": 4.439993801149481e-06, + "loss": 0.0376, + "step": 27429 + }, + { + "epoch": 3.2526977350883435, + "grad_norm": 0.7391954138222044, + "learning_rate": 4.438628308061973e-06, + "loss": 0.039, + "step": 27430 + }, + { + "epoch": 3.2528163168504682, + "grad_norm": 0.9892812417590744, + "learning_rate": 4.437263004525103e-06, + "loss": 0.0517, + "step": 27431 + }, + { + "epoch": 3.2529348986125934, + "grad_norm": 0.5382487305108428, + "learning_rate": 4.435897890551446e-06, + "loss": 0.0234, + "step": 27432 + }, + { + "epoch": 3.253053480374718, + "grad_norm": 0.7663903048397577, + "learning_rate": 4.4345329661536035e-06, + "loss": 0.0555, + "step": 27433 + }, + { + "epoch": 3.2531720621368434, + "grad_norm": 0.5272520530046483, + "learning_rate": 4.4331682313441425e-06, + "loss": 0.0331, + "step": 27434 + }, + { + "epoch": 3.253290643898968, + "grad_norm": 0.27865109680579014, + "learning_rate": 4.431803686135647e-06, + "loss": 0.0117, + "step": 27435 + }, + { + "epoch": 3.2534092256610934, + "grad_norm": 0.3517553719447311, + "learning_rate": 4.4304393305407e-06, + "loss": 0.0197, + "step": 27436 + }, + { + "epoch": 3.253527807423218, + "grad_norm": 0.5579854093445203, + "learning_rate": 4.429075164571886e-06, + "loss": 0.0275, + "step": 27437 + }, + { + "epoch": 3.2536463891853433, + "grad_norm": 0.5180311620271351, + "learning_rate": 4.427711188241765e-06, + "loss": 0.0221, + "step": 27438 + }, + { + "epoch": 3.2537649709474685, + "grad_norm": 0.39920967138653324, + "learning_rate": 4.426347401562922e-06, + "loss": 0.0187, + "step": 27439 + }, + { + "epoch": 3.2538835527095933, + "grad_norm": 0.3908766073094901, + "learning_rate": 4.4249838045479245e-06, + "loss": 0.0265, + "step": 27440 + }, + { + "epoch": 3.254002134471718, + "grad_norm": 0.453343602388618, + "learning_rate": 4.423620397209346e-06, + "loss": 0.021, + "step": 27441 + }, + { + "epoch": 3.2541207162338432, + "grad_norm": 0.6316820210284874, + "learning_rate": 4.422257179559755e-06, + "loss": 0.0309, + "step": 27442 + }, + { + "epoch": 3.2542392979959684, + "grad_norm": 0.48454037168230085, + "learning_rate": 4.420894151611721e-06, + "loss": 0.0266, + "step": 27443 + }, + { + "epoch": 3.254357879758093, + "grad_norm": 0.5048845638089743, + "learning_rate": 4.419531313377809e-06, + "loss": 0.0252, + "step": 27444 + }, + { + "epoch": 3.254476461520218, + "grad_norm": 0.49654339170126044, + "learning_rate": 4.418168664870578e-06, + "loss": 0.0283, + "step": 27445 + }, + { + "epoch": 3.254595043282343, + "grad_norm": 0.7965128061092743, + "learning_rate": 4.4168062061025914e-06, + "loss": 0.0641, + "step": 27446 + }, + { + "epoch": 3.2547136250444684, + "grad_norm": 0.2951391176705984, + "learning_rate": 4.415443937086411e-06, + "loss": 0.0166, + "step": 27447 + }, + { + "epoch": 3.254832206806593, + "grad_norm": 0.8224164649781752, + "learning_rate": 4.4140818578345955e-06, + "loss": 0.0483, + "step": 27448 + }, + { + "epoch": 3.254950788568718, + "grad_norm": 0.4924167428517313, + "learning_rate": 4.4127199683597075e-06, + "loss": 0.0257, + "step": 27449 + }, + { + "epoch": 3.255069370330843, + "grad_norm": 0.4331850706488214, + "learning_rate": 4.4113582686742844e-06, + "loss": 0.0202, + "step": 27450 + }, + { + "epoch": 3.2551879520929683, + "grad_norm": 0.3856871251883508, + "learning_rate": 4.409996758790899e-06, + "loss": 0.019, + "step": 27451 + }, + { + "epoch": 3.255306533855093, + "grad_norm": 0.4880504036834003, + "learning_rate": 4.408635438722092e-06, + "loss": 0.0243, + "step": 27452 + }, + { + "epoch": 3.2554251156172183, + "grad_norm": 0.49520720819912356, + "learning_rate": 4.407274308480419e-06, + "loss": 0.0218, + "step": 27453 + }, + { + "epoch": 3.255543697379343, + "grad_norm": 0.5883904733659263, + "learning_rate": 4.405913368078416e-06, + "loss": 0.0365, + "step": 27454 + }, + { + "epoch": 3.255662279141468, + "grad_norm": 0.49430067601908184, + "learning_rate": 4.404552617528646e-06, + "loss": 0.0227, + "step": 27455 + }, + { + "epoch": 3.255780860903593, + "grad_norm": 0.2685945345705816, + "learning_rate": 4.4031920568436385e-06, + "loss": 0.0089, + "step": 27456 + }, + { + "epoch": 3.255899442665718, + "grad_norm": 0.44645219998441327, + "learning_rate": 4.4018316860359454e-06, + "loss": 0.0229, + "step": 27457 + }, + { + "epoch": 3.256018024427843, + "grad_norm": 0.3976671457414399, + "learning_rate": 4.400471505118103e-06, + "loss": 0.0215, + "step": 27458 + }, + { + "epoch": 3.256136606189968, + "grad_norm": 0.43174570856963485, + "learning_rate": 4.399111514102655e-06, + "loss": 0.0243, + "step": 27459 + }, + { + "epoch": 3.256255187952093, + "grad_norm": 0.404817289182233, + "learning_rate": 4.397751713002141e-06, + "loss": 0.0169, + "step": 27460 + }, + { + "epoch": 3.256373769714218, + "grad_norm": 0.4575109253054868, + "learning_rate": 4.39639210182908e-06, + "loss": 0.0193, + "step": 27461 + }, + { + "epoch": 3.256492351476343, + "grad_norm": 0.6560206630339931, + "learning_rate": 4.3950326805960285e-06, + "loss": 0.0334, + "step": 27462 + }, + { + "epoch": 3.256610933238468, + "grad_norm": 0.4161816220360348, + "learning_rate": 4.393673449315505e-06, + "loss": 0.0241, + "step": 27463 + }, + { + "epoch": 3.256729515000593, + "grad_norm": 0.6185252030213831, + "learning_rate": 4.3923144080000475e-06, + "loss": 0.0314, + "step": 27464 + }, + { + "epoch": 3.256848096762718, + "grad_norm": 0.5880207902676712, + "learning_rate": 4.390955556662168e-06, + "loss": 0.0299, + "step": 27465 + }, + { + "epoch": 3.256966678524843, + "grad_norm": 0.6805509390063731, + "learning_rate": 4.389596895314418e-06, + "loss": 0.0277, + "step": 27466 + }, + { + "epoch": 3.257085260286968, + "grad_norm": 0.48353745638704393, + "learning_rate": 4.388238423969307e-06, + "loss": 0.0258, + "step": 27467 + }, + { + "epoch": 3.2572038420490927, + "grad_norm": 0.6229152534666167, + "learning_rate": 4.3868801426393604e-06, + "loss": 0.027, + "step": 27468 + }, + { + "epoch": 3.257322423811218, + "grad_norm": 0.7407065867554066, + "learning_rate": 4.385522051337099e-06, + "loss": 0.0503, + "step": 27469 + }, + { + "epoch": 3.2574410055733427, + "grad_norm": 0.5813011194619719, + "learning_rate": 4.384164150075049e-06, + "loss": 0.0316, + "step": 27470 + }, + { + "epoch": 3.257559587335468, + "grad_norm": 0.7116677240370645, + "learning_rate": 4.3828064388657274e-06, + "loss": 0.0486, + "step": 27471 + }, + { + "epoch": 3.2576781690975927, + "grad_norm": 0.398604772940103, + "learning_rate": 4.381448917721637e-06, + "loss": 0.0209, + "step": 27472 + }, + { + "epoch": 3.257796750859718, + "grad_norm": 0.4414383057302515, + "learning_rate": 4.380091586655316e-06, + "loss": 0.0245, + "step": 27473 + }, + { + "epoch": 3.2579153326218426, + "grad_norm": 0.6814042412884624, + "learning_rate": 4.378734445679258e-06, + "loss": 0.0377, + "step": 27474 + }, + { + "epoch": 3.258033914383968, + "grad_norm": 0.5233947258855991, + "learning_rate": 4.37737749480599e-06, + "loss": 0.0176, + "step": 27475 + }, + { + "epoch": 3.2581524961460926, + "grad_norm": 0.600634991853613, + "learning_rate": 4.376020734047995e-06, + "loss": 0.0362, + "step": 27476 + }, + { + "epoch": 3.258271077908218, + "grad_norm": 0.5989517079428535, + "learning_rate": 4.374664163417813e-06, + "loss": 0.0286, + "step": 27477 + }, + { + "epoch": 3.2583896596703426, + "grad_norm": 0.3409289829043572, + "learning_rate": 4.3733077829279294e-06, + "loss": 0.0174, + "step": 27478 + }, + { + "epoch": 3.2585082414324678, + "grad_norm": 0.5073487364764008, + "learning_rate": 4.371951592590853e-06, + "loss": 0.0346, + "step": 27479 + }, + { + "epoch": 3.2586268231945925, + "grad_norm": 0.49208102088161193, + "learning_rate": 4.370595592419088e-06, + "loss": 0.0253, + "step": 27480 + }, + { + "epoch": 3.2587454049567177, + "grad_norm": 0.4238932909236821, + "learning_rate": 4.369239782425133e-06, + "loss": 0.0228, + "step": 27481 + }, + { + "epoch": 3.2588639867188425, + "grad_norm": 0.6013816888425871, + "learning_rate": 4.367884162621497e-06, + "loss": 0.0324, + "step": 27482 + }, + { + "epoch": 3.2589825684809677, + "grad_norm": 0.559372945908543, + "learning_rate": 4.366528733020653e-06, + "loss": 0.0304, + "step": 27483 + }, + { + "epoch": 3.2591011502430924, + "grad_norm": 0.32586191497277933, + "learning_rate": 4.365173493635124e-06, + "loss": 0.0148, + "step": 27484 + }, + { + "epoch": 3.2592197320052176, + "grad_norm": 0.6857722350690668, + "learning_rate": 4.363818444477385e-06, + "loss": 0.0355, + "step": 27485 + }, + { + "epoch": 3.2593383137673424, + "grad_norm": 0.34961616325701544, + "learning_rate": 4.362463585559942e-06, + "loss": 0.0175, + "step": 27486 + }, + { + "epoch": 3.2594568955294676, + "grad_norm": 0.5161976983702874, + "learning_rate": 4.361108916895265e-06, + "loss": 0.0329, + "step": 27487 + }, + { + "epoch": 3.2595754772915924, + "grad_norm": 0.3485586314415198, + "learning_rate": 4.359754438495867e-06, + "loss": 0.017, + "step": 27488 + }, + { + "epoch": 3.2596940590537176, + "grad_norm": 0.8292672034950662, + "learning_rate": 4.3584001503742145e-06, + "loss": 0.0461, + "step": 27489 + }, + { + "epoch": 3.2598126408158423, + "grad_norm": 0.46736087449348834, + "learning_rate": 4.357046052542801e-06, + "loss": 0.023, + "step": 27490 + }, + { + "epoch": 3.2599312225779675, + "grad_norm": 0.572638981901984, + "learning_rate": 4.355692145014109e-06, + "loss": 0.0377, + "step": 27491 + }, + { + "epoch": 3.2600498043400927, + "grad_norm": 0.9144498748028626, + "learning_rate": 4.3543384278006195e-06, + "loss": 0.0304, + "step": 27492 + }, + { + "epoch": 3.2601683861022175, + "grad_norm": 0.4149419992445855, + "learning_rate": 4.35298490091482e-06, + "loss": 0.0227, + "step": 27493 + }, + { + "epoch": 3.2602869678643422, + "grad_norm": 0.3696008057527235, + "learning_rate": 4.351631564369171e-06, + "loss": 0.0181, + "step": 27494 + }, + { + "epoch": 3.2604055496264674, + "grad_norm": 0.639151818160008, + "learning_rate": 4.3502784181761625e-06, + "loss": 0.0234, + "step": 27495 + }, + { + "epoch": 3.2605241313885927, + "grad_norm": 0.6053286782244033, + "learning_rate": 4.348925462348264e-06, + "loss": 0.0354, + "step": 27496 + }, + { + "epoch": 3.2606427131507174, + "grad_norm": 0.4896273952442982, + "learning_rate": 4.3475726968979444e-06, + "loss": 0.0347, + "step": 27497 + }, + { + "epoch": 3.260761294912842, + "grad_norm": 0.47705584126866163, + "learning_rate": 4.346220121837682e-06, + "loss": 0.0235, + "step": 27498 + }, + { + "epoch": 3.2608798766749674, + "grad_norm": 0.5137712480765328, + "learning_rate": 4.344867737179944e-06, + "loss": 0.0273, + "step": 27499 + }, + { + "epoch": 3.2609984584370926, + "grad_norm": 0.6361111562050317, + "learning_rate": 4.343515542937201e-06, + "loss": 0.0278, + "step": 27500 + }, + { + "epoch": 3.2611170401992173, + "grad_norm": 0.4917233911965857, + "learning_rate": 4.342163539121907e-06, + "loss": 0.0274, + "step": 27501 + }, + { + "epoch": 3.2612356219613425, + "grad_norm": 0.28640298054872043, + "learning_rate": 4.340811725746535e-06, + "loss": 0.0125, + "step": 27502 + }, + { + "epoch": 3.2613542037234673, + "grad_norm": 0.5927389163474427, + "learning_rate": 4.339460102823542e-06, + "loss": 0.0428, + "step": 27503 + }, + { + "epoch": 3.2614727854855925, + "grad_norm": 0.37485249297835155, + "learning_rate": 4.338108670365401e-06, + "loss": 0.0178, + "step": 27504 + }, + { + "epoch": 3.2615913672477173, + "grad_norm": 0.6575010603286068, + "learning_rate": 4.336757428384553e-06, + "loss": 0.0261, + "step": 27505 + }, + { + "epoch": 3.2617099490098425, + "grad_norm": 0.6771885293329892, + "learning_rate": 4.335406376893461e-06, + "loss": 0.0357, + "step": 27506 + }, + { + "epoch": 3.261828530771967, + "grad_norm": 0.45277598299350585, + "learning_rate": 4.334055515904584e-06, + "loss": 0.022, + "step": 27507 + }, + { + "epoch": 3.2619471125340924, + "grad_norm": 0.6142434818157242, + "learning_rate": 4.332704845430371e-06, + "loss": 0.0402, + "step": 27508 + }, + { + "epoch": 3.262065694296217, + "grad_norm": 0.2889210235062536, + "learning_rate": 4.331354365483284e-06, + "loss": 0.0154, + "step": 27509 + }, + { + "epoch": 3.2621842760583424, + "grad_norm": 0.5994774610292379, + "learning_rate": 4.33000407607575e-06, + "loss": 0.0265, + "step": 27510 + }, + { + "epoch": 3.262302857820467, + "grad_norm": 0.5957955563208144, + "learning_rate": 4.328653977220243e-06, + "loss": 0.0426, + "step": 27511 + }, + { + "epoch": 3.2624214395825923, + "grad_norm": 0.37742456867804114, + "learning_rate": 4.327304068929192e-06, + "loss": 0.0153, + "step": 27512 + }, + { + "epoch": 3.262540021344717, + "grad_norm": 0.5092078409694742, + "learning_rate": 4.325954351215047e-06, + "loss": 0.033, + "step": 27513 + }, + { + "epoch": 3.2626586031068423, + "grad_norm": 0.31098454367864525, + "learning_rate": 4.32460482409025e-06, + "loss": 0.0146, + "step": 27514 + }, + { + "epoch": 3.262777184868967, + "grad_norm": 0.6873067105879418, + "learning_rate": 4.323255487567252e-06, + "loss": 0.0333, + "step": 27515 + }, + { + "epoch": 3.2628957666310923, + "grad_norm": 0.4764696150234353, + "learning_rate": 4.321906341658472e-06, + "loss": 0.0258, + "step": 27516 + }, + { + "epoch": 3.263014348393217, + "grad_norm": 0.4166926236052506, + "learning_rate": 4.320557386376361e-06, + "loss": 0.0212, + "step": 27517 + }, + { + "epoch": 3.2631329301553422, + "grad_norm": 0.6177278780077081, + "learning_rate": 4.319208621733353e-06, + "loss": 0.0266, + "step": 27518 + }, + { + "epoch": 3.263251511917467, + "grad_norm": 0.7818841848663666, + "learning_rate": 4.317860047741878e-06, + "loss": 0.0343, + "step": 27519 + }, + { + "epoch": 3.263370093679592, + "grad_norm": 0.40615298766792934, + "learning_rate": 4.31651166441438e-06, + "loss": 0.0177, + "step": 27520 + }, + { + "epoch": 3.263488675441717, + "grad_norm": 0.4371795398615913, + "learning_rate": 4.315163471763267e-06, + "loss": 0.0269, + "step": 27521 + }, + { + "epoch": 3.263607257203842, + "grad_norm": 0.702712954922298, + "learning_rate": 4.313815469800994e-06, + "loss": 0.0377, + "step": 27522 + }, + { + "epoch": 3.263725838965967, + "grad_norm": 0.6670333974296963, + "learning_rate": 4.31246765853997e-06, + "loss": 0.0378, + "step": 27523 + }, + { + "epoch": 3.263844420728092, + "grad_norm": 0.7610393599110505, + "learning_rate": 4.31112003799263e-06, + "loss": 0.0401, + "step": 27524 + }, + { + "epoch": 3.263963002490217, + "grad_norm": 0.7162927234035701, + "learning_rate": 4.309772608171383e-06, + "loss": 0.0371, + "step": 27525 + }, + { + "epoch": 3.264081584252342, + "grad_norm": 0.452119410483237, + "learning_rate": 4.308425369088673e-06, + "loss": 0.0236, + "step": 27526 + }, + { + "epoch": 3.264200166014467, + "grad_norm": 0.5573616953009053, + "learning_rate": 4.307078320756899e-06, + "loss": 0.0286, + "step": 27527 + }, + { + "epoch": 3.264318747776592, + "grad_norm": 0.5137920545947589, + "learning_rate": 4.3057314631884864e-06, + "loss": 0.0297, + "step": 27528 + }, + { + "epoch": 3.264437329538717, + "grad_norm": 0.5941374684870109, + "learning_rate": 4.304384796395855e-06, + "loss": 0.0301, + "step": 27529 + }, + { + "epoch": 3.264555911300842, + "grad_norm": 0.4305857069581418, + "learning_rate": 4.303038320391417e-06, + "loss": 0.0216, + "step": 27530 + }, + { + "epoch": 3.2646744930629668, + "grad_norm": 0.3240217776047274, + "learning_rate": 4.3016920351875905e-06, + "loss": 0.0163, + "step": 27531 + }, + { + "epoch": 3.264793074825092, + "grad_norm": 0.3815836540893597, + "learning_rate": 4.300345940796771e-06, + "loss": 0.0173, + "step": 27532 + }, + { + "epoch": 3.2649116565872167, + "grad_norm": 0.6611722555063648, + "learning_rate": 4.299000037231391e-06, + "loss": 0.0491, + "step": 27533 + }, + { + "epoch": 3.265030238349342, + "grad_norm": 0.5150464550686736, + "learning_rate": 4.297654324503836e-06, + "loss": 0.0232, + "step": 27534 + }, + { + "epoch": 3.2651488201114667, + "grad_norm": 0.6167451474387654, + "learning_rate": 4.296308802626531e-06, + "loss": 0.0385, + "step": 27535 + }, + { + "epoch": 3.265267401873592, + "grad_norm": 0.4045712305398629, + "learning_rate": 4.294963471611854e-06, + "loss": 0.0196, + "step": 27536 + }, + { + "epoch": 3.2653859836357166, + "grad_norm": 0.7410824147261891, + "learning_rate": 4.293618331472238e-06, + "loss": 0.0265, + "step": 27537 + }, + { + "epoch": 3.265504565397842, + "grad_norm": 0.5506868508489816, + "learning_rate": 4.292273382220063e-06, + "loss": 0.0305, + "step": 27538 + }, + { + "epoch": 3.2656231471599666, + "grad_norm": 0.6239591609214893, + "learning_rate": 4.2909286238677345e-06, + "loss": 0.0349, + "step": 27539 + }, + { + "epoch": 3.265741728922092, + "grad_norm": 0.606824917008572, + "learning_rate": 4.289584056427648e-06, + "loss": 0.0339, + "step": 27540 + }, + { + "epoch": 3.265860310684217, + "grad_norm": 0.5970452030744962, + "learning_rate": 4.288239679912201e-06, + "loss": 0.0301, + "step": 27541 + }, + { + "epoch": 3.2659788924463418, + "grad_norm": 0.6209764553749242, + "learning_rate": 4.286895494333792e-06, + "loss": 0.0263, + "step": 27542 + }, + { + "epoch": 3.2660974742084665, + "grad_norm": 0.312877274120093, + "learning_rate": 4.285551499704793e-06, + "loss": 0.0178, + "step": 27543 + }, + { + "epoch": 3.2662160559705917, + "grad_norm": 0.7628541817431468, + "learning_rate": 4.2842076960376205e-06, + "loss": 0.0504, + "step": 27544 + }, + { + "epoch": 3.266334637732717, + "grad_norm": 0.31963281968597956, + "learning_rate": 4.282864083344643e-06, + "loss": 0.0147, + "step": 27545 + }, + { + "epoch": 3.2664532194948417, + "grad_norm": 0.4168326656487008, + "learning_rate": 4.281520661638253e-06, + "loss": 0.0247, + "step": 27546 + }, + { + "epoch": 3.2665718012569664, + "grad_norm": 0.6579037677426373, + "learning_rate": 4.280177430930835e-06, + "loss": 0.0319, + "step": 27547 + }, + { + "epoch": 3.2666903830190916, + "grad_norm": 0.9389573444389527, + "learning_rate": 4.278834391234776e-06, + "loss": 0.0401, + "step": 27548 + }, + { + "epoch": 3.266808964781217, + "grad_norm": 0.4593748995723798, + "learning_rate": 4.277491542562456e-06, + "loss": 0.0177, + "step": 27549 + }, + { + "epoch": 3.2669275465433416, + "grad_norm": 0.3132922090935393, + "learning_rate": 4.276148884926249e-06, + "loss": 0.0139, + "step": 27550 + }, + { + "epoch": 3.267046128305467, + "grad_norm": 0.7163484651250634, + "learning_rate": 4.2748064183385365e-06, + "loss": 0.0361, + "step": 27551 + }, + { + "epoch": 3.2671647100675916, + "grad_norm": 0.7228543479727375, + "learning_rate": 4.273464142811693e-06, + "loss": 0.0366, + "step": 27552 + }, + { + "epoch": 3.2672832918297168, + "grad_norm": 0.5897531251084762, + "learning_rate": 4.2721220583581e-06, + "loss": 0.025, + "step": 27553 + }, + { + "epoch": 3.2674018735918415, + "grad_norm": 0.29706812898836005, + "learning_rate": 4.270780164990115e-06, + "loss": 0.0143, + "step": 27554 + }, + { + "epoch": 3.2675204553539667, + "grad_norm": 0.7812462879803099, + "learning_rate": 4.269438462720119e-06, + "loss": 0.0306, + "step": 27555 + }, + { + "epoch": 3.2676390371160915, + "grad_norm": 0.576972745123371, + "learning_rate": 4.2680969515604795e-06, + "loss": 0.0407, + "step": 27556 + }, + { + "epoch": 3.2677576188782167, + "grad_norm": 0.6879410073480865, + "learning_rate": 4.266755631523561e-06, + "loss": 0.0305, + "step": 27557 + }, + { + "epoch": 3.2678762006403415, + "grad_norm": 0.3629331393123434, + "learning_rate": 4.265414502621734e-06, + "loss": 0.0102, + "step": 27558 + }, + { + "epoch": 3.2679947824024667, + "grad_norm": 0.4399623409838315, + "learning_rate": 4.264073564867355e-06, + "loss": 0.0188, + "step": 27559 + }, + { + "epoch": 3.2681133641645914, + "grad_norm": 0.34694610684396254, + "learning_rate": 4.2627328182728e-06, + "loss": 0.0165, + "step": 27560 + }, + { + "epoch": 3.2682319459267166, + "grad_norm": 0.3658092218422071, + "learning_rate": 4.261392262850408e-06, + "loss": 0.0161, + "step": 27561 + }, + { + "epoch": 3.2683505276888414, + "grad_norm": 0.37342700688862956, + "learning_rate": 4.260051898612552e-06, + "loss": 0.025, + "step": 27562 + }, + { + "epoch": 3.2684691094509666, + "grad_norm": 0.31995926267479696, + "learning_rate": 4.258711725571582e-06, + "loss": 0.0152, + "step": 27563 + }, + { + "epoch": 3.2685876912130913, + "grad_norm": 0.7619027266502364, + "learning_rate": 4.257371743739863e-06, + "loss": 0.0357, + "step": 27564 + }, + { + "epoch": 3.2687062729752165, + "grad_norm": 0.471043609957352, + "learning_rate": 4.256031953129733e-06, + "loss": 0.0176, + "step": 27565 + }, + { + "epoch": 3.2688248547373413, + "grad_norm": 0.46301898044629225, + "learning_rate": 4.254692353753551e-06, + "loss": 0.0277, + "step": 27566 + }, + { + "epoch": 3.2689434364994665, + "grad_norm": 0.5217453202892884, + "learning_rate": 4.253352945623665e-06, + "loss": 0.0295, + "step": 27567 + }, + { + "epoch": 3.2690620182615913, + "grad_norm": 0.4210009910570425, + "learning_rate": 4.252013728752424e-06, + "loss": 0.0184, + "step": 27568 + }, + { + "epoch": 3.2691806000237165, + "grad_norm": 0.6149979361791961, + "learning_rate": 4.250674703152175e-06, + "loss": 0.035, + "step": 27569 + }, + { + "epoch": 3.2692991817858412, + "grad_norm": 0.7278667119439104, + "learning_rate": 4.249335868835258e-06, + "loss": 0.0396, + "step": 27570 + }, + { + "epoch": 3.2694177635479664, + "grad_norm": 0.47475326103259835, + "learning_rate": 4.247997225814027e-06, + "loss": 0.0258, + "step": 27571 + }, + { + "epoch": 3.269536345310091, + "grad_norm": 0.557910598745053, + "learning_rate": 4.246658774100804e-06, + "loss": 0.026, + "step": 27572 + }, + { + "epoch": 3.2696549270722164, + "grad_norm": 1.0363389612712, + "learning_rate": 4.24532051370794e-06, + "loss": 0.0615, + "step": 27573 + }, + { + "epoch": 3.269773508834341, + "grad_norm": 0.9215530412542449, + "learning_rate": 4.243982444647771e-06, + "loss": 0.0262, + "step": 27574 + }, + { + "epoch": 3.2698920905964663, + "grad_norm": 0.9850574648068916, + "learning_rate": 4.242644566932636e-06, + "loss": 0.0302, + "step": 27575 + }, + { + "epoch": 3.270010672358591, + "grad_norm": 0.8337972565305137, + "learning_rate": 4.241306880574858e-06, + "loss": 0.0396, + "step": 27576 + }, + { + "epoch": 3.2701292541207163, + "grad_norm": 0.8187755464292139, + "learning_rate": 4.239969385586773e-06, + "loss": 0.0462, + "step": 27577 + }, + { + "epoch": 3.270247835882841, + "grad_norm": 0.5217700517336539, + "learning_rate": 4.238632081980714e-06, + "loss": 0.024, + "step": 27578 + }, + { + "epoch": 3.2703664176449663, + "grad_norm": 0.6239463199406439, + "learning_rate": 4.237294969769009e-06, + "loss": 0.0319, + "step": 27579 + }, + { + "epoch": 3.270484999407091, + "grad_norm": 0.8317569086336254, + "learning_rate": 4.23595804896399e-06, + "loss": 0.0437, + "step": 27580 + }, + { + "epoch": 3.2706035811692162, + "grad_norm": 0.5151762814881445, + "learning_rate": 4.234621319577961e-06, + "loss": 0.0253, + "step": 27581 + }, + { + "epoch": 3.270722162931341, + "grad_norm": 0.6623125651074556, + "learning_rate": 4.23328478162327e-06, + "loss": 0.025, + "step": 27582 + }, + { + "epoch": 3.270840744693466, + "grad_norm": 0.36951341046968245, + "learning_rate": 4.231948435112223e-06, + "loss": 0.0242, + "step": 27583 + }, + { + "epoch": 3.270959326455591, + "grad_norm": 0.6202896253154944, + "learning_rate": 4.2306122800571445e-06, + "loss": 0.032, + "step": 27584 + }, + { + "epoch": 3.271077908217716, + "grad_norm": 0.5012170902809104, + "learning_rate": 4.229276316470351e-06, + "loss": 0.0287, + "step": 27585 + }, + { + "epoch": 3.271196489979841, + "grad_norm": 0.7457675802822706, + "learning_rate": 4.227940544364167e-06, + "loss": 0.0278, + "step": 27586 + }, + { + "epoch": 3.271315071741966, + "grad_norm": 0.5459139428757235, + "learning_rate": 4.226604963750891e-06, + "loss": 0.0224, + "step": 27587 + }, + { + "epoch": 3.271433653504091, + "grad_norm": 0.6720082601202649, + "learning_rate": 4.225269574642843e-06, + "loss": 0.0311, + "step": 27588 + }, + { + "epoch": 3.271552235266216, + "grad_norm": 0.5711726096600208, + "learning_rate": 4.223934377052336e-06, + "loss": 0.02, + "step": 27589 + }, + { + "epoch": 3.2716708170283413, + "grad_norm": 0.6547966606921288, + "learning_rate": 4.222599370991676e-06, + "loss": 0.0262, + "step": 27590 + }, + { + "epoch": 3.271789398790466, + "grad_norm": 0.6222067998719052, + "learning_rate": 4.221264556473176e-06, + "loss": 0.033, + "step": 27591 + }, + { + "epoch": 3.271907980552591, + "grad_norm": 0.6685532085809174, + "learning_rate": 4.219929933509123e-06, + "loss": 0.0322, + "step": 27592 + }, + { + "epoch": 3.272026562314716, + "grad_norm": 0.4387945476706775, + "learning_rate": 4.218595502111847e-06, + "loss": 0.0246, + "step": 27593 + }, + { + "epoch": 3.272145144076841, + "grad_norm": 0.4534282942433627, + "learning_rate": 4.217261262293631e-06, + "loss": 0.024, + "step": 27594 + }, + { + "epoch": 3.272263725838966, + "grad_norm": 0.42240394801791253, + "learning_rate": 4.215927214066786e-06, + "loss": 0.0216, + "step": 27595 + }, + { + "epoch": 3.2723823076010907, + "grad_norm": 0.4102414006956543, + "learning_rate": 4.214593357443594e-06, + "loss": 0.0239, + "step": 27596 + }, + { + "epoch": 3.272500889363216, + "grad_norm": 0.3186082529883775, + "learning_rate": 4.213259692436367e-06, + "loss": 0.0157, + "step": 27597 + }, + { + "epoch": 3.272619471125341, + "grad_norm": 0.5640467636711263, + "learning_rate": 4.211926219057402e-06, + "loss": 0.03, + "step": 27598 + }, + { + "epoch": 3.272738052887466, + "grad_norm": 0.42813264365961584, + "learning_rate": 4.210592937318975e-06, + "loss": 0.0235, + "step": 27599 + }, + { + "epoch": 3.2728566346495906, + "grad_norm": 0.7178717389345427, + "learning_rate": 4.209259847233396e-06, + "loss": 0.0355, + "step": 27600 + }, + { + "epoch": 3.272975216411716, + "grad_norm": 0.6996770513165891, + "learning_rate": 4.207926948812943e-06, + "loss": 0.0287, + "step": 27601 + }, + { + "epoch": 3.273093798173841, + "grad_norm": 0.4316505420066711, + "learning_rate": 4.206594242069911e-06, + "loss": 0.017, + "step": 27602 + }, + { + "epoch": 3.273212379935966, + "grad_norm": 0.7311381281792692, + "learning_rate": 4.205261727016571e-06, + "loss": 0.0324, + "step": 27603 + }, + { + "epoch": 3.273330961698091, + "grad_norm": 0.5146609717396601, + "learning_rate": 4.203929403665227e-06, + "loss": 0.0247, + "step": 27604 + }, + { + "epoch": 3.2734495434602158, + "grad_norm": 0.4398504643338687, + "learning_rate": 4.202597272028147e-06, + "loss": 0.0205, + "step": 27605 + }, + { + "epoch": 3.273568125222341, + "grad_norm": 0.6550494655096323, + "learning_rate": 4.201265332117618e-06, + "loss": 0.0283, + "step": 27606 + }, + { + "epoch": 3.2736867069844657, + "grad_norm": 0.41263978727171396, + "learning_rate": 4.199933583945917e-06, + "loss": 0.0199, + "step": 27607 + }, + { + "epoch": 3.273805288746591, + "grad_norm": 0.31091481613516364, + "learning_rate": 4.198602027525323e-06, + "loss": 0.0144, + "step": 27608 + }, + { + "epoch": 3.2739238705087157, + "grad_norm": 0.350973207217307, + "learning_rate": 4.197270662868113e-06, + "loss": 0.0202, + "step": 27609 + }, + { + "epoch": 3.274042452270841, + "grad_norm": 0.6256917674559739, + "learning_rate": 4.195939489986553e-06, + "loss": 0.037, + "step": 27610 + }, + { + "epoch": 3.2741610340329657, + "grad_norm": 0.6081137236548672, + "learning_rate": 4.19460850889292e-06, + "loss": 0.0243, + "step": 27611 + }, + { + "epoch": 3.274279615795091, + "grad_norm": 0.5623160960057183, + "learning_rate": 4.193277719599484e-06, + "loss": 0.0276, + "step": 27612 + }, + { + "epoch": 3.2743981975572156, + "grad_norm": 0.5532387294916726, + "learning_rate": 4.191947122118517e-06, + "loss": 0.0377, + "step": 27613 + }, + { + "epoch": 3.274516779319341, + "grad_norm": 0.33095411037069605, + "learning_rate": 4.190616716462268e-06, + "loss": 0.0154, + "step": 27614 + }, + { + "epoch": 3.2746353610814656, + "grad_norm": 0.4994354259666392, + "learning_rate": 4.189286502643028e-06, + "loss": 0.0266, + "step": 27615 + }, + { + "epoch": 3.274753942843591, + "grad_norm": 0.8914823230408414, + "learning_rate": 4.187956480673039e-06, + "loss": 0.0513, + "step": 27616 + }, + { + "epoch": 3.2748725246057155, + "grad_norm": 0.4103752090470813, + "learning_rate": 4.1866266505645735e-06, + "loss": 0.0164, + "step": 27617 + }, + { + "epoch": 3.2749911063678407, + "grad_norm": 0.7542518861801499, + "learning_rate": 4.185297012329883e-06, + "loss": 0.0308, + "step": 27618 + }, + { + "epoch": 3.2751096881299655, + "grad_norm": 0.5824155363166574, + "learning_rate": 4.183967565981231e-06, + "loss": 0.0333, + "step": 27619 + }, + { + "epoch": 3.2752282698920907, + "grad_norm": 0.669132380706832, + "learning_rate": 4.182638311530879e-06, + "loss": 0.0369, + "step": 27620 + }, + { + "epoch": 3.2753468516542155, + "grad_norm": 0.7758817493087665, + "learning_rate": 4.1813092489910664e-06, + "loss": 0.0336, + "step": 27621 + }, + { + "epoch": 3.2754654334163407, + "grad_norm": 0.7741616706257758, + "learning_rate": 4.179980378374054e-06, + "loss": 0.0455, + "step": 27622 + }, + { + "epoch": 3.2755840151784654, + "grad_norm": 0.9856173853623642, + "learning_rate": 4.178651699692091e-06, + "loss": 0.0584, + "step": 27623 + }, + { + "epoch": 3.2757025969405906, + "grad_norm": 0.39190650870230215, + "learning_rate": 4.1773232129574316e-06, + "loss": 0.02, + "step": 27624 + }, + { + "epoch": 3.2758211787027154, + "grad_norm": 0.7197819911606649, + "learning_rate": 4.175994918182314e-06, + "loss": 0.0346, + "step": 27625 + }, + { + "epoch": 3.2759397604648406, + "grad_norm": 0.345889754627339, + "learning_rate": 4.174666815378986e-06, + "loss": 0.0172, + "step": 27626 + }, + { + "epoch": 3.2760583422269653, + "grad_norm": 0.6524784025737468, + "learning_rate": 4.17333890455969e-06, + "loss": 0.0321, + "step": 27627 + }, + { + "epoch": 3.2761769239890906, + "grad_norm": 0.34003757512996713, + "learning_rate": 4.172011185736674e-06, + "loss": 0.0167, + "step": 27628 + }, + { + "epoch": 3.2762955057512153, + "grad_norm": 0.5774918414914245, + "learning_rate": 4.17068365892217e-06, + "loss": 0.0367, + "step": 27629 + }, + { + "epoch": 3.2764140875133405, + "grad_norm": 0.6646852089335636, + "learning_rate": 4.1693563241284235e-06, + "loss": 0.0397, + "step": 27630 + }, + { + "epoch": 3.2765326692754653, + "grad_norm": 0.5211842791073708, + "learning_rate": 4.168029181367672e-06, + "loss": 0.0273, + "step": 27631 + }, + { + "epoch": 3.2766512510375905, + "grad_norm": 0.640270896184452, + "learning_rate": 4.166702230652139e-06, + "loss": 0.0348, + "step": 27632 + }, + { + "epoch": 3.2767698327997152, + "grad_norm": 0.44051121513001035, + "learning_rate": 4.165375471994066e-06, + "loss": 0.023, + "step": 27633 + }, + { + "epoch": 3.2768884145618404, + "grad_norm": 0.4818542001877877, + "learning_rate": 4.164048905405679e-06, + "loss": 0.0226, + "step": 27634 + }, + { + "epoch": 3.277006996323965, + "grad_norm": 0.5018849161755313, + "learning_rate": 4.162722530899219e-06, + "loss": 0.0248, + "step": 27635 + }, + { + "epoch": 3.2771255780860904, + "grad_norm": 0.8262444333590893, + "learning_rate": 4.161396348486895e-06, + "loss": 0.0397, + "step": 27636 + }, + { + "epoch": 3.277244159848215, + "grad_norm": 0.6433167318189187, + "learning_rate": 4.160070358180945e-06, + "loss": 0.0414, + "step": 27637 + }, + { + "epoch": 3.2773627416103404, + "grad_norm": 0.7658351411955707, + "learning_rate": 4.158744559993591e-06, + "loss": 0.025, + "step": 27638 + }, + { + "epoch": 3.2774813233724656, + "grad_norm": 0.6407528001087418, + "learning_rate": 4.157418953937056e-06, + "loss": 0.0336, + "step": 27639 + }, + { + "epoch": 3.2775999051345903, + "grad_norm": 0.34166358482379533, + "learning_rate": 4.156093540023559e-06, + "loss": 0.0175, + "step": 27640 + }, + { + "epoch": 3.277718486896715, + "grad_norm": 0.5133153843082404, + "learning_rate": 4.154768318265317e-06, + "loss": 0.0309, + "step": 27641 + }, + { + "epoch": 3.2778370686588403, + "grad_norm": 0.5465299861174026, + "learning_rate": 4.153443288674558e-06, + "loss": 0.0198, + "step": 27642 + }, + { + "epoch": 3.2779556504209655, + "grad_norm": 0.5994331055627089, + "learning_rate": 4.152118451263479e-06, + "loss": 0.028, + "step": 27643 + }, + { + "epoch": 3.2780742321830902, + "grad_norm": 0.6191744671876762, + "learning_rate": 4.1507938060443065e-06, + "loss": 0.0333, + "step": 27644 + }, + { + "epoch": 3.278192813945215, + "grad_norm": 0.5706617004853547, + "learning_rate": 4.149469353029245e-06, + "loss": 0.0223, + "step": 27645 + }, + { + "epoch": 3.27831139570734, + "grad_norm": 0.4779353368012013, + "learning_rate": 4.148145092230512e-06, + "loss": 0.0245, + "step": 27646 + }, + { + "epoch": 3.2784299774694654, + "grad_norm": 0.5175324109791112, + "learning_rate": 4.1468210236603075e-06, + "loss": 0.0311, + "step": 27647 + }, + { + "epoch": 3.27854855923159, + "grad_norm": 0.2831581917262618, + "learning_rate": 4.145497147330843e-06, + "loss": 0.0117, + "step": 27648 + }, + { + "epoch": 3.278667140993715, + "grad_norm": 0.3047512035179695, + "learning_rate": 4.144173463254319e-06, + "loss": 0.0137, + "step": 27649 + }, + { + "epoch": 3.27878572275584, + "grad_norm": 0.38621876385161175, + "learning_rate": 4.142849971442941e-06, + "loss": 0.0218, + "step": 27650 + }, + { + "epoch": 3.2789043045179653, + "grad_norm": 0.31895243193914274, + "learning_rate": 4.141526671908915e-06, + "loss": 0.0161, + "step": 27651 + }, + { + "epoch": 3.27902288628009, + "grad_norm": 0.5092611959870055, + "learning_rate": 4.1402035646644215e-06, + "loss": 0.0292, + "step": 27652 + }, + { + "epoch": 3.2791414680422153, + "grad_norm": 0.5718087583146946, + "learning_rate": 4.1388806497216855e-06, + "loss": 0.0281, + "step": 27653 + }, + { + "epoch": 3.27926004980434, + "grad_norm": 0.33576564466761255, + "learning_rate": 4.1375579270928835e-06, + "loss": 0.0214, + "step": 27654 + }, + { + "epoch": 3.2793786315664653, + "grad_norm": 1.0016903289680932, + "learning_rate": 4.136235396790211e-06, + "loss": 0.0645, + "step": 27655 + }, + { + "epoch": 3.27949721332859, + "grad_norm": 0.6369415869141795, + "learning_rate": 4.134913058825865e-06, + "loss": 0.0301, + "step": 27656 + }, + { + "epoch": 3.279615795090715, + "grad_norm": 0.567381667725607, + "learning_rate": 4.133590913212032e-06, + "loss": 0.0255, + "step": 27657 + }, + { + "epoch": 3.27973437685284, + "grad_norm": 0.49417850150946663, + "learning_rate": 4.132268959960911e-06, + "loss": 0.0279, + "step": 27658 + }, + { + "epoch": 3.279852958614965, + "grad_norm": 0.5649575334916819, + "learning_rate": 4.13094719908467e-06, + "loss": 0.0215, + "step": 27659 + }, + { + "epoch": 3.27997154037709, + "grad_norm": 0.6099357536340307, + "learning_rate": 4.129625630595513e-06, + "loss": 0.0315, + "step": 27660 + }, + { + "epoch": 3.280090122139215, + "grad_norm": 0.3478587190235009, + "learning_rate": 4.12830425450561e-06, + "loss": 0.0185, + "step": 27661 + }, + { + "epoch": 3.28020870390134, + "grad_norm": 0.35622541647046807, + "learning_rate": 4.126983070827153e-06, + "loss": 0.0201, + "step": 27662 + }, + { + "epoch": 3.280327285663465, + "grad_norm": 0.5774038817663291, + "learning_rate": 4.125662079572304e-06, + "loss": 0.0268, + "step": 27663 + }, + { + "epoch": 3.28044586742559, + "grad_norm": 0.37295195069442144, + "learning_rate": 4.124341280753266e-06, + "loss": 0.0161, + "step": 27664 + }, + { + "epoch": 3.280564449187715, + "grad_norm": 0.4981730602286615, + "learning_rate": 4.123020674382194e-06, + "loss": 0.0192, + "step": 27665 + }, + { + "epoch": 3.28068303094984, + "grad_norm": 0.42672788283114627, + "learning_rate": 4.1217002604712726e-06, + "loss": 0.0227, + "step": 27666 + }, + { + "epoch": 3.280801612711965, + "grad_norm": 0.6283968614900276, + "learning_rate": 4.1203800390326706e-06, + "loss": 0.0305, + "step": 27667 + }, + { + "epoch": 3.28092019447409, + "grad_norm": 0.4574462688110259, + "learning_rate": 4.119060010078563e-06, + "loss": 0.018, + "step": 27668 + }, + { + "epoch": 3.281038776236215, + "grad_norm": 0.450768164512053, + "learning_rate": 4.117740173621118e-06, + "loss": 0.0229, + "step": 27669 + }, + { + "epoch": 3.2811573579983397, + "grad_norm": 0.5434316316582783, + "learning_rate": 4.11642052967249e-06, + "loss": 0.0288, + "step": 27670 + }, + { + "epoch": 3.281275939760465, + "grad_norm": 0.4318013514490227, + "learning_rate": 4.115101078244871e-06, + "loss": 0.0167, + "step": 27671 + }, + { + "epoch": 3.2813945215225897, + "grad_norm": 0.5655376558489232, + "learning_rate": 4.113781819350399e-06, + "loss": 0.0262, + "step": 27672 + }, + { + "epoch": 3.281513103284715, + "grad_norm": 0.42811558697762947, + "learning_rate": 4.112462753001256e-06, + "loss": 0.0188, + "step": 27673 + }, + { + "epoch": 3.2816316850468397, + "grad_norm": 0.730944409151821, + "learning_rate": 4.111143879209578e-06, + "loss": 0.034, + "step": 27674 + }, + { + "epoch": 3.281750266808965, + "grad_norm": 0.7838866744250385, + "learning_rate": 4.109825197987549e-06, + "loss": 0.0263, + "step": 27675 + }, + { + "epoch": 3.2818688485710896, + "grad_norm": 0.7032740331428541, + "learning_rate": 4.108506709347309e-06, + "loss": 0.0354, + "step": 27676 + }, + { + "epoch": 3.281987430333215, + "grad_norm": 0.5017665679043052, + "learning_rate": 4.1071884133010216e-06, + "loss": 0.0271, + "step": 27677 + }, + { + "epoch": 3.2821060120953396, + "grad_norm": 0.585310243650561, + "learning_rate": 4.105870309860832e-06, + "loss": 0.0326, + "step": 27678 + }, + { + "epoch": 3.282224593857465, + "grad_norm": 0.7409760221376896, + "learning_rate": 4.1045523990388994e-06, + "loss": 0.0383, + "step": 27679 + }, + { + "epoch": 3.2823431756195895, + "grad_norm": 0.33580102363846065, + "learning_rate": 4.1032346808473755e-06, + "loss": 0.0113, + "step": 27680 + }, + { + "epoch": 3.2824617573817148, + "grad_norm": 0.7314912141816878, + "learning_rate": 4.101917155298396e-06, + "loss": 0.0449, + "step": 27681 + }, + { + "epoch": 3.2825803391438395, + "grad_norm": 0.5633475260571601, + "learning_rate": 4.100599822404114e-06, + "loss": 0.0303, + "step": 27682 + }, + { + "epoch": 3.2826989209059647, + "grad_norm": 0.5191714706796204, + "learning_rate": 4.099282682176669e-06, + "loss": 0.0195, + "step": 27683 + }, + { + "epoch": 3.2828175026680895, + "grad_norm": 0.5088797374184142, + "learning_rate": 4.097965734628217e-06, + "loss": 0.0304, + "step": 27684 + }, + { + "epoch": 3.2829360844302147, + "grad_norm": 0.6693199761313292, + "learning_rate": 4.0966489797708785e-06, + "loss": 0.0247, + "step": 27685 + }, + { + "epoch": 3.2830546661923394, + "grad_norm": 0.646725299698644, + "learning_rate": 4.0953324176168125e-06, + "loss": 0.0277, + "step": 27686 + }, + { + "epoch": 3.2831732479544646, + "grad_norm": 0.578931862593603, + "learning_rate": 4.094016048178143e-06, + "loss": 0.0356, + "step": 27687 + }, + { + "epoch": 3.28329182971659, + "grad_norm": 0.47679338082421313, + "learning_rate": 4.092699871467006e-06, + "loss": 0.029, + "step": 27688 + }, + { + "epoch": 3.2834104114787146, + "grad_norm": 0.6708136800449935, + "learning_rate": 4.09138388749554e-06, + "loss": 0.0237, + "step": 27689 + }, + { + "epoch": 3.2835289932408394, + "grad_norm": 0.4300764409304966, + "learning_rate": 4.090068096275876e-06, + "loss": 0.0186, + "step": 27690 + }, + { + "epoch": 3.2836475750029646, + "grad_norm": 0.6477583030576066, + "learning_rate": 4.088752497820144e-06, + "loss": 0.0334, + "step": 27691 + }, + { + "epoch": 3.2837661567650898, + "grad_norm": 0.5242404972754818, + "learning_rate": 4.08743709214047e-06, + "loss": 0.0224, + "step": 27692 + }, + { + "epoch": 3.2838847385272145, + "grad_norm": 0.47599322654465215, + "learning_rate": 4.086121879248978e-06, + "loss": 0.0265, + "step": 27693 + }, + { + "epoch": 3.2840033202893393, + "grad_norm": 0.35063322836847793, + "learning_rate": 4.084806859157797e-06, + "loss": 0.0172, + "step": 27694 + }, + { + "epoch": 3.2841219020514645, + "grad_norm": 0.3989833100381488, + "learning_rate": 4.0834920318790546e-06, + "loss": 0.0238, + "step": 27695 + }, + { + "epoch": 3.2842404838135897, + "grad_norm": 0.5628632165092365, + "learning_rate": 4.08217739742486e-06, + "loss": 0.0294, + "step": 27696 + }, + { + "epoch": 3.2843590655757144, + "grad_norm": 0.5335644767132095, + "learning_rate": 4.080862955807341e-06, + "loss": 0.023, + "step": 27697 + }, + { + "epoch": 3.284477647337839, + "grad_norm": 0.4154810777261224, + "learning_rate": 4.0795487070386106e-06, + "loss": 0.0196, + "step": 27698 + }, + { + "epoch": 3.2845962290999644, + "grad_norm": 0.547498946325707, + "learning_rate": 4.0782346511307884e-06, + "loss": 0.0195, + "step": 27699 + }, + { + "epoch": 3.2847148108620896, + "grad_norm": 0.6672513271205129, + "learning_rate": 4.0769207880959836e-06, + "loss": 0.033, + "step": 27700 + }, + { + "epoch": 3.2848333926242144, + "grad_norm": 0.2737417005163801, + "learning_rate": 4.075607117946314e-06, + "loss": 0.011, + "step": 27701 + }, + { + "epoch": 3.2849519743863396, + "grad_norm": 0.6911142906498657, + "learning_rate": 4.0742936406938935e-06, + "loss": 0.03, + "step": 27702 + }, + { + "epoch": 3.2850705561484643, + "grad_norm": 0.4486216751229737, + "learning_rate": 4.072980356350819e-06, + "loss": 0.0227, + "step": 27703 + }, + { + "epoch": 3.2851891379105895, + "grad_norm": 0.40685175396021567, + "learning_rate": 4.071667264929202e-06, + "loss": 0.0166, + "step": 27704 + }, + { + "epoch": 3.2853077196727143, + "grad_norm": 0.4205754036225102, + "learning_rate": 4.070354366441151e-06, + "loss": 0.0167, + "step": 27705 + }, + { + "epoch": 3.2854263014348395, + "grad_norm": 0.37409449204555006, + "learning_rate": 4.069041660898765e-06, + "loss": 0.0211, + "step": 27706 + }, + { + "epoch": 3.2855448831969642, + "grad_norm": 0.717188520785088, + "learning_rate": 4.067729148314153e-06, + "loss": 0.0327, + "step": 27707 + }, + { + "epoch": 3.2856634649590895, + "grad_norm": 0.48834564287900045, + "learning_rate": 4.066416828699399e-06, + "loss": 0.0216, + "step": 27708 + }, + { + "epoch": 3.285782046721214, + "grad_norm": 0.40461115718298907, + "learning_rate": 4.065104702066625e-06, + "loss": 0.0266, + "step": 27709 + }, + { + "epoch": 3.2859006284833394, + "grad_norm": 0.629382263635639, + "learning_rate": 4.063792768427904e-06, + "loss": 0.022, + "step": 27710 + }, + { + "epoch": 3.286019210245464, + "grad_norm": 0.7323934934875286, + "learning_rate": 4.062481027795348e-06, + "loss": 0.0355, + "step": 27711 + }, + { + "epoch": 3.2861377920075894, + "grad_norm": 0.4667073063648745, + "learning_rate": 4.061169480181029e-06, + "loss": 0.0229, + "step": 27712 + }, + { + "epoch": 3.286256373769714, + "grad_norm": 0.4648141881135656, + "learning_rate": 4.059858125597063e-06, + "loss": 0.0191, + "step": 27713 + }, + { + "epoch": 3.2863749555318393, + "grad_norm": 0.5525607694964968, + "learning_rate": 4.05854696405552e-06, + "loss": 0.0281, + "step": 27714 + }, + { + "epoch": 3.286493537293964, + "grad_norm": 0.6328632388367769, + "learning_rate": 4.057235995568496e-06, + "loss": 0.0335, + "step": 27715 + }, + { + "epoch": 3.2866121190560893, + "grad_norm": 0.5503832911816732, + "learning_rate": 4.0559252201480715e-06, + "loss": 0.014, + "step": 27716 + }, + { + "epoch": 3.286730700818214, + "grad_norm": 0.505830722730045, + "learning_rate": 4.054614637806334e-06, + "loss": 0.0222, + "step": 27717 + }, + { + "epoch": 3.2868492825803393, + "grad_norm": 0.5006015906164181, + "learning_rate": 4.05330424855537e-06, + "loss": 0.0345, + "step": 27718 + }, + { + "epoch": 3.286967864342464, + "grad_norm": 0.42648946465412635, + "learning_rate": 4.051994052407243e-06, + "loss": 0.0188, + "step": 27719 + }, + { + "epoch": 3.287086446104589, + "grad_norm": 0.6382527540482905, + "learning_rate": 4.050684049374054e-06, + "loss": 0.0381, + "step": 27720 + }, + { + "epoch": 3.287205027866714, + "grad_norm": 0.42272266541354075, + "learning_rate": 4.049374239467865e-06, + "loss": 0.0239, + "step": 27721 + }, + { + "epoch": 3.287323609628839, + "grad_norm": 0.7279440286170767, + "learning_rate": 4.048064622700756e-06, + "loss": 0.0328, + "step": 27722 + }, + { + "epoch": 3.287442191390964, + "grad_norm": 0.5991969001860454, + "learning_rate": 4.0467551990847894e-06, + "loss": 0.0256, + "step": 27723 + }, + { + "epoch": 3.287560773153089, + "grad_norm": 0.4457397271028291, + "learning_rate": 4.045445968632059e-06, + "loss": 0.0231, + "step": 27724 + }, + { + "epoch": 3.287679354915214, + "grad_norm": 0.6154214337233967, + "learning_rate": 4.044136931354611e-06, + "loss": 0.0283, + "step": 27725 + }, + { + "epoch": 3.287797936677339, + "grad_norm": 0.5166235935904505, + "learning_rate": 4.042828087264522e-06, + "loss": 0.0342, + "step": 27726 + }, + { + "epoch": 3.287916518439464, + "grad_norm": 0.4985362450061206, + "learning_rate": 4.041519436373861e-06, + "loss": 0.024, + "step": 27727 + }, + { + "epoch": 3.288035100201589, + "grad_norm": 0.4834925224755562, + "learning_rate": 4.040210978694689e-06, + "loss": 0.022, + "step": 27728 + }, + { + "epoch": 3.288153681963714, + "grad_norm": 0.5543407393156582, + "learning_rate": 4.038902714239076e-06, + "loss": 0.0294, + "step": 27729 + }, + { + "epoch": 3.288272263725839, + "grad_norm": 0.7141239943008958, + "learning_rate": 4.037594643019063e-06, + "loss": 0.0314, + "step": 27730 + }, + { + "epoch": 3.288390845487964, + "grad_norm": 0.4994459019681836, + "learning_rate": 4.036286765046734e-06, + "loss": 0.0181, + "step": 27731 + }, + { + "epoch": 3.288509427250089, + "grad_norm": 0.5381956698100948, + "learning_rate": 4.034979080334128e-06, + "loss": 0.0295, + "step": 27732 + }, + { + "epoch": 3.2886280090122137, + "grad_norm": 0.7172920446413101, + "learning_rate": 4.033671588893309e-06, + "loss": 0.0296, + "step": 27733 + }, + { + "epoch": 3.288746590774339, + "grad_norm": 0.4681107019976971, + "learning_rate": 4.032364290736318e-06, + "loss": 0.0237, + "step": 27734 + }, + { + "epoch": 3.2888651725364637, + "grad_norm": 0.558171957216608, + "learning_rate": 4.031057185875228e-06, + "loss": 0.0389, + "step": 27735 + }, + { + "epoch": 3.288983754298589, + "grad_norm": 0.5732088618497633, + "learning_rate": 4.029750274322072e-06, + "loss": 0.0248, + "step": 27736 + }, + { + "epoch": 3.2891023360607137, + "grad_norm": 0.7193319987409266, + "learning_rate": 4.0284435560889015e-06, + "loss": 0.0389, + "step": 27737 + }, + { + "epoch": 3.289220917822839, + "grad_norm": 0.40962269014407476, + "learning_rate": 4.027137031187764e-06, + "loss": 0.0231, + "step": 27738 + }, + { + "epoch": 3.2893394995849636, + "grad_norm": 0.4419452349022108, + "learning_rate": 4.025830699630706e-06, + "loss": 0.0197, + "step": 27739 + }, + { + "epoch": 3.289458081347089, + "grad_norm": 0.5482065361147989, + "learning_rate": 4.024524561429771e-06, + "loss": 0.0305, + "step": 27740 + }, + { + "epoch": 3.289576663109214, + "grad_norm": 0.6739027269242459, + "learning_rate": 4.023218616596991e-06, + "loss": 0.0254, + "step": 27741 + }, + { + "epoch": 3.289695244871339, + "grad_norm": 0.4378387809516163, + "learning_rate": 4.02191286514442e-06, + "loss": 0.0186, + "step": 27742 + }, + { + "epoch": 3.2898138266334636, + "grad_norm": 0.8011710770851576, + "learning_rate": 4.020607307084082e-06, + "loss": 0.0367, + "step": 27743 + }, + { + "epoch": 3.2899324083955888, + "grad_norm": 0.5882009870077753, + "learning_rate": 4.019301942428025e-06, + "loss": 0.0186, + "step": 27744 + }, + { + "epoch": 3.290050990157714, + "grad_norm": 0.4588106214575373, + "learning_rate": 4.017996771188265e-06, + "loss": 0.0246, + "step": 27745 + }, + { + "epoch": 3.2901695719198387, + "grad_norm": 0.5268251972578644, + "learning_rate": 4.016691793376854e-06, + "loss": 0.0333, + "step": 27746 + }, + { + "epoch": 3.2902881536819635, + "grad_norm": 0.8531328819088594, + "learning_rate": 4.0153870090058084e-06, + "loss": 0.0417, + "step": 27747 + }, + { + "epoch": 3.2904067354440887, + "grad_norm": 0.7576885192581095, + "learning_rate": 4.014082418087162e-06, + "loss": 0.0316, + "step": 27748 + }, + { + "epoch": 3.290525317206214, + "grad_norm": 0.4792500169002475, + "learning_rate": 4.0127780206329405e-06, + "loss": 0.0269, + "step": 27749 + }, + { + "epoch": 3.2906438989683386, + "grad_norm": 0.5309099529758844, + "learning_rate": 4.01147381665517e-06, + "loss": 0.0265, + "step": 27750 + }, + { + "epoch": 3.290762480730464, + "grad_norm": 0.8577059490088345, + "learning_rate": 4.010169806165881e-06, + "loss": 0.046, + "step": 27751 + }, + { + "epoch": 3.2908810624925886, + "grad_norm": 0.5787632145403245, + "learning_rate": 4.0088659891770765e-06, + "loss": 0.0349, + "step": 27752 + }, + { + "epoch": 3.290999644254714, + "grad_norm": 0.4153987498563949, + "learning_rate": 4.00756236570079e-06, + "loss": 0.0252, + "step": 27753 + }, + { + "epoch": 3.2911182260168386, + "grad_norm": 0.6431968422997445, + "learning_rate": 4.006258935749035e-06, + "loss": 0.0313, + "step": 27754 + }, + { + "epoch": 3.2912368077789638, + "grad_norm": 0.36997461368760215, + "learning_rate": 4.0049556993338285e-06, + "loss": 0.0166, + "step": 27755 + }, + { + "epoch": 3.2913553895410885, + "grad_norm": 0.6996197469297629, + "learning_rate": 4.003652656467182e-06, + "loss": 0.0381, + "step": 27756 + }, + { + "epoch": 3.2914739713032137, + "grad_norm": 0.7554093089386799, + "learning_rate": 4.0023498071611155e-06, + "loss": 0.0417, + "step": 27757 + }, + { + "epoch": 3.2915925530653385, + "grad_norm": 0.3956308730411375, + "learning_rate": 4.001047151427636e-06, + "loss": 0.0174, + "step": 27758 + }, + { + "epoch": 3.2917111348274637, + "grad_norm": 0.45252887840317685, + "learning_rate": 3.999744689278747e-06, + "loss": 0.0201, + "step": 27759 + }, + { + "epoch": 3.2918297165895885, + "grad_norm": 0.5679354043166773, + "learning_rate": 3.998442420726462e-06, + "loss": 0.029, + "step": 27760 + }, + { + "epoch": 3.2919482983517137, + "grad_norm": 0.5241444755891376, + "learning_rate": 3.9971403457827825e-06, + "loss": 0.0269, + "step": 27761 + }, + { + "epoch": 3.2920668801138384, + "grad_norm": 0.3271798310423574, + "learning_rate": 3.99583846445972e-06, + "loss": 0.0138, + "step": 27762 + }, + { + "epoch": 3.2921854618759636, + "grad_norm": 0.516197758786711, + "learning_rate": 3.994536776769262e-06, + "loss": 0.0299, + "step": 27763 + }, + { + "epoch": 3.2923040436380884, + "grad_norm": 0.7365348110517009, + "learning_rate": 3.993235282723418e-06, + "loss": 0.0426, + "step": 27764 + }, + { + "epoch": 3.2924226254002136, + "grad_norm": 0.5844491279414128, + "learning_rate": 3.991933982334184e-06, + "loss": 0.0363, + "step": 27765 + }, + { + "epoch": 3.2925412071623383, + "grad_norm": 0.4696012415859917, + "learning_rate": 3.990632875613556e-06, + "loss": 0.0287, + "step": 27766 + }, + { + "epoch": 3.2926597889244635, + "grad_norm": 0.6033985244232584, + "learning_rate": 3.989331962573537e-06, + "loss": 0.0293, + "step": 27767 + }, + { + "epoch": 3.2927783706865883, + "grad_norm": 0.6298768333840329, + "learning_rate": 3.988031243226098e-06, + "loss": 0.0286, + "step": 27768 + }, + { + "epoch": 3.2928969524487135, + "grad_norm": 0.6358744743746384, + "learning_rate": 3.986730717583259e-06, + "loss": 0.027, + "step": 27769 + }, + { + "epoch": 3.2930155342108383, + "grad_norm": 0.4888039476023514, + "learning_rate": 3.985430385656988e-06, + "loss": 0.034, + "step": 27770 + }, + { + "epoch": 3.2931341159729635, + "grad_norm": 0.4992387391927069, + "learning_rate": 3.984130247459278e-06, + "loss": 0.0247, + "step": 27771 + }, + { + "epoch": 3.293252697735088, + "grad_norm": 0.9088829784387544, + "learning_rate": 3.9828303030021164e-06, + "loss": 0.0336, + "step": 27772 + }, + { + "epoch": 3.2933712794972134, + "grad_norm": 0.37421942171223666, + "learning_rate": 3.981530552297491e-06, + "loss": 0.0191, + "step": 27773 + }, + { + "epoch": 3.293489861259338, + "grad_norm": 0.7045991120675945, + "learning_rate": 3.980230995357375e-06, + "loss": 0.0447, + "step": 27774 + }, + { + "epoch": 3.2936084430214634, + "grad_norm": 0.7241157190924399, + "learning_rate": 3.9789316321937525e-06, + "loss": 0.0402, + "step": 27775 + }, + { + "epoch": 3.293727024783588, + "grad_norm": 0.5813389589631031, + "learning_rate": 3.977632462818603e-06, + "loss": 0.0309, + "step": 27776 + }, + { + "epoch": 3.2938456065457133, + "grad_norm": 0.37030177295754885, + "learning_rate": 3.976333487243905e-06, + "loss": 0.0185, + "step": 27777 + }, + { + "epoch": 3.293964188307838, + "grad_norm": 0.3900694423561358, + "learning_rate": 3.975034705481634e-06, + "loss": 0.0225, + "step": 27778 + }, + { + "epoch": 3.2940827700699633, + "grad_norm": 0.62396544176592, + "learning_rate": 3.97373611754375e-06, + "loss": 0.037, + "step": 27779 + }, + { + "epoch": 3.294201351832088, + "grad_norm": 0.6955383114054494, + "learning_rate": 3.972437723442246e-06, + "loss": 0.0301, + "step": 27780 + }, + { + "epoch": 3.2943199335942133, + "grad_norm": 0.6273715858948584, + "learning_rate": 3.971139523189074e-06, + "loss": 0.0365, + "step": 27781 + }, + { + "epoch": 3.294438515356338, + "grad_norm": 0.5429033772450947, + "learning_rate": 3.969841516796216e-06, + "loss": 0.0289, + "step": 27782 + }, + { + "epoch": 3.2945570971184632, + "grad_norm": 0.6964604741675752, + "learning_rate": 3.968543704275618e-06, + "loss": 0.0439, + "step": 27783 + }, + { + "epoch": 3.294675678880588, + "grad_norm": 0.46652734328904405, + "learning_rate": 3.967246085639268e-06, + "loss": 0.0234, + "step": 27784 + }, + { + "epoch": 3.294794260642713, + "grad_norm": 0.46719942972454875, + "learning_rate": 3.965948660899113e-06, + "loss": 0.0234, + "step": 27785 + }, + { + "epoch": 3.294912842404838, + "grad_norm": 0.42637428897915847, + "learning_rate": 3.964651430067115e-06, + "loss": 0.0213, + "step": 27786 + }, + { + "epoch": 3.295031424166963, + "grad_norm": 0.426115472914542, + "learning_rate": 3.963354393155236e-06, + "loss": 0.025, + "step": 27787 + }, + { + "epoch": 3.295150005929088, + "grad_norm": 0.6538535922915355, + "learning_rate": 3.9620575501754325e-06, + "loss": 0.0266, + "step": 27788 + }, + { + "epoch": 3.295268587691213, + "grad_norm": 0.6305592307742108, + "learning_rate": 3.960760901139668e-06, + "loss": 0.0285, + "step": 27789 + }, + { + "epoch": 3.2953871694533383, + "grad_norm": 0.6139089662571792, + "learning_rate": 3.959464446059874e-06, + "loss": 0.0356, + "step": 27790 + }, + { + "epoch": 3.295505751215463, + "grad_norm": 0.8413465565421957, + "learning_rate": 3.95816818494803e-06, + "loss": 0.0503, + "step": 27791 + }, + { + "epoch": 3.295624332977588, + "grad_norm": 0.4996793817948496, + "learning_rate": 3.956872117816063e-06, + "loss": 0.0325, + "step": 27792 + }, + { + "epoch": 3.295742914739713, + "grad_norm": 0.4870672201191756, + "learning_rate": 3.9555762446759404e-06, + "loss": 0.0139, + "step": 27793 + }, + { + "epoch": 3.2958614965018382, + "grad_norm": 0.5307753689449944, + "learning_rate": 3.954280565539584e-06, + "loss": 0.0168, + "step": 27794 + }, + { + "epoch": 3.295980078263963, + "grad_norm": 0.5463900707835186, + "learning_rate": 3.952985080418964e-06, + "loss": 0.0293, + "step": 27795 + }, + { + "epoch": 3.2960986600260878, + "grad_norm": 0.6614333417879408, + "learning_rate": 3.951689789326008e-06, + "loss": 0.0347, + "step": 27796 + }, + { + "epoch": 3.296217241788213, + "grad_norm": 0.4445445687185322, + "learning_rate": 3.95039469227266e-06, + "loss": 0.022, + "step": 27797 + }, + { + "epoch": 3.296335823550338, + "grad_norm": 0.6435618245224741, + "learning_rate": 3.94909978927086e-06, + "loss": 0.0386, + "step": 27798 + }, + { + "epoch": 3.296454405312463, + "grad_norm": 0.5546876480754466, + "learning_rate": 3.947805080332545e-06, + "loss": 0.0258, + "step": 27799 + }, + { + "epoch": 3.2965729870745877, + "grad_norm": 0.5836845455451529, + "learning_rate": 3.946510565469658e-06, + "loss": 0.0324, + "step": 27800 + }, + { + "epoch": 3.296691568836713, + "grad_norm": 0.787731443007729, + "learning_rate": 3.945216244694114e-06, + "loss": 0.04, + "step": 27801 + }, + { + "epoch": 3.296810150598838, + "grad_norm": 0.8305971375565421, + "learning_rate": 3.94392211801787e-06, + "loss": 0.0305, + "step": 27802 + }, + { + "epoch": 3.296928732360963, + "grad_norm": 0.6903739967928753, + "learning_rate": 3.942628185452838e-06, + "loss": 0.0259, + "step": 27803 + }, + { + "epoch": 3.297047314123088, + "grad_norm": 0.5452145986691702, + "learning_rate": 3.9413344470109496e-06, + "loss": 0.025, + "step": 27804 + }, + { + "epoch": 3.297165895885213, + "grad_norm": 0.5112625128806721, + "learning_rate": 3.940040902704134e-06, + "loss": 0.0173, + "step": 27805 + }, + { + "epoch": 3.297284477647338, + "grad_norm": 0.4514315198237018, + "learning_rate": 3.938747552544317e-06, + "loss": 0.0241, + "step": 27806 + }, + { + "epoch": 3.2974030594094628, + "grad_norm": 0.5590753230831595, + "learning_rate": 3.937454396543428e-06, + "loss": 0.027, + "step": 27807 + }, + { + "epoch": 3.297521641171588, + "grad_norm": 0.6412332804049922, + "learning_rate": 3.936161434713373e-06, + "loss": 0.0392, + "step": 27808 + }, + { + "epoch": 3.2976402229337127, + "grad_norm": 0.36464768601307296, + "learning_rate": 3.93486866706608e-06, + "loss": 0.014, + "step": 27809 + }, + { + "epoch": 3.297758804695838, + "grad_norm": 0.39195231466537633, + "learning_rate": 3.933576093613464e-06, + "loss": 0.0199, + "step": 27810 + }, + { + "epoch": 3.2978773864579627, + "grad_norm": 0.6918742000877832, + "learning_rate": 3.932283714367452e-06, + "loss": 0.0333, + "step": 27811 + }, + { + "epoch": 3.297995968220088, + "grad_norm": 0.5021265549619037, + "learning_rate": 3.9309915293399366e-06, + "loss": 0.0242, + "step": 27812 + }, + { + "epoch": 3.2981145499822127, + "grad_norm": 0.6531807208778259, + "learning_rate": 3.929699538542853e-06, + "loss": 0.0388, + "step": 27813 + }, + { + "epoch": 3.298233131744338, + "grad_norm": 0.4642456982570778, + "learning_rate": 3.9284077419880965e-06, + "loss": 0.0303, + "step": 27814 + }, + { + "epoch": 3.2983517135064626, + "grad_norm": 0.41791802939588896, + "learning_rate": 3.927116139687581e-06, + "loss": 0.0235, + "step": 27815 + }, + { + "epoch": 3.298470295268588, + "grad_norm": 0.6630778768567833, + "learning_rate": 3.925824731653213e-06, + "loss": 0.0325, + "step": 27816 + }, + { + "epoch": 3.2985888770307126, + "grad_norm": 0.6353172743539326, + "learning_rate": 3.9245335178969e-06, + "loss": 0.0341, + "step": 27817 + }, + { + "epoch": 3.2987074587928378, + "grad_norm": 0.3027266064393657, + "learning_rate": 3.923242498430546e-06, + "loss": 0.0143, + "step": 27818 + }, + { + "epoch": 3.2988260405549625, + "grad_norm": 0.3426885946348873, + "learning_rate": 3.921951673266047e-06, + "loss": 0.016, + "step": 27819 + }, + { + "epoch": 3.2989446223170877, + "grad_norm": 0.685958422015653, + "learning_rate": 3.920661042415305e-06, + "loss": 0.0357, + "step": 27820 + }, + { + "epoch": 3.2990632040792125, + "grad_norm": 0.5238278143076072, + "learning_rate": 3.919370605890218e-06, + "loss": 0.027, + "step": 27821 + }, + { + "epoch": 3.2991817858413377, + "grad_norm": 0.6175004488135559, + "learning_rate": 3.918080363702692e-06, + "loss": 0.0306, + "step": 27822 + }, + { + "epoch": 3.2993003676034625, + "grad_norm": 0.9648412482625215, + "learning_rate": 3.916790315864605e-06, + "loss": 0.0337, + "step": 27823 + }, + { + "epoch": 3.2994189493655877, + "grad_norm": 0.4329454126429439, + "learning_rate": 3.915500462387858e-06, + "loss": 0.0235, + "step": 27824 + }, + { + "epoch": 3.2995375311277124, + "grad_norm": 0.6970629368650261, + "learning_rate": 3.9142108032843405e-06, + "loss": 0.0338, + "step": 27825 + }, + { + "epoch": 3.2996561128898376, + "grad_norm": 0.6546710137985855, + "learning_rate": 3.912921338565942e-06, + "loss": 0.026, + "step": 27826 + }, + { + "epoch": 3.2997746946519624, + "grad_norm": 0.6519829603204154, + "learning_rate": 3.91163206824455e-06, + "loss": 0.0312, + "step": 27827 + }, + { + "epoch": 3.2998932764140876, + "grad_norm": 0.8271313249382385, + "learning_rate": 3.91034299233205e-06, + "loss": 0.0505, + "step": 27828 + }, + { + "epoch": 3.3000118581762123, + "grad_norm": 0.5581486851027265, + "learning_rate": 3.909054110840335e-06, + "loss": 0.0278, + "step": 27829 + }, + { + "epoch": 3.3001304399383375, + "grad_norm": 0.589399228632936, + "learning_rate": 3.907765423781268e-06, + "loss": 0.0196, + "step": 27830 + }, + { + "epoch": 3.3002490217004623, + "grad_norm": 0.5999171419890557, + "learning_rate": 3.9064769311667385e-06, + "loss": 0.0213, + "step": 27831 + }, + { + "epoch": 3.3003676034625875, + "grad_norm": 0.8983944146526531, + "learning_rate": 3.905188633008627e-06, + "loss": 0.0387, + "step": 27832 + }, + { + "epoch": 3.3004861852247123, + "grad_norm": 0.3575783770805807, + "learning_rate": 3.903900529318813e-06, + "loss": 0.0179, + "step": 27833 + }, + { + "epoch": 3.3006047669868375, + "grad_norm": 0.557083422906829, + "learning_rate": 3.9026126201091615e-06, + "loss": 0.0206, + "step": 27834 + }, + { + "epoch": 3.3007233487489622, + "grad_norm": 0.3488686185056428, + "learning_rate": 3.901324905391551e-06, + "loss": 0.0161, + "step": 27835 + }, + { + "epoch": 3.3008419305110874, + "grad_norm": 0.39234119417957475, + "learning_rate": 3.9000373851778485e-06, + "loss": 0.0208, + "step": 27836 + }, + { + "epoch": 3.300960512273212, + "grad_norm": 0.378918766959862, + "learning_rate": 3.898750059479931e-06, + "loss": 0.0209, + "step": 27837 + }, + { + "epoch": 3.3010790940353374, + "grad_norm": 0.5886313612775618, + "learning_rate": 3.897462928309667e-06, + "loss": 0.0281, + "step": 27838 + }, + { + "epoch": 3.3011976757974626, + "grad_norm": 0.657825216180427, + "learning_rate": 3.896175991678902e-06, + "loss": 0.0307, + "step": 27839 + }, + { + "epoch": 3.3013162575595874, + "grad_norm": 0.4632513820508234, + "learning_rate": 3.894889249599529e-06, + "loss": 0.024, + "step": 27840 + }, + { + "epoch": 3.301434839321712, + "grad_norm": 0.4561223298058198, + "learning_rate": 3.8936027020833905e-06, + "loss": 0.0256, + "step": 27841 + }, + { + "epoch": 3.3015534210838373, + "grad_norm": 0.31634680519244085, + "learning_rate": 3.892316349142352e-06, + "loss": 0.0213, + "step": 27842 + }, + { + "epoch": 3.3016720028459625, + "grad_norm": 0.3333540942565808, + "learning_rate": 3.891030190788275e-06, + "loss": 0.0154, + "step": 27843 + }, + { + "epoch": 3.3017905846080873, + "grad_norm": 0.47228257656736544, + "learning_rate": 3.889744227033018e-06, + "loss": 0.0273, + "step": 27844 + }, + { + "epoch": 3.301909166370212, + "grad_norm": 0.6771176448394813, + "learning_rate": 3.888458457888428e-06, + "loss": 0.0384, + "step": 27845 + }, + { + "epoch": 3.3020277481323372, + "grad_norm": 0.337499677670817, + "learning_rate": 3.887172883366361e-06, + "loss": 0.0181, + "step": 27846 + }, + { + "epoch": 3.3021463298944624, + "grad_norm": 0.5533353744129451, + "learning_rate": 3.885887503478669e-06, + "loss": 0.0246, + "step": 27847 + }, + { + "epoch": 3.302264911656587, + "grad_norm": 0.4888134317014093, + "learning_rate": 3.884602318237204e-06, + "loss": 0.0283, + "step": 27848 + }, + { + "epoch": 3.302383493418712, + "grad_norm": 0.8566908497483087, + "learning_rate": 3.883317327653818e-06, + "loss": 0.0293, + "step": 27849 + }, + { + "epoch": 3.302502075180837, + "grad_norm": 0.43326556096191937, + "learning_rate": 3.88203253174034e-06, + "loss": 0.0186, + "step": 27850 + }, + { + "epoch": 3.3026206569429624, + "grad_norm": 0.3243749713762092, + "learning_rate": 3.880747930508636e-06, + "loss": 0.0115, + "step": 27851 + }, + { + "epoch": 3.302739238705087, + "grad_norm": 0.6510938987848732, + "learning_rate": 3.879463523970531e-06, + "loss": 0.0388, + "step": 27852 + }, + { + "epoch": 3.3028578204672123, + "grad_norm": 0.3866774220224474, + "learning_rate": 3.878179312137881e-06, + "loss": 0.0198, + "step": 27853 + }, + { + "epoch": 3.302976402229337, + "grad_norm": 0.6894751952840321, + "learning_rate": 3.876895295022504e-06, + "loss": 0.0394, + "step": 27854 + }, + { + "epoch": 3.3030949839914623, + "grad_norm": 0.6680036710259506, + "learning_rate": 3.875611472636259e-06, + "loss": 0.0372, + "step": 27855 + }, + { + "epoch": 3.303213565753587, + "grad_norm": 0.6052998660033125, + "learning_rate": 3.8743278449909685e-06, + "loss": 0.0223, + "step": 27856 + }, + { + "epoch": 3.3033321475157122, + "grad_norm": 0.7312785801166084, + "learning_rate": 3.873044412098467e-06, + "loss": 0.0387, + "step": 27857 + }, + { + "epoch": 3.303450729277837, + "grad_norm": 0.543093589923459, + "learning_rate": 3.871761173970589e-06, + "loss": 0.0214, + "step": 27858 + }, + { + "epoch": 3.303569311039962, + "grad_norm": 0.5001807847313702, + "learning_rate": 3.870478130619165e-06, + "loss": 0.03, + "step": 27859 + }, + { + "epoch": 3.303687892802087, + "grad_norm": 0.745356787303011, + "learning_rate": 3.869195282056029e-06, + "loss": 0.039, + "step": 27860 + }, + { + "epoch": 3.303806474564212, + "grad_norm": 0.7269263723598205, + "learning_rate": 3.867912628292986e-06, + "loss": 0.0493, + "step": 27861 + }, + { + "epoch": 3.303925056326337, + "grad_norm": 0.5697591655543556, + "learning_rate": 3.866630169341886e-06, + "loss": 0.0287, + "step": 27862 + }, + { + "epoch": 3.304043638088462, + "grad_norm": 0.4354555459311577, + "learning_rate": 3.865347905214536e-06, + "loss": 0.0245, + "step": 27863 + }, + { + "epoch": 3.304162219850587, + "grad_norm": 0.3771851576491067, + "learning_rate": 3.864065835922762e-06, + "loss": 0.0169, + "step": 27864 + }, + { + "epoch": 3.304280801612712, + "grad_norm": 0.6518929042036157, + "learning_rate": 3.862783961478381e-06, + "loss": 0.03, + "step": 27865 + }, + { + "epoch": 3.304399383374837, + "grad_norm": 0.98086454723485, + "learning_rate": 3.861502281893212e-06, + "loss": 0.0588, + "step": 27866 + }, + { + "epoch": 3.304517965136962, + "grad_norm": 0.3569510966945982, + "learning_rate": 3.860220797179076e-06, + "loss": 0.0207, + "step": 27867 + }, + { + "epoch": 3.304636546899087, + "grad_norm": 0.7468901867727762, + "learning_rate": 3.8589395073477755e-06, + "loss": 0.0552, + "step": 27868 + }, + { + "epoch": 3.304755128661212, + "grad_norm": 0.3973288725246763, + "learning_rate": 3.857658412411128e-06, + "loss": 0.0187, + "step": 27869 + }, + { + "epoch": 3.3048737104233368, + "grad_norm": 0.624036059361594, + "learning_rate": 3.8563775123809435e-06, + "loss": 0.0234, + "step": 27870 + }, + { + "epoch": 3.304992292185462, + "grad_norm": 0.40139781836275046, + "learning_rate": 3.855096807269034e-06, + "loss": 0.0152, + "step": 27871 + }, + { + "epoch": 3.3051108739475867, + "grad_norm": 0.5227923725918305, + "learning_rate": 3.853816297087196e-06, + "loss": 0.0297, + "step": 27872 + }, + { + "epoch": 3.305229455709712, + "grad_norm": 0.4140921587289125, + "learning_rate": 3.852535981847246e-06, + "loss": 0.023, + "step": 27873 + }, + { + "epoch": 3.3053480374718367, + "grad_norm": 0.39666652516720674, + "learning_rate": 3.851255861560979e-06, + "loss": 0.0185, + "step": 27874 + }, + { + "epoch": 3.305466619233962, + "grad_norm": 0.4747413066574263, + "learning_rate": 3.849975936240199e-06, + "loss": 0.0201, + "step": 27875 + }, + { + "epoch": 3.3055852009960867, + "grad_norm": 0.4115964837761341, + "learning_rate": 3.8486962058967044e-06, + "loss": 0.0193, + "step": 27876 + }, + { + "epoch": 3.305703782758212, + "grad_norm": 0.6734284232647656, + "learning_rate": 3.847416670542292e-06, + "loss": 0.0269, + "step": 27877 + }, + { + "epoch": 3.3058223645203366, + "grad_norm": 0.6142894842098806, + "learning_rate": 3.846137330188765e-06, + "loss": 0.0269, + "step": 27878 + }, + { + "epoch": 3.305940946282462, + "grad_norm": 0.3640761980053133, + "learning_rate": 3.844858184847907e-06, + "loss": 0.0188, + "step": 27879 + }, + { + "epoch": 3.3060595280445866, + "grad_norm": 0.6992080665469463, + "learning_rate": 3.843579234531514e-06, + "loss": 0.0364, + "step": 27880 + }, + { + "epoch": 3.306178109806712, + "grad_norm": 0.5935694884313321, + "learning_rate": 3.842300479251376e-06, + "loss": 0.0364, + "step": 27881 + }, + { + "epoch": 3.3062966915688365, + "grad_norm": 0.4654099168897005, + "learning_rate": 3.841021919019288e-06, + "loss": 0.0257, + "step": 27882 + }, + { + "epoch": 3.3064152733309617, + "grad_norm": 0.4537323713790174, + "learning_rate": 3.839743553847025e-06, + "loss": 0.0184, + "step": 27883 + }, + { + "epoch": 3.3065338550930865, + "grad_norm": 0.560057861069202, + "learning_rate": 3.838465383746378e-06, + "loss": 0.0285, + "step": 27884 + }, + { + "epoch": 3.3066524368552117, + "grad_norm": 0.5332051876442494, + "learning_rate": 3.837187408729131e-06, + "loss": 0.0302, + "step": 27885 + }, + { + "epoch": 3.3067710186173365, + "grad_norm": 0.8356829350842514, + "learning_rate": 3.835909628807066e-06, + "loss": 0.0474, + "step": 27886 + }, + { + "epoch": 3.3068896003794617, + "grad_norm": 0.7482376414644991, + "learning_rate": 3.834632043991959e-06, + "loss": 0.0425, + "step": 27887 + }, + { + "epoch": 3.307008182141587, + "grad_norm": 0.5329230226977014, + "learning_rate": 3.83335465429559e-06, + "loss": 0.0316, + "step": 27888 + }, + { + "epoch": 3.3071267639037116, + "grad_norm": 0.585591377876086, + "learning_rate": 3.832077459729741e-06, + "loss": 0.0354, + "step": 27889 + }, + { + "epoch": 3.3072453456658364, + "grad_norm": 0.47452468268278597, + "learning_rate": 3.830800460306175e-06, + "loss": 0.0311, + "step": 27890 + }, + { + "epoch": 3.3073639274279616, + "grad_norm": 0.35063622290476326, + "learning_rate": 3.829523656036668e-06, + "loss": 0.0158, + "step": 27891 + }, + { + "epoch": 3.307482509190087, + "grad_norm": 0.8340636896561271, + "learning_rate": 3.828247046932992e-06, + "loss": 0.0537, + "step": 27892 + }, + { + "epoch": 3.3076010909522116, + "grad_norm": 0.4909588225640828, + "learning_rate": 3.826970633006924e-06, + "loss": 0.02, + "step": 27893 + }, + { + "epoch": 3.3077196727143363, + "grad_norm": 0.38574762295601633, + "learning_rate": 3.8256944142702144e-06, + "loss": 0.0203, + "step": 27894 + }, + { + "epoch": 3.3078382544764615, + "grad_norm": 0.6968905900207709, + "learning_rate": 3.824418390734635e-06, + "loss": 0.0331, + "step": 27895 + }, + { + "epoch": 3.3079568362385867, + "grad_norm": 0.6818185629619743, + "learning_rate": 3.823142562411955e-06, + "loss": 0.0353, + "step": 27896 + }, + { + "epoch": 3.3080754180007115, + "grad_norm": 0.6181435335328476, + "learning_rate": 3.821866929313928e-06, + "loss": 0.0278, + "step": 27897 + }, + { + "epoch": 3.3081939997628362, + "grad_norm": 0.6927571052664493, + "learning_rate": 3.820591491452319e-06, + "loss": 0.0292, + "step": 27898 + }, + { + "epoch": 3.3083125815249614, + "grad_norm": 0.4047346786667762, + "learning_rate": 3.819316248838886e-06, + "loss": 0.0224, + "step": 27899 + }, + { + "epoch": 3.3084311632870866, + "grad_norm": 0.8144580383826798, + "learning_rate": 3.818041201485389e-06, + "loss": 0.0369, + "step": 27900 + }, + { + "epoch": 3.3085497450492114, + "grad_norm": 0.527573287767143, + "learning_rate": 3.816766349403569e-06, + "loss": 0.0358, + "step": 27901 + }, + { + "epoch": 3.3086683268113366, + "grad_norm": 0.6595568572185324, + "learning_rate": 3.815491692605189e-06, + "loss": 0.0396, + "step": 27902 + }, + { + "epoch": 3.3087869085734614, + "grad_norm": 0.6385850560888783, + "learning_rate": 3.8142172311019965e-06, + "loss": 0.0343, + "step": 27903 + }, + { + "epoch": 3.3089054903355866, + "grad_norm": 0.5776590529038925, + "learning_rate": 3.8129429649057467e-06, + "loss": 0.0255, + "step": 27904 + }, + { + "epoch": 3.3090240720977113, + "grad_norm": 0.6524259125783789, + "learning_rate": 3.811668894028178e-06, + "loss": 0.0231, + "step": 27905 + }, + { + "epoch": 3.3091426538598365, + "grad_norm": 0.46160850857623853, + "learning_rate": 3.8103950184810363e-06, + "loss": 0.0251, + "step": 27906 + }, + { + "epoch": 3.3092612356219613, + "grad_norm": 0.5261313736746881, + "learning_rate": 3.80912133827607e-06, + "loss": 0.032, + "step": 27907 + }, + { + "epoch": 3.3093798173840865, + "grad_norm": 0.23434591027801066, + "learning_rate": 3.80784785342502e-06, + "loss": 0.0113, + "step": 27908 + }, + { + "epoch": 3.3094983991462112, + "grad_norm": 0.5415793872298806, + "learning_rate": 3.806574563939627e-06, + "loss": 0.0267, + "step": 27909 + }, + { + "epoch": 3.3096169809083364, + "grad_norm": 0.7621216440034788, + "learning_rate": 3.8053014698316207e-06, + "loss": 0.0371, + "step": 27910 + }, + { + "epoch": 3.309735562670461, + "grad_norm": 0.48411387642646014, + "learning_rate": 3.8040285711127527e-06, + "loss": 0.018, + "step": 27911 + }, + { + "epoch": 3.3098541444325864, + "grad_norm": 0.6568984415784147, + "learning_rate": 3.802755867794744e-06, + "loss": 0.0339, + "step": 27912 + }, + { + "epoch": 3.309972726194711, + "grad_norm": 0.722707309946253, + "learning_rate": 3.8014833598893323e-06, + "loss": 0.0334, + "step": 27913 + }, + { + "epoch": 3.3100913079568364, + "grad_norm": 0.8729296192925622, + "learning_rate": 3.8002110474082474e-06, + "loss": 0.046, + "step": 27914 + }, + { + "epoch": 3.310209889718961, + "grad_norm": 0.627558006627391, + "learning_rate": 3.798938930363222e-06, + "loss": 0.0289, + "step": 27915 + }, + { + "epoch": 3.3103284714810863, + "grad_norm": 0.832709869288307, + "learning_rate": 3.797667008765987e-06, + "loss": 0.0317, + "step": 27916 + }, + { + "epoch": 3.310447053243211, + "grad_norm": 0.7570380161351903, + "learning_rate": 3.7963952826282484e-06, + "loss": 0.0353, + "step": 27917 + }, + { + "epoch": 3.3105656350053363, + "grad_norm": 0.5126721161391322, + "learning_rate": 3.7951237519617574e-06, + "loss": 0.0302, + "step": 27918 + }, + { + "epoch": 3.310684216767461, + "grad_norm": 0.5091880228970953, + "learning_rate": 3.7938524167782153e-06, + "loss": 0.0221, + "step": 27919 + }, + { + "epoch": 3.3108027985295863, + "grad_norm": 0.5127165599117681, + "learning_rate": 3.792581277089355e-06, + "loss": 0.0247, + "step": 27920 + }, + { + "epoch": 3.310921380291711, + "grad_norm": 0.3711855818167647, + "learning_rate": 3.7913103329068778e-06, + "loss": 0.0177, + "step": 27921 + }, + { + "epoch": 3.311039962053836, + "grad_norm": 0.565958552658857, + "learning_rate": 3.7900395842425247e-06, + "loss": 0.0286, + "step": 27922 + }, + { + "epoch": 3.311158543815961, + "grad_norm": 0.5180750146375289, + "learning_rate": 3.7887690311079887e-06, + "loss": 0.0335, + "step": 27923 + }, + { + "epoch": 3.311277125578086, + "grad_norm": 0.6490747241949512, + "learning_rate": 3.7874986735149942e-06, + "loss": 0.0302, + "step": 27924 + }, + { + "epoch": 3.311395707340211, + "grad_norm": 0.5256419334847869, + "learning_rate": 3.7862285114752457e-06, + "loss": 0.0263, + "step": 27925 + }, + { + "epoch": 3.311514289102336, + "grad_norm": 0.6091180760840573, + "learning_rate": 3.7849585450004583e-06, + "loss": 0.0322, + "step": 27926 + }, + { + "epoch": 3.311632870864461, + "grad_norm": 0.5576979669178772, + "learning_rate": 3.7836887741023453e-06, + "loss": 0.0325, + "step": 27927 + }, + { + "epoch": 3.311751452626586, + "grad_norm": 0.4160367442812694, + "learning_rate": 3.7824191987925916e-06, + "loss": 0.0179, + "step": 27928 + }, + { + "epoch": 3.311870034388711, + "grad_norm": 0.6040453182187787, + "learning_rate": 3.781149819082924e-06, + "loss": 0.0362, + "step": 27929 + }, + { + "epoch": 3.311988616150836, + "grad_norm": 0.4860110262950691, + "learning_rate": 3.7798806349850306e-06, + "loss": 0.0219, + "step": 27930 + }, + { + "epoch": 3.312107197912961, + "grad_norm": 0.6134605242545235, + "learning_rate": 3.7786116465106213e-06, + "loss": 0.0414, + "step": 27931 + }, + { + "epoch": 3.312225779675086, + "grad_norm": 0.5148964931884741, + "learning_rate": 3.7773428536713784e-06, + "loss": 0.0264, + "step": 27932 + }, + { + "epoch": 3.312344361437211, + "grad_norm": 0.38317398980751277, + "learning_rate": 3.7760742564790204e-06, + "loss": 0.0155, + "step": 27933 + }, + { + "epoch": 3.312462943199336, + "grad_norm": 0.45294860618672217, + "learning_rate": 3.7748058549452237e-06, + "loss": 0.0186, + "step": 27934 + }, + { + "epoch": 3.3125815249614607, + "grad_norm": 0.7672103858899045, + "learning_rate": 3.773537649081693e-06, + "loss": 0.0322, + "step": 27935 + }, + { + "epoch": 3.312700106723586, + "grad_norm": 0.3688690698581376, + "learning_rate": 3.772269638900111e-06, + "loss": 0.022, + "step": 27936 + }, + { + "epoch": 3.3128186884857107, + "grad_norm": 0.5910304009546803, + "learning_rate": 3.7710018244121762e-06, + "loss": 0.0371, + "step": 27937 + }, + { + "epoch": 3.312937270247836, + "grad_norm": 0.6568927116830952, + "learning_rate": 3.769734205629574e-06, + "loss": 0.0271, + "step": 27938 + }, + { + "epoch": 3.3130558520099607, + "grad_norm": 0.39636780096759283, + "learning_rate": 3.768466782563984e-06, + "loss": 0.0203, + "step": 27939 + }, + { + "epoch": 3.313174433772086, + "grad_norm": 0.3724161498945175, + "learning_rate": 3.767199555227094e-06, + "loss": 0.0162, + "step": 27940 + }, + { + "epoch": 3.313293015534211, + "grad_norm": 0.461646269002203, + "learning_rate": 3.7659325236305887e-06, + "loss": 0.0163, + "step": 27941 + }, + { + "epoch": 3.313411597296336, + "grad_norm": 0.5201020385226258, + "learning_rate": 3.7646656877861506e-06, + "loss": 0.0229, + "step": 27942 + }, + { + "epoch": 3.3135301790584606, + "grad_norm": 0.9601974688238142, + "learning_rate": 3.763399047705443e-06, + "loss": 0.0402, + "step": 27943 + }, + { + "epoch": 3.313648760820586, + "grad_norm": 0.4007365487725, + "learning_rate": 3.7621326034001648e-06, + "loss": 0.0193, + "step": 27944 + }, + { + "epoch": 3.313767342582711, + "grad_norm": 0.5455009808621921, + "learning_rate": 3.7608663548819756e-06, + "loss": 0.0341, + "step": 27945 + }, + { + "epoch": 3.3138859243448358, + "grad_norm": 0.8105290835436278, + "learning_rate": 3.7596003021625524e-06, + "loss": 0.0383, + "step": 27946 + }, + { + "epoch": 3.3140045061069605, + "grad_norm": 0.6134451411182563, + "learning_rate": 3.7583344452535695e-06, + "loss": 0.0304, + "step": 27947 + }, + { + "epoch": 3.3141230878690857, + "grad_norm": 0.39208792806985315, + "learning_rate": 3.7570687841666925e-06, + "loss": 0.0198, + "step": 27948 + }, + { + "epoch": 3.314241669631211, + "grad_norm": 0.6280270950561221, + "learning_rate": 3.7558033189136005e-06, + "loss": 0.0286, + "step": 27949 + }, + { + "epoch": 3.3143602513933357, + "grad_norm": 0.5295908016233951, + "learning_rate": 3.754538049505943e-06, + "loss": 0.0269, + "step": 27950 + }, + { + "epoch": 3.314478833155461, + "grad_norm": 0.4430885723135758, + "learning_rate": 3.753272975955391e-06, + "loss": 0.0235, + "step": 27951 + }, + { + "epoch": 3.3145974149175856, + "grad_norm": 0.4565798859297612, + "learning_rate": 3.752008098273607e-06, + "loss": 0.0222, + "step": 27952 + }, + { + "epoch": 3.314715996679711, + "grad_norm": 0.452390423435813, + "learning_rate": 3.7507434164722606e-06, + "loss": 0.0271, + "step": 27953 + }, + { + "epoch": 3.3148345784418356, + "grad_norm": 0.5491389616672491, + "learning_rate": 3.749478930562997e-06, + "loss": 0.0268, + "step": 27954 + }, + { + "epoch": 3.314953160203961, + "grad_norm": 0.6327876353527695, + "learning_rate": 3.7482146405574766e-06, + "loss": 0.031, + "step": 27955 + }, + { + "epoch": 3.3150717419660856, + "grad_norm": 0.5262766835145923, + "learning_rate": 3.7469505464673566e-06, + "loss": 0.0274, + "step": 27956 + }, + { + "epoch": 3.3151903237282108, + "grad_norm": 0.5342180136252416, + "learning_rate": 3.7456866483042918e-06, + "loss": 0.0295, + "step": 27957 + }, + { + "epoch": 3.3153089054903355, + "grad_norm": 0.7203904943065375, + "learning_rate": 3.7444229460799312e-06, + "loss": 0.0413, + "step": 27958 + }, + { + "epoch": 3.3154274872524607, + "grad_norm": 0.5521678039143647, + "learning_rate": 3.743159439805924e-06, + "loss": 0.0301, + "step": 27959 + }, + { + "epoch": 3.3155460690145855, + "grad_norm": 0.8986933295897125, + "learning_rate": 3.741896129493927e-06, + "loss": 0.0311, + "step": 27960 + }, + { + "epoch": 3.3156646507767107, + "grad_norm": 0.7510228054718425, + "learning_rate": 3.7406330151555756e-06, + "loss": 0.0338, + "step": 27961 + }, + { + "epoch": 3.3157832325388354, + "grad_norm": 0.507020557803987, + "learning_rate": 3.7393700968025164e-06, + "loss": 0.0232, + "step": 27962 + }, + { + "epoch": 3.3159018143009606, + "grad_norm": 0.4047937163101113, + "learning_rate": 3.7381073744463924e-06, + "loss": 0.021, + "step": 27963 + }, + { + "epoch": 3.3160203960630854, + "grad_norm": 0.5645040765000825, + "learning_rate": 3.736844848098847e-06, + "loss": 0.021, + "step": 27964 + }, + { + "epoch": 3.3161389778252106, + "grad_norm": 0.2990751095477559, + "learning_rate": 3.7355825177715215e-06, + "loss": 0.0176, + "step": 27965 + }, + { + "epoch": 3.3162575595873354, + "grad_norm": 0.4515612505105922, + "learning_rate": 3.734320383476039e-06, + "loss": 0.0259, + "step": 27966 + }, + { + "epoch": 3.3163761413494606, + "grad_norm": 0.44923047929978716, + "learning_rate": 3.7330584452240553e-06, + "loss": 0.0241, + "step": 27967 + }, + { + "epoch": 3.3164947231115853, + "grad_norm": 0.6248955303979373, + "learning_rate": 3.7317967030271876e-06, + "loss": 0.0341, + "step": 27968 + }, + { + "epoch": 3.3166133048737105, + "grad_norm": 0.3768532408233588, + "learning_rate": 3.730535156897075e-06, + "loss": 0.0181, + "step": 27969 + }, + { + "epoch": 3.3167318866358353, + "grad_norm": 0.52098556363743, + "learning_rate": 3.7292738068453457e-06, + "loss": 0.0288, + "step": 27970 + }, + { + "epoch": 3.3168504683979605, + "grad_norm": 0.6402990543803196, + "learning_rate": 3.7280126528836335e-06, + "loss": 0.0366, + "step": 27971 + }, + { + "epoch": 3.3169690501600853, + "grad_norm": 0.34162692921760895, + "learning_rate": 3.7267516950235533e-06, + "loss": 0.0179, + "step": 27972 + }, + { + "epoch": 3.3170876319222105, + "grad_norm": 0.6495491965102689, + "learning_rate": 3.7254909332767348e-06, + "loss": 0.0355, + "step": 27973 + }, + { + "epoch": 3.317206213684335, + "grad_norm": 0.3856662679671859, + "learning_rate": 3.7242303676548018e-06, + "loss": 0.0181, + "step": 27974 + }, + { + "epoch": 3.3173247954464604, + "grad_norm": 0.5192057491326741, + "learning_rate": 3.722969998169373e-06, + "loss": 0.0369, + "step": 27975 + }, + { + "epoch": 3.317443377208585, + "grad_norm": 0.5499691820784068, + "learning_rate": 3.7217098248320785e-06, + "loss": 0.0309, + "step": 27976 + }, + { + "epoch": 3.3175619589707104, + "grad_norm": 0.44292906587810515, + "learning_rate": 3.7204498476545113e-06, + "loss": 0.0198, + "step": 27977 + }, + { + "epoch": 3.317680540732835, + "grad_norm": 0.3642732347154108, + "learning_rate": 3.7191900666483174e-06, + "loss": 0.0184, + "step": 27978 + }, + { + "epoch": 3.3177991224949603, + "grad_norm": 0.6392097452913126, + "learning_rate": 3.7179304818250854e-06, + "loss": 0.0288, + "step": 27979 + }, + { + "epoch": 3.317917704257085, + "grad_norm": 0.37353774655458777, + "learning_rate": 3.7166710931964443e-06, + "loss": 0.0178, + "step": 27980 + }, + { + "epoch": 3.3180362860192103, + "grad_norm": 0.6568021049640583, + "learning_rate": 3.7154119007739823e-06, + "loss": 0.0356, + "step": 27981 + }, + { + "epoch": 3.318154867781335, + "grad_norm": 0.6719948972887031, + "learning_rate": 3.7141529045693372e-06, + "loss": 0.0247, + "step": 27982 + }, + { + "epoch": 3.3182734495434603, + "grad_norm": 0.48846611067661794, + "learning_rate": 3.712894104594092e-06, + "loss": 0.0257, + "step": 27983 + }, + { + "epoch": 3.318392031305585, + "grad_norm": 0.3645375297764137, + "learning_rate": 3.7116355008598585e-06, + "loss": 0.0221, + "step": 27984 + }, + { + "epoch": 3.3185106130677102, + "grad_norm": 0.4016251333855859, + "learning_rate": 3.710377093378242e-06, + "loss": 0.0212, + "step": 27985 + }, + { + "epoch": 3.318629194829835, + "grad_norm": 0.5803359385969297, + "learning_rate": 3.709118882160839e-06, + "loss": 0.0315, + "step": 27986 + }, + { + "epoch": 3.31874777659196, + "grad_norm": 0.5182117272106874, + "learning_rate": 3.7078608672192593e-06, + "loss": 0.0299, + "step": 27987 + }, + { + "epoch": 3.318866358354085, + "grad_norm": 0.8642325597230283, + "learning_rate": 3.70660304856508e-06, + "loss": 0.0333, + "step": 27988 + }, + { + "epoch": 3.31898494011621, + "grad_norm": 0.7072066805686882, + "learning_rate": 3.705345426209919e-06, + "loss": 0.0229, + "step": 27989 + }, + { + "epoch": 3.3191035218783353, + "grad_norm": 0.5188054324533439, + "learning_rate": 3.7040880001653564e-06, + "loss": 0.0271, + "step": 27990 + }, + { + "epoch": 3.31922210364046, + "grad_norm": 0.8404015351481868, + "learning_rate": 3.7028307704429914e-06, + "loss": 0.045, + "step": 27991 + }, + { + "epoch": 3.319340685402585, + "grad_norm": 0.5761565366378449, + "learning_rate": 3.701573737054401e-06, + "loss": 0.0254, + "step": 27992 + }, + { + "epoch": 3.31945926716471, + "grad_norm": 0.5668059123847877, + "learning_rate": 3.7003169000111942e-06, + "loss": 0.0228, + "step": 27993 + }, + { + "epoch": 3.3195778489268353, + "grad_norm": 0.3838966081566302, + "learning_rate": 3.6990602593249407e-06, + "loss": 0.0192, + "step": 27994 + }, + { + "epoch": 3.31969643068896, + "grad_norm": 0.7357055343700173, + "learning_rate": 3.697803815007228e-06, + "loss": 0.0447, + "step": 27995 + }, + { + "epoch": 3.319815012451085, + "grad_norm": 0.4847369706902986, + "learning_rate": 3.6965475670696443e-06, + "loss": 0.0226, + "step": 27996 + }, + { + "epoch": 3.31993359421321, + "grad_norm": 0.5773935482585504, + "learning_rate": 3.695291515523766e-06, + "loss": 0.0258, + "step": 27997 + }, + { + "epoch": 3.320052175975335, + "grad_norm": 0.7005667581402969, + "learning_rate": 3.6940356603811817e-06, + "loss": 0.0461, + "step": 27998 + }, + { + "epoch": 3.32017075773746, + "grad_norm": 0.7088733035395607, + "learning_rate": 3.692780001653451e-06, + "loss": 0.0302, + "step": 27999 + }, + { + "epoch": 3.320289339499585, + "grad_norm": 0.7014771802458767, + "learning_rate": 3.6915245393521674e-06, + "loss": 0.0397, + "step": 28000 + }, + { + "epoch": 3.32040792126171, + "grad_norm": 0.6710692705087972, + "learning_rate": 3.6902692734888946e-06, + "loss": 0.031, + "step": 28001 + }, + { + "epoch": 3.320526503023835, + "grad_norm": 0.5804125193351594, + "learning_rate": 3.689014204075214e-06, + "loss": 0.0236, + "step": 28002 + }, + { + "epoch": 3.32064508478596, + "grad_norm": 0.5786323265658855, + "learning_rate": 3.6877593311226754e-06, + "loss": 0.0376, + "step": 28003 + }, + { + "epoch": 3.320763666548085, + "grad_norm": 0.4680126713698062, + "learning_rate": 3.686504654642875e-06, + "loss": 0.0234, + "step": 28004 + }, + { + "epoch": 3.32088224831021, + "grad_norm": 0.3739822594621729, + "learning_rate": 3.685250174647356e-06, + "loss": 0.0169, + "step": 28005 + }, + { + "epoch": 3.321000830072335, + "grad_norm": 0.6005542144108119, + "learning_rate": 3.6839958911476957e-06, + "loss": 0.0293, + "step": 28006 + }, + { + "epoch": 3.32111941183446, + "grad_norm": 0.600122151768237, + "learning_rate": 3.682741804155454e-06, + "loss": 0.0204, + "step": 28007 + }, + { + "epoch": 3.321237993596585, + "grad_norm": 0.5169456174965069, + "learning_rate": 3.6814879136821914e-06, + "loss": 0.0267, + "step": 28008 + }, + { + "epoch": 3.3213565753587098, + "grad_norm": 0.41952371030024277, + "learning_rate": 3.68023421973947e-06, + "loss": 0.0251, + "step": 28009 + }, + { + "epoch": 3.321475157120835, + "grad_norm": 0.613380403844406, + "learning_rate": 3.678980722338843e-06, + "loss": 0.0248, + "step": 28010 + }, + { + "epoch": 3.3215937388829597, + "grad_norm": 0.41477745565639795, + "learning_rate": 3.6777274214918667e-06, + "loss": 0.0205, + "step": 28011 + }, + { + "epoch": 3.321712320645085, + "grad_norm": 0.5767983827835985, + "learning_rate": 3.676474317210099e-06, + "loss": 0.0323, + "step": 28012 + }, + { + "epoch": 3.3218309024072097, + "grad_norm": 0.527543279785441, + "learning_rate": 3.675221409505086e-06, + "loss": 0.0265, + "step": 28013 + }, + { + "epoch": 3.321949484169335, + "grad_norm": 0.5263532042670281, + "learning_rate": 3.6739686983883833e-06, + "loss": 0.0253, + "step": 28014 + }, + { + "epoch": 3.3220680659314596, + "grad_norm": 0.5003301499720302, + "learning_rate": 3.6727161838715384e-06, + "loss": 0.0347, + "step": 28015 + }, + { + "epoch": 3.322186647693585, + "grad_norm": 0.4526756548145851, + "learning_rate": 3.6714638659661017e-06, + "loss": 0.0204, + "step": 28016 + }, + { + "epoch": 3.3223052294557096, + "grad_norm": 0.34054247255804443, + "learning_rate": 3.6702117446836075e-06, + "loss": 0.0191, + "step": 28017 + }, + { + "epoch": 3.322423811217835, + "grad_norm": 0.5299170046057428, + "learning_rate": 3.6689598200356028e-06, + "loss": 0.029, + "step": 28018 + }, + { + "epoch": 3.3225423929799596, + "grad_norm": 0.6543489115925588, + "learning_rate": 3.6677080920336337e-06, + "loss": 0.0274, + "step": 28019 + }, + { + "epoch": 3.3226609747420848, + "grad_norm": 0.3786048111735766, + "learning_rate": 3.666456560689241e-06, + "loss": 0.0136, + "step": 28020 + }, + { + "epoch": 3.3227795565042095, + "grad_norm": 0.565295433327078, + "learning_rate": 3.6652052260139512e-06, + "loss": 0.024, + "step": 28021 + }, + { + "epoch": 3.3228981382663347, + "grad_norm": 0.6161892910005843, + "learning_rate": 3.6639540880193084e-06, + "loss": 0.0414, + "step": 28022 + }, + { + "epoch": 3.3230167200284595, + "grad_norm": 0.7606252932461626, + "learning_rate": 3.6627031467168447e-06, + "loss": 0.0301, + "step": 28023 + }, + { + "epoch": 3.3231353017905847, + "grad_norm": 0.5706944668441977, + "learning_rate": 3.661452402118093e-06, + "loss": 0.02, + "step": 28024 + }, + { + "epoch": 3.3232538835527095, + "grad_norm": 0.399520030938864, + "learning_rate": 3.6602018542345878e-06, + "loss": 0.0194, + "step": 28025 + }, + { + "epoch": 3.3233724653148347, + "grad_norm": 0.5906620495133359, + "learning_rate": 3.6589515030778427e-06, + "loss": 0.0343, + "step": 28026 + }, + { + "epoch": 3.3234910470769594, + "grad_norm": 0.3771141678485449, + "learning_rate": 3.6577013486594037e-06, + "loss": 0.0204, + "step": 28027 + }, + { + "epoch": 3.3236096288390846, + "grad_norm": 0.5426691128695411, + "learning_rate": 3.656451390990784e-06, + "loss": 0.0202, + "step": 28028 + }, + { + "epoch": 3.3237282106012094, + "grad_norm": 0.40352428287080816, + "learning_rate": 3.6552016300835072e-06, + "loss": 0.0293, + "step": 28029 + }, + { + "epoch": 3.3238467923633346, + "grad_norm": 0.46783888485327807, + "learning_rate": 3.6539520659490955e-06, + "loss": 0.023, + "step": 28030 + }, + { + "epoch": 3.3239653741254593, + "grad_norm": 0.438977499987937, + "learning_rate": 3.652702698599078e-06, + "loss": 0.0188, + "step": 28031 + }, + { + "epoch": 3.3240839558875845, + "grad_norm": 0.8397915316957508, + "learning_rate": 3.6514535280449568e-06, + "loss": 0.038, + "step": 28032 + }, + { + "epoch": 3.3242025376497093, + "grad_norm": 0.5300869601500785, + "learning_rate": 3.650204554298256e-06, + "loss": 0.0282, + "step": 28033 + }, + { + "epoch": 3.3243211194118345, + "grad_norm": 0.5841538624461043, + "learning_rate": 3.648955777370486e-06, + "loss": 0.0259, + "step": 28034 + }, + { + "epoch": 3.3244397011739593, + "grad_norm": 0.6893586953947631, + "learning_rate": 3.6477071972731618e-06, + "loss": 0.0249, + "step": 28035 + }, + { + "epoch": 3.3245582829360845, + "grad_norm": 0.919266362745719, + "learning_rate": 3.6464588140178e-06, + "loss": 0.0451, + "step": 28036 + }, + { + "epoch": 3.3246768646982092, + "grad_norm": 0.31234468306981, + "learning_rate": 3.6452106276158916e-06, + "loss": 0.0164, + "step": 28037 + }, + { + "epoch": 3.3247954464603344, + "grad_norm": 0.46134009630138384, + "learning_rate": 3.6439626380789655e-06, + "loss": 0.0225, + "step": 28038 + }, + { + "epoch": 3.3249140282224596, + "grad_norm": 0.5894183127593473, + "learning_rate": 3.6427148454185074e-06, + "loss": 0.0359, + "step": 28039 + }, + { + "epoch": 3.3250326099845844, + "grad_norm": 0.5077930595401028, + "learning_rate": 3.6414672496460357e-06, + "loss": 0.0314, + "step": 28040 + }, + { + "epoch": 3.325151191746709, + "grad_norm": 0.3203834881904532, + "learning_rate": 3.640219850773033e-06, + "loss": 0.0142, + "step": 28041 + }, + { + "epoch": 3.3252697735088343, + "grad_norm": 0.5648886628427189, + "learning_rate": 3.638972648811023e-06, + "loss": 0.0333, + "step": 28042 + }, + { + "epoch": 3.3253883552709596, + "grad_norm": 0.45560795236507723, + "learning_rate": 3.6377256437714834e-06, + "loss": 0.0268, + "step": 28043 + }, + { + "epoch": 3.3255069370330843, + "grad_norm": 0.5407681446324131, + "learning_rate": 3.636478835665916e-06, + "loss": 0.0253, + "step": 28044 + }, + { + "epoch": 3.325625518795209, + "grad_norm": 0.5845661656153494, + "learning_rate": 3.6352322245058192e-06, + "loss": 0.029, + "step": 28045 + }, + { + "epoch": 3.3257441005573343, + "grad_norm": 0.3730653408019406, + "learning_rate": 3.6339858103026786e-06, + "loss": 0.0196, + "step": 28046 + }, + { + "epoch": 3.3258626823194595, + "grad_norm": 0.4115972939778285, + "learning_rate": 3.6327395930679963e-06, + "loss": 0.0211, + "step": 28047 + }, + { + "epoch": 3.3259812640815842, + "grad_norm": 0.6232534330903102, + "learning_rate": 3.631493572813241e-06, + "loss": 0.0267, + "step": 28048 + }, + { + "epoch": 3.326099845843709, + "grad_norm": 0.7927292965195416, + "learning_rate": 3.6302477495499225e-06, + "loss": 0.0514, + "step": 28049 + }, + { + "epoch": 3.326218427605834, + "grad_norm": 0.3276090554670096, + "learning_rate": 3.629002123289507e-06, + "loss": 0.0211, + "step": 28050 + }, + { + "epoch": 3.3263370093679594, + "grad_norm": 0.4115501736728005, + "learning_rate": 3.627756694043491e-06, + "loss": 0.0218, + "step": 28051 + }, + { + "epoch": 3.326455591130084, + "grad_norm": 0.4794069091680505, + "learning_rate": 3.6265114618233396e-06, + "loss": 0.0191, + "step": 28052 + }, + { + "epoch": 3.3265741728922094, + "grad_norm": 0.7565726390411996, + "learning_rate": 3.6252664266405558e-06, + "loss": 0.047, + "step": 28053 + }, + { + "epoch": 3.326692754654334, + "grad_norm": 0.3491216550689933, + "learning_rate": 3.624021588506596e-06, + "loss": 0.0192, + "step": 28054 + }, + { + "epoch": 3.3268113364164593, + "grad_norm": 0.4610518970494569, + "learning_rate": 3.622776947432946e-06, + "loss": 0.0277, + "step": 28055 + }, + { + "epoch": 3.326929918178584, + "grad_norm": 0.6109379171361768, + "learning_rate": 3.6215325034310744e-06, + "loss": 0.0303, + "step": 28056 + }, + { + "epoch": 3.3270484999407093, + "grad_norm": 0.3677642104906407, + "learning_rate": 3.620288256512461e-06, + "loss": 0.0252, + "step": 28057 + }, + { + "epoch": 3.327167081702834, + "grad_norm": 0.48913753054323195, + "learning_rate": 3.6190442066885764e-06, + "loss": 0.0292, + "step": 28058 + }, + { + "epoch": 3.3272856634649592, + "grad_norm": 0.41172662800093573, + "learning_rate": 3.6178003539708736e-06, + "loss": 0.0248, + "step": 28059 + }, + { + "epoch": 3.327404245227084, + "grad_norm": 0.5996753738938778, + "learning_rate": 3.6165566983708434e-06, + "loss": 0.0199, + "step": 28060 + }, + { + "epoch": 3.327522826989209, + "grad_norm": 0.43481290664754274, + "learning_rate": 3.615313239899931e-06, + "loss": 0.0167, + "step": 28061 + }, + { + "epoch": 3.327641408751334, + "grad_norm": 0.6214964218063276, + "learning_rate": 3.6140699785696093e-06, + "loss": 0.0253, + "step": 28062 + }, + { + "epoch": 3.327759990513459, + "grad_norm": 0.7815446503245452, + "learning_rate": 3.6128269143913347e-06, + "loss": 0.0353, + "step": 28063 + }, + { + "epoch": 3.327878572275584, + "grad_norm": 0.51755396997207, + "learning_rate": 3.6115840473765706e-06, + "loss": 0.0262, + "step": 28064 + }, + { + "epoch": 3.327997154037709, + "grad_norm": 0.3830294818192797, + "learning_rate": 3.610341377536777e-06, + "loss": 0.0197, + "step": 28065 + }, + { + "epoch": 3.328115735799834, + "grad_norm": 0.6815812274689803, + "learning_rate": 3.609098904883404e-06, + "loss": 0.0473, + "step": 28066 + }, + { + "epoch": 3.328234317561959, + "grad_norm": 0.5163129764345256, + "learning_rate": 3.6078566294279053e-06, + "loss": 0.0288, + "step": 28067 + }, + { + "epoch": 3.328352899324084, + "grad_norm": 0.4489444154681484, + "learning_rate": 3.6066145511817363e-06, + "loss": 0.0195, + "step": 28068 + }, + { + "epoch": 3.328471481086209, + "grad_norm": 0.6444009358173334, + "learning_rate": 3.605372670156354e-06, + "loss": 0.0295, + "step": 28069 + }, + { + "epoch": 3.328590062848334, + "grad_norm": 0.5561095939441943, + "learning_rate": 3.6041309863631856e-06, + "loss": 0.0301, + "step": 28070 + }, + { + "epoch": 3.328708644610459, + "grad_norm": 0.4936679653552854, + "learning_rate": 3.602889499813705e-06, + "loss": 0.0241, + "step": 28071 + }, + { + "epoch": 3.3288272263725838, + "grad_norm": 0.5673202536835673, + "learning_rate": 3.60164821051934e-06, + "loss": 0.0313, + "step": 28072 + }, + { + "epoch": 3.328945808134709, + "grad_norm": 0.6326998938234467, + "learning_rate": 3.6004071184915362e-06, + "loss": 0.0243, + "step": 28073 + }, + { + "epoch": 3.3290643898968337, + "grad_norm": 0.41470148879702035, + "learning_rate": 3.599166223741737e-06, + "loss": 0.0183, + "step": 28074 + }, + { + "epoch": 3.329182971658959, + "grad_norm": 0.4031317866663014, + "learning_rate": 3.5979255262813818e-06, + "loss": 0.0249, + "step": 28075 + }, + { + "epoch": 3.3293015534210837, + "grad_norm": 0.29308393475607697, + "learning_rate": 3.5966850261219155e-06, + "loss": 0.0195, + "step": 28076 + }, + { + "epoch": 3.329420135183209, + "grad_norm": 0.585692206951662, + "learning_rate": 3.59544472327476e-06, + "loss": 0.0216, + "step": 28077 + }, + { + "epoch": 3.3295387169453337, + "grad_norm": 0.45450503984381924, + "learning_rate": 3.5942046177513567e-06, + "loss": 0.0167, + "step": 28078 + }, + { + "epoch": 3.329657298707459, + "grad_norm": 0.9707911779093686, + "learning_rate": 3.5929647095631375e-06, + "loss": 0.0245, + "step": 28079 + }, + { + "epoch": 3.3297758804695836, + "grad_norm": 0.47294574095832953, + "learning_rate": 3.591724998721538e-06, + "loss": 0.0241, + "step": 28080 + }, + { + "epoch": 3.329894462231709, + "grad_norm": 0.6231830627285894, + "learning_rate": 3.590485485237974e-06, + "loss": 0.0276, + "step": 28081 + }, + { + "epoch": 3.3300130439938336, + "grad_norm": 0.6248541162297487, + "learning_rate": 3.5892461691238806e-06, + "loss": 0.0451, + "step": 28082 + }, + { + "epoch": 3.330131625755959, + "grad_norm": 0.7587986412801594, + "learning_rate": 3.5880070503906822e-06, + "loss": 0.0369, + "step": 28083 + }, + { + "epoch": 3.3302502075180835, + "grad_norm": 0.4578101648548543, + "learning_rate": 3.586768129049803e-06, + "loss": 0.0209, + "step": 28084 + }, + { + "epoch": 3.3303687892802087, + "grad_norm": 0.8573893538332871, + "learning_rate": 3.5855294051126614e-06, + "loss": 0.0446, + "step": 28085 + }, + { + "epoch": 3.3304873710423335, + "grad_norm": 0.262722341495648, + "learning_rate": 3.584290878590679e-06, + "loss": 0.0088, + "step": 28086 + }, + { + "epoch": 3.3306059528044587, + "grad_norm": 0.6404630914846108, + "learning_rate": 3.5830525494952803e-06, + "loss": 0.0303, + "step": 28087 + }, + { + "epoch": 3.330724534566584, + "grad_norm": 0.33515964144961946, + "learning_rate": 3.5818144178378645e-06, + "loss": 0.0161, + "step": 28088 + }, + { + "epoch": 3.3308431163287087, + "grad_norm": 0.5179544145299119, + "learning_rate": 3.580576483629858e-06, + "loss": 0.0204, + "step": 28089 + }, + { + "epoch": 3.3309616980908334, + "grad_norm": 0.6688358466838135, + "learning_rate": 3.5793387468826684e-06, + "loss": 0.0318, + "step": 28090 + }, + { + "epoch": 3.3310802798529586, + "grad_norm": 0.5676513853555679, + "learning_rate": 3.5781012076077147e-06, + "loss": 0.0326, + "step": 28091 + }, + { + "epoch": 3.331198861615084, + "grad_norm": 0.4341343819812523, + "learning_rate": 3.5768638658163933e-06, + "loss": 0.0234, + "step": 28092 + }, + { + "epoch": 3.3313174433772086, + "grad_norm": 0.6747039097722346, + "learning_rate": 3.575626721520117e-06, + "loss": 0.029, + "step": 28093 + }, + { + "epoch": 3.3314360251393333, + "grad_norm": 0.6010494040445135, + "learning_rate": 3.5743897747302885e-06, + "loss": 0.0295, + "step": 28094 + }, + { + "epoch": 3.3315546069014585, + "grad_norm": 0.5279676563012841, + "learning_rate": 3.573153025458312e-06, + "loss": 0.0329, + "step": 28095 + }, + { + "epoch": 3.3316731886635838, + "grad_norm": 0.5850467769433929, + "learning_rate": 3.5719164737155946e-06, + "loss": 0.0304, + "step": 28096 + }, + { + "epoch": 3.3317917704257085, + "grad_norm": 0.5723921264600655, + "learning_rate": 3.5706801195135196e-06, + "loss": 0.0217, + "step": 28097 + }, + { + "epoch": 3.3319103521878333, + "grad_norm": 0.425707664748483, + "learning_rate": 3.5694439628635083e-06, + "loss": 0.0231, + "step": 28098 + }, + { + "epoch": 3.3320289339499585, + "grad_norm": 0.5068019911973674, + "learning_rate": 3.568208003776938e-06, + "loss": 0.0245, + "step": 28099 + }, + { + "epoch": 3.3321475157120837, + "grad_norm": 0.5053680555584119, + "learning_rate": 3.566972242265207e-06, + "loss": 0.0209, + "step": 28100 + }, + { + "epoch": 3.3322660974742084, + "grad_norm": 0.9782055387383342, + "learning_rate": 3.5657366783397096e-06, + "loss": 0.0416, + "step": 28101 + }, + { + "epoch": 3.3323846792363336, + "grad_norm": 0.7590311539037063, + "learning_rate": 3.564501312011842e-06, + "loss": 0.0405, + "step": 28102 + }, + { + "epoch": 3.3325032609984584, + "grad_norm": 0.47014093349953795, + "learning_rate": 3.563266143292979e-06, + "loss": 0.0177, + "step": 28103 + }, + { + "epoch": 3.3326218427605836, + "grad_norm": 0.4654670820014841, + "learning_rate": 3.562031172194516e-06, + "loss": 0.0275, + "step": 28104 + }, + { + "epoch": 3.3327404245227084, + "grad_norm": 0.5769480322795792, + "learning_rate": 3.5607963987278365e-06, + "loss": 0.0291, + "step": 28105 + }, + { + "epoch": 3.3328590062848336, + "grad_norm": 0.6771737861561109, + "learning_rate": 3.5595618229043224e-06, + "loss": 0.027, + "step": 28106 + }, + { + "epoch": 3.3329775880469583, + "grad_norm": 0.5253928546738901, + "learning_rate": 3.5583274447353655e-06, + "loss": 0.0226, + "step": 28107 + }, + { + "epoch": 3.3330961698090835, + "grad_norm": 0.5746649306775725, + "learning_rate": 3.5570932642323224e-06, + "loss": 0.0215, + "step": 28108 + }, + { + "epoch": 3.3332147515712083, + "grad_norm": 0.3147747205986944, + "learning_rate": 3.5558592814065954e-06, + "loss": 0.016, + "step": 28109 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.8348174857758743, + "learning_rate": 3.5546254962695424e-06, + "loss": 0.0176, + "step": 28110 + }, + { + "epoch": 3.3334519150954582, + "grad_norm": 0.6678627128345974, + "learning_rate": 3.5533919088325512e-06, + "loss": 0.0344, + "step": 28111 + }, + { + "epoch": 3.3335704968575834, + "grad_norm": 0.5152558499657043, + "learning_rate": 3.552158519106977e-06, + "loss": 0.032, + "step": 28112 + }, + { + "epoch": 3.333689078619708, + "grad_norm": 0.47597827539886134, + "learning_rate": 3.55092532710421e-06, + "loss": 0.0231, + "step": 28113 + }, + { + "epoch": 3.3338076603818334, + "grad_norm": 0.3453121272420612, + "learning_rate": 3.5496923328356025e-06, + "loss": 0.0156, + "step": 28114 + }, + { + "epoch": 3.333926242143958, + "grad_norm": 0.4920706437033826, + "learning_rate": 3.548459536312529e-06, + "loss": 0.0281, + "step": 28115 + }, + { + "epoch": 3.3340448239060834, + "grad_norm": 0.49169109796493404, + "learning_rate": 3.5472269375463522e-06, + "loss": 0.0262, + "step": 28116 + }, + { + "epoch": 3.334163405668208, + "grad_norm": 0.5203376014657446, + "learning_rate": 3.5459945365484387e-06, + "loss": 0.0233, + "step": 28117 + }, + { + "epoch": 3.3342819874303333, + "grad_norm": 0.6001043844221207, + "learning_rate": 3.5447623333301484e-06, + "loss": 0.036, + "step": 28118 + }, + { + "epoch": 3.334400569192458, + "grad_norm": 0.7681787758018628, + "learning_rate": 3.5435303279028304e-06, + "loss": 0.0513, + "step": 28119 + }, + { + "epoch": 3.3345191509545833, + "grad_norm": 0.6154166445130955, + "learning_rate": 3.542298520277862e-06, + "loss": 0.0204, + "step": 28120 + }, + { + "epoch": 3.334637732716708, + "grad_norm": 0.4382384387240891, + "learning_rate": 3.5410669104665812e-06, + "loss": 0.0236, + "step": 28121 + }, + { + "epoch": 3.3347563144788333, + "grad_norm": 0.5801112478509725, + "learning_rate": 3.5398354984803513e-06, + "loss": 0.0283, + "step": 28122 + }, + { + "epoch": 3.334874896240958, + "grad_norm": 0.5480644680053263, + "learning_rate": 3.5386042843305184e-06, + "loss": 0.0148, + "step": 28123 + }, + { + "epoch": 3.334993478003083, + "grad_norm": 0.34893208255103386, + "learning_rate": 3.5373732680284406e-06, + "loss": 0.025, + "step": 28124 + }, + { + "epoch": 3.335112059765208, + "grad_norm": 1.052235588902905, + "learning_rate": 3.5361424495854637e-06, + "loss": 0.0308, + "step": 28125 + }, + { + "epoch": 3.335230641527333, + "grad_norm": 0.8706279129193831, + "learning_rate": 3.53491182901293e-06, + "loss": 0.0525, + "step": 28126 + }, + { + "epoch": 3.335349223289458, + "grad_norm": 0.36105536489882395, + "learning_rate": 3.533681406322184e-06, + "loss": 0.0146, + "step": 28127 + }, + { + "epoch": 3.335467805051583, + "grad_norm": 0.4947296149863641, + "learning_rate": 3.5324511815245738e-06, + "loss": 0.0216, + "step": 28128 + }, + { + "epoch": 3.335586386813708, + "grad_norm": 0.5136903866008311, + "learning_rate": 3.531221154631445e-06, + "loss": 0.024, + "step": 28129 + }, + { + "epoch": 3.335704968575833, + "grad_norm": 0.6793228375537872, + "learning_rate": 3.529991325654117e-06, + "loss": 0.036, + "step": 28130 + }, + { + "epoch": 3.335823550337958, + "grad_norm": 0.5338875034102847, + "learning_rate": 3.528761694603955e-06, + "loss": 0.0209, + "step": 28131 + }, + { + "epoch": 3.335942132100083, + "grad_norm": 0.5862167220335227, + "learning_rate": 3.5275322614922724e-06, + "loss": 0.0214, + "step": 28132 + }, + { + "epoch": 3.336060713862208, + "grad_norm": 0.6480100057532668, + "learning_rate": 3.5263030263304104e-06, + "loss": 0.0413, + "step": 28133 + }, + { + "epoch": 3.336179295624333, + "grad_norm": 0.6003407440108759, + "learning_rate": 3.5250739891297014e-06, + "loss": 0.0256, + "step": 28134 + }, + { + "epoch": 3.3362978773864578, + "grad_norm": 0.3843561991177664, + "learning_rate": 3.523845149901478e-06, + "loss": 0.0171, + "step": 28135 + }, + { + "epoch": 3.336416459148583, + "grad_norm": 0.4323950652925302, + "learning_rate": 3.522616508657073e-06, + "loss": 0.0265, + "step": 28136 + }, + { + "epoch": 3.3365350409107077, + "grad_norm": 0.43568400628398984, + "learning_rate": 3.5213880654077965e-06, + "loss": 0.0201, + "step": 28137 + }, + { + "epoch": 3.336653622672833, + "grad_norm": 0.3827981640825874, + "learning_rate": 3.5201598201649865e-06, + "loss": 0.0169, + "step": 28138 + }, + { + "epoch": 3.3367722044349577, + "grad_norm": 0.6505375407955951, + "learning_rate": 3.518931772939962e-06, + "loss": 0.0262, + "step": 28139 + }, + { + "epoch": 3.336890786197083, + "grad_norm": 0.6144644519321392, + "learning_rate": 3.517703923744051e-06, + "loss": 0.0343, + "step": 28140 + }, + { + "epoch": 3.337009367959208, + "grad_norm": 0.49083967986302557, + "learning_rate": 3.5164762725885537e-06, + "loss": 0.0202, + "step": 28141 + }, + { + "epoch": 3.337127949721333, + "grad_norm": 0.4664760165563314, + "learning_rate": 3.515248819484812e-06, + "loss": 0.0244, + "step": 28142 + }, + { + "epoch": 3.3372465314834576, + "grad_norm": 0.5187731389267322, + "learning_rate": 3.5140215644441257e-06, + "loss": 0.0238, + "step": 28143 + }, + { + "epoch": 3.337365113245583, + "grad_norm": 0.5505640223032221, + "learning_rate": 3.5127945074778123e-06, + "loss": 0.0265, + "step": 28144 + }, + { + "epoch": 3.337483695007708, + "grad_norm": 0.5460465033822745, + "learning_rate": 3.511567648597183e-06, + "loss": 0.0204, + "step": 28145 + }, + { + "epoch": 3.337602276769833, + "grad_norm": 0.679222728203132, + "learning_rate": 3.5103409878135512e-06, + "loss": 0.0246, + "step": 28146 + }, + { + "epoch": 3.3377208585319575, + "grad_norm": 0.5812425900420349, + "learning_rate": 3.509114525138227e-06, + "loss": 0.0258, + "step": 28147 + }, + { + "epoch": 3.3378394402940827, + "grad_norm": 0.5783232244920313, + "learning_rate": 3.5078882605825063e-06, + "loss": 0.027, + "step": 28148 + }, + { + "epoch": 3.337958022056208, + "grad_norm": 0.6811948586624061, + "learning_rate": 3.5066621941577032e-06, + "loss": 0.037, + "step": 28149 + }, + { + "epoch": 3.3380766038183327, + "grad_norm": 0.4999890305271869, + "learning_rate": 3.5054363258751166e-06, + "loss": 0.0255, + "step": 28150 + }, + { + "epoch": 3.338195185580458, + "grad_norm": 0.431087261683757, + "learning_rate": 3.504210655746054e-06, + "loss": 0.0211, + "step": 28151 + }, + { + "epoch": 3.3383137673425827, + "grad_norm": 0.5176278894171434, + "learning_rate": 3.502985183781804e-06, + "loss": 0.0285, + "step": 28152 + }, + { + "epoch": 3.338432349104708, + "grad_norm": 0.6703594464056748, + "learning_rate": 3.501759909993668e-06, + "loss": 0.0246, + "step": 28153 + }, + { + "epoch": 3.3385509308668326, + "grad_norm": 0.35800519513987067, + "learning_rate": 3.5005348343929434e-06, + "loss": 0.0129, + "step": 28154 + }, + { + "epoch": 3.338669512628958, + "grad_norm": 0.4615562480378645, + "learning_rate": 3.4993099569909233e-06, + "loss": 0.026, + "step": 28155 + }, + { + "epoch": 3.3387880943910826, + "grad_norm": 0.4370132220629775, + "learning_rate": 3.4980852777989014e-06, + "loss": 0.0205, + "step": 28156 + }, + { + "epoch": 3.338906676153208, + "grad_norm": 0.6119920583473725, + "learning_rate": 3.4968607968281636e-06, + "loss": 0.031, + "step": 28157 + }, + { + "epoch": 3.3390252579153326, + "grad_norm": 0.8388948435316774, + "learning_rate": 3.495636514090006e-06, + "loss": 0.0447, + "step": 28158 + }, + { + "epoch": 3.3391438396774578, + "grad_norm": 0.6066256916276969, + "learning_rate": 3.494412429595706e-06, + "loss": 0.0263, + "step": 28159 + }, + { + "epoch": 3.3392624214395825, + "grad_norm": 0.4469551643054615, + "learning_rate": 3.4931885433565487e-06, + "loss": 0.0112, + "step": 28160 + }, + { + "epoch": 3.3393810032017077, + "grad_norm": 0.7365513839262728, + "learning_rate": 3.4919648553838223e-06, + "loss": 0.044, + "step": 28161 + }, + { + "epoch": 3.3394995849638325, + "grad_norm": 0.39811602381549427, + "learning_rate": 3.490741365688807e-06, + "loss": 0.0187, + "step": 28162 + }, + { + "epoch": 3.3396181667259577, + "grad_norm": 0.45403447739426084, + "learning_rate": 3.4895180742827772e-06, + "loss": 0.0251, + "step": 28163 + }, + { + "epoch": 3.3397367484880824, + "grad_norm": 0.42748700205440665, + "learning_rate": 3.488294981177012e-06, + "loss": 0.0168, + "step": 28164 + }, + { + "epoch": 3.3398553302502076, + "grad_norm": 0.4324414018590449, + "learning_rate": 3.4870720863827892e-06, + "loss": 0.0144, + "step": 28165 + }, + { + "epoch": 3.3399739120123324, + "grad_norm": 0.5446977498032737, + "learning_rate": 3.4858493899113804e-06, + "loss": 0.0341, + "step": 28166 + }, + { + "epoch": 3.3400924937744576, + "grad_norm": 0.5072348671264669, + "learning_rate": 3.484626891774062e-06, + "loss": 0.0221, + "step": 28167 + }, + { + "epoch": 3.3402110755365824, + "grad_norm": 0.39573862127927545, + "learning_rate": 3.4834045919820896e-06, + "loss": 0.0245, + "step": 28168 + }, + { + "epoch": 3.3403296572987076, + "grad_norm": 0.3123242508177821, + "learning_rate": 3.482182490546751e-06, + "loss": 0.0123, + "step": 28169 + }, + { + "epoch": 3.3404482390608323, + "grad_norm": 0.44129500391893756, + "learning_rate": 3.4809605874792983e-06, + "loss": 0.0194, + "step": 28170 + }, + { + "epoch": 3.3405668208229575, + "grad_norm": 0.8298588746452855, + "learning_rate": 3.4797388827910006e-06, + "loss": 0.0407, + "step": 28171 + }, + { + "epoch": 3.3406854025850823, + "grad_norm": 0.5176232142187198, + "learning_rate": 3.4785173764931207e-06, + "loss": 0.0313, + "step": 28172 + }, + { + "epoch": 3.3408039843472075, + "grad_norm": 0.6224652256119334, + "learning_rate": 3.4772960685969192e-06, + "loss": 0.0279, + "step": 28173 + }, + { + "epoch": 3.3409225661093322, + "grad_norm": 0.7424210238117682, + "learning_rate": 3.476074959113662e-06, + "loss": 0.0363, + "step": 28174 + }, + { + "epoch": 3.3410411478714575, + "grad_norm": 0.5250708697314654, + "learning_rate": 3.474854048054585e-06, + "loss": 0.0332, + "step": 28175 + }, + { + "epoch": 3.341159729633582, + "grad_norm": 0.49578679792123215, + "learning_rate": 3.473633335430973e-06, + "loss": 0.0336, + "step": 28176 + }, + { + "epoch": 3.3412783113957074, + "grad_norm": 0.7874190527711381, + "learning_rate": 3.4724128212540563e-06, + "loss": 0.0348, + "step": 28177 + }, + { + "epoch": 3.341396893157832, + "grad_norm": 0.771815688609307, + "learning_rate": 3.471192505535098e-06, + "loss": 0.0311, + "step": 28178 + }, + { + "epoch": 3.3415154749199574, + "grad_norm": 0.5518377335865029, + "learning_rate": 3.469972388285336e-06, + "loss": 0.0239, + "step": 28179 + }, + { + "epoch": 3.341634056682082, + "grad_norm": 0.7384662469267087, + "learning_rate": 3.4687524695160375e-06, + "loss": 0.0299, + "step": 28180 + }, + { + "epoch": 3.3417526384442073, + "grad_norm": 0.6198869956149893, + "learning_rate": 3.4675327492384334e-06, + "loss": 0.0272, + "step": 28181 + }, + { + "epoch": 3.341871220206332, + "grad_norm": 0.6633283820599924, + "learning_rate": 3.4663132274637717e-06, + "loss": 0.0299, + "step": 28182 + }, + { + "epoch": 3.3419898019684573, + "grad_norm": 0.6970979278826498, + "learning_rate": 3.4650939042032955e-06, + "loss": 0.0414, + "step": 28183 + }, + { + "epoch": 3.342108383730582, + "grad_norm": 0.31735932413214174, + "learning_rate": 3.463874779468246e-06, + "loss": 0.0161, + "step": 28184 + }, + { + "epoch": 3.3422269654927073, + "grad_norm": 0.297029650320158, + "learning_rate": 3.462655853269864e-06, + "loss": 0.0124, + "step": 28185 + }, + { + "epoch": 3.342345547254832, + "grad_norm": 0.421756469928052, + "learning_rate": 3.4614371256193766e-06, + "loss": 0.025, + "step": 28186 + }, + { + "epoch": 3.342464129016957, + "grad_norm": 0.4182433039941755, + "learning_rate": 3.4602185965280366e-06, + "loss": 0.0184, + "step": 28187 + }, + { + "epoch": 3.342582710779082, + "grad_norm": 0.41626635490778385, + "learning_rate": 3.459000266007062e-06, + "loss": 0.0242, + "step": 28188 + }, + { + "epoch": 3.342701292541207, + "grad_norm": 0.4107922159423503, + "learning_rate": 3.457782134067697e-06, + "loss": 0.0214, + "step": 28189 + }, + { + "epoch": 3.3428198743033324, + "grad_norm": 0.46892582271562294, + "learning_rate": 3.4565642007211493e-06, + "loss": 0.0209, + "step": 28190 + }, + { + "epoch": 3.342938456065457, + "grad_norm": 0.6913526791788209, + "learning_rate": 3.455346465978676e-06, + "loss": 0.0309, + "step": 28191 + }, + { + "epoch": 3.343057037827582, + "grad_norm": 0.6507970778573601, + "learning_rate": 3.45412892985148e-06, + "loss": 0.0344, + "step": 28192 + }, + { + "epoch": 3.343175619589707, + "grad_norm": 0.7055543410222099, + "learning_rate": 3.4529115923507937e-06, + "loss": 0.0401, + "step": 28193 + }, + { + "epoch": 3.3432942013518323, + "grad_norm": 0.9199076220500583, + "learning_rate": 3.4516944534878414e-06, + "loss": 0.0323, + "step": 28194 + }, + { + "epoch": 3.343412783113957, + "grad_norm": 0.4713967122509957, + "learning_rate": 3.4504775132738423e-06, + "loss": 0.0257, + "step": 28195 + }, + { + "epoch": 3.343531364876082, + "grad_norm": 0.43834867512691517, + "learning_rate": 3.4492607717200198e-06, + "loss": 0.0177, + "step": 28196 + }, + { + "epoch": 3.343649946638207, + "grad_norm": 0.5692708131064051, + "learning_rate": 3.44804422883758e-06, + "loss": 0.0323, + "step": 28197 + }, + { + "epoch": 3.3437685284003322, + "grad_norm": 0.6414294748845712, + "learning_rate": 3.4468278846377435e-06, + "loss": 0.0297, + "step": 28198 + }, + { + "epoch": 3.343887110162457, + "grad_norm": 0.3716064267615827, + "learning_rate": 3.445611739131724e-06, + "loss": 0.0138, + "step": 28199 + }, + { + "epoch": 3.344005691924582, + "grad_norm": 0.4221718005718703, + "learning_rate": 3.4443957923307376e-06, + "loss": 0.0211, + "step": 28200 + }, + { + "epoch": 3.344124273686707, + "grad_norm": 0.5930673146564932, + "learning_rate": 3.443180044245978e-06, + "loss": 0.029, + "step": 28201 + }, + { + "epoch": 3.344242855448832, + "grad_norm": 0.8560265164156348, + "learning_rate": 3.4419644948886748e-06, + "loss": 0.0304, + "step": 28202 + }, + { + "epoch": 3.344361437210957, + "grad_norm": 0.6882309866297076, + "learning_rate": 3.4407491442700196e-06, + "loss": 0.0336, + "step": 28203 + }, + { + "epoch": 3.344480018973082, + "grad_norm": 0.3099078721293835, + "learning_rate": 3.439533992401217e-06, + "loss": 0.0146, + "step": 28204 + }, + { + "epoch": 3.344598600735207, + "grad_norm": 0.5778606235366471, + "learning_rate": 3.4383190392934693e-06, + "loss": 0.0328, + "step": 28205 + }, + { + "epoch": 3.344717182497332, + "grad_norm": 0.43267152790615265, + "learning_rate": 3.4371042849579845e-06, + "loss": 0.0205, + "step": 28206 + }, + { + "epoch": 3.344835764259457, + "grad_norm": 0.3736479895656624, + "learning_rate": 3.4358897294059582e-06, + "loss": 0.0218, + "step": 28207 + }, + { + "epoch": 3.344954346021582, + "grad_norm": 0.4059210344631899, + "learning_rate": 3.43467537264858e-06, + "loss": 0.0229, + "step": 28208 + }, + { + "epoch": 3.345072927783707, + "grad_norm": 0.31038804618952504, + "learning_rate": 3.433461214697051e-06, + "loss": 0.0136, + "step": 28209 + }, + { + "epoch": 3.345191509545832, + "grad_norm": 0.7357423986705598, + "learning_rate": 3.4322472555625624e-06, + "loss": 0.043, + "step": 28210 + }, + { + "epoch": 3.3453100913079568, + "grad_norm": 0.5754216981827954, + "learning_rate": 3.4310334952563112e-06, + "loss": 0.0331, + "step": 28211 + }, + { + "epoch": 3.345428673070082, + "grad_norm": 0.393781598787282, + "learning_rate": 3.4298199337894685e-06, + "loss": 0.0177, + "step": 28212 + }, + { + "epoch": 3.3455472548322067, + "grad_norm": 0.5349602623237609, + "learning_rate": 3.4286065711732486e-06, + "loss": 0.0357, + "step": 28213 + }, + { + "epoch": 3.345665836594332, + "grad_norm": 0.41427778777841306, + "learning_rate": 3.427393407418816e-06, + "loss": 0.017, + "step": 28214 + }, + { + "epoch": 3.3457844183564567, + "grad_norm": 0.6126424996321812, + "learning_rate": 3.426180442537366e-06, + "loss": 0.0344, + "step": 28215 + }, + { + "epoch": 3.345903000118582, + "grad_norm": 0.5297127237189342, + "learning_rate": 3.4249676765400723e-06, + "loss": 0.02, + "step": 28216 + }, + { + "epoch": 3.3460215818807066, + "grad_norm": 0.7270795082679435, + "learning_rate": 3.423755109438123e-06, + "loss": 0.0348, + "step": 28217 + }, + { + "epoch": 3.346140163642832, + "grad_norm": 0.4665862566807893, + "learning_rate": 3.422542741242696e-06, + "loss": 0.025, + "step": 28218 + }, + { + "epoch": 3.3462587454049566, + "grad_norm": 0.398971612099775, + "learning_rate": 3.421330571964959e-06, + "loss": 0.0163, + "step": 28219 + }, + { + "epoch": 3.346377327167082, + "grad_norm": 0.37470988849226766, + "learning_rate": 3.420118601616096e-06, + "loss": 0.0193, + "step": 28220 + }, + { + "epoch": 3.3464959089292066, + "grad_norm": 0.637888952899598, + "learning_rate": 3.4189068302072745e-06, + "loss": 0.0326, + "step": 28221 + }, + { + "epoch": 3.3466144906913318, + "grad_norm": 0.5589683634580547, + "learning_rate": 3.4176952577496673e-06, + "loss": 0.0244, + "step": 28222 + }, + { + "epoch": 3.3467330724534565, + "grad_norm": 0.5538158794332383, + "learning_rate": 3.4164838842544485e-06, + "loss": 0.0299, + "step": 28223 + }, + { + "epoch": 3.3468516542155817, + "grad_norm": 0.4858986764339397, + "learning_rate": 3.4152727097327726e-06, + "loss": 0.0242, + "step": 28224 + }, + { + "epoch": 3.3469702359777065, + "grad_norm": 0.4365959285271395, + "learning_rate": 3.4140617341958225e-06, + "loss": 0.0197, + "step": 28225 + }, + { + "epoch": 3.3470888177398317, + "grad_norm": 0.688776736755611, + "learning_rate": 3.412850957654748e-06, + "loss": 0.0263, + "step": 28226 + }, + { + "epoch": 3.3472073995019564, + "grad_norm": 0.821301857466055, + "learning_rate": 3.411640380120715e-06, + "loss": 0.05, + "step": 28227 + }, + { + "epoch": 3.3473259812640817, + "grad_norm": 0.5875648660628294, + "learning_rate": 3.4104300016048835e-06, + "loss": 0.027, + "step": 28228 + }, + { + "epoch": 3.3474445630262064, + "grad_norm": 0.37481771779192163, + "learning_rate": 3.4092198221184203e-06, + "loss": 0.0218, + "step": 28229 + }, + { + "epoch": 3.3475631447883316, + "grad_norm": 0.6710333324951044, + "learning_rate": 3.408009841672466e-06, + "loss": 0.0307, + "step": 28230 + }, + { + "epoch": 3.3476817265504564, + "grad_norm": 0.7815898042557843, + "learning_rate": 3.406800060278184e-06, + "loss": 0.0605, + "step": 28231 + }, + { + "epoch": 3.3478003083125816, + "grad_norm": 0.3620847863955289, + "learning_rate": 3.4055904779467267e-06, + "loss": 0.0189, + "step": 28232 + }, + { + "epoch": 3.3479188900747063, + "grad_norm": 0.5697120988659441, + "learning_rate": 3.4043810946892436e-06, + "loss": 0.0279, + "step": 28233 + }, + { + "epoch": 3.3480374718368315, + "grad_norm": 0.7277329855884003, + "learning_rate": 3.403171910516892e-06, + "loss": 0.033, + "step": 28234 + }, + { + "epoch": 3.3481560535989563, + "grad_norm": 0.48107042099226494, + "learning_rate": 3.401962925440799e-06, + "loss": 0.0233, + "step": 28235 + }, + { + "epoch": 3.3482746353610815, + "grad_norm": 0.814924486188585, + "learning_rate": 3.4007541394721314e-06, + "loss": 0.0348, + "step": 28236 + }, + { + "epoch": 3.3483932171232063, + "grad_norm": 0.3300562348123815, + "learning_rate": 3.3995455526220215e-06, + "loss": 0.0167, + "step": 28237 + }, + { + "epoch": 3.3485117988853315, + "grad_norm": 0.5334158980364886, + "learning_rate": 3.398337164901619e-06, + "loss": 0.0247, + "step": 28238 + }, + { + "epoch": 3.3486303806474567, + "grad_norm": 0.6884143914578076, + "learning_rate": 3.397128976322045e-06, + "loss": 0.0386, + "step": 28239 + }, + { + "epoch": 3.3487489624095814, + "grad_norm": 0.3016313024102435, + "learning_rate": 3.395920986894463e-06, + "loss": 0.0155, + "step": 28240 + }, + { + "epoch": 3.348867544171706, + "grad_norm": 0.5251405141079009, + "learning_rate": 3.394713196629992e-06, + "loss": 0.0306, + "step": 28241 + }, + { + "epoch": 3.3489861259338314, + "grad_norm": 0.500267518370771, + "learning_rate": 3.3935056055397706e-06, + "loss": 0.0227, + "step": 28242 + }, + { + "epoch": 3.3491047076959566, + "grad_norm": 0.48006827956788833, + "learning_rate": 3.392298213634931e-06, + "loss": 0.0285, + "step": 28243 + }, + { + "epoch": 3.3492232894580813, + "grad_norm": 0.9194337498369566, + "learning_rate": 3.3910910209266037e-06, + "loss": 0.0517, + "step": 28244 + }, + { + "epoch": 3.349341871220206, + "grad_norm": 0.617679520410546, + "learning_rate": 3.389884027425927e-06, + "loss": 0.0348, + "step": 28245 + }, + { + "epoch": 3.3494604529823313, + "grad_norm": 0.46605060881192145, + "learning_rate": 3.388677233144008e-06, + "loss": 0.0155, + "step": 28246 + }, + { + "epoch": 3.3495790347444565, + "grad_norm": 0.3551502916553192, + "learning_rate": 3.387470638091994e-06, + "loss": 0.027, + "step": 28247 + }, + { + "epoch": 3.3496976165065813, + "grad_norm": 0.8831668822300798, + "learning_rate": 3.38626424228099e-06, + "loss": 0.0487, + "step": 28248 + }, + { + "epoch": 3.349816198268706, + "grad_norm": 0.4704423394553142, + "learning_rate": 3.385058045722134e-06, + "loss": 0.0242, + "step": 28249 + }, + { + "epoch": 3.3499347800308312, + "grad_norm": 0.387456952230621, + "learning_rate": 3.383852048426525e-06, + "loss": 0.0211, + "step": 28250 + }, + { + "epoch": 3.3500533617929564, + "grad_norm": 0.7397142725057542, + "learning_rate": 3.3826462504053026e-06, + "loss": 0.0289, + "step": 28251 + }, + { + "epoch": 3.350171943555081, + "grad_norm": 0.41774932061219394, + "learning_rate": 3.381440651669568e-06, + "loss": 0.0161, + "step": 28252 + }, + { + "epoch": 3.3502905253172064, + "grad_norm": 0.45560828738774256, + "learning_rate": 3.3802352522304377e-06, + "loss": 0.0216, + "step": 28253 + }, + { + "epoch": 3.350409107079331, + "grad_norm": 0.7691938157595236, + "learning_rate": 3.3790300520990304e-06, + "loss": 0.0216, + "step": 28254 + }, + { + "epoch": 3.3505276888414564, + "grad_norm": 0.7069236850242283, + "learning_rate": 3.3778250512864513e-06, + "loss": 0.0502, + "step": 28255 + }, + { + "epoch": 3.350646270603581, + "grad_norm": 0.7503153169472407, + "learning_rate": 3.3766202498038136e-06, + "loss": 0.0337, + "step": 28256 + }, + { + "epoch": 3.3507648523657063, + "grad_norm": 0.742307100488776, + "learning_rate": 3.3754156476622112e-06, + "loss": 0.0338, + "step": 28257 + }, + { + "epoch": 3.350883434127831, + "grad_norm": 0.5482687384236419, + "learning_rate": 3.3742112448727713e-06, + "loss": 0.0185, + "step": 28258 + }, + { + "epoch": 3.3510020158899563, + "grad_norm": 0.4111845962248236, + "learning_rate": 3.373007041446577e-06, + "loss": 0.0167, + "step": 28259 + }, + { + "epoch": 3.351120597652081, + "grad_norm": 0.5923859132309279, + "learning_rate": 3.371803037394744e-06, + "loss": 0.0305, + "step": 28260 + }, + { + "epoch": 3.3512391794142062, + "grad_norm": 0.4291911402438124, + "learning_rate": 3.3705992327283523e-06, + "loss": 0.0235, + "step": 28261 + }, + { + "epoch": 3.351357761176331, + "grad_norm": 0.7255319151473992, + "learning_rate": 3.369395627458524e-06, + "loss": 0.0341, + "step": 28262 + }, + { + "epoch": 3.351476342938456, + "grad_norm": 0.43760583108322504, + "learning_rate": 3.3681922215963364e-06, + "loss": 0.0174, + "step": 28263 + }, + { + "epoch": 3.351594924700581, + "grad_norm": 0.4573631757499597, + "learning_rate": 3.366989015152891e-06, + "loss": 0.0218, + "step": 28264 + }, + { + "epoch": 3.351713506462706, + "grad_norm": 0.4527573300220819, + "learning_rate": 3.3657860081392794e-06, + "loss": 0.0187, + "step": 28265 + }, + { + "epoch": 3.351832088224831, + "grad_norm": 0.4182659925900208, + "learning_rate": 3.36458320056659e-06, + "loss": 0.0201, + "step": 28266 + }, + { + "epoch": 3.351950669986956, + "grad_norm": 0.3959800589725202, + "learning_rate": 3.3633805924459164e-06, + "loss": 0.018, + "step": 28267 + }, + { + "epoch": 3.352069251749081, + "grad_norm": 0.5786280379278526, + "learning_rate": 3.3621781837883365e-06, + "loss": 0.0345, + "step": 28268 + }, + { + "epoch": 3.352187833511206, + "grad_norm": 0.5227350376922945, + "learning_rate": 3.3609759746049407e-06, + "loss": 0.0228, + "step": 28269 + }, + { + "epoch": 3.352306415273331, + "grad_norm": 0.7540154162815684, + "learning_rate": 3.3597739649068123e-06, + "loss": 0.0437, + "step": 28270 + }, + { + "epoch": 3.352424997035456, + "grad_norm": 0.39977722437406304, + "learning_rate": 3.358572154705028e-06, + "loss": 0.0135, + "step": 28271 + }, + { + "epoch": 3.352543578797581, + "grad_norm": 0.6107006057939457, + "learning_rate": 3.357370544010671e-06, + "loss": 0.0317, + "step": 28272 + }, + { + "epoch": 3.352662160559706, + "grad_norm": 0.9037926939413755, + "learning_rate": 3.3561691328348187e-06, + "loss": 0.045, + "step": 28273 + }, + { + "epoch": 3.3527807423218308, + "grad_norm": 0.3476388378877911, + "learning_rate": 3.3549679211885483e-06, + "loss": 0.0233, + "step": 28274 + }, + { + "epoch": 3.352899324083956, + "grad_norm": 0.3751134512711658, + "learning_rate": 3.353766909082928e-06, + "loss": 0.0162, + "step": 28275 + }, + { + "epoch": 3.3530179058460807, + "grad_norm": 0.6389063091231484, + "learning_rate": 3.3525660965290307e-06, + "loss": 0.0284, + "step": 28276 + }, + { + "epoch": 3.353136487608206, + "grad_norm": 0.8694600952881567, + "learning_rate": 3.35136548353793e-06, + "loss": 0.04, + "step": 28277 + }, + { + "epoch": 3.3532550693703307, + "grad_norm": 0.5785908495121403, + "learning_rate": 3.350165070120698e-06, + "loss": 0.0277, + "step": 28278 + }, + { + "epoch": 3.353373651132456, + "grad_norm": 0.6922427362905814, + "learning_rate": 3.3489648562883864e-06, + "loss": 0.0281, + "step": 28279 + }, + { + "epoch": 3.3534922328945806, + "grad_norm": 0.3948664622085124, + "learning_rate": 3.34776484205207e-06, + "loss": 0.02, + "step": 28280 + }, + { + "epoch": 3.353610814656706, + "grad_norm": 0.45422896888424386, + "learning_rate": 3.3465650274228127e-06, + "loss": 0.0197, + "step": 28281 + }, + { + "epoch": 3.3537293964188306, + "grad_norm": 0.375289348074269, + "learning_rate": 3.345365412411669e-06, + "loss": 0.0178, + "step": 28282 + }, + { + "epoch": 3.353847978180956, + "grad_norm": 0.5028740842507217, + "learning_rate": 3.344165997029711e-06, + "loss": 0.0235, + "step": 28283 + }, + { + "epoch": 3.3539665599430806, + "grad_norm": 0.5904841985632487, + "learning_rate": 3.3429667812879706e-06, + "loss": 0.021, + "step": 28284 + }, + { + "epoch": 3.3540851417052058, + "grad_norm": 0.6903163965935314, + "learning_rate": 3.341767765197534e-06, + "loss": 0.0402, + "step": 28285 + }, + { + "epoch": 3.3542037234673305, + "grad_norm": 0.48220343875443233, + "learning_rate": 3.340568948769432e-06, + "loss": 0.0176, + "step": 28286 + }, + { + "epoch": 3.3543223052294557, + "grad_norm": 0.4369683633055692, + "learning_rate": 3.339370332014724e-06, + "loss": 0.0204, + "step": 28287 + }, + { + "epoch": 3.354440886991581, + "grad_norm": 0.7765882707750833, + "learning_rate": 3.33817191494446e-06, + "loss": 0.0413, + "step": 28288 + }, + { + "epoch": 3.3545594687537057, + "grad_norm": 0.3875280823679829, + "learning_rate": 3.3369736975696925e-06, + "loss": 0.0222, + "step": 28289 + }, + { + "epoch": 3.3546780505158305, + "grad_norm": 0.35713168227977854, + "learning_rate": 3.335775679901457e-06, + "loss": 0.0144, + "step": 28290 + }, + { + "epoch": 3.3547966322779557, + "grad_norm": 0.6828633275817672, + "learning_rate": 3.3345778619508024e-06, + "loss": 0.0307, + "step": 28291 + }, + { + "epoch": 3.354915214040081, + "grad_norm": 0.6983418686527744, + "learning_rate": 3.3333802437287733e-06, + "loss": 0.0383, + "step": 28292 + }, + { + "epoch": 3.3550337958022056, + "grad_norm": 0.44862152932031757, + "learning_rate": 3.3321828252464077e-06, + "loss": 0.0164, + "step": 28293 + }, + { + "epoch": 3.3551523775643304, + "grad_norm": 0.296317269117406, + "learning_rate": 3.3309856065147526e-06, + "loss": 0.0157, + "step": 28294 + }, + { + "epoch": 3.3552709593264556, + "grad_norm": 0.5663689096481667, + "learning_rate": 3.329788587544827e-06, + "loss": 0.0337, + "step": 28295 + }, + { + "epoch": 3.355389541088581, + "grad_norm": 0.5112536538973177, + "learning_rate": 3.328591768347686e-06, + "loss": 0.0251, + "step": 28296 + }, + { + "epoch": 3.3555081228507055, + "grad_norm": 0.5742272987044011, + "learning_rate": 3.327395148934351e-06, + "loss": 0.0351, + "step": 28297 + }, + { + "epoch": 3.3556267046128303, + "grad_norm": 0.4700243949169704, + "learning_rate": 3.3261987293158547e-06, + "loss": 0.0234, + "step": 28298 + }, + { + "epoch": 3.3557452863749555, + "grad_norm": 0.6300278638954473, + "learning_rate": 3.325002509503228e-06, + "loss": 0.0274, + "step": 28299 + }, + { + "epoch": 3.3558638681370807, + "grad_norm": 0.6884447473363666, + "learning_rate": 3.3238064895075027e-06, + "loss": 0.0342, + "step": 28300 + }, + { + "epoch": 3.3559824498992055, + "grad_norm": 0.733897120409, + "learning_rate": 3.3226106693396956e-06, + "loss": 0.0423, + "step": 28301 + }, + { + "epoch": 3.3561010316613307, + "grad_norm": 0.40880298227747935, + "learning_rate": 3.321415049010837e-06, + "loss": 0.0159, + "step": 28302 + }, + { + "epoch": 3.3562196134234554, + "grad_norm": 0.6712995291648227, + "learning_rate": 3.320219628531945e-06, + "loss": 0.0554, + "step": 28303 + }, + { + "epoch": 3.3563381951855806, + "grad_norm": 0.37541223100322735, + "learning_rate": 3.319024407914045e-06, + "loss": 0.0152, + "step": 28304 + }, + { + "epoch": 3.3564567769477054, + "grad_norm": 0.5588199146739184, + "learning_rate": 3.3178293871681587e-06, + "loss": 0.0284, + "step": 28305 + }, + { + "epoch": 3.3565753587098306, + "grad_norm": 0.564553830834946, + "learning_rate": 3.316634566305285e-06, + "loss": 0.0291, + "step": 28306 + }, + { + "epoch": 3.3566939404719554, + "grad_norm": 0.8319811555024731, + "learning_rate": 3.3154399453364597e-06, + "loss": 0.0368, + "step": 28307 + }, + { + "epoch": 3.3568125222340806, + "grad_norm": 0.41711500013233127, + "learning_rate": 3.3142455242726827e-06, + "loss": 0.0228, + "step": 28308 + }, + { + "epoch": 3.3569311039962053, + "grad_norm": 0.5840921157854478, + "learning_rate": 3.3130513031249756e-06, + "loss": 0.0248, + "step": 28309 + }, + { + "epoch": 3.3570496857583305, + "grad_norm": 0.3116702604707591, + "learning_rate": 3.311857281904329e-06, + "loss": 0.0153, + "step": 28310 + }, + { + "epoch": 3.3571682675204553, + "grad_norm": 0.23759318195728477, + "learning_rate": 3.3106634606217767e-06, + "loss": 0.0166, + "step": 28311 + }, + { + "epoch": 3.3572868492825805, + "grad_norm": 0.5914615049529913, + "learning_rate": 3.309469839288301e-06, + "loss": 0.0238, + "step": 28312 + }, + { + "epoch": 3.3574054310447052, + "grad_norm": 0.9034112453667742, + "learning_rate": 3.3082764179149155e-06, + "loss": 0.0395, + "step": 28313 + }, + { + "epoch": 3.3575240128068304, + "grad_norm": 0.6999090978603308, + "learning_rate": 3.3070831965126227e-06, + "loss": 0.0333, + "step": 28314 + }, + { + "epoch": 3.357642594568955, + "grad_norm": 0.2973270249284338, + "learning_rate": 3.3058901750924185e-06, + "loss": 0.0175, + "step": 28315 + }, + { + "epoch": 3.3577611763310804, + "grad_norm": 0.5279634400338282, + "learning_rate": 3.304697353665312e-06, + "loss": 0.0238, + "step": 28316 + }, + { + "epoch": 3.357879758093205, + "grad_norm": 0.5494786807672036, + "learning_rate": 3.30350473224228e-06, + "loss": 0.0287, + "step": 28317 + }, + { + "epoch": 3.3579983398553304, + "grad_norm": 0.4623087306515806, + "learning_rate": 3.3023123108343377e-06, + "loss": 0.0243, + "step": 28318 + }, + { + "epoch": 3.358116921617455, + "grad_norm": 0.614307465774112, + "learning_rate": 3.301120089452464e-06, + "loss": 0.0396, + "step": 28319 + }, + { + "epoch": 3.3582355033795803, + "grad_norm": 0.6861971562801845, + "learning_rate": 3.2999280681076576e-06, + "loss": 0.0431, + "step": 28320 + }, + { + "epoch": 3.358354085141705, + "grad_norm": 0.47554542646041204, + "learning_rate": 3.2987362468108962e-06, + "loss": 0.0273, + "step": 28321 + }, + { + "epoch": 3.3584726669038303, + "grad_norm": 0.4194668112933761, + "learning_rate": 3.2975446255731844e-06, + "loss": 0.0195, + "step": 28322 + }, + { + "epoch": 3.358591248665955, + "grad_norm": 0.3759538621645294, + "learning_rate": 3.2963532044054916e-06, + "loss": 0.0174, + "step": 28323 + }, + { + "epoch": 3.3587098304280802, + "grad_norm": 0.6566019076367076, + "learning_rate": 3.2951619833188093e-06, + "loss": 0.0351, + "step": 28324 + }, + { + "epoch": 3.358828412190205, + "grad_norm": 0.5353165260980502, + "learning_rate": 3.2939709623241168e-06, + "loss": 0.0296, + "step": 28325 + }, + { + "epoch": 3.35894699395233, + "grad_norm": 0.6298458865649835, + "learning_rate": 3.2927801414323916e-06, + "loss": 0.0326, + "step": 28326 + }, + { + "epoch": 3.359065575714455, + "grad_norm": 0.52118428891993, + "learning_rate": 3.2915895206546227e-06, + "loss": 0.0201, + "step": 28327 + }, + { + "epoch": 3.35918415747658, + "grad_norm": 0.6077360486698166, + "learning_rate": 3.2903991000017678e-06, + "loss": 0.0304, + "step": 28328 + }, + { + "epoch": 3.359302739238705, + "grad_norm": 0.6339115303663793, + "learning_rate": 3.2892088794848204e-06, + "loss": 0.0327, + "step": 28329 + }, + { + "epoch": 3.35942132100083, + "grad_norm": 0.5415655500471742, + "learning_rate": 3.288018859114736e-06, + "loss": 0.0257, + "step": 28330 + }, + { + "epoch": 3.359539902762955, + "grad_norm": 0.7228887635321692, + "learning_rate": 3.2868290389024947e-06, + "loss": 0.0312, + "step": 28331 + }, + { + "epoch": 3.35965848452508, + "grad_norm": 0.5098700609539045, + "learning_rate": 3.2856394188590655e-06, + "loss": 0.0282, + "step": 28332 + }, + { + "epoch": 3.359777066287205, + "grad_norm": 0.3709009305161092, + "learning_rate": 3.284449998995409e-06, + "loss": 0.0132, + "step": 28333 + }, + { + "epoch": 3.35989564804933, + "grad_norm": 0.4035985589407649, + "learning_rate": 3.283260779322503e-06, + "loss": 0.0199, + "step": 28334 + }, + { + "epoch": 3.360014229811455, + "grad_norm": 0.5082633641489566, + "learning_rate": 3.2820717598512936e-06, + "loss": 0.0228, + "step": 28335 + }, + { + "epoch": 3.36013281157358, + "grad_norm": 0.24543513422896332, + "learning_rate": 3.2808829405927525e-06, + "loss": 0.011, + "step": 28336 + }, + { + "epoch": 3.360251393335705, + "grad_norm": 0.8444372617941103, + "learning_rate": 3.2796943215578357e-06, + "loss": 0.0381, + "step": 28337 + }, + { + "epoch": 3.36036997509783, + "grad_norm": 0.4035033745690534, + "learning_rate": 3.278505902757506e-06, + "loss": 0.0168, + "step": 28338 + }, + { + "epoch": 3.3604885568599547, + "grad_norm": 0.3906168972241103, + "learning_rate": 3.277317684202713e-06, + "loss": 0.0168, + "step": 28339 + }, + { + "epoch": 3.36060713862208, + "grad_norm": 0.4043237033032577, + "learning_rate": 3.276129665904412e-06, + "loss": 0.0254, + "step": 28340 + }, + { + "epoch": 3.360725720384205, + "grad_norm": 0.46609714245986206, + "learning_rate": 3.2749418478735557e-06, + "loss": 0.0198, + "step": 28341 + }, + { + "epoch": 3.36084430214633, + "grad_norm": 0.5850114594795462, + "learning_rate": 3.2737542301210928e-06, + "loss": 0.0284, + "step": 28342 + }, + { + "epoch": 3.3609628839084547, + "grad_norm": 0.6194569099358317, + "learning_rate": 3.272566812657973e-06, + "loss": 0.0309, + "step": 28343 + }, + { + "epoch": 3.36108146567058, + "grad_norm": 0.6296536219313433, + "learning_rate": 3.271379595495147e-06, + "loss": 0.0207, + "step": 28344 + }, + { + "epoch": 3.361200047432705, + "grad_norm": 0.3002777906276834, + "learning_rate": 3.270192578643558e-06, + "loss": 0.0178, + "step": 28345 + }, + { + "epoch": 3.36131862919483, + "grad_norm": 0.36538319137322256, + "learning_rate": 3.269005762114144e-06, + "loss": 0.0202, + "step": 28346 + }, + { + "epoch": 3.3614372109569546, + "grad_norm": 0.6875228571399811, + "learning_rate": 3.267819145917847e-06, + "loss": 0.0274, + "step": 28347 + }, + { + "epoch": 3.36155579271908, + "grad_norm": 0.48708531276649436, + "learning_rate": 3.2666327300656054e-06, + "loss": 0.0213, + "step": 28348 + }, + { + "epoch": 3.361674374481205, + "grad_norm": 0.6233920366427661, + "learning_rate": 3.265446514568368e-06, + "loss": 0.0294, + "step": 28349 + }, + { + "epoch": 3.3617929562433297, + "grad_norm": 0.485936129631044, + "learning_rate": 3.2642604994370523e-06, + "loss": 0.017, + "step": 28350 + }, + { + "epoch": 3.361911538005455, + "grad_norm": 0.6490520041181146, + "learning_rate": 3.263074684682604e-06, + "loss": 0.0349, + "step": 28351 + }, + { + "epoch": 3.3620301197675797, + "grad_norm": 0.3778299589979411, + "learning_rate": 3.261889070315949e-06, + "loss": 0.018, + "step": 28352 + }, + { + "epoch": 3.362148701529705, + "grad_norm": 0.24763201856572672, + "learning_rate": 3.2607036563480214e-06, + "loss": 0.01, + "step": 28353 + }, + { + "epoch": 3.3622672832918297, + "grad_norm": 0.6222852681457076, + "learning_rate": 3.2595184427897524e-06, + "loss": 0.0313, + "step": 28354 + }, + { + "epoch": 3.362385865053955, + "grad_norm": 0.38295087602238503, + "learning_rate": 3.2583334296520526e-06, + "loss": 0.0274, + "step": 28355 + }, + { + "epoch": 3.3625044468160796, + "grad_norm": 0.8306044752864288, + "learning_rate": 3.257148616945868e-06, + "loss": 0.0362, + "step": 28356 + }, + { + "epoch": 3.362623028578205, + "grad_norm": 0.6050457322072268, + "learning_rate": 3.2559640046821047e-06, + "loss": 0.0263, + "step": 28357 + }, + { + "epoch": 3.3627416103403296, + "grad_norm": 0.49553428361638757, + "learning_rate": 3.2547795928716874e-06, + "loss": 0.0203, + "step": 28358 + }, + { + "epoch": 3.362860192102455, + "grad_norm": 0.6923794698993894, + "learning_rate": 3.25359538152554e-06, + "loss": 0.0345, + "step": 28359 + }, + { + "epoch": 3.3629787738645796, + "grad_norm": 0.371948471771343, + "learning_rate": 3.252411370654579e-06, + "loss": 0.0215, + "step": 28360 + }, + { + "epoch": 3.3630973556267048, + "grad_norm": 0.9802008723587586, + "learning_rate": 3.251227560269712e-06, + "loss": 0.0463, + "step": 28361 + }, + { + "epoch": 3.3632159373888295, + "grad_norm": 0.41142018496002064, + "learning_rate": 3.250043950381856e-06, + "loss": 0.0253, + "step": 28362 + }, + { + "epoch": 3.3633345191509547, + "grad_norm": 0.24279926466844917, + "learning_rate": 3.2488605410019247e-06, + "loss": 0.0108, + "step": 28363 + }, + { + "epoch": 3.3634531009130795, + "grad_norm": 0.5211991166016366, + "learning_rate": 3.247677332140825e-06, + "loss": 0.0221, + "step": 28364 + }, + { + "epoch": 3.3635716826752047, + "grad_norm": 0.4058903933030355, + "learning_rate": 3.246494323809471e-06, + "loss": 0.0235, + "step": 28365 + }, + { + "epoch": 3.3636902644373294, + "grad_norm": 0.4727023635819523, + "learning_rate": 3.2453115160187537e-06, + "loss": 0.0237, + "step": 28366 + }, + { + "epoch": 3.3638088461994546, + "grad_norm": 0.5214287888246492, + "learning_rate": 3.244128908779595e-06, + "loss": 0.0205, + "step": 28367 + }, + { + "epoch": 3.3639274279615794, + "grad_norm": 0.3063117908460914, + "learning_rate": 3.2429465021028864e-06, + "loss": 0.0144, + "step": 28368 + }, + { + "epoch": 3.3640460097237046, + "grad_norm": 0.5487428350876885, + "learning_rate": 3.241764295999536e-06, + "loss": 0.0352, + "step": 28369 + }, + { + "epoch": 3.3641645914858294, + "grad_norm": 0.33563162550274317, + "learning_rate": 3.2405822904804234e-06, + "loss": 0.0151, + "step": 28370 + }, + { + "epoch": 3.3642831732479546, + "grad_norm": 0.4709968675504259, + "learning_rate": 3.2394004855564738e-06, + "loss": 0.0251, + "step": 28371 + }, + { + "epoch": 3.3644017550100793, + "grad_norm": 0.3893529879935164, + "learning_rate": 3.2382188812385583e-06, + "loss": 0.0183, + "step": 28372 + }, + { + "epoch": 3.3645203367722045, + "grad_norm": 0.7769089957808121, + "learning_rate": 3.23703747753758e-06, + "loss": 0.0444, + "step": 28373 + }, + { + "epoch": 3.3646389185343293, + "grad_norm": 0.30520676194137253, + "learning_rate": 3.23585627446443e-06, + "loss": 0.0126, + "step": 28374 + }, + { + "epoch": 3.3647575002964545, + "grad_norm": 0.5260896824323504, + "learning_rate": 3.234675272029994e-06, + "loss": 0.02, + "step": 28375 + }, + { + "epoch": 3.3648760820585792, + "grad_norm": 0.6397910708141028, + "learning_rate": 3.2334944702451687e-06, + "loss": 0.0311, + "step": 28376 + }, + { + "epoch": 3.3649946638207044, + "grad_norm": 0.574580948635534, + "learning_rate": 3.2323138691208205e-06, + "loss": 0.021, + "step": 28377 + }, + { + "epoch": 3.365113245582829, + "grad_norm": 0.6402744039577356, + "learning_rate": 3.2311334686678572e-06, + "loss": 0.0241, + "step": 28378 + }, + { + "epoch": 3.3652318273449544, + "grad_norm": 0.5493922805226412, + "learning_rate": 3.229953268897143e-06, + "loss": 0.0339, + "step": 28379 + }, + { + "epoch": 3.365350409107079, + "grad_norm": 0.7148063384692843, + "learning_rate": 3.2287732698195626e-06, + "loss": 0.0227, + "step": 28380 + }, + { + "epoch": 3.3654689908692044, + "grad_norm": 0.5831210903309609, + "learning_rate": 3.227593471445994e-06, + "loss": 0.0313, + "step": 28381 + }, + { + "epoch": 3.365587572631329, + "grad_norm": 0.5182779706377152, + "learning_rate": 3.226413873787318e-06, + "loss": 0.023, + "step": 28382 + }, + { + "epoch": 3.3657061543934543, + "grad_norm": 0.7266698346250949, + "learning_rate": 3.225234476854408e-06, + "loss": 0.0516, + "step": 28383 + }, + { + "epoch": 3.365824736155579, + "grad_norm": 0.37294944838555166, + "learning_rate": 3.2240552806581227e-06, + "loss": 0.015, + "step": 28384 + }, + { + "epoch": 3.3659433179177043, + "grad_norm": 0.7225756434909976, + "learning_rate": 3.2228762852093585e-06, + "loss": 0.0349, + "step": 28385 + }, + { + "epoch": 3.366061899679829, + "grad_norm": 0.5849529573635673, + "learning_rate": 3.2216974905189627e-06, + "loss": 0.0317, + "step": 28386 + }, + { + "epoch": 3.3661804814419543, + "grad_norm": 0.7224602217793022, + "learning_rate": 3.2205188965978154e-06, + "loss": 0.035, + "step": 28387 + }, + { + "epoch": 3.366299063204079, + "grad_norm": 0.5856139729928586, + "learning_rate": 3.2193405034567637e-06, + "loss": 0.0269, + "step": 28388 + }, + { + "epoch": 3.366417644966204, + "grad_norm": 0.5514711247898894, + "learning_rate": 3.2181623111066954e-06, + "loss": 0.0348, + "step": 28389 + }, + { + "epoch": 3.3665362267283294, + "grad_norm": 0.47791182279855193, + "learning_rate": 3.2169843195584554e-06, + "loss": 0.028, + "step": 28390 + }, + { + "epoch": 3.366654808490454, + "grad_norm": 0.4659678552073218, + "learning_rate": 3.2158065288229066e-06, + "loss": 0.0276, + "step": 28391 + }, + { + "epoch": 3.366773390252579, + "grad_norm": 0.5579796158504952, + "learning_rate": 3.2146289389109075e-06, + "loss": 0.0302, + "step": 28392 + }, + { + "epoch": 3.366891972014704, + "grad_norm": 0.6652894891985357, + "learning_rate": 3.213451549833316e-06, + "loss": 0.0333, + "step": 28393 + }, + { + "epoch": 3.3670105537768293, + "grad_norm": 0.5923615910784152, + "learning_rate": 3.212274361600989e-06, + "loss": 0.0271, + "step": 28394 + }, + { + "epoch": 3.367129135538954, + "grad_norm": 0.9496742970484627, + "learning_rate": 3.2110973742247725e-06, + "loss": 0.0412, + "step": 28395 + }, + { + "epoch": 3.367247717301079, + "grad_norm": 0.46517234064011076, + "learning_rate": 3.2099205877155148e-06, + "loss": 0.0228, + "step": 28396 + }, + { + "epoch": 3.367366299063204, + "grad_norm": 0.4596104134516196, + "learning_rate": 3.2087440020840714e-06, + "loss": 0.0214, + "step": 28397 + }, + { + "epoch": 3.3674848808253293, + "grad_norm": 0.4636169274473629, + "learning_rate": 3.2075676173412892e-06, + "loss": 0.021, + "step": 28398 + }, + { + "epoch": 3.367603462587454, + "grad_norm": 0.7865933778335288, + "learning_rate": 3.2063914334979987e-06, + "loss": 0.0452, + "step": 28399 + }, + { + "epoch": 3.3677220443495792, + "grad_norm": 0.3817054006028995, + "learning_rate": 3.205215450565069e-06, + "loss": 0.0237, + "step": 28400 + }, + { + "epoch": 3.367840626111704, + "grad_norm": 0.7577335077109038, + "learning_rate": 3.204039668553316e-06, + "loss": 0.0415, + "step": 28401 + }, + { + "epoch": 3.367959207873829, + "grad_norm": 0.3921403080608631, + "learning_rate": 3.202864087473592e-06, + "loss": 0.0185, + "step": 28402 + }, + { + "epoch": 3.368077789635954, + "grad_norm": 0.418327651407074, + "learning_rate": 3.2016887073367307e-06, + "loss": 0.0149, + "step": 28403 + }, + { + "epoch": 3.368196371398079, + "grad_norm": 0.4551825217157843, + "learning_rate": 3.2005135281535705e-06, + "loss": 0.0243, + "step": 28404 + }, + { + "epoch": 3.368314953160204, + "grad_norm": 0.5975004323497811, + "learning_rate": 3.1993385499349465e-06, + "loss": 0.0237, + "step": 28405 + }, + { + "epoch": 3.368433534922329, + "grad_norm": 0.5470517835276452, + "learning_rate": 3.198163772691681e-06, + "loss": 0.0282, + "step": 28406 + }, + { + "epoch": 3.368552116684454, + "grad_norm": 0.36391297975092624, + "learning_rate": 3.1969891964346133e-06, + "loss": 0.0242, + "step": 28407 + }, + { + "epoch": 3.368670698446579, + "grad_norm": 0.4382320821050168, + "learning_rate": 3.195814821174567e-06, + "loss": 0.0203, + "step": 28408 + }, + { + "epoch": 3.368789280208704, + "grad_norm": 0.6135033731861153, + "learning_rate": 3.1946406469223754e-06, + "loss": 0.0287, + "step": 28409 + }, + { + "epoch": 3.368907861970829, + "grad_norm": 0.552983449461249, + "learning_rate": 3.1934666736888526e-06, + "loss": 0.0279, + "step": 28410 + }, + { + "epoch": 3.369026443732954, + "grad_norm": 0.5532775749500256, + "learning_rate": 3.192292901484828e-06, + "loss": 0.0272, + "step": 28411 + }, + { + "epoch": 3.369145025495079, + "grad_norm": 0.6528065539001182, + "learning_rate": 3.1911193303211185e-06, + "loss": 0.0211, + "step": 28412 + }, + { + "epoch": 3.3692636072572038, + "grad_norm": 0.4817292502055002, + "learning_rate": 3.189945960208543e-06, + "loss": 0.0234, + "step": 28413 + }, + { + "epoch": 3.369382189019329, + "grad_norm": 0.36256497051687425, + "learning_rate": 3.1887727911579233e-06, + "loss": 0.0188, + "step": 28414 + }, + { + "epoch": 3.3695007707814537, + "grad_norm": 0.4700065915498995, + "learning_rate": 3.187599823180071e-06, + "loss": 0.0244, + "step": 28415 + }, + { + "epoch": 3.369619352543579, + "grad_norm": 0.8898319715785457, + "learning_rate": 3.186427056285804e-06, + "loss": 0.0134, + "step": 28416 + }, + { + "epoch": 3.3697379343057037, + "grad_norm": 0.4802155988614911, + "learning_rate": 3.185254490485928e-06, + "loss": 0.0286, + "step": 28417 + }, + { + "epoch": 3.369856516067829, + "grad_norm": 0.549676768963708, + "learning_rate": 3.1840821257912516e-06, + "loss": 0.0288, + "step": 28418 + }, + { + "epoch": 3.3699750978299536, + "grad_norm": 0.4917155585101706, + "learning_rate": 3.182909962212588e-06, + "loss": 0.0294, + "step": 28419 + }, + { + "epoch": 3.370093679592079, + "grad_norm": 0.6074538745944409, + "learning_rate": 3.181737999760745e-06, + "loss": 0.0302, + "step": 28420 + }, + { + "epoch": 3.3702122613542036, + "grad_norm": 0.42688868180646494, + "learning_rate": 3.180566238446514e-06, + "loss": 0.0238, + "step": 28421 + }, + { + "epoch": 3.370330843116329, + "grad_norm": 0.49622425942491805, + "learning_rate": 3.179394678280709e-06, + "loss": 0.0243, + "step": 28422 + }, + { + "epoch": 3.3704494248784536, + "grad_norm": 0.7121674063510728, + "learning_rate": 3.178223319274126e-06, + "loss": 0.0461, + "step": 28423 + }, + { + "epoch": 3.3705680066405788, + "grad_norm": 0.39433712911151714, + "learning_rate": 3.177052161437566e-06, + "loss": 0.0302, + "step": 28424 + }, + { + "epoch": 3.3706865884027035, + "grad_norm": 1.02933401916156, + "learning_rate": 3.1758812047818275e-06, + "loss": 0.0408, + "step": 28425 + }, + { + "epoch": 3.3708051701648287, + "grad_norm": 0.5824596302666162, + "learning_rate": 3.1747104493176915e-06, + "loss": 0.0344, + "step": 28426 + }, + { + "epoch": 3.3709237519269535, + "grad_norm": 0.7127793958917581, + "learning_rate": 3.1735398950559713e-06, + "loss": 0.0343, + "step": 28427 + }, + { + "epoch": 3.3710423336890787, + "grad_norm": 0.4823744436861155, + "learning_rate": 3.1723695420074418e-06, + "loss": 0.0277, + "step": 28428 + }, + { + "epoch": 3.3711609154512034, + "grad_norm": 0.7032338123401534, + "learning_rate": 3.1711993901828997e-06, + "loss": 0.0261, + "step": 28429 + }, + { + "epoch": 3.3712794972133286, + "grad_norm": 0.5143227126304037, + "learning_rate": 3.170029439593131e-06, + "loss": 0.0216, + "step": 28430 + }, + { + "epoch": 3.3713980789754534, + "grad_norm": 0.33369514321936794, + "learning_rate": 3.168859690248921e-06, + "loss": 0.0117, + "step": 28431 + }, + { + "epoch": 3.3715166607375786, + "grad_norm": 0.5760406734103235, + "learning_rate": 3.167690142161059e-06, + "loss": 0.0308, + "step": 28432 + }, + { + "epoch": 3.3716352424997034, + "grad_norm": 0.5545315596725181, + "learning_rate": 3.166520795340311e-06, + "loss": 0.027, + "step": 28433 + }, + { + "epoch": 3.3717538242618286, + "grad_norm": 0.36870387464145044, + "learning_rate": 3.1653516497974765e-06, + "loss": 0.0171, + "step": 28434 + }, + { + "epoch": 3.3718724060239533, + "grad_norm": 0.6730036991692576, + "learning_rate": 3.16418270554332e-06, + "loss": 0.0286, + "step": 28435 + }, + { + "epoch": 3.3719909877860785, + "grad_norm": 0.5376573924377342, + "learning_rate": 3.163013962588629e-06, + "loss": 0.0246, + "step": 28436 + }, + { + "epoch": 3.3721095695482033, + "grad_norm": 0.5114635542827333, + "learning_rate": 3.1618454209441567e-06, + "loss": 0.02, + "step": 28437 + }, + { + "epoch": 3.3722281513103285, + "grad_norm": 0.44257961798276235, + "learning_rate": 3.1606770806207027e-06, + "loss": 0.0225, + "step": 28438 + }, + { + "epoch": 3.3723467330724537, + "grad_norm": 0.4420748220061696, + "learning_rate": 3.159508941629019e-06, + "loss": 0.0228, + "step": 28439 + }, + { + "epoch": 3.3724653148345785, + "grad_norm": 0.5399353813283471, + "learning_rate": 3.1583410039798813e-06, + "loss": 0.0235, + "step": 28440 + }, + { + "epoch": 3.372583896596703, + "grad_norm": 0.4596826377107692, + "learning_rate": 3.157173267684055e-06, + "loss": 0.0188, + "step": 28441 + }, + { + "epoch": 3.3727024783588284, + "grad_norm": 0.5556010185875379, + "learning_rate": 3.1560057327523067e-06, + "loss": 0.0243, + "step": 28442 + }, + { + "epoch": 3.3728210601209536, + "grad_norm": 0.4124598612697137, + "learning_rate": 3.154838399195403e-06, + "loss": 0.0188, + "step": 28443 + }, + { + "epoch": 3.3729396418830784, + "grad_norm": 0.9292966510454848, + "learning_rate": 3.153671267024089e-06, + "loss": 0.0458, + "step": 28444 + }, + { + "epoch": 3.373058223645203, + "grad_norm": 0.6636125021113544, + "learning_rate": 3.1525043362491483e-06, + "loss": 0.0348, + "step": 28445 + }, + { + "epoch": 3.3731768054073283, + "grad_norm": 0.4417635986660614, + "learning_rate": 3.1513376068813217e-06, + "loss": 0.0194, + "step": 28446 + }, + { + "epoch": 3.3732953871694535, + "grad_norm": 0.5292581327973719, + "learning_rate": 3.1501710789313744e-06, + "loss": 0.0267, + "step": 28447 + }, + { + "epoch": 3.3734139689315783, + "grad_norm": 0.6896506311269708, + "learning_rate": 3.1490047524100454e-06, + "loss": 0.0351, + "step": 28448 + }, + { + "epoch": 3.3735325506937035, + "grad_norm": 0.5166754080899334, + "learning_rate": 3.1478386273281065e-06, + "loss": 0.0287, + "step": 28449 + }, + { + "epoch": 3.3736511324558283, + "grad_norm": 0.5190649859280394, + "learning_rate": 3.1466727036962933e-06, + "loss": 0.0289, + "step": 28450 + }, + { + "epoch": 3.3737697142179535, + "grad_norm": 0.7088445789581067, + "learning_rate": 3.145506981525362e-06, + "loss": 0.037, + "step": 28451 + }, + { + "epoch": 3.3738882959800782, + "grad_norm": 0.7678390554756007, + "learning_rate": 3.144341460826053e-06, + "loss": 0.0479, + "step": 28452 + }, + { + "epoch": 3.3740068777422034, + "grad_norm": 0.688499594544058, + "learning_rate": 3.1431761416091166e-06, + "loss": 0.0236, + "step": 28453 + }, + { + "epoch": 3.374125459504328, + "grad_norm": 0.4473603931090516, + "learning_rate": 3.1420110238852994e-06, + "loss": 0.0218, + "step": 28454 + }, + { + "epoch": 3.3742440412664534, + "grad_norm": 0.8679182307745145, + "learning_rate": 3.1408461076653297e-06, + "loss": 0.0309, + "step": 28455 + }, + { + "epoch": 3.374362623028578, + "grad_norm": 0.4122406273323114, + "learning_rate": 3.139681392959956e-06, + "loss": 0.0197, + "step": 28456 + }, + { + "epoch": 3.3744812047907033, + "grad_norm": 0.6535763755778339, + "learning_rate": 3.13851687977991e-06, + "loss": 0.0352, + "step": 28457 + }, + { + "epoch": 3.374599786552828, + "grad_norm": 0.6526485985418939, + "learning_rate": 3.137352568135937e-06, + "loss": 0.0243, + "step": 28458 + }, + { + "epoch": 3.3747183683149533, + "grad_norm": 0.6067298192825716, + "learning_rate": 3.1361884580387526e-06, + "loss": 0.0368, + "step": 28459 + }, + { + "epoch": 3.374836950077078, + "grad_norm": 0.527718591706365, + "learning_rate": 3.1350245494991104e-06, + "loss": 0.0261, + "step": 28460 + }, + { + "epoch": 3.3749555318392033, + "grad_norm": 0.569320488075551, + "learning_rate": 3.133860842527722e-06, + "loss": 0.0261, + "step": 28461 + }, + { + "epoch": 3.375074113601328, + "grad_norm": 0.5693536778135765, + "learning_rate": 3.1326973371353263e-06, + "loss": 0.0234, + "step": 28462 + }, + { + "epoch": 3.3751926953634532, + "grad_norm": 1.4072890344695894, + "learning_rate": 3.1315340333326422e-06, + "loss": 0.0374, + "step": 28463 + }, + { + "epoch": 3.375311277125578, + "grad_norm": 0.4142097093638965, + "learning_rate": 3.1303709311303997e-06, + "loss": 0.0195, + "step": 28464 + }, + { + "epoch": 3.375429858887703, + "grad_norm": 0.43787847355136983, + "learning_rate": 3.1292080305393213e-06, + "loss": 0.0193, + "step": 28465 + }, + { + "epoch": 3.375548440649828, + "grad_norm": 0.48339464682531585, + "learning_rate": 3.1280453315701232e-06, + "loss": 0.0357, + "step": 28466 + }, + { + "epoch": 3.375667022411953, + "grad_norm": 0.26733116868304896, + "learning_rate": 3.1268828342335243e-06, + "loss": 0.0113, + "step": 28467 + }, + { + "epoch": 3.375785604174078, + "grad_norm": 0.6215951816401758, + "learning_rate": 3.1257205385402446e-06, + "loss": 0.0374, + "step": 28468 + }, + { + "epoch": 3.375904185936203, + "grad_norm": 0.43580281474752947, + "learning_rate": 3.1245584445009994e-06, + "loss": 0.022, + "step": 28469 + }, + { + "epoch": 3.376022767698328, + "grad_norm": 0.23459955385165523, + "learning_rate": 3.1233965521264923e-06, + "loss": 0.0123, + "step": 28470 + }, + { + "epoch": 3.376141349460453, + "grad_norm": 0.6369846488020747, + "learning_rate": 3.1222348614274506e-06, + "loss": 0.0307, + "step": 28471 + }, + { + "epoch": 3.376259931222578, + "grad_norm": 0.3498761004309446, + "learning_rate": 3.1210733724145735e-06, + "loss": 0.0169, + "step": 28472 + }, + { + "epoch": 3.376378512984703, + "grad_norm": 0.4570431395795493, + "learning_rate": 3.1199120850985668e-06, + "loss": 0.0319, + "step": 28473 + }, + { + "epoch": 3.376497094746828, + "grad_norm": 0.4887829870089527, + "learning_rate": 3.1187509994901415e-06, + "loss": 0.023, + "step": 28474 + }, + { + "epoch": 3.376615676508953, + "grad_norm": 0.6417993175401068, + "learning_rate": 3.117590115599997e-06, + "loss": 0.0344, + "step": 28475 + }, + { + "epoch": 3.3767342582710778, + "grad_norm": 0.6244304186758831, + "learning_rate": 3.1164294334388442e-06, + "loss": 0.0237, + "step": 28476 + }, + { + "epoch": 3.376852840033203, + "grad_norm": 0.3518242874507578, + "learning_rate": 3.115268953017372e-06, + "loss": 0.0155, + "step": 28477 + }, + { + "epoch": 3.3769714217953277, + "grad_norm": 0.32975117877057153, + "learning_rate": 3.1141086743462835e-06, + "loss": 0.0129, + "step": 28478 + }, + { + "epoch": 3.377090003557453, + "grad_norm": 0.47302257218793275, + "learning_rate": 3.1129485974362744e-06, + "loss": 0.0234, + "step": 28479 + }, + { + "epoch": 3.3772085853195777, + "grad_norm": 0.6532258642357543, + "learning_rate": 3.1117887222980395e-06, + "loss": 0.0214, + "step": 28480 + }, + { + "epoch": 3.377327167081703, + "grad_norm": 0.6454416432368102, + "learning_rate": 3.110629048942276e-06, + "loss": 0.0283, + "step": 28481 + }, + { + "epoch": 3.3774457488438276, + "grad_norm": 0.5086395791158544, + "learning_rate": 3.109469577379659e-06, + "loss": 0.0294, + "step": 28482 + }, + { + "epoch": 3.377564330605953, + "grad_norm": 0.4356813096903233, + "learning_rate": 3.108310307620901e-06, + "loss": 0.0169, + "step": 28483 + }, + { + "epoch": 3.3776829123680776, + "grad_norm": 0.7241730774057775, + "learning_rate": 3.10715123967667e-06, + "loss": 0.0375, + "step": 28484 + }, + { + "epoch": 3.377801494130203, + "grad_norm": 0.7138234509594876, + "learning_rate": 3.105992373557659e-06, + "loss": 0.0207, + "step": 28485 + }, + { + "epoch": 3.3779200758923276, + "grad_norm": 0.5548472126614545, + "learning_rate": 3.104833709274549e-06, + "loss": 0.025, + "step": 28486 + }, + { + "epoch": 3.3780386576544528, + "grad_norm": 0.4566426878968168, + "learning_rate": 3.103675246838028e-06, + "loss": 0.0152, + "step": 28487 + }, + { + "epoch": 3.378157239416578, + "grad_norm": 0.43267343007422204, + "learning_rate": 3.102516986258766e-06, + "loss": 0.0223, + "step": 28488 + }, + { + "epoch": 3.3782758211787027, + "grad_norm": 0.2683118039541651, + "learning_rate": 3.101358927547443e-06, + "loss": 0.0121, + "step": 28489 + }, + { + "epoch": 3.3783944029408275, + "grad_norm": 0.36478009995873045, + "learning_rate": 3.1002010707147394e-06, + "loss": 0.0168, + "step": 28490 + }, + { + "epoch": 3.3785129847029527, + "grad_norm": 0.5371761480370094, + "learning_rate": 3.0990434157713242e-06, + "loss": 0.0218, + "step": 28491 + }, + { + "epoch": 3.378631566465078, + "grad_norm": 0.5675836006070906, + "learning_rate": 3.0978859627278784e-06, + "loss": 0.0325, + "step": 28492 + }, + { + "epoch": 3.3787501482272027, + "grad_norm": 0.4616901656889574, + "learning_rate": 3.0967287115950538e-06, + "loss": 0.0185, + "step": 28493 + }, + { + "epoch": 3.3788687299893274, + "grad_norm": 0.3669979475060291, + "learning_rate": 3.0955716623835424e-06, + "loss": 0.0165, + "step": 28494 + }, + { + "epoch": 3.3789873117514526, + "grad_norm": 0.543638411427695, + "learning_rate": 3.0944148151039966e-06, + "loss": 0.0248, + "step": 28495 + }, + { + "epoch": 3.379105893513578, + "grad_norm": 0.3697925606854395, + "learning_rate": 3.093258169767085e-06, + "loss": 0.0177, + "step": 28496 + }, + { + "epoch": 3.3792244752757026, + "grad_norm": 0.8976596592572345, + "learning_rate": 3.0921017263834617e-06, + "loss": 0.043, + "step": 28497 + }, + { + "epoch": 3.3793430570378273, + "grad_norm": 0.31517713460237234, + "learning_rate": 3.090945484963803e-06, + "loss": 0.0192, + "step": 28498 + }, + { + "epoch": 3.3794616387999525, + "grad_norm": 0.6067885846577701, + "learning_rate": 3.0897894455187566e-06, + "loss": 0.0314, + "step": 28499 + }, + { + "epoch": 3.3795802205620777, + "grad_norm": 0.6468297468193335, + "learning_rate": 3.0886336080589856e-06, + "loss": 0.0295, + "step": 28500 + }, + { + "epoch": 3.3796988023242025, + "grad_norm": 0.4220850341505235, + "learning_rate": 3.0874779725951404e-06, + "loss": 0.0203, + "step": 28501 + }, + { + "epoch": 3.3798173840863277, + "grad_norm": 0.5511033620321742, + "learning_rate": 3.0863225391378785e-06, + "loss": 0.0225, + "step": 28502 + }, + { + "epoch": 3.3799359658484525, + "grad_norm": 0.5922528944882812, + "learning_rate": 3.085167307697856e-06, + "loss": 0.0279, + "step": 28503 + }, + { + "epoch": 3.3800545476105777, + "grad_norm": 0.37008938199835095, + "learning_rate": 3.084012278285706e-06, + "loss": 0.0166, + "step": 28504 + }, + { + "epoch": 3.3801731293727024, + "grad_norm": 0.4149916496624296, + "learning_rate": 3.0828574509120967e-06, + "loss": 0.0215, + "step": 28505 + }, + { + "epoch": 3.3802917111348276, + "grad_norm": 0.5543020112087341, + "learning_rate": 3.081702825587662e-06, + "loss": 0.0244, + "step": 28506 + }, + { + "epoch": 3.3804102928969524, + "grad_norm": 0.49438674061110593, + "learning_rate": 3.0805484023230525e-06, + "loss": 0.027, + "step": 28507 + }, + { + "epoch": 3.3805288746590776, + "grad_norm": 0.6006308766364582, + "learning_rate": 3.0793941811288973e-06, + "loss": 0.0267, + "step": 28508 + }, + { + "epoch": 3.3806474564212023, + "grad_norm": 0.44558145820396783, + "learning_rate": 3.078240162015861e-06, + "loss": 0.0168, + "step": 28509 + }, + { + "epoch": 3.3807660381833275, + "grad_norm": 0.4928828141509471, + "learning_rate": 3.077086344994559e-06, + "loss": 0.0242, + "step": 28510 + }, + { + "epoch": 3.3808846199454523, + "grad_norm": 0.6114988503180125, + "learning_rate": 3.075932730075637e-06, + "loss": 0.031, + "step": 28511 + }, + { + "epoch": 3.3810032017075775, + "grad_norm": 0.38883852734764823, + "learning_rate": 3.0747793172697326e-06, + "loss": 0.0185, + "step": 28512 + }, + { + "epoch": 3.3811217834697023, + "grad_norm": 0.7298093245026348, + "learning_rate": 3.0736261065874768e-06, + "loss": 0.0379, + "step": 28513 + }, + { + "epoch": 3.3812403652318275, + "grad_norm": 0.7660670564277019, + "learning_rate": 3.072473098039502e-06, + "loss": 0.0426, + "step": 28514 + }, + { + "epoch": 3.3813589469939522, + "grad_norm": 1.3600806465206503, + "learning_rate": 3.0713202916364285e-06, + "loss": 0.035, + "step": 28515 + }, + { + "epoch": 3.3814775287560774, + "grad_norm": 0.4374321269039058, + "learning_rate": 3.0701676873889e-06, + "loss": 0.0204, + "step": 28516 + }, + { + "epoch": 3.381596110518202, + "grad_norm": 0.5846536735631263, + "learning_rate": 3.0690152853075306e-06, + "loss": 0.0343, + "step": 28517 + }, + { + "epoch": 3.3817146922803274, + "grad_norm": 0.5876342603898199, + "learning_rate": 3.06786308540295e-06, + "loss": 0.0273, + "step": 28518 + }, + { + "epoch": 3.381833274042452, + "grad_norm": 0.5855117882747183, + "learning_rate": 3.066711087685767e-06, + "loss": 0.0302, + "step": 28519 + }, + { + "epoch": 3.3819518558045774, + "grad_norm": 0.6912885486769447, + "learning_rate": 3.06555929216662e-06, + "loss": 0.0222, + "step": 28520 + }, + { + "epoch": 3.382070437566702, + "grad_norm": 0.6877245691881002, + "learning_rate": 3.0644076988561144e-06, + "loss": 0.0251, + "step": 28521 + }, + { + "epoch": 3.3821890193288273, + "grad_norm": 0.7840184341305153, + "learning_rate": 3.0632563077648724e-06, + "loss": 0.0404, + "step": 28522 + }, + { + "epoch": 3.382307601090952, + "grad_norm": 0.5788076471788576, + "learning_rate": 3.062105118903505e-06, + "loss": 0.0262, + "step": 28523 + }, + { + "epoch": 3.3824261828530773, + "grad_norm": 0.394066506606561, + "learning_rate": 3.0609541322826257e-06, + "loss": 0.0183, + "step": 28524 + }, + { + "epoch": 3.382544764615202, + "grad_norm": 0.40551340727886437, + "learning_rate": 3.059803347912851e-06, + "loss": 0.0218, + "step": 28525 + }, + { + "epoch": 3.3826633463773272, + "grad_norm": 0.6251794056244776, + "learning_rate": 3.0586527658047814e-06, + "loss": 0.033, + "step": 28526 + }, + { + "epoch": 3.382781928139452, + "grad_norm": 0.7156211673781362, + "learning_rate": 3.0575023859690273e-06, + "loss": 0.03, + "step": 28527 + }, + { + "epoch": 3.382900509901577, + "grad_norm": 0.6141573079290787, + "learning_rate": 3.0563522084161913e-06, + "loss": 0.0246, + "step": 28528 + }, + { + "epoch": 3.383019091663702, + "grad_norm": 0.7519397938833405, + "learning_rate": 3.0552022331568785e-06, + "loss": 0.0374, + "step": 28529 + }, + { + "epoch": 3.383137673425827, + "grad_norm": 0.68458664684316, + "learning_rate": 3.054052460201692e-06, + "loss": 0.0417, + "step": 28530 + }, + { + "epoch": 3.383256255187952, + "grad_norm": 0.45910169399247563, + "learning_rate": 3.0529028895612294e-06, + "loss": 0.0276, + "step": 28531 + }, + { + "epoch": 3.383374836950077, + "grad_norm": 0.6582651522704862, + "learning_rate": 3.0517535212460953e-06, + "loss": 0.0331, + "step": 28532 + }, + { + "epoch": 3.383493418712202, + "grad_norm": 0.3792838573999262, + "learning_rate": 3.050604355266873e-06, + "loss": 0.0186, + "step": 28533 + }, + { + "epoch": 3.383612000474327, + "grad_norm": 0.5153481532205052, + "learning_rate": 3.0494553916341624e-06, + "loss": 0.0154, + "step": 28534 + }, + { + "epoch": 3.383730582236452, + "grad_norm": 0.6332568811313432, + "learning_rate": 3.048306630358555e-06, + "loss": 0.02, + "step": 28535 + }, + { + "epoch": 3.383849163998577, + "grad_norm": 0.5699044294198763, + "learning_rate": 3.047158071450648e-06, + "loss": 0.0233, + "step": 28536 + }, + { + "epoch": 3.3839677457607023, + "grad_norm": 0.4780191795962937, + "learning_rate": 3.0460097149210166e-06, + "loss": 0.0183, + "step": 28537 + }, + { + "epoch": 3.384086327522827, + "grad_norm": 0.8017907093842397, + "learning_rate": 3.0448615607802543e-06, + "loss": 0.0352, + "step": 28538 + }, + { + "epoch": 3.3842049092849518, + "grad_norm": 0.6495251710810469, + "learning_rate": 3.0437136090389477e-06, + "loss": 0.0428, + "step": 28539 + }, + { + "epoch": 3.384323491047077, + "grad_norm": 0.39437320077632904, + "learning_rate": 3.0425658597076747e-06, + "loss": 0.0193, + "step": 28540 + }, + { + "epoch": 3.384442072809202, + "grad_norm": 0.6892678525007567, + "learning_rate": 3.0414183127970175e-06, + "loss": 0.0266, + "step": 28541 + }, + { + "epoch": 3.384560654571327, + "grad_norm": 0.4914058219162297, + "learning_rate": 3.0402709683175574e-06, + "loss": 0.0195, + "step": 28542 + }, + { + "epoch": 3.3846792363334517, + "grad_norm": 0.8762415428217719, + "learning_rate": 3.039123826279874e-06, + "loss": 0.0422, + "step": 28543 + }, + { + "epoch": 3.384797818095577, + "grad_norm": 0.46491471289713715, + "learning_rate": 3.037976886694535e-06, + "loss": 0.0176, + "step": 28544 + }, + { + "epoch": 3.384916399857702, + "grad_norm": 0.6101522633312086, + "learning_rate": 3.0368301495721173e-06, + "loss": 0.0263, + "step": 28545 + }, + { + "epoch": 3.385034981619827, + "grad_norm": 0.5173887925331193, + "learning_rate": 3.035683614923193e-06, + "loss": 0.0236, + "step": 28546 + }, + { + "epoch": 3.3851535633819516, + "grad_norm": 0.468876658715073, + "learning_rate": 3.0345372827583374e-06, + "loss": 0.0206, + "step": 28547 + }, + { + "epoch": 3.385272145144077, + "grad_norm": 0.6702770349196162, + "learning_rate": 3.0333911530881054e-06, + "loss": 0.0421, + "step": 28548 + }, + { + "epoch": 3.385390726906202, + "grad_norm": 0.5032724958269145, + "learning_rate": 3.0322452259230694e-06, + "loss": 0.0263, + "step": 28549 + }, + { + "epoch": 3.3855093086683268, + "grad_norm": 0.5221901842255368, + "learning_rate": 3.031099501273793e-06, + "loss": 0.0167, + "step": 28550 + }, + { + "epoch": 3.385627890430452, + "grad_norm": 0.6018449247977332, + "learning_rate": 3.0299539791508403e-06, + "loss": 0.0288, + "step": 28551 + }, + { + "epoch": 3.3857464721925767, + "grad_norm": 0.5978504920010301, + "learning_rate": 3.0288086595647773e-06, + "loss": 0.0225, + "step": 28552 + }, + { + "epoch": 3.385865053954702, + "grad_norm": 0.28515297412779, + "learning_rate": 3.0276635425261433e-06, + "loss": 0.0142, + "step": 28553 + }, + { + "epoch": 3.3859836357168267, + "grad_norm": 0.480453568934314, + "learning_rate": 3.0265186280455187e-06, + "loss": 0.0258, + "step": 28554 + }, + { + "epoch": 3.386102217478952, + "grad_norm": 0.7541082327008836, + "learning_rate": 3.025373916133439e-06, + "loss": 0.0342, + "step": 28555 + }, + { + "epoch": 3.3862207992410767, + "grad_norm": 0.37458512520233683, + "learning_rate": 3.024229406800469e-06, + "loss": 0.0143, + "step": 28556 + }, + { + "epoch": 3.386339381003202, + "grad_norm": 0.9195313265089391, + "learning_rate": 3.023085100057152e-06, + "loss": 0.0399, + "step": 28557 + }, + { + "epoch": 3.3864579627653266, + "grad_norm": 0.3366782488704065, + "learning_rate": 3.0219409959140464e-06, + "loss": 0.0144, + "step": 28558 + }, + { + "epoch": 3.386576544527452, + "grad_norm": 0.6836191543354585, + "learning_rate": 3.020797094381689e-06, + "loss": 0.0285, + "step": 28559 + }, + { + "epoch": 3.3866951262895766, + "grad_norm": 0.44249499486776117, + "learning_rate": 3.0196533954706285e-06, + "loss": 0.0273, + "step": 28560 + }, + { + "epoch": 3.386813708051702, + "grad_norm": 0.5175912213418122, + "learning_rate": 3.01850989919141e-06, + "loss": 0.0307, + "step": 28561 + }, + { + "epoch": 3.3869322898138265, + "grad_norm": 0.36283318955126653, + "learning_rate": 3.0173666055545753e-06, + "loss": 0.017, + "step": 28562 + }, + { + "epoch": 3.3870508715759517, + "grad_norm": 0.6129970344475152, + "learning_rate": 3.0162235145706706e-06, + "loss": 0.0261, + "step": 28563 + }, + { + "epoch": 3.3871694533380765, + "grad_norm": 0.6120879170777062, + "learning_rate": 3.0150806262502135e-06, + "loss": 0.0227, + "step": 28564 + }, + { + "epoch": 3.3872880351002017, + "grad_norm": 0.649002866624606, + "learning_rate": 3.0139379406037643e-06, + "loss": 0.0277, + "step": 28565 + }, + { + "epoch": 3.3874066168623265, + "grad_norm": 0.811578493061749, + "learning_rate": 3.0127954576418427e-06, + "loss": 0.0543, + "step": 28566 + }, + { + "epoch": 3.3875251986244517, + "grad_norm": 0.4934331245213893, + "learning_rate": 3.011653177374987e-06, + "loss": 0.0227, + "step": 28567 + }, + { + "epoch": 3.3876437803865764, + "grad_norm": 0.7799368960525657, + "learning_rate": 3.0105110998137174e-06, + "loss": 0.0437, + "step": 28568 + }, + { + "epoch": 3.3877623621487016, + "grad_norm": 0.8159969418077485, + "learning_rate": 3.009369224968578e-06, + "loss": 0.032, + "step": 28569 + }, + { + "epoch": 3.3878809439108264, + "grad_norm": 0.5669185020726558, + "learning_rate": 3.0082275528500846e-06, + "loss": 0.0288, + "step": 28570 + }, + { + "epoch": 3.3879995256729516, + "grad_norm": 0.6318801417039888, + "learning_rate": 3.0070860834687633e-06, + "loss": 0.0314, + "step": 28571 + }, + { + "epoch": 3.3881181074350764, + "grad_norm": 0.4438036568868938, + "learning_rate": 3.0059448168351404e-06, + "loss": 0.028, + "step": 28572 + }, + { + "epoch": 3.3882366891972016, + "grad_norm": 0.42910173861823303, + "learning_rate": 3.004803752959734e-06, + "loss": 0.0258, + "step": 28573 + }, + { + "epoch": 3.3883552709593263, + "grad_norm": 0.5552018690210471, + "learning_rate": 3.003662891853071e-06, + "loss": 0.0274, + "step": 28574 + }, + { + "epoch": 3.3884738527214515, + "grad_norm": 0.47877168693822686, + "learning_rate": 3.002522233525651e-06, + "loss": 0.0211, + "step": 28575 + }, + { + "epoch": 3.3885924344835763, + "grad_norm": 0.44307893959992223, + "learning_rate": 3.001381777988013e-06, + "loss": 0.0164, + "step": 28576 + }, + { + "epoch": 3.3887110162457015, + "grad_norm": 0.6094306459838031, + "learning_rate": 3.000241525250652e-06, + "loss": 0.019, + "step": 28577 + }, + { + "epoch": 3.3888295980078262, + "grad_norm": 0.6326983375447719, + "learning_rate": 2.9991014753240928e-06, + "loss": 0.0245, + "step": 28578 + }, + { + "epoch": 3.3889481797699514, + "grad_norm": 0.48277089995279043, + "learning_rate": 2.997961628218826e-06, + "loss": 0.0195, + "step": 28579 + }, + { + "epoch": 3.389066761532076, + "grad_norm": 0.4491159158360087, + "learning_rate": 2.9968219839453833e-06, + "loss": 0.0257, + "step": 28580 + }, + { + "epoch": 3.3891853432942014, + "grad_norm": 0.6088936275573151, + "learning_rate": 2.9956825425142553e-06, + "loss": 0.0244, + "step": 28581 + }, + { + "epoch": 3.389303925056326, + "grad_norm": 0.43700254742065653, + "learning_rate": 2.994543303935951e-06, + "loss": 0.0211, + "step": 28582 + }, + { + "epoch": 3.3894225068184514, + "grad_norm": 0.5873547805583724, + "learning_rate": 2.993404268220973e-06, + "loss": 0.0261, + "step": 28583 + }, + { + "epoch": 3.389541088580576, + "grad_norm": 0.5687995995867917, + "learning_rate": 2.992265435379821e-06, + "loss": 0.0212, + "step": 28584 + }, + { + "epoch": 3.3896596703427013, + "grad_norm": 0.33213835616664406, + "learning_rate": 2.991126805423e-06, + "loss": 0.0179, + "step": 28585 + }, + { + "epoch": 3.389778252104826, + "grad_norm": 0.6139577074634958, + "learning_rate": 2.989988378360989e-06, + "loss": 0.0262, + "step": 28586 + }, + { + "epoch": 3.3898968338669513, + "grad_norm": 0.395223094504496, + "learning_rate": 2.9888501542043036e-06, + "loss": 0.0165, + "step": 28587 + }, + { + "epoch": 3.390015415629076, + "grad_norm": 0.38388007384236544, + "learning_rate": 2.9877121329634243e-06, + "loss": 0.0207, + "step": 28588 + }, + { + "epoch": 3.3901339973912012, + "grad_norm": 0.5811314458945505, + "learning_rate": 2.986574314648846e-06, + "loss": 0.0246, + "step": 28589 + }, + { + "epoch": 3.3902525791533265, + "grad_norm": 0.458380743905067, + "learning_rate": 2.9854366992710604e-06, + "loss": 0.0233, + "step": 28590 + }, + { + "epoch": 3.390371160915451, + "grad_norm": 0.6895834324025868, + "learning_rate": 2.98429928684055e-06, + "loss": 0.0323, + "step": 28591 + }, + { + "epoch": 3.390489742677576, + "grad_norm": 0.38056761231045466, + "learning_rate": 2.983162077367807e-06, + "loss": 0.0241, + "step": 28592 + }, + { + "epoch": 3.390608324439701, + "grad_norm": 0.7494543189333758, + "learning_rate": 2.9820250708633062e-06, + "loss": 0.0412, + "step": 28593 + }, + { + "epoch": 3.3907269062018264, + "grad_norm": 0.5935450247205594, + "learning_rate": 2.9808882673375338e-06, + "loss": 0.0224, + "step": 28594 + }, + { + "epoch": 3.390845487963951, + "grad_norm": 0.659041926415701, + "learning_rate": 2.979751666800973e-06, + "loss": 0.0328, + "step": 28595 + }, + { + "epoch": 3.390964069726076, + "grad_norm": 0.9736729824588117, + "learning_rate": 2.978615269264101e-06, + "loss": 0.0485, + "step": 28596 + }, + { + "epoch": 3.391082651488201, + "grad_norm": 0.7846735911940006, + "learning_rate": 2.9774790747373854e-06, + "loss": 0.0343, + "step": 28597 + }, + { + "epoch": 3.3912012332503263, + "grad_norm": 0.35992576057555065, + "learning_rate": 2.976343083231309e-06, + "loss": 0.0159, + "step": 28598 + }, + { + "epoch": 3.391319815012451, + "grad_norm": 0.5397481909677259, + "learning_rate": 2.9752072947563436e-06, + "loss": 0.0248, + "step": 28599 + }, + { + "epoch": 3.3914383967745763, + "grad_norm": 0.3529270221009212, + "learning_rate": 2.9740717093229565e-06, + "loss": 0.0163, + "step": 28600 + }, + { + "epoch": 3.391556978536701, + "grad_norm": 0.8788380016502906, + "learning_rate": 2.9729363269416167e-06, + "loss": 0.0346, + "step": 28601 + }, + { + "epoch": 3.391675560298826, + "grad_norm": 0.502652697296164, + "learning_rate": 2.9718011476227963e-06, + "loss": 0.0257, + "step": 28602 + }, + { + "epoch": 3.391794142060951, + "grad_norm": 0.4638454187805047, + "learning_rate": 2.9706661713769597e-06, + "loss": 0.019, + "step": 28603 + }, + { + "epoch": 3.391912723823076, + "grad_norm": 0.48740349038982894, + "learning_rate": 2.9695313982145618e-06, + "loss": 0.0194, + "step": 28604 + }, + { + "epoch": 3.392031305585201, + "grad_norm": 0.6524430506421464, + "learning_rate": 2.9683968281460668e-06, + "loss": 0.0258, + "step": 28605 + }, + { + "epoch": 3.392149887347326, + "grad_norm": 0.43633497038033103, + "learning_rate": 2.9672624611819385e-06, + "loss": 0.0183, + "step": 28606 + }, + { + "epoch": 3.392268469109451, + "grad_norm": 0.6807255380889069, + "learning_rate": 2.966128297332638e-06, + "loss": 0.0239, + "step": 28607 + }, + { + "epoch": 3.392387050871576, + "grad_norm": 0.40499563380239534, + "learning_rate": 2.9649943366086065e-06, + "loss": 0.0228, + "step": 28608 + }, + { + "epoch": 3.392505632633701, + "grad_norm": 0.6769493052106506, + "learning_rate": 2.9638605790203084e-06, + "loss": 0.0276, + "step": 28609 + }, + { + "epoch": 3.392624214395826, + "grad_norm": 0.4864641800728007, + "learning_rate": 2.9627270245781934e-06, + "loss": 0.0235, + "step": 28610 + }, + { + "epoch": 3.392742796157951, + "grad_norm": 0.3157594933234778, + "learning_rate": 2.9615936732927117e-06, + "loss": 0.0167, + "step": 28611 + }, + { + "epoch": 3.392861377920076, + "grad_norm": 0.7681573134272002, + "learning_rate": 2.9604605251743136e-06, + "loss": 0.0366, + "step": 28612 + }, + { + "epoch": 3.392979959682201, + "grad_norm": 0.3143475894178532, + "learning_rate": 2.959327580233434e-06, + "loss": 0.0114, + "step": 28613 + }, + { + "epoch": 3.393098541444326, + "grad_norm": 0.38110571613980815, + "learning_rate": 2.958194838480538e-06, + "loss": 0.019, + "step": 28614 + }, + { + "epoch": 3.3932171232064507, + "grad_norm": 0.9002936133825741, + "learning_rate": 2.95706229992605e-06, + "loss": 0.0485, + "step": 28615 + }, + { + "epoch": 3.393335704968576, + "grad_norm": 0.518610290603584, + "learning_rate": 2.9559299645804143e-06, + "loss": 0.034, + "step": 28616 + }, + { + "epoch": 3.3934542867307007, + "grad_norm": 0.7262468305035318, + "learning_rate": 2.954797832454076e-06, + "loss": 0.0275, + "step": 28617 + }, + { + "epoch": 3.393572868492826, + "grad_norm": 1.0683394529543673, + "learning_rate": 2.9536659035574704e-06, + "loss": 0.0455, + "step": 28618 + }, + { + "epoch": 3.3936914502549507, + "grad_norm": 0.44493616487249404, + "learning_rate": 2.9525341779010257e-06, + "loss": 0.0218, + "step": 28619 + }, + { + "epoch": 3.393810032017076, + "grad_norm": 0.6416362502701769, + "learning_rate": 2.9514026554951808e-06, + "loss": 0.0239, + "step": 28620 + }, + { + "epoch": 3.3939286137792006, + "grad_norm": 0.6421022340867363, + "learning_rate": 2.950271336350366e-06, + "loss": 0.028, + "step": 28621 + }, + { + "epoch": 3.394047195541326, + "grad_norm": 0.5458582211641794, + "learning_rate": 2.9491402204770062e-06, + "loss": 0.0196, + "step": 28622 + }, + { + "epoch": 3.3941657773034506, + "grad_norm": 0.5681306567718764, + "learning_rate": 2.9480093078855437e-06, + "loss": 0.0232, + "step": 28623 + }, + { + "epoch": 3.394284359065576, + "grad_norm": 0.39089727283308057, + "learning_rate": 2.946878598586378e-06, + "loss": 0.0183, + "step": 28624 + }, + { + "epoch": 3.3944029408277006, + "grad_norm": 0.481361705634616, + "learning_rate": 2.945748092589962e-06, + "loss": 0.0244, + "step": 28625 + }, + { + "epoch": 3.3945215225898258, + "grad_norm": 0.6297202883627535, + "learning_rate": 2.9446177899066977e-06, + "loss": 0.0299, + "step": 28626 + }, + { + "epoch": 3.3946401043519505, + "grad_norm": 0.577305025225877, + "learning_rate": 2.9434876905470116e-06, + "loss": 0.0293, + "step": 28627 + }, + { + "epoch": 3.3947586861140757, + "grad_norm": 0.5123969946188941, + "learning_rate": 2.942357794521319e-06, + "loss": 0.0284, + "step": 28628 + }, + { + "epoch": 3.3948772678762005, + "grad_norm": 0.7803996576933145, + "learning_rate": 2.941228101840049e-06, + "loss": 0.0411, + "step": 28629 + }, + { + "epoch": 3.3949958496383257, + "grad_norm": 0.3482383536339463, + "learning_rate": 2.9400986125135947e-06, + "loss": 0.0183, + "step": 28630 + }, + { + "epoch": 3.3951144314004504, + "grad_norm": 0.45614559561568346, + "learning_rate": 2.938969326552385e-06, + "loss": 0.0216, + "step": 28631 + }, + { + "epoch": 3.3952330131625756, + "grad_norm": 0.5521323949368797, + "learning_rate": 2.9378402439668217e-06, + "loss": 0.0233, + "step": 28632 + }, + { + "epoch": 3.3953515949247004, + "grad_norm": 0.36526326137449017, + "learning_rate": 2.9367113647673165e-06, + "loss": 0.0207, + "step": 28633 + }, + { + "epoch": 3.3954701766868256, + "grad_norm": 0.5737516588126739, + "learning_rate": 2.9355826889642833e-06, + "loss": 0.024, + "step": 28634 + }, + { + "epoch": 3.3955887584489504, + "grad_norm": 0.3087021232916764, + "learning_rate": 2.934454216568111e-06, + "loss": 0.0143, + "step": 28635 + }, + { + "epoch": 3.3957073402110756, + "grad_norm": 0.3622767919799822, + "learning_rate": 2.9333259475892217e-06, + "loss": 0.0138, + "step": 28636 + }, + { + "epoch": 3.3958259219732003, + "grad_norm": 0.5712224121150481, + "learning_rate": 2.9321978820380015e-06, + "loss": 0.0228, + "step": 28637 + }, + { + "epoch": 3.3959445037353255, + "grad_norm": 0.7700770611386845, + "learning_rate": 2.9310700199248558e-06, + "loss": 0.0458, + "step": 28638 + }, + { + "epoch": 3.3960630854974507, + "grad_norm": 0.3992156089903412, + "learning_rate": 2.929942361260182e-06, + "loss": 0.0199, + "step": 28639 + }, + { + "epoch": 3.3961816672595755, + "grad_norm": 0.5946220910984557, + "learning_rate": 2.928814906054375e-06, + "loss": 0.032, + "step": 28640 + }, + { + "epoch": 3.3963002490217002, + "grad_norm": 0.5511204183933522, + "learning_rate": 2.9276876543178345e-06, + "loss": 0.0209, + "step": 28641 + }, + { + "epoch": 3.3964188307838254, + "grad_norm": 0.5646106224472268, + "learning_rate": 2.9265606060609378e-06, + "loss": 0.0375, + "step": 28642 + }, + { + "epoch": 3.3965374125459507, + "grad_norm": 0.603810356242826, + "learning_rate": 2.9254337612940915e-06, + "loss": 0.0385, + "step": 28643 + }, + { + "epoch": 3.3966559943080754, + "grad_norm": 0.48866882788206484, + "learning_rate": 2.924307120027675e-06, + "loss": 0.0232, + "step": 28644 + }, + { + "epoch": 3.3967745760702, + "grad_norm": 0.6315066786602354, + "learning_rate": 2.9231806822720782e-06, + "loss": 0.0407, + "step": 28645 + }, + { + "epoch": 3.3968931578323254, + "grad_norm": 0.2957208871750297, + "learning_rate": 2.9220544480376725e-06, + "loss": 0.011, + "step": 28646 + }, + { + "epoch": 3.3970117395944506, + "grad_norm": 0.6021405693110974, + "learning_rate": 2.9209284173348646e-06, + "loss": 0.0226, + "step": 28647 + }, + { + "epoch": 3.3971303213565753, + "grad_norm": 0.7734052768641267, + "learning_rate": 2.9198025901740143e-06, + "loss": 0.0305, + "step": 28648 + }, + { + "epoch": 3.3972489031187005, + "grad_norm": 0.3960983640295445, + "learning_rate": 2.918676966565506e-06, + "loss": 0.0155, + "step": 28649 + }, + { + "epoch": 3.3973674848808253, + "grad_norm": 0.4659437203646461, + "learning_rate": 2.91755154651972e-06, + "loss": 0.0285, + "step": 28650 + }, + { + "epoch": 3.3974860666429505, + "grad_norm": 0.46839622667816094, + "learning_rate": 2.9164263300470312e-06, + "loss": 0.0207, + "step": 28651 + }, + { + "epoch": 3.3976046484050753, + "grad_norm": 0.5332526289906219, + "learning_rate": 2.915301317157812e-06, + "loss": 0.0219, + "step": 28652 + }, + { + "epoch": 3.3977232301672005, + "grad_norm": 0.7311272150138705, + "learning_rate": 2.9141765078624316e-06, + "loss": 0.0252, + "step": 28653 + }, + { + "epoch": 3.397841811929325, + "grad_norm": 0.5621958718433572, + "learning_rate": 2.9130519021712573e-06, + "loss": 0.0233, + "step": 28654 + }, + { + "epoch": 3.3979603936914504, + "grad_norm": 0.5005612486192839, + "learning_rate": 2.9119275000946604e-06, + "loss": 0.028, + "step": 28655 + }, + { + "epoch": 3.398078975453575, + "grad_norm": 0.5006970573126122, + "learning_rate": 2.9108033016430112e-06, + "loss": 0.022, + "step": 28656 + }, + { + "epoch": 3.3981975572157004, + "grad_norm": 0.39696698543708714, + "learning_rate": 2.909679306826657e-06, + "loss": 0.0203, + "step": 28657 + }, + { + "epoch": 3.398316138977825, + "grad_norm": 0.47673646129947733, + "learning_rate": 2.9085555156559835e-06, + "loss": 0.0218, + "step": 28658 + }, + { + "epoch": 3.3984347207399503, + "grad_norm": 0.6086722832765928, + "learning_rate": 2.9074319281413297e-06, + "loss": 0.032, + "step": 28659 + }, + { + "epoch": 3.398553302502075, + "grad_norm": 0.6567427263626158, + "learning_rate": 2.9063085442930625e-06, + "loss": 0.032, + "step": 28660 + }, + { + "epoch": 3.3986718842642003, + "grad_norm": 0.6744118020882897, + "learning_rate": 2.905185364121538e-06, + "loss": 0.0268, + "step": 28661 + }, + { + "epoch": 3.398790466026325, + "grad_norm": 0.4697440022651021, + "learning_rate": 2.9040623876371106e-06, + "loss": 0.0211, + "step": 28662 + }, + { + "epoch": 3.3989090477884503, + "grad_norm": 0.6018959151604935, + "learning_rate": 2.9029396148501366e-06, + "loss": 0.0395, + "step": 28663 + }, + { + "epoch": 3.399027629550575, + "grad_norm": 1.2210200696799538, + "learning_rate": 2.9018170457709577e-06, + "loss": 0.046, + "step": 28664 + }, + { + "epoch": 3.3991462113127002, + "grad_norm": 0.5280978147158887, + "learning_rate": 2.9006946804099265e-06, + "loss": 0.0236, + "step": 28665 + }, + { + "epoch": 3.399264793074825, + "grad_norm": 0.5848045490975013, + "learning_rate": 2.899572518777391e-06, + "loss": 0.0204, + "step": 28666 + }, + { + "epoch": 3.39938337483695, + "grad_norm": 0.48309621305410255, + "learning_rate": 2.8984505608836977e-06, + "loss": 0.0402, + "step": 28667 + }, + { + "epoch": 3.399501956599075, + "grad_norm": 0.6583308532617692, + "learning_rate": 2.8973288067391856e-06, + "loss": 0.0383, + "step": 28668 + }, + { + "epoch": 3.3996205383612, + "grad_norm": 0.5734113580963313, + "learning_rate": 2.896207256354197e-06, + "loss": 0.0298, + "step": 28669 + }, + { + "epoch": 3.399739120123325, + "grad_norm": 0.47443384616648276, + "learning_rate": 2.895085909739073e-06, + "loss": 0.0233, + "step": 28670 + }, + { + "epoch": 3.39985770188545, + "grad_norm": 0.36451156341291313, + "learning_rate": 2.8939647669041476e-06, + "loss": 0.0224, + "step": 28671 + }, + { + "epoch": 3.399976283647575, + "grad_norm": 0.5124414345982546, + "learning_rate": 2.892843827859759e-06, + "loss": 0.0357, + "step": 28672 + }, + { + "epoch": 3.4000948654097, + "grad_norm": 0.5683773318496417, + "learning_rate": 2.891723092616244e-06, + "loss": 0.0272, + "step": 28673 + }, + { + "epoch": 3.400213447171825, + "grad_norm": 0.45506808913124047, + "learning_rate": 2.8906025611839334e-06, + "loss": 0.0187, + "step": 28674 + }, + { + "epoch": 3.40033202893395, + "grad_norm": 0.3133849655005813, + "learning_rate": 2.8894822335731515e-06, + "loss": 0.0146, + "step": 28675 + }, + { + "epoch": 3.400450610696075, + "grad_norm": 0.4830947410719658, + "learning_rate": 2.888362109794229e-06, + "loss": 0.0257, + "step": 28676 + }, + { + "epoch": 3.4005691924582, + "grad_norm": 0.5825171109347153, + "learning_rate": 2.887242189857492e-06, + "loss": 0.0331, + "step": 28677 + }, + { + "epoch": 3.4006877742203248, + "grad_norm": 0.6193478947462927, + "learning_rate": 2.8861224737732702e-06, + "loss": 0.0232, + "step": 28678 + }, + { + "epoch": 3.40080635598245, + "grad_norm": 0.7518627334069364, + "learning_rate": 2.8850029615518775e-06, + "loss": 0.0236, + "step": 28679 + }, + { + "epoch": 3.4009249377445747, + "grad_norm": 1.0199032000572348, + "learning_rate": 2.883883653203637e-06, + "loss": 0.0356, + "step": 28680 + }, + { + "epoch": 3.4010435195067, + "grad_norm": 0.6217015311258766, + "learning_rate": 2.88276454873887e-06, + "loss": 0.0274, + "step": 28681 + }, + { + "epoch": 3.4011621012688247, + "grad_norm": 0.4994664170498962, + "learning_rate": 2.881645648167891e-06, + "loss": 0.0322, + "step": 28682 + }, + { + "epoch": 3.40128068303095, + "grad_norm": 0.4968503222000168, + "learning_rate": 2.8805269515010223e-06, + "loss": 0.0351, + "step": 28683 + }, + { + "epoch": 3.4013992647930746, + "grad_norm": 0.8536120207142458, + "learning_rate": 2.879408458748559e-06, + "loss": 0.0367, + "step": 28684 + }, + { + "epoch": 3.4015178465552, + "grad_norm": 0.415301804105204, + "learning_rate": 2.878290169920836e-06, + "loss": 0.0179, + "step": 28685 + }, + { + "epoch": 3.4016364283173246, + "grad_norm": 0.591959388509042, + "learning_rate": 2.877172085028143e-06, + "loss": 0.0292, + "step": 28686 + }, + { + "epoch": 3.40175501007945, + "grad_norm": 0.5125800281400903, + "learning_rate": 2.876054204080794e-06, + "loss": 0.0263, + "step": 28687 + }, + { + "epoch": 3.401873591841575, + "grad_norm": 0.4914447814770814, + "learning_rate": 2.874936527089095e-06, + "loss": 0.025, + "step": 28688 + }, + { + "epoch": 3.4019921736036998, + "grad_norm": 0.8579544891791435, + "learning_rate": 2.8738190540633507e-06, + "loss": 0.0392, + "step": 28689 + }, + { + "epoch": 3.4021107553658245, + "grad_norm": 0.3896298029138979, + "learning_rate": 2.8727017850138677e-06, + "loss": 0.0152, + "step": 28690 + }, + { + "epoch": 3.4022293371279497, + "grad_norm": 0.7430567314400559, + "learning_rate": 2.8715847199509286e-06, + "loss": 0.0411, + "step": 28691 + }, + { + "epoch": 3.402347918890075, + "grad_norm": 0.3837008082579921, + "learning_rate": 2.8704678588848538e-06, + "loss": 0.0138, + "step": 28692 + }, + { + "epoch": 3.4024665006521997, + "grad_norm": 0.6069874958664384, + "learning_rate": 2.8693512018259235e-06, + "loss": 0.0327, + "step": 28693 + }, + { + "epoch": 3.4025850824143244, + "grad_norm": 0.4837945792605973, + "learning_rate": 2.8682347487844404e-06, + "loss": 0.0166, + "step": 28694 + }, + { + "epoch": 3.4027036641764496, + "grad_norm": 0.41542222613773194, + "learning_rate": 2.867118499770682e-06, + "loss": 0.0158, + "step": 28695 + }, + { + "epoch": 3.402822245938575, + "grad_norm": 0.5687530140782903, + "learning_rate": 2.8660024547949637e-06, + "loss": 0.0204, + "step": 28696 + }, + { + "epoch": 3.4029408277006996, + "grad_norm": 0.6161707337708398, + "learning_rate": 2.8648866138675514e-06, + "loss": 0.0258, + "step": 28697 + }, + { + "epoch": 3.4030594094628244, + "grad_norm": 0.47970539394193024, + "learning_rate": 2.863770976998742e-06, + "loss": 0.0267, + "step": 28698 + }, + { + "epoch": 3.4031779912249496, + "grad_norm": 0.5275151579705294, + "learning_rate": 2.8626555441988196e-06, + "loss": 0.0255, + "step": 28699 + }, + { + "epoch": 3.4032965729870748, + "grad_norm": 0.4362116865883787, + "learning_rate": 2.8615403154780674e-06, + "loss": 0.02, + "step": 28700 + }, + { + "epoch": 3.4034151547491995, + "grad_norm": 0.36044038665816625, + "learning_rate": 2.860425290846769e-06, + "loss": 0.0204, + "step": 28701 + }, + { + "epoch": 3.4035337365113247, + "grad_norm": 0.40496441429394114, + "learning_rate": 2.8593104703151907e-06, + "loss": 0.0197, + "step": 28702 + }, + { + "epoch": 3.4036523182734495, + "grad_norm": 0.5220459545180913, + "learning_rate": 2.8581958538936275e-06, + "loss": 0.0268, + "step": 28703 + }, + { + "epoch": 3.4037709000355747, + "grad_norm": 0.6526097441864515, + "learning_rate": 2.8570814415923437e-06, + "loss": 0.0273, + "step": 28704 + }, + { + "epoch": 3.4038894817976995, + "grad_norm": 0.5790065441913853, + "learning_rate": 2.855967233421619e-06, + "loss": 0.0256, + "step": 28705 + }, + { + "epoch": 3.4040080635598247, + "grad_norm": 0.5436311261108875, + "learning_rate": 2.854853229391713e-06, + "loss": 0.025, + "step": 28706 + }, + { + "epoch": 3.4041266453219494, + "grad_norm": 0.4343107809978202, + "learning_rate": 2.8537394295129137e-06, + "loss": 0.0205, + "step": 28707 + }, + { + "epoch": 3.4042452270840746, + "grad_norm": 0.5620867583354788, + "learning_rate": 2.852625833795475e-06, + "loss": 0.0274, + "step": 28708 + }, + { + "epoch": 3.4043638088461994, + "grad_norm": 0.41442277673357447, + "learning_rate": 2.8515124422496653e-06, + "loss": 0.0179, + "step": 28709 + }, + { + "epoch": 3.4044823906083246, + "grad_norm": 0.3443210585825308, + "learning_rate": 2.8503992548857494e-06, + "loss": 0.0191, + "step": 28710 + }, + { + "epoch": 3.4046009723704493, + "grad_norm": 0.8257153205727438, + "learning_rate": 2.849286271713994e-06, + "loss": 0.045, + "step": 28711 + }, + { + "epoch": 3.4047195541325745, + "grad_norm": 0.5283162930432177, + "learning_rate": 2.84817349274466e-06, + "loss": 0.0232, + "step": 28712 + }, + { + "epoch": 3.4048381358946993, + "grad_norm": 0.523276236365917, + "learning_rate": 2.8470609179879926e-06, + "loss": 0.0289, + "step": 28713 + }, + { + "epoch": 3.4049567176568245, + "grad_norm": 0.6940582442898007, + "learning_rate": 2.8459485474542665e-06, + "loss": 0.0368, + "step": 28714 + }, + { + "epoch": 3.4050752994189493, + "grad_norm": 0.48787674195045616, + "learning_rate": 2.8448363811537234e-06, + "loss": 0.0182, + "step": 28715 + }, + { + "epoch": 3.4051938811810745, + "grad_norm": 0.6928582701375778, + "learning_rate": 2.8437244190966246e-06, + "loss": 0.036, + "step": 28716 + }, + { + "epoch": 3.4053124629431992, + "grad_norm": 0.45113608373093816, + "learning_rate": 2.842612661293209e-06, + "loss": 0.0196, + "step": 28717 + }, + { + "epoch": 3.4054310447053244, + "grad_norm": 0.372123163089793, + "learning_rate": 2.8415011077537413e-06, + "loss": 0.0149, + "step": 28718 + }, + { + "epoch": 3.405549626467449, + "grad_norm": 0.3774003618418143, + "learning_rate": 2.840389758488457e-06, + "loss": 0.0176, + "step": 28719 + }, + { + "epoch": 3.4056682082295744, + "grad_norm": 0.5791807664568188, + "learning_rate": 2.839278613507604e-06, + "loss": 0.0314, + "step": 28720 + }, + { + "epoch": 3.405786789991699, + "grad_norm": 0.5356534075734177, + "learning_rate": 2.838167672821429e-06, + "loss": 0.0225, + "step": 28721 + }, + { + "epoch": 3.4059053717538244, + "grad_norm": 0.6012224087747974, + "learning_rate": 2.8370569364401687e-06, + "loss": 0.0288, + "step": 28722 + }, + { + "epoch": 3.406023953515949, + "grad_norm": 0.4410836305893754, + "learning_rate": 2.8359464043740735e-06, + "loss": 0.0205, + "step": 28723 + }, + { + "epoch": 3.4061425352780743, + "grad_norm": 0.5974094891854677, + "learning_rate": 2.8348360766333654e-06, + "loss": 0.0316, + "step": 28724 + }, + { + "epoch": 3.406261117040199, + "grad_norm": 0.6825578503203708, + "learning_rate": 2.833725953228289e-06, + "loss": 0.0261, + "step": 28725 + }, + { + "epoch": 3.4063796988023243, + "grad_norm": 0.5538535382753909, + "learning_rate": 2.8326160341690777e-06, + "loss": 0.0317, + "step": 28726 + }, + { + "epoch": 3.406498280564449, + "grad_norm": 0.4563194512946678, + "learning_rate": 2.831506319465968e-06, + "loss": 0.0217, + "step": 28727 + }, + { + "epoch": 3.4066168623265742, + "grad_norm": 0.3267326012780439, + "learning_rate": 2.8303968091291766e-06, + "loss": 0.0164, + "step": 28728 + }, + { + "epoch": 3.406735444088699, + "grad_norm": 0.5110498996741871, + "learning_rate": 2.829287503168948e-06, + "loss": 0.0316, + "step": 28729 + }, + { + "epoch": 3.406854025850824, + "grad_norm": 0.6683379512445315, + "learning_rate": 2.8281784015954965e-06, + "loss": 0.0413, + "step": 28730 + }, + { + "epoch": 3.406972607612949, + "grad_norm": 0.33033705298567917, + "learning_rate": 2.8270695044190525e-06, + "loss": 0.0165, + "step": 28731 + }, + { + "epoch": 3.407091189375074, + "grad_norm": 0.5347918989066297, + "learning_rate": 2.825960811649836e-06, + "loss": 0.0334, + "step": 28732 + }, + { + "epoch": 3.407209771137199, + "grad_norm": 0.5661835465693941, + "learning_rate": 2.824852323298069e-06, + "loss": 0.0297, + "step": 28733 + }, + { + "epoch": 3.407328352899324, + "grad_norm": 1.5902489594171931, + "learning_rate": 2.823744039373977e-06, + "loss": 0.0532, + "step": 28734 + }, + { + "epoch": 3.407446934661449, + "grad_norm": 0.4962946204951034, + "learning_rate": 2.822635959887765e-06, + "loss": 0.0283, + "step": 28735 + }, + { + "epoch": 3.407565516423574, + "grad_norm": 0.4808005085953573, + "learning_rate": 2.8215280848496535e-06, + "loss": 0.015, + "step": 28736 + }, + { + "epoch": 3.4076840981856993, + "grad_norm": 0.5144916528636831, + "learning_rate": 2.8204204142698586e-06, + "loss": 0.0273, + "step": 28737 + }, + { + "epoch": 3.407802679947824, + "grad_norm": 0.5716711446858209, + "learning_rate": 2.819312948158587e-06, + "loss": 0.0302, + "step": 28738 + }, + { + "epoch": 3.407921261709949, + "grad_norm": 0.38001127083280534, + "learning_rate": 2.818205686526054e-06, + "loss": 0.0216, + "step": 28739 + }, + { + "epoch": 3.408039843472074, + "grad_norm": 0.43633095936637223, + "learning_rate": 2.8170986293824533e-06, + "loss": 0.0231, + "step": 28740 + }, + { + "epoch": 3.408158425234199, + "grad_norm": 0.49800365188904394, + "learning_rate": 2.815991776738014e-06, + "loss": 0.0245, + "step": 28741 + }, + { + "epoch": 3.408277006996324, + "grad_norm": 0.5387362156733787, + "learning_rate": 2.814885128602918e-06, + "loss": 0.029, + "step": 28742 + }, + { + "epoch": 3.4083955887584487, + "grad_norm": 0.8424977291524355, + "learning_rate": 2.813778684987378e-06, + "loss": 0.0473, + "step": 28743 + }, + { + "epoch": 3.408514170520574, + "grad_norm": 0.47011805050629885, + "learning_rate": 2.812672445901593e-06, + "loss": 0.0267, + "step": 28744 + }, + { + "epoch": 3.408632752282699, + "grad_norm": 0.3504501929140975, + "learning_rate": 2.811566411355762e-06, + "loss": 0.0192, + "step": 28745 + }, + { + "epoch": 3.408751334044824, + "grad_norm": 0.559375505669071, + "learning_rate": 2.8104605813600776e-06, + "loss": 0.027, + "step": 28746 + }, + { + "epoch": 3.4088699158069486, + "grad_norm": 0.415207817184694, + "learning_rate": 2.809354955924734e-06, + "loss": 0.0182, + "step": 28747 + }, + { + "epoch": 3.408988497569074, + "grad_norm": 0.6249242115076572, + "learning_rate": 2.8082495350599258e-06, + "loss": 0.0356, + "step": 28748 + }, + { + "epoch": 3.409107079331199, + "grad_norm": 0.5403208284915397, + "learning_rate": 2.807144318775845e-06, + "loss": 0.0244, + "step": 28749 + }, + { + "epoch": 3.409225661093324, + "grad_norm": 0.2619900264036093, + "learning_rate": 2.8060393070826834e-06, + "loss": 0.0125, + "step": 28750 + }, + { + "epoch": 3.409344242855449, + "grad_norm": 0.49729693096449623, + "learning_rate": 2.8049344999906134e-06, + "loss": 0.025, + "step": 28751 + }, + { + "epoch": 3.4094628246175738, + "grad_norm": 0.45346782010289904, + "learning_rate": 2.803829897509838e-06, + "loss": 0.0282, + "step": 28752 + }, + { + "epoch": 3.409581406379699, + "grad_norm": 0.38342131177603156, + "learning_rate": 2.80272549965053e-06, + "loss": 0.0213, + "step": 28753 + }, + { + "epoch": 3.4096999881418237, + "grad_norm": 0.36535184539612237, + "learning_rate": 2.8016213064228774e-06, + "loss": 0.0153, + "step": 28754 + }, + { + "epoch": 3.409818569903949, + "grad_norm": 0.5028196633042868, + "learning_rate": 2.8005173178370423e-06, + "loss": 0.0226, + "step": 28755 + }, + { + "epoch": 3.4099371516660737, + "grad_norm": 0.5604412292891575, + "learning_rate": 2.7994135339032278e-06, + "loss": 0.0369, + "step": 28756 + }, + { + "epoch": 3.410055733428199, + "grad_norm": 0.4373224610143818, + "learning_rate": 2.7983099546315895e-06, + "loss": 0.0189, + "step": 28757 + }, + { + "epoch": 3.4101743151903237, + "grad_norm": 0.7633271914761219, + "learning_rate": 2.7972065800323078e-06, + "loss": 0.0369, + "step": 28758 + }, + { + "epoch": 3.410292896952449, + "grad_norm": 0.46645790246154306, + "learning_rate": 2.7961034101155553e-06, + "loss": 0.0281, + "step": 28759 + }, + { + "epoch": 3.4104114787145736, + "grad_norm": 0.46017041718832447, + "learning_rate": 2.795000444891502e-06, + "loss": 0.0215, + "step": 28760 + }, + { + "epoch": 3.410530060476699, + "grad_norm": 0.370389651058783, + "learning_rate": 2.7938976843703195e-06, + "loss": 0.0156, + "step": 28761 + }, + { + "epoch": 3.4106486422388236, + "grad_norm": 0.48596459163072586, + "learning_rate": 2.792795128562159e-06, + "loss": 0.0242, + "step": 28762 + }, + { + "epoch": 3.410767224000949, + "grad_norm": 0.5170399844450867, + "learning_rate": 2.7916927774772034e-06, + "loss": 0.0264, + "step": 28763 + }, + { + "epoch": 3.4108858057630735, + "grad_norm": 0.438527228637401, + "learning_rate": 2.790590631125603e-06, + "loss": 0.0131, + "step": 28764 + }, + { + "epoch": 3.4110043875251987, + "grad_norm": 0.7072710669334924, + "learning_rate": 2.7894886895175275e-06, + "loss": 0.0281, + "step": 28765 + }, + { + "epoch": 3.4111229692873235, + "grad_norm": 0.6362325291244383, + "learning_rate": 2.7883869526631217e-06, + "loss": 0.033, + "step": 28766 + }, + { + "epoch": 3.4112415510494487, + "grad_norm": 0.6868435687160185, + "learning_rate": 2.787285420572558e-06, + "loss": 0.0292, + "step": 28767 + }, + { + "epoch": 3.4113601328115735, + "grad_norm": 0.6251614381922825, + "learning_rate": 2.786184093255981e-06, + "loss": 0.0214, + "step": 28768 + }, + { + "epoch": 3.4114787145736987, + "grad_norm": 0.303215345526979, + "learning_rate": 2.7850829707235432e-06, + "loss": 0.0148, + "step": 28769 + }, + { + "epoch": 3.4115972963358234, + "grad_norm": 0.33098764797048735, + "learning_rate": 2.783982052985401e-06, + "loss": 0.0143, + "step": 28770 + }, + { + "epoch": 3.4117158780979486, + "grad_norm": 0.3887891229096125, + "learning_rate": 2.782881340051699e-06, + "loss": 0.0185, + "step": 28771 + }, + { + "epoch": 3.4118344598600734, + "grad_norm": 0.44030216548617135, + "learning_rate": 2.7817808319325954e-06, + "loss": 0.0178, + "step": 28772 + }, + { + "epoch": 3.4119530416221986, + "grad_norm": 0.6223420398593249, + "learning_rate": 2.780680528638213e-06, + "loss": 0.03, + "step": 28773 + }, + { + "epoch": 3.4120716233843233, + "grad_norm": 0.48617301528770646, + "learning_rate": 2.7795804301787214e-06, + "loss": 0.0285, + "step": 28774 + }, + { + "epoch": 3.4121902051464486, + "grad_norm": 0.6103593823967456, + "learning_rate": 2.778480536564243e-06, + "loss": 0.0387, + "step": 28775 + }, + { + "epoch": 3.4123087869085733, + "grad_norm": 0.40114054083933803, + "learning_rate": 2.7773808478049307e-06, + "loss": 0.0222, + "step": 28776 + }, + { + "epoch": 3.4124273686706985, + "grad_norm": 0.4168675835210919, + "learning_rate": 2.776281363910904e-06, + "loss": 0.0232, + "step": 28777 + }, + { + "epoch": 3.4125459504328233, + "grad_norm": 0.7261389552771454, + "learning_rate": 2.775182084892322e-06, + "loss": 0.0304, + "step": 28778 + }, + { + "epoch": 3.4126645321949485, + "grad_norm": 0.3847977613684596, + "learning_rate": 2.7740830107593013e-06, + "loss": 0.0128, + "step": 28779 + }, + { + "epoch": 3.4127831139570732, + "grad_norm": 0.6057826202439162, + "learning_rate": 2.772984141521981e-06, + "loss": 0.0277, + "step": 28780 + }, + { + "epoch": 3.4129016957191984, + "grad_norm": 0.3210043831802938, + "learning_rate": 2.7718854771904916e-06, + "loss": 0.0188, + "step": 28781 + }, + { + "epoch": 3.413020277481323, + "grad_norm": 0.45900613394174505, + "learning_rate": 2.770787017774959e-06, + "loss": 0.0198, + "step": 28782 + }, + { + "epoch": 3.4131388592434484, + "grad_norm": 0.6276219367503522, + "learning_rate": 2.7696887632855136e-06, + "loss": 0.024, + "step": 28783 + }, + { + "epoch": 3.413257441005573, + "grad_norm": 0.3203936152781949, + "learning_rate": 2.768590713732269e-06, + "loss": 0.0183, + "step": 28784 + }, + { + "epoch": 3.4133760227676984, + "grad_norm": 0.41001388956549906, + "learning_rate": 2.7674928691253656e-06, + "loss": 0.0259, + "step": 28785 + }, + { + "epoch": 3.4134946045298236, + "grad_norm": 0.31664711816023183, + "learning_rate": 2.766395229474911e-06, + "loss": 0.0146, + "step": 28786 + }, + { + "epoch": 3.4136131862919483, + "grad_norm": 0.4059405856075139, + "learning_rate": 2.7652977947910274e-06, + "loss": 0.0165, + "step": 28787 + }, + { + "epoch": 3.413731768054073, + "grad_norm": 0.5782918908508955, + "learning_rate": 2.7642005650838325e-06, + "loss": 0.0199, + "step": 28788 + }, + { + "epoch": 3.4138503498161983, + "grad_norm": 0.4867881540640706, + "learning_rate": 2.7631035403634402e-06, + "loss": 0.0171, + "step": 28789 + }, + { + "epoch": 3.4139689315783235, + "grad_norm": 0.5586946682758357, + "learning_rate": 2.7620067206399675e-06, + "loss": 0.0198, + "step": 28790 + }, + { + "epoch": 3.4140875133404482, + "grad_norm": 0.5569610844118944, + "learning_rate": 2.76091010592352e-06, + "loss": 0.0235, + "step": 28791 + }, + { + "epoch": 3.414206095102573, + "grad_norm": 0.5031771499086548, + "learning_rate": 2.7598136962242117e-06, + "loss": 0.0305, + "step": 28792 + }, + { + "epoch": 3.414324676864698, + "grad_norm": 0.642888908574521, + "learning_rate": 2.7587174915521456e-06, + "loss": 0.0422, + "step": 28793 + }, + { + "epoch": 3.4144432586268234, + "grad_norm": 1.3891231444338141, + "learning_rate": 2.7576214919174364e-06, + "loss": 0.0731, + "step": 28794 + }, + { + "epoch": 3.414561840388948, + "grad_norm": 0.4735963545492486, + "learning_rate": 2.7565256973301757e-06, + "loss": 0.0122, + "step": 28795 + }, + { + "epoch": 3.414680422151073, + "grad_norm": 0.47936457788849773, + "learning_rate": 2.755430107800472e-06, + "loss": 0.0197, + "step": 28796 + }, + { + "epoch": 3.414799003913198, + "grad_norm": 0.4019874670183177, + "learning_rate": 2.7543347233384256e-06, + "loss": 0.022, + "step": 28797 + }, + { + "epoch": 3.4149175856753233, + "grad_norm": 0.48441704674533564, + "learning_rate": 2.7532395439541307e-06, + "loss": 0.0233, + "step": 28798 + }, + { + "epoch": 3.415036167437448, + "grad_norm": 0.9570420677325593, + "learning_rate": 2.7521445696576886e-06, + "loss": 0.0346, + "step": 28799 + }, + { + "epoch": 3.4151547491995733, + "grad_norm": 0.5721184465660097, + "learning_rate": 2.7510498004591933e-06, + "loss": 0.0307, + "step": 28800 + }, + { + "epoch": 3.415273330961698, + "grad_norm": 0.6766586062906783, + "learning_rate": 2.749955236368737e-06, + "loss": 0.0374, + "step": 28801 + }, + { + "epoch": 3.4153919127238233, + "grad_norm": 0.48773173818155907, + "learning_rate": 2.748860877396406e-06, + "loss": 0.0317, + "step": 28802 + }, + { + "epoch": 3.415510494485948, + "grad_norm": 0.5844566180454868, + "learning_rate": 2.747766723552289e-06, + "loss": 0.0254, + "step": 28803 + }, + { + "epoch": 3.415629076248073, + "grad_norm": 0.6527410842950828, + "learning_rate": 2.746672774846476e-06, + "loss": 0.031, + "step": 28804 + }, + { + "epoch": 3.415747658010198, + "grad_norm": 0.35177256469215223, + "learning_rate": 2.7455790312890583e-06, + "loss": 0.0149, + "step": 28805 + }, + { + "epoch": 3.415866239772323, + "grad_norm": 0.4173885102443358, + "learning_rate": 2.744485492890106e-06, + "loss": 0.0161, + "step": 28806 + }, + { + "epoch": 3.415984821534448, + "grad_norm": 0.6632841294726318, + "learning_rate": 2.743392159659705e-06, + "loss": 0.039, + "step": 28807 + }, + { + "epoch": 3.416103403296573, + "grad_norm": 0.4922858942413555, + "learning_rate": 2.7422990316079337e-06, + "loss": 0.0333, + "step": 28808 + }, + { + "epoch": 3.416221985058698, + "grad_norm": 0.5973677978098286, + "learning_rate": 2.741206108744876e-06, + "loss": 0.0202, + "step": 28809 + }, + { + "epoch": 3.416340566820823, + "grad_norm": 0.5796100248628445, + "learning_rate": 2.740113391080604e-06, + "loss": 0.0274, + "step": 28810 + }, + { + "epoch": 3.416459148582948, + "grad_norm": 0.40189536892413713, + "learning_rate": 2.7390208786251786e-06, + "loss": 0.0227, + "step": 28811 + }, + { + "epoch": 3.416577730345073, + "grad_norm": 0.4130522267539853, + "learning_rate": 2.7379285713886954e-06, + "loss": 0.0269, + "step": 28812 + }, + { + "epoch": 3.416696312107198, + "grad_norm": 0.6587647972687322, + "learning_rate": 2.7368364693812045e-06, + "loss": 0.0343, + "step": 28813 + }, + { + "epoch": 3.416814893869323, + "grad_norm": 0.7035266555521539, + "learning_rate": 2.7357445726127807e-06, + "loss": 0.0214, + "step": 28814 + }, + { + "epoch": 3.416933475631448, + "grad_norm": 0.5738068767086077, + "learning_rate": 2.734652881093491e-06, + "loss": 0.0332, + "step": 28815 + }, + { + "epoch": 3.417052057393573, + "grad_norm": 0.6777586906310089, + "learning_rate": 2.7335613948334003e-06, + "loss": 0.0345, + "step": 28816 + }, + { + "epoch": 3.4171706391556977, + "grad_norm": 0.475651149187935, + "learning_rate": 2.732470113842567e-06, + "loss": 0.0226, + "step": 28817 + }, + { + "epoch": 3.417289220917823, + "grad_norm": 0.955411133487436, + "learning_rate": 2.731379038131052e-06, + "loss": 0.0327, + "step": 28818 + }, + { + "epoch": 3.4174078026799477, + "grad_norm": 0.8031860796836426, + "learning_rate": 2.7302881677089135e-06, + "loss": 0.0214, + "step": 28819 + }, + { + "epoch": 3.417526384442073, + "grad_norm": 0.6387222375595267, + "learning_rate": 2.7291975025862087e-06, + "loss": 0.0345, + "step": 28820 + }, + { + "epoch": 3.4176449662041977, + "grad_norm": 0.6853281308370222, + "learning_rate": 2.728107042772998e-06, + "loss": 0.0336, + "step": 28821 + }, + { + "epoch": 3.417763547966323, + "grad_norm": 0.4956106200020485, + "learning_rate": 2.727016788279321e-06, + "loss": 0.0325, + "step": 28822 + }, + { + "epoch": 3.4178821297284476, + "grad_norm": 0.5748393680819514, + "learning_rate": 2.725926739115245e-06, + "loss": 0.0285, + "step": 28823 + }, + { + "epoch": 3.418000711490573, + "grad_norm": 0.7320934680525206, + "learning_rate": 2.7248368952908053e-06, + "loss": 0.0315, + "step": 28824 + }, + { + "epoch": 3.4181192932526976, + "grad_norm": 0.6924784333375711, + "learning_rate": 2.723747256816059e-06, + "loss": 0.0286, + "step": 28825 + }, + { + "epoch": 3.418237875014823, + "grad_norm": 0.6334722623028909, + "learning_rate": 2.722657823701036e-06, + "loss": 0.0243, + "step": 28826 + }, + { + "epoch": 3.4183564567769475, + "grad_norm": 0.2995274659004343, + "learning_rate": 2.721568595955798e-06, + "loss": 0.0185, + "step": 28827 + }, + { + "epoch": 3.4184750385390728, + "grad_norm": 0.26480377347418066, + "learning_rate": 2.720479573590373e-06, + "loss": 0.0137, + "step": 28828 + }, + { + "epoch": 3.4185936203011975, + "grad_norm": 0.388129896090892, + "learning_rate": 2.7193907566148063e-06, + "loss": 0.0185, + "step": 28829 + }, + { + "epoch": 3.4187122020633227, + "grad_norm": 0.6594484573794905, + "learning_rate": 2.7183021450391367e-06, + "loss": 0.0283, + "step": 28830 + }, + { + "epoch": 3.4188307838254475, + "grad_norm": 0.3342673508643401, + "learning_rate": 2.717213738873395e-06, + "loss": 0.02, + "step": 28831 + }, + { + "epoch": 3.4189493655875727, + "grad_norm": 0.6228229894317422, + "learning_rate": 2.716125538127623e-06, + "loss": 0.035, + "step": 28832 + }, + { + "epoch": 3.4190679473496974, + "grad_norm": 0.3331075369698268, + "learning_rate": 2.715037542811838e-06, + "loss": 0.0205, + "step": 28833 + }, + { + "epoch": 3.4191865291118226, + "grad_norm": 0.5810199597440494, + "learning_rate": 2.7139497529360875e-06, + "loss": 0.0243, + "step": 28834 + }, + { + "epoch": 3.4193051108739474, + "grad_norm": 0.4490727551782478, + "learning_rate": 2.7128621685103885e-06, + "loss": 0.0239, + "step": 28835 + }, + { + "epoch": 3.4194236926360726, + "grad_norm": 0.8506575087163718, + "learning_rate": 2.7117747895447743e-06, + "loss": 0.0408, + "step": 28836 + }, + { + "epoch": 3.4195422743981974, + "grad_norm": 0.4392783081055932, + "learning_rate": 2.710687616049254e-06, + "loss": 0.0262, + "step": 28837 + }, + { + "epoch": 3.4196608561603226, + "grad_norm": 0.43320267239712923, + "learning_rate": 2.709600648033872e-06, + "loss": 0.0188, + "step": 28838 + }, + { + "epoch": 3.4197794379224478, + "grad_norm": 0.4726697389903445, + "learning_rate": 2.7085138855086317e-06, + "loss": 0.0234, + "step": 28839 + }, + { + "epoch": 3.4198980196845725, + "grad_norm": 0.44346447643242715, + "learning_rate": 2.707427328483558e-06, + "loss": 0.0244, + "step": 28840 + }, + { + "epoch": 3.4200166014466973, + "grad_norm": 0.7564008539168818, + "learning_rate": 2.706340976968669e-06, + "loss": 0.0333, + "step": 28841 + }, + { + "epoch": 3.4201351832088225, + "grad_norm": 0.7314420056726109, + "learning_rate": 2.7052548309739774e-06, + "loss": 0.0371, + "step": 28842 + }, + { + "epoch": 3.4202537649709477, + "grad_norm": 0.4941244638336679, + "learning_rate": 2.704168890509501e-06, + "loss": 0.0288, + "step": 28843 + }, + { + "epoch": 3.4203723467330724, + "grad_norm": 0.7031509376201447, + "learning_rate": 2.703083155585237e-06, + "loss": 0.0252, + "step": 28844 + }, + { + "epoch": 3.420490928495197, + "grad_norm": 0.6535322389990473, + "learning_rate": 2.7019976262112113e-06, + "loss": 0.0319, + "step": 28845 + }, + { + "epoch": 3.4206095102573224, + "grad_norm": 0.7419198214202127, + "learning_rate": 2.700912302397421e-06, + "loss": 0.0298, + "step": 28846 + }, + { + "epoch": 3.4207280920194476, + "grad_norm": 0.5418175536770633, + "learning_rate": 2.699827184153875e-06, + "loss": 0.0329, + "step": 28847 + }, + { + "epoch": 3.4208466737815724, + "grad_norm": 0.4513432167652821, + "learning_rate": 2.698742271490576e-06, + "loss": 0.0243, + "step": 28848 + }, + { + "epoch": 3.4209652555436976, + "grad_norm": 0.4189299049178581, + "learning_rate": 2.6976575644175244e-06, + "loss": 0.017, + "step": 28849 + }, + { + "epoch": 3.4210838373058223, + "grad_norm": 0.4908025394991333, + "learning_rate": 2.696573062944727e-06, + "loss": 0.0234, + "step": 28850 + }, + { + "epoch": 3.4212024190679475, + "grad_norm": 0.37056933843292483, + "learning_rate": 2.6954887670821714e-06, + "loss": 0.0194, + "step": 28851 + }, + { + "epoch": 3.4213210008300723, + "grad_norm": 0.5509020490206578, + "learning_rate": 2.694404676839857e-06, + "loss": 0.0249, + "step": 28852 + }, + { + "epoch": 3.4214395825921975, + "grad_norm": 0.8248525963852674, + "learning_rate": 2.693320792227777e-06, + "loss": 0.0371, + "step": 28853 + }, + { + "epoch": 3.4215581643543223, + "grad_norm": 0.47423697815192395, + "learning_rate": 2.6922371132559328e-06, + "loss": 0.0274, + "step": 28854 + }, + { + "epoch": 3.4216767461164475, + "grad_norm": 0.5003525357083477, + "learning_rate": 2.691153639934299e-06, + "loss": 0.0226, + "step": 28855 + }, + { + "epoch": 3.421795327878572, + "grad_norm": 0.7930953747637942, + "learning_rate": 2.6900703722728738e-06, + "loss": 0.0436, + "step": 28856 + }, + { + "epoch": 3.4219139096406974, + "grad_norm": 0.8214240661945638, + "learning_rate": 2.6889873102816405e-06, + "loss": 0.0458, + "step": 28857 + }, + { + "epoch": 3.422032491402822, + "grad_norm": 0.9454561433902492, + "learning_rate": 2.6879044539705855e-06, + "loss": 0.0414, + "step": 28858 + }, + { + "epoch": 3.4221510731649474, + "grad_norm": 0.6580089227418033, + "learning_rate": 2.68682180334969e-06, + "loss": 0.0298, + "step": 28859 + }, + { + "epoch": 3.422269654927072, + "grad_norm": 0.48506787182481154, + "learning_rate": 2.6857393584289347e-06, + "loss": 0.023, + "step": 28860 + }, + { + "epoch": 3.4223882366891973, + "grad_norm": 0.5142529954386083, + "learning_rate": 2.684657119218306e-06, + "loss": 0.021, + "step": 28861 + }, + { + "epoch": 3.422506818451322, + "grad_norm": 0.4062819578139395, + "learning_rate": 2.683575085727766e-06, + "loss": 0.0232, + "step": 28862 + }, + { + "epoch": 3.4226254002134473, + "grad_norm": 0.9179842371274384, + "learning_rate": 2.6824932579673e-06, + "loss": 0.0581, + "step": 28863 + }, + { + "epoch": 3.422743981975572, + "grad_norm": 0.41338171245838584, + "learning_rate": 2.681411635946876e-06, + "loss": 0.0224, + "step": 28864 + }, + { + "epoch": 3.4228625637376973, + "grad_norm": 0.597720482757603, + "learning_rate": 2.6803302196764744e-06, + "loss": 0.0234, + "step": 28865 + }, + { + "epoch": 3.422981145499822, + "grad_norm": 0.7673419683828584, + "learning_rate": 2.6792490091660544e-06, + "loss": 0.0382, + "step": 28866 + }, + { + "epoch": 3.4230997272619472, + "grad_norm": 0.44033864973767983, + "learning_rate": 2.6781680044255854e-06, + "loss": 0.0156, + "step": 28867 + }, + { + "epoch": 3.423218309024072, + "grad_norm": 0.34685110524468127, + "learning_rate": 2.677087205465034e-06, + "loss": 0.0139, + "step": 28868 + }, + { + "epoch": 3.423336890786197, + "grad_norm": 0.5520318585130529, + "learning_rate": 2.6760066122943657e-06, + "loss": 0.025, + "step": 28869 + }, + { + "epoch": 3.423455472548322, + "grad_norm": 0.7190642751507834, + "learning_rate": 2.6749262249235406e-06, + "loss": 0.0277, + "step": 28870 + }, + { + "epoch": 3.423574054310447, + "grad_norm": 0.4496697873807696, + "learning_rate": 2.6738460433625186e-06, + "loss": 0.0225, + "step": 28871 + }, + { + "epoch": 3.423692636072572, + "grad_norm": 0.29199031169433565, + "learning_rate": 2.672766067621263e-06, + "loss": 0.0134, + "step": 28872 + }, + { + "epoch": 3.423811217834697, + "grad_norm": 0.6858097021769025, + "learning_rate": 2.671686297709719e-06, + "loss": 0.0464, + "step": 28873 + }, + { + "epoch": 3.423929799596822, + "grad_norm": 0.4347315545448251, + "learning_rate": 2.670606733637848e-06, + "loss": 0.0277, + "step": 28874 + }, + { + "epoch": 3.424048381358947, + "grad_norm": 0.40337716458964296, + "learning_rate": 2.669527375415601e-06, + "loss": 0.0229, + "step": 28875 + }, + { + "epoch": 3.424166963121072, + "grad_norm": 0.37926823007009436, + "learning_rate": 2.668448223052933e-06, + "loss": 0.0141, + "step": 28876 + }, + { + "epoch": 3.424285544883197, + "grad_norm": 0.5080023095660138, + "learning_rate": 2.667369276559781e-06, + "loss": 0.0183, + "step": 28877 + }, + { + "epoch": 3.424404126645322, + "grad_norm": 0.742442396846309, + "learning_rate": 2.6662905359460975e-06, + "loss": 0.035, + "step": 28878 + }, + { + "epoch": 3.424522708407447, + "grad_norm": 0.5925498525310628, + "learning_rate": 2.6652120012218313e-06, + "loss": 0.0313, + "step": 28879 + }, + { + "epoch": 3.4246412901695718, + "grad_norm": 0.5615337595323237, + "learning_rate": 2.6641336723969207e-06, + "loss": 0.0343, + "step": 28880 + }, + { + "epoch": 3.424759871931697, + "grad_norm": 0.5102640743614011, + "learning_rate": 2.6630555494813108e-06, + "loss": 0.0199, + "step": 28881 + }, + { + "epoch": 3.4248784536938217, + "grad_norm": 0.7299367425284072, + "learning_rate": 2.661977632484927e-06, + "loss": 0.0379, + "step": 28882 + }, + { + "epoch": 3.424997035455947, + "grad_norm": 0.3311944831043888, + "learning_rate": 2.660899921417728e-06, + "loss": 0.0154, + "step": 28883 + }, + { + "epoch": 3.4251156172180717, + "grad_norm": 0.5171135340395947, + "learning_rate": 2.659822416289634e-06, + "loss": 0.0215, + "step": 28884 + }, + { + "epoch": 3.425234198980197, + "grad_norm": 0.4728392674639292, + "learning_rate": 2.658745117110581e-06, + "loss": 0.0211, + "step": 28885 + }, + { + "epoch": 3.4253527807423216, + "grad_norm": 0.4955424733786062, + "learning_rate": 2.6576680238905e-06, + "loss": 0.0234, + "step": 28886 + }, + { + "epoch": 3.425471362504447, + "grad_norm": 0.4413447835970298, + "learning_rate": 2.6565911366393304e-06, + "loss": 0.0193, + "step": 28887 + }, + { + "epoch": 3.425589944266572, + "grad_norm": 0.4337814196334114, + "learning_rate": 2.6555144553669836e-06, + "loss": 0.0155, + "step": 28888 + }, + { + "epoch": 3.425708526028697, + "grad_norm": 0.6703212141361025, + "learning_rate": 2.654437980083391e-06, + "loss": 0.0226, + "step": 28889 + }, + { + "epoch": 3.4258271077908216, + "grad_norm": 0.5787462506618869, + "learning_rate": 2.653361710798483e-06, + "loss": 0.0358, + "step": 28890 + }, + { + "epoch": 3.4259456895529468, + "grad_norm": 1.025571214280264, + "learning_rate": 2.6522856475221735e-06, + "loss": 0.0589, + "step": 28891 + }, + { + "epoch": 3.426064271315072, + "grad_norm": 0.4804945714205404, + "learning_rate": 2.651209790264392e-06, + "loss": 0.0301, + "step": 28892 + }, + { + "epoch": 3.4261828530771967, + "grad_norm": 0.4250932148559547, + "learning_rate": 2.6501341390350376e-06, + "loss": 0.0197, + "step": 28893 + }, + { + "epoch": 3.4263014348393215, + "grad_norm": 0.7705823944007307, + "learning_rate": 2.6490586938440532e-06, + "loss": 0.0334, + "step": 28894 + }, + { + "epoch": 3.4264200166014467, + "grad_norm": 0.5325498944444598, + "learning_rate": 2.6479834547013304e-06, + "loss": 0.025, + "step": 28895 + }, + { + "epoch": 3.426538598363572, + "grad_norm": 0.7604292267691592, + "learning_rate": 2.646908421616792e-06, + "loss": 0.0313, + "step": 28896 + }, + { + "epoch": 3.4266571801256966, + "grad_norm": 0.4250494865032407, + "learning_rate": 2.645833594600347e-06, + "loss": 0.0213, + "step": 28897 + }, + { + "epoch": 3.426775761887822, + "grad_norm": 0.29908906638436566, + "learning_rate": 2.644758973661904e-06, + "loss": 0.0102, + "step": 28898 + }, + { + "epoch": 3.4268943436499466, + "grad_norm": 0.5135969459152446, + "learning_rate": 2.643684558811374e-06, + "loss": 0.0259, + "step": 28899 + }, + { + "epoch": 3.427012925412072, + "grad_norm": 0.5625710724607172, + "learning_rate": 2.64261035005865e-06, + "loss": 0.0265, + "step": 28900 + }, + { + "epoch": 3.4271315071741966, + "grad_norm": 0.687073786725866, + "learning_rate": 2.641536347413648e-06, + "loss": 0.0249, + "step": 28901 + }, + { + "epoch": 3.4272500889363218, + "grad_norm": 0.3898847547194416, + "learning_rate": 2.6404625508862606e-06, + "loss": 0.0185, + "step": 28902 + }, + { + "epoch": 3.4273686706984465, + "grad_norm": 0.6241801703404349, + "learning_rate": 2.6393889604863916e-06, + "loss": 0.0266, + "step": 28903 + }, + { + "epoch": 3.4274872524605717, + "grad_norm": 0.4009879580917514, + "learning_rate": 2.6383155762239292e-06, + "loss": 0.0152, + "step": 28904 + }, + { + "epoch": 3.4276058342226965, + "grad_norm": 0.7450831694646657, + "learning_rate": 2.637242398108783e-06, + "loss": 0.0351, + "step": 28905 + }, + { + "epoch": 3.4277244159848217, + "grad_norm": 0.49592318093198023, + "learning_rate": 2.6361694261508364e-06, + "loss": 0.026, + "step": 28906 + }, + { + "epoch": 3.4278429977469465, + "grad_norm": 0.3851189558440772, + "learning_rate": 2.6350966603599815e-06, + "loss": 0.0176, + "step": 28907 + }, + { + "epoch": 3.4279615795090717, + "grad_norm": 0.6327495597531456, + "learning_rate": 2.63402410074611e-06, + "loss": 0.0483, + "step": 28908 + }, + { + "epoch": 3.4280801612711964, + "grad_norm": 0.7880176039766688, + "learning_rate": 2.6329517473191095e-06, + "loss": 0.023, + "step": 28909 + }, + { + "epoch": 3.4281987430333216, + "grad_norm": 0.3340950192267071, + "learning_rate": 2.6318796000888714e-06, + "loss": 0.0157, + "step": 28910 + }, + { + "epoch": 3.4283173247954464, + "grad_norm": 0.5562835093461165, + "learning_rate": 2.630807659065268e-06, + "loss": 0.0265, + "step": 28911 + }, + { + "epoch": 3.4284359065575716, + "grad_norm": 0.5930773154972588, + "learning_rate": 2.6297359242581865e-06, + "loss": 0.0428, + "step": 28912 + }, + { + "epoch": 3.4285544883196963, + "grad_norm": 0.5845144673885732, + "learning_rate": 2.6286643956775074e-06, + "loss": 0.0286, + "step": 28913 + }, + { + "epoch": 3.4286730700818215, + "grad_norm": 0.5514470789576525, + "learning_rate": 2.627593073333115e-06, + "loss": 0.0242, + "step": 28914 + }, + { + "epoch": 3.4287916518439463, + "grad_norm": 0.7823458576941363, + "learning_rate": 2.6265219572348705e-06, + "loss": 0.036, + "step": 28915 + }, + { + "epoch": 3.4289102336060715, + "grad_norm": 0.5927304254934044, + "learning_rate": 2.625451047392666e-06, + "loss": 0.0334, + "step": 28916 + }, + { + "epoch": 3.4290288153681963, + "grad_norm": 0.39314086491178474, + "learning_rate": 2.62438034381636e-06, + "loss": 0.0185, + "step": 28917 + }, + { + "epoch": 3.4291473971303215, + "grad_norm": 0.6107250780293171, + "learning_rate": 2.6233098465158285e-06, + "loss": 0.0356, + "step": 28918 + }, + { + "epoch": 3.429265978892446, + "grad_norm": 0.4038961031416665, + "learning_rate": 2.622239555500941e-06, + "loss": 0.0271, + "step": 28919 + }, + { + "epoch": 3.4293845606545714, + "grad_norm": 0.7267137208854517, + "learning_rate": 2.621169470781565e-06, + "loss": 0.0355, + "step": 28920 + }, + { + "epoch": 3.429503142416696, + "grad_norm": 0.5155891985729034, + "learning_rate": 2.6200995923675644e-06, + "loss": 0.0281, + "step": 28921 + }, + { + "epoch": 3.4296217241788214, + "grad_norm": 0.6827069050821734, + "learning_rate": 2.6190299202688008e-06, + "loss": 0.025, + "step": 28922 + }, + { + "epoch": 3.429740305940946, + "grad_norm": 0.3883765910448688, + "learning_rate": 2.617960454495133e-06, + "loss": 0.0183, + "step": 28923 + }, + { + "epoch": 3.4298588877030713, + "grad_norm": 0.5266412958731268, + "learning_rate": 2.6168911950564258e-06, + "loss": 0.029, + "step": 28924 + }, + { + "epoch": 3.429977469465196, + "grad_norm": 0.3948659125168395, + "learning_rate": 2.6158221419625345e-06, + "loss": 0.0165, + "step": 28925 + }, + { + "epoch": 3.4300960512273213, + "grad_norm": 0.5607422498234144, + "learning_rate": 2.6147532952233126e-06, + "loss": 0.0225, + "step": 28926 + }, + { + "epoch": 3.430214632989446, + "grad_norm": 0.37558934232491353, + "learning_rate": 2.6136846548486136e-06, + "loss": 0.0147, + "step": 28927 + }, + { + "epoch": 3.4303332147515713, + "grad_norm": 0.7372984896008119, + "learning_rate": 2.6126162208482905e-06, + "loss": 0.0377, + "step": 28928 + }, + { + "epoch": 3.430451796513696, + "grad_norm": 0.5178211231700118, + "learning_rate": 2.611547993232191e-06, + "loss": 0.0198, + "step": 28929 + }, + { + "epoch": 3.4305703782758212, + "grad_norm": 0.7831609126111655, + "learning_rate": 2.6104799720101657e-06, + "loss": 0.0381, + "step": 28930 + }, + { + "epoch": 3.430688960037946, + "grad_norm": 0.6962743700884316, + "learning_rate": 2.6094121571920595e-06, + "loss": 0.0409, + "step": 28931 + }, + { + "epoch": 3.430807541800071, + "grad_norm": 0.3247761772565658, + "learning_rate": 2.6083445487877223e-06, + "loss": 0.0132, + "step": 28932 + }, + { + "epoch": 3.430926123562196, + "grad_norm": 0.46970845515191734, + "learning_rate": 2.607277146806983e-06, + "loss": 0.0242, + "step": 28933 + }, + { + "epoch": 3.431044705324321, + "grad_norm": 0.32169182314522277, + "learning_rate": 2.6062099512596888e-06, + "loss": 0.016, + "step": 28934 + }, + { + "epoch": 3.431163287086446, + "grad_norm": 0.9036392163153298, + "learning_rate": 2.6051429621556767e-06, + "loss": 0.0569, + "step": 28935 + }, + { + "epoch": 3.431281868848571, + "grad_norm": 0.4497263448425969, + "learning_rate": 2.6040761795047913e-06, + "loss": 0.0225, + "step": 28936 + }, + { + "epoch": 3.4314004506106963, + "grad_norm": 0.47822586462331157, + "learning_rate": 2.6030096033168557e-06, + "loss": 0.0205, + "step": 28937 + }, + { + "epoch": 3.431519032372821, + "grad_norm": 0.432022138190527, + "learning_rate": 2.601943233601703e-06, + "loss": 0.0174, + "step": 28938 + }, + { + "epoch": 3.431637614134946, + "grad_norm": 0.41307157745751927, + "learning_rate": 2.6008770703691704e-06, + "loss": 0.0184, + "step": 28939 + }, + { + "epoch": 3.431756195897071, + "grad_norm": 0.913226843079112, + "learning_rate": 2.5998111136290804e-06, + "loss": 0.0337, + "step": 28940 + }, + { + "epoch": 3.4318747776591962, + "grad_norm": 0.423660507800162, + "learning_rate": 2.598745363391272e-06, + "loss": 0.0276, + "step": 28941 + }, + { + "epoch": 3.431993359421321, + "grad_norm": 0.4220618749950033, + "learning_rate": 2.5976798196655492e-06, + "loss": 0.0205, + "step": 28942 + }, + { + "epoch": 3.4321119411834458, + "grad_norm": 0.4196669825802467, + "learning_rate": 2.5966144824617567e-06, + "loss": 0.0203, + "step": 28943 + }, + { + "epoch": 3.432230522945571, + "grad_norm": 0.6491129103646393, + "learning_rate": 2.5955493517897e-06, + "loss": 0.0266, + "step": 28944 + }, + { + "epoch": 3.432349104707696, + "grad_norm": 0.4014789294156019, + "learning_rate": 2.594484427659205e-06, + "loss": 0.0337, + "step": 28945 + }, + { + "epoch": 3.432467686469821, + "grad_norm": 0.9548158241423143, + "learning_rate": 2.593419710080089e-06, + "loss": 0.0485, + "step": 28946 + }, + { + "epoch": 3.4325862682319457, + "grad_norm": 0.5629155046198691, + "learning_rate": 2.592355199062166e-06, + "loss": 0.022, + "step": 28947 + }, + { + "epoch": 3.432704849994071, + "grad_norm": 0.6146649886225877, + "learning_rate": 2.5912908946152533e-06, + "loss": 0.0413, + "step": 28948 + }, + { + "epoch": 3.432823431756196, + "grad_norm": 0.3257017934828633, + "learning_rate": 2.590226796749151e-06, + "loss": 0.0159, + "step": 28949 + }, + { + "epoch": 3.432942013518321, + "grad_norm": 0.640979815565223, + "learning_rate": 2.5891629054736855e-06, + "loss": 0.0217, + "step": 28950 + }, + { + "epoch": 3.433060595280446, + "grad_norm": 0.2876084486498489, + "learning_rate": 2.5880992207986538e-06, + "loss": 0.0178, + "step": 28951 + }, + { + "epoch": 3.433179177042571, + "grad_norm": 0.5187809642305578, + "learning_rate": 2.5870357427338675e-06, + "loss": 0.0193, + "step": 28952 + }, + { + "epoch": 3.433297758804696, + "grad_norm": 0.3942401382648835, + "learning_rate": 2.5859724712891166e-06, + "loss": 0.0157, + "step": 28953 + }, + { + "epoch": 3.4334163405668208, + "grad_norm": 0.2896247288969914, + "learning_rate": 2.584909406474223e-06, + "loss": 0.0161, + "step": 28954 + }, + { + "epoch": 3.433534922328946, + "grad_norm": 0.8198249469216433, + "learning_rate": 2.583846548298974e-06, + "loss": 0.0378, + "step": 28955 + }, + { + "epoch": 3.4336535040910707, + "grad_norm": 0.8810984749453231, + "learning_rate": 2.5827838967731692e-06, + "loss": 0.0496, + "step": 28956 + }, + { + "epoch": 3.433772085853196, + "grad_norm": 0.47883524445638304, + "learning_rate": 2.58172145190661e-06, + "loss": 0.0324, + "step": 28957 + }, + { + "epoch": 3.4338906676153207, + "grad_norm": 0.4944371178585555, + "learning_rate": 2.580659213709086e-06, + "loss": 0.025, + "step": 28958 + }, + { + "epoch": 3.434009249377446, + "grad_norm": 0.5544512689634641, + "learning_rate": 2.5795971821903965e-06, + "loss": 0.0271, + "step": 28959 + }, + { + "epoch": 3.4341278311395707, + "grad_norm": 0.829947137001259, + "learning_rate": 2.578535357360318e-06, + "loss": 0.0397, + "step": 28960 + }, + { + "epoch": 3.434246412901696, + "grad_norm": 0.5632681184734567, + "learning_rate": 2.5774737392286587e-06, + "loss": 0.0189, + "step": 28961 + }, + { + "epoch": 3.4343649946638206, + "grad_norm": 0.3221177357320511, + "learning_rate": 2.5764123278051886e-06, + "loss": 0.0213, + "step": 28962 + }, + { + "epoch": 3.434483576425946, + "grad_norm": 0.4562813118958436, + "learning_rate": 2.575351123099706e-06, + "loss": 0.0274, + "step": 28963 + }, + { + "epoch": 3.4346021581880706, + "grad_norm": 0.6726240799280562, + "learning_rate": 2.5742901251219754e-06, + "loss": 0.0298, + "step": 28964 + }, + { + "epoch": 3.434720739950196, + "grad_norm": 0.49034198093782105, + "learning_rate": 2.573229333881802e-06, + "loss": 0.0214, + "step": 28965 + }, + { + "epoch": 3.4348393217123205, + "grad_norm": 0.5797672338570193, + "learning_rate": 2.572168749388945e-06, + "loss": 0.0331, + "step": 28966 + }, + { + "epoch": 3.4349579034744457, + "grad_norm": 0.622416674927675, + "learning_rate": 2.5711083716531914e-06, + "loss": 0.0262, + "step": 28967 + }, + { + "epoch": 3.4350764852365705, + "grad_norm": 0.4267494687830739, + "learning_rate": 2.5700482006843136e-06, + "loss": 0.0207, + "step": 28968 + }, + { + "epoch": 3.4351950669986957, + "grad_norm": 0.6534593215897241, + "learning_rate": 2.568988236492087e-06, + "loss": 0.0226, + "step": 28969 + }, + { + "epoch": 3.4353136487608205, + "grad_norm": 0.42079344753664144, + "learning_rate": 2.5679284790862844e-06, + "loss": 0.0223, + "step": 28970 + }, + { + "epoch": 3.4354322305229457, + "grad_norm": 0.5457963783426637, + "learning_rate": 2.5668689284766677e-06, + "loss": 0.025, + "step": 28971 + }, + { + "epoch": 3.4355508122850704, + "grad_norm": 0.6297075658859654, + "learning_rate": 2.5658095846730152e-06, + "loss": 0.0278, + "step": 28972 + }, + { + "epoch": 3.4356693940471956, + "grad_norm": 0.5717664802314275, + "learning_rate": 2.5647504476850852e-06, + "loss": 0.0318, + "step": 28973 + }, + { + "epoch": 3.4357879758093204, + "grad_norm": 0.21352136780975575, + "learning_rate": 2.5636915175226483e-06, + "loss": 0.0089, + "step": 28974 + }, + { + "epoch": 3.4359065575714456, + "grad_norm": 0.4441615052739708, + "learning_rate": 2.562632794195455e-06, + "loss": 0.0208, + "step": 28975 + }, + { + "epoch": 3.4360251393335703, + "grad_norm": 0.45180190868681785, + "learning_rate": 2.56157427771328e-06, + "loss": 0.0258, + "step": 28976 + }, + { + "epoch": 3.4361437210956955, + "grad_norm": 0.4566492504915087, + "learning_rate": 2.560515968085872e-06, + "loss": 0.0151, + "step": 28977 + }, + { + "epoch": 3.4362623028578203, + "grad_norm": 0.41155877860903833, + "learning_rate": 2.5594578653229865e-06, + "loss": 0.0235, + "step": 28978 + }, + { + "epoch": 3.4363808846199455, + "grad_norm": 0.36181227405187905, + "learning_rate": 2.5583999694343828e-06, + "loss": 0.0211, + "step": 28979 + }, + { + "epoch": 3.4364994663820703, + "grad_norm": 0.4241225499151032, + "learning_rate": 2.5573422804298113e-06, + "loss": 0.0245, + "step": 28980 + }, + { + "epoch": 3.4366180481441955, + "grad_norm": 0.6146244230330579, + "learning_rate": 2.556284798319028e-06, + "loss": 0.0198, + "step": 28981 + }, + { + "epoch": 3.4367366299063202, + "grad_norm": 0.8238420779172254, + "learning_rate": 2.5552275231117725e-06, + "loss": 0.0272, + "step": 28982 + }, + { + "epoch": 3.4368552116684454, + "grad_norm": 0.35260990085841615, + "learning_rate": 2.554170454817795e-06, + "loss": 0.0167, + "step": 28983 + }, + { + "epoch": 3.43697379343057, + "grad_norm": 0.9015266317470307, + "learning_rate": 2.55311359344684e-06, + "loss": 0.0345, + "step": 28984 + }, + { + "epoch": 3.4370923751926954, + "grad_norm": 0.5529090054201413, + "learning_rate": 2.552056939008657e-06, + "loss": 0.0259, + "step": 28985 + }, + { + "epoch": 3.4372109569548206, + "grad_norm": 0.3632721380145686, + "learning_rate": 2.55100049151297e-06, + "loss": 0.0159, + "step": 28986 + }, + { + "epoch": 3.4373295387169454, + "grad_norm": 0.722966431134457, + "learning_rate": 2.5499442509695437e-06, + "loss": 0.0398, + "step": 28987 + }, + { + "epoch": 3.43744812047907, + "grad_norm": 0.34384596789340444, + "learning_rate": 2.548888217388093e-06, + "loss": 0.0172, + "step": 28988 + }, + { + "epoch": 3.4375667022411953, + "grad_norm": 0.7496288126630648, + "learning_rate": 2.5478323907783622e-06, + "loss": 0.0338, + "step": 28989 + }, + { + "epoch": 3.4376852840033205, + "grad_norm": 0.5028999285107342, + "learning_rate": 2.5467767711500833e-06, + "loss": 0.0223, + "step": 28990 + }, + { + "epoch": 3.4378038657654453, + "grad_norm": 0.584883026883105, + "learning_rate": 2.5457213585129926e-06, + "loss": 0.0331, + "step": 28991 + }, + { + "epoch": 3.43792244752757, + "grad_norm": 0.5787965236611579, + "learning_rate": 2.544666152876815e-06, + "loss": 0.0284, + "step": 28992 + }, + { + "epoch": 3.4380410292896952, + "grad_norm": 0.47388354370957825, + "learning_rate": 2.543611154251277e-06, + "loss": 0.0253, + "step": 28993 + }, + { + "epoch": 3.4381596110518204, + "grad_norm": 0.27379800309881847, + "learning_rate": 2.5425563626461065e-06, + "loss": 0.0144, + "step": 28994 + }, + { + "epoch": 3.438278192813945, + "grad_norm": 0.3173338032092776, + "learning_rate": 2.541501778071029e-06, + "loss": 0.0118, + "step": 28995 + }, + { + "epoch": 3.43839677457607, + "grad_norm": 0.6322932760022529, + "learning_rate": 2.5404474005357613e-06, + "loss": 0.0238, + "step": 28996 + }, + { + "epoch": 3.438515356338195, + "grad_norm": 0.622440449361367, + "learning_rate": 2.539393230050033e-06, + "loss": 0.0341, + "step": 28997 + }, + { + "epoch": 3.4386339381003204, + "grad_norm": 0.4264394755013822, + "learning_rate": 2.538339266623546e-06, + "loss": 0.0259, + "step": 28998 + }, + { + "epoch": 3.438752519862445, + "grad_norm": 0.4594086091076395, + "learning_rate": 2.5372855102660382e-06, + "loss": 0.0159, + "step": 28999 + }, + { + "epoch": 3.4388711016245703, + "grad_norm": 0.6194532428533074, + "learning_rate": 2.5362319609872036e-06, + "loss": 0.0367, + "step": 29000 + }, + { + "epoch": 3.438989683386695, + "grad_norm": 0.8420450803968799, + "learning_rate": 2.5351786187967684e-06, + "loss": 0.0529, + "step": 29001 + }, + { + "epoch": 3.4391082651488203, + "grad_norm": 0.703784025104894, + "learning_rate": 2.534125483704433e-06, + "loss": 0.0316, + "step": 29002 + }, + { + "epoch": 3.439226846910945, + "grad_norm": 0.6601279429254863, + "learning_rate": 2.5330725557199204e-06, + "loss": 0.0398, + "step": 29003 + }, + { + "epoch": 3.4393454286730702, + "grad_norm": 0.5726357490352061, + "learning_rate": 2.532019834852922e-06, + "loss": 0.0349, + "step": 29004 + }, + { + "epoch": 3.439464010435195, + "grad_norm": 0.5406508742294013, + "learning_rate": 2.530967321113148e-06, + "loss": 0.0191, + "step": 29005 + }, + { + "epoch": 3.43958259219732, + "grad_norm": 0.44384692485672, + "learning_rate": 2.5299150145103007e-06, + "loss": 0.0199, + "step": 29006 + }, + { + "epoch": 3.439701173959445, + "grad_norm": 0.41679732616769966, + "learning_rate": 2.5288629150540838e-06, + "loss": 0.0236, + "step": 29007 + }, + { + "epoch": 3.43981975572157, + "grad_norm": 0.36680708235551435, + "learning_rate": 2.527811022754201e-06, + "loss": 0.0197, + "step": 29008 + }, + { + "epoch": 3.439938337483695, + "grad_norm": 0.4367817728170921, + "learning_rate": 2.5267593376203325e-06, + "loss": 0.0172, + "step": 29009 + }, + { + "epoch": 3.44005691924582, + "grad_norm": 0.3592701413331721, + "learning_rate": 2.5257078596621935e-06, + "loss": 0.0167, + "step": 29010 + }, + { + "epoch": 3.440175501007945, + "grad_norm": 0.7296561004193876, + "learning_rate": 2.5246565888894623e-06, + "loss": 0.0265, + "step": 29011 + }, + { + "epoch": 3.44029408277007, + "grad_norm": 0.7082550740449945, + "learning_rate": 2.5236055253118423e-06, + "loss": 0.0232, + "step": 29012 + }, + { + "epoch": 3.440412664532195, + "grad_norm": 0.6829663828660302, + "learning_rate": 2.522554668939009e-06, + "loss": 0.0435, + "step": 29013 + }, + { + "epoch": 3.44053124629432, + "grad_norm": 0.47394647630635695, + "learning_rate": 2.5215040197806656e-06, + "loss": 0.0172, + "step": 29014 + }, + { + "epoch": 3.440649828056445, + "grad_norm": 0.8515580442447738, + "learning_rate": 2.5204535778464878e-06, + "loss": 0.0498, + "step": 29015 + }, + { + "epoch": 3.44076840981857, + "grad_norm": 0.7281952740154966, + "learning_rate": 2.5194033431461593e-06, + "loss": 0.0395, + "step": 29016 + }, + { + "epoch": 3.4408869915806948, + "grad_norm": 0.6926765658760234, + "learning_rate": 2.5183533156893646e-06, + "loss": 0.0415, + "step": 29017 + }, + { + "epoch": 3.44100557334282, + "grad_norm": 0.4468805631146565, + "learning_rate": 2.5173034954857845e-06, + "loss": 0.0153, + "step": 29018 + }, + { + "epoch": 3.4411241551049447, + "grad_norm": 0.45258893165217723, + "learning_rate": 2.5162538825450997e-06, + "loss": 0.0275, + "step": 29019 + }, + { + "epoch": 3.44124273686707, + "grad_norm": 0.6154791963587078, + "learning_rate": 2.515204476876973e-06, + "loss": 0.0321, + "step": 29020 + }, + { + "epoch": 3.4413613186291947, + "grad_norm": 0.4091725277584971, + "learning_rate": 2.514155278491101e-06, + "loss": 0.0204, + "step": 29021 + }, + { + "epoch": 3.44147990039132, + "grad_norm": 0.7095330928889038, + "learning_rate": 2.513106287397135e-06, + "loss": 0.0345, + "step": 29022 + }, + { + "epoch": 3.4415984821534447, + "grad_norm": 0.5899791236014705, + "learning_rate": 2.512057503604759e-06, + "loss": 0.0358, + "step": 29023 + }, + { + "epoch": 3.44171706391557, + "grad_norm": 0.4027118647360011, + "learning_rate": 2.5110089271236288e-06, + "loss": 0.013, + "step": 29024 + }, + { + "epoch": 3.4418356456776946, + "grad_norm": 0.38418871693055007, + "learning_rate": 2.5099605579634255e-06, + "loss": 0.0159, + "step": 29025 + }, + { + "epoch": 3.44195422743982, + "grad_norm": 0.43132057375401417, + "learning_rate": 2.5089123961338057e-06, + "loss": 0.0193, + "step": 29026 + }, + { + "epoch": 3.4420728092019446, + "grad_norm": 0.4279997978923762, + "learning_rate": 2.507864441644431e-06, + "loss": 0.0221, + "step": 29027 + }, + { + "epoch": 3.44219139096407, + "grad_norm": 0.5180648788311326, + "learning_rate": 2.5068166945049626e-06, + "loss": 0.0279, + "step": 29028 + }, + { + "epoch": 3.4423099727261945, + "grad_norm": 0.48809903774103747, + "learning_rate": 2.5057691547250627e-06, + "loss": 0.023, + "step": 29029 + }, + { + "epoch": 3.4424285544883197, + "grad_norm": 0.7137282454900876, + "learning_rate": 2.5047218223143933e-06, + "loss": 0.0301, + "step": 29030 + }, + { + "epoch": 3.4425471362504445, + "grad_norm": 0.3282517566488394, + "learning_rate": 2.5036746972825904e-06, + "loss": 0.0215, + "step": 29031 + }, + { + "epoch": 3.4426657180125697, + "grad_norm": 0.6311090967392684, + "learning_rate": 2.502627779639333e-06, + "loss": 0.0375, + "step": 29032 + }, + { + "epoch": 3.4427842997746945, + "grad_norm": 0.37801694706280176, + "learning_rate": 2.5015810693942516e-06, + "loss": 0.0221, + "step": 29033 + }, + { + "epoch": 3.4429028815368197, + "grad_norm": 0.42340830564294646, + "learning_rate": 2.5005345665570113e-06, + "loss": 0.0176, + "step": 29034 + }, + { + "epoch": 3.4430214632989444, + "grad_norm": 0.5780364963379726, + "learning_rate": 2.4994882711372404e-06, + "loss": 0.0287, + "step": 29035 + }, + { + "epoch": 3.4431400450610696, + "grad_norm": 0.60811192212662, + "learning_rate": 2.4984421831446058e-06, + "loss": 0.0428, + "step": 29036 + }, + { + "epoch": 3.4432586268231944, + "grad_norm": 0.8429902216223468, + "learning_rate": 2.4973963025887336e-06, + "loss": 0.0304, + "step": 29037 + }, + { + "epoch": 3.4433772085853196, + "grad_norm": 0.4149262359324661, + "learning_rate": 2.496350629479277e-06, + "loss": 0.0167, + "step": 29038 + }, + { + "epoch": 3.443495790347445, + "grad_norm": 0.6868572927466966, + "learning_rate": 2.49530516382587e-06, + "loss": 0.0335, + "step": 29039 + }, + { + "epoch": 3.4436143721095696, + "grad_norm": 0.4949522328778541, + "learning_rate": 2.494259905638152e-06, + "loss": 0.0278, + "step": 29040 + }, + { + "epoch": 3.4437329538716943, + "grad_norm": 0.43521659488519765, + "learning_rate": 2.493214854925763e-06, + "loss": 0.0229, + "step": 29041 + }, + { + "epoch": 3.4438515356338195, + "grad_norm": 0.493761703408868, + "learning_rate": 2.4921700116983247e-06, + "loss": 0.0242, + "step": 29042 + }, + { + "epoch": 3.4439701173959447, + "grad_norm": 0.7906687535597051, + "learning_rate": 2.491125375965489e-06, + "loss": 0.0313, + "step": 29043 + }, + { + "epoch": 3.4440886991580695, + "grad_norm": 0.37421671592941297, + "learning_rate": 2.4900809477368697e-06, + "loss": 0.0154, + "step": 29044 + }, + { + "epoch": 3.4442072809201942, + "grad_norm": 0.4604902073836254, + "learning_rate": 2.489036727022104e-06, + "loss": 0.0274, + "step": 29045 + }, + { + "epoch": 3.4443258626823194, + "grad_norm": 0.7128855087073487, + "learning_rate": 2.4879927138308058e-06, + "loss": 0.0349, + "step": 29046 + }, + { + "epoch": 3.4444444444444446, + "grad_norm": 0.44574328112838224, + "learning_rate": 2.4869489081726205e-06, + "loss": 0.0208, + "step": 29047 + }, + { + "epoch": 3.4445630262065694, + "grad_norm": 0.6144187080687736, + "learning_rate": 2.4859053100571516e-06, + "loss": 0.0309, + "step": 29048 + }, + { + "epoch": 3.4446816079686946, + "grad_norm": 0.7717896735378873, + "learning_rate": 2.4848619194940275e-06, + "loss": 0.0234, + "step": 29049 + }, + { + "epoch": 3.4448001897308194, + "grad_norm": 0.6885027311728122, + "learning_rate": 2.483818736492868e-06, + "loss": 0.0368, + "step": 29050 + }, + { + "epoch": 3.4449187714929446, + "grad_norm": 0.46539491262313476, + "learning_rate": 2.482775761063286e-06, + "loss": 0.0212, + "step": 29051 + }, + { + "epoch": 3.4450373532550693, + "grad_norm": 0.6612896062642103, + "learning_rate": 2.4817329932149054e-06, + "loss": 0.0355, + "step": 29052 + }, + { + "epoch": 3.4451559350171945, + "grad_norm": 0.44109996233261195, + "learning_rate": 2.480690432957328e-06, + "loss": 0.0158, + "step": 29053 + }, + { + "epoch": 3.4452745167793193, + "grad_norm": 0.4968721388551394, + "learning_rate": 2.4796480803001706e-06, + "loss": 0.025, + "step": 29054 + }, + { + "epoch": 3.4453930985414445, + "grad_norm": 1.147394507233972, + "learning_rate": 2.478605935253042e-06, + "loss": 0.0526, + "step": 29055 + }, + { + "epoch": 3.4455116803035692, + "grad_norm": 0.5406626219299528, + "learning_rate": 2.4775639978255466e-06, + "loss": 0.023, + "step": 29056 + }, + { + "epoch": 3.4456302620656944, + "grad_norm": 0.6962725141516158, + "learning_rate": 2.4765222680272954e-06, + "loss": 0.0394, + "step": 29057 + }, + { + "epoch": 3.445748843827819, + "grad_norm": 0.36824669094538337, + "learning_rate": 2.4754807458678862e-06, + "loss": 0.021, + "step": 29058 + }, + { + "epoch": 3.4458674255899444, + "grad_norm": 0.4127295661526935, + "learning_rate": 2.4744394313569286e-06, + "loss": 0.0217, + "step": 29059 + }, + { + "epoch": 3.445986007352069, + "grad_norm": 0.6835067752735285, + "learning_rate": 2.4733983245040144e-06, + "loss": 0.0328, + "step": 29060 + }, + { + "epoch": 3.4461045891141944, + "grad_norm": 0.3362011922061746, + "learning_rate": 2.4723574253187413e-06, + "loss": 0.0159, + "step": 29061 + }, + { + "epoch": 3.446223170876319, + "grad_norm": 0.4708177767983754, + "learning_rate": 2.471316733810708e-06, + "loss": 0.019, + "step": 29062 + }, + { + "epoch": 3.4463417526384443, + "grad_norm": 0.6881375737590889, + "learning_rate": 2.4702762499895115e-06, + "loss": 0.0276, + "step": 29063 + }, + { + "epoch": 3.446460334400569, + "grad_norm": 0.3133189107218558, + "learning_rate": 2.4692359738647363e-06, + "loss": 0.0145, + "step": 29064 + }, + { + "epoch": 3.4465789161626943, + "grad_norm": 0.5284518849314871, + "learning_rate": 2.4681959054459775e-06, + "loss": 0.017, + "step": 29065 + }, + { + "epoch": 3.446697497924819, + "grad_norm": 0.44084197578852424, + "learning_rate": 2.4671560447428187e-06, + "loss": 0.0232, + "step": 29066 + }, + { + "epoch": 3.4468160796869443, + "grad_norm": 0.6930571780853112, + "learning_rate": 2.466116391764853e-06, + "loss": 0.0461, + "step": 29067 + }, + { + "epoch": 3.446934661449069, + "grad_norm": 0.7381030154483369, + "learning_rate": 2.465076946521666e-06, + "loss": 0.049, + "step": 29068 + }, + { + "epoch": 3.447053243211194, + "grad_norm": 0.5222286658119337, + "learning_rate": 2.4640377090228232e-06, + "loss": 0.0211, + "step": 29069 + }, + { + "epoch": 3.447171824973319, + "grad_norm": 0.3848981858597521, + "learning_rate": 2.462998679277931e-06, + "loss": 0.0195, + "step": 29070 + }, + { + "epoch": 3.447290406735444, + "grad_norm": 0.6321971016957006, + "learning_rate": 2.461959857296545e-06, + "loss": 0.0315, + "step": 29071 + }, + { + "epoch": 3.447408988497569, + "grad_norm": 0.3801539252808336, + "learning_rate": 2.4609212430882546e-06, + "loss": 0.0139, + "step": 29072 + }, + { + "epoch": 3.447527570259694, + "grad_norm": 0.3264625540319487, + "learning_rate": 2.4598828366626304e-06, + "loss": 0.0165, + "step": 29073 + }, + { + "epoch": 3.447646152021819, + "grad_norm": 0.48794416207080965, + "learning_rate": 2.458844638029248e-06, + "loss": 0.0235, + "step": 29074 + }, + { + "epoch": 3.447764733783944, + "grad_norm": 0.9659093185556107, + "learning_rate": 2.4578066471976747e-06, + "loss": 0.0616, + "step": 29075 + }, + { + "epoch": 3.447883315546069, + "grad_norm": 0.6131472729446279, + "learning_rate": 2.4567688641774806e-06, + "loss": 0.0345, + "step": 29076 + }, + { + "epoch": 3.448001897308194, + "grad_norm": 0.4245434145883034, + "learning_rate": 2.4557312889782304e-06, + "loss": 0.0251, + "step": 29077 + }, + { + "epoch": 3.448120479070319, + "grad_norm": 0.5652360203241789, + "learning_rate": 2.454693921609494e-06, + "loss": 0.0361, + "step": 29078 + }, + { + "epoch": 3.448239060832444, + "grad_norm": 0.6171989344295306, + "learning_rate": 2.4536567620808387e-06, + "loss": 0.0322, + "step": 29079 + }, + { + "epoch": 3.448357642594569, + "grad_norm": 0.6236299718862258, + "learning_rate": 2.45261981040181e-06, + "loss": 0.0313, + "step": 29080 + }, + { + "epoch": 3.448476224356694, + "grad_norm": 1.0501241705669155, + "learning_rate": 2.451583066581986e-06, + "loss": 0.0417, + "step": 29081 + }, + { + "epoch": 3.4485948061188187, + "grad_norm": 0.5401271169231141, + "learning_rate": 2.4505465306309096e-06, + "loss": 0.0313, + "step": 29082 + }, + { + "epoch": 3.448713387880944, + "grad_norm": 0.5457164411682847, + "learning_rate": 2.44951020255815e-06, + "loss": 0.0194, + "step": 29083 + }, + { + "epoch": 3.4488319696430687, + "grad_norm": 0.48322794242525596, + "learning_rate": 2.4484740823732423e-06, + "loss": 0.0146, + "step": 29084 + }, + { + "epoch": 3.448950551405194, + "grad_norm": 0.47927678487680353, + "learning_rate": 2.4474381700857556e-06, + "loss": 0.0205, + "step": 29085 + }, + { + "epoch": 3.4490691331673187, + "grad_norm": 0.5112067035784186, + "learning_rate": 2.446402465705233e-06, + "loss": 0.0269, + "step": 29086 + }, + { + "epoch": 3.449187714929444, + "grad_norm": 0.3549496276497199, + "learning_rate": 2.4453669692412196e-06, + "loss": 0.0149, + "step": 29087 + }, + { + "epoch": 3.449306296691569, + "grad_norm": 0.32026219841249004, + "learning_rate": 2.444331680703266e-06, + "loss": 0.0115, + "step": 29088 + }, + { + "epoch": 3.449424878453694, + "grad_norm": 0.4906294223303354, + "learning_rate": 2.4432966001009137e-06, + "loss": 0.0171, + "step": 29089 + }, + { + "epoch": 3.4495434602158186, + "grad_norm": 0.5084425183433193, + "learning_rate": 2.4422617274437094e-06, + "loss": 0.0253, + "step": 29090 + }, + { + "epoch": 3.449662041977944, + "grad_norm": 0.4702801357258427, + "learning_rate": 2.441227062741183e-06, + "loss": 0.0244, + "step": 29091 + }, + { + "epoch": 3.449780623740069, + "grad_norm": 0.6261992569868765, + "learning_rate": 2.440192606002889e-06, + "loss": 0.0297, + "step": 29092 + }, + { + "epoch": 3.4498992055021938, + "grad_norm": 0.6001678109123858, + "learning_rate": 2.439158357238347e-06, + "loss": 0.0279, + "step": 29093 + }, + { + "epoch": 3.4500177872643185, + "grad_norm": 0.6577475440541127, + "learning_rate": 2.438124316457108e-06, + "loss": 0.0429, + "step": 29094 + }, + { + "epoch": 3.4501363690264437, + "grad_norm": 0.6099428838597033, + "learning_rate": 2.437090483668683e-06, + "loss": 0.0327, + "step": 29095 + }, + { + "epoch": 3.450254950788569, + "grad_norm": 0.7798756714031444, + "learning_rate": 2.436056858882624e-06, + "loss": 0.0225, + "step": 29096 + }, + { + "epoch": 3.4503735325506937, + "grad_norm": 0.25367868906629515, + "learning_rate": 2.43502344210845e-06, + "loss": 0.0148, + "step": 29097 + }, + { + "epoch": 3.450492114312819, + "grad_norm": 0.7278339321405408, + "learning_rate": 2.4339902333556847e-06, + "loss": 0.0424, + "step": 29098 + }, + { + "epoch": 3.4506106960749436, + "grad_norm": 0.5388410171293393, + "learning_rate": 2.432957232633859e-06, + "loss": 0.0196, + "step": 29099 + }, + { + "epoch": 3.450729277837069, + "grad_norm": 0.41524535508994814, + "learning_rate": 2.431924439952496e-06, + "loss": 0.0228, + "step": 29100 + }, + { + "epoch": 3.4508478595991936, + "grad_norm": 0.448102558638431, + "learning_rate": 2.4308918553211158e-06, + "loss": 0.0241, + "step": 29101 + }, + { + "epoch": 3.450966441361319, + "grad_norm": 0.728779967832756, + "learning_rate": 2.4298594787492303e-06, + "loss": 0.0428, + "step": 29102 + }, + { + "epoch": 3.4510850231234436, + "grad_norm": 0.40671009155851484, + "learning_rate": 2.4288273102463707e-06, + "loss": 0.0101, + "step": 29103 + }, + { + "epoch": 3.4512036048855688, + "grad_norm": 0.5054074567225302, + "learning_rate": 2.4277953498220402e-06, + "loss": 0.0245, + "step": 29104 + }, + { + "epoch": 3.4513221866476935, + "grad_norm": 0.5195237817801355, + "learning_rate": 2.4267635974857596e-06, + "loss": 0.0209, + "step": 29105 + }, + { + "epoch": 3.4514407684098187, + "grad_norm": 0.5329880639000194, + "learning_rate": 2.4257320532470347e-06, + "loss": 0.0294, + "step": 29106 + }, + { + "epoch": 3.4515593501719435, + "grad_norm": 0.5435215251312642, + "learning_rate": 2.4247007171153775e-06, + "loss": 0.0204, + "step": 29107 + }, + { + "epoch": 3.4516779319340687, + "grad_norm": 0.4983534860961178, + "learning_rate": 2.4236695891003026e-06, + "loss": 0.027, + "step": 29108 + }, + { + "epoch": 3.4517965136961934, + "grad_norm": 0.42680178054981305, + "learning_rate": 2.4226386692113024e-06, + "loss": 0.015, + "step": 29109 + }, + { + "epoch": 3.4519150954583186, + "grad_norm": 0.4364123449344783, + "learning_rate": 2.421607957457889e-06, + "loss": 0.0179, + "step": 29110 + }, + { + "epoch": 3.4520336772204434, + "grad_norm": 0.535360851232987, + "learning_rate": 2.420577453849562e-06, + "loss": 0.0215, + "step": 29111 + }, + { + "epoch": 3.4521522589825686, + "grad_norm": 0.7699222645836294, + "learning_rate": 2.4195471583958266e-06, + "loss": 0.0296, + "step": 29112 + }, + { + "epoch": 3.4522708407446934, + "grad_norm": 0.39377041856280265, + "learning_rate": 2.4185170711061663e-06, + "loss": 0.0179, + "step": 29113 + }, + { + "epoch": 3.4523894225068186, + "grad_norm": 0.48562298414123767, + "learning_rate": 2.4174871919900976e-06, + "loss": 0.0256, + "step": 29114 + }, + { + "epoch": 3.4525080042689433, + "grad_norm": 0.9009202034298417, + "learning_rate": 2.4164575210571028e-06, + "loss": 0.0514, + "step": 29115 + }, + { + "epoch": 3.4526265860310685, + "grad_norm": 0.44423996392698284, + "learning_rate": 2.415428058316671e-06, + "loss": 0.0263, + "step": 29116 + }, + { + "epoch": 3.4527451677931933, + "grad_norm": 0.41158259356190857, + "learning_rate": 2.4143988037783006e-06, + "loss": 0.0172, + "step": 29117 + }, + { + "epoch": 3.4528637495553185, + "grad_norm": 0.5035397554871353, + "learning_rate": 2.4133697574514753e-06, + "loss": 0.0335, + "step": 29118 + }, + { + "epoch": 3.4529823313174433, + "grad_norm": 0.4993300982736495, + "learning_rate": 2.4123409193456903e-06, + "loss": 0.0204, + "step": 29119 + }, + { + "epoch": 3.4531009130795685, + "grad_norm": 0.6121959058679166, + "learning_rate": 2.411312289470416e-06, + "loss": 0.0292, + "step": 29120 + }, + { + "epoch": 3.453219494841693, + "grad_norm": 0.5103585864992013, + "learning_rate": 2.410283867835142e-06, + "loss": 0.024, + "step": 29121 + }, + { + "epoch": 3.4533380766038184, + "grad_norm": 0.31194641684442226, + "learning_rate": 2.409255654449352e-06, + "loss": 0.0101, + "step": 29122 + }, + { + "epoch": 3.453456658365943, + "grad_norm": 0.5296828074421802, + "learning_rate": 2.4082276493225255e-06, + "loss": 0.02, + "step": 29123 + }, + { + "epoch": 3.4535752401280684, + "grad_norm": 0.47057795131740915, + "learning_rate": 2.4071998524641316e-06, + "loss": 0.0286, + "step": 29124 + }, + { + "epoch": 3.453693821890193, + "grad_norm": 0.653847418806593, + "learning_rate": 2.406172263883649e-06, + "loss": 0.0353, + "step": 29125 + }, + { + "epoch": 3.4538124036523183, + "grad_norm": 0.3908199735091344, + "learning_rate": 2.4051448835905543e-06, + "loss": 0.0215, + "step": 29126 + }, + { + "epoch": 3.453930985414443, + "grad_norm": 0.2446434433545278, + "learning_rate": 2.4041177115943143e-06, + "loss": 0.0116, + "step": 29127 + }, + { + "epoch": 3.4540495671765683, + "grad_norm": 0.6051101993628845, + "learning_rate": 2.403090747904399e-06, + "loss": 0.0348, + "step": 29128 + }, + { + "epoch": 3.454168148938693, + "grad_norm": 1.1011181467229332, + "learning_rate": 2.402063992530279e-06, + "loss": 0.0365, + "step": 29129 + }, + { + "epoch": 3.4542867307008183, + "grad_norm": 0.49919465078662584, + "learning_rate": 2.4010374454814215e-06, + "loss": 0.0283, + "step": 29130 + }, + { + "epoch": 3.454405312462943, + "grad_norm": 0.9398934601483375, + "learning_rate": 2.4000111067672803e-06, + "loss": 0.0396, + "step": 29131 + }, + { + "epoch": 3.4545238942250682, + "grad_norm": 0.502926861637996, + "learning_rate": 2.3989849763973253e-06, + "loss": 0.0201, + "step": 29132 + }, + { + "epoch": 3.454642475987193, + "grad_norm": 0.4352577537427602, + "learning_rate": 2.3979590543810103e-06, + "loss": 0.017, + "step": 29133 + }, + { + "epoch": 3.454761057749318, + "grad_norm": 0.6917650226454554, + "learning_rate": 2.3969333407278026e-06, + "loss": 0.0334, + "step": 29134 + }, + { + "epoch": 3.454879639511443, + "grad_norm": 0.443412991482155, + "learning_rate": 2.395907835447148e-06, + "loss": 0.022, + "step": 29135 + }, + { + "epoch": 3.454998221273568, + "grad_norm": 0.5807171552882378, + "learning_rate": 2.394882538548504e-06, + "loss": 0.0272, + "step": 29136 + }, + { + "epoch": 3.4551168030356934, + "grad_norm": 0.524486607627375, + "learning_rate": 2.393857450041323e-06, + "loss": 0.0298, + "step": 29137 + }, + { + "epoch": 3.455235384797818, + "grad_norm": 0.7361390524249458, + "learning_rate": 2.3928325699350525e-06, + "loss": 0.042, + "step": 29138 + }, + { + "epoch": 3.455353966559943, + "grad_norm": 0.3936206145546536, + "learning_rate": 2.391807898239151e-06, + "loss": 0.0184, + "step": 29139 + }, + { + "epoch": 3.455472548322068, + "grad_norm": 0.41097104133096174, + "learning_rate": 2.3907834349630454e-06, + "loss": 0.0196, + "step": 29140 + }, + { + "epoch": 3.4555911300841933, + "grad_norm": 0.4702539497503319, + "learning_rate": 2.3897591801161995e-06, + "loss": 0.0212, + "step": 29141 + }, + { + "epoch": 3.455709711846318, + "grad_norm": 0.5432561748206506, + "learning_rate": 2.388735133708045e-06, + "loss": 0.0258, + "step": 29142 + }, + { + "epoch": 3.455828293608443, + "grad_norm": 0.5220575364757349, + "learning_rate": 2.387711295748024e-06, + "loss": 0.0361, + "step": 29143 + }, + { + "epoch": 3.455946875370568, + "grad_norm": 0.6786209163520089, + "learning_rate": 2.3866876662455768e-06, + "loss": 0.037, + "step": 29144 + }, + { + "epoch": 3.456065457132693, + "grad_norm": 0.3989756706300206, + "learning_rate": 2.3856642452101426e-06, + "loss": 0.0142, + "step": 29145 + }, + { + "epoch": 3.456184038894818, + "grad_norm": 0.941302209669808, + "learning_rate": 2.384641032651147e-06, + "loss": 0.0451, + "step": 29146 + }, + { + "epoch": 3.4563026206569427, + "grad_norm": 0.4718219337395561, + "learning_rate": 2.38361802857803e-06, + "loss": 0.0231, + "step": 29147 + }, + { + "epoch": 3.456421202419068, + "grad_norm": 0.5575502589083844, + "learning_rate": 2.3825952330002175e-06, + "loss": 0.0268, + "step": 29148 + }, + { + "epoch": 3.456539784181193, + "grad_norm": 0.6456794590073137, + "learning_rate": 2.3815726459271466e-06, + "loss": 0.0206, + "step": 29149 + }, + { + "epoch": 3.456658365943318, + "grad_norm": 0.4729364059552349, + "learning_rate": 2.3805502673682393e-06, + "loss": 0.0149, + "step": 29150 + }, + { + "epoch": 3.456776947705443, + "grad_norm": 0.3512795100912271, + "learning_rate": 2.3795280973329144e-06, + "loss": 0.0176, + "step": 29151 + }, + { + "epoch": 3.456895529467568, + "grad_norm": 0.3194289502246709, + "learning_rate": 2.3785061358306103e-06, + "loss": 0.0135, + "step": 29152 + }, + { + "epoch": 3.457014111229693, + "grad_norm": 0.4841076055402178, + "learning_rate": 2.377484382870734e-06, + "loss": 0.0269, + "step": 29153 + }, + { + "epoch": 3.457132692991818, + "grad_norm": 0.744501861447363, + "learning_rate": 2.3764628384627092e-06, + "loss": 0.0401, + "step": 29154 + }, + { + "epoch": 3.457251274753943, + "grad_norm": 0.5965989632184261, + "learning_rate": 2.375441502615958e-06, + "loss": 0.0384, + "step": 29155 + }, + { + "epoch": 3.4573698565160678, + "grad_norm": 0.5019516278429694, + "learning_rate": 2.3744203753398893e-06, + "loss": 0.0313, + "step": 29156 + }, + { + "epoch": 3.457488438278193, + "grad_norm": 0.7482487622862202, + "learning_rate": 2.373399456643924e-06, + "loss": 0.0261, + "step": 29157 + }, + { + "epoch": 3.4576070200403177, + "grad_norm": 0.4037401990261856, + "learning_rate": 2.37237874653746e-06, + "loss": 0.0215, + "step": 29158 + }, + { + "epoch": 3.457725601802443, + "grad_norm": 0.4071867916431023, + "learning_rate": 2.3713582450299256e-06, + "loss": 0.0192, + "step": 29159 + }, + { + "epoch": 3.4578441835645677, + "grad_norm": 0.431317747015767, + "learning_rate": 2.3703379521307133e-06, + "loss": 0.0246, + "step": 29160 + }, + { + "epoch": 3.457962765326693, + "grad_norm": 0.4498646230470677, + "learning_rate": 2.369317867849241e-06, + "loss": 0.0186, + "step": 29161 + }, + { + "epoch": 3.4580813470888176, + "grad_norm": 0.6381561397648977, + "learning_rate": 2.3682979921948957e-06, + "loss": 0.0309, + "step": 29162 + }, + { + "epoch": 3.458199928850943, + "grad_norm": 0.6913315143822379, + "learning_rate": 2.3672783251770996e-06, + "loss": 0.0376, + "step": 29163 + }, + { + "epoch": 3.4583185106130676, + "grad_norm": 0.5026546854088444, + "learning_rate": 2.3662588668052405e-06, + "loss": 0.0219, + "step": 29164 + }, + { + "epoch": 3.458437092375193, + "grad_norm": 0.5068024540847944, + "learning_rate": 2.3652396170887185e-06, + "loss": 0.0201, + "step": 29165 + }, + { + "epoch": 3.4585556741373176, + "grad_norm": 0.6150152004638736, + "learning_rate": 2.3642205760369296e-06, + "loss": 0.025, + "step": 29166 + }, + { + "epoch": 3.4586742558994428, + "grad_norm": 0.39428498436565934, + "learning_rate": 2.3632017436592713e-06, + "loss": 0.0148, + "step": 29167 + }, + { + "epoch": 3.4587928376615675, + "grad_norm": 0.6371613394360435, + "learning_rate": 2.3621831199651363e-06, + "loss": 0.0253, + "step": 29168 + }, + { + "epoch": 3.4589114194236927, + "grad_norm": 0.35446485321873566, + "learning_rate": 2.3611647049639085e-06, + "loss": 0.0188, + "step": 29169 + }, + { + "epoch": 3.4590300011858175, + "grad_norm": 0.42105730205117387, + "learning_rate": 2.3601464986649805e-06, + "loss": 0.0198, + "step": 29170 + }, + { + "epoch": 3.4591485829479427, + "grad_norm": 0.44397925153500356, + "learning_rate": 2.3591285010777398e-06, + "loss": 0.0172, + "step": 29171 + }, + { + "epoch": 3.4592671647100675, + "grad_norm": 0.4601114513597288, + "learning_rate": 2.3581107122115726e-06, + "loss": 0.0176, + "step": 29172 + }, + { + "epoch": 3.4593857464721927, + "grad_norm": 0.5760908276558131, + "learning_rate": 2.357093132075852e-06, + "loss": 0.0236, + "step": 29173 + }, + { + "epoch": 3.4595043282343174, + "grad_norm": 0.5707483367214836, + "learning_rate": 2.356075760679977e-06, + "loss": 0.0246, + "step": 29174 + }, + { + "epoch": 3.4596229099964426, + "grad_norm": 0.7650300239006366, + "learning_rate": 2.355058598033308e-06, + "loss": 0.0325, + "step": 29175 + }, + { + "epoch": 3.4597414917585674, + "grad_norm": 0.6268083426615895, + "learning_rate": 2.3540416441452303e-06, + "loss": 0.0312, + "step": 29176 + }, + { + "epoch": 3.4598600735206926, + "grad_norm": 0.5324158442267943, + "learning_rate": 2.353024899025119e-06, + "loss": 0.0214, + "step": 29177 + }, + { + "epoch": 3.4599786552828173, + "grad_norm": 0.478556260432901, + "learning_rate": 2.352008362682348e-06, + "loss": 0.0342, + "step": 29178 + }, + { + "epoch": 3.4600972370449425, + "grad_norm": 0.4941512047149912, + "learning_rate": 2.3509920351262897e-06, + "loss": 0.0225, + "step": 29179 + }, + { + "epoch": 3.4602158188070673, + "grad_norm": 0.4360482135295614, + "learning_rate": 2.3499759163663064e-06, + "loss": 0.0198, + "step": 29180 + }, + { + "epoch": 3.4603344005691925, + "grad_norm": 0.42785148117722394, + "learning_rate": 2.3489600064117714e-06, + "loss": 0.021, + "step": 29181 + }, + { + "epoch": 3.4604529823313173, + "grad_norm": 0.4193752116905748, + "learning_rate": 2.3479443052720455e-06, + "loss": 0.0224, + "step": 29182 + }, + { + "epoch": 3.4605715640934425, + "grad_norm": 0.6183234947768533, + "learning_rate": 2.3469288129565033e-06, + "loss": 0.0265, + "step": 29183 + }, + { + "epoch": 3.4606901458555672, + "grad_norm": 0.686243132890176, + "learning_rate": 2.3459135294744915e-06, + "loss": 0.0388, + "step": 29184 + }, + { + "epoch": 3.4608087276176924, + "grad_norm": 0.38876932355266997, + "learning_rate": 2.344898454835376e-06, + "loss": 0.0157, + "step": 29185 + }, + { + "epoch": 3.4609273093798176, + "grad_norm": 0.5486989908427763, + "learning_rate": 2.343883589048518e-06, + "loss": 0.0282, + "step": 29186 + }, + { + "epoch": 3.4610458911419424, + "grad_norm": 0.8197035562226992, + "learning_rate": 2.3428689321232683e-06, + "loss": 0.0394, + "step": 29187 + }, + { + "epoch": 3.461164472904067, + "grad_norm": 0.38294462543233554, + "learning_rate": 2.3418544840689837e-06, + "loss": 0.023, + "step": 29188 + }, + { + "epoch": 3.4612830546661923, + "grad_norm": 0.47244381162723154, + "learning_rate": 2.3408402448950178e-06, + "loss": 0.0212, + "step": 29189 + }, + { + "epoch": 3.4614016364283176, + "grad_norm": 0.7327408986259605, + "learning_rate": 2.3398262146107184e-06, + "loss": 0.0358, + "step": 29190 + }, + { + "epoch": 3.4615202181904423, + "grad_norm": 0.9052039550191823, + "learning_rate": 2.3388123932254314e-06, + "loss": 0.0355, + "step": 29191 + }, + { + "epoch": 3.461638799952567, + "grad_norm": 0.37430350096345405, + "learning_rate": 2.337798780748507e-06, + "loss": 0.0213, + "step": 29192 + }, + { + "epoch": 3.4617573817146923, + "grad_norm": 0.6426367452790981, + "learning_rate": 2.336785377189285e-06, + "loss": 0.0256, + "step": 29193 + }, + { + "epoch": 3.4618759634768175, + "grad_norm": 0.7611860600812577, + "learning_rate": 2.3357721825571167e-06, + "loss": 0.0284, + "step": 29194 + }, + { + "epoch": 3.4619945452389422, + "grad_norm": 0.5374940418043093, + "learning_rate": 2.3347591968613303e-06, + "loss": 0.0222, + "step": 29195 + }, + { + "epoch": 3.462113127001067, + "grad_norm": 0.43357739383765964, + "learning_rate": 2.3337464201112714e-06, + "loss": 0.0183, + "step": 29196 + }, + { + "epoch": 3.462231708763192, + "grad_norm": 0.45984287638870347, + "learning_rate": 2.3327338523162737e-06, + "loss": 0.0201, + "step": 29197 + }, + { + "epoch": 3.4623502905253174, + "grad_norm": 0.5021546935423872, + "learning_rate": 2.331721493485672e-06, + "loss": 0.0246, + "step": 29198 + }, + { + "epoch": 3.462468872287442, + "grad_norm": 0.38046099764940894, + "learning_rate": 2.3307093436288035e-06, + "loss": 0.0148, + "step": 29199 + }, + { + "epoch": 3.4625874540495674, + "grad_norm": 0.8246527526689806, + "learning_rate": 2.329697402754996e-06, + "loss": 0.0318, + "step": 29200 + }, + { + "epoch": 3.462706035811692, + "grad_norm": 0.5096182106261258, + "learning_rate": 2.3286856708735817e-06, + "loss": 0.0246, + "step": 29201 + }, + { + "epoch": 3.4628246175738173, + "grad_norm": 0.6155270745791951, + "learning_rate": 2.32767414799388e-06, + "loss": 0.0403, + "step": 29202 + }, + { + "epoch": 3.462943199335942, + "grad_norm": 0.7149685061260455, + "learning_rate": 2.326662834125218e-06, + "loss": 0.025, + "step": 29203 + }, + { + "epoch": 3.4630617810980673, + "grad_norm": 0.799547838291144, + "learning_rate": 2.325651729276923e-06, + "loss": 0.0323, + "step": 29204 + }, + { + "epoch": 3.463180362860192, + "grad_norm": 0.47744047681059104, + "learning_rate": 2.324640833458314e-06, + "loss": 0.0255, + "step": 29205 + }, + { + "epoch": 3.4632989446223172, + "grad_norm": 0.38861427293090195, + "learning_rate": 2.323630146678715e-06, + "loss": 0.0212, + "step": 29206 + }, + { + "epoch": 3.463417526384442, + "grad_norm": 0.5181630971639927, + "learning_rate": 2.322619668947429e-06, + "loss": 0.0291, + "step": 29207 + }, + { + "epoch": 3.463536108146567, + "grad_norm": 0.5033355362689607, + "learning_rate": 2.3216094002737887e-06, + "loss": 0.0203, + "step": 29208 + }, + { + "epoch": 3.463654689908692, + "grad_norm": 0.3843616538554143, + "learning_rate": 2.3205993406670956e-06, + "loss": 0.0129, + "step": 29209 + }, + { + "epoch": 3.463773271670817, + "grad_norm": 0.49273614052821935, + "learning_rate": 2.31958949013667e-06, + "loss": 0.0232, + "step": 29210 + }, + { + "epoch": 3.463891853432942, + "grad_norm": 0.41686628947461485, + "learning_rate": 2.318579848691807e-06, + "loss": 0.0199, + "step": 29211 + }, + { + "epoch": 3.464010435195067, + "grad_norm": 0.5355078326313036, + "learning_rate": 2.3175704163418327e-06, + "loss": 0.0305, + "step": 29212 + }, + { + "epoch": 3.464129016957192, + "grad_norm": 0.46081804506628865, + "learning_rate": 2.3165611930960422e-06, + "loss": 0.0187, + "step": 29213 + }, + { + "epoch": 3.464247598719317, + "grad_norm": 0.4681618977565449, + "learning_rate": 2.3155521789637395e-06, + "loss": 0.025, + "step": 29214 + }, + { + "epoch": 3.464366180481442, + "grad_norm": 0.37783781811394546, + "learning_rate": 2.314543373954228e-06, + "loss": 0.0209, + "step": 29215 + }, + { + "epoch": 3.464484762243567, + "grad_norm": 0.38131194814540975, + "learning_rate": 2.313534778076809e-06, + "loss": 0.0158, + "step": 29216 + }, + { + "epoch": 3.464603344005692, + "grad_norm": 0.3049061224715207, + "learning_rate": 2.312526391340783e-06, + "loss": 0.0115, + "step": 29217 + }, + { + "epoch": 3.464721925767817, + "grad_norm": 0.8081328030924009, + "learning_rate": 2.311518213755434e-06, + "loss": 0.0361, + "step": 29218 + }, + { + "epoch": 3.4648405075299418, + "grad_norm": 0.7385873176272129, + "learning_rate": 2.310510245330072e-06, + "loss": 0.0422, + "step": 29219 + }, + { + "epoch": 3.464959089292067, + "grad_norm": 0.6154225075242171, + "learning_rate": 2.3095024860739777e-06, + "loss": 0.0249, + "step": 29220 + }, + { + "epoch": 3.4650776710541917, + "grad_norm": 0.4057790569876691, + "learning_rate": 2.3084949359964525e-06, + "loss": 0.0198, + "step": 29221 + }, + { + "epoch": 3.465196252816317, + "grad_norm": 0.6043904936017849, + "learning_rate": 2.3074875951067663e-06, + "loss": 0.0299, + "step": 29222 + }, + { + "epoch": 3.4653148345784417, + "grad_norm": 0.4725309851724821, + "learning_rate": 2.306480463414226e-06, + "loss": 0.0273, + "step": 29223 + }, + { + "epoch": 3.465433416340567, + "grad_norm": 0.41311783224261067, + "learning_rate": 2.305473540928105e-06, + "loss": 0.0166, + "step": 29224 + }, + { + "epoch": 3.4655519981026917, + "grad_norm": 0.39116023405248135, + "learning_rate": 2.3044668276576865e-06, + "loss": 0.0177, + "step": 29225 + }, + { + "epoch": 3.465670579864817, + "grad_norm": 0.4396692049511022, + "learning_rate": 2.3034603236122526e-06, + "loss": 0.0185, + "step": 29226 + }, + { + "epoch": 3.4657891616269416, + "grad_norm": 0.64726760833289, + "learning_rate": 2.302454028801082e-06, + "loss": 0.0261, + "step": 29227 + }, + { + "epoch": 3.465907743389067, + "grad_norm": 0.2683013761735624, + "learning_rate": 2.301447943233459e-06, + "loss": 0.0135, + "step": 29228 + }, + { + "epoch": 3.4660263251511916, + "grad_norm": 0.7350067798720586, + "learning_rate": 2.30044206691864e-06, + "loss": 0.0337, + "step": 29229 + }, + { + "epoch": 3.466144906913317, + "grad_norm": 0.4117896264739196, + "learning_rate": 2.2994363998659175e-06, + "loss": 0.0169, + "step": 29230 + }, + { + "epoch": 3.4662634886754415, + "grad_norm": 0.5889909730550916, + "learning_rate": 2.2984309420845505e-06, + "loss": 0.0286, + "step": 29231 + }, + { + "epoch": 3.4663820704375667, + "grad_norm": 0.7806434397525628, + "learning_rate": 2.2974256935838155e-06, + "loss": 0.0451, + "step": 29232 + }, + { + "epoch": 3.4665006521996915, + "grad_norm": 0.5877016213551188, + "learning_rate": 2.296420654372966e-06, + "loss": 0.031, + "step": 29233 + }, + { + "epoch": 3.4666192339618167, + "grad_norm": 0.5305012375935263, + "learning_rate": 2.295415824461289e-06, + "loss": 0.0258, + "step": 29234 + }, + { + "epoch": 3.466737815723942, + "grad_norm": 0.6468108451264133, + "learning_rate": 2.2944112038580296e-06, + "loss": 0.0311, + "step": 29235 + }, + { + "epoch": 3.4668563974860667, + "grad_norm": 0.7446141292116759, + "learning_rate": 2.2934067925724558e-06, + "loss": 0.0338, + "step": 29236 + }, + { + "epoch": 3.4669749792481914, + "grad_norm": 0.6951810469313582, + "learning_rate": 2.2924025906138263e-06, + "loss": 0.0438, + "step": 29237 + }, + { + "epoch": 3.4670935610103166, + "grad_norm": 0.48966888181366836, + "learning_rate": 2.2913985979913984e-06, + "loss": 0.0245, + "step": 29238 + }, + { + "epoch": 3.467212142772442, + "grad_norm": 0.858347380816816, + "learning_rate": 2.2903948147144335e-06, + "loss": 0.0397, + "step": 29239 + }, + { + "epoch": 3.4673307245345666, + "grad_norm": 0.6910544504983422, + "learning_rate": 2.289391240792174e-06, + "loss": 0.0406, + "step": 29240 + }, + { + "epoch": 3.4674493062966913, + "grad_norm": 0.4180187487007885, + "learning_rate": 2.2883878762338802e-06, + "loss": 0.0188, + "step": 29241 + }, + { + "epoch": 3.4675678880588165, + "grad_norm": 0.4030310050911613, + "learning_rate": 2.287384721048799e-06, + "loss": 0.0216, + "step": 29242 + }, + { + "epoch": 3.4676864698209418, + "grad_norm": 0.29165651728320235, + "learning_rate": 2.2863817752461824e-06, + "loss": 0.0127, + "step": 29243 + }, + { + "epoch": 3.4678050515830665, + "grad_norm": 0.6283633334819425, + "learning_rate": 2.285379038835264e-06, + "loss": 0.0276, + "step": 29244 + }, + { + "epoch": 3.4679236333451913, + "grad_norm": 0.5748974427353366, + "learning_rate": 2.2843765118253063e-06, + "loss": 0.0325, + "step": 29245 + }, + { + "epoch": 3.4680422151073165, + "grad_norm": 0.5430599629009871, + "learning_rate": 2.283374194225535e-06, + "loss": 0.0276, + "step": 29246 + }, + { + "epoch": 3.4681607968694417, + "grad_norm": 0.45924909733827496, + "learning_rate": 2.2823720860451982e-06, + "loss": 0.0212, + "step": 29247 + }, + { + "epoch": 3.4682793786315664, + "grad_norm": 0.3977549388136383, + "learning_rate": 2.28137018729353e-06, + "loss": 0.0161, + "step": 29248 + }, + { + "epoch": 3.4683979603936916, + "grad_norm": 0.6090912479066212, + "learning_rate": 2.280368497979771e-06, + "loss": 0.0402, + "step": 29249 + }, + { + "epoch": 3.4685165421558164, + "grad_norm": 0.5027632041767917, + "learning_rate": 2.27936701811316e-06, + "loss": 0.0291, + "step": 29250 + }, + { + "epoch": 3.4686351239179416, + "grad_norm": 0.7510045877220342, + "learning_rate": 2.2783657477029154e-06, + "loss": 0.0283, + "step": 29251 + }, + { + "epoch": 3.4687537056800664, + "grad_norm": 0.37895849673328674, + "learning_rate": 2.2773646867582766e-06, + "loss": 0.0193, + "step": 29252 + }, + { + "epoch": 3.4688722874421916, + "grad_norm": 0.872207472640943, + "learning_rate": 2.2763638352884724e-06, + "loss": 0.0333, + "step": 29253 + }, + { + "epoch": 3.4689908692043163, + "grad_norm": 0.7354669985035106, + "learning_rate": 2.2753631933027262e-06, + "loss": 0.0376, + "step": 29254 + }, + { + "epoch": 3.4691094509664415, + "grad_norm": 0.6138287094172261, + "learning_rate": 2.2743627608102724e-06, + "loss": 0.0283, + "step": 29255 + }, + { + "epoch": 3.4692280327285663, + "grad_norm": 0.5557663521248286, + "learning_rate": 2.273362537820314e-06, + "loss": 0.0272, + "step": 29256 + }, + { + "epoch": 3.4693466144906915, + "grad_norm": 0.4284694470059661, + "learning_rate": 2.272362524342092e-06, + "loss": 0.0138, + "step": 29257 + }, + { + "epoch": 3.4694651962528162, + "grad_norm": 0.5118234662778252, + "learning_rate": 2.271362720384815e-06, + "loss": 0.0305, + "step": 29258 + }, + { + "epoch": 3.4695837780149414, + "grad_norm": 0.37555604533488723, + "learning_rate": 2.2703631259577e-06, + "loss": 0.0213, + "step": 29259 + }, + { + "epoch": 3.469702359777066, + "grad_norm": 0.4737136585785027, + "learning_rate": 2.2693637410699657e-06, + "loss": 0.0218, + "step": 29260 + }, + { + "epoch": 3.4698209415391914, + "grad_norm": 0.6315782985750064, + "learning_rate": 2.2683645657308267e-06, + "loss": 0.0321, + "step": 29261 + }, + { + "epoch": 3.469939523301316, + "grad_norm": 0.42753246248342197, + "learning_rate": 2.267365599949489e-06, + "loss": 0.0141, + "step": 29262 + }, + { + "epoch": 3.4700581050634414, + "grad_norm": 0.47867536845770137, + "learning_rate": 2.2663668437351625e-06, + "loss": 0.0232, + "step": 29263 + }, + { + "epoch": 3.470176686825566, + "grad_norm": 0.5341752148269964, + "learning_rate": 2.2653682970970535e-06, + "loss": 0.0189, + "step": 29264 + }, + { + "epoch": 3.4702952685876913, + "grad_norm": 0.7014484795579908, + "learning_rate": 2.264369960044374e-06, + "loss": 0.0466, + "step": 29265 + }, + { + "epoch": 3.470413850349816, + "grad_norm": 0.47148391756452984, + "learning_rate": 2.263371832586325e-06, + "loss": 0.018, + "step": 29266 + }, + { + "epoch": 3.4705324321119413, + "grad_norm": 0.6170328148881957, + "learning_rate": 2.2623739147320967e-06, + "loss": 0.0245, + "step": 29267 + }, + { + "epoch": 3.470651013874066, + "grad_norm": 0.5502432708161583, + "learning_rate": 2.261376206490909e-06, + "loss": 0.0251, + "step": 29268 + }, + { + "epoch": 3.4707695956361913, + "grad_norm": 0.6851771844123379, + "learning_rate": 2.2603787078719437e-06, + "loss": 0.0329, + "step": 29269 + }, + { + "epoch": 3.470888177398316, + "grad_norm": 0.6088632999321347, + "learning_rate": 2.259381418884404e-06, + "loss": 0.0273, + "step": 29270 + }, + { + "epoch": 3.471006759160441, + "grad_norm": 0.5242722267236496, + "learning_rate": 2.2583843395374753e-06, + "loss": 0.0341, + "step": 29271 + }, + { + "epoch": 3.471125340922566, + "grad_norm": 0.6595012240541882, + "learning_rate": 2.2573874698403634e-06, + "loss": 0.034, + "step": 29272 + }, + { + "epoch": 3.471243922684691, + "grad_norm": 0.7472497732631438, + "learning_rate": 2.2563908098022445e-06, + "loss": 0.034, + "step": 29273 + }, + { + "epoch": 3.471362504446816, + "grad_norm": 0.5263060717119447, + "learning_rate": 2.255394359432314e-06, + "loss": 0.0214, + "step": 29274 + }, + { + "epoch": 3.471481086208941, + "grad_norm": 0.6765304455154629, + "learning_rate": 2.2543981187397534e-06, + "loss": 0.0285, + "step": 29275 + }, + { + "epoch": 3.471599667971066, + "grad_norm": 0.4443430979735869, + "learning_rate": 2.2534020877337524e-06, + "loss": 0.018, + "step": 29276 + }, + { + "epoch": 3.471718249733191, + "grad_norm": 0.7081427017708232, + "learning_rate": 2.252406266423493e-06, + "loss": 0.0308, + "step": 29277 + }, + { + "epoch": 3.471836831495316, + "grad_norm": 0.3477525673613209, + "learning_rate": 2.2514106548181454e-06, + "loss": 0.0174, + "step": 29278 + }, + { + "epoch": 3.471955413257441, + "grad_norm": 0.318594687569461, + "learning_rate": 2.2504152529269047e-06, + "loss": 0.0139, + "step": 29279 + }, + { + "epoch": 3.472073995019566, + "grad_norm": 0.9101779629918835, + "learning_rate": 2.2494200607589334e-06, + "loss": 0.0356, + "step": 29280 + }, + { + "epoch": 3.472192576781691, + "grad_norm": 0.500116108698615, + "learning_rate": 2.248425078323413e-06, + "loss": 0.0238, + "step": 29281 + }, + { + "epoch": 3.472311158543816, + "grad_norm": 0.42246203136601496, + "learning_rate": 2.2474303056295056e-06, + "loss": 0.0177, + "step": 29282 + }, + { + "epoch": 3.472429740305941, + "grad_norm": 0.43451935592753393, + "learning_rate": 2.246435742686401e-06, + "loss": 0.0268, + "step": 29283 + }, + { + "epoch": 3.4725483220680657, + "grad_norm": 0.7409369785823833, + "learning_rate": 2.24544138950325e-06, + "loss": 0.0447, + "step": 29284 + }, + { + "epoch": 3.472666903830191, + "grad_norm": 0.5498597544107012, + "learning_rate": 2.244447246089226e-06, + "loss": 0.0226, + "step": 29285 + }, + { + "epoch": 3.4727854855923157, + "grad_norm": 0.4850332120498831, + "learning_rate": 2.2434533124534945e-06, + "loss": 0.0195, + "step": 29286 + }, + { + "epoch": 3.472904067354441, + "grad_norm": 0.5777104109762493, + "learning_rate": 2.242459588605214e-06, + "loss": 0.0313, + "step": 29287 + }, + { + "epoch": 3.473022649116566, + "grad_norm": 0.8531998864192926, + "learning_rate": 2.241466074553558e-06, + "loss": 0.0375, + "step": 29288 + }, + { + "epoch": 3.473141230878691, + "grad_norm": 0.8808510730651472, + "learning_rate": 2.2404727703076666e-06, + "loss": 0.061, + "step": 29289 + }, + { + "epoch": 3.4732598126408156, + "grad_norm": 0.3629974071762716, + "learning_rate": 2.239479675876713e-06, + "loss": 0.0142, + "step": 29290 + }, + { + "epoch": 3.473378394402941, + "grad_norm": 0.90598838058496, + "learning_rate": 2.2384867912698447e-06, + "loss": 0.0461, + "step": 29291 + }, + { + "epoch": 3.473496976165066, + "grad_norm": 0.4284036238984433, + "learning_rate": 2.2374941164962194e-06, + "loss": 0.0193, + "step": 29292 + }, + { + "epoch": 3.473615557927191, + "grad_norm": 0.7239327351875692, + "learning_rate": 2.2365016515649763e-06, + "loss": 0.0364, + "step": 29293 + }, + { + "epoch": 3.4737341396893155, + "grad_norm": 0.8821446501222017, + "learning_rate": 2.2355093964852802e-06, + "loss": 0.035, + "step": 29294 + }, + { + "epoch": 3.4738527214514408, + "grad_norm": 1.303705268701076, + "learning_rate": 2.2345173512662685e-06, + "loss": 0.0223, + "step": 29295 + }, + { + "epoch": 3.473971303213566, + "grad_norm": 0.740830195590785, + "learning_rate": 2.2335255159170925e-06, + "loss": 0.0458, + "step": 29296 + }, + { + "epoch": 3.4740898849756907, + "grad_norm": 0.5318721779287762, + "learning_rate": 2.2325338904468917e-06, + "loss": 0.0255, + "step": 29297 + }, + { + "epoch": 3.474208466737816, + "grad_norm": 0.5567133470567315, + "learning_rate": 2.231542474864809e-06, + "loss": 0.0251, + "step": 29298 + }, + { + "epoch": 3.4743270484999407, + "grad_norm": 0.7224846561077777, + "learning_rate": 2.230551269179987e-06, + "loss": 0.0378, + "step": 29299 + }, + { + "epoch": 3.474445630262066, + "grad_norm": 0.43786612182919876, + "learning_rate": 2.229560273401554e-06, + "loss": 0.0202, + "step": 29300 + }, + { + "epoch": 3.4745642120241906, + "grad_norm": 0.4855989336001142, + "learning_rate": 2.2285694875386593e-06, + "loss": 0.0265, + "step": 29301 + }, + { + "epoch": 3.474682793786316, + "grad_norm": 0.7723315909555917, + "learning_rate": 2.227578911600428e-06, + "loss": 0.0445, + "step": 29302 + }, + { + "epoch": 3.4748013755484406, + "grad_norm": 0.9579218388019975, + "learning_rate": 2.226588545595995e-06, + "loss": 0.0492, + "step": 29303 + }, + { + "epoch": 3.474919957310566, + "grad_norm": 0.6058188706639753, + "learning_rate": 2.2255983895344836e-06, + "loss": 0.0307, + "step": 29304 + }, + { + "epoch": 3.4750385390726906, + "grad_norm": 0.3739418652275196, + "learning_rate": 2.224608443425033e-06, + "loss": 0.0194, + "step": 29305 + }, + { + "epoch": 3.4751571208348158, + "grad_norm": 0.3088536485077807, + "learning_rate": 2.223618707276759e-06, + "loss": 0.0126, + "step": 29306 + }, + { + "epoch": 3.4752757025969405, + "grad_norm": 0.5323389155948776, + "learning_rate": 2.2226291810987924e-06, + "loss": 0.0228, + "step": 29307 + }, + { + "epoch": 3.4753942843590657, + "grad_norm": 0.5161717899718726, + "learning_rate": 2.221639864900252e-06, + "loss": 0.0257, + "step": 29308 + }, + { + "epoch": 3.4755128661211905, + "grad_norm": 0.5874382754576339, + "learning_rate": 2.22065075869026e-06, + "loss": 0.0251, + "step": 29309 + }, + { + "epoch": 3.4756314478833157, + "grad_norm": 0.6195073131961786, + "learning_rate": 2.2196618624779397e-06, + "loss": 0.0309, + "step": 29310 + }, + { + "epoch": 3.4757500296454404, + "grad_norm": 0.45311891609728844, + "learning_rate": 2.2186731762723958e-06, + "loss": 0.0156, + "step": 29311 + }, + { + "epoch": 3.4758686114075656, + "grad_norm": 0.9052438917065136, + "learning_rate": 2.217684700082748e-06, + "loss": 0.0373, + "step": 29312 + }, + { + "epoch": 3.4759871931696904, + "grad_norm": 0.528925437288683, + "learning_rate": 2.2166964339181113e-06, + "loss": 0.0334, + "step": 29313 + }, + { + "epoch": 3.4761057749318156, + "grad_norm": 0.35370057350201084, + "learning_rate": 2.2157083777875954e-06, + "loss": 0.0219, + "step": 29314 + }, + { + "epoch": 3.4762243566939404, + "grad_norm": 0.4138473033560201, + "learning_rate": 2.2147205317003067e-06, + "loss": 0.0221, + "step": 29315 + }, + { + "epoch": 3.4763429384560656, + "grad_norm": 0.6401150730010214, + "learning_rate": 2.2137328956653546e-06, + "loss": 0.0312, + "step": 29316 + }, + { + "epoch": 3.4764615202181903, + "grad_norm": 0.6707898494459144, + "learning_rate": 2.212745469691846e-06, + "loss": 0.0315, + "step": 29317 + }, + { + "epoch": 3.4765801019803155, + "grad_norm": 0.40358682743428187, + "learning_rate": 2.211758253788876e-06, + "loss": 0.0202, + "step": 29318 + }, + { + "epoch": 3.4766986837424403, + "grad_norm": 0.5326616715658983, + "learning_rate": 2.210771247965551e-06, + "loss": 0.022, + "step": 29319 + }, + { + "epoch": 3.4768172655045655, + "grad_norm": 0.7829377123292888, + "learning_rate": 2.20978445223097e-06, + "loss": 0.0379, + "step": 29320 + }, + { + "epoch": 3.4769358472666902, + "grad_norm": 0.7053117780983884, + "learning_rate": 2.208797866594234e-06, + "loss": 0.0328, + "step": 29321 + }, + { + "epoch": 3.4770544290288155, + "grad_norm": 0.5754881475084993, + "learning_rate": 2.2078114910644267e-06, + "loss": 0.0313, + "step": 29322 + }, + { + "epoch": 3.47717301079094, + "grad_norm": 0.479199447300817, + "learning_rate": 2.2068253256506493e-06, + "loss": 0.0194, + "step": 29323 + }, + { + "epoch": 3.4772915925530654, + "grad_norm": 0.6377698461028196, + "learning_rate": 2.2058393703619897e-06, + "loss": 0.034, + "step": 29324 + }, + { + "epoch": 3.47741017431519, + "grad_norm": 0.44227365875209984, + "learning_rate": 2.2048536252075397e-06, + "loss": 0.0207, + "step": 29325 + }, + { + "epoch": 3.4775287560773154, + "grad_norm": 0.9156117619760162, + "learning_rate": 2.2038680901963925e-06, + "loss": 0.0477, + "step": 29326 + }, + { + "epoch": 3.47764733783944, + "grad_norm": 0.7707995508117805, + "learning_rate": 2.202882765337616e-06, + "loss": 0.0336, + "step": 29327 + }, + { + "epoch": 3.4777659196015653, + "grad_norm": 0.44029075020120123, + "learning_rate": 2.201897650640314e-06, + "loss": 0.0226, + "step": 29328 + }, + { + "epoch": 3.47788450136369, + "grad_norm": 0.45122823482367913, + "learning_rate": 2.2009127461135565e-06, + "loss": 0.0172, + "step": 29329 + }, + { + "epoch": 3.4780030831258153, + "grad_norm": 0.5891445525146076, + "learning_rate": 2.1999280517664226e-06, + "loss": 0.0263, + "step": 29330 + }, + { + "epoch": 3.47812166488794, + "grad_norm": 0.3462110599317727, + "learning_rate": 2.1989435676079943e-06, + "loss": 0.0139, + "step": 29331 + }, + { + "epoch": 3.4782402466500653, + "grad_norm": 0.5959919057688172, + "learning_rate": 2.1979592936473504e-06, + "loss": 0.0304, + "step": 29332 + }, + { + "epoch": 3.47835882841219, + "grad_norm": 0.4198642945130328, + "learning_rate": 2.1969752298935526e-06, + "loss": 0.0237, + "step": 29333 + }, + { + "epoch": 3.478477410174315, + "grad_norm": 0.8768855868878417, + "learning_rate": 2.1959913763556826e-06, + "loss": 0.0456, + "step": 29334 + }, + { + "epoch": 3.47859599193644, + "grad_norm": 0.6418834501250485, + "learning_rate": 2.1950077330428088e-06, + "loss": 0.0322, + "step": 29335 + }, + { + "epoch": 3.478714573698565, + "grad_norm": 0.9323994816249697, + "learning_rate": 2.1940242999639983e-06, + "loss": 0.0451, + "step": 29336 + }, + { + "epoch": 3.4788331554606904, + "grad_norm": 0.4736428340892236, + "learning_rate": 2.1930410771283215e-06, + "loss": 0.0211, + "step": 29337 + }, + { + "epoch": 3.478951737222815, + "grad_norm": 0.3806524484121805, + "learning_rate": 2.1920580645448273e-06, + "loss": 0.0173, + "step": 29338 + }, + { + "epoch": 3.47907031898494, + "grad_norm": 0.39271459213148735, + "learning_rate": 2.1910752622225998e-06, + "loss": 0.0195, + "step": 29339 + }, + { + "epoch": 3.479188900747065, + "grad_norm": 0.7652184910521687, + "learning_rate": 2.190092670170682e-06, + "loss": 0.0401, + "step": 29340 + }, + { + "epoch": 3.4793074825091903, + "grad_norm": 0.6550023011773486, + "learning_rate": 2.189110288398144e-06, + "loss": 0.0325, + "step": 29341 + }, + { + "epoch": 3.479426064271315, + "grad_norm": 0.556079665405781, + "learning_rate": 2.188128116914029e-06, + "loss": 0.0244, + "step": 29342 + }, + { + "epoch": 3.47954464603344, + "grad_norm": 0.5804483168101726, + "learning_rate": 2.187146155727407e-06, + "loss": 0.0333, + "step": 29343 + }, + { + "epoch": 3.479663227795565, + "grad_norm": 0.4207348553413509, + "learning_rate": 2.186164404847316e-06, + "loss": 0.024, + "step": 29344 + }, + { + "epoch": 3.4797818095576902, + "grad_norm": 0.37604461781482884, + "learning_rate": 2.185182864282814e-06, + "loss": 0.0184, + "step": 29345 + }, + { + "epoch": 3.479900391319815, + "grad_norm": 0.40143818922139574, + "learning_rate": 2.184201534042951e-06, + "loss": 0.0115, + "step": 29346 + }, + { + "epoch": 3.4800189730819397, + "grad_norm": 0.36458376725432545, + "learning_rate": 2.183220414136769e-06, + "loss": 0.0204, + "step": 29347 + }, + { + "epoch": 3.480137554844065, + "grad_norm": 0.6322863943474227, + "learning_rate": 2.1822395045733216e-06, + "loss": 0.0275, + "step": 29348 + }, + { + "epoch": 3.48025613660619, + "grad_norm": 0.37379970915503685, + "learning_rate": 2.181258805361633e-06, + "loss": 0.0173, + "step": 29349 + }, + { + "epoch": 3.480374718368315, + "grad_norm": 0.7049291399989598, + "learning_rate": 2.180278316510767e-06, + "loss": 0.029, + "step": 29350 + }, + { + "epoch": 3.48049330013044, + "grad_norm": 0.3141729574854941, + "learning_rate": 2.179298038029745e-06, + "loss": 0.0138, + "step": 29351 + }, + { + "epoch": 3.480611881892565, + "grad_norm": 0.769604685147985, + "learning_rate": 2.1783179699276178e-06, + "loss": 0.0462, + "step": 29352 + }, + { + "epoch": 3.48073046365469, + "grad_norm": 0.5229118164403525, + "learning_rate": 2.1773381122134032e-06, + "loss": 0.0206, + "step": 29353 + }, + { + "epoch": 3.480849045416815, + "grad_norm": 0.39675361815150295, + "learning_rate": 2.1763584648961525e-06, + "loss": 0.015, + "step": 29354 + }, + { + "epoch": 3.48096762717894, + "grad_norm": 0.5901665434375012, + "learning_rate": 2.1753790279848835e-06, + "loss": 0.0291, + "step": 29355 + }, + { + "epoch": 3.481086208941065, + "grad_norm": 0.7726218988545044, + "learning_rate": 2.174399801488633e-06, + "loss": 0.0389, + "step": 29356 + }, + { + "epoch": 3.48120479070319, + "grad_norm": 0.5212799956283036, + "learning_rate": 2.173420785416422e-06, + "loss": 0.0309, + "step": 29357 + }, + { + "epoch": 3.4813233724653148, + "grad_norm": 0.49352449095883394, + "learning_rate": 2.1724419797772816e-06, + "loss": 0.0243, + "step": 29358 + }, + { + "epoch": 3.48144195422744, + "grad_norm": 0.3115012475697842, + "learning_rate": 2.1714633845802384e-06, + "loss": 0.0132, + "step": 29359 + }, + { + "epoch": 3.4815605359895647, + "grad_norm": 0.4576440760352765, + "learning_rate": 2.170484999834299e-06, + "loss": 0.0167, + "step": 29360 + }, + { + "epoch": 3.48167911775169, + "grad_norm": 0.42054799161064255, + "learning_rate": 2.1695068255485007e-06, + "loss": 0.0195, + "step": 29361 + }, + { + "epoch": 3.4817976995138147, + "grad_norm": 0.9073489335772802, + "learning_rate": 2.16852886173185e-06, + "loss": 0.0456, + "step": 29362 + }, + { + "epoch": 3.48191628127594, + "grad_norm": 0.4137503570205825, + "learning_rate": 2.1675511083933645e-06, + "loss": 0.0223, + "step": 29363 + }, + { + "epoch": 3.4820348630380646, + "grad_norm": 0.3712116115315126, + "learning_rate": 2.1665735655420573e-06, + "loss": 0.0139, + "step": 29364 + }, + { + "epoch": 3.48215344480019, + "grad_norm": 0.2998202805668738, + "learning_rate": 2.165596233186945e-06, + "loss": 0.0131, + "step": 29365 + }, + { + "epoch": 3.4822720265623146, + "grad_norm": 0.23754619209433964, + "learning_rate": 2.1646191113370352e-06, + "loss": 0.008, + "step": 29366 + }, + { + "epoch": 3.48239060832444, + "grad_norm": 0.6090247735199279, + "learning_rate": 2.1636422000013312e-06, + "loss": 0.0283, + "step": 29367 + }, + { + "epoch": 3.4825091900865646, + "grad_norm": 0.8229742648228398, + "learning_rate": 2.162665499188843e-06, + "loss": 0.0262, + "step": 29368 + }, + { + "epoch": 3.4826277718486898, + "grad_norm": 0.420605370464569, + "learning_rate": 2.161689008908574e-06, + "loss": 0.0169, + "step": 29369 + }, + { + "epoch": 3.4827463536108145, + "grad_norm": 0.8027462768663394, + "learning_rate": 2.1607127291695284e-06, + "loss": 0.0427, + "step": 29370 + }, + { + "epoch": 3.4828649353729397, + "grad_norm": 0.7813221749347392, + "learning_rate": 2.159736659980699e-06, + "loss": 0.0374, + "step": 29371 + }, + { + "epoch": 3.4829835171350645, + "grad_norm": 0.5417231535248139, + "learning_rate": 2.158760801351095e-06, + "loss": 0.0227, + "step": 29372 + }, + { + "epoch": 3.4831020988971897, + "grad_norm": 0.42590830354843245, + "learning_rate": 2.157785153289704e-06, + "loss": 0.0219, + "step": 29373 + }, + { + "epoch": 3.4832206806593144, + "grad_norm": 0.5690959167975002, + "learning_rate": 2.1568097158055216e-06, + "loss": 0.0298, + "step": 29374 + }, + { + "epoch": 3.4833392624214397, + "grad_norm": 0.45855264073905355, + "learning_rate": 2.1558344889075407e-06, + "loss": 0.0222, + "step": 29375 + }, + { + "epoch": 3.4834578441835644, + "grad_norm": 0.5412947137452215, + "learning_rate": 2.1548594726047534e-06, + "loss": 0.0204, + "step": 29376 + }, + { + "epoch": 3.4835764259456896, + "grad_norm": 0.9244819516968619, + "learning_rate": 2.1538846669061528e-06, + "loss": 0.0474, + "step": 29377 + }, + { + "epoch": 3.4836950077078144, + "grad_norm": 0.7712668451395548, + "learning_rate": 2.152910071820713e-06, + "loss": 0.0406, + "step": 29378 + }, + { + "epoch": 3.4838135894699396, + "grad_norm": 0.3715724229858837, + "learning_rate": 2.151935687357426e-06, + "loss": 0.0198, + "step": 29379 + }, + { + "epoch": 3.4839321712320643, + "grad_norm": 0.4190971512742161, + "learning_rate": 2.1509615135252763e-06, + "loss": 0.0189, + "step": 29380 + }, + { + "epoch": 3.4840507529941895, + "grad_norm": 0.5583625104877675, + "learning_rate": 2.149987550333243e-06, + "loss": 0.0297, + "step": 29381 + }, + { + "epoch": 3.4841693347563143, + "grad_norm": 0.5107668040664239, + "learning_rate": 2.149013797790303e-06, + "loss": 0.0163, + "step": 29382 + }, + { + "epoch": 3.4842879165184395, + "grad_norm": 0.4987948268058593, + "learning_rate": 2.1480402559054312e-06, + "loss": 0.0235, + "step": 29383 + }, + { + "epoch": 3.4844064982805643, + "grad_norm": 0.5468848392140822, + "learning_rate": 2.1470669246876074e-06, + "loss": 0.0243, + "step": 29384 + }, + { + "epoch": 3.4845250800426895, + "grad_norm": 0.3070318112993743, + "learning_rate": 2.146093804145802e-06, + "loss": 0.0136, + "step": 29385 + }, + { + "epoch": 3.4846436618048147, + "grad_norm": 0.4242458097504232, + "learning_rate": 2.145120894288985e-06, + "loss": 0.023, + "step": 29386 + }, + { + "epoch": 3.4847622435669394, + "grad_norm": 0.5342233751205393, + "learning_rate": 2.144148195126128e-06, + "loss": 0.0236, + "step": 29387 + }, + { + "epoch": 3.484880825329064, + "grad_norm": 0.7280900267584821, + "learning_rate": 2.1431757066662006e-06, + "loss": 0.0421, + "step": 29388 + }, + { + "epoch": 3.4849994070911894, + "grad_norm": 0.4620429294018603, + "learning_rate": 2.14220342891816e-06, + "loss": 0.0213, + "step": 29389 + }, + { + "epoch": 3.4851179888533146, + "grad_norm": 0.4123952486557153, + "learning_rate": 2.1412313618909746e-06, + "loss": 0.0187, + "step": 29390 + }, + { + "epoch": 3.4852365706154393, + "grad_norm": 0.48836975103166097, + "learning_rate": 2.1402595055936033e-06, + "loss": 0.0371, + "step": 29391 + }, + { + "epoch": 3.485355152377564, + "grad_norm": 0.6443357435648437, + "learning_rate": 2.1392878600350134e-06, + "loss": 0.0332, + "step": 29392 + }, + { + "epoch": 3.4854737341396893, + "grad_norm": 0.29849359634335715, + "learning_rate": 2.1383164252241487e-06, + "loss": 0.0139, + "step": 29393 + }, + { + "epoch": 3.4855923159018145, + "grad_norm": 0.7039033963168636, + "learning_rate": 2.137345201169974e-06, + "loss": 0.0341, + "step": 29394 + }, + { + "epoch": 3.4857108976639393, + "grad_norm": 0.6359529403845702, + "learning_rate": 2.1363741878814403e-06, + "loss": 0.0352, + "step": 29395 + }, + { + "epoch": 3.485829479426064, + "grad_norm": 0.5386421084082034, + "learning_rate": 2.1354033853674986e-06, + "loss": 0.0244, + "step": 29396 + }, + { + "epoch": 3.4859480611881892, + "grad_norm": 0.44246996754913304, + "learning_rate": 2.1344327936371036e-06, + "loss": 0.0194, + "step": 29397 + }, + { + "epoch": 3.4860666429503144, + "grad_norm": 0.7378677484345851, + "learning_rate": 2.133462412699189e-06, + "loss": 0.0425, + "step": 29398 + }, + { + "epoch": 3.486185224712439, + "grad_norm": 0.45655437152017103, + "learning_rate": 2.13249224256272e-06, + "loss": 0.019, + "step": 29399 + }, + { + "epoch": 3.4863038064745644, + "grad_norm": 0.37666063736740313, + "learning_rate": 2.1315222832366253e-06, + "loss": 0.0196, + "step": 29400 + }, + { + "epoch": 3.486422388236689, + "grad_norm": 0.4224789087048029, + "learning_rate": 2.1305525347298544e-06, + "loss": 0.0218, + "step": 29401 + }, + { + "epoch": 3.4865409699988144, + "grad_norm": 0.3401520767526731, + "learning_rate": 2.1295829970513437e-06, + "loss": 0.0128, + "step": 29402 + }, + { + "epoch": 3.486659551760939, + "grad_norm": 0.5337249347891128, + "learning_rate": 2.1286136702100363e-06, + "loss": 0.0227, + "step": 29403 + }, + { + "epoch": 3.4867781335230643, + "grad_norm": 0.30963777003327553, + "learning_rate": 2.127644554214858e-06, + "loss": 0.0199, + "step": 29404 + }, + { + "epoch": 3.486896715285189, + "grad_norm": 0.8811327835155651, + "learning_rate": 2.1266756490747498e-06, + "loss": 0.0428, + "step": 29405 + }, + { + "epoch": 3.4870152970473143, + "grad_norm": 0.5199223003569657, + "learning_rate": 2.125706954798645e-06, + "loss": 0.0202, + "step": 29406 + }, + { + "epoch": 3.487133878809439, + "grad_norm": 0.7010271495556544, + "learning_rate": 2.124738471395471e-06, + "loss": 0.023, + "step": 29407 + }, + { + "epoch": 3.4872524605715642, + "grad_norm": 0.7038915040488598, + "learning_rate": 2.1237701988741587e-06, + "loss": 0.0459, + "step": 29408 + }, + { + "epoch": 3.487371042333689, + "grad_norm": 0.551770917002288, + "learning_rate": 2.1228021372436264e-06, + "loss": 0.0273, + "step": 29409 + }, + { + "epoch": 3.487489624095814, + "grad_norm": 0.6711967133211967, + "learning_rate": 2.121834286512814e-06, + "loss": 0.026, + "step": 29410 + }, + { + "epoch": 3.487608205857939, + "grad_norm": 0.5162639335517566, + "learning_rate": 2.120866646690628e-06, + "loss": 0.022, + "step": 29411 + }, + { + "epoch": 3.487726787620064, + "grad_norm": 0.5992216275926507, + "learning_rate": 2.119899217785995e-06, + "loss": 0.0245, + "step": 29412 + }, + { + "epoch": 3.487845369382189, + "grad_norm": 0.5398326060707596, + "learning_rate": 2.1189319998078355e-06, + "loss": 0.0366, + "step": 29413 + }, + { + "epoch": 3.487963951144314, + "grad_norm": 0.3769189267695202, + "learning_rate": 2.1179649927650617e-06, + "loss": 0.0166, + "step": 29414 + }, + { + "epoch": 3.488082532906439, + "grad_norm": 0.602338151840381, + "learning_rate": 2.1169981966665976e-06, + "loss": 0.0236, + "step": 29415 + }, + { + "epoch": 3.488201114668564, + "grad_norm": 0.48540939936694744, + "learning_rate": 2.116031611521341e-06, + "loss": 0.0206, + "step": 29416 + }, + { + "epoch": 3.488319696430689, + "grad_norm": 0.4509895890743566, + "learning_rate": 2.115065237338218e-06, + "loss": 0.0203, + "step": 29417 + }, + { + "epoch": 3.488438278192814, + "grad_norm": 0.4826364100839541, + "learning_rate": 2.1140990741261245e-06, + "loss": 0.0252, + "step": 29418 + }, + { + "epoch": 3.488556859954939, + "grad_norm": 0.3834478205525187, + "learning_rate": 2.113133121893979e-06, + "loss": 0.0204, + "step": 29419 + }, + { + "epoch": 3.488675441717064, + "grad_norm": 0.7331115487735184, + "learning_rate": 2.112167380650673e-06, + "loss": 0.0298, + "step": 29420 + }, + { + "epoch": 3.4887940234791888, + "grad_norm": 0.5455376413379442, + "learning_rate": 2.1112018504051227e-06, + "loss": 0.0261, + "step": 29421 + }, + { + "epoch": 3.488912605241314, + "grad_norm": 0.46184470398611494, + "learning_rate": 2.1102365311662207e-06, + "loss": 0.0249, + "step": 29422 + }, + { + "epoch": 3.4890311870034387, + "grad_norm": 0.693358570366718, + "learning_rate": 2.1092714229428685e-06, + "loss": 0.0311, + "step": 29423 + }, + { + "epoch": 3.489149768765564, + "grad_norm": 0.3374396538198353, + "learning_rate": 2.108306525743964e-06, + "loss": 0.014, + "step": 29424 + }, + { + "epoch": 3.4892683505276887, + "grad_norm": 0.5268943906258573, + "learning_rate": 2.1073418395784002e-06, + "loss": 0.0293, + "step": 29425 + }, + { + "epoch": 3.489386932289814, + "grad_norm": 0.47938582135188496, + "learning_rate": 2.1063773644550756e-06, + "loss": 0.0207, + "step": 29426 + }, + { + "epoch": 3.4895055140519387, + "grad_norm": 0.4103014705784838, + "learning_rate": 2.105413100382875e-06, + "loss": 0.0207, + "step": 29427 + }, + { + "epoch": 3.489624095814064, + "grad_norm": 0.437666074045083, + "learning_rate": 2.1044490473706907e-06, + "loss": 0.0164, + "step": 29428 + }, + { + "epoch": 3.4897426775761886, + "grad_norm": 0.47971578784816044, + "learning_rate": 2.103485205427408e-06, + "loss": 0.0143, + "step": 29429 + }, + { + "epoch": 3.489861259338314, + "grad_norm": 0.5741578574749671, + "learning_rate": 2.1025215745619194e-06, + "loss": 0.0296, + "step": 29430 + }, + { + "epoch": 3.4899798411004386, + "grad_norm": 0.392588591608724, + "learning_rate": 2.1015581547830955e-06, + "loss": 0.0142, + "step": 29431 + }, + { + "epoch": 3.4900984228625638, + "grad_norm": 0.6110633421997878, + "learning_rate": 2.100594946099832e-06, + "loss": 0.0292, + "step": 29432 + }, + { + "epoch": 3.4902170046246885, + "grad_norm": 0.3164716050968801, + "learning_rate": 2.099631948521e-06, + "loss": 0.0192, + "step": 29433 + }, + { + "epoch": 3.4903355863868137, + "grad_norm": 0.6071533982890582, + "learning_rate": 2.0986691620554778e-06, + "loss": 0.0293, + "step": 29434 + }, + { + "epoch": 3.490454168148939, + "grad_norm": 0.40696859177568284, + "learning_rate": 2.097706586712142e-06, + "loss": 0.0215, + "step": 29435 + }, + { + "epoch": 3.4905727499110637, + "grad_norm": 0.5438633198727599, + "learning_rate": 2.096744222499866e-06, + "loss": 0.0197, + "step": 29436 + }, + { + "epoch": 3.4906913316731885, + "grad_norm": 0.5414904931906812, + "learning_rate": 2.0957820694275293e-06, + "loss": 0.0292, + "step": 29437 + }, + { + "epoch": 3.4908099134353137, + "grad_norm": 0.5336601853069579, + "learning_rate": 2.094820127503988e-06, + "loss": 0.0232, + "step": 29438 + }, + { + "epoch": 3.490928495197439, + "grad_norm": 0.5669269086458006, + "learning_rate": 2.0938583967381187e-06, + "loss": 0.0385, + "step": 29439 + }, + { + "epoch": 3.4910470769595636, + "grad_norm": 0.5598989618008455, + "learning_rate": 2.0928968771387862e-06, + "loss": 0.0237, + "step": 29440 + }, + { + "epoch": 3.4911656587216884, + "grad_norm": 0.42248233711197236, + "learning_rate": 2.091935568714856e-06, + "loss": 0.027, + "step": 29441 + }, + { + "epoch": 3.4912842404838136, + "grad_norm": 0.423111939223382, + "learning_rate": 2.0909744714751823e-06, + "loss": 0.0206, + "step": 29442 + }, + { + "epoch": 3.491402822245939, + "grad_norm": 0.36964562740379975, + "learning_rate": 2.090013585428638e-06, + "loss": 0.0172, + "step": 29443 + }, + { + "epoch": 3.4915214040080635, + "grad_norm": 0.43629783887313606, + "learning_rate": 2.0890529105840717e-06, + "loss": 0.0251, + "step": 29444 + }, + { + "epoch": 3.4916399857701883, + "grad_norm": 0.5571094983362268, + "learning_rate": 2.0880924469503404e-06, + "loss": 0.0209, + "step": 29445 + }, + { + "epoch": 3.4917585675323135, + "grad_norm": 0.6135165539668231, + "learning_rate": 2.087132194536301e-06, + "loss": 0.0315, + "step": 29446 + }, + { + "epoch": 3.4918771492944387, + "grad_norm": 0.3426728604592107, + "learning_rate": 2.0861721533508043e-06, + "loss": 0.0142, + "step": 29447 + }, + { + "epoch": 3.4919957310565635, + "grad_norm": 0.4749896754294779, + "learning_rate": 2.0852123234027077e-06, + "loss": 0.0226, + "step": 29448 + }, + { + "epoch": 3.4921143128186887, + "grad_norm": 0.28633173042339977, + "learning_rate": 2.0842527047008482e-06, + "loss": 0.0113, + "step": 29449 + }, + { + "epoch": 3.4922328945808134, + "grad_norm": 0.4113974975876739, + "learning_rate": 2.083293297254077e-06, + "loss": 0.0198, + "step": 29450 + }, + { + "epoch": 3.4923514763429386, + "grad_norm": 0.4452204640179377, + "learning_rate": 2.08233410107124e-06, + "loss": 0.0156, + "step": 29451 + }, + { + "epoch": 3.4924700581050634, + "grad_norm": 0.6686726501295411, + "learning_rate": 2.0813751161611828e-06, + "loss": 0.0337, + "step": 29452 + }, + { + "epoch": 3.4925886398671886, + "grad_norm": 0.4534851446297752, + "learning_rate": 2.0804163425327374e-06, + "loss": 0.0212, + "step": 29453 + }, + { + "epoch": 3.4927072216293134, + "grad_norm": 0.33045743022122853, + "learning_rate": 2.079457780194749e-06, + "loss": 0.0167, + "step": 29454 + }, + { + "epoch": 3.4928258033914386, + "grad_norm": 0.3903844207319644, + "learning_rate": 2.0784994291560527e-06, + "loss": 0.0208, + "step": 29455 + }, + { + "epoch": 3.4929443851535633, + "grad_norm": 0.5807085152143989, + "learning_rate": 2.077541289425483e-06, + "loss": 0.0211, + "step": 29456 + }, + { + "epoch": 3.4930629669156885, + "grad_norm": 0.6691980434924414, + "learning_rate": 2.076583361011872e-06, + "loss": 0.0361, + "step": 29457 + }, + { + "epoch": 3.4931815486778133, + "grad_norm": 0.6510748914266865, + "learning_rate": 2.0756256439240533e-06, + "loss": 0.0272, + "step": 29458 + }, + { + "epoch": 3.4933001304399385, + "grad_norm": 0.7112301769684595, + "learning_rate": 2.0746681381708597e-06, + "loss": 0.028, + "step": 29459 + }, + { + "epoch": 3.4934187122020632, + "grad_norm": 0.319235308813987, + "learning_rate": 2.0737108437611063e-06, + "loss": 0.0179, + "step": 29460 + }, + { + "epoch": 3.4935372939641884, + "grad_norm": 0.8241387854296641, + "learning_rate": 2.0727537607036276e-06, + "loss": 0.0363, + "step": 29461 + }, + { + "epoch": 3.493655875726313, + "grad_norm": 0.6320428515343267, + "learning_rate": 2.0717968890072407e-06, + "loss": 0.029, + "step": 29462 + }, + { + "epoch": 3.4937744574884384, + "grad_norm": 1.1063651299355879, + "learning_rate": 2.070840228680773e-06, + "loss": 0.0539, + "step": 29463 + }, + { + "epoch": 3.493893039250563, + "grad_norm": 0.5866326419535265, + "learning_rate": 2.0698837797330422e-06, + "loss": 0.0303, + "step": 29464 + }, + { + "epoch": 3.4940116210126884, + "grad_norm": 0.8312682670730789, + "learning_rate": 2.0689275421728578e-06, + "loss": 0.0526, + "step": 29465 + }, + { + "epoch": 3.494130202774813, + "grad_norm": 0.4566677319338217, + "learning_rate": 2.0679715160090513e-06, + "loss": 0.0224, + "step": 29466 + }, + { + "epoch": 3.4942487845369383, + "grad_norm": 0.4417295397761817, + "learning_rate": 2.067015701250419e-06, + "loss": 0.0267, + "step": 29467 + }, + { + "epoch": 3.494367366299063, + "grad_norm": 0.36565664900608946, + "learning_rate": 2.0660600979057865e-06, + "loss": 0.0188, + "step": 29468 + }, + { + "epoch": 3.4944859480611883, + "grad_norm": 0.39428458353256207, + "learning_rate": 2.0651047059839473e-06, + "loss": 0.0201, + "step": 29469 + }, + { + "epoch": 3.494604529823313, + "grad_norm": 0.5221801586768795, + "learning_rate": 2.0641495254937247e-06, + "loss": 0.0216, + "step": 29470 + }, + { + "epoch": 3.4947231115854382, + "grad_norm": 0.5320315848797146, + "learning_rate": 2.063194556443915e-06, + "loss": 0.0315, + "step": 29471 + }, + { + "epoch": 3.494841693347563, + "grad_norm": 0.4099389845805603, + "learning_rate": 2.062239798843324e-06, + "loss": 0.0195, + "step": 29472 + }, + { + "epoch": 3.494960275109688, + "grad_norm": 0.3726984440852311, + "learning_rate": 2.0612852527007537e-06, + "loss": 0.0145, + "step": 29473 + }, + { + "epoch": 3.495078856871813, + "grad_norm": 0.5149011735431152, + "learning_rate": 2.0603309180250054e-06, + "loss": 0.0219, + "step": 29474 + }, + { + "epoch": 3.495197438633938, + "grad_norm": 0.46559487516153697, + "learning_rate": 2.05937679482488e-06, + "loss": 0.0177, + "step": 29475 + }, + { + "epoch": 3.495316020396063, + "grad_norm": 0.5863530920399637, + "learning_rate": 2.058422883109157e-06, + "loss": 0.021, + "step": 29476 + }, + { + "epoch": 3.495434602158188, + "grad_norm": 0.5913666323265289, + "learning_rate": 2.0574691828866538e-06, + "loss": 0.0272, + "step": 29477 + }, + { + "epoch": 3.495553183920313, + "grad_norm": 0.3429728647376966, + "learning_rate": 2.0565156941661443e-06, + "loss": 0.0141, + "step": 29478 + }, + { + "epoch": 3.495671765682438, + "grad_norm": 0.5756306772277019, + "learning_rate": 2.055562416956433e-06, + "loss": 0.0266, + "step": 29479 + }, + { + "epoch": 3.495790347444563, + "grad_norm": 0.6189393203170792, + "learning_rate": 2.054609351266287e-06, + "loss": 0.0226, + "step": 29480 + }, + { + "epoch": 3.495908929206688, + "grad_norm": 0.5850588495805122, + "learning_rate": 2.053656497104517e-06, + "loss": 0.0278, + "step": 29481 + }, + { + "epoch": 3.496027510968813, + "grad_norm": 0.37492915065561, + "learning_rate": 2.05270385447989e-06, + "loss": 0.0174, + "step": 29482 + }, + { + "epoch": 3.496146092730938, + "grad_norm": 0.3579702787009843, + "learning_rate": 2.051751423401194e-06, + "loss": 0.0154, + "step": 29483 + }, + { + "epoch": 3.4962646744930628, + "grad_norm": 0.5093053404462596, + "learning_rate": 2.0507992038772073e-06, + "loss": 0.0265, + "step": 29484 + }, + { + "epoch": 3.496383256255188, + "grad_norm": 0.7263665222168784, + "learning_rate": 2.0498471959167125e-06, + "loss": 0.0294, + "step": 29485 + }, + { + "epoch": 3.4965018380173127, + "grad_norm": 0.38256318162623437, + "learning_rate": 2.048895399528486e-06, + "loss": 0.0131, + "step": 29486 + }, + { + "epoch": 3.496620419779438, + "grad_norm": 0.4495346321662533, + "learning_rate": 2.04794381472129e-06, + "loss": 0.0278, + "step": 29487 + }, + { + "epoch": 3.496739001541563, + "grad_norm": 0.637661883925862, + "learning_rate": 2.0469924415039144e-06, + "loss": 0.0212, + "step": 29488 + }, + { + "epoch": 3.496857583303688, + "grad_norm": 0.48202136296255393, + "learning_rate": 2.0460412798851193e-06, + "loss": 0.0185, + "step": 29489 + }, + { + "epoch": 3.4969761650658127, + "grad_norm": 0.45805472465549957, + "learning_rate": 2.0450903298736807e-06, + "loss": 0.0207, + "step": 29490 + }, + { + "epoch": 3.497094746827938, + "grad_norm": 0.7849762786868582, + "learning_rate": 2.04413959147835e-06, + "loss": 0.0327, + "step": 29491 + }, + { + "epoch": 3.497213328590063, + "grad_norm": 0.8457574087857751, + "learning_rate": 2.0431890647079094e-06, + "loss": 0.0419, + "step": 29492 + }, + { + "epoch": 3.497331910352188, + "grad_norm": 0.7742914895374151, + "learning_rate": 2.0422387495711128e-06, + "loss": 0.0398, + "step": 29493 + }, + { + "epoch": 3.4974504921143126, + "grad_norm": 0.32289992937838297, + "learning_rate": 2.0412886460767195e-06, + "loss": 0.0163, + "step": 29494 + }, + { + "epoch": 3.497569073876438, + "grad_norm": 0.5231213072175789, + "learning_rate": 2.040338754233492e-06, + "loss": 0.0212, + "step": 29495 + }, + { + "epoch": 3.497687655638563, + "grad_norm": 0.5535457446979084, + "learning_rate": 2.0393890740501853e-06, + "loss": 0.0203, + "step": 29496 + }, + { + "epoch": 3.4978062374006877, + "grad_norm": 0.5076396174238198, + "learning_rate": 2.038439605535561e-06, + "loss": 0.027, + "step": 29497 + }, + { + "epoch": 3.497924819162813, + "grad_norm": 0.6784852878684662, + "learning_rate": 2.0374903486983623e-06, + "loss": 0.0262, + "step": 29498 + }, + { + "epoch": 3.4980434009249377, + "grad_norm": 0.40428458286992836, + "learning_rate": 2.036541303547343e-06, + "loss": 0.012, + "step": 29499 + }, + { + "epoch": 3.498161982687063, + "grad_norm": 0.503255303388937, + "learning_rate": 2.035592470091252e-06, + "loss": 0.0202, + "step": 29500 + }, + { + "epoch": 3.4982805644491877, + "grad_norm": 0.638353840772891, + "learning_rate": 2.0346438483388435e-06, + "loss": 0.0311, + "step": 29501 + }, + { + "epoch": 3.498399146211313, + "grad_norm": 0.3203928027940021, + "learning_rate": 2.0336954382988466e-06, + "loss": 0.015, + "step": 29502 + }, + { + "epoch": 3.4985177279734376, + "grad_norm": 0.602906241609347, + "learning_rate": 2.0327472399800264e-06, + "loss": 0.0182, + "step": 29503 + }, + { + "epoch": 3.498636309735563, + "grad_norm": 0.5285257856548802, + "learning_rate": 2.0317992533911064e-06, + "loss": 0.0232, + "step": 29504 + }, + { + "epoch": 3.4987548914976876, + "grad_norm": 0.7629904741081432, + "learning_rate": 2.0308514785408295e-06, + "loss": 0.0371, + "step": 29505 + }, + { + "epoch": 3.498873473259813, + "grad_norm": 0.38464289273571456, + "learning_rate": 2.029903915437939e-06, + "loss": 0.0164, + "step": 29506 + }, + { + "epoch": 3.4989920550219376, + "grad_norm": 0.7309055014785212, + "learning_rate": 2.028956564091164e-06, + "loss": 0.0388, + "step": 29507 + }, + { + "epoch": 3.4991106367840628, + "grad_norm": 0.4824803273090274, + "learning_rate": 2.0280094245092474e-06, + "loss": 0.021, + "step": 29508 + }, + { + "epoch": 3.4992292185461875, + "grad_norm": 0.34802137662040433, + "learning_rate": 2.027062496700907e-06, + "loss": 0.0179, + "step": 29509 + }, + { + "epoch": 3.4993478003083127, + "grad_norm": 0.6157303150919883, + "learning_rate": 2.026115780674881e-06, + "loss": 0.0266, + "step": 29510 + }, + { + "epoch": 3.4994663820704375, + "grad_norm": 0.3378976272550531, + "learning_rate": 2.025169276439892e-06, + "loss": 0.0136, + "step": 29511 + }, + { + "epoch": 3.4995849638325627, + "grad_norm": 0.7507575892259514, + "learning_rate": 2.0242229840046726e-06, + "loss": 0.0318, + "step": 29512 + }, + { + "epoch": 3.4997035455946874, + "grad_norm": 0.6100660555194072, + "learning_rate": 2.0232769033779435e-06, + "loss": 0.0327, + "step": 29513 + }, + { + "epoch": 3.4998221273568126, + "grad_norm": 0.4226053264348614, + "learning_rate": 2.02233103456842e-06, + "loss": 0.0218, + "step": 29514 + }, + { + "epoch": 3.4999407091189374, + "grad_norm": 0.6878214759450412, + "learning_rate": 2.0213853775848335e-06, + "loss": 0.0266, + "step": 29515 + }, + { + "epoch": 3.5000592908810626, + "grad_norm": 0.6272253205852738, + "learning_rate": 2.0204399324358914e-06, + "loss": 0.0287, + "step": 29516 + }, + { + "epoch": 3.5001778726431874, + "grad_norm": 0.4613333498792484, + "learning_rate": 2.019494699130314e-06, + "loss": 0.0127, + "step": 29517 + }, + { + "epoch": 3.5002964544053126, + "grad_norm": 0.5874770327211822, + "learning_rate": 2.018549677676815e-06, + "loss": 0.0393, + "step": 29518 + }, + { + "epoch": 3.5004150361674373, + "grad_norm": 0.6190801290983952, + "learning_rate": 2.017604868084111e-06, + "loss": 0.0298, + "step": 29519 + }, + { + "epoch": 3.5005336179295625, + "grad_norm": 0.5576964103584814, + "learning_rate": 2.016660270360904e-06, + "loss": 0.0322, + "step": 29520 + }, + { + "epoch": 3.5006521996916873, + "grad_norm": 0.5891120022865568, + "learning_rate": 2.0157158845159037e-06, + "loss": 0.0269, + "step": 29521 + }, + { + "epoch": 3.5007707814538125, + "grad_norm": 0.7090971150692977, + "learning_rate": 2.0147717105578168e-06, + "loss": 0.0237, + "step": 29522 + }, + { + "epoch": 3.5008893632159372, + "grad_norm": 0.3997031040332091, + "learning_rate": 2.0138277484953503e-06, + "loss": 0.0266, + "step": 29523 + }, + { + "epoch": 3.5010079449780624, + "grad_norm": 0.42812672269971297, + "learning_rate": 2.012883998337209e-06, + "loss": 0.0191, + "step": 29524 + }, + { + "epoch": 3.501126526740187, + "grad_norm": 0.7548634090375841, + "learning_rate": 2.0119404600920767e-06, + "loss": 0.032, + "step": 29525 + }, + { + "epoch": 3.5012451085023124, + "grad_norm": 0.39529790276587495, + "learning_rate": 2.0109971337686743e-06, + "loss": 0.0171, + "step": 29526 + }, + { + "epoch": 3.501363690264437, + "grad_norm": 0.7539029737828782, + "learning_rate": 2.0100540193756843e-06, + "loss": 0.0314, + "step": 29527 + }, + { + "epoch": 3.5014822720265624, + "grad_norm": 1.2205161570483813, + "learning_rate": 2.0091111169218023e-06, + "loss": 0.0445, + "step": 29528 + }, + { + "epoch": 3.5016008537886876, + "grad_norm": 0.47981721503067704, + "learning_rate": 2.0081684264157243e-06, + "loss": 0.0191, + "step": 29529 + }, + { + "epoch": 3.5017194355508123, + "grad_norm": 0.4932517252431313, + "learning_rate": 2.0072259478661427e-06, + "loss": 0.0222, + "step": 29530 + }, + { + "epoch": 3.501838017312937, + "grad_norm": 0.6733580733831385, + "learning_rate": 2.006283681281737e-06, + "loss": 0.0353, + "step": 29531 + }, + { + "epoch": 3.5019565990750623, + "grad_norm": 0.4827700017522993, + "learning_rate": 2.0053416266712005e-06, + "loss": 0.0241, + "step": 29532 + }, + { + "epoch": 3.5020751808371875, + "grad_norm": 0.720532477161533, + "learning_rate": 2.004399784043215e-06, + "loss": 0.0382, + "step": 29533 + }, + { + "epoch": 3.5021937625993123, + "grad_norm": 0.5789961311279976, + "learning_rate": 2.003458153406465e-06, + "loss": 0.0301, + "step": 29534 + }, + { + "epoch": 3.502312344361437, + "grad_norm": 0.5715596171521957, + "learning_rate": 2.0025167347696326e-06, + "loss": 0.0246, + "step": 29535 + }, + { + "epoch": 3.502430926123562, + "grad_norm": 0.5954565362438518, + "learning_rate": 2.001575528141389e-06, + "loss": 0.028, + "step": 29536 + }, + { + "epoch": 3.5025495078856874, + "grad_norm": 0.46326454588296745, + "learning_rate": 2.0006345335304236e-06, + "loss": 0.0245, + "step": 29537 + }, + { + "epoch": 3.502668089647812, + "grad_norm": 0.5860698043915059, + "learning_rate": 1.9996937509454e-06, + "loss": 0.0217, + "step": 29538 + }, + { + "epoch": 3.502786671409937, + "grad_norm": 0.715873157043195, + "learning_rate": 1.9987531803949992e-06, + "loss": 0.0373, + "step": 29539 + }, + { + "epoch": 3.502905253172062, + "grad_norm": 0.5633231596290524, + "learning_rate": 1.9978128218878815e-06, + "loss": 0.0234, + "step": 29540 + }, + { + "epoch": 3.5030238349341873, + "grad_norm": 0.3674469635955456, + "learning_rate": 1.9968726754327288e-06, + "loss": 0.0164, + "step": 29541 + }, + { + "epoch": 3.503142416696312, + "grad_norm": 0.3646268393548582, + "learning_rate": 1.9959327410381977e-06, + "loss": 0.0149, + "step": 29542 + }, + { + "epoch": 3.503260998458437, + "grad_norm": 0.503339453024869, + "learning_rate": 1.9949930187129568e-06, + "loss": 0.0222, + "step": 29543 + }, + { + "epoch": 3.503379580220562, + "grad_norm": 0.4892730351912566, + "learning_rate": 1.9940535084656708e-06, + "loss": 0.0216, + "step": 29544 + }, + { + "epoch": 3.5034981619826873, + "grad_norm": 0.4281489214260765, + "learning_rate": 1.9931142103049997e-06, + "loss": 0.0179, + "step": 29545 + }, + { + "epoch": 3.503616743744812, + "grad_norm": 0.607353142751209, + "learning_rate": 1.9921751242396063e-06, + "loss": 0.0262, + "step": 29546 + }, + { + "epoch": 3.503735325506937, + "grad_norm": 0.5866833244617999, + "learning_rate": 1.991236250278136e-06, + "loss": 0.04, + "step": 29547 + }, + { + "epoch": 3.503853907269062, + "grad_norm": 0.7682837239099337, + "learning_rate": 1.990297588429263e-06, + "loss": 0.0386, + "step": 29548 + }, + { + "epoch": 3.503972489031187, + "grad_norm": 0.7781459304149279, + "learning_rate": 1.9893591387016213e-06, + "loss": 0.0364, + "step": 29549 + }, + { + "epoch": 3.504091070793312, + "grad_norm": 0.6226874703786452, + "learning_rate": 1.9884209011038795e-06, + "loss": 0.0261, + "step": 29550 + }, + { + "epoch": 3.5042096525554367, + "grad_norm": 0.4867669916934617, + "learning_rate": 1.987482875644667e-06, + "loss": 0.0294, + "step": 29551 + }, + { + "epoch": 3.504328234317562, + "grad_norm": 0.5464274201567452, + "learning_rate": 1.9865450623326507e-06, + "loss": 0.0293, + "step": 29552 + }, + { + "epoch": 3.504446816079687, + "grad_norm": 0.49084904363394993, + "learning_rate": 1.985607461176467e-06, + "loss": 0.0212, + "step": 29553 + }, + { + "epoch": 3.504565397841812, + "grad_norm": 0.7082671793139824, + "learning_rate": 1.984670072184758e-06, + "loss": 0.0369, + "step": 29554 + }, + { + "epoch": 3.504683979603937, + "grad_norm": 0.6461377912177296, + "learning_rate": 1.9837328953661666e-06, + "loss": 0.0281, + "step": 29555 + }, + { + "epoch": 3.504802561366062, + "grad_norm": 0.5399045413081784, + "learning_rate": 1.9827959307293336e-06, + "loss": 0.0241, + "step": 29556 + }, + { + "epoch": 3.504921143128187, + "grad_norm": 0.8801373431753775, + "learning_rate": 1.981859178282902e-06, + "loss": 0.0383, + "step": 29557 + }, + { + "epoch": 3.505039724890312, + "grad_norm": 0.46756890164341003, + "learning_rate": 1.980922638035493e-06, + "loss": 0.0286, + "step": 29558 + }, + { + "epoch": 3.505158306652437, + "grad_norm": 0.5091476964623408, + "learning_rate": 1.97998630999576e-06, + "loss": 0.0229, + "step": 29559 + }, + { + "epoch": 3.5052768884145618, + "grad_norm": 0.5609570215489693, + "learning_rate": 1.979050194172319e-06, + "loss": 0.0255, + "step": 29560 + }, + { + "epoch": 3.505395470176687, + "grad_norm": 0.6318271181938978, + "learning_rate": 1.978114290573807e-06, + "loss": 0.0311, + "step": 29561 + }, + { + "epoch": 3.5055140519388117, + "grad_norm": 0.34770459840088874, + "learning_rate": 1.9771785992088426e-06, + "loss": 0.0167, + "step": 29562 + }, + { + "epoch": 3.505632633700937, + "grad_norm": 0.5357617660345239, + "learning_rate": 1.9762431200860687e-06, + "loss": 0.025, + "step": 29563 + }, + { + "epoch": 3.5057512154630617, + "grad_norm": 0.5311184938876035, + "learning_rate": 1.975307853214095e-06, + "loss": 0.0275, + "step": 29564 + }, + { + "epoch": 3.505869797225187, + "grad_norm": 0.5150242119693037, + "learning_rate": 1.9743727986015483e-06, + "loss": 0.0227, + "step": 29565 + }, + { + "epoch": 3.5059883789873116, + "grad_norm": 0.4717521701665454, + "learning_rate": 1.9734379562570487e-06, + "loss": 0.0149, + "step": 29566 + }, + { + "epoch": 3.506106960749437, + "grad_norm": 0.5927171395805009, + "learning_rate": 1.9725033261892125e-06, + "loss": 0.0285, + "step": 29567 + }, + { + "epoch": 3.5062255425115616, + "grad_norm": 0.4382707715034032, + "learning_rate": 1.9715689084066624e-06, + "loss": 0.016, + "step": 29568 + }, + { + "epoch": 3.506344124273687, + "grad_norm": 0.48182536842137463, + "learning_rate": 1.9706347029180034e-06, + "loss": 0.0161, + "step": 29569 + }, + { + "epoch": 3.5064627060358116, + "grad_norm": 0.6055452522770357, + "learning_rate": 1.9697007097318533e-06, + "loss": 0.0346, + "step": 29570 + }, + { + "epoch": 3.5065812877979368, + "grad_norm": 0.9652197753369854, + "learning_rate": 1.9687669288568187e-06, + "loss": 0.0376, + "step": 29571 + }, + { + "epoch": 3.5066998695600615, + "grad_norm": 0.427891223809996, + "learning_rate": 1.967833360301513e-06, + "loss": 0.0202, + "step": 29572 + }, + { + "epoch": 3.5068184513221867, + "grad_norm": 0.5053811700893635, + "learning_rate": 1.9669000040745374e-06, + "loss": 0.0237, + "step": 29573 + }, + { + "epoch": 3.5069370330843115, + "grad_norm": 0.32842386821874886, + "learning_rate": 1.965966860184498e-06, + "loss": 0.0149, + "step": 29574 + }, + { + "epoch": 3.5070556148464367, + "grad_norm": 0.4536251696529942, + "learning_rate": 1.965033928640006e-06, + "loss": 0.0162, + "step": 29575 + }, + { + "epoch": 3.5071741966085614, + "grad_norm": 0.4667117660383834, + "learning_rate": 1.964101209449648e-06, + "loss": 0.0212, + "step": 29576 + }, + { + "epoch": 3.5072927783706866, + "grad_norm": 0.6090268623289764, + "learning_rate": 1.9631687026220257e-06, + "loss": 0.0234, + "step": 29577 + }, + { + "epoch": 3.507411360132812, + "grad_norm": 0.5199920335126424, + "learning_rate": 1.9622364081657406e-06, + "loss": 0.0194, + "step": 29578 + }, + { + "epoch": 3.5075299418949366, + "grad_norm": 0.7373765348361074, + "learning_rate": 1.9613043260893886e-06, + "loss": 0.041, + "step": 29579 + }, + { + "epoch": 3.5076485236570614, + "grad_norm": 0.5214237100576595, + "learning_rate": 1.9603724564015575e-06, + "loss": 0.0291, + "step": 29580 + }, + { + "epoch": 3.5077671054191866, + "grad_norm": 0.4413548614710275, + "learning_rate": 1.959440799110837e-06, + "loss": 0.0259, + "step": 29581 + }, + { + "epoch": 3.5078856871813118, + "grad_norm": 0.5335800402767409, + "learning_rate": 1.9585093542258154e-06, + "loss": 0.0208, + "step": 29582 + }, + { + "epoch": 3.5080042689434365, + "grad_norm": 0.36509116342251013, + "learning_rate": 1.9575781217550852e-06, + "loss": 0.0139, + "step": 29583 + }, + { + "epoch": 3.5081228507055613, + "grad_norm": 0.932930387388467, + "learning_rate": 1.956647101707232e-06, + "loss": 0.0458, + "step": 29584 + }, + { + "epoch": 3.5082414324676865, + "grad_norm": 0.6739914321156154, + "learning_rate": 1.9557162940908284e-06, + "loss": 0.0311, + "step": 29585 + }, + { + "epoch": 3.5083600142298117, + "grad_norm": 0.7134146963673423, + "learning_rate": 1.9547856989144685e-06, + "loss": 0.0396, + "step": 29586 + }, + { + "epoch": 3.5084785959919365, + "grad_norm": 0.3704251757078723, + "learning_rate": 1.95385531618672e-06, + "loss": 0.0219, + "step": 29587 + }, + { + "epoch": 3.508597177754061, + "grad_norm": 0.7519129722716738, + "learning_rate": 1.9529251459161653e-06, + "loss": 0.035, + "step": 29588 + }, + { + "epoch": 3.5087157595161864, + "grad_norm": 0.5835186728282883, + "learning_rate": 1.9519951881113772e-06, + "loss": 0.0228, + "step": 29589 + }, + { + "epoch": 3.5088343412783116, + "grad_norm": 0.7163742166854814, + "learning_rate": 1.951065442780933e-06, + "loss": 0.0276, + "step": 29590 + }, + { + "epoch": 3.5089529230404364, + "grad_norm": 0.5672426472282756, + "learning_rate": 1.950135909933401e-06, + "loss": 0.0221, + "step": 29591 + }, + { + "epoch": 3.509071504802561, + "grad_norm": 0.4806240372953579, + "learning_rate": 1.949206589577346e-06, + "loss": 0.0267, + "step": 29592 + }, + { + "epoch": 3.5091900865646863, + "grad_norm": 0.3769800322390542, + "learning_rate": 1.9482774817213417e-06, + "loss": 0.0182, + "step": 29593 + }, + { + "epoch": 3.5093086683268115, + "grad_norm": 0.38709736770845776, + "learning_rate": 1.947348586373951e-06, + "loss": 0.0204, + "step": 29594 + }, + { + "epoch": 3.5094272500889363, + "grad_norm": 0.47183871844565234, + "learning_rate": 1.946419903543742e-06, + "loss": 0.0217, + "step": 29595 + }, + { + "epoch": 3.509545831851061, + "grad_norm": 0.4097210270540447, + "learning_rate": 1.945491433239263e-06, + "loss": 0.0245, + "step": 29596 + }, + { + "epoch": 3.5096644136131863, + "grad_norm": 0.510203773771718, + "learning_rate": 1.9445631754690885e-06, + "loss": 0.0185, + "step": 29597 + }, + { + "epoch": 3.5097829953753115, + "grad_norm": 0.6822582453555958, + "learning_rate": 1.9436351302417666e-06, + "loss": 0.0255, + "step": 29598 + }, + { + "epoch": 3.5099015771374362, + "grad_norm": 0.6615562811530399, + "learning_rate": 1.94270729756586e-06, + "loss": 0.0349, + "step": 29599 + }, + { + "epoch": 3.510020158899561, + "grad_norm": 0.41395718114449204, + "learning_rate": 1.9417796774499063e-06, + "loss": 0.0196, + "step": 29600 + }, + { + "epoch": 3.510138740661686, + "grad_norm": 0.7085347210407597, + "learning_rate": 1.9408522699024796e-06, + "loss": 0.0372, + "step": 29601 + }, + { + "epoch": 3.5102573224238114, + "grad_norm": 0.6743141013078395, + "learning_rate": 1.9399250749321114e-06, + "loss": 0.0299, + "step": 29602 + }, + { + "epoch": 3.510375904185936, + "grad_norm": 0.7320010288126069, + "learning_rate": 1.938998092547356e-06, + "loss": 0.0301, + "step": 29603 + }, + { + "epoch": 3.5104944859480613, + "grad_norm": 0.431532223406101, + "learning_rate": 1.9380713227567596e-06, + "loss": 0.0182, + "step": 29604 + }, + { + "epoch": 3.510613067710186, + "grad_norm": 0.6599875789207439, + "learning_rate": 1.9371447655688658e-06, + "loss": 0.0378, + "step": 29605 + }, + { + "epoch": 3.5107316494723113, + "grad_norm": 0.4092663005973759, + "learning_rate": 1.936218420992217e-06, + "loss": 0.0199, + "step": 29606 + }, + { + "epoch": 3.510850231234436, + "grad_norm": 0.48358169861865946, + "learning_rate": 1.935292289035345e-06, + "loss": 0.0226, + "step": 29607 + }, + { + "epoch": 3.5109688129965613, + "grad_norm": 0.5220249872718978, + "learning_rate": 1.9343663697068053e-06, + "loss": 0.032, + "step": 29608 + }, + { + "epoch": 3.511087394758686, + "grad_norm": 0.45702503859636795, + "learning_rate": 1.933440663015115e-06, + "loss": 0.0197, + "step": 29609 + }, + { + "epoch": 3.5112059765208112, + "grad_norm": 0.7306646169252282, + "learning_rate": 1.9325151689688207e-06, + "loss": 0.0383, + "step": 29610 + }, + { + "epoch": 3.511324558282936, + "grad_norm": 0.7744797342008439, + "learning_rate": 1.9315898875764433e-06, + "loss": 0.028, + "step": 29611 + }, + { + "epoch": 3.511443140045061, + "grad_norm": 0.30991037028910856, + "learning_rate": 1.9306648188465252e-06, + "loss": 0.0119, + "step": 29612 + }, + { + "epoch": 3.511561721807186, + "grad_norm": 0.5934865067352479, + "learning_rate": 1.929739962787583e-06, + "loss": 0.0283, + "step": 29613 + }, + { + "epoch": 3.511680303569311, + "grad_norm": 0.5957070153333013, + "learning_rate": 1.9288153194081503e-06, + "loss": 0.033, + "step": 29614 + }, + { + "epoch": 3.511798885331436, + "grad_norm": 0.5821952271711831, + "learning_rate": 1.9278908887167463e-06, + "loss": 0.0332, + "step": 29615 + }, + { + "epoch": 3.511917467093561, + "grad_norm": 0.5871446202841408, + "learning_rate": 1.926966670721897e-06, + "loss": 0.0248, + "step": 29616 + }, + { + "epoch": 3.512036048855686, + "grad_norm": 0.46695794701868576, + "learning_rate": 1.9260426654321235e-06, + "loss": 0.0195, + "step": 29617 + }, + { + "epoch": 3.512154630617811, + "grad_norm": 0.6863742796871579, + "learning_rate": 1.9251188728559326e-06, + "loss": 0.036, + "step": 29618 + }, + { + "epoch": 3.512273212379936, + "grad_norm": 0.40636269765334687, + "learning_rate": 1.9241952930018596e-06, + "loss": 0.022, + "step": 29619 + }, + { + "epoch": 3.512391794142061, + "grad_norm": 0.5423927164089518, + "learning_rate": 1.9232719258784032e-06, + "loss": 0.0288, + "step": 29620 + }, + { + "epoch": 3.512510375904186, + "grad_norm": 0.781421555757999, + "learning_rate": 1.9223487714940843e-06, + "loss": 0.0385, + "step": 29621 + }, + { + "epoch": 3.512628957666311, + "grad_norm": 0.8883713286810705, + "learning_rate": 1.921425829857407e-06, + "loss": 0.0516, + "step": 29622 + }, + { + "epoch": 3.5127475394284358, + "grad_norm": 0.44150381555695517, + "learning_rate": 1.920503100976881e-06, + "loss": 0.0216, + "step": 29623 + }, + { + "epoch": 3.512866121190561, + "grad_norm": 0.5427999058762435, + "learning_rate": 1.9195805848610222e-06, + "loss": 0.0257, + "step": 29624 + }, + { + "epoch": 3.5129847029526857, + "grad_norm": 0.544656027787927, + "learning_rate": 1.9186582815183208e-06, + "loss": 0.0256, + "step": 29625 + }, + { + "epoch": 3.513103284714811, + "grad_norm": 0.44278500863808384, + "learning_rate": 1.9177361909572866e-06, + "loss": 0.0245, + "step": 29626 + }, + { + "epoch": 3.5132218664769357, + "grad_norm": 0.27737628181903967, + "learning_rate": 1.9168143131864215e-06, + "loss": 0.0153, + "step": 29627 + }, + { + "epoch": 3.513340448239061, + "grad_norm": 0.6136770095971343, + "learning_rate": 1.9158926482142237e-06, + "loss": 0.0309, + "step": 29628 + }, + { + "epoch": 3.5134590300011856, + "grad_norm": 0.3621820019466071, + "learning_rate": 1.9149711960491813e-06, + "loss": 0.0213, + "step": 29629 + }, + { + "epoch": 3.513577611763311, + "grad_norm": 0.5967470897335201, + "learning_rate": 1.914049956699801e-06, + "loss": 0.028, + "step": 29630 + }, + { + "epoch": 3.513696193525436, + "grad_norm": 0.507774968902957, + "learning_rate": 1.913128930174568e-06, + "loss": 0.0209, + "step": 29631 + }, + { + "epoch": 3.513814775287561, + "grad_norm": 0.599974932094328, + "learning_rate": 1.9122081164819756e-06, + "loss": 0.0284, + "step": 29632 + }, + { + "epoch": 3.5139333570496856, + "grad_norm": 0.5939036281796418, + "learning_rate": 1.911287515630514e-06, + "loss": 0.0249, + "step": 29633 + }, + { + "epoch": 3.5140519388118108, + "grad_norm": 0.3481040792208887, + "learning_rate": 1.910367127628665e-06, + "loss": 0.0161, + "step": 29634 + }, + { + "epoch": 3.514170520573936, + "grad_norm": 0.3774997390319046, + "learning_rate": 1.9094469524849225e-06, + "loss": 0.0211, + "step": 29635 + }, + { + "epoch": 3.5142891023360607, + "grad_norm": 0.4000016677201124, + "learning_rate": 1.9085269902077597e-06, + "loss": 0.0161, + "step": 29636 + }, + { + "epoch": 3.5144076840981855, + "grad_norm": 0.5054983592089364, + "learning_rate": 1.907607240805659e-06, + "loss": 0.0243, + "step": 29637 + }, + { + "epoch": 3.5145262658603107, + "grad_norm": 0.4283824433350457, + "learning_rate": 1.9066877042871052e-06, + "loss": 0.0265, + "step": 29638 + }, + { + "epoch": 3.514644847622436, + "grad_norm": 0.44753561069757386, + "learning_rate": 1.9057683806605746e-06, + "loss": 0.0175, + "step": 29639 + }, + { + "epoch": 3.5147634293845607, + "grad_norm": 0.32287057446611794, + "learning_rate": 1.9048492699345332e-06, + "loss": 0.0152, + "step": 29640 + }, + { + "epoch": 3.5148820111466854, + "grad_norm": 1.0516437305164756, + "learning_rate": 1.9039303721174629e-06, + "loss": 0.0438, + "step": 29641 + }, + { + "epoch": 3.5150005929088106, + "grad_norm": 0.3504618090520581, + "learning_rate": 1.9030116872178316e-06, + "loss": 0.0125, + "step": 29642 + }, + { + "epoch": 3.515119174670936, + "grad_norm": 0.3803829647724266, + "learning_rate": 1.9020932152441107e-06, + "loss": 0.0175, + "step": 29643 + }, + { + "epoch": 3.5152377564330606, + "grad_norm": 0.49416538229672813, + "learning_rate": 1.9011749562047627e-06, + "loss": 0.0328, + "step": 29644 + }, + { + "epoch": 3.5153563381951853, + "grad_norm": 0.44376221604637317, + "learning_rate": 1.9002569101082584e-06, + "loss": 0.0227, + "step": 29645 + }, + { + "epoch": 3.5154749199573105, + "grad_norm": 0.7082606163596536, + "learning_rate": 1.8993390769630636e-06, + "loss": 0.0306, + "step": 29646 + }, + { + "epoch": 3.5155935017194357, + "grad_norm": 0.8558929614903185, + "learning_rate": 1.898421456777627e-06, + "loss": 0.0298, + "step": 29647 + }, + { + "epoch": 3.5157120834815605, + "grad_norm": 0.45894099323366055, + "learning_rate": 1.897504049560417e-06, + "loss": 0.0221, + "step": 29648 + }, + { + "epoch": 3.5158306652436853, + "grad_norm": 0.43016147091776114, + "learning_rate": 1.89658685531989e-06, + "loss": 0.022, + "step": 29649 + }, + { + "epoch": 3.5159492470058105, + "grad_norm": 0.3181326198213371, + "learning_rate": 1.8956698740645067e-06, + "loss": 0.0214, + "step": 29650 + }, + { + "epoch": 3.5160678287679357, + "grad_norm": 0.6281937526608898, + "learning_rate": 1.8947531058027074e-06, + "loss": 0.0294, + "step": 29651 + }, + { + "epoch": 3.5161864105300604, + "grad_norm": 0.498828565561426, + "learning_rate": 1.8938365505429545e-06, + "loss": 0.02, + "step": 29652 + }, + { + "epoch": 3.5163049922921856, + "grad_norm": 0.46505471297673684, + "learning_rate": 1.8929202082936915e-06, + "loss": 0.0223, + "step": 29653 + }, + { + "epoch": 3.5164235740543104, + "grad_norm": 0.3321871596213565, + "learning_rate": 1.8920040790633669e-06, + "loss": 0.0111, + "step": 29654 + }, + { + "epoch": 3.5165421558164356, + "grad_norm": 0.555140228770797, + "learning_rate": 1.8910881628604354e-06, + "loss": 0.0264, + "step": 29655 + }, + { + "epoch": 3.5166607375785603, + "grad_norm": 0.8613957217933224, + "learning_rate": 1.8901724596933234e-06, + "loss": 0.0372, + "step": 29656 + }, + { + "epoch": 3.5167793193406856, + "grad_norm": 0.731112582975474, + "learning_rate": 1.889256969570491e-06, + "loss": 0.0347, + "step": 29657 + }, + { + "epoch": 3.5168979011028103, + "grad_norm": 0.7422720290959284, + "learning_rate": 1.8883416925003617e-06, + "loss": 0.0428, + "step": 29658 + }, + { + "epoch": 3.5170164828649355, + "grad_norm": 0.4303124218604529, + "learning_rate": 1.8874266284913843e-06, + "loss": 0.0208, + "step": 29659 + }, + { + "epoch": 3.5171350646270603, + "grad_norm": 0.34520177925465506, + "learning_rate": 1.8865117775519886e-06, + "loss": 0.0149, + "step": 29660 + }, + { + "epoch": 3.5172536463891855, + "grad_norm": 0.3747786656637705, + "learning_rate": 1.8855971396906174e-06, + "loss": 0.0193, + "step": 29661 + }, + { + "epoch": 3.5173722281513102, + "grad_norm": 0.5074548829955556, + "learning_rate": 1.8846827149156893e-06, + "loss": 0.0203, + "step": 29662 + }, + { + "epoch": 3.5174908099134354, + "grad_norm": 0.7400346534485179, + "learning_rate": 1.8837685032356417e-06, + "loss": 0.0451, + "step": 29663 + }, + { + "epoch": 3.51760939167556, + "grad_norm": 0.8536580836533645, + "learning_rate": 1.8828545046589014e-06, + "loss": 0.0347, + "step": 29664 + }, + { + "epoch": 3.5177279734376854, + "grad_norm": 0.9127146000370324, + "learning_rate": 1.8819407191938949e-06, + "loss": 0.0461, + "step": 29665 + }, + { + "epoch": 3.51784655519981, + "grad_norm": 0.5412433263276498, + "learning_rate": 1.8810271468490488e-06, + "loss": 0.0259, + "step": 29666 + }, + { + "epoch": 3.5179651369619354, + "grad_norm": 0.5767836569551241, + "learning_rate": 1.8801137876327757e-06, + "loss": 0.0243, + "step": 29667 + }, + { + "epoch": 3.51808371872406, + "grad_norm": 0.8147106222775594, + "learning_rate": 1.879200641553508e-06, + "loss": 0.0398, + "step": 29668 + }, + { + "epoch": 3.5182023004861853, + "grad_norm": 0.41246234206159027, + "learning_rate": 1.8782877086196554e-06, + "loss": 0.0189, + "step": 29669 + }, + { + "epoch": 3.51832088224831, + "grad_norm": 0.41626186213020283, + "learning_rate": 1.8773749888396336e-06, + "loss": 0.0186, + "step": 29670 + }, + { + "epoch": 3.5184394640104353, + "grad_norm": 0.6132295157455471, + "learning_rate": 1.8764624822218636e-06, + "loss": 0.0227, + "step": 29671 + }, + { + "epoch": 3.51855804577256, + "grad_norm": 0.7154469873697779, + "learning_rate": 1.8755501887747495e-06, + "loss": 0.0284, + "step": 29672 + }, + { + "epoch": 3.5186766275346852, + "grad_norm": 0.35340632141370865, + "learning_rate": 1.8746381085067127e-06, + "loss": 0.0156, + "step": 29673 + }, + { + "epoch": 3.51879520929681, + "grad_norm": 0.44925773717861406, + "learning_rate": 1.8737262414261465e-06, + "loss": 0.0237, + "step": 29674 + }, + { + "epoch": 3.518913791058935, + "grad_norm": 0.42047274746251406, + "learning_rate": 1.8728145875414715e-06, + "loss": 0.0172, + "step": 29675 + }, + { + "epoch": 3.51903237282106, + "grad_norm": 0.594638172759034, + "learning_rate": 1.8719031468610814e-06, + "loss": 0.0243, + "step": 29676 + }, + { + "epoch": 3.519150954583185, + "grad_norm": 0.6297717193241232, + "learning_rate": 1.8709919193933862e-06, + "loss": 0.0338, + "step": 29677 + }, + { + "epoch": 3.51926953634531, + "grad_norm": 0.4717516310631284, + "learning_rate": 1.870080905146776e-06, + "loss": 0.0174, + "step": 29678 + }, + { + "epoch": 3.519388118107435, + "grad_norm": 0.3432506974583966, + "learning_rate": 1.8691701041296638e-06, + "loss": 0.0111, + "step": 29679 + }, + { + "epoch": 3.5195066998695603, + "grad_norm": 0.38875552222811577, + "learning_rate": 1.8682595163504345e-06, + "loss": 0.0218, + "step": 29680 + }, + { + "epoch": 3.519625281631685, + "grad_norm": 0.5839137206955795, + "learning_rate": 1.8673491418174871e-06, + "loss": 0.0183, + "step": 29681 + }, + { + "epoch": 3.51974386339381, + "grad_norm": 0.5529411482596726, + "learning_rate": 1.8664389805392146e-06, + "loss": 0.0269, + "step": 29682 + }, + { + "epoch": 3.519862445155935, + "grad_norm": 0.9319428874280611, + "learning_rate": 1.8655290325240021e-06, + "loss": 0.0477, + "step": 29683 + }, + { + "epoch": 3.5199810269180603, + "grad_norm": 0.38838764256661396, + "learning_rate": 1.8646192977802513e-06, + "loss": 0.0159, + "step": 29684 + }, + { + "epoch": 3.520099608680185, + "grad_norm": 0.5818772020666683, + "learning_rate": 1.8637097763163303e-06, + "loss": 0.0315, + "step": 29685 + }, + { + "epoch": 3.5202181904423098, + "grad_norm": 0.37042470689580087, + "learning_rate": 1.8628004681406407e-06, + "loss": 0.0141, + "step": 29686 + }, + { + "epoch": 3.520336772204435, + "grad_norm": 0.897056078967392, + "learning_rate": 1.8618913732615539e-06, + "loss": 0.031, + "step": 29687 + }, + { + "epoch": 3.52045535396656, + "grad_norm": 0.47115445022716557, + "learning_rate": 1.8609824916874602e-06, + "loss": 0.0267, + "step": 29688 + }, + { + "epoch": 3.520573935728685, + "grad_norm": 0.5992225640650202, + "learning_rate": 1.8600738234267223e-06, + "loss": 0.0239, + "step": 29689 + }, + { + "epoch": 3.5206925174908097, + "grad_norm": 0.44771610779624055, + "learning_rate": 1.8591653684877392e-06, + "loss": 0.0232, + "step": 29690 + }, + { + "epoch": 3.520811099252935, + "grad_norm": 0.6867346275128839, + "learning_rate": 1.8582571268788678e-06, + "loss": 0.0468, + "step": 29691 + }, + { + "epoch": 3.52092968101506, + "grad_norm": 0.3671703580022963, + "learning_rate": 1.857349098608488e-06, + "loss": 0.0217, + "step": 29692 + }, + { + "epoch": 3.521048262777185, + "grad_norm": 0.34657071949819673, + "learning_rate": 1.8564412836849704e-06, + "loss": 0.0172, + "step": 29693 + }, + { + "epoch": 3.5211668445393096, + "grad_norm": 0.48540224407376853, + "learning_rate": 1.8555336821166835e-06, + "loss": 0.0226, + "step": 29694 + }, + { + "epoch": 3.521285426301435, + "grad_norm": 0.6520159605800557, + "learning_rate": 1.8546262939120012e-06, + "loss": 0.0267, + "step": 29695 + }, + { + "epoch": 3.52140400806356, + "grad_norm": 0.5376216652980128, + "learning_rate": 1.8537191190792752e-06, + "loss": 0.0253, + "step": 29696 + }, + { + "epoch": 3.521522589825685, + "grad_norm": 0.6995267668829592, + "learning_rate": 1.8528121576268736e-06, + "loss": 0.0313, + "step": 29697 + }, + { + "epoch": 3.5216411715878095, + "grad_norm": 0.5387353452381793, + "learning_rate": 1.8519054095631594e-06, + "loss": 0.0144, + "step": 29698 + }, + { + "epoch": 3.5217597533499347, + "grad_norm": 0.9250499282323328, + "learning_rate": 1.8509988748964979e-06, + "loss": 0.0475, + "step": 29699 + }, + { + "epoch": 3.52187833511206, + "grad_norm": 0.4888143653995135, + "learning_rate": 1.8500925536352299e-06, + "loss": 0.0216, + "step": 29700 + }, + { + "epoch": 3.5219969168741847, + "grad_norm": 0.5456528077285249, + "learning_rate": 1.8491864457877289e-06, + "loss": 0.0241, + "step": 29701 + }, + { + "epoch": 3.5221154986363095, + "grad_norm": 0.6685726184589093, + "learning_rate": 1.848280551362333e-06, + "loss": 0.0337, + "step": 29702 + }, + { + "epoch": 3.5222340803984347, + "grad_norm": 0.6970023529872814, + "learning_rate": 1.847374870367402e-06, + "loss": 0.0455, + "step": 29703 + }, + { + "epoch": 3.52235266216056, + "grad_norm": 0.5489197892061664, + "learning_rate": 1.8464694028112822e-06, + "loss": 0.0251, + "step": 29704 + }, + { + "epoch": 3.5224712439226846, + "grad_norm": 0.5353116764367867, + "learning_rate": 1.8455641487023224e-06, + "loss": 0.019, + "step": 29705 + }, + { + "epoch": 3.52258982568481, + "grad_norm": 0.47505751643292954, + "learning_rate": 1.844659108048874e-06, + "loss": 0.0251, + "step": 29706 + }, + { + "epoch": 3.5227084074469346, + "grad_norm": 0.7357875312882917, + "learning_rate": 1.8437542808592666e-06, + "loss": 0.0379, + "step": 29707 + }, + { + "epoch": 3.52282698920906, + "grad_norm": 0.42882625277318426, + "learning_rate": 1.8428496671418495e-06, + "loss": 0.0268, + "step": 29708 + }, + { + "epoch": 3.5229455709711845, + "grad_norm": 0.25241811279585646, + "learning_rate": 1.8419452669049598e-06, + "loss": 0.008, + "step": 29709 + }, + { + "epoch": 3.5230641527333098, + "grad_norm": 0.4006458156053699, + "learning_rate": 1.8410410801569412e-06, + "loss": 0.0175, + "step": 29710 + }, + { + "epoch": 3.5231827344954345, + "grad_norm": 0.630975163255042, + "learning_rate": 1.8401371069061202e-06, + "loss": 0.0266, + "step": 29711 + }, + { + "epoch": 3.5233013162575597, + "grad_norm": 0.31771555545763186, + "learning_rate": 1.8392333471608347e-06, + "loss": 0.015, + "step": 29712 + }, + { + "epoch": 3.5234198980196845, + "grad_norm": 0.6291049474267222, + "learning_rate": 1.838329800929417e-06, + "loss": 0.0294, + "step": 29713 + }, + { + "epoch": 3.5235384797818097, + "grad_norm": 0.32485782197760504, + "learning_rate": 1.8374264682201964e-06, + "loss": 0.0164, + "step": 29714 + }, + { + "epoch": 3.5236570615439344, + "grad_norm": 0.6103802782541219, + "learning_rate": 1.8365233490414995e-06, + "loss": 0.0247, + "step": 29715 + }, + { + "epoch": 3.5237756433060596, + "grad_norm": 0.6548619220076796, + "learning_rate": 1.8356204434016505e-06, + "loss": 0.0313, + "step": 29716 + }, + { + "epoch": 3.5238942250681844, + "grad_norm": 0.4996384464336699, + "learning_rate": 1.8347177513089814e-06, + "loss": 0.0196, + "step": 29717 + }, + { + "epoch": 3.5240128068303096, + "grad_norm": 0.6585623012661895, + "learning_rate": 1.8338152727718023e-06, + "loss": 0.0247, + "step": 29718 + }, + { + "epoch": 3.5241313885924344, + "grad_norm": 0.6963795298365674, + "learning_rate": 1.8329130077984369e-06, + "loss": 0.0403, + "step": 29719 + }, + { + "epoch": 3.5242499703545596, + "grad_norm": 0.5714889052874405, + "learning_rate": 1.8320109563972065e-06, + "loss": 0.0298, + "step": 29720 + }, + { + "epoch": 3.5243685521166843, + "grad_norm": 0.6195618349232952, + "learning_rate": 1.8311091185764239e-06, + "loss": 0.0264, + "step": 29721 + }, + { + "epoch": 3.5244871338788095, + "grad_norm": 0.46879577847516685, + "learning_rate": 1.8302074943444076e-06, + "loss": 0.029, + "step": 29722 + }, + { + "epoch": 3.5246057156409343, + "grad_norm": 0.5259661868476645, + "learning_rate": 1.829306083709456e-06, + "loss": 0.0209, + "step": 29723 + }, + { + "epoch": 3.5247242974030595, + "grad_norm": 0.9055345916563335, + "learning_rate": 1.8284048866798992e-06, + "loss": 0.0383, + "step": 29724 + }, + { + "epoch": 3.5248428791651842, + "grad_norm": 0.56106685881978, + "learning_rate": 1.8275039032640273e-06, + "loss": 0.0292, + "step": 29725 + }, + { + "epoch": 3.5249614609273094, + "grad_norm": 0.7752177432515858, + "learning_rate": 1.8266031334701616e-06, + "loss": 0.0362, + "step": 29726 + }, + { + "epoch": 3.525080042689434, + "grad_norm": 0.28872964714054916, + "learning_rate": 1.8257025773065873e-06, + "loss": 0.0116, + "step": 29727 + }, + { + "epoch": 3.5251986244515594, + "grad_norm": 0.7143010570943701, + "learning_rate": 1.8248022347816252e-06, + "loss": 0.0283, + "step": 29728 + }, + { + "epoch": 3.5253172062136846, + "grad_norm": 0.529932449245664, + "learning_rate": 1.8239021059035632e-06, + "loss": 0.0261, + "step": 29729 + }, + { + "epoch": 3.5254357879758094, + "grad_norm": 0.29375156478062153, + "learning_rate": 1.8230021906807033e-06, + "loss": 0.0135, + "step": 29730 + }, + { + "epoch": 3.525554369737934, + "grad_norm": 0.3890218094661398, + "learning_rate": 1.8221024891213412e-06, + "loss": 0.0207, + "step": 29731 + }, + { + "epoch": 3.5256729515000593, + "grad_norm": 0.7284754227780927, + "learning_rate": 1.8212030012337705e-06, + "loss": 0.0224, + "step": 29732 + }, + { + "epoch": 3.5257915332621845, + "grad_norm": 0.5576624598077933, + "learning_rate": 1.8203037270262902e-06, + "loss": 0.0316, + "step": 29733 + }, + { + "epoch": 3.5259101150243093, + "grad_norm": 0.8065300353300112, + "learning_rate": 1.8194046665071767e-06, + "loss": 0.0237, + "step": 29734 + }, + { + "epoch": 3.526028696786434, + "grad_norm": 0.6892253378817784, + "learning_rate": 1.818505819684735e-06, + "loss": 0.0365, + "step": 29735 + }, + { + "epoch": 3.5261472785485592, + "grad_norm": 0.6990501324314508, + "learning_rate": 1.8176071865672385e-06, + "loss": 0.0316, + "step": 29736 + }, + { + "epoch": 3.5262658603106845, + "grad_norm": 0.37134012610537076, + "learning_rate": 1.8167087671629779e-06, + "loss": 0.0154, + "step": 29737 + }, + { + "epoch": 3.526384442072809, + "grad_norm": 0.5492653098781812, + "learning_rate": 1.8158105614802274e-06, + "loss": 0.0262, + "step": 29738 + }, + { + "epoch": 3.526503023834934, + "grad_norm": 0.7258268100105604, + "learning_rate": 1.81491256952728e-06, + "loss": 0.0347, + "step": 29739 + }, + { + "epoch": 3.526621605597059, + "grad_norm": 0.3487530175058038, + "learning_rate": 1.8140147913124017e-06, + "loss": 0.0134, + "step": 29740 + }, + { + "epoch": 3.5267401873591844, + "grad_norm": 0.5475254010171273, + "learning_rate": 1.8131172268438774e-06, + "loss": 0.0353, + "step": 29741 + }, + { + "epoch": 3.526858769121309, + "grad_norm": 0.4152964105190126, + "learning_rate": 1.812219876129978e-06, + "loss": 0.0171, + "step": 29742 + }, + { + "epoch": 3.526977350883434, + "grad_norm": 0.43576758401708504, + "learning_rate": 1.811322739178975e-06, + "loss": 0.0203, + "step": 29743 + }, + { + "epoch": 3.527095932645559, + "grad_norm": 0.5038076982810951, + "learning_rate": 1.810425815999145e-06, + "loss": 0.0185, + "step": 29744 + }, + { + "epoch": 3.5272145144076843, + "grad_norm": 0.4616487735042255, + "learning_rate": 1.8095291065987453e-06, + "loss": 0.0168, + "step": 29745 + }, + { + "epoch": 3.527333096169809, + "grad_norm": 0.5776199886167062, + "learning_rate": 1.8086326109860553e-06, + "loss": 0.0278, + "step": 29746 + }, + { + "epoch": 3.527451677931934, + "grad_norm": 0.4409578798158197, + "learning_rate": 1.8077363291693295e-06, + "loss": 0.0204, + "step": 29747 + }, + { + "epoch": 3.527570259694059, + "grad_norm": 0.7135770286718727, + "learning_rate": 1.806840261156842e-06, + "loss": 0.021, + "step": 29748 + }, + { + "epoch": 3.527688841456184, + "grad_norm": 0.40929862935449957, + "learning_rate": 1.8059444069568333e-06, + "loss": 0.0264, + "step": 29749 + }, + { + "epoch": 3.527807423218309, + "grad_norm": 0.4696238576683629, + "learning_rate": 1.8050487665775856e-06, + "loss": 0.0165, + "step": 29750 + }, + { + "epoch": 3.5279260049804337, + "grad_norm": 0.4805204173437258, + "learning_rate": 1.8041533400273396e-06, + "loss": 0.0191, + "step": 29751 + }, + { + "epoch": 3.528044586742559, + "grad_norm": 0.8351890280130448, + "learning_rate": 1.8032581273143555e-06, + "loss": 0.0407, + "step": 29752 + }, + { + "epoch": 3.528163168504684, + "grad_norm": 0.46345807106851716, + "learning_rate": 1.8023631284468878e-06, + "loss": 0.0211, + "step": 29753 + }, + { + "epoch": 3.528281750266809, + "grad_norm": 0.4764786421781317, + "learning_rate": 1.8014683434331825e-06, + "loss": 0.0216, + "step": 29754 + }, + { + "epoch": 3.528400332028934, + "grad_norm": 0.5979450285203324, + "learning_rate": 1.800573772281497e-06, + "loss": 0.0311, + "step": 29755 + }, + { + "epoch": 3.528518913791059, + "grad_norm": 0.5781158556776873, + "learning_rate": 1.7996794150000661e-06, + "loss": 0.0217, + "step": 29756 + }, + { + "epoch": 3.528637495553184, + "grad_norm": 0.43461293957983593, + "learning_rate": 1.798785271597142e-06, + "loss": 0.0189, + "step": 29757 + }, + { + "epoch": 3.528756077315309, + "grad_norm": 0.7318547932139784, + "learning_rate": 1.7978913420809678e-06, + "loss": 0.03, + "step": 29758 + }, + { + "epoch": 3.528874659077434, + "grad_norm": 0.46857644655075453, + "learning_rate": 1.7969976264597843e-06, + "loss": 0.0256, + "step": 29759 + }, + { + "epoch": 3.528993240839559, + "grad_norm": 0.4098172809060326, + "learning_rate": 1.7961041247418236e-06, + "loss": 0.0213, + "step": 29760 + }, + { + "epoch": 3.529111822601684, + "grad_norm": 0.657450695798323, + "learning_rate": 1.795210836935335e-06, + "loss": 0.0277, + "step": 29761 + }, + { + "epoch": 3.5292304043638087, + "grad_norm": 0.6956680632804466, + "learning_rate": 1.7943177630485448e-06, + "loss": 0.0326, + "step": 29762 + }, + { + "epoch": 3.529348986125934, + "grad_norm": 0.8641524781888356, + "learning_rate": 1.7934249030896854e-06, + "loss": 0.0491, + "step": 29763 + }, + { + "epoch": 3.5294675678880587, + "grad_norm": 0.4476380285047533, + "learning_rate": 1.7925322570669923e-06, + "loss": 0.0195, + "step": 29764 + }, + { + "epoch": 3.529586149650184, + "grad_norm": 0.3403047944334038, + "learning_rate": 1.7916398249886945e-06, + "loss": 0.0094, + "step": 29765 + }, + { + "epoch": 3.5297047314123087, + "grad_norm": 0.439544609568498, + "learning_rate": 1.7907476068630192e-06, + "loss": 0.0208, + "step": 29766 + }, + { + "epoch": 3.529823313174434, + "grad_norm": 0.5261525017984751, + "learning_rate": 1.7898556026981871e-06, + "loss": 0.0225, + "step": 29767 + }, + { + "epoch": 3.5299418949365586, + "grad_norm": 0.4613335383832063, + "learning_rate": 1.7889638125024226e-06, + "loss": 0.0205, + "step": 29768 + }, + { + "epoch": 3.530060476698684, + "grad_norm": 0.25250783324545806, + "learning_rate": 1.7880722362839498e-06, + "loss": 0.0111, + "step": 29769 + }, + { + "epoch": 3.5301790584608086, + "grad_norm": 0.4006041100798737, + "learning_rate": 1.7871808740509894e-06, + "loss": 0.018, + "step": 29770 + }, + { + "epoch": 3.530297640222934, + "grad_norm": 0.5715513134702446, + "learning_rate": 1.786289725811749e-06, + "loss": 0.0243, + "step": 29771 + }, + { + "epoch": 3.5304162219850586, + "grad_norm": 0.5265539816408541, + "learning_rate": 1.785398791574458e-06, + "loss": 0.0268, + "step": 29772 + }, + { + "epoch": 3.5305348037471838, + "grad_norm": 0.4411914484670427, + "learning_rate": 1.7845080713473212e-06, + "loss": 0.0203, + "step": 29773 + }, + { + "epoch": 3.5306533855093085, + "grad_norm": 0.6804348971348104, + "learning_rate": 1.7836175651385484e-06, + "loss": 0.0293, + "step": 29774 + }, + { + "epoch": 3.5307719672714337, + "grad_norm": 0.6091942990032148, + "learning_rate": 1.7827272729563526e-06, + "loss": 0.0264, + "step": 29775 + }, + { + "epoch": 3.5308905490335585, + "grad_norm": 0.4288014175599847, + "learning_rate": 1.7818371948089386e-06, + "loss": 0.0295, + "step": 29776 + }, + { + "epoch": 3.5310091307956837, + "grad_norm": 0.5536340441838888, + "learning_rate": 1.7809473307045215e-06, + "loss": 0.0272, + "step": 29777 + }, + { + "epoch": 3.531127712557809, + "grad_norm": 0.48684513697108084, + "learning_rate": 1.7800576806512897e-06, + "loss": 0.0159, + "step": 29778 + }, + { + "epoch": 3.5312462943199336, + "grad_norm": 0.4312612775888368, + "learning_rate": 1.779168244657453e-06, + "loss": 0.0165, + "step": 29779 + }, + { + "epoch": 3.5313648760820584, + "grad_norm": 0.5374613345885483, + "learning_rate": 1.7782790227312106e-06, + "loss": 0.032, + "step": 29780 + }, + { + "epoch": 3.5314834578441836, + "grad_norm": 0.4593400431718555, + "learning_rate": 1.7773900148807587e-06, + "loss": 0.0198, + "step": 29781 + }, + { + "epoch": 3.531602039606309, + "grad_norm": 0.7657652966938282, + "learning_rate": 1.7765012211142962e-06, + "loss": 0.029, + "step": 29782 + }, + { + "epoch": 3.5317206213684336, + "grad_norm": 0.5974263722987603, + "learning_rate": 1.7756126414400082e-06, + "loss": 0.0229, + "step": 29783 + }, + { + "epoch": 3.5318392031305583, + "grad_norm": 0.3957321137181532, + "learning_rate": 1.7747242758660992e-06, + "loss": 0.0196, + "step": 29784 + }, + { + "epoch": 3.5319577848926835, + "grad_norm": 0.509377134328125, + "learning_rate": 1.7738361244007461e-06, + "loss": 0.0305, + "step": 29785 + }, + { + "epoch": 3.5320763666548087, + "grad_norm": 0.7960216473323293, + "learning_rate": 1.7729481870521452e-06, + "loss": 0.0322, + "step": 29786 + }, + { + "epoch": 3.5321949484169335, + "grad_norm": 0.5857607100419213, + "learning_rate": 1.772060463828476e-06, + "loss": 0.0319, + "step": 29787 + }, + { + "epoch": 3.5323135301790582, + "grad_norm": 0.9649839251366753, + "learning_rate": 1.771172954737932e-06, + "loss": 0.0249, + "step": 29788 + }, + { + "epoch": 3.5324321119411835, + "grad_norm": 0.6824612739463263, + "learning_rate": 1.7702856597886813e-06, + "loss": 0.0387, + "step": 29789 + }, + { + "epoch": 3.5325506937033087, + "grad_norm": 0.41981516135438396, + "learning_rate": 1.7693985789889123e-06, + "loss": 0.0182, + "step": 29790 + }, + { + "epoch": 3.5326692754654334, + "grad_norm": 0.7424707163317142, + "learning_rate": 1.7685117123468016e-06, + "loss": 0.0382, + "step": 29791 + }, + { + "epoch": 3.532787857227558, + "grad_norm": 0.7691360384680115, + "learning_rate": 1.767625059870523e-06, + "loss": 0.0286, + "step": 29792 + }, + { + "epoch": 3.5329064389896834, + "grad_norm": 0.4726886140656565, + "learning_rate": 1.7667386215682591e-06, + "loss": 0.0163, + "step": 29793 + }, + { + "epoch": 3.5330250207518086, + "grad_norm": 0.4180160839087153, + "learning_rate": 1.7658523974481644e-06, + "loss": 0.016, + "step": 29794 + }, + { + "epoch": 3.5331436025139333, + "grad_norm": 0.4763833556551987, + "learning_rate": 1.764966387518427e-06, + "loss": 0.0253, + "step": 29795 + }, + { + "epoch": 3.533262184276058, + "grad_norm": 0.4350279395138895, + "learning_rate": 1.764080591787201e-06, + "loss": 0.0183, + "step": 29796 + }, + { + "epoch": 3.5333807660381833, + "grad_norm": 0.57984987883396, + "learning_rate": 1.7631950102626666e-06, + "loss": 0.0308, + "step": 29797 + }, + { + "epoch": 3.5334993478003085, + "grad_norm": 0.7109498071084223, + "learning_rate": 1.7623096429529668e-06, + "loss": 0.0393, + "step": 29798 + }, + { + "epoch": 3.5336179295624333, + "grad_norm": 0.42575143583734737, + "learning_rate": 1.7614244898662868e-06, + "loss": 0.0244, + "step": 29799 + }, + { + "epoch": 3.533736511324558, + "grad_norm": 0.4894183557219041, + "learning_rate": 1.7605395510107704e-06, + "loss": 0.0255, + "step": 29800 + }, + { + "epoch": 3.533855093086683, + "grad_norm": 0.6967255717471638, + "learning_rate": 1.7596548263945828e-06, + "loss": 0.029, + "step": 29801 + }, + { + "epoch": 3.5339736748488084, + "grad_norm": 0.565453006553706, + "learning_rate": 1.7587703160258789e-06, + "loss": 0.0258, + "step": 29802 + }, + { + "epoch": 3.534092256610933, + "grad_norm": 0.34699338374368754, + "learning_rate": 1.7578860199128105e-06, + "loss": 0.0208, + "step": 29803 + }, + { + "epoch": 3.5342108383730584, + "grad_norm": 0.5329438542840225, + "learning_rate": 1.7570019380635378e-06, + "loss": 0.0372, + "step": 29804 + }, + { + "epoch": 3.534329420135183, + "grad_norm": 0.6070295370621347, + "learning_rate": 1.7561180704861958e-06, + "loss": 0.0294, + "step": 29805 + }, + { + "epoch": 3.5344480018973083, + "grad_norm": 0.7241442145827334, + "learning_rate": 1.7552344171889474e-06, + "loss": 0.0373, + "step": 29806 + }, + { + "epoch": 3.534566583659433, + "grad_norm": 0.4590079409599786, + "learning_rate": 1.7543509781799332e-06, + "loss": 0.0208, + "step": 29807 + }, + { + "epoch": 3.5346851654215583, + "grad_norm": 0.36181743238498887, + "learning_rate": 1.7534677534672971e-06, + "loss": 0.0143, + "step": 29808 + }, + { + "epoch": 3.534803747183683, + "grad_norm": 0.9479400055064624, + "learning_rate": 1.7525847430591769e-06, + "loss": 0.0482, + "step": 29809 + }, + { + "epoch": 3.5349223289458083, + "grad_norm": 0.5026042435741257, + "learning_rate": 1.7517019469637242e-06, + "loss": 0.0216, + "step": 29810 + }, + { + "epoch": 3.535040910707933, + "grad_norm": 0.4065932565695605, + "learning_rate": 1.7508193651890659e-06, + "loss": 0.0184, + "step": 29811 + }, + { + "epoch": 3.5351594924700582, + "grad_norm": 0.5686483375640226, + "learning_rate": 1.7499369977433456e-06, + "loss": 0.0258, + "step": 29812 + }, + { + "epoch": 3.535278074232183, + "grad_norm": 0.7063369383911269, + "learning_rate": 1.7490548446346928e-06, + "loss": 0.0344, + "step": 29813 + }, + { + "epoch": 3.535396655994308, + "grad_norm": 0.6307223736504821, + "learning_rate": 1.7481729058712425e-06, + "loss": 0.0196, + "step": 29814 + }, + { + "epoch": 3.535515237756433, + "grad_norm": 0.6120507000120063, + "learning_rate": 1.7472911814611276e-06, + "loss": 0.0232, + "step": 29815 + }, + { + "epoch": 3.535633819518558, + "grad_norm": 0.34526954467351745, + "learning_rate": 1.7464096714124662e-06, + "loss": 0.0125, + "step": 29816 + }, + { + "epoch": 3.535752401280683, + "grad_norm": 0.6143104941930999, + "learning_rate": 1.7455283757334018e-06, + "loss": 0.0293, + "step": 29817 + }, + { + "epoch": 3.535870983042808, + "grad_norm": 0.5187638570596144, + "learning_rate": 1.7446472944320446e-06, + "loss": 0.0248, + "step": 29818 + }, + { + "epoch": 3.535989564804933, + "grad_norm": 0.4925714011980354, + "learning_rate": 1.7437664275165245e-06, + "loss": 0.0247, + "step": 29819 + }, + { + "epoch": 3.536108146567058, + "grad_norm": 0.3907879156021508, + "learning_rate": 1.742885774994954e-06, + "loss": 0.0182, + "step": 29820 + }, + { + "epoch": 3.536226728329183, + "grad_norm": 0.45103791226788476, + "learning_rate": 1.742005336875463e-06, + "loss": 0.0219, + "step": 29821 + }, + { + "epoch": 3.536345310091308, + "grad_norm": 0.4708229392251132, + "learning_rate": 1.741125113166156e-06, + "loss": 0.023, + "step": 29822 + }, + { + "epoch": 3.536463891853433, + "grad_norm": 0.3293516195170832, + "learning_rate": 1.740245103875157e-06, + "loss": 0.0136, + "step": 29823 + }, + { + "epoch": 3.536582473615558, + "grad_norm": 0.30673657240651214, + "learning_rate": 1.7393653090105738e-06, + "loss": 0.0142, + "step": 29824 + }, + { + "epoch": 3.5367010553776828, + "grad_norm": 0.8813419418895659, + "learning_rate": 1.738485728580519e-06, + "loss": 0.0325, + "step": 29825 + }, + { + "epoch": 3.536819637139808, + "grad_norm": 0.5471058434582963, + "learning_rate": 1.7376063625931027e-06, + "loss": 0.0263, + "step": 29826 + }, + { + "epoch": 3.5369382189019327, + "grad_norm": 0.4110963901204428, + "learning_rate": 1.736727211056427e-06, + "loss": 0.0145, + "step": 29827 + }, + { + "epoch": 3.537056800664058, + "grad_norm": 0.5729802721238637, + "learning_rate": 1.7358482739785992e-06, + "loss": 0.0326, + "step": 29828 + }, + { + "epoch": 3.5371753824261827, + "grad_norm": 0.45772041966232146, + "learning_rate": 1.734969551367721e-06, + "loss": 0.021, + "step": 29829 + }, + { + "epoch": 3.537293964188308, + "grad_norm": 0.7026992874302102, + "learning_rate": 1.7340910432318946e-06, + "loss": 0.038, + "step": 29830 + }, + { + "epoch": 3.537412545950433, + "grad_norm": 0.6072973548208316, + "learning_rate": 1.7332127495792189e-06, + "loss": 0.0274, + "step": 29831 + }, + { + "epoch": 3.537531127712558, + "grad_norm": 0.39797307307314894, + "learning_rate": 1.73233467041779e-06, + "loss": 0.0205, + "step": 29832 + }, + { + "epoch": 3.5376497094746826, + "grad_norm": 0.47417361822863985, + "learning_rate": 1.7314568057557046e-06, + "loss": 0.0209, + "step": 29833 + }, + { + "epoch": 3.537768291236808, + "grad_norm": 0.4523918507213722, + "learning_rate": 1.7305791556010532e-06, + "loss": 0.0181, + "step": 29834 + }, + { + "epoch": 3.537886872998933, + "grad_norm": 0.6083819919802055, + "learning_rate": 1.7297017199619236e-06, + "loss": 0.0253, + "step": 29835 + }, + { + "epoch": 3.5380054547610578, + "grad_norm": 0.4105308727370176, + "learning_rate": 1.7288244988464097e-06, + "loss": 0.0186, + "step": 29836 + }, + { + "epoch": 3.5381240365231825, + "grad_norm": 0.7437640263052261, + "learning_rate": 1.7279474922626021e-06, + "loss": 0.0274, + "step": 29837 + }, + { + "epoch": 3.5382426182853077, + "grad_norm": 0.7463108807535694, + "learning_rate": 1.727070700218575e-06, + "loss": 0.0413, + "step": 29838 + }, + { + "epoch": 3.538361200047433, + "grad_norm": 0.32205807592036056, + "learning_rate": 1.726194122722416e-06, + "loss": 0.0156, + "step": 29839 + }, + { + "epoch": 3.5384797818095577, + "grad_norm": 0.5236762675707538, + "learning_rate": 1.7253177597822078e-06, + "loss": 0.0282, + "step": 29840 + }, + { + "epoch": 3.5385983635716824, + "grad_norm": 0.45458682206957407, + "learning_rate": 1.7244416114060301e-06, + "loss": 0.0188, + "step": 29841 + }, + { + "epoch": 3.5387169453338077, + "grad_norm": 0.5487242520176663, + "learning_rate": 1.7235656776019626e-06, + "loss": 0.0229, + "step": 29842 + }, + { + "epoch": 3.538835527095933, + "grad_norm": 0.46616555505282553, + "learning_rate": 1.7226899583780654e-06, + "loss": 0.0166, + "step": 29843 + }, + { + "epoch": 3.5389541088580576, + "grad_norm": 0.5716581887123204, + "learning_rate": 1.721814453742432e-06, + "loss": 0.026, + "step": 29844 + }, + { + "epoch": 3.5390726906201824, + "grad_norm": 0.5047634887575484, + "learning_rate": 1.7209391637031197e-06, + "loss": 0.0261, + "step": 29845 + }, + { + "epoch": 3.5391912723823076, + "grad_norm": 0.8422113633952243, + "learning_rate": 1.7200640882682e-06, + "loss": 0.0342, + "step": 29846 + }, + { + "epoch": 3.5393098541444328, + "grad_norm": 0.4934628124124678, + "learning_rate": 1.7191892274457445e-06, + "loss": 0.0243, + "step": 29847 + }, + { + "epoch": 3.5394284359065575, + "grad_norm": 0.519232498817814, + "learning_rate": 1.7183145812438184e-06, + "loss": 0.0319, + "step": 29848 + }, + { + "epoch": 3.5395470176686823, + "grad_norm": 0.4231169568421761, + "learning_rate": 1.7174401496704767e-06, + "loss": 0.0211, + "step": 29849 + }, + { + "epoch": 3.5396655994308075, + "grad_norm": 0.4309525575842736, + "learning_rate": 1.716565932733788e-06, + "loss": 0.0195, + "step": 29850 + }, + { + "epoch": 3.5397841811929327, + "grad_norm": 0.4648038862937506, + "learning_rate": 1.7156919304418068e-06, + "loss": 0.0162, + "step": 29851 + }, + { + "epoch": 3.5399027629550575, + "grad_norm": 0.4509805621105282, + "learning_rate": 1.7148181428025962e-06, + "loss": 0.0208, + "step": 29852 + }, + { + "epoch": 3.5400213447171827, + "grad_norm": 0.4517073671130342, + "learning_rate": 1.713944569824208e-06, + "loss": 0.0204, + "step": 29853 + }, + { + "epoch": 3.5401399264793074, + "grad_norm": 0.843925161529657, + "learning_rate": 1.7130712115146913e-06, + "loss": 0.0499, + "step": 29854 + }, + { + "epoch": 3.5402585082414326, + "grad_norm": 0.3756836068383002, + "learning_rate": 1.7121980678821064e-06, + "loss": 0.0158, + "step": 29855 + }, + { + "epoch": 3.5403770900035574, + "grad_norm": 0.8415302714690331, + "learning_rate": 1.7113251389344969e-06, + "loss": 0.0351, + "step": 29856 + }, + { + "epoch": 3.5404956717656826, + "grad_norm": 0.3928314870784064, + "learning_rate": 1.710452424679909e-06, + "loss": 0.0172, + "step": 29857 + }, + { + "epoch": 3.5406142535278073, + "grad_norm": 0.522855154415334, + "learning_rate": 1.7095799251263922e-06, + "loss": 0.0265, + "step": 29858 + }, + { + "epoch": 3.5407328352899325, + "grad_norm": 0.3957905381863885, + "learning_rate": 1.7087076402819896e-06, + "loss": 0.019, + "step": 29859 + }, + { + "epoch": 3.5408514170520573, + "grad_norm": 0.5036143622327476, + "learning_rate": 1.7078355701547394e-06, + "loss": 0.0253, + "step": 29860 + }, + { + "epoch": 3.5409699988141825, + "grad_norm": 0.6341055452638957, + "learning_rate": 1.7069637147526824e-06, + "loss": 0.0319, + "step": 29861 + }, + { + "epoch": 3.5410885805763073, + "grad_norm": 0.5486917435269995, + "learning_rate": 1.7060920740838538e-06, + "loss": 0.0372, + "step": 29862 + }, + { + "epoch": 3.5412071623384325, + "grad_norm": 0.652490334588292, + "learning_rate": 1.7052206481562915e-06, + "loss": 0.0424, + "step": 29863 + }, + { + "epoch": 3.5413257441005572, + "grad_norm": 0.8264144148776539, + "learning_rate": 1.7043494369780367e-06, + "loss": 0.0374, + "step": 29864 + }, + { + "epoch": 3.5414443258626824, + "grad_norm": 0.49793402026515315, + "learning_rate": 1.7034784405571018e-06, + "loss": 0.0248, + "step": 29865 + }, + { + "epoch": 3.541562907624807, + "grad_norm": 0.527763515865206, + "learning_rate": 1.7026076589015366e-06, + "loss": 0.0256, + "step": 29866 + }, + { + "epoch": 3.5416814893869324, + "grad_norm": 0.4020104147478867, + "learning_rate": 1.7017370920193537e-06, + "loss": 0.0165, + "step": 29867 + }, + { + "epoch": 3.541800071149057, + "grad_norm": 0.7513557306923028, + "learning_rate": 1.7008667399185912e-06, + "loss": 0.0328, + "step": 29868 + }, + { + "epoch": 3.5419186529111824, + "grad_norm": 0.6295057151808664, + "learning_rate": 1.6999966026072568e-06, + "loss": 0.04, + "step": 29869 + }, + { + "epoch": 3.542037234673307, + "grad_norm": 0.6083393387560253, + "learning_rate": 1.699126680093388e-06, + "loss": 0.0318, + "step": 29870 + }, + { + "epoch": 3.5421558164354323, + "grad_norm": 0.3369097470882282, + "learning_rate": 1.6982569723849956e-06, + "loss": 0.0179, + "step": 29871 + }, + { + "epoch": 3.542274398197557, + "grad_norm": 0.5884479634680414, + "learning_rate": 1.6973874794901007e-06, + "loss": 0.0316, + "step": 29872 + }, + { + "epoch": 3.5423929799596823, + "grad_norm": 0.5902351141991251, + "learning_rate": 1.6965182014167163e-06, + "loss": 0.0321, + "step": 29873 + }, + { + "epoch": 3.542511561721807, + "grad_norm": 0.2552214514480266, + "learning_rate": 1.6956491381728556e-06, + "loss": 0.0124, + "step": 29874 + }, + { + "epoch": 3.5426301434839322, + "grad_norm": 0.684058939501947, + "learning_rate": 1.6947802897665399e-06, + "loss": 0.0268, + "step": 29875 + }, + { + "epoch": 3.542748725246057, + "grad_norm": 0.5782506562229784, + "learning_rate": 1.69391165620576e-06, + "loss": 0.0321, + "step": 29876 + }, + { + "epoch": 3.542867307008182, + "grad_norm": 0.43153916030429934, + "learning_rate": 1.6930432374985428e-06, + "loss": 0.0243, + "step": 29877 + }, + { + "epoch": 3.542985888770307, + "grad_norm": 0.5635094567141441, + "learning_rate": 1.6921750336528846e-06, + "loss": 0.0257, + "step": 29878 + }, + { + "epoch": 3.543104470532432, + "grad_norm": 0.599253942043817, + "learning_rate": 1.6913070446767903e-06, + "loss": 0.0187, + "step": 29879 + }, + { + "epoch": 3.5432230522945574, + "grad_norm": 0.8975826066513624, + "learning_rate": 1.6904392705782591e-06, + "loss": 0.0478, + "step": 29880 + }, + { + "epoch": 3.543341634056682, + "grad_norm": 0.4528030354100953, + "learning_rate": 1.6895717113652953e-06, + "loss": 0.0223, + "step": 29881 + }, + { + "epoch": 3.543460215818807, + "grad_norm": 0.6386564114857325, + "learning_rate": 1.6887043670459012e-06, + "loss": 0.0343, + "step": 29882 + }, + { + "epoch": 3.543578797580932, + "grad_norm": 0.5383767017053323, + "learning_rate": 1.687837237628062e-06, + "loss": 0.0205, + "step": 29883 + }, + { + "epoch": 3.5436973793430573, + "grad_norm": 1.1116871755873077, + "learning_rate": 1.6869703231197742e-06, + "loss": 0.0257, + "step": 29884 + }, + { + "epoch": 3.543815961105182, + "grad_norm": 0.6122016230596661, + "learning_rate": 1.6861036235290311e-06, + "loss": 0.0315, + "step": 29885 + }, + { + "epoch": 3.543934542867307, + "grad_norm": 0.34210628862133163, + "learning_rate": 1.6852371388638294e-06, + "loss": 0.0184, + "step": 29886 + }, + { + "epoch": 3.544053124629432, + "grad_norm": 0.4373793186695634, + "learning_rate": 1.684370869132143e-06, + "loss": 0.0245, + "step": 29887 + }, + { + "epoch": 3.544171706391557, + "grad_norm": 0.5107717647310136, + "learning_rate": 1.683504814341974e-06, + "loss": 0.0178, + "step": 29888 + }, + { + "epoch": 3.544290288153682, + "grad_norm": 0.44761255884693124, + "learning_rate": 1.682638974501291e-06, + "loss": 0.0288, + "step": 29889 + }, + { + "epoch": 3.5444088699158067, + "grad_norm": 0.33191912929559714, + "learning_rate": 1.681773349618085e-06, + "loss": 0.0166, + "step": 29890 + }, + { + "epoch": 3.544527451677932, + "grad_norm": 0.37192234722833595, + "learning_rate": 1.6809079397003353e-06, + "loss": 0.0174, + "step": 29891 + }, + { + "epoch": 3.544646033440057, + "grad_norm": 0.3263127026053091, + "learning_rate": 1.6800427447560163e-06, + "loss": 0.012, + "step": 29892 + }, + { + "epoch": 3.544764615202182, + "grad_norm": 0.44511726798150447, + "learning_rate": 1.6791777647931133e-06, + "loss": 0.0274, + "step": 29893 + }, + { + "epoch": 3.5448831969643066, + "grad_norm": 0.6437111225647386, + "learning_rate": 1.6783129998195869e-06, + "loss": 0.0391, + "step": 29894 + }, + { + "epoch": 3.545001778726432, + "grad_norm": 0.621662052605796, + "learning_rate": 1.6774484498434162e-06, + "loss": 0.0259, + "step": 29895 + }, + { + "epoch": 3.545120360488557, + "grad_norm": 0.709357268680643, + "learning_rate": 1.6765841148725703e-06, + "loss": 0.041, + "step": 29896 + }, + { + "epoch": 3.545238942250682, + "grad_norm": 0.3424418985850515, + "learning_rate": 1.6757199949150232e-06, + "loss": 0.0207, + "step": 29897 + }, + { + "epoch": 3.5453575240128066, + "grad_norm": 0.4004155183000806, + "learning_rate": 1.6748560899787297e-06, + "loss": 0.0231, + "step": 29898 + }, + { + "epoch": 3.5454761057749318, + "grad_norm": 0.6804897141456072, + "learning_rate": 1.673992400071661e-06, + "loss": 0.0393, + "step": 29899 + }, + { + "epoch": 3.545594687537057, + "grad_norm": 0.42614911785263093, + "learning_rate": 1.673128925201778e-06, + "loss": 0.0159, + "step": 29900 + }, + { + "epoch": 3.5457132692991817, + "grad_norm": 0.7692149305780882, + "learning_rate": 1.672265665377043e-06, + "loss": 0.0379, + "step": 29901 + }, + { + "epoch": 3.5458318510613065, + "grad_norm": 0.3504587636278705, + "learning_rate": 1.6714026206054085e-06, + "loss": 0.013, + "step": 29902 + }, + { + "epoch": 3.5459504328234317, + "grad_norm": 0.3271972630156085, + "learning_rate": 1.6705397908948372e-06, + "loss": 0.0139, + "step": 29903 + }, + { + "epoch": 3.546069014585557, + "grad_norm": 0.5620409880710147, + "learning_rate": 1.6696771762532843e-06, + "loss": 0.0273, + "step": 29904 + }, + { + "epoch": 3.5461875963476817, + "grad_norm": 0.6707015732144354, + "learning_rate": 1.668814776688693e-06, + "loss": 0.0379, + "step": 29905 + }, + { + "epoch": 3.546306178109807, + "grad_norm": 0.6113438138680686, + "learning_rate": 1.6679525922090184e-06, + "loss": 0.0219, + "step": 29906 + }, + { + "epoch": 3.5464247598719316, + "grad_norm": 0.3967795501035116, + "learning_rate": 1.667090622822212e-06, + "loss": 0.0207, + "step": 29907 + }, + { + "epoch": 3.546543341634057, + "grad_norm": 0.5176036119763264, + "learning_rate": 1.6662288685362182e-06, + "loss": 0.0312, + "step": 29908 + }, + { + "epoch": 3.5466619233961816, + "grad_norm": 0.6923300138796791, + "learning_rate": 1.66536732935898e-06, + "loss": 0.0287, + "step": 29909 + }, + { + "epoch": 3.546780505158307, + "grad_norm": 0.6852033157374657, + "learning_rate": 1.6645060052984385e-06, + "loss": 0.024, + "step": 29910 + }, + { + "epoch": 3.5468990869204315, + "grad_norm": 0.4795514573921929, + "learning_rate": 1.6636448963625346e-06, + "loss": 0.0231, + "step": 29911 + }, + { + "epoch": 3.5470176686825567, + "grad_norm": 0.40009489720572866, + "learning_rate": 1.662784002559209e-06, + "loss": 0.0197, + "step": 29912 + }, + { + "epoch": 3.5471362504446815, + "grad_norm": 0.7024274503316731, + "learning_rate": 1.6619233238963999e-06, + "loss": 0.0289, + "step": 29913 + }, + { + "epoch": 3.5472548322068067, + "grad_norm": 0.2830419325591423, + "learning_rate": 1.6610628603820316e-06, + "loss": 0.0143, + "step": 29914 + }, + { + "epoch": 3.5473734139689315, + "grad_norm": 0.5668692078642118, + "learning_rate": 1.6602026120240504e-06, + "loss": 0.0277, + "step": 29915 + }, + { + "epoch": 3.5474919957310567, + "grad_norm": 0.3545266990688527, + "learning_rate": 1.659342578830378e-06, + "loss": 0.0213, + "step": 29916 + }, + { + "epoch": 3.5476105774931814, + "grad_norm": 0.5359235261662698, + "learning_rate": 1.6584827608089438e-06, + "loss": 0.0239, + "step": 29917 + }, + { + "epoch": 3.5477291592553066, + "grad_norm": 0.6596112053968065, + "learning_rate": 1.6576231579676749e-06, + "loss": 0.0353, + "step": 29918 + }, + { + "epoch": 3.5478477410174314, + "grad_norm": 0.40766524546466537, + "learning_rate": 1.656763770314501e-06, + "loss": 0.0177, + "step": 29919 + }, + { + "epoch": 3.5479663227795566, + "grad_norm": 0.5274222364424056, + "learning_rate": 1.6559045978573356e-06, + "loss": 0.0325, + "step": 29920 + }, + { + "epoch": 3.5480849045416814, + "grad_norm": 0.8957081181116467, + "learning_rate": 1.6550456406041026e-06, + "loss": 0.0334, + "step": 29921 + }, + { + "epoch": 3.5482034863038066, + "grad_norm": 0.5689893845287324, + "learning_rate": 1.6541868985627235e-06, + "loss": 0.0204, + "step": 29922 + }, + { + "epoch": 3.5483220680659313, + "grad_norm": 0.3754533223834487, + "learning_rate": 1.6533283717411085e-06, + "loss": 0.0202, + "step": 29923 + }, + { + "epoch": 3.5484406498280565, + "grad_norm": 0.9014796454673579, + "learning_rate": 1.6524700601471848e-06, + "loss": 0.0558, + "step": 29924 + }, + { + "epoch": 3.5485592315901813, + "grad_norm": 0.479610826399354, + "learning_rate": 1.651611963788846e-06, + "loss": 0.0206, + "step": 29925 + }, + { + "epoch": 3.5486778133523065, + "grad_norm": 0.5031512092864114, + "learning_rate": 1.650754082674022e-06, + "loss": 0.0232, + "step": 29926 + }, + { + "epoch": 3.5487963951144312, + "grad_norm": 0.4948598407278654, + "learning_rate": 1.649896416810609e-06, + "loss": 0.02, + "step": 29927 + }, + { + "epoch": 3.5489149768765564, + "grad_norm": 0.41450905942702393, + "learning_rate": 1.6490389662065148e-06, + "loss": 0.0235, + "step": 29928 + }, + { + "epoch": 3.5490335586386816, + "grad_norm": 0.8275891066045656, + "learning_rate": 1.6481817308696495e-06, + "loss": 0.0523, + "step": 29929 + }, + { + "epoch": 3.5491521404008064, + "grad_norm": 0.5097201178551718, + "learning_rate": 1.6473247108079127e-06, + "loss": 0.0197, + "step": 29930 + }, + { + "epoch": 3.549270722162931, + "grad_norm": 0.569110279203461, + "learning_rate": 1.646467906029206e-06, + "loss": 0.0165, + "step": 29931 + }, + { + "epoch": 3.5493893039250564, + "grad_norm": 0.5139208042449925, + "learning_rate": 1.6456113165414234e-06, + "loss": 0.0256, + "step": 29932 + }, + { + "epoch": 3.5495078856871816, + "grad_norm": 0.39636152900265587, + "learning_rate": 1.6447549423524694e-06, + "loss": 0.0161, + "step": 29933 + }, + { + "epoch": 3.5496264674493063, + "grad_norm": 0.48408208156945964, + "learning_rate": 1.6438987834702324e-06, + "loss": 0.0252, + "step": 29934 + }, + { + "epoch": 3.549745049211431, + "grad_norm": 0.4902991578629438, + "learning_rate": 1.6430428399026115e-06, + "loss": 0.0255, + "step": 29935 + }, + { + "epoch": 3.5498636309735563, + "grad_norm": 0.38776333101601707, + "learning_rate": 1.6421871116574865e-06, + "loss": 0.0284, + "step": 29936 + }, + { + "epoch": 3.5499822127356815, + "grad_norm": 1.024216384161609, + "learning_rate": 1.6413315987427597e-06, + "loss": 0.0462, + "step": 29937 + }, + { + "epoch": 3.5501007944978062, + "grad_norm": 0.5995667051344792, + "learning_rate": 1.6404763011663076e-06, + "loss": 0.0333, + "step": 29938 + }, + { + "epoch": 3.550219376259931, + "grad_norm": 0.7818462698163368, + "learning_rate": 1.6396212189360189e-06, + "loss": 0.0423, + "step": 29939 + }, + { + "epoch": 3.550337958022056, + "grad_norm": 0.4889185858247401, + "learning_rate": 1.6387663520597758e-06, + "loss": 0.0306, + "step": 29940 + }, + { + "epoch": 3.5504565397841814, + "grad_norm": 0.45069208414725576, + "learning_rate": 1.6379117005454607e-06, + "loss": 0.0177, + "step": 29941 + }, + { + "epoch": 3.550575121546306, + "grad_norm": 0.36663106784241933, + "learning_rate": 1.637057264400954e-06, + "loss": 0.0159, + "step": 29942 + }, + { + "epoch": 3.550693703308431, + "grad_norm": 0.44398399306133945, + "learning_rate": 1.6362030436341213e-06, + "loss": 0.0223, + "step": 29943 + }, + { + "epoch": 3.550812285070556, + "grad_norm": 0.45340747302936346, + "learning_rate": 1.6353490382528563e-06, + "loss": 0.0237, + "step": 29944 + }, + { + "epoch": 3.5509308668326813, + "grad_norm": 0.42217185779061445, + "learning_rate": 1.6344952482650166e-06, + "loss": 0.02, + "step": 29945 + }, + { + "epoch": 3.551049448594806, + "grad_norm": 0.578523733760798, + "learning_rate": 1.6336416736784793e-06, + "loss": 0.0303, + "step": 29946 + }, + { + "epoch": 3.551168030356931, + "grad_norm": 0.5026515720556103, + "learning_rate": 1.6327883145011075e-06, + "loss": 0.0268, + "step": 29947 + }, + { + "epoch": 3.551286612119056, + "grad_norm": 0.5949737603261039, + "learning_rate": 1.6319351707407783e-06, + "loss": 0.0305, + "step": 29948 + }, + { + "epoch": 3.5514051938811813, + "grad_norm": 0.5575747739988298, + "learning_rate": 1.631082242405349e-06, + "loss": 0.0309, + "step": 29949 + }, + { + "epoch": 3.551523775643306, + "grad_norm": 0.4320104524668897, + "learning_rate": 1.630229529502683e-06, + "loss": 0.0178, + "step": 29950 + }, + { + "epoch": 3.5516423574054308, + "grad_norm": 0.251569566290122, + "learning_rate": 1.6293770320406437e-06, + "loss": 0.0105, + "step": 29951 + }, + { + "epoch": 3.551760939167556, + "grad_norm": 0.6561059886653068, + "learning_rate": 1.6285247500270883e-06, + "loss": 0.0483, + "step": 29952 + }, + { + "epoch": 3.551879520929681, + "grad_norm": 0.33644250945698745, + "learning_rate": 1.6276726834698803e-06, + "loss": 0.0165, + "step": 29953 + }, + { + "epoch": 3.551998102691806, + "grad_norm": 0.28193489333838684, + "learning_rate": 1.626820832376863e-06, + "loss": 0.0124, + "step": 29954 + }, + { + "epoch": 3.552116684453931, + "grad_norm": 0.3991332043774468, + "learning_rate": 1.6259691967558971e-06, + "loss": 0.02, + "step": 29955 + }, + { + "epoch": 3.552235266216056, + "grad_norm": 0.6746520863943626, + "learning_rate": 1.6251177766148318e-06, + "loss": 0.0318, + "step": 29956 + }, + { + "epoch": 3.552353847978181, + "grad_norm": 0.7186690893604176, + "learning_rate": 1.624266571961522e-06, + "loss": 0.0335, + "step": 29957 + }, + { + "epoch": 3.552472429740306, + "grad_norm": 0.40318216696219716, + "learning_rate": 1.6234155828037972e-06, + "loss": 0.0157, + "step": 29958 + }, + { + "epoch": 3.552591011502431, + "grad_norm": 0.4088844434067984, + "learning_rate": 1.6225648091495266e-06, + "loss": 0.0171, + "step": 29959 + }, + { + "epoch": 3.552709593264556, + "grad_norm": 0.6014233319504014, + "learning_rate": 1.6217142510065342e-06, + "loss": 0.0233, + "step": 29960 + }, + { + "epoch": 3.552828175026681, + "grad_norm": 0.6325520703068507, + "learning_rate": 1.6208639083826693e-06, + "loss": 0.0324, + "step": 29961 + }, + { + "epoch": 3.552946756788806, + "grad_norm": 0.5586233433904104, + "learning_rate": 1.6200137812857701e-06, + "loss": 0.0234, + "step": 29962 + }, + { + "epoch": 3.553065338550931, + "grad_norm": 0.4392821476936729, + "learning_rate": 1.6191638697236721e-06, + "loss": 0.0257, + "step": 29963 + }, + { + "epoch": 3.5531839203130557, + "grad_norm": 0.7398689062198018, + "learning_rate": 1.6183141737042163e-06, + "loss": 0.0287, + "step": 29964 + }, + { + "epoch": 3.553302502075181, + "grad_norm": 0.7087316739561472, + "learning_rate": 1.6174646932352294e-06, + "loss": 0.0332, + "step": 29965 + }, + { + "epoch": 3.5534210838373057, + "grad_norm": 0.6911970843534968, + "learning_rate": 1.6166154283245417e-06, + "loss": 0.0253, + "step": 29966 + }, + { + "epoch": 3.553539665599431, + "grad_norm": 0.6176333771047589, + "learning_rate": 1.6157663789799854e-06, + "loss": 0.0427, + "step": 29967 + }, + { + "epoch": 3.5536582473615557, + "grad_norm": 0.3478029549539082, + "learning_rate": 1.6149175452093933e-06, + "loss": 0.0191, + "step": 29968 + }, + { + "epoch": 3.553776829123681, + "grad_norm": 0.6018647652046477, + "learning_rate": 1.6140689270205788e-06, + "loss": 0.0256, + "step": 29969 + }, + { + "epoch": 3.5538954108858056, + "grad_norm": 1.062596683589643, + "learning_rate": 1.6132205244213716e-06, + "loss": 0.0531, + "step": 29970 + }, + { + "epoch": 3.554013992647931, + "grad_norm": 0.5785545681191002, + "learning_rate": 1.6123723374195932e-06, + "loss": 0.0222, + "step": 29971 + }, + { + "epoch": 3.5541325744100556, + "grad_norm": 1.135298567295015, + "learning_rate": 1.6115243660230622e-06, + "loss": 0.064, + "step": 29972 + }, + { + "epoch": 3.554251156172181, + "grad_norm": 0.5735655818436343, + "learning_rate": 1.6106766102395949e-06, + "loss": 0.0329, + "step": 29973 + }, + { + "epoch": 3.5543697379343056, + "grad_norm": 0.20924832487709397, + "learning_rate": 1.609829070077007e-06, + "loss": 0.0116, + "step": 29974 + }, + { + "epoch": 3.5544883196964308, + "grad_norm": 0.5269674014941343, + "learning_rate": 1.6089817455431172e-06, + "loss": 0.0242, + "step": 29975 + }, + { + "epoch": 3.5546069014585555, + "grad_norm": 0.5618217938536881, + "learning_rate": 1.608134636645728e-06, + "loss": 0.0273, + "step": 29976 + }, + { + "epoch": 3.5547254832206807, + "grad_norm": 0.8166111997006785, + "learning_rate": 1.607287743392652e-06, + "loss": 0.0554, + "step": 29977 + }, + { + "epoch": 3.554844064982806, + "grad_norm": 0.37541114300031697, + "learning_rate": 1.6064410657916946e-06, + "loss": 0.0131, + "step": 29978 + }, + { + "epoch": 3.5549626467449307, + "grad_norm": 0.3466812415782502, + "learning_rate": 1.6055946038506658e-06, + "loss": 0.0126, + "step": 29979 + }, + { + "epoch": 3.5550812285070554, + "grad_norm": 0.35927746607831496, + "learning_rate": 1.604748357577371e-06, + "loss": 0.0153, + "step": 29980 + }, + { + "epoch": 3.5551998102691806, + "grad_norm": 0.22567532861973727, + "learning_rate": 1.6039023269795978e-06, + "loss": 0.0113, + "step": 29981 + }, + { + "epoch": 3.555318392031306, + "grad_norm": 0.8692775933358761, + "learning_rate": 1.6030565120651654e-06, + "loss": 0.0349, + "step": 29982 + }, + { + "epoch": 3.5554369737934306, + "grad_norm": 0.46116516600696195, + "learning_rate": 1.6022109128418534e-06, + "loss": 0.0272, + "step": 29983 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 0.5383122555697191, + "learning_rate": 1.6013655293174722e-06, + "loss": 0.0227, + "step": 29984 + }, + { + "epoch": 3.5556741373176806, + "grad_norm": 0.7516918401642313, + "learning_rate": 1.6005203614997965e-06, + "loss": 0.0512, + "step": 29985 + }, + { + "epoch": 3.5557927190798058, + "grad_norm": 0.4724607816892919, + "learning_rate": 1.5996754093966365e-06, + "loss": 0.0272, + "step": 29986 + }, + { + "epoch": 3.5559113008419305, + "grad_norm": 0.7915142616075803, + "learning_rate": 1.598830673015772e-06, + "loss": 0.0354, + "step": 29987 + }, + { + "epoch": 3.5560298826040553, + "grad_norm": 0.329079008806292, + "learning_rate": 1.5979861523649913e-06, + "loss": 0.014, + "step": 29988 + }, + { + "epoch": 3.5561484643661805, + "grad_norm": 0.5377450554898543, + "learning_rate": 1.5971418474520827e-06, + "loss": 0.0203, + "step": 29989 + }, + { + "epoch": 3.5562670461283057, + "grad_norm": 0.4814357541977004, + "learning_rate": 1.596297758284826e-06, + "loss": 0.0232, + "step": 29990 + }, + { + "epoch": 3.5563856278904304, + "grad_norm": 0.4106003903344417, + "learning_rate": 1.5954538848710065e-06, + "loss": 0.0256, + "step": 29991 + }, + { + "epoch": 3.556504209652555, + "grad_norm": 0.807240381865862, + "learning_rate": 1.594610227218396e-06, + "loss": 0.0417, + "step": 29992 + }, + { + "epoch": 3.5566227914146804, + "grad_norm": 0.568221777996751, + "learning_rate": 1.5937667853347853e-06, + "loss": 0.0306, + "step": 29993 + }, + { + "epoch": 3.5567413731768056, + "grad_norm": 0.3165360637766609, + "learning_rate": 1.5929235592279379e-06, + "loss": 0.0152, + "step": 29994 + }, + { + "epoch": 3.5568599549389304, + "grad_norm": 0.5255532261684843, + "learning_rate": 1.5920805489056361e-06, + "loss": 0.0237, + "step": 29995 + }, + { + "epoch": 3.556978536701055, + "grad_norm": 0.45128055420581636, + "learning_rate": 1.591237754375638e-06, + "loss": 0.0245, + "step": 29996 + }, + { + "epoch": 3.5570971184631803, + "grad_norm": 0.5010781763217002, + "learning_rate": 1.5903951756457314e-06, + "loss": 0.0208, + "step": 29997 + }, + { + "epoch": 3.5572157002253055, + "grad_norm": 0.5525397571463377, + "learning_rate": 1.5895528127236687e-06, + "loss": 0.0311, + "step": 29998 + }, + { + "epoch": 3.5573342819874303, + "grad_norm": 0.5243811881358227, + "learning_rate": 1.588710665617224e-06, + "loss": 0.0233, + "step": 29999 + }, + { + "epoch": 3.557452863749555, + "grad_norm": 0.3692116089041567, + "learning_rate": 1.5878687343341553e-06, + "loss": 0.0206, + "step": 30000 + }, + { + "epoch": 3.5575714455116803, + "grad_norm": 0.35217078212237884, + "learning_rate": 1.5870270188822284e-06, + "loss": 0.0127, + "step": 30001 + }, + { + "epoch": 3.5576900272738055, + "grad_norm": 0.5238778502767756, + "learning_rate": 1.5861855192692066e-06, + "loss": 0.0277, + "step": 30002 + }, + { + "epoch": 3.55780860903593, + "grad_norm": 0.34041688069371806, + "learning_rate": 1.5853442355028336e-06, + "loss": 0.0148, + "step": 30003 + }, + { + "epoch": 3.5579271907980554, + "grad_norm": 0.5133758415713613, + "learning_rate": 1.584503167590881e-06, + "loss": 0.0236, + "step": 30004 + }, + { + "epoch": 3.55804577256018, + "grad_norm": 0.4645220097703041, + "learning_rate": 1.5836623155410901e-06, + "loss": 0.0195, + "step": 30005 + }, + { + "epoch": 3.5581643543223054, + "grad_norm": 0.8164833269348722, + "learning_rate": 1.5828216793612239e-06, + "loss": 0.0316, + "step": 30006 + }, + { + "epoch": 3.55828293608443, + "grad_norm": 0.4694432006666502, + "learning_rate": 1.5819812590590178e-06, + "loss": 0.0202, + "step": 30007 + }, + { + "epoch": 3.5584015178465553, + "grad_norm": 0.2659830114618431, + "learning_rate": 1.581141054642235e-06, + "loss": 0.0112, + "step": 30008 + }, + { + "epoch": 3.55852009960868, + "grad_norm": 0.4586838689754872, + "learning_rate": 1.5803010661186085e-06, + "loss": 0.0222, + "step": 30009 + }, + { + "epoch": 3.5586386813708053, + "grad_norm": 0.5397377508827675, + "learning_rate": 1.5794612934958874e-06, + "loss": 0.0178, + "step": 30010 + }, + { + "epoch": 3.55875726313293, + "grad_norm": 0.5392763788022126, + "learning_rate": 1.578621736781813e-06, + "loss": 0.0244, + "step": 30011 + }, + { + "epoch": 3.5588758448950553, + "grad_norm": 0.4140569400246805, + "learning_rate": 1.577782395984126e-06, + "loss": 0.0194, + "step": 30012 + }, + { + "epoch": 3.55899442665718, + "grad_norm": 0.9388399589066896, + "learning_rate": 1.5769432711105675e-06, + "loss": 0.0242, + "step": 30013 + }, + { + "epoch": 3.5591130084193052, + "grad_norm": 0.4058999737148781, + "learning_rate": 1.5761043621688592e-06, + "loss": 0.0176, + "step": 30014 + }, + { + "epoch": 3.55923159018143, + "grad_norm": 0.3546123218547797, + "learning_rate": 1.5752656691667533e-06, + "loss": 0.0175, + "step": 30015 + }, + { + "epoch": 3.559350171943555, + "grad_norm": 0.5312485239492029, + "learning_rate": 1.5744271921119685e-06, + "loss": 0.0248, + "step": 30016 + }, + { + "epoch": 3.55946875370568, + "grad_norm": 0.5465577417705304, + "learning_rate": 1.573588931012243e-06, + "loss": 0.0224, + "step": 30017 + }, + { + "epoch": 3.559587335467805, + "grad_norm": 0.7322253658273026, + "learning_rate": 1.5727508858752904e-06, + "loss": 0.0354, + "step": 30018 + }, + { + "epoch": 3.55970591722993, + "grad_norm": 0.38806517777266897, + "learning_rate": 1.571913056708857e-06, + "loss": 0.0205, + "step": 30019 + }, + { + "epoch": 3.559824498992055, + "grad_norm": 0.7409219957793765, + "learning_rate": 1.5710754435206477e-06, + "loss": 0.0454, + "step": 30020 + }, + { + "epoch": 3.55994308075418, + "grad_norm": 0.7082368068433967, + "learning_rate": 1.5702380463183952e-06, + "loss": 0.0373, + "step": 30021 + }, + { + "epoch": 3.560061662516305, + "grad_norm": 0.4963651393255425, + "learning_rate": 1.5694008651098185e-06, + "loss": 0.0208, + "step": 30022 + }, + { + "epoch": 3.56018024427843, + "grad_norm": 0.48267698631423234, + "learning_rate": 1.5685638999026308e-06, + "loss": 0.0203, + "step": 30023 + }, + { + "epoch": 3.560298826040555, + "grad_norm": 0.37701542961131057, + "learning_rate": 1.5677271507045538e-06, + "loss": 0.0146, + "step": 30024 + }, + { + "epoch": 3.56041740780268, + "grad_norm": 0.4920119277417836, + "learning_rate": 1.566890617523295e-06, + "loss": 0.0256, + "step": 30025 + }, + { + "epoch": 3.560535989564805, + "grad_norm": 0.5556869350704297, + "learning_rate": 1.5660543003665679e-06, + "loss": 0.0248, + "step": 30026 + }, + { + "epoch": 3.56065457132693, + "grad_norm": 0.6174871403036227, + "learning_rate": 1.5652181992420855e-06, + "loss": 0.0229, + "step": 30027 + }, + { + "epoch": 3.560773153089055, + "grad_norm": 0.4864580860819047, + "learning_rate": 1.564382314157556e-06, + "loss": 0.0287, + "step": 30028 + }, + { + "epoch": 3.5608917348511797, + "grad_norm": 0.45340356659461445, + "learning_rate": 1.5635466451206754e-06, + "loss": 0.0219, + "step": 30029 + }, + { + "epoch": 3.561010316613305, + "grad_norm": 0.5582507038640341, + "learning_rate": 1.5627111921391629e-06, + "loss": 0.0309, + "step": 30030 + }, + { + "epoch": 3.56112889837543, + "grad_norm": 0.43234621698050907, + "learning_rate": 1.5618759552207096e-06, + "loss": 0.0212, + "step": 30031 + }, + { + "epoch": 3.561247480137555, + "grad_norm": 0.7471332561130336, + "learning_rate": 1.5610409343730176e-06, + "loss": 0.0226, + "step": 30032 + }, + { + "epoch": 3.5613660618996796, + "grad_norm": 0.3845422930992347, + "learning_rate": 1.5602061296037863e-06, + "loss": 0.0152, + "step": 30033 + }, + { + "epoch": 3.561484643661805, + "grad_norm": 0.49760888615140814, + "learning_rate": 1.5593715409207093e-06, + "loss": 0.0258, + "step": 30034 + }, + { + "epoch": 3.56160322542393, + "grad_norm": 0.5130249489254282, + "learning_rate": 1.5585371683314865e-06, + "loss": 0.032, + "step": 30035 + }, + { + "epoch": 3.561721807186055, + "grad_norm": 0.4225825107885747, + "learning_rate": 1.5577030118438002e-06, + "loss": 0.0152, + "step": 30036 + }, + { + "epoch": 3.5618403889481796, + "grad_norm": 0.8256545379696574, + "learning_rate": 1.5568690714653472e-06, + "loss": 0.0401, + "step": 30037 + }, + { + "epoch": 3.5619589707103048, + "grad_norm": 0.5560339326913208, + "learning_rate": 1.556035347203813e-06, + "loss": 0.0231, + "step": 30038 + }, + { + "epoch": 3.56207755247243, + "grad_norm": 0.6385053514559972, + "learning_rate": 1.5552018390668832e-06, + "loss": 0.036, + "step": 30039 + }, + { + "epoch": 3.5621961342345547, + "grad_norm": 0.435340152351553, + "learning_rate": 1.554368547062246e-06, + "loss": 0.0197, + "step": 30040 + }, + { + "epoch": 3.5623147159966795, + "grad_norm": 0.4090590185530221, + "learning_rate": 1.553535471197573e-06, + "loss": 0.0165, + "step": 30041 + }, + { + "epoch": 3.5624332977588047, + "grad_norm": 0.9694452014596588, + "learning_rate": 1.5527026114805582e-06, + "loss": 0.0294, + "step": 30042 + }, + { + "epoch": 3.56255187952093, + "grad_norm": 0.7191129483617982, + "learning_rate": 1.5518699679188676e-06, + "loss": 0.0223, + "step": 30043 + }, + { + "epoch": 3.5626704612830546, + "grad_norm": 0.502164659818671, + "learning_rate": 1.5510375405201838e-06, + "loss": 0.0237, + "step": 30044 + }, + { + "epoch": 3.5627890430451794, + "grad_norm": 0.5763228980028721, + "learning_rate": 1.5502053292921787e-06, + "loss": 0.0273, + "step": 30045 + }, + { + "epoch": 3.5629076248073046, + "grad_norm": 0.5189930726253942, + "learning_rate": 1.5493733342425265e-06, + "loss": 0.0253, + "step": 30046 + }, + { + "epoch": 3.56302620656943, + "grad_norm": 0.5882753359327298, + "learning_rate": 1.5485415553788935e-06, + "loss": 0.0236, + "step": 30047 + }, + { + "epoch": 3.5631447883315546, + "grad_norm": 0.6101135964047424, + "learning_rate": 1.5477099927089484e-06, + "loss": 0.0433, + "step": 30048 + }, + { + "epoch": 3.5632633700936793, + "grad_norm": 0.7599953128610448, + "learning_rate": 1.5468786462403572e-06, + "loss": 0.0284, + "step": 30049 + }, + { + "epoch": 3.5633819518558045, + "grad_norm": 0.7518660190047474, + "learning_rate": 1.5460475159807864e-06, + "loss": 0.0248, + "step": 30050 + }, + { + "epoch": 3.5635005336179297, + "grad_norm": 0.4318573214262153, + "learning_rate": 1.5452166019378989e-06, + "loss": 0.0236, + "step": 30051 + }, + { + "epoch": 3.5636191153800545, + "grad_norm": 0.5105526416540964, + "learning_rate": 1.5443859041193443e-06, + "loss": 0.0239, + "step": 30052 + }, + { + "epoch": 3.5637376971421797, + "grad_norm": 0.584251032917032, + "learning_rate": 1.5435554225327969e-06, + "loss": 0.0238, + "step": 30053 + }, + { + "epoch": 3.5638562789043045, + "grad_norm": 0.5910906587720097, + "learning_rate": 1.5427251571859008e-06, + "loss": 0.0269, + "step": 30054 + }, + { + "epoch": 3.5639748606664297, + "grad_norm": 0.4586741574125221, + "learning_rate": 1.5418951080863165e-06, + "loss": 0.0161, + "step": 30055 + }, + { + "epoch": 3.5640934424285544, + "grad_norm": 0.5220581214778582, + "learning_rate": 1.5410652752416876e-06, + "loss": 0.0273, + "step": 30056 + }, + { + "epoch": 3.5642120241906796, + "grad_norm": 0.5628344719363101, + "learning_rate": 1.540235658659675e-06, + "loss": 0.0273, + "step": 30057 + }, + { + "epoch": 3.5643306059528044, + "grad_norm": 0.5342029969866551, + "learning_rate": 1.539406258347917e-06, + "loss": 0.0246, + "step": 30058 + }, + { + "epoch": 3.5644491877149296, + "grad_norm": 0.557613239837276, + "learning_rate": 1.5385770743140654e-06, + "loss": 0.0394, + "step": 30059 + }, + { + "epoch": 3.5645677694770543, + "grad_norm": 0.4546104611622254, + "learning_rate": 1.5377481065657646e-06, + "loss": 0.0256, + "step": 30060 + }, + { + "epoch": 3.5646863512391795, + "grad_norm": 0.33253951876902177, + "learning_rate": 1.5369193551106526e-06, + "loss": 0.0131, + "step": 30061 + }, + { + "epoch": 3.5648049330013043, + "grad_norm": 0.36376880000087486, + "learning_rate": 1.5360908199563763e-06, + "loss": 0.0137, + "step": 30062 + }, + { + "epoch": 3.5649235147634295, + "grad_norm": 0.9307190055754025, + "learning_rate": 1.5352625011105598e-06, + "loss": 0.0366, + "step": 30063 + }, + { + "epoch": 3.5650420965255543, + "grad_norm": 0.4372689487175602, + "learning_rate": 1.5344343985808585e-06, + "loss": 0.0163, + "step": 30064 + }, + { + "epoch": 3.5651606782876795, + "grad_norm": 0.4242021130486182, + "learning_rate": 1.533606512374891e-06, + "loss": 0.0189, + "step": 30065 + }, + { + "epoch": 3.565279260049804, + "grad_norm": 0.5302353913096258, + "learning_rate": 1.5327788425003014e-06, + "loss": 0.0277, + "step": 30066 + }, + { + "epoch": 3.5653978418119294, + "grad_norm": 0.4554240038071623, + "learning_rate": 1.531951388964703e-06, + "loss": 0.021, + "step": 30067 + }, + { + "epoch": 3.565516423574054, + "grad_norm": 0.3135853347379759, + "learning_rate": 1.5311241517757453e-06, + "loss": 0.0175, + "step": 30068 + }, + { + "epoch": 3.5656350053361794, + "grad_norm": 0.48671369992284946, + "learning_rate": 1.5302971309410358e-06, + "loss": 0.0253, + "step": 30069 + }, + { + "epoch": 3.565753587098304, + "grad_norm": 0.7548377620477537, + "learning_rate": 1.5294703264682102e-06, + "loss": 0.0372, + "step": 30070 + }, + { + "epoch": 3.5658721688604293, + "grad_norm": 0.5240169255414604, + "learning_rate": 1.528643738364885e-06, + "loss": 0.0287, + "step": 30071 + }, + { + "epoch": 3.565990750622554, + "grad_norm": 0.5095023296783273, + "learning_rate": 1.5278173666386814e-06, + "loss": 0.0177, + "step": 30072 + }, + { + "epoch": 3.5661093323846793, + "grad_norm": 0.46254119973484825, + "learning_rate": 1.5269912112972213e-06, + "loss": 0.0191, + "step": 30073 + }, + { + "epoch": 3.566227914146804, + "grad_norm": 0.7684736664760576, + "learning_rate": 1.5261652723481125e-06, + "loss": 0.0481, + "step": 30074 + }, + { + "epoch": 3.5663464959089293, + "grad_norm": 0.28601561381516194, + "learning_rate": 1.525339549798982e-06, + "loss": 0.0107, + "step": 30075 + }, + { + "epoch": 3.566465077671054, + "grad_norm": 0.41611206116061633, + "learning_rate": 1.5245140436574267e-06, + "loss": 0.0159, + "step": 30076 + }, + { + "epoch": 3.5665836594331792, + "grad_norm": 0.8008433733582215, + "learning_rate": 1.5236887539310712e-06, + "loss": 0.0326, + "step": 30077 + }, + { + "epoch": 3.566702241195304, + "grad_norm": 0.35424824213375455, + "learning_rate": 1.5228636806275094e-06, + "loss": 0.0137, + "step": 30078 + }, + { + "epoch": 3.566820822957429, + "grad_norm": 0.9087045971112346, + "learning_rate": 1.5220388237543626e-06, + "loss": 0.0399, + "step": 30079 + }, + { + "epoch": 3.5669394047195544, + "grad_norm": 0.5869437234910722, + "learning_rate": 1.5212141833192222e-06, + "loss": 0.0313, + "step": 30080 + }, + { + "epoch": 3.567057986481679, + "grad_norm": 0.44236554207531187, + "learning_rate": 1.5203897593296962e-06, + "loss": 0.023, + "step": 30081 + }, + { + "epoch": 3.567176568243804, + "grad_norm": 0.26767154018734, + "learning_rate": 1.5195655517933837e-06, + "loss": 0.0074, + "step": 30082 + }, + { + "epoch": 3.567295150005929, + "grad_norm": 0.6405047289384412, + "learning_rate": 1.5187415607178845e-06, + "loss": 0.0277, + "step": 30083 + }, + { + "epoch": 3.5674137317680543, + "grad_norm": 0.6072315629008704, + "learning_rate": 1.5179177861107951e-06, + "loss": 0.0233, + "step": 30084 + }, + { + "epoch": 3.567532313530179, + "grad_norm": 0.310058871217455, + "learning_rate": 1.5170942279797069e-06, + "loss": 0.0145, + "step": 30085 + }, + { + "epoch": 3.567650895292304, + "grad_norm": 0.44351874196828783, + "learning_rate": 1.5162708863322106e-06, + "loss": 0.0182, + "step": 30086 + }, + { + "epoch": 3.567769477054429, + "grad_norm": 0.6265170292991432, + "learning_rate": 1.5154477611759004e-06, + "loss": 0.0245, + "step": 30087 + }, + { + "epoch": 3.5678880588165542, + "grad_norm": 0.6072798526011636, + "learning_rate": 1.5146248525183622e-06, + "loss": 0.0331, + "step": 30088 + }, + { + "epoch": 3.568006640578679, + "grad_norm": 0.4599697646723456, + "learning_rate": 1.5138021603671837e-06, + "loss": 0.0244, + "step": 30089 + }, + { + "epoch": 3.5681252223408038, + "grad_norm": 0.5250927423579496, + "learning_rate": 1.5129796847299482e-06, + "loss": 0.0219, + "step": 30090 + }, + { + "epoch": 3.568243804102929, + "grad_norm": 0.39220686301185403, + "learning_rate": 1.512157425614244e-06, + "loss": 0.0145, + "step": 30091 + }, + { + "epoch": 3.568362385865054, + "grad_norm": 0.2997802290036812, + "learning_rate": 1.51133538302764e-06, + "loss": 0.0096, + "step": 30092 + }, + { + "epoch": 3.568480967627179, + "grad_norm": 0.40040458714869887, + "learning_rate": 1.5105135569777217e-06, + "loss": 0.0214, + "step": 30093 + }, + { + "epoch": 3.5685995493893037, + "grad_norm": 0.7387450876434745, + "learning_rate": 1.509691947472061e-06, + "loss": 0.0285, + "step": 30094 + }, + { + "epoch": 3.568718131151429, + "grad_norm": 0.5098867407258862, + "learning_rate": 1.5088705545182408e-06, + "loss": 0.0218, + "step": 30095 + }, + { + "epoch": 3.568836712913554, + "grad_norm": 0.39273462304362505, + "learning_rate": 1.5080493781238241e-06, + "loss": 0.0205, + "step": 30096 + }, + { + "epoch": 3.568955294675679, + "grad_norm": 0.8524243710150383, + "learning_rate": 1.507228418296383e-06, + "loss": 0.0262, + "step": 30097 + }, + { + "epoch": 3.5690738764378036, + "grad_norm": 0.42771895411959926, + "learning_rate": 1.506407675043489e-06, + "loss": 0.0169, + "step": 30098 + }, + { + "epoch": 3.569192458199929, + "grad_norm": 0.3865740886145615, + "learning_rate": 1.505587148372703e-06, + "loss": 0.0197, + "step": 30099 + }, + { + "epoch": 3.569311039962054, + "grad_norm": 0.42565484679495663, + "learning_rate": 1.5047668382915963e-06, + "loss": 0.0121, + "step": 30100 + }, + { + "epoch": 3.5694296217241788, + "grad_norm": 0.7512441249659887, + "learning_rate": 1.5039467448077243e-06, + "loss": 0.0381, + "step": 30101 + }, + { + "epoch": 3.569548203486304, + "grad_norm": 0.3594417272262605, + "learning_rate": 1.5031268679286558e-06, + "loss": 0.0162, + "step": 30102 + }, + { + "epoch": 3.5696667852484287, + "grad_norm": 0.480056404015315, + "learning_rate": 1.5023072076619404e-06, + "loss": 0.021, + "step": 30103 + }, + { + "epoch": 3.569785367010554, + "grad_norm": 0.5793711386099695, + "learning_rate": 1.5014877640151386e-06, + "loss": 0.0272, + "step": 30104 + }, + { + "epoch": 3.5699039487726787, + "grad_norm": 0.3908351447146127, + "learning_rate": 1.500668536995803e-06, + "loss": 0.026, + "step": 30105 + }, + { + "epoch": 3.570022530534804, + "grad_norm": 0.46288861116315444, + "learning_rate": 1.499849526611491e-06, + "loss": 0.0184, + "step": 30106 + }, + { + "epoch": 3.5701411122969287, + "grad_norm": 0.6686129818432112, + "learning_rate": 1.4990307328697439e-06, + "loss": 0.0278, + "step": 30107 + }, + { + "epoch": 3.570259694059054, + "grad_norm": 0.48582555044203174, + "learning_rate": 1.4982121557781142e-06, + "loss": 0.0183, + "step": 30108 + }, + { + "epoch": 3.5703782758211786, + "grad_norm": 0.345321718238118, + "learning_rate": 1.4973937953441513e-06, + "loss": 0.0156, + "step": 30109 + }, + { + "epoch": 3.570496857583304, + "grad_norm": 0.7227905898507918, + "learning_rate": 1.4965756515753938e-06, + "loss": 0.0338, + "step": 30110 + }, + { + "epoch": 3.5706154393454286, + "grad_norm": 0.5009105534581844, + "learning_rate": 1.4957577244793907e-06, + "loss": 0.0253, + "step": 30111 + }, + { + "epoch": 3.570734021107554, + "grad_norm": 0.525567375141117, + "learning_rate": 1.4949400140636727e-06, + "loss": 0.0241, + "step": 30112 + }, + { + "epoch": 3.5708526028696785, + "grad_norm": 0.8812447370347065, + "learning_rate": 1.4941225203357918e-06, + "loss": 0.0382, + "step": 30113 + }, + { + "epoch": 3.5709711846318037, + "grad_norm": 0.8522853630355219, + "learning_rate": 1.4933052433032723e-06, + "loss": 0.0439, + "step": 30114 + }, + { + "epoch": 3.5710897663939285, + "grad_norm": 1.0313134683296077, + "learning_rate": 1.4924881829736504e-06, + "loss": 0.0453, + "step": 30115 + }, + { + "epoch": 3.5712083481560537, + "grad_norm": 0.5798696775410367, + "learning_rate": 1.491671339354464e-06, + "loss": 0.0252, + "step": 30116 + }, + { + "epoch": 3.5713269299181785, + "grad_norm": 0.5911808416844393, + "learning_rate": 1.4908547124532408e-06, + "loss": 0.0223, + "step": 30117 + }, + { + "epoch": 3.5714455116803037, + "grad_norm": 0.5835053489910308, + "learning_rate": 1.4900383022775076e-06, + "loss": 0.0247, + "step": 30118 + }, + { + "epoch": 3.5715640934424284, + "grad_norm": 0.5183484702422192, + "learning_rate": 1.4892221088347897e-06, + "loss": 0.0237, + "step": 30119 + }, + { + "epoch": 3.5716826752045536, + "grad_norm": 0.4117125617169833, + "learning_rate": 1.4884061321326136e-06, + "loss": 0.0151, + "step": 30120 + }, + { + "epoch": 3.5718012569666784, + "grad_norm": 0.3297757490748373, + "learning_rate": 1.487590372178499e-06, + "loss": 0.0097, + "step": 30121 + }, + { + "epoch": 3.5719198387288036, + "grad_norm": 0.5356220423522735, + "learning_rate": 1.4867748289799755e-06, + "loss": 0.0245, + "step": 30122 + }, + { + "epoch": 3.5720384204909283, + "grad_norm": 0.41454064347213443, + "learning_rate": 1.4859595025445455e-06, + "loss": 0.027, + "step": 30123 + }, + { + "epoch": 3.5721570022530535, + "grad_norm": 0.4739533544103678, + "learning_rate": 1.4851443928797393e-06, + "loss": 0.0286, + "step": 30124 + }, + { + "epoch": 3.5722755840151783, + "grad_norm": 0.45275413877596293, + "learning_rate": 1.4843294999930645e-06, + "loss": 0.0153, + "step": 30125 + }, + { + "epoch": 3.5723941657773035, + "grad_norm": 0.48844293020472607, + "learning_rate": 1.4835148238920377e-06, + "loss": 0.0264, + "step": 30126 + }, + { + "epoch": 3.5725127475394283, + "grad_norm": 0.34613397956225567, + "learning_rate": 1.4827003645841608e-06, + "loss": 0.0143, + "step": 30127 + }, + { + "epoch": 3.5726313293015535, + "grad_norm": 0.6760761093402609, + "learning_rate": 1.4818861220769532e-06, + "loss": 0.0369, + "step": 30128 + }, + { + "epoch": 3.5727499110636787, + "grad_norm": 0.5572199007616827, + "learning_rate": 1.481072096377914e-06, + "loss": 0.0222, + "step": 30129 + }, + { + "epoch": 3.5728684928258034, + "grad_norm": 0.49660146981027625, + "learning_rate": 1.4802582874945487e-06, + "loss": 0.0224, + "step": 30130 + }, + { + "epoch": 3.572987074587928, + "grad_norm": 0.45475625589863317, + "learning_rate": 1.4794446954343593e-06, + "loss": 0.0215, + "step": 30131 + }, + { + "epoch": 3.5731056563500534, + "grad_norm": 0.5057242214404188, + "learning_rate": 1.4786313202048458e-06, + "loss": 0.0243, + "step": 30132 + }, + { + "epoch": 3.5732242381121786, + "grad_norm": 0.6308129775631055, + "learning_rate": 1.4778181618135129e-06, + "loss": 0.0279, + "step": 30133 + }, + { + "epoch": 3.5733428198743034, + "grad_norm": 0.6374663888763795, + "learning_rate": 1.4770052202678436e-06, + "loss": 0.0318, + "step": 30134 + }, + { + "epoch": 3.573461401636428, + "grad_norm": 0.4916321295336519, + "learning_rate": 1.4761924955753488e-06, + "loss": 0.0228, + "step": 30135 + }, + { + "epoch": 3.5735799833985533, + "grad_norm": 0.3904353696508918, + "learning_rate": 1.4753799877435082e-06, + "loss": 0.0218, + "step": 30136 + }, + { + "epoch": 3.5736985651606785, + "grad_norm": 0.5444669308436115, + "learning_rate": 1.4745676967798162e-06, + "loss": 0.0269, + "step": 30137 + }, + { + "epoch": 3.5738171469228033, + "grad_norm": 0.5495992000230598, + "learning_rate": 1.473755622691761e-06, + "loss": 0.0259, + "step": 30138 + }, + { + "epoch": 3.573935728684928, + "grad_norm": 0.521420353414674, + "learning_rate": 1.472943765486831e-06, + "loss": 0.0216, + "step": 30139 + }, + { + "epoch": 3.5740543104470532, + "grad_norm": 0.6099362093522628, + "learning_rate": 1.4721321251725095e-06, + "loss": 0.0314, + "step": 30140 + }, + { + "epoch": 3.5741728922091784, + "grad_norm": 0.36488737102495644, + "learning_rate": 1.4713207017562764e-06, + "loss": 0.016, + "step": 30141 + }, + { + "epoch": 3.574291473971303, + "grad_norm": 0.6617017931176538, + "learning_rate": 1.4705094952456143e-06, + "loss": 0.0332, + "step": 30142 + }, + { + "epoch": 3.574410055733428, + "grad_norm": 0.7460817069033274, + "learning_rate": 1.4696985056479985e-06, + "loss": 0.042, + "step": 30143 + }, + { + "epoch": 3.574528637495553, + "grad_norm": 0.664507818082829, + "learning_rate": 1.4688877329709138e-06, + "loss": 0.0311, + "step": 30144 + }, + { + "epoch": 3.5746472192576784, + "grad_norm": 1.0108894937967527, + "learning_rate": 1.4680771772218187e-06, + "loss": 0.0431, + "step": 30145 + }, + { + "epoch": 3.574765801019803, + "grad_norm": 0.5700142121433461, + "learning_rate": 1.4672668384082043e-06, + "loss": 0.0368, + "step": 30146 + }, + { + "epoch": 3.574884382781928, + "grad_norm": 0.34182165738566817, + "learning_rate": 1.4664567165375288e-06, + "loss": 0.0162, + "step": 30147 + }, + { + "epoch": 3.575002964544053, + "grad_norm": 0.9671365344356334, + "learning_rate": 1.4656468116172605e-06, + "loss": 0.0383, + "step": 30148 + }, + { + "epoch": 3.5751215463061783, + "grad_norm": 0.6167850941511666, + "learning_rate": 1.464837123654872e-06, + "loss": 0.0338, + "step": 30149 + }, + { + "epoch": 3.575240128068303, + "grad_norm": 0.5634191159746649, + "learning_rate": 1.4640276526578234e-06, + "loss": 0.0244, + "step": 30150 + }, + { + "epoch": 3.575358709830428, + "grad_norm": 0.4786720188286685, + "learning_rate": 1.4632183986335811e-06, + "loss": 0.0267, + "step": 30151 + }, + { + "epoch": 3.575477291592553, + "grad_norm": 0.9356357118991877, + "learning_rate": 1.4624093615896007e-06, + "loss": 0.0434, + "step": 30152 + }, + { + "epoch": 3.575595873354678, + "grad_norm": 0.3670885514404565, + "learning_rate": 1.4616005415333394e-06, + "loss": 0.0165, + "step": 30153 + }, + { + "epoch": 3.575714455116803, + "grad_norm": 0.32157747586531715, + "learning_rate": 1.4607919384722584e-06, + "loss": 0.0156, + "step": 30154 + }, + { + "epoch": 3.575833036878928, + "grad_norm": 0.7341527050643404, + "learning_rate": 1.4599835524138157e-06, + "loss": 0.0419, + "step": 30155 + }, + { + "epoch": 3.575951618641053, + "grad_norm": 0.5863377848366002, + "learning_rate": 1.459175383365452e-06, + "loss": 0.0282, + "step": 30156 + }, + { + "epoch": 3.576070200403178, + "grad_norm": 0.4971885698719047, + "learning_rate": 1.458367431334623e-06, + "loss": 0.023, + "step": 30157 + }, + { + "epoch": 3.576188782165303, + "grad_norm": 0.31312107103703557, + "learning_rate": 1.457559696328778e-06, + "loss": 0.0105, + "step": 30158 + }, + { + "epoch": 3.576307363927428, + "grad_norm": 0.4588022985015596, + "learning_rate": 1.456752178355364e-06, + "loss": 0.0214, + "step": 30159 + }, + { + "epoch": 3.576425945689553, + "grad_norm": 0.6436649723503096, + "learning_rate": 1.4559448774218222e-06, + "loss": 0.0298, + "step": 30160 + }, + { + "epoch": 3.576544527451678, + "grad_norm": 0.6078981068915077, + "learning_rate": 1.4551377935355993e-06, + "loss": 0.0254, + "step": 30161 + }, + { + "epoch": 3.576663109213803, + "grad_norm": 0.46483401917500616, + "learning_rate": 1.4543309267041338e-06, + "loss": 0.022, + "step": 30162 + }, + { + "epoch": 3.576781690975928, + "grad_norm": 0.47836529241323295, + "learning_rate": 1.4535242769348644e-06, + "loss": 0.0165, + "step": 30163 + }, + { + "epoch": 3.5769002727380528, + "grad_norm": 0.36808681874271565, + "learning_rate": 1.452717844235224e-06, + "loss": 0.0148, + "step": 30164 + }, + { + "epoch": 3.577018854500178, + "grad_norm": 0.6177649321783102, + "learning_rate": 1.4519116286126482e-06, + "loss": 0.0306, + "step": 30165 + }, + { + "epoch": 3.5771374362623027, + "grad_norm": 0.6915113028904264, + "learning_rate": 1.4511056300745756e-06, + "loss": 0.0362, + "step": 30166 + }, + { + "epoch": 3.577256018024428, + "grad_norm": 0.33442447208556253, + "learning_rate": 1.4502998486284281e-06, + "loss": 0.0125, + "step": 30167 + }, + { + "epoch": 3.5773745997865527, + "grad_norm": 0.5994882880616493, + "learning_rate": 1.4494942842816383e-06, + "loss": 0.0313, + "step": 30168 + }, + { + "epoch": 3.577493181548678, + "grad_norm": 0.7621724754153495, + "learning_rate": 1.4486889370416312e-06, + "loss": 0.0262, + "step": 30169 + }, + { + "epoch": 3.5776117633108027, + "grad_norm": 0.39499502354252297, + "learning_rate": 1.4478838069158314e-06, + "loss": 0.0159, + "step": 30170 + }, + { + "epoch": 3.577730345072928, + "grad_norm": 0.6173366072348511, + "learning_rate": 1.4470788939116631e-06, + "loss": 0.0349, + "step": 30171 + }, + { + "epoch": 3.5778489268350526, + "grad_norm": 0.55722416533573, + "learning_rate": 1.4462741980365402e-06, + "loss": 0.0334, + "step": 30172 + }, + { + "epoch": 3.577967508597178, + "grad_norm": 0.5343985384985563, + "learning_rate": 1.4454697192978927e-06, + "loss": 0.0267, + "step": 30173 + }, + { + "epoch": 3.5780860903593026, + "grad_norm": 0.4232612456446319, + "learning_rate": 1.444665457703126e-06, + "loss": 0.0217, + "step": 30174 + }, + { + "epoch": 3.578204672121428, + "grad_norm": 0.5796007099746037, + "learning_rate": 1.4438614132596562e-06, + "loss": 0.0262, + "step": 30175 + }, + { + "epoch": 3.5783232538835525, + "grad_norm": 0.5292459635485008, + "learning_rate": 1.4430575859748995e-06, + "loss": 0.0194, + "step": 30176 + }, + { + "epoch": 3.5784418356456777, + "grad_norm": 0.4033338247622254, + "learning_rate": 1.4422539758562698e-06, + "loss": 0.0159, + "step": 30177 + }, + { + "epoch": 3.578560417407803, + "grad_norm": 0.5465657061199971, + "learning_rate": 1.4414505829111635e-06, + "loss": 0.0276, + "step": 30178 + }, + { + "epoch": 3.5786789991699277, + "grad_norm": 0.5914487912869928, + "learning_rate": 1.4406474071469945e-06, + "loss": 0.0327, + "step": 30179 + }, + { + "epoch": 3.5787975809320525, + "grad_norm": 0.5613148305920457, + "learning_rate": 1.4398444485711677e-06, + "loss": 0.0261, + "step": 30180 + }, + { + "epoch": 3.5789161626941777, + "grad_norm": 0.4398068291148326, + "learning_rate": 1.43904170719108e-06, + "loss": 0.0219, + "step": 30181 + }, + { + "epoch": 3.579034744456303, + "grad_norm": 0.4713130203991182, + "learning_rate": 1.4382391830141423e-06, + "loss": 0.0274, + "step": 30182 + }, + { + "epoch": 3.5791533262184276, + "grad_norm": 0.7047375132792658, + "learning_rate": 1.4374368760477374e-06, + "loss": 0.0369, + "step": 30183 + }, + { + "epoch": 3.5792719079805524, + "grad_norm": 0.45968419961745155, + "learning_rate": 1.4366347862992763e-06, + "loss": 0.0227, + "step": 30184 + }, + { + "epoch": 3.5793904897426776, + "grad_norm": 0.723586962711405, + "learning_rate": 1.4358329137761472e-06, + "loss": 0.0331, + "step": 30185 + }, + { + "epoch": 3.579509071504803, + "grad_norm": 0.5773694103199726, + "learning_rate": 1.435031258485739e-06, + "loss": 0.0281, + "step": 30186 + }, + { + "epoch": 3.5796276532669276, + "grad_norm": 0.7330006163560095, + "learning_rate": 1.4342298204354454e-06, + "loss": 0.0309, + "step": 30187 + }, + { + "epoch": 3.5797462350290523, + "grad_norm": 0.5705001862591704, + "learning_rate": 1.4334285996326553e-06, + "loss": 0.0269, + "step": 30188 + }, + { + "epoch": 3.5798648167911775, + "grad_norm": 0.44102317275747777, + "learning_rate": 1.4326275960847596e-06, + "loss": 0.0225, + "step": 30189 + }, + { + "epoch": 3.5799833985533027, + "grad_norm": 0.5429712416916096, + "learning_rate": 1.4318268097991277e-06, + "loss": 0.0245, + "step": 30190 + }, + { + "epoch": 3.5801019803154275, + "grad_norm": 0.7053375790836898, + "learning_rate": 1.4310262407831593e-06, + "loss": 0.038, + "step": 30191 + }, + { + "epoch": 3.5802205620775522, + "grad_norm": 0.34586628547433695, + "learning_rate": 1.4302258890442233e-06, + "loss": 0.0164, + "step": 30192 + }, + { + "epoch": 3.5803391438396774, + "grad_norm": 0.6424959924922977, + "learning_rate": 1.4294257545897055e-06, + "loss": 0.0266, + "step": 30193 + }, + { + "epoch": 3.5804577256018026, + "grad_norm": 0.7086249689804694, + "learning_rate": 1.4286258374269696e-06, + "loss": 0.0248, + "step": 30194 + }, + { + "epoch": 3.5805763073639274, + "grad_norm": 0.6142599354769825, + "learning_rate": 1.4278261375634067e-06, + "loss": 0.0253, + "step": 30195 + }, + { + "epoch": 3.580694889126052, + "grad_norm": 0.4231629030790049, + "learning_rate": 1.4270266550063776e-06, + "loss": 0.0193, + "step": 30196 + }, + { + "epoch": 3.5808134708881774, + "grad_norm": 0.4001135938926872, + "learning_rate": 1.4262273897632543e-06, + "loss": 0.0193, + "step": 30197 + }, + { + "epoch": 3.5809320526503026, + "grad_norm": 0.466162310101244, + "learning_rate": 1.425428341841406e-06, + "loss": 0.0185, + "step": 30198 + }, + { + "epoch": 3.5810506344124273, + "grad_norm": 0.6478625938498463, + "learning_rate": 1.4246295112482017e-06, + "loss": 0.0326, + "step": 30199 + }, + { + "epoch": 3.581169216174552, + "grad_norm": 0.2168925023765598, + "learning_rate": 1.4238308979910048e-06, + "loss": 0.0074, + "step": 30200 + }, + { + "epoch": 3.5812877979366773, + "grad_norm": 0.5316924361051629, + "learning_rate": 1.4230325020771706e-06, + "loss": 0.0231, + "step": 30201 + }, + { + "epoch": 3.5814063796988025, + "grad_norm": 0.3983785733450617, + "learning_rate": 1.422234323514071e-06, + "loss": 0.0211, + "step": 30202 + }, + { + "epoch": 3.5815249614609272, + "grad_norm": 0.6979182831063909, + "learning_rate": 1.4214363623090531e-06, + "loss": 0.0365, + "step": 30203 + }, + { + "epoch": 3.5816435432230525, + "grad_norm": 0.4654900933798464, + "learning_rate": 1.4206386184694832e-06, + "loss": 0.0223, + "step": 30204 + }, + { + "epoch": 3.581762124985177, + "grad_norm": 0.37856064669224276, + "learning_rate": 1.4198410920027022e-06, + "loss": 0.012, + "step": 30205 + }, + { + "epoch": 3.5818807067473024, + "grad_norm": 0.36512000626516167, + "learning_rate": 1.4190437829160768e-06, + "loss": 0.0141, + "step": 30206 + }, + { + "epoch": 3.581999288509427, + "grad_norm": 0.46344444884277636, + "learning_rate": 1.4182466912169513e-06, + "loss": 0.024, + "step": 30207 + }, + { + "epoch": 3.5821178702715524, + "grad_norm": 0.535308525934628, + "learning_rate": 1.4174498169126694e-06, + "loss": 0.0203, + "step": 30208 + }, + { + "epoch": 3.582236452033677, + "grad_norm": 0.44902857176668093, + "learning_rate": 1.4166531600105837e-06, + "loss": 0.0234, + "step": 30209 + }, + { + "epoch": 3.5823550337958023, + "grad_norm": 0.7627272613485054, + "learning_rate": 1.4158567205180328e-06, + "loss": 0.0431, + "step": 30210 + }, + { + "epoch": 3.582473615557927, + "grad_norm": 0.3967597840991206, + "learning_rate": 1.4150604984423694e-06, + "loss": 0.0158, + "step": 30211 + }, + { + "epoch": 3.5825921973200523, + "grad_norm": 0.5064453912959849, + "learning_rate": 1.4142644937909206e-06, + "loss": 0.0243, + "step": 30212 + }, + { + "epoch": 3.582710779082177, + "grad_norm": 0.7971578625968672, + "learning_rate": 1.4134687065710305e-06, + "loss": 0.0405, + "step": 30213 + }, + { + "epoch": 3.5828293608443023, + "grad_norm": 0.6426646970838484, + "learning_rate": 1.4126731367900353e-06, + "loss": 0.0361, + "step": 30214 + }, + { + "epoch": 3.582947942606427, + "grad_norm": 0.5192334140661772, + "learning_rate": 1.4118777844552733e-06, + "loss": 0.0285, + "step": 30215 + }, + { + "epoch": 3.583066524368552, + "grad_norm": 0.4520274850288314, + "learning_rate": 1.4110826495740664e-06, + "loss": 0.0132, + "step": 30216 + }, + { + "epoch": 3.583185106130677, + "grad_norm": 0.5516968868510739, + "learning_rate": 1.410287732153756e-06, + "loss": 0.0355, + "step": 30217 + }, + { + "epoch": 3.583303687892802, + "grad_norm": 0.44543517731972326, + "learning_rate": 1.409493032201664e-06, + "loss": 0.0235, + "step": 30218 + }, + { + "epoch": 3.583422269654927, + "grad_norm": 0.49807527449970823, + "learning_rate": 1.4086985497251154e-06, + "loss": 0.0266, + "step": 30219 + }, + { + "epoch": 3.583540851417052, + "grad_norm": 0.34749079695790547, + "learning_rate": 1.4079042847314373e-06, + "loss": 0.017, + "step": 30220 + }, + { + "epoch": 3.583659433179177, + "grad_norm": 0.3475673768913157, + "learning_rate": 1.4071102372279516e-06, + "loss": 0.0132, + "step": 30221 + }, + { + "epoch": 3.583778014941302, + "grad_norm": 0.5186616412362917, + "learning_rate": 1.4063164072219803e-06, + "loss": 0.0209, + "step": 30222 + }, + { + "epoch": 3.583896596703427, + "grad_norm": 0.5117286897480118, + "learning_rate": 1.4055227947208371e-06, + "loss": 0.0244, + "step": 30223 + }, + { + "epoch": 3.584015178465552, + "grad_norm": 0.4980679139389545, + "learning_rate": 1.4047293997318385e-06, + "loss": 0.02, + "step": 30224 + }, + { + "epoch": 3.584133760227677, + "grad_norm": 0.661765244830287, + "learning_rate": 1.4039362222623004e-06, + "loss": 0.04, + "step": 30225 + }, + { + "epoch": 3.584252341989802, + "grad_norm": 0.551156126229845, + "learning_rate": 1.4031432623195423e-06, + "loss": 0.0227, + "step": 30226 + }, + { + "epoch": 3.5843709237519272, + "grad_norm": 0.6158359095205442, + "learning_rate": 1.4023505199108582e-06, + "loss": 0.0374, + "step": 30227 + }, + { + "epoch": 3.584489505514052, + "grad_norm": 0.36961769744529166, + "learning_rate": 1.4015579950435676e-06, + "loss": 0.018, + "step": 30228 + }, + { + "epoch": 3.5846080872761767, + "grad_norm": 0.2786572953000568, + "learning_rate": 1.4007656877249752e-06, + "loss": 0.0134, + "step": 30229 + }, + { + "epoch": 3.584726669038302, + "grad_norm": 0.5037170955809285, + "learning_rate": 1.3999735979623812e-06, + "loss": 0.0259, + "step": 30230 + }, + { + "epoch": 3.584845250800427, + "grad_norm": 0.551496025186819, + "learning_rate": 1.3991817257630934e-06, + "loss": 0.0221, + "step": 30231 + }, + { + "epoch": 3.584963832562552, + "grad_norm": 0.4063908852982394, + "learning_rate": 1.398390071134406e-06, + "loss": 0.0185, + "step": 30232 + }, + { + "epoch": 3.5850824143246767, + "grad_norm": 0.6892338092446865, + "learning_rate": 1.3975986340836245e-06, + "loss": 0.0283, + "step": 30233 + }, + { + "epoch": 3.585200996086802, + "grad_norm": 0.5810069274088375, + "learning_rate": 1.396807414618037e-06, + "loss": 0.03, + "step": 30234 + }, + { + "epoch": 3.585319577848927, + "grad_norm": 0.5543781124196173, + "learning_rate": 1.396016412744941e-06, + "loss": 0.022, + "step": 30235 + }, + { + "epoch": 3.585438159611052, + "grad_norm": 0.3096828503424397, + "learning_rate": 1.3952256284716275e-06, + "loss": 0.0121, + "step": 30236 + }, + { + "epoch": 3.5855567413731766, + "grad_norm": 0.5959748767753301, + "learning_rate": 1.3944350618053881e-06, + "loss": 0.0356, + "step": 30237 + }, + { + "epoch": 3.585675323135302, + "grad_norm": 0.3515735812647097, + "learning_rate": 1.3936447127535168e-06, + "loss": 0.0167, + "step": 30238 + }, + { + "epoch": 3.585793904897427, + "grad_norm": 0.5147945535823616, + "learning_rate": 1.392854581323283e-06, + "loss": 0.0195, + "step": 30239 + }, + { + "epoch": 3.5859124866595518, + "grad_norm": 0.5906359626905819, + "learning_rate": 1.392064667521989e-06, + "loss": 0.023, + "step": 30240 + }, + { + "epoch": 3.5860310684216765, + "grad_norm": 0.4557009719596823, + "learning_rate": 1.391274971356904e-06, + "loss": 0.02, + "step": 30241 + }, + { + "epoch": 3.5861496501838017, + "grad_norm": 0.2579029641019736, + "learning_rate": 1.3904854928353167e-06, + "loss": 0.0107, + "step": 30242 + }, + { + "epoch": 3.586268231945927, + "grad_norm": 0.5847814535002371, + "learning_rate": 1.3896962319644963e-06, + "loss": 0.0253, + "step": 30243 + }, + { + "epoch": 3.5863868137080517, + "grad_norm": 0.701551211744982, + "learning_rate": 1.3889071887517286e-06, + "loss": 0.0293, + "step": 30244 + }, + { + "epoch": 3.5865053954701764, + "grad_norm": 0.7383910076423557, + "learning_rate": 1.38811836320428e-06, + "loss": 0.0271, + "step": 30245 + }, + { + "epoch": 3.5866239772323016, + "grad_norm": 0.4268343022622213, + "learning_rate": 1.3873297553294252e-06, + "loss": 0.0266, + "step": 30246 + }, + { + "epoch": 3.586742558994427, + "grad_norm": 0.5227073682562776, + "learning_rate": 1.3865413651344362e-06, + "loss": 0.0242, + "step": 30247 + }, + { + "epoch": 3.5868611407565516, + "grad_norm": 0.36801860769607947, + "learning_rate": 1.3857531926265765e-06, + "loss": 0.0148, + "step": 30248 + }, + { + "epoch": 3.5869797225186764, + "grad_norm": 0.5748469060298314, + "learning_rate": 1.3849652378131185e-06, + "loss": 0.0305, + "step": 30249 + }, + { + "epoch": 3.5870983042808016, + "grad_norm": 0.5021037507456586, + "learning_rate": 1.384177500701317e-06, + "loss": 0.0225, + "step": 30250 + }, + { + "epoch": 3.5872168860429268, + "grad_norm": 0.3829667815171854, + "learning_rate": 1.3833899812984442e-06, + "loss": 0.015, + "step": 30251 + }, + { + "epoch": 3.5873354678050515, + "grad_norm": 0.6752813856962608, + "learning_rate": 1.3826026796117526e-06, + "loss": 0.0314, + "step": 30252 + }, + { + "epoch": 3.5874540495671767, + "grad_norm": 0.30455119211741927, + "learning_rate": 1.3818155956485058e-06, + "loss": 0.0104, + "step": 30253 + }, + { + "epoch": 3.5875726313293015, + "grad_norm": 0.5945483359424341, + "learning_rate": 1.3810287294159507e-06, + "loss": 0.0229, + "step": 30254 + }, + { + "epoch": 3.5876912130914267, + "grad_norm": 0.4388049565029634, + "learning_rate": 1.380242080921354e-06, + "loss": 0.0203, + "step": 30255 + }, + { + "epoch": 3.5878097948535514, + "grad_norm": 0.5985533555925228, + "learning_rate": 1.379455650171957e-06, + "loss": 0.0168, + "step": 30256 + }, + { + "epoch": 3.5879283766156767, + "grad_norm": 0.5099580919125285, + "learning_rate": 1.378669437175012e-06, + "loss": 0.0203, + "step": 30257 + }, + { + "epoch": 3.5880469583778014, + "grad_norm": 0.5466179328201661, + "learning_rate": 1.3778834419377717e-06, + "loss": 0.0182, + "step": 30258 + }, + { + "epoch": 3.5881655401399266, + "grad_norm": 0.6060356876233227, + "learning_rate": 1.3770976644674748e-06, + "loss": 0.0396, + "step": 30259 + }, + { + "epoch": 3.5882841219020514, + "grad_norm": 0.46264957350694935, + "learning_rate": 1.3763121047713767e-06, + "loss": 0.0206, + "step": 30260 + }, + { + "epoch": 3.5884027036641766, + "grad_norm": 0.32126412114701736, + "learning_rate": 1.375526762856702e-06, + "loss": 0.0148, + "step": 30261 + }, + { + "epoch": 3.5885212854263013, + "grad_norm": 0.4013040324725195, + "learning_rate": 1.3747416387307089e-06, + "loss": 0.0167, + "step": 30262 + }, + { + "epoch": 3.5886398671884265, + "grad_norm": 0.6185740361497303, + "learning_rate": 1.3739567324006219e-06, + "loss": 0.0286, + "step": 30263 + }, + { + "epoch": 3.5887584489505513, + "grad_norm": 0.6351834016981115, + "learning_rate": 1.3731720438736856e-06, + "loss": 0.0297, + "step": 30264 + }, + { + "epoch": 3.5888770307126765, + "grad_norm": 0.4081973505923881, + "learning_rate": 1.3723875731571218e-06, + "loss": 0.0214, + "step": 30265 + }, + { + "epoch": 3.5889956124748013, + "grad_norm": 0.8295961652404419, + "learning_rate": 1.3716033202581802e-06, + "loss": 0.042, + "step": 30266 + }, + { + "epoch": 3.5891141942369265, + "grad_norm": 0.623301873230482, + "learning_rate": 1.3708192851840745e-06, + "loss": 0.0275, + "step": 30267 + }, + { + "epoch": 3.589232775999051, + "grad_norm": 0.7107597569985098, + "learning_rate": 1.3700354679420406e-06, + "loss": 0.0341, + "step": 30268 + }, + { + "epoch": 3.5893513577611764, + "grad_norm": 0.9026699733918335, + "learning_rate": 1.3692518685393036e-06, + "loss": 0.0418, + "step": 30269 + }, + { + "epoch": 3.589469939523301, + "grad_norm": 0.3672656394805999, + "learning_rate": 1.368468486983085e-06, + "loss": 0.0178, + "step": 30270 + }, + { + "epoch": 3.5895885212854264, + "grad_norm": 0.7170911884541669, + "learning_rate": 1.3676853232806098e-06, + "loss": 0.0324, + "step": 30271 + }, + { + "epoch": 3.589707103047551, + "grad_norm": 0.850512728987101, + "learning_rate": 1.3669023774390915e-06, + "loss": 0.0328, + "step": 30272 + }, + { + "epoch": 3.5898256848096763, + "grad_norm": 0.3512914914345569, + "learning_rate": 1.3661196494657607e-06, + "loss": 0.0173, + "step": 30273 + }, + { + "epoch": 3.589944266571801, + "grad_norm": 0.42976427924895244, + "learning_rate": 1.3653371393678198e-06, + "loss": 0.0171, + "step": 30274 + }, + { + "epoch": 3.5900628483339263, + "grad_norm": 0.4813012950407259, + "learning_rate": 1.364554847152491e-06, + "loss": 0.0297, + "step": 30275 + }, + { + "epoch": 3.590181430096051, + "grad_norm": 0.6444139921377733, + "learning_rate": 1.3637727728269738e-06, + "loss": 0.0282, + "step": 30276 + }, + { + "epoch": 3.5903000118581763, + "grad_norm": 0.4768329913330175, + "learning_rate": 1.3629909163984962e-06, + "loss": 0.028, + "step": 30277 + }, + { + "epoch": 3.590418593620301, + "grad_norm": 0.8180344887343912, + "learning_rate": 1.3622092778742546e-06, + "loss": 0.0334, + "step": 30278 + }, + { + "epoch": 3.5905371753824262, + "grad_norm": 0.48057423130767285, + "learning_rate": 1.3614278572614547e-06, + "loss": 0.0204, + "step": 30279 + }, + { + "epoch": 3.5906557571445514, + "grad_norm": 0.5289233180412584, + "learning_rate": 1.3606466545673047e-06, + "loss": 0.0266, + "step": 30280 + }, + { + "epoch": 3.590774338906676, + "grad_norm": 0.4596876924668516, + "learning_rate": 1.3598656697990015e-06, + "loss": 0.0151, + "step": 30281 + }, + { + "epoch": 3.590892920668801, + "grad_norm": 0.7003836063276891, + "learning_rate": 1.3590849029637531e-06, + "loss": 0.0259, + "step": 30282 + }, + { + "epoch": 3.591011502430926, + "grad_norm": 0.6483051263953187, + "learning_rate": 1.3583043540687457e-06, + "loss": 0.0203, + "step": 30283 + }, + { + "epoch": 3.5911300841930514, + "grad_norm": 0.5417793914587848, + "learning_rate": 1.3575240231211818e-06, + "loss": 0.0291, + "step": 30284 + }, + { + "epoch": 3.591248665955176, + "grad_norm": 0.5513085452304264, + "learning_rate": 1.3567439101282553e-06, + "loss": 0.0231, + "step": 30285 + }, + { + "epoch": 3.591367247717301, + "grad_norm": 0.5248834160907412, + "learning_rate": 1.355964015097158e-06, + "loss": 0.0178, + "step": 30286 + }, + { + "epoch": 3.591485829479426, + "grad_norm": 0.8564798948935965, + "learning_rate": 1.3551843380350732e-06, + "loss": 0.0386, + "step": 30287 + }, + { + "epoch": 3.5916044112415513, + "grad_norm": 0.5975209234364856, + "learning_rate": 1.3544048789492004e-06, + "loss": 0.0315, + "step": 30288 + }, + { + "epoch": 3.591722993003676, + "grad_norm": 0.4416000909476542, + "learning_rate": 1.3536256378467144e-06, + "loss": 0.0277, + "step": 30289 + }, + { + "epoch": 3.591841574765801, + "grad_norm": 0.44688924719015616, + "learning_rate": 1.3528466147348012e-06, + "loss": 0.0184, + "step": 30290 + }, + { + "epoch": 3.591960156527926, + "grad_norm": 0.33973487491199045, + "learning_rate": 1.3520678096206468e-06, + "loss": 0.0207, + "step": 30291 + }, + { + "epoch": 3.592078738290051, + "grad_norm": 0.6316063789536117, + "learning_rate": 1.351289222511426e-06, + "loss": 0.0331, + "step": 30292 + }, + { + "epoch": 3.592197320052176, + "grad_norm": 0.6461935401947546, + "learning_rate": 1.3505108534143246e-06, + "loss": 0.033, + "step": 30293 + }, + { + "epoch": 3.5923159018143007, + "grad_norm": 0.6482903230276174, + "learning_rate": 1.3497327023365063e-06, + "loss": 0.0362, + "step": 30294 + }, + { + "epoch": 3.592434483576426, + "grad_norm": 0.4549044543607977, + "learning_rate": 1.3489547692851518e-06, + "loss": 0.0173, + "step": 30295 + }, + { + "epoch": 3.592553065338551, + "grad_norm": 0.7161529086519499, + "learning_rate": 1.3481770542674327e-06, + "loss": 0.0314, + "step": 30296 + }, + { + "epoch": 3.592671647100676, + "grad_norm": 0.5230857802070488, + "learning_rate": 1.3473995572905158e-06, + "loss": 0.0172, + "step": 30297 + }, + { + "epoch": 3.5927902288628006, + "grad_norm": 0.4404898678110975, + "learning_rate": 1.3466222783615728e-06, + "loss": 0.0223, + "step": 30298 + }, + { + "epoch": 3.592908810624926, + "grad_norm": 0.4363215862809811, + "learning_rate": 1.3458452174877623e-06, + "loss": 0.0226, + "step": 30299 + }, + { + "epoch": 3.593027392387051, + "grad_norm": 0.536731332870012, + "learning_rate": 1.345068374676256e-06, + "loss": 0.0236, + "step": 30300 + }, + { + "epoch": 3.593145974149176, + "grad_norm": 0.4751426658631583, + "learning_rate": 1.3442917499342124e-06, + "loss": 0.0256, + "step": 30301 + }, + { + "epoch": 3.593264555911301, + "grad_norm": 0.3615109810710253, + "learning_rate": 1.3435153432687864e-06, + "loss": 0.0191, + "step": 30302 + }, + { + "epoch": 3.5933831376734258, + "grad_norm": 0.6328096156304654, + "learning_rate": 1.3427391546871422e-06, + "loss": 0.025, + "step": 30303 + }, + { + "epoch": 3.593501719435551, + "grad_norm": 0.733430964987023, + "learning_rate": 1.3419631841964347e-06, + "loss": 0.024, + "step": 30304 + }, + { + "epoch": 3.5936203011976757, + "grad_norm": 0.8292625931294112, + "learning_rate": 1.3411874318038114e-06, + "loss": 0.0367, + "step": 30305 + }, + { + "epoch": 3.593738882959801, + "grad_norm": 0.36124375045537555, + "learning_rate": 1.3404118975164277e-06, + "loss": 0.0147, + "step": 30306 + }, + { + "epoch": 3.5938574647219257, + "grad_norm": 0.46970033336320577, + "learning_rate": 1.339636581341433e-06, + "loss": 0.0122, + "step": 30307 + }, + { + "epoch": 3.593976046484051, + "grad_norm": 0.5995219996728539, + "learning_rate": 1.3388614832859775e-06, + "loss": 0.027, + "step": 30308 + }, + { + "epoch": 3.5940946282461756, + "grad_norm": 0.4369893331388282, + "learning_rate": 1.3380866033572026e-06, + "loss": 0.0143, + "step": 30309 + }, + { + "epoch": 3.594213210008301, + "grad_norm": 0.363076299049197, + "learning_rate": 1.3373119415622498e-06, + "loss": 0.0146, + "step": 30310 + }, + { + "epoch": 3.5943317917704256, + "grad_norm": 0.5685428110441739, + "learning_rate": 1.336537497908269e-06, + "loss": 0.0293, + "step": 30311 + }, + { + "epoch": 3.594450373532551, + "grad_norm": 0.5258199248321362, + "learning_rate": 1.3357632724023933e-06, + "loss": 0.0273, + "step": 30312 + }, + { + "epoch": 3.5945689552946756, + "grad_norm": 0.5685894308807649, + "learning_rate": 1.3349892650517614e-06, + "loss": 0.0261, + "step": 30313 + }, + { + "epoch": 3.5946875370568008, + "grad_norm": 0.4644948564367096, + "learning_rate": 1.334215475863504e-06, + "loss": 0.0227, + "step": 30314 + }, + { + "epoch": 3.5948061188189255, + "grad_norm": 0.6987853920549577, + "learning_rate": 1.333441904844765e-06, + "loss": 0.0329, + "step": 30315 + }, + { + "epoch": 3.5949247005810507, + "grad_norm": 0.5922993707309314, + "learning_rate": 1.3326685520026667e-06, + "loss": 0.0219, + "step": 30316 + }, + { + "epoch": 3.5950432823431755, + "grad_norm": 0.2836174895684248, + "learning_rate": 1.3318954173443393e-06, + "loss": 0.0156, + "step": 30317 + }, + { + "epoch": 3.5951618641053007, + "grad_norm": 0.6170543911869771, + "learning_rate": 1.3311225008769162e-06, + "loss": 0.031, + "step": 30318 + }, + { + "epoch": 3.5952804458674255, + "grad_norm": 0.5619463692533767, + "learning_rate": 1.3303498026075168e-06, + "loss": 0.0324, + "step": 30319 + }, + { + "epoch": 3.5953990276295507, + "grad_norm": 0.604970339439302, + "learning_rate": 1.329577322543271e-06, + "loss": 0.0276, + "step": 30320 + }, + { + "epoch": 3.5955176093916754, + "grad_norm": 0.5346071940322767, + "learning_rate": 1.3288050606912878e-06, + "loss": 0.0255, + "step": 30321 + }, + { + "epoch": 3.5956361911538006, + "grad_norm": 0.9078898211994952, + "learning_rate": 1.3280330170586996e-06, + "loss": 0.0634, + "step": 30322 + }, + { + "epoch": 3.5957547729159254, + "grad_norm": 0.5139273177111484, + "learning_rate": 1.3272611916526178e-06, + "loss": 0.0222, + "step": 30323 + }, + { + "epoch": 3.5958733546780506, + "grad_norm": 0.5186386178726711, + "learning_rate": 1.3264895844801616e-06, + "loss": 0.0184, + "step": 30324 + }, + { + "epoch": 3.5959919364401753, + "grad_norm": 0.6440691646718099, + "learning_rate": 1.3257181955484338e-06, + "loss": 0.0265, + "step": 30325 + }, + { + "epoch": 3.5961105182023005, + "grad_norm": 0.5351225292547905, + "learning_rate": 1.324947024864559e-06, + "loss": 0.0324, + "step": 30326 + }, + { + "epoch": 3.5962290999644253, + "grad_norm": 0.5520469391578139, + "learning_rate": 1.3241760724356373e-06, + "loss": 0.0249, + "step": 30327 + }, + { + "epoch": 3.5963476817265505, + "grad_norm": 0.3824146972043754, + "learning_rate": 1.3234053382687794e-06, + "loss": 0.0277, + "step": 30328 + }, + { + "epoch": 3.5964662634886757, + "grad_norm": 0.5104377800249257, + "learning_rate": 1.322634822371091e-06, + "loss": 0.0212, + "step": 30329 + }, + { + "epoch": 3.5965848452508005, + "grad_norm": 0.5681686069516065, + "learning_rate": 1.3218645247496719e-06, + "loss": 0.0329, + "step": 30330 + }, + { + "epoch": 3.5967034270129252, + "grad_norm": 0.5088068917585812, + "learning_rate": 1.321094445411633e-06, + "loss": 0.0251, + "step": 30331 + }, + { + "epoch": 3.5968220087750504, + "grad_norm": 0.4443175846846231, + "learning_rate": 1.3203245843640549e-06, + "loss": 0.0185, + "step": 30332 + }, + { + "epoch": 3.5969405905371756, + "grad_norm": 0.5819513212000545, + "learning_rate": 1.3195549416140567e-06, + "loss": 0.0292, + "step": 30333 + }, + { + "epoch": 3.5970591722993004, + "grad_norm": 0.4894368380563326, + "learning_rate": 1.318785517168719e-06, + "loss": 0.0292, + "step": 30334 + }, + { + "epoch": 3.597177754061425, + "grad_norm": 0.43151356400347396, + "learning_rate": 1.3180163110351418e-06, + "loss": 0.0194, + "step": 30335 + }, + { + "epoch": 3.5972963358235504, + "grad_norm": 0.565229986310172, + "learning_rate": 1.3172473232204053e-06, + "loss": 0.0251, + "step": 30336 + }, + { + "epoch": 3.5974149175856756, + "grad_norm": 0.64280787485463, + "learning_rate": 1.3164785537316148e-06, + "loss": 0.0341, + "step": 30337 + }, + { + "epoch": 3.5975334993478003, + "grad_norm": 1.0610112371845717, + "learning_rate": 1.3157100025758456e-06, + "loss": 0.0659, + "step": 30338 + }, + { + "epoch": 3.597652081109925, + "grad_norm": 0.3217830712227758, + "learning_rate": 1.3149416697601892e-06, + "loss": 0.0203, + "step": 30339 + }, + { + "epoch": 3.5977706628720503, + "grad_norm": 0.6945476911684376, + "learning_rate": 1.3141735552917228e-06, + "loss": 0.048, + "step": 30340 + }, + { + "epoch": 3.5978892446341755, + "grad_norm": 0.42402235416848116, + "learning_rate": 1.313405659177533e-06, + "loss": 0.0205, + "step": 30341 + }, + { + "epoch": 3.5980078263963002, + "grad_norm": 0.6223403490971303, + "learning_rate": 1.312637981424697e-06, + "loss": 0.0348, + "step": 30342 + }, + { + "epoch": 3.598126408158425, + "grad_norm": 0.6312144806902849, + "learning_rate": 1.311870522040287e-06, + "loss": 0.0282, + "step": 30343 + }, + { + "epoch": 3.59824498992055, + "grad_norm": 0.39917887315445083, + "learning_rate": 1.3111032810313895e-06, + "loss": 0.0148, + "step": 30344 + }, + { + "epoch": 3.5983635716826754, + "grad_norm": 0.43679791204966456, + "learning_rate": 1.310336258405065e-06, + "loss": 0.0208, + "step": 30345 + }, + { + "epoch": 3.5984821534448, + "grad_norm": 0.4223785377744873, + "learning_rate": 1.309569454168391e-06, + "loss": 0.0183, + "step": 30346 + }, + { + "epoch": 3.598600735206925, + "grad_norm": 0.5339585567369409, + "learning_rate": 1.3088028683284375e-06, + "loss": 0.0228, + "step": 30347 + }, + { + "epoch": 3.59871931696905, + "grad_norm": 0.5349026896871063, + "learning_rate": 1.308036500892268e-06, + "loss": 0.0291, + "step": 30348 + }, + { + "epoch": 3.5988378987311753, + "grad_norm": 0.38992376486664304, + "learning_rate": 1.3072703518669517e-06, + "loss": 0.0163, + "step": 30349 + }, + { + "epoch": 3.5989564804933, + "grad_norm": 0.3166618827630165, + "learning_rate": 1.3065044212595467e-06, + "loss": 0.0103, + "step": 30350 + }, + { + "epoch": 3.599075062255425, + "grad_norm": 0.5519439832807934, + "learning_rate": 1.3057387090771145e-06, + "loss": 0.0235, + "step": 30351 + }, + { + "epoch": 3.59919364401755, + "grad_norm": 0.2754363773447477, + "learning_rate": 1.3049732153267158e-06, + "loss": 0.0108, + "step": 30352 + }, + { + "epoch": 3.5993122257796752, + "grad_norm": 0.6629239486298609, + "learning_rate": 1.3042079400154117e-06, + "loss": 0.0324, + "step": 30353 + }, + { + "epoch": 3.5994308075418, + "grad_norm": 0.6508274752246452, + "learning_rate": 1.3034428831502494e-06, + "loss": 0.0289, + "step": 30354 + }, + { + "epoch": 3.599549389303925, + "grad_norm": 0.4610925372135447, + "learning_rate": 1.302678044738284e-06, + "loss": 0.0258, + "step": 30355 + }, + { + "epoch": 3.59966797106605, + "grad_norm": 0.6121515927604324, + "learning_rate": 1.301913424786569e-06, + "loss": 0.0196, + "step": 30356 + }, + { + "epoch": 3.599786552828175, + "grad_norm": 0.8106806691639675, + "learning_rate": 1.3011490233021506e-06, + "loss": 0.0451, + "step": 30357 + }, + { + "epoch": 3.5999051345903, + "grad_norm": 0.5243810637975386, + "learning_rate": 1.3003848402920766e-06, + "loss": 0.0266, + "step": 30358 + }, + { + "epoch": 3.600023716352425, + "grad_norm": 0.6394709802420081, + "learning_rate": 1.299620875763391e-06, + "loss": 0.0224, + "step": 30359 + }, + { + "epoch": 3.60014229811455, + "grad_norm": 0.33580994533329256, + "learning_rate": 1.298857129723144e-06, + "loss": 0.0239, + "step": 30360 + }, + { + "epoch": 3.600260879876675, + "grad_norm": 0.5589027749905028, + "learning_rate": 1.2980936021783629e-06, + "loss": 0.0234, + "step": 30361 + }, + { + "epoch": 3.6003794616388, + "grad_norm": 0.49338318549184335, + "learning_rate": 1.2973302931360953e-06, + "loss": 0.0204, + "step": 30362 + }, + { + "epoch": 3.600498043400925, + "grad_norm": 0.48810186423604685, + "learning_rate": 1.296567202603377e-06, + "loss": 0.0253, + "step": 30363 + }, + { + "epoch": 3.60061662516305, + "grad_norm": 0.5350445640081319, + "learning_rate": 1.2958043305872469e-06, + "loss": 0.0288, + "step": 30364 + }, + { + "epoch": 3.600735206925175, + "grad_norm": 0.37217784609998505, + "learning_rate": 1.295041677094727e-06, + "loss": 0.0115, + "step": 30365 + }, + { + "epoch": 3.6008537886872998, + "grad_norm": 0.6657106046679836, + "learning_rate": 1.2942792421328536e-06, + "loss": 0.0403, + "step": 30366 + }, + { + "epoch": 3.600972370449425, + "grad_norm": 0.321268975630263, + "learning_rate": 1.293517025708657e-06, + "loss": 0.014, + "step": 30367 + }, + { + "epoch": 3.6010909522115497, + "grad_norm": 0.48104765141911393, + "learning_rate": 1.2927550278291622e-06, + "loss": 0.0286, + "step": 30368 + }, + { + "epoch": 3.601209533973675, + "grad_norm": 0.4430877217711643, + "learning_rate": 1.2919932485013969e-06, + "loss": 0.0209, + "step": 30369 + }, + { + "epoch": 3.6013281157357997, + "grad_norm": 0.8013327441623865, + "learning_rate": 1.2912316877323748e-06, + "loss": 0.0471, + "step": 30370 + }, + { + "epoch": 3.601446697497925, + "grad_norm": 0.31631741166454796, + "learning_rate": 1.2904703455291295e-06, + "loss": 0.0145, + "step": 30371 + }, + { + "epoch": 3.6015652792600497, + "grad_norm": 0.25634319506295367, + "learning_rate": 1.2897092218986717e-06, + "loss": 0.0099, + "step": 30372 + }, + { + "epoch": 3.601683861022175, + "grad_norm": 0.5198574320101339, + "learning_rate": 1.2889483168480182e-06, + "loss": 0.0196, + "step": 30373 + }, + { + "epoch": 3.6018024427842996, + "grad_norm": 0.6418413816825265, + "learning_rate": 1.2881876303841828e-06, + "loss": 0.0353, + "step": 30374 + }, + { + "epoch": 3.601921024546425, + "grad_norm": 0.668682676236825, + "learning_rate": 1.2874271625141848e-06, + "loss": 0.0215, + "step": 30375 + }, + { + "epoch": 3.6020396063085496, + "grad_norm": 1.1730074338915435, + "learning_rate": 1.286666913245027e-06, + "loss": 0.0513, + "step": 30376 + }, + { + "epoch": 3.602158188070675, + "grad_norm": 0.5223525170822428, + "learning_rate": 1.2859068825837235e-06, + "loss": 0.0235, + "step": 30377 + }, + { + "epoch": 3.6022767698328, + "grad_norm": 0.38527589625829123, + "learning_rate": 1.2851470705372737e-06, + "loss": 0.0146, + "step": 30378 + }, + { + "epoch": 3.6023953515949247, + "grad_norm": 0.32756128211569696, + "learning_rate": 1.284387477112689e-06, + "loss": 0.015, + "step": 30379 + }, + { + "epoch": 3.6025139333570495, + "grad_norm": 0.36848131010489127, + "learning_rate": 1.283628102316975e-06, + "loss": 0.0142, + "step": 30380 + }, + { + "epoch": 3.6026325151191747, + "grad_norm": 0.6730801250251812, + "learning_rate": 1.2828689461571175e-06, + "loss": 0.0338, + "step": 30381 + }, + { + "epoch": 3.6027510968813, + "grad_norm": 0.5000134317946437, + "learning_rate": 1.2821100086401334e-06, + "loss": 0.0282, + "step": 30382 + }, + { + "epoch": 3.6028696786434247, + "grad_norm": 0.6563473946948736, + "learning_rate": 1.2813512897730056e-06, + "loss": 0.0325, + "step": 30383 + }, + { + "epoch": 3.6029882604055494, + "grad_norm": 0.7019319815947362, + "learning_rate": 1.2805927895627374e-06, + "loss": 0.0292, + "step": 30384 + }, + { + "epoch": 3.6031068421676746, + "grad_norm": 0.4820497721622835, + "learning_rate": 1.2798345080163088e-06, + "loss": 0.0227, + "step": 30385 + }, + { + "epoch": 3.6032254239298, + "grad_norm": 0.725804874823848, + "learning_rate": 1.2790764451407256e-06, + "loss": 0.0445, + "step": 30386 + }, + { + "epoch": 3.6033440056919246, + "grad_norm": 0.6120256475185081, + "learning_rate": 1.2783186009429654e-06, + "loss": 0.0399, + "step": 30387 + }, + { + "epoch": 3.6034625874540493, + "grad_norm": 0.4866485402412959, + "learning_rate": 1.27756097543002e-06, + "loss": 0.016, + "step": 30388 + }, + { + "epoch": 3.6035811692161746, + "grad_norm": 0.3301485330401239, + "learning_rate": 1.2768035686088698e-06, + "loss": 0.0128, + "step": 30389 + }, + { + "epoch": 3.6036997509782998, + "grad_norm": 0.6464208810821416, + "learning_rate": 1.2760463804865008e-06, + "loss": 0.0313, + "step": 30390 + }, + { + "epoch": 3.6038183327404245, + "grad_norm": 0.6324344825747182, + "learning_rate": 1.2752894110698937e-06, + "loss": 0.0339, + "step": 30391 + }, + { + "epoch": 3.6039369145025493, + "grad_norm": 0.3986661118699562, + "learning_rate": 1.2745326603660207e-06, + "loss": 0.0139, + "step": 30392 + }, + { + "epoch": 3.6040554962646745, + "grad_norm": 0.44402784142177126, + "learning_rate": 1.2737761283818679e-06, + "loss": 0.0205, + "step": 30393 + }, + { + "epoch": 3.6041740780267997, + "grad_norm": 0.36246948879912033, + "learning_rate": 1.2730198151244015e-06, + "loss": 0.0159, + "step": 30394 + }, + { + "epoch": 3.6042926597889244, + "grad_norm": 0.37366928246584086, + "learning_rate": 1.2722637206005944e-06, + "loss": 0.0157, + "step": 30395 + }, + { + "epoch": 3.604411241551049, + "grad_norm": 0.4221244436643952, + "learning_rate": 1.2715078448174212e-06, + "loss": 0.024, + "step": 30396 + }, + { + "epoch": 3.6045298233131744, + "grad_norm": 0.423537985500025, + "learning_rate": 1.2707521877818485e-06, + "loss": 0.0181, + "step": 30397 + }, + { + "epoch": 3.6046484050752996, + "grad_norm": 0.42929062556922154, + "learning_rate": 1.269996749500843e-06, + "loss": 0.0179, + "step": 30398 + }, + { + "epoch": 3.6047669868374244, + "grad_norm": 0.5462231168804258, + "learning_rate": 1.2692415299813659e-06, + "loss": 0.03, + "step": 30399 + }, + { + "epoch": 3.604885568599549, + "grad_norm": 0.33812986254103033, + "learning_rate": 1.268486529230381e-06, + "loss": 0.0165, + "step": 30400 + }, + { + "epoch": 3.6050041503616743, + "grad_norm": 0.6806013476541539, + "learning_rate": 1.2677317472548466e-06, + "loss": 0.0181, + "step": 30401 + }, + { + "epoch": 3.6051227321237995, + "grad_norm": 0.34146144930760397, + "learning_rate": 1.2669771840617296e-06, + "loss": 0.0131, + "step": 30402 + }, + { + "epoch": 3.6052413138859243, + "grad_norm": 0.8588688925463029, + "learning_rate": 1.2662228396579712e-06, + "loss": 0.0317, + "step": 30403 + }, + { + "epoch": 3.6053598956480495, + "grad_norm": 0.4873909807391941, + "learning_rate": 1.2654687140505412e-06, + "loss": 0.0227, + "step": 30404 + }, + { + "epoch": 3.6054784774101742, + "grad_norm": 0.6580435634417815, + "learning_rate": 1.264714807246381e-06, + "loss": 0.0192, + "step": 30405 + }, + { + "epoch": 3.6055970591722994, + "grad_norm": 0.41635077913622104, + "learning_rate": 1.2639611192524438e-06, + "loss": 0.0159, + "step": 30406 + }, + { + "epoch": 3.605715640934424, + "grad_norm": 0.7239021554809697, + "learning_rate": 1.263207650075679e-06, + "loss": 0.032, + "step": 30407 + }, + { + "epoch": 3.6058342226965494, + "grad_norm": 0.7021823153122966, + "learning_rate": 1.2624543997230314e-06, + "loss": 0.0336, + "step": 30408 + }, + { + "epoch": 3.605952804458674, + "grad_norm": 0.7762180045302212, + "learning_rate": 1.2617013682014484e-06, + "loss": 0.0447, + "step": 30409 + }, + { + "epoch": 3.6060713862207994, + "grad_norm": 0.681978534315145, + "learning_rate": 1.2609485555178657e-06, + "loss": 0.0334, + "step": 30410 + }, + { + "epoch": 3.606189967982924, + "grad_norm": 0.6179031373926082, + "learning_rate": 1.2601959616792252e-06, + "loss": 0.0279, + "step": 30411 + }, + { + "epoch": 3.6063085497450493, + "grad_norm": 0.6049850499618598, + "learning_rate": 1.2594435866924686e-06, + "loss": 0.0316, + "step": 30412 + }, + { + "epoch": 3.606427131507174, + "grad_norm": 0.6858849181702288, + "learning_rate": 1.258691430564532e-06, + "loss": 0.0299, + "step": 30413 + }, + { + "epoch": 3.6065457132692993, + "grad_norm": 0.8692945097271496, + "learning_rate": 1.257939493302343e-06, + "loss": 0.0438, + "step": 30414 + }, + { + "epoch": 3.606664295031424, + "grad_norm": 0.44130299498077163, + "learning_rate": 1.257187774912838e-06, + "loss": 0.0168, + "step": 30415 + }, + { + "epoch": 3.6067828767935493, + "grad_norm": 0.5552812043547174, + "learning_rate": 1.2564362754029447e-06, + "loss": 0.0273, + "step": 30416 + }, + { + "epoch": 3.606901458555674, + "grad_norm": 1.1576392147005896, + "learning_rate": 1.2556849947795963e-06, + "loss": 0.0365, + "step": 30417 + }, + { + "epoch": 3.607020040317799, + "grad_norm": 0.6212658092453522, + "learning_rate": 1.2549339330497124e-06, + "loss": 0.0187, + "step": 30418 + }, + { + "epoch": 3.607138622079924, + "grad_norm": 0.31304659243795074, + "learning_rate": 1.2541830902202178e-06, + "loss": 0.0216, + "step": 30419 + }, + { + "epoch": 3.607257203842049, + "grad_norm": 0.8150061668845272, + "learning_rate": 1.2534324662980435e-06, + "loss": 0.0355, + "step": 30420 + }, + { + "epoch": 3.607375785604174, + "grad_norm": 0.34918103365931524, + "learning_rate": 1.2526820612900975e-06, + "loss": 0.0123, + "step": 30421 + }, + { + "epoch": 3.607494367366299, + "grad_norm": 0.38071184627512517, + "learning_rate": 1.2519318752032993e-06, + "loss": 0.0162, + "step": 30422 + }, + { + "epoch": 3.607612949128424, + "grad_norm": 0.8002619596869618, + "learning_rate": 1.2511819080445685e-06, + "loss": 0.0307, + "step": 30423 + }, + { + "epoch": 3.607731530890549, + "grad_norm": 0.7225420620050265, + "learning_rate": 1.250432159820822e-06, + "loss": 0.0268, + "step": 30424 + }, + { + "epoch": 3.607850112652674, + "grad_norm": 0.33269416030619986, + "learning_rate": 1.2496826305389647e-06, + "loss": 0.017, + "step": 30425 + }, + { + "epoch": 3.607968694414799, + "grad_norm": 0.5883511294944097, + "learning_rate": 1.2489333202059084e-06, + "loss": 0.0241, + "step": 30426 + }, + { + "epoch": 3.6080872761769243, + "grad_norm": 0.38571949992598753, + "learning_rate": 1.2481842288285612e-06, + "loss": 0.0196, + "step": 30427 + }, + { + "epoch": 3.608205857939049, + "grad_norm": 0.406164929376791, + "learning_rate": 1.2474353564138286e-06, + "loss": 0.019, + "step": 30428 + }, + { + "epoch": 3.608324439701174, + "grad_norm": 0.7278512943937971, + "learning_rate": 1.2466867029686164e-06, + "loss": 0.0226, + "step": 30429 + }, + { + "epoch": 3.608443021463299, + "grad_norm": 0.46466300722054243, + "learning_rate": 1.2459382684998216e-06, + "loss": 0.0316, + "step": 30430 + }, + { + "epoch": 3.608561603225424, + "grad_norm": 0.42822597626595904, + "learning_rate": 1.245190053014353e-06, + "loss": 0.0173, + "step": 30431 + }, + { + "epoch": 3.608680184987549, + "grad_norm": 0.6702021911969323, + "learning_rate": 1.244442056519096e-06, + "loss": 0.0299, + "step": 30432 + }, + { + "epoch": 3.6087987667496737, + "grad_norm": 0.6178361036093637, + "learning_rate": 1.2436942790209543e-06, + "loss": 0.0229, + "step": 30433 + }, + { + "epoch": 3.608917348511799, + "grad_norm": 0.6023694502327289, + "learning_rate": 1.2429467205268192e-06, + "loss": 0.0191, + "step": 30434 + }, + { + "epoch": 3.609035930273924, + "grad_norm": 0.6732546401250891, + "learning_rate": 1.242199381043585e-06, + "loss": 0.0317, + "step": 30435 + }, + { + "epoch": 3.609154512036049, + "grad_norm": 0.4482966253486542, + "learning_rate": 1.2414522605781353e-06, + "loss": 0.0186, + "step": 30436 + }, + { + "epoch": 3.6092730937981736, + "grad_norm": 0.42215151229210157, + "learning_rate": 1.240705359137362e-06, + "loss": 0.0156, + "step": 30437 + }, + { + "epoch": 3.609391675560299, + "grad_norm": 0.5008823477166998, + "learning_rate": 1.2399586767281506e-06, + "loss": 0.0309, + "step": 30438 + }, + { + "epoch": 3.609510257322424, + "grad_norm": 0.3747924143098685, + "learning_rate": 1.2392122133573825e-06, + "loss": 0.0227, + "step": 30439 + }, + { + "epoch": 3.609628839084549, + "grad_norm": 0.6601264566000912, + "learning_rate": 1.2384659690319433e-06, + "loss": 0.0279, + "step": 30440 + }, + { + "epoch": 3.6097474208466735, + "grad_norm": 0.5207826689928697, + "learning_rate": 1.2377199437587027e-06, + "loss": 0.016, + "step": 30441 + }, + { + "epoch": 3.6098660026087988, + "grad_norm": 0.6466758305028387, + "learning_rate": 1.2369741375445527e-06, + "loss": 0.0328, + "step": 30442 + }, + { + "epoch": 3.609984584370924, + "grad_norm": 0.48134773058425706, + "learning_rate": 1.2362285503963567e-06, + "loss": 0.0204, + "step": 30443 + }, + { + "epoch": 3.6101031661330487, + "grad_norm": 0.43335346366538474, + "learning_rate": 1.2354831823209927e-06, + "loss": 0.0211, + "step": 30444 + }, + { + "epoch": 3.6102217478951735, + "grad_norm": 0.8168127494480738, + "learning_rate": 1.2347380333253333e-06, + "loss": 0.0394, + "step": 30445 + }, + { + "epoch": 3.6103403296572987, + "grad_norm": 0.5054991007716786, + "learning_rate": 1.233993103416245e-06, + "loss": 0.0263, + "step": 30446 + }, + { + "epoch": 3.610458911419424, + "grad_norm": 0.47710803778487104, + "learning_rate": 1.2332483926006e-06, + "loss": 0.0231, + "step": 30447 + }, + { + "epoch": 3.6105774931815486, + "grad_norm": 0.6598110494398781, + "learning_rate": 1.232503900885254e-06, + "loss": 0.0375, + "step": 30448 + }, + { + "epoch": 3.6106960749436734, + "grad_norm": 0.4427284590325418, + "learning_rate": 1.2317596282770821e-06, + "loss": 0.0178, + "step": 30449 + }, + { + "epoch": 3.6108146567057986, + "grad_norm": 0.4876119335488241, + "learning_rate": 1.2310155747829372e-06, + "loss": 0.0248, + "step": 30450 + }, + { + "epoch": 3.610933238467924, + "grad_norm": 0.5196729179444166, + "learning_rate": 1.230271740409683e-06, + "loss": 0.0284, + "step": 30451 + }, + { + "epoch": 3.6110518202300486, + "grad_norm": 0.4970340344234, + "learning_rate": 1.22952812516417e-06, + "loss": 0.0243, + "step": 30452 + }, + { + "epoch": 3.6111704019921738, + "grad_norm": 0.3638066936963504, + "learning_rate": 1.2287847290532644e-06, + "loss": 0.0179, + "step": 30453 + }, + { + "epoch": 3.6112889837542985, + "grad_norm": 0.6469294874298182, + "learning_rate": 1.2280415520838113e-06, + "loss": 0.027, + "step": 30454 + }, + { + "epoch": 3.6114075655164237, + "grad_norm": 0.41226554161366635, + "learning_rate": 1.227298594262663e-06, + "loss": 0.0282, + "step": 30455 + }, + { + "epoch": 3.6115261472785485, + "grad_norm": 0.4481417158677059, + "learning_rate": 1.22655585559667e-06, + "loss": 0.0263, + "step": 30456 + }, + { + "epoch": 3.6116447290406737, + "grad_norm": 0.46952881127907065, + "learning_rate": 1.2258133360926792e-06, + "loss": 0.0206, + "step": 30457 + }, + { + "epoch": 3.6117633108027984, + "grad_norm": 0.4184908754745878, + "learning_rate": 1.2250710357575384e-06, + "loss": 0.015, + "step": 30458 + }, + { + "epoch": 3.6118818925649236, + "grad_norm": 0.8234643783745917, + "learning_rate": 1.2243289545980807e-06, + "loss": 0.0381, + "step": 30459 + }, + { + "epoch": 3.6120004743270484, + "grad_norm": 0.5166817825700368, + "learning_rate": 1.2235870926211619e-06, + "loss": 0.0199, + "step": 30460 + }, + { + "epoch": 3.6121190560891736, + "grad_norm": 0.468711878575777, + "learning_rate": 1.2228454498336096e-06, + "loss": 0.0372, + "step": 30461 + }, + { + "epoch": 3.6122376378512984, + "grad_norm": 0.6581615728914298, + "learning_rate": 1.2221040262422685e-06, + "loss": 0.0243, + "step": 30462 + }, + { + "epoch": 3.6123562196134236, + "grad_norm": 0.5461102216305043, + "learning_rate": 1.2213628218539636e-06, + "loss": 0.0233, + "step": 30463 + }, + { + "epoch": 3.6124748013755483, + "grad_norm": 0.3739209886861195, + "learning_rate": 1.2206218366755395e-06, + "loss": 0.0171, + "step": 30464 + }, + { + "epoch": 3.6125933831376735, + "grad_norm": 0.34396371297689216, + "learning_rate": 1.2198810707138215e-06, + "loss": 0.0186, + "step": 30465 + }, + { + "epoch": 3.6127119648997983, + "grad_norm": 0.8761287295493088, + "learning_rate": 1.2191405239756372e-06, + "loss": 0.0466, + "step": 30466 + }, + { + "epoch": 3.6128305466619235, + "grad_norm": 0.5669898972188444, + "learning_rate": 1.2184001964678143e-06, + "loss": 0.0162, + "step": 30467 + }, + { + "epoch": 3.6129491284240483, + "grad_norm": 0.9190986103567917, + "learning_rate": 1.2176600881971783e-06, + "loss": 0.0401, + "step": 30468 + }, + { + "epoch": 3.6130677101861735, + "grad_norm": 0.8323548286218649, + "learning_rate": 1.216920199170557e-06, + "loss": 0.0491, + "step": 30469 + }, + { + "epoch": 3.613186291948298, + "grad_norm": 0.5778218251033751, + "learning_rate": 1.2161805293947642e-06, + "loss": 0.0333, + "step": 30470 + }, + { + "epoch": 3.6133048737104234, + "grad_norm": 0.8239101046903867, + "learning_rate": 1.2154410788766196e-06, + "loss": 0.0482, + "step": 30471 + }, + { + "epoch": 3.613423455472548, + "grad_norm": 0.5494366479821569, + "learning_rate": 1.2147018476229428e-06, + "loss": 0.0209, + "step": 30472 + }, + { + "epoch": 3.6135420372346734, + "grad_norm": 0.4911879246783786, + "learning_rate": 1.2139628356405502e-06, + "loss": 0.0182, + "step": 30473 + }, + { + "epoch": 3.613660618996798, + "grad_norm": 0.4901819477112913, + "learning_rate": 1.213224042936245e-06, + "loss": 0.0159, + "step": 30474 + }, + { + "epoch": 3.6137792007589233, + "grad_norm": 0.9245787352782184, + "learning_rate": 1.2124854695168525e-06, + "loss": 0.0218, + "step": 30475 + }, + { + "epoch": 3.613897782521048, + "grad_norm": 0.3579254956170745, + "learning_rate": 1.2117471153891696e-06, + "loss": 0.0169, + "step": 30476 + }, + { + "epoch": 3.6140163642831733, + "grad_norm": 0.6029620638038913, + "learning_rate": 1.2110089805600077e-06, + "loss": 0.0279, + "step": 30477 + }, + { + "epoch": 3.614134946045298, + "grad_norm": 0.7405846600802909, + "learning_rate": 1.2102710650361698e-06, + "loss": 0.0291, + "step": 30478 + }, + { + "epoch": 3.6142535278074233, + "grad_norm": 0.8555791305751681, + "learning_rate": 1.2095333688244614e-06, + "loss": 0.0297, + "step": 30479 + }, + { + "epoch": 3.6143721095695485, + "grad_norm": 0.2967213319644539, + "learning_rate": 1.2087958919316854e-06, + "loss": 0.0152, + "step": 30480 + }, + { + "epoch": 3.6144906913316732, + "grad_norm": 0.2822985261447789, + "learning_rate": 1.2080586343646312e-06, + "loss": 0.0134, + "step": 30481 + }, + { + "epoch": 3.614609273093798, + "grad_norm": 0.5721537524129262, + "learning_rate": 1.2073215961301037e-06, + "loss": 0.0236, + "step": 30482 + }, + { + "epoch": 3.614727854855923, + "grad_norm": 0.3909600738809005, + "learning_rate": 1.2065847772348926e-06, + "loss": 0.0203, + "step": 30483 + }, + { + "epoch": 3.6148464366180484, + "grad_norm": 0.47344557321411923, + "learning_rate": 1.2058481776857948e-06, + "loss": 0.0177, + "step": 30484 + }, + { + "epoch": 3.614965018380173, + "grad_norm": 0.6201353870455142, + "learning_rate": 1.205111797489597e-06, + "loss": 0.0279, + "step": 30485 + }, + { + "epoch": 3.615083600142298, + "grad_norm": 0.38199758190170147, + "learning_rate": 1.2043756366530878e-06, + "loss": 0.0182, + "step": 30486 + }, + { + "epoch": 3.615202181904423, + "grad_norm": 0.5436926653009105, + "learning_rate": 1.2036396951830564e-06, + "loss": 0.0296, + "step": 30487 + }, + { + "epoch": 3.6153207636665483, + "grad_norm": 0.3786940107728946, + "learning_rate": 1.2029039730862835e-06, + "loss": 0.0184, + "step": 30488 + }, + { + "epoch": 3.615439345428673, + "grad_norm": 0.37165092323899124, + "learning_rate": 1.202168470369558e-06, + "loss": 0.0165, + "step": 30489 + }, + { + "epoch": 3.615557927190798, + "grad_norm": 0.6534510836237717, + "learning_rate": 1.2014331870396522e-06, + "loss": 0.0239, + "step": 30490 + }, + { + "epoch": 3.615676508952923, + "grad_norm": 0.74453196904478, + "learning_rate": 1.2006981231033555e-06, + "loss": 0.0258, + "step": 30491 + }, + { + "epoch": 3.6157950907150482, + "grad_norm": 0.3863198686620523, + "learning_rate": 1.1999632785674313e-06, + "loss": 0.0173, + "step": 30492 + }, + { + "epoch": 3.615913672477173, + "grad_norm": 0.46563828548063463, + "learning_rate": 1.199228653438661e-06, + "loss": 0.0229, + "step": 30493 + }, + { + "epoch": 3.6160322542392977, + "grad_norm": 0.5066160099179331, + "learning_rate": 1.1984942477238164e-06, + "loss": 0.0232, + "step": 30494 + }, + { + "epoch": 3.616150836001423, + "grad_norm": 0.4766012794171598, + "learning_rate": 1.197760061429673e-06, + "loss": 0.0202, + "step": 30495 + }, + { + "epoch": 3.616269417763548, + "grad_norm": 0.4353777709728239, + "learning_rate": 1.197026094562989e-06, + "loss": 0.0197, + "step": 30496 + }, + { + "epoch": 3.616387999525673, + "grad_norm": 0.3689722599407613, + "learning_rate": 1.1962923471305343e-06, + "loss": 0.0146, + "step": 30497 + }, + { + "epoch": 3.6165065812877977, + "grad_norm": 0.3134070954676824, + "learning_rate": 1.1955588191390727e-06, + "loss": 0.0133, + "step": 30498 + }, + { + "epoch": 3.616625163049923, + "grad_norm": 0.686187453685838, + "learning_rate": 1.194825510595371e-06, + "loss": 0.0376, + "step": 30499 + }, + { + "epoch": 3.616743744812048, + "grad_norm": 0.3942578055375481, + "learning_rate": 1.1940924215061878e-06, + "loss": 0.0193, + "step": 30500 + }, + { + "epoch": 3.616862326574173, + "grad_norm": 0.7745334121454815, + "learning_rate": 1.193359551878273e-06, + "loss": 0.0357, + "step": 30501 + }, + { + "epoch": 3.616980908336298, + "grad_norm": 0.3928216352700221, + "learning_rate": 1.1926269017183967e-06, + "loss": 0.0186, + "step": 30502 + }, + { + "epoch": 3.617099490098423, + "grad_norm": 0.30799992766457096, + "learning_rate": 1.191894471033303e-06, + "loss": 0.016, + "step": 30503 + }, + { + "epoch": 3.617218071860548, + "grad_norm": 0.8540695337650546, + "learning_rate": 1.1911622598297451e-06, + "loss": 0.0412, + "step": 30504 + }, + { + "epoch": 3.6173366536226728, + "grad_norm": 0.28160149388635836, + "learning_rate": 1.1904302681144758e-06, + "loss": 0.0162, + "step": 30505 + }, + { + "epoch": 3.617455235384798, + "grad_norm": 0.3524961367654006, + "learning_rate": 1.1896984958942425e-06, + "loss": 0.0151, + "step": 30506 + }, + { + "epoch": 3.6175738171469227, + "grad_norm": 0.47186184047554947, + "learning_rate": 1.1889669431757956e-06, + "loss": 0.0278, + "step": 30507 + }, + { + "epoch": 3.617692398909048, + "grad_norm": 0.40646423001298093, + "learning_rate": 1.1882356099658653e-06, + "loss": 0.0156, + "step": 30508 + }, + { + "epoch": 3.6178109806711727, + "grad_norm": 0.4500759621618194, + "learning_rate": 1.1875044962712107e-06, + "loss": 0.0257, + "step": 30509 + }, + { + "epoch": 3.617929562433298, + "grad_norm": 0.3859043607809819, + "learning_rate": 1.1867736020985592e-06, + "loss": 0.0155, + "step": 30510 + }, + { + "epoch": 3.6180481441954226, + "grad_norm": 0.5792923737317358, + "learning_rate": 1.1860429274546586e-06, + "loss": 0.0162, + "step": 30511 + }, + { + "epoch": 3.618166725957548, + "grad_norm": 0.5343420159760938, + "learning_rate": 1.1853124723462311e-06, + "loss": 0.017, + "step": 30512 + }, + { + "epoch": 3.6182853077196726, + "grad_norm": 0.4929647469757851, + "learning_rate": 1.184582236780027e-06, + "loss": 0.0271, + "step": 30513 + }, + { + "epoch": 3.618403889481798, + "grad_norm": 0.4977106174379826, + "learning_rate": 1.1838522207627683e-06, + "loss": 0.0213, + "step": 30514 + }, + { + "epoch": 3.6185224712439226, + "grad_norm": 0.40538629382586716, + "learning_rate": 1.1831224243011835e-06, + "loss": 0.0183, + "step": 30515 + }, + { + "epoch": 3.6186410530060478, + "grad_norm": 0.7841006602948055, + "learning_rate": 1.1823928474020057e-06, + "loss": 0.0317, + "step": 30516 + }, + { + "epoch": 3.6187596347681725, + "grad_norm": 0.8172656875040033, + "learning_rate": 1.1816634900719603e-06, + "loss": 0.0373, + "step": 30517 + }, + { + "epoch": 3.6188782165302977, + "grad_norm": 0.4735320497174433, + "learning_rate": 1.1809343523177696e-06, + "loss": 0.027, + "step": 30518 + }, + { + "epoch": 3.6189967982924225, + "grad_norm": 0.40633979006197113, + "learning_rate": 1.1802054341461504e-06, + "loss": 0.0174, + "step": 30519 + }, + { + "epoch": 3.6191153800545477, + "grad_norm": 0.4814182072589396, + "learning_rate": 1.1794767355638337e-06, + "loss": 0.0178, + "step": 30520 + }, + { + "epoch": 3.6192339618166725, + "grad_norm": 0.5313535841645073, + "learning_rate": 1.1787482565775276e-06, + "loss": 0.0259, + "step": 30521 + }, + { + "epoch": 3.6193525435787977, + "grad_norm": 0.3075287336283624, + "learning_rate": 1.1780199971939549e-06, + "loss": 0.0167, + "step": 30522 + }, + { + "epoch": 3.6194711253409224, + "grad_norm": 0.6344612003967877, + "learning_rate": 1.1772919574198182e-06, + "loss": 0.0242, + "step": 30523 + }, + { + "epoch": 3.6195897071030476, + "grad_norm": 0.42695676050363446, + "learning_rate": 1.176564137261843e-06, + "loss": 0.0201, + "step": 30524 + }, + { + "epoch": 3.6197082888651724, + "grad_norm": 0.6866624136465029, + "learning_rate": 1.1758365367267293e-06, + "loss": 0.0412, + "step": 30525 + }, + { + "epoch": 3.6198268706272976, + "grad_norm": 0.6222353886614134, + "learning_rate": 1.1751091558211858e-06, + "loss": 0.0295, + "step": 30526 + }, + { + "epoch": 3.6199454523894223, + "grad_norm": 0.5473138302846593, + "learning_rate": 1.1743819945519235e-06, + "loss": 0.0254, + "step": 30527 + }, + { + "epoch": 3.6200640341515475, + "grad_norm": 0.5955191737939642, + "learning_rate": 1.1736550529256402e-06, + "loss": 0.0276, + "step": 30528 + }, + { + "epoch": 3.6201826159136727, + "grad_norm": 0.654399913155924, + "learning_rate": 1.1729283309490412e-06, + "loss": 0.0232, + "step": 30529 + }, + { + "epoch": 3.6203011976757975, + "grad_norm": 0.3660895174996888, + "learning_rate": 1.1722018286288189e-06, + "loss": 0.0163, + "step": 30530 + }, + { + "epoch": 3.6204197794379223, + "grad_norm": 0.7619497763211113, + "learning_rate": 1.1714755459716843e-06, + "loss": 0.0327, + "step": 30531 + }, + { + "epoch": 3.6205383612000475, + "grad_norm": 0.49527109410450376, + "learning_rate": 1.1707494829843207e-06, + "loss": 0.0197, + "step": 30532 + }, + { + "epoch": 3.6206569429621727, + "grad_norm": 0.5121989145062326, + "learning_rate": 1.170023639673426e-06, + "loss": 0.0179, + "step": 30533 + }, + { + "epoch": 3.6207755247242974, + "grad_norm": 0.41768344224975723, + "learning_rate": 1.169298016045689e-06, + "loss": 0.0253, + "step": 30534 + }, + { + "epoch": 3.620894106486422, + "grad_norm": 0.5824321739819032, + "learning_rate": 1.1685726121078045e-06, + "loss": 0.0295, + "step": 30535 + }, + { + "epoch": 3.6210126882485474, + "grad_norm": 0.415628161291174, + "learning_rate": 1.1678474278664531e-06, + "loss": 0.0243, + "step": 30536 + }, + { + "epoch": 3.6211312700106726, + "grad_norm": 0.30177048150454755, + "learning_rate": 1.1671224633283268e-06, + "loss": 0.0139, + "step": 30537 + }, + { + "epoch": 3.6212498517727973, + "grad_norm": 0.5396862394321039, + "learning_rate": 1.1663977185001034e-06, + "loss": 0.0214, + "step": 30538 + }, + { + "epoch": 3.621368433534922, + "grad_norm": 0.5969687065996008, + "learning_rate": 1.165673193388464e-06, + "loss": 0.0316, + "step": 30539 + }, + { + "epoch": 3.6214870152970473, + "grad_norm": 0.7857542417714591, + "learning_rate": 1.1649488880000976e-06, + "loss": 0.0271, + "step": 30540 + }, + { + "epoch": 3.6216055970591725, + "grad_norm": 0.48420929374805155, + "learning_rate": 1.1642248023416679e-06, + "loss": 0.0165, + "step": 30541 + }, + { + "epoch": 3.6217241788212973, + "grad_norm": 0.6239722186824026, + "learning_rate": 1.163500936419859e-06, + "loss": 0.0327, + "step": 30542 + }, + { + "epoch": 3.621842760583422, + "grad_norm": 0.4078541072499883, + "learning_rate": 1.1627772902413376e-06, + "loss": 0.0189, + "step": 30543 + }, + { + "epoch": 3.6219613423455472, + "grad_norm": 0.6571278197321746, + "learning_rate": 1.1620538638127843e-06, + "loss": 0.033, + "step": 30544 + }, + { + "epoch": 3.6220799241076724, + "grad_norm": 0.4424230531443711, + "learning_rate": 1.161330657140855e-06, + "loss": 0.0232, + "step": 30545 + }, + { + "epoch": 3.622198505869797, + "grad_norm": 0.5522192249713966, + "learning_rate": 1.1606076702322304e-06, + "loss": 0.0237, + "step": 30546 + }, + { + "epoch": 3.622317087631922, + "grad_norm": 0.6235627419261496, + "learning_rate": 1.1598849030935665e-06, + "loss": 0.035, + "step": 30547 + }, + { + "epoch": 3.622435669394047, + "grad_norm": 0.8060443539856338, + "learning_rate": 1.15916235573153e-06, + "loss": 0.0316, + "step": 30548 + }, + { + "epoch": 3.6225542511561724, + "grad_norm": 0.4836612956016723, + "learning_rate": 1.1584400281527797e-06, + "loss": 0.0251, + "step": 30549 + }, + { + "epoch": 3.622672832918297, + "grad_norm": 0.6387762860602652, + "learning_rate": 1.1577179203639766e-06, + "loss": 0.0246, + "step": 30550 + }, + { + "epoch": 3.6227914146804223, + "grad_norm": 0.4773753194285187, + "learning_rate": 1.1569960323717794e-06, + "loss": 0.0164, + "step": 30551 + }, + { + "epoch": 3.622909996442547, + "grad_norm": 0.7917534091220493, + "learning_rate": 1.1562743641828355e-06, + "loss": 0.035, + "step": 30552 + }, + { + "epoch": 3.6230285782046723, + "grad_norm": 0.43549290999975643, + "learning_rate": 1.1555529158038065e-06, + "loss": 0.0171, + "step": 30553 + }, + { + "epoch": 3.623147159966797, + "grad_norm": 0.35466284041066237, + "learning_rate": 1.1548316872413366e-06, + "loss": 0.0128, + "step": 30554 + }, + { + "epoch": 3.6232657417289222, + "grad_norm": 0.3910257982030923, + "learning_rate": 1.1541106785020795e-06, + "loss": 0.0248, + "step": 30555 + }, + { + "epoch": 3.623384323491047, + "grad_norm": 0.8081899407835952, + "learning_rate": 1.1533898895926847e-06, + "loss": 0.0361, + "step": 30556 + }, + { + "epoch": 3.623502905253172, + "grad_norm": 0.553763405208823, + "learning_rate": 1.1526693205197835e-06, + "loss": 0.0277, + "step": 30557 + }, + { + "epoch": 3.623621487015297, + "grad_norm": 0.5158763135067869, + "learning_rate": 1.1519489712900345e-06, + "loss": 0.021, + "step": 30558 + }, + { + "epoch": 3.623740068777422, + "grad_norm": 0.42385789914460464, + "learning_rate": 1.151228841910068e-06, + "loss": 0.0208, + "step": 30559 + }, + { + "epoch": 3.623858650539547, + "grad_norm": 0.5656910138302355, + "learning_rate": 1.1505089323865293e-06, + "loss": 0.0241, + "step": 30560 + }, + { + "epoch": 3.623977232301672, + "grad_norm": 0.41170694574023087, + "learning_rate": 1.1497892427260486e-06, + "loss": 0.0198, + "step": 30561 + }, + { + "epoch": 3.624095814063797, + "grad_norm": 0.7074591825567287, + "learning_rate": 1.1490697729352683e-06, + "loss": 0.0344, + "step": 30562 + }, + { + "epoch": 3.624214395825922, + "grad_norm": 0.39698514581374866, + "learning_rate": 1.1483505230208136e-06, + "loss": 0.017, + "step": 30563 + }, + { + "epoch": 3.624332977588047, + "grad_norm": 0.5217813675787172, + "learning_rate": 1.1476314929893206e-06, + "loss": 0.0262, + "step": 30564 + }, + { + "epoch": 3.624451559350172, + "grad_norm": 0.5945184260369211, + "learning_rate": 1.146912682847412e-06, + "loss": 0.0254, + "step": 30565 + }, + { + "epoch": 3.624570141112297, + "grad_norm": 0.6710703820051761, + "learning_rate": 1.1461940926017183e-06, + "loss": 0.0271, + "step": 30566 + }, + { + "epoch": 3.624688722874422, + "grad_norm": 0.475917037825512, + "learning_rate": 1.145475722258868e-06, + "loss": 0.0208, + "step": 30567 + }, + { + "epoch": 3.6248073046365468, + "grad_norm": 0.6642161150057283, + "learning_rate": 1.1447575718254721e-06, + "loss": 0.0218, + "step": 30568 + }, + { + "epoch": 3.624925886398672, + "grad_norm": 0.547932231773108, + "learning_rate": 1.1440396413081672e-06, + "loss": 0.0285, + "step": 30569 + }, + { + "epoch": 3.6250444681607967, + "grad_norm": 0.31449918163465435, + "learning_rate": 1.1433219307135563e-06, + "loss": 0.0104, + "step": 30570 + }, + { + "epoch": 3.625163049922922, + "grad_norm": 0.2861137998563407, + "learning_rate": 1.1426044400482672e-06, + "loss": 0.0116, + "step": 30571 + }, + { + "epoch": 3.6252816316850467, + "grad_norm": 0.5537989095527303, + "learning_rate": 1.1418871693189005e-06, + "loss": 0.0317, + "step": 30572 + }, + { + "epoch": 3.625400213447172, + "grad_norm": 0.553716280192245, + "learning_rate": 1.1411701185320866e-06, + "loss": 0.0308, + "step": 30573 + }, + { + "epoch": 3.6255187952092967, + "grad_norm": 0.4509675912645148, + "learning_rate": 1.1404532876944207e-06, + "loss": 0.0205, + "step": 30574 + }, + { + "epoch": 3.625637376971422, + "grad_norm": 0.7513963086508453, + "learning_rate": 1.1397366768125195e-06, + "loss": 0.0353, + "step": 30575 + }, + { + "epoch": 3.6257559587335466, + "grad_norm": 0.4986654413304326, + "learning_rate": 1.139020285892986e-06, + "loss": 0.0238, + "step": 30576 + }, + { + "epoch": 3.625874540495672, + "grad_norm": 0.4105601778278942, + "learning_rate": 1.1383041149424233e-06, + "loss": 0.0148, + "step": 30577 + }, + { + "epoch": 3.625993122257797, + "grad_norm": 0.2808480891265015, + "learning_rate": 1.1375881639674402e-06, + "loss": 0.0125, + "step": 30578 + }, + { + "epoch": 3.6261117040199218, + "grad_norm": 0.5240268171312453, + "learning_rate": 1.1368724329746256e-06, + "loss": 0.0212, + "step": 30579 + }, + { + "epoch": 3.6262302857820465, + "grad_norm": 0.5236605028219777, + "learning_rate": 1.1361569219705908e-06, + "loss": 0.0302, + "step": 30580 + }, + { + "epoch": 3.6263488675441717, + "grad_norm": 0.6193294769222107, + "learning_rate": 1.1354416309619226e-06, + "loss": 0.0317, + "step": 30581 + }, + { + "epoch": 3.626467449306297, + "grad_norm": 0.5619745054588182, + "learning_rate": 1.1347265599552209e-06, + "loss": 0.0296, + "step": 30582 + }, + { + "epoch": 3.6265860310684217, + "grad_norm": 0.4772974317596211, + "learning_rate": 1.1340117089570695e-06, + "loss": 0.023, + "step": 30583 + }, + { + "epoch": 3.6267046128305465, + "grad_norm": 0.7798652979120075, + "learning_rate": 1.1332970779740687e-06, + "loss": 0.0312, + "step": 30584 + }, + { + "epoch": 3.6268231945926717, + "grad_norm": 0.6002425581025375, + "learning_rate": 1.1325826670127992e-06, + "loss": 0.028, + "step": 30585 + }, + { + "epoch": 3.626941776354797, + "grad_norm": 0.6840424862935617, + "learning_rate": 1.1318684760798476e-06, + "loss": 0.0311, + "step": 30586 + }, + { + "epoch": 3.6270603581169216, + "grad_norm": 0.46708067781406, + "learning_rate": 1.1311545051818e-06, + "loss": 0.0195, + "step": 30587 + }, + { + "epoch": 3.6271789398790464, + "grad_norm": 0.5467220682116494, + "learning_rate": 1.1304407543252404e-06, + "loss": 0.0276, + "step": 30588 + }, + { + "epoch": 3.6272975216411716, + "grad_norm": 0.5979449142684701, + "learning_rate": 1.1297272235167466e-06, + "loss": 0.0248, + "step": 30589 + }, + { + "epoch": 3.627416103403297, + "grad_norm": 0.36135620351451775, + "learning_rate": 1.1290139127628912e-06, + "loss": 0.0177, + "step": 30590 + }, + { + "epoch": 3.6275346851654215, + "grad_norm": 0.47268123570992593, + "learning_rate": 1.1283008220702606e-06, + "loss": 0.0226, + "step": 30591 + }, + { + "epoch": 3.6276532669275463, + "grad_norm": 0.4110461040884841, + "learning_rate": 1.1275879514454218e-06, + "loss": 0.02, + "step": 30592 + }, + { + "epoch": 3.6277718486896715, + "grad_norm": 0.46893417988084124, + "learning_rate": 1.1268753008949502e-06, + "loss": 0.0211, + "step": 30593 + }, + { + "epoch": 3.6278904304517967, + "grad_norm": 0.8147366839824469, + "learning_rate": 1.126162870425404e-06, + "loss": 0.0325, + "step": 30594 + }, + { + "epoch": 3.6280090122139215, + "grad_norm": 0.5122236934921792, + "learning_rate": 1.12545066004337e-06, + "loss": 0.0211, + "step": 30595 + }, + { + "epoch": 3.6281275939760462, + "grad_norm": 0.624299873868875, + "learning_rate": 1.1247386697553986e-06, + "loss": 0.0262, + "step": 30596 + }, + { + "epoch": 3.6282461757381714, + "grad_norm": 0.3175810713865264, + "learning_rate": 1.1240268995680593e-06, + "loss": 0.0121, + "step": 30597 + }, + { + "epoch": 3.6283647575002966, + "grad_norm": 0.45376718457550325, + "learning_rate": 1.1233153494879134e-06, + "loss": 0.0196, + "step": 30598 + }, + { + "epoch": 3.6284833392624214, + "grad_norm": 0.6814902003338795, + "learning_rate": 1.1226040195215226e-06, + "loss": 0.0309, + "step": 30599 + }, + { + "epoch": 3.628601921024546, + "grad_norm": 0.886054269676723, + "learning_rate": 1.1218929096754427e-06, + "loss": 0.0397, + "step": 30600 + }, + { + "epoch": 3.6287205027866714, + "grad_norm": 0.7101241906598454, + "learning_rate": 1.121182019956224e-06, + "loss": 0.0392, + "step": 30601 + }, + { + "epoch": 3.6288390845487966, + "grad_norm": 0.37783827599086106, + "learning_rate": 1.1204713503704305e-06, + "loss": 0.018, + "step": 30602 + }, + { + "epoch": 3.6289576663109213, + "grad_norm": 0.6718328795990541, + "learning_rate": 1.1197609009246045e-06, + "loss": 0.0338, + "step": 30603 + }, + { + "epoch": 3.6290762480730465, + "grad_norm": 0.48448002450394395, + "learning_rate": 1.119050671625299e-06, + "loss": 0.0205, + "step": 30604 + }, + { + "epoch": 3.6291948298351713, + "grad_norm": 0.4786484100410678, + "learning_rate": 1.1183406624790643e-06, + "loss": 0.0283, + "step": 30605 + }, + { + "epoch": 3.6293134115972965, + "grad_norm": 0.7278625257043722, + "learning_rate": 1.1176308734924395e-06, + "loss": 0.0196, + "step": 30606 + }, + { + "epoch": 3.6294319933594212, + "grad_norm": 0.4281238009634022, + "learning_rate": 1.116921304671978e-06, + "loss": 0.0138, + "step": 30607 + }, + { + "epoch": 3.6295505751215464, + "grad_norm": 0.4386010608780737, + "learning_rate": 1.11621195602421e-06, + "loss": 0.031, + "step": 30608 + }, + { + "epoch": 3.629669156883671, + "grad_norm": 0.47146162720263646, + "learning_rate": 1.1155028275556783e-06, + "loss": 0.0198, + "step": 30609 + }, + { + "epoch": 3.6297877386457964, + "grad_norm": 0.3062928532874005, + "learning_rate": 1.1147939192729218e-06, + "loss": 0.011, + "step": 30610 + }, + { + "epoch": 3.629906320407921, + "grad_norm": 0.393814502978095, + "learning_rate": 1.1140852311824794e-06, + "loss": 0.015, + "step": 30611 + }, + { + "epoch": 3.6300249021700464, + "grad_norm": 0.4980923676472471, + "learning_rate": 1.1133767632908799e-06, + "loss": 0.0178, + "step": 30612 + }, + { + "epoch": 3.630143483932171, + "grad_norm": 0.4330434066733445, + "learning_rate": 1.1126685156046506e-06, + "loss": 0.0164, + "step": 30613 + }, + { + "epoch": 3.6302620656942963, + "grad_norm": 0.49210634166012224, + "learning_rate": 1.111960488130326e-06, + "loss": 0.0224, + "step": 30614 + }, + { + "epoch": 3.630380647456421, + "grad_norm": 0.5273597182052229, + "learning_rate": 1.1112526808744333e-06, + "loss": 0.0295, + "step": 30615 + }, + { + "epoch": 3.6304992292185463, + "grad_norm": 0.6588991345900581, + "learning_rate": 1.1105450938434957e-06, + "loss": 0.0314, + "step": 30616 + }, + { + "epoch": 3.630617810980671, + "grad_norm": 0.6375462421735701, + "learning_rate": 1.1098377270440386e-06, + "loss": 0.0362, + "step": 30617 + }, + { + "epoch": 3.6307363927427962, + "grad_norm": 0.5723075993197545, + "learning_rate": 1.1091305804825842e-06, + "loss": 0.0174, + "step": 30618 + }, + { + "epoch": 3.630854974504921, + "grad_norm": 0.7995138441445279, + "learning_rate": 1.108423654165644e-06, + "loss": 0.0457, + "step": 30619 + }, + { + "epoch": 3.630973556267046, + "grad_norm": 0.5269905355224596, + "learning_rate": 1.1077169480997407e-06, + "loss": 0.0281, + "step": 30620 + }, + { + "epoch": 3.631092138029171, + "grad_norm": 0.36743227401754575, + "learning_rate": 1.1070104622913884e-06, + "loss": 0.0147, + "step": 30621 + }, + { + "epoch": 3.631210719791296, + "grad_norm": 0.3906455992659679, + "learning_rate": 1.1063041967471043e-06, + "loss": 0.0228, + "step": 30622 + }, + { + "epoch": 3.631329301553421, + "grad_norm": 0.6787659985633776, + "learning_rate": 1.1055981514733915e-06, + "loss": 0.0387, + "step": 30623 + }, + { + "epoch": 3.631447883315546, + "grad_norm": 0.5071885413085371, + "learning_rate": 1.104892326476764e-06, + "loss": 0.0294, + "step": 30624 + }, + { + "epoch": 3.631566465077671, + "grad_norm": 0.5124258356595751, + "learning_rate": 1.1041867217637253e-06, + "loss": 0.0219, + "step": 30625 + }, + { + "epoch": 3.631685046839796, + "grad_norm": 0.4984115913511895, + "learning_rate": 1.103481337340781e-06, + "loss": 0.0264, + "step": 30626 + }, + { + "epoch": 3.6318036286019213, + "grad_norm": 0.852481167722574, + "learning_rate": 1.1027761732144399e-06, + "loss": 0.0455, + "step": 30627 + }, + { + "epoch": 3.631922210364046, + "grad_norm": 0.3482456807001078, + "learning_rate": 1.102071229391191e-06, + "loss": 0.0173, + "step": 30628 + }, + { + "epoch": 3.632040792126171, + "grad_norm": 0.670199702535674, + "learning_rate": 1.1013665058775464e-06, + "loss": 0.0366, + "step": 30629 + }, + { + "epoch": 3.632159373888296, + "grad_norm": 0.5698865615371608, + "learning_rate": 1.1006620026799918e-06, + "loss": 0.0236, + "step": 30630 + }, + { + "epoch": 3.632277955650421, + "grad_norm": 0.4412866954016474, + "learning_rate": 1.099957719805028e-06, + "loss": 0.0182, + "step": 30631 + }, + { + "epoch": 3.632396537412546, + "grad_norm": 0.631225172224313, + "learning_rate": 1.0992536572591444e-06, + "loss": 0.0242, + "step": 30632 + }, + { + "epoch": 3.6325151191746707, + "grad_norm": 0.39510451154442877, + "learning_rate": 1.0985498150488383e-06, + "loss": 0.0187, + "step": 30633 + }, + { + "epoch": 3.632633700936796, + "grad_norm": 0.4601137296545177, + "learning_rate": 1.0978461931805877e-06, + "loss": 0.0213, + "step": 30634 + }, + { + "epoch": 3.632752282698921, + "grad_norm": 0.5131007109721877, + "learning_rate": 1.0971427916608851e-06, + "loss": 0.0213, + "step": 30635 + }, + { + "epoch": 3.632870864461046, + "grad_norm": 0.3959398099509853, + "learning_rate": 1.0964396104962138e-06, + "loss": 0.0157, + "step": 30636 + }, + { + "epoch": 3.6329894462231707, + "grad_norm": 0.34960841471469123, + "learning_rate": 1.0957366496930576e-06, + "loss": 0.0118, + "step": 30637 + }, + { + "epoch": 3.633108027985296, + "grad_norm": 0.40591177248575283, + "learning_rate": 1.0950339092579003e-06, + "loss": 0.0158, + "step": 30638 + }, + { + "epoch": 3.633226609747421, + "grad_norm": 0.7222169943984436, + "learning_rate": 1.094331389197209e-06, + "loss": 0.0437, + "step": 30639 + }, + { + "epoch": 3.633345191509546, + "grad_norm": 0.39182381363488095, + "learning_rate": 1.0936290895174727e-06, + "loss": 0.016, + "step": 30640 + }, + { + "epoch": 3.6334637732716706, + "grad_norm": 0.8655694778086095, + "learning_rate": 1.092927010225156e-06, + "loss": 0.0373, + "step": 30641 + }, + { + "epoch": 3.633582355033796, + "grad_norm": 0.5217586368718115, + "learning_rate": 1.0922251513267423e-06, + "loss": 0.0177, + "step": 30642 + }, + { + "epoch": 3.633700936795921, + "grad_norm": 0.7468190457018233, + "learning_rate": 1.091523512828685e-06, + "loss": 0.0403, + "step": 30643 + }, + { + "epoch": 3.6338195185580457, + "grad_norm": 0.39627669803961263, + "learning_rate": 1.0908220947374704e-06, + "loss": 0.0188, + "step": 30644 + }, + { + "epoch": 3.6339381003201705, + "grad_norm": 0.690009901686504, + "learning_rate": 1.0901208970595545e-06, + "loss": 0.022, + "step": 30645 + }, + { + "epoch": 3.6340566820822957, + "grad_norm": 0.6040736522335324, + "learning_rate": 1.0894199198014016e-06, + "loss": 0.0331, + "step": 30646 + }, + { + "epoch": 3.634175263844421, + "grad_norm": 0.6467558430326676, + "learning_rate": 1.0887191629694787e-06, + "loss": 0.0306, + "step": 30647 + }, + { + "epoch": 3.6342938456065457, + "grad_norm": 0.576721047794708, + "learning_rate": 1.0880186265702391e-06, + "loss": 0.0266, + "step": 30648 + }, + { + "epoch": 3.6344124273686704, + "grad_norm": 0.38249385804760627, + "learning_rate": 1.0873183106101526e-06, + "loss": 0.0174, + "step": 30649 + }, + { + "epoch": 3.6345310091307956, + "grad_norm": 0.3794906405903929, + "learning_rate": 1.0866182150956584e-06, + "loss": 0.0134, + "step": 30650 + }, + { + "epoch": 3.634649590892921, + "grad_norm": 0.36877271968139186, + "learning_rate": 1.0859183400332262e-06, + "loss": 0.0233, + "step": 30651 + }, + { + "epoch": 3.6347681726550456, + "grad_norm": 0.7898336122284042, + "learning_rate": 1.0852186854292984e-06, + "loss": 0.0481, + "step": 30652 + }, + { + "epoch": 3.634886754417171, + "grad_norm": 0.4264282989734277, + "learning_rate": 1.0845192512903307e-06, + "loss": 0.0189, + "step": 30653 + }, + { + "epoch": 3.6350053361792956, + "grad_norm": 0.8131114137859514, + "learning_rate": 1.0838200376227654e-06, + "loss": 0.0306, + "step": 30654 + }, + { + "epoch": 3.6351239179414208, + "grad_norm": 0.31729420762093946, + "learning_rate": 1.0831210444330525e-06, + "loss": 0.0167, + "step": 30655 + }, + { + "epoch": 3.6352424997035455, + "grad_norm": 0.8174195814930126, + "learning_rate": 1.0824222717276373e-06, + "loss": 0.0263, + "step": 30656 + }, + { + "epoch": 3.6353610814656707, + "grad_norm": 0.7458543463677608, + "learning_rate": 1.0817237195129559e-06, + "loss": 0.0337, + "step": 30657 + }, + { + "epoch": 3.6354796632277955, + "grad_norm": 0.7008790766703236, + "learning_rate": 1.0810253877954535e-06, + "loss": 0.0375, + "step": 30658 + }, + { + "epoch": 3.6355982449899207, + "grad_norm": 0.609589981001295, + "learning_rate": 1.0803272765815636e-06, + "loss": 0.032, + "step": 30659 + }, + { + "epoch": 3.6357168267520454, + "grad_norm": 0.38860936427966913, + "learning_rate": 1.0796293858777283e-06, + "loss": 0.0163, + "step": 30660 + }, + { + "epoch": 3.6358354085141706, + "grad_norm": 0.7519226016784752, + "learning_rate": 1.0789317156903678e-06, + "loss": 0.0416, + "step": 30661 + }, + { + "epoch": 3.6359539902762954, + "grad_norm": 0.31591328263438584, + "learning_rate": 1.078234266025932e-06, + "loss": 0.0181, + "step": 30662 + }, + { + "epoch": 3.6360725720384206, + "grad_norm": 0.7234443597744468, + "learning_rate": 1.0775370368908383e-06, + "loss": 0.0332, + "step": 30663 + }, + { + "epoch": 3.6361911538005454, + "grad_norm": 0.40058004308083744, + "learning_rate": 1.0768400282915176e-06, + "loss": 0.0229, + "step": 30664 + }, + { + "epoch": 3.6363097355626706, + "grad_norm": 0.5088048546031009, + "learning_rate": 1.0761432402343923e-06, + "loss": 0.0268, + "step": 30665 + }, + { + "epoch": 3.6364283173247953, + "grad_norm": 0.5366504070687661, + "learning_rate": 1.0754466727258912e-06, + "loss": 0.019, + "step": 30666 + }, + { + "epoch": 3.6365468990869205, + "grad_norm": 0.29659187098947043, + "learning_rate": 1.0747503257724362e-06, + "loss": 0.0174, + "step": 30667 + }, + { + "epoch": 3.6366654808490453, + "grad_norm": 0.7354955566175789, + "learning_rate": 1.074054199380442e-06, + "loss": 0.0424, + "step": 30668 + }, + { + "epoch": 3.6367840626111705, + "grad_norm": 0.5221193521245354, + "learning_rate": 1.0733582935563285e-06, + "loss": 0.0269, + "step": 30669 + }, + { + "epoch": 3.6369026443732952, + "grad_norm": 0.8033488362640042, + "learning_rate": 1.0726626083065099e-06, + "loss": 0.0305, + "step": 30670 + }, + { + "epoch": 3.6370212261354204, + "grad_norm": 0.6220034798324674, + "learning_rate": 1.0719671436374035e-06, + "loss": 0.0244, + "step": 30671 + }, + { + "epoch": 3.637139807897545, + "grad_norm": 0.7790805290016886, + "learning_rate": 1.0712718995554095e-06, + "loss": 0.0318, + "step": 30672 + }, + { + "epoch": 3.6372583896596704, + "grad_norm": 0.5447436962992006, + "learning_rate": 1.0705768760669532e-06, + "loss": 0.0286, + "step": 30673 + }, + { + "epoch": 3.637376971421795, + "grad_norm": 0.297441120829208, + "learning_rate": 1.0698820731784326e-06, + "loss": 0.0144, + "step": 30674 + }, + { + "epoch": 3.6374955531839204, + "grad_norm": 0.586630596538867, + "learning_rate": 1.0691874908962507e-06, + "loss": 0.0334, + "step": 30675 + }, + { + "epoch": 3.6376141349460456, + "grad_norm": 0.4900506786371344, + "learning_rate": 1.0684931292268163e-06, + "loss": 0.0165, + "step": 30676 + }, + { + "epoch": 3.6377327167081703, + "grad_norm": 0.35638003840553806, + "learning_rate": 1.0677989881765298e-06, + "loss": 0.0171, + "step": 30677 + }, + { + "epoch": 3.637851298470295, + "grad_norm": 0.625744453749271, + "learning_rate": 1.0671050677517892e-06, + "loss": 0.0194, + "step": 30678 + }, + { + "epoch": 3.6379698802324203, + "grad_norm": 0.4531698286430672, + "learning_rate": 1.0664113679589888e-06, + "loss": 0.0171, + "step": 30679 + }, + { + "epoch": 3.6380884619945455, + "grad_norm": 0.6334999272543679, + "learning_rate": 1.0657178888045294e-06, + "loss": 0.0263, + "step": 30680 + }, + { + "epoch": 3.6382070437566703, + "grad_norm": 0.5177708249341996, + "learning_rate": 1.0650246302947975e-06, + "loss": 0.0286, + "step": 30681 + }, + { + "epoch": 3.638325625518795, + "grad_norm": 0.3420156785895251, + "learning_rate": 1.0643315924361908e-06, + "loss": 0.0138, + "step": 30682 + }, + { + "epoch": 3.63844420728092, + "grad_norm": 1.1128607514703148, + "learning_rate": 1.063638775235093e-06, + "loss": 0.041, + "step": 30683 + }, + { + "epoch": 3.6385627890430454, + "grad_norm": 0.5650157715351332, + "learning_rate": 1.0629461786978933e-06, + "loss": 0.0228, + "step": 30684 + }, + { + "epoch": 3.63868137080517, + "grad_norm": 0.40512484465721627, + "learning_rate": 1.0622538028309758e-06, + "loss": 0.0142, + "step": 30685 + }, + { + "epoch": 3.638799952567295, + "grad_norm": 0.6272071028071852, + "learning_rate": 1.0615616476407241e-06, + "loss": 0.0204, + "step": 30686 + }, + { + "epoch": 3.63891853432942, + "grad_norm": 0.3630553485298499, + "learning_rate": 1.0608697131335193e-06, + "loss": 0.0147, + "step": 30687 + }, + { + "epoch": 3.6390371160915453, + "grad_norm": 0.42517620264925127, + "learning_rate": 1.0601779993157368e-06, + "loss": 0.0186, + "step": 30688 + }, + { + "epoch": 3.63915569785367, + "grad_norm": 0.5980601686735284, + "learning_rate": 1.0594865061937632e-06, + "loss": 0.0291, + "step": 30689 + }, + { + "epoch": 3.639274279615795, + "grad_norm": 0.5778785520366753, + "learning_rate": 1.05879523377396e-06, + "loss": 0.0247, + "step": 30690 + }, + { + "epoch": 3.63939286137792, + "grad_norm": 0.40828118298805083, + "learning_rate": 1.058104182062708e-06, + "loss": 0.0234, + "step": 30691 + }, + { + "epoch": 3.6395114431400453, + "grad_norm": 0.4791651378049535, + "learning_rate": 1.0574133510663746e-06, + "loss": 0.0248, + "step": 30692 + }, + { + "epoch": 3.63963002490217, + "grad_norm": 0.5579434656669813, + "learning_rate": 1.0567227407913328e-06, + "loss": 0.0187, + "step": 30693 + }, + { + "epoch": 3.639748606664295, + "grad_norm": 0.7655056683386534, + "learning_rate": 1.0560323512439434e-06, + "loss": 0.039, + "step": 30694 + }, + { + "epoch": 3.63986718842642, + "grad_norm": 0.5058454670722392, + "learning_rate": 1.0553421824305742e-06, + "loss": 0.0189, + "step": 30695 + }, + { + "epoch": 3.639985770188545, + "grad_norm": 0.6117841380441531, + "learning_rate": 1.0546522343575866e-06, + "loss": 0.028, + "step": 30696 + }, + { + "epoch": 3.64010435195067, + "grad_norm": 0.4268376932353655, + "learning_rate": 1.053962507031342e-06, + "loss": 0.0207, + "step": 30697 + }, + { + "epoch": 3.6402229337127947, + "grad_norm": 0.47860666007928626, + "learning_rate": 1.053273000458202e-06, + "loss": 0.0227, + "step": 30698 + }, + { + "epoch": 3.64034151547492, + "grad_norm": 0.6438852151703016, + "learning_rate": 1.0525837146445145e-06, + "loss": 0.0288, + "step": 30699 + }, + { + "epoch": 3.640460097237045, + "grad_norm": 0.8488920079996486, + "learning_rate": 1.0518946495966437e-06, + "loss": 0.0432, + "step": 30700 + }, + { + "epoch": 3.64057867899917, + "grad_norm": 0.43925031223340694, + "learning_rate": 1.0512058053209344e-06, + "loss": 0.0126, + "step": 30701 + }, + { + "epoch": 3.640697260761295, + "grad_norm": 0.52734470758089, + "learning_rate": 1.0505171818237375e-06, + "loss": 0.0206, + "step": 30702 + }, + { + "epoch": 3.64081584252342, + "grad_norm": 0.47643069529588117, + "learning_rate": 1.0498287791114058e-06, + "loss": 0.02, + "step": 30703 + }, + { + "epoch": 3.640934424285545, + "grad_norm": 0.5639808025168161, + "learning_rate": 1.0491405971902817e-06, + "loss": 0.0177, + "step": 30704 + }, + { + "epoch": 3.64105300604767, + "grad_norm": 0.4404881872969116, + "learning_rate": 1.0484526360667158e-06, + "loss": 0.024, + "step": 30705 + }, + { + "epoch": 3.641171587809795, + "grad_norm": 0.7921227398547116, + "learning_rate": 1.047764895747036e-06, + "loss": 0.0251, + "step": 30706 + }, + { + "epoch": 3.6412901695719198, + "grad_norm": 0.5174579407253317, + "learning_rate": 1.0470773762376012e-06, + "loss": 0.0255, + "step": 30707 + }, + { + "epoch": 3.641408751334045, + "grad_norm": 0.9024019145772182, + "learning_rate": 1.0463900775447345e-06, + "loss": 0.0402, + "step": 30708 + }, + { + "epoch": 3.6415273330961697, + "grad_norm": 0.6682147988329953, + "learning_rate": 1.045702999674783e-06, + "loss": 0.0222, + "step": 30709 + }, + { + "epoch": 3.641645914858295, + "grad_norm": 0.907505729860048, + "learning_rate": 1.0450161426340671e-06, + "loss": 0.0364, + "step": 30710 + }, + { + "epoch": 3.6417644966204197, + "grad_norm": 0.4905456518604798, + "learning_rate": 1.0443295064289315e-06, + "loss": 0.0224, + "step": 30711 + }, + { + "epoch": 3.641883078382545, + "grad_norm": 0.4334294047417947, + "learning_rate": 1.043643091065702e-06, + "loss": 0.0216, + "step": 30712 + }, + { + "epoch": 3.6420016601446696, + "grad_norm": 0.5967516650356516, + "learning_rate": 1.0429568965507037e-06, + "loss": 0.0252, + "step": 30713 + }, + { + "epoch": 3.642120241906795, + "grad_norm": 0.5267164567456263, + "learning_rate": 1.042270922890265e-06, + "loss": 0.0252, + "step": 30714 + }, + { + "epoch": 3.6422388236689196, + "grad_norm": 0.5784802123160812, + "learning_rate": 1.0415851700907087e-06, + "loss": 0.0277, + "step": 30715 + }, + { + "epoch": 3.642357405431045, + "grad_norm": 0.35489449320684713, + "learning_rate": 1.040899638158363e-06, + "loss": 0.0142, + "step": 30716 + }, + { + "epoch": 3.6424759871931696, + "grad_norm": 0.45411351620407825, + "learning_rate": 1.040214327099534e-06, + "loss": 0.0192, + "step": 30717 + }, + { + "epoch": 3.6425945689552948, + "grad_norm": 0.3925749437893302, + "learning_rate": 1.0395292369205528e-06, + "loss": 0.0148, + "step": 30718 + }, + { + "epoch": 3.6427131507174195, + "grad_norm": 0.8319037569715956, + "learning_rate": 1.038844367627728e-06, + "loss": 0.0368, + "step": 30719 + }, + { + "epoch": 3.6428317324795447, + "grad_norm": 0.31558837440884313, + "learning_rate": 1.0381597192273773e-06, + "loss": 0.0101, + "step": 30720 + }, + { + "epoch": 3.6429503142416695, + "grad_norm": 0.4932000469815746, + "learning_rate": 1.0374752917258063e-06, + "loss": 0.0278, + "step": 30721 + }, + { + "epoch": 3.6430688960037947, + "grad_norm": 0.5724717751161064, + "learning_rate": 1.0367910851293321e-06, + "loss": 0.0254, + "step": 30722 + }, + { + "epoch": 3.6431874777659194, + "grad_norm": 0.5848934222981165, + "learning_rate": 1.0361070994442556e-06, + "loss": 0.0261, + "step": 30723 + }, + { + "epoch": 3.6433060595280446, + "grad_norm": 0.4002857988538888, + "learning_rate": 1.0354233346768854e-06, + "loss": 0.0155, + "step": 30724 + }, + { + "epoch": 3.6434246412901694, + "grad_norm": 0.4644293671474941, + "learning_rate": 1.0347397908335248e-06, + "loss": 0.0221, + "step": 30725 + }, + { + "epoch": 3.6435432230522946, + "grad_norm": 0.6668469663354473, + "learning_rate": 1.0340564679204772e-06, + "loss": 0.0382, + "step": 30726 + }, + { + "epoch": 3.6436618048144194, + "grad_norm": 0.5077603728381986, + "learning_rate": 1.0333733659440404e-06, + "loss": 0.015, + "step": 30727 + }, + { + "epoch": 3.6437803865765446, + "grad_norm": 0.6398159348307507, + "learning_rate": 1.032690484910509e-06, + "loss": 0.0355, + "step": 30728 + }, + { + "epoch": 3.6438989683386698, + "grad_norm": 0.48553316133926366, + "learning_rate": 1.0320078248261811e-06, + "loss": 0.0246, + "step": 30729 + }, + { + "epoch": 3.6440175501007945, + "grad_norm": 0.5285293923422153, + "learning_rate": 1.0313253856973487e-06, + "loss": 0.0229, + "step": 30730 + }, + { + "epoch": 3.6441361318629193, + "grad_norm": 0.4141971893368603, + "learning_rate": 1.030643167530307e-06, + "loss": 0.0198, + "step": 30731 + }, + { + "epoch": 3.6442547136250445, + "grad_norm": 0.5023216191664538, + "learning_rate": 1.0299611703313367e-06, + "loss": 0.0227, + "step": 30732 + }, + { + "epoch": 3.6443732953871697, + "grad_norm": 0.5467601754066086, + "learning_rate": 1.0292793941067357e-06, + "loss": 0.0207, + "step": 30733 + }, + { + "epoch": 3.6444918771492945, + "grad_norm": 0.5487722928193859, + "learning_rate": 1.0285978388627827e-06, + "loss": 0.0276, + "step": 30734 + }, + { + "epoch": 3.644610458911419, + "grad_norm": 0.7366583651625145, + "learning_rate": 1.027916504605761e-06, + "loss": 0.0352, + "step": 30735 + }, + { + "epoch": 3.6447290406735444, + "grad_norm": 0.3914184791244896, + "learning_rate": 1.027235391341952e-06, + "loss": 0.0145, + "step": 30736 + }, + { + "epoch": 3.6448476224356696, + "grad_norm": 0.6743120880902697, + "learning_rate": 1.0265544990776338e-06, + "loss": 0.0384, + "step": 30737 + }, + { + "epoch": 3.6449662041977944, + "grad_norm": 0.5665212374043255, + "learning_rate": 1.0258738278190904e-06, + "loss": 0.0263, + "step": 30738 + }, + { + "epoch": 3.645084785959919, + "grad_norm": 0.3818154045973056, + "learning_rate": 1.0251933775725891e-06, + "loss": 0.0168, + "step": 30739 + }, + { + "epoch": 3.6452033677220443, + "grad_norm": 0.7412549518051832, + "learning_rate": 1.0245131483444027e-06, + "loss": 0.0325, + "step": 30740 + }, + { + "epoch": 3.6453219494841695, + "grad_norm": 0.5501433283037643, + "learning_rate": 1.0238331401408035e-06, + "loss": 0.0196, + "step": 30741 + }, + { + "epoch": 3.6454405312462943, + "grad_norm": 0.733553620066459, + "learning_rate": 1.0231533529680676e-06, + "loss": 0.0438, + "step": 30742 + }, + { + "epoch": 3.645559113008419, + "grad_norm": 0.31944267037175916, + "learning_rate": 1.0224737868324508e-06, + "loss": 0.0092, + "step": 30743 + }, + { + "epoch": 3.6456776947705443, + "grad_norm": 0.5768028188722137, + "learning_rate": 1.0217944417402231e-06, + "loss": 0.0252, + "step": 30744 + }, + { + "epoch": 3.6457962765326695, + "grad_norm": 0.7275954053639971, + "learning_rate": 1.0211153176976462e-06, + "loss": 0.0299, + "step": 30745 + }, + { + "epoch": 3.6459148582947942, + "grad_norm": 0.39899361306611053, + "learning_rate": 1.020436414710979e-06, + "loss": 0.0158, + "step": 30746 + }, + { + "epoch": 3.646033440056919, + "grad_norm": 0.3575195387963944, + "learning_rate": 1.019757732786486e-06, + "loss": 0.0162, + "step": 30747 + }, + { + "epoch": 3.646152021819044, + "grad_norm": 0.4618887751655598, + "learning_rate": 1.0190792719304176e-06, + "loss": 0.0164, + "step": 30748 + }, + { + "epoch": 3.6462706035811694, + "grad_norm": 0.5971070362855041, + "learning_rate": 1.0184010321490357e-06, + "loss": 0.028, + "step": 30749 + }, + { + "epoch": 3.646389185343294, + "grad_norm": 0.411180388567597, + "learning_rate": 1.0177230134485848e-06, + "loss": 0.0176, + "step": 30750 + }, + { + "epoch": 3.6465077671054194, + "grad_norm": 0.6758945404867688, + "learning_rate": 1.0170452158353161e-06, + "loss": 0.0225, + "step": 30751 + }, + { + "epoch": 3.646626348867544, + "grad_norm": 0.6242249614909375, + "learning_rate": 1.0163676393154852e-06, + "loss": 0.0347, + "step": 30752 + }, + { + "epoch": 3.6467449306296693, + "grad_norm": 0.9072357586223859, + "learning_rate": 1.0156902838953347e-06, + "loss": 0.046, + "step": 30753 + }, + { + "epoch": 3.646863512391794, + "grad_norm": 0.3553959725037362, + "learning_rate": 1.0150131495811037e-06, + "loss": 0.0141, + "step": 30754 + }, + { + "epoch": 3.6469820941539193, + "grad_norm": 0.5580446049549092, + "learning_rate": 1.0143362363790404e-06, + "loss": 0.0154, + "step": 30755 + }, + { + "epoch": 3.647100675916044, + "grad_norm": 0.4889402347642849, + "learning_rate": 1.0136595442953838e-06, + "loss": 0.0157, + "step": 30756 + }, + { + "epoch": 3.6472192576781692, + "grad_norm": 1.1379669526954437, + "learning_rate": 1.0129830733363737e-06, + "loss": 0.0498, + "step": 30757 + }, + { + "epoch": 3.647337839440294, + "grad_norm": 0.9548184195422669, + "learning_rate": 1.0123068235082438e-06, + "loss": 0.0454, + "step": 30758 + }, + { + "epoch": 3.647456421202419, + "grad_norm": 0.7003532852684642, + "learning_rate": 1.0116307948172283e-06, + "loss": 0.0289, + "step": 30759 + }, + { + "epoch": 3.647575002964544, + "grad_norm": 0.5951562912516722, + "learning_rate": 1.0109549872695634e-06, + "loss": 0.0297, + "step": 30760 + }, + { + "epoch": 3.647693584726669, + "grad_norm": 0.3460190084968859, + "learning_rate": 1.010279400871475e-06, + "loss": 0.0216, + "step": 30761 + }, + { + "epoch": 3.647812166488794, + "grad_norm": 0.8062567789201769, + "learning_rate": 1.0096040356291913e-06, + "loss": 0.0243, + "step": 30762 + }, + { + "epoch": 3.647930748250919, + "grad_norm": 0.5724418652583871, + "learning_rate": 1.008928891548941e-06, + "loss": 0.0357, + "step": 30763 + }, + { + "epoch": 3.648049330013044, + "grad_norm": 0.7175721716141188, + "learning_rate": 1.0082539686369436e-06, + "loss": 0.0377, + "step": 30764 + }, + { + "epoch": 3.648167911775169, + "grad_norm": 0.44096074801936797, + "learning_rate": 1.0075792668994277e-06, + "loss": 0.0173, + "step": 30765 + }, + { + "epoch": 3.648286493537294, + "grad_norm": 0.45816962211965484, + "learning_rate": 1.0069047863426052e-06, + "loss": 0.0266, + "step": 30766 + }, + { + "epoch": 3.648405075299419, + "grad_norm": 0.7697897125175935, + "learning_rate": 1.006230526972704e-06, + "loss": 0.0367, + "step": 30767 + }, + { + "epoch": 3.648523657061544, + "grad_norm": 0.3427298057451425, + "learning_rate": 1.0055564887959307e-06, + "loss": 0.0192, + "step": 30768 + }, + { + "epoch": 3.648642238823669, + "grad_norm": 0.5469022493873739, + "learning_rate": 1.0048826718185051e-06, + "loss": 0.0281, + "step": 30769 + }, + { + "epoch": 3.6487608205857938, + "grad_norm": 0.45272896826016035, + "learning_rate": 1.0042090760466306e-06, + "loss": 0.0207, + "step": 30770 + }, + { + "epoch": 3.648879402347919, + "grad_norm": 0.3633449588356305, + "learning_rate": 1.0035357014865298e-06, + "loss": 0.0121, + "step": 30771 + }, + { + "epoch": 3.6489979841100437, + "grad_norm": 0.6930461703965459, + "learning_rate": 1.0028625481443982e-06, + "loss": 0.032, + "step": 30772 + }, + { + "epoch": 3.649116565872169, + "grad_norm": 0.4329847837111056, + "learning_rate": 1.0021896160264498e-06, + "loss": 0.0213, + "step": 30773 + }, + { + "epoch": 3.6492351476342937, + "grad_norm": 0.5250541352728472, + "learning_rate": 1.0015169051388828e-06, + "loss": 0.0195, + "step": 30774 + }, + { + "epoch": 3.649353729396419, + "grad_norm": 0.6354707156555486, + "learning_rate": 1.0008444154879031e-06, + "loss": 0.0299, + "step": 30775 + }, + { + "epoch": 3.6494723111585436, + "grad_norm": 0.915663347476454, + "learning_rate": 1.0001721470797088e-06, + "loss": 0.0466, + "step": 30776 + }, + { + "epoch": 3.649590892920669, + "grad_norm": 0.3704078158096794, + "learning_rate": 9.995000999204918e-07, + "loss": 0.0207, + "step": 30777 + }, + { + "epoch": 3.649709474682794, + "grad_norm": 0.716134680994382, + "learning_rate": 9.988282740164584e-07, + "loss": 0.0272, + "step": 30778 + }, + { + "epoch": 3.649828056444919, + "grad_norm": 0.47387834075884744, + "learning_rate": 9.981566693737925e-07, + "loss": 0.0225, + "step": 30779 + }, + { + "epoch": 3.6499466382070436, + "grad_norm": 0.39377569442976534, + "learning_rate": 9.97485285998695e-07, + "loss": 0.0197, + "step": 30780 + }, + { + "epoch": 3.6500652199691688, + "grad_norm": 0.5719952867072153, + "learning_rate": 9.96814123897341e-07, + "loss": 0.0215, + "step": 30781 + }, + { + "epoch": 3.650183801731294, + "grad_norm": 0.4637190447627906, + "learning_rate": 9.961431830759316e-07, + "loss": 0.0227, + "step": 30782 + }, + { + "epoch": 3.6503023834934187, + "grad_norm": 0.31235425074309975, + "learning_rate": 9.954724635406448e-07, + "loss": 0.0108, + "step": 30783 + }, + { + "epoch": 3.6504209652555435, + "grad_norm": 0.4354200222448532, + "learning_rate": 9.948019652976648e-07, + "loss": 0.0207, + "step": 30784 + }, + { + "epoch": 3.6505395470176687, + "grad_norm": 0.3958017460637726, + "learning_rate": 9.941316883531726e-07, + "loss": 0.0163, + "step": 30785 + }, + { + "epoch": 3.650658128779794, + "grad_norm": 0.6783767335357475, + "learning_rate": 9.934616327133521e-07, + "loss": 0.0372, + "step": 30786 + }, + { + "epoch": 3.6507767105419187, + "grad_norm": 0.41719118763922397, + "learning_rate": 9.927917983843765e-07, + "loss": 0.0156, + "step": 30787 + }, + { + "epoch": 3.6508952923040434, + "grad_norm": 0.5462468032810734, + "learning_rate": 9.921221853724155e-07, + "loss": 0.0224, + "step": 30788 + }, + { + "epoch": 3.6510138740661686, + "grad_norm": 0.40248421765697767, + "learning_rate": 9.914527936836531e-07, + "loss": 0.0157, + "step": 30789 + }, + { + "epoch": 3.651132455828294, + "grad_norm": 0.5366933691422126, + "learning_rate": 9.907836233242513e-07, + "loss": 0.0218, + "step": 30790 + }, + { + "epoch": 3.6512510375904186, + "grad_norm": 0.6118009876700656, + "learning_rate": 9.90114674300388e-07, + "loss": 0.0302, + "step": 30791 + }, + { + "epoch": 3.6513696193525433, + "grad_norm": 0.37197161259626443, + "learning_rate": 9.894459466182172e-07, + "loss": 0.015, + "step": 30792 + }, + { + "epoch": 3.6514882011146685, + "grad_norm": 0.6948446118315804, + "learning_rate": 9.887774402839166e-07, + "loss": 0.0313, + "step": 30793 + }, + { + "epoch": 3.6516067828767937, + "grad_norm": 0.48090015200510056, + "learning_rate": 9.881091553036403e-07, + "loss": 0.0208, + "step": 30794 + }, + { + "epoch": 3.6517253646389185, + "grad_norm": 0.5653941250367356, + "learning_rate": 9.874410916835526e-07, + "loss": 0.0374, + "step": 30795 + }, + { + "epoch": 3.6518439464010433, + "grad_norm": 0.7305041392737872, + "learning_rate": 9.867732494298121e-07, + "loss": 0.0331, + "step": 30796 + }, + { + "epoch": 3.6519625281631685, + "grad_norm": 0.7694055851689255, + "learning_rate": 9.861056285485754e-07, + "loss": 0.0361, + "step": 30797 + }, + { + "epoch": 3.6520811099252937, + "grad_norm": 0.6995042053314955, + "learning_rate": 9.854382290459985e-07, + "loss": 0.0293, + "step": 30798 + }, + { + "epoch": 3.6521996916874184, + "grad_norm": 0.39869941608356846, + "learning_rate": 9.847710509282322e-07, + "loss": 0.0155, + "step": 30799 + }, + { + "epoch": 3.652318273449543, + "grad_norm": 0.5368727756555653, + "learning_rate": 9.84104094201424e-07, + "loss": 0.0236, + "step": 30800 + }, + { + "epoch": 3.6524368552116684, + "grad_norm": 0.40330970282519696, + "learning_rate": 9.834373588717277e-07, + "loss": 0.0149, + "step": 30801 + }, + { + "epoch": 3.6525554369737936, + "grad_norm": 0.447317031892217, + "learning_rate": 9.82770844945291e-07, + "loss": 0.018, + "step": 30802 + }, + { + "epoch": 3.6526740187359183, + "grad_norm": 0.5596007059446945, + "learning_rate": 9.821045524282506e-07, + "loss": 0.0199, + "step": 30803 + }, + { + "epoch": 3.6527926004980436, + "grad_norm": 0.4267489291614389, + "learning_rate": 9.814384813267573e-07, + "loss": 0.0195, + "step": 30804 + }, + { + "epoch": 3.6529111822601683, + "grad_norm": 0.7069155513754719, + "learning_rate": 9.80772631646945e-07, + "loss": 0.0353, + "step": 30805 + }, + { + "epoch": 3.6530297640222935, + "grad_norm": 0.37411700463372066, + "learning_rate": 9.80107003394956e-07, + "loss": 0.0136, + "step": 30806 + }, + { + "epoch": 3.6531483457844183, + "grad_norm": 0.4908239377776413, + "learning_rate": 9.794415965769243e-07, + "loss": 0.0154, + "step": 30807 + }, + { + "epoch": 3.6532669275465435, + "grad_norm": 0.4597669198402222, + "learning_rate": 9.787764111989839e-07, + "loss": 0.0221, + "step": 30808 + }, + { + "epoch": 3.6533855093086682, + "grad_norm": 0.6493412652656324, + "learning_rate": 9.781114472672742e-07, + "loss": 0.0321, + "step": 30809 + }, + { + "epoch": 3.6535040910707934, + "grad_norm": 0.3101810133702138, + "learning_rate": 9.774467047879155e-07, + "loss": 0.0111, + "step": 30810 + }, + { + "epoch": 3.653622672832918, + "grad_norm": 0.525027735025754, + "learning_rate": 9.767821837670387e-07, + "loss": 0.0228, + "step": 30811 + }, + { + "epoch": 3.6537412545950434, + "grad_norm": 0.38142228662514205, + "learning_rate": 9.761178842107699e-07, + "loss": 0.0209, + "step": 30812 + }, + { + "epoch": 3.653859836357168, + "grad_norm": 1.0769591381145713, + "learning_rate": 9.754538061252372e-07, + "loss": 0.0509, + "step": 30813 + }, + { + "epoch": 3.6539784181192934, + "grad_norm": 0.30923598922051543, + "learning_rate": 9.747899495165607e-07, + "loss": 0.0145, + "step": 30814 + }, + { + "epoch": 3.654096999881418, + "grad_norm": 0.4214693051517658, + "learning_rate": 9.741263143908524e-07, + "loss": 0.0143, + "step": 30815 + }, + { + "epoch": 3.6542155816435433, + "grad_norm": 0.80907303183163, + "learning_rate": 9.734629007542462e-07, + "loss": 0.0201, + "step": 30816 + }, + { + "epoch": 3.654334163405668, + "grad_norm": 0.37627840766797627, + "learning_rate": 9.727997086128426e-07, + "loss": 0.0168, + "step": 30817 + }, + { + "epoch": 3.6544527451677933, + "grad_norm": 0.33440031495537326, + "learning_rate": 9.72136737972762e-07, + "loss": 0.0104, + "step": 30818 + }, + { + "epoch": 3.654571326929918, + "grad_norm": 0.5007304329570129, + "learning_rate": 9.714739888401159e-07, + "loss": 0.0276, + "step": 30819 + }, + { + "epoch": 3.6546899086920432, + "grad_norm": 0.6046228305707222, + "learning_rate": 9.70811461221019e-07, + "loss": 0.0208, + "step": 30820 + }, + { + "epoch": 3.654808490454168, + "grad_norm": 0.49369147642639954, + "learning_rate": 9.70149155121569e-07, + "loss": 0.0213, + "step": 30821 + }, + { + "epoch": 3.654927072216293, + "grad_norm": 0.6989533715383505, + "learning_rate": 9.694870705478754e-07, + "loss": 0.0325, + "step": 30822 + }, + { + "epoch": 3.655045653978418, + "grad_norm": 0.4268582798493749, + "learning_rate": 9.68825207506044e-07, + "loss": 0.0181, + "step": 30823 + }, + { + "epoch": 3.655164235740543, + "grad_norm": 0.8290503493355689, + "learning_rate": 9.681635660021755e-07, + "loss": 0.0393, + "step": 30824 + }, + { + "epoch": 3.655282817502668, + "grad_norm": 0.34679472271012074, + "learning_rate": 9.675021460423733e-07, + "loss": 0.0182, + "step": 30825 + }, + { + "epoch": 3.655401399264793, + "grad_norm": 0.6063048616719918, + "learning_rate": 9.668409476327245e-07, + "loss": 0.0213, + "step": 30826 + }, + { + "epoch": 3.6555199810269183, + "grad_norm": 0.565818836000189, + "learning_rate": 9.66179970779335e-07, + "loss": 0.0329, + "step": 30827 + }, + { + "epoch": 3.655638562789043, + "grad_norm": 0.5794322045689522, + "learning_rate": 9.655192154882919e-07, + "loss": 0.0237, + "step": 30828 + }, + { + "epoch": 3.655757144551168, + "grad_norm": 0.4388080369365939, + "learning_rate": 9.648586817656873e-07, + "loss": 0.03, + "step": 30829 + }, + { + "epoch": 3.655875726313293, + "grad_norm": 0.5477246779636937, + "learning_rate": 9.641983696176138e-07, + "loss": 0.0265, + "step": 30830 + }, + { + "epoch": 3.6559943080754183, + "grad_norm": 0.6887578730893199, + "learning_rate": 9.63538279050158e-07, + "loss": 0.0282, + "step": 30831 + }, + { + "epoch": 3.656112889837543, + "grad_norm": 0.46292517563268715, + "learning_rate": 9.62878410069401e-07, + "loss": 0.0178, + "step": 30832 + }, + { + "epoch": 3.6562314715996678, + "grad_norm": 0.4913996737944648, + "learning_rate": 9.6221876268143e-07, + "loss": 0.0176, + "step": 30833 + }, + { + "epoch": 3.656350053361793, + "grad_norm": 0.7479795661575599, + "learning_rate": 9.615593368923258e-07, + "loss": 0.0322, + "step": 30834 + }, + { + "epoch": 3.656468635123918, + "grad_norm": 0.4745376369249839, + "learning_rate": 9.609001327081647e-07, + "loss": 0.0194, + "step": 30835 + }, + { + "epoch": 3.656587216886043, + "grad_norm": 0.5931324594177699, + "learning_rate": 9.602411501350273e-07, + "loss": 0.0305, + "step": 30836 + }, + { + "epoch": 3.6567057986481677, + "grad_norm": 0.4638733242037661, + "learning_rate": 9.59582389178984e-07, + "loss": 0.0207, + "step": 30837 + }, + { + "epoch": 3.656824380410293, + "grad_norm": 0.46698374994242825, + "learning_rate": 9.58923849846116e-07, + "loss": 0.0194, + "step": 30838 + }, + { + "epoch": 3.656942962172418, + "grad_norm": 0.4113869771413013, + "learning_rate": 9.582655321424855e-07, + "loss": 0.0182, + "step": 30839 + }, + { + "epoch": 3.657061543934543, + "grad_norm": 0.6606641110476835, + "learning_rate": 9.576074360741677e-07, + "loss": 0.0334, + "step": 30840 + }, + { + "epoch": 3.6571801256966676, + "grad_norm": 0.4460010299665914, + "learning_rate": 9.56949561647222e-07, + "loss": 0.0201, + "step": 30841 + }, + { + "epoch": 3.657298707458793, + "grad_norm": 0.34401701809269297, + "learning_rate": 9.562919088677208e-07, + "loss": 0.0136, + "step": 30842 + }, + { + "epoch": 3.657417289220918, + "grad_norm": 0.5417242618608592, + "learning_rate": 9.556344777417237e-07, + "loss": 0.0249, + "step": 30843 + }, + { + "epoch": 3.657535870983043, + "grad_norm": 0.9277052165607277, + "learning_rate": 9.549772682752894e-07, + "loss": 0.0455, + "step": 30844 + }, + { + "epoch": 3.6576544527451675, + "grad_norm": 0.5366317071165856, + "learning_rate": 9.5432028047448e-07, + "loss": 0.0311, + "step": 30845 + }, + { + "epoch": 3.6577730345072927, + "grad_norm": 0.5906026363927463, + "learning_rate": 9.536635143453515e-07, + "loss": 0.024, + "step": 30846 + }, + { + "epoch": 3.657891616269418, + "grad_norm": 0.4783951071727468, + "learning_rate": 9.530069698939604e-07, + "loss": 0.022, + "step": 30847 + }, + { + "epoch": 3.6580101980315427, + "grad_norm": 0.6612912746958202, + "learning_rate": 9.523506471263488e-07, + "loss": 0.0272, + "step": 30848 + }, + { + "epoch": 3.6581287797936675, + "grad_norm": 0.4247209991608297, + "learning_rate": 9.516945460485844e-07, + "loss": 0.0149, + "step": 30849 + }, + { + "epoch": 3.6582473615557927, + "grad_norm": 0.3515523193504108, + "learning_rate": 9.51038666666701e-07, + "loss": 0.0182, + "step": 30850 + }, + { + "epoch": 3.658365943317918, + "grad_norm": 0.546388834651381, + "learning_rate": 9.503830089867549e-07, + "loss": 0.02, + "step": 30851 + }, + { + "epoch": 3.6584845250800426, + "grad_norm": 0.4631248867351941, + "learning_rate": 9.497275730147775e-07, + "loss": 0.0209, + "step": 30852 + }, + { + "epoch": 3.658603106842168, + "grad_norm": 0.414679910699122, + "learning_rate": 9.490723587568279e-07, + "loss": 0.0147, + "step": 30853 + }, + { + "epoch": 3.6587216886042926, + "grad_norm": 0.38783154791255475, + "learning_rate": 9.484173662189344e-07, + "loss": 0.0172, + "step": 30854 + }, + { + "epoch": 3.658840270366418, + "grad_norm": 1.0131366018902623, + "learning_rate": 9.477625954071368e-07, + "loss": 0.0509, + "step": 30855 + }, + { + "epoch": 3.6589588521285425, + "grad_norm": 1.0200063174418568, + "learning_rate": 9.471080463274746e-07, + "loss": 0.0608, + "step": 30856 + }, + { + "epoch": 3.6590774338906678, + "grad_norm": 0.798895868617623, + "learning_rate": 9.464537189859818e-07, + "loss": 0.0336, + "step": 30857 + }, + { + "epoch": 3.6591960156527925, + "grad_norm": 0.661417240755807, + "learning_rate": 9.457996133886899e-07, + "loss": 0.023, + "step": 30858 + }, + { + "epoch": 3.6593145974149177, + "grad_norm": 0.4717018135466655, + "learning_rate": 9.451457295416243e-07, + "loss": 0.0167, + "step": 30859 + }, + { + "epoch": 3.6594331791770425, + "grad_norm": 0.38828293797291885, + "learning_rate": 9.444920674508223e-07, + "loss": 0.0141, + "step": 30860 + }, + { + "epoch": 3.6595517609391677, + "grad_norm": 0.5686359430022389, + "learning_rate": 9.43838627122301e-07, + "loss": 0.028, + "step": 30861 + }, + { + "epoch": 3.6596703427012924, + "grad_norm": 0.5428844482373684, + "learning_rate": 9.431854085620889e-07, + "loss": 0.0242, + "step": 30862 + }, + { + "epoch": 3.6597889244634176, + "grad_norm": 0.2871714897187784, + "learning_rate": 9.425324117762063e-07, + "loss": 0.0155, + "step": 30863 + }, + { + "epoch": 3.6599075062255424, + "grad_norm": 0.46075385599180424, + "learning_rate": 9.418796367706762e-07, + "loss": 0.0199, + "step": 30864 + }, + { + "epoch": 3.6600260879876676, + "grad_norm": 0.44900248088590566, + "learning_rate": 9.41227083551513e-07, + "loss": 0.0206, + "step": 30865 + }, + { + "epoch": 3.6601446697497924, + "grad_norm": 0.2601244636094451, + "learning_rate": 9.405747521247316e-07, + "loss": 0.0072, + "step": 30866 + }, + { + "epoch": 3.6602632515119176, + "grad_norm": 0.46054871423506566, + "learning_rate": 9.399226424963492e-07, + "loss": 0.0243, + "step": 30867 + }, + { + "epoch": 3.6603818332740423, + "grad_norm": 0.8279027603106353, + "learning_rate": 9.39270754672375e-07, + "loss": 0.0364, + "step": 30868 + }, + { + "epoch": 3.6605004150361675, + "grad_norm": 0.5529850118711301, + "learning_rate": 9.386190886588208e-07, + "loss": 0.0259, + "step": 30869 + }, + { + "epoch": 3.6606189967982923, + "grad_norm": 0.6074148993748872, + "learning_rate": 9.379676444616902e-07, + "loss": 0.021, + "step": 30870 + }, + { + "epoch": 3.6607375785604175, + "grad_norm": 0.9837107971518343, + "learning_rate": 9.373164220869895e-07, + "loss": 0.0464, + "step": 30871 + }, + { + "epoch": 3.6608561603225422, + "grad_norm": 0.7689524711371938, + "learning_rate": 9.36665421540725e-07, + "loss": 0.0313, + "step": 30872 + }, + { + "epoch": 3.6609747420846674, + "grad_norm": 0.4954528423238879, + "learning_rate": 9.360146428288974e-07, + "loss": 0.0195, + "step": 30873 + }, + { + "epoch": 3.661093323846792, + "grad_norm": 0.41261251581711345, + "learning_rate": 9.353640859575075e-07, + "loss": 0.0225, + "step": 30874 + }, + { + "epoch": 3.6612119056089174, + "grad_norm": 0.5687398028956421, + "learning_rate": 9.347137509325476e-07, + "loss": 0.0251, + "step": 30875 + }, + { + "epoch": 3.6613304873710426, + "grad_norm": 0.3318706722460189, + "learning_rate": 9.340636377600215e-07, + "loss": 0.0155, + "step": 30876 + }, + { + "epoch": 3.6614490691331674, + "grad_norm": 0.4635168721839511, + "learning_rate": 9.334137464459103e-07, + "loss": 0.0223, + "step": 30877 + }, + { + "epoch": 3.661567650895292, + "grad_norm": 0.5319375286027146, + "learning_rate": 9.327640769962148e-07, + "loss": 0.0256, + "step": 30878 + }, + { + "epoch": 3.6616862326574173, + "grad_norm": 0.8446706374402909, + "learning_rate": 9.32114629416922e-07, + "loss": 0.0301, + "step": 30879 + }, + { + "epoch": 3.6618048144195425, + "grad_norm": 0.678952772016791, + "learning_rate": 9.314654037140213e-07, + "loss": 0.0375, + "step": 30880 + }, + { + "epoch": 3.6619233961816673, + "grad_norm": 0.4956575255385961, + "learning_rate": 9.308163998934888e-07, + "loss": 0.0296, + "step": 30881 + }, + { + "epoch": 3.662041977943792, + "grad_norm": 0.6200457047672995, + "learning_rate": 9.301676179613167e-07, + "loss": 0.0343, + "step": 30882 + }, + { + "epoch": 3.6621605597059173, + "grad_norm": 0.6296603630143992, + "learning_rate": 9.295190579234808e-07, + "loss": 0.0241, + "step": 30883 + }, + { + "epoch": 3.6622791414680425, + "grad_norm": 0.3606554379426033, + "learning_rate": 9.288707197859625e-07, + "loss": 0.0167, + "step": 30884 + }, + { + "epoch": 3.662397723230167, + "grad_norm": 0.6357627425965963, + "learning_rate": 9.282226035547432e-07, + "loss": 0.03, + "step": 30885 + }, + { + "epoch": 3.662516304992292, + "grad_norm": 0.911522137551546, + "learning_rate": 9.275747092357845e-07, + "loss": 0.029, + "step": 30886 + }, + { + "epoch": 3.662634886754417, + "grad_norm": 0.5366081079204362, + "learning_rate": 9.269270368350736e-07, + "loss": 0.0271, + "step": 30887 + }, + { + "epoch": 3.6627534685165424, + "grad_norm": 0.7527376037539756, + "learning_rate": 9.262795863585721e-07, + "loss": 0.0422, + "step": 30888 + }, + { + "epoch": 3.662872050278667, + "grad_norm": 0.5631382746496113, + "learning_rate": 9.256323578122505e-07, + "loss": 0.0293, + "step": 30889 + }, + { + "epoch": 3.662990632040792, + "grad_norm": 0.39079528946302183, + "learning_rate": 9.249853512020789e-07, + "loss": 0.0157, + "step": 30890 + }, + { + "epoch": 3.663109213802917, + "grad_norm": 0.40830515756005875, + "learning_rate": 9.243385665340221e-07, + "loss": 0.0172, + "step": 30891 + }, + { + "epoch": 3.6632277955650423, + "grad_norm": 0.4791604822151707, + "learning_rate": 9.236920038140362e-07, + "loss": 0.0171, + "step": 30892 + }, + { + "epoch": 3.663346377327167, + "grad_norm": 0.2798772554359452, + "learning_rate": 9.23045663048086e-07, + "loss": 0.0113, + "step": 30893 + }, + { + "epoch": 3.663464959089292, + "grad_norm": 0.5319736097890487, + "learning_rate": 9.223995442421307e-07, + "loss": 0.0208, + "step": 30894 + }, + { + "epoch": 3.663583540851417, + "grad_norm": 0.5617858124271375, + "learning_rate": 9.217536474021266e-07, + "loss": 0.0265, + "step": 30895 + }, + { + "epoch": 3.6637021226135422, + "grad_norm": 0.9052159999147124, + "learning_rate": 9.211079725340271e-07, + "loss": 0.0541, + "step": 30896 + }, + { + "epoch": 3.663820704375667, + "grad_norm": 0.6292964858703506, + "learning_rate": 9.204625196437805e-07, + "loss": 0.0337, + "step": 30897 + }, + { + "epoch": 3.6639392861377917, + "grad_norm": 0.4911634111467709, + "learning_rate": 9.198172887373458e-07, + "loss": 0.0215, + "step": 30898 + }, + { + "epoch": 3.664057867899917, + "grad_norm": 0.45876141442196444, + "learning_rate": 9.191722798206653e-07, + "loss": 0.0209, + "step": 30899 + }, + { + "epoch": 3.664176449662042, + "grad_norm": 0.6695254840919819, + "learning_rate": 9.185274928996901e-07, + "loss": 0.0337, + "step": 30900 + }, + { + "epoch": 3.664295031424167, + "grad_norm": 0.6018160975907306, + "learning_rate": 9.178829279803513e-07, + "loss": 0.0277, + "step": 30901 + }, + { + "epoch": 3.664413613186292, + "grad_norm": 0.720707392462476, + "learning_rate": 9.17238585068611e-07, + "loss": 0.0274, + "step": 30902 + }, + { + "epoch": 3.664532194948417, + "grad_norm": 0.30084657655885266, + "learning_rate": 9.165944641703922e-07, + "loss": 0.0156, + "step": 30903 + }, + { + "epoch": 3.664650776710542, + "grad_norm": 0.6266154945375715, + "learning_rate": 9.1595056529164e-07, + "loss": 0.0287, + "step": 30904 + }, + { + "epoch": 3.664769358472667, + "grad_norm": 0.3551372583815678, + "learning_rate": 9.153068884382915e-07, + "loss": 0.0177, + "step": 30905 + }, + { + "epoch": 3.664887940234792, + "grad_norm": 0.38422280261800323, + "learning_rate": 9.146634336162779e-07, + "loss": 0.0177, + "step": 30906 + }, + { + "epoch": 3.665006521996917, + "grad_norm": 0.5684370796587013, + "learning_rate": 9.140202008315335e-07, + "loss": 0.0271, + "step": 30907 + }, + { + "epoch": 3.665125103759042, + "grad_norm": 0.4837785106329147, + "learning_rate": 9.13377190089984e-07, + "loss": 0.0227, + "step": 30908 + }, + { + "epoch": 3.6652436855211667, + "grad_norm": 0.64547701515701, + "learning_rate": 9.127344013975636e-07, + "loss": 0.0367, + "step": 30909 + }, + { + "epoch": 3.665362267283292, + "grad_norm": 0.4982831442792032, + "learning_rate": 9.120918347601898e-07, + "loss": 0.0204, + "step": 30910 + }, + { + "epoch": 3.6654808490454167, + "grad_norm": 0.35131415105310154, + "learning_rate": 9.114494901837939e-07, + "loss": 0.0154, + "step": 30911 + }, + { + "epoch": 3.665599430807542, + "grad_norm": 0.6735958878433023, + "learning_rate": 9.108073676742935e-07, + "loss": 0.0357, + "step": 30912 + }, + { + "epoch": 3.6657180125696667, + "grad_norm": 0.707880976949085, + "learning_rate": 9.101654672376087e-07, + "loss": 0.029, + "step": 30913 + }, + { + "epoch": 3.665836594331792, + "grad_norm": 0.5656303357251535, + "learning_rate": 9.095237888796598e-07, + "loss": 0.02, + "step": 30914 + }, + { + "epoch": 3.6659551760939166, + "grad_norm": 0.6805278309362656, + "learning_rate": 9.088823326063534e-07, + "loss": 0.0366, + "step": 30915 + }, + { + "epoch": 3.666073757856042, + "grad_norm": 0.5301975399665498, + "learning_rate": 9.082410984236179e-07, + "loss": 0.0232, + "step": 30916 + }, + { + "epoch": 3.6661923396181666, + "grad_norm": 0.3520676776386184, + "learning_rate": 9.076000863373513e-07, + "loss": 0.0139, + "step": 30917 + }, + { + "epoch": 3.666310921380292, + "grad_norm": 0.5756134272404034, + "learning_rate": 9.069592963534712e-07, + "loss": 0.0223, + "step": 30918 + }, + { + "epoch": 3.6664295031424166, + "grad_norm": 0.6202913243827735, + "learning_rate": 9.063187284778757e-07, + "loss": 0.0332, + "step": 30919 + }, + { + "epoch": 3.6665480849045418, + "grad_norm": 0.6124200768609447, + "learning_rate": 9.056783827164822e-07, + "loss": 0.0273, + "step": 30920 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 0.4013732713576773, + "learning_rate": 9.050382590751833e-07, + "loss": 0.0155, + "step": 30921 + }, + { + "epoch": 3.6667852484287917, + "grad_norm": 0.28996965331275276, + "learning_rate": 9.043983575598852e-07, + "loss": 0.0146, + "step": 30922 + }, + { + "epoch": 3.6669038301909165, + "grad_norm": 0.4433668096754899, + "learning_rate": 9.037586781764862e-07, + "loss": 0.0211, + "step": 30923 + }, + { + "epoch": 3.6670224119530417, + "grad_norm": 0.378209223576835, + "learning_rate": 9.031192209308842e-07, + "loss": 0.0206, + "step": 30924 + }, + { + "epoch": 3.6671409937151664, + "grad_norm": 0.6405776361755516, + "learning_rate": 9.024799858289745e-07, + "loss": 0.0305, + "step": 30925 + }, + { + "epoch": 3.6672595754772916, + "grad_norm": 0.5585057707742388, + "learning_rate": 9.01840972876647e-07, + "loss": 0.0387, + "step": 30926 + }, + { + "epoch": 3.6673781572394164, + "grad_norm": 0.7158009707660422, + "learning_rate": 9.012021820797967e-07, + "loss": 0.0297, + "step": 30927 + }, + { + "epoch": 3.6674967390015416, + "grad_norm": 0.5173971208680662, + "learning_rate": 9.00563613444308e-07, + "loss": 0.0306, + "step": 30928 + }, + { + "epoch": 3.667615320763667, + "grad_norm": 0.856785639817149, + "learning_rate": 8.999252669760732e-07, + "loss": 0.0298, + "step": 30929 + }, + { + "epoch": 3.6677339025257916, + "grad_norm": 0.47486922404220744, + "learning_rate": 8.992871426809685e-07, + "loss": 0.0237, + "step": 30930 + }, + { + "epoch": 3.6678524842879163, + "grad_norm": 0.5124540675880215, + "learning_rate": 8.986492405648889e-07, + "loss": 0.0179, + "step": 30931 + }, + { + "epoch": 3.6679710660500415, + "grad_norm": 0.358752551444331, + "learning_rate": 8.980115606337048e-07, + "loss": 0.0121, + "step": 30932 + }, + { + "epoch": 3.6680896478121667, + "grad_norm": 0.6745591900432307, + "learning_rate": 8.973741028932975e-07, + "loss": 0.0334, + "step": 30933 + }, + { + "epoch": 3.6682082295742915, + "grad_norm": 0.34787153083001304, + "learning_rate": 8.96736867349543e-07, + "loss": 0.0156, + "step": 30934 + }, + { + "epoch": 3.6683268113364162, + "grad_norm": 0.693472835989991, + "learning_rate": 8.960998540083199e-07, + "loss": 0.0385, + "step": 30935 + }, + { + "epoch": 3.6684453930985415, + "grad_norm": 0.5877550644877902, + "learning_rate": 8.954630628755012e-07, + "loss": 0.031, + "step": 30936 + }, + { + "epoch": 3.6685639748606667, + "grad_norm": 0.20644002232128195, + "learning_rate": 8.94826493956949e-07, + "loss": 0.0066, + "step": 30937 + }, + { + "epoch": 3.6686825566227914, + "grad_norm": 0.35438532797739863, + "learning_rate": 8.94190147258539e-07, + "loss": 0.0113, + "step": 30938 + }, + { + "epoch": 3.668801138384916, + "grad_norm": 0.3876222792181888, + "learning_rate": 8.935540227861333e-07, + "loss": 0.0177, + "step": 30939 + }, + { + "epoch": 3.6689197201470414, + "grad_norm": 0.742175586662613, + "learning_rate": 8.929181205456022e-07, + "loss": 0.034, + "step": 30940 + }, + { + "epoch": 3.6690383019091666, + "grad_norm": 0.4740533958900053, + "learning_rate": 8.922824405428021e-07, + "loss": 0.0244, + "step": 30941 + }, + { + "epoch": 3.6691568836712913, + "grad_norm": 0.39203447927110163, + "learning_rate": 8.916469827835921e-07, + "loss": 0.0161, + "step": 30942 + }, + { + "epoch": 3.669275465433416, + "grad_norm": 0.5081296531294641, + "learning_rate": 8.910117472738371e-07, + "loss": 0.0205, + "step": 30943 + }, + { + "epoch": 3.6693940471955413, + "grad_norm": 0.4690022756233995, + "learning_rate": 8.903767340193853e-07, + "loss": 0.0186, + "step": 30944 + }, + { + "epoch": 3.6695126289576665, + "grad_norm": 0.9357953162376734, + "learning_rate": 8.897419430260984e-07, + "loss": 0.0394, + "step": 30945 + }, + { + "epoch": 3.6696312107197913, + "grad_norm": 0.47406042166553075, + "learning_rate": 8.891073742998219e-07, + "loss": 0.0179, + "step": 30946 + }, + { + "epoch": 3.669749792481916, + "grad_norm": 0.6840476500366959, + "learning_rate": 8.884730278464149e-07, + "loss": 0.0303, + "step": 30947 + }, + { + "epoch": 3.669868374244041, + "grad_norm": 0.4367723782072395, + "learning_rate": 8.878389036717144e-07, + "loss": 0.0185, + "step": 30948 + }, + { + "epoch": 3.6699869560061664, + "grad_norm": 0.2528627890750506, + "learning_rate": 8.872050017815686e-07, + "loss": 0.0114, + "step": 30949 + }, + { + "epoch": 3.670105537768291, + "grad_norm": 0.43238102603355677, + "learning_rate": 8.865713221818284e-07, + "loss": 0.0188, + "step": 30950 + }, + { + "epoch": 3.6702241195304164, + "grad_norm": 0.677407653991715, + "learning_rate": 8.859378648783306e-07, + "loss": 0.045, + "step": 30951 + }, + { + "epoch": 3.670342701292541, + "grad_norm": 0.8367211666094263, + "learning_rate": 8.853046298769124e-07, + "loss": 0.027, + "step": 30952 + }, + { + "epoch": 3.6704612830546663, + "grad_norm": 0.4553437450861238, + "learning_rate": 8.846716171834135e-07, + "loss": 0.0323, + "step": 30953 + }, + { + "epoch": 3.670579864816791, + "grad_norm": 0.32630348618218813, + "learning_rate": 8.840388268036709e-07, + "loss": 0.0125, + "step": 30954 + }, + { + "epoch": 3.6706984465789163, + "grad_norm": 0.5689989681857858, + "learning_rate": 8.834062587435188e-07, + "loss": 0.0315, + "step": 30955 + }, + { + "epoch": 3.670817028341041, + "grad_norm": 0.5400152610659043, + "learning_rate": 8.827739130087887e-07, + "loss": 0.0248, + "step": 30956 + }, + { + "epoch": 3.6709356101031663, + "grad_norm": 0.6058638383539696, + "learning_rate": 8.821417896053036e-07, + "loss": 0.031, + "step": 30957 + }, + { + "epoch": 3.671054191865291, + "grad_norm": 0.4542127264684431, + "learning_rate": 8.815098885389006e-07, + "loss": 0.0168, + "step": 30958 + }, + { + "epoch": 3.6711727736274162, + "grad_norm": 0.4495515802525583, + "learning_rate": 8.808782098153973e-07, + "loss": 0.0207, + "step": 30959 + }, + { + "epoch": 3.671291355389541, + "grad_norm": 0.4688621800426598, + "learning_rate": 8.802467534406222e-07, + "loss": 0.0278, + "step": 30960 + }, + { + "epoch": 3.671409937151666, + "grad_norm": 0.7501002129598162, + "learning_rate": 8.796155194203931e-07, + "loss": 0.0248, + "step": 30961 + }, + { + "epoch": 3.671528518913791, + "grad_norm": 0.4199566565800323, + "learning_rate": 8.789845077605302e-07, + "loss": 0.0206, + "step": 30962 + }, + { + "epoch": 3.671647100675916, + "grad_norm": 0.5892107545976208, + "learning_rate": 8.783537184668539e-07, + "loss": 0.0338, + "step": 30963 + }, + { + "epoch": 3.671765682438041, + "grad_norm": 0.47722877849869033, + "learning_rate": 8.777231515451706e-07, + "loss": 0.0265, + "step": 30964 + }, + { + "epoch": 3.671884264200166, + "grad_norm": 0.660480573001666, + "learning_rate": 8.770928070013062e-07, + "loss": 0.0349, + "step": 30965 + }, + { + "epoch": 3.672002845962291, + "grad_norm": 0.7151338539339837, + "learning_rate": 8.76462684841059e-07, + "loss": 0.0399, + "step": 30966 + }, + { + "epoch": 3.672121427724416, + "grad_norm": 0.37616926791604455, + "learning_rate": 8.758327850702492e-07, + "loss": 0.0192, + "step": 30967 + }, + { + "epoch": 3.672240009486541, + "grad_norm": 0.7227406541561577, + "learning_rate": 8.752031076946721e-07, + "loss": 0.035, + "step": 30968 + }, + { + "epoch": 3.672358591248666, + "grad_norm": 0.37029970446330834, + "learning_rate": 8.745736527201425e-07, + "loss": 0.0181, + "step": 30969 + }, + { + "epoch": 3.672477173010791, + "grad_norm": 0.68623730173007, + "learning_rate": 8.739444201524588e-07, + "loss": 0.0383, + "step": 30970 + }, + { + "epoch": 3.672595754772916, + "grad_norm": 0.8037575385402159, + "learning_rate": 8.733154099974189e-07, + "loss": 0.0515, + "step": 30971 + }, + { + "epoch": 3.6727143365350408, + "grad_norm": 0.289093489947977, + "learning_rate": 8.726866222608293e-07, + "loss": 0.0163, + "step": 30972 + }, + { + "epoch": 3.672832918297166, + "grad_norm": 0.7216050303132983, + "learning_rate": 8.720580569484771e-07, + "loss": 0.0257, + "step": 30973 + }, + { + "epoch": 3.6729515000592907, + "grad_norm": 0.37070657417375275, + "learning_rate": 8.714297140661687e-07, + "loss": 0.0144, + "step": 30974 + }, + { + "epoch": 3.673070081821416, + "grad_norm": 0.5348702616097799, + "learning_rate": 8.708015936196828e-07, + "loss": 0.0264, + "step": 30975 + }, + { + "epoch": 3.6731886635835407, + "grad_norm": 0.5146877966107607, + "learning_rate": 8.701736956148232e-07, + "loss": 0.0242, + "step": 30976 + }, + { + "epoch": 3.673307245345666, + "grad_norm": 0.6441353332639412, + "learning_rate": 8.695460200573686e-07, + "loss": 0.0247, + "step": 30977 + }, + { + "epoch": 3.673425827107791, + "grad_norm": 0.5522964719391228, + "learning_rate": 8.689185669531141e-07, + "loss": 0.0248, + "step": 30978 + }, + { + "epoch": 3.673544408869916, + "grad_norm": 0.5050576232727226, + "learning_rate": 8.682913363078304e-07, + "loss": 0.026, + "step": 30979 + }, + { + "epoch": 3.6736629906320406, + "grad_norm": 0.41379690370258576, + "learning_rate": 8.676643281273156e-07, + "loss": 0.0235, + "step": 30980 + }, + { + "epoch": 3.673781572394166, + "grad_norm": 0.6642363858363932, + "learning_rate": 8.6703754241734e-07, + "loss": 0.0335, + "step": 30981 + }, + { + "epoch": 3.673900154156291, + "grad_norm": 0.624224495580249, + "learning_rate": 8.664109791836822e-07, + "loss": 0.0283, + "step": 30982 + }, + { + "epoch": 3.6740187359184158, + "grad_norm": 0.6861443067786523, + "learning_rate": 8.657846384321239e-07, + "loss": 0.0269, + "step": 30983 + }, + { + "epoch": 3.6741373176805405, + "grad_norm": 0.5040350256554098, + "learning_rate": 8.651585201684326e-07, + "loss": 0.0184, + "step": 30984 + }, + { + "epoch": 3.6742558994426657, + "grad_norm": 0.5416621623185655, + "learning_rate": 8.64532624398387e-07, + "loss": 0.0308, + "step": 30985 + }, + { + "epoch": 3.674374481204791, + "grad_norm": 0.43274522347537525, + "learning_rate": 8.63906951127752e-07, + "loss": 0.0249, + "step": 30986 + }, + { + "epoch": 3.6744930629669157, + "grad_norm": 0.4017598993173844, + "learning_rate": 8.632815003622979e-07, + "loss": 0.0228, + "step": 30987 + }, + { + "epoch": 3.6746116447290404, + "grad_norm": 0.5308564369162935, + "learning_rate": 8.626562721077896e-07, + "loss": 0.0239, + "step": 30988 + }, + { + "epoch": 3.6747302264911657, + "grad_norm": 0.29768047631860317, + "learning_rate": 8.620312663699948e-07, + "loss": 0.0111, + "step": 30989 + }, + { + "epoch": 3.674848808253291, + "grad_norm": 0.6228262338974714, + "learning_rate": 8.614064831546642e-07, + "loss": 0.0199, + "step": 30990 + }, + { + "epoch": 3.6749673900154156, + "grad_norm": 0.7120921228219405, + "learning_rate": 8.607819224675711e-07, + "loss": 0.0372, + "step": 30991 + }, + { + "epoch": 3.6750859717775404, + "grad_norm": 0.48130264006553514, + "learning_rate": 8.601575843144666e-07, + "loss": 0.0235, + "step": 30992 + }, + { + "epoch": 3.6752045535396656, + "grad_norm": 0.6717827607697575, + "learning_rate": 8.595334687011042e-07, + "loss": 0.0385, + "step": 30993 + }, + { + "epoch": 3.6753231353017908, + "grad_norm": 0.629659991013707, + "learning_rate": 8.589095756332433e-07, + "loss": 0.0317, + "step": 30994 + }, + { + "epoch": 3.6754417170639155, + "grad_norm": 0.5517202716537072, + "learning_rate": 8.582859051166292e-07, + "loss": 0.0285, + "step": 30995 + }, + { + "epoch": 3.6755602988260403, + "grad_norm": 0.42879857322283244, + "learning_rate": 8.576624571570186e-07, + "loss": 0.0132, + "step": 30996 + }, + { + "epoch": 3.6756788805881655, + "grad_norm": 0.4767837266269312, + "learning_rate": 8.570392317601511e-07, + "loss": 0.0194, + "step": 30997 + }, + { + "epoch": 3.6757974623502907, + "grad_norm": 0.36000646397022507, + "learning_rate": 8.564162289317778e-07, + "loss": 0.0182, + "step": 30998 + }, + { + "epoch": 3.6759160441124155, + "grad_norm": 0.4381234092106162, + "learning_rate": 8.557934486776386e-07, + "loss": 0.0236, + "step": 30999 + }, + { + "epoch": 3.67603462587454, + "grad_norm": 0.42100517512705765, + "learning_rate": 8.551708910034789e-07, + "loss": 0.0203, + "step": 31000 + }, + { + "epoch": 3.6761532076366654, + "grad_norm": 0.5065920547684708, + "learning_rate": 8.545485559150301e-07, + "loss": 0.023, + "step": 31001 + }, + { + "epoch": 3.6762717893987906, + "grad_norm": 0.5791383190331865, + "learning_rate": 8.539264434180405e-07, + "loss": 0.0202, + "step": 31002 + }, + { + "epoch": 3.6763903711609154, + "grad_norm": 0.49688491655053607, + "learning_rate": 8.53304553518236e-07, + "loss": 0.025, + "step": 31003 + }, + { + "epoch": 3.6765089529230406, + "grad_norm": 0.5689661727122218, + "learning_rate": 8.526828862213537e-07, + "loss": 0.0289, + "step": 31004 + }, + { + "epoch": 3.6766275346851653, + "grad_norm": 0.5247177986549756, + "learning_rate": 8.520614415331224e-07, + "loss": 0.0192, + "step": 31005 + }, + { + "epoch": 3.6767461164472905, + "grad_norm": 0.6161939806146902, + "learning_rate": 8.514402194592736e-07, + "loss": 0.036, + "step": 31006 + }, + { + "epoch": 3.6768646982094153, + "grad_norm": 0.5636870147778875, + "learning_rate": 8.508192200055359e-07, + "loss": 0.0266, + "step": 31007 + }, + { + "epoch": 3.6769832799715405, + "grad_norm": 0.5159608521991481, + "learning_rate": 8.501984431776272e-07, + "loss": 0.0215, + "step": 31008 + }, + { + "epoch": 3.6771018617336653, + "grad_norm": 0.28251324617087775, + "learning_rate": 8.49577888981276e-07, + "loss": 0.0106, + "step": 31009 + }, + { + "epoch": 3.6772204434957905, + "grad_norm": 0.5984082714944161, + "learning_rate": 8.489575574222003e-07, + "loss": 0.0257, + "step": 31010 + }, + { + "epoch": 3.6773390252579152, + "grad_norm": 0.4912838141915146, + "learning_rate": 8.483374485061229e-07, + "loss": 0.0152, + "step": 31011 + }, + { + "epoch": 3.6774576070200404, + "grad_norm": 0.8073189789271624, + "learning_rate": 8.477175622387562e-07, + "loss": 0.0326, + "step": 31012 + }, + { + "epoch": 3.677576188782165, + "grad_norm": 0.45744195900806006, + "learning_rate": 8.47097898625815e-07, + "loss": 0.0207, + "step": 31013 + }, + { + "epoch": 3.6776947705442904, + "grad_norm": 0.31216457758909527, + "learning_rate": 8.464784576730111e-07, + "loss": 0.0123, + "step": 31014 + }, + { + "epoch": 3.677813352306415, + "grad_norm": 0.4312776502407389, + "learning_rate": 8.458592393860598e-07, + "loss": 0.0223, + "step": 31015 + }, + { + "epoch": 3.6779319340685404, + "grad_norm": 1.0047102353464887, + "learning_rate": 8.452402437706647e-07, + "loss": 0.0413, + "step": 31016 + }, + { + "epoch": 3.678050515830665, + "grad_norm": 0.5907929899243941, + "learning_rate": 8.446214708325351e-07, + "loss": 0.02, + "step": 31017 + }, + { + "epoch": 3.6781690975927903, + "grad_norm": 0.7895947904983134, + "learning_rate": 8.440029205773747e-07, + "loss": 0.0338, + "step": 31018 + }, + { + "epoch": 3.678287679354915, + "grad_norm": 0.5993377225567401, + "learning_rate": 8.433845930108846e-07, + "loss": 0.0307, + "step": 31019 + }, + { + "epoch": 3.6784062611170403, + "grad_norm": 0.6315315304680627, + "learning_rate": 8.427664881387659e-07, + "loss": 0.0449, + "step": 31020 + }, + { + "epoch": 3.678524842879165, + "grad_norm": 0.631612247490349, + "learning_rate": 8.421486059667167e-07, + "loss": 0.0292, + "step": 31021 + }, + { + "epoch": 3.6786434246412902, + "grad_norm": 0.618848082194987, + "learning_rate": 8.415309465004323e-07, + "loss": 0.0281, + "step": 31022 + }, + { + "epoch": 3.678762006403415, + "grad_norm": 0.4547792016610691, + "learning_rate": 8.409135097456111e-07, + "loss": 0.0229, + "step": 31023 + }, + { + "epoch": 3.67888058816554, + "grad_norm": 0.6539784267942285, + "learning_rate": 8.402962957079347e-07, + "loss": 0.0353, + "step": 31024 + }, + { + "epoch": 3.678999169927665, + "grad_norm": 0.5724732076765425, + "learning_rate": 8.396793043931067e-07, + "loss": 0.0273, + "step": 31025 + }, + { + "epoch": 3.67911775168979, + "grad_norm": 0.8930016626888366, + "learning_rate": 8.390625358068033e-07, + "loss": 0.0488, + "step": 31026 + }, + { + "epoch": 3.6792363334519154, + "grad_norm": 0.34802632108334136, + "learning_rate": 8.384459899547198e-07, + "loss": 0.0117, + "step": 31027 + }, + { + "epoch": 3.67935491521404, + "grad_norm": 0.30916512695883025, + "learning_rate": 8.378296668425295e-07, + "loss": 0.0128, + "step": 31028 + }, + { + "epoch": 3.679473496976165, + "grad_norm": 0.3902179935393511, + "learning_rate": 8.372135664759279e-07, + "loss": 0.0139, + "step": 31029 + }, + { + "epoch": 3.67959207873829, + "grad_norm": 0.8029867940838362, + "learning_rate": 8.365976888605798e-07, + "loss": 0.0427, + "step": 31030 + }, + { + "epoch": 3.6797106605004153, + "grad_norm": 0.7202261244116787, + "learning_rate": 8.359820340021724e-07, + "loss": 0.0284, + "step": 31031 + }, + { + "epoch": 3.67982924226254, + "grad_norm": 0.592786095646171, + "learning_rate": 8.353666019063789e-07, + "loss": 0.0279, + "step": 31032 + }, + { + "epoch": 3.679947824024665, + "grad_norm": 0.35101995542616554, + "learning_rate": 8.347513925788725e-07, + "loss": 0.0112, + "step": 31033 + }, + { + "epoch": 3.68006640578679, + "grad_norm": 0.2999752415260466, + "learning_rate": 8.341364060253265e-07, + "loss": 0.0109, + "step": 31034 + }, + { + "epoch": 3.680184987548915, + "grad_norm": 0.6154157561690549, + "learning_rate": 8.335216422514031e-07, + "loss": 0.026, + "step": 31035 + }, + { + "epoch": 3.68030356931104, + "grad_norm": 0.6331598337235191, + "learning_rate": 8.329071012627837e-07, + "loss": 0.0236, + "step": 31036 + }, + { + "epoch": 3.6804221510731647, + "grad_norm": 0.6333238992811661, + "learning_rate": 8.322927830651195e-07, + "loss": 0.0191, + "step": 31037 + }, + { + "epoch": 3.68054073283529, + "grad_norm": 0.8261202156433725, + "learning_rate": 8.316786876640837e-07, + "loss": 0.0335, + "step": 31038 + }, + { + "epoch": 3.680659314597415, + "grad_norm": 0.665378765508391, + "learning_rate": 8.310648150653272e-07, + "loss": 0.0238, + "step": 31039 + }, + { + "epoch": 3.68077789635954, + "grad_norm": 0.5881760994153129, + "learning_rate": 8.304511652745206e-07, + "loss": 0.0227, + "step": 31040 + }, + { + "epoch": 3.6808964781216647, + "grad_norm": 0.47219267218960254, + "learning_rate": 8.298377382973121e-07, + "loss": 0.0204, + "step": 31041 + }, + { + "epoch": 3.68101505988379, + "grad_norm": 0.551064278065902, + "learning_rate": 8.292245341393612e-07, + "loss": 0.0248, + "step": 31042 + }, + { + "epoch": 3.681133641645915, + "grad_norm": 0.7203451733942626, + "learning_rate": 8.286115528063188e-07, + "loss": 0.0363, + "step": 31043 + }, + { + "epoch": 3.68125222340804, + "grad_norm": 0.40003720799194376, + "learning_rate": 8.279987943038387e-07, + "loss": 0.0216, + "step": 31044 + }, + { + "epoch": 3.6813708051701646, + "grad_norm": 0.48121897748447956, + "learning_rate": 8.273862586375691e-07, + "loss": 0.0204, + "step": 31045 + }, + { + "epoch": 3.6814893869322898, + "grad_norm": 0.39263378663840787, + "learning_rate": 8.267739458131501e-07, + "loss": 0.0169, + "step": 31046 + }, + { + "epoch": 3.681607968694415, + "grad_norm": 0.6749475826533218, + "learning_rate": 8.26161855836241e-07, + "loss": 0.0463, + "step": 31047 + }, + { + "epoch": 3.6817265504565397, + "grad_norm": 0.8096647888596742, + "learning_rate": 8.255499887124707e-07, + "loss": 0.0419, + "step": 31048 + }, + { + "epoch": 3.6818451322186645, + "grad_norm": 0.5794217958417276, + "learning_rate": 8.249383444474873e-07, + "loss": 0.0216, + "step": 31049 + }, + { + "epoch": 3.6819637139807897, + "grad_norm": 0.6992044479008122, + "learning_rate": 8.243269230469197e-07, + "loss": 0.0233, + "step": 31050 + }, + { + "epoch": 3.682082295742915, + "grad_norm": 0.6003891476074346, + "learning_rate": 8.23715724516419e-07, + "loss": 0.0239, + "step": 31051 + }, + { + "epoch": 3.6822008775050397, + "grad_norm": 0.4155876329533847, + "learning_rate": 8.231047488616112e-07, + "loss": 0.0198, + "step": 31052 + }, + { + "epoch": 3.682319459267165, + "grad_norm": 0.699387247369998, + "learning_rate": 8.224939960881278e-07, + "loss": 0.0323, + "step": 31053 + }, + { + "epoch": 3.6824380410292896, + "grad_norm": 0.7449455336270528, + "learning_rate": 8.218834662016033e-07, + "loss": 0.0434, + "step": 31054 + }, + { + "epoch": 3.682556622791415, + "grad_norm": 0.6183202959480373, + "learning_rate": 8.212731592076612e-07, + "loss": 0.0176, + "step": 31055 + }, + { + "epoch": 3.6826752045535396, + "grad_norm": 0.7770295856396513, + "learning_rate": 8.206630751119354e-07, + "loss": 0.0334, + "step": 31056 + }, + { + "epoch": 3.682793786315665, + "grad_norm": 0.554456146476468, + "learning_rate": 8.20053213920044e-07, + "loss": 0.0322, + "step": 31057 + }, + { + "epoch": 3.6829123680777895, + "grad_norm": 0.7562713711584284, + "learning_rate": 8.194435756376074e-07, + "loss": 0.0316, + "step": 31058 + }, + { + "epoch": 3.6830309498399147, + "grad_norm": 0.5959293332778908, + "learning_rate": 8.188341602702515e-07, + "loss": 0.0327, + "step": 31059 + }, + { + "epoch": 3.6831495316020395, + "grad_norm": 0.42422435727679964, + "learning_rate": 8.182249678235915e-07, + "loss": 0.0208, + "step": 31060 + }, + { + "epoch": 3.6832681133641647, + "grad_norm": 0.6170293774724901, + "learning_rate": 8.176159983032422e-07, + "loss": 0.0294, + "step": 31061 + }, + { + "epoch": 3.6833866951262895, + "grad_norm": 0.6533088547282592, + "learning_rate": 8.170072517148214e-07, + "loss": 0.0242, + "step": 31062 + }, + { + "epoch": 3.6835052768884147, + "grad_norm": 0.7141298163907617, + "learning_rate": 8.163987280639357e-07, + "loss": 0.0375, + "step": 31063 + }, + { + "epoch": 3.6836238586505394, + "grad_norm": 0.3136099054609796, + "learning_rate": 8.157904273562e-07, + "loss": 0.0155, + "step": 31064 + }, + { + "epoch": 3.6837424404126646, + "grad_norm": 0.3599092439151212, + "learning_rate": 8.151823495972183e-07, + "loss": 0.0185, + "step": 31065 + }, + { + "epoch": 3.6838610221747894, + "grad_norm": 0.31600163797290093, + "learning_rate": 8.14574494792597e-07, + "loss": 0.0142, + "step": 31066 + }, + { + "epoch": 3.6839796039369146, + "grad_norm": 0.3418522333390079, + "learning_rate": 8.139668629479457e-07, + "loss": 0.0161, + "step": 31067 + }, + { + "epoch": 3.6840981856990394, + "grad_norm": 0.7509296018734232, + "learning_rate": 8.133594540688571e-07, + "loss": 0.0288, + "step": 31068 + }, + { + "epoch": 3.6842167674611646, + "grad_norm": 0.6091425455009665, + "learning_rate": 8.12752268160935e-07, + "loss": 0.0293, + "step": 31069 + }, + { + "epoch": 3.6843353492232893, + "grad_norm": 0.6797997177598647, + "learning_rate": 8.121453052297778e-07, + "loss": 0.0341, + "step": 31070 + }, + { + "epoch": 3.6844539309854145, + "grad_norm": 0.9569669631265709, + "learning_rate": 8.115385652809782e-07, + "loss": 0.0269, + "step": 31071 + }, + { + "epoch": 3.6845725127475393, + "grad_norm": 0.330402364399142, + "learning_rate": 8.109320483201343e-07, + "loss": 0.0124, + "step": 31072 + }, + { + "epoch": 3.6846910945096645, + "grad_norm": 0.7375289239634463, + "learning_rate": 8.103257543528281e-07, + "loss": 0.0438, + "step": 31073 + }, + { + "epoch": 3.6848096762717892, + "grad_norm": 0.4513635765035643, + "learning_rate": 8.097196833846632e-07, + "loss": 0.0211, + "step": 31074 + }, + { + "epoch": 3.6849282580339144, + "grad_norm": 0.47530939405521194, + "learning_rate": 8.09113835421213e-07, + "loss": 0.0198, + "step": 31075 + }, + { + "epoch": 3.6850468397960396, + "grad_norm": 0.6236374306539888, + "learning_rate": 8.085082104680702e-07, + "loss": 0.0427, + "step": 31076 + }, + { + "epoch": 3.6851654215581644, + "grad_norm": 0.4122361214565143, + "learning_rate": 8.079028085308165e-07, + "loss": 0.0207, + "step": 31077 + }, + { + "epoch": 3.685284003320289, + "grad_norm": 0.7111291912863849, + "learning_rate": 8.072976296150337e-07, + "loss": 0.0388, + "step": 31078 + }, + { + "epoch": 3.6854025850824144, + "grad_norm": 0.7298112645791046, + "learning_rate": 8.066926737262975e-07, + "loss": 0.0329, + "step": 31079 + }, + { + "epoch": 3.6855211668445396, + "grad_norm": 0.3843943825015927, + "learning_rate": 8.060879408701871e-07, + "loss": 0.0129, + "step": 31080 + }, + { + "epoch": 3.6856397486066643, + "grad_norm": 0.3351049750289751, + "learning_rate": 8.054834310522785e-07, + "loss": 0.0187, + "step": 31081 + }, + { + "epoch": 3.685758330368789, + "grad_norm": 0.5189038476423049, + "learning_rate": 8.048791442781423e-07, + "loss": 0.0251, + "step": 31082 + }, + { + "epoch": 3.6858769121309143, + "grad_norm": 0.5006967971833182, + "learning_rate": 8.042750805533517e-07, + "loss": 0.0242, + "step": 31083 + }, + { + "epoch": 3.6859954938930395, + "grad_norm": 0.5563056094132002, + "learning_rate": 8.036712398834717e-07, + "loss": 0.029, + "step": 31084 + }, + { + "epoch": 3.6861140756551642, + "grad_norm": 0.4974943751428712, + "learning_rate": 8.030676222740757e-07, + "loss": 0.0243, + "step": 31085 + }, + { + "epoch": 3.686232657417289, + "grad_norm": 0.4499895482032899, + "learning_rate": 8.024642277307204e-07, + "loss": 0.0202, + "step": 31086 + }, + { + "epoch": 3.686351239179414, + "grad_norm": 0.9161867630380179, + "learning_rate": 8.018610562589707e-07, + "loss": 0.028, + "step": 31087 + }, + { + "epoch": 3.6864698209415394, + "grad_norm": 0.5329193168078035, + "learning_rate": 8.012581078643915e-07, + "loss": 0.0234, + "step": 31088 + }, + { + "epoch": 3.686588402703664, + "grad_norm": 0.38640788504729523, + "learning_rate": 8.006553825525398e-07, + "loss": 0.0161, + "step": 31089 + }, + { + "epoch": 3.686706984465789, + "grad_norm": 0.7101024351683864, + "learning_rate": 8.000528803289665e-07, + "loss": 0.0314, + "step": 31090 + }, + { + "epoch": 3.686825566227914, + "grad_norm": 0.6788177666227772, + "learning_rate": 7.994506011992309e-07, + "loss": 0.0223, + "step": 31091 + }, + { + "epoch": 3.6869441479900393, + "grad_norm": 0.38112600516643547, + "learning_rate": 7.988485451688815e-07, + "loss": 0.0152, + "step": 31092 + }, + { + "epoch": 3.687062729752164, + "grad_norm": 0.37316223796857434, + "learning_rate": 7.98246712243475e-07, + "loss": 0.0159, + "step": 31093 + }, + { + "epoch": 3.687181311514289, + "grad_norm": 0.5866800494143181, + "learning_rate": 7.97645102428557e-07, + "loss": 0.0282, + "step": 31094 + }, + { + "epoch": 3.687299893276414, + "grad_norm": 0.5039922277197196, + "learning_rate": 7.970437157296645e-07, + "loss": 0.0279, + "step": 31095 + }, + { + "epoch": 3.6874184750385393, + "grad_norm": 0.5038896425097493, + "learning_rate": 7.964425521523572e-07, + "loss": 0.0195, + "step": 31096 + }, + { + "epoch": 3.687537056800664, + "grad_norm": 1.0636196785676488, + "learning_rate": 7.958416117021666e-07, + "loss": 0.038, + "step": 31097 + }, + { + "epoch": 3.6876556385627888, + "grad_norm": 0.3432839669125862, + "learning_rate": 7.952408943846357e-07, + "loss": 0.0109, + "step": 31098 + }, + { + "epoch": 3.687774220324914, + "grad_norm": 0.39178069349332206, + "learning_rate": 7.946404002052988e-07, + "loss": 0.0141, + "step": 31099 + }, + { + "epoch": 3.687892802087039, + "grad_norm": 0.746880688328633, + "learning_rate": 7.940401291696986e-07, + "loss": 0.0301, + "step": 31100 + }, + { + "epoch": 3.688011383849164, + "grad_norm": 0.4903706007477691, + "learning_rate": 7.934400812833642e-07, + "loss": 0.02, + "step": 31101 + }, + { + "epoch": 3.688129965611289, + "grad_norm": 0.6196196371217465, + "learning_rate": 7.928402565518272e-07, + "loss": 0.0327, + "step": 31102 + }, + { + "epoch": 3.688248547373414, + "grad_norm": 0.4651792272124165, + "learning_rate": 7.922406549806166e-07, + "loss": 0.0181, + "step": 31103 + }, + { + "epoch": 3.688367129135539, + "grad_norm": 0.569524432879516, + "learning_rate": 7.91641276575264e-07, + "loss": 0.0261, + "step": 31104 + }, + { + "epoch": 3.688485710897664, + "grad_norm": 0.4445231348498981, + "learning_rate": 7.910421213412956e-07, + "loss": 0.018, + "step": 31105 + }, + { + "epoch": 3.688604292659789, + "grad_norm": 0.3984083986516453, + "learning_rate": 7.904431892842262e-07, + "loss": 0.0202, + "step": 31106 + }, + { + "epoch": 3.688722874421914, + "grad_norm": 0.618962955225649, + "learning_rate": 7.89844480409585e-07, + "loss": 0.0311, + "step": 31107 + }, + { + "epoch": 3.688841456184039, + "grad_norm": 0.7136373396152241, + "learning_rate": 7.892459947228898e-07, + "loss": 0.026, + "step": 31108 + }, + { + "epoch": 3.688960037946164, + "grad_norm": 0.6102862763583974, + "learning_rate": 7.886477322296609e-07, + "loss": 0.0258, + "step": 31109 + }, + { + "epoch": 3.689078619708289, + "grad_norm": 0.7410367838782559, + "learning_rate": 7.880496929354026e-07, + "loss": 0.0538, + "step": 31110 + }, + { + "epoch": 3.6891972014704137, + "grad_norm": 0.5970455275311776, + "learning_rate": 7.874518768456407e-07, + "loss": 0.0262, + "step": 31111 + }, + { + "epoch": 3.689315783232539, + "grad_norm": 0.3101430918328745, + "learning_rate": 7.868542839658793e-07, + "loss": 0.0157, + "step": 31112 + }, + { + "epoch": 3.6894343649946637, + "grad_norm": 0.9408374288569628, + "learning_rate": 7.86256914301628e-07, + "loss": 0.0434, + "step": 31113 + }, + { + "epoch": 3.689552946756789, + "grad_norm": 0.5612452136919589, + "learning_rate": 7.85659767858396e-07, + "loss": 0.0207, + "step": 31114 + }, + { + "epoch": 3.6896715285189137, + "grad_norm": 0.4392178170506111, + "learning_rate": 7.850628446416874e-07, + "loss": 0.0171, + "step": 31115 + }, + { + "epoch": 3.689790110281039, + "grad_norm": 0.4122150982475814, + "learning_rate": 7.844661446570089e-07, + "loss": 0.0186, + "step": 31116 + }, + { + "epoch": 3.6899086920431636, + "grad_norm": 0.8078878929814375, + "learning_rate": 7.838696679098506e-07, + "loss": 0.0364, + "step": 31117 + }, + { + "epoch": 3.690027273805289, + "grad_norm": 0.5118488919268804, + "learning_rate": 7.832734144057246e-07, + "loss": 0.0256, + "step": 31118 + }, + { + "epoch": 3.6901458555674136, + "grad_norm": 1.0225946092604805, + "learning_rate": 7.826773841501183e-07, + "loss": 0.0321, + "step": 31119 + }, + { + "epoch": 3.690264437329539, + "grad_norm": 0.8165477889522688, + "learning_rate": 7.820815771485274e-07, + "loss": 0.0554, + "step": 31120 + }, + { + "epoch": 3.6903830190916636, + "grad_norm": 0.6329464680879681, + "learning_rate": 7.814859934064472e-07, + "loss": 0.0269, + "step": 31121 + }, + { + "epoch": 3.6905016008537888, + "grad_norm": 0.49148743744451706, + "learning_rate": 7.808906329293681e-07, + "loss": 0.0226, + "step": 31122 + }, + { + "epoch": 3.6906201826159135, + "grad_norm": 0.5143997966978379, + "learning_rate": 7.802954957227826e-07, + "loss": 0.0213, + "step": 31123 + }, + { + "epoch": 3.6907387643780387, + "grad_norm": 0.4192954592418635, + "learning_rate": 7.797005817921671e-07, + "loss": 0.0198, + "step": 31124 + }, + { + "epoch": 3.690857346140164, + "grad_norm": 0.5935368138415941, + "learning_rate": 7.791058911430116e-07, + "loss": 0.0309, + "step": 31125 + }, + { + "epoch": 3.6909759279022887, + "grad_norm": 1.090273988736509, + "learning_rate": 7.785114237807978e-07, + "loss": 0.0555, + "step": 31126 + }, + { + "epoch": 3.6910945096644134, + "grad_norm": 0.6411947610587434, + "learning_rate": 7.779171797110102e-07, + "loss": 0.0349, + "step": 31127 + }, + { + "epoch": 3.6912130914265386, + "grad_norm": 0.4023042991185588, + "learning_rate": 7.773231589391194e-07, + "loss": 0.0169, + "step": 31128 + }, + { + "epoch": 3.691331673188664, + "grad_norm": 0.42260624419506343, + "learning_rate": 7.767293614706073e-07, + "loss": 0.0195, + "step": 31129 + }, + { + "epoch": 3.6914502549507886, + "grad_norm": 0.35146807756829457, + "learning_rate": 7.761357873109443e-07, + "loss": 0.0189, + "step": 31130 + }, + { + "epoch": 3.6915688367129134, + "grad_norm": 0.35686718901959374, + "learning_rate": 7.755424364656039e-07, + "loss": 0.0145, + "step": 31131 + }, + { + "epoch": 3.6916874184750386, + "grad_norm": 0.5624804956482087, + "learning_rate": 7.749493089400567e-07, + "loss": 0.0268, + "step": 31132 + }, + { + "epoch": 3.6918060002371638, + "grad_norm": 0.8589823952346173, + "learning_rate": 7.743564047397706e-07, + "loss": 0.0454, + "step": 31133 + }, + { + "epoch": 3.6919245819992885, + "grad_norm": 0.6323241528245358, + "learning_rate": 7.737637238702133e-07, + "loss": 0.0329, + "step": 31134 + }, + { + "epoch": 3.6920431637614133, + "grad_norm": 0.5000001347363523, + "learning_rate": 7.731712663368417e-07, + "loss": 0.0331, + "step": 31135 + }, + { + "epoch": 3.6921617455235385, + "grad_norm": 0.9439198216396, + "learning_rate": 7.725790321451237e-07, + "loss": 0.0348, + "step": 31136 + }, + { + "epoch": 3.6922803272856637, + "grad_norm": 0.6102319540932453, + "learning_rate": 7.719870213005187e-07, + "loss": 0.0305, + "step": 31137 + }, + { + "epoch": 3.6923989090477884, + "grad_norm": 0.6981386360997802, + "learning_rate": 7.713952338084834e-07, + "loss": 0.0321, + "step": 31138 + }, + { + "epoch": 3.692517490809913, + "grad_norm": 0.5833688014957914, + "learning_rate": 7.70803669674472e-07, + "loss": 0.0248, + "step": 31139 + }, + { + "epoch": 3.6926360725720384, + "grad_norm": 0.45319779110768615, + "learning_rate": 7.702123289039381e-07, + "loss": 0.0226, + "step": 31140 + }, + { + "epoch": 3.6927546543341636, + "grad_norm": 0.482261397445065, + "learning_rate": 7.69621211502336e-07, + "loss": 0.0266, + "step": 31141 + }, + { + "epoch": 3.6928732360962884, + "grad_norm": 0.5521177316627854, + "learning_rate": 7.690303174751112e-07, + "loss": 0.0294, + "step": 31142 + }, + { + "epoch": 3.692991817858413, + "grad_norm": 0.6589551505166904, + "learning_rate": 7.684396468277149e-07, + "loss": 0.0295, + "step": 31143 + }, + { + "epoch": 3.6931103996205383, + "grad_norm": 0.5225863010846564, + "learning_rate": 7.678491995655873e-07, + "loss": 0.0215, + "step": 31144 + }, + { + "epoch": 3.6932289813826635, + "grad_norm": 0.2919543817321376, + "learning_rate": 7.672589756941767e-07, + "loss": 0.0161, + "step": 31145 + }, + { + "epoch": 3.6933475631447883, + "grad_norm": 0.4952607691905356, + "learning_rate": 7.666689752189232e-07, + "loss": 0.0221, + "step": 31146 + }, + { + "epoch": 3.693466144906913, + "grad_norm": 0.5527490474816786, + "learning_rate": 7.660791981452614e-07, + "loss": 0.0183, + "step": 31147 + }, + { + "epoch": 3.6935847266690383, + "grad_norm": 0.7364622441443655, + "learning_rate": 7.654896444786341e-07, + "loss": 0.0341, + "step": 31148 + }, + { + "epoch": 3.6937033084311635, + "grad_norm": 0.3527831226116415, + "learning_rate": 7.64900314224476e-07, + "loss": 0.0229, + "step": 31149 + }, + { + "epoch": 3.693821890193288, + "grad_norm": 0.6481058475214787, + "learning_rate": 7.643112073882158e-07, + "loss": 0.0292, + "step": 31150 + }, + { + "epoch": 3.6939404719554134, + "grad_norm": 0.7003285298342836, + "learning_rate": 7.637223239752856e-07, + "loss": 0.0259, + "step": 31151 + }, + { + "epoch": 3.694059053717538, + "grad_norm": 0.7427542916672668, + "learning_rate": 7.631336639911168e-07, + "loss": 0.0332, + "step": 31152 + }, + { + "epoch": 3.6941776354796634, + "grad_norm": 0.34571858247158144, + "learning_rate": 7.625452274411305e-07, + "loss": 0.0154, + "step": 31153 + }, + { + "epoch": 3.694296217241788, + "grad_norm": 0.48902525333481706, + "learning_rate": 7.619570143307609e-07, + "loss": 0.0254, + "step": 31154 + }, + { + "epoch": 3.6944147990039133, + "grad_norm": 0.7098040181777275, + "learning_rate": 7.613690246654204e-07, + "loss": 0.0355, + "step": 31155 + }, + { + "epoch": 3.694533380766038, + "grad_norm": 0.3043884869177774, + "learning_rate": 7.607812584505381e-07, + "loss": 0.0148, + "step": 31156 + }, + { + "epoch": 3.6946519625281633, + "grad_norm": 0.6957273822524048, + "learning_rate": 7.601937156915262e-07, + "loss": 0.0344, + "step": 31157 + }, + { + "epoch": 3.694770544290288, + "grad_norm": 0.6184497375348893, + "learning_rate": 7.596063963938027e-07, + "loss": 0.0211, + "step": 31158 + }, + { + "epoch": 3.6948891260524133, + "grad_norm": 0.395150536810827, + "learning_rate": 7.590193005627828e-07, + "loss": 0.0127, + "step": 31159 + }, + { + "epoch": 3.695007707814538, + "grad_norm": 0.6597995607115166, + "learning_rate": 7.58432428203884e-07, + "loss": 0.0272, + "step": 31160 + }, + { + "epoch": 3.6951262895766632, + "grad_norm": 0.46994193315916466, + "learning_rate": 7.578457793225053e-07, + "loss": 0.0249, + "step": 31161 + }, + { + "epoch": 3.695244871338788, + "grad_norm": 0.5070288397905862, + "learning_rate": 7.572593539240614e-07, + "loss": 0.0226, + "step": 31162 + }, + { + "epoch": 3.695363453100913, + "grad_norm": 0.3988634514898461, + "learning_rate": 7.566731520139591e-07, + "loss": 0.016, + "step": 31163 + }, + { + "epoch": 3.695482034863038, + "grad_norm": 0.728814970380539, + "learning_rate": 7.560871735976e-07, + "loss": 0.034, + "step": 31164 + }, + { + "epoch": 3.695600616625163, + "grad_norm": 0.32849325751158953, + "learning_rate": 7.555014186803905e-07, + "loss": 0.0183, + "step": 31165 + }, + { + "epoch": 3.695719198387288, + "grad_norm": 0.503145838930809, + "learning_rate": 7.549158872677209e-07, + "loss": 0.0197, + "step": 31166 + }, + { + "epoch": 3.695837780149413, + "grad_norm": 0.703501669400999, + "learning_rate": 7.543305793650035e-07, + "loss": 0.0389, + "step": 31167 + }, + { + "epoch": 3.695956361911538, + "grad_norm": 0.3711945469393105, + "learning_rate": 7.537454949776229e-07, + "loss": 0.0127, + "step": 31168 + }, + { + "epoch": 3.696074943673663, + "grad_norm": 0.503017588418607, + "learning_rate": 7.531606341109748e-07, + "loss": 0.0163, + "step": 31169 + }, + { + "epoch": 3.696193525435788, + "grad_norm": 0.4741676184930481, + "learning_rate": 7.525759967704521e-07, + "loss": 0.0185, + "step": 31170 + }, + { + "epoch": 3.696312107197913, + "grad_norm": 1.2676101393355226, + "learning_rate": 7.519915829614449e-07, + "loss": 0.0439, + "step": 31171 + }, + { + "epoch": 3.696430688960038, + "grad_norm": 0.3582942565297387, + "learning_rate": 7.514073926893433e-07, + "loss": 0.0243, + "step": 31172 + }, + { + "epoch": 3.696549270722163, + "grad_norm": 0.38976621475079215, + "learning_rate": 7.508234259595237e-07, + "loss": 0.0177, + "step": 31173 + }, + { + "epoch": 3.6966678524842878, + "grad_norm": 0.3690181729188304, + "learning_rate": 7.502396827773816e-07, + "loss": 0.0218, + "step": 31174 + }, + { + "epoch": 3.696786434246413, + "grad_norm": 0.4116851682428823, + "learning_rate": 7.496561631482907e-07, + "loss": 0.0128, + "step": 31175 + }, + { + "epoch": 3.6969050160085377, + "grad_norm": 0.48562961549685385, + "learning_rate": 7.490728670776353e-07, + "loss": 0.0162, + "step": 31176 + }, + { + "epoch": 3.697023597770663, + "grad_norm": 0.5653264559306838, + "learning_rate": 7.484897945707836e-07, + "loss": 0.0188, + "step": 31177 + }, + { + "epoch": 3.697142179532788, + "grad_norm": 0.46480567808208656, + "learning_rate": 7.479069456331228e-07, + "loss": 0.0205, + "step": 31178 + }, + { + "epoch": 3.697260761294913, + "grad_norm": 0.9197708888313745, + "learning_rate": 7.473243202700153e-07, + "loss": 0.0373, + "step": 31179 + }, + { + "epoch": 3.6973793430570376, + "grad_norm": 0.33592309411462806, + "learning_rate": 7.467419184868374e-07, + "loss": 0.0141, + "step": 31180 + }, + { + "epoch": 3.697497924819163, + "grad_norm": 0.4028257250459585, + "learning_rate": 7.461597402889597e-07, + "loss": 0.0138, + "step": 31181 + }, + { + "epoch": 3.697616506581288, + "grad_norm": 0.6147043221507089, + "learning_rate": 7.455777856817475e-07, + "loss": 0.0336, + "step": 31182 + }, + { + "epoch": 3.697735088343413, + "grad_norm": 0.3407010827875755, + "learning_rate": 7.449960546705659e-07, + "loss": 0.0156, + "step": 31183 + }, + { + "epoch": 3.6978536701055376, + "grad_norm": 0.7376035483430387, + "learning_rate": 7.444145472607744e-07, + "loss": 0.0259, + "step": 31184 + }, + { + "epoch": 3.6979722518676628, + "grad_norm": 0.2961110090160359, + "learning_rate": 7.438332634577383e-07, + "loss": 0.0124, + "step": 31185 + }, + { + "epoch": 3.698090833629788, + "grad_norm": 0.7879156314437368, + "learning_rate": 7.432522032668143e-07, + "loss": 0.0342, + "step": 31186 + }, + { + "epoch": 3.6982094153919127, + "grad_norm": 0.43645973876892646, + "learning_rate": 7.426713666933621e-07, + "loss": 0.0177, + "step": 31187 + }, + { + "epoch": 3.6983279971540375, + "grad_norm": 0.43107436244333325, + "learning_rate": 7.420907537427274e-07, + "loss": 0.0237, + "step": 31188 + }, + { + "epoch": 3.6984465789161627, + "grad_norm": 0.7045825691408255, + "learning_rate": 7.415103644202726e-07, + "loss": 0.0379, + "step": 31189 + }, + { + "epoch": 3.698565160678288, + "grad_norm": 0.5311521966162452, + "learning_rate": 7.409301987313461e-07, + "loss": 0.0317, + "step": 31190 + }, + { + "epoch": 3.6986837424404126, + "grad_norm": 0.4037779424503725, + "learning_rate": 7.403502566812908e-07, + "loss": 0.0195, + "step": 31191 + }, + { + "epoch": 3.6988023242025374, + "grad_norm": 0.5264229706927801, + "learning_rate": 7.397705382754582e-07, + "loss": 0.0247, + "step": 31192 + }, + { + "epoch": 3.6989209059646626, + "grad_norm": 0.5567023941438662, + "learning_rate": 7.39191043519194e-07, + "loss": 0.0305, + "step": 31193 + }, + { + "epoch": 3.699039487726788, + "grad_norm": 0.6215443920800685, + "learning_rate": 7.386117724178382e-07, + "loss": 0.0273, + "step": 31194 + }, + { + "epoch": 3.6991580694889126, + "grad_norm": 0.43152838153169615, + "learning_rate": 7.380327249767283e-07, + "loss": 0.0172, + "step": 31195 + }, + { + "epoch": 3.6992766512510373, + "grad_norm": 0.33270852279157676, + "learning_rate": 7.374539012012044e-07, + "loss": 0.015, + "step": 31196 + }, + { + "epoch": 3.6993952330131625, + "grad_norm": 0.740738177057711, + "learning_rate": 7.368753010966012e-07, + "loss": 0.0385, + "step": 31197 + }, + { + "epoch": 3.6995138147752877, + "grad_norm": 0.8402901552615325, + "learning_rate": 7.362969246682588e-07, + "loss": 0.0367, + "step": 31198 + }, + { + "epoch": 3.6996323965374125, + "grad_norm": 0.5594733067990859, + "learning_rate": 7.357187719215037e-07, + "loss": 0.0276, + "step": 31199 + }, + { + "epoch": 3.6997509782995377, + "grad_norm": 0.7697460173615152, + "learning_rate": 7.351408428616646e-07, + "loss": 0.0348, + "step": 31200 + }, + { + "epoch": 3.6998695600616625, + "grad_norm": 0.4940594688152479, + "learning_rate": 7.345631374940709e-07, + "loss": 0.0222, + "step": 31201 + }, + { + "epoch": 3.6999881418237877, + "grad_norm": 0.44256373512659297, + "learning_rate": 7.339856558240487e-07, + "loss": 0.0234, + "step": 31202 + }, + { + "epoch": 3.7001067235859124, + "grad_norm": 0.887498171932962, + "learning_rate": 7.334083978569245e-07, + "loss": 0.0325, + "step": 31203 + }, + { + "epoch": 3.7002253053480376, + "grad_norm": 0.7650074508771773, + "learning_rate": 7.328313635980133e-07, + "loss": 0.0432, + "step": 31204 + }, + { + "epoch": 3.7003438871101624, + "grad_norm": 0.5504839025504246, + "learning_rate": 7.322545530526442e-07, + "loss": 0.0159, + "step": 31205 + }, + { + "epoch": 3.7004624688722876, + "grad_norm": 0.5095809555657996, + "learning_rate": 7.316779662261269e-07, + "loss": 0.0239, + "step": 31206 + }, + { + "epoch": 3.7005810506344123, + "grad_norm": 0.7580630016946854, + "learning_rate": 7.311016031237766e-07, + "loss": 0.0314, + "step": 31207 + }, + { + "epoch": 3.7006996323965375, + "grad_norm": 0.5469661457303516, + "learning_rate": 7.305254637509112e-07, + "loss": 0.0211, + "step": 31208 + }, + { + "epoch": 3.7008182141586623, + "grad_norm": 0.6085320707329673, + "learning_rate": 7.299495481128404e-07, + "loss": 0.0392, + "step": 31209 + }, + { + "epoch": 3.7009367959207875, + "grad_norm": 0.49648564870318745, + "learning_rate": 7.293738562148711e-07, + "loss": 0.0219, + "step": 31210 + }, + { + "epoch": 3.7010553776829123, + "grad_norm": 0.8588135088141834, + "learning_rate": 7.287983880623128e-07, + "loss": 0.0442, + "step": 31211 + }, + { + "epoch": 3.7011739594450375, + "grad_norm": 0.8958310212452234, + "learning_rate": 7.282231436604698e-07, + "loss": 0.0446, + "step": 31212 + }, + { + "epoch": 3.7012925412071622, + "grad_norm": 0.5588096501255252, + "learning_rate": 7.276481230146431e-07, + "loss": 0.0242, + "step": 31213 + }, + { + "epoch": 3.7014111229692874, + "grad_norm": 0.7469776847238184, + "learning_rate": 7.270733261301427e-07, + "loss": 0.0399, + "step": 31214 + }, + { + "epoch": 3.701529704731412, + "grad_norm": 0.5519707565312701, + "learning_rate": 7.26498753012253e-07, + "loss": 0.0241, + "step": 31215 + }, + { + "epoch": 3.7016482864935374, + "grad_norm": 0.4282213576547305, + "learning_rate": 7.259244036662838e-07, + "loss": 0.0197, + "step": 31216 + }, + { + "epoch": 3.701766868255662, + "grad_norm": 0.5543226222310279, + "learning_rate": 7.253502780975224e-07, + "loss": 0.0244, + "step": 31217 + }, + { + "epoch": 3.7018854500177873, + "grad_norm": 0.5091585638298133, + "learning_rate": 7.247763763112647e-07, + "loss": 0.0201, + "step": 31218 + }, + { + "epoch": 3.702004031779912, + "grad_norm": 0.5482083678577713, + "learning_rate": 7.242026983127981e-07, + "loss": 0.0243, + "step": 31219 + }, + { + "epoch": 3.7021226135420373, + "grad_norm": 0.5220228422371889, + "learning_rate": 7.236292441074183e-07, + "loss": 0.0268, + "step": 31220 + }, + { + "epoch": 3.702241195304162, + "grad_norm": 0.8329467929565736, + "learning_rate": 7.230560137004045e-07, + "loss": 0.0332, + "step": 31221 + }, + { + "epoch": 3.7023597770662873, + "grad_norm": 0.5444736231206689, + "learning_rate": 7.22483007097044e-07, + "loss": 0.0238, + "step": 31222 + }, + { + "epoch": 3.702478358828412, + "grad_norm": 0.6232729369481905, + "learning_rate": 7.219102243026215e-07, + "loss": 0.0315, + "step": 31223 + }, + { + "epoch": 3.7025969405905372, + "grad_norm": 0.3964258949376255, + "learning_rate": 7.213376653224107e-07, + "loss": 0.0198, + "step": 31224 + }, + { + "epoch": 3.702715522352662, + "grad_norm": 0.501643088204781, + "learning_rate": 7.20765330161699e-07, + "loss": 0.0232, + "step": 31225 + }, + { + "epoch": 3.702834104114787, + "grad_norm": 0.36638431626267415, + "learning_rate": 7.201932188257515e-07, + "loss": 0.0131, + "step": 31226 + }, + { + "epoch": 3.7029526858769124, + "grad_norm": 0.7615783117000072, + "learning_rate": 7.196213313198558e-07, + "loss": 0.0367, + "step": 31227 + }, + { + "epoch": 3.703071267639037, + "grad_norm": 0.4320215598444288, + "learning_rate": 7.190496676492714e-07, + "loss": 0.015, + "step": 31228 + }, + { + "epoch": 3.703189849401162, + "grad_norm": 0.6318255257347049, + "learning_rate": 7.184782278192776e-07, + "loss": 0.0168, + "step": 31229 + }, + { + "epoch": 3.703308431163287, + "grad_norm": 0.643502211004576, + "learning_rate": 7.179070118351366e-07, + "loss": 0.0329, + "step": 31230 + }, + { + "epoch": 3.7034270129254123, + "grad_norm": 0.5529063681491397, + "learning_rate": 7.17336019702114e-07, + "loss": 0.0195, + "step": 31231 + }, + { + "epoch": 3.703545594687537, + "grad_norm": 0.3580956203130037, + "learning_rate": 7.167652514254802e-07, + "loss": 0.0177, + "step": 31232 + }, + { + "epoch": 3.703664176449662, + "grad_norm": 0.582183995391498, + "learning_rate": 7.161947070104896e-07, + "loss": 0.031, + "step": 31233 + }, + { + "epoch": 3.703782758211787, + "grad_norm": 0.5133792900709148, + "learning_rate": 7.156243864624074e-07, + "loss": 0.0224, + "step": 31234 + }, + { + "epoch": 3.7039013399739122, + "grad_norm": 0.4132230210654316, + "learning_rate": 7.150542897864876e-07, + "loss": 0.0199, + "step": 31235 + }, + { + "epoch": 3.704019921736037, + "grad_norm": 0.40074047425248877, + "learning_rate": 7.144844169879899e-07, + "loss": 0.0182, + "step": 31236 + }, + { + "epoch": 3.7041385034981618, + "grad_norm": 0.5202357664581342, + "learning_rate": 7.139147680721603e-07, + "loss": 0.0248, + "step": 31237 + }, + { + "epoch": 3.704257085260287, + "grad_norm": 0.6184952926869999, + "learning_rate": 7.133453430442583e-07, + "loss": 0.0272, + "step": 31238 + }, + { + "epoch": 3.704375667022412, + "grad_norm": 0.5681512960880519, + "learning_rate": 7.127761419095297e-07, + "loss": 0.026, + "step": 31239 + }, + { + "epoch": 3.704494248784537, + "grad_norm": 0.7212329253831629, + "learning_rate": 7.122071646732204e-07, + "loss": 0.0266, + "step": 31240 + }, + { + "epoch": 3.7046128305466617, + "grad_norm": 0.5741154510392191, + "learning_rate": 7.116384113405761e-07, + "loss": 0.0313, + "step": 31241 + }, + { + "epoch": 3.704731412308787, + "grad_norm": 0.4738818057517235, + "learning_rate": 7.110698819168426e-07, + "loss": 0.0204, + "step": 31242 + }, + { + "epoch": 3.704849994070912, + "grad_norm": 0.5121374985143637, + "learning_rate": 7.105015764072631e-07, + "loss": 0.0201, + "step": 31243 + }, + { + "epoch": 3.704968575833037, + "grad_norm": 0.6484307047476258, + "learning_rate": 7.099334948170694e-07, + "loss": 0.035, + "step": 31244 + }, + { + "epoch": 3.7050871575951616, + "grad_norm": 0.3169596529827098, + "learning_rate": 7.093656371515046e-07, + "loss": 0.0163, + "step": 31245 + }, + { + "epoch": 3.705205739357287, + "grad_norm": 0.30761159290572004, + "learning_rate": 7.087980034158004e-07, + "loss": 0.0107, + "step": 31246 + }, + { + "epoch": 3.705324321119412, + "grad_norm": 0.3342314771529582, + "learning_rate": 7.082305936151917e-07, + "loss": 0.016, + "step": 31247 + }, + { + "epoch": 3.7054429028815368, + "grad_norm": 0.535063980988157, + "learning_rate": 7.076634077549049e-07, + "loss": 0.024, + "step": 31248 + }, + { + "epoch": 3.7055614846436615, + "grad_norm": 0.600046981418442, + "learning_rate": 7.070964458401774e-07, + "loss": 0.0237, + "step": 31249 + }, + { + "epoch": 3.7056800664057867, + "grad_norm": 0.39081868643306156, + "learning_rate": 7.065297078762301e-07, + "loss": 0.0152, + "step": 31250 + }, + { + "epoch": 3.705798648167912, + "grad_norm": 0.4894404653385313, + "learning_rate": 7.059631938682837e-07, + "loss": 0.0233, + "step": 31251 + }, + { + "epoch": 3.7059172299300367, + "grad_norm": 0.45404756979789324, + "learning_rate": 7.053969038215674e-07, + "loss": 0.0252, + "step": 31252 + }, + { + "epoch": 3.706035811692162, + "grad_norm": 0.4516785901186444, + "learning_rate": 7.048308377413021e-07, + "loss": 0.0202, + "step": 31253 + }, + { + "epoch": 3.7061543934542867, + "grad_norm": 0.4410587801264695, + "learning_rate": 7.04264995632703e-07, + "loss": 0.0188, + "step": 31254 + }, + { + "epoch": 3.706272975216412, + "grad_norm": 0.4476595221018931, + "learning_rate": 7.036993775009854e-07, + "loss": 0.0209, + "step": 31255 + }, + { + "epoch": 3.7063915569785366, + "grad_norm": 0.4313844545163968, + "learning_rate": 7.031339833513673e-07, + "loss": 0.0145, + "step": 31256 + }, + { + "epoch": 3.706510138740662, + "grad_norm": 0.4363662691068536, + "learning_rate": 7.025688131890584e-07, + "loss": 0.0244, + "step": 31257 + }, + { + "epoch": 3.7066287205027866, + "grad_norm": 0.43679825922819754, + "learning_rate": 7.020038670192713e-07, + "loss": 0.0172, + "step": 31258 + }, + { + "epoch": 3.706747302264912, + "grad_norm": 0.36745928468702543, + "learning_rate": 7.014391448472074e-07, + "loss": 0.014, + "step": 31259 + }, + { + "epoch": 3.7068658840270365, + "grad_norm": 0.5416563979572981, + "learning_rate": 7.008746466780819e-07, + "loss": 0.0268, + "step": 31260 + }, + { + "epoch": 3.7069844657891617, + "grad_norm": 0.5794167594245443, + "learning_rate": 7.003103725170934e-07, + "loss": 0.0234, + "step": 31261 + }, + { + "epoch": 3.7071030475512865, + "grad_norm": 0.5201191860598152, + "learning_rate": 6.997463223694434e-07, + "loss": 0.0185, + "step": 31262 + }, + { + "epoch": 3.7072216293134117, + "grad_norm": 0.4557655218866438, + "learning_rate": 6.991824962403359e-07, + "loss": 0.0233, + "step": 31263 + }, + { + "epoch": 3.7073402110755365, + "grad_norm": 0.5716154974415758, + "learning_rate": 6.986188941349642e-07, + "loss": 0.027, + "step": 31264 + }, + { + "epoch": 3.7074587928376617, + "grad_norm": 0.32235829531181814, + "learning_rate": 6.980555160585295e-07, + "loss": 0.011, + "step": 31265 + }, + { + "epoch": 3.7075773745997864, + "grad_norm": 1.1465701905484642, + "learning_rate": 6.974923620162193e-07, + "loss": 0.0403, + "step": 31266 + }, + { + "epoch": 3.7076959563619116, + "grad_norm": 0.41452281957365333, + "learning_rate": 6.969294320132269e-07, + "loss": 0.0246, + "step": 31267 + }, + { + "epoch": 3.7078145381240364, + "grad_norm": 0.3609207941034242, + "learning_rate": 6.963667260547424e-07, + "loss": 0.0118, + "step": 31268 + }, + { + "epoch": 3.7079331198861616, + "grad_norm": 0.5312540250043283, + "learning_rate": 6.958042441459589e-07, + "loss": 0.023, + "step": 31269 + }, + { + "epoch": 3.7080517016482863, + "grad_norm": 0.4452723573301421, + "learning_rate": 6.952419862920528e-07, + "loss": 0.0212, + "step": 31270 + }, + { + "epoch": 3.7081702834104115, + "grad_norm": 0.4889935768862295, + "learning_rate": 6.946799524982089e-07, + "loss": 0.0232, + "step": 31271 + }, + { + "epoch": 3.7082888651725363, + "grad_norm": 0.3445501450361918, + "learning_rate": 6.941181427696119e-07, + "loss": 0.0198, + "step": 31272 + }, + { + "epoch": 3.7084074469346615, + "grad_norm": 0.3718847756787146, + "learning_rate": 6.93556557111441e-07, + "loss": 0.016, + "step": 31273 + }, + { + "epoch": 3.7085260286967863, + "grad_norm": 0.43859794826077175, + "learning_rate": 6.929951955288727e-07, + "loss": 0.0199, + "step": 31274 + }, + { + "epoch": 3.7086446104589115, + "grad_norm": 0.6416113628285168, + "learning_rate": 6.924340580270778e-07, + "loss": 0.0311, + "step": 31275 + }, + { + "epoch": 3.7087631922210367, + "grad_norm": 0.5146111955327067, + "learning_rate": 6.918731446112381e-07, + "loss": 0.0286, + "step": 31276 + }, + { + "epoch": 3.7088817739831614, + "grad_norm": 0.9295038361130905, + "learning_rate": 6.913124552865163e-07, + "loss": 0.0418, + "step": 31277 + }, + { + "epoch": 3.709000355745286, + "grad_norm": 1.0843934520395029, + "learning_rate": 6.907519900580861e-07, + "loss": 0.0651, + "step": 31278 + }, + { + "epoch": 3.7091189375074114, + "grad_norm": 0.6445416947609446, + "learning_rate": 6.9019174893111e-07, + "loss": 0.0347, + "step": 31279 + }, + { + "epoch": 3.7092375192695366, + "grad_norm": 0.6557396179430635, + "learning_rate": 6.896317319107559e-07, + "loss": 0.0303, + "step": 31280 + }, + { + "epoch": 3.7093561010316614, + "grad_norm": 0.7433535838935268, + "learning_rate": 6.890719390021893e-07, + "loss": 0.0343, + "step": 31281 + }, + { + "epoch": 3.709474682793786, + "grad_norm": 0.42479985029850587, + "learning_rate": 6.885123702105617e-07, + "loss": 0.0245, + "step": 31282 + }, + { + "epoch": 3.7095932645559113, + "grad_norm": 0.5998068039580419, + "learning_rate": 6.879530255410438e-07, + "loss": 0.0273, + "step": 31283 + }, + { + "epoch": 3.7097118463180365, + "grad_norm": 0.4876132837258091, + "learning_rate": 6.873939049987815e-07, + "loss": 0.0185, + "step": 31284 + }, + { + "epoch": 3.7098304280801613, + "grad_norm": 0.322403789407701, + "learning_rate": 6.868350085889374e-07, + "loss": 0.0169, + "step": 31285 + }, + { + "epoch": 3.709949009842286, + "grad_norm": 0.6540900026723424, + "learning_rate": 6.862763363166519e-07, + "loss": 0.038, + "step": 31286 + }, + { + "epoch": 3.7100675916044112, + "grad_norm": 0.4861243327495563, + "learning_rate": 6.857178881870901e-07, + "loss": 0.0242, + "step": 31287 + }, + { + "epoch": 3.7101861733665364, + "grad_norm": 0.4743489534757921, + "learning_rate": 6.851596642053926e-07, + "loss": 0.024, + "step": 31288 + }, + { + "epoch": 3.710304755128661, + "grad_norm": 0.617425257865431, + "learning_rate": 6.846016643767022e-07, + "loss": 0.0276, + "step": 31289 + }, + { + "epoch": 3.710423336890786, + "grad_norm": 0.4295018772416999, + "learning_rate": 6.840438887061706e-07, + "loss": 0.0158, + "step": 31290 + }, + { + "epoch": 3.710541918652911, + "grad_norm": 0.3490104239556726, + "learning_rate": 6.834863371989326e-07, + "loss": 0.0166, + "step": 31291 + }, + { + "epoch": 3.7106605004150364, + "grad_norm": 0.4623706596654786, + "learning_rate": 6.829290098601365e-07, + "loss": 0.0205, + "step": 31292 + }, + { + "epoch": 3.710779082177161, + "grad_norm": 0.39795077631796866, + "learning_rate": 6.823719066949092e-07, + "loss": 0.0209, + "step": 31293 + }, + { + "epoch": 3.710897663939286, + "grad_norm": 0.4077578838591052, + "learning_rate": 6.81815027708399e-07, + "loss": 0.0208, + "step": 31294 + }, + { + "epoch": 3.711016245701411, + "grad_norm": 0.5206976875004036, + "learning_rate": 6.812583729057298e-07, + "loss": 0.0297, + "step": 31295 + }, + { + "epoch": 3.7111348274635363, + "grad_norm": 0.494399881860018, + "learning_rate": 6.807019422920391e-07, + "loss": 0.0246, + "step": 31296 + }, + { + "epoch": 3.711253409225661, + "grad_norm": 0.49104046508334653, + "learning_rate": 6.801457358724506e-07, + "loss": 0.0222, + "step": 31297 + }, + { + "epoch": 3.711371990987786, + "grad_norm": 0.4558840444684764, + "learning_rate": 6.79589753652099e-07, + "loss": 0.019, + "step": 31298 + }, + { + "epoch": 3.711490572749911, + "grad_norm": 0.6512033849394941, + "learning_rate": 6.790339956361053e-07, + "loss": 0.0351, + "step": 31299 + }, + { + "epoch": 3.711609154512036, + "grad_norm": 0.5294645907499599, + "learning_rate": 6.784784618295931e-07, + "loss": 0.0196, + "step": 31300 + }, + { + "epoch": 3.711727736274161, + "grad_norm": 0.5277695702468178, + "learning_rate": 6.779231522376833e-07, + "loss": 0.0295, + "step": 31301 + }, + { + "epoch": 3.711846318036286, + "grad_norm": 0.40024928928296, + "learning_rate": 6.77368066865497e-07, + "loss": 0.0157, + "step": 31302 + }, + { + "epoch": 3.711964899798411, + "grad_norm": 0.5631502341789902, + "learning_rate": 6.76813205718152e-07, + "loss": 0.0294, + "step": 31303 + }, + { + "epoch": 3.712083481560536, + "grad_norm": 0.6504477904419412, + "learning_rate": 6.762585688007611e-07, + "loss": 0.0364, + "step": 31304 + }, + { + "epoch": 3.712202063322661, + "grad_norm": 0.6040609692887471, + "learning_rate": 6.757041561184396e-07, + "loss": 0.024, + "step": 31305 + }, + { + "epoch": 3.712320645084786, + "grad_norm": 0.5655548290009161, + "learning_rate": 6.751499676762974e-07, + "loss": 0.0201, + "step": 31306 + }, + { + "epoch": 3.712439226846911, + "grad_norm": 0.2922779205859103, + "learning_rate": 6.74596003479444e-07, + "loss": 0.0116, + "step": 31307 + }, + { + "epoch": 3.712557808609036, + "grad_norm": 0.6628778483309246, + "learning_rate": 6.740422635329813e-07, + "loss": 0.0271, + "step": 31308 + }, + { + "epoch": 3.712676390371161, + "grad_norm": 0.7177799862334735, + "learning_rate": 6.734887478420243e-07, + "loss": 0.0376, + "step": 31309 + }, + { + "epoch": 3.712794972133286, + "grad_norm": 0.4094201062609272, + "learning_rate": 6.729354564116663e-07, + "loss": 0.0179, + "step": 31310 + }, + { + "epoch": 3.712913553895411, + "grad_norm": 0.6022478578932542, + "learning_rate": 6.723823892470116e-07, + "loss": 0.0325, + "step": 31311 + }, + { + "epoch": 3.713032135657536, + "grad_norm": 0.6981141783957656, + "learning_rate": 6.718295463531587e-07, + "loss": 0.0342, + "step": 31312 + }, + { + "epoch": 3.7131507174196607, + "grad_norm": 0.550364380168794, + "learning_rate": 6.712769277352038e-07, + "loss": 0.0262, + "step": 31313 + }, + { + "epoch": 3.713269299181786, + "grad_norm": 0.49941772727470024, + "learning_rate": 6.707245333982454e-07, + "loss": 0.0248, + "step": 31314 + }, + { + "epoch": 3.7133878809439107, + "grad_norm": 0.46839219842527424, + "learning_rate": 6.701723633473711e-07, + "loss": 0.0173, + "step": 31315 + }, + { + "epoch": 3.713506462706036, + "grad_norm": 0.498312809990686, + "learning_rate": 6.696204175876686e-07, + "loss": 0.0251, + "step": 31316 + }, + { + "epoch": 3.7136250444681607, + "grad_norm": 0.42102410297743065, + "learning_rate": 6.690686961242337e-07, + "loss": 0.0191, + "step": 31317 + }, + { + "epoch": 3.713743626230286, + "grad_norm": 0.5828399473704055, + "learning_rate": 6.685171989621486e-07, + "loss": 0.0251, + "step": 31318 + }, + { + "epoch": 3.7138622079924106, + "grad_norm": 0.526886823584554, + "learning_rate": 6.679659261064952e-07, + "loss": 0.0259, + "step": 31319 + }, + { + "epoch": 3.713980789754536, + "grad_norm": 0.4301795265383838, + "learning_rate": 6.674148775623612e-07, + "loss": 0.0228, + "step": 31320 + }, + { + "epoch": 3.7140993715166606, + "grad_norm": 0.3765719653285971, + "learning_rate": 6.668640533348202e-07, + "loss": 0.0197, + "step": 31321 + }, + { + "epoch": 3.714217953278786, + "grad_norm": 0.5357365350365233, + "learning_rate": 6.663134534289545e-07, + "loss": 0.0256, + "step": 31322 + }, + { + "epoch": 3.7143365350409105, + "grad_norm": 0.4488290686515934, + "learning_rate": 6.657630778498403e-07, + "loss": 0.0191, + "step": 31323 + }, + { + "epoch": 3.7144551168030358, + "grad_norm": 0.5834645786476494, + "learning_rate": 6.652129266025486e-07, + "loss": 0.0297, + "step": 31324 + }, + { + "epoch": 3.714573698565161, + "grad_norm": 0.4133307284988312, + "learning_rate": 6.646629996921533e-07, + "loss": 0.0198, + "step": 31325 + }, + { + "epoch": 3.7146922803272857, + "grad_norm": 0.42784144194181023, + "learning_rate": 6.641132971237224e-07, + "loss": 0.0178, + "step": 31326 + }, + { + "epoch": 3.7148108620894105, + "grad_norm": 0.6771020556153522, + "learning_rate": 6.635638189023241e-07, + "loss": 0.0348, + "step": 31327 + }, + { + "epoch": 3.7149294438515357, + "grad_norm": 0.5197000221710092, + "learning_rate": 6.630145650330239e-07, + "loss": 0.0274, + "step": 31328 + }, + { + "epoch": 3.715048025613661, + "grad_norm": 0.4881719939585504, + "learning_rate": 6.62465535520887e-07, + "loss": 0.0189, + "step": 31329 + }, + { + "epoch": 3.7151666073757856, + "grad_norm": 0.8144398426255635, + "learning_rate": 6.619167303709706e-07, + "loss": 0.0311, + "step": 31330 + }, + { + "epoch": 3.7152851891379104, + "grad_norm": 0.5440610200640724, + "learning_rate": 6.6136814958834e-07, + "loss": 0.0261, + "step": 31331 + }, + { + "epoch": 3.7154037709000356, + "grad_norm": 0.5776969844736991, + "learning_rate": 6.608197931780496e-07, + "loss": 0.0225, + "step": 31332 + }, + { + "epoch": 3.715522352662161, + "grad_norm": 0.5234362431736586, + "learning_rate": 6.602716611451509e-07, + "loss": 0.0217, + "step": 31333 + }, + { + "epoch": 3.7156409344242856, + "grad_norm": 0.6023180199824486, + "learning_rate": 6.597237534947009e-07, + "loss": 0.0246, + "step": 31334 + }, + { + "epoch": 3.7157595161864103, + "grad_norm": 0.5298404128275658, + "learning_rate": 6.591760702317484e-07, + "loss": 0.0237, + "step": 31335 + }, + { + "epoch": 3.7158780979485355, + "grad_norm": 0.5430989781671565, + "learning_rate": 6.586286113613477e-07, + "loss": 0.0203, + "step": 31336 + }, + { + "epoch": 3.7159966797106607, + "grad_norm": 0.6211084649653716, + "learning_rate": 6.580813768885391e-07, + "loss": 0.0318, + "step": 31337 + }, + { + "epoch": 3.7161152614727855, + "grad_norm": 0.6301015669418718, + "learning_rate": 6.575343668183687e-07, + "loss": 0.0195, + "step": 31338 + }, + { + "epoch": 3.7162338432349102, + "grad_norm": 0.3028899693869545, + "learning_rate": 6.569875811558823e-07, + "loss": 0.0124, + "step": 31339 + }, + { + "epoch": 3.7163524249970354, + "grad_norm": 0.41236998816816633, + "learning_rate": 6.564410199061205e-07, + "loss": 0.0165, + "step": 31340 + }, + { + "epoch": 3.7164710067591606, + "grad_norm": 0.5891114815407545, + "learning_rate": 6.558946830741208e-07, + "loss": 0.0223, + "step": 31341 + }, + { + "epoch": 3.7165895885212854, + "grad_norm": 0.5369237889239968, + "learning_rate": 6.553485706649154e-07, + "loss": 0.0294, + "step": 31342 + }, + { + "epoch": 3.71670817028341, + "grad_norm": 0.581004699226553, + "learning_rate": 6.548026826835474e-07, + "loss": 0.0291, + "step": 31343 + }, + { + "epoch": 3.7168267520455354, + "grad_norm": 0.48114192951163853, + "learning_rate": 6.542570191350434e-07, + "loss": 0.018, + "step": 31344 + }, + { + "epoch": 3.7169453338076606, + "grad_norm": 0.45449022335269273, + "learning_rate": 6.537115800244325e-07, + "loss": 0.0228, + "step": 31345 + }, + { + "epoch": 3.7170639155697853, + "grad_norm": 0.2450118833978659, + "learning_rate": 6.531663653567471e-07, + "loss": 0.0093, + "step": 31346 + }, + { + "epoch": 3.71718249733191, + "grad_norm": 0.6010754207005617, + "learning_rate": 6.526213751370135e-07, + "loss": 0.0276, + "step": 31347 + }, + { + "epoch": 3.7173010790940353, + "grad_norm": 0.46500710041705606, + "learning_rate": 6.52076609370253e-07, + "loss": 0.0224, + "step": 31348 + }, + { + "epoch": 3.7174196608561605, + "grad_norm": 0.6298496828003539, + "learning_rate": 6.515320680614861e-07, + "loss": 0.0284, + "step": 31349 + }, + { + "epoch": 3.7175382426182852, + "grad_norm": 0.6055532029719441, + "learning_rate": 6.50987751215737e-07, + "loss": 0.0304, + "step": 31350 + }, + { + "epoch": 3.7176568243804105, + "grad_norm": 0.6893239920430956, + "learning_rate": 6.504436588380236e-07, + "loss": 0.0377, + "step": 31351 + }, + { + "epoch": 3.717775406142535, + "grad_norm": 0.5030996753946907, + "learning_rate": 6.498997909333615e-07, + "loss": 0.0202, + "step": 31352 + }, + { + "epoch": 3.7178939879046604, + "grad_norm": 0.8951321014205403, + "learning_rate": 6.493561475067577e-07, + "loss": 0.0506, + "step": 31353 + }, + { + "epoch": 3.718012569666785, + "grad_norm": 0.6691634877965267, + "learning_rate": 6.488127285632333e-07, + "loss": 0.0193, + "step": 31354 + }, + { + "epoch": 3.7181311514289104, + "grad_norm": 0.6087716548547942, + "learning_rate": 6.482695341077927e-07, + "loss": 0.0225, + "step": 31355 + }, + { + "epoch": 3.718249733191035, + "grad_norm": 0.5721073271800746, + "learning_rate": 6.477265641454483e-07, + "loss": 0.0352, + "step": 31356 + }, + { + "epoch": 3.7183683149531603, + "grad_norm": 0.6115339671884462, + "learning_rate": 6.471838186811963e-07, + "loss": 0.0259, + "step": 31357 + }, + { + "epoch": 3.718486896715285, + "grad_norm": 0.37851158815023356, + "learning_rate": 6.46641297720052e-07, + "loss": 0.0179, + "step": 31358 + }, + { + "epoch": 3.7186054784774103, + "grad_norm": 0.6400551659168957, + "learning_rate": 6.460990012670087e-07, + "loss": 0.0291, + "step": 31359 + }, + { + "epoch": 3.718724060239535, + "grad_norm": 0.850358580604002, + "learning_rate": 6.455569293270681e-07, + "loss": 0.0405, + "step": 31360 + }, + { + "epoch": 3.7188426420016603, + "grad_norm": 0.4344119081603087, + "learning_rate": 6.450150819052258e-07, + "loss": 0.0181, + "step": 31361 + }, + { + "epoch": 3.718961223763785, + "grad_norm": 0.5726534482721657, + "learning_rate": 6.444734590064782e-07, + "loss": 0.0242, + "step": 31362 + }, + { + "epoch": 3.71907980552591, + "grad_norm": 0.37878239367882655, + "learning_rate": 6.439320606358212e-07, + "loss": 0.0121, + "step": 31363 + }, + { + "epoch": 3.719198387288035, + "grad_norm": 0.5516078270057642, + "learning_rate": 6.433908867982396e-07, + "loss": 0.0234, + "step": 31364 + }, + { + "epoch": 3.71931696905016, + "grad_norm": 0.42646111119723484, + "learning_rate": 6.428499374987296e-07, + "loss": 0.0178, + "step": 31365 + }, + { + "epoch": 3.719435550812285, + "grad_norm": 0.46586631059521266, + "learning_rate": 6.423092127422731e-07, + "loss": 0.0274, + "step": 31366 + }, + { + "epoch": 3.71955413257441, + "grad_norm": 0.6011538222429266, + "learning_rate": 6.417687125338578e-07, + "loss": 0.0228, + "step": 31367 + }, + { + "epoch": 3.719672714336535, + "grad_norm": 0.37067498546757177, + "learning_rate": 6.412284368784604e-07, + "loss": 0.0164, + "step": 31368 + }, + { + "epoch": 3.71979129609866, + "grad_norm": 0.3671073355746183, + "learning_rate": 6.406883857810714e-07, + "loss": 0.0151, + "step": 31369 + }, + { + "epoch": 3.719909877860785, + "grad_norm": 0.5133899147894238, + "learning_rate": 6.401485592466616e-07, + "loss": 0.017, + "step": 31370 + }, + { + "epoch": 3.72002845962291, + "grad_norm": 0.5347291322213187, + "learning_rate": 6.396089572802105e-07, + "loss": 0.0267, + "step": 31371 + }, + { + "epoch": 3.720147041385035, + "grad_norm": 0.3814375392548133, + "learning_rate": 6.390695798866919e-07, + "loss": 0.0171, + "step": 31372 + }, + { + "epoch": 3.72026562314716, + "grad_norm": 0.4781739302964642, + "learning_rate": 6.385304270710768e-07, + "loss": 0.0191, + "step": 31373 + }, + { + "epoch": 3.720384204909285, + "grad_norm": 0.5105097275048281, + "learning_rate": 6.379914988383418e-07, + "loss": 0.0233, + "step": 31374 + }, + { + "epoch": 3.72050278667141, + "grad_norm": 0.4899655802326429, + "learning_rate": 6.374527951934439e-07, + "loss": 0.0236, + "step": 31375 + }, + { + "epoch": 3.7206213684335347, + "grad_norm": 0.2991300218966744, + "learning_rate": 6.369143161413626e-07, + "loss": 0.0116, + "step": 31376 + }, + { + "epoch": 3.72073995019566, + "grad_norm": 0.4793432644045596, + "learning_rate": 6.363760616870495e-07, + "loss": 0.0203, + "step": 31377 + }, + { + "epoch": 3.720858531957785, + "grad_norm": 0.5401187130967229, + "learning_rate": 6.358380318354756e-07, + "loss": 0.0217, + "step": 31378 + }, + { + "epoch": 3.72097711371991, + "grad_norm": 0.29279173781475853, + "learning_rate": 6.35300226591598e-07, + "loss": 0.012, + "step": 31379 + }, + { + "epoch": 3.7210956954820347, + "grad_norm": 0.7251777113500958, + "learning_rate": 6.347626459603712e-07, + "loss": 0.035, + "step": 31380 + }, + { + "epoch": 3.72121427724416, + "grad_norm": 0.854766551909778, + "learning_rate": 6.342252899467604e-07, + "loss": 0.0296, + "step": 31381 + }, + { + "epoch": 3.721332859006285, + "grad_norm": 0.5205416717842121, + "learning_rate": 6.336881585557092e-07, + "loss": 0.0324, + "step": 31382 + }, + { + "epoch": 3.72145144076841, + "grad_norm": 0.5572041942615217, + "learning_rate": 6.331512517921717e-07, + "loss": 0.0273, + "step": 31383 + }, + { + "epoch": 3.7215700225305346, + "grad_norm": 0.5731953354132416, + "learning_rate": 6.326145696610997e-07, + "loss": 0.0232, + "step": 31384 + }, + { + "epoch": 3.72168860429266, + "grad_norm": 1.0610501899611209, + "learning_rate": 6.320781121674446e-07, + "loss": 0.0529, + "step": 31385 + }, + { + "epoch": 3.721807186054785, + "grad_norm": 0.5524156712054321, + "learning_rate": 6.315418793161415e-07, + "loss": 0.0273, + "step": 31386 + }, + { + "epoch": 3.7219257678169098, + "grad_norm": 0.949151499823319, + "learning_rate": 6.31005871112142e-07, + "loss": 0.0413, + "step": 31387 + }, + { + "epoch": 3.7220443495790345, + "grad_norm": 0.41184729100584533, + "learning_rate": 6.304700875603864e-07, + "loss": 0.0181, + "step": 31388 + }, + { + "epoch": 3.7221629313411597, + "grad_norm": 0.5862299272363212, + "learning_rate": 6.299345286658098e-07, + "loss": 0.0295, + "step": 31389 + }, + { + "epoch": 3.722281513103285, + "grad_norm": 0.3310999030167397, + "learning_rate": 6.293991944333527e-07, + "loss": 0.0166, + "step": 31390 + }, + { + "epoch": 3.7224000948654097, + "grad_norm": 0.4508415574492145, + "learning_rate": 6.288640848679528e-07, + "loss": 0.0206, + "step": 31391 + }, + { + "epoch": 3.7225186766275344, + "grad_norm": 0.44461685518151634, + "learning_rate": 6.283291999745394e-07, + "loss": 0.0212, + "step": 31392 + }, + { + "epoch": 3.7226372583896596, + "grad_norm": 0.4567594192242598, + "learning_rate": 6.27794539758042e-07, + "loss": 0.0223, + "step": 31393 + }, + { + "epoch": 3.722755840151785, + "grad_norm": 0.6837596429753174, + "learning_rate": 6.272601042233928e-07, + "loss": 0.0325, + "step": 31394 + }, + { + "epoch": 3.7228744219139096, + "grad_norm": 0.5333662400899111, + "learning_rate": 6.267258933755183e-07, + "loss": 0.027, + "step": 31395 + }, + { + "epoch": 3.7229930036760344, + "grad_norm": 0.8843911135296453, + "learning_rate": 6.26191907219345e-07, + "loss": 0.0459, + "step": 31396 + }, + { + "epoch": 3.7231115854381596, + "grad_norm": 0.7630283564912982, + "learning_rate": 6.256581457597888e-07, + "loss": 0.0279, + "step": 31397 + }, + { + "epoch": 3.7232301672002848, + "grad_norm": 0.635391567563164, + "learning_rate": 6.25124609001776e-07, + "loss": 0.0422, + "step": 31398 + }, + { + "epoch": 3.7233487489624095, + "grad_norm": 0.6852837379913299, + "learning_rate": 6.245912969502249e-07, + "loss": 0.0255, + "step": 31399 + }, + { + "epoch": 3.7234673307245347, + "grad_norm": 0.4301615565412388, + "learning_rate": 6.240582096100484e-07, + "loss": 0.0245, + "step": 31400 + }, + { + "epoch": 3.7235859124866595, + "grad_norm": 0.6694338551763499, + "learning_rate": 6.235253469861647e-07, + "loss": 0.0301, + "step": 31401 + }, + { + "epoch": 3.7237044942487847, + "grad_norm": 0.309036317281725, + "learning_rate": 6.229927090834863e-07, + "loss": 0.0129, + "step": 31402 + }, + { + "epoch": 3.7238230760109094, + "grad_norm": 0.39048245110929425, + "learning_rate": 6.224602959069237e-07, + "loss": 0.0164, + "step": 31403 + }, + { + "epoch": 3.7239416577730347, + "grad_norm": 0.7859851889514737, + "learning_rate": 6.219281074613809e-07, + "loss": 0.0424, + "step": 31404 + }, + { + "epoch": 3.7240602395351594, + "grad_norm": 0.6385332949325545, + "learning_rate": 6.213961437517651e-07, + "loss": 0.0355, + "step": 31405 + }, + { + "epoch": 3.7241788212972846, + "grad_norm": 0.6041970605596065, + "learning_rate": 6.208644047829837e-07, + "loss": 0.0218, + "step": 31406 + }, + { + "epoch": 3.7242974030594094, + "grad_norm": 0.44863041896716305, + "learning_rate": 6.203328905599381e-07, + "loss": 0.0169, + "step": 31407 + }, + { + "epoch": 3.7244159848215346, + "grad_norm": 0.23242413725890373, + "learning_rate": 6.198016010875246e-07, + "loss": 0.0081, + "step": 31408 + }, + { + "epoch": 3.7245345665836593, + "grad_norm": 0.719922776147399, + "learning_rate": 6.192705363706447e-07, + "loss": 0.0274, + "step": 31409 + }, + { + "epoch": 3.7246531483457845, + "grad_norm": 0.4461987672380783, + "learning_rate": 6.187396964141917e-07, + "loss": 0.0248, + "step": 31410 + }, + { + "epoch": 3.7247717301079093, + "grad_norm": 0.7016852635517967, + "learning_rate": 6.18209081223059e-07, + "loss": 0.0319, + "step": 31411 + }, + { + "epoch": 3.7248903118700345, + "grad_norm": 0.5172169628086125, + "learning_rate": 6.176786908021453e-07, + "loss": 0.0268, + "step": 31412 + }, + { + "epoch": 3.7250088936321593, + "grad_norm": 0.7138909025593738, + "learning_rate": 6.171485251563275e-07, + "loss": 0.0232, + "step": 31413 + }, + { + "epoch": 3.7251274753942845, + "grad_norm": 0.6817262997088542, + "learning_rate": 6.166185842905043e-07, + "loss": 0.0283, + "step": 31414 + }, + { + "epoch": 3.725246057156409, + "grad_norm": 0.4571313605845929, + "learning_rate": 6.16088868209555e-07, + "loss": 0.0217, + "step": 31415 + }, + { + "epoch": 3.7253646389185344, + "grad_norm": 0.4559350405182197, + "learning_rate": 6.155593769183676e-07, + "loss": 0.0154, + "step": 31416 + }, + { + "epoch": 3.725483220680659, + "grad_norm": 0.49418800488005843, + "learning_rate": 6.150301104218187e-07, + "loss": 0.0212, + "step": 31417 + }, + { + "epoch": 3.7256018024427844, + "grad_norm": 0.48815364113940213, + "learning_rate": 6.145010687247904e-07, + "loss": 0.025, + "step": 31418 + }, + { + "epoch": 3.725720384204909, + "grad_norm": 0.28933899863447654, + "learning_rate": 6.139722518321567e-07, + "loss": 0.0175, + "step": 31419 + }, + { + "epoch": 3.7258389659670343, + "grad_norm": 0.5044762431122067, + "learning_rate": 6.134436597487969e-07, + "loss": 0.0296, + "step": 31420 + }, + { + "epoch": 3.725957547729159, + "grad_norm": 0.4696265709235607, + "learning_rate": 6.129152924795794e-07, + "loss": 0.016, + "step": 31421 + }, + { + "epoch": 3.7260761294912843, + "grad_norm": 0.39048026185577067, + "learning_rate": 6.123871500293782e-07, + "loss": 0.0154, + "step": 31422 + }, + { + "epoch": 3.726194711253409, + "grad_norm": 0.7919728661899215, + "learning_rate": 6.118592324030642e-07, + "loss": 0.041, + "step": 31423 + }, + { + "epoch": 3.7263132930155343, + "grad_norm": 0.7211488229087859, + "learning_rate": 6.113315396054948e-07, + "loss": 0.0346, + "step": 31424 + }, + { + "epoch": 3.726431874777659, + "grad_norm": 0.7259068586121512, + "learning_rate": 6.108040716415492e-07, + "loss": 0.0307, + "step": 31425 + }, + { + "epoch": 3.7265504565397842, + "grad_norm": 0.5236783249316528, + "learning_rate": 6.102768285160764e-07, + "loss": 0.0247, + "step": 31426 + }, + { + "epoch": 3.7266690383019094, + "grad_norm": 0.5816078880888574, + "learning_rate": 6.097498102339449e-07, + "loss": 0.0348, + "step": 31427 + }, + { + "epoch": 3.726787620064034, + "grad_norm": 0.8581068302581635, + "learning_rate": 6.09223016800009e-07, + "loss": 0.046, + "step": 31428 + }, + { + "epoch": 3.726906201826159, + "grad_norm": 0.42337734359445073, + "learning_rate": 6.086964482191287e-07, + "loss": 0.0202, + "step": 31429 + }, + { + "epoch": 3.727024783588284, + "grad_norm": 0.4847422077567859, + "learning_rate": 6.081701044961585e-07, + "loss": 0.0204, + "step": 31430 + }, + { + "epoch": 3.7271433653504094, + "grad_norm": 0.7406760079031143, + "learning_rate": 6.076439856359445e-07, + "loss": 0.0351, + "step": 31431 + }, + { + "epoch": 3.727261947112534, + "grad_norm": 0.6331416833031045, + "learning_rate": 6.071180916433439e-07, + "loss": 0.0354, + "step": 31432 + }, + { + "epoch": 3.727380528874659, + "grad_norm": 0.26979544875949774, + "learning_rate": 6.065924225232028e-07, + "loss": 0.0118, + "step": 31433 + }, + { + "epoch": 3.727499110636784, + "grad_norm": 0.41965160393139705, + "learning_rate": 6.060669782803646e-07, + "loss": 0.01, + "step": 31434 + }, + { + "epoch": 3.7276176923989093, + "grad_norm": 0.47854535581367214, + "learning_rate": 6.055417589196727e-07, + "loss": 0.0216, + "step": 31435 + }, + { + "epoch": 3.727736274161034, + "grad_norm": 0.22045720930832644, + "learning_rate": 6.050167644459758e-07, + "loss": 0.0117, + "step": 31436 + }, + { + "epoch": 3.727854855923159, + "grad_norm": 0.3614185464159956, + "learning_rate": 6.044919948641064e-07, + "loss": 0.0196, + "step": 31437 + }, + { + "epoch": 3.727973437685284, + "grad_norm": 0.43597212440234795, + "learning_rate": 6.039674501789078e-07, + "loss": 0.0208, + "step": 31438 + }, + { + "epoch": 3.728092019447409, + "grad_norm": 0.497110646592538, + "learning_rate": 6.034431303952092e-07, + "loss": 0.0239, + "step": 31439 + }, + { + "epoch": 3.728210601209534, + "grad_norm": 0.8585854730096437, + "learning_rate": 6.029190355178488e-07, + "loss": 0.0364, + "step": 31440 + }, + { + "epoch": 3.7283291829716587, + "grad_norm": 0.6849722370448874, + "learning_rate": 6.023951655516613e-07, + "loss": 0.0312, + "step": 31441 + }, + { + "epoch": 3.728447764733784, + "grad_norm": 0.5589287010895808, + "learning_rate": 6.01871520501468e-07, + "loss": 0.0323, + "step": 31442 + }, + { + "epoch": 3.728566346495909, + "grad_norm": 0.536722028292015, + "learning_rate": 6.013481003721011e-07, + "loss": 0.0273, + "step": 31443 + }, + { + "epoch": 3.728684928258034, + "grad_norm": 0.42959275159833354, + "learning_rate": 6.008249051683845e-07, + "loss": 0.0242, + "step": 31444 + }, + { + "epoch": 3.7288035100201586, + "grad_norm": 0.4960768241895897, + "learning_rate": 6.00301934895145e-07, + "loss": 0.0216, + "step": 31445 + }, + { + "epoch": 3.728922091782284, + "grad_norm": 0.5007720737002487, + "learning_rate": 5.997791895571953e-07, + "loss": 0.0269, + "step": 31446 + }, + { + "epoch": 3.729040673544409, + "grad_norm": 0.3013241271665834, + "learning_rate": 5.992566691593649e-07, + "loss": 0.0122, + "step": 31447 + }, + { + "epoch": 3.729159255306534, + "grad_norm": 0.34579706087108675, + "learning_rate": 5.987343737064637e-07, + "loss": 0.0148, + "step": 31448 + }, + { + "epoch": 3.7292778370686586, + "grad_norm": 0.5977513774873052, + "learning_rate": 5.982123032033077e-07, + "loss": 0.0284, + "step": 31449 + }, + { + "epoch": 3.7293964188307838, + "grad_norm": 0.516296902625843, + "learning_rate": 5.976904576547121e-07, + "loss": 0.0217, + "step": 31450 + }, + { + "epoch": 3.729515000592909, + "grad_norm": 0.30606002770747226, + "learning_rate": 5.971688370654871e-07, + "loss": 0.0139, + "step": 31451 + }, + { + "epoch": 3.7296335823550337, + "grad_norm": 0.4207631534662485, + "learning_rate": 5.966474414404427e-07, + "loss": 0.0172, + "step": 31452 + }, + { + "epoch": 3.729752164117159, + "grad_norm": 0.3459479877251247, + "learning_rate": 5.961262707843807e-07, + "loss": 0.0205, + "step": 31453 + }, + { + "epoch": 3.7298707458792837, + "grad_norm": 0.5761065366959414, + "learning_rate": 5.956053251021082e-07, + "loss": 0.0294, + "step": 31454 + }, + { + "epoch": 3.729989327641409, + "grad_norm": 0.6023278437147181, + "learning_rate": 5.95084604398427e-07, + "loss": 0.0249, + "step": 31455 + }, + { + "epoch": 3.7301079094035337, + "grad_norm": 0.5336093342866962, + "learning_rate": 5.945641086781417e-07, + "loss": 0.0154, + "step": 31456 + }, + { + "epoch": 3.730226491165659, + "grad_norm": 0.47192674303915694, + "learning_rate": 5.940438379460455e-07, + "loss": 0.0239, + "step": 31457 + }, + { + "epoch": 3.7303450729277836, + "grad_norm": 0.4894764535089744, + "learning_rate": 5.935237922069376e-07, + "loss": 0.0284, + "step": 31458 + }, + { + "epoch": 3.730463654689909, + "grad_norm": 0.35834282310854104, + "learning_rate": 5.930039714656083e-07, + "loss": 0.0168, + "step": 31459 + }, + { + "epoch": 3.7305822364520336, + "grad_norm": 0.4640675569926395, + "learning_rate": 5.924843757268539e-07, + "loss": 0.0204, + "step": 31460 + }, + { + "epoch": 3.7307008182141588, + "grad_norm": 0.6070660389321221, + "learning_rate": 5.91965004995465e-07, + "loss": 0.0415, + "step": 31461 + }, + { + "epoch": 3.7308193999762835, + "grad_norm": 0.8688516482562573, + "learning_rate": 5.914458592762267e-07, + "loss": 0.0502, + "step": 31462 + }, + { + "epoch": 3.7309379817384087, + "grad_norm": 0.8914412314096773, + "learning_rate": 5.909269385739269e-07, + "loss": 0.0454, + "step": 31463 + }, + { + "epoch": 3.7310565635005335, + "grad_norm": 0.42286283905362515, + "learning_rate": 5.904082428933449e-07, + "loss": 0.0204, + "step": 31464 + }, + { + "epoch": 3.7311751452626587, + "grad_norm": 0.6338956126417437, + "learning_rate": 5.898897722392688e-07, + "loss": 0.0281, + "step": 31465 + }, + { + "epoch": 3.7312937270247835, + "grad_norm": 0.5526830704022726, + "learning_rate": 5.893715266164751e-07, + "loss": 0.0259, + "step": 31466 + }, + { + "epoch": 3.7314123087869087, + "grad_norm": 0.6311611317554382, + "learning_rate": 5.888535060297407e-07, + "loss": 0.015, + "step": 31467 + }, + { + "epoch": 3.7315308905490334, + "grad_norm": 0.8327148140131049, + "learning_rate": 5.883357104838421e-07, + "loss": 0.0383, + "step": 31468 + }, + { + "epoch": 3.7316494723111586, + "grad_norm": 0.5268788923428607, + "learning_rate": 5.878181399835536e-07, + "loss": 0.0206, + "step": 31469 + }, + { + "epoch": 3.7317680540732834, + "grad_norm": 0.5077923292321295, + "learning_rate": 5.873007945336433e-07, + "loss": 0.0225, + "step": 31470 + }, + { + "epoch": 3.7318866358354086, + "grad_norm": 1.0298646092415453, + "learning_rate": 5.867836741388855e-07, + "loss": 0.032, + "step": 31471 + }, + { + "epoch": 3.7320052175975333, + "grad_norm": 0.6096806292414664, + "learning_rate": 5.862667788040455e-07, + "loss": 0.0254, + "step": 31472 + }, + { + "epoch": 3.7321237993596585, + "grad_norm": 0.4696604652012172, + "learning_rate": 5.857501085338834e-07, + "loss": 0.0204, + "step": 31473 + }, + { + "epoch": 3.7322423811217833, + "grad_norm": 0.39248808214395603, + "learning_rate": 5.852336633331706e-07, + "loss": 0.0169, + "step": 31474 + }, + { + "epoch": 3.7323609628839085, + "grad_norm": 0.8215840780245833, + "learning_rate": 5.847174432066616e-07, + "loss": 0.0443, + "step": 31475 + }, + { + "epoch": 3.7324795446460337, + "grad_norm": 0.6073539371029008, + "learning_rate": 5.842014481591191e-07, + "loss": 0.0332, + "step": 31476 + }, + { + "epoch": 3.7325981264081585, + "grad_norm": 0.3426013115458381, + "learning_rate": 5.836856781952976e-07, + "loss": 0.0152, + "step": 31477 + }, + { + "epoch": 3.7327167081702832, + "grad_norm": 0.7746657960468412, + "learning_rate": 5.831701333199574e-07, + "loss": 0.0342, + "step": 31478 + }, + { + "epoch": 3.7328352899324084, + "grad_norm": 0.7104021609688193, + "learning_rate": 5.826548135378418e-07, + "loss": 0.0217, + "step": 31479 + }, + { + "epoch": 3.7329538716945336, + "grad_norm": 0.7967910067788115, + "learning_rate": 5.821397188537053e-07, + "loss": 0.0301, + "step": 31480 + }, + { + "epoch": 3.7330724534566584, + "grad_norm": 0.570837678490679, + "learning_rate": 5.816248492722998e-07, + "loss": 0.0225, + "step": 31481 + }, + { + "epoch": 3.733191035218783, + "grad_norm": 0.3397118236136237, + "learning_rate": 5.811102047983686e-07, + "loss": 0.0184, + "step": 31482 + }, + { + "epoch": 3.7333096169809084, + "grad_norm": 0.7507934857609583, + "learning_rate": 5.805957854366606e-07, + "loss": 0.0315, + "step": 31483 + }, + { + "epoch": 3.7334281987430336, + "grad_norm": 0.2890883767510263, + "learning_rate": 5.800815911919083e-07, + "loss": 0.0107, + "step": 31484 + }, + { + "epoch": 3.7335467805051583, + "grad_norm": 0.5835256283058244, + "learning_rate": 5.795676220688634e-07, + "loss": 0.0221, + "step": 31485 + }, + { + "epoch": 3.733665362267283, + "grad_norm": 0.5763885149624606, + "learning_rate": 5.790538780722554e-07, + "loss": 0.0227, + "step": 31486 + }, + { + "epoch": 3.7337839440294083, + "grad_norm": 0.7138293291891239, + "learning_rate": 5.785403592068223e-07, + "loss": 0.0329, + "step": 31487 + }, + { + "epoch": 3.7339025257915335, + "grad_norm": 0.7378542947472001, + "learning_rate": 5.780270654773018e-07, + "loss": 0.0429, + "step": 31488 + }, + { + "epoch": 3.7340211075536582, + "grad_norm": 0.5580674052540348, + "learning_rate": 5.775139968884236e-07, + "loss": 0.0235, + "step": 31489 + }, + { + "epoch": 3.734139689315783, + "grad_norm": 0.7748729777984324, + "learning_rate": 5.770011534449199e-07, + "loss": 0.0296, + "step": 31490 + }, + { + "epoch": 3.734258271077908, + "grad_norm": 0.45729090254259935, + "learning_rate": 5.764885351515092e-07, + "loss": 0.0251, + "step": 31491 + }, + { + "epoch": 3.7343768528400334, + "grad_norm": 0.6207981888617391, + "learning_rate": 5.759761420129322e-07, + "loss": 0.0338, + "step": 31492 + }, + { + "epoch": 3.734495434602158, + "grad_norm": 0.5146157873188513, + "learning_rate": 5.754639740338991e-07, + "loss": 0.0207, + "step": 31493 + }, + { + "epoch": 3.734614016364283, + "grad_norm": 0.27830462933701045, + "learning_rate": 5.74952031219142e-07, + "loss": 0.0114, + "step": 31494 + }, + { + "epoch": 3.734732598126408, + "grad_norm": 0.7748754125224554, + "learning_rate": 5.744403135733683e-07, + "loss": 0.0374, + "step": 31495 + }, + { + "epoch": 3.7348511798885333, + "grad_norm": 0.4310123175666704, + "learning_rate": 5.739288211013078e-07, + "loss": 0.025, + "step": 31496 + }, + { + "epoch": 3.734969761650658, + "grad_norm": 0.5243212560672253, + "learning_rate": 5.734175538076674e-07, + "loss": 0.0204, + "step": 31497 + }, + { + "epoch": 3.735088343412783, + "grad_norm": 0.618937236501607, + "learning_rate": 5.729065116971632e-07, + "loss": 0.0356, + "step": 31498 + }, + { + "epoch": 3.735206925174908, + "grad_norm": 0.6604166760159736, + "learning_rate": 5.723956947745079e-07, + "loss": 0.025, + "step": 31499 + }, + { + "epoch": 3.7353255069370332, + "grad_norm": 0.5619970930627122, + "learning_rate": 5.718851030444089e-07, + "loss": 0.0265, + "step": 31500 + }, + { + "epoch": 3.735444088699158, + "grad_norm": 0.44098187680912293, + "learning_rate": 5.713747365115762e-07, + "loss": 0.0175, + "step": 31501 + }, + { + "epoch": 3.735562670461283, + "grad_norm": 0.6555850436624612, + "learning_rate": 5.708645951807062e-07, + "loss": 0.0288, + "step": 31502 + }, + { + "epoch": 3.735681252223408, + "grad_norm": 0.36825361584651944, + "learning_rate": 5.703546790565117e-07, + "loss": 0.0153, + "step": 31503 + }, + { + "epoch": 3.735799833985533, + "grad_norm": 0.36321891532488887, + "learning_rate": 5.69844988143689e-07, + "loss": 0.0187, + "step": 31504 + }, + { + "epoch": 3.735918415747658, + "grad_norm": 0.4329621107298743, + "learning_rate": 5.693355224469371e-07, + "loss": 0.02, + "step": 31505 + }, + { + "epoch": 3.736036997509783, + "grad_norm": 0.5591615756566501, + "learning_rate": 5.688262819709494e-07, + "loss": 0.026, + "step": 31506 + }, + { + "epoch": 3.736155579271908, + "grad_norm": 0.40689796027593433, + "learning_rate": 5.683172667204306e-07, + "loss": 0.0166, + "step": 31507 + }, + { + "epoch": 3.736274161034033, + "grad_norm": 0.45304236148754695, + "learning_rate": 5.678084767000602e-07, + "loss": 0.0141, + "step": 31508 + }, + { + "epoch": 3.736392742796158, + "grad_norm": 0.6346176958630031, + "learning_rate": 5.672999119145373e-07, + "loss": 0.0287, + "step": 31509 + }, + { + "epoch": 3.736511324558283, + "grad_norm": 0.4390888766204307, + "learning_rate": 5.667915723685496e-07, + "loss": 0.0183, + "step": 31510 + }, + { + "epoch": 3.736629906320408, + "grad_norm": 0.41632082488531663, + "learning_rate": 5.662834580667798e-07, + "loss": 0.0114, + "step": 31511 + }, + { + "epoch": 3.736748488082533, + "grad_norm": 0.38291472906687224, + "learning_rate": 5.657755690139155e-07, + "loss": 0.0168, + "step": 31512 + }, + { + "epoch": 3.7368670698446578, + "grad_norm": 0.740850647627758, + "learning_rate": 5.652679052146365e-07, + "loss": 0.0401, + "step": 31513 + }, + { + "epoch": 3.736985651606783, + "grad_norm": 0.577052064806342, + "learning_rate": 5.647604666736223e-07, + "loss": 0.0199, + "step": 31514 + }, + { + "epoch": 3.7371042333689077, + "grad_norm": 0.3860605884489728, + "learning_rate": 5.642532533955525e-07, + "loss": 0.0174, + "step": 31515 + }, + { + "epoch": 3.737222815131033, + "grad_norm": 0.5737617062784284, + "learning_rate": 5.637462653851039e-07, + "loss": 0.0189, + "step": 31516 + }, + { + "epoch": 3.7373413968931577, + "grad_norm": 0.8299250246168696, + "learning_rate": 5.63239502646945e-07, + "loss": 0.0426, + "step": 31517 + }, + { + "epoch": 3.737459978655283, + "grad_norm": 0.5031544166199313, + "learning_rate": 5.627329651857582e-07, + "loss": 0.0193, + "step": 31518 + }, + { + "epoch": 3.7375785604174077, + "grad_norm": 0.4917967379131274, + "learning_rate": 5.622266530062009e-07, + "loss": 0.0189, + "step": 31519 + }, + { + "epoch": 3.737697142179533, + "grad_norm": 0.6811120184039263, + "learning_rate": 5.617205661129471e-07, + "loss": 0.0418, + "step": 31520 + }, + { + "epoch": 3.7378157239416576, + "grad_norm": 0.3960129895445664, + "learning_rate": 5.612147045106625e-07, + "loss": 0.0149, + "step": 31521 + }, + { + "epoch": 3.737934305703783, + "grad_norm": 0.4838510591292708, + "learning_rate": 5.607090682040101e-07, + "loss": 0.0215, + "step": 31522 + }, + { + "epoch": 3.7380528874659076, + "grad_norm": 0.3324611371208113, + "learning_rate": 5.602036571976527e-07, + "loss": 0.0143, + "step": 31523 + }, + { + "epoch": 3.738171469228033, + "grad_norm": 0.5959332027270696, + "learning_rate": 5.59698471496245e-07, + "loss": 0.0294, + "step": 31524 + }, + { + "epoch": 3.738290050990158, + "grad_norm": 0.5713800928779742, + "learning_rate": 5.591935111044472e-07, + "loss": 0.0208, + "step": 31525 + }, + { + "epoch": 3.7384086327522827, + "grad_norm": 0.6184715950082924, + "learning_rate": 5.586887760269111e-07, + "loss": 0.0267, + "step": 31526 + }, + { + "epoch": 3.7385272145144075, + "grad_norm": 0.419264645703571, + "learning_rate": 5.581842662682967e-07, + "loss": 0.0181, + "step": 31527 + }, + { + "epoch": 3.7386457962765327, + "grad_norm": 0.5106595437404775, + "learning_rate": 5.576799818332506e-07, + "loss": 0.0301, + "step": 31528 + }, + { + "epoch": 3.738764378038658, + "grad_norm": 0.5676361068728127, + "learning_rate": 5.571759227264189e-07, + "loss": 0.0313, + "step": 31529 + }, + { + "epoch": 3.7388829598007827, + "grad_norm": 0.5946622841980573, + "learning_rate": 5.566720889524535e-07, + "loss": 0.0301, + "step": 31530 + }, + { + "epoch": 3.7390015415629074, + "grad_norm": 0.7778878705348743, + "learning_rate": 5.56168480515995e-07, + "loss": 0.0391, + "step": 31531 + }, + { + "epoch": 3.7391201233250326, + "grad_norm": 0.5988012104987436, + "learning_rate": 5.556650974216898e-07, + "loss": 0.0259, + "step": 31532 + }, + { + "epoch": 3.739238705087158, + "grad_norm": 0.6353808171334769, + "learning_rate": 5.551619396741759e-07, + "loss": 0.0258, + "step": 31533 + }, + { + "epoch": 3.7393572868492826, + "grad_norm": 0.606604320119029, + "learning_rate": 5.546590072780966e-07, + "loss": 0.023, + "step": 31534 + }, + { + "epoch": 3.7394758686114073, + "grad_norm": 0.5227736151287294, + "learning_rate": 5.541563002380789e-07, + "loss": 0.0206, + "step": 31535 + }, + { + "epoch": 3.7395944503735326, + "grad_norm": 0.27605836590731536, + "learning_rate": 5.536538185587664e-07, + "loss": 0.0149, + "step": 31536 + }, + { + "epoch": 3.7397130321356578, + "grad_norm": 0.8805123845450928, + "learning_rate": 5.53151562244783e-07, + "loss": 0.0265, + "step": 31537 + }, + { + "epoch": 3.7398316138977825, + "grad_norm": 0.6737649158559315, + "learning_rate": 5.526495313007668e-07, + "loss": 0.0295, + "step": 31538 + }, + { + "epoch": 3.7399501956599073, + "grad_norm": 0.2378982837716851, + "learning_rate": 5.521477257313445e-07, + "loss": 0.0145, + "step": 31539 + }, + { + "epoch": 3.7400687774220325, + "grad_norm": 0.7313377753511603, + "learning_rate": 5.516461455411348e-07, + "loss": 0.0339, + "step": 31540 + }, + { + "epoch": 3.7401873591841577, + "grad_norm": 0.4814135573035987, + "learning_rate": 5.511447907347728e-07, + "loss": 0.0261, + "step": 31541 + }, + { + "epoch": 3.7403059409462824, + "grad_norm": 0.5515139010441078, + "learning_rate": 5.506436613168714e-07, + "loss": 0.0191, + "step": 31542 + }, + { + "epoch": 3.740424522708407, + "grad_norm": 0.5751153899181652, + "learning_rate": 5.501427572920575e-07, + "loss": 0.0195, + "step": 31543 + }, + { + "epoch": 3.7405431044705324, + "grad_norm": 0.30632791826887107, + "learning_rate": 5.496420786649387e-07, + "loss": 0.0135, + "step": 31544 + }, + { + "epoch": 3.7406616862326576, + "grad_norm": 0.46903229196870166, + "learning_rate": 5.491416254401416e-07, + "loss": 0.0131, + "step": 31545 + }, + { + "epoch": 3.7407802679947824, + "grad_norm": 0.5054930357110051, + "learning_rate": 5.486413976222738e-07, + "loss": 0.0205, + "step": 31546 + }, + { + "epoch": 3.740898849756907, + "grad_norm": 0.7120651556973215, + "learning_rate": 5.481413952159481e-07, + "loss": 0.0231, + "step": 31547 + }, + { + "epoch": 3.7410174315190323, + "grad_norm": 0.615101566179651, + "learning_rate": 5.476416182257749e-07, + "loss": 0.0206, + "step": 31548 + }, + { + "epoch": 3.7411360132811575, + "grad_norm": 0.5430645051299532, + "learning_rate": 5.471420666563587e-07, + "loss": 0.0203, + "step": 31549 + }, + { + "epoch": 3.7412545950432823, + "grad_norm": 0.4245241569564131, + "learning_rate": 5.46642740512307e-07, + "loss": 0.0218, + "step": 31550 + }, + { + "epoch": 3.7413731768054075, + "grad_norm": 0.7586551139498764, + "learning_rate": 5.461436397982217e-07, + "loss": 0.0321, + "step": 31551 + }, + { + "epoch": 3.7414917585675322, + "grad_norm": 0.42813806602266763, + "learning_rate": 5.456447645187074e-07, + "loss": 0.0172, + "step": 31552 + }, + { + "epoch": 3.7416103403296574, + "grad_norm": 0.7195115403140351, + "learning_rate": 5.451461146783576e-07, + "loss": 0.0365, + "step": 31553 + }, + { + "epoch": 3.741728922091782, + "grad_norm": 0.49997526136269177, + "learning_rate": 5.446476902817771e-07, + "loss": 0.0253, + "step": 31554 + }, + { + "epoch": 3.7418475038539074, + "grad_norm": 0.5000791684606295, + "learning_rate": 5.441494913335482e-07, + "loss": 0.0254, + "step": 31555 + }, + { + "epoch": 3.741966085616032, + "grad_norm": 0.3653947890100805, + "learning_rate": 5.436515178382784e-07, + "loss": 0.0175, + "step": 31556 + }, + { + "epoch": 3.7420846673781574, + "grad_norm": 0.4324961306526892, + "learning_rate": 5.431537698005473e-07, + "loss": 0.0183, + "step": 31557 + }, + { + "epoch": 3.742203249140282, + "grad_norm": 0.4105758297836235, + "learning_rate": 5.426562472249486e-07, + "loss": 0.0115, + "step": 31558 + }, + { + "epoch": 3.7423218309024073, + "grad_norm": 0.32209809282732366, + "learning_rate": 5.421589501160645e-07, + "loss": 0.0162, + "step": 31559 + }, + { + "epoch": 3.742440412664532, + "grad_norm": 0.6315835473029403, + "learning_rate": 5.416618784784861e-07, + "loss": 0.0289, + "step": 31560 + }, + { + "epoch": 3.7425589944266573, + "grad_norm": 0.813416570379004, + "learning_rate": 5.411650323167928e-07, + "loss": 0.0345, + "step": 31561 + }, + { + "epoch": 3.742677576188782, + "grad_norm": 0.8070267303684362, + "learning_rate": 5.406684116355615e-07, + "loss": 0.032, + "step": 31562 + }, + { + "epoch": 3.7427961579509073, + "grad_norm": 0.3807986555955308, + "learning_rate": 5.401720164393775e-07, + "loss": 0.0158, + "step": 31563 + }, + { + "epoch": 3.742914739713032, + "grad_norm": 0.5372525208034847, + "learning_rate": 5.396758467328094e-07, + "loss": 0.0243, + "step": 31564 + }, + { + "epoch": 3.743033321475157, + "grad_norm": 0.3039659168999697, + "learning_rate": 5.391799025204397e-07, + "loss": 0.0127, + "step": 31565 + }, + { + "epoch": 3.743151903237282, + "grad_norm": 0.5534494158649287, + "learning_rate": 5.386841838068285e-07, + "loss": 0.0231, + "step": 31566 + }, + { + "epoch": 3.743270484999407, + "grad_norm": 0.46830001360041285, + "learning_rate": 5.381886905965583e-07, + "loss": 0.0228, + "step": 31567 + }, + { + "epoch": 3.743389066761532, + "grad_norm": 0.4362841467128925, + "learning_rate": 5.376934228941893e-07, + "loss": 0.0175, + "step": 31568 + }, + { + "epoch": 3.743507648523657, + "grad_norm": 0.6780097097616146, + "learning_rate": 5.371983807042874e-07, + "loss": 0.0291, + "step": 31569 + }, + { + "epoch": 3.743626230285782, + "grad_norm": 0.3785742190949289, + "learning_rate": 5.367035640314184e-07, + "loss": 0.0156, + "step": 31570 + }, + { + "epoch": 3.743744812047907, + "grad_norm": 0.6405253937163138, + "learning_rate": 5.36208972880145e-07, + "loss": 0.0263, + "step": 31571 + }, + { + "epoch": 3.743863393810032, + "grad_norm": 0.5867151462146843, + "learning_rate": 5.357146072550279e-07, + "loss": 0.0252, + "step": 31572 + }, + { + "epoch": 3.743981975572157, + "grad_norm": 0.385426635669653, + "learning_rate": 5.352204671606187e-07, + "loss": 0.0181, + "step": 31573 + }, + { + "epoch": 3.7441005573342823, + "grad_norm": 0.2564962723553276, + "learning_rate": 5.347265526014778e-07, + "loss": 0.009, + "step": 31574 + }, + { + "epoch": 3.744219139096407, + "grad_norm": 0.48813682582222784, + "learning_rate": 5.342328635821542e-07, + "loss": 0.024, + "step": 31575 + }, + { + "epoch": 3.744337720858532, + "grad_norm": 0.404414167348229, + "learning_rate": 5.337394001072054e-07, + "loss": 0.022, + "step": 31576 + }, + { + "epoch": 3.744456302620657, + "grad_norm": 0.8571883505744096, + "learning_rate": 5.332461621811724e-07, + "loss": 0.0412, + "step": 31577 + }, + { + "epoch": 3.744574884382782, + "grad_norm": 0.658895478299704, + "learning_rate": 5.327531498086125e-07, + "loss": 0.0297, + "step": 31578 + }, + { + "epoch": 3.744693466144907, + "grad_norm": 0.5153029544681773, + "learning_rate": 5.322603629940609e-07, + "loss": 0.0192, + "step": 31579 + }, + { + "epoch": 3.7448120479070317, + "grad_norm": 0.7047118290632046, + "learning_rate": 5.317678017420641e-07, + "loss": 0.031, + "step": 31580 + }, + { + "epoch": 3.744930629669157, + "grad_norm": 0.6006722439848856, + "learning_rate": 5.312754660571628e-07, + "loss": 0.0344, + "step": 31581 + }, + { + "epoch": 3.745049211431282, + "grad_norm": 0.6503915420269802, + "learning_rate": 5.30783355943898e-07, + "loss": 0.032, + "step": 31582 + }, + { + "epoch": 3.745167793193407, + "grad_norm": 0.5000719867171144, + "learning_rate": 5.302914714068075e-07, + "loss": 0.0161, + "step": 31583 + }, + { + "epoch": 3.7452863749555316, + "grad_norm": 0.5848523436902527, + "learning_rate": 5.297998124504183e-07, + "loss": 0.0367, + "step": 31584 + }, + { + "epoch": 3.745404956717657, + "grad_norm": 0.4223089900075269, + "learning_rate": 5.293083790792686e-07, + "loss": 0.0186, + "step": 31585 + }, + { + "epoch": 3.745523538479782, + "grad_norm": 0.2010980504276291, + "learning_rate": 5.288171712978879e-07, + "loss": 0.008, + "step": 31586 + }, + { + "epoch": 3.745642120241907, + "grad_norm": 0.38891899395278595, + "learning_rate": 5.283261891108033e-07, + "loss": 0.0161, + "step": 31587 + }, + { + "epoch": 3.7457607020040316, + "grad_norm": 0.6965095341724485, + "learning_rate": 5.278354325225416e-07, + "loss": 0.0377, + "step": 31588 + }, + { + "epoch": 3.7458792837661568, + "grad_norm": 0.5431714863680627, + "learning_rate": 5.273449015376298e-07, + "loss": 0.0263, + "step": 31589 + }, + { + "epoch": 3.745997865528282, + "grad_norm": 0.6112742294254316, + "learning_rate": 5.268545961605864e-07, + "loss": 0.0294, + "step": 31590 + }, + { + "epoch": 3.7461164472904067, + "grad_norm": 0.3263440128765745, + "learning_rate": 5.263645163959329e-07, + "loss": 0.0156, + "step": 31591 + }, + { + "epoch": 3.7462350290525315, + "grad_norm": 0.4605713778257016, + "learning_rate": 5.258746622481881e-07, + "loss": 0.0272, + "step": 31592 + }, + { + "epoch": 3.7463536108146567, + "grad_norm": 0.45786377737166073, + "learning_rate": 5.253850337218647e-07, + "loss": 0.0242, + "step": 31593 + }, + { + "epoch": 3.746472192576782, + "grad_norm": 0.7252559250655347, + "learning_rate": 5.248956308214814e-07, + "loss": 0.0222, + "step": 31594 + }, + { + "epoch": 3.7465907743389066, + "grad_norm": 0.43223946564400245, + "learning_rate": 5.244064535515458e-07, + "loss": 0.0215, + "step": 31595 + }, + { + "epoch": 3.7467093561010314, + "grad_norm": 0.9664198126796915, + "learning_rate": 5.239175019165681e-07, + "loss": 0.0455, + "step": 31596 + }, + { + "epoch": 3.7468279378631566, + "grad_norm": 0.4192504511057313, + "learning_rate": 5.23428775921056e-07, + "loss": 0.0226, + "step": 31597 + }, + { + "epoch": 3.746946519625282, + "grad_norm": 0.6997482638035774, + "learning_rate": 5.229402755695139e-07, + "loss": 0.0275, + "step": 31598 + }, + { + "epoch": 3.7470651013874066, + "grad_norm": 0.40792202055139887, + "learning_rate": 5.224520008664524e-07, + "loss": 0.0175, + "step": 31599 + }, + { + "epoch": 3.7471836831495318, + "grad_norm": 0.6815667211971167, + "learning_rate": 5.219639518163594e-07, + "loss": 0.0329, + "step": 31600 + }, + { + "epoch": 3.7473022649116565, + "grad_norm": 0.7000309647538855, + "learning_rate": 5.214761284237479e-07, + "loss": 0.0412, + "step": 31601 + }, + { + "epoch": 3.7474208466737817, + "grad_norm": 0.6446560738421567, + "learning_rate": 5.209885306931062e-07, + "loss": 0.0366, + "step": 31602 + }, + { + "epoch": 3.7475394284359065, + "grad_norm": 0.4550384891370782, + "learning_rate": 5.205011586289332e-07, + "loss": 0.0195, + "step": 31603 + }, + { + "epoch": 3.7476580101980317, + "grad_norm": 0.4658579941979995, + "learning_rate": 5.2001401223572e-07, + "loss": 0.0255, + "step": 31604 + }, + { + "epoch": 3.7477765919601564, + "grad_norm": 0.5352945541680737, + "learning_rate": 5.195270915179601e-07, + "loss": 0.0217, + "step": 31605 + }, + { + "epoch": 3.7478951737222816, + "grad_norm": 0.47047377213197084, + "learning_rate": 5.190403964801389e-07, + "loss": 0.0215, + "step": 31606 + }, + { + "epoch": 3.7480137554844064, + "grad_norm": 0.816826380375169, + "learning_rate": 5.185539271267442e-07, + "loss": 0.0384, + "step": 31607 + }, + { + "epoch": 3.7481323372465316, + "grad_norm": 0.337734194411855, + "learning_rate": 5.180676834622616e-07, + "loss": 0.0129, + "step": 31608 + }, + { + "epoch": 3.7482509190086564, + "grad_norm": 0.5704262927450912, + "learning_rate": 5.175816654911736e-07, + "loss": 0.037, + "step": 31609 + }, + { + "epoch": 3.7483695007707816, + "grad_norm": 0.5838178053849454, + "learning_rate": 5.170958732179626e-07, + "loss": 0.0335, + "step": 31610 + }, + { + "epoch": 3.7484880825329063, + "grad_norm": 0.6343378297830077, + "learning_rate": 5.166103066470973e-07, + "loss": 0.0356, + "step": 31611 + }, + { + "epoch": 3.7486066642950315, + "grad_norm": 0.8170403446994265, + "learning_rate": 5.161249657830686e-07, + "loss": 0.0447, + "step": 31612 + }, + { + "epoch": 3.7487252460571563, + "grad_norm": 0.491156401824117, + "learning_rate": 5.156398506303422e-07, + "loss": 0.032, + "step": 31613 + }, + { + "epoch": 3.7488438278192815, + "grad_norm": 0.5235610701052105, + "learning_rate": 5.151549611933926e-07, + "loss": 0.0198, + "step": 31614 + }, + { + "epoch": 3.7489624095814063, + "grad_norm": 0.43881425568317795, + "learning_rate": 5.146702974766854e-07, + "loss": 0.0203, + "step": 31615 + }, + { + "epoch": 3.7490809913435315, + "grad_norm": 0.6023757020952663, + "learning_rate": 5.141858594846948e-07, + "loss": 0.0319, + "step": 31616 + }, + { + "epoch": 3.749199573105656, + "grad_norm": 0.34023296115412227, + "learning_rate": 5.137016472218842e-07, + "loss": 0.0165, + "step": 31617 + }, + { + "epoch": 3.7493181548677814, + "grad_norm": 0.5631174764405107, + "learning_rate": 5.132176606927136e-07, + "loss": 0.0247, + "step": 31618 + }, + { + "epoch": 3.749436736629906, + "grad_norm": 0.6656734066167209, + "learning_rate": 5.127338999016518e-07, + "loss": 0.0182, + "step": 31619 + }, + { + "epoch": 3.7495553183920314, + "grad_norm": 0.9295112650777091, + "learning_rate": 5.122503648531535e-07, + "loss": 0.0402, + "step": 31620 + }, + { + "epoch": 3.749673900154156, + "grad_norm": 0.6492115131825976, + "learning_rate": 5.117670555516818e-07, + "loss": 0.0315, + "step": 31621 + }, + { + "epoch": 3.7497924819162813, + "grad_norm": 0.4351318477102644, + "learning_rate": 5.112839720016832e-07, + "loss": 0.0179, + "step": 31622 + }, + { + "epoch": 3.749911063678406, + "grad_norm": 0.8505811501593521, + "learning_rate": 5.108011142076208e-07, + "loss": 0.0379, + "step": 31623 + }, + { + "epoch": 3.7500296454405313, + "grad_norm": 0.657629799720036, + "learning_rate": 5.103184821739381e-07, + "loss": 0.0257, + "step": 31624 + }, + { + "epoch": 3.750148227202656, + "grad_norm": 0.49311246735277825, + "learning_rate": 5.09836075905093e-07, + "loss": 0.0168, + "step": 31625 + }, + { + "epoch": 3.7502668089647813, + "grad_norm": 0.5639652320863031, + "learning_rate": 5.093538954055205e-07, + "loss": 0.0216, + "step": 31626 + }, + { + "epoch": 3.7503853907269065, + "grad_norm": 0.6814221086712895, + "learning_rate": 5.088719406796783e-07, + "loss": 0.0337, + "step": 31627 + }, + { + "epoch": 3.7505039724890312, + "grad_norm": 0.4737679406000582, + "learning_rate": 5.083902117320017e-07, + "loss": 0.0198, + "step": 31628 + }, + { + "epoch": 3.750622554251156, + "grad_norm": 0.8968102630776643, + "learning_rate": 5.079087085669315e-07, + "loss": 0.0499, + "step": 31629 + }, + { + "epoch": 3.750741136013281, + "grad_norm": 0.41362299315434276, + "learning_rate": 5.074274311889115e-07, + "loss": 0.0217, + "step": 31630 + }, + { + "epoch": 3.7508597177754064, + "grad_norm": 0.29809951598557405, + "learning_rate": 5.069463796023743e-07, + "loss": 0.0129, + "step": 31631 + }, + { + "epoch": 3.750978299537531, + "grad_norm": 0.5345693668673022, + "learning_rate": 5.06465553811758e-07, + "loss": 0.0222, + "step": 31632 + }, + { + "epoch": 3.751096881299656, + "grad_norm": 0.5178120472174312, + "learning_rate": 5.059849538214895e-07, + "loss": 0.0261, + "step": 31633 + }, + { + "epoch": 3.751215463061781, + "grad_norm": 0.5203481818980756, + "learning_rate": 5.055045796360069e-07, + "loss": 0.0253, + "step": 31634 + }, + { + "epoch": 3.7513340448239063, + "grad_norm": 0.627497294896029, + "learning_rate": 5.050244312597346e-07, + "loss": 0.0216, + "step": 31635 + }, + { + "epoch": 3.751452626586031, + "grad_norm": 0.6223270208390973, + "learning_rate": 5.045445086970995e-07, + "loss": 0.0336, + "step": 31636 + }, + { + "epoch": 3.751571208348156, + "grad_norm": 0.5976397902203945, + "learning_rate": 5.040648119525232e-07, + "loss": 0.0266, + "step": 31637 + }, + { + "epoch": 3.751689790110281, + "grad_norm": 0.5398806043435811, + "learning_rate": 5.035853410304325e-07, + "loss": 0.0198, + "step": 31638 + }, + { + "epoch": 3.7518083718724062, + "grad_norm": 0.5438211441015067, + "learning_rate": 5.031060959352462e-07, + "loss": 0.0213, + "step": 31639 + }, + { + "epoch": 3.751926953634531, + "grad_norm": 0.49379382535375443, + "learning_rate": 5.026270766713804e-07, + "loss": 0.0208, + "step": 31640 + }, + { + "epoch": 3.7520455353966558, + "grad_norm": 0.5720836681487873, + "learning_rate": 5.021482832432505e-07, + "loss": 0.0362, + "step": 31641 + }, + { + "epoch": 3.752164117158781, + "grad_norm": 0.4071669848784149, + "learning_rate": 5.016697156552757e-07, + "loss": 0.0228, + "step": 31642 + }, + { + "epoch": 3.752282698920906, + "grad_norm": 0.6087033928299371, + "learning_rate": 5.011913739118635e-07, + "loss": 0.0308, + "step": 31643 + }, + { + "epoch": 3.752401280683031, + "grad_norm": 0.43082347167549945, + "learning_rate": 5.007132580174239e-07, + "loss": 0.022, + "step": 31644 + }, + { + "epoch": 3.7525198624451557, + "grad_norm": 0.247068632318401, + "learning_rate": 5.00235367976365e-07, + "loss": 0.0099, + "step": 31645 + }, + { + "epoch": 3.752638444207281, + "grad_norm": 0.4031183768382651, + "learning_rate": 4.997577037930912e-07, + "loss": 0.0139, + "step": 31646 + }, + { + "epoch": 3.752757025969406, + "grad_norm": 0.6295244318593425, + "learning_rate": 4.992802654720074e-07, + "loss": 0.0323, + "step": 31647 + }, + { + "epoch": 3.752875607731531, + "grad_norm": 0.4699184234835197, + "learning_rate": 4.988030530175159e-07, + "loss": 0.0174, + "step": 31648 + }, + { + "epoch": 3.752994189493656, + "grad_norm": 0.5133220652081618, + "learning_rate": 4.983260664340128e-07, + "loss": 0.0237, + "step": 31649 + }, + { + "epoch": 3.753112771255781, + "grad_norm": 0.4954228658356705, + "learning_rate": 4.978493057259004e-07, + "loss": 0.0229, + "step": 31650 + }, + { + "epoch": 3.753231353017906, + "grad_norm": 0.425949085280888, + "learning_rate": 4.973727708975695e-07, + "loss": 0.0217, + "step": 31651 + }, + { + "epoch": 3.7533499347800308, + "grad_norm": 0.5098465531062517, + "learning_rate": 4.968964619534139e-07, + "loss": 0.0212, + "step": 31652 + }, + { + "epoch": 3.753468516542156, + "grad_norm": 0.2898805038588874, + "learning_rate": 4.964203788978272e-07, + "loss": 0.0074, + "step": 31653 + }, + { + "epoch": 3.7535870983042807, + "grad_norm": 0.7674230791362295, + "learning_rate": 4.959445217351949e-07, + "loss": 0.0357, + "step": 31654 + }, + { + "epoch": 3.753705680066406, + "grad_norm": 0.3991747707165267, + "learning_rate": 4.95468890469905e-07, + "loss": 0.0186, + "step": 31655 + }, + { + "epoch": 3.7538242618285307, + "grad_norm": 0.362746996605073, + "learning_rate": 4.949934851063431e-07, + "loss": 0.0164, + "step": 31656 + }, + { + "epoch": 3.753942843590656, + "grad_norm": 0.4820966015090015, + "learning_rate": 4.945183056488917e-07, + "loss": 0.0216, + "step": 31657 + }, + { + "epoch": 3.7540614253527806, + "grad_norm": 0.5811691624615548, + "learning_rate": 4.940433521019305e-07, + "loss": 0.0191, + "step": 31658 + }, + { + "epoch": 3.754180007114906, + "grad_norm": 0.5648749066909599, + "learning_rate": 4.935686244698367e-07, + "loss": 0.0166, + "step": 31659 + }, + { + "epoch": 3.7542985888770306, + "grad_norm": 0.748800102950552, + "learning_rate": 4.930941227569902e-07, + "loss": 0.0277, + "step": 31660 + }, + { + "epoch": 3.754417170639156, + "grad_norm": 0.5715263341470392, + "learning_rate": 4.92619846967765e-07, + "loss": 0.03, + "step": 31661 + }, + { + "epoch": 3.7545357524012806, + "grad_norm": 0.4735076851060042, + "learning_rate": 4.921457971065302e-07, + "loss": 0.0201, + "step": 31662 + }, + { + "epoch": 3.7546543341634058, + "grad_norm": 0.41374661943983015, + "learning_rate": 4.91671973177657e-07, + "loss": 0.0141, + "step": 31663 + }, + { + "epoch": 3.7547729159255305, + "grad_norm": 0.7128261390528026, + "learning_rate": 4.911983751855143e-07, + "loss": 0.0364, + "step": 31664 + }, + { + "epoch": 3.7548914976876557, + "grad_norm": 0.49757135202558206, + "learning_rate": 4.907250031344707e-07, + "loss": 0.0169, + "step": 31665 + }, + { + "epoch": 3.7550100794497805, + "grad_norm": 0.5985853022178877, + "learning_rate": 4.902518570288839e-07, + "loss": 0.0359, + "step": 31666 + }, + { + "epoch": 3.7551286612119057, + "grad_norm": 0.5164674210834934, + "learning_rate": 4.897789368731199e-07, + "loss": 0.0242, + "step": 31667 + }, + { + "epoch": 3.7552472429740305, + "grad_norm": 0.5253722052445128, + "learning_rate": 4.893062426715361e-07, + "loss": 0.0246, + "step": 31668 + }, + { + "epoch": 3.7553658247361557, + "grad_norm": 0.41766448348379204, + "learning_rate": 4.888337744284932e-07, + "loss": 0.0175, + "step": 31669 + }, + { + "epoch": 3.7554844064982804, + "grad_norm": 0.6071709492279571, + "learning_rate": 4.883615321483487e-07, + "loss": 0.0323, + "step": 31670 + }, + { + "epoch": 3.7556029882604056, + "grad_norm": 0.5718305082208559, + "learning_rate": 4.878895158354463e-07, + "loss": 0.0308, + "step": 31671 + }, + { + "epoch": 3.7557215700225304, + "grad_norm": 0.46359543537997167, + "learning_rate": 4.874177254941492e-07, + "loss": 0.0179, + "step": 31672 + }, + { + "epoch": 3.7558401517846556, + "grad_norm": 0.5160184672072928, + "learning_rate": 4.869461611287985e-07, + "loss": 0.0223, + "step": 31673 + }, + { + "epoch": 3.7559587335467803, + "grad_norm": 0.4532109761076522, + "learning_rate": 4.86474822743746e-07, + "loss": 0.0168, + "step": 31674 + }, + { + "epoch": 3.7560773153089055, + "grad_norm": 0.38467198157194377, + "learning_rate": 4.860037103433329e-07, + "loss": 0.0165, + "step": 31675 + }, + { + "epoch": 3.7561958970710307, + "grad_norm": 0.7628817996843611, + "learning_rate": 4.855328239319085e-07, + "loss": 0.0364, + "step": 31676 + }, + { + "epoch": 3.7563144788331555, + "grad_norm": 0.3603198477936126, + "learning_rate": 4.850621635138081e-07, + "loss": 0.0145, + "step": 31677 + }, + { + "epoch": 3.7564330605952803, + "grad_norm": 0.3124553507275398, + "learning_rate": 4.845917290933699e-07, + "loss": 0.0119, + "step": 31678 + }, + { + "epoch": 3.7565516423574055, + "grad_norm": 0.5992694769311568, + "learning_rate": 4.84121520674935e-07, + "loss": 0.0236, + "step": 31679 + }, + { + "epoch": 3.7566702241195307, + "grad_norm": 0.3751517696959551, + "learning_rate": 4.836515382628359e-07, + "loss": 0.0186, + "step": 31680 + }, + { + "epoch": 3.7567888058816554, + "grad_norm": 0.4174851157238731, + "learning_rate": 4.831817818614081e-07, + "loss": 0.0161, + "step": 31681 + }, + { + "epoch": 3.75690738764378, + "grad_norm": 0.7145930073077518, + "learning_rate": 4.827122514749732e-07, + "loss": 0.0357, + "step": 31682 + }, + { + "epoch": 3.7570259694059054, + "grad_norm": 0.7607260458390087, + "learning_rate": 4.822429471078721e-07, + "loss": 0.0353, + "step": 31683 + }, + { + "epoch": 3.7571445511680306, + "grad_norm": 0.5267217249680649, + "learning_rate": 4.817738687644236e-07, + "loss": 0.0208, + "step": 31684 + }, + { + "epoch": 3.7572631329301553, + "grad_norm": 0.923470964478167, + "learning_rate": 4.813050164489575e-07, + "loss": 0.0397, + "step": 31685 + }, + { + "epoch": 3.75738171469228, + "grad_norm": 0.6923044963601438, + "learning_rate": 4.80836390165787e-07, + "loss": 0.0403, + "step": 31686 + }, + { + "epoch": 3.7575002964544053, + "grad_norm": 0.34583184633730063, + "learning_rate": 4.803679899192392e-07, + "loss": 0.0143, + "step": 31687 + }, + { + "epoch": 3.7576188782165305, + "grad_norm": 0.4638666536990943, + "learning_rate": 4.798998157136331e-07, + "loss": 0.0185, + "step": 31688 + }, + { + "epoch": 3.7577374599786553, + "grad_norm": 0.6533598737965659, + "learning_rate": 4.794318675532761e-07, + "loss": 0.0381, + "step": 31689 + }, + { + "epoch": 3.75785604174078, + "grad_norm": 0.5781051323726416, + "learning_rate": 4.789641454424954e-07, + "loss": 0.0211, + "step": 31690 + }, + { + "epoch": 3.7579746235029052, + "grad_norm": 0.43891021289064874, + "learning_rate": 4.784966493855902e-07, + "loss": 0.0121, + "step": 31691 + }, + { + "epoch": 3.7580932052650304, + "grad_norm": 0.5755951252759012, + "learning_rate": 4.780293793868795e-07, + "loss": 0.0338, + "step": 31692 + }, + { + "epoch": 3.758211787027155, + "grad_norm": 0.5524917223800868, + "learning_rate": 4.775623354506626e-07, + "loss": 0.0236, + "step": 31693 + }, + { + "epoch": 3.75833036878928, + "grad_norm": 0.6501844471996531, + "learning_rate": 4.770955175812553e-07, + "loss": 0.033, + "step": 31694 + }, + { + "epoch": 3.758448950551405, + "grad_norm": 0.4572037182567546, + "learning_rate": 4.766289257829515e-07, + "loss": 0.0194, + "step": 31695 + }, + { + "epoch": 3.7585675323135304, + "grad_norm": 0.5882891701314099, + "learning_rate": 4.761625600600561e-07, + "loss": 0.0273, + "step": 31696 + }, + { + "epoch": 3.758686114075655, + "grad_norm": 0.40278108548778035, + "learning_rate": 4.756964204168712e-07, + "loss": 0.021, + "step": 31697 + }, + { + "epoch": 3.75880469583778, + "grad_norm": 0.4832926545791205, + "learning_rate": 4.7523050685768786e-07, + "loss": 0.0164, + "step": 31698 + }, + { + "epoch": 3.758923277599905, + "grad_norm": 0.5763307328013652, + "learning_rate": 4.7476481938681094e-07, + "loss": 0.0252, + "step": 31699 + }, + { + "epoch": 3.7590418593620303, + "grad_norm": 0.656694808138985, + "learning_rate": 4.742993580085231e-07, + "loss": 0.0293, + "step": 31700 + }, + { + "epoch": 3.759160441124155, + "grad_norm": 0.3570249461405894, + "learning_rate": 4.738341227271209e-07, + "loss": 0.0179, + "step": 31701 + }, + { + "epoch": 3.7592790228862802, + "grad_norm": 0.42851664527572686, + "learning_rate": 4.733691135468926e-07, + "loss": 0.0261, + "step": 31702 + }, + { + "epoch": 3.759397604648405, + "grad_norm": 0.3638129190634693, + "learning_rate": 4.7290433047212366e-07, + "loss": 0.0166, + "step": 31703 + }, + { + "epoch": 3.75951618641053, + "grad_norm": 0.48192283263413804, + "learning_rate": 4.7243977350709955e-07, + "loss": 0.0276, + "step": 31704 + }, + { + "epoch": 3.759634768172655, + "grad_norm": 0.5560326329793466, + "learning_rate": 4.7197544265610294e-07, + "loss": 0.023, + "step": 31705 + }, + { + "epoch": 3.75975334993478, + "grad_norm": 0.38097324132247495, + "learning_rate": 4.715113379234165e-07, + "loss": 0.0132, + "step": 31706 + }, + { + "epoch": 3.759871931696905, + "grad_norm": 0.5786819758974097, + "learning_rate": 4.710474593133146e-07, + "loss": 0.02, + "step": 31707 + }, + { + "epoch": 3.75999051345903, + "grad_norm": 0.6577828628050721, + "learning_rate": 4.705838068300744e-07, + "loss": 0.0276, + "step": 31708 + }, + { + "epoch": 3.760109095221155, + "grad_norm": 0.794349275704206, + "learning_rate": 4.70120380477973e-07, + "loss": 0.0338, + "step": 31709 + }, + { + "epoch": 3.76022767698328, + "grad_norm": 0.5090169776294505, + "learning_rate": 4.696571802612848e-07, + "loss": 0.0212, + "step": 31710 + }, + { + "epoch": 3.760346258745405, + "grad_norm": 0.2922569342826977, + "learning_rate": 4.69194206184273e-07, + "loss": 0.0116, + "step": 31711 + }, + { + "epoch": 3.76046484050753, + "grad_norm": 0.5623016279545806, + "learning_rate": 4.6873145825120926e-07, + "loss": 0.0235, + "step": 31712 + }, + { + "epoch": 3.760583422269655, + "grad_norm": 0.5304940956463632, + "learning_rate": 4.682689364663595e-07, + "loss": 0.0271, + "step": 31713 + }, + { + "epoch": 3.76070200403178, + "grad_norm": 0.6340391393314827, + "learning_rate": 4.6780664083398704e-07, + "loss": 0.0244, + "step": 31714 + }, + { + "epoch": 3.7608205857939048, + "grad_norm": 0.4098639454390098, + "learning_rate": 4.673445713583552e-07, + "loss": 0.0184, + "step": 31715 + }, + { + "epoch": 3.76093916755603, + "grad_norm": 0.48887728025240096, + "learning_rate": 4.668827280437188e-07, + "loss": 0.0173, + "step": 31716 + }, + { + "epoch": 3.7610577493181547, + "grad_norm": 0.5961169912180226, + "learning_rate": 4.6642111089434113e-07, + "loss": 0.0241, + "step": 31717 + }, + { + "epoch": 3.76117633108028, + "grad_norm": 0.6587060453471448, + "learning_rate": 4.6595971991447716e-07, + "loss": 0.0314, + "step": 31718 + }, + { + "epoch": 3.7612949128424047, + "grad_norm": 0.5779973004791293, + "learning_rate": 4.6549855510837626e-07, + "loss": 0.0226, + "step": 31719 + }, + { + "epoch": 3.76141349460453, + "grad_norm": 0.6465273781126086, + "learning_rate": 4.650376164802961e-07, + "loss": 0.0294, + "step": 31720 + }, + { + "epoch": 3.7615320763666547, + "grad_norm": 0.6539334250653994, + "learning_rate": 4.6457690403448053e-07, + "loss": 0.0325, + "step": 31721 + }, + { + "epoch": 3.76165065812878, + "grad_norm": 0.41125736771878285, + "learning_rate": 4.641164177751789e-07, + "loss": 0.0127, + "step": 31722 + }, + { + "epoch": 3.7617692398909046, + "grad_norm": 0.7648432528273937, + "learning_rate": 4.6365615770663506e-07, + "loss": 0.0367, + "step": 31723 + }, + { + "epoch": 3.76188782165303, + "grad_norm": 0.5204144823816625, + "learning_rate": 4.6319612383309283e-07, + "loss": 0.0217, + "step": 31724 + }, + { + "epoch": 3.762006403415155, + "grad_norm": 0.6170044832814776, + "learning_rate": 4.627363161587961e-07, + "loss": 0.0247, + "step": 31725 + }, + { + "epoch": 3.76212498517728, + "grad_norm": 0.5550215155327914, + "learning_rate": 4.622767346879775e-07, + "loss": 0.0268, + "step": 31726 + }, + { + "epoch": 3.7622435669394045, + "grad_norm": 0.7536287879322616, + "learning_rate": 4.6181737942487814e-07, + "loss": 0.0414, + "step": 31727 + }, + { + "epoch": 3.7623621487015297, + "grad_norm": 0.4629213768393277, + "learning_rate": 4.613582503737335e-07, + "loss": 0.018, + "step": 31728 + }, + { + "epoch": 3.762480730463655, + "grad_norm": 0.43368927792392226, + "learning_rate": 4.608993475387735e-07, + "loss": 0.0184, + "step": 31729 + }, + { + "epoch": 3.7625993122257797, + "grad_norm": 0.7766191621648451, + "learning_rate": 4.6044067092422815e-07, + "loss": 0.028, + "step": 31730 + }, + { + "epoch": 3.7627178939879045, + "grad_norm": 0.4043969458506702, + "learning_rate": 4.5998222053432736e-07, + "loss": 0.0151, + "step": 31731 + }, + { + "epoch": 3.7628364757500297, + "grad_norm": 0.3722174436481523, + "learning_rate": 4.5952399637330115e-07, + "loss": 0.0213, + "step": 31732 + }, + { + "epoch": 3.762955057512155, + "grad_norm": 0.5912093195099211, + "learning_rate": 4.5906599844536826e-07, + "loss": 0.0253, + "step": 31733 + }, + { + "epoch": 3.7630736392742796, + "grad_norm": 0.4267301343678796, + "learning_rate": 4.5860822675475313e-07, + "loss": 0.0202, + "step": 31734 + }, + { + "epoch": 3.7631922210364044, + "grad_norm": 0.7030925567846759, + "learning_rate": 4.5815068130567465e-07, + "loss": 0.0319, + "step": 31735 + }, + { + "epoch": 3.7633108027985296, + "grad_norm": 0.3922444714075633, + "learning_rate": 4.576933621023544e-07, + "loss": 0.0182, + "step": 31736 + }, + { + "epoch": 3.763429384560655, + "grad_norm": 0.7009894702551888, + "learning_rate": 4.572362691490001e-07, + "loss": 0.0298, + "step": 31737 + }, + { + "epoch": 3.7635479663227795, + "grad_norm": 0.34995024648947753, + "learning_rate": 4.567794024498362e-07, + "loss": 0.0154, + "step": 31738 + }, + { + "epoch": 3.7636665480849043, + "grad_norm": 0.5197882809733071, + "learning_rate": 4.5632276200906485e-07, + "loss": 0.0244, + "step": 31739 + }, + { + "epoch": 3.7637851298470295, + "grad_norm": 0.7841635027288899, + "learning_rate": 4.558663478309022e-07, + "loss": 0.0383, + "step": 31740 + }, + { + "epoch": 3.7639037116091547, + "grad_norm": 0.48279133662024504, + "learning_rate": 4.5541015991955593e-07, + "loss": 0.0259, + "step": 31741 + }, + { + "epoch": 3.7640222933712795, + "grad_norm": 0.8138372953100675, + "learning_rate": 4.549541982792255e-07, + "loss": 0.0252, + "step": 31742 + }, + { + "epoch": 3.7641408751334042, + "grad_norm": 0.6544112959747511, + "learning_rate": 4.5449846291412413e-07, + "loss": 0.0242, + "step": 31743 + }, + { + "epoch": 3.7642594568955294, + "grad_norm": 0.7371360190171548, + "learning_rate": 4.54042953828443e-07, + "loss": 0.037, + "step": 31744 + }, + { + "epoch": 3.7643780386576546, + "grad_norm": 0.4282798965605088, + "learning_rate": 4.535876710263842e-07, + "loss": 0.0158, + "step": 31745 + }, + { + "epoch": 3.7644966204197794, + "grad_norm": 0.857502524008476, + "learning_rate": 4.531326145121473e-07, + "loss": 0.0531, + "step": 31746 + }, + { + "epoch": 3.764615202181904, + "grad_norm": 0.5797641708886534, + "learning_rate": 4.526777842899288e-07, + "loss": 0.025, + "step": 31747 + }, + { + "epoch": 3.7647337839440294, + "grad_norm": 0.7999159499287557, + "learning_rate": 4.522231803639199e-07, + "loss": 0.022, + "step": 31748 + }, + { + "epoch": 3.7648523657061546, + "grad_norm": 0.5620852184116854, + "learning_rate": 4.51768802738306e-07, + "loss": 0.0223, + "step": 31749 + }, + { + "epoch": 3.7649709474682793, + "grad_norm": 0.5143098458162462, + "learning_rate": 4.513146514172839e-07, + "loss": 0.0288, + "step": 31750 + }, + { + "epoch": 3.7650895292304045, + "grad_norm": 0.3903275431737434, + "learning_rate": 4.5086072640503627e-07, + "loss": 0.0126, + "step": 31751 + }, + { + "epoch": 3.7652081109925293, + "grad_norm": 0.5873251853615715, + "learning_rate": 4.504070277057515e-07, + "loss": 0.0248, + "step": 31752 + }, + { + "epoch": 3.7653266927546545, + "grad_norm": 0.48240248816693876, + "learning_rate": 4.4995355532360397e-07, + "loss": 0.0204, + "step": 31753 + }, + { + "epoch": 3.7654452745167792, + "grad_norm": 0.3884835278829907, + "learning_rate": 4.4950030926278197e-07, + "loss": 0.0194, + "step": 31754 + }, + { + "epoch": 3.7655638562789044, + "grad_norm": 0.4412156752214478, + "learning_rate": 4.490472895274628e-07, + "loss": 0.0224, + "step": 31755 + }, + { + "epoch": 3.765682438041029, + "grad_norm": 0.649769915398782, + "learning_rate": 4.4859449612181803e-07, + "loss": 0.0296, + "step": 31756 + }, + { + "epoch": 3.7658010198031544, + "grad_norm": 0.40844913093707536, + "learning_rate": 4.481419290500277e-07, + "loss": 0.0154, + "step": 31757 + }, + { + "epoch": 3.765919601565279, + "grad_norm": 0.7215429466057867, + "learning_rate": 4.476895883162607e-07, + "loss": 0.0406, + "step": 31758 + }, + { + "epoch": 3.7660381833274044, + "grad_norm": 0.7917076798286585, + "learning_rate": 4.4723747392468873e-07, + "loss": 0.0425, + "step": 31759 + }, + { + "epoch": 3.766156765089529, + "grad_norm": 0.3391299180623231, + "learning_rate": 4.4678558587947507e-07, + "loss": 0.0104, + "step": 31760 + }, + { + "epoch": 3.7662753468516543, + "grad_norm": 0.3723808811041767, + "learning_rate": 4.4633392418479135e-07, + "loss": 0.0131, + "step": 31761 + }, + { + "epoch": 3.766393928613779, + "grad_norm": 0.724428281956527, + "learning_rate": 4.45882488844801e-07, + "loss": 0.037, + "step": 31762 + }, + { + "epoch": 3.7665125103759043, + "grad_norm": 0.5347745748513711, + "learning_rate": 4.4543127986366176e-07, + "loss": 0.0285, + "step": 31763 + }, + { + "epoch": 3.766631092138029, + "grad_norm": 0.24501362050655365, + "learning_rate": 4.449802972455341e-07, + "loss": 0.0124, + "step": 31764 + }, + { + "epoch": 3.7667496739001542, + "grad_norm": 0.8451058308977665, + "learning_rate": 4.4452954099458153e-07, + "loss": 0.041, + "step": 31765 + }, + { + "epoch": 3.766868255662279, + "grad_norm": 0.7720947698645696, + "learning_rate": 4.4407901111495063e-07, + "loss": 0.0339, + "step": 31766 + }, + { + "epoch": 3.766986837424404, + "grad_norm": 0.5245239105510062, + "learning_rate": 4.4362870761079925e-07, + "loss": 0.0277, + "step": 31767 + }, + { + "epoch": 3.767105419186529, + "grad_norm": 0.5616126644244221, + "learning_rate": 4.4317863048627673e-07, + "loss": 0.0283, + "step": 31768 + }, + { + "epoch": 3.767224000948654, + "grad_norm": 0.5317551079863586, + "learning_rate": 4.427287797455354e-07, + "loss": 0.0204, + "step": 31769 + }, + { + "epoch": 3.767342582710779, + "grad_norm": 0.5409493102311419, + "learning_rate": 4.422791553927219e-07, + "loss": 0.02, + "step": 31770 + }, + { + "epoch": 3.767461164472904, + "grad_norm": 0.38830877668112745, + "learning_rate": 4.418297574319802e-07, + "loss": 0.02, + "step": 31771 + }, + { + "epoch": 3.767579746235029, + "grad_norm": 0.4936937788887453, + "learning_rate": 4.4138058586745137e-07, + "loss": 0.0177, + "step": 31772 + }, + { + "epoch": 3.767698327997154, + "grad_norm": 0.36294249559263503, + "learning_rate": 4.409316407032765e-07, + "loss": 0.0159, + "step": 31773 + }, + { + "epoch": 3.7678169097592793, + "grad_norm": 0.48081505188763324, + "learning_rate": 4.404829219435996e-07, + "loss": 0.0203, + "step": 31774 + }, + { + "epoch": 3.767935491521404, + "grad_norm": 0.7532621344478417, + "learning_rate": 4.400344295925479e-07, + "loss": 0.027, + "step": 31775 + }, + { + "epoch": 3.768054073283529, + "grad_norm": 0.7244622857307539, + "learning_rate": 4.395861636542653e-07, + "loss": 0.0355, + "step": 31776 + }, + { + "epoch": 3.768172655045654, + "grad_norm": 0.5432909558525993, + "learning_rate": 4.39138124132879e-07, + "loss": 0.0263, + "step": 31777 + }, + { + "epoch": 3.768291236807779, + "grad_norm": 0.5727641975661882, + "learning_rate": 4.386903110325191e-07, + "loss": 0.0268, + "step": 31778 + }, + { + "epoch": 3.768409818569904, + "grad_norm": 0.7169338667462131, + "learning_rate": 4.382427243573184e-07, + "loss": 0.03, + "step": 31779 + }, + { + "epoch": 3.7685284003320287, + "grad_norm": 0.6160224286070087, + "learning_rate": 4.377953641113985e-07, + "loss": 0.032, + "step": 31780 + }, + { + "epoch": 3.768646982094154, + "grad_norm": 0.5893733729080451, + "learning_rate": 4.373482302988868e-07, + "loss": 0.0281, + "step": 31781 + }, + { + "epoch": 3.768765563856279, + "grad_norm": 0.5312608024696982, + "learning_rate": 4.3690132292390217e-07, + "loss": 0.0197, + "step": 31782 + }, + { + "epoch": 3.768884145618404, + "grad_norm": 0.571874028587005, + "learning_rate": 4.364546419905663e-07, + "loss": 0.026, + "step": 31783 + }, + { + "epoch": 3.7690027273805287, + "grad_norm": 0.5328020421484203, + "learning_rate": 4.360081875029953e-07, + "loss": 0.0229, + "step": 31784 + }, + { + "epoch": 3.769121309142654, + "grad_norm": 0.3913122091985591, + "learning_rate": 4.3556195946531096e-07, + "loss": 0.0155, + "step": 31785 + }, + { + "epoch": 3.769239890904779, + "grad_norm": 0.32276314552478025, + "learning_rate": 4.351159578816183e-07, + "loss": 0.0153, + "step": 31786 + }, + { + "epoch": 3.769358472666904, + "grad_norm": 0.541001481885907, + "learning_rate": 4.3467018275603343e-07, + "loss": 0.0203, + "step": 31787 + }, + { + "epoch": 3.7694770544290286, + "grad_norm": 0.519665282948342, + "learning_rate": 4.34224634092667e-07, + "loss": 0.0244, + "step": 31788 + }, + { + "epoch": 3.769595636191154, + "grad_norm": 0.3335703297655153, + "learning_rate": 4.3377931189562405e-07, + "loss": 0.011, + "step": 31789 + }, + { + "epoch": 3.769714217953279, + "grad_norm": 0.4096155159626794, + "learning_rate": 4.333342161690096e-07, + "loss": 0.0118, + "step": 31790 + }, + { + "epoch": 3.7698327997154037, + "grad_norm": 0.8090820401139942, + "learning_rate": 4.3288934691692875e-07, + "loss": 0.0372, + "step": 31791 + }, + { + "epoch": 3.7699513814775285, + "grad_norm": 0.2598894632673458, + "learning_rate": 4.324447041434837e-07, + "loss": 0.0113, + "step": 31792 + }, + { + "epoch": 3.7700699632396537, + "grad_norm": 0.35226842576038286, + "learning_rate": 4.3200028785277126e-07, + "loss": 0.0104, + "step": 31793 + }, + { + "epoch": 3.770188545001779, + "grad_norm": 0.3154062886157476, + "learning_rate": 4.3155609804888806e-07, + "loss": 0.0119, + "step": 31794 + }, + { + "epoch": 3.7703071267639037, + "grad_norm": 0.8253540940775388, + "learning_rate": 4.3111213473593093e-07, + "loss": 0.0417, + "step": 31795 + }, + { + "epoch": 3.7704257085260284, + "grad_norm": 0.44325220016336825, + "learning_rate": 4.306683979179882e-07, + "loss": 0.0185, + "step": 31796 + }, + { + "epoch": 3.7705442902881536, + "grad_norm": 0.44595156728003543, + "learning_rate": 4.3022488759915933e-07, + "loss": 0.0295, + "step": 31797 + }, + { + "epoch": 3.770662872050279, + "grad_norm": 0.46759484113436844, + "learning_rate": 4.2978160378352175e-07, + "loss": 0.0228, + "step": 31798 + }, + { + "epoch": 3.7707814538124036, + "grad_norm": 0.47268983409050036, + "learning_rate": 4.29338546475172e-07, + "loss": 0.0156, + "step": 31799 + }, + { + "epoch": 3.770900035574529, + "grad_norm": 0.3121148278249606, + "learning_rate": 4.288957156781903e-07, + "loss": 0.0154, + "step": 31800 + }, + { + "epoch": 3.7710186173366536, + "grad_norm": 0.7648270676159207, + "learning_rate": 4.2845311139665945e-07, + "loss": 0.0336, + "step": 31801 + }, + { + "epoch": 3.7711371990987788, + "grad_norm": 0.4124428804558289, + "learning_rate": 4.2801073363465393e-07, + "loss": 0.0214, + "step": 31802 + }, + { + "epoch": 3.7712557808609035, + "grad_norm": 0.4395198591888608, + "learning_rate": 4.275685823962622e-07, + "loss": 0.0204, + "step": 31803 + }, + { + "epoch": 3.7713743626230287, + "grad_norm": 0.6768861817390999, + "learning_rate": 4.2712665768555314e-07, + "loss": 0.0312, + "step": 31804 + }, + { + "epoch": 3.7714929443851535, + "grad_norm": 0.47851287161695905, + "learning_rate": 4.2668495950660415e-07, + "loss": 0.0131, + "step": 31805 + }, + { + "epoch": 3.7716115261472787, + "grad_norm": 0.3537161489651981, + "learning_rate": 4.262434878634869e-07, + "loss": 0.0141, + "step": 31806 + }, + { + "epoch": 3.7717301079094034, + "grad_norm": 0.5598918328580023, + "learning_rate": 4.258022427602676e-07, + "loss": 0.0356, + "step": 31807 + }, + { + "epoch": 3.7718486896715286, + "grad_norm": 0.458284740627091, + "learning_rate": 4.2536122420102076e-07, + "loss": 0.0186, + "step": 31808 + }, + { + "epoch": 3.7719672714336534, + "grad_norm": 0.6530313995773684, + "learning_rate": 4.2492043218980426e-07, + "loss": 0.0271, + "step": 31809 + }, + { + "epoch": 3.7720858531957786, + "grad_norm": 0.6882645229704578, + "learning_rate": 4.244798667306871e-07, + "loss": 0.0326, + "step": 31810 + }, + { + "epoch": 3.7722044349579034, + "grad_norm": 0.4542598669655658, + "learning_rate": 4.240395278277298e-07, + "loss": 0.0136, + "step": 31811 + }, + { + "epoch": 3.7723230167200286, + "grad_norm": 0.6248827626656878, + "learning_rate": 4.2359941548499037e-07, + "loss": 0.0267, + "step": 31812 + }, + { + "epoch": 3.7724415984821533, + "grad_norm": 0.5965080080426812, + "learning_rate": 4.2315952970652386e-07, + "loss": 0.0295, + "step": 31813 + }, + { + "epoch": 3.7725601802442785, + "grad_norm": 0.75245982120147, + "learning_rate": 4.227198704963936e-07, + "loss": 0.0437, + "step": 31814 + }, + { + "epoch": 3.7726787620064033, + "grad_norm": 0.4195395412804941, + "learning_rate": 4.222804378586437e-07, + "loss": 0.0237, + "step": 31815 + }, + { + "epoch": 3.7727973437685285, + "grad_norm": 0.4070031264057588, + "learning_rate": 4.218412317973319e-07, + "loss": 0.0212, + "step": 31816 + }, + { + "epoch": 3.7729159255306532, + "grad_norm": 0.6843047220121112, + "learning_rate": 4.214022523165023e-07, + "loss": 0.024, + "step": 31817 + }, + { + "epoch": 3.7730345072927784, + "grad_norm": 0.3508557841503246, + "learning_rate": 4.2096349942020163e-07, + "loss": 0.0167, + "step": 31818 + }, + { + "epoch": 3.773153089054903, + "grad_norm": 0.5546589031857581, + "learning_rate": 4.2052497311248216e-07, + "loss": 0.025, + "step": 31819 + }, + { + "epoch": 3.7732716708170284, + "grad_norm": 0.3833384210881732, + "learning_rate": 4.2008667339737407e-07, + "loss": 0.0183, + "step": 31820 + }, + { + "epoch": 3.773390252579153, + "grad_norm": 0.39296483503400786, + "learning_rate": 4.196486002789296e-07, + "loss": 0.0189, + "step": 31821 + }, + { + "epoch": 3.7735088343412784, + "grad_norm": 0.43146356530993984, + "learning_rate": 4.192107537611817e-07, + "loss": 0.0211, + "step": 31822 + }, + { + "epoch": 3.773627416103403, + "grad_norm": 0.5166066669129281, + "learning_rate": 4.1877313384817154e-07, + "loss": 0.0223, + "step": 31823 + }, + { + "epoch": 3.7737459978655283, + "grad_norm": 0.31062322499923817, + "learning_rate": 4.1833574054392367e-07, + "loss": 0.0108, + "step": 31824 + }, + { + "epoch": 3.773864579627653, + "grad_norm": 0.3362367507943895, + "learning_rate": 4.1789857385247933e-07, + "loss": 0.0102, + "step": 31825 + }, + { + "epoch": 3.7739831613897783, + "grad_norm": 0.370544352691442, + "learning_rate": 4.1746163377786305e-07, + "loss": 0.0153, + "step": 31826 + }, + { + "epoch": 3.7741017431519035, + "grad_norm": 0.5807189626949868, + "learning_rate": 4.170249203241078e-07, + "loss": 0.0222, + "step": 31827 + }, + { + "epoch": 3.7742203249140283, + "grad_norm": 0.49541192592648775, + "learning_rate": 4.1658843349523526e-07, + "loss": 0.0194, + "step": 31828 + }, + { + "epoch": 3.774338906676153, + "grad_norm": 0.3975524443897461, + "learning_rate": 4.161521732952728e-07, + "loss": 0.0164, + "step": 31829 + }, + { + "epoch": 3.774457488438278, + "grad_norm": 0.8371928320391157, + "learning_rate": 4.157161397282422e-07, + "loss": 0.0424, + "step": 31830 + }, + { + "epoch": 3.7745760702004034, + "grad_norm": 0.49436708290436937, + "learning_rate": 4.152803327981569e-07, + "loss": 0.0197, + "step": 31831 + }, + { + "epoch": 3.774694651962528, + "grad_norm": 0.4882580305869405, + "learning_rate": 4.1484475250904156e-07, + "loss": 0.0227, + "step": 31832 + }, + { + "epoch": 3.774813233724653, + "grad_norm": 0.8220902643819877, + "learning_rate": 4.144093988649095e-07, + "loss": 0.0318, + "step": 31833 + }, + { + "epoch": 3.774931815486778, + "grad_norm": 0.4821572511419582, + "learning_rate": 4.13974271869777e-07, + "loss": 0.0152, + "step": 31834 + }, + { + "epoch": 3.7750503972489033, + "grad_norm": 0.5749685032121692, + "learning_rate": 4.1353937152764645e-07, + "loss": 0.0266, + "step": 31835 + }, + { + "epoch": 3.775168979011028, + "grad_norm": 0.6427109617670901, + "learning_rate": 4.131046978425368e-07, + "loss": 0.0305, + "step": 31836 + }, + { + "epoch": 3.775287560773153, + "grad_norm": 0.8353272112312367, + "learning_rate": 4.1267025081845046e-07, + "loss": 0.0304, + "step": 31837 + }, + { + "epoch": 3.775406142535278, + "grad_norm": 0.7600899388366446, + "learning_rate": 4.1223603045939254e-07, + "loss": 0.0377, + "step": 31838 + }, + { + "epoch": 3.7755247242974033, + "grad_norm": 0.7797335240118476, + "learning_rate": 4.118020367693681e-07, + "loss": 0.06, + "step": 31839 + }, + { + "epoch": 3.775643306059528, + "grad_norm": 0.7689651875683298, + "learning_rate": 4.113682697523741e-07, + "loss": 0.0386, + "step": 31840 + }, + { + "epoch": 3.775761887821653, + "grad_norm": 0.5490319434344557, + "learning_rate": 4.1093472941241827e-07, + "loss": 0.022, + "step": 31841 + }, + { + "epoch": 3.775880469583778, + "grad_norm": 0.36292279159787116, + "learning_rate": 4.1050141575348644e-07, + "loss": 0.0115, + "step": 31842 + }, + { + "epoch": 3.775999051345903, + "grad_norm": 0.7399061139931095, + "learning_rate": 4.1006832877957814e-07, + "loss": 0.0341, + "step": 31843 + }, + { + "epoch": 3.776117633108028, + "grad_norm": 0.4729695889735751, + "learning_rate": 4.096354684946846e-07, + "loss": 0.0221, + "step": 31844 + }, + { + "epoch": 3.7762362148701527, + "grad_norm": 0.31355914762016357, + "learning_rate": 4.0920283490279984e-07, + "loss": 0.0164, + "step": 31845 + }, + { + "epoch": 3.776354796632278, + "grad_norm": 0.4827132319964702, + "learning_rate": 4.0877042800790964e-07, + "loss": 0.0238, + "step": 31846 + }, + { + "epoch": 3.776473378394403, + "grad_norm": 0.47805523861129806, + "learning_rate": 4.0833824781399687e-07, + "loss": 0.0243, + "step": 31847 + }, + { + "epoch": 3.776591960156528, + "grad_norm": 0.8643425384320679, + "learning_rate": 4.0790629432505557e-07, + "loss": 0.0349, + "step": 31848 + }, + { + "epoch": 3.776710541918653, + "grad_norm": 0.4226496229544819, + "learning_rate": 4.0747456754505753e-07, + "loss": 0.0196, + "step": 31849 + }, + { + "epoch": 3.776829123680778, + "grad_norm": 0.5913537335060952, + "learning_rate": 4.070430674779857e-07, + "loss": 0.0254, + "step": 31850 + }, + { + "epoch": 3.776947705442903, + "grad_norm": 0.4192179364547982, + "learning_rate": 4.0661179412782025e-07, + "loss": 0.023, + "step": 31851 + }, + { + "epoch": 3.777066287205028, + "grad_norm": 0.3677656240047313, + "learning_rate": 4.0618074749853576e-07, + "loss": 0.0177, + "step": 31852 + }, + { + "epoch": 3.777184868967153, + "grad_norm": 0.3757826554871263, + "learning_rate": 4.057499275941068e-07, + "loss": 0.0179, + "step": 31853 + }, + { + "epoch": 3.7773034507292778, + "grad_norm": 0.6583405707951343, + "learning_rate": 4.0531933441850246e-07, + "loss": 0.0267, + "step": 31854 + }, + { + "epoch": 3.777422032491403, + "grad_norm": 0.617787289908163, + "learning_rate": 4.048889679756918e-07, + "loss": 0.0254, + "step": 31855 + }, + { + "epoch": 3.7775406142535277, + "grad_norm": 0.7308365697164131, + "learning_rate": 4.0445882826964653e-07, + "loss": 0.0289, + "step": 31856 + }, + { + "epoch": 3.777659196015653, + "grad_norm": 0.4614767378921938, + "learning_rate": 4.0402891530433304e-07, + "loss": 0.0227, + "step": 31857 + }, + { + "epoch": 3.7777777777777777, + "grad_norm": 0.4120895743343994, + "learning_rate": 4.035992290837065e-07, + "loss": 0.0164, + "step": 31858 + }, + { + "epoch": 3.777896359539903, + "grad_norm": 0.753556157499614, + "learning_rate": 4.031697696117359e-07, + "loss": 0.0247, + "step": 31859 + }, + { + "epoch": 3.7780149413020276, + "grad_norm": 0.7227416612480351, + "learning_rate": 4.027405368923765e-07, + "loss": 0.027, + "step": 31860 + }, + { + "epoch": 3.778133523064153, + "grad_norm": 0.5287363008921468, + "learning_rate": 4.023115309295833e-07, + "loss": 0.0191, + "step": 31861 + }, + { + "epoch": 3.7782521048262776, + "grad_norm": 0.3970565742346015, + "learning_rate": 4.0188275172731714e-07, + "loss": 0.0183, + "step": 31862 + }, + { + "epoch": 3.778370686588403, + "grad_norm": 0.6936002889299994, + "learning_rate": 4.0145419928953044e-07, + "loss": 0.0281, + "step": 31863 + }, + { + "epoch": 3.7784892683505276, + "grad_norm": 0.5986737288472799, + "learning_rate": 4.010258736201672e-07, + "loss": 0.0246, + "step": 31864 + }, + { + "epoch": 3.7786078501126528, + "grad_norm": 0.3490793560701695, + "learning_rate": 4.0059777472317984e-07, + "loss": 0.0185, + "step": 31865 + }, + { + "epoch": 3.7787264318747775, + "grad_norm": 0.31181752288030923, + "learning_rate": 4.0016990260251797e-07, + "loss": 0.0147, + "step": 31866 + }, + { + "epoch": 3.7788450136369027, + "grad_norm": 0.7278908359700335, + "learning_rate": 3.997422572621201e-07, + "loss": 0.0411, + "step": 31867 + }, + { + "epoch": 3.7789635953990275, + "grad_norm": 0.5487880114987683, + "learning_rate": 3.993148387059359e-07, + "loss": 0.024, + "step": 31868 + }, + { + "epoch": 3.7790821771611527, + "grad_norm": 0.48722041499231605, + "learning_rate": 3.988876469378983e-07, + "loss": 0.0239, + "step": 31869 + }, + { + "epoch": 3.7792007589232774, + "grad_norm": 0.4397279656736417, + "learning_rate": 3.984606819619513e-07, + "loss": 0.0196, + "step": 31870 + }, + { + "epoch": 3.7793193406854027, + "grad_norm": 0.4983560524909072, + "learning_rate": 3.9803394378202516e-07, + "loss": 0.0245, + "step": 31871 + }, + { + "epoch": 3.7794379224475274, + "grad_norm": 0.376635536549917, + "learning_rate": 3.976074324020612e-07, + "loss": 0.0148, + "step": 31872 + }, + { + "epoch": 3.7795565042096526, + "grad_norm": 0.9387925659287376, + "learning_rate": 3.9718114782598126e-07, + "loss": 0.0365, + "step": 31873 + }, + { + "epoch": 3.7796750859717774, + "grad_norm": 0.3869884059293986, + "learning_rate": 3.967550900577266e-07, + "loss": 0.0199, + "step": 31874 + }, + { + "epoch": 3.7797936677339026, + "grad_norm": 0.6933075682159763, + "learning_rate": 3.963292591012191e-07, + "loss": 0.0471, + "step": 31875 + }, + { + "epoch": 3.7799122494960278, + "grad_norm": 0.7244974394856587, + "learning_rate": 3.959036549603834e-07, + "loss": 0.0356, + "step": 31876 + }, + { + "epoch": 3.7800308312581525, + "grad_norm": 0.4894324583702637, + "learning_rate": 3.9547827763914416e-07, + "loss": 0.0219, + "step": 31877 + }, + { + "epoch": 3.7801494130202773, + "grad_norm": 0.39266006833975764, + "learning_rate": 3.9505312714142316e-07, + "loss": 0.0157, + "step": 31878 + }, + { + "epoch": 3.7802679947824025, + "grad_norm": 0.7121298000560101, + "learning_rate": 3.946282034711396e-07, + "loss": 0.0389, + "step": 31879 + }, + { + "epoch": 3.7803865765445277, + "grad_norm": 0.627352188764344, + "learning_rate": 3.942035066322097e-07, + "loss": 0.0363, + "step": 31880 + }, + { + "epoch": 3.7805051583066525, + "grad_norm": 0.49527411855144843, + "learning_rate": 3.937790366285526e-07, + "loss": 0.021, + "step": 31881 + }, + { + "epoch": 3.780623740068777, + "grad_norm": 0.9090845724947956, + "learning_rate": 3.933547934640791e-07, + "loss": 0.0476, + "step": 31882 + }, + { + "epoch": 3.7807423218309024, + "grad_norm": 0.5280205868027062, + "learning_rate": 3.929307771426999e-07, + "loss": 0.0194, + "step": 31883 + }, + { + "epoch": 3.7808609035930276, + "grad_norm": 0.5342588008455651, + "learning_rate": 3.925069876683174e-07, + "loss": 0.0268, + "step": 31884 + }, + { + "epoch": 3.7809794853551524, + "grad_norm": 0.5714573034811801, + "learning_rate": 3.920834250448535e-07, + "loss": 0.0227, + "step": 31885 + }, + { + "epoch": 3.781098067117277, + "grad_norm": 0.4685837423936282, + "learning_rate": 3.9166008927619956e-07, + "loss": 0.0179, + "step": 31886 + }, + { + "epoch": 3.7812166488794023, + "grad_norm": 0.7406951398757605, + "learning_rate": 3.9123698036626076e-07, + "loss": 0.0265, + "step": 31887 + }, + { + "epoch": 3.7813352306415275, + "grad_norm": 0.5425488817061083, + "learning_rate": 3.9081409831894235e-07, + "loss": 0.021, + "step": 31888 + }, + { + "epoch": 3.7814538124036523, + "grad_norm": 0.5215613977881464, + "learning_rate": 3.9039144313814116e-07, + "loss": 0.0248, + "step": 31889 + }, + { + "epoch": 3.781572394165777, + "grad_norm": 0.5904298375692939, + "learning_rate": 3.8996901482775407e-07, + "loss": 0.0233, + "step": 31890 + }, + { + "epoch": 3.7816909759279023, + "grad_norm": 0.5685630541773063, + "learning_rate": 3.895468133916669e-07, + "loss": 0.0249, + "step": 31891 + }, + { + "epoch": 3.7818095576900275, + "grad_norm": 0.32179594138781825, + "learning_rate": 3.8912483883378483e-07, + "loss": 0.0142, + "step": 31892 + }, + { + "epoch": 3.7819281394521522, + "grad_norm": 0.5203834164907836, + "learning_rate": 3.887030911579909e-07, + "loss": 0.0309, + "step": 31893 + }, + { + "epoch": 3.782046721214277, + "grad_norm": 0.5521898636971335, + "learning_rate": 3.882815703681736e-07, + "loss": 0.0234, + "step": 31894 + }, + { + "epoch": 3.782165302976402, + "grad_norm": 0.5294662474398418, + "learning_rate": 3.8786027646821877e-07, + "loss": 0.028, + "step": 31895 + }, + { + "epoch": 3.7822838847385274, + "grad_norm": 0.5969241991561479, + "learning_rate": 3.874392094620122e-07, + "loss": 0.0236, + "step": 31896 + }, + { + "epoch": 3.782402466500652, + "grad_norm": 0.6335807233750141, + "learning_rate": 3.8701836935343407e-07, + "loss": 0.024, + "step": 31897 + }, + { + "epoch": 3.782521048262777, + "grad_norm": 0.4981690234184708, + "learning_rate": 3.8659775614636183e-07, + "loss": 0.023, + "step": 31898 + }, + { + "epoch": 3.782639630024902, + "grad_norm": 0.3770953984917607, + "learning_rate": 3.861773698446786e-07, + "loss": 0.0183, + "step": 31899 + }, + { + "epoch": 3.7827582117870273, + "grad_norm": 0.4427524920254872, + "learning_rate": 3.857572104522533e-07, + "loss": 0.0161, + "step": 31900 + }, + { + "epoch": 3.782876793549152, + "grad_norm": 0.6161563397962272, + "learning_rate": 3.8533727797296637e-07, + "loss": 0.023, + "step": 31901 + }, + { + "epoch": 3.7829953753112773, + "grad_norm": 0.4588123636372751, + "learning_rate": 3.849175724106813e-07, + "loss": 0.0378, + "step": 31902 + }, + { + "epoch": 3.783113957073402, + "grad_norm": 0.4634858743970792, + "learning_rate": 3.8449809376927273e-07, + "loss": 0.0252, + "step": 31903 + }, + { + "epoch": 3.7832325388355272, + "grad_norm": 0.5334546529397126, + "learning_rate": 3.84078842052607e-07, + "loss": 0.0142, + "step": 31904 + }, + { + "epoch": 3.783351120597652, + "grad_norm": 0.5566784421100952, + "learning_rate": 3.836598172645478e-07, + "loss": 0.0217, + "step": 31905 + }, + { + "epoch": 3.783469702359777, + "grad_norm": 0.6126611954215508, + "learning_rate": 3.832410194089586e-07, + "loss": 0.0324, + "step": 31906 + }, + { + "epoch": 3.783588284121902, + "grad_norm": 0.4674974927581612, + "learning_rate": 3.828224484896975e-07, + "loss": 0.0252, + "step": 31907 + }, + { + "epoch": 3.783706865884027, + "grad_norm": 0.6139853644976944, + "learning_rate": 3.824041045106308e-07, + "loss": 0.0292, + "step": 31908 + }, + { + "epoch": 3.783825447646152, + "grad_norm": 0.5398607124047583, + "learning_rate": 3.819859874756082e-07, + "loss": 0.0254, + "step": 31909 + }, + { + "epoch": 3.783944029408277, + "grad_norm": 0.5522137500632561, + "learning_rate": 3.8156809738848496e-07, + "loss": 0.0253, + "step": 31910 + }, + { + "epoch": 3.784062611170402, + "grad_norm": 0.3900193660562421, + "learning_rate": 3.811504342531136e-07, + "loss": 0.0146, + "step": 31911 + }, + { + "epoch": 3.784181192932527, + "grad_norm": 0.6920867652280905, + "learning_rate": 3.8073299807334926e-07, + "loss": 0.0321, + "step": 31912 + }, + { + "epoch": 3.784299774694652, + "grad_norm": 0.4721311082814587, + "learning_rate": 3.803157888530362e-07, + "loss": 0.0166, + "step": 31913 + }, + { + "epoch": 3.784418356456777, + "grad_norm": 0.4485947088175132, + "learning_rate": 3.798988065960185e-07, + "loss": 0.0189, + "step": 31914 + }, + { + "epoch": 3.784536938218902, + "grad_norm": 0.5216555008985573, + "learning_rate": 3.794820513061431e-07, + "loss": 0.0253, + "step": 31915 + }, + { + "epoch": 3.784655519981027, + "grad_norm": 0.43004237031962766, + "learning_rate": 3.790655229872514e-07, + "loss": 0.0186, + "step": 31916 + }, + { + "epoch": 3.7847741017431518, + "grad_norm": 0.3153867820475274, + "learning_rate": 3.7864922164318473e-07, + "loss": 0.0154, + "step": 31917 + }, + { + "epoch": 3.784892683505277, + "grad_norm": 0.32077494431229864, + "learning_rate": 3.7823314727777614e-07, + "loss": 0.0166, + "step": 31918 + }, + { + "epoch": 3.7850112652674017, + "grad_norm": 0.5592501521723984, + "learning_rate": 3.778172998948698e-07, + "loss": 0.0241, + "step": 31919 + }, + { + "epoch": 3.785129847029527, + "grad_norm": 0.45766110759451023, + "learning_rate": 3.774016794982904e-07, + "loss": 0.0198, + "step": 31920 + }, + { + "epoch": 3.7852484287916517, + "grad_norm": 0.7117537536790278, + "learning_rate": 3.7698628609187657e-07, + "loss": 0.0294, + "step": 31921 + }, + { + "epoch": 3.785367010553777, + "grad_norm": 0.5194982338478071, + "learning_rate": 3.7657111967945026e-07, + "loss": 0.0178, + "step": 31922 + }, + { + "epoch": 3.7854855923159016, + "grad_norm": 0.40813630426047054, + "learning_rate": 3.761561802648472e-07, + "loss": 0.0231, + "step": 31923 + }, + { + "epoch": 3.785604174078027, + "grad_norm": 0.6481257833360387, + "learning_rate": 3.757414678518867e-07, + "loss": 0.0279, + "step": 31924 + }, + { + "epoch": 3.785722755840152, + "grad_norm": 0.4506117689470364, + "learning_rate": 3.7532698244439337e-07, + "loss": 0.0173, + "step": 31925 + }, + { + "epoch": 3.785841337602277, + "grad_norm": 0.6508368435692178, + "learning_rate": 3.749127240461892e-07, + "loss": 0.0345, + "step": 31926 + }, + { + "epoch": 3.7859599193644016, + "grad_norm": 0.437369453555126, + "learning_rate": 3.744986926610905e-07, + "loss": 0.0122, + "step": 31927 + }, + { + "epoch": 3.7860785011265268, + "grad_norm": 0.5716715980245407, + "learning_rate": 3.740848882929193e-07, + "loss": 0.0282, + "step": 31928 + }, + { + "epoch": 3.786197082888652, + "grad_norm": 0.529094340318738, + "learning_rate": 3.7367131094548366e-07, + "loss": 0.0221, + "step": 31929 + }, + { + "epoch": 3.7863156646507767, + "grad_norm": 0.3995481691707605, + "learning_rate": 3.732579606226028e-07, + "loss": 0.0169, + "step": 31930 + }, + { + "epoch": 3.7864342464129015, + "grad_norm": 0.39531271991078576, + "learning_rate": 3.728448373280818e-07, + "loss": 0.0136, + "step": 31931 + }, + { + "epoch": 3.7865528281750267, + "grad_norm": 0.4824115135795889, + "learning_rate": 3.724319410657318e-07, + "loss": 0.017, + "step": 31932 + }, + { + "epoch": 3.786671409937152, + "grad_norm": 0.6604058318245357, + "learning_rate": 3.720192718393578e-07, + "loss": 0.037, + "step": 31933 + }, + { + "epoch": 3.7867899916992767, + "grad_norm": 0.7241963492985408, + "learning_rate": 3.716068296527708e-07, + "loss": 0.032, + "step": 31934 + }, + { + "epoch": 3.7869085734614014, + "grad_norm": 0.3273884936506529, + "learning_rate": 3.7119461450976225e-07, + "loss": 0.0103, + "step": 31935 + }, + { + "epoch": 3.7870271552235266, + "grad_norm": 0.3246141724687696, + "learning_rate": 3.7078262641414007e-07, + "loss": 0.0158, + "step": 31936 + }, + { + "epoch": 3.787145736985652, + "grad_norm": 0.5196106242495413, + "learning_rate": 3.7037086536969856e-07, + "loss": 0.0228, + "step": 31937 + }, + { + "epoch": 3.7872643187477766, + "grad_norm": 0.4542673971911658, + "learning_rate": 3.6995933138023465e-07, + "loss": 0.0188, + "step": 31938 + }, + { + "epoch": 3.7873829005099013, + "grad_norm": 0.6408129719994126, + "learning_rate": 3.695480244495425e-07, + "loss": 0.0239, + "step": 31939 + }, + { + "epoch": 3.7875014822720265, + "grad_norm": 0.2756190895843875, + "learning_rate": 3.691369445814136e-07, + "loss": 0.0089, + "step": 31940 + }, + { + "epoch": 3.7876200640341517, + "grad_norm": 1.0251813646344898, + "learning_rate": 3.6872609177963925e-07, + "loss": 0.0626, + "step": 31941 + }, + { + "epoch": 3.7877386457962765, + "grad_norm": 0.4467478322422121, + "learning_rate": 3.683154660480026e-07, + "loss": 0.02, + "step": 31942 + }, + { + "epoch": 3.7878572275584013, + "grad_norm": 0.469438720438782, + "learning_rate": 3.6790506739029506e-07, + "loss": 0.0247, + "step": 31943 + }, + { + "epoch": 3.7879758093205265, + "grad_norm": 0.7211261383494686, + "learning_rate": 3.6749489581029415e-07, + "loss": 0.0421, + "step": 31944 + }, + { + "epoch": 3.7880943910826517, + "grad_norm": 0.4249242166248013, + "learning_rate": 3.670849513117858e-07, + "loss": 0.0187, + "step": 31945 + }, + { + "epoch": 3.7882129728447764, + "grad_norm": 0.5777901199685538, + "learning_rate": 3.666752338985474e-07, + "loss": 0.0211, + "step": 31946 + }, + { + "epoch": 3.788331554606901, + "grad_norm": 0.7365847035640102, + "learning_rate": 3.662657435743566e-07, + "loss": 0.0377, + "step": 31947 + }, + { + "epoch": 3.7884501363690264, + "grad_norm": 0.6003416197718658, + "learning_rate": 3.6585648034298536e-07, + "loss": 0.0282, + "step": 31948 + }, + { + "epoch": 3.7885687181311516, + "grad_norm": 0.6077705731763446, + "learning_rate": 3.654474442082084e-07, + "loss": 0.0333, + "step": 31949 + }, + { + "epoch": 3.7886872998932764, + "grad_norm": 0.568682327052585, + "learning_rate": 3.6503863517380054e-07, + "loss": 0.0198, + "step": 31950 + }, + { + "epoch": 3.7888058816554016, + "grad_norm": 0.6308824785396013, + "learning_rate": 3.646300532435254e-07, + "loss": 0.0291, + "step": 31951 + }, + { + "epoch": 3.7889244634175263, + "grad_norm": 0.4468702828821192, + "learning_rate": 3.6422169842115217e-07, + "loss": 0.0212, + "step": 31952 + }, + { + "epoch": 3.7890430451796515, + "grad_norm": 0.6111869380772533, + "learning_rate": 3.6381357071044455e-07, + "loss": 0.025, + "step": 31953 + }, + { + "epoch": 3.7891616269417763, + "grad_norm": 0.5617844267530999, + "learning_rate": 3.6340567011516335e-07, + "loss": 0.0294, + "step": 31954 + }, + { + "epoch": 3.7892802087039015, + "grad_norm": 0.3988322929750128, + "learning_rate": 3.629979966390695e-07, + "loss": 0.02, + "step": 31955 + }, + { + "epoch": 3.7893987904660262, + "grad_norm": 0.6329884751580458, + "learning_rate": 3.625905502859239e-07, + "loss": 0.0313, + "step": 31956 + }, + { + "epoch": 3.7895173722281514, + "grad_norm": 0.508061234788832, + "learning_rate": 3.621833310594819e-07, + "loss": 0.0186, + "step": 31957 + }, + { + "epoch": 3.789635953990276, + "grad_norm": 0.638010164855185, + "learning_rate": 3.6177633896349596e-07, + "loss": 0.0353, + "step": 31958 + }, + { + "epoch": 3.7897545357524014, + "grad_norm": 0.4998226018753507, + "learning_rate": 3.613695740017187e-07, + "loss": 0.0202, + "step": 31959 + }, + { + "epoch": 3.789873117514526, + "grad_norm": 0.5726592432517222, + "learning_rate": 3.609630361778998e-07, + "loss": 0.0253, + "step": 31960 + }, + { + "epoch": 3.7899916992766514, + "grad_norm": 0.5311231629193192, + "learning_rate": 3.605567254957892e-07, + "loss": 0.0285, + "step": 31961 + }, + { + "epoch": 3.790110281038776, + "grad_norm": 0.5997300727514464, + "learning_rate": 3.6015064195912553e-07, + "loss": 0.0341, + "step": 31962 + }, + { + "epoch": 3.7902288628009013, + "grad_norm": 0.7007199478093511, + "learning_rate": 3.5974478557166403e-07, + "loss": 0.0304, + "step": 31963 + }, + { + "epoch": 3.790347444563026, + "grad_norm": 0.6867472343810522, + "learning_rate": 3.593391563371351e-07, + "loss": 0.04, + "step": 31964 + }, + { + "epoch": 3.7904660263251513, + "grad_norm": 1.0958737170637616, + "learning_rate": 3.589337542592858e-07, + "loss": 0.0381, + "step": 31965 + }, + { + "epoch": 3.790584608087276, + "grad_norm": 0.45196482387139836, + "learning_rate": 3.5852857934184916e-07, + "loss": 0.0157, + "step": 31966 + }, + { + "epoch": 3.7907031898494012, + "grad_norm": 0.6010949165482949, + "learning_rate": 3.581236315885611e-07, + "loss": 0.0213, + "step": 31967 + }, + { + "epoch": 3.790821771611526, + "grad_norm": 0.4448819382942913, + "learning_rate": 3.5771891100315755e-07, + "loss": 0.0226, + "step": 31968 + }, + { + "epoch": 3.790940353373651, + "grad_norm": 0.2673606718171279, + "learning_rate": 3.5731441758936335e-07, + "loss": 0.0098, + "step": 31969 + }, + { + "epoch": 3.791058935135776, + "grad_norm": 0.6815906099876485, + "learning_rate": 3.569101513509143e-07, + "loss": 0.0321, + "step": 31970 + }, + { + "epoch": 3.791177516897901, + "grad_norm": 0.5580130431869675, + "learning_rate": 3.5650611229153253e-07, + "loss": 0.0196, + "step": 31971 + }, + { + "epoch": 3.791296098660026, + "grad_norm": 0.4263114470975018, + "learning_rate": 3.561023004149483e-07, + "loss": 0.0164, + "step": 31972 + }, + { + "epoch": 3.791414680422151, + "grad_norm": 0.6166040755258168, + "learning_rate": 3.5569871572487813e-07, + "loss": 0.0428, + "step": 31973 + }, + { + "epoch": 3.7915332621842763, + "grad_norm": 0.5644412485164052, + "learning_rate": 3.55295358225044e-07, + "loss": 0.028, + "step": 31974 + }, + { + "epoch": 3.791651843946401, + "grad_norm": 0.824731096964996, + "learning_rate": 3.548922279191652e-07, + "loss": 0.0322, + "step": 31975 + }, + { + "epoch": 3.791770425708526, + "grad_norm": 0.4208528552291573, + "learning_rate": 3.544893248109582e-07, + "loss": 0.0164, + "step": 31976 + }, + { + "epoch": 3.791889007470651, + "grad_norm": 0.5895615290252004, + "learning_rate": 3.5408664890413666e-07, + "loss": 0.0316, + "step": 31977 + }, + { + "epoch": 3.7920075892327763, + "grad_norm": 0.5719868065628717, + "learning_rate": 3.536842002024143e-07, + "loss": 0.0258, + "step": 31978 + }, + { + "epoch": 3.792126170994901, + "grad_norm": 0.92298278698909, + "learning_rate": 3.5328197870950484e-07, + "loss": 0.0318, + "step": 31979 + }, + { + "epoch": 3.7922447527570258, + "grad_norm": 0.3850797464551466, + "learning_rate": 3.528799844291053e-07, + "loss": 0.0168, + "step": 31980 + }, + { + "epoch": 3.792363334519151, + "grad_norm": 0.43618540704253156, + "learning_rate": 3.5247821736493216e-07, + "loss": 0.0199, + "step": 31981 + }, + { + "epoch": 3.792481916281276, + "grad_norm": 0.8445382150219234, + "learning_rate": 3.520766775206824e-07, + "loss": 0.0418, + "step": 31982 + }, + { + "epoch": 3.792600498043401, + "grad_norm": 0.5993230998050739, + "learning_rate": 3.516753649000615e-07, + "loss": 0.0384, + "step": 31983 + }, + { + "epoch": 3.7927190798055257, + "grad_norm": 0.6619232811437021, + "learning_rate": 3.512742795067692e-07, + "loss": 0.0317, + "step": 31984 + }, + { + "epoch": 3.792837661567651, + "grad_norm": 0.6463725213266769, + "learning_rate": 3.508734213444997e-07, + "loss": 0.0276, + "step": 31985 + }, + { + "epoch": 3.792956243329776, + "grad_norm": 0.6858320237209403, + "learning_rate": 3.5047279041695027e-07, + "loss": 0.0319, + "step": 31986 + }, + { + "epoch": 3.793074825091901, + "grad_norm": 0.5254694378737392, + "learning_rate": 3.5007238672781495e-07, + "loss": 0.0183, + "step": 31987 + }, + { + "epoch": 3.7931934068540256, + "grad_norm": 0.5307032318386261, + "learning_rate": 3.4967221028078534e-07, + "loss": 0.0312, + "step": 31988 + }, + { + "epoch": 3.793311988616151, + "grad_norm": 0.4861456123256489, + "learning_rate": 3.4927226107954735e-07, + "loss": 0.0226, + "step": 31989 + }, + { + "epoch": 3.793430570378276, + "grad_norm": 0.28882136801099273, + "learning_rate": 3.488725391277953e-07, + "loss": 0.0123, + "step": 31990 + }, + { + "epoch": 3.793549152140401, + "grad_norm": 0.7445896211386859, + "learning_rate": 3.484730444292067e-07, + "loss": 0.0305, + "step": 31991 + }, + { + "epoch": 3.7936677339025255, + "grad_norm": 0.39332387078829145, + "learning_rate": 3.4807377698746477e-07, + "loss": 0.0193, + "step": 31992 + }, + { + "epoch": 3.7937863156646507, + "grad_norm": 0.46003463135024497, + "learning_rate": 3.476747368062527e-07, + "loss": 0.0197, + "step": 31993 + }, + { + "epoch": 3.793904897426776, + "grad_norm": 0.6172089500783939, + "learning_rate": 3.472759238892537e-07, + "loss": 0.0252, + "step": 31994 + }, + { + "epoch": 3.7940234791889007, + "grad_norm": 0.6550485257653393, + "learning_rate": 3.4687733824013413e-07, + "loss": 0.036, + "step": 31995 + }, + { + "epoch": 3.7941420609510255, + "grad_norm": 0.30111780905952185, + "learning_rate": 3.464789798625773e-07, + "loss": 0.013, + "step": 31996 + }, + { + "epoch": 3.7942606427131507, + "grad_norm": 0.3885313252652289, + "learning_rate": 3.460808487602496e-07, + "loss": 0.0175, + "step": 31997 + }, + { + "epoch": 3.794379224475276, + "grad_norm": 1.0515107144062488, + "learning_rate": 3.456829449368232e-07, + "loss": 0.0594, + "step": 31998 + }, + { + "epoch": 3.7944978062374006, + "grad_norm": 0.536662251084361, + "learning_rate": 3.4528526839597285e-07, + "loss": 0.0299, + "step": 31999 + }, + { + "epoch": 3.794616387999526, + "grad_norm": 0.6280402784288501, + "learning_rate": 3.4488781914135126e-07, + "loss": 0.0303, + "step": 32000 + }, + { + "epoch": 3.7947349697616506, + "grad_norm": 0.44196091911921875, + "learning_rate": 3.4449059717663324e-07, + "loss": 0.0155, + "step": 32001 + }, + { + "epoch": 3.794853551523776, + "grad_norm": 0.4622741714200245, + "learning_rate": 3.4409360250547697e-07, + "loss": 0.0189, + "step": 32002 + }, + { + "epoch": 3.7949721332859006, + "grad_norm": 0.6440184347639555, + "learning_rate": 3.4369683513154336e-07, + "loss": 0.0292, + "step": 32003 + }, + { + "epoch": 3.7950907150480258, + "grad_norm": 0.5495249172490893, + "learning_rate": 3.433002950584879e-07, + "loss": 0.033, + "step": 32004 + }, + { + "epoch": 3.7952092968101505, + "grad_norm": 0.40329860373661286, + "learning_rate": 3.429039822899688e-07, + "loss": 0.016, + "step": 32005 + }, + { + "epoch": 3.7953278785722757, + "grad_norm": 0.5460344871339584, + "learning_rate": 3.4250789682964135e-07, + "loss": 0.0294, + "step": 32006 + }, + { + "epoch": 3.7954464603344005, + "grad_norm": 0.47067410470609183, + "learning_rate": 3.421120386811472e-07, + "loss": 0.0158, + "step": 32007 + }, + { + "epoch": 3.7955650420965257, + "grad_norm": 0.6488633970665022, + "learning_rate": 3.4171640784815e-07, + "loss": 0.0298, + "step": 32008 + }, + { + "epoch": 3.7956836238586504, + "grad_norm": 0.7250654991596538, + "learning_rate": 3.4132100433428296e-07, + "loss": 0.0327, + "step": 32009 + }, + { + "epoch": 3.7958022056207756, + "grad_norm": 0.6095127445579519, + "learning_rate": 3.409258281432043e-07, + "loss": 0.0239, + "step": 32010 + }, + { + "epoch": 3.7959207873829004, + "grad_norm": 0.5129245771601799, + "learning_rate": 3.4053087927854446e-07, + "loss": 0.0269, + "step": 32011 + }, + { + "epoch": 3.7960393691450256, + "grad_norm": 0.4826941739688155, + "learning_rate": 3.4013615774395323e-07, + "loss": 0.0211, + "step": 32012 + }, + { + "epoch": 3.7961579509071504, + "grad_norm": 0.89618252946814, + "learning_rate": 3.397416635430667e-07, + "loss": 0.0473, + "step": 32013 + }, + { + "epoch": 3.7962765326692756, + "grad_norm": 0.37934123676121706, + "learning_rate": 3.3934739667952076e-07, + "loss": 0.0159, + "step": 32014 + }, + { + "epoch": 3.7963951144314003, + "grad_norm": 0.3194797688589997, + "learning_rate": 3.3895335715695145e-07, + "loss": 0.0084, + "step": 32015 + }, + { + "epoch": 3.7965136961935255, + "grad_norm": 0.43295016193854907, + "learning_rate": 3.385595449789919e-07, + "loss": 0.0195, + "step": 32016 + }, + { + "epoch": 3.7966322779556503, + "grad_norm": 0.6018257258167681, + "learning_rate": 3.381659601492726e-07, + "loss": 0.0266, + "step": 32017 + }, + { + "epoch": 3.7967508597177755, + "grad_norm": 0.3947752122193301, + "learning_rate": 3.3777260267141565e-07, + "loss": 0.0175, + "step": 32018 + }, + { + "epoch": 3.7968694414799002, + "grad_norm": 0.5733467772497082, + "learning_rate": 3.37379472549057e-07, + "loss": 0.0364, + "step": 32019 + }, + { + "epoch": 3.7969880232420254, + "grad_norm": 0.3857041773390212, + "learning_rate": 3.369865697858132e-07, + "loss": 0.014, + "step": 32020 + }, + { + "epoch": 3.79710660500415, + "grad_norm": 0.7826137028933772, + "learning_rate": 3.365938943853147e-07, + "loss": 0.0346, + "step": 32021 + }, + { + "epoch": 3.7972251867662754, + "grad_norm": 0.46450191327546786, + "learning_rate": 3.362014463511698e-07, + "loss": 0.02, + "step": 32022 + }, + { + "epoch": 3.7973437685284006, + "grad_norm": 0.4185698522889124, + "learning_rate": 3.35809225687006e-07, + "loss": 0.0157, + "step": 32023 + }, + { + "epoch": 3.7974623502905254, + "grad_norm": 0.36972658826700744, + "learning_rate": 3.354172323964372e-07, + "loss": 0.0116, + "step": 32024 + }, + { + "epoch": 3.79758093205265, + "grad_norm": 0.4055682571347903, + "learning_rate": 3.350254664830743e-07, + "loss": 0.0186, + "step": 32025 + }, + { + "epoch": 3.7976995138147753, + "grad_norm": 0.36082444154350996, + "learning_rate": 3.346339279505284e-07, + "loss": 0.0173, + "step": 32026 + }, + { + "epoch": 3.7978180955769005, + "grad_norm": 0.4776012675286623, + "learning_rate": 3.342426168024132e-07, + "loss": 0.0181, + "step": 32027 + }, + { + "epoch": 3.7979366773390253, + "grad_norm": 0.2441742671489448, + "learning_rate": 3.3385153304233417e-07, + "loss": 0.0075, + "step": 32028 + }, + { + "epoch": 3.79805525910115, + "grad_norm": 0.4032025987400549, + "learning_rate": 3.334606766738968e-07, + "loss": 0.0176, + "step": 32029 + }, + { + "epoch": 3.7981738408632753, + "grad_norm": 0.523345768171412, + "learning_rate": 3.3307004770070093e-07, + "loss": 0.0224, + "step": 32030 + }, + { + "epoch": 3.7982924226254005, + "grad_norm": 0.3440080086923662, + "learning_rate": 3.3267964612635484e-07, + "loss": 0.0147, + "step": 32031 + }, + { + "epoch": 3.798411004387525, + "grad_norm": 0.3626646971249543, + "learning_rate": 3.3228947195445e-07, + "loss": 0.0144, + "step": 32032 + }, + { + "epoch": 3.79852958614965, + "grad_norm": 0.750829083790796, + "learning_rate": 3.3189952518858646e-07, + "loss": 0.0309, + "step": 32033 + }, + { + "epoch": 3.798648167911775, + "grad_norm": 0.765692895533417, + "learning_rate": 3.31509805832364e-07, + "loss": 0.0378, + "step": 32034 + }, + { + "epoch": 3.7987667496739004, + "grad_norm": 0.475585022325879, + "learning_rate": 3.3112031388936595e-07, + "loss": 0.0217, + "step": 32035 + }, + { + "epoch": 3.798885331436025, + "grad_norm": 0.7635317826972164, + "learning_rate": 3.3073104936318935e-07, + "loss": 0.0416, + "step": 32036 + }, + { + "epoch": 3.79900391319815, + "grad_norm": 0.9572106174862409, + "learning_rate": 3.303420122574202e-07, + "loss": 0.0314, + "step": 32037 + }, + { + "epoch": 3.799122494960275, + "grad_norm": 0.553696279875005, + "learning_rate": 3.299532025756447e-07, + "loss": 0.0253, + "step": 32038 + }, + { + "epoch": 3.7992410767224003, + "grad_norm": 0.5600736243460904, + "learning_rate": 3.295646203214514e-07, + "loss": 0.0217, + "step": 32039 + }, + { + "epoch": 3.799359658484525, + "grad_norm": 0.4636754865267618, + "learning_rate": 3.2917626549841816e-07, + "loss": 0.024, + "step": 32040 + }, + { + "epoch": 3.79947824024665, + "grad_norm": 0.6723779372690409, + "learning_rate": 3.2878813811012544e-07, + "loss": 0.036, + "step": 32041 + }, + { + "epoch": 3.799596822008775, + "grad_norm": 0.44666651017318737, + "learning_rate": 3.2840023816015087e-07, + "loss": 0.0224, + "step": 32042 + }, + { + "epoch": 3.7997154037709002, + "grad_norm": 0.5440321738303014, + "learning_rate": 3.280125656520777e-07, + "loss": 0.0352, + "step": 32043 + }, + { + "epoch": 3.799833985533025, + "grad_norm": 0.5211361513039947, + "learning_rate": 3.276251205894698e-07, + "loss": 0.0281, + "step": 32044 + }, + { + "epoch": 3.7999525672951497, + "grad_norm": 0.5912881771001252, + "learning_rate": 3.2723790297590205e-07, + "loss": 0.0241, + "step": 32045 + }, + { + "epoch": 3.800071149057275, + "grad_norm": 0.6782233748381558, + "learning_rate": 3.268509128149466e-07, + "loss": 0.038, + "step": 32046 + }, + { + "epoch": 3.8001897308194, + "grad_norm": 0.3804977987958303, + "learning_rate": 3.2646415011016727e-07, + "loss": 0.0145, + "step": 32047 + }, + { + "epoch": 3.800308312581525, + "grad_norm": 0.662231140053833, + "learning_rate": 3.2607761486513345e-07, + "loss": 0.0229, + "step": 32048 + }, + { + "epoch": 3.80042689434365, + "grad_norm": 0.6877833916737834, + "learning_rate": 3.256913070834061e-07, + "loss": 0.0265, + "step": 32049 + }, + { + "epoch": 3.800545476105775, + "grad_norm": 0.8368045644393477, + "learning_rate": 3.2530522676854914e-07, + "loss": 0.0331, + "step": 32050 + }, + { + "epoch": 3.8006640578679, + "grad_norm": 0.4924506940588645, + "learning_rate": 3.2491937392412076e-07, + "loss": 0.0241, + "step": 32051 + }, + { + "epoch": 3.800782639630025, + "grad_norm": 0.5972970916217315, + "learning_rate": 3.2453374855367367e-07, + "loss": 0.0341, + "step": 32052 + }, + { + "epoch": 3.80090122139215, + "grad_norm": 0.817575284271348, + "learning_rate": 3.2414835066076897e-07, + "loss": 0.0302, + "step": 32053 + }, + { + "epoch": 3.801019803154275, + "grad_norm": 0.5706619896285653, + "learning_rate": 3.237631802489538e-07, + "loss": 0.0262, + "step": 32054 + }, + { + "epoch": 3.8011383849164, + "grad_norm": 0.6560760255223475, + "learning_rate": 3.233782373217864e-07, + "loss": 0.028, + "step": 32055 + }, + { + "epoch": 3.8012569666785248, + "grad_norm": 0.6119848988173971, + "learning_rate": 3.229935218828084e-07, + "loss": 0.0233, + "step": 32056 + }, + { + "epoch": 3.80137554844065, + "grad_norm": 0.9422633298683452, + "learning_rate": 3.2260903393556974e-07, + "loss": 0.0315, + "step": 32057 + }, + { + "epoch": 3.8014941302027747, + "grad_norm": 0.3013050254933717, + "learning_rate": 3.2222477348361477e-07, + "loss": 0.0122, + "step": 32058 + }, + { + "epoch": 3.8016127119649, + "grad_norm": 0.588623595254943, + "learning_rate": 3.218407405304852e-07, + "loss": 0.0316, + "step": 32059 + }, + { + "epoch": 3.8017312937270247, + "grad_norm": 0.8919653545449426, + "learning_rate": 3.2145693507971974e-07, + "loss": 0.0483, + "step": 32060 + }, + { + "epoch": 3.80184987548915, + "grad_norm": 0.42675080554639677, + "learning_rate": 3.2107335713486284e-07, + "loss": 0.0167, + "step": 32061 + }, + { + "epoch": 3.8019684572512746, + "grad_norm": 0.531459638507006, + "learning_rate": 3.2069000669944504e-07, + "loss": 0.0184, + "step": 32062 + }, + { + "epoch": 3.8020870390134, + "grad_norm": 0.3182891720999379, + "learning_rate": 3.203068837769996e-07, + "loss": 0.0157, + "step": 32063 + }, + { + "epoch": 3.8022056207755246, + "grad_norm": 0.5889466607787966, + "learning_rate": 3.1992398837105976e-07, + "loss": 0.0226, + "step": 32064 + }, + { + "epoch": 3.80232420253765, + "grad_norm": 0.7308325999502061, + "learning_rate": 3.1954132048515886e-07, + "loss": 0.0393, + "step": 32065 + }, + { + "epoch": 3.8024427842997746, + "grad_norm": 0.362475521193777, + "learning_rate": 3.191588801228218e-07, + "loss": 0.0138, + "step": 32066 + }, + { + "epoch": 3.8025613660618998, + "grad_norm": 0.4238304208673787, + "learning_rate": 3.1877666728757084e-07, + "loss": 0.02, + "step": 32067 + }, + { + "epoch": 3.8026799478240245, + "grad_norm": 0.321656604164187, + "learning_rate": 3.1839468198293644e-07, + "loss": 0.0128, + "step": 32068 + }, + { + "epoch": 3.8027985295861497, + "grad_norm": 0.438210637521601, + "learning_rate": 3.180129242124352e-07, + "loss": 0.0214, + "step": 32069 + }, + { + "epoch": 3.8029171113482745, + "grad_norm": 0.36156277096268574, + "learning_rate": 3.176313939795894e-07, + "loss": 0.0187, + "step": 32070 + }, + { + "epoch": 3.8030356931103997, + "grad_norm": 0.5594457925522613, + "learning_rate": 3.1725009128791007e-07, + "loss": 0.0245, + "step": 32071 + }, + { + "epoch": 3.8031542748725244, + "grad_norm": 0.30498627428708536, + "learning_rate": 3.1686901614091937e-07, + "loss": 0.0117, + "step": 32072 + }, + { + "epoch": 3.8032728566346496, + "grad_norm": 0.547039815985683, + "learning_rate": 3.1648816854212837e-07, + "loss": 0.0291, + "step": 32073 + }, + { + "epoch": 3.8033914383967744, + "grad_norm": 0.509459396904311, + "learning_rate": 3.161075484950482e-07, + "loss": 0.0226, + "step": 32074 + }, + { + "epoch": 3.8035100201588996, + "grad_norm": 0.39319273856593684, + "learning_rate": 3.1572715600318436e-07, + "loss": 0.0178, + "step": 32075 + }, + { + "epoch": 3.803628601921025, + "grad_norm": 0.8279689407482818, + "learning_rate": 3.1534699107004795e-07, + "loss": 0.042, + "step": 32076 + }, + { + "epoch": 3.8037471836831496, + "grad_norm": 0.5079743038531088, + "learning_rate": 3.1496705369914446e-07, + "loss": 0.0179, + "step": 32077 + }, + { + "epoch": 3.8038657654452743, + "grad_norm": 0.7613423154582344, + "learning_rate": 3.145873438939684e-07, + "loss": 0.0383, + "step": 32078 + }, + { + "epoch": 3.8039843472073995, + "grad_norm": 0.514980523284161, + "learning_rate": 3.1420786165803076e-07, + "loss": 0.0195, + "step": 32079 + }, + { + "epoch": 3.8041029289695247, + "grad_norm": 0.8085188980517779, + "learning_rate": 3.138286069948232e-07, + "loss": 0.0395, + "step": 32080 + }, + { + "epoch": 3.8042215107316495, + "grad_norm": 0.7043556742249236, + "learning_rate": 3.1344957990784573e-07, + "loss": 0.032, + "step": 32081 + }, + { + "epoch": 3.8043400924937743, + "grad_norm": 0.46454222624150804, + "learning_rate": 3.1307078040058725e-07, + "loss": 0.0189, + "step": 32082 + }, + { + "epoch": 3.8044586742558995, + "grad_norm": 0.43470727433312056, + "learning_rate": 3.126922084765449e-07, + "loss": 0.0237, + "step": 32083 + }, + { + "epoch": 3.8045772560180247, + "grad_norm": 0.6467265686650456, + "learning_rate": 3.1231386413920485e-07, + "loss": 0.0203, + "step": 32084 + }, + { + "epoch": 3.8046958377801494, + "grad_norm": 0.8446253394920026, + "learning_rate": 3.119357473920587e-07, + "loss": 0.0418, + "step": 32085 + }, + { + "epoch": 3.804814419542274, + "grad_norm": 0.6933481695766333, + "learning_rate": 3.1155785823858973e-07, + "loss": 0.0297, + "step": 32086 + }, + { + "epoch": 3.8049330013043994, + "grad_norm": 0.6758497281701533, + "learning_rate": 3.1118019668228137e-07, + "loss": 0.0393, + "step": 32087 + }, + { + "epoch": 3.8050515830665246, + "grad_norm": 0.35198321742268096, + "learning_rate": 3.1080276272661966e-07, + "loss": 0.0198, + "step": 32088 + }, + { + "epoch": 3.8051701648286493, + "grad_norm": 0.3947356595009336, + "learning_rate": 3.1042555637507686e-07, + "loss": 0.0201, + "step": 32089 + }, + { + "epoch": 3.805288746590774, + "grad_norm": 0.6436727139003262, + "learning_rate": 3.1004857763113624e-07, + "loss": 0.0235, + "step": 32090 + }, + { + "epoch": 3.8054073283528993, + "grad_norm": 0.39725264891251505, + "learning_rate": 3.096718264982701e-07, + "loss": 0.0139, + "step": 32091 + }, + { + "epoch": 3.8055259101150245, + "grad_norm": 0.3018989310642143, + "learning_rate": 3.0929530297995333e-07, + "loss": 0.0135, + "step": 32092 + }, + { + "epoch": 3.8056444918771493, + "grad_norm": 0.5616134769005469, + "learning_rate": 3.089190070796527e-07, + "loss": 0.0201, + "step": 32093 + }, + { + "epoch": 3.805763073639274, + "grad_norm": 0.5462253643998886, + "learning_rate": 3.0854293880084593e-07, + "loss": 0.0287, + "step": 32094 + }, + { + "epoch": 3.805881655401399, + "grad_norm": 0.5189256903485824, + "learning_rate": 3.081670981469914e-07, + "loss": 0.0206, + "step": 32095 + }, + { + "epoch": 3.8060002371635244, + "grad_norm": 0.2525075201867821, + "learning_rate": 3.077914851215585e-07, + "loss": 0.0078, + "step": 32096 + }, + { + "epoch": 3.806118818925649, + "grad_norm": 0.33026433980000824, + "learning_rate": 3.074160997280057e-07, + "loss": 0.0138, + "step": 32097 + }, + { + "epoch": 3.8062374006877744, + "grad_norm": 0.3927463063330785, + "learning_rate": 3.0704094196979947e-07, + "loss": 0.0191, + "step": 32098 + }, + { + "epoch": 3.806355982449899, + "grad_norm": 0.61555998266712, + "learning_rate": 3.066660118503956e-07, + "loss": 0.0239, + "step": 32099 + }, + { + "epoch": 3.8064745642120243, + "grad_norm": 0.7598710090053276, + "learning_rate": 3.0629130937324667e-07, + "loss": 0.0393, + "step": 32100 + }, + { + "epoch": 3.806593145974149, + "grad_norm": 1.0576635724729997, + "learning_rate": 3.05916834541814e-07, + "loss": 0.0575, + "step": 32101 + }, + { + "epoch": 3.8067117277362743, + "grad_norm": 0.3993152014116616, + "learning_rate": 3.055425873595419e-07, + "loss": 0.0239, + "step": 32102 + }, + { + "epoch": 3.806830309498399, + "grad_norm": 0.6733542696420486, + "learning_rate": 3.051685678298888e-07, + "loss": 0.0356, + "step": 32103 + }, + { + "epoch": 3.8069488912605243, + "grad_norm": 0.500464831214322, + "learning_rate": 3.0479477595629636e-07, + "loss": 0.0217, + "step": 32104 + }, + { + "epoch": 3.807067473022649, + "grad_norm": 0.3312797896224538, + "learning_rate": 3.044212117422146e-07, + "loss": 0.0137, + "step": 32105 + }, + { + "epoch": 3.8071860547847742, + "grad_norm": 0.45716445593466887, + "learning_rate": 3.0404787519108523e-07, + "loss": 0.0173, + "step": 32106 + }, + { + "epoch": 3.807304636546899, + "grad_norm": 0.3211063088180714, + "learning_rate": 3.0367476630634995e-07, + "loss": 0.0118, + "step": 32107 + }, + { + "epoch": 3.807423218309024, + "grad_norm": 0.4084646214768792, + "learning_rate": 3.033018850914476e-07, + "loss": 0.0152, + "step": 32108 + }, + { + "epoch": 3.807541800071149, + "grad_norm": 0.4990642646643085, + "learning_rate": 3.0292923154981723e-07, + "loss": 0.0271, + "step": 32109 + }, + { + "epoch": 3.807660381833274, + "grad_norm": 0.6475374536322294, + "learning_rate": 3.0255680568489764e-07, + "loss": 0.0287, + "step": 32110 + }, + { + "epoch": 3.807778963595399, + "grad_norm": 0.3767821490455209, + "learning_rate": 3.021846075001139e-07, + "loss": 0.0168, + "step": 32111 + }, + { + "epoch": 3.807897545357524, + "grad_norm": 0.4943377902742331, + "learning_rate": 3.0181263699890215e-07, + "loss": 0.0188, + "step": 32112 + }, + { + "epoch": 3.808016127119649, + "grad_norm": 0.36479195021532557, + "learning_rate": 3.0144089418469304e-07, + "loss": 0.0136, + "step": 32113 + }, + { + "epoch": 3.808134708881774, + "grad_norm": 0.5043447536590082, + "learning_rate": 3.010693790609087e-07, + "loss": 0.0338, + "step": 32114 + }, + { + "epoch": 3.808253290643899, + "grad_norm": 0.5528151844315454, + "learning_rate": 3.006980916309798e-07, + "loss": 0.0219, + "step": 32115 + }, + { + "epoch": 3.808371872406024, + "grad_norm": 0.5165873487304844, + "learning_rate": 3.0032703189832314e-07, + "loss": 0.0277, + "step": 32116 + }, + { + "epoch": 3.808490454168149, + "grad_norm": 0.7942713850954259, + "learning_rate": 2.999561998663636e-07, + "loss": 0.0378, + "step": 32117 + }, + { + "epoch": 3.808609035930274, + "grad_norm": 0.6961625744568131, + "learning_rate": 2.9958559553852074e-07, + "loss": 0.0354, + "step": 32118 + }, + { + "epoch": 3.8087276176923988, + "grad_norm": 0.6469532821970493, + "learning_rate": 2.992152189182057e-07, + "loss": 0.0339, + "step": 32119 + }, + { + "epoch": 3.808846199454524, + "grad_norm": 0.4847831975174317, + "learning_rate": 2.98845070008838e-07, + "loss": 0.0206, + "step": 32120 + }, + { + "epoch": 3.8089647812166487, + "grad_norm": 0.3605426488564424, + "learning_rate": 2.9847514881382876e-07, + "loss": 0.0188, + "step": 32121 + }, + { + "epoch": 3.809083362978774, + "grad_norm": 0.7590539188283542, + "learning_rate": 2.981054553365864e-07, + "loss": 0.0333, + "step": 32122 + }, + { + "epoch": 3.8092019447408987, + "grad_norm": 0.6035542228609977, + "learning_rate": 2.9773598958052205e-07, + "loss": 0.0273, + "step": 32123 + }, + { + "epoch": 3.809320526503024, + "grad_norm": 0.4979172267200873, + "learning_rate": 2.973667515490386e-07, + "loss": 0.0143, + "step": 32124 + }, + { + "epoch": 3.809439108265149, + "grad_norm": 0.4557807647319833, + "learning_rate": 2.969977412455416e-07, + "loss": 0.0223, + "step": 32125 + }, + { + "epoch": 3.809557690027274, + "grad_norm": 0.509152378761844, + "learning_rate": 2.9662895867343387e-07, + "loss": 0.0222, + "step": 32126 + }, + { + "epoch": 3.8096762717893986, + "grad_norm": 0.3595126656861218, + "learning_rate": 2.9626040383611e-07, + "loss": 0.0177, + "step": 32127 + }, + { + "epoch": 3.809794853551524, + "grad_norm": 0.497496558177567, + "learning_rate": 2.958920767369755e-07, + "loss": 0.0245, + "step": 32128 + }, + { + "epoch": 3.809913435313649, + "grad_norm": 0.5400028938142566, + "learning_rate": 2.9552397737941937e-07, + "loss": 0.0273, + "step": 32129 + }, + { + "epoch": 3.8100320170757738, + "grad_norm": 0.7629963517153588, + "learning_rate": 2.951561057668417e-07, + "loss": 0.0282, + "step": 32130 + }, + { + "epoch": 3.8101505988378985, + "grad_norm": 0.3986755723248289, + "learning_rate": 2.947884619026231e-07, + "loss": 0.0163, + "step": 32131 + }, + { + "epoch": 3.8102691806000237, + "grad_norm": 0.6054364225380848, + "learning_rate": 2.944210457901636e-07, + "loss": 0.0256, + "step": 32132 + }, + { + "epoch": 3.810387762362149, + "grad_norm": 0.568111852209842, + "learning_rate": 2.940538574328466e-07, + "loss": 0.0267, + "step": 32133 + }, + { + "epoch": 3.8105063441242737, + "grad_norm": 0.960676663239833, + "learning_rate": 2.936868968340528e-07, + "loss": 0.0475, + "step": 32134 + }, + { + "epoch": 3.8106249258863985, + "grad_norm": 0.6167569819632651, + "learning_rate": 2.933201639971711e-07, + "loss": 0.0278, + "step": 32135 + }, + { + "epoch": 3.8107435076485237, + "grad_norm": 0.6558509091234292, + "learning_rate": 2.9295365892558214e-07, + "loss": 0.0281, + "step": 32136 + }, + { + "epoch": 3.810862089410649, + "grad_norm": 0.6174441140804513, + "learning_rate": 2.92587381622661e-07, + "loss": 0.0346, + "step": 32137 + }, + { + "epoch": 3.8109806711727736, + "grad_norm": 0.344222166350809, + "learning_rate": 2.9222133209178546e-07, + "loss": 0.0173, + "step": 32138 + }, + { + "epoch": 3.8110992529348984, + "grad_norm": 0.4380606430623032, + "learning_rate": 2.918555103363307e-07, + "loss": 0.0222, + "step": 32139 + }, + { + "epoch": 3.8112178346970236, + "grad_norm": 0.44861400352474984, + "learning_rate": 2.91489916359669e-07, + "loss": 0.0181, + "step": 32140 + }, + { + "epoch": 3.811336416459149, + "grad_norm": 0.3632571935040418, + "learning_rate": 2.911245501651727e-07, + "loss": 0.0109, + "step": 32141 + }, + { + "epoch": 3.8114549982212735, + "grad_norm": 0.7596862346415664, + "learning_rate": 2.907594117562057e-07, + "loss": 0.0313, + "step": 32142 + }, + { + "epoch": 3.8115735799833983, + "grad_norm": 0.49370307743134734, + "learning_rate": 2.9039450113613753e-07, + "loss": 0.022, + "step": 32143 + }, + { + "epoch": 3.8116921617455235, + "grad_norm": 0.596876350266793, + "learning_rate": 2.900298183083294e-07, + "loss": 0.025, + "step": 32144 + }, + { + "epoch": 3.8118107435076487, + "grad_norm": 0.4070314030500513, + "learning_rate": 2.8966536327614813e-07, + "loss": 0.0146, + "step": 32145 + }, + { + "epoch": 3.8119293252697735, + "grad_norm": 0.6682084852075161, + "learning_rate": 2.893011360429465e-07, + "loss": 0.0379, + "step": 32146 + }, + { + "epoch": 3.812047907031898, + "grad_norm": 0.7713628780711353, + "learning_rate": 2.8893713661208854e-07, + "loss": 0.0411, + "step": 32147 + }, + { + "epoch": 3.8121664887940234, + "grad_norm": 0.5252344007130583, + "learning_rate": 2.8857336498692713e-07, + "loss": 0.0264, + "step": 32148 + }, + { + "epoch": 3.8122850705561486, + "grad_norm": 0.696424277436016, + "learning_rate": 2.882098211708151e-07, + "loss": 0.0183, + "step": 32149 + }, + { + "epoch": 3.8124036523182734, + "grad_norm": 0.5410910609887338, + "learning_rate": 2.878465051671081e-07, + "loss": 0.0222, + "step": 32150 + }, + { + "epoch": 3.8125222340803986, + "grad_norm": 0.39307738535373016, + "learning_rate": 2.874834169791507e-07, + "loss": 0.0213, + "step": 32151 + }, + { + "epoch": 3.8126408158425233, + "grad_norm": 0.3639490551697164, + "learning_rate": 2.8712055661029025e-07, + "loss": 0.0146, + "step": 32152 + }, + { + "epoch": 3.8127593976046485, + "grad_norm": 0.5230112626611549, + "learning_rate": 2.86757924063874e-07, + "loss": 0.0223, + "step": 32153 + }, + { + "epoch": 3.8128779793667733, + "grad_norm": 0.4175185836371759, + "learning_rate": 2.863955193432438e-07, + "loss": 0.0182, + "step": 32154 + }, + { + "epoch": 3.8129965611288985, + "grad_norm": 0.5987133493123983, + "learning_rate": 2.860333424517442e-07, + "loss": 0.0224, + "step": 32155 + }, + { + "epoch": 3.8131151428910233, + "grad_norm": 0.5778298840508103, + "learning_rate": 2.8567139339270577e-07, + "loss": 0.0238, + "step": 32156 + }, + { + "epoch": 3.8132337246531485, + "grad_norm": 0.48099292484348594, + "learning_rate": 2.853096721694731e-07, + "loss": 0.0203, + "step": 32157 + }, + { + "epoch": 3.8133523064152732, + "grad_norm": 0.6205378948887058, + "learning_rate": 2.849481787853797e-07, + "loss": 0.0318, + "step": 32158 + }, + { + "epoch": 3.8134708881773984, + "grad_norm": 0.5864162671376884, + "learning_rate": 2.845869132437562e-07, + "loss": 0.0208, + "step": 32159 + }, + { + "epoch": 3.813589469939523, + "grad_norm": 0.4239473056638681, + "learning_rate": 2.8422587554793044e-07, + "loss": 0.0142, + "step": 32160 + }, + { + "epoch": 3.8137080517016484, + "grad_norm": 0.5561876994845704, + "learning_rate": 2.838650657012359e-07, + "loss": 0.0244, + "step": 32161 + }, + { + "epoch": 3.813826633463773, + "grad_norm": 0.6096188508105082, + "learning_rate": 2.8350448370699503e-07, + "loss": 0.0291, + "step": 32162 + }, + { + "epoch": 3.8139452152258984, + "grad_norm": 0.4373125667688239, + "learning_rate": 2.831441295685328e-07, + "loss": 0.021, + "step": 32163 + }, + { + "epoch": 3.814063796988023, + "grad_norm": 0.5156895134029705, + "learning_rate": 2.827840032891743e-07, + "loss": 0.0344, + "step": 32164 + }, + { + "epoch": 3.8141823787501483, + "grad_norm": 0.514217433159126, + "learning_rate": 2.824241048722365e-07, + "loss": 0.0248, + "step": 32165 + }, + { + "epoch": 3.814300960512273, + "grad_norm": 0.6192485951374296, + "learning_rate": 2.820644343210388e-07, + "loss": 0.0333, + "step": 32166 + }, + { + "epoch": 3.8144195422743983, + "grad_norm": 0.8838464215050871, + "learning_rate": 2.8170499163889253e-07, + "loss": 0.0363, + "step": 32167 + }, + { + "epoch": 3.814538124036523, + "grad_norm": 0.5453269974764499, + "learning_rate": 2.8134577682911724e-07, + "loss": 0.0247, + "step": 32168 + }, + { + "epoch": 3.8146567057986482, + "grad_norm": 0.559958577048024, + "learning_rate": 2.8098678989502136e-07, + "loss": 0.0254, + "step": 32169 + }, + { + "epoch": 3.814775287560773, + "grad_norm": 0.41172191714659967, + "learning_rate": 2.8062803083991617e-07, + "loss": 0.0161, + "step": 32170 + }, + { + "epoch": 3.814893869322898, + "grad_norm": 0.46594771121041234, + "learning_rate": 2.802694996671046e-07, + "loss": 0.0237, + "step": 32171 + }, + { + "epoch": 3.815012451085023, + "grad_norm": 0.526639268727333, + "learning_rate": 2.7991119637989504e-07, + "loss": 0.0196, + "step": 32172 + }, + { + "epoch": 3.815131032847148, + "grad_norm": 0.6194033140248086, + "learning_rate": 2.7955312098159326e-07, + "loss": 0.0294, + "step": 32173 + }, + { + "epoch": 3.8152496146092734, + "grad_norm": 0.40909722980669366, + "learning_rate": 2.7919527347549657e-07, + "loss": 0.0174, + "step": 32174 + }, + { + "epoch": 3.815368196371398, + "grad_norm": 0.6547848852081021, + "learning_rate": 2.7883765386490236e-07, + "loss": 0.0299, + "step": 32175 + }, + { + "epoch": 3.815486778133523, + "grad_norm": 0.39778716266662484, + "learning_rate": 2.784802621531107e-07, + "loss": 0.0169, + "step": 32176 + }, + { + "epoch": 3.815605359895648, + "grad_norm": 0.5050673730790161, + "learning_rate": 2.781230983434191e-07, + "loss": 0.0202, + "step": 32177 + }, + { + "epoch": 3.8157239416577733, + "grad_norm": 0.3611663462208305, + "learning_rate": 2.7776616243911366e-07, + "loss": 0.0119, + "step": 32178 + }, + { + "epoch": 3.815842523419898, + "grad_norm": 0.7727142040202052, + "learning_rate": 2.774094544434891e-07, + "loss": 0.0322, + "step": 32179 + }, + { + "epoch": 3.815961105182023, + "grad_norm": 0.5526738371916118, + "learning_rate": 2.7705297435983156e-07, + "loss": 0.0254, + "step": 32180 + }, + { + "epoch": 3.816079686944148, + "grad_norm": 0.6368852707357536, + "learning_rate": 2.766967221914302e-07, + "loss": 0.0351, + "step": 32181 + }, + { + "epoch": 3.816198268706273, + "grad_norm": 0.8199680520175389, + "learning_rate": 2.763406979415628e-07, + "loss": 0.0333, + "step": 32182 + }, + { + "epoch": 3.816316850468398, + "grad_norm": 0.642397478634977, + "learning_rate": 2.759849016135185e-07, + "loss": 0.0258, + "step": 32183 + }, + { + "epoch": 3.8164354322305227, + "grad_norm": 0.6666426205089777, + "learning_rate": 2.7562933321057525e-07, + "loss": 0.0283, + "step": 32184 + }, + { + "epoch": 3.816554013992648, + "grad_norm": 0.739024718972567, + "learning_rate": 2.7527399273601094e-07, + "loss": 0.0398, + "step": 32185 + }, + { + "epoch": 3.816672595754773, + "grad_norm": 0.7535639307639996, + "learning_rate": 2.7491888019310074e-07, + "loss": 0.0211, + "step": 32186 + }, + { + "epoch": 3.816791177516898, + "grad_norm": 0.4370430186018234, + "learning_rate": 2.7456399558511705e-07, + "loss": 0.0187, + "step": 32187 + }, + { + "epoch": 3.8169097592790227, + "grad_norm": 0.6517119258880838, + "learning_rate": 2.7420933891533493e-07, + "loss": 0.0254, + "step": 32188 + }, + { + "epoch": 3.817028341041148, + "grad_norm": 0.5518762920010124, + "learning_rate": 2.738549101870214e-07, + "loss": 0.0248, + "step": 32189 + }, + { + "epoch": 3.817146922803273, + "grad_norm": 0.5401496396390258, + "learning_rate": 2.735007094034431e-07, + "loss": 0.0208, + "step": 32190 + }, + { + "epoch": 3.817265504565398, + "grad_norm": 0.6497218695862856, + "learning_rate": 2.7314673656786693e-07, + "loss": 0.0211, + "step": 32191 + }, + { + "epoch": 3.8173840863275226, + "grad_norm": 0.5583087510112364, + "learning_rate": 2.7279299168355976e-07, + "loss": 0.0319, + "step": 32192 + }, + { + "epoch": 3.8175026680896478, + "grad_norm": 0.6899440356276416, + "learning_rate": 2.7243947475377453e-07, + "loss": 0.0351, + "step": 32193 + }, + { + "epoch": 3.817621249851773, + "grad_norm": 0.5361484105575834, + "learning_rate": 2.7208618578177523e-07, + "loss": 0.0207, + "step": 32194 + }, + { + "epoch": 3.8177398316138977, + "grad_norm": 0.31705951183652753, + "learning_rate": 2.717331247708177e-07, + "loss": 0.013, + "step": 32195 + }, + { + "epoch": 3.8178584133760225, + "grad_norm": 0.4453075947846347, + "learning_rate": 2.7138029172415757e-07, + "loss": 0.0231, + "step": 32196 + }, + { + "epoch": 3.8179769951381477, + "grad_norm": 0.5207109924855691, + "learning_rate": 2.7102768664504787e-07, + "loss": 0.0187, + "step": 32197 + }, + { + "epoch": 3.818095576900273, + "grad_norm": 0.3103661846279404, + "learning_rate": 2.7067530953673594e-07, + "loss": 0.0156, + "step": 32198 + }, + { + "epoch": 3.8182141586623977, + "grad_norm": 0.5703196836919577, + "learning_rate": 2.703231604024747e-07, + "loss": 0.022, + "step": 32199 + }, + { + "epoch": 3.818332740424523, + "grad_norm": 0.6039950880931766, + "learning_rate": 2.6997123924550617e-07, + "loss": 0.0281, + "step": 32200 + }, + { + "epoch": 3.8184513221866476, + "grad_norm": 0.5019789927083761, + "learning_rate": 2.6961954606908035e-07, + "loss": 0.0192, + "step": 32201 + }, + { + "epoch": 3.818569903948773, + "grad_norm": 0.5721215583956886, + "learning_rate": 2.6926808087643087e-07, + "loss": 0.0218, + "step": 32202 + }, + { + "epoch": 3.8186884857108976, + "grad_norm": 0.4696212422364732, + "learning_rate": 2.6891684367080503e-07, + "loss": 0.0245, + "step": 32203 + }, + { + "epoch": 3.818807067473023, + "grad_norm": 0.49863587270982584, + "learning_rate": 2.6856583445543924e-07, + "loss": 0.0252, + "step": 32204 + }, + { + "epoch": 3.8189256492351475, + "grad_norm": 0.7004597122940662, + "learning_rate": 2.6821505323356976e-07, + "loss": 0.0318, + "step": 32205 + }, + { + "epoch": 3.8190442309972727, + "grad_norm": 0.4730078141430379, + "learning_rate": 2.678645000084273e-07, + "loss": 0.0197, + "step": 32206 + }, + { + "epoch": 3.8191628127593975, + "grad_norm": 0.35563681793354585, + "learning_rate": 2.675141747832455e-07, + "loss": 0.0113, + "step": 32207 + }, + { + "epoch": 3.8192813945215227, + "grad_norm": 0.3753308417357076, + "learning_rate": 2.671640775612577e-07, + "loss": 0.0144, + "step": 32208 + }, + { + "epoch": 3.8193999762836475, + "grad_norm": 0.507144588843481, + "learning_rate": 2.6681420834568374e-07, + "loss": 0.0274, + "step": 32209 + }, + { + "epoch": 3.8195185580457727, + "grad_norm": 0.5399470820298754, + "learning_rate": 2.664645671397542e-07, + "loss": 0.0184, + "step": 32210 + }, + { + "epoch": 3.8196371398078974, + "grad_norm": 0.7952011556432225, + "learning_rate": 2.661151539466916e-07, + "loss": 0.0244, + "step": 32211 + }, + { + "epoch": 3.8197557215700226, + "grad_norm": 0.9487168637397733, + "learning_rate": 2.657659687697156e-07, + "loss": 0.0396, + "step": 32212 + }, + { + "epoch": 3.8198743033321474, + "grad_norm": 0.4617495909953384, + "learning_rate": 2.654170116120458e-07, + "loss": 0.0267, + "step": 32213 + }, + { + "epoch": 3.8199928850942726, + "grad_norm": 0.37874086369923865, + "learning_rate": 2.6506828247690183e-07, + "loss": 0.0178, + "step": 32214 + }, + { + "epoch": 3.8201114668563974, + "grad_norm": 0.546500322457693, + "learning_rate": 2.647197813674951e-07, + "loss": 0.031, + "step": 32215 + }, + { + "epoch": 3.8202300486185226, + "grad_norm": 0.5046283227730778, + "learning_rate": 2.643715082870396e-07, + "loss": 0.0286, + "step": 32216 + }, + { + "epoch": 3.8203486303806473, + "grad_norm": 0.3908885599982109, + "learning_rate": 2.6402346323874674e-07, + "loss": 0.0188, + "step": 32217 + }, + { + "epoch": 3.8204672121427725, + "grad_norm": 0.5129649262158739, + "learning_rate": 2.636756462258222e-07, + "loss": 0.0268, + "step": 32218 + }, + { + "epoch": 3.8205857939048973, + "grad_norm": 0.35819007185432894, + "learning_rate": 2.6332805725147737e-07, + "loss": 0.0178, + "step": 32219 + }, + { + "epoch": 3.8207043756670225, + "grad_norm": 0.44142026146298186, + "learning_rate": 2.629806963189096e-07, + "loss": 0.025, + "step": 32220 + }, + { + "epoch": 3.8208229574291472, + "grad_norm": 0.5752579617436857, + "learning_rate": 2.626335634313304e-07, + "loss": 0.0227, + "step": 32221 + }, + { + "epoch": 3.8209415391912724, + "grad_norm": 0.45893093527683004, + "learning_rate": 2.6228665859193145e-07, + "loss": 0.0293, + "step": 32222 + }, + { + "epoch": 3.8210601209533976, + "grad_norm": 0.4647831717156037, + "learning_rate": 2.6193998180391586e-07, + "loss": 0.0221, + "step": 32223 + }, + { + "epoch": 3.8211787027155224, + "grad_norm": 0.5284659708108459, + "learning_rate": 2.6159353307047553e-07, + "loss": 0.0241, + "step": 32224 + }, + { + "epoch": 3.821297284477647, + "grad_norm": 0.5304875058541338, + "learning_rate": 2.612473123948078e-07, + "loss": 0.0187, + "step": 32225 + }, + { + "epoch": 3.8214158662397724, + "grad_norm": 0.4799787577688291, + "learning_rate": 2.6090131978010467e-07, + "loss": 0.0192, + "step": 32226 + }, + { + "epoch": 3.8215344480018976, + "grad_norm": 0.5232750273028788, + "learning_rate": 2.6055555522955246e-07, + "loss": 0.025, + "step": 32227 + }, + { + "epoch": 3.8216530297640223, + "grad_norm": 0.3987948574776581, + "learning_rate": 2.602100187463402e-07, + "loss": 0.0146, + "step": 32228 + }, + { + "epoch": 3.821771611526147, + "grad_norm": 0.43419701881066636, + "learning_rate": 2.5986471033365436e-07, + "loss": 0.0185, + "step": 32229 + }, + { + "epoch": 3.8218901932882723, + "grad_norm": 0.6958787211139715, + "learning_rate": 2.595196299946784e-07, + "loss": 0.0326, + "step": 32230 + }, + { + "epoch": 3.8220087750503975, + "grad_norm": 0.5672662252761036, + "learning_rate": 2.5917477773259035e-07, + "loss": 0.0292, + "step": 32231 + }, + { + "epoch": 3.8221273568125222, + "grad_norm": 0.9027498840647368, + "learning_rate": 2.588301535505766e-07, + "loss": 0.0394, + "step": 32232 + }, + { + "epoch": 3.822245938574647, + "grad_norm": 0.3521656311347229, + "learning_rate": 2.5848575745180403e-07, + "loss": 0.0164, + "step": 32233 + }, + { + "epoch": 3.822364520336772, + "grad_norm": 0.6429599964194214, + "learning_rate": 2.5814158943945345e-07, + "loss": 0.0274, + "step": 32234 + }, + { + "epoch": 3.8224831020988974, + "grad_norm": 0.4064868817886187, + "learning_rate": 2.5779764951670004e-07, + "loss": 0.0206, + "step": 32235 + }, + { + "epoch": 3.822601683861022, + "grad_norm": 0.6306410854426514, + "learning_rate": 2.57453937686708e-07, + "loss": 0.0254, + "step": 32236 + }, + { + "epoch": 3.822720265623147, + "grad_norm": 0.5429582839302577, + "learning_rate": 2.5711045395265256e-07, + "loss": 0.0231, + "step": 32237 + }, + { + "epoch": 3.822838847385272, + "grad_norm": 0.46519794595487, + "learning_rate": 2.5676719831769505e-07, + "loss": 0.0181, + "step": 32238 + }, + { + "epoch": 3.8229574291473973, + "grad_norm": 0.7280284469981231, + "learning_rate": 2.564241707849996e-07, + "loss": 0.0247, + "step": 32239 + }, + { + "epoch": 3.823076010909522, + "grad_norm": 0.5047628281857388, + "learning_rate": 2.5608137135773314e-07, + "loss": 0.0236, + "step": 32240 + }, + { + "epoch": 3.823194592671647, + "grad_norm": 0.325679299468194, + "learning_rate": 2.557388000390543e-07, + "loss": 0.0132, + "step": 32241 + }, + { + "epoch": 3.823313174433772, + "grad_norm": 0.41539549426857697, + "learning_rate": 2.553964568321188e-07, + "loss": 0.0154, + "step": 32242 + }, + { + "epoch": 3.8234317561958973, + "grad_norm": 0.6702192057668742, + "learning_rate": 2.5505434174008527e-07, + "loss": 0.0356, + "step": 32243 + }, + { + "epoch": 3.823550337958022, + "grad_norm": 0.33182721996282155, + "learning_rate": 2.547124547661067e-07, + "loss": 0.0153, + "step": 32244 + }, + { + "epoch": 3.8236689197201468, + "grad_norm": 0.5817909711514432, + "learning_rate": 2.5437079591333344e-07, + "loss": 0.0201, + "step": 32245 + }, + { + "epoch": 3.823787501482272, + "grad_norm": 0.36745071060875256, + "learning_rate": 2.540293651849157e-07, + "loss": 0.0162, + "step": 32246 + }, + { + "epoch": 3.823906083244397, + "grad_norm": 0.7260017540196879, + "learning_rate": 2.5368816258400094e-07, + "loss": 0.0352, + "step": 32247 + }, + { + "epoch": 3.824024665006522, + "grad_norm": 0.37188829372276855, + "learning_rate": 2.5334718811373947e-07, + "loss": 0.0173, + "step": 32248 + }, + { + "epoch": 3.824143246768647, + "grad_norm": 0.5913274947916497, + "learning_rate": 2.5300644177726763e-07, + "loss": 0.0227, + "step": 32249 + }, + { + "epoch": 3.824261828530772, + "grad_norm": 0.803227890653559, + "learning_rate": 2.5266592357773013e-07, + "loss": 0.0336, + "step": 32250 + }, + { + "epoch": 3.824380410292897, + "grad_norm": 0.4638562715866802, + "learning_rate": 2.523256335182661e-07, + "loss": 0.0223, + "step": 32251 + }, + { + "epoch": 3.824498992055022, + "grad_norm": 0.4111870645026095, + "learning_rate": 2.51985571602012e-07, + "loss": 0.0232, + "step": 32252 + }, + { + "epoch": 3.824617573817147, + "grad_norm": 0.4888976062042943, + "learning_rate": 2.516457378321041e-07, + "loss": 0.0179, + "step": 32253 + }, + { + "epoch": 3.824736155579272, + "grad_norm": 0.4388575645168092, + "learning_rate": 2.5130613221167056e-07, + "loss": 0.0208, + "step": 32254 + }, + { + "epoch": 3.824854737341397, + "grad_norm": 0.5281659770810934, + "learning_rate": 2.5096675474384767e-07, + "loss": 0.0218, + "step": 32255 + }, + { + "epoch": 3.824973319103522, + "grad_norm": 0.7254443873286416, + "learning_rate": 2.5062760543176076e-07, + "loss": 0.0258, + "step": 32256 + }, + { + "epoch": 3.825091900865647, + "grad_norm": 0.4069688959046381, + "learning_rate": 2.502886842785407e-07, + "loss": 0.0198, + "step": 32257 + }, + { + "epoch": 3.8252104826277717, + "grad_norm": 0.38135254733149465, + "learning_rate": 2.499499912873071e-07, + "loss": 0.0157, + "step": 32258 + }, + { + "epoch": 3.825329064389897, + "grad_norm": 0.5791007514490251, + "learning_rate": 2.4961152646118535e-07, + "loss": 0.026, + "step": 32259 + }, + { + "epoch": 3.8254476461520217, + "grad_norm": 0.6488853484444277, + "learning_rate": 2.4927328980329235e-07, + "loss": 0.0227, + "step": 32260 + }, + { + "epoch": 3.825566227914147, + "grad_norm": 0.7347388766501896, + "learning_rate": 2.489352813167478e-07, + "loss": 0.031, + "step": 32261 + }, + { + "epoch": 3.8256848096762717, + "grad_norm": 0.43742387465332966, + "learning_rate": 2.485975010046687e-07, + "loss": 0.0208, + "step": 32262 + }, + { + "epoch": 3.825803391438397, + "grad_norm": 1.0387656285667042, + "learning_rate": 2.482599488701692e-07, + "loss": 0.043, + "step": 32263 + }, + { + "epoch": 3.8259219732005216, + "grad_norm": 0.5466711818317451, + "learning_rate": 2.4792262491636355e-07, + "loss": 0.0211, + "step": 32264 + }, + { + "epoch": 3.826040554962647, + "grad_norm": 0.3134211991696923, + "learning_rate": 2.4758552914635193e-07, + "loss": 0.0155, + "step": 32265 + }, + { + "epoch": 3.8261591367247716, + "grad_norm": 0.8402152122733638, + "learning_rate": 2.472486615632541e-07, + "loss": 0.0571, + "step": 32266 + }, + { + "epoch": 3.826277718486897, + "grad_norm": 0.6360947961695947, + "learning_rate": 2.4691202217016485e-07, + "loss": 0.034, + "step": 32267 + }, + { + "epoch": 3.8263963002490216, + "grad_norm": 0.39897112065437246, + "learning_rate": 2.4657561097019555e-07, + "loss": 0.0137, + "step": 32268 + }, + { + "epoch": 3.8265148820111468, + "grad_norm": 0.6036848400631302, + "learning_rate": 2.4623942796644373e-07, + "loss": 0.0294, + "step": 32269 + }, + { + "epoch": 3.8266334637732715, + "grad_norm": 0.6474233328341904, + "learning_rate": 2.4590347316200966e-07, + "loss": 0.0221, + "step": 32270 + }, + { + "epoch": 3.8267520455353967, + "grad_norm": 0.6198689176750978, + "learning_rate": 2.455677465599909e-07, + "loss": 0.0244, + "step": 32271 + }, + { + "epoch": 3.8268706272975215, + "grad_norm": 0.4060254710941895, + "learning_rate": 2.4523224816347946e-07, + "loss": 0.0173, + "step": 32272 + }, + { + "epoch": 3.8269892090596467, + "grad_norm": 0.5246333407283597, + "learning_rate": 2.4489697797557e-07, + "loss": 0.0275, + "step": 32273 + }, + { + "epoch": 3.8271077908217714, + "grad_norm": 0.5084248289973646, + "learning_rate": 2.445619359993545e-07, + "loss": 0.0178, + "step": 32274 + }, + { + "epoch": 3.8272263725838966, + "grad_norm": 0.6122074111181862, + "learning_rate": 2.44227122237925e-07, + "loss": 0.0291, + "step": 32275 + }, + { + "epoch": 3.827344954346022, + "grad_norm": 0.43791797776151653, + "learning_rate": 2.438925366943567e-07, + "loss": 0.0205, + "step": 32276 + }, + { + "epoch": 3.8274635361081466, + "grad_norm": 0.5202462067923984, + "learning_rate": 2.435581793717473e-07, + "loss": 0.0267, + "step": 32277 + }, + { + "epoch": 3.8275821178702714, + "grad_norm": 0.46010897850555527, + "learning_rate": 2.432240502731692e-07, + "loss": 0.0161, + "step": 32278 + }, + { + "epoch": 3.8277006996323966, + "grad_norm": 0.660929119587524, + "learning_rate": 2.428901494017116e-07, + "loss": 0.0327, + "step": 32279 + }, + { + "epoch": 3.8278192813945218, + "grad_norm": 0.3948210217950261, + "learning_rate": 2.4255647676044156e-07, + "loss": 0.021, + "step": 32280 + }, + { + "epoch": 3.8279378631566465, + "grad_norm": 0.4314419323903525, + "learning_rate": 2.422230323524455e-07, + "loss": 0.0221, + "step": 32281 + }, + { + "epoch": 3.8280564449187713, + "grad_norm": 0.4286467959442671, + "learning_rate": 2.418898161807903e-07, + "loss": 0.0198, + "step": 32282 + }, + { + "epoch": 3.8281750266808965, + "grad_norm": 0.4027865393466545, + "learning_rate": 2.415568282485514e-07, + "loss": 0.0182, + "step": 32283 + }, + { + "epoch": 3.8282936084430217, + "grad_norm": 0.4061894559060245, + "learning_rate": 2.412240685587985e-07, + "loss": 0.0187, + "step": 32284 + }, + { + "epoch": 3.8284121902051464, + "grad_norm": 0.48820917865321445, + "learning_rate": 2.408915371145959e-07, + "loss": 0.0252, + "step": 32285 + }, + { + "epoch": 3.828530771967271, + "grad_norm": 0.6285162837223278, + "learning_rate": 2.4055923391901603e-07, + "loss": 0.0352, + "step": 32286 + }, + { + "epoch": 3.8286493537293964, + "grad_norm": 0.45609878775847923, + "learning_rate": 2.4022715897511485e-07, + "loss": 0.0146, + "step": 32287 + }, + { + "epoch": 3.8287679354915216, + "grad_norm": 0.8553391591770297, + "learning_rate": 2.398953122859593e-07, + "loss": 0.0423, + "step": 32288 + }, + { + "epoch": 3.8288865172536464, + "grad_norm": 0.7444040512158345, + "learning_rate": 2.395636938546025e-07, + "loss": 0.0347, + "step": 32289 + }, + { + "epoch": 3.829005099015771, + "grad_norm": 0.8984575650426113, + "learning_rate": 2.3923230368410875e-07, + "loss": 0.0316, + "step": 32290 + }, + { + "epoch": 3.8291236807778963, + "grad_norm": 0.252253717121221, + "learning_rate": 2.3890114177752554e-07, + "loss": 0.0176, + "step": 32291 + }, + { + "epoch": 3.8292422625400215, + "grad_norm": 0.87089930028109, + "learning_rate": 2.385702081379143e-07, + "loss": 0.0361, + "step": 32292 + }, + { + "epoch": 3.8293608443021463, + "grad_norm": 0.4759857787960349, + "learning_rate": 2.3823950276831986e-07, + "loss": 0.0232, + "step": 32293 + }, + { + "epoch": 3.829479426064271, + "grad_norm": 0.2548492522687106, + "learning_rate": 2.3790902567178975e-07, + "loss": 0.0112, + "step": 32294 + }, + { + "epoch": 3.8295980078263963, + "grad_norm": 0.5396014585601289, + "learning_rate": 2.3757877685137708e-07, + "loss": 0.0247, + "step": 32295 + }, + { + "epoch": 3.8297165895885215, + "grad_norm": 0.56707246949222, + "learning_rate": 2.3724875631011835e-07, + "loss": 0.022, + "step": 32296 + }, + { + "epoch": 3.829835171350646, + "grad_norm": 0.30762359048947635, + "learning_rate": 2.3691896405106384e-07, + "loss": 0.0102, + "step": 32297 + }, + { + "epoch": 3.8299537531127714, + "grad_norm": 0.46721419410173937, + "learning_rate": 2.3658940007724728e-07, + "loss": 0.014, + "step": 32298 + }, + { + "epoch": 3.830072334874896, + "grad_norm": 0.5104014103017009, + "learning_rate": 2.3626006439171067e-07, + "loss": 0.0184, + "step": 32299 + }, + { + "epoch": 3.8301909166370214, + "grad_norm": 0.8127604609576257, + "learning_rate": 2.3593095699748767e-07, + "loss": 0.0287, + "step": 32300 + }, + { + "epoch": 3.830309498399146, + "grad_norm": 0.3341606975613387, + "learning_rate": 2.3560207789761758e-07, + "loss": 0.0169, + "step": 32301 + }, + { + "epoch": 3.8304280801612713, + "grad_norm": 0.48624676261041105, + "learning_rate": 2.352734270951229e-07, + "loss": 0.0235, + "step": 32302 + }, + { + "epoch": 3.830546661923396, + "grad_norm": 0.4369865780546054, + "learning_rate": 2.3494500459304292e-07, + "loss": 0.0176, + "step": 32303 + }, + { + "epoch": 3.8306652436855213, + "grad_norm": 0.42486631133113273, + "learning_rate": 2.3461681039439743e-07, + "loss": 0.0211, + "step": 32304 + }, + { + "epoch": 3.830783825447646, + "grad_norm": 0.5216403676523061, + "learning_rate": 2.3428884450221733e-07, + "loss": 0.0245, + "step": 32305 + }, + { + "epoch": 3.8309024072097713, + "grad_norm": 0.40129540639602457, + "learning_rate": 2.3396110691952523e-07, + "loss": 0.0198, + "step": 32306 + }, + { + "epoch": 3.831020988971896, + "grad_norm": 0.3388478780161658, + "learning_rate": 2.336335976493409e-07, + "loss": 0.0108, + "step": 32307 + }, + { + "epoch": 3.8311395707340212, + "grad_norm": 0.7284856966021265, + "learning_rate": 2.3330631669468416e-07, + "loss": 0.0396, + "step": 32308 + }, + { + "epoch": 3.831258152496146, + "grad_norm": 0.5221460312687791, + "learning_rate": 2.3297926405857208e-07, + "loss": 0.0293, + "step": 32309 + }, + { + "epoch": 3.831376734258271, + "grad_norm": 0.7147102422588549, + "learning_rate": 2.3265243974401885e-07, + "loss": 0.0309, + "step": 32310 + }, + { + "epoch": 3.831495316020396, + "grad_norm": 0.5817799787250829, + "learning_rate": 2.3232584375404154e-07, + "loss": 0.0302, + "step": 32311 + }, + { + "epoch": 3.831613897782521, + "grad_norm": 0.3506905357198626, + "learning_rate": 2.319994760916433e-07, + "loss": 0.0134, + "step": 32312 + }, + { + "epoch": 3.831732479544646, + "grad_norm": 0.996652070329875, + "learning_rate": 2.3167333675984116e-07, + "loss": 0.0425, + "step": 32313 + }, + { + "epoch": 3.831851061306771, + "grad_norm": 0.5155146310844564, + "learning_rate": 2.3134742576163548e-07, + "loss": 0.0244, + "step": 32314 + }, + { + "epoch": 3.831969643068896, + "grad_norm": 0.5365646292910884, + "learning_rate": 2.3102174310003778e-07, + "loss": 0.024, + "step": 32315 + }, + { + "epoch": 3.832088224831021, + "grad_norm": 0.6071381644504981, + "learning_rate": 2.3069628877804005e-07, + "loss": 0.022, + "step": 32316 + }, + { + "epoch": 3.832206806593146, + "grad_norm": 1.2627758771508857, + "learning_rate": 2.3037106279865107e-07, + "loss": 0.0536, + "step": 32317 + }, + { + "epoch": 3.832325388355271, + "grad_norm": 0.43219750846162674, + "learning_rate": 2.300460651648656e-07, + "loss": 0.0232, + "step": 32318 + }, + { + "epoch": 3.832443970117396, + "grad_norm": 0.4439363793870995, + "learning_rate": 2.2972129587968406e-07, + "loss": 0.0183, + "step": 32319 + }, + { + "epoch": 3.832562551879521, + "grad_norm": 0.5287400742560479, + "learning_rate": 2.2939675494609293e-07, + "loss": 0.0202, + "step": 32320 + }, + { + "epoch": 3.8326811336416458, + "grad_norm": 0.7413513658735347, + "learning_rate": 2.290724423670898e-07, + "loss": 0.0294, + "step": 32321 + }, + { + "epoch": 3.832799715403771, + "grad_norm": 0.795336445165287, + "learning_rate": 2.287483581456612e-07, + "loss": 0.047, + "step": 32322 + }, + { + "epoch": 3.8329182971658957, + "grad_norm": 0.38643518473895844, + "learning_rate": 2.2842450228479638e-07, + "loss": 0.0125, + "step": 32323 + }, + { + "epoch": 3.833036878928021, + "grad_norm": 0.5877433443800362, + "learning_rate": 2.2810087478748188e-07, + "loss": 0.0258, + "step": 32324 + }, + { + "epoch": 3.833155460690146, + "grad_norm": 0.3649339330233206, + "learning_rate": 2.2777747565669582e-07, + "loss": 0.0146, + "step": 32325 + }, + { + "epoch": 3.833274042452271, + "grad_norm": 0.7909432419952296, + "learning_rate": 2.274543048954303e-07, + "loss": 0.0389, + "step": 32326 + }, + { + "epoch": 3.8333926242143956, + "grad_norm": 0.4332844329127058, + "learning_rate": 2.2713136250665513e-07, + "loss": 0.0244, + "step": 32327 + }, + { + "epoch": 3.833511205976521, + "grad_norm": 0.5120039634716914, + "learning_rate": 2.2680864849335127e-07, + "loss": 0.0224, + "step": 32328 + }, + { + "epoch": 3.833629787738646, + "grad_norm": 0.4712278762103016, + "learning_rate": 2.2648616285848855e-07, + "loss": 0.0195, + "step": 32329 + }, + { + "epoch": 3.833748369500771, + "grad_norm": 0.7766692981044111, + "learning_rate": 2.2616390560504795e-07, + "loss": 0.0281, + "step": 32330 + }, + { + "epoch": 3.8338669512628956, + "grad_norm": 0.5286624079002294, + "learning_rate": 2.2584187673599655e-07, + "loss": 0.0217, + "step": 32331 + }, + { + "epoch": 3.8339855330250208, + "grad_norm": 0.6403666871471871, + "learning_rate": 2.2552007625430137e-07, + "loss": 0.0246, + "step": 32332 + }, + { + "epoch": 3.834104114787146, + "grad_norm": 0.4829232308447923, + "learning_rate": 2.2519850416292954e-07, + "loss": 0.0197, + "step": 32333 + }, + { + "epoch": 3.8342226965492707, + "grad_norm": 0.6192070967103722, + "learning_rate": 2.248771604648481e-07, + "loss": 0.0383, + "step": 32334 + }, + { + "epoch": 3.8343412783113955, + "grad_norm": 0.7137885539493596, + "learning_rate": 2.245560451630213e-07, + "loss": 0.0277, + "step": 32335 + }, + { + "epoch": 3.8344598600735207, + "grad_norm": 0.4452909657052653, + "learning_rate": 2.2423515826039965e-07, + "loss": 0.0191, + "step": 32336 + }, + { + "epoch": 3.834578441835646, + "grad_norm": 0.47283426995762634, + "learning_rate": 2.2391449975995294e-07, + "loss": 0.0217, + "step": 32337 + }, + { + "epoch": 3.8346970235977706, + "grad_norm": 0.39664387973463916, + "learning_rate": 2.235940696646316e-07, + "loss": 0.0208, + "step": 32338 + }, + { + "epoch": 3.8348156053598954, + "grad_norm": 0.41702542274493526, + "learning_rate": 2.2327386797738882e-07, + "loss": 0.0213, + "step": 32339 + }, + { + "epoch": 3.8349341871220206, + "grad_norm": 0.5508572236197197, + "learning_rate": 2.22953894701175e-07, + "loss": 0.0205, + "step": 32340 + }, + { + "epoch": 3.835052768884146, + "grad_norm": 0.44923706792120477, + "learning_rate": 2.2263414983894894e-07, + "loss": 0.0191, + "step": 32341 + }, + { + "epoch": 3.8351713506462706, + "grad_norm": 0.45073190693255993, + "learning_rate": 2.223146333936471e-07, + "loss": 0.0162, + "step": 32342 + }, + { + "epoch": 3.8352899324083953, + "grad_norm": 0.45967669959037893, + "learning_rate": 2.2199534536821997e-07, + "loss": 0.0234, + "step": 32343 + }, + { + "epoch": 3.8354085141705205, + "grad_norm": 0.6585255123591681, + "learning_rate": 2.216762857656124e-07, + "loss": 0.0386, + "step": 32344 + }, + { + "epoch": 3.8355270959326457, + "grad_norm": 0.48906955834510696, + "learning_rate": 2.2135745458876368e-07, + "loss": 0.0296, + "step": 32345 + }, + { + "epoch": 3.8356456776947705, + "grad_norm": 0.8845155615688414, + "learning_rate": 2.210388518406159e-07, + "loss": 0.0459, + "step": 32346 + }, + { + "epoch": 3.8357642594568953, + "grad_norm": 0.4131396724736266, + "learning_rate": 2.2072047752410006e-07, + "loss": 0.0238, + "step": 32347 + }, + { + "epoch": 3.8358828412190205, + "grad_norm": 0.5633948454355348, + "learning_rate": 2.2040233164215828e-07, + "loss": 0.0264, + "step": 32348 + }, + { + "epoch": 3.8360014229811457, + "grad_norm": 0.49629830813422643, + "learning_rate": 2.2008441419772152e-07, + "loss": 0.0201, + "step": 32349 + }, + { + "epoch": 3.8361200047432704, + "grad_norm": 0.45355420009477887, + "learning_rate": 2.19766725193718e-07, + "loss": 0.0245, + "step": 32350 + }, + { + "epoch": 3.8362385865053956, + "grad_norm": 0.6899145395035517, + "learning_rate": 2.1944926463307314e-07, + "loss": 0.0325, + "step": 32351 + }, + { + "epoch": 3.8363571682675204, + "grad_norm": 0.5581049053152226, + "learning_rate": 2.191320325187235e-07, + "loss": 0.0202, + "step": 32352 + }, + { + "epoch": 3.8364757500296456, + "grad_norm": 0.2885810651338321, + "learning_rate": 2.188150288535862e-07, + "loss": 0.0084, + "step": 32353 + }, + { + "epoch": 3.8365943317917703, + "grad_norm": 0.7734100330151826, + "learning_rate": 2.1849825364058663e-07, + "loss": 0.0443, + "step": 32354 + }, + { + "epoch": 3.8367129135538955, + "grad_norm": 1.10115929030448, + "learning_rate": 2.1818170688264195e-07, + "loss": 0.0491, + "step": 32355 + }, + { + "epoch": 3.8368314953160203, + "grad_norm": 0.5859224093376044, + "learning_rate": 2.178653885826748e-07, + "loss": 0.0288, + "step": 32356 + }, + { + "epoch": 3.8369500770781455, + "grad_norm": 0.5158527427362601, + "learning_rate": 2.1754929874359954e-07, + "loss": 0.0241, + "step": 32357 + }, + { + "epoch": 3.8370686588402703, + "grad_norm": 0.44969704539666805, + "learning_rate": 2.172334373683249e-07, + "loss": 0.0143, + "step": 32358 + }, + { + "epoch": 3.8371872406023955, + "grad_norm": 0.3992747353764825, + "learning_rate": 2.1691780445977085e-07, + "loss": 0.0163, + "step": 32359 + }, + { + "epoch": 3.8373058223645202, + "grad_norm": 0.8138595722376765, + "learning_rate": 2.166024000208433e-07, + "loss": 0.0364, + "step": 32360 + }, + { + "epoch": 3.8374244041266454, + "grad_norm": 0.33361677727612754, + "learning_rate": 2.1628722405445112e-07, + "loss": 0.016, + "step": 32361 + }, + { + "epoch": 3.83754298588877, + "grad_norm": 0.7668842110876931, + "learning_rate": 2.159722765634975e-07, + "loss": 0.038, + "step": 32362 + }, + { + "epoch": 3.8376615676508954, + "grad_norm": 0.6187897984533207, + "learning_rate": 2.1565755755088568e-07, + "loss": 0.0196, + "step": 32363 + }, + { + "epoch": 3.83778014941302, + "grad_norm": 0.748712181316043, + "learning_rate": 2.1534306701952168e-07, + "loss": 0.0516, + "step": 32364 + }, + { + "epoch": 3.8378987311751454, + "grad_norm": 0.5461003813385163, + "learning_rate": 2.1502880497230037e-07, + "loss": 0.0264, + "step": 32365 + }, + { + "epoch": 3.83801731293727, + "grad_norm": 0.5668294833245395, + "learning_rate": 2.1471477141211948e-07, + "loss": 0.023, + "step": 32366 + }, + { + "epoch": 3.8381358946993953, + "grad_norm": 0.5665004351152113, + "learning_rate": 2.1440096634187668e-07, + "loss": 0.0313, + "step": 32367 + }, + { + "epoch": 3.83825447646152, + "grad_norm": 0.640790635943515, + "learning_rate": 2.140873897644613e-07, + "loss": 0.0267, + "step": 32368 + }, + { + "epoch": 3.8383730582236453, + "grad_norm": 0.4959819787139154, + "learning_rate": 2.1377404168276825e-07, + "loss": 0.0162, + "step": 32369 + }, + { + "epoch": 3.83849163998577, + "grad_norm": 0.7722929892819221, + "learning_rate": 2.134609220996814e-07, + "loss": 0.0221, + "step": 32370 + }, + { + "epoch": 3.8386102217478952, + "grad_norm": 0.3919902740813167, + "learning_rate": 2.1314803101808723e-07, + "loss": 0.0192, + "step": 32371 + }, + { + "epoch": 3.83872880351002, + "grad_norm": 0.5329213993672982, + "learning_rate": 2.1283536844087514e-07, + "loss": 0.0238, + "step": 32372 + }, + { + "epoch": 3.838847385272145, + "grad_norm": 0.5419014964382625, + "learning_rate": 2.1252293437092619e-07, + "loss": 0.0226, + "step": 32373 + }, + { + "epoch": 3.8389659670342704, + "grad_norm": 0.8393930749092957, + "learning_rate": 2.122107288111158e-07, + "loss": 0.0393, + "step": 32374 + }, + { + "epoch": 3.839084548796395, + "grad_norm": 0.5069578136771319, + "learning_rate": 2.1189875176433062e-07, + "loss": 0.0352, + "step": 32375 + }, + { + "epoch": 3.83920313055852, + "grad_norm": 0.5398842269530586, + "learning_rate": 2.115870032334377e-07, + "loss": 0.0257, + "step": 32376 + }, + { + "epoch": 3.839321712320645, + "grad_norm": 0.41577663226672046, + "learning_rate": 2.112754832213154e-07, + "loss": 0.0143, + "step": 32377 + }, + { + "epoch": 3.8394402940827703, + "grad_norm": 0.37424788037548695, + "learning_rate": 2.1096419173083637e-07, + "loss": 0.0121, + "step": 32378 + }, + { + "epoch": 3.839558875844895, + "grad_norm": 0.5706147570145693, + "learning_rate": 2.1065312876487053e-07, + "loss": 0.0259, + "step": 32379 + }, + { + "epoch": 3.83967745760702, + "grad_norm": 0.6823555673122391, + "learning_rate": 2.1034229432628228e-07, + "loss": 0.0282, + "step": 32380 + }, + { + "epoch": 3.839796039369145, + "grad_norm": 0.46776208204083536, + "learning_rate": 2.1003168841793875e-07, + "loss": 0.0226, + "step": 32381 + }, + { + "epoch": 3.8399146211312702, + "grad_norm": 0.320374594414204, + "learning_rate": 2.0972131104270432e-07, + "loss": 0.0159, + "step": 32382 + }, + { + "epoch": 3.840033202893395, + "grad_norm": 0.942670068062788, + "learning_rate": 2.094111622034378e-07, + "loss": 0.0282, + "step": 32383 + }, + { + "epoch": 3.8401517846555198, + "grad_norm": 0.5970470372927648, + "learning_rate": 2.0910124190300363e-07, + "loss": 0.0293, + "step": 32384 + }, + { + "epoch": 3.840270366417645, + "grad_norm": 0.8421805610361163, + "learning_rate": 2.0879155014425222e-07, + "loss": 0.0298, + "step": 32385 + }, + { + "epoch": 3.84038894817977, + "grad_norm": 0.5919909256992192, + "learning_rate": 2.0848208693004524e-07, + "loss": 0.0229, + "step": 32386 + }, + { + "epoch": 3.840507529941895, + "grad_norm": 0.6095293608635124, + "learning_rate": 2.0817285226322757e-07, + "loss": 0.0206, + "step": 32387 + }, + { + "epoch": 3.8406261117040197, + "grad_norm": 0.6157739750354995, + "learning_rate": 2.078638461466581e-07, + "loss": 0.0274, + "step": 32388 + }, + { + "epoch": 3.840744693466145, + "grad_norm": 0.9470912813368867, + "learning_rate": 2.0755506858317897e-07, + "loss": 0.0367, + "step": 32389 + }, + { + "epoch": 3.84086327522827, + "grad_norm": 0.4976669549629656, + "learning_rate": 2.0724651957564344e-07, + "loss": 0.0243, + "step": 32390 + }, + { + "epoch": 3.840981856990395, + "grad_norm": 0.5944447079596484, + "learning_rate": 2.0693819912688816e-07, + "loss": 0.0288, + "step": 32391 + }, + { + "epoch": 3.8411004387525196, + "grad_norm": 0.5289208929120277, + "learning_rate": 2.0663010723976084e-07, + "loss": 0.0259, + "step": 32392 + }, + { + "epoch": 3.841219020514645, + "grad_norm": 0.260616565795419, + "learning_rate": 2.0632224391710086e-07, + "loss": 0.0091, + "step": 32393 + }, + { + "epoch": 3.84133760227677, + "grad_norm": 0.9564587101313327, + "learning_rate": 2.0601460916174485e-07, + "loss": 0.0478, + "step": 32394 + }, + { + "epoch": 3.8414561840388948, + "grad_norm": 0.5705764190186831, + "learning_rate": 2.0570720297653224e-07, + "loss": 0.0304, + "step": 32395 + }, + { + "epoch": 3.8415747658010195, + "grad_norm": 0.38597600913363606, + "learning_rate": 2.0540002536429125e-07, + "loss": 0.0127, + "step": 32396 + }, + { + "epoch": 3.8416933475631447, + "grad_norm": 0.4645373001196363, + "learning_rate": 2.0509307632785856e-07, + "loss": 0.0244, + "step": 32397 + }, + { + "epoch": 3.84181192932527, + "grad_norm": 0.46939395527594713, + "learning_rate": 2.0478635587005968e-07, + "loss": 0.0263, + "step": 32398 + }, + { + "epoch": 3.8419305110873947, + "grad_norm": 0.4046067665307863, + "learning_rate": 2.0447986399372843e-07, + "loss": 0.0204, + "step": 32399 + }, + { + "epoch": 3.84204909284952, + "grad_norm": 0.745602918596502, + "learning_rate": 2.0417360070168478e-07, + "loss": 0.0312, + "step": 32400 + }, + { + "epoch": 3.8421676746116447, + "grad_norm": 0.5495547866683418, + "learning_rate": 2.0386756599675427e-07, + "loss": 0.022, + "step": 32401 + }, + { + "epoch": 3.84228625637377, + "grad_norm": 0.4852183612633113, + "learning_rate": 2.0356175988175686e-07, + "loss": 0.014, + "step": 32402 + }, + { + "epoch": 3.8424048381358946, + "grad_norm": 0.695991659181916, + "learning_rate": 2.032561823595125e-07, + "loss": 0.0342, + "step": 32403 + }, + { + "epoch": 3.84252341989802, + "grad_norm": 0.3751964876219734, + "learning_rate": 2.029508334328384e-07, + "loss": 0.0197, + "step": 32404 + }, + { + "epoch": 3.8426420016601446, + "grad_norm": 0.44743208858948774, + "learning_rate": 2.026457131045517e-07, + "loss": 0.0157, + "step": 32405 + }, + { + "epoch": 3.84276058342227, + "grad_norm": 0.5799259588415134, + "learning_rate": 2.0234082137746134e-07, + "loss": 0.0238, + "step": 32406 + }, + { + "epoch": 3.8428791651843945, + "grad_norm": 0.5503692584630047, + "learning_rate": 2.0203615825437894e-07, + "loss": 0.0327, + "step": 32407 + }, + { + "epoch": 3.8429977469465197, + "grad_norm": 0.5392787348008629, + "learning_rate": 2.017317237381161e-07, + "loss": 0.0193, + "step": 32408 + }, + { + "epoch": 3.8431163287086445, + "grad_norm": 0.5068360982230334, + "learning_rate": 2.0142751783147617e-07, + "loss": 0.0273, + "step": 32409 + }, + { + "epoch": 3.8432349104707697, + "grad_norm": 0.38108833450043117, + "learning_rate": 2.0112354053726524e-07, + "loss": 0.015, + "step": 32410 + }, + { + "epoch": 3.8433534922328945, + "grad_norm": 0.4784941202046757, + "learning_rate": 2.0081979185828105e-07, + "loss": 0.0187, + "step": 32411 + }, + { + "epoch": 3.8434720739950197, + "grad_norm": 0.43166704553536756, + "learning_rate": 2.0051627179733247e-07, + "loss": 0.0187, + "step": 32412 + }, + { + "epoch": 3.8435906557571444, + "grad_norm": 0.6195356627914599, + "learning_rate": 2.0021298035720894e-07, + "loss": 0.0302, + "step": 32413 + }, + { + "epoch": 3.8437092375192696, + "grad_norm": 0.5782298081269911, + "learning_rate": 1.9990991754071098e-07, + "loss": 0.0242, + "step": 32414 + }, + { + "epoch": 3.8438278192813944, + "grad_norm": 0.6559712124181133, + "learning_rate": 1.996070833506336e-07, + "loss": 0.0262, + "step": 32415 + }, + { + "epoch": 3.8439464010435196, + "grad_norm": 0.5415918139861473, + "learning_rate": 1.9930447778976625e-07, + "loss": 0.0195, + "step": 32416 + }, + { + "epoch": 3.8440649828056443, + "grad_norm": 0.6286716357021125, + "learning_rate": 1.990021008608983e-07, + "loss": 0.0239, + "step": 32417 + }, + { + "epoch": 3.8441835645677696, + "grad_norm": 0.5889221989551054, + "learning_rate": 1.986999525668165e-07, + "loss": 0.0237, + "step": 32418 + }, + { + "epoch": 3.8443021463298943, + "grad_norm": 0.4058739667662757, + "learning_rate": 1.9839803291031024e-07, + "loss": 0.0195, + "step": 32419 + }, + { + "epoch": 3.8444207280920195, + "grad_norm": 0.44022749871706945, + "learning_rate": 1.9809634189416059e-07, + "loss": 0.016, + "step": 32420 + }, + { + "epoch": 3.8445393098541443, + "grad_norm": 0.6056911463820162, + "learning_rate": 1.9779487952114596e-07, + "loss": 0.0285, + "step": 32421 + }, + { + "epoch": 3.8446578916162695, + "grad_norm": 0.6111791377605957, + "learning_rate": 1.9749364579405015e-07, + "loss": 0.0247, + "step": 32422 + }, + { + "epoch": 3.8447764733783947, + "grad_norm": 0.316414865125287, + "learning_rate": 1.9719264071564602e-07, + "loss": 0.0128, + "step": 32423 + }, + { + "epoch": 3.8448950551405194, + "grad_norm": 0.3746931675252892, + "learning_rate": 1.9689186428871464e-07, + "loss": 0.015, + "step": 32424 + }, + { + "epoch": 3.845013636902644, + "grad_norm": 0.6137069156938877, + "learning_rate": 1.9659131651602046e-07, + "loss": 0.0257, + "step": 32425 + }, + { + "epoch": 3.8451322186647694, + "grad_norm": 0.6055044063683356, + "learning_rate": 1.9629099740034185e-07, + "loss": 0.0316, + "step": 32426 + }, + { + "epoch": 3.8452508004268946, + "grad_norm": 0.43292167642492774, + "learning_rate": 1.9599090694444044e-07, + "loss": 0.0209, + "step": 32427 + }, + { + "epoch": 3.8453693821890194, + "grad_norm": 0.3897214549769648, + "learning_rate": 1.9569104515108906e-07, + "loss": 0.0172, + "step": 32428 + }, + { + "epoch": 3.845487963951144, + "grad_norm": 0.6769813231789346, + "learning_rate": 1.9539141202304657e-07, + "loss": 0.0305, + "step": 32429 + }, + { + "epoch": 3.8456065457132693, + "grad_norm": 0.7810981482468868, + "learning_rate": 1.9509200756308022e-07, + "loss": 0.031, + "step": 32430 + }, + { + "epoch": 3.8457251274753945, + "grad_norm": 0.7005575857424886, + "learning_rate": 1.9479283177394614e-07, + "loss": 0.0352, + "step": 32431 + }, + { + "epoch": 3.8458437092375193, + "grad_norm": 0.8022310233210902, + "learning_rate": 1.94493884658406e-07, + "loss": 0.0368, + "step": 32432 + }, + { + "epoch": 3.845962290999644, + "grad_norm": 0.45230965356846176, + "learning_rate": 1.941951662192104e-07, + "loss": 0.0241, + "step": 32433 + }, + { + "epoch": 3.8460808727617692, + "grad_norm": 0.527165576824917, + "learning_rate": 1.9389667645911824e-07, + "loss": 0.0247, + "step": 32434 + }, + { + "epoch": 3.8461994545238944, + "grad_norm": 0.3918215542351256, + "learning_rate": 1.9359841538088007e-07, + "loss": 0.0213, + "step": 32435 + }, + { + "epoch": 3.846318036286019, + "grad_norm": 0.5836700055011448, + "learning_rate": 1.9330038298724652e-07, + "loss": 0.022, + "step": 32436 + }, + { + "epoch": 3.846436618048144, + "grad_norm": 0.45923216328301647, + "learning_rate": 1.9300257928095978e-07, + "loss": 0.0188, + "step": 32437 + }, + { + "epoch": 3.846555199810269, + "grad_norm": 0.4411234959704554, + "learning_rate": 1.9270500426477045e-07, + "loss": 0.0233, + "step": 32438 + }, + { + "epoch": 3.8466737815723944, + "grad_norm": 0.3559420469478563, + "learning_rate": 1.924076579414208e-07, + "loss": 0.0119, + "step": 32439 + }, + { + "epoch": 3.846792363334519, + "grad_norm": 0.4150036982637551, + "learning_rate": 1.9211054031365028e-07, + "loss": 0.0168, + "step": 32440 + }, + { + "epoch": 3.846910945096644, + "grad_norm": 0.5992505175889953, + "learning_rate": 1.9181365138420115e-07, + "loss": 0.0229, + "step": 32441 + }, + { + "epoch": 3.847029526858769, + "grad_norm": 0.5176666053278863, + "learning_rate": 1.9151699115580734e-07, + "loss": 0.0229, + "step": 32442 + }, + { + "epoch": 3.8471481086208943, + "grad_norm": 0.8086161546212984, + "learning_rate": 1.912205596312028e-07, + "loss": 0.0343, + "step": 32443 + }, + { + "epoch": 3.847266690383019, + "grad_norm": 0.6046652225814486, + "learning_rate": 1.9092435681312414e-07, + "loss": 0.022, + "step": 32444 + }, + { + "epoch": 3.847385272145144, + "grad_norm": 0.8114257466274432, + "learning_rate": 1.9062838270429984e-07, + "loss": 0.0312, + "step": 32445 + }, + { + "epoch": 3.847503853907269, + "grad_norm": 0.6875192022926029, + "learning_rate": 1.903326373074582e-07, + "loss": 0.041, + "step": 32446 + }, + { + "epoch": 3.847622435669394, + "grad_norm": 0.8612741673163943, + "learning_rate": 1.9003712062532764e-07, + "loss": 0.0484, + "step": 32447 + }, + { + "epoch": 3.847741017431519, + "grad_norm": 0.32424420082075417, + "learning_rate": 1.8974183266062816e-07, + "loss": 0.0132, + "step": 32448 + }, + { + "epoch": 3.847859599193644, + "grad_norm": 0.8894203285024376, + "learning_rate": 1.8944677341608541e-07, + "loss": 0.0349, + "step": 32449 + }, + { + "epoch": 3.847978180955769, + "grad_norm": 0.3878409849022033, + "learning_rate": 1.8915194289442216e-07, + "loss": 0.0163, + "step": 32450 + }, + { + "epoch": 3.848096762717894, + "grad_norm": 0.5626582778284627, + "learning_rate": 1.888573410983474e-07, + "loss": 0.0333, + "step": 32451 + }, + { + "epoch": 3.848215344480019, + "grad_norm": 0.3501479499073927, + "learning_rate": 1.885629680305867e-07, + "loss": 0.0215, + "step": 32452 + }, + { + "epoch": 3.848333926242144, + "grad_norm": 0.5911750998473173, + "learning_rate": 1.8826882369384624e-07, + "loss": 0.0197, + "step": 32453 + }, + { + "epoch": 3.848452508004269, + "grad_norm": 0.38187865855431136, + "learning_rate": 1.8797490809084328e-07, + "loss": 0.0172, + "step": 32454 + }, + { + "epoch": 3.848571089766394, + "grad_norm": 0.4410255142760647, + "learning_rate": 1.8768122122428679e-07, + "loss": 0.0151, + "step": 32455 + }, + { + "epoch": 3.848689671528519, + "grad_norm": 0.48696846659020765, + "learning_rate": 1.8738776309687733e-07, + "loss": 0.0182, + "step": 32456 + }, + { + "epoch": 3.848808253290644, + "grad_norm": 0.5948873012603836, + "learning_rate": 1.8709453371132945e-07, + "loss": 0.0207, + "step": 32457 + }, + { + "epoch": 3.848926835052769, + "grad_norm": 0.35413762583935327, + "learning_rate": 1.8680153307034376e-07, + "loss": 0.0166, + "step": 32458 + }, + { + "epoch": 3.849045416814894, + "grad_norm": 0.37258803177832117, + "learning_rate": 1.865087611766181e-07, + "loss": 0.0178, + "step": 32459 + }, + { + "epoch": 3.8491639985770187, + "grad_norm": 0.4630914189383797, + "learning_rate": 1.8621621803285027e-07, + "loss": 0.0211, + "step": 32460 + }, + { + "epoch": 3.849282580339144, + "grad_norm": 0.436522858326172, + "learning_rate": 1.8592390364174372e-07, + "loss": 0.0153, + "step": 32461 + }, + { + "epoch": 3.8494011621012687, + "grad_norm": 0.5711499747552092, + "learning_rate": 1.8563181800599073e-07, + "loss": 0.0292, + "step": 32462 + }, + { + "epoch": 3.849519743863394, + "grad_norm": 0.5225507070101192, + "learning_rate": 1.853399611282808e-07, + "loss": 0.0137, + "step": 32463 + }, + { + "epoch": 3.8496383256255187, + "grad_norm": 0.7479684752033527, + "learning_rate": 1.850483330113062e-07, + "loss": 0.0516, + "step": 32464 + }, + { + "epoch": 3.849756907387644, + "grad_norm": 0.23708569042951716, + "learning_rate": 1.8475693365775649e-07, + "loss": 0.0104, + "step": 32465 + }, + { + "epoch": 3.8498754891497686, + "grad_norm": 0.5087421000897505, + "learning_rate": 1.8446576307031837e-07, + "loss": 0.0294, + "step": 32466 + }, + { + "epoch": 3.849994070911894, + "grad_norm": 0.7338227349270761, + "learning_rate": 1.8417482125167308e-07, + "loss": 0.0265, + "step": 32467 + }, + { + "epoch": 3.8501126526740186, + "grad_norm": 1.1878088275533103, + "learning_rate": 1.838841082045073e-07, + "loss": 0.0525, + "step": 32468 + }, + { + "epoch": 3.850231234436144, + "grad_norm": 0.39641135134358163, + "learning_rate": 1.835936239314967e-07, + "loss": 0.0211, + "step": 32469 + }, + { + "epoch": 3.8503498161982685, + "grad_norm": 0.46873401641381574, + "learning_rate": 1.8330336843532247e-07, + "loss": 0.0293, + "step": 32470 + }, + { + "epoch": 3.8504683979603938, + "grad_norm": 0.5371046912586487, + "learning_rate": 1.8301334171865747e-07, + "loss": 0.0268, + "step": 32471 + }, + { + "epoch": 3.8505869797225185, + "grad_norm": 1.118398488433352, + "learning_rate": 1.8272354378417733e-07, + "loss": 0.0276, + "step": 32472 + }, + { + "epoch": 3.8507055614846437, + "grad_norm": 0.7860618001472475, + "learning_rate": 1.824339746345549e-07, + "loss": 0.0343, + "step": 32473 + }, + { + "epoch": 3.8508241432467685, + "grad_norm": 0.6950512381983767, + "learning_rate": 1.8214463427245475e-07, + "loss": 0.029, + "step": 32474 + }, + { + "epoch": 3.8509427250088937, + "grad_norm": 0.5610608052353848, + "learning_rate": 1.8185552270054974e-07, + "loss": 0.0223, + "step": 32475 + }, + { + "epoch": 3.851061306771019, + "grad_norm": 0.7453127661761787, + "learning_rate": 1.8156663992150158e-07, + "loss": 0.0286, + "step": 32476 + }, + { + "epoch": 3.8511798885331436, + "grad_norm": 0.7485745355913629, + "learning_rate": 1.8127798593797762e-07, + "loss": 0.0555, + "step": 32477 + }, + { + "epoch": 3.8512984702952684, + "grad_norm": 0.6647250819633721, + "learning_rate": 1.8098956075263128e-07, + "loss": 0.0239, + "step": 32478 + }, + { + "epoch": 3.8514170520573936, + "grad_norm": 0.6194999010174073, + "learning_rate": 1.807013643681299e-07, + "loss": 0.036, + "step": 32479 + }, + { + "epoch": 3.851535633819519, + "grad_norm": 0.6806066517237038, + "learning_rate": 1.804133967871241e-07, + "loss": 0.0383, + "step": 32480 + }, + { + "epoch": 3.8516542155816436, + "grad_norm": 0.5857982605502763, + "learning_rate": 1.801256580122701e-07, + "loss": 0.0228, + "step": 32481 + }, + { + "epoch": 3.8517727973437683, + "grad_norm": 0.38283714802027696, + "learning_rate": 1.7983814804622413e-07, + "loss": 0.0213, + "step": 32482 + }, + { + "epoch": 3.8518913791058935, + "grad_norm": 0.5286149350683463, + "learning_rate": 1.7955086689163125e-07, + "loss": 0.0342, + "step": 32483 + }, + { + "epoch": 3.8520099608680187, + "grad_norm": 0.3323645480576516, + "learning_rate": 1.7926381455114493e-07, + "loss": 0.0165, + "step": 32484 + }, + { + "epoch": 3.8521285426301435, + "grad_norm": 0.48823588342732377, + "learning_rate": 1.7897699102741027e-07, + "loss": 0.0224, + "step": 32485 + }, + { + "epoch": 3.8522471243922682, + "grad_norm": 0.6212002571036752, + "learning_rate": 1.7869039632306684e-07, + "loss": 0.0338, + "step": 32486 + }, + { + "epoch": 3.8523657061543934, + "grad_norm": 0.29447054329551997, + "learning_rate": 1.7840403044076248e-07, + "loss": 0.0114, + "step": 32487 + }, + { + "epoch": 3.8524842879165186, + "grad_norm": 0.48922821214559636, + "learning_rate": 1.78117893383134e-07, + "loss": 0.019, + "step": 32488 + }, + { + "epoch": 3.8526028696786434, + "grad_norm": 0.4109783988584283, + "learning_rate": 1.778319851528182e-07, + "loss": 0.0182, + "step": 32489 + }, + { + "epoch": 3.852721451440768, + "grad_norm": 0.47963680189137675, + "learning_rate": 1.7754630575245456e-07, + "loss": 0.022, + "step": 32490 + }, + { + "epoch": 3.8528400332028934, + "grad_norm": 0.6592529300626596, + "learning_rate": 1.772608551846744e-07, + "loss": 0.0379, + "step": 32491 + }, + { + "epoch": 3.8529586149650186, + "grad_norm": 0.4299692830422249, + "learning_rate": 1.769756334521089e-07, + "loss": 0.0321, + "step": 32492 + }, + { + "epoch": 3.8530771967271433, + "grad_norm": 0.5087442599535382, + "learning_rate": 1.7669064055738648e-07, + "loss": 0.0271, + "step": 32493 + }, + { + "epoch": 3.853195778489268, + "grad_norm": 0.39857837986078615, + "learning_rate": 1.7640587650313844e-07, + "loss": 0.0153, + "step": 32494 + }, + { + "epoch": 3.8533143602513933, + "grad_norm": 0.5718766207531726, + "learning_rate": 1.7612134129198765e-07, + "loss": 0.0315, + "step": 32495 + }, + { + "epoch": 3.8534329420135185, + "grad_norm": 0.45342682338847656, + "learning_rate": 1.75837034926557e-07, + "loss": 0.0175, + "step": 32496 + }, + { + "epoch": 3.8535515237756433, + "grad_norm": 0.5206439903417396, + "learning_rate": 1.7555295740946665e-07, + "loss": 0.0209, + "step": 32497 + }, + { + "epoch": 3.8536701055377685, + "grad_norm": 0.36920806883873586, + "learning_rate": 1.752691087433339e-07, + "loss": 0.0131, + "step": 32498 + }, + { + "epoch": 3.853788687299893, + "grad_norm": 0.5299792786256508, + "learning_rate": 1.7498548893078171e-07, + "loss": 0.0219, + "step": 32499 + }, + { + "epoch": 3.8539072690620184, + "grad_norm": 0.7015202949460598, + "learning_rate": 1.7470209797441906e-07, + "loss": 0.0308, + "step": 32500 + }, + { + "epoch": 3.854025850824143, + "grad_norm": 0.5965854218483934, + "learning_rate": 1.7441893587686052e-07, + "loss": 0.0232, + "step": 32501 + }, + { + "epoch": 3.8541444325862684, + "grad_norm": 0.8258156690652769, + "learning_rate": 1.7413600264071795e-07, + "loss": 0.037, + "step": 32502 + }, + { + "epoch": 3.854263014348393, + "grad_norm": 0.5831528520899272, + "learning_rate": 1.7385329826859477e-07, + "loss": 0.0251, + "step": 32503 + }, + { + "epoch": 3.8543815961105183, + "grad_norm": 0.5446240532467341, + "learning_rate": 1.7357082276310276e-07, + "loss": 0.0193, + "step": 32504 + }, + { + "epoch": 3.854500177872643, + "grad_norm": 0.5619574227027426, + "learning_rate": 1.732885761268427e-07, + "loss": 0.0286, + "step": 32505 + }, + { + "epoch": 3.8546187596347683, + "grad_norm": 0.8018811059214683, + "learning_rate": 1.7300655836241797e-07, + "loss": 0.035, + "step": 32506 + }, + { + "epoch": 3.854737341396893, + "grad_norm": 0.8355602577710525, + "learning_rate": 1.7272476947242655e-07, + "loss": 0.0335, + "step": 32507 + }, + { + "epoch": 3.8548559231590183, + "grad_norm": 0.4149542216494321, + "learning_rate": 1.7244320945946913e-07, + "loss": 0.0206, + "step": 32508 + }, + { + "epoch": 3.854974504921143, + "grad_norm": 0.4789781986789796, + "learning_rate": 1.7216187832613806e-07, + "loss": 0.024, + "step": 32509 + }, + { + "epoch": 3.855093086683268, + "grad_norm": 0.9633668199891151, + "learning_rate": 1.7188077607503127e-07, + "loss": 0.0335, + "step": 32510 + }, + { + "epoch": 3.855211668445393, + "grad_norm": 0.41960371595441864, + "learning_rate": 1.7159990270873562e-07, + "loss": 0.0175, + "step": 32511 + }, + { + "epoch": 3.855330250207518, + "grad_norm": 0.4465398651398696, + "learning_rate": 1.713192582298434e-07, + "loss": 0.0182, + "step": 32512 + }, + { + "epoch": 3.855448831969643, + "grad_norm": 0.6737734133148547, + "learning_rate": 1.7103884264093872e-07, + "loss": 0.0221, + "step": 32513 + }, + { + "epoch": 3.855567413731768, + "grad_norm": 1.1062103766472522, + "learning_rate": 1.7075865594460838e-07, + "loss": 0.0275, + "step": 32514 + }, + { + "epoch": 3.855685995493893, + "grad_norm": 0.5154008957115662, + "learning_rate": 1.704786981434392e-07, + "loss": 0.0268, + "step": 32515 + }, + { + "epoch": 3.855804577256018, + "grad_norm": 0.4646780278516789, + "learning_rate": 1.701989692400041e-07, + "loss": 0.0226, + "step": 32516 + }, + { + "epoch": 3.855923159018143, + "grad_norm": 0.4644077707148362, + "learning_rate": 1.6991946923688995e-07, + "loss": 0.0248, + "step": 32517 + }, + { + "epoch": 3.856041740780268, + "grad_norm": 0.8093377712971662, + "learning_rate": 1.6964019813666687e-07, + "loss": 0.0327, + "step": 32518 + }, + { + "epoch": 3.856160322542393, + "grad_norm": 0.329778899648655, + "learning_rate": 1.6936115594191338e-07, + "loss": 0.0137, + "step": 32519 + }, + { + "epoch": 3.856278904304518, + "grad_norm": 0.5765300202802016, + "learning_rate": 1.6908234265519963e-07, + "loss": 0.0331, + "step": 32520 + }, + { + "epoch": 3.856397486066643, + "grad_norm": 0.4052307643419008, + "learning_rate": 1.6880375827909855e-07, + "loss": 0.015, + "step": 32521 + }, + { + "epoch": 3.856516067828768, + "grad_norm": 0.8820742402985841, + "learning_rate": 1.6852540281617756e-07, + "loss": 0.0409, + "step": 32522 + }, + { + "epoch": 3.8566346495908927, + "grad_norm": 0.5868784695749976, + "learning_rate": 1.682472762689985e-07, + "loss": 0.028, + "step": 32523 + }, + { + "epoch": 3.856753231353018, + "grad_norm": 0.4003930038605936, + "learning_rate": 1.6796937864013153e-07, + "loss": 0.0219, + "step": 32524 + }, + { + "epoch": 3.856871813115143, + "grad_norm": 0.46924965321059614, + "learning_rate": 1.6769170993213566e-07, + "loss": 0.0241, + "step": 32525 + }, + { + "epoch": 3.856990394877268, + "grad_norm": 0.6764271229089087, + "learning_rate": 1.6741427014757283e-07, + "loss": 0.0304, + "step": 32526 + }, + { + "epoch": 3.8571089766393927, + "grad_norm": 0.6448300404506155, + "learning_rate": 1.671370592889937e-07, + "loss": 0.0288, + "step": 32527 + }, + { + "epoch": 3.857227558401518, + "grad_norm": 0.33328457478712464, + "learning_rate": 1.6686007735896292e-07, + "loss": 0.0199, + "step": 32528 + }, + { + "epoch": 3.857346140163643, + "grad_norm": 0.5674992149440867, + "learning_rate": 1.6658332436002843e-07, + "loss": 0.0211, + "step": 32529 + }, + { + "epoch": 3.857464721925768, + "grad_norm": 0.4393760236942314, + "learning_rate": 1.66306800294741e-07, + "loss": 0.0286, + "step": 32530 + }, + { + "epoch": 3.8575833036878926, + "grad_norm": 0.5375259787696522, + "learning_rate": 1.660305051656541e-07, + "loss": 0.0231, + "step": 32531 + }, + { + "epoch": 3.857701885450018, + "grad_norm": 0.46188760276485713, + "learning_rate": 1.6575443897531295e-07, + "loss": 0.0236, + "step": 32532 + }, + { + "epoch": 3.857820467212143, + "grad_norm": 0.4613621365248006, + "learning_rate": 1.6547860172625995e-07, + "loss": 0.0212, + "step": 32533 + }, + { + "epoch": 3.8579390489742678, + "grad_norm": 0.5420782726536425, + "learning_rate": 1.6520299342104028e-07, + "loss": 0.023, + "step": 32534 + }, + { + "epoch": 3.8580576307363925, + "grad_norm": 0.4945081794659402, + "learning_rate": 1.6492761406219637e-07, + "loss": 0.0215, + "step": 32535 + }, + { + "epoch": 3.8581762124985177, + "grad_norm": 0.3272197332088455, + "learning_rate": 1.6465246365226505e-07, + "loss": 0.0125, + "step": 32536 + }, + { + "epoch": 3.858294794260643, + "grad_norm": 0.5278203566519076, + "learning_rate": 1.6437754219378322e-07, + "loss": 0.0231, + "step": 32537 + }, + { + "epoch": 3.8584133760227677, + "grad_norm": 0.5977466215756991, + "learning_rate": 1.6410284968928215e-07, + "loss": 0.026, + "step": 32538 + }, + { + "epoch": 3.8585319577848924, + "grad_norm": 0.5843106118387974, + "learning_rate": 1.6382838614130148e-07, + "loss": 0.0278, + "step": 32539 + }, + { + "epoch": 3.8586505395470176, + "grad_norm": 0.5529693488076857, + "learning_rate": 1.635541515523642e-07, + "loss": 0.024, + "step": 32540 + }, + { + "epoch": 3.858769121309143, + "grad_norm": 0.521920872700592, + "learning_rate": 1.6328014592500163e-07, + "loss": 0.0287, + "step": 32541 + }, + { + "epoch": 3.8588877030712676, + "grad_norm": 0.8162077326266146, + "learning_rate": 1.6300636926173674e-07, + "loss": 0.0334, + "step": 32542 + }, + { + "epoch": 3.8590062848333924, + "grad_norm": 0.4432165542717633, + "learning_rate": 1.6273282156509805e-07, + "loss": 0.0188, + "step": 32543 + }, + { + "epoch": 3.8591248665955176, + "grad_norm": 0.4920262360834353, + "learning_rate": 1.6245950283760857e-07, + "loss": 0.0255, + "step": 32544 + }, + { + "epoch": 3.8592434483576428, + "grad_norm": 0.6649274853748884, + "learning_rate": 1.6218641308178017e-07, + "loss": 0.0302, + "step": 32545 + }, + { + "epoch": 3.8593620301197675, + "grad_norm": 0.6368580936145761, + "learning_rate": 1.6191355230013582e-07, + "loss": 0.032, + "step": 32546 + }, + { + "epoch": 3.8594806118818927, + "grad_norm": 0.5079795467659366, + "learning_rate": 1.616409204951902e-07, + "loss": 0.0259, + "step": 32547 + }, + { + "epoch": 3.8595991936440175, + "grad_norm": 0.4636785887611869, + "learning_rate": 1.6136851766945793e-07, + "loss": 0.0234, + "step": 32548 + }, + { + "epoch": 3.8597177754061427, + "grad_norm": 0.6456261048204028, + "learning_rate": 1.6109634382544536e-07, + "loss": 0.0365, + "step": 32549 + }, + { + "epoch": 3.8598363571682675, + "grad_norm": 0.5272810157226054, + "learning_rate": 1.6082439896566992e-07, + "loss": 0.0186, + "step": 32550 + }, + { + "epoch": 3.8599549389303927, + "grad_norm": 0.45842699271838383, + "learning_rate": 1.605526830926296e-07, + "loss": 0.0219, + "step": 32551 + }, + { + "epoch": 3.8600735206925174, + "grad_norm": 0.6244543677393063, + "learning_rate": 1.602811962088363e-07, + "loss": 0.0296, + "step": 32552 + }, + { + "epoch": 3.8601921024546426, + "grad_norm": 0.3299489140475676, + "learning_rate": 1.6000993831678802e-07, + "loss": 0.0138, + "step": 32553 + }, + { + "epoch": 3.8603106842167674, + "grad_norm": 0.43678983474528055, + "learning_rate": 1.5973890941898827e-07, + "loss": 0.0188, + "step": 32554 + }, + { + "epoch": 3.8604292659788926, + "grad_norm": 0.6434501662717126, + "learning_rate": 1.594681095179351e-07, + "loss": 0.0221, + "step": 32555 + }, + { + "epoch": 3.8605478477410173, + "grad_norm": 0.5776018623464471, + "learning_rate": 1.5919753861612373e-07, + "loss": 0.0307, + "step": 32556 + }, + { + "epoch": 3.8606664295031425, + "grad_norm": 0.42405311301419707, + "learning_rate": 1.589271967160466e-07, + "loss": 0.0154, + "step": 32557 + }, + { + "epoch": 3.8607850112652673, + "grad_norm": 0.8726191628366329, + "learning_rate": 1.5865708382020172e-07, + "loss": 0.0412, + "step": 32558 + }, + { + "epoch": 3.8609035930273925, + "grad_norm": 0.4526214168962555, + "learning_rate": 1.58387199931076e-07, + "loss": 0.0233, + "step": 32559 + }, + { + "epoch": 3.8610221747895173, + "grad_norm": 0.480839757817766, + "learning_rate": 1.5811754505115627e-07, + "loss": 0.0183, + "step": 32560 + }, + { + "epoch": 3.8611407565516425, + "grad_norm": 0.7477853852776319, + "learning_rate": 1.578481191829323e-07, + "loss": 0.0395, + "step": 32561 + }, + { + "epoch": 3.861259338313767, + "grad_norm": 0.45251300622481844, + "learning_rate": 1.575789223288826e-07, + "loss": 0.0171, + "step": 32562 + }, + { + "epoch": 3.8613779200758924, + "grad_norm": 0.8490845185478879, + "learning_rate": 1.5730995449149133e-07, + "loss": 0.0501, + "step": 32563 + }, + { + "epoch": 3.861496501838017, + "grad_norm": 0.3023903083545019, + "learning_rate": 1.5704121567323703e-07, + "loss": 0.0083, + "step": 32564 + }, + { + "epoch": 3.8616150836001424, + "grad_norm": 0.3538015505940077, + "learning_rate": 1.5677270587660108e-07, + "loss": 0.0102, + "step": 32565 + }, + { + "epoch": 3.861733665362267, + "grad_norm": 0.6167620188295707, + "learning_rate": 1.565044251040565e-07, + "loss": 0.0344, + "step": 32566 + }, + { + "epoch": 3.8618522471243923, + "grad_norm": 0.4158700744483799, + "learning_rate": 1.5623637335807352e-07, + "loss": 0.0232, + "step": 32567 + }, + { + "epoch": 3.861970828886517, + "grad_norm": 0.6094921571482055, + "learning_rate": 1.5596855064112791e-07, + "loss": 0.0244, + "step": 32568 + }, + { + "epoch": 3.8620894106486423, + "grad_norm": 0.4727696875878514, + "learning_rate": 1.5570095695568443e-07, + "loss": 0.0191, + "step": 32569 + }, + { + "epoch": 3.862207992410767, + "grad_norm": 0.5337179392268638, + "learning_rate": 1.5543359230421328e-07, + "loss": 0.0368, + "step": 32570 + }, + { + "epoch": 3.8623265741728923, + "grad_norm": 0.5184890661024976, + "learning_rate": 1.5516645668917919e-07, + "loss": 0.0246, + "step": 32571 + }, + { + "epoch": 3.862445155935017, + "grad_norm": 0.8150848037929325, + "learning_rate": 1.5489955011303848e-07, + "loss": 0.0362, + "step": 32572 + }, + { + "epoch": 3.8625637376971422, + "grad_norm": 0.38327592598594395, + "learning_rate": 1.5463287257826142e-07, + "loss": 0.0193, + "step": 32573 + }, + { + "epoch": 3.8626823194592674, + "grad_norm": 0.5707252548096468, + "learning_rate": 1.5436642408730162e-07, + "loss": 0.0261, + "step": 32574 + }, + { + "epoch": 3.862800901221392, + "grad_norm": 0.4644390512433499, + "learning_rate": 1.5410020464261542e-07, + "loss": 0.0211, + "step": 32575 + }, + { + "epoch": 3.862919482983517, + "grad_norm": 0.46502698357955063, + "learning_rate": 1.5383421424665645e-07, + "loss": 0.0227, + "step": 32576 + }, + { + "epoch": 3.863038064745642, + "grad_norm": 0.3372099655788095, + "learning_rate": 1.5356845290187827e-07, + "loss": 0.0134, + "step": 32577 + }, + { + "epoch": 3.8631566465077674, + "grad_norm": 0.6275208783984894, + "learning_rate": 1.5330292061072892e-07, + "loss": 0.0265, + "step": 32578 + }, + { + "epoch": 3.863275228269892, + "grad_norm": 0.3739830551335698, + "learning_rate": 1.5303761737565926e-07, + "loss": 0.0177, + "step": 32579 + }, + { + "epoch": 3.863393810032017, + "grad_norm": 0.3977507847107187, + "learning_rate": 1.527725431991145e-07, + "loss": 0.0209, + "step": 32580 + }, + { + "epoch": 3.863512391794142, + "grad_norm": 0.4574066676901986, + "learning_rate": 1.5250769808353438e-07, + "loss": 0.0177, + "step": 32581 + }, + { + "epoch": 3.8636309735562673, + "grad_norm": 0.8739498626552537, + "learning_rate": 1.5224308203136418e-07, + "loss": 0.0483, + "step": 32582 + }, + { + "epoch": 3.863749555318392, + "grad_norm": 0.36832713544731427, + "learning_rate": 1.519786950450408e-07, + "loss": 0.0192, + "step": 32583 + }, + { + "epoch": 3.863868137080517, + "grad_norm": 0.31205952273542736, + "learning_rate": 1.517145371270068e-07, + "loss": 0.0152, + "step": 32584 + }, + { + "epoch": 3.863986718842642, + "grad_norm": 0.5511660524477593, + "learning_rate": 1.514506082796935e-07, + "loss": 0.0217, + "step": 32585 + }, + { + "epoch": 3.864105300604767, + "grad_norm": 0.45495838503889174, + "learning_rate": 1.511869085055323e-07, + "loss": 0.0287, + "step": 32586 + }, + { + "epoch": 3.864223882366892, + "grad_norm": 0.43821984145677745, + "learning_rate": 1.509234378069546e-07, + "loss": 0.0185, + "step": 32587 + }, + { + "epoch": 3.8643424641290167, + "grad_norm": 0.4417145967653241, + "learning_rate": 1.506601961863946e-07, + "loss": 0.0209, + "step": 32588 + }, + { + "epoch": 3.864461045891142, + "grad_norm": 0.5058744809609778, + "learning_rate": 1.503971836462753e-07, + "loss": 0.0195, + "step": 32589 + }, + { + "epoch": 3.864579627653267, + "grad_norm": 0.5387575487223066, + "learning_rate": 1.501344001890198e-07, + "loss": 0.0234, + "step": 32590 + }, + { + "epoch": 3.864698209415392, + "grad_norm": 0.5388228200467652, + "learning_rate": 1.4987184581705116e-07, + "loss": 0.0208, + "step": 32591 + }, + { + "epoch": 3.8648167911775166, + "grad_norm": 0.36665198569599994, + "learning_rate": 1.496095205327924e-07, + "loss": 0.0186, + "step": 32592 + }, + { + "epoch": 3.864935372939642, + "grad_norm": 0.3508647444401475, + "learning_rate": 1.4934742433865833e-07, + "loss": 0.0188, + "step": 32593 + }, + { + "epoch": 3.865053954701767, + "grad_norm": 0.636859849319127, + "learning_rate": 1.490855572370692e-07, + "loss": 0.0493, + "step": 32594 + }, + { + "epoch": 3.865172536463892, + "grad_norm": 0.6016004795352826, + "learning_rate": 1.4882391923043693e-07, + "loss": 0.024, + "step": 32595 + }, + { + "epoch": 3.8652911182260166, + "grad_norm": 0.42879046148625155, + "learning_rate": 1.4856251032117353e-07, + "loss": 0.014, + "step": 32596 + }, + { + "epoch": 3.8654096999881418, + "grad_norm": 0.46020555330281576, + "learning_rate": 1.4830133051168816e-07, + "loss": 0.0192, + "step": 32597 + }, + { + "epoch": 3.865528281750267, + "grad_norm": 0.667127191127804, + "learning_rate": 1.4804037980439002e-07, + "loss": 0.0319, + "step": 32598 + }, + { + "epoch": 3.8656468635123917, + "grad_norm": 0.9748382091368105, + "learning_rate": 1.4777965820168272e-07, + "loss": 0.0422, + "step": 32599 + }, + { + "epoch": 3.865765445274517, + "grad_norm": 0.44772435688266465, + "learning_rate": 1.475191657059727e-07, + "loss": 0.0199, + "step": 32600 + }, + { + "epoch": 3.8658840270366417, + "grad_norm": 0.4299117651862585, + "learning_rate": 1.47258902319658e-07, + "loss": 0.0226, + "step": 32601 + }, + { + "epoch": 3.866002608798767, + "grad_norm": 0.322586282202831, + "learning_rate": 1.4699886804514228e-07, + "loss": 0.0184, + "step": 32602 + }, + { + "epoch": 3.8661211905608917, + "grad_norm": 0.5820923736145253, + "learning_rate": 1.467390628848181e-07, + "loss": 0.0252, + "step": 32603 + }, + { + "epoch": 3.866239772323017, + "grad_norm": 0.2807807620307794, + "learning_rate": 1.4647948684108626e-07, + "loss": 0.0138, + "step": 32604 + }, + { + "epoch": 3.8663583540851416, + "grad_norm": 0.3843646765834339, + "learning_rate": 1.4622013991633099e-07, + "loss": 0.0119, + "step": 32605 + }, + { + "epoch": 3.866476935847267, + "grad_norm": 0.614756510278305, + "learning_rate": 1.4596102211295316e-07, + "loss": 0.0293, + "step": 32606 + }, + { + "epoch": 3.8665955176093916, + "grad_norm": 0.5938635103656196, + "learning_rate": 1.4570213343333416e-07, + "loss": 0.0217, + "step": 32607 + }, + { + "epoch": 3.8667140993715168, + "grad_norm": 0.397662247879681, + "learning_rate": 1.454434738798638e-07, + "loss": 0.0203, + "step": 32608 + }, + { + "epoch": 3.8668326811336415, + "grad_norm": 0.557100193557595, + "learning_rate": 1.4518504345492346e-07, + "loss": 0.0214, + "step": 32609 + }, + { + "epoch": 3.8669512628957667, + "grad_norm": 0.39168045151340314, + "learning_rate": 1.4492684216090012e-07, + "loss": 0.0154, + "step": 32610 + }, + { + "epoch": 3.8670698446578915, + "grad_norm": 0.5273518975725061, + "learning_rate": 1.446688700001725e-07, + "loss": 0.0188, + "step": 32611 + }, + { + "epoch": 3.8671884264200167, + "grad_norm": 0.7006204166364842, + "learning_rate": 1.444111269751164e-07, + "loss": 0.0272, + "step": 32612 + }, + { + "epoch": 3.8673070081821415, + "grad_norm": 0.5222318819563422, + "learning_rate": 1.4415361308811327e-07, + "loss": 0.0237, + "step": 32613 + }, + { + "epoch": 3.8674255899442667, + "grad_norm": 0.4233017187079948, + "learning_rate": 1.4389632834153065e-07, + "loss": 0.0153, + "step": 32614 + }, + { + "epoch": 3.8675441717063914, + "grad_norm": 0.49248646272515284, + "learning_rate": 1.4363927273774446e-07, + "loss": 0.0192, + "step": 32615 + }, + { + "epoch": 3.8676627534685166, + "grad_norm": 0.34932417469521815, + "learning_rate": 1.4338244627912222e-07, + "loss": 0.0162, + "step": 32616 + }, + { + "epoch": 3.8677813352306414, + "grad_norm": 0.3872661797413739, + "learning_rate": 1.4312584896803426e-07, + "loss": 0.015, + "step": 32617 + }, + { + "epoch": 3.8678999169927666, + "grad_norm": 0.4833310323065501, + "learning_rate": 1.4286948080684259e-07, + "loss": 0.0196, + "step": 32618 + }, + { + "epoch": 3.8680184987548913, + "grad_norm": 0.7362968660159754, + "learning_rate": 1.4261334179791197e-07, + "loss": 0.0348, + "step": 32619 + }, + { + "epoch": 3.8681370805170165, + "grad_norm": 0.4174521369125346, + "learning_rate": 1.4235743194360718e-07, + "loss": 0.0165, + "step": 32620 + }, + { + "epoch": 3.8682556622791413, + "grad_norm": 0.3375835263670387, + "learning_rate": 1.4210175124628188e-07, + "loss": 0.0133, + "step": 32621 + }, + { + "epoch": 3.8683742440412665, + "grad_norm": 0.6463917733944984, + "learning_rate": 1.418462997082981e-07, + "loss": 0.0329, + "step": 32622 + }, + { + "epoch": 3.8684928258033917, + "grad_norm": 0.9757135490783077, + "learning_rate": 1.4159107733200672e-07, + "loss": 0.053, + "step": 32623 + }, + { + "epoch": 3.8686114075655165, + "grad_norm": 0.43964253269988895, + "learning_rate": 1.413360841197614e-07, + "loss": 0.0206, + "step": 32624 + }, + { + "epoch": 3.8687299893276412, + "grad_norm": 0.5729133169003059, + "learning_rate": 1.4108132007391584e-07, + "loss": 0.0286, + "step": 32625 + }, + { + "epoch": 3.8688485710897664, + "grad_norm": 0.6183637340618612, + "learning_rate": 1.4082678519681537e-07, + "loss": 0.0293, + "step": 32626 + }, + { + "epoch": 3.8689671528518916, + "grad_norm": 0.3439875413132369, + "learning_rate": 1.405724794908081e-07, + "loss": 0.0152, + "step": 32627 + }, + { + "epoch": 3.8690857346140164, + "grad_norm": 0.4444873425787215, + "learning_rate": 1.4031840295823662e-07, + "loss": 0.0231, + "step": 32628 + }, + { + "epoch": 3.869204316376141, + "grad_norm": 0.29338984331864887, + "learning_rate": 1.400645556014435e-07, + "loss": 0.0154, + "step": 32629 + }, + { + "epoch": 3.8693228981382664, + "grad_norm": 0.4876335223974506, + "learning_rate": 1.3981093742277129e-07, + "loss": 0.0221, + "step": 32630 + }, + { + "epoch": 3.8694414799003916, + "grad_norm": 0.4263178096265252, + "learning_rate": 1.3955754842455704e-07, + "loss": 0.0184, + "step": 32631 + }, + { + "epoch": 3.8695600616625163, + "grad_norm": 0.38948734488735515, + "learning_rate": 1.3930438860913775e-07, + "loss": 0.017, + "step": 32632 + }, + { + "epoch": 3.869678643424641, + "grad_norm": 0.43797553012151835, + "learning_rate": 1.390514579788449e-07, + "loss": 0.027, + "step": 32633 + }, + { + "epoch": 3.8697972251867663, + "grad_norm": 0.581059440215136, + "learning_rate": 1.3879875653600993e-07, + "loss": 0.0266, + "step": 32634 + }, + { + "epoch": 3.8699158069488915, + "grad_norm": 0.7273038854572882, + "learning_rate": 1.3854628428296435e-07, + "loss": 0.0436, + "step": 32635 + }, + { + "epoch": 3.8700343887110162, + "grad_norm": 0.6272042933103779, + "learning_rate": 1.3829404122203404e-07, + "loss": 0.0195, + "step": 32636 + }, + { + "epoch": 3.870152970473141, + "grad_norm": 0.5801904446495383, + "learning_rate": 1.3804202735554772e-07, + "loss": 0.0266, + "step": 32637 + }, + { + "epoch": 3.870271552235266, + "grad_norm": 0.5114057206037644, + "learning_rate": 1.3779024268582296e-07, + "loss": 0.0222, + "step": 32638 + }, + { + "epoch": 3.8703901339973914, + "grad_norm": 0.483942764237893, + "learning_rate": 1.375386872151857e-07, + "loss": 0.0215, + "step": 32639 + }, + { + "epoch": 3.870508715759516, + "grad_norm": 0.4259764777135623, + "learning_rate": 1.3728736094595352e-07, + "loss": 0.02, + "step": 32640 + }, + { + "epoch": 3.870627297521641, + "grad_norm": 0.5569648023364093, + "learning_rate": 1.37036263880444e-07, + "loss": 0.022, + "step": 32641 + }, + { + "epoch": 3.870745879283766, + "grad_norm": 0.6299291923501396, + "learning_rate": 1.3678539602096917e-07, + "loss": 0.0157, + "step": 32642 + }, + { + "epoch": 3.8708644610458913, + "grad_norm": 0.34514613755750634, + "learning_rate": 1.365347573698439e-07, + "loss": 0.02, + "step": 32643 + }, + { + "epoch": 3.870983042808016, + "grad_norm": 0.5735864020611046, + "learning_rate": 1.3628434792938016e-07, + "loss": 0.0303, + "step": 32644 + }, + { + "epoch": 3.871101624570141, + "grad_norm": 0.34656648606142626, + "learning_rate": 1.360341677018845e-07, + "loss": 0.0122, + "step": 32645 + }, + { + "epoch": 3.871220206332266, + "grad_norm": 0.4074037171280406, + "learning_rate": 1.3578421668966334e-07, + "loss": 0.0205, + "step": 32646 + }, + { + "epoch": 3.8713387880943912, + "grad_norm": 0.5165907289237643, + "learning_rate": 1.3553449489502045e-07, + "loss": 0.019, + "step": 32647 + }, + { + "epoch": 3.871457369856516, + "grad_norm": 0.43287724502955566, + "learning_rate": 1.3528500232025955e-07, + "loss": 0.0201, + "step": 32648 + }, + { + "epoch": 3.871575951618641, + "grad_norm": 0.6047712722356997, + "learning_rate": 1.3503573896767873e-07, + "loss": 0.0343, + "step": 32649 + }, + { + "epoch": 3.871694533380766, + "grad_norm": 0.4267500716910032, + "learning_rate": 1.3478670483957624e-07, + "loss": 0.023, + "step": 32650 + }, + { + "epoch": 3.871813115142891, + "grad_norm": 0.5627223101939857, + "learning_rate": 1.3453789993825018e-07, + "loss": 0.0249, + "step": 32651 + }, + { + "epoch": 3.871931696905016, + "grad_norm": 0.8460577459011723, + "learning_rate": 1.3428932426599318e-07, + "loss": 0.0385, + "step": 32652 + }, + { + "epoch": 3.872050278667141, + "grad_norm": 0.7398513723441352, + "learning_rate": 1.3404097782509506e-07, + "loss": 0.0428, + "step": 32653 + }, + { + "epoch": 3.872168860429266, + "grad_norm": 0.8259795343590133, + "learning_rate": 1.337928606178457e-07, + "loss": 0.0387, + "step": 32654 + }, + { + "epoch": 3.872287442191391, + "grad_norm": 0.6386803047370578, + "learning_rate": 1.3354497264653765e-07, + "loss": 0.0229, + "step": 32655 + }, + { + "epoch": 3.872406023953516, + "grad_norm": 0.44662598739206627, + "learning_rate": 1.332973139134469e-07, + "loss": 0.0164, + "step": 32656 + }, + { + "epoch": 3.872524605715641, + "grad_norm": 0.5520515698719809, + "learning_rate": 1.330498844208633e-07, + "loss": 0.0228, + "step": 32657 + }, + { + "epoch": 3.872643187477766, + "grad_norm": 0.6585588359203679, + "learning_rate": 1.3280268417106555e-07, + "loss": 0.0245, + "step": 32658 + }, + { + "epoch": 3.872761769239891, + "grad_norm": 0.6544577731106964, + "learning_rate": 1.3255571316633242e-07, + "loss": 0.0298, + "step": 32659 + }, + { + "epoch": 3.8728803510020158, + "grad_norm": 0.6659104984935416, + "learning_rate": 1.323089714089426e-07, + "loss": 0.029, + "step": 32660 + }, + { + "epoch": 3.872998932764141, + "grad_norm": 0.6708623544079515, + "learning_rate": 1.3206245890116653e-07, + "loss": 0.0256, + "step": 32661 + }, + { + "epoch": 3.8731175145262657, + "grad_norm": 0.6902240818017343, + "learning_rate": 1.3181617564528293e-07, + "loss": 0.026, + "step": 32662 + }, + { + "epoch": 3.873236096288391, + "grad_norm": 0.7705162082727598, + "learning_rate": 1.3157012164355665e-07, + "loss": 0.0355, + "step": 32663 + }, + { + "epoch": 3.8733546780505157, + "grad_norm": 0.39765396973103706, + "learning_rate": 1.313242968982581e-07, + "loss": 0.0194, + "step": 32664 + }, + { + "epoch": 3.873473259812641, + "grad_norm": 0.5170717349944588, + "learning_rate": 1.310787014116549e-07, + "loss": 0.0264, + "step": 32665 + }, + { + "epoch": 3.8735918415747657, + "grad_norm": 0.3964385058255222, + "learning_rate": 1.3083333518600917e-07, + "loss": 0.0188, + "step": 32666 + }, + { + "epoch": 3.873710423336891, + "grad_norm": 0.5528311483799406, + "learning_rate": 1.3058819822358293e-07, + "loss": 0.0258, + "step": 32667 + }, + { + "epoch": 3.8738290050990156, + "grad_norm": 0.2997862165700499, + "learning_rate": 1.303432905266383e-07, + "loss": 0.0137, + "step": 32668 + }, + { + "epoch": 3.873947586861141, + "grad_norm": 0.43445629950690956, + "learning_rate": 1.30098612097429e-07, + "loss": 0.0228, + "step": 32669 + }, + { + "epoch": 3.8740661686232656, + "grad_norm": 0.394825865083493, + "learning_rate": 1.298541629382144e-07, + "loss": 0.0135, + "step": 32670 + }, + { + "epoch": 3.874184750385391, + "grad_norm": 0.3896290887168801, + "learning_rate": 1.296099430512454e-07, + "loss": 0.0162, + "step": 32671 + }, + { + "epoch": 3.874303332147516, + "grad_norm": 0.5819294192951346, + "learning_rate": 1.293659524387758e-07, + "loss": 0.0267, + "step": 32672 + }, + { + "epoch": 3.8744219139096407, + "grad_norm": 0.7409501656122045, + "learning_rate": 1.291221911030538e-07, + "loss": 0.0295, + "step": 32673 + }, + { + "epoch": 3.8745404956717655, + "grad_norm": 0.4330243910513964, + "learning_rate": 1.288786590463248e-07, + "loss": 0.0221, + "step": 32674 + }, + { + "epoch": 3.8746590774338907, + "grad_norm": 0.37196662282884163, + "learning_rate": 1.28635356270837e-07, + "loss": 0.0194, + "step": 32675 + }, + { + "epoch": 3.874777659196016, + "grad_norm": 0.48212988845631294, + "learning_rate": 1.283922827788303e-07, + "loss": 0.023, + "step": 32676 + }, + { + "epoch": 3.8748962409581407, + "grad_norm": 0.5303091513937456, + "learning_rate": 1.2814943857255014e-07, + "loss": 0.0233, + "step": 32677 + }, + { + "epoch": 3.8750148227202654, + "grad_norm": 0.38482728897082114, + "learning_rate": 1.2790682365423078e-07, + "loss": 0.0208, + "step": 32678 + }, + { + "epoch": 3.8751334044823906, + "grad_norm": 0.4456371267641812, + "learning_rate": 1.276644380261094e-07, + "loss": 0.0193, + "step": 32679 + }, + { + "epoch": 3.875251986244516, + "grad_norm": 0.5253486954001398, + "learning_rate": 1.2742228169042304e-07, + "loss": 0.0256, + "step": 32680 + }, + { + "epoch": 3.8753705680066406, + "grad_norm": 0.5596053493868849, + "learning_rate": 1.2718035464940326e-07, + "loss": 0.0249, + "step": 32681 + }, + { + "epoch": 3.8754891497687654, + "grad_norm": 0.5890691795724299, + "learning_rate": 1.2693865690527884e-07, + "loss": 0.027, + "step": 32682 + }, + { + "epoch": 3.8756077315308906, + "grad_norm": 0.5101345500493113, + "learning_rate": 1.2669718846027578e-07, + "loss": 0.0237, + "step": 32683 + }, + { + "epoch": 3.8757263132930158, + "grad_norm": 0.5582198955708008, + "learning_rate": 1.2645594931662563e-07, + "loss": 0.0253, + "step": 32684 + }, + { + "epoch": 3.8758448950551405, + "grad_norm": 0.37333783924266273, + "learning_rate": 1.2621493947654882e-07, + "loss": 0.0165, + "step": 32685 + }, + { + "epoch": 3.8759634768172653, + "grad_norm": 0.4923635723583958, + "learning_rate": 1.2597415894227138e-07, + "loss": 0.0189, + "step": 32686 + }, + { + "epoch": 3.8760820585793905, + "grad_norm": 0.6993910643997822, + "learning_rate": 1.257336077160054e-07, + "loss": 0.0298, + "step": 32687 + }, + { + "epoch": 3.8762006403415157, + "grad_norm": 0.5208924870166101, + "learning_rate": 1.254932857999741e-07, + "loss": 0.0268, + "step": 32688 + }, + { + "epoch": 3.8763192221036404, + "grad_norm": 0.4298567123730573, + "learning_rate": 1.2525319319638962e-07, + "loss": 0.019, + "step": 32689 + }, + { + "epoch": 3.876437803865765, + "grad_norm": 0.5837464616752625, + "learning_rate": 1.250133299074696e-07, + "loss": 0.0315, + "step": 32690 + }, + { + "epoch": 3.8765563856278904, + "grad_norm": 0.4567739911858364, + "learning_rate": 1.2477369593542344e-07, + "loss": 0.0338, + "step": 32691 + }, + { + "epoch": 3.8766749673900156, + "grad_norm": 0.47100624937266755, + "learning_rate": 1.2453429128245763e-07, + "loss": 0.016, + "step": 32692 + }, + { + "epoch": 3.8767935491521404, + "grad_norm": 0.4158855844144647, + "learning_rate": 1.2429511595078435e-07, + "loss": 0.0207, + "step": 32693 + }, + { + "epoch": 3.876912130914265, + "grad_norm": 0.29119027337245906, + "learning_rate": 1.2405616994260184e-07, + "loss": 0.0132, + "step": 32694 + }, + { + "epoch": 3.8770307126763903, + "grad_norm": 0.4664870741907755, + "learning_rate": 1.238174532601194e-07, + "loss": 0.0216, + "step": 32695 + }, + { + "epoch": 3.8771492944385155, + "grad_norm": 0.5321490654801614, + "learning_rate": 1.2357896590553254e-07, + "loss": 0.0218, + "step": 32696 + }, + { + "epoch": 3.8772678762006403, + "grad_norm": 0.5928500043530449, + "learning_rate": 1.2334070788104501e-07, + "loss": 0.0291, + "step": 32697 + }, + { + "epoch": 3.8773864579627655, + "grad_norm": 0.5183502338939752, + "learning_rate": 1.2310267918884956e-07, + "loss": 0.0246, + "step": 32698 + }, + { + "epoch": 3.8775050397248902, + "grad_norm": 0.5102843066292511, + "learning_rate": 1.2286487983113882e-07, + "loss": 0.0191, + "step": 32699 + }, + { + "epoch": 3.8776236214870154, + "grad_norm": 0.8412423165425478, + "learning_rate": 1.2262730981010828e-07, + "loss": 0.0281, + "step": 32700 + }, + { + "epoch": 3.87774220324914, + "grad_norm": 0.44953158255585496, + "learning_rate": 1.2238996912795065e-07, + "loss": 0.0192, + "step": 32701 + }, + { + "epoch": 3.8778607850112654, + "grad_norm": 0.44850607515340973, + "learning_rate": 1.2215285778684748e-07, + "loss": 0.019, + "step": 32702 + }, + { + "epoch": 3.87797936677339, + "grad_norm": 0.5150948120731565, + "learning_rate": 1.2191597578898872e-07, + "loss": 0.0235, + "step": 32703 + }, + { + "epoch": 3.8780979485355154, + "grad_norm": 0.6963698675632806, + "learning_rate": 1.2167932313655595e-07, + "loss": 0.026, + "step": 32704 + }, + { + "epoch": 3.87821653029764, + "grad_norm": 0.39658544649260224, + "learning_rate": 1.214428998317335e-07, + "loss": 0.0188, + "step": 32705 + }, + { + "epoch": 3.8783351120597653, + "grad_norm": 0.6774797496853845, + "learning_rate": 1.2120670587669747e-07, + "loss": 0.0373, + "step": 32706 + }, + { + "epoch": 3.87845369382189, + "grad_norm": 0.43733324552338454, + "learning_rate": 1.209707412736294e-07, + "loss": 0.0241, + "step": 32707 + }, + { + "epoch": 3.8785722755840153, + "grad_norm": 0.27665934241143364, + "learning_rate": 1.2073500602470255e-07, + "loss": 0.0146, + "step": 32708 + }, + { + "epoch": 3.87869085734614, + "grad_norm": 0.25842259851662985, + "learning_rate": 1.2049950013209022e-07, + "loss": 0.0131, + "step": 32709 + }, + { + "epoch": 3.8788094391082653, + "grad_norm": 0.8342220298453545, + "learning_rate": 1.202642235979601e-07, + "loss": 0.0386, + "step": 32710 + }, + { + "epoch": 3.87892802087039, + "grad_norm": 0.6037545181142556, + "learning_rate": 1.2002917642448543e-07, + "loss": 0.0224, + "step": 32711 + }, + { + "epoch": 3.879046602632515, + "grad_norm": 0.43170714610825284, + "learning_rate": 1.1979435861383392e-07, + "loss": 0.0189, + "step": 32712 + }, + { + "epoch": 3.87916518439464, + "grad_norm": 0.3148970951399997, + "learning_rate": 1.1955977016816778e-07, + "loss": 0.0136, + "step": 32713 + }, + { + "epoch": 3.879283766156765, + "grad_norm": 0.5088820528993692, + "learning_rate": 1.1932541108964912e-07, + "loss": 0.0266, + "step": 32714 + }, + { + "epoch": 3.87940234791889, + "grad_norm": 0.47823358255777154, + "learning_rate": 1.1909128138044013e-07, + "loss": 0.0212, + "step": 32715 + }, + { + "epoch": 3.879520929681015, + "grad_norm": 0.43968723391023573, + "learning_rate": 1.1885738104269739e-07, + "loss": 0.0226, + "step": 32716 + }, + { + "epoch": 3.87963951144314, + "grad_norm": 0.9353379765466897, + "learning_rate": 1.186237100785803e-07, + "loss": 0.0546, + "step": 32717 + }, + { + "epoch": 3.879758093205265, + "grad_norm": 0.8303197126263923, + "learning_rate": 1.1839026849023993e-07, + "loss": 0.0302, + "step": 32718 + }, + { + "epoch": 3.87987667496739, + "grad_norm": 0.6217878356193468, + "learning_rate": 1.1815705627983009e-07, + "loss": 0.0254, + "step": 32719 + }, + { + "epoch": 3.879995256729515, + "grad_norm": 0.8122231892836824, + "learning_rate": 1.1792407344949907e-07, + "loss": 0.0392, + "step": 32720 + }, + { + "epoch": 3.88011383849164, + "grad_norm": 0.959321928834248, + "learning_rate": 1.1769132000139516e-07, + "loss": 0.0314, + "step": 32721 + }, + { + "epoch": 3.880232420253765, + "grad_norm": 0.4474968966648696, + "learning_rate": 1.1745879593766385e-07, + "loss": 0.02, + "step": 32722 + }, + { + "epoch": 3.88035100201589, + "grad_norm": 0.43264463828014044, + "learning_rate": 1.1722650126045065e-07, + "loss": 0.0188, + "step": 32723 + }, + { + "epoch": 3.880469583778015, + "grad_norm": 0.48160691091089064, + "learning_rate": 1.169944359718983e-07, + "loss": 0.0275, + "step": 32724 + }, + { + "epoch": 3.88058816554014, + "grad_norm": 0.6313937253576575, + "learning_rate": 1.167626000741412e-07, + "loss": 0.0292, + "step": 32725 + }, + { + "epoch": 3.880706747302265, + "grad_norm": 0.5447381273443939, + "learning_rate": 1.1653099356931929e-07, + "loss": 0.0196, + "step": 32726 + }, + { + "epoch": 3.8808253290643897, + "grad_norm": 0.5090796980296977, + "learning_rate": 1.1629961645956699e-07, + "loss": 0.0263, + "step": 32727 + }, + { + "epoch": 3.880943910826515, + "grad_norm": 0.43953016485975943, + "learning_rate": 1.160684687470187e-07, + "loss": 0.0204, + "step": 32728 + }, + { + "epoch": 3.88106249258864, + "grad_norm": 0.3650618191931547, + "learning_rate": 1.1583755043380328e-07, + "loss": 0.0164, + "step": 32729 + }, + { + "epoch": 3.881181074350765, + "grad_norm": 0.7372840033121476, + "learning_rate": 1.1560686152205236e-07, + "loss": 0.0336, + "step": 32730 + }, + { + "epoch": 3.8812996561128896, + "grad_norm": 0.41050835570446714, + "learning_rate": 1.1537640201389199e-07, + "loss": 0.0182, + "step": 32731 + }, + { + "epoch": 3.881418237875015, + "grad_norm": 0.4541691933886445, + "learning_rate": 1.1514617191144272e-07, + "loss": 0.022, + "step": 32732 + }, + { + "epoch": 3.88153681963714, + "grad_norm": 0.8957272574983618, + "learning_rate": 1.1491617121683062e-07, + "loss": 0.0399, + "step": 32733 + }, + { + "epoch": 3.881655401399265, + "grad_norm": 0.4293997359170599, + "learning_rate": 1.1468639993217623e-07, + "loss": 0.0247, + "step": 32734 + }, + { + "epoch": 3.8817739831613896, + "grad_norm": 0.5070062109570288, + "learning_rate": 1.1445685805959727e-07, + "loss": 0.0174, + "step": 32735 + }, + { + "epoch": 3.8818925649235148, + "grad_norm": 0.5639938760973098, + "learning_rate": 1.1422754560120596e-07, + "loss": 0.0242, + "step": 32736 + }, + { + "epoch": 3.88201114668564, + "grad_norm": 0.42810846539402814, + "learning_rate": 1.1399846255912283e-07, + "loss": 0.0208, + "step": 32737 + }, + { + "epoch": 3.8821297284477647, + "grad_norm": 0.47922806329418427, + "learning_rate": 1.137696089354573e-07, + "loss": 0.0228, + "step": 32738 + }, + { + "epoch": 3.8822483102098895, + "grad_norm": 0.4544732797019944, + "learning_rate": 1.13540984732316e-07, + "loss": 0.0208, + "step": 32739 + }, + { + "epoch": 3.8823668919720147, + "grad_norm": 0.5949258008748086, + "learning_rate": 1.1331258995181116e-07, + "loss": 0.0302, + "step": 32740 + }, + { + "epoch": 3.88248547373414, + "grad_norm": 0.47189121966141123, + "learning_rate": 1.1308442459604385e-07, + "loss": 0.0192, + "step": 32741 + }, + { + "epoch": 3.8826040554962646, + "grad_norm": 1.0201583203393738, + "learning_rate": 1.128564886671235e-07, + "loss": 0.0601, + "step": 32742 + }, + { + "epoch": 3.8827226372583894, + "grad_norm": 0.4052178664765206, + "learning_rate": 1.1262878216714567e-07, + "loss": 0.0224, + "step": 32743 + }, + { + "epoch": 3.8828412190205146, + "grad_norm": 0.39847991429314666, + "learning_rate": 1.1240130509821145e-07, + "loss": 0.0147, + "step": 32744 + }, + { + "epoch": 3.88295980078264, + "grad_norm": 0.5729902493668378, + "learning_rate": 1.1217405746241639e-07, + "loss": 0.0291, + "step": 32745 + }, + { + "epoch": 3.8830783825447646, + "grad_norm": 0.3889004385021702, + "learning_rate": 1.1194703926185879e-07, + "loss": 0.0164, + "step": 32746 + }, + { + "epoch": 3.8831969643068898, + "grad_norm": 0.40104142287437694, + "learning_rate": 1.1172025049862866e-07, + "loss": 0.0203, + "step": 32747 + }, + { + "epoch": 3.8833155460690145, + "grad_norm": 0.3963653755864554, + "learning_rate": 1.1149369117481878e-07, + "loss": 0.0162, + "step": 32748 + }, + { + "epoch": 3.8834341278311397, + "grad_norm": 0.4441498411739043, + "learning_rate": 1.1126736129251636e-07, + "loss": 0.0143, + "step": 32749 + }, + { + "epoch": 3.8835527095932645, + "grad_norm": 0.28915864276129905, + "learning_rate": 1.1104126085380861e-07, + "loss": 0.0141, + "step": 32750 + }, + { + "epoch": 3.8836712913553897, + "grad_norm": 0.7152308303839191, + "learning_rate": 1.1081538986078e-07, + "loss": 0.0349, + "step": 32751 + }, + { + "epoch": 3.8837898731175144, + "grad_norm": 0.3789100313548978, + "learning_rate": 1.1058974831551216e-07, + "loss": 0.02, + "step": 32752 + }, + { + "epoch": 3.8839084548796396, + "grad_norm": 1.1224636511978052, + "learning_rate": 1.1036433622008679e-07, + "loss": 0.0617, + "step": 32753 + }, + { + "epoch": 3.8840270366417644, + "grad_norm": 0.7675610444189201, + "learning_rate": 1.1013915357657722e-07, + "loss": 0.0438, + "step": 32754 + }, + { + "epoch": 3.8841456184038896, + "grad_norm": 0.6330947798242184, + "learning_rate": 1.0991420038706512e-07, + "loss": 0.028, + "step": 32755 + }, + { + "epoch": 3.8842642001660144, + "grad_norm": 0.6326599877976278, + "learning_rate": 1.0968947665362106e-07, + "loss": 0.0297, + "step": 32756 + }, + { + "epoch": 3.8843827819281396, + "grad_norm": 0.7910524063705733, + "learning_rate": 1.0946498237831837e-07, + "loss": 0.0269, + "step": 32757 + }, + { + "epoch": 3.8845013636902643, + "grad_norm": 0.6093089722636637, + "learning_rate": 1.0924071756322484e-07, + "loss": 0.0278, + "step": 32758 + }, + { + "epoch": 3.8846199454523895, + "grad_norm": 0.3368534495151929, + "learning_rate": 1.0901668221040828e-07, + "loss": 0.0132, + "step": 32759 + }, + { + "epoch": 3.8847385272145143, + "grad_norm": 0.42057484901431474, + "learning_rate": 1.0879287632193369e-07, + "loss": 0.0247, + "step": 32760 + }, + { + "epoch": 3.8848571089766395, + "grad_norm": 0.49696282477057535, + "learning_rate": 1.0856929989986886e-07, + "loss": 0.0226, + "step": 32761 + }, + { + "epoch": 3.8849756907387643, + "grad_norm": 0.5214393746752765, + "learning_rate": 1.0834595294626771e-07, + "loss": 0.0215, + "step": 32762 + }, + { + "epoch": 3.8850942725008895, + "grad_norm": 0.7856818622390687, + "learning_rate": 1.0812283546319247e-07, + "loss": 0.0364, + "step": 32763 + }, + { + "epoch": 3.885212854263014, + "grad_norm": 0.31908470132255695, + "learning_rate": 1.078999474527026e-07, + "loss": 0.0127, + "step": 32764 + }, + { + "epoch": 3.8853314360251394, + "grad_norm": 0.36084043875327637, + "learning_rate": 1.0767728891685202e-07, + "loss": 0.0168, + "step": 32765 + }, + { + "epoch": 3.885450017787264, + "grad_norm": 0.31563387800699755, + "learning_rate": 1.0745485985768911e-07, + "loss": 0.0155, + "step": 32766 + }, + { + "epoch": 3.8855685995493894, + "grad_norm": 0.43839616004862586, + "learning_rate": 1.0723266027726775e-07, + "loss": 0.0285, + "step": 32767 + }, + { + "epoch": 3.885687181311514, + "grad_norm": 0.3856497069224779, + "learning_rate": 1.0701069017763631e-07, + "loss": 0.0145, + "step": 32768 + }, + { + "epoch": 3.8858057630736393, + "grad_norm": 0.5788784397916891, + "learning_rate": 1.0678894956083763e-07, + "loss": 0.0281, + "step": 32769 + }, + { + "epoch": 3.885924344835764, + "grad_norm": 0.5776328558034969, + "learning_rate": 1.0656743842892003e-07, + "loss": 0.0324, + "step": 32770 + }, + { + "epoch": 3.8860429265978893, + "grad_norm": 0.5269075935142127, + "learning_rate": 1.0634615678392635e-07, + "loss": 0.0244, + "step": 32771 + }, + { + "epoch": 3.886161508360014, + "grad_norm": 0.5031807132057741, + "learning_rate": 1.0612510462789382e-07, + "loss": 0.025, + "step": 32772 + }, + { + "epoch": 3.8862800901221393, + "grad_norm": 0.4567748895873529, + "learning_rate": 1.0590428196285973e-07, + "loss": 0.0194, + "step": 32773 + }, + { + "epoch": 3.8863986718842645, + "grad_norm": 0.4216460798898806, + "learning_rate": 1.0568368879086132e-07, + "loss": 0.02, + "step": 32774 + }, + { + "epoch": 3.8865172536463892, + "grad_norm": 0.3919279908100094, + "learning_rate": 1.0546332511393309e-07, + "loss": 0.0188, + "step": 32775 + }, + { + "epoch": 3.886635835408514, + "grad_norm": 0.7314088447911175, + "learning_rate": 1.0524319093410395e-07, + "loss": 0.0347, + "step": 32776 + }, + { + "epoch": 3.886754417170639, + "grad_norm": 0.5811210190326367, + "learning_rate": 1.0502328625340563e-07, + "loss": 0.0275, + "step": 32777 + }, + { + "epoch": 3.8868729989327644, + "grad_norm": 0.41839274905863744, + "learning_rate": 1.0480361107386427e-07, + "loss": 0.0176, + "step": 32778 + }, + { + "epoch": 3.886991580694889, + "grad_norm": 0.5095030741528924, + "learning_rate": 1.0458416539750327e-07, + "loss": 0.0224, + "step": 32779 + }, + { + "epoch": 3.887110162457014, + "grad_norm": 0.418915685102738, + "learning_rate": 1.0436494922634876e-07, + "loss": 0.0187, + "step": 32780 + }, + { + "epoch": 3.887228744219139, + "grad_norm": 0.6170322333671703, + "learning_rate": 1.0414596256242137e-07, + "loss": 0.0304, + "step": 32781 + }, + { + "epoch": 3.8873473259812643, + "grad_norm": 0.37629412387613376, + "learning_rate": 1.0392720540773893e-07, + "loss": 0.0212, + "step": 32782 + }, + { + "epoch": 3.887465907743389, + "grad_norm": 0.9104152957694075, + "learning_rate": 1.0370867776431647e-07, + "loss": 0.0415, + "step": 32783 + }, + { + "epoch": 3.887584489505514, + "grad_norm": 0.41589310761526155, + "learning_rate": 1.0349037963417185e-07, + "loss": 0.0135, + "step": 32784 + }, + { + "epoch": 3.887703071267639, + "grad_norm": 0.4717108845163511, + "learning_rate": 1.0327231101931179e-07, + "loss": 0.0177, + "step": 32785 + }, + { + "epoch": 3.8878216530297642, + "grad_norm": 0.39102252140928045, + "learning_rate": 1.0305447192175688e-07, + "loss": 0.0174, + "step": 32786 + }, + { + "epoch": 3.887940234791889, + "grad_norm": 0.3387846898414189, + "learning_rate": 1.0283686234350554e-07, + "loss": 0.0169, + "step": 32787 + }, + { + "epoch": 3.8880588165540138, + "grad_norm": 0.6421575470721878, + "learning_rate": 1.0261948228656726e-07, + "loss": 0.0259, + "step": 32788 + }, + { + "epoch": 3.888177398316139, + "grad_norm": 0.3716080902196915, + "learning_rate": 1.02402331752946e-07, + "loss": 0.0153, + "step": 32789 + }, + { + "epoch": 3.888295980078264, + "grad_norm": 0.7303524519062408, + "learning_rate": 1.0218541074464295e-07, + "loss": 0.0345, + "step": 32790 + }, + { + "epoch": 3.888414561840389, + "grad_norm": 0.5660289616288642, + "learning_rate": 1.0196871926366202e-07, + "loss": 0.0214, + "step": 32791 + }, + { + "epoch": 3.8885331436025137, + "grad_norm": 0.7402812384359199, + "learning_rate": 1.0175225731199612e-07, + "loss": 0.0247, + "step": 32792 + }, + { + "epoch": 3.888651725364639, + "grad_norm": 0.7100805595942826, + "learning_rate": 1.0153602489164082e-07, + "loss": 0.0271, + "step": 32793 + }, + { + "epoch": 3.888770307126764, + "grad_norm": 0.375949644124607, + "learning_rate": 1.0132002200459179e-07, + "loss": 0.0139, + "step": 32794 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 0.6934647110242815, + "learning_rate": 1.0110424865284185e-07, + "loss": 0.0284, + "step": 32795 + }, + { + "epoch": 3.8890074706510136, + "grad_norm": 0.5582115788056977, + "learning_rate": 1.0088870483837276e-07, + "loss": 0.0283, + "step": 32796 + }, + { + "epoch": 3.889126052413139, + "grad_norm": 0.6437577030619719, + "learning_rate": 1.0067339056318015e-07, + "loss": 0.0285, + "step": 32797 + }, + { + "epoch": 3.889244634175264, + "grad_norm": 0.576021150631878, + "learning_rate": 1.0045830582924299e-07, + "loss": 0.0261, + "step": 32798 + }, + { + "epoch": 3.8893632159373888, + "grad_norm": 0.42052630463577434, + "learning_rate": 1.0024345063854857e-07, + "loss": 0.0172, + "step": 32799 + }, + { + "epoch": 3.889481797699514, + "grad_norm": 0.5239410520219584, + "learning_rate": 1.0002882499307309e-07, + "loss": 0.0291, + "step": 32800 + }, + { + "epoch": 3.8896003794616387, + "grad_norm": 0.5861771628526631, + "learning_rate": 9.98144288947983e-08, + "loss": 0.0312, + "step": 32801 + }, + { + "epoch": 3.889718961223764, + "grad_norm": 0.7258926547118401, + "learning_rate": 9.960026234570319e-08, + "loss": 0.032, + "step": 32802 + }, + { + "epoch": 3.8898375429858887, + "grad_norm": 0.702795300303714, + "learning_rate": 9.938632534775283e-08, + "loss": 0.0172, + "step": 32803 + }, + { + "epoch": 3.889956124748014, + "grad_norm": 0.42999719913882106, + "learning_rate": 9.917261790292898e-08, + "loss": 0.0199, + "step": 32804 + }, + { + "epoch": 3.8900747065101386, + "grad_norm": 0.6316927313466153, + "learning_rate": 9.89591400131995e-08, + "loss": 0.0265, + "step": 32805 + }, + { + "epoch": 3.890193288272264, + "grad_norm": 0.44357778744587406, + "learning_rate": 9.87458916805295e-08, + "loss": 0.0266, + "step": 32806 + }, + { + "epoch": 3.8903118700343886, + "grad_norm": 0.4896669669796747, + "learning_rate": 9.853287290688407e-08, + "loss": 0.0301, + "step": 32807 + }, + { + "epoch": 3.890430451796514, + "grad_norm": 0.6290958229867226, + "learning_rate": 9.832008369423385e-08, + "loss": 0.0244, + "step": 32808 + }, + { + "epoch": 3.8905490335586386, + "grad_norm": 0.35718820892983694, + "learning_rate": 9.810752404453282e-08, + "loss": 0.0208, + "step": 32809 + }, + { + "epoch": 3.8906676153207638, + "grad_norm": 0.7707842913987072, + "learning_rate": 9.789519395974334e-08, + "loss": 0.0333, + "step": 32810 + }, + { + "epoch": 3.8907861970828885, + "grad_norm": 0.85157083317171, + "learning_rate": 9.768309344182491e-08, + "loss": 0.0399, + "step": 32811 + }, + { + "epoch": 3.8909047788450137, + "grad_norm": 0.4635860266861261, + "learning_rate": 9.747122249273156e-08, + "loss": 0.0194, + "step": 32812 + }, + { + "epoch": 3.8910233606071385, + "grad_norm": 0.6570219329228497, + "learning_rate": 9.725958111441447e-08, + "loss": 0.0314, + "step": 32813 + }, + { + "epoch": 3.8911419423692637, + "grad_norm": 0.7861729356571042, + "learning_rate": 9.704816930882766e-08, + "loss": 0.0374, + "step": 32814 + }, + { + "epoch": 3.8912605241313885, + "grad_norm": 0.47530149520852905, + "learning_rate": 9.683698707791678e-08, + "loss": 0.0256, + "step": 32815 + }, + { + "epoch": 3.8913791058935137, + "grad_norm": 0.32085734754947204, + "learning_rate": 9.662603442363305e-08, + "loss": 0.0159, + "step": 32816 + }, + { + "epoch": 3.8914976876556384, + "grad_norm": 0.6458862860241278, + "learning_rate": 9.64153113479166e-08, + "loss": 0.0332, + "step": 32817 + }, + { + "epoch": 3.8916162694177636, + "grad_norm": 0.41997720247075776, + "learning_rate": 9.620481785271308e-08, + "loss": 0.0202, + "step": 32818 + }, + { + "epoch": 3.8917348511798884, + "grad_norm": 0.7869130602016425, + "learning_rate": 9.59945539399626e-08, + "loss": 0.0317, + "step": 32819 + }, + { + "epoch": 3.8918534329420136, + "grad_norm": 0.5762158332847528, + "learning_rate": 9.57845196116025e-08, + "loss": 0.0224, + "step": 32820 + }, + { + "epoch": 3.8919720147041383, + "grad_norm": 0.6823062729033833, + "learning_rate": 9.557471486957015e-08, + "loss": 0.0249, + "step": 32821 + }, + { + "epoch": 3.8920905964662635, + "grad_norm": 0.6310552163531695, + "learning_rate": 9.53651397157973e-08, + "loss": 0.0304, + "step": 32822 + }, + { + "epoch": 3.8922091782283887, + "grad_norm": 0.5673159765606386, + "learning_rate": 9.515579415221853e-08, + "loss": 0.0232, + "step": 32823 + }, + { + "epoch": 3.8923277599905135, + "grad_norm": 0.6094942233146888, + "learning_rate": 9.494667818076564e-08, + "loss": 0.0264, + "step": 32824 + }, + { + "epoch": 3.8924463417526383, + "grad_norm": 0.689014377579962, + "learning_rate": 9.473779180335929e-08, + "loss": 0.0259, + "step": 32825 + }, + { + "epoch": 3.8925649235147635, + "grad_norm": 0.3424209245162095, + "learning_rate": 9.452913502193406e-08, + "loss": 0.0168, + "step": 32826 + }, + { + "epoch": 3.8926835052768887, + "grad_norm": 0.2972656116314417, + "learning_rate": 9.432070783840508e-08, + "loss": 0.0137, + "step": 32827 + }, + { + "epoch": 3.8928020870390134, + "grad_norm": 0.6026373629606993, + "learning_rate": 9.411251025470136e-08, + "loss": 0.0292, + "step": 32828 + }, + { + "epoch": 3.892920668801138, + "grad_norm": 0.7516910012946831, + "learning_rate": 9.390454227273804e-08, + "loss": 0.0327, + "step": 32829 + }, + { + "epoch": 3.8930392505632634, + "grad_norm": 0.6393735658323294, + "learning_rate": 9.369680389443025e-08, + "loss": 0.0234, + "step": 32830 + }, + { + "epoch": 3.8931578323253886, + "grad_norm": 0.5808091018491908, + "learning_rate": 9.348929512169869e-08, + "loss": 0.0261, + "step": 32831 + }, + { + "epoch": 3.8932764140875133, + "grad_norm": 0.6638134285769574, + "learning_rate": 9.328201595645014e-08, + "loss": 0.024, + "step": 32832 + }, + { + "epoch": 3.893394995849638, + "grad_norm": 0.571560928438208, + "learning_rate": 9.307496640060254e-08, + "loss": 0.0297, + "step": 32833 + }, + { + "epoch": 3.8935135776117633, + "grad_norm": 0.6871526937737936, + "learning_rate": 9.286814645605713e-08, + "loss": 0.0299, + "step": 32834 + }, + { + "epoch": 3.8936321593738885, + "grad_norm": 0.9110841662031888, + "learning_rate": 9.266155612472905e-08, + "loss": 0.0438, + "step": 32835 + }, + { + "epoch": 3.8937507411360133, + "grad_norm": 0.6765622424916666, + "learning_rate": 9.2455195408514e-08, + "loss": 0.033, + "step": 32836 + }, + { + "epoch": 3.893869322898138, + "grad_norm": 0.5259619561554134, + "learning_rate": 9.224906430931879e-08, + "loss": 0.0205, + "step": 32837 + }, + { + "epoch": 3.8939879046602632, + "grad_norm": 0.43854611454014875, + "learning_rate": 9.204316282904468e-08, + "loss": 0.0198, + "step": 32838 + }, + { + "epoch": 3.8941064864223884, + "grad_norm": 0.5320183729866419, + "learning_rate": 9.183749096958739e-08, + "loss": 0.0348, + "step": 32839 + }, + { + "epoch": 3.894225068184513, + "grad_norm": 0.41484897189812814, + "learning_rate": 9.163204873284536e-08, + "loss": 0.0156, + "step": 32840 + }, + { + "epoch": 3.894343649946638, + "grad_norm": 0.5115332087687546, + "learning_rate": 9.142683612071157e-08, + "loss": 0.0234, + "step": 32841 + }, + { + "epoch": 3.894462231708763, + "grad_norm": 0.5134065637406902, + "learning_rate": 9.122185313507614e-08, + "loss": 0.0255, + "step": 32842 + }, + { + "epoch": 3.8945808134708884, + "grad_norm": 0.5982826510574623, + "learning_rate": 9.101709977782925e-08, + "loss": 0.0216, + "step": 32843 + }, + { + "epoch": 3.894699395233013, + "grad_norm": 0.46371174354657063, + "learning_rate": 9.081257605086102e-08, + "loss": 0.0171, + "step": 32844 + }, + { + "epoch": 3.894817976995138, + "grad_norm": 0.898499077534335, + "learning_rate": 9.060828195605331e-08, + "loss": 0.0325, + "step": 32845 + }, + { + "epoch": 3.894936558757263, + "grad_norm": 0.41795125921866366, + "learning_rate": 9.040421749529349e-08, + "loss": 0.019, + "step": 32846 + }, + { + "epoch": 3.8950551405193883, + "grad_norm": 0.32961729025458625, + "learning_rate": 9.02003826704606e-08, + "loss": 0.0136, + "step": 32847 + }, + { + "epoch": 3.895173722281513, + "grad_norm": 0.419550882220157, + "learning_rate": 8.999677748343093e-08, + "loss": 0.0181, + "step": 32848 + }, + { + "epoch": 3.8952923040436382, + "grad_norm": 0.6757401016852386, + "learning_rate": 8.979340193608631e-08, + "loss": 0.0323, + "step": 32849 + }, + { + "epoch": 3.895410885805763, + "grad_norm": 0.35706580529966797, + "learning_rate": 8.959025603030025e-08, + "loss": 0.0112, + "step": 32850 + }, + { + "epoch": 3.895529467567888, + "grad_norm": 0.5230995169960441, + "learning_rate": 8.938733976794623e-08, + "loss": 0.0182, + "step": 32851 + }, + { + "epoch": 3.895648049330013, + "grad_norm": 0.40099316019209763, + "learning_rate": 8.918465315088942e-08, + "loss": 0.0188, + "step": 32852 + }, + { + "epoch": 3.895766631092138, + "grad_norm": 0.38897716161339707, + "learning_rate": 8.898219618100612e-08, + "loss": 0.0196, + "step": 32853 + }, + { + "epoch": 3.895885212854263, + "grad_norm": 0.562783969368509, + "learning_rate": 8.877996886015594e-08, + "loss": 0.0165, + "step": 32854 + }, + { + "epoch": 3.896003794616388, + "grad_norm": 0.5702160459761578, + "learning_rate": 8.857797119020961e-08, + "loss": 0.0244, + "step": 32855 + }, + { + "epoch": 3.896122376378513, + "grad_norm": 0.5140791873352034, + "learning_rate": 8.83762031730212e-08, + "loss": 0.0258, + "step": 32856 + }, + { + "epoch": 3.896240958140638, + "grad_norm": 0.766761405139536, + "learning_rate": 8.817466481045867e-08, + "loss": 0.0354, + "step": 32857 + }, + { + "epoch": 3.896359539902763, + "grad_norm": 0.41232158991628676, + "learning_rate": 8.797335610437885e-08, + "loss": 0.0161, + "step": 32858 + }, + { + "epoch": 3.896478121664888, + "grad_norm": 0.6125417443246531, + "learning_rate": 8.777227705663305e-08, + "loss": 0.0186, + "step": 32859 + }, + { + "epoch": 3.896596703427013, + "grad_norm": 0.3741472353426438, + "learning_rate": 8.757142766907533e-08, + "loss": 0.0237, + "step": 32860 + }, + { + "epoch": 3.896715285189138, + "grad_norm": 0.9043322142303796, + "learning_rate": 8.737080794356257e-08, + "loss": 0.0417, + "step": 32861 + }, + { + "epoch": 3.8968338669512628, + "grad_norm": 0.401239451032039, + "learning_rate": 8.717041788194047e-08, + "loss": 0.0139, + "step": 32862 + }, + { + "epoch": 3.896952448713388, + "grad_norm": 0.7235971582878648, + "learning_rate": 8.697025748605758e-08, + "loss": 0.032, + "step": 32863 + }, + { + "epoch": 3.8970710304755127, + "grad_norm": 0.501756964436278, + "learning_rate": 8.677032675775687e-08, + "loss": 0.0224, + "step": 32864 + }, + { + "epoch": 3.897189612237638, + "grad_norm": 0.7270043629491337, + "learning_rate": 8.657062569888408e-08, + "loss": 0.0346, + "step": 32865 + }, + { + "epoch": 3.8973081939997627, + "grad_norm": 0.4540139820499246, + "learning_rate": 8.63711543112794e-08, + "loss": 0.0192, + "step": 32866 + }, + { + "epoch": 3.897426775761888, + "grad_norm": 0.6118033547626213, + "learning_rate": 8.617191259678026e-08, + "loss": 0.0248, + "step": 32867 + }, + { + "epoch": 3.8975453575240127, + "grad_norm": 0.8295366637529634, + "learning_rate": 8.597290055722684e-08, + "loss": 0.0442, + "step": 32868 + }, + { + "epoch": 3.897663939286138, + "grad_norm": 0.807480710882785, + "learning_rate": 8.577411819445103e-08, + "loss": 0.0469, + "step": 32869 + }, + { + "epoch": 3.8977825210482626, + "grad_norm": 0.676652404952133, + "learning_rate": 8.557556551028467e-08, + "loss": 0.0255, + "step": 32870 + }, + { + "epoch": 3.897901102810388, + "grad_norm": 0.6524269590680122, + "learning_rate": 8.537724250656243e-08, + "loss": 0.0235, + "step": 32871 + }, + { + "epoch": 3.898019684572513, + "grad_norm": 0.312292043569829, + "learning_rate": 8.517914918510506e-08, + "loss": 0.0138, + "step": 32872 + }, + { + "epoch": 3.898138266334638, + "grad_norm": 0.751214359431838, + "learning_rate": 8.49812855477472e-08, + "loss": 0.0357, + "step": 32873 + }, + { + "epoch": 3.8982568480967625, + "grad_norm": 0.5573124972621954, + "learning_rate": 8.478365159630686e-08, + "loss": 0.0277, + "step": 32874 + }, + { + "epoch": 3.8983754298588877, + "grad_norm": 0.3250948642892914, + "learning_rate": 8.458624733261034e-08, + "loss": 0.0131, + "step": 32875 + }, + { + "epoch": 3.898494011621013, + "grad_norm": 0.6917656648996087, + "learning_rate": 8.438907275847562e-08, + "loss": 0.0229, + "step": 32876 + }, + { + "epoch": 3.8986125933831377, + "grad_norm": 0.36421845016466153, + "learning_rate": 8.419212787571796e-08, + "loss": 0.021, + "step": 32877 + }, + { + "epoch": 3.8987311751452625, + "grad_norm": 0.3953958110462938, + "learning_rate": 8.39954126861553e-08, + "loss": 0.0149, + "step": 32878 + }, + { + "epoch": 3.8988497569073877, + "grad_norm": 0.6020667719120232, + "learning_rate": 8.379892719160009e-08, + "loss": 0.0275, + "step": 32879 + }, + { + "epoch": 3.898968338669513, + "grad_norm": 0.38360657037909646, + "learning_rate": 8.360267139386758e-08, + "loss": 0.0195, + "step": 32880 + }, + { + "epoch": 3.8990869204316376, + "grad_norm": 0.39835012429657934, + "learning_rate": 8.340664529476184e-08, + "loss": 0.0197, + "step": 32881 + }, + { + "epoch": 3.8992055021937624, + "grad_norm": 0.6915888707119626, + "learning_rate": 8.321084889609255e-08, + "loss": 0.0287, + "step": 32882 + }, + { + "epoch": 3.8993240839558876, + "grad_norm": 0.6671810243583413, + "learning_rate": 8.301528219966381e-08, + "loss": 0.0332, + "step": 32883 + }, + { + "epoch": 3.899442665718013, + "grad_norm": 0.3844554787794611, + "learning_rate": 8.281994520727976e-08, + "loss": 0.0245, + "step": 32884 + }, + { + "epoch": 3.8995612474801375, + "grad_norm": 0.9111618955133782, + "learning_rate": 8.262483792074172e-08, + "loss": 0.0468, + "step": 32885 + }, + { + "epoch": 3.8996798292422623, + "grad_norm": 0.23687884464870704, + "learning_rate": 8.242996034184269e-08, + "loss": 0.0093, + "step": 32886 + }, + { + "epoch": 3.8997984110043875, + "grad_norm": 0.4474607147590572, + "learning_rate": 8.223531247238681e-08, + "loss": 0.0126, + "step": 32887 + }, + { + "epoch": 3.8999169927665127, + "grad_norm": 0.9386253202892607, + "learning_rate": 8.204089431416428e-08, + "loss": 0.0556, + "step": 32888 + }, + { + "epoch": 3.9000355745286375, + "grad_norm": 0.5890312024866474, + "learning_rate": 8.184670586896814e-08, + "loss": 0.0282, + "step": 32889 + }, + { + "epoch": 3.9001541562907622, + "grad_norm": 0.6306002405539147, + "learning_rate": 8.165274713858861e-08, + "loss": 0.0287, + "step": 32890 + }, + { + "epoch": 3.9002727380528874, + "grad_norm": 0.4137961860236405, + "learning_rate": 8.145901812481593e-08, + "loss": 0.0182, + "step": 32891 + }, + { + "epoch": 3.9003913198150126, + "grad_norm": 0.539683929045907, + "learning_rate": 8.1265518829432e-08, + "loss": 0.0299, + "step": 32892 + }, + { + "epoch": 3.9005099015771374, + "grad_norm": 0.41483196513336973, + "learning_rate": 8.107224925422152e-08, + "loss": 0.0154, + "step": 32893 + }, + { + "epoch": 3.900628483339262, + "grad_norm": 0.2986712269280829, + "learning_rate": 8.087920940096915e-08, + "loss": 0.0146, + "step": 32894 + }, + { + "epoch": 3.9007470651013874, + "grad_norm": 0.46451732431283665, + "learning_rate": 8.068639927145127e-08, + "loss": 0.0235, + "step": 32895 + }, + { + "epoch": 3.9008656468635126, + "grad_norm": 0.36352713961093003, + "learning_rate": 8.0493818867447e-08, + "loss": 0.0112, + "step": 32896 + }, + { + "epoch": 3.9009842286256373, + "grad_norm": 0.5132113228291761, + "learning_rate": 8.03014681907327e-08, + "loss": 0.0285, + "step": 32897 + }, + { + "epoch": 3.9011028103877625, + "grad_norm": 0.6949493168059061, + "learning_rate": 8.010934724307917e-08, + "loss": 0.0275, + "step": 32898 + }, + { + "epoch": 3.9012213921498873, + "grad_norm": 0.6659674839874709, + "learning_rate": 7.991745602625999e-08, + "loss": 0.0286, + "step": 32899 + }, + { + "epoch": 3.9013399739120125, + "grad_norm": 0.4668956889527074, + "learning_rate": 7.972579454204043e-08, + "loss": 0.0259, + "step": 32900 + }, + { + "epoch": 3.9014585556741372, + "grad_norm": 0.6735016183700935, + "learning_rate": 7.953436279219129e-08, + "loss": 0.0364, + "step": 32901 + }, + { + "epoch": 3.9015771374362624, + "grad_norm": 0.6715648304527518, + "learning_rate": 7.934316077847782e-08, + "loss": 0.0234, + "step": 32902 + }, + { + "epoch": 3.901695719198387, + "grad_norm": 0.3723557679640903, + "learning_rate": 7.915218850265693e-08, + "loss": 0.023, + "step": 32903 + }, + { + "epoch": 3.9018143009605124, + "grad_norm": 0.543036926928761, + "learning_rate": 7.896144596649391e-08, + "loss": 0.0258, + "step": 32904 + }, + { + "epoch": 3.901932882722637, + "grad_norm": 0.7090154972436289, + "learning_rate": 7.877093317174844e-08, + "loss": 0.0327, + "step": 32905 + }, + { + "epoch": 3.9020514644847624, + "grad_norm": 0.6538918612700062, + "learning_rate": 7.858065012017191e-08, + "loss": 0.0266, + "step": 32906 + }, + { + "epoch": 3.902170046246887, + "grad_norm": 0.46811859992661825, + "learning_rate": 7.839059681352401e-08, + "loss": 0.0294, + "step": 32907 + }, + { + "epoch": 3.9022886280090123, + "grad_norm": 0.6505003396308524, + "learning_rate": 7.820077325355057e-08, + "loss": 0.0311, + "step": 32908 + }, + { + "epoch": 3.902407209771137, + "grad_norm": 0.44175053523052654, + "learning_rate": 7.801117944200576e-08, + "loss": 0.022, + "step": 32909 + }, + { + "epoch": 3.9025257915332623, + "grad_norm": 0.8529840062317057, + "learning_rate": 7.782181538063815e-08, + "loss": 0.0359, + "step": 32910 + }, + { + "epoch": 3.902644373295387, + "grad_norm": 0.5952132828975113, + "learning_rate": 7.763268107119081e-08, + "loss": 0.0313, + "step": 32911 + }, + { + "epoch": 3.9027629550575123, + "grad_norm": 0.49774204534298827, + "learning_rate": 7.74437765154068e-08, + "loss": 0.0224, + "step": 32912 + }, + { + "epoch": 3.902881536819637, + "grad_norm": 0.40663268491190196, + "learning_rate": 7.725510171503192e-08, + "loss": 0.0119, + "step": 32913 + }, + { + "epoch": 3.903000118581762, + "grad_norm": 0.4735647728658473, + "learning_rate": 7.706665667180091e-08, + "loss": 0.0231, + "step": 32914 + }, + { + "epoch": 3.903118700343887, + "grad_norm": 0.34041533285556563, + "learning_rate": 7.687844138745404e-08, + "loss": 0.0111, + "step": 32915 + }, + { + "epoch": 3.903237282106012, + "grad_norm": 0.4857610276712844, + "learning_rate": 7.669045586372326e-08, + "loss": 0.0259, + "step": 32916 + }, + { + "epoch": 3.903355863868137, + "grad_norm": 0.29692043303895543, + "learning_rate": 7.650270010234606e-08, + "loss": 0.0123, + "step": 32917 + }, + { + "epoch": 3.903474445630262, + "grad_norm": 0.6345046237272106, + "learning_rate": 7.631517410504884e-08, + "loss": 0.0292, + "step": 32918 + }, + { + "epoch": 3.903593027392387, + "grad_norm": 0.5850987267950636, + "learning_rate": 7.612787787356356e-08, + "loss": 0.0226, + "step": 32919 + }, + { + "epoch": 3.903711609154512, + "grad_norm": 0.7090509278815005, + "learning_rate": 7.594081140961384e-08, + "loss": 0.0262, + "step": 32920 + }, + { + "epoch": 3.903830190916637, + "grad_norm": 0.5541594377179736, + "learning_rate": 7.575397471492607e-08, + "loss": 0.0184, + "step": 32921 + }, + { + "epoch": 3.903948772678762, + "grad_norm": 0.5004512834343101, + "learning_rate": 7.556736779122386e-08, + "loss": 0.0395, + "step": 32922 + }, + { + "epoch": 3.904067354440887, + "grad_norm": 0.4331170731944438, + "learning_rate": 7.53809906402253e-08, + "loss": 0.0138, + "step": 32923 + }, + { + "epoch": 3.904185936203012, + "grad_norm": 0.6745381426386242, + "learning_rate": 7.519484326365123e-08, + "loss": 0.0279, + "step": 32924 + }, + { + "epoch": 3.9043045179651372, + "grad_norm": 0.5596771728752179, + "learning_rate": 7.500892566321416e-08, + "loss": 0.0289, + "step": 32925 + }, + { + "epoch": 3.904423099727262, + "grad_norm": 0.4355973845100918, + "learning_rate": 7.482323784062939e-08, + "loss": 0.0225, + "step": 32926 + }, + { + "epoch": 3.9045416814893867, + "grad_norm": 0.7099160821623313, + "learning_rate": 7.463777979760944e-08, + "loss": 0.0344, + "step": 32927 + }, + { + "epoch": 3.904660263251512, + "grad_norm": 0.41994845159728594, + "learning_rate": 7.445255153586406e-08, + "loss": 0.0176, + "step": 32928 + }, + { + "epoch": 3.904778845013637, + "grad_norm": 0.8705774616506163, + "learning_rate": 7.42675530571002e-08, + "loss": 0.0475, + "step": 32929 + }, + { + "epoch": 3.904897426775762, + "grad_norm": 0.5865378919995873, + "learning_rate": 7.408278436302485e-08, + "loss": 0.022, + "step": 32930 + }, + { + "epoch": 3.9050160085378867, + "grad_norm": 0.37940968143323733, + "learning_rate": 7.389824545533664e-08, + "loss": 0.0166, + "step": 32931 + }, + { + "epoch": 3.905134590300012, + "grad_norm": 0.40980681244540185, + "learning_rate": 7.371393633574253e-08, + "loss": 0.0157, + "step": 32932 + }, + { + "epoch": 3.905253172062137, + "grad_norm": 0.3992028089922306, + "learning_rate": 7.352985700594118e-08, + "loss": 0.0171, + "step": 32933 + }, + { + "epoch": 3.905371753824262, + "grad_norm": 0.5264043004505774, + "learning_rate": 7.334600746762288e-08, + "loss": 0.025, + "step": 32934 + }, + { + "epoch": 3.9054903355863866, + "grad_norm": 0.4319524224894581, + "learning_rate": 7.316238772249184e-08, + "loss": 0.024, + "step": 32935 + }, + { + "epoch": 3.905608917348512, + "grad_norm": 0.4657221431186572, + "learning_rate": 7.297899777223561e-08, + "loss": 0.0229, + "step": 32936 + }, + { + "epoch": 3.905727499110637, + "grad_norm": 0.31052235889051605, + "learning_rate": 7.27958376185417e-08, + "loss": 0.0143, + "step": 32937 + }, + { + "epoch": 3.9058460808727617, + "grad_norm": 0.5369516316934927, + "learning_rate": 7.261290726310599e-08, + "loss": 0.0283, + "step": 32938 + }, + { + "epoch": 3.9059646626348865, + "grad_norm": 0.386034781795001, + "learning_rate": 7.243020670761046e-08, + "loss": 0.0184, + "step": 32939 + }, + { + "epoch": 3.9060832443970117, + "grad_norm": 0.8060195347206971, + "learning_rate": 7.22477359537399e-08, + "loss": 0.0354, + "step": 32940 + }, + { + "epoch": 3.906201826159137, + "grad_norm": 0.47624059882962017, + "learning_rate": 7.206549500317905e-08, + "loss": 0.0183, + "step": 32941 + }, + { + "epoch": 3.9063204079212617, + "grad_norm": 0.5044358141714613, + "learning_rate": 7.188348385760158e-08, + "loss": 0.0286, + "step": 32942 + }, + { + "epoch": 3.9064389896833864, + "grad_norm": 0.406794629609817, + "learning_rate": 7.170170251869224e-08, + "loss": 0.0205, + "step": 32943 + }, + { + "epoch": 3.9065575714455116, + "grad_norm": 0.6841136654869208, + "learning_rate": 7.15201509881247e-08, + "loss": 0.0237, + "step": 32944 + }, + { + "epoch": 3.906676153207637, + "grad_norm": 0.5809399907864505, + "learning_rate": 7.133882926756985e-08, + "loss": 0.0205, + "step": 32945 + }, + { + "epoch": 3.9067947349697616, + "grad_norm": 0.6076950028180731, + "learning_rate": 7.115773735870413e-08, + "loss": 0.0181, + "step": 32946 + }, + { + "epoch": 3.906913316731887, + "grad_norm": 0.30361387586249594, + "learning_rate": 7.097687526319007e-08, + "loss": 0.011, + "step": 32947 + }, + { + "epoch": 3.9070318984940116, + "grad_norm": 0.46178108231233955, + "learning_rate": 7.079624298270416e-08, + "loss": 0.0263, + "step": 32948 + }, + { + "epoch": 3.9071504802561368, + "grad_norm": 1.0865953892085038, + "learning_rate": 7.061584051890336e-08, + "loss": 0.036, + "step": 32949 + }, + { + "epoch": 3.9072690620182615, + "grad_norm": 0.48785260108525, + "learning_rate": 7.043566787345579e-08, + "loss": 0.0241, + "step": 32950 + }, + { + "epoch": 3.9073876437803867, + "grad_norm": 0.5550353431025201, + "learning_rate": 7.025572504801847e-08, + "loss": 0.0221, + "step": 32951 + }, + { + "epoch": 3.9075062255425115, + "grad_norm": 0.6680996886467149, + "learning_rate": 7.007601204425395e-08, + "loss": 0.0261, + "step": 32952 + }, + { + "epoch": 3.9076248073046367, + "grad_norm": 0.40244891627757445, + "learning_rate": 6.989652886381925e-08, + "loss": 0.0177, + "step": 32953 + }, + { + "epoch": 3.9077433890667614, + "grad_norm": 0.48016686514357376, + "learning_rate": 6.971727550836304e-08, + "loss": 0.0225, + "step": 32954 + }, + { + "epoch": 3.9078619708288866, + "grad_norm": 0.3417325779522888, + "learning_rate": 6.95382519795451e-08, + "loss": 0.0184, + "step": 32955 + }, + { + "epoch": 3.9079805525910114, + "grad_norm": 0.37418097618542734, + "learning_rate": 6.935945827901413e-08, + "loss": 0.0194, + "step": 32956 + }, + { + "epoch": 3.9080991343531366, + "grad_norm": 0.47077300395122074, + "learning_rate": 6.918089440841324e-08, + "loss": 0.0182, + "step": 32957 + }, + { + "epoch": 3.9082177161152614, + "grad_norm": 0.34108800372612075, + "learning_rate": 6.900256036939389e-08, + "loss": 0.0141, + "step": 32958 + }, + { + "epoch": 3.9083362978773866, + "grad_norm": 0.7801701781331113, + "learning_rate": 6.882445616359923e-08, + "loss": 0.0423, + "step": 32959 + }, + { + "epoch": 3.9084548796395113, + "grad_norm": 0.638567192548044, + "learning_rate": 6.864658179266959e-08, + "loss": 0.0347, + "step": 32960 + }, + { + "epoch": 3.9085734614016365, + "grad_norm": 0.5343114857993443, + "learning_rate": 6.846893725824532e-08, + "loss": 0.0227, + "step": 32961 + }, + { + "epoch": 3.9086920431637613, + "grad_norm": 0.615990381434428, + "learning_rate": 6.82915225619668e-08, + "loss": 0.0285, + "step": 32962 + }, + { + "epoch": 3.9088106249258865, + "grad_norm": 0.5740717297435133, + "learning_rate": 6.811433770546604e-08, + "loss": 0.0275, + "step": 32963 + }, + { + "epoch": 3.9089292066880112, + "grad_norm": 0.5030386020633882, + "learning_rate": 6.793738269037508e-08, + "loss": 0.0271, + "step": 32964 + }, + { + "epoch": 3.9090477884501365, + "grad_norm": 0.6585761448686483, + "learning_rate": 6.776065751832872e-08, + "loss": 0.0251, + "step": 32965 + }, + { + "epoch": 3.909166370212261, + "grad_norm": 0.3331400030130475, + "learning_rate": 6.758416219095898e-08, + "loss": 0.0139, + "step": 32966 + }, + { + "epoch": 3.9092849519743864, + "grad_norm": 0.3975716560877251, + "learning_rate": 6.740789670988402e-08, + "loss": 0.0176, + "step": 32967 + }, + { + "epoch": 3.909403533736511, + "grad_norm": 0.6658183077930677, + "learning_rate": 6.723186107673585e-08, + "loss": 0.0281, + "step": 32968 + }, + { + "epoch": 3.9095221154986364, + "grad_norm": 0.5943754991882794, + "learning_rate": 6.705605529313541e-08, + "loss": 0.0217, + "step": 32969 + }, + { + "epoch": 3.909640697260761, + "grad_norm": 0.3198826967311972, + "learning_rate": 6.688047936070362e-08, + "loss": 0.0155, + "step": 32970 + }, + { + "epoch": 3.9097592790228863, + "grad_norm": 0.6031963327117228, + "learning_rate": 6.670513328106142e-08, + "loss": 0.0242, + "step": 32971 + }, + { + "epoch": 3.909877860785011, + "grad_norm": 0.4405241988868358, + "learning_rate": 6.653001705581863e-08, + "loss": 0.014, + "step": 32972 + }, + { + "epoch": 3.9099964425471363, + "grad_norm": 0.5089244925562553, + "learning_rate": 6.635513068659338e-08, + "loss": 0.0235, + "step": 32973 + }, + { + "epoch": 3.9101150243092615, + "grad_norm": 0.3604888435220218, + "learning_rate": 6.618047417500106e-08, + "loss": 0.0233, + "step": 32974 + }, + { + "epoch": 3.9102336060713863, + "grad_norm": 0.8421949696444124, + "learning_rate": 6.600604752264594e-08, + "loss": 0.0466, + "step": 32975 + }, + { + "epoch": 3.910352187833511, + "grad_norm": 0.37876976231293186, + "learning_rate": 6.583185073114062e-08, + "loss": 0.0146, + "step": 32976 + }, + { + "epoch": 3.910470769595636, + "grad_norm": 0.4579200565070535, + "learning_rate": 6.565788380209215e-08, + "loss": 0.0265, + "step": 32977 + }, + { + "epoch": 3.9105893513577614, + "grad_norm": 0.4775670397346715, + "learning_rate": 6.548414673709647e-08, + "loss": 0.023, + "step": 32978 + }, + { + "epoch": 3.910707933119886, + "grad_norm": 0.6579632655112039, + "learning_rate": 6.53106395377634e-08, + "loss": 0.0245, + "step": 32979 + }, + { + "epoch": 3.910826514882011, + "grad_norm": 0.4989610905025986, + "learning_rate": 6.513736220568889e-08, + "loss": 0.0271, + "step": 32980 + }, + { + "epoch": 3.910945096644136, + "grad_norm": 0.5983036049415464, + "learning_rate": 6.496431474246889e-08, + "loss": 0.0247, + "step": 32981 + }, + { + "epoch": 3.9110636784062613, + "grad_norm": 0.3461611207106446, + "learning_rate": 6.479149714970212e-08, + "loss": 0.0153, + "step": 32982 + }, + { + "epoch": 3.911182260168386, + "grad_norm": 0.609243471929688, + "learning_rate": 6.461890942897897e-08, + "loss": 0.0228, + "step": 32983 + }, + { + "epoch": 3.911300841930511, + "grad_norm": 0.31066156899094916, + "learning_rate": 6.444655158189261e-08, + "loss": 0.0129, + "step": 32984 + }, + { + "epoch": 3.911419423692636, + "grad_norm": 0.5298891058693376, + "learning_rate": 6.427442361003066e-08, + "loss": 0.0165, + "step": 32985 + }, + { + "epoch": 3.9115380054547613, + "grad_norm": 0.3044268728228278, + "learning_rate": 6.410252551498074e-08, + "loss": 0.0135, + "step": 32986 + }, + { + "epoch": 3.911656587216886, + "grad_norm": 0.4647043871438658, + "learning_rate": 6.39308572983277e-08, + "loss": 0.0213, + "step": 32987 + }, + { + "epoch": 3.911775168979011, + "grad_norm": 0.6552810568568124, + "learning_rate": 6.375941896165361e-08, + "loss": 0.0352, + "step": 32988 + }, + { + "epoch": 3.911893750741136, + "grad_norm": 0.35337127542404184, + "learning_rate": 6.358821050653774e-08, + "loss": 0.0124, + "step": 32989 + }, + { + "epoch": 3.912012332503261, + "grad_norm": 0.5243307093639258, + "learning_rate": 6.34172319345594e-08, + "loss": 0.0239, + "step": 32990 + }, + { + "epoch": 3.912130914265386, + "grad_norm": 0.6127113158375077, + "learning_rate": 6.32464832472951e-08, + "loss": 0.0277, + "step": 32991 + }, + { + "epoch": 3.9122494960275107, + "grad_norm": 0.4960606733995128, + "learning_rate": 6.307596444631858e-08, + "loss": 0.0183, + "step": 32992 + }, + { + "epoch": 3.912368077789636, + "grad_norm": 0.6240554060153949, + "learning_rate": 6.29056755332036e-08, + "loss": 0.0252, + "step": 32993 + }, + { + "epoch": 3.912486659551761, + "grad_norm": 0.8218259881123516, + "learning_rate": 6.273561650951554e-08, + "loss": 0.0482, + "step": 32994 + }, + { + "epoch": 3.912605241313886, + "grad_norm": 0.4267298067502847, + "learning_rate": 6.256578737682816e-08, + "loss": 0.0173, + "step": 32995 + }, + { + "epoch": 3.912723823076011, + "grad_norm": 0.6134715486534913, + "learning_rate": 6.239618813670411e-08, + "loss": 0.0271, + "step": 32996 + }, + { + "epoch": 3.912842404838136, + "grad_norm": 0.7618570342770326, + "learning_rate": 6.222681879070325e-08, + "loss": 0.0351, + "step": 32997 + }, + { + "epoch": 3.912960986600261, + "grad_norm": 0.9338991047229275, + "learning_rate": 6.205767934039375e-08, + "loss": 0.0334, + "step": 32998 + }, + { + "epoch": 3.913079568362386, + "grad_norm": 0.8289952074328788, + "learning_rate": 6.188876978732994e-08, + "loss": 0.0341, + "step": 32999 + }, + { + "epoch": 3.913198150124511, + "grad_norm": 0.33851184210983687, + "learning_rate": 6.17200901330689e-08, + "loss": 0.0155, + "step": 33000 + }, + { + "epoch": 3.9133167318866358, + "grad_norm": 0.5298907122565548, + "learning_rate": 6.15516403791705e-08, + "loss": 0.0165, + "step": 33001 + }, + { + "epoch": 3.913435313648761, + "grad_norm": 0.54309737451329, + "learning_rate": 6.138342052718349e-08, + "loss": 0.0244, + "step": 33002 + }, + { + "epoch": 3.9135538954108857, + "grad_norm": 0.41459818200713805, + "learning_rate": 6.121543057865664e-08, + "loss": 0.0156, + "step": 33003 + }, + { + "epoch": 3.913672477173011, + "grad_norm": 0.27709135037008514, + "learning_rate": 6.104767053514426e-08, + "loss": 0.0154, + "step": 33004 + }, + { + "epoch": 3.9137910589351357, + "grad_norm": 0.5787553175433322, + "learning_rate": 6.088014039818957e-08, + "loss": 0.0292, + "step": 33005 + }, + { + "epoch": 3.913909640697261, + "grad_norm": 0.7010322390032268, + "learning_rate": 6.071284016933853e-08, + "loss": 0.0332, + "step": 33006 + }, + { + "epoch": 3.9140282224593856, + "grad_norm": 0.45354049904178717, + "learning_rate": 6.05457698501316e-08, + "loss": 0.0204, + "step": 33007 + }, + { + "epoch": 3.914146804221511, + "grad_norm": 0.44352584636429543, + "learning_rate": 6.037892944211199e-08, + "loss": 0.0289, + "step": 33008 + }, + { + "epoch": 3.9142653859836356, + "grad_norm": 0.5367390444147271, + "learning_rate": 6.021231894681179e-08, + "loss": 0.0147, + "step": 33009 + }, + { + "epoch": 3.914383967745761, + "grad_norm": 0.568213932058313, + "learning_rate": 6.004593836577422e-08, + "loss": 0.0257, + "step": 33010 + }, + { + "epoch": 3.9145025495078856, + "grad_norm": 0.5246343010782624, + "learning_rate": 5.98797877005286e-08, + "loss": 0.0234, + "step": 33011 + }, + { + "epoch": 3.9146211312700108, + "grad_norm": 0.42576872552792205, + "learning_rate": 5.971386695260706e-08, + "loss": 0.0197, + "step": 33012 + }, + { + "epoch": 3.9147397130321355, + "grad_norm": 0.6774404771052593, + "learning_rate": 5.954817612354169e-08, + "loss": 0.0365, + "step": 33013 + }, + { + "epoch": 3.9148582947942607, + "grad_norm": 0.7740460751056242, + "learning_rate": 5.9382715214856275e-08, + "loss": 0.0412, + "step": 33014 + }, + { + "epoch": 3.9149768765563855, + "grad_norm": 0.5199037539637023, + "learning_rate": 5.921748422807738e-08, + "loss": 0.0174, + "step": 33015 + }, + { + "epoch": 3.9150954583185107, + "grad_norm": 0.4624829378298274, + "learning_rate": 5.9052483164731553e-08, + "loss": 0.0154, + "step": 33016 + }, + { + "epoch": 3.9152140400806354, + "grad_norm": 0.6148328187536695, + "learning_rate": 5.888771202633425e-08, + "loss": 0.031, + "step": 33017 + }, + { + "epoch": 3.9153326218427607, + "grad_norm": 0.6234809121273952, + "learning_rate": 5.872317081440926e-08, + "loss": 0.0252, + "step": 33018 + }, + { + "epoch": 3.9154512036048854, + "grad_norm": 0.47275339810232675, + "learning_rate": 5.855885953047202e-08, + "loss": 0.0203, + "step": 33019 + }, + { + "epoch": 3.9155697853670106, + "grad_norm": 0.5291180032123055, + "learning_rate": 5.839477817603523e-08, + "loss": 0.0157, + "step": 33020 + }, + { + "epoch": 3.9156883671291354, + "grad_norm": 0.3022432741736082, + "learning_rate": 5.823092675261432e-08, + "loss": 0.0115, + "step": 33021 + }, + { + "epoch": 3.9158069488912606, + "grad_norm": 0.4657298636259324, + "learning_rate": 5.806730526171922e-08, + "loss": 0.0172, + "step": 33022 + }, + { + "epoch": 3.9159255306533858, + "grad_norm": 0.6063817074818547, + "learning_rate": 5.790391370485704e-08, + "loss": 0.0318, + "step": 33023 + }, + { + "epoch": 3.9160441124155105, + "grad_norm": 0.5109135363732606, + "learning_rate": 5.774075208353491e-08, + "loss": 0.0241, + "step": 33024 + }, + { + "epoch": 3.9161626941776353, + "grad_norm": 0.3073761036938315, + "learning_rate": 5.757782039925441e-08, + "loss": 0.0141, + "step": 33025 + }, + { + "epoch": 3.9162812759397605, + "grad_norm": 0.5233649816789105, + "learning_rate": 5.741511865352267e-08, + "loss": 0.0208, + "step": 33026 + }, + { + "epoch": 3.9163998577018857, + "grad_norm": 0.49501540848749265, + "learning_rate": 5.7252646847838485e-08, + "loss": 0.0161, + "step": 33027 + }, + { + "epoch": 3.9165184394640105, + "grad_norm": 0.8442860188809693, + "learning_rate": 5.709040498369511e-08, + "loss": 0.045, + "step": 33028 + }, + { + "epoch": 3.916637021226135, + "grad_norm": 0.45596161472251984, + "learning_rate": 5.692839306259134e-08, + "loss": 0.0204, + "step": 33029 + }, + { + "epoch": 3.9167556029882604, + "grad_norm": 0.6384184507037882, + "learning_rate": 5.676661108602044e-08, + "loss": 0.0296, + "step": 33030 + }, + { + "epoch": 3.9168741847503856, + "grad_norm": 0.7016435239244908, + "learning_rate": 5.660505905547564e-08, + "loss": 0.0305, + "step": 33031 + }, + { + "epoch": 3.9169927665125104, + "grad_norm": 0.7163570012146104, + "learning_rate": 5.6443736972444646e-08, + "loss": 0.0337, + "step": 33032 + }, + { + "epoch": 3.917111348274635, + "grad_norm": 0.3894546107277461, + "learning_rate": 5.628264483841239e-08, + "loss": 0.0172, + "step": 33033 + }, + { + "epoch": 3.9172299300367603, + "grad_norm": 0.42144793233674815, + "learning_rate": 5.6121782654866564e-08, + "loss": 0.0113, + "step": 33034 + }, + { + "epoch": 3.9173485117988855, + "grad_norm": 0.4655561589540687, + "learning_rate": 5.5961150423292085e-08, + "loss": 0.0171, + "step": 33035 + }, + { + "epoch": 3.9174670935610103, + "grad_norm": 0.4586725749332586, + "learning_rate": 5.580074814516556e-08, + "loss": 0.0133, + "step": 33036 + }, + { + "epoch": 3.917585675323135, + "grad_norm": 0.41976887509044497, + "learning_rate": 5.564057582196636e-08, + "loss": 0.0196, + "step": 33037 + }, + { + "epoch": 3.9177042570852603, + "grad_norm": 0.6476123937913421, + "learning_rate": 5.5480633455176624e-08, + "loss": 0.0352, + "step": 33038 + }, + { + "epoch": 3.9178228388473855, + "grad_norm": 0.5098838499199789, + "learning_rate": 5.532092104626185e-08, + "loss": 0.0243, + "step": 33039 + }, + { + "epoch": 3.9179414206095102, + "grad_norm": 0.5176971796482911, + "learning_rate": 5.516143859670142e-08, + "loss": 0.0244, + "step": 33040 + }, + { + "epoch": 3.918060002371635, + "grad_norm": 0.6249235425650302, + "learning_rate": 5.500218610796082e-08, + "loss": 0.0191, + "step": 33041 + }, + { + "epoch": 3.91817858413376, + "grad_norm": 0.7414085517637462, + "learning_rate": 5.4843163581513866e-08, + "loss": 0.0256, + "step": 33042 + }, + { + "epoch": 3.9182971658958854, + "grad_norm": 0.4406364333894077, + "learning_rate": 5.4684371018820514e-08, + "loss": 0.0187, + "step": 33043 + }, + { + "epoch": 3.91841574765801, + "grad_norm": 0.5575786983776004, + "learning_rate": 5.452580842134902e-08, + "loss": 0.035, + "step": 33044 + }, + { + "epoch": 3.918534329420135, + "grad_norm": 0.6746227744512483, + "learning_rate": 5.4367475790559344e-08, + "loss": 0.022, + "step": 33045 + }, + { + "epoch": 3.91865291118226, + "grad_norm": 0.5997787448280097, + "learning_rate": 5.4209373127908635e-08, + "loss": 0.0246, + "step": 33046 + }, + { + "epoch": 3.9187714929443853, + "grad_norm": 0.30636786611899997, + "learning_rate": 5.405150043485963e-08, + "loss": 0.0128, + "step": 33047 + }, + { + "epoch": 3.91889007470651, + "grad_norm": 0.5389069366976549, + "learning_rate": 5.3893857712863924e-08, + "loss": 0.0199, + "step": 33048 + }, + { + "epoch": 3.9190086564686353, + "grad_norm": 0.5271612477935981, + "learning_rate": 5.373644496337593e-08, + "loss": 0.0308, + "step": 33049 + }, + { + "epoch": 3.91912723823076, + "grad_norm": 0.7005190947108599, + "learning_rate": 5.3579262187847255e-08, + "loss": 0.0287, + "step": 33050 + }, + { + "epoch": 3.9192458199928852, + "grad_norm": 0.5616150000492885, + "learning_rate": 5.342230938772674e-08, + "loss": 0.0269, + "step": 33051 + }, + { + "epoch": 3.91936440175501, + "grad_norm": 0.5145694922688626, + "learning_rate": 5.3265586564460456e-08, + "loss": 0.0208, + "step": 33052 + }, + { + "epoch": 3.919482983517135, + "grad_norm": 0.521885830734472, + "learning_rate": 5.3109093719494464e-08, + "loss": 0.0197, + "step": 33053 + }, + { + "epoch": 3.91960156527926, + "grad_norm": 0.7018333265089138, + "learning_rate": 5.295283085426927e-08, + "loss": 0.0281, + "step": 33054 + }, + { + "epoch": 3.919720147041385, + "grad_norm": 0.6563639111464061, + "learning_rate": 5.279679797022818e-08, + "loss": 0.0228, + "step": 33055 + }, + { + "epoch": 3.91983872880351, + "grad_norm": 0.6434116737410002, + "learning_rate": 5.264099506880616e-08, + "loss": 0.0337, + "step": 33056 + }, + { + "epoch": 3.919957310565635, + "grad_norm": 0.8529126073560828, + "learning_rate": 5.24854221514437e-08, + "loss": 0.0377, + "step": 33057 + }, + { + "epoch": 3.92007589232776, + "grad_norm": 0.4774569633002864, + "learning_rate": 5.2330079219573004e-08, + "loss": 0.0195, + "step": 33058 + }, + { + "epoch": 3.920194474089885, + "grad_norm": 0.36863473623174137, + "learning_rate": 5.217496627462626e-08, + "loss": 0.0187, + "step": 33059 + }, + { + "epoch": 3.92031305585201, + "grad_norm": 0.9060294105479724, + "learning_rate": 5.2020083318032875e-08, + "loss": 0.0227, + "step": 33060 + }, + { + "epoch": 3.920431637614135, + "grad_norm": 0.4924065087439102, + "learning_rate": 5.186543035121949e-08, + "loss": 0.0247, + "step": 33061 + }, + { + "epoch": 3.92055021937626, + "grad_norm": 0.6196460055036818, + "learning_rate": 5.1711007375615514e-08, + "loss": 0.0181, + "step": 33062 + }, + { + "epoch": 3.920668801138385, + "grad_norm": 0.5518588333603022, + "learning_rate": 5.155681439264204e-08, + "loss": 0.0194, + "step": 33063 + }, + { + "epoch": 3.9207873829005098, + "grad_norm": 0.4196004198953665, + "learning_rate": 5.140285140372292e-08, + "loss": 0.018, + "step": 33064 + }, + { + "epoch": 3.920905964662635, + "grad_norm": 0.7333077094798433, + "learning_rate": 5.124911841027091e-08, + "loss": 0.0574, + "step": 33065 + }, + { + "epoch": 3.9210245464247597, + "grad_norm": 0.5469766726345228, + "learning_rate": 5.109561541371266e-08, + "loss": 0.0247, + "step": 33066 + }, + { + "epoch": 3.921143128186885, + "grad_norm": 0.5762327811745762, + "learning_rate": 5.094234241545537e-08, + "loss": 0.0214, + "step": 33067 + }, + { + "epoch": 3.9212617099490097, + "grad_norm": 0.4262736839153872, + "learning_rate": 5.078929941691457e-08, + "loss": 0.0194, + "step": 33068 + }, + { + "epoch": 3.921380291711135, + "grad_norm": 0.7585186058562639, + "learning_rate": 5.063648641950303e-08, + "loss": 0.0426, + "step": 33069 + }, + { + "epoch": 3.9214988734732596, + "grad_norm": 0.44931001453646896, + "learning_rate": 5.048390342462794e-08, + "loss": 0.025, + "step": 33070 + }, + { + "epoch": 3.921617455235385, + "grad_norm": 0.6521546841017399, + "learning_rate": 5.0331550433696525e-08, + "loss": 0.0345, + "step": 33071 + }, + { + "epoch": 3.92173603699751, + "grad_norm": 0.477448628083289, + "learning_rate": 5.0179427448113215e-08, + "loss": 0.0198, + "step": 33072 + }, + { + "epoch": 3.921854618759635, + "grad_norm": 0.7874420827649931, + "learning_rate": 5.002753446927966e-08, + "loss": 0.0285, + "step": 33073 + }, + { + "epoch": 3.9219732005217596, + "grad_norm": 0.45961007875840815, + "learning_rate": 4.987587149859751e-08, + "loss": 0.0162, + "step": 33074 + }, + { + "epoch": 3.9220917822838848, + "grad_norm": 0.505388721512239, + "learning_rate": 4.972443853746289e-08, + "loss": 0.0234, + "step": 33075 + }, + { + "epoch": 3.92221036404601, + "grad_norm": 0.3108187798821018, + "learning_rate": 4.957323558727189e-08, + "loss": 0.0098, + "step": 33076 + }, + { + "epoch": 3.9223289458081347, + "grad_norm": 0.4780725380399818, + "learning_rate": 4.9422262649420625e-08, + "loss": 0.0213, + "step": 33077 + }, + { + "epoch": 3.9224475275702595, + "grad_norm": 0.3568838776075792, + "learning_rate": 4.9271519725299643e-08, + "loss": 0.0124, + "step": 33078 + }, + { + "epoch": 3.9225661093323847, + "grad_norm": 0.33366859585047653, + "learning_rate": 4.91210068162995e-08, + "loss": 0.0155, + "step": 33079 + }, + { + "epoch": 3.92268469109451, + "grad_norm": 0.4113984607022125, + "learning_rate": 4.89707239238052e-08, + "loss": 0.0194, + "step": 33080 + }, + { + "epoch": 3.9228032728566347, + "grad_norm": 0.5028827417762474, + "learning_rate": 4.8820671049204516e-08, + "loss": 0.0188, + "step": 33081 + }, + { + "epoch": 3.9229218546187594, + "grad_norm": 0.5877262686492479, + "learning_rate": 4.867084819387968e-08, + "loss": 0.0277, + "step": 33082 + }, + { + "epoch": 3.9230404363808846, + "grad_norm": 0.7963123886228259, + "learning_rate": 4.852125535921292e-08, + "loss": 0.0471, + "step": 33083 + }, + { + "epoch": 3.92315901814301, + "grad_norm": 0.3758374282047978, + "learning_rate": 4.837189254658092e-08, + "loss": 0.0151, + "step": 33084 + }, + { + "epoch": 3.9232775999051346, + "grad_norm": 0.4317499916739273, + "learning_rate": 4.82227597573659e-08, + "loss": 0.0219, + "step": 33085 + }, + { + "epoch": 3.9233961816672593, + "grad_norm": 0.9642844808740944, + "learning_rate": 4.8073856992936203e-08, + "loss": 0.0467, + "step": 33086 + }, + { + "epoch": 3.9235147634293845, + "grad_norm": 0.39848350380435915, + "learning_rate": 4.792518425466852e-08, + "loss": 0.0158, + "step": 33087 + }, + { + "epoch": 3.9236333451915097, + "grad_norm": 1.0079605291189995, + "learning_rate": 4.777674154393119e-08, + "loss": 0.0377, + "step": 33088 + }, + { + "epoch": 3.9237519269536345, + "grad_norm": 0.37837960293550066, + "learning_rate": 4.762852886209535e-08, + "loss": 0.0167, + "step": 33089 + }, + { + "epoch": 3.9238705087157593, + "grad_norm": 0.6375981734358109, + "learning_rate": 4.748054621052655e-08, + "loss": 0.0235, + "step": 33090 + }, + { + "epoch": 3.9239890904778845, + "grad_norm": 0.7799259297241589, + "learning_rate": 4.733279359058762e-08, + "loss": 0.0319, + "step": 33091 + }, + { + "epoch": 3.9241076722400097, + "grad_norm": 0.7676675618908122, + "learning_rate": 4.718527100364134e-08, + "loss": 0.0337, + "step": 33092 + }, + { + "epoch": 3.9242262540021344, + "grad_norm": 0.7007096411438644, + "learning_rate": 4.703797845104774e-08, + "loss": 0.03, + "step": 33093 + }, + { + "epoch": 3.924344835764259, + "grad_norm": 0.5762635043344015, + "learning_rate": 4.689091593416406e-08, + "loss": 0.0224, + "step": 33094 + }, + { + "epoch": 3.9244634175263844, + "grad_norm": 0.5435145034750177, + "learning_rate": 4.6744083454344775e-08, + "loss": 0.0206, + "step": 33095 + }, + { + "epoch": 3.9245819992885096, + "grad_norm": 0.3934901676837269, + "learning_rate": 4.6597481012947145e-08, + "loss": 0.0157, + "step": 33096 + }, + { + "epoch": 3.9247005810506344, + "grad_norm": 0.3462056194362654, + "learning_rate": 4.645110861132285e-08, + "loss": 0.0106, + "step": 33097 + }, + { + "epoch": 3.9248191628127596, + "grad_norm": 0.4266725728929685, + "learning_rate": 4.6304966250818035e-08, + "loss": 0.0242, + "step": 33098 + }, + { + "epoch": 3.9249377445748843, + "grad_norm": 0.6381850341844583, + "learning_rate": 4.6159053932778864e-08, + "loss": 0.0288, + "step": 33099 + }, + { + "epoch": 3.9250563263370095, + "grad_norm": 0.5430919230920782, + "learning_rate": 4.601337165855424e-08, + "loss": 0.0284, + "step": 33100 + }, + { + "epoch": 3.9251749080991343, + "grad_norm": 0.7194125584293024, + "learning_rate": 4.586791942948754e-08, + "loss": 0.0342, + "step": 33101 + }, + { + "epoch": 3.9252934898612595, + "grad_norm": 0.6270654073822307, + "learning_rate": 4.572269724691658e-08, + "loss": 0.0248, + "step": 33102 + }, + { + "epoch": 3.9254120716233842, + "grad_norm": 0.432857286158678, + "learning_rate": 4.557770511217918e-08, + "loss": 0.0205, + "step": 33103 + }, + { + "epoch": 3.9255306533855094, + "grad_norm": 0.6419981750018985, + "learning_rate": 4.543294302661594e-08, + "loss": 0.0262, + "step": 33104 + }, + { + "epoch": 3.925649235147634, + "grad_norm": 0.7567140976746265, + "learning_rate": 4.5288410991561894e-08, + "loss": 0.0372, + "step": 33105 + }, + { + "epoch": 3.9257678169097594, + "grad_norm": 0.8283006941233774, + "learning_rate": 4.514410900834376e-08, + "loss": 0.0396, + "step": 33106 + }, + { + "epoch": 3.925886398671884, + "grad_norm": 0.5934859873797844, + "learning_rate": 4.5000037078296584e-08, + "loss": 0.0281, + "step": 33107 + }, + { + "epoch": 3.9260049804340094, + "grad_norm": 0.42448996525031696, + "learning_rate": 4.4856195202747084e-08, + "loss": 0.019, + "step": 33108 + }, + { + "epoch": 3.926123562196134, + "grad_norm": 0.41466429825121087, + "learning_rate": 4.471258338302198e-08, + "loss": 0.0177, + "step": 33109 + }, + { + "epoch": 3.9262421439582593, + "grad_norm": 0.9533245004341644, + "learning_rate": 4.45692016204452e-08, + "loss": 0.0515, + "step": 33110 + }, + { + "epoch": 3.926360725720384, + "grad_norm": 0.4488598396655786, + "learning_rate": 4.442604991633792e-08, + "loss": 0.0146, + "step": 33111 + }, + { + "epoch": 3.9264793074825093, + "grad_norm": 0.5498208770290471, + "learning_rate": 4.4283128272018524e-08, + "loss": 0.0295, + "step": 33112 + }, + { + "epoch": 3.926597889244634, + "grad_norm": 0.898604253435429, + "learning_rate": 4.414043668880818e-08, + "loss": 0.0323, + "step": 33113 + }, + { + "epoch": 3.9267164710067592, + "grad_norm": 0.30873680916875024, + "learning_rate": 4.3997975168016956e-08, + "loss": 0.011, + "step": 33114 + }, + { + "epoch": 3.926835052768884, + "grad_norm": 0.5348067234472883, + "learning_rate": 4.385574371096324e-08, + "loss": 0.0237, + "step": 33115 + }, + { + "epoch": 3.926953634531009, + "grad_norm": 0.70168107970285, + "learning_rate": 4.371374231895431e-08, + "loss": 0.0289, + "step": 33116 + }, + { + "epoch": 3.927072216293134, + "grad_norm": 0.35371884537521286, + "learning_rate": 4.3571970993303015e-08, + "loss": 0.0144, + "step": 33117 + }, + { + "epoch": 3.927190798055259, + "grad_norm": 0.5624580965141538, + "learning_rate": 4.3430429735316634e-08, + "loss": 0.0267, + "step": 33118 + }, + { + "epoch": 3.927309379817384, + "grad_norm": 0.3965053636727945, + "learning_rate": 4.328911854629414e-08, + "loss": 0.0097, + "step": 33119 + }, + { + "epoch": 3.927427961579509, + "grad_norm": 0.46511595900883246, + "learning_rate": 4.314803742754558e-08, + "loss": 0.0205, + "step": 33120 + }, + { + "epoch": 3.9275465433416343, + "grad_norm": 0.5570656393182959, + "learning_rate": 4.300718638036438e-08, + "loss": 0.0207, + "step": 33121 + }, + { + "epoch": 3.927665125103759, + "grad_norm": 0.6636079466716717, + "learning_rate": 4.286656540605505e-08, + "loss": 0.034, + "step": 33122 + }, + { + "epoch": 3.927783706865884, + "grad_norm": 0.8799056143492018, + "learning_rate": 4.2726174505910986e-08, + "loss": 0.0409, + "step": 33123 + }, + { + "epoch": 3.927902288628009, + "grad_norm": 0.6181368863590084, + "learning_rate": 4.258601368122839e-08, + "loss": 0.0306, + "step": 33124 + }, + { + "epoch": 3.9280208703901343, + "grad_norm": 0.450699844113969, + "learning_rate": 4.244608293329511e-08, + "loss": 0.0287, + "step": 33125 + }, + { + "epoch": 3.928139452152259, + "grad_norm": 0.377954395457792, + "learning_rate": 4.230638226340455e-08, + "loss": 0.0158, + "step": 33126 + }, + { + "epoch": 3.9282580339143838, + "grad_norm": 0.3398317873758562, + "learning_rate": 4.216691167284459e-08, + "loss": 0.0139, + "step": 33127 + }, + { + "epoch": 3.928376615676509, + "grad_norm": 0.6318808930145723, + "learning_rate": 4.202767116290029e-08, + "loss": 0.0348, + "step": 33128 + }, + { + "epoch": 3.928495197438634, + "grad_norm": 0.6722589274244242, + "learning_rate": 4.188866073485676e-08, + "loss": 0.0364, + "step": 33129 + }, + { + "epoch": 3.928613779200759, + "grad_norm": 0.7765680709835717, + "learning_rate": 4.1749880389990726e-08, + "loss": 0.0309, + "step": 33130 + }, + { + "epoch": 3.9287323609628837, + "grad_norm": 0.5440263888052566, + "learning_rate": 4.1611330129590066e-08, + "loss": 0.0316, + "step": 33131 + }, + { + "epoch": 3.928850942725009, + "grad_norm": 0.8957897067697977, + "learning_rate": 4.14730099549232e-08, + "loss": 0.0342, + "step": 33132 + }, + { + "epoch": 3.928969524487134, + "grad_norm": 0.7105814482141289, + "learning_rate": 4.133491986726967e-08, + "loss": 0.027, + "step": 33133 + }, + { + "epoch": 3.929088106249259, + "grad_norm": 0.475778516535104, + "learning_rate": 4.119705986790068e-08, + "loss": 0.0279, + "step": 33134 + }, + { + "epoch": 3.9292066880113836, + "grad_norm": 0.8298778691106624, + "learning_rate": 4.10594299580902e-08, + "loss": 0.0427, + "step": 33135 + }, + { + "epoch": 3.929325269773509, + "grad_norm": 0.5808850713305251, + "learning_rate": 4.09220301391039e-08, + "loss": 0.0272, + "step": 33136 + }, + { + "epoch": 3.929443851535634, + "grad_norm": 0.5079665699685264, + "learning_rate": 4.0784860412212964e-08, + "loss": 0.0207, + "step": 33137 + }, + { + "epoch": 3.929562433297759, + "grad_norm": 0.5309765293603176, + "learning_rate": 4.0647920778674743e-08, + "loss": 0.0207, + "step": 33138 + }, + { + "epoch": 3.9296810150598835, + "grad_norm": 0.43936324454081127, + "learning_rate": 4.0511211239757654e-08, + "loss": 0.0213, + "step": 33139 + }, + { + "epoch": 3.9297995968220087, + "grad_norm": 0.48872366741690554, + "learning_rate": 4.0374731796716245e-08, + "loss": 0.0155, + "step": 33140 + }, + { + "epoch": 3.929918178584134, + "grad_norm": 0.3603992114630368, + "learning_rate": 4.023848245081618e-08, + "loss": 0.0139, + "step": 33141 + }, + { + "epoch": 3.9300367603462587, + "grad_norm": 0.43415684891731093, + "learning_rate": 4.010246320330924e-08, + "loss": 0.0175, + "step": 33142 + }, + { + "epoch": 3.9301553421083835, + "grad_norm": 0.45155047569634915, + "learning_rate": 3.9966674055447185e-08, + "loss": 0.0196, + "step": 33143 + }, + { + "epoch": 3.9302739238705087, + "grad_norm": 1.0040246016267995, + "learning_rate": 3.983111500848735e-08, + "loss": 0.0568, + "step": 33144 + }, + { + "epoch": 3.930392505632634, + "grad_norm": 0.6125714224414741, + "learning_rate": 3.9695786063675965e-08, + "loss": 0.0306, + "step": 33145 + }, + { + "epoch": 3.9305110873947586, + "grad_norm": 0.50315825378595, + "learning_rate": 3.956068722225925e-08, + "loss": 0.025, + "step": 33146 + }, + { + "epoch": 3.930629669156884, + "grad_norm": 0.5753798411477716, + "learning_rate": 3.942581848548621e-08, + "loss": 0.0226, + "step": 33147 + }, + { + "epoch": 3.9307482509190086, + "grad_norm": 0.508628078965429, + "learning_rate": 3.929117985459751e-08, + "loss": 0.0131, + "step": 33148 + }, + { + "epoch": 3.930866832681134, + "grad_norm": 0.8783409294940893, + "learning_rate": 3.915677133083662e-08, + "loss": 0.0353, + "step": 33149 + }, + { + "epoch": 3.9309854144432586, + "grad_norm": 0.5099467611711603, + "learning_rate": 3.902259291543864e-08, + "loss": 0.021, + "step": 33150 + }, + { + "epoch": 3.9311039962053838, + "grad_norm": 0.5908223496661885, + "learning_rate": 3.888864460964425e-08, + "loss": 0.0271, + "step": 33151 + }, + { + "epoch": 3.9312225779675085, + "grad_norm": 0.6108463436131902, + "learning_rate": 3.8754926414688585e-08, + "loss": 0.0248, + "step": 33152 + }, + { + "epoch": 3.9313411597296337, + "grad_norm": 0.4772747587491697, + "learning_rate": 3.862143833180121e-08, + "loss": 0.0257, + "step": 33153 + }, + { + "epoch": 3.9314597414917585, + "grad_norm": 0.8451223038279385, + "learning_rate": 3.848818036221724e-08, + "loss": 0.0389, + "step": 33154 + }, + { + "epoch": 3.9315783232538837, + "grad_norm": 0.9885428711235217, + "learning_rate": 3.835515250716071e-08, + "loss": 0.0363, + "step": 33155 + }, + { + "epoch": 3.9316969050160084, + "grad_norm": 0.6691829432019198, + "learning_rate": 3.8222354767858405e-08, + "loss": 0.0345, + "step": 33156 + }, + { + "epoch": 3.9318154867781336, + "grad_norm": 0.48614569565631255, + "learning_rate": 3.808978714553713e-08, + "loss": 0.0212, + "step": 33157 + }, + { + "epoch": 3.9319340685402584, + "grad_norm": 0.7046640849096598, + "learning_rate": 3.7957449641418116e-08, + "loss": 0.0239, + "step": 33158 + }, + { + "epoch": 3.9320526503023836, + "grad_norm": 0.49734972294694013, + "learning_rate": 3.782534225671985e-08, + "loss": 0.0211, + "step": 33159 + }, + { + "epoch": 3.9321712320645084, + "grad_norm": 0.6250097939477269, + "learning_rate": 3.76934649926608e-08, + "loss": 0.0253, + "step": 33160 + }, + { + "epoch": 3.9322898138266336, + "grad_norm": 0.7244424407115737, + "learning_rate": 3.756181785045943e-08, + "loss": 0.0322, + "step": 33161 + }, + { + "epoch": 3.9324083955887583, + "grad_norm": 0.6234903433678087, + "learning_rate": 3.743040083132587e-08, + "loss": 0.0315, + "step": 33162 + }, + { + "epoch": 3.9325269773508835, + "grad_norm": 0.596452013742019, + "learning_rate": 3.729921393647306e-08, + "loss": 0.0264, + "step": 33163 + }, + { + "epoch": 3.9326455591130083, + "grad_norm": 0.6640776938425151, + "learning_rate": 3.716825716711114e-08, + "loss": 0.0294, + "step": 33164 + }, + { + "epoch": 3.9327641408751335, + "grad_norm": 0.4346146813307494, + "learning_rate": 3.70375305244447e-08, + "loss": 0.0239, + "step": 33165 + }, + { + "epoch": 3.9328827226372582, + "grad_norm": 0.4611357728695404, + "learning_rate": 3.6907034009681095e-08, + "loss": 0.0206, + "step": 33166 + }, + { + "epoch": 3.9330013043993834, + "grad_norm": 0.6019266500127408, + "learning_rate": 3.6776767624022157e-08, + "loss": 0.0238, + "step": 33167 + }, + { + "epoch": 3.933119886161508, + "grad_norm": 0.5603044402884042, + "learning_rate": 3.6646731368672474e-08, + "loss": 0.0176, + "step": 33168 + }, + { + "epoch": 3.9332384679236334, + "grad_norm": 0.49162346613557406, + "learning_rate": 3.651692524482553e-08, + "loss": 0.0184, + "step": 33169 + }, + { + "epoch": 3.933357049685758, + "grad_norm": 0.4850518289455356, + "learning_rate": 3.638734925368037e-08, + "loss": 0.0174, + "step": 33170 + }, + { + "epoch": 3.9334756314478834, + "grad_norm": 0.42144445058179075, + "learning_rate": 3.625800339643048e-08, + "loss": 0.0194, + "step": 33171 + }, + { + "epoch": 3.933594213210008, + "grad_norm": 0.3621577943795314, + "learning_rate": 3.612888767427214e-08, + "loss": 0.0196, + "step": 33172 + }, + { + "epoch": 3.9337127949721333, + "grad_norm": 0.3713833232364303, + "learning_rate": 3.6000002088390494e-08, + "loss": 0.0179, + "step": 33173 + }, + { + "epoch": 3.9338313767342585, + "grad_norm": 0.6192841690748263, + "learning_rate": 3.587134663997627e-08, + "loss": 0.0317, + "step": 33174 + }, + { + "epoch": 3.9339499584963833, + "grad_norm": 0.547089258566546, + "learning_rate": 3.574292133021462e-08, + "loss": 0.0349, + "step": 33175 + }, + { + "epoch": 3.934068540258508, + "grad_norm": 0.49308723945741006, + "learning_rate": 3.561472616029071e-08, + "loss": 0.0138, + "step": 33176 + }, + { + "epoch": 3.9341871220206333, + "grad_norm": 0.42595721377571655, + "learning_rate": 3.548676113138694e-08, + "loss": 0.0145, + "step": 33177 + }, + { + "epoch": 3.9343057037827585, + "grad_norm": 0.29605988085380397, + "learning_rate": 3.5359026244680125e-08, + "loss": 0.012, + "step": 33178 + }, + { + "epoch": 3.934424285544883, + "grad_norm": 0.96150257037187, + "learning_rate": 3.52315215013499e-08, + "loss": 0.0358, + "step": 33179 + }, + { + "epoch": 3.934542867307008, + "grad_norm": 0.36970466754519, + "learning_rate": 3.5104246902570304e-08, + "loss": 0.0159, + "step": 33180 + }, + { + "epoch": 3.934661449069133, + "grad_norm": 0.4131746450955399, + "learning_rate": 3.497720244951819e-08, + "loss": 0.0203, + "step": 33181 + }, + { + "epoch": 3.9347800308312584, + "grad_norm": 0.6275786783352804, + "learning_rate": 3.485038814335928e-08, + "loss": 0.0291, + "step": 33182 + }, + { + "epoch": 3.934898612593383, + "grad_norm": 0.7016730013552248, + "learning_rate": 3.4723803985267646e-08, + "loss": 0.0289, + "step": 33183 + }, + { + "epoch": 3.935017194355508, + "grad_norm": 0.3518111509736676, + "learning_rate": 3.4597449976406235e-08, + "loss": 0.0165, + "step": 33184 + }, + { + "epoch": 3.935135776117633, + "grad_norm": 0.4934906830594519, + "learning_rate": 3.447132611794357e-08, + "loss": 0.0214, + "step": 33185 + }, + { + "epoch": 3.9352543578797583, + "grad_norm": 0.5317027999226017, + "learning_rate": 3.4345432411039823e-08, + "loss": 0.025, + "step": 33186 + }, + { + "epoch": 3.935372939641883, + "grad_norm": 0.4185212045033872, + "learning_rate": 3.4219768856855185e-08, + "loss": 0.0168, + "step": 33187 + }, + { + "epoch": 3.935491521404008, + "grad_norm": 1.0901101654485073, + "learning_rate": 3.409433545654983e-08, + "loss": 0.0644, + "step": 33188 + }, + { + "epoch": 3.935610103166133, + "grad_norm": 0.4950595269162272, + "learning_rate": 3.39691322112784e-08, + "loss": 0.0195, + "step": 33189 + }, + { + "epoch": 3.9357286849282582, + "grad_norm": 0.380873052551322, + "learning_rate": 3.3844159122198296e-08, + "loss": 0.016, + "step": 33190 + }, + { + "epoch": 3.935847266690383, + "grad_norm": 0.6741907057843779, + "learning_rate": 3.37194161904586e-08, + "loss": 0.0348, + "step": 33191 + }, + { + "epoch": 3.9359658484525077, + "grad_norm": 0.4640308589084602, + "learning_rate": 3.359490341721116e-08, + "loss": 0.0166, + "step": 33192 + }, + { + "epoch": 3.936084430214633, + "grad_norm": 0.892941326586008, + "learning_rate": 3.3470620803602306e-08, + "loss": 0.0355, + "step": 33193 + }, + { + "epoch": 3.936203011976758, + "grad_norm": 0.6496688166227542, + "learning_rate": 3.3346568350778315e-08, + "loss": 0.035, + "step": 33194 + }, + { + "epoch": 3.936321593738883, + "grad_norm": 0.5356287982132373, + "learning_rate": 3.322274605988274e-08, + "loss": 0.0315, + "step": 33195 + }, + { + "epoch": 3.936440175501008, + "grad_norm": 0.5782738800398083, + "learning_rate": 3.309915393205632e-08, + "loss": 0.0301, + "step": 33196 + }, + { + "epoch": 3.936558757263133, + "grad_norm": 0.5325688129813458, + "learning_rate": 3.297579196843981e-08, + "loss": 0.0302, + "step": 33197 + }, + { + "epoch": 3.936677339025258, + "grad_norm": 0.42190219053246064, + "learning_rate": 3.285266017017119e-08, + "loss": 0.015, + "step": 33198 + }, + { + "epoch": 3.936795920787383, + "grad_norm": 0.42001835036684787, + "learning_rate": 3.2729758538382894e-08, + "loss": 0.0264, + "step": 33199 + }, + { + "epoch": 3.936914502549508, + "grad_norm": 0.4094001926428675, + "learning_rate": 3.2607087074207346e-08, + "loss": 0.0223, + "step": 33200 + }, + { + "epoch": 3.937033084311633, + "grad_norm": 0.6784604467326373, + "learning_rate": 3.2484645778779746e-08, + "loss": 0.0275, + "step": 33201 + }, + { + "epoch": 3.937151666073758, + "grad_norm": 0.3585719687994774, + "learning_rate": 3.2362434653226967e-08, + "loss": 0.0129, + "step": 33202 + }, + { + "epoch": 3.9372702478358828, + "grad_norm": 0.6700547404098471, + "learning_rate": 3.22404536986759e-08, + "loss": 0.0223, + "step": 33203 + }, + { + "epoch": 3.937388829598008, + "grad_norm": 0.5115528287731742, + "learning_rate": 3.2118702916247856e-08, + "loss": 0.0248, + "step": 33204 + }, + { + "epoch": 3.9375074113601327, + "grad_norm": 0.5882074965976071, + "learning_rate": 3.1997182307069716e-08, + "loss": 0.0239, + "step": 33205 + }, + { + "epoch": 3.937625993122258, + "grad_norm": 0.6998102788630138, + "learning_rate": 3.187589187226003e-08, + "loss": 0.0262, + "step": 33206 + }, + { + "epoch": 3.9377445748843827, + "grad_norm": 0.6962159479292769, + "learning_rate": 3.175483161293458e-08, + "loss": 0.0319, + "step": 33207 + }, + { + "epoch": 3.937863156646508, + "grad_norm": 0.5348124449723833, + "learning_rate": 3.16340015302119e-08, + "loss": 0.0315, + "step": 33208 + }, + { + "epoch": 3.9379817384086326, + "grad_norm": 0.545807237578722, + "learning_rate": 3.1513401625207775e-08, + "loss": 0.0192, + "step": 33209 + }, + { + "epoch": 3.938100320170758, + "grad_norm": 0.8783873081406167, + "learning_rate": 3.139303189902965e-08, + "loss": 0.0314, + "step": 33210 + }, + { + "epoch": 3.9382189019328826, + "grad_norm": 0.23749680965276923, + "learning_rate": 3.1272892352790516e-08, + "loss": 0.0116, + "step": 33211 + }, + { + "epoch": 3.938337483695008, + "grad_norm": 0.48222531178420325, + "learning_rate": 3.1152982987595056e-08, + "loss": 0.0181, + "step": 33212 + }, + { + "epoch": 3.9384560654571326, + "grad_norm": 0.4759686117970928, + "learning_rate": 3.1033303804550716e-08, + "loss": 0.0221, + "step": 33213 + }, + { + "epoch": 3.9385746472192578, + "grad_norm": 0.5066721080479306, + "learning_rate": 3.0913854804762166e-08, + "loss": 0.0234, + "step": 33214 + }, + { + "epoch": 3.9386932289813825, + "grad_norm": 0.3748370218456643, + "learning_rate": 3.0794635989325746e-08, + "loss": 0.0189, + "step": 33215 + }, + { + "epoch": 3.9388118107435077, + "grad_norm": 0.46791119341937715, + "learning_rate": 3.067564735934614e-08, + "loss": 0.0256, + "step": 33216 + }, + { + "epoch": 3.9389303925056325, + "grad_norm": 1.0329490327700166, + "learning_rate": 3.055688891591691e-08, + "loss": 0.0433, + "step": 33217 + }, + { + "epoch": 3.9390489742677577, + "grad_norm": 0.5154009691289044, + "learning_rate": 3.0438360660131614e-08, + "loss": 0.0246, + "step": 33218 + }, + { + "epoch": 3.9391675560298824, + "grad_norm": 0.38167205969045503, + "learning_rate": 3.0320062593086616e-08, + "loss": 0.0183, + "step": 33219 + }, + { + "epoch": 3.9392861377920076, + "grad_norm": 0.49190438906332634, + "learning_rate": 3.020199471587271e-08, + "loss": 0.0169, + "step": 33220 + }, + { + "epoch": 3.9394047195541324, + "grad_norm": 0.5742385634697146, + "learning_rate": 3.008415702957512e-08, + "loss": 0.024, + "step": 33221 + }, + { + "epoch": 3.9395233013162576, + "grad_norm": 0.31777270664336815, + "learning_rate": 2.996654953527911e-08, + "loss": 0.013, + "step": 33222 + }, + { + "epoch": 3.939641883078383, + "grad_norm": 0.41715105254759793, + "learning_rate": 2.9849172234072685e-08, + "loss": 0.0194, + "step": 33223 + }, + { + "epoch": 3.9397604648405076, + "grad_norm": 0.567957809232153, + "learning_rate": 2.973202512703832e-08, + "loss": 0.0284, + "step": 33224 + }, + { + "epoch": 3.9398790466026323, + "grad_norm": 0.4615896130847188, + "learning_rate": 2.9615108215252928e-08, + "loss": 0.0249, + "step": 33225 + }, + { + "epoch": 3.9399976283647575, + "grad_norm": 0.7999447307807168, + "learning_rate": 2.9498421499793427e-08, + "loss": 0.036, + "step": 33226 + }, + { + "epoch": 3.9401162101268827, + "grad_norm": 0.6379839802148768, + "learning_rate": 2.9381964981739508e-08, + "loss": 0.0309, + "step": 33227 + }, + { + "epoch": 3.9402347918890075, + "grad_norm": 0.47007058413795816, + "learning_rate": 2.9265738662162535e-08, + "loss": 0.021, + "step": 33228 + }, + { + "epoch": 3.9403533736511323, + "grad_norm": 0.514948537388906, + "learning_rate": 2.914974254213665e-08, + "loss": 0.0211, + "step": 33229 + }, + { + "epoch": 3.9404719554132575, + "grad_norm": 0.5078316225566771, + "learning_rate": 2.903397662272489e-08, + "loss": 0.0266, + "step": 33230 + }, + { + "epoch": 3.9405905371753827, + "grad_norm": 0.5115989437419066, + "learning_rate": 2.89184409050014e-08, + "loss": 0.0211, + "step": 33231 + }, + { + "epoch": 3.9407091189375074, + "grad_norm": 0.4907206280381743, + "learning_rate": 2.8803135390026435e-08, + "loss": 0.0263, + "step": 33232 + }, + { + "epoch": 3.940827700699632, + "grad_norm": 0.595902187536758, + "learning_rate": 2.868806007886582e-08, + "loss": 0.0212, + "step": 33233 + }, + { + "epoch": 3.9409462824617574, + "grad_norm": 0.3440481870762188, + "learning_rate": 2.857321497257981e-08, + "loss": 0.0125, + "step": 33234 + }, + { + "epoch": 3.9410648642238826, + "grad_norm": 0.38337261032538417, + "learning_rate": 2.8458600072225892e-08, + "loss": 0.0175, + "step": 33235 + }, + { + "epoch": 3.9411834459860073, + "grad_norm": 0.4664249317528855, + "learning_rate": 2.834421537886156e-08, + "loss": 0.0191, + "step": 33236 + }, + { + "epoch": 3.941302027748132, + "grad_norm": 0.7826354055009099, + "learning_rate": 2.8230060893541523e-08, + "loss": 0.0299, + "step": 33237 + }, + { + "epoch": 3.9414206095102573, + "grad_norm": 0.7336010688011343, + "learning_rate": 2.8116136617317713e-08, + "loss": 0.0449, + "step": 33238 + }, + { + "epoch": 3.9415391912723825, + "grad_norm": 0.5086041149943982, + "learning_rate": 2.800244255124207e-08, + "loss": 0.0274, + "step": 33239 + }, + { + "epoch": 3.9416577730345073, + "grad_norm": 0.475915966515898, + "learning_rate": 2.7888978696360978e-08, + "loss": 0.0228, + "step": 33240 + }, + { + "epoch": 3.941776354796632, + "grad_norm": 0.4976317682800137, + "learning_rate": 2.777574505371805e-08, + "loss": 0.018, + "step": 33241 + }, + { + "epoch": 3.9418949365587572, + "grad_norm": 0.6511886869805208, + "learning_rate": 2.7662741624362442e-08, + "loss": 0.0315, + "step": 33242 + }, + { + "epoch": 3.9420135183208824, + "grad_norm": 0.5654749039515734, + "learning_rate": 2.7549968409332217e-08, + "loss": 0.0315, + "step": 33243 + }, + { + "epoch": 3.942132100083007, + "grad_norm": 0.4326641467331919, + "learning_rate": 2.7437425409668206e-08, + "loss": 0.0195, + "step": 33244 + }, + { + "epoch": 3.942250681845132, + "grad_norm": 0.502028371232107, + "learning_rate": 2.732511262640569e-08, + "loss": 0.0163, + "step": 33245 + }, + { + "epoch": 3.942369263607257, + "grad_norm": 0.6385219052127855, + "learning_rate": 2.721303006058551e-08, + "loss": 0.0345, + "step": 33246 + }, + { + "epoch": 3.9424878453693823, + "grad_norm": 0.47688329075963737, + "learning_rate": 2.7101177713237392e-08, + "loss": 0.0157, + "step": 33247 + }, + { + "epoch": 3.942606427131507, + "grad_norm": 0.44126706985848213, + "learning_rate": 2.6989555585388294e-08, + "loss": 0.0195, + "step": 33248 + }, + { + "epoch": 3.9427250088936323, + "grad_norm": 0.8309466778645737, + "learning_rate": 2.6878163678076274e-08, + "loss": 0.0407, + "step": 33249 + }, + { + "epoch": 3.942843590655757, + "grad_norm": 0.5929931781748773, + "learning_rate": 2.6767001992322736e-08, + "loss": 0.0195, + "step": 33250 + }, + { + "epoch": 3.9429621724178823, + "grad_norm": 0.5525776095818556, + "learning_rate": 2.6656070529151868e-08, + "loss": 0.0404, + "step": 33251 + }, + { + "epoch": 3.943080754180007, + "grad_norm": 0.584919513568713, + "learning_rate": 2.654536928958784e-08, + "loss": 0.0258, + "step": 33252 + }, + { + "epoch": 3.9431993359421322, + "grad_norm": 0.5403495940871487, + "learning_rate": 2.6434898274652063e-08, + "loss": 0.0361, + "step": 33253 + }, + { + "epoch": 3.943317917704257, + "grad_norm": 0.4368226971767452, + "learning_rate": 2.6324657485360393e-08, + "loss": 0.0247, + "step": 33254 + }, + { + "epoch": 3.943436499466382, + "grad_norm": 0.4681428419532868, + "learning_rate": 2.6214646922731458e-08, + "loss": 0.0209, + "step": 33255 + }, + { + "epoch": 3.943555081228507, + "grad_norm": 0.38246262693066607, + "learning_rate": 2.6104866587778332e-08, + "loss": 0.0133, + "step": 33256 + }, + { + "epoch": 3.943673662990632, + "grad_norm": 0.544755696655888, + "learning_rate": 2.5995316481514097e-08, + "loss": 0.0257, + "step": 33257 + }, + { + "epoch": 3.943792244752757, + "grad_norm": 0.8447314554344055, + "learning_rate": 2.5885996604946282e-08, + "loss": 0.0392, + "step": 33258 + }, + { + "epoch": 3.943910826514882, + "grad_norm": 0.3835102045147337, + "learning_rate": 2.5776906959087964e-08, + "loss": 0.0201, + "step": 33259 + }, + { + "epoch": 3.944029408277007, + "grad_norm": 0.5466139857997736, + "learning_rate": 2.5668047544938346e-08, + "loss": 0.0273, + "step": 33260 + }, + { + "epoch": 3.944147990039132, + "grad_norm": 0.563439669172558, + "learning_rate": 2.5559418363502176e-08, + "loss": 0.0244, + "step": 33261 + }, + { + "epoch": 3.944266571801257, + "grad_norm": 0.5477253181803882, + "learning_rate": 2.5451019415784204e-08, + "loss": 0.0238, + "step": 33262 + }, + { + "epoch": 3.944385153563382, + "grad_norm": 0.5118823243822774, + "learning_rate": 2.534285070278364e-08, + "loss": 0.0237, + "step": 33263 + }, + { + "epoch": 3.944503735325507, + "grad_norm": 0.3961895110886314, + "learning_rate": 2.5234912225494123e-08, + "loss": 0.018, + "step": 33264 + }, + { + "epoch": 3.944622317087632, + "grad_norm": 0.6005938511479746, + "learning_rate": 2.5127203984912083e-08, + "loss": 0.0296, + "step": 33265 + }, + { + "epoch": 3.9447408988497568, + "grad_norm": 0.6970616408585286, + "learning_rate": 2.501972598203395e-08, + "loss": 0.0306, + "step": 33266 + }, + { + "epoch": 3.944859480611882, + "grad_norm": 0.6508395195034129, + "learning_rate": 2.4912478217845037e-08, + "loss": 0.0307, + "step": 33267 + }, + { + "epoch": 3.9449780623740067, + "grad_norm": 0.5737107132881263, + "learning_rate": 2.4805460693336225e-08, + "loss": 0.0179, + "step": 33268 + }, + { + "epoch": 3.945096644136132, + "grad_norm": 0.49382860710013043, + "learning_rate": 2.469867340949561e-08, + "loss": 0.0129, + "step": 33269 + }, + { + "epoch": 3.9452152258982567, + "grad_norm": 0.5625446173804111, + "learning_rate": 2.459211636730574e-08, + "loss": 0.0211, + "step": 33270 + }, + { + "epoch": 3.945333807660382, + "grad_norm": 0.30508586872582927, + "learning_rate": 2.4485789567751938e-08, + "loss": 0.0126, + "step": 33271 + }, + { + "epoch": 3.945452389422507, + "grad_norm": 0.5624088330829993, + "learning_rate": 2.4379693011808425e-08, + "loss": 0.0289, + "step": 33272 + }, + { + "epoch": 3.945570971184632, + "grad_norm": 0.4603544067624388, + "learning_rate": 2.4273826700460523e-08, + "loss": 0.0224, + "step": 33273 + }, + { + "epoch": 3.9456895529467566, + "grad_norm": 0.5426730013010926, + "learning_rate": 2.4168190634679676e-08, + "loss": 0.0301, + "step": 33274 + }, + { + "epoch": 3.945808134708882, + "grad_norm": 0.2659185190714421, + "learning_rate": 2.406278481544011e-08, + "loss": 0.0126, + "step": 33275 + }, + { + "epoch": 3.945926716471007, + "grad_norm": 0.3460479111094722, + "learning_rate": 2.3957609243713262e-08, + "loss": 0.0141, + "step": 33276 + }, + { + "epoch": 3.9460452982331318, + "grad_norm": 0.5686137733389033, + "learning_rate": 2.3852663920470585e-08, + "loss": 0.0235, + "step": 33277 + }, + { + "epoch": 3.9461638799952565, + "grad_norm": 0.41371014797544314, + "learning_rate": 2.3747948846680746e-08, + "loss": 0.024, + "step": 33278 + }, + { + "epoch": 3.9462824617573817, + "grad_norm": 0.6824427298696607, + "learning_rate": 2.3643464023304084e-08, + "loss": 0.025, + "step": 33279 + }, + { + "epoch": 3.946401043519507, + "grad_norm": 0.5075238854029375, + "learning_rate": 2.35392094513065e-08, + "loss": 0.0215, + "step": 33280 + }, + { + "epoch": 3.9465196252816317, + "grad_norm": 0.4065955285251573, + "learning_rate": 2.3435185131651105e-08, + "loss": 0.0172, + "step": 33281 + }, + { + "epoch": 3.9466382070437565, + "grad_norm": 0.432524007741599, + "learning_rate": 2.3331391065292695e-08, + "loss": 0.0198, + "step": 33282 + }, + { + "epoch": 3.9467567888058817, + "grad_norm": 0.4178130462399298, + "learning_rate": 2.3227827253191613e-08, + "loss": 0.0137, + "step": 33283 + }, + { + "epoch": 3.946875370568007, + "grad_norm": 0.5069259523306548, + "learning_rate": 2.312449369630265e-08, + "loss": 0.0179, + "step": 33284 + }, + { + "epoch": 3.9469939523301316, + "grad_norm": 0.8699350571290507, + "learning_rate": 2.3021390395577823e-08, + "loss": 0.0443, + "step": 33285 + }, + { + "epoch": 3.9471125340922564, + "grad_norm": 0.41344177232420504, + "learning_rate": 2.2918517351963597e-08, + "loss": 0.0247, + "step": 33286 + }, + { + "epoch": 3.9472311158543816, + "grad_norm": 0.29006115358845885, + "learning_rate": 2.2815874566414763e-08, + "loss": 0.0112, + "step": 33287 + }, + { + "epoch": 3.947349697616507, + "grad_norm": 0.6424879791521693, + "learning_rate": 2.2713462039872234e-08, + "loss": 0.0457, + "step": 33288 + }, + { + "epoch": 3.9474682793786315, + "grad_norm": 0.5103777512446931, + "learning_rate": 2.261127977328248e-08, + "loss": 0.0158, + "step": 33289 + }, + { + "epoch": 3.9475868611407563, + "grad_norm": 0.7432848616512039, + "learning_rate": 2.2509327767589183e-08, + "loss": 0.0299, + "step": 33290 + }, + { + "epoch": 3.9477054429028815, + "grad_norm": 0.8017004912995923, + "learning_rate": 2.2407606023730486e-08, + "loss": 0.0361, + "step": 33291 + }, + { + "epoch": 3.9478240246650067, + "grad_norm": 0.4372709227726136, + "learning_rate": 2.2306114542641753e-08, + "loss": 0.0189, + "step": 33292 + }, + { + "epoch": 3.9479426064271315, + "grad_norm": 0.6276430561386005, + "learning_rate": 2.2204853325263896e-08, + "loss": 0.0415, + "step": 33293 + }, + { + "epoch": 3.948061188189256, + "grad_norm": 0.727516067484085, + "learning_rate": 2.2103822372523952e-08, + "loss": 0.0364, + "step": 33294 + }, + { + "epoch": 3.9481797699513814, + "grad_norm": 0.43798591668962916, + "learning_rate": 2.200302168536006e-08, + "loss": 0.0191, + "step": 33295 + }, + { + "epoch": 3.9482983517135066, + "grad_norm": 0.8125084105221255, + "learning_rate": 2.1902451264696477e-08, + "loss": 0.0318, + "step": 33296 + }, + { + "epoch": 3.9484169334756314, + "grad_norm": 0.4753295937839806, + "learning_rate": 2.180211111146302e-08, + "loss": 0.0196, + "step": 33297 + }, + { + "epoch": 3.9485355152377566, + "grad_norm": 0.4620646722469581, + "learning_rate": 2.1702001226583946e-08, + "loss": 0.0208, + "step": 33298 + }, + { + "epoch": 3.9486540969998813, + "grad_norm": 0.31352129138887713, + "learning_rate": 2.1602121610980743e-08, + "loss": 0.013, + "step": 33299 + }, + { + "epoch": 3.9487726787620065, + "grad_norm": 0.3530703397406167, + "learning_rate": 2.1502472265577666e-08, + "loss": 0.02, + "step": 33300 + }, + { + "epoch": 3.9488912605241313, + "grad_norm": 0.5246471004380944, + "learning_rate": 2.1403053191290655e-08, + "loss": 0.024, + "step": 33301 + }, + { + "epoch": 3.9490098422862565, + "grad_norm": 0.5699306367049539, + "learning_rate": 2.1303864389035645e-08, + "loss": 0.03, + "step": 33302 + }, + { + "epoch": 3.9491284240483813, + "grad_norm": 0.44784609706341655, + "learning_rate": 2.1204905859728565e-08, + "loss": 0.0228, + "step": 33303 + }, + { + "epoch": 3.9492470058105065, + "grad_norm": 0.6628519838695274, + "learning_rate": 2.1106177604279797e-08, + "loss": 0.0255, + "step": 33304 + }, + { + "epoch": 3.9493655875726312, + "grad_norm": 0.7598229157371881, + "learning_rate": 2.1007679623602506e-08, + "loss": 0.0279, + "step": 33305 + }, + { + "epoch": 3.9494841693347564, + "grad_norm": 0.3488243689353204, + "learning_rate": 2.0909411918601518e-08, + "loss": 0.0136, + "step": 33306 + }, + { + "epoch": 3.949602751096881, + "grad_norm": 0.5978145225080252, + "learning_rate": 2.0811374490187218e-08, + "loss": 0.0319, + "step": 33307 + }, + { + "epoch": 3.9497213328590064, + "grad_norm": 0.81646662108966, + "learning_rate": 2.0713567339258887e-08, + "loss": 0.0278, + "step": 33308 + }, + { + "epoch": 3.949839914621131, + "grad_norm": 0.2906399380038868, + "learning_rate": 2.0615990466718583e-08, + "loss": 0.0099, + "step": 33309 + }, + { + "epoch": 3.9499584963832564, + "grad_norm": 0.7839427778139317, + "learning_rate": 2.0518643873468358e-08, + "loss": 0.0439, + "step": 33310 + }, + { + "epoch": 3.950077078145381, + "grad_norm": 0.3802063271237251, + "learning_rate": 2.0421527560404718e-08, + "loss": 0.0174, + "step": 33311 + }, + { + "epoch": 3.9501956599075063, + "grad_norm": 0.8334606939550585, + "learning_rate": 2.0324641528421395e-08, + "loss": 0.0258, + "step": 33312 + }, + { + "epoch": 3.950314241669631, + "grad_norm": 0.48842002412218066, + "learning_rate": 2.0227985778414894e-08, + "loss": 0.024, + "step": 33313 + }, + { + "epoch": 3.9504328234317563, + "grad_norm": 0.34482306787857625, + "learning_rate": 2.0131560311273388e-08, + "loss": 0.016, + "step": 33314 + }, + { + "epoch": 3.950551405193881, + "grad_norm": 0.668074916915112, + "learning_rate": 2.0035365127885065e-08, + "loss": 0.0398, + "step": 33315 + }, + { + "epoch": 3.9506699869560062, + "grad_norm": 0.7292158189015361, + "learning_rate": 1.993940022914087e-08, + "loss": 0.0234, + "step": 33316 + }, + { + "epoch": 3.950788568718131, + "grad_norm": 0.5542903141174368, + "learning_rate": 1.984366561592066e-08, + "loss": 0.0218, + "step": 33317 + }, + { + "epoch": 3.950907150480256, + "grad_norm": 0.6983907734907087, + "learning_rate": 1.9748161289109836e-08, + "loss": 0.0318, + "step": 33318 + }, + { + "epoch": 3.951025732242381, + "grad_norm": 0.5026446665268934, + "learning_rate": 1.9652887249588248e-08, + "loss": 0.0229, + "step": 33319 + }, + { + "epoch": 3.951144314004506, + "grad_norm": 0.45035518577119305, + "learning_rate": 1.9557843498232974e-08, + "loss": 0.0186, + "step": 33320 + }, + { + "epoch": 3.9512628957666314, + "grad_norm": 0.8736449790911988, + "learning_rate": 1.9463030035923868e-08, + "loss": 0.0427, + "step": 33321 + }, + { + "epoch": 3.951381477528756, + "grad_norm": 0.787652893351557, + "learning_rate": 1.9368446863529677e-08, + "loss": 0.0472, + "step": 33322 + }, + { + "epoch": 3.951500059290881, + "grad_norm": 0.6938650175795306, + "learning_rate": 1.9274093981927478e-08, + "loss": 0.0308, + "step": 33323 + }, + { + "epoch": 3.951618641053006, + "grad_norm": 0.3992487206463021, + "learning_rate": 1.917997139198324e-08, + "loss": 0.0177, + "step": 33324 + }, + { + "epoch": 3.9517372228151313, + "grad_norm": 0.38997967967985364, + "learning_rate": 1.9086079094565725e-08, + "loss": 0.015, + "step": 33325 + }, + { + "epoch": 3.951855804577256, + "grad_norm": 0.47620322569555584, + "learning_rate": 1.8992417090540893e-08, + "loss": 0.0184, + "step": 33326 + }, + { + "epoch": 3.951974386339381, + "grad_norm": 0.7168950964251504, + "learning_rate": 1.8898985380774726e-08, + "loss": 0.0283, + "step": 33327 + }, + { + "epoch": 3.952092968101506, + "grad_norm": 0.5900581676943425, + "learning_rate": 1.8805783966124867e-08, + "loss": 0.0374, + "step": 33328 + }, + { + "epoch": 3.952211549863631, + "grad_norm": 0.6795039532718486, + "learning_rate": 1.8712812847451745e-08, + "loss": 0.0301, + "step": 33329 + }, + { + "epoch": 3.952330131625756, + "grad_norm": 0.4208519848720337, + "learning_rate": 1.8620072025610224e-08, + "loss": 0.0199, + "step": 33330 + }, + { + "epoch": 3.9524487133878807, + "grad_norm": 0.32945032353248216, + "learning_rate": 1.8527561501460735e-08, + "loss": 0.0154, + "step": 33331 + }, + { + "epoch": 3.952567295150006, + "grad_norm": 0.523526055687681, + "learning_rate": 1.8435281275849815e-08, + "loss": 0.0219, + "step": 33332 + }, + { + "epoch": 3.952685876912131, + "grad_norm": 0.645465703850089, + "learning_rate": 1.834323134963234e-08, + "loss": 0.0318, + "step": 33333 + }, + { + "epoch": 3.952804458674256, + "grad_norm": 0.29859890297082187, + "learning_rate": 1.825141172365763e-08, + "loss": 0.0095, + "step": 33334 + }, + { + "epoch": 3.9529230404363807, + "grad_norm": 0.3980779321233909, + "learning_rate": 1.815982239876668e-08, + "loss": 0.0171, + "step": 33335 + }, + { + "epoch": 3.953041622198506, + "grad_norm": 0.5215748494899981, + "learning_rate": 1.8068463375811583e-08, + "loss": 0.025, + "step": 33336 + }, + { + "epoch": 3.953160203960631, + "grad_norm": 0.44206510968084234, + "learning_rate": 1.7977334655627787e-08, + "loss": 0.0206, + "step": 33337 + }, + { + "epoch": 3.953278785722756, + "grad_norm": 0.4157283237211688, + "learning_rate": 1.788643623905628e-08, + "loss": 0.0173, + "step": 33338 + }, + { + "epoch": 3.9533973674848806, + "grad_norm": 0.34662156852177634, + "learning_rate": 1.7795768126940836e-08, + "loss": 0.0155, + "step": 33339 + }, + { + "epoch": 3.953515949247006, + "grad_norm": 0.4503307308227166, + "learning_rate": 1.7705330320111348e-08, + "loss": 0.019, + "step": 33340 + }, + { + "epoch": 3.953634531009131, + "grad_norm": 0.48493389639301543, + "learning_rate": 1.761512281940325e-08, + "loss": 0.0189, + "step": 33341 + }, + { + "epoch": 3.9537531127712557, + "grad_norm": 0.5958974861723171, + "learning_rate": 1.7525145625646444e-08, + "loss": 0.0244, + "step": 33342 + }, + { + "epoch": 3.9538716945333805, + "grad_norm": 0.5784253873447858, + "learning_rate": 1.7435398739676367e-08, + "loss": 0.0263, + "step": 33343 + }, + { + "epoch": 3.9539902762955057, + "grad_norm": 0.5940825496065845, + "learning_rate": 1.734588216231181e-08, + "loss": 0.0304, + "step": 33344 + }, + { + "epoch": 3.954108858057631, + "grad_norm": 0.5415594843788395, + "learning_rate": 1.725659589438544e-08, + "loss": 0.0293, + "step": 33345 + }, + { + "epoch": 3.9542274398197557, + "grad_norm": 0.5961538249479175, + "learning_rate": 1.716753993671605e-08, + "loss": 0.0206, + "step": 33346 + }, + { + "epoch": 3.954346021581881, + "grad_norm": 0.4568338727030447, + "learning_rate": 1.7078714290127973e-08, + "loss": 0.0315, + "step": 33347 + }, + { + "epoch": 3.9544646033440056, + "grad_norm": 0.7711122450524647, + "learning_rate": 1.699011895543723e-08, + "loss": 0.04, + "step": 33348 + }, + { + "epoch": 3.954583185106131, + "grad_norm": 0.6052112368059128, + "learning_rate": 1.690175393345983e-08, + "loss": 0.0349, + "step": 33349 + }, + { + "epoch": 3.9547017668682556, + "grad_norm": 0.4233202924285294, + "learning_rate": 1.681361922501179e-08, + "loss": 0.0224, + "step": 33350 + }, + { + "epoch": 3.954820348630381, + "grad_norm": 0.7050035963003074, + "learning_rate": 1.6725714830909124e-08, + "loss": 0.0345, + "step": 33351 + }, + { + "epoch": 3.9549389303925055, + "grad_norm": 0.2899365796227922, + "learning_rate": 1.663804075195674e-08, + "loss": 0.0137, + "step": 33352 + }, + { + "epoch": 3.9550575121546307, + "grad_norm": 0.32790344076510186, + "learning_rate": 1.6550596988965107e-08, + "loss": 0.0152, + "step": 33353 + }, + { + "epoch": 3.9551760939167555, + "grad_norm": 0.3674005124781923, + "learning_rate": 1.646338354273913e-08, + "loss": 0.0156, + "step": 33354 + }, + { + "epoch": 3.9552946756788807, + "grad_norm": 0.3582819154156955, + "learning_rate": 1.6376400414083725e-08, + "loss": 0.018, + "step": 33355 + }, + { + "epoch": 3.9554132574410055, + "grad_norm": 0.4737574410197525, + "learning_rate": 1.6289647603803802e-08, + "loss": 0.0255, + "step": 33356 + }, + { + "epoch": 3.9555318392031307, + "grad_norm": 0.586575450901515, + "learning_rate": 1.6203125112693174e-08, + "loss": 0.0204, + "step": 33357 + }, + { + "epoch": 3.9556504209652554, + "grad_norm": 0.3313639955578634, + "learning_rate": 1.6116832941553973e-08, + "loss": 0.0159, + "step": 33358 + }, + { + "epoch": 3.9557690027273806, + "grad_norm": 0.4763974137478306, + "learning_rate": 1.603077109118001e-08, + "loss": 0.0141, + "step": 33359 + }, + { + "epoch": 3.9558875844895054, + "grad_norm": 0.4174554132761619, + "learning_rate": 1.5944939562367867e-08, + "loss": 0.0178, + "step": 33360 + }, + { + "epoch": 3.9560061662516306, + "grad_norm": 0.7645576234217413, + "learning_rate": 1.5859338355900254e-08, + "loss": 0.0337, + "step": 33361 + }, + { + "epoch": 3.9561247480137554, + "grad_norm": 0.4674668474331656, + "learning_rate": 1.5773967472576535e-08, + "loss": 0.0232, + "step": 33362 + }, + { + "epoch": 3.9562433297758806, + "grad_norm": 0.47437316723981426, + "learning_rate": 1.5688826913176635e-08, + "loss": 0.0241, + "step": 33363 + }, + { + "epoch": 3.9563619115380053, + "grad_norm": 0.7056316030056391, + "learning_rate": 1.5603916678488818e-08, + "loss": 0.0266, + "step": 33364 + }, + { + "epoch": 3.9564804933001305, + "grad_norm": 0.9505253538107804, + "learning_rate": 1.5519236769295788e-08, + "loss": 0.0398, + "step": 33365 + }, + { + "epoch": 3.9565990750622553, + "grad_norm": 0.6717216063973027, + "learning_rate": 1.5434787186377474e-08, + "loss": 0.0452, + "step": 33366 + }, + { + "epoch": 3.9567176568243805, + "grad_norm": 0.4365995629369219, + "learning_rate": 1.535056793051104e-08, + "loss": 0.0224, + "step": 33367 + }, + { + "epoch": 3.9568362385865052, + "grad_norm": 0.718716788459023, + "learning_rate": 1.5266579002473636e-08, + "loss": 0.0324, + "step": 33368 + }, + { + "epoch": 3.9569548203486304, + "grad_norm": 0.42188557777499547, + "learning_rate": 1.518282040304242e-08, + "loss": 0.0169, + "step": 33369 + }, + { + "epoch": 3.957073402110755, + "grad_norm": 0.40807401040173635, + "learning_rate": 1.509929213298622e-08, + "loss": 0.0176, + "step": 33370 + }, + { + "epoch": 3.9571919838728804, + "grad_norm": 0.5190384249680117, + "learning_rate": 1.5015994193076642e-08, + "loss": 0.0214, + "step": 33371 + }, + { + "epoch": 3.957310565635005, + "grad_norm": 0.4697425186885614, + "learning_rate": 1.4932926584079743e-08, + "loss": 0.0203, + "step": 33372 + }, + { + "epoch": 3.9574291473971304, + "grad_norm": 0.4361057637699822, + "learning_rate": 1.4850089306761572e-08, + "loss": 0.0263, + "step": 33373 + }, + { + "epoch": 3.9575477291592556, + "grad_norm": 0.27940200951258337, + "learning_rate": 1.4767482361888185e-08, + "loss": 0.0154, + "step": 33374 + }, + { + "epoch": 3.9576663109213803, + "grad_norm": 0.4453220082080854, + "learning_rate": 1.4685105750220085e-08, + "loss": 0.023, + "step": 33375 + }, + { + "epoch": 3.957784892683505, + "grad_norm": 0.5712400589838775, + "learning_rate": 1.4602959472514999e-08, + "loss": 0.0194, + "step": 33376 + }, + { + "epoch": 3.9579034744456303, + "grad_norm": 0.4303004205702741, + "learning_rate": 1.4521043529533429e-08, + "loss": 0.0214, + "step": 33377 + }, + { + "epoch": 3.9580220562077555, + "grad_norm": 0.4917654995034687, + "learning_rate": 1.4439357922027553e-08, + "loss": 0.0198, + "step": 33378 + }, + { + "epoch": 3.9581406379698802, + "grad_norm": 0.4143132025571263, + "learning_rate": 1.4357902650752319e-08, + "loss": 0.0169, + "step": 33379 + }, + { + "epoch": 3.958259219732005, + "grad_norm": 0.9097792205530535, + "learning_rate": 1.4276677716457131e-08, + "loss": 0.0301, + "step": 33380 + }, + { + "epoch": 3.95837780149413, + "grad_norm": 0.4603654500396708, + "learning_rate": 1.419568311989139e-08, + "loss": 0.0253, + "step": 33381 + }, + { + "epoch": 3.9584963832562554, + "grad_norm": 0.5931878672442378, + "learning_rate": 1.4114918861801717e-08, + "loss": 0.0234, + "step": 33382 + }, + { + "epoch": 3.95861496501838, + "grad_norm": 0.514240767272106, + "learning_rate": 1.4034384942931966e-08, + "loss": 0.0207, + "step": 33383 + }, + { + "epoch": 3.958733546780505, + "grad_norm": 0.3286816502405666, + "learning_rate": 1.3954081364025985e-08, + "loss": 0.015, + "step": 33384 + }, + { + "epoch": 3.95885212854263, + "grad_norm": 0.437126141347465, + "learning_rate": 1.3874008125822069e-08, + "loss": 0.0179, + "step": 33385 + }, + { + "epoch": 3.9589707103047553, + "grad_norm": 0.5696328829027724, + "learning_rate": 1.3794165229061296e-08, + "loss": 0.0225, + "step": 33386 + }, + { + "epoch": 3.95908929206688, + "grad_norm": 0.6901394592929107, + "learning_rate": 1.3714552674479186e-08, + "loss": 0.0364, + "step": 33387 + }, + { + "epoch": 3.959207873829005, + "grad_norm": 0.41681841502768063, + "learning_rate": 1.3635170462808488e-08, + "loss": 0.0177, + "step": 33388 + }, + { + "epoch": 3.95932645559113, + "grad_norm": 0.5214951510369354, + "learning_rate": 1.3556018594779174e-08, + "loss": 0.0286, + "step": 33389 + }, + { + "epoch": 3.9594450373532553, + "grad_norm": 0.7024763626693468, + "learning_rate": 1.347709707112399e-08, + "loss": 0.0343, + "step": 33390 + }, + { + "epoch": 3.95956361911538, + "grad_norm": 0.6370604862385302, + "learning_rate": 1.3398405892570132e-08, + "loss": 0.0343, + "step": 33391 + }, + { + "epoch": 3.9596822008775048, + "grad_norm": 0.6862260007516471, + "learning_rate": 1.3319945059842021e-08, + "loss": 0.0416, + "step": 33392 + }, + { + "epoch": 3.95980078263963, + "grad_norm": 0.4361854924546859, + "learning_rate": 1.3241714573664077e-08, + "loss": 0.0183, + "step": 33393 + }, + { + "epoch": 3.959919364401755, + "grad_norm": 0.5996192596353614, + "learning_rate": 1.316371443475517e-08, + "loss": 0.0271, + "step": 33394 + }, + { + "epoch": 3.96003794616388, + "grad_norm": 0.6004792035393172, + "learning_rate": 1.3085944643836946e-08, + "loss": 0.0255, + "step": 33395 + }, + { + "epoch": 3.960156527926005, + "grad_norm": 0.5444938220944684, + "learning_rate": 1.3008405201625496e-08, + "loss": 0.031, + "step": 33396 + }, + { + "epoch": 3.96027510968813, + "grad_norm": 0.4203790600809413, + "learning_rate": 1.2931096108836915e-08, + "loss": 0.0199, + "step": 33397 + }, + { + "epoch": 3.960393691450255, + "grad_norm": 0.3458964617073445, + "learning_rate": 1.2854017366178971e-08, + "loss": 0.0161, + "step": 33398 + }, + { + "epoch": 3.96051227321238, + "grad_norm": 0.5679769464024761, + "learning_rate": 1.2777168974367759e-08, + "loss": 0.0231, + "step": 33399 + }, + { + "epoch": 3.960630854974505, + "grad_norm": 0.3978239993186301, + "learning_rate": 1.2700550934108268e-08, + "loss": 0.0169, + "step": 33400 + }, + { + "epoch": 3.96074943673663, + "grad_norm": 0.41066305813617915, + "learning_rate": 1.2624163246111043e-08, + "loss": 0.0157, + "step": 33401 + }, + { + "epoch": 3.960868018498755, + "grad_norm": 0.3856678166152162, + "learning_rate": 1.2548005911075523e-08, + "loss": 0.0196, + "step": 33402 + }, + { + "epoch": 3.96098660026088, + "grad_norm": 0.5160662764001938, + "learning_rate": 1.2472078929706699e-08, + "loss": 0.0229, + "step": 33403 + }, + { + "epoch": 3.961105182023005, + "grad_norm": 0.3791959134212932, + "learning_rate": 1.2396382302701238e-08, + "loss": 0.0174, + "step": 33404 + }, + { + "epoch": 3.9612237637851297, + "grad_norm": 0.45983013355420516, + "learning_rate": 1.2320916030761354e-08, + "loss": 0.021, + "step": 33405 + }, + { + "epoch": 3.961342345547255, + "grad_norm": 0.6512972058913038, + "learning_rate": 1.2245680114580938e-08, + "loss": 0.0277, + "step": 33406 + }, + { + "epoch": 3.9614609273093797, + "grad_norm": 0.6359189736217512, + "learning_rate": 1.2170674554851103e-08, + "loss": 0.0206, + "step": 33407 + }, + { + "epoch": 3.961579509071505, + "grad_norm": 0.48899038012208884, + "learning_rate": 1.2095899352268514e-08, + "loss": 0.0221, + "step": 33408 + }, + { + "epoch": 3.9616980908336297, + "grad_norm": 0.6876925752930285, + "learning_rate": 1.2021354507518734e-08, + "loss": 0.035, + "step": 33409 + }, + { + "epoch": 3.961816672595755, + "grad_norm": 0.3535833849806326, + "learning_rate": 1.1947040021287325e-08, + "loss": 0.0217, + "step": 33410 + }, + { + "epoch": 3.9619352543578796, + "grad_norm": 0.41881379884328623, + "learning_rate": 1.1872955894265402e-08, + "loss": 0.0161, + "step": 33411 + }, + { + "epoch": 3.962053836120005, + "grad_norm": 0.7572290776367461, + "learning_rate": 1.1799102127130202e-08, + "loss": 0.0383, + "step": 33412 + }, + { + "epoch": 3.9621724178821296, + "grad_norm": 0.48937851169817354, + "learning_rate": 1.1725478720564509e-08, + "loss": 0.0278, + "step": 33413 + }, + { + "epoch": 3.962290999644255, + "grad_norm": 0.6023669444576925, + "learning_rate": 1.1652085675248337e-08, + "loss": 0.0239, + "step": 33414 + }, + { + "epoch": 3.9624095814063796, + "grad_norm": 0.5230481508513524, + "learning_rate": 1.1578922991856145e-08, + "loss": 0.0215, + "step": 33415 + }, + { + "epoch": 3.9625281631685048, + "grad_norm": 0.3531442888298612, + "learning_rate": 1.150599067106517e-08, + "loss": 0.0148, + "step": 33416 + }, + { + "epoch": 3.9626467449306295, + "grad_norm": 0.6198212887241226, + "learning_rate": 1.1433288713544321e-08, + "loss": 0.0256, + "step": 33417 + }, + { + "epoch": 3.9627653266927547, + "grad_norm": 0.5082527989387251, + "learning_rate": 1.1360817119968058e-08, + "loss": 0.0234, + "step": 33418 + }, + { + "epoch": 3.9628839084548795, + "grad_norm": 0.5295352023797615, + "learning_rate": 1.1288575890999742e-08, + "loss": 0.0343, + "step": 33419 + }, + { + "epoch": 3.9630024902170047, + "grad_norm": 0.5478557649355785, + "learning_rate": 1.1216565027308278e-08, + "loss": 0.0327, + "step": 33420 + }, + { + "epoch": 3.9631210719791294, + "grad_norm": 0.353763510272433, + "learning_rate": 1.1144784529554252e-08, + "loss": 0.0151, + "step": 33421 + }, + { + "epoch": 3.9632396537412546, + "grad_norm": 0.42416826347522707, + "learning_rate": 1.1073234398406573e-08, + "loss": 0.021, + "step": 33422 + }, + { + "epoch": 3.96335823550338, + "grad_norm": 0.4495168615381652, + "learning_rate": 1.1001914634517497e-08, + "loss": 0.0218, + "step": 33423 + }, + { + "epoch": 3.9634768172655046, + "grad_norm": 0.5203847238053332, + "learning_rate": 1.0930825238547603e-08, + "loss": 0.0319, + "step": 33424 + }, + { + "epoch": 3.9635953990276294, + "grad_norm": 0.5786984119457723, + "learning_rate": 1.0859966211151928e-08, + "loss": 0.0282, + "step": 33425 + }, + { + "epoch": 3.9637139807897546, + "grad_norm": 0.46641932743313264, + "learning_rate": 1.0789337552982726e-08, + "loss": 0.0287, + "step": 33426 + }, + { + "epoch": 3.9638325625518798, + "grad_norm": 0.6980987419607332, + "learning_rate": 1.0718939264692251e-08, + "loss": 0.0427, + "step": 33427 + }, + { + "epoch": 3.9639511443140045, + "grad_norm": 0.40453593400742477, + "learning_rate": 1.0648771346929986e-08, + "loss": 0.0147, + "step": 33428 + }, + { + "epoch": 3.9640697260761293, + "grad_norm": 0.7043887838562016, + "learning_rate": 1.0578833800342636e-08, + "loss": 0.037, + "step": 33429 + }, + { + "epoch": 3.9641883078382545, + "grad_norm": 0.37685271666239895, + "learning_rate": 1.0509126625574129e-08, + "loss": 0.0173, + "step": 33430 + }, + { + "epoch": 3.9643068896003797, + "grad_norm": 0.4218973852290514, + "learning_rate": 1.0439649823268394e-08, + "loss": 0.021, + "step": 33431 + }, + { + "epoch": 3.9644254713625044, + "grad_norm": 0.5153876219981371, + "learning_rate": 1.0370403394061034e-08, + "loss": 0.02, + "step": 33432 + }, + { + "epoch": 3.964544053124629, + "grad_norm": 0.4882484352307018, + "learning_rate": 1.0301387338598755e-08, + "loss": 0.0261, + "step": 33433 + }, + { + "epoch": 3.9646626348867544, + "grad_norm": 0.5643977769486357, + "learning_rate": 1.0232601657508834e-08, + "loss": 0.0216, + "step": 33434 + }, + { + "epoch": 3.9647812166488796, + "grad_norm": 0.48342338574336424, + "learning_rate": 1.0164046351432422e-08, + "loss": 0.0216, + "step": 33435 + }, + { + "epoch": 3.9648997984110044, + "grad_norm": 0.549292873580421, + "learning_rate": 1.0095721420999572e-08, + "loss": 0.0249, + "step": 33436 + }, + { + "epoch": 3.965018380173129, + "grad_norm": 0.306173806399259, + "learning_rate": 1.0027626866837559e-08, + "loss": 0.0124, + "step": 33437 + }, + { + "epoch": 3.9651369619352543, + "grad_norm": 0.704949179219455, + "learning_rate": 9.95976268957921e-09, + "loss": 0.0235, + "step": 33438 + }, + { + "epoch": 3.9652555436973795, + "grad_norm": 0.40024629719833515, + "learning_rate": 9.892128889843478e-09, + "loss": 0.0143, + "step": 33439 + }, + { + "epoch": 3.9653741254595043, + "grad_norm": 0.6604830832718767, + "learning_rate": 9.824725468260409e-09, + "loss": 0.0335, + "step": 33440 + }, + { + "epoch": 3.965492707221629, + "grad_norm": 0.3990751748615953, + "learning_rate": 9.757552425446182e-09, + "loss": 0.0248, + "step": 33441 + }, + { + "epoch": 3.9656112889837543, + "grad_norm": 0.636553261562833, + "learning_rate": 9.690609762022517e-09, + "loss": 0.0328, + "step": 33442 + }, + { + "epoch": 3.9657298707458795, + "grad_norm": 0.4409620024841515, + "learning_rate": 9.623897478608368e-09, + "loss": 0.0163, + "step": 33443 + }, + { + "epoch": 3.965848452508004, + "grad_norm": 0.44156640097557337, + "learning_rate": 9.557415575814355e-09, + "loss": 0.0238, + "step": 33444 + }, + { + "epoch": 3.965967034270129, + "grad_norm": 0.3493351787240521, + "learning_rate": 9.491164054256652e-09, + "loss": 0.0152, + "step": 33445 + }, + { + "epoch": 3.966085616032254, + "grad_norm": 0.5181913835909334, + "learning_rate": 9.425142914545881e-09, + "loss": 0.0144, + "step": 33446 + }, + { + "epoch": 3.9662041977943794, + "grad_norm": 0.41962597688742553, + "learning_rate": 9.359352157287116e-09, + "loss": 0.0173, + "step": 33447 + }, + { + "epoch": 3.966322779556504, + "grad_norm": 0.8226836187917135, + "learning_rate": 9.293791783090978e-09, + "loss": 0.0382, + "step": 33448 + }, + { + "epoch": 3.9664413613186293, + "grad_norm": 0.43117003801646736, + "learning_rate": 9.228461792562538e-09, + "loss": 0.0214, + "step": 33449 + }, + { + "epoch": 3.966559943080754, + "grad_norm": 0.6466098878437938, + "learning_rate": 9.163362186298541e-09, + "loss": 0.0254, + "step": 33450 + }, + { + "epoch": 3.9666785248428793, + "grad_norm": 0.5810504328998831, + "learning_rate": 9.09849296490406e-09, + "loss": 0.0148, + "step": 33451 + }, + { + "epoch": 3.966797106605004, + "grad_norm": 0.4738296323707724, + "learning_rate": 9.033854128975839e-09, + "loss": 0.0274, + "step": 33452 + }, + { + "epoch": 3.9669156883671293, + "grad_norm": 0.4813018982296376, + "learning_rate": 8.969445679110621e-09, + "loss": 0.0223, + "step": 33453 + }, + { + "epoch": 3.967034270129254, + "grad_norm": 0.35439683495596225, + "learning_rate": 8.905267615899604e-09, + "loss": 0.0175, + "step": 33454 + }, + { + "epoch": 3.9671528518913792, + "grad_norm": 0.5070083797320838, + "learning_rate": 8.841319939936754e-09, + "loss": 0.0242, + "step": 33455 + }, + { + "epoch": 3.967271433653504, + "grad_norm": 0.6744508737587046, + "learning_rate": 8.77760265181049e-09, + "loss": 0.0222, + "step": 33456 + }, + { + "epoch": 3.967390015415629, + "grad_norm": 0.39031230335063605, + "learning_rate": 8.714115752106456e-09, + "loss": 0.021, + "step": 33457 + }, + { + "epoch": 3.967508597177754, + "grad_norm": 0.8119395527306987, + "learning_rate": 8.650859241413068e-09, + "loss": 0.0401, + "step": 33458 + }, + { + "epoch": 3.967627178939879, + "grad_norm": 0.438014813769493, + "learning_rate": 8.587833120313194e-09, + "loss": 0.0206, + "step": 33459 + }, + { + "epoch": 3.967745760702004, + "grad_norm": 0.480641670293489, + "learning_rate": 8.525037389386926e-09, + "loss": 0.0195, + "step": 33460 + }, + { + "epoch": 3.967864342464129, + "grad_norm": 0.5067409468209351, + "learning_rate": 8.462472049214354e-09, + "loss": 0.0226, + "step": 33461 + }, + { + "epoch": 3.967982924226254, + "grad_norm": 0.3122453300822548, + "learning_rate": 8.400137100370021e-09, + "loss": 0.0114, + "step": 33462 + }, + { + "epoch": 3.968101505988379, + "grad_norm": 0.451518560398409, + "learning_rate": 8.338032543428465e-09, + "loss": 0.0223, + "step": 33463 + }, + { + "epoch": 3.968220087750504, + "grad_norm": 0.3303959199202032, + "learning_rate": 8.276158378964228e-09, + "loss": 0.0181, + "step": 33464 + }, + { + "epoch": 3.968338669512629, + "grad_norm": 0.6290543314369916, + "learning_rate": 8.214514607546298e-09, + "loss": 0.0257, + "step": 33465 + }, + { + "epoch": 3.968457251274754, + "grad_norm": 0.6299644880090217, + "learning_rate": 8.15310122974644e-09, + "loss": 0.0398, + "step": 33466 + }, + { + "epoch": 3.968575833036879, + "grad_norm": 0.8500162841630194, + "learning_rate": 8.091918246125319e-09, + "loss": 0.0449, + "step": 33467 + }, + { + "epoch": 3.9686944147990038, + "grad_norm": 0.36006024140183307, + "learning_rate": 8.030965657249145e-09, + "loss": 0.0178, + "step": 33468 + }, + { + "epoch": 3.968812996561129, + "grad_norm": 0.3977023888771763, + "learning_rate": 7.970243463681359e-09, + "loss": 0.021, + "step": 33469 + }, + { + "epoch": 3.9689315783232537, + "grad_norm": 0.477085403659299, + "learning_rate": 7.909751665982624e-09, + "loss": 0.021, + "step": 33470 + }, + { + "epoch": 3.969050160085379, + "grad_norm": 0.5173689934712813, + "learning_rate": 7.849490264705272e-09, + "loss": 0.0188, + "step": 33471 + }, + { + "epoch": 3.969168741847504, + "grad_norm": 0.7489436484237628, + "learning_rate": 7.78945926040997e-09, + "loss": 0.0348, + "step": 33472 + }, + { + "epoch": 3.969287323609629, + "grad_norm": 0.6738675046365007, + "learning_rate": 7.729658653649053e-09, + "loss": 0.0272, + "step": 33473 + }, + { + "epoch": 3.9694059053717536, + "grad_norm": 0.6401821038170387, + "learning_rate": 7.67008844497208e-09, + "loss": 0.0368, + "step": 33474 + }, + { + "epoch": 3.969524487133879, + "grad_norm": 0.4333466649139615, + "learning_rate": 7.61074863492861e-09, + "loss": 0.0213, + "step": 33475 + }, + { + "epoch": 3.969643068896004, + "grad_norm": 0.42664843713408995, + "learning_rate": 7.551639224068207e-09, + "loss": 0.025, + "step": 33476 + }, + { + "epoch": 3.969761650658129, + "grad_norm": 0.48769141959727486, + "learning_rate": 7.492760212932105e-09, + "loss": 0.0194, + "step": 33477 + }, + { + "epoch": 3.9698802324202536, + "grad_norm": 0.47012410498377144, + "learning_rate": 7.4341116020670844e-09, + "loss": 0.0255, + "step": 33478 + }, + { + "epoch": 3.9699988141823788, + "grad_norm": 0.8203046059069566, + "learning_rate": 7.3756933920088315e-09, + "loss": 0.0303, + "step": 33479 + }, + { + "epoch": 3.970117395944504, + "grad_norm": 0.550640641505537, + "learning_rate": 7.3175055833013536e-09, + "loss": 0.023, + "step": 33480 + }, + { + "epoch": 3.9702359777066287, + "grad_norm": 0.7135904698663438, + "learning_rate": 7.259548176474784e-09, + "loss": 0.0351, + "step": 33481 + }, + { + "epoch": 3.9703545594687535, + "grad_norm": 0.4232754774447789, + "learning_rate": 7.201821172070356e-09, + "loss": 0.0129, + "step": 33482 + }, + { + "epoch": 3.9704731412308787, + "grad_norm": 0.8924702996251427, + "learning_rate": 7.144324570615424e-09, + "loss": 0.0401, + "step": 33483 + }, + { + "epoch": 3.970591722993004, + "grad_norm": 0.38432137252088955, + "learning_rate": 7.087058372642896e-09, + "loss": 0.0159, + "step": 33484 + }, + { + "epoch": 3.9707103047551287, + "grad_norm": 0.7670766167633478, + "learning_rate": 7.030022578680129e-09, + "loss": 0.0421, + "step": 33485 + }, + { + "epoch": 3.9708288865172534, + "grad_norm": 0.61270673826891, + "learning_rate": 6.973217189248926e-09, + "loss": 0.0296, + "step": 33486 + }, + { + "epoch": 3.9709474682793786, + "grad_norm": 0.6288319718577854, + "learning_rate": 6.91664220487942e-09, + "loss": 0.025, + "step": 33487 + }, + { + "epoch": 3.971066050041504, + "grad_norm": 0.7424078144057263, + "learning_rate": 6.860297626087864e-09, + "loss": 0.0286, + "step": 33488 + }, + { + "epoch": 3.9711846318036286, + "grad_norm": 0.4108934654103966, + "learning_rate": 6.804183453396063e-09, + "loss": 0.0142, + "step": 33489 + }, + { + "epoch": 3.9713032135657533, + "grad_norm": 0.7943667004846999, + "learning_rate": 6.74829968732027e-09, + "loss": 0.0444, + "step": 33490 + }, + { + "epoch": 3.9714217953278785, + "grad_norm": 0.4321595980954415, + "learning_rate": 6.692646328376739e-09, + "loss": 0.0195, + "step": 33491 + }, + { + "epoch": 3.9715403770900037, + "grad_norm": 0.3777444108262476, + "learning_rate": 6.6372233770789495e-09, + "loss": 0.016, + "step": 33492 + }, + { + "epoch": 3.9716589588521285, + "grad_norm": 0.6141610936994182, + "learning_rate": 6.582030833937602e-09, + "loss": 0.0217, + "step": 33493 + }, + { + "epoch": 3.9717775406142533, + "grad_norm": 0.862652650788516, + "learning_rate": 6.527068699460626e-09, + "loss": 0.0473, + "step": 33494 + }, + { + "epoch": 3.9718961223763785, + "grad_norm": 0.38796290388118915, + "learning_rate": 6.472336974155946e-09, + "loss": 0.0213, + "step": 33495 + }, + { + "epoch": 3.9720147041385037, + "grad_norm": 0.37864212188776447, + "learning_rate": 6.417835658525939e-09, + "loss": 0.0166, + "step": 33496 + }, + { + "epoch": 3.9721332859006284, + "grad_norm": 0.5706579212970199, + "learning_rate": 6.363564753072981e-09, + "loss": 0.0321, + "step": 33497 + }, + { + "epoch": 3.9722518676627536, + "grad_norm": 0.6325652909201075, + "learning_rate": 6.309524258302224e-09, + "loss": 0.0358, + "step": 33498 + }, + { + "epoch": 3.9723704494248784, + "grad_norm": 0.43497038450153447, + "learning_rate": 6.255714174704941e-09, + "loss": 0.0207, + "step": 33499 + }, + { + "epoch": 3.9724890311870036, + "grad_norm": 0.6315274597984656, + "learning_rate": 6.202134502780732e-09, + "loss": 0.0301, + "step": 33500 + }, + { + "epoch": 3.9726076129491283, + "grad_norm": 0.38665273122915833, + "learning_rate": 6.148785243026423e-09, + "loss": 0.0212, + "step": 33501 + }, + { + "epoch": 3.9727261947112535, + "grad_norm": 0.6453158691205148, + "learning_rate": 6.095666395927735e-09, + "loss": 0.0187, + "step": 33502 + }, + { + "epoch": 3.9728447764733783, + "grad_norm": 0.4776977957228523, + "learning_rate": 6.042777961978719e-09, + "loss": 0.0195, + "step": 33503 + }, + { + "epoch": 3.9729633582355035, + "grad_norm": 0.45024784708470167, + "learning_rate": 5.990119941667871e-09, + "loss": 0.0186, + "step": 33504 + }, + { + "epoch": 3.9730819399976283, + "grad_norm": 0.5816145355326791, + "learning_rate": 5.937692335475365e-09, + "loss": 0.0259, + "step": 33505 + }, + { + "epoch": 3.9732005217597535, + "grad_norm": 0.6041035367797645, + "learning_rate": 5.885495143889696e-09, + "loss": 0.0298, + "step": 33506 + }, + { + "epoch": 3.9733191035218782, + "grad_norm": 0.6009960308419036, + "learning_rate": 5.8335283673882634e-09, + "loss": 0.0215, + "step": 33507 + }, + { + "epoch": 3.9734376852840034, + "grad_norm": 0.3960207046562683, + "learning_rate": 5.781792006451236e-09, + "loss": 0.0173, + "step": 33508 + }, + { + "epoch": 3.973556267046128, + "grad_norm": 0.46430158891759216, + "learning_rate": 5.730286061558787e-09, + "loss": 0.024, + "step": 33509 + }, + { + "epoch": 3.9736748488082534, + "grad_norm": 0.7191236628255693, + "learning_rate": 5.6790105331799845e-09, + "loss": 0.032, + "step": 33510 + }, + { + "epoch": 3.973793430570378, + "grad_norm": 0.6422602867247167, + "learning_rate": 5.627965421792225e-09, + "loss": 0.0276, + "step": 33511 + }, + { + "epoch": 3.9739120123325034, + "grad_norm": 0.6455536644402708, + "learning_rate": 5.5771507278645775e-09, + "loss": 0.0152, + "step": 33512 + }, + { + "epoch": 3.974030594094628, + "grad_norm": 0.47731731028171004, + "learning_rate": 5.526566451866111e-09, + "loss": 0.019, + "step": 33513 + }, + { + "epoch": 3.9741491758567533, + "grad_norm": 0.4308115089134773, + "learning_rate": 5.476212594263119e-09, + "loss": 0.0162, + "step": 33514 + }, + { + "epoch": 3.974267757618878, + "grad_norm": 0.6345167814546007, + "learning_rate": 5.4260891555163455e-09, + "loss": 0.0321, + "step": 33515 + }, + { + "epoch": 3.9743863393810033, + "grad_norm": 0.4885062637700662, + "learning_rate": 5.376196136092082e-09, + "loss": 0.0168, + "step": 33516 + }, + { + "epoch": 3.974504921143128, + "grad_norm": 0.5324106407532789, + "learning_rate": 5.326533536448297e-09, + "loss": 0.0231, + "step": 33517 + }, + { + "epoch": 3.9746235029052532, + "grad_norm": 0.6167880811648864, + "learning_rate": 5.2771013570457325e-09, + "loss": 0.0325, + "step": 33518 + }, + { + "epoch": 3.974742084667378, + "grad_norm": 0.503905220363612, + "learning_rate": 5.227899598334029e-09, + "loss": 0.0178, + "step": 33519 + }, + { + "epoch": 3.974860666429503, + "grad_norm": 0.3824082195610679, + "learning_rate": 5.17892826077393e-09, + "loss": 0.0165, + "step": 33520 + }, + { + "epoch": 3.9749792481916284, + "grad_norm": 0.6290994813798025, + "learning_rate": 5.130187344812298e-09, + "loss": 0.0304, + "step": 33521 + }, + { + "epoch": 3.975097829953753, + "grad_norm": 0.5484815855733038, + "learning_rate": 5.0816768508987756e-09, + "loss": 0.0205, + "step": 33522 + }, + { + "epoch": 3.975216411715878, + "grad_norm": 0.4899614693538434, + "learning_rate": 5.033396779480226e-09, + "loss": 0.0214, + "step": 33523 + }, + { + "epoch": 3.975334993478003, + "grad_norm": 0.698078372517803, + "learning_rate": 4.985347131006291e-09, + "loss": 0.0372, + "step": 33524 + }, + { + "epoch": 3.9754535752401283, + "grad_norm": 0.2919954622795099, + "learning_rate": 4.937527905912731e-09, + "loss": 0.0168, + "step": 33525 + }, + { + "epoch": 3.975572157002253, + "grad_norm": 0.5608720131182903, + "learning_rate": 4.889939104646412e-09, + "loss": 0.0226, + "step": 33526 + }, + { + "epoch": 3.975690738764378, + "grad_norm": 0.6189326068737782, + "learning_rate": 4.8425807276458735e-09, + "loss": 0.0239, + "step": 33527 + }, + { + "epoch": 3.975809320526503, + "grad_norm": 0.49089097001703935, + "learning_rate": 4.7954527753413246e-09, + "loss": 0.023, + "step": 33528 + }, + { + "epoch": 3.9759279022886282, + "grad_norm": 0.5607220910697843, + "learning_rate": 4.74855524817408e-09, + "loss": 0.0239, + "step": 33529 + }, + { + "epoch": 3.976046484050753, + "grad_norm": 0.6250316029015703, + "learning_rate": 4.701888146574351e-09, + "loss": 0.0264, + "step": 33530 + }, + { + "epoch": 3.9761650658128778, + "grad_norm": 0.5780898785553461, + "learning_rate": 4.655451470969574e-09, + "loss": 0.0268, + "step": 33531 + }, + { + "epoch": 3.976283647575003, + "grad_norm": 0.297413857866776, + "learning_rate": 4.6092452217927355e-09, + "loss": 0.0122, + "step": 33532 + }, + { + "epoch": 3.976402229337128, + "grad_norm": 0.8776370676449369, + "learning_rate": 4.56326939946572e-09, + "loss": 0.0319, + "step": 33533 + }, + { + "epoch": 3.976520811099253, + "grad_norm": 0.3541486486575698, + "learning_rate": 4.517524004415962e-09, + "loss": 0.0143, + "step": 33534 + }, + { + "epoch": 3.9766393928613777, + "grad_norm": 0.47232102026890993, + "learning_rate": 4.472009037059799e-09, + "loss": 0.0288, + "step": 33535 + }, + { + "epoch": 3.976757974623503, + "grad_norm": 0.7223284535705691, + "learning_rate": 4.426724497821888e-09, + "loss": 0.0332, + "step": 33536 + }, + { + "epoch": 3.976876556385628, + "grad_norm": 0.6398927238359544, + "learning_rate": 4.381670387118564e-09, + "loss": 0.0327, + "step": 33537 + }, + { + "epoch": 3.976995138147753, + "grad_norm": 0.5492235088649329, + "learning_rate": 4.33684670536616e-09, + "loss": 0.0334, + "step": 33538 + }, + { + "epoch": 3.9771137199098776, + "grad_norm": 0.5546684063012337, + "learning_rate": 4.292253452972683e-09, + "loss": 0.029, + "step": 33539 + }, + { + "epoch": 3.977232301672003, + "grad_norm": 0.5598314597922052, + "learning_rate": 4.247890630354467e-09, + "loss": 0.016, + "step": 33540 + }, + { + "epoch": 3.977350883434128, + "grad_norm": 0.5226542897935899, + "learning_rate": 4.203758237919519e-09, + "loss": 0.0222, + "step": 33541 + }, + { + "epoch": 3.9774694651962528, + "grad_norm": 0.4521422840402, + "learning_rate": 4.159856276073071e-09, + "loss": 0.019, + "step": 33542 + }, + { + "epoch": 3.9775880469583775, + "grad_norm": 0.4272944331619357, + "learning_rate": 4.116184745223128e-09, + "loss": 0.0148, + "step": 33543 + }, + { + "epoch": 3.9777066287205027, + "grad_norm": 0.8953037194128486, + "learning_rate": 4.072743645766597e-09, + "loss": 0.0357, + "step": 33544 + }, + { + "epoch": 3.977825210482628, + "grad_norm": 0.3219283539745039, + "learning_rate": 4.029532978108708e-09, + "loss": 0.0131, + "step": 33545 + }, + { + "epoch": 3.9779437922447527, + "grad_norm": 0.4052013947506946, + "learning_rate": 3.986552742646365e-09, + "loss": 0.0158, + "step": 33546 + }, + { + "epoch": 3.978062374006878, + "grad_norm": 0.48913984674193717, + "learning_rate": 3.943802939776475e-09, + "loss": 0.0157, + "step": 33547 + }, + { + "epoch": 3.9781809557690027, + "grad_norm": 1.0835384269692885, + "learning_rate": 3.901283569893166e-09, + "loss": 0.0469, + "step": 33548 + }, + { + "epoch": 3.978299537531128, + "grad_norm": 0.6256559962928251, + "learning_rate": 3.858994633385016e-09, + "loss": 0.0275, + "step": 33549 + }, + { + "epoch": 3.9784181192932526, + "grad_norm": 0.21401220554262768, + "learning_rate": 3.816936130646154e-09, + "loss": 0.0072, + "step": 33550 + }, + { + "epoch": 3.978536701055378, + "grad_norm": 0.4437383640647404, + "learning_rate": 3.775108062065158e-09, + "loss": 0.0175, + "step": 33551 + }, + { + "epoch": 3.9786552828175026, + "grad_norm": 0.4889882938553762, + "learning_rate": 3.73351042802228e-09, + "loss": 0.0177, + "step": 33552 + }, + { + "epoch": 3.978773864579628, + "grad_norm": 0.3288044462081121, + "learning_rate": 3.692143228903322e-09, + "loss": 0.0133, + "step": 33553 + }, + { + "epoch": 3.9788924463417525, + "grad_norm": 0.39509663573677334, + "learning_rate": 3.651006465094087e-09, + "loss": 0.0172, + "step": 33554 + }, + { + "epoch": 3.9790110281038777, + "grad_norm": 0.6891727835545239, + "learning_rate": 3.6101001369664987e-09, + "loss": 0.0323, + "step": 33555 + }, + { + "epoch": 3.9791296098660025, + "grad_norm": 0.5270520846672859, + "learning_rate": 3.569424244903585e-09, + "loss": 0.0165, + "step": 33556 + }, + { + "epoch": 3.9792481916281277, + "grad_norm": 0.4352838776821903, + "learning_rate": 3.528978789277271e-09, + "loss": 0.019, + "step": 33557 + }, + { + "epoch": 3.9793667733902525, + "grad_norm": 0.44895081433370093, + "learning_rate": 3.48876377045948e-09, + "loss": 0.0207, + "step": 33558 + }, + { + "epoch": 3.9794853551523777, + "grad_norm": 0.6622197918860289, + "learning_rate": 3.448779188822138e-09, + "loss": 0.0282, + "step": 33559 + }, + { + "epoch": 3.9796039369145024, + "grad_norm": 0.5170604407490137, + "learning_rate": 3.4090250447371687e-09, + "loss": 0.0307, + "step": 33560 + }, + { + "epoch": 3.9797225186766276, + "grad_norm": 0.3476849398124784, + "learning_rate": 3.369501338568171e-09, + "loss": 0.0114, + "step": 33561 + }, + { + "epoch": 3.9798411004387524, + "grad_norm": 0.4950655189184398, + "learning_rate": 3.3302080706759663e-09, + "loss": 0.029, + "step": 33562 + }, + { + "epoch": 3.9799596822008776, + "grad_norm": 0.5060411297041008, + "learning_rate": 3.2911452414297052e-09, + "loss": 0.0268, + "step": 33563 + }, + { + "epoch": 3.9800782639630023, + "grad_norm": 0.4924114267459421, + "learning_rate": 3.252312851184658e-09, + "loss": 0.0196, + "step": 33564 + }, + { + "epoch": 3.9801968457251276, + "grad_norm": 0.3647619323292598, + "learning_rate": 3.213710900298872e-09, + "loss": 0.018, + "step": 33565 + }, + { + "epoch": 3.9803154274872523, + "grad_norm": 0.5005973391968552, + "learning_rate": 3.1753393891331696e-09, + "loss": 0.0156, + "step": 33566 + }, + { + "epoch": 3.9804340092493775, + "grad_norm": 0.6908194805821141, + "learning_rate": 3.1371983180344955e-09, + "loss": 0.044, + "step": 33567 + }, + { + "epoch": 3.9805525910115023, + "grad_norm": 0.42267651831988623, + "learning_rate": 3.099287687358121e-09, + "loss": 0.0185, + "step": 33568 + }, + { + "epoch": 3.9806711727736275, + "grad_norm": 0.6917653617703248, + "learning_rate": 3.0616074974537666e-09, + "loss": 0.0425, + "step": 33569 + }, + { + "epoch": 3.9807897545357527, + "grad_norm": 0.47287206111628477, + "learning_rate": 3.024157748668377e-09, + "loss": 0.0249, + "step": 33570 + }, + { + "epoch": 3.9809083362978774, + "grad_norm": 0.6359028996605481, + "learning_rate": 2.986938441346121e-09, + "loss": 0.0259, + "step": 33571 + }, + { + "epoch": 3.981026918060002, + "grad_norm": 0.5874908975049339, + "learning_rate": 2.9499495758339436e-09, + "loss": 0.0275, + "step": 33572 + }, + { + "epoch": 3.9811454998221274, + "grad_norm": 0.5312154629453262, + "learning_rate": 2.9131911524649113e-09, + "loss": 0.0199, + "step": 33573 + }, + { + "epoch": 3.9812640815842526, + "grad_norm": 0.7195741645663762, + "learning_rate": 2.8766631715859692e-09, + "loss": 0.0307, + "step": 33574 + }, + { + "epoch": 3.9813826633463774, + "grad_norm": 0.6903524270981802, + "learning_rate": 2.8403656335301843e-09, + "loss": 0.0331, + "step": 33575 + }, + { + "epoch": 3.981501245108502, + "grad_norm": 0.6260028250010877, + "learning_rate": 2.804298538633399e-09, + "loss": 0.0441, + "step": 33576 + }, + { + "epoch": 3.9816198268706273, + "grad_norm": 0.5577637996397548, + "learning_rate": 2.768461887225904e-09, + "loss": 0.0298, + "step": 33577 + }, + { + "epoch": 3.9817384086327525, + "grad_norm": 0.5934141113236776, + "learning_rate": 2.732855679640767e-09, + "loss": 0.0147, + "step": 33578 + }, + { + "epoch": 3.9818569903948773, + "grad_norm": 0.3295794638607694, + "learning_rate": 2.6974799162027274e-09, + "loss": 0.0166, + "step": 33579 + }, + { + "epoch": 3.981975572157002, + "grad_norm": 0.6702120226997245, + "learning_rate": 2.6623345972420777e-09, + "loss": 0.0205, + "step": 33580 + }, + { + "epoch": 3.9820941539191272, + "grad_norm": 0.6181039262004118, + "learning_rate": 2.6274197230807817e-09, + "loss": 0.024, + "step": 33581 + }, + { + "epoch": 3.9822127356812524, + "grad_norm": 0.33609809030941484, + "learning_rate": 2.5927352940408044e-09, + "loss": 0.0188, + "step": 33582 + }, + { + "epoch": 3.982331317443377, + "grad_norm": 0.35004675308028804, + "learning_rate": 2.5582813104441106e-09, + "loss": 0.0158, + "step": 33583 + }, + { + "epoch": 3.982449899205502, + "grad_norm": 0.5200446293059806, + "learning_rate": 2.5240577726015623e-09, + "loss": 0.0226, + "step": 33584 + }, + { + "epoch": 3.982568480967627, + "grad_norm": 0.6375770020836394, + "learning_rate": 2.4900646808379e-09, + "loss": 0.0281, + "step": 33585 + }, + { + "epoch": 3.9826870627297524, + "grad_norm": 0.6545281818846889, + "learning_rate": 2.4563020354584354e-09, + "loss": 0.0273, + "step": 33586 + }, + { + "epoch": 3.982805644491877, + "grad_norm": 0.6045810231284627, + "learning_rate": 2.4227698367795816e-09, + "loss": 0.0167, + "step": 33587 + }, + { + "epoch": 3.982924226254002, + "grad_norm": 0.33337380086352114, + "learning_rate": 2.3894680851094253e-09, + "loss": 0.0129, + "step": 33588 + }, + { + "epoch": 3.983042808016127, + "grad_norm": 0.4839483059221931, + "learning_rate": 2.356396780753278e-09, + "loss": 0.0207, + "step": 33589 + }, + { + "epoch": 3.9831613897782523, + "grad_norm": 0.3518366377939293, + "learning_rate": 2.3235559240164518e-09, + "loss": 0.0187, + "step": 33590 + }, + { + "epoch": 3.983279971540377, + "grad_norm": 0.6627186557306701, + "learning_rate": 2.2909455152014816e-09, + "loss": 0.0348, + "step": 33591 + }, + { + "epoch": 3.983398553302502, + "grad_norm": 0.43597019598432496, + "learning_rate": 2.258565554610903e-09, + "loss": 0.0157, + "step": 33592 + }, + { + "epoch": 3.983517135064627, + "grad_norm": 0.46387520875273175, + "learning_rate": 2.2264160425417014e-09, + "loss": 0.0215, + "step": 33593 + }, + { + "epoch": 3.983635716826752, + "grad_norm": 0.32374978376330643, + "learning_rate": 2.194496979290861e-09, + "loss": 0.0147, + "step": 33594 + }, + { + "epoch": 3.983754298588877, + "grad_norm": 0.4580560620995817, + "learning_rate": 2.162808365152591e-09, + "loss": 0.0178, + "step": 33595 + }, + { + "epoch": 3.983872880351002, + "grad_norm": 0.5728562407092934, + "learning_rate": 2.1313502004211004e-09, + "loss": 0.0344, + "step": 33596 + }, + { + "epoch": 3.983991462113127, + "grad_norm": 0.44290450154769523, + "learning_rate": 2.100122485382272e-09, + "loss": 0.0098, + "step": 33597 + }, + { + "epoch": 3.984110043875252, + "grad_norm": 0.4756775893387092, + "learning_rate": 2.0691252203247636e-09, + "loss": 0.0241, + "step": 33598 + }, + { + "epoch": 3.984228625637377, + "grad_norm": 0.31084128447007014, + "learning_rate": 2.038358405537233e-09, + "loss": 0.013, + "step": 33599 + }, + { + "epoch": 3.984347207399502, + "grad_norm": 0.5233105380025254, + "learning_rate": 2.0078220413000113e-09, + "loss": 0.0242, + "step": 33600 + }, + { + "epoch": 3.984465789161627, + "grad_norm": 0.6598965392024491, + "learning_rate": 1.9775161278962063e-09, + "loss": 0.0239, + "step": 33601 + }, + { + "epoch": 3.984584370923752, + "grad_norm": 0.4526226754095667, + "learning_rate": 1.9474406656061485e-09, + "loss": 0.0207, + "step": 33602 + }, + { + "epoch": 3.984702952685877, + "grad_norm": 0.565072330449816, + "learning_rate": 1.917595654704618e-09, + "loss": 0.0268, + "step": 33603 + }, + { + "epoch": 3.984821534448002, + "grad_norm": 0.3016815861020026, + "learning_rate": 1.8879810954691714e-09, + "loss": 0.0129, + "step": 33604 + }, + { + "epoch": 3.984940116210127, + "grad_norm": 0.5452603531605277, + "learning_rate": 1.8585969881718124e-09, + "loss": 0.0257, + "step": 33605 + }, + { + "epoch": 3.985058697972252, + "grad_norm": 0.7591945727810463, + "learning_rate": 1.8294433330845462e-09, + "loss": 0.0309, + "step": 33606 + }, + { + "epoch": 3.9851772797343767, + "grad_norm": 0.4452983913755919, + "learning_rate": 1.800520130473826e-09, + "loss": 0.0147, + "step": 33607 + }, + { + "epoch": 3.985295861496502, + "grad_norm": 0.3287115383582657, + "learning_rate": 1.7718273806061058e-09, + "loss": 0.0153, + "step": 33608 + }, + { + "epoch": 3.9854144432586267, + "grad_norm": 0.46751824992008906, + "learning_rate": 1.7433650837478389e-09, + "loss": 0.0241, + "step": 33609 + }, + { + "epoch": 3.985533025020752, + "grad_norm": 0.40866564282025025, + "learning_rate": 1.715133240162703e-09, + "loss": 0.0151, + "step": 33610 + }, + { + "epoch": 3.9856516067828767, + "grad_norm": 0.5052760960742709, + "learning_rate": 1.687131850108825e-09, + "loss": 0.0171, + "step": 33611 + }, + { + "epoch": 3.985770188545002, + "grad_norm": 0.6417656309723662, + "learning_rate": 1.6593609138415567e-09, + "loss": 0.0292, + "step": 33612 + }, + { + "epoch": 3.9858887703071266, + "grad_norm": 0.6367968319451515, + "learning_rate": 1.6318204316245756e-09, + "loss": 0.0268, + "step": 33613 + }, + { + "epoch": 3.986007352069252, + "grad_norm": 0.4157680008014895, + "learning_rate": 1.6045104037049064e-09, + "loss": 0.0144, + "step": 33614 + }, + { + "epoch": 3.9861259338313766, + "grad_norm": 0.4579044353303659, + "learning_rate": 1.5774308303351248e-09, + "loss": 0.0223, + "step": 33615 + }, + { + "epoch": 3.986244515593502, + "grad_norm": 0.5137368517214654, + "learning_rate": 1.5505817117678068e-09, + "loss": 0.0255, + "step": 33616 + }, + { + "epoch": 3.9863630973556266, + "grad_norm": 0.8412525988533851, + "learning_rate": 1.5239630482499767e-09, + "loss": 0.0331, + "step": 33617 + }, + { + "epoch": 3.9864816791177518, + "grad_norm": 0.9947469518663493, + "learning_rate": 1.4975748400258838e-09, + "loss": 0.0361, + "step": 33618 + }, + { + "epoch": 3.9866002608798765, + "grad_norm": 0.5169840031998991, + "learning_rate": 1.4714170873370016e-09, + "loss": 0.0223, + "step": 33619 + }, + { + "epoch": 3.9867188426420017, + "grad_norm": 0.8460715577408767, + "learning_rate": 1.4454897904275788e-09, + "loss": 0.05, + "step": 33620 + }, + { + "epoch": 3.9868374244041265, + "grad_norm": 0.7896949309309669, + "learning_rate": 1.4197929495363138e-09, + "loss": 0.0268, + "step": 33621 + }, + { + "epoch": 3.9869560061662517, + "grad_norm": 0.4186136021737811, + "learning_rate": 1.3943265648991288e-09, + "loss": 0.0234, + "step": 33622 + }, + { + "epoch": 3.987074587928377, + "grad_norm": 0.5775343123184404, + "learning_rate": 1.3690906367491707e-09, + "loss": 0.0242, + "step": 33623 + }, + { + "epoch": 3.9871931696905016, + "grad_norm": 0.5487106579676693, + "learning_rate": 1.344085165322362e-09, + "loss": 0.0288, + "step": 33624 + }, + { + "epoch": 3.9873117514526264, + "grad_norm": 0.5498563462698809, + "learning_rate": 1.319310150846298e-09, + "loss": 0.0226, + "step": 33625 + }, + { + "epoch": 3.9874303332147516, + "grad_norm": 0.5229337959761348, + "learning_rate": 1.2947655935513503e-09, + "loss": 0.0294, + "step": 33626 + }, + { + "epoch": 3.987548914976877, + "grad_norm": 0.531239254144231, + "learning_rate": 1.2704514936651146e-09, + "loss": 0.0248, + "step": 33627 + }, + { + "epoch": 3.9876674967390016, + "grad_norm": 0.527544705445687, + "learning_rate": 1.24636785140686e-09, + "loss": 0.0184, + "step": 33628 + }, + { + "epoch": 3.9877860785011263, + "grad_norm": 0.5113854545253814, + "learning_rate": 1.222514667004182e-09, + "loss": 0.0205, + "step": 33629 + }, + { + "epoch": 3.9879046602632515, + "grad_norm": 0.3792287381961478, + "learning_rate": 1.1988919406735744e-09, + "loss": 0.0173, + "step": 33630 + }, + { + "epoch": 3.9880232420253767, + "grad_norm": 0.6965342148912004, + "learning_rate": 1.1754996726343059e-09, + "loss": 0.0319, + "step": 33631 + }, + { + "epoch": 3.9881418237875015, + "grad_norm": 0.47333366194580706, + "learning_rate": 1.1523378631000946e-09, + "loss": 0.0256, + "step": 33632 + }, + { + "epoch": 3.9882604055496262, + "grad_norm": 0.5286099910429098, + "learning_rate": 1.1294065122846587e-09, + "loss": 0.0263, + "step": 33633 + }, + { + "epoch": 3.9883789873117514, + "grad_norm": 0.5331980659173664, + "learning_rate": 1.1067056204017156e-09, + "loss": 0.0212, + "step": 33634 + }, + { + "epoch": 3.9884975690738766, + "grad_norm": 0.7258285959128449, + "learning_rate": 1.0842351876594325e-09, + "loss": 0.0391, + "step": 33635 + }, + { + "epoch": 3.9886161508360014, + "grad_norm": 0.5409661246808224, + "learning_rate": 1.0619952142659762e-09, + "loss": 0.0208, + "step": 33636 + }, + { + "epoch": 3.988734732598126, + "grad_norm": 0.3002689352670013, + "learning_rate": 1.0399857004239622e-09, + "loss": 0.0181, + "step": 33637 + }, + { + "epoch": 3.9888533143602514, + "grad_norm": 0.3966165143311295, + "learning_rate": 1.0182066463360063e-09, + "loss": 0.0224, + "step": 33638 + }, + { + "epoch": 3.9889718961223766, + "grad_norm": 0.3316309890639357, + "learning_rate": 9.966580522074997e-10, + "loss": 0.0171, + "step": 33639 + }, + { + "epoch": 3.9890904778845013, + "grad_norm": 0.5212634991875398, + "learning_rate": 9.753399182327316e-10, + "loss": 0.0264, + "step": 33640 + }, + { + "epoch": 3.989209059646626, + "grad_norm": 0.6159713288546432, + "learning_rate": 9.542522446087666e-10, + "loss": 0.0306, + "step": 33641 + }, + { + "epoch": 3.9893276414087513, + "grad_norm": 0.4763141011965461, + "learning_rate": 9.33395031532669e-10, + "loss": 0.0212, + "step": 33642 + }, + { + "epoch": 3.9894462231708765, + "grad_norm": 0.42080754682203575, + "learning_rate": 9.127682791931769e-10, + "loss": 0.0202, + "step": 33643 + }, + { + "epoch": 3.9895648049330013, + "grad_norm": 0.5330000857971132, + "learning_rate": 8.923719877818038e-10, + "loss": 0.0297, + "step": 33644 + }, + { + "epoch": 3.9896833866951265, + "grad_norm": 0.7016440956815122, + "learning_rate": 8.722061574872875e-10, + "loss": 0.0268, + "step": 33645 + }, + { + "epoch": 3.989801968457251, + "grad_norm": 0.4319781576291787, + "learning_rate": 8.522707884955905e-10, + "loss": 0.0167, + "step": 33646 + }, + { + "epoch": 3.9899205502193764, + "grad_norm": 0.45886129118860536, + "learning_rate": 8.325658809871239e-10, + "loss": 0.022, + "step": 33647 + }, + { + "epoch": 3.990039131981501, + "grad_norm": 0.6130258972948387, + "learning_rate": 8.130914351506258e-10, + "loss": 0.0327, + "step": 33648 + }, + { + "epoch": 3.9901577137436264, + "grad_norm": 0.6642057813490203, + "learning_rate": 7.938474511581806e-10, + "loss": 0.0301, + "step": 33649 + }, + { + "epoch": 3.990276295505751, + "grad_norm": 0.5632778825455299, + "learning_rate": 7.748339291929751e-10, + "loss": 0.0176, + "step": 33650 + }, + { + "epoch": 3.9903948772678763, + "grad_norm": 0.3173727431114688, + "learning_rate": 7.560508694243184e-10, + "loss": 0.0145, + "step": 33651 + }, + { + "epoch": 3.990513459030001, + "grad_norm": 0.4741272719130685, + "learning_rate": 7.374982720326218e-10, + "loss": 0.0263, + "step": 33652 + }, + { + "epoch": 3.9906320407921263, + "grad_norm": 0.5898058295732699, + "learning_rate": 7.19176137181643e-10, + "loss": 0.0241, + "step": 33653 + }, + { + "epoch": 3.990750622554251, + "grad_norm": 0.5921164658781082, + "learning_rate": 7.010844650462422e-10, + "loss": 0.024, + "step": 33654 + }, + { + "epoch": 3.9908692043163763, + "grad_norm": 0.47323120946602737, + "learning_rate": 6.832232557901775e-10, + "loss": 0.0229, + "step": 33655 + }, + { + "epoch": 3.990987786078501, + "grad_norm": 0.5588687033202039, + "learning_rate": 6.655925095772065e-10, + "loss": 0.0218, + "step": 33656 + }, + { + "epoch": 3.9911063678406262, + "grad_norm": 0.5292311796999236, + "learning_rate": 6.481922265738627e-10, + "loss": 0.0229, + "step": 33657 + }, + { + "epoch": 3.991224949602751, + "grad_norm": 0.3239922876370491, + "learning_rate": 6.310224069355774e-10, + "loss": 0.0128, + "step": 33658 + }, + { + "epoch": 3.991343531364876, + "grad_norm": 0.26040638973292135, + "learning_rate": 6.14083050823333e-10, + "loss": 0.0087, + "step": 33659 + }, + { + "epoch": 3.991462113127001, + "grad_norm": 0.4032213186930263, + "learning_rate": 5.973741583953363e-10, + "loss": 0.0204, + "step": 33660 + }, + { + "epoch": 3.991580694889126, + "grad_norm": 0.43479049403119147, + "learning_rate": 5.808957298014672e-10, + "loss": 0.0212, + "step": 33661 + }, + { + "epoch": 3.991699276651251, + "grad_norm": 0.5175378863483658, + "learning_rate": 5.646477651943815e-10, + "loss": 0.0208, + "step": 33662 + }, + { + "epoch": 3.991817858413376, + "grad_norm": 0.591747172910268, + "learning_rate": 5.486302647267349e-10, + "loss": 0.0253, + "step": 33663 + }, + { + "epoch": 3.991936440175501, + "grad_norm": 0.6020362868040511, + "learning_rate": 5.328432285428564e-10, + "loss": 0.0309, + "step": 33664 + }, + { + "epoch": 3.992055021937626, + "grad_norm": 0.6471834547628356, + "learning_rate": 5.172866567898505e-10, + "loss": 0.0298, + "step": 33665 + }, + { + "epoch": 3.992173603699751, + "grad_norm": 0.3685518768971127, + "learning_rate": 5.01960549612046e-10, + "loss": 0.0112, + "step": 33666 + }, + { + "epoch": 3.992292185461876, + "grad_norm": 0.6830127206172448, + "learning_rate": 4.868649071509968e-10, + "loss": 0.028, + "step": 33667 + }, + { + "epoch": 3.992410767224001, + "grad_norm": 0.3838299055873761, + "learning_rate": 4.719997295427047e-10, + "loss": 0.0167, + "step": 33668 + }, + { + "epoch": 3.992529348986126, + "grad_norm": 0.4309503917131873, + "learning_rate": 4.57365016925948e-10, + "loss": 0.0141, + "step": 33669 + }, + { + "epoch": 3.9926479307482508, + "grad_norm": 0.35569858578517843, + "learning_rate": 4.4296076943672883e-10, + "loss": 0.0161, + "step": 33670 + }, + { + "epoch": 3.992766512510376, + "grad_norm": 0.975119559184954, + "learning_rate": 4.2878698720549837e-10, + "loss": 0.0445, + "step": 33671 + }, + { + "epoch": 3.992885094272501, + "grad_norm": 0.5200923440159667, + "learning_rate": 4.1484367036548344e-10, + "loss": 0.0232, + "step": 33672 + }, + { + "epoch": 3.993003676034626, + "grad_norm": 0.7408761880840793, + "learning_rate": 4.0113081904435966e-10, + "loss": 0.0331, + "step": 33673 + }, + { + "epoch": 3.9931222577967507, + "grad_norm": 0.6745318201928373, + "learning_rate": 3.8764843336702714e-10, + "loss": 0.023, + "step": 33674 + }, + { + "epoch": 3.993240839558876, + "grad_norm": 0.6482523857271871, + "learning_rate": 3.7439651346116153e-10, + "loss": 0.021, + "step": 33675 + }, + { + "epoch": 3.993359421321001, + "grad_norm": 0.6945647982862011, + "learning_rate": 3.613750594461118e-10, + "loss": 0.0241, + "step": 33676 + }, + { + "epoch": 3.993478003083126, + "grad_norm": 0.3678891512442868, + "learning_rate": 3.4858407144122696e-10, + "loss": 0.0141, + "step": 33677 + }, + { + "epoch": 3.9935965848452506, + "grad_norm": 0.4162566622982007, + "learning_rate": 3.360235495658559e-10, + "loss": 0.0183, + "step": 33678 + }, + { + "epoch": 3.993715166607376, + "grad_norm": 0.3048330144341975, + "learning_rate": 3.236934939365721e-10, + "loss": 0.0132, + "step": 33679 + }, + { + "epoch": 3.993833748369501, + "grad_norm": 0.5878183041208995, + "learning_rate": 3.115939046671734e-10, + "loss": 0.027, + "step": 33680 + }, + { + "epoch": 3.9939523301316258, + "grad_norm": 0.6453979075761415, + "learning_rate": 2.997247818659066e-10, + "loss": 0.0285, + "step": 33681 + }, + { + "epoch": 3.9940709118937505, + "grad_norm": 0.3859614514564942, + "learning_rate": 2.8808612564656944e-10, + "loss": 0.0178, + "step": 33682 + }, + { + "epoch": 3.9941894936558757, + "grad_norm": 0.41000320405515134, + "learning_rate": 2.7667793611463324e-10, + "loss": 0.0172, + "step": 33683 + }, + { + "epoch": 3.994308075418001, + "grad_norm": 0.38458064668626896, + "learning_rate": 2.655002133727935e-10, + "loss": 0.0174, + "step": 33684 + }, + { + "epoch": 3.9944266571801257, + "grad_norm": 0.27637193927271314, + "learning_rate": 2.545529575292971e-10, + "loss": 0.0121, + "step": 33685 + }, + { + "epoch": 3.9945452389422504, + "grad_norm": 0.6171368202108135, + "learning_rate": 2.438361686785129e-10, + "loss": 0.0276, + "step": 33686 + }, + { + "epoch": 3.9946638207043756, + "grad_norm": 0.38349654586616105, + "learning_rate": 2.333498469259121e-10, + "loss": 0.0185, + "step": 33687 + }, + { + "epoch": 3.994782402466501, + "grad_norm": 0.41099419894213385, + "learning_rate": 2.2309399236308814e-10, + "loss": 0.0215, + "step": 33688 + }, + { + "epoch": 3.9949009842286256, + "grad_norm": 0.6479957672159053, + "learning_rate": 2.1306860508718552e-10, + "loss": 0.0295, + "step": 33689 + }, + { + "epoch": 3.9950195659907504, + "grad_norm": 0.717555715372708, + "learning_rate": 2.0327368519257317e-10, + "loss": 0.0254, + "step": 33690 + }, + { + "epoch": 3.9951381477528756, + "grad_norm": 0.4121756213290982, + "learning_rate": 1.9370923276251785e-10, + "loss": 0.0181, + "step": 33691 + }, + { + "epoch": 3.9952567295150008, + "grad_norm": 0.8941225572277204, + "learning_rate": 1.8437524789138848e-10, + "loss": 0.0375, + "step": 33692 + }, + { + "epoch": 3.9953753112771255, + "grad_norm": 0.8571961534966666, + "learning_rate": 1.752717306652274e-10, + "loss": 0.038, + "step": 33693 + }, + { + "epoch": 3.9954938930392503, + "grad_norm": 0.6243121199047095, + "learning_rate": 1.6639868116452573e-10, + "loss": 0.0221, + "step": 33694 + }, + { + "epoch": 3.9956124748013755, + "grad_norm": 0.5868070800543955, + "learning_rate": 1.577560994725502e-10, + "loss": 0.0233, + "step": 33695 + }, + { + "epoch": 3.9957310565635007, + "grad_norm": 0.38239393635625263, + "learning_rate": 1.4934398567256757e-10, + "loss": 0.0122, + "step": 33696 + }, + { + "epoch": 3.9958496383256255, + "grad_norm": 0.45868983181876694, + "learning_rate": 1.4116233983396677e-10, + "loss": 0.0188, + "step": 33697 + }, + { + "epoch": 3.9959682200877507, + "grad_norm": 0.529481903235593, + "learning_rate": 1.3321116204001448e-10, + "loss": 0.0195, + "step": 33698 + }, + { + "epoch": 3.9960868018498754, + "grad_norm": 0.4107249088425968, + "learning_rate": 1.2549045236009972e-10, + "loss": 0.0234, + "step": 33699 + }, + { + "epoch": 3.9962053836120006, + "grad_norm": 0.3868706580755921, + "learning_rate": 1.1800021086638692e-10, + "loss": 0.0242, + "step": 33700 + }, + { + "epoch": 3.9963239653741254, + "grad_norm": 0.4070829714326567, + "learning_rate": 1.1074043762826502e-10, + "loss": 0.0169, + "step": 33701 + }, + { + "epoch": 3.9964425471362506, + "grad_norm": 0.7392751042575342, + "learning_rate": 1.0371113271234745e-10, + "loss": 0.0307, + "step": 33702 + }, + { + "epoch": 3.9965611288983753, + "grad_norm": 0.9481558660513774, + "learning_rate": 9.6912296182472e-11, + "loss": 0.0404, + "step": 33703 + }, + { + "epoch": 3.9966797106605005, + "grad_norm": 0.4968466548184237, + "learning_rate": 9.03439281024765e-11, + "loss": 0.0207, + "step": 33704 + }, + { + "epoch": 3.9967982924226253, + "grad_norm": 0.5012704826708081, + "learning_rate": 8.400602853064766e-11, + "loss": 0.0237, + "step": 33705 + }, + { + "epoch": 3.9969168741847505, + "grad_norm": 0.42001221180105486, + "learning_rate": 7.789859753082329e-11, + "loss": 0.0209, + "step": 33706 + }, + { + "epoch": 3.9970354559468753, + "grad_norm": 0.3963288993433118, + "learning_rate": 7.202163515296345e-11, + "loss": 0.0257, + "step": 33707 + }, + { + "epoch": 3.9971540377090005, + "grad_norm": 0.5723230209586793, + "learning_rate": 6.637514145535484e-11, + "loss": 0.0235, + "step": 33708 + }, + { + "epoch": 3.997272619471125, + "grad_norm": 0.4173565135640968, + "learning_rate": 6.095911649073305e-11, + "loss": 0.0172, + "step": 33709 + }, + { + "epoch": 3.9973912012332504, + "grad_norm": 0.5230037641081093, + "learning_rate": 5.5773560303507e-11, + "loss": 0.0245, + "step": 33710 + }, + { + "epoch": 3.997509782995375, + "grad_norm": 0.7697285386886425, + "learning_rate": 5.081847294918785e-11, + "loss": 0.0333, + "step": 33711 + }, + { + "epoch": 3.9976283647575004, + "grad_norm": 0.5488396845705643, + "learning_rate": 4.6093854466633393e-11, + "loss": 0.0277, + "step": 33712 + }, + { + "epoch": 3.997746946519625, + "grad_norm": 0.7904149592883477, + "learning_rate": 4.159970490302811e-11, + "loss": 0.0286, + "step": 33713 + }, + { + "epoch": 3.9978655282817503, + "grad_norm": 0.693615208210422, + "learning_rate": 3.7336024297229823e-11, + "loss": 0.0346, + "step": 33714 + }, + { + "epoch": 3.997984110043875, + "grad_norm": 0.5985554558319839, + "learning_rate": 3.330281269087188e-11, + "loss": 0.0283, + "step": 33715 + }, + { + "epoch": 3.9981026918060003, + "grad_norm": 0.391757914639111, + "learning_rate": 2.9500070122812084e-11, + "loss": 0.0166, + "step": 33716 + }, + { + "epoch": 3.998221273568125, + "grad_norm": 0.6648502242299664, + "learning_rate": 2.592779662358158e-11, + "loss": 0.0282, + "step": 33717 + }, + { + "epoch": 3.9983398553302503, + "grad_norm": 0.6715748802245397, + "learning_rate": 2.258599222926261e-11, + "loss": 0.0262, + "step": 33718 + }, + { + "epoch": 3.998458437092375, + "grad_norm": 0.5747163916174693, + "learning_rate": 1.947465696761075e-11, + "loss": 0.0257, + "step": 33719 + }, + { + "epoch": 3.9985770188545002, + "grad_norm": 0.8131396677848285, + "learning_rate": 1.659379087193269e-11, + "loss": 0.0495, + "step": 33720 + }, + { + "epoch": 3.9986956006166254, + "grad_norm": 0.5331465683482607, + "learning_rate": 1.3943393964432893e-11, + "loss": 0.0226, + "step": 33721 + }, + { + "epoch": 3.99881418237875, + "grad_norm": 0.720182761970104, + "learning_rate": 1.1523466272866933e-11, + "loss": 0.0294, + "step": 33722 + }, + { + "epoch": 3.998932764140875, + "grad_norm": 0.4756599105814044, + "learning_rate": 9.334007816663715e-12, + "loss": 0.0252, + "step": 33723 + }, + { + "epoch": 3.999051345903, + "grad_norm": 0.41386766727788715, + "learning_rate": 7.375018618027695e-12, + "loss": 0.0227, + "step": 33724 + }, + { + "epoch": 3.9991699276651254, + "grad_norm": 0.4749884881976774, + "learning_rate": 5.646498693612223e-12, + "loss": 0.0198, + "step": 33725 + }, + { + "epoch": 3.99928850942725, + "grad_norm": 0.4836314140155837, + "learning_rate": 4.1484480628461996e-12, + "loss": 0.0278, + "step": 33726 + }, + { + "epoch": 3.999407091189375, + "grad_norm": 0.8755373724610497, + "learning_rate": 2.880866734056298e-12, + "loss": 0.0362, + "step": 33727 + }, + { + "epoch": 3.9995256729515, + "grad_norm": 0.6573198795969022, + "learning_rate": 1.843754723895863e-12, + "loss": 0.0369, + "step": 33728 + }, + { + "epoch": 3.9996442547136253, + "grad_norm": 0.3458545455223809, + "learning_rate": 1.0371120379160104e-12, + "loss": 0.0123, + "step": 33729 + }, + { + "epoch": 3.99976283647575, + "grad_norm": 0.2844134928352069, + "learning_rate": 4.609386844434127e-13, + "loss": 0.0112, + "step": 33730 + }, + { + "epoch": 3.999881418237875, + "grad_norm": 0.4253005662856694, + "learning_rate": 1.1523467180474256e-13, + "loss": 0.0191, + "step": 33731 + }, + { + "epoch": 4.0, + "grad_norm": 1.141259252134078, + "learning_rate": 0.0, + "loss": 0.0277, + "step": 33732 + }, + { + "epoch": 4.0, + "step": 33732, + "total_flos": 4540393750798336.0, + "train_loss": 0.26791081111871257, + "train_runtime": 255040.4618, + "train_samples_per_second": 16.929, + "train_steps_per_second": 0.132 + } + ], + "logging_steps": 1.0, + "max_steps": 33732, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4540393750798336.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}