diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,13692 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.990403071017274, + "eval_steps": 500, + "global_step": 1950, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0025591810620601407, + "grad_norm": 5.72460191514523, + "learning_rate": 2.0512820512820514e-07, + "loss": 0.8785, + "step": 1 + }, + { + "epoch": 0.005118362124120281, + "grad_norm": 5.959578433623624, + "learning_rate": 4.102564102564103e-07, + "loss": 0.8678, + "step": 2 + }, + { + "epoch": 0.007677543186180422, + "grad_norm": 5.902957688136316, + "learning_rate": 6.153846153846155e-07, + "loss": 0.8968, + "step": 3 + }, + { + "epoch": 0.010236724248240563, + "grad_norm": 5.4723551383557805, + "learning_rate": 8.205128205128206e-07, + "loss": 0.8418, + "step": 4 + }, + { + "epoch": 0.012795905310300703, + "grad_norm": 5.787912289462209, + "learning_rate": 1.0256410256410257e-06, + "loss": 0.886, + "step": 5 + }, + { + "epoch": 0.015355086372360844, + "grad_norm": 5.557852691465803, + "learning_rate": 1.230769230769231e-06, + "loss": 0.9033, + "step": 6 + }, + { + "epoch": 0.017914267434420986, + "grad_norm": 5.362076082832634, + "learning_rate": 1.4358974358974359e-06, + "loss": 0.8629, + "step": 7 + }, + { + "epoch": 0.020473448496481125, + "grad_norm": 4.370734339586599, + "learning_rate": 1.6410256410256412e-06, + "loss": 0.8157, + "step": 8 + }, + { + "epoch": 0.023032629558541268, + "grad_norm": 4.493266725432695, + "learning_rate": 1.8461538461538465e-06, + "loss": 0.8392, + "step": 9 + }, + { + "epoch": 0.025591810620601407, + "grad_norm": 4.121132813839215, + "learning_rate": 2.0512820512820513e-06, + "loss": 0.8246, + "step": 10 + }, + { + "epoch": 0.02815099168266155, + "grad_norm": 2.4085810485641095, + "learning_rate": 2.2564102564102566e-06, + "loss": 0.8009, + "step": 11 + }, + { + "epoch": 0.030710172744721688, + "grad_norm": 2.3319618373499087, + "learning_rate": 2.461538461538462e-06, + "loss": 0.8189, + "step": 12 + }, + { + "epoch": 0.03326935380678183, + "grad_norm": 2.050198471153189, + "learning_rate": 2.666666666666667e-06, + "loss": 0.7671, + "step": 13 + }, + { + "epoch": 0.03582853486884197, + "grad_norm": 2.5524552964667926, + "learning_rate": 2.8717948717948717e-06, + "loss": 0.7587, + "step": 14 + }, + { + "epoch": 0.03838771593090211, + "grad_norm": 3.44805626463955, + "learning_rate": 3.0769230769230774e-06, + "loss": 0.779, + "step": 15 + }, + { + "epoch": 0.04094689699296225, + "grad_norm": 3.559383059115368, + "learning_rate": 3.2820512820512823e-06, + "loss": 0.758, + "step": 16 + }, + { + "epoch": 0.04350607805502239, + "grad_norm": 3.33809383465494, + "learning_rate": 3.487179487179487e-06, + "loss": 0.7136, + "step": 17 + }, + { + "epoch": 0.046065259117082535, + "grad_norm": 3.2168388473087757, + "learning_rate": 3.692307692307693e-06, + "loss": 0.7369, + "step": 18 + }, + { + "epoch": 0.04862444017914268, + "grad_norm": 2.5872597896051728, + "learning_rate": 3.897435897435898e-06, + "loss": 0.7323, + "step": 19 + }, + { + "epoch": 0.05118362124120281, + "grad_norm": 2.0282374599316957, + "learning_rate": 4.102564102564103e-06, + "loss": 0.6876, + "step": 20 + }, + { + "epoch": 0.053742802303262956, + "grad_norm": 1.5964078137928233, + "learning_rate": 4.307692307692308e-06, + "loss": 0.6678, + "step": 21 + }, + { + "epoch": 0.0563019833653231, + "grad_norm": 1.2909627910012984, + "learning_rate": 4.512820512820513e-06, + "loss": 0.662, + "step": 22 + }, + { + "epoch": 0.05886116442738324, + "grad_norm": 1.3177057124827334, + "learning_rate": 4.717948717948718e-06, + "loss": 0.6594, + "step": 23 + }, + { + "epoch": 0.061420345489443376, + "grad_norm": 1.24942825931957, + "learning_rate": 4.923076923076924e-06, + "loss": 0.626, + "step": 24 + }, + { + "epoch": 0.06397952655150352, + "grad_norm": 1.2673077068864744, + "learning_rate": 5.128205128205128e-06, + "loss": 0.6347, + "step": 25 + }, + { + "epoch": 0.06653870761356366, + "grad_norm": 1.1563656900829429, + "learning_rate": 5.333333333333334e-06, + "loss": 0.6329, + "step": 26 + }, + { + "epoch": 0.0690978886756238, + "grad_norm": 1.044094666100426, + "learning_rate": 5.538461538461539e-06, + "loss": 0.6254, + "step": 27 + }, + { + "epoch": 0.07165706973768395, + "grad_norm": 0.9466038321235274, + "learning_rate": 5.743589743589743e-06, + "loss": 0.6295, + "step": 28 + }, + { + "epoch": 0.07421625079974409, + "grad_norm": 0.7981163236308523, + "learning_rate": 5.948717948717949e-06, + "loss": 0.6155, + "step": 29 + }, + { + "epoch": 0.07677543186180422, + "grad_norm": 0.9968280326546483, + "learning_rate": 6.153846153846155e-06, + "loss": 0.6113, + "step": 30 + }, + { + "epoch": 0.07933461292386436, + "grad_norm": 0.9260190035124614, + "learning_rate": 6.358974358974359e-06, + "loss": 0.636, + "step": 31 + }, + { + "epoch": 0.0818937939859245, + "grad_norm": 0.8477667711908703, + "learning_rate": 6.564102564102565e-06, + "loss": 0.63, + "step": 32 + }, + { + "epoch": 0.08445297504798464, + "grad_norm": 0.6532816121925329, + "learning_rate": 6.76923076923077e-06, + "loss": 0.6108, + "step": 33 + }, + { + "epoch": 0.08701215611004479, + "grad_norm": 0.7821169327010173, + "learning_rate": 6.974358974358974e-06, + "loss": 0.6408, + "step": 34 + }, + { + "epoch": 0.08957133717210493, + "grad_norm": 0.7393409032705888, + "learning_rate": 7.17948717948718e-06, + "loss": 0.5915, + "step": 35 + }, + { + "epoch": 0.09213051823416507, + "grad_norm": 0.6644188521415291, + "learning_rate": 7.384615384615386e-06, + "loss": 0.5961, + "step": 36 + }, + { + "epoch": 0.09468969929622521, + "grad_norm": 0.593116184422468, + "learning_rate": 7.58974358974359e-06, + "loss": 0.5971, + "step": 37 + }, + { + "epoch": 0.09724888035828536, + "grad_norm": 0.6919315440965734, + "learning_rate": 7.794871794871796e-06, + "loss": 0.5725, + "step": 38 + }, + { + "epoch": 0.09980806142034548, + "grad_norm": 0.6959155805001593, + "learning_rate": 8.000000000000001e-06, + "loss": 0.5685, + "step": 39 + }, + { + "epoch": 0.10236724248240563, + "grad_norm": 0.6398643666989815, + "learning_rate": 8.205128205128205e-06, + "loss": 0.6138, + "step": 40 + }, + { + "epoch": 0.10492642354446577, + "grad_norm": 0.6361698523409376, + "learning_rate": 8.410256410256411e-06, + "loss": 0.5849, + "step": 41 + }, + { + "epoch": 0.10748560460652591, + "grad_norm": 0.5303957188864263, + "learning_rate": 8.615384615384617e-06, + "loss": 0.5533, + "step": 42 + }, + { + "epoch": 0.11004478566858605, + "grad_norm": 0.5370326395081426, + "learning_rate": 8.820512820512821e-06, + "loss": 0.5833, + "step": 43 + }, + { + "epoch": 0.1126039667306462, + "grad_norm": 0.590660924049006, + "learning_rate": 9.025641025641027e-06, + "loss": 0.5888, + "step": 44 + }, + { + "epoch": 0.11516314779270634, + "grad_norm": 0.540686200850521, + "learning_rate": 9.230769230769232e-06, + "loss": 0.5624, + "step": 45 + }, + { + "epoch": 0.11772232885476648, + "grad_norm": 0.5777571666796689, + "learning_rate": 9.435897435897436e-06, + "loss": 0.5851, + "step": 46 + }, + { + "epoch": 0.12028150991682661, + "grad_norm": 0.537318145066867, + "learning_rate": 9.641025641025642e-06, + "loss": 0.5644, + "step": 47 + }, + { + "epoch": 0.12284069097888675, + "grad_norm": 0.5211947061981134, + "learning_rate": 9.846153846153848e-06, + "loss": 0.5591, + "step": 48 + }, + { + "epoch": 0.1253998720409469, + "grad_norm": 0.5397404148687415, + "learning_rate": 1.0051282051282052e-05, + "loss": 0.5843, + "step": 49 + }, + { + "epoch": 0.12795905310300704, + "grad_norm": 0.5297926804183296, + "learning_rate": 1.0256410256410256e-05, + "loss": 0.5277, + "step": 50 + }, + { + "epoch": 0.13051823416506717, + "grad_norm": 0.605652388150248, + "learning_rate": 1.0461538461538463e-05, + "loss": 0.5724, + "step": 51 + }, + { + "epoch": 0.13307741522712732, + "grad_norm": 0.45885002080359344, + "learning_rate": 1.0666666666666667e-05, + "loss": 0.5381, + "step": 52 + }, + { + "epoch": 0.13563659628918745, + "grad_norm": 0.5930020802013372, + "learning_rate": 1.0871794871794871e-05, + "loss": 0.6045, + "step": 53 + }, + { + "epoch": 0.1381957773512476, + "grad_norm": 0.5808129528368039, + "learning_rate": 1.1076923076923079e-05, + "loss": 0.5222, + "step": 54 + }, + { + "epoch": 0.14075495841330773, + "grad_norm": 0.5154128262574531, + "learning_rate": 1.1282051282051283e-05, + "loss": 0.5578, + "step": 55 + }, + { + "epoch": 0.1433141394753679, + "grad_norm": 0.5390655268219918, + "learning_rate": 1.1487179487179487e-05, + "loss": 0.5436, + "step": 56 + }, + { + "epoch": 0.14587332053742802, + "grad_norm": 0.5586414645653933, + "learning_rate": 1.1692307692307694e-05, + "loss": 0.5353, + "step": 57 + }, + { + "epoch": 0.14843250159948818, + "grad_norm": 0.6534105047151474, + "learning_rate": 1.1897435897435898e-05, + "loss": 0.5392, + "step": 58 + }, + { + "epoch": 0.1509916826615483, + "grad_norm": 0.5945111461514314, + "learning_rate": 1.2102564102564102e-05, + "loss": 0.5542, + "step": 59 + }, + { + "epoch": 0.15355086372360843, + "grad_norm": 0.6505380466948517, + "learning_rate": 1.230769230769231e-05, + "loss": 0.5252, + "step": 60 + }, + { + "epoch": 0.1561100447856686, + "grad_norm": 0.6510336772233184, + "learning_rate": 1.2512820512820514e-05, + "loss": 0.5683, + "step": 61 + }, + { + "epoch": 0.15866922584772872, + "grad_norm": 0.5509745528884461, + "learning_rate": 1.2717948717948718e-05, + "loss": 0.5205, + "step": 62 + }, + { + "epoch": 0.16122840690978887, + "grad_norm": 0.5917102526960739, + "learning_rate": 1.2923076923076925e-05, + "loss": 0.5603, + "step": 63 + }, + { + "epoch": 0.163787587971849, + "grad_norm": 0.560110790537656, + "learning_rate": 1.312820512820513e-05, + "loss": 0.545, + "step": 64 + }, + { + "epoch": 0.16634676903390916, + "grad_norm": 0.5678363411921677, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.5287, + "step": 65 + }, + { + "epoch": 0.1689059500959693, + "grad_norm": 0.5819806020456917, + "learning_rate": 1.353846153846154e-05, + "loss": 0.5535, + "step": 66 + }, + { + "epoch": 0.17146513115802944, + "grad_norm": 0.5558896303005907, + "learning_rate": 1.3743589743589745e-05, + "loss": 0.5648, + "step": 67 + }, + { + "epoch": 0.17402431222008957, + "grad_norm": 0.5818297224628268, + "learning_rate": 1.3948717948717949e-05, + "loss": 0.545, + "step": 68 + }, + { + "epoch": 0.1765834932821497, + "grad_norm": 0.5604385516101225, + "learning_rate": 1.4153846153846156e-05, + "loss": 0.5625, + "step": 69 + }, + { + "epoch": 0.17914267434420986, + "grad_norm": 0.5262255043539227, + "learning_rate": 1.435897435897436e-05, + "loss": 0.553, + "step": 70 + }, + { + "epoch": 0.18170185540626999, + "grad_norm": 0.5449392960823104, + "learning_rate": 1.4564102564102564e-05, + "loss": 0.532, + "step": 71 + }, + { + "epoch": 0.18426103646833014, + "grad_norm": 0.5757069226256301, + "learning_rate": 1.4769230769230772e-05, + "loss": 0.538, + "step": 72 + }, + { + "epoch": 0.18682021753039027, + "grad_norm": 0.5241911795419738, + "learning_rate": 1.4974358974358976e-05, + "loss": 0.537, + "step": 73 + }, + { + "epoch": 0.18937939859245043, + "grad_norm": 0.4755835874750557, + "learning_rate": 1.517948717948718e-05, + "loss": 0.5015, + "step": 74 + }, + { + "epoch": 0.19193857965451055, + "grad_norm": 0.5816260566225624, + "learning_rate": 1.5384615384615387e-05, + "loss": 0.5317, + "step": 75 + }, + { + "epoch": 0.1944977607165707, + "grad_norm": 0.5324895484169254, + "learning_rate": 1.558974358974359e-05, + "loss": 0.503, + "step": 76 + }, + { + "epoch": 0.19705694177863084, + "grad_norm": 0.5873638181887759, + "learning_rate": 1.5794871794871795e-05, + "loss": 0.5571, + "step": 77 + }, + { + "epoch": 0.19961612284069097, + "grad_norm": 0.5466165214856151, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.5177, + "step": 78 + }, + { + "epoch": 0.20217530390275112, + "grad_norm": 0.6209989234883905, + "learning_rate": 1.6205128205128207e-05, + "loss": 0.5268, + "step": 79 + }, + { + "epoch": 0.20473448496481125, + "grad_norm": 0.5479973413858492, + "learning_rate": 1.641025641025641e-05, + "loss": 0.5096, + "step": 80 + }, + { + "epoch": 0.2072936660268714, + "grad_norm": 0.7512485115452843, + "learning_rate": 1.6615384615384618e-05, + "loss": 0.5398, + "step": 81 + }, + { + "epoch": 0.20985284708893154, + "grad_norm": 0.5988795561569944, + "learning_rate": 1.6820512820512822e-05, + "loss": 0.5054, + "step": 82 + }, + { + "epoch": 0.2124120281509917, + "grad_norm": 0.6349565283068188, + "learning_rate": 1.7025641025641026e-05, + "loss": 0.5301, + "step": 83 + }, + { + "epoch": 0.21497120921305182, + "grad_norm": 0.7482046004578073, + "learning_rate": 1.7230769230769234e-05, + "loss": 0.5451, + "step": 84 + }, + { + "epoch": 0.21753039027511195, + "grad_norm": 0.5452089019203088, + "learning_rate": 1.7435897435897438e-05, + "loss": 0.537, + "step": 85 + }, + { + "epoch": 0.2200895713371721, + "grad_norm": 0.6062443857702727, + "learning_rate": 1.7641025641025642e-05, + "loss": 0.5057, + "step": 86 + }, + { + "epoch": 0.22264875239923224, + "grad_norm": 0.6133096727031904, + "learning_rate": 1.784615384615385e-05, + "loss": 0.549, + "step": 87 + }, + { + "epoch": 0.2252079334612924, + "grad_norm": 0.6734829746990577, + "learning_rate": 1.8051282051282053e-05, + "loss": 0.5549, + "step": 88 + }, + { + "epoch": 0.22776711452335252, + "grad_norm": 0.5692319701895174, + "learning_rate": 1.8256410256410257e-05, + "loss": 0.5053, + "step": 89 + }, + { + "epoch": 0.23032629558541268, + "grad_norm": 0.6551305606242741, + "learning_rate": 1.8461538461538465e-05, + "loss": 0.5451, + "step": 90 + }, + { + "epoch": 0.2328854766474728, + "grad_norm": 0.5869184912875696, + "learning_rate": 1.866666666666667e-05, + "loss": 0.5573, + "step": 91 + }, + { + "epoch": 0.23544465770953296, + "grad_norm": 0.6175475389576918, + "learning_rate": 1.8871794871794873e-05, + "loss": 0.5239, + "step": 92 + }, + { + "epoch": 0.2380038387715931, + "grad_norm": 0.6923526622405791, + "learning_rate": 1.907692307692308e-05, + "loss": 0.5178, + "step": 93 + }, + { + "epoch": 0.24056301983365322, + "grad_norm": 0.6169855947769446, + "learning_rate": 1.9282051282051284e-05, + "loss": 0.5583, + "step": 94 + }, + { + "epoch": 0.24312220089571338, + "grad_norm": 0.8905504059133514, + "learning_rate": 1.9487179487179488e-05, + "loss": 0.5577, + "step": 95 + }, + { + "epoch": 0.2456813819577735, + "grad_norm": 0.7206999368653747, + "learning_rate": 1.9692307692307696e-05, + "loss": 0.5248, + "step": 96 + }, + { + "epoch": 0.24824056301983366, + "grad_norm": 0.5689479892809097, + "learning_rate": 1.98974358974359e-05, + "loss": 0.5143, + "step": 97 + }, + { + "epoch": 0.2507997440818938, + "grad_norm": 0.7120200186242683, + "learning_rate": 2.0102564102564104e-05, + "loss": 0.5363, + "step": 98 + }, + { + "epoch": 0.2533589251439539, + "grad_norm": 0.5649172258139606, + "learning_rate": 2.0307692307692308e-05, + "loss": 0.5232, + "step": 99 + }, + { + "epoch": 0.2559181062060141, + "grad_norm": 0.8530948421377756, + "learning_rate": 2.0512820512820512e-05, + "loss": 0.5127, + "step": 100 + }, + { + "epoch": 0.25847728726807423, + "grad_norm": 0.7220421419640349, + "learning_rate": 2.0717948717948723e-05, + "loss": 0.5472, + "step": 101 + }, + { + "epoch": 0.26103646833013433, + "grad_norm": 0.7179821332235292, + "learning_rate": 2.0923076923076927e-05, + "loss": 0.4748, + "step": 102 + }, + { + "epoch": 0.2635956493921945, + "grad_norm": 0.5969399443763903, + "learning_rate": 2.112820512820513e-05, + "loss": 0.4869, + "step": 103 + }, + { + "epoch": 0.26615483045425464, + "grad_norm": 0.7652225150209184, + "learning_rate": 2.1333333333333335e-05, + "loss": 0.5048, + "step": 104 + }, + { + "epoch": 0.2687140115163148, + "grad_norm": 0.7026938678210959, + "learning_rate": 2.153846153846154e-05, + "loss": 0.5351, + "step": 105 + }, + { + "epoch": 0.2712731925783749, + "grad_norm": 0.6625081293533241, + "learning_rate": 2.1743589743589743e-05, + "loss": 0.5071, + "step": 106 + }, + { + "epoch": 0.27383237364043506, + "grad_norm": 0.7338103048357757, + "learning_rate": 2.1948717948717954e-05, + "loss": 0.5214, + "step": 107 + }, + { + "epoch": 0.2763915547024952, + "grad_norm": 0.7157427738707126, + "learning_rate": 2.2153846153846158e-05, + "loss": 0.521, + "step": 108 + }, + { + "epoch": 0.27895073576455537, + "grad_norm": 0.8391389612996835, + "learning_rate": 2.235897435897436e-05, + "loss": 0.5114, + "step": 109 + }, + { + "epoch": 0.28150991682661547, + "grad_norm": 0.6739896722237592, + "learning_rate": 2.2564102564102566e-05, + "loss": 0.4607, + "step": 110 + }, + { + "epoch": 0.2840690978886756, + "grad_norm": 0.7325968758566463, + "learning_rate": 2.276923076923077e-05, + "loss": 0.5114, + "step": 111 + }, + { + "epoch": 0.2866282789507358, + "grad_norm": 0.9192765458484284, + "learning_rate": 2.2974358974358974e-05, + "loss": 0.5164, + "step": 112 + }, + { + "epoch": 0.2891874600127959, + "grad_norm": 0.7638911037686114, + "learning_rate": 2.3179487179487184e-05, + "loss": 0.5056, + "step": 113 + }, + { + "epoch": 0.29174664107485604, + "grad_norm": 0.7754803757011083, + "learning_rate": 2.338461538461539e-05, + "loss": 0.5137, + "step": 114 + }, + { + "epoch": 0.2943058221369162, + "grad_norm": 0.7313220696083259, + "learning_rate": 2.3589743589743593e-05, + "loss": 0.5171, + "step": 115 + }, + { + "epoch": 0.29686500319897635, + "grad_norm": 0.8944240636016003, + "learning_rate": 2.3794871794871797e-05, + "loss": 0.5631, + "step": 116 + }, + { + "epoch": 0.29942418426103645, + "grad_norm": 0.827344957741263, + "learning_rate": 2.4e-05, + "loss": 0.5329, + "step": 117 + }, + { + "epoch": 0.3019833653230966, + "grad_norm": 0.7763009278418379, + "learning_rate": 2.4205128205128205e-05, + "loss": 0.5341, + "step": 118 + }, + { + "epoch": 0.30454254638515676, + "grad_norm": 0.7998736408167985, + "learning_rate": 2.4410256410256415e-05, + "loss": 0.52, + "step": 119 + }, + { + "epoch": 0.30710172744721687, + "grad_norm": 0.7411952795822903, + "learning_rate": 2.461538461538462e-05, + "loss": 0.5418, + "step": 120 + }, + { + "epoch": 0.309660908509277, + "grad_norm": 0.659770656309478, + "learning_rate": 2.4820512820512824e-05, + "loss": 0.5195, + "step": 121 + }, + { + "epoch": 0.3122200895713372, + "grad_norm": 0.8056693118680838, + "learning_rate": 2.5025641025641028e-05, + "loss": 0.5215, + "step": 122 + }, + { + "epoch": 0.31477927063339733, + "grad_norm": 0.9631898506281213, + "learning_rate": 2.523076923076923e-05, + "loss": 0.5283, + "step": 123 + }, + { + "epoch": 0.31733845169545744, + "grad_norm": 0.7096814914325649, + "learning_rate": 2.5435897435897436e-05, + "loss": 0.5155, + "step": 124 + }, + { + "epoch": 0.3198976327575176, + "grad_norm": 1.028582043530853, + "learning_rate": 2.5641025641025646e-05, + "loss": 0.5394, + "step": 125 + }, + { + "epoch": 0.32245681381957775, + "grad_norm": 0.7239590324246933, + "learning_rate": 2.584615384615385e-05, + "loss": 0.5446, + "step": 126 + }, + { + "epoch": 0.32501599488163785, + "grad_norm": 1.0571455117998556, + "learning_rate": 2.6051282051282054e-05, + "loss": 0.5335, + "step": 127 + }, + { + "epoch": 0.327575175943698, + "grad_norm": 1.0256105590142106, + "learning_rate": 2.625641025641026e-05, + "loss": 0.5418, + "step": 128 + }, + { + "epoch": 0.33013435700575816, + "grad_norm": 1.0959117099820284, + "learning_rate": 2.6461538461538463e-05, + "loss": 0.5544, + "step": 129 + }, + { + "epoch": 0.3326935380678183, + "grad_norm": 0.7463871798931493, + "learning_rate": 2.6666666666666667e-05, + "loss": 0.4965, + "step": 130 + }, + { + "epoch": 0.3352527191298784, + "grad_norm": 1.0194762534931083, + "learning_rate": 2.687179487179487e-05, + "loss": 0.4776, + "step": 131 + }, + { + "epoch": 0.3378119001919386, + "grad_norm": 0.7119748311745303, + "learning_rate": 2.707692307692308e-05, + "loss": 0.5203, + "step": 132 + }, + { + "epoch": 0.34037108125399873, + "grad_norm": 0.920481871489979, + "learning_rate": 2.7282051282051285e-05, + "loss": 0.5142, + "step": 133 + }, + { + "epoch": 0.3429302623160589, + "grad_norm": 0.7065977141822832, + "learning_rate": 2.748717948717949e-05, + "loss": 0.5349, + "step": 134 + }, + { + "epoch": 0.345489443378119, + "grad_norm": 0.9570189898635619, + "learning_rate": 2.7692307692307694e-05, + "loss": 0.556, + "step": 135 + }, + { + "epoch": 0.34804862444017914, + "grad_norm": 0.8774520896998024, + "learning_rate": 2.7897435897435898e-05, + "loss": 0.5221, + "step": 136 + }, + { + "epoch": 0.3506078055022393, + "grad_norm": 0.9754340258356095, + "learning_rate": 2.81025641025641e-05, + "loss": 0.5165, + "step": 137 + }, + { + "epoch": 0.3531669865642994, + "grad_norm": 0.9512013949257682, + "learning_rate": 2.8307692307692312e-05, + "loss": 0.5094, + "step": 138 + }, + { + "epoch": 0.35572616762635956, + "grad_norm": 1.0101279645456138, + "learning_rate": 2.8512820512820516e-05, + "loss": 0.5134, + "step": 139 + }, + { + "epoch": 0.3582853486884197, + "grad_norm": 0.9392456947082269, + "learning_rate": 2.871794871794872e-05, + "loss": 0.5154, + "step": 140 + }, + { + "epoch": 0.36084452975047987, + "grad_norm": 1.064205953704163, + "learning_rate": 2.8923076923076925e-05, + "loss": 0.5422, + "step": 141 + }, + { + "epoch": 0.36340371081253997, + "grad_norm": 0.8361885058587943, + "learning_rate": 2.912820512820513e-05, + "loss": 0.5045, + "step": 142 + }, + { + "epoch": 0.3659628918746001, + "grad_norm": 1.0678541131176078, + "learning_rate": 2.9333333333333333e-05, + "loss": 0.4843, + "step": 143 + }, + { + "epoch": 0.3685220729366603, + "grad_norm": 0.6800509515379447, + "learning_rate": 2.9538461538461543e-05, + "loss": 0.5256, + "step": 144 + }, + { + "epoch": 0.3710812539987204, + "grad_norm": 0.7904486157434544, + "learning_rate": 2.9743589743589747e-05, + "loss": 0.5225, + "step": 145 + }, + { + "epoch": 0.37364043506078054, + "grad_norm": 0.816468325578623, + "learning_rate": 2.994871794871795e-05, + "loss": 0.5389, + "step": 146 + }, + { + "epoch": 0.3761996161228407, + "grad_norm": 0.7918935996515184, + "learning_rate": 3.0153846153846155e-05, + "loss": 0.5014, + "step": 147 + }, + { + "epoch": 0.37875879718490085, + "grad_norm": 0.7555828789735101, + "learning_rate": 3.035897435897436e-05, + "loss": 0.5236, + "step": 148 + }, + { + "epoch": 0.38131797824696095, + "grad_norm": 1.0603633188732544, + "learning_rate": 3.0564102564102564e-05, + "loss": 0.5271, + "step": 149 + }, + { + "epoch": 0.3838771593090211, + "grad_norm": 0.8796724653002846, + "learning_rate": 3.0769230769230774e-05, + "loss": 0.5214, + "step": 150 + }, + { + "epoch": 0.38643634037108127, + "grad_norm": 0.6823417266648101, + "learning_rate": 3.097435897435898e-05, + "loss": 0.492, + "step": 151 + }, + { + "epoch": 0.3889955214331414, + "grad_norm": 0.9675008798265416, + "learning_rate": 3.117948717948718e-05, + "loss": 0.5282, + "step": 152 + }, + { + "epoch": 0.3915547024952015, + "grad_norm": 1.0136226084949147, + "learning_rate": 3.1384615384615386e-05, + "loss": 0.5044, + "step": 153 + }, + { + "epoch": 0.3941138835572617, + "grad_norm": 0.9351502869426284, + "learning_rate": 3.158974358974359e-05, + "loss": 0.5006, + "step": 154 + }, + { + "epoch": 0.39667306461932184, + "grad_norm": 0.9882679082998469, + "learning_rate": 3.1794871794871795e-05, + "loss": 0.5013, + "step": 155 + }, + { + "epoch": 0.39923224568138194, + "grad_norm": 0.9382011251547424, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.4777, + "step": 156 + }, + { + "epoch": 0.4017914267434421, + "grad_norm": 0.9879747473370469, + "learning_rate": 3.220512820512821e-05, + "loss": 0.4718, + "step": 157 + }, + { + "epoch": 0.40435060780550225, + "grad_norm": 0.9016818417869712, + "learning_rate": 3.2410256410256413e-05, + "loss": 0.5294, + "step": 158 + }, + { + "epoch": 0.4069097888675624, + "grad_norm": 0.9938632562876675, + "learning_rate": 3.261538461538462e-05, + "loss": 0.4991, + "step": 159 + }, + { + "epoch": 0.4094689699296225, + "grad_norm": 1.3410049525059016, + "learning_rate": 3.282051282051282e-05, + "loss": 0.5095, + "step": 160 + }, + { + "epoch": 0.41202815099168266, + "grad_norm": 0.8871685101904818, + "learning_rate": 3.3025641025641025e-05, + "loss": 0.5483, + "step": 161 + }, + { + "epoch": 0.4145873320537428, + "grad_norm": 1.2268108807413454, + "learning_rate": 3.3230769230769236e-05, + "loss": 0.5174, + "step": 162 + }, + { + "epoch": 0.4171465131158029, + "grad_norm": 0.9220772123108049, + "learning_rate": 3.343589743589744e-05, + "loss": 0.5181, + "step": 163 + }, + { + "epoch": 0.4197056941778631, + "grad_norm": 1.0658979698672157, + "learning_rate": 3.3641025641025644e-05, + "loss": 0.539, + "step": 164 + }, + { + "epoch": 0.42226487523992323, + "grad_norm": 0.97733215203732, + "learning_rate": 3.384615384615385e-05, + "loss": 0.5191, + "step": 165 + }, + { + "epoch": 0.4248240563019834, + "grad_norm": 1.1290947988408833, + "learning_rate": 3.405128205128205e-05, + "loss": 0.5169, + "step": 166 + }, + { + "epoch": 0.4273832373640435, + "grad_norm": 0.8361689719032068, + "learning_rate": 3.4256410256410256e-05, + "loss": 0.5114, + "step": 167 + }, + { + "epoch": 0.42994241842610365, + "grad_norm": 0.9928527289292229, + "learning_rate": 3.446153846153847e-05, + "loss": 0.5123, + "step": 168 + }, + { + "epoch": 0.4325015994881638, + "grad_norm": 0.7704183020179163, + "learning_rate": 3.466666666666667e-05, + "loss": 0.4828, + "step": 169 + }, + { + "epoch": 0.4350607805502239, + "grad_norm": 0.9283903900385092, + "learning_rate": 3.4871794871794875e-05, + "loss": 0.5226, + "step": 170 + }, + { + "epoch": 0.43761996161228406, + "grad_norm": 0.925519484340918, + "learning_rate": 3.507692307692308e-05, + "loss": 0.4956, + "step": 171 + }, + { + "epoch": 0.4401791426743442, + "grad_norm": 0.9993231714335514, + "learning_rate": 3.5282051282051283e-05, + "loss": 0.5355, + "step": 172 + }, + { + "epoch": 0.44273832373640437, + "grad_norm": 1.311826250820302, + "learning_rate": 3.548717948717949e-05, + "loss": 0.5237, + "step": 173 + }, + { + "epoch": 0.44529750479846447, + "grad_norm": 0.9273485381049265, + "learning_rate": 3.56923076923077e-05, + "loss": 0.5131, + "step": 174 + }, + { + "epoch": 0.44785668586052463, + "grad_norm": 1.248607874192031, + "learning_rate": 3.58974358974359e-05, + "loss": 0.4972, + "step": 175 + }, + { + "epoch": 0.4504158669225848, + "grad_norm": 0.8222900953155937, + "learning_rate": 3.6102564102564106e-05, + "loss": 0.5364, + "step": 176 + }, + { + "epoch": 0.45297504798464494, + "grad_norm": 1.123693838028357, + "learning_rate": 3.630769230769231e-05, + "loss": 0.5219, + "step": 177 + }, + { + "epoch": 0.45553422904670504, + "grad_norm": 0.7807365104513995, + "learning_rate": 3.6512820512820514e-05, + "loss": 0.5182, + "step": 178 + }, + { + "epoch": 0.4580934101087652, + "grad_norm": 0.9231872939650723, + "learning_rate": 3.671794871794872e-05, + "loss": 0.5235, + "step": 179 + }, + { + "epoch": 0.46065259117082535, + "grad_norm": 0.8053927537642723, + "learning_rate": 3.692307692307693e-05, + "loss": 0.5202, + "step": 180 + }, + { + "epoch": 0.46321177223288545, + "grad_norm": 1.1551405361338565, + "learning_rate": 3.712820512820513e-05, + "loss": 0.5173, + "step": 181 + }, + { + "epoch": 0.4657709532949456, + "grad_norm": 0.9439986501141405, + "learning_rate": 3.733333333333334e-05, + "loss": 0.5232, + "step": 182 + }, + { + "epoch": 0.46833013435700577, + "grad_norm": 1.106193836601767, + "learning_rate": 3.753846153846154e-05, + "loss": 0.5533, + "step": 183 + }, + { + "epoch": 0.4708893154190659, + "grad_norm": 0.794287368963475, + "learning_rate": 3.7743589743589745e-05, + "loss": 0.4762, + "step": 184 + }, + { + "epoch": 0.473448496481126, + "grad_norm": 0.8611752707863242, + "learning_rate": 3.794871794871795e-05, + "loss": 0.5274, + "step": 185 + }, + { + "epoch": 0.4760076775431862, + "grad_norm": 0.8817686680939495, + "learning_rate": 3.815384615384616e-05, + "loss": 0.5188, + "step": 186 + }, + { + "epoch": 0.47856685860524634, + "grad_norm": 0.8845212101910219, + "learning_rate": 3.8358974358974364e-05, + "loss": 0.5218, + "step": 187 + }, + { + "epoch": 0.48112603966730644, + "grad_norm": 0.663678181823649, + "learning_rate": 3.856410256410257e-05, + "loss": 0.5227, + "step": 188 + }, + { + "epoch": 0.4836852207293666, + "grad_norm": 0.755313719038617, + "learning_rate": 3.876923076923077e-05, + "loss": 0.5426, + "step": 189 + }, + { + "epoch": 0.48624440179142675, + "grad_norm": 0.68323576325022, + "learning_rate": 3.8974358974358976e-05, + "loss": 0.5254, + "step": 190 + }, + { + "epoch": 0.4888035828534869, + "grad_norm": 0.7623898782087327, + "learning_rate": 3.917948717948718e-05, + "loss": 0.5011, + "step": 191 + }, + { + "epoch": 0.491362763915547, + "grad_norm": 0.6832707042582635, + "learning_rate": 3.938461538461539e-05, + "loss": 0.5318, + "step": 192 + }, + { + "epoch": 0.49392194497760716, + "grad_norm": 0.8383249840589115, + "learning_rate": 3.9589743589743595e-05, + "loss": 0.5244, + "step": 193 + }, + { + "epoch": 0.4964811260396673, + "grad_norm": 0.6885029349814699, + "learning_rate": 3.97948717948718e-05, + "loss": 0.5068, + "step": 194 + }, + { + "epoch": 0.4990403071017274, + "grad_norm": 0.9336320586686789, + "learning_rate": 4e-05, + "loss": 0.5013, + "step": 195 + }, + { + "epoch": 0.5015994881637876, + "grad_norm": 1.1273619177371816, + "learning_rate": 3.999996795609852e-05, + "loss": 0.4924, + "step": 196 + }, + { + "epoch": 0.5041586692258477, + "grad_norm": 0.8484780646183507, + "learning_rate": 3.9999871824496765e-05, + "loss": 0.4774, + "step": 197 + }, + { + "epoch": 0.5067178502879078, + "grad_norm": 0.7566071063905553, + "learning_rate": 3.999971160550277e-05, + "loss": 0.5328, + "step": 198 + }, + { + "epoch": 0.509277031349968, + "grad_norm": 0.963017285802582, + "learning_rate": 3.999948729962994e-05, + "loss": 0.535, + "step": 199 + }, + { + "epoch": 0.5118362124120281, + "grad_norm": 0.6417865549660366, + "learning_rate": 3.9999198907597046e-05, + "loss": 0.502, + "step": 200 + }, + { + "epoch": 0.5143953934740882, + "grad_norm": 0.8877386892780115, + "learning_rate": 3.999884643032821e-05, + "loss": 0.513, + "step": 201 + }, + { + "epoch": 0.5169545745361485, + "grad_norm": 0.7952767722764698, + "learning_rate": 3.999842986895289e-05, + "loss": 0.5121, + "step": 202 + }, + { + "epoch": 0.5195137555982086, + "grad_norm": 0.7352919644974529, + "learning_rate": 3.999794922480593e-05, + "loss": 0.4881, + "step": 203 + }, + { + "epoch": 0.5220729366602687, + "grad_norm": 0.7101035899255743, + "learning_rate": 3.9997404499427494e-05, + "loss": 0.4941, + "step": 204 + }, + { + "epoch": 0.5246321177223289, + "grad_norm": 0.7457517555879764, + "learning_rate": 3.9996795694563096e-05, + "loss": 0.5128, + "step": 205 + }, + { + "epoch": 0.527191298784389, + "grad_norm": 0.6573802709079243, + "learning_rate": 3.999612281216358e-05, + "loss": 0.4949, + "step": 206 + }, + { + "epoch": 0.5297504798464492, + "grad_norm": 0.7445707355838409, + "learning_rate": 3.9995385854385124e-05, + "loss": 0.5036, + "step": 207 + }, + { + "epoch": 0.5323096609085093, + "grad_norm": 0.7713581724495707, + "learning_rate": 3.999458482358924e-05, + "loss": 0.4985, + "step": 208 + }, + { + "epoch": 0.5348688419705694, + "grad_norm": 0.7752015257631751, + "learning_rate": 3.9993719722342726e-05, + "loss": 0.508, + "step": 209 + }, + { + "epoch": 0.5374280230326296, + "grad_norm": 1.133508847111212, + "learning_rate": 3.999279055341771e-05, + "loss": 0.5261, + "step": 210 + }, + { + "epoch": 0.5399872040946897, + "grad_norm": 0.7726822279135389, + "learning_rate": 3.999179731979162e-05, + "loss": 0.4672, + "step": 211 + }, + { + "epoch": 0.5425463851567498, + "grad_norm": 0.7766807053995074, + "learning_rate": 3.9990740024647154e-05, + "loss": 0.4984, + "step": 212 + }, + { + "epoch": 0.54510556621881, + "grad_norm": 0.5303932042183875, + "learning_rate": 3.9989618671372304e-05, + "loss": 0.5129, + "step": 213 + }, + { + "epoch": 0.5476647472808701, + "grad_norm": 0.6026485573392468, + "learning_rate": 3.998843326356032e-05, + "loss": 0.5127, + "step": 214 + }, + { + "epoch": 0.5502239283429302, + "grad_norm": 0.6118913537653141, + "learning_rate": 3.998718380500971e-05, + "loss": 0.458, + "step": 215 + }, + { + "epoch": 0.5527831094049904, + "grad_norm": 0.6445812096951002, + "learning_rate": 3.998587029972423e-05, + "loss": 0.495, + "step": 216 + }, + { + "epoch": 0.5553422904670505, + "grad_norm": 0.7741176770417552, + "learning_rate": 3.998449275191286e-05, + "loss": 0.5095, + "step": 217 + }, + { + "epoch": 0.5579014715291107, + "grad_norm": 0.6972261092630921, + "learning_rate": 3.9983051165989814e-05, + "loss": 0.4871, + "step": 218 + }, + { + "epoch": 0.5604606525911708, + "grad_norm": 0.6664826347703592, + "learning_rate": 3.998154554657448e-05, + "loss": 0.5137, + "step": 219 + }, + { + "epoch": 0.5630198336532309, + "grad_norm": 0.6334253599100873, + "learning_rate": 3.997997589849145e-05, + "loss": 0.5494, + "step": 220 + }, + { + "epoch": 0.5655790147152912, + "grad_norm": 0.7145250281069672, + "learning_rate": 3.99783422267705e-05, + "loss": 0.5219, + "step": 221 + }, + { + "epoch": 0.5681381957773513, + "grad_norm": 0.8804491997042745, + "learning_rate": 3.997664453664654e-05, + "loss": 0.5305, + "step": 222 + }, + { + "epoch": 0.5706973768394114, + "grad_norm": 0.5941374342473514, + "learning_rate": 3.9974882833559634e-05, + "loss": 0.492, + "step": 223 + }, + { + "epoch": 0.5732565579014716, + "grad_norm": 0.8541812515528515, + "learning_rate": 3.997305712315497e-05, + "loss": 0.4994, + "step": 224 + }, + { + "epoch": 0.5758157389635317, + "grad_norm": 0.8858077653575287, + "learning_rate": 3.9971167411282835e-05, + "loss": 0.5268, + "step": 225 + }, + { + "epoch": 0.5783749200255918, + "grad_norm": 1.0807623477992674, + "learning_rate": 3.9969213703998606e-05, + "loss": 0.5047, + "step": 226 + }, + { + "epoch": 0.580934101087652, + "grad_norm": 0.8213345788017161, + "learning_rate": 3.9967196007562725e-05, + "loss": 0.5302, + "step": 227 + }, + { + "epoch": 0.5834932821497121, + "grad_norm": 0.6055767495165976, + "learning_rate": 3.996511432844067e-05, + "loss": 0.4833, + "step": 228 + }, + { + "epoch": 0.5860524632117722, + "grad_norm": 0.9265228707425465, + "learning_rate": 3.996296867330296e-05, + "loss": 0.5146, + "step": 229 + }, + { + "epoch": 0.5886116442738324, + "grad_norm": 1.228465580948079, + "learning_rate": 3.99607590490251e-05, + "loss": 0.474, + "step": 230 + }, + { + "epoch": 0.5911708253358925, + "grad_norm": 0.8130275473999699, + "learning_rate": 3.9958485462687606e-05, + "loss": 0.4767, + "step": 231 + }, + { + "epoch": 0.5937300063979527, + "grad_norm": 0.780191780172052, + "learning_rate": 3.995614792157592e-05, + "loss": 0.5037, + "step": 232 + }, + { + "epoch": 0.5962891874600128, + "grad_norm": 0.9566872727407667, + "learning_rate": 3.995374643318045e-05, + "loss": 0.5152, + "step": 233 + }, + { + "epoch": 0.5988483685220729, + "grad_norm": 1.1646368305147163, + "learning_rate": 3.9951281005196486e-05, + "loss": 0.5329, + "step": 234 + }, + { + "epoch": 0.6014075495841331, + "grad_norm": 0.7216132566248876, + "learning_rate": 3.9948751645524235e-05, + "loss": 0.5285, + "step": 235 + }, + { + "epoch": 0.6039667306461932, + "grad_norm": 1.0575511476108006, + "learning_rate": 3.994615836226874e-05, + "loss": 0.5364, + "step": 236 + }, + { + "epoch": 0.6065259117082533, + "grad_norm": 0.9622069565170775, + "learning_rate": 3.994350116373991e-05, + "loss": 0.5067, + "step": 237 + }, + { + "epoch": 0.6090850927703135, + "grad_norm": 0.7168012454794431, + "learning_rate": 3.9940780058452416e-05, + "loss": 0.5012, + "step": 238 + }, + { + "epoch": 0.6116442738323736, + "grad_norm": 0.8302983155644609, + "learning_rate": 3.9937995055125774e-05, + "loss": 0.5282, + "step": 239 + }, + { + "epoch": 0.6142034548944337, + "grad_norm": 0.7497453674792344, + "learning_rate": 3.9935146162684206e-05, + "loss": 0.4992, + "step": 240 + }, + { + "epoch": 0.6167626359564939, + "grad_norm": 0.8217939024893259, + "learning_rate": 3.993223339025667e-05, + "loss": 0.4895, + "step": 241 + }, + { + "epoch": 0.619321817018554, + "grad_norm": 0.7219771415337329, + "learning_rate": 3.992925674717683e-05, + "loss": 0.4636, + "step": 242 + }, + { + "epoch": 0.6218809980806143, + "grad_norm": 0.9341076990511636, + "learning_rate": 3.9926216242983017e-05, + "loss": 0.5008, + "step": 243 + }, + { + "epoch": 0.6244401791426744, + "grad_norm": 0.7304570272364406, + "learning_rate": 3.9923111887418185e-05, + "loss": 0.4921, + "step": 244 + }, + { + "epoch": 0.6269993602047345, + "grad_norm": 1.0471475376229655, + "learning_rate": 3.9919943690429906e-05, + "loss": 0.4768, + "step": 245 + }, + { + "epoch": 0.6295585412667947, + "grad_norm": 0.693079198064159, + "learning_rate": 3.991671166217031e-05, + "loss": 0.4786, + "step": 246 + }, + { + "epoch": 0.6321177223288548, + "grad_norm": 1.1561745746369312, + "learning_rate": 3.991341581299609e-05, + "loss": 0.5182, + "step": 247 + }, + { + "epoch": 0.6346769033909149, + "grad_norm": 0.8741051048237501, + "learning_rate": 3.991005615346843e-05, + "loss": 0.5024, + "step": 248 + }, + { + "epoch": 0.6372360844529751, + "grad_norm": 0.9086658125226366, + "learning_rate": 3.990663269435298e-05, + "loss": 0.4974, + "step": 249 + }, + { + "epoch": 0.6397952655150352, + "grad_norm": 0.9761201164077572, + "learning_rate": 3.9903145446619837e-05, + "loss": 0.5112, + "step": 250 + }, + { + "epoch": 0.6423544465770953, + "grad_norm": 0.8136560445992532, + "learning_rate": 3.989959442144352e-05, + "loss": 0.4971, + "step": 251 + }, + { + "epoch": 0.6449136276391555, + "grad_norm": 0.6200675715165651, + "learning_rate": 3.989597963020289e-05, + "loss": 0.506, + "step": 252 + }, + { + "epoch": 0.6474728087012156, + "grad_norm": 0.9727978732394552, + "learning_rate": 3.989230108448115e-05, + "loss": 0.5132, + "step": 253 + }, + { + "epoch": 0.6500319897632757, + "grad_norm": 0.7588308378509555, + "learning_rate": 3.9888558796065784e-05, + "loss": 0.4948, + "step": 254 + }, + { + "epoch": 0.6525911708253359, + "grad_norm": 0.8984908598875354, + "learning_rate": 3.9884752776948564e-05, + "loss": 0.4912, + "step": 255 + }, + { + "epoch": 0.655150351887396, + "grad_norm": 0.9180268932937014, + "learning_rate": 3.988088303932545e-05, + "loss": 0.5058, + "step": 256 + }, + { + "epoch": 0.6577095329494562, + "grad_norm": 0.9874072428401991, + "learning_rate": 3.987694959559658e-05, + "loss": 0.5265, + "step": 257 + }, + { + "epoch": 0.6602687140115163, + "grad_norm": 0.7774534286927767, + "learning_rate": 3.9872952458366267e-05, + "loss": 0.5116, + "step": 258 + }, + { + "epoch": 0.6628278950735764, + "grad_norm": 0.8189986183875947, + "learning_rate": 3.9868891640442874e-05, + "loss": 0.507, + "step": 259 + }, + { + "epoch": 0.6653870761356366, + "grad_norm": 0.6489509247329256, + "learning_rate": 3.9864767154838864e-05, + "loss": 0.5111, + "step": 260 + }, + { + "epoch": 0.6679462571976967, + "grad_norm": 0.8761101947097708, + "learning_rate": 3.986057901477069e-05, + "loss": 0.5127, + "step": 261 + }, + { + "epoch": 0.6705054382597568, + "grad_norm": 0.9687654727768278, + "learning_rate": 3.985632723365878e-05, + "loss": 0.547, + "step": 262 + }, + { + "epoch": 0.673064619321817, + "grad_norm": 0.7867771900738217, + "learning_rate": 3.985201182512752e-05, + "loss": 0.516, + "step": 263 + }, + { + "epoch": 0.6756238003838771, + "grad_norm": 0.6453678386295493, + "learning_rate": 3.984763280300514e-05, + "loss": 0.481, + "step": 264 + }, + { + "epoch": 0.6781829814459372, + "grad_norm": 0.7765079788191963, + "learning_rate": 3.9843190181323744e-05, + "loss": 0.4913, + "step": 265 + }, + { + "epoch": 0.6807421625079975, + "grad_norm": 0.6373936761246601, + "learning_rate": 3.983868397431923e-05, + "loss": 0.5133, + "step": 266 + }, + { + "epoch": 0.6833013435700576, + "grad_norm": 0.730921099373597, + "learning_rate": 3.983411419643125e-05, + "loss": 0.5339, + "step": 267 + }, + { + "epoch": 0.6858605246321178, + "grad_norm": 0.8152333844378884, + "learning_rate": 3.982948086230312e-05, + "loss": 0.4865, + "step": 268 + }, + { + "epoch": 0.6884197056941779, + "grad_norm": 0.7100466271823358, + "learning_rate": 3.9824783986781897e-05, + "loss": 0.49, + "step": 269 + }, + { + "epoch": 0.690978886756238, + "grad_norm": 0.8934016406293627, + "learning_rate": 3.982002358491817e-05, + "loss": 0.5208, + "step": 270 + }, + { + "epoch": 0.6935380678182982, + "grad_norm": 0.6756901306503084, + "learning_rate": 3.981519967196614e-05, + "loss": 0.5191, + "step": 271 + }, + { + "epoch": 0.6960972488803583, + "grad_norm": 0.8009942346681957, + "learning_rate": 3.98103122633835e-05, + "loss": 0.5067, + "step": 272 + }, + { + "epoch": 0.6986564299424184, + "grad_norm": 0.6923671981740297, + "learning_rate": 3.980536137483141e-05, + "loss": 0.4868, + "step": 273 + }, + { + "epoch": 0.7012156110044786, + "grad_norm": 0.9566550041102404, + "learning_rate": 3.980034702217445e-05, + "loss": 0.5398, + "step": 274 + }, + { + "epoch": 0.7037747920665387, + "grad_norm": 0.8440959667430001, + "learning_rate": 3.979526922148058e-05, + "loss": 0.4658, + "step": 275 + }, + { + "epoch": 0.7063339731285988, + "grad_norm": 0.8588196985974998, + "learning_rate": 3.9790127989021024e-05, + "loss": 0.5052, + "step": 276 + }, + { + "epoch": 0.708893154190659, + "grad_norm": 0.7562049758522937, + "learning_rate": 3.978492334127032e-05, + "loss": 0.5267, + "step": 277 + }, + { + "epoch": 0.7114523352527191, + "grad_norm": 0.632945819901848, + "learning_rate": 3.977965529490618e-05, + "loss": 0.4618, + "step": 278 + }, + { + "epoch": 0.7140115163147792, + "grad_norm": 0.845801145881355, + "learning_rate": 3.9774323866809485e-05, + "loss": 0.4683, + "step": 279 + }, + { + "epoch": 0.7165706973768394, + "grad_norm": 0.7371708961015324, + "learning_rate": 3.9768929074064206e-05, + "loss": 0.5364, + "step": 280 + }, + { + "epoch": 0.7191298784388995, + "grad_norm": 0.7371186918676583, + "learning_rate": 3.976347093395736e-05, + "loss": 0.5061, + "step": 281 + }, + { + "epoch": 0.7216890595009597, + "grad_norm": 0.7032178872579914, + "learning_rate": 3.9757949463978975e-05, + "loss": 0.5242, + "step": 282 + }, + { + "epoch": 0.7242482405630198, + "grad_norm": 0.7617726502548777, + "learning_rate": 3.9752364681821973e-05, + "loss": 0.4888, + "step": 283 + }, + { + "epoch": 0.7268074216250799, + "grad_norm": 0.7903937882632156, + "learning_rate": 3.9746716605382186e-05, + "loss": 0.5203, + "step": 284 + }, + { + "epoch": 0.7293666026871402, + "grad_norm": 0.7645749463474476, + "learning_rate": 3.9741005252758255e-05, + "loss": 0.5116, + "step": 285 + }, + { + "epoch": 0.7319257837492003, + "grad_norm": 0.6220992771519867, + "learning_rate": 3.973523064225159e-05, + "loss": 0.4671, + "step": 286 + }, + { + "epoch": 0.7344849648112604, + "grad_norm": 0.5115715364977104, + "learning_rate": 3.972939279236627e-05, + "loss": 0.4565, + "step": 287 + }, + { + "epoch": 0.7370441458733206, + "grad_norm": 0.7021026496208411, + "learning_rate": 3.9723491721809076e-05, + "loss": 0.5379, + "step": 288 + }, + { + "epoch": 0.7396033269353807, + "grad_norm": 0.5201603136030488, + "learning_rate": 3.971752744948932e-05, + "loss": 0.4692, + "step": 289 + }, + { + "epoch": 0.7421625079974408, + "grad_norm": 0.8208102312039668, + "learning_rate": 3.971149999451886e-05, + "loss": 0.4882, + "step": 290 + }, + { + "epoch": 0.744721689059501, + "grad_norm": 1.119540735969476, + "learning_rate": 3.970540937621201e-05, + "loss": 0.5097, + "step": 291 + }, + { + "epoch": 0.7472808701215611, + "grad_norm": 0.7635743214902218, + "learning_rate": 3.9699255614085495e-05, + "loss": 0.5101, + "step": 292 + }, + { + "epoch": 0.7498400511836213, + "grad_norm": 0.8126931797929314, + "learning_rate": 3.969303872785837e-05, + "loss": 0.4889, + "step": 293 + }, + { + "epoch": 0.7523992322456814, + "grad_norm": 0.7624486728902423, + "learning_rate": 3.9686758737451955e-05, + "loss": 0.4966, + "step": 294 + }, + { + "epoch": 0.7549584133077415, + "grad_norm": 0.5229197667386186, + "learning_rate": 3.9680415662989806e-05, + "loss": 0.4886, + "step": 295 + }, + { + "epoch": 0.7575175943698017, + "grad_norm": 0.6766454901060076, + "learning_rate": 3.967400952479759e-05, + "loss": 0.4661, + "step": 296 + }, + { + "epoch": 0.7600767754318618, + "grad_norm": 0.6030935943397303, + "learning_rate": 3.966754034340308e-05, + "loss": 0.5526, + "step": 297 + }, + { + "epoch": 0.7626359564939219, + "grad_norm": 0.688481808856117, + "learning_rate": 3.966100813953607e-05, + "loss": 0.5191, + "step": 298 + }, + { + "epoch": 0.7651951375559821, + "grad_norm": 0.8026340827764397, + "learning_rate": 3.965441293412827e-05, + "loss": 0.4875, + "step": 299 + }, + { + "epoch": 0.7677543186180422, + "grad_norm": 0.5877161986618623, + "learning_rate": 3.9647754748313294e-05, + "loss": 0.5581, + "step": 300 + }, + { + "epoch": 0.7703134996801023, + "grad_norm": 0.7230843693397704, + "learning_rate": 3.964103360342658e-05, + "loss": 0.4941, + "step": 301 + }, + { + "epoch": 0.7728726807421625, + "grad_norm": 0.6543565273084903, + "learning_rate": 3.963424952100529e-05, + "loss": 0.4749, + "step": 302 + }, + { + "epoch": 0.7754318618042226, + "grad_norm": 0.6083305140129942, + "learning_rate": 3.962740252278827e-05, + "loss": 0.506, + "step": 303 + }, + { + "epoch": 0.7779910428662828, + "grad_norm": 0.6712573537812702, + "learning_rate": 3.962049263071598e-05, + "loss": 0.4752, + "step": 304 + }, + { + "epoch": 0.780550223928343, + "grad_norm": 0.8280090720406486, + "learning_rate": 3.96135198669304e-05, + "loss": 0.4891, + "step": 305 + }, + { + "epoch": 0.783109404990403, + "grad_norm": 0.6598880067981137, + "learning_rate": 3.960648425377499e-05, + "loss": 0.4947, + "step": 306 + }, + { + "epoch": 0.7856685860524633, + "grad_norm": 0.7265475529158774, + "learning_rate": 3.95993858137946e-05, + "loss": 0.4989, + "step": 307 + }, + { + "epoch": 0.7882277671145234, + "grad_norm": 0.5888767351942641, + "learning_rate": 3.959222456973541e-05, + "loss": 0.4829, + "step": 308 + }, + { + "epoch": 0.7907869481765835, + "grad_norm": 0.6375937306972569, + "learning_rate": 3.958500054454482e-05, + "loss": 0.4574, + "step": 309 + }, + { + "epoch": 0.7933461292386437, + "grad_norm": 0.6933840977854485, + "learning_rate": 3.957771376137144e-05, + "loss": 0.5059, + "step": 310 + }, + { + "epoch": 0.7959053103007038, + "grad_norm": 0.6896810504909161, + "learning_rate": 3.9570364243564966e-05, + "loss": 0.4992, + "step": 311 + }, + { + "epoch": 0.7984644913627639, + "grad_norm": 0.6468420638732157, + "learning_rate": 3.9562952014676116e-05, + "loss": 0.496, + "step": 312 + }, + { + "epoch": 0.8010236724248241, + "grad_norm": 0.6274952773967104, + "learning_rate": 3.955547709845656e-05, + "loss": 0.4874, + "step": 313 + }, + { + "epoch": 0.8035828534868842, + "grad_norm": 0.5653574700934775, + "learning_rate": 3.9547939518858856e-05, + "loss": 0.5067, + "step": 314 + }, + { + "epoch": 0.8061420345489443, + "grad_norm": 0.6550905511901227, + "learning_rate": 3.954033930003634e-05, + "loss": 0.5212, + "step": 315 + }, + { + "epoch": 0.8087012156110045, + "grad_norm": 0.7494837380638635, + "learning_rate": 3.953267646634309e-05, + "loss": 0.505, + "step": 316 + }, + { + "epoch": 0.8112603966730646, + "grad_norm": 0.635770951888326, + "learning_rate": 3.95249510423338e-05, + "loss": 0.4967, + "step": 317 + }, + { + "epoch": 0.8138195777351248, + "grad_norm": 0.7410641008583149, + "learning_rate": 3.9517163052763756e-05, + "loss": 0.4773, + "step": 318 + }, + { + "epoch": 0.8163787587971849, + "grad_norm": 0.7149351574867076, + "learning_rate": 3.9509312522588704e-05, + "loss": 0.4709, + "step": 319 + }, + { + "epoch": 0.818937939859245, + "grad_norm": 0.823519098221886, + "learning_rate": 3.9501399476964806e-05, + "loss": 0.4867, + "step": 320 + }, + { + "epoch": 0.8214971209213052, + "grad_norm": 0.7163722784021596, + "learning_rate": 3.9493423941248564e-05, + "loss": 0.507, + "step": 321 + }, + { + "epoch": 0.8240563019833653, + "grad_norm": 0.5792885526249222, + "learning_rate": 3.948538594099668e-05, + "loss": 0.4863, + "step": 322 + }, + { + "epoch": 0.8266154830454254, + "grad_norm": 0.8674821812665616, + "learning_rate": 3.9477285501966064e-05, + "loss": 0.4497, + "step": 323 + }, + { + "epoch": 0.8291746641074856, + "grad_norm": 0.6695505549743577, + "learning_rate": 3.946912265011368e-05, + "loss": 0.4853, + "step": 324 + }, + { + "epoch": 0.8317338451695457, + "grad_norm": 0.6407618610078097, + "learning_rate": 3.946089741159648e-05, + "loss": 0.4742, + "step": 325 + }, + { + "epoch": 0.8342930262316058, + "grad_norm": 0.8505071134562596, + "learning_rate": 3.9452609812771346e-05, + "loss": 0.5346, + "step": 326 + }, + { + "epoch": 0.836852207293666, + "grad_norm": 0.5413673046728109, + "learning_rate": 3.944425988019498e-05, + "loss": 0.4677, + "step": 327 + }, + { + "epoch": 0.8394113883557262, + "grad_norm": 0.9828950428531091, + "learning_rate": 3.9435847640623806e-05, + "loss": 0.4808, + "step": 328 + }, + { + "epoch": 0.8419705694177864, + "grad_norm": 0.6480252890065408, + "learning_rate": 3.942737312101394e-05, + "loss": 0.5019, + "step": 329 + }, + { + "epoch": 0.8445297504798465, + "grad_norm": 0.9281412616435286, + "learning_rate": 3.9418836348521045e-05, + "loss": 0.5069, + "step": 330 + }, + { + "epoch": 0.8470889315419066, + "grad_norm": 0.6511602292237915, + "learning_rate": 3.941023735050027e-05, + "loss": 0.5135, + "step": 331 + }, + { + "epoch": 0.8496481126039668, + "grad_norm": 0.649990025588154, + "learning_rate": 3.9401576154506155e-05, + "loss": 0.4721, + "step": 332 + }, + { + "epoch": 0.8522072936660269, + "grad_norm": 0.8525830113834602, + "learning_rate": 3.9392852788292556e-05, + "loss": 0.4747, + "step": 333 + }, + { + "epoch": 0.854766474728087, + "grad_norm": 0.8122595340814978, + "learning_rate": 3.938406727981254e-05, + "loss": 0.5036, + "step": 334 + }, + { + "epoch": 0.8573256557901472, + "grad_norm": 0.6813807690997764, + "learning_rate": 3.937521965721831e-05, + "loss": 0.4778, + "step": 335 + }, + { + "epoch": 0.8598848368522073, + "grad_norm": 0.742372369654133, + "learning_rate": 3.936630994886109e-05, + "loss": 0.4912, + "step": 336 + }, + { + "epoch": 0.8624440179142674, + "grad_norm": 0.6932498968117697, + "learning_rate": 3.9357338183291066e-05, + "loss": 0.5033, + "step": 337 + }, + { + "epoch": 0.8650031989763276, + "grad_norm": 0.7002201428035697, + "learning_rate": 3.934830438925728e-05, + "loss": 0.4843, + "step": 338 + }, + { + "epoch": 0.8675623800383877, + "grad_norm": 0.6143454063707157, + "learning_rate": 3.933920859570753e-05, + "loss": 0.4959, + "step": 339 + }, + { + "epoch": 0.8701215611004478, + "grad_norm": 0.5609771595796579, + "learning_rate": 3.933005083178828e-05, + "loss": 0.4778, + "step": 340 + }, + { + "epoch": 0.872680742162508, + "grad_norm": 0.5872526379907818, + "learning_rate": 3.932083112684459e-05, + "loss": 0.4736, + "step": 341 + }, + { + "epoch": 0.8752399232245681, + "grad_norm": 0.5259898431650297, + "learning_rate": 3.931154951041998e-05, + "loss": 0.5061, + "step": 342 + }, + { + "epoch": 0.8777991042866283, + "grad_norm": 0.6462230152484912, + "learning_rate": 3.930220601225638e-05, + "loss": 0.503, + "step": 343 + }, + { + "epoch": 0.8803582853486884, + "grad_norm": 0.6210066174968442, + "learning_rate": 3.9292800662294e-05, + "loss": 0.4592, + "step": 344 + }, + { + "epoch": 0.8829174664107485, + "grad_norm": 0.7033811879806838, + "learning_rate": 3.928333349067125e-05, + "loss": 0.4839, + "step": 345 + }, + { + "epoch": 0.8854766474728087, + "grad_norm": 0.7112080589811673, + "learning_rate": 3.927380452772464e-05, + "loss": 0.4833, + "step": 346 + }, + { + "epoch": 0.8880358285348688, + "grad_norm": 0.5763115473730898, + "learning_rate": 3.926421380398869e-05, + "loss": 0.5128, + "step": 347 + }, + { + "epoch": 0.8905950095969289, + "grad_norm": 0.493333488186968, + "learning_rate": 3.925456135019582e-05, + "loss": 0.4777, + "step": 348 + }, + { + "epoch": 0.8931541906589892, + "grad_norm": 0.5969713695047262, + "learning_rate": 3.924484719727625e-05, + "loss": 0.5548, + "step": 349 + }, + { + "epoch": 0.8957133717210493, + "grad_norm": 0.5743073508713653, + "learning_rate": 3.923507137635792e-05, + "loss": 0.4993, + "step": 350 + }, + { + "epoch": 0.8982725527831094, + "grad_norm": 0.5816084208432016, + "learning_rate": 3.922523391876638e-05, + "loss": 0.4974, + "step": 351 + }, + { + "epoch": 0.9008317338451696, + "grad_norm": 0.6913456504659746, + "learning_rate": 3.921533485602467e-05, + "loss": 0.5038, + "step": 352 + }, + { + "epoch": 0.9033909149072297, + "grad_norm": 0.5710576834895075, + "learning_rate": 3.920537421985327e-05, + "loss": 0.469, + "step": 353 + }, + { + "epoch": 0.9059500959692899, + "grad_norm": 0.831878724155053, + "learning_rate": 3.9195352042169924e-05, + "loss": 0.5178, + "step": 354 + }, + { + "epoch": 0.90850927703135, + "grad_norm": 0.5623208840830399, + "learning_rate": 3.9185268355089606e-05, + "loss": 0.4892, + "step": 355 + }, + { + "epoch": 0.9110684580934101, + "grad_norm": 0.8167633776332197, + "learning_rate": 3.9175123190924384e-05, + "loss": 0.5193, + "step": 356 + }, + { + "epoch": 0.9136276391554703, + "grad_norm": 0.5899572202270219, + "learning_rate": 3.916491658218333e-05, + "loss": 0.4739, + "step": 357 + }, + { + "epoch": 0.9161868202175304, + "grad_norm": 0.7206391505617225, + "learning_rate": 3.9154648561572386e-05, + "loss": 0.4752, + "step": 358 + }, + { + "epoch": 0.9187460012795905, + "grad_norm": 0.7210244727385913, + "learning_rate": 3.91443191619943e-05, + "loss": 0.4789, + "step": 359 + }, + { + "epoch": 0.9213051823416507, + "grad_norm": 0.7320868272015927, + "learning_rate": 3.913392841654851e-05, + "loss": 0.503, + "step": 360 + }, + { + "epoch": 0.9238643634037108, + "grad_norm": 0.8003299130587506, + "learning_rate": 3.9123476358531e-05, + "loss": 0.4917, + "step": 361 + }, + { + "epoch": 0.9264235444657709, + "grad_norm": 0.7467581757493919, + "learning_rate": 3.911296302143426e-05, + "loss": 0.4973, + "step": 362 + }, + { + "epoch": 0.9289827255278311, + "grad_norm": 0.695841862059534, + "learning_rate": 3.9102388438947104e-05, + "loss": 0.5014, + "step": 363 + }, + { + "epoch": 0.9315419065898912, + "grad_norm": 0.8452597656759123, + "learning_rate": 3.909175264495464e-05, + "loss": 0.4528, + "step": 364 + }, + { + "epoch": 0.9341010876519513, + "grad_norm": 0.8328804446765739, + "learning_rate": 3.9081055673538093e-05, + "loss": 0.5014, + "step": 365 + }, + { + "epoch": 0.9366602687140115, + "grad_norm": 0.8976120702595518, + "learning_rate": 3.907029755897473e-05, + "loss": 0.4767, + "step": 366 + }, + { + "epoch": 0.9392194497760716, + "grad_norm": 0.6729987700598375, + "learning_rate": 3.905947833573775e-05, + "loss": 0.4758, + "step": 367 + }, + { + "epoch": 0.9417786308381318, + "grad_norm": 0.965419537056689, + "learning_rate": 3.904859803849617e-05, + "loss": 0.4952, + "step": 368 + }, + { + "epoch": 0.944337811900192, + "grad_norm": 0.8355270729808283, + "learning_rate": 3.903765670211469e-05, + "loss": 0.5048, + "step": 369 + }, + { + "epoch": 0.946896992962252, + "grad_norm": 0.7603742900226873, + "learning_rate": 3.902665436165364e-05, + "loss": 0.491, + "step": 370 + }, + { + "epoch": 0.9494561740243123, + "grad_norm": 0.7054418825885687, + "learning_rate": 3.901559105236881e-05, + "loss": 0.4654, + "step": 371 + }, + { + "epoch": 0.9520153550863724, + "grad_norm": 0.6840867683040738, + "learning_rate": 3.9004466809711343e-05, + "loss": 0.4789, + "step": 372 + }, + { + "epoch": 0.9545745361484325, + "grad_norm": 0.7107919372914869, + "learning_rate": 3.8993281669327664e-05, + "loss": 0.5041, + "step": 373 + }, + { + "epoch": 0.9571337172104927, + "grad_norm": 0.5705893437651786, + "learning_rate": 3.8982035667059327e-05, + "loss": 0.4724, + "step": 374 + }, + { + "epoch": 0.9596928982725528, + "grad_norm": 0.6398479886368891, + "learning_rate": 3.897072883894291e-05, + "loss": 0.512, + "step": 375 + }, + { + "epoch": 0.9622520793346129, + "grad_norm": 0.6496978842533705, + "learning_rate": 3.895936122120991e-05, + "loss": 0.4998, + "step": 376 + }, + { + "epoch": 0.9648112603966731, + "grad_norm": 0.6636946077689827, + "learning_rate": 3.8947932850286585e-05, + "loss": 0.5105, + "step": 377 + }, + { + "epoch": 0.9673704414587332, + "grad_norm": 0.8146680744120419, + "learning_rate": 3.893644376279392e-05, + "loss": 0.5081, + "step": 378 + }, + { + "epoch": 0.9699296225207934, + "grad_norm": 0.6043005275708316, + "learning_rate": 3.8924893995547427e-05, + "loss": 0.465, + "step": 379 + }, + { + "epoch": 0.9724888035828535, + "grad_norm": 0.7737447625323293, + "learning_rate": 3.8913283585557054e-05, + "loss": 0.4745, + "step": 380 + }, + { + "epoch": 0.9750479846449136, + "grad_norm": 0.6584366000203595, + "learning_rate": 3.89016125700271e-05, + "loss": 0.4646, + "step": 381 + }, + { + "epoch": 0.9776071657069738, + "grad_norm": 0.7267670466528602, + "learning_rate": 3.888988098635604e-05, + "loss": 0.5443, + "step": 382 + }, + { + "epoch": 0.9801663467690339, + "grad_norm": 0.8281518075048059, + "learning_rate": 3.8878088872136446e-05, + "loss": 0.5175, + "step": 383 + }, + { + "epoch": 0.982725527831094, + "grad_norm": 0.7524480266101858, + "learning_rate": 3.8866236265154864e-05, + "loss": 0.4752, + "step": 384 + }, + { + "epoch": 0.9852847088931542, + "grad_norm": 0.8364410578252669, + "learning_rate": 3.885432320339167e-05, + "loss": 0.4752, + "step": 385 + }, + { + "epoch": 0.9878438899552143, + "grad_norm": 0.684899934501505, + "learning_rate": 3.884234972502095e-05, + "loss": 0.4931, + "step": 386 + }, + { + "epoch": 0.9904030710172744, + "grad_norm": 0.6158115167694288, + "learning_rate": 3.88303158684104e-05, + "loss": 0.4861, + "step": 387 + }, + { + "epoch": 0.9929622520793346, + "grad_norm": 0.7078911086493701, + "learning_rate": 3.8818221672121204e-05, + "loss": 0.4921, + "step": 388 + }, + { + "epoch": 0.9955214331413947, + "grad_norm": 0.5372294629344029, + "learning_rate": 3.8806067174907876e-05, + "loss": 0.513, + "step": 389 + }, + { + "epoch": 0.9980806142034548, + "grad_norm": 0.7744974417312304, + "learning_rate": 3.879385241571817e-05, + "loss": 0.4669, + "step": 390 + }, + { + "epoch": 1.000639795265515, + "grad_norm": 0.649847374858299, + "learning_rate": 3.878157743369294e-05, + "loss": 0.6153, + "step": 391 + }, + { + "epoch": 1.0031989763275753, + "grad_norm": 0.7557922196471385, + "learning_rate": 3.876924226816602e-05, + "loss": 0.3771, + "step": 392 + }, + { + "epoch": 1.0057581573896353, + "grad_norm": 0.720395789720446, + "learning_rate": 3.875684695866409e-05, + "loss": 0.4498, + "step": 393 + }, + { + "epoch": 1.0083173384516955, + "grad_norm": 0.6684707254214445, + "learning_rate": 3.874439154490656e-05, + "loss": 0.4581, + "step": 394 + }, + { + "epoch": 1.0108765195137557, + "grad_norm": 0.6845617164901984, + "learning_rate": 3.873187606680543e-05, + "loss": 0.4382, + "step": 395 + }, + { + "epoch": 1.0134357005758157, + "grad_norm": 0.6687231254947439, + "learning_rate": 3.871930056446518e-05, + "loss": 0.3945, + "step": 396 + }, + { + "epoch": 1.0159948816378759, + "grad_norm": 0.8306778742589912, + "learning_rate": 3.870666507818262e-05, + "loss": 0.4194, + "step": 397 + }, + { + "epoch": 1.018554062699936, + "grad_norm": 0.7593090405257436, + "learning_rate": 3.869396964844679e-05, + "loss": 0.4378, + "step": 398 + }, + { + "epoch": 1.021113243761996, + "grad_norm": 0.545915139395705, + "learning_rate": 3.8681214315938786e-05, + "loss": 0.4009, + "step": 399 + }, + { + "epoch": 1.0236724248240563, + "grad_norm": 0.8728350624043225, + "learning_rate": 3.866839912153168e-05, + "loss": 0.4239, + "step": 400 + }, + { + "epoch": 1.0262316058861165, + "grad_norm": 0.7685743303640589, + "learning_rate": 3.8655524106290345e-05, + "loss": 0.4433, + "step": 401 + }, + { + "epoch": 1.0287907869481765, + "grad_norm": 0.6436857445045538, + "learning_rate": 3.864258931147136e-05, + "loss": 0.4135, + "step": 402 + }, + { + "epoch": 1.0313499680102367, + "grad_norm": 0.6043825799230796, + "learning_rate": 3.862959477852285e-05, + "loss": 0.4511, + "step": 403 + }, + { + "epoch": 1.033909149072297, + "grad_norm": 0.864882027042227, + "learning_rate": 3.8616540549084366e-05, + "loss": 0.4281, + "step": 404 + }, + { + "epoch": 1.036468330134357, + "grad_norm": 0.5669229687455682, + "learning_rate": 3.860342666498677e-05, + "loss": 0.4265, + "step": 405 + }, + { + "epoch": 1.0390275111964171, + "grad_norm": 1.092925757327788, + "learning_rate": 3.859025316825204e-05, + "loss": 0.4171, + "step": 406 + }, + { + "epoch": 1.0415866922584773, + "grad_norm": 0.6235272525172618, + "learning_rate": 3.8577020101093214e-05, + "loss": 0.3889, + "step": 407 + }, + { + "epoch": 1.0441458733205373, + "grad_norm": 0.7044477218033379, + "learning_rate": 3.856372750591419e-05, + "loss": 0.4268, + "step": 408 + }, + { + "epoch": 1.0467050543825975, + "grad_norm": 0.6758623607211073, + "learning_rate": 3.8550375425309643e-05, + "loss": 0.376, + "step": 409 + }, + { + "epoch": 1.0492642354446577, + "grad_norm": 0.703616580836483, + "learning_rate": 3.853696390206484e-05, + "loss": 0.4782, + "step": 410 + }, + { + "epoch": 1.051823416506718, + "grad_norm": 0.7120884633418646, + "learning_rate": 3.8523492979155534e-05, + "loss": 0.4156, + "step": 411 + }, + { + "epoch": 1.054382597568778, + "grad_norm": 0.7074414036356858, + "learning_rate": 3.850996269974782e-05, + "loss": 0.4044, + "step": 412 + }, + { + "epoch": 1.0569417786308382, + "grad_norm": 0.6805988413911291, + "learning_rate": 3.849637310719799e-05, + "loss": 0.4659, + "step": 413 + }, + { + "epoch": 1.0595009596928984, + "grad_norm": 0.751339193242575, + "learning_rate": 3.84827242450524e-05, + "loss": 0.4362, + "step": 414 + }, + { + "epoch": 1.0620601407549584, + "grad_norm": 0.8444583705618753, + "learning_rate": 3.846901615704734e-05, + "loss": 0.4671, + "step": 415 + }, + { + "epoch": 1.0646193218170186, + "grad_norm": 0.5494970933209252, + "learning_rate": 3.845524888710885e-05, + "loss": 0.4192, + "step": 416 + }, + { + "epoch": 1.0671785028790788, + "grad_norm": 0.6871657555654107, + "learning_rate": 3.844142247935265e-05, + "loss": 0.4392, + "step": 417 + }, + { + "epoch": 1.0697376839411388, + "grad_norm": 0.6255544924834299, + "learning_rate": 3.842753697808395e-05, + "loss": 0.4098, + "step": 418 + }, + { + "epoch": 1.072296865003199, + "grad_norm": 0.5236343059225219, + "learning_rate": 3.84135924277973e-05, + "loss": 0.4039, + "step": 419 + }, + { + "epoch": 1.0748560460652592, + "grad_norm": 0.7081136566843064, + "learning_rate": 3.839958887317649e-05, + "loss": 0.42, + "step": 420 + }, + { + "epoch": 1.0774152271273192, + "grad_norm": 0.630614377419562, + "learning_rate": 3.838552635909436e-05, + "loss": 0.4065, + "step": 421 + }, + { + "epoch": 1.0799744081893794, + "grad_norm": 0.5615204386321961, + "learning_rate": 3.8371404930612704e-05, + "loss": 0.4146, + "step": 422 + }, + { + "epoch": 1.0825335892514396, + "grad_norm": 0.6565436867003143, + "learning_rate": 3.835722463298208e-05, + "loss": 0.4064, + "step": 423 + }, + { + "epoch": 1.0850927703134996, + "grad_norm": 0.5737679071890739, + "learning_rate": 3.83429855116417e-05, + "loss": 0.4552, + "step": 424 + }, + { + "epoch": 1.0876519513755598, + "grad_norm": 0.6434114879042129, + "learning_rate": 3.832868761221926e-05, + "loss": 0.4441, + "step": 425 + }, + { + "epoch": 1.09021113243762, + "grad_norm": 0.6708602456648777, + "learning_rate": 3.831433098053082e-05, + "loss": 0.4022, + "step": 426 + }, + { + "epoch": 1.09277031349968, + "grad_norm": 0.623494522486998, + "learning_rate": 3.829991566258061e-05, + "loss": 0.4043, + "step": 427 + }, + { + "epoch": 1.0953294945617402, + "grad_norm": 0.698870377709763, + "learning_rate": 3.828544170456094e-05, + "loss": 0.4559, + "step": 428 + }, + { + "epoch": 1.0978886756238004, + "grad_norm": 0.6409469981980804, + "learning_rate": 3.827090915285202e-05, + "loss": 0.423, + "step": 429 + }, + { + "epoch": 1.1004478566858604, + "grad_norm": 0.7366884864992727, + "learning_rate": 3.825631805402182e-05, + "loss": 0.4878, + "step": 430 + }, + { + "epoch": 1.1030070377479206, + "grad_norm": 0.6346910715057278, + "learning_rate": 3.824166845482591e-05, + "loss": 0.3875, + "step": 431 + }, + { + "epoch": 1.1055662188099808, + "grad_norm": 0.598625329081273, + "learning_rate": 3.8226960402207316e-05, + "loss": 0.4201, + "step": 432 + }, + { + "epoch": 1.108125399872041, + "grad_norm": 0.7955423553177828, + "learning_rate": 3.821219394329638e-05, + "loss": 0.468, + "step": 433 + }, + { + "epoch": 1.110684580934101, + "grad_norm": 0.6767598095676679, + "learning_rate": 3.81973691254106e-05, + "loss": 0.4104, + "step": 434 + }, + { + "epoch": 1.1132437619961613, + "grad_norm": 0.669121013587649, + "learning_rate": 3.818248599605448e-05, + "loss": 0.3625, + "step": 435 + }, + { + "epoch": 1.1158029430582213, + "grad_norm": 0.9836919729491361, + "learning_rate": 3.816754460291936e-05, + "loss": 0.4852, + "step": 436 + }, + { + "epoch": 1.1183621241202815, + "grad_norm": 0.9574882546263436, + "learning_rate": 3.8152544993883305e-05, + "loss": 0.4003, + "step": 437 + }, + { + "epoch": 1.1209213051823417, + "grad_norm": 0.6647251327930666, + "learning_rate": 3.813748721701091e-05, + "loss": 0.4202, + "step": 438 + }, + { + "epoch": 1.1234804862444019, + "grad_norm": 1.05006176773705, + "learning_rate": 3.812237132055317e-05, + "loss": 0.4341, + "step": 439 + }, + { + "epoch": 1.1260396673064619, + "grad_norm": 0.7815506825876288, + "learning_rate": 3.810719735294731e-05, + "loss": 0.4748, + "step": 440 + }, + { + "epoch": 1.128598848368522, + "grad_norm": 1.0347544390312602, + "learning_rate": 3.809196536281665e-05, + "loss": 0.4248, + "step": 441 + }, + { + "epoch": 1.1311580294305823, + "grad_norm": 0.8757539929940961, + "learning_rate": 3.807667539897041e-05, + "loss": 0.3786, + "step": 442 + }, + { + "epoch": 1.1337172104926423, + "grad_norm": 0.869699779958049, + "learning_rate": 3.8061327510403624e-05, + "loss": 0.4397, + "step": 443 + }, + { + "epoch": 1.1362763915547025, + "grad_norm": 1.0544887595956864, + "learning_rate": 3.80459217462969e-05, + "loss": 0.4333, + "step": 444 + }, + { + "epoch": 1.1388355726167627, + "grad_norm": 0.7148540444466959, + "learning_rate": 3.8030458156016326e-05, + "loss": 0.432, + "step": 445 + }, + { + "epoch": 1.1413947536788227, + "grad_norm": 0.8417941713288487, + "learning_rate": 3.801493678911326e-05, + "loss": 0.4414, + "step": 446 + }, + { + "epoch": 1.143953934740883, + "grad_norm": 0.9182856799978838, + "learning_rate": 3.799935769532425e-05, + "loss": 0.4318, + "step": 447 + }, + { + "epoch": 1.1465131158029431, + "grad_norm": 0.6142048538916006, + "learning_rate": 3.798372092457076e-05, + "loss": 0.3898, + "step": 448 + }, + { + "epoch": 1.1490722968650031, + "grad_norm": 0.8780972905895119, + "learning_rate": 3.796802652695911e-05, + "loss": 0.5123, + "step": 449 + }, + { + "epoch": 1.1516314779270633, + "grad_norm": 0.7702566431359069, + "learning_rate": 3.795227455278029e-05, + "loss": 0.3752, + "step": 450 + }, + { + "epoch": 1.1541906589891235, + "grad_norm": 0.6619464755572778, + "learning_rate": 3.7936465052509744e-05, + "loss": 0.4028, + "step": 451 + }, + { + "epoch": 1.1567498400511835, + "grad_norm": 0.8748332649924442, + "learning_rate": 3.79205980768073e-05, + "loss": 0.4178, + "step": 452 + }, + { + "epoch": 1.1593090211132437, + "grad_norm": 0.5921826549497154, + "learning_rate": 3.790467367651694e-05, + "loss": 0.4034, + "step": 453 + }, + { + "epoch": 1.161868202175304, + "grad_norm": 0.8231905572303784, + "learning_rate": 3.788869190266664e-05, + "loss": 0.4934, + "step": 454 + }, + { + "epoch": 1.164427383237364, + "grad_norm": 0.7187628790605729, + "learning_rate": 3.787265280646825e-05, + "loss": 0.4113, + "step": 455 + }, + { + "epoch": 1.1669865642994242, + "grad_norm": 0.6550020914082988, + "learning_rate": 3.785655643931728e-05, + "loss": 0.4038, + "step": 456 + }, + { + "epoch": 1.1695457453614844, + "grad_norm": 0.7857598212931832, + "learning_rate": 3.784040285279279e-05, + "loss": 0.4083, + "step": 457 + }, + { + "epoch": 1.1721049264235446, + "grad_norm": 0.6478292689928322, + "learning_rate": 3.782419209865716e-05, + "loss": 0.387, + "step": 458 + }, + { + "epoch": 1.1746641074856046, + "grad_norm": 0.6645094823023456, + "learning_rate": 3.780792422885597e-05, + "loss": 0.3904, + "step": 459 + }, + { + "epoch": 1.1772232885476648, + "grad_norm": 0.6316724059123985, + "learning_rate": 3.7791599295517825e-05, + "loss": 0.4225, + "step": 460 + }, + { + "epoch": 1.1797824696097248, + "grad_norm": 0.5440033363984303, + "learning_rate": 3.777521735095418e-05, + "loss": 0.4116, + "step": 461 + }, + { + "epoch": 1.182341650671785, + "grad_norm": 0.7448918620076476, + "learning_rate": 3.7758778447659184e-05, + "loss": 0.4272, + "step": 462 + }, + { + "epoch": 1.1849008317338452, + "grad_norm": 0.5835830556541663, + "learning_rate": 3.774228263830948e-05, + "loss": 0.3958, + "step": 463 + }, + { + "epoch": 1.1874600127959054, + "grad_norm": 0.5982038873563293, + "learning_rate": 3.772572997576409e-05, + "loss": 0.4053, + "step": 464 + }, + { + "epoch": 1.1900191938579654, + "grad_norm": 0.5062712599156436, + "learning_rate": 3.7709120513064196e-05, + "loss": 0.3874, + "step": 465 + }, + { + "epoch": 1.1925783749200256, + "grad_norm": 0.606586616558905, + "learning_rate": 3.769245430343301e-05, + "loss": 0.4528, + "step": 466 + }, + { + "epoch": 1.1951375559820858, + "grad_norm": 0.7220231131524425, + "learning_rate": 3.767573140027556e-05, + "loss": 0.433, + "step": 467 + }, + { + "epoch": 1.1976967370441458, + "grad_norm": 0.6706551361731272, + "learning_rate": 3.7658951857178544e-05, + "loss": 0.443, + "step": 468 + }, + { + "epoch": 1.200255918106206, + "grad_norm": 0.5717527361383908, + "learning_rate": 3.764211572791017e-05, + "loss": 0.4669, + "step": 469 + }, + { + "epoch": 1.2028150991682662, + "grad_norm": 0.8106231588399798, + "learning_rate": 3.762522306641998e-05, + "loss": 0.406, + "step": 470 + }, + { + "epoch": 1.2053742802303262, + "grad_norm": 0.5810325663311328, + "learning_rate": 3.760827392683863e-05, + "loss": 0.4304, + "step": 471 + }, + { + "epoch": 1.2079334612923864, + "grad_norm": 0.8071567509837944, + "learning_rate": 3.759126836347779e-05, + "loss": 0.4044, + "step": 472 + }, + { + "epoch": 1.2104926423544466, + "grad_norm": 0.700979544013499, + "learning_rate": 3.757420643082991e-05, + "loss": 0.4397, + "step": 473 + }, + { + "epoch": 1.2130518234165066, + "grad_norm": 0.5863406678085621, + "learning_rate": 3.755708818356809e-05, + "loss": 0.4099, + "step": 474 + }, + { + "epoch": 1.2156110044785668, + "grad_norm": 0.6016554944675842, + "learning_rate": 3.7539913676545874e-05, + "loss": 0.4107, + "step": 475 + }, + { + "epoch": 1.218170185540627, + "grad_norm": 0.5831332187240583, + "learning_rate": 3.7522682964797066e-05, + "loss": 0.4023, + "step": 476 + }, + { + "epoch": 1.220729366602687, + "grad_norm": 0.6093541377057935, + "learning_rate": 3.75053961035356e-05, + "loss": 0.4301, + "step": 477 + }, + { + "epoch": 1.2232885476647473, + "grad_norm": 0.6465778983982824, + "learning_rate": 3.748805314815532e-05, + "loss": 0.3933, + "step": 478 + }, + { + "epoch": 1.2258477287268075, + "grad_norm": 0.5808579764017139, + "learning_rate": 3.7470654154229834e-05, + "loss": 0.4386, + "step": 479 + }, + { + "epoch": 1.2284069097888675, + "grad_norm": 0.7856607698111114, + "learning_rate": 3.745319917751229e-05, + "loss": 0.4201, + "step": 480 + }, + { + "epoch": 1.2309660908509277, + "grad_norm": 0.5050539819501365, + "learning_rate": 3.743568827393525e-05, + "loss": 0.4773, + "step": 481 + }, + { + "epoch": 1.2335252719129879, + "grad_norm": 0.8385075664472458, + "learning_rate": 3.741812149961049e-05, + "loss": 0.4041, + "step": 482 + }, + { + "epoch": 1.236084452975048, + "grad_norm": 0.624705443326678, + "learning_rate": 3.740049891082879e-05, + "loss": 0.4157, + "step": 483 + }, + { + "epoch": 1.238643634037108, + "grad_norm": 0.5436565794572649, + "learning_rate": 3.738282056405981e-05, + "loss": 0.3959, + "step": 484 + }, + { + "epoch": 1.2412028150991683, + "grad_norm": 0.785730847073682, + "learning_rate": 3.736508651595188e-05, + "loss": 0.4413, + "step": 485 + }, + { + "epoch": 1.2437619961612283, + "grad_norm": 0.5059153650301063, + "learning_rate": 3.734729682333179e-05, + "loss": 0.4033, + "step": 486 + }, + { + "epoch": 1.2463211772232885, + "grad_norm": 0.609111900762071, + "learning_rate": 3.732945154320467e-05, + "loss": 0.4282, + "step": 487 + }, + { + "epoch": 1.2488803582853487, + "grad_norm": 0.5621990676055562, + "learning_rate": 3.731155073275375e-05, + "loss": 0.411, + "step": 488 + }, + { + "epoch": 1.251439539347409, + "grad_norm": 0.5476414987684265, + "learning_rate": 3.729359444934022e-05, + "loss": 0.4217, + "step": 489 + }, + { + "epoch": 1.253998720409469, + "grad_norm": 0.5674756992559286, + "learning_rate": 3.727558275050301e-05, + "loss": 0.461, + "step": 490 + }, + { + "epoch": 1.2565579014715291, + "grad_norm": 0.6949881690129194, + "learning_rate": 3.725751569395863e-05, + "loss": 0.4621, + "step": 491 + }, + { + "epoch": 1.2591170825335891, + "grad_norm": 0.5458610089583471, + "learning_rate": 3.723939333760099e-05, + "loss": 0.4508, + "step": 492 + }, + { + "epoch": 1.2616762635956493, + "grad_norm": 0.5906724584503589, + "learning_rate": 3.7221215739501176e-05, + "loss": 0.4276, + "step": 493 + }, + { + "epoch": 1.2642354446577095, + "grad_norm": 0.4800178261816677, + "learning_rate": 3.720298295790732e-05, + "loss": 0.3921, + "step": 494 + }, + { + "epoch": 1.2667946257197698, + "grad_norm": 0.6636123062743209, + "learning_rate": 3.718469505124434e-05, + "loss": 0.4584, + "step": 495 + }, + { + "epoch": 1.2693538067818297, + "grad_norm": 0.5232376990167991, + "learning_rate": 3.716635207811385e-05, + "loss": 0.4168, + "step": 496 + }, + { + "epoch": 1.27191298784389, + "grad_norm": 0.6767875708314827, + "learning_rate": 3.714795409729388e-05, + "loss": 0.4379, + "step": 497 + }, + { + "epoch": 1.2744721689059502, + "grad_norm": 0.5605992428107962, + "learning_rate": 3.712950116773875e-05, + "loss": 0.4074, + "step": 498 + }, + { + "epoch": 1.2770313499680102, + "grad_norm": 0.5932800163478181, + "learning_rate": 3.711099334857884e-05, + "loss": 0.4194, + "step": 499 + }, + { + "epoch": 1.2795905310300704, + "grad_norm": 0.5438179500986763, + "learning_rate": 3.709243069912041e-05, + "loss": 0.3917, + "step": 500 + }, + { + "epoch": 1.2821497120921306, + "grad_norm": 0.7190898552769511, + "learning_rate": 3.707381327884545e-05, + "loss": 0.4717, + "step": 501 + }, + { + "epoch": 1.2847088931541908, + "grad_norm": 0.6816367378411385, + "learning_rate": 3.705514114741142e-05, + "loss": 0.3782, + "step": 502 + }, + { + "epoch": 1.2872680742162508, + "grad_norm": 0.5116844048821922, + "learning_rate": 3.703641436465114e-05, + "loss": 0.4225, + "step": 503 + }, + { + "epoch": 1.289827255278311, + "grad_norm": 0.6736005325770137, + "learning_rate": 3.70176329905725e-05, + "loss": 0.4444, + "step": 504 + }, + { + "epoch": 1.292386436340371, + "grad_norm": 0.5864619464379411, + "learning_rate": 3.699879708535838e-05, + "loss": 0.4354, + "step": 505 + }, + { + "epoch": 1.2949456174024312, + "grad_norm": 0.6078408475459052, + "learning_rate": 3.6979906709366334e-05, + "loss": 0.453, + "step": 506 + }, + { + "epoch": 1.2975047984644914, + "grad_norm": 0.7409797972431034, + "learning_rate": 3.696096192312852e-05, + "loss": 0.4365, + "step": 507 + }, + { + "epoch": 1.3000639795265516, + "grad_norm": 0.4775651025928874, + "learning_rate": 3.694196278735142e-05, + "loss": 0.4391, + "step": 508 + }, + { + "epoch": 1.3026231605886116, + "grad_norm": 0.5823632608853917, + "learning_rate": 3.692290936291568e-05, + "loss": 0.3875, + "step": 509 + }, + { + "epoch": 1.3051823416506718, + "grad_norm": 0.5307154121726435, + "learning_rate": 3.69038017108759e-05, + "loss": 0.4059, + "step": 510 + }, + { + "epoch": 1.3077415227127318, + "grad_norm": 0.5835304311686238, + "learning_rate": 3.688463989246045e-05, + "loss": 0.4505, + "step": 511 + }, + { + "epoch": 1.310300703774792, + "grad_norm": 0.5459622175115579, + "learning_rate": 3.686542396907128e-05, + "loss": 0.3994, + "step": 512 + }, + { + "epoch": 1.3128598848368522, + "grad_norm": 0.5266102200698167, + "learning_rate": 3.6846154002283696e-05, + "loss": 0.3954, + "step": 513 + }, + { + "epoch": 1.3154190658989124, + "grad_norm": 0.49436862141147236, + "learning_rate": 3.68268300538462e-05, + "loss": 0.4417, + "step": 514 + }, + { + "epoch": 1.3179782469609724, + "grad_norm": 0.5981532644758176, + "learning_rate": 3.680745218568026e-05, + "loss": 0.4382, + "step": 515 + }, + { + "epoch": 1.3205374280230326, + "grad_norm": 0.4757174461989479, + "learning_rate": 3.678802045988012e-05, + "loss": 0.3686, + "step": 516 + }, + { + "epoch": 1.3230966090850926, + "grad_norm": 0.6321671394432529, + "learning_rate": 3.676853493871262e-05, + "loss": 0.4418, + "step": 517 + }, + { + "epoch": 1.3256557901471528, + "grad_norm": 0.44736313515673864, + "learning_rate": 3.674899568461696e-05, + "loss": 0.4235, + "step": 518 + }, + { + "epoch": 1.328214971209213, + "grad_norm": 0.5494531167802622, + "learning_rate": 3.6729402760204535e-05, + "loss": 0.4069, + "step": 519 + }, + { + "epoch": 1.3307741522712733, + "grad_norm": 0.5068999873204574, + "learning_rate": 3.6709756228258735e-05, + "loss": 0.4284, + "step": 520 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.506062505003235, + "learning_rate": 3.669005615173469e-05, + "loss": 0.4438, + "step": 521 + }, + { + "epoch": 1.3358925143953935, + "grad_norm": 0.6379310859811307, + "learning_rate": 3.667030259375915e-05, + "loss": 0.4142, + "step": 522 + }, + { + "epoch": 1.3384516954574537, + "grad_norm": 0.42277797644227044, + "learning_rate": 3.665049561763021e-05, + "loss": 0.3805, + "step": 523 + }, + { + "epoch": 1.3410108765195137, + "grad_norm": 0.6090026974823245, + "learning_rate": 3.663063528681716e-05, + "loss": 0.4136, + "step": 524 + }, + { + "epoch": 1.3435700575815739, + "grad_norm": 0.46562372404588254, + "learning_rate": 3.6610721664960236e-05, + "loss": 0.4354, + "step": 525 + }, + { + "epoch": 1.346129238643634, + "grad_norm": 0.6500313998419536, + "learning_rate": 3.659075481587046e-05, + "loss": 0.4283, + "step": 526 + }, + { + "epoch": 1.3486884197056943, + "grad_norm": 0.5211253922160387, + "learning_rate": 3.65707348035294e-05, + "loss": 0.4255, + "step": 527 + }, + { + "epoch": 1.3512476007677543, + "grad_norm": 0.6220670330163766, + "learning_rate": 3.6550661692089e-05, + "loss": 0.4191, + "step": 528 + }, + { + "epoch": 1.3538067818298145, + "grad_norm": 0.544711113768934, + "learning_rate": 3.6530535545871326e-05, + "loss": 0.436, + "step": 529 + }, + { + "epoch": 1.3563659628918745, + "grad_norm": 0.7052970063859283, + "learning_rate": 3.65103564293684e-05, + "loss": 0.4949, + "step": 530 + }, + { + "epoch": 1.3589251439539347, + "grad_norm": 0.5145642841555808, + "learning_rate": 3.6490124407242007e-05, + "loss": 0.4131, + "step": 531 + }, + { + "epoch": 1.361484325015995, + "grad_norm": 0.5857771172798699, + "learning_rate": 3.646983954432342e-05, + "loss": 0.4146, + "step": 532 + }, + { + "epoch": 1.3640435060780551, + "grad_norm": 0.4920261736044566, + "learning_rate": 3.644950190561325e-05, + "loss": 0.4284, + "step": 533 + }, + { + "epoch": 1.3666026871401151, + "grad_norm": 0.5687057750503046, + "learning_rate": 3.642911155628124e-05, + "loss": 0.4514, + "step": 534 + }, + { + "epoch": 1.3691618682021753, + "grad_norm": 0.5724699735691123, + "learning_rate": 3.640866856166601e-05, + "loss": 0.4539, + "step": 535 + }, + { + "epoch": 1.3717210492642353, + "grad_norm": 0.5749209613889618, + "learning_rate": 3.6388172987274913e-05, + "loss": 0.3865, + "step": 536 + }, + { + "epoch": 1.3742802303262955, + "grad_norm": 0.6228395395499405, + "learning_rate": 3.636762489878374e-05, + "loss": 0.4075, + "step": 537 + }, + { + "epoch": 1.3768394113883557, + "grad_norm": 0.6903037733166263, + "learning_rate": 3.63470243620366e-05, + "loss": 0.4312, + "step": 538 + }, + { + "epoch": 1.379398592450416, + "grad_norm": 0.678573324042214, + "learning_rate": 3.632637144304565e-05, + "loss": 0.4806, + "step": 539 + }, + { + "epoch": 1.381957773512476, + "grad_norm": 0.6951420732428104, + "learning_rate": 3.6305666207990886e-05, + "loss": 0.439, + "step": 540 + }, + { + "epoch": 1.3845169545745362, + "grad_norm": 0.5961242888653673, + "learning_rate": 3.628490872321998e-05, + "loss": 0.4205, + "step": 541 + }, + { + "epoch": 1.3870761356365962, + "grad_norm": 0.7217418974601812, + "learning_rate": 3.626409905524799e-05, + "loss": 0.4707, + "step": 542 + }, + { + "epoch": 1.3896353166986564, + "grad_norm": 0.45054510458685476, + "learning_rate": 3.624323727075723e-05, + "loss": 0.4145, + "step": 543 + }, + { + "epoch": 1.3921944977607166, + "grad_norm": 0.7284286885213026, + "learning_rate": 3.622232343659698e-05, + "loss": 0.4299, + "step": 544 + }, + { + "epoch": 1.3947536788227768, + "grad_norm": 0.6769810527885796, + "learning_rate": 3.6201357619783336e-05, + "loss": 0.4163, + "step": 545 + }, + { + "epoch": 1.3973128598848368, + "grad_norm": 0.5221114603904703, + "learning_rate": 3.6180339887498953e-05, + "loss": 0.4443, + "step": 546 + }, + { + "epoch": 1.399872040946897, + "grad_norm": 0.6008896686645977, + "learning_rate": 3.615927030709284e-05, + "loss": 0.4318, + "step": 547 + }, + { + "epoch": 1.4024312220089572, + "grad_norm": 0.5343608825415525, + "learning_rate": 3.613814894608016e-05, + "loss": 0.4623, + "step": 548 + }, + { + "epoch": 1.4049904030710172, + "grad_norm": 0.691771745080796, + "learning_rate": 3.6116975872141984e-05, + "loss": 0.4624, + "step": 549 + }, + { + "epoch": 1.4075495841330774, + "grad_norm": 0.6259414433121822, + "learning_rate": 3.609575115312511e-05, + "loss": 0.4508, + "step": 550 + }, + { + "epoch": 1.4101087651951376, + "grad_norm": 0.6649399879213752, + "learning_rate": 3.607447485704182e-05, + "loss": 0.4143, + "step": 551 + }, + { + "epoch": 1.4126679462571978, + "grad_norm": 0.47253136462127165, + "learning_rate": 3.605314705206966e-05, + "loss": 0.4106, + "step": 552 + }, + { + "epoch": 1.4152271273192578, + "grad_norm": 0.5556693107195737, + "learning_rate": 3.603176780655124e-05, + "loss": 0.4616, + "step": 553 + }, + { + "epoch": 1.417786308381318, + "grad_norm": 0.4021083541093729, + "learning_rate": 3.601033718899401e-05, + "loss": 0.3928, + "step": 554 + }, + { + "epoch": 1.420345489443378, + "grad_norm": 0.5351580026934937, + "learning_rate": 3.598885526807003e-05, + "loss": 0.4661, + "step": 555 + }, + { + "epoch": 1.4229046705054382, + "grad_norm": 0.5247684064361327, + "learning_rate": 3.596732211261574e-05, + "loss": 0.4303, + "step": 556 + }, + { + "epoch": 1.4254638515674984, + "grad_norm": 0.49894549324782106, + "learning_rate": 3.594573779163179e-05, + "loss": 0.3938, + "step": 557 + }, + { + "epoch": 1.4280230326295587, + "grad_norm": 0.5751022801506976, + "learning_rate": 3.5924102374282754e-05, + "loss": 0.4401, + "step": 558 + }, + { + "epoch": 1.4305822136916186, + "grad_norm": 0.4560617808969142, + "learning_rate": 3.590241592989696e-05, + "loss": 0.4241, + "step": 559 + }, + { + "epoch": 1.4331413947536789, + "grad_norm": 0.6245655178581035, + "learning_rate": 3.5880678527966224e-05, + "loss": 0.4138, + "step": 560 + }, + { + "epoch": 1.4357005758157388, + "grad_norm": 0.5355963534405355, + "learning_rate": 3.5858890238145674e-05, + "loss": 0.4145, + "step": 561 + }, + { + "epoch": 1.438259756877799, + "grad_norm": 0.6422997737126074, + "learning_rate": 3.583705113025348e-05, + "loss": 0.4554, + "step": 562 + }, + { + "epoch": 1.4408189379398593, + "grad_norm": 0.6761962261433756, + "learning_rate": 3.581516127427068e-05, + "loss": 0.4176, + "step": 563 + }, + { + "epoch": 1.4433781190019195, + "grad_norm": 0.6187534229527232, + "learning_rate": 3.5793220740340904e-05, + "loss": 0.4255, + "step": 564 + }, + { + "epoch": 1.4459373000639795, + "grad_norm": 0.5255489769173007, + "learning_rate": 3.577122959877017e-05, + "loss": 0.4147, + "step": 565 + }, + { + "epoch": 1.4484964811260397, + "grad_norm": 0.5786362132356377, + "learning_rate": 3.57491879200267e-05, + "loss": 0.4018, + "step": 566 + }, + { + "epoch": 1.4510556621880997, + "grad_norm": 0.5297316242089579, + "learning_rate": 3.572709577474062e-05, + "loss": 0.4446, + "step": 567 + }, + { + "epoch": 1.4536148432501599, + "grad_norm": 0.5131558041266011, + "learning_rate": 3.570495323370378e-05, + "loss": 0.4475, + "step": 568 + }, + { + "epoch": 1.45617402431222, + "grad_norm": 0.7521443480114743, + "learning_rate": 3.568276036786952e-05, + "loss": 0.4091, + "step": 569 + }, + { + "epoch": 1.4587332053742803, + "grad_norm": 0.5969703958136027, + "learning_rate": 3.566051724835245e-05, + "loss": 0.4283, + "step": 570 + }, + { + "epoch": 1.4612923864363403, + "grad_norm": 0.4037709403552394, + "learning_rate": 3.5638223946428194e-05, + "loss": 0.4271, + "step": 571 + }, + { + "epoch": 1.4638515674984005, + "grad_norm": 0.5680529949916449, + "learning_rate": 3.561588053353319e-05, + "loss": 0.4253, + "step": 572 + }, + { + "epoch": 1.4664107485604607, + "grad_norm": 0.4379512610916079, + "learning_rate": 3.559348708126445e-05, + "loss": 0.3955, + "step": 573 + }, + { + "epoch": 1.4689699296225207, + "grad_norm": 0.6425572888305521, + "learning_rate": 3.557104366137934e-05, + "loss": 0.4208, + "step": 574 + }, + { + "epoch": 1.471529110684581, + "grad_norm": 0.5147430638168887, + "learning_rate": 3.554855034579532e-05, + "loss": 0.4206, + "step": 575 + }, + { + "epoch": 1.4740882917466411, + "grad_norm": 0.5773864382218094, + "learning_rate": 3.552600720658976e-05, + "loss": 0.3936, + "step": 576 + }, + { + "epoch": 1.4766474728087013, + "grad_norm": 0.5531560341828542, + "learning_rate": 3.550341431599967e-05, + "loss": 0.4674, + "step": 577 + }, + { + "epoch": 1.4792066538707613, + "grad_norm": 0.5447609179432047, + "learning_rate": 3.5480771746421494e-05, + "loss": 0.4032, + "step": 578 + }, + { + "epoch": 1.4817658349328215, + "grad_norm": 0.572196110734813, + "learning_rate": 3.545807957041084e-05, + "loss": 0.4509, + "step": 579 + }, + { + "epoch": 1.4843250159948815, + "grad_norm": 0.5028398333246705, + "learning_rate": 3.5435337860682304e-05, + "loss": 0.3334, + "step": 580 + }, + { + "epoch": 1.4868841970569417, + "grad_norm": 0.7160945031568345, + "learning_rate": 3.54125466901092e-05, + "loss": 0.4598, + "step": 581 + }, + { + "epoch": 1.489443378119002, + "grad_norm": 0.6791351765101247, + "learning_rate": 3.538970613172332e-05, + "loss": 0.4055, + "step": 582 + }, + { + "epoch": 1.4920025591810622, + "grad_norm": 0.8140085732718708, + "learning_rate": 3.536681625871474e-05, + "loss": 0.3982, + "step": 583 + }, + { + "epoch": 1.4945617402431222, + "grad_norm": 0.6600156473699682, + "learning_rate": 3.534387714443153e-05, + "loss": 0.4283, + "step": 584 + }, + { + "epoch": 1.4971209213051824, + "grad_norm": 0.9072053140385462, + "learning_rate": 3.532088886237956e-05, + "loss": 0.461, + "step": 585 + }, + { + "epoch": 1.4996801023672424, + "grad_norm": 0.5291141400904751, + "learning_rate": 3.5297851486222274e-05, + "loss": 0.4105, + "step": 586 + }, + { + "epoch": 1.5022392834293026, + "grad_norm": 0.8711742022509097, + "learning_rate": 3.527476508978039e-05, + "loss": 0.4266, + "step": 587 + }, + { + "epoch": 1.5047984644913628, + "grad_norm": 0.5011541715943939, + "learning_rate": 3.525162974703174e-05, + "loss": 0.4681, + "step": 588 + }, + { + "epoch": 1.507357645553423, + "grad_norm": 0.7884765548368484, + "learning_rate": 3.5228445532110996e-05, + "loss": 0.4341, + "step": 589 + }, + { + "epoch": 1.5099168266154832, + "grad_norm": 0.5740501726759667, + "learning_rate": 3.520521251930941e-05, + "loss": 0.4128, + "step": 590 + }, + { + "epoch": 1.5124760076775432, + "grad_norm": 0.5460891296171442, + "learning_rate": 3.518193078307463e-05, + "loss": 0.4188, + "step": 591 + }, + { + "epoch": 1.5150351887396032, + "grad_norm": 0.7055940264802735, + "learning_rate": 3.515860039801043e-05, + "loss": 0.3965, + "step": 592 + }, + { + "epoch": 1.5175943698016634, + "grad_norm": 0.5874392809254365, + "learning_rate": 3.513522143887645e-05, + "loss": 0.4918, + "step": 593 + }, + { + "epoch": 1.5201535508637236, + "grad_norm": 0.6245938510885568, + "learning_rate": 3.5111793980588006e-05, + "loss": 0.4285, + "step": 594 + }, + { + "epoch": 1.5227127319257838, + "grad_norm": 0.48774952340541083, + "learning_rate": 3.5088318098215805e-05, + "loss": 0.4013, + "step": 595 + }, + { + "epoch": 1.525271912987844, + "grad_norm": 0.4468453631601922, + "learning_rate": 3.506479386698575e-05, + "loss": 0.3958, + "step": 596 + }, + { + "epoch": 1.527831094049904, + "grad_norm": 0.5538376365976897, + "learning_rate": 3.5041221362278644e-05, + "loss": 0.4347, + "step": 597 + }, + { + "epoch": 1.530390275111964, + "grad_norm": 0.4775499856280007, + "learning_rate": 3.5017600659629986e-05, + "loss": 0.4484, + "step": 598 + }, + { + "epoch": 1.5329494561740242, + "grad_norm": 0.5259243816931762, + "learning_rate": 3.499393183472973e-05, + "loss": 0.4211, + "step": 599 + }, + { + "epoch": 1.5355086372360844, + "grad_norm": 0.5198109524806964, + "learning_rate": 3.497021496342203e-05, + "loss": 0.4363, + "step": 600 + }, + { + "epoch": 1.5380678182981447, + "grad_norm": 0.4696941806509467, + "learning_rate": 3.494645012170498e-05, + "loss": 0.4295, + "step": 601 + }, + { + "epoch": 1.5406269993602049, + "grad_norm": 0.6474700305067774, + "learning_rate": 3.4922637385730406e-05, + "loss": 0.5, + "step": 602 + }, + { + "epoch": 1.5431861804222649, + "grad_norm": 0.4793948522260617, + "learning_rate": 3.489877683180362e-05, + "loss": 0.3845, + "step": 603 + }, + { + "epoch": 1.545745361484325, + "grad_norm": 0.539237627319592, + "learning_rate": 3.487486853638314e-05, + "loss": 0.4356, + "step": 604 + }, + { + "epoch": 1.548304542546385, + "grad_norm": 0.5532057074478213, + "learning_rate": 3.485091257608047e-05, + "loss": 0.3891, + "step": 605 + }, + { + "epoch": 1.5508637236084453, + "grad_norm": 0.6992089262871057, + "learning_rate": 3.482690902765984e-05, + "loss": 0.4571, + "step": 606 + }, + { + "epoch": 1.5534229046705055, + "grad_norm": 0.4638019008983174, + "learning_rate": 3.4802857968038e-05, + "loss": 0.4188, + "step": 607 + }, + { + "epoch": 1.5559820857325657, + "grad_norm": 0.7914532265925823, + "learning_rate": 3.4778759474283936e-05, + "loss": 0.4534, + "step": 608 + }, + { + "epoch": 1.5585412667946257, + "grad_norm": 0.5295039816146513, + "learning_rate": 3.475461362361861e-05, + "loss": 0.4001, + "step": 609 + }, + { + "epoch": 1.561100447856686, + "grad_norm": 0.6132941939025509, + "learning_rate": 3.473042049341474e-05, + "loss": 0.4225, + "step": 610 + }, + { + "epoch": 1.5636596289187459, + "grad_norm": 0.714057587477597, + "learning_rate": 3.470618016119658e-05, + "loss": 0.4136, + "step": 611 + }, + { + "epoch": 1.566218809980806, + "grad_norm": 0.5320432430474729, + "learning_rate": 3.468189270463959e-05, + "loss": 0.4004, + "step": 612 + }, + { + "epoch": 1.5687779910428663, + "grad_norm": 0.66784032505952, + "learning_rate": 3.465755820157026e-05, + "loss": 0.4065, + "step": 613 + }, + { + "epoch": 1.5713371721049265, + "grad_norm": 0.49152795733031895, + "learning_rate": 3.463317672996583e-05, + "loss": 0.3791, + "step": 614 + }, + { + "epoch": 1.5738963531669867, + "grad_norm": 0.555185505506747, + "learning_rate": 3.4608748367954064e-05, + "loss": 0.4633, + "step": 615 + }, + { + "epoch": 1.5764555342290467, + "grad_norm": 0.4980166871980404, + "learning_rate": 3.4584273193812956e-05, + "loss": 0.4252, + "step": 616 + }, + { + "epoch": 1.5790147152911067, + "grad_norm": 0.589732498310526, + "learning_rate": 3.45597512859705e-05, + "loss": 0.4839, + "step": 617 + }, + { + "epoch": 1.581573896353167, + "grad_norm": 0.5743441085203349, + "learning_rate": 3.4535182723004466e-05, + "loss": 0.4062, + "step": 618 + }, + { + "epoch": 1.5841330774152271, + "grad_norm": 0.49126577666625093, + "learning_rate": 3.451056758364212e-05, + "loss": 0.4135, + "step": 619 + }, + { + "epoch": 1.5866922584772873, + "grad_norm": 0.5037192947792559, + "learning_rate": 3.4485905946759965e-05, + "loss": 0.459, + "step": 620 + }, + { + "epoch": 1.5892514395393476, + "grad_norm": 0.5439325021578344, + "learning_rate": 3.446119789138351e-05, + "loss": 0.3882, + "step": 621 + }, + { + "epoch": 1.5918106206014075, + "grad_norm": 0.5020489297705218, + "learning_rate": 3.443644349668701e-05, + "loss": 0.4053, + "step": 622 + }, + { + "epoch": 1.5943698016634675, + "grad_norm": 0.5634072314676822, + "learning_rate": 3.4411642841993185e-05, + "loss": 0.4065, + "step": 623 + }, + { + "epoch": 1.5969289827255277, + "grad_norm": 0.4911032231149214, + "learning_rate": 3.438679600677303e-05, + "loss": 0.4207, + "step": 624 + }, + { + "epoch": 1.599488163787588, + "grad_norm": 0.6463093595410542, + "learning_rate": 3.4361903070645484e-05, + "loss": 0.4195, + "step": 625 + }, + { + "epoch": 1.6020473448496482, + "grad_norm": 0.5395555876619189, + "learning_rate": 3.433696411337723e-05, + "loss": 0.4359, + "step": 626 + }, + { + "epoch": 1.6046065259117084, + "grad_norm": 0.5456191870776568, + "learning_rate": 3.431197921488242e-05, + "loss": 0.4325, + "step": 627 + }, + { + "epoch": 1.6071657069737684, + "grad_norm": 0.5181489907237284, + "learning_rate": 3.4286948455222425e-05, + "loss": 0.4262, + "step": 628 + }, + { + "epoch": 1.6097248880358286, + "grad_norm": 0.4542522655783656, + "learning_rate": 3.426187191460555e-05, + "loss": 0.4008, + "step": 629 + }, + { + "epoch": 1.6122840690978886, + "grad_norm": 0.5748315523058384, + "learning_rate": 3.423674967338681e-05, + "loss": 0.4613, + "step": 630 + }, + { + "epoch": 1.6148432501599488, + "grad_norm": 0.4893291757606303, + "learning_rate": 3.421158181206769e-05, + "loss": 0.411, + "step": 631 + }, + { + "epoch": 1.617402431222009, + "grad_norm": 0.580837787265809, + "learning_rate": 3.418636841129582e-05, + "loss": 0.417, + "step": 632 + }, + { + "epoch": 1.6199616122840692, + "grad_norm": 0.6495075917348687, + "learning_rate": 3.416110955186477e-05, + "loss": 0.4817, + "step": 633 + }, + { + "epoch": 1.6225207933461292, + "grad_norm": 0.45060911216179744, + "learning_rate": 3.4135805314713804e-05, + "loss": 0.4033, + "step": 634 + }, + { + "epoch": 1.6250799744081894, + "grad_norm": 0.5998653037164785, + "learning_rate": 3.411045578092754e-05, + "loss": 0.3912, + "step": 635 + }, + { + "epoch": 1.6276391554702494, + "grad_norm": 0.5310655122776353, + "learning_rate": 3.4085061031735794e-05, + "loss": 0.4313, + "step": 636 + }, + { + "epoch": 1.6301983365323096, + "grad_norm": 0.6388088047336757, + "learning_rate": 3.405962114851324e-05, + "loss": 0.4433, + "step": 637 + }, + { + "epoch": 1.6327575175943698, + "grad_norm": 0.5039515005309823, + "learning_rate": 3.4034136212779195e-05, + "loss": 0.414, + "step": 638 + }, + { + "epoch": 1.63531669865643, + "grad_norm": 0.6351418548004399, + "learning_rate": 3.4008606306197336e-05, + "loss": 0.4271, + "step": 639 + }, + { + "epoch": 1.6378758797184902, + "grad_norm": 0.5382251510311172, + "learning_rate": 3.398303151057543e-05, + "loss": 0.4223, + "step": 640 + }, + { + "epoch": 1.6404350607805502, + "grad_norm": 0.5213424824749899, + "learning_rate": 3.3957411907865123e-05, + "loss": 0.4169, + "step": 641 + }, + { + "epoch": 1.6429942418426102, + "grad_norm": 0.6114074897511479, + "learning_rate": 3.393174758016161e-05, + "loss": 0.4141, + "step": 642 + }, + { + "epoch": 1.6455534229046704, + "grad_norm": 0.5077805851469291, + "learning_rate": 3.39060386097034e-05, + "loss": 0.4354, + "step": 643 + }, + { + "epoch": 1.6481126039667306, + "grad_norm": 0.5648032929681525, + "learning_rate": 3.3880285078872076e-05, + "loss": 0.3944, + "step": 644 + }, + { + "epoch": 1.6506717850287909, + "grad_norm": 0.5623890506817537, + "learning_rate": 3.385448707019199e-05, + "loss": 0.463, + "step": 645 + }, + { + "epoch": 1.653230966090851, + "grad_norm": 0.47253413270859174, + "learning_rate": 3.382864466633003e-05, + "loss": 0.4179, + "step": 646 + }, + { + "epoch": 1.655790147152911, + "grad_norm": 0.6667403121893724, + "learning_rate": 3.3802757950095346e-05, + "loss": 0.4401, + "step": 647 + }, + { + "epoch": 1.658349328214971, + "grad_norm": 0.48362697457689036, + "learning_rate": 3.377682700443907e-05, + "loss": 0.4294, + "step": 648 + }, + { + "epoch": 1.6609085092770313, + "grad_norm": 0.49305745599661777, + "learning_rate": 3.375085191245407e-05, + "loss": 0.4166, + "step": 649 + }, + { + "epoch": 1.6634676903390915, + "grad_norm": 0.5645696987568555, + "learning_rate": 3.372483275737468e-05, + "loss": 0.3922, + "step": 650 + }, + { + "epoch": 1.6660268714011517, + "grad_norm": 0.6497192793696049, + "learning_rate": 3.3698769622576404e-05, + "loss": 0.484, + "step": 651 + }, + { + "epoch": 1.668586052463212, + "grad_norm": 0.6004795627408669, + "learning_rate": 3.367266259157572e-05, + "loss": 0.4744, + "step": 652 + }, + { + "epoch": 1.671145233525272, + "grad_norm": 0.6035991357132837, + "learning_rate": 3.364651174802974e-05, + "loss": 0.4576, + "step": 653 + }, + { + "epoch": 1.673704414587332, + "grad_norm": 0.5638974284467396, + "learning_rate": 3.3620317175735945e-05, + "loss": 0.3829, + "step": 654 + }, + { + "epoch": 1.676263595649392, + "grad_norm": 0.6144746966380764, + "learning_rate": 3.359407895863199e-05, + "loss": 0.4219, + "step": 655 + }, + { + "epoch": 1.6788227767114523, + "grad_norm": 0.6330635207423285, + "learning_rate": 3.356779718079534e-05, + "loss": 0.3939, + "step": 656 + }, + { + "epoch": 1.6813819577735125, + "grad_norm": 0.6511893943323425, + "learning_rate": 3.3541471926443084e-05, + "loss": 0.4626, + "step": 657 + }, + { + "epoch": 1.6839411388355727, + "grad_norm": 0.7313805162264239, + "learning_rate": 3.3515103279931584e-05, + "loss": 0.4443, + "step": 658 + }, + { + "epoch": 1.6865003198976327, + "grad_norm": 0.4567231810719478, + "learning_rate": 3.3488691325756294e-05, + "loss": 0.4072, + "step": 659 + }, + { + "epoch": 1.689059500959693, + "grad_norm": 0.6498464270828352, + "learning_rate": 3.34622361485514e-05, + "loss": 0.4532, + "step": 660 + }, + { + "epoch": 1.691618682021753, + "grad_norm": 0.46427052887792153, + "learning_rate": 3.343573783308964e-05, + "loss": 0.4266, + "step": 661 + }, + { + "epoch": 1.6941778630838131, + "grad_norm": 0.5468516675156209, + "learning_rate": 3.340919646428193e-05, + "loss": 0.4208, + "step": 662 + }, + { + "epoch": 1.6967370441458733, + "grad_norm": 0.5751528606477143, + "learning_rate": 3.3382612127177166e-05, + "loss": 0.4146, + "step": 663 + }, + { + "epoch": 1.6992962252079336, + "grad_norm": 0.5793295201191827, + "learning_rate": 3.335598490696196e-05, + "loss": 0.4623, + "step": 664 + }, + { + "epoch": 1.7018554062699938, + "grad_norm": 0.7077864391730174, + "learning_rate": 3.332931488896029e-05, + "loss": 0.4459, + "step": 665 + }, + { + "epoch": 1.7044145873320538, + "grad_norm": 0.4786371924890489, + "learning_rate": 3.330260215863332e-05, + "loss": 0.3967, + "step": 666 + }, + { + "epoch": 1.7069737683941137, + "grad_norm": 0.71627935746326, + "learning_rate": 3.327584680157904e-05, + "loss": 0.4466, + "step": 667 + }, + { + "epoch": 1.709532949456174, + "grad_norm": 0.5672877205591593, + "learning_rate": 3.3249048903532075e-05, + "loss": 0.4245, + "step": 668 + }, + { + "epoch": 1.7120921305182342, + "grad_norm": 0.5967392434258936, + "learning_rate": 3.322220855036333e-05, + "loss": 0.4399, + "step": 669 + }, + { + "epoch": 1.7146513115802944, + "grad_norm": 0.619642871257619, + "learning_rate": 3.319532582807977e-05, + "loss": 0.4429, + "step": 670 + }, + { + "epoch": 1.7172104926423546, + "grad_norm": 0.5718993726856418, + "learning_rate": 3.316840082282412e-05, + "loss": 0.4049, + "step": 671 + }, + { + "epoch": 1.7197696737044146, + "grad_norm": 0.7158788807881703, + "learning_rate": 3.314143362087462e-05, + "loss": 0.465, + "step": 672 + }, + { + "epoch": 1.7223288547664746, + "grad_norm": 0.49801282221595167, + "learning_rate": 3.3114424308644686e-05, + "loss": 0.4304, + "step": 673 + }, + { + "epoch": 1.7248880358285348, + "grad_norm": 0.732000530911472, + "learning_rate": 3.3087372972682703e-05, + "loss": 0.4496, + "step": 674 + }, + { + "epoch": 1.727447216890595, + "grad_norm": 0.5016534768936702, + "learning_rate": 3.30602796996717e-05, + "loss": 0.4196, + "step": 675 + }, + { + "epoch": 1.7300063979526552, + "grad_norm": 0.6732998458752849, + "learning_rate": 3.303314457642911e-05, + "loss": 0.4377, + "step": 676 + }, + { + "epoch": 1.7325655790147154, + "grad_norm": 0.512135853555358, + "learning_rate": 3.300596768990644e-05, + "loss": 0.4032, + "step": 677 + }, + { + "epoch": 1.7351247600767754, + "grad_norm": 0.5565097137954512, + "learning_rate": 3.297874912718902e-05, + "loss": 0.4124, + "step": 678 + }, + { + "epoch": 1.7376839411388356, + "grad_norm": 0.5579693456602185, + "learning_rate": 3.2951488975495785e-05, + "loss": 0.4493, + "step": 679 + }, + { + "epoch": 1.7402431222008956, + "grad_norm": 0.5767956410825538, + "learning_rate": 3.2924187322178865e-05, + "loss": 0.4701, + "step": 680 + }, + { + "epoch": 1.7428023032629558, + "grad_norm": 0.474585132199083, + "learning_rate": 3.2896844254723414e-05, + "loss": 0.4118, + "step": 681 + }, + { + "epoch": 1.745361484325016, + "grad_norm": 0.44776574599085095, + "learning_rate": 3.28694598607473e-05, + "loss": 0.3928, + "step": 682 + }, + { + "epoch": 1.7479206653870762, + "grad_norm": 0.4613485770735262, + "learning_rate": 3.28420342280008e-05, + "loss": 0.4185, + "step": 683 + }, + { + "epoch": 1.7504798464491362, + "grad_norm": 0.6343181624558385, + "learning_rate": 3.281456744436634e-05, + "loss": 0.4133, + "step": 684 + }, + { + "epoch": 1.7530390275111964, + "grad_norm": 0.5493779464654077, + "learning_rate": 3.278705959785821e-05, + "loss": 0.4671, + "step": 685 + }, + { + "epoch": 1.7555982085732564, + "grad_norm": 0.5441386561154584, + "learning_rate": 3.2759510776622274e-05, + "loss": 0.4453, + "step": 686 + }, + { + "epoch": 1.7581573896353166, + "grad_norm": 0.629823120577556, + "learning_rate": 3.273192106893572e-05, + "loss": 0.3839, + "step": 687 + }, + { + "epoch": 1.7607165706973769, + "grad_norm": 0.549530453190963, + "learning_rate": 3.270429056320672e-05, + "loss": 0.4502, + "step": 688 + }, + { + "epoch": 1.763275751759437, + "grad_norm": 0.6031975081301458, + "learning_rate": 3.26766193479742e-05, + "loss": 0.4738, + "step": 689 + }, + { + "epoch": 1.7658349328214973, + "grad_norm": 0.48136220411058944, + "learning_rate": 3.2648907511907544e-05, + "loss": 0.4036, + "step": 690 + }, + { + "epoch": 1.7683941138835573, + "grad_norm": 0.6253783336704821, + "learning_rate": 3.262115514380628e-05, + "loss": 0.4081, + "step": 691 + }, + { + "epoch": 1.7709532949456173, + "grad_norm": 0.47939071537048983, + "learning_rate": 3.25933623325998e-05, + "loss": 0.4314, + "step": 692 + }, + { + "epoch": 1.7735124760076775, + "grad_norm": 0.4518873833371369, + "learning_rate": 3.256552916734713e-05, + "loss": 0.3986, + "step": 693 + }, + { + "epoch": 1.7760716570697377, + "grad_norm": 0.6074017672602955, + "learning_rate": 3.25376557372366e-05, + "loss": 0.4324, + "step": 694 + }, + { + "epoch": 1.778630838131798, + "grad_norm": 0.470674972052956, + "learning_rate": 3.250974213158555e-05, + "loss": 0.3933, + "step": 695 + }, + { + "epoch": 1.781190019193858, + "grad_norm": 0.6255159824430874, + "learning_rate": 3.248178843984006e-05, + "loss": 0.4252, + "step": 696 + }, + { + "epoch": 1.783749200255918, + "grad_norm": 0.5371760909763971, + "learning_rate": 3.245379475157465e-05, + "loss": 0.4778, + "step": 697 + }, + { + "epoch": 1.786308381317978, + "grad_norm": 0.5168010335153898, + "learning_rate": 3.242576115649205e-05, + "loss": 0.4229, + "step": 698 + }, + { + "epoch": 1.7888675623800383, + "grad_norm": 0.49166449165933496, + "learning_rate": 3.239768774442281e-05, + "loss": 0.4005, + "step": 699 + }, + { + "epoch": 1.7914267434420985, + "grad_norm": 0.4932386148580624, + "learning_rate": 3.23695746053251e-05, + "loss": 0.4163, + "step": 700 + }, + { + "epoch": 1.7939859245041587, + "grad_norm": 0.5880360652699835, + "learning_rate": 3.2341421829284394e-05, + "loss": 0.4413, + "step": 701 + }, + { + "epoch": 1.796545105566219, + "grad_norm": 0.4625923123089751, + "learning_rate": 3.2313229506513167e-05, + "loss": 0.426, + "step": 702 + }, + { + "epoch": 1.799104286628279, + "grad_norm": 0.5109208128171377, + "learning_rate": 3.228499772735062e-05, + "loss": 0.393, + "step": 703 + }, + { + "epoch": 1.8016634676903391, + "grad_norm": 0.4806179945017673, + "learning_rate": 3.2256726582262384e-05, + "loss": 0.4479, + "step": 704 + }, + { + "epoch": 1.8042226487523991, + "grad_norm": 0.4970665904869278, + "learning_rate": 3.222841616184025e-05, + "loss": 0.4318, + "step": 705 + }, + { + "epoch": 1.8067818298144593, + "grad_norm": 0.5666482942373245, + "learning_rate": 3.220006655680183e-05, + "loss": 0.4245, + "step": 706 + }, + { + "epoch": 1.8093410108765196, + "grad_norm": 0.4999181748027583, + "learning_rate": 3.2171677857990334e-05, + "loss": 0.4372, + "step": 707 + }, + { + "epoch": 1.8119001919385798, + "grad_norm": 0.47380754642052403, + "learning_rate": 3.2143250156374226e-05, + "loss": 0.3926, + "step": 708 + }, + { + "epoch": 1.8144593730006398, + "grad_norm": 0.6008131918939661, + "learning_rate": 3.211478354304695e-05, + "loss": 0.4533, + "step": 709 + }, + { + "epoch": 1.8170185540627, + "grad_norm": 0.5144973230871912, + "learning_rate": 3.208627810922665e-05, + "loss": 0.4352, + "step": 710 + }, + { + "epoch": 1.81957773512476, + "grad_norm": 0.4699623742277227, + "learning_rate": 3.2057733946255844e-05, + "loss": 0.3852, + "step": 711 + }, + { + "epoch": 1.8221369161868202, + "grad_norm": 0.4961706904135912, + "learning_rate": 3.202915114560118e-05, + "loss": 0.4445, + "step": 712 + }, + { + "epoch": 1.8246960972488804, + "grad_norm": 0.561279443158619, + "learning_rate": 3.200052979885309e-05, + "loss": 0.4802, + "step": 713 + }, + { + "epoch": 1.8272552783109406, + "grad_norm": 0.4557933345271347, + "learning_rate": 3.197186999772555e-05, + "loss": 0.4029, + "step": 714 + }, + { + "epoch": 1.8298144593730008, + "grad_norm": 0.589112376636283, + "learning_rate": 3.194317183405573e-05, + "loss": 0.4563, + "step": 715 + }, + { + "epoch": 1.8323736404350608, + "grad_norm": 0.5161326824368248, + "learning_rate": 3.191443539980374e-05, + "loss": 0.4556, + "step": 716 + }, + { + "epoch": 1.8349328214971208, + "grad_norm": 0.464424728984026, + "learning_rate": 3.188566078705235e-05, + "loss": 0.4044, + "step": 717 + }, + { + "epoch": 1.837492002559181, + "grad_norm": 0.47067836600914265, + "learning_rate": 3.1856848088006636e-05, + "loss": 0.4335, + "step": 718 + }, + { + "epoch": 1.8400511836212412, + "grad_norm": 0.5769491747912345, + "learning_rate": 3.182799739499371e-05, + "loss": 0.4407, + "step": 719 + }, + { + "epoch": 1.8426103646833014, + "grad_norm": 0.4932465377591071, + "learning_rate": 3.1799108800462466e-05, + "loss": 0.4328, + "step": 720 + }, + { + "epoch": 1.8451695457453616, + "grad_norm": 0.488480860113565, + "learning_rate": 3.177018239698322e-05, + "loss": 0.4235, + "step": 721 + }, + { + "epoch": 1.8477287268074216, + "grad_norm": 0.48833589337954714, + "learning_rate": 3.1741218277247466e-05, + "loss": 0.4132, + "step": 722 + }, + { + "epoch": 1.8502879078694816, + "grad_norm": 0.39099920314827113, + "learning_rate": 3.1712216534067536e-05, + "loss": 0.4265, + "step": 723 + }, + { + "epoch": 1.8528470889315418, + "grad_norm": 0.45542998139885993, + "learning_rate": 3.168317726037634e-05, + "loss": 0.3971, + "step": 724 + }, + { + "epoch": 1.855406269993602, + "grad_norm": 0.46738782533250195, + "learning_rate": 3.1654100549227024e-05, + "loss": 0.4559, + "step": 725 + }, + { + "epoch": 1.8579654510556622, + "grad_norm": 0.40700257974662957, + "learning_rate": 3.1624986493792735e-05, + "loss": 0.4135, + "step": 726 + }, + { + "epoch": 1.8605246321177225, + "grad_norm": 0.4797435460256252, + "learning_rate": 3.159583518736625e-05, + "loss": 0.4463, + "step": 727 + }, + { + "epoch": 1.8630838131797824, + "grad_norm": 0.47740901056002083, + "learning_rate": 3.156664672335973e-05, + "loss": 0.3884, + "step": 728 + }, + { + "epoch": 1.8656429942418427, + "grad_norm": 0.49631875720308977, + "learning_rate": 3.153742119530441e-05, + "loss": 0.4162, + "step": 729 + }, + { + "epoch": 1.8682021753039026, + "grad_norm": 0.49356825084947964, + "learning_rate": 3.1508158696850275e-05, + "loss": 0.4329, + "step": 730 + }, + { + "epoch": 1.8707613563659629, + "grad_norm": 0.4599036508089157, + "learning_rate": 3.1478859321765796e-05, + "loss": 0.428, + "step": 731 + }, + { + "epoch": 1.873320537428023, + "grad_norm": 0.48294927528675924, + "learning_rate": 3.144952316393758e-05, + "loss": 0.4058, + "step": 732 + }, + { + "epoch": 1.8758797184900833, + "grad_norm": 0.5890205448596298, + "learning_rate": 3.142015031737016e-05, + "loss": 0.4776, + "step": 733 + }, + { + "epoch": 1.8784388995521433, + "grad_norm": 0.4923421153314979, + "learning_rate": 3.139074087618556e-05, + "loss": 0.4045, + "step": 734 + }, + { + "epoch": 1.8809980806142035, + "grad_norm": 0.5087618925492778, + "learning_rate": 3.136129493462312e-05, + "loss": 0.4275, + "step": 735 + }, + { + "epoch": 1.8835572616762635, + "grad_norm": 0.5648227631500222, + "learning_rate": 3.133181258703912e-05, + "loss": 0.4727, + "step": 736 + }, + { + "epoch": 1.8861164427383237, + "grad_norm": 0.5234994479526746, + "learning_rate": 3.1302293927906516e-05, + "loss": 0.3967, + "step": 737 + }, + { + "epoch": 1.888675623800384, + "grad_norm": 0.6807348510101979, + "learning_rate": 3.1272739051814594e-05, + "loss": 0.4551, + "step": 738 + }, + { + "epoch": 1.891234804862444, + "grad_norm": 0.4969832272412207, + "learning_rate": 3.1243148053468715e-05, + "loss": 0.3773, + "step": 739 + }, + { + "epoch": 1.8937939859245043, + "grad_norm": 0.6317866448620022, + "learning_rate": 3.121352102768998e-05, + "loss": 0.4389, + "step": 740 + }, + { + "epoch": 1.8963531669865643, + "grad_norm": 0.5131091797313253, + "learning_rate": 3.1183858069414936e-05, + "loss": 0.4458, + "step": 741 + }, + { + "epoch": 1.8989123480486243, + "grad_norm": 0.4972035729160381, + "learning_rate": 3.115415927369529e-05, + "loss": 0.4451, + "step": 742 + }, + { + "epoch": 1.9014715291106845, + "grad_norm": 0.47895024578706524, + "learning_rate": 3.112442473569754e-05, + "loss": 0.4324, + "step": 743 + }, + { + "epoch": 1.9040307101727447, + "grad_norm": 0.4691676491006599, + "learning_rate": 3.109465455070278e-05, + "loss": 0.4035, + "step": 744 + }, + { + "epoch": 1.906589891234805, + "grad_norm": 0.4736286307947326, + "learning_rate": 3.106484881410628e-05, + "loss": 0.4446, + "step": 745 + }, + { + "epoch": 1.9091490722968651, + "grad_norm": 0.4359831705290721, + "learning_rate": 3.103500762141725e-05, + "loss": 0.3829, + "step": 746 + }, + { + "epoch": 1.9117082533589251, + "grad_norm": 0.462916560551583, + "learning_rate": 3.1005131068258506e-05, + "loss": 0.4107, + "step": 747 + }, + { + "epoch": 1.9142674344209851, + "grad_norm": 0.47173985630781595, + "learning_rate": 3.09752192503662e-05, + "loss": 0.3903, + "step": 748 + }, + { + "epoch": 1.9168266154830453, + "grad_norm": 0.43246703137114556, + "learning_rate": 3.094527226358945e-05, + "loss": 0.4091, + "step": 749 + }, + { + "epoch": 1.9193857965451055, + "grad_norm": 0.5056443186885541, + "learning_rate": 3.091529020389009e-05, + "loss": 0.4837, + "step": 750 + }, + { + "epoch": 1.9219449776071658, + "grad_norm": 0.49376735973598973, + "learning_rate": 3.088527316734235e-05, + "loss": 0.4124, + "step": 751 + }, + { + "epoch": 1.924504158669226, + "grad_norm": 0.5428642850508197, + "learning_rate": 3.08552212501325e-05, + "loss": 0.4304, + "step": 752 + }, + { + "epoch": 1.927063339731286, + "grad_norm": 0.5365719180678239, + "learning_rate": 3.082513454855863e-05, + "loss": 0.405, + "step": 753 + }, + { + "epoch": 1.9296225207933462, + "grad_norm": 0.4559502049703374, + "learning_rate": 3.079501315903026e-05, + "loss": 0.445, + "step": 754 + }, + { + "epoch": 1.9321817018554062, + "grad_norm": 0.5222902812164878, + "learning_rate": 3.076485717806808e-05, + "loss": 0.3726, + "step": 755 + }, + { + "epoch": 1.9347408829174664, + "grad_norm": 0.5737561385558596, + "learning_rate": 3.073466670230361e-05, + "loss": 0.4588, + "step": 756 + }, + { + "epoch": 1.9373000639795266, + "grad_norm": 0.43383831135928497, + "learning_rate": 3.070444182847891e-05, + "loss": 0.4006, + "step": 757 + }, + { + "epoch": 1.9398592450415868, + "grad_norm": 0.49738529963698463, + "learning_rate": 3.067418265344628e-05, + "loss": 0.404, + "step": 758 + }, + { + "epoch": 1.9424184261036468, + "grad_norm": 0.4779872060995513, + "learning_rate": 3.0643889274167926e-05, + "loss": 0.4642, + "step": 759 + }, + { + "epoch": 1.944977607165707, + "grad_norm": 0.4501703649941174, + "learning_rate": 3.061356178771564e-05, + "loss": 0.3845, + "step": 760 + }, + { + "epoch": 1.947536788227767, + "grad_norm": 0.566851781049989, + "learning_rate": 3.058320029127052e-05, + "loss": 0.4603, + "step": 761 + }, + { + "epoch": 1.9500959692898272, + "grad_norm": 0.41716803055724166, + "learning_rate": 3.055280488212266e-05, + "loss": 0.3988, + "step": 762 + }, + { + "epoch": 1.9526551503518874, + "grad_norm": 0.5321354765650695, + "learning_rate": 3.052237565767079e-05, + "loss": 0.4633, + "step": 763 + }, + { + "epoch": 1.9552143314139476, + "grad_norm": 0.5101148541262678, + "learning_rate": 3.0491912715422047e-05, + "loss": 0.4154, + "step": 764 + }, + { + "epoch": 1.9577735124760078, + "grad_norm": 0.44138127982821407, + "learning_rate": 3.0461416152991555e-05, + "loss": 0.3971, + "step": 765 + }, + { + "epoch": 1.9603326935380678, + "grad_norm": 0.5119670448118282, + "learning_rate": 3.043088606810221e-05, + "loss": 0.4344, + "step": 766 + }, + { + "epoch": 1.9628918746001278, + "grad_norm": 0.5844510240848945, + "learning_rate": 3.0400322558584308e-05, + "loss": 0.4369, + "step": 767 + }, + { + "epoch": 1.965451055662188, + "grad_norm": 0.45663704360586077, + "learning_rate": 3.0369725722375274e-05, + "loss": 0.4666, + "step": 768 + }, + { + "epoch": 1.9680102367242482, + "grad_norm": 0.539565481061931, + "learning_rate": 3.0339095657519292e-05, + "loss": 0.4359, + "step": 769 + }, + { + "epoch": 1.9705694177863085, + "grad_norm": 0.5385392681842599, + "learning_rate": 3.0308432462167045e-05, + "loss": 0.4264, + "step": 770 + }, + { + "epoch": 1.9731285988483687, + "grad_norm": 0.432889165826209, + "learning_rate": 3.0277736234575378e-05, + "loss": 0.3845, + "step": 771 + }, + { + "epoch": 1.9756877799104287, + "grad_norm": 0.5443873180170078, + "learning_rate": 3.0247007073106976e-05, + "loss": 0.406, + "step": 772 + }, + { + "epoch": 1.9782469609724886, + "grad_norm": 0.5012354762450505, + "learning_rate": 3.0216245076230062e-05, + "loss": 0.4334, + "step": 773 + }, + { + "epoch": 1.9808061420345489, + "grad_norm": 0.5232208647955975, + "learning_rate": 3.0185450342518075e-05, + "loss": 0.4268, + "step": 774 + }, + { + "epoch": 1.983365323096609, + "grad_norm": 0.4513266845951912, + "learning_rate": 3.015462297064936e-05, + "loss": 0.3783, + "step": 775 + }, + { + "epoch": 1.9859245041586693, + "grad_norm": 0.5054305167039745, + "learning_rate": 3.0123763059406835e-05, + "loss": 0.4148, + "step": 776 + }, + { + "epoch": 1.9884836852207295, + "grad_norm": 0.47543662649122564, + "learning_rate": 3.009287070767771e-05, + "loss": 0.4083, + "step": 777 + }, + { + "epoch": 1.9910428662827895, + "grad_norm": 0.5147396246458542, + "learning_rate": 3.0061946014453113e-05, + "loss": 0.406, + "step": 778 + }, + { + "epoch": 1.9936020473448497, + "grad_norm": 0.537028842282906, + "learning_rate": 3.0030989078827848e-05, + "loss": 0.386, + "step": 779 + }, + { + "epoch": 1.9961612284069097, + "grad_norm": 0.4661586754448457, + "learning_rate": 3.0000000000000004e-05, + "loss": 0.4218, + "step": 780 + }, + { + "epoch": 1.99872040946897, + "grad_norm": 0.4261635908664324, + "learning_rate": 2.9968978877270672e-05, + "loss": 0.4449, + "step": 781 + }, + { + "epoch": 2.00127959053103, + "grad_norm": 0.6837214787749436, + "learning_rate": 2.9937925810043654e-05, + "loss": 0.4318, + "step": 782 + }, + { + "epoch": 2.0038387715930903, + "grad_norm": 0.451386066705997, + "learning_rate": 2.990684089782507e-05, + "loss": 0.3612, + "step": 783 + }, + { + "epoch": 2.0063979526551505, + "grad_norm": 0.7078432089850583, + "learning_rate": 2.987572424022311e-05, + "loss": 0.3505, + "step": 784 + }, + { + "epoch": 2.0089571337172103, + "grad_norm": 0.5081289643217769, + "learning_rate": 2.98445759369477e-05, + "loss": 0.3149, + "step": 785 + }, + { + "epoch": 2.0115163147792705, + "grad_norm": 0.5849221231819475, + "learning_rate": 2.9813396087810134e-05, + "loss": 0.3514, + "step": 786 + }, + { + "epoch": 2.0140754958413307, + "grad_norm": 0.48371629559591783, + "learning_rate": 2.9782184792722845e-05, + "loss": 0.3339, + "step": 787 + }, + { + "epoch": 2.016634676903391, + "grad_norm": 0.6063864453716905, + "learning_rate": 2.9750942151698968e-05, + "loss": 0.389, + "step": 788 + }, + { + "epoch": 2.019193857965451, + "grad_norm": 0.6277898887247666, + "learning_rate": 2.971966826485212e-05, + "loss": 0.3283, + "step": 789 + }, + { + "epoch": 2.0217530390275114, + "grad_norm": 0.5884137115714144, + "learning_rate": 2.9688363232396056e-05, + "loss": 0.3353, + "step": 790 + }, + { + "epoch": 2.024312220089571, + "grad_norm": 0.5242949584836221, + "learning_rate": 2.9657027154644294e-05, + "loss": 0.3059, + "step": 791 + }, + { + "epoch": 2.0268714011516313, + "grad_norm": 0.5023037783914988, + "learning_rate": 2.962566013200986e-05, + "loss": 0.3433, + "step": 792 + }, + { + "epoch": 2.0294305822136915, + "grad_norm": 0.6201000972339953, + "learning_rate": 2.959426226500493e-05, + "loss": 0.318, + "step": 793 + }, + { + "epoch": 2.0319897632757518, + "grad_norm": 0.44395380528610195, + "learning_rate": 2.9562833654240518e-05, + "loss": 0.3401, + "step": 794 + }, + { + "epoch": 2.034548944337812, + "grad_norm": 0.43056205609112264, + "learning_rate": 2.9531374400426158e-05, + "loss": 0.2937, + "step": 795 + }, + { + "epoch": 2.037108125399872, + "grad_norm": 0.4803245756648518, + "learning_rate": 2.949988460436958e-05, + "loss": 0.3396, + "step": 796 + }, + { + "epoch": 2.0396673064619324, + "grad_norm": 0.41525308273215306, + "learning_rate": 2.946836436697636e-05, + "loss": 0.3508, + "step": 797 + }, + { + "epoch": 2.042226487523992, + "grad_norm": 0.4033224019153147, + "learning_rate": 2.943681378924964e-05, + "loss": 0.307, + "step": 798 + }, + { + "epoch": 2.0447856685860524, + "grad_norm": 0.4163421073725909, + "learning_rate": 2.94052329722898e-05, + "loss": 0.3145, + "step": 799 + }, + { + "epoch": 2.0473448496481126, + "grad_norm": 0.374364306892129, + "learning_rate": 2.9373622017294075e-05, + "loss": 0.3412, + "step": 800 + }, + { + "epoch": 2.049904030710173, + "grad_norm": 0.3911100371820488, + "learning_rate": 2.934198102555631e-05, + "loss": 0.3046, + "step": 801 + }, + { + "epoch": 2.052463211772233, + "grad_norm": 0.4467542980832922, + "learning_rate": 2.9310310098466588e-05, + "loss": 0.291, + "step": 802 + }, + { + "epoch": 2.055022392834293, + "grad_norm": 0.42396484047274274, + "learning_rate": 2.92786093375109e-05, + "loss": 0.3268, + "step": 803 + }, + { + "epoch": 2.057581573896353, + "grad_norm": 0.52821272561634, + "learning_rate": 2.924687884427087e-05, + "loss": 0.3699, + "step": 804 + }, + { + "epoch": 2.060140754958413, + "grad_norm": 0.4278559612529404, + "learning_rate": 2.9215118720423375e-05, + "loss": 0.3389, + "step": 805 + }, + { + "epoch": 2.0626999360204734, + "grad_norm": 0.4868265035371802, + "learning_rate": 2.9183329067740235e-05, + "loss": 0.2993, + "step": 806 + }, + { + "epoch": 2.0652591170825336, + "grad_norm": 0.41003428936435155, + "learning_rate": 2.9151509988087912e-05, + "loss": 0.3138, + "step": 807 + }, + { + "epoch": 2.067818298144594, + "grad_norm": 0.4548780109348631, + "learning_rate": 2.911966158342713e-05, + "loss": 0.3298, + "step": 808 + }, + { + "epoch": 2.070377479206654, + "grad_norm": 0.42982485262636566, + "learning_rate": 2.9087783955812628e-05, + "loss": 0.3493, + "step": 809 + }, + { + "epoch": 2.072936660268714, + "grad_norm": 0.37659830424896135, + "learning_rate": 2.9055877207392752e-05, + "loss": 0.2905, + "step": 810 + }, + { + "epoch": 2.075495841330774, + "grad_norm": 0.6047986137094586, + "learning_rate": 2.9023941440409164e-05, + "loss": 0.3921, + "step": 811 + }, + { + "epoch": 2.0780550223928342, + "grad_norm": 0.4024787680486599, + "learning_rate": 2.899197675719653e-05, + "loss": 0.3126, + "step": 812 + }, + { + "epoch": 2.0806142034548945, + "grad_norm": 0.4864391147176377, + "learning_rate": 2.8959983260182166e-05, + "loss": 0.3259, + "step": 813 + }, + { + "epoch": 2.0831733845169547, + "grad_norm": 0.4289885322757846, + "learning_rate": 2.8927961051885716e-05, + "loss": 0.3327, + "step": 814 + }, + { + "epoch": 2.085732565579015, + "grad_norm": 0.46469758441400516, + "learning_rate": 2.8895910234918828e-05, + "loss": 0.3566, + "step": 815 + }, + { + "epoch": 2.0882917466410746, + "grad_norm": 0.47717231725597103, + "learning_rate": 2.886383091198483e-05, + "loss": 0.3543, + "step": 816 + }, + { + "epoch": 2.090850927703135, + "grad_norm": 0.40073207808377775, + "learning_rate": 2.8831723185878382e-05, + "loss": 0.2954, + "step": 817 + }, + { + "epoch": 2.093410108765195, + "grad_norm": 0.5629907330757088, + "learning_rate": 2.8799587159485166e-05, + "loss": 0.3302, + "step": 818 + }, + { + "epoch": 2.0959692898272553, + "grad_norm": 0.5537969468557435, + "learning_rate": 2.876742293578155e-05, + "loss": 0.3323, + "step": 819 + }, + { + "epoch": 2.0985284708893155, + "grad_norm": 0.5750723337033808, + "learning_rate": 2.873523061783426e-05, + "loss": 0.3083, + "step": 820 + }, + { + "epoch": 2.1010876519513757, + "grad_norm": 0.5333136369374486, + "learning_rate": 2.8703010308800034e-05, + "loss": 0.3516, + "step": 821 + }, + { + "epoch": 2.103646833013436, + "grad_norm": 0.5946698263030077, + "learning_rate": 2.8670762111925313e-05, + "loss": 0.3337, + "step": 822 + }, + { + "epoch": 2.1062060140754957, + "grad_norm": 0.4783241662438903, + "learning_rate": 2.863848613054591e-05, + "loss": 0.302, + "step": 823 + }, + { + "epoch": 2.108765195137556, + "grad_norm": 0.42094897221075406, + "learning_rate": 2.8606182468086654e-05, + "loss": 0.3739, + "step": 824 + }, + { + "epoch": 2.111324376199616, + "grad_norm": 0.5112995720272753, + "learning_rate": 2.8573851228061084e-05, + "loss": 0.3328, + "step": 825 + }, + { + "epoch": 2.1138835572616763, + "grad_norm": 0.4268589732083703, + "learning_rate": 2.8541492514071115e-05, + "loss": 0.3199, + "step": 826 + }, + { + "epoch": 2.1164427383237365, + "grad_norm": 0.40030928394161236, + "learning_rate": 2.850910642980668e-05, + "loss": 0.3229, + "step": 827 + }, + { + "epoch": 2.1190019193857967, + "grad_norm": 0.4208663528647155, + "learning_rate": 2.8476693079045432e-05, + "loss": 0.3475, + "step": 828 + }, + { + "epoch": 2.1215611004478565, + "grad_norm": 0.43920626817956737, + "learning_rate": 2.8444252565652397e-05, + "loss": 0.3395, + "step": 829 + }, + { + "epoch": 2.1241202815099167, + "grad_norm": 0.4129992695563009, + "learning_rate": 2.8411784993579633e-05, + "loss": 0.2742, + "step": 830 + }, + { + "epoch": 2.126679462571977, + "grad_norm": 0.4454585391740596, + "learning_rate": 2.8379290466865906e-05, + "loss": 0.3328, + "step": 831 + }, + { + "epoch": 2.129238643634037, + "grad_norm": 0.46723501336721224, + "learning_rate": 2.834676908963636e-05, + "loss": 0.3379, + "step": 832 + }, + { + "epoch": 2.1317978246960974, + "grad_norm": 0.5238858504216463, + "learning_rate": 2.8314220966102177e-05, + "loss": 0.3621, + "step": 833 + }, + { + "epoch": 2.1343570057581576, + "grad_norm": 0.4375600366447412, + "learning_rate": 2.828164620056024e-05, + "loss": 0.3031, + "step": 834 + }, + { + "epoch": 2.1369161868202173, + "grad_norm": 0.4011400320445024, + "learning_rate": 2.8249044897392814e-05, + "loss": 0.3167, + "step": 835 + }, + { + "epoch": 2.1394753678822775, + "grad_norm": 0.48667210285852947, + "learning_rate": 2.8216417161067187e-05, + "loss": 0.3517, + "step": 836 + }, + { + "epoch": 2.1420345489443378, + "grad_norm": 0.540946535276379, + "learning_rate": 2.818376309613535e-05, + "loss": 0.3276, + "step": 837 + }, + { + "epoch": 2.144593730006398, + "grad_norm": 0.41449908449590483, + "learning_rate": 2.8151082807233684e-05, + "loss": 0.3429, + "step": 838 + }, + { + "epoch": 2.147152911068458, + "grad_norm": 0.4411596533715045, + "learning_rate": 2.811837639908257e-05, + "loss": 0.3064, + "step": 839 + }, + { + "epoch": 2.1497120921305184, + "grad_norm": 0.4049429223396906, + "learning_rate": 2.80856439764861e-05, + "loss": 0.3212, + "step": 840 + }, + { + "epoch": 2.1522712731925786, + "grad_norm": 0.41603945550088506, + "learning_rate": 2.8052885644331742e-05, + "loss": 0.3097, + "step": 841 + }, + { + "epoch": 2.1548304542546384, + "grad_norm": 0.526968199578847, + "learning_rate": 2.8020101507589958e-05, + "loss": 0.3547, + "step": 842 + }, + { + "epoch": 2.1573896353166986, + "grad_norm": 0.38305125359444786, + "learning_rate": 2.798729167131391e-05, + "loss": 0.3027, + "step": 843 + }, + { + "epoch": 2.159948816378759, + "grad_norm": 0.6336647706261161, + "learning_rate": 2.795445624063913e-05, + "loss": 0.3806, + "step": 844 + }, + { + "epoch": 2.162507997440819, + "grad_norm": 0.44531937519080506, + "learning_rate": 2.792159532078314e-05, + "loss": 0.3323, + "step": 845 + }, + { + "epoch": 2.165067178502879, + "grad_norm": 0.44290733063906507, + "learning_rate": 2.7888709017045146e-05, + "loss": 0.3237, + "step": 846 + }, + { + "epoch": 2.167626359564939, + "grad_norm": 0.4370183293736626, + "learning_rate": 2.7855797434805695e-05, + "loss": 0.338, + "step": 847 + }, + { + "epoch": 2.170185540626999, + "grad_norm": 0.4164027353779343, + "learning_rate": 2.782286067952634e-05, + "loss": 0.3278, + "step": 848 + }, + { + "epoch": 2.1727447216890594, + "grad_norm": 0.49409168279966853, + "learning_rate": 2.7789898856749297e-05, + "loss": 0.3568, + "step": 849 + }, + { + "epoch": 2.1753039027511196, + "grad_norm": 0.43481015195691675, + "learning_rate": 2.77569120720971e-05, + "loss": 0.3356, + "step": 850 + }, + { + "epoch": 2.17786308381318, + "grad_norm": 0.4983975940240211, + "learning_rate": 2.772390043127228e-05, + "loss": 0.3373, + "step": 851 + }, + { + "epoch": 2.18042226487524, + "grad_norm": 0.43692987588400956, + "learning_rate": 2.7690864040057023e-05, + "loss": 0.3108, + "step": 852 + }, + { + "epoch": 2.1829814459373003, + "grad_norm": 0.44866752393409093, + "learning_rate": 2.7657803004312797e-05, + "loss": 0.3347, + "step": 853 + }, + { + "epoch": 2.18554062699936, + "grad_norm": 0.5027966160971863, + "learning_rate": 2.7624717429980067e-05, + "loss": 0.3536, + "step": 854 + }, + { + "epoch": 2.1880998080614202, + "grad_norm": 0.4093435861731407, + "learning_rate": 2.7591607423077932e-05, + "loss": 0.2917, + "step": 855 + }, + { + "epoch": 2.1906589891234804, + "grad_norm": 0.49615217011265117, + "learning_rate": 2.755847308970376e-05, + "loss": 0.3502, + "step": 856 + }, + { + "epoch": 2.1932181701855407, + "grad_norm": 0.3967350798943657, + "learning_rate": 2.752531453603288e-05, + "loss": 0.3177, + "step": 857 + }, + { + "epoch": 2.195777351247601, + "grad_norm": 0.4774434168760954, + "learning_rate": 2.7492131868318247e-05, + "loss": 0.3616, + "step": 858 + }, + { + "epoch": 2.198336532309661, + "grad_norm": 0.4533916271890891, + "learning_rate": 2.7458925192890057e-05, + "loss": 0.3235, + "step": 859 + }, + { + "epoch": 2.200895713371721, + "grad_norm": 0.4666533097303878, + "learning_rate": 2.7425694616155474e-05, + "loss": 0.362, + "step": 860 + }, + { + "epoch": 2.203454894433781, + "grad_norm": 0.5256588022807218, + "learning_rate": 2.739244024459822e-05, + "loss": 0.3577, + "step": 861 + }, + { + "epoch": 2.2060140754958413, + "grad_norm": 0.42484253391437565, + "learning_rate": 2.7359162184778276e-05, + "loss": 0.3297, + "step": 862 + }, + { + "epoch": 2.2085732565579015, + "grad_norm": 0.44868873474156656, + "learning_rate": 2.7325860543331533e-05, + "loss": 0.336, + "step": 863 + }, + { + "epoch": 2.2111324376199617, + "grad_norm": 0.4732703525530866, + "learning_rate": 2.7292535426969436e-05, + "loss": 0.3057, + "step": 864 + }, + { + "epoch": 2.213691618682022, + "grad_norm": 0.477868292020973, + "learning_rate": 2.7259186942478656e-05, + "loss": 0.308, + "step": 865 + }, + { + "epoch": 2.216250799744082, + "grad_norm": 0.4842331398481323, + "learning_rate": 2.7225815196720767e-05, + "loss": 0.3145, + "step": 866 + }, + { + "epoch": 2.218809980806142, + "grad_norm": 0.5381115021925195, + "learning_rate": 2.7192420296631835e-05, + "loss": 0.3798, + "step": 867 + }, + { + "epoch": 2.221369161868202, + "grad_norm": 0.4236836146413328, + "learning_rate": 2.7159002349222178e-05, + "loss": 0.3237, + "step": 868 + }, + { + "epoch": 2.2239283429302623, + "grad_norm": 0.5479188963928002, + "learning_rate": 2.7125561461575924e-05, + "loss": 0.3832, + "step": 869 + }, + { + "epoch": 2.2264875239923225, + "grad_norm": 0.45518106829461097, + "learning_rate": 2.7092097740850712e-05, + "loss": 0.3048, + "step": 870 + }, + { + "epoch": 2.2290467050543827, + "grad_norm": 0.4441896488412185, + "learning_rate": 2.7058611294277378e-05, + "loss": 0.3141, + "step": 871 + }, + { + "epoch": 2.2316058861164425, + "grad_norm": 0.43355994332482317, + "learning_rate": 2.702510222915956e-05, + "loss": 0.3107, + "step": 872 + }, + { + "epoch": 2.2341650671785027, + "grad_norm": 0.5312269604580118, + "learning_rate": 2.6991570652873357e-05, + "loss": 0.3404, + "step": 873 + }, + { + "epoch": 2.236724248240563, + "grad_norm": 0.3995110246728028, + "learning_rate": 2.6958016672867048e-05, + "loss": 0.3122, + "step": 874 + }, + { + "epoch": 2.239283429302623, + "grad_norm": 0.4611276323390611, + "learning_rate": 2.692444039666066e-05, + "loss": 0.317, + "step": 875 + }, + { + "epoch": 2.2418426103646834, + "grad_norm": 0.5307192324613822, + "learning_rate": 2.6890841931845674e-05, + "loss": 0.3579, + "step": 876 + }, + { + "epoch": 2.2444017914267436, + "grad_norm": 0.41152344841446314, + "learning_rate": 2.68572213860847e-05, + "loss": 0.3278, + "step": 877 + }, + { + "epoch": 2.2469609724888038, + "grad_norm": 0.439703599513908, + "learning_rate": 2.6823578867111072e-05, + "loss": 0.3207, + "step": 878 + }, + { + "epoch": 2.2495201535508635, + "grad_norm": 0.48436216914849156, + "learning_rate": 2.6789914482728546e-05, + "loss": 0.3923, + "step": 879 + }, + { + "epoch": 2.2520793346129238, + "grad_norm": 0.4065191432791332, + "learning_rate": 2.6756228340810946e-05, + "loss": 0.3092, + "step": 880 + }, + { + "epoch": 2.254638515674984, + "grad_norm": 0.45257368911274026, + "learning_rate": 2.6722520549301813e-05, + "loss": 0.3201, + "step": 881 + }, + { + "epoch": 2.257197696737044, + "grad_norm": 0.47676366709922463, + "learning_rate": 2.6688791216214064e-05, + "loss": 0.3552, + "step": 882 + }, + { + "epoch": 2.2597568777991044, + "grad_norm": 0.4388935470431639, + "learning_rate": 2.6655040449629646e-05, + "loss": 0.3117, + "step": 883 + }, + { + "epoch": 2.2623160588611646, + "grad_norm": 0.49852041258030133, + "learning_rate": 2.6621268357699165e-05, + "loss": 0.2986, + "step": 884 + }, + { + "epoch": 2.2648752399232244, + "grad_norm": 0.5574815275071192, + "learning_rate": 2.6587475048641596e-05, + "loss": 0.3652, + "step": 885 + }, + { + "epoch": 2.2674344209852846, + "grad_norm": 0.532498078005579, + "learning_rate": 2.655366063074388e-05, + "loss": 0.3361, + "step": 886 + }, + { + "epoch": 2.269993602047345, + "grad_norm": 0.4905390491427929, + "learning_rate": 2.6519825212360607e-05, + "loss": 0.2904, + "step": 887 + }, + { + "epoch": 2.272552783109405, + "grad_norm": 0.4548356990918924, + "learning_rate": 2.6485968901913658e-05, + "loss": 0.3383, + "step": 888 + }, + { + "epoch": 2.275111964171465, + "grad_norm": 0.4527361560109495, + "learning_rate": 2.6452091807891855e-05, + "loss": 0.3395, + "step": 889 + }, + { + "epoch": 2.2776711452335254, + "grad_norm": 0.4444282648587205, + "learning_rate": 2.6418194038850634e-05, + "loss": 0.3155, + "step": 890 + }, + { + "epoch": 2.2802303262955856, + "grad_norm": 0.37005966275705604, + "learning_rate": 2.6384275703411666e-05, + "loss": 0.3172, + "step": 891 + }, + { + "epoch": 2.2827895073576454, + "grad_norm": 0.4280292706382066, + "learning_rate": 2.635033691026253e-05, + "loss": 0.3643, + "step": 892 + }, + { + "epoch": 2.2853486884197056, + "grad_norm": 0.46336387808601265, + "learning_rate": 2.6316377768156366e-05, + "loss": 0.3516, + "step": 893 + }, + { + "epoch": 2.287907869481766, + "grad_norm": 0.3675246620437907, + "learning_rate": 2.6282398385911503e-05, + "loss": 0.2782, + "step": 894 + }, + { + "epoch": 2.290467050543826, + "grad_norm": 0.4188680082069283, + "learning_rate": 2.624839887241115e-05, + "loss": 0.3521, + "step": 895 + }, + { + "epoch": 2.2930262316058863, + "grad_norm": 0.42684239051457756, + "learning_rate": 2.6214379336603016e-05, + "loss": 0.2909, + "step": 896 + }, + { + "epoch": 2.295585412667946, + "grad_norm": 0.36815581001539444, + "learning_rate": 2.618033988749895e-05, + "loss": 0.3068, + "step": 897 + }, + { + "epoch": 2.2981445937300062, + "grad_norm": 0.4355098607462678, + "learning_rate": 2.614628063417464e-05, + "loss": 0.3561, + "step": 898 + }, + { + "epoch": 2.3007037747920664, + "grad_norm": 0.4278330160593537, + "learning_rate": 2.6112201685769224e-05, + "loss": 0.3265, + "step": 899 + }, + { + "epoch": 2.3032629558541267, + "grad_norm": 0.39014595589180573, + "learning_rate": 2.607810315148494e-05, + "loss": 0.3569, + "step": 900 + }, + { + "epoch": 2.305822136916187, + "grad_norm": 0.4667698942028393, + "learning_rate": 2.60439851405868e-05, + "loss": 0.3631, + "step": 901 + }, + { + "epoch": 2.308381317978247, + "grad_norm": 0.41965973806882434, + "learning_rate": 2.600984776240222e-05, + "loss": 0.3248, + "step": 902 + }, + { + "epoch": 2.3109404990403073, + "grad_norm": 0.44724491924157506, + "learning_rate": 2.5975691126320678e-05, + "loss": 0.3854, + "step": 903 + }, + { + "epoch": 2.313499680102367, + "grad_norm": 0.46251594524874595, + "learning_rate": 2.5941515341793366e-05, + "loss": 0.3503, + "step": 904 + }, + { + "epoch": 2.3160588611644273, + "grad_norm": 0.35346075618792994, + "learning_rate": 2.5907320518332827e-05, + "loss": 0.3309, + "step": 905 + }, + { + "epoch": 2.3186180422264875, + "grad_norm": 0.3826935478844609, + "learning_rate": 2.587310676551262e-05, + "loss": 0.2894, + "step": 906 + }, + { + "epoch": 2.3211772232885477, + "grad_norm": 0.416397755264069, + "learning_rate": 2.5838874192966953e-05, + "loss": 0.3716, + "step": 907 + }, + { + "epoch": 2.323736404350608, + "grad_norm": 0.3982851139464845, + "learning_rate": 2.5804622910390348e-05, + "loss": 0.2833, + "step": 908 + }, + { + "epoch": 2.326295585412668, + "grad_norm": 0.4591104893849011, + "learning_rate": 2.5770353027537276e-05, + "loss": 0.3277, + "step": 909 + }, + { + "epoch": 2.328854766474728, + "grad_norm": 0.382725152261151, + "learning_rate": 2.5736064654221808e-05, + "loss": 0.323, + "step": 910 + }, + { + "epoch": 2.331413947536788, + "grad_norm": 0.5087928821054611, + "learning_rate": 2.5701757900317277e-05, + "loss": 0.3314, + "step": 911 + }, + { + "epoch": 2.3339731285988483, + "grad_norm": 0.3914001278691039, + "learning_rate": 2.5667432875755904e-05, + "loss": 0.3508, + "step": 912 + }, + { + "epoch": 2.3365323096609085, + "grad_norm": 0.4510480545622787, + "learning_rate": 2.5633089690528455e-05, + "loss": 0.3529, + "step": 913 + }, + { + "epoch": 2.3390914907229687, + "grad_norm": 0.4134813709280613, + "learning_rate": 2.559872845468391e-05, + "loss": 0.3286, + "step": 914 + }, + { + "epoch": 2.341650671785029, + "grad_norm": 0.40105410165144934, + "learning_rate": 2.5564349278329056e-05, + "loss": 0.2852, + "step": 915 + }, + { + "epoch": 2.344209852847089, + "grad_norm": 0.49998212235296036, + "learning_rate": 2.5529952271628192e-05, + "loss": 0.2916, + "step": 916 + }, + { + "epoch": 2.346769033909149, + "grad_norm": 0.46734623561200184, + "learning_rate": 2.5495537544802757e-05, + "loss": 0.3497, + "step": 917 + }, + { + "epoch": 2.349328214971209, + "grad_norm": 0.43677669206755015, + "learning_rate": 2.5461105208130953e-05, + "loss": 0.359, + "step": 918 + }, + { + "epoch": 2.3518873960332694, + "grad_norm": 0.49010454865810016, + "learning_rate": 2.542665537194742e-05, + "loss": 0.3368, + "step": 919 + }, + { + "epoch": 2.3544465770953296, + "grad_norm": 0.370850360816377, + "learning_rate": 2.539218814664288e-05, + "loss": 0.3222, + "step": 920 + }, + { + "epoch": 2.3570057581573898, + "grad_norm": 0.46886497417633327, + "learning_rate": 2.5357703642663766e-05, + "loss": 0.3633, + "step": 921 + }, + { + "epoch": 2.3595649392194495, + "grad_norm": 0.4227514997155462, + "learning_rate": 2.5323201970511883e-05, + "loss": 0.3497, + "step": 922 + }, + { + "epoch": 2.3621241202815098, + "grad_norm": 0.4015401134012503, + "learning_rate": 2.528868324074405e-05, + "loss": 0.3076, + "step": 923 + }, + { + "epoch": 2.36468330134357, + "grad_norm": 0.4012146844135177, + "learning_rate": 2.525414756397174e-05, + "loss": 0.3117, + "step": 924 + }, + { + "epoch": 2.36724248240563, + "grad_norm": 0.3724913651532696, + "learning_rate": 2.521959505086075e-05, + "loss": 0.2948, + "step": 925 + }, + { + "epoch": 2.3698016634676904, + "grad_norm": 0.44515489828057647, + "learning_rate": 2.5185025812130794e-05, + "loss": 0.3624, + "step": 926 + }, + { + "epoch": 2.3723608445297506, + "grad_norm": 0.4333489454322345, + "learning_rate": 2.5150439958555205e-05, + "loss": 0.3254, + "step": 927 + }, + { + "epoch": 2.374920025591811, + "grad_norm": 0.4284602049159506, + "learning_rate": 2.5115837600960564e-05, + "loss": 0.3232, + "step": 928 + }, + { + "epoch": 2.3774792066538706, + "grad_norm": 0.4327763714080197, + "learning_rate": 2.5081218850226315e-05, + "loss": 0.3213, + "step": 929 + }, + { + "epoch": 2.380038387715931, + "grad_norm": 0.4092391708238846, + "learning_rate": 2.5046583817284437e-05, + "loss": 0.3645, + "step": 930 + }, + { + "epoch": 2.382597568777991, + "grad_norm": 0.3874157650273361, + "learning_rate": 2.5011932613119098e-05, + "loss": 0.3546, + "step": 931 + }, + { + "epoch": 2.385156749840051, + "grad_norm": 0.4154060512286611, + "learning_rate": 2.497726534876627e-05, + "loss": 0.3724, + "step": 932 + }, + { + "epoch": 2.3877159309021114, + "grad_norm": 0.3520365071455179, + "learning_rate": 2.4942582135313393e-05, + "loss": 0.3171, + "step": 933 + }, + { + "epoch": 2.3902751119641716, + "grad_norm": 0.4129362868719995, + "learning_rate": 2.490788308389902e-05, + "loss": 0.3081, + "step": 934 + }, + { + "epoch": 2.3928342930262314, + "grad_norm": 0.3826332240853489, + "learning_rate": 2.487316830571244e-05, + "loss": 0.3167, + "step": 935 + }, + { + "epoch": 2.3953934740882916, + "grad_norm": 0.41073512437749543, + "learning_rate": 2.4838437911993355e-05, + "loss": 0.2872, + "step": 936 + }, + { + "epoch": 2.397952655150352, + "grad_norm": 0.41060249122236425, + "learning_rate": 2.48036920140315e-05, + "loss": 0.3331, + "step": 937 + }, + { + "epoch": 2.400511836212412, + "grad_norm": 0.39939347922246277, + "learning_rate": 2.4768930723166266e-05, + "loss": 0.309, + "step": 938 + }, + { + "epoch": 2.4030710172744723, + "grad_norm": 0.44800610692896503, + "learning_rate": 2.473415415078642e-05, + "loss": 0.3301, + "step": 939 + }, + { + "epoch": 2.4056301983365325, + "grad_norm": 0.45045037484262557, + "learning_rate": 2.4699362408329646e-05, + "loss": 0.3545, + "step": 940 + }, + { + "epoch": 2.4081893793985927, + "grad_norm": 0.4516520833709128, + "learning_rate": 2.466455560728227e-05, + "loss": 0.3219, + "step": 941 + }, + { + "epoch": 2.4107485604606524, + "grad_norm": 0.4140396756771499, + "learning_rate": 2.4629733859178867e-05, + "loss": 0.3312, + "step": 942 + }, + { + "epoch": 2.4133077415227127, + "grad_norm": 0.4242325070475781, + "learning_rate": 2.4594897275601887e-05, + "loss": 0.3657, + "step": 943 + }, + { + "epoch": 2.415866922584773, + "grad_norm": 0.3534056791478892, + "learning_rate": 2.456004596818135e-05, + "loss": 0.2875, + "step": 944 + }, + { + "epoch": 2.418426103646833, + "grad_norm": 0.4504625178937038, + "learning_rate": 2.4525180048594452e-05, + "loss": 0.3947, + "step": 945 + }, + { + "epoch": 2.4209852847088933, + "grad_norm": 0.3713456955088067, + "learning_rate": 2.4490299628565168e-05, + "loss": 0.3365, + "step": 946 + }, + { + "epoch": 2.423544465770953, + "grad_norm": 0.4139332630376369, + "learning_rate": 2.4455404819864e-05, + "loss": 0.3213, + "step": 947 + }, + { + "epoch": 2.4261036468330133, + "grad_norm": 0.39284365253142334, + "learning_rate": 2.4420495734307527e-05, + "loss": 0.3707, + "step": 948 + }, + { + "epoch": 2.4286628278950735, + "grad_norm": 0.44240961049247096, + "learning_rate": 2.4385572483758066e-05, + "loss": 0.373, + "step": 949 + }, + { + "epoch": 2.4312220089571337, + "grad_norm": 0.41468110454884644, + "learning_rate": 2.435063518012335e-05, + "loss": 0.3791, + "step": 950 + }, + { + "epoch": 2.433781190019194, + "grad_norm": 0.38546461774505014, + "learning_rate": 2.4315683935356127e-05, + "loss": 0.3092, + "step": 951 + }, + { + "epoch": 2.436340371081254, + "grad_norm": 0.3973539449011059, + "learning_rate": 2.4280718861453814e-05, + "loss": 0.3537, + "step": 952 + }, + { + "epoch": 2.4388995521433143, + "grad_norm": 0.40087880001543535, + "learning_rate": 2.424574007045816e-05, + "loss": 0.3513, + "step": 953 + }, + { + "epoch": 2.441458733205374, + "grad_norm": 0.4363352682087938, + "learning_rate": 2.421074767445485e-05, + "loss": 0.3168, + "step": 954 + }, + { + "epoch": 2.4440179142674343, + "grad_norm": 0.387588478700538, + "learning_rate": 2.4175741785573177e-05, + "loss": 0.3156, + "step": 955 + }, + { + "epoch": 2.4465770953294945, + "grad_norm": 0.43136617250905906, + "learning_rate": 2.4140722515985666e-05, + "loss": 0.3396, + "step": 956 + }, + { + "epoch": 2.4491362763915547, + "grad_norm": 0.4356259978949205, + "learning_rate": 2.4105689977907722e-05, + "loss": 0.3633, + "step": 957 + }, + { + "epoch": 2.451695457453615, + "grad_norm": 0.3939046327707216, + "learning_rate": 2.407064428359726e-05, + "loss": 0.3367, + "step": 958 + }, + { + "epoch": 2.454254638515675, + "grad_norm": 0.44530753935780215, + "learning_rate": 2.4035585545354353e-05, + "loss": 0.2652, + "step": 959 + }, + { + "epoch": 2.456813819577735, + "grad_norm": 0.39324736708789354, + "learning_rate": 2.4000513875520892e-05, + "loss": 0.3497, + "step": 960 + }, + { + "epoch": 2.459373000639795, + "grad_norm": 0.3745102508002373, + "learning_rate": 2.396542938648018e-05, + "loss": 0.351, + "step": 961 + }, + { + "epoch": 2.4619321817018553, + "grad_norm": 0.433148063417755, + "learning_rate": 2.3930332190656604e-05, + "loss": 0.3226, + "step": 962 + }, + { + "epoch": 2.4644913627639156, + "grad_norm": 0.5060978327975577, + "learning_rate": 2.3895222400515282e-05, + "loss": 0.3944, + "step": 963 + }, + { + "epoch": 2.4670505438259758, + "grad_norm": 0.3831333684566055, + "learning_rate": 2.3860100128561677e-05, + "loss": 0.303, + "step": 964 + }, + { + "epoch": 2.469609724888036, + "grad_norm": 0.4834696949807748, + "learning_rate": 2.3824965487341247e-05, + "loss": 0.36, + "step": 965 + }, + { + "epoch": 2.472168905950096, + "grad_norm": 0.48539696684918826, + "learning_rate": 2.3789818589439094e-05, + "loss": 0.3418, + "step": 966 + }, + { + "epoch": 2.474728087012156, + "grad_norm": 0.3663525699221002, + "learning_rate": 2.375465954747959e-05, + "loss": 0.2906, + "step": 967 + }, + { + "epoch": 2.477287268074216, + "grad_norm": 0.4444877062926493, + "learning_rate": 2.371948847412602e-05, + "loss": 0.3281, + "step": 968 + }, + { + "epoch": 2.4798464491362764, + "grad_norm": 0.37267946857207057, + "learning_rate": 2.3684305482080233e-05, + "loss": 0.3214, + "step": 969 + }, + { + "epoch": 2.4824056301983366, + "grad_norm": 0.422309895496103, + "learning_rate": 2.3649110684082258e-05, + "loss": 0.3309, + "step": 970 + }, + { + "epoch": 2.484964811260397, + "grad_norm": 0.366896575024139, + "learning_rate": 2.361390419290995e-05, + "loss": 0.3359, + "step": 971 + }, + { + "epoch": 2.4875239923224566, + "grad_norm": 0.4252498965109737, + "learning_rate": 2.357868612137866e-05, + "loss": 0.3162, + "step": 972 + }, + { + "epoch": 2.490083173384517, + "grad_norm": 0.4454769676233995, + "learning_rate": 2.3543456582340815e-05, + "loss": 0.3458, + "step": 973 + }, + { + "epoch": 2.492642354446577, + "grad_norm": 0.504528012046428, + "learning_rate": 2.3508215688685607e-05, + "loss": 0.3783, + "step": 974 + }, + { + "epoch": 2.495201535508637, + "grad_norm": 0.3791982649378316, + "learning_rate": 2.3472963553338614e-05, + "loss": 0.3439, + "step": 975 + }, + { + "epoch": 2.4977607165706974, + "grad_norm": 0.3958397655771158, + "learning_rate": 2.3437700289261417e-05, + "loss": 0.3098, + "step": 976 + }, + { + "epoch": 2.5003198976327576, + "grad_norm": 0.4716678361651927, + "learning_rate": 2.3402426009451288e-05, + "loss": 0.3442, + "step": 977 + }, + { + "epoch": 2.502879078694818, + "grad_norm": 0.41505752875646384, + "learning_rate": 2.3367140826940768e-05, + "loss": 0.3393, + "step": 978 + }, + { + "epoch": 2.505438259756878, + "grad_norm": 0.4831411264450984, + "learning_rate": 2.333184485479737e-05, + "loss": 0.3406, + "step": 979 + }, + { + "epoch": 2.507997440818938, + "grad_norm": 0.44259478488091053, + "learning_rate": 2.3296538206123134e-05, + "loss": 0.3498, + "step": 980 + }, + { + "epoch": 2.510556621880998, + "grad_norm": 0.39650938475151654, + "learning_rate": 2.326122099405435e-05, + "loss": 0.3218, + "step": 981 + }, + { + "epoch": 2.5131158029430583, + "grad_norm": 0.44478690078566685, + "learning_rate": 2.3225893331761143e-05, + "loss": 0.3354, + "step": 982 + }, + { + "epoch": 2.5156749840051185, + "grad_norm": 0.4617579108787994, + "learning_rate": 2.319055533244712e-05, + "loss": 0.3689, + "step": 983 + }, + { + "epoch": 2.5182341650671782, + "grad_norm": 0.36510971786258006, + "learning_rate": 2.315520710934903e-05, + "loss": 0.3189, + "step": 984 + }, + { + "epoch": 2.5207933461292384, + "grad_norm": 0.5962978268524062, + "learning_rate": 2.311984877573636e-05, + "loss": 0.3785, + "step": 985 + }, + { + "epoch": 2.5233525271912987, + "grad_norm": 0.4286182290118198, + "learning_rate": 2.3084480444911006e-05, + "loss": 0.2969, + "step": 986 + }, + { + "epoch": 2.525911708253359, + "grad_norm": 0.406129472684799, + "learning_rate": 2.304910223020691e-05, + "loss": 0.3622, + "step": 987 + }, + { + "epoch": 2.528470889315419, + "grad_norm": 0.40352070167371706, + "learning_rate": 2.3013714244989665e-05, + "loss": 0.3003, + "step": 988 + }, + { + "epoch": 2.5310300703774793, + "grad_norm": 0.41853745249254193, + "learning_rate": 2.2978316602656183e-05, + "loss": 0.3545, + "step": 989 + }, + { + "epoch": 2.5335892514395395, + "grad_norm": 0.3751248697656993, + "learning_rate": 2.2942909416634326e-05, + "loss": 0.3317, + "step": 990 + }, + { + "epoch": 2.5361484325015997, + "grad_norm": 0.38457641982344676, + "learning_rate": 2.290749280038252e-05, + "loss": 0.3186, + "step": 991 + }, + { + "epoch": 2.5387076135636595, + "grad_norm": 0.43410698680189885, + "learning_rate": 2.2872066867389434e-05, + "loss": 0.3819, + "step": 992 + }, + { + "epoch": 2.5412667946257197, + "grad_norm": 0.3827463318912182, + "learning_rate": 2.2836631731173577e-05, + "loss": 0.3428, + "step": 993 + }, + { + "epoch": 2.54382597568778, + "grad_norm": 0.3626009354081465, + "learning_rate": 2.2801187505282948e-05, + "loss": 0.3313, + "step": 994 + }, + { + "epoch": 2.54638515674984, + "grad_norm": 0.3879708806451702, + "learning_rate": 2.2765734303294666e-05, + "loss": 0.302, + "step": 995 + }, + { + "epoch": 2.5489443378119003, + "grad_norm": 0.3615193094932171, + "learning_rate": 2.2730272238814636e-05, + "loss": 0.3022, + "step": 996 + }, + { + "epoch": 2.55150351887396, + "grad_norm": 0.38109707078147037, + "learning_rate": 2.2694801425477136e-05, + "loss": 0.3199, + "step": 997 + }, + { + "epoch": 2.5540626999360203, + "grad_norm": 0.35624704442372485, + "learning_rate": 2.2659321976944507e-05, + "loss": 0.3394, + "step": 998 + }, + { + "epoch": 2.5566218809980805, + "grad_norm": 0.5236454693659701, + "learning_rate": 2.2623834006906732e-05, + "loss": 0.3254, + "step": 999 + }, + { + "epoch": 2.5591810620601407, + "grad_norm": 0.4793516094510245, + "learning_rate": 2.2588337629081107e-05, + "loss": 0.4122, + "step": 1000 + }, + { + "epoch": 2.561740243122201, + "grad_norm": 0.32519173651998734, + "learning_rate": 2.25528329572119e-05, + "loss": 0.2782, + "step": 1001 + }, + { + "epoch": 2.564299424184261, + "grad_norm": 0.47513564073653997, + "learning_rate": 2.25173201050699e-05, + "loss": 0.4075, + "step": 1002 + }, + { + "epoch": 2.5668586052463214, + "grad_norm": 0.34327531232567976, + "learning_rate": 2.248179918645216e-05, + "loss": 0.2602, + "step": 1003 + }, + { + "epoch": 2.5694177863083816, + "grad_norm": 0.42876499217691605, + "learning_rate": 2.2446270315181566e-05, + "loss": 0.3538, + "step": 1004 + }, + { + "epoch": 2.5719769673704413, + "grad_norm": 0.38643082957234787, + "learning_rate": 2.2410733605106462e-05, + "loss": 0.3331, + "step": 1005 + }, + { + "epoch": 2.5745361484325016, + "grad_norm": 0.3845952145329833, + "learning_rate": 2.237518917010035e-05, + "loss": 0.3068, + "step": 1006 + }, + { + "epoch": 2.5770953294945618, + "grad_norm": 0.398304345128026, + "learning_rate": 2.233963712406147e-05, + "loss": 0.3455, + "step": 1007 + }, + { + "epoch": 2.579654510556622, + "grad_norm": 0.4106067027436424, + "learning_rate": 2.2304077580912423e-05, + "loss": 0.3266, + "step": 1008 + }, + { + "epoch": 2.5822136916186818, + "grad_norm": 0.3515875497757696, + "learning_rate": 2.2268510654599885e-05, + "loss": 0.3089, + "step": 1009 + }, + { + "epoch": 2.584772872680742, + "grad_norm": 0.34230034407291976, + "learning_rate": 2.2232936459094158e-05, + "loss": 0.37, + "step": 1010 + }, + { + "epoch": 2.587332053742802, + "grad_norm": 0.3409277384030245, + "learning_rate": 2.2197355108388835e-05, + "loss": 0.3425, + "step": 1011 + }, + { + "epoch": 2.5898912348048624, + "grad_norm": 0.3659124451915072, + "learning_rate": 2.216176671650045e-05, + "loss": 0.3417, + "step": 1012 + }, + { + "epoch": 2.5924504158669226, + "grad_norm": 0.34644737723549984, + "learning_rate": 2.2126171397468105e-05, + "loss": 0.3048, + "step": 1013 + }, + { + "epoch": 2.595009596928983, + "grad_norm": 0.34192043331418503, + "learning_rate": 2.209056926535307e-05, + "loss": 0.3245, + "step": 1014 + }, + { + "epoch": 2.597568777991043, + "grad_norm": 0.4024225219500372, + "learning_rate": 2.205496043423849e-05, + "loss": 0.3501, + "step": 1015 + }, + { + "epoch": 2.6001279590531032, + "grad_norm": 0.3444122394151278, + "learning_rate": 2.2019345018228922e-05, + "loss": 0.3403, + "step": 1016 + }, + { + "epoch": 2.602687140115163, + "grad_norm": 0.41753443691652886, + "learning_rate": 2.1983723131450088e-05, + "loss": 0.3609, + "step": 1017 + }, + { + "epoch": 2.605246321177223, + "grad_norm": 0.42275585901863255, + "learning_rate": 2.194809488804839e-05, + "loss": 0.3427, + "step": 1018 + }, + { + "epoch": 2.6078055022392834, + "grad_norm": 0.3513925245942965, + "learning_rate": 2.1912460402190625e-05, + "loss": 0.2984, + "step": 1019 + }, + { + "epoch": 2.6103646833013436, + "grad_norm": 0.4845404009383636, + "learning_rate": 2.1876819788063586e-05, + "loss": 0.342, + "step": 1020 + }, + { + "epoch": 2.612923864363404, + "grad_norm": 0.38632744480954595, + "learning_rate": 2.1841173159873718e-05, + "loss": 0.3178, + "step": 1021 + }, + { + "epoch": 2.6154830454254636, + "grad_norm": 0.40341965144316216, + "learning_rate": 2.1805520631846705e-05, + "loss": 0.3454, + "step": 1022 + }, + { + "epoch": 2.618042226487524, + "grad_norm": 0.5429183555857332, + "learning_rate": 2.176986231822717e-05, + "loss": 0.3407, + "step": 1023 + }, + { + "epoch": 2.620601407549584, + "grad_norm": 0.3693931345744361, + "learning_rate": 2.173419833327826e-05, + "loss": 0.2931, + "step": 1024 + }, + { + "epoch": 2.6231605886116443, + "grad_norm": 0.41461427921632693, + "learning_rate": 2.16985287912813e-05, + "loss": 0.3462, + "step": 1025 + }, + { + "epoch": 2.6257197696737045, + "grad_norm": 0.35793827637412173, + "learning_rate": 2.166285380653541e-05, + "loss": 0.2649, + "step": 1026 + }, + { + "epoch": 2.6282789507357647, + "grad_norm": 0.43455345040805726, + "learning_rate": 2.1627173493357167e-05, + "loss": 0.3432, + "step": 1027 + }, + { + "epoch": 2.630838131797825, + "grad_norm": 0.36886933829443885, + "learning_rate": 2.1591487966080215e-05, + "loss": 0.3106, + "step": 1028 + }, + { + "epoch": 2.633397312859885, + "grad_norm": 0.3991822617060509, + "learning_rate": 2.1555797339054898e-05, + "loss": 0.3621, + "step": 1029 + }, + { + "epoch": 2.635956493921945, + "grad_norm": 0.3967391590295086, + "learning_rate": 2.1520101726647922e-05, + "loss": 0.3711, + "step": 1030 + }, + { + "epoch": 2.638515674984005, + "grad_norm": 0.3577916760514241, + "learning_rate": 2.1484401243241947e-05, + "loss": 0.2945, + "step": 1031 + }, + { + "epoch": 2.6410748560460653, + "grad_norm": 0.40155422081290365, + "learning_rate": 2.1448696003235252e-05, + "loss": 0.3366, + "step": 1032 + }, + { + "epoch": 2.6436340371081255, + "grad_norm": 0.36541079152322986, + "learning_rate": 2.1412986121041355e-05, + "loss": 0.2932, + "step": 1033 + }, + { + "epoch": 2.6461932181701853, + "grad_norm": 0.3484241132798254, + "learning_rate": 2.1377271711088655e-05, + "loss": 0.3339, + "step": 1034 + }, + { + "epoch": 2.6487523992322455, + "grad_norm": 0.43519634098943255, + "learning_rate": 2.1341552887820048e-05, + "loss": 0.3762, + "step": 1035 + }, + { + "epoch": 2.6513115802943057, + "grad_norm": 0.33001353721510546, + "learning_rate": 2.1305829765692588e-05, + "loss": 0.3277, + "step": 1036 + }, + { + "epoch": 2.653870761356366, + "grad_norm": 0.3720609129475248, + "learning_rate": 2.1270102459177093e-05, + "loss": 0.3101, + "step": 1037 + }, + { + "epoch": 2.656429942418426, + "grad_norm": 0.3837314560637158, + "learning_rate": 2.123437108275779e-05, + "loss": 0.351, + "step": 1038 + }, + { + "epoch": 2.6589891234804863, + "grad_norm": 0.40361073664191494, + "learning_rate": 2.119863575093195e-05, + "loss": 0.3171, + "step": 1039 + }, + { + "epoch": 2.6615483045425465, + "grad_norm": 0.35854323369583274, + "learning_rate": 2.1162896578209517e-05, + "loss": 0.3253, + "step": 1040 + }, + { + "epoch": 2.6641074856046068, + "grad_norm": 0.37248590893696937, + "learning_rate": 2.112715367911275e-05, + "loss": 0.3511, + "step": 1041 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.3453384873567075, + "learning_rate": 2.1091407168175836e-05, + "loss": 0.3414, + "step": 1042 + }, + { + "epoch": 2.6692258477287267, + "grad_norm": 0.40821485869737884, + "learning_rate": 2.1055657159944545e-05, + "loss": 0.3326, + "step": 1043 + }, + { + "epoch": 2.671785028790787, + "grad_norm": 0.36123413084691214, + "learning_rate": 2.1019903768975852e-05, + "loss": 0.3298, + "step": 1044 + }, + { + "epoch": 2.674344209852847, + "grad_norm": 0.3404084610072503, + "learning_rate": 2.0984147109837564e-05, + "loss": 0.3047, + "step": 1045 + }, + { + "epoch": 2.6769033909149074, + "grad_norm": 0.4454775171788898, + "learning_rate": 2.094838729710798e-05, + "loss": 0.3679, + "step": 1046 + }, + { + "epoch": 2.679462571976967, + "grad_norm": 0.3571602753390297, + "learning_rate": 2.0912624445375483e-05, + "loss": 0.3261, + "step": 1047 + }, + { + "epoch": 2.6820217530390273, + "grad_norm": 0.332410280409988, + "learning_rate": 2.0876858669238206e-05, + "loss": 0.3114, + "step": 1048 + }, + { + "epoch": 2.6845809341010876, + "grad_norm": 0.42377420739915694, + "learning_rate": 2.0841090083303643e-05, + "loss": 0.342, + "step": 1049 + }, + { + "epoch": 2.6871401151631478, + "grad_norm": 0.3395618485708725, + "learning_rate": 2.0805318802188307e-05, + "loss": 0.3157, + "step": 1050 + }, + { + "epoch": 2.689699296225208, + "grad_norm": 0.3545398294326781, + "learning_rate": 2.0769544940517326e-05, + "loss": 0.3207, + "step": 1051 + }, + { + "epoch": 2.692258477287268, + "grad_norm": 0.41494596285417495, + "learning_rate": 2.0733768612924137e-05, + "loss": 0.3294, + "step": 1052 + }, + { + "epoch": 2.6948176583493284, + "grad_norm": 0.3439663523935823, + "learning_rate": 2.0697989934050025e-05, + "loss": 0.2852, + "step": 1053 + }, + { + "epoch": 2.6973768394113886, + "grad_norm": 0.41679625940370135, + "learning_rate": 2.0662209018543836e-05, + "loss": 0.3548, + "step": 1054 + }, + { + "epoch": 2.6999360204734484, + "grad_norm": 0.4309626747124301, + "learning_rate": 2.0626425981061608e-05, + "loss": 0.326, + "step": 1055 + }, + { + "epoch": 2.7024952015355086, + "grad_norm": 0.3700268039056913, + "learning_rate": 2.0590640936266132e-05, + "loss": 0.3346, + "step": 1056 + }, + { + "epoch": 2.705054382597569, + "grad_norm": 0.4092764582830664, + "learning_rate": 2.0554853998826652e-05, + "loss": 0.3432, + "step": 1057 + }, + { + "epoch": 2.707613563659629, + "grad_norm": 0.3003644777646875, + "learning_rate": 2.0519065283418494e-05, + "loss": 0.257, + "step": 1058 + }, + { + "epoch": 2.710172744721689, + "grad_norm": 0.36206552407899595, + "learning_rate": 2.0483274904722647e-05, + "loss": 0.3339, + "step": 1059 + }, + { + "epoch": 2.712731925783749, + "grad_norm": 0.41404402491302905, + "learning_rate": 2.0447482977425465e-05, + "loss": 0.3705, + "step": 1060 + }, + { + "epoch": 2.715291106845809, + "grad_norm": 0.35467256324797414, + "learning_rate": 2.0411689616218234e-05, + "loss": 0.3173, + "step": 1061 + }, + { + "epoch": 2.7178502879078694, + "grad_norm": 0.39211100334061155, + "learning_rate": 2.037589493579685e-05, + "loss": 0.336, + "step": 1062 + }, + { + "epoch": 2.7204094689699296, + "grad_norm": 0.3752676154141884, + "learning_rate": 2.034009905086144e-05, + "loss": 0.339, + "step": 1063 + }, + { + "epoch": 2.72296865003199, + "grad_norm": 0.41294533749554696, + "learning_rate": 2.0304302076115987e-05, + "loss": 0.3187, + "step": 1064 + }, + { + "epoch": 2.72552783109405, + "grad_norm": 0.37876928739351945, + "learning_rate": 2.0268504126267952e-05, + "loss": 0.2895, + "step": 1065 + }, + { + "epoch": 2.7280870121561103, + "grad_norm": 0.4001922065674535, + "learning_rate": 2.0232705316027946e-05, + "loss": 0.3153, + "step": 1066 + }, + { + "epoch": 2.73064619321817, + "grad_norm": 0.8048596079823415, + "learning_rate": 2.019690576010931e-05, + "loss": 0.3593, + "step": 1067 + }, + { + "epoch": 2.7332053742802302, + "grad_norm": 0.38282646966391626, + "learning_rate": 2.0161105573227798e-05, + "loss": 0.3035, + "step": 1068 + }, + { + "epoch": 2.7357645553422905, + "grad_norm": 0.46315568817195285, + "learning_rate": 2.0125304870101184e-05, + "loss": 0.3751, + "step": 1069 + }, + { + "epoch": 2.7383237364043507, + "grad_norm": 0.4109495278872969, + "learning_rate": 2.008950376544887e-05, + "loss": 0.3346, + "step": 1070 + }, + { + "epoch": 2.740882917466411, + "grad_norm": 0.4857096813595627, + "learning_rate": 2.005370237399157e-05, + "loss": 0.328, + "step": 1071 + }, + { + "epoch": 2.7434420985284707, + "grad_norm": 0.39791730028337013, + "learning_rate": 2.0017900810450923e-05, + "loss": 0.2865, + "step": 1072 + }, + { + "epoch": 2.746001279590531, + "grad_norm": 0.4465305517364689, + "learning_rate": 1.9982099189549087e-05, + "loss": 0.3647, + "step": 1073 + }, + { + "epoch": 2.748560460652591, + "grad_norm": 0.41005369965327937, + "learning_rate": 1.9946297626008432e-05, + "loss": 0.3151, + "step": 1074 + }, + { + "epoch": 2.7511196417146513, + "grad_norm": 0.49665282457932985, + "learning_rate": 1.9910496234551132e-05, + "loss": 0.3809, + "step": 1075 + }, + { + "epoch": 2.7536788227767115, + "grad_norm": 0.37361149681320743, + "learning_rate": 1.9874695129898826e-05, + "loss": 0.3221, + "step": 1076 + }, + { + "epoch": 2.7562380038387717, + "grad_norm": 0.51447395549755, + "learning_rate": 1.9838894426772205e-05, + "loss": 0.3677, + "step": 1077 + }, + { + "epoch": 2.758797184900832, + "grad_norm": 0.4601671007615494, + "learning_rate": 1.9803094239890692e-05, + "loss": 0.3519, + "step": 1078 + }, + { + "epoch": 2.761356365962892, + "grad_norm": 0.39680274509707913, + "learning_rate": 1.9767294683972064e-05, + "loss": 0.3521, + "step": 1079 + }, + { + "epoch": 2.763915547024952, + "grad_norm": 0.43847108432837295, + "learning_rate": 1.9731495873732055e-05, + "loss": 0.3346, + "step": 1080 + }, + { + "epoch": 2.766474728087012, + "grad_norm": 0.4078264869228389, + "learning_rate": 1.969569792388402e-05, + "loss": 0.3624, + "step": 1081 + }, + { + "epoch": 2.7690339091490723, + "grad_norm": 0.37340927869032653, + "learning_rate": 1.9659900949138562e-05, + "loss": 0.3252, + "step": 1082 + }, + { + "epoch": 2.7715930902111325, + "grad_norm": 0.7622901676621391, + "learning_rate": 1.9624105064203157e-05, + "loss": 0.3829, + "step": 1083 + }, + { + "epoch": 2.7741522712731923, + "grad_norm": 0.4249232215662602, + "learning_rate": 1.9588310383781773e-05, + "loss": 0.342, + "step": 1084 + }, + { + "epoch": 2.7767114523352525, + "grad_norm": 0.41402606002990694, + "learning_rate": 1.9552517022574542e-05, + "loss": 0.3756, + "step": 1085 + }, + { + "epoch": 2.7792706333973127, + "grad_norm": 0.48132928718497536, + "learning_rate": 1.951672509527736e-05, + "loss": 0.3263, + "step": 1086 + }, + { + "epoch": 2.781829814459373, + "grad_norm": 0.3954278720969905, + "learning_rate": 1.9480934716581513e-05, + "loss": 0.3209, + "step": 1087 + }, + { + "epoch": 2.784388995521433, + "grad_norm": 0.39183072418087983, + "learning_rate": 1.944514600117335e-05, + "loss": 0.3807, + "step": 1088 + }, + { + "epoch": 2.7869481765834934, + "grad_norm": 0.4620346036605357, + "learning_rate": 1.940935906373388e-05, + "loss": 0.3576, + "step": 1089 + }, + { + "epoch": 2.7895073576455536, + "grad_norm": 0.36599489933850005, + "learning_rate": 1.93735740189384e-05, + "loss": 0.3239, + "step": 1090 + }, + { + "epoch": 2.792066538707614, + "grad_norm": 0.3686130358461154, + "learning_rate": 1.9337790981456164e-05, + "loss": 0.331, + "step": 1091 + }, + { + "epoch": 2.7946257197696736, + "grad_norm": 0.38425163759635245, + "learning_rate": 1.930201006594999e-05, + "loss": 0.2916, + "step": 1092 + }, + { + "epoch": 2.7971849008317338, + "grad_norm": 0.4405563689728695, + "learning_rate": 1.926623138707587e-05, + "loss": 0.3361, + "step": 1093 + }, + { + "epoch": 2.799744081893794, + "grad_norm": 0.38706329631394476, + "learning_rate": 1.923045505948267e-05, + "loss": 0.323, + "step": 1094 + }, + { + "epoch": 2.802303262955854, + "grad_norm": 0.4708041341171456, + "learning_rate": 1.9194681197811703e-05, + "loss": 0.3378, + "step": 1095 + }, + { + "epoch": 2.8048624440179144, + "grad_norm": 0.43844070620974923, + "learning_rate": 1.915890991669636e-05, + "loss": 0.331, + "step": 1096 + }, + { + "epoch": 2.807421625079974, + "grad_norm": 0.4532919809550908, + "learning_rate": 1.9123141330761804e-05, + "loss": 0.3863, + "step": 1097 + }, + { + "epoch": 2.8099808061420344, + "grad_norm": 0.48356206097375876, + "learning_rate": 1.9087375554624527e-05, + "loss": 0.3241, + "step": 1098 + }, + { + "epoch": 2.8125399872040946, + "grad_norm": 0.37016571224063527, + "learning_rate": 1.9051612702892028e-05, + "loss": 0.3035, + "step": 1099 + }, + { + "epoch": 2.815099168266155, + "grad_norm": 0.5488710646000331, + "learning_rate": 1.901585289016244e-05, + "loss": 0.3365, + "step": 1100 + }, + { + "epoch": 2.817658349328215, + "grad_norm": 0.39186801965858076, + "learning_rate": 1.898009623102415e-05, + "loss": 0.3171, + "step": 1101 + }, + { + "epoch": 2.8202175303902752, + "grad_norm": 0.396128192090393, + "learning_rate": 1.894434284005546e-05, + "loss": 0.2926, + "step": 1102 + }, + { + "epoch": 2.8227767114523354, + "grad_norm": 0.4498006361647164, + "learning_rate": 1.890859283182417e-05, + "loss": 0.339, + "step": 1103 + }, + { + "epoch": 2.8253358925143957, + "grad_norm": 0.4421659448994112, + "learning_rate": 1.887284632088725e-05, + "loss": 0.3512, + "step": 1104 + }, + { + "epoch": 2.8278950735764554, + "grad_norm": 0.40391454153472733, + "learning_rate": 1.8837103421790486e-05, + "loss": 0.3431, + "step": 1105 + }, + { + "epoch": 2.8304542546385156, + "grad_norm": 0.4029197127830964, + "learning_rate": 1.8801364249068053e-05, + "loss": 0.3083, + "step": 1106 + }, + { + "epoch": 2.833013435700576, + "grad_norm": 0.3636262656158161, + "learning_rate": 1.8765628917242213e-05, + "loss": 0.2695, + "step": 1107 + }, + { + "epoch": 2.835572616762636, + "grad_norm": 0.41088999848761776, + "learning_rate": 1.8729897540822914e-05, + "loss": 0.3191, + "step": 1108 + }, + { + "epoch": 2.838131797824696, + "grad_norm": 0.4141748812616923, + "learning_rate": 1.8694170234307415e-05, + "loss": 0.3494, + "step": 1109 + }, + { + "epoch": 2.840690978886756, + "grad_norm": 0.40924209313260995, + "learning_rate": 1.8658447112179952e-05, + "loss": 0.3424, + "step": 1110 + }, + { + "epoch": 2.8432501599488162, + "grad_norm": 0.38985761612852116, + "learning_rate": 1.8622728288911358e-05, + "loss": 0.3367, + "step": 1111 + }, + { + "epoch": 2.8458093410108765, + "grad_norm": 0.38078814823399454, + "learning_rate": 1.858701387895865e-05, + "loss": 0.3143, + "step": 1112 + }, + { + "epoch": 2.8483685220729367, + "grad_norm": 0.34959904397806785, + "learning_rate": 1.8551303996764755e-05, + "loss": 0.3007, + "step": 1113 + }, + { + "epoch": 2.850927703134997, + "grad_norm": 0.4183989425563707, + "learning_rate": 1.8515598756758064e-05, + "loss": 0.359, + "step": 1114 + }, + { + "epoch": 2.853486884197057, + "grad_norm": 0.37111554641813266, + "learning_rate": 1.8479898273352084e-05, + "loss": 0.3381, + "step": 1115 + }, + { + "epoch": 2.8560460652591173, + "grad_norm": 0.36998674648942864, + "learning_rate": 1.8444202660945105e-05, + "loss": 0.3317, + "step": 1116 + }, + { + "epoch": 2.858605246321177, + "grad_norm": 0.4020148068654659, + "learning_rate": 1.8408512033919798e-05, + "loss": 0.3048, + "step": 1117 + }, + { + "epoch": 2.8611644273832373, + "grad_norm": 0.4005462960969913, + "learning_rate": 1.837282650664284e-05, + "loss": 0.3603, + "step": 1118 + }, + { + "epoch": 2.8637236084452975, + "grad_norm": 0.3444120356474249, + "learning_rate": 1.8337146193464595e-05, + "loss": 0.2898, + "step": 1119 + }, + { + "epoch": 2.8662827895073577, + "grad_norm": 0.39082290145745685, + "learning_rate": 1.83014712087187e-05, + "loss": 0.3131, + "step": 1120 + }, + { + "epoch": 2.868841970569418, + "grad_norm": 0.35821645503904304, + "learning_rate": 1.8265801666721744e-05, + "loss": 0.3433, + "step": 1121 + }, + { + "epoch": 2.8714011516314777, + "grad_norm": 0.41668733158509796, + "learning_rate": 1.8230137681772836e-05, + "loss": 0.3567, + "step": 1122 + }, + { + "epoch": 2.873960332693538, + "grad_norm": 0.34804389069027475, + "learning_rate": 1.8194479368153298e-05, + "loss": 0.3136, + "step": 1123 + }, + { + "epoch": 2.876519513755598, + "grad_norm": 0.3911488779456878, + "learning_rate": 1.8158826840126292e-05, + "loss": 0.3412, + "step": 1124 + }, + { + "epoch": 2.8790786948176583, + "grad_norm": 0.44396897741245006, + "learning_rate": 1.8123180211936417e-05, + "loss": 0.3644, + "step": 1125 + }, + { + "epoch": 2.8816378758797185, + "grad_norm": 0.3818666952140594, + "learning_rate": 1.808753959780938e-05, + "loss": 0.2988, + "step": 1126 + }, + { + "epoch": 2.8841970569417787, + "grad_norm": 0.3819899108018794, + "learning_rate": 1.805190511195162e-05, + "loss": 0.3304, + "step": 1127 + }, + { + "epoch": 2.886756238003839, + "grad_norm": 0.3988449634404456, + "learning_rate": 1.801627686854992e-05, + "loss": 0.3413, + "step": 1128 + }, + { + "epoch": 2.889315419065899, + "grad_norm": 0.4410132899232368, + "learning_rate": 1.7980654981771074e-05, + "loss": 0.3725, + "step": 1129 + }, + { + "epoch": 2.891874600127959, + "grad_norm": 0.31362864495515286, + "learning_rate": 1.794503956576152e-05, + "loss": 0.2833, + "step": 1130 + }, + { + "epoch": 2.894433781190019, + "grad_norm": 0.411109247445083, + "learning_rate": 1.7909430734646936e-05, + "loss": 0.3297, + "step": 1131 + }, + { + "epoch": 2.8969929622520794, + "grad_norm": 0.3360115333630458, + "learning_rate": 1.78738286025319e-05, + "loss": 0.2985, + "step": 1132 + }, + { + "epoch": 2.8995521433141396, + "grad_norm": 0.3728330681966131, + "learning_rate": 1.7838233283499554e-05, + "loss": 0.378, + "step": 1133 + }, + { + "epoch": 2.9021113243761993, + "grad_norm": 0.3539275837478167, + "learning_rate": 1.780264489161117e-05, + "loss": 0.3638, + "step": 1134 + }, + { + "epoch": 2.9046705054382596, + "grad_norm": 0.33582742300915935, + "learning_rate": 1.776706354090585e-05, + "loss": 0.3383, + "step": 1135 + }, + { + "epoch": 2.9072296865003198, + "grad_norm": 0.38396584024437336, + "learning_rate": 1.7731489345400118e-05, + "loss": 0.3116, + "step": 1136 + }, + { + "epoch": 2.90978886756238, + "grad_norm": 0.3121417283022767, + "learning_rate": 1.769592241908758e-05, + "loss": 0.3089, + "step": 1137 + }, + { + "epoch": 2.91234804862444, + "grad_norm": 0.37871598245894667, + "learning_rate": 1.766036287593854e-05, + "loss": 0.3504, + "step": 1138 + }, + { + "epoch": 2.9149072296865004, + "grad_norm": 0.4083761440481677, + "learning_rate": 1.762481082989965e-05, + "loss": 0.3338, + "step": 1139 + }, + { + "epoch": 2.9174664107485606, + "grad_norm": 0.3759166218369834, + "learning_rate": 1.758926639489354e-05, + "loss": 0.3448, + "step": 1140 + }, + { + "epoch": 2.920025591810621, + "grad_norm": 0.3918883448687237, + "learning_rate": 1.755372968481844e-05, + "loss": 0.3465, + "step": 1141 + }, + { + "epoch": 2.9225847728726806, + "grad_norm": 0.45281102397931977, + "learning_rate": 1.7518200813547842e-05, + "loss": 0.352, + "step": 1142 + }, + { + "epoch": 2.925143953934741, + "grad_norm": 0.3403187481784673, + "learning_rate": 1.748267989493011e-05, + "loss": 0.2767, + "step": 1143 + }, + { + "epoch": 2.927703134996801, + "grad_norm": 0.32374732234103554, + "learning_rate": 1.7447167042788108e-05, + "loss": 0.3003, + "step": 1144 + }, + { + "epoch": 2.9302623160588612, + "grad_norm": 0.43821825023278765, + "learning_rate": 1.7411662370918893e-05, + "loss": 0.3365, + "step": 1145 + }, + { + "epoch": 2.9328214971209214, + "grad_norm": 0.3735728621894312, + "learning_rate": 1.7376165993093278e-05, + "loss": 0.3164, + "step": 1146 + }, + { + "epoch": 2.935380678182981, + "grad_norm": 0.3713647905663265, + "learning_rate": 1.7340678023055496e-05, + "loss": 0.3237, + "step": 1147 + }, + { + "epoch": 2.9379398592450414, + "grad_norm": 0.40116903737296333, + "learning_rate": 1.7305198574522864e-05, + "loss": 0.3614, + "step": 1148 + }, + { + "epoch": 2.9404990403071016, + "grad_norm": 0.40054289621797295, + "learning_rate": 1.7269727761185374e-05, + "loss": 0.334, + "step": 1149 + }, + { + "epoch": 2.943058221369162, + "grad_norm": 0.3925230509455669, + "learning_rate": 1.7234265696705344e-05, + "loss": 0.2959, + "step": 1150 + }, + { + "epoch": 2.945617402431222, + "grad_norm": 0.42214888830794545, + "learning_rate": 1.7198812494717062e-05, + "loss": 0.3776, + "step": 1151 + }, + { + "epoch": 2.9481765834932823, + "grad_norm": 0.3616779488648713, + "learning_rate": 1.7163368268826433e-05, + "loss": 0.3016, + "step": 1152 + }, + { + "epoch": 2.9507357645553425, + "grad_norm": 0.38477907353760216, + "learning_rate": 1.7127933132610573e-05, + "loss": 0.3073, + "step": 1153 + }, + { + "epoch": 2.9532949456174027, + "grad_norm": 0.38583388740216534, + "learning_rate": 1.7092507199617482e-05, + "loss": 0.3303, + "step": 1154 + }, + { + "epoch": 2.9558541266794625, + "grad_norm": 0.4615746603773426, + "learning_rate": 1.7057090583365678e-05, + "loss": 0.3944, + "step": 1155 + }, + { + "epoch": 2.9584133077415227, + "grad_norm": 0.3793974621003137, + "learning_rate": 1.7021683397343823e-05, + "loss": 0.3298, + "step": 1156 + }, + { + "epoch": 2.960972488803583, + "grad_norm": 0.4920742763843501, + "learning_rate": 1.698628575501034e-05, + "loss": 0.3401, + "step": 1157 + }, + { + "epoch": 2.963531669865643, + "grad_norm": 0.4104914994234224, + "learning_rate": 1.6950897769793093e-05, + "loss": 0.3268, + "step": 1158 + }, + { + "epoch": 2.966090850927703, + "grad_norm": 0.41153736245664496, + "learning_rate": 1.6915519555089e-05, + "loss": 0.3594, + "step": 1159 + }, + { + "epoch": 2.968650031989763, + "grad_norm": 0.3875070318404897, + "learning_rate": 1.6880151224263646e-05, + "loss": 0.3398, + "step": 1160 + }, + { + "epoch": 2.9712092130518233, + "grad_norm": 0.44887352669211456, + "learning_rate": 1.6844792890650976e-05, + "loss": 0.2813, + "step": 1161 + }, + { + "epoch": 2.9737683941138835, + "grad_norm": 0.3800558590649599, + "learning_rate": 1.680944466755289e-05, + "loss": 0.3635, + "step": 1162 + }, + { + "epoch": 2.9763275751759437, + "grad_norm": 0.3971504175952064, + "learning_rate": 1.6774106668238867e-05, + "loss": 0.3146, + "step": 1163 + }, + { + "epoch": 2.978886756238004, + "grad_norm": 0.4715791982640647, + "learning_rate": 1.673877900594566e-05, + "loss": 0.3553, + "step": 1164 + }, + { + "epoch": 2.981445937300064, + "grad_norm": 0.30859611389638464, + "learning_rate": 1.6703461793876876e-05, + "loss": 0.2989, + "step": 1165 + }, + { + "epoch": 2.9840051183621243, + "grad_norm": 0.3973251808749978, + "learning_rate": 1.6668155145202638e-05, + "loss": 0.3579, + "step": 1166 + }, + { + "epoch": 2.986564299424184, + "grad_norm": 0.446835333586439, + "learning_rate": 1.6632859173059232e-05, + "loss": 0.3258, + "step": 1167 + }, + { + "epoch": 2.9891234804862443, + "grad_norm": 0.3811823018283798, + "learning_rate": 1.6597573990548722e-05, + "loss": 0.3201, + "step": 1168 + }, + { + "epoch": 2.9916826615483045, + "grad_norm": 0.375872795491289, + "learning_rate": 1.6562299710738586e-05, + "loss": 0.3255, + "step": 1169 + }, + { + "epoch": 2.9942418426103647, + "grad_norm": 0.4029499625289535, + "learning_rate": 1.6527036446661396e-05, + "loss": 0.307, + "step": 1170 + }, + { + "epoch": 2.996801023672425, + "grad_norm": 0.4156370523912452, + "learning_rate": 1.6491784311314403e-05, + "loss": 0.3797, + "step": 1171 + }, + { + "epoch": 2.9993602047344847, + "grad_norm": 0.4189485161365368, + "learning_rate": 1.6456543417659192e-05, + "loss": 0.3488, + "step": 1172 + }, + { + "epoch": 3.001919385796545, + "grad_norm": 0.5633124123177335, + "learning_rate": 1.6421313878621344e-05, + "loss": 0.2857, + "step": 1173 + }, + { + "epoch": 3.004478566858605, + "grad_norm": 0.39083213560124314, + "learning_rate": 1.6386095807090047e-05, + "loss": 0.2379, + "step": 1174 + }, + { + "epoch": 3.0070377479206654, + "grad_norm": 0.5240113784249277, + "learning_rate": 1.635088931591775e-05, + "loss": 0.2245, + "step": 1175 + }, + { + "epoch": 3.0095969289827256, + "grad_norm": 0.6354763063642779, + "learning_rate": 1.631569451791977e-05, + "loss": 0.2869, + "step": 1176 + }, + { + "epoch": 3.012156110044786, + "grad_norm": 0.4893508226750199, + "learning_rate": 1.628051152587398e-05, + "loss": 0.2703, + "step": 1177 + }, + { + "epoch": 3.014715291106846, + "grad_norm": 0.4524451646086151, + "learning_rate": 1.6245340452520414e-05, + "loss": 0.2176, + "step": 1178 + }, + { + "epoch": 3.0172744721689058, + "grad_norm": 0.4778994330296646, + "learning_rate": 1.6210181410560912e-05, + "loss": 0.2571, + "step": 1179 + }, + { + "epoch": 3.019833653230966, + "grad_norm": 0.37599046051577073, + "learning_rate": 1.6175034512658753e-05, + "loss": 0.2338, + "step": 1180 + }, + { + "epoch": 3.022392834293026, + "grad_norm": 0.45320853578403025, + "learning_rate": 1.613989987143833e-05, + "loss": 0.2325, + "step": 1181 + }, + { + "epoch": 3.0249520153550864, + "grad_norm": 0.43340502469781866, + "learning_rate": 1.610477759948472e-05, + "loss": 0.2767, + "step": 1182 + }, + { + "epoch": 3.0275111964171466, + "grad_norm": 0.38361490841329304, + "learning_rate": 1.6069667809343396e-05, + "loss": 0.2445, + "step": 1183 + }, + { + "epoch": 3.030070377479207, + "grad_norm": 0.39878208398916914, + "learning_rate": 1.603457061351983e-05, + "loss": 0.2506, + "step": 1184 + }, + { + "epoch": 3.0326295585412666, + "grad_norm": 0.44131555640934655, + "learning_rate": 1.5999486124479115e-05, + "loss": 0.2588, + "step": 1185 + }, + { + "epoch": 3.035188739603327, + "grad_norm": 0.36228403498534006, + "learning_rate": 1.5964414454645647e-05, + "loss": 0.2394, + "step": 1186 + }, + { + "epoch": 3.037747920665387, + "grad_norm": 0.40556573549758734, + "learning_rate": 1.5929355716402754e-05, + "loss": 0.2422, + "step": 1187 + }, + { + "epoch": 3.0403071017274472, + "grad_norm": 0.46971240300411676, + "learning_rate": 1.5894310022092288e-05, + "loss": 0.2536, + "step": 1188 + }, + { + "epoch": 3.0428662827895074, + "grad_norm": 0.37074278168598435, + "learning_rate": 1.5859277484014338e-05, + "loss": 0.2262, + "step": 1189 + }, + { + "epoch": 3.0454254638515676, + "grad_norm": 0.4101230047135583, + "learning_rate": 1.5824258214426833e-05, + "loss": 0.2501, + "step": 1190 + }, + { + "epoch": 3.047984644913628, + "grad_norm": 0.4316482731662535, + "learning_rate": 1.5789252325545157e-05, + "loss": 0.2766, + "step": 1191 + }, + { + "epoch": 3.0505438259756876, + "grad_norm": 0.36783848268146724, + "learning_rate": 1.5754259929541848e-05, + "loss": 0.2401, + "step": 1192 + }, + { + "epoch": 3.053103007037748, + "grad_norm": 0.4061482848745174, + "learning_rate": 1.5719281138546186e-05, + "loss": 0.2508, + "step": 1193 + }, + { + "epoch": 3.055662188099808, + "grad_norm": 0.382539152433566, + "learning_rate": 1.568431606464388e-05, + "loss": 0.2489, + "step": 1194 + }, + { + "epoch": 3.0582213691618683, + "grad_norm": 0.3652697874982772, + "learning_rate": 1.5649364819876655e-05, + "loss": 0.2429, + "step": 1195 + }, + { + "epoch": 3.0607805502239285, + "grad_norm": 0.42737097970366417, + "learning_rate": 1.561442751624193e-05, + "loss": 0.256, + "step": 1196 + }, + { + "epoch": 3.0633397312859887, + "grad_norm": 0.3207166801748589, + "learning_rate": 1.557950426569248e-05, + "loss": 0.1962, + "step": 1197 + }, + { + "epoch": 3.0658989123480485, + "grad_norm": 0.3943172590265861, + "learning_rate": 1.5544595180136003e-05, + "loss": 0.2519, + "step": 1198 + }, + { + "epoch": 3.0684580934101087, + "grad_norm": 0.36274192661719984, + "learning_rate": 1.550970037143483e-05, + "loss": 0.2279, + "step": 1199 + }, + { + "epoch": 3.071017274472169, + "grad_norm": 0.3630874200444502, + "learning_rate": 1.547481995140556e-05, + "loss": 0.2516, + "step": 1200 + }, + { + "epoch": 3.073576455534229, + "grad_norm": 0.3518754730219404, + "learning_rate": 1.5439954031818652e-05, + "loss": 0.2329, + "step": 1201 + }, + { + "epoch": 3.0761356365962893, + "grad_norm": 0.37790098887949486, + "learning_rate": 1.5405102724398113e-05, + "loss": 0.2677, + "step": 1202 + }, + { + "epoch": 3.0786948176583495, + "grad_norm": 0.35043382673558215, + "learning_rate": 1.5370266140821143e-05, + "loss": 0.2294, + "step": 1203 + }, + { + "epoch": 3.0812539987204093, + "grad_norm": 0.3731184820167596, + "learning_rate": 1.5335444392717738e-05, + "loss": 0.2319, + "step": 1204 + }, + { + "epoch": 3.0838131797824695, + "grad_norm": 0.3395760795759123, + "learning_rate": 1.5300637591670357e-05, + "loss": 0.2333, + "step": 1205 + }, + { + "epoch": 3.0863723608445297, + "grad_norm": 0.3530521161404101, + "learning_rate": 1.5265845849213588e-05, + "loss": 0.2458, + "step": 1206 + }, + { + "epoch": 3.08893154190659, + "grad_norm": 0.2968062718035343, + "learning_rate": 1.523106927683374e-05, + "loss": 0.1984, + "step": 1207 + }, + { + "epoch": 3.09149072296865, + "grad_norm": 0.34618157328728927, + "learning_rate": 1.5196307985968509e-05, + "loss": 0.2338, + "step": 1208 + }, + { + "epoch": 3.0940499040307103, + "grad_norm": 0.3991365653818135, + "learning_rate": 1.5161562088006649e-05, + "loss": 0.2639, + "step": 1209 + }, + { + "epoch": 3.09660908509277, + "grad_norm": 0.3337465568445769, + "learning_rate": 1.5126831694287564e-05, + "loss": 0.2354, + "step": 1210 + }, + { + "epoch": 3.0991682661548303, + "grad_norm": 0.39228609678359605, + "learning_rate": 1.5092116916100982e-05, + "loss": 0.2737, + "step": 1211 + }, + { + "epoch": 3.1017274472168905, + "grad_norm": 0.3216556126081721, + "learning_rate": 1.5057417864686607e-05, + "loss": 0.2237, + "step": 1212 + }, + { + "epoch": 3.1042866282789507, + "grad_norm": 0.3567015015151436, + "learning_rate": 1.5022734651233737e-05, + "loss": 0.2568, + "step": 1213 + }, + { + "epoch": 3.106845809341011, + "grad_norm": 0.35178848138592544, + "learning_rate": 1.4988067386880904e-05, + "loss": 0.2276, + "step": 1214 + }, + { + "epoch": 3.109404990403071, + "grad_norm": 0.3511504661864566, + "learning_rate": 1.4953416182715566e-05, + "loss": 0.2699, + "step": 1215 + }, + { + "epoch": 3.1119641714651314, + "grad_norm": 0.3425398327341164, + "learning_rate": 1.4918781149773694e-05, + "loss": 0.2677, + "step": 1216 + }, + { + "epoch": 3.114523352527191, + "grad_norm": 0.36160910451306577, + "learning_rate": 1.4884162399039439e-05, + "loss": 0.2545, + "step": 1217 + }, + { + "epoch": 3.1170825335892514, + "grad_norm": 0.37097329916252125, + "learning_rate": 1.4849560041444795e-05, + "loss": 0.2609, + "step": 1218 + }, + { + "epoch": 3.1196417146513116, + "grad_norm": 0.36352782561925345, + "learning_rate": 1.4814974187869218e-05, + "loss": 0.2236, + "step": 1219 + }, + { + "epoch": 3.122200895713372, + "grad_norm": 0.3551683721423837, + "learning_rate": 1.478040494913926e-05, + "loss": 0.2244, + "step": 1220 + }, + { + "epoch": 3.124760076775432, + "grad_norm": 0.3308813359796844, + "learning_rate": 1.4745852436028262e-05, + "loss": 0.2591, + "step": 1221 + }, + { + "epoch": 3.127319257837492, + "grad_norm": 0.3507830733548493, + "learning_rate": 1.4711316759255963e-05, + "loss": 0.2453, + "step": 1222 + }, + { + "epoch": 3.129878438899552, + "grad_norm": 0.33582461966461585, + "learning_rate": 1.4676798029488123e-05, + "loss": 0.2593, + "step": 1223 + }, + { + "epoch": 3.132437619961612, + "grad_norm": 0.3507842455435477, + "learning_rate": 1.464229635733624e-05, + "loss": 0.2372, + "step": 1224 + }, + { + "epoch": 3.1349968010236724, + "grad_norm": 0.3318567188084375, + "learning_rate": 1.460781185335713e-05, + "loss": 0.231, + "step": 1225 + }, + { + "epoch": 3.1375559820857326, + "grad_norm": 0.3188990523059626, + "learning_rate": 1.4573344628052588e-05, + "loss": 0.2376, + "step": 1226 + }, + { + "epoch": 3.140115163147793, + "grad_norm": 0.3664030290111237, + "learning_rate": 1.4538894791869052e-05, + "loss": 0.2585, + "step": 1227 + }, + { + "epoch": 3.142674344209853, + "grad_norm": 0.3456582759492691, + "learning_rate": 1.4504462455197248e-05, + "loss": 0.2295, + "step": 1228 + }, + { + "epoch": 3.145233525271913, + "grad_norm": 0.3071644333690587, + "learning_rate": 1.4470047728371813e-05, + "loss": 0.2113, + "step": 1229 + }, + { + "epoch": 3.147792706333973, + "grad_norm": 0.33706220227684885, + "learning_rate": 1.443565072167095e-05, + "loss": 0.2286, + "step": 1230 + }, + { + "epoch": 3.1503518873960332, + "grad_norm": 0.3213473350814528, + "learning_rate": 1.4401271545316096e-05, + "loss": 0.2333, + "step": 1231 + }, + { + "epoch": 3.1529110684580934, + "grad_norm": 0.32806923950956013, + "learning_rate": 1.436691030947155e-05, + "loss": 0.2338, + "step": 1232 + }, + { + "epoch": 3.1554702495201536, + "grad_norm": 0.33540430311298725, + "learning_rate": 1.43325671242441e-05, + "loss": 0.209, + "step": 1233 + }, + { + "epoch": 3.158029430582214, + "grad_norm": 0.3208377254757641, + "learning_rate": 1.4298242099682726e-05, + "loss": 0.245, + "step": 1234 + }, + { + "epoch": 3.1605886116442736, + "grad_norm": 0.3427654538840671, + "learning_rate": 1.4263935345778202e-05, + "loss": 0.2521, + "step": 1235 + }, + { + "epoch": 3.163147792706334, + "grad_norm": 0.35551971354398254, + "learning_rate": 1.4229646972462732e-05, + "loss": 0.2338, + "step": 1236 + }, + { + "epoch": 3.165706973768394, + "grad_norm": 0.33357596367989273, + "learning_rate": 1.419537708960966e-05, + "loss": 0.2322, + "step": 1237 + }, + { + "epoch": 3.1682661548304543, + "grad_norm": 0.3416940872307819, + "learning_rate": 1.4161125807033059e-05, + "loss": 0.24, + "step": 1238 + }, + { + "epoch": 3.1708253358925145, + "grad_norm": 0.3259027337159305, + "learning_rate": 1.412689323448739e-05, + "loss": 0.2705, + "step": 1239 + }, + { + "epoch": 3.1733845169545747, + "grad_norm": 0.33778026989222404, + "learning_rate": 1.409267948166718e-05, + "loss": 0.2335, + "step": 1240 + }, + { + "epoch": 3.175943698016635, + "grad_norm": 0.3425941944724759, + "learning_rate": 1.4058484658206646e-05, + "loss": 0.2684, + "step": 1241 + }, + { + "epoch": 3.1785028790786947, + "grad_norm": 0.304680436913791, + "learning_rate": 1.4024308873679327e-05, + "loss": 0.2181, + "step": 1242 + }, + { + "epoch": 3.181062060140755, + "grad_norm": 0.3522850109826806, + "learning_rate": 1.3990152237597787e-05, + "loss": 0.2572, + "step": 1243 + }, + { + "epoch": 3.183621241202815, + "grad_norm": 0.3209721557320742, + "learning_rate": 1.3956014859413211e-05, + "loss": 0.2337, + "step": 1244 + }, + { + "epoch": 3.1861804222648753, + "grad_norm": 0.3293098487746776, + "learning_rate": 1.3921896848515064e-05, + "loss": 0.2411, + "step": 1245 + }, + { + "epoch": 3.1887396033269355, + "grad_norm": 0.30365057870700035, + "learning_rate": 1.388779831423078e-05, + "loss": 0.2291, + "step": 1246 + }, + { + "epoch": 3.1912987843889957, + "grad_norm": 0.3131208709907512, + "learning_rate": 1.3853719365825357e-05, + "loss": 0.2352, + "step": 1247 + }, + { + "epoch": 3.1938579654510555, + "grad_norm": 0.3376254463164988, + "learning_rate": 1.3819660112501054e-05, + "loss": 0.2625, + "step": 1248 + }, + { + "epoch": 3.1964171465131157, + "grad_norm": 0.3069287683421629, + "learning_rate": 1.3785620663396992e-05, + "loss": 0.2229, + "step": 1249 + }, + { + "epoch": 3.198976327575176, + "grad_norm": 0.3316589759980029, + "learning_rate": 1.3751601127588849e-05, + "loss": 0.245, + "step": 1250 + }, + { + "epoch": 3.201535508637236, + "grad_norm": 0.30527916893181595, + "learning_rate": 1.37176016140885e-05, + "loss": 0.2346, + "step": 1251 + }, + { + "epoch": 3.2040946896992963, + "grad_norm": 0.34900918414936455, + "learning_rate": 1.3683622231843644e-05, + "loss": 0.2392, + "step": 1252 + }, + { + "epoch": 3.2066538707613566, + "grad_norm": 0.30641882677939075, + "learning_rate": 1.364966308973747e-05, + "loss": 0.2218, + "step": 1253 + }, + { + "epoch": 3.2092130518234163, + "grad_norm": 0.3296381755475144, + "learning_rate": 1.3615724296588342e-05, + "loss": 0.2566, + "step": 1254 + }, + { + "epoch": 3.2117722328854765, + "grad_norm": 0.34080590800970306, + "learning_rate": 1.3581805961149371e-05, + "loss": 0.2518, + "step": 1255 + }, + { + "epoch": 3.2143314139475367, + "grad_norm": 0.31502390005004344, + "learning_rate": 1.3547908192108143e-05, + "loss": 0.2288, + "step": 1256 + }, + { + "epoch": 3.216890595009597, + "grad_norm": 0.30635056034248115, + "learning_rate": 1.3514031098086349e-05, + "loss": 0.2539, + "step": 1257 + }, + { + "epoch": 3.219449776071657, + "grad_norm": 0.34066214746897916, + "learning_rate": 1.3480174787639397e-05, + "loss": 0.2664, + "step": 1258 + }, + { + "epoch": 3.2220089571337174, + "grad_norm": 0.31764511418435903, + "learning_rate": 1.3446339369256121e-05, + "loss": 0.2067, + "step": 1259 + }, + { + "epoch": 3.224568138195777, + "grad_norm": 0.29852956617495935, + "learning_rate": 1.341252495135841e-05, + "loss": 0.2298, + "step": 1260 + }, + { + "epoch": 3.2271273192578374, + "grad_norm": 0.34615186747664684, + "learning_rate": 1.3378731642300841e-05, + "loss": 0.2488, + "step": 1261 + }, + { + "epoch": 3.2296865003198976, + "grad_norm": 0.31284863193899576, + "learning_rate": 1.3344959550370362e-05, + "loss": 0.222, + "step": 1262 + }, + { + "epoch": 3.232245681381958, + "grad_norm": 0.3198015399733083, + "learning_rate": 1.3311208783785945e-05, + "loss": 0.2561, + "step": 1263 + }, + { + "epoch": 3.234804862444018, + "grad_norm": 0.33484963926651445, + "learning_rate": 1.327747945069819e-05, + "loss": 0.2532, + "step": 1264 + }, + { + "epoch": 3.237364043506078, + "grad_norm": 0.3505508918710989, + "learning_rate": 1.324377165918906e-05, + "loss": 0.253, + "step": 1265 + }, + { + "epoch": 3.2399232245681384, + "grad_norm": 0.3317100885612124, + "learning_rate": 1.3210085517271459e-05, + "loss": 0.2488, + "step": 1266 + }, + { + "epoch": 3.242482405630198, + "grad_norm": 0.3183951056600632, + "learning_rate": 1.3176421132888936e-05, + "loss": 0.2206, + "step": 1267 + }, + { + "epoch": 3.2450415866922584, + "grad_norm": 0.7798174468009574, + "learning_rate": 1.3142778613915308e-05, + "loss": 0.3465, + "step": 1268 + }, + { + "epoch": 3.2476007677543186, + "grad_norm": 0.3276263112485529, + "learning_rate": 1.3109158068154329e-05, + "loss": 0.2206, + "step": 1269 + }, + { + "epoch": 3.250159948816379, + "grad_norm": 0.35582948619273064, + "learning_rate": 1.3075559603339354e-05, + "loss": 0.2272, + "step": 1270 + }, + { + "epoch": 3.252719129878439, + "grad_norm": 0.3425439408964034, + "learning_rate": 1.304198332713296e-05, + "loss": 0.2587, + "step": 1271 + }, + { + "epoch": 3.255278310940499, + "grad_norm": 0.35588463150095667, + "learning_rate": 1.3008429347126641e-05, + "loss": 0.2585, + "step": 1272 + }, + { + "epoch": 3.257837492002559, + "grad_norm": 0.3354553998723496, + "learning_rate": 1.2974897770840448e-05, + "loss": 0.2067, + "step": 1273 + }, + { + "epoch": 3.260396673064619, + "grad_norm": 0.3247184453320128, + "learning_rate": 1.2941388705722627e-05, + "loss": 0.2449, + "step": 1274 + }, + { + "epoch": 3.2629558541266794, + "grad_norm": 0.3269101886184072, + "learning_rate": 1.2907902259149287e-05, + "loss": 0.2454, + "step": 1275 + }, + { + "epoch": 3.2655150351887396, + "grad_norm": 0.34277751654037186, + "learning_rate": 1.2874438538424086e-05, + "loss": 0.2267, + "step": 1276 + }, + { + "epoch": 3.2680742162508, + "grad_norm": 0.3425839528353915, + "learning_rate": 1.2840997650777829e-05, + "loss": 0.2289, + "step": 1277 + }, + { + "epoch": 3.27063339731286, + "grad_norm": 0.32496809601312776, + "learning_rate": 1.2807579703368162e-05, + "loss": 0.2437, + "step": 1278 + }, + { + "epoch": 3.27319257837492, + "grad_norm": 0.37627417428336984, + "learning_rate": 1.2774184803279245e-05, + "loss": 0.2196, + "step": 1279 + }, + { + "epoch": 3.27575175943698, + "grad_norm": 0.30844464297394786, + "learning_rate": 1.274081305752135e-05, + "loss": 0.2148, + "step": 1280 + }, + { + "epoch": 3.2783109404990403, + "grad_norm": 0.32041752442694194, + "learning_rate": 1.2707464573030572e-05, + "loss": 0.2495, + "step": 1281 + }, + { + "epoch": 3.2808701215611005, + "grad_norm": 0.3247468389566215, + "learning_rate": 1.2674139456668479e-05, + "loss": 0.2558, + "step": 1282 + }, + { + "epoch": 3.2834293026231607, + "grad_norm": 0.30941778730595587, + "learning_rate": 1.2640837815221731e-05, + "loss": 0.2238, + "step": 1283 + }, + { + "epoch": 3.285988483685221, + "grad_norm": 0.3397180703699647, + "learning_rate": 1.260755975540178e-05, + "loss": 0.2405, + "step": 1284 + }, + { + "epoch": 3.2885476647472807, + "grad_norm": 0.31915117073868005, + "learning_rate": 1.2574305383844528e-05, + "loss": 0.2396, + "step": 1285 + }, + { + "epoch": 3.291106845809341, + "grad_norm": 0.2977168854647766, + "learning_rate": 1.2541074807109945e-05, + "loss": 0.2286, + "step": 1286 + }, + { + "epoch": 3.293666026871401, + "grad_norm": 0.3141534078265832, + "learning_rate": 1.250786813168176e-05, + "loss": 0.2291, + "step": 1287 + }, + { + "epoch": 3.2962252079334613, + "grad_norm": 0.3250362176047104, + "learning_rate": 1.2474685463967125e-05, + "loss": 0.2353, + "step": 1288 + }, + { + "epoch": 3.2987843889955215, + "grad_norm": 0.5783304188096524, + "learning_rate": 1.2441526910296253e-05, + "loss": 0.2316, + "step": 1289 + }, + { + "epoch": 3.3013435700575817, + "grad_norm": 0.31254991008955707, + "learning_rate": 1.2408392576922075e-05, + "loss": 0.2336, + "step": 1290 + }, + { + "epoch": 3.303902751119642, + "grad_norm": 0.31041323932723247, + "learning_rate": 1.2375282570019933e-05, + "loss": 0.2457, + "step": 1291 + }, + { + "epoch": 3.3064619321817017, + "grad_norm": 0.32993719976229857, + "learning_rate": 1.2342196995687212e-05, + "loss": 0.2588, + "step": 1292 + }, + { + "epoch": 3.309021113243762, + "grad_norm": 0.3016426692910031, + "learning_rate": 1.2309135959942986e-05, + "loss": 0.2577, + "step": 1293 + }, + { + "epoch": 3.311580294305822, + "grad_norm": 0.34298650349077, + "learning_rate": 1.227609956872772e-05, + "loss": 0.2386, + "step": 1294 + }, + { + "epoch": 3.3141394753678823, + "grad_norm": 0.3233823231490881, + "learning_rate": 1.2243087927902905e-05, + "loss": 0.2203, + "step": 1295 + }, + { + "epoch": 3.3166986564299425, + "grad_norm": 0.3431401051489707, + "learning_rate": 1.2210101143250708e-05, + "loss": 0.2369, + "step": 1296 + }, + { + "epoch": 3.3192578374920023, + "grad_norm": 0.30536095566488874, + "learning_rate": 1.2177139320473663e-05, + "loss": 0.209, + "step": 1297 + }, + { + "epoch": 3.3218170185540625, + "grad_norm": 0.3380078591507895, + "learning_rate": 1.2144202565194311e-05, + "loss": 0.2793, + "step": 1298 + }, + { + "epoch": 3.3243761996161227, + "grad_norm": 0.3327840853187567, + "learning_rate": 1.211129098295486e-05, + "loss": 0.2473, + "step": 1299 + }, + { + "epoch": 3.326935380678183, + "grad_norm": 0.29406663008971645, + "learning_rate": 1.2078404679216864e-05, + "loss": 0.2056, + "step": 1300 + }, + { + "epoch": 3.329494561740243, + "grad_norm": 0.31662925879264764, + "learning_rate": 1.2045543759360876e-05, + "loss": 0.2443, + "step": 1301 + }, + { + "epoch": 3.3320537428023034, + "grad_norm": 0.322730147707437, + "learning_rate": 1.2012708328686093e-05, + "loss": 0.2143, + "step": 1302 + }, + { + "epoch": 3.3346129238643636, + "grad_norm": 0.2975751708993095, + "learning_rate": 1.1979898492410049e-05, + "loss": 0.2385, + "step": 1303 + }, + { + "epoch": 3.3371721049264234, + "grad_norm": 0.3636857886009555, + "learning_rate": 1.1947114355668265e-05, + "loss": 0.2623, + "step": 1304 + }, + { + "epoch": 3.3397312859884836, + "grad_norm": 0.3199715781542156, + "learning_rate": 1.1914356023513904e-05, + "loss": 0.2605, + "step": 1305 + }, + { + "epoch": 3.342290467050544, + "grad_norm": 0.32209675935492127, + "learning_rate": 1.1881623600917437e-05, + "loss": 0.2474, + "step": 1306 + }, + { + "epoch": 3.344849648112604, + "grad_norm": 0.3146940770709782, + "learning_rate": 1.1848917192766322e-05, + "loss": 0.1886, + "step": 1307 + }, + { + "epoch": 3.347408829174664, + "grad_norm": 0.3515135762125379, + "learning_rate": 1.1816236903864656e-05, + "loss": 0.2639, + "step": 1308 + }, + { + "epoch": 3.3499680102367244, + "grad_norm": 0.3273044475438286, + "learning_rate": 1.1783582838932821e-05, + "loss": 0.2681, + "step": 1309 + }, + { + "epoch": 3.352527191298784, + "grad_norm": 0.3241556852063347, + "learning_rate": 1.1750955102607193e-05, + "loss": 0.2148, + "step": 1310 + }, + { + "epoch": 3.3550863723608444, + "grad_norm": 0.34788552766120195, + "learning_rate": 1.1718353799439766e-05, + "loss": 0.2328, + "step": 1311 + }, + { + "epoch": 3.3576455534229046, + "grad_norm": 0.31285926538820524, + "learning_rate": 1.1685779033897827e-05, + "loss": 0.2139, + "step": 1312 + }, + { + "epoch": 3.360204734484965, + "grad_norm": 0.34091923953794956, + "learning_rate": 1.1653230910363645e-05, + "loss": 0.2522, + "step": 1313 + }, + { + "epoch": 3.362763915547025, + "grad_norm": 0.3093765842790142, + "learning_rate": 1.1620709533134104e-05, + "loss": 0.25, + "step": 1314 + }, + { + "epoch": 3.3653230966090852, + "grad_norm": 0.3332035792467053, + "learning_rate": 1.1588215006420374e-05, + "loss": 0.2729, + "step": 1315 + }, + { + "epoch": 3.3678822776711455, + "grad_norm": 0.3076095736538766, + "learning_rate": 1.1555747434347606e-05, + "loss": 0.2076, + "step": 1316 + }, + { + "epoch": 3.370441458733205, + "grad_norm": 0.30288813913337326, + "learning_rate": 1.1523306920954571e-05, + "loss": 0.2449, + "step": 1317 + }, + { + "epoch": 3.3730006397952654, + "grad_norm": 0.3702724394333769, + "learning_rate": 1.1490893570193328e-05, + "loss": 0.2646, + "step": 1318 + }, + { + "epoch": 3.3755598208573256, + "grad_norm": 0.35103580998810946, + "learning_rate": 1.1458507485928891e-05, + "loss": 0.2634, + "step": 1319 + }, + { + "epoch": 3.378119001919386, + "grad_norm": 0.3037095018272702, + "learning_rate": 1.1426148771938915e-05, + "loss": 0.2229, + "step": 1320 + }, + { + "epoch": 3.380678182981446, + "grad_norm": 0.3192579058292554, + "learning_rate": 1.139381753191335e-05, + "loss": 0.2489, + "step": 1321 + }, + { + "epoch": 3.383237364043506, + "grad_norm": 0.32501023665860496, + "learning_rate": 1.1361513869454092e-05, + "loss": 0.2407, + "step": 1322 + }, + { + "epoch": 3.385796545105566, + "grad_norm": 0.35596917491983554, + "learning_rate": 1.1329237888074691e-05, + "loss": 0.2437, + "step": 1323 + }, + { + "epoch": 3.3883557261676263, + "grad_norm": 0.33172031186682566, + "learning_rate": 1.129698969119998e-05, + "loss": 0.2623, + "step": 1324 + }, + { + "epoch": 3.3909149072296865, + "grad_norm": 0.2995594606117323, + "learning_rate": 1.1264769382165748e-05, + "loss": 0.1996, + "step": 1325 + }, + { + "epoch": 3.3934740882917467, + "grad_norm": 0.3194047630014032, + "learning_rate": 1.123257706421845e-05, + "loss": 0.236, + "step": 1326 + }, + { + "epoch": 3.396033269353807, + "grad_norm": 0.33068857427400655, + "learning_rate": 1.1200412840514839e-05, + "loss": 0.2244, + "step": 1327 + }, + { + "epoch": 3.398592450415867, + "grad_norm": 0.36837193881845204, + "learning_rate": 1.1168276814121621e-05, + "loss": 0.2828, + "step": 1328 + }, + { + "epoch": 3.401151631477927, + "grad_norm": 0.3076965971301543, + "learning_rate": 1.1136169088015177e-05, + "loss": 0.2241, + "step": 1329 + }, + { + "epoch": 3.403710812539987, + "grad_norm": 0.3411885790050691, + "learning_rate": 1.110408976508118e-05, + "loss": 0.2232, + "step": 1330 + }, + { + "epoch": 3.4062699936020473, + "grad_norm": 0.3263500540450158, + "learning_rate": 1.107203894811429e-05, + "loss": 0.2572, + "step": 1331 + }, + { + "epoch": 3.4088291746641075, + "grad_norm": 0.3416605787702754, + "learning_rate": 1.1040016739817836e-05, + "loss": 0.2433, + "step": 1332 + }, + { + "epoch": 3.4113883557261677, + "grad_norm": 0.335195547574942, + "learning_rate": 1.1008023242803477e-05, + "loss": 0.2648, + "step": 1333 + }, + { + "epoch": 3.413947536788228, + "grad_norm": 0.36538157527653864, + "learning_rate": 1.097605855959084e-05, + "loss": 0.2286, + "step": 1334 + }, + { + "epoch": 3.4165067178502877, + "grad_norm": 0.2912108672412734, + "learning_rate": 1.094412279260726e-05, + "loss": 0.2073, + "step": 1335 + }, + { + "epoch": 3.419065898912348, + "grad_norm": 0.32352858434290793, + "learning_rate": 1.0912216044187382e-05, + "loss": 0.2725, + "step": 1336 + }, + { + "epoch": 3.421625079974408, + "grad_norm": 0.30980460582268804, + "learning_rate": 1.0880338416572872e-05, + "loss": 0.242, + "step": 1337 + }, + { + "epoch": 3.4241842610364683, + "grad_norm": 0.3026357282953144, + "learning_rate": 1.0848490011912096e-05, + "loss": 0.2207, + "step": 1338 + }, + { + "epoch": 3.4267434420985285, + "grad_norm": 0.3222649336637817, + "learning_rate": 1.0816670932259763e-05, + "loss": 0.2196, + "step": 1339 + }, + { + "epoch": 3.4293026231605888, + "grad_norm": 0.3346500533447882, + "learning_rate": 1.0784881279576635e-05, + "loss": 0.2187, + "step": 1340 + }, + { + "epoch": 3.431861804222649, + "grad_norm": 0.3122079565048836, + "learning_rate": 1.0753121155729133e-05, + "loss": 0.2227, + "step": 1341 + }, + { + "epoch": 3.4344209852847087, + "grad_norm": 0.3240510909707239, + "learning_rate": 1.07213906624891e-05, + "loss": 0.2231, + "step": 1342 + }, + { + "epoch": 3.436980166346769, + "grad_norm": 0.3235912618403718, + "learning_rate": 1.0689689901533424e-05, + "loss": 0.2492, + "step": 1343 + }, + { + "epoch": 3.439539347408829, + "grad_norm": 0.3040119908970231, + "learning_rate": 1.0658018974443692e-05, + "loss": 0.1984, + "step": 1344 + }, + { + "epoch": 3.4420985284708894, + "grad_norm": 0.340863607236755, + "learning_rate": 1.0626377982705929e-05, + "loss": 0.2349, + "step": 1345 + }, + { + "epoch": 3.4446577095329496, + "grad_norm": 0.32795701173977326, + "learning_rate": 1.059476702771021e-05, + "loss": 0.2529, + "step": 1346 + }, + { + "epoch": 3.4472168905950094, + "grad_norm": 0.31132450713720333, + "learning_rate": 1.056318621075036e-05, + "loss": 0.2095, + "step": 1347 + }, + { + "epoch": 3.4497760716570696, + "grad_norm": 0.3254097118432526, + "learning_rate": 1.0531635633023644e-05, + "loss": 0.2358, + "step": 1348 + }, + { + "epoch": 3.4523352527191298, + "grad_norm": 0.3017269372689714, + "learning_rate": 1.050011539563043e-05, + "loss": 0.2247, + "step": 1349 + }, + { + "epoch": 3.45489443378119, + "grad_norm": 0.3283903326525304, + "learning_rate": 1.0468625599573842e-05, + "loss": 0.2718, + "step": 1350 + }, + { + "epoch": 3.45745361484325, + "grad_norm": 0.31738351502037276, + "learning_rate": 1.0437166345759489e-05, + "loss": 0.2345, + "step": 1351 + }, + { + "epoch": 3.4600127959053104, + "grad_norm": 0.30402755053632596, + "learning_rate": 1.0405737734995083e-05, + "loss": 0.2057, + "step": 1352 + }, + { + "epoch": 3.4625719769673706, + "grad_norm": 0.33488642167297444, + "learning_rate": 1.037433986799015e-05, + "loss": 0.2439, + "step": 1353 + }, + { + "epoch": 3.4651311580294304, + "grad_norm": 0.30914851197686366, + "learning_rate": 1.034297284535571e-05, + "loss": 0.2028, + "step": 1354 + }, + { + "epoch": 3.4676903390914906, + "grad_norm": 0.32175431813825445, + "learning_rate": 1.0311636767603952e-05, + "loss": 0.2439, + "step": 1355 + }, + { + "epoch": 3.470249520153551, + "grad_norm": 0.3395487986448244, + "learning_rate": 1.028033173514788e-05, + "loss": 0.2502, + "step": 1356 + }, + { + "epoch": 3.472808701215611, + "grad_norm": 0.3042126709214444, + "learning_rate": 1.0249057848301043e-05, + "loss": 0.2395, + "step": 1357 + }, + { + "epoch": 3.4753678822776712, + "grad_norm": 0.3183501854149144, + "learning_rate": 1.0217815207277165e-05, + "loss": 0.2234, + "step": 1358 + }, + { + "epoch": 3.4779270633397315, + "grad_norm": 0.3233478623491546, + "learning_rate": 1.0186603912189867e-05, + "loss": 0.2589, + "step": 1359 + }, + { + "epoch": 3.480486244401791, + "grad_norm": 0.3018286569760461, + "learning_rate": 1.0155424063052306e-05, + "loss": 0.2401, + "step": 1360 + }, + { + "epoch": 3.4830454254638514, + "grad_norm": 0.3174313407841064, + "learning_rate": 1.0124275759776889e-05, + "loss": 0.2399, + "step": 1361 + }, + { + "epoch": 3.4856046065259116, + "grad_norm": 0.307048568974569, + "learning_rate": 1.0093159102174938e-05, + "loss": 0.2291, + "step": 1362 + }, + { + "epoch": 3.488163787587972, + "grad_norm": 0.3132031600114937, + "learning_rate": 1.006207418995636e-05, + "loss": 0.2086, + "step": 1363 + }, + { + "epoch": 3.490722968650032, + "grad_norm": 0.34596427815653313, + "learning_rate": 1.0031021122729328e-05, + "loss": 0.2497, + "step": 1364 + }, + { + "epoch": 3.4932821497120923, + "grad_norm": 0.2986896060364163, + "learning_rate": 1.0000000000000006e-05, + "loss": 0.2379, + "step": 1365 + }, + { + "epoch": 3.4958413307741525, + "grad_norm": 0.3319066544902576, + "learning_rate": 9.969010921172155e-06, + "loss": 0.2542, + "step": 1366 + }, + { + "epoch": 3.4984005118362123, + "grad_norm": 0.3053208810307986, + "learning_rate": 9.938053985546883e-06, + "loss": 0.2299, + "step": 1367 + }, + { + "epoch": 3.5009596928982725, + "grad_norm": 0.3736366846122222, + "learning_rate": 9.907129292322298e-06, + "loss": 0.2676, + "step": 1368 + }, + { + "epoch": 3.5035188739603327, + "grad_norm": 0.33175316766942814, + "learning_rate": 9.876236940593173e-06, + "loss": 0.2753, + "step": 1369 + }, + { + "epoch": 3.506078055022393, + "grad_norm": 0.3276624133983928, + "learning_rate": 9.84537702935065e-06, + "loss": 0.2745, + "step": 1370 + }, + { + "epoch": 3.508637236084453, + "grad_norm": 0.3067138284048095, + "learning_rate": 9.814549657481935e-06, + "loss": 0.2201, + "step": 1371 + }, + { + "epoch": 3.511196417146513, + "grad_norm": 0.3489342217784152, + "learning_rate": 9.783754923769946e-06, + "loss": 0.2402, + "step": 1372 + }, + { + "epoch": 3.513755598208573, + "grad_norm": 0.3103704194146652, + "learning_rate": 9.752992926893027e-06, + "loss": 0.231, + "step": 1373 + }, + { + "epoch": 3.5163147792706333, + "grad_norm": 0.29799307682543535, + "learning_rate": 9.722263765424628e-06, + "loss": 0.2103, + "step": 1374 + }, + { + "epoch": 3.5188739603326935, + "grad_norm": 0.3245882511730939, + "learning_rate": 9.691567537832964e-06, + "loss": 0.2547, + "step": 1375 + }, + { + "epoch": 3.5214331413947537, + "grad_norm": 0.29350433799889125, + "learning_rate": 9.660904342480715e-06, + "loss": 0.2083, + "step": 1376 + }, + { + "epoch": 3.523992322456814, + "grad_norm": 0.3333636077305378, + "learning_rate": 9.630274277624729e-06, + "loss": 0.2837, + "step": 1377 + }, + { + "epoch": 3.526551503518874, + "grad_norm": 0.33952864266921756, + "learning_rate": 9.599677441415694e-06, + "loss": 0.2313, + "step": 1378 + }, + { + "epoch": 3.5291106845809344, + "grad_norm": 0.3012959852140507, + "learning_rate": 9.5691139318978e-06, + "loss": 0.2171, + "step": 1379 + }, + { + "epoch": 3.531669865642994, + "grad_norm": 0.3318689398716116, + "learning_rate": 9.538583847008452e-06, + "loss": 0.2366, + "step": 1380 + }, + { + "epoch": 3.5342290467050543, + "grad_norm": 0.3358375103639254, + "learning_rate": 9.508087284577963e-06, + "loss": 0.2402, + "step": 1381 + }, + { + "epoch": 3.5367882277671145, + "grad_norm": 0.30920378089911293, + "learning_rate": 9.477624342329209e-06, + "loss": 0.2143, + "step": 1382 + }, + { + "epoch": 3.5393474088291748, + "grad_norm": 0.3169427298803479, + "learning_rate": 9.447195117877343e-06, + "loss": 0.2285, + "step": 1383 + }, + { + "epoch": 3.541906589891235, + "grad_norm": 0.3112954457690554, + "learning_rate": 9.416799708729486e-06, + "loss": 0.2315, + "step": 1384 + }, + { + "epoch": 3.5444657709532947, + "grad_norm": 0.3238511016385153, + "learning_rate": 9.386438212284372e-06, + "loss": 0.2252, + "step": 1385 + }, + { + "epoch": 3.547024952015355, + "grad_norm": 0.3031398199187957, + "learning_rate": 9.356110725832081e-06, + "loss": 0.2376, + "step": 1386 + }, + { + "epoch": 3.549584133077415, + "grad_norm": 0.3105091167975465, + "learning_rate": 9.325817346553725e-06, + "loss": 0.2689, + "step": 1387 + }, + { + "epoch": 3.5521433141394754, + "grad_norm": 0.31208594113425225, + "learning_rate": 9.295558171521093e-06, + "loss": 0.2278, + "step": 1388 + }, + { + "epoch": 3.5547024952015356, + "grad_norm": 0.31485619105429463, + "learning_rate": 9.265333297696395e-06, + "loss": 0.242, + "step": 1389 + }, + { + "epoch": 3.557261676263596, + "grad_norm": 0.31606147283215824, + "learning_rate": 9.235142821931928e-06, + "loss": 0.2363, + "step": 1390 + }, + { + "epoch": 3.559820857325656, + "grad_norm": 0.3150525527068536, + "learning_rate": 9.204986840969749e-06, + "loss": 0.2199, + "step": 1391 + }, + { + "epoch": 3.5623800383877158, + "grad_norm": 0.31857444893477177, + "learning_rate": 9.174865451441375e-06, + "loss": 0.2283, + "step": 1392 + }, + { + "epoch": 3.564939219449776, + "grad_norm": 0.30466028849006704, + "learning_rate": 9.1447787498675e-06, + "loss": 0.232, + "step": 1393 + }, + { + "epoch": 3.567498400511836, + "grad_norm": 0.34031824974175295, + "learning_rate": 9.114726832657658e-06, + "loss": 0.2663, + "step": 1394 + }, + { + "epoch": 3.5700575815738964, + "grad_norm": 0.3261116373502211, + "learning_rate": 9.084709796109907e-06, + "loss": 0.2489, + "step": 1395 + }, + { + "epoch": 3.5726167626359566, + "grad_norm": 0.30217642557332414, + "learning_rate": 9.054727736410555e-06, + "loss": 0.2613, + "step": 1396 + }, + { + "epoch": 3.5751759436980164, + "grad_norm": 0.3029175133984261, + "learning_rate": 9.02478074963381e-06, + "loss": 0.2263, + "step": 1397 + }, + { + "epoch": 3.5777351247600766, + "grad_norm": 0.3613891446327612, + "learning_rate": 8.994868931741499e-06, + "loss": 0.2658, + "step": 1398 + }, + { + "epoch": 3.580294305822137, + "grad_norm": 0.32375645597232505, + "learning_rate": 8.964992378582758e-06, + "loss": 0.2458, + "step": 1399 + }, + { + "epoch": 3.582853486884197, + "grad_norm": 0.346977280248451, + "learning_rate": 8.93515118589373e-06, + "loss": 0.2673, + "step": 1400 + }, + { + "epoch": 3.5854126679462572, + "grad_norm": 0.3091670769782311, + "learning_rate": 8.905345449297223e-06, + "loss": 0.2517, + "step": 1401 + }, + { + "epoch": 3.5879718490083174, + "grad_norm": 0.3033606651844572, + "learning_rate": 8.87557526430246e-06, + "loss": 0.2012, + "step": 1402 + }, + { + "epoch": 3.5905310300703777, + "grad_norm": 0.3115678004260849, + "learning_rate": 8.845840726304723e-06, + "loss": 0.2297, + "step": 1403 + }, + { + "epoch": 3.593090211132438, + "grad_norm": 0.3441565775702763, + "learning_rate": 8.816141930585067e-06, + "loss": 0.2542, + "step": 1404 + }, + { + "epoch": 3.5956493921944976, + "grad_norm": 0.30703493196426435, + "learning_rate": 8.786478972310023e-06, + "loss": 0.2342, + "step": 1405 + }, + { + "epoch": 3.598208573256558, + "grad_norm": 0.31549384345069187, + "learning_rate": 8.756851946531294e-06, + "loss": 0.247, + "step": 1406 + }, + { + "epoch": 3.600767754318618, + "grad_norm": 0.29102001056426585, + "learning_rate": 8.72726094818541e-06, + "loss": 0.2074, + "step": 1407 + }, + { + "epoch": 3.6033269353806783, + "grad_norm": 0.33490062048588876, + "learning_rate": 8.697706072093493e-06, + "loss": 0.2541, + "step": 1408 + }, + { + "epoch": 3.6058861164427385, + "grad_norm": 0.3124637481108502, + "learning_rate": 8.668187412960887e-06, + "loss": 0.2437, + "step": 1409 + }, + { + "epoch": 3.6084452975047983, + "grad_norm": 0.3113279052119832, + "learning_rate": 8.638705065376887e-06, + "loss": 0.2389, + "step": 1410 + }, + { + "epoch": 3.6110044785668585, + "grad_norm": 0.3194418676306725, + "learning_rate": 8.609259123814443e-06, + "loss": 0.2549, + "step": 1411 + }, + { + "epoch": 3.6135636596289187, + "grad_norm": 0.28910975717043763, + "learning_rate": 8.579849682629844e-06, + "loss": 0.2246, + "step": 1412 + }, + { + "epoch": 3.616122840690979, + "grad_norm": 0.31307324283444393, + "learning_rate": 8.550476836062419e-06, + "loss": 0.2425, + "step": 1413 + }, + { + "epoch": 3.618682021753039, + "grad_norm": 0.3040084877742325, + "learning_rate": 8.521140678234214e-06, + "loss": 0.2361, + "step": 1414 + }, + { + "epoch": 3.6212412028150993, + "grad_norm": 0.3130765747434785, + "learning_rate": 8.491841303149728e-06, + "loss": 0.2272, + "step": 1415 + }, + { + "epoch": 3.6238003838771595, + "grad_norm": 0.30265698644043443, + "learning_rate": 8.462578804695595e-06, + "loss": 0.2701, + "step": 1416 + }, + { + "epoch": 3.6263595649392193, + "grad_norm": 0.2987350574519827, + "learning_rate": 8.43335327664027e-06, + "loss": 0.2177, + "step": 1417 + }, + { + "epoch": 3.6289187460012795, + "grad_norm": 0.3166676881180338, + "learning_rate": 8.404164812633755e-06, + "loss": 0.2756, + "step": 1418 + }, + { + "epoch": 3.6314779270633397, + "grad_norm": 0.3033675615693007, + "learning_rate": 8.375013506207275e-06, + "loss": 0.2136, + "step": 1419 + }, + { + "epoch": 3.6340371081254, + "grad_norm": 0.30761103254564687, + "learning_rate": 8.345899450772975e-06, + "loss": 0.2535, + "step": 1420 + }, + { + "epoch": 3.63659628918746, + "grad_norm": 0.2982029035930307, + "learning_rate": 8.316822739623662e-06, + "loss": 0.2165, + "step": 1421 + }, + { + "epoch": 3.63915547024952, + "grad_norm": 0.31969771781374523, + "learning_rate": 8.287783465932466e-06, + "loss": 0.257, + "step": 1422 + }, + { + "epoch": 3.64171465131158, + "grad_norm": 0.2837299669230536, + "learning_rate": 8.258781722752535e-06, + "loss": 0.224, + "step": 1423 + }, + { + "epoch": 3.6442738323736403, + "grad_norm": 0.3012916036117272, + "learning_rate": 8.229817603016786e-06, + "loss": 0.2246, + "step": 1424 + }, + { + "epoch": 3.6468330134357005, + "grad_norm": 0.31189818144182524, + "learning_rate": 8.200891199537549e-06, + "loss": 0.2695, + "step": 1425 + }, + { + "epoch": 3.6493921944977608, + "grad_norm": 0.2997238939361931, + "learning_rate": 8.1720026050063e-06, + "loss": 0.1862, + "step": 1426 + }, + { + "epoch": 3.651951375559821, + "grad_norm": 0.34075811134574374, + "learning_rate": 8.143151911993374e-06, + "loss": 0.2619, + "step": 1427 + }, + { + "epoch": 3.654510556621881, + "grad_norm": 0.3146837641261533, + "learning_rate": 8.114339212947655e-06, + "loss": 0.2396, + "step": 1428 + }, + { + "epoch": 3.6570697376839414, + "grad_norm": 0.325379060971747, + "learning_rate": 8.085564600196258e-06, + "loss": 0.2435, + "step": 1429 + }, + { + "epoch": 3.659628918746001, + "grad_norm": 0.33089426856010606, + "learning_rate": 8.056828165944282e-06, + "loss": 0.2459, + "step": 1430 + }, + { + "epoch": 3.6621880998080614, + "grad_norm": 0.3111195583478781, + "learning_rate": 8.028130002274459e-06, + "loss": 0.2328, + "step": 1431 + }, + { + "epoch": 3.6647472808701216, + "grad_norm": 0.2917405967267679, + "learning_rate": 7.999470201146915e-06, + "loss": 0.2273, + "step": 1432 + }, + { + "epoch": 3.667306461932182, + "grad_norm": 0.3072708256616894, + "learning_rate": 7.970848854398825e-06, + "loss": 0.2616, + "step": 1433 + }, + { + "epoch": 3.669865642994242, + "grad_norm": 0.3274866801130236, + "learning_rate": 7.942266053744155e-06, + "loss": 0.2469, + "step": 1434 + }, + { + "epoch": 3.6724248240563018, + "grad_norm": 0.3206452949252231, + "learning_rate": 7.913721890773354e-06, + "loss": 0.2265, + "step": 1435 + }, + { + "epoch": 3.674984005118362, + "grad_norm": 0.3076598617812908, + "learning_rate": 7.885216456953053e-06, + "loss": 0.2167, + "step": 1436 + }, + { + "epoch": 3.677543186180422, + "grad_norm": 0.2979466544850749, + "learning_rate": 7.856749843625777e-06, + "loss": 0.2203, + "step": 1437 + }, + { + "epoch": 3.6801023672424824, + "grad_norm": 0.3224437385684691, + "learning_rate": 7.828322142009672e-06, + "loss": 0.2473, + "step": 1438 + }, + { + "epoch": 3.6826615483045426, + "grad_norm": 0.31485417513081154, + "learning_rate": 7.799933443198173e-06, + "loss": 0.2606, + "step": 1439 + }, + { + "epoch": 3.685220729366603, + "grad_norm": 0.3243639601134415, + "learning_rate": 7.771583838159756e-06, + "loss": 0.2633, + "step": 1440 + }, + { + "epoch": 3.687779910428663, + "grad_norm": 0.32893510730247094, + "learning_rate": 7.743273417737617e-06, + "loss": 0.2531, + "step": 1441 + }, + { + "epoch": 3.690339091490723, + "grad_norm": 0.31189557871340884, + "learning_rate": 7.715002272649388e-06, + "loss": 0.2403, + "step": 1442 + }, + { + "epoch": 3.692898272552783, + "grad_norm": 0.2901780629695267, + "learning_rate": 7.686770493486835e-06, + "loss": 0.2517, + "step": 1443 + }, + { + "epoch": 3.6954574536148432, + "grad_norm": 0.3159906947616901, + "learning_rate": 7.65857817071561e-06, + "loss": 0.2492, + "step": 1444 + }, + { + "epoch": 3.6980166346769034, + "grad_norm": 0.3098159857766409, + "learning_rate": 7.630425394674903e-06, + "loss": 0.2341, + "step": 1445 + }, + { + "epoch": 3.7005758157389637, + "grad_norm": 0.31575476226888965, + "learning_rate": 7.602312255577193e-06, + "loss": 0.2416, + "step": 1446 + }, + { + "epoch": 3.7031349968010234, + "grad_norm": 0.31087089571752347, + "learning_rate": 7.574238843507957e-06, + "loss": 0.2673, + "step": 1447 + }, + { + "epoch": 3.7056941778630836, + "grad_norm": 0.3209375688192084, + "learning_rate": 7.546205248425353e-06, + "loss": 0.2313, + "step": 1448 + }, + { + "epoch": 3.708253358925144, + "grad_norm": 0.30680259294814516, + "learning_rate": 7.518211560159949e-06, + "loss": 0.2187, + "step": 1449 + }, + { + "epoch": 3.710812539987204, + "grad_norm": 0.3064495121568587, + "learning_rate": 7.49025786841445e-06, + "loss": 0.2161, + "step": 1450 + }, + { + "epoch": 3.7133717210492643, + "grad_norm": 0.29214767577744066, + "learning_rate": 7.462344262763399e-06, + "loss": 0.2339, + "step": 1451 + }, + { + "epoch": 3.7159309021113245, + "grad_norm": 0.3146911137467561, + "learning_rate": 7.434470832652865e-06, + "loss": 0.2464, + "step": 1452 + }, + { + "epoch": 3.7184900831733847, + "grad_norm": 0.3499806012960674, + "learning_rate": 7.406637667400205e-06, + "loss": 0.2246, + "step": 1453 + }, + { + "epoch": 3.721049264235445, + "grad_norm": 0.30724891177758956, + "learning_rate": 7.378844856193736e-06, + "loss": 0.272, + "step": 1454 + }, + { + "epoch": 3.7236084452975047, + "grad_norm": 0.29711759225447126, + "learning_rate": 7.3510924880924575e-06, + "loss": 0.2205, + "step": 1455 + }, + { + "epoch": 3.726167626359565, + "grad_norm": 0.31114069148352147, + "learning_rate": 7.323380652025794e-06, + "loss": 0.2619, + "step": 1456 + }, + { + "epoch": 3.728726807421625, + "grad_norm": 0.3019555109712794, + "learning_rate": 7.295709436793284e-06, + "loss": 0.2526, + "step": 1457 + }, + { + "epoch": 3.7312859884836853, + "grad_norm": 0.30587326696750855, + "learning_rate": 7.268078931064293e-06, + "loss": 0.2156, + "step": 1458 + }, + { + "epoch": 3.7338451695457455, + "grad_norm": 0.3010387774085701, + "learning_rate": 7.2404892233777334e-06, + "loss": 0.2343, + "step": 1459 + }, + { + "epoch": 3.7364043506078053, + "grad_norm": 0.30265924975544334, + "learning_rate": 7.212940402141808e-06, + "loss": 0.2542, + "step": 1460 + }, + { + "epoch": 3.7389635316698655, + "grad_norm": 0.2899279021405336, + "learning_rate": 7.185432555633672e-06, + "loss": 0.2263, + "step": 1461 + }, + { + "epoch": 3.7415227127319257, + "grad_norm": 0.3197841686852777, + "learning_rate": 7.1579657719992045e-06, + "loss": 0.2665, + "step": 1462 + }, + { + "epoch": 3.744081893793986, + "grad_norm": 0.27978040220930955, + "learning_rate": 7.130540139252704e-06, + "loss": 0.2338, + "step": 1463 + }, + { + "epoch": 3.746641074856046, + "grad_norm": 0.31288929736620646, + "learning_rate": 7.1031557452765934e-06, + "loss": 0.2372, + "step": 1464 + }, + { + "epoch": 3.7492002559181064, + "grad_norm": 0.3098001081263227, + "learning_rate": 7.075812677821145e-06, + "loss": 0.2221, + "step": 1465 + }, + { + "epoch": 3.7517594369801666, + "grad_norm": 0.29645463690795515, + "learning_rate": 7.048511024504223e-06, + "loss": 0.2439, + "step": 1466 + }, + { + "epoch": 3.7543186180422263, + "grad_norm": 0.30305748324499643, + "learning_rate": 7.021250872810983e-06, + "loss": 0.2447, + "step": 1467 + }, + { + "epoch": 3.7568777991042865, + "grad_norm": 0.3180001382034292, + "learning_rate": 6.9940323100935725e-06, + "loss": 0.2455, + "step": 1468 + }, + { + "epoch": 3.7594369801663468, + "grad_norm": 0.30172748918488546, + "learning_rate": 6.966855423570898e-06, + "loss": 0.2319, + "step": 1469 + }, + { + "epoch": 3.761996161228407, + "grad_norm": 0.30342551799145645, + "learning_rate": 6.939720300328303e-06, + "loss": 0.2283, + "step": 1470 + }, + { + "epoch": 3.764555342290467, + "grad_norm": 0.31295405822100386, + "learning_rate": 6.9126270273173e-06, + "loss": 0.2361, + "step": 1471 + }, + { + "epoch": 3.767114523352527, + "grad_norm": 0.2970422267078146, + "learning_rate": 6.885575691355315e-06, + "loss": 0.1965, + "step": 1472 + }, + { + "epoch": 3.769673704414587, + "grad_norm": 0.32398586323455586, + "learning_rate": 6.858566379125389e-06, + "loss": 0.2661, + "step": 1473 + }, + { + "epoch": 3.7722328854766474, + "grad_norm": 0.28922597868223626, + "learning_rate": 6.831599177175879e-06, + "loss": 0.222, + "step": 1474 + }, + { + "epoch": 3.7747920665387076, + "grad_norm": 0.296927401509587, + "learning_rate": 6.8046741719202385e-06, + "loss": 0.2262, + "step": 1475 + }, + { + "epoch": 3.777351247600768, + "grad_norm": 0.318579910031184, + "learning_rate": 6.777791449636681e-06, + "loss": 0.2455, + "step": 1476 + }, + { + "epoch": 3.779910428662828, + "grad_norm": 0.30482817177680954, + "learning_rate": 6.7509510964679305e-06, + "loss": 0.2376, + "step": 1477 + }, + { + "epoch": 3.782469609724888, + "grad_norm": 0.3043184575100951, + "learning_rate": 6.724153198420957e-06, + "loss": 0.2508, + "step": 1478 + }, + { + "epoch": 3.7850287907869484, + "grad_norm": 0.29471146448676344, + "learning_rate": 6.697397841366686e-06, + "loss": 0.2219, + "step": 1479 + }, + { + "epoch": 3.787587971849008, + "grad_norm": 0.2763016151280136, + "learning_rate": 6.67068511103971e-06, + "loss": 0.2092, + "step": 1480 + }, + { + "epoch": 3.7901471529110684, + "grad_norm": 0.29319933516975266, + "learning_rate": 6.644015093038049e-06, + "loss": 0.2042, + "step": 1481 + }, + { + "epoch": 3.7927063339731286, + "grad_norm": 0.3160040922162459, + "learning_rate": 6.617387872822842e-06, + "loss": 0.2269, + "step": 1482 + }, + { + "epoch": 3.795265515035189, + "grad_norm": 0.31249964016432036, + "learning_rate": 6.590803535718082e-06, + "loss": 0.2841, + "step": 1483 + }, + { + "epoch": 3.797824696097249, + "grad_norm": 0.29901036986186935, + "learning_rate": 6.564262166910367e-06, + "loss": 0.2096, + "step": 1484 + }, + { + "epoch": 3.800383877159309, + "grad_norm": 0.3151541438953574, + "learning_rate": 6.537763851448593e-06, + "loss": 0.2215, + "step": 1485 + }, + { + "epoch": 3.802943058221369, + "grad_norm": 0.3238104601226396, + "learning_rate": 6.511308674243711e-06, + "loss": 0.2493, + "step": 1486 + }, + { + "epoch": 3.8055022392834292, + "grad_norm": 0.30561591970597685, + "learning_rate": 6.484896720068421e-06, + "loss": 0.238, + "step": 1487 + }, + { + "epoch": 3.8080614203454894, + "grad_norm": 0.28920704087260063, + "learning_rate": 6.458528073556925e-06, + "loss": 0.2685, + "step": 1488 + }, + { + "epoch": 3.8106206014075497, + "grad_norm": 0.3038589031703289, + "learning_rate": 6.432202819204667e-06, + "loss": 0.248, + "step": 1489 + }, + { + "epoch": 3.81317978246961, + "grad_norm": 0.3385452502303158, + "learning_rate": 6.4059210413680175e-06, + "loss": 0.2503, + "step": 1490 + }, + { + "epoch": 3.81573896353167, + "grad_norm": 0.2768232976810782, + "learning_rate": 6.379682824264055e-06, + "loss": 0.2164, + "step": 1491 + }, + { + "epoch": 3.81829814459373, + "grad_norm": 0.3009349939503682, + "learning_rate": 6.353488251970275e-06, + "loss": 0.2366, + "step": 1492 + }, + { + "epoch": 3.82085732565579, + "grad_norm": 0.2928819289728828, + "learning_rate": 6.327337408424281e-06, + "loss": 0.2332, + "step": 1493 + }, + { + "epoch": 3.8234165067178503, + "grad_norm": 0.2870974660694633, + "learning_rate": 6.301230377423595e-06, + "loss": 0.2, + "step": 1494 + }, + { + "epoch": 3.8259756877799105, + "grad_norm": 0.3081786003102801, + "learning_rate": 6.275167242625331e-06, + "loss": 0.2414, + "step": 1495 + }, + { + "epoch": 3.8285348688419707, + "grad_norm": 0.2790089490684267, + "learning_rate": 6.2491480875459336e-06, + "loss": 0.215, + "step": 1496 + }, + { + "epoch": 3.8310940499040305, + "grad_norm": 0.29306281039648446, + "learning_rate": 6.223172995560935e-06, + "loss": 0.2679, + "step": 1497 + }, + { + "epoch": 3.8336532309660907, + "grad_norm": 0.2762204818599091, + "learning_rate": 6.1972420499046635e-06, + "loss": 0.2192, + "step": 1498 + }, + { + "epoch": 3.836212412028151, + "grad_norm": 0.29828532304173333, + "learning_rate": 6.171355333669973e-06, + "loss": 0.2441, + "step": 1499 + }, + { + "epoch": 3.838771593090211, + "grad_norm": 0.32131992187351277, + "learning_rate": 6.145512929808013e-06, + "loss": 0.229, + "step": 1500 + }, + { + "epoch": 3.8413307741522713, + "grad_norm": 0.3043046372632, + "learning_rate": 6.119714921127933e-06, + "loss": 0.2694, + "step": 1501 + }, + { + "epoch": 3.8438899552143315, + "grad_norm": 0.2717132569091513, + "learning_rate": 6.093961390296603e-06, + "loss": 0.2254, + "step": 1502 + }, + { + "epoch": 3.8464491362763917, + "grad_norm": 0.30029687863634635, + "learning_rate": 6.068252419838399e-06, + "loss": 0.2326, + "step": 1503 + }, + { + "epoch": 3.849008317338452, + "grad_norm": 0.27288392014891966, + "learning_rate": 6.042588092134878e-06, + "loss": 0.2163, + "step": 1504 + }, + { + "epoch": 3.8515674984005117, + "grad_norm": 0.283307260045485, + "learning_rate": 6.016968489424572e-06, + "loss": 0.2312, + "step": 1505 + }, + { + "epoch": 3.854126679462572, + "grad_norm": 0.2919987165904583, + "learning_rate": 5.991393693802674e-06, + "loss": 0.2533, + "step": 1506 + }, + { + "epoch": 3.856685860524632, + "grad_norm": 0.2968617574713475, + "learning_rate": 5.96586378722081e-06, + "loss": 0.2397, + "step": 1507 + }, + { + "epoch": 3.8592450415866923, + "grad_norm": 0.28294719312430794, + "learning_rate": 5.940378851486766e-06, + "loss": 0.2302, + "step": 1508 + }, + { + "epoch": 3.8618042226487526, + "grad_norm": 0.2885647154013107, + "learning_rate": 5.9149389682642165e-06, + "loss": 0.2429, + "step": 1509 + }, + { + "epoch": 3.8643634037108123, + "grad_norm": 0.29335993884299777, + "learning_rate": 5.889544219072465e-06, + "loss": 0.2347, + "step": 1510 + }, + { + "epoch": 3.8669225847728725, + "grad_norm": 0.30407531909262675, + "learning_rate": 5.864194685286206e-06, + "loss": 0.2405, + "step": 1511 + }, + { + "epoch": 3.8694817658349328, + "grad_norm": 0.29889145730228683, + "learning_rate": 5.838890448135228e-06, + "loss": 0.2373, + "step": 1512 + }, + { + "epoch": 3.872040946896993, + "grad_norm": 0.3002231215499318, + "learning_rate": 5.81363158870418e-06, + "loss": 0.2316, + "step": 1513 + }, + { + "epoch": 3.874600127959053, + "grad_norm": 0.3246106606552585, + "learning_rate": 5.788418187932314e-06, + "loss": 0.2365, + "step": 1514 + }, + { + "epoch": 3.8771593090211134, + "grad_norm": 0.29292707103941984, + "learning_rate": 5.7632503266131925e-06, + "loss": 0.2087, + "step": 1515 + }, + { + "epoch": 3.8797184900831736, + "grad_norm": 0.3255584037433501, + "learning_rate": 5.7381280853944585e-06, + "loss": 0.2807, + "step": 1516 + }, + { + "epoch": 3.8822776711452334, + "grad_norm": 0.2912419112819368, + "learning_rate": 5.713051544777584e-06, + "loss": 0.2218, + "step": 1517 + }, + { + "epoch": 3.8848368522072936, + "grad_norm": 0.3048580778970364, + "learning_rate": 5.688020785117581e-06, + "loss": 0.2753, + "step": 1518 + }, + { + "epoch": 3.887396033269354, + "grad_norm": 0.29167337976733226, + "learning_rate": 5.66303588662277e-06, + "loss": 0.2329, + "step": 1519 + }, + { + "epoch": 3.889955214331414, + "grad_norm": 0.2992142371051243, + "learning_rate": 5.638096929354522e-06, + "loss": 0.2268, + "step": 1520 + }, + { + "epoch": 3.892514395393474, + "grad_norm": 0.3027423583297044, + "learning_rate": 5.613203993226981e-06, + "loss": 0.221, + "step": 1521 + }, + { + "epoch": 3.895073576455534, + "grad_norm": 0.3021686605260421, + "learning_rate": 5.588357158006821e-06, + "loss": 0.252, + "step": 1522 + }, + { + "epoch": 3.897632757517594, + "grad_norm": 0.2996327213886315, + "learning_rate": 5.563556503312997e-06, + "loss": 0.2318, + "step": 1523 + }, + { + "epoch": 3.9001919385796544, + "grad_norm": 0.2957035292429405, + "learning_rate": 5.538802108616494e-06, + "loss": 0.239, + "step": 1524 + }, + { + "epoch": 3.9027511196417146, + "grad_norm": 0.309492243720712, + "learning_rate": 5.514094053240035e-06, + "loss": 0.2228, + "step": 1525 + }, + { + "epoch": 3.905310300703775, + "grad_norm": 0.6416163470606975, + "learning_rate": 5.489432416357885e-06, + "loss": 0.2326, + "step": 1526 + }, + { + "epoch": 3.907869481765835, + "grad_norm": 0.297847179245253, + "learning_rate": 5.46481727699554e-06, + "loss": 0.2346, + "step": 1527 + }, + { + "epoch": 3.9104286628278953, + "grad_norm": 0.30350301665082174, + "learning_rate": 5.440248714029508e-06, + "loss": 0.2478, + "step": 1528 + }, + { + "epoch": 3.9129878438899555, + "grad_norm": 0.29944760683896515, + "learning_rate": 5.415726806187052e-06, + "loss": 0.2306, + "step": 1529 + }, + { + "epoch": 3.9155470249520152, + "grad_norm": 0.28720986057591197, + "learning_rate": 5.39125163204594e-06, + "loss": 0.2057, + "step": 1530 + }, + { + "epoch": 3.9181062060140754, + "grad_norm": 0.2868190209794002, + "learning_rate": 5.3668232700341735e-06, + "loss": 0.2278, + "step": 1531 + }, + { + "epoch": 3.9206653870761357, + "grad_norm": 0.3118363419035033, + "learning_rate": 5.342441798429747e-06, + "loss": 0.2518, + "step": 1532 + }, + { + "epoch": 3.923224568138196, + "grad_norm": 0.27782593526187427, + "learning_rate": 5.318107295360424e-06, + "loss": 0.2334, + "step": 1533 + }, + { + "epoch": 3.925783749200256, + "grad_norm": 0.29231670514977803, + "learning_rate": 5.293819838803429e-06, + "loss": 0.2198, + "step": 1534 + }, + { + "epoch": 3.928342930262316, + "grad_norm": 0.30148858674026124, + "learning_rate": 5.269579506585259e-06, + "loss": 0.2291, + "step": 1535 + }, + { + "epoch": 3.930902111324376, + "grad_norm": 0.2942793998750714, + "learning_rate": 5.245386376381398e-06, + "loss": 0.2235, + "step": 1536 + }, + { + "epoch": 3.9334612923864363, + "grad_norm": 0.30230762629355007, + "learning_rate": 5.221240525716071e-06, + "loss": 0.2182, + "step": 1537 + }, + { + "epoch": 3.9360204734484965, + "grad_norm": 0.2838869405763987, + "learning_rate": 5.197142031961999e-06, + "loss": 0.2531, + "step": 1538 + }, + { + "epoch": 3.9385796545105567, + "grad_norm": 0.284590980219594, + "learning_rate": 5.17309097234016e-06, + "loss": 0.2235, + "step": 1539 + }, + { + "epoch": 3.941138835572617, + "grad_norm": 0.2805331306681004, + "learning_rate": 5.149087423919541e-06, + "loss": 0.1941, + "step": 1540 + }, + { + "epoch": 3.943698016634677, + "grad_norm": 0.3227111819969921, + "learning_rate": 5.125131463616863e-06, + "loss": 0.2598, + "step": 1541 + }, + { + "epoch": 3.946257197696737, + "grad_norm": 0.3095815791809101, + "learning_rate": 5.101223168196381e-06, + "loss": 0.26, + "step": 1542 + }, + { + "epoch": 3.948816378758797, + "grad_norm": 0.28306994750627523, + "learning_rate": 5.077362614269599e-06, + "loss": 0.2214, + "step": 1543 + }, + { + "epoch": 3.9513755598208573, + "grad_norm": 0.30833267171559586, + "learning_rate": 5.05354987829503e-06, + "loss": 0.2473, + "step": 1544 + }, + { + "epoch": 3.9539347408829175, + "grad_norm": 0.29154031796148494, + "learning_rate": 5.029785036577976e-06, + "loss": 0.231, + "step": 1545 + }, + { + "epoch": 3.9564939219449777, + "grad_norm": 0.3230057370683127, + "learning_rate": 5.0060681652702745e-06, + "loss": 0.2538, + "step": 1546 + }, + { + "epoch": 3.9590531030070375, + "grad_norm": 0.28553506909372656, + "learning_rate": 4.982399340370017e-06, + "loss": 0.231, + "step": 1547 + }, + { + "epoch": 3.9616122840690977, + "grad_norm": 0.30170947130683173, + "learning_rate": 4.958778637721364e-06, + "loss": 0.2454, + "step": 1548 + }, + { + "epoch": 3.964171465131158, + "grad_norm": 0.28267032534811537, + "learning_rate": 4.935206133014259e-06, + "loss": 0.2417, + "step": 1549 + }, + { + "epoch": 3.966730646193218, + "grad_norm": 0.2954487998931121, + "learning_rate": 4.911681901784198e-06, + "loss": 0.2319, + "step": 1550 + }, + { + "epoch": 3.9692898272552783, + "grad_norm": 0.32228832699706966, + "learning_rate": 4.8882060194119985e-06, + "loss": 0.2282, + "step": 1551 + }, + { + "epoch": 3.9718490083173386, + "grad_norm": 0.34818873755980667, + "learning_rate": 4.864778561123555e-06, + "loss": 0.2718, + "step": 1552 + }, + { + "epoch": 3.9744081893793988, + "grad_norm": 0.27407731345565545, + "learning_rate": 4.841399601989574e-06, + "loss": 0.2039, + "step": 1553 + }, + { + "epoch": 3.976967370441459, + "grad_norm": 0.2749332266585077, + "learning_rate": 4.8180692169253714e-06, + "loss": 0.2181, + "step": 1554 + }, + { + "epoch": 3.9795265515035187, + "grad_norm": 0.3010112395338718, + "learning_rate": 4.794787480690597e-06, + "loss": 0.2232, + "step": 1555 + }, + { + "epoch": 3.982085732565579, + "grad_norm": 0.29171650447929676, + "learning_rate": 4.771554467889012e-06, + "loss": 0.2391, + "step": 1556 + }, + { + "epoch": 3.984644913627639, + "grad_norm": 0.2906873549139878, + "learning_rate": 4.74837025296826e-06, + "loss": 0.2297, + "step": 1557 + }, + { + "epoch": 3.9872040946896994, + "grad_norm": 0.3106548346493187, + "learning_rate": 4.725234910219609e-06, + "loss": 0.2564, + "step": 1558 + }, + { + "epoch": 3.9897632757517596, + "grad_norm": 0.29178940946369886, + "learning_rate": 4.702148513777731e-06, + "loss": 0.2457, + "step": 1559 + }, + { + "epoch": 3.9923224568138194, + "grad_norm": 0.290141222179049, + "learning_rate": 4.679111137620442e-06, + "loss": 0.2007, + "step": 1560 + }, + { + "epoch": 3.9948816378758796, + "grad_norm": 0.3078661741665255, + "learning_rate": 4.656122855568477e-06, + "loss": 0.2416, + "step": 1561 + }, + { + "epoch": 3.99744081893794, + "grad_norm": 0.28827757162372486, + "learning_rate": 4.63318374128527e-06, + "loss": 0.2416, + "step": 1562 + }, + { + "epoch": 4.0, + "grad_norm": 0.36059866962884213, + "learning_rate": 4.610293868276681e-06, + "loss": 0.286, + "step": 1563 + }, + { + "epoch": 4.00255918106206, + "grad_norm": 0.4806949756871284, + "learning_rate": 4.587453309890804e-06, + "loss": 0.1829, + "step": 1564 + }, + { + "epoch": 4.00511836212412, + "grad_norm": 0.4071198782663298, + "learning_rate": 4.5646621393177e-06, + "loss": 0.2002, + "step": 1565 + }, + { + "epoch": 4.007677543186181, + "grad_norm": 0.29904364596653477, + "learning_rate": 4.541920429589168e-06, + "loss": 0.1689, + "step": 1566 + }, + { + "epoch": 4.010236724248241, + "grad_norm": 0.319382339625052, + "learning_rate": 4.519228253578514e-06, + "loss": 0.162, + "step": 1567 + }, + { + "epoch": 4.012795905310301, + "grad_norm": 0.44482796159027516, + "learning_rate": 4.496585684000332e-06, + "loss": 0.1905, + "step": 1568 + }, + { + "epoch": 4.015355086372361, + "grad_norm": 0.4881520882605733, + "learning_rate": 4.47399279341024e-06, + "loss": 0.1883, + "step": 1569 + }, + { + "epoch": 4.017914267434421, + "grad_norm": 0.39543877818036155, + "learning_rate": 4.451449654204685e-06, + "loss": 0.1792, + "step": 1570 + }, + { + "epoch": 4.020473448496481, + "grad_norm": 0.33428133649360503, + "learning_rate": 4.428956338620671e-06, + "loss": 0.1549, + "step": 1571 + }, + { + "epoch": 4.023032629558541, + "grad_norm": 0.39094054957770324, + "learning_rate": 4.406512918735555e-06, + "loss": 0.168, + "step": 1572 + }, + { + "epoch": 4.025591810620601, + "grad_norm": 0.41624590579275506, + "learning_rate": 4.384119466466816e-06, + "loss": 0.1546, + "step": 1573 + }, + { + "epoch": 4.028150991682661, + "grad_norm": 0.41050595250529065, + "learning_rate": 4.361776053571816e-06, + "loss": 0.1553, + "step": 1574 + }, + { + "epoch": 4.030710172744722, + "grad_norm": 0.338525098415926, + "learning_rate": 4.339482751647557e-06, + "loss": 0.1672, + "step": 1575 + }, + { + "epoch": 4.033269353806782, + "grad_norm": 0.3227337964991407, + "learning_rate": 4.317239632130485e-06, + "loss": 0.1694, + "step": 1576 + }, + { + "epoch": 4.035828534868842, + "grad_norm": 0.3126102999935797, + "learning_rate": 4.295046766296224e-06, + "loss": 0.1652, + "step": 1577 + }, + { + "epoch": 4.038387715930902, + "grad_norm": 0.3419323105009614, + "learning_rate": 4.272904225259387e-06, + "loss": 0.1643, + "step": 1578 + }, + { + "epoch": 4.0409468969929625, + "grad_norm": 0.35539152039667277, + "learning_rate": 4.250812079973301e-06, + "loss": 0.1693, + "step": 1579 + }, + { + "epoch": 4.043506078055023, + "grad_norm": 0.34150875416867105, + "learning_rate": 4.228770401229824e-06, + "loss": 0.1676, + "step": 1580 + }, + { + "epoch": 4.046065259117083, + "grad_norm": 0.31333649874909303, + "learning_rate": 4.206779259659102e-06, + "loss": 0.1837, + "step": 1581 + }, + { + "epoch": 4.048624440179142, + "grad_norm": 0.29497085250511934, + "learning_rate": 4.184838725729326e-06, + "loss": 0.1606, + "step": 1582 + }, + { + "epoch": 4.0511836212412025, + "grad_norm": 0.28865884769293576, + "learning_rate": 4.1629488697465195e-06, + "loss": 0.1701, + "step": 1583 + }, + { + "epoch": 4.053742802303263, + "grad_norm": 0.30617690195804087, + "learning_rate": 4.141109761854332e-06, + "loss": 0.1586, + "step": 1584 + }, + { + "epoch": 4.056301983365323, + "grad_norm": 0.32345308536745493, + "learning_rate": 4.119321472033779e-06, + "loss": 0.1787, + "step": 1585 + }, + { + "epoch": 4.058861164427383, + "grad_norm": 0.29444459546640134, + "learning_rate": 4.097584070103042e-06, + "loss": 0.153, + "step": 1586 + }, + { + "epoch": 4.061420345489443, + "grad_norm": 0.28258351174881763, + "learning_rate": 4.075897625717249e-06, + "loss": 0.1593, + "step": 1587 + }, + { + "epoch": 4.0639795265515035, + "grad_norm": 0.3000401537763792, + "learning_rate": 4.054262208368216e-06, + "loss": 0.1805, + "step": 1588 + }, + { + "epoch": 4.066538707613564, + "grad_norm": 0.31652390372720957, + "learning_rate": 4.032677887384262e-06, + "loss": 0.1702, + "step": 1589 + }, + { + "epoch": 4.069097888675624, + "grad_norm": 0.3228892663985873, + "learning_rate": 4.011144731929981e-06, + "loss": 0.1913, + "step": 1590 + }, + { + "epoch": 4.071657069737684, + "grad_norm": 0.31373728199242906, + "learning_rate": 3.989662811005992e-06, + "loss": 0.1727, + "step": 1591 + }, + { + "epoch": 4.074216250799744, + "grad_norm": 0.30059854075379616, + "learning_rate": 3.96823219344876e-06, + "loss": 0.2085, + "step": 1592 + }, + { + "epoch": 4.076775431861805, + "grad_norm": 0.29253073860156936, + "learning_rate": 3.9468529479303445e-06, + "loss": 0.1746, + "step": 1593 + }, + { + "epoch": 4.079334612923865, + "grad_norm": 0.2925598790253486, + "learning_rate": 3.925525142958189e-06, + "loss": 0.1949, + "step": 1594 + }, + { + "epoch": 4.081893793985924, + "grad_norm": 0.29501284249932824, + "learning_rate": 3.904248846874894e-06, + "loss": 0.1665, + "step": 1595 + }, + { + "epoch": 4.084452975047984, + "grad_norm": 0.3128356519747454, + "learning_rate": 3.883024127858017e-06, + "loss": 0.1725, + "step": 1596 + }, + { + "epoch": 4.0870121561100445, + "grad_norm": 0.2888374275584094, + "learning_rate": 3.861851053919847e-06, + "loss": 0.1873, + "step": 1597 + }, + { + "epoch": 4.089571337172105, + "grad_norm": 0.28170591176204707, + "learning_rate": 3.840729692907164e-06, + "loss": 0.1789, + "step": 1598 + }, + { + "epoch": 4.092130518234165, + "grad_norm": 0.2851212955027544, + "learning_rate": 3.819660112501053e-06, + "loss": 0.1587, + "step": 1599 + }, + { + "epoch": 4.094689699296225, + "grad_norm": 0.2794461018913548, + "learning_rate": 3.7986423802166705e-06, + "loss": 0.1564, + "step": 1600 + }, + { + "epoch": 4.097248880358285, + "grad_norm": 0.29566827038245036, + "learning_rate": 3.7776765634030234e-06, + "loss": 0.1636, + "step": 1601 + }, + { + "epoch": 4.099808061420346, + "grad_norm": 0.2933200780850988, + "learning_rate": 3.756762729242773e-06, + "loss": 0.1991, + "step": 1602 + }, + { + "epoch": 4.102367242482406, + "grad_norm": 0.2761865132545396, + "learning_rate": 3.7359009447520112e-06, + "loss": 0.165, + "step": 1603 + }, + { + "epoch": 4.104926423544466, + "grad_norm": 0.2661987317855668, + "learning_rate": 3.715091276780023e-06, + "loss": 0.1897, + "step": 1604 + }, + { + "epoch": 4.107485604606526, + "grad_norm": 0.2898297360177148, + "learning_rate": 3.694333792009115e-06, + "loss": 0.1967, + "step": 1605 + }, + { + "epoch": 4.110044785668586, + "grad_norm": 0.30088937431780177, + "learning_rate": 3.6736285569543585e-06, + "loss": 0.1705, + "step": 1606 + }, + { + "epoch": 4.112603966730646, + "grad_norm": 0.3076179252763136, + "learning_rate": 3.652975637963401e-06, + "loss": 0.1865, + "step": 1607 + }, + { + "epoch": 4.115163147792706, + "grad_norm": 0.2666186074605756, + "learning_rate": 3.632375101216259e-06, + "loss": 0.1804, + "step": 1608 + }, + { + "epoch": 4.117722328854766, + "grad_norm": 0.264680141837875, + "learning_rate": 3.6118270127250954e-06, + "loss": 0.139, + "step": 1609 + }, + { + "epoch": 4.120281509916826, + "grad_norm": 0.26713666643272443, + "learning_rate": 3.5913314383339937e-06, + "loss": 0.1533, + "step": 1610 + }, + { + "epoch": 4.122840690978887, + "grad_norm": 0.28782921605361467, + "learning_rate": 3.5708884437187673e-06, + "loss": 0.1614, + "step": 1611 + }, + { + "epoch": 4.125399872040947, + "grad_norm": 0.27318365156538243, + "learning_rate": 3.5504980943867538e-06, + "loss": 0.1868, + "step": 1612 + }, + { + "epoch": 4.127959053103007, + "grad_norm": 0.27437741262556425, + "learning_rate": 3.53016045567659e-06, + "loss": 0.1634, + "step": 1613 + }, + { + "epoch": 4.130518234165067, + "grad_norm": 0.27443522507309115, + "learning_rate": 3.509875592757999e-06, + "loss": 0.2041, + "step": 1614 + }, + { + "epoch": 4.1330774152271275, + "grad_norm": 0.2733625632699625, + "learning_rate": 3.4896435706316e-06, + "loss": 0.1676, + "step": 1615 + }, + { + "epoch": 4.135636596289188, + "grad_norm": 0.28425889785798236, + "learning_rate": 3.469464454128684e-06, + "loss": 0.1714, + "step": 1616 + }, + { + "epoch": 4.138195777351248, + "grad_norm": 0.27651054747700926, + "learning_rate": 3.4493383079110054e-06, + "loss": 0.2032, + "step": 1617 + }, + { + "epoch": 4.140754958413308, + "grad_norm": 0.27095766306492064, + "learning_rate": 3.429265196470599e-06, + "loss": 0.1654, + "step": 1618 + }, + { + "epoch": 4.143314139475368, + "grad_norm": 0.27829913330629624, + "learning_rate": 3.409245184129546e-06, + "loss": 0.1753, + "step": 1619 + }, + { + "epoch": 4.145873320537428, + "grad_norm": 0.26642028532837686, + "learning_rate": 3.3892783350397675e-06, + "loss": 0.1605, + "step": 1620 + }, + { + "epoch": 4.148432501599488, + "grad_norm": 0.2671909078689866, + "learning_rate": 3.369364713182848e-06, + "loss": 0.1546, + "step": 1621 + }, + { + "epoch": 4.150991682661548, + "grad_norm": 0.2744629320037609, + "learning_rate": 3.349504382369795e-06, + "loss": 0.1606, + "step": 1622 + }, + { + "epoch": 4.153550863723608, + "grad_norm": 0.2655886430602391, + "learning_rate": 3.329697406240855e-06, + "loss": 0.1802, + "step": 1623 + }, + { + "epoch": 4.1561100447856685, + "grad_norm": 0.2735670466849154, + "learning_rate": 3.309943848265311e-06, + "loss": 0.1685, + "step": 1624 + }, + { + "epoch": 4.158669225847729, + "grad_norm": 0.2733935619603659, + "learning_rate": 3.290243771741275e-06, + "loss": 0.1712, + "step": 1625 + }, + { + "epoch": 4.161228406909789, + "grad_norm": 0.28420268431811874, + "learning_rate": 3.2705972397954655e-06, + "loss": 0.1888, + "step": 1626 + }, + { + "epoch": 4.163787587971849, + "grad_norm": 0.26985232539202264, + "learning_rate": 3.2510043153830486e-06, + "loss": 0.1877, + "step": 1627 + }, + { + "epoch": 4.166346769033909, + "grad_norm": 0.2656409578402704, + "learning_rate": 3.231465061287391e-06, + "loss": 0.1844, + "step": 1628 + }, + { + "epoch": 4.1689059500959695, + "grad_norm": 0.2870535364269227, + "learning_rate": 3.211979540119883e-06, + "loss": 0.1489, + "step": 1629 + }, + { + "epoch": 4.17146513115803, + "grad_norm": 0.2751438862963445, + "learning_rate": 3.1925478143197418e-06, + "loss": 0.1651, + "step": 1630 + }, + { + "epoch": 4.17402431222009, + "grad_norm": 0.28504383266802613, + "learning_rate": 3.1731699461537958e-06, + "loss": 0.1809, + "step": 1631 + }, + { + "epoch": 4.176583493282149, + "grad_norm": 0.28160560698590886, + "learning_rate": 3.153845997716303e-06, + "loss": 0.1608, + "step": 1632 + }, + { + "epoch": 4.1791426743442095, + "grad_norm": 0.2792655962671141, + "learning_rate": 3.1345760309287264e-06, + "loss": 0.1486, + "step": 1633 + }, + { + "epoch": 4.18170185540627, + "grad_norm": 0.2835400810692843, + "learning_rate": 3.1153601075395533e-06, + "loss": 0.1742, + "step": 1634 + }, + { + "epoch": 4.18426103646833, + "grad_norm": 0.27354882443074413, + "learning_rate": 3.0961982891241083e-06, + "loss": 0.1892, + "step": 1635 + }, + { + "epoch": 4.18682021753039, + "grad_norm": 0.2593652179219975, + "learning_rate": 3.0770906370843234e-06, + "loss": 0.176, + "step": 1636 + }, + { + "epoch": 4.18937939859245, + "grad_norm": 0.28067593594699425, + "learning_rate": 3.058037212648579e-06, + "loss": 0.1942, + "step": 1637 + }, + { + "epoch": 4.1919385796545106, + "grad_norm": 0.27960007176651136, + "learning_rate": 3.039038076871481e-06, + "loss": 0.1722, + "step": 1638 + }, + { + "epoch": 4.194497760716571, + "grad_norm": 0.2784512143122092, + "learning_rate": 3.02009329063367e-06, + "loss": 0.1819, + "step": 1639 + }, + { + "epoch": 4.197056941778631, + "grad_norm": 0.291087706558686, + "learning_rate": 3.001202914641628e-06, + "loss": 0.1855, + "step": 1640 + }, + { + "epoch": 4.199616122840691, + "grad_norm": 0.2731191212662854, + "learning_rate": 2.9823670094275e-06, + "loss": 0.1671, + "step": 1641 + }, + { + "epoch": 4.202175303902751, + "grad_norm": 0.27430309267830155, + "learning_rate": 2.9635856353488645e-06, + "loss": 0.1731, + "step": 1642 + }, + { + "epoch": 4.204734484964812, + "grad_norm": 0.26629467870174495, + "learning_rate": 2.9448588525885746e-06, + "loss": 0.1845, + "step": 1643 + }, + { + "epoch": 4.207293666026872, + "grad_norm": 0.2742753809709845, + "learning_rate": 2.9261867211545603e-06, + "loss": 0.1748, + "step": 1644 + }, + { + "epoch": 4.209852847088931, + "grad_norm": 0.28012849721285554, + "learning_rate": 2.907569300879596e-06, + "loss": 0.1994, + "step": 1645 + }, + { + "epoch": 4.212412028150991, + "grad_norm": 0.28016509946069285, + "learning_rate": 2.889006651421169e-06, + "loss": 0.1788, + "step": 1646 + }, + { + "epoch": 4.214971209213052, + "grad_norm": 0.2811917062883201, + "learning_rate": 2.870498832261257e-06, + "loss": 0.1486, + "step": 1647 + }, + { + "epoch": 4.217530390275112, + "grad_norm": 0.277577622075296, + "learning_rate": 2.85204590270612e-06, + "loss": 0.1832, + "step": 1648 + }, + { + "epoch": 4.220089571337172, + "grad_norm": 0.27124773514140205, + "learning_rate": 2.8336479218861556e-06, + "loss": 0.1626, + "step": 1649 + }, + { + "epoch": 4.222648752399232, + "grad_norm": 0.26604650947349917, + "learning_rate": 2.815304948755664e-06, + "loss": 0.1686, + "step": 1650 + }, + { + "epoch": 4.225207933461292, + "grad_norm": 0.273081026747494, + "learning_rate": 2.7970170420926957e-06, + "loss": 0.1713, + "step": 1651 + }, + { + "epoch": 4.227767114523353, + "grad_norm": 0.2906700784230057, + "learning_rate": 2.778784260498828e-06, + "loss": 0.1681, + "step": 1652 + }, + { + "epoch": 4.230326295585413, + "grad_norm": 0.29269194422721684, + "learning_rate": 2.7606066623990145e-06, + "loss": 0.1869, + "step": 1653 + }, + { + "epoch": 4.232885476647473, + "grad_norm": 0.28591911142148163, + "learning_rate": 2.742484306041373e-06, + "loss": 0.174, + "step": 1654 + }, + { + "epoch": 4.235444657709533, + "grad_norm": 0.27682074794775224, + "learning_rate": 2.7244172494969978e-06, + "loss": 0.1855, + "step": 1655 + }, + { + "epoch": 4.2380038387715935, + "grad_norm": 0.302986579035124, + "learning_rate": 2.7064055506597875e-06, + "loss": 0.1641, + "step": 1656 + }, + { + "epoch": 4.240563019833653, + "grad_norm": 0.2799225480334416, + "learning_rate": 2.688449267246258e-06, + "loss": 0.1923, + "step": 1657 + }, + { + "epoch": 4.243122200895713, + "grad_norm": 0.38988709458435794, + "learning_rate": 2.6705484567953386e-06, + "loss": 0.2104, + "step": 1658 + }, + { + "epoch": 4.245681381957773, + "grad_norm": 0.26949955966216177, + "learning_rate": 2.6527031766682142e-06, + "loss": 0.1718, + "step": 1659 + }, + { + "epoch": 4.248240563019833, + "grad_norm": 0.2788156254648664, + "learning_rate": 2.6349134840481294e-06, + "loss": 0.1711, + "step": 1660 + }, + { + "epoch": 4.250799744081894, + "grad_norm": 0.2621763225250691, + "learning_rate": 2.6171794359401957e-06, + "loss": 0.1532, + "step": 1661 + }, + { + "epoch": 4.253358925143954, + "grad_norm": 0.2865555062851034, + "learning_rate": 2.599501089171217e-06, + "loss": 0.1552, + "step": 1662 + }, + { + "epoch": 4.255918106206014, + "grad_norm": 0.26224793954089864, + "learning_rate": 2.581878500389523e-06, + "loss": 0.1755, + "step": 1663 + }, + { + "epoch": 4.258477287268074, + "grad_norm": 0.2721003139035216, + "learning_rate": 2.564311726064754e-06, + "loss": 0.1898, + "step": 1664 + }, + { + "epoch": 4.2610364683301345, + "grad_norm": 0.27269538230220364, + "learning_rate": 2.546800822487714e-06, + "loss": 0.1698, + "step": 1665 + }, + { + "epoch": 4.263595649392195, + "grad_norm": 0.28827484525986047, + "learning_rate": 2.5293458457701726e-06, + "loss": 0.2087, + "step": 1666 + }, + { + "epoch": 4.266154830454255, + "grad_norm": 0.27259819354187614, + "learning_rate": 2.5119468518446844e-06, + "loss": 0.18, + "step": 1667 + }, + { + "epoch": 4.268714011516315, + "grad_norm": 0.28656968871426663, + "learning_rate": 2.494603896464405e-06, + "loss": 0.1818, + "step": 1668 + }, + { + "epoch": 4.271273192578375, + "grad_norm": 0.28475736704342813, + "learning_rate": 2.47731703520294e-06, + "loss": 0.1888, + "step": 1669 + }, + { + "epoch": 4.273832373640435, + "grad_norm": 0.2816641273674954, + "learning_rate": 2.4600863234541338e-06, + "loss": 0.186, + "step": 1670 + }, + { + "epoch": 4.276391554702495, + "grad_norm": 0.2836263153168214, + "learning_rate": 2.4429118164319076e-06, + "loss": 0.1554, + "step": 1671 + }, + { + "epoch": 4.278950735764555, + "grad_norm": 0.2840636606374706, + "learning_rate": 2.4257935691700897e-06, + "loss": 0.2089, + "step": 1672 + }, + { + "epoch": 4.281509916826615, + "grad_norm": 0.2745376280727821, + "learning_rate": 2.408731636522217e-06, + "loss": 0.1579, + "step": 1673 + }, + { + "epoch": 4.2840690978886755, + "grad_norm": 0.2605630953509163, + "learning_rate": 2.3917260731613733e-06, + "loss": 0.1903, + "step": 1674 + }, + { + "epoch": 4.286628278950736, + "grad_norm": 0.2740285831465967, + "learning_rate": 2.374776933580025e-06, + "loss": 0.1725, + "step": 1675 + }, + { + "epoch": 4.289187460012796, + "grad_norm": 0.28095336757761535, + "learning_rate": 2.35788427208983e-06, + "loss": 0.1867, + "step": 1676 + }, + { + "epoch": 4.291746641074856, + "grad_norm": 0.27154745065005187, + "learning_rate": 2.3410481428214602e-06, + "loss": 0.1613, + "step": 1677 + }, + { + "epoch": 4.294305822136916, + "grad_norm": 0.27345791664879376, + "learning_rate": 2.324268599724451e-06, + "loss": 0.1667, + "step": 1678 + }, + { + "epoch": 4.296865003198977, + "grad_norm": 0.2732090679685769, + "learning_rate": 2.307545696566997e-06, + "loss": 0.1657, + "step": 1679 + }, + { + "epoch": 4.299424184261037, + "grad_norm": 0.2697521270279199, + "learning_rate": 2.2908794869358044e-06, + "loss": 0.1897, + "step": 1680 + }, + { + "epoch": 4.301983365323097, + "grad_norm": 0.27114579546466616, + "learning_rate": 2.274270024235912e-06, + "loss": 0.188, + "step": 1681 + }, + { + "epoch": 4.304542546385157, + "grad_norm": 0.2729061433296195, + "learning_rate": 2.2577173616905256e-06, + "loss": 0.1595, + "step": 1682 + }, + { + "epoch": 4.3071017274472165, + "grad_norm": 0.2906131303633033, + "learning_rate": 2.2412215523408266e-06, + "loss": 0.1737, + "step": 1683 + }, + { + "epoch": 4.309660908509277, + "grad_norm": 0.2768622060092854, + "learning_rate": 2.2247826490458223e-06, + "loss": 0.1796, + "step": 1684 + }, + { + "epoch": 4.312220089571337, + "grad_norm": 0.27867740126032275, + "learning_rate": 2.2084007044821764e-06, + "loss": 0.1565, + "step": 1685 + }, + { + "epoch": 4.314779270633397, + "grad_norm": 0.2723637032411945, + "learning_rate": 2.1920757711440354e-06, + "loss": 0.1756, + "step": 1686 + }, + { + "epoch": 4.317338451695457, + "grad_norm": 0.2676292277176362, + "learning_rate": 2.1758079013428435e-06, + "loss": 0.1683, + "step": 1687 + }, + { + "epoch": 4.319897632757518, + "grad_norm": 0.29143070017187106, + "learning_rate": 2.159597147207213e-06, + "loss": 0.1697, + "step": 1688 + }, + { + "epoch": 4.322456813819578, + "grad_norm": 0.27260777733690406, + "learning_rate": 2.143443560682721e-06, + "loss": 0.1788, + "step": 1689 + }, + { + "epoch": 4.325015994881638, + "grad_norm": 0.27238765087958206, + "learning_rate": 2.127347193531757e-06, + "loss": 0.1704, + "step": 1690 + }, + { + "epoch": 4.327575175943698, + "grad_norm": 0.3004112415700906, + "learning_rate": 2.1113080973333643e-06, + "loss": 0.1684, + "step": 1691 + }, + { + "epoch": 4.330134357005758, + "grad_norm": 0.27426577172310523, + "learning_rate": 2.0953263234830667e-06, + "loss": 0.1541, + "step": 1692 + }, + { + "epoch": 4.332693538067819, + "grad_norm": 0.27454753389749326, + "learning_rate": 2.0794019231926986e-06, + "loss": 0.1861, + "step": 1693 + }, + { + "epoch": 4.335252719129878, + "grad_norm": 0.2809263966115645, + "learning_rate": 2.0635349474902598e-06, + "loss": 0.1785, + "step": 1694 + }, + { + "epoch": 4.337811900191938, + "grad_norm": 0.2690301934416836, + "learning_rate": 2.0477254472197237e-06, + "loss": 0.1896, + "step": 1695 + }, + { + "epoch": 4.340371081253998, + "grad_norm": 0.2682604924926407, + "learning_rate": 2.0319734730408935e-06, + "loss": 0.1775, + "step": 1696 + }, + { + "epoch": 4.342930262316059, + "grad_norm": 0.2696807174487812, + "learning_rate": 2.016279075429246e-06, + "loss": 0.1903, + "step": 1697 + }, + { + "epoch": 4.345489443378119, + "grad_norm": 0.2665228127768651, + "learning_rate": 2.0006423046757596e-06, + "loss": 0.1754, + "step": 1698 + }, + { + "epoch": 4.348048624440179, + "grad_norm": 0.2694146119632947, + "learning_rate": 1.985063210886735e-06, + "loss": 0.1549, + "step": 1699 + }, + { + "epoch": 4.350607805502239, + "grad_norm": 0.2822224216458224, + "learning_rate": 1.96954184398368e-06, + "loss": 0.1362, + "step": 1700 + }, + { + "epoch": 4.3531669865642995, + "grad_norm": 0.2615035547353888, + "learning_rate": 1.9540782537031045e-06, + "loss": 0.1586, + "step": 1701 + }, + { + "epoch": 4.35572616762636, + "grad_norm": 0.269471616538485, + "learning_rate": 1.9386724895963805e-06, + "loss": 0.1612, + "step": 1702 + }, + { + "epoch": 4.35828534868842, + "grad_norm": 0.2682599270036803, + "learning_rate": 1.9233246010295903e-06, + "loss": 0.1822, + "step": 1703 + }, + { + "epoch": 4.36084452975048, + "grad_norm": 0.2632188336157985, + "learning_rate": 1.908034637183356e-06, + "loss": 0.1815, + "step": 1704 + }, + { + "epoch": 4.36340371081254, + "grad_norm": 0.2615148499789861, + "learning_rate": 1.8928026470526917e-06, + "loss": 0.1545, + "step": 1705 + }, + { + "epoch": 4.3659628918746005, + "grad_norm": 0.274824297046551, + "learning_rate": 1.8776286794468346e-06, + "loss": 0.1476, + "step": 1706 + }, + { + "epoch": 4.36852207293666, + "grad_norm": 0.27527733450113034, + "learning_rate": 1.8625127829890922e-06, + "loss": 0.2037, + "step": 1707 + }, + { + "epoch": 4.37108125399872, + "grad_norm": 0.27423935602322536, + "learning_rate": 1.8474550061166984e-06, + "loss": 0.1719, + "step": 1708 + }, + { + "epoch": 4.37364043506078, + "grad_norm": 0.27364608091349185, + "learning_rate": 1.8324553970806436e-06, + "loss": 0.1664, + "step": 1709 + }, + { + "epoch": 4.3761996161228405, + "grad_norm": 0.2796166648988934, + "learning_rate": 1.817514003945524e-06, + "loss": 0.1953, + "step": 1710 + }, + { + "epoch": 4.378758797184901, + "grad_norm": 0.26079246660216127, + "learning_rate": 1.802630874589404e-06, + "loss": 0.1641, + "step": 1711 + }, + { + "epoch": 4.381317978246961, + "grad_norm": 0.2999617994892891, + "learning_rate": 1.787806056703627e-06, + "loss": 0.1718, + "step": 1712 + }, + { + "epoch": 4.383877159309021, + "grad_norm": 0.2889063824640918, + "learning_rate": 1.7730395977926917e-06, + "loss": 0.1653, + "step": 1713 + }, + { + "epoch": 4.386436340371081, + "grad_norm": 0.26748602837639757, + "learning_rate": 1.758331545174099e-06, + "loss": 0.1842, + "step": 1714 + }, + { + "epoch": 4.3889955214331415, + "grad_norm": 0.2806929556508416, + "learning_rate": 1.743681945978184e-06, + "loss": 0.1586, + "step": 1715 + }, + { + "epoch": 4.391554702495202, + "grad_norm": 0.29287158041254, + "learning_rate": 1.7290908471479805e-06, + "loss": 0.1761, + "step": 1716 + }, + { + "epoch": 4.394113883557262, + "grad_norm": 0.28949556938232984, + "learning_rate": 1.7145582954390638e-06, + "loss": 0.1831, + "step": 1717 + }, + { + "epoch": 4.396673064619322, + "grad_norm": 0.2707679886069612, + "learning_rate": 1.7000843374193987e-06, + "loss": 0.1796, + "step": 1718 + }, + { + "epoch": 4.399232245681382, + "grad_norm": 0.26434041057826485, + "learning_rate": 1.6856690194691872e-06, + "loss": 0.1812, + "step": 1719 + }, + { + "epoch": 4.401791426743442, + "grad_norm": 0.2717307275192052, + "learning_rate": 1.6713123877807413e-06, + "loss": 0.1618, + "step": 1720 + }, + { + "epoch": 4.404350607805502, + "grad_norm": 0.26427198325914797, + "learning_rate": 1.6570144883582994e-06, + "loss": 0.1485, + "step": 1721 + }, + { + "epoch": 4.406909788867562, + "grad_norm": 0.2830739919541902, + "learning_rate": 1.6427753670179214e-06, + "loss": 0.1628, + "step": 1722 + }, + { + "epoch": 4.409468969929622, + "grad_norm": 0.2667553896951004, + "learning_rate": 1.6285950693872999e-06, + "loss": 0.1887, + "step": 1723 + }, + { + "epoch": 4.4120281509916826, + "grad_norm": 0.2681400499833261, + "learning_rate": 1.614473640905645e-06, + "loss": 0.1629, + "step": 1724 + }, + { + "epoch": 4.414587332053743, + "grad_norm": 0.2629743389937269, + "learning_rate": 1.6004111268235156e-06, + "loss": 0.2008, + "step": 1725 + }, + { + "epoch": 4.417146513115803, + "grad_norm": 0.285465240532548, + "learning_rate": 1.5864075722027017e-06, + "loss": 0.191, + "step": 1726 + }, + { + "epoch": 4.419705694177863, + "grad_norm": 0.2683638382247192, + "learning_rate": 1.5724630219160553e-06, + "loss": 0.2073, + "step": 1727 + }, + { + "epoch": 4.422264875239923, + "grad_norm": 0.2829211001794818, + "learning_rate": 1.5585775206473508e-06, + "loss": 0.1568, + "step": 1728 + }, + { + "epoch": 4.424824056301984, + "grad_norm": 0.2788212658151338, + "learning_rate": 1.5447511128911542e-06, + "loss": 0.1728, + "step": 1729 + }, + { + "epoch": 4.427383237364044, + "grad_norm": 0.2848579162361746, + "learning_rate": 1.5309838429526714e-06, + "loss": 0.1904, + "step": 1730 + }, + { + "epoch": 4.429942418426104, + "grad_norm": 0.2654662056800488, + "learning_rate": 1.5172757549476024e-06, + "loss": 0.166, + "step": 1731 + }, + { + "epoch": 4.432501599488164, + "grad_norm": 0.285577343916777, + "learning_rate": 1.5036268928020125e-06, + "loss": 0.195, + "step": 1732 + }, + { + "epoch": 4.435060780550224, + "grad_norm": 0.25545559192222317, + "learning_rate": 1.4900373002521851e-06, + "loss": 0.1706, + "step": 1733 + }, + { + "epoch": 4.437619961612284, + "grad_norm": 0.279484081091582, + "learning_rate": 1.4765070208444732e-06, + "loss": 0.1909, + "step": 1734 + }, + { + "epoch": 4.440179142674344, + "grad_norm": 0.27394133244756325, + "learning_rate": 1.4630360979351644e-06, + "loss": 0.1955, + "step": 1735 + }, + { + "epoch": 4.442738323736404, + "grad_norm": 0.27730795832891525, + "learning_rate": 1.4496245746903626e-06, + "loss": 0.1668, + "step": 1736 + }, + { + "epoch": 4.445297504798464, + "grad_norm": 0.2515943739407271, + "learning_rate": 1.4362724940858109e-06, + "loss": 0.173, + "step": 1737 + }, + { + "epoch": 4.447856685860525, + "grad_norm": 0.25367847682125305, + "learning_rate": 1.422979898906789e-06, + "loss": 0.1639, + "step": 1738 + }, + { + "epoch": 4.450415866922585, + "grad_norm": 0.27558480853746065, + "learning_rate": 1.4097468317479623e-06, + "loss": 0.1633, + "step": 1739 + }, + { + "epoch": 4.452975047984645, + "grad_norm": 0.27695396643812065, + "learning_rate": 1.396573335013236e-06, + "loss": 0.1808, + "step": 1740 + }, + { + "epoch": 4.455534229046705, + "grad_norm": 0.2804193028236503, + "learning_rate": 1.3834594509156319e-06, + "loss": 0.1673, + "step": 1741 + }, + { + "epoch": 4.4580934101087655, + "grad_norm": 0.2782333366929398, + "learning_rate": 1.3704052214771513e-06, + "loss": 0.1971, + "step": 1742 + }, + { + "epoch": 4.460652591170826, + "grad_norm": 0.2777400443098731, + "learning_rate": 1.3574106885286465e-06, + "loss": 0.1737, + "step": 1743 + }, + { + "epoch": 4.463211772232885, + "grad_norm": 0.2764493972670724, + "learning_rate": 1.344475893709658e-06, + "loss": 0.1904, + "step": 1744 + }, + { + "epoch": 4.465770953294945, + "grad_norm": 0.2752241285220294, + "learning_rate": 1.3316008784683265e-06, + "loss": 0.1613, + "step": 1745 + }, + { + "epoch": 4.468330134357005, + "grad_norm": 0.2693372116468191, + "learning_rate": 1.3187856840612167e-06, + "loss": 0.1627, + "step": 1746 + }, + { + "epoch": 4.470889315419066, + "grad_norm": 0.26299523604064184, + "learning_rate": 1.3060303515532135e-06, + "loss": 0.1644, + "step": 1747 + }, + { + "epoch": 4.473448496481126, + "grad_norm": 0.26861009102213246, + "learning_rate": 1.2933349218173774e-06, + "loss": 0.1748, + "step": 1748 + }, + { + "epoch": 4.476007677543186, + "grad_norm": 0.266256378668002, + "learning_rate": 1.2806994355348224e-06, + "loss": 0.1717, + "step": 1749 + }, + { + "epoch": 4.478566858605246, + "grad_norm": 0.2783524667972571, + "learning_rate": 1.2681239331945695e-06, + "loss": 0.1739, + "step": 1750 + }, + { + "epoch": 4.4811260396673065, + "grad_norm": 0.2807754665043445, + "learning_rate": 1.2556084550934423e-06, + "loss": 0.163, + "step": 1751 + }, + { + "epoch": 4.483685220729367, + "grad_norm": 0.2751132941559695, + "learning_rate": 1.2431530413359138e-06, + "loss": 0.1596, + "step": 1752 + }, + { + "epoch": 4.486244401791427, + "grad_norm": 0.27965370827809377, + "learning_rate": 1.2307577318339825e-06, + "loss": 0.1764, + "step": 1753 + }, + { + "epoch": 4.488803582853487, + "grad_norm": 0.27090435786248723, + "learning_rate": 1.2184225663070604e-06, + "loss": 0.1904, + "step": 1754 + }, + { + "epoch": 4.491362763915547, + "grad_norm": 0.2830802635501525, + "learning_rate": 1.2061475842818337e-06, + "loss": 0.1785, + "step": 1755 + }, + { + "epoch": 4.4939219449776076, + "grad_norm": 0.2624221347168147, + "learning_rate": 1.1939328250921278e-06, + "loss": 0.1804, + "step": 1756 + }, + { + "epoch": 4.496481126039667, + "grad_norm": 0.27670735162368, + "learning_rate": 1.1817783278788042e-06, + "loss": 0.1534, + "step": 1757 + }, + { + "epoch": 4.499040307101727, + "grad_norm": 0.2755467325350106, + "learning_rate": 1.169684131589608e-06, + "loss": 0.1791, + "step": 1758 + }, + { + "epoch": 4.501599488163787, + "grad_norm": 0.2750192315338786, + "learning_rate": 1.1576502749790608e-06, + "loss": 0.1721, + "step": 1759 + }, + { + "epoch": 4.5041586692258475, + "grad_norm": 0.26825366429953873, + "learning_rate": 1.1456767966083393e-06, + "loss": 0.1739, + "step": 1760 + }, + { + "epoch": 4.506717850287908, + "grad_norm": 0.26461882189193386, + "learning_rate": 1.1337637348451369e-06, + "loss": 0.1836, + "step": 1761 + }, + { + "epoch": 4.509277031349968, + "grad_norm": 0.27170229996613754, + "learning_rate": 1.1219111278635575e-06, + "loss": 0.1746, + "step": 1762 + }, + { + "epoch": 4.511836212412028, + "grad_norm": 0.28612289439672206, + "learning_rate": 1.1101190136439689e-06, + "loss": 0.1664, + "step": 1763 + }, + { + "epoch": 4.514395393474088, + "grad_norm": 0.2814719237385938, + "learning_rate": 1.0983874299729092e-06, + "loss": 0.1552, + "step": 1764 + }, + { + "epoch": 4.516954574536149, + "grad_norm": 0.27224408532725913, + "learning_rate": 1.086716414442952e-06, + "loss": 0.155, + "step": 1765 + }, + { + "epoch": 4.519513755598209, + "grad_norm": 0.2683837888920839, + "learning_rate": 1.0751060044525797e-06, + "loss": 0.1947, + "step": 1766 + }, + { + "epoch": 4.522072936660269, + "grad_norm": 0.266405093166955, + "learning_rate": 1.0635562372060825e-06, + "loss": 0.179, + "step": 1767 + }, + { + "epoch": 4.524632117722329, + "grad_norm": 0.26568191781978007, + "learning_rate": 1.052067149713416e-06, + "loss": 0.1595, + "step": 1768 + }, + { + "epoch": 4.527191298784389, + "grad_norm": 0.27613787388854283, + "learning_rate": 1.0406387787900974e-06, + "loss": 0.2022, + "step": 1769 + }, + { + "epoch": 4.529750479846449, + "grad_norm": 0.2783446591602134, + "learning_rate": 1.0292711610570904e-06, + "loss": 0.1965, + "step": 1770 + }, + { + "epoch": 4.532309660908509, + "grad_norm": 0.2754628182404677, + "learning_rate": 1.0179643329406752e-06, + "loss": 0.1796, + "step": 1771 + }, + { + "epoch": 4.534868841970569, + "grad_norm": 0.2717991423747503, + "learning_rate": 1.0067183306723384e-06, + "loss": 0.1872, + "step": 1772 + }, + { + "epoch": 4.537428023032629, + "grad_norm": 0.26023938540588254, + "learning_rate": 9.955331902886645e-07, + "loss": 0.1645, + "step": 1773 + }, + { + "epoch": 4.53998720409469, + "grad_norm": 0.2697243580148783, + "learning_rate": 9.844089476312035e-07, + "loss": 0.1736, + "step": 1774 + }, + { + "epoch": 4.54254638515675, + "grad_norm": 0.27089652411524956, + "learning_rate": 9.733456383463658e-07, + "loss": 0.156, + "step": 1775 + }, + { + "epoch": 4.54510556621881, + "grad_norm": 0.2625263168411182, + "learning_rate": 9.62343297885313e-07, + "loss": 0.1709, + "step": 1776 + }, + { + "epoch": 4.54766474728087, + "grad_norm": 0.2720147925441457, + "learning_rate": 9.514019615038395e-07, + "loss": 0.1609, + "step": 1777 + }, + { + "epoch": 4.55022392834293, + "grad_norm": 0.26862738106885103, + "learning_rate": 9.40521664262255e-07, + "loss": 0.1823, + "step": 1778 + }, + { + "epoch": 4.552783109404991, + "grad_norm": 0.2858477259373205, + "learning_rate": 9.297024410252753e-07, + "loss": 0.1719, + "step": 1779 + }, + { + "epoch": 4.555342290467051, + "grad_norm": 0.2792231695337476, + "learning_rate": 9.189443264619102e-07, + "loss": 0.2187, + "step": 1780 + }, + { + "epoch": 4.557901471529111, + "grad_norm": 0.2722587468079133, + "learning_rate": 9.082473550453619e-07, + "loss": 0.1581, + "step": 1781 + }, + { + "epoch": 4.560460652591171, + "grad_norm": 0.25994947891197123, + "learning_rate": 8.976115610528957e-07, + "loss": 0.1813, + "step": 1782 + }, + { + "epoch": 4.563019833653231, + "grad_norm": 0.2685596280130304, + "learning_rate": 8.870369785657451e-07, + "loss": 0.1637, + "step": 1783 + }, + { + "epoch": 4.565579014715291, + "grad_norm": 0.2624613954232775, + "learning_rate": 8.765236414690026e-07, + "loss": 0.1867, + "step": 1784 + }, + { + "epoch": 4.568138195777351, + "grad_norm": 0.26985980601394455, + "learning_rate": 8.660715834514977e-07, + "loss": 0.1812, + "step": 1785 + }, + { + "epoch": 4.570697376839411, + "grad_norm": 0.2782580432929674, + "learning_rate": 8.556808380057013e-07, + "loss": 0.1551, + "step": 1786 + }, + { + "epoch": 4.5732565579014715, + "grad_norm": 0.3229561788111089, + "learning_rate": 8.453514384276196e-07, + "loss": 0.1665, + "step": 1787 + }, + { + "epoch": 4.575815738963532, + "grad_norm": 0.26676739208882927, + "learning_rate": 8.350834178166755e-07, + "loss": 0.2019, + "step": 1788 + }, + { + "epoch": 4.578374920025592, + "grad_norm": 0.25638061426053027, + "learning_rate": 8.248768090756143e-07, + "loss": 0.1623, + "step": 1789 + }, + { + "epoch": 4.580934101087652, + "grad_norm": 0.2769760353268046, + "learning_rate": 8.147316449103959e-07, + "loss": 0.193, + "step": 1790 + }, + { + "epoch": 4.583493282149712, + "grad_norm": 0.2827217753260577, + "learning_rate": 8.046479578300803e-07, + "loss": 0.1573, + "step": 1791 + }, + { + "epoch": 4.5860524632117725, + "grad_norm": 0.267728272299288, + "learning_rate": 7.946257801467339e-07, + "loss": 0.1534, + "step": 1792 + }, + { + "epoch": 4.588611644273833, + "grad_norm": 0.26899124519431056, + "learning_rate": 7.846651439753273e-07, + "loss": 0.1785, + "step": 1793 + }, + { + "epoch": 4.591170825335892, + "grad_norm": 0.2655562652017706, + "learning_rate": 7.747660812336221e-07, + "loss": 0.1632, + "step": 1794 + }, + { + "epoch": 4.593730006397953, + "grad_norm": 0.2912418780943663, + "learning_rate": 7.649286236420806e-07, + "loss": 0.1664, + "step": 1795 + }, + { + "epoch": 4.5962891874600125, + "grad_norm": 0.2773582855251603, + "learning_rate": 7.551528027237553e-07, + "loss": 0.1649, + "step": 1796 + }, + { + "epoch": 4.598848368522073, + "grad_norm": 0.2706350032862212, + "learning_rate": 7.454386498041865e-07, + "loss": 0.1897, + "step": 1797 + }, + { + "epoch": 4.601407549584133, + "grad_norm": 0.27987597117336843, + "learning_rate": 7.357861960113121e-07, + "loss": 0.1806, + "step": 1798 + }, + { + "epoch": 4.603966730646193, + "grad_norm": 0.2554799929519513, + "learning_rate": 7.261954722753595e-07, + "loss": 0.1454, + "step": 1799 + }, + { + "epoch": 4.606525911708253, + "grad_norm": 0.28194077726489003, + "learning_rate": 7.166665093287539e-07, + "loss": 0.1956, + "step": 1800 + }, + { + "epoch": 4.6090850927703135, + "grad_norm": 0.27206485970301414, + "learning_rate": 7.071993377060038e-07, + "loss": 0.1813, + "step": 1801 + }, + { + "epoch": 4.611644273832374, + "grad_norm": 0.27639368969275124, + "learning_rate": 6.977939877436224e-07, + "loss": 0.1937, + "step": 1802 + }, + { + "epoch": 4.614203454894434, + "grad_norm": 0.26700294636297844, + "learning_rate": 6.884504895800237e-07, + "loss": 0.159, + "step": 1803 + }, + { + "epoch": 4.616762635956494, + "grad_norm": 0.2715005815453172, + "learning_rate": 6.791688731554158e-07, + "loss": 0.1608, + "step": 1804 + }, + { + "epoch": 4.619321817018554, + "grad_norm": 0.27127828240291824, + "learning_rate": 6.69949168211721e-07, + "loss": 0.1857, + "step": 1805 + }, + { + "epoch": 4.621880998080615, + "grad_norm": 0.28402081462443657, + "learning_rate": 6.607914042924756e-07, + "loss": 0.1918, + "step": 1806 + }, + { + "epoch": 4.624440179142674, + "grad_norm": 0.26263908410916775, + "learning_rate": 6.516956107427241e-07, + "loss": 0.1569, + "step": 1807 + }, + { + "epoch": 4.626999360204734, + "grad_norm": 0.27371755225997646, + "learning_rate": 6.426618167089338e-07, + "loss": 0.1557, + "step": 1808 + }, + { + "epoch": 4.629558541266794, + "grad_norm": 0.26959266513847036, + "learning_rate": 6.336900511389133e-07, + "loss": 0.1733, + "step": 1809 + }, + { + "epoch": 4.6321177223288545, + "grad_norm": 0.27453758652223553, + "learning_rate": 6.247803427816945e-07, + "loss": 0.1635, + "step": 1810 + }, + { + "epoch": 4.634676903390915, + "grad_norm": 0.2673151789681698, + "learning_rate": 6.159327201874598e-07, + "loss": 0.1709, + "step": 1811 + }, + { + "epoch": 4.637236084452975, + "grad_norm": 0.2702926085830735, + "learning_rate": 6.071472117074462e-07, + "loss": 0.1815, + "step": 1812 + }, + { + "epoch": 4.639795265515035, + "grad_norm": 0.2788070786022333, + "learning_rate": 5.984238454938496e-07, + "loss": 0.1527, + "step": 1813 + }, + { + "epoch": 4.642354446577095, + "grad_norm": 0.27358568856995236, + "learning_rate": 5.897626494997366e-07, + "loss": 0.1785, + "step": 1814 + }, + { + "epoch": 4.644913627639156, + "grad_norm": 0.2718095549716457, + "learning_rate": 5.811636514789598e-07, + "loss": 0.1853, + "step": 1815 + }, + { + "epoch": 4.647472808701216, + "grad_norm": 0.27759832517042105, + "learning_rate": 5.726268789860645e-07, + "loss": 0.1646, + "step": 1816 + }, + { + "epoch": 4.650031989763276, + "grad_norm": 0.26320625609355736, + "learning_rate": 5.641523593761977e-07, + "loss": 0.1723, + "step": 1817 + }, + { + "epoch": 4.652591170825336, + "grad_norm": 0.27780583001556897, + "learning_rate": 5.557401198050327e-07, + "loss": 0.184, + "step": 1818 + }, + { + "epoch": 4.6551503518873965, + "grad_norm": 0.27504183175562963, + "learning_rate": 5.473901872286602e-07, + "loss": 0.1712, + "step": 1819 + }, + { + "epoch": 4.657709532949456, + "grad_norm": 0.2774680321446144, + "learning_rate": 5.391025884035239e-07, + "loss": 0.1817, + "step": 1820 + }, + { + "epoch": 4.660268714011516, + "grad_norm": 0.26555781569772313, + "learning_rate": 5.308773498863251e-07, + "loss": 0.1576, + "step": 1821 + }, + { + "epoch": 4.662827895073576, + "grad_norm": 0.33797160433489215, + "learning_rate": 5.22714498033936e-07, + "loss": 0.1929, + "step": 1822 + }, + { + "epoch": 4.665387076135636, + "grad_norm": 0.28245555374717063, + "learning_rate": 5.146140590033199e-07, + "loss": 0.1869, + "step": 1823 + }, + { + "epoch": 4.667946257197697, + "grad_norm": 0.27464067240369455, + "learning_rate": 5.065760587514446e-07, + "loss": 0.1902, + "step": 1824 + }, + { + "epoch": 4.670505438259757, + "grad_norm": 0.2672943490021358, + "learning_rate": 4.986005230351954e-07, + "loss": 0.188, + "step": 1825 + }, + { + "epoch": 4.673064619321817, + "grad_norm": 0.2657663358279065, + "learning_rate": 4.906874774113024e-07, + "loss": 0.184, + "step": 1826 + }, + { + "epoch": 4.675623800383877, + "grad_norm": 0.2674714454963707, + "learning_rate": 4.828369472362493e-07, + "loss": 0.1469, + "step": 1827 + }, + { + "epoch": 4.6781829814459375, + "grad_norm": 0.2882898088898947, + "learning_rate": 4.750489576662021e-07, + "loss": 0.162, + "step": 1828 + }, + { + "epoch": 4.680742162507998, + "grad_norm": 0.27727023737142387, + "learning_rate": 4.6732353365691374e-07, + "loss": 0.1543, + "step": 1829 + }, + { + "epoch": 4.683301343570058, + "grad_norm": 0.2636650641131126, + "learning_rate": 4.5966069996365993e-07, + "loss": 0.1561, + "step": 1830 + }, + { + "epoch": 4.685860524632118, + "grad_norm": 0.2682699322744399, + "learning_rate": 4.5206048114114775e-07, + "loss": 0.1673, + "step": 1831 + }, + { + "epoch": 4.688419705694178, + "grad_norm": 0.2743352871936966, + "learning_rate": 4.4452290154344046e-07, + "loss": 0.1807, + "step": 1832 + }, + { + "epoch": 4.690978886756238, + "grad_norm": 0.2770317090716807, + "learning_rate": 4.3704798532388624e-07, + "loss": 0.2129, + "step": 1833 + }, + { + "epoch": 4.693538067818298, + "grad_norm": 0.280836808159879, + "learning_rate": 4.296357564350362e-07, + "loss": 0.1604, + "step": 1834 + }, + { + "epoch": 4.696097248880358, + "grad_norm": 0.26525175245500215, + "learning_rate": 4.22286238628562e-07, + "loss": 0.1763, + "step": 1835 + }, + { + "epoch": 4.698656429942418, + "grad_norm": 0.2700214270271814, + "learning_rate": 4.1499945545518283e-07, + "loss": 0.154, + "step": 1836 + }, + { + "epoch": 4.7012156110044785, + "grad_norm": 0.24327556501731853, + "learning_rate": 4.077754302645964e-07, + "loss": 0.1616, + "step": 1837 + }, + { + "epoch": 4.703774792066539, + "grad_norm": 0.2652109590879505, + "learning_rate": 4.006141862054014e-07, + "loss": 0.1809, + "step": 1838 + }, + { + "epoch": 4.706333973128599, + "grad_norm": 0.26799443888146, + "learning_rate": 3.935157462250128e-07, + "loss": 0.1799, + "step": 1839 + }, + { + "epoch": 4.708893154190659, + "grad_norm": 0.26417342730564347, + "learning_rate": 3.8648013306960664e-07, + "loss": 0.1697, + "step": 1840 + }, + { + "epoch": 4.711452335252719, + "grad_norm": 0.2655166510116645, + "learning_rate": 3.7950736928402674e-07, + "loss": 0.1354, + "step": 1841 + }, + { + "epoch": 4.7140115163147795, + "grad_norm": 0.27018720652857536, + "learning_rate": 3.7259747721173134e-07, + "loss": 0.1568, + "step": 1842 + }, + { + "epoch": 4.71657069737684, + "grad_norm": 0.26944814432184416, + "learning_rate": 3.6575047899471085e-07, + "loss": 0.1539, + "step": 1843 + }, + { + "epoch": 4.719129878438899, + "grad_norm": 0.273820427322093, + "learning_rate": 3.5896639657342134e-07, + "loss": 0.1566, + "step": 1844 + }, + { + "epoch": 4.72168905950096, + "grad_norm": 0.25408644087595794, + "learning_rate": 3.522452516867048e-07, + "loss": 0.1751, + "step": 1845 + }, + { + "epoch": 4.7242482405630195, + "grad_norm": 0.282638324070005, + "learning_rate": 3.455870658717353e-07, + "loss": 0.1788, + "step": 1846 + }, + { + "epoch": 4.72680742162508, + "grad_norm": 0.27645267654347633, + "learning_rate": 3.3899186046393526e-07, + "loss": 0.1856, + "step": 1847 + }, + { + "epoch": 4.72936660268714, + "grad_norm": 0.28435322569370036, + "learning_rate": 3.324596565969174e-07, + "loss": 0.1903, + "step": 1848 + }, + { + "epoch": 4.7319257837492, + "grad_norm": 0.2665475946312596, + "learning_rate": 3.2599047520241123e-07, + "loss": 0.1625, + "step": 1849 + }, + { + "epoch": 4.73448496481126, + "grad_norm": 0.2848927094305906, + "learning_rate": 3.1958433701019697e-07, + "loss": 0.2058, + "step": 1850 + }, + { + "epoch": 4.737044145873321, + "grad_norm": 0.2704317535606738, + "learning_rate": 3.1324126254804524e-07, + "loss": 0.1868, + "step": 1851 + }, + { + "epoch": 4.739603326935381, + "grad_norm": 0.2800291692162128, + "learning_rate": 3.069612721416371e-07, + "loss": 0.1794, + "step": 1852 + }, + { + "epoch": 4.742162507997441, + "grad_norm": 0.27622038521669706, + "learning_rate": 3.007443859145087e-07, + "loss": 0.1701, + "step": 1853 + }, + { + "epoch": 4.744721689059501, + "grad_norm": 0.28051196613093177, + "learning_rate": 2.9459062378799806e-07, + "loss": 0.193, + "step": 1854 + }, + { + "epoch": 4.747280870121561, + "grad_norm": 0.26937886948023293, + "learning_rate": 2.8850000548115155e-07, + "loss": 0.1645, + "step": 1855 + }, + { + "epoch": 4.749840051183622, + "grad_norm": 0.2574745053917364, + "learning_rate": 2.8247255051068845e-07, + "loss": 0.1711, + "step": 1856 + }, + { + "epoch": 4.752399232245681, + "grad_norm": 0.2716275327438086, + "learning_rate": 2.7650827819093005e-07, + "loss": 0.1699, + "step": 1857 + }, + { + "epoch": 4.754958413307741, + "grad_norm": 0.2590049108849854, + "learning_rate": 2.706072076337285e-07, + "loss": 0.1648, + "step": 1858 + }, + { + "epoch": 4.757517594369801, + "grad_norm": 0.27267669634639347, + "learning_rate": 2.647693577484156e-07, + "loss": 0.1887, + "step": 1859 + }, + { + "epoch": 4.760076775431862, + "grad_norm": 0.27747570018737217, + "learning_rate": 2.5899474724174313e-07, + "loss": 0.1822, + "step": 1860 + }, + { + "epoch": 4.762635956493922, + "grad_norm": 0.27256177644643004, + "learning_rate": 2.532833946178137e-07, + "loss": 0.1833, + "step": 1861 + }, + { + "epoch": 4.765195137555982, + "grad_norm": 0.26875210471690963, + "learning_rate": 2.4763531817802777e-07, + "loss": 0.1634, + "step": 1862 + }, + { + "epoch": 4.767754318618042, + "grad_norm": 0.2841356724669023, + "learning_rate": 2.4205053602103015e-07, + "loss": 0.1716, + "step": 1863 + }, + { + "epoch": 4.770313499680102, + "grad_norm": 0.28261688509298977, + "learning_rate": 2.365290660426389e-07, + "loss": 0.1804, + "step": 1864 + }, + { + "epoch": 4.772872680742163, + "grad_norm": 0.2673985339554513, + "learning_rate": 2.3107092593579905e-07, + "loss": 0.17, + "step": 1865 + }, + { + "epoch": 4.775431861804223, + "grad_norm": 0.2644854641715479, + "learning_rate": 2.2567613319051997e-07, + "loss": 0.1624, + "step": 1866 + }, + { + "epoch": 4.777991042866283, + "grad_norm": 0.2657701185136481, + "learning_rate": 2.2034470509382234e-07, + "loss": 0.1967, + "step": 1867 + }, + { + "epoch": 4.780550223928343, + "grad_norm": 0.26830383283496656, + "learning_rate": 2.1507665872968264e-07, + "loss": 0.1743, + "step": 1868 + }, + { + "epoch": 4.7831094049904035, + "grad_norm": 0.26825661403886375, + "learning_rate": 2.0987201097897757e-07, + "loss": 0.1697, + "step": 1869 + }, + { + "epoch": 4.785668586052463, + "grad_norm": 0.25032287801653486, + "learning_rate": 2.0473077851942858e-07, + "loss": 0.1692, + "step": 1870 + }, + { + "epoch": 4.788227767114523, + "grad_norm": 0.2540438269392125, + "learning_rate": 1.9965297782554848e-07, + "loss": 0.1594, + "step": 1871 + }, + { + "epoch": 4.790786948176583, + "grad_norm": 0.25648856154725486, + "learning_rate": 1.9463862516859277e-07, + "loss": 0.1862, + "step": 1872 + }, + { + "epoch": 4.7933461292386434, + "grad_norm": 0.27520225342246696, + "learning_rate": 1.896877366165062e-07, + "loss": 0.1625, + "step": 1873 + }, + { + "epoch": 4.795905310300704, + "grad_norm": 0.2879456654071117, + "learning_rate": 1.8480032803386505e-07, + "loss": 0.1647, + "step": 1874 + }, + { + "epoch": 4.798464491362764, + "grad_norm": 0.26113619907647856, + "learning_rate": 1.799764150818306e-07, + "loss": 0.1556, + "step": 1875 + }, + { + "epoch": 4.801023672424824, + "grad_norm": 0.2684336534763457, + "learning_rate": 1.7521601321810687e-07, + "loss": 0.1686, + "step": 1876 + }, + { + "epoch": 4.803582853486884, + "grad_norm": 0.287932659535979, + "learning_rate": 1.7051913769687623e-07, + "loss": 0.1549, + "step": 1877 + }, + { + "epoch": 4.8061420345489445, + "grad_norm": 0.26143215817508003, + "learning_rate": 1.658858035687594e-07, + "loss": 0.1796, + "step": 1878 + }, + { + "epoch": 4.808701215611005, + "grad_norm": 0.26742321309706935, + "learning_rate": 1.6131602568076887e-07, + "loss": 0.1723, + "step": 1879 + }, + { + "epoch": 4.811260396673065, + "grad_norm": 0.26866517810628654, + "learning_rate": 1.5680981867625566e-07, + "loss": 0.1631, + "step": 1880 + }, + { + "epoch": 4.813819577735125, + "grad_norm": 0.2664673668196368, + "learning_rate": 1.5236719699486256e-07, + "loss": 0.1595, + "step": 1881 + }, + { + "epoch": 4.816378758797185, + "grad_norm": 0.26624359763267497, + "learning_rate": 1.479881748724865e-07, + "loss": 0.174, + "step": 1882 + }, + { + "epoch": 4.818937939859245, + "grad_norm": 0.26586320390850965, + "learning_rate": 1.4367276634122073e-07, + "loss": 0.1733, + "step": 1883 + }, + { + "epoch": 4.821497120921305, + "grad_norm": 0.2734079332163835, + "learning_rate": 1.3942098522931491e-07, + "loss": 0.1524, + "step": 1884 + }, + { + "epoch": 4.824056301983365, + "grad_norm": 0.26860707446851667, + "learning_rate": 1.3523284516113955e-07, + "loss": 0.1801, + "step": 1885 + }, + { + "epoch": 4.826615483045425, + "grad_norm": 0.26293219622877667, + "learning_rate": 1.3110835955712831e-07, + "loss": 0.1789, + "step": 1886 + }, + { + "epoch": 4.8291746641074855, + "grad_norm": 0.26548180261766674, + "learning_rate": 1.2704754163374022e-07, + "loss": 0.1643, + "step": 1887 + }, + { + "epoch": 4.831733845169546, + "grad_norm": 0.27466550793343814, + "learning_rate": 1.2305040440342198e-07, + "loss": 0.1417, + "step": 1888 + }, + { + "epoch": 4.834293026231606, + "grad_norm": 0.27044679875019695, + "learning_rate": 1.1911696067455902e-07, + "loss": 0.1862, + "step": 1889 + }, + { + "epoch": 4.836852207293666, + "grad_norm": 0.2667382729572236, + "learning_rate": 1.1524722305144231e-07, + "loss": 0.1671, + "step": 1890 + }, + { + "epoch": 4.839411388355726, + "grad_norm": 0.26077082259648754, + "learning_rate": 1.114412039342172e-07, + "loss": 0.1583, + "step": 1891 + }, + { + "epoch": 4.841970569417787, + "grad_norm": 0.2683982318957702, + "learning_rate": 1.0769891551885903e-07, + "loss": 0.1689, + "step": 1892 + }, + { + "epoch": 4.844529750479847, + "grad_norm": 0.26442562426087834, + "learning_rate": 1.0402036979711317e-07, + "loss": 0.1901, + "step": 1893 + }, + { + "epoch": 4.847088931541906, + "grad_norm": 0.2628418022630685, + "learning_rate": 1.0040557855648169e-07, + "loss": 0.1628, + "step": 1894 + }, + { + "epoch": 4.849648112603967, + "grad_norm": 0.2643402568604478, + "learning_rate": 9.685455338016347e-08, + "loss": 0.1769, + "step": 1895 + }, + { + "epoch": 4.8522072936660265, + "grad_norm": 0.27073662782178587, + "learning_rate": 9.336730564702745e-08, + "loss": 0.163, + "step": 1896 + }, + { + "epoch": 4.854766474728087, + "grad_norm": 0.2592523308295433, + "learning_rate": 8.994384653157718e-08, + "loss": 0.1748, + "step": 1897 + }, + { + "epoch": 4.857325655790147, + "grad_norm": 0.26364468600135543, + "learning_rate": 8.658418700391302e-08, + "loss": 0.1658, + "step": 1898 + }, + { + "epoch": 4.859884836852207, + "grad_norm": 0.26261286147824464, + "learning_rate": 8.328833782969003e-08, + "loss": 0.166, + "step": 1899 + }, + { + "epoch": 4.862444017914267, + "grad_norm": 0.2752608130158951, + "learning_rate": 8.005630957010014e-08, + "loss": 0.1832, + "step": 1900 + }, + { + "epoch": 4.865003198976328, + "grad_norm": 0.27593480590108715, + "learning_rate": 7.688811258181883e-08, + "loss": 0.1838, + "step": 1901 + }, + { + "epoch": 4.867562380038388, + "grad_norm": 0.27236472207879586, + "learning_rate": 7.378375701698748e-08, + "loss": 0.1898, + "step": 1902 + }, + { + "epoch": 4.870121561100448, + "grad_norm": 0.25195081103744715, + "learning_rate": 7.074325282317329e-08, + "loss": 0.1754, + "step": 1903 + }, + { + "epoch": 4.872680742162508, + "grad_norm": 0.2869783000463503, + "learning_rate": 6.776660974333605e-08, + "loss": 0.1572, + "step": 1904 + }, + { + "epoch": 4.8752399232245685, + "grad_norm": 0.27441367536012884, + "learning_rate": 6.485383731580142e-08, + "loss": 0.1766, + "step": 1905 + }, + { + "epoch": 4.877799104286629, + "grad_norm": 0.28772532733219014, + "learning_rate": 6.200494487422771e-08, + "loss": 0.1794, + "step": 1906 + }, + { + "epoch": 4.880358285348688, + "grad_norm": 0.27889648675155043, + "learning_rate": 5.921994154758137e-08, + "loss": 0.17, + "step": 1907 + }, + { + "epoch": 4.882917466410748, + "grad_norm": 0.25762876059244744, + "learning_rate": 5.649883626009933e-08, + "loss": 0.1415, + "step": 1908 + }, + { + "epoch": 4.885476647472808, + "grad_norm": 0.27418736611080513, + "learning_rate": 5.3841637731260054e-08, + "loss": 0.1637, + "step": 1909 + }, + { + "epoch": 4.888035828534869, + "grad_norm": 0.2644534739010174, + "learning_rate": 5.1248354475768034e-08, + "loss": 0.1856, + "step": 1910 + }, + { + "epoch": 4.890595009596929, + "grad_norm": 0.26943083477416585, + "learning_rate": 4.871899480351605e-08, + "loss": 0.1833, + "step": 1911 + }, + { + "epoch": 4.893154190658989, + "grad_norm": 0.27096897776649476, + "learning_rate": 4.6253566819554066e-08, + "loss": 0.1646, + "step": 1912 + }, + { + "epoch": 4.895713371721049, + "grad_norm": 0.2810986979004101, + "learning_rate": 4.385207842407813e-08, + "loss": 0.1688, + "step": 1913 + }, + { + "epoch": 4.8982725527831095, + "grad_norm": 0.2780463267788789, + "learning_rate": 4.151453731239707e-08, + "loss": 0.1889, + "step": 1914 + }, + { + "epoch": 4.90083173384517, + "grad_norm": 0.2740470136715985, + "learning_rate": 3.924095097489922e-08, + "loss": 0.1771, + "step": 1915 + }, + { + "epoch": 4.90339091490723, + "grad_norm": 0.2670609796470983, + "learning_rate": 3.703132669704568e-08, + "loss": 0.1767, + "step": 1916 + }, + { + "epoch": 4.90595009596929, + "grad_norm": 0.276634001409615, + "learning_rate": 3.4885671559332645e-08, + "loss": 0.1698, + "step": 1917 + }, + { + "epoch": 4.90850927703135, + "grad_norm": 0.2836253559347624, + "learning_rate": 3.280399243727806e-08, + "loss": 0.1434, + "step": 1918 + }, + { + "epoch": 4.9110684580934105, + "grad_norm": 0.2596326694355661, + "learning_rate": 3.078629600139271e-08, + "loss": 0.1738, + "step": 1919 + }, + { + "epoch": 4.91362763915547, + "grad_norm": 0.26138854550688545, + "learning_rate": 2.8832588717164766e-08, + "loss": 0.1698, + "step": 1920 + }, + { + "epoch": 4.91618682021753, + "grad_norm": 0.2629381551374675, + "learning_rate": 2.694287684503083e-08, + "loss": 0.1803, + "step": 1921 + }, + { + "epoch": 4.91874600127959, + "grad_norm": 0.2707296291029451, + "learning_rate": 2.511716644036932e-08, + "loss": 0.2076, + "step": 1922 + }, + { + "epoch": 4.9213051823416505, + "grad_norm": 0.2643795979993967, + "learning_rate": 2.3355463353467168e-08, + "loss": 0.1737, + "step": 1923 + }, + { + "epoch": 4.923864363403711, + "grad_norm": 0.2687386354716593, + "learning_rate": 2.1657773229508684e-08, + "loss": 0.1525, + "step": 1924 + }, + { + "epoch": 4.926423544465771, + "grad_norm": 0.25773983452315097, + "learning_rate": 2.0024101508555604e-08, + "loss": 0.1611, + "step": 1925 + }, + { + "epoch": 4.928982725527831, + "grad_norm": 0.2759068990299683, + "learning_rate": 1.8454453425527098e-08, + "loss": 0.149, + "step": 1926 + }, + { + "epoch": 4.931541906589891, + "grad_norm": 0.2658729323365574, + "learning_rate": 1.6948834010190874e-08, + "loss": 0.1928, + "step": 1927 + }, + { + "epoch": 4.9341010876519515, + "grad_norm": 0.28178214512822436, + "learning_rate": 1.550724808713877e-08, + "loss": 0.1885, + "step": 1928 + }, + { + "epoch": 4.936660268714012, + "grad_norm": 0.28860699599060785, + "learning_rate": 1.4129700275771208e-08, + "loss": 0.1466, + "step": 1929 + }, + { + "epoch": 4.939219449776072, + "grad_norm": 0.2584677694793919, + "learning_rate": 1.281619499029274e-08, + "loss": 0.1844, + "step": 1930 + }, + { + "epoch": 4.941778630838132, + "grad_norm": 0.2704355939225655, + "learning_rate": 1.1566736439685422e-08, + "loss": 0.1687, + "step": 1931 + }, + { + "epoch": 4.944337811900192, + "grad_norm": 0.2647721632155255, + "learning_rate": 1.0381328627702136e-08, + "loss": 0.1398, + "step": 1932 + }, + { + "epoch": 4.946896992962252, + "grad_norm": 0.26415602194940657, + "learning_rate": 9.259975352848838e-09, + "loss": 0.1665, + "step": 1933 + }, + { + "epoch": 4.949456174024312, + "grad_norm": 0.26483516576483956, + "learning_rate": 8.20268020838455e-09, + "loss": 0.1632, + "step": 1934 + }, + { + "epoch": 4.952015355086372, + "grad_norm": 0.26643152935588643, + "learning_rate": 7.209446582292501e-09, + "loss": 0.1563, + "step": 1935 + }, + { + "epoch": 4.954574536148432, + "grad_norm": 0.25910553240725315, + "learning_rate": 6.2802776572779005e-09, + "loss": 0.1467, + "step": 1936 + }, + { + "epoch": 4.957133717210493, + "grad_norm": 0.25618053832139825, + "learning_rate": 5.415176410765721e-09, + "loss": 0.1586, + "step": 1937 + }, + { + "epoch": 4.959692898272553, + "grad_norm": 0.2519718295297774, + "learning_rate": 4.614145614876275e-09, + "loss": 0.175, + "step": 1938 + }, + { + "epoch": 4.962252079334613, + "grad_norm": 0.266276772335997, + "learning_rate": 3.877187836422991e-09, + "loss": 0.1893, + "step": 1939 + }, + { + "epoch": 4.964811260396673, + "grad_norm": 0.27130663744108346, + "learning_rate": 3.2043054369057523e-09, + "loss": 0.1928, + "step": 1940 + }, + { + "epoch": 4.967370441458733, + "grad_norm": 0.2716068333936989, + "learning_rate": 2.5955005725064597e-09, + "loss": 0.1714, + "step": 1941 + }, + { + "epoch": 4.969929622520794, + "grad_norm": 0.26663386055671573, + "learning_rate": 2.0507751940690434e-09, + "loss": 0.1648, + "step": 1942 + }, + { + "epoch": 4.972488803582854, + "grad_norm": 0.26494109005360517, + "learning_rate": 1.5701310471083476e-09, + "loss": 0.1591, + "step": 1943 + }, + { + "epoch": 4.975047984644913, + "grad_norm": 0.2655087142061252, + "learning_rate": 1.1535696717945855e-09, + "loss": 0.1437, + "step": 1944 + }, + { + "epoch": 4.977607165706974, + "grad_norm": 0.27477087152296875, + "learning_rate": 8.010924029533406e-10, + "loss": 0.1491, + "step": 1945 + }, + { + "epoch": 4.980166346769034, + "grad_norm": 0.2590976709734058, + "learning_rate": 5.127003700589051e-10, + "loss": 0.1584, + "step": 1946 + }, + { + "epoch": 4.982725527831094, + "grad_norm": 0.267447188689341, + "learning_rate": 2.8839449723205847e-10, + "loss": 0.1975, + "step": 1947 + }, + { + "epoch": 4.985284708893154, + "grad_norm": 0.2598735449781636, + "learning_rate": 1.2817550323784843e-10, + "loss": 0.2008, + "step": 1948 + }, + { + "epoch": 4.987843889955214, + "grad_norm": 0.26368976010696143, + "learning_rate": 3.2043901478928666e-11, + "loss": 0.1795, + "step": 1949 + }, + { + "epoch": 4.990403071017274, + "grad_norm": 0.2645697161648628, + "learning_rate": 0.0, + "loss": 0.1707, + "step": 1950 + }, + { + "epoch": 4.990403071017274, + "step": 1950, + "total_flos": 3.115960359367213e+18, + "train_loss": 0.34092234334120386, + "train_runtime": 73926.9587, + "train_samples_per_second": 3.382, + "train_steps_per_second": 0.026 + } + ], + "logging_steps": 1.0, + "max_steps": 1950, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.115960359367213e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}