{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 525,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.009523809523809525,
"grad_norm": 7.880720841975985,
"learning_rate": 1.509433962264151e-06,
"loss": 1.5676,
"step": 1
},
{
"epoch": 0.01904761904761905,
"grad_norm": 7.860108844770417,
"learning_rate": 3.018867924528302e-06,
"loss": 1.5671,
"step": 2
},
{
"epoch": 0.02857142857142857,
"grad_norm": 7.7425094912660395,
"learning_rate": 4.528301886792453e-06,
"loss": 1.5511,
"step": 3
},
{
"epoch": 0.0380952380952381,
"grad_norm": 5.876257335486737,
"learning_rate": 6.037735849056604e-06,
"loss": 1.5083,
"step": 4
},
{
"epoch": 0.047619047619047616,
"grad_norm": 2.92144085922626,
"learning_rate": 7.5471698113207555e-06,
"loss": 1.4418,
"step": 5
},
{
"epoch": 0.05714285714285714,
"grad_norm": 2.432882075707435,
"learning_rate": 9.056603773584907e-06,
"loss": 1.4431,
"step": 6
},
{
"epoch": 0.06666666666666667,
"grad_norm": 6.42341738028925,
"learning_rate": 1.0566037735849058e-05,
"loss": 1.4534,
"step": 7
},
{
"epoch": 0.0761904761904762,
"grad_norm": 6.844420009717874,
"learning_rate": 1.2075471698113209e-05,
"loss": 1.4503,
"step": 8
},
{
"epoch": 0.08571428571428572,
"grad_norm": 8.72224792197926,
"learning_rate": 1.3584905660377358e-05,
"loss": 1.4895,
"step": 9
},
{
"epoch": 0.09523809523809523,
"grad_norm": 6.221915658880092,
"learning_rate": 1.5094339622641511e-05,
"loss": 1.4558,
"step": 10
},
{
"epoch": 0.10476190476190476,
"grad_norm": 4.297680995629592,
"learning_rate": 1.6603773584905664e-05,
"loss": 1.4005,
"step": 11
},
{
"epoch": 0.11428571428571428,
"grad_norm": 3.3637103263399886,
"learning_rate": 1.8113207547169813e-05,
"loss": 1.353,
"step": 12
},
{
"epoch": 0.12380952380952381,
"grad_norm": 2.102279140328028,
"learning_rate": 1.9622641509433963e-05,
"loss": 1.3221,
"step": 13
},
{
"epoch": 0.13333333333333333,
"grad_norm": 2.5449653919034234,
"learning_rate": 2.1132075471698115e-05,
"loss": 1.3029,
"step": 14
},
{
"epoch": 0.14285714285714285,
"grad_norm": 1.6782911213475984,
"learning_rate": 2.2641509433962265e-05,
"loss": 1.304,
"step": 15
},
{
"epoch": 0.1523809523809524,
"grad_norm": 1.7888774786584605,
"learning_rate": 2.4150943396226418e-05,
"loss": 1.2618,
"step": 16
},
{
"epoch": 0.1619047619047619,
"grad_norm": 1.5997868512124438,
"learning_rate": 2.5660377358490567e-05,
"loss": 1.2583,
"step": 17
},
{
"epoch": 0.17142857142857143,
"grad_norm": 1.2377999305796528,
"learning_rate": 2.7169811320754716e-05,
"loss": 1.2703,
"step": 18
},
{
"epoch": 0.18095238095238095,
"grad_norm": 1.486080611758641,
"learning_rate": 2.867924528301887e-05,
"loss": 1.2392,
"step": 19
},
{
"epoch": 0.19047619047619047,
"grad_norm": 1.27108266101122,
"learning_rate": 3.0188679245283022e-05,
"loss": 1.2377,
"step": 20
},
{
"epoch": 0.2,
"grad_norm": 0.9831199960325674,
"learning_rate": 3.169811320754717e-05,
"loss": 1.2123,
"step": 21
},
{
"epoch": 0.20952380952380953,
"grad_norm": 1.7763756530914847,
"learning_rate": 3.320754716981133e-05,
"loss": 1.2336,
"step": 22
},
{
"epoch": 0.21904761904761905,
"grad_norm": 1.3105067418162915,
"learning_rate": 3.471698113207548e-05,
"loss": 1.2172,
"step": 23
},
{
"epoch": 0.22857142857142856,
"grad_norm": 1.639321729355214,
"learning_rate": 3.6226415094339626e-05,
"loss": 1.2173,
"step": 24
},
{
"epoch": 0.23809523809523808,
"grad_norm": 1.476554104561696,
"learning_rate": 3.7735849056603776e-05,
"loss": 1.201,
"step": 25
},
{
"epoch": 0.24761904761904763,
"grad_norm": 1.530897069889547,
"learning_rate": 3.9245283018867925e-05,
"loss": 1.2083,
"step": 26
},
{
"epoch": 0.2571428571428571,
"grad_norm": 1.5322183506473377,
"learning_rate": 4.075471698113208e-05,
"loss": 1.2032,
"step": 27
},
{
"epoch": 0.26666666666666666,
"grad_norm": 1.3706277501006499,
"learning_rate": 4.226415094339623e-05,
"loss": 1.1896,
"step": 28
},
{
"epoch": 0.2761904761904762,
"grad_norm": 1.675751835429746,
"learning_rate": 4.377358490566038e-05,
"loss": 1.1955,
"step": 29
},
{
"epoch": 0.2857142857142857,
"grad_norm": 1.2446280009546904,
"learning_rate": 4.528301886792453e-05,
"loss": 1.1714,
"step": 30
},
{
"epoch": 0.29523809523809524,
"grad_norm": 1.232539792154481,
"learning_rate": 4.679245283018868e-05,
"loss": 1.1935,
"step": 31
},
{
"epoch": 0.3047619047619048,
"grad_norm": 1.1894442744080997,
"learning_rate": 4.8301886792452835e-05,
"loss": 1.1823,
"step": 32
},
{
"epoch": 0.3142857142857143,
"grad_norm": 1.6698359989666955,
"learning_rate": 4.9811320754716985e-05,
"loss": 1.1844,
"step": 33
},
{
"epoch": 0.3238095238095238,
"grad_norm": 1.1584686212441244,
"learning_rate": 5.1320754716981134e-05,
"loss": 1.168,
"step": 34
},
{
"epoch": 0.3333333333333333,
"grad_norm": 1.7374801678617988,
"learning_rate": 5.283018867924528e-05,
"loss": 1.175,
"step": 35
},
{
"epoch": 0.34285714285714286,
"grad_norm": 1.3774261987594567,
"learning_rate": 5.433962264150943e-05,
"loss": 1.1885,
"step": 36
},
{
"epoch": 0.3523809523809524,
"grad_norm": 1.144616111806608,
"learning_rate": 5.584905660377359e-05,
"loss": 1.1609,
"step": 37
},
{
"epoch": 0.3619047619047619,
"grad_norm": 2.044834450527845,
"learning_rate": 5.735849056603774e-05,
"loss": 1.1627,
"step": 38
},
{
"epoch": 0.37142857142857144,
"grad_norm": 1.8059910262399932,
"learning_rate": 5.886792452830189e-05,
"loss": 1.1599,
"step": 39
},
{
"epoch": 0.38095238095238093,
"grad_norm": 2.170117071827184,
"learning_rate": 6.0377358490566044e-05,
"loss": 1.1639,
"step": 40
},
{
"epoch": 0.3904761904761905,
"grad_norm": 1.3892171371581836,
"learning_rate": 6.18867924528302e-05,
"loss": 1.1672,
"step": 41
},
{
"epoch": 0.4,
"grad_norm": 3.12613489185021,
"learning_rate": 6.339622641509434e-05,
"loss": 1.1621,
"step": 42
},
{
"epoch": 0.4095238095238095,
"grad_norm": 1.9337153601065566,
"learning_rate": 6.49056603773585e-05,
"loss": 1.1624,
"step": 43
},
{
"epoch": 0.41904761904761906,
"grad_norm": 2.8416059583439055,
"learning_rate": 6.641509433962266e-05,
"loss": 1.1505,
"step": 44
},
{
"epoch": 0.42857142857142855,
"grad_norm": 1.9476637352918562,
"learning_rate": 6.79245283018868e-05,
"loss": 1.1661,
"step": 45
},
{
"epoch": 0.4380952380952381,
"grad_norm": 2.276534088165202,
"learning_rate": 6.943396226415095e-05,
"loss": 1.1519,
"step": 46
},
{
"epoch": 0.44761904761904764,
"grad_norm": 1.7741206434308783,
"learning_rate": 7.09433962264151e-05,
"loss": 1.1397,
"step": 47
},
{
"epoch": 0.45714285714285713,
"grad_norm": 1.2590011451410716,
"learning_rate": 7.245283018867925e-05,
"loss": 1.1496,
"step": 48
},
{
"epoch": 0.4666666666666667,
"grad_norm": 2.2235594751942145,
"learning_rate": 7.396226415094341e-05,
"loss": 1.1646,
"step": 49
},
{
"epoch": 0.47619047619047616,
"grad_norm": 1.3848690153622902,
"learning_rate": 7.547169811320755e-05,
"loss": 1.1446,
"step": 50
},
{
"epoch": 0.4857142857142857,
"grad_norm": 2.2564098595161375,
"learning_rate": 7.698113207547171e-05,
"loss": 1.1521,
"step": 51
},
{
"epoch": 0.49523809523809526,
"grad_norm": 2.2598603091896234,
"learning_rate": 7.849056603773585e-05,
"loss": 1.1501,
"step": 52
},
{
"epoch": 0.5047619047619047,
"grad_norm": 1.9630215060124938,
"learning_rate": 8e-05,
"loss": 1.1558,
"step": 53
},
{
"epoch": 0.5142857142857142,
"grad_norm": 2.449644231642783,
"learning_rate": 7.99991139787449e-05,
"loss": 1.1478,
"step": 54
},
{
"epoch": 0.5238095238095238,
"grad_norm": 1.8285447577052554,
"learning_rate": 7.999645595423128e-05,
"loss": 1.1522,
"step": 55
},
{
"epoch": 0.5333333333333333,
"grad_norm": 1.6446782549594272,
"learning_rate": 7.999202604421244e-05,
"loss": 1.1488,
"step": 56
},
{
"epoch": 0.5428571428571428,
"grad_norm": 1.6532082997276394,
"learning_rate": 7.998582444493811e-05,
"loss": 1.135,
"step": 57
},
{
"epoch": 0.5523809523809524,
"grad_norm": 1.3217448563312193,
"learning_rate": 7.997785143114573e-05,
"loss": 1.1381,
"step": 58
},
{
"epoch": 0.5619047619047619,
"grad_norm": 1.5193922578374879,
"learning_rate": 7.996810735604828e-05,
"loss": 1.1488,
"step": 59
},
{
"epoch": 0.5714285714285714,
"grad_norm": 2.032121014977749,
"learning_rate": 7.995659265131865e-05,
"loss": 1.1486,
"step": 60
},
{
"epoch": 0.580952380952381,
"grad_norm": 1.1859854441122255,
"learning_rate": 7.994330782707048e-05,
"loss": 1.1463,
"step": 61
},
{
"epoch": 0.5904761904761905,
"grad_norm": 1.8225832846306127,
"learning_rate": 7.992825347183563e-05,
"loss": 1.1422,
"step": 62
},
{
"epoch": 0.6,
"grad_norm": 3.1767099132769046,
"learning_rate": 7.991143025253801e-05,
"loss": 1.1281,
"step": 63
},
{
"epoch": 0.6095238095238096,
"grad_norm": 1.7321438313588222,
"learning_rate": 7.989283891446413e-05,
"loss": 1.1509,
"step": 64
},
{
"epoch": 0.6190476190476191,
"grad_norm": 3.7251686905910355,
"learning_rate": 7.987248028123001e-05,
"loss": 1.161,
"step": 65
},
{
"epoch": 0.6285714285714286,
"grad_norm": 2.9492630393156514,
"learning_rate": 7.985035525474475e-05,
"loss": 1.1583,
"step": 66
},
{
"epoch": 0.638095238095238,
"grad_norm": 2.9426097334554897,
"learning_rate": 7.982646481517054e-05,
"loss": 1.1442,
"step": 67
},
{
"epoch": 0.6476190476190476,
"grad_norm": 2.469295775843566,
"learning_rate": 7.980081002087923e-05,
"loss": 1.1401,
"step": 68
},
{
"epoch": 0.6571428571428571,
"grad_norm": 2.652729866210755,
"learning_rate": 7.97733920084055e-05,
"loss": 1.1488,
"step": 69
},
{
"epoch": 0.6666666666666666,
"grad_norm": 2.2477356726812125,
"learning_rate": 7.97442119923964e-05,
"loss": 1.1241,
"step": 70
},
{
"epoch": 0.6761904761904762,
"grad_norm": 2.071700619970133,
"learning_rate": 7.971327126555767e-05,
"loss": 1.1383,
"step": 71
},
{
"epoch": 0.6857142857142857,
"grad_norm": 1.9746783226121274,
"learning_rate": 7.968057119859639e-05,
"loss": 1.132,
"step": 72
},
{
"epoch": 0.6952380952380952,
"grad_norm": 1.5945211845534675,
"learning_rate": 7.96461132401603e-05,
"loss": 1.1326,
"step": 73
},
{
"epoch": 0.7047619047619048,
"grad_norm": 1.9842082262603224,
"learning_rate": 7.960989891677354e-05,
"loss": 1.131,
"step": 74
},
{
"epoch": 0.7142857142857143,
"grad_norm": 1.0744704158928022,
"learning_rate": 7.957192983276915e-05,
"loss": 1.1286,
"step": 75
},
{
"epoch": 0.7238095238095238,
"grad_norm": 2.4648933927530954,
"learning_rate": 7.953220767021789e-05,
"loss": 1.1334,
"step": 76
},
{
"epoch": 0.7333333333333333,
"grad_norm": 1.6229327724824014,
"learning_rate": 7.949073418885378e-05,
"loss": 1.1268,
"step": 77
},
{
"epoch": 0.7428571428571429,
"grad_norm": 2.070091284758308,
"learning_rate": 7.944751122599613e-05,
"loss": 1.13,
"step": 78
},
{
"epoch": 0.7523809523809524,
"grad_norm": 1.5442744548822007,
"learning_rate": 7.940254069646813e-05,
"loss": 1.1219,
"step": 79
},
{
"epoch": 0.7619047619047619,
"grad_norm": 1.737018571216328,
"learning_rate": 7.935582459251202e-05,
"loss": 1.1219,
"step": 80
},
{
"epoch": 0.7714285714285715,
"grad_norm": 1.5613075146451556,
"learning_rate": 7.930736498370085e-05,
"loss": 1.1096,
"step": 81
},
{
"epoch": 0.780952380952381,
"grad_norm": 1.0920845324913702,
"learning_rate": 7.925716401684678e-05,
"loss": 1.119,
"step": 82
},
{
"epoch": 0.7904761904761904,
"grad_norm": 1.9034104210285714,
"learning_rate": 7.920522391590604e-05,
"loss": 1.1236,
"step": 83
},
{
"epoch": 0.8,
"grad_norm": 1.345035723453052,
"learning_rate": 7.915154698188027e-05,
"loss": 1.1324,
"step": 84
},
{
"epoch": 0.8095238095238095,
"grad_norm": 1.2875694611881727,
"learning_rate": 7.909613559271467e-05,
"loss": 1.1136,
"step": 85
},
{
"epoch": 0.819047619047619,
"grad_norm": 1.0224645041826803,
"learning_rate": 7.90389922031927e-05,
"loss": 1.1226,
"step": 86
},
{
"epoch": 0.8285714285714286,
"grad_norm": 1.3257820885017062,
"learning_rate": 7.898011934482725e-05,
"loss": 1.1214,
"step": 87
},
{
"epoch": 0.8380952380952381,
"grad_norm": 1.839693710231441,
"learning_rate": 7.89195196257485e-05,
"loss": 1.1406,
"step": 88
},
{
"epoch": 0.8476190476190476,
"grad_norm": 1.2987193541355626,
"learning_rate": 7.88571957305884e-05,
"loss": 1.1231,
"step": 89
},
{
"epoch": 0.8571428571428571,
"grad_norm": 1.4098694982637217,
"learning_rate": 7.879315042036176e-05,
"loss": 1.1386,
"step": 90
},
{
"epoch": 0.8666666666666667,
"grad_norm": 1.6007046498489645,
"learning_rate": 7.872738653234387e-05,
"loss": 1.1108,
"step": 91
},
{
"epoch": 0.8761904761904762,
"grad_norm": 1.0544725776613069,
"learning_rate": 7.865990697994488e-05,
"loss": 1.1285,
"step": 92
},
{
"epoch": 0.8857142857142857,
"grad_norm": 1.0388252500166617,
"learning_rate": 7.859071475258065e-05,
"loss": 1.1265,
"step": 93
},
{
"epoch": 0.8952380952380953,
"grad_norm": 1.3249551090301468,
"learning_rate": 7.85198129155404e-05,
"loss": 1.1229,
"step": 94
},
{
"epoch": 0.9047619047619048,
"grad_norm": 1.2506636003424314,
"learning_rate": 7.844720460985086e-05,
"loss": 1.1284,
"step": 95
},
{
"epoch": 0.9142857142857143,
"grad_norm": 1.1709974083254924,
"learning_rate": 7.837289305213715e-05,
"loss": 1.1221,
"step": 96
},
{
"epoch": 0.9238095238095239,
"grad_norm": 1.5298739597522653,
"learning_rate": 7.829688153448022e-05,
"loss": 1.122,
"step": 97
},
{
"epoch": 0.9333333333333333,
"grad_norm": 1.0400221166560895,
"learning_rate": 7.821917342427112e-05,
"loss": 1.1125,
"step": 98
},
{
"epoch": 0.9428571428571428,
"grad_norm": 1.1449775808812492,
"learning_rate": 7.81397721640617e-05,
"loss": 1.1098,
"step": 99
},
{
"epoch": 0.9523809523809523,
"grad_norm": 1.09547909217965,
"learning_rate": 7.805868127141217e-05,
"loss": 1.1097,
"step": 100
},
{
"epoch": 0.9619047619047619,
"grad_norm": 1.0999337584802662,
"learning_rate": 7.797590433873526e-05,
"loss": 1.1146,
"step": 101
},
{
"epoch": 0.9714285714285714,
"grad_norm": 1.5977618876155488,
"learning_rate": 7.789144503313704e-05,
"loss": 1.1375,
"step": 102
},
{
"epoch": 0.9809523809523809,
"grad_norm": 0.8781433045555708,
"learning_rate": 7.780530709625455e-05,
"loss": 1.1191,
"step": 103
},
{
"epoch": 0.9904761904761905,
"grad_norm": 1.2752214725002813,
"learning_rate": 7.771749434408989e-05,
"loss": 1.1173,
"step": 104
},
{
"epoch": 1.0,
"grad_norm": 1.0844449972573356,
"learning_rate": 7.762801066684133e-05,
"loss": 1.1137,
"step": 105
},
{
"epoch": 1.0095238095238095,
"grad_norm": 1.082120963718663,
"learning_rate": 7.753686002873087e-05,
"loss": 1.0737,
"step": 106
},
{
"epoch": 1.019047619047619,
"grad_norm": 1.1974759867644165,
"learning_rate": 7.744404646782866e-05,
"loss": 1.0849,
"step": 107
},
{
"epoch": 1.0285714285714285,
"grad_norm": 1.6630439720081593,
"learning_rate": 7.734957409587404e-05,
"loss": 1.0861,
"step": 108
},
{
"epoch": 1.0380952380952382,
"grad_norm": 0.8074012816288818,
"learning_rate": 7.725344709809355e-05,
"loss": 1.0785,
"step": 109
},
{
"epoch": 1.0476190476190477,
"grad_norm": 1.6368997832238315,
"learning_rate": 7.715566973301529e-05,
"loss": 1.0963,
"step": 110
},
{
"epoch": 1.0571428571428572,
"grad_norm": 1.297969917454931,
"learning_rate": 7.70562463322805e-05,
"loss": 1.0728,
"step": 111
},
{
"epoch": 1.0666666666666667,
"grad_norm": 1.4036049691933277,
"learning_rate": 7.695518130045147e-05,
"loss": 1.0806,
"step": 112
},
{
"epoch": 1.0761904761904761,
"grad_norm": 0.9013266727291808,
"learning_rate": 7.685247911481652e-05,
"loss": 1.0981,
"step": 113
},
{
"epoch": 1.0857142857142856,
"grad_norm": 1.0917190640539323,
"learning_rate": 7.674814432519163e-05,
"loss": 1.0734,
"step": 114
},
{
"epoch": 1.0952380952380953,
"grad_norm": 1.0218269030800684,
"learning_rate": 7.664218155371884e-05,
"loss": 1.0658,
"step": 115
},
{
"epoch": 1.1047619047619048,
"grad_norm": 0.9481079847045792,
"learning_rate": 7.653459549466157e-05,
"loss": 1.086,
"step": 116
},
{
"epoch": 1.1142857142857143,
"grad_norm": 1.1795366957840878,
"learning_rate": 7.642539091419654e-05,
"loss": 1.1062,
"step": 117
},
{
"epoch": 1.1238095238095238,
"grad_norm": 1.2913118926572955,
"learning_rate": 7.631457265020274e-05,
"loss": 1.075,
"step": 118
},
{
"epoch": 1.1333333333333333,
"grad_norm": 0.8478614709821788,
"learning_rate": 7.620214561204704e-05,
"loss": 1.0683,
"step": 119
},
{
"epoch": 1.1428571428571428,
"grad_norm": 1.0862987356843379,
"learning_rate": 7.608811478036671e-05,
"loss": 1.0846,
"step": 120
},
{
"epoch": 1.1523809523809523,
"grad_norm": 1.090222081314668,
"learning_rate": 7.597248520684878e-05,
"loss": 1.0905,
"step": 121
},
{
"epoch": 1.161904761904762,
"grad_norm": 0.7532978409995891,
"learning_rate": 7.585526201400623e-05,
"loss": 1.0791,
"step": 122
},
{
"epoch": 1.1714285714285715,
"grad_norm": 1.1901023584369215,
"learning_rate": 7.57364503949511e-05,
"loss": 1.0869,
"step": 123
},
{
"epoch": 1.180952380952381,
"grad_norm": 1.2655635735771573,
"learning_rate": 7.56160556131644e-05,
"loss": 1.0688,
"step": 124
},
{
"epoch": 1.1904761904761905,
"grad_norm": 0.8393572943728383,
"learning_rate": 7.549408300226287e-05,
"loss": 1.0798,
"step": 125
},
{
"epoch": 1.2,
"grad_norm": 1.0763767921818734,
"learning_rate": 7.537053796576282e-05,
"loss": 1.0755,
"step": 126
},
{
"epoch": 1.2095238095238094,
"grad_norm": 0.810181087854412,
"learning_rate": 7.524542597684066e-05,
"loss": 1.0722,
"step": 127
},
{
"epoch": 1.2190476190476192,
"grad_norm": 0.7248802686574851,
"learning_rate": 7.51187525780905e-05,
"loss": 1.0718,
"step": 128
},
{
"epoch": 1.2285714285714286,
"grad_norm": 1.3361324797449061,
"learning_rate": 7.499052338127845e-05,
"loss": 1.0778,
"step": 129
},
{
"epoch": 1.2380952380952381,
"grad_norm": 0.8378010550345113,
"learning_rate": 7.486074406709429e-05,
"loss": 1.068,
"step": 130
},
{
"epoch": 1.2476190476190476,
"grad_norm": 0.703515432967172,
"learning_rate": 7.47294203848995e-05,
"loss": 1.0832,
"step": 131
},
{
"epoch": 1.2571428571428571,
"grad_norm": 0.5700766848892007,
"learning_rate": 7.459655815247278e-05,
"loss": 1.0702,
"step": 132
},
{
"epoch": 1.2666666666666666,
"grad_norm": 0.7201933038596476,
"learning_rate": 7.446216325575225e-05,
"loss": 1.0751,
"step": 133
},
{
"epoch": 1.276190476190476,
"grad_norm": 0.9463247169207133,
"learning_rate": 7.432624164857465e-05,
"loss": 1.0597,
"step": 134
},
{
"epoch": 1.2857142857142856,
"grad_norm": 1.0411988380085484,
"learning_rate": 7.418879935241162e-05,
"loss": 1.0694,
"step": 135
},
{
"epoch": 1.2952380952380953,
"grad_norm": 1.0701186830070861,
"learning_rate": 7.404984245610296e-05,
"loss": 1.0706,
"step": 136
},
{
"epoch": 1.3047619047619048,
"grad_norm": 1.1554699877331098,
"learning_rate": 7.390937711558683e-05,
"loss": 1.0619,
"step": 137
},
{
"epoch": 1.3142857142857143,
"grad_norm": 1.1704855402151975,
"learning_rate": 7.376740955362715e-05,
"loss": 1.0535,
"step": 138
},
{
"epoch": 1.3238095238095238,
"grad_norm": 1.1741068413982254,
"learning_rate": 7.362394605953773e-05,
"loss": 1.0775,
"step": 139
},
{
"epoch": 1.3333333333333333,
"grad_norm": 1.2007489108920013,
"learning_rate": 7.347899298890386e-05,
"loss": 1.0726,
"step": 140
},
{
"epoch": 1.342857142857143,
"grad_norm": 0.9964002243232726,
"learning_rate": 7.33325567633006e-05,
"loss": 1.0602,
"step": 141
},
{
"epoch": 1.3523809523809525,
"grad_norm": 0.8355502476587309,
"learning_rate": 7.31846438700084e-05,
"loss": 1.0774,
"step": 142
},
{
"epoch": 1.361904761904762,
"grad_norm": 0.7029298880866707,
"learning_rate": 7.303526086172558e-05,
"loss": 1.0846,
"step": 143
},
{
"epoch": 1.3714285714285714,
"grad_norm": 0.9269565547406854,
"learning_rate": 7.288441435627821e-05,
"loss": 1.0627,
"step": 144
},
{
"epoch": 1.380952380952381,
"grad_norm": 1.0950185791329867,
"learning_rate": 7.273211103632676e-05,
"loss": 1.0841,
"step": 145
},
{
"epoch": 1.3904761904761904,
"grad_norm": 1.0029086769564732,
"learning_rate": 7.25783576490702e-05,
"loss": 1.0559,
"step": 146
},
{
"epoch": 1.4,
"grad_norm": 1.0116599010130591,
"learning_rate": 7.242316100594696e-05,
"loss": 1.063,
"step": 147
},
{
"epoch": 1.4095238095238094,
"grad_norm": 0.8363130430044641,
"learning_rate": 7.226652798233327e-05,
"loss": 1.0601,
"step": 148
},
{
"epoch": 1.4190476190476191,
"grad_norm": 0.7749911266241076,
"learning_rate": 7.210846551723855e-05,
"loss": 1.0655,
"step": 149
},
{
"epoch": 1.4285714285714286,
"grad_norm": 0.8621768850747066,
"learning_rate": 7.194898061299798e-05,
"loss": 1.0687,
"step": 150
},
{
"epoch": 1.438095238095238,
"grad_norm": 1.0632400314141561,
"learning_rate": 7.17880803349623e-05,
"loss": 1.0594,
"step": 151
},
{
"epoch": 1.4476190476190476,
"grad_norm": 1.384452095634303,
"learning_rate": 7.162577181118485e-05,
"loss": 1.0747,
"step": 152
},
{
"epoch": 1.457142857142857,
"grad_norm": 0.6834451700354741,
"learning_rate": 7.146206223210571e-05,
"loss": 1.0722,
"step": 153
},
{
"epoch": 1.4666666666666668,
"grad_norm": 0.6642752224995787,
"learning_rate": 7.129695885023321e-05,
"loss": 1.0605,
"step": 154
},
{
"epoch": 1.4761904761904763,
"grad_norm": 1.2463789389464905,
"learning_rate": 7.113046897982265e-05,
"loss": 1.0678,
"step": 155
},
{
"epoch": 1.4857142857142858,
"grad_norm": 1.1799339849059132,
"learning_rate": 7.09625999965522e-05,
"loss": 1.0805,
"step": 156
},
{
"epoch": 1.4952380952380953,
"grad_norm": 0.5844684489642771,
"learning_rate": 7.079335933719625e-05,
"loss": 1.0627,
"step": 157
},
{
"epoch": 1.5047619047619047,
"grad_norm": 0.6983703223969,
"learning_rate": 7.062275449929587e-05,
"loss": 1.0685,
"step": 158
},
{
"epoch": 1.5142857142857142,
"grad_norm": 1.0157049520030346,
"learning_rate": 7.045079304082667e-05,
"loss": 1.057,
"step": 159
},
{
"epoch": 1.5238095238095237,
"grad_norm": 1.21522775185842,
"learning_rate": 7.027748257986403e-05,
"loss": 1.0497,
"step": 160
},
{
"epoch": 1.5333333333333332,
"grad_norm": 0.670913402144578,
"learning_rate": 7.010283079424553e-05,
"loss": 1.0714,
"step": 161
},
{
"epoch": 1.5428571428571427,
"grad_norm": 0.5725919893988906,
"learning_rate": 6.992684542123094e-05,
"loss": 1.0651,
"step": 162
},
{
"epoch": 1.5523809523809524,
"grad_norm": 0.5816949338841475,
"learning_rate": 6.974953425715926e-05,
"loss": 1.0625,
"step": 163
},
{
"epoch": 1.561904761904762,
"grad_norm": 0.621239138517028,
"learning_rate": 6.957090515710353e-05,
"loss": 1.06,
"step": 164
},
{
"epoch": 1.5714285714285714,
"grad_norm": 0.8232854555952457,
"learning_rate": 6.939096603452269e-05,
"loss": 1.0607,
"step": 165
},
{
"epoch": 1.580952380952381,
"grad_norm": 0.7980171860515516,
"learning_rate": 6.920972486091113e-05,
"loss": 1.069,
"step": 166
},
{
"epoch": 1.5904761904761906,
"grad_norm": 0.8469950727776042,
"learning_rate": 6.902718966544545e-05,
"loss": 1.0555,
"step": 167
},
{
"epoch": 1.6,
"grad_norm": 1.0917122939746875,
"learning_rate": 6.884336853462879e-05,
"loss": 1.073,
"step": 168
},
{
"epoch": 1.6095238095238096,
"grad_norm": 0.92965912537962,
"learning_rate": 6.865826961193261e-05,
"loss": 1.0662,
"step": 169
},
{
"epoch": 1.619047619047619,
"grad_norm": 0.7708978762494281,
"learning_rate": 6.84719010974359e-05,
"loss": 1.0524,
"step": 170
},
{
"epoch": 1.6285714285714286,
"grad_norm": 0.6756433297794424,
"learning_rate": 6.828427124746191e-05,
"loss": 1.0692,
"step": 171
},
{
"epoch": 1.638095238095238,
"grad_norm": 0.6747029343891224,
"learning_rate": 6.80953883742124e-05,
"loss": 1.0607,
"step": 172
},
{
"epoch": 1.6476190476190475,
"grad_norm": 0.6109245622680719,
"learning_rate": 6.790526084539939e-05,
"loss": 1.0578,
"step": 173
},
{
"epoch": 1.657142857142857,
"grad_norm": 0.7524343677177509,
"learning_rate": 6.771389708387448e-05,
"loss": 1.0666,
"step": 174
},
{
"epoch": 1.6666666666666665,
"grad_norm": 1.1491906497079498,
"learning_rate": 6.752130556725567e-05,
"loss": 1.0694,
"step": 175
},
{
"epoch": 1.6761904761904762,
"grad_norm": 0.8088967378835389,
"learning_rate": 6.73274948275518e-05,
"loss": 1.0632,
"step": 176
},
{
"epoch": 1.6857142857142857,
"grad_norm": 0.4254639452827212,
"learning_rate": 6.713247345078465e-05,
"loss": 1.0543,
"step": 177
},
{
"epoch": 1.6952380952380952,
"grad_norm": 0.5687155529327818,
"learning_rate": 6.693625007660845e-05,
"loss": 1.0575,
"step": 178
},
{
"epoch": 1.704761904761905,
"grad_norm": 0.6667850106047994,
"learning_rate": 6.673883339792723e-05,
"loss": 1.055,
"step": 179
},
{
"epoch": 1.7142857142857144,
"grad_norm": 0.612985384932019,
"learning_rate": 6.654023216050963e-05,
"loss": 1.071,
"step": 180
},
{
"epoch": 1.723809523809524,
"grad_norm": 0.5640012274326677,
"learning_rate": 6.634045516260156e-05,
"loss": 1.051,
"step": 181
},
{
"epoch": 1.7333333333333334,
"grad_norm": 0.499428294662914,
"learning_rate": 6.613951125453632e-05,
"loss": 1.0768,
"step": 182
},
{
"epoch": 1.7428571428571429,
"grad_norm": 0.42771581794966373,
"learning_rate": 6.593740933834262e-05,
"loss": 1.0566,
"step": 183
},
{
"epoch": 1.7523809523809524,
"grad_norm": 0.8182121479285371,
"learning_rate": 6.573415836735011e-05,
"loss": 1.073,
"step": 184
},
{
"epoch": 1.7619047619047619,
"grad_norm": 0.45335952004384406,
"learning_rate": 6.552976734579281e-05,
"loss": 1.0602,
"step": 185
},
{
"epoch": 1.7714285714285714,
"grad_norm": 0.47019401062933164,
"learning_rate": 6.53242453284102e-05,
"loss": 1.0592,
"step": 186
},
{
"epoch": 1.7809523809523808,
"grad_norm": 0.4216198182174328,
"learning_rate": 6.511760142004608e-05,
"loss": 1.0607,
"step": 187
},
{
"epoch": 1.7904761904761903,
"grad_norm": 0.4390063285487437,
"learning_rate": 6.49098447752452e-05,
"loss": 1.0685,
"step": 188
},
{
"epoch": 1.8,
"grad_norm": 0.44096942987470333,
"learning_rate": 6.470098459784768e-05,
"loss": 1.0566,
"step": 189
},
{
"epoch": 1.8095238095238095,
"grad_norm": 0.3569409404531036,
"learning_rate": 6.449103014058139e-05,
"loss": 1.0588,
"step": 190
},
{
"epoch": 1.819047619047619,
"grad_norm": 0.47305630507582075,
"learning_rate": 6.427999070465191e-05,
"loss": 1.0758,
"step": 191
},
{
"epoch": 1.8285714285714287,
"grad_norm": 0.34682771443136134,
"learning_rate": 6.406787563933053e-05,
"loss": 1.0561,
"step": 192
},
{
"epoch": 1.8380952380952382,
"grad_norm": 0.3784058265103603,
"learning_rate": 6.385469434154006e-05,
"loss": 1.0806,
"step": 193
},
{
"epoch": 1.8476190476190477,
"grad_norm": 0.45193884219920866,
"learning_rate": 6.364045625543856e-05,
"loss": 1.0601,
"step": 194
},
{
"epoch": 1.8571428571428572,
"grad_norm": 0.5301378494163913,
"learning_rate": 6.342517087200094e-05,
"loss": 1.0586,
"step": 195
},
{
"epoch": 1.8666666666666667,
"grad_norm": 0.563802211092738,
"learning_rate": 6.320884772859845e-05,
"loss": 1.0571,
"step": 196
},
{
"epoch": 1.8761904761904762,
"grad_norm": 0.5201827508182623,
"learning_rate": 6.29914964085763e-05,
"loss": 1.0571,
"step": 197
},
{
"epoch": 1.8857142857142857,
"grad_norm": 0.4236836095612993,
"learning_rate": 6.277312654082886e-05,
"loss": 1.0606,
"step": 198
},
{
"epoch": 1.8952380952380952,
"grad_norm": 0.38720748352240386,
"learning_rate": 6.255374779937344e-05,
"loss": 1.0493,
"step": 199
},
{
"epoch": 1.9047619047619047,
"grad_norm": 0.42659615830981307,
"learning_rate": 6.23333699029214e-05,
"loss": 1.0619,
"step": 200
},
{
"epoch": 1.9142857142857141,
"grad_norm": 0.6061421220455705,
"learning_rate": 6.211200261444774e-05,
"loss": 1.0541,
"step": 201
},
{
"epoch": 1.9238095238095239,
"grad_norm": 0.797781403199275,
"learning_rate": 6.188965574075863e-05,
"loss": 1.0559,
"step": 202
},
{
"epoch": 1.9333333333333333,
"grad_norm": 0.8282451159457771,
"learning_rate": 6.166633913205684e-05,
"loss": 1.0545,
"step": 203
},
{
"epoch": 1.9428571428571428,
"grad_norm": 0.9009492897590493,
"learning_rate": 6.144206268150549e-05,
"loss": 1.0586,
"step": 204
},
{
"epoch": 1.9523809523809523,
"grad_norm": 1.0020870163569926,
"learning_rate": 6.12168363247897e-05,
"loss": 1.0457,
"step": 205
},
{
"epoch": 1.961904761904762,
"grad_norm": 1.1260023046138727,
"learning_rate": 6.0990670039676416e-05,
"loss": 1.054,
"step": 206
},
{
"epoch": 1.9714285714285715,
"grad_norm": 0.7820493480578619,
"learning_rate": 6.0763573845572434e-05,
"loss": 1.0674,
"step": 207
},
{
"epoch": 1.980952380952381,
"grad_norm": 0.47269444067540217,
"learning_rate": 6.053555780308049e-05,
"loss": 1.0469,
"step": 208
},
{
"epoch": 1.9904761904761905,
"grad_norm": 0.30952011756676745,
"learning_rate": 6.03066320135536e-05,
"loss": 1.0457,
"step": 209
},
{
"epoch": 2.0,
"grad_norm": 0.5189001553144519,
"learning_rate": 6.0076806618647545e-05,
"loss": 1.0549,
"step": 210
},
{
"epoch": 2.0095238095238095,
"grad_norm": 0.7439078466919193,
"learning_rate": 5.984609179987155e-05,
"loss": 1.0304,
"step": 211
},
{
"epoch": 2.019047619047619,
"grad_norm": 0.8444989260384058,
"learning_rate": 5.961449777813727e-05,
"loss": 1.0483,
"step": 212
},
{
"epoch": 2.0285714285714285,
"grad_norm": 0.85105512066596,
"learning_rate": 5.9382034813306014e-05,
"loss": 1.0192,
"step": 213
},
{
"epoch": 2.038095238095238,
"grad_norm": 0.8717088777508358,
"learning_rate": 5.914871320373417e-05,
"loss": 1.0262,
"step": 214
},
{
"epoch": 2.0476190476190474,
"grad_norm": 0.8473077460098071,
"learning_rate": 5.891454328581702e-05,
"loss": 1.0354,
"step": 215
},
{
"epoch": 2.057142857142857,
"grad_norm": 0.7459552028603442,
"learning_rate": 5.8679535433530756e-05,
"loss": 1.0151,
"step": 216
},
{
"epoch": 2.066666666666667,
"grad_norm": 0.6103690018974712,
"learning_rate": 5.844370005797304e-05,
"loss": 1.0169,
"step": 217
},
{
"epoch": 2.0761904761904764,
"grad_norm": 0.4922703155453895,
"learning_rate": 5.820704760690161e-05,
"loss": 1.0189,
"step": 218
},
{
"epoch": 2.085714285714286,
"grad_norm": 0.5537849831993843,
"learning_rate": 5.796958856427155e-05,
"loss": 1.0177,
"step": 219
},
{
"epoch": 2.0952380952380953,
"grad_norm": 0.5404302742305147,
"learning_rate": 5.7731333449770833e-05,
"loss": 1.0095,
"step": 220
},
{
"epoch": 2.104761904761905,
"grad_norm": 0.4534851743612291,
"learning_rate": 5.7492292818354224e-05,
"loss": 1.0187,
"step": 221
},
{
"epoch": 2.1142857142857143,
"grad_norm": 0.3065835944952437,
"learning_rate": 5.725247725977573e-05,
"loss": 1.0034,
"step": 222
},
{
"epoch": 2.123809523809524,
"grad_norm": 0.4746500125323341,
"learning_rate": 5.7011897398119486e-05,
"loss": 1.0156,
"step": 223
},
{
"epoch": 2.1333333333333333,
"grad_norm": 0.5364964068164462,
"learning_rate": 5.6770563891329e-05,
"loss": 1.023,
"step": 224
},
{
"epoch": 2.142857142857143,
"grad_norm": 0.39435975272411355,
"learning_rate": 5.652848743073513e-05,
"loss": 1.0161,
"step": 225
},
{
"epoch": 2.1523809523809523,
"grad_norm": 0.2917840449905482,
"learning_rate": 5.628567874058235e-05,
"loss": 1.0176,
"step": 226
},
{
"epoch": 2.1619047619047618,
"grad_norm": 0.4309558938275066,
"learning_rate": 5.6042148577553665e-05,
"loss": 1.0189,
"step": 227
},
{
"epoch": 2.1714285714285713,
"grad_norm": 0.4049822234252623,
"learning_rate": 5.5797907730294123e-05,
"loss": 1.0079,
"step": 228
},
{
"epoch": 2.1809523809523808,
"grad_norm": 0.2826706608436985,
"learning_rate": 5.555296701893284e-05,
"loss": 1.0161,
"step": 229
},
{
"epoch": 2.1904761904761907,
"grad_norm": 0.4502126269593606,
"learning_rate": 5.5307337294603595e-05,
"loss": 1.0196,
"step": 230
},
{
"epoch": 2.2,
"grad_norm": 0.41114563221017636,
"learning_rate": 5.506102943896426e-05,
"loss": 1.016,
"step": 231
},
{
"epoch": 2.2095238095238097,
"grad_norm": 0.313503216833648,
"learning_rate": 5.481405436371459e-05,
"loss": 1.0089,
"step": 232
},
{
"epoch": 2.219047619047619,
"grad_norm": 0.36100525990234567,
"learning_rate": 5.45664230101129e-05,
"loss": 1.0031,
"step": 233
},
{
"epoch": 2.2285714285714286,
"grad_norm": 0.3757564707600635,
"learning_rate": 5.431814634849131e-05,
"loss": 1.0144,
"step": 234
},
{
"epoch": 2.238095238095238,
"grad_norm": 0.3578831855885671,
"learning_rate": 5.40692353777698e-05,
"loss": 1.0291,
"step": 235
},
{
"epoch": 2.2476190476190476,
"grad_norm": 0.279381430603173,
"learning_rate": 5.38197011249689e-05,
"loss": 1.0038,
"step": 236
},
{
"epoch": 2.257142857142857,
"grad_norm": 0.31128636254168296,
"learning_rate": 5.356955464472121e-05,
"loss": 1.0132,
"step": 237
},
{
"epoch": 2.2666666666666666,
"grad_norm": 0.28395461762897617,
"learning_rate": 5.331880701878165e-05,
"loss": 1.0116,
"step": 238
},
{
"epoch": 2.276190476190476,
"grad_norm": 0.27391953654101664,
"learning_rate": 5.3067469355536525e-05,
"loss": 1.0183,
"step": 239
},
{
"epoch": 2.2857142857142856,
"grad_norm": 0.22816906449119856,
"learning_rate": 5.2815552789511426e-05,
"loss": 1.0031,
"step": 240
},
{
"epoch": 2.295238095238095,
"grad_norm": 0.2370061459930583,
"learning_rate": 5.256306848087796e-05,
"loss": 1.0243,
"step": 241
},
{
"epoch": 2.3047619047619046,
"grad_norm": 0.2679568465187226,
"learning_rate": 5.2310027614959316e-05,
"loss": 1.003,
"step": 242
},
{
"epoch": 2.314285714285714,
"grad_norm": 0.26159769690103984,
"learning_rate": 5.20564414017348e-05,
"loss": 1.0286,
"step": 243
},
{
"epoch": 2.323809523809524,
"grad_norm": 0.2277499325426611,
"learning_rate": 5.1802321075343135e-05,
"loss": 1.0158,
"step": 244
},
{
"epoch": 2.3333333333333335,
"grad_norm": 0.2765055373878013,
"learning_rate": 5.1547677893584846e-05,
"loss": 1.0176,
"step": 245
},
{
"epoch": 2.342857142857143,
"grad_norm": 0.2707671528392851,
"learning_rate": 5.129252313742353e-05,
"loss": 1.012,
"step": 246
},
{
"epoch": 2.3523809523809525,
"grad_norm": 0.24663987368382112,
"learning_rate": 5.103686811048603e-05,
"loss": 1.005,
"step": 247
},
{
"epoch": 2.361904761904762,
"grad_norm": 0.3119601934853684,
"learning_rate": 5.078072413856174e-05,
"loss": 0.9982,
"step": 248
},
{
"epoch": 2.3714285714285714,
"grad_norm": 0.28361246368044496,
"learning_rate": 5.052410256910085e-05,
"loss": 1.0106,
"step": 249
},
{
"epoch": 2.380952380952381,
"grad_norm": 0.210984716029959,
"learning_rate": 5.026701477071161e-05,
"loss": 1.0249,
"step": 250
},
{
"epoch": 2.3904761904761904,
"grad_norm": 0.2639314428580076,
"learning_rate": 5.00094721326567e-05,
"loss": 1.0133,
"step": 251
},
{
"epoch": 2.4,
"grad_norm": 0.2077340280233697,
"learning_rate": 4.9751486064348695e-05,
"loss": 1.0166,
"step": 252
},
{
"epoch": 2.4095238095238094,
"grad_norm": 0.20906067378182863,
"learning_rate": 4.9493067994844606e-05,
"loss": 1.0017,
"step": 253
},
{
"epoch": 2.419047619047619,
"grad_norm": 0.31463242576457345,
"learning_rate": 4.9234229372339525e-05,
"loss": 1.0167,
"step": 254
},
{
"epoch": 2.4285714285714284,
"grad_norm": 0.23613151956098302,
"learning_rate": 4.897498166365953e-05,
"loss": 1.0148,
"step": 255
},
{
"epoch": 2.4380952380952383,
"grad_norm": 0.2809994241715782,
"learning_rate": 4.8715336353753616e-05,
"loss": 1.0168,
"step": 256
},
{
"epoch": 2.447619047619048,
"grad_norm": 0.28140513046492216,
"learning_rate": 4.845530494518498e-05,
"loss": 1.0174,
"step": 257
},
{
"epoch": 2.4571428571428573,
"grad_norm": 0.26072644777277915,
"learning_rate": 4.819489895762135e-05,
"loss": 1.003,
"step": 258
},
{
"epoch": 2.466666666666667,
"grad_norm": 0.24673982217299864,
"learning_rate": 4.7934129927324717e-05,
"loss": 1.0098,
"step": 259
},
{
"epoch": 2.4761904761904763,
"grad_norm": 0.2336437136943171,
"learning_rate": 4.7673009406640264e-05,
"loss": 1.0186,
"step": 260
},
{
"epoch": 2.4857142857142858,
"grad_norm": 0.29231870944215277,
"learning_rate": 4.741154896348458e-05,
"loss": 1.0068,
"step": 261
},
{
"epoch": 2.4952380952380953,
"grad_norm": 0.2911376621948982,
"learning_rate": 4.714976018083315e-05,
"loss": 1.0224,
"step": 262
},
{
"epoch": 2.5047619047619047,
"grad_norm": 0.3338692874924189,
"learning_rate": 4.6887654656207255e-05,
"loss": 1.0195,
"step": 263
},
{
"epoch": 2.5142857142857142,
"grad_norm": 0.2541097184148386,
"learning_rate": 4.66252440011602e-05,
"loss": 1.003,
"step": 264
},
{
"epoch": 2.5238095238095237,
"grad_norm": 0.24770604612278468,
"learning_rate": 4.6362539840762886e-05,
"loss": 1.0223,
"step": 265
},
{
"epoch": 2.533333333333333,
"grad_norm": 0.2191655449635504,
"learning_rate": 4.60995538130888e-05,
"loss": 1.0145,
"step": 266
},
{
"epoch": 2.5428571428571427,
"grad_norm": 0.2790975433249823,
"learning_rate": 4.5836297568698475e-05,
"loss": 1.0126,
"step": 267
},
{
"epoch": 2.552380952380952,
"grad_norm": 0.28428156686414346,
"learning_rate": 4.557278277012329e-05,
"loss": 1.0295,
"step": 268
},
{
"epoch": 2.5619047619047617,
"grad_norm": 0.3233959922060247,
"learning_rate": 4.5309021091348885e-05,
"loss": 1.0174,
"step": 269
},
{
"epoch": 2.571428571428571,
"grad_norm": 0.2956250139389734,
"learning_rate": 4.504502421729795e-05,
"loss": 1.0215,
"step": 270
},
{
"epoch": 2.580952380952381,
"grad_norm": 0.2009944046883891,
"learning_rate": 4.478080384331255e-05,
"loss": 1.011,
"step": 271
},
{
"epoch": 2.5904761904761906,
"grad_norm": 0.18530962666368314,
"learning_rate": 4.4516371674636074e-05,
"loss": 1.0181,
"step": 272
},
{
"epoch": 2.6,
"grad_norm": 0.19572463370442308,
"learning_rate": 4.425173942589462e-05,
"loss": 1.0222,
"step": 273
},
{
"epoch": 2.6095238095238096,
"grad_norm": 0.20133706322572675,
"learning_rate": 4.398691882057804e-05,
"loss": 1.0238,
"step": 274
},
{
"epoch": 2.619047619047619,
"grad_norm": 0.1696896722933609,
"learning_rate": 4.372192159052058e-05,
"loss": 1.0121,
"step": 275
},
{
"epoch": 2.6285714285714286,
"grad_norm": 0.5599358185228015,
"learning_rate": 4.3456759475381183e-05,
"loss": 1.0468,
"step": 276
},
{
"epoch": 2.638095238095238,
"grad_norm": 0.20466020884323685,
"learning_rate": 4.3191444222123326e-05,
"loss": 1.0211,
"step": 277
},
{
"epoch": 2.6476190476190475,
"grad_norm": 0.18418573847215977,
"learning_rate": 4.2925987584494706e-05,
"loss": 1.013,
"step": 278
},
{
"epoch": 2.657142857142857,
"grad_norm": 0.20944834576237945,
"learning_rate": 4.26604013225065e-05,
"loss": 1.0165,
"step": 279
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.17888267966598845,
"learning_rate": 4.239469720191234e-05,
"loss": 1.0285,
"step": 280
},
{
"epoch": 2.6761904761904765,
"grad_norm": 0.24111408627710695,
"learning_rate": 4.2128886993687145e-05,
"loss": 1.011,
"step": 281
},
{
"epoch": 2.685714285714286,
"grad_norm": 0.25283589971223985,
"learning_rate": 4.186298247350567e-05,
"loss": 1.0021,
"step": 282
},
{
"epoch": 2.6952380952380954,
"grad_norm": 0.19532012129248275,
"learning_rate": 4.159699542122071e-05,
"loss": 1.0113,
"step": 283
},
{
"epoch": 2.704761904761905,
"grad_norm": 0.2297669069644532,
"learning_rate": 4.133093762034137e-05,
"loss": 1.0032,
"step": 284
},
{
"epoch": 2.7142857142857144,
"grad_norm": 0.2465633017960609,
"learning_rate": 4.1064820857511e-05,
"loss": 0.9943,
"step": 285
},
{
"epoch": 2.723809523809524,
"grad_norm": 0.24245129662503742,
"learning_rate": 4.079865692198499e-05,
"loss": 1.0164,
"step": 286
},
{
"epoch": 2.7333333333333334,
"grad_norm": 0.22833439764975844,
"learning_rate": 4.053245760510856e-05,
"loss": 1.0161,
"step": 287
},
{
"epoch": 2.742857142857143,
"grad_norm": 0.22230175915043607,
"learning_rate": 4.026623469979436e-05,
"loss": 1.0091,
"step": 288
},
{
"epoch": 2.7523809523809524,
"grad_norm": 0.2781762656155117,
"learning_rate": 4e-05,
"loss": 1.0017,
"step": 289
},
{
"epoch": 2.761904761904762,
"grad_norm": 0.2864072028925091,
"learning_rate": 3.9733765300205654e-05,
"loss": 1.0075,
"step": 290
},
{
"epoch": 2.7714285714285714,
"grad_norm": 0.22898372701526665,
"learning_rate": 3.946754239489146e-05,
"loss": 1.0122,
"step": 291
},
{
"epoch": 2.780952380952381,
"grad_norm": 0.21256356082358066,
"learning_rate": 3.9201343078015026e-05,
"loss": 1.008,
"step": 292
},
{
"epoch": 2.7904761904761903,
"grad_norm": 0.2818845650719653,
"learning_rate": 3.8935179142489016e-05,
"loss": 0.9986,
"step": 293
},
{
"epoch": 2.8,
"grad_norm": 0.23141186471770828,
"learning_rate": 3.866906237965865e-05,
"loss": 1.0165,
"step": 294
},
{
"epoch": 2.8095238095238093,
"grad_norm": 0.23784103111468072,
"learning_rate": 3.840300457877931e-05,
"loss": 1.0226,
"step": 295
},
{
"epoch": 2.819047619047619,
"grad_norm": 0.20078316045157482,
"learning_rate": 3.813701752649435e-05,
"loss": 1.0015,
"step": 296
},
{
"epoch": 2.8285714285714287,
"grad_norm": 0.24338393846551132,
"learning_rate": 3.787111300631287e-05,
"loss": 1.0067,
"step": 297
},
{
"epoch": 2.8380952380952382,
"grad_norm": 0.20074428570812403,
"learning_rate": 3.7605302798087686e-05,
"loss": 1.0122,
"step": 298
},
{
"epoch": 2.8476190476190477,
"grad_norm": 0.20714128390839137,
"learning_rate": 3.7339598677493515e-05,
"loss": 1.0035,
"step": 299
},
{
"epoch": 2.857142857142857,
"grad_norm": 0.17387463464530994,
"learning_rate": 3.7074012415505294e-05,
"loss": 1.0106,
"step": 300
},
{
"epoch": 2.8666666666666667,
"grad_norm": 0.19763448671477987,
"learning_rate": 3.6808555777876673e-05,
"loss": 1.0071,
"step": 301
},
{
"epoch": 2.876190476190476,
"grad_norm": 0.193551861844252,
"learning_rate": 3.654324052461883e-05,
"loss": 1.0149,
"step": 302
},
{
"epoch": 2.8857142857142857,
"grad_norm": 0.20465104040557303,
"learning_rate": 3.6278078409479424e-05,
"loss": 1.0132,
"step": 303
},
{
"epoch": 2.895238095238095,
"grad_norm": 0.1858868750629034,
"learning_rate": 3.6013081179421965e-05,
"loss": 1.0113,
"step": 304
},
{
"epoch": 2.9047619047619047,
"grad_norm": 0.19231783181407783,
"learning_rate": 3.5748260574105395e-05,
"loss": 1.0223,
"step": 305
},
{
"epoch": 2.914285714285714,
"grad_norm": 0.20811259594067136,
"learning_rate": 3.548362832536393e-05,
"loss": 1.031,
"step": 306
},
{
"epoch": 2.923809523809524,
"grad_norm": 0.17374784949143135,
"learning_rate": 3.5219196156687454e-05,
"loss": 1.0143,
"step": 307
},
{
"epoch": 2.9333333333333336,
"grad_norm": 0.18941317471635122,
"learning_rate": 3.495497578270206e-05,
"loss": 1.0067,
"step": 308
},
{
"epoch": 2.942857142857143,
"grad_norm": 0.2175969420603483,
"learning_rate": 3.469097890865113e-05,
"loss": 1.0149,
"step": 309
},
{
"epoch": 2.9523809523809526,
"grad_norm": 0.20305721352150802,
"learning_rate": 3.442721722987673e-05,
"loss": 1.0148,
"step": 310
},
{
"epoch": 2.961904761904762,
"grad_norm": 0.1735412980023502,
"learning_rate": 3.416370243130154e-05,
"loss": 1.0308,
"step": 311
},
{
"epoch": 2.9714285714285715,
"grad_norm": 0.23287418335200405,
"learning_rate": 3.390044618691121e-05,
"loss": 1.0181,
"step": 312
},
{
"epoch": 2.980952380952381,
"grad_norm": 0.1731215573089734,
"learning_rate": 3.363746015923713e-05,
"loss": 1.0141,
"step": 313
},
{
"epoch": 2.9904761904761905,
"grad_norm": 0.36664679839921727,
"learning_rate": 3.337475599883981e-05,
"loss": 1.0252,
"step": 314
},
{
"epoch": 3.0,
"grad_norm": 0.20808906149822945,
"learning_rate": 3.3112345343792765e-05,
"loss": 1.0094,
"step": 315
},
{
"epoch": 3.0095238095238095,
"grad_norm": 0.2386737684696705,
"learning_rate": 3.285023981916687e-05,
"loss": 0.985,
"step": 316
},
{
"epoch": 3.019047619047619,
"grad_norm": 0.21859428661149227,
"learning_rate": 3.2588451036515435e-05,
"loss": 0.9875,
"step": 317
},
{
"epoch": 3.0285714285714285,
"grad_norm": 0.21686946170692808,
"learning_rate": 3.2326990593359756e-05,
"loss": 0.9648,
"step": 318
},
{
"epoch": 3.038095238095238,
"grad_norm": 0.23585519844179503,
"learning_rate": 3.206587007267528e-05,
"loss": 0.9804,
"step": 319
},
{
"epoch": 3.0476190476190474,
"grad_norm": 0.23471049638102,
"learning_rate": 3.1805101042378665e-05,
"loss": 0.9764,
"step": 320
},
{
"epoch": 3.057142857142857,
"grad_norm": 0.24601337714676638,
"learning_rate": 3.154469505481503e-05,
"loss": 0.9787,
"step": 321
},
{
"epoch": 3.066666666666667,
"grad_norm": 0.24680145517708438,
"learning_rate": 3.128466364624638e-05,
"loss": 0.9751,
"step": 322
},
{
"epoch": 3.0761904761904764,
"grad_norm": 0.20266157663662596,
"learning_rate": 3.1025018336340484e-05,
"loss": 0.9786,
"step": 323
},
{
"epoch": 3.085714285714286,
"grad_norm": 0.23172246726109874,
"learning_rate": 3.076577062766049e-05,
"loss": 0.9797,
"step": 324
},
{
"epoch": 3.0952380952380953,
"grad_norm": 0.20530652182231443,
"learning_rate": 3.0506932005155407e-05,
"loss": 0.9776,
"step": 325
},
{
"epoch": 3.104761904761905,
"grad_norm": 0.19541728346574888,
"learning_rate": 3.024851393565132e-05,
"loss": 0.9747,
"step": 326
},
{
"epoch": 3.1142857142857143,
"grad_norm": 0.24999279824271964,
"learning_rate": 2.999052786734331e-05,
"loss": 0.988,
"step": 327
},
{
"epoch": 3.123809523809524,
"grad_norm": 0.22057462842837305,
"learning_rate": 2.9732985229288397e-05,
"loss": 0.9758,
"step": 328
},
{
"epoch": 3.1333333333333333,
"grad_norm": 0.21976681898897724,
"learning_rate": 2.9475897430899157e-05,
"loss": 0.9778,
"step": 329
},
{
"epoch": 3.142857142857143,
"grad_norm": 0.22563434482551034,
"learning_rate": 2.921927586143827e-05,
"loss": 0.989,
"step": 330
},
{
"epoch": 3.1523809523809523,
"grad_norm": 0.1951899724158065,
"learning_rate": 2.8963131889513986e-05,
"loss": 0.9723,
"step": 331
},
{
"epoch": 3.1619047619047618,
"grad_norm": 0.31930450117417375,
"learning_rate": 2.870747686257649e-05,
"loss": 0.9743,
"step": 332
},
{
"epoch": 3.1714285714285713,
"grad_norm": 0.24347478004659553,
"learning_rate": 2.845232210641517e-05,
"loss": 0.9658,
"step": 333
},
{
"epoch": 3.1809523809523808,
"grad_norm": 0.22607819816932653,
"learning_rate": 2.8197678924656886e-05,
"loss": 0.9666,
"step": 334
},
{
"epoch": 3.1904761904761907,
"grad_norm": 0.26083022022974084,
"learning_rate": 2.7943558598265218e-05,
"loss": 0.9875,
"step": 335
},
{
"epoch": 3.2,
"grad_norm": 0.20430972542099413,
"learning_rate": 2.7689972385040697e-05,
"loss": 0.9817,
"step": 336
},
{
"epoch": 3.2095238095238097,
"grad_norm": 0.2478055729854012,
"learning_rate": 2.743693151912206e-05,
"loss": 0.9833,
"step": 337
},
{
"epoch": 3.219047619047619,
"grad_norm": 0.20670211431468252,
"learning_rate": 2.718444721048859e-05,
"loss": 0.9768,
"step": 338
},
{
"epoch": 3.2285714285714286,
"grad_norm": 0.258133896429674,
"learning_rate": 2.693253064446348e-05,
"loss": 0.9663,
"step": 339
},
{
"epoch": 3.238095238095238,
"grad_norm": 0.22888650123481227,
"learning_rate": 2.6681192981218348e-05,
"loss": 0.9815,
"step": 340
},
{
"epoch": 3.2476190476190476,
"grad_norm": 0.2052454770699904,
"learning_rate": 2.6430445355278788e-05,
"loss": 0.9752,
"step": 341
},
{
"epoch": 3.257142857142857,
"grad_norm": 0.2161754437788185,
"learning_rate": 2.6180298875031098e-05,
"loss": 0.9688,
"step": 342
},
{
"epoch": 3.2666666666666666,
"grad_norm": 0.2389481202074638,
"learning_rate": 2.59307646222302e-05,
"loss": 0.9903,
"step": 343
},
{
"epoch": 3.276190476190476,
"grad_norm": 0.1688931879244891,
"learning_rate": 2.5681853651508704e-05,
"loss": 0.9906,
"step": 344
},
{
"epoch": 3.2857142857142856,
"grad_norm": 0.2709186008027029,
"learning_rate": 2.5433576989887115e-05,
"loss": 0.9634,
"step": 345
},
{
"epoch": 3.295238095238095,
"grad_norm": 0.17314495292558174,
"learning_rate": 2.5185945636285416e-05,
"loss": 0.9805,
"step": 346
},
{
"epoch": 3.3047619047619046,
"grad_norm": 0.226865700017327,
"learning_rate": 2.4938970561035753e-05,
"loss": 0.9745,
"step": 347
},
{
"epoch": 3.314285714285714,
"grad_norm": 0.22027328808623828,
"learning_rate": 2.4692662705396412e-05,
"loss": 0.9808,
"step": 348
},
{
"epoch": 3.323809523809524,
"grad_norm": 0.21347101273276106,
"learning_rate": 2.444703298106718e-05,
"loss": 0.9742,
"step": 349
},
{
"epoch": 3.3333333333333335,
"grad_norm": 0.6099297681729771,
"learning_rate": 2.420209226970588e-05,
"loss": 1.0004,
"step": 350
},
{
"epoch": 3.342857142857143,
"grad_norm": 0.18262102730707186,
"learning_rate": 2.395785142244634e-05,
"loss": 0.9785,
"step": 351
},
{
"epoch": 3.3523809523809525,
"grad_norm": 0.19086404836286397,
"learning_rate": 2.3714321259417662e-05,
"loss": 0.9784,
"step": 352
},
{
"epoch": 3.361904761904762,
"grad_norm": 0.2101963610417534,
"learning_rate": 2.3471512569264884e-05,
"loss": 0.9703,
"step": 353
},
{
"epoch": 3.3714285714285714,
"grad_norm": 0.20089915550579932,
"learning_rate": 2.3229436108671014e-05,
"loss": 0.9753,
"step": 354
},
{
"epoch": 3.380952380952381,
"grad_norm": 0.17710318859904475,
"learning_rate": 2.298810260188054e-05,
"loss": 0.9717,
"step": 355
},
{
"epoch": 3.3904761904761904,
"grad_norm": 0.1505659891122774,
"learning_rate": 2.274752274022428e-05,
"loss": 0.9778,
"step": 356
},
{
"epoch": 3.4,
"grad_norm": 0.18038983533749448,
"learning_rate": 2.250770718164579e-05,
"loss": 0.9854,
"step": 357
},
{
"epoch": 3.4095238095238094,
"grad_norm": 0.18174732799541482,
"learning_rate": 2.2268666550229173e-05,
"loss": 0.9787,
"step": 358
},
{
"epoch": 3.419047619047619,
"grad_norm": 0.15466454580054184,
"learning_rate": 2.203041143572845e-05,
"loss": 0.9771,
"step": 359
},
{
"epoch": 3.4285714285714284,
"grad_norm": 0.17410077565262463,
"learning_rate": 2.1792952393098394e-05,
"loss": 0.9876,
"step": 360
},
{
"epoch": 3.4380952380952383,
"grad_norm": 0.17696206014651153,
"learning_rate": 2.155629994202696e-05,
"loss": 0.9844,
"step": 361
},
{
"epoch": 3.447619047619048,
"grad_norm": 0.15599328687176803,
"learning_rate": 2.1320464566469233e-05,
"loss": 0.9827,
"step": 362
},
{
"epoch": 3.4571428571428573,
"grad_norm": 0.18924852148069596,
"learning_rate": 2.1085456714183002e-05,
"loss": 0.9737,
"step": 363
},
{
"epoch": 3.466666666666667,
"grad_norm": 0.21383476826326972,
"learning_rate": 2.0851286796265838e-05,
"loss": 0.9817,
"step": 364
},
{
"epoch": 3.4761904761904763,
"grad_norm": 0.17321070429571578,
"learning_rate": 2.0617965186694e-05,
"loss": 0.984,
"step": 365
},
{
"epoch": 3.4857142857142858,
"grad_norm": 0.17184638099703334,
"learning_rate": 2.0385502221862742e-05,
"loss": 0.9878,
"step": 366
},
{
"epoch": 3.4952380952380953,
"grad_norm": 0.17908426969585328,
"learning_rate": 2.015390820012847e-05,
"loss": 0.9842,
"step": 367
},
{
"epoch": 3.5047619047619047,
"grad_norm": 0.1640535311292707,
"learning_rate": 1.9923193381352468e-05,
"loss": 0.9641,
"step": 368
},
{
"epoch": 3.5142857142857142,
"grad_norm": 0.16868052713135487,
"learning_rate": 1.9693367986446415e-05,
"loss": 0.9616,
"step": 369
},
{
"epoch": 3.5238095238095237,
"grad_norm": 0.14316347468811785,
"learning_rate": 1.9464442196919525e-05,
"loss": 0.9854,
"step": 370
},
{
"epoch": 3.533333333333333,
"grad_norm": 0.13603829907464166,
"learning_rate": 1.9236426154427583e-05,
"loss": 0.9741,
"step": 371
},
{
"epoch": 3.5428571428571427,
"grad_norm": 0.13833016109922058,
"learning_rate": 1.9009329960323594e-05,
"loss": 0.9911,
"step": 372
},
{
"epoch": 3.552380952380952,
"grad_norm": 0.14395757212272578,
"learning_rate": 1.8783163675210307e-05,
"loss": 0.9837,
"step": 373
},
{
"epoch": 3.5619047619047617,
"grad_norm": 0.16846410866461603,
"learning_rate": 1.8557937318494507e-05,
"loss": 0.9744,
"step": 374
},
{
"epoch": 3.571428571428571,
"grad_norm": 0.1523793105601579,
"learning_rate": 1.8333660867943163e-05,
"loss": 0.971,
"step": 375
},
{
"epoch": 3.580952380952381,
"grad_norm": 0.13785660562080743,
"learning_rate": 1.8110344259241398e-05,
"loss": 0.9766,
"step": 376
},
{
"epoch": 3.5904761904761906,
"grad_norm": 0.1325642554414296,
"learning_rate": 1.7887997385552278e-05,
"loss": 0.9649,
"step": 377
},
{
"epoch": 3.6,
"grad_norm": 0.15775432838130118,
"learning_rate": 1.766663009707861e-05,
"loss": 0.9689,
"step": 378
},
{
"epoch": 3.6095238095238096,
"grad_norm": 0.12446439908125384,
"learning_rate": 1.7446252200626555e-05,
"loss": 0.9722,
"step": 379
},
{
"epoch": 3.619047619047619,
"grad_norm": 0.13869302526796073,
"learning_rate": 1.7226873459171142e-05,
"loss": 0.9651,
"step": 380
},
{
"epoch": 3.6285714285714286,
"grad_norm": 0.12813837269442743,
"learning_rate": 1.700850359142373e-05,
"loss": 0.9659,
"step": 381
},
{
"epoch": 3.638095238095238,
"grad_norm": 0.13933232700434076,
"learning_rate": 1.679115227140155e-05,
"loss": 0.9731,
"step": 382
},
{
"epoch": 3.6476190476190475,
"grad_norm": 0.14485112162942804,
"learning_rate": 1.6574829127999067e-05,
"loss": 0.9854,
"step": 383
},
{
"epoch": 3.657142857142857,
"grad_norm": 0.1390597866481306,
"learning_rate": 1.6359543744561438e-05,
"loss": 0.9706,
"step": 384
},
{
"epoch": 3.6666666666666665,
"grad_norm": 0.14156616905744906,
"learning_rate": 1.614530565845994e-05,
"loss": 0.9741,
"step": 385
},
{
"epoch": 3.6761904761904765,
"grad_norm": 0.14789503098270715,
"learning_rate": 1.5932124360669473e-05,
"loss": 0.9792,
"step": 386
},
{
"epoch": 3.685714285714286,
"grad_norm": 0.14078688051407673,
"learning_rate": 1.5720009295348103e-05,
"loss": 0.9805,
"step": 387
},
{
"epoch": 3.6952380952380954,
"grad_norm": 0.149662490393232,
"learning_rate": 1.5508969859418617e-05,
"loss": 0.9708,
"step": 388
},
{
"epoch": 3.704761904761905,
"grad_norm": 0.15565972406127485,
"learning_rate": 1.529901540215233e-05,
"loss": 0.9907,
"step": 389
},
{
"epoch": 3.7142857142857144,
"grad_norm": 0.14457925539378302,
"learning_rate": 1.5090155224754823e-05,
"loss": 0.972,
"step": 390
},
{
"epoch": 3.723809523809524,
"grad_norm": 0.15980853988420965,
"learning_rate": 1.4882398579953928e-05,
"loss": 0.9858,
"step": 391
},
{
"epoch": 3.7333333333333334,
"grad_norm": 0.1353513800841828,
"learning_rate": 1.4675754671589801e-05,
"loss": 0.9843,
"step": 392
},
{
"epoch": 3.742857142857143,
"grad_norm": 0.13602936334115348,
"learning_rate": 1.4470232654207208e-05,
"loss": 0.9819,
"step": 393
},
{
"epoch": 3.7523809523809524,
"grad_norm": 0.14448055061579274,
"learning_rate": 1.4265841632649915e-05,
"loss": 0.9722,
"step": 394
},
{
"epoch": 3.761904761904762,
"grad_norm": 0.13590836391844754,
"learning_rate": 1.40625906616574e-05,
"loss": 0.9658,
"step": 395
},
{
"epoch": 3.7714285714285714,
"grad_norm": 0.1435455827754151,
"learning_rate": 1.3860488745463694e-05,
"loss": 0.9794,
"step": 396
},
{
"epoch": 3.780952380952381,
"grad_norm": 0.1397532798524967,
"learning_rate": 1.365954483739846e-05,
"loss": 0.9829,
"step": 397
},
{
"epoch": 3.7904761904761903,
"grad_norm": 0.12978394192415188,
"learning_rate": 1.3459767839490386e-05,
"loss": 0.9721,
"step": 398
},
{
"epoch": 3.8,
"grad_norm": 0.14356015101687936,
"learning_rate": 1.326116660207279e-05,
"loss": 0.9835,
"step": 399
},
{
"epoch": 3.8095238095238093,
"grad_norm": 0.13247764252770453,
"learning_rate": 1.3063749923391557e-05,
"loss": 0.9765,
"step": 400
},
{
"epoch": 3.819047619047619,
"grad_norm": 0.13968884823844,
"learning_rate": 1.2867526549215356e-05,
"loss": 0.9806,
"step": 401
},
{
"epoch": 3.8285714285714287,
"grad_norm": 0.15020881590521024,
"learning_rate": 1.2672505172448201e-05,
"loss": 0.9743,
"step": 402
},
{
"epoch": 3.8380952380952382,
"grad_norm": 0.12600562838248064,
"learning_rate": 1.2478694432744342e-05,
"loss": 0.9747,
"step": 403
},
{
"epoch": 3.8476190476190477,
"grad_norm": 0.1253894515680319,
"learning_rate": 1.2286102916125534e-05,
"loss": 0.9734,
"step": 404
},
{
"epoch": 3.857142857142857,
"grad_norm": 0.14320490323799898,
"learning_rate": 1.2094739154600616e-05,
"loss": 0.9816,
"step": 405
},
{
"epoch": 3.8666666666666667,
"grad_norm": 0.12399745336033988,
"learning_rate": 1.1904611625787612e-05,
"loss": 0.9884,
"step": 406
},
{
"epoch": 3.876190476190476,
"grad_norm": 0.13417523628822833,
"learning_rate": 1.1715728752538103e-05,
"loss": 0.9783,
"step": 407
},
{
"epoch": 3.8857142857142857,
"grad_norm": 0.15683701346116036,
"learning_rate": 1.1528098902564109e-05,
"loss": 0.9733,
"step": 408
},
{
"epoch": 3.895238095238095,
"grad_norm": 0.13713534522259704,
"learning_rate": 1.1341730388067393e-05,
"loss": 0.9818,
"step": 409
},
{
"epoch": 3.9047619047619047,
"grad_norm": 0.13758248343689114,
"learning_rate": 1.1156631465371213e-05,
"loss": 0.9688,
"step": 410
},
{
"epoch": 3.914285714285714,
"grad_norm": 0.14115308612974595,
"learning_rate": 1.0972810334554565e-05,
"loss": 0.9797,
"step": 411
},
{
"epoch": 3.923809523809524,
"grad_norm": 0.5870477835443715,
"learning_rate": 1.0790275139088879e-05,
"loss": 0.982,
"step": 412
},
{
"epoch": 3.9333333333333336,
"grad_norm": 0.12922641407686744,
"learning_rate": 1.0609033965477318e-05,
"loss": 0.9805,
"step": 413
},
{
"epoch": 3.942857142857143,
"grad_norm": 0.14422492668411402,
"learning_rate": 1.0429094842896484e-05,
"loss": 0.9795,
"step": 414
},
{
"epoch": 3.9523809523809526,
"grad_norm": 0.14945738902129274,
"learning_rate": 1.0250465742840743e-05,
"loss": 0.9736,
"step": 415
},
{
"epoch": 3.961904761904762,
"grad_norm": 0.12244500468761615,
"learning_rate": 1.007315457876907e-05,
"loss": 0.9811,
"step": 416
},
{
"epoch": 3.9714285714285715,
"grad_norm": 0.13445713568795264,
"learning_rate": 9.897169205754461e-06,
"loss": 0.9846,
"step": 417
},
{
"epoch": 3.980952380952381,
"grad_norm": 0.13743382078481892,
"learning_rate": 9.722517420135977e-06,
"loss": 0.9688,
"step": 418
},
{
"epoch": 3.9904761904761905,
"grad_norm": 0.1233003022413971,
"learning_rate": 9.549206959173331e-06,
"loss": 0.9845,
"step": 419
},
{
"epoch": 4.0,
"grad_norm": 0.1263829549351974,
"learning_rate": 9.377245500704135e-06,
"loss": 0.9928,
"step": 420
},
{
"epoch": 4.0095238095238095,
"grad_norm": 0.17390084497636504,
"learning_rate": 9.206640662803746e-06,
"loss": 0.9623,
"step": 421
},
{
"epoch": 4.019047619047619,
"grad_norm": 0.1431373191141913,
"learning_rate": 9.037400003447808e-06,
"loss": 0.9616,
"step": 422
},
{
"epoch": 4.0285714285714285,
"grad_norm": 0.13455551888638445,
"learning_rate": 8.869531020177367e-06,
"loss": 0.9475,
"step": 423
},
{
"epoch": 4.038095238095238,
"grad_norm": 0.14362146854953411,
"learning_rate": 8.703041149766797e-06,
"loss": 0.9541,
"step": 424
},
{
"epoch": 4.0476190476190474,
"grad_norm": 0.1452952254226326,
"learning_rate": 8.537937767894303e-06,
"loss": 0.9548,
"step": 425
},
{
"epoch": 4.057142857142857,
"grad_norm": 0.156138958795091,
"learning_rate": 8.374228188815157e-06,
"loss": 0.9597,
"step": 426
},
{
"epoch": 4.066666666666666,
"grad_norm": 0.15807376487720612,
"learning_rate": 8.211919665037697e-06,
"loss": 0.944,
"step": 427
},
{
"epoch": 4.076190476190476,
"grad_norm": 0.1428349305587997,
"learning_rate": 8.051019387002035e-06,
"loss": 0.957,
"step": 428
},
{
"epoch": 4.085714285714285,
"grad_norm": 0.156467940239568,
"learning_rate": 7.891534482761463e-06,
"loss": 0.9519,
"step": 429
},
{
"epoch": 4.095238095238095,
"grad_norm": 0.15091456328015054,
"learning_rate": 7.733472017666739e-06,
"loss": 0.9579,
"step": 430
},
{
"epoch": 4.104761904761904,
"grad_norm": 0.15455184255548396,
"learning_rate": 7.57683899405305e-06,
"loss": 0.9614,
"step": 431
},
{
"epoch": 4.114285714285714,
"grad_norm": 0.23250611326493095,
"learning_rate": 7.42164235092981e-06,
"loss": 0.9777,
"step": 432
},
{
"epoch": 4.123809523809523,
"grad_norm": 0.15146330742839126,
"learning_rate": 7.26788896367324e-06,
"loss": 0.9519,
"step": 433
},
{
"epoch": 4.133333333333334,
"grad_norm": 0.14062980079515083,
"learning_rate": 7.115585643721798e-06,
"loss": 0.961,
"step": 434
},
{
"epoch": 4.142857142857143,
"grad_norm": 0.13654994504554915,
"learning_rate": 6.964739138274433e-06,
"loss": 0.9638,
"step": 435
},
{
"epoch": 4.152380952380953,
"grad_norm": 0.13405718763219662,
"learning_rate": 6.815356129991624e-06,
"loss": 0.9523,
"step": 436
},
{
"epoch": 4.161904761904762,
"grad_norm": 0.1355347721482518,
"learning_rate": 6.667443236699398e-06,
"loss": 0.9572,
"step": 437
},
{
"epoch": 4.171428571428572,
"grad_norm": 0.3193405192419738,
"learning_rate": 6.521007011096143e-06,
"loss": 0.965,
"step": 438
},
{
"epoch": 4.180952380952381,
"grad_norm": 0.12077417580448366,
"learning_rate": 6.376053940462279e-06,
"loss": 0.959,
"step": 439
},
{
"epoch": 4.190476190476191,
"grad_norm": 0.11975543200056642,
"learning_rate": 6.232590446372864e-06,
"loss": 0.9463,
"step": 440
},
{
"epoch": 4.2,
"grad_norm": 0.12394898025158416,
"learning_rate": 6.090622884413164e-06,
"loss": 0.9599,
"step": 441
},
{
"epoch": 4.20952380952381,
"grad_norm": 0.13246928966220897,
"learning_rate": 5.95015754389705e-06,
"loss": 0.9587,
"step": 442
},
{
"epoch": 4.219047619047619,
"grad_norm": 0.17218887548328696,
"learning_rate": 5.811200647588386e-06,
"loss": 0.9483,
"step": 443
},
{
"epoch": 4.228571428571429,
"grad_norm": 0.12269437836537307,
"learning_rate": 5.673758351425358e-06,
"loss": 0.9652,
"step": 444
},
{
"epoch": 4.238095238095238,
"grad_norm": 0.1150641697364405,
"learning_rate": 5.537836744247753e-06,
"loss": 0.9481,
"step": 445
},
{
"epoch": 4.247619047619048,
"grad_norm": 0.12282362749817176,
"learning_rate": 5.403441847527227e-06,
"loss": 0.9565,
"step": 446
},
{
"epoch": 4.257142857142857,
"grad_norm": 0.12605101659916942,
"learning_rate": 5.270579615100518e-06,
"loss": 0.9476,
"step": 447
},
{
"epoch": 4.266666666666667,
"grad_norm": 0.12368987938067505,
"learning_rate": 5.139255932905731e-06,
"loss": 0.9538,
"step": 448
},
{
"epoch": 4.276190476190476,
"grad_norm": 0.11271067268917899,
"learning_rate": 5.009476618721549e-06,
"loss": 0.9776,
"step": 449
},
{
"epoch": 4.285714285714286,
"grad_norm": 0.1136633818810862,
"learning_rate": 4.881247421909523e-06,
"loss": 0.962,
"step": 450
},
{
"epoch": 4.295238095238095,
"grad_norm": 0.127022521830231,
"learning_rate": 4.754574023159335e-06,
"loss": 0.9574,
"step": 451
},
{
"epoch": 4.304761904761905,
"grad_norm": 0.11992208253243061,
"learning_rate": 4.629462034237193e-06,
"loss": 0.9606,
"step": 452
},
{
"epoch": 4.314285714285714,
"grad_norm": 0.11191987434816011,
"learning_rate": 4.505916997737143e-06,
"loss": 0.9489,
"step": 453
},
{
"epoch": 4.3238095238095235,
"grad_norm": 0.1183918815452489,
"learning_rate": 4.383944386835617e-06,
"loss": 0.9516,
"step": 454
},
{
"epoch": 4.333333333333333,
"grad_norm": 0.11408502654065866,
"learning_rate": 4.263549605048898e-06,
"loss": 0.9407,
"step": 455
},
{
"epoch": 4.3428571428571425,
"grad_norm": 0.11790658741334677,
"learning_rate": 4.144737985993774e-06,
"loss": 0.9539,
"step": 456
},
{
"epoch": 4.352380952380952,
"grad_norm": 0.12878580244190324,
"learning_rate": 4.027514793151235e-06,
"loss": 0.961,
"step": 457
},
{
"epoch": 4.3619047619047615,
"grad_norm": 0.12558845969705756,
"learning_rate": 3.9118852196333e-06,
"loss": 0.9681,
"step": 458
},
{
"epoch": 4.371428571428572,
"grad_norm": 0.11339758568248902,
"learning_rate": 3.7978543879529704e-06,
"loss": 0.9544,
"step": 459
},
{
"epoch": 4.380952380952381,
"grad_norm": 0.12502688051375377,
"learning_rate": 3.6854273497972705e-06,
"loss": 0.9573,
"step": 460
},
{
"epoch": 4.390476190476191,
"grad_norm": 0.10534529012828957,
"learning_rate": 3.574609085803471e-06,
"loss": 0.9369,
"step": 461
},
{
"epoch": 4.4,
"grad_norm": 0.10245960822897354,
"learning_rate": 3.4654045053384456e-06,
"loss": 0.9501,
"step": 462
},
{
"epoch": 4.40952380952381,
"grad_norm": 0.11767956608151667,
"learning_rate": 3.3578184462811714e-06,
"loss": 0.9564,
"step": 463
},
{
"epoch": 4.419047619047619,
"grad_norm": 0.12275952669713826,
"learning_rate": 3.2518556748083817e-06,
"loss": 0.9536,
"step": 464
},
{
"epoch": 4.428571428571429,
"grad_norm": 0.10820465034590014,
"learning_rate": 3.1475208851834815e-06,
"loss": 0.9736,
"step": 465
},
{
"epoch": 4.438095238095238,
"grad_norm": 0.10999235496173602,
"learning_rate": 3.0448186995485307e-06,
"loss": 0.9661,
"step": 466
},
{
"epoch": 4.447619047619048,
"grad_norm": 0.10346043147687037,
"learning_rate": 2.9437536677194976e-06,
"loss": 0.9567,
"step": 467
},
{
"epoch": 4.457142857142857,
"grad_norm": 0.10719225041373819,
"learning_rate": 2.844330266984705e-06,
"loss": 0.9456,
"step": 468
},
{
"epoch": 4.466666666666667,
"grad_norm": 0.10494961443465013,
"learning_rate": 2.746552901906463e-06,
"loss": 0.9639,
"step": 469
},
{
"epoch": 4.476190476190476,
"grad_norm": 0.10157859382678862,
"learning_rate": 2.650425904125964e-06,
"loss": 0.9562,
"step": 470
},
{
"epoch": 4.485714285714286,
"grad_norm": 0.09988194579990615,
"learning_rate": 2.55595353217136e-06,
"loss": 0.9488,
"step": 471
},
{
"epoch": 4.495238095238095,
"grad_norm": 0.09993285343775345,
"learning_rate": 2.463139971269133e-06,
"loss": 0.9599,
"step": 472
},
{
"epoch": 4.504761904761905,
"grad_norm": 0.09844176642750738,
"learning_rate": 2.371989333158671e-06,
"loss": 0.9615,
"step": 473
},
{
"epoch": 4.514285714285714,
"grad_norm": 0.10190707779708189,
"learning_rate": 2.2825056559101145e-06,
"loss": 0.9563,
"step": 474
},
{
"epoch": 4.523809523809524,
"grad_norm": 0.10131001370817404,
"learning_rate": 2.194692903745459e-06,
"loss": 0.9587,
"step": 475
},
{
"epoch": 4.533333333333333,
"grad_norm": 0.10251872435272494,
"learning_rate": 2.1085549668629567e-06,
"loss": 0.9489,
"step": 476
},
{
"epoch": 4.542857142857143,
"grad_norm": 0.10043227346155645,
"learning_rate": 2.0240956612647487e-06,
"loss": 0.9634,
"step": 477
},
{
"epoch": 4.552380952380952,
"grad_norm": 0.09956161225983029,
"learning_rate": 1.9413187285878355e-06,
"loss": 0.9468,
"step": 478
},
{
"epoch": 4.561904761904762,
"grad_norm": 0.11045616365540861,
"learning_rate": 1.8602278359383063e-06,
"loss": 0.9526,
"step": 479
},
{
"epoch": 4.571428571428571,
"grad_norm": 0.09834202928593194,
"learning_rate": 1.78082657572888e-06,
"loss": 0.9609,
"step": 480
},
{
"epoch": 4.580952380952381,
"grad_norm": 0.1147748598722428,
"learning_rate": 1.7031184655197818e-06,
"loss": 0.9412,
"step": 481
},
{
"epoch": 4.59047619047619,
"grad_norm": 0.10211354358709429,
"learning_rate": 1.6271069478628644e-06,
"loss": 0.9557,
"step": 482
},
{
"epoch": 4.6,
"grad_norm": 0.1146778979125764,
"learning_rate": 1.5527953901491466e-06,
"loss": 0.9468,
"step": 483
},
{
"epoch": 4.609523809523809,
"grad_norm": 0.10809368043798627,
"learning_rate": 1.48018708445961e-06,
"loss": 0.9693,
"step": 484
},
{
"epoch": 4.619047619047619,
"grad_norm": 0.09749254775799188,
"learning_rate": 1.409285247419363e-06,
"loss": 0.9524,
"step": 485
},
{
"epoch": 4.628571428571428,
"grad_norm": 0.09704895874324644,
"learning_rate": 1.3400930200551331e-06,
"loss": 0.9607,
"step": 486
},
{
"epoch": 4.638095238095238,
"grad_norm": 0.09581374811618217,
"learning_rate": 1.2726134676561385e-06,
"loss": 0.9747,
"step": 487
},
{
"epoch": 4.647619047619048,
"grad_norm": 0.09600971666081125,
"learning_rate": 1.2068495796382495e-06,
"loss": 0.9483,
"step": 488
},
{
"epoch": 4.6571428571428575,
"grad_norm": 0.10065951842357326,
"learning_rate": 1.142804269411606e-06,
"loss": 0.9652,
"step": 489
},
{
"epoch": 4.666666666666667,
"grad_norm": 0.1012630059377017,
"learning_rate": 1.0804803742515068e-06,
"loss": 0.9417,
"step": 490
},
{
"epoch": 4.6761904761904765,
"grad_norm": 0.10102648935759229,
"learning_rate": 1.0198806551727557e-06,
"loss": 0.9575,
"step": 491
},
{
"epoch": 4.685714285714286,
"grad_norm": 0.10109454230992841,
"learning_rate": 9.610077968072962e-07,
"loss": 0.966,
"step": 492
},
{
"epoch": 4.695238095238095,
"grad_norm": 0.09805229033614825,
"learning_rate": 9.038644072853331e-07,
"loss": 0.9671,
"step": 493
},
{
"epoch": 4.704761904761905,
"grad_norm": 0.09726857715744779,
"learning_rate": 8.484530181197504e-07,
"loss": 0.9477,
"step": 494
},
{
"epoch": 4.714285714285714,
"grad_norm": 0.1012887727099665,
"learning_rate": 7.947760840939688e-07,
"loss": 0.9572,
"step": 495
},
{
"epoch": 4.723809523809524,
"grad_norm": 0.09689224625015615,
"learning_rate": 7.428359831532117e-07,
"loss": 0.9385,
"step": 496
},
{
"epoch": 4.733333333333333,
"grad_norm": 0.09894505285476389,
"learning_rate": 6.926350162991613e-07,
"loss": 0.9502,
"step": 497
},
{
"epoch": 4.742857142857143,
"grad_norm": 0.10241361126956,
"learning_rate": 6.441754074879925e-07,
"loss": 0.9495,
"step": 498
},
{
"epoch": 4.752380952380952,
"grad_norm": 0.09443159904089143,
"learning_rate": 5.974593035318777e-07,
"loss": 0.9536,
"step": 499
},
{
"epoch": 4.761904761904762,
"grad_norm": 0.10189499647323387,
"learning_rate": 5.524887740038676e-07,
"loss": 0.9472,
"step": 500
},
{
"epoch": 4.771428571428571,
"grad_norm": 0.10773088780281193,
"learning_rate": 5.092658111462179e-07,
"loss": 0.9473,
"step": 501
},
{
"epoch": 4.780952380952381,
"grad_norm": 0.3240978802797332,
"learning_rate": 4.6779232978211297e-07,
"loss": 0.9784,
"step": 502
},
{
"epoch": 4.79047619047619,
"grad_norm": 0.09242163553460452,
"learning_rate": 4.280701672308585e-07,
"loss": 0.9583,
"step": 503
},
{
"epoch": 4.8,
"grad_norm": 0.09118495736223464,
"learning_rate": 3.901010832264662e-07,
"loss": 0.9473,
"step": 504
},
{
"epoch": 4.809523809523809,
"grad_norm": 0.09466091553337481,
"learning_rate": 3.5388675983971664e-07,
"loss": 0.9523,
"step": 505
},
{
"epoch": 4.819047619047619,
"grad_norm": 0.10089901559145306,
"learning_rate": 3.1942880140360955e-07,
"loss": 0.9591,
"step": 506
},
{
"epoch": 4.828571428571428,
"grad_norm": 0.10487799317926302,
"learning_rate": 2.867287344423364e-07,
"loss": 0.9764,
"step": 507
},
{
"epoch": 4.838095238095238,
"grad_norm": 0.0945247266110506,
"learning_rate": 2.557880076036101e-07,
"loss": 0.9492,
"step": 508
},
{
"epoch": 4.847619047619047,
"grad_norm": 0.09269936781651757,
"learning_rate": 2.2660799159451629e-07,
"loss": 0.9579,
"step": 509
},
{
"epoch": 4.857142857142857,
"grad_norm": 0.09460038465065274,
"learning_rate": 1.991899791207663e-07,
"loss": 0.9638,
"step": 510
},
{
"epoch": 4.866666666666667,
"grad_norm": 0.0982949463276317,
"learning_rate": 1.7353518482946308e-07,
"loss": 0.9535,
"step": 511
},
{
"epoch": 4.876190476190477,
"grad_norm": 0.09485743697787292,
"learning_rate": 1.4964474525525075e-07,
"loss": 0.9599,
"step": 512
},
{
"epoch": 4.885714285714286,
"grad_norm": 0.0963473582791281,
"learning_rate": 1.2751971876999504e-07,
"loss": 0.962,
"step": 513
},
{
"epoch": 4.895238095238096,
"grad_norm": 0.09723177864800757,
"learning_rate": 1.0716108553588289e-07,
"loss": 0.9398,
"step": 514
},
{
"epoch": 4.904761904761905,
"grad_norm": 0.09412117317242105,
"learning_rate": 8.856974746199954e-08,
"loss": 0.9578,
"step": 515
},
{
"epoch": 4.914285714285715,
"grad_norm": 0.08864592036283941,
"learning_rate": 7.174652816437811e-08,
"loss": 0.9599,
"step": 516
},
{
"epoch": 4.923809523809524,
"grad_norm": 0.09083413925650892,
"learning_rate": 5.669217292952223e-08,
"loss": 0.9512,
"step": 517
},
{
"epoch": 4.933333333333334,
"grad_norm": 0.09669535232036294,
"learning_rate": 4.3407348681361314e-08,
"loss": 0.9524,
"step": 518
},
{
"epoch": 4.942857142857143,
"grad_norm": 0.09367376797157739,
"learning_rate": 3.189264395172753e-08,
"loss": 0.9438,
"step": 519
},
{
"epoch": 4.9523809523809526,
"grad_norm": 0.09753187429670476,
"learning_rate": 2.214856885427885e-08,
"loss": 0.9513,
"step": 520
},
{
"epoch": 4.961904761904762,
"grad_norm": 0.09256693337711726,
"learning_rate": 1.4175555061894942e-08,
"loss": 0.9504,
"step": 521
},
{
"epoch": 4.9714285714285715,
"grad_norm": 0.09471054416033763,
"learning_rate": 7.973955787567988e-09,
"loss": 0.9648,
"step": 522
},
{
"epoch": 4.980952380952381,
"grad_norm": 0.09324529832027807,
"learning_rate": 3.544045768730797e-09,
"loss": 0.9516,
"step": 523
},
{
"epoch": 4.9904761904761905,
"grad_norm": 0.09644781364770945,
"learning_rate": 8.860212551020653e-10,
"loss": 0.9408,
"step": 524
},
{
"epoch": 5.0,
"grad_norm": 0.09523670709888162,
"learning_rate": 0.0,
"loss": 0.9441,
"step": 525
},
{
"epoch": 5.0,
"step": 525,
"total_flos": 1.350981278339013e+19,
"train_loss": 1.0422365560985747,
"train_runtime": 52983.8768,
"train_samples_per_second": 5.072,
"train_steps_per_second": 0.01
}
],
"logging_steps": 1,
"max_steps": 525,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.350981278339013e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}