{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 500,
"global_step": 460,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.021739130434782608,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 0.6431,
"num_tokens": 2090831.0,
"step": 1
},
{
"epoch": 0.043478260869565216,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 0.6435,
"num_tokens": 4183006.0,
"step": 2
},
{
"epoch": 0.06521739130434782,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 0.6379,
"num_tokens": 6272564.0,
"step": 3
},
{
"epoch": 0.08695652173913043,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 0.6449,
"num_tokens": 8366125.0,
"step": 4
},
{
"epoch": 0.10869565217391304,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 0.6402,
"num_tokens": 10461703.0,
"step": 5
},
{
"epoch": 0.13043478260869565,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 0.6338,
"num_tokens": 12555347.0,
"step": 6
},
{
"epoch": 0.15217391304347827,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 0.6436,
"num_tokens": 14648409.0,
"step": 7
},
{
"epoch": 0.17391304347826086,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 0.6384,
"num_tokens": 16741562.0,
"step": 8
},
{
"epoch": 0.1956521739130435,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 0.6437,
"num_tokens": 18831318.0,
"step": 9
},
{
"epoch": 0.21739130434782608,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 0.644,
"num_tokens": 20922863.0,
"step": 10
},
{
"epoch": 0.2391304347826087,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 0.6419,
"num_tokens": 23014620.0,
"step": 11
},
{
"epoch": 0.2608695652173913,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 0.6406,
"num_tokens": 25105339.0,
"step": 12
},
{
"epoch": 0.2826086956521739,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 0.6405,
"num_tokens": 27197795.0,
"step": 13
},
{
"epoch": 0.30434782608695654,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 0.6448,
"num_tokens": 29291898.0,
"step": 14
},
{
"epoch": 0.32608695652173914,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 0.6372,
"num_tokens": 31383496.0,
"step": 15
},
{
"epoch": 0.34782608695652173,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 0.6397,
"num_tokens": 33477448.0,
"step": 16
},
{
"epoch": 0.3695652173913043,
"grad_norm": 8.101283473215778,
"learning_rate": 0.0,
"loss": 0.6359,
"num_tokens": 35568911.0,
"step": 17
},
{
"epoch": 0.391304347826087,
"grad_norm": 8.159300793582531,
"learning_rate": 2.1739130434782606e-08,
"loss": 0.6418,
"num_tokens": 37660844.0,
"step": 18
},
{
"epoch": 0.41304347826086957,
"grad_norm": 8.143428019277147,
"learning_rate": 4.347826086956521e-08,
"loss": 0.6391,
"num_tokens": 39755083.0,
"step": 19
},
{
"epoch": 0.43478260869565216,
"grad_norm": 8.104124464792218,
"learning_rate": 6.521739130434782e-08,
"loss": 0.6442,
"num_tokens": 41847436.0,
"step": 20
},
{
"epoch": 0.45652173913043476,
"grad_norm": 8.101986671909064,
"learning_rate": 8.695652173913042e-08,
"loss": 0.6452,
"num_tokens": 43940263.0,
"step": 21
},
{
"epoch": 0.4782608695652174,
"grad_norm": 8.13244304749822,
"learning_rate": 1.0869565217391303e-07,
"loss": 0.646,
"num_tokens": 46033076.0,
"step": 22
},
{
"epoch": 0.5,
"grad_norm": 8.147383007336266,
"learning_rate": 1.3043478260869563e-07,
"loss": 0.6435,
"num_tokens": 48124508.0,
"step": 23
},
{
"epoch": 0.5217391304347826,
"grad_norm": 8.144105003431784,
"learning_rate": 1.5217391304347825e-07,
"loss": 0.639,
"num_tokens": 50217845.0,
"step": 24
},
{
"epoch": 0.5434782608695652,
"grad_norm": 7.999617610004394,
"learning_rate": 1.7391304347826085e-07,
"loss": 0.6374,
"num_tokens": 52311173.0,
"step": 25
},
{
"epoch": 0.5652173913043478,
"grad_norm": 8.109185633091318,
"learning_rate": 1.9565217391304347e-07,
"loss": 0.6335,
"num_tokens": 54401629.0,
"step": 26
},
{
"epoch": 0.5869565217391305,
"grad_norm": 8.109185633091318,
"learning_rate": 2.1739130434782607e-07,
"loss": 0.6192,
"num_tokens": 56492920.0,
"step": 27
},
{
"epoch": 0.6086956521739131,
"grad_norm": 9.27996446234992,
"learning_rate": 2.1739130434782607e-07,
"loss": 0.6146,
"num_tokens": 58586157.0,
"step": 28
},
{
"epoch": 0.6304347826086957,
"grad_norm": 9.159324927898115,
"learning_rate": 2.391304347826087e-07,
"loss": 0.6175,
"num_tokens": 60679798.0,
"step": 29
},
{
"epoch": 0.6521739130434783,
"grad_norm": 9.763710040610643,
"learning_rate": 2.6086956521739126e-07,
"loss": 0.6155,
"num_tokens": 62773329.0,
"step": 30
},
{
"epoch": 0.6739130434782609,
"grad_norm": 9.779733585945591,
"learning_rate": 2.8260869565217386e-07,
"loss": 0.6135,
"num_tokens": 64864578.0,
"step": 31
},
{
"epoch": 0.6956521739130435,
"grad_norm": 9.779733585945591,
"learning_rate": 3.043478260869565e-07,
"loss": 0.5828,
"num_tokens": 66956700.0,
"step": 32
},
{
"epoch": 0.717391304347826,
"grad_norm": 13.430953070069496,
"learning_rate": 3.043478260869565e-07,
"loss": 0.5856,
"num_tokens": 69046410.0,
"step": 33
},
{
"epoch": 0.7391304347826086,
"grad_norm": 13.32547280061291,
"learning_rate": 3.260869565217391e-07,
"loss": 0.5778,
"num_tokens": 71138403.0,
"step": 34
},
{
"epoch": 0.7608695652173914,
"grad_norm": 13.037082670124171,
"learning_rate": 3.478260869565217e-07,
"loss": 0.5698,
"num_tokens": 73231369.0,
"step": 35
},
{
"epoch": 0.782608695652174,
"grad_norm": 11.047732146906828,
"learning_rate": 3.695652173913043e-07,
"loss": 0.5637,
"num_tokens": 75324509.0,
"step": 36
},
{
"epoch": 0.8043478260869565,
"grad_norm": 9.940315680480857,
"learning_rate": 3.9130434782608694e-07,
"loss": 0.5683,
"num_tokens": 77414031.0,
"step": 37
},
{
"epoch": 0.8260869565217391,
"grad_norm": 5.482237002272238,
"learning_rate": 4.1304347826086954e-07,
"loss": 0.559,
"num_tokens": 79506910.0,
"step": 38
},
{
"epoch": 0.8478260869565217,
"grad_norm": 5.078292155102419,
"learning_rate": 4.3478260869565214e-07,
"loss": 0.5504,
"num_tokens": 81597326.0,
"step": 39
},
{
"epoch": 0.8695652173913043,
"grad_norm": 4.758390837071518,
"learning_rate": 4.5652173913043473e-07,
"loss": 0.5467,
"num_tokens": 83691142.0,
"step": 40
},
{
"epoch": 0.8913043478260869,
"grad_norm": 4.533086950846491,
"learning_rate": 4.782608695652174e-07,
"loss": 0.5368,
"num_tokens": 85781965.0,
"step": 41
},
{
"epoch": 0.9130434782608695,
"grad_norm": 4.3523282131985175,
"learning_rate": 5e-07,
"loss": 0.5356,
"num_tokens": 87872195.0,
"step": 42
},
{
"epoch": 0.9347826086956522,
"grad_norm": 4.14903309276372,
"learning_rate": 5.217391304347825e-07,
"loss": 0.536,
"num_tokens": 89964241.0,
"step": 43
},
{
"epoch": 0.9565217391304348,
"grad_norm": 3.9293608373652873,
"learning_rate": 5.434782608695652e-07,
"loss": 0.537,
"num_tokens": 92057253.0,
"step": 44
},
{
"epoch": 0.9782608695652174,
"grad_norm": 3.597560116551251,
"learning_rate": 5.652173913043477e-07,
"loss": 0.5299,
"num_tokens": 94149837.0,
"step": 45
},
{
"epoch": 1.0,
"grad_norm": 3.0088158093378103,
"learning_rate": 5.869565217391305e-07,
"loss": 0.5214,
"num_tokens": 96244823.0,
"step": 46
},
{
"epoch": 1.0217391304347827,
"grad_norm": 2.7490399390700904,
"learning_rate": 6.08695652173913e-07,
"loss": 0.5175,
"num_tokens": 98337212.0,
"step": 47
},
{
"epoch": 1.0434782608695652,
"grad_norm": 2.5946058680255386,
"learning_rate": 6.304347826086957e-07,
"loss": 0.5159,
"num_tokens": 100428766.0,
"step": 48
},
{
"epoch": 1.065217391304348,
"grad_norm": 2.503905065889435,
"learning_rate": 6.521739130434782e-07,
"loss": 0.5072,
"num_tokens": 102520698.0,
"step": 49
},
{
"epoch": 1.0869565217391304,
"grad_norm": 2.456643430738729,
"learning_rate": 6.739130434782609e-07,
"loss": 0.5018,
"num_tokens": 104612511.0,
"step": 50
},
{
"epoch": 1.108695652173913,
"grad_norm": 2.3865628559826857,
"learning_rate": 6.956521739130434e-07,
"loss": 0.4942,
"num_tokens": 106707204.0,
"step": 51
},
{
"epoch": 1.1304347826086956,
"grad_norm": 2.3356298780592293,
"learning_rate": 7.17391304347826e-07,
"loss": 0.4918,
"num_tokens": 108800670.0,
"step": 52
},
{
"epoch": 1.1521739130434783,
"grad_norm": 2.304735371322351,
"learning_rate": 7.391304347826086e-07,
"loss": 0.4914,
"num_tokens": 110894728.0,
"step": 53
},
{
"epoch": 1.1739130434782608,
"grad_norm": 2.2915944727664055,
"learning_rate": 7.608695652173913e-07,
"loss": 0.4919,
"num_tokens": 112989220.0,
"step": 54
},
{
"epoch": 1.1956521739130435,
"grad_norm": 2.2452202598548534,
"learning_rate": 7.826086956521739e-07,
"loss": 0.4794,
"num_tokens": 115080217.0,
"step": 55
},
{
"epoch": 1.2173913043478262,
"grad_norm": 2.179597494769898,
"learning_rate": 8.043478260869565e-07,
"loss": 0.4857,
"num_tokens": 117172369.0,
"step": 56
},
{
"epoch": 1.2391304347826086,
"grad_norm": 2.074697161067729,
"learning_rate": 8.260869565217391e-07,
"loss": 0.4789,
"num_tokens": 119266181.0,
"step": 57
},
{
"epoch": 1.2608695652173914,
"grad_norm": 1.8408360821578929,
"learning_rate": 8.478260869565217e-07,
"loss": 0.4746,
"num_tokens": 121360235.0,
"step": 58
},
{
"epoch": 1.2826086956521738,
"grad_norm": 1.409431046959032,
"learning_rate": 8.695652173913043e-07,
"loss": 0.4767,
"num_tokens": 123452301.0,
"step": 59
},
{
"epoch": 1.3043478260869565,
"grad_norm": 0.8344487221470542,
"learning_rate": 8.913043478260869e-07,
"loss": 0.4775,
"num_tokens": 125544278.0,
"step": 60
},
{
"epoch": 1.3260869565217392,
"grad_norm": 0.4342290656199116,
"learning_rate": 9.130434782608695e-07,
"loss": 0.4744,
"num_tokens": 127634616.0,
"step": 61
},
{
"epoch": 1.3478260869565217,
"grad_norm": 0.27358056826447424,
"learning_rate": 9.347826086956522e-07,
"loss": 0.4754,
"num_tokens": 129729453.0,
"step": 62
},
{
"epoch": 1.3695652173913042,
"grad_norm": 0.23082541661596323,
"learning_rate": 9.565217391304349e-07,
"loss": 0.4743,
"num_tokens": 131820225.0,
"step": 63
},
{
"epoch": 1.391304347826087,
"grad_norm": 0.22354521543660916,
"learning_rate": 9.782608695652173e-07,
"loss": 0.472,
"num_tokens": 133913678.0,
"step": 64
},
{
"epoch": 1.4130434782608696,
"grad_norm": 0.20118073906094502,
"learning_rate": 1e-06,
"loss": 0.4697,
"num_tokens": 136007430.0,
"step": 65
},
{
"epoch": 1.434782608695652,
"grad_norm": 0.20692402483858816,
"learning_rate": 9.999870437446958e-07,
"loss": 0.4631,
"num_tokens": 138101493.0,
"step": 66
},
{
"epoch": 1.4565217391304348,
"grad_norm": 0.1901715221113706,
"learning_rate": 9.999481757248477e-07,
"loss": 0.4666,
"num_tokens": 140193704.0,
"step": 67
},
{
"epoch": 1.4782608695652173,
"grad_norm": 0.19179490787267098,
"learning_rate": 9.998833981786071e-07,
"loss": 0.464,
"num_tokens": 142285250.0,
"step": 68
},
{
"epoch": 1.5,
"grad_norm": 0.18845464387262947,
"learning_rate": 9.997927148360823e-07,
"loss": 0.4692,
"num_tokens": 144378781.0,
"step": 69
},
{
"epoch": 1.5217391304347827,
"grad_norm": 0.1903737563315874,
"learning_rate": 9.996761309191247e-07,
"loss": 0.4596,
"num_tokens": 146471011.0,
"step": 70
},
{
"epoch": 1.5434782608695652,
"grad_norm": 0.1736225981517059,
"learning_rate": 9.995336531410273e-07,
"loss": 0.4631,
"num_tokens": 148562357.0,
"step": 71
},
{
"epoch": 1.5652173913043477,
"grad_norm": 0.17508414588076462,
"learning_rate": 9.993652897061393e-07,
"loss": 0.4649,
"num_tokens": 150656061.0,
"step": 72
},
{
"epoch": 1.5869565217391304,
"grad_norm": 0.17741120682256034,
"learning_rate": 9.991710503093922e-07,
"loss": 0.4674,
"num_tokens": 152748871.0,
"step": 73
},
{
"epoch": 1.608695652173913,
"grad_norm": 0.16540006434719232,
"learning_rate": 9.989509461357426e-07,
"loss": 0.4642,
"num_tokens": 154839927.0,
"step": 74
},
{
"epoch": 1.6304347826086958,
"grad_norm": 0.17667838370848385,
"learning_rate": 9.987049898595276e-07,
"loss": 0.4651,
"num_tokens": 156932138.0,
"step": 75
},
{
"epoch": 1.6521739130434783,
"grad_norm": 0.15619953156457722,
"learning_rate": 9.984331956437354e-07,
"loss": 0.46,
"num_tokens": 159020025.0,
"step": 76
},
{
"epoch": 1.6739130434782608,
"grad_norm": 0.158692399179484,
"learning_rate": 9.98135579139189e-07,
"loss": 0.4642,
"num_tokens": 161110709.0,
"step": 77
},
{
"epoch": 1.6956521739130435,
"grad_norm": 0.16387685015731232,
"learning_rate": 9.97812157483646e-07,
"loss": 0.4598,
"num_tokens": 163201280.0,
"step": 78
},
{
"epoch": 1.7173913043478262,
"grad_norm": 0.16147264627938895,
"learning_rate": 9.974629493008114e-07,
"loss": 0.4593,
"num_tokens": 165293375.0,
"step": 79
},
{
"epoch": 1.7391304347826086,
"grad_norm": 0.14847792515904235,
"learning_rate": 9.97087974699264e-07,
"loss": 0.4539,
"num_tokens": 167386166.0,
"step": 80
},
{
"epoch": 1.7608695652173914,
"grad_norm": 0.14368418963515403,
"learning_rate": 9.966872552713004e-07,
"loss": 0.4596,
"num_tokens": 169477777.0,
"step": 81
},
{
"epoch": 1.7826086956521738,
"grad_norm": 0.14685857026484775,
"learning_rate": 9.962608140916905e-07,
"loss": 0.4582,
"num_tokens": 171567446.0,
"step": 82
},
{
"epoch": 1.8043478260869565,
"grad_norm": 0.15282259101923246,
"learning_rate": 9.958086757163488e-07,
"loss": 0.4616,
"num_tokens": 173655862.0,
"step": 83
},
{
"epoch": 1.8260869565217392,
"grad_norm": 0.15514478856193514,
"learning_rate": 9.953308661809207e-07,
"loss": 0.457,
"num_tokens": 175749514.0,
"step": 84
},
{
"epoch": 1.8478260869565217,
"grad_norm": 0.14228669183096485,
"learning_rate": 9.948274129992836e-07,
"loss": 0.46,
"num_tokens": 177837992.0,
"step": 85
},
{
"epoch": 1.8695652173913042,
"grad_norm": 0.13988219902396246,
"learning_rate": 9.942983451619614e-07,
"loss": 0.4603,
"num_tokens": 179931723.0,
"step": 86
},
{
"epoch": 1.891304347826087,
"grad_norm": 0.1391966834345067,
"learning_rate": 9.93743693134456e-07,
"loss": 0.4617,
"num_tokens": 182023329.0,
"step": 87
},
{
"epoch": 1.9130434782608696,
"grad_norm": 0.14697062115129336,
"learning_rate": 9.931634888554935e-07,
"loss": 0.4623,
"num_tokens": 184117906.0,
"step": 88
},
{
"epoch": 1.9347826086956523,
"grad_norm": 0.13927296968611877,
"learning_rate": 9.92557765735184e-07,
"loss": 0.4563,
"num_tokens": 186211056.0,
"step": 89
},
{
"epoch": 1.9565217391304348,
"grad_norm": 0.14637757594641007,
"learning_rate": 9.919265586530975e-07,
"loss": 0.4627,
"num_tokens": 188304474.0,
"step": 90
},
{
"epoch": 1.9782608695652173,
"grad_norm": 0.142956055977351,
"learning_rate": 9.912699039562576e-07,
"loss": 0.4579,
"num_tokens": 190397770.0,
"step": 91
},
{
"epoch": 2.0,
"grad_norm": 0.14123350435136145,
"learning_rate": 9.905878394570453e-07,
"loss": 0.4602,
"num_tokens": 192489635.0,
"step": 92
},
{
"epoch": 2.0217391304347827,
"grad_norm": 0.1436834934317212,
"learning_rate": 9.898804044310245e-07,
"loss": 0.4558,
"num_tokens": 194583301.0,
"step": 93
},
{
"epoch": 2.0434782608695654,
"grad_norm": 0.1374483001051701,
"learning_rate": 9.891476396146784e-07,
"loss": 0.452,
"num_tokens": 196677306.0,
"step": 94
},
{
"epoch": 2.0652173913043477,
"grad_norm": 0.13722577602290897,
"learning_rate": 9.883895872030657e-07,
"loss": 0.4509,
"num_tokens": 198768835.0,
"step": 95
},
{
"epoch": 2.0869565217391304,
"grad_norm": 0.13783100429148887,
"learning_rate": 9.87606290847388e-07,
"loss": 0.4572,
"num_tokens": 200859264.0,
"step": 96
},
{
"epoch": 2.108695652173913,
"grad_norm": 0.1367807019239086,
"learning_rate": 9.867977956524796e-07,
"loss": 0.4582,
"num_tokens": 202952584.0,
"step": 97
},
{
"epoch": 2.130434782608696,
"grad_norm": 0.14120080132901613,
"learning_rate": 9.859641481742077e-07,
"loss": 0.4542,
"num_tokens": 205045012.0,
"step": 98
},
{
"epoch": 2.1521739130434785,
"grad_norm": 0.1347346802712429,
"learning_rate": 9.851053964167927e-07,
"loss": 0.4506,
"num_tokens": 207137257.0,
"step": 99
},
{
"epoch": 2.1739130434782608,
"grad_norm": 0.13826725695583056,
"learning_rate": 9.842215898300433e-07,
"loss": 0.4553,
"num_tokens": 209229044.0,
"step": 100
},
{
"epoch": 2.1956521739130435,
"grad_norm": 0.14306905673630102,
"learning_rate": 9.833127793065097e-07,
"loss": 0.4533,
"num_tokens": 211323327.0,
"step": 101
},
{
"epoch": 2.217391304347826,
"grad_norm": 0.13752558845525534,
"learning_rate": 9.823790171785526e-07,
"loss": 0.4556,
"num_tokens": 213417950.0,
"step": 102
},
{
"epoch": 2.239130434782609,
"grad_norm": 0.1451178643142076,
"learning_rate": 9.814203572153298e-07,
"loss": 0.4542,
"num_tokens": 215508377.0,
"step": 103
},
{
"epoch": 2.260869565217391,
"grad_norm": 0.1341943658847912,
"learning_rate": 9.804368546197006e-07,
"loss": 0.4548,
"num_tokens": 217599824.0,
"step": 104
},
{
"epoch": 2.282608695652174,
"grad_norm": 0.13948206525149412,
"learning_rate": 9.794285660250455e-07,
"loss": 0.4549,
"num_tokens": 219693642.0,
"step": 105
},
{
"epoch": 2.3043478260869565,
"grad_norm": 0.13795263816003625,
"learning_rate": 9.783955494920066e-07,
"loss": 0.4548,
"num_tokens": 221789678.0,
"step": 106
},
{
"epoch": 2.3260869565217392,
"grad_norm": 0.1409796803835522,
"learning_rate": 9.773378645051436e-07,
"loss": 0.4562,
"num_tokens": 223881586.0,
"step": 107
},
{
"epoch": 2.3478260869565215,
"grad_norm": 0.13044382265443313,
"learning_rate": 9.762555719695088e-07,
"loss": 0.4461,
"num_tokens": 225970167.0,
"step": 108
},
{
"epoch": 2.369565217391304,
"grad_norm": 0.1502600460482269,
"learning_rate": 9.751487342071393e-07,
"loss": 0.4498,
"num_tokens": 228061661.0,
"step": 109
},
{
"epoch": 2.391304347826087,
"grad_norm": 0.1338230534634335,
"learning_rate": 9.740174149534692e-07,
"loss": 0.4485,
"num_tokens": 230153932.0,
"step": 110
},
{
"epoch": 2.4130434782608696,
"grad_norm": 0.1440831587199848,
"learning_rate": 9.728616793536587e-07,
"loss": 0.4552,
"num_tokens": 232244695.0,
"step": 111
},
{
"epoch": 2.4347826086956523,
"grad_norm": 0.13658285859261474,
"learning_rate": 9.716815939588436e-07,
"loss": 0.4557,
"num_tokens": 234336105.0,
"step": 112
},
{
"epoch": 2.4565217391304346,
"grad_norm": 0.14313487538739023,
"learning_rate": 9.704772267223019e-07,
"loss": 0.4527,
"num_tokens": 236428770.0,
"step": 113
},
{
"epoch": 2.4782608695652173,
"grad_norm": 0.1400873818490847,
"learning_rate": 9.692486469955424e-07,
"loss": 0.4494,
"num_tokens": 238521551.0,
"step": 114
},
{
"epoch": 2.5,
"grad_norm": 0.13049717910266437,
"learning_rate": 9.6799592552431e-07,
"loss": 0.4458,
"num_tokens": 240614588.0,
"step": 115
},
{
"epoch": 2.5217391304347827,
"grad_norm": 0.13318259742833988,
"learning_rate": 9.667191344445122e-07,
"loss": 0.4562,
"num_tokens": 242706024.0,
"step": 116
},
{
"epoch": 2.5434782608695654,
"grad_norm": 0.1342513590455262,
"learning_rate": 9.654183472780655e-07,
"loss": 0.4573,
"num_tokens": 244798483.0,
"step": 117
},
{
"epoch": 2.5652173913043477,
"grad_norm": 0.136855839967963,
"learning_rate": 9.640936389286615e-07,
"loss": 0.4472,
"num_tokens": 246891342.0,
"step": 118
},
{
"epoch": 2.5869565217391304,
"grad_norm": 0.137209398518282,
"learning_rate": 9.627450856774539e-07,
"loss": 0.4468,
"num_tokens": 248983299.0,
"step": 119
},
{
"epoch": 2.608695652173913,
"grad_norm": 0.1347427629250892,
"learning_rate": 9.613727651786657e-07,
"loss": 0.4492,
"num_tokens": 251073761.0,
"step": 120
},
{
"epoch": 2.630434782608696,
"grad_norm": 0.14870638200971545,
"learning_rate": 9.599767564551183e-07,
"loss": 0.4502,
"num_tokens": 253168287.0,
"step": 121
},
{
"epoch": 2.6521739130434785,
"grad_norm": 0.13859516453775694,
"learning_rate": 9.5855713989368e-07,
"loss": 0.4523,
"num_tokens": 255260895.0,
"step": 122
},
{
"epoch": 2.6739130434782608,
"grad_norm": 0.15720652496010762,
"learning_rate": 9.57113997240638e-07,
"loss": 0.4463,
"num_tokens": 257351654.0,
"step": 123
},
{
"epoch": 2.6956521739130435,
"grad_norm": 0.1401244780972256,
"learning_rate": 9.55647411596991e-07,
"loss": 0.4518,
"num_tokens": 259440240.0,
"step": 124
},
{
"epoch": 2.717391304347826,
"grad_norm": 0.15018164735793,
"learning_rate": 9.541574674136632e-07,
"loss": 0.4498,
"num_tokens": 261532114.0,
"step": 125
},
{
"epoch": 2.7391304347826084,
"grad_norm": 0.13823614764022513,
"learning_rate": 9.526442504866426e-07,
"loss": 0.4558,
"num_tokens": 263623046.0,
"step": 126
},
{
"epoch": 2.7608695652173916,
"grad_norm": 0.14842418144378133,
"learning_rate": 9.511078479520392e-07,
"loss": 0.4523,
"num_tokens": 265714481.0,
"step": 127
},
{
"epoch": 2.782608695652174,
"grad_norm": 0.13803890404964064,
"learning_rate": 9.495483482810687e-07,
"loss": 0.451,
"num_tokens": 267809891.0,
"step": 128
},
{
"epoch": 2.8043478260869565,
"grad_norm": 0.15595909439991468,
"learning_rate": 9.479658412749575e-07,
"loss": 0.4535,
"num_tokens": 269901433.0,
"step": 129
},
{
"epoch": 2.8260869565217392,
"grad_norm": 0.1368971901595536,
"learning_rate": 9.46360418059771e-07,
"loss": 0.451,
"num_tokens": 271991817.0,
"step": 130
},
{
"epoch": 2.8478260869565215,
"grad_norm": 0.15455191550217653,
"learning_rate": 9.447321710811674e-07,
"loss": 0.4519,
"num_tokens": 274081721.0,
"step": 131
},
{
"epoch": 2.869565217391304,
"grad_norm": 0.13697363499414442,
"learning_rate": 9.430811940990734e-07,
"loss": 0.4509,
"num_tokens": 276177796.0,
"step": 132
},
{
"epoch": 2.891304347826087,
"grad_norm": 0.14942955290721893,
"learning_rate": 9.41407582182286e-07,
"loss": 0.4472,
"num_tokens": 278269125.0,
"step": 133
},
{
"epoch": 2.9130434782608696,
"grad_norm": 0.13839892788575286,
"learning_rate": 9.397114317029974e-07,
"loss": 0.4559,
"num_tokens": 280360480.0,
"step": 134
},
{
"epoch": 2.9347826086956523,
"grad_norm": 0.139804429331633,
"learning_rate": 9.37992840331246e-07,
"loss": 0.4512,
"num_tokens": 282455179.0,
"step": 135
},
{
"epoch": 2.9565217391304346,
"grad_norm": 0.1522025576863328,
"learning_rate": 9.362519070292923e-07,
"loss": 0.4457,
"num_tokens": 284547258.0,
"step": 136
},
{
"epoch": 2.9782608695652173,
"grad_norm": 0.13715855940260502,
"learning_rate": 9.344887320459198e-07,
"loss": 0.4472,
"num_tokens": 286640938.0,
"step": 137
},
{
"epoch": 3.0,
"grad_norm": 0.1457504699757938,
"learning_rate": 9.327034169106629e-07,
"loss": 0.4465,
"num_tokens": 288733869.0,
"step": 138
},
{
"epoch": 3.0217391304347827,
"grad_norm": 0.13583971625499844,
"learning_rate": 9.308960644279604e-07,
"loss": 0.4507,
"num_tokens": 290826317.0,
"step": 139
},
{
"epoch": 3.0434782608695654,
"grad_norm": 0.13589135954621737,
"learning_rate": 9.290667786712352e-07,
"loss": 0.4435,
"num_tokens": 292916931.0,
"step": 140
},
{
"epoch": 3.0652173913043477,
"grad_norm": 0.13858649102418938,
"learning_rate": 9.272156649769018e-07,
"loss": 0.454,
"num_tokens": 295007551.0,
"step": 141
},
{
"epoch": 3.0869565217391304,
"grad_norm": 0.13151545079660656,
"learning_rate": 9.253428299383012e-07,
"loss": 0.4443,
"num_tokens": 297101123.0,
"step": 142
},
{
"epoch": 3.108695652173913,
"grad_norm": 0.1372256717064563,
"learning_rate": 9.234483813995613e-07,
"loss": 0.4525,
"num_tokens": 299194691.0,
"step": 143
},
{
"epoch": 3.130434782608696,
"grad_norm": 0.1362038166818457,
"learning_rate": 9.215324284493888e-07,
"loss": 0.4441,
"num_tokens": 301288963.0,
"step": 144
},
{
"epoch": 3.1521739130434785,
"grad_norm": 0.13366059430402608,
"learning_rate": 9.19595081414786e-07,
"loss": 0.4499,
"num_tokens": 303381802.0,
"step": 145
},
{
"epoch": 3.1739130434782608,
"grad_norm": 0.13964481270629708,
"learning_rate": 9.176364518546988e-07,
"loss": 0.4484,
"num_tokens": 305474609.0,
"step": 146
},
{
"epoch": 3.1956521739130435,
"grad_norm": 0.13321001510853767,
"learning_rate": 9.156566525535923e-07,
"loss": 0.4452,
"num_tokens": 307567147.0,
"step": 147
},
{
"epoch": 3.217391304347826,
"grad_norm": 0.14086672975049896,
"learning_rate": 9.136557975149561e-07,
"loss": 0.4446,
"num_tokens": 309657954.0,
"step": 148
},
{
"epoch": 3.239130434782609,
"grad_norm": 0.1334280312981284,
"learning_rate": 9.116340019547401e-07,
"loss": 0.4468,
"num_tokens": 311749336.0,
"step": 149
},
{
"epoch": 3.260869565217391,
"grad_norm": 0.13486315217872386,
"learning_rate": 9.095913822947196e-07,
"loss": 0.4495,
"num_tokens": 313844697.0,
"step": 150
},
{
"epoch": 3.282608695652174,
"grad_norm": 0.13793460372824046,
"learning_rate": 9.075280561557915e-07,
"loss": 0.4452,
"num_tokens": 315936487.0,
"step": 151
},
{
"epoch": 3.3043478260869565,
"grad_norm": 0.1379832600278911,
"learning_rate": 9.054441423512013e-07,
"loss": 0.4485,
"num_tokens": 318028220.0,
"step": 152
},
{
"epoch": 3.3260869565217392,
"grad_norm": 0.1349244615099038,
"learning_rate": 9.033397608797014e-07,
"loss": 0.4463,
"num_tokens": 320120976.0,
"step": 153
},
{
"epoch": 3.3478260869565215,
"grad_norm": 0.13302146086489272,
"learning_rate": 9.012150329186411e-07,
"loss": 0.4446,
"num_tokens": 322213515.0,
"step": 154
},
{
"epoch": 3.369565217391304,
"grad_norm": 0.13213199888987598,
"learning_rate": 8.990700808169889e-07,
"loss": 0.4457,
"num_tokens": 324306085.0,
"step": 155
},
{
"epoch": 3.391304347826087,
"grad_norm": 0.13576476545257601,
"learning_rate": 8.969050280882872e-07,
"loss": 0.4435,
"num_tokens": 326393943.0,
"step": 156
},
{
"epoch": 3.4130434782608696,
"grad_norm": 0.13269213517048742,
"learning_rate": 8.9471999940354e-07,
"loss": 0.4421,
"num_tokens": 328487650.0,
"step": 157
},
{
"epoch": 3.4347826086956523,
"grad_norm": 0.1397362160827111,
"learning_rate": 8.925151205840341e-07,
"loss": 0.4434,
"num_tokens": 330581600.0,
"step": 158
},
{
"epoch": 3.4565217391304346,
"grad_norm": 0.13448412881240226,
"learning_rate": 8.902905185940933e-07,
"loss": 0.4392,
"num_tokens": 332675243.0,
"step": 159
},
{
"epoch": 3.4782608695652173,
"grad_norm": 0.13581732207346198,
"learning_rate": 8.880463215337679e-07,
"loss": 0.4412,
"num_tokens": 334764586.0,
"step": 160
},
{
"epoch": 3.5,
"grad_norm": 0.1390811493560612,
"learning_rate": 8.857826586314586e-07,
"loss": 0.4402,
"num_tokens": 336856307.0,
"step": 161
},
{
"epoch": 3.5217391304347827,
"grad_norm": 0.13460989761218342,
"learning_rate": 8.834996602364736e-07,
"loss": 0.4456,
"num_tokens": 338950008.0,
"step": 162
},
{
"epoch": 3.5434782608695654,
"grad_norm": 0.13236047844850818,
"learning_rate": 8.811974578115248e-07,
"loss": 0.4448,
"num_tokens": 341042954.0,
"step": 163
},
{
"epoch": 3.5652173913043477,
"grad_norm": 0.13163982832755347,
"learning_rate": 8.788761839251558e-07,
"loss": 0.4415,
"num_tokens": 343134592.0,
"step": 164
},
{
"epoch": 3.5869565217391304,
"grad_norm": 0.14154355839698224,
"learning_rate": 8.765359722441094e-07,
"loss": 0.4456,
"num_tokens": 345226933.0,
"step": 165
},
{
"epoch": 3.608695652173913,
"grad_norm": 0.1355716379437535,
"learning_rate": 8.741769575256304e-07,
"loss": 0.4539,
"num_tokens": 347318986.0,
"step": 166
},
{
"epoch": 3.630434782608696,
"grad_norm": 0.13083494960528463,
"learning_rate": 8.717992756097047e-07,
"loss": 0.4448,
"num_tokens": 349411839.0,
"step": 167
},
{
"epoch": 3.6521739130434785,
"grad_norm": 0.13406159250224642,
"learning_rate": 8.694030634112389e-07,
"loss": 0.4421,
"num_tokens": 351504447.0,
"step": 168
},
{
"epoch": 3.6739130434782608,
"grad_norm": 0.13054451577457893,
"learning_rate": 8.669884589121756e-07,
"loss": 0.4426,
"num_tokens": 353596301.0,
"step": 169
},
{
"epoch": 3.6956521739130435,
"grad_norm": 0.13149102842185534,
"learning_rate": 8.645556011535469e-07,
"loss": 0.4432,
"num_tokens": 355690374.0,
"step": 170
},
{
"epoch": 3.717391304347826,
"grad_norm": 0.139504340331513,
"learning_rate": 8.621046302274697e-07,
"loss": 0.4464,
"num_tokens": 357782173.0,
"step": 171
},
{
"epoch": 3.7391304347826084,
"grad_norm": 0.13974946687867332,
"learning_rate": 8.596356872690778e-07,
"loss": 0.4441,
"num_tokens": 359875585.0,
"step": 172
},
{
"epoch": 3.7608695652173916,
"grad_norm": 0.1334362497712104,
"learning_rate": 8.571489144483944e-07,
"loss": 0.446,
"num_tokens": 361966708.0,
"step": 173
},
{
"epoch": 3.782608695652174,
"grad_norm": 0.1338803708751177,
"learning_rate": 8.546444549621466e-07,
"loss": 0.4449,
"num_tokens": 364059418.0,
"step": 174
},
{
"epoch": 3.8043478260869565,
"grad_norm": 0.13983854438472215,
"learning_rate": 8.521224530255185e-07,
"loss": 0.4448,
"num_tokens": 366152621.0,
"step": 175
},
{
"epoch": 3.8260869565217392,
"grad_norm": 0.13196553812425743,
"learning_rate": 8.495830538638481e-07,
"loss": 0.4418,
"num_tokens": 368245662.0,
"step": 176
},
{
"epoch": 3.8478260869565215,
"grad_norm": 0.13578885820125056,
"learning_rate": 8.470264037042638e-07,
"loss": 0.441,
"num_tokens": 370334430.0,
"step": 177
},
{
"epoch": 3.869565217391304,
"grad_norm": 0.13655268949641716,
"learning_rate": 8.44452649767264e-07,
"loss": 0.4439,
"num_tokens": 372426004.0,
"step": 178
},
{
"epoch": 3.891304347826087,
"grad_norm": 0.13786817781349708,
"learning_rate": 8.418619402582402e-07,
"loss": 0.4449,
"num_tokens": 374517459.0,
"step": 179
},
{
"epoch": 3.9130434782608696,
"grad_norm": 0.13918471016485154,
"learning_rate": 8.392544243589427e-07,
"loss": 0.4443,
"num_tokens": 376607611.0,
"step": 180
},
{
"epoch": 3.9347826086956523,
"grad_norm": 0.1403794476474249,
"learning_rate": 8.366302522188902e-07,
"loss": 0.4458,
"num_tokens": 378702030.0,
"step": 181
},
{
"epoch": 3.9565217391304346,
"grad_norm": 0.13426097626521305,
"learning_rate": 8.339895749467237e-07,
"loss": 0.4454,
"num_tokens": 380793542.0,
"step": 182
},
{
"epoch": 3.9782608695652173,
"grad_norm": 0.13694462770495058,
"learning_rate": 8.313325446015051e-07,
"loss": 0.4402,
"num_tokens": 382884886.0,
"step": 183
},
{
"epoch": 4.0,
"grad_norm": 0.14272951717733043,
"learning_rate": 8.286593141839608e-07,
"loss": 0.4423,
"num_tokens": 384977846.0,
"step": 184
},
{
"epoch": 4.021739130434782,
"grad_norm": 0.1330582820057405,
"learning_rate": 8.259700376276723e-07,
"loss": 0.444,
"num_tokens": 387071368.0,
"step": 185
},
{
"epoch": 4.043478260869565,
"grad_norm": 0.14269491995175726,
"learning_rate": 8.232648697902113e-07,
"loss": 0.4419,
"num_tokens": 389164937.0,
"step": 186
},
{
"epoch": 4.065217391304348,
"grad_norm": 0.14167939313294817,
"learning_rate": 8.205439664442229e-07,
"loss": 0.4374,
"num_tokens": 391258317.0,
"step": 187
},
{
"epoch": 4.086956521739131,
"grad_norm": 0.13301946278649346,
"learning_rate": 8.178074842684554e-07,
"loss": 0.4369,
"num_tokens": 393349237.0,
"step": 188
},
{
"epoch": 4.108695652173913,
"grad_norm": 0.14450522832231033,
"learning_rate": 8.150555808387387e-07,
"loss": 0.441,
"num_tokens": 395440009.0,
"step": 189
},
{
"epoch": 4.130434782608695,
"grad_norm": 0.13877319368593247,
"learning_rate": 8.122884146189103e-07,
"loss": 0.4374,
"num_tokens": 397533811.0,
"step": 190
},
{
"epoch": 4.1521739130434785,
"grad_norm": 0.14318441928820957,
"learning_rate": 8.095061449516902e-07,
"loss": 0.4427,
"num_tokens": 399626066.0,
"step": 191
},
{
"epoch": 4.173913043478261,
"grad_norm": 0.14579072710487548,
"learning_rate": 8.067089320495056e-07,
"loss": 0.4424,
"num_tokens": 401718435.0,
"step": 192
},
{
"epoch": 4.195652173913044,
"grad_norm": 0.1374419102226233,
"learning_rate": 8.038969369852654e-07,
"loss": 0.443,
"num_tokens": 403809789.0,
"step": 193
},
{
"epoch": 4.217391304347826,
"grad_norm": 0.14176995288895658,
"learning_rate": 8.010703216830851e-07,
"loss": 0.4413,
"num_tokens": 405899766.0,
"step": 194
},
{
"epoch": 4.239130434782608,
"grad_norm": 0.13176722868730403,
"learning_rate": 7.982292489089621e-07,
"loss": 0.4416,
"num_tokens": 407993759.0,
"step": 195
},
{
"epoch": 4.260869565217392,
"grad_norm": 0.13658573742727376,
"learning_rate": 7.953738822614047e-07,
"loss": 0.4391,
"num_tokens": 410085571.0,
"step": 196
},
{
"epoch": 4.282608695652174,
"grad_norm": 0.13952183028099105,
"learning_rate": 7.92504386162009e-07,
"loss": 0.4349,
"num_tokens": 412179524.0,
"step": 197
},
{
"epoch": 4.304347826086957,
"grad_norm": 0.13564370570138545,
"learning_rate": 7.896209258459932e-07,
"loss": 0.4444,
"num_tokens": 414268672.0,
"step": 198
},
{
"epoch": 4.326086956521739,
"grad_norm": 0.14161938766244753,
"learning_rate": 7.867236673526819e-07,
"loss": 0.4437,
"num_tokens": 416362080.0,
"step": 199
},
{
"epoch": 4.3478260869565215,
"grad_norm": 0.13684396912767316,
"learning_rate": 7.838127775159451e-07,
"loss": 0.4436,
"num_tokens": 418453097.0,
"step": 200
},
{
"epoch": 4.369565217391305,
"grad_norm": 0.14184406812492364,
"learning_rate": 7.808884239545909e-07,
"loss": 0.4415,
"num_tokens": 420545560.0,
"step": 201
},
{
"epoch": 4.391304347826087,
"grad_norm": 0.1366458217786427,
"learning_rate": 7.779507750627144e-07,
"loss": 0.4402,
"num_tokens": 422636960.0,
"step": 202
},
{
"epoch": 4.413043478260869,
"grad_norm": 0.13773619821700897,
"learning_rate": 7.75e-07,
"loss": 0.4437,
"num_tokens": 424730959.0,
"step": 203
},
{
"epoch": 4.434782608695652,
"grad_norm": 0.13758121811172125,
"learning_rate": 7.720362686819813e-07,
"loss": 0.44,
"num_tokens": 426822955.0,
"step": 204
},
{
"epoch": 4.456521739130435,
"grad_norm": 0.13761175022454988,
"learning_rate": 7.690597517702567e-07,
"loss": 0.4423,
"num_tokens": 428914580.0,
"step": 205
},
{
"epoch": 4.478260869565218,
"grad_norm": 0.13045102347864693,
"learning_rate": 7.660706206626619e-07,
"loss": 0.4364,
"num_tokens": 431007431.0,
"step": 206
},
{
"epoch": 4.5,
"grad_norm": 0.13320915243246825,
"learning_rate": 7.630690474834003e-07,
"loss": 0.4376,
"num_tokens": 433101278.0,
"step": 207
},
{
"epoch": 4.521739130434782,
"grad_norm": 0.135879604945715,
"learning_rate": 7.600552050731314e-07,
"loss": 0.4392,
"num_tokens": 435193373.0,
"step": 208
},
{
"epoch": 4.543478260869565,
"grad_norm": 0.13761527466802642,
"learning_rate": 7.570292669790184e-07,
"loss": 0.4383,
"num_tokens": 437286568.0,
"step": 209
},
{
"epoch": 4.565217391304348,
"grad_norm": 0.1403373341420107,
"learning_rate": 7.539914074447348e-07,
"loss": 0.4419,
"num_tokens": 439379915.0,
"step": 210
},
{
"epoch": 4.586956521739131,
"grad_norm": 0.13604302231295956,
"learning_rate": 7.5094180140043e-07,
"loss": 0.44,
"num_tokens": 441473072.0,
"step": 211
},
{
"epoch": 4.608695652173913,
"grad_norm": 0.1322101614674662,
"learning_rate": 7.478806244526576e-07,
"loss": 0.4436,
"num_tokens": 443567056.0,
"step": 212
},
{
"epoch": 4.630434782608695,
"grad_norm": 0.13669477430314778,
"learning_rate": 7.448080528742623e-07,
"loss": 0.4398,
"num_tokens": 445657660.0,
"step": 213
},
{
"epoch": 4.6521739130434785,
"grad_norm": 0.1415477810417469,
"learning_rate": 7.417242635942297e-07,
"loss": 0.4394,
"num_tokens": 447751320.0,
"step": 214
},
{
"epoch": 4.673913043478261,
"grad_norm": 0.135566768628023,
"learning_rate": 7.38629434187499e-07,
"loss": 0.4386,
"num_tokens": 449843005.0,
"step": 215
},
{
"epoch": 4.695652173913043,
"grad_norm": 0.14123634359786125,
"learning_rate": 7.355237428647359e-07,
"loss": 0.4415,
"num_tokens": 451936377.0,
"step": 216
},
{
"epoch": 4.717391304347826,
"grad_norm": 0.1357872359821091,
"learning_rate": 7.324073684620725e-07,
"loss": 0.4389,
"num_tokens": 454027246.0,
"step": 217
},
{
"epoch": 4.739130434782608,
"grad_norm": 0.13106073468244842,
"learning_rate": 7.292804904308086e-07,
"loss": 0.4353,
"num_tokens": 456118801.0,
"step": 218
},
{
"epoch": 4.760869565217392,
"grad_norm": 0.13664278397915866,
"learning_rate": 7.261432888270776e-07,
"loss": 0.4436,
"num_tokens": 458211696.0,
"step": 219
},
{
"epoch": 4.782608695652174,
"grad_norm": 0.1359534311622986,
"learning_rate": 7.229959443014793e-07,
"loss": 0.4427,
"num_tokens": 460302365.0,
"step": 220
},
{
"epoch": 4.804347826086957,
"grad_norm": 0.1391728706721248,
"learning_rate": 7.198386380886764e-07,
"loss": 0.4378,
"num_tokens": 462395009.0,
"step": 221
},
{
"epoch": 4.826086956521739,
"grad_norm": 0.14471303133933838,
"learning_rate": 7.1667155199696e-07,
"loss": 0.4393,
"num_tokens": 464488113.0,
"step": 222
},
{
"epoch": 4.8478260869565215,
"grad_norm": 0.13340639535871296,
"learning_rate": 7.134948683977786e-07,
"loss": 0.4403,
"num_tokens": 466576826.0,
"step": 223
},
{
"epoch": 4.869565217391305,
"grad_norm": 0.13672161474434347,
"learning_rate": 7.103087702152376e-07,
"loss": 0.4377,
"num_tokens": 468668935.0,
"step": 224
},
{
"epoch": 4.891304347826087,
"grad_norm": 0.1344120648043691,
"learning_rate": 7.071134409155658e-07,
"loss": 0.4399,
"num_tokens": 470761454.0,
"step": 225
},
{
"epoch": 4.913043478260869,
"grad_norm": 0.13711458441843338,
"learning_rate": 7.039090644965509e-07,
"loss": 0.4432,
"num_tokens": 472854948.0,
"step": 226
},
{
"epoch": 4.934782608695652,
"grad_norm": 0.1355562476824205,
"learning_rate": 7.006958254769437e-07,
"loss": 0.4404,
"num_tokens": 474946262.0,
"step": 227
},
{
"epoch": 4.956521739130435,
"grad_norm": 0.13762364563023322,
"learning_rate": 6.974739088858337e-07,
"loss": 0.439,
"num_tokens": 477036875.0,
"step": 228
},
{
"epoch": 4.978260869565218,
"grad_norm": 0.13969976190229713,
"learning_rate": 6.942435002519938e-07,
"loss": 0.4379,
"num_tokens": 479130327.0,
"step": 229
},
{
"epoch": 5.0,
"grad_norm": 0.13408662519831854,
"learning_rate": 6.91004785593197e-07,
"loss": 0.4466,
"num_tokens": 481221569.0,
"step": 230
},
{
"epoch": 5.021739130434782,
"grad_norm": 0.14032592776135683,
"learning_rate": 6.877579514055058e-07,
"loss": 0.4396,
"num_tokens": 483311867.0,
"step": 231
},
{
"epoch": 5.043478260869565,
"grad_norm": 0.13548191720383043,
"learning_rate": 6.845031846525321e-07,
"loss": 0.4347,
"num_tokens": 485403919.0,
"step": 232
},
{
"epoch": 5.065217391304348,
"grad_norm": 0.130439503781961,
"learning_rate": 6.812406727546712e-07,
"loss": 0.4389,
"num_tokens": 487494574.0,
"step": 233
},
{
"epoch": 5.086956521739131,
"grad_norm": 0.13779175423288925,
"learning_rate": 6.779706035783104e-07,
"loss": 0.4348,
"num_tokens": 489585429.0,
"step": 234
},
{
"epoch": 5.108695652173913,
"grad_norm": 0.13497846536624478,
"learning_rate": 6.7469316542501e-07,
"loss": 0.4371,
"num_tokens": 491679660.0,
"step": 235
},
{
"epoch": 5.130434782608695,
"grad_norm": 0.14168178870357873,
"learning_rate": 6.714085470206609e-07,
"loss": 0.4383,
"num_tokens": 493769766.0,
"step": 236
},
{
"epoch": 5.1521739130434785,
"grad_norm": 0.13453405538660843,
"learning_rate": 6.681169375046172e-07,
"loss": 0.438,
"num_tokens": 495862475.0,
"step": 237
},
{
"epoch": 5.173913043478261,
"grad_norm": 0.13202645111151726,
"learning_rate": 6.648185264188042e-07,
"loss": 0.4381,
"num_tokens": 497955918.0,
"step": 238
},
{
"epoch": 5.195652173913044,
"grad_norm": 0.13125723135499598,
"learning_rate": 6.615135036968049e-07,
"loss": 0.4364,
"num_tokens": 500047691.0,
"step": 239
},
{
"epoch": 5.217391304347826,
"grad_norm": 0.1335404532583581,
"learning_rate": 6.582020596529223e-07,
"loss": 0.4346,
"num_tokens": 502139896.0,
"step": 240
},
{
"epoch": 5.239130434782608,
"grad_norm": 0.13521276914397093,
"learning_rate": 6.548843849712204e-07,
"loss": 0.4402,
"num_tokens": 504233584.0,
"step": 241
},
{
"epoch": 5.260869565217392,
"grad_norm": 0.13773428031418447,
"learning_rate": 6.515606706945448e-07,
"loss": 0.4369,
"num_tokens": 506324690.0,
"step": 242
},
{
"epoch": 5.282608695652174,
"grad_norm": 0.13773029409346027,
"learning_rate": 6.482311082135207e-07,
"loss": 0.4395,
"num_tokens": 508417180.0,
"step": 243
},
{
"epoch": 5.304347826086957,
"grad_norm": 0.1356871115442459,
"learning_rate": 6.448958892555331e-07,
"loss": 0.4365,
"num_tokens": 510508979.0,
"step": 244
},
{
"epoch": 5.326086956521739,
"grad_norm": 0.13534102412383026,
"learning_rate": 6.415552058736853e-07,
"loss": 0.4389,
"num_tokens": 512598937.0,
"step": 245
},
{
"epoch": 5.3478260869565215,
"grad_norm": 0.13264202592618046,
"learning_rate": 6.382092504357407e-07,
"loss": 0.4317,
"num_tokens": 514689904.0,
"step": 246
},
{
"epoch": 5.369565217391305,
"grad_norm": 0.1333826695615688,
"learning_rate": 6.348582156130461e-07,
"loss": 0.4383,
"num_tokens": 516783964.0,
"step": 247
},
{
"epoch": 5.391304347826087,
"grad_norm": 0.13899071330031054,
"learning_rate": 6.315022943694351e-07,
"loss": 0.4403,
"num_tokens": 518876985.0,
"step": 248
},
{
"epoch": 5.413043478260869,
"grad_norm": 0.13778710354618337,
"learning_rate": 6.281416799501187e-07,
"loss": 0.4399,
"num_tokens": 520970152.0,
"step": 249
},
{
"epoch": 5.434782608695652,
"grad_norm": 0.13632128488986214,
"learning_rate": 6.247765658705564e-07,
"loss": 0.4337,
"num_tokens": 523061474.0,
"step": 250
},
{
"epoch": 5.456521739130435,
"grad_norm": 0.1462604946144207,
"learning_rate": 6.21407145905313e-07,
"loss": 0.4404,
"num_tokens": 525152838.0,
"step": 251
},
{
"epoch": 5.478260869565218,
"grad_norm": 0.13766959751545696,
"learning_rate": 6.180336140769014e-07,
"loss": 0.4408,
"num_tokens": 527243818.0,
"step": 252
},
{
"epoch": 5.5,
"grad_norm": 0.13758581395499722,
"learning_rate": 6.146561646446086e-07,
"loss": 0.4369,
"num_tokens": 529336853.0,
"step": 253
},
{
"epoch": 5.521739130434782,
"grad_norm": 0.13513710600810164,
"learning_rate": 6.11274992093311e-07,
"loss": 0.4308,
"num_tokens": 531430293.0,
"step": 254
},
{
"epoch": 5.543478260869565,
"grad_norm": 0.13912312404631008,
"learning_rate": 6.078902911222739e-07,
"loss": 0.4383,
"num_tokens": 533522415.0,
"step": 255
},
{
"epoch": 5.565217391304348,
"grad_norm": 0.1388530569147137,
"learning_rate": 6.045022566339418e-07,
"loss": 0.4376,
"num_tokens": 535617186.0,
"step": 256
},
{
"epoch": 5.586956521739131,
"grad_norm": 0.1374779214335327,
"learning_rate": 6.011110837227137e-07,
"loss": 0.4308,
"num_tokens": 537709724.0,
"step": 257
},
{
"epoch": 5.608695652173913,
"grad_norm": 0.1377630412176324,
"learning_rate": 5.977169676637097e-07,
"loss": 0.4468,
"num_tokens": 539801117.0,
"step": 258
},
{
"epoch": 5.630434782608695,
"grad_norm": 0.13653727942572233,
"learning_rate": 5.943201039015259e-07,
"loss": 0.4388,
"num_tokens": 541895241.0,
"step": 259
},
{
"epoch": 5.6521739130434785,
"grad_norm": 0.14282273069692783,
"learning_rate": 5.909206880389812e-07,
"loss": 0.4377,
"num_tokens": 543985798.0,
"step": 260
},
{
"epoch": 5.673913043478261,
"grad_norm": 0.13054693337369536,
"learning_rate": 5.87518915825852e-07,
"loss": 0.4363,
"num_tokens": 546077077.0,
"step": 261
},
{
"epoch": 5.695652173913043,
"grad_norm": 0.13814789552591392,
"learning_rate": 5.841149831476024e-07,
"loss": 0.4385,
"num_tokens": 548170917.0,
"step": 262
},
{
"epoch": 5.717391304347826,
"grad_norm": 0.13844152235488286,
"learning_rate": 5.80709086014102e-07,
"loss": 0.4333,
"num_tokens": 550259924.0,
"step": 263
},
{
"epoch": 5.739130434782608,
"grad_norm": 0.14059296175840055,
"learning_rate": 5.773014205483413e-07,
"loss": 0.4379,
"num_tokens": 552352828.0,
"step": 264
},
{
"epoch": 5.760869565217392,
"grad_norm": 0.13847725520224147,
"learning_rate": 5.738921829751373e-07,
"loss": 0.442,
"num_tokens": 554447030.0,
"step": 265
},
{
"epoch": 5.782608695652174,
"grad_norm": 0.13609874773318442,
"learning_rate": 5.704815696098336e-07,
"loss": 0.4379,
"num_tokens": 556540214.0,
"step": 266
},
{
"epoch": 5.804347826086957,
"grad_norm": 0.13297354961540186,
"learning_rate": 5.67069776846997e-07,
"loss": 0.4325,
"num_tokens": 558631373.0,
"step": 267
},
{
"epoch": 5.826086956521739,
"grad_norm": 0.13567272164852606,
"learning_rate": 5.636570011491081e-07,
"loss": 0.4388,
"num_tokens": 560726058.0,
"step": 268
},
{
"epoch": 5.8478260869565215,
"grad_norm": 0.13162046638977157,
"learning_rate": 5.602434390352476e-07,
"loss": 0.4329,
"num_tokens": 562819414.0,
"step": 269
},
{
"epoch": 5.869565217391305,
"grad_norm": 0.13332396886010534,
"learning_rate": 5.568292870697812e-07,
"loss": 0.4341,
"num_tokens": 564912852.0,
"step": 270
},
{
"epoch": 5.891304347826087,
"grad_norm": 0.13584602864934453,
"learning_rate": 5.5341474185104e-07,
"loss": 0.4297,
"num_tokens": 567005867.0,
"step": 271
},
{
"epoch": 5.913043478260869,
"grad_norm": 0.1369250861070486,
"learning_rate": 5.5e-07,
"loss": 0.4369,
"num_tokens": 569101094.0,
"step": 272
},
{
"epoch": 5.934782608695652,
"grad_norm": 0.1377729350140169,
"learning_rate": 5.4658525814896e-07,
"loss": 0.4356,
"num_tokens": 571193885.0,
"step": 273
},
{
"epoch": 5.956521739130435,
"grad_norm": 0.1365562431007031,
"learning_rate": 5.431707129302188e-07,
"loss": 0.4363,
"num_tokens": 573284677.0,
"step": 274
},
{
"epoch": 5.978260869565218,
"grad_norm": 0.13502269056012137,
"learning_rate": 5.397565609647524e-07,
"loss": 0.4304,
"num_tokens": 575374743.0,
"step": 275
},
{
"epoch": 6.0,
"grad_norm": 0.13363349453152876,
"learning_rate": 5.36342998850892e-07,
"loss": 0.4368,
"num_tokens": 577464868.0,
"step": 276
},
{
"epoch": 6.021739130434782,
"grad_norm": 0.1328059670364238,
"learning_rate": 5.329302231530028e-07,
"loss": 0.4315,
"num_tokens": 579556555.0,
"step": 277
},
{
"epoch": 6.043478260869565,
"grad_norm": 0.13754954688543153,
"learning_rate": 5.295184303901664e-07,
"loss": 0.4338,
"num_tokens": 581648063.0,
"step": 278
},
{
"epoch": 6.065217391304348,
"grad_norm": 0.12821659544185995,
"learning_rate": 5.261078170248629e-07,
"loss": 0.4355,
"num_tokens": 583740874.0,
"step": 279
},
{
"epoch": 6.086956521739131,
"grad_norm": 0.13679137656422818,
"learning_rate": 5.226985794516586e-07,
"loss": 0.4319,
"num_tokens": 585832349.0,
"step": 280
},
{
"epoch": 6.108695652173913,
"grad_norm": 0.13249307568241678,
"learning_rate": 5.192909139858981e-07,
"loss": 0.4338,
"num_tokens": 587921984.0,
"step": 281
},
{
"epoch": 6.130434782608695,
"grad_norm": 0.1357134504296526,
"learning_rate": 5.158850168523978e-07,
"loss": 0.4406,
"num_tokens": 590012104.0,
"step": 282
},
{
"epoch": 6.1521739130434785,
"grad_norm": 0.13359851753248614,
"learning_rate": 5.124810841741479e-07,
"loss": 0.4367,
"num_tokens": 592104332.0,
"step": 283
},
{
"epoch": 6.173913043478261,
"grad_norm": 0.13330128917193781,
"learning_rate": 5.090793119610189e-07,
"loss": 0.4365,
"num_tokens": 594197153.0,
"step": 284
},
{
"epoch": 6.195652173913044,
"grad_norm": 0.1389379881395303,
"learning_rate": 5.05679896098474e-07,
"loss": 0.4306,
"num_tokens": 596290282.0,
"step": 285
},
{
"epoch": 6.217391304347826,
"grad_norm": 0.13502081622963785,
"learning_rate": 5.022830323362904e-07,
"loss": 0.4339,
"num_tokens": 598381737.0,
"step": 286
},
{
"epoch": 6.239130434782608,
"grad_norm": 0.1300343985791597,
"learning_rate": 4.988889162772862e-07,
"loss": 0.4287,
"num_tokens": 600474261.0,
"step": 287
},
{
"epoch": 6.260869565217392,
"grad_norm": 0.1346594845108426,
"learning_rate": 4.954977433660582e-07,
"loss": 0.4328,
"num_tokens": 602567714.0,
"step": 288
},
{
"epoch": 6.282608695652174,
"grad_norm": 0.1347227053154414,
"learning_rate": 4.921097088777261e-07,
"loss": 0.4279,
"num_tokens": 604656969.0,
"step": 289
},
{
"epoch": 6.304347826086957,
"grad_norm": 0.1300345204916007,
"learning_rate": 4.887250079066891e-07,
"loss": 0.4324,
"num_tokens": 606751602.0,
"step": 290
},
{
"epoch": 6.326086956521739,
"grad_norm": 0.1373172335581007,
"learning_rate": 4.853438353553913e-07,
"loss": 0.4352,
"num_tokens": 608844165.0,
"step": 291
},
{
"epoch": 6.3478260869565215,
"grad_norm": 0.1355884963872206,
"learning_rate": 4.819663859230986e-07,
"loss": 0.4358,
"num_tokens": 610938529.0,
"step": 292
},
{
"epoch": 6.369565217391305,
"grad_norm": 0.13411787711864406,
"learning_rate": 4.785928540946868e-07,
"loss": 0.4353,
"num_tokens": 613033101.0,
"step": 293
},
{
"epoch": 6.391304347826087,
"grad_norm": 0.13405503421436976,
"learning_rate": 4.752234341294438e-07,
"loss": 0.4405,
"num_tokens": 615127003.0,
"step": 294
},
{
"epoch": 6.413043478260869,
"grad_norm": 0.13437305820655193,
"learning_rate": 4.7185832004988133e-07,
"loss": 0.433,
"num_tokens": 617218474.0,
"step": 295
},
{
"epoch": 6.434782608695652,
"grad_norm": 0.1321759781171093,
"learning_rate": 4.684977056305649e-07,
"loss": 0.4391,
"num_tokens": 619311600.0,
"step": 296
},
{
"epoch": 6.456521739130435,
"grad_norm": 0.1305282668313758,
"learning_rate": 4.6514178438695393e-07,
"loss": 0.4388,
"num_tokens": 621403757.0,
"step": 297
},
{
"epoch": 6.478260869565218,
"grad_norm": 0.13202954037114603,
"learning_rate": 4.6179074956425933e-07,
"loss": 0.4292,
"num_tokens": 623498451.0,
"step": 298
},
{
"epoch": 6.5,
"grad_norm": 0.13284707048157723,
"learning_rate": 4.584447941263149e-07,
"loss": 0.4325,
"num_tokens": 625591661.0,
"step": 299
},
{
"epoch": 6.521739130434782,
"grad_norm": 0.13602035136541316,
"learning_rate": 4.551041107444671e-07,
"loss": 0.4392,
"num_tokens": 627682691.0,
"step": 300
},
{
"epoch": 6.543478260869565,
"grad_norm": 0.13598544826900796,
"learning_rate": 4.517688917864794e-07,
"loss": 0.4353,
"num_tokens": 629776091.0,
"step": 301
},
{
"epoch": 6.565217391304348,
"grad_norm": 0.1338409318077529,
"learning_rate": 4.4843932930545523e-07,
"loss": 0.4345,
"num_tokens": 631868289.0,
"step": 302
},
{
"epoch": 6.586956521739131,
"grad_norm": 0.1342983907771441,
"learning_rate": 4.4511561502877957e-07,
"loss": 0.4369,
"num_tokens": 633961314.0,
"step": 303
},
{
"epoch": 6.608695652173913,
"grad_norm": 0.13346437927742005,
"learning_rate": 4.417979403470777e-07,
"loss": 0.431,
"num_tokens": 636053320.0,
"step": 304
},
{
"epoch": 6.630434782608695,
"grad_norm": 0.1310788686933386,
"learning_rate": 4.384864963031951e-07,
"loss": 0.4356,
"num_tokens": 638148918.0,
"step": 305
},
{
"epoch": 6.6521739130434785,
"grad_norm": 0.13154555127052447,
"learning_rate": 4.3518147358119574e-07,
"loss": 0.4339,
"num_tokens": 640240048.0,
"step": 306
},
{
"epoch": 6.673913043478261,
"grad_norm": 0.13334619119653454,
"learning_rate": 4.3188306249538274e-07,
"loss": 0.4314,
"num_tokens": 642332226.0,
"step": 307
},
{
"epoch": 6.695652173913043,
"grad_norm": 0.13503339105593512,
"learning_rate": 4.285914529793391e-07,
"loss": 0.4342,
"num_tokens": 644421381.0,
"step": 308
},
{
"epoch": 6.717391304347826,
"grad_norm": 0.13212880134022156,
"learning_rate": 4.2530683457499015e-07,
"loss": 0.4363,
"num_tokens": 646510254.0,
"step": 309
},
{
"epoch": 6.739130434782608,
"grad_norm": 0.13394002235598818,
"learning_rate": 4.220293964216898e-07,
"loss": 0.4359,
"num_tokens": 648602689.0,
"step": 310
},
{
"epoch": 6.760869565217392,
"grad_norm": 0.14660384432956694,
"learning_rate": 4.187593272453288e-07,
"loss": 0.4365,
"num_tokens": 650696691.0,
"step": 311
},
{
"epoch": 6.782608695652174,
"grad_norm": 0.13239093242319802,
"learning_rate": 4.154968153474679e-07,
"loss": 0.4347,
"num_tokens": 652788120.0,
"step": 312
},
{
"epoch": 6.804347826086957,
"grad_norm": 0.1342643540007929,
"learning_rate": 4.1224204859449416e-07,
"loss": 0.433,
"num_tokens": 654880645.0,
"step": 313
},
{
"epoch": 6.826086956521739,
"grad_norm": 0.13275609367078162,
"learning_rate": 4.0899521440680306e-07,
"loss": 0.4355,
"num_tokens": 656971574.0,
"step": 314
},
{
"epoch": 6.8478260869565215,
"grad_norm": 0.13259301897142664,
"learning_rate": 4.057564997480063e-07,
"loss": 0.4332,
"num_tokens": 659063107.0,
"step": 315
},
{
"epoch": 6.869565217391305,
"grad_norm": 0.13223705233189004,
"learning_rate": 4.0252609111416633e-07,
"loss": 0.4337,
"num_tokens": 661155890.0,
"step": 316
},
{
"epoch": 6.891304347826087,
"grad_norm": 0.13224068238361178,
"learning_rate": 3.993041745230562e-07,
"loss": 0.4309,
"num_tokens": 663247212.0,
"step": 317
},
{
"epoch": 6.913043478260869,
"grad_norm": 0.13637009129468228,
"learning_rate": 3.9609093550344907e-07,
"loss": 0.4321,
"num_tokens": 665338057.0,
"step": 318
},
{
"epoch": 6.934782608695652,
"grad_norm": 0.13707779499269707,
"learning_rate": 3.9288655908443423e-07,
"loss": 0.4329,
"num_tokens": 667432205.0,
"step": 319
},
{
"epoch": 6.956521739130435,
"grad_norm": 0.132109300794455,
"learning_rate": 3.8969122978476253e-07,
"loss": 0.4346,
"num_tokens": 669525185.0,
"step": 320
},
{
"epoch": 6.978260869565218,
"grad_norm": 0.13205979491874026,
"learning_rate": 3.865051316022214e-07,
"loss": 0.4327,
"num_tokens": 671616364.0,
"step": 321
},
{
"epoch": 7.0,
"grad_norm": 0.135129481925338,
"learning_rate": 3.8332844800303996e-07,
"loss": 0.4378,
"num_tokens": 673708729.0,
"step": 322
},
{
"epoch": 7.021739130434782,
"grad_norm": 0.1351161665425618,
"learning_rate": 3.8016136191132354e-07,
"loss": 0.4365,
"num_tokens": 675800356.0,
"step": 323
},
{
"epoch": 7.043478260869565,
"grad_norm": 0.1349927733423944,
"learning_rate": 3.770040556985208e-07,
"loss": 0.4328,
"num_tokens": 677892616.0,
"step": 324
},
{
"epoch": 7.065217391304348,
"grad_norm": 0.1352555819729584,
"learning_rate": 3.738567111729224e-07,
"loss": 0.4334,
"num_tokens": 679985493.0,
"step": 325
},
{
"epoch": 7.086956521739131,
"grad_norm": 0.13250294100790336,
"learning_rate": 3.707195095691913e-07,
"loss": 0.4328,
"num_tokens": 682078744.0,
"step": 326
},
{
"epoch": 7.108695652173913,
"grad_norm": 0.13666594640233576,
"learning_rate": 3.675926315379274e-07,
"loss": 0.4322,
"num_tokens": 684171663.0,
"step": 327
},
{
"epoch": 7.130434782608695,
"grad_norm": 0.13676036413293755,
"learning_rate": 3.644762571352641e-07,
"loss": 0.433,
"num_tokens": 686264097.0,
"step": 328
},
{
"epoch": 7.1521739130434785,
"grad_norm": 0.12951987903608786,
"learning_rate": 3.6137056581250137e-07,
"loss": 0.4365,
"num_tokens": 688358020.0,
"step": 329
},
{
"epoch": 7.173913043478261,
"grad_norm": 0.13212843134698485,
"learning_rate": 3.5827573640577033e-07,
"loss": 0.4333,
"num_tokens": 690450259.0,
"step": 330
},
{
"epoch": 7.195652173913044,
"grad_norm": 0.13407226703097985,
"learning_rate": 3.5519194712573787e-07,
"loss": 0.4371,
"num_tokens": 692542688.0,
"step": 331
},
{
"epoch": 7.217391304347826,
"grad_norm": 0.13581794126806007,
"learning_rate": 3.521193755473423e-07,
"loss": 0.4372,
"num_tokens": 694633082.0,
"step": 332
},
{
"epoch": 7.239130434782608,
"grad_norm": 0.1276611741507177,
"learning_rate": 3.4905819859957e-07,
"loss": 0.4303,
"num_tokens": 696724260.0,
"step": 333
},
{
"epoch": 7.260869565217392,
"grad_norm": 0.13106145609895156,
"learning_rate": 3.460085925552653e-07,
"loss": 0.437,
"num_tokens": 698813738.0,
"step": 334
},
{
"epoch": 7.282608695652174,
"grad_norm": 0.1306738813490512,
"learning_rate": 3.4297073302098155e-07,
"loss": 0.432,
"num_tokens": 700905567.0,
"step": 335
},
{
"epoch": 7.304347826086957,
"grad_norm": 0.1327736232372846,
"learning_rate": 3.399447949268686e-07,
"loss": 0.4285,
"num_tokens": 702999204.0,
"step": 336
},
{
"epoch": 7.326086956521739,
"grad_norm": 0.1332013722230655,
"learning_rate": 3.369309525165997e-07,
"loss": 0.4339,
"num_tokens": 705092251.0,
"step": 337
},
{
"epoch": 7.3478260869565215,
"grad_norm": 0.12940464875966667,
"learning_rate": 3.33929379337338e-07,
"loss": 0.4288,
"num_tokens": 707186648.0,
"step": 338
},
{
"epoch": 7.369565217391305,
"grad_norm": 0.13228179281843067,
"learning_rate": 3.30940248229743e-07,
"loss": 0.4302,
"num_tokens": 709278066.0,
"step": 339
},
{
"epoch": 7.391304347826087,
"grad_norm": 0.13503745540474288,
"learning_rate": 3.279637313180187e-07,
"loss": 0.431,
"num_tokens": 711370104.0,
"step": 340
},
{
"epoch": 7.413043478260869,
"grad_norm": 0.14399326300596813,
"learning_rate": 3.250000000000001e-07,
"loss": 0.4295,
"num_tokens": 713463519.0,
"step": 341
},
{
"epoch": 7.434782608695652,
"grad_norm": 0.1335803366951806,
"learning_rate": 3.220492249372857e-07,
"loss": 0.4333,
"num_tokens": 715555950.0,
"step": 342
},
{
"epoch": 7.456521739130435,
"grad_norm": 0.1321490413754198,
"learning_rate": 3.191115760454092e-07,
"loss": 0.4334,
"num_tokens": 717644115.0,
"step": 343
},
{
"epoch": 7.478260869565218,
"grad_norm": 0.1378381482716753,
"learning_rate": 3.16187222484055e-07,
"loss": 0.4278,
"num_tokens": 719737075.0,
"step": 344
},
{
"epoch": 7.5,
"grad_norm": 0.13813454110009424,
"learning_rate": 3.1327633264731803e-07,
"loss": 0.4354,
"num_tokens": 721829111.0,
"step": 345
},
{
"epoch": 7.521739130434782,
"grad_norm": 0.13507583558096983,
"learning_rate": 3.103790741540067e-07,
"loss": 0.4346,
"num_tokens": 723921674.0,
"step": 346
},
{
"epoch": 7.543478260869565,
"grad_norm": 0.13094378688454855,
"learning_rate": 3.0749561383799107e-07,
"loss": 0.4331,
"num_tokens": 726014640.0,
"step": 347
},
{
"epoch": 7.565217391304348,
"grad_norm": 0.1340997929945924,
"learning_rate": 3.0462611773859536e-07,
"loss": 0.4269,
"num_tokens": 728103347.0,
"step": 348
},
{
"epoch": 7.586956521739131,
"grad_norm": 0.13188855397354393,
"learning_rate": 3.017707510910378e-07,
"loss": 0.4315,
"num_tokens": 730196549.0,
"step": 349
},
{
"epoch": 7.608695652173913,
"grad_norm": 0.13420100529200588,
"learning_rate": 2.9892967831691504e-07,
"loss": 0.4287,
"num_tokens": 732290824.0,
"step": 350
},
{
"epoch": 7.630434782608695,
"grad_norm": 0.13059199827577136,
"learning_rate": 2.961030630147346e-07,
"loss": 0.4353,
"num_tokens": 734382209.0,
"step": 351
},
{
"epoch": 7.6521739130434785,
"grad_norm": 0.1317331651303684,
"learning_rate": 2.9329106795049443e-07,
"loss": 0.4291,
"num_tokens": 736476467.0,
"step": 352
},
{
"epoch": 7.673913043478261,
"grad_norm": 0.13586368026615056,
"learning_rate": 2.904938550483098e-07,
"loss": 0.4361,
"num_tokens": 738567121.0,
"step": 353
},
{
"epoch": 7.695652173913043,
"grad_norm": 0.1333568766936638,
"learning_rate": 2.8771158538108976e-07,
"loss": 0.4316,
"num_tokens": 740660970.0,
"step": 354
},
{
"epoch": 7.717391304347826,
"grad_norm": 0.13226567822802499,
"learning_rate": 2.849444191612613e-07,
"loss": 0.4313,
"num_tokens": 742752708.0,
"step": 355
},
{
"epoch": 7.739130434782608,
"grad_norm": 0.13495676434313775,
"learning_rate": 2.821925157315447e-07,
"loss": 0.4304,
"num_tokens": 744844699.0,
"step": 356
},
{
"epoch": 7.760869565217392,
"grad_norm": 0.13174968471614787,
"learning_rate": 2.7945603355577707e-07,
"loss": 0.4331,
"num_tokens": 746938461.0,
"step": 357
},
{
"epoch": 7.782608695652174,
"grad_norm": 0.13315770983596392,
"learning_rate": 2.7673513020978866e-07,
"loss": 0.4382,
"num_tokens": 749030330.0,
"step": 358
},
{
"epoch": 7.804347826086957,
"grad_norm": 0.1308412156094041,
"learning_rate": 2.7402996237232757e-07,
"loss": 0.4318,
"num_tokens": 751124081.0,
"step": 359
},
{
"epoch": 7.826086956521739,
"grad_norm": 0.14307594281989688,
"learning_rate": 2.713406858160393e-07,
"loss": 0.4257,
"num_tokens": 753217342.0,
"step": 360
},
{
"epoch": 7.8478260869565215,
"grad_norm": 0.13412489931327856,
"learning_rate": 2.686674553984951e-07,
"loss": 0.4337,
"num_tokens": 755310255.0,
"step": 361
},
{
"epoch": 7.869565217391305,
"grad_norm": 0.13326162890783366,
"learning_rate": 2.6601042505327635e-07,
"loss": 0.4278,
"num_tokens": 757400781.0,
"step": 362
},
{
"epoch": 7.891304347826087,
"grad_norm": 0.13553749314962396,
"learning_rate": 2.6336974778110974e-07,
"loss": 0.4335,
"num_tokens": 759493454.0,
"step": 363
},
{
"epoch": 7.913043478260869,
"grad_norm": 0.13276724428197223,
"learning_rate": 2.6074557564105724e-07,
"loss": 0.4297,
"num_tokens": 761586571.0,
"step": 364
},
{
"epoch": 7.934782608695652,
"grad_norm": 0.13353006154405495,
"learning_rate": 2.5813805974175984e-07,
"loss": 0.4366,
"num_tokens": 763678682.0,
"step": 365
},
{
"epoch": 7.956521739130435,
"grad_norm": 0.13422033595407565,
"learning_rate": 2.55547350232736e-07,
"loss": 0.4336,
"num_tokens": 765769422.0,
"step": 366
},
{
"epoch": 7.978260869565218,
"grad_norm": 0.13744588281494488,
"learning_rate": 2.529735962957361e-07,
"loss": 0.432,
"num_tokens": 767860704.0,
"step": 367
},
{
"epoch": 8.0,
"grad_norm": 0.13605419812550198,
"learning_rate": 2.504169461361518e-07,
"loss": 0.4346,
"num_tokens": 769952460.0,
"step": 368
},
{
"epoch": 8.021739130434783,
"grad_norm": 0.12913143473579597,
"learning_rate": 2.478775469744815e-07,
"loss": 0.4325,
"num_tokens": 772044426.0,
"step": 369
},
{
"epoch": 8.043478260869565,
"grad_norm": 0.12961630524558168,
"learning_rate": 2.453555450378535e-07,
"loss": 0.4281,
"num_tokens": 774137051.0,
"step": 370
},
{
"epoch": 8.065217391304348,
"grad_norm": 0.12615636919892154,
"learning_rate": 2.4285108555160575e-07,
"loss": 0.4309,
"num_tokens": 776225808.0,
"step": 371
},
{
"epoch": 8.08695652173913,
"grad_norm": 0.13305530149593306,
"learning_rate": 2.4036431273092235e-07,
"loss": 0.4331,
"num_tokens": 778316140.0,
"step": 372
},
{
"epoch": 8.108695652173912,
"grad_norm": 0.13211852945485628,
"learning_rate": 2.378953697725303e-07,
"loss": 0.4295,
"num_tokens": 780410250.0,
"step": 373
},
{
"epoch": 8.130434782608695,
"grad_norm": 0.1292232171342386,
"learning_rate": 2.3544439884645314e-07,
"loss": 0.4354,
"num_tokens": 782502683.0,
"step": 374
},
{
"epoch": 8.152173913043478,
"grad_norm": 0.1313175653486085,
"learning_rate": 2.3301154108782453e-07,
"loss": 0.4256,
"num_tokens": 784596876.0,
"step": 375
},
{
"epoch": 8.173913043478262,
"grad_norm": 0.13116271616904762,
"learning_rate": 2.3059693658876094e-07,
"loss": 0.434,
"num_tokens": 786687028.0,
"step": 376
},
{
"epoch": 8.195652173913043,
"grad_norm": 0.1290464225342815,
"learning_rate": 2.2820072439029523e-07,
"loss": 0.4307,
"num_tokens": 788781709.0,
"step": 377
},
{
"epoch": 8.217391304347826,
"grad_norm": 0.13638944899256616,
"learning_rate": 2.2582304247436962e-07,
"loss": 0.4305,
"num_tokens": 790874628.0,
"step": 378
},
{
"epoch": 8.23913043478261,
"grad_norm": 0.13152310229196626,
"learning_rate": 2.2346402775589042e-07,
"loss": 0.4353,
"num_tokens": 792968522.0,
"step": 379
},
{
"epoch": 8.26086956521739,
"grad_norm": 0.13177637659531652,
"learning_rate": 2.2112381607484416e-07,
"loss": 0.4335,
"num_tokens": 795061203.0,
"step": 380
},
{
"epoch": 8.282608695652174,
"grad_norm": 0.1325848225519375,
"learning_rate": 2.1880254218847538e-07,
"loss": 0.4309,
"num_tokens": 797155596.0,
"step": 381
},
{
"epoch": 8.304347826086957,
"grad_norm": 0.13091228502197777,
"learning_rate": 2.1650033976352643e-07,
"loss": 0.4273,
"num_tokens": 799250052.0,
"step": 382
},
{
"epoch": 8.326086956521738,
"grad_norm": 0.12968816077115408,
"learning_rate": 2.1421734136854153e-07,
"loss": 0.4283,
"num_tokens": 801343245.0,
"step": 383
},
{
"epoch": 8.347826086956522,
"grad_norm": 0.13538549199023636,
"learning_rate": 2.1195367846623207e-07,
"loss": 0.4336,
"num_tokens": 803434640.0,
"step": 384
},
{
"epoch": 8.369565217391305,
"grad_norm": 0.12970848773853702,
"learning_rate": 2.0970948140590672e-07,
"loss": 0.4258,
"num_tokens": 805528060.0,
"step": 385
},
{
"epoch": 8.391304347826088,
"grad_norm": 0.13268823242396818,
"learning_rate": 2.0748487941596594e-07,
"loss": 0.4325,
"num_tokens": 807620018.0,
"step": 386
},
{
"epoch": 8.41304347826087,
"grad_norm": 0.130777558774095,
"learning_rate": 2.0528000059645995e-07,
"loss": 0.4341,
"num_tokens": 809712424.0,
"step": 387
},
{
"epoch": 8.434782608695652,
"grad_norm": 0.13276667916448534,
"learning_rate": 2.0309497191171281e-07,
"loss": 0.4301,
"num_tokens": 811806174.0,
"step": 388
},
{
"epoch": 8.456521739130435,
"grad_norm": 0.13377311658786517,
"learning_rate": 2.0092991918301106e-07,
"loss": 0.437,
"num_tokens": 813898764.0,
"step": 389
},
{
"epoch": 8.478260869565217,
"grad_norm": 0.1341059998288671,
"learning_rate": 1.9878496708135884e-07,
"loss": 0.4323,
"num_tokens": 815990077.0,
"step": 390
},
{
"epoch": 8.5,
"grad_norm": 0.13253582182125295,
"learning_rate": 1.9666023912029849e-07,
"loss": 0.4402,
"num_tokens": 818080579.0,
"step": 391
},
{
"epoch": 8.521739130434783,
"grad_norm": 0.1302260861058939,
"learning_rate": 1.9455585764879873e-07,
"loss": 0.4338,
"num_tokens": 820171456.0,
"step": 392
},
{
"epoch": 8.543478260869565,
"grad_norm": 0.12879496335031015,
"learning_rate": 1.924719438442085e-07,
"loss": 0.4267,
"num_tokens": 822264249.0,
"step": 393
},
{
"epoch": 8.565217391304348,
"grad_norm": 0.13047843266489334,
"learning_rate": 1.9040861770528043e-07,
"loss": 0.4324,
"num_tokens": 824355668.0,
"step": 394
},
{
"epoch": 8.58695652173913,
"grad_norm": 0.13118675730217721,
"learning_rate": 1.883659980452598e-07,
"loss": 0.4286,
"num_tokens": 826448015.0,
"step": 395
},
{
"epoch": 8.608695652173914,
"grad_norm": 0.131767591474556,
"learning_rate": 1.863442024850438e-07,
"loss": 0.4355,
"num_tokens": 828540071.0,
"step": 396
},
{
"epoch": 8.630434782608695,
"grad_norm": 0.13480920458872578,
"learning_rate": 1.843433474464076e-07,
"loss": 0.4302,
"num_tokens": 830635170.0,
"step": 397
},
{
"epoch": 8.652173913043478,
"grad_norm": 0.1315376912303259,
"learning_rate": 1.8236354814530112e-07,
"loss": 0.4359,
"num_tokens": 832727964.0,
"step": 398
},
{
"epoch": 8.673913043478262,
"grad_norm": 0.1332959061250401,
"learning_rate": 1.80404918585214e-07,
"loss": 0.4344,
"num_tokens": 834819503.0,
"step": 399
},
{
"epoch": 8.695652173913043,
"grad_norm": 0.1357750786137828,
"learning_rate": 1.7846757155061127e-07,
"loss": 0.4312,
"num_tokens": 836909543.0,
"step": 400
},
{
"epoch": 8.717391304347826,
"grad_norm": 0.1297667217706855,
"learning_rate": 1.765516186004387e-07,
"loss": 0.4298,
"num_tokens": 839002022.0,
"step": 401
},
{
"epoch": 8.73913043478261,
"grad_norm": 0.13324607592951465,
"learning_rate": 1.7465717006169887e-07,
"loss": 0.4298,
"num_tokens": 841093887.0,
"step": 402
},
{
"epoch": 8.76086956521739,
"grad_norm": 0.1319543530134324,
"learning_rate": 1.7278433502309808e-07,
"loss": 0.4302,
"num_tokens": 843185214.0,
"step": 403
},
{
"epoch": 8.782608695652174,
"grad_norm": 0.1292262569354397,
"learning_rate": 1.7093322132876485e-07,
"loss": 0.4289,
"num_tokens": 845275872.0,
"step": 404
},
{
"epoch": 8.804347826086957,
"grad_norm": 0.13330281500828273,
"learning_rate": 1.691039355720396e-07,
"loss": 0.4333,
"num_tokens": 847364898.0,
"step": 405
},
{
"epoch": 8.826086956521738,
"grad_norm": 0.13157685547158732,
"learning_rate": 1.6729658308933703e-07,
"loss": 0.432,
"num_tokens": 849456452.0,
"step": 406
},
{
"epoch": 8.847826086956522,
"grad_norm": 0.13145264305507653,
"learning_rate": 1.6551126795408015e-07,
"loss": 0.4265,
"num_tokens": 851549867.0,
"step": 407
},
{
"epoch": 8.869565217391305,
"grad_norm": 0.13486432492275377,
"learning_rate": 1.6374809297070763e-07,
"loss": 0.4329,
"num_tokens": 853641176.0,
"step": 408
},
{
"epoch": 8.891304347826086,
"grad_norm": 0.12974827151243915,
"learning_rate": 1.6200715966875392e-07,
"loss": 0.4213,
"num_tokens": 855735273.0,
"step": 409
},
{
"epoch": 8.91304347826087,
"grad_norm": 0.13185441487515778,
"learning_rate": 1.6028856829700258e-07,
"loss": 0.4271,
"num_tokens": 857828511.0,
"step": 410
},
{
"epoch": 8.934782608695652,
"grad_norm": 0.13485608760298035,
"learning_rate": 1.5859241781771399e-07,
"loss": 0.4308,
"num_tokens": 859920415.0,
"step": 411
},
{
"epoch": 8.956521739130435,
"grad_norm": 0.1339411805022494,
"learning_rate": 1.5691880590092667e-07,
"loss": 0.431,
"num_tokens": 862012285.0,
"step": 412
},
{
"epoch": 8.978260869565217,
"grad_norm": 0.1349062172890918,
"learning_rate": 1.552678289188326e-07,
"loss": 0.435,
"num_tokens": 864105715.0,
"step": 413
},
{
"epoch": 9.0,
"grad_norm": 0.13339483975994826,
"learning_rate": 1.5363958194022895e-07,
"loss": 0.4284,
"num_tokens": 866196230.0,
"step": 414
},
{
"epoch": 9.021739130434783,
"grad_norm": 0.13155206773063716,
"learning_rate": 1.5203415872504246e-07,
"loss": 0.4234,
"num_tokens": 868289042.0,
"step": 415
},
{
"epoch": 9.043478260869565,
"grad_norm": 0.1265848518754384,
"learning_rate": 1.5045165171893116e-07,
"loss": 0.4331,
"num_tokens": 870380556.0,
"step": 416
},
{
"epoch": 9.065217391304348,
"grad_norm": 0.12943671669921178,
"learning_rate": 1.488921520479608e-07,
"loss": 0.4332,
"num_tokens": 872469107.0,
"step": 417
},
{
"epoch": 9.08695652173913,
"grad_norm": 0.13125183209740615,
"learning_rate": 1.473557495133575e-07,
"loss": 0.4294,
"num_tokens": 874561955.0,
"step": 418
},
{
"epoch": 9.108695652173912,
"grad_norm": 0.13109229633803124,
"learning_rate": 1.4584253258633681e-07,
"loss": 0.4251,
"num_tokens": 876654532.0,
"step": 419
},
{
"epoch": 9.130434782608695,
"grad_norm": 0.13214384571361726,
"learning_rate": 1.4435258840300897e-07,
"loss": 0.4336,
"num_tokens": 878746295.0,
"step": 420
},
{
"epoch": 9.152173913043478,
"grad_norm": 0.1311401974553015,
"learning_rate": 1.4288600275936184e-07,
"loss": 0.4274,
"num_tokens": 880840455.0,
"step": 421
},
{
"epoch": 9.173913043478262,
"grad_norm": 0.13541450363149224,
"learning_rate": 1.4144286010631992e-07,
"loss": 0.4307,
"num_tokens": 882933892.0,
"step": 422
},
{
"epoch": 9.195652173913043,
"grad_norm": 0.1347209244136294,
"learning_rate": 1.4002324354488175e-07,
"loss": 0.4321,
"num_tokens": 885025725.0,
"step": 423
},
{
"epoch": 9.217391304347826,
"grad_norm": 0.1307731062962624,
"learning_rate": 1.3862723482133435e-07,
"loss": 0.434,
"num_tokens": 887119819.0,
"step": 424
},
{
"epoch": 9.23913043478261,
"grad_norm": 0.13830257430323137,
"learning_rate": 1.3725491432254623e-07,
"loss": 0.4322,
"num_tokens": 889212264.0,
"step": 425
},
{
"epoch": 9.26086956521739,
"grad_norm": 0.13014409193295903,
"learning_rate": 1.3590636107133845e-07,
"loss": 0.4287,
"num_tokens": 891305737.0,
"step": 426
},
{
"epoch": 9.282608695652174,
"grad_norm": 0.13261904746981767,
"learning_rate": 1.3458165272193445e-07,
"loss": 0.4255,
"num_tokens": 893399618.0,
"step": 427
},
{
"epoch": 9.304347826086957,
"grad_norm": 0.1312843101167757,
"learning_rate": 1.3328086555548762e-07,
"loss": 0.4297,
"num_tokens": 895491469.0,
"step": 428
},
{
"epoch": 9.326086956521738,
"grad_norm": 0.1364003552911898,
"learning_rate": 1.3200407447568984e-07,
"loss": 0.4341,
"num_tokens": 897584632.0,
"step": 429
},
{
"epoch": 9.347826086956522,
"grad_norm": 0.1339886261490337,
"learning_rate": 1.3075135300445745e-07,
"loss": 0.4281,
"num_tokens": 899675181.0,
"step": 430
},
{
"epoch": 9.369565217391305,
"grad_norm": 0.1337370078605457,
"learning_rate": 1.2952277327769804e-07,
"loss": 0.434,
"num_tokens": 901770066.0,
"step": 431
},
{
"epoch": 9.391304347826088,
"grad_norm": 0.13055173922308577,
"learning_rate": 1.2831840604115645e-07,
"loss": 0.4276,
"num_tokens": 903859911.0,
"step": 432
},
{
"epoch": 9.41304347826087,
"grad_norm": 0.13139166365646263,
"learning_rate": 1.2713832064634125e-07,
"loss": 0.428,
"num_tokens": 905953157.0,
"step": 433
},
{
"epoch": 9.434782608695652,
"grad_norm": 0.13103415317501538,
"learning_rate": 1.259825850465308e-07,
"loss": 0.4323,
"num_tokens": 908046526.0,
"step": 434
},
{
"epoch": 9.456521739130435,
"grad_norm": 0.1286719245844758,
"learning_rate": 1.2485126579286066e-07,
"loss": 0.4341,
"num_tokens": 910137413.0,
"step": 435
},
{
"epoch": 9.478260869565217,
"grad_norm": 0.12910865959756296,
"learning_rate": 1.2374442803049124e-07,
"loss": 0.4314,
"num_tokens": 912227811.0,
"step": 436
},
{
"epoch": 9.5,
"grad_norm": 0.13356681693096856,
"learning_rate": 1.2266213549485637e-07,
"loss": 0.4261,
"num_tokens": 914319892.0,
"step": 437
},
{
"epoch": 9.521739130434783,
"grad_norm": 0.13245196915807964,
"learning_rate": 1.2160445050799345e-07,
"loss": 0.4336,
"num_tokens": 916412324.0,
"step": 438
},
{
"epoch": 9.543478260869565,
"grad_norm": 0.13247117449980128,
"learning_rate": 1.205714339749545e-07,
"loss": 0.4288,
"num_tokens": 918503434.0,
"step": 439
},
{
"epoch": 9.565217391304348,
"grad_norm": 0.13140776239629576,
"learning_rate": 1.1956314538029936e-07,
"loss": 0.4286,
"num_tokens": 920596471.0,
"step": 440
},
{
"epoch": 9.58695652173913,
"grad_norm": 0.1317686114638022,
"learning_rate": 1.1857964278467e-07,
"loss": 0.4316,
"num_tokens": 922688205.0,
"step": 441
},
{
"epoch": 9.608695652173914,
"grad_norm": 0.1300939804698934,
"learning_rate": 1.1762098282144734e-07,
"loss": 0.4237,
"num_tokens": 924777343.0,
"step": 442
},
{
"epoch": 9.630434782608695,
"grad_norm": 0.13046645210086893,
"learning_rate": 1.166872206934904e-07,
"loss": 0.4308,
"num_tokens": 926870486.0,
"step": 443
},
{
"epoch": 9.652173913043478,
"grad_norm": 0.1339176102367468,
"learning_rate": 1.157784101699567e-07,
"loss": 0.4333,
"num_tokens": 928965703.0,
"step": 444
},
{
"epoch": 9.673913043478262,
"grad_norm": 0.13053722597742878,
"learning_rate": 1.1489460358320726e-07,
"loss": 0.4315,
"num_tokens": 931058607.0,
"step": 445
},
{
"epoch": 9.695652173913043,
"grad_norm": 0.1318797432070931,
"learning_rate": 1.1403585182579217e-07,
"loss": 0.4328,
"num_tokens": 933148486.0,
"step": 446
},
{
"epoch": 9.717391304347826,
"grad_norm": 0.12863606554725282,
"learning_rate": 1.1320220434752026e-07,
"loss": 0.433,
"num_tokens": 935240504.0,
"step": 447
},
{
"epoch": 9.73913043478261,
"grad_norm": 0.13076944288775819,
"learning_rate": 1.1239370915261193e-07,
"loss": 0.4284,
"num_tokens": 937328887.0,
"step": 448
},
{
"epoch": 9.76086956521739,
"grad_norm": 0.13176495576730735,
"learning_rate": 1.1161041279693445e-07,
"loss": 0.4282,
"num_tokens": 939420861.0,
"step": 449
},
{
"epoch": 9.782608695652174,
"grad_norm": 0.1326360118816391,
"learning_rate": 1.1085236038532148e-07,
"loss": 0.4301,
"num_tokens": 941513118.0,
"step": 450
},
{
"epoch": 9.804347826086957,
"grad_norm": 0.13082582487161826,
"learning_rate": 1.1011959556897558e-07,
"loss": 0.4278,
"num_tokens": 943605716.0,
"step": 451
},
{
"epoch": 9.826086956521738,
"grad_norm": 0.1314931923327051,
"learning_rate": 1.0941216054295468e-07,
"loss": 0.4323,
"num_tokens": 945698868.0,
"step": 452
},
{
"epoch": 9.847826086956522,
"grad_norm": 0.13055748678716064,
"learning_rate": 1.0873009604374245e-07,
"loss": 0.433,
"num_tokens": 947792760.0,
"step": 453
},
{
"epoch": 9.869565217391305,
"grad_norm": 0.1305072624250908,
"learning_rate": 1.0807344134690236e-07,
"loss": 0.4307,
"num_tokens": 949886033.0,
"step": 454
},
{
"epoch": 9.891304347826086,
"grad_norm": 0.12983181142765143,
"learning_rate": 1.074422342648161e-07,
"loss": 0.428,
"num_tokens": 951978463.0,
"step": 455
},
{
"epoch": 9.91304347826087,
"grad_norm": 0.12986749329326977,
"learning_rate": 1.068365111445064e-07,
"loss": 0.4259,
"num_tokens": 954071277.0,
"step": 456
},
{
"epoch": 9.934782608695652,
"grad_norm": 0.13307807739369767,
"learning_rate": 1.0625630686554389e-07,
"loss": 0.4333,
"num_tokens": 956162411.0,
"step": 457
},
{
"epoch": 9.956521739130435,
"grad_norm": 0.1298842544540006,
"learning_rate": 1.0570165483803867e-07,
"loss": 0.4381,
"num_tokens": 958256872.0,
"step": 458
},
{
"epoch": 9.978260869565217,
"grad_norm": 0.12872143065997999,
"learning_rate": 1.0517258700071639e-07,
"loss": 0.4322,
"num_tokens": 960347511.0,
"step": 459
},
{
"epoch": 10.0,
"grad_norm": 0.12998360754044697,
"learning_rate": 1.0466913381907913e-07,
"loss": 0.4357,
"num_tokens": 962441279.0,
"step": 460
},
{
"epoch": 10.0,
"step": 460,
"total_flos": 856414331338752.0,
"train_loss": 0.4584593860351521,
"train_runtime": 14198.1717,
"train_samples_per_second": 66.018,
"train_steps_per_second": 0.032
}
],
"logging_steps": 1,
"max_steps": 460,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 856414331338752.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}