code-full-simple / trainer_state.json
cterdam's picture
Upload 10 files
bc36373 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 10000,
"global_step": 100000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"grad_norm": 12.391390800476074,
"learning_rate": 3.920000000000001e-06,
"loss": 1.8027,
"step": 100
},
{
"epoch": 0.01,
"grad_norm": 7.110462665557861,
"learning_rate": 7.92e-06,
"loss": 1.6358,
"step": 200
},
{
"epoch": 0.01,
"grad_norm": 10.526795387268066,
"learning_rate": 1.1920000000000001e-05,
"loss": 1.603,
"step": 300
},
{
"epoch": 0.02,
"grad_norm": 9.175031661987305,
"learning_rate": 1.5920000000000003e-05,
"loss": 1.6249,
"step": 400
},
{
"epoch": 0.03,
"grad_norm": 4.193933486938477,
"learning_rate": 1.9920000000000002e-05,
"loss": 1.6364,
"step": 500
},
{
"epoch": 0.03,
"grad_norm": 8.9299955368042,
"learning_rate": 1.998030150753769e-05,
"loss": 1.6265,
"step": 600
},
{
"epoch": 0.04,
"grad_norm": 11.564770698547363,
"learning_rate": 1.996020100502513e-05,
"loss": 1.5935,
"step": 700
},
{
"epoch": 0.04,
"grad_norm": 9.529921531677246,
"learning_rate": 1.9940100502512564e-05,
"loss": 1.5959,
"step": 800
},
{
"epoch": 0.04,
"grad_norm": 5.335429668426514,
"learning_rate": 1.9920000000000002e-05,
"loss": 1.6342,
"step": 900
},
{
"epoch": 0.05,
"grad_norm": 8.102309226989746,
"learning_rate": 1.9899899497487437e-05,
"loss": 1.572,
"step": 1000
},
{
"epoch": 0.06,
"grad_norm": 5.742166042327881,
"learning_rate": 1.987979899497488e-05,
"loss": 1.5645,
"step": 1100
},
{
"epoch": 0.06,
"grad_norm": 5.3909735679626465,
"learning_rate": 1.9859698492462313e-05,
"loss": 1.547,
"step": 1200
},
{
"epoch": 0.07,
"grad_norm": 6.765148639678955,
"learning_rate": 1.983959798994975e-05,
"loss": 1.5399,
"step": 1300
},
{
"epoch": 0.07,
"grad_norm": 6.0268378257751465,
"learning_rate": 1.9819497487437185e-05,
"loss": 1.4985,
"step": 1400
},
{
"epoch": 0.07,
"grad_norm": 7.305541515350342,
"learning_rate": 1.9799396984924623e-05,
"loss": 1.5076,
"step": 1500
},
{
"epoch": 0.08,
"grad_norm": 8.56618595123291,
"learning_rate": 1.977929648241206e-05,
"loss": 1.52,
"step": 1600
},
{
"epoch": 0.09,
"grad_norm": 5.847652435302734,
"learning_rate": 1.97591959798995e-05,
"loss": 1.4976,
"step": 1700
},
{
"epoch": 0.09,
"grad_norm": 6.940663814544678,
"learning_rate": 1.9739095477386937e-05,
"loss": 1.4983,
"step": 1800
},
{
"epoch": 0.1,
"grad_norm": 5.06433629989624,
"learning_rate": 1.9718994974874372e-05,
"loss": 1.4951,
"step": 1900
},
{
"epoch": 0.1,
"grad_norm": 5.1144022941589355,
"learning_rate": 1.969889447236181e-05,
"loss": 1.5256,
"step": 2000
},
{
"epoch": 0.1,
"grad_norm": 6.515092849731445,
"learning_rate": 1.9678793969849248e-05,
"loss": 1.4677,
"step": 2100
},
{
"epoch": 0.11,
"grad_norm": 5.787613868713379,
"learning_rate": 1.9658693467336686e-05,
"loss": 1.4841,
"step": 2200
},
{
"epoch": 0.12,
"grad_norm": 7.798993110656738,
"learning_rate": 1.963859296482412e-05,
"loss": 1.4941,
"step": 2300
},
{
"epoch": 0.12,
"grad_norm": 4.808990955352783,
"learning_rate": 1.9618492462311562e-05,
"loss": 1.4775,
"step": 2400
},
{
"epoch": 0.12,
"grad_norm": 6.113214015960693,
"learning_rate": 1.9598391959798996e-05,
"loss": 1.4757,
"step": 2500
},
{
"epoch": 0.13,
"grad_norm": 6.038852214813232,
"learning_rate": 1.9578291457286434e-05,
"loss": 1.4413,
"step": 2600
},
{
"epoch": 0.14,
"grad_norm": 7.736110687255859,
"learning_rate": 1.955819095477387e-05,
"loss": 1.5001,
"step": 2700
},
{
"epoch": 0.14,
"grad_norm": 6.173422336578369,
"learning_rate": 1.953809045226131e-05,
"loss": 1.4183,
"step": 2800
},
{
"epoch": 0.14,
"grad_norm": 5.368058681488037,
"learning_rate": 1.9517989949748745e-05,
"loss": 1.4877,
"step": 2900
},
{
"epoch": 0.15,
"grad_norm": 5.35443639755249,
"learning_rate": 1.9497889447236183e-05,
"loss": 1.4079,
"step": 3000
},
{
"epoch": 0.15,
"grad_norm": 8.716644287109375,
"learning_rate": 1.9477788944723618e-05,
"loss": 1.4386,
"step": 3100
},
{
"epoch": 0.16,
"grad_norm": 5.639494895935059,
"learning_rate": 1.945768844221106e-05,
"loss": 1.4524,
"step": 3200
},
{
"epoch": 0.17,
"grad_norm": 3.3629064559936523,
"learning_rate": 1.9437587939698493e-05,
"loss": 1.4218,
"step": 3300
},
{
"epoch": 0.17,
"grad_norm": 4.7631402015686035,
"learning_rate": 1.941748743718593e-05,
"loss": 1.4357,
"step": 3400
},
{
"epoch": 0.17,
"grad_norm": 6.286344528198242,
"learning_rate": 1.939738693467337e-05,
"loss": 1.4025,
"step": 3500
},
{
"epoch": 0.18,
"grad_norm": 4.501611232757568,
"learning_rate": 1.9377286432160804e-05,
"loss": 1.4002,
"step": 3600
},
{
"epoch": 0.18,
"grad_norm": 6.302520275115967,
"learning_rate": 1.9357185929648242e-05,
"loss": 1.4128,
"step": 3700
},
{
"epoch": 0.19,
"grad_norm": 6.156075477600098,
"learning_rate": 1.933708542713568e-05,
"loss": 1.4136,
"step": 3800
},
{
"epoch": 0.2,
"grad_norm": 5.4391913414001465,
"learning_rate": 1.9316984924623118e-05,
"loss": 1.4307,
"step": 3900
},
{
"epoch": 0.2,
"grad_norm": 6.862305641174316,
"learning_rate": 1.9296884422110552e-05,
"loss": 1.3605,
"step": 4000
},
{
"epoch": 0.2,
"grad_norm": 5.392678737640381,
"learning_rate": 1.9276783919597994e-05,
"loss": 1.4059,
"step": 4100
},
{
"epoch": 0.21,
"grad_norm": 5.686226844787598,
"learning_rate": 1.925668341708543e-05,
"loss": 1.3474,
"step": 4200
},
{
"epoch": 0.21,
"grad_norm": 4.506126403808594,
"learning_rate": 1.9236582914572866e-05,
"loss": 1.3708,
"step": 4300
},
{
"epoch": 0.22,
"grad_norm": 7.255539894104004,
"learning_rate": 1.92164824120603e-05,
"loss": 1.3803,
"step": 4400
},
{
"epoch": 0.23,
"grad_norm": 6.463212966918945,
"learning_rate": 1.9196381909547742e-05,
"loss": 1.3371,
"step": 4500
},
{
"epoch": 0.23,
"grad_norm": 7.1397294998168945,
"learning_rate": 1.9176281407035177e-05,
"loss": 1.3787,
"step": 4600
},
{
"epoch": 0.23,
"grad_norm": 7.188973426818848,
"learning_rate": 1.9156180904522615e-05,
"loss": 1.3699,
"step": 4700
},
{
"epoch": 0.24,
"grad_norm": 4.161841869354248,
"learning_rate": 1.913608040201005e-05,
"loss": 1.3819,
"step": 4800
},
{
"epoch": 0.24,
"grad_norm": 3.420564889907837,
"learning_rate": 1.911597989949749e-05,
"loss": 1.3719,
"step": 4900
},
{
"epoch": 0.25,
"grad_norm": 5.769357681274414,
"learning_rate": 1.9095879396984925e-05,
"loss": 1.366,
"step": 5000
},
{
"epoch": 0.26,
"grad_norm": 6.374185562133789,
"learning_rate": 1.9075778894472363e-05,
"loss": 1.3377,
"step": 5100
},
{
"epoch": 0.26,
"grad_norm": 6.3521575927734375,
"learning_rate": 1.90556783919598e-05,
"loss": 1.3632,
"step": 5200
},
{
"epoch": 0.27,
"grad_norm": 4.51761531829834,
"learning_rate": 1.903557788944724e-05,
"loss": 1.3505,
"step": 5300
},
{
"epoch": 0.27,
"grad_norm": 6.074390411376953,
"learning_rate": 1.9015477386934674e-05,
"loss": 1.3644,
"step": 5400
},
{
"epoch": 0.28,
"grad_norm": 4.369632244110107,
"learning_rate": 1.8995376884422112e-05,
"loss": 1.3807,
"step": 5500
},
{
"epoch": 0.28,
"grad_norm": 7.657780170440674,
"learning_rate": 1.897527638190955e-05,
"loss": 1.3125,
"step": 5600
},
{
"epoch": 0.28,
"grad_norm": 9.048200607299805,
"learning_rate": 1.8955175879396988e-05,
"loss": 1.3216,
"step": 5700
},
{
"epoch": 0.29,
"grad_norm": 5.997036933898926,
"learning_rate": 1.8935075376884426e-05,
"loss": 1.3262,
"step": 5800
},
{
"epoch": 0.29,
"grad_norm": 4.751107692718506,
"learning_rate": 1.891497487437186e-05,
"loss": 1.3566,
"step": 5900
},
{
"epoch": 0.3,
"grad_norm": 5.662681579589844,
"learning_rate": 1.88948743718593e-05,
"loss": 1.3645,
"step": 6000
},
{
"epoch": 0.3,
"grad_norm": 5.755290508270264,
"learning_rate": 1.887497487437186e-05,
"loss": 1.2714,
"step": 6100
},
{
"epoch": 0.31,
"grad_norm": 5.199550151824951,
"learning_rate": 1.88548743718593e-05,
"loss": 1.3427,
"step": 6200
},
{
"epoch": 0.32,
"grad_norm": 7.531371116638184,
"learning_rate": 1.8834773869346733e-05,
"loss": 1.3198,
"step": 6300
},
{
"epoch": 0.32,
"grad_norm": 4.267923831939697,
"learning_rate": 1.881467336683417e-05,
"loss": 1.334,
"step": 6400
},
{
"epoch": 0.33,
"grad_norm": 5.429295063018799,
"learning_rate": 1.879457286432161e-05,
"loss": 1.2949,
"step": 6500
},
{
"epoch": 0.33,
"grad_norm": 4.842006206512451,
"learning_rate": 1.8774472361809047e-05,
"loss": 1.3123,
"step": 6600
},
{
"epoch": 0.34,
"grad_norm": 4.693381309509277,
"learning_rate": 1.8754371859296482e-05,
"loss": 1.3218,
"step": 6700
},
{
"epoch": 0.34,
"grad_norm": 3.555487632751465,
"learning_rate": 1.8734271356783923e-05,
"loss": 1.3077,
"step": 6800
},
{
"epoch": 0.34,
"grad_norm": 7.314678192138672,
"learning_rate": 1.8714170854271358e-05,
"loss": 1.2855,
"step": 6900
},
{
"epoch": 0.35,
"grad_norm": 6.160294532775879,
"learning_rate": 1.8694070351758796e-05,
"loss": 1.2901,
"step": 7000
},
{
"epoch": 0.35,
"grad_norm": 7.399959087371826,
"learning_rate": 1.867396984924623e-05,
"loss": 1.264,
"step": 7100
},
{
"epoch": 0.36,
"grad_norm": 4.204007625579834,
"learning_rate": 1.8653869346733672e-05,
"loss": 1.323,
"step": 7200
},
{
"epoch": 0.36,
"grad_norm": 5.531479358673096,
"learning_rate": 1.8633768844221106e-05,
"loss": 1.3211,
"step": 7300
},
{
"epoch": 0.37,
"grad_norm": 4.645538806915283,
"learning_rate": 1.8613668341708544e-05,
"loss": 1.2941,
"step": 7400
},
{
"epoch": 0.38,
"grad_norm": 6.326472282409668,
"learning_rate": 1.8593567839195982e-05,
"loss": 1.3025,
"step": 7500
},
{
"epoch": 0.38,
"grad_norm": 6.338307857513428,
"learning_rate": 1.857346733668342e-05,
"loss": 1.2924,
"step": 7600
},
{
"epoch": 0.39,
"grad_norm": 7.802080154418945,
"learning_rate": 1.8553366834170855e-05,
"loss": 1.3061,
"step": 7700
},
{
"epoch": 0.39,
"grad_norm": 4.98875093460083,
"learning_rate": 1.8533266331658293e-05,
"loss": 1.321,
"step": 7800
},
{
"epoch": 0.4,
"grad_norm": 5.888318061828613,
"learning_rate": 1.851316582914573e-05,
"loss": 1.2746,
"step": 7900
},
{
"epoch": 0.4,
"grad_norm": 6.636387825012207,
"learning_rate": 1.849306532663317e-05,
"loss": 1.2653,
"step": 8000
},
{
"epoch": 0.41,
"grad_norm": 6.1142449378967285,
"learning_rate": 1.8473165829145728e-05,
"loss": 1.2347,
"step": 8100
},
{
"epoch": 0.41,
"grad_norm": 5.41117525100708,
"learning_rate": 1.845306532663317e-05,
"loss": 1.3062,
"step": 8200
},
{
"epoch": 0.41,
"grad_norm": 5.025302886962891,
"learning_rate": 1.8432964824120604e-05,
"loss": 1.3162,
"step": 8300
},
{
"epoch": 0.42,
"grad_norm": 7.1088972091674805,
"learning_rate": 1.8412864321608042e-05,
"loss": 1.2573,
"step": 8400
},
{
"epoch": 0.42,
"grad_norm": 5.86447811126709,
"learning_rate": 1.839276381909548e-05,
"loss": 1.2855,
"step": 8500
},
{
"epoch": 0.43,
"grad_norm": 4.323820114135742,
"learning_rate": 1.8372663316582918e-05,
"loss": 1.2272,
"step": 8600
},
{
"epoch": 0.43,
"grad_norm": 7.335355758666992,
"learning_rate": 1.8352562814070352e-05,
"loss": 1.2718,
"step": 8700
},
{
"epoch": 0.44,
"grad_norm": 5.308874130249023,
"learning_rate": 1.833246231155779e-05,
"loss": 1.2727,
"step": 8800
},
{
"epoch": 0.45,
"grad_norm": 3.919790506362915,
"learning_rate": 1.8312361809045228e-05,
"loss": 1.28,
"step": 8900
},
{
"epoch": 0.45,
"grad_norm": 7.291688442230225,
"learning_rate": 1.8292261306532663e-05,
"loss": 1.2768,
"step": 9000
},
{
"epoch": 0.46,
"grad_norm": 5.098793029785156,
"learning_rate": 1.8272160804020104e-05,
"loss": 1.2441,
"step": 9100
},
{
"epoch": 0.46,
"grad_norm": 5.242636203765869,
"learning_rate": 1.825206030150754e-05,
"loss": 1.2534,
"step": 9200
},
{
"epoch": 0.47,
"grad_norm": 5.310051918029785,
"learning_rate": 1.8231959798994977e-05,
"loss": 1.2878,
"step": 9300
},
{
"epoch": 0.47,
"grad_norm": 6.058734893798828,
"learning_rate": 1.821185929648241e-05,
"loss": 1.2964,
"step": 9400
},
{
"epoch": 0.47,
"grad_norm": 6.912698745727539,
"learning_rate": 1.8191758793969853e-05,
"loss": 1.2511,
"step": 9500
},
{
"epoch": 0.48,
"grad_norm": 6.428102016448975,
"learning_rate": 1.8171658291457287e-05,
"loss": 1.2605,
"step": 9600
},
{
"epoch": 0.48,
"grad_norm": 5.642975807189941,
"learning_rate": 1.8151557788944725e-05,
"loss": 1.264,
"step": 9700
},
{
"epoch": 0.49,
"grad_norm": 6.23274040222168,
"learning_rate": 1.813145728643216e-05,
"loss": 1.2583,
"step": 9800
},
{
"epoch": 0.49,
"grad_norm": 7.3280792236328125,
"learning_rate": 1.81113567839196e-05,
"loss": 1.2324,
"step": 9900
},
{
"epoch": 0.5,
"grad_norm": 6.048460483551025,
"learning_rate": 1.8091256281407036e-05,
"loss": 1.2477,
"step": 10000
},
{
"epoch": 0.5,
"eval_loss": 1.2569069862365723,
"eval_runtime": 21.5797,
"eval_samples_per_second": 46.34,
"eval_steps_per_second": 5.792,
"step": 10000
},
{
"epoch": 0.51,
"grad_norm": 5.294989109039307,
"learning_rate": 1.80713567839196e-05,
"loss": 1.3038,
"step": 10100
},
{
"epoch": 0.51,
"grad_norm": 6.7187981605529785,
"learning_rate": 1.8051256281407036e-05,
"loss": 1.2584,
"step": 10200
},
{
"epoch": 0.52,
"grad_norm": 7.11021089553833,
"learning_rate": 1.8031155778894474e-05,
"loss": 1.2612,
"step": 10300
},
{
"epoch": 0.52,
"grad_norm": 6.111474990844727,
"learning_rate": 1.801105527638191e-05,
"loss": 1.2638,
"step": 10400
},
{
"epoch": 0.53,
"grad_norm": 6.04983377456665,
"learning_rate": 1.799095477386935e-05,
"loss": 1.2381,
"step": 10500
},
{
"epoch": 0.53,
"grad_norm": 5.682928562164307,
"learning_rate": 1.7970854271356785e-05,
"loss": 1.233,
"step": 10600
},
{
"epoch": 0.54,
"grad_norm": 6.028292179107666,
"learning_rate": 1.7950753768844223e-05,
"loss": 1.2572,
"step": 10700
},
{
"epoch": 0.54,
"grad_norm": 4.738650798797607,
"learning_rate": 1.793065326633166e-05,
"loss": 1.2125,
"step": 10800
},
{
"epoch": 0.55,
"grad_norm": 5.227931976318359,
"learning_rate": 1.7910753768844223e-05,
"loss": 1.2862,
"step": 10900
},
{
"epoch": 0.55,
"grad_norm": 6.476836204528809,
"learning_rate": 1.7890653266331658e-05,
"loss": 1.243,
"step": 11000
},
{
"epoch": 0.56,
"grad_norm": 4.261963844299316,
"learning_rate": 1.78705527638191e-05,
"loss": 1.2118,
"step": 11100
},
{
"epoch": 0.56,
"grad_norm": 6.414599418640137,
"learning_rate": 1.7850452261306534e-05,
"loss": 1.222,
"step": 11200
},
{
"epoch": 0.56,
"grad_norm": 5.642942905426025,
"learning_rate": 1.783035175879397e-05,
"loss": 1.1809,
"step": 11300
},
{
"epoch": 0.57,
"grad_norm": 4.094428539276123,
"learning_rate": 1.781025125628141e-05,
"loss": 1.2362,
"step": 11400
},
{
"epoch": 0.57,
"grad_norm": 5.5772881507873535,
"learning_rate": 1.7790150753768847e-05,
"loss": 1.2005,
"step": 11500
},
{
"epoch": 0.58,
"grad_norm": 4.420604705810547,
"learning_rate": 1.7770050251256282e-05,
"loss": 1.2138,
"step": 11600
},
{
"epoch": 0.58,
"grad_norm": 5.298806667327881,
"learning_rate": 1.774994974874372e-05,
"loss": 1.1693,
"step": 11700
},
{
"epoch": 0.59,
"grad_norm": 5.862612247467041,
"learning_rate": 1.7729849246231158e-05,
"loss": 1.1728,
"step": 11800
},
{
"epoch": 0.59,
"grad_norm": 3.835301637649536,
"learning_rate": 1.7709748743718593e-05,
"loss": 1.2159,
"step": 11900
},
{
"epoch": 0.6,
"grad_norm": 5.67401123046875,
"learning_rate": 1.768964824120603e-05,
"loss": 1.2393,
"step": 12000
},
{
"epoch": 0.6,
"grad_norm": 5.424498558044434,
"learning_rate": 1.766954773869347e-05,
"loss": 1.2255,
"step": 12100
},
{
"epoch": 0.61,
"grad_norm": 5.532503604888916,
"learning_rate": 1.7649447236180907e-05,
"loss": 1.2024,
"step": 12200
},
{
"epoch": 0.61,
"grad_norm": 5.404232501983643,
"learning_rate": 1.762934673366834e-05,
"loss": 1.2202,
"step": 12300
},
{
"epoch": 0.62,
"grad_norm": 3.9564428329467773,
"learning_rate": 1.7609246231155782e-05,
"loss": 1.1655,
"step": 12400
},
{
"epoch": 0.62,
"grad_norm": 3.2090141773223877,
"learning_rate": 1.7589145728643217e-05,
"loss": 1.1563,
"step": 12500
},
{
"epoch": 0.63,
"grad_norm": 6.341458320617676,
"learning_rate": 1.7569045226130655e-05,
"loss": 1.1982,
"step": 12600
},
{
"epoch": 0.64,
"grad_norm": 7.190246105194092,
"learning_rate": 1.754894472361809e-05,
"loss": 1.1817,
"step": 12700
},
{
"epoch": 0.64,
"grad_norm": 6.108299255371094,
"learning_rate": 1.752884422110553e-05,
"loss": 1.2123,
"step": 12800
},
{
"epoch": 0.65,
"grad_norm": 5.769379615783691,
"learning_rate": 1.7508743718592966e-05,
"loss": 1.1964,
"step": 12900
},
{
"epoch": 0.65,
"grad_norm": 5.177648067474365,
"learning_rate": 1.7488643216080404e-05,
"loss": 1.2103,
"step": 13000
},
{
"epoch": 0.66,
"grad_norm": 5.531684875488281,
"learning_rate": 1.7468542713567838e-05,
"loss": 1.1801,
"step": 13100
},
{
"epoch": 0.66,
"grad_norm": 5.700603008270264,
"learning_rate": 1.744844221105528e-05,
"loss": 1.1943,
"step": 13200
},
{
"epoch": 0.67,
"grad_norm": 9.25114917755127,
"learning_rate": 1.7428341708542714e-05,
"loss": 1.2286,
"step": 13300
},
{
"epoch": 0.67,
"grad_norm": 4.238541126251221,
"learning_rate": 1.7408241206030152e-05,
"loss": 1.1869,
"step": 13400
},
{
"epoch": 0.68,
"grad_norm": 5.6147260665893555,
"learning_rate": 1.738814070351759e-05,
"loss": 1.1854,
"step": 13500
},
{
"epoch": 0.68,
"grad_norm": 4.879734039306641,
"learning_rate": 1.7368040201005028e-05,
"loss": 1.1941,
"step": 13600
},
{
"epoch": 0.69,
"grad_norm": 3.612379312515259,
"learning_rate": 1.7347939698492463e-05,
"loss": 1.1649,
"step": 13700
},
{
"epoch": 0.69,
"grad_norm": 4.583663463592529,
"learning_rate": 1.73278391959799e-05,
"loss": 1.1796,
"step": 13800
},
{
"epoch": 0.69,
"grad_norm": 4.3080339431762695,
"learning_rate": 1.7307939698492463e-05,
"loss": 1.2092,
"step": 13900
},
{
"epoch": 0.7,
"grad_norm": 5.9151506423950195,
"learning_rate": 1.72878391959799e-05,
"loss": 1.1809,
"step": 14000
},
{
"epoch": 0.7,
"grad_norm": 5.167910575866699,
"learning_rate": 1.726773869346734e-05,
"loss": 1.2063,
"step": 14100
},
{
"epoch": 0.71,
"grad_norm": 7.372837543487549,
"learning_rate": 1.7247638190954777e-05,
"loss": 1.147,
"step": 14200
},
{
"epoch": 0.71,
"grad_norm": 3.6992413997650146,
"learning_rate": 1.722753768844221e-05,
"loss": 1.2312,
"step": 14300
},
{
"epoch": 0.72,
"grad_norm": 6.654348850250244,
"learning_rate": 1.720743718592965e-05,
"loss": 1.1956,
"step": 14400
},
{
"epoch": 0.72,
"grad_norm": 4.683749675750732,
"learning_rate": 1.7187336683417087e-05,
"loss": 1.1598,
"step": 14500
},
{
"epoch": 0.73,
"grad_norm": 5.769094467163086,
"learning_rate": 1.7167236180904522e-05,
"loss": 1.1387,
"step": 14600
},
{
"epoch": 0.73,
"grad_norm": 7.586219310760498,
"learning_rate": 1.7147135678391963e-05,
"loss": 1.1994,
"step": 14700
},
{
"epoch": 0.74,
"grad_norm": 5.573954105377197,
"learning_rate": 1.7127035175879398e-05,
"loss": 1.1887,
"step": 14800
},
{
"epoch": 0.74,
"grad_norm": 6.4866251945495605,
"learning_rate": 1.7106934673366836e-05,
"loss": 1.1892,
"step": 14900
},
{
"epoch": 0.75,
"grad_norm": 4.954825401306152,
"learning_rate": 1.708683417085427e-05,
"loss": 1.1742,
"step": 15000
},
{
"epoch": 0.76,
"grad_norm": 3.952847480773926,
"learning_rate": 1.7066733668341712e-05,
"loss": 1.143,
"step": 15100
},
{
"epoch": 0.76,
"grad_norm": 5.170006275177002,
"learning_rate": 1.7046633165829146e-05,
"loss": 1.1881,
"step": 15200
},
{
"epoch": 0.77,
"grad_norm": 4.910400390625,
"learning_rate": 1.7026532663316584e-05,
"loss": 1.131,
"step": 15300
},
{
"epoch": 0.77,
"grad_norm": 4.728166580200195,
"learning_rate": 1.700643216080402e-05,
"loss": 1.1854,
"step": 15400
},
{
"epoch": 0.78,
"grad_norm": 6.516223430633545,
"learning_rate": 1.698633165829146e-05,
"loss": 1.2069,
"step": 15500
},
{
"epoch": 0.78,
"grad_norm": 5.914300918579102,
"learning_rate": 1.6966231155778895e-05,
"loss": 1.1663,
"step": 15600
},
{
"epoch": 0.79,
"grad_norm": 4.6894378662109375,
"learning_rate": 1.6946130653266333e-05,
"loss": 1.145,
"step": 15700
},
{
"epoch": 0.79,
"grad_norm": 4.994019031524658,
"learning_rate": 1.692603015075377e-05,
"loss": 1.1156,
"step": 15800
},
{
"epoch": 0.8,
"grad_norm": 5.994630813598633,
"learning_rate": 1.690592964824121e-05,
"loss": 1.1583,
"step": 15900
},
{
"epoch": 0.8,
"grad_norm": 6.7444562911987305,
"learning_rate": 1.6885829145728643e-05,
"loss": 1.1821,
"step": 16000
},
{
"epoch": 0.81,
"grad_norm": 5.461032867431641,
"learning_rate": 1.686572864321608e-05,
"loss": 1.1388,
"step": 16100
},
{
"epoch": 0.81,
"grad_norm": 5.0775251388549805,
"learning_rate": 1.684562814070352e-05,
"loss": 1.1576,
"step": 16200
},
{
"epoch": 0.81,
"grad_norm": 4.469027042388916,
"learning_rate": 1.6825527638190957e-05,
"loss": 1.1792,
"step": 16300
},
{
"epoch": 0.82,
"grad_norm": 6.780773639678955,
"learning_rate": 1.6805427135678395e-05,
"loss": 1.1441,
"step": 16400
},
{
"epoch": 0.82,
"grad_norm": 6.338268756866455,
"learning_rate": 1.678532663316583e-05,
"loss": 1.1087,
"step": 16500
},
{
"epoch": 0.83,
"grad_norm": 4.28759241104126,
"learning_rate": 1.6765226130653268e-05,
"loss": 1.1616,
"step": 16600
},
{
"epoch": 0.83,
"grad_norm": 4.656599998474121,
"learning_rate": 1.6745125628140706e-05,
"loss": 1.1086,
"step": 16700
},
{
"epoch": 0.84,
"grad_norm": 4.506341457366943,
"learning_rate": 1.6725025125628144e-05,
"loss": 1.1821,
"step": 16800
},
{
"epoch": 0.84,
"grad_norm": 5.074087142944336,
"learning_rate": 1.670492462311558e-05,
"loss": 1.1376,
"step": 16900
},
{
"epoch": 0.85,
"grad_norm": 4.427557468414307,
"learning_rate": 1.6684824120603016e-05,
"loss": 1.1608,
"step": 17000
},
{
"epoch": 0.85,
"grad_norm": 4.684313774108887,
"learning_rate": 1.666472361809045e-05,
"loss": 1.1374,
"step": 17100
},
{
"epoch": 0.86,
"grad_norm": 4.981125354766846,
"learning_rate": 1.6644623115577892e-05,
"loss": 1.1157,
"step": 17200
},
{
"epoch": 0.86,
"grad_norm": 6.36452579498291,
"learning_rate": 1.6624522613065327e-05,
"loss": 1.1547,
"step": 17300
},
{
"epoch": 0.87,
"grad_norm": 4.986701011657715,
"learning_rate": 1.6604422110552765e-05,
"loss": 1.147,
"step": 17400
},
{
"epoch": 0.88,
"grad_norm": 6.206230640411377,
"learning_rate": 1.6584321608040203e-05,
"loss": 1.1235,
"step": 17500
},
{
"epoch": 0.88,
"grad_norm": 5.597214221954346,
"learning_rate": 1.656422110552764e-05,
"loss": 1.1472,
"step": 17600
},
{
"epoch": 0.89,
"grad_norm": 5.753964424133301,
"learning_rate": 1.6544120603015076e-05,
"loss": 1.0838,
"step": 17700
},
{
"epoch": 0.89,
"grad_norm": 5.263125896453857,
"learning_rate": 1.6524020100502513e-05,
"loss": 1.1149,
"step": 17800
},
{
"epoch": 0.9,
"grad_norm": 2.9451704025268555,
"learning_rate": 1.6504120603015076e-05,
"loss": 1.1162,
"step": 17900
},
{
"epoch": 0.9,
"grad_norm": 6.694633960723877,
"learning_rate": 1.6484020100502514e-05,
"loss": 1.1268,
"step": 18000
},
{
"epoch": 0.91,
"grad_norm": 5.449553489685059,
"learning_rate": 1.6463919597989952e-05,
"loss": 1.1307,
"step": 18100
},
{
"epoch": 0.91,
"grad_norm": 5.502272129058838,
"learning_rate": 1.644381909547739e-05,
"loss": 1.1031,
"step": 18200
},
{
"epoch": 0.92,
"grad_norm": 6.899608612060547,
"learning_rate": 1.6423718592964824e-05,
"loss": 1.1389,
"step": 18300
},
{
"epoch": 0.92,
"grad_norm": 4.675032615661621,
"learning_rate": 1.6403618090452262e-05,
"loss": 1.1541,
"step": 18400
},
{
"epoch": 0.93,
"grad_norm": 7.353012561798096,
"learning_rate": 1.63835175879397e-05,
"loss": 1.1213,
"step": 18500
},
{
"epoch": 0.93,
"grad_norm": 4.253681659698486,
"learning_rate": 1.636341708542714e-05,
"loss": 1.1672,
"step": 18600
},
{
"epoch": 0.94,
"grad_norm": 6.5902018547058105,
"learning_rate": 1.6343316582914573e-05,
"loss": 1.1349,
"step": 18700
},
{
"epoch": 0.94,
"grad_norm": 5.40578556060791,
"learning_rate": 1.632321608040201e-05,
"loss": 1.1283,
"step": 18800
},
{
"epoch": 0.94,
"grad_norm": 3.9744160175323486,
"learning_rate": 1.630311557788945e-05,
"loss": 1.1463,
"step": 18900
},
{
"epoch": 0.95,
"grad_norm": 6.115358352661133,
"learning_rate": 1.6283015075376887e-05,
"loss": 1.1443,
"step": 19000
},
{
"epoch": 0.95,
"grad_norm": 2.9785940647125244,
"learning_rate": 1.6262914572864325e-05,
"loss": 1.1409,
"step": 19100
},
{
"epoch": 0.96,
"grad_norm": 5.200758934020996,
"learning_rate": 1.6243015075376887e-05,
"loss": 1.1629,
"step": 19200
},
{
"epoch": 0.96,
"grad_norm": 5.975739479064941,
"learning_rate": 1.6222914572864322e-05,
"loss": 1.083,
"step": 19300
},
{
"epoch": 0.97,
"grad_norm": 6.220870018005371,
"learning_rate": 1.620281407035176e-05,
"loss": 1.1305,
"step": 19400
},
{
"epoch": 0.97,
"grad_norm": 4.187997341156006,
"learning_rate": 1.6182713567839198e-05,
"loss": 1.1028,
"step": 19500
},
{
"epoch": 0.98,
"grad_norm": 5.540648937225342,
"learning_rate": 1.6162613065326636e-05,
"loss": 1.1176,
"step": 19600
},
{
"epoch": 0.98,
"grad_norm": 5.99765157699585,
"learning_rate": 1.6142512562814074e-05,
"loss": 1.0932,
"step": 19700
},
{
"epoch": 0.99,
"grad_norm": 4.647700786590576,
"learning_rate": 1.6122412060301508e-05,
"loss": 1.1294,
"step": 19800
},
{
"epoch": 0.99,
"grad_norm": 6.05048131942749,
"learning_rate": 1.6102311557788946e-05,
"loss": 1.0828,
"step": 19900
},
{
"epoch": 1.0,
"grad_norm": 4.912966251373291,
"learning_rate": 1.608221105527638e-05,
"loss": 1.0975,
"step": 20000
},
{
"epoch": 1.0,
"eval_loss": 1.132000207901001,
"eval_runtime": 21.5853,
"eval_samples_per_second": 46.328,
"eval_steps_per_second": 5.791,
"step": 20000
},
{
"epoch": 1.0,
"grad_norm": 5.5869340896606445,
"learning_rate": 1.6062110552763822e-05,
"loss": 1.1428,
"step": 20100
},
{
"epoch": 1.01,
"grad_norm": 4.5555739402771,
"learning_rate": 1.6042010050251257e-05,
"loss": 1.0939,
"step": 20200
},
{
"epoch": 1.01,
"grad_norm": 3.527172803878784,
"learning_rate": 1.6021909547738695e-05,
"loss": 1.1184,
"step": 20300
},
{
"epoch": 1.02,
"grad_norm": 2.7429285049438477,
"learning_rate": 1.600180904522613e-05,
"loss": 1.1028,
"step": 20400
},
{
"epoch": 1.02,
"grad_norm": 3.6536190509796143,
"learning_rate": 1.598170854271357e-05,
"loss": 1.0954,
"step": 20500
},
{
"epoch": 1.03,
"grad_norm": 4.48521089553833,
"learning_rate": 1.5961608040201005e-05,
"loss": 1.1001,
"step": 20600
},
{
"epoch": 1.03,
"grad_norm": 7.937503814697266,
"learning_rate": 1.5941507537688443e-05,
"loss": 1.0676,
"step": 20700
},
{
"epoch": 1.04,
"grad_norm": 7.802252769470215,
"learning_rate": 1.592140703517588e-05,
"loss": 1.1007,
"step": 20800
},
{
"epoch": 1.04,
"grad_norm": 6.141603469848633,
"learning_rate": 1.590130653266332e-05,
"loss": 1.0749,
"step": 20900
},
{
"epoch": 1.05,
"grad_norm": 5.166286945343018,
"learning_rate": 1.5881206030150754e-05,
"loss": 1.0704,
"step": 21000
},
{
"epoch": 1.05,
"grad_norm": 5.407045364379883,
"learning_rate": 1.5861105527638192e-05,
"loss": 1.0852,
"step": 21100
},
{
"epoch": 1.06,
"grad_norm": 5.4536967277526855,
"learning_rate": 1.584100502512563e-05,
"loss": 1.1152,
"step": 21200
},
{
"epoch": 1.06,
"grad_norm": 5.464064121246338,
"learning_rate": 1.5820904522613068e-05,
"loss": 1.0546,
"step": 21300
},
{
"epoch": 1.07,
"grad_norm": 3.853875160217285,
"learning_rate": 1.580100502512563e-05,
"loss": 1.0858,
"step": 21400
},
{
"epoch": 1.07,
"grad_norm": 4.8497443199157715,
"learning_rate": 1.5780904522613068e-05,
"loss": 1.0973,
"step": 21500
},
{
"epoch": 1.08,
"grad_norm": 4.255434513092041,
"learning_rate": 1.5760804020100503e-05,
"loss": 1.0872,
"step": 21600
},
{
"epoch": 1.08,
"grad_norm": 4.134657382965088,
"learning_rate": 1.574070351758794e-05,
"loss": 1.1127,
"step": 21700
},
{
"epoch": 1.09,
"grad_norm": 9.425840377807617,
"learning_rate": 1.572060301507538e-05,
"loss": 1.1147,
"step": 21800
},
{
"epoch": 1.09,
"grad_norm": 5.42075777053833,
"learning_rate": 1.5700502512562817e-05,
"loss": 1.0719,
"step": 21900
},
{
"epoch": 1.1,
"grad_norm": 5.076992988586426,
"learning_rate": 1.5680402010050255e-05,
"loss": 1.095,
"step": 22000
},
{
"epoch": 1.1,
"grad_norm": 5.668195724487305,
"learning_rate": 1.566030150753769e-05,
"loss": 1.0799,
"step": 22100
},
{
"epoch": 1.11,
"grad_norm": 5.9342474937438965,
"learning_rate": 1.5640201005025127e-05,
"loss": 1.0965,
"step": 22200
},
{
"epoch": 1.11,
"grad_norm": 5.112601280212402,
"learning_rate": 1.5620100502512565e-05,
"loss": 1.0951,
"step": 22300
},
{
"epoch": 1.12,
"grad_norm": 10.206339836120605,
"learning_rate": 1.5600000000000003e-05,
"loss": 1.0837,
"step": 22400
},
{
"epoch": 1.12,
"grad_norm": 3.8015480041503906,
"learning_rate": 1.5579899497487438e-05,
"loss": 1.0871,
"step": 22500
},
{
"epoch": 1.13,
"grad_norm": 4.524369239807129,
"learning_rate": 1.5559798994974876e-05,
"loss": 1.1263,
"step": 22600
},
{
"epoch": 1.14,
"grad_norm": 5.1671671867370605,
"learning_rate": 1.553969849246231e-05,
"loss": 1.085,
"step": 22700
},
{
"epoch": 1.14,
"grad_norm": 4.96006965637207,
"learning_rate": 1.551959798994975e-05,
"loss": 1.0893,
"step": 22800
},
{
"epoch": 1.15,
"grad_norm": 6.482675075531006,
"learning_rate": 1.5499497487437186e-05,
"loss": 1.0667,
"step": 22900
},
{
"epoch": 1.15,
"grad_norm": 4.591585636138916,
"learning_rate": 1.5479396984924624e-05,
"loss": 1.0861,
"step": 23000
},
{
"epoch": 1.16,
"grad_norm": 4.026520729064941,
"learning_rate": 1.5459296482412062e-05,
"loss": 1.0772,
"step": 23100
},
{
"epoch": 1.16,
"grad_norm": 5.972117900848389,
"learning_rate": 1.54391959798995e-05,
"loss": 1.0818,
"step": 23200
},
{
"epoch": 1.17,
"grad_norm": 4.737887382507324,
"learning_rate": 1.5419095477386935e-05,
"loss": 1.0752,
"step": 23300
},
{
"epoch": 1.17,
"grad_norm": 4.748262882232666,
"learning_rate": 1.5398994974874373e-05,
"loss": 1.0803,
"step": 23400
},
{
"epoch": 1.18,
"grad_norm": 4.94175386428833,
"learning_rate": 1.537889447236181e-05,
"loss": 1.0754,
"step": 23500
},
{
"epoch": 1.18,
"grad_norm": 4.3259172439575195,
"learning_rate": 1.535879396984925e-05,
"loss": 1.0463,
"step": 23600
},
{
"epoch": 1.19,
"grad_norm": 5.240546703338623,
"learning_rate": 1.5338693467336687e-05,
"loss": 1.0547,
"step": 23700
},
{
"epoch": 1.19,
"grad_norm": 6.120886325836182,
"learning_rate": 1.531859296482412e-05,
"loss": 1.0861,
"step": 23800
},
{
"epoch": 1.2,
"grad_norm": 5.634921550750732,
"learning_rate": 1.529849246231156e-05,
"loss": 1.0722,
"step": 23900
},
{
"epoch": 1.2,
"grad_norm": 5.39201021194458,
"learning_rate": 1.5278391959798997e-05,
"loss": 1.07,
"step": 24000
},
{
"epoch": 1.21,
"grad_norm": 6.85221004486084,
"learning_rate": 1.5258291457286433e-05,
"loss": 1.0578,
"step": 24100
},
{
"epoch": 1.21,
"grad_norm": 4.522882461547852,
"learning_rate": 1.523819095477387e-05,
"loss": 1.0895,
"step": 24200
},
{
"epoch": 1.22,
"grad_norm": 4.020057201385498,
"learning_rate": 1.5218090452261308e-05,
"loss": 1.0377,
"step": 24300
},
{
"epoch": 1.22,
"grad_norm": 4.188474655151367,
"learning_rate": 1.5197989949748746e-05,
"loss": 1.0469,
"step": 24400
},
{
"epoch": 1.23,
"grad_norm": 6.872804164886475,
"learning_rate": 1.5177889447236182e-05,
"loss": 1.0795,
"step": 24500
},
{
"epoch": 1.23,
"grad_norm": 5.834617614746094,
"learning_rate": 1.515778894472362e-05,
"loss": 1.0827,
"step": 24600
},
{
"epoch": 1.23,
"grad_norm": 4.008932590484619,
"learning_rate": 1.5137688442211056e-05,
"loss": 1.069,
"step": 24700
},
{
"epoch": 1.24,
"grad_norm": 5.309475898742676,
"learning_rate": 1.5117587939698494e-05,
"loss": 1.0668,
"step": 24800
},
{
"epoch": 1.25,
"grad_norm": 6.02021598815918,
"learning_rate": 1.5097487437185932e-05,
"loss": 1.0611,
"step": 24900
},
{
"epoch": 1.25,
"grad_norm": 4.143280029296875,
"learning_rate": 1.5077587939698495e-05,
"loss": 1.0526,
"step": 25000
},
{
"epoch": 1.25,
"grad_norm": 4.231622695922852,
"learning_rate": 1.505748743718593e-05,
"loss": 1.0706,
"step": 25100
},
{
"epoch": 1.26,
"grad_norm": 4.0399322509765625,
"learning_rate": 1.5037386934673369e-05,
"loss": 1.0878,
"step": 25200
},
{
"epoch": 1.27,
"grad_norm": 4.2283759117126465,
"learning_rate": 1.5017286432160805e-05,
"loss": 1.0903,
"step": 25300
},
{
"epoch": 1.27,
"grad_norm": 6.159567356109619,
"learning_rate": 1.4997185929648241e-05,
"loss": 1.069,
"step": 25400
},
{
"epoch": 1.27,
"grad_norm": 5.181605815887451,
"learning_rate": 1.4977085427135681e-05,
"loss": 1.0712,
"step": 25500
},
{
"epoch": 1.28,
"grad_norm": 4.90966796875,
"learning_rate": 1.4956984924623117e-05,
"loss": 1.0672,
"step": 25600
},
{
"epoch": 1.28,
"grad_norm": 4.765697479248047,
"learning_rate": 1.4936884422110554e-05,
"loss": 1.0338,
"step": 25700
},
{
"epoch": 1.29,
"grad_norm": 4.3462018966674805,
"learning_rate": 1.491678391959799e-05,
"loss": 1.0408,
"step": 25800
},
{
"epoch": 1.29,
"grad_norm": 5.249480247497559,
"learning_rate": 1.489668341708543e-05,
"loss": 1.0576,
"step": 25900
},
{
"epoch": 1.3,
"grad_norm": 5.543900489807129,
"learning_rate": 1.4876582914572866e-05,
"loss": 1.0651,
"step": 26000
},
{
"epoch": 1.3,
"grad_norm": 6.526113033294678,
"learning_rate": 1.4856482412060302e-05,
"loss": 1.0596,
"step": 26100
},
{
"epoch": 1.31,
"grad_norm": 4.725895404815674,
"learning_rate": 1.4836381909547738e-05,
"loss": 1.0969,
"step": 26200
},
{
"epoch": 1.31,
"grad_norm": 6.068490028381348,
"learning_rate": 1.4816281407035178e-05,
"loss": 1.0284,
"step": 26300
},
{
"epoch": 1.32,
"grad_norm": 4.363389015197754,
"learning_rate": 1.4796180904522614e-05,
"loss": 1.0589,
"step": 26400
},
{
"epoch": 1.32,
"grad_norm": 6.8659257888793945,
"learning_rate": 1.477608040201005e-05,
"loss": 1.0803,
"step": 26500
},
{
"epoch": 1.33,
"grad_norm": 5.061355113983154,
"learning_rate": 1.4755979899497489e-05,
"loss": 1.066,
"step": 26600
},
{
"epoch": 1.33,
"grad_norm": 4.511940956115723,
"learning_rate": 1.4735879396984927e-05,
"loss": 1.0447,
"step": 26700
},
{
"epoch": 1.34,
"grad_norm": 4.449003219604492,
"learning_rate": 1.4715778894472363e-05,
"loss": 1.0532,
"step": 26800
},
{
"epoch": 1.34,
"grad_norm": 5.1782307624816895,
"learning_rate": 1.46956783919598e-05,
"loss": 1.0608,
"step": 26900
},
{
"epoch": 1.35,
"grad_norm": 5.087260723114014,
"learning_rate": 1.4675577889447237e-05,
"loss": 1.0371,
"step": 27000
},
{
"epoch": 1.35,
"grad_norm": 4.387496471405029,
"learning_rate": 1.4655477386934675e-05,
"loss": 1.055,
"step": 27100
},
{
"epoch": 1.36,
"grad_norm": 4.9253010749816895,
"learning_rate": 1.4635376884422113e-05,
"loss": 1.0385,
"step": 27200
},
{
"epoch": 1.36,
"grad_norm": 4.611992835998535,
"learning_rate": 1.461527638190955e-05,
"loss": 1.0338,
"step": 27300
},
{
"epoch": 1.37,
"grad_norm": 2.981304168701172,
"learning_rate": 1.4595175879396986e-05,
"loss": 1.0516,
"step": 27400
},
{
"epoch": 1.38,
"grad_norm": 5.678966045379639,
"learning_rate": 1.4575075376884422e-05,
"loss": 1.0788,
"step": 27500
},
{
"epoch": 1.38,
"grad_norm": 5.3079752922058105,
"learning_rate": 1.4554974874371862e-05,
"loss": 1.0853,
"step": 27600
},
{
"epoch": 1.39,
"grad_norm": 5.990561485290527,
"learning_rate": 1.4534874371859298e-05,
"loss": 1.0187,
"step": 27700
},
{
"epoch": 1.39,
"grad_norm": 7.396142482757568,
"learning_rate": 1.4514773869346734e-05,
"loss": 1.0694,
"step": 27800
},
{
"epoch": 1.4,
"grad_norm": 4.319200038909912,
"learning_rate": 1.449467336683417e-05,
"loss": 1.0668,
"step": 27900
},
{
"epoch": 1.4,
"grad_norm": 2.7691450119018555,
"learning_rate": 1.447457286432161e-05,
"loss": 1.0652,
"step": 28000
},
{
"epoch": 1.41,
"grad_norm": 8.814241409301758,
"learning_rate": 1.4454472361809046e-05,
"loss": 1.0423,
"step": 28100
},
{
"epoch": 1.41,
"grad_norm": 5.264801979064941,
"learning_rate": 1.4434371859296483e-05,
"loss": 1.0918,
"step": 28200
},
{
"epoch": 1.42,
"grad_norm": 4.573727130889893,
"learning_rate": 1.441427135678392e-05,
"loss": 1.0822,
"step": 28300
},
{
"epoch": 1.42,
"grad_norm": 3.6568844318389893,
"learning_rate": 1.4394170854271359e-05,
"loss": 1.0492,
"step": 28400
},
{
"epoch": 1.43,
"grad_norm": 4.999285697937012,
"learning_rate": 1.437427135678392e-05,
"loss": 1.0583,
"step": 28500
},
{
"epoch": 1.43,
"grad_norm": 4.125443458557129,
"learning_rate": 1.4354170854271359e-05,
"loss": 1.0422,
"step": 28600
},
{
"epoch": 1.44,
"grad_norm": 6.014279365539551,
"learning_rate": 1.4334070351758795e-05,
"loss": 1.0347,
"step": 28700
},
{
"epoch": 1.44,
"grad_norm": 8.18229866027832,
"learning_rate": 1.4313969849246232e-05,
"loss": 1.0133,
"step": 28800
},
{
"epoch": 1.45,
"grad_norm": 3.3756470680236816,
"learning_rate": 1.4294070351758796e-05,
"loss": 1.0684,
"step": 28900
},
{
"epoch": 1.45,
"grad_norm": 5.568530559539795,
"learning_rate": 1.4273969849246232e-05,
"loss": 1.0666,
"step": 29000
},
{
"epoch": 1.46,
"grad_norm": 4.440110683441162,
"learning_rate": 1.4253869346733668e-05,
"loss": 1.057,
"step": 29100
},
{
"epoch": 1.46,
"grad_norm": 6.835775852203369,
"learning_rate": 1.4233768844221108e-05,
"loss": 1.0176,
"step": 29200
},
{
"epoch": 1.47,
"grad_norm": 5.715722560882568,
"learning_rate": 1.4213668341708544e-05,
"loss": 1.0996,
"step": 29300
},
{
"epoch": 1.47,
"grad_norm": 6.401480674743652,
"learning_rate": 1.419356783919598e-05,
"loss": 1.0459,
"step": 29400
},
{
"epoch": 1.48,
"grad_norm": 7.125598430633545,
"learning_rate": 1.4173467336683417e-05,
"loss": 1.0067,
"step": 29500
},
{
"epoch": 1.48,
"grad_norm": 5.287647724151611,
"learning_rate": 1.4153366834170856e-05,
"loss": 1.0475,
"step": 29600
},
{
"epoch": 1.48,
"grad_norm": 5.175357818603516,
"learning_rate": 1.4133266331658293e-05,
"loss": 1.0361,
"step": 29700
},
{
"epoch": 1.49,
"grad_norm": 4.676697731018066,
"learning_rate": 1.4113165829145729e-05,
"loss": 0.9925,
"step": 29800
},
{
"epoch": 1.5,
"grad_norm": 4.375120162963867,
"learning_rate": 1.4093065326633167e-05,
"loss": 1.0145,
"step": 29900
},
{
"epoch": 1.5,
"grad_norm": 4.380770683288574,
"learning_rate": 1.4072964824120605e-05,
"loss": 1.0763,
"step": 30000
},
{
"epoch": 1.5,
"eval_loss": 1.0519436597824097,
"eval_runtime": 21.613,
"eval_samples_per_second": 46.269,
"eval_steps_per_second": 5.784,
"step": 30000
},
{
"epoch": 1.5,
"grad_norm": 5.796531677246094,
"learning_rate": 1.4052864321608041e-05,
"loss": 1.0563,
"step": 30100
},
{
"epoch": 1.51,
"grad_norm": 2.713714361190796,
"learning_rate": 1.4032763819095479e-05,
"loss": 1.0549,
"step": 30200
},
{
"epoch": 1.52,
"grad_norm": 6.333755016326904,
"learning_rate": 1.4012663316582915e-05,
"loss": 1.042,
"step": 30300
},
{
"epoch": 1.52,
"grad_norm": 3.8109474182128906,
"learning_rate": 1.3992562814070353e-05,
"loss": 1.0773,
"step": 30400
},
{
"epoch": 1.52,
"grad_norm": 6.425621509552002,
"learning_rate": 1.3972462311557791e-05,
"loss": 1.0066,
"step": 30500
},
{
"epoch": 1.53,
"grad_norm": 4.9127607345581055,
"learning_rate": 1.3952361809045228e-05,
"loss": 1.0022,
"step": 30600
},
{
"epoch": 1.54,
"grad_norm": 4.212081432342529,
"learning_rate": 1.3932261306532664e-05,
"loss": 1.0358,
"step": 30700
},
{
"epoch": 1.54,
"grad_norm": 7.6413187980651855,
"learning_rate": 1.39121608040201e-05,
"loss": 1.0413,
"step": 30800
},
{
"epoch": 1.54,
"grad_norm": 4.2576494216918945,
"learning_rate": 1.389206030150754e-05,
"loss": 1.0332,
"step": 30900
},
{
"epoch": 1.55,
"grad_norm": 4.797669887542725,
"learning_rate": 1.3871959798994976e-05,
"loss": 1.0396,
"step": 31000
},
{
"epoch": 1.56,
"grad_norm": 5.891973972320557,
"learning_rate": 1.3851859296482412e-05,
"loss": 1.0281,
"step": 31100
},
{
"epoch": 1.56,
"grad_norm": 5.9344964027404785,
"learning_rate": 1.3831758793969849e-05,
"loss": 1.024,
"step": 31200
},
{
"epoch": 1.56,
"grad_norm": 4.902309417724609,
"learning_rate": 1.3811658291457288e-05,
"loss": 1.027,
"step": 31300
},
{
"epoch": 1.57,
"grad_norm": 6.387609958648682,
"learning_rate": 1.3791557788944725e-05,
"loss": 1.0207,
"step": 31400
},
{
"epoch": 1.57,
"grad_norm": 5.870815277099609,
"learning_rate": 1.3771457286432161e-05,
"loss": 1.0128,
"step": 31500
},
{
"epoch": 1.58,
"grad_norm": 6.101361274719238,
"learning_rate": 1.3751356783919599e-05,
"loss": 1.0412,
"step": 31600
},
{
"epoch": 1.58,
"grad_norm": 5.250607967376709,
"learning_rate": 1.3731256281407037e-05,
"loss": 1.0146,
"step": 31700
},
{
"epoch": 1.59,
"grad_norm": 5.449378967285156,
"learning_rate": 1.3711155778894473e-05,
"loss": 1.03,
"step": 31800
},
{
"epoch": 1.59,
"grad_norm": 4.564045429229736,
"learning_rate": 1.3691055276381911e-05,
"loss": 1.0567,
"step": 31900
},
{
"epoch": 1.6,
"grad_norm": 5.84417200088501,
"learning_rate": 1.3670954773869347e-05,
"loss": 1.0782,
"step": 32000
},
{
"epoch": 1.6,
"grad_norm": 4.725462436676025,
"learning_rate": 1.3650854271356785e-05,
"loss": 1.0327,
"step": 32100
},
{
"epoch": 1.61,
"grad_norm": 6.999115943908691,
"learning_rate": 1.3630753768844223e-05,
"loss": 1.0405,
"step": 32200
},
{
"epoch": 1.61,
"grad_norm": 4.241363525390625,
"learning_rate": 1.361065326633166e-05,
"loss": 1.02,
"step": 32300
},
{
"epoch": 1.62,
"grad_norm": 5.884255886077881,
"learning_rate": 1.3590552763819096e-05,
"loss": 1.0634,
"step": 32400
},
{
"epoch": 1.62,
"grad_norm": 3.674698829650879,
"learning_rate": 1.3570452261306536e-05,
"loss": 1.0389,
"step": 32500
},
{
"epoch": 1.63,
"grad_norm": 4.227616310119629,
"learning_rate": 1.3550351758793972e-05,
"loss": 0.9992,
"step": 32600
},
{
"epoch": 1.64,
"grad_norm": 4.682816982269287,
"learning_rate": 1.3530251256281408e-05,
"loss": 1.0111,
"step": 32700
},
{
"epoch": 1.64,
"grad_norm": 4.632464408874512,
"learning_rate": 1.3510150753768844e-05,
"loss": 1.0223,
"step": 32800
},
{
"epoch": 1.65,
"grad_norm": 6.061766147613525,
"learning_rate": 1.349005025125628e-05,
"loss": 0.9837,
"step": 32900
},
{
"epoch": 1.65,
"grad_norm": 5.4998908042907715,
"learning_rate": 1.346994974874372e-05,
"loss": 1.041,
"step": 33000
},
{
"epoch": 1.66,
"grad_norm": 6.294175624847412,
"learning_rate": 1.3449849246231157e-05,
"loss": 1.0311,
"step": 33100
},
{
"epoch": 1.66,
"grad_norm": 5.177206039428711,
"learning_rate": 1.3429748743718593e-05,
"loss": 1.0435,
"step": 33200
},
{
"epoch": 1.67,
"grad_norm": 4.389501571655273,
"learning_rate": 1.3409648241206031e-05,
"loss": 1.0104,
"step": 33300
},
{
"epoch": 1.67,
"grad_norm": 5.105901718139648,
"learning_rate": 1.3389547738693469e-05,
"loss": 0.9868,
"step": 33400
},
{
"epoch": 1.68,
"grad_norm": 3.407482147216797,
"learning_rate": 1.3369447236180905e-05,
"loss": 1.0559,
"step": 33500
},
{
"epoch": 1.68,
"grad_norm": 6.496652126312256,
"learning_rate": 1.3349346733668343e-05,
"loss": 0.9849,
"step": 33600
},
{
"epoch": 1.69,
"grad_norm": 6.241397857666016,
"learning_rate": 1.332924623115578e-05,
"loss": 0.9995,
"step": 33700
},
{
"epoch": 1.69,
"grad_norm": 5.998499870300293,
"learning_rate": 1.3309145728643217e-05,
"loss": 1.0355,
"step": 33800
},
{
"epoch": 1.69,
"grad_norm": 5.380569934844971,
"learning_rate": 1.3289045226130655e-05,
"loss": 1.0082,
"step": 33900
},
{
"epoch": 1.7,
"grad_norm": 5.168824195861816,
"learning_rate": 1.3268944723618092e-05,
"loss": 1.052,
"step": 34000
},
{
"epoch": 1.71,
"grad_norm": 5.691008567810059,
"learning_rate": 1.3248844221105528e-05,
"loss": 1.0424,
"step": 34100
},
{
"epoch": 1.71,
"grad_norm": 5.678094387054443,
"learning_rate": 1.3228743718592968e-05,
"loss": 1.0083,
"step": 34200
},
{
"epoch": 1.71,
"grad_norm": 6.432235240936279,
"learning_rate": 1.3208643216080404e-05,
"loss": 0.9766,
"step": 34300
},
{
"epoch": 1.72,
"grad_norm": 6.016462326049805,
"learning_rate": 1.318854271356784e-05,
"loss": 1.0059,
"step": 34400
},
{
"epoch": 1.73,
"grad_norm": 4.596778392791748,
"learning_rate": 1.3168442211055276e-05,
"loss": 0.9462,
"step": 34500
},
{
"epoch": 1.73,
"grad_norm": 7.2965850830078125,
"learning_rate": 1.3148341708542716e-05,
"loss": 0.972,
"step": 34600
},
{
"epoch": 1.73,
"grad_norm": 5.232773780822754,
"learning_rate": 1.3128241206030152e-05,
"loss": 1.0532,
"step": 34700
},
{
"epoch": 1.74,
"grad_norm": 5.5057783126831055,
"learning_rate": 1.3108140703517589e-05,
"loss": 0.9835,
"step": 34800
},
{
"epoch": 1.75,
"grad_norm": 3.0561375617980957,
"learning_rate": 1.3088040201005025e-05,
"loss": 1.0293,
"step": 34900
},
{
"epoch": 1.75,
"grad_norm": 4.761837959289551,
"learning_rate": 1.3068140703517589e-05,
"loss": 1.0232,
"step": 35000
},
{
"epoch": 1.75,
"grad_norm": 7.006007671356201,
"learning_rate": 1.3048040201005025e-05,
"loss": 0.9945,
"step": 35100
},
{
"epoch": 1.76,
"grad_norm": 4.829462051391602,
"learning_rate": 1.3027939698492465e-05,
"loss": 1.0589,
"step": 35200
},
{
"epoch": 1.77,
"grad_norm": 3.8825013637542725,
"learning_rate": 1.3007839195979901e-05,
"loss": 0.9984,
"step": 35300
},
{
"epoch": 1.77,
"grad_norm": 5.655978202819824,
"learning_rate": 1.2987738693467338e-05,
"loss": 1.0004,
"step": 35400
},
{
"epoch": 1.77,
"grad_norm": 5.612642765045166,
"learning_rate": 1.2967638190954774e-05,
"loss": 0.9874,
"step": 35500
},
{
"epoch": 1.78,
"grad_norm": 17.78661346435547,
"learning_rate": 1.2947537688442212e-05,
"loss": 1.0322,
"step": 35600
},
{
"epoch": 1.79,
"grad_norm": 4.723743915557861,
"learning_rate": 1.292743718592965e-05,
"loss": 0.9984,
"step": 35700
},
{
"epoch": 1.79,
"grad_norm": 5.048336982727051,
"learning_rate": 1.2907336683417086e-05,
"loss": 1.0588,
"step": 35800
},
{
"epoch": 1.79,
"grad_norm": 6.086093425750732,
"learning_rate": 1.2887236180904524e-05,
"loss": 1.0075,
"step": 35900
},
{
"epoch": 1.8,
"grad_norm": 6.542403697967529,
"learning_rate": 1.286713567839196e-05,
"loss": 1.0219,
"step": 36000
},
{
"epoch": 1.81,
"grad_norm": 5.013860702514648,
"learning_rate": 1.2847035175879398e-05,
"loss": 1.0307,
"step": 36100
},
{
"epoch": 1.81,
"grad_norm": 5.978675365447998,
"learning_rate": 1.2826934673366835e-05,
"loss": 1.0026,
"step": 36200
},
{
"epoch": 1.81,
"grad_norm": 6.217547416687012,
"learning_rate": 1.2806834170854273e-05,
"loss": 1.0196,
"step": 36300
},
{
"epoch": 1.82,
"grad_norm": 4.577905654907227,
"learning_rate": 1.2786733668341709e-05,
"loss": 0.9767,
"step": 36400
},
{
"epoch": 1.82,
"grad_norm": 4.999172210693359,
"learning_rate": 1.2766633165829147e-05,
"loss": 1.0261,
"step": 36500
},
{
"epoch": 1.83,
"grad_norm": 3.3435771465301514,
"learning_rate": 1.2746532663316585e-05,
"loss": 0.9751,
"step": 36600
},
{
"epoch": 1.83,
"grad_norm": 6.218837261199951,
"learning_rate": 1.2726432160804021e-05,
"loss": 0.9887,
"step": 36700
},
{
"epoch": 1.84,
"grad_norm": 2.914499044418335,
"learning_rate": 1.2706331658291457e-05,
"loss": 1.0172,
"step": 36800
},
{
"epoch": 1.84,
"grad_norm": 4.287944793701172,
"learning_rate": 1.2686231155778897e-05,
"loss": 1.0336,
"step": 36900
},
{
"epoch": 1.85,
"grad_norm": 9.045112609863281,
"learning_rate": 1.2666331658291458e-05,
"loss": 0.9966,
"step": 37000
},
{
"epoch": 1.85,
"grad_norm": 3.9664063453674316,
"learning_rate": 1.2646231155778896e-05,
"loss": 1.0315,
"step": 37100
},
{
"epoch": 1.86,
"grad_norm": 5.26336145401001,
"learning_rate": 1.2626130653266334e-05,
"loss": 1.031,
"step": 37200
},
{
"epoch": 1.86,
"grad_norm": 5.820954322814941,
"learning_rate": 1.260603015075377e-05,
"loss": 0.9786,
"step": 37300
},
{
"epoch": 1.87,
"grad_norm": 3.7999236583709717,
"learning_rate": 1.2585929648241206e-05,
"loss": 1.0008,
"step": 37400
},
{
"epoch": 1.88,
"grad_norm": 4.96231746673584,
"learning_rate": 1.2565829145728646e-05,
"loss": 0.9823,
"step": 37500
},
{
"epoch": 1.88,
"grad_norm": 5.442008018493652,
"learning_rate": 1.2545728643216082e-05,
"loss": 0.9993,
"step": 37600
},
{
"epoch": 1.89,
"grad_norm": 3.0178353786468506,
"learning_rate": 1.2525628140703518e-05,
"loss": 1.009,
"step": 37700
},
{
"epoch": 1.89,
"grad_norm": 4.0404052734375,
"learning_rate": 1.2505527638190955e-05,
"loss": 1.0047,
"step": 37800
},
{
"epoch": 1.9,
"grad_norm": 3.924924850463867,
"learning_rate": 1.2485427135678394e-05,
"loss": 0.9681,
"step": 37900
},
{
"epoch": 1.9,
"grad_norm": 6.560153961181641,
"learning_rate": 1.246532663316583e-05,
"loss": 0.9346,
"step": 38000
},
{
"epoch": 1.91,
"grad_norm": 4.826027870178223,
"learning_rate": 1.2445226130653267e-05,
"loss": 0.9878,
"step": 38100
},
{
"epoch": 1.91,
"grad_norm": 3.489680767059326,
"learning_rate": 1.2425125628140703e-05,
"loss": 0.9943,
"step": 38200
},
{
"epoch": 1.92,
"grad_norm": 4.7767014503479,
"learning_rate": 1.2405025125628141e-05,
"loss": 1.02,
"step": 38300
},
{
"epoch": 1.92,
"grad_norm": 7.311853408813477,
"learning_rate": 1.238492462311558e-05,
"loss": 0.946,
"step": 38400
},
{
"epoch": 1.93,
"grad_norm": 4.217949390411377,
"learning_rate": 1.236502512562814e-05,
"loss": 0.974,
"step": 38500
},
{
"epoch": 1.93,
"grad_norm": 8.919093132019043,
"learning_rate": 1.234492462311558e-05,
"loss": 0.9628,
"step": 38600
},
{
"epoch": 1.94,
"grad_norm": 4.355369567871094,
"learning_rate": 1.2324824120603016e-05,
"loss": 0.9325,
"step": 38700
},
{
"epoch": 1.94,
"grad_norm": 5.474518775939941,
"learning_rate": 1.2304723618090452e-05,
"loss": 0.9505,
"step": 38800
},
{
"epoch": 1.94,
"grad_norm": 6.389540195465088,
"learning_rate": 1.228462311557789e-05,
"loss": 0.9574,
"step": 38900
},
{
"epoch": 1.95,
"grad_norm": 6.9164719581604,
"learning_rate": 1.2264522613065328e-05,
"loss": 0.9644,
"step": 39000
},
{
"epoch": 1.96,
"grad_norm": 4.559136390686035,
"learning_rate": 1.2244422110552764e-05,
"loss": 1.0306,
"step": 39100
},
{
"epoch": 1.96,
"grad_norm": 6.381926536560059,
"learning_rate": 1.2224321608040202e-05,
"loss": 0.9542,
"step": 39200
},
{
"epoch": 1.96,
"grad_norm": 7.826279163360596,
"learning_rate": 1.2204221105527639e-05,
"loss": 0.9818,
"step": 39300
},
{
"epoch": 1.97,
"grad_norm": 5.7296929359436035,
"learning_rate": 1.2184120603015077e-05,
"loss": 0.9591,
"step": 39400
},
{
"epoch": 1.98,
"grad_norm": 6.479053974151611,
"learning_rate": 1.2164020100502515e-05,
"loss": 1.0083,
"step": 39500
},
{
"epoch": 1.98,
"grad_norm": 5.9377241134643555,
"learning_rate": 1.2144120603015077e-05,
"loss": 0.9969,
"step": 39600
},
{
"epoch": 1.98,
"grad_norm": 4.59481143951416,
"learning_rate": 1.2124020100502513e-05,
"loss": 1.015,
"step": 39700
},
{
"epoch": 1.99,
"grad_norm": 4.979703903198242,
"learning_rate": 1.2103919597989951e-05,
"loss": 0.977,
"step": 39800
},
{
"epoch": 2.0,
"grad_norm": 6.539973735809326,
"learning_rate": 1.2083819095477388e-05,
"loss": 0.9938,
"step": 39900
},
{
"epoch": 2.0,
"grad_norm": 5.971490383148193,
"learning_rate": 1.2063718592964825e-05,
"loss": 0.9848,
"step": 40000
},
{
"epoch": 2.0,
"eval_loss": 0.9915822744369507,
"eval_runtime": 21.5957,
"eval_samples_per_second": 46.305,
"eval_steps_per_second": 5.788,
"step": 40000
},
{
"epoch": 2.0,
"grad_norm": 5.639512538909912,
"learning_rate": 1.2043618090452262e-05,
"loss": 0.9401,
"step": 40100
},
{
"epoch": 2.01,
"grad_norm": 3.0007824897766113,
"learning_rate": 1.20235175879397e-05,
"loss": 0.9769,
"step": 40200
},
{
"epoch": 2.02,
"grad_norm": 4.346365451812744,
"learning_rate": 1.2003417085427136e-05,
"loss": 0.9247,
"step": 40300
},
{
"epoch": 2.02,
"grad_norm": 6.308602809906006,
"learning_rate": 1.1983316582914574e-05,
"loss": 0.9685,
"step": 40400
},
{
"epoch": 2.02,
"grad_norm": 4.597143173217773,
"learning_rate": 1.1963216080402012e-05,
"loss": 0.907,
"step": 40500
},
{
"epoch": 2.03,
"grad_norm": 6.000264644622803,
"learning_rate": 1.1943115577889448e-05,
"loss": 0.9311,
"step": 40600
},
{
"epoch": 2.04,
"grad_norm": 4.718263149261475,
"learning_rate": 1.1923015075376885e-05,
"loss": 0.9707,
"step": 40700
},
{
"epoch": 2.04,
"grad_norm": 3.7472355365753174,
"learning_rate": 1.1902914572864324e-05,
"loss": 0.9812,
"step": 40800
},
{
"epoch": 2.04,
"grad_norm": 4.8061017990112305,
"learning_rate": 1.188281407035176e-05,
"loss": 0.9461,
"step": 40900
},
{
"epoch": 2.05,
"grad_norm": 5.2381391525268555,
"learning_rate": 1.1862713567839197e-05,
"loss": 0.9972,
"step": 41000
},
{
"epoch": 2.06,
"grad_norm": 6.1567583084106445,
"learning_rate": 1.1842613065326633e-05,
"loss": 0.9611,
"step": 41100
},
{
"epoch": 2.06,
"grad_norm": 5.496160984039307,
"learning_rate": 1.1822512562814071e-05,
"loss": 0.9612,
"step": 41200
},
{
"epoch": 2.06,
"grad_norm": 6.659996509552002,
"learning_rate": 1.1802412060301509e-05,
"loss": 0.9593,
"step": 41300
},
{
"epoch": 2.07,
"grad_norm": 7.010763645172119,
"learning_rate": 1.1782311557788945e-05,
"loss": 0.9079,
"step": 41400
},
{
"epoch": 2.08,
"grad_norm": 5.539340496063232,
"learning_rate": 1.1762211055276383e-05,
"loss": 0.946,
"step": 41500
},
{
"epoch": 2.08,
"grad_norm": 4.7269368171691895,
"learning_rate": 1.174211055276382e-05,
"loss": 0.9702,
"step": 41600
},
{
"epoch": 2.08,
"grad_norm": 6.573697090148926,
"learning_rate": 1.1722010050251257e-05,
"loss": 0.9166,
"step": 41700
},
{
"epoch": 2.09,
"grad_norm": 5.467616558074951,
"learning_rate": 1.1701909547738694e-05,
"loss": 0.9479,
"step": 41800
},
{
"epoch": 2.1,
"grad_norm": 7.292219638824463,
"learning_rate": 1.1681809045226132e-05,
"loss": 0.9694,
"step": 41900
},
{
"epoch": 2.1,
"grad_norm": 5.9063849449157715,
"learning_rate": 1.1661708542713568e-05,
"loss": 0.9467,
"step": 42000
},
{
"epoch": 2.1,
"grad_norm": 7.106956958770752,
"learning_rate": 1.1641608040201006e-05,
"loss": 0.9344,
"step": 42100
},
{
"epoch": 2.11,
"grad_norm": 2.7898268699645996,
"learning_rate": 1.1621507537688444e-05,
"loss": 0.9174,
"step": 42200
},
{
"epoch": 2.12,
"grad_norm": 5.543144226074219,
"learning_rate": 1.160140703517588e-05,
"loss": 0.9399,
"step": 42300
},
{
"epoch": 2.12,
"grad_norm": 4.507541656494141,
"learning_rate": 1.1581306532663317e-05,
"loss": 0.8989,
"step": 42400
},
{
"epoch": 2.12,
"grad_norm": 7.4493937492370605,
"learning_rate": 1.1561206030150756e-05,
"loss": 0.9663,
"step": 42500
},
{
"epoch": 2.13,
"grad_norm": 5.758662700653076,
"learning_rate": 1.1541105527638192e-05,
"loss": 0.983,
"step": 42600
},
{
"epoch": 2.13,
"grad_norm": 4.6601386070251465,
"learning_rate": 1.1521005025125629e-05,
"loss": 0.936,
"step": 42700
},
{
"epoch": 2.14,
"grad_norm": 6.687641620635986,
"learning_rate": 1.1500904522613065e-05,
"loss": 0.9452,
"step": 42800
},
{
"epoch": 2.15,
"grad_norm": 6.454759120941162,
"learning_rate": 1.1480804020100505e-05,
"loss": 0.9494,
"step": 42900
},
{
"epoch": 2.15,
"grad_norm": 6.235274314880371,
"learning_rate": 1.1460703517587941e-05,
"loss": 0.9107,
"step": 43000
},
{
"epoch": 2.15,
"grad_norm": 6.445216655731201,
"learning_rate": 1.1440603015075377e-05,
"loss": 0.9448,
"step": 43100
},
{
"epoch": 2.16,
"grad_norm": 4.565326690673828,
"learning_rate": 1.1420502512562814e-05,
"loss": 0.9435,
"step": 43200
},
{
"epoch": 2.17,
"grad_norm": 4.653913497924805,
"learning_rate": 1.1400402010050253e-05,
"loss": 0.9492,
"step": 43300
},
{
"epoch": 2.17,
"grad_norm": 4.022702693939209,
"learning_rate": 1.138030150753769e-05,
"loss": 0.9365,
"step": 43400
},
{
"epoch": 2.17,
"grad_norm": 6.998848915100098,
"learning_rate": 1.1360201005025126e-05,
"loss": 0.9215,
"step": 43500
},
{
"epoch": 2.18,
"grad_norm": 3.925429344177246,
"learning_rate": 1.1340100502512564e-05,
"loss": 0.9408,
"step": 43600
},
{
"epoch": 2.19,
"grad_norm": 5.22701358795166,
"learning_rate": 1.132e-05,
"loss": 0.9755,
"step": 43700
},
{
"epoch": 2.19,
"grad_norm": 5.142667293548584,
"learning_rate": 1.1299899497487438e-05,
"loss": 0.8938,
"step": 43800
},
{
"epoch": 2.19,
"grad_norm": 5.2655158042907715,
"learning_rate": 1.1279798994974876e-05,
"loss": 0.9751,
"step": 43900
},
{
"epoch": 2.2,
"grad_norm": 5.084207057952881,
"learning_rate": 1.1259698492462312e-05,
"loss": 0.9141,
"step": 44000
},
{
"epoch": 2.21,
"grad_norm": 4.578594207763672,
"learning_rate": 1.1239597989949749e-05,
"loss": 0.9403,
"step": 44100
},
{
"epoch": 2.21,
"grad_norm": 3.3010849952697754,
"learning_rate": 1.1219497487437188e-05,
"loss": 0.9657,
"step": 44200
},
{
"epoch": 2.21,
"grad_norm": 6.451618194580078,
"learning_rate": 1.1199396984924624e-05,
"loss": 0.9297,
"step": 44300
},
{
"epoch": 2.22,
"grad_norm": 5.8492655754089355,
"learning_rate": 1.117929648241206e-05,
"loss": 0.9157,
"step": 44400
},
{
"epoch": 2.23,
"grad_norm": 5.015758037567139,
"learning_rate": 1.1159195979899497e-05,
"loss": 0.9508,
"step": 44500
},
{
"epoch": 2.23,
"grad_norm": 4.814078330993652,
"learning_rate": 1.1139095477386937e-05,
"loss": 0.9361,
"step": 44600
},
{
"epoch": 2.23,
"grad_norm": 5.004156112670898,
"learning_rate": 1.1118994974874373e-05,
"loss": 0.958,
"step": 44700
},
{
"epoch": 2.24,
"grad_norm": 5.016057968139648,
"learning_rate": 1.109889447236181e-05,
"loss": 0.9755,
"step": 44800
},
{
"epoch": 2.25,
"grad_norm": 5.041826248168945,
"learning_rate": 1.1078793969849246e-05,
"loss": 0.9082,
"step": 44900
},
{
"epoch": 2.25,
"grad_norm": 5.787368297576904,
"learning_rate": 1.1058693467336685e-05,
"loss": 0.9076,
"step": 45000
},
{
"epoch": 2.25,
"grad_norm": 5.170538902282715,
"learning_rate": 1.1038592964824122e-05,
"loss": 0.9117,
"step": 45100
},
{
"epoch": 2.26,
"grad_norm": 7.477475643157959,
"learning_rate": 1.1018492462311558e-05,
"loss": 0.8987,
"step": 45200
},
{
"epoch": 2.27,
"grad_norm": 4.626328945159912,
"learning_rate": 1.0998391959798996e-05,
"loss": 0.9197,
"step": 45300
},
{
"epoch": 2.27,
"grad_norm": 5.800539016723633,
"learning_rate": 1.0978291457286434e-05,
"loss": 0.9025,
"step": 45400
},
{
"epoch": 2.27,
"grad_norm": 4.291562080383301,
"learning_rate": 1.0958391959798994e-05,
"loss": 0.9348,
"step": 45500
},
{
"epoch": 2.28,
"grad_norm": 5.439847946166992,
"learning_rate": 1.0938291457286434e-05,
"loss": 0.9416,
"step": 45600
},
{
"epoch": 2.29,
"grad_norm": 5.728611946105957,
"learning_rate": 1.091819095477387e-05,
"loss": 0.9124,
"step": 45700
},
{
"epoch": 2.29,
"grad_norm": 3.7975008487701416,
"learning_rate": 1.0898090452261307e-05,
"loss": 0.9345,
"step": 45800
},
{
"epoch": 2.29,
"grad_norm": 7.625438690185547,
"learning_rate": 1.0877989949748745e-05,
"loss": 0.8925,
"step": 45900
},
{
"epoch": 2.3,
"grad_norm": 4.858023643493652,
"learning_rate": 1.0857889447236183e-05,
"loss": 0.9103,
"step": 46000
},
{
"epoch": 2.31,
"grad_norm": 6.363548755645752,
"learning_rate": 1.0837788944723619e-05,
"loss": 0.9523,
"step": 46100
},
{
"epoch": 2.31,
"grad_norm": 4.639822959899902,
"learning_rate": 1.0817688442211057e-05,
"loss": 0.9322,
"step": 46200
},
{
"epoch": 2.31,
"grad_norm": 4.796472072601318,
"learning_rate": 1.0797587939698493e-05,
"loss": 0.9242,
"step": 46300
},
{
"epoch": 2.32,
"grad_norm": 3.8870980739593506,
"learning_rate": 1.077748743718593e-05,
"loss": 0.9048,
"step": 46400
},
{
"epoch": 2.33,
"grad_norm": 6.010646343231201,
"learning_rate": 1.0757386934673369e-05,
"loss": 0.9566,
"step": 46500
},
{
"epoch": 2.33,
"grad_norm": 3.925715684890747,
"learning_rate": 1.0737286432160805e-05,
"loss": 0.9274,
"step": 46600
},
{
"epoch": 2.33,
"grad_norm": 5.222326278686523,
"learning_rate": 1.0717185929648242e-05,
"loss": 0.8914,
"step": 46700
},
{
"epoch": 2.34,
"grad_norm": 5.363781929016113,
"learning_rate": 1.0697085427135678e-05,
"loss": 0.922,
"step": 46800
},
{
"epoch": 2.34,
"grad_norm": 6.332427024841309,
"learning_rate": 1.0676984924623118e-05,
"loss": 0.9017,
"step": 46900
},
{
"epoch": 2.35,
"grad_norm": 4.68159818649292,
"learning_rate": 1.0656884422110554e-05,
"loss": 0.9089,
"step": 47000
},
{
"epoch": 2.35,
"grad_norm": 4.770488739013672,
"learning_rate": 1.063678391959799e-05,
"loss": 0.9738,
"step": 47100
},
{
"epoch": 2.36,
"grad_norm": 6.209041595458984,
"learning_rate": 1.0616683417085426e-05,
"loss": 0.9301,
"step": 47200
},
{
"epoch": 2.37,
"grad_norm": 5.330206394195557,
"learning_rate": 1.0596582914572866e-05,
"loss": 0.9515,
"step": 47300
},
{
"epoch": 2.37,
"grad_norm": 7.701655387878418,
"learning_rate": 1.0576482412060302e-05,
"loss": 0.9072,
"step": 47400
},
{
"epoch": 2.38,
"grad_norm": 4.921889305114746,
"learning_rate": 1.0556381909547739e-05,
"loss": 0.9326,
"step": 47500
},
{
"epoch": 2.38,
"grad_norm": 5.353864669799805,
"learning_rate": 1.0536281407035177e-05,
"loss": 0.902,
"step": 47600
},
{
"epoch": 2.38,
"grad_norm": 4.63252592086792,
"learning_rate": 1.0516180904522615e-05,
"loss": 0.9357,
"step": 47700
},
{
"epoch": 2.39,
"grad_norm": 5.968425750732422,
"learning_rate": 1.0496281407035175e-05,
"loss": 0.9416,
"step": 47800
},
{
"epoch": 2.4,
"grad_norm": 5.979503154754639,
"learning_rate": 1.0476180904522615e-05,
"loss": 0.9461,
"step": 47900
},
{
"epoch": 2.4,
"grad_norm": 5.928488731384277,
"learning_rate": 1.0456080402010051e-05,
"loss": 0.9045,
"step": 48000
},
{
"epoch": 2.41,
"grad_norm": 12.569512367248535,
"learning_rate": 1.0435979899497488e-05,
"loss": 0.9205,
"step": 48100
},
{
"epoch": 2.41,
"grad_norm": 4.5606865882873535,
"learning_rate": 1.0415879396984926e-05,
"loss": 0.9005,
"step": 48200
},
{
"epoch": 2.42,
"grad_norm": 5.511040210723877,
"learning_rate": 1.0395778894472364e-05,
"loss": 0.9105,
"step": 48300
},
{
"epoch": 2.42,
"grad_norm": 5.660979747772217,
"learning_rate": 1.03756783919598e-05,
"loss": 0.8911,
"step": 48400
},
{
"epoch": 2.42,
"grad_norm": 3.5720648765563965,
"learning_rate": 1.0355577889447238e-05,
"loss": 0.9468,
"step": 48500
},
{
"epoch": 2.43,
"grad_norm": 5.770594120025635,
"learning_rate": 1.0335477386934674e-05,
"loss": 0.9296,
"step": 48600
},
{
"epoch": 2.44,
"grad_norm": 4.0545477867126465,
"learning_rate": 1.0315376884422112e-05,
"loss": 0.9133,
"step": 48700
},
{
"epoch": 2.44,
"grad_norm": 4.586203575134277,
"learning_rate": 1.0295276381909548e-05,
"loss": 0.906,
"step": 48800
},
{
"epoch": 2.44,
"grad_norm": 5.315196514129639,
"learning_rate": 1.0275175879396986e-05,
"loss": 0.9065,
"step": 48900
},
{
"epoch": 2.45,
"grad_norm": 5.344489574432373,
"learning_rate": 1.0255075376884423e-05,
"loss": 0.9363,
"step": 49000
},
{
"epoch": 2.46,
"grad_norm": 6.762577533721924,
"learning_rate": 1.0234974874371859e-05,
"loss": 0.9366,
"step": 49100
},
{
"epoch": 2.46,
"grad_norm": 4.087870121002197,
"learning_rate": 1.0214874371859299e-05,
"loss": 0.8812,
"step": 49200
},
{
"epoch": 2.46,
"grad_norm": 5.586741924285889,
"learning_rate": 1.0194773869346735e-05,
"loss": 0.9341,
"step": 49300
},
{
"epoch": 2.47,
"grad_norm": 8.180070877075195,
"learning_rate": 1.0174673366834171e-05,
"loss": 0.9381,
"step": 49400
},
{
"epoch": 2.48,
"grad_norm": 4.389576435089111,
"learning_rate": 1.0154572864321607e-05,
"loss": 0.9288,
"step": 49500
},
{
"epoch": 2.48,
"grad_norm": 4.339807033538818,
"learning_rate": 1.0134472361809047e-05,
"loss": 0.9282,
"step": 49600
},
{
"epoch": 2.48,
"grad_norm": 7.801273345947266,
"learning_rate": 1.0114371859296483e-05,
"loss": 0.9657,
"step": 49700
},
{
"epoch": 2.49,
"grad_norm": 6.016520977020264,
"learning_rate": 1.009427135678392e-05,
"loss": 0.8704,
"step": 49800
},
{
"epoch": 2.5,
"grad_norm": 5.2764506340026855,
"learning_rate": 1.0074170854271358e-05,
"loss": 0.9226,
"step": 49900
},
{
"epoch": 2.5,
"grad_norm": 4.923444747924805,
"learning_rate": 1.0054070351758796e-05,
"loss": 0.9084,
"step": 50000
},
{
"epoch": 2.5,
"eval_loss": 0.9846327900886536,
"eval_runtime": 21.5925,
"eval_samples_per_second": 46.312,
"eval_steps_per_second": 5.789,
"step": 50000
},
{
"epoch": 2.5,
"grad_norm": 6.061006546020508,
"learning_rate": 1.0033969849246232e-05,
"loss": 0.9218,
"step": 50100
},
{
"epoch": 2.51,
"grad_norm": 4.1440348625183105,
"learning_rate": 1.0013869346733668e-05,
"loss": 0.9324,
"step": 50200
},
{
"epoch": 2.52,
"grad_norm": 4.084045886993408,
"learning_rate": 9.993768844221106e-06,
"loss": 0.8859,
"step": 50300
},
{
"epoch": 2.52,
"grad_norm": 3.723971366882324,
"learning_rate": 9.973668341708544e-06,
"loss": 0.9128,
"step": 50400
},
{
"epoch": 2.52,
"grad_norm": 3.9887030124664307,
"learning_rate": 9.95356783919598e-06,
"loss": 0.8987,
"step": 50500
},
{
"epoch": 2.53,
"grad_norm": 5.572610855102539,
"learning_rate": 9.933467336683418e-06,
"loss": 0.9287,
"step": 50600
},
{
"epoch": 2.54,
"grad_norm": 5.956911087036133,
"learning_rate": 9.913366834170856e-06,
"loss": 0.8808,
"step": 50700
},
{
"epoch": 2.54,
"grad_norm": 3.948564052581787,
"learning_rate": 9.893266331658293e-06,
"loss": 0.9243,
"step": 50800
},
{
"epoch": 2.54,
"grad_norm": 5.561892509460449,
"learning_rate": 9.87316582914573e-06,
"loss": 0.9174,
"step": 50900
},
{
"epoch": 2.55,
"grad_norm": 5.9155755043029785,
"learning_rate": 9.853065326633167e-06,
"loss": 0.8951,
"step": 51000
},
{
"epoch": 2.56,
"grad_norm": 5.4488348960876465,
"learning_rate": 9.832964824120603e-06,
"loss": 0.8864,
"step": 51100
},
{
"epoch": 2.56,
"grad_norm": 4.52565860748291,
"learning_rate": 9.812864321608041e-06,
"loss": 0.9103,
"step": 51200
},
{
"epoch": 2.56,
"grad_norm": 3.995807647705078,
"learning_rate": 9.792763819095477e-06,
"loss": 0.8999,
"step": 51300
},
{
"epoch": 2.57,
"grad_norm": 9.156529426574707,
"learning_rate": 9.772663316582915e-06,
"loss": 0.9383,
"step": 51400
},
{
"epoch": 2.58,
"grad_norm": 6.388377666473389,
"learning_rate": 9.752562814070352e-06,
"loss": 0.908,
"step": 51500
},
{
"epoch": 2.58,
"grad_norm": 3.975545644760132,
"learning_rate": 9.73246231155779e-06,
"loss": 0.9006,
"step": 51600
},
{
"epoch": 2.58,
"grad_norm": 4.579479694366455,
"learning_rate": 9.712361809045226e-06,
"loss": 0.9443,
"step": 51700
},
{
"epoch": 2.59,
"grad_norm": 5.22560977935791,
"learning_rate": 9.69246231155779e-06,
"loss": 0.909,
"step": 51800
},
{
"epoch": 2.59,
"grad_norm": 5.2606587409973145,
"learning_rate": 9.672361809045226e-06,
"loss": 0.9255,
"step": 51900
},
{
"epoch": 2.6,
"grad_norm": 4.772227764129639,
"learning_rate": 9.652261306532664e-06,
"loss": 0.9161,
"step": 52000
},
{
"epoch": 2.6,
"grad_norm": 4.636828899383545,
"learning_rate": 9.6321608040201e-06,
"loss": 0.874,
"step": 52100
},
{
"epoch": 2.61,
"grad_norm": 4.5946784019470215,
"learning_rate": 9.612060301507538e-06,
"loss": 0.902,
"step": 52200
},
{
"epoch": 2.62,
"grad_norm": 4.0993266105651855,
"learning_rate": 9.591959798994975e-06,
"loss": 0.9369,
"step": 52300
},
{
"epoch": 2.62,
"grad_norm": 5.468399524688721,
"learning_rate": 9.571859296482413e-06,
"loss": 0.9359,
"step": 52400
},
{
"epoch": 2.62,
"grad_norm": 10.92428970336914,
"learning_rate": 9.551758793969849e-06,
"loss": 0.8889,
"step": 52500
},
{
"epoch": 2.63,
"grad_norm": 6.2350029945373535,
"learning_rate": 9.531658291457287e-06,
"loss": 0.9304,
"step": 52600
},
{
"epoch": 2.63,
"grad_norm": 4.780547618865967,
"learning_rate": 9.511557788944725e-06,
"loss": 0.9396,
"step": 52700
},
{
"epoch": 2.64,
"grad_norm": 3.1009738445281982,
"learning_rate": 9.491457286432161e-06,
"loss": 0.9077,
"step": 52800
},
{
"epoch": 2.65,
"grad_norm": 7.036947727203369,
"learning_rate": 9.4713567839196e-06,
"loss": 0.8753,
"step": 52900
},
{
"epoch": 2.65,
"grad_norm": 4.945110321044922,
"learning_rate": 9.451256281407035e-06,
"loss": 0.9157,
"step": 53000
},
{
"epoch": 2.66,
"grad_norm": 5.361321926116943,
"learning_rate": 9.431155778894473e-06,
"loss": 0.8929,
"step": 53100
},
{
"epoch": 2.66,
"grad_norm": 3.351379632949829,
"learning_rate": 9.411055276381911e-06,
"loss": 0.8636,
"step": 53200
},
{
"epoch": 2.67,
"grad_norm": 5.34309720993042,
"learning_rate": 9.390954773869348e-06,
"loss": 0.8865,
"step": 53300
},
{
"epoch": 2.67,
"grad_norm": 5.316425800323486,
"learning_rate": 9.370854271356786e-06,
"loss": 0.9178,
"step": 53400
},
{
"epoch": 2.67,
"grad_norm": 4.478712558746338,
"learning_rate": 9.350753768844222e-06,
"loss": 0.9181,
"step": 53500
},
{
"epoch": 2.68,
"grad_norm": 5.095877647399902,
"learning_rate": 9.33065326633166e-06,
"loss": 0.902,
"step": 53600
},
{
"epoch": 2.69,
"grad_norm": 4.4164862632751465,
"learning_rate": 9.310552763819096e-06,
"loss": 0.887,
"step": 53700
},
{
"epoch": 2.69,
"grad_norm": 6.3961591720581055,
"learning_rate": 9.290452261306533e-06,
"loss": 0.8778,
"step": 53800
},
{
"epoch": 2.69,
"grad_norm": 7.141729354858398,
"learning_rate": 9.270552763819097e-06,
"loss": 0.9144,
"step": 53900
},
{
"epoch": 2.7,
"grad_norm": 5.858211040496826,
"learning_rate": 9.250452261306535e-06,
"loss": 0.8889,
"step": 54000
},
{
"epoch": 2.71,
"grad_norm": 5.192725658416748,
"learning_rate": 9.230351758793971e-06,
"loss": 0.8928,
"step": 54100
},
{
"epoch": 2.71,
"grad_norm": 6.190788745880127,
"learning_rate": 9.210251256281407e-06,
"loss": 0.8683,
"step": 54200
},
{
"epoch": 2.71,
"grad_norm": 4.610683441162109,
"learning_rate": 9.190150753768845e-06,
"loss": 0.9473,
"step": 54300
},
{
"epoch": 2.72,
"grad_norm": 5.043734550476074,
"learning_rate": 9.170050251256281e-06,
"loss": 0.9142,
"step": 54400
},
{
"epoch": 2.73,
"grad_norm": 5.166931629180908,
"learning_rate": 9.14994974874372e-06,
"loss": 0.8894,
"step": 54500
},
{
"epoch": 2.73,
"grad_norm": 5.05250358581543,
"learning_rate": 9.129849246231156e-06,
"loss": 0.8799,
"step": 54600
},
{
"epoch": 2.73,
"grad_norm": 5.468914031982422,
"learning_rate": 9.109748743718594e-06,
"loss": 0.9099,
"step": 54700
},
{
"epoch": 2.74,
"grad_norm": 4.162414073944092,
"learning_rate": 9.08964824120603e-06,
"loss": 0.8859,
"step": 54800
},
{
"epoch": 2.75,
"grad_norm": 5.149291515350342,
"learning_rate": 9.069547738693468e-06,
"loss": 0.9096,
"step": 54900
},
{
"epoch": 2.75,
"grad_norm": 4.889472961425781,
"learning_rate": 9.049447236180904e-06,
"loss": 0.8953,
"step": 55000
},
{
"epoch": 2.75,
"grad_norm": 4.146818161010742,
"learning_rate": 9.029346733668342e-06,
"loss": 0.8917,
"step": 55100
},
{
"epoch": 2.76,
"grad_norm": 5.937385559082031,
"learning_rate": 9.00924623115578e-06,
"loss": 0.9295,
"step": 55200
},
{
"epoch": 2.77,
"grad_norm": 4.749314785003662,
"learning_rate": 8.989145728643216e-06,
"loss": 0.8776,
"step": 55300
},
{
"epoch": 2.77,
"grad_norm": 6.271254539489746,
"learning_rate": 8.969045226130654e-06,
"loss": 0.8593,
"step": 55400
},
{
"epoch": 2.77,
"grad_norm": 5.769760608673096,
"learning_rate": 8.948944723618092e-06,
"loss": 0.891,
"step": 55500
},
{
"epoch": 2.78,
"grad_norm": 4.185112476348877,
"learning_rate": 8.928844221105529e-06,
"loss": 0.8869,
"step": 55600
},
{
"epoch": 2.79,
"grad_norm": 3.2164394855499268,
"learning_rate": 8.908743718592967e-06,
"loss": 0.8992,
"step": 55700
},
{
"epoch": 2.79,
"grad_norm": 4.406613349914551,
"learning_rate": 8.888643216080403e-06,
"loss": 0.8971,
"step": 55800
},
{
"epoch": 2.79,
"grad_norm": 5.101110458374023,
"learning_rate": 8.868542713567841e-06,
"loss": 0.9066,
"step": 55900
},
{
"epoch": 2.8,
"grad_norm": 4.963405132293701,
"learning_rate": 8.848643216080403e-06,
"loss": 0.881,
"step": 56000
},
{
"epoch": 2.81,
"grad_norm": 7.5268683433532715,
"learning_rate": 8.82854271356784e-06,
"loss": 0.8692,
"step": 56100
},
{
"epoch": 2.81,
"grad_norm": 5.325132369995117,
"learning_rate": 8.808442211055278e-06,
"loss": 0.895,
"step": 56200
},
{
"epoch": 2.81,
"grad_norm": 4.687073707580566,
"learning_rate": 8.788341708542715e-06,
"loss": 0.9007,
"step": 56300
},
{
"epoch": 2.82,
"grad_norm": 4.215831279754639,
"learning_rate": 8.768241206030152e-06,
"loss": 0.8783,
"step": 56400
},
{
"epoch": 2.83,
"grad_norm": 6.363833427429199,
"learning_rate": 8.74814070351759e-06,
"loss": 0.9276,
"step": 56500
},
{
"epoch": 2.83,
"grad_norm": 4.2875518798828125,
"learning_rate": 8.728040201005026e-06,
"loss": 0.8758,
"step": 56600
},
{
"epoch": 2.83,
"grad_norm": 4.461952209472656,
"learning_rate": 8.707939698492464e-06,
"loss": 0.8789,
"step": 56700
},
{
"epoch": 2.84,
"grad_norm": 7.590397834777832,
"learning_rate": 8.6878391959799e-06,
"loss": 0.8726,
"step": 56800
},
{
"epoch": 2.84,
"grad_norm": 5.754077911376953,
"learning_rate": 8.667738693467337e-06,
"loss": 0.9022,
"step": 56900
},
{
"epoch": 2.85,
"grad_norm": 4.305074214935303,
"learning_rate": 8.647638190954775e-06,
"loss": 0.8933,
"step": 57000
},
{
"epoch": 2.85,
"grad_norm": 4.451827526092529,
"learning_rate": 8.627738693467337e-06,
"loss": 0.934,
"step": 57100
},
{
"epoch": 2.86,
"grad_norm": 6.323834419250488,
"learning_rate": 8.607638190954775e-06,
"loss": 0.8858,
"step": 57200
},
{
"epoch": 2.87,
"grad_norm": 6.937102317810059,
"learning_rate": 8.587537688442211e-06,
"loss": 0.9263,
"step": 57300
},
{
"epoch": 2.87,
"grad_norm": 7.153318881988525,
"learning_rate": 8.56743718592965e-06,
"loss": 0.8868,
"step": 57400
},
{
"epoch": 2.88,
"grad_norm": 4.7994842529296875,
"learning_rate": 8.547336683417085e-06,
"loss": 0.877,
"step": 57500
},
{
"epoch": 2.88,
"grad_norm": 6.6480793952941895,
"learning_rate": 8.527236180904523e-06,
"loss": 0.8774,
"step": 57600
},
{
"epoch": 2.88,
"grad_norm": 7.170138835906982,
"learning_rate": 8.50713567839196e-06,
"loss": 0.8704,
"step": 57700
},
{
"epoch": 2.89,
"grad_norm": 4.006447792053223,
"learning_rate": 8.487035175879398e-06,
"loss": 0.8736,
"step": 57800
},
{
"epoch": 2.9,
"grad_norm": 4.998128414154053,
"learning_rate": 8.466934673366834e-06,
"loss": 0.8653,
"step": 57900
},
{
"epoch": 2.9,
"grad_norm": 5.756192207336426,
"learning_rate": 8.446834170854272e-06,
"loss": 0.8756,
"step": 58000
},
{
"epoch": 2.91,
"grad_norm": 5.486929893493652,
"learning_rate": 8.426733668341708e-06,
"loss": 0.9047,
"step": 58100
},
{
"epoch": 2.91,
"grad_norm": 4.589926242828369,
"learning_rate": 8.406633165829146e-06,
"loss": 0.8766,
"step": 58200
},
{
"epoch": 2.92,
"grad_norm": 4.535083293914795,
"learning_rate": 8.386532663316584e-06,
"loss": 0.8758,
"step": 58300
},
{
"epoch": 2.92,
"grad_norm": 3.2254798412323,
"learning_rate": 8.36643216080402e-06,
"loss": 0.8813,
"step": 58400
},
{
"epoch": 2.92,
"grad_norm": 6.055229187011719,
"learning_rate": 8.346331658291458e-06,
"loss": 0.8779,
"step": 58500
},
{
"epoch": 2.93,
"grad_norm": 4.221169471740723,
"learning_rate": 8.326231155778895e-06,
"loss": 0.9325,
"step": 58600
},
{
"epoch": 2.94,
"grad_norm": 5.035799026489258,
"learning_rate": 8.306130653266333e-06,
"loss": 0.8896,
"step": 58700
},
{
"epoch": 2.94,
"grad_norm": 6.551968574523926,
"learning_rate": 8.28603015075377e-06,
"loss": 0.8644,
"step": 58800
},
{
"epoch": 2.94,
"grad_norm": 4.297557353973389,
"learning_rate": 8.265929648241207e-06,
"loss": 0.8853,
"step": 58900
},
{
"epoch": 2.95,
"grad_norm": 6.603255271911621,
"learning_rate": 8.245829145728645e-06,
"loss": 0.9237,
"step": 59000
},
{
"epoch": 2.96,
"grad_norm": 6.272432804107666,
"learning_rate": 8.225728643216081e-06,
"loss": 0.8708,
"step": 59100
},
{
"epoch": 2.96,
"grad_norm": 5.919680595397949,
"learning_rate": 8.20562814070352e-06,
"loss": 0.8525,
"step": 59200
},
{
"epoch": 2.96,
"grad_norm": 4.834166049957275,
"learning_rate": 8.185527638190955e-06,
"loss": 0.8576,
"step": 59300
},
{
"epoch": 2.97,
"grad_norm": 5.948410987854004,
"learning_rate": 8.165427135678393e-06,
"loss": 0.9017,
"step": 59400
},
{
"epoch": 2.98,
"grad_norm": 7.001020431518555,
"learning_rate": 8.14532663316583e-06,
"loss": 0.891,
"step": 59500
},
{
"epoch": 2.98,
"grad_norm": 5.623896598815918,
"learning_rate": 8.125226130653266e-06,
"loss": 0.8255,
"step": 59600
},
{
"epoch": 2.98,
"grad_norm": 5.0935468673706055,
"learning_rate": 8.105125628140704e-06,
"loss": 0.8709,
"step": 59700
},
{
"epoch": 2.99,
"grad_norm": 6.403896808624268,
"learning_rate": 8.08502512562814e-06,
"loss": 0.8957,
"step": 59800
},
{
"epoch": 3.0,
"grad_norm": 5.92683744430542,
"learning_rate": 8.064924623115578e-06,
"loss": 0.9102,
"step": 59900
},
{
"epoch": 3.0,
"grad_norm": 3.4657108783721924,
"learning_rate": 8.04502512562814e-06,
"loss": 0.916,
"step": 60000
},
{
"epoch": 3.0,
"eval_loss": 0.9292559623718262,
"eval_runtime": 21.5879,
"eval_samples_per_second": 46.322,
"eval_steps_per_second": 5.79,
"step": 60000
},
{
"epoch": 3.0,
"grad_norm": 5.614874839782715,
"learning_rate": 8.024924623115579e-06,
"loss": 0.8151,
"step": 60100
},
{
"epoch": 3.01,
"grad_norm": 6.177361011505127,
"learning_rate": 8.004824120603015e-06,
"loss": 0.8266,
"step": 60200
},
{
"epoch": 3.02,
"grad_norm": 5.4862213134765625,
"learning_rate": 7.984723618090453e-06,
"loss": 0.7741,
"step": 60300
},
{
"epoch": 3.02,
"grad_norm": 6.674380779266357,
"learning_rate": 7.964623115577889e-06,
"loss": 0.8134,
"step": 60400
},
{
"epoch": 3.02,
"grad_norm": 6.712404251098633,
"learning_rate": 7.944522613065327e-06,
"loss": 0.8332,
"step": 60500
},
{
"epoch": 3.03,
"grad_norm": 4.442228317260742,
"learning_rate": 7.924422110552763e-06,
"loss": 0.832,
"step": 60600
},
{
"epoch": 3.04,
"grad_norm": 5.503748416900635,
"learning_rate": 7.904321608040201e-06,
"loss": 0.844,
"step": 60700
},
{
"epoch": 3.04,
"grad_norm": 4.290737628936768,
"learning_rate": 7.88422110552764e-06,
"loss": 0.8593,
"step": 60800
},
{
"epoch": 3.04,
"grad_norm": 4.687915802001953,
"learning_rate": 7.864120603015076e-06,
"loss": 0.8506,
"step": 60900
},
{
"epoch": 3.05,
"grad_norm": 5.838376998901367,
"learning_rate": 7.844020100502514e-06,
"loss": 0.8297,
"step": 61000
},
{
"epoch": 3.06,
"grad_norm": 7.26198148727417,
"learning_rate": 7.823919597989952e-06,
"loss": 0.8463,
"step": 61100
},
{
"epoch": 3.06,
"grad_norm": 5.693443298339844,
"learning_rate": 7.803819095477388e-06,
"loss": 0.8405,
"step": 61200
},
{
"epoch": 3.06,
"grad_norm": 5.379219055175781,
"learning_rate": 7.783718592964826e-06,
"loss": 0.8431,
"step": 61300
},
{
"epoch": 3.07,
"grad_norm": 5.703670501708984,
"learning_rate": 7.763618090452262e-06,
"loss": 0.8484,
"step": 61400
},
{
"epoch": 3.08,
"grad_norm": 5.679072380065918,
"learning_rate": 7.7435175879397e-06,
"loss": 0.8272,
"step": 61500
},
{
"epoch": 3.08,
"grad_norm": 4.1109113693237305,
"learning_rate": 7.723417085427136e-06,
"loss": 0.83,
"step": 61600
},
{
"epoch": 3.08,
"grad_norm": 5.94366979598999,
"learning_rate": 7.703316582914574e-06,
"loss": 0.8102,
"step": 61700
},
{
"epoch": 3.09,
"grad_norm": 8.418631553649902,
"learning_rate": 7.68321608040201e-06,
"loss": 0.8396,
"step": 61800
},
{
"epoch": 3.1,
"grad_norm": 5.8859100341796875,
"learning_rate": 7.663115577889449e-06,
"loss": 0.8142,
"step": 61900
},
{
"epoch": 3.1,
"grad_norm": 5.267168045043945,
"learning_rate": 7.643015075376885e-06,
"loss": 0.8087,
"step": 62000
},
{
"epoch": 3.1,
"grad_norm": 5.58022403717041,
"learning_rate": 7.622914572864322e-06,
"loss": 0.8407,
"step": 62100
},
{
"epoch": 3.11,
"grad_norm": 5.999646186828613,
"learning_rate": 7.602814070351759e-06,
"loss": 0.8214,
"step": 62200
},
{
"epoch": 3.12,
"grad_norm": 4.449764251708984,
"learning_rate": 7.582713567839196e-06,
"loss": 0.8562,
"step": 62300
},
{
"epoch": 3.12,
"grad_norm": 6.2914137840271,
"learning_rate": 7.562613065326634e-06,
"loss": 0.8359,
"step": 62400
},
{
"epoch": 3.12,
"grad_norm": 5.262882709503174,
"learning_rate": 7.5425125628140705e-06,
"loss": 0.8368,
"step": 62500
},
{
"epoch": 3.13,
"grad_norm": 4.981582164764404,
"learning_rate": 7.5224120603015085e-06,
"loss": 0.8138,
"step": 62600
},
{
"epoch": 3.13,
"grad_norm": 5.330999374389648,
"learning_rate": 7.502311557788945e-06,
"loss": 0.8292,
"step": 62700
},
{
"epoch": 3.14,
"grad_norm": 5.176852226257324,
"learning_rate": 7.482211055276383e-06,
"loss": 0.8108,
"step": 62800
},
{
"epoch": 3.15,
"grad_norm": 8.819506645202637,
"learning_rate": 7.462110552763819e-06,
"loss": 0.854,
"step": 62900
},
{
"epoch": 3.15,
"grad_norm": 5.1937642097473145,
"learning_rate": 7.442010050251257e-06,
"loss": 0.7943,
"step": 63000
},
{
"epoch": 3.15,
"grad_norm": 4.399514675140381,
"learning_rate": 7.421909547738694e-06,
"loss": 0.7815,
"step": 63100
},
{
"epoch": 3.16,
"grad_norm": 5.575798034667969,
"learning_rate": 7.402010050251257e-06,
"loss": 0.8481,
"step": 63200
},
{
"epoch": 3.17,
"grad_norm": 5.097688674926758,
"learning_rate": 7.381909547738694e-06,
"loss": 0.8412,
"step": 63300
},
{
"epoch": 3.17,
"grad_norm": 4.748641490936279,
"learning_rate": 7.361809045226132e-06,
"loss": 0.8058,
"step": 63400
},
{
"epoch": 3.17,
"grad_norm": 7.151881694793701,
"learning_rate": 7.341708542713568e-06,
"loss": 0.7944,
"step": 63500
},
{
"epoch": 3.18,
"grad_norm": 4.642664909362793,
"learning_rate": 7.321608040201006e-06,
"loss": 0.8185,
"step": 63600
},
{
"epoch": 3.19,
"grad_norm": 5.354043483734131,
"learning_rate": 7.301507537688442e-06,
"loss": 0.7833,
"step": 63700
},
{
"epoch": 3.19,
"grad_norm": 5.168720245361328,
"learning_rate": 7.28140703517588e-06,
"loss": 0.7966,
"step": 63800
},
{
"epoch": 3.19,
"grad_norm": 4.343645095825195,
"learning_rate": 7.261306532663317e-06,
"loss": 0.7851,
"step": 63900
},
{
"epoch": 3.2,
"grad_norm": 4.882009506225586,
"learning_rate": 7.241206030150754e-06,
"loss": 0.8069,
"step": 64000
},
{
"epoch": 3.21,
"grad_norm": 4.954422473907471,
"learning_rate": 7.2211055276381915e-06,
"loss": 0.8029,
"step": 64100
},
{
"epoch": 3.21,
"grad_norm": 3.5329108238220215,
"learning_rate": 7.2010050251256295e-06,
"loss": 0.8262,
"step": 64200
},
{
"epoch": 3.21,
"grad_norm": 4.995691776275635,
"learning_rate": 7.180904522613066e-06,
"loss": 0.7899,
"step": 64300
},
{
"epoch": 3.22,
"grad_norm": 4.367786884307861,
"learning_rate": 7.160804020100504e-06,
"loss": 0.8014,
"step": 64400
},
{
"epoch": 3.23,
"grad_norm": 3.8841774463653564,
"learning_rate": 7.14070351758794e-06,
"loss": 0.8207,
"step": 64500
},
{
"epoch": 3.23,
"grad_norm": 4.118581295013428,
"learning_rate": 7.120603015075378e-06,
"loss": 0.8399,
"step": 64600
},
{
"epoch": 3.23,
"grad_norm": 5.320229530334473,
"learning_rate": 7.100502512562814e-06,
"loss": 0.8407,
"step": 64700
},
{
"epoch": 3.24,
"grad_norm": 4.324894428253174,
"learning_rate": 7.080402010050251e-06,
"loss": 0.7897,
"step": 64800
},
{
"epoch": 3.25,
"grad_norm": 6.917771816253662,
"learning_rate": 7.060301507537689e-06,
"loss": 0.8019,
"step": 64900
},
{
"epoch": 3.25,
"grad_norm": 7.098691463470459,
"learning_rate": 7.040201005025126e-06,
"loss": 0.8058,
"step": 65000
},
{
"epoch": 3.25,
"grad_norm": 5.166707992553711,
"learning_rate": 7.020100502512564e-06,
"loss": 0.7839,
"step": 65100
},
{
"epoch": 3.26,
"grad_norm": 5.616134166717529,
"learning_rate": 7e-06,
"loss": 0.7821,
"step": 65200
},
{
"epoch": 3.27,
"grad_norm": 7.216468334197998,
"learning_rate": 6.979899497487438e-06,
"loss": 0.7974,
"step": 65300
},
{
"epoch": 3.27,
"grad_norm": 7.116774082183838,
"learning_rate": 6.959798994974874e-06,
"loss": 0.8446,
"step": 65400
},
{
"epoch": 3.27,
"grad_norm": 6.275495529174805,
"learning_rate": 6.939698492462312e-06,
"loss": 0.8185,
"step": 65500
},
{
"epoch": 3.28,
"grad_norm": 4.431950092315674,
"learning_rate": 6.919597989949749e-06,
"loss": 0.8203,
"step": 65600
},
{
"epoch": 3.29,
"grad_norm": 6.8355302810668945,
"learning_rate": 6.899497487437186e-06,
"loss": 0.789,
"step": 65700
},
{
"epoch": 3.29,
"grad_norm": 4.217498779296875,
"learning_rate": 6.8793969849246235e-06,
"loss": 0.7909,
"step": 65800
},
{
"epoch": 3.29,
"grad_norm": 9.218932151794434,
"learning_rate": 6.859899497487438e-06,
"loss": 0.8387,
"step": 65900
},
{
"epoch": 3.3,
"grad_norm": 5.607006072998047,
"learning_rate": 6.8397989949748745e-06,
"loss": 0.787,
"step": 66000
},
{
"epoch": 3.31,
"grad_norm": 5.220907688140869,
"learning_rate": 6.8196984924623124e-06,
"loss": 0.8274,
"step": 66100
},
{
"epoch": 3.31,
"grad_norm": 4.795065402984619,
"learning_rate": 6.799597989949749e-06,
"loss": 0.7833,
"step": 66200
},
{
"epoch": 3.31,
"grad_norm": 5.653503894805908,
"learning_rate": 6.779497487437187e-06,
"loss": 0.7919,
"step": 66300
},
{
"epoch": 3.32,
"grad_norm": 5.359546184539795,
"learning_rate": 6.759396984924623e-06,
"loss": 0.8,
"step": 66400
},
{
"epoch": 3.33,
"grad_norm": 3.9278500080108643,
"learning_rate": 6.739296482412061e-06,
"loss": 0.816,
"step": 66500
},
{
"epoch": 3.33,
"grad_norm": 6.889082908630371,
"learning_rate": 6.719195979899498e-06,
"loss": 0.8559,
"step": 66600
},
{
"epoch": 3.33,
"grad_norm": 6.555418491363525,
"learning_rate": 6.699095477386935e-06,
"loss": 0.8084,
"step": 66700
},
{
"epoch": 3.34,
"grad_norm": 5.0188798904418945,
"learning_rate": 6.678994974874372e-06,
"loss": 0.8199,
"step": 66800
},
{
"epoch": 3.34,
"grad_norm": 5.341757297515869,
"learning_rate": 6.6588944723618094e-06,
"loss": 0.8301,
"step": 66900
},
{
"epoch": 3.35,
"grad_norm": 7.638245105743408,
"learning_rate": 6.6387939698492466e-06,
"loss": 0.8156,
"step": 67000
},
{
"epoch": 3.35,
"grad_norm": 4.004561424255371,
"learning_rate": 6.6186934673366845e-06,
"loss": 0.779,
"step": 67100
},
{
"epoch": 3.36,
"grad_norm": 5.197673320770264,
"learning_rate": 6.598592964824121e-06,
"loss": 0.8086,
"step": 67200
},
{
"epoch": 3.37,
"grad_norm": 5.757644176483154,
"learning_rate": 6.578492462311559e-06,
"loss": 0.8609,
"step": 67300
},
{
"epoch": 3.37,
"grad_norm": 3.9802143573760986,
"learning_rate": 6.558391959798995e-06,
"loss": 0.814,
"step": 67400
},
{
"epoch": 3.38,
"grad_norm": 4.6707892417907715,
"learning_rate": 6.538291457286433e-06,
"loss": 0.809,
"step": 67500
},
{
"epoch": 3.38,
"grad_norm": 6.902073860168457,
"learning_rate": 6.51819095477387e-06,
"loss": 0.7862,
"step": 67600
},
{
"epoch": 3.38,
"grad_norm": 4.793231010437012,
"learning_rate": 6.498090452261307e-06,
"loss": 0.8139,
"step": 67700
},
{
"epoch": 3.39,
"grad_norm": 10.16287612915039,
"learning_rate": 6.4779899497487444e-06,
"loss": 0.78,
"step": 67800
},
{
"epoch": 3.4,
"grad_norm": 5.308049201965332,
"learning_rate": 6.4578894472361816e-06,
"loss": 0.8235,
"step": 67900
},
{
"epoch": 3.4,
"grad_norm": 5.0899271965026855,
"learning_rate": 6.437788944723619e-06,
"loss": 0.8222,
"step": 68000
},
{
"epoch": 3.41,
"grad_norm": 5.174381732940674,
"learning_rate": 6.417688442211055e-06,
"loss": 0.7985,
"step": 68100
},
{
"epoch": 3.41,
"grad_norm": 4.858529090881348,
"learning_rate": 6.397587939698493e-06,
"loss": 0.8224,
"step": 68200
},
{
"epoch": 3.42,
"grad_norm": 8.091994285583496,
"learning_rate": 6.37748743718593e-06,
"loss": 0.8078,
"step": 68300
},
{
"epoch": 3.42,
"grad_norm": 5.269526481628418,
"learning_rate": 6.357386934673367e-06,
"loss": 0.8006,
"step": 68400
},
{
"epoch": 3.42,
"grad_norm": 5.161372184753418,
"learning_rate": 6.337286432160804e-06,
"loss": 0.814,
"step": 68500
},
{
"epoch": 3.43,
"grad_norm": 4.547713279724121,
"learning_rate": 6.3171859296482415e-06,
"loss": 0.8024,
"step": 68600
},
{
"epoch": 3.44,
"grad_norm": 5.171160697937012,
"learning_rate": 6.297085427135679e-06,
"loss": 0.7936,
"step": 68700
},
{
"epoch": 3.44,
"grad_norm": 6.406951904296875,
"learning_rate": 6.2769849246231166e-06,
"loss": 0.7627,
"step": 68800
},
{
"epoch": 3.44,
"grad_norm": 6.404531955718994,
"learning_rate": 6.256884422110553e-06,
"loss": 0.8081,
"step": 68900
},
{
"epoch": 3.45,
"grad_norm": 4.409193992614746,
"learning_rate": 6.236783919597991e-06,
"loss": 0.8284,
"step": 69000
},
{
"epoch": 3.46,
"grad_norm": 6.643680572509766,
"learning_rate": 6.216683417085427e-06,
"loss": 0.7908,
"step": 69100
},
{
"epoch": 3.46,
"grad_norm": 6.344150543212891,
"learning_rate": 6.196582914572865e-06,
"loss": 0.8028,
"step": 69200
},
{
"epoch": 3.46,
"grad_norm": 4.83349609375,
"learning_rate": 6.176482412060301e-06,
"loss": 0.7946,
"step": 69300
},
{
"epoch": 3.47,
"grad_norm": 4.103985786437988,
"learning_rate": 6.156381909547739e-06,
"loss": 0.8089,
"step": 69400
},
{
"epoch": 3.48,
"grad_norm": 4.681515693664551,
"learning_rate": 6.1362814070351764e-06,
"loss": 0.8104,
"step": 69500
},
{
"epoch": 3.48,
"grad_norm": 3.5046350955963135,
"learning_rate": 6.1161809045226136e-06,
"loss": 0.8024,
"step": 69600
},
{
"epoch": 3.48,
"grad_norm": 5.06920051574707,
"learning_rate": 6.096080402010051e-06,
"loss": 0.8043,
"step": 69700
},
{
"epoch": 3.49,
"grad_norm": 6.419402599334717,
"learning_rate": 6.075979899497489e-06,
"loss": 0.8218,
"step": 69800
},
{
"epoch": 3.5,
"grad_norm": 4.9620184898376465,
"learning_rate": 6.055879396984925e-06,
"loss": 0.7904,
"step": 69900
},
{
"epoch": 3.5,
"grad_norm": 6.6012349128723145,
"learning_rate": 6.035778894472363e-06,
"loss": 0.8021,
"step": 70000
},
{
"epoch": 3.5,
"eval_loss": 0.913910448551178,
"eval_runtime": 21.5858,
"eval_samples_per_second": 46.327,
"eval_steps_per_second": 5.791,
"step": 70000
},
{
"epoch": 3.5,
"grad_norm": 8.278429985046387,
"learning_rate": 6.015678391959799e-06,
"loss": 0.8255,
"step": 70100
},
{
"epoch": 3.51,
"grad_norm": 5.309919834136963,
"learning_rate": 5.995577889447237e-06,
"loss": 0.8048,
"step": 70200
},
{
"epoch": 3.52,
"grad_norm": 5.3151535987854,
"learning_rate": 5.9754773869346735e-06,
"loss": 0.796,
"step": 70300
},
{
"epoch": 3.52,
"grad_norm": 6.962722301483154,
"learning_rate": 5.9553768844221114e-06,
"loss": 0.8448,
"step": 70400
},
{
"epoch": 3.52,
"grad_norm": 6.564899444580078,
"learning_rate": 5.9352763819095486e-06,
"loss": 0.7782,
"step": 70500
},
{
"epoch": 3.53,
"grad_norm": 4.522327423095703,
"learning_rate": 5.915175879396985e-06,
"loss": 0.8306,
"step": 70600
},
{
"epoch": 3.54,
"grad_norm": 4.783290863037109,
"learning_rate": 5.895075376884423e-06,
"loss": 0.8448,
"step": 70700
},
{
"epoch": 3.54,
"grad_norm": 8.016778945922852,
"learning_rate": 5.874974874371859e-06,
"loss": 0.805,
"step": 70800
},
{
"epoch": 3.54,
"grad_norm": 6.962314605712891,
"learning_rate": 5.854874371859297e-06,
"loss": 0.7802,
"step": 70900
},
{
"epoch": 3.55,
"grad_norm": 4.056068420410156,
"learning_rate": 5.834773869346733e-06,
"loss": 0.8146,
"step": 71000
},
{
"epoch": 3.56,
"grad_norm": 4.548468589782715,
"learning_rate": 5.814673366834171e-06,
"loss": 0.7631,
"step": 71100
},
{
"epoch": 3.56,
"grad_norm": 4.344750881195068,
"learning_rate": 5.7945728643216085e-06,
"loss": 0.8032,
"step": 71200
},
{
"epoch": 3.56,
"grad_norm": 6.746843338012695,
"learning_rate": 5.774472361809046e-06,
"loss": 0.7622,
"step": 71300
},
{
"epoch": 3.57,
"grad_norm": 5.048290729522705,
"learning_rate": 5.754371859296483e-06,
"loss": 0.8133,
"step": 71400
},
{
"epoch": 3.58,
"grad_norm": 5.74857759475708,
"learning_rate": 5.734271356783921e-06,
"loss": 0.7834,
"step": 71500
},
{
"epoch": 3.58,
"grad_norm": 4.5277934074401855,
"learning_rate": 5.714170854271357e-06,
"loss": 0.789,
"step": 71600
},
{
"epoch": 3.58,
"grad_norm": 8.23270034790039,
"learning_rate": 5.694070351758795e-06,
"loss": 0.7613,
"step": 71700
},
{
"epoch": 3.59,
"grad_norm": 3.9528987407684326,
"learning_rate": 5.673969849246231e-06,
"loss": 0.8081,
"step": 71800
},
{
"epoch": 3.59,
"grad_norm": 5.704257965087891,
"learning_rate": 5.653869346733669e-06,
"loss": 0.8164,
"step": 71900
},
{
"epoch": 3.6,
"grad_norm": 4.676042079925537,
"learning_rate": 5.6337688442211055e-06,
"loss": 0.8202,
"step": 72000
},
{
"epoch": 3.6,
"grad_norm": 5.20451021194458,
"learning_rate": 5.6136683417085434e-06,
"loss": 0.7953,
"step": 72100
},
{
"epoch": 3.61,
"grad_norm": 7.501960277557373,
"learning_rate": 5.5935678391959806e-06,
"loss": 0.8168,
"step": 72200
},
{
"epoch": 3.62,
"grad_norm": 7.015203475952148,
"learning_rate": 5.573467336683418e-06,
"loss": 0.789,
"step": 72300
},
{
"epoch": 3.62,
"grad_norm": 4.428484916687012,
"learning_rate": 5.553366834170855e-06,
"loss": 0.8092,
"step": 72400
},
{
"epoch": 3.62,
"grad_norm": 4.477147102355957,
"learning_rate": 5.533266331658293e-06,
"loss": 0.7843,
"step": 72500
},
{
"epoch": 3.63,
"grad_norm": 5.1699748039245605,
"learning_rate": 5.513165829145729e-06,
"loss": 0.7996,
"step": 72600
},
{
"epoch": 3.63,
"grad_norm": 5.133453369140625,
"learning_rate": 5.493065326633167e-06,
"loss": 0.8233,
"step": 72700
},
{
"epoch": 3.64,
"grad_norm": 4.902942657470703,
"learning_rate": 5.472964824120603e-06,
"loss": 0.7586,
"step": 72800
},
{
"epoch": 3.65,
"grad_norm": 6.46637487411499,
"learning_rate": 5.4528643216080405e-06,
"loss": 0.7959,
"step": 72900
},
{
"epoch": 3.65,
"grad_norm": 7.144857406616211,
"learning_rate": 5.432763819095478e-06,
"loss": 0.8197,
"step": 73000
},
{
"epoch": 3.66,
"grad_norm": 6.084510326385498,
"learning_rate": 5.412663316582915e-06,
"loss": 0.8133,
"step": 73100
},
{
"epoch": 3.66,
"grad_norm": 5.132942199707031,
"learning_rate": 5.392562814070353e-06,
"loss": 0.7482,
"step": 73200
},
{
"epoch": 3.67,
"grad_norm": 6.69909143447876,
"learning_rate": 5.372462311557789e-06,
"loss": 0.7498,
"step": 73300
},
{
"epoch": 3.67,
"grad_norm": 7.99722146987915,
"learning_rate": 5.352361809045227e-06,
"loss": 0.7857,
"step": 73400
},
{
"epoch": 3.67,
"grad_norm": 7.380476951599121,
"learning_rate": 5.332261306532663e-06,
"loss": 0.8081,
"step": 73500
},
{
"epoch": 3.68,
"grad_norm": 6.441634178161621,
"learning_rate": 5.312160804020101e-06,
"loss": 0.7737,
"step": 73600
},
{
"epoch": 3.69,
"grad_norm": 5.027355194091797,
"learning_rate": 5.2920603015075375e-06,
"loss": 0.7991,
"step": 73700
},
{
"epoch": 3.69,
"grad_norm": 8.128876686096191,
"learning_rate": 5.2719597989949755e-06,
"loss": 0.8271,
"step": 73800
},
{
"epoch": 3.69,
"grad_norm": 4.09487247467041,
"learning_rate": 5.251859296482413e-06,
"loss": 0.775,
"step": 73900
},
{
"epoch": 3.7,
"grad_norm": 6.368048667907715,
"learning_rate": 5.231959798994976e-06,
"loss": 0.7872,
"step": 74000
},
{
"epoch": 3.71,
"grad_norm": 4.72104549407959,
"learning_rate": 5.211859296482412e-06,
"loss": 0.8057,
"step": 74100
},
{
"epoch": 3.71,
"grad_norm": 5.083056926727295,
"learning_rate": 5.19175879396985e-06,
"loss": 0.7839,
"step": 74200
},
{
"epoch": 3.71,
"grad_norm": 5.289855003356934,
"learning_rate": 5.171658291457286e-06,
"loss": 0.7829,
"step": 74300
},
{
"epoch": 3.72,
"grad_norm": 5.842662811279297,
"learning_rate": 5.151557788944724e-06,
"loss": 0.7782,
"step": 74400
},
{
"epoch": 3.73,
"grad_norm": 6.445068836212158,
"learning_rate": 5.131457286432161e-06,
"loss": 0.8335,
"step": 74500
},
{
"epoch": 3.73,
"grad_norm": 4.2318220138549805,
"learning_rate": 5.111356783919599e-06,
"loss": 0.7942,
"step": 74600
},
{
"epoch": 3.73,
"grad_norm": 8.975232124328613,
"learning_rate": 5.091256281407036e-06,
"loss": 0.8284,
"step": 74700
},
{
"epoch": 3.74,
"grad_norm": 4.482039451599121,
"learning_rate": 5.071155778894473e-06,
"loss": 0.8281,
"step": 74800
},
{
"epoch": 3.75,
"grad_norm": 4.330044269561768,
"learning_rate": 5.05105527638191e-06,
"loss": 0.7737,
"step": 74900
},
{
"epoch": 3.75,
"grad_norm": 4.636693000793457,
"learning_rate": 5.030954773869348e-06,
"loss": 0.7882,
"step": 75000
},
{
"epoch": 3.75,
"grad_norm": 4.175960540771484,
"learning_rate": 5.010854271356784e-06,
"loss": 0.7417,
"step": 75100
},
{
"epoch": 3.76,
"grad_norm": 4.081864833831787,
"learning_rate": 4.990753768844221e-06,
"loss": 0.7579,
"step": 75200
},
{
"epoch": 3.77,
"grad_norm": 4.608290672302246,
"learning_rate": 4.9706532663316585e-06,
"loss": 0.799,
"step": 75300
},
{
"epoch": 3.77,
"grad_norm": 4.851296901702881,
"learning_rate": 4.950552763819096e-06,
"loss": 0.7998,
"step": 75400
},
{
"epoch": 3.77,
"grad_norm": 4.3285112380981445,
"learning_rate": 4.930452261306533e-06,
"loss": 0.8093,
"step": 75500
},
{
"epoch": 3.78,
"grad_norm": 4.927236080169678,
"learning_rate": 4.910552763819096e-06,
"loss": 0.7793,
"step": 75600
},
{
"epoch": 3.79,
"grad_norm": 6.193936824798584,
"learning_rate": 4.890452261306533e-06,
"loss": 0.8072,
"step": 75700
},
{
"epoch": 3.79,
"grad_norm": 4.687440872192383,
"learning_rate": 4.87035175879397e-06,
"loss": 0.8,
"step": 75800
},
{
"epoch": 3.79,
"grad_norm": 4.473381519317627,
"learning_rate": 4.850251256281407e-06,
"loss": 0.8027,
"step": 75900
},
{
"epoch": 3.8,
"grad_norm": 4.676540374755859,
"learning_rate": 4.8301507537688445e-06,
"loss": 0.8029,
"step": 76000
},
{
"epoch": 3.81,
"grad_norm": 4.967388153076172,
"learning_rate": 4.810050251256282e-06,
"loss": 0.7539,
"step": 76100
},
{
"epoch": 3.81,
"grad_norm": 4.699183940887451,
"learning_rate": 4.789949748743719e-06,
"loss": 0.7651,
"step": 76200
},
{
"epoch": 3.81,
"grad_norm": 4.629420757293701,
"learning_rate": 4.769849246231156e-06,
"loss": 0.7803,
"step": 76300
},
{
"epoch": 3.82,
"grad_norm": 5.920188903808594,
"learning_rate": 4.749748743718594e-06,
"loss": 0.8017,
"step": 76400
},
{
"epoch": 3.83,
"grad_norm": 6.677817344665527,
"learning_rate": 4.729648241206031e-06,
"loss": 0.8216,
"step": 76500
},
{
"epoch": 3.83,
"grad_norm": 5.312260627746582,
"learning_rate": 4.709547738693468e-06,
"loss": 0.7827,
"step": 76600
},
{
"epoch": 3.83,
"grad_norm": 4.119052410125732,
"learning_rate": 4.689447236180905e-06,
"loss": 0.7483,
"step": 76700
},
{
"epoch": 3.84,
"grad_norm": 4.5976715087890625,
"learning_rate": 4.669346733668342e-06,
"loss": 0.7657,
"step": 76800
},
{
"epoch": 3.84,
"grad_norm": 5.721061706542969,
"learning_rate": 4.649246231155779e-06,
"loss": 0.7817,
"step": 76900
},
{
"epoch": 3.85,
"grad_norm": 7.369571208953857,
"learning_rate": 4.629145728643216e-06,
"loss": 0.7402,
"step": 77000
},
{
"epoch": 3.85,
"grad_norm": 5.615093231201172,
"learning_rate": 4.609045226130654e-06,
"loss": 0.811,
"step": 77100
},
{
"epoch": 3.86,
"grad_norm": 6.276815414428711,
"learning_rate": 4.588944723618091e-06,
"loss": 0.7909,
"step": 77200
},
{
"epoch": 3.87,
"grad_norm": 4.287708759307861,
"learning_rate": 4.568844221105528e-06,
"loss": 0.8012,
"step": 77300
},
{
"epoch": 3.87,
"grad_norm": 4.280378818511963,
"learning_rate": 4.548743718592965e-06,
"loss": 0.8205,
"step": 77400
},
{
"epoch": 3.88,
"grad_norm": 8.309846878051758,
"learning_rate": 4.528643216080402e-06,
"loss": 0.7785,
"step": 77500
},
{
"epoch": 3.88,
"grad_norm": 5.504384517669678,
"learning_rate": 4.508542713567839e-06,
"loss": 0.7678,
"step": 77600
},
{
"epoch": 3.88,
"grad_norm": 4.6738996505737305,
"learning_rate": 4.4884422110552765e-06,
"loss": 0.8207,
"step": 77700
},
{
"epoch": 3.89,
"grad_norm": 8.038127899169922,
"learning_rate": 4.468341708542714e-06,
"loss": 0.7788,
"step": 77800
},
{
"epoch": 3.9,
"grad_norm": 6.898759365081787,
"learning_rate": 4.448241206030151e-06,
"loss": 0.7575,
"step": 77900
},
{
"epoch": 3.9,
"grad_norm": 5.893388271331787,
"learning_rate": 4.428140703517588e-06,
"loss": 0.7842,
"step": 78000
},
{
"epoch": 3.91,
"grad_norm": 7.37433385848999,
"learning_rate": 4.408040201005026e-06,
"loss": 0.756,
"step": 78100
},
{
"epoch": 3.91,
"grad_norm": 6.226987838745117,
"learning_rate": 4.387939698492463e-06,
"loss": 0.7818,
"step": 78200
},
{
"epoch": 3.92,
"grad_norm": 6.20886754989624,
"learning_rate": 4.368040201005025e-06,
"loss": 0.8057,
"step": 78300
},
{
"epoch": 3.92,
"grad_norm": 3.9309849739074707,
"learning_rate": 4.3479396984924625e-06,
"loss": 0.8052,
"step": 78400
},
{
"epoch": 3.92,
"grad_norm": 4.972345352172852,
"learning_rate": 4.3278391959799e-06,
"loss": 0.7666,
"step": 78500
},
{
"epoch": 3.93,
"grad_norm": 8.730260848999023,
"learning_rate": 4.307738693467337e-06,
"loss": 0.7897,
"step": 78600
},
{
"epoch": 3.94,
"grad_norm": 6.734485626220703,
"learning_rate": 4.287638190954774e-06,
"loss": 0.7595,
"step": 78700
},
{
"epoch": 3.94,
"grad_norm": 6.456557750701904,
"learning_rate": 4.267537688442212e-06,
"loss": 0.7924,
"step": 78800
},
{
"epoch": 3.94,
"grad_norm": 4.421884059906006,
"learning_rate": 4.247437185929649e-06,
"loss": 0.7821,
"step": 78900
},
{
"epoch": 3.95,
"grad_norm": 7.825852394104004,
"learning_rate": 4.227336683417086e-06,
"loss": 0.7834,
"step": 79000
},
{
"epoch": 3.96,
"grad_norm": 6.445671081542969,
"learning_rate": 4.207236180904523e-06,
"loss": 0.7794,
"step": 79100
},
{
"epoch": 3.96,
"grad_norm": 3.7435953617095947,
"learning_rate": 4.18713567839196e-06,
"loss": 0.7218,
"step": 79200
},
{
"epoch": 3.96,
"grad_norm": 10.594905853271484,
"learning_rate": 4.1670351758793975e-06,
"loss": 0.7957,
"step": 79300
},
{
"epoch": 3.97,
"grad_norm": 7.166194438934326,
"learning_rate": 4.146934673366835e-06,
"loss": 0.7936,
"step": 79400
},
{
"epoch": 3.98,
"grad_norm": 4.773101329803467,
"learning_rate": 4.126834170854272e-06,
"loss": 0.7721,
"step": 79500
},
{
"epoch": 3.98,
"grad_norm": 5.979006767272949,
"learning_rate": 4.106733668341709e-06,
"loss": 0.7899,
"step": 79600
},
{
"epoch": 3.98,
"grad_norm": 6.46978235244751,
"learning_rate": 4.086633165829146e-06,
"loss": 0.7874,
"step": 79700
},
{
"epoch": 3.99,
"grad_norm": 5.1106977462768555,
"learning_rate": 4.066532663316583e-06,
"loss": 0.7644,
"step": 79800
},
{
"epoch": 4.0,
"grad_norm": 7.125823974609375,
"learning_rate": 4.0466331658291464e-06,
"loss": 0.792,
"step": 79900
},
{
"epoch": 4.0,
"grad_norm": 5.539035797119141,
"learning_rate": 4.026532663316583e-06,
"loss": 0.7779,
"step": 80000
},
{
"epoch": 4.0,
"eval_loss": 0.8846080303192139,
"eval_runtime": 21.6073,
"eval_samples_per_second": 46.281,
"eval_steps_per_second": 5.785,
"step": 80000
},
{
"epoch": 4.0,
"grad_norm": 5.7579193115234375,
"learning_rate": 4.00643216080402e-06,
"loss": 0.6947,
"step": 80100
},
{
"epoch": 4.01,
"grad_norm": 5.583180904388428,
"learning_rate": 3.986331658291458e-06,
"loss": 0.6614,
"step": 80200
},
{
"epoch": 4.01,
"grad_norm": 5.107233047485352,
"learning_rate": 3.966231155778895e-06,
"loss": 0.6936,
"step": 80300
},
{
"epoch": 4.02,
"grad_norm": 5.804276466369629,
"learning_rate": 3.946130653266332e-06,
"loss": 0.6946,
"step": 80400
},
{
"epoch": 4.03,
"grad_norm": 6.738204479217529,
"learning_rate": 3.926030150753769e-06,
"loss": 0.6681,
"step": 80500
},
{
"epoch": 4.03,
"grad_norm": 6.331192970275879,
"learning_rate": 3.905929648241206e-06,
"loss": 0.6839,
"step": 80600
},
{
"epoch": 4.04,
"grad_norm": 5.382104873657227,
"learning_rate": 3.8858291457286434e-06,
"loss": 0.6566,
"step": 80700
},
{
"epoch": 4.04,
"grad_norm": 6.394933223724365,
"learning_rate": 3.8657286432160806e-06,
"loss": 0.7378,
"step": 80800
},
{
"epoch": 4.04,
"grad_norm": 5.813870429992676,
"learning_rate": 3.845628140703518e-06,
"loss": 0.7112,
"step": 80900
},
{
"epoch": 4.05,
"grad_norm": 6.095046520233154,
"learning_rate": 3.825527638190955e-06,
"loss": 0.6885,
"step": 81000
},
{
"epoch": 4.05,
"grad_norm": 6.212576866149902,
"learning_rate": 3.8054271356783924e-06,
"loss": 0.6658,
"step": 81100
},
{
"epoch": 4.06,
"grad_norm": 4.426722526550293,
"learning_rate": 3.7853266331658295e-06,
"loss": 0.6915,
"step": 81200
},
{
"epoch": 4.07,
"grad_norm": 7.474303722381592,
"learning_rate": 3.7652261306532666e-06,
"loss": 0.6486,
"step": 81300
},
{
"epoch": 4.07,
"grad_norm": 7.347512245178223,
"learning_rate": 3.7451256281407038e-06,
"loss": 0.7078,
"step": 81400
},
{
"epoch": 4.08,
"grad_norm": 9.426233291625977,
"learning_rate": 3.7250251256281413e-06,
"loss": 0.6951,
"step": 81500
},
{
"epoch": 4.08,
"grad_norm": 5.577968597412109,
"learning_rate": 3.7049246231155784e-06,
"loss": 0.6905,
"step": 81600
},
{
"epoch": 4.08,
"grad_norm": 6.477217197418213,
"learning_rate": 3.6848241206030156e-06,
"loss": 0.663,
"step": 81700
},
{
"epoch": 4.09,
"grad_norm": 6.228948593139648,
"learning_rate": 3.6647236180904527e-06,
"loss": 0.6677,
"step": 81800
},
{
"epoch": 4.09,
"grad_norm": 5.777594089508057,
"learning_rate": 3.64462311557789e-06,
"loss": 0.6905,
"step": 81900
},
{
"epoch": 4.1,
"grad_norm": 6.7552080154418945,
"learning_rate": 3.624522613065327e-06,
"loss": 0.7086,
"step": 82000
},
{
"epoch": 4.11,
"grad_norm": 5.3912553787231445,
"learning_rate": 3.6044221105527645e-06,
"loss": 0.6833,
"step": 82100
},
{
"epoch": 4.11,
"grad_norm": 7.366456508636475,
"learning_rate": 3.5843216080402016e-06,
"loss": 0.6618,
"step": 82200
},
{
"epoch": 4.12,
"grad_norm": 4.593729019165039,
"learning_rate": 3.5642211055276383e-06,
"loss": 0.6397,
"step": 82300
},
{
"epoch": 4.12,
"grad_norm": 6.743685722351074,
"learning_rate": 3.5441206030150755e-06,
"loss": 0.7233,
"step": 82400
},
{
"epoch": 4.12,
"grad_norm": 6.125808238983154,
"learning_rate": 3.5240201005025126e-06,
"loss": 0.6804,
"step": 82500
},
{
"epoch": 4.13,
"grad_norm": 7.0340752601623535,
"learning_rate": 3.5039195979899497e-06,
"loss": 0.699,
"step": 82600
},
{
"epoch": 4.13,
"grad_norm": 7.293619632720947,
"learning_rate": 3.4838190954773873e-06,
"loss": 0.6572,
"step": 82700
},
{
"epoch": 4.14,
"grad_norm": 6.3135552406311035,
"learning_rate": 3.4637185929648244e-06,
"loss": 0.6364,
"step": 82800
},
{
"epoch": 4.14,
"grad_norm": 5.138033390045166,
"learning_rate": 3.4436180904522615e-06,
"loss": 0.6815,
"step": 82900
},
{
"epoch": 4.15,
"grad_norm": 6.240560054779053,
"learning_rate": 3.4235175879396986e-06,
"loss": 0.6919,
"step": 83000
},
{
"epoch": 4.16,
"grad_norm": 4.19957971572876,
"learning_rate": 3.4034170854271358e-06,
"loss": 0.6845,
"step": 83100
},
{
"epoch": 4.16,
"grad_norm": 6.340314865112305,
"learning_rate": 3.383316582914573e-06,
"loss": 0.653,
"step": 83200
},
{
"epoch": 4.17,
"grad_norm": 3.309894323348999,
"learning_rate": 3.3632160804020104e-06,
"loss": 0.6612,
"step": 83300
},
{
"epoch": 4.17,
"grad_norm": 5.189826011657715,
"learning_rate": 3.3431155778894476e-06,
"loss": 0.6871,
"step": 83400
},
{
"epoch": 4.17,
"grad_norm": 6.599611759185791,
"learning_rate": 3.3230150753768847e-06,
"loss": 0.6743,
"step": 83500
},
{
"epoch": 4.18,
"grad_norm": 26.47356414794922,
"learning_rate": 3.302914572864322e-06,
"loss": 0.6312,
"step": 83600
},
{
"epoch": 4.18,
"grad_norm": 8.280220985412598,
"learning_rate": 3.282814070351759e-06,
"loss": 0.6276,
"step": 83700
},
{
"epoch": 4.19,
"grad_norm": 7.8088555335998535,
"learning_rate": 3.2627135678391965e-06,
"loss": 0.6514,
"step": 83800
},
{
"epoch": 4.2,
"grad_norm": 5.11159086227417,
"learning_rate": 3.2426130653266336e-06,
"loss": 0.6262,
"step": 83900
},
{
"epoch": 4.2,
"grad_norm": 6.656592845916748,
"learning_rate": 3.2225125628140708e-06,
"loss": 0.6889,
"step": 84000
},
{
"epoch": 4.21,
"grad_norm": 7.140279769897461,
"learning_rate": 3.202412060301508e-06,
"loss": 0.6435,
"step": 84100
},
{
"epoch": 4.21,
"grad_norm": 6.478577613830566,
"learning_rate": 3.182311557788945e-06,
"loss": 0.6593,
"step": 84200
},
{
"epoch": 4.21,
"grad_norm": 6.854846477508545,
"learning_rate": 3.1622110552763826e-06,
"loss": 0.7097,
"step": 84300
},
{
"epoch": 4.22,
"grad_norm": 5.070549488067627,
"learning_rate": 3.1421105527638197e-06,
"loss": 0.6736,
"step": 84400
},
{
"epoch": 4.22,
"grad_norm": 7.519010543823242,
"learning_rate": 3.122010050251257e-06,
"loss": 0.6518,
"step": 84500
},
{
"epoch": 4.23,
"grad_norm": 6.662156105041504,
"learning_rate": 3.1019095477386935e-06,
"loss": 0.675,
"step": 84600
},
{
"epoch": 4.24,
"grad_norm": 7.687413215637207,
"learning_rate": 3.0818090452261307e-06,
"loss": 0.6477,
"step": 84700
},
{
"epoch": 4.24,
"grad_norm": 5.934724807739258,
"learning_rate": 3.0617085427135678e-06,
"loss": 0.6492,
"step": 84800
},
{
"epoch": 4.25,
"grad_norm": 9.457836151123047,
"learning_rate": 3.041608040201005e-06,
"loss": 0.633,
"step": 84900
},
{
"epoch": 4.25,
"grad_norm": 6.666748523712158,
"learning_rate": 3.0215075376884425e-06,
"loss": 0.6693,
"step": 85000
},
{
"epoch": 4.25,
"grad_norm": 6.439404487609863,
"learning_rate": 3.0014070351758796e-06,
"loss": 0.6643,
"step": 85100
},
{
"epoch": 4.26,
"grad_norm": 7.257474422454834,
"learning_rate": 2.9813065326633167e-06,
"loss": 0.6623,
"step": 85200
},
{
"epoch": 4.26,
"grad_norm": 4.707270622253418,
"learning_rate": 2.961206030150754e-06,
"loss": 0.6471,
"step": 85300
},
{
"epoch": 4.27,
"grad_norm": 5.7160844802856445,
"learning_rate": 2.941105527638191e-06,
"loss": 0.683,
"step": 85400
},
{
"epoch": 4.28,
"grad_norm": 6.038240432739258,
"learning_rate": 2.9210050251256285e-06,
"loss": 0.6742,
"step": 85500
},
{
"epoch": 4.28,
"grad_norm": 6.851832866668701,
"learning_rate": 2.9009045226130656e-06,
"loss": 0.6748,
"step": 85600
},
{
"epoch": 4.29,
"grad_norm": 5.691901683807373,
"learning_rate": 2.8808040201005028e-06,
"loss": 0.6703,
"step": 85700
},
{
"epoch": 4.29,
"grad_norm": 6.378291130065918,
"learning_rate": 2.86070351758794e-06,
"loss": 0.6487,
"step": 85800
},
{
"epoch": 4.29,
"grad_norm": 4.439263343811035,
"learning_rate": 2.840603015075377e-06,
"loss": 0.6598,
"step": 85900
},
{
"epoch": 4.3,
"grad_norm": 6.466790199279785,
"learning_rate": 2.8205025125628146e-06,
"loss": 0.6914,
"step": 86000
},
{
"epoch": 4.3,
"grad_norm": 6.0331902503967285,
"learning_rate": 2.8004020100502517e-06,
"loss": 0.6929,
"step": 86100
},
{
"epoch": 4.31,
"grad_norm": 4.750064849853516,
"learning_rate": 2.780301507537689e-06,
"loss": 0.6715,
"step": 86200
},
{
"epoch": 4.32,
"grad_norm": 8.289958953857422,
"learning_rate": 2.760201005025126e-06,
"loss": 0.6975,
"step": 86300
},
{
"epoch": 4.32,
"grad_norm": 10.746756553649902,
"learning_rate": 2.740100502512563e-06,
"loss": 0.6454,
"step": 86400
},
{
"epoch": 4.33,
"grad_norm": 6.792548656463623,
"learning_rate": 2.720201005025126e-06,
"loss": 0.7056,
"step": 86500
},
{
"epoch": 4.33,
"grad_norm": 5.030031204223633,
"learning_rate": 2.700100502512563e-06,
"loss": 0.6711,
"step": 86600
},
{
"epoch": 4.33,
"grad_norm": 4.626148223876953,
"learning_rate": 2.680201005025126e-06,
"loss": 0.676,
"step": 86700
},
{
"epoch": 4.34,
"grad_norm": 8.56241512298584,
"learning_rate": 2.660100502512563e-06,
"loss": 0.6548,
"step": 86800
},
{
"epoch": 4.34,
"grad_norm": 9.747623443603516,
"learning_rate": 2.64e-06,
"loss": 0.6883,
"step": 86900
},
{
"epoch": 4.35,
"grad_norm": 8.002108573913574,
"learning_rate": 2.6198994974874377e-06,
"loss": 0.7166,
"step": 87000
},
{
"epoch": 4.36,
"grad_norm": 6.09249210357666,
"learning_rate": 2.599798994974875e-06,
"loss": 0.6841,
"step": 87100
},
{
"epoch": 4.36,
"grad_norm": 5.512220859527588,
"learning_rate": 2.579698492462312e-06,
"loss": 0.6816,
"step": 87200
},
{
"epoch": 4.37,
"grad_norm": 5.139577388763428,
"learning_rate": 2.559597989949749e-06,
"loss": 0.6475,
"step": 87300
},
{
"epoch": 4.37,
"grad_norm": 11.360005378723145,
"learning_rate": 2.539497487437186e-06,
"loss": 0.7434,
"step": 87400
},
{
"epoch": 4.38,
"grad_norm": 5.06545877456665,
"learning_rate": 2.5193969849246237e-06,
"loss": 0.6626,
"step": 87500
},
{
"epoch": 4.38,
"grad_norm": 4.432734966278076,
"learning_rate": 2.4992964824120604e-06,
"loss": 0.6357,
"step": 87600
},
{
"epoch": 4.38,
"grad_norm": 7.90862512588501,
"learning_rate": 2.4791959798994976e-06,
"loss": 0.6039,
"step": 87700
},
{
"epoch": 4.39,
"grad_norm": 4.959092617034912,
"learning_rate": 2.459095477386935e-06,
"loss": 0.6699,
"step": 87800
},
{
"epoch": 4.39,
"grad_norm": 7.495928764343262,
"learning_rate": 2.4389949748743723e-06,
"loss": 0.6648,
"step": 87900
},
{
"epoch": 4.4,
"grad_norm": 10.80557918548584,
"learning_rate": 2.4188944723618094e-06,
"loss": 0.6532,
"step": 88000
},
{
"epoch": 4.41,
"grad_norm": 7.1374006271362305,
"learning_rate": 2.3987939698492465e-06,
"loss": 0.6903,
"step": 88100
},
{
"epoch": 4.41,
"grad_norm": 12.275821685791016,
"learning_rate": 2.3786934673366836e-06,
"loss": 0.6433,
"step": 88200
},
{
"epoch": 4.42,
"grad_norm": 8.747936248779297,
"learning_rate": 2.3585929648241208e-06,
"loss": 0.62,
"step": 88300
},
{
"epoch": 4.42,
"grad_norm": 5.3552985191345215,
"learning_rate": 2.338492462311558e-06,
"loss": 0.6525,
"step": 88400
},
{
"epoch": 4.42,
"grad_norm": 7.049367427825928,
"learning_rate": 2.318391959798995e-06,
"loss": 0.6742,
"step": 88500
},
{
"epoch": 4.43,
"grad_norm": 8.841930389404297,
"learning_rate": 2.298291457286432e-06,
"loss": 0.6806,
"step": 88600
},
{
"epoch": 4.43,
"grad_norm": 4.58371114730835,
"learning_rate": 2.2781909547738697e-06,
"loss": 0.6469,
"step": 88700
},
{
"epoch": 4.44,
"grad_norm": 8.08278751373291,
"learning_rate": 2.258090452261307e-06,
"loss": 0.6918,
"step": 88800
},
{
"epoch": 4.45,
"grad_norm": 5.989361763000488,
"learning_rate": 2.237989949748744e-06,
"loss": 0.7048,
"step": 88900
},
{
"epoch": 4.45,
"grad_norm": 8.200750350952148,
"learning_rate": 2.217889447236181e-06,
"loss": 0.6222,
"step": 89000
},
{
"epoch": 4.46,
"grad_norm": 7.658218860626221,
"learning_rate": 2.197788944723618e-06,
"loss": 0.653,
"step": 89100
},
{
"epoch": 4.46,
"grad_norm": 6.744418621063232,
"learning_rate": 2.177889447236181e-06,
"loss": 0.6698,
"step": 89200
},
{
"epoch": 4.46,
"grad_norm": 4.423871994018555,
"learning_rate": 2.157788944723618e-06,
"loss": 0.6665,
"step": 89300
},
{
"epoch": 4.47,
"grad_norm": 7.368816375732422,
"learning_rate": 2.1376884422110557e-06,
"loss": 0.6766,
"step": 89400
},
{
"epoch": 4.47,
"grad_norm": 4.649584770202637,
"learning_rate": 2.117587939698493e-06,
"loss": 0.6464,
"step": 89500
},
{
"epoch": 4.48,
"grad_norm": 7.77773904800415,
"learning_rate": 2.09748743718593e-06,
"loss": 0.6721,
"step": 89600
},
{
"epoch": 4.49,
"grad_norm": 6.5589280128479,
"learning_rate": 2.0773869346733667e-06,
"loss": 0.6817,
"step": 89700
},
{
"epoch": 4.49,
"grad_norm": 10.153287887573242,
"learning_rate": 2.0572864321608042e-06,
"loss": 0.645,
"step": 89800
},
{
"epoch": 4.5,
"grad_norm": 8.705924987792969,
"learning_rate": 2.0371859296482414e-06,
"loss": 0.707,
"step": 89900
},
{
"epoch": 4.5,
"grad_norm": 5.7329511642456055,
"learning_rate": 2.0170854271356785e-06,
"loss": 0.6834,
"step": 90000
},
{
"epoch": 4.5,
"eval_loss": 0.9503761529922485,
"eval_runtime": 21.641,
"eval_samples_per_second": 46.209,
"eval_steps_per_second": 5.776,
"step": 90000
},
{
"epoch": 4.5,
"grad_norm": 6.902284622192383,
"learning_rate": 1.9969849246231156e-06,
"loss": 0.6237,
"step": 90100
},
{
"epoch": 4.51,
"grad_norm": 5.6710710525512695,
"learning_rate": 1.9768844221105527e-06,
"loss": 0.6638,
"step": 90200
},
{
"epoch": 4.51,
"grad_norm": 6.364370346069336,
"learning_rate": 1.9567839195979903e-06,
"loss": 0.6537,
"step": 90300
},
{
"epoch": 4.52,
"grad_norm": 5.928137302398682,
"learning_rate": 1.9366834170854274e-06,
"loss": 0.6266,
"step": 90400
},
{
"epoch": 4.53,
"grad_norm": 8.740313529968262,
"learning_rate": 1.9165829145728645e-06,
"loss": 0.6198,
"step": 90500
},
{
"epoch": 4.53,
"grad_norm": 8.339399337768555,
"learning_rate": 1.8964824120603017e-06,
"loss": 0.6482,
"step": 90600
},
{
"epoch": 4.54,
"grad_norm": 8.13129997253418,
"learning_rate": 1.876381909547739e-06,
"loss": 0.6521,
"step": 90700
},
{
"epoch": 4.54,
"grad_norm": 10.06900405883789,
"learning_rate": 1.856281407035176e-06,
"loss": 0.6472,
"step": 90800
},
{
"epoch": 4.54,
"grad_norm": 6.953003406524658,
"learning_rate": 1.836180904522613e-06,
"loss": 0.6185,
"step": 90900
},
{
"epoch": 4.55,
"grad_norm": 7.572219371795654,
"learning_rate": 1.8160804020100504e-06,
"loss": 0.664,
"step": 91000
},
{
"epoch": 4.55,
"grad_norm": 8.318469047546387,
"learning_rate": 1.7959798994974875e-06,
"loss": 0.6442,
"step": 91100
},
{
"epoch": 4.56,
"grad_norm": 6.608754634857178,
"learning_rate": 1.7758793969849246e-06,
"loss": 0.6398,
"step": 91200
},
{
"epoch": 4.56,
"grad_norm": 7.397676467895508,
"learning_rate": 1.755778894472362e-06,
"loss": 0.6689,
"step": 91300
},
{
"epoch": 4.57,
"grad_norm": 10.482325553894043,
"learning_rate": 1.7356783919597991e-06,
"loss": 0.6792,
"step": 91400
},
{
"epoch": 4.58,
"grad_norm": 5.926417827606201,
"learning_rate": 1.7155778894472364e-06,
"loss": 0.6774,
"step": 91500
},
{
"epoch": 4.58,
"grad_norm": 8.223274230957031,
"learning_rate": 1.6954773869346736e-06,
"loss": 0.6528,
"step": 91600
},
{
"epoch": 4.58,
"grad_norm": 7.564822196960449,
"learning_rate": 1.6753768844221107e-06,
"loss": 0.6224,
"step": 91700
},
{
"epoch": 4.59,
"grad_norm": 6.845765113830566,
"learning_rate": 1.655276381909548e-06,
"loss": 0.6984,
"step": 91800
},
{
"epoch": 4.59,
"grad_norm": 6.044042587280273,
"learning_rate": 1.6353768844221107e-06,
"loss": 0.6211,
"step": 91900
},
{
"epoch": 4.6,
"grad_norm": 12.825979232788086,
"learning_rate": 1.615276381909548e-06,
"loss": 0.6851,
"step": 92000
},
{
"epoch": 4.61,
"grad_norm": 6.73763370513916,
"learning_rate": 1.5951758793969851e-06,
"loss": 0.6161,
"step": 92100
},
{
"epoch": 4.61,
"grad_norm": 6.827399730682373,
"learning_rate": 1.5750753768844223e-06,
"loss": 0.6525,
"step": 92200
},
{
"epoch": 4.62,
"grad_norm": 6.6664228439331055,
"learning_rate": 1.5549748743718594e-06,
"loss": 0.6617,
"step": 92300
},
{
"epoch": 4.62,
"grad_norm": 9.772034645080566,
"learning_rate": 1.5348743718592965e-06,
"loss": 0.6687,
"step": 92400
},
{
"epoch": 4.62,
"grad_norm": 6.625182151794434,
"learning_rate": 1.5147738693467336e-06,
"loss": 0.6545,
"step": 92500
},
{
"epoch": 4.63,
"grad_norm": 10.207441329956055,
"learning_rate": 1.494673366834171e-06,
"loss": 0.6332,
"step": 92600
},
{
"epoch": 4.63,
"grad_norm": 9.929265975952148,
"learning_rate": 1.474572864321608e-06,
"loss": 0.6391,
"step": 92700
},
{
"epoch": 4.64,
"grad_norm": 6.050763130187988,
"learning_rate": 1.4544723618090452e-06,
"loss": 0.6708,
"step": 92800
},
{
"epoch": 4.64,
"grad_norm": 5.504277229309082,
"learning_rate": 1.4343718592964826e-06,
"loss": 0.6578,
"step": 92900
},
{
"epoch": 4.65,
"grad_norm": 7.113737106323242,
"learning_rate": 1.4142713567839197e-06,
"loss": 0.6419,
"step": 93000
},
{
"epoch": 4.66,
"grad_norm": 7.181005001068115,
"learning_rate": 1.394170854271357e-06,
"loss": 0.6298,
"step": 93100
},
{
"epoch": 4.66,
"grad_norm": 8.930741310119629,
"learning_rate": 1.3740703517587942e-06,
"loss": 0.6734,
"step": 93200
},
{
"epoch": 4.67,
"grad_norm": 6.288244724273682,
"learning_rate": 1.3539698492462313e-06,
"loss": 0.6307,
"step": 93300
},
{
"epoch": 4.67,
"grad_norm": 6.91972017288208,
"learning_rate": 1.3338693467336686e-06,
"loss": 0.676,
"step": 93400
},
{
"epoch": 4.67,
"grad_norm": 8.017012596130371,
"learning_rate": 1.3137688442211055e-06,
"loss": 0.6157,
"step": 93500
},
{
"epoch": 4.68,
"grad_norm": 4.738548755645752,
"learning_rate": 1.2936683417085427e-06,
"loss": 0.679,
"step": 93600
},
{
"epoch": 4.69,
"grad_norm": 6.201863765716553,
"learning_rate": 1.27356783919598e-06,
"loss": 0.6542,
"step": 93700
},
{
"epoch": 4.69,
"grad_norm": 7.595000267028809,
"learning_rate": 1.2534673366834171e-06,
"loss": 0.6659,
"step": 93800
},
{
"epoch": 4.7,
"grad_norm": 5.57780647277832,
"learning_rate": 1.2333668341708543e-06,
"loss": 0.6381,
"step": 93900
},
{
"epoch": 4.7,
"grad_norm": 8.426780700683594,
"learning_rate": 1.2132663316582916e-06,
"loss": 0.6705,
"step": 94000
},
{
"epoch": 4.71,
"grad_norm": 7.012176990509033,
"learning_rate": 1.1931658291457287e-06,
"loss": 0.6874,
"step": 94100
},
{
"epoch": 4.71,
"grad_norm": 7.747401237487793,
"learning_rate": 1.173065326633166e-06,
"loss": 0.6317,
"step": 94200
},
{
"epoch": 4.71,
"grad_norm": 4.817531108856201,
"learning_rate": 1.1529648241206032e-06,
"loss": 0.6083,
"step": 94300
},
{
"epoch": 4.72,
"grad_norm": 6.916783332824707,
"learning_rate": 1.1328643216080403e-06,
"loss": 0.6619,
"step": 94400
},
{
"epoch": 4.72,
"grad_norm": 7.570366382598877,
"learning_rate": 1.1127638190954775e-06,
"loss": 0.6471,
"step": 94500
},
{
"epoch": 4.73,
"grad_norm": 8.70361328125,
"learning_rate": 1.0926633165829146e-06,
"loss": 0.6483,
"step": 94600
},
{
"epoch": 4.74,
"grad_norm": 9.341569900512695,
"learning_rate": 1.072562814070352e-06,
"loss": 0.6194,
"step": 94700
},
{
"epoch": 4.74,
"grad_norm": 4.283209800720215,
"learning_rate": 1.052462311557789e-06,
"loss": 0.6111,
"step": 94800
},
{
"epoch": 4.75,
"grad_norm": 8.134038925170898,
"learning_rate": 1.0323618090452262e-06,
"loss": 0.632,
"step": 94900
},
{
"epoch": 4.75,
"grad_norm": 8.605172157287598,
"learning_rate": 1.0122613065326633e-06,
"loss": 0.6341,
"step": 95000
},
{
"epoch": 4.75,
"grad_norm": 8.067020416259766,
"learning_rate": 9.921608040201006e-07,
"loss": 0.6694,
"step": 95100
},
{
"epoch": 4.76,
"grad_norm": 6.967876434326172,
"learning_rate": 9.720603015075378e-07,
"loss": 0.648,
"step": 95200
},
{
"epoch": 4.76,
"grad_norm": 8.443940162658691,
"learning_rate": 9.51959798994975e-07,
"loss": 0.6174,
"step": 95300
},
{
"epoch": 4.77,
"grad_norm": 8.791583061218262,
"learning_rate": 9.318592964824122e-07,
"loss": 0.6463,
"step": 95400
},
{
"epoch": 4.78,
"grad_norm": 8.055484771728516,
"learning_rate": 9.117587939698493e-07,
"loss": 0.5966,
"step": 95500
},
{
"epoch": 4.78,
"grad_norm": 5.009509563446045,
"learning_rate": 8.916582914572865e-07,
"loss": 0.6147,
"step": 95600
},
{
"epoch": 4.79,
"grad_norm": 5.755350589752197,
"learning_rate": 8.715577889447237e-07,
"loss": 0.6101,
"step": 95700
},
{
"epoch": 4.79,
"grad_norm": 8.774045944213867,
"learning_rate": 8.514572864321608e-07,
"loss": 0.6332,
"step": 95800
},
{
"epoch": 4.79,
"grad_norm": 6.463279724121094,
"learning_rate": 8.315577889447237e-07,
"loss": 0.6705,
"step": 95900
},
{
"epoch": 4.8,
"grad_norm": 5.299009323120117,
"learning_rate": 8.114572864321608e-07,
"loss": 0.6605,
"step": 96000
},
{
"epoch": 4.8,
"grad_norm": 6.5152130126953125,
"learning_rate": 7.91356783919598e-07,
"loss": 0.6456,
"step": 96100
},
{
"epoch": 4.81,
"grad_norm": 8.499478340148926,
"learning_rate": 7.712562814070353e-07,
"loss": 0.6454,
"step": 96200
},
{
"epoch": 4.81,
"grad_norm": 8.317819595336914,
"learning_rate": 7.511557788944725e-07,
"loss": 0.5961,
"step": 96300
},
{
"epoch": 4.82,
"grad_norm": 7.257504940032959,
"learning_rate": 7.310552763819095e-07,
"loss": 0.614,
"step": 96400
},
{
"epoch": 4.83,
"grad_norm": 3.862578868865967,
"learning_rate": 7.109547738693468e-07,
"loss": 0.6388,
"step": 96500
},
{
"epoch": 4.83,
"grad_norm": 8.748353958129883,
"learning_rate": 6.90854271356784e-07,
"loss": 0.6222,
"step": 96600
},
{
"epoch": 4.83,
"grad_norm": 8.883009910583496,
"learning_rate": 6.707537688442211e-07,
"loss": 0.639,
"step": 96700
},
{
"epoch": 4.84,
"grad_norm": 7.332880973815918,
"learning_rate": 6.506532663316584e-07,
"loss": 0.6341,
"step": 96800
},
{
"epoch": 4.84,
"grad_norm": 7.421239852905273,
"learning_rate": 6.305527638190956e-07,
"loss": 0.6378,
"step": 96900
},
{
"epoch": 4.85,
"grad_norm": 6.633522033691406,
"learning_rate": 6.104522613065327e-07,
"loss": 0.6587,
"step": 97000
},
{
"epoch": 4.86,
"grad_norm": 6.347668170928955,
"learning_rate": 5.903517587939699e-07,
"loss": 0.6355,
"step": 97100
},
{
"epoch": 4.86,
"grad_norm": 5.266615390777588,
"learning_rate": 5.702512562814071e-07,
"loss": 0.5976,
"step": 97200
},
{
"epoch": 4.87,
"grad_norm": 5.0562286376953125,
"learning_rate": 5.501507537688443e-07,
"loss": 0.6426,
"step": 97300
},
{
"epoch": 4.87,
"grad_norm": 9.852864265441895,
"learning_rate": 5.300502512562814e-07,
"loss": 0.6434,
"step": 97400
},
{
"epoch": 4.88,
"grad_norm": 5.227302551269531,
"learning_rate": 5.099497487437187e-07,
"loss": 0.674,
"step": 97500
},
{
"epoch": 4.88,
"grad_norm": 7.586268424987793,
"learning_rate": 4.900502512562814e-07,
"loss": 0.6826,
"step": 97600
},
{
"epoch": 4.88,
"grad_norm": 7.512186527252197,
"learning_rate": 4.699497487437187e-07,
"loss": 0.6428,
"step": 97700
},
{
"epoch": 4.89,
"grad_norm": 8.383907318115234,
"learning_rate": 4.498492462311558e-07,
"loss": 0.6215,
"step": 97800
},
{
"epoch": 4.89,
"grad_norm": 6.214056968688965,
"learning_rate": 4.29748743718593e-07,
"loss": 0.6066,
"step": 97900
},
{
"epoch": 4.9,
"grad_norm": 8.587347030639648,
"learning_rate": 4.096482412060302e-07,
"loss": 0.6213,
"step": 98000
},
{
"epoch": 4.91,
"grad_norm": 14.060787200927734,
"learning_rate": 3.8954773869346735e-07,
"loss": 0.6151,
"step": 98100
},
{
"epoch": 4.91,
"grad_norm": 11.65833568572998,
"learning_rate": 3.694472361809046e-07,
"loss": 0.6226,
"step": 98200
},
{
"epoch": 4.92,
"grad_norm": 5.729846477508545,
"learning_rate": 3.4934673366834176e-07,
"loss": 0.6265,
"step": 98300
},
{
"epoch": 4.92,
"grad_norm": 5.596776485443115,
"learning_rate": 3.292462311557789e-07,
"loss": 0.6048,
"step": 98400
},
{
"epoch": 4.92,
"grad_norm": 5.834877967834473,
"learning_rate": 3.091457286432161e-07,
"loss": 0.6358,
"step": 98500
},
{
"epoch": 4.93,
"grad_norm": 7.830298900604248,
"learning_rate": 2.890452261306533e-07,
"loss": 0.6381,
"step": 98600
},
{
"epoch": 4.94,
"grad_norm": 7.147890567779541,
"learning_rate": 2.689447236180905e-07,
"loss": 0.6428,
"step": 98700
},
{
"epoch": 4.94,
"grad_norm": 5.18765926361084,
"learning_rate": 2.4884422110552766e-07,
"loss": 0.6098,
"step": 98800
},
{
"epoch": 4.95,
"grad_norm": 7.276676654815674,
"learning_rate": 2.2874371859296484e-07,
"loss": 0.6329,
"step": 98900
},
{
"epoch": 4.95,
"grad_norm": 7.58540678024292,
"learning_rate": 2.0864321608040202e-07,
"loss": 0.6095,
"step": 99000
},
{
"epoch": 4.96,
"grad_norm": 5.402534008026123,
"learning_rate": 1.8854271356783923e-07,
"loss": 0.605,
"step": 99100
},
{
"epoch": 4.96,
"grad_norm": 7.289499282836914,
"learning_rate": 1.684422110552764e-07,
"loss": 0.6694,
"step": 99200
},
{
"epoch": 4.96,
"grad_norm": 7.618215560913086,
"learning_rate": 1.483417085427136e-07,
"loss": 0.6313,
"step": 99300
},
{
"epoch": 4.97,
"grad_norm": 7.560898780822754,
"learning_rate": 1.2824120603015077e-07,
"loss": 0.6073,
"step": 99400
},
{
"epoch": 4.97,
"grad_norm": 5.637300968170166,
"learning_rate": 1.0834170854271359e-07,
"loss": 0.6211,
"step": 99500
},
{
"epoch": 4.98,
"grad_norm": 8.691441535949707,
"learning_rate": 8.824120603015076e-08,
"loss": 0.6085,
"step": 99600
},
{
"epoch": 4.99,
"grad_norm": 4.510754585266113,
"learning_rate": 6.814070351758795e-08,
"loss": 0.6193,
"step": 99700
},
{
"epoch": 4.99,
"grad_norm": 7.4050703048706055,
"learning_rate": 4.804020100502513e-08,
"loss": 0.6642,
"step": 99800
},
{
"epoch": 5.0,
"grad_norm": 9.641931533813477,
"learning_rate": 2.7939698492462312e-08,
"loss": 0.6304,
"step": 99900
},
{
"epoch": 5.0,
"grad_norm": 7.846133232116699,
"learning_rate": 7.839195979899499e-09,
"loss": 0.6181,
"step": 100000
},
{
"epoch": 5.0,
"eval_loss": 0.9481298923492432,
"eval_runtime": 21.6157,
"eval_samples_per_second": 46.263,
"eval_steps_per_second": 5.783,
"step": 100000
}
],
"logging_steps": 100,
"max_steps": 100000,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 10000,
"total_flos": 1.1800273747968e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}