EM_TEST2 / tmp /checkpoints /checkpoint-310 /trainer_state.json
Jack-Payne1's picture
Upload folder using huggingface_hub
256ee59 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.7818411097099621,
"eval_steps": 100,
"global_step": 310,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0025220680958385876,
"grad_norm": 25.350475311279297,
"learning_rate": 0.0,
"loss": 2.5568,
"step": 1
},
{
"epoch": 0.005044136191677175,
"grad_norm": 24.538068771362305,
"learning_rate": 4.000000000000001e-06,
"loss": 2.7748,
"step": 2
},
{
"epoch": 0.007566204287515763,
"grad_norm": 23.780784606933594,
"learning_rate": 8.000000000000001e-06,
"loss": 2.5911,
"step": 3
},
{
"epoch": 0.01008827238335435,
"grad_norm": 24.780380249023438,
"learning_rate": 1.2e-05,
"loss": 2.8427,
"step": 4
},
{
"epoch": 0.012610340479192938,
"grad_norm": 22.699949264526367,
"learning_rate": 1.6000000000000003e-05,
"loss": 2.709,
"step": 5
},
{
"epoch": 0.015132408575031526,
"grad_norm": 22.106008529663086,
"learning_rate": 2e-05,
"loss": 2.5854,
"step": 6
},
{
"epoch": 0.017654476670870115,
"grad_norm": 22.497045516967773,
"learning_rate": 1.9948979591836737e-05,
"loss": 2.5427,
"step": 7
},
{
"epoch": 0.0201765447667087,
"grad_norm": 27.103275299072266,
"learning_rate": 1.9897959183673473e-05,
"loss": 2.6599,
"step": 8
},
{
"epoch": 0.02269861286254729,
"grad_norm": 21.081985473632812,
"learning_rate": 1.9846938775510205e-05,
"loss": 2.5145,
"step": 9
},
{
"epoch": 0.025220680958385876,
"grad_norm": 25.964981079101562,
"learning_rate": 1.979591836734694e-05,
"loss": 2.4247,
"step": 10
},
{
"epoch": 0.027742749054224466,
"grad_norm": 25.353195190429688,
"learning_rate": 1.9744897959183677e-05,
"loss": 2.5092,
"step": 11
},
{
"epoch": 0.03026481715006305,
"grad_norm": 18.94191551208496,
"learning_rate": 1.969387755102041e-05,
"loss": 2.4335,
"step": 12
},
{
"epoch": 0.03278688524590164,
"grad_norm": 23.60140037536621,
"learning_rate": 1.9642857142857145e-05,
"loss": 2.544,
"step": 13
},
{
"epoch": 0.03530895334174023,
"grad_norm": 24.298965454101562,
"learning_rate": 1.9591836734693877e-05,
"loss": 2.4987,
"step": 14
},
{
"epoch": 0.03783102143757881,
"grad_norm": 20.745506286621094,
"learning_rate": 1.9540816326530613e-05,
"loss": 2.4985,
"step": 15
},
{
"epoch": 0.0403530895334174,
"grad_norm": 22.54330062866211,
"learning_rate": 1.948979591836735e-05,
"loss": 2.6892,
"step": 16
},
{
"epoch": 0.04287515762925599,
"grad_norm": 21.46229362487793,
"learning_rate": 1.9438775510204085e-05,
"loss": 2.3998,
"step": 17
},
{
"epoch": 0.04539722572509458,
"grad_norm": 20.54530143737793,
"learning_rate": 1.9387755102040817e-05,
"loss": 2.4244,
"step": 18
},
{
"epoch": 0.04791929382093316,
"grad_norm": 18.8839111328125,
"learning_rate": 1.9336734693877553e-05,
"loss": 2.3911,
"step": 19
},
{
"epoch": 0.05044136191677175,
"grad_norm": 16.924652099609375,
"learning_rate": 1.928571428571429e-05,
"loss": 2.3588,
"step": 20
},
{
"epoch": 0.05296343001261034,
"grad_norm": 16.996627807617188,
"learning_rate": 1.9234693877551024e-05,
"loss": 2.3727,
"step": 21
},
{
"epoch": 0.05548549810844893,
"grad_norm": 18.584613800048828,
"learning_rate": 1.9183673469387756e-05,
"loss": 2.2974,
"step": 22
},
{
"epoch": 0.058007566204287514,
"grad_norm": 14.309200286865234,
"learning_rate": 1.9132653061224492e-05,
"loss": 2.4843,
"step": 23
},
{
"epoch": 0.0605296343001261,
"grad_norm": 15.074164390563965,
"learning_rate": 1.9081632653061225e-05,
"loss": 2.4043,
"step": 24
},
{
"epoch": 0.06305170239596469,
"grad_norm": 13.610542297363281,
"learning_rate": 1.903061224489796e-05,
"loss": 2.3762,
"step": 25
},
{
"epoch": 0.06557377049180328,
"grad_norm": 15.666613578796387,
"learning_rate": 1.8979591836734696e-05,
"loss": 2.3249,
"step": 26
},
{
"epoch": 0.06809583858764187,
"grad_norm": 14.475164413452148,
"learning_rate": 1.892857142857143e-05,
"loss": 2.3317,
"step": 27
},
{
"epoch": 0.07061790668348046,
"grad_norm": 16.231687545776367,
"learning_rate": 1.8877551020408164e-05,
"loss": 2.5064,
"step": 28
},
{
"epoch": 0.07313997477931904,
"grad_norm": 16.8968563079834,
"learning_rate": 1.88265306122449e-05,
"loss": 2.3932,
"step": 29
},
{
"epoch": 0.07566204287515763,
"grad_norm": 17.74305534362793,
"learning_rate": 1.8775510204081636e-05,
"loss": 2.3329,
"step": 30
},
{
"epoch": 0.07818411097099622,
"grad_norm": 16.41620445251465,
"learning_rate": 1.8724489795918368e-05,
"loss": 2.3982,
"step": 31
},
{
"epoch": 0.0807061790668348,
"grad_norm": 17.965959548950195,
"learning_rate": 1.8673469387755104e-05,
"loss": 2.4227,
"step": 32
},
{
"epoch": 0.0832282471626734,
"grad_norm": 19.92589569091797,
"learning_rate": 1.862244897959184e-05,
"loss": 2.5255,
"step": 33
},
{
"epoch": 0.08575031525851198,
"grad_norm": 20.62932586669922,
"learning_rate": 1.8571428571428575e-05,
"loss": 2.1816,
"step": 34
},
{
"epoch": 0.08827238335435057,
"grad_norm": 18.360614776611328,
"learning_rate": 1.8520408163265307e-05,
"loss": 2.2827,
"step": 35
},
{
"epoch": 0.09079445145018916,
"grad_norm": 19.199546813964844,
"learning_rate": 1.8469387755102043e-05,
"loss": 2.1498,
"step": 36
},
{
"epoch": 0.09331651954602774,
"grad_norm": 22.727521896362305,
"learning_rate": 1.8418367346938776e-05,
"loss": 2.3811,
"step": 37
},
{
"epoch": 0.09583858764186633,
"grad_norm": 19.80649757385254,
"learning_rate": 1.836734693877551e-05,
"loss": 2.2342,
"step": 38
},
{
"epoch": 0.09836065573770492,
"grad_norm": 22.24563217163086,
"learning_rate": 1.8316326530612247e-05,
"loss": 2.2287,
"step": 39
},
{
"epoch": 0.1008827238335435,
"grad_norm": 25.384042739868164,
"learning_rate": 1.826530612244898e-05,
"loss": 2.1259,
"step": 40
},
{
"epoch": 0.1034047919293821,
"grad_norm": 23.417089462280273,
"learning_rate": 1.8214285714285715e-05,
"loss": 2.0858,
"step": 41
},
{
"epoch": 0.10592686002522068,
"grad_norm": 27.639497756958008,
"learning_rate": 1.816326530612245e-05,
"loss": 2.2243,
"step": 42
},
{
"epoch": 0.10844892812105927,
"grad_norm": 27.390850067138672,
"learning_rate": 1.8112244897959187e-05,
"loss": 2.1314,
"step": 43
},
{
"epoch": 0.11097099621689786,
"grad_norm": 27.956937789916992,
"learning_rate": 1.806122448979592e-05,
"loss": 2.1755,
"step": 44
},
{
"epoch": 0.11349306431273644,
"grad_norm": 32.09632873535156,
"learning_rate": 1.8010204081632655e-05,
"loss": 2.2365,
"step": 45
},
{
"epoch": 0.11601513240857503,
"grad_norm": 33.84647750854492,
"learning_rate": 1.795918367346939e-05,
"loss": 2.1671,
"step": 46
},
{
"epoch": 0.11853720050441362,
"grad_norm": 32.027130126953125,
"learning_rate": 1.7908163265306123e-05,
"loss": 2.09,
"step": 47
},
{
"epoch": 0.1210592686002522,
"grad_norm": 35.423587799072266,
"learning_rate": 1.785714285714286e-05,
"loss": 2.2479,
"step": 48
},
{
"epoch": 0.1235813366960908,
"grad_norm": 31.041240692138672,
"learning_rate": 1.780612244897959e-05,
"loss": 1.9687,
"step": 49
},
{
"epoch": 0.12610340479192939,
"grad_norm": 28.790103912353516,
"learning_rate": 1.7755102040816327e-05,
"loss": 2.1428,
"step": 50
},
{
"epoch": 0.12862547288776796,
"grad_norm": 25.089313507080078,
"learning_rate": 1.7704081632653062e-05,
"loss": 2.0673,
"step": 51
},
{
"epoch": 0.13114754098360656,
"grad_norm": 26.493867874145508,
"learning_rate": 1.7653061224489798e-05,
"loss": 2.0814,
"step": 52
},
{
"epoch": 0.13366960907944514,
"grad_norm": 19.993173599243164,
"learning_rate": 1.760204081632653e-05,
"loss": 2.005,
"step": 53
},
{
"epoch": 0.13619167717528374,
"grad_norm": 21.89765167236328,
"learning_rate": 1.7551020408163266e-05,
"loss": 2.2262,
"step": 54
},
{
"epoch": 0.13871374527112232,
"grad_norm": 23.22844123840332,
"learning_rate": 1.7500000000000002e-05,
"loss": 2.0208,
"step": 55
},
{
"epoch": 0.14123581336696092,
"grad_norm": 15.864526748657227,
"learning_rate": 1.7448979591836738e-05,
"loss": 2.0153,
"step": 56
},
{
"epoch": 0.1437578814627995,
"grad_norm": 21.451187133789062,
"learning_rate": 1.7397959183673473e-05,
"loss": 2.1386,
"step": 57
},
{
"epoch": 0.14627994955863807,
"grad_norm": 18.089811325073242,
"learning_rate": 1.7346938775510206e-05,
"loss": 1.9517,
"step": 58
},
{
"epoch": 0.14880201765447668,
"grad_norm": 24.029157638549805,
"learning_rate": 1.729591836734694e-05,
"loss": 1.9719,
"step": 59
},
{
"epoch": 0.15132408575031525,
"grad_norm": 18.722776412963867,
"learning_rate": 1.7244897959183674e-05,
"loss": 2.0623,
"step": 60
},
{
"epoch": 0.15384615384615385,
"grad_norm": 20.211933135986328,
"learning_rate": 1.719387755102041e-05,
"loss": 2.0081,
"step": 61
},
{
"epoch": 0.15636822194199243,
"grad_norm": 17.61188507080078,
"learning_rate": 1.7142857142857142e-05,
"loss": 1.8484,
"step": 62
},
{
"epoch": 0.15889029003783103,
"grad_norm": 20.118955612182617,
"learning_rate": 1.7091836734693878e-05,
"loss": 2.0799,
"step": 63
},
{
"epoch": 0.1614123581336696,
"grad_norm": 17.271841049194336,
"learning_rate": 1.7040816326530613e-05,
"loss": 1.9832,
"step": 64
},
{
"epoch": 0.16393442622950818,
"grad_norm": 19.521392822265625,
"learning_rate": 1.698979591836735e-05,
"loss": 1.9129,
"step": 65
},
{
"epoch": 0.1664564943253468,
"grad_norm": 22.660900115966797,
"learning_rate": 1.6938775510204085e-05,
"loss": 2.118,
"step": 66
},
{
"epoch": 0.16897856242118536,
"grad_norm": 17.332427978515625,
"learning_rate": 1.6887755102040817e-05,
"loss": 1.9632,
"step": 67
},
{
"epoch": 0.17150063051702397,
"grad_norm": 22.42765998840332,
"learning_rate": 1.6836734693877553e-05,
"loss": 1.954,
"step": 68
},
{
"epoch": 0.17402269861286254,
"grad_norm": 23.6208553314209,
"learning_rate": 1.678571428571429e-05,
"loss": 1.9917,
"step": 69
},
{
"epoch": 0.17654476670870115,
"grad_norm": 19.78505516052246,
"learning_rate": 1.673469387755102e-05,
"loss": 1.7964,
"step": 70
},
{
"epoch": 0.17906683480453972,
"grad_norm": 19.453041076660156,
"learning_rate": 1.6683673469387757e-05,
"loss": 1.9587,
"step": 71
},
{
"epoch": 0.18158890290037832,
"grad_norm": 24.731407165527344,
"learning_rate": 1.6632653061224492e-05,
"loss": 1.9945,
"step": 72
},
{
"epoch": 0.1841109709962169,
"grad_norm": 20.977611541748047,
"learning_rate": 1.6581632653061225e-05,
"loss": 2.0617,
"step": 73
},
{
"epoch": 0.18663303909205547,
"grad_norm": 22.959585189819336,
"learning_rate": 1.653061224489796e-05,
"loss": 1.98,
"step": 74
},
{
"epoch": 0.18915510718789408,
"grad_norm": 21.952653884887695,
"learning_rate": 1.6479591836734696e-05,
"loss": 2.1094,
"step": 75
},
{
"epoch": 0.19167717528373265,
"grad_norm": 22.320383071899414,
"learning_rate": 1.642857142857143e-05,
"loss": 1.8418,
"step": 76
},
{
"epoch": 0.19419924337957126,
"grad_norm": 24.375411987304688,
"learning_rate": 1.6377551020408164e-05,
"loss": 1.8428,
"step": 77
},
{
"epoch": 0.19672131147540983,
"grad_norm": 19.64323616027832,
"learning_rate": 1.63265306122449e-05,
"loss": 1.9194,
"step": 78
},
{
"epoch": 0.19924337957124844,
"grad_norm": 22.459064483642578,
"learning_rate": 1.6275510204081636e-05,
"loss": 1.6649,
"step": 79
},
{
"epoch": 0.201765447667087,
"grad_norm": 36.789764404296875,
"learning_rate": 1.6224489795918368e-05,
"loss": 2.0131,
"step": 80
},
{
"epoch": 0.2042875157629256,
"grad_norm": 22.109119415283203,
"learning_rate": 1.6173469387755104e-05,
"loss": 1.9603,
"step": 81
},
{
"epoch": 0.2068095838587642,
"grad_norm": 19.196834564208984,
"learning_rate": 1.612244897959184e-05,
"loss": 2.0538,
"step": 82
},
{
"epoch": 0.20933165195460277,
"grad_norm": 26.870800018310547,
"learning_rate": 1.6071428571428572e-05,
"loss": 1.9168,
"step": 83
},
{
"epoch": 0.21185372005044137,
"grad_norm": 35.190696716308594,
"learning_rate": 1.6020408163265308e-05,
"loss": 2.0149,
"step": 84
},
{
"epoch": 0.21437578814627994,
"grad_norm": 19.963472366333008,
"learning_rate": 1.596938775510204e-05,
"loss": 1.7871,
"step": 85
},
{
"epoch": 0.21689785624211855,
"grad_norm": 20.292407989501953,
"learning_rate": 1.5918367346938776e-05,
"loss": 1.944,
"step": 86
},
{
"epoch": 0.21941992433795712,
"grad_norm": 20.55329132080078,
"learning_rate": 1.586734693877551e-05,
"loss": 2.0175,
"step": 87
},
{
"epoch": 0.22194199243379573,
"grad_norm": 17.27350616455078,
"learning_rate": 1.5816326530612247e-05,
"loss": 1.9308,
"step": 88
},
{
"epoch": 0.2244640605296343,
"grad_norm": 22.471134185791016,
"learning_rate": 1.576530612244898e-05,
"loss": 1.9221,
"step": 89
},
{
"epoch": 0.22698612862547288,
"grad_norm": 25.098316192626953,
"learning_rate": 1.5714285714285715e-05,
"loss": 1.9359,
"step": 90
},
{
"epoch": 0.22950819672131148,
"grad_norm": 25.125213623046875,
"learning_rate": 1.566326530612245e-05,
"loss": 2.0087,
"step": 91
},
{
"epoch": 0.23203026481715006,
"grad_norm": 20.038599014282227,
"learning_rate": 1.5612244897959187e-05,
"loss": 2.0939,
"step": 92
},
{
"epoch": 0.23455233291298866,
"grad_norm": 19.016841888427734,
"learning_rate": 1.556122448979592e-05,
"loss": 2.0183,
"step": 93
},
{
"epoch": 0.23707440100882723,
"grad_norm": 21.97820472717285,
"learning_rate": 1.5510204081632655e-05,
"loss": 1.8239,
"step": 94
},
{
"epoch": 0.23959646910466584,
"grad_norm": 25.578901290893555,
"learning_rate": 1.545918367346939e-05,
"loss": 1.9388,
"step": 95
},
{
"epoch": 0.2421185372005044,
"grad_norm": 23.74614143371582,
"learning_rate": 1.5408163265306123e-05,
"loss": 2.0492,
"step": 96
},
{
"epoch": 0.244640605296343,
"grad_norm": 22.203304290771484,
"learning_rate": 1.535714285714286e-05,
"loss": 1.941,
"step": 97
},
{
"epoch": 0.2471626733921816,
"grad_norm": 21.39324188232422,
"learning_rate": 1.530612244897959e-05,
"loss": 1.9042,
"step": 98
},
{
"epoch": 0.24968474148802017,
"grad_norm": 18.99315643310547,
"learning_rate": 1.5255102040816327e-05,
"loss": 1.88,
"step": 99
},
{
"epoch": 0.25220680958385877,
"grad_norm": 24.22341537475586,
"learning_rate": 1.5204081632653063e-05,
"loss": 1.8147,
"step": 100
},
{
"epoch": 0.25220680958385877,
"eval_loss": 1.8966256380081177,
"eval_runtime": 6.9787,
"eval_samples_per_second": 101.022,
"eval_steps_per_second": 50.583,
"step": 100
},
{
"epoch": 0.2547288776796974,
"grad_norm": 18.296152114868164,
"learning_rate": 1.5153061224489798e-05,
"loss": 1.8605,
"step": 101
},
{
"epoch": 0.2572509457755359,
"grad_norm": 26.404766082763672,
"learning_rate": 1.510204081632653e-05,
"loss": 2.1195,
"step": 102
},
{
"epoch": 0.2597730138713745,
"grad_norm": 19.187122344970703,
"learning_rate": 1.5051020408163266e-05,
"loss": 1.9284,
"step": 103
},
{
"epoch": 0.26229508196721313,
"grad_norm": 20.79934310913086,
"learning_rate": 1.5000000000000002e-05,
"loss": 1.725,
"step": 104
},
{
"epoch": 0.2648171500630517,
"grad_norm": 23.833288192749023,
"learning_rate": 1.4948979591836736e-05,
"loss": 1.9215,
"step": 105
},
{
"epoch": 0.2673392181588903,
"grad_norm": 22.301727294921875,
"learning_rate": 1.4897959183673472e-05,
"loss": 1.8728,
"step": 106
},
{
"epoch": 0.2698612862547289,
"grad_norm": 23.685596466064453,
"learning_rate": 1.4846938775510204e-05,
"loss": 2.0482,
"step": 107
},
{
"epoch": 0.2723833543505675,
"grad_norm": 18.969186782836914,
"learning_rate": 1.479591836734694e-05,
"loss": 1.8724,
"step": 108
},
{
"epoch": 0.27490542244640603,
"grad_norm": 23.994483947753906,
"learning_rate": 1.4744897959183676e-05,
"loss": 1.9542,
"step": 109
},
{
"epoch": 0.27742749054224464,
"grad_norm": 16.84621238708496,
"learning_rate": 1.469387755102041e-05,
"loss": 1.9703,
"step": 110
},
{
"epoch": 0.27994955863808324,
"grad_norm": 23.411087036132812,
"learning_rate": 1.4642857142857144e-05,
"loss": 1.9836,
"step": 111
},
{
"epoch": 0.28247162673392184,
"grad_norm": 29.55487632751465,
"learning_rate": 1.4591836734693878e-05,
"loss": 1.9124,
"step": 112
},
{
"epoch": 0.2849936948297604,
"grad_norm": 32.28921127319336,
"learning_rate": 1.4540816326530614e-05,
"loss": 1.8566,
"step": 113
},
{
"epoch": 0.287515762925599,
"grad_norm": 24.007558822631836,
"learning_rate": 1.448979591836735e-05,
"loss": 1.8296,
"step": 114
},
{
"epoch": 0.2900378310214376,
"grad_norm": 26.753524780273438,
"learning_rate": 1.4438775510204083e-05,
"loss": 1.7181,
"step": 115
},
{
"epoch": 0.29255989911727615,
"grad_norm": 22.49270248413086,
"learning_rate": 1.4387755102040817e-05,
"loss": 1.8741,
"step": 116
},
{
"epoch": 0.29508196721311475,
"grad_norm": 28.006656646728516,
"learning_rate": 1.4336734693877551e-05,
"loss": 1.9151,
"step": 117
},
{
"epoch": 0.29760403530895335,
"grad_norm": 17.606775283813477,
"learning_rate": 1.4285714285714287e-05,
"loss": 1.9654,
"step": 118
},
{
"epoch": 0.30012610340479196,
"grad_norm": 29.94802474975586,
"learning_rate": 1.4234693877551023e-05,
"loss": 1.8849,
"step": 119
},
{
"epoch": 0.3026481715006305,
"grad_norm": 28.27743148803711,
"learning_rate": 1.4183673469387755e-05,
"loss": 1.8006,
"step": 120
},
{
"epoch": 0.3051702395964691,
"grad_norm": 19.11652183532715,
"learning_rate": 1.4132653061224491e-05,
"loss": 1.8539,
"step": 121
},
{
"epoch": 0.3076923076923077,
"grad_norm": 24.255807876586914,
"learning_rate": 1.4081632653061225e-05,
"loss": 2.0162,
"step": 122
},
{
"epoch": 0.31021437578814626,
"grad_norm": 22.508352279663086,
"learning_rate": 1.403061224489796e-05,
"loss": 1.858,
"step": 123
},
{
"epoch": 0.31273644388398486,
"grad_norm": 27.028772354125977,
"learning_rate": 1.3979591836734696e-05,
"loss": 1.8175,
"step": 124
},
{
"epoch": 0.31525851197982346,
"grad_norm": 22.697704315185547,
"learning_rate": 1.3928571428571429e-05,
"loss": 1.9789,
"step": 125
},
{
"epoch": 0.31778058007566207,
"grad_norm": 31.604068756103516,
"learning_rate": 1.3877551020408165e-05,
"loss": 2.0039,
"step": 126
},
{
"epoch": 0.3203026481715006,
"grad_norm": 27.71053695678711,
"learning_rate": 1.38265306122449e-05,
"loss": 1.9867,
"step": 127
},
{
"epoch": 0.3228247162673392,
"grad_norm": 17.37586784362793,
"learning_rate": 1.3775510204081634e-05,
"loss": 1.6931,
"step": 128
},
{
"epoch": 0.3253467843631778,
"grad_norm": 20.28536605834961,
"learning_rate": 1.3724489795918368e-05,
"loss": 1.9199,
"step": 129
},
{
"epoch": 0.32786885245901637,
"grad_norm": 29.4377384185791,
"learning_rate": 1.3673469387755102e-05,
"loss": 1.9322,
"step": 130
},
{
"epoch": 0.33039092055485497,
"grad_norm": 17.703046798706055,
"learning_rate": 1.3622448979591838e-05,
"loss": 1.8842,
"step": 131
},
{
"epoch": 0.3329129886506936,
"grad_norm": 30.14008140563965,
"learning_rate": 1.3571428571428574e-05,
"loss": 2.1228,
"step": 132
},
{
"epoch": 0.3354350567465322,
"grad_norm": 29.657262802124023,
"learning_rate": 1.3520408163265306e-05,
"loss": 1.8929,
"step": 133
},
{
"epoch": 0.3379571248423707,
"grad_norm": 18.243854522705078,
"learning_rate": 1.3469387755102042e-05,
"loss": 1.8811,
"step": 134
},
{
"epoch": 0.34047919293820933,
"grad_norm": 33.22247314453125,
"learning_rate": 1.3418367346938776e-05,
"loss": 1.9814,
"step": 135
},
{
"epoch": 0.34300126103404793,
"grad_norm": 26.413856506347656,
"learning_rate": 1.3367346938775512e-05,
"loss": 1.9329,
"step": 136
},
{
"epoch": 0.3455233291298865,
"grad_norm": 20.56089210510254,
"learning_rate": 1.3316326530612247e-05,
"loss": 1.8944,
"step": 137
},
{
"epoch": 0.3480453972257251,
"grad_norm": 19.480737686157227,
"learning_rate": 1.326530612244898e-05,
"loss": 1.9221,
"step": 138
},
{
"epoch": 0.3505674653215637,
"grad_norm": 22.788074493408203,
"learning_rate": 1.3214285714285716e-05,
"loss": 2.0445,
"step": 139
},
{
"epoch": 0.3530895334174023,
"grad_norm": 20.722291946411133,
"learning_rate": 1.316326530612245e-05,
"loss": 1.9998,
"step": 140
},
{
"epoch": 0.35561160151324084,
"grad_norm": 25.190189361572266,
"learning_rate": 1.3112244897959185e-05,
"loss": 1.8076,
"step": 141
},
{
"epoch": 0.35813366960907944,
"grad_norm": 23.203886032104492,
"learning_rate": 1.3061224489795918e-05,
"loss": 1.7821,
"step": 142
},
{
"epoch": 0.36065573770491804,
"grad_norm": 25.32374382019043,
"learning_rate": 1.3010204081632653e-05,
"loss": 2.0356,
"step": 143
},
{
"epoch": 0.36317780580075665,
"grad_norm": 28.798864364624023,
"learning_rate": 1.2959183673469389e-05,
"loss": 1.8202,
"step": 144
},
{
"epoch": 0.3656998738965952,
"grad_norm": 24.93810272216797,
"learning_rate": 1.2908163265306123e-05,
"loss": 1.9237,
"step": 145
},
{
"epoch": 0.3682219419924338,
"grad_norm": 36.78353500366211,
"learning_rate": 1.2857142857142859e-05,
"loss": 2.0019,
"step": 146
},
{
"epoch": 0.3707440100882724,
"grad_norm": 28.510663986206055,
"learning_rate": 1.2806122448979591e-05,
"loss": 1.9268,
"step": 147
},
{
"epoch": 0.37326607818411095,
"grad_norm": 38.19087219238281,
"learning_rate": 1.2755102040816327e-05,
"loss": 1.9366,
"step": 148
},
{
"epoch": 0.37578814627994955,
"grad_norm": 20.796728134155273,
"learning_rate": 1.2704081632653063e-05,
"loss": 1.8731,
"step": 149
},
{
"epoch": 0.37831021437578816,
"grad_norm": 23.036758422851562,
"learning_rate": 1.2653061224489798e-05,
"loss": 1.8835,
"step": 150
},
{
"epoch": 0.38083228247162676,
"grad_norm": 27.058195114135742,
"learning_rate": 1.260204081632653e-05,
"loss": 1.8013,
"step": 151
},
{
"epoch": 0.3833543505674653,
"grad_norm": 25.390460968017578,
"learning_rate": 1.2551020408163267e-05,
"loss": 2.0623,
"step": 152
},
{
"epoch": 0.3858764186633039,
"grad_norm": 27.993654251098633,
"learning_rate": 1.25e-05,
"loss": 1.6895,
"step": 153
},
{
"epoch": 0.3883984867591425,
"grad_norm": 24.15807342529297,
"learning_rate": 1.2448979591836736e-05,
"loss": 1.9799,
"step": 154
},
{
"epoch": 0.39092055485498106,
"grad_norm": 24.369815826416016,
"learning_rate": 1.2397959183673472e-05,
"loss": 1.9687,
"step": 155
},
{
"epoch": 0.39344262295081966,
"grad_norm": 24.572988510131836,
"learning_rate": 1.2346938775510204e-05,
"loss": 1.8607,
"step": 156
},
{
"epoch": 0.39596469104665827,
"grad_norm": 20.491390228271484,
"learning_rate": 1.229591836734694e-05,
"loss": 2.0677,
"step": 157
},
{
"epoch": 0.39848675914249687,
"grad_norm": 25.128101348876953,
"learning_rate": 1.2244897959183674e-05,
"loss": 1.9233,
"step": 158
},
{
"epoch": 0.4010088272383354,
"grad_norm": 18.843276977539062,
"learning_rate": 1.219387755102041e-05,
"loss": 1.781,
"step": 159
},
{
"epoch": 0.403530895334174,
"grad_norm": 24.99994659423828,
"learning_rate": 1.2142857142857142e-05,
"loss": 1.962,
"step": 160
},
{
"epoch": 0.4060529634300126,
"grad_norm": 20.679218292236328,
"learning_rate": 1.2091836734693878e-05,
"loss": 2.0055,
"step": 161
},
{
"epoch": 0.4085750315258512,
"grad_norm": 26.00550651550293,
"learning_rate": 1.2040816326530614e-05,
"loss": 1.9761,
"step": 162
},
{
"epoch": 0.4110970996216898,
"grad_norm": 33.80900192260742,
"learning_rate": 1.1989795918367348e-05,
"loss": 2.0502,
"step": 163
},
{
"epoch": 0.4136191677175284,
"grad_norm": 25.639009475708008,
"learning_rate": 1.1938775510204084e-05,
"loss": 1.9088,
"step": 164
},
{
"epoch": 0.416141235813367,
"grad_norm": 17.48627471923828,
"learning_rate": 1.1887755102040816e-05,
"loss": 1.9359,
"step": 165
},
{
"epoch": 0.41866330390920553,
"grad_norm": 23.16074562072754,
"learning_rate": 1.1836734693877552e-05,
"loss": 1.8647,
"step": 166
},
{
"epoch": 0.42118537200504413,
"grad_norm": 25.39946174621582,
"learning_rate": 1.1785714285714287e-05,
"loss": 1.8523,
"step": 167
},
{
"epoch": 0.42370744010088274,
"grad_norm": 25.8050537109375,
"learning_rate": 1.1734693877551021e-05,
"loss": 1.8403,
"step": 168
},
{
"epoch": 0.4262295081967213,
"grad_norm": 20.019033432006836,
"learning_rate": 1.1683673469387755e-05,
"loss": 2.0023,
"step": 169
},
{
"epoch": 0.4287515762925599,
"grad_norm": 26.194847106933594,
"learning_rate": 1.1632653061224491e-05,
"loss": 1.9429,
"step": 170
},
{
"epoch": 0.4312736443883985,
"grad_norm": 21.064212799072266,
"learning_rate": 1.1581632653061225e-05,
"loss": 1.8302,
"step": 171
},
{
"epoch": 0.4337957124842371,
"grad_norm": 21.876129150390625,
"learning_rate": 1.1530612244897961e-05,
"loss": 1.8881,
"step": 172
},
{
"epoch": 0.43631778058007564,
"grad_norm": 33.61103439331055,
"learning_rate": 1.1479591836734697e-05,
"loss": 2.0497,
"step": 173
},
{
"epoch": 0.43883984867591425,
"grad_norm": 27.204744338989258,
"learning_rate": 1.1428571428571429e-05,
"loss": 1.8431,
"step": 174
},
{
"epoch": 0.44136191677175285,
"grad_norm": 21.605751037597656,
"learning_rate": 1.1377551020408165e-05,
"loss": 1.9149,
"step": 175
},
{
"epoch": 0.44388398486759145,
"grad_norm": 30.307472229003906,
"learning_rate": 1.1326530612244899e-05,
"loss": 2.0313,
"step": 176
},
{
"epoch": 0.44640605296343,
"grad_norm": 23.69244384765625,
"learning_rate": 1.1275510204081635e-05,
"loss": 1.8676,
"step": 177
},
{
"epoch": 0.4489281210592686,
"grad_norm": 25.619901657104492,
"learning_rate": 1.1224489795918367e-05,
"loss": 1.7905,
"step": 178
},
{
"epoch": 0.4514501891551072,
"grad_norm": 28.0296573638916,
"learning_rate": 1.1173469387755103e-05,
"loss": 1.8567,
"step": 179
},
{
"epoch": 0.45397225725094575,
"grad_norm": 36.4359130859375,
"learning_rate": 1.1122448979591838e-05,
"loss": 2.115,
"step": 180
},
{
"epoch": 0.45649432534678436,
"grad_norm": 26.91726303100586,
"learning_rate": 1.1071428571428572e-05,
"loss": 2.0642,
"step": 181
},
{
"epoch": 0.45901639344262296,
"grad_norm": 23.085880279541016,
"learning_rate": 1.1020408163265306e-05,
"loss": 1.9109,
"step": 182
},
{
"epoch": 0.46153846153846156,
"grad_norm": 27.870641708374023,
"learning_rate": 1.096938775510204e-05,
"loss": 1.8999,
"step": 183
},
{
"epoch": 0.4640605296343001,
"grad_norm": 32.0672607421875,
"learning_rate": 1.0918367346938776e-05,
"loss": 1.904,
"step": 184
},
{
"epoch": 0.4665825977301387,
"grad_norm": 28.879365921020508,
"learning_rate": 1.0867346938775512e-05,
"loss": 1.7159,
"step": 185
},
{
"epoch": 0.4691046658259773,
"grad_norm": 27.592771530151367,
"learning_rate": 1.0816326530612246e-05,
"loss": 1.8561,
"step": 186
},
{
"epoch": 0.47162673392181587,
"grad_norm": 27.412763595581055,
"learning_rate": 1.076530612244898e-05,
"loss": 1.9282,
"step": 187
},
{
"epoch": 0.47414880201765447,
"grad_norm": 30.12356185913086,
"learning_rate": 1.0714285714285714e-05,
"loss": 1.8726,
"step": 188
},
{
"epoch": 0.4766708701134931,
"grad_norm": 39.9027214050293,
"learning_rate": 1.066326530612245e-05,
"loss": 1.7551,
"step": 189
},
{
"epoch": 0.4791929382093317,
"grad_norm": 30.483945846557617,
"learning_rate": 1.0612244897959186e-05,
"loss": 1.7643,
"step": 190
},
{
"epoch": 0.4817150063051702,
"grad_norm": 26.00415802001953,
"learning_rate": 1.0561224489795918e-05,
"loss": 2.0552,
"step": 191
},
{
"epoch": 0.4842370744010088,
"grad_norm": 23.03282356262207,
"learning_rate": 1.0510204081632654e-05,
"loss": 2.0052,
"step": 192
},
{
"epoch": 0.48675914249684743,
"grad_norm": 33.653221130371094,
"learning_rate": 1.045918367346939e-05,
"loss": 1.7367,
"step": 193
},
{
"epoch": 0.489281210592686,
"grad_norm": 39.59351348876953,
"learning_rate": 1.0408163265306123e-05,
"loss": 1.9726,
"step": 194
},
{
"epoch": 0.4918032786885246,
"grad_norm": 42.77714920043945,
"learning_rate": 1.0357142857142859e-05,
"loss": 2.0906,
"step": 195
},
{
"epoch": 0.4943253467843632,
"grad_norm": 33.194549560546875,
"learning_rate": 1.0306122448979591e-05,
"loss": 2.0742,
"step": 196
},
{
"epoch": 0.4968474148802018,
"grad_norm": 25.10793685913086,
"learning_rate": 1.0255102040816327e-05,
"loss": 1.9204,
"step": 197
},
{
"epoch": 0.49936948297604034,
"grad_norm": 40.048404693603516,
"learning_rate": 1.0204081632653063e-05,
"loss": 1.7775,
"step": 198
},
{
"epoch": 0.501891551071879,
"grad_norm": 26.085933685302734,
"learning_rate": 1.0153061224489797e-05,
"loss": 1.9459,
"step": 199
},
{
"epoch": 0.5044136191677175,
"grad_norm": 18.375,
"learning_rate": 1.0102040816326531e-05,
"loss": 1.9536,
"step": 200
},
{
"epoch": 0.5044136191677175,
"eval_loss": 1.8626214265823364,
"eval_runtime": 6.6508,
"eval_samples_per_second": 106.002,
"eval_steps_per_second": 53.076,
"step": 200
},
{
"epoch": 0.5069356872635561,
"grad_norm": 33.858341217041016,
"learning_rate": 1.0051020408163265e-05,
"loss": 1.8191,
"step": 201
},
{
"epoch": 0.5094577553593947,
"grad_norm": 22.895992279052734,
"learning_rate": 1e-05,
"loss": 2.0596,
"step": 202
},
{
"epoch": 0.5119798234552333,
"grad_norm": 30.55072593688965,
"learning_rate": 9.948979591836737e-06,
"loss": 1.8904,
"step": 203
},
{
"epoch": 0.5145018915510718,
"grad_norm": 26.542705535888672,
"learning_rate": 9.89795918367347e-06,
"loss": 1.9683,
"step": 204
},
{
"epoch": 0.5170239596469105,
"grad_norm": 39.81034851074219,
"learning_rate": 9.846938775510205e-06,
"loss": 1.9726,
"step": 205
},
{
"epoch": 0.519546027742749,
"grad_norm": 22.0065860748291,
"learning_rate": 9.795918367346939e-06,
"loss": 2.0575,
"step": 206
},
{
"epoch": 0.5220680958385876,
"grad_norm": 19.012041091918945,
"learning_rate": 9.744897959183674e-06,
"loss": 1.8431,
"step": 207
},
{
"epoch": 0.5245901639344263,
"grad_norm": 39.699974060058594,
"learning_rate": 9.693877551020408e-06,
"loss": 1.8231,
"step": 208
},
{
"epoch": 0.5271122320302648,
"grad_norm": 21.391319274902344,
"learning_rate": 9.642857142857144e-06,
"loss": 1.7939,
"step": 209
},
{
"epoch": 0.5296343001261034,
"grad_norm": 25.8063907623291,
"learning_rate": 9.591836734693878e-06,
"loss": 2.0366,
"step": 210
},
{
"epoch": 0.532156368221942,
"grad_norm": 20.598569869995117,
"learning_rate": 9.540816326530612e-06,
"loss": 1.8323,
"step": 211
},
{
"epoch": 0.5346784363177806,
"grad_norm": 29.391401290893555,
"learning_rate": 9.489795918367348e-06,
"loss": 2.0052,
"step": 212
},
{
"epoch": 0.5372005044136192,
"grad_norm": 24.39499855041504,
"learning_rate": 9.438775510204082e-06,
"loss": 1.8461,
"step": 213
},
{
"epoch": 0.5397225725094578,
"grad_norm": 24.16887092590332,
"learning_rate": 9.387755102040818e-06,
"loss": 1.9404,
"step": 214
},
{
"epoch": 0.5422446406052963,
"grad_norm": 24.577871322631836,
"learning_rate": 9.336734693877552e-06,
"loss": 1.9202,
"step": 215
},
{
"epoch": 0.544766708701135,
"grad_norm": 26.117361068725586,
"learning_rate": 9.285714285714288e-06,
"loss": 1.921,
"step": 216
},
{
"epoch": 0.5472887767969735,
"grad_norm": 22.586837768554688,
"learning_rate": 9.234693877551022e-06,
"loss": 1.9692,
"step": 217
},
{
"epoch": 0.5498108448928121,
"grad_norm": 18.438722610473633,
"learning_rate": 9.183673469387756e-06,
"loss": 1.9496,
"step": 218
},
{
"epoch": 0.5523329129886507,
"grad_norm": 22.94545555114746,
"learning_rate": 9.13265306122449e-06,
"loss": 1.991,
"step": 219
},
{
"epoch": 0.5548549810844893,
"grad_norm": 28.664562225341797,
"learning_rate": 9.081632653061225e-06,
"loss": 1.8352,
"step": 220
},
{
"epoch": 0.5573770491803278,
"grad_norm": 25.63576316833496,
"learning_rate": 9.03061224489796e-06,
"loss": 1.9399,
"step": 221
},
{
"epoch": 0.5598991172761665,
"grad_norm": 21.650251388549805,
"learning_rate": 8.979591836734695e-06,
"loss": 1.9565,
"step": 222
},
{
"epoch": 0.562421185372005,
"grad_norm": 29.605735778808594,
"learning_rate": 8.92857142857143e-06,
"loss": 1.729,
"step": 223
},
{
"epoch": 0.5649432534678437,
"grad_norm": 23.98230743408203,
"learning_rate": 8.877551020408163e-06,
"loss": 1.9399,
"step": 224
},
{
"epoch": 0.5674653215636822,
"grad_norm": 20.37510108947754,
"learning_rate": 8.826530612244899e-06,
"loss": 1.7322,
"step": 225
},
{
"epoch": 0.5699873896595208,
"grad_norm": 25.876188278198242,
"learning_rate": 8.775510204081633e-06,
"loss": 1.9444,
"step": 226
},
{
"epoch": 0.5725094577553594,
"grad_norm": 32.07249069213867,
"learning_rate": 8.724489795918369e-06,
"loss": 1.9364,
"step": 227
},
{
"epoch": 0.575031525851198,
"grad_norm": 28.014524459838867,
"learning_rate": 8.673469387755103e-06,
"loss": 1.757,
"step": 228
},
{
"epoch": 0.5775535939470365,
"grad_norm": 30.82647132873535,
"learning_rate": 8.622448979591837e-06,
"loss": 1.9067,
"step": 229
},
{
"epoch": 0.5800756620428752,
"grad_norm": 30.651660919189453,
"learning_rate": 8.571428571428571e-06,
"loss": 1.9906,
"step": 230
},
{
"epoch": 0.5825977301387137,
"grad_norm": 25.239904403686523,
"learning_rate": 8.520408163265307e-06,
"loss": 1.8914,
"step": 231
},
{
"epoch": 0.5851197982345523,
"grad_norm": 21.33747673034668,
"learning_rate": 8.469387755102042e-06,
"loss": 1.9999,
"step": 232
},
{
"epoch": 0.587641866330391,
"grad_norm": 25.255064010620117,
"learning_rate": 8.418367346938776e-06,
"loss": 1.8941,
"step": 233
},
{
"epoch": 0.5901639344262295,
"grad_norm": 24.443973541259766,
"learning_rate": 8.36734693877551e-06,
"loss": 1.7679,
"step": 234
},
{
"epoch": 0.592686002522068,
"grad_norm": 25.473894119262695,
"learning_rate": 8.316326530612246e-06,
"loss": 1.7876,
"step": 235
},
{
"epoch": 0.5952080706179067,
"grad_norm": 26.28467559814453,
"learning_rate": 8.26530612244898e-06,
"loss": 1.6761,
"step": 236
},
{
"epoch": 0.5977301387137453,
"grad_norm": 24.488052368164062,
"learning_rate": 8.214285714285714e-06,
"loss": 2.0022,
"step": 237
},
{
"epoch": 0.6002522068095839,
"grad_norm": 30.074064254760742,
"learning_rate": 8.16326530612245e-06,
"loss": 1.7747,
"step": 238
},
{
"epoch": 0.6027742749054225,
"grad_norm": 23.73440170288086,
"learning_rate": 8.112244897959184e-06,
"loss": 1.8468,
"step": 239
},
{
"epoch": 0.605296343001261,
"grad_norm": 22.338869094848633,
"learning_rate": 8.06122448979592e-06,
"loss": 1.8611,
"step": 240
},
{
"epoch": 0.6078184110970997,
"grad_norm": 24.844266891479492,
"learning_rate": 8.010204081632654e-06,
"loss": 1.9285,
"step": 241
},
{
"epoch": 0.6103404791929382,
"grad_norm": 29.65668487548828,
"learning_rate": 7.959183673469388e-06,
"loss": 1.935,
"step": 242
},
{
"epoch": 0.6128625472887768,
"grad_norm": 26.01723289489746,
"learning_rate": 7.908163265306124e-06,
"loss": 1.7587,
"step": 243
},
{
"epoch": 0.6153846153846154,
"grad_norm": 27.04817771911621,
"learning_rate": 7.857142857142858e-06,
"loss": 1.8878,
"step": 244
},
{
"epoch": 0.617906683480454,
"grad_norm": 36.23786163330078,
"learning_rate": 7.806122448979593e-06,
"loss": 1.992,
"step": 245
},
{
"epoch": 0.6204287515762925,
"grad_norm": 19.283294677734375,
"learning_rate": 7.755102040816327e-06,
"loss": 1.8066,
"step": 246
},
{
"epoch": 0.6229508196721312,
"grad_norm": 24.24143409729004,
"learning_rate": 7.704081632653061e-06,
"loss": 1.8899,
"step": 247
},
{
"epoch": 0.6254728877679697,
"grad_norm": 25.59832763671875,
"learning_rate": 7.653061224489796e-06,
"loss": 1.9601,
"step": 248
},
{
"epoch": 0.6279949558638083,
"grad_norm": 27.195640563964844,
"learning_rate": 7.602040816326531e-06,
"loss": 1.9561,
"step": 249
},
{
"epoch": 0.6305170239596469,
"grad_norm": 27.854570388793945,
"learning_rate": 7.551020408163265e-06,
"loss": 1.8781,
"step": 250
},
{
"epoch": 0.6330390920554855,
"grad_norm": 25.715761184692383,
"learning_rate": 7.500000000000001e-06,
"loss": 1.8542,
"step": 251
},
{
"epoch": 0.6355611601513241,
"grad_norm": 22.562984466552734,
"learning_rate": 7.448979591836736e-06,
"loss": 1.7681,
"step": 252
},
{
"epoch": 0.6380832282471627,
"grad_norm": 20.540348052978516,
"learning_rate": 7.39795918367347e-06,
"loss": 1.9617,
"step": 253
},
{
"epoch": 0.6406052963430012,
"grad_norm": 24.610937118530273,
"learning_rate": 7.346938775510205e-06,
"loss": 1.9694,
"step": 254
},
{
"epoch": 0.6431273644388399,
"grad_norm": 27.93538475036621,
"learning_rate": 7.295918367346939e-06,
"loss": 1.8858,
"step": 255
},
{
"epoch": 0.6456494325346784,
"grad_norm": 31.466445922851562,
"learning_rate": 7.244897959183675e-06,
"loss": 1.8252,
"step": 256
},
{
"epoch": 0.648171500630517,
"grad_norm": 26.276226043701172,
"learning_rate": 7.193877551020409e-06,
"loss": 1.8865,
"step": 257
},
{
"epoch": 0.6506935687263556,
"grad_norm": 22.52095603942871,
"learning_rate": 7.1428571428571436e-06,
"loss": 1.7649,
"step": 258
},
{
"epoch": 0.6532156368221942,
"grad_norm": 20.15144157409668,
"learning_rate": 7.091836734693878e-06,
"loss": 2.0158,
"step": 259
},
{
"epoch": 0.6557377049180327,
"grad_norm": 26.405349731445312,
"learning_rate": 7.0408163265306125e-06,
"loss": 1.8932,
"step": 260
},
{
"epoch": 0.6582597730138714,
"grad_norm": 32.94384765625,
"learning_rate": 6.989795918367348e-06,
"loss": 1.7795,
"step": 261
},
{
"epoch": 0.6607818411097099,
"grad_norm": 23.109092712402344,
"learning_rate": 6.938775510204082e-06,
"loss": 1.8383,
"step": 262
},
{
"epoch": 0.6633039092055486,
"grad_norm": 21.75737190246582,
"learning_rate": 6.887755102040817e-06,
"loss": 1.8727,
"step": 263
},
{
"epoch": 0.6658259773013872,
"grad_norm": 22.96916389465332,
"learning_rate": 6.836734693877551e-06,
"loss": 1.8544,
"step": 264
},
{
"epoch": 0.6683480453972257,
"grad_norm": 25.62445640563965,
"learning_rate": 6.785714285714287e-06,
"loss": 1.7503,
"step": 265
},
{
"epoch": 0.6708701134930644,
"grad_norm": 25.430530548095703,
"learning_rate": 6.734693877551021e-06,
"loss": 1.7938,
"step": 266
},
{
"epoch": 0.6733921815889029,
"grad_norm": 26.462881088256836,
"learning_rate": 6.683673469387756e-06,
"loss": 1.8284,
"step": 267
},
{
"epoch": 0.6759142496847415,
"grad_norm": 31.45004653930664,
"learning_rate": 6.63265306122449e-06,
"loss": 2.0328,
"step": 268
},
{
"epoch": 0.6784363177805801,
"grad_norm": 30.525737762451172,
"learning_rate": 6.581632653061225e-06,
"loss": 1.8192,
"step": 269
},
{
"epoch": 0.6809583858764187,
"grad_norm": 25.705707550048828,
"learning_rate": 6.530612244897959e-06,
"loss": 1.8533,
"step": 270
},
{
"epoch": 0.6834804539722572,
"grad_norm": 39.90187454223633,
"learning_rate": 6.4795918367346946e-06,
"loss": 1.9483,
"step": 271
},
{
"epoch": 0.6860025220680959,
"grad_norm": 28.0180721282959,
"learning_rate": 6.4285714285714295e-06,
"loss": 1.8132,
"step": 272
},
{
"epoch": 0.6885245901639344,
"grad_norm": 34.821372985839844,
"learning_rate": 6.3775510204081635e-06,
"loss": 1.9599,
"step": 273
},
{
"epoch": 0.691046658259773,
"grad_norm": 24.018394470214844,
"learning_rate": 6.326530612244899e-06,
"loss": 1.9248,
"step": 274
},
{
"epoch": 0.6935687263556116,
"grad_norm": 24.074344635009766,
"learning_rate": 6.275510204081633e-06,
"loss": 2.0148,
"step": 275
},
{
"epoch": 0.6960907944514502,
"grad_norm": 31.1939754486084,
"learning_rate": 6.224489795918368e-06,
"loss": 1.8959,
"step": 276
},
{
"epoch": 0.6986128625472888,
"grad_norm": 25.481502532958984,
"learning_rate": 6.173469387755102e-06,
"loss": 1.9832,
"step": 277
},
{
"epoch": 0.7011349306431274,
"grad_norm": 29.6664981842041,
"learning_rate": 6.122448979591837e-06,
"loss": 1.9222,
"step": 278
},
{
"epoch": 0.7036569987389659,
"grad_norm": 26.30698585510254,
"learning_rate": 6.071428571428571e-06,
"loss": 1.9897,
"step": 279
},
{
"epoch": 0.7061790668348046,
"grad_norm": 31.827558517456055,
"learning_rate": 6.020408163265307e-06,
"loss": 1.8615,
"step": 280
},
{
"epoch": 0.7087011349306431,
"grad_norm": 24.80223846435547,
"learning_rate": 5.969387755102042e-06,
"loss": 1.9579,
"step": 281
},
{
"epoch": 0.7112232030264817,
"grad_norm": 36.134700775146484,
"learning_rate": 5.918367346938776e-06,
"loss": 1.723,
"step": 282
},
{
"epoch": 0.7137452711223203,
"grad_norm": 30.388233184814453,
"learning_rate": 5.867346938775511e-06,
"loss": 1.9736,
"step": 283
},
{
"epoch": 0.7162673392181589,
"grad_norm": 32.231563568115234,
"learning_rate": 5.816326530612246e-06,
"loss": 1.9228,
"step": 284
},
{
"epoch": 0.7187894073139974,
"grad_norm": 38.05869674682617,
"learning_rate": 5.7653061224489805e-06,
"loss": 1.9159,
"step": 285
},
{
"epoch": 0.7213114754098361,
"grad_norm": 27.256147384643555,
"learning_rate": 5.7142857142857145e-06,
"loss": 1.8072,
"step": 286
},
{
"epoch": 0.7238335435056746,
"grad_norm": 25.67181396484375,
"learning_rate": 5.663265306122449e-06,
"loss": 1.8633,
"step": 287
},
{
"epoch": 0.7263556116015133,
"grad_norm": 31.8681697845459,
"learning_rate": 5.6122448979591834e-06,
"loss": 1.9822,
"step": 288
},
{
"epoch": 0.7288776796973518,
"grad_norm": 32.85325241088867,
"learning_rate": 5.561224489795919e-06,
"loss": 1.8166,
"step": 289
},
{
"epoch": 0.7313997477931904,
"grad_norm": 35.64312744140625,
"learning_rate": 5.510204081632653e-06,
"loss": 1.7166,
"step": 290
},
{
"epoch": 0.733921815889029,
"grad_norm": 24.276235580444336,
"learning_rate": 5.459183673469388e-06,
"loss": 1.7593,
"step": 291
},
{
"epoch": 0.7364438839848676,
"grad_norm": 29.371950149536133,
"learning_rate": 5.408163265306123e-06,
"loss": 1.8124,
"step": 292
},
{
"epoch": 0.7389659520807061,
"grad_norm": 23.76220703125,
"learning_rate": 5.357142857142857e-06,
"loss": 1.7775,
"step": 293
},
{
"epoch": 0.7414880201765448,
"grad_norm": 37.103050231933594,
"learning_rate": 5.306122448979593e-06,
"loss": 1.9253,
"step": 294
},
{
"epoch": 0.7440100882723834,
"grad_norm": 20.0811767578125,
"learning_rate": 5.255102040816327e-06,
"loss": 1.8711,
"step": 295
},
{
"epoch": 0.7465321563682219,
"grad_norm": 35.33123016357422,
"learning_rate": 5.204081632653062e-06,
"loss": 1.8764,
"step": 296
},
{
"epoch": 0.7490542244640606,
"grad_norm": 31.880672454833984,
"learning_rate": 5.153061224489796e-06,
"loss": 1.8929,
"step": 297
},
{
"epoch": 0.7515762925598991,
"grad_norm": 21.682334899902344,
"learning_rate": 5.1020408163265315e-06,
"loss": 2.0377,
"step": 298
},
{
"epoch": 0.7540983606557377,
"grad_norm": 34.68608474731445,
"learning_rate": 5.0510204081632655e-06,
"loss": 1.9341,
"step": 299
},
{
"epoch": 0.7566204287515763,
"grad_norm": 25.59632110595703,
"learning_rate": 5e-06,
"loss": 1.8264,
"step": 300
},
{
"epoch": 0.7566204287515763,
"eval_loss": 1.8502724170684814,
"eval_runtime": 6.6208,
"eval_samples_per_second": 106.483,
"eval_steps_per_second": 53.317,
"step": 300
},
{
"epoch": 0.7591424968474149,
"grad_norm": 33.780616760253906,
"learning_rate": 4.948979591836735e-06,
"loss": 1.8315,
"step": 301
},
{
"epoch": 0.7616645649432535,
"grad_norm": 23.005069732666016,
"learning_rate": 4.897959183673469e-06,
"loss": 1.8434,
"step": 302
},
{
"epoch": 0.7641866330390921,
"grad_norm": 27.338787078857422,
"learning_rate": 4.846938775510204e-06,
"loss": 1.9102,
"step": 303
},
{
"epoch": 0.7667087011349306,
"grad_norm": 26.87493133544922,
"learning_rate": 4.795918367346939e-06,
"loss": 1.8408,
"step": 304
},
{
"epoch": 0.7692307692307693,
"grad_norm": 33.59117126464844,
"learning_rate": 4.744897959183674e-06,
"loss": 1.8633,
"step": 305
},
{
"epoch": 0.7717528373266078,
"grad_norm": 38.98092269897461,
"learning_rate": 4.693877551020409e-06,
"loss": 1.8187,
"step": 306
},
{
"epoch": 0.7742749054224464,
"grad_norm": 28.7203369140625,
"learning_rate": 4.642857142857144e-06,
"loss": 1.8425,
"step": 307
},
{
"epoch": 0.776796973518285,
"grad_norm": 30.91414451599121,
"learning_rate": 4.591836734693878e-06,
"loss": 1.8526,
"step": 308
},
{
"epoch": 0.7793190416141236,
"grad_norm": 29.04154396057129,
"learning_rate": 4.540816326530613e-06,
"loss": 1.8913,
"step": 309
},
{
"epoch": 0.7818411097099621,
"grad_norm": 29.638099670410156,
"learning_rate": 4.489795918367348e-06,
"loss": 1.8736,
"step": 310
}
],
"logging_steps": 1,
"max_steps": 397,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 10,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4524488042102784.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}