{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9927710843373494,
"eval_steps": 500,
"global_step": 621,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004819277108433735,
"grad_norm": 6.955545902252197,
"learning_rate": 1.5873015873015874e-07,
"loss": 1.0435,
"step": 1
},
{
"epoch": 0.00963855421686747,
"grad_norm": 6.793174743652344,
"learning_rate": 3.174603174603175e-07,
"loss": 1.0378,
"step": 2
},
{
"epoch": 0.014457831325301205,
"grad_norm": 6.922445774078369,
"learning_rate": 4.7619047619047623e-07,
"loss": 1.0495,
"step": 3
},
{
"epoch": 0.01927710843373494,
"grad_norm": 6.903090000152588,
"learning_rate": 6.34920634920635e-07,
"loss": 1.0422,
"step": 4
},
{
"epoch": 0.024096385542168676,
"grad_norm": 6.824069499969482,
"learning_rate": 7.936507936507937e-07,
"loss": 1.0456,
"step": 5
},
{
"epoch": 0.02891566265060241,
"grad_norm": 6.71932315826416,
"learning_rate": 9.523809523809525e-07,
"loss": 1.0416,
"step": 6
},
{
"epoch": 0.033734939759036145,
"grad_norm": 6.264998912811279,
"learning_rate": 1.111111111111111e-06,
"loss": 1.0182,
"step": 7
},
{
"epoch": 0.03855421686746988,
"grad_norm": 6.157326698303223,
"learning_rate": 1.26984126984127e-06,
"loss": 1.0122,
"step": 8
},
{
"epoch": 0.043373493975903614,
"grad_norm": 5.046329498291016,
"learning_rate": 1.4285714285714286e-06,
"loss": 1.002,
"step": 9
},
{
"epoch": 0.04819277108433735,
"grad_norm": 4.785087585449219,
"learning_rate": 1.5873015873015873e-06,
"loss": 0.9743,
"step": 10
},
{
"epoch": 0.05301204819277108,
"grad_norm": 4.505481719970703,
"learning_rate": 1.746031746031746e-06,
"loss": 0.9795,
"step": 11
},
{
"epoch": 0.05783132530120482,
"grad_norm": 3.0111618041992188,
"learning_rate": 1.904761904761905e-06,
"loss": 0.9396,
"step": 12
},
{
"epoch": 0.06265060240963856,
"grad_norm": 2.865196466445923,
"learning_rate": 2.0634920634920634e-06,
"loss": 0.9502,
"step": 13
},
{
"epoch": 0.06746987951807229,
"grad_norm": 2.732715368270874,
"learning_rate": 2.222222222222222e-06,
"loss": 0.9336,
"step": 14
},
{
"epoch": 0.07228915662650602,
"grad_norm": 2.576692581176758,
"learning_rate": 2.380952380952381e-06,
"loss": 0.933,
"step": 15
},
{
"epoch": 0.07710843373493977,
"grad_norm": 3.3130340576171875,
"learning_rate": 2.53968253968254e-06,
"loss": 0.9039,
"step": 16
},
{
"epoch": 0.0819277108433735,
"grad_norm": 3.8548290729522705,
"learning_rate": 2.6984126984126986e-06,
"loss": 0.8975,
"step": 17
},
{
"epoch": 0.08674698795180723,
"grad_norm": 3.933690071105957,
"learning_rate": 2.8571428571428573e-06,
"loss": 0.9065,
"step": 18
},
{
"epoch": 0.09156626506024096,
"grad_norm": 3.6290793418884277,
"learning_rate": 3.015873015873016e-06,
"loss": 0.8772,
"step": 19
},
{
"epoch": 0.0963855421686747,
"grad_norm": 3.045029878616333,
"learning_rate": 3.1746031746031746e-06,
"loss": 0.8738,
"step": 20
},
{
"epoch": 0.10120481927710843,
"grad_norm": 2.285907506942749,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.8443,
"step": 21
},
{
"epoch": 0.10602409638554217,
"grad_norm": 1.6617300510406494,
"learning_rate": 3.492063492063492e-06,
"loss": 0.8385,
"step": 22
},
{
"epoch": 0.1108433734939759,
"grad_norm": 1.595490574836731,
"learning_rate": 3.6507936507936507e-06,
"loss": 0.8201,
"step": 23
},
{
"epoch": 0.11566265060240964,
"grad_norm": 1.3998520374298096,
"learning_rate": 3.80952380952381e-06,
"loss": 0.8115,
"step": 24
},
{
"epoch": 0.12048192771084337,
"grad_norm": 1.24544358253479,
"learning_rate": 3.968253968253968e-06,
"loss": 0.8131,
"step": 25
},
{
"epoch": 0.12530120481927712,
"grad_norm": 1.094412088394165,
"learning_rate": 4.126984126984127e-06,
"loss": 0.7769,
"step": 26
},
{
"epoch": 0.13012048192771083,
"grad_norm": 1.0416014194488525,
"learning_rate": 4.2857142857142855e-06,
"loss": 0.7739,
"step": 27
},
{
"epoch": 0.13493975903614458,
"grad_norm": 1.002666711807251,
"learning_rate": 4.444444444444444e-06,
"loss": 0.7797,
"step": 28
},
{
"epoch": 0.13975903614457832,
"grad_norm": 0.9590147137641907,
"learning_rate": 4.603174603174604e-06,
"loss": 0.7592,
"step": 29
},
{
"epoch": 0.14457831325301204,
"grad_norm": 0.8477379679679871,
"learning_rate": 4.761904761904762e-06,
"loss": 0.755,
"step": 30
},
{
"epoch": 0.1493975903614458,
"grad_norm": 0.8571174740791321,
"learning_rate": 4.920634920634921e-06,
"loss": 0.7514,
"step": 31
},
{
"epoch": 0.15421686746987953,
"grad_norm": 0.8876820802688599,
"learning_rate": 5.07936507936508e-06,
"loss": 0.7371,
"step": 32
},
{
"epoch": 0.15903614457831325,
"grad_norm": 0.8745198249816895,
"learning_rate": 5.2380952380952384e-06,
"loss": 0.7375,
"step": 33
},
{
"epoch": 0.163855421686747,
"grad_norm": 0.94056236743927,
"learning_rate": 5.396825396825397e-06,
"loss": 0.7282,
"step": 34
},
{
"epoch": 0.1686746987951807,
"grad_norm": 0.7261127829551697,
"learning_rate": 5.555555555555557e-06,
"loss": 0.7217,
"step": 35
},
{
"epoch": 0.17349397590361446,
"grad_norm": 0.7808772325515747,
"learning_rate": 5.7142857142857145e-06,
"loss": 0.7069,
"step": 36
},
{
"epoch": 0.1783132530120482,
"grad_norm": 0.8928313255310059,
"learning_rate": 5.873015873015874e-06,
"loss": 0.7165,
"step": 37
},
{
"epoch": 0.18313253012048192,
"grad_norm": 0.7088349461555481,
"learning_rate": 6.031746031746032e-06,
"loss": 0.7122,
"step": 38
},
{
"epoch": 0.18795180722891566,
"grad_norm": 0.8545775413513184,
"learning_rate": 6.1904761904761914e-06,
"loss": 0.7151,
"step": 39
},
{
"epoch": 0.1927710843373494,
"grad_norm": 0.8626333475112915,
"learning_rate": 6.349206349206349e-06,
"loss": 0.7127,
"step": 40
},
{
"epoch": 0.19759036144578312,
"grad_norm": 0.7708625197410583,
"learning_rate": 6.507936507936509e-06,
"loss": 0.6916,
"step": 41
},
{
"epoch": 0.20240963855421687,
"grad_norm": 0.6877345442771912,
"learning_rate": 6.666666666666667e-06,
"loss": 0.707,
"step": 42
},
{
"epoch": 0.20722891566265061,
"grad_norm": 0.6759275197982788,
"learning_rate": 6.825396825396826e-06,
"loss": 0.6822,
"step": 43
},
{
"epoch": 0.21204819277108433,
"grad_norm": 0.9545477032661438,
"learning_rate": 6.984126984126984e-06,
"loss": 0.6943,
"step": 44
},
{
"epoch": 0.21686746987951808,
"grad_norm": 0.789354681968689,
"learning_rate": 7.1428571428571436e-06,
"loss": 0.6799,
"step": 45
},
{
"epoch": 0.2216867469879518,
"grad_norm": 0.6081812381744385,
"learning_rate": 7.301587301587301e-06,
"loss": 0.6824,
"step": 46
},
{
"epoch": 0.22650602409638554,
"grad_norm": 0.8076633214950562,
"learning_rate": 7.460317460317461e-06,
"loss": 0.6701,
"step": 47
},
{
"epoch": 0.23132530120481928,
"grad_norm": 0.6392861604690552,
"learning_rate": 7.61904761904762e-06,
"loss": 0.6774,
"step": 48
},
{
"epoch": 0.236144578313253,
"grad_norm": 0.6257482767105103,
"learning_rate": 7.77777777777778e-06,
"loss": 0.6908,
"step": 49
},
{
"epoch": 0.24096385542168675,
"grad_norm": 0.6038609743118286,
"learning_rate": 7.936507936507936e-06,
"loss": 0.6784,
"step": 50
},
{
"epoch": 0.2457831325301205,
"grad_norm": 0.5643861293792725,
"learning_rate": 8.095238095238097e-06,
"loss": 0.6705,
"step": 51
},
{
"epoch": 0.25060240963855424,
"grad_norm": 0.6746996641159058,
"learning_rate": 8.253968253968254e-06,
"loss": 0.6674,
"step": 52
},
{
"epoch": 0.25542168674698795,
"grad_norm": 0.6142429113388062,
"learning_rate": 8.412698412698414e-06,
"loss": 0.6798,
"step": 53
},
{
"epoch": 0.26024096385542167,
"grad_norm": 0.48977726697921753,
"learning_rate": 8.571428571428571e-06,
"loss": 0.6678,
"step": 54
},
{
"epoch": 0.26506024096385544,
"grad_norm": 0.6775186061859131,
"learning_rate": 8.730158730158731e-06,
"loss": 0.6777,
"step": 55
},
{
"epoch": 0.26987951807228916,
"grad_norm": 0.6069589257240295,
"learning_rate": 8.888888888888888e-06,
"loss": 0.6571,
"step": 56
},
{
"epoch": 0.2746987951807229,
"grad_norm": 0.5993375778198242,
"learning_rate": 9.047619047619049e-06,
"loss": 0.6659,
"step": 57
},
{
"epoch": 0.27951807228915665,
"grad_norm": 0.5999571681022644,
"learning_rate": 9.206349206349207e-06,
"loss": 0.6586,
"step": 58
},
{
"epoch": 0.28433734939759037,
"grad_norm": 0.6867564916610718,
"learning_rate": 9.365079365079366e-06,
"loss": 0.6662,
"step": 59
},
{
"epoch": 0.2891566265060241,
"grad_norm": 0.6454688906669617,
"learning_rate": 9.523809523809525e-06,
"loss": 0.655,
"step": 60
},
{
"epoch": 0.29397590361445786,
"grad_norm": 0.6191383600234985,
"learning_rate": 9.682539682539683e-06,
"loss": 0.6641,
"step": 61
},
{
"epoch": 0.2987951807228916,
"grad_norm": 0.6252975463867188,
"learning_rate": 9.841269841269842e-06,
"loss": 0.6436,
"step": 62
},
{
"epoch": 0.3036144578313253,
"grad_norm": 0.5347315669059753,
"learning_rate": 1e-05,
"loss": 0.6542,
"step": 63
},
{
"epoch": 0.30843373493975906,
"grad_norm": 0.6288977861404419,
"learning_rate": 9.999920755303033e-06,
"loss": 0.6472,
"step": 64
},
{
"epoch": 0.3132530120481928,
"grad_norm": 0.5932735204696655,
"learning_rate": 9.999683023724021e-06,
"loss": 0.6385,
"step": 65
},
{
"epoch": 0.3180722891566265,
"grad_norm": 0.573313295841217,
"learning_rate": 9.99928681279855e-06,
"loss": 0.6504,
"step": 66
},
{
"epoch": 0.3228915662650602,
"grad_norm": 0.5847601294517517,
"learning_rate": 9.998732135085665e-06,
"loss": 0.6469,
"step": 67
},
{
"epoch": 0.327710843373494,
"grad_norm": 0.5990540385246277,
"learning_rate": 9.998019008167476e-06,
"loss": 0.6498,
"step": 68
},
{
"epoch": 0.3325301204819277,
"grad_norm": 0.5455608367919922,
"learning_rate": 9.99714745464859e-06,
"loss": 0.6425,
"step": 69
},
{
"epoch": 0.3373493975903614,
"grad_norm": 0.6302762627601624,
"learning_rate": 9.99611750215541e-06,
"loss": 0.6386,
"step": 70
},
{
"epoch": 0.3421686746987952,
"grad_norm": 0.6006896495819092,
"learning_rate": 9.994929183335237e-06,
"loss": 0.6402,
"step": 71
},
{
"epoch": 0.3469879518072289,
"grad_norm": 0.6036058664321899,
"learning_rate": 9.993582535855265e-06,
"loss": 0.66,
"step": 72
},
{
"epoch": 0.35180722891566263,
"grad_norm": 0.5187335014343262,
"learning_rate": 9.992077602401358e-06,
"loss": 0.6471,
"step": 73
},
{
"epoch": 0.3566265060240964,
"grad_norm": 0.6480401754379272,
"learning_rate": 9.990414430676716e-06,
"loss": 0.6432,
"step": 74
},
{
"epoch": 0.3614457831325301,
"grad_norm": 0.5152376294136047,
"learning_rate": 9.988593073400354e-06,
"loss": 0.6351,
"step": 75
},
{
"epoch": 0.36626506024096384,
"grad_norm": 0.5486935377120972,
"learning_rate": 9.986613588305435e-06,
"loss": 0.6307,
"step": 76
},
{
"epoch": 0.3710843373493976,
"grad_norm": 0.6851136684417725,
"learning_rate": 9.984476038137437e-06,
"loss": 0.6366,
"step": 77
},
{
"epoch": 0.3759036144578313,
"grad_norm": 0.5467675924301147,
"learning_rate": 9.982180490652165e-06,
"loss": 0.6366,
"step": 78
},
{
"epoch": 0.38072289156626504,
"grad_norm": 0.7552728652954102,
"learning_rate": 9.979727018613607e-06,
"loss": 0.6286,
"step": 79
},
{
"epoch": 0.3855421686746988,
"grad_norm": 0.651406466960907,
"learning_rate": 9.977115699791622e-06,
"loss": 0.6359,
"step": 80
},
{
"epoch": 0.39036144578313253,
"grad_norm": 0.7135255336761475,
"learning_rate": 9.974346616959476e-06,
"loss": 0.6327,
"step": 81
},
{
"epoch": 0.39518072289156625,
"grad_norm": 0.6827096343040466,
"learning_rate": 9.971419857891223e-06,
"loss": 0.6181,
"step": 82
},
{
"epoch": 0.4,
"grad_norm": 0.6823281645774841,
"learning_rate": 9.968335515358916e-06,
"loss": 0.6377,
"step": 83
},
{
"epoch": 0.40481927710843374,
"grad_norm": 0.5493991374969482,
"learning_rate": 9.965093687129669e-06,
"loss": 0.6391,
"step": 84
},
{
"epoch": 0.40963855421686746,
"grad_norm": 0.6720536351203918,
"learning_rate": 9.961694475962562e-06,
"loss": 0.6331,
"step": 85
},
{
"epoch": 0.41445783132530123,
"grad_norm": 0.6092298030853271,
"learning_rate": 9.95813798960538e-06,
"loss": 0.6341,
"step": 86
},
{
"epoch": 0.41927710843373495,
"grad_norm": 0.5759454369544983,
"learning_rate": 9.954424340791195e-06,
"loss": 0.633,
"step": 87
},
{
"epoch": 0.42409638554216866,
"grad_norm": 0.6757510900497437,
"learning_rate": 9.950553647234798e-06,
"loss": 0.6243,
"step": 88
},
{
"epoch": 0.42891566265060244,
"grad_norm": 0.5293172597885132,
"learning_rate": 9.94652603162896e-06,
"loss": 0.6424,
"step": 89
},
{
"epoch": 0.43373493975903615,
"grad_norm": 0.663861870765686,
"learning_rate": 9.942341621640558e-06,
"loss": 0.6409,
"step": 90
},
{
"epoch": 0.43855421686746987,
"grad_norm": 0.5438668727874756,
"learning_rate": 9.938000549906509e-06,
"loss": 0.6391,
"step": 91
},
{
"epoch": 0.4433734939759036,
"grad_norm": 0.6359225511550903,
"learning_rate": 9.93350295402958e-06,
"loss": 0.6288,
"step": 92
},
{
"epoch": 0.44819277108433736,
"grad_norm": 0.5770338773727417,
"learning_rate": 9.92884897657402e-06,
"loss": 0.6114,
"step": 93
},
{
"epoch": 0.4530120481927711,
"grad_norm": 0.5579841136932373,
"learning_rate": 9.924038765061042e-06,
"loss": 0.6137,
"step": 94
},
{
"epoch": 0.4578313253012048,
"grad_norm": 0.597624659538269,
"learning_rate": 9.919072471964146e-06,
"loss": 0.6455,
"step": 95
},
{
"epoch": 0.46265060240963857,
"grad_norm": 0.5436515212059021,
"learning_rate": 9.913950254704291e-06,
"loss": 0.6274,
"step": 96
},
{
"epoch": 0.4674698795180723,
"grad_norm": 0.5492904782295227,
"learning_rate": 9.908672275644898e-06,
"loss": 0.6291,
"step": 97
},
{
"epoch": 0.472289156626506,
"grad_norm": 0.5357149839401245,
"learning_rate": 9.903238702086707e-06,
"loss": 0.6422,
"step": 98
},
{
"epoch": 0.4771084337349398,
"grad_norm": 0.5629865527153015,
"learning_rate": 9.897649706262474e-06,
"loss": 0.636,
"step": 99
},
{
"epoch": 0.4819277108433735,
"grad_norm": 0.5739107728004456,
"learning_rate": 9.89190546533151e-06,
"loss": 0.6261,
"step": 100
},
{
"epoch": 0.4867469879518072,
"grad_norm": 0.600075900554657,
"learning_rate": 9.88600616137407e-06,
"loss": 0.6206,
"step": 101
},
{
"epoch": 0.491566265060241,
"grad_norm": 0.562242329120636,
"learning_rate": 9.879951981385577e-06,
"loss": 0.6263,
"step": 102
},
{
"epoch": 0.4963855421686747,
"grad_norm": 0.5759828686714172,
"learning_rate": 9.873743117270691e-06,
"loss": 0.6287,
"step": 103
},
{
"epoch": 0.5012048192771085,
"grad_norm": 0.7623122334480286,
"learning_rate": 9.867379765837237e-06,
"loss": 0.6112,
"step": 104
},
{
"epoch": 0.5060240963855421,
"grad_norm": 0.5862762928009033,
"learning_rate": 9.860862128789954e-06,
"loss": 0.63,
"step": 105
},
{
"epoch": 0.5108433734939759,
"grad_norm": 0.848891019821167,
"learning_rate": 9.854190412724114e-06,
"loss": 0.6322,
"step": 106
},
{
"epoch": 0.5156626506024097,
"grad_norm": 0.5686174035072327,
"learning_rate": 9.847364829118963e-06,
"loss": 0.6148,
"step": 107
},
{
"epoch": 0.5204819277108433,
"grad_norm": 0.772788941860199,
"learning_rate": 9.840385594331022e-06,
"loss": 0.62,
"step": 108
},
{
"epoch": 0.5253012048192771,
"grad_norm": 0.7146121263504028,
"learning_rate": 9.833252929587231e-06,
"loss": 0.6251,
"step": 109
},
{
"epoch": 0.5301204819277109,
"grad_norm": 0.6730661988258362,
"learning_rate": 9.825967060977933e-06,
"loss": 0.6231,
"step": 110
},
{
"epoch": 0.5349397590361445,
"grad_norm": 0.6394177675247192,
"learning_rate": 9.818528219449705e-06,
"loss": 0.6188,
"step": 111
},
{
"epoch": 0.5397590361445783,
"grad_norm": 0.6056435108184814,
"learning_rate": 9.810936640798046e-06,
"loss": 0.6376,
"step": 112
},
{
"epoch": 0.5445783132530121,
"grad_norm": 0.5977922677993774,
"learning_rate": 9.803192565659898e-06,
"loss": 0.6179,
"step": 113
},
{
"epoch": 0.5493975903614458,
"grad_norm": 0.6605879664421082,
"learning_rate": 9.795296239506011e-06,
"loss": 0.6337,
"step": 114
},
{
"epoch": 0.5542168674698795,
"grad_norm": 0.5492590069770813,
"learning_rate": 9.78724791263318e-06,
"loss": 0.6205,
"step": 115
},
{
"epoch": 0.5590361445783133,
"grad_norm": 0.6682798862457275,
"learning_rate": 9.779047840156288e-06,
"loss": 0.6102,
"step": 116
},
{
"epoch": 0.563855421686747,
"grad_norm": 0.6884996294975281,
"learning_rate": 9.770696282000245e-06,
"loss": 0.6063,
"step": 117
},
{
"epoch": 0.5686746987951807,
"grad_norm": 0.6539535522460938,
"learning_rate": 9.762193502891726e-06,
"loss": 0.6193,
"step": 118
},
{
"epoch": 0.5734939759036145,
"grad_norm": 0.5900925397872925,
"learning_rate": 9.753539772350792e-06,
"loss": 0.6113,
"step": 119
},
{
"epoch": 0.5783132530120482,
"grad_norm": 0.7695890069007874,
"learning_rate": 9.744735364682347e-06,
"loss": 0.6219,
"step": 120
},
{
"epoch": 0.5831325301204819,
"grad_norm": 0.6271756887435913,
"learning_rate": 9.735780558967434e-06,
"loss": 0.603,
"step": 121
},
{
"epoch": 0.5879518072289157,
"grad_norm": 0.667735755443573,
"learning_rate": 9.726675639054403e-06,
"loss": 0.616,
"step": 122
},
{
"epoch": 0.5927710843373494,
"grad_norm": 0.5626581311225891,
"learning_rate": 9.717420893549902e-06,
"loss": 0.6251,
"step": 123
},
{
"epoch": 0.5975903614457831,
"grad_norm": 0.5545538067817688,
"learning_rate": 9.70801661580973e-06,
"loss": 0.6165,
"step": 124
},
{
"epoch": 0.6024096385542169,
"grad_norm": 0.5270641446113586,
"learning_rate": 9.698463103929542e-06,
"loss": 0.6012,
"step": 125
},
{
"epoch": 0.6072289156626506,
"grad_norm": 0.6241425275802612,
"learning_rate": 9.688760660735403e-06,
"loss": 0.6072,
"step": 126
},
{
"epoch": 0.6120481927710844,
"grad_norm": 0.5785877108573914,
"learning_rate": 9.67890959377418e-06,
"loss": 0.6165,
"step": 127
},
{
"epoch": 0.6168674698795181,
"grad_norm": 0.6360320448875427,
"learning_rate": 9.668910215303797e-06,
"loss": 0.6112,
"step": 128
},
{
"epoch": 0.6216867469879518,
"grad_norm": 0.5739641189575195,
"learning_rate": 9.658762842283343e-06,
"loss": 0.6101,
"step": 129
},
{
"epoch": 0.6265060240963856,
"grad_norm": 0.698076605796814,
"learning_rate": 9.648467796363019e-06,
"loss": 0.6136,
"step": 130
},
{
"epoch": 0.6313253012048192,
"grad_norm": 0.6175894141197205,
"learning_rate": 9.638025403873939e-06,
"loss": 0.6188,
"step": 131
},
{
"epoch": 0.636144578313253,
"grad_norm": 0.5730419754981995,
"learning_rate": 9.627435995817799e-06,
"loss": 0.6235,
"step": 132
},
{
"epoch": 0.6409638554216868,
"grad_norm": 0.6372531056404114,
"learning_rate": 9.616699907856368e-06,
"loss": 0.6054,
"step": 133
},
{
"epoch": 0.6457831325301204,
"grad_norm": 0.5383384823799133,
"learning_rate": 9.605817480300863e-06,
"loss": 0.6066,
"step": 134
},
{
"epoch": 0.6506024096385542,
"grad_norm": 0.6308354735374451,
"learning_rate": 9.594789058101154e-06,
"loss": 0.6084,
"step": 135
},
{
"epoch": 0.655421686746988,
"grad_norm": 0.5583477020263672,
"learning_rate": 9.58361499083483e-06,
"loss": 0.6187,
"step": 136
},
{
"epoch": 0.6602409638554216,
"grad_norm": 0.5494464039802551,
"learning_rate": 9.57229563269612e-06,
"loss": 0.6177,
"step": 137
},
{
"epoch": 0.6650602409638554,
"grad_norm": 0.5109444856643677,
"learning_rate": 9.560831342484668e-06,
"loss": 0.6171,
"step": 138
},
{
"epoch": 0.6698795180722892,
"grad_norm": 0.6080160737037659,
"learning_rate": 9.549222483594154e-06,
"loss": 0.6126,
"step": 139
},
{
"epoch": 0.6746987951807228,
"grad_norm": 0.5663288235664368,
"learning_rate": 9.53746942400078e-06,
"loss": 0.616,
"step": 140
},
{
"epoch": 0.6795180722891566,
"grad_norm": 0.5445743203163147,
"learning_rate": 9.525572536251608e-06,
"loss": 0.6093,
"step": 141
},
{
"epoch": 0.6843373493975904,
"grad_norm": 0.6263227462768555,
"learning_rate": 9.513532197452737e-06,
"loss": 0.6118,
"step": 142
},
{
"epoch": 0.689156626506024,
"grad_norm": 0.6303161978721619,
"learning_rate": 9.501348789257373e-06,
"loss": 0.6136,
"step": 143
},
{
"epoch": 0.6939759036144578,
"grad_norm": 0.5429930686950684,
"learning_rate": 9.48902269785371e-06,
"loss": 0.6039,
"step": 144
},
{
"epoch": 0.6987951807228916,
"grad_norm": 0.6205868124961853,
"learning_rate": 9.476554313952697e-06,
"loss": 0.6216,
"step": 145
},
{
"epoch": 0.7036144578313253,
"grad_norm": 0.5463981032371521,
"learning_rate": 9.46394403277566e-06,
"loss": 0.613,
"step": 146
},
{
"epoch": 0.708433734939759,
"grad_norm": 0.5573562979698181,
"learning_rate": 9.451192254041759e-06,
"loss": 0.6074,
"step": 147
},
{
"epoch": 0.7132530120481928,
"grad_norm": 0.6575038433074951,
"learning_rate": 9.438299381955333e-06,
"loss": 0.6174,
"step": 148
},
{
"epoch": 0.7180722891566265,
"grad_norm": 0.5069466233253479,
"learning_rate": 9.425265825193077e-06,
"loss": 0.5937,
"step": 149
},
{
"epoch": 0.7228915662650602,
"grad_norm": 0.5574111938476562,
"learning_rate": 9.412091996891097e-06,
"loss": 0.6242,
"step": 150
},
{
"epoch": 0.727710843373494,
"grad_norm": 0.5976607799530029,
"learning_rate": 9.398778314631801e-06,
"loss": 0.6035,
"step": 151
},
{
"epoch": 0.7325301204819277,
"grad_norm": 0.5757405161857605,
"learning_rate": 9.385325200430679e-06,
"loss": 0.6024,
"step": 152
},
{
"epoch": 0.7373493975903614,
"grad_norm": 0.5733328461647034,
"learning_rate": 9.371733080722911e-06,
"loss": 0.6242,
"step": 153
},
{
"epoch": 0.7421686746987952,
"grad_norm": 0.600737452507019,
"learning_rate": 9.358002386349862e-06,
"loss": 0.6105,
"step": 154
},
{
"epoch": 0.7469879518072289,
"grad_norm": 0.6255360841751099,
"learning_rate": 9.34413355254542e-06,
"loss": 0.6112,
"step": 155
},
{
"epoch": 0.7518072289156627,
"grad_norm": 0.5567818880081177,
"learning_rate": 9.330127018922195e-06,
"loss": 0.5927,
"step": 156
},
{
"epoch": 0.7566265060240964,
"grad_norm": 0.7106531262397766,
"learning_rate": 9.31598322945759e-06,
"loss": 0.6112,
"step": 157
},
{
"epoch": 0.7614457831325301,
"grad_norm": 0.5977474451065063,
"learning_rate": 9.301702632479734e-06,
"loss": 0.6084,
"step": 158
},
{
"epoch": 0.7662650602409639,
"grad_norm": 0.664481520652771,
"learning_rate": 9.287285680653254e-06,
"loss": 0.5982,
"step": 159
},
{
"epoch": 0.7710843373493976,
"grad_norm": 0.6926606297492981,
"learning_rate": 9.272732830964948e-06,
"loss": 0.6083,
"step": 160
},
{
"epoch": 0.7759036144578313,
"grad_norm": 0.49633654952049255,
"learning_rate": 9.258044544709276e-06,
"loss": 0.6157,
"step": 161
},
{
"epoch": 0.7807228915662651,
"grad_norm": 0.6019198894500732,
"learning_rate": 9.243221287473755e-06,
"loss": 0.6017,
"step": 162
},
{
"epoch": 0.7855421686746988,
"grad_norm": 0.5977044105529785,
"learning_rate": 9.228263529124199e-06,
"loss": 0.6164,
"step": 163
},
{
"epoch": 0.7903614457831325,
"grad_norm": 0.6039947867393494,
"learning_rate": 9.21317174378982e-06,
"loss": 0.5977,
"step": 164
},
{
"epoch": 0.7951807228915663,
"grad_norm": 0.5924364328384399,
"learning_rate": 9.197946409848196e-06,
"loss": 0.591,
"step": 165
},
{
"epoch": 0.8,
"grad_norm": 0.5773972868919373,
"learning_rate": 9.182588009910119e-06,
"loss": 0.6025,
"step": 166
},
{
"epoch": 0.8048192771084337,
"grad_norm": 0.5809570550918579,
"learning_rate": 9.167097030804289e-06,
"loss": 0.6091,
"step": 167
},
{
"epoch": 0.8096385542168675,
"grad_norm": 0.6400211453437805,
"learning_rate": 9.151473963561884e-06,
"loss": 0.5995,
"step": 168
},
{
"epoch": 0.8144578313253013,
"grad_norm": 0.5365077257156372,
"learning_rate": 9.135719303400995e-06,
"loss": 0.6015,
"step": 169
},
{
"epoch": 0.8192771084337349,
"grad_norm": 0.5059504508972168,
"learning_rate": 9.119833549710927e-06,
"loss": 0.5981,
"step": 170
},
{
"epoch": 0.8240963855421687,
"grad_norm": 0.6700984835624695,
"learning_rate": 9.103817206036383e-06,
"loss": 0.623,
"step": 171
},
{
"epoch": 0.8289156626506025,
"grad_norm": 0.48179715871810913,
"learning_rate": 9.087670780061477e-06,
"loss": 0.6016,
"step": 172
},
{
"epoch": 0.8337349397590361,
"grad_norm": 0.571553647518158,
"learning_rate": 9.071394783593664e-06,
"loss": 0.5971,
"step": 173
},
{
"epoch": 0.8385542168674699,
"grad_norm": 0.5676701664924622,
"learning_rate": 9.054989732547507e-06,
"loss": 0.5975,
"step": 174
},
{
"epoch": 0.8433734939759037,
"grad_norm": 0.5889990925788879,
"learning_rate": 9.038456146928325e-06,
"loss": 0.6008,
"step": 175
},
{
"epoch": 0.8481927710843373,
"grad_norm": 0.5731596946716309,
"learning_rate": 9.021794550815713e-06,
"loss": 0.6061,
"step": 176
},
{
"epoch": 0.8530120481927711,
"grad_norm": 0.5482961535453796,
"learning_rate": 9.005005472346923e-06,
"loss": 0.621,
"step": 177
},
{
"epoch": 0.8578313253012049,
"grad_norm": 0.5580418109893799,
"learning_rate": 8.988089443700131e-06,
"loss": 0.5861,
"step": 178
},
{
"epoch": 0.8626506024096385,
"grad_norm": 0.6586818099021912,
"learning_rate": 8.971047001077561e-06,
"loss": 0.5879,
"step": 179
},
{
"epoch": 0.8674698795180723,
"grad_norm": 0.5529095530509949,
"learning_rate": 8.953878684688492e-06,
"loss": 0.5908,
"step": 180
},
{
"epoch": 0.8722891566265061,
"grad_norm": 0.7042589783668518,
"learning_rate": 8.936585038732143e-06,
"loss": 0.6026,
"step": 181
},
{
"epoch": 0.8771084337349397,
"grad_norm": 0.5728626251220703,
"learning_rate": 8.919166611380397e-06,
"loss": 0.5951,
"step": 182
},
{
"epoch": 0.8819277108433735,
"grad_norm": 0.6059430837631226,
"learning_rate": 8.90162395476046e-06,
"loss": 0.6004,
"step": 183
},
{
"epoch": 0.8867469879518072,
"grad_norm": 0.5582807660102844,
"learning_rate": 8.883957624937333e-06,
"loss": 0.5945,
"step": 184
},
{
"epoch": 0.891566265060241,
"grad_norm": 0.5483564138412476,
"learning_rate": 8.866168181896198e-06,
"loss": 0.5853,
"step": 185
},
{
"epoch": 0.8963855421686747,
"grad_norm": 0.5057897567749023,
"learning_rate": 8.848256189524661e-06,
"loss": 0.5998,
"step": 186
},
{
"epoch": 0.9012048192771084,
"grad_norm": 0.5957621932029724,
"learning_rate": 8.83022221559489e-06,
"loss": 0.6038,
"step": 187
},
{
"epoch": 0.9060240963855422,
"grad_norm": 0.6007516980171204,
"learning_rate": 8.812066831745602e-06,
"loss": 0.5897,
"step": 188
},
{
"epoch": 0.9108433734939759,
"grad_norm": 0.603797435760498,
"learning_rate": 8.793790613463956e-06,
"loss": 0.6006,
"step": 189
},
{
"epoch": 0.9156626506024096,
"grad_norm": 0.6564244627952576,
"learning_rate": 8.775394140067299e-06,
"loss": 0.597,
"step": 190
},
{
"epoch": 0.9204819277108434,
"grad_norm": 0.7330297231674194,
"learning_rate": 8.756877994684818e-06,
"loss": 0.5946,
"step": 191
},
{
"epoch": 0.9253012048192771,
"grad_norm": 0.509863555431366,
"learning_rate": 8.738242764239046e-06,
"loss": 0.6,
"step": 192
},
{
"epoch": 0.9301204819277108,
"grad_norm": 0.6488967537879944,
"learning_rate": 8.719489039427256e-06,
"loss": 0.6097,
"step": 193
},
{
"epoch": 0.9349397590361446,
"grad_norm": 0.5334682464599609,
"learning_rate": 8.700617414702746e-06,
"loss": 0.6196,
"step": 194
},
{
"epoch": 0.9397590361445783,
"grad_norm": 0.4760792851448059,
"learning_rate": 8.681628488255986e-06,
"loss": 0.6079,
"step": 195
},
{
"epoch": 0.944578313253012,
"grad_norm": 0.5883437395095825,
"learning_rate": 8.66252286199567e-06,
"loss": 0.6005,
"step": 196
},
{
"epoch": 0.9493975903614458,
"grad_norm": 0.5796838998794556,
"learning_rate": 8.643301141529619e-06,
"loss": 0.5867,
"step": 197
},
{
"epoch": 0.9542168674698795,
"grad_norm": 0.6529198884963989,
"learning_rate": 8.6239639361456e-06,
"loss": 0.5948,
"step": 198
},
{
"epoch": 0.9590361445783132,
"grad_norm": 0.5081263184547424,
"learning_rate": 8.604511858792006e-06,
"loss": 0.6159,
"step": 199
},
{
"epoch": 0.963855421686747,
"grad_norm": 0.5447860360145569,
"learning_rate": 8.584945526058426e-06,
"loss": 0.6027,
"step": 200
},
{
"epoch": 0.9686746987951808,
"grad_norm": 0.5446210503578186,
"learning_rate": 8.565265558156101e-06,
"loss": 0.586,
"step": 201
},
{
"epoch": 0.9734939759036144,
"grad_norm": 0.5749803781509399,
"learning_rate": 8.545472578898276e-06,
"loss": 0.6034,
"step": 202
},
{
"epoch": 0.9783132530120482,
"grad_norm": 0.539688229560852,
"learning_rate": 8.525567215680397e-06,
"loss": 0.6044,
"step": 203
},
{
"epoch": 0.983132530120482,
"grad_norm": 0.7089233994483948,
"learning_rate": 8.505550099460264e-06,
"loss": 0.6023,
"step": 204
},
{
"epoch": 0.9879518072289156,
"grad_norm": 0.5110678672790527,
"learning_rate": 8.485421864737997e-06,
"loss": 0.5975,
"step": 205
},
{
"epoch": 0.9927710843373494,
"grad_norm": 0.6963241696357727,
"learning_rate": 8.465183149535939e-06,
"loss": 0.5997,
"step": 206
},
{
"epoch": 0.9975903614457832,
"grad_norm": 0.5199764370918274,
"learning_rate": 8.444834595378434e-06,
"loss": 0.5887,
"step": 207
},
{
"epoch": 1.002409638554217,
"grad_norm": 1.0180178880691528,
"learning_rate": 8.424376847271483e-06,
"loss": 0.8314,
"step": 208
},
{
"epoch": 1.0072289156626506,
"grad_norm": 0.6434845924377441,
"learning_rate": 8.403810553682307e-06,
"loss": 0.6377,
"step": 209
},
{
"epoch": 1.0120481927710843,
"grad_norm": 0.5937601327896118,
"learning_rate": 8.383136366518788e-06,
"loss": 0.5636,
"step": 210
},
{
"epoch": 1.0168674698795181,
"grad_norm": 0.5739064812660217,
"learning_rate": 8.362354941108803e-06,
"loss": 0.5732,
"step": 211
},
{
"epoch": 1.0216867469879518,
"grad_norm": 0.6371908783912659,
"learning_rate": 8.341466936179457e-06,
"loss": 0.5808,
"step": 212
},
{
"epoch": 1.0265060240963855,
"grad_norm": 0.5970191359519958,
"learning_rate": 8.320473013836197e-06,
"loss": 0.5793,
"step": 213
},
{
"epoch": 1.0313253012048194,
"grad_norm": 0.6064131259918213,
"learning_rate": 8.299373839541829e-06,
"loss": 0.5324,
"step": 214
},
{
"epoch": 1.036144578313253,
"grad_norm": 0.7945104837417603,
"learning_rate": 8.278170082095422e-06,
"loss": 0.5765,
"step": 215
},
{
"epoch": 1.0409638554216867,
"grad_norm": 0.5729210376739502,
"learning_rate": 8.256862413611113e-06,
"loss": 0.578,
"step": 216
},
{
"epoch": 1.0457831325301206,
"grad_norm": 0.6838613152503967,
"learning_rate": 8.23545150949679e-06,
"loss": 0.5703,
"step": 217
},
{
"epoch": 1.0506024096385542,
"grad_norm": 0.5426240563392639,
"learning_rate": 8.213938048432697e-06,
"loss": 0.5667,
"step": 218
},
{
"epoch": 1.0554216867469879,
"grad_norm": 0.6620539426803589,
"learning_rate": 8.192322712349917e-06,
"loss": 0.5592,
"step": 219
},
{
"epoch": 1.0602409638554218,
"grad_norm": 0.6412476897239685,
"learning_rate": 8.170606186408756e-06,
"loss": 0.5855,
"step": 220
},
{
"epoch": 1.0650602409638554,
"grad_norm": 0.6816362142562866,
"learning_rate": 8.148789158977012e-06,
"loss": 0.5883,
"step": 221
},
{
"epoch": 1.069879518072289,
"grad_norm": 0.5668280124664307,
"learning_rate": 8.126872321608185e-06,
"loss": 0.5248,
"step": 222
},
{
"epoch": 1.074698795180723,
"grad_norm": 0.5772892832756042,
"learning_rate": 8.104856369019525e-06,
"loss": 0.6062,
"step": 223
},
{
"epoch": 1.0795180722891566,
"grad_norm": 0.7769073247909546,
"learning_rate": 8.08274199907003e-06,
"loss": 0.5695,
"step": 224
},
{
"epoch": 1.0843373493975903,
"grad_norm": 0.5870208740234375,
"learning_rate": 8.060529912738316e-06,
"loss": 0.543,
"step": 225
},
{
"epoch": 1.0891566265060242,
"grad_norm": 0.8082980513572693,
"learning_rate": 8.038220814100403e-06,
"loss": 0.5556,
"step": 226
},
{
"epoch": 1.0939759036144578,
"grad_norm": 0.5669785141944885,
"learning_rate": 8.0158154103074e-06,
"loss": 0.5608,
"step": 227
},
{
"epoch": 1.0987951807228915,
"grad_norm": 0.7154496908187866,
"learning_rate": 7.993314411563075e-06,
"loss": 0.5701,
"step": 228
},
{
"epoch": 1.1036144578313254,
"grad_norm": 0.5299010276794434,
"learning_rate": 7.970718531101365e-06,
"loss": 0.5321,
"step": 229
},
{
"epoch": 1.108433734939759,
"grad_norm": 0.5549928545951843,
"learning_rate": 7.948028485163744e-06,
"loss": 0.6037,
"step": 230
},
{
"epoch": 1.1132530120481927,
"grad_norm": 0.5550109148025513,
"learning_rate": 7.925244992976538e-06,
"loss": 0.5405,
"step": 231
},
{
"epoch": 1.1180722891566266,
"grad_norm": 0.5465673208236694,
"learning_rate": 7.902368776728125e-06,
"loss": 0.5652,
"step": 232
},
{
"epoch": 1.1228915662650603,
"grad_norm": 0.5998708009719849,
"learning_rate": 7.879400561546033e-06,
"loss": 0.5237,
"step": 233
},
{
"epoch": 1.127710843373494,
"grad_norm": 0.5261906981468201,
"learning_rate": 7.856341075473963e-06,
"loss": 0.5947,
"step": 234
},
{
"epoch": 1.1325301204819278,
"grad_norm": 0.5877994894981384,
"learning_rate": 7.833191049448706e-06,
"loss": 0.5326,
"step": 235
},
{
"epoch": 1.1373493975903615,
"grad_norm": 0.49554064869880676,
"learning_rate": 7.809951217276986e-06,
"loss": 0.5494,
"step": 236
},
{
"epoch": 1.1421686746987951,
"grad_norm": 0.6176630258560181,
"learning_rate": 7.786622315612182e-06,
"loss": 0.6198,
"step": 237
},
{
"epoch": 1.146987951807229,
"grad_norm": 0.5112401247024536,
"learning_rate": 7.763205083930995e-06,
"loss": 0.5629,
"step": 238
},
{
"epoch": 1.1518072289156627,
"grad_norm": 0.5183048844337463,
"learning_rate": 7.739700264509993e-06,
"loss": 0.5139,
"step": 239
},
{
"epoch": 1.1566265060240963,
"grad_norm": 0.5413950681686401,
"learning_rate": 7.716108602402094e-06,
"loss": 0.6041,
"step": 240
},
{
"epoch": 1.16144578313253,
"grad_norm": 0.526046872138977,
"learning_rate": 7.692430845412946e-06,
"loss": 0.5592,
"step": 241
},
{
"epoch": 1.1662650602409639,
"grad_norm": 0.5707272887229919,
"learning_rate": 7.668667744077215e-06,
"loss": 0.546,
"step": 242
},
{
"epoch": 1.1710843373493975,
"grad_norm": 0.6153143048286438,
"learning_rate": 7.644820051634813e-06,
"loss": 0.561,
"step": 243
},
{
"epoch": 1.1759036144578312,
"grad_norm": 0.5286562442779541,
"learning_rate": 7.6208885240069995e-06,
"loss": 0.5251,
"step": 244
},
{
"epoch": 1.180722891566265,
"grad_norm": 0.5454757809638977,
"learning_rate": 7.596873919772438e-06,
"loss": 0.5976,
"step": 245
},
{
"epoch": 1.1855421686746987,
"grad_norm": 0.6441790461540222,
"learning_rate": 7.572777000143145e-06,
"loss": 0.5747,
"step": 246
},
{
"epoch": 1.1903614457831324,
"grad_norm": 0.5183297395706177,
"learning_rate": 7.548598528940354e-06,
"loss": 0.4837,
"step": 247
},
{
"epoch": 1.1951807228915663,
"grad_norm": 0.5835433006286621,
"learning_rate": 7.524339272570317e-06,
"loss": 0.6416,
"step": 248
},
{
"epoch": 1.2,
"grad_norm": 0.5134496092796326,
"learning_rate": 7.500000000000001e-06,
"loss": 0.54,
"step": 249
},
{
"epoch": 1.2048192771084336,
"grad_norm": 0.5053899884223938,
"learning_rate": 7.475581482732717e-06,
"loss": 0.5513,
"step": 250
},
{
"epoch": 1.2096385542168675,
"grad_norm": 0.5715299844741821,
"learning_rate": 7.451084494783668e-06,
"loss": 0.5604,
"step": 251
},
{
"epoch": 1.2144578313253012,
"grad_norm": 0.5025492310523987,
"learning_rate": 7.4265098126554065e-06,
"loss": 0.5426,
"step": 252
},
{
"epoch": 1.2192771084337348,
"grad_norm": 0.5954846739768982,
"learning_rate": 7.401858215313228e-06,
"loss": 0.5872,
"step": 253
},
{
"epoch": 1.2240963855421687,
"grad_norm": 0.4413267970085144,
"learning_rate": 7.3771304841604764e-06,
"loss": 0.5803,
"step": 254
},
{
"epoch": 1.2289156626506024,
"grad_norm": 0.5452926754951477,
"learning_rate": 7.352327403013779e-06,
"loss": 0.5996,
"step": 255
},
{
"epoch": 1.233734939759036,
"grad_norm": 0.4635935127735138,
"learning_rate": 7.327449758078194e-06,
"loss": 0.5109,
"step": 256
},
{
"epoch": 1.23855421686747,
"grad_norm": 0.49783962965011597,
"learning_rate": 7.302498337922293e-06,
"loss": 0.5357,
"step": 257
},
{
"epoch": 1.2433734939759036,
"grad_norm": 0.5647582411766052,
"learning_rate": 7.27747393345317e-06,
"loss": 0.5724,
"step": 258
},
{
"epoch": 1.2481927710843372,
"grad_norm": 0.49849966168403625,
"learning_rate": 7.2523773378913655e-06,
"loss": 0.5722,
"step": 259
},
{
"epoch": 1.2530120481927711,
"grad_norm": 0.5430841445922852,
"learning_rate": 7.2272093467457226e-06,
"loss": 0.5995,
"step": 260
},
{
"epoch": 1.2578313253012048,
"grad_norm": 0.583263099193573,
"learning_rate": 7.201970757788172e-06,
"loss": 0.5915,
"step": 261
},
{
"epoch": 1.2626506024096384,
"grad_norm": 0.49482282996177673,
"learning_rate": 7.17666237102845e-06,
"loss": 0.5607,
"step": 262
},
{
"epoch": 1.2674698795180723,
"grad_norm": 0.5552793741226196,
"learning_rate": 7.151284988688731e-06,
"loss": 0.5993,
"step": 263
},
{
"epoch": 1.272289156626506,
"grad_norm": 0.5349662899971008,
"learning_rate": 7.125839415178204e-06,
"loss": 0.5268,
"step": 264
},
{
"epoch": 1.2771084337349397,
"grad_norm": 0.4651695191860199,
"learning_rate": 7.100326457067576e-06,
"loss": 0.5822,
"step": 265
},
{
"epoch": 1.2819277108433735,
"grad_norm": 0.5936465859413147,
"learning_rate": 7.074746923063497e-06,
"loss": 0.5578,
"step": 266
},
{
"epoch": 1.2867469879518072,
"grad_norm": 0.5026842355728149,
"learning_rate": 7.049101623982938e-06,
"loss": 0.5922,
"step": 267
},
{
"epoch": 1.2915662650602409,
"grad_norm": 0.5798065662384033,
"learning_rate": 7.02339137272748e-06,
"loss": 0.5598,
"step": 268
},
{
"epoch": 1.2963855421686747,
"grad_norm": 0.5257403254508972,
"learning_rate": 6.9976169842575526e-06,
"loss": 0.5361,
"step": 269
},
{
"epoch": 1.3012048192771084,
"grad_norm": 0.4787715971469879,
"learning_rate": 6.971779275566593e-06,
"loss": 0.5669,
"step": 270
},
{
"epoch": 1.306024096385542,
"grad_norm": 0.5384206175804138,
"learning_rate": 6.945879065655164e-06,
"loss": 0.5447,
"step": 271
},
{
"epoch": 1.310843373493976,
"grad_norm": 0.45355817675590515,
"learning_rate": 6.919917175504978e-06,
"loss": 0.6003,
"step": 272
},
{
"epoch": 1.3156626506024096,
"grad_norm": 0.5099201798439026,
"learning_rate": 6.893894428052881e-06,
"loss": 0.5545,
"step": 273
},
{
"epoch": 1.3204819277108433,
"grad_norm": 0.4605954587459564,
"learning_rate": 6.867811648164769e-06,
"loss": 0.5668,
"step": 274
},
{
"epoch": 1.3253012048192772,
"grad_norm": 0.5192990899085999,
"learning_rate": 6.841669662609437e-06,
"loss": 0.577,
"step": 275
},
{
"epoch": 1.3301204819277108,
"grad_norm": 0.48977819085121155,
"learning_rate": 6.815469300032374e-06,
"loss": 0.5341,
"step": 276
},
{
"epoch": 1.3349397590361445,
"grad_norm": 0.5304272174835205,
"learning_rate": 6.789211390929497e-06,
"loss": 0.565,
"step": 277
},
{
"epoch": 1.3397590361445784,
"grad_norm": 0.6027169823646545,
"learning_rate": 6.762896767620827e-06,
"loss": 0.5453,
"step": 278
},
{
"epoch": 1.344578313253012,
"grad_norm": 0.531121551990509,
"learning_rate": 6.736526264224101e-06,
"loss": 0.5959,
"step": 279
},
{
"epoch": 1.3493975903614457,
"grad_norm": 0.5106316208839417,
"learning_rate": 6.710100716628345e-06,
"loss": 0.5563,
"step": 280
},
{
"epoch": 1.3542168674698796,
"grad_norm": 0.5311499238014221,
"learning_rate": 6.6836209624673575e-06,
"loss": 0.5956,
"step": 281
},
{
"epoch": 1.3590361445783132,
"grad_norm": 0.4638383686542511,
"learning_rate": 6.657087841093179e-06,
"loss": 0.5184,
"step": 282
},
{
"epoch": 1.363855421686747,
"grad_norm": 0.5220621824264526,
"learning_rate": 6.6305021935494755e-06,
"loss": 0.6286,
"step": 283
},
{
"epoch": 1.3686746987951808,
"grad_norm": 0.4436984956264496,
"learning_rate": 6.603864862544879e-06,
"loss": 0.5357,
"step": 284
},
{
"epoch": 1.3734939759036144,
"grad_norm": 0.46485018730163574,
"learning_rate": 6.5771766924262795e-06,
"loss": 0.5653,
"step": 285
},
{
"epoch": 1.378313253012048,
"grad_norm": 0.5112223625183105,
"learning_rate": 6.5504385291520554e-06,
"loss": 0.5715,
"step": 286
},
{
"epoch": 1.383132530120482,
"grad_norm": 0.4580565094947815,
"learning_rate": 6.523651220265269e-06,
"loss": 0.5742,
"step": 287
},
{
"epoch": 1.3879518072289156,
"grad_norm": 0.43429890275001526,
"learning_rate": 6.496815614866792e-06,
"loss": 0.5597,
"step": 288
},
{
"epoch": 1.3927710843373493,
"grad_norm": 0.4501931667327881,
"learning_rate": 6.469932563588386e-06,
"loss": 0.566,
"step": 289
},
{
"epoch": 1.3975903614457832,
"grad_norm": 0.4644792377948761,
"learning_rate": 6.443002918565754e-06,
"loss": 0.5657,
"step": 290
},
{
"epoch": 1.4024096385542169,
"grad_norm": 0.4781135618686676,
"learning_rate": 6.41602753341152e-06,
"loss": 0.5773,
"step": 291
},
{
"epoch": 1.4072289156626505,
"grad_norm": 0.5058557391166687,
"learning_rate": 6.389007263188176e-06,
"loss": 0.5463,
"step": 292
},
{
"epoch": 1.4120481927710844,
"grad_norm": 0.4932103753089905,
"learning_rate": 6.361942964380967e-06,
"loss": 0.5801,
"step": 293
},
{
"epoch": 1.416867469879518,
"grad_norm": 0.5443869829177856,
"learning_rate": 6.334835494870759e-06,
"loss": 0.5264,
"step": 294
},
{
"epoch": 1.4216867469879517,
"grad_norm": 0.5028607845306396,
"learning_rate": 6.307685713906835e-06,
"loss": 0.6111,
"step": 295
},
{
"epoch": 1.4265060240963856,
"grad_norm": 0.4613734781742096,
"learning_rate": 6.2804944820796596e-06,
"loss": 0.5421,
"step": 296
},
{
"epoch": 1.4313253012048193,
"grad_norm": 0.5405146479606628,
"learning_rate": 6.2532626612936035e-06,
"loss": 0.6173,
"step": 297
},
{
"epoch": 1.436144578313253,
"grad_norm": 0.4562685787677765,
"learning_rate": 6.225991114739622e-06,
"loss": 0.534,
"step": 298
},
{
"epoch": 1.4409638554216868,
"grad_norm": 0.5416148900985718,
"learning_rate": 6.1986807068678926e-06,
"loss": 0.5861,
"step": 299
},
{
"epoch": 1.4457831325301205,
"grad_norm": 0.5628028512001038,
"learning_rate": 6.171332303360411e-06,
"loss": 0.5484,
"step": 300
},
{
"epoch": 1.4506024096385541,
"grad_norm": 0.4369142949581146,
"learning_rate": 6.143946771103561e-06,
"loss": 0.5304,
"step": 301
},
{
"epoch": 1.455421686746988,
"grad_norm": 0.5397925972938538,
"learning_rate": 6.11652497816062e-06,
"loss": 0.5407,
"step": 302
},
{
"epoch": 1.4602409638554217,
"grad_norm": 0.4883024990558624,
"learning_rate": 6.089067793744258e-06,
"loss": 0.588,
"step": 303
},
{
"epoch": 1.4650602409638553,
"grad_norm": 0.4950958490371704,
"learning_rate": 6.061576088188981e-06,
"loss": 0.5884,
"step": 304
},
{
"epoch": 1.4698795180722892,
"grad_norm": 0.4288600981235504,
"learning_rate": 6.034050732923538e-06,
"loss": 0.555,
"step": 305
},
{
"epoch": 1.4746987951807229,
"grad_norm": 0.5213388800621033,
"learning_rate": 6.006492600443301e-06,
"loss": 0.5619,
"step": 306
},
{
"epoch": 1.4795180722891565,
"grad_norm": 0.4272007346153259,
"learning_rate": 5.978902564282616e-06,
"loss": 0.5384,
"step": 307
},
{
"epoch": 1.4843373493975904,
"grad_norm": 0.5602285861968994,
"learning_rate": 5.951281498987106e-06,
"loss": 0.5827,
"step": 308
},
{
"epoch": 1.489156626506024,
"grad_norm": 0.47353655099868774,
"learning_rate": 5.923630280085948e-06,
"loss": 0.541,
"step": 309
},
{
"epoch": 1.4939759036144578,
"grad_norm": 0.5271124839782715,
"learning_rate": 5.895949784064126e-06,
"loss": 0.5503,
"step": 310
},
{
"epoch": 1.4987951807228916,
"grad_norm": 0.5316128134727478,
"learning_rate": 5.8682408883346535e-06,
"loss": 0.5409,
"step": 311
},
{
"epoch": 1.5036144578313253,
"grad_norm": 0.553626298904419,
"learning_rate": 5.840504471210742e-06,
"loss": 0.5777,
"step": 312
},
{
"epoch": 1.508433734939759,
"grad_norm": 0.48189061880111694,
"learning_rate": 5.8127414118779825e-06,
"loss": 0.5784,
"step": 313
},
{
"epoch": 1.5132530120481928,
"grad_norm": 0.6508918404579163,
"learning_rate": 5.7849525903664636e-06,
"loss": 0.5652,
"step": 314
},
{
"epoch": 1.5180722891566265,
"grad_norm": 0.4616662859916687,
"learning_rate": 5.757138887522884e-06,
"loss": 0.5433,
"step": 315
},
{
"epoch": 1.5228915662650602,
"grad_norm": 0.5459868311882019,
"learning_rate": 5.729301184982622e-06,
"loss": 0.5303,
"step": 316
},
{
"epoch": 1.527710843373494,
"grad_norm": 0.5954388976097107,
"learning_rate": 5.701440365141799e-06,
"loss": 0.584,
"step": 317
},
{
"epoch": 1.5325301204819277,
"grad_norm": 0.4622243344783783,
"learning_rate": 5.673557311129306e-06,
"loss": 0.5334,
"step": 318
},
{
"epoch": 1.5373493975903614,
"grad_norm": 0.6150903701782227,
"learning_rate": 5.645652906778808e-06,
"loss": 0.5803,
"step": 319
},
{
"epoch": 1.5421686746987953,
"grad_norm": 0.6536097526550293,
"learning_rate": 5.617728036600734e-06,
"loss": 0.6041,
"step": 320
},
{
"epoch": 1.546987951807229,
"grad_norm": 0.48668476939201355,
"learning_rate": 5.5897835857542315e-06,
"loss": 0.5577,
"step": 321
},
{
"epoch": 1.5518072289156626,
"grad_norm": 0.5237818956375122,
"learning_rate": 5.561820440019117e-06,
"loss": 0.5162,
"step": 322
},
{
"epoch": 1.5566265060240965,
"grad_norm": 0.5246903896331787,
"learning_rate": 5.533839485767795e-06,
"loss": 0.5891,
"step": 323
},
{
"epoch": 1.5614457831325301,
"grad_norm": 0.5068934559822083,
"learning_rate": 5.505841609937162e-06,
"loss": 0.5608,
"step": 324
},
{
"epoch": 1.5662650602409638,
"grad_norm": 0.5139860510826111,
"learning_rate": 5.477827700000492e-06,
"loss": 0.563,
"step": 325
},
{
"epoch": 1.5710843373493977,
"grad_norm": 0.49885353446006775,
"learning_rate": 5.449798643939305e-06,
"loss": 0.5472,
"step": 326
},
{
"epoch": 1.5759036144578313,
"grad_norm": 0.5572274327278137,
"learning_rate": 5.421755330215223e-06,
"loss": 0.5805,
"step": 327
},
{
"epoch": 1.580722891566265,
"grad_norm": 0.5297819972038269,
"learning_rate": 5.39369864774181e-06,
"loss": 0.5477,
"step": 328
},
{
"epoch": 1.5855421686746989,
"grad_norm": 0.5140382647514343,
"learning_rate": 5.365629485856381e-06,
"loss": 0.5239,
"step": 329
},
{
"epoch": 1.5903614457831325,
"grad_norm": 0.5845142006874084,
"learning_rate": 5.337548734291827e-06,
"loss": 0.5787,
"step": 330
},
{
"epoch": 1.5951807228915662,
"grad_norm": 0.4414353668689728,
"learning_rate": 5.30945728314841e-06,
"loss": 0.5797,
"step": 331
},
{
"epoch": 1.6,
"grad_norm": 0.49196693301200867,
"learning_rate": 5.281356022865542e-06,
"loss": 0.5767,
"step": 332
},
{
"epoch": 1.6048192771084338,
"grad_norm": 0.5447960495948792,
"learning_rate": 5.253245844193564e-06,
"loss": 0.5502,
"step": 333
},
{
"epoch": 1.6096385542168674,
"grad_norm": 0.4547137916088104,
"learning_rate": 5.225127638165514e-06,
"loss": 0.5579,
"step": 334
},
{
"epoch": 1.6144578313253013,
"grad_norm": 0.5411679148674011,
"learning_rate": 5.197002296068878e-06,
"loss": 0.5754,
"step": 335
},
{
"epoch": 1.619277108433735,
"grad_norm": 0.45455247163772583,
"learning_rate": 5.168870709417342e-06,
"loss": 0.5283,
"step": 336
},
{
"epoch": 1.6240963855421686,
"grad_norm": 0.5107097029685974,
"learning_rate": 5.140733769922525e-06,
"loss": 0.5377,
"step": 337
},
{
"epoch": 1.6289156626506025,
"grad_norm": 0.495237797498703,
"learning_rate": 5.112592369465731e-06,
"loss": 0.616,
"step": 338
},
{
"epoch": 1.6337349397590362,
"grad_norm": 0.5090997219085693,
"learning_rate": 5.084447400069656e-06,
"loss": 0.5562,
"step": 339
},
{
"epoch": 1.6385542168674698,
"grad_norm": 0.4626471996307373,
"learning_rate": 5.0562997538701295e-06,
"loss": 0.5302,
"step": 340
},
{
"epoch": 1.6433734939759037,
"grad_norm": 0.5563095211982727,
"learning_rate": 5.0281503230878304e-06,
"loss": 0.5631,
"step": 341
},
{
"epoch": 1.6481927710843374,
"grad_norm": 0.4692661166191101,
"learning_rate": 5e-06,
"loss": 0.609,
"step": 342
},
{
"epoch": 1.653012048192771,
"grad_norm": 0.4281330108642578,
"learning_rate": 4.971849676912172e-06,
"loss": 0.5204,
"step": 343
},
{
"epoch": 1.657831325301205,
"grad_norm": 0.5017001032829285,
"learning_rate": 4.943700246129871e-06,
"loss": 0.5618,
"step": 344
},
{
"epoch": 1.6626506024096386,
"grad_norm": 0.47061556577682495,
"learning_rate": 4.915552599930345e-06,
"loss": 0.5413,
"step": 345
},
{
"epoch": 1.6674698795180722,
"grad_norm": 0.5569798350334167,
"learning_rate": 4.887407630534271e-06,
"loss": 0.5217,
"step": 346
},
{
"epoch": 1.6722891566265061,
"grad_norm": 0.4373623728752136,
"learning_rate": 4.859266230077474e-06,
"loss": 0.577,
"step": 347
},
{
"epoch": 1.6771084337349398,
"grad_norm": 0.4877110719680786,
"learning_rate": 4.83112929058266e-06,
"loss": 0.586,
"step": 348
},
{
"epoch": 1.6819277108433734,
"grad_norm": 0.5045443773269653,
"learning_rate": 4.802997703931124e-06,
"loss": 0.5214,
"step": 349
},
{
"epoch": 1.6867469879518073,
"grad_norm": 0.4335879683494568,
"learning_rate": 4.7748723618344865e-06,
"loss": 0.5975,
"step": 350
},
{
"epoch": 1.691566265060241,
"grad_norm": 0.47140806913375854,
"learning_rate": 4.746754155806437e-06,
"loss": 0.5399,
"step": 351
},
{
"epoch": 1.6963855421686747,
"grad_norm": 0.44901373982429504,
"learning_rate": 4.71864397713446e-06,
"loss": 0.617,
"step": 352
},
{
"epoch": 1.7012048192771085,
"grad_norm": 0.3893554210662842,
"learning_rate": 4.6905427168515914e-06,
"loss": 0.4837,
"step": 353
},
{
"epoch": 1.7060240963855422,
"grad_norm": 0.5149268507957458,
"learning_rate": 4.662451265708174e-06,
"loss": 0.5979,
"step": 354
},
{
"epoch": 1.7108433734939759,
"grad_norm": 0.3967888057231903,
"learning_rate": 4.63437051414362e-06,
"loss": 0.5654,
"step": 355
},
{
"epoch": 1.7156626506024097,
"grad_norm": 0.4324769675731659,
"learning_rate": 4.606301352258192e-06,
"loss": 0.5548,
"step": 356
},
{
"epoch": 1.7204819277108434,
"grad_norm": 0.44806748628616333,
"learning_rate": 4.5782446697847775e-06,
"loss": 0.5607,
"step": 357
},
{
"epoch": 1.725301204819277,
"grad_norm": 0.457933634519577,
"learning_rate": 4.550201356060695e-06,
"loss": 0.5773,
"step": 358
},
{
"epoch": 1.730120481927711,
"grad_norm": 0.41432875394821167,
"learning_rate": 4.52217229999951e-06,
"loss": 0.5119,
"step": 359
},
{
"epoch": 1.7349397590361446,
"grad_norm": 0.46618443727493286,
"learning_rate": 4.49415839006284e-06,
"loss": 0.5282,
"step": 360
},
{
"epoch": 1.7397590361445783,
"grad_norm": 0.4658416509628296,
"learning_rate": 4.466160514232206e-06,
"loss": 0.5911,
"step": 361
},
{
"epoch": 1.7445783132530122,
"grad_norm": 0.45206916332244873,
"learning_rate": 4.438179559980885e-06,
"loss": 0.5368,
"step": 362
},
{
"epoch": 1.7493975903614458,
"grad_norm": 0.5187458395957947,
"learning_rate": 4.410216414245771e-06,
"loss": 0.5681,
"step": 363
},
{
"epoch": 1.7542168674698795,
"grad_norm": 0.44342824816703796,
"learning_rate": 4.382271963399268e-06,
"loss": 0.5654,
"step": 364
},
{
"epoch": 1.7590361445783134,
"grad_norm": 0.45453277230262756,
"learning_rate": 4.354347093221194e-06,
"loss": 0.5477,
"step": 365
},
{
"epoch": 1.763855421686747,
"grad_norm": 0.5282381176948547,
"learning_rate": 4.326442688870697e-06,
"loss": 0.5759,
"step": 366
},
{
"epoch": 1.7686746987951807,
"grad_norm": 0.4427390694618225,
"learning_rate": 4.298559634858202e-06,
"loss": 0.5478,
"step": 367
},
{
"epoch": 1.7734939759036146,
"grad_norm": 0.4339542090892792,
"learning_rate": 4.270698815017379e-06,
"loss": 0.5743,
"step": 368
},
{
"epoch": 1.7783132530120482,
"grad_norm": 0.4502948522567749,
"learning_rate": 4.2428611124771184e-06,
"loss": 0.5666,
"step": 369
},
{
"epoch": 1.783132530120482,
"grad_norm": 0.45227017998695374,
"learning_rate": 4.2150474096335356e-06,
"loss": 0.5561,
"step": 370
},
{
"epoch": 1.7879518072289158,
"grad_norm": 0.41950294375419617,
"learning_rate": 4.187258588122019e-06,
"loss": 0.5617,
"step": 371
},
{
"epoch": 1.7927710843373494,
"grad_norm": 0.4249022901058197,
"learning_rate": 4.15949552878926e-06,
"loss": 0.5506,
"step": 372
},
{
"epoch": 1.797590361445783,
"grad_norm": 0.4221281111240387,
"learning_rate": 4.131759111665349e-06,
"loss": 0.5161,
"step": 373
},
{
"epoch": 1.802409638554217,
"grad_norm": 0.48842665553092957,
"learning_rate": 4.104050215935875e-06,
"loss": 0.5988,
"step": 374
},
{
"epoch": 1.8072289156626506,
"grad_norm": 0.39976850152015686,
"learning_rate": 4.076369719914055e-06,
"loss": 0.5246,
"step": 375
},
{
"epoch": 1.8120481927710843,
"grad_norm": 0.4550154209136963,
"learning_rate": 4.048718501012895e-06,
"loss": 0.5871,
"step": 376
},
{
"epoch": 1.8168674698795182,
"grad_norm": 0.44595929980278015,
"learning_rate": 4.021097435717386e-06,
"loss": 0.5507,
"step": 377
},
{
"epoch": 1.8216867469879519,
"grad_norm": 0.407740980386734,
"learning_rate": 3.993507399556699e-06,
"loss": 0.5536,
"step": 378
},
{
"epoch": 1.8265060240963855,
"grad_norm": 0.518549919128418,
"learning_rate": 3.965949267076465e-06,
"loss": 0.569,
"step": 379
},
{
"epoch": 1.8313253012048194,
"grad_norm": 0.39872604608535767,
"learning_rate": 3.938423911811021e-06,
"loss": 0.5161,
"step": 380
},
{
"epoch": 1.836144578313253,
"grad_norm": 0.4431675374507904,
"learning_rate": 3.910932206255742e-06,
"loss": 0.5988,
"step": 381
},
{
"epoch": 1.8409638554216867,
"grad_norm": 0.43980199098587036,
"learning_rate": 3.883475021839382e-06,
"loss": 0.5529,
"step": 382
},
{
"epoch": 1.8457831325301206,
"grad_norm": 0.43700751662254333,
"learning_rate": 3.856053228896442e-06,
"loss": 0.5552,
"step": 383
},
{
"epoch": 1.8506024096385543,
"grad_norm": 0.4302417039871216,
"learning_rate": 3.8286676966395895e-06,
"loss": 0.532,
"step": 384
},
{
"epoch": 1.855421686746988,
"grad_norm": 0.4162864089012146,
"learning_rate": 3.8013192931321095e-06,
"loss": 0.5583,
"step": 385
},
{
"epoch": 1.8602409638554218,
"grad_norm": 0.43791574239730835,
"learning_rate": 3.77400888526038e-06,
"loss": 0.5247,
"step": 386
},
{
"epoch": 1.8650602409638555,
"grad_norm": 0.4298801124095917,
"learning_rate": 3.7467373387063973e-06,
"loss": 0.5928,
"step": 387
},
{
"epoch": 1.8698795180722891,
"grad_norm": 0.40685486793518066,
"learning_rate": 3.719505517920342e-06,
"loss": 0.5563,
"step": 388
},
{
"epoch": 1.874698795180723,
"grad_norm": 0.3878326416015625,
"learning_rate": 3.692314286093167e-06,
"loss": 0.5353,
"step": 389
},
{
"epoch": 1.8795180722891565,
"grad_norm": 0.4625506103038788,
"learning_rate": 3.6651645051292415e-06,
"loss": 0.5886,
"step": 390
},
{
"epoch": 1.8843373493975903,
"grad_norm": 0.42777660489082336,
"learning_rate": 3.6380570356190346e-06,
"loss": 0.5705,
"step": 391
},
{
"epoch": 1.8891566265060242,
"grad_norm": 0.40551841259002686,
"learning_rate": 3.610992736811827e-06,
"loss": 0.5354,
"step": 392
},
{
"epoch": 1.8939759036144577,
"grad_norm": 0.42313259840011597,
"learning_rate": 3.58397246658848e-06,
"loss": 0.5534,
"step": 393
},
{
"epoch": 1.8987951807228916,
"grad_norm": 0.4355023503303528,
"learning_rate": 3.556997081434248e-06,
"loss": 0.5733,
"step": 394
},
{
"epoch": 1.9036144578313254,
"grad_norm": 0.42895275354385376,
"learning_rate": 3.5300674364116173e-06,
"loss": 0.5624,
"step": 395
},
{
"epoch": 1.9084337349397589,
"grad_norm": 0.43042075634002686,
"learning_rate": 3.5031843851332105e-06,
"loss": 0.6029,
"step": 396
},
{
"epoch": 1.9132530120481928,
"grad_norm": 0.37077224254608154,
"learning_rate": 3.476348779734732e-06,
"loss": 0.5392,
"step": 397
},
{
"epoch": 1.9180722891566266,
"grad_norm": 0.4056945741176605,
"learning_rate": 3.449561470847947e-06,
"loss": 0.5322,
"step": 398
},
{
"epoch": 1.92289156626506,
"grad_norm": 0.4261414706707001,
"learning_rate": 3.4228233075737225e-06,
"loss": 0.5752,
"step": 399
},
{
"epoch": 1.927710843373494,
"grad_norm": 0.38658595085144043,
"learning_rate": 3.3961351374551234e-06,
"loss": 0.5601,
"step": 400
},
{
"epoch": 1.9325301204819278,
"grad_norm": 0.4442770183086395,
"learning_rate": 3.3694978064505258e-06,
"loss": 0.6121,
"step": 401
},
{
"epoch": 1.9373493975903613,
"grad_norm": 0.41190898418426514,
"learning_rate": 3.3429121589068213e-06,
"loss": 0.5003,
"step": 402
},
{
"epoch": 1.9421686746987952,
"grad_norm": 0.3673941195011139,
"learning_rate": 3.316379037532644e-06,
"loss": 0.535,
"step": 403
},
{
"epoch": 1.946987951807229,
"grad_norm": 0.4322197139263153,
"learning_rate": 3.289899283371657e-06,
"loss": 0.528,
"step": 404
},
{
"epoch": 1.9518072289156625,
"grad_norm": 0.45587092638015747,
"learning_rate": 3.2634737357758994e-06,
"loss": 0.5608,
"step": 405
},
{
"epoch": 1.9566265060240964,
"grad_norm": 0.4296489953994751,
"learning_rate": 3.2371032323791757e-06,
"loss": 0.6065,
"step": 406
},
{
"epoch": 1.9614457831325303,
"grad_norm": 0.4287756085395813,
"learning_rate": 3.2107886090705035e-06,
"loss": 0.5215,
"step": 407
},
{
"epoch": 1.9662650602409637,
"grad_norm": 0.4841082692146301,
"learning_rate": 3.1845306999676274e-06,
"loss": 0.5658,
"step": 408
},
{
"epoch": 1.9710843373493976,
"grad_norm": 0.3839523196220398,
"learning_rate": 3.158330337390565e-06,
"loss": 0.5713,
"step": 409
},
{
"epoch": 1.9759036144578315,
"grad_norm": 0.37823107838630676,
"learning_rate": 3.132188351835232e-06,
"loss": 0.5295,
"step": 410
},
{
"epoch": 1.980722891566265,
"grad_norm": 0.4143429696559906,
"learning_rate": 3.10610557194712e-06,
"loss": 0.5654,
"step": 411
},
{
"epoch": 1.9855421686746988,
"grad_norm": 0.3939684331417084,
"learning_rate": 3.080082824495024e-06,
"loss": 0.5429,
"step": 412
},
{
"epoch": 1.9903614457831327,
"grad_norm": 0.4271552264690399,
"learning_rate": 3.0541209343448373e-06,
"loss": 0.5801,
"step": 413
},
{
"epoch": 1.9951807228915661,
"grad_norm": 0.4314172565937042,
"learning_rate": 3.0282207244334084e-06,
"loss": 0.5558,
"step": 414
},
{
"epoch": 2.0,
"grad_norm": 0.6961327195167542,
"learning_rate": 3.0023830157424504e-06,
"loss": 0.7905,
"step": 415
},
{
"epoch": 2.004819277108434,
"grad_norm": 0.3929261863231659,
"learning_rate": 2.97660862727252e-06,
"loss": 0.5401,
"step": 416
},
{
"epoch": 2.0096385542168673,
"grad_norm": 0.46623867750167847,
"learning_rate": 2.950898376017064e-06,
"loss": 0.5227,
"step": 417
},
{
"epoch": 2.014457831325301,
"grad_norm": 0.40226995944976807,
"learning_rate": 2.9252530769365053e-06,
"loss": 0.533,
"step": 418
},
{
"epoch": 2.019277108433735,
"grad_norm": 0.43160927295684814,
"learning_rate": 2.8996735429324256e-06,
"loss": 0.5335,
"step": 419
},
{
"epoch": 2.0240963855421685,
"grad_norm": 0.42182430624961853,
"learning_rate": 2.874160584821798e-06,
"loss": 0.5219,
"step": 420
},
{
"epoch": 2.0289156626506024,
"grad_norm": 0.4267122149467468,
"learning_rate": 2.848715011311271e-06,
"loss": 0.5259,
"step": 421
},
{
"epoch": 2.0337349397590363,
"grad_norm": 0.41059398651123047,
"learning_rate": 2.823337628971551e-06,
"loss": 0.5302,
"step": 422
},
{
"epoch": 2.0385542168674697,
"grad_norm": 0.3666737675666809,
"learning_rate": 2.7980292422118282e-06,
"loss": 0.525,
"step": 423
},
{
"epoch": 2.0433734939759036,
"grad_norm": 0.460478276014328,
"learning_rate": 2.7727906532542783e-06,
"loss": 0.5416,
"step": 424
},
{
"epoch": 2.0481927710843375,
"grad_norm": 0.45187464356422424,
"learning_rate": 2.7476226621086354e-06,
"loss": 0.5293,
"step": 425
},
{
"epoch": 2.053012048192771,
"grad_norm": 0.40986311435699463,
"learning_rate": 2.72252606654683e-06,
"loss": 0.5429,
"step": 426
},
{
"epoch": 2.057831325301205,
"grad_norm": 0.3846244812011719,
"learning_rate": 2.697501662077707e-06,
"loss": 0.5322,
"step": 427
},
{
"epoch": 2.0626506024096387,
"grad_norm": 0.44859009981155396,
"learning_rate": 2.6725502419218084e-06,
"loss": 0.5246,
"step": 428
},
{
"epoch": 2.067469879518072,
"grad_norm": 0.4478023648262024,
"learning_rate": 2.6476725969862227e-06,
"loss": 0.5453,
"step": 429
},
{
"epoch": 2.072289156626506,
"grad_norm": 0.4618147015571594,
"learning_rate": 2.622869515839524e-06,
"loss": 0.5373,
"step": 430
},
{
"epoch": 2.07710843373494,
"grad_norm": 0.3915225863456726,
"learning_rate": 2.5981417846867753e-06,
"loss": 0.5298,
"step": 431
},
{
"epoch": 2.0819277108433734,
"grad_norm": 0.41270238161087036,
"learning_rate": 2.573490187344596e-06,
"loss": 0.5294,
"step": 432
},
{
"epoch": 2.0867469879518072,
"grad_norm": 0.44997212290763855,
"learning_rate": 2.548915505216333e-06,
"loss": 0.5152,
"step": 433
},
{
"epoch": 2.091566265060241,
"grad_norm": 0.445286363363266,
"learning_rate": 2.524418517267283e-06,
"loss": 0.5349,
"step": 434
},
{
"epoch": 2.0963855421686746,
"grad_norm": 0.4325414001941681,
"learning_rate": 2.5000000000000015e-06,
"loss": 0.5424,
"step": 435
},
{
"epoch": 2.1012048192771084,
"grad_norm": 0.4230119287967682,
"learning_rate": 2.4756607274296844e-06,
"loss": 0.5278,
"step": 436
},
{
"epoch": 2.1060240963855423,
"grad_norm": 0.37730202078819275,
"learning_rate": 2.4514014710596467e-06,
"loss": 0.5133,
"step": 437
},
{
"epoch": 2.1108433734939758,
"grad_norm": 0.41996341943740845,
"learning_rate": 2.4272229998568576e-06,
"loss": 0.5367,
"step": 438
},
{
"epoch": 2.1156626506024097,
"grad_norm": 0.3746831715106964,
"learning_rate": 2.4031260802275623e-06,
"loss": 0.5228,
"step": 439
},
{
"epoch": 2.1204819277108435,
"grad_norm": 0.39061239361763,
"learning_rate": 2.3791114759930013e-06,
"loss": 0.5354,
"step": 440
},
{
"epoch": 2.125301204819277,
"grad_norm": 0.406981498003006,
"learning_rate": 2.3551799483651894e-06,
"loss": 0.5279,
"step": 441
},
{
"epoch": 2.130120481927711,
"grad_norm": 0.4518032670021057,
"learning_rate": 2.331332255922784e-06,
"loss": 0.5394,
"step": 442
},
{
"epoch": 2.1349397590361447,
"grad_norm": 0.40522995591163635,
"learning_rate": 2.307569154587056e-06,
"loss": 0.539,
"step": 443
},
{
"epoch": 2.139759036144578,
"grad_norm": 0.3735210597515106,
"learning_rate": 2.283891397597908e-06,
"loss": 0.5265,
"step": 444
},
{
"epoch": 2.144578313253012,
"grad_norm": 0.4129243791103363,
"learning_rate": 2.2602997354900075e-06,
"loss": 0.5319,
"step": 445
},
{
"epoch": 2.149397590361446,
"grad_norm": 0.36963480710983276,
"learning_rate": 2.236794916069007e-06,
"loss": 0.5279,
"step": 446
},
{
"epoch": 2.1542168674698794,
"grad_norm": 0.3678368926048279,
"learning_rate": 2.2133776843878185e-06,
"loss": 0.5317,
"step": 447
},
{
"epoch": 2.1590361445783133,
"grad_norm": 0.37665683031082153,
"learning_rate": 2.190048782723015e-06,
"loss": 0.5339,
"step": 448
},
{
"epoch": 2.163855421686747,
"grad_norm": 0.37703850865364075,
"learning_rate": 2.166808950551296e-06,
"loss": 0.5406,
"step": 449
},
{
"epoch": 2.1686746987951806,
"grad_norm": 0.40551963448524475,
"learning_rate": 2.1436589245260375e-06,
"loss": 0.5373,
"step": 450
},
{
"epoch": 2.1734939759036145,
"grad_norm": 0.35308101773262024,
"learning_rate": 2.120599438453968e-06,
"loss": 0.5154,
"step": 451
},
{
"epoch": 2.1783132530120484,
"grad_norm": 0.35655319690704346,
"learning_rate": 2.0976312232718763e-06,
"loss": 0.5408,
"step": 452
},
{
"epoch": 2.183132530120482,
"grad_norm": 0.3718145489692688,
"learning_rate": 2.074755007023461e-06,
"loss": 0.5486,
"step": 453
},
{
"epoch": 2.1879518072289157,
"grad_norm": 0.40163010358810425,
"learning_rate": 2.0519715148362585e-06,
"loss": 0.5233,
"step": 454
},
{
"epoch": 2.1927710843373496,
"grad_norm": 0.41173961758613586,
"learning_rate": 2.0292814688986375e-06,
"loss": 0.5135,
"step": 455
},
{
"epoch": 2.197590361445783,
"grad_norm": 0.3652428686618805,
"learning_rate": 2.0066855884369246e-06,
"loss": 0.5151,
"step": 456
},
{
"epoch": 2.202409638554217,
"grad_norm": 0.36961421370506287,
"learning_rate": 1.9841845896926022e-06,
"loss": 0.5446,
"step": 457
},
{
"epoch": 2.207228915662651,
"grad_norm": 0.3552957773208618,
"learning_rate": 1.961779185899597e-06,
"loss": 0.5252,
"step": 458
},
{
"epoch": 2.212048192771084,
"grad_norm": 0.3752812147140503,
"learning_rate": 1.9394700872616856e-06,
"loss": 0.5395,
"step": 459
},
{
"epoch": 2.216867469879518,
"grad_norm": 0.38084596395492554,
"learning_rate": 1.9172580009299735e-06,
"loss": 0.5256,
"step": 460
},
{
"epoch": 2.221686746987952,
"grad_norm": 0.3959069848060608,
"learning_rate": 1.8951436309804766e-06,
"loss": 0.5474,
"step": 461
},
{
"epoch": 2.2265060240963854,
"grad_norm": 0.4074706435203552,
"learning_rate": 1.8731276783918162e-06,
"loss": 0.5334,
"step": 462
},
{
"epoch": 2.2313253012048193,
"grad_norm": 0.37793341279029846,
"learning_rate": 1.8512108410229878e-06,
"loss": 0.5284,
"step": 463
},
{
"epoch": 2.236144578313253,
"grad_norm": 0.369093656539917,
"learning_rate": 1.8293938135912475e-06,
"loss": 0.5312,
"step": 464
},
{
"epoch": 2.2409638554216866,
"grad_norm": 0.4202345609664917,
"learning_rate": 1.8076772876500831e-06,
"loss": 0.5063,
"step": 465
},
{
"epoch": 2.2457831325301205,
"grad_norm": 0.36291444301605225,
"learning_rate": 1.7860619515673034e-06,
"loss": 0.5294,
"step": 466
},
{
"epoch": 2.2506024096385544,
"grad_norm": 0.3744347095489502,
"learning_rate": 1.7645484905032129e-06,
"loss": 0.5451,
"step": 467
},
{
"epoch": 2.255421686746988,
"grad_norm": 0.41913270950317383,
"learning_rate": 1.74313758638889e-06,
"loss": 0.5137,
"step": 468
},
{
"epoch": 2.2602409638554217,
"grad_norm": 0.3765053153038025,
"learning_rate": 1.7218299179045789e-06,
"loss": 0.5371,
"step": 469
},
{
"epoch": 2.2650602409638556,
"grad_norm": 0.33764129877090454,
"learning_rate": 1.7006261604581725e-06,
"loss": 0.5358,
"step": 470
},
{
"epoch": 2.269879518072289,
"grad_norm": 0.36633729934692383,
"learning_rate": 1.6795269861638041e-06,
"loss": 0.5336,
"step": 471
},
{
"epoch": 2.274698795180723,
"grad_norm": 0.37890884280204773,
"learning_rate": 1.6585330638205454e-06,
"loss": 0.5236,
"step": 472
},
{
"epoch": 2.279518072289157,
"grad_norm": 0.3956688344478607,
"learning_rate": 1.6376450588911985e-06,
"loss": 0.5505,
"step": 473
},
{
"epoch": 2.2843373493975903,
"grad_norm": 0.3635486662387848,
"learning_rate": 1.6168636334812126e-06,
"loss": 0.5234,
"step": 474
},
{
"epoch": 2.289156626506024,
"grad_norm": 0.3534378707408905,
"learning_rate": 1.5961894463176942e-06,
"loss": 0.5181,
"step": 475
},
{
"epoch": 2.293975903614458,
"grad_norm": 0.34564465284347534,
"learning_rate": 1.5756231527285181e-06,
"loss": 0.5247,
"step": 476
},
{
"epoch": 2.2987951807228915,
"grad_norm": 0.39127737283706665,
"learning_rate": 1.555165404621567e-06,
"loss": 0.5445,
"step": 477
},
{
"epoch": 2.3036144578313253,
"grad_norm": 0.3836536407470703,
"learning_rate": 1.5348168504640631e-06,
"loss": 0.5235,
"step": 478
},
{
"epoch": 2.3084337349397592,
"grad_norm": 0.361017107963562,
"learning_rate": 1.5145781352620054e-06,
"loss": 0.5248,
"step": 479
},
{
"epoch": 2.3132530120481927,
"grad_norm": 0.3382739722728729,
"learning_rate": 1.4944499005397372e-06,
"loss": 0.5279,
"step": 480
},
{
"epoch": 2.3180722891566266,
"grad_norm": 0.3852854073047638,
"learning_rate": 1.4744327843196043e-06,
"loss": 0.5159,
"step": 481
},
{
"epoch": 2.32289156626506,
"grad_norm": 0.3639327883720398,
"learning_rate": 1.4545274211017264e-06,
"loss": 0.524,
"step": 482
},
{
"epoch": 2.327710843373494,
"grad_norm": 0.3571608066558838,
"learning_rate": 1.434734441843899e-06,
"loss": 0.5219,
"step": 483
},
{
"epoch": 2.3325301204819278,
"grad_norm": 0.3344271183013916,
"learning_rate": 1.4150544739415755e-06,
"loss": 0.5267,
"step": 484
},
{
"epoch": 2.337349397590361,
"grad_norm": 0.35768088698387146,
"learning_rate": 1.3954881412079945e-06,
"loss": 0.5165,
"step": 485
},
{
"epoch": 2.342168674698795,
"grad_norm": 0.36367958784103394,
"learning_rate": 1.3760360638544012e-06,
"loss": 0.5154,
"step": 486
},
{
"epoch": 2.346987951807229,
"grad_norm": 0.36191970109939575,
"learning_rate": 1.3566988584703817e-06,
"loss": 0.5112,
"step": 487
},
{
"epoch": 2.3518072289156624,
"grad_norm": 0.3416457772254944,
"learning_rate": 1.3374771380043306e-06,
"loss": 0.5161,
"step": 488
},
{
"epoch": 2.3566265060240963,
"grad_norm": 0.3561415374279022,
"learning_rate": 1.3183715117440143e-06,
"loss": 0.5121,
"step": 489
},
{
"epoch": 2.36144578313253,
"grad_norm": 0.35016006231307983,
"learning_rate": 1.2993825852972559e-06,
"loss": 0.5324,
"step": 490
},
{
"epoch": 2.3662650602409636,
"grad_norm": 0.3487759530544281,
"learning_rate": 1.280510960572745e-06,
"loss": 0.5243,
"step": 491
},
{
"epoch": 2.3710843373493975,
"grad_norm": 0.3596084713935852,
"learning_rate": 1.2617572357609565e-06,
"loss": 0.5138,
"step": 492
},
{
"epoch": 2.3759036144578314,
"grad_norm": 0.3738745152950287,
"learning_rate": 1.2431220053151832e-06,
"loss": 0.5235,
"step": 493
},
{
"epoch": 2.380722891566265,
"grad_norm": 0.3412809371948242,
"learning_rate": 1.2246058599327021e-06,
"loss": 0.5283,
"step": 494
},
{
"epoch": 2.3855421686746987,
"grad_norm": 0.3288966119289398,
"learning_rate": 1.2062093865360458e-06,
"loss": 0.5332,
"step": 495
},
{
"epoch": 2.3903614457831326,
"grad_norm": 0.3649263381958008,
"learning_rate": 1.1879331682543972e-06,
"loss": 0.5308,
"step": 496
},
{
"epoch": 2.395180722891566,
"grad_norm": 0.34862828254699707,
"learning_rate": 1.1697777844051105e-06,
"loss": 0.5242,
"step": 497
},
{
"epoch": 2.4,
"grad_norm": 0.33387741446495056,
"learning_rate": 1.1517438104753386e-06,
"loss": 0.5346,
"step": 498
},
{
"epoch": 2.404819277108434,
"grad_norm": 0.3655238449573517,
"learning_rate": 1.1338318181038037e-06,
"loss": 0.5173,
"step": 499
},
{
"epoch": 2.4096385542168672,
"grad_norm": 0.3874536454677582,
"learning_rate": 1.1160423750626693e-06,
"loss": 0.5382,
"step": 500
},
{
"epoch": 2.414457831325301,
"grad_norm": 0.33767327666282654,
"learning_rate": 1.0983760452395415e-06,
"loss": 0.543,
"step": 501
},
{
"epoch": 2.419277108433735,
"grad_norm": 0.33204296231269836,
"learning_rate": 1.0808333886196038e-06,
"loss": 0.5508,
"step": 502
},
{
"epoch": 2.4240963855421684,
"grad_norm": 0.3084474802017212,
"learning_rate": 1.063414961267859e-06,
"loss": 0.5483,
"step": 503
},
{
"epoch": 2.4289156626506023,
"grad_norm": 0.36713990569114685,
"learning_rate": 1.046121315311508e-06,
"loss": 0.5242,
"step": 504
},
{
"epoch": 2.433734939759036,
"grad_norm": 0.3953552544116974,
"learning_rate": 1.02895299892244e-06,
"loss": 0.521,
"step": 505
},
{
"epoch": 2.4385542168674696,
"grad_norm": 0.32059621810913086,
"learning_rate": 1.01191055629987e-06,
"loss": 0.5199,
"step": 506
},
{
"epoch": 2.4433734939759035,
"grad_norm": 0.3375447392463684,
"learning_rate": 9.949945276530782e-07,
"loss": 0.5326,
"step": 507
},
{
"epoch": 2.4481927710843374,
"grad_norm": 0.34036242961883545,
"learning_rate": 9.782054491842879e-07,
"loss": 0.5394,
"step": 508
},
{
"epoch": 2.453012048192771,
"grad_norm": 0.35132986307144165,
"learning_rate": 9.615438530716753e-07,
"loss": 0.5458,
"step": 509
},
{
"epoch": 2.4578313253012047,
"grad_norm": 0.36198896169662476,
"learning_rate": 9.450102674524952e-07,
"loss": 0.5061,
"step": 510
},
{
"epoch": 2.4626506024096386,
"grad_norm": 0.3417421877384186,
"learning_rate": 9.286052164063369e-07,
"loss": 0.5279,
"step": 511
},
{
"epoch": 2.467469879518072,
"grad_norm": 0.3652937412261963,
"learning_rate": 9.123292199385247e-07,
"loss": 0.5379,
"step": 512
},
{
"epoch": 2.472289156626506,
"grad_norm": 0.3249654471874237,
"learning_rate": 8.961827939636198e-07,
"loss": 0.5483,
"step": 513
},
{
"epoch": 2.47710843373494,
"grad_norm": 0.3499116003513336,
"learning_rate": 8.801664502890722e-07,
"loss": 0.5173,
"step": 514
},
{
"epoch": 2.4819277108433733,
"grad_norm": 0.3960937559604645,
"learning_rate": 8.64280696599008e-07,
"loss": 0.5293,
"step": 515
},
{
"epoch": 2.486746987951807,
"grad_norm": 0.3167719841003418,
"learning_rate": 8.485260364381187e-07,
"loss": 0.5362,
"step": 516
},
{
"epoch": 2.491566265060241,
"grad_norm": 0.36978796124458313,
"learning_rate": 8.329029691957124e-07,
"loss": 0.5255,
"step": 517
},
{
"epoch": 2.4963855421686745,
"grad_norm": 0.34822145104408264,
"learning_rate": 8.17411990089883e-07,
"loss": 0.5274,
"step": 518
},
{
"epoch": 2.5012048192771084,
"grad_norm": 0.33780673146247864,
"learning_rate": 8.02053590151805e-07,
"loss": 0.5249,
"step": 519
},
{
"epoch": 2.5060240963855422,
"grad_norm": 0.3500611484050751,
"learning_rate": 7.868282562101819e-07,
"loss": 0.5419,
"step": 520
},
{
"epoch": 2.5108433734939757,
"grad_norm": 0.33515140414237976,
"learning_rate": 7.717364708758024e-07,
"loss": 0.5446,
"step": 521
},
{
"epoch": 2.5156626506024096,
"grad_norm": 0.35610055923461914,
"learning_rate": 7.567787125262449e-07,
"loss": 0.5345,
"step": 522
},
{
"epoch": 2.5204819277108435,
"grad_norm": 0.3281942903995514,
"learning_rate": 7.41955455290726e-07,
"loss": 0.5257,
"step": 523
},
{
"epoch": 2.525301204819277,
"grad_norm": 0.3701113164424896,
"learning_rate": 7.27267169035053e-07,
"loss": 0.519,
"step": 524
},
{
"epoch": 2.5301204819277108,
"grad_norm": 0.32227593660354614,
"learning_rate": 7.127143193467445e-07,
"loss": 0.53,
"step": 525
},
{
"epoch": 2.5349397590361447,
"grad_norm": 0.34199750423431396,
"learning_rate": 6.982973675202676e-07,
"loss": 0.5174,
"step": 526
},
{
"epoch": 2.539759036144578,
"grad_norm": 0.33478933572769165,
"learning_rate": 6.840167705424106e-07,
"loss": 0.5374,
"step": 527
},
{
"epoch": 2.544578313253012,
"grad_norm": 0.37539225816726685,
"learning_rate": 6.698729810778065e-07,
"loss": 0.5273,
"step": 528
},
{
"epoch": 2.549397590361446,
"grad_norm": 0.3374510705471039,
"learning_rate": 6.558664474545817e-07,
"loss": 0.5294,
"step": 529
},
{
"epoch": 2.5542168674698793,
"grad_norm": 0.37236514687538147,
"learning_rate": 6.419976136501377e-07,
"loss": 0.5398,
"step": 530
},
{
"epoch": 2.559036144578313,
"grad_norm": 0.35223281383514404,
"learning_rate": 6.282669192770896e-07,
"loss": 0.5034,
"step": 531
},
{
"epoch": 2.563855421686747,
"grad_norm": 0.3377327024936676,
"learning_rate": 6.146747995693225e-07,
"loss": 0.5197,
"step": 532
},
{
"epoch": 2.5686746987951805,
"grad_norm": 0.34087198972702026,
"learning_rate": 6.012216853682001e-07,
"loss": 0.5047,
"step": 533
},
{
"epoch": 2.5734939759036144,
"grad_norm": 0.3489024043083191,
"learning_rate": 5.879080031089047e-07,
"loss": 0.5131,
"step": 534
},
{
"epoch": 2.5783132530120483,
"grad_norm": 0.31479188799858093,
"learning_rate": 5.747341748069229e-07,
"loss": 0.5346,
"step": 535
},
{
"epoch": 2.5831325301204817,
"grad_norm": 0.352466344833374,
"learning_rate": 5.617006180446688e-07,
"loss": 0.5279,
"step": 536
},
{
"epoch": 2.5879518072289156,
"grad_norm": 0.34586963057518005,
"learning_rate": 5.488077459582425e-07,
"loss": 0.5321,
"step": 537
},
{
"epoch": 2.5927710843373495,
"grad_norm": 0.35135793685913086,
"learning_rate": 5.360559672243421e-07,
"loss": 0.5376,
"step": 538
},
{
"epoch": 2.597590361445783,
"grad_norm": 0.35118043422698975,
"learning_rate": 5.234456860473042e-07,
"loss": 0.5253,
"step": 539
},
{
"epoch": 2.602409638554217,
"grad_norm": 0.4087206721305847,
"learning_rate": 5.109773021462921e-07,
"loss": 0.5263,
"step": 540
},
{
"epoch": 2.6072289156626507,
"grad_norm": 0.3330649137496948,
"learning_rate": 4.986512107426283e-07,
"loss": 0.5325,
"step": 541
},
{
"epoch": 2.612048192771084,
"grad_norm": 0.3389038145542145,
"learning_rate": 4.864678025472635e-07,
"loss": 0.5334,
"step": 542
},
{
"epoch": 2.616867469879518,
"grad_norm": 0.35696545243263245,
"learning_rate": 4.7442746374839363e-07,
"loss": 0.5447,
"step": 543
},
{
"epoch": 2.621686746987952,
"grad_norm": 0.35791394114494324,
"learning_rate": 4.625305759992205e-07,
"loss": 0.5125,
"step": 544
},
{
"epoch": 2.6265060240963853,
"grad_norm": 0.3383076786994934,
"learning_rate": 4.50777516405847e-07,
"loss": 0.5353,
"step": 545
},
{
"epoch": 2.6313253012048192,
"grad_norm": 0.32981517910957336,
"learning_rate": 4.3916865751533313e-07,
"loss": 0.532,
"step": 546
},
{
"epoch": 2.636144578313253,
"grad_norm": 0.33359435200691223,
"learning_rate": 4.2770436730388166e-07,
"loss": 0.5177,
"step": 547
},
{
"epoch": 2.6409638554216865,
"grad_norm": 0.3236476480960846,
"learning_rate": 4.163850091651717e-07,
"loss": 0.5314,
"step": 548
},
{
"epoch": 2.6457831325301204,
"grad_norm": 0.31992560625076294,
"learning_rate": 4.05210941898847e-07,
"loss": 0.5489,
"step": 549
},
{
"epoch": 2.6506024096385543,
"grad_norm": 0.3217684030532837,
"learning_rate": 3.941825196991378e-07,
"loss": 0.5393,
"step": 550
},
{
"epoch": 2.6554216867469878,
"grad_norm": 0.3366377353668213,
"learning_rate": 3.8330009214363197e-07,
"loss": 0.5393,
"step": 551
},
{
"epoch": 2.6602409638554216,
"grad_norm": 0.3715660572052002,
"learning_rate": 3.725640041822026e-07,
"loss": 0.5259,
"step": 552
},
{
"epoch": 2.6650602409638555,
"grad_norm": 0.3347671329975128,
"learning_rate": 3.619745961260623e-07,
"loss": 0.5225,
"step": 553
},
{
"epoch": 2.669879518072289,
"grad_norm": 0.3271038234233856,
"learning_rate": 3.5153220363698225e-07,
"loss": 0.5336,
"step": 554
},
{
"epoch": 2.674698795180723,
"grad_norm": 0.36232107877731323,
"learning_rate": 3.4123715771665786e-07,
"loss": 0.5342,
"step": 555
},
{
"epoch": 2.6795180722891567,
"grad_norm": 0.3338398337364197,
"learning_rate": 3.310897846962041e-07,
"loss": 0.5349,
"step": 556
},
{
"epoch": 2.68433734939759,
"grad_norm": 0.33740854263305664,
"learning_rate": 3.2109040622582186e-07,
"loss": 0.5357,
"step": 557
},
{
"epoch": 2.689156626506024,
"grad_norm": 0.33538562059402466,
"learning_rate": 3.112393392645985e-07,
"loss": 0.542,
"step": 558
},
{
"epoch": 2.693975903614458,
"grad_norm": 0.3457934856414795,
"learning_rate": 3.015368960704584e-07,
"loss": 0.524,
"step": 559
},
{
"epoch": 2.6987951807228914,
"grad_norm": 0.32423001527786255,
"learning_rate": 2.919833841902714e-07,
"loss": 0.5345,
"step": 560
},
{
"epoch": 2.7036144578313253,
"grad_norm": 0.3165851831436157,
"learning_rate": 2.8257910645009935e-07,
"loss": 0.5491,
"step": 561
},
{
"epoch": 2.708433734939759,
"grad_norm": 0.3321724236011505,
"learning_rate": 2.733243609455971e-07,
"loss": 0.5283,
"step": 562
},
{
"epoch": 2.7132530120481926,
"grad_norm": 0.34521132707595825,
"learning_rate": 2.6421944103256657e-07,
"loss": 0.5348,
"step": 563
},
{
"epoch": 2.7180722891566265,
"grad_norm": 0.34511709213256836,
"learning_rate": 2.5526463531765467e-07,
"loss": 0.5283,
"step": 564
},
{
"epoch": 2.7228915662650603,
"grad_norm": 0.3328828513622284,
"learning_rate": 2.4646022764920843e-07,
"loss": 0.5213,
"step": 565
},
{
"epoch": 2.727710843373494,
"grad_norm": 0.31696540117263794,
"learning_rate": 2.3780649710827552e-07,
"loss": 0.5261,
"step": 566
},
{
"epoch": 2.7325301204819277,
"grad_norm": 0.31569039821624756,
"learning_rate": 2.2930371799975593e-07,
"loss": 0.5176,
"step": 567
},
{
"epoch": 2.7373493975903616,
"grad_norm": 0.3491840660572052,
"learning_rate": 2.20952159843712e-07,
"loss": 0.5261,
"step": 568
},
{
"epoch": 2.742168674698795,
"grad_norm": 0.3705739974975586,
"learning_rate": 2.1275208736682262e-07,
"loss": 0.5104,
"step": 569
},
{
"epoch": 2.746987951807229,
"grad_norm": 0.34111452102661133,
"learning_rate": 2.0470376049398944e-07,
"loss": 0.5179,
"step": 570
},
{
"epoch": 2.7518072289156628,
"grad_norm": 0.3375121057033539,
"learning_rate": 1.9680743434010385e-07,
"loss": 0.532,
"step": 571
},
{
"epoch": 2.756626506024096,
"grad_norm": 0.34646302461624146,
"learning_rate": 1.8906335920195418e-07,
"loss": 0.5358,
"step": 572
},
{
"epoch": 2.76144578313253,
"grad_norm": 0.3552838861942291,
"learning_rate": 1.814717805502958e-07,
"loss": 0.529,
"step": 573
},
{
"epoch": 2.766265060240964,
"grad_norm": 0.344050794839859,
"learning_rate": 1.7403293902206851e-07,
"loss": 0.5296,
"step": 574
},
{
"epoch": 2.7710843373493974,
"grad_norm": 0.3431393802165985,
"learning_rate": 1.667470704127694e-07,
"loss": 0.5234,
"step": 575
},
{
"epoch": 2.7759036144578313,
"grad_norm": 0.33988475799560547,
"learning_rate": 1.5961440566897913e-07,
"loss": 0.5205,
"step": 576
},
{
"epoch": 2.780722891566265,
"grad_norm": 0.3232595920562744,
"learning_rate": 1.5263517088103862e-07,
"loss": 0.521,
"step": 577
},
{
"epoch": 2.7855421686746986,
"grad_norm": 0.32091447710990906,
"learning_rate": 1.4580958727588746e-07,
"loss": 0.5295,
"step": 578
},
{
"epoch": 2.7903614457831325,
"grad_norm": 0.3785247206687927,
"learning_rate": 1.3913787121004717e-07,
"loss": 0.5105,
"step": 579
},
{
"epoch": 2.7951807228915664,
"grad_norm": 0.3253907561302185,
"learning_rate": 1.3262023416276414e-07,
"loss": 0.5347,
"step": 580
},
{
"epoch": 2.8,
"grad_norm": 0.3325127363204956,
"learning_rate": 1.2625688272930925e-07,
"loss": 0.5201,
"step": 581
},
{
"epoch": 2.8048192771084337,
"grad_norm": 0.32310402393341064,
"learning_rate": 1.2004801861442373e-07,
"loss": 0.5328,
"step": 582
},
{
"epoch": 2.8096385542168676,
"grad_norm": 0.3170939087867737,
"learning_rate": 1.1399383862592928e-07,
"loss": 0.536,
"step": 583
},
{
"epoch": 2.814457831325301,
"grad_norm": 0.33208224177360535,
"learning_rate": 1.0809453466849029e-07,
"loss": 0.5444,
"step": 584
},
{
"epoch": 2.819277108433735,
"grad_norm": 0.30856242775917053,
"learning_rate": 1.0235029373752758e-07,
"loss": 0.5357,
"step": 585
},
{
"epoch": 2.824096385542169,
"grad_norm": 0.3055464029312134,
"learning_rate": 9.676129791329481e-08,
"loss": 0.5371,
"step": 586
},
{
"epoch": 2.8289156626506022,
"grad_norm": 0.3335382342338562,
"learning_rate": 9.132772435510362e-08,
"loss": 0.5267,
"step": 587
},
{
"epoch": 2.833734939759036,
"grad_norm": 0.33981144428253174,
"learning_rate": 8.604974529571042e-08,
"loss": 0.5368,
"step": 588
},
{
"epoch": 2.83855421686747,
"grad_norm": 0.3252619802951813,
"learning_rate": 8.092752803585513e-08,
"loss": 0.5216,
"step": 589
},
{
"epoch": 2.8433734939759034,
"grad_norm": 0.33591899275779724,
"learning_rate": 7.59612349389599e-08,
"loss": 0.5165,
"step": 590
},
{
"epoch": 2.8481927710843373,
"grad_norm": 0.3571572005748749,
"learning_rate": 7.115102342598101e-08,
"loss": 0.5313,
"step": 591
},
{
"epoch": 2.853012048192771,
"grad_norm": 0.3264915943145752,
"learning_rate": 6.649704597042061e-08,
"loss": 0.5343,
"step": 592
},
{
"epoch": 2.8578313253012047,
"grad_norm": 0.3618753254413605,
"learning_rate": 6.199945009349173e-08,
"loss": 0.4944,
"step": 593
},
{
"epoch": 2.8626506024096385,
"grad_norm": 0.33330121636390686,
"learning_rate": 5.7658378359443104e-08,
"loss": 0.547,
"step": 594
},
{
"epoch": 2.8674698795180724,
"grad_norm": 0.3385322391986847,
"learning_rate": 5.3473968371040575e-08,
"loss": 0.5333,
"step": 595
},
{
"epoch": 2.872289156626506,
"grad_norm": 0.33145639300346375,
"learning_rate": 4.944635276520393e-08,
"loss": 0.5362,
"step": 596
},
{
"epoch": 2.8771084337349397,
"grad_norm": 0.33541208505630493,
"learning_rate": 4.55756592088058e-08,
"loss": 0.5307,
"step": 597
},
{
"epoch": 2.8819277108433736,
"grad_norm": 0.32140061259269714,
"learning_rate": 4.186201039462046e-08,
"loss": 0.5274,
"step": 598
},
{
"epoch": 2.886746987951807,
"grad_norm": 0.3215773403644562,
"learning_rate": 3.8305524037438035e-08,
"loss": 0.5363,
"step": 599
},
{
"epoch": 2.891566265060241,
"grad_norm": 0.3285579979419708,
"learning_rate": 3.4906312870331973e-08,
"loss": 0.5268,
"step": 600
},
{
"epoch": 2.896385542168675,
"grad_norm": 0.3298654854297638,
"learning_rate": 3.166448464108629e-08,
"loss": 0.5326,
"step": 601
},
{
"epoch": 2.9012048192771083,
"grad_norm": 0.32575464248657227,
"learning_rate": 2.8580142108778354e-08,
"loss": 0.5243,
"step": 602
},
{
"epoch": 2.906024096385542,
"grad_norm": 0.34302592277526855,
"learning_rate": 2.5653383040524228e-08,
"loss": 0.5192,
"step": 603
},
{
"epoch": 2.910843373493976,
"grad_norm": 0.314879834651947,
"learning_rate": 2.2884300208378395e-08,
"loss": 0.5039,
"step": 604
},
{
"epoch": 2.9156626506024095,
"grad_norm": 0.33504053950309753,
"learning_rate": 2.0272981386393332e-08,
"loss": 0.5245,
"step": 605
},
{
"epoch": 2.9204819277108434,
"grad_norm": 0.3199070394039154,
"learning_rate": 1.781950934783505e-08,
"loss": 0.5315,
"step": 606
},
{
"epoch": 2.9253012048192772,
"grad_norm": 0.3318592607975006,
"learning_rate": 1.552396186256411e-08,
"loss": 0.5318,
"step": 607
},
{
"epoch": 2.9301204819277107,
"grad_norm": 0.30516648292541504,
"learning_rate": 1.3386411694565894e-08,
"loss": 0.5263,
"step": 608
},
{
"epoch": 2.9349397590361446,
"grad_norm": 0.31260839104652405,
"learning_rate": 1.1406926599646373e-08,
"loss": 0.5299,
"step": 609
},
{
"epoch": 2.9397590361445785,
"grad_norm": 0.334585040807724,
"learning_rate": 9.585569323284915e-09,
"loss": 0.5403,
"step": 610
},
{
"epoch": 2.944578313253012,
"grad_norm": 0.343319296836853,
"learning_rate": 7.922397598642551e-09,
"loss": 0.5216,
"step": 611
},
{
"epoch": 2.9493975903614458,
"grad_norm": 0.3292441964149475,
"learning_rate": 6.417464144736208e-09,
"loss": 0.5248,
"step": 612
},
{
"epoch": 2.9542168674698797,
"grad_norm": 0.35108089447021484,
"learning_rate": 5.0708166647628345e-09,
"loss": 0.5206,
"step": 613
},
{
"epoch": 2.959036144578313,
"grad_norm": 0.33806362748146057,
"learning_rate": 3.88249784459227e-09,
"loss": 0.5416,
"step": 614
},
{
"epoch": 2.963855421686747,
"grad_norm": 0.304426908493042,
"learning_rate": 2.8525453514099966e-09,
"loss": 0.5359,
"step": 615
},
{
"epoch": 2.968674698795181,
"grad_norm": 0.34446102380752563,
"learning_rate": 1.980991832524759e-09,
"loss": 0.5275,
"step": 616
},
{
"epoch": 2.9734939759036143,
"grad_norm": 0.3511168956756592,
"learning_rate": 1.2678649143349485e-09,
"loss": 0.5181,
"step": 617
},
{
"epoch": 2.978313253012048,
"grad_norm": 0.37222975492477417,
"learning_rate": 7.131872014509711e-10,
"loss": 0.5235,
"step": 618
},
{
"epoch": 2.983132530120482,
"grad_norm": 0.3414348363876343,
"learning_rate": 3.1697627597970794e-10,
"loss": 0.5343,
"step": 619
},
{
"epoch": 2.9879518072289155,
"grad_norm": 0.3310386836528778,
"learning_rate": 7.924469696718451e-11,
"loss": 0.5362,
"step": 620
},
{
"epoch": 2.9927710843373494,
"grad_norm": 0.31892916560173035,
"learning_rate": 0.0,
"loss": 0.5334,
"step": 621
},
{
"epoch": 2.9927710843373494,
"step": 621,
"total_flos": 8.029368648088945e+17,
"train_loss": 0.5880239765233272,
"train_runtime": 15009.674,
"train_samples_per_second": 3.978,
"train_steps_per_second": 0.041
}
],
"logging_steps": 1,
"max_steps": 621,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.029368648088945e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}