{
"best_global_step": 72000,
"best_metric": 3.5293209552764893,
"best_model_checkpoint": "/scratch/cl5625/exceptions/models/resemble_to_hit_frequency_5039/checkpoint-40000",
"epoch": 29.13752913752914,
"eval_steps": 1000,
"global_step": 100000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.014568764568764568,
"grad_norm": 1.6134361028671265,
"learning_rate": 0.000294,
"loss": 8.4822,
"step": 50
},
{
"epoch": 0.029137529137529136,
"grad_norm": 0.6467522382736206,
"learning_rate": 0.0005939999999999999,
"loss": 6.7172,
"step": 100
},
{
"epoch": 0.043706293706293704,
"grad_norm": 0.4256949722766876,
"learning_rate": 0.0005998285714285713,
"loss": 6.3649,
"step": 150
},
{
"epoch": 0.05827505827505827,
"grad_norm": 0.4922082722187042,
"learning_rate": 0.0005996536443148687,
"loss": 6.1487,
"step": 200
},
{
"epoch": 0.07284382284382285,
"grad_norm": 0.4709911346435547,
"learning_rate": 0.0005994787172011662,
"loss": 6.0174,
"step": 250
},
{
"epoch": 0.08741258741258741,
"grad_norm": 0.48052042722702026,
"learning_rate": 0.0005993037900874635,
"loss": 5.8773,
"step": 300
},
{
"epoch": 0.10198135198135198,
"grad_norm": 0.5878971219062805,
"learning_rate": 0.0005991288629737609,
"loss": 5.7603,
"step": 350
},
{
"epoch": 0.11655011655011654,
"grad_norm": 0.4191853702068329,
"learning_rate": 0.0005989539358600582,
"loss": 5.6447,
"step": 400
},
{
"epoch": 0.13111888111888112,
"grad_norm": 0.49607983231544495,
"learning_rate": 0.0005987790087463557,
"loss": 5.5178,
"step": 450
},
{
"epoch": 0.1456876456876457,
"grad_norm": 0.4248947501182556,
"learning_rate": 0.000598604081632653,
"loss": 5.4198,
"step": 500
},
{
"epoch": 0.16025641025641027,
"grad_norm": 0.48206138610839844,
"learning_rate": 0.0005984291545189504,
"loss": 5.3384,
"step": 550
},
{
"epoch": 0.17482517482517482,
"grad_norm": 0.4465309679508209,
"learning_rate": 0.0005982542274052477,
"loss": 5.2645,
"step": 600
},
{
"epoch": 0.1893939393939394,
"grad_norm": 0.42823970317840576,
"learning_rate": 0.0005980793002915452,
"loss": 5.2088,
"step": 650
},
{
"epoch": 0.20396270396270397,
"grad_norm": 0.4172956943511963,
"learning_rate": 0.0005979043731778425,
"loss": 5.1419,
"step": 700
},
{
"epoch": 0.21853146853146854,
"grad_norm": 0.424402117729187,
"learning_rate": 0.0005977294460641399,
"loss": 5.0645,
"step": 750
},
{
"epoch": 0.2331002331002331,
"grad_norm": 0.4406491816043854,
"learning_rate": 0.0005975545189504372,
"loss": 5.0224,
"step": 800
},
{
"epoch": 0.24766899766899766,
"grad_norm": 0.4717820882797241,
"learning_rate": 0.0005973795918367347,
"loss": 5.003,
"step": 850
},
{
"epoch": 0.26223776223776224,
"grad_norm": 0.4521999657154083,
"learning_rate": 0.000597204664723032,
"loss": 4.9142,
"step": 900
},
{
"epoch": 0.2768065268065268,
"grad_norm": 0.4754863679409027,
"learning_rate": 0.0005970297376093294,
"loss": 4.8762,
"step": 950
},
{
"epoch": 0.2913752913752914,
"grad_norm": 0.41961297392845154,
"learning_rate": 0.0005968548104956268,
"loss": 4.8402,
"step": 1000
},
{
"epoch": 0.2913752913752914,
"eval_accuracy": 0.2549595710849709,
"eval_loss": 4.753758430480957,
"eval_runtime": 180.4427,
"eval_samples_per_second": 92.229,
"eval_steps_per_second": 5.769,
"step": 1000
},
{
"epoch": 0.30594405594405594,
"grad_norm": 0.6563605070114136,
"learning_rate": 0.0005966798833819242,
"loss": 4.7815,
"step": 1050
},
{
"epoch": 0.32051282051282054,
"grad_norm": 0.4702153503894806,
"learning_rate": 0.0005965049562682215,
"loss": 4.7461,
"step": 1100
},
{
"epoch": 0.3350815850815851,
"grad_norm": 0.4264092743396759,
"learning_rate": 0.0005963300291545189,
"loss": 4.6878,
"step": 1150
},
{
"epoch": 0.34965034965034963,
"grad_norm": 0.4903077185153961,
"learning_rate": 0.0005961551020408162,
"loss": 4.6656,
"step": 1200
},
{
"epoch": 0.36421911421911424,
"grad_norm": 0.4931991994380951,
"learning_rate": 0.0005959801749271137,
"loss": 4.6333,
"step": 1250
},
{
"epoch": 0.3787878787878788,
"grad_norm": 0.43286022543907166,
"learning_rate": 0.000595805247813411,
"loss": 4.6066,
"step": 1300
},
{
"epoch": 0.39335664335664333,
"grad_norm": 0.40360233187675476,
"learning_rate": 0.0005956303206997084,
"loss": 4.5706,
"step": 1350
},
{
"epoch": 0.40792540792540793,
"grad_norm": 0.420491486787796,
"learning_rate": 0.0005954553935860059,
"loss": 4.5591,
"step": 1400
},
{
"epoch": 0.4224941724941725,
"grad_norm": 0.4152667820453644,
"learning_rate": 0.0005952804664723032,
"loss": 4.5331,
"step": 1450
},
{
"epoch": 0.4370629370629371,
"grad_norm": 0.4153015613555908,
"learning_rate": 0.0005951055393586005,
"loss": 4.5102,
"step": 1500
},
{
"epoch": 0.45163170163170163,
"grad_norm": 0.4187549352645874,
"learning_rate": 0.0005949306122448979,
"loss": 4.4927,
"step": 1550
},
{
"epoch": 0.4662004662004662,
"grad_norm": 0.4402385354042053,
"learning_rate": 0.0005947556851311952,
"loss": 4.4652,
"step": 1600
},
{
"epoch": 0.4807692307692308,
"grad_norm": 0.41887184977531433,
"learning_rate": 0.0005945807580174927,
"loss": 4.4431,
"step": 1650
},
{
"epoch": 0.49533799533799533,
"grad_norm": 0.4349214434623718,
"learning_rate": 0.00059440583090379,
"loss": 4.4302,
"step": 1700
},
{
"epoch": 0.5099067599067599,
"grad_norm": 0.416457861661911,
"learning_rate": 0.0005942309037900874,
"loss": 4.4188,
"step": 1750
},
{
"epoch": 0.5244755244755245,
"grad_norm": 0.3888656198978424,
"learning_rate": 0.0005940559766763847,
"loss": 4.3824,
"step": 1800
},
{
"epoch": 0.539044289044289,
"grad_norm": 0.38429805636405945,
"learning_rate": 0.0005938810495626822,
"loss": 4.38,
"step": 1850
},
{
"epoch": 0.5536130536130536,
"grad_norm": 0.4373445510864258,
"learning_rate": 0.0005937061224489796,
"loss": 4.3664,
"step": 1900
},
{
"epoch": 0.5681818181818182,
"grad_norm": 0.43909236788749695,
"learning_rate": 0.0005935311953352769,
"loss": 4.3394,
"step": 1950
},
{
"epoch": 0.5827505827505828,
"grad_norm": 0.3650919795036316,
"learning_rate": 0.0005933562682215743,
"loss": 4.3353,
"step": 2000
},
{
"epoch": 0.5827505827505828,
"eval_accuracy": 0.30004649542771444,
"eval_loss": 4.282717704772949,
"eval_runtime": 180.3042,
"eval_samples_per_second": 92.3,
"eval_steps_per_second": 5.774,
"step": 2000
},
{
"epoch": 0.5973193473193473,
"grad_norm": 0.4164070188999176,
"learning_rate": 0.0005931813411078717,
"loss": 4.3193,
"step": 2050
},
{
"epoch": 0.6118881118881119,
"grad_norm": 0.370237797498703,
"learning_rate": 0.000593006413994169,
"loss": 4.3105,
"step": 2100
},
{
"epoch": 0.6264568764568764,
"grad_norm": 0.4082745611667633,
"learning_rate": 0.0005928314868804664,
"loss": 4.2941,
"step": 2150
},
{
"epoch": 0.6410256410256411,
"grad_norm": 0.3935624957084656,
"learning_rate": 0.0005926565597667638,
"loss": 4.2833,
"step": 2200
},
{
"epoch": 0.6555944055944056,
"grad_norm": 0.3984358310699463,
"learning_rate": 0.0005924816326530612,
"loss": 4.2679,
"step": 2250
},
{
"epoch": 0.6701631701631702,
"grad_norm": 0.3854668140411377,
"learning_rate": 0.0005923067055393586,
"loss": 4.2746,
"step": 2300
},
{
"epoch": 0.6847319347319347,
"grad_norm": 0.37700507044792175,
"learning_rate": 0.0005921317784256559,
"loss": 4.2429,
"step": 2350
},
{
"epoch": 0.6993006993006993,
"grad_norm": 0.3662063777446747,
"learning_rate": 0.0005919568513119533,
"loss": 4.2584,
"step": 2400
},
{
"epoch": 0.7138694638694638,
"grad_norm": 0.38568246364593506,
"learning_rate": 0.0005917819241982507,
"loss": 4.2344,
"step": 2450
},
{
"epoch": 0.7284382284382285,
"grad_norm": 0.43455713987350464,
"learning_rate": 0.000591606997084548,
"loss": 4.2328,
"step": 2500
},
{
"epoch": 0.743006993006993,
"grad_norm": 0.3856061100959778,
"learning_rate": 0.0005914320699708454,
"loss": 4.2092,
"step": 2550
},
{
"epoch": 0.7575757575757576,
"grad_norm": 0.38549911975860596,
"learning_rate": 0.0005912571428571428,
"loss": 4.218,
"step": 2600
},
{
"epoch": 0.7721445221445221,
"grad_norm": 0.39746782183647156,
"learning_rate": 0.0005910822157434402,
"loss": 4.1968,
"step": 2650
},
{
"epoch": 0.7867132867132867,
"grad_norm": 0.39923539757728577,
"learning_rate": 0.0005909072886297376,
"loss": 4.1886,
"step": 2700
},
{
"epoch": 0.8012820512820513,
"grad_norm": 0.3931720554828644,
"learning_rate": 0.0005907323615160349,
"loss": 4.1767,
"step": 2750
},
{
"epoch": 0.8158508158508159,
"grad_norm": 0.3459513783454895,
"learning_rate": 0.0005905574344023324,
"loss": 4.1785,
"step": 2800
},
{
"epoch": 0.8304195804195804,
"grad_norm": 0.39997783303260803,
"learning_rate": 0.0005903825072886297,
"loss": 4.1691,
"step": 2850
},
{
"epoch": 0.844988344988345,
"grad_norm": 0.3813166618347168,
"learning_rate": 0.000590207580174927,
"loss": 4.1596,
"step": 2900
},
{
"epoch": 0.8595571095571095,
"grad_norm": 0.37703993916511536,
"learning_rate": 0.0005900326530612244,
"loss": 4.1536,
"step": 2950
},
{
"epoch": 0.8741258741258742,
"grad_norm": 0.3787032961845398,
"learning_rate": 0.0005898577259475218,
"loss": 4.1411,
"step": 3000
},
{
"epoch": 0.8741258741258742,
"eval_accuracy": 0.31596194853706383,
"eval_loss": 4.094162940979004,
"eval_runtime": 180.2783,
"eval_samples_per_second": 92.313,
"eval_steps_per_second": 5.774,
"step": 3000
},
{
"epoch": 0.8886946386946387,
"grad_norm": 0.35312145948410034,
"learning_rate": 0.0005896827988338192,
"loss": 4.1438,
"step": 3050
},
{
"epoch": 0.9032634032634033,
"grad_norm": 0.38526853919029236,
"learning_rate": 0.0005895078717201166,
"loss": 4.1301,
"step": 3100
},
{
"epoch": 0.9178321678321678,
"grad_norm": 0.36466994881629944,
"learning_rate": 0.000589332944606414,
"loss": 4.1235,
"step": 3150
},
{
"epoch": 0.9324009324009324,
"grad_norm": 0.3571998178958893,
"learning_rate": 0.0005891580174927114,
"loss": 4.1292,
"step": 3200
},
{
"epoch": 0.946969696969697,
"grad_norm": 0.3403795063495636,
"learning_rate": 0.0005889830903790087,
"loss": 4.1082,
"step": 3250
},
{
"epoch": 0.9615384615384616,
"grad_norm": 0.38671016693115234,
"learning_rate": 0.000588808163265306,
"loss": 4.096,
"step": 3300
},
{
"epoch": 0.9761072261072261,
"grad_norm": 0.3343498408794403,
"learning_rate": 0.0005886332361516035,
"loss": 4.0914,
"step": 3350
},
{
"epoch": 0.9906759906759907,
"grad_norm": 0.3740348815917969,
"learning_rate": 0.0005884583090379008,
"loss": 4.1069,
"step": 3400
},
{
"epoch": 1.0052447552447552,
"grad_norm": 0.37788498401641846,
"learning_rate": 0.0005882833819241982,
"loss": 4.0461,
"step": 3450
},
{
"epoch": 1.0198135198135199,
"grad_norm": 0.3331277072429657,
"learning_rate": 0.0005881084548104955,
"loss": 4.0172,
"step": 3500
},
{
"epoch": 1.0343822843822843,
"grad_norm": 0.38395169377326965,
"learning_rate": 0.000587933527696793,
"loss": 4.014,
"step": 3550
},
{
"epoch": 1.048951048951049,
"grad_norm": 0.35354799032211304,
"learning_rate": 0.0005877586005830904,
"loss": 4.0174,
"step": 3600
},
{
"epoch": 1.0635198135198136,
"grad_norm": 0.35139200091362,
"learning_rate": 0.0005875836734693877,
"loss": 4.0323,
"step": 3650
},
{
"epoch": 1.078088578088578,
"grad_norm": 0.36168310046195984,
"learning_rate": 0.0005874087463556851,
"loss": 4.0024,
"step": 3700
},
{
"epoch": 1.0926573426573427,
"grad_norm": 0.3537745773792267,
"learning_rate": 0.0005872338192419825,
"loss": 4.0106,
"step": 3750
},
{
"epoch": 1.1072261072261071,
"grad_norm": 0.3509289026260376,
"learning_rate": 0.0005870588921282798,
"loss": 3.9951,
"step": 3800
},
{
"epoch": 1.1217948717948718,
"grad_norm": 0.3399880826473236,
"learning_rate": 0.0005868839650145772,
"loss": 3.9968,
"step": 3850
},
{
"epoch": 1.1363636363636362,
"grad_norm": 0.31629034876823425,
"learning_rate": 0.0005867090379008745,
"loss": 3.9957,
"step": 3900
},
{
"epoch": 1.150932400932401,
"grad_norm": 0.34750989079475403,
"learning_rate": 0.000586534110787172,
"loss": 3.9916,
"step": 3950
},
{
"epoch": 1.1655011655011656,
"grad_norm": 0.34375834465026855,
"learning_rate": 0.0005863591836734694,
"loss": 3.994,
"step": 4000
},
{
"epoch": 1.1655011655011656,
"eval_accuracy": 0.3255772359138492,
"eval_loss": 3.989028215408325,
"eval_runtime": 180.1395,
"eval_samples_per_second": 92.384,
"eval_steps_per_second": 5.779,
"step": 4000
},
{
"epoch": 1.18006993006993,
"grad_norm": 0.37413716316223145,
"learning_rate": 0.0005861842565597667,
"loss": 3.9981,
"step": 4050
},
{
"epoch": 1.1946386946386947,
"grad_norm": 0.35629984736442566,
"learning_rate": 0.0005860093294460641,
"loss": 3.975,
"step": 4100
},
{
"epoch": 1.2092074592074593,
"grad_norm": 0.3381233215332031,
"learning_rate": 0.0005858344023323615,
"loss": 3.9851,
"step": 4150
},
{
"epoch": 1.2237762237762237,
"grad_norm": 0.34585943818092346,
"learning_rate": 0.0005856594752186588,
"loss": 3.9808,
"step": 4200
},
{
"epoch": 1.2383449883449884,
"grad_norm": 0.35993272066116333,
"learning_rate": 0.0005854845481049562,
"loss": 3.975,
"step": 4250
},
{
"epoch": 1.2529137529137528,
"grad_norm": 0.32540130615234375,
"learning_rate": 0.0005853096209912535,
"loss": 3.9714,
"step": 4300
},
{
"epoch": 1.2674825174825175,
"grad_norm": 0.36224445700645447,
"learning_rate": 0.000585134693877551,
"loss": 3.9754,
"step": 4350
},
{
"epoch": 1.282051282051282,
"grad_norm": 0.3662620186805725,
"learning_rate": 0.0005849597667638484,
"loss": 3.9694,
"step": 4400
},
{
"epoch": 1.2966200466200466,
"grad_norm": 0.35438838601112366,
"learning_rate": 0.0005847848396501457,
"loss": 3.9519,
"step": 4450
},
{
"epoch": 1.3111888111888113,
"grad_norm": 0.34450942277908325,
"learning_rate": 0.0005846099125364432,
"loss": 3.963,
"step": 4500
},
{
"epoch": 1.3257575757575757,
"grad_norm": 0.351962685585022,
"learning_rate": 0.0005844349854227405,
"loss": 3.9591,
"step": 4550
},
{
"epoch": 1.3403263403263403,
"grad_norm": 0.3839578926563263,
"learning_rate": 0.0005842600583090379,
"loss": 3.9561,
"step": 4600
},
{
"epoch": 1.354895104895105,
"grad_norm": 0.32113179564476013,
"learning_rate": 0.0005840851311953352,
"loss": 3.949,
"step": 4650
},
{
"epoch": 1.3694638694638694,
"grad_norm": 0.33071938157081604,
"learning_rate": 0.0005839102040816325,
"loss": 3.9608,
"step": 4700
},
{
"epoch": 1.384032634032634,
"grad_norm": 0.33803558349609375,
"learning_rate": 0.00058373527696793,
"loss": 3.9482,
"step": 4750
},
{
"epoch": 1.3986013986013985,
"grad_norm": 0.31636884808540344,
"learning_rate": 0.0005835603498542273,
"loss": 3.9437,
"step": 4800
},
{
"epoch": 1.4131701631701632,
"grad_norm": 0.3646225035190582,
"learning_rate": 0.0005833854227405247,
"loss": 3.9303,
"step": 4850
},
{
"epoch": 1.4277389277389276,
"grad_norm": 0.3559642732143402,
"learning_rate": 0.0005832104956268222,
"loss": 3.9403,
"step": 4900
},
{
"epoch": 1.4423076923076923,
"grad_norm": 0.3481752276420593,
"learning_rate": 0.0005830355685131195,
"loss": 3.9357,
"step": 4950
},
{
"epoch": 1.456876456876457,
"grad_norm": 0.313125878572464,
"learning_rate": 0.0005828606413994169,
"loss": 3.9303,
"step": 5000
},
{
"epoch": 1.456876456876457,
"eval_accuracy": 0.3321257535516557,
"eval_loss": 3.9129536151885986,
"eval_runtime": 180.4532,
"eval_samples_per_second": 92.223,
"eval_steps_per_second": 5.769,
"step": 5000
},
{
"epoch": 1.4714452214452214,
"grad_norm": 0.33051010966300964,
"learning_rate": 0.0005826857142857142,
"loss": 3.9226,
"step": 5050
},
{
"epoch": 1.486013986013986,
"grad_norm": 0.3060428500175476,
"learning_rate": 0.0005825107871720116,
"loss": 3.9254,
"step": 5100
},
{
"epoch": 1.5005827505827507,
"grad_norm": 0.34262314438819885,
"learning_rate": 0.000582335860058309,
"loss": 3.9131,
"step": 5150
},
{
"epoch": 1.5151515151515151,
"grad_norm": 0.33539673686027527,
"learning_rate": 0.0005821609329446063,
"loss": 3.9158,
"step": 5200
},
{
"epoch": 1.5297202797202796,
"grad_norm": 0.3277048170566559,
"learning_rate": 0.0005819860058309037,
"loss": 3.9228,
"step": 5250
},
{
"epoch": 1.5442890442890445,
"grad_norm": 0.31714221835136414,
"learning_rate": 0.0005818110787172012,
"loss": 3.9245,
"step": 5300
},
{
"epoch": 1.558857808857809,
"grad_norm": 0.329098105430603,
"learning_rate": 0.0005816361516034985,
"loss": 3.9212,
"step": 5350
},
{
"epoch": 1.5734265734265733,
"grad_norm": 0.33248335123062134,
"learning_rate": 0.0005814612244897959,
"loss": 3.9066,
"step": 5400
},
{
"epoch": 1.587995337995338,
"grad_norm": 0.3300471305847168,
"learning_rate": 0.0005812862973760932,
"loss": 3.9076,
"step": 5450
},
{
"epoch": 1.6025641025641026,
"grad_norm": 0.3110630214214325,
"learning_rate": 0.0005811113702623907,
"loss": 3.8996,
"step": 5500
},
{
"epoch": 1.617132867132867,
"grad_norm": 0.34096479415893555,
"learning_rate": 0.000580936443148688,
"loss": 3.8914,
"step": 5550
},
{
"epoch": 1.6317016317016317,
"grad_norm": 0.3256978690624237,
"learning_rate": 0.0005807615160349853,
"loss": 3.8901,
"step": 5600
},
{
"epoch": 1.6462703962703964,
"grad_norm": 0.3170398771762848,
"learning_rate": 0.0005805865889212827,
"loss": 3.9086,
"step": 5650
},
{
"epoch": 1.6608391608391608,
"grad_norm": 0.32134151458740234,
"learning_rate": 0.0005804116618075802,
"loss": 3.8843,
"step": 5700
},
{
"epoch": 1.6754079254079253,
"grad_norm": 0.3455315828323364,
"learning_rate": 0.0005802367346938775,
"loss": 3.8936,
"step": 5750
},
{
"epoch": 1.68997668997669,
"grad_norm": 0.33487361669540405,
"learning_rate": 0.0005800618075801749,
"loss": 3.903,
"step": 5800
},
{
"epoch": 1.7045454545454546,
"grad_norm": 0.3249671459197998,
"learning_rate": 0.0005798868804664722,
"loss": 3.8913,
"step": 5850
},
{
"epoch": 1.719114219114219,
"grad_norm": 0.35598769783973694,
"learning_rate": 0.0005797119533527697,
"loss": 3.8821,
"step": 5900
},
{
"epoch": 1.7336829836829837,
"grad_norm": 0.34034013748168945,
"learning_rate": 0.000579537026239067,
"loss": 3.8849,
"step": 5950
},
{
"epoch": 1.7482517482517483,
"grad_norm": 0.33674389123916626,
"learning_rate": 0.0005793620991253643,
"loss": 3.8992,
"step": 6000
},
{
"epoch": 1.7482517482517483,
"eval_accuracy": 0.33753787307759514,
"eval_loss": 3.85577654838562,
"eval_runtime": 180.1447,
"eval_samples_per_second": 92.381,
"eval_steps_per_second": 5.779,
"step": 6000
},
{
"epoch": 1.7628205128205128,
"grad_norm": 0.32885122299194336,
"learning_rate": 0.0005791871720116617,
"loss": 3.8805,
"step": 6050
},
{
"epoch": 1.7773892773892774,
"grad_norm": 0.32068461179733276,
"learning_rate": 0.0005790122448979591,
"loss": 3.8668,
"step": 6100
},
{
"epoch": 1.791958041958042,
"grad_norm": 0.3308079242706299,
"learning_rate": 0.0005788373177842565,
"loss": 3.8776,
"step": 6150
},
{
"epoch": 1.8065268065268065,
"grad_norm": 0.32728639245033264,
"learning_rate": 0.0005786623906705539,
"loss": 3.8633,
"step": 6200
},
{
"epoch": 1.821095571095571,
"grad_norm": 0.3404487073421478,
"learning_rate": 0.0005784874635568512,
"loss": 3.8712,
"step": 6250
},
{
"epoch": 1.8356643356643356,
"grad_norm": 0.32237741351127625,
"learning_rate": 0.0005783125364431487,
"loss": 3.8582,
"step": 6300
},
{
"epoch": 1.8502331002331003,
"grad_norm": 0.3479669392108917,
"learning_rate": 0.000578137609329446,
"loss": 3.8647,
"step": 6350
},
{
"epoch": 1.8648018648018647,
"grad_norm": 0.3184560239315033,
"learning_rate": 0.0005779626822157434,
"loss": 3.847,
"step": 6400
},
{
"epoch": 1.8793706293706294,
"grad_norm": 0.3197358548641205,
"learning_rate": 0.0005777877551020408,
"loss": 3.8617,
"step": 6450
},
{
"epoch": 1.893939393939394,
"grad_norm": 0.2957116663455963,
"learning_rate": 0.0005776128279883381,
"loss": 3.854,
"step": 6500
},
{
"epoch": 1.9085081585081585,
"grad_norm": 0.3220060169696808,
"learning_rate": 0.0005774379008746355,
"loss": 3.851,
"step": 6550
},
{
"epoch": 1.9230769230769231,
"grad_norm": 0.3108726441860199,
"learning_rate": 0.0005772629737609329,
"loss": 3.8559,
"step": 6600
},
{
"epoch": 1.9376456876456878,
"grad_norm": 0.33560416102409363,
"learning_rate": 0.0005770880466472303,
"loss": 3.8505,
"step": 6650
},
{
"epoch": 1.9522144522144522,
"grad_norm": 0.33253157138824463,
"learning_rate": 0.0005769131195335277,
"loss": 3.8468,
"step": 6700
},
{
"epoch": 1.9667832167832167,
"grad_norm": 0.3143483102321625,
"learning_rate": 0.000576738192419825,
"loss": 3.8416,
"step": 6750
},
{
"epoch": 1.9813519813519813,
"grad_norm": 0.32564249634742737,
"learning_rate": 0.0005765632653061224,
"loss": 3.843,
"step": 6800
},
{
"epoch": 1.995920745920746,
"grad_norm": 0.33519217371940613,
"learning_rate": 0.0005763883381924198,
"loss": 3.8431,
"step": 6850
},
{
"epoch": 2.0104895104895104,
"grad_norm": 0.32294219732284546,
"learning_rate": 0.0005762134110787171,
"loss": 3.7722,
"step": 6900
},
{
"epoch": 2.025058275058275,
"grad_norm": 0.3262682557106018,
"learning_rate": 0.0005760384839650145,
"loss": 3.7428,
"step": 6950
},
{
"epoch": 2.0396270396270397,
"grad_norm": 0.3397265374660492,
"learning_rate": 0.0005758635568513119,
"loss": 3.7487,
"step": 7000
},
{
"epoch": 2.0396270396270397,
"eval_accuracy": 0.3415744146738347,
"eval_loss": 3.8127987384796143,
"eval_runtime": 180.1348,
"eval_samples_per_second": 92.386,
"eval_steps_per_second": 5.779,
"step": 7000
},
{
"epoch": 2.054195804195804,
"grad_norm": 0.3330610990524292,
"learning_rate": 0.0005756886297376093,
"loss": 3.7405,
"step": 7050
},
{
"epoch": 2.0687645687645686,
"grad_norm": 0.3221195638179779,
"learning_rate": 0.0005755137026239067,
"loss": 3.7561,
"step": 7100
},
{
"epoch": 2.0833333333333335,
"grad_norm": 0.32453685998916626,
"learning_rate": 0.000575338775510204,
"loss": 3.7532,
"step": 7150
},
{
"epoch": 2.097902097902098,
"grad_norm": 0.3615976870059967,
"learning_rate": 0.0005751638483965014,
"loss": 3.7618,
"step": 7200
},
{
"epoch": 2.1124708624708624,
"grad_norm": 0.323742538690567,
"learning_rate": 0.0005749889212827988,
"loss": 3.7508,
"step": 7250
},
{
"epoch": 2.1270396270396272,
"grad_norm": 0.3381347954273224,
"learning_rate": 0.0005748139941690962,
"loss": 3.7588,
"step": 7300
},
{
"epoch": 2.1416083916083917,
"grad_norm": 0.3426363468170166,
"learning_rate": 0.0005746390670553935,
"loss": 3.7579,
"step": 7350
},
{
"epoch": 2.156177156177156,
"grad_norm": 0.31964731216430664,
"learning_rate": 0.000574464139941691,
"loss": 3.7528,
"step": 7400
},
{
"epoch": 2.1707459207459205,
"grad_norm": 0.3354383111000061,
"learning_rate": 0.0005742892128279883,
"loss": 3.7556,
"step": 7450
},
{
"epoch": 2.1853146853146854,
"grad_norm": 0.3251858353614807,
"learning_rate": 0.0005741142857142857,
"loss": 3.7556,
"step": 7500
},
{
"epoch": 2.19988344988345,
"grad_norm": 0.3399089276790619,
"learning_rate": 0.000573939358600583,
"loss": 3.7415,
"step": 7550
},
{
"epoch": 2.2144522144522143,
"grad_norm": 0.3444349467754364,
"learning_rate": 0.0005737644314868805,
"loss": 3.7515,
"step": 7600
},
{
"epoch": 2.229020979020979,
"grad_norm": 0.31715652346611023,
"learning_rate": 0.0005735895043731778,
"loss": 3.7618,
"step": 7650
},
{
"epoch": 2.2435897435897436,
"grad_norm": 0.34369540214538574,
"learning_rate": 0.0005734145772594752,
"loss": 3.7687,
"step": 7700
},
{
"epoch": 2.258158508158508,
"grad_norm": 0.3494495153427124,
"learning_rate": 0.0005732396501457726,
"loss": 3.748,
"step": 7750
},
{
"epoch": 2.2727272727272725,
"grad_norm": 0.31449177861213684,
"learning_rate": 0.0005730647230320698,
"loss": 3.7541,
"step": 7800
},
{
"epoch": 2.2872960372960374,
"grad_norm": 0.3397660553455353,
"learning_rate": 0.0005728897959183673,
"loss": 3.7624,
"step": 7850
},
{
"epoch": 2.301864801864802,
"grad_norm": 0.34240466356277466,
"learning_rate": 0.0005727148688046647,
"loss": 3.7432,
"step": 7900
},
{
"epoch": 2.3164335664335667,
"grad_norm": 0.3217261731624603,
"learning_rate": 0.000572539941690962,
"loss": 3.7499,
"step": 7950
},
{
"epoch": 2.331002331002331,
"grad_norm": 0.3246598243713379,
"learning_rate": 0.0005723650145772595,
"loss": 3.7619,
"step": 8000
},
{
"epoch": 2.331002331002331,
"eval_accuracy": 0.34476242059382917,
"eval_loss": 3.7828927040100098,
"eval_runtime": 179.9962,
"eval_samples_per_second": 92.458,
"eval_steps_per_second": 5.783,
"step": 8000
},
{
"epoch": 2.3455710955710956,
"grad_norm": 0.3367806673049927,
"learning_rate": 0.0005721900874635568,
"loss": 3.7485,
"step": 8050
},
{
"epoch": 2.36013986013986,
"grad_norm": 0.3171541392803192,
"learning_rate": 0.0005720151603498542,
"loss": 3.7546,
"step": 8100
},
{
"epoch": 2.374708624708625,
"grad_norm": 0.33225518465042114,
"learning_rate": 0.0005718402332361515,
"loss": 3.7429,
"step": 8150
},
{
"epoch": 2.3892773892773893,
"grad_norm": 0.3193056881427765,
"learning_rate": 0.000571665306122449,
"loss": 3.7622,
"step": 8200
},
{
"epoch": 2.4038461538461537,
"grad_norm": 0.3187880218029022,
"learning_rate": 0.0005714903790087463,
"loss": 3.7435,
"step": 8250
},
{
"epoch": 2.4184149184149186,
"grad_norm": 0.33991068601608276,
"learning_rate": 0.0005713154518950437,
"loss": 3.7494,
"step": 8300
},
{
"epoch": 2.432983682983683,
"grad_norm": 0.3092400133609772,
"learning_rate": 0.000571140524781341,
"loss": 3.7612,
"step": 8350
},
{
"epoch": 2.4475524475524475,
"grad_norm": 0.31092721223831177,
"learning_rate": 0.0005709655976676385,
"loss": 3.7488,
"step": 8400
},
{
"epoch": 2.462121212121212,
"grad_norm": 0.32930874824523926,
"learning_rate": 0.0005707906705539358,
"loss": 3.758,
"step": 8450
},
{
"epoch": 2.476689976689977,
"grad_norm": 0.32361528277397156,
"learning_rate": 0.0005706157434402332,
"loss": 3.7454,
"step": 8500
},
{
"epoch": 2.4912587412587412,
"grad_norm": 0.33115440607070923,
"learning_rate": 0.0005704408163265305,
"loss": 3.7402,
"step": 8550
},
{
"epoch": 2.5058275058275057,
"grad_norm": 0.328485369682312,
"learning_rate": 0.000570265889212828,
"loss": 3.7372,
"step": 8600
},
{
"epoch": 2.5203962703962706,
"grad_norm": 0.35709500312805176,
"learning_rate": 0.0005700909620991253,
"loss": 3.7461,
"step": 8650
},
{
"epoch": 2.534965034965035,
"grad_norm": 0.32163530588150024,
"learning_rate": 0.0005699160349854227,
"loss": 3.7541,
"step": 8700
},
{
"epoch": 2.5495337995337994,
"grad_norm": 0.31789329648017883,
"learning_rate": 0.00056974110787172,
"loss": 3.7438,
"step": 8750
},
{
"epoch": 2.564102564102564,
"grad_norm": 0.3170648515224457,
"learning_rate": 0.0005695661807580175,
"loss": 3.7557,
"step": 8800
},
{
"epoch": 2.5786713286713288,
"grad_norm": 0.3424239158630371,
"learning_rate": 0.0005693912536443148,
"loss": 3.7398,
"step": 8850
},
{
"epoch": 2.593240093240093,
"grad_norm": 0.318135529756546,
"learning_rate": 0.0005692163265306122,
"loss": 3.7284,
"step": 8900
},
{
"epoch": 2.607808857808858,
"grad_norm": 0.33802515268325806,
"learning_rate": 0.0005690413994169095,
"loss": 3.738,
"step": 8950
},
{
"epoch": 2.6223776223776225,
"grad_norm": 0.32018738985061646,
"learning_rate": 0.000568866472303207,
"loss": 3.74,
"step": 9000
},
{
"epoch": 2.6223776223776225,
"eval_accuracy": 0.3477287677347602,
"eval_loss": 3.751537799835205,
"eval_runtime": 180.2979,
"eval_samples_per_second": 92.303,
"eval_steps_per_second": 5.774,
"step": 9000
},
{
"epoch": 2.636946386946387,
"grad_norm": 0.3212384283542633,
"learning_rate": 0.0005686915451895044,
"loss": 3.7381,
"step": 9050
},
{
"epoch": 2.6515151515151514,
"grad_norm": 0.3253323435783386,
"learning_rate": 0.0005685166180758016,
"loss": 3.739,
"step": 9100
},
{
"epoch": 2.666083916083916,
"grad_norm": 0.3387431502342224,
"learning_rate": 0.000568341690962099,
"loss": 3.7248,
"step": 9150
},
{
"epoch": 2.6806526806526807,
"grad_norm": 0.32496801018714905,
"learning_rate": 0.0005681667638483965,
"loss": 3.7298,
"step": 9200
},
{
"epoch": 2.695221445221445,
"grad_norm": 0.32816433906555176,
"learning_rate": 0.0005679918367346938,
"loss": 3.7296,
"step": 9250
},
{
"epoch": 2.70979020979021,
"grad_norm": 0.3408059775829315,
"learning_rate": 0.0005678169096209912,
"loss": 3.7364,
"step": 9300
},
{
"epoch": 2.7243589743589745,
"grad_norm": 0.33964434266090393,
"learning_rate": 0.0005676419825072885,
"loss": 3.7332,
"step": 9350
},
{
"epoch": 2.738927738927739,
"grad_norm": 0.31630218029022217,
"learning_rate": 0.000567467055393586,
"loss": 3.7283,
"step": 9400
},
{
"epoch": 2.7534965034965033,
"grad_norm": 0.34303176403045654,
"learning_rate": 0.0005672921282798833,
"loss": 3.7337,
"step": 9450
},
{
"epoch": 2.768065268065268,
"grad_norm": 0.30772241950035095,
"learning_rate": 0.0005671172011661807,
"loss": 3.7223,
"step": 9500
},
{
"epoch": 2.7826340326340326,
"grad_norm": 0.3346325755119324,
"learning_rate": 0.000566942274052478,
"loss": 3.7366,
"step": 9550
},
{
"epoch": 2.797202797202797,
"grad_norm": 0.321429580450058,
"learning_rate": 0.0005667673469387755,
"loss": 3.7289,
"step": 9600
},
{
"epoch": 2.811771561771562,
"grad_norm": 0.3273778259754181,
"learning_rate": 0.0005665924198250728,
"loss": 3.7253,
"step": 9650
},
{
"epoch": 2.8263403263403264,
"grad_norm": 0.33299872279167175,
"learning_rate": 0.0005664174927113702,
"loss": 3.7264,
"step": 9700
},
{
"epoch": 2.840909090909091,
"grad_norm": 0.31705546379089355,
"learning_rate": 0.0005662425655976676,
"loss": 3.7263,
"step": 9750
},
{
"epoch": 2.8554778554778553,
"grad_norm": 0.34314480423927307,
"learning_rate": 0.000566067638483965,
"loss": 3.7151,
"step": 9800
},
{
"epoch": 2.87004662004662,
"grad_norm": 0.32017573714256287,
"learning_rate": 0.0005658927113702623,
"loss": 3.7329,
"step": 9850
},
{
"epoch": 2.8846153846153846,
"grad_norm": 0.31930816173553467,
"learning_rate": 0.0005657177842565597,
"loss": 3.7235,
"step": 9900
},
{
"epoch": 2.8991841491841495,
"grad_norm": 0.31949570775032043,
"learning_rate": 0.0005655428571428572,
"loss": 3.7227,
"step": 9950
},
{
"epoch": 2.913752913752914,
"grad_norm": 0.30999991297721863,
"learning_rate": 0.0005653679300291545,
"loss": 3.7152,
"step": 10000
},
{
"epoch": 2.913752913752914,
"eval_accuracy": 0.34985880864932545,
"eval_loss": 3.7270307540893555,
"eval_runtime": 180.2671,
"eval_samples_per_second": 92.319,
"eval_steps_per_second": 5.775,
"step": 10000
},
{
"epoch": 2.9283216783216783,
"grad_norm": 0.3184822201728821,
"learning_rate": 0.0005651930029154518,
"loss": 3.7289,
"step": 10050
},
{
"epoch": 2.9428904428904428,
"grad_norm": 0.31392183899879456,
"learning_rate": 0.0005650180758017492,
"loss": 3.7275,
"step": 10100
},
{
"epoch": 2.957459207459207,
"grad_norm": 0.3100379407405853,
"learning_rate": 0.0005648431486880466,
"loss": 3.7078,
"step": 10150
},
{
"epoch": 2.972027972027972,
"grad_norm": 0.3107777237892151,
"learning_rate": 0.000564668221574344,
"loss": 3.7191,
"step": 10200
},
{
"epoch": 2.9865967365967365,
"grad_norm": 0.31457746028900146,
"learning_rate": 0.0005644932944606413,
"loss": 3.7216,
"step": 10250
},
{
"epoch": 3.001165501165501,
"grad_norm": 0.3300207555294037,
"learning_rate": 0.0005643183673469387,
"loss": 3.7241,
"step": 10300
},
{
"epoch": 3.015734265734266,
"grad_norm": 0.33615049719810486,
"learning_rate": 0.0005641434402332362,
"loss": 3.6097,
"step": 10350
},
{
"epoch": 3.0303030303030303,
"grad_norm": 0.32839542627334595,
"learning_rate": 0.0005639685131195335,
"loss": 3.6192,
"step": 10400
},
{
"epoch": 3.0448717948717947,
"grad_norm": 0.32775548100471497,
"learning_rate": 0.0005637935860058308,
"loss": 3.6201,
"step": 10450
},
{
"epoch": 3.0594405594405596,
"grad_norm": 0.3305208086967468,
"learning_rate": 0.0005636186588921282,
"loss": 3.6244,
"step": 10500
},
{
"epoch": 3.074009324009324,
"grad_norm": 0.3248291015625,
"learning_rate": 0.0005634437317784256,
"loss": 3.6289,
"step": 10550
},
{
"epoch": 3.0885780885780885,
"grad_norm": 0.334089070558548,
"learning_rate": 0.000563268804664723,
"loss": 3.6128,
"step": 10600
},
{
"epoch": 3.1031468531468533,
"grad_norm": 0.33667150139808655,
"learning_rate": 0.0005630938775510203,
"loss": 3.6316,
"step": 10650
},
{
"epoch": 3.117715617715618,
"grad_norm": 0.3139183223247528,
"learning_rate": 0.0005629189504373177,
"loss": 3.6267,
"step": 10700
},
{
"epoch": 3.132284382284382,
"grad_norm": 0.3240184187889099,
"learning_rate": 0.0005627440233236151,
"loss": 3.6155,
"step": 10750
},
{
"epoch": 3.1468531468531467,
"grad_norm": 0.3177716135978699,
"learning_rate": 0.0005625690962099125,
"loss": 3.6157,
"step": 10800
},
{
"epoch": 3.1614219114219115,
"grad_norm": 0.32491302490234375,
"learning_rate": 0.0005623941690962099,
"loss": 3.6529,
"step": 10850
},
{
"epoch": 3.175990675990676,
"grad_norm": 0.3269357681274414,
"learning_rate": 0.0005622192419825073,
"loss": 3.6252,
"step": 10900
},
{
"epoch": 3.1905594405594404,
"grad_norm": 0.33358559012413025,
"learning_rate": 0.0005620443148688046,
"loss": 3.6477,
"step": 10950
},
{
"epoch": 3.2051282051282053,
"grad_norm": 0.32112857699394226,
"learning_rate": 0.000561869387755102,
"loss": 3.6367,
"step": 11000
},
{
"epoch": 3.2051282051282053,
"eval_accuracy": 0.3516898159961675,
"eval_loss": 3.7140629291534424,
"eval_runtime": 180.3296,
"eval_samples_per_second": 92.287,
"eval_steps_per_second": 5.773,
"step": 11000
},
{
"epoch": 3.2196969696969697,
"grad_norm": 0.328512042760849,
"learning_rate": 0.0005616944606413993,
"loss": 3.6396,
"step": 11050
},
{
"epoch": 3.234265734265734,
"grad_norm": 0.3449825644493103,
"learning_rate": 0.0005615195335276968,
"loss": 3.6327,
"step": 11100
},
{
"epoch": 3.248834498834499,
"grad_norm": 0.32266926765441895,
"learning_rate": 0.0005613446064139941,
"loss": 3.6382,
"step": 11150
},
{
"epoch": 3.2634032634032635,
"grad_norm": 0.3263072073459625,
"learning_rate": 0.0005611696793002915,
"loss": 3.6265,
"step": 11200
},
{
"epoch": 3.277972027972028,
"grad_norm": 0.32438746094703674,
"learning_rate": 0.0005609947521865889,
"loss": 3.6519,
"step": 11250
},
{
"epoch": 3.2925407925407923,
"grad_norm": 0.3556417226791382,
"learning_rate": 0.0005608198250728863,
"loss": 3.6388,
"step": 11300
},
{
"epoch": 3.3071095571095572,
"grad_norm": 0.31459367275238037,
"learning_rate": 0.0005606448979591836,
"loss": 3.6413,
"step": 11350
},
{
"epoch": 3.3216783216783217,
"grad_norm": 0.3164815902709961,
"learning_rate": 0.000560469970845481,
"loss": 3.6394,
"step": 11400
},
{
"epoch": 3.336247086247086,
"grad_norm": 0.3238040804862976,
"learning_rate": 0.0005602950437317783,
"loss": 3.639,
"step": 11450
},
{
"epoch": 3.350815850815851,
"grad_norm": 0.31536027789115906,
"learning_rate": 0.0005601201166180758,
"loss": 3.651,
"step": 11500
},
{
"epoch": 3.3653846153846154,
"grad_norm": 0.3251273036003113,
"learning_rate": 0.0005599451895043731,
"loss": 3.6398,
"step": 11550
},
{
"epoch": 3.37995337995338,
"grad_norm": 0.3183720111846924,
"learning_rate": 0.0005597702623906705,
"loss": 3.6425,
"step": 11600
},
{
"epoch": 3.3945221445221447,
"grad_norm": 0.3452969193458557,
"learning_rate": 0.0005595953352769679,
"loss": 3.6396,
"step": 11650
},
{
"epoch": 3.409090909090909,
"grad_norm": 0.31187903881073,
"learning_rate": 0.0005594204081632653,
"loss": 3.6399,
"step": 11700
},
{
"epoch": 3.4236596736596736,
"grad_norm": 0.3159955143928528,
"learning_rate": 0.0005592454810495627,
"loss": 3.6371,
"step": 11750
},
{
"epoch": 3.438228438228438,
"grad_norm": 0.3242449462413788,
"learning_rate": 0.00055907055393586,
"loss": 3.6376,
"step": 11800
},
{
"epoch": 3.452797202797203,
"grad_norm": 0.33960285782814026,
"learning_rate": 0.0005588956268221573,
"loss": 3.6397,
"step": 11850
},
{
"epoch": 3.4673659673659674,
"grad_norm": 0.34514838457107544,
"learning_rate": 0.0005587206997084548,
"loss": 3.6349,
"step": 11900
},
{
"epoch": 3.481934731934732,
"grad_norm": 0.33326658606529236,
"learning_rate": 0.0005585457725947521,
"loss": 3.6432,
"step": 11950
},
{
"epoch": 3.4965034965034967,
"grad_norm": 0.3219590187072754,
"learning_rate": 0.0005583708454810495,
"loss": 3.642,
"step": 12000
},
{
"epoch": 3.4965034965034967,
"eval_accuracy": 0.35353681570054407,
"eval_loss": 3.697685480117798,
"eval_runtime": 180.3495,
"eval_samples_per_second": 92.276,
"eval_steps_per_second": 5.772,
"step": 12000
},
{
"epoch": 3.511072261072261,
"grad_norm": 0.315857470035553,
"learning_rate": 0.0005581959183673468,
"loss": 3.6484,
"step": 12050
},
{
"epoch": 3.5256410256410255,
"grad_norm": 0.33714818954467773,
"learning_rate": 0.0005580209912536443,
"loss": 3.6465,
"step": 12100
},
{
"epoch": 3.54020979020979,
"grad_norm": 0.3196263909339905,
"learning_rate": 0.0005578460641399417,
"loss": 3.6444,
"step": 12150
},
{
"epoch": 3.554778554778555,
"grad_norm": 0.34034839272499084,
"learning_rate": 0.000557671137026239,
"loss": 3.6403,
"step": 12200
},
{
"epoch": 3.5693473193473193,
"grad_norm": 0.32852211594581604,
"learning_rate": 0.0005574962099125363,
"loss": 3.6461,
"step": 12250
},
{
"epoch": 3.583916083916084,
"grad_norm": 0.3598001003265381,
"learning_rate": 0.0005573212827988338,
"loss": 3.6392,
"step": 12300
},
{
"epoch": 3.5984848484848486,
"grad_norm": 0.3342962861061096,
"learning_rate": 0.0005571463556851311,
"loss": 3.6414,
"step": 12350
},
{
"epoch": 3.613053613053613,
"grad_norm": 0.316803514957428,
"learning_rate": 0.0005569714285714285,
"loss": 3.6486,
"step": 12400
},
{
"epoch": 3.6276223776223775,
"grad_norm": 0.31796908378601074,
"learning_rate": 0.0005567965014577258,
"loss": 3.6369,
"step": 12450
},
{
"epoch": 3.642191142191142,
"grad_norm": 0.309007465839386,
"learning_rate": 0.0005566215743440233,
"loss": 3.6429,
"step": 12500
},
{
"epoch": 3.656759906759907,
"grad_norm": 0.3321513831615448,
"learning_rate": 0.0005564466472303207,
"loss": 3.6487,
"step": 12550
},
{
"epoch": 3.6713286713286712,
"grad_norm": 0.35138118267059326,
"learning_rate": 0.000556271720116618,
"loss": 3.6527,
"step": 12600
},
{
"epoch": 3.685897435897436,
"grad_norm": 0.3067615032196045,
"learning_rate": 0.0005560967930029155,
"loss": 3.6444,
"step": 12650
},
{
"epoch": 3.7004662004662006,
"grad_norm": 0.33694183826446533,
"learning_rate": 0.0005559218658892128,
"loss": 3.6325,
"step": 12700
},
{
"epoch": 3.715034965034965,
"grad_norm": 0.31776705384254456,
"learning_rate": 0.0005557469387755101,
"loss": 3.6527,
"step": 12750
},
{
"epoch": 3.7296037296037294,
"grad_norm": 0.3377169668674469,
"learning_rate": 0.0005555720116618075,
"loss": 3.6424,
"step": 12800
},
{
"epoch": 3.7441724941724943,
"grad_norm": 0.3101692199707031,
"learning_rate": 0.0005553970845481049,
"loss": 3.6359,
"step": 12850
},
{
"epoch": 3.7587412587412588,
"grad_norm": 0.3166581392288208,
"learning_rate": 0.0005552221574344023,
"loss": 3.6416,
"step": 12900
},
{
"epoch": 3.773310023310023,
"grad_norm": 0.31438636779785156,
"learning_rate": 0.0005550472303206997,
"loss": 3.6336,
"step": 12950
},
{
"epoch": 3.787878787878788,
"grad_norm": 0.3247930705547333,
"learning_rate": 0.000554872303206997,
"loss": 3.6416,
"step": 13000
},
{
"epoch": 3.787878787878788,
"eval_accuracy": 0.3546786229921654,
"eval_loss": 3.6786322593688965,
"eval_runtime": 180.419,
"eval_samples_per_second": 92.241,
"eval_steps_per_second": 5.77,
"step": 13000
},
{
"epoch": 3.8024475524475525,
"grad_norm": 0.3598824441432953,
"learning_rate": 0.0005546973760932945,
"loss": 3.6428,
"step": 13050
},
{
"epoch": 3.817016317016317,
"grad_norm": 0.32811933755874634,
"learning_rate": 0.0005545224489795918,
"loss": 3.6448,
"step": 13100
},
{
"epoch": 3.8315850815850814,
"grad_norm": 0.3222385346889496,
"learning_rate": 0.0005543475218658891,
"loss": 3.6489,
"step": 13150
},
{
"epoch": 3.8461538461538463,
"grad_norm": 0.326913058757782,
"learning_rate": 0.0005541725947521865,
"loss": 3.6217,
"step": 13200
},
{
"epoch": 3.8607226107226107,
"grad_norm": 0.31770044565200806,
"learning_rate": 0.0005539976676384839,
"loss": 3.6383,
"step": 13250
},
{
"epoch": 3.875291375291375,
"grad_norm": 0.3197103440761566,
"learning_rate": 0.0005538227405247813,
"loss": 3.6432,
"step": 13300
},
{
"epoch": 3.88986013986014,
"grad_norm": 0.33483409881591797,
"learning_rate": 0.0005536478134110787,
"loss": 3.6325,
"step": 13350
},
{
"epoch": 3.9044289044289044,
"grad_norm": 0.3026617765426636,
"learning_rate": 0.000553472886297376,
"loss": 3.6343,
"step": 13400
},
{
"epoch": 3.918997668997669,
"grad_norm": 0.2976735532283783,
"learning_rate": 0.0005532979591836735,
"loss": 3.6483,
"step": 13450
},
{
"epoch": 3.9335664335664333,
"grad_norm": 0.3455604612827301,
"learning_rate": 0.0005531230320699708,
"loss": 3.6413,
"step": 13500
},
{
"epoch": 3.948135198135198,
"grad_norm": 0.3204672932624817,
"learning_rate": 0.0005529481049562682,
"loss": 3.6384,
"step": 13550
},
{
"epoch": 3.9627039627039626,
"grad_norm": 0.340648889541626,
"learning_rate": 0.0005527731778425655,
"loss": 3.6425,
"step": 13600
},
{
"epoch": 3.9772727272727275,
"grad_norm": 0.3379724323749542,
"learning_rate": 0.0005525982507288629,
"loss": 3.6327,
"step": 13650
},
{
"epoch": 3.991841491841492,
"grad_norm": 0.3036077320575714,
"learning_rate": 0.0005524233236151603,
"loss": 3.6375,
"step": 13700
},
{
"epoch": 4.006410256410256,
"grad_norm": 0.34318360686302185,
"learning_rate": 0.0005522483965014576,
"loss": 3.5803,
"step": 13750
},
{
"epoch": 4.020979020979021,
"grad_norm": 0.3264276087284088,
"learning_rate": 0.000552073469387755,
"loss": 3.5362,
"step": 13800
},
{
"epoch": 4.035547785547785,
"grad_norm": 0.3238934278488159,
"learning_rate": 0.0005518985422740525,
"loss": 3.5332,
"step": 13850
},
{
"epoch": 4.05011655011655,
"grad_norm": 0.32926997542381287,
"learning_rate": 0.0005517236151603498,
"loss": 3.5372,
"step": 13900
},
{
"epoch": 4.064685314685315,
"grad_norm": 0.32314813137054443,
"learning_rate": 0.0005515486880466472,
"loss": 3.5272,
"step": 13950
},
{
"epoch": 4.0792540792540795,
"grad_norm": 0.3332814872264862,
"learning_rate": 0.0005513737609329446,
"loss": 3.5382,
"step": 14000
},
{
"epoch": 4.0792540792540795,
"eval_accuracy": 0.3562127134068402,
"eval_loss": 3.6715903282165527,
"eval_runtime": 180.15,
"eval_samples_per_second": 92.379,
"eval_steps_per_second": 5.779,
"step": 14000
},
{
"epoch": 4.093822843822844,
"grad_norm": 0.33132901787757874,
"learning_rate": 0.0005511988338192419,
"loss": 3.549,
"step": 14050
},
{
"epoch": 4.108391608391608,
"grad_norm": 0.32595717906951904,
"learning_rate": 0.0005510239067055393,
"loss": 3.5445,
"step": 14100
},
{
"epoch": 4.122960372960373,
"grad_norm": 0.3297913670539856,
"learning_rate": 0.0005508489795918366,
"loss": 3.5392,
"step": 14150
},
{
"epoch": 4.137529137529137,
"grad_norm": 0.35622304677963257,
"learning_rate": 0.0005506740524781341,
"loss": 3.5387,
"step": 14200
},
{
"epoch": 4.1520979020979025,
"grad_norm": 0.32156145572662354,
"learning_rate": 0.0005504991253644315,
"loss": 3.5461,
"step": 14250
},
{
"epoch": 4.166666666666667,
"grad_norm": Infinity,
"learning_rate": 0.0005503241982507288,
"loss": 3.5555,
"step": 14300
},
{
"epoch": 4.181235431235431,
"grad_norm": 0.32054704427719116,
"learning_rate": 0.0005501492711370262,
"loss": 3.5634,
"step": 14350
},
{
"epoch": 4.195804195804196,
"grad_norm": 0.3304331302642822,
"learning_rate": 0.0005499743440233236,
"loss": 3.557,
"step": 14400
},
{
"epoch": 4.21037296037296,
"grad_norm": 0.33280083537101746,
"learning_rate": 0.000549799416909621,
"loss": 3.5636,
"step": 14450
},
{
"epoch": 4.224941724941725,
"grad_norm": 0.3097744584083557,
"learning_rate": 0.0005496244897959183,
"loss": 3.5591,
"step": 14500
},
{
"epoch": 4.239510489510489,
"grad_norm": 0.3197658658027649,
"learning_rate": 0.0005494495626822156,
"loss": 3.5661,
"step": 14550
},
{
"epoch": 4.2540792540792545,
"grad_norm": 0.3759899437427521,
"learning_rate": 0.0005492746355685131,
"loss": 3.5621,
"step": 14600
},
{
"epoch": 4.268648018648019,
"grad_norm": 0.34865570068359375,
"learning_rate": 0.0005490997084548105,
"loss": 3.5642,
"step": 14650
},
{
"epoch": 4.283216783216783,
"grad_norm": 0.3441263735294342,
"learning_rate": 0.0005489247813411078,
"loss": 3.5676,
"step": 14700
},
{
"epoch": 4.297785547785548,
"grad_norm": 0.33596622943878174,
"learning_rate": 0.0005487498542274052,
"loss": 3.5693,
"step": 14750
},
{
"epoch": 4.312354312354312,
"grad_norm": 0.3372125029563904,
"learning_rate": 0.0005485749271137026,
"loss": 3.5674,
"step": 14800
},
{
"epoch": 4.326923076923077,
"grad_norm": 0.3590675890445709,
"learning_rate": 0.0005484,
"loss": 3.5677,
"step": 14850
},
{
"epoch": 4.341491841491841,
"grad_norm": 0.3344537615776062,
"learning_rate": 0.0005482250728862973,
"loss": 3.5582,
"step": 14900
},
{
"epoch": 4.356060606060606,
"grad_norm": 0.3320492208003998,
"learning_rate": 0.0005480501457725946,
"loss": 3.5648,
"step": 14950
},
{
"epoch": 4.370629370629371,
"grad_norm": 0.336557537317276,
"learning_rate": 0.0005478752186588921,
"loss": 3.5647,
"step": 15000
},
{
"epoch": 4.370629370629371,
"eval_accuracy": 0.3576591986276676,
"eval_loss": 3.6587440967559814,
"eval_runtime": 180.1159,
"eval_samples_per_second": 92.396,
"eval_steps_per_second": 5.78,
"step": 15000
},
{
"epoch": 4.385198135198135,
"grad_norm": 0.3224494159221649,
"learning_rate": 0.0005477002915451894,
"loss": 3.5672,
"step": 15050
},
{
"epoch": 4.3997668997669,
"grad_norm": 0.3005123734474182,
"learning_rate": 0.0005475253644314868,
"loss": 3.567,
"step": 15100
},
{
"epoch": 4.414335664335664,
"grad_norm": 0.31343790888786316,
"learning_rate": 0.0005473504373177842,
"loss": 3.5575,
"step": 15150
},
{
"epoch": 4.428904428904429,
"grad_norm": 0.3172782361507416,
"learning_rate": 0.0005471755102040816,
"loss": 3.5665,
"step": 15200
},
{
"epoch": 4.443473193473194,
"grad_norm": 0.32356658577919006,
"learning_rate": 0.000547000583090379,
"loss": 3.5787,
"step": 15250
},
{
"epoch": 4.458041958041958,
"grad_norm": 0.35370585322380066,
"learning_rate": 0.0005468256559766763,
"loss": 3.5677,
"step": 15300
},
{
"epoch": 4.472610722610723,
"grad_norm": 0.32113948464393616,
"learning_rate": 0.0005466507288629738,
"loss": 3.5588,
"step": 15350
},
{
"epoch": 4.487179487179487,
"grad_norm": 0.3236648738384247,
"learning_rate": 0.0005464758017492711,
"loss": 3.5723,
"step": 15400
},
{
"epoch": 4.501748251748252,
"grad_norm": 0.32024386525154114,
"learning_rate": 0.0005463008746355684,
"loss": 3.582,
"step": 15450
},
{
"epoch": 4.516317016317016,
"grad_norm": 0.34335383772850037,
"learning_rate": 0.0005461259475218658,
"loss": 3.5728,
"step": 15500
},
{
"epoch": 4.5308857808857805,
"grad_norm": 0.3075568377971649,
"learning_rate": 0.0005459510204081633,
"loss": 3.5652,
"step": 15550
},
{
"epoch": 4.545454545454545,
"grad_norm": 0.3292197585105896,
"learning_rate": 0.0005457760932944606,
"loss": 3.565,
"step": 15600
},
{
"epoch": 4.56002331002331,
"grad_norm": 0.35107719898223877,
"learning_rate": 0.000545601166180758,
"loss": 3.5702,
"step": 15650
},
{
"epoch": 4.574592074592075,
"grad_norm": 0.3471798598766327,
"learning_rate": 0.0005454262390670553,
"loss": 3.5681,
"step": 15700
},
{
"epoch": 4.589160839160839,
"grad_norm": 0.31821051239967346,
"learning_rate": 0.0005452513119533528,
"loss": 3.582,
"step": 15750
},
{
"epoch": 4.603729603729604,
"grad_norm": 0.3309209644794464,
"learning_rate": 0.0005450763848396501,
"loss": 3.5883,
"step": 15800
},
{
"epoch": 4.618298368298368,
"grad_norm": 0.33727866411209106,
"learning_rate": 0.0005449014577259474,
"loss": 3.5817,
"step": 15850
},
{
"epoch": 4.632867132867133,
"grad_norm": 0.3144679069519043,
"learning_rate": 0.0005447265306122448,
"loss": 3.5724,
"step": 15900
},
{
"epoch": 4.647435897435898,
"grad_norm": 0.32342618703842163,
"learning_rate": 0.0005445516034985423,
"loss": 3.5855,
"step": 15950
},
{
"epoch": 4.662004662004662,
"grad_norm": 0.3141750395298004,
"learning_rate": 0.0005443766763848396,
"loss": 3.5807,
"step": 16000
},
{
"epoch": 4.662004662004662,
"eval_accuracy": 0.3584899780834147,
"eval_loss": 3.6451687812805176,
"eval_runtime": 180.0666,
"eval_samples_per_second": 92.421,
"eval_steps_per_second": 5.781,
"step": 16000
},
{
"epoch": 4.676573426573427,
"grad_norm": 0.318861186504364,
"learning_rate": 0.000544201749271137,
"loss": 3.5705,
"step": 16050
},
{
"epoch": 4.691142191142191,
"grad_norm": 0.31984490156173706,
"learning_rate": 0.0005440268221574343,
"loss": 3.5858,
"step": 16100
},
{
"epoch": 4.7057109557109555,
"grad_norm": 0.3313526511192322,
"learning_rate": 0.0005438518950437318,
"loss": 3.5778,
"step": 16150
},
{
"epoch": 4.72027972027972,
"grad_norm": 0.332089900970459,
"learning_rate": 0.0005436769679300291,
"loss": 3.5776,
"step": 16200
},
{
"epoch": 4.734848484848484,
"grad_norm": 0.33302974700927734,
"learning_rate": 0.0005435020408163265,
"loss": 3.5832,
"step": 16250
},
{
"epoch": 4.74941724941725,
"grad_norm": 0.3242354691028595,
"learning_rate": 0.0005433271137026238,
"loss": 3.5848,
"step": 16300
},
{
"epoch": 4.763986013986014,
"grad_norm": 0.3078085482120514,
"learning_rate": 0.0005431521865889212,
"loss": 3.5824,
"step": 16350
},
{
"epoch": 4.778554778554779,
"grad_norm": 0.3317912220954895,
"learning_rate": 0.0005429772594752186,
"loss": 3.5782,
"step": 16400
},
{
"epoch": 4.793123543123543,
"grad_norm": 0.30730515718460083,
"learning_rate": 0.000542802332361516,
"loss": 3.5779,
"step": 16450
},
{
"epoch": 4.8076923076923075,
"grad_norm": 0.35136038064956665,
"learning_rate": 0.0005426274052478133,
"loss": 3.583,
"step": 16500
},
{
"epoch": 4.822261072261072,
"grad_norm": 0.3428604304790497,
"learning_rate": 0.0005424524781341108,
"loss": 3.578,
"step": 16550
},
{
"epoch": 4.836829836829837,
"grad_norm": 0.3045051693916321,
"learning_rate": 0.0005422775510204081,
"loss": 3.5811,
"step": 16600
},
{
"epoch": 4.851398601398602,
"grad_norm": 0.3164063096046448,
"learning_rate": 0.0005421026239067055,
"loss": 3.5821,
"step": 16650
},
{
"epoch": 4.865967365967366,
"grad_norm": 0.33561450242996216,
"learning_rate": 0.0005419276967930028,
"loss": 3.5749,
"step": 16700
},
{
"epoch": 4.880536130536131,
"grad_norm": 0.3375592529773712,
"learning_rate": 0.0005417527696793002,
"loss": 3.5713,
"step": 16750
},
{
"epoch": 4.895104895104895,
"grad_norm": 0.3262588083744049,
"learning_rate": 0.0005415778425655976,
"loss": 3.5773,
"step": 16800
},
{
"epoch": 4.909673659673659,
"grad_norm": 0.33031025528907776,
"learning_rate": 0.000541402915451895,
"loss": 3.5719,
"step": 16850
},
{
"epoch": 4.924242424242424,
"grad_norm": 0.32215115427970886,
"learning_rate": 0.0005412279883381923,
"loss": 3.5679,
"step": 16900
},
{
"epoch": 4.938811188811189,
"grad_norm": 0.3194146156311035,
"learning_rate": 0.0005410530612244898,
"loss": 3.5837,
"step": 16950
},
{
"epoch": 4.953379953379954,
"grad_norm": 0.3187941312789917,
"learning_rate": 0.0005408781341107871,
"loss": 3.5693,
"step": 17000
},
{
"epoch": 4.953379953379954,
"eval_accuracy": 0.35988401777879797,
"eval_loss": 3.632936477661133,
"eval_runtime": 180.0341,
"eval_samples_per_second": 92.438,
"eval_steps_per_second": 5.782,
"step": 17000
},
{
"epoch": 4.967948717948718,
"grad_norm": 0.32214635610580444,
"learning_rate": 0.0005407032069970845,
"loss": 3.5817,
"step": 17050
},
{
"epoch": 4.9825174825174825,
"grad_norm": 0.3381812870502472,
"learning_rate": 0.0005405282798833819,
"loss": 3.5721,
"step": 17100
},
{
"epoch": 4.997086247086247,
"grad_norm": 0.328273206949234,
"learning_rate": 0.0005403533527696793,
"loss": 3.5866,
"step": 17150
},
{
"epoch": 5.011655011655011,
"grad_norm": 0.32486042380332947,
"learning_rate": 0.0005401784256559766,
"loss": 3.4864,
"step": 17200
},
{
"epoch": 5.026223776223776,
"grad_norm": 0.3191656172275543,
"learning_rate": 0.000540003498542274,
"loss": 3.4736,
"step": 17250
},
{
"epoch": 5.040792540792541,
"grad_norm": 0.3504127264022827,
"learning_rate": 0.0005398285714285714,
"loss": 3.469,
"step": 17300
},
{
"epoch": 5.055361305361306,
"grad_norm": 0.3454863727092743,
"learning_rate": 0.0005396536443148688,
"loss": 3.4665,
"step": 17350
},
{
"epoch": 5.06993006993007,
"grad_norm": 0.30901169776916504,
"learning_rate": 0.0005394787172011661,
"loss": 3.4741,
"step": 17400
},
{
"epoch": 5.084498834498834,
"grad_norm": 0.33311742544174194,
"learning_rate": 0.0005393037900874635,
"loss": 3.4876,
"step": 17450
},
{
"epoch": 5.099067599067599,
"grad_norm": 0.33518463373184204,
"learning_rate": 0.0005391288629737609,
"loss": 3.4755,
"step": 17500
},
{
"epoch": 5.113636363636363,
"grad_norm": 0.33938467502593994,
"learning_rate": 0.0005389539358600583,
"loss": 3.489,
"step": 17550
},
{
"epoch": 5.128205128205128,
"grad_norm": 0.3346013128757477,
"learning_rate": 0.0005387790087463557,
"loss": 3.4899,
"step": 17600
},
{
"epoch": 5.142773892773893,
"grad_norm": 0.3396677076816559,
"learning_rate": 0.0005386040816326529,
"loss": 3.4791,
"step": 17650
},
{
"epoch": 5.1573426573426575,
"grad_norm": 0.32493624091148376,
"learning_rate": 0.0005384291545189504,
"loss": 3.4999,
"step": 17700
},
{
"epoch": 5.171911421911422,
"grad_norm": 0.34523579478263855,
"learning_rate": 0.0005382542274052478,
"loss": 3.4942,
"step": 17750
},
{
"epoch": 5.186480186480186,
"grad_norm": 0.34241601824760437,
"learning_rate": 0.0005380793002915451,
"loss": 3.4986,
"step": 17800
},
{
"epoch": 5.201048951048951,
"grad_norm": 0.3449043035507202,
"learning_rate": 0.0005379043731778425,
"loss": 3.5096,
"step": 17850
},
{
"epoch": 5.215617715617715,
"grad_norm": 0.33027029037475586,
"learning_rate": 0.0005377294460641399,
"loss": 3.5021,
"step": 17900
},
{
"epoch": 5.230186480186481,
"grad_norm": 0.33586353063583374,
"learning_rate": 0.0005375545189504373,
"loss": 3.4964,
"step": 17950
},
{
"epoch": 5.244755244755245,
"grad_norm": 0.3348841071128845,
"learning_rate": 0.0005373795918367346,
"loss": 3.5152,
"step": 18000
},
{
"epoch": 5.244755244755245,
"eval_accuracy": 0.3607333765910926,
"eval_loss": 3.6339569091796875,
"eval_runtime": 180.5183,
"eval_samples_per_second": 92.19,
"eval_steps_per_second": 5.767,
"step": 18000
},
{
"epoch": 5.2593240093240095,
"grad_norm": 0.33033329248428345,
"learning_rate": 0.000537204664723032,
"loss": 3.4922,
"step": 18050
},
{
"epoch": 5.273892773892774,
"grad_norm": 0.32480764389038086,
"learning_rate": 0.0005370297376093294,
"loss": 3.5049,
"step": 18100
},
{
"epoch": 5.288461538461538,
"grad_norm": 0.3114669919013977,
"learning_rate": 0.0005368548104956268,
"loss": 3.5045,
"step": 18150
},
{
"epoch": 5.303030303030303,
"grad_norm": 0.32912948727607727,
"learning_rate": 0.0005366798833819241,
"loss": 3.5039,
"step": 18200
},
{
"epoch": 5.317599067599067,
"grad_norm": 0.325888067483902,
"learning_rate": 0.0005365049562682215,
"loss": 3.5107,
"step": 18250
},
{
"epoch": 5.3321678321678325,
"grad_norm": 0.3258603811264038,
"learning_rate": 0.0005363300291545189,
"loss": 3.5079,
"step": 18300
},
{
"epoch": 5.346736596736597,
"grad_norm": 0.34344643354415894,
"learning_rate": 0.0005361551020408163,
"loss": 3.5056,
"step": 18350
},
{
"epoch": 5.361305361305361,
"grad_norm": 0.34246399998664856,
"learning_rate": 0.0005359801749271136,
"loss": 3.5118,
"step": 18400
},
{
"epoch": 5.375874125874126,
"grad_norm": 0.35261663794517517,
"learning_rate": 0.000535805247813411,
"loss": 3.5154,
"step": 18450
},
{
"epoch": 5.39044289044289,
"grad_norm": 0.33429020643234253,
"learning_rate": 0.0005356303206997085,
"loss": 3.515,
"step": 18500
},
{
"epoch": 5.405011655011655,
"grad_norm": 0.3388688266277313,
"learning_rate": 0.0005354553935860058,
"loss": 3.5011,
"step": 18550
},
{
"epoch": 5.41958041958042,
"grad_norm": 0.31441932916641235,
"learning_rate": 0.0005352804664723031,
"loss": 3.524,
"step": 18600
},
{
"epoch": 5.4341491841491845,
"grad_norm": 0.33346623182296753,
"learning_rate": 0.0005351055393586006,
"loss": 3.5096,
"step": 18650
},
{
"epoch": 5.448717948717949,
"grad_norm": 0.3645952045917511,
"learning_rate": 0.0005349306122448979,
"loss": 3.5162,
"step": 18700
},
{
"epoch": 5.463286713286713,
"grad_norm": 0.3252617120742798,
"learning_rate": 0.0005347556851311953,
"loss": 3.5166,
"step": 18750
},
{
"epoch": 5.477855477855478,
"grad_norm": 0.32356569170951843,
"learning_rate": 0.0005345807580174926,
"loss": 3.5259,
"step": 18800
},
{
"epoch": 5.492424242424242,
"grad_norm": 0.32452526688575745,
"learning_rate": 0.0005344058309037901,
"loss": 3.5419,
"step": 18850
},
{
"epoch": 5.506993006993007,
"grad_norm": 0.3109516501426697,
"learning_rate": 0.0005342309037900875,
"loss": 3.523,
"step": 18900
},
{
"epoch": 5.521561771561771,
"grad_norm": 0.32956892251968384,
"learning_rate": 0.0005340559766763848,
"loss": 3.5346,
"step": 18950
},
{
"epoch": 5.536130536130536,
"grad_norm": 0.347649484872818,
"learning_rate": 0.0005338810495626821,
"loss": 3.5148,
"step": 19000
},
{
"epoch": 5.536130536130536,
"eval_accuracy": 0.3614395097307616,
"eval_loss": 3.624124526977539,
"eval_runtime": 180.5076,
"eval_samples_per_second": 92.196,
"eval_steps_per_second": 5.767,
"step": 19000
},
{
"epoch": 5.550699300699301,
"grad_norm": 0.34442394971847534,
"learning_rate": 0.0005337061224489796,
"loss": 3.508,
"step": 19050
},
{
"epoch": 5.565268065268065,
"grad_norm": 0.3646959960460663,
"learning_rate": 0.0005335311953352769,
"loss": 3.5272,
"step": 19100
},
{
"epoch": 5.57983682983683,
"grad_norm": 0.34306755661964417,
"learning_rate": 0.0005333562682215743,
"loss": 3.5253,
"step": 19150
},
{
"epoch": 5.594405594405594,
"grad_norm": 0.34549543261528015,
"learning_rate": 0.0005331813411078716,
"loss": 3.5349,
"step": 19200
},
{
"epoch": 5.608974358974359,
"grad_norm": 0.3486803472042084,
"learning_rate": 0.0005330064139941691,
"loss": 3.518,
"step": 19250
},
{
"epoch": 5.623543123543124,
"grad_norm": 0.3553147315979004,
"learning_rate": 0.0005328314868804665,
"loss": 3.5229,
"step": 19300
},
{
"epoch": 5.638111888111888,
"grad_norm": 0.3389810025691986,
"learning_rate": 0.0005326565597667638,
"loss": 3.5184,
"step": 19350
},
{
"epoch": 5.652680652680653,
"grad_norm": 0.3389154076576233,
"learning_rate": 0.0005324816326530612,
"loss": 3.5242,
"step": 19400
},
{
"epoch": 5.667249417249417,
"grad_norm": 0.31988218426704407,
"learning_rate": 0.0005323067055393586,
"loss": 3.5365,
"step": 19450
},
{
"epoch": 5.681818181818182,
"grad_norm": 0.32239192724227905,
"learning_rate": 0.0005321317784256559,
"loss": 3.5347,
"step": 19500
},
{
"epoch": 5.696386946386946,
"grad_norm": 0.3520359694957733,
"learning_rate": 0.0005319568513119533,
"loss": 3.5332,
"step": 19550
},
{
"epoch": 5.7109557109557105,
"grad_norm": 0.3352511525154114,
"learning_rate": 0.0005317819241982506,
"loss": 3.534,
"step": 19600
},
{
"epoch": 5.725524475524476,
"grad_norm": 0.3281591236591339,
"learning_rate": 0.0005316069970845481,
"loss": 3.5274,
"step": 19650
},
{
"epoch": 5.74009324009324,
"grad_norm": 0.33789217472076416,
"learning_rate": 0.0005314320699708454,
"loss": 3.5266,
"step": 19700
},
{
"epoch": 5.754662004662005,
"grad_norm": 0.34207120537757874,
"learning_rate": 0.0005312571428571428,
"loss": 3.5315,
"step": 19750
},
{
"epoch": 5.769230769230769,
"grad_norm": 0.351068913936615,
"learning_rate": 0.0005310822157434403,
"loss": 3.5341,
"step": 19800
},
{
"epoch": 5.783799533799534,
"grad_norm": 0.3352493643760681,
"learning_rate": 0.0005309072886297376,
"loss": 3.53,
"step": 19850
},
{
"epoch": 5.798368298368298,
"grad_norm": 0.327741801738739,
"learning_rate": 0.0005307323615160349,
"loss": 3.5304,
"step": 19900
},
{
"epoch": 5.812937062937063,
"grad_norm": 0.32836633920669556,
"learning_rate": 0.0005305574344023323,
"loss": 3.5286,
"step": 19950
},
{
"epoch": 5.827505827505828,
"grad_norm": 0.3504875600337982,
"learning_rate": 0.0005303825072886296,
"loss": 3.5384,
"step": 20000
},
{
"epoch": 5.827505827505828,
"eval_accuracy": 0.3625647367105273,
"eval_loss": 3.612804412841797,
"eval_runtime": 180.4713,
"eval_samples_per_second": 92.214,
"eval_steps_per_second": 5.768,
"step": 20000
},
{
"epoch": 5.842074592074592,
"grad_norm": 0.3540632426738739,
"learning_rate": 0.0005302075801749271,
"loss": 3.5366,
"step": 20050
},
{
"epoch": 5.856643356643357,
"grad_norm": 0.34035322070121765,
"learning_rate": 0.0005300326530612244,
"loss": 3.5303,
"step": 20100
},
{
"epoch": 5.871212121212121,
"grad_norm": 0.31729087233543396,
"learning_rate": 0.0005298577259475218,
"loss": 3.5335,
"step": 20150
},
{
"epoch": 5.8857808857808855,
"grad_norm": 0.3735673427581787,
"learning_rate": 0.0005296827988338193,
"loss": 3.5277,
"step": 20200
},
{
"epoch": 5.90034965034965,
"grad_norm": 0.314452201128006,
"learning_rate": 0.0005295078717201166,
"loss": 3.5406,
"step": 20250
},
{
"epoch": 5.914918414918415,
"grad_norm": 0.3204086422920227,
"learning_rate": 0.000529332944606414,
"loss": 3.5359,
"step": 20300
},
{
"epoch": 5.92948717948718,
"grad_norm": 0.3485746681690216,
"learning_rate": 0.0005291580174927113,
"loss": 3.5245,
"step": 20350
},
{
"epoch": 5.944055944055944,
"grad_norm": 0.34968072175979614,
"learning_rate": 0.0005289830903790087,
"loss": 3.54,
"step": 20400
},
{
"epoch": 5.958624708624709,
"grad_norm": 0.3806632161140442,
"learning_rate": 0.0005288081632653061,
"loss": 3.525,
"step": 20450
},
{
"epoch": 5.973193473193473,
"grad_norm": 0.3304056227207184,
"learning_rate": 0.0005286332361516034,
"loss": 3.5232,
"step": 20500
},
{
"epoch": 5.9877622377622375,
"grad_norm": 0.33363205194473267,
"learning_rate": 0.0005284583090379008,
"loss": 3.5174,
"step": 20550
},
{
"epoch": 6.002331002331002,
"grad_norm": 0.3507980704307556,
"learning_rate": 0.0005282833819241983,
"loss": 3.5095,
"step": 20600
},
{
"epoch": 6.016899766899767,
"grad_norm": 0.3389154374599457,
"learning_rate": 0.0005281084548104956,
"loss": 3.4025,
"step": 20650
},
{
"epoch": 6.031468531468532,
"grad_norm": 0.33325284719467163,
"learning_rate": 0.000527933527696793,
"loss": 3.4253,
"step": 20700
},
{
"epoch": 6.046037296037296,
"grad_norm": 0.34633567929267883,
"learning_rate": 0.0005277586005830903,
"loss": 3.4288,
"step": 20750
},
{
"epoch": 6.0606060606060606,
"grad_norm": 0.33911773562431335,
"learning_rate": 0.0005275836734693877,
"loss": 3.4302,
"step": 20800
},
{
"epoch": 6.075174825174825,
"grad_norm": 0.3277522027492523,
"learning_rate": 0.0005274087463556851,
"loss": 3.4381,
"step": 20850
},
{
"epoch": 6.089743589743589,
"grad_norm": 0.3419731855392456,
"learning_rate": 0.0005272338192419824,
"loss": 3.4431,
"step": 20900
},
{
"epoch": 6.104312354312355,
"grad_norm": 0.35028308629989624,
"learning_rate": 0.0005270588921282798,
"loss": 3.4435,
"step": 20950
},
{
"epoch": 6.118881118881119,
"grad_norm": 0.3204551339149475,
"learning_rate": 0.0005268839650145772,
"loss": 3.4338,
"step": 21000
},
{
"epoch": 6.118881118881119,
"eval_accuracy": 0.3627538228202005,
"eval_loss": 3.615595579147339,
"eval_runtime": 180.6199,
"eval_samples_per_second": 92.138,
"eval_steps_per_second": 5.763,
"step": 21000
},
{
"epoch": 6.133449883449884,
"grad_norm": 0.3347219228744507,
"learning_rate": 0.0005267090379008746,
"loss": 3.4486,
"step": 21050
},
{
"epoch": 6.148018648018648,
"grad_norm": 0.3284785747528076,
"learning_rate": 0.000526534110787172,
"loss": 3.4548,
"step": 21100
},
{
"epoch": 6.1625874125874125,
"grad_norm": 0.33264586329460144,
"learning_rate": 0.0005263591836734693,
"loss": 3.4476,
"step": 21150
},
{
"epoch": 6.177156177156177,
"grad_norm": 0.3285725712776184,
"learning_rate": 0.0005261842565597668,
"loss": 3.4675,
"step": 21200
},
{
"epoch": 6.191724941724941,
"grad_norm": 0.3390142321586609,
"learning_rate": 0.0005260093294460641,
"loss": 3.455,
"step": 21250
},
{
"epoch": 6.206293706293707,
"grad_norm": 0.33934858441352844,
"learning_rate": 0.0005258344023323614,
"loss": 3.4463,
"step": 21300
},
{
"epoch": 6.220862470862471,
"grad_norm": 0.3672083914279938,
"learning_rate": 0.0005256594752186588,
"loss": 3.4512,
"step": 21350
},
{
"epoch": 6.235431235431236,
"grad_norm": 0.3115769624710083,
"learning_rate": 0.0005254845481049562,
"loss": 3.4634,
"step": 21400
},
{
"epoch": 6.25,
"grad_norm": 0.32785558700561523,
"learning_rate": 0.0005253096209912536,
"loss": 3.4688,
"step": 21450
},
{
"epoch": 6.264568764568764,
"grad_norm": 0.3327209949493408,
"learning_rate": 0.000525134693877551,
"loss": 3.4517,
"step": 21500
},
{
"epoch": 6.279137529137529,
"grad_norm": 0.34631094336509705,
"learning_rate": 0.0005249597667638484,
"loss": 3.4574,
"step": 21550
},
{
"epoch": 6.293706293706293,
"grad_norm": 0.3532359004020691,
"learning_rate": 0.0005247848396501458,
"loss": 3.4656,
"step": 21600
},
{
"epoch": 6.308275058275059,
"grad_norm": 0.36950933933258057,
"learning_rate": 0.0005246099125364431,
"loss": 3.4769,
"step": 21650
},
{
"epoch": 6.322843822843823,
"grad_norm": 0.336834579706192,
"learning_rate": 0.0005244349854227404,
"loss": 3.4637,
"step": 21700
},
{
"epoch": 6.3374125874125875,
"grad_norm": 0.30184629559516907,
"learning_rate": 0.0005242600583090379,
"loss": 3.4716,
"step": 21750
},
{
"epoch": 6.351981351981352,
"grad_norm": 0.34009432792663574,
"learning_rate": 0.0005240851311953352,
"loss": 3.4698,
"step": 21800
},
{
"epoch": 6.366550116550116,
"grad_norm": 0.32678115367889404,
"learning_rate": 0.0005239102040816326,
"loss": 3.4706,
"step": 21850
},
{
"epoch": 6.381118881118881,
"grad_norm": 0.34370940923690796,
"learning_rate": 0.00052373527696793,
"loss": 3.4649,
"step": 21900
},
{
"epoch": 6.395687645687646,
"grad_norm": 0.31767651438713074,
"learning_rate": 0.0005235603498542274,
"loss": 3.4903,
"step": 21950
},
{
"epoch": 6.410256410256411,
"grad_norm": 0.35483428835868835,
"learning_rate": 0.0005233854227405248,
"loss": 3.4762,
"step": 22000
},
{
"epoch": 6.410256410256411,
"eval_accuracy": 0.3631498688509091,
"eval_loss": 3.6074860095977783,
"eval_runtime": 180.0487,
"eval_samples_per_second": 92.431,
"eval_steps_per_second": 5.782,
"step": 22000
},
{
"epoch": 6.424825174825175,
"grad_norm": 0.31931906938552856,
"learning_rate": 0.0005232104956268221,
"loss": 3.4758,
"step": 22050
},
{
"epoch": 6.4393939393939394,
"grad_norm": 0.3227771818637848,
"learning_rate": 0.0005230355685131195,
"loss": 3.4678,
"step": 22100
},
{
"epoch": 6.453962703962704,
"grad_norm": 0.35156136751174927,
"learning_rate": 0.0005228606413994169,
"loss": 3.4803,
"step": 22150
},
{
"epoch": 6.468531468531468,
"grad_norm": 0.33394086360931396,
"learning_rate": 0.0005226857142857142,
"loss": 3.471,
"step": 22200
},
{
"epoch": 6.483100233100233,
"grad_norm": 0.3395681381225586,
"learning_rate": 0.0005225107871720116,
"loss": 3.4759,
"step": 22250
},
{
"epoch": 6.497668997668998,
"grad_norm": 0.32322457432746887,
"learning_rate": 0.0005223358600583089,
"loss": 3.48,
"step": 22300
},
{
"epoch": 6.5122377622377625,
"grad_norm": 0.32809075713157654,
"learning_rate": 0.0005221609329446064,
"loss": 3.4774,
"step": 22350
},
{
"epoch": 6.526806526806527,
"grad_norm": 0.32868528366088867,
"learning_rate": 0.0005219860058309038,
"loss": 3.4811,
"step": 22400
},
{
"epoch": 6.541375291375291,
"grad_norm": 0.33489176630973816,
"learning_rate": 0.0005218110787172011,
"loss": 3.4916,
"step": 22450
},
{
"epoch": 6.555944055944056,
"grad_norm": 0.3436543941497803,
"learning_rate": 0.0005216361516034985,
"loss": 3.4859,
"step": 22500
},
{
"epoch": 6.57051282051282,
"grad_norm": 0.3015133738517761,
"learning_rate": 0.0005214612244897959,
"loss": 3.4779,
"step": 22550
},
{
"epoch": 6.585081585081585,
"grad_norm": 0.3797510862350464,
"learning_rate": 0.0005212862973760932,
"loss": 3.4846,
"step": 22600
},
{
"epoch": 6.59965034965035,
"grad_norm": 0.327371209859848,
"learning_rate": 0.0005211113702623906,
"loss": 3.4941,
"step": 22650
},
{
"epoch": 6.6142191142191145,
"grad_norm": 0.3728986084461212,
"learning_rate": 0.0005209364431486879,
"loss": 3.4986,
"step": 22700
},
{
"epoch": 6.628787878787879,
"grad_norm": 0.3234831988811493,
"learning_rate": 0.0005207615160349854,
"loss": 3.4824,
"step": 22750
},
{
"epoch": 6.643356643356643,
"grad_norm": 0.3303401470184326,
"learning_rate": 0.0005205865889212828,
"loss": 3.4857,
"step": 22800
},
{
"epoch": 6.657925407925408,
"grad_norm": 0.3562447726726532,
"learning_rate": 0.0005204116618075801,
"loss": 3.4825,
"step": 22850
},
{
"epoch": 6.672494172494172,
"grad_norm": 0.3363456428050995,
"learning_rate": 0.0005202367346938776,
"loss": 3.4786,
"step": 22900
},
{
"epoch": 6.687062937062937,
"grad_norm": 0.337936669588089,
"learning_rate": 0.0005200618075801749,
"loss": 3.4894,
"step": 22950
},
{
"epoch": 6.701631701631702,
"grad_norm": 0.34164348244667053,
"learning_rate": 0.0005198868804664723,
"loss": 3.4815,
"step": 23000
},
{
"epoch": 6.701631701631702,
"eval_accuracy": 0.3637440554878363,
"eval_loss": 3.6008543968200684,
"eval_runtime": 180.4988,
"eval_samples_per_second": 92.2,
"eval_steps_per_second": 5.767,
"step": 23000
},
{
"epoch": 6.716200466200466,
"grad_norm": 0.3702085018157959,
"learning_rate": 0.0005197119533527696,
"loss": 3.4993,
"step": 23050
},
{
"epoch": 6.730769230769231,
"grad_norm": 0.33993563055992126,
"learning_rate": 0.000519537026239067,
"loss": 3.4772,
"step": 23100
},
{
"epoch": 6.745337995337995,
"grad_norm": 0.33401525020599365,
"learning_rate": 0.0005193620991253644,
"loss": 3.4976,
"step": 23150
},
{
"epoch": 6.75990675990676,
"grad_norm": 0.37840354442596436,
"learning_rate": 0.0005191871720116618,
"loss": 3.4828,
"step": 23200
},
{
"epoch": 6.774475524475524,
"grad_norm": 0.3243924379348755,
"learning_rate": 0.0005190122448979591,
"loss": 3.4938,
"step": 23250
},
{
"epoch": 6.7890442890442895,
"grad_norm": 0.3309505581855774,
"learning_rate": 0.0005188373177842566,
"loss": 3.4723,
"step": 23300
},
{
"epoch": 6.803613053613054,
"grad_norm": 0.35153377056121826,
"learning_rate": 0.0005186623906705539,
"loss": 3.4872,
"step": 23350
},
{
"epoch": 6.818181818181818,
"grad_norm": 0.3381296396255493,
"learning_rate": 0.0005184874635568513,
"loss": 3.4899,
"step": 23400
},
{
"epoch": 6.832750582750583,
"grad_norm": 0.3551500737667084,
"learning_rate": 0.0005183125364431486,
"loss": 3.4895,
"step": 23450
},
{
"epoch": 6.847319347319347,
"grad_norm": 0.33850058913230896,
"learning_rate": 0.000518137609329446,
"loss": 3.4793,
"step": 23500
},
{
"epoch": 6.861888111888112,
"grad_norm": 0.3279431164264679,
"learning_rate": 0.0005179626822157434,
"loss": 3.4967,
"step": 23550
},
{
"epoch": 6.876456876456876,
"grad_norm": 0.3145736753940582,
"learning_rate": 0.0005177877551020407,
"loss": 3.5046,
"step": 23600
},
{
"epoch": 6.891025641025641,
"grad_norm": 0.3533722162246704,
"learning_rate": 0.0005176128279883381,
"loss": 3.4892,
"step": 23650
},
{
"epoch": 6.905594405594406,
"grad_norm": 0.3434518575668335,
"learning_rate": 0.0005174379008746356,
"loss": 3.4818,
"step": 23700
},
{
"epoch": 6.92016317016317,
"grad_norm": 0.30422964692115784,
"learning_rate": 0.0005172629737609329,
"loss": 3.4961,
"step": 23750
},
{
"epoch": 6.934731934731935,
"grad_norm": 0.34872138500213623,
"learning_rate": 0.0005170880466472303,
"loss": 3.4941,
"step": 23800
},
{
"epoch": 6.949300699300699,
"grad_norm": 0.3359842598438263,
"learning_rate": 0.0005169131195335276,
"loss": 3.4905,
"step": 23850
},
{
"epoch": 6.963869463869464,
"grad_norm": 0.3362923264503479,
"learning_rate": 0.0005167381924198251,
"loss": 3.4967,
"step": 23900
},
{
"epoch": 6.978438228438229,
"grad_norm": 0.33967387676239014,
"learning_rate": 0.0005165632653061224,
"loss": 3.4997,
"step": 23950
},
{
"epoch": 6.993006993006993,
"grad_norm": 0.326475590467453,
"learning_rate": 0.0005163883381924197,
"loss": 3.4942,
"step": 24000
},
{
"epoch": 6.993006993006993,
"eval_accuracy": 0.36491220313304396,
"eval_loss": 3.5894508361816406,
"eval_runtime": 180.3515,
"eval_samples_per_second": 92.275,
"eval_steps_per_second": 5.772,
"step": 24000
},
{
"epoch": 7.007575757575758,
"grad_norm": 0.35610419511795044,
"learning_rate": 0.0005162134110787171,
"loss": 3.4349,
"step": 24050
},
{
"epoch": 7.022144522144522,
"grad_norm": 0.3531475067138672,
"learning_rate": 0.0005160384839650146,
"loss": 3.3823,
"step": 24100
},
{
"epoch": 7.036713286713287,
"grad_norm": 0.3476791977882385,
"learning_rate": 0.0005158635568513119,
"loss": 3.4016,
"step": 24150
},
{
"epoch": 7.051282051282051,
"grad_norm": 0.35229551792144775,
"learning_rate": 0.0005156886297376093,
"loss": 3.3868,
"step": 24200
},
{
"epoch": 7.0658508158508155,
"grad_norm": 0.3391362428665161,
"learning_rate": 0.0005155137026239066,
"loss": 3.3928,
"step": 24250
},
{
"epoch": 7.08041958041958,
"grad_norm": 0.34460726380348206,
"learning_rate": 0.0005153387755102041,
"loss": 3.3913,
"step": 24300
},
{
"epoch": 7.094988344988345,
"grad_norm": 0.35683906078338623,
"learning_rate": 0.0005151638483965014,
"loss": 3.3972,
"step": 24350
},
{
"epoch": 7.10955710955711,
"grad_norm": 0.3500906825065613,
"learning_rate": 0.0005149889212827987,
"loss": 3.4121,
"step": 24400
},
{
"epoch": 7.124125874125874,
"grad_norm": 0.32340437173843384,
"learning_rate": 0.0005148139941690961,
"loss": 3.4042,
"step": 24450
},
{
"epoch": 7.138694638694639,
"grad_norm": 0.36307796835899353,
"learning_rate": 0.0005146390670553936,
"loss": 3.4152,
"step": 24500
},
{
"epoch": 7.153263403263403,
"grad_norm": 0.35622280836105347,
"learning_rate": 0.0005144641399416909,
"loss": 3.4038,
"step": 24550
},
{
"epoch": 7.1678321678321675,
"grad_norm": 0.34201404452323914,
"learning_rate": 0.0005142892128279883,
"loss": 3.413,
"step": 24600
},
{
"epoch": 7.182400932400933,
"grad_norm": 0.3477611243724823,
"learning_rate": 0.0005141142857142856,
"loss": 3.4147,
"step": 24650
},
{
"epoch": 7.196969696969697,
"grad_norm": 0.3193877339363098,
"learning_rate": 0.0005139393586005831,
"loss": 3.4349,
"step": 24700
},
{
"epoch": 7.211538461538462,
"grad_norm": 0.3370342254638672,
"learning_rate": 0.0005137644314868804,
"loss": 3.4269,
"step": 24750
},
{
"epoch": 7.226107226107226,
"grad_norm": 0.35344481468200684,
"learning_rate": 0.0005135895043731778,
"loss": 3.4046,
"step": 24800
},
{
"epoch": 7.2406759906759905,
"grad_norm": 0.3530924320220947,
"learning_rate": 0.0005134145772594752,
"loss": 3.4115,
"step": 24850
},
{
"epoch": 7.255244755244755,
"grad_norm": 0.3493140935897827,
"learning_rate": 0.0005132396501457726,
"loss": 3.4237,
"step": 24900
},
{
"epoch": 7.269813519813519,
"grad_norm": 0.33685219287872314,
"learning_rate": 0.0005130647230320699,
"loss": 3.4313,
"step": 24950
},
{
"epoch": 7.284382284382285,
"grad_norm": 0.3504573702812195,
"learning_rate": 0.0005128897959183673,
"loss": 3.4237,
"step": 25000
},
{
"epoch": 7.284382284382285,
"eval_accuracy": 0.3649218455839104,
"eval_loss": 3.599851369857788,
"eval_runtime": 180.4257,
"eval_samples_per_second": 92.237,
"eval_steps_per_second": 5.77,
"step": 25000
},
{
"epoch": 7.298951048951049,
"grad_norm": 0.34710603952407837,
"learning_rate": 0.0005127148688046647,
"loss": 3.4347,
"step": 25050
},
{
"epoch": 7.313519813519814,
"grad_norm": 0.3456078767776489,
"learning_rate": 0.0005125399416909621,
"loss": 3.4325,
"step": 25100
},
{
"epoch": 7.328088578088578,
"grad_norm": 0.36139947175979614,
"learning_rate": 0.0005123650145772594,
"loss": 3.4531,
"step": 25150
},
{
"epoch": 7.3426573426573425,
"grad_norm": 0.3331305980682373,
"learning_rate": 0.0005121900874635568,
"loss": 3.4372,
"step": 25200
},
{
"epoch": 7.357226107226107,
"grad_norm": 0.3419002294540405,
"learning_rate": 0.0005120151603498543,
"loss": 3.4222,
"step": 25250
},
{
"epoch": 7.371794871794872,
"grad_norm": 0.37077078223228455,
"learning_rate": 0.0005118402332361515,
"loss": 3.438,
"step": 25300
},
{
"epoch": 7.386363636363637,
"grad_norm": 0.37061864137649536,
"learning_rate": 0.0005116653061224489,
"loss": 3.4384,
"step": 25350
},
{
"epoch": 7.400932400932401,
"grad_norm": 0.33451831340789795,
"learning_rate": 0.0005114903790087463,
"loss": 3.4323,
"step": 25400
},
{
"epoch": 7.415501165501166,
"grad_norm": 0.36487630009651184,
"learning_rate": 0.0005113154518950437,
"loss": 3.4338,
"step": 25450
},
{
"epoch": 7.43006993006993,
"grad_norm": 0.34303170442581177,
"learning_rate": 0.0005111405247813411,
"loss": 3.446,
"step": 25500
},
{
"epoch": 7.444638694638694,
"grad_norm": 0.3491624593734741,
"learning_rate": 0.0005109655976676384,
"loss": 3.4407,
"step": 25550
},
{
"epoch": 7.459207459207459,
"grad_norm": 0.3570358455181122,
"learning_rate": 0.0005107906705539358,
"loss": 3.4499,
"step": 25600
},
{
"epoch": 7.473776223776224,
"grad_norm": 0.3398280739784241,
"learning_rate": 0.0005106157434402332,
"loss": 3.438,
"step": 25650
},
{
"epoch": 7.488344988344989,
"grad_norm": 0.3448866307735443,
"learning_rate": 0.0005104408163265306,
"loss": 3.4396,
"step": 25700
},
{
"epoch": 7.502913752913753,
"grad_norm": 0.35469329357147217,
"learning_rate": 0.0005102658892128279,
"loss": 3.4361,
"step": 25750
},
{
"epoch": 7.5174825174825175,
"grad_norm": 0.35180532932281494,
"learning_rate": 0.0005100909620991253,
"loss": 3.4589,
"step": 25800
},
{
"epoch": 7.532051282051282,
"grad_norm": 0.3383461833000183,
"learning_rate": 0.0005099160349854227,
"loss": 3.446,
"step": 25850
},
{
"epoch": 7.546620046620046,
"grad_norm": 0.35350677371025085,
"learning_rate": 0.0005097411078717201,
"loss": 3.4507,
"step": 25900
},
{
"epoch": 7.561188811188811,
"grad_norm": 0.3186721205711365,
"learning_rate": 0.0005095661807580174,
"loss": 3.4341,
"step": 25950
},
{
"epoch": 7.575757575757576,
"grad_norm": 0.3171408474445343,
"learning_rate": 0.0005093912536443149,
"loss": 3.4501,
"step": 26000
},
{
"epoch": 7.575757575757576,
"eval_accuracy": 0.3657292244576768,
"eval_loss": 3.5900423526763916,
"eval_runtime": 180.2145,
"eval_samples_per_second": 92.346,
"eval_steps_per_second": 5.776,
"step": 26000
},
{
"epoch": 7.590326340326341,
"grad_norm": 0.35610276460647583,
"learning_rate": 0.0005092163265306122,
"loss": 3.4641,
"step": 26050
},
{
"epoch": 7.604895104895105,
"grad_norm": 0.37525665760040283,
"learning_rate": 0.0005090413994169096,
"loss": 3.4647,
"step": 26100
},
{
"epoch": 7.619463869463869,
"grad_norm": 0.33461683988571167,
"learning_rate": 0.000508866472303207,
"loss": 3.4597,
"step": 26150
},
{
"epoch": 7.634032634032634,
"grad_norm": 0.3235708773136139,
"learning_rate": 0.0005086915451895044,
"loss": 3.4497,
"step": 26200
},
{
"epoch": 7.648601398601398,
"grad_norm": 0.34801435470581055,
"learning_rate": 0.0005085166180758017,
"loss": 3.4504,
"step": 26250
},
{
"epoch": 7.663170163170163,
"grad_norm": 0.32955285906791687,
"learning_rate": 0.0005083416909620991,
"loss": 3.449,
"step": 26300
},
{
"epoch": 7.677738927738928,
"grad_norm": 0.3284403383731842,
"learning_rate": 0.0005081667638483964,
"loss": 3.4546,
"step": 26350
},
{
"epoch": 7.6923076923076925,
"grad_norm": 0.32493704557418823,
"learning_rate": 0.0005079918367346939,
"loss": 3.4384,
"step": 26400
},
{
"epoch": 7.706876456876457,
"grad_norm": 0.34628820419311523,
"learning_rate": 0.0005078169096209912,
"loss": 3.4534,
"step": 26450
},
{
"epoch": 7.721445221445221,
"grad_norm": 0.33644041419029236,
"learning_rate": 0.0005076419825072886,
"loss": 3.4552,
"step": 26500
},
{
"epoch": 7.736013986013986,
"grad_norm": 0.34094300866127014,
"learning_rate": 0.000507467055393586,
"loss": 3.4536,
"step": 26550
},
{
"epoch": 7.75058275058275,
"grad_norm": 0.34397369623184204,
"learning_rate": 0.0005072921282798834,
"loss": 3.4635,
"step": 26600
},
{
"epoch": 7.765151515151516,
"grad_norm": 0.3402233123779297,
"learning_rate": 0.0005071172011661807,
"loss": 3.4708,
"step": 26650
},
{
"epoch": 7.77972027972028,
"grad_norm": 0.3712950050830841,
"learning_rate": 0.0005069422740524781,
"loss": 3.4713,
"step": 26700
},
{
"epoch": 7.7942890442890445,
"grad_norm": 0.3284025490283966,
"learning_rate": 0.0005067673469387754,
"loss": 3.4556,
"step": 26750
},
{
"epoch": 7.808857808857809,
"grad_norm": 0.34438565373420715,
"learning_rate": 0.0005065924198250729,
"loss": 3.4744,
"step": 26800
},
{
"epoch": 7.823426573426573,
"grad_norm": 0.33172059059143066,
"learning_rate": 0.0005064174927113702,
"loss": 3.4534,
"step": 26850
},
{
"epoch": 7.837995337995338,
"grad_norm": 0.3375876843929291,
"learning_rate": 0.0005062425655976676,
"loss": 3.4484,
"step": 26900
},
{
"epoch": 7.852564102564102,
"grad_norm": 0.3456272780895233,
"learning_rate": 0.0005060676384839649,
"loss": 3.4474,
"step": 26950
},
{
"epoch": 7.867132867132867,
"grad_norm": 0.3476708233356476,
"learning_rate": 0.0005058927113702624,
"loss": 3.4529,
"step": 27000
},
{
"epoch": 7.867132867132867,
"eval_accuracy": 0.36633469981756955,
"eval_loss": 3.5783565044403076,
"eval_runtime": 182.5033,
"eval_samples_per_second": 91.187,
"eval_steps_per_second": 5.704,
"step": 27000
},
{
"epoch": 7.881701631701632,
"grad_norm": 0.357236385345459,
"learning_rate": 0.0005057177842565598,
"loss": 3.4582,
"step": 27050
},
{
"epoch": 7.896270396270396,
"grad_norm": 0.3404090404510498,
"learning_rate": 0.0005055428571428571,
"loss": 3.4633,
"step": 27100
},
{
"epoch": 7.910839160839161,
"grad_norm": 0.341049462556839,
"learning_rate": 0.0005053679300291544,
"loss": 3.4626,
"step": 27150
},
{
"epoch": 7.925407925407925,
"grad_norm": 0.321346640586853,
"learning_rate": 0.0005051930029154519,
"loss": 3.4529,
"step": 27200
},
{
"epoch": 7.93997668997669,
"grad_norm": 0.31583067774772644,
"learning_rate": 0.0005050180758017492,
"loss": 3.4681,
"step": 27250
},
{
"epoch": 7.954545454545455,
"grad_norm": 0.36198437213897705,
"learning_rate": 0.0005048431486880466,
"loss": 3.452,
"step": 27300
},
{
"epoch": 7.9691142191142195,
"grad_norm": 0.34580230712890625,
"learning_rate": 0.0005046682215743439,
"loss": 3.4541,
"step": 27350
},
{
"epoch": 7.983682983682984,
"grad_norm": 0.3525956869125366,
"learning_rate": 0.0005044932944606414,
"loss": 3.4677,
"step": 27400
},
{
"epoch": 7.998251748251748,
"grad_norm": 0.312714546918869,
"learning_rate": 0.0005043183673469388,
"loss": 3.453,
"step": 27450
},
{
"epoch": 8.012820512820513,
"grad_norm": 0.3349739909172058,
"learning_rate": 0.0005041434402332361,
"loss": 3.3605,
"step": 27500
},
{
"epoch": 8.027389277389277,
"grad_norm": 0.35149267315864563,
"learning_rate": 0.0005039685131195334,
"loss": 3.3568,
"step": 27550
},
{
"epoch": 8.041958041958042,
"grad_norm": 0.35762932896614075,
"learning_rate": 0.0005037935860058309,
"loss": 3.3594,
"step": 27600
},
{
"epoch": 8.056526806526806,
"grad_norm": 0.3227376341819763,
"learning_rate": 0.0005036186588921282,
"loss": 3.3618,
"step": 27650
},
{
"epoch": 8.07109557109557,
"grad_norm": 0.3321167826652527,
"learning_rate": 0.0005034437317784256,
"loss": 3.3757,
"step": 27700
},
{
"epoch": 8.085664335664335,
"grad_norm": 0.34823182225227356,
"learning_rate": 0.000503268804664723,
"loss": 3.368,
"step": 27750
},
{
"epoch": 8.1002331002331,
"grad_norm": 0.3144749701023102,
"learning_rate": 0.0005030938775510204,
"loss": 3.3762,
"step": 27800
},
{
"epoch": 8.114801864801866,
"grad_norm": 0.33065780997276306,
"learning_rate": 0.0005029189504373178,
"loss": 3.3705,
"step": 27850
},
{
"epoch": 8.12937062937063,
"grad_norm": 0.3569163382053375,
"learning_rate": 0.0005027440233236151,
"loss": 3.3917,
"step": 27900
},
{
"epoch": 8.143939393939394,
"grad_norm": 0.336088091135025,
"learning_rate": 0.0005025690962099126,
"loss": 3.387,
"step": 27950
},
{
"epoch": 8.158508158508159,
"grad_norm": 0.31934666633605957,
"learning_rate": 0.0005023941690962099,
"loss": 3.3725,
"step": 28000
},
{
"epoch": 8.158508158508159,
"eval_accuracy": 0.3663358757262118,
"eval_loss": 3.590759754180908,
"eval_runtime": 182.8908,
"eval_samples_per_second": 90.994,
"eval_steps_per_second": 5.692,
"step": 28000
},
{
"epoch": 8.173076923076923,
"grad_norm": 0.36414483189582825,
"learning_rate": 0.0005022192419825072,
"loss": 3.3922,
"step": 28050
},
{
"epoch": 8.187645687645688,
"grad_norm": 0.3432634472846985,
"learning_rate": 0.0005020443148688046,
"loss": 3.3869,
"step": 28100
},
{
"epoch": 8.202214452214452,
"grad_norm": 0.34101763367652893,
"learning_rate": 0.000501869387755102,
"loss": 3.3947,
"step": 28150
},
{
"epoch": 8.216783216783217,
"grad_norm": 0.36927416920661926,
"learning_rate": 0.0005016944606413994,
"loss": 3.3831,
"step": 28200
},
{
"epoch": 8.231351981351981,
"grad_norm": 0.3365326523780823,
"learning_rate": 0.0005015195335276967,
"loss": 3.3901,
"step": 28250
},
{
"epoch": 8.245920745920746,
"grad_norm": 0.36989808082580566,
"learning_rate": 0.0005013446064139941,
"loss": 3.3948,
"step": 28300
},
{
"epoch": 8.26048951048951,
"grad_norm": 0.3689005374908447,
"learning_rate": 0.0005011696793002916,
"loss": 3.3917,
"step": 28350
},
{
"epoch": 8.275058275058274,
"grad_norm": 0.3656901717185974,
"learning_rate": 0.0005009947521865889,
"loss": 3.3954,
"step": 28400
},
{
"epoch": 8.289627039627039,
"grad_norm": 0.3664736747741699,
"learning_rate": 0.0005008198250728862,
"loss": 3.4009,
"step": 28450
},
{
"epoch": 8.304195804195805,
"grad_norm": 0.3412057161331177,
"learning_rate": 0.0005006448979591836,
"loss": 3.3916,
"step": 28500
},
{
"epoch": 8.31876456876457,
"grad_norm": 0.37343958020210266,
"learning_rate": 0.000500469970845481,
"loss": 3.3981,
"step": 28550
},
{
"epoch": 8.333333333333334,
"grad_norm": 0.40472412109375,
"learning_rate": 0.0005002950437317784,
"loss": 3.4063,
"step": 28600
},
{
"epoch": 8.347902097902098,
"grad_norm": 0.33591440320014954,
"learning_rate": 0.0005001201166180757,
"loss": 3.3988,
"step": 28650
},
{
"epoch": 8.362470862470863,
"grad_norm": 0.3387737572193146,
"learning_rate": 0.0004999451895043731,
"loss": 3.4103,
"step": 28700
},
{
"epoch": 8.377039627039627,
"grad_norm": 0.3714272975921631,
"learning_rate": 0.0004997702623906706,
"loss": 3.4127,
"step": 28750
},
{
"epoch": 8.391608391608392,
"grad_norm": 0.34964922070503235,
"learning_rate": 0.0004995953352769679,
"loss": 3.409,
"step": 28800
},
{
"epoch": 8.406177156177156,
"grad_norm": 0.34254536032676697,
"learning_rate": 0.0004994204081632653,
"loss": 3.4076,
"step": 28850
},
{
"epoch": 8.42074592074592,
"grad_norm": 0.34269341826438904,
"learning_rate": 0.0004992454810495626,
"loss": 3.3974,
"step": 28900
},
{
"epoch": 8.435314685314685,
"grad_norm": 0.32962408661842346,
"learning_rate": 0.00049907055393586,
"loss": 3.3982,
"step": 28950
},
{
"epoch": 8.44988344988345,
"grad_norm": 0.37458404898643494,
"learning_rate": 0.0004988956268221574,
"loss": 3.4032,
"step": 29000
},
{
"epoch": 8.44988344988345,
"eval_accuracy": 0.3665615325946589,
"eval_loss": 3.5825419425964355,
"eval_runtime": 180.8696,
"eval_samples_per_second": 92.011,
"eval_steps_per_second": 5.756,
"step": 29000
},
{
"epoch": 8.464452214452214,
"grad_norm": 0.34700310230255127,
"learning_rate": 0.0004987206997084547,
"loss": 3.4302,
"step": 29050
},
{
"epoch": 8.479020979020978,
"grad_norm": 0.3363369405269623,
"learning_rate": 0.0004985457725947521,
"loss": 3.4196,
"step": 29100
},
{
"epoch": 8.493589743589745,
"grad_norm": 0.34493017196655273,
"learning_rate": 0.0004983708454810496,
"loss": 3.4233,
"step": 29150
},
{
"epoch": 8.508158508158509,
"grad_norm": 0.3357371389865875,
"learning_rate": 0.0004981959183673469,
"loss": 3.4124,
"step": 29200
},
{
"epoch": 8.522727272727273,
"grad_norm": 0.3642560541629791,
"learning_rate": 0.0004980209912536443,
"loss": 3.4161,
"step": 29250
},
{
"epoch": 8.537296037296038,
"grad_norm": 0.3482314944267273,
"learning_rate": 0.0004978460641399417,
"loss": 3.4204,
"step": 29300
},
{
"epoch": 8.551864801864802,
"grad_norm": 0.3307981491088867,
"learning_rate": 0.000497671137026239,
"loss": 3.4275,
"step": 29350
},
{
"epoch": 8.566433566433567,
"grad_norm": 0.3394106924533844,
"learning_rate": 0.0004974962099125364,
"loss": 3.4057,
"step": 29400
},
{
"epoch": 8.581002331002331,
"grad_norm": 0.3372842073440552,
"learning_rate": 0.0004973212827988337,
"loss": 3.4092,
"step": 29450
},
{
"epoch": 8.595571095571096,
"grad_norm": 0.32758432626724243,
"learning_rate": 0.0004971463556851312,
"loss": 3.4188,
"step": 29500
},
{
"epoch": 8.61013986013986,
"grad_norm": 0.3386209309101105,
"learning_rate": 0.0004969714285714286,
"loss": 3.4205,
"step": 29550
},
{
"epoch": 8.624708624708624,
"grad_norm": 0.3470524549484253,
"learning_rate": 0.0004967965014577259,
"loss": 3.4182,
"step": 29600
},
{
"epoch": 8.639277389277389,
"grad_norm": 0.3339594900608063,
"learning_rate": 0.0004966215743440233,
"loss": 3.4176,
"step": 29650
},
{
"epoch": 8.653846153846153,
"grad_norm": 0.3515772223472595,
"learning_rate": 0.0004964466472303207,
"loss": 3.4288,
"step": 29700
},
{
"epoch": 8.668414918414918,
"grad_norm": 0.35019543766975403,
"learning_rate": 0.000496271720116618,
"loss": 3.4287,
"step": 29750
},
{
"epoch": 8.682983682983682,
"grad_norm": 0.3279973566532135,
"learning_rate": 0.0004960967930029154,
"loss": 3.4211,
"step": 29800
},
{
"epoch": 8.697552447552448,
"grad_norm": 0.33548232913017273,
"learning_rate": 0.0004959218658892127,
"loss": 3.4308,
"step": 29850
},
{
"epoch": 8.712121212121213,
"grad_norm": 0.3599195182323456,
"learning_rate": 0.0004957469387755102,
"loss": 3.4228,
"step": 29900
},
{
"epoch": 8.726689976689977,
"grad_norm": 0.34652629494667053,
"learning_rate": 0.0004955720116618075,
"loss": 3.4285,
"step": 29950
},
{
"epoch": 8.741258741258742,
"grad_norm": 0.332381933927536,
"learning_rate": 0.0004953970845481049,
"loss": 3.4275,
"step": 30000
},
{
"epoch": 8.741258741258742,
"eval_accuracy": 0.36736185601657184,
"eval_loss": 3.5740106105804443,
"eval_runtime": 182.2015,
"eval_samples_per_second": 91.338,
"eval_steps_per_second": 5.713,
"step": 30000
},
{
"epoch": 8.755827505827506,
"grad_norm": 0.3524293601512909,
"learning_rate": 0.0004952221574344023,
"loss": 3.4245,
"step": 30050
},
{
"epoch": 8.77039627039627,
"grad_norm": 0.33680975437164307,
"learning_rate": 0.0004950472303206997,
"loss": 3.4345,
"step": 30100
},
{
"epoch": 8.784965034965035,
"grad_norm": 0.34272924065589905,
"learning_rate": 0.0004948723032069971,
"loss": 3.4335,
"step": 30150
},
{
"epoch": 8.7995337995338,
"grad_norm": 0.3409082591533661,
"learning_rate": 0.0004946973760932944,
"loss": 3.4283,
"step": 30200
},
{
"epoch": 8.814102564102564,
"grad_norm": 0.36862286925315857,
"learning_rate": 0.0004945224489795917,
"loss": 3.4235,
"step": 30250
},
{
"epoch": 8.828671328671328,
"grad_norm": 0.3254280388355255,
"learning_rate": 0.0004943475218658892,
"loss": 3.4312,
"step": 30300
},
{
"epoch": 8.843240093240093,
"grad_norm": 0.3392513394355774,
"learning_rate": 0.0004941725947521865,
"loss": 3.4257,
"step": 30350
},
{
"epoch": 8.857808857808857,
"grad_norm": 0.3554167151451111,
"learning_rate": 0.0004939976676384839,
"loss": 3.4304,
"step": 30400
},
{
"epoch": 8.872377622377622,
"grad_norm": 0.35996973514556885,
"learning_rate": 0.0004938227405247813,
"loss": 3.4399,
"step": 30450
},
{
"epoch": 8.886946386946388,
"grad_norm": 0.36442074179649353,
"learning_rate": 0.0004936478134110787,
"loss": 3.4316,
"step": 30500
},
{
"epoch": 8.901515151515152,
"grad_norm": 0.36240333318710327,
"learning_rate": 0.0004934728862973761,
"loss": 3.4262,
"step": 30550
},
{
"epoch": 8.916083916083917,
"grad_norm": 0.33148348331451416,
"learning_rate": 0.0004932979591836734,
"loss": 3.4361,
"step": 30600
},
{
"epoch": 8.930652680652681,
"grad_norm": 0.3203504681587219,
"learning_rate": 0.0004931230320699707,
"loss": 3.4466,
"step": 30650
},
{
"epoch": 8.945221445221446,
"grad_norm": 0.357393741607666,
"learning_rate": 0.0004929481049562682,
"loss": 3.4402,
"step": 30700
},
{
"epoch": 8.95979020979021,
"grad_norm": 0.36473795771598816,
"learning_rate": 0.0004927731778425655,
"loss": 3.4386,
"step": 30750
},
{
"epoch": 8.974358974358974,
"grad_norm": 0.31827130913734436,
"learning_rate": 0.0004925982507288629,
"loss": 3.4416,
"step": 30800
},
{
"epoch": 8.988927738927739,
"grad_norm": 0.35165274143218994,
"learning_rate": 0.0004924233236151604,
"loss": 3.4313,
"step": 30850
},
{
"epoch": 9.003496503496503,
"grad_norm": 0.3829018771648407,
"learning_rate": 0.0004922483965014577,
"loss": 3.4134,
"step": 30900
},
{
"epoch": 9.018065268065268,
"grad_norm": 0.3232770264148712,
"learning_rate": 0.0004920734693877551,
"loss": 3.3232,
"step": 30950
},
{
"epoch": 9.032634032634032,
"grad_norm": 0.33512336015701294,
"learning_rate": 0.0004918985422740524,
"loss": 3.3293,
"step": 31000
},
{
"epoch": 9.032634032634032,
"eval_accuracy": 0.3672867154543323,
"eval_loss": 3.579836130142212,
"eval_runtime": 181.2154,
"eval_samples_per_second": 91.835,
"eval_steps_per_second": 5.745,
"step": 31000
},
{
"epoch": 9.047202797202797,
"grad_norm": 0.35247862339019775,
"learning_rate": 0.0004917236151603499,
"loss": 3.323,
"step": 31050
},
{
"epoch": 9.061771561771561,
"grad_norm": 0.33538663387298584,
"learning_rate": 0.0004915486880466472,
"loss": 3.3462,
"step": 31100
},
{
"epoch": 9.076340326340326,
"grad_norm": 0.3494170010089874,
"learning_rate": 0.0004913737609329445,
"loss": 3.3328,
"step": 31150
},
{
"epoch": 9.090909090909092,
"grad_norm": 0.35296133160591125,
"learning_rate": 0.0004911988338192419,
"loss": 3.3354,
"step": 31200
},
{
"epoch": 9.105477855477856,
"grad_norm": 0.3609370291233063,
"learning_rate": 0.0004910239067055393,
"loss": 3.3555,
"step": 31250
},
{
"epoch": 9.12004662004662,
"grad_norm": 0.3352583050727844,
"learning_rate": 0.0004908489795918367,
"loss": 3.3444,
"step": 31300
},
{
"epoch": 9.134615384615385,
"grad_norm": 0.3525612950325012,
"learning_rate": 0.0004906740524781341,
"loss": 3.3484,
"step": 31350
},
{
"epoch": 9.14918414918415,
"grad_norm": 0.37619081139564514,
"learning_rate": 0.0004904991253644314,
"loss": 3.3576,
"step": 31400
},
{
"epoch": 9.163752913752914,
"grad_norm": 0.3352401852607727,
"learning_rate": 0.0004903241982507289,
"loss": 3.3542,
"step": 31450
},
{
"epoch": 9.178321678321678,
"grad_norm": 0.3672662675380707,
"learning_rate": 0.0004901492711370262,
"loss": 3.3445,
"step": 31500
},
{
"epoch": 9.192890442890443,
"grad_norm": 0.36354750394821167,
"learning_rate": 0.0004899743440233235,
"loss": 3.3598,
"step": 31550
},
{
"epoch": 9.207459207459207,
"grad_norm": 0.332333505153656,
"learning_rate": 0.0004897994169096209,
"loss": 3.3689,
"step": 31600
},
{
"epoch": 9.222027972027972,
"grad_norm": 0.35484346747398376,
"learning_rate": 0.0004896244897959183,
"loss": 3.3683,
"step": 31650
},
{
"epoch": 9.236596736596736,
"grad_norm": 0.3454098403453827,
"learning_rate": 0.0004894495626822157,
"loss": 3.3646,
"step": 31700
},
{
"epoch": 9.2511655011655,
"grad_norm": 0.34342116117477417,
"learning_rate": 0.0004892746355685131,
"loss": 3.3622,
"step": 31750
},
{
"epoch": 9.265734265734265,
"grad_norm": 0.3866739273071289,
"learning_rate": 0.0004890997084548104,
"loss": 3.3679,
"step": 31800
},
{
"epoch": 9.280303030303031,
"grad_norm": 0.3526863753795624,
"learning_rate": 0.0004889247813411079,
"loss": 3.3813,
"step": 31850
},
{
"epoch": 9.294871794871796,
"grad_norm": 0.36674419045448303,
"learning_rate": 0.0004887498542274052,
"loss": 3.3658,
"step": 31900
},
{
"epoch": 9.30944055944056,
"grad_norm": 0.36833953857421875,
"learning_rate": 0.0004885749271137026,
"loss": 3.3734,
"step": 31950
},
{
"epoch": 9.324009324009324,
"grad_norm": 0.3428957462310791,
"learning_rate": 0.0004883999999999999,
"loss": 3.3803,
"step": 32000
},
{
"epoch": 9.324009324009324,
"eval_accuracy": 0.36745534075363046,
"eval_loss": 3.579030990600586,
"eval_runtime": 180.7616,
"eval_samples_per_second": 92.066,
"eval_steps_per_second": 5.759,
"step": 32000
},
{
"epoch": 9.338578088578089,
"grad_norm": 0.3581482172012329,
"learning_rate": 0.0004882250728862973,
"loss": 3.3766,
"step": 32050
},
{
"epoch": 9.353146853146853,
"grad_norm": 0.3714257478713989,
"learning_rate": 0.0004880501457725947,
"loss": 3.3814,
"step": 32100
},
{
"epoch": 9.367715617715618,
"grad_norm": 0.361931174993515,
"learning_rate": 0.00048787521865889207,
"loss": 3.3851,
"step": 32150
},
{
"epoch": 9.382284382284382,
"grad_norm": 0.3409428596496582,
"learning_rate": 0.00048770029154518945,
"loss": 3.384,
"step": 32200
},
{
"epoch": 9.396853146853147,
"grad_norm": 0.40810930728912354,
"learning_rate": 0.0004875253644314868,
"loss": 3.3816,
"step": 32250
},
{
"epoch": 9.411421911421911,
"grad_norm": 0.3254898190498352,
"learning_rate": 0.0004873504373177842,
"loss": 3.381,
"step": 32300
},
{
"epoch": 9.425990675990676,
"grad_norm": 0.354233056306839,
"learning_rate": 0.00048717551020408163,
"loss": 3.3847,
"step": 32350
},
{
"epoch": 9.44055944055944,
"grad_norm": 0.3318980038166046,
"learning_rate": 0.000487000583090379,
"loss": 3.3792,
"step": 32400
},
{
"epoch": 9.455128205128204,
"grad_norm": 0.32618919014930725,
"learning_rate": 0.00048682565597667633,
"loss": 3.3899,
"step": 32450
},
{
"epoch": 9.469696969696969,
"grad_norm": 0.39949190616607666,
"learning_rate": 0.0004866507288629737,
"loss": 3.3837,
"step": 32500
},
{
"epoch": 9.484265734265735,
"grad_norm": 0.3685564398765564,
"learning_rate": 0.0004864758017492711,
"loss": 3.3851,
"step": 32550
},
{
"epoch": 9.4988344988345,
"grad_norm": 0.359235018491745,
"learning_rate": 0.00048630087463556845,
"loss": 3.3893,
"step": 32600
},
{
"epoch": 9.513403263403264,
"grad_norm": 0.33161383867263794,
"learning_rate": 0.00048612594752186583,
"loss": 3.4009,
"step": 32650
},
{
"epoch": 9.527972027972028,
"grad_norm": 0.3646078109741211,
"learning_rate": 0.0004859510204081632,
"loss": 3.4062,
"step": 32700
},
{
"epoch": 9.542540792540793,
"grad_norm": 0.32304298877716064,
"learning_rate": 0.00048577609329446064,
"loss": 3.4058,
"step": 32750
},
{
"epoch": 9.557109557109557,
"grad_norm": 0.340385764837265,
"learning_rate": 0.000485601166180758,
"loss": 3.4003,
"step": 32800
},
{
"epoch": 9.571678321678322,
"grad_norm": 0.353704571723938,
"learning_rate": 0.0004854262390670554,
"loss": 3.3916,
"step": 32850
},
{
"epoch": 9.586247086247086,
"grad_norm": 0.3353423476219177,
"learning_rate": 0.0004852513119533527,
"loss": 3.4019,
"step": 32900
},
{
"epoch": 9.60081585081585,
"grad_norm": 0.3232695758342743,
"learning_rate": 0.0004850763848396501,
"loss": 3.3974,
"step": 32950
},
{
"epoch": 9.615384615384615,
"grad_norm": 0.36285659670829773,
"learning_rate": 0.00048490145772594746,
"loss": 3.3931,
"step": 33000
},
{
"epoch": 9.615384615384615,
"eval_accuracy": 0.368203101059235,
"eval_loss": 3.5726845264434814,
"eval_runtime": 180.0842,
"eval_samples_per_second": 92.412,
"eval_steps_per_second": 5.781,
"step": 33000
},
{
"epoch": 9.62995337995338,
"grad_norm": 0.3308947682380676,
"learning_rate": 0.00048472653061224484,
"loss": 3.4049,
"step": 33050
},
{
"epoch": 9.644522144522144,
"grad_norm": 0.3408724367618561,
"learning_rate": 0.0004845516034985422,
"loss": 3.404,
"step": 33100
},
{
"epoch": 9.659090909090908,
"grad_norm": 0.34324896335601807,
"learning_rate": 0.0004843766763848396,
"loss": 3.399,
"step": 33150
},
{
"epoch": 9.673659673659674,
"grad_norm": 0.34077367186546326,
"learning_rate": 0.000484201749271137,
"loss": 3.3953,
"step": 33200
},
{
"epoch": 9.688228438228439,
"grad_norm": 0.35905328392982483,
"learning_rate": 0.0004840268221574344,
"loss": 3.3853,
"step": 33250
},
{
"epoch": 9.702797202797203,
"grad_norm": 0.3622050881385803,
"learning_rate": 0.00048385189504373177,
"loss": 3.4025,
"step": 33300
},
{
"epoch": 9.717365967365968,
"grad_norm": 0.34367215633392334,
"learning_rate": 0.0004836769679300291,
"loss": 3.4029,
"step": 33350
},
{
"epoch": 9.731934731934732,
"grad_norm": 0.32383468747138977,
"learning_rate": 0.00048350204081632647,
"loss": 3.4049,
"step": 33400
},
{
"epoch": 9.746503496503497,
"grad_norm": 0.36959537863731384,
"learning_rate": 0.00048332711370262384,
"loss": 3.405,
"step": 33450
},
{
"epoch": 9.761072261072261,
"grad_norm": 0.3404758870601654,
"learning_rate": 0.0004831521865889212,
"loss": 3.4005,
"step": 33500
},
{
"epoch": 9.775641025641026,
"grad_norm": 0.36188212037086487,
"learning_rate": 0.0004829772594752186,
"loss": 3.4074,
"step": 33550
},
{
"epoch": 9.79020979020979,
"grad_norm": 0.38642576336860657,
"learning_rate": 0.00048280233236151597,
"loss": 3.4068,
"step": 33600
},
{
"epoch": 9.804778554778554,
"grad_norm": 0.32433605194091797,
"learning_rate": 0.0004826274052478134,
"loss": 3.4092,
"step": 33650
},
{
"epoch": 9.819347319347319,
"grad_norm": 0.3639720678329468,
"learning_rate": 0.0004824524781341108,
"loss": 3.3985,
"step": 33700
},
{
"epoch": 9.833916083916083,
"grad_norm": 0.3690209686756134,
"learning_rate": 0.00048227755102040815,
"loss": 3.407,
"step": 33750
},
{
"epoch": 9.848484848484848,
"grad_norm": 0.32806217670440674,
"learning_rate": 0.0004821026239067055,
"loss": 3.4117,
"step": 33800
},
{
"epoch": 9.863053613053612,
"grad_norm": 0.32632794976234436,
"learning_rate": 0.00048192769679300285,
"loss": 3.4169,
"step": 33850
},
{
"epoch": 9.877622377622378,
"grad_norm": 0.34658604860305786,
"learning_rate": 0.0004817527696793002,
"loss": 3.4117,
"step": 33900
},
{
"epoch": 9.892191142191143,
"grad_norm": 0.34974268078804016,
"learning_rate": 0.0004815778425655976,
"loss": 3.4073,
"step": 33950
},
{
"epoch": 9.906759906759907,
"grad_norm": 0.3343101739883423,
"learning_rate": 0.000481402915451895,
"loss": 3.4063,
"step": 34000
},
{
"epoch": 9.906759906759907,
"eval_accuracy": 0.3688429129514813,
"eval_loss": 3.5587732791900635,
"eval_runtime": 180.2379,
"eval_samples_per_second": 92.334,
"eval_steps_per_second": 5.776,
"step": 34000
},
{
"epoch": 9.921328671328672,
"grad_norm": 0.33629804849624634,
"learning_rate": 0.0004812279883381924,
"loss": 3.4184,
"step": 34050
},
{
"epoch": 9.935897435897436,
"grad_norm": 0.35826265811920166,
"learning_rate": 0.0004810530612244898,
"loss": 3.4062,
"step": 34100
},
{
"epoch": 9.9504662004662,
"grad_norm": 0.3323402404785156,
"learning_rate": 0.00048087813411078716,
"loss": 3.4029,
"step": 34150
},
{
"epoch": 9.965034965034965,
"grad_norm": 0.3231922388076782,
"learning_rate": 0.00048070320699708453,
"loss": 3.4137,
"step": 34200
},
{
"epoch": 9.97960372960373,
"grad_norm": 0.35591524839401245,
"learning_rate": 0.00048052827988338186,
"loss": 3.4172,
"step": 34250
},
{
"epoch": 9.994172494172494,
"grad_norm": 0.3526099920272827,
"learning_rate": 0.00048035335276967923,
"loss": 3.4215,
"step": 34300
},
{
"epoch": 10.008741258741258,
"grad_norm": 0.367563933134079,
"learning_rate": 0.0004801784256559766,
"loss": 3.3311,
"step": 34350
},
{
"epoch": 10.023310023310023,
"grad_norm": 0.34572193026542664,
"learning_rate": 0.000480003498542274,
"loss": 3.3062,
"step": 34400
},
{
"epoch": 10.037878787878787,
"grad_norm": 0.362204909324646,
"learning_rate": 0.00047982857142857136,
"loss": 3.3028,
"step": 34450
},
{
"epoch": 10.052447552447552,
"grad_norm": 0.3749389946460724,
"learning_rate": 0.0004796536443148688,
"loss": 3.3031,
"step": 34500
},
{
"epoch": 10.067016317016318,
"grad_norm": 0.3729357421398163,
"learning_rate": 0.00047947871720116616,
"loss": 3.3036,
"step": 34550
},
{
"epoch": 10.081585081585082,
"grad_norm": 0.3892238140106201,
"learning_rate": 0.00047930379008746354,
"loss": 3.3145,
"step": 34600
},
{
"epoch": 10.096153846153847,
"grad_norm": 0.3650963008403778,
"learning_rate": 0.0004791288629737609,
"loss": 3.3232,
"step": 34650
},
{
"epoch": 10.110722610722611,
"grad_norm": 0.3529200851917267,
"learning_rate": 0.00047895393586005824,
"loss": 3.3166,
"step": 34700
},
{
"epoch": 10.125291375291376,
"grad_norm": 0.3430958390235901,
"learning_rate": 0.0004787790087463556,
"loss": 3.3311,
"step": 34750
},
{
"epoch": 10.13986013986014,
"grad_norm": 0.35546183586120605,
"learning_rate": 0.000478604081632653,
"loss": 3.3229,
"step": 34800
},
{
"epoch": 10.154428904428904,
"grad_norm": 0.3477681279182434,
"learning_rate": 0.00047842915451895037,
"loss": 3.3211,
"step": 34850
},
{
"epoch": 10.168997668997669,
"grad_norm": 0.35804784297943115,
"learning_rate": 0.0004782542274052478,
"loss": 3.318,
"step": 34900
},
{
"epoch": 10.183566433566433,
"grad_norm": 0.3714865744113922,
"learning_rate": 0.00047807930029154517,
"loss": 3.3529,
"step": 34950
},
{
"epoch": 10.198135198135198,
"grad_norm": 0.37744787335395813,
"learning_rate": 0.00047790437317784255,
"loss": 3.3379,
"step": 35000
},
{
"epoch": 10.198135198135198,
"eval_accuracy": 0.36837090322248356,
"eval_loss": 3.5747363567352295,
"eval_runtime": 180.0894,
"eval_samples_per_second": 92.41,
"eval_steps_per_second": 5.78,
"step": 35000
},
{
"epoch": 10.212703962703962,
"grad_norm": 0.3652697801589966,
"learning_rate": 0.0004777294460641399,
"loss": 3.3403,
"step": 35050
},
{
"epoch": 10.227272727272727,
"grad_norm": 0.3565238118171692,
"learning_rate": 0.0004775545189504373,
"loss": 3.3517,
"step": 35100
},
{
"epoch": 10.241841491841491,
"grad_norm": 0.3647816777229309,
"learning_rate": 0.0004773795918367346,
"loss": 3.3465,
"step": 35150
},
{
"epoch": 10.256410256410255,
"grad_norm": 0.3312961161136627,
"learning_rate": 0.000477204664723032,
"loss": 3.3448,
"step": 35200
},
{
"epoch": 10.270979020979022,
"grad_norm": 0.3463350534439087,
"learning_rate": 0.00047702973760932937,
"loss": 3.329,
"step": 35250
},
{
"epoch": 10.285547785547786,
"grad_norm": 0.36243367195129395,
"learning_rate": 0.00047685481049562675,
"loss": 3.3469,
"step": 35300
},
{
"epoch": 10.30011655011655,
"grad_norm": 0.3585239350795746,
"learning_rate": 0.0004766798833819242,
"loss": 3.3488,
"step": 35350
},
{
"epoch": 10.314685314685315,
"grad_norm": 0.33923816680908203,
"learning_rate": 0.00047650495626822155,
"loss": 3.357,
"step": 35400
},
{
"epoch": 10.32925407925408,
"grad_norm": 0.3626267910003662,
"learning_rate": 0.00047633002915451893,
"loss": 3.356,
"step": 35450
},
{
"epoch": 10.343822843822844,
"grad_norm": 0.36127206683158875,
"learning_rate": 0.0004761551020408163,
"loss": 3.3728,
"step": 35500
},
{
"epoch": 10.358391608391608,
"grad_norm": 0.3516559600830078,
"learning_rate": 0.0004759801749271137,
"loss": 3.3548,
"step": 35550
},
{
"epoch": 10.372960372960373,
"grad_norm": 0.38914352655410767,
"learning_rate": 0.000475805247813411,
"loss": 3.3593,
"step": 35600
},
{
"epoch": 10.387529137529137,
"grad_norm": 0.3629930317401886,
"learning_rate": 0.0004756303206997084,
"loss": 3.3497,
"step": 35650
},
{
"epoch": 10.402097902097902,
"grad_norm": 0.34036391973495483,
"learning_rate": 0.00047545539358600575,
"loss": 3.3635,
"step": 35700
},
{
"epoch": 10.416666666666666,
"grad_norm": 0.35723787546157837,
"learning_rate": 0.00047528046647230313,
"loss": 3.364,
"step": 35750
},
{
"epoch": 10.43123543123543,
"grad_norm": 0.3406592309474945,
"learning_rate": 0.00047510553935860056,
"loss": 3.3589,
"step": 35800
},
{
"epoch": 10.445804195804195,
"grad_norm": 0.3650604784488678,
"learning_rate": 0.00047493061224489794,
"loss": 3.3673,
"step": 35850
},
{
"epoch": 10.460372960372961,
"grad_norm": 0.33995601534843445,
"learning_rate": 0.0004747556851311953,
"loss": 3.3702,
"step": 35900
},
{
"epoch": 10.474941724941726,
"grad_norm": 0.3596780002117157,
"learning_rate": 0.0004745807580174927,
"loss": 3.3651,
"step": 35950
},
{
"epoch": 10.48951048951049,
"grad_norm": 0.358271062374115,
"learning_rate": 0.00047440583090379006,
"loss": 3.3768,
"step": 36000
},
{
"epoch": 10.48951048951049,
"eval_accuracy": 0.3689066471998911,
"eval_loss": 3.565972089767456,
"eval_runtime": 180.2039,
"eval_samples_per_second": 92.351,
"eval_steps_per_second": 5.777,
"step": 36000
},
{
"epoch": 10.504079254079254,
"grad_norm": 0.3587784767150879,
"learning_rate": 0.0004742309037900874,
"loss": 3.3685,
"step": 36050
},
{
"epoch": 10.518648018648019,
"grad_norm": 0.36644667387008667,
"learning_rate": 0.00047405597667638476,
"loss": 3.3731,
"step": 36100
},
{
"epoch": 10.533216783216783,
"grad_norm": 0.3659219741821289,
"learning_rate": 0.00047388104956268214,
"loss": 3.3799,
"step": 36150
},
{
"epoch": 10.547785547785548,
"grad_norm": 0.36219388246536255,
"learning_rate": 0.00047370612244897957,
"loss": 3.366,
"step": 36200
},
{
"epoch": 10.562354312354312,
"grad_norm": 0.3452727496623993,
"learning_rate": 0.00047353119533527694,
"loss": 3.3727,
"step": 36250
},
{
"epoch": 10.576923076923077,
"grad_norm": 0.34664297103881836,
"learning_rate": 0.0004733562682215743,
"loss": 3.359,
"step": 36300
},
{
"epoch": 10.591491841491841,
"grad_norm": 0.34712809324264526,
"learning_rate": 0.0004731813411078717,
"loss": 3.3701,
"step": 36350
},
{
"epoch": 10.606060606060606,
"grad_norm": 0.34347906708717346,
"learning_rate": 0.00047300641399416907,
"loss": 3.3803,
"step": 36400
},
{
"epoch": 10.62062937062937,
"grad_norm": 0.37337714433670044,
"learning_rate": 0.00047283148688046645,
"loss": 3.3882,
"step": 36450
},
{
"epoch": 10.635198135198134,
"grad_norm": 0.36376672983169556,
"learning_rate": 0.00047265655976676377,
"loss": 3.383,
"step": 36500
},
{
"epoch": 10.649766899766899,
"grad_norm": 0.34523946046829224,
"learning_rate": 0.00047248163265306114,
"loss": 3.3846,
"step": 36550
},
{
"epoch": 10.664335664335665,
"grad_norm": 0.3508089482784271,
"learning_rate": 0.0004723067055393585,
"loss": 3.3739,
"step": 36600
},
{
"epoch": 10.67890442890443,
"grad_norm": 0.3470657467842102,
"learning_rate": 0.00047213177842565595,
"loss": 3.3717,
"step": 36650
},
{
"epoch": 10.693473193473194,
"grad_norm": 0.3334925174713135,
"learning_rate": 0.0004719568513119533,
"loss": 3.3814,
"step": 36700
},
{
"epoch": 10.708041958041958,
"grad_norm": 0.3517080545425415,
"learning_rate": 0.0004717819241982507,
"loss": 3.3845,
"step": 36750
},
{
"epoch": 10.722610722610723,
"grad_norm": 0.3703469932079315,
"learning_rate": 0.0004716069970845481,
"loss": 3.3785,
"step": 36800
},
{
"epoch": 10.737179487179487,
"grad_norm": 0.3503482937812805,
"learning_rate": 0.00047143206997084545,
"loss": 3.3877,
"step": 36850
},
{
"epoch": 10.751748251748252,
"grad_norm": 0.36413902044296265,
"learning_rate": 0.00047125714285714283,
"loss": 3.3901,
"step": 36900
},
{
"epoch": 10.766317016317016,
"grad_norm": 0.35273477435112,
"learning_rate": 0.00047108221574344015,
"loss": 3.3989,
"step": 36950
},
{
"epoch": 10.78088578088578,
"grad_norm": 0.3469065725803375,
"learning_rate": 0.0004709072886297375,
"loss": 3.3929,
"step": 37000
},
{
"epoch": 10.78088578088578,
"eval_accuracy": 0.36930116454936474,
"eval_loss": 3.5597054958343506,
"eval_runtime": 180.1588,
"eval_samples_per_second": 92.374,
"eval_steps_per_second": 5.778,
"step": 37000
},
{
"epoch": 10.795454545454545,
"grad_norm": 0.347210556268692,
"learning_rate": 0.00047073236151603495,
"loss": 3.3819,
"step": 37050
},
{
"epoch": 10.81002331002331,
"grad_norm": 0.35915273427963257,
"learning_rate": 0.00047055743440233233,
"loss": 3.3801,
"step": 37100
},
{
"epoch": 10.824592074592074,
"grad_norm": 0.3388284146785736,
"learning_rate": 0.0004703825072886297,
"loss": 3.3866,
"step": 37150
},
{
"epoch": 10.83916083916084,
"grad_norm": 0.3657146990299225,
"learning_rate": 0.0004702075801749271,
"loss": 3.4009,
"step": 37200
},
{
"epoch": 10.853729603729604,
"grad_norm": 0.35583174228668213,
"learning_rate": 0.00047003265306122446,
"loss": 3.387,
"step": 37250
},
{
"epoch": 10.868298368298369,
"grad_norm": 0.3616805672645569,
"learning_rate": 0.00046985772594752183,
"loss": 3.3672,
"step": 37300
},
{
"epoch": 10.882867132867133,
"grad_norm": 0.34906110167503357,
"learning_rate": 0.0004696827988338192,
"loss": 3.3822,
"step": 37350
},
{
"epoch": 10.897435897435898,
"grad_norm": 0.37446925044059753,
"learning_rate": 0.00046950787172011653,
"loss": 3.3935,
"step": 37400
},
{
"epoch": 10.912004662004662,
"grad_norm": 0.3785672187805176,
"learning_rate": 0.0004693329446064139,
"loss": 3.3824,
"step": 37450
},
{
"epoch": 10.926573426573427,
"grad_norm": 0.37299731373786926,
"learning_rate": 0.00046915801749271134,
"loss": 3.3865,
"step": 37500
},
{
"epoch": 10.941142191142191,
"grad_norm": 0.3548412621021271,
"learning_rate": 0.0004689830903790087,
"loss": 3.3952,
"step": 37550
},
{
"epoch": 10.955710955710956,
"grad_norm": 0.36777183413505554,
"learning_rate": 0.0004688081632653061,
"loss": 3.3878,
"step": 37600
},
{
"epoch": 10.97027972027972,
"grad_norm": 0.36412835121154785,
"learning_rate": 0.00046863323615160346,
"loss": 3.4091,
"step": 37650
},
{
"epoch": 10.984848484848484,
"grad_norm": 0.3270232379436493,
"learning_rate": 0.00046845830903790084,
"loss": 3.3996,
"step": 37700
},
{
"epoch": 10.999417249417249,
"grad_norm": 0.3319988250732422,
"learning_rate": 0.0004682833819241982,
"loss": 3.3991,
"step": 37750
},
{
"epoch": 11.013986013986013,
"grad_norm": 0.35844141244888306,
"learning_rate": 0.0004681084548104956,
"loss": 3.2767,
"step": 37800
},
{
"epoch": 11.028554778554778,
"grad_norm": 0.3383696377277374,
"learning_rate": 0.0004679335276967929,
"loss": 3.2718,
"step": 37850
},
{
"epoch": 11.043123543123544,
"grad_norm": 0.3634346127510071,
"learning_rate": 0.0004677586005830903,
"loss": 3.2706,
"step": 37900
},
{
"epoch": 11.057692307692308,
"grad_norm": 0.3992638885974884,
"learning_rate": 0.0004675836734693877,
"loss": 3.2881,
"step": 37950
},
{
"epoch": 11.072261072261073,
"grad_norm": 0.35264912247657776,
"learning_rate": 0.0004674087463556851,
"loss": 3.2905,
"step": 38000
},
{
"epoch": 11.072261072261073,
"eval_accuracy": 0.36926294751849176,
"eval_loss": 3.5672919750213623,
"eval_runtime": 180.0525,
"eval_samples_per_second": 92.429,
"eval_steps_per_second": 5.782,
"step": 38000
},
{
"epoch": 11.086829836829837,
"grad_norm": 0.38650333881378174,
"learning_rate": 0.00046723381924198247,
"loss": 3.3106,
"step": 38050
},
{
"epoch": 11.101398601398602,
"grad_norm": 0.3478892743587494,
"learning_rate": 0.00046705889212827985,
"loss": 3.3016,
"step": 38100
},
{
"epoch": 11.115967365967366,
"grad_norm": 0.3671860992908478,
"learning_rate": 0.0004668839650145772,
"loss": 3.2985,
"step": 38150
},
{
"epoch": 11.13053613053613,
"grad_norm": 0.3565201461315155,
"learning_rate": 0.0004667090379008746,
"loss": 3.3071,
"step": 38200
},
{
"epoch": 11.145104895104895,
"grad_norm": 0.3274824321269989,
"learning_rate": 0.000466534110787172,
"loss": 3.3222,
"step": 38250
},
{
"epoch": 11.15967365967366,
"grad_norm": 0.3710516691207886,
"learning_rate": 0.0004663591836734693,
"loss": 3.3109,
"step": 38300
},
{
"epoch": 11.174242424242424,
"grad_norm": 0.37232545018196106,
"learning_rate": 0.0004661842565597667,
"loss": 3.3054,
"step": 38350
},
{
"epoch": 11.188811188811188,
"grad_norm": 0.3739616274833679,
"learning_rate": 0.0004660093294460641,
"loss": 3.3147,
"step": 38400
},
{
"epoch": 11.203379953379953,
"grad_norm": 0.35690245032310486,
"learning_rate": 0.0004658344023323615,
"loss": 3.3187,
"step": 38450
},
{
"epoch": 11.217948717948717,
"grad_norm": 0.3522016704082489,
"learning_rate": 0.00046565947521865885,
"loss": 3.321,
"step": 38500
},
{
"epoch": 11.232517482517483,
"grad_norm": 0.379158079624176,
"learning_rate": 0.00046548454810495623,
"loss": 3.3273,
"step": 38550
},
{
"epoch": 11.247086247086248,
"grad_norm": 0.37325507402420044,
"learning_rate": 0.0004653096209912536,
"loss": 3.3222,
"step": 38600
},
{
"epoch": 11.261655011655012,
"grad_norm": 0.3767625093460083,
"learning_rate": 0.000465134693877551,
"loss": 3.3269,
"step": 38650
},
{
"epoch": 11.276223776223777,
"grad_norm": 0.3531850278377533,
"learning_rate": 0.0004649597667638484,
"loss": 3.3361,
"step": 38700
},
{
"epoch": 11.290792540792541,
"grad_norm": 0.35781583189964294,
"learning_rate": 0.0004647848396501457,
"loss": 3.3308,
"step": 38750
},
{
"epoch": 11.305361305361306,
"grad_norm": 0.35981640219688416,
"learning_rate": 0.0004646099125364431,
"loss": 3.3252,
"step": 38800
},
{
"epoch": 11.31993006993007,
"grad_norm": 0.36371827125549316,
"learning_rate": 0.0004644349854227405,
"loss": 3.3374,
"step": 38850
},
{
"epoch": 11.334498834498834,
"grad_norm": 0.37464508414268494,
"learning_rate": 0.00046426005830903786,
"loss": 3.3461,
"step": 38900
},
{
"epoch": 11.349067599067599,
"grad_norm": 0.38214632868766785,
"learning_rate": 0.00046408513119533523,
"loss": 3.3348,
"step": 38950
},
{
"epoch": 11.363636363636363,
"grad_norm": 0.40841469168663025,
"learning_rate": 0.0004639102040816326,
"loss": 3.3375,
"step": 39000
},
{
"epoch": 11.363636363636363,
"eval_accuracy": 0.3695302315528744,
"eval_loss": 3.563751220703125,
"eval_runtime": 180.0277,
"eval_samples_per_second": 92.441,
"eval_steps_per_second": 5.782,
"step": 39000
},
{
"epoch": 11.378205128205128,
"grad_norm": 0.35644689202308655,
"learning_rate": 0.00046373527696793,
"loss": 3.3485,
"step": 39050
},
{
"epoch": 11.392773892773892,
"grad_norm": 0.3444243371486664,
"learning_rate": 0.00046356034985422736,
"loss": 3.3417,
"step": 39100
},
{
"epoch": 11.407342657342657,
"grad_norm": 0.3749789893627167,
"learning_rate": 0.0004633854227405248,
"loss": 3.3419,
"step": 39150
},
{
"epoch": 11.421911421911421,
"grad_norm": 0.3557623326778412,
"learning_rate": 0.0004632104956268221,
"loss": 3.3325,
"step": 39200
},
{
"epoch": 11.436480186480187,
"grad_norm": 0.36125391721725464,
"learning_rate": 0.0004630355685131195,
"loss": 3.3398,
"step": 39250
},
{
"epoch": 11.451048951048952,
"grad_norm": 0.3687732517719269,
"learning_rate": 0.00046286064139941687,
"loss": 3.3518,
"step": 39300
},
{
"epoch": 11.465617715617716,
"grad_norm": 0.3502034842967987,
"learning_rate": 0.00046268571428571424,
"loss": 3.3484,
"step": 39350
},
{
"epoch": 11.48018648018648,
"grad_norm": 0.3895909786224365,
"learning_rate": 0.0004625107871720116,
"loss": 3.3564,
"step": 39400
},
{
"epoch": 11.494755244755245,
"grad_norm": 0.3652609884738922,
"learning_rate": 0.000462335860058309,
"loss": 3.346,
"step": 39450
},
{
"epoch": 11.50932400932401,
"grad_norm": 0.372211754322052,
"learning_rate": 0.00046216093294460637,
"loss": 3.3468,
"step": 39500
},
{
"epoch": 11.523892773892774,
"grad_norm": 0.3634597063064575,
"learning_rate": 0.0004619860058309038,
"loss": 3.343,
"step": 39550
},
{
"epoch": 11.538461538461538,
"grad_norm": 0.3725431561470032,
"learning_rate": 0.0004618110787172012,
"loss": 3.3475,
"step": 39600
},
{
"epoch": 11.553030303030303,
"grad_norm": 0.3666999042034149,
"learning_rate": 0.0004616361516034985,
"loss": 3.3463,
"step": 39650
},
{
"epoch": 11.567599067599067,
"grad_norm": 0.33625391125679016,
"learning_rate": 0.00046146122448979587,
"loss": 3.3364,
"step": 39700
},
{
"epoch": 11.582167832167832,
"grad_norm": 0.35108792781829834,
"learning_rate": 0.00046128629737609325,
"loss": 3.3491,
"step": 39750
},
{
"epoch": 11.596736596736596,
"grad_norm": 0.36968687176704407,
"learning_rate": 0.0004611113702623906,
"loss": 3.3587,
"step": 39800
},
{
"epoch": 11.61130536130536,
"grad_norm": 0.37255340814590454,
"learning_rate": 0.000460936443148688,
"loss": 3.3613,
"step": 39850
},
{
"epoch": 11.625874125874127,
"grad_norm": 0.37071385979652405,
"learning_rate": 0.0004607615160349854,
"loss": 3.3637,
"step": 39900
},
{
"epoch": 11.640442890442891,
"grad_norm": 0.3244622051715851,
"learning_rate": 0.00046058658892128275,
"loss": 3.347,
"step": 39950
},
{
"epoch": 11.655011655011656,
"grad_norm": 0.33037108182907104,
"learning_rate": 0.0004604116618075802,
"loss": 3.352,
"step": 40000
},
{
"epoch": 11.655011655011656,
"eval_accuracy": 0.3698865318714751,
"eval_loss": 3.5570318698883057,
"eval_runtime": 179.9937,
"eval_samples_per_second": 92.459,
"eval_steps_per_second": 5.784,
"step": 40000
},
{
"epoch": 11.66958041958042,
"grad_norm": 0.3523600101470947,
"learning_rate": 0.00046023673469387756,
"loss": 3.3681,
"step": 40050
},
{
"epoch": 11.684149184149184,
"grad_norm": 0.336599737405777,
"learning_rate": 0.0004600618075801749,
"loss": 3.3582,
"step": 40100
},
{
"epoch": 11.698717948717949,
"grad_norm": 0.3519699275493622,
"learning_rate": 0.00045988688046647225,
"loss": 3.3528,
"step": 40150
},
{
"epoch": 11.713286713286713,
"grad_norm": 0.34988924860954285,
"learning_rate": 0.00045971195335276963,
"loss": 3.3675,
"step": 40200
},
{
"epoch": 11.727855477855478,
"grad_norm": 0.42452919483184814,
"learning_rate": 0.000459537026239067,
"loss": 3.3535,
"step": 40250
},
{
"epoch": 11.742424242424242,
"grad_norm": 0.35697510838508606,
"learning_rate": 0.0004593620991253644,
"loss": 3.3677,
"step": 40300
},
{
"epoch": 11.756993006993007,
"grad_norm": 0.35553404688835144,
"learning_rate": 0.00045918717201166176,
"loss": 3.3776,
"step": 40350
},
{
"epoch": 11.771561771561771,
"grad_norm": 0.343811959028244,
"learning_rate": 0.00045901224489795913,
"loss": 3.3593,
"step": 40400
},
{
"epoch": 11.786130536130536,
"grad_norm": 0.3576320707798004,
"learning_rate": 0.00045883731778425656,
"loss": 3.3734,
"step": 40450
},
{
"epoch": 11.8006993006993,
"grad_norm": 0.38827261328697205,
"learning_rate": 0.00045866239067055394,
"loss": 3.3659,
"step": 40500
},
{
"epoch": 11.815268065268064,
"grad_norm": 0.3964768350124359,
"learning_rate": 0.00045848746355685126,
"loss": 3.3767,
"step": 40550
},
{
"epoch": 11.82983682983683,
"grad_norm": 0.38394802808761597,
"learning_rate": 0.00045831253644314864,
"loss": 3.3616,
"step": 40600
},
{
"epoch": 11.844405594405595,
"grad_norm": 0.3503780961036682,
"learning_rate": 0.000458137609329446,
"loss": 3.3747,
"step": 40650
},
{
"epoch": 11.85897435897436,
"grad_norm": 0.3336319625377655,
"learning_rate": 0.0004579626822157434,
"loss": 3.3788,
"step": 40700
},
{
"epoch": 11.873543123543124,
"grad_norm": 0.3886152505874634,
"learning_rate": 0.00045778775510204076,
"loss": 3.3737,
"step": 40750
},
{
"epoch": 11.888111888111888,
"grad_norm": 0.3735368549823761,
"learning_rate": 0.00045761282798833814,
"loss": 3.3729,
"step": 40800
},
{
"epoch": 11.902680652680653,
"grad_norm": 0.3551517724990845,
"learning_rate": 0.00045743790087463557,
"loss": 3.3707,
"step": 40850
},
{
"epoch": 11.917249417249417,
"grad_norm": 0.3696897625923157,
"learning_rate": 0.00045726297376093294,
"loss": 3.3682,
"step": 40900
},
{
"epoch": 11.931818181818182,
"grad_norm": 0.36508408188819885,
"learning_rate": 0.0004570880466472303,
"loss": 3.3771,
"step": 40950
},
{
"epoch": 11.946386946386946,
"grad_norm": 0.39585646986961365,
"learning_rate": 0.00045691311953352764,
"loss": 3.3692,
"step": 41000
},
{
"epoch": 11.946386946386946,
"eval_accuracy": 0.37068826638375874,
"eval_loss": 3.5467517375946045,
"eval_runtime": 180.2482,
"eval_samples_per_second": 92.328,
"eval_steps_per_second": 5.775,
"step": 41000
},
{
"epoch": 11.96095571095571,
"grad_norm": 0.3724261522293091,
"learning_rate": 0.000456738192419825,
"loss": 3.3709,
"step": 41050
},
{
"epoch": 11.975524475524475,
"grad_norm": 0.3474469780921936,
"learning_rate": 0.0004565632653061224,
"loss": 3.3842,
"step": 41100
},
{
"epoch": 11.99009324009324,
"grad_norm": 0.3345330059528351,
"learning_rate": 0.00045638833819241977,
"loss": 3.3731,
"step": 41150
},
{
"epoch": 12.004662004662004,
"grad_norm": 0.37426432967185974,
"learning_rate": 0.00045621341107871715,
"loss": 3.3369,
"step": 41200
},
{
"epoch": 12.01923076923077,
"grad_norm": 0.37405309081077576,
"learning_rate": 0.0004560384839650145,
"loss": 3.2543,
"step": 41250
},
{
"epoch": 12.033799533799534,
"grad_norm": 0.36314573884010315,
"learning_rate": 0.00045586355685131195,
"loss": 3.2579,
"step": 41300
},
{
"epoch": 12.048368298368299,
"grad_norm": 0.3675522208213806,
"learning_rate": 0.0004556886297376093,
"loss": 3.2689,
"step": 41350
},
{
"epoch": 12.062937062937063,
"grad_norm": 0.3591010570526123,
"learning_rate": 0.0004555137026239067,
"loss": 3.2704,
"step": 41400
},
{
"epoch": 12.077505827505828,
"grad_norm": 0.3727307617664337,
"learning_rate": 0.000455338775510204,
"loss": 3.2726,
"step": 41450
},
{
"epoch": 12.092074592074592,
"grad_norm": 0.35560178756713867,
"learning_rate": 0.0004551638483965014,
"loss": 3.2717,
"step": 41500
},
{
"epoch": 12.106643356643357,
"grad_norm": 0.3758648931980133,
"learning_rate": 0.0004549889212827988,
"loss": 3.277,
"step": 41550
},
{
"epoch": 12.121212121212121,
"grad_norm": 0.3858795464038849,
"learning_rate": 0.00045481399416909615,
"loss": 3.2792,
"step": 41600
},
{
"epoch": 12.135780885780886,
"grad_norm": 0.3726632297039032,
"learning_rate": 0.00045463906705539353,
"loss": 3.2981,
"step": 41650
},
{
"epoch": 12.15034965034965,
"grad_norm": 0.3951834440231323,
"learning_rate": 0.0004544641399416909,
"loss": 3.2841,
"step": 41700
},
{
"epoch": 12.164918414918414,
"grad_norm": 0.3515232503414154,
"learning_rate": 0.00045428921282798833,
"loss": 3.2943,
"step": 41750
},
{
"epoch": 12.179487179487179,
"grad_norm": 0.3756238520145416,
"learning_rate": 0.0004541142857142857,
"loss": 3.2946,
"step": 41800
},
{
"epoch": 12.194055944055943,
"grad_norm": 0.3413456678390503,
"learning_rate": 0.0004539393586005831,
"loss": 3.2894,
"step": 41850
},
{
"epoch": 12.20862470862471,
"grad_norm": 0.35358941555023193,
"learning_rate": 0.0004537644314868804,
"loss": 3.2924,
"step": 41900
},
{
"epoch": 12.223193473193474,
"grad_norm": 0.38770952820777893,
"learning_rate": 0.0004535895043731778,
"loss": 3.3127,
"step": 41950
},
{
"epoch": 12.237762237762238,
"grad_norm": 0.3479763865470886,
"learning_rate": 0.00045341457725947516,
"loss": 3.3088,
"step": 42000
},
{
"epoch": 12.237762237762238,
"eval_accuracy": 0.36980774599244454,
"eval_loss": 3.5638058185577393,
"eval_runtime": 180.1244,
"eval_samples_per_second": 92.392,
"eval_steps_per_second": 5.779,
"step": 42000
},
{
"epoch": 12.252331002331003,
"grad_norm": 0.36385655403137207,
"learning_rate": 0.00045323965014577253,
"loss": 3.3019,
"step": 42050
},
{
"epoch": 12.266899766899767,
"grad_norm": 0.3448290228843689,
"learning_rate": 0.0004530647230320699,
"loss": 3.3137,
"step": 42100
},
{
"epoch": 12.281468531468532,
"grad_norm": 0.34945112466812134,
"learning_rate": 0.00045288979591836734,
"loss": 3.319,
"step": 42150
},
{
"epoch": 12.296037296037296,
"grad_norm": 0.3717491328716278,
"learning_rate": 0.0004527148688046647,
"loss": 3.3187,
"step": 42200
},
{
"epoch": 12.31060606060606,
"grad_norm": 0.37596195936203003,
"learning_rate": 0.0004525399416909621,
"loss": 3.3144,
"step": 42250
},
{
"epoch": 12.325174825174825,
"grad_norm": 0.35946109890937805,
"learning_rate": 0.00045236501457725947,
"loss": 3.3281,
"step": 42300
},
{
"epoch": 12.33974358974359,
"grad_norm": 0.3581676483154297,
"learning_rate": 0.0004521900874635568,
"loss": 3.3193,
"step": 42350
},
{
"epoch": 12.354312354312354,
"grad_norm": 0.3581669330596924,
"learning_rate": 0.00045201516034985416,
"loss": 3.3187,
"step": 42400
},
{
"epoch": 12.368881118881118,
"grad_norm": 0.3567669987678528,
"learning_rate": 0.00045184023323615154,
"loss": 3.3169,
"step": 42450
},
{
"epoch": 12.383449883449883,
"grad_norm": 0.3985763490200043,
"learning_rate": 0.0004516653061224489,
"loss": 3.3228,
"step": 42500
},
{
"epoch": 12.398018648018647,
"grad_norm": 0.3795642852783203,
"learning_rate": 0.0004514903790087463,
"loss": 3.3242,
"step": 42550
},
{
"epoch": 12.412587412587413,
"grad_norm": 0.39964866638183594,
"learning_rate": 0.0004513154518950437,
"loss": 3.3192,
"step": 42600
},
{
"epoch": 12.427156177156178,
"grad_norm": 0.368116557598114,
"learning_rate": 0.0004511405247813411,
"loss": 3.3117,
"step": 42650
},
{
"epoch": 12.441724941724942,
"grad_norm": 0.4013516902923584,
"learning_rate": 0.0004509655976676385,
"loss": 3.3342,
"step": 42700
},
{
"epoch": 12.456293706293707,
"grad_norm": 0.3931203782558441,
"learning_rate": 0.00045079067055393585,
"loss": 3.3392,
"step": 42750
},
{
"epoch": 12.470862470862471,
"grad_norm": 0.3617455065250397,
"learning_rate": 0.00045061574344023317,
"loss": 3.3214,
"step": 42800
},
{
"epoch": 12.485431235431236,
"grad_norm": 0.3787974715232849,
"learning_rate": 0.00045044081632653055,
"loss": 3.3154,
"step": 42850
},
{
"epoch": 12.5,
"grad_norm": 0.3658803701400757,
"learning_rate": 0.0004502658892128279,
"loss": 3.324,
"step": 42900
},
{
"epoch": 12.514568764568764,
"grad_norm": 0.3991664946079254,
"learning_rate": 0.0004500909620991253,
"loss": 3.3325,
"step": 42950
},
{
"epoch": 12.529137529137529,
"grad_norm": 0.3415054976940155,
"learning_rate": 0.00044991603498542273,
"loss": 3.3395,
"step": 43000
},
{
"epoch": 12.529137529137529,
"eval_accuracy": 0.37044708752123395,
"eval_loss": 3.556431531906128,
"eval_runtime": 180.1017,
"eval_samples_per_second": 92.403,
"eval_steps_per_second": 5.78,
"step": 43000
},
{
"epoch": 12.543706293706293,
"grad_norm": 0.36171630024909973,
"learning_rate": 0.0004497411078717201,
"loss": 3.3305,
"step": 43050
},
{
"epoch": 12.558275058275058,
"grad_norm": 0.3560401499271393,
"learning_rate": 0.0004495661807580175,
"loss": 3.3315,
"step": 43100
},
{
"epoch": 12.572843822843822,
"grad_norm": 0.3484310209751129,
"learning_rate": 0.00044939125364431486,
"loss": 3.3413,
"step": 43150
},
{
"epoch": 12.587412587412587,
"grad_norm": 0.35228946805000305,
"learning_rate": 0.00044921632653061223,
"loss": 3.3509,
"step": 43200
},
{
"epoch": 12.601981351981351,
"grad_norm": 0.3399851620197296,
"learning_rate": 0.00044904139941690955,
"loss": 3.3375,
"step": 43250
},
{
"epoch": 12.616550116550117,
"grad_norm": 0.37779510021209717,
"learning_rate": 0.00044886647230320693,
"loss": 3.3282,
"step": 43300
},
{
"epoch": 12.631118881118882,
"grad_norm": 0.35238713026046753,
"learning_rate": 0.0004486915451895043,
"loss": 3.3347,
"step": 43350
},
{
"epoch": 12.645687645687646,
"grad_norm": 0.36413443088531494,
"learning_rate": 0.0004485166180758017,
"loss": 3.341,
"step": 43400
},
{
"epoch": 12.66025641025641,
"grad_norm": 0.38908377289772034,
"learning_rate": 0.0004483416909620991,
"loss": 3.3425,
"step": 43450
},
{
"epoch": 12.674825174825175,
"grad_norm": 0.3778778612613678,
"learning_rate": 0.0004481667638483965,
"loss": 3.3466,
"step": 43500
},
{
"epoch": 12.68939393939394,
"grad_norm": 0.3862821161746979,
"learning_rate": 0.00044799183673469386,
"loss": 3.3389,
"step": 43550
},
{
"epoch": 12.703962703962704,
"grad_norm": 0.39754387736320496,
"learning_rate": 0.00044781690962099124,
"loss": 3.3586,
"step": 43600
},
{
"epoch": 12.718531468531468,
"grad_norm": 0.36277809739112854,
"learning_rate": 0.0004476419825072886,
"loss": 3.344,
"step": 43650
},
{
"epoch": 12.733100233100233,
"grad_norm": 0.34860557317733765,
"learning_rate": 0.00044746705539358593,
"loss": 3.3501,
"step": 43700
},
{
"epoch": 12.747668997668997,
"grad_norm": 0.3457394242286682,
"learning_rate": 0.0004472921282798833,
"loss": 3.3452,
"step": 43750
},
{
"epoch": 12.762237762237762,
"grad_norm": 0.33117881417274475,
"learning_rate": 0.0004471172011661807,
"loss": 3.3567,
"step": 43800
},
{
"epoch": 12.776806526806526,
"grad_norm": 0.33471763134002686,
"learning_rate": 0.00044694227405247806,
"loss": 3.3463,
"step": 43850
},
{
"epoch": 12.791375291375292,
"grad_norm": 0.36773985624313354,
"learning_rate": 0.0004467673469387755,
"loss": 3.3412,
"step": 43900
},
{
"epoch": 12.805944055944057,
"grad_norm": 0.3666783571243286,
"learning_rate": 0.00044659241982507287,
"loss": 3.3385,
"step": 43950
},
{
"epoch": 12.820512820512821,
"grad_norm": 0.36336493492126465,
"learning_rate": 0.00044641749271137024,
"loss": 3.3436,
"step": 44000
},
{
"epoch": 12.820512820512821,
"eval_accuracy": 0.37076799298970303,
"eval_loss": 3.5492851734161377,
"eval_runtime": 202.0105,
"eval_samples_per_second": 82.382,
"eval_steps_per_second": 5.153,
"step": 44000
},
{
"epoch": 12.835081585081586,
"grad_norm": 0.3824255168437958,
"learning_rate": 0.0004462425655976676,
"loss": 3.3686,
"step": 44050
},
{
"epoch": 12.84965034965035,
"grad_norm": 0.3811502754688263,
"learning_rate": 0.000446067638483965,
"loss": 3.3473,
"step": 44100
},
{
"epoch": 12.864219114219114,
"grad_norm": 0.35578691959381104,
"learning_rate": 0.0004458927113702623,
"loss": 3.3509,
"step": 44150
},
{
"epoch": 12.878787878787879,
"grad_norm": 0.35379981994628906,
"learning_rate": 0.0004457177842565597,
"loss": 3.3516,
"step": 44200
},
{
"epoch": 12.893356643356643,
"grad_norm": 0.3434535562992096,
"learning_rate": 0.00044554285714285707,
"loss": 3.3469,
"step": 44250
},
{
"epoch": 12.907925407925408,
"grad_norm": 0.33877626061439514,
"learning_rate": 0.0004453679300291545,
"loss": 3.3577,
"step": 44300
},
{
"epoch": 12.922494172494172,
"grad_norm": 0.3338939845561981,
"learning_rate": 0.0004451930029154519,
"loss": 3.3427,
"step": 44350
},
{
"epoch": 12.937062937062937,
"grad_norm": 0.35932457447052,
"learning_rate": 0.00044501807580174925,
"loss": 3.363,
"step": 44400
},
{
"epoch": 12.951631701631701,
"grad_norm": 0.37584739923477173,
"learning_rate": 0.0004448431486880466,
"loss": 3.3621,
"step": 44450
},
{
"epoch": 12.966200466200466,
"grad_norm": 0.37329939007759094,
"learning_rate": 0.000444668221574344,
"loss": 3.3654,
"step": 44500
},
{
"epoch": 12.98076923076923,
"grad_norm": 0.3690183460712433,
"learning_rate": 0.0004444932944606414,
"loss": 3.3546,
"step": 44550
},
{
"epoch": 12.995337995337996,
"grad_norm": 0.3789139688014984,
"learning_rate": 0.0004443183673469387,
"loss": 3.3704,
"step": 44600
},
{
"epoch": 13.00990675990676,
"grad_norm": 0.3670189678668976,
"learning_rate": 0.0004441434402332361,
"loss": 3.2863,
"step": 44650
},
{
"epoch": 13.024475524475525,
"grad_norm": 0.3434585630893707,
"learning_rate": 0.00044396851311953345,
"loss": 3.2454,
"step": 44700
},
{
"epoch": 13.03904428904429,
"grad_norm": 0.3941807746887207,
"learning_rate": 0.0004437935860058309,
"loss": 3.2563,
"step": 44750
},
{
"epoch": 13.053613053613054,
"grad_norm": 0.36598798632621765,
"learning_rate": 0.00044361865889212826,
"loss": 3.2473,
"step": 44800
},
{
"epoch": 13.068181818181818,
"grad_norm": 0.3792334794998169,
"learning_rate": 0.00044344373177842563,
"loss": 3.2651,
"step": 44850
},
{
"epoch": 13.082750582750583,
"grad_norm": 0.3610612750053406,
"learning_rate": 0.000443268804664723,
"loss": 3.2663,
"step": 44900
},
{
"epoch": 13.097319347319347,
"grad_norm": 0.3610003590583801,
"learning_rate": 0.0004430938775510204,
"loss": 3.2639,
"step": 44950
},
{
"epoch": 13.111888111888112,
"grad_norm": 0.36222654581069946,
"learning_rate": 0.00044291895043731776,
"loss": 3.2689,
"step": 45000
},
{
"epoch": 13.111888111888112,
"eval_accuracy": 0.3705993676904049,
"eval_loss": 3.5606191158294678,
"eval_runtime": 180.0392,
"eval_samples_per_second": 92.435,
"eval_steps_per_second": 5.782,
"step": 45000
},
{
"epoch": 13.126456876456876,
"grad_norm": 0.36570093035697937,
"learning_rate": 0.0004427440233236151,
"loss": 3.2841,
"step": 45050
},
{
"epoch": 13.14102564102564,
"grad_norm": 0.35010698437690735,
"learning_rate": 0.00044256909620991246,
"loss": 3.2813,
"step": 45100
},
{
"epoch": 13.155594405594405,
"grad_norm": 0.3795238137245178,
"learning_rate": 0.0004423941690962099,
"loss": 3.2703,
"step": 45150
},
{
"epoch": 13.17016317016317,
"grad_norm": 0.36561548709869385,
"learning_rate": 0.00044221924198250726,
"loss": 3.2844,
"step": 45200
},
{
"epoch": 13.184731934731936,
"grad_norm": 0.36267200112342834,
"learning_rate": 0.00044204431486880464,
"loss": 3.285,
"step": 45250
},
{
"epoch": 13.1993006993007,
"grad_norm": 0.361397385597229,
"learning_rate": 0.000441869387755102,
"loss": 3.2833,
"step": 45300
},
{
"epoch": 13.213869463869464,
"grad_norm": 0.37071478366851807,
"learning_rate": 0.0004416944606413994,
"loss": 3.2714,
"step": 45350
},
{
"epoch": 13.228438228438229,
"grad_norm": 0.36633357405662537,
"learning_rate": 0.00044151953352769677,
"loss": 3.2872,
"step": 45400
},
{
"epoch": 13.243006993006993,
"grad_norm": 0.4010475277900696,
"learning_rate": 0.00044134460641399414,
"loss": 3.3027,
"step": 45450
},
{
"epoch": 13.257575757575758,
"grad_norm": 0.3728073537349701,
"learning_rate": 0.00044116967930029146,
"loss": 3.2929,
"step": 45500
},
{
"epoch": 13.272144522144522,
"grad_norm": 0.374639630317688,
"learning_rate": 0.00044099475218658884,
"loss": 3.2947,
"step": 45550
},
{
"epoch": 13.286713286713287,
"grad_norm": 0.37459850311279297,
"learning_rate": 0.00044081982507288627,
"loss": 3.2916,
"step": 45600
},
{
"epoch": 13.301282051282051,
"grad_norm": 0.3703133761882782,
"learning_rate": 0.00044064489795918365,
"loss": 3.3003,
"step": 45650
},
{
"epoch": 13.315850815850816,
"grad_norm": 0.35315272212028503,
"learning_rate": 0.000440469970845481,
"loss": 3.2956,
"step": 45700
},
{
"epoch": 13.33041958041958,
"grad_norm": 0.36796048283576965,
"learning_rate": 0.0004402950437317784,
"loss": 3.2999,
"step": 45750
},
{
"epoch": 13.344988344988344,
"grad_norm": 0.3775721490383148,
"learning_rate": 0.00044012011661807577,
"loss": 3.296,
"step": 45800
},
{
"epoch": 13.359557109557109,
"grad_norm": 0.3700525164604187,
"learning_rate": 0.00043994518950437315,
"loss": 3.2943,
"step": 45850
},
{
"epoch": 13.374125874125873,
"grad_norm": 0.37419870495796204,
"learning_rate": 0.0004397702623906705,
"loss": 3.303,
"step": 45900
},
{
"epoch": 13.38869463869464,
"grad_norm": 0.38533103466033936,
"learning_rate": 0.00043959533527696785,
"loss": 3.3144,
"step": 45950
},
{
"epoch": 13.403263403263404,
"grad_norm": 0.36683189868927,
"learning_rate": 0.0004394204081632652,
"loss": 3.2973,
"step": 46000
},
{
"epoch": 13.403263403263404,
"eval_accuracy": 0.3709427330139409,
"eval_loss": 3.5570855140686035,
"eval_runtime": 180.2232,
"eval_samples_per_second": 92.341,
"eval_steps_per_second": 5.776,
"step": 46000
},
{
"epoch": 13.417832167832168,
"grad_norm": 0.40213772654533386,
"learning_rate": 0.00043924548104956265,
"loss": 3.3031,
"step": 46050
},
{
"epoch": 13.432400932400933,
"grad_norm": 0.3665577471256256,
"learning_rate": 0.00043907055393586003,
"loss": 3.3059,
"step": 46100
},
{
"epoch": 13.446969696969697,
"grad_norm": 0.36229410767555237,
"learning_rate": 0.0004388956268221574,
"loss": 3.3079,
"step": 46150
},
{
"epoch": 13.461538461538462,
"grad_norm": 0.3800623416900635,
"learning_rate": 0.0004387206997084548,
"loss": 3.3198,
"step": 46200
},
{
"epoch": 13.476107226107226,
"grad_norm": 0.33525654673576355,
"learning_rate": 0.00043854577259475215,
"loss": 3.3099,
"step": 46250
},
{
"epoch": 13.49067599067599,
"grad_norm": 0.37382787466049194,
"learning_rate": 0.00043837084548104953,
"loss": 3.3125,
"step": 46300
},
{
"epoch": 13.505244755244755,
"grad_norm": 0.3434448540210724,
"learning_rate": 0.0004381959183673469,
"loss": 3.3181,
"step": 46350
},
{
"epoch": 13.51981351981352,
"grad_norm": 0.38704603910446167,
"learning_rate": 0.00043802099125364423,
"loss": 3.3258,
"step": 46400
},
{
"epoch": 13.534382284382284,
"grad_norm": 0.36768314242362976,
"learning_rate": 0.00043784606413994166,
"loss": 3.322,
"step": 46450
},
{
"epoch": 13.548951048951048,
"grad_norm": 0.3738141655921936,
"learning_rate": 0.00043767113702623903,
"loss": 3.3159,
"step": 46500
},
{
"epoch": 13.563519813519813,
"grad_norm": 0.36555594205856323,
"learning_rate": 0.0004374962099125364,
"loss": 3.3224,
"step": 46550
},
{
"epoch": 13.578088578088579,
"grad_norm": 0.39236852526664734,
"learning_rate": 0.0004373212827988338,
"loss": 3.3168,
"step": 46600
},
{
"epoch": 13.592657342657343,
"grad_norm": 0.35549354553222656,
"learning_rate": 0.00043714635568513116,
"loss": 3.3138,
"step": 46650
},
{
"epoch": 13.607226107226108,
"grad_norm": 0.38144198060035706,
"learning_rate": 0.00043697142857142854,
"loss": 3.315,
"step": 46700
},
{
"epoch": 13.621794871794872,
"grad_norm": 0.3554701805114746,
"learning_rate": 0.0004367965014577259,
"loss": 3.3287,
"step": 46750
},
{
"epoch": 13.636363636363637,
"grad_norm": 0.35527220368385315,
"learning_rate": 0.00043662157434402334,
"loss": 3.318,
"step": 46800
},
{
"epoch": 13.650932400932401,
"grad_norm": 0.3695144057273865,
"learning_rate": 0.0004364466472303206,
"loss": 3.3222,
"step": 46850
},
{
"epoch": 13.665501165501166,
"grad_norm": 0.3941607177257538,
"learning_rate": 0.00043627172011661804,
"loss": 3.3223,
"step": 46900
},
{
"epoch": 13.68006993006993,
"grad_norm": 0.3855513036251068,
"learning_rate": 0.0004360967930029154,
"loss": 3.327,
"step": 46950
},
{
"epoch": 13.694638694638694,
"grad_norm": 0.4064894914627075,
"learning_rate": 0.0004359218658892128,
"loss": 3.3302,
"step": 47000
},
{
"epoch": 13.694638694638694,
"eval_accuracy": 0.3710078783527213,
"eval_loss": 3.5502822399139404,
"eval_runtime": 180.1072,
"eval_samples_per_second": 92.401,
"eval_steps_per_second": 5.78,
"step": 47000
},
{
"epoch": 13.709207459207459,
"grad_norm": 0.36102309823036194,
"learning_rate": 0.00043574693877551017,
"loss": 3.3326,
"step": 47050
},
{
"epoch": 13.723776223776223,
"grad_norm": 0.3735487461090088,
"learning_rate": 0.00043557201166180754,
"loss": 3.322,
"step": 47100
},
{
"epoch": 13.738344988344988,
"grad_norm": 0.3891875147819519,
"learning_rate": 0.0004353970845481049,
"loss": 3.3352,
"step": 47150
},
{
"epoch": 13.752913752913752,
"grad_norm": 0.35352927446365356,
"learning_rate": 0.0004352221574344023,
"loss": 3.3259,
"step": 47200
},
{
"epoch": 13.767482517482517,
"grad_norm": 0.358306884765625,
"learning_rate": 0.0004350472303206997,
"loss": 3.3198,
"step": 47250
},
{
"epoch": 13.782051282051283,
"grad_norm": 0.3883667290210724,
"learning_rate": 0.00043487230320699705,
"loss": 3.3426,
"step": 47300
},
{
"epoch": 13.796620046620047,
"grad_norm": 0.36384516954421997,
"learning_rate": 0.0004346973760932944,
"loss": 3.3394,
"step": 47350
},
{
"epoch": 13.811188811188812,
"grad_norm": 0.39026427268981934,
"learning_rate": 0.0004345224489795918,
"loss": 3.329,
"step": 47400
},
{
"epoch": 13.825757575757576,
"grad_norm": 0.6027129888534546,
"learning_rate": 0.0004343475218658892,
"loss": 3.3378,
"step": 47450
},
{
"epoch": 13.84032634032634,
"grad_norm": 0.39879918098449707,
"learning_rate": 0.00043417259475218655,
"loss": 3.3348,
"step": 47500
},
{
"epoch": 13.854895104895105,
"grad_norm": 0.35614633560180664,
"learning_rate": 0.0004339976676384839,
"loss": 3.3318,
"step": 47550
},
{
"epoch": 13.86946386946387,
"grad_norm": 0.37732526659965515,
"learning_rate": 0.0004338227405247813,
"loss": 3.3335,
"step": 47600
},
{
"epoch": 13.884032634032634,
"grad_norm": 0.36290261149406433,
"learning_rate": 0.00043364781341107873,
"loss": 3.3439,
"step": 47650
},
{
"epoch": 13.898601398601398,
"grad_norm": 0.3448139429092407,
"learning_rate": 0.0004334728862973761,
"loss": 3.331,
"step": 47700
},
{
"epoch": 13.913170163170163,
"grad_norm": 0.373145192861557,
"learning_rate": 0.00043329795918367343,
"loss": 3.3491,
"step": 47750
},
{
"epoch": 13.927738927738927,
"grad_norm": 0.3779900074005127,
"learning_rate": 0.0004331230320699708,
"loss": 3.3345,
"step": 47800
},
{
"epoch": 13.942307692307692,
"grad_norm": 0.35789304971694946,
"learning_rate": 0.0004329481049562682,
"loss": 3.3379,
"step": 47850
},
{
"epoch": 13.956876456876456,
"grad_norm": 0.3417630195617676,
"learning_rate": 0.00043277317784256556,
"loss": 3.3222,
"step": 47900
},
{
"epoch": 13.971445221445222,
"grad_norm": 0.35347306728363037,
"learning_rate": 0.00043259825072886293,
"loss": 3.3365,
"step": 47950
},
{
"epoch": 13.986013986013987,
"grad_norm": 0.4118961989879608,
"learning_rate": 0.0004324233236151603,
"loss": 3.3507,
"step": 48000
},
{
"epoch": 13.986013986013987,
"eval_accuracy": 0.3716781462788018,
"eval_loss": 3.5405595302581787,
"eval_runtime": 180.2892,
"eval_samples_per_second": 92.307,
"eval_steps_per_second": 5.774,
"step": 48000
},
{
"epoch": 14.000582750582751,
"grad_norm": 0.4268363416194916,
"learning_rate": 0.0004322483965014577,
"loss": 3.3316,
"step": 48050
},
{
"epoch": 14.015151515151516,
"grad_norm": 0.3893098533153534,
"learning_rate": 0.0004320734693877551,
"loss": 3.2321,
"step": 48100
},
{
"epoch": 14.02972027972028,
"grad_norm": 0.39411571621894836,
"learning_rate": 0.0004318985422740525,
"loss": 3.2315,
"step": 48150
},
{
"epoch": 14.044289044289044,
"grad_norm": 0.36802518367767334,
"learning_rate": 0.0004317236151603498,
"loss": 3.2374,
"step": 48200
},
{
"epoch": 14.058857808857809,
"grad_norm": 0.39768052101135254,
"learning_rate": 0.0004315486880466472,
"loss": 3.2414,
"step": 48250
},
{
"epoch": 14.073426573426573,
"grad_norm": 0.38169392943382263,
"learning_rate": 0.00043137376093294456,
"loss": 3.2486,
"step": 48300
},
{
"epoch": 14.087995337995338,
"grad_norm": 0.3780570924282074,
"learning_rate": 0.00043119883381924194,
"loss": 3.2548,
"step": 48350
},
{
"epoch": 14.102564102564102,
"grad_norm": 0.35785311460494995,
"learning_rate": 0.0004310239067055393,
"loss": 3.2463,
"step": 48400
},
{
"epoch": 14.117132867132867,
"grad_norm": 0.408586323261261,
"learning_rate": 0.0004308489795918367,
"loss": 3.2536,
"step": 48450
},
{
"epoch": 14.131701631701631,
"grad_norm": 0.38487815856933594,
"learning_rate": 0.00043067405247813407,
"loss": 3.2604,
"step": 48500
},
{
"epoch": 14.146270396270396,
"grad_norm": 0.36268073320388794,
"learning_rate": 0.0004304991253644315,
"loss": 3.2614,
"step": 48550
},
{
"epoch": 14.16083916083916,
"grad_norm": 0.38010692596435547,
"learning_rate": 0.00043032419825072887,
"loss": 3.2638,
"step": 48600
},
{
"epoch": 14.175407925407926,
"grad_norm": 0.36735981702804565,
"learning_rate": 0.0004301492711370262,
"loss": 3.2635,
"step": 48650
},
{
"epoch": 14.18997668997669,
"grad_norm": 0.37236976623535156,
"learning_rate": 0.00042997434402332357,
"loss": 3.2752,
"step": 48700
},
{
"epoch": 14.204545454545455,
"grad_norm": 0.3588123619556427,
"learning_rate": 0.00042979941690962094,
"loss": 3.2593,
"step": 48750
},
{
"epoch": 14.21911421911422,
"grad_norm": 0.3798566162586212,
"learning_rate": 0.0004296244897959183,
"loss": 3.2667,
"step": 48800
},
{
"epoch": 14.233682983682984,
"grad_norm": 0.3849664628505707,
"learning_rate": 0.0004294495626822157,
"loss": 3.2756,
"step": 48850
},
{
"epoch": 14.248251748251748,
"grad_norm": 0.3689548075199127,
"learning_rate": 0.00042927463556851307,
"loss": 3.2641,
"step": 48900
},
{
"epoch": 14.262820512820513,
"grad_norm": 0.38583266735076904,
"learning_rate": 0.0004290997084548105,
"loss": 3.2747,
"step": 48950
},
{
"epoch": 14.277389277389277,
"grad_norm": 0.3908044099807739,
"learning_rate": 0.0004289247813411079,
"loss": 3.2781,
"step": 49000
},
{
"epoch": 14.277389277389277,
"eval_accuracy": 0.3713245505500783,
"eval_loss": 3.555830717086792,
"eval_runtime": 180.235,
"eval_samples_per_second": 92.335,
"eval_steps_per_second": 5.776,
"step": 49000
},
{
"epoch": 14.291958041958042,
"grad_norm": 0.4048484265804291,
"learning_rate": 0.00042874985422740525,
"loss": 3.2815,
"step": 49050
},
{
"epoch": 14.306526806526806,
"grad_norm": 0.3809373080730438,
"learning_rate": 0.0004285749271137026,
"loss": 3.2844,
"step": 49100
},
{
"epoch": 14.32109557109557,
"grad_norm": 0.3947436213493347,
"learning_rate": 0.00042839999999999995,
"loss": 3.2895,
"step": 49150
},
{
"epoch": 14.335664335664335,
"grad_norm": 0.4283214211463928,
"learning_rate": 0.0004282250728862973,
"loss": 3.2771,
"step": 49200
},
{
"epoch": 14.3502331002331,
"grad_norm": 0.404867023229599,
"learning_rate": 0.0004280501457725947,
"loss": 3.2798,
"step": 49250
},
{
"epoch": 14.364801864801866,
"grad_norm": 0.4032004773616791,
"learning_rate": 0.0004278752186588921,
"loss": 3.2898,
"step": 49300
},
{
"epoch": 14.37937062937063,
"grad_norm": 0.3930039703845978,
"learning_rate": 0.00042770029154518945,
"loss": 3.2876,
"step": 49350
},
{
"epoch": 14.393939393939394,
"grad_norm": 0.3659101724624634,
"learning_rate": 0.0004275253644314869,
"loss": 3.296,
"step": 49400
},
{
"epoch": 14.408508158508159,
"grad_norm": 0.3935369551181793,
"learning_rate": 0.00042735043731778426,
"loss": 3.2924,
"step": 49450
},
{
"epoch": 14.423076923076923,
"grad_norm": 0.3779597580432892,
"learning_rate": 0.00042717551020408164,
"loss": 3.2885,
"step": 49500
},
{
"epoch": 14.437645687645688,
"grad_norm": 0.3811582922935486,
"learning_rate": 0.00042700058309037896,
"loss": 3.2916,
"step": 49550
},
{
"epoch": 14.452214452214452,
"grad_norm": 0.3844025135040283,
"learning_rate": 0.00042682565597667633,
"loss": 3.2914,
"step": 49600
},
{
"epoch": 14.466783216783217,
"grad_norm": 0.37973514199256897,
"learning_rate": 0.0004266507288629737,
"loss": 3.2938,
"step": 49650
},
{
"epoch": 14.481351981351981,
"grad_norm": 0.4009278416633606,
"learning_rate": 0.0004264758017492711,
"loss": 3.2984,
"step": 49700
},
{
"epoch": 14.495920745920746,
"grad_norm": 0.3518688678741455,
"learning_rate": 0.00042630087463556846,
"loss": 3.3005,
"step": 49750
},
{
"epoch": 14.51048951048951,
"grad_norm": 0.38609257340431213,
"learning_rate": 0.00042612594752186584,
"loss": 3.2928,
"step": 49800
},
{
"epoch": 14.525058275058274,
"grad_norm": 0.3609243333339691,
"learning_rate": 0.00042595102040816327,
"loss": 3.2912,
"step": 49850
},
{
"epoch": 14.539627039627039,
"grad_norm": 0.38364362716674805,
"learning_rate": 0.00042577609329446064,
"loss": 3.3073,
"step": 49900
},
{
"epoch": 14.554195804195803,
"grad_norm": 0.36876383423805237,
"learning_rate": 0.000425601166180758,
"loss": 3.307,
"step": 49950
},
{
"epoch": 14.56876456876457,
"grad_norm": 0.36288002133369446,
"learning_rate": 0.00042542623906705534,
"loss": 3.3064,
"step": 50000
},
{
"epoch": 14.56876456876457,
"eval_accuracy": 0.37152504297358135,
"eval_loss": 3.5497894287109375,
"eval_runtime": 180.1492,
"eval_samples_per_second": 92.379,
"eval_steps_per_second": 5.779,
"step": 50000
},
{
"epoch": 14.583333333333334,
"grad_norm": 0.3770962953567505,
"learning_rate": 0.0004252513119533527,
"loss": 3.3217,
"step": 50050
},
{
"epoch": 14.597902097902098,
"grad_norm": 0.36296921968460083,
"learning_rate": 0.0004250763848396501,
"loss": 3.3085,
"step": 50100
},
{
"epoch": 14.612470862470863,
"grad_norm": 0.3781993091106415,
"learning_rate": 0.00042490145772594747,
"loss": 3.315,
"step": 50150
},
{
"epoch": 14.627039627039627,
"grad_norm": 0.38002365827560425,
"learning_rate": 0.00042472653061224484,
"loss": 3.3129,
"step": 50200
},
{
"epoch": 14.641608391608392,
"grad_norm": 0.3946523070335388,
"learning_rate": 0.00042455160349854227,
"loss": 3.3144,
"step": 50250
},
{
"epoch": 14.656177156177156,
"grad_norm": 0.36559757590293884,
"learning_rate": 0.00042437667638483965,
"loss": 3.3157,
"step": 50300
},
{
"epoch": 14.67074592074592,
"grad_norm": 0.40196385979652405,
"learning_rate": 0.000424201749271137,
"loss": 3.3064,
"step": 50350
},
{
"epoch": 14.685314685314685,
"grad_norm": 0.39326444268226624,
"learning_rate": 0.0004240268221574344,
"loss": 3.2974,
"step": 50400
},
{
"epoch": 14.69988344988345,
"grad_norm": 0.38454002141952515,
"learning_rate": 0.0004238518950437317,
"loss": 3.3185,
"step": 50450
},
{
"epoch": 14.714452214452214,
"grad_norm": 0.3821794092655182,
"learning_rate": 0.0004236769679300291,
"loss": 3.3211,
"step": 50500
},
{
"epoch": 14.729020979020978,
"grad_norm": 0.36120903491973877,
"learning_rate": 0.00042350204081632647,
"loss": 3.3132,
"step": 50550
},
{
"epoch": 14.743589743589745,
"grad_norm": 0.37556177377700806,
"learning_rate": 0.00042332711370262385,
"loss": 3.3167,
"step": 50600
},
{
"epoch": 14.758158508158509,
"grad_norm": 0.40567317605018616,
"learning_rate": 0.0004231521865889212,
"loss": 3.3081,
"step": 50650
},
{
"epoch": 14.772727272727273,
"grad_norm": 0.37133491039276123,
"learning_rate": 0.00042297725947521865,
"loss": 3.3122,
"step": 50700
},
{
"epoch": 14.787296037296038,
"grad_norm": 0.36307451128959656,
"learning_rate": 0.00042280233236151603,
"loss": 3.3193,
"step": 50750
},
{
"epoch": 14.801864801864802,
"grad_norm": 0.35331910848617554,
"learning_rate": 0.0004226274052478134,
"loss": 3.3101,
"step": 50800
},
{
"epoch": 14.816433566433567,
"grad_norm": 0.3815527558326721,
"learning_rate": 0.0004224524781341108,
"loss": 3.3149,
"step": 50850
},
{
"epoch": 14.831002331002331,
"grad_norm": 0.35823893547058105,
"learning_rate": 0.0004222775510204081,
"loss": 3.3195,
"step": 50900
},
{
"epoch": 14.845571095571096,
"grad_norm": 0.3718653619289398,
"learning_rate": 0.0004221026239067055,
"loss": 3.3149,
"step": 50950
},
{
"epoch": 14.86013986013986,
"grad_norm": 0.4067089557647705,
"learning_rate": 0.00042192769679300285,
"loss": 3.318,
"step": 51000
},
{
"epoch": 14.86013986013986,
"eval_accuracy": 0.3718885163348997,
"eval_loss": 3.541916608810425,
"eval_runtime": 180.0954,
"eval_samples_per_second": 92.407,
"eval_steps_per_second": 5.78,
"step": 51000
},
{
"epoch": 14.874708624708624,
"grad_norm": 0.3652386963367462,
"learning_rate": 0.00042175276967930023,
"loss": 3.3265,
"step": 51050
},
{
"epoch": 14.889277389277389,
"grad_norm": 0.3712463080883026,
"learning_rate": 0.00042157784256559766,
"loss": 3.3248,
"step": 51100
},
{
"epoch": 14.903846153846153,
"grad_norm": 0.37910208106040955,
"learning_rate": 0.00042140291545189504,
"loss": 3.3283,
"step": 51150
},
{
"epoch": 14.918414918414918,
"grad_norm": 0.36953410506248474,
"learning_rate": 0.0004212279883381924,
"loss": 3.3234,
"step": 51200
},
{
"epoch": 14.932983682983682,
"grad_norm": 0.3733784556388855,
"learning_rate": 0.0004210530612244898,
"loss": 3.3219,
"step": 51250
},
{
"epoch": 14.947552447552448,
"grad_norm": 0.3792276382446289,
"learning_rate": 0.0004208781341107871,
"loss": 3.3232,
"step": 51300
},
{
"epoch": 14.962121212121213,
"grad_norm": 0.3714187741279602,
"learning_rate": 0.0004207032069970845,
"loss": 3.3271,
"step": 51350
},
{
"epoch": 14.976689976689977,
"grad_norm": 0.3657480478286743,
"learning_rate": 0.00042052827988338186,
"loss": 3.3185,
"step": 51400
},
{
"epoch": 14.991258741258742,
"grad_norm": 0.3697648048400879,
"learning_rate": 0.00042035335276967924,
"loss": 3.3353,
"step": 51450
},
{
"epoch": 15.005827505827506,
"grad_norm": 0.41941073536872864,
"learning_rate": 0.0004201784256559766,
"loss": 3.2813,
"step": 51500
},
{
"epoch": 15.02039627039627,
"grad_norm": 0.3773099482059479,
"learning_rate": 0.00042000349854227404,
"loss": 3.2151,
"step": 51550
},
{
"epoch": 15.034965034965035,
"grad_norm": 0.37644755840301514,
"learning_rate": 0.0004198285714285714,
"loss": 3.2213,
"step": 51600
},
{
"epoch": 15.0495337995338,
"grad_norm": 0.3629944622516632,
"learning_rate": 0.0004196536443148688,
"loss": 3.2295,
"step": 51650
},
{
"epoch": 15.064102564102564,
"grad_norm": 0.3724285960197449,
"learning_rate": 0.00041947871720116617,
"loss": 3.218,
"step": 51700
},
{
"epoch": 15.078671328671328,
"grad_norm": 0.3703879117965698,
"learning_rate": 0.0004193037900874635,
"loss": 3.2367,
"step": 51750
},
{
"epoch": 15.093240093240093,
"grad_norm": 0.3728949725627899,
"learning_rate": 0.00041912886297376087,
"loss": 3.2414,
"step": 51800
},
{
"epoch": 15.107808857808857,
"grad_norm": 0.3949025273323059,
"learning_rate": 0.00041895393586005824,
"loss": 3.2269,
"step": 51850
},
{
"epoch": 15.122377622377622,
"grad_norm": 0.3763778507709503,
"learning_rate": 0.0004187790087463556,
"loss": 3.2327,
"step": 51900
},
{
"epoch": 15.136946386946388,
"grad_norm": 0.41053783893585205,
"learning_rate": 0.000418604081632653,
"loss": 3.2503,
"step": 51950
},
{
"epoch": 15.151515151515152,
"grad_norm": 0.41025668382644653,
"learning_rate": 0.0004184291545189504,
"loss": 3.2508,
"step": 52000
},
{
"epoch": 15.151515151515152,
"eval_accuracy": 0.37150011371036573,
"eval_loss": 3.556598663330078,
"eval_runtime": 180.2269,
"eval_samples_per_second": 92.339,
"eval_steps_per_second": 5.776,
"step": 52000
},
{
"epoch": 15.166083916083917,
"grad_norm": 0.4072258770465851,
"learning_rate": 0.0004182542274052478,
"loss": 3.2452,
"step": 52050
},
{
"epoch": 15.180652680652681,
"grad_norm": 0.40523266792297363,
"learning_rate": 0.0004180793002915452,
"loss": 3.2578,
"step": 52100
},
{
"epoch": 15.195221445221446,
"grad_norm": 0.38300174474716187,
"learning_rate": 0.00041790437317784255,
"loss": 3.2492,
"step": 52150
},
{
"epoch": 15.20979020979021,
"grad_norm": 0.36747708916664124,
"learning_rate": 0.0004177294460641399,
"loss": 3.2537,
"step": 52200
},
{
"epoch": 15.224358974358974,
"grad_norm": 0.3879857659339905,
"learning_rate": 0.00041755451895043725,
"loss": 3.2632,
"step": 52250
},
{
"epoch": 15.238927738927739,
"grad_norm": 0.368695467710495,
"learning_rate": 0.0004173795918367346,
"loss": 3.2581,
"step": 52300
},
{
"epoch": 15.253496503496503,
"grad_norm": 0.3739892840385437,
"learning_rate": 0.000417204664723032,
"loss": 3.265,
"step": 52350
},
{
"epoch": 15.268065268065268,
"grad_norm": 0.36324694752693176,
"learning_rate": 0.00041702973760932943,
"loss": 3.2505,
"step": 52400
},
{
"epoch": 15.282634032634032,
"grad_norm": 0.39305976033210754,
"learning_rate": 0.0004168548104956268,
"loss": 3.2693,
"step": 52450
},
{
"epoch": 15.297202797202797,
"grad_norm": 0.377043217420578,
"learning_rate": 0.0004166798833819242,
"loss": 3.26,
"step": 52500
},
{
"epoch": 15.311771561771561,
"grad_norm": 0.3906829059123993,
"learning_rate": 0.00041650495626822156,
"loss": 3.2712,
"step": 52550
},
{
"epoch": 15.326340326340326,
"grad_norm": 0.4162210524082184,
"learning_rate": 0.00041633002915451893,
"loss": 3.2637,
"step": 52600
},
{
"epoch": 15.340909090909092,
"grad_norm": 0.4120563566684723,
"learning_rate": 0.00041615510204081626,
"loss": 3.2763,
"step": 52650
},
{
"epoch": 15.355477855477856,
"grad_norm": 0.3810875713825226,
"learning_rate": 0.00041598017492711363,
"loss": 3.2636,
"step": 52700
},
{
"epoch": 15.37004662004662,
"grad_norm": 0.3553137481212616,
"learning_rate": 0.000415805247813411,
"loss": 3.2651,
"step": 52750
},
{
"epoch": 15.384615384615385,
"grad_norm": 0.4119865894317627,
"learning_rate": 0.0004156303206997084,
"loss": 3.2804,
"step": 52800
},
{
"epoch": 15.39918414918415,
"grad_norm": 0.39863142371177673,
"learning_rate": 0.0004154553935860058,
"loss": 3.2859,
"step": 52850
},
{
"epoch": 15.413752913752914,
"grad_norm": 0.38426119089126587,
"learning_rate": 0.0004152804664723032,
"loss": 3.3041,
"step": 52900
},
{
"epoch": 15.428321678321678,
"grad_norm": 0.39725998044013977,
"learning_rate": 0.00041510553935860056,
"loss": 3.2784,
"step": 52950
},
{
"epoch": 15.442890442890443,
"grad_norm": 0.3563925325870514,
"learning_rate": 0.00041493061224489794,
"loss": 3.2845,
"step": 53000
},
{
"epoch": 15.442890442890443,
"eval_accuracy": 0.37165650955978446,
"eval_loss": 3.5524237155914307,
"eval_runtime": 180.1313,
"eval_samples_per_second": 92.388,
"eval_steps_per_second": 5.779,
"step": 53000
},
{
"epoch": 15.457459207459207,
"grad_norm": 0.4049256145954132,
"learning_rate": 0.0004147556851311953,
"loss": 3.291,
"step": 53050
},
{
"epoch": 15.472027972027972,
"grad_norm": 0.3597942292690277,
"learning_rate": 0.00041458075801749264,
"loss": 3.2731,
"step": 53100
},
{
"epoch": 15.486596736596736,
"grad_norm": 0.38720211386680603,
"learning_rate": 0.00041440583090379,
"loss": 3.2912,
"step": 53150
},
{
"epoch": 15.5011655011655,
"grad_norm": 0.3816832900047302,
"learning_rate": 0.0004142309037900874,
"loss": 3.2809,
"step": 53200
},
{
"epoch": 15.515734265734265,
"grad_norm": 0.3881129026412964,
"learning_rate": 0.0004140559766763848,
"loss": 3.292,
"step": 53250
},
{
"epoch": 15.530303030303031,
"grad_norm": 0.3714061677455902,
"learning_rate": 0.0004138810495626822,
"loss": 3.2879,
"step": 53300
},
{
"epoch": 15.544871794871796,
"grad_norm": 0.37834274768829346,
"learning_rate": 0.00041370612244897957,
"loss": 3.292,
"step": 53350
},
{
"epoch": 15.55944055944056,
"grad_norm": 0.392358660697937,
"learning_rate": 0.00041353119533527695,
"loss": 3.2852,
"step": 53400
},
{
"epoch": 15.574009324009324,
"grad_norm": 0.36605942249298096,
"learning_rate": 0.0004133562682215743,
"loss": 3.2933,
"step": 53450
},
{
"epoch": 15.588578088578089,
"grad_norm": 0.3728536069393158,
"learning_rate": 0.0004131813411078717,
"loss": 3.2953,
"step": 53500
},
{
"epoch": 15.603146853146853,
"grad_norm": 0.37290915846824646,
"learning_rate": 0.000413006413994169,
"loss": 3.3002,
"step": 53550
},
{
"epoch": 15.617715617715618,
"grad_norm": 0.3864482343196869,
"learning_rate": 0.0004128314868804664,
"loss": 3.2898,
"step": 53600
},
{
"epoch": 15.632284382284382,
"grad_norm": 0.3864837884902954,
"learning_rate": 0.00041265655976676377,
"loss": 3.3014,
"step": 53650
},
{
"epoch": 15.646853146853147,
"grad_norm": 0.372283011674881,
"learning_rate": 0.0004124816326530612,
"loss": 3.2943,
"step": 53700
},
{
"epoch": 15.661421911421911,
"grad_norm": 0.38414448499679565,
"learning_rate": 0.0004123067055393586,
"loss": 3.3033,
"step": 53750
},
{
"epoch": 15.675990675990676,
"grad_norm": 0.3930130898952484,
"learning_rate": 0.00041213177842565595,
"loss": 3.29,
"step": 53800
},
{
"epoch": 15.69055944055944,
"grad_norm": 0.3804117441177368,
"learning_rate": 0.00041195685131195333,
"loss": 3.2942,
"step": 53850
},
{
"epoch": 15.705128205128204,
"grad_norm": 0.3625752627849579,
"learning_rate": 0.0004117819241982507,
"loss": 3.3051,
"step": 53900
},
{
"epoch": 15.719696969696969,
"grad_norm": 0.3937501013278961,
"learning_rate": 0.0004116069970845481,
"loss": 3.2888,
"step": 53950
},
{
"epoch": 15.734265734265735,
"grad_norm": 0.3985256850719452,
"learning_rate": 0.0004114320699708454,
"loss": 3.2991,
"step": 54000
},
{
"epoch": 15.734265734265735,
"eval_accuracy": 0.37245154139280734,
"eval_loss": 3.542123317718506,
"eval_runtime": 180.2054,
"eval_samples_per_second": 92.35,
"eval_steps_per_second": 5.777,
"step": 54000
},
{
"epoch": 15.7488344988345,
"grad_norm": 0.38483700156211853,
"learning_rate": 0.0004112571428571428,
"loss": 3.2974,
"step": 54050
},
{
"epoch": 15.763403263403264,
"grad_norm": 0.3814025819301605,
"learning_rate": 0.00041108221574344015,
"loss": 3.2975,
"step": 54100
},
{
"epoch": 15.777972027972028,
"grad_norm": 0.3753068745136261,
"learning_rate": 0.0004109072886297376,
"loss": 3.2989,
"step": 54150
},
{
"epoch": 15.792540792540793,
"grad_norm": 0.3675210475921631,
"learning_rate": 0.00041073236151603496,
"loss": 3.3017,
"step": 54200
},
{
"epoch": 15.807109557109557,
"grad_norm": 0.38489532470703125,
"learning_rate": 0.00041055743440233234,
"loss": 3.3112,
"step": 54250
},
{
"epoch": 15.821678321678322,
"grad_norm": 0.38765832781791687,
"learning_rate": 0.0004103825072886297,
"loss": 3.3079,
"step": 54300
},
{
"epoch": 15.836247086247086,
"grad_norm": 0.37342846393585205,
"learning_rate": 0.0004102075801749271,
"loss": 3.3045,
"step": 54350
},
{
"epoch": 15.85081585081585,
"grad_norm": 0.37730568647384644,
"learning_rate": 0.00041003265306122446,
"loss": 3.3083,
"step": 54400
},
{
"epoch": 15.865384615384615,
"grad_norm": 0.4032607674598694,
"learning_rate": 0.0004098577259475218,
"loss": 3.3173,
"step": 54450
},
{
"epoch": 15.87995337995338,
"grad_norm": 0.37657999992370605,
"learning_rate": 0.00040968279883381916,
"loss": 3.3093,
"step": 54500
},
{
"epoch": 15.894522144522144,
"grad_norm": 0.35823720693588257,
"learning_rate": 0.0004095078717201166,
"loss": 3.3004,
"step": 54550
},
{
"epoch": 15.909090909090908,
"grad_norm": 0.36570194363594055,
"learning_rate": 0.00040933294460641397,
"loss": 3.3058,
"step": 54600
},
{
"epoch": 15.923659673659674,
"grad_norm": 0.39722785353660583,
"learning_rate": 0.00040915801749271134,
"loss": 3.2995,
"step": 54650
},
{
"epoch": 15.938228438228439,
"grad_norm": 0.40323764085769653,
"learning_rate": 0.0004089830903790087,
"loss": 3.3116,
"step": 54700
},
{
"epoch": 15.952797202797203,
"grad_norm": 0.3630395829677582,
"learning_rate": 0.0004088081632653061,
"loss": 3.3032,
"step": 54750
},
{
"epoch": 15.967365967365968,
"grad_norm": 0.39046773314476013,
"learning_rate": 0.00040863323615160347,
"loss": 3.3096,
"step": 54800
},
{
"epoch": 15.981934731934732,
"grad_norm": 0.4046758711338043,
"learning_rate": 0.00040845830903790085,
"loss": 3.3049,
"step": 54850
},
{
"epoch": 15.996503496503497,
"grad_norm": 0.38751789927482605,
"learning_rate": 0.00040828338192419817,
"loss": 3.3225,
"step": 54900
},
{
"epoch": 16.01107226107226,
"grad_norm": 0.38193270564079285,
"learning_rate": 0.00040810845481049554,
"loss": 3.2365,
"step": 54950
},
{
"epoch": 16.025641025641026,
"grad_norm": 0.36620691418647766,
"learning_rate": 0.00040793352769679297,
"loss": 3.21,
"step": 55000
},
{
"epoch": 16.025641025641026,
"eval_accuracy": 0.37201010528850803,
"eval_loss": 3.5520431995391846,
"eval_runtime": 180.173,
"eval_samples_per_second": 92.367,
"eval_steps_per_second": 5.778,
"step": 55000
},
{
"epoch": 16.04020979020979,
"grad_norm": 0.40240392088890076,
"learning_rate": 0.00040775860058309035,
"loss": 3.2147,
"step": 55050
},
{
"epoch": 16.054778554778554,
"grad_norm": 0.38166630268096924,
"learning_rate": 0.0004075836734693877,
"loss": 3.2147,
"step": 55100
},
{
"epoch": 16.06934731934732,
"grad_norm": 0.3995780348777771,
"learning_rate": 0.0004074087463556851,
"loss": 3.212,
"step": 55150
},
{
"epoch": 16.083916083916083,
"grad_norm": 0.39974555373191833,
"learning_rate": 0.0004072338192419825,
"loss": 3.2148,
"step": 55200
},
{
"epoch": 16.098484848484848,
"grad_norm": 0.4011850655078888,
"learning_rate": 0.00040705889212827985,
"loss": 3.2189,
"step": 55250
},
{
"epoch": 16.113053613053612,
"grad_norm": 0.38059523701667786,
"learning_rate": 0.00040688396501457723,
"loss": 3.2259,
"step": 55300
},
{
"epoch": 16.127622377622377,
"grad_norm": 0.4111097455024719,
"learning_rate": 0.00040670903790087455,
"loss": 3.2338,
"step": 55350
},
{
"epoch": 16.14219114219114,
"grad_norm": 0.4073127210140228,
"learning_rate": 0.0004065341107871719,
"loss": 3.2335,
"step": 55400
},
{
"epoch": 16.156759906759905,
"grad_norm": 0.35866570472717285,
"learning_rate": 0.00040635918367346935,
"loss": 3.2314,
"step": 55450
},
{
"epoch": 16.17132867132867,
"grad_norm": 0.3810841739177704,
"learning_rate": 0.00040618425655976673,
"loss": 3.2441,
"step": 55500
},
{
"epoch": 16.185897435897434,
"grad_norm": 0.3563622832298279,
"learning_rate": 0.0004060093294460641,
"loss": 3.238,
"step": 55550
},
{
"epoch": 16.2004662004662,
"grad_norm": 0.3715536594390869,
"learning_rate": 0.0004058344023323615,
"loss": 3.242,
"step": 55600
},
{
"epoch": 16.215034965034967,
"grad_norm": 0.386802077293396,
"learning_rate": 0.00040565947521865886,
"loss": 3.2535,
"step": 55650
},
{
"epoch": 16.22960372960373,
"grad_norm": 0.3603656589984894,
"learning_rate": 0.00040548454810495623,
"loss": 3.2459,
"step": 55700
},
{
"epoch": 16.244172494172496,
"grad_norm": 0.36066412925720215,
"learning_rate": 0.0004053096209912536,
"loss": 3.2382,
"step": 55750
},
{
"epoch": 16.25874125874126,
"grad_norm": 0.40658852458000183,
"learning_rate": 0.00040513469387755093,
"loss": 3.2406,
"step": 55800
},
{
"epoch": 16.273310023310025,
"grad_norm": 0.3678629398345947,
"learning_rate": 0.00040495976676384836,
"loss": 3.2542,
"step": 55850
},
{
"epoch": 16.28787878787879,
"grad_norm": 0.3898155093193054,
"learning_rate": 0.00040478483965014574,
"loss": 3.256,
"step": 55900
},
{
"epoch": 16.302447552447553,
"grad_norm": 0.3732418417930603,
"learning_rate": 0.0004046099125364431,
"loss": 3.2631,
"step": 55950
},
{
"epoch": 16.317016317016318,
"grad_norm": 0.35981082916259766,
"learning_rate": 0.0004044349854227405,
"loss": 3.2507,
"step": 56000
},
{
"epoch": 16.317016317016318,
"eval_accuracy": 0.3716827323225066,
"eval_loss": 3.5528886318206787,
"eval_runtime": 180.4051,
"eval_samples_per_second": 92.248,
"eval_steps_per_second": 5.77,
"step": 56000
},
{
"epoch": 16.331585081585082,
"grad_norm": 0.3968351483345032,
"learning_rate": 0.00040426005830903786,
"loss": 3.2583,
"step": 56050
},
{
"epoch": 16.346153846153847,
"grad_norm": 0.3820105791091919,
"learning_rate": 0.00040408513119533524,
"loss": 3.2505,
"step": 56100
},
{
"epoch": 16.36072261072261,
"grad_norm": 0.36826398968696594,
"learning_rate": 0.0004039102040816326,
"loss": 3.2555,
"step": 56150
},
{
"epoch": 16.375291375291376,
"grad_norm": 0.42605534195899963,
"learning_rate": 0.00040373527696793005,
"loss": 3.2613,
"step": 56200
},
{
"epoch": 16.38986013986014,
"grad_norm": 0.38200804591178894,
"learning_rate": 0.0004035603498542273,
"loss": 3.2553,
"step": 56250
},
{
"epoch": 16.404428904428904,
"grad_norm": 0.3612186908721924,
"learning_rate": 0.00040338542274052474,
"loss": 3.2572,
"step": 56300
},
{
"epoch": 16.41899766899767,
"grad_norm": 0.3683592677116394,
"learning_rate": 0.0004032104956268221,
"loss": 3.2684,
"step": 56350
},
{
"epoch": 16.433566433566433,
"grad_norm": 0.3807704448699951,
"learning_rate": 0.0004030355685131195,
"loss": 3.2704,
"step": 56400
},
{
"epoch": 16.448135198135198,
"grad_norm": 0.3632655739784241,
"learning_rate": 0.00040286064139941687,
"loss": 3.2699,
"step": 56450
},
{
"epoch": 16.462703962703962,
"grad_norm": 0.37278565764427185,
"learning_rate": 0.00040268571428571425,
"loss": 3.2681,
"step": 56500
},
{
"epoch": 16.477272727272727,
"grad_norm": 0.3842940628528595,
"learning_rate": 0.0004025107871720116,
"loss": 3.2758,
"step": 56550
},
{
"epoch": 16.49184149184149,
"grad_norm": 0.3898995518684387,
"learning_rate": 0.000402335860058309,
"loss": 3.275,
"step": 56600
},
{
"epoch": 16.506410256410255,
"grad_norm": 0.378342866897583,
"learning_rate": 0.00040216093294460643,
"loss": 3.2697,
"step": 56650
},
{
"epoch": 16.52097902097902,
"grad_norm": 0.40605059266090393,
"learning_rate": 0.00040198600583090375,
"loss": 3.2796,
"step": 56700
},
{
"epoch": 16.535547785547784,
"grad_norm": 0.40337276458740234,
"learning_rate": 0.0004018110787172011,
"loss": 3.2796,
"step": 56750
},
{
"epoch": 16.55011655011655,
"grad_norm": 0.3659421503543854,
"learning_rate": 0.0004016361516034985,
"loss": 3.2631,
"step": 56800
},
{
"epoch": 16.564685314685313,
"grad_norm": 0.3794923424720764,
"learning_rate": 0.0004014612244897959,
"loss": 3.2902,
"step": 56850
},
{
"epoch": 16.579254079254078,
"grad_norm": 0.38088348507881165,
"learning_rate": 0.00040128629737609325,
"loss": 3.2792,
"step": 56900
},
{
"epoch": 16.593822843822842,
"grad_norm": 0.3972194194793701,
"learning_rate": 0.00040111137026239063,
"loss": 3.2859,
"step": 56950
},
{
"epoch": 16.60839160839161,
"grad_norm": 0.39679381251335144,
"learning_rate": 0.000400936443148688,
"loss": 3.2887,
"step": 57000
},
{
"epoch": 16.60839160839161,
"eval_accuracy": 0.3726747288531057,
"eval_loss": 3.5446503162384033,
"eval_runtime": 180.0719,
"eval_samples_per_second": 92.419,
"eval_steps_per_second": 5.781,
"step": 57000
},
{
"epoch": 16.622960372960375,
"grad_norm": 0.38704535365104675,
"learning_rate": 0.00040076151603498543,
"loss": 3.2904,
"step": 57050
},
{
"epoch": 16.63752913752914,
"grad_norm": 0.446439653635025,
"learning_rate": 0.0004005865889212828,
"loss": 3.2839,
"step": 57100
},
{
"epoch": 16.652097902097903,
"grad_norm": 0.3731631636619568,
"learning_rate": 0.00040041166180758013,
"loss": 3.2772,
"step": 57150
},
{
"epoch": 16.666666666666668,
"grad_norm": 0.39002761244773865,
"learning_rate": 0.0004002367346938775,
"loss": 3.2918,
"step": 57200
},
{
"epoch": 16.681235431235432,
"grad_norm": 0.41322073340415955,
"learning_rate": 0.0004000618075801749,
"loss": 3.275,
"step": 57250
},
{
"epoch": 16.695804195804197,
"grad_norm": 0.3777833878993988,
"learning_rate": 0.00039988688046647226,
"loss": 3.2789,
"step": 57300
},
{
"epoch": 16.71037296037296,
"grad_norm": 0.402190238237381,
"learning_rate": 0.00039971195335276963,
"loss": 3.2938,
"step": 57350
},
{
"epoch": 16.724941724941726,
"grad_norm": 0.37010467052459717,
"learning_rate": 0.000399537026239067,
"loss": 3.2931,
"step": 57400
},
{
"epoch": 16.73951048951049,
"grad_norm": 0.39115190505981445,
"learning_rate": 0.0003993620991253644,
"loss": 3.2819,
"step": 57450
},
{
"epoch": 16.754079254079254,
"grad_norm": 0.38583433628082275,
"learning_rate": 0.0003991871720116618,
"loss": 3.2987,
"step": 57500
},
{
"epoch": 16.76864801864802,
"grad_norm": 0.38979053497314453,
"learning_rate": 0.0003990122448979592,
"loss": 3.2919,
"step": 57550
},
{
"epoch": 16.783216783216783,
"grad_norm": 0.37497058510780334,
"learning_rate": 0.0003988373177842565,
"loss": 3.2887,
"step": 57600
},
{
"epoch": 16.797785547785548,
"grad_norm": 0.381260484457016,
"learning_rate": 0.0003986623906705539,
"loss": 3.2818,
"step": 57650
},
{
"epoch": 16.812354312354312,
"grad_norm": 0.38522014021873474,
"learning_rate": 0.00039848746355685127,
"loss": 3.2875,
"step": 57700
},
{
"epoch": 16.826923076923077,
"grad_norm": 0.3929021656513214,
"learning_rate": 0.00039831253644314864,
"loss": 3.2837,
"step": 57750
},
{
"epoch": 16.84149184149184,
"grad_norm": 0.38312867283821106,
"learning_rate": 0.000398137609329446,
"loss": 3.2974,
"step": 57800
},
{
"epoch": 16.856060606060606,
"grad_norm": 0.36243414878845215,
"learning_rate": 0.0003979626822157434,
"loss": 3.2953,
"step": 57850
},
{
"epoch": 16.87062937062937,
"grad_norm": 0.4058583378791809,
"learning_rate": 0.00039778775510204077,
"loss": 3.288,
"step": 57900
},
{
"epoch": 16.885198135198134,
"grad_norm": 0.3815045952796936,
"learning_rate": 0.0003976128279883382,
"loss": 3.292,
"step": 57950
},
{
"epoch": 16.8997668997669,
"grad_norm": 0.41494324803352356,
"learning_rate": 0.0003974379008746356,
"loss": 3.2842,
"step": 58000
},
{
"epoch": 16.8997668997669,
"eval_accuracy": 0.3731525005344505,
"eval_loss": 3.5340723991394043,
"eval_runtime": 180.0789,
"eval_samples_per_second": 92.415,
"eval_steps_per_second": 5.781,
"step": 58000
},
{
"epoch": 16.914335664335663,
"grad_norm": 0.3929421901702881,
"learning_rate": 0.0003972629737609329,
"loss": 3.2962,
"step": 58050
},
{
"epoch": 16.928904428904428,
"grad_norm": 0.3763855993747711,
"learning_rate": 0.00039708804664723027,
"loss": 3.2997,
"step": 58100
},
{
"epoch": 16.943473193473192,
"grad_norm": 0.39408525824546814,
"learning_rate": 0.00039691311953352765,
"loss": 3.29,
"step": 58150
},
{
"epoch": 16.958041958041957,
"grad_norm": 0.3723476827144623,
"learning_rate": 0.000396738192419825,
"loss": 3.2971,
"step": 58200
},
{
"epoch": 16.97261072261072,
"grad_norm": 0.4028165340423584,
"learning_rate": 0.0003965632653061224,
"loss": 3.3058,
"step": 58250
},
{
"epoch": 16.98717948717949,
"grad_norm": 0.3577312231063843,
"learning_rate": 0.0003963883381924198,
"loss": 3.297,
"step": 58300
},
{
"epoch": 17.001748251748253,
"grad_norm": 0.417402446269989,
"learning_rate": 0.0003962134110787172,
"loss": 3.284,
"step": 58350
},
{
"epoch": 17.016317016317018,
"grad_norm": 0.3560294806957245,
"learning_rate": 0.0003960384839650146,
"loss": 3.195,
"step": 58400
},
{
"epoch": 17.030885780885782,
"grad_norm": 0.37743493914604187,
"learning_rate": 0.00039586355685131196,
"loss": 3.1971,
"step": 58450
},
{
"epoch": 17.045454545454547,
"grad_norm": 0.4051961302757263,
"learning_rate": 0.0003956886297376093,
"loss": 3.207,
"step": 58500
},
{
"epoch": 17.06002331002331,
"grad_norm": 0.36903810501098633,
"learning_rate": 0.00039551370262390665,
"loss": 3.2031,
"step": 58550
},
{
"epoch": 17.074592074592076,
"grad_norm": 0.3800497055053711,
"learning_rate": 0.00039533877551020403,
"loss": 3.2032,
"step": 58600
},
{
"epoch": 17.08916083916084,
"grad_norm": 0.381396621465683,
"learning_rate": 0.0003951638483965014,
"loss": 3.2163,
"step": 58650
},
{
"epoch": 17.103729603729604,
"grad_norm": 0.3686426877975464,
"learning_rate": 0.0003949889212827988,
"loss": 3.209,
"step": 58700
},
{
"epoch": 17.11829836829837,
"grad_norm": 0.39614012837409973,
"learning_rate": 0.00039481399416909616,
"loss": 3.2147,
"step": 58750
},
{
"epoch": 17.132867132867133,
"grad_norm": 0.4042656719684601,
"learning_rate": 0.0003946390670553936,
"loss": 3.2183,
"step": 58800
},
{
"epoch": 17.147435897435898,
"grad_norm": 0.387961208820343,
"learning_rate": 0.00039446413994169096,
"loss": 3.225,
"step": 58850
},
{
"epoch": 17.162004662004662,
"grad_norm": 0.3967694342136383,
"learning_rate": 0.00039428921282798834,
"loss": 3.2239,
"step": 58900
},
{
"epoch": 17.176573426573427,
"grad_norm": 0.3902028799057007,
"learning_rate": 0.00039411428571428566,
"loss": 3.2237,
"step": 58950
},
{
"epoch": 17.19114219114219,
"grad_norm": 0.4269110858440399,
"learning_rate": 0.00039393935860058304,
"loss": 3.2203,
"step": 59000
},
{
"epoch": 17.19114219114219,
"eval_accuracy": 0.3721861388122523,
"eval_loss": 3.5538175106048584,
"eval_runtime": 180.0807,
"eval_samples_per_second": 92.414,
"eval_steps_per_second": 5.781,
"step": 59000
},
{
"epoch": 17.205710955710956,
"grad_norm": 0.4304862916469574,
"learning_rate": 0.0003937644314868804,
"loss": 3.2185,
"step": 59050
},
{
"epoch": 17.22027972027972,
"grad_norm": 0.4079671800136566,
"learning_rate": 0.0003935895043731778,
"loss": 3.2219,
"step": 59100
},
{
"epoch": 17.234848484848484,
"grad_norm": 0.3957606554031372,
"learning_rate": 0.00039341457725947516,
"loss": 3.2282,
"step": 59150
},
{
"epoch": 17.24941724941725,
"grad_norm": 0.40197688341140747,
"learning_rate": 0.0003932396501457726,
"loss": 3.2433,
"step": 59200
},
{
"epoch": 17.263986013986013,
"grad_norm": 0.36207619309425354,
"learning_rate": 0.00039306472303206997,
"loss": 3.2276,
"step": 59250
},
{
"epoch": 17.278554778554778,
"grad_norm": 0.39968717098236084,
"learning_rate": 0.00039288979591836734,
"loss": 3.2461,
"step": 59300
},
{
"epoch": 17.293123543123542,
"grad_norm": 0.435290664434433,
"learning_rate": 0.0003927148688046647,
"loss": 3.2376,
"step": 59350
},
{
"epoch": 17.307692307692307,
"grad_norm": 0.3796147108078003,
"learning_rate": 0.00039253994169096204,
"loss": 3.2374,
"step": 59400
},
{
"epoch": 17.32226107226107,
"grad_norm": 0.41960448026657104,
"learning_rate": 0.0003923650145772594,
"loss": 3.2476,
"step": 59450
},
{
"epoch": 17.336829836829835,
"grad_norm": 0.39346998929977417,
"learning_rate": 0.0003921900874635568,
"loss": 3.2321,
"step": 59500
},
{
"epoch": 17.3513986013986,
"grad_norm": 0.4165276288986206,
"learning_rate": 0.00039201516034985417,
"loss": 3.2445,
"step": 59550
},
{
"epoch": 17.365967365967364,
"grad_norm": 0.4580809772014618,
"learning_rate": 0.00039184023323615155,
"loss": 3.2556,
"step": 59600
},
{
"epoch": 17.38053613053613,
"grad_norm": 0.3814791738986969,
"learning_rate": 0.000391665306122449,
"loss": 3.2478,
"step": 59650
},
{
"epoch": 17.395104895104897,
"grad_norm": 0.3738664388656616,
"learning_rate": 0.00039149037900874635,
"loss": 3.2556,
"step": 59700
},
{
"epoch": 17.40967365967366,
"grad_norm": 0.3924182951450348,
"learning_rate": 0.0003913154518950437,
"loss": 3.2628,
"step": 59750
},
{
"epoch": 17.424242424242426,
"grad_norm": 0.40077292919158936,
"learning_rate": 0.0003911405247813411,
"loss": 3.2548,
"step": 59800
},
{
"epoch": 17.43881118881119,
"grad_norm": 0.3633650243282318,
"learning_rate": 0.0003909655976676384,
"loss": 3.2568,
"step": 59850
},
{
"epoch": 17.453379953379955,
"grad_norm": 0.3840297758579254,
"learning_rate": 0.0003907906705539358,
"loss": 3.2474,
"step": 59900
},
{
"epoch": 17.46794871794872,
"grad_norm": 0.3745962083339691,
"learning_rate": 0.0003906157434402332,
"loss": 3.2496,
"step": 59950
},
{
"epoch": 17.482517482517483,
"grad_norm": 0.3834511935710907,
"learning_rate": 0.00039044081632653055,
"loss": 3.2473,
"step": 60000
},
{
"epoch": 17.482517482517483,
"eval_accuracy": 0.3724052105923028,
"eval_loss": 3.548959732055664,
"eval_runtime": 180.0523,
"eval_samples_per_second": 92.429,
"eval_steps_per_second": 5.782,
"step": 60000
},
{
"epoch": 17.497086247086248,
"grad_norm": 0.3921569287776947,
"learning_rate": 0.00039026588921282793,
"loss": 3.2826,
"step": 60050
},
{
"epoch": 17.511655011655012,
"grad_norm": 0.39372700452804565,
"learning_rate": 0.00039009096209912536,
"loss": 3.259,
"step": 60100
},
{
"epoch": 17.526223776223777,
"grad_norm": 0.4190043807029724,
"learning_rate": 0.00038991603498542273,
"loss": 3.2739,
"step": 60150
},
{
"epoch": 17.54079254079254,
"grad_norm": 0.39327865839004517,
"learning_rate": 0.0003897411078717201,
"loss": 3.2594,
"step": 60200
},
{
"epoch": 17.555361305361306,
"grad_norm": 0.36699846386909485,
"learning_rate": 0.0003895661807580175,
"loss": 3.2714,
"step": 60250
},
{
"epoch": 17.56993006993007,
"grad_norm": 0.4013952314853668,
"learning_rate": 0.0003893912536443148,
"loss": 3.2694,
"step": 60300
},
{
"epoch": 17.584498834498834,
"grad_norm": 0.38588792085647583,
"learning_rate": 0.0003892163265306122,
"loss": 3.2683,
"step": 60350
},
{
"epoch": 17.5990675990676,
"grad_norm": 0.36934489011764526,
"learning_rate": 0.00038904139941690956,
"loss": 3.271,
"step": 60400
},
{
"epoch": 17.613636363636363,
"grad_norm": 0.3960769772529602,
"learning_rate": 0.00038886647230320693,
"loss": 3.2859,
"step": 60450
},
{
"epoch": 17.628205128205128,
"grad_norm": 0.37103790044784546,
"learning_rate": 0.00038869154518950436,
"loss": 3.2722,
"step": 60500
},
{
"epoch": 17.642773892773892,
"grad_norm": 0.4212398827075958,
"learning_rate": 0.00038851661807580174,
"loss": 3.2787,
"step": 60550
},
{
"epoch": 17.657342657342657,
"grad_norm": 0.3627692461013794,
"learning_rate": 0.0003883416909620991,
"loss": 3.2775,
"step": 60600
},
{
"epoch": 17.67191142191142,
"grad_norm": 0.3897765576839447,
"learning_rate": 0.0003881667638483965,
"loss": 3.2723,
"step": 60650
},
{
"epoch": 17.686480186480185,
"grad_norm": 0.38437172770500183,
"learning_rate": 0.00038799183673469387,
"loss": 3.2617,
"step": 60700
},
{
"epoch": 17.70104895104895,
"grad_norm": 0.3970353305339813,
"learning_rate": 0.0003878169096209912,
"loss": 3.2641,
"step": 60750
},
{
"epoch": 17.715617715617714,
"grad_norm": 0.38862720131874084,
"learning_rate": 0.00038764198250728856,
"loss": 3.2757,
"step": 60800
},
{
"epoch": 17.73018648018648,
"grad_norm": 0.3624350130558014,
"learning_rate": 0.00038746705539358594,
"loss": 3.2727,
"step": 60850
},
{
"epoch": 17.744755244755243,
"grad_norm": 0.38513702154159546,
"learning_rate": 0.0003872921282798833,
"loss": 3.2819,
"step": 60900
},
{
"epoch": 17.759324009324008,
"grad_norm": 0.3679780066013336,
"learning_rate": 0.00038711720116618075,
"loss": 3.2728,
"step": 60950
},
{
"epoch": 17.773892773892776,
"grad_norm": 0.38644713163375854,
"learning_rate": 0.0003869422740524781,
"loss": 3.2853,
"step": 61000
},
{
"epoch": 17.773892773892776,
"eval_accuracy": 0.37325527494978283,
"eval_loss": 3.5380146503448486,
"eval_runtime": 180.1126,
"eval_samples_per_second": 92.398,
"eval_steps_per_second": 5.78,
"step": 61000
},
{
"epoch": 17.78846153846154,
"grad_norm": 0.4223131537437439,
"learning_rate": 0.0003867673469387755,
"loss": 3.266,
"step": 61050
},
{
"epoch": 17.803030303030305,
"grad_norm": 0.3889198899269104,
"learning_rate": 0.0003865924198250729,
"loss": 3.2749,
"step": 61100
},
{
"epoch": 17.81759906759907,
"grad_norm": 0.37233766913414,
"learning_rate": 0.00038641749271137025,
"loss": 3.2852,
"step": 61150
},
{
"epoch": 17.832167832167833,
"grad_norm": 0.37709736824035645,
"learning_rate": 0.00038624256559766757,
"loss": 3.2859,
"step": 61200
},
{
"epoch": 17.846736596736598,
"grad_norm": 0.390927255153656,
"learning_rate": 0.00038606763848396495,
"loss": 3.2781,
"step": 61250
},
{
"epoch": 17.861305361305362,
"grad_norm": 0.36890149116516113,
"learning_rate": 0.0003858927113702623,
"loss": 3.2759,
"step": 61300
},
{
"epoch": 17.875874125874127,
"grad_norm": 0.36303025484085083,
"learning_rate": 0.00038571778425655975,
"loss": 3.2851,
"step": 61350
},
{
"epoch": 17.89044289044289,
"grad_norm": 0.43360111117362976,
"learning_rate": 0.00038554285714285713,
"loss": 3.2648,
"step": 61400
},
{
"epoch": 17.905011655011656,
"grad_norm": 0.40794098377227783,
"learning_rate": 0.0003853679300291545,
"loss": 3.2768,
"step": 61450
},
{
"epoch": 17.91958041958042,
"grad_norm": 0.4121857285499573,
"learning_rate": 0.0003851930029154519,
"loss": 3.2776,
"step": 61500
},
{
"epoch": 17.934149184149184,
"grad_norm": 0.40042850375175476,
"learning_rate": 0.00038501807580174926,
"loss": 3.2776,
"step": 61550
},
{
"epoch": 17.94871794871795,
"grad_norm": 0.38233354687690735,
"learning_rate": 0.00038484314868804663,
"loss": 3.2909,
"step": 61600
},
{
"epoch": 17.963286713286713,
"grad_norm": 0.3981986939907074,
"learning_rate": 0.00038466822157434395,
"loss": 3.2791,
"step": 61650
},
{
"epoch": 17.977855477855478,
"grad_norm": 0.38394173979759216,
"learning_rate": 0.00038449329446064133,
"loss": 3.2932,
"step": 61700
},
{
"epoch": 17.992424242424242,
"grad_norm": 0.3739701211452484,
"learning_rate": 0.0003843183673469387,
"loss": 3.3018,
"step": 61750
},
{
"epoch": 18.006993006993007,
"grad_norm": 0.4334484338760376,
"learning_rate": 0.00038414344023323613,
"loss": 3.2268,
"step": 61800
},
{
"epoch": 18.02156177156177,
"grad_norm": 0.42861032485961914,
"learning_rate": 0.0003839685131195335,
"loss": 3.1798,
"step": 61850
},
{
"epoch": 18.036130536130536,
"grad_norm": 0.41923579573631287,
"learning_rate": 0.0003837935860058309,
"loss": 3.1827,
"step": 61900
},
{
"epoch": 18.0506993006993,
"grad_norm": 0.40670716762542725,
"learning_rate": 0.00038361865889212826,
"loss": 3.193,
"step": 61950
},
{
"epoch": 18.065268065268064,
"grad_norm": 0.3825482428073883,
"learning_rate": 0.00038344373177842564,
"loss": 3.1926,
"step": 62000
},
{
"epoch": 18.065268065268064,
"eval_accuracy": 0.37260476228889206,
"eval_loss": 3.55277419090271,
"eval_runtime": 180.0064,
"eval_samples_per_second": 92.452,
"eval_steps_per_second": 5.783,
"step": 62000
},
{
"epoch": 18.07983682983683,
"grad_norm": 0.4042184352874756,
"learning_rate": 0.000383268804664723,
"loss": 3.2012,
"step": 62050
},
{
"epoch": 18.094405594405593,
"grad_norm": 0.3851967453956604,
"learning_rate": 0.00038309387755102034,
"loss": 3.1931,
"step": 62100
},
{
"epoch": 18.108974358974358,
"grad_norm": 0.4039771556854248,
"learning_rate": 0.0003829189504373177,
"loss": 3.2051,
"step": 62150
},
{
"epoch": 18.123543123543122,
"grad_norm": 0.39950627088546753,
"learning_rate": 0.0003827440233236151,
"loss": 3.2027,
"step": 62200
},
{
"epoch": 18.138111888111887,
"grad_norm": 0.44474369287490845,
"learning_rate": 0.0003825690962099125,
"loss": 3.2036,
"step": 62250
},
{
"epoch": 18.15268065268065,
"grad_norm": 0.38847821950912476,
"learning_rate": 0.0003823941690962099,
"loss": 3.2128,
"step": 62300
},
{
"epoch": 18.16724941724942,
"grad_norm": 0.3725135922431946,
"learning_rate": 0.00038221924198250727,
"loss": 3.2201,
"step": 62350
},
{
"epoch": 18.181818181818183,
"grad_norm": 0.4199862480163574,
"learning_rate": 0.00038204431486880464,
"loss": 3.2117,
"step": 62400
},
{
"epoch": 18.196386946386948,
"grad_norm": 0.41751933097839355,
"learning_rate": 0.000381869387755102,
"loss": 3.2126,
"step": 62450
},
{
"epoch": 18.210955710955712,
"grad_norm": 0.404287725687027,
"learning_rate": 0.0003816944606413994,
"loss": 3.225,
"step": 62500
},
{
"epoch": 18.225524475524477,
"grad_norm": 0.4140659272670746,
"learning_rate": 0.0003815195335276967,
"loss": 3.2262,
"step": 62550
},
{
"epoch": 18.24009324009324,
"grad_norm": 0.4209405779838562,
"learning_rate": 0.0003813446064139941,
"loss": 3.2267,
"step": 62600
},
{
"epoch": 18.254662004662006,
"grad_norm": 0.39384543895721436,
"learning_rate": 0.0003811696793002915,
"loss": 3.2191,
"step": 62650
},
{
"epoch": 18.26923076923077,
"grad_norm": 0.3749587833881378,
"learning_rate": 0.0003809947521865889,
"loss": 3.2332,
"step": 62700
},
{
"epoch": 18.283799533799534,
"grad_norm": 0.4175374507904053,
"learning_rate": 0.0003808198250728863,
"loss": 3.2345,
"step": 62750
},
{
"epoch": 18.2983682983683,
"grad_norm": 0.4039442837238312,
"learning_rate": 0.00038064489795918365,
"loss": 3.2273,
"step": 62800
},
{
"epoch": 18.312937062937063,
"grad_norm": 0.4071449935436249,
"learning_rate": 0.000380469970845481,
"loss": 3.2226,
"step": 62850
},
{
"epoch": 18.327505827505828,
"grad_norm": 0.43547382950782776,
"learning_rate": 0.0003802950437317784,
"loss": 3.2412,
"step": 62900
},
{
"epoch": 18.342074592074592,
"grad_norm": 0.4017612040042877,
"learning_rate": 0.0003801201166180758,
"loss": 3.2346,
"step": 62950
},
{
"epoch": 18.356643356643357,
"grad_norm": 0.38543474674224854,
"learning_rate": 0.0003799451895043731,
"loss": 3.2341,
"step": 63000
},
{
"epoch": 18.356643356643357,
"eval_accuracy": 0.37304784466529056,
"eval_loss": 3.545969009399414,
"eval_runtime": 180.0148,
"eval_samples_per_second": 92.448,
"eval_steps_per_second": 5.783,
"step": 63000
},
{
"epoch": 18.37121212121212,
"grad_norm": 0.3899354934692383,
"learning_rate": 0.0003797702623906705,
"loss": 3.2418,
"step": 63050
},
{
"epoch": 18.385780885780886,
"grad_norm": 0.38112348318099976,
"learning_rate": 0.0003795953352769679,
"loss": 3.2467,
"step": 63100
},
{
"epoch": 18.40034965034965,
"grad_norm": 0.4406619966030121,
"learning_rate": 0.0003794204081632653,
"loss": 3.243,
"step": 63150
},
{
"epoch": 18.414918414918414,
"grad_norm": 0.3870476484298706,
"learning_rate": 0.00037924548104956266,
"loss": 3.2393,
"step": 63200
},
{
"epoch": 18.42948717948718,
"grad_norm": 0.3903353214263916,
"learning_rate": 0.00037907055393586003,
"loss": 3.2434,
"step": 63250
},
{
"epoch": 18.444055944055943,
"grad_norm": 0.3915194272994995,
"learning_rate": 0.0003788956268221574,
"loss": 3.248,
"step": 63300
},
{
"epoch": 18.458624708624708,
"grad_norm": 0.442837655544281,
"learning_rate": 0.0003787206997084548,
"loss": 3.2541,
"step": 63350
},
{
"epoch": 18.473193473193472,
"grad_norm": 0.38805052638053894,
"learning_rate": 0.00037854577259475216,
"loss": 3.2386,
"step": 63400
},
{
"epoch": 18.487762237762237,
"grad_norm": 0.4246932566165924,
"learning_rate": 0.0003783708454810495,
"loss": 3.2384,
"step": 63450
},
{
"epoch": 18.502331002331,
"grad_norm": 0.3854350745677948,
"learning_rate": 0.00037819591836734686,
"loss": 3.2573,
"step": 63500
},
{
"epoch": 18.516899766899765,
"grad_norm": 0.4202209413051605,
"learning_rate": 0.0003780209912536443,
"loss": 3.2449,
"step": 63550
},
{
"epoch": 18.53146853146853,
"grad_norm": 0.43968549370765686,
"learning_rate": 0.00037784606413994166,
"loss": 3.2394,
"step": 63600
},
{
"epoch": 18.546037296037294,
"grad_norm": 0.4122573137283325,
"learning_rate": 0.00037767113702623904,
"loss": 3.2401,
"step": 63650
},
{
"epoch": 18.560606060606062,
"grad_norm": 0.39152082800865173,
"learning_rate": 0.0003774962099125364,
"loss": 3.2445,
"step": 63700
},
{
"epoch": 18.575174825174827,
"grad_norm": 0.38848137855529785,
"learning_rate": 0.0003773212827988338,
"loss": 3.262,
"step": 63750
},
{
"epoch": 18.58974358974359,
"grad_norm": 0.4093928635120392,
"learning_rate": 0.00037714635568513117,
"loss": 3.2593,
"step": 63800
},
{
"epoch": 18.604312354312356,
"grad_norm": 0.4007672667503357,
"learning_rate": 0.00037697142857142854,
"loss": 3.2601,
"step": 63850
},
{
"epoch": 18.61888111888112,
"grad_norm": 0.3751702606678009,
"learning_rate": 0.00037679650145772586,
"loss": 3.2469,
"step": 63900
},
{
"epoch": 18.633449883449885,
"grad_norm": 0.40671372413635254,
"learning_rate": 0.0003766215743440233,
"loss": 3.2575,
"step": 63950
},
{
"epoch": 18.64801864801865,
"grad_norm": 0.40447014570236206,
"learning_rate": 0.00037644664723032067,
"loss": 3.2576,
"step": 64000
},
{
"epoch": 18.64801864801865,
"eval_accuracy": 0.3732128246477977,
"eval_loss": 3.5389418601989746,
"eval_runtime": 180.229,
"eval_samples_per_second": 92.338,
"eval_steps_per_second": 5.776,
"step": 64000
},
{
"epoch": 18.662587412587413,
"grad_norm": 0.406858891248703,
"learning_rate": 0.00037627172011661805,
"loss": 3.2618,
"step": 64050
},
{
"epoch": 18.677156177156178,
"grad_norm": 0.39155733585357666,
"learning_rate": 0.0003760967930029154,
"loss": 3.2583,
"step": 64100
},
{
"epoch": 18.691724941724942,
"grad_norm": 0.4028909504413605,
"learning_rate": 0.0003759218658892128,
"loss": 3.2607,
"step": 64150
},
{
"epoch": 18.706293706293707,
"grad_norm": 0.3763161599636078,
"learning_rate": 0.00037574693877551017,
"loss": 3.2557,
"step": 64200
},
{
"epoch": 18.72086247086247,
"grad_norm": 0.3901912569999695,
"learning_rate": 0.00037557201166180755,
"loss": 3.2591,
"step": 64250
},
{
"epoch": 18.735431235431236,
"grad_norm": 0.3856413662433624,
"learning_rate": 0.000375397084548105,
"loss": 3.2764,
"step": 64300
},
{
"epoch": 18.75,
"grad_norm": 0.38662955164909363,
"learning_rate": 0.00037522215743440225,
"loss": 3.2628,
"step": 64350
},
{
"epoch": 18.764568764568764,
"grad_norm": 0.3822769820690155,
"learning_rate": 0.0003750472303206997,
"loss": 3.2645,
"step": 64400
},
{
"epoch": 18.77913752913753,
"grad_norm": 0.40630531311035156,
"learning_rate": 0.00037487230320699705,
"loss": 3.2667,
"step": 64450
},
{
"epoch": 18.793706293706293,
"grad_norm": 0.40045738220214844,
"learning_rate": 0.00037469737609329443,
"loss": 3.2628,
"step": 64500
},
{
"epoch": 18.808275058275058,
"grad_norm": 0.38867759704589844,
"learning_rate": 0.0003745224489795918,
"loss": 3.2687,
"step": 64550
},
{
"epoch": 18.822843822843822,
"grad_norm": 0.4422551393508911,
"learning_rate": 0.0003743475218658892,
"loss": 3.275,
"step": 64600
},
{
"epoch": 18.837412587412587,
"grad_norm": 0.39280012249946594,
"learning_rate": 0.00037417259475218655,
"loss": 3.2652,
"step": 64650
},
{
"epoch": 18.85198135198135,
"grad_norm": 0.39500296115875244,
"learning_rate": 0.00037399766763848393,
"loss": 3.2573,
"step": 64700
},
{
"epoch": 18.866550116550115,
"grad_norm": 0.420569509267807,
"learning_rate": 0.00037382274052478136,
"loss": 3.284,
"step": 64750
},
{
"epoch": 18.88111888111888,
"grad_norm": 0.395648330450058,
"learning_rate": 0.0003736478134110787,
"loss": 3.2792,
"step": 64800
},
{
"epoch": 18.895687645687644,
"grad_norm": 0.4250222146511078,
"learning_rate": 0.00037347288629737606,
"loss": 3.2648,
"step": 64850
},
{
"epoch": 18.91025641025641,
"grad_norm": 0.3909446895122528,
"learning_rate": 0.00037329795918367343,
"loss": 3.2897,
"step": 64900
},
{
"epoch": 18.924825174825173,
"grad_norm": 0.38186371326446533,
"learning_rate": 0.0003731230320699708,
"loss": 3.2589,
"step": 64950
},
{
"epoch": 18.939393939393938,
"grad_norm": 0.3965621888637543,
"learning_rate": 0.0003729481049562682,
"loss": 3.2692,
"step": 65000
},
{
"epoch": 18.939393939393938,
"eval_accuracy": 0.3734865761797127,
"eval_loss": 3.534069776535034,
"eval_runtime": 180.2328,
"eval_samples_per_second": 92.336,
"eval_steps_per_second": 5.776,
"step": 65000
},
{
"epoch": 18.953962703962706,
"grad_norm": 0.404230535030365,
"learning_rate": 0.00037277317784256556,
"loss": 3.27,
"step": 65050
},
{
"epoch": 18.96853146853147,
"grad_norm": 0.3822128474712372,
"learning_rate": 0.00037259825072886294,
"loss": 3.2876,
"step": 65100
},
{
"epoch": 18.983100233100235,
"grad_norm": 0.4366745054721832,
"learning_rate": 0.00037242332361516037,
"loss": 3.2788,
"step": 65150
},
{
"epoch": 18.997668997669,
"grad_norm": 0.38679859042167664,
"learning_rate": 0.00037224839650145774,
"loss": 3.2719,
"step": 65200
},
{
"epoch": 19.012237762237763,
"grad_norm": 0.3948976695537567,
"learning_rate": 0.00037207346938775506,
"loss": 3.1947,
"step": 65250
},
{
"epoch": 19.026806526806528,
"grad_norm": 0.4068011939525604,
"learning_rate": 0.00037189854227405244,
"loss": 3.175,
"step": 65300
},
{
"epoch": 19.041375291375292,
"grad_norm": 0.39423683285713196,
"learning_rate": 0.0003717236151603498,
"loss": 3.1707,
"step": 65350
},
{
"epoch": 19.055944055944057,
"grad_norm": 0.3666231036186218,
"learning_rate": 0.0003715486880466472,
"loss": 3.1813,
"step": 65400
},
{
"epoch": 19.07051282051282,
"grad_norm": 0.3903319537639618,
"learning_rate": 0.00037137376093294457,
"loss": 3.1921,
"step": 65450
},
{
"epoch": 19.085081585081586,
"grad_norm": 0.4077537953853607,
"learning_rate": 0.00037119883381924194,
"loss": 3.1883,
"step": 65500
},
{
"epoch": 19.09965034965035,
"grad_norm": 0.40144649147987366,
"learning_rate": 0.0003710239067055393,
"loss": 3.1822,
"step": 65550
},
{
"epoch": 19.114219114219114,
"grad_norm": 0.42461147904396057,
"learning_rate": 0.00037084897959183675,
"loss": 3.1975,
"step": 65600
},
{
"epoch": 19.12878787878788,
"grad_norm": 0.3930090367794037,
"learning_rate": 0.0003706740524781341,
"loss": 3.1864,
"step": 65650
},
{
"epoch": 19.143356643356643,
"grad_norm": 0.3844420909881592,
"learning_rate": 0.00037049912536443145,
"loss": 3.1978,
"step": 65700
},
{
"epoch": 19.157925407925408,
"grad_norm": 0.4048396944999695,
"learning_rate": 0.0003703241982507288,
"loss": 3.2046,
"step": 65750
},
{
"epoch": 19.172494172494172,
"grad_norm": 0.3697856068611145,
"learning_rate": 0.0003701492711370262,
"loss": 3.1964,
"step": 65800
},
{
"epoch": 19.187062937062937,
"grad_norm": 0.42250409722328186,
"learning_rate": 0.0003699743440233236,
"loss": 3.2116,
"step": 65850
},
{
"epoch": 19.2016317016317,
"grad_norm": 0.3881857693195343,
"learning_rate": 0.00036979941690962095,
"loss": 3.2068,
"step": 65900
},
{
"epoch": 19.216200466200466,
"grad_norm": 0.4232952296733856,
"learning_rate": 0.0003696244897959183,
"loss": 3.1967,
"step": 65950
},
{
"epoch": 19.23076923076923,
"grad_norm": 0.39698526263237,
"learning_rate": 0.0003694495626822157,
"loss": 3.2072,
"step": 66000
},
{
"epoch": 19.23076923076923,
"eval_accuracy": 0.3729909306870058,
"eval_loss": 3.551021099090576,
"eval_runtime": 180.0611,
"eval_samples_per_second": 92.424,
"eval_steps_per_second": 5.781,
"step": 66000
},
{
"epoch": 19.245337995337994,
"grad_norm": 0.4016408920288086,
"learning_rate": 0.00036927463556851313,
"loss": 3.2204,
"step": 66050
},
{
"epoch": 19.25990675990676,
"grad_norm": 0.42734089493751526,
"learning_rate": 0.0003690997084548105,
"loss": 3.2053,
"step": 66100
},
{
"epoch": 19.274475524475523,
"grad_norm": 0.44750434160232544,
"learning_rate": 0.00036892478134110783,
"loss": 3.2228,
"step": 66150
},
{
"epoch": 19.289044289044288,
"grad_norm": 0.3957410156726837,
"learning_rate": 0.0003687498542274052,
"loss": 3.2057,
"step": 66200
},
{
"epoch": 19.303613053613052,
"grad_norm": 0.41760486364364624,
"learning_rate": 0.0003685749271137026,
"loss": 3.2128,
"step": 66250
},
{
"epoch": 19.318181818181817,
"grad_norm": 0.37982597947120667,
"learning_rate": 0.00036839999999999996,
"loss": 3.2198,
"step": 66300
},
{
"epoch": 19.33275058275058,
"grad_norm": 0.39311569929122925,
"learning_rate": 0.00036822507288629733,
"loss": 3.2262,
"step": 66350
},
{
"epoch": 19.34731934731935,
"grad_norm": 0.40117841958999634,
"learning_rate": 0.0003680501457725947,
"loss": 3.2224,
"step": 66400
},
{
"epoch": 19.361888111888113,
"grad_norm": 0.43732988834381104,
"learning_rate": 0.00036787521865889214,
"loss": 3.2191,
"step": 66450
},
{
"epoch": 19.376456876456878,
"grad_norm": 0.41658449172973633,
"learning_rate": 0.0003677002915451895,
"loss": 3.224,
"step": 66500
},
{
"epoch": 19.391025641025642,
"grad_norm": 0.39912378787994385,
"learning_rate": 0.0003675253644314869,
"loss": 3.2165,
"step": 66550
},
{
"epoch": 19.405594405594407,
"grad_norm": 0.41178327798843384,
"learning_rate": 0.0003673504373177842,
"loss": 3.2439,
"step": 66600
},
{
"epoch": 19.42016317016317,
"grad_norm": 0.3769591450691223,
"learning_rate": 0.0003671755102040816,
"loss": 3.2277,
"step": 66650
},
{
"epoch": 19.434731934731936,
"grad_norm": 0.3860316872596741,
"learning_rate": 0.00036700058309037896,
"loss": 3.2286,
"step": 66700
},
{
"epoch": 19.4493006993007,
"grad_norm": 0.38730889558792114,
"learning_rate": 0.00036682565597667634,
"loss": 3.2281,
"step": 66750
},
{
"epoch": 19.463869463869464,
"grad_norm": 0.39924201369285583,
"learning_rate": 0.0003666507288629737,
"loss": 3.2363,
"step": 66800
},
{
"epoch": 19.47843822843823,
"grad_norm": 0.4194006323814392,
"learning_rate": 0.0003664758017492711,
"loss": 3.227,
"step": 66850
},
{
"epoch": 19.493006993006993,
"grad_norm": 0.39736315608024597,
"learning_rate": 0.0003663008746355685,
"loss": 3.2454,
"step": 66900
},
{
"epoch": 19.507575757575758,
"grad_norm": 0.39378735423088074,
"learning_rate": 0.0003661259475218659,
"loss": 3.2404,
"step": 66950
},
{
"epoch": 19.522144522144522,
"grad_norm": 0.4026825428009033,
"learning_rate": 0.00036595102040816327,
"loss": 3.2441,
"step": 67000
},
{
"epoch": 19.522144522144522,
"eval_accuracy": 0.37330019465991665,
"eval_loss": 3.542999267578125,
"eval_runtime": 179.9313,
"eval_samples_per_second": 92.491,
"eval_steps_per_second": 5.786,
"step": 67000
},
{
"epoch": 19.536713286713287,
"grad_norm": 0.38818758726119995,
"learning_rate": 0.0003657760932944606,
"loss": 3.2529,
"step": 67050
},
{
"epoch": 19.55128205128205,
"grad_norm": 0.42251184582710266,
"learning_rate": 0.00036560116618075797,
"loss": 3.2517,
"step": 67100
},
{
"epoch": 19.565850815850816,
"grad_norm": 0.45560574531555176,
"learning_rate": 0.00036542623906705534,
"loss": 3.2418,
"step": 67150
},
{
"epoch": 19.58041958041958,
"grad_norm": 0.4155190885066986,
"learning_rate": 0.0003652513119533527,
"loss": 3.2352,
"step": 67200
},
{
"epoch": 19.594988344988344,
"grad_norm": 0.46676522493362427,
"learning_rate": 0.0003650763848396501,
"loss": 3.2424,
"step": 67250
},
{
"epoch": 19.60955710955711,
"grad_norm": 0.4105874001979828,
"learning_rate": 0.0003649014577259475,
"loss": 3.2472,
"step": 67300
},
{
"epoch": 19.624125874125873,
"grad_norm": 0.40313708782196045,
"learning_rate": 0.0003647265306122449,
"loss": 3.2436,
"step": 67350
},
{
"epoch": 19.638694638694638,
"grad_norm": 0.400311142206192,
"learning_rate": 0.0003645516034985423,
"loss": 3.2383,
"step": 67400
},
{
"epoch": 19.653263403263402,
"grad_norm": 0.387896865606308,
"learning_rate": 0.00036437667638483965,
"loss": 3.2514,
"step": 67450
},
{
"epoch": 19.667832167832167,
"grad_norm": 0.39638492465019226,
"learning_rate": 0.000364201749271137,
"loss": 3.2524,
"step": 67500
},
{
"epoch": 19.68240093240093,
"grad_norm": 0.40816885232925415,
"learning_rate": 0.00036402682215743435,
"loss": 3.2627,
"step": 67550
},
{
"epoch": 19.696969696969695,
"grad_norm": 0.39976105093955994,
"learning_rate": 0.0003638518950437317,
"loss": 3.2588,
"step": 67600
},
{
"epoch": 19.71153846153846,
"grad_norm": 0.38313519954681396,
"learning_rate": 0.0003636769679300291,
"loss": 3.2609,
"step": 67650
},
{
"epoch": 19.726107226107224,
"grad_norm": 0.4652852416038513,
"learning_rate": 0.0003635020408163265,
"loss": 3.264,
"step": 67700
},
{
"epoch": 19.740675990675992,
"grad_norm": 0.4328801929950714,
"learning_rate": 0.0003633271137026239,
"loss": 3.2553,
"step": 67750
},
{
"epoch": 19.755244755244757,
"grad_norm": 0.38108712434768677,
"learning_rate": 0.0003631521865889213,
"loss": 3.2439,
"step": 67800
},
{
"epoch": 19.76981351981352,
"grad_norm": 0.3857068121433258,
"learning_rate": 0.00036297725947521866,
"loss": 3.2565,
"step": 67850
},
{
"epoch": 19.784382284382286,
"grad_norm": 0.4026941657066345,
"learning_rate": 0.00036280233236151604,
"loss": 3.2531,
"step": 67900
},
{
"epoch": 19.79895104895105,
"grad_norm": 0.3889337480068207,
"learning_rate": 0.00036262740524781336,
"loss": 3.2349,
"step": 67950
},
{
"epoch": 19.813519813519815,
"grad_norm": 0.4145674705505371,
"learning_rate": 0.00036245247813411073,
"loss": 3.2584,
"step": 68000
},
{
"epoch": 19.813519813519815,
"eval_accuracy": 0.3736507330261703,
"eval_loss": 3.5368125438690186,
"eval_runtime": 180.8631,
"eval_samples_per_second": 92.014,
"eval_steps_per_second": 5.756,
"step": 68000
},
{
"epoch": 19.82808857808858,
"grad_norm": 0.42266836762428284,
"learning_rate": 0.0003622775510204081,
"loss": 3.255,
"step": 68050
},
{
"epoch": 19.842657342657343,
"grad_norm": 0.4349689781665802,
"learning_rate": 0.0003621026239067055,
"loss": 3.2449,
"step": 68100
},
{
"epoch": 19.857226107226108,
"grad_norm": 0.4042121171951294,
"learning_rate": 0.00036192769679300286,
"loss": 3.2478,
"step": 68150
},
{
"epoch": 19.871794871794872,
"grad_norm": 0.40951982140541077,
"learning_rate": 0.0003617527696793003,
"loss": 3.2761,
"step": 68200
},
{
"epoch": 19.886363636363637,
"grad_norm": 0.4130703806877136,
"learning_rate": 0.00036157784256559767,
"loss": 3.2723,
"step": 68250
},
{
"epoch": 19.9009324009324,
"grad_norm": 0.39606547355651855,
"learning_rate": 0.00036140291545189504,
"loss": 3.2622,
"step": 68300
},
{
"epoch": 19.915501165501166,
"grad_norm": 0.3855551779270172,
"learning_rate": 0.0003612279883381924,
"loss": 3.2545,
"step": 68350
},
{
"epoch": 19.93006993006993,
"grad_norm": 0.39541083574295044,
"learning_rate": 0.00036105306122448974,
"loss": 3.2581,
"step": 68400
},
{
"epoch": 19.944638694638694,
"grad_norm": 0.43517494201660156,
"learning_rate": 0.0003608781341107871,
"loss": 3.2528,
"step": 68450
},
{
"epoch": 19.95920745920746,
"grad_norm": 0.39319750666618347,
"learning_rate": 0.0003607032069970845,
"loss": 3.2749,
"step": 68500
},
{
"epoch": 19.973776223776223,
"grad_norm": 0.42578259110450745,
"learning_rate": 0.00036052827988338187,
"loss": 3.2813,
"step": 68550
},
{
"epoch": 19.988344988344988,
"grad_norm": 0.41607028245925903,
"learning_rate": 0.0003603533527696793,
"loss": 3.263,
"step": 68600
},
{
"epoch": 20.002913752913752,
"grad_norm": 0.40466973185539246,
"learning_rate": 0.00036017842565597667,
"loss": 3.2452,
"step": 68650
},
{
"epoch": 20.017482517482517,
"grad_norm": 0.4109882712364197,
"learning_rate": 0.00036000349854227405,
"loss": 3.1553,
"step": 68700
},
{
"epoch": 20.03205128205128,
"grad_norm": 0.44405311346054077,
"learning_rate": 0.0003598285714285714,
"loss": 3.1552,
"step": 68750
},
{
"epoch": 20.046620046620045,
"grad_norm": 0.405329167842865,
"learning_rate": 0.0003596536443148688,
"loss": 3.1718,
"step": 68800
},
{
"epoch": 20.06118881118881,
"grad_norm": 0.4313248097896576,
"learning_rate": 0.0003594787172011661,
"loss": 3.1793,
"step": 68850
},
{
"epoch": 20.075757575757574,
"grad_norm": 0.46406006813049316,
"learning_rate": 0.0003593037900874635,
"loss": 3.1654,
"step": 68900
},
{
"epoch": 20.09032634032634,
"grad_norm": 0.42525312304496765,
"learning_rate": 0.00035912886297376087,
"loss": 3.1819,
"step": 68950
},
{
"epoch": 20.104895104895103,
"grad_norm": 0.38830262422561646,
"learning_rate": 0.00035895393586005825,
"loss": 3.1789,
"step": 69000
},
{
"epoch": 20.104895104895103,
"eval_accuracy": 0.3730249144467667,
"eval_loss": 3.551114559173584,
"eval_runtime": 180.1938,
"eval_samples_per_second": 92.356,
"eval_steps_per_second": 5.777,
"step": 69000
},
{
"epoch": 20.11946386946387,
"grad_norm": 0.40085506439208984,
"learning_rate": 0.0003587790087463557,
"loss": 3.1873,
"step": 69050
},
{
"epoch": 20.134032634032636,
"grad_norm": 0.38859525322914124,
"learning_rate": 0.00035860408163265305,
"loss": 3.1785,
"step": 69100
},
{
"epoch": 20.1486013986014,
"grad_norm": 0.39980462193489075,
"learning_rate": 0.00035842915451895043,
"loss": 3.1818,
"step": 69150
},
{
"epoch": 20.163170163170165,
"grad_norm": 0.45090606808662415,
"learning_rate": 0.0003582542274052478,
"loss": 3.1847,
"step": 69200
},
{
"epoch": 20.17773892773893,
"grad_norm": 0.41566649079322815,
"learning_rate": 0.0003580793002915452,
"loss": 3.1746,
"step": 69250
},
{
"epoch": 20.192307692307693,
"grad_norm": 0.3906833231449127,
"learning_rate": 0.0003579043731778425,
"loss": 3.1916,
"step": 69300
},
{
"epoch": 20.206876456876458,
"grad_norm": 0.3998190760612488,
"learning_rate": 0.0003577294460641399,
"loss": 3.1898,
"step": 69350
},
{
"epoch": 20.221445221445222,
"grad_norm": 0.39457687735557556,
"learning_rate": 0.00035755451895043725,
"loss": 3.2008,
"step": 69400
},
{
"epoch": 20.236013986013987,
"grad_norm": 0.41978347301483154,
"learning_rate": 0.00035737959183673463,
"loss": 3.1958,
"step": 69450
},
{
"epoch": 20.25058275058275,
"grad_norm": 0.4217435121536255,
"learning_rate": 0.00035720466472303206,
"loss": 3.1889,
"step": 69500
},
{
"epoch": 20.265151515151516,
"grad_norm": 0.4164603352546692,
"learning_rate": 0.00035702973760932944,
"loss": 3.2118,
"step": 69550
},
{
"epoch": 20.27972027972028,
"grad_norm": 0.39068740606307983,
"learning_rate": 0.0003568548104956268,
"loss": 3.2086,
"step": 69600
},
{
"epoch": 20.294289044289044,
"grad_norm": 0.36841413378715515,
"learning_rate": 0.0003566798833819242,
"loss": 3.2165,
"step": 69650
},
{
"epoch": 20.30885780885781,
"grad_norm": 0.37738609313964844,
"learning_rate": 0.00035650495626822156,
"loss": 3.2106,
"step": 69700
},
{
"epoch": 20.323426573426573,
"grad_norm": 0.3986680209636688,
"learning_rate": 0.0003563300291545189,
"loss": 3.2111,
"step": 69750
},
{
"epoch": 20.337995337995338,
"grad_norm": 0.40461841225624084,
"learning_rate": 0.00035615510204081626,
"loss": 3.2073,
"step": 69800
},
{
"epoch": 20.352564102564102,
"grad_norm": 0.4184028208255768,
"learning_rate": 0.00035598017492711364,
"loss": 3.2081,
"step": 69850
},
{
"epoch": 20.367132867132867,
"grad_norm": 0.39725759625434875,
"learning_rate": 0.00035580524781341107,
"loss": 3.214,
"step": 69900
},
{
"epoch": 20.38170163170163,
"grad_norm": 0.3868226706981659,
"learning_rate": 0.00035563032069970844,
"loss": 3.2195,
"step": 69950
},
{
"epoch": 20.396270396270396,
"grad_norm": 0.43626147508621216,
"learning_rate": 0.0003554553935860058,
"loss": 3.2145,
"step": 70000
},
{
"epoch": 20.396270396270396,
"eval_accuracy": 0.3731882481571748,
"eval_loss": 3.5491859912872314,
"eval_runtime": 180.0447,
"eval_samples_per_second": 92.433,
"eval_steps_per_second": 5.782,
"step": 70000
},
{
"epoch": 20.41083916083916,
"grad_norm": 0.42190268635749817,
"learning_rate": 0.0003552804664723032,
"loss": 3.2188,
"step": 70050
},
{
"epoch": 20.425407925407924,
"grad_norm": 0.38069528341293335,
"learning_rate": 0.00035510553935860057,
"loss": 3.2178,
"step": 70100
},
{
"epoch": 20.43997668997669,
"grad_norm": 0.39980804920196533,
"learning_rate": 0.00035493061224489795,
"loss": 3.2249,
"step": 70150
},
{
"epoch": 20.454545454545453,
"grad_norm": 0.4361363649368286,
"learning_rate": 0.00035475568513119527,
"loss": 3.2327,
"step": 70200
},
{
"epoch": 20.469114219114218,
"grad_norm": 0.4003528952598572,
"learning_rate": 0.00035458075801749264,
"loss": 3.2181,
"step": 70250
},
{
"epoch": 20.483682983682982,
"grad_norm": 0.4071844816207886,
"learning_rate": 0.00035440583090379,
"loss": 3.2183,
"step": 70300
},
{
"epoch": 20.498251748251747,
"grad_norm": 0.4098977744579315,
"learning_rate": 0.00035423090379008745,
"loss": 3.229,
"step": 70350
},
{
"epoch": 20.51282051282051,
"grad_norm": 0.38837113976478577,
"learning_rate": 0.0003540559766763848,
"loss": 3.2301,
"step": 70400
},
{
"epoch": 20.52738927738928,
"grad_norm": 0.4276978671550751,
"learning_rate": 0.0003538810495626822,
"loss": 3.2347,
"step": 70450
},
{
"epoch": 20.541958041958043,
"grad_norm": 0.39745762944221497,
"learning_rate": 0.0003537061224489796,
"loss": 3.2343,
"step": 70500
},
{
"epoch": 20.556526806526808,
"grad_norm": 0.39127087593078613,
"learning_rate": 0.00035353119533527695,
"loss": 3.2333,
"step": 70550
},
{
"epoch": 20.571095571095572,
"grad_norm": 0.38400810956954956,
"learning_rate": 0.00035335626822157433,
"loss": 3.2294,
"step": 70600
},
{
"epoch": 20.585664335664337,
"grad_norm": 0.48885342478752136,
"learning_rate": 0.00035318134110787165,
"loss": 3.2447,
"step": 70650
},
{
"epoch": 20.6002331002331,
"grad_norm": 0.4058760404586792,
"learning_rate": 0.000353006413994169,
"loss": 3.2314,
"step": 70700
},
{
"epoch": 20.614801864801866,
"grad_norm": 0.44265225529670715,
"learning_rate": 0.00035283148688046646,
"loss": 3.2408,
"step": 70750
},
{
"epoch": 20.62937062937063,
"grad_norm": 0.39163583517074585,
"learning_rate": 0.00035265655976676383,
"loss": 3.2402,
"step": 70800
},
{
"epoch": 20.643939393939394,
"grad_norm": 0.42056941986083984,
"learning_rate": 0.0003524816326530612,
"loss": 3.2357,
"step": 70850
},
{
"epoch": 20.65850815850816,
"grad_norm": 0.41706550121307373,
"learning_rate": 0.0003523067055393586,
"loss": 3.2457,
"step": 70900
},
{
"epoch": 20.673076923076923,
"grad_norm": 0.41059020161628723,
"learning_rate": 0.00035213177842565596,
"loss": 3.237,
"step": 70950
},
{
"epoch": 20.687645687645688,
"grad_norm": 0.38980886340141296,
"learning_rate": 0.00035195685131195333,
"loss": 3.2446,
"step": 71000
},
{
"epoch": 20.687645687645688,
"eval_accuracy": 0.3736142798582607,
"eval_loss": 3.539933919906616,
"eval_runtime": 180.0196,
"eval_samples_per_second": 92.445,
"eval_steps_per_second": 5.783,
"step": 71000
},
{
"epoch": 20.702214452214452,
"grad_norm": 0.42689770460128784,
"learning_rate": 0.0003517819241982507,
"loss": 3.2479,
"step": 71050
},
{
"epoch": 20.716783216783217,
"grad_norm": 0.37170732021331787,
"learning_rate": 0.00035160699708454803,
"loss": 3.2448,
"step": 71100
},
{
"epoch": 20.73135198135198,
"grad_norm": 0.40953195095062256,
"learning_rate": 0.0003514320699708454,
"loss": 3.2477,
"step": 71150
},
{
"epoch": 20.745920745920746,
"grad_norm": 0.44523903727531433,
"learning_rate": 0.00035125714285714284,
"loss": 3.2392,
"step": 71200
},
{
"epoch": 20.76048951048951,
"grad_norm": 0.41191455721855164,
"learning_rate": 0.0003510822157434402,
"loss": 3.2403,
"step": 71250
},
{
"epoch": 20.775058275058274,
"grad_norm": 0.38925042748451233,
"learning_rate": 0.0003509072886297376,
"loss": 3.2517,
"step": 71300
},
{
"epoch": 20.78962703962704,
"grad_norm": 0.44074341654777527,
"learning_rate": 0.00035073236151603497,
"loss": 3.2596,
"step": 71350
},
{
"epoch": 20.804195804195803,
"grad_norm": 0.4275226891040802,
"learning_rate": 0.00035055743440233234,
"loss": 3.2408,
"step": 71400
},
{
"epoch": 20.818764568764568,
"grad_norm": 0.42843613028526306,
"learning_rate": 0.0003503825072886297,
"loss": 3.252,
"step": 71450
},
{
"epoch": 20.833333333333332,
"grad_norm": 0.42051464319229126,
"learning_rate": 0.0003502075801749271,
"loss": 3.2386,
"step": 71500
},
{
"epoch": 20.847902097902097,
"grad_norm": 0.37405192852020264,
"learning_rate": 0.0003500326530612244,
"loss": 3.2573,
"step": 71550
},
{
"epoch": 20.86247086247086,
"grad_norm": 0.42603862285614014,
"learning_rate": 0.0003498577259475218,
"loss": 3.2459,
"step": 71600
},
{
"epoch": 20.877039627039625,
"grad_norm": 0.45300430059432983,
"learning_rate": 0.0003496827988338192,
"loss": 3.25,
"step": 71650
},
{
"epoch": 20.89160839160839,
"grad_norm": 0.40416771173477173,
"learning_rate": 0.0003495078717201166,
"loss": 3.2399,
"step": 71700
},
{
"epoch": 20.906177156177158,
"grad_norm": 0.39509859681129456,
"learning_rate": 0.00034933294460641397,
"loss": 3.2518,
"step": 71750
},
{
"epoch": 20.920745920745922,
"grad_norm": 0.4095703661441803,
"learning_rate": 0.00034915801749271135,
"loss": 3.2553,
"step": 71800
},
{
"epoch": 20.935314685314687,
"grad_norm": 0.4047284722328186,
"learning_rate": 0.0003489830903790087,
"loss": 3.2604,
"step": 71850
},
{
"epoch": 20.94988344988345,
"grad_norm": 0.4120003581047058,
"learning_rate": 0.0003488081632653061,
"loss": 3.2629,
"step": 71900
},
{
"epoch": 20.964452214452216,
"grad_norm": 0.3764059543609619,
"learning_rate": 0.0003486332361516035,
"loss": 3.2592,
"step": 71950
},
{
"epoch": 20.97902097902098,
"grad_norm": 0.4122651219367981,
"learning_rate": 0.0003484583090379008,
"loss": 3.2494,
"step": 72000
},
{
"epoch": 20.97902097902098,
"eval_accuracy": 0.37445611285524494,
"eval_loss": 3.5293209552764893,
"eval_runtime": 182.1054,
"eval_samples_per_second": 91.387,
"eval_steps_per_second": 5.716,
"step": 72000
},
{
"epoch": 20.993589743589745,
"grad_norm": 0.424654096364975,
"learning_rate": 0.0003482833819241982,
"loss": 3.2465,
"step": 72050
},
{
"epoch": 21.00815850815851,
"grad_norm": 0.4177544414997101,
"learning_rate": 0.0003481084548104956,
"loss": 3.1941,
"step": 72100
},
{
"epoch": 21.022727272727273,
"grad_norm": 0.397794634103775,
"learning_rate": 0.000347933527696793,
"loss": 3.1429,
"step": 72150
},
{
"epoch": 21.037296037296038,
"grad_norm": 0.41001781821250916,
"learning_rate": 0.00034775860058309035,
"loss": 3.1562,
"step": 72200
},
{
"epoch": 21.051864801864802,
"grad_norm": 0.39250168204307556,
"learning_rate": 0.00034758367346938773,
"loss": 3.1583,
"step": 72250
},
{
"epoch": 21.066433566433567,
"grad_norm": 0.4046579599380493,
"learning_rate": 0.0003474087463556851,
"loss": 3.1576,
"step": 72300
},
{
"epoch": 21.08100233100233,
"grad_norm": 0.3915032744407654,
"learning_rate": 0.0003472338192419825,
"loss": 3.1568,
"step": 72350
},
{
"epoch": 21.095571095571096,
"grad_norm": 0.40777382254600525,
"learning_rate": 0.0003470588921282799,
"loss": 3.1622,
"step": 72400
},
{
"epoch": 21.11013986013986,
"grad_norm": 0.4149940013885498,
"learning_rate": 0.0003468839650145772,
"loss": 3.1852,
"step": 72450
},
{
"epoch": 21.124708624708624,
"grad_norm": 0.43859758973121643,
"learning_rate": 0.0003467090379008746,
"loss": 3.1629,
"step": 72500
},
{
"epoch": 21.13927738927739,
"grad_norm": 0.43607211112976074,
"learning_rate": 0.000346534110787172,
"loss": 3.1859,
"step": 72550
},
{
"epoch": 21.153846153846153,
"grad_norm": 0.40747860074043274,
"learning_rate": 0.00034635918367346936,
"loss": 3.1825,
"step": 72600
},
{
"epoch": 21.168414918414918,
"grad_norm": 0.41333723068237305,
"learning_rate": 0.00034618425655976674,
"loss": 3.1762,
"step": 72650
},
{
"epoch": 21.182983682983682,
"grad_norm": 0.4163745641708374,
"learning_rate": 0.0003460093294460641,
"loss": 3.1871,
"step": 72700
},
{
"epoch": 21.197552447552447,
"grad_norm": 0.403562068939209,
"learning_rate": 0.0003458344023323615,
"loss": 3.1731,
"step": 72750
},
{
"epoch": 21.21212121212121,
"grad_norm": 0.4139896631240845,
"learning_rate": 0.00034565947521865886,
"loss": 3.1912,
"step": 72800
},
{
"epoch": 21.226689976689975,
"grad_norm": 0.41359269618988037,
"learning_rate": 0.0003454845481049562,
"loss": 3.1819,
"step": 72850
},
{
"epoch": 21.24125874125874,
"grad_norm": 0.41218870878219604,
"learning_rate": 0.0003453096209912536,
"loss": 3.1962,
"step": 72900
},
{
"epoch": 21.255827505827504,
"grad_norm": 0.414986789226532,
"learning_rate": 0.000345134693877551,
"loss": 3.1907,
"step": 72950
},
{
"epoch": 21.27039627039627,
"grad_norm": 0.47167742252349854,
"learning_rate": 0.00034495976676384837,
"loss": 3.1886,
"step": 73000
},
{
"epoch": 21.27039627039627,
"eval_accuracy": 0.3735304375720685,
"eval_loss": 3.546489715576172,
"eval_runtime": 229.5916,
"eval_samples_per_second": 72.485,
"eval_steps_per_second": 4.534,
"step": 73000
},
{
"epoch": 21.284965034965033,
"grad_norm": 0.42384305596351624,
"learning_rate": 0.00034478483965014574,
"loss": 3.2018,
"step": 73050
},
{
"epoch": 21.2995337995338,
"grad_norm": 0.4221298098564148,
"learning_rate": 0.0003446099125364431,
"loss": 3.2019,
"step": 73100
},
{
"epoch": 21.314102564102566,
"grad_norm": 0.3836268484592438,
"learning_rate": 0.0003444349854227405,
"loss": 3.2082,
"step": 73150
},
{
"epoch": 21.32867132867133,
"grad_norm": 0.41144731640815735,
"learning_rate": 0.00034426005830903787,
"loss": 3.1878,
"step": 73200
},
{
"epoch": 21.343240093240095,
"grad_norm": 0.4177376627922058,
"learning_rate": 0.0003440851311953353,
"loss": 3.2069,
"step": 73250
},
{
"epoch": 21.35780885780886,
"grad_norm": 0.4552435576915741,
"learning_rate": 0.00034391020408163257,
"loss": 3.1999,
"step": 73300
},
{
"epoch": 21.372377622377623,
"grad_norm": 0.41348081827163696,
"learning_rate": 0.00034373527696793,
"loss": 3.2043,
"step": 73350
},
{
"epoch": 21.386946386946388,
"grad_norm": 0.40806278586387634,
"learning_rate": 0.00034356034985422737,
"loss": 3.1899,
"step": 73400
},
{
"epoch": 21.401515151515152,
"grad_norm": 0.4172794222831726,
"learning_rate": 0.00034338542274052475,
"loss": 3.2038,
"step": 73450
},
{
"epoch": 21.416083916083917,
"grad_norm": 0.4296078383922577,
"learning_rate": 0.0003432104956268221,
"loss": 3.2136,
"step": 73500
},
{
"epoch": 21.43065268065268,
"grad_norm": 0.379256010055542,
"learning_rate": 0.0003430355685131195,
"loss": 3.1992,
"step": 73550
},
{
"epoch": 21.445221445221446,
"grad_norm": 0.4346024692058563,
"learning_rate": 0.0003428606413994169,
"loss": 3.2252,
"step": 73600
},
{
"epoch": 21.45979020979021,
"grad_norm": 0.40021541714668274,
"learning_rate": 0.00034268571428571425,
"loss": 3.2068,
"step": 73650
},
{
"epoch": 21.474358974358974,
"grad_norm": 0.4183707535266876,
"learning_rate": 0.0003425107871720117,
"loss": 3.2267,
"step": 73700
},
{
"epoch": 21.48892773892774,
"grad_norm": 0.4039231836795807,
"learning_rate": 0.00034233586005830895,
"loss": 3.2168,
"step": 73750
},
{
"epoch": 21.503496503496503,
"grad_norm": 0.39682918787002563,
"learning_rate": 0.0003421609329446064,
"loss": 3.2184,
"step": 73800
},
{
"epoch": 21.518065268065268,
"grad_norm": 0.42791664600372314,
"learning_rate": 0.00034198600583090375,
"loss": 3.2254,
"step": 73850
},
{
"epoch": 21.532634032634032,
"grad_norm": 0.447919100522995,
"learning_rate": 0.00034181107871720113,
"loss": 3.2198,
"step": 73900
},
{
"epoch": 21.547202797202797,
"grad_norm": 0.4114047586917877,
"learning_rate": 0.0003416361516034985,
"loss": 3.2176,
"step": 73950
},
{
"epoch": 21.56177156177156,
"grad_norm": 0.442635715007782,
"learning_rate": 0.0003414612244897959,
"loss": 3.2211,
"step": 74000
},
{
"epoch": 21.56177156177156,
"eval_accuracy": 0.37385510594819277,
"eval_loss": 3.5401127338409424,
"eval_runtime": 180.1598,
"eval_samples_per_second": 92.374,
"eval_steps_per_second": 5.778,
"step": 74000
},
{
"epoch": 21.576340326340326,
"grad_norm": 0.4588135778903961,
"learning_rate": 0.00034128629737609326,
"loss": 3.2151,
"step": 74050
},
{
"epoch": 21.59090909090909,
"grad_norm": 0.4199243187904358,
"learning_rate": 0.00034111137026239063,
"loss": 3.231,
"step": 74100
},
{
"epoch": 21.605477855477854,
"grad_norm": 0.41805458068847656,
"learning_rate": 0.00034093644314868806,
"loss": 3.2262,
"step": 74150
},
{
"epoch": 21.62004662004662,
"grad_norm": 0.4202272891998291,
"learning_rate": 0.0003407615160349854,
"loss": 3.2326,
"step": 74200
},
{
"epoch": 21.634615384615383,
"grad_norm": 0.47583162784576416,
"learning_rate": 0.00034058658892128276,
"loss": 3.2182,
"step": 74250
},
{
"epoch": 21.649184149184148,
"grad_norm": 0.40350261330604553,
"learning_rate": 0.00034041166180758014,
"loss": 3.2307,
"step": 74300
},
{
"epoch": 21.663752913752912,
"grad_norm": 0.43340378999710083,
"learning_rate": 0.0003402367346938775,
"loss": 3.2299,
"step": 74350
},
{
"epoch": 21.67832167832168,
"grad_norm": 0.4040905237197876,
"learning_rate": 0.0003400618075801749,
"loss": 3.238,
"step": 74400
},
{
"epoch": 21.692890442890445,
"grad_norm": 0.42097172141075134,
"learning_rate": 0.00033988688046647226,
"loss": 3.2294,
"step": 74450
},
{
"epoch": 21.70745920745921,
"grad_norm": 0.39376163482666016,
"learning_rate": 0.00033971195335276964,
"loss": 3.2284,
"step": 74500
},
{
"epoch": 21.722027972027973,
"grad_norm": 0.4100789725780487,
"learning_rate": 0.00033953702623906707,
"loss": 3.2379,
"step": 74550
},
{
"epoch": 21.736596736596738,
"grad_norm": 0.3906741440296173,
"learning_rate": 0.00033936209912536445,
"loss": 3.2339,
"step": 74600
},
{
"epoch": 21.751165501165502,
"grad_norm": 0.4219715893268585,
"learning_rate": 0.00033918717201166177,
"loss": 3.2304,
"step": 74650
},
{
"epoch": 21.765734265734267,
"grad_norm": 0.41775527596473694,
"learning_rate": 0.00033901224489795914,
"loss": 3.2425,
"step": 74700
},
{
"epoch": 21.78030303030303,
"grad_norm": 0.4094178378582001,
"learning_rate": 0.0003388373177842565,
"loss": 3.2364,
"step": 74750
},
{
"epoch": 21.794871794871796,
"grad_norm": 0.4222930073738098,
"learning_rate": 0.0003386623906705539,
"loss": 3.2422,
"step": 74800
},
{
"epoch": 21.80944055944056,
"grad_norm": 0.4146324396133423,
"learning_rate": 0.00033848746355685127,
"loss": 3.2283,
"step": 74850
},
{
"epoch": 21.824009324009324,
"grad_norm": 0.42270055413246155,
"learning_rate": 0.00033831253644314865,
"loss": 3.25,
"step": 74900
},
{
"epoch": 21.83857808857809,
"grad_norm": 0.41215404868125916,
"learning_rate": 0.000338137609329446,
"loss": 3.2358,
"step": 74950
},
{
"epoch": 21.853146853146853,
"grad_norm": 0.40176627039909363,
"learning_rate": 0.00033796268221574345,
"loss": 3.2288,
"step": 75000
},
{
"epoch": 21.853146853146853,
"eval_accuracy": 0.37451243887920854,
"eval_loss": 3.533621072769165,
"eval_runtime": 180.1417,
"eval_samples_per_second": 92.383,
"eval_steps_per_second": 5.779,
"step": 75000
},
{
"epoch": 21.867715617715618,
"grad_norm": 0.41212475299835205,
"learning_rate": 0.00033778775510204083,
"loss": 3.2464,
"step": 75050
},
{
"epoch": 21.882284382284382,
"grad_norm": 0.4054611027240753,
"learning_rate": 0.00033761282798833815,
"loss": 3.239,
"step": 75100
},
{
"epoch": 21.896853146853147,
"grad_norm": 0.42206400632858276,
"learning_rate": 0.0003374379008746355,
"loss": 3.2397,
"step": 75150
},
{
"epoch": 21.91142191142191,
"grad_norm": 0.39644235372543335,
"learning_rate": 0.0003372629737609329,
"loss": 3.2399,
"step": 75200
},
{
"epoch": 21.925990675990676,
"grad_norm": 0.46023765206336975,
"learning_rate": 0.0003370880466472303,
"loss": 3.2508,
"step": 75250
},
{
"epoch": 21.94055944055944,
"grad_norm": 0.39913883805274963,
"learning_rate": 0.00033691311953352765,
"loss": 3.2372,
"step": 75300
},
{
"epoch": 21.955128205128204,
"grad_norm": 0.39186689257621765,
"learning_rate": 0.00033673819241982503,
"loss": 3.2402,
"step": 75350
},
{
"epoch": 21.96969696969697,
"grad_norm": 0.42302364110946655,
"learning_rate": 0.00033656326530612246,
"loss": 3.2357,
"step": 75400
},
{
"epoch": 21.984265734265733,
"grad_norm": 0.43021634221076965,
"learning_rate": 0.00033638833819241983,
"loss": 3.2478,
"step": 75450
},
{
"epoch": 21.998834498834498,
"grad_norm": 0.4020933210849762,
"learning_rate": 0.0003362134110787172,
"loss": 3.2506,
"step": 75500
},
{
"epoch": 22.013403263403262,
"grad_norm": 0.40365955233573914,
"learning_rate": 0.00033603848396501453,
"loss": 3.1508,
"step": 75550
},
{
"epoch": 22.027972027972027,
"grad_norm": 0.4142661988735199,
"learning_rate": 0.0003358635568513119,
"loss": 3.1432,
"step": 75600
},
{
"epoch": 22.04254079254079,
"grad_norm": 0.4072301685810089,
"learning_rate": 0.0003356886297376093,
"loss": 3.1519,
"step": 75650
},
{
"epoch": 22.057109557109555,
"grad_norm": 0.43228116631507874,
"learning_rate": 0.00033551370262390666,
"loss": 3.1468,
"step": 75700
},
{
"epoch": 22.071678321678323,
"grad_norm": 0.4091419577598572,
"learning_rate": 0.00033533877551020403,
"loss": 3.1599,
"step": 75750
},
{
"epoch": 22.086247086247088,
"grad_norm": 0.42613473534584045,
"learning_rate": 0.0003351638483965014,
"loss": 3.1608,
"step": 75800
},
{
"epoch": 22.100815850815852,
"grad_norm": 0.41074615716934204,
"learning_rate": 0.00033498892128279884,
"loss": 3.1487,
"step": 75850
},
{
"epoch": 22.115384615384617,
"grad_norm": 0.43605613708496094,
"learning_rate": 0.0003348139941690962,
"loss": 3.1563,
"step": 75900
},
{
"epoch": 22.12995337995338,
"grad_norm": 0.3988359272480011,
"learning_rate": 0.0003346390670553936,
"loss": 3.1575,
"step": 75950
},
{
"epoch": 22.144522144522146,
"grad_norm": 0.3911682367324829,
"learning_rate": 0.0003344641399416909,
"loss": 3.1628,
"step": 76000
},
{
"epoch": 22.144522144522146,
"eval_accuracy": 0.3734473008310617,
"eval_loss": 3.5492424964904785,
"eval_runtime": 250.473,
"eval_samples_per_second": 66.442,
"eval_steps_per_second": 4.156,
"step": 76000
},
{
"epoch": 22.15909090909091,
"grad_norm": 0.4275292158126831,
"learning_rate": 0.0003342892128279883,
"loss": 3.1832,
"step": 76050
},
{
"epoch": 22.173659673659674,
"grad_norm": 0.4048565626144409,
"learning_rate": 0.00033411428571428567,
"loss": 3.1722,
"step": 76100
},
{
"epoch": 22.18822843822844,
"grad_norm": 0.45824673771858215,
"learning_rate": 0.00033393935860058304,
"loss": 3.176,
"step": 76150
},
{
"epoch": 22.202797202797203,
"grad_norm": 0.4052969813346863,
"learning_rate": 0.0003337644314868804,
"loss": 3.1814,
"step": 76200
},
{
"epoch": 22.217365967365968,
"grad_norm": 0.4052939713001251,
"learning_rate": 0.0003335895043731778,
"loss": 3.176,
"step": 76250
},
{
"epoch": 22.231934731934732,
"grad_norm": 0.4383087456226349,
"learning_rate": 0.0003334145772594752,
"loss": 3.1829,
"step": 76300
},
{
"epoch": 22.246503496503497,
"grad_norm": 0.41173261404037476,
"learning_rate": 0.0003332396501457726,
"loss": 3.1896,
"step": 76350
},
{
"epoch": 22.26107226107226,
"grad_norm": 0.42085108160972595,
"learning_rate": 0.00033306472303207,
"loss": 3.1796,
"step": 76400
},
{
"epoch": 22.275641025641026,
"grad_norm": 0.4438980221748352,
"learning_rate": 0.0003328897959183673,
"loss": 3.1888,
"step": 76450
},
{
"epoch": 22.29020979020979,
"grad_norm": 0.4162982702255249,
"learning_rate": 0.00033271486880466467,
"loss": 3.188,
"step": 76500
},
{
"epoch": 22.304778554778554,
"grad_norm": 0.4152987003326416,
"learning_rate": 0.00033253994169096205,
"loss": 3.1885,
"step": 76550
},
{
"epoch": 22.31934731934732,
"grad_norm": 0.41857367753982544,
"learning_rate": 0.0003323650145772594,
"loss": 3.1949,
"step": 76600
},
{
"epoch": 22.333916083916083,
"grad_norm": 0.43874943256378174,
"learning_rate": 0.0003321900874635568,
"loss": 3.2027,
"step": 76650
},
{
"epoch": 22.348484848484848,
"grad_norm": 0.43016231060028076,
"learning_rate": 0.00033201516034985423,
"loss": 3.1932,
"step": 76700
},
{
"epoch": 22.363053613053612,
"grad_norm": 0.41237112879753113,
"learning_rate": 0.0003318402332361516,
"loss": 3.1969,
"step": 76750
},
{
"epoch": 22.377622377622377,
"grad_norm": 0.39711132645606995,
"learning_rate": 0.000331665306122449,
"loss": 3.2021,
"step": 76800
},
{
"epoch": 22.39219114219114,
"grad_norm": 0.41648826003074646,
"learning_rate": 0.00033149037900874636,
"loss": 3.202,
"step": 76850
},
{
"epoch": 22.406759906759905,
"grad_norm": 0.43639883399009705,
"learning_rate": 0.0003313154518950437,
"loss": 3.1975,
"step": 76900
},
{
"epoch": 22.42132867132867,
"grad_norm": 0.43766576051712036,
"learning_rate": 0.00033114052478134105,
"loss": 3.211,
"step": 76950
},
{
"epoch": 22.435897435897434,
"grad_norm": 0.40724071860313416,
"learning_rate": 0.00033096559766763843,
"loss": 3.205,
"step": 77000
},
{
"epoch": 22.435897435897434,
"eval_accuracy": 0.37394717959488066,
"eval_loss": 3.5423734188079834,
"eval_runtime": 351.8544,
"eval_samples_per_second": 47.298,
"eval_steps_per_second": 2.959,
"step": 77000
},
{
"epoch": 22.4504662004662,
"grad_norm": 0.4043303430080414,
"learning_rate": 0.0003307906705539358,
"loss": 3.2015,
"step": 77050
},
{
"epoch": 22.465034965034967,
"grad_norm": 0.4104682207107544,
"learning_rate": 0.0003306157434402332,
"loss": 3.2104,
"step": 77100
},
{
"epoch": 22.47960372960373,
"grad_norm": 0.4327828288078308,
"learning_rate": 0.0003304408163265306,
"loss": 3.1976,
"step": 77150
},
{
"epoch": 22.494172494172496,
"grad_norm": 0.40035805106163025,
"learning_rate": 0.000330265889212828,
"loss": 3.1973,
"step": 77200
},
{
"epoch": 22.50874125874126,
"grad_norm": 0.4154951274394989,
"learning_rate": 0.00033009096209912536,
"loss": 3.2096,
"step": 77250
},
{
"epoch": 22.523310023310025,
"grad_norm": 0.4317122995853424,
"learning_rate": 0.00032991603498542274,
"loss": 3.2028,
"step": 77300
},
{
"epoch": 22.53787878787879,
"grad_norm": 0.4079900085926056,
"learning_rate": 0.00032974110787172006,
"loss": 3.2131,
"step": 77350
},
{
"epoch": 22.552447552447553,
"grad_norm": 0.4091930389404297,
"learning_rate": 0.00032956618075801744,
"loss": 3.202,
"step": 77400
},
{
"epoch": 22.567016317016318,
"grad_norm": 0.42974573373794556,
"learning_rate": 0.0003293912536443148,
"loss": 3.221,
"step": 77450
},
{
"epoch": 22.581585081585082,
"grad_norm": 0.40895313024520874,
"learning_rate": 0.0003292163265306122,
"loss": 3.2192,
"step": 77500
},
{
"epoch": 22.596153846153847,
"grad_norm": 0.4025968611240387,
"learning_rate": 0.00032904139941690956,
"loss": 3.212,
"step": 77550
},
{
"epoch": 22.61072261072261,
"grad_norm": 0.40843942761421204,
"learning_rate": 0.000328866472303207,
"loss": 3.2103,
"step": 77600
},
{
"epoch": 22.625291375291376,
"grad_norm": 0.3990994095802307,
"learning_rate": 0.00032869154518950437,
"loss": 3.2009,
"step": 77650
},
{
"epoch": 22.63986013986014,
"grad_norm": 0.4064718186855316,
"learning_rate": 0.00032851661807580174,
"loss": 3.2152,
"step": 77700
},
{
"epoch": 22.654428904428904,
"grad_norm": 0.40382882952690125,
"learning_rate": 0.0003283416909620991,
"loss": 3.2168,
"step": 77750
},
{
"epoch": 22.66899766899767,
"grad_norm": 0.4538167417049408,
"learning_rate": 0.00032816676384839644,
"loss": 3.2197,
"step": 77800
},
{
"epoch": 22.683566433566433,
"grad_norm": 0.4410167932510376,
"learning_rate": 0.0003279918367346938,
"loss": 3.2207,
"step": 77850
},
{
"epoch": 22.698135198135198,
"grad_norm": 0.40950748324394226,
"learning_rate": 0.0003278169096209912,
"loss": 3.225,
"step": 77900
},
{
"epoch": 22.712703962703962,
"grad_norm": 0.44317975640296936,
"learning_rate": 0.00032764198250728857,
"loss": 3.2251,
"step": 77950
},
{
"epoch": 22.727272727272727,
"grad_norm": 0.4140080213546753,
"learning_rate": 0.000327467055393586,
"loss": 3.2246,
"step": 78000
},
{
"epoch": 22.727272727272727,
"eval_accuracy": 0.37460909856960123,
"eval_loss": 3.534909963607788,
"eval_runtime": 180.2239,
"eval_samples_per_second": 92.341,
"eval_steps_per_second": 5.776,
"step": 78000
},
{
"epoch": 22.74184149184149,
"grad_norm": 0.41078007221221924,
"learning_rate": 0.0003272921282798834,
"loss": 3.2131,
"step": 78050
},
{
"epoch": 22.756410256410255,
"grad_norm": 0.4103488028049469,
"learning_rate": 0.00032711720116618075,
"loss": 3.227,
"step": 78100
},
{
"epoch": 22.77097902097902,
"grad_norm": 0.4068191945552826,
"learning_rate": 0.0003269422740524781,
"loss": 3.2225,
"step": 78150
},
{
"epoch": 22.785547785547784,
"grad_norm": 0.4318259656429291,
"learning_rate": 0.0003267673469387755,
"loss": 3.2264,
"step": 78200
},
{
"epoch": 22.80011655011655,
"grad_norm": 0.40115559101104736,
"learning_rate": 0.0003265924198250728,
"loss": 3.2255,
"step": 78250
},
{
"epoch": 22.814685314685313,
"grad_norm": 0.4201257526874542,
"learning_rate": 0.0003264174927113702,
"loss": 3.2325,
"step": 78300
},
{
"epoch": 22.829254079254078,
"grad_norm": 0.4182145893573761,
"learning_rate": 0.0003262425655976676,
"loss": 3.2292,
"step": 78350
},
{
"epoch": 22.843822843822842,
"grad_norm": 0.4127039611339569,
"learning_rate": 0.00032606763848396495,
"loss": 3.2223,
"step": 78400
},
{
"epoch": 22.85839160839161,
"grad_norm": 0.38099154829978943,
"learning_rate": 0.0003258927113702624,
"loss": 3.2266,
"step": 78450
},
{
"epoch": 22.872960372960375,
"grad_norm": 0.4170164167881012,
"learning_rate": 0.00032571778425655976,
"loss": 3.2318,
"step": 78500
},
{
"epoch": 22.88752913752914,
"grad_norm": 0.41228339076042175,
"learning_rate": 0.00032554285714285713,
"loss": 3.2338,
"step": 78550
},
{
"epoch": 22.902097902097903,
"grad_norm": 0.3996942639350891,
"learning_rate": 0.0003253679300291545,
"loss": 3.2232,
"step": 78600
},
{
"epoch": 22.916666666666668,
"grad_norm": 0.4170009195804596,
"learning_rate": 0.0003251930029154519,
"loss": 3.2358,
"step": 78650
},
{
"epoch": 22.931235431235432,
"grad_norm": 0.40195149183273315,
"learning_rate": 0.0003250180758017492,
"loss": 3.2308,
"step": 78700
},
{
"epoch": 22.945804195804197,
"grad_norm": 0.39445433020591736,
"learning_rate": 0.0003248431486880466,
"loss": 3.2227,
"step": 78750
},
{
"epoch": 22.96037296037296,
"grad_norm": 0.40387430787086487,
"learning_rate": 0.00032466822157434396,
"loss": 3.2338,
"step": 78800
},
{
"epoch": 22.974941724941726,
"grad_norm": 0.4055889844894409,
"learning_rate": 0.0003244932944606414,
"loss": 3.2283,
"step": 78850
},
{
"epoch": 22.98951048951049,
"grad_norm": 0.37491124868392944,
"learning_rate": 0.00032431836734693876,
"loss": 3.221,
"step": 78900
},
{
"epoch": 23.004079254079254,
"grad_norm": 0.43204745650291443,
"learning_rate": 0.00032414344023323614,
"loss": 3.2207,
"step": 78950
},
{
"epoch": 23.01864801864802,
"grad_norm": 0.4061754643917084,
"learning_rate": 0.0003239685131195335,
"loss": 3.1374,
"step": 79000
},
{
"epoch": 23.01864801864802,
"eval_accuracy": 0.3740811155892325,
"eval_loss": 3.54728102684021,
"eval_runtime": 180.1553,
"eval_samples_per_second": 92.376,
"eval_steps_per_second": 5.778,
"step": 79000
},
{
"epoch": 23.033216783216783,
"grad_norm": 0.4229036867618561,
"learning_rate": 0.0003237935860058309,
"loss": 3.1273,
"step": 79050
},
{
"epoch": 23.047785547785548,
"grad_norm": 0.40628716349601746,
"learning_rate": 0.00032361865889212827,
"loss": 3.1343,
"step": 79100
},
{
"epoch": 23.062354312354312,
"grad_norm": 0.4286355674266815,
"learning_rate": 0.0003234437317784256,
"loss": 3.1428,
"step": 79150
},
{
"epoch": 23.076923076923077,
"grad_norm": 0.4400956332683563,
"learning_rate": 0.00032326880466472296,
"loss": 3.1596,
"step": 79200
},
{
"epoch": 23.09149184149184,
"grad_norm": 0.43168753385543823,
"learning_rate": 0.00032309387755102034,
"loss": 3.1411,
"step": 79250
},
{
"epoch": 23.106060606060606,
"grad_norm": 0.42677047848701477,
"learning_rate": 0.00032291895043731777,
"loss": 3.1558,
"step": 79300
},
{
"epoch": 23.12062937062937,
"grad_norm": 0.425641804933548,
"learning_rate": 0.00032274402332361515,
"loss": 3.1493,
"step": 79350
},
{
"epoch": 23.135198135198134,
"grad_norm": 0.41930001974105835,
"learning_rate": 0.0003225690962099125,
"loss": 3.1602,
"step": 79400
},
{
"epoch": 23.1497668997669,
"grad_norm": 0.43074584007263184,
"learning_rate": 0.0003223941690962099,
"loss": 3.1561,
"step": 79450
},
{
"epoch": 23.164335664335663,
"grad_norm": 0.43243861198425293,
"learning_rate": 0.0003222192419825073,
"loss": 3.1647,
"step": 79500
},
{
"epoch": 23.178904428904428,
"grad_norm": 0.44113197922706604,
"learning_rate": 0.00032204431486880465,
"loss": 3.1676,
"step": 79550
},
{
"epoch": 23.193473193473192,
"grad_norm": 0.4136090874671936,
"learning_rate": 0.00032186938775510197,
"loss": 3.1604,
"step": 79600
},
{
"epoch": 23.208041958041957,
"grad_norm": 0.4486762285232544,
"learning_rate": 0.00032169446064139935,
"loss": 3.1636,
"step": 79650
},
{
"epoch": 23.22261072261072,
"grad_norm": 0.47251448035240173,
"learning_rate": 0.0003215195335276967,
"loss": 3.1705,
"step": 79700
},
{
"epoch": 23.237179487179485,
"grad_norm": 0.4380472004413605,
"learning_rate": 0.00032134460641399415,
"loss": 3.1662,
"step": 79750
},
{
"epoch": 23.251748251748253,
"grad_norm": 0.42042556405067444,
"learning_rate": 0.00032116967930029153,
"loss": 3.1799,
"step": 79800
},
{
"epoch": 23.266317016317018,
"grad_norm": 0.4275950491428375,
"learning_rate": 0.0003209947521865889,
"loss": 3.1781,
"step": 79850
},
{
"epoch": 23.280885780885782,
"grad_norm": 0.44534599781036377,
"learning_rate": 0.0003208198250728863,
"loss": 3.1668,
"step": 79900
},
{
"epoch": 23.295454545454547,
"grad_norm": 0.4342590272426605,
"learning_rate": 0.00032064489795918366,
"loss": 3.1745,
"step": 79950
},
{
"epoch": 23.31002331002331,
"grad_norm": 0.4515496492385864,
"learning_rate": 0.00032046997084548103,
"loss": 3.1667,
"step": 80000
},
{
"epoch": 23.31002331002331,
"eval_accuracy": 0.3738672178072079,
"eval_loss": 3.548288345336914,
"eval_runtime": 180.3608,
"eval_samples_per_second": 92.271,
"eval_steps_per_second": 5.772,
"step": 80000
},
{
"epoch": 23.324592074592076,
"grad_norm": 0.39993399381637573,
"learning_rate": 0.00032029504373177835,
"loss": 3.123,
"step": 80050
},
{
"epoch": 23.33916083916084,
"grad_norm": 0.4498705565929413,
"learning_rate": 0.00032012011661807573,
"loss": 3.1427,
"step": 80100
},
{
"epoch": 23.353729603729604,
"grad_norm": 0.405280739068985,
"learning_rate": 0.00031994518950437316,
"loss": 3.1473,
"step": 80150
},
{
"epoch": 23.36829836829837,
"grad_norm": 0.4300670921802521,
"learning_rate": 0.00031977026239067053,
"loss": 3.1478,
"step": 80200
},
{
"epoch": 23.382867132867133,
"grad_norm": 0.4063658118247986,
"learning_rate": 0.0003195953352769679,
"loss": 3.148,
"step": 80250
},
{
"epoch": 23.397435897435898,
"grad_norm": 0.4089643359184265,
"learning_rate": 0.0003194204081632653,
"loss": 3.1584,
"step": 80300
},
{
"epoch": 23.412004662004662,
"grad_norm": 0.4139532148838043,
"learning_rate": 0.00031924548104956266,
"loss": 3.1676,
"step": 80350
},
{
"epoch": 23.426573426573427,
"grad_norm": 0.47233232855796814,
"learning_rate": 0.00031907055393586004,
"loss": 3.1632,
"step": 80400
},
{
"epoch": 23.44114219114219,
"grad_norm": 0.4001595973968506,
"learning_rate": 0.0003188956268221574,
"loss": 3.1748,
"step": 80450
},
{
"epoch": 23.455710955710956,
"grad_norm": 0.41345474123954773,
"learning_rate": 0.00031872069970845474,
"loss": 3.1672,
"step": 80500
},
{
"epoch": 23.47027972027972,
"grad_norm": 0.40790998935699463,
"learning_rate": 0.0003185457725947521,
"loss": 3.1537,
"step": 80550
},
{
"epoch": 23.484848484848484,
"grad_norm": 0.405627578496933,
"learning_rate": 0.00031837084548104954,
"loss": 3.1764,
"step": 80600
},
{
"epoch": 23.49941724941725,
"grad_norm": 0.42736586928367615,
"learning_rate": 0.0003181959183673469,
"loss": 3.1721,
"step": 80650
},
{
"epoch": 23.513986013986013,
"grad_norm": 0.4293386936187744,
"learning_rate": 0.0003180209912536443,
"loss": 3.1747,
"step": 80700
},
{
"epoch": 23.528554778554778,
"grad_norm": 0.4108065366744995,
"learning_rate": 0.00031784606413994167,
"loss": 3.1666,
"step": 80750
},
{
"epoch": 23.543123543123542,
"grad_norm": 0.4118288457393646,
"learning_rate": 0.00031767113702623904,
"loss": 3.1701,
"step": 80800
},
{
"epoch": 23.557692307692307,
"grad_norm": 0.41944974660873413,
"learning_rate": 0.0003174962099125364,
"loss": 3.1658,
"step": 80850
},
{
"epoch": 23.57226107226107,
"grad_norm": 0.4191264808177948,
"learning_rate": 0.0003173212827988338,
"loss": 3.1584,
"step": 80900
},
{
"epoch": 23.586829836829835,
"grad_norm": 0.41242533922195435,
"learning_rate": 0.0003171463556851311,
"loss": 3.1647,
"step": 80950
},
{
"epoch": 23.6013986013986,
"grad_norm": 0.4619014263153076,
"learning_rate": 0.00031697142857142855,
"loss": 3.1871,
"step": 81000
},
{
"epoch": 23.6013986013986,
"eval_accuracy": 0.3739357732810509,
"eval_loss": 3.549891233444214,
"eval_runtime": 179.5335,
"eval_samples_per_second": 92.696,
"eval_steps_per_second": 5.798,
"step": 81000
},
{
"epoch": 23.615967365967364,
"grad_norm": 0.38858968019485474,
"learning_rate": 0.0003167965014577259,
"loss": 3.1771,
"step": 81050
},
{
"epoch": 23.63053613053613,
"grad_norm": 0.43501484394073486,
"learning_rate": 0.0003166215743440233,
"loss": 3.1881,
"step": 81100
},
{
"epoch": 23.645104895104897,
"grad_norm": 0.4217482805252075,
"learning_rate": 0.0003164466472303207,
"loss": 3.1812,
"step": 81150
},
{
"epoch": 23.65967365967366,
"grad_norm": 0.4172874391078949,
"learning_rate": 0.00031627172011661805,
"loss": 3.1826,
"step": 81200
},
{
"epoch": 23.674242424242426,
"grad_norm": 0.3954772651195526,
"learning_rate": 0.0003160967930029154,
"loss": 3.1845,
"step": 81250
},
{
"epoch": 23.68881118881119,
"grad_norm": 0.4089130461215973,
"learning_rate": 0.0003159218658892128,
"loss": 3.1718,
"step": 81300
},
{
"epoch": 23.703379953379955,
"grad_norm": 0.43155068159103394,
"learning_rate": 0.00031574693877551023,
"loss": 3.2006,
"step": 81350
},
{
"epoch": 23.71794871794872,
"grad_norm": 0.4237709045410156,
"learning_rate": 0.0003155720116618075,
"loss": 3.1878,
"step": 81400
},
{
"epoch": 23.732517482517483,
"grad_norm": 0.4215518832206726,
"learning_rate": 0.00031539708454810493,
"loss": 3.1905,
"step": 81450
},
{
"epoch": 23.747086247086248,
"grad_norm": 0.44571006298065186,
"learning_rate": 0.0003152221574344023,
"loss": 3.1912,
"step": 81500
},
{
"epoch": 23.761655011655012,
"grad_norm": 0.443622887134552,
"learning_rate": 0.0003150472303206997,
"loss": 3.199,
"step": 81550
},
{
"epoch": 23.776223776223777,
"grad_norm": 0.4024413228034973,
"learning_rate": 0.00031487230320699706,
"loss": 3.1939,
"step": 81600
},
{
"epoch": 23.79079254079254,
"grad_norm": 0.45250263810157776,
"learning_rate": 0.00031469737609329443,
"loss": 3.1852,
"step": 81650
},
{
"epoch": 23.805361305361306,
"grad_norm": 0.5000615119934082,
"learning_rate": 0.0003145224489795918,
"loss": 3.1972,
"step": 81700
},
{
"epoch": 23.81993006993007,
"grad_norm": 0.40920284390449524,
"learning_rate": 0.0003143475218658892,
"loss": 3.1951,
"step": 81750
},
{
"epoch": 23.834498834498834,
"grad_norm": 0.400705486536026,
"learning_rate": 0.0003141725947521866,
"loss": 3.2042,
"step": 81800
},
{
"epoch": 23.8490675990676,
"grad_norm": 0.4304558038711548,
"learning_rate": 0.0003139976676384839,
"loss": 3.1997,
"step": 81850
},
{
"epoch": 23.863636363636363,
"grad_norm": 0.4087577164173126,
"learning_rate": 0.0003138227405247813,
"loss": 3.1963,
"step": 81900
},
{
"epoch": 23.878205128205128,
"grad_norm": 0.4356166422367096,
"learning_rate": 0.0003136478134110787,
"loss": 3.2014,
"step": 81950
},
{
"epoch": 23.892773892773892,
"grad_norm": 0.4396255314350128,
"learning_rate": 0.00031347288629737606,
"loss": 3.2005,
"step": 82000
},
{
"epoch": 23.892773892773892,
"eval_accuracy": 0.37419106304728256,
"eval_loss": 3.5441832542419434,
"eval_runtime": 189.8124,
"eval_samples_per_second": 87.676,
"eval_steps_per_second": 5.484,
"step": 82000
},
{
"epoch": 23.907342657342657,
"grad_norm": 0.49265164136886597,
"learning_rate": 0.00031329795918367344,
"loss": 3.1957,
"step": 82050
},
{
"epoch": 23.92191142191142,
"grad_norm": 0.42900264263153076,
"learning_rate": 0.0003131230320699708,
"loss": 3.2143,
"step": 82100
},
{
"epoch": 23.936480186480185,
"grad_norm": 0.42176422476768494,
"learning_rate": 0.0003129481049562682,
"loss": 3.2066,
"step": 82150
},
{
"epoch": 23.95104895104895,
"grad_norm": 0.4115244746208191,
"learning_rate": 0.00031277317784256557,
"loss": 3.1994,
"step": 82200
},
{
"epoch": 23.965617715617714,
"grad_norm": 0.4627463221549988,
"learning_rate": 0.000312598250728863,
"loss": 3.205,
"step": 82250
},
{
"epoch": 23.98018648018648,
"grad_norm": 0.4280329942703247,
"learning_rate": 0.0003124233236151603,
"loss": 3.2046,
"step": 82300
},
{
"epoch": 23.994755244755243,
"grad_norm": 0.4357800781726837,
"learning_rate": 0.0003122483965014577,
"loss": 3.2168,
"step": 82350
},
{
"epoch": 24.009324009324008,
"grad_norm": 0.44521504640579224,
"learning_rate": 0.00031207346938775507,
"loss": 3.1641,
"step": 82400
},
{
"epoch": 24.023892773892776,
"grad_norm": 0.4130154252052307,
"learning_rate": 0.00031189854227405245,
"loss": 3.1328,
"step": 82450
},
{
"epoch": 24.03846153846154,
"grad_norm": 0.4017220437526703,
"learning_rate": 0.0003117236151603498,
"loss": 3.1364,
"step": 82500
},
{
"epoch": 24.053030303030305,
"grad_norm": 0.42192500829696655,
"learning_rate": 0.0003115486880466472,
"loss": 3.1237,
"step": 82550
},
{
"epoch": 24.06759906759907,
"grad_norm": 0.44063979387283325,
"learning_rate": 0.00031137376093294457,
"loss": 3.1436,
"step": 82600
},
{
"epoch": 24.082167832167833,
"grad_norm": 0.4516114592552185,
"learning_rate": 0.000311198833819242,
"loss": 3.1318,
"step": 82650
},
{
"epoch": 24.096736596736598,
"grad_norm": 0.4406895339488983,
"learning_rate": 0.0003110239067055394,
"loss": 3.1506,
"step": 82700
},
{
"epoch": 24.111305361305362,
"grad_norm": 0.46922528743743896,
"learning_rate": 0.0003108489795918367,
"loss": 3.1586,
"step": 82750
},
{
"epoch": 24.125874125874127,
"grad_norm": 0.42122822999954224,
"learning_rate": 0.0003106740524781341,
"loss": 3.1544,
"step": 82800
},
{
"epoch": 24.14044289044289,
"grad_norm": 0.46628227829933167,
"learning_rate": 0.00031049912536443145,
"loss": 3.1556,
"step": 82850
},
{
"epoch": 24.155011655011656,
"grad_norm": 0.43666237592697144,
"learning_rate": 0.00031032419825072883,
"loss": 3.1521,
"step": 82900
},
{
"epoch": 24.16958041958042,
"grad_norm": 0.4410806894302368,
"learning_rate": 0.0003101492711370262,
"loss": 3.1573,
"step": 82950
},
{
"epoch": 24.184149184149184,
"grad_norm": 0.4260009825229645,
"learning_rate": 0.0003099743440233236,
"loss": 3.1587,
"step": 83000
},
{
"epoch": 24.184149184149184,
"eval_accuracy": 0.3736513209804914,
"eval_loss": 3.5505282878875732,
"eval_runtime": 189.3113,
"eval_samples_per_second": 87.908,
"eval_steps_per_second": 5.499,
"step": 83000
},
{
"epoch": 24.19871794871795,
"grad_norm": 0.46315836906433105,
"learning_rate": 0.00030979941690962095,
"loss": 3.1663,
"step": 83050
},
{
"epoch": 24.213286713286713,
"grad_norm": 0.40869399905204773,
"learning_rate": 0.0003096244897959184,
"loss": 3.1599,
"step": 83100
},
{
"epoch": 24.227855477855478,
"grad_norm": 0.4249207675457001,
"learning_rate": 0.00030944956268221576,
"loss": 3.1681,
"step": 83150
},
{
"epoch": 24.242424242424242,
"grad_norm": 0.442433625459671,
"learning_rate": 0.0003092746355685131,
"loss": 3.1746,
"step": 83200
},
{
"epoch": 24.256993006993007,
"grad_norm": 0.4539443552494049,
"learning_rate": 0.00030909970845481046,
"loss": 3.1707,
"step": 83250
},
{
"epoch": 24.27156177156177,
"grad_norm": 0.4181159734725952,
"learning_rate": 0.00030892478134110783,
"loss": 3.167,
"step": 83300
},
{
"epoch": 24.286130536130536,
"grad_norm": 0.45877861976623535,
"learning_rate": 0.0003087498542274052,
"loss": 3.1731,
"step": 83350
},
{
"epoch": 24.3006993006993,
"grad_norm": 0.42578256130218506,
"learning_rate": 0.0003085749271137026,
"loss": 3.1787,
"step": 83400
},
{
"epoch": 24.315268065268064,
"grad_norm": 0.46334999799728394,
"learning_rate": 0.00030839999999999996,
"loss": 3.1842,
"step": 83450
},
{
"epoch": 24.32983682983683,
"grad_norm": 0.4494231045246124,
"learning_rate": 0.0003082250728862974,
"loss": 3.1693,
"step": 83500
},
{
"epoch": 24.344405594405593,
"grad_norm": 0.5533521771430969,
"learning_rate": 0.00030805014577259477,
"loss": 3.1596,
"step": 83550
},
{
"epoch": 24.358974358974358,
"grad_norm": 0.44728219509124756,
"learning_rate": 0.00030787521865889214,
"loss": 3.1803,
"step": 83600
},
{
"epoch": 24.373543123543122,
"grad_norm": 0.43010392785072327,
"learning_rate": 0.00030770029154518946,
"loss": 3.1895,
"step": 83650
},
{
"epoch": 24.388111888111887,
"grad_norm": 0.42528676986694336,
"learning_rate": 0.00030752536443148684,
"loss": 3.1935,
"step": 83700
},
{
"epoch": 24.40268065268065,
"grad_norm": 0.41684848070144653,
"learning_rate": 0.0003073504373177842,
"loss": 3.1724,
"step": 83750
},
{
"epoch": 24.41724941724942,
"grad_norm": 0.42178863286972046,
"learning_rate": 0.0003071755102040816,
"loss": 3.1743,
"step": 83800
},
{
"epoch": 24.431818181818183,
"grad_norm": 0.3935507535934448,
"learning_rate": 0.00030700058309037897,
"loss": 3.1922,
"step": 83850
},
{
"epoch": 24.446386946386948,
"grad_norm": 0.43585166335105896,
"learning_rate": 0.00030682565597667634,
"loss": 3.1775,
"step": 83900
},
{
"epoch": 24.460955710955712,
"grad_norm": 0.4166204035282135,
"learning_rate": 0.0003066507288629738,
"loss": 3.1917,
"step": 83950
},
{
"epoch": 24.475524475524477,
"grad_norm": 0.4170168340206146,
"learning_rate": 0.00030647580174927115,
"loss": 3.1966,
"step": 84000
},
{
"epoch": 24.475524475524477,
"eval_accuracy": 0.37420364526975464,
"eval_loss": 3.542668104171753,
"eval_runtime": 189.5519,
"eval_samples_per_second": 87.797,
"eval_steps_per_second": 5.492,
"step": 84000
},
{
"epoch": 24.49009324009324,
"grad_norm": 0.4078339636325836,
"learning_rate": 0.0003063008746355685,
"loss": 3.1876,
"step": 84050
},
{
"epoch": 24.504662004662006,
"grad_norm": 0.39984050393104553,
"learning_rate": 0.00030612594752186585,
"loss": 3.1865,
"step": 84100
},
{
"epoch": 24.51923076923077,
"grad_norm": 0.4368041455745697,
"learning_rate": 0.0003059510204081632,
"loss": 3.1874,
"step": 84150
},
{
"epoch": 24.533799533799534,
"grad_norm": 0.4150752127170563,
"learning_rate": 0.0003057760932944606,
"loss": 3.1968,
"step": 84200
},
{
"epoch": 24.5483682983683,
"grad_norm": 0.4473128914833069,
"learning_rate": 0.000305601166180758,
"loss": 3.1893,
"step": 84250
},
{
"epoch": 24.562937062937063,
"grad_norm": 0.40939411520957947,
"learning_rate": 0.00030542623906705535,
"loss": 3.2025,
"step": 84300
},
{
"epoch": 24.577505827505828,
"grad_norm": 0.46430233120918274,
"learning_rate": 0.0003052513119533527,
"loss": 3.193,
"step": 84350
},
{
"epoch": 24.592074592074592,
"grad_norm": 0.4168838858604431,
"learning_rate": 0.00030507638483965016,
"loss": 3.2013,
"step": 84400
},
{
"epoch": 24.606643356643357,
"grad_norm": 0.4212021827697754,
"learning_rate": 0.00030490145772594753,
"loss": 3.1924,
"step": 84450
},
{
"epoch": 24.62121212121212,
"grad_norm": 0.4454532563686371,
"learning_rate": 0.0003047265306122449,
"loss": 3.2061,
"step": 84500
},
{
"epoch": 24.635780885780886,
"grad_norm": 0.4298340082168579,
"learning_rate": 0.00030455160349854223,
"loss": 3.2033,
"step": 84550
},
{
"epoch": 24.65034965034965,
"grad_norm": 0.4507395625114441,
"learning_rate": 0.0003043766763848396,
"loss": 3.2133,
"step": 84600
},
{
"epoch": 24.664918414918414,
"grad_norm": 0.40621644258499146,
"learning_rate": 0.000304201749271137,
"loss": 3.1939,
"step": 84650
},
{
"epoch": 24.67948717948718,
"grad_norm": 0.43747299909591675,
"learning_rate": 0.00030402682215743436,
"loss": 3.2021,
"step": 84700
},
{
"epoch": 24.694055944055943,
"grad_norm": 0.47679662704467773,
"learning_rate": 0.00030385189504373173,
"loss": 3.202,
"step": 84750
},
{
"epoch": 24.708624708624708,
"grad_norm": 0.4054070711135864,
"learning_rate": 0.00030367696793002916,
"loss": 3.206,
"step": 84800
},
{
"epoch": 24.723193473193472,
"grad_norm": 0.4033360481262207,
"learning_rate": 0.00030350204081632654,
"loss": 3.2069,
"step": 84850
},
{
"epoch": 24.737762237762237,
"grad_norm": 0.4348824620246887,
"learning_rate": 0.0003033271137026239,
"loss": 3.2131,
"step": 84900
},
{
"epoch": 24.752331002331,
"grad_norm": 0.41652145981788635,
"learning_rate": 0.0003031521865889213,
"loss": 3.2136,
"step": 84950
},
{
"epoch": 24.766899766899765,
"grad_norm": 0.4814378321170807,
"learning_rate": 0.0003029772594752186,
"loss": 3.211,
"step": 85000
},
{
"epoch": 24.766899766899765,
"eval_accuracy": 0.3745153786508142,
"eval_loss": 3.5338070392608643,
"eval_runtime": 189.6066,
"eval_samples_per_second": 87.771,
"eval_steps_per_second": 5.49,
"step": 85000
},
{
"epoch": 24.78146853146853,
"grad_norm": 0.378551185131073,
"learning_rate": 0.000302802332361516,
"loss": 3.214,
"step": 85050
},
{
"epoch": 24.796037296037294,
"grad_norm": 0.43937408924102783,
"learning_rate": 0.00030262740524781336,
"loss": 3.2026,
"step": 85100
},
{
"epoch": 24.810606060606062,
"grad_norm": 0.399384468793869,
"learning_rate": 0.00030245247813411074,
"loss": 3.2208,
"step": 85150
},
{
"epoch": 24.825174825174827,
"grad_norm": 0.4425372779369354,
"learning_rate": 0.0003022775510204081,
"loss": 3.212,
"step": 85200
},
{
"epoch": 24.83974358974359,
"grad_norm": 0.4405309855937958,
"learning_rate": 0.00030210262390670554,
"loss": 3.1984,
"step": 85250
},
{
"epoch": 24.854312354312356,
"grad_norm": 0.40886348485946655,
"learning_rate": 0.0003019276967930029,
"loss": 3.2036,
"step": 85300
},
{
"epoch": 24.86888111888112,
"grad_norm": 0.43906837701797485,
"learning_rate": 0.0003017527696793003,
"loss": 3.2168,
"step": 85350
},
{
"epoch": 24.883449883449885,
"grad_norm": 0.44705092906951904,
"learning_rate": 0.00030157784256559767,
"loss": 3.2108,
"step": 85400
},
{
"epoch": 24.89801864801865,
"grad_norm": 0.42035573720932007,
"learning_rate": 0.000301402915451895,
"loss": 3.2158,
"step": 85450
},
{
"epoch": 24.912587412587413,
"grad_norm": 0.4399532377719879,
"learning_rate": 0.00030122798833819237,
"loss": 3.2247,
"step": 85500
},
{
"epoch": 24.927156177156178,
"grad_norm": 0.42403891682624817,
"learning_rate": 0.00030105306122448974,
"loss": 3.2092,
"step": 85550
},
{
"epoch": 24.941724941724942,
"grad_norm": 0.40814968943595886,
"learning_rate": 0.0003008781341107871,
"loss": 3.2136,
"step": 85600
},
{
"epoch": 24.956293706293707,
"grad_norm": 0.4622436761856079,
"learning_rate": 0.0003007032069970845,
"loss": 3.2217,
"step": 85650
},
{
"epoch": 24.97086247086247,
"grad_norm": 0.44937148690223694,
"learning_rate": 0.0003005282798833819,
"loss": 3.2059,
"step": 85700
},
{
"epoch": 24.985431235431236,
"grad_norm": 0.43933066725730896,
"learning_rate": 0.0003003533527696793,
"loss": 3.2154,
"step": 85750
},
{
"epoch": 25.0,
"grad_norm": 0.39813855290412903,
"learning_rate": 0.0003001784256559767,
"loss": 3.2131,
"step": 85800
},
{
"epoch": 25.014568764568764,
"grad_norm": 0.4538300633430481,
"learning_rate": 0.00030000349854227405,
"loss": 3.116,
"step": 85850
},
{
"epoch": 25.02913752913753,
"grad_norm": 0.41572535037994385,
"learning_rate": 0.00029982857142857143,
"loss": 3.1081,
"step": 85900
},
{
"epoch": 25.043706293706293,
"grad_norm": 0.4343355596065521,
"learning_rate": 0.0002996536443148688,
"loss": 3.1264,
"step": 85950
},
{
"epoch": 25.058275058275058,
"grad_norm": 0.47448500990867615,
"learning_rate": 0.0002994787172011661,
"loss": 3.1336,
"step": 86000
},
{
"epoch": 25.058275058275058,
"eval_accuracy": 0.37419223895592485,
"eval_loss": 3.5482711791992188,
"eval_runtime": 188.7946,
"eval_samples_per_second": 88.149,
"eval_steps_per_second": 5.514,
"step": 86000
},
{
"epoch": 25.072843822843822,
"grad_norm": 0.4407516419887543,
"learning_rate": 0.0002993037900874635,
"loss": 3.1332,
"step": 86050
},
{
"epoch": 25.087412587412587,
"grad_norm": 0.42840850353240967,
"learning_rate": 0.00029912886297376093,
"loss": 3.1299,
"step": 86100
},
{
"epoch": 25.10198135198135,
"grad_norm": 0.4274645745754242,
"learning_rate": 0.0002989539358600583,
"loss": 3.1388,
"step": 86150
},
{
"epoch": 25.116550116550115,
"grad_norm": 0.4566427767276764,
"learning_rate": 0.0002987790087463557,
"loss": 3.1259,
"step": 86200
},
{
"epoch": 25.13111888111888,
"grad_norm": 0.40977421402931213,
"learning_rate": 0.000298604081632653,
"loss": 3.1378,
"step": 86250
},
{
"epoch": 25.145687645687644,
"grad_norm": 0.43461692333221436,
"learning_rate": 0.00029842915451895044,
"loss": 3.1386,
"step": 86300
},
{
"epoch": 25.16025641025641,
"grad_norm": 0.42161956429481506,
"learning_rate": 0.0002982542274052478,
"loss": 3.1499,
"step": 86350
},
{
"epoch": 25.174825174825173,
"grad_norm": 0.42874354124069214,
"learning_rate": 0.0002980793002915452,
"loss": 3.1479,
"step": 86400
},
{
"epoch": 25.189393939393938,
"grad_norm": 0.4363137185573578,
"learning_rate": 0.0002979043731778425,
"loss": 3.1367,
"step": 86450
},
{
"epoch": 25.203962703962706,
"grad_norm": 0.43857312202453613,
"learning_rate": 0.0002977294460641399,
"loss": 3.1444,
"step": 86500
},
{
"epoch": 25.21853146853147,
"grad_norm": 0.4245397448539734,
"learning_rate": 0.0002975545189504373,
"loss": 3.1602,
"step": 86550
},
{
"epoch": 25.233100233100235,
"grad_norm": 0.42843955755233765,
"learning_rate": 0.0002973795918367347,
"loss": 3.1556,
"step": 86600
},
{
"epoch": 25.247668997669,
"grad_norm": 0.41731715202331543,
"learning_rate": 0.00029720466472303207,
"loss": 3.1485,
"step": 86650
},
{
"epoch": 25.262237762237763,
"grad_norm": 0.4179287254810333,
"learning_rate": 0.0002970297376093294,
"loss": 3.1711,
"step": 86700
},
{
"epoch": 25.276806526806528,
"grad_norm": 0.4194510579109192,
"learning_rate": 0.0002968548104956268,
"loss": 3.1617,
"step": 86750
},
{
"epoch": 25.291375291375292,
"grad_norm": 0.420640766620636,
"learning_rate": 0.0002966798833819242,
"loss": 3.1645,
"step": 86800
},
{
"epoch": 25.305944055944057,
"grad_norm": 0.44683873653411865,
"learning_rate": 0.00029650495626822157,
"loss": 3.1687,
"step": 86850
},
{
"epoch": 25.32051282051282,
"grad_norm": 0.44092994928359985,
"learning_rate": 0.0002963300291545189,
"loss": 3.1555,
"step": 86900
},
{
"epoch": 25.335081585081586,
"grad_norm": 0.46043768525123596,
"learning_rate": 0.0002961551020408163,
"loss": 3.1795,
"step": 86950
},
{
"epoch": 25.34965034965035,
"grad_norm": 0.43037664890289307,
"learning_rate": 0.0002959801749271137,
"loss": 3.1568,
"step": 87000
},
{
"epoch": 25.34965034965035,
"eval_accuracy": 0.37438896847177267,
"eval_loss": 3.5446794033050537,
"eval_runtime": 180.0732,
"eval_samples_per_second": 92.418,
"eval_steps_per_second": 5.781,
"step": 87000
},
{
"epoch": 25.364219114219114,
"grad_norm": 0.46643906831741333,
"learning_rate": 0.00029580524781341107,
"loss": 3.1654,
"step": 87050
},
{
"epoch": 25.37878787878788,
"grad_norm": 0.46620315313339233,
"learning_rate": 0.00029563032069970845,
"loss": 3.1712,
"step": 87100
},
{
"epoch": 25.393356643356643,
"grad_norm": 0.4388134777545929,
"learning_rate": 0.00029545539358600577,
"loss": 3.1756,
"step": 87150
},
{
"epoch": 25.407925407925408,
"grad_norm": 0.4645821154117584,
"learning_rate": 0.0002952804664723032,
"loss": 3.1865,
"step": 87200
},
{
"epoch": 25.422494172494172,
"grad_norm": 0.4313008487224579,
"learning_rate": 0.0002951055393586006,
"loss": 3.1656,
"step": 87250
},
{
"epoch": 25.437062937062937,
"grad_norm": 0.4366571307182312,
"learning_rate": 0.00029493061224489795,
"loss": 3.1726,
"step": 87300
},
{
"epoch": 25.4516317016317,
"grad_norm": 0.4275836944580078,
"learning_rate": 0.00029475568513119527,
"loss": 3.1824,
"step": 87350
},
{
"epoch": 25.466200466200466,
"grad_norm": 0.41847360134124756,
"learning_rate": 0.0002945807580174927,
"loss": 3.165,
"step": 87400
},
{
"epoch": 25.48076923076923,
"grad_norm": 0.4382728636264801,
"learning_rate": 0.0002944058309037901,
"loss": 3.1943,
"step": 87450
},
{
"epoch": 25.495337995337994,
"grad_norm": 0.45048829913139343,
"learning_rate": 0.00029423090379008745,
"loss": 3.177,
"step": 87500
},
{
"epoch": 25.50990675990676,
"grad_norm": 0.44650429487228394,
"learning_rate": 0.00029405597667638483,
"loss": 3.1752,
"step": 87550
},
{
"epoch": 25.524475524475523,
"grad_norm": 0.4350736439228058,
"learning_rate": 0.0002938810495626822,
"loss": 3.176,
"step": 87600
},
{
"epoch": 25.539044289044288,
"grad_norm": 0.43065324425697327,
"learning_rate": 0.0002937061224489796,
"loss": 3.1841,
"step": 87650
},
{
"epoch": 25.553613053613052,
"grad_norm": 0.494675874710083,
"learning_rate": 0.00029353119533527696,
"loss": 3.1645,
"step": 87700
},
{
"epoch": 25.568181818181817,
"grad_norm": 0.4301419258117676,
"learning_rate": 0.00029335626822157433,
"loss": 3.1787,
"step": 87750
},
{
"epoch": 25.582750582750585,
"grad_norm": 0.4418124854564667,
"learning_rate": 0.00029318134110787166,
"loss": 3.1955,
"step": 87800
},
{
"epoch": 25.59731934731935,
"grad_norm": 0.4510349929332733,
"learning_rate": 0.0002930064139941691,
"loss": 3.2034,
"step": 87850
},
{
"epoch": 25.611888111888113,
"grad_norm": 0.44014590978622437,
"learning_rate": 0.00029283148688046646,
"loss": 3.1952,
"step": 87900
},
{
"epoch": 25.626456876456878,
"grad_norm": 0.4308202862739563,
"learning_rate": 0.00029265655976676384,
"loss": 3.196,
"step": 87950
},
{
"epoch": 25.641025641025642,
"grad_norm": 0.4513147175312042,
"learning_rate": 0.0002924816326530612,
"loss": 3.1885,
"step": 88000
},
{
"epoch": 25.641025641025642,
"eval_accuracy": 0.37464590451010354,
"eval_loss": 3.537010908126831,
"eval_runtime": 180.0096,
"eval_samples_per_second": 92.451,
"eval_steps_per_second": 5.783,
"step": 88000
},
{
"epoch": 25.655594405594407,
"grad_norm": 0.41755375266075134,
"learning_rate": 0.0002923067055393586,
"loss": 3.1938,
"step": 88050
},
{
"epoch": 25.67016317016317,
"grad_norm": 0.43336576223373413,
"learning_rate": 0.00029213177842565596,
"loss": 3.1946,
"step": 88100
},
{
"epoch": 25.684731934731936,
"grad_norm": 0.4581342041492462,
"learning_rate": 0.00029195685131195334,
"loss": 3.1944,
"step": 88150
},
{
"epoch": 25.6993006993007,
"grad_norm": 0.46270158886909485,
"learning_rate": 0.0002917819241982507,
"loss": 3.186,
"step": 88200
},
{
"epoch": 25.713869463869464,
"grad_norm": 0.4232158958911896,
"learning_rate": 0.0002916069970845481,
"loss": 3.1969,
"step": 88250
},
{
"epoch": 25.72843822843823,
"grad_norm": 0.4254876673221588,
"learning_rate": 0.00029143206997084547,
"loss": 3.2034,
"step": 88300
},
{
"epoch": 25.743006993006993,
"grad_norm": 0.4381236433982849,
"learning_rate": 0.00029125714285714284,
"loss": 3.1831,
"step": 88350
},
{
"epoch": 25.757575757575758,
"grad_norm": 0.4526331424713135,
"learning_rate": 0.0002910822157434402,
"loss": 3.1882,
"step": 88400
},
{
"epoch": 25.772144522144522,
"grad_norm": 0.43877992033958435,
"learning_rate": 0.0002909072886297376,
"loss": 3.1952,
"step": 88450
},
{
"epoch": 25.786713286713287,
"grad_norm": 0.42094627022743225,
"learning_rate": 0.00029073236151603497,
"loss": 3.1945,
"step": 88500
},
{
"epoch": 25.80128205128205,
"grad_norm": 0.4453929662704468,
"learning_rate": 0.00029055743440233235,
"loss": 3.1953,
"step": 88550
},
{
"epoch": 25.815850815850816,
"grad_norm": 0.4521614909172058,
"learning_rate": 0.0002903825072886297,
"loss": 3.2105,
"step": 88600
},
{
"epoch": 25.83041958041958,
"grad_norm": 0.4133826494216919,
"learning_rate": 0.0002902075801749271,
"loss": 3.194,
"step": 88650
},
{
"epoch": 25.844988344988344,
"grad_norm": 0.41386139392852783,
"learning_rate": 0.0002900326530612245,
"loss": 3.2099,
"step": 88700
},
{
"epoch": 25.85955710955711,
"grad_norm": 0.4450267255306244,
"learning_rate": 0.00028985772594752185,
"loss": 3.1998,
"step": 88750
},
{
"epoch": 25.874125874125873,
"grad_norm": 0.4442254602909088,
"learning_rate": 0.0002896827988338192,
"loss": 3.2076,
"step": 88800
},
{
"epoch": 25.888694638694638,
"grad_norm": 0.45003944635391235,
"learning_rate": 0.0002895078717201166,
"loss": 3.2119,
"step": 88850
},
{
"epoch": 25.903263403263402,
"grad_norm": 0.4601682722568512,
"learning_rate": 0.000289332944606414,
"loss": 3.2007,
"step": 88900
},
{
"epoch": 25.917832167832167,
"grad_norm": 0.41649630665779114,
"learning_rate": 0.00028915801749271135,
"loss": 3.2134,
"step": 88950
},
{
"epoch": 25.93240093240093,
"grad_norm": 0.4264170527458191,
"learning_rate": 0.00028898309037900873,
"loss": 3.2063,
"step": 89000
},
{
"epoch": 25.93240093240093,
"eval_accuracy": 0.37524891046184755,
"eval_loss": 3.5297417640686035,
"eval_runtime": 180.0669,
"eval_samples_per_second": 92.421,
"eval_steps_per_second": 5.781,
"step": 89000
},
{
"epoch": 25.946969696969695,
"grad_norm": 0.4318842589855194,
"learning_rate": 0.0002888081632653061,
"loss": 3.2068,
"step": 89050
},
{
"epoch": 25.96153846153846,
"grad_norm": 0.40909937024116516,
"learning_rate": 0.0002886332361516035,
"loss": 3.1974,
"step": 89100
},
{
"epoch": 25.976107226107224,
"grad_norm": 0.45897090435028076,
"learning_rate": 0.00028845830903790086,
"loss": 3.2034,
"step": 89150
},
{
"epoch": 25.990675990675992,
"grad_norm": 0.4648137390613556,
"learning_rate": 0.00028828338192419823,
"loss": 3.2068,
"step": 89200
},
{
"epoch": 26.005244755244757,
"grad_norm": 0.4189426600933075,
"learning_rate": 0.0002881084548104956,
"loss": 3.1742,
"step": 89250
},
{
"epoch": 26.01981351981352,
"grad_norm": 0.42856845259666443,
"learning_rate": 0.000287933527696793,
"loss": 3.1066,
"step": 89300
},
{
"epoch": 26.034382284382286,
"grad_norm": 0.4568479061126709,
"learning_rate": 0.00028775860058309036,
"loss": 3.1162,
"step": 89350
},
{
"epoch": 26.04895104895105,
"grad_norm": 0.4555512070655823,
"learning_rate": 0.00028758367346938773,
"loss": 3.1287,
"step": 89400
},
{
"epoch": 26.063519813519815,
"grad_norm": 0.45386287569999695,
"learning_rate": 0.0002874087463556851,
"loss": 3.1103,
"step": 89450
},
{
"epoch": 26.07808857808858,
"grad_norm": 0.45175591111183167,
"learning_rate": 0.0002872338192419825,
"loss": 3.1168,
"step": 89500
},
{
"epoch": 26.092657342657343,
"grad_norm": 0.4288419783115387,
"learning_rate": 0.00028705889212827986,
"loss": 3.1154,
"step": 89550
},
{
"epoch": 26.107226107226108,
"grad_norm": 0.4286247193813324,
"learning_rate": 0.00028688396501457724,
"loss": 3.118,
"step": 89600
},
{
"epoch": 26.121794871794872,
"grad_norm": 0.44961321353912354,
"learning_rate": 0.0002867090379008746,
"loss": 3.1236,
"step": 89650
},
{
"epoch": 26.136363636363637,
"grad_norm": 0.42312365770339966,
"learning_rate": 0.000286534110787172,
"loss": 3.1243,
"step": 89700
},
{
"epoch": 26.1509324009324,
"grad_norm": 0.43461480736732483,
"learning_rate": 0.00028635918367346937,
"loss": 3.1339,
"step": 89750
},
{
"epoch": 26.165501165501166,
"grad_norm": 0.4479959309101105,
"learning_rate": 0.00028618425655976674,
"loss": 3.1364,
"step": 89800
},
{
"epoch": 26.18006993006993,
"grad_norm": 0.44000333547592163,
"learning_rate": 0.0002860093294460641,
"loss": 3.1317,
"step": 89850
},
{
"epoch": 26.194638694638694,
"grad_norm": 0.42374539375305176,
"learning_rate": 0.0002858344023323615,
"loss": 3.1386,
"step": 89900
},
{
"epoch": 26.20920745920746,
"grad_norm": 0.49846577644348145,
"learning_rate": 0.00028565947521865887,
"loss": 3.1444,
"step": 89950
},
{
"epoch": 26.223776223776223,
"grad_norm": 0.42031097412109375,
"learning_rate": 0.00028548454810495624,
"loss": 3.1457,
"step": 90000
},
{
"epoch": 26.223776223776223,
"eval_accuracy": 0.3741142762129439,
"eval_loss": 3.549098491668701,
"eval_runtime": 180.1606,
"eval_samples_per_second": 92.373,
"eval_steps_per_second": 5.778,
"step": 90000
},
{
"epoch": 26.238344988344988,
"grad_norm": 0.45871150493621826,
"learning_rate": 0.0002853096209912536,
"loss": 3.1347,
"step": 90050
},
{
"epoch": 26.252913752913752,
"grad_norm": 0.4206872284412384,
"learning_rate": 0.000285134693877551,
"loss": 3.1534,
"step": 90100
},
{
"epoch": 26.267482517482517,
"grad_norm": 0.4632352888584137,
"learning_rate": 0.00028495976676384837,
"loss": 3.1387,
"step": 90150
},
{
"epoch": 26.28205128205128,
"grad_norm": 0.45894187688827515,
"learning_rate": 0.00028478483965014575,
"loss": 3.1571,
"step": 90200
},
{
"epoch": 26.296620046620045,
"grad_norm": 0.4624476730823517,
"learning_rate": 0.0002846099125364431,
"loss": 3.1518,
"step": 90250
},
{
"epoch": 26.31118881118881,
"grad_norm": 0.42303577065467834,
"learning_rate": 0.0002844349854227405,
"loss": 3.1664,
"step": 90300
},
{
"epoch": 26.325757575757574,
"grad_norm": 0.45280689001083374,
"learning_rate": 0.0002842600583090379,
"loss": 3.1568,
"step": 90350
},
{
"epoch": 26.34032634032634,
"grad_norm": 0.46011117100715637,
"learning_rate": 0.00028408513119533525,
"loss": 3.1678,
"step": 90400
},
{
"epoch": 26.354895104895103,
"grad_norm": 0.41634801030158997,
"learning_rate": 0.0002839102040816326,
"loss": 3.1702,
"step": 90450
},
{
"epoch": 26.36946386946387,
"grad_norm": 0.42648565769195557,
"learning_rate": 0.00028373527696793,
"loss": 3.1612,
"step": 90500
},
{
"epoch": 26.384032634032636,
"grad_norm": 0.45400530099868774,
"learning_rate": 0.0002835603498542274,
"loss": 3.1615,
"step": 90550
},
{
"epoch": 26.3986013986014,
"grad_norm": 0.4465737044811249,
"learning_rate": 0.00028338542274052475,
"loss": 3.1665,
"step": 90600
},
{
"epoch": 26.413170163170165,
"grad_norm": 0.43689030408859253,
"learning_rate": 0.00028321049562682213,
"loss": 3.1611,
"step": 90650
},
{
"epoch": 26.42773892773893,
"grad_norm": 0.45843949913978577,
"learning_rate": 0.0002830355685131195,
"loss": 3.1735,
"step": 90700
},
{
"epoch": 26.442307692307693,
"grad_norm": 0.4294235408306122,
"learning_rate": 0.0002828606413994169,
"loss": 3.1689,
"step": 90750
},
{
"epoch": 26.456876456876458,
"grad_norm": 0.43327438831329346,
"learning_rate": 0.00028268571428571426,
"loss": 3.1698,
"step": 90800
},
{
"epoch": 26.471445221445222,
"grad_norm": 0.42804962396621704,
"learning_rate": 0.00028251078717201163,
"loss": 3.1556,
"step": 90850
},
{
"epoch": 26.486013986013987,
"grad_norm": 0.43207576870918274,
"learning_rate": 0.000282335860058309,
"loss": 3.1692,
"step": 90900
},
{
"epoch": 26.50058275058275,
"grad_norm": 0.43317896127700806,
"learning_rate": 0.0002821609329446064,
"loss": 3.1605,
"step": 90950
},
{
"epoch": 26.515151515151516,
"grad_norm": 0.47736456990242004,
"learning_rate": 0.00028198600583090376,
"loss": 3.1848,
"step": 91000
},
{
"epoch": 26.515151515151516,
"eval_accuracy": 0.37442060041424907,
"eval_loss": 3.5440564155578613,
"eval_runtime": 179.9678,
"eval_samples_per_second": 92.472,
"eval_steps_per_second": 5.784,
"step": 91000
},
{
"epoch": 26.52972027972028,
"grad_norm": 0.42366132140159607,
"learning_rate": 0.00028181107871720114,
"loss": 3.1739,
"step": 91050
},
{
"epoch": 26.544289044289044,
"grad_norm": 0.4395405054092407,
"learning_rate": 0.0002816361516034985,
"loss": 3.1667,
"step": 91100
},
{
"epoch": 26.55885780885781,
"grad_norm": 0.41824859380722046,
"learning_rate": 0.0002814612244897959,
"loss": 3.1733,
"step": 91150
},
{
"epoch": 26.573426573426573,
"grad_norm": 0.4543924331665039,
"learning_rate": 0.00028128629737609326,
"loss": 3.179,
"step": 91200
},
{
"epoch": 26.587995337995338,
"grad_norm": 0.44957053661346436,
"learning_rate": 0.00028111137026239064,
"loss": 3.1781,
"step": 91250
},
{
"epoch": 26.602564102564102,
"grad_norm": 0.4380970895290375,
"learning_rate": 0.000280936443148688,
"loss": 3.1863,
"step": 91300
},
{
"epoch": 26.617132867132867,
"grad_norm": 0.4364950656890869,
"learning_rate": 0.0002807615160349854,
"loss": 3.1805,
"step": 91350
},
{
"epoch": 26.63170163170163,
"grad_norm": 0.46392178535461426,
"learning_rate": 0.00028058658892128277,
"loss": 3.1899,
"step": 91400
},
{
"epoch": 26.646270396270396,
"grad_norm": 0.44661247730255127,
"learning_rate": 0.00028041166180758014,
"loss": 3.1845,
"step": 91450
},
{
"epoch": 26.66083916083916,
"grad_norm": 0.4507148563861847,
"learning_rate": 0.0002802367346938775,
"loss": 3.1823,
"step": 91500
},
{
"epoch": 26.675407925407924,
"grad_norm": 0.4523918926715851,
"learning_rate": 0.0002800618075801749,
"loss": 3.1867,
"step": 91550
},
{
"epoch": 26.68997668997669,
"grad_norm": 0.4648110270500183,
"learning_rate": 0.00027988688046647227,
"loss": 3.1882,
"step": 91600
},
{
"epoch": 26.704545454545453,
"grad_norm": 0.4386864900588989,
"learning_rate": 0.00027971195335276965,
"loss": 3.1715,
"step": 91650
},
{
"epoch": 26.719114219114218,
"grad_norm": 0.43069586157798767,
"learning_rate": 0.000279537026239067,
"loss": 3.1736,
"step": 91700
},
{
"epoch": 26.733682983682982,
"grad_norm": 0.4628106355667114,
"learning_rate": 0.0002793620991253644,
"loss": 3.1872,
"step": 91750
},
{
"epoch": 26.748251748251747,
"grad_norm": 0.43363499641418457,
"learning_rate": 0.00027918717201166177,
"loss": 3.1906,
"step": 91800
},
{
"epoch": 26.76282051282051,
"grad_norm": 0.4345760643482208,
"learning_rate": 0.00027901224489795915,
"loss": 3.1949,
"step": 91850
},
{
"epoch": 26.77738927738928,
"grad_norm": 0.4561176896095276,
"learning_rate": 0.0002788373177842565,
"loss": 3.1857,
"step": 91900
},
{
"epoch": 26.791958041958043,
"grad_norm": 0.44225165247917175,
"learning_rate": 0.0002786623906705539,
"loss": 3.1934,
"step": 91950
},
{
"epoch": 26.806526806526808,
"grad_norm": 0.43117964267730713,
"learning_rate": 0.0002784874635568513,
"loss": 3.1945,
"step": 92000
},
{
"epoch": 26.806526806526808,
"eval_accuracy": 0.37495940175412645,
"eval_loss": 3.5328190326690674,
"eval_runtime": 180.0693,
"eval_samples_per_second": 92.42,
"eval_steps_per_second": 5.781,
"step": 92000
},
{
"epoch": 26.821095571095572,
"grad_norm": 0.4259069859981537,
"learning_rate": 0.00027831253644314865,
"loss": 3.192,
"step": 92050
},
{
"epoch": 26.835664335664337,
"grad_norm": 0.41003715991973877,
"learning_rate": 0.00027813760932944603,
"loss": 3.1884,
"step": 92100
},
{
"epoch": 26.8502331002331,
"grad_norm": 0.42846834659576416,
"learning_rate": 0.0002779626822157434,
"loss": 3.1897,
"step": 92150
},
{
"epoch": 26.864801864801866,
"grad_norm": 0.4506886899471283,
"learning_rate": 0.0002777877551020408,
"loss": 3.1907,
"step": 92200
},
{
"epoch": 26.87937062937063,
"grad_norm": 0.45437026023864746,
"learning_rate": 0.0002776128279883382,
"loss": 3.1979,
"step": 92250
},
{
"epoch": 26.893939393939394,
"grad_norm": 0.43997761607170105,
"learning_rate": 0.00027743790087463553,
"loss": 3.1893,
"step": 92300
},
{
"epoch": 26.90850815850816,
"grad_norm": 0.4390334188938141,
"learning_rate": 0.0002772629737609329,
"loss": 3.1929,
"step": 92350
},
{
"epoch": 26.923076923076923,
"grad_norm": 0.41957342624664307,
"learning_rate": 0.0002770880466472303,
"loss": 3.1948,
"step": 92400
},
{
"epoch": 26.937645687645688,
"grad_norm": 0.43660280108451843,
"learning_rate": 0.00027691311953352766,
"loss": 3.1896,
"step": 92450
},
{
"epoch": 26.952214452214452,
"grad_norm": 0.42752009630203247,
"learning_rate": 0.00027673819241982503,
"loss": 3.2083,
"step": 92500
},
{
"epoch": 26.966783216783217,
"grad_norm": 0.4077240526676178,
"learning_rate": 0.0002765632653061224,
"loss": 3.1988,
"step": 92550
},
{
"epoch": 26.98135198135198,
"grad_norm": 0.44178691506385803,
"learning_rate": 0.0002763883381924198,
"loss": 3.1943,
"step": 92600
},
{
"epoch": 26.995920745920746,
"grad_norm": 0.4497985541820526,
"learning_rate": 0.00027621341107871716,
"loss": 3.1956,
"step": 92650
},
{
"epoch": 27.01048951048951,
"grad_norm": 0.4393564760684967,
"learning_rate": 0.0002760384839650146,
"loss": 3.1167,
"step": 92700
},
{
"epoch": 27.025058275058274,
"grad_norm": 0.42496976256370544,
"learning_rate": 0.0002758635568513119,
"loss": 3.1006,
"step": 92750
},
{
"epoch": 27.03962703962704,
"grad_norm": 0.4464763104915619,
"learning_rate": 0.0002756886297376093,
"loss": 3.1044,
"step": 92800
},
{
"epoch": 27.054195804195803,
"grad_norm": 0.4169905185699463,
"learning_rate": 0.00027551370262390666,
"loss": 3.1068,
"step": 92850
},
{
"epoch": 27.068764568764568,
"grad_norm": 0.441429078578949,
"learning_rate": 0.0002753387755102041,
"loss": 3.1149,
"step": 92900
},
{
"epoch": 27.083333333333332,
"grad_norm": 0.41629430651664734,
"learning_rate": 0.0002751638483965014,
"loss": 3.1169,
"step": 92950
},
{
"epoch": 27.097902097902097,
"grad_norm": 0.4784495234489441,
"learning_rate": 0.0002749889212827988,
"loss": 3.1137,
"step": 93000
},
{
"epoch": 27.097902097902097,
"eval_accuracy": 0.37444059086116727,
"eval_loss": 3.551065683364868,
"eval_runtime": 180.0346,
"eval_samples_per_second": 92.438,
"eval_steps_per_second": 5.782,
"step": 93000
},
{
"epoch": 27.11247086247086,
"grad_norm": 0.43689513206481934,
"learning_rate": 0.00027481399416909617,
"loss": 3.1218,
"step": 93050
},
{
"epoch": 27.127039627039625,
"grad_norm": 0.4687103033065796,
"learning_rate": 0.00027463906705539354,
"loss": 3.1118,
"step": 93100
},
{
"epoch": 27.14160839160839,
"grad_norm": 0.42937007546424866,
"learning_rate": 0.000274464139941691,
"loss": 3.1084,
"step": 93150
},
{
"epoch": 27.156177156177158,
"grad_norm": 0.4537714719772339,
"learning_rate": 0.0002742892128279883,
"loss": 3.1285,
"step": 93200
},
{
"epoch": 27.170745920745922,
"grad_norm": 0.4483446180820465,
"learning_rate": 0.00027411428571428567,
"loss": 3.1191,
"step": 93250
},
{
"epoch": 27.185314685314687,
"grad_norm": 0.4289822280406952,
"learning_rate": 0.00027393935860058305,
"loss": 3.1271,
"step": 93300
},
{
"epoch": 27.19988344988345,
"grad_norm": 0.4680139720439911,
"learning_rate": 0.0002737644314868805,
"loss": 3.1336,
"step": 93350
},
{
"epoch": 27.214452214452216,
"grad_norm": 0.4301610589027405,
"learning_rate": 0.0002735895043731778,
"loss": 3.1374,
"step": 93400
},
{
"epoch": 27.22902097902098,
"grad_norm": 0.4446755349636078,
"learning_rate": 0.0002734145772594752,
"loss": 3.1381,
"step": 93450
},
{
"epoch": 27.243589743589745,
"grad_norm": 0.4434562623500824,
"learning_rate": 0.00027323965014577255,
"loss": 3.1335,
"step": 93500
},
{
"epoch": 27.25815850815851,
"grad_norm": 0.49129512906074524,
"learning_rate": 0.00027306472303207,
"loss": 3.1463,
"step": 93550
},
{
"epoch": 27.272727272727273,
"grad_norm": 0.43248531222343445,
"learning_rate": 0.00027288979591836736,
"loss": 3.1415,
"step": 93600
},
{
"epoch": 27.287296037296038,
"grad_norm": 0.46865126490592957,
"learning_rate": 0.0002727148688046647,
"loss": 3.1394,
"step": 93650
},
{
"epoch": 27.301864801864802,
"grad_norm": 0.46066224575042725,
"learning_rate": 0.00027253994169096205,
"loss": 3.1466,
"step": 93700
},
{
"epoch": 27.316433566433567,
"grad_norm": 0.44381311535835266,
"learning_rate": 0.00027236501457725943,
"loss": 3.1549,
"step": 93750
},
{
"epoch": 27.33100233100233,
"grad_norm": 0.4424683153629303,
"learning_rate": 0.00027219008746355686,
"loss": 3.1552,
"step": 93800
},
{
"epoch": 27.345571095571096,
"grad_norm": 0.4684559404850006,
"learning_rate": 0.0002720151603498542,
"loss": 3.1601,
"step": 93850
},
{
"epoch": 27.36013986013986,
"grad_norm": 0.4258364737033844,
"learning_rate": 0.00027184023323615156,
"loss": 3.1472,
"step": 93900
},
{
"epoch": 27.374708624708624,
"grad_norm": 0.43364837765693665,
"learning_rate": 0.00027166530612244893,
"loss": 3.1483,
"step": 93950
},
{
"epoch": 27.38927738927739,
"grad_norm": 0.5024469494819641,
"learning_rate": 0.00027149037900874636,
"loss": 3.1564,
"step": 94000
},
{
"epoch": 27.38927738927739,
"eval_accuracy": 0.3745912247582391,
"eval_loss": 3.5458009243011475,
"eval_runtime": 179.9539,
"eval_samples_per_second": 92.479,
"eval_steps_per_second": 5.785,
"step": 94000
},
{
"epoch": 27.403846153846153,
"grad_norm": 0.4278470575809479,
"learning_rate": 0.00027131545189504374,
"loss": 3.1459,
"step": 94050
},
{
"epoch": 27.418414918414918,
"grad_norm": 0.43033117055892944,
"learning_rate": 0.00027114052478134106,
"loss": 3.1585,
"step": 94100
},
{
"epoch": 27.432983682983682,
"grad_norm": 0.4715587794780731,
"learning_rate": 0.00027096559766763843,
"loss": 3.1575,
"step": 94150
},
{
"epoch": 27.447552447552447,
"grad_norm": 0.4445301592350006,
"learning_rate": 0.00027079067055393586,
"loss": 3.1454,
"step": 94200
},
{
"epoch": 27.46212121212121,
"grad_norm": 0.45014679431915283,
"learning_rate": 0.00027061574344023324,
"loss": 3.15,
"step": 94250
},
{
"epoch": 27.476689976689975,
"grad_norm": 0.47272783517837524,
"learning_rate": 0.00027044081632653056,
"loss": 3.1527,
"step": 94300
},
{
"epoch": 27.49125874125874,
"grad_norm": 0.47329598665237427,
"learning_rate": 0.00027026588921282794,
"loss": 3.1613,
"step": 94350
},
{
"epoch": 27.505827505827504,
"grad_norm": 0.4669019877910614,
"learning_rate": 0.0002700909620991253,
"loss": 3.1612,
"step": 94400
},
{
"epoch": 27.52039627039627,
"grad_norm": 0.43642956018447876,
"learning_rate": 0.00026991603498542274,
"loss": 3.1634,
"step": 94450
},
{
"epoch": 27.534965034965033,
"grad_norm": 0.45903342962265015,
"learning_rate": 0.0002697411078717201,
"loss": 3.1715,
"step": 94500
},
{
"epoch": 27.5495337995338,
"grad_norm": 0.43123385310173035,
"learning_rate": 0.00026956618075801744,
"loss": 3.1693,
"step": 94550
},
{
"epoch": 27.564102564102566,
"grad_norm": 0.43567144870758057,
"learning_rate": 0.0002693912536443148,
"loss": 3.1722,
"step": 94600
},
{
"epoch": 27.57867132867133,
"grad_norm": 0.4707465171813965,
"learning_rate": 0.00026921632653061225,
"loss": 3.1703,
"step": 94650
},
{
"epoch": 27.593240093240095,
"grad_norm": 0.4384394586086273,
"learning_rate": 0.0002690413994169096,
"loss": 3.1706,
"step": 94700
},
{
"epoch": 27.60780885780886,
"grad_norm": 0.4365791976451874,
"learning_rate": 0.00026886647230320694,
"loss": 3.1742,
"step": 94750
},
{
"epoch": 27.622377622377623,
"grad_norm": 0.41515985131263733,
"learning_rate": 0.0002686915451895043,
"loss": 3.1535,
"step": 94800
},
{
"epoch": 27.636946386946388,
"grad_norm": 0.4355006217956543,
"learning_rate": 0.00026851661807580175,
"loss": 3.1686,
"step": 94850
},
{
"epoch": 27.651515151515152,
"grad_norm": 0.47344303131103516,
"learning_rate": 0.0002683416909620991,
"loss": 3.172,
"step": 94900
},
{
"epoch": 27.666083916083917,
"grad_norm": 0.434416800737381,
"learning_rate": 0.0002681667638483965,
"loss": 3.1708,
"step": 94950
},
{
"epoch": 27.68065268065268,
"grad_norm": 0.4230683445930481,
"learning_rate": 0.0002679918367346938,
"loss": 3.1807,
"step": 95000
},
{
"epoch": 27.68065268065268,
"eval_accuracy": 0.37487167896941487,
"eval_loss": 3.5402252674102783,
"eval_runtime": 179.9377,
"eval_samples_per_second": 92.488,
"eval_steps_per_second": 5.785,
"step": 95000
},
{
"epoch": 27.695221445221446,
"grad_norm": 0.44491046667099,
"learning_rate": 0.00026781690962099125,
"loss": 3.1754,
"step": 95050
},
{
"epoch": 27.70979020979021,
"grad_norm": 0.43584510684013367,
"learning_rate": 0.00026764198250728863,
"loss": 3.1754,
"step": 95100
},
{
"epoch": 27.724358974358974,
"grad_norm": 0.45981577038764954,
"learning_rate": 0.000267467055393586,
"loss": 3.1746,
"step": 95150
},
{
"epoch": 27.73892773892774,
"grad_norm": 0.46015459299087524,
"learning_rate": 0.0002672921282798833,
"loss": 3.1925,
"step": 95200
},
{
"epoch": 27.753496503496503,
"grad_norm": 0.4347175359725952,
"learning_rate": 0.0002671172011661807,
"loss": 3.1811,
"step": 95250
},
{
"epoch": 27.768065268065268,
"grad_norm": 0.4558524191379547,
"learning_rate": 0.00026694227405247813,
"loss": 3.1854,
"step": 95300
},
{
"epoch": 27.782634032634032,
"grad_norm": 0.46495407819747925,
"learning_rate": 0.0002667673469387755,
"loss": 3.1795,
"step": 95350
},
{
"epoch": 27.797202797202797,
"grad_norm": 0.4345493018627167,
"learning_rate": 0.0002665924198250729,
"loss": 3.1858,
"step": 95400
},
{
"epoch": 27.81177156177156,
"grad_norm": 0.4338386058807373,
"learning_rate": 0.0002664174927113702,
"loss": 3.1728,
"step": 95450
},
{
"epoch": 27.826340326340326,
"grad_norm": 0.46094560623168945,
"learning_rate": 0.00026624256559766764,
"loss": 3.1693,
"step": 95500
},
{
"epoch": 27.84090909090909,
"grad_norm": 0.44629111886024475,
"learning_rate": 0.000266067638483965,
"loss": 3.186,
"step": 95550
},
{
"epoch": 27.855477855477854,
"grad_norm": 0.4409028887748718,
"learning_rate": 0.0002658927113702624,
"loss": 3.1921,
"step": 95600
},
{
"epoch": 27.87004662004662,
"grad_norm": 0.4406682848930359,
"learning_rate": 0.0002657177842565597,
"loss": 3.1829,
"step": 95650
},
{
"epoch": 27.884615384615383,
"grad_norm": 0.41936755180358887,
"learning_rate": 0.00026554285714285714,
"loss": 3.1896,
"step": 95700
},
{
"epoch": 27.899184149184148,
"grad_norm": 0.4212104082107544,
"learning_rate": 0.0002653679300291545,
"loss": 3.1837,
"step": 95750
},
{
"epoch": 27.913752913752912,
"grad_norm": 0.4641505181789398,
"learning_rate": 0.0002651930029154519,
"loss": 3.1885,
"step": 95800
},
{
"epoch": 27.92832167832168,
"grad_norm": 0.4994050860404968,
"learning_rate": 0.00026501807580174927,
"loss": 3.1885,
"step": 95850
},
{
"epoch": 27.942890442890445,
"grad_norm": 0.45975276827812195,
"learning_rate": 0.0002648431486880466,
"loss": 3.1928,
"step": 95900
},
{
"epoch": 27.95745920745921,
"grad_norm": 0.4463437497615814,
"learning_rate": 0.000264668221574344,
"loss": 3.1852,
"step": 95950
},
{
"epoch": 27.972027972027973,
"grad_norm": 0.44903019070625305,
"learning_rate": 0.0002644932944606414,
"loss": 3.1905,
"step": 96000
},
{
"epoch": 27.972027972027973,
"eval_accuracy": 0.3752865395383994,
"eval_loss": 3.5301098823547363,
"eval_runtime": 180.0722,
"eval_samples_per_second": 92.418,
"eval_steps_per_second": 5.781,
"step": 96000
},
{
"epoch": 27.986596736596738,
"grad_norm": 0.4516717195510864,
"learning_rate": 0.00026431836734693877,
"loss": 3.1865,
"step": 96050
},
{
"epoch": 28.001165501165502,
"grad_norm": 0.45636415481567383,
"learning_rate": 0.0002641434402332361,
"loss": 3.1982,
"step": 96100
},
{
"epoch": 28.015734265734267,
"grad_norm": 0.4726836681365967,
"learning_rate": 0.0002639685131195335,
"loss": 3.0816,
"step": 96150
},
{
"epoch": 28.03030303030303,
"grad_norm": 0.4496645927429199,
"learning_rate": 0.0002637935860058309,
"loss": 3.0853,
"step": 96200
},
{
"epoch": 28.044871794871796,
"grad_norm": 0.47952306270599365,
"learning_rate": 0.00026361865889212827,
"loss": 3.1046,
"step": 96250
},
{
"epoch": 28.05944055944056,
"grad_norm": 0.45253846049308777,
"learning_rate": 0.00026344373177842565,
"loss": 3.0992,
"step": 96300
},
{
"epoch": 28.074009324009324,
"grad_norm": 0.4647113084793091,
"learning_rate": 0.000263268804664723,
"loss": 3.0939,
"step": 96350
},
{
"epoch": 28.08857808857809,
"grad_norm": 0.46444422006607056,
"learning_rate": 0.0002630938775510204,
"loss": 3.105,
"step": 96400
},
{
"epoch": 28.103146853146853,
"grad_norm": 0.45401421189308167,
"learning_rate": 0.0002629189504373178,
"loss": 3.1085,
"step": 96450
},
{
"epoch": 28.117715617715618,
"grad_norm": 0.4496326744556427,
"learning_rate": 0.00026274402332361515,
"loss": 3.109,
"step": 96500
},
{
"epoch": 28.132284382284382,
"grad_norm": 0.4805924892425537,
"learning_rate": 0.0002625690962099125,
"loss": 3.1118,
"step": 96550
},
{
"epoch": 28.146853146853147,
"grad_norm": 0.4368390142917633,
"learning_rate": 0.0002623941690962099,
"loss": 3.1099,
"step": 96600
},
{
"epoch": 28.16142191142191,
"grad_norm": 0.43335017561912537,
"learning_rate": 0.0002622192419825073,
"loss": 3.1148,
"step": 96650
},
{
"epoch": 28.175990675990676,
"grad_norm": 0.5231890082359314,
"learning_rate": 0.00026204431486880465,
"loss": 3.128,
"step": 96700
},
{
"epoch": 28.19055944055944,
"grad_norm": 0.44500666856765747,
"learning_rate": 0.00026186938775510203,
"loss": 3.1168,
"step": 96750
},
{
"epoch": 28.205128205128204,
"grad_norm": 0.4381171464920044,
"learning_rate": 0.0002616944606413994,
"loss": 3.1199,
"step": 96800
},
{
"epoch": 28.21969696969697,
"grad_norm": 0.4727230370044708,
"learning_rate": 0.0002615195335276968,
"loss": 3.1237,
"step": 96850
},
{
"epoch": 28.234265734265733,
"grad_norm": 0.45557940006256104,
"learning_rate": 0.00026134460641399416,
"loss": 3.136,
"step": 96900
},
{
"epoch": 28.248834498834498,
"grad_norm": 0.42124268412590027,
"learning_rate": 0.00026116967930029153,
"loss": 3.1277,
"step": 96950
},
{
"epoch": 28.263403263403262,
"grad_norm": 0.4520432651042938,
"learning_rate": 0.0002609947521865889,
"loss": 3.1347,
"step": 97000
},
{
"epoch": 28.263403263403262,
"eval_accuracy": 0.37456488440465274,
"eval_loss": 3.5478932857513428,
"eval_runtime": 180.1693,
"eval_samples_per_second": 92.369,
"eval_steps_per_second": 5.778,
"step": 97000
},
{
"epoch": 28.277972027972027,
"grad_norm": 0.45506972074508667,
"learning_rate": 0.0002608198250728863,
"loss": 3.1352,
"step": 97050
},
{
"epoch": 28.29254079254079,
"grad_norm": 0.45318546891212463,
"learning_rate": 0.00026064489795918366,
"loss": 3.1371,
"step": 97100
},
{
"epoch": 28.307109557109555,
"grad_norm": 0.46015825867652893,
"learning_rate": 0.00026046997084548104,
"loss": 3.1348,
"step": 97150
},
{
"epoch": 28.32167832167832,
"grad_norm": 0.4895924925804138,
"learning_rate": 0.0002602950437317784,
"loss": 3.141,
"step": 97200
},
{
"epoch": 28.336247086247088,
"grad_norm": 0.44168609380722046,
"learning_rate": 0.0002601201166180758,
"loss": 3.1361,
"step": 97250
},
{
"epoch": 28.350815850815852,
"grad_norm": 0.44923096895217896,
"learning_rate": 0.00025994518950437316,
"loss": 3.1442,
"step": 97300
},
{
"epoch": 28.365384615384617,
"grad_norm": 0.45875313878059387,
"learning_rate": 0.00025977026239067054,
"loss": 3.1504,
"step": 97350
},
{
"epoch": 28.37995337995338,
"grad_norm": 0.4395272135734558,
"learning_rate": 0.0002595953352769679,
"loss": 3.1461,
"step": 97400
},
{
"epoch": 28.394522144522146,
"grad_norm": 0.4497947096824646,
"learning_rate": 0.0002594204081632653,
"loss": 3.15,
"step": 97450
},
{
"epoch": 28.40909090909091,
"grad_norm": 0.4851381182670593,
"learning_rate": 0.00025924548104956267,
"loss": 3.1458,
"step": 97500
},
{
"epoch": 28.423659673659674,
"grad_norm": 0.4465165436267853,
"learning_rate": 0.00025907055393586004,
"loss": 3.1481,
"step": 97550
},
{
"epoch": 28.43822843822844,
"grad_norm": 0.4833471179008484,
"learning_rate": 0.0002588956268221574,
"loss": 3.1435,
"step": 97600
},
{
"epoch": 28.452797202797203,
"grad_norm": 0.4438728392124176,
"learning_rate": 0.0002587206997084548,
"loss": 3.1695,
"step": 97650
},
{
"epoch": 28.467365967365968,
"grad_norm": 0.4530220031738281,
"learning_rate": 0.00025854577259475217,
"loss": 3.153,
"step": 97700
},
{
"epoch": 28.481934731934732,
"grad_norm": 0.4695287346839905,
"learning_rate": 0.00025837084548104955,
"loss": 3.1391,
"step": 97750
},
{
"epoch": 28.496503496503497,
"grad_norm": 0.4446166455745697,
"learning_rate": 0.0002581959183673469,
"loss": 3.1654,
"step": 97800
},
{
"epoch": 28.51107226107226,
"grad_norm": 0.4539489448070526,
"learning_rate": 0.0002580209912536443,
"loss": 3.1569,
"step": 97850
},
{
"epoch": 28.525641025641026,
"grad_norm": 0.44202882051467896,
"learning_rate": 0.0002578460641399417,
"loss": 3.1621,
"step": 97900
},
{
"epoch": 28.54020979020979,
"grad_norm": 0.46865367889404297,
"learning_rate": 0.00025767113702623905,
"loss": 3.1566,
"step": 97950
},
{
"epoch": 28.554778554778554,
"grad_norm": 0.4353398084640503,
"learning_rate": 0.0002574962099125364,
"loss": 3.1577,
"step": 98000
},
{
"epoch": 28.554778554778554,
"eval_accuracy": 0.37477490168815797,
"eval_loss": 3.543243169784546,
"eval_runtime": 180.0326,
"eval_samples_per_second": 92.439,
"eval_steps_per_second": 5.782,
"step": 98000
},
{
"epoch": 28.56934731934732,
"grad_norm": 0.4895898103713989,
"learning_rate": 0.0002573212827988338,
"loss": 3.1526,
"step": 98050
},
{
"epoch": 28.583916083916083,
"grad_norm": 0.4979413151741028,
"learning_rate": 0.0002571463556851312,
"loss": 3.1605,
"step": 98100
},
{
"epoch": 28.598484848484848,
"grad_norm": 0.4730426073074341,
"learning_rate": 0.00025697142857142855,
"loss": 3.1606,
"step": 98150
},
{
"epoch": 28.613053613053612,
"grad_norm": 0.4984246492385864,
"learning_rate": 0.00025679650145772593,
"loss": 3.1651,
"step": 98200
},
{
"epoch": 28.627622377622377,
"grad_norm": 0.4455728232860565,
"learning_rate": 0.0002566215743440233,
"loss": 3.151,
"step": 98250
},
{
"epoch": 28.64219114219114,
"grad_norm": 0.4372442364692688,
"learning_rate": 0.0002564466472303207,
"loss": 3.1691,
"step": 98300
},
{
"epoch": 28.656759906759905,
"grad_norm": 0.45729032158851624,
"learning_rate": 0.00025627172011661806,
"loss": 3.1627,
"step": 98350
},
{
"epoch": 28.67132867132867,
"grad_norm": 0.4392733871936798,
"learning_rate": 0.00025609679300291543,
"loss": 3.1576,
"step": 98400
},
{
"epoch": 28.685897435897434,
"grad_norm": 0.46147841215133667,
"learning_rate": 0.0002559218658892128,
"loss": 3.1723,
"step": 98450
},
{
"epoch": 28.7004662004662,
"grad_norm": 0.4356614351272583,
"learning_rate": 0.0002557469387755102,
"loss": 3.1619,
"step": 98500
},
{
"epoch": 28.715034965034967,
"grad_norm": 0.4666474461555481,
"learning_rate": 0.00025557201166180756,
"loss": 3.1639,
"step": 98550
},
{
"epoch": 28.72960372960373,
"grad_norm": 0.44134947657585144,
"learning_rate": 0.00025539708454810493,
"loss": 3.1696,
"step": 98600
},
{
"epoch": 28.744172494172496,
"grad_norm": 0.43788793683052063,
"learning_rate": 0.0002552221574344023,
"loss": 3.169,
"step": 98650
},
{
"epoch": 28.75874125874126,
"grad_norm": 0.44114676117897034,
"learning_rate": 0.0002550472303206997,
"loss": 3.175,
"step": 98700
},
{
"epoch": 28.773310023310025,
"grad_norm": 0.4305495619773865,
"learning_rate": 0.00025487230320699706,
"loss": 3.164,
"step": 98750
},
{
"epoch": 28.78787878787879,
"grad_norm": 0.45937755703926086,
"learning_rate": 0.00025469737609329444,
"loss": 3.1809,
"step": 98800
},
{
"epoch": 28.802447552447553,
"grad_norm": 0.4551764726638794,
"learning_rate": 0.0002545224489795918,
"loss": 3.1747,
"step": 98850
},
{
"epoch": 28.817016317016318,
"grad_norm": 0.4215174615383148,
"learning_rate": 0.0002543475218658892,
"loss": 3.1714,
"step": 98900
},
{
"epoch": 28.831585081585082,
"grad_norm": 0.4614230990409851,
"learning_rate": 0.00025417259475218657,
"loss": 3.175,
"step": 98950
},
{
"epoch": 28.846153846153847,
"grad_norm": 0.43585631251335144,
"learning_rate": 0.00025399766763848394,
"loss": 3.16,
"step": 99000
},
{
"epoch": 28.846153846153847,
"eval_accuracy": 0.37547186274041744,
"eval_loss": 3.535378932952881,
"eval_runtime": 179.9708,
"eval_samples_per_second": 92.471,
"eval_steps_per_second": 5.784,
"step": 99000
},
{
"epoch": 28.86072261072261,
"grad_norm": 0.4275636672973633,
"learning_rate": 0.0002538227405247813,
"loss": 3.1658,
"step": 99050
},
{
"epoch": 28.875291375291376,
"grad_norm": 0.4248562753200531,
"learning_rate": 0.0002536478134110787,
"loss": 3.1766,
"step": 99100
},
{
"epoch": 28.88986013986014,
"grad_norm": 0.4694676995277405,
"learning_rate": 0.00025347288629737607,
"loss": 3.169,
"step": 99150
},
{
"epoch": 28.904428904428904,
"grad_norm": 0.446225106716156,
"learning_rate": 0.00025329795918367344,
"loss": 3.1817,
"step": 99200
},
{
"epoch": 28.91899766899767,
"grad_norm": 0.44096997380256653,
"learning_rate": 0.0002531230320699708,
"loss": 3.1872,
"step": 99250
},
{
"epoch": 28.933566433566433,
"grad_norm": 0.4305557906627655,
"learning_rate": 0.0002529481049562682,
"loss": 3.1849,
"step": 99300
},
{
"epoch": 28.948135198135198,
"grad_norm": 0.45161905884742737,
"learning_rate": 0.00025277317784256557,
"loss": 3.1889,
"step": 99350
},
{
"epoch": 28.962703962703962,
"grad_norm": 0.4424970746040344,
"learning_rate": 0.00025259825072886295,
"loss": 3.1705,
"step": 99400
},
{
"epoch": 28.977272727272727,
"grad_norm": 0.451418936252594,
"learning_rate": 0.0002524233236151603,
"loss": 3.1821,
"step": 99450
},
{
"epoch": 28.99184149184149,
"grad_norm": 0.43637681007385254,
"learning_rate": 0.0002522483965014577,
"loss": 3.1801,
"step": 99500
},
{
"epoch": 29.006410256410255,
"grad_norm": 0.5177982449531555,
"learning_rate": 0.0002520734693877551,
"loss": 3.138,
"step": 99550
},
{
"epoch": 29.02097902097902,
"grad_norm": 0.4267365336418152,
"learning_rate": 0.00025189854227405245,
"loss": 3.0772,
"step": 99600
},
{
"epoch": 29.035547785547784,
"grad_norm": 0.45578399300575256,
"learning_rate": 0.0002517236151603498,
"loss": 3.0917,
"step": 99650
},
{
"epoch": 29.05011655011655,
"grad_norm": 0.47677499055862427,
"learning_rate": 0.0002515486880466472,
"loss": 3.0774,
"step": 99700
},
{
"epoch": 29.064685314685313,
"grad_norm": 0.44193127751350403,
"learning_rate": 0.0002513737609329446,
"loss": 3.0921,
"step": 99750
},
{
"epoch": 29.079254079254078,
"grad_norm": 0.4814469516277313,
"learning_rate": 0.00025119883381924195,
"loss": 3.108,
"step": 99800
},
{
"epoch": 29.093822843822842,
"grad_norm": 0.45265457034111023,
"learning_rate": 0.00025102390670553933,
"loss": 3.0893,
"step": 99850
},
{
"epoch": 29.10839160839161,
"grad_norm": 0.46795418858528137,
"learning_rate": 0.0002508489795918367,
"loss": 3.1063,
"step": 99900
},
{
"epoch": 29.122960372960375,
"grad_norm": 0.46038514375686646,
"learning_rate": 0.0002506740524781341,
"loss": 3.1051,
"step": 99950
},
{
"epoch": 29.13752913752914,
"grad_norm": 0.47460585832595825,
"learning_rate": 0.00025049912536443146,
"loss": 3.1113,
"step": 100000
},
{
"epoch": 29.13752913752914,
"eval_accuracy": 0.3746667180930713,
"eval_loss": 3.5507586002349854,
"eval_runtime": 180.1614,
"eval_samples_per_second": 92.373,
"eval_steps_per_second": 5.778,
"step": 100000
},
{
"epoch": 29.13752913752914,
"step": 100000,
"total_flos": 2.090305946124288e+18,
"train_loss": 0.6333386157226563,
"train_runtime": 40009.1315,
"train_samples_per_second": 343.117,
"train_steps_per_second": 4.289
}
],
"logging_steps": 50,
"max_steps": 171600,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 10000,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 20,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 20
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.090305946124288e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}