so101-pen-cleanup / trainer_state.json
yuk6ra's picture
Max step 5000
16031a0 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.405405405405405,
"eval_steps": 500,
"global_step": 5000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010810810810810811,
"grad_norm": 14.200489044189453,
"learning_rate": 3.6e-07,
"loss": 0.9925,
"step": 10
},
{
"epoch": 0.021621621621621623,
"grad_norm": 12.673724174499512,
"learning_rate": 7.6e-07,
"loss": 1.0094,
"step": 20
},
{
"epoch": 0.032432432432432434,
"grad_norm": 8.55466365814209,
"learning_rate": 1.1600000000000001e-06,
"loss": 0.9405,
"step": 30
},
{
"epoch": 0.043243243243243246,
"grad_norm": 7.453739643096924,
"learning_rate": 1.56e-06,
"loss": 0.6896,
"step": 40
},
{
"epoch": 0.05405405405405406,
"grad_norm": 2.2464568614959717,
"learning_rate": 1.9600000000000003e-06,
"loss": 0.4138,
"step": 50
},
{
"epoch": 0.06486486486486487,
"grad_norm": 2.7017674446105957,
"learning_rate": 2.3600000000000003e-06,
"loss": 0.3357,
"step": 60
},
{
"epoch": 0.07567567567567568,
"grad_norm": 2.866603374481201,
"learning_rate": 2.7600000000000003e-06,
"loss": 0.2873,
"step": 70
},
{
"epoch": 0.08648648648648649,
"grad_norm": 2.4940736293792725,
"learning_rate": 3.1600000000000002e-06,
"loss": 0.2408,
"step": 80
},
{
"epoch": 0.0972972972972973,
"grad_norm": 1.9782941341400146,
"learning_rate": 3.5600000000000002e-06,
"loss": 0.2165,
"step": 90
},
{
"epoch": 0.10810810810810811,
"grad_norm": 1.1495152711868286,
"learning_rate": 3.96e-06,
"loss": 0.1948,
"step": 100
},
{
"epoch": 0.11891891891891893,
"grad_norm": 1.5376442670822144,
"learning_rate": 4.360000000000001e-06,
"loss": 0.1975,
"step": 110
},
{
"epoch": 0.12972972972972974,
"grad_norm": 1.1928683519363403,
"learning_rate": 4.76e-06,
"loss": 0.17,
"step": 120
},
{
"epoch": 0.14054054054054055,
"grad_norm": 1.3887287378311157,
"learning_rate": 5.1600000000000006e-06,
"loss": 0.173,
"step": 130
},
{
"epoch": 0.15135135135135136,
"grad_norm": 1.9856598377227783,
"learning_rate": 5.560000000000001e-06,
"loss": 0.1637,
"step": 140
},
{
"epoch": 0.16216216216216217,
"grad_norm": 1.1034481525421143,
"learning_rate": 5.9600000000000005e-06,
"loss": 0.153,
"step": 150
},
{
"epoch": 0.17297297297297298,
"grad_norm": 1.4809794425964355,
"learning_rate": 6.360000000000001e-06,
"loss": 0.1402,
"step": 160
},
{
"epoch": 0.1837837837837838,
"grad_norm": 1.930794596672058,
"learning_rate": 6.760000000000001e-06,
"loss": 0.1449,
"step": 170
},
{
"epoch": 0.1945945945945946,
"grad_norm": 0.9831984639167786,
"learning_rate": 7.16e-06,
"loss": 0.1351,
"step": 180
},
{
"epoch": 0.20540540540540542,
"grad_norm": 1.3090108633041382,
"learning_rate": 7.5600000000000005e-06,
"loss": 0.1318,
"step": 190
},
{
"epoch": 0.21621621621621623,
"grad_norm": 1.528368353843689,
"learning_rate": 7.960000000000002e-06,
"loss": 0.1316,
"step": 200
},
{
"epoch": 0.22702702702702704,
"grad_norm": 1.0049641132354736,
"learning_rate": 8.36e-06,
"loss": 0.1266,
"step": 210
},
{
"epoch": 0.23783783783783785,
"grad_norm": 1.335607647895813,
"learning_rate": 8.76e-06,
"loss": 0.1183,
"step": 220
},
{
"epoch": 0.24864864864864866,
"grad_norm": 1.2054740190505981,
"learning_rate": 9.16e-06,
"loss": 0.1135,
"step": 230
},
{
"epoch": 0.2594594594594595,
"grad_norm": 1.3126459121704102,
"learning_rate": 9.56e-06,
"loss": 0.1124,
"step": 240
},
{
"epoch": 0.2702702702702703,
"grad_norm": 1.4163024425506592,
"learning_rate": 9.960000000000001e-06,
"loss": 0.1101,
"step": 250
},
{
"epoch": 0.2810810810810811,
"grad_norm": 1.3630675077438354,
"learning_rate": 9.99991141987856e-06,
"loss": 0.0985,
"step": 260
},
{
"epoch": 0.2918918918918919,
"grad_norm": 1.3381731510162354,
"learning_rate": 9.999605221019082e-06,
"loss": 0.1247,
"step": 270
},
{
"epoch": 0.3027027027027027,
"grad_norm": 1.2101716995239258,
"learning_rate": 9.99908032323076e-06,
"loss": 0.1104,
"step": 280
},
{
"epoch": 0.31351351351351353,
"grad_norm": 1.7090423107147217,
"learning_rate": 9.998336749474329e-06,
"loss": 0.1165,
"step": 290
},
{
"epoch": 0.32432432432432434,
"grad_norm": 1.0286948680877686,
"learning_rate": 9.997374532276108e-06,
"loss": 0.1002,
"step": 300
},
{
"epoch": 0.33513513513513515,
"grad_norm": 0.8209099173545837,
"learning_rate": 9.996193713726596e-06,
"loss": 0.1076,
"step": 310
},
{
"epoch": 0.34594594594594597,
"grad_norm": 0.8612469434738159,
"learning_rate": 9.994794345478625e-06,
"loss": 0.1062,
"step": 320
},
{
"epoch": 0.3567567567567568,
"grad_norm": 1.2024775743484497,
"learning_rate": 9.99317648874509e-06,
"loss": 0.0998,
"step": 330
},
{
"epoch": 0.3675675675675676,
"grad_norm": 0.9072966575622559,
"learning_rate": 9.991340214296293e-06,
"loss": 0.0884,
"step": 340
},
{
"epoch": 0.3783783783783784,
"grad_norm": 0.7544043660163879,
"learning_rate": 9.98928560245682e-06,
"loss": 0.0838,
"step": 350
},
{
"epoch": 0.3891891891891892,
"grad_norm": 1.3930832147598267,
"learning_rate": 9.987012743102051e-06,
"loss": 0.0953,
"step": 360
},
{
"epoch": 0.4,
"grad_norm": 0.6797852516174316,
"learning_rate": 9.984521735654218e-06,
"loss": 0.0972,
"step": 370
},
{
"epoch": 0.41081081081081083,
"grad_norm": 0.9434889554977417,
"learning_rate": 9.981812689078058e-06,
"loss": 0.0842,
"step": 380
},
{
"epoch": 0.42162162162162165,
"grad_norm": 1.2180750370025635,
"learning_rate": 9.978885721876041e-06,
"loss": 0.096,
"step": 390
},
{
"epoch": 0.43243243243243246,
"grad_norm": 1.0367884635925293,
"learning_rate": 9.975740962083197e-06,
"loss": 0.0925,
"step": 400
},
{
"epoch": 0.44324324324324327,
"grad_norm": 1.0367573499679565,
"learning_rate": 9.972378547261505e-06,
"loss": 0.0877,
"step": 410
},
{
"epoch": 0.4540540540540541,
"grad_norm": 1.2332873344421387,
"learning_rate": 9.968798624493885e-06,
"loss": 0.0847,
"step": 420
},
{
"epoch": 0.4648648648648649,
"grad_norm": 1.3428494930267334,
"learning_rate": 9.965001350377755e-06,
"loss": 0.0893,
"step": 430
},
{
"epoch": 0.4756756756756757,
"grad_norm": 0.8041930198669434,
"learning_rate": 9.960986891018182e-06,
"loss": 0.0791,
"step": 440
},
{
"epoch": 0.4864864864864865,
"grad_norm": 0.8222742676734924,
"learning_rate": 9.95675542202063e-06,
"loss": 0.0843,
"step": 450
},
{
"epoch": 0.4972972972972973,
"grad_norm": 0.9690313935279846,
"learning_rate": 9.952307128483257e-06,
"loss": 0.0821,
"step": 460
},
{
"epoch": 0.5081081081081081,
"grad_norm": 0.9667937755584717,
"learning_rate": 9.947642204988835e-06,
"loss": 0.0833,
"step": 470
},
{
"epoch": 0.518918918918919,
"grad_norm": 0.7906189560890198,
"learning_rate": 9.942760855596228e-06,
"loss": 0.0751,
"step": 480
},
{
"epoch": 0.5297297297297298,
"grad_norm": 0.847240149974823,
"learning_rate": 9.93766329383147e-06,
"loss": 0.075,
"step": 490
},
{
"epoch": 0.5405405405405406,
"grad_norm": 0.5874003767967224,
"learning_rate": 9.932349742678433e-06,
"loss": 0.0782,
"step": 500
},
{
"epoch": 0.5513513513513514,
"grad_norm": 0.6482784748077393,
"learning_rate": 9.926820434569052e-06,
"loss": 0.0759,
"step": 510
},
{
"epoch": 0.5621621621621622,
"grad_norm": 0.7045950293540955,
"learning_rate": 9.92107561137318e-06,
"loss": 0.0721,
"step": 520
},
{
"epoch": 0.572972972972973,
"grad_norm": 0.6596692204475403,
"learning_rate": 9.915115524387988e-06,
"loss": 0.0746,
"step": 530
},
{
"epoch": 0.5837837837837838,
"grad_norm": 0.7132977843284607,
"learning_rate": 9.908940434326996e-06,
"loss": 0.0753,
"step": 540
},
{
"epoch": 0.5945945945945946,
"grad_norm": 0.742100179195404,
"learning_rate": 9.902550611308646e-06,
"loss": 0.0709,
"step": 550
},
{
"epoch": 0.6054054054054054,
"grad_norm": 0.8214631080627441,
"learning_rate": 9.895946334844495e-06,
"loss": 0.0635,
"step": 560
},
{
"epoch": 0.6162162162162163,
"grad_norm": 0.6747580766677856,
"learning_rate": 9.88912789382699e-06,
"loss": 0.0664,
"step": 570
},
{
"epoch": 0.6270270270270271,
"grad_norm": 1.0222914218902588,
"learning_rate": 9.88209558651683e-06,
"loss": 0.0647,
"step": 580
},
{
"epoch": 0.6378378378378379,
"grad_norm": 0.8500170707702637,
"learning_rate": 9.874849720529921e-06,
"loss": 0.078,
"step": 590
},
{
"epoch": 0.6486486486486487,
"grad_norm": 0.9567283391952515,
"learning_rate": 9.867390612823915e-06,
"loss": 0.07,
"step": 600
},
{
"epoch": 0.6594594594594595,
"grad_norm": 0.8195412158966064,
"learning_rate": 9.859718589684344e-06,
"loss": 0.0744,
"step": 610
},
{
"epoch": 0.6702702702702703,
"grad_norm": 1.1124755144119263,
"learning_rate": 9.851833986710354e-06,
"loss": 0.068,
"step": 620
},
{
"epoch": 0.6810810810810811,
"grad_norm": 1.030849575996399,
"learning_rate": 9.843737148800023e-06,
"loss": 0.068,
"step": 630
},
{
"epoch": 0.6918918918918919,
"grad_norm": 1.150592565536499,
"learning_rate": 9.835428430135273e-06,
"loss": 0.0726,
"step": 640
},
{
"epoch": 0.7027027027027027,
"grad_norm": 1.5807855129241943,
"learning_rate": 9.82690819416637e-06,
"loss": 0.0678,
"step": 650
},
{
"epoch": 0.7135135135135136,
"grad_norm": 1.2938547134399414,
"learning_rate": 9.818176813596042e-06,
"loss": 0.0639,
"step": 660
},
{
"epoch": 0.7243243243243244,
"grad_norm": 0.9378499388694763,
"learning_rate": 9.80923467036316e-06,
"loss": 0.0701,
"step": 670
},
{
"epoch": 0.7351351351351352,
"grad_norm": 0.6736406683921814,
"learning_rate": 9.800082155626035e-06,
"loss": 0.0688,
"step": 680
},
{
"epoch": 0.745945945945946,
"grad_norm": 0.8763739466667175,
"learning_rate": 9.790719669745312e-06,
"loss": 0.0588,
"step": 690
},
{
"epoch": 0.7567567567567568,
"grad_norm": 0.7410522103309631,
"learning_rate": 9.781147622266457e-06,
"loss": 0.0712,
"step": 700
},
{
"epoch": 0.7675675675675676,
"grad_norm": 0.6820585131645203,
"learning_rate": 9.771366431901832e-06,
"loss": 0.0678,
"step": 710
},
{
"epoch": 0.7783783783783784,
"grad_norm": 0.5506922006607056,
"learning_rate": 9.761376526512394e-06,
"loss": 0.0607,
"step": 720
},
{
"epoch": 0.7891891891891892,
"grad_norm": 0.8286749720573425,
"learning_rate": 9.751178343088963e-06,
"loss": 0.0623,
"step": 730
},
{
"epoch": 0.8,
"grad_norm": 0.6345208883285522,
"learning_rate": 9.740772327733124e-06,
"loss": 0.055,
"step": 740
},
{
"epoch": 0.8108108108108109,
"grad_norm": 0.6936842203140259,
"learning_rate": 9.730158935637697e-06,
"loss": 0.06,
"step": 750
},
{
"epoch": 0.8216216216216217,
"grad_norm": 0.898246169090271,
"learning_rate": 9.719338631066835e-06,
"loss": 0.0487,
"step": 760
},
{
"epoch": 0.8324324324324325,
"grad_norm": 1.2851463556289673,
"learning_rate": 9.708311887335713e-06,
"loss": 0.0589,
"step": 770
},
{
"epoch": 0.8432432432432433,
"grad_norm": 0.8996430039405823,
"learning_rate": 9.697079186789823e-06,
"loss": 0.0569,
"step": 780
},
{
"epoch": 0.8540540540540541,
"grad_norm": 0.8897648453712463,
"learning_rate": 9.685641020783878e-06,
"loss": 0.059,
"step": 790
},
{
"epoch": 0.8648648648648649,
"grad_norm": 0.6341675519943237,
"learning_rate": 9.67399788966031e-06,
"loss": 0.0575,
"step": 800
},
{
"epoch": 0.8756756756756757,
"grad_norm": 0.6399983167648315,
"learning_rate": 9.662150302727395e-06,
"loss": 0.0601,
"step": 810
},
{
"epoch": 0.8864864864864865,
"grad_norm": 0.6879423260688782,
"learning_rate": 9.650098778236967e-06,
"loss": 0.0598,
"step": 820
},
{
"epoch": 0.8972972972972973,
"grad_norm": 0.44590917229652405,
"learning_rate": 9.63784384336175e-06,
"loss": 0.0547,
"step": 830
},
{
"epoch": 0.9081081081081082,
"grad_norm": 0.8224856853485107,
"learning_rate": 9.62538603417229e-06,
"loss": 0.0562,
"step": 840
},
{
"epoch": 0.918918918918919,
"grad_norm": 0.9134641289710999,
"learning_rate": 9.612725895613526e-06,
"loss": 0.0606,
"step": 850
},
{
"epoch": 0.9297297297297298,
"grad_norm": 0.9926605820655823,
"learning_rate": 9.599863981480927e-06,
"loss": 0.0587,
"step": 860
},
{
"epoch": 0.9405405405405406,
"grad_norm": 0.7019346356391907,
"learning_rate": 9.586800854396284e-06,
"loss": 0.0558,
"step": 870
},
{
"epoch": 0.9513513513513514,
"grad_norm": 0.5580384731292725,
"learning_rate": 9.573537085783096e-06,
"loss": 0.052,
"step": 880
},
{
"epoch": 0.9621621621621622,
"grad_norm": 0.7283103466033936,
"learning_rate": 9.560073255841572e-06,
"loss": 0.051,
"step": 890
},
{
"epoch": 0.972972972972973,
"grad_norm": 1.0219670534133911,
"learning_rate": 9.546409953523248e-06,
"loss": 0.0606,
"step": 900
},
{
"epoch": 0.9837837837837838,
"grad_norm": 0.5650193095207214,
"learning_rate": 9.53254777650523e-06,
"loss": 0.0472,
"step": 910
},
{
"epoch": 0.9945945945945946,
"grad_norm": 0.9416384696960449,
"learning_rate": 9.518487331164048e-06,
"loss": 0.0592,
"step": 920
},
{
"epoch": 1.0054054054054054,
"grad_norm": 1.0085505247116089,
"learning_rate": 9.504229232549135e-06,
"loss": 0.0517,
"step": 930
},
{
"epoch": 1.0162162162162163,
"grad_norm": 0.7915866374969482,
"learning_rate": 9.489774104355908e-06,
"loss": 0.0502,
"step": 940
},
{
"epoch": 1.027027027027027,
"grad_norm": 0.7969734072685242,
"learning_rate": 9.475122578898508e-06,
"loss": 0.0546,
"step": 950
},
{
"epoch": 1.037837837837838,
"grad_norm": 0.7960900068283081,
"learning_rate": 9.46027529708212e-06,
"loss": 0.0468,
"step": 960
},
{
"epoch": 1.0486486486486486,
"grad_norm": 0.43782392144203186,
"learning_rate": 9.445232908374948e-06,
"loss": 0.0475,
"step": 970
},
{
"epoch": 1.0594594594594595,
"grad_norm": 0.7317707538604736,
"learning_rate": 9.429996070779808e-06,
"loss": 0.0468,
"step": 980
},
{
"epoch": 1.0702702702702702,
"grad_norm": 0.5017679929733276,
"learning_rate": 9.414565450805333e-06,
"loss": 0.052,
"step": 990
},
{
"epoch": 1.0810810810810811,
"grad_norm": 0.679476797580719,
"learning_rate": 9.398941723436832e-06,
"loss": 0.0501,
"step": 1000
},
{
"epoch": 1.0918918918918918,
"grad_norm": 0.7387028932571411,
"learning_rate": 9.383125572106752e-06,
"loss": 0.0479,
"step": 1010
},
{
"epoch": 1.1027027027027028,
"grad_norm": 0.691663920879364,
"learning_rate": 9.367117688664792e-06,
"loss": 0.0468,
"step": 1020
},
{
"epoch": 1.1135135135135135,
"grad_norm": 0.7472488880157471,
"learning_rate": 9.35091877334763e-06,
"loss": 0.0509,
"step": 1030
},
{
"epoch": 1.1243243243243244,
"grad_norm": 0.6413132548332214,
"learning_rate": 9.334529534748298e-06,
"loss": 0.0512,
"step": 1040
},
{
"epoch": 1.135135135135135,
"grad_norm": 0.808664858341217,
"learning_rate": 9.317950689785188e-06,
"loss": 0.0429,
"step": 1050
},
{
"epoch": 1.145945945945946,
"grad_norm": 0.9747439622879028,
"learning_rate": 9.301182963670688e-06,
"loss": 0.0568,
"step": 1060
},
{
"epoch": 1.1567567567567567,
"grad_norm": 0.66274094581604,
"learning_rate": 9.284227089879456e-06,
"loss": 0.0545,
"step": 1070
},
{
"epoch": 1.1675675675675676,
"grad_norm": 0.803703248500824,
"learning_rate": 9.267083810116341e-06,
"loss": 0.0489,
"step": 1080
},
{
"epoch": 1.1783783783783783,
"grad_norm": 0.9180741310119629,
"learning_rate": 9.249753874283937e-06,
"loss": 0.0394,
"step": 1090
},
{
"epoch": 1.1891891891891893,
"grad_norm": 0.6170795559883118,
"learning_rate": 9.232238040449779e-06,
"loss": 0.0499,
"step": 1100
},
{
"epoch": 1.2,
"grad_norm": 0.6339899301528931,
"learning_rate": 9.214537074813181e-06,
"loss": 0.0431,
"step": 1110
},
{
"epoch": 1.2108108108108109,
"grad_norm": 0.7984320521354675,
"learning_rate": 9.196651751671725e-06,
"loss": 0.0484,
"step": 1120
},
{
"epoch": 1.2216216216216216,
"grad_norm": 0.6766464710235596,
"learning_rate": 9.178582853387383e-06,
"loss": 0.0507,
"step": 1130
},
{
"epoch": 1.2324324324324325,
"grad_norm": 0.8145914077758789,
"learning_rate": 9.160331170352304e-06,
"loss": 0.0471,
"step": 1140
},
{
"epoch": 1.2432432432432432,
"grad_norm": 0.7077579498291016,
"learning_rate": 9.14189750095423e-06,
"loss": 0.0492,
"step": 1150
},
{
"epoch": 1.2540540540540541,
"grad_norm": 0.7460741400718689,
"learning_rate": 9.123282651541577e-06,
"loss": 0.0421,
"step": 1160
},
{
"epoch": 1.2648648648648648,
"grad_norm": 0.888500452041626,
"learning_rate": 9.104487436388161e-06,
"loss": 0.0458,
"step": 1170
},
{
"epoch": 1.2756756756756757,
"grad_norm": 0.6311141848564148,
"learning_rate": 9.085512677657582e-06,
"loss": 0.0455,
"step": 1180
},
{
"epoch": 1.2864864864864864,
"grad_norm": 0.7180992960929871,
"learning_rate": 9.066359205367258e-06,
"loss": 0.0463,
"step": 1190
},
{
"epoch": 1.2972972972972974,
"grad_norm": 0.860873281955719,
"learning_rate": 9.047027857352113e-06,
"loss": 0.046,
"step": 1200
},
{
"epoch": 1.308108108108108,
"grad_norm": 0.7998963594436646,
"learning_rate": 9.027519479227934e-06,
"loss": 0.0432,
"step": 1210
},
{
"epoch": 1.318918918918919,
"grad_norm": 0.6902336478233337,
"learning_rate": 9.007834924354384e-06,
"loss": 0.0454,
"step": 1220
},
{
"epoch": 1.3297297297297297,
"grad_norm": 0.6556903123855591,
"learning_rate": 8.987975053797655e-06,
"loss": 0.0483,
"step": 1230
},
{
"epoch": 1.3405405405405406,
"grad_norm": 0.722432017326355,
"learning_rate": 8.967940736292826e-06,
"loss": 0.0418,
"step": 1240
},
{
"epoch": 1.3513513513513513,
"grad_norm": 0.8098694086074829,
"learning_rate": 8.947732848205846e-06,
"loss": 0.0479,
"step": 1250
},
{
"epoch": 1.3621621621621622,
"grad_norm": 0.5581735968589783,
"learning_rate": 8.927352273495205e-06,
"loss": 0.0441,
"step": 1260
},
{
"epoch": 1.372972972972973,
"grad_norm": 0.6423065066337585,
"learning_rate": 8.906799903673264e-06,
"loss": 0.0466,
"step": 1270
},
{
"epoch": 1.3837837837837839,
"grad_norm": 0.6050016283988953,
"learning_rate": 8.88607663776726e-06,
"loss": 0.047,
"step": 1280
},
{
"epoch": 1.3945945945945946,
"grad_norm": 0.7292416095733643,
"learning_rate": 8.865183382279979e-06,
"loss": 0.0488,
"step": 1290
},
{
"epoch": 1.4054054054054055,
"grad_norm": 0.7246063947677612,
"learning_rate": 8.844121051150097e-06,
"loss": 0.0452,
"step": 1300
},
{
"epoch": 1.4162162162162162,
"grad_norm": 0.6597639918327332,
"learning_rate": 8.822890565712212e-06,
"loss": 0.0411,
"step": 1310
},
{
"epoch": 1.427027027027027,
"grad_norm": 0.780088484287262,
"learning_rate": 8.801492854656537e-06,
"loss": 0.0466,
"step": 1320
},
{
"epoch": 1.4378378378378378,
"grad_norm": 0.624079167842865,
"learning_rate": 8.779928853988269e-06,
"loss": 0.0407,
"step": 1330
},
{
"epoch": 1.4486486486486487,
"grad_norm": 0.4669192135334015,
"learning_rate": 8.758199506986655e-06,
"loss": 0.0419,
"step": 1340
},
{
"epoch": 1.4594594594594594,
"grad_norm": 0.43073180317878723,
"learning_rate": 8.73630576416373e-06,
"loss": 0.0419,
"step": 1350
},
{
"epoch": 1.4702702702702704,
"grad_norm": 0.6980085372924805,
"learning_rate": 8.714248583222727e-06,
"loss": 0.041,
"step": 1360
},
{
"epoch": 1.481081081081081,
"grad_norm": 0.7735833525657654,
"learning_rate": 8.692028929016196e-06,
"loss": 0.0357,
"step": 1370
},
{
"epoch": 1.491891891891892,
"grad_norm": 0.9316625595092773,
"learning_rate": 8.669647773503797e-06,
"loss": 0.0395,
"step": 1380
},
{
"epoch": 1.5027027027027027,
"grad_norm": 0.4638199210166931,
"learning_rate": 8.647106095709773e-06,
"loss": 0.0353,
"step": 1390
},
{
"epoch": 1.5135135135135136,
"grad_norm": 0.766643762588501,
"learning_rate": 8.624404881680138e-06,
"loss": 0.0445,
"step": 1400
},
{
"epoch": 1.5243243243243243,
"grad_norm": 0.5198076367378235,
"learning_rate": 8.601545124439535e-06,
"loss": 0.0391,
"step": 1410
},
{
"epoch": 1.535135135135135,
"grad_norm": 0.8132079243659973,
"learning_rate": 8.578527823947801e-06,
"loss": 0.0453,
"step": 1420
},
{
"epoch": 1.545945945945946,
"grad_norm": 0.8266350030899048,
"learning_rate": 8.555353987056224e-06,
"loss": 0.0458,
"step": 1430
},
{
"epoch": 1.5567567567567568,
"grad_norm": 0.6028828620910645,
"learning_rate": 8.532024627463504e-06,
"loss": 0.0409,
"step": 1440
},
{
"epoch": 1.5675675675675675,
"grad_norm": 0.555508017539978,
"learning_rate": 8.508540765671407e-06,
"loss": 0.0452,
"step": 1450
},
{
"epoch": 1.5783783783783782,
"grad_norm": 0.6225217580795288,
"learning_rate": 8.484903428940121e-06,
"loss": 0.0441,
"step": 1460
},
{
"epoch": 1.5891891891891892,
"grad_norm": 0.7838477492332458,
"learning_rate": 8.461113651243333e-06,
"loss": 0.0442,
"step": 1470
},
{
"epoch": 1.6,
"grad_norm": 0.8195562362670898,
"learning_rate": 8.437172473222987e-06,
"loss": 0.0465,
"step": 1480
},
{
"epoch": 1.6108108108108108,
"grad_norm": 0.6786017417907715,
"learning_rate": 8.413080942143767e-06,
"loss": 0.0385,
"step": 1490
},
{
"epoch": 1.6216216216216215,
"grad_norm": 0.6811516284942627,
"learning_rate": 8.388840111847288e-06,
"loss": 0.0467,
"step": 1500
},
{
"epoch": 1.6324324324324324,
"grad_norm": 0.7284950017929077,
"learning_rate": 8.364451042705999e-06,
"loss": 0.0427,
"step": 1510
},
{
"epoch": 1.6432432432432433,
"grad_norm": 0.8170714378356934,
"learning_rate": 8.33991480157679e-06,
"loss": 0.0428,
"step": 1520
},
{
"epoch": 1.654054054054054,
"grad_norm": 1.0552774667739868,
"learning_rate": 8.315232461754338e-06,
"loss": 0.0433,
"step": 1530
},
{
"epoch": 1.6648648648648647,
"grad_norm": 0.6368141174316406,
"learning_rate": 8.290405102924144e-06,
"loss": 0.0448,
"step": 1540
},
{
"epoch": 1.6756756756756757,
"grad_norm": 0.40666061639785767,
"learning_rate": 8.265433811115316e-06,
"loss": 0.0411,
"step": 1550
},
{
"epoch": 1.6864864864864866,
"grad_norm": 0.44923198223114014,
"learning_rate": 8.24031967865305e-06,
"loss": 0.0396,
"step": 1560
},
{
"epoch": 1.6972972972972973,
"grad_norm": 0.7264629602432251,
"learning_rate": 8.215063804110858e-06,
"loss": 0.041,
"step": 1570
},
{
"epoch": 1.708108108108108,
"grad_norm": 0.46326103806495667,
"learning_rate": 8.189667292262513e-06,
"loss": 0.0382,
"step": 1580
},
{
"epoch": 1.718918918918919,
"grad_norm": 0.7099355459213257,
"learning_rate": 8.164131254033716e-06,
"loss": 0.044,
"step": 1590
},
{
"epoch": 1.7297297297297298,
"grad_norm": 0.827971339225769,
"learning_rate": 8.138456806453503e-06,
"loss": 0.0401,
"step": 1600
},
{
"epoch": 1.7405405405405405,
"grad_norm": 0.5584285855293274,
"learning_rate": 8.112645072605386e-06,
"loss": 0.0374,
"step": 1610
},
{
"epoch": 1.7513513513513512,
"grad_norm": 0.6379075646400452,
"learning_rate": 8.086697181578223e-06,
"loss": 0.0357,
"step": 1620
},
{
"epoch": 1.7621621621621621,
"grad_norm": 0.6546282768249512,
"learning_rate": 8.060614268416823e-06,
"loss": 0.0387,
"step": 1630
},
{
"epoch": 1.772972972972973,
"grad_norm": 0.665284276008606,
"learning_rate": 8.034397474072308e-06,
"loss": 0.0394,
"step": 1640
},
{
"epoch": 1.7837837837837838,
"grad_norm": 0.6494997143745422,
"learning_rate": 8.008047945352194e-06,
"loss": 0.0451,
"step": 1650
},
{
"epoch": 1.7945945945945945,
"grad_norm": 0.6128862500190735,
"learning_rate": 7.981566834870225e-06,
"loss": 0.0357,
"step": 1660
},
{
"epoch": 1.8054054054054054,
"grad_norm": 0.6705856919288635,
"learning_rate": 7.954955300995961e-06,
"loss": 0.0362,
"step": 1670
},
{
"epoch": 1.8162162162162163,
"grad_norm": 0.6312499642372131,
"learning_rate": 7.928214507804104e-06,
"loss": 0.0464,
"step": 1680
},
{
"epoch": 1.827027027027027,
"grad_norm": 0.5211047530174255,
"learning_rate": 7.901345625023577e-06,
"loss": 0.0421,
"step": 1690
},
{
"epoch": 1.8378378378378377,
"grad_norm": 0.5969777703285217,
"learning_rate": 7.874349827986354e-06,
"loss": 0.039,
"step": 1700
},
{
"epoch": 1.8486486486486486,
"grad_norm": 0.6680732369422913,
"learning_rate": 7.847228297576052e-06,
"loss": 0.0379,
"step": 1710
},
{
"epoch": 1.8594594594594596,
"grad_norm": 0.7429904937744141,
"learning_rate": 7.819982220176276e-06,
"loss": 0.0366,
"step": 1720
},
{
"epoch": 1.8702702702702703,
"grad_norm": 0.6381489634513855,
"learning_rate": 7.792612787618714e-06,
"loss": 0.0464,
"step": 1730
},
{
"epoch": 1.881081081081081,
"grad_norm": 0.5638653635978699,
"learning_rate": 7.76512119713101e-06,
"loss": 0.037,
"step": 1740
},
{
"epoch": 1.8918918918918919,
"grad_norm": 0.44251424074172974,
"learning_rate": 7.73750865128439e-06,
"loss": 0.0449,
"step": 1750
},
{
"epoch": 1.9027027027027028,
"grad_norm": 0.6562517881393433,
"learning_rate": 7.70977635794107e-06,
"loss": 0.0436,
"step": 1760
},
{
"epoch": 1.9135135135135135,
"grad_norm": 0.656298041343689,
"learning_rate": 7.681925530201392e-06,
"loss": 0.0396,
"step": 1770
},
{
"epoch": 1.9243243243243242,
"grad_norm": 0.650246798992157,
"learning_rate": 7.65395738635079e-06,
"loss": 0.0364,
"step": 1780
},
{
"epoch": 1.9351351351351351,
"grad_norm": 0.7400831580162048,
"learning_rate": 7.6258731498064796e-06,
"loss": 0.0417,
"step": 1790
},
{
"epoch": 1.945945945945946,
"grad_norm": 0.6492830514907837,
"learning_rate": 7.597674049063948e-06,
"loss": 0.0401,
"step": 1800
},
{
"epoch": 1.9567567567567568,
"grad_norm": 0.47661155462265015,
"learning_rate": 7.5693613176432115e-06,
"loss": 0.0368,
"step": 1810
},
{
"epoch": 1.9675675675675675,
"grad_norm": 1.1524066925048828,
"learning_rate": 7.540936194034865e-06,
"loss": 0.037,
"step": 1820
},
{
"epoch": 1.9783783783783784,
"grad_norm": 0.88988196849823,
"learning_rate": 7.512399921645901e-06,
"loss": 0.0397,
"step": 1830
},
{
"epoch": 1.9891891891891893,
"grad_norm": 0.48117414116859436,
"learning_rate": 7.483753748745317e-06,
"loss": 0.0357,
"step": 1840
},
{
"epoch": 2.0,
"grad_norm": 0.4710519015789032,
"learning_rate": 7.454998928409516e-06,
"loss": 0.0348,
"step": 1850
},
{
"epoch": 2.0108108108108107,
"grad_norm": 0.8176958560943604,
"learning_rate": 7.426136718467494e-06,
"loss": 0.0433,
"step": 1860
},
{
"epoch": 2.0216216216216214,
"grad_norm": 0.6511132717132568,
"learning_rate": 7.397168381445812e-06,
"loss": 0.0357,
"step": 1870
},
{
"epoch": 2.0324324324324325,
"grad_norm": 0.638883650302887,
"learning_rate": 7.3680951845133775e-06,
"loss": 0.0354,
"step": 1880
},
{
"epoch": 2.0432432432432432,
"grad_norm": 0.5587888956069946,
"learning_rate": 7.338918399426006e-06,
"loss": 0.0312,
"step": 1890
},
{
"epoch": 2.054054054054054,
"grad_norm": 0.658237636089325,
"learning_rate": 7.309639302470801e-06,
"loss": 0.0411,
"step": 1900
},
{
"epoch": 2.064864864864865,
"grad_norm": 0.7402287721633911,
"learning_rate": 7.280259174410312e-06,
"loss": 0.0368,
"step": 1910
},
{
"epoch": 2.075675675675676,
"grad_norm": 0.787990391254425,
"learning_rate": 7.250779300426518e-06,
"loss": 0.0405,
"step": 1920
},
{
"epoch": 2.0864864864864865,
"grad_norm": 0.6542930006980896,
"learning_rate": 7.22120097006461e-06,
"loss": 0.0382,
"step": 1930
},
{
"epoch": 2.097297297297297,
"grad_norm": 0.9772655367851257,
"learning_rate": 7.191525477176577e-06,
"loss": 0.0381,
"step": 1940
},
{
"epoch": 2.108108108108108,
"grad_norm": 0.4485587775707245,
"learning_rate": 7.161754119864616e-06,
"loss": 0.0352,
"step": 1950
},
{
"epoch": 2.118918918918919,
"grad_norm": 0.37433692812919617,
"learning_rate": 7.13188820042434e-06,
"loss": 0.0293,
"step": 1960
},
{
"epoch": 2.1297297297297297,
"grad_norm": 0.5695798397064209,
"learning_rate": 7.101929025287817e-06,
"loss": 0.0391,
"step": 1970
},
{
"epoch": 2.1405405405405404,
"grad_norm": 0.6900025010108948,
"learning_rate": 7.071877904966422e-06,
"loss": 0.041,
"step": 1980
},
{
"epoch": 2.1513513513513516,
"grad_norm": 0.6525390148162842,
"learning_rate": 7.04173615399351e-06,
"loss": 0.036,
"step": 1990
},
{
"epoch": 2.1621621621621623,
"grad_norm": 0.5852620601654053,
"learning_rate": 7.011505090866914e-06,
"loss": 0.0323,
"step": 2000
},
{
"epoch": 2.172972972972973,
"grad_norm": 0.5119943618774414,
"learning_rate": 6.981186037991271e-06,
"loss": 0.0322,
"step": 2010
},
{
"epoch": 2.1837837837837837,
"grad_norm": 0.5682677626609802,
"learning_rate": 6.950780321620174e-06,
"loss": 0.0382,
"step": 2020
},
{
"epoch": 2.1945945945945944,
"grad_norm": 0.5038251280784607,
"learning_rate": 6.920289271798158e-06,
"loss": 0.0344,
"step": 2030
},
{
"epoch": 2.2054054054054055,
"grad_norm": 0.6174978017807007,
"learning_rate": 6.889714222302517e-06,
"loss": 0.0396,
"step": 2040
},
{
"epoch": 2.2162162162162162,
"grad_norm": 0.5457316637039185,
"learning_rate": 6.8590565105849695e-06,
"loss": 0.035,
"step": 2050
},
{
"epoch": 2.227027027027027,
"grad_norm": 0.654593288898468,
"learning_rate": 6.82831747771314e-06,
"loss": 0.0329,
"step": 2060
},
{
"epoch": 2.237837837837838,
"grad_norm": 0.8348110318183899,
"learning_rate": 6.797498468311907e-06,
"loss": 0.0311,
"step": 2070
},
{
"epoch": 2.2486486486486488,
"grad_norm": 0.5480743050575256,
"learning_rate": 6.766600830504585e-06,
"loss": 0.0325,
"step": 2080
},
{
"epoch": 2.2594594594594595,
"grad_norm": 0.5144963264465332,
"learning_rate": 6.735625915853943e-06,
"loss": 0.0278,
"step": 2090
},
{
"epoch": 2.27027027027027,
"grad_norm": 0.5144929885864258,
"learning_rate": 6.7045750793030905e-06,
"loss": 0.0323,
"step": 2100
},
{
"epoch": 2.281081081081081,
"grad_norm": 0.6607171893119812,
"learning_rate": 6.673449679116215e-06,
"loss": 0.0303,
"step": 2110
},
{
"epoch": 2.291891891891892,
"grad_norm": 0.48313409090042114,
"learning_rate": 6.6422510768191485e-06,
"loss": 0.0293,
"step": 2120
},
{
"epoch": 2.3027027027027027,
"grad_norm": 0.5028293132781982,
"learning_rate": 6.610980637139826e-06,
"loss": 0.0359,
"step": 2130
},
{
"epoch": 2.3135135135135134,
"grad_norm": 0.584972620010376,
"learning_rate": 6.579639727948583e-06,
"loss": 0.0345,
"step": 2140
},
{
"epoch": 2.3243243243243246,
"grad_norm": 0.41404393315315247,
"learning_rate": 6.5482297201983155e-06,
"loss": 0.0367,
"step": 2150
},
{
"epoch": 2.3351351351351353,
"grad_norm": 0.6505828499794006,
"learning_rate": 6.516751987864518e-06,
"loss": 0.0327,
"step": 2160
},
{
"epoch": 2.345945945945946,
"grad_norm": 0.6071712374687195,
"learning_rate": 6.485207907885175e-06,
"loss": 0.0321,
"step": 2170
},
{
"epoch": 2.3567567567567567,
"grad_norm": 0.7137896418571472,
"learning_rate": 6.453598860100536e-06,
"loss": 0.0358,
"step": 2180
},
{
"epoch": 2.3675675675675674,
"grad_norm": 0.44680288434028625,
"learning_rate": 6.421926227192748e-06,
"loss": 0.0339,
"step": 2190
},
{
"epoch": 2.3783783783783785,
"grad_norm": 0.44672706723213196,
"learning_rate": 6.3901913946253815e-06,
"loss": 0.031,
"step": 2200
},
{
"epoch": 2.389189189189189,
"grad_norm": 0.4818141460418701,
"learning_rate": 6.358395750582817e-06,
"loss": 0.0293,
"step": 2210
},
{
"epoch": 2.4,
"grad_norm": 0.4907307028770447,
"learning_rate": 6.3265406859095325e-06,
"loss": 0.0321,
"step": 2220
},
{
"epoch": 2.410810810810811,
"grad_norm": 0.5125067234039307,
"learning_rate": 6.29462759404925e-06,
"loss": 0.0273,
"step": 2230
},
{
"epoch": 2.4216216216216218,
"grad_norm": 0.5038555860519409,
"learning_rate": 6.262657870983989e-06,
"loss": 0.0343,
"step": 2240
},
{
"epoch": 2.4324324324324325,
"grad_norm": 0.4431181848049164,
"learning_rate": 6.230632915173009e-06,
"loss": 0.0364,
"step": 2250
},
{
"epoch": 2.443243243243243,
"grad_norm": 0.8261221051216125,
"learning_rate": 6.198554127491622e-06,
"loss": 0.0335,
"step": 2260
},
{
"epoch": 2.454054054054054,
"grad_norm": 0.7294406294822693,
"learning_rate": 6.166422911169922e-06,
"loss": 0.0311,
"step": 2270
},
{
"epoch": 2.464864864864865,
"grad_norm": 0.5856850147247314,
"learning_rate": 6.1342406717314e-06,
"loss": 0.0382,
"step": 2280
},
{
"epoch": 2.4756756756756757,
"grad_norm": 0.5774531364440918,
"learning_rate": 6.102008816931466e-06,
"loss": 0.031,
"step": 2290
},
{
"epoch": 2.4864864864864864,
"grad_norm": 0.5902078747749329,
"learning_rate": 6.069728756695867e-06,
"loss": 0.0261,
"step": 2300
},
{
"epoch": 2.4972972972972975,
"grad_norm": 0.6393314599990845,
"learning_rate": 6.037401903059008e-06,
"loss": 0.0365,
"step": 2310
},
{
"epoch": 2.5081081081081082,
"grad_norm": 0.5464274883270264,
"learning_rate": 6.005029670102195e-06,
"loss": 0.0262,
"step": 2320
},
{
"epoch": 2.518918918918919,
"grad_norm": 0.43032026290893555,
"learning_rate": 5.972613473891766e-06,
"loss": 0.0308,
"step": 2330
},
{
"epoch": 2.5297297297297296,
"grad_norm": 0.522640585899353,
"learning_rate": 5.940154732417159e-06,
"loss": 0.0335,
"step": 2340
},
{
"epoch": 2.5405405405405403,
"grad_norm": 0.5012731552124023,
"learning_rate": 5.907654865528876e-06,
"loss": 0.0341,
"step": 2350
},
{
"epoch": 2.5513513513513515,
"grad_norm": 0.7608346939086914,
"learning_rate": 5.8751152948763815e-06,
"loss": 0.0303,
"step": 2360
},
{
"epoch": 2.562162162162162,
"grad_norm": 0.6261675953865051,
"learning_rate": 5.842537443845908e-06,
"loss": 0.0295,
"step": 2370
},
{
"epoch": 2.572972972972973,
"grad_norm": 0.5141987800598145,
"learning_rate": 5.809922737498198e-06,
"loss": 0.0275,
"step": 2380
},
{
"epoch": 2.583783783783784,
"grad_norm": 0.9153976440429688,
"learning_rate": 5.777272602506166e-06,
"loss": 0.037,
"step": 2390
},
{
"epoch": 2.5945945945945947,
"grad_norm": 0.4885156452655792,
"learning_rate": 5.744588467092483e-06,
"loss": 0.0362,
"step": 2400
},
{
"epoch": 2.6054054054054054,
"grad_norm": 0.6028148531913757,
"learning_rate": 5.711871760967119e-06,
"loss": 0.0304,
"step": 2410
},
{
"epoch": 2.616216216216216,
"grad_norm": 0.7090111374855042,
"learning_rate": 5.679123915264786e-06,
"loss": 0.0315,
"step": 2420
},
{
"epoch": 2.627027027027027,
"grad_norm": 0.4406765103340149,
"learning_rate": 5.646346362482342e-06,
"loss": 0.0293,
"step": 2430
},
{
"epoch": 2.637837837837838,
"grad_norm": 0.5371266603469849,
"learning_rate": 5.613540536416132e-06,
"loss": 0.0369,
"step": 2440
},
{
"epoch": 2.6486486486486487,
"grad_norm": 0.39742565155029297,
"learning_rate": 5.580707872099265e-06,
"loss": 0.0284,
"step": 2450
},
{
"epoch": 2.6594594594594594,
"grad_norm": 0.6796385049819946,
"learning_rate": 5.547849805738836e-06,
"loss": 0.0322,
"step": 2460
},
{
"epoch": 2.6702702702702705,
"grad_norm": 0.4403522312641144,
"learning_rate": 5.514967774653118e-06,
"loss": 0.0287,
"step": 2470
},
{
"epoch": 2.6810810810810812,
"grad_norm": 0.7354927659034729,
"learning_rate": 5.4820632172086745e-06,
"loss": 0.0335,
"step": 2480
},
{
"epoch": 2.691891891891892,
"grad_norm": 0.3618038296699524,
"learning_rate": 5.449137572757439e-06,
"loss": 0.027,
"step": 2490
},
{
"epoch": 2.7027027027027026,
"grad_norm": 0.5390015244483948,
"learning_rate": 5.41619228157377e-06,
"loss": 0.0313,
"step": 2500
},
{
"epoch": 2.7135135135135133,
"grad_norm": 0.636913537979126,
"learning_rate": 5.3832287847914276e-06,
"loss": 0.0324,
"step": 2510
},
{
"epoch": 2.7243243243243245,
"grad_norm": 0.6561273336410522,
"learning_rate": 5.35024852434055e-06,
"loss": 0.0275,
"step": 2520
},
{
"epoch": 2.735135135135135,
"grad_norm": 0.6739108562469482,
"learning_rate": 5.317252942884568e-06,
"loss": 0.0251,
"step": 2530
},
{
"epoch": 2.745945945945946,
"grad_norm": 0.35930028557777405,
"learning_rate": 5.284243483757109e-06,
"loss": 0.03,
"step": 2540
},
{
"epoch": 2.756756756756757,
"grad_norm": 0.43455588817596436,
"learning_rate": 5.251221590898848e-06,
"loss": 0.035,
"step": 2550
},
{
"epoch": 2.7675675675675677,
"grad_norm": 0.5473480224609375,
"learning_rate": 5.218188708794357e-06,
"loss": 0.0312,
"step": 2560
},
{
"epoch": 2.7783783783783784,
"grad_norm": 0.7597518563270569,
"learning_rate": 5.185146282408911e-06,
"loss": 0.0285,
"step": 2570
},
{
"epoch": 2.789189189189189,
"grad_norm": 0.7374501824378967,
"learning_rate": 5.1520957571252795e-06,
"loss": 0.0307,
"step": 2580
},
{
"epoch": 2.8,
"grad_norm": 0.7217923998832703,
"learning_rate": 5.119038578680511e-06,
"loss": 0.0367,
"step": 2590
},
{
"epoch": 2.810810810810811,
"grad_norm": 0.7415491938591003,
"learning_rate": 5.085976193102678e-06,
"loss": 0.0303,
"step": 2600
},
{
"epoch": 2.8216216216216217,
"grad_norm": 0.5876473784446716,
"learning_rate": 5.052910046647634e-06,
"loss": 0.0257,
"step": 2610
},
{
"epoch": 2.8324324324324324,
"grad_norm": 0.5944861769676208,
"learning_rate": 5.0198415857357465e-06,
"loss": 0.0311,
"step": 2620
},
{
"epoch": 2.8432432432432435,
"grad_norm": 0.34813442826271057,
"learning_rate": 4.986772256888623e-06,
"loss": 0.0257,
"step": 2630
},
{
"epoch": 2.854054054054054,
"grad_norm": 0.5532535910606384,
"learning_rate": 4.953703506665832e-06,
"loss": 0.0249,
"step": 2640
},
{
"epoch": 2.864864864864865,
"grad_norm": 0.5942780375480652,
"learning_rate": 4.9206367816016385e-06,
"loss": 0.0314,
"step": 2650
},
{
"epoch": 2.8756756756756756,
"grad_norm": 0.5041717886924744,
"learning_rate": 4.887573528141721e-06,
"loss": 0.0324,
"step": 2660
},
{
"epoch": 2.8864864864864863,
"grad_norm": 0.437028706073761,
"learning_rate": 4.854515192579892e-06,
"loss": 0.0298,
"step": 2670
},
{
"epoch": 2.8972972972972975,
"grad_norm": 0.5581991076469421,
"learning_rate": 4.821463220994848e-06,
"loss": 0.0323,
"step": 2680
},
{
"epoch": 2.908108108108108,
"grad_norm": 0.4572019577026367,
"learning_rate": 4.788419059186895e-06,
"loss": 0.0286,
"step": 2690
},
{
"epoch": 2.918918918918919,
"grad_norm": 0.37265264987945557,
"learning_rate": 4.75538415261472e-06,
"loss": 0.0297,
"step": 2700
},
{
"epoch": 2.92972972972973,
"grad_norm": 0.45635294914245605,
"learning_rate": 4.722359946332156e-06,
"loss": 0.0306,
"step": 2710
},
{
"epoch": 2.9405405405405407,
"grad_norm": 0.39567267894744873,
"learning_rate": 4.689347884924966e-06,
"loss": 0.0282,
"step": 2720
},
{
"epoch": 2.9513513513513514,
"grad_norm": 0.8368029594421387,
"learning_rate": 4.656349412447664e-06,
"loss": 0.033,
"step": 2730
},
{
"epoch": 2.962162162162162,
"grad_norm": 0.5868388414382935,
"learning_rate": 4.6233659723603374e-06,
"loss": 0.0299,
"step": 2740
},
{
"epoch": 2.972972972972973,
"grad_norm": 0.4864098131656647,
"learning_rate": 4.590399007465503e-06,
"loss": 0.0232,
"step": 2750
},
{
"epoch": 2.983783783783784,
"grad_norm": 0.6210123896598816,
"learning_rate": 4.557449959845005e-06,
"loss": 0.0287,
"step": 2760
},
{
"epoch": 2.9945945945945946,
"grad_norm": 0.7582560777664185,
"learning_rate": 4.524520270796927e-06,
"loss": 0.0291,
"step": 2770
},
{
"epoch": 3.0054054054054054,
"grad_norm": 0.5422881245613098,
"learning_rate": 4.491611380772545e-06,
"loss": 0.0265,
"step": 2780
},
{
"epoch": 3.016216216216216,
"grad_norm": 0.36191341280937195,
"learning_rate": 4.458724729313319e-06,
"loss": 0.0254,
"step": 2790
},
{
"epoch": 3.027027027027027,
"grad_norm": 0.6519547700881958,
"learning_rate": 4.425861754987921e-06,
"loss": 0.0302,
"step": 2800
},
{
"epoch": 3.037837837837838,
"grad_norm": 0.6648505330085754,
"learning_rate": 4.3930238953293096e-06,
"loss": 0.0272,
"step": 2810
},
{
"epoch": 3.0486486486486486,
"grad_norm": 0.6026409864425659,
"learning_rate": 4.360212586771847e-06,
"loss": 0.0255,
"step": 2820
},
{
"epoch": 3.0594594594594593,
"grad_norm": 0.7593157887458801,
"learning_rate": 4.327429264588463e-06,
"loss": 0.0299,
"step": 2830
},
{
"epoch": 3.0702702702702704,
"grad_norm": 0.5723231434822083,
"learning_rate": 4.294675362827872e-06,
"loss": 0.0292,
"step": 2840
},
{
"epoch": 3.081081081081081,
"grad_norm": 0.5857054591178894,
"learning_rate": 4.261952314251848e-06,
"loss": 0.025,
"step": 2850
},
{
"epoch": 3.091891891891892,
"grad_norm": 0.5336072444915771,
"learning_rate": 4.229261550272539e-06,
"loss": 0.0281,
"step": 2860
},
{
"epoch": 3.1027027027027025,
"grad_norm": 0.5887632966041565,
"learning_rate": 4.196604500889868e-06,
"loss": 0.0266,
"step": 2870
},
{
"epoch": 3.1135135135135137,
"grad_norm": 0.43469563126564026,
"learning_rate": 4.163982594628969e-06,
"loss": 0.0265,
"step": 2880
},
{
"epoch": 3.1243243243243244,
"grad_norm": 0.5681430101394653,
"learning_rate": 4.131397258477702e-06,
"loss": 0.0264,
"step": 2890
},
{
"epoch": 3.135135135135135,
"grad_norm": 0.4526456296443939,
"learning_rate": 4.098849917824232e-06,
"loss": 0.0283,
"step": 2900
},
{
"epoch": 3.145945945945946,
"grad_norm": 0.661095917224884,
"learning_rate": 4.066341996394677e-06,
"loss": 0.0384,
"step": 2910
},
{
"epoch": 3.156756756756757,
"grad_norm": 0.8155391812324524,
"learning_rate": 4.033874916190833e-06,
"loss": 0.0271,
"step": 2920
},
{
"epoch": 3.1675675675675676,
"grad_norm": 0.4132017493247986,
"learning_rate": 4.001450097427965e-06,
"loss": 0.0258,
"step": 2930
},
{
"epoch": 3.1783783783783783,
"grad_norm": 0.6691679954528809,
"learning_rate": 3.969068958472689e-06,
"loss": 0.0272,
"step": 2940
},
{
"epoch": 3.189189189189189,
"grad_norm": 0.5882253050804138,
"learning_rate": 3.936732915780923e-06,
"loss": 0.0238,
"step": 2950
},
{
"epoch": 3.2,
"grad_norm": 0.4746255576610565,
"learning_rate": 3.904443383835929e-06,
"loss": 0.0304,
"step": 2960
},
{
"epoch": 3.210810810810811,
"grad_norm": 0.3622106611728668,
"learning_rate": 3.872201775086437e-06,
"loss": 0.0303,
"step": 2970
},
{
"epoch": 3.2216216216216216,
"grad_norm": 0.39385557174682617,
"learning_rate": 3.840009499884862e-06,
"loss": 0.0247,
"step": 2980
},
{
"epoch": 3.2324324324324323,
"grad_norm": 0.5606188774108887,
"learning_rate": 3.8078679664256112e-06,
"loss": 0.0233,
"step": 2990
},
{
"epoch": 3.2432432432432434,
"grad_norm": 0.4499453008174896,
"learning_rate": 3.7757785806834808e-06,
"loss": 0.0282,
"step": 3000
},
{
"epoch": 3.254054054054054,
"grad_norm": 0.5054988265037537,
"learning_rate": 3.7437427463521557e-06,
"loss": 0.0248,
"step": 3010
},
{
"epoch": 3.264864864864865,
"grad_norm": 0.4411916732788086,
"learning_rate": 3.7117618647828168e-06,
"loss": 0.029,
"step": 3020
},
{
"epoch": 3.2756756756756755,
"grad_norm": 0.4525195360183716,
"learning_rate": 3.6798373349228255e-06,
"loss": 0.0263,
"step": 3030
},
{
"epoch": 3.2864864864864867,
"grad_norm": 0.3996464014053345,
"learning_rate": 3.647970553254538e-06,
"loss": 0.0218,
"step": 3040
},
{
"epoch": 3.2972972972972974,
"grad_norm": 0.5602717995643616,
"learning_rate": 3.6161629137342203e-06,
"loss": 0.0288,
"step": 3050
},
{
"epoch": 3.308108108108108,
"grad_norm": 0.45001763105392456,
"learning_rate": 3.5844158077310654e-06,
"loss": 0.0272,
"step": 3060
},
{
"epoch": 3.3189189189189188,
"grad_norm": 0.588201105594635,
"learning_rate": 3.5527306239663372e-06,
"loss": 0.0287,
"step": 3070
},
{
"epoch": 3.32972972972973,
"grad_norm": 0.6123380661010742,
"learning_rate": 3.5211087484526176e-06,
"loss": 0.0269,
"step": 3080
},
{
"epoch": 3.3405405405405406,
"grad_norm": 0.44060274958610535,
"learning_rate": 3.489551564433186e-06,
"loss": 0.0289,
"step": 3090
},
{
"epoch": 3.3513513513513513,
"grad_norm": 0.5079867243766785,
"learning_rate": 3.4580604523215008e-06,
"loss": 0.0251,
"step": 3100
},
{
"epoch": 3.362162162162162,
"grad_norm": 0.6919063925743103,
"learning_rate": 3.4266367896408214e-06,
"loss": 0.0337,
"step": 3110
},
{
"epoch": 3.372972972972973,
"grad_norm": 0.691513180732727,
"learning_rate": 3.3952819509639534e-06,
"loss": 0.026,
"step": 3120
},
{
"epoch": 3.383783783783784,
"grad_norm": 0.42166635394096375,
"learning_rate": 3.3639973078531163e-06,
"loss": 0.0258,
"step": 3130
},
{
"epoch": 3.3945945945945946,
"grad_norm": 0.5259663462638855,
"learning_rate": 3.332784228799947e-06,
"loss": 0.0271,
"step": 3140
},
{
"epoch": 3.4054054054054053,
"grad_norm": 0.4360649585723877,
"learning_rate": 3.301644079165638e-06,
"loss": 0.0267,
"step": 3150
},
{
"epoch": 3.4162162162162164,
"grad_norm": 0.6892200112342834,
"learning_rate": 3.27057822112122e-06,
"loss": 0.0249,
"step": 3160
},
{
"epoch": 3.427027027027027,
"grad_norm": 0.7275106906890869,
"learning_rate": 3.239588013587958e-06,
"loss": 0.0295,
"step": 3170
},
{
"epoch": 3.437837837837838,
"grad_norm": 0.39717885851860046,
"learning_rate": 3.208674812177926e-06,
"loss": 0.0255,
"step": 3180
},
{
"epoch": 3.4486486486486485,
"grad_norm": 0.7455564737319946,
"learning_rate": 3.1778399691346985e-06,
"loss": 0.0265,
"step": 3190
},
{
"epoch": 3.4594594594594597,
"grad_norm": 0.5156950950622559,
"learning_rate": 3.1470848332742005e-06,
"loss": 0.0313,
"step": 3200
},
{
"epoch": 3.4702702702702704,
"grad_norm": 0.48247015476226807,
"learning_rate": 3.1164107499257078e-06,
"loss": 0.0277,
"step": 3210
},
{
"epoch": 3.481081081081081,
"grad_norm": 0.5416699647903442,
"learning_rate": 3.0858190608729956e-06,
"loss": 0.0262,
"step": 3220
},
{
"epoch": 3.4918918918918918,
"grad_norm": 0.4909563362598419,
"learning_rate": 3.0553111042956478e-06,
"loss": 0.0263,
"step": 3230
},
{
"epoch": 3.5027027027027025,
"grad_norm": 0.6142066717147827,
"learning_rate": 3.024888214710517e-06,
"loss": 0.0241,
"step": 3240
},
{
"epoch": 3.5135135135135136,
"grad_norm": 0.5313824415206909,
"learning_rate": 2.9945517229133494e-06,
"loss": 0.0226,
"step": 3250
},
{
"epoch": 3.5243243243243243,
"grad_norm": 0.42341265082359314,
"learning_rate": 2.9643029559205727e-06,
"loss": 0.0327,
"step": 3260
},
{
"epoch": 3.535135135135135,
"grad_norm": 0.40098732709884644,
"learning_rate": 2.9341432369112483e-06,
"loss": 0.0254,
"step": 3270
},
{
"epoch": 3.545945945945946,
"grad_norm": 0.5539133548736572,
"learning_rate": 2.90407388516919e-06,
"loss": 0.0261,
"step": 3280
},
{
"epoch": 3.556756756756757,
"grad_norm": 0.5634763240814209,
"learning_rate": 2.8740962160252496e-06,
"loss": 0.0307,
"step": 3290
},
{
"epoch": 3.5675675675675675,
"grad_norm": 0.33248645067214966,
"learning_rate": 2.844211540799797e-06,
"loss": 0.0241,
"step": 3300
},
{
"epoch": 3.5783783783783782,
"grad_norm": 0.37290477752685547,
"learning_rate": 2.814421166745337e-06,
"loss": 0.0231,
"step": 3310
},
{
"epoch": 3.589189189189189,
"grad_norm": 0.5094774961471558,
"learning_rate": 2.7847263969893347e-06,
"loss": 0.0245,
"step": 3320
},
{
"epoch": 3.6,
"grad_norm": 0.5604079365730286,
"learning_rate": 2.7551285304772205e-06,
"loss": 0.0249,
"step": 3330
},
{
"epoch": 3.610810810810811,
"grad_norm": 0.49264541268348694,
"learning_rate": 2.725628861915557e-06,
"loss": 0.0239,
"step": 3340
},
{
"epoch": 3.6216216216216215,
"grad_norm": 0.5173695683479309,
"learning_rate": 2.6962286817154158e-06,
"loss": 0.0266,
"step": 3350
},
{
"epoch": 3.6324324324324326,
"grad_norm": 0.43412289023399353,
"learning_rate": 2.6669292759359166e-06,
"loss": 0.0259,
"step": 3360
},
{
"epoch": 3.6432432432432433,
"grad_norm": 0.33709916472435,
"learning_rate": 2.637731926227993e-06,
"loss": 0.0304,
"step": 3370
},
{
"epoch": 3.654054054054054,
"grad_norm": 0.4906066656112671,
"learning_rate": 2.608637909778303e-06,
"loss": 0.03,
"step": 3380
},
{
"epoch": 3.6648648648648647,
"grad_norm": 0.5424423217773438,
"learning_rate": 2.5796484992533773e-06,
"loss": 0.0267,
"step": 3390
},
{
"epoch": 3.6756756756756754,
"grad_norm": 0.5699389576911926,
"learning_rate": 2.550764962743947e-06,
"loss": 0.0297,
"step": 3400
},
{
"epoch": 3.6864864864864866,
"grad_norm": 0.43275460600852966,
"learning_rate": 2.5219885637094653e-06,
"loss": 0.0252,
"step": 3410
},
{
"epoch": 3.6972972972972973,
"grad_norm": 0.5598377585411072,
"learning_rate": 2.4933205609228534e-06,
"loss": 0.0275,
"step": 3420
},
{
"epoch": 3.708108108108108,
"grad_norm": 0.4491939842700958,
"learning_rate": 2.4647622084154195e-06,
"loss": 0.0241,
"step": 3430
},
{
"epoch": 3.718918918918919,
"grad_norm": 0.5898016691207886,
"learning_rate": 2.436314755422021e-06,
"loss": 0.0235,
"step": 3440
},
{
"epoch": 3.72972972972973,
"grad_norm": 0.5775002241134644,
"learning_rate": 2.407979446326411e-06,
"loss": 0.0279,
"step": 3450
},
{
"epoch": 3.7405405405405405,
"grad_norm": 0.3893280327320099,
"learning_rate": 2.3797575206067993e-06,
"loss": 0.0305,
"step": 3460
},
{
"epoch": 3.7513513513513512,
"grad_norm": 0.6161412596702576,
"learning_rate": 2.3516502127816455e-06,
"loss": 0.023,
"step": 3470
},
{
"epoch": 3.762162162162162,
"grad_norm": 0.4626004993915558,
"learning_rate": 2.323658752355647e-06,
"loss": 0.026,
"step": 3480
},
{
"epoch": 3.772972972972973,
"grad_norm": 0.40138059854507446,
"learning_rate": 2.2957843637659654e-06,
"loss": 0.023,
"step": 3490
},
{
"epoch": 3.7837837837837838,
"grad_norm": 0.6491943001747131,
"learning_rate": 2.268028266328655e-06,
"loss": 0.0294,
"step": 3500
},
{
"epoch": 3.7945945945945945,
"grad_norm": 0.6690925359725952,
"learning_rate": 2.2403916741853366e-06,
"loss": 0.0288,
"step": 3510
},
{
"epoch": 3.8054054054054056,
"grad_norm": 0.4034656286239624,
"learning_rate": 2.2128757962500817e-06,
"loss": 0.0223,
"step": 3520
},
{
"epoch": 3.8162162162162163,
"grad_norm": 0.6808702349662781,
"learning_rate": 2.1854818361565277e-06,
"loss": 0.0245,
"step": 3530
},
{
"epoch": 3.827027027027027,
"grad_norm": 0.5565137267112732,
"learning_rate": 2.1582109922052365e-06,
"loss": 0.0289,
"step": 3540
},
{
"epoch": 3.8378378378378377,
"grad_norm": 0.3042198419570923,
"learning_rate": 2.131064457311264e-06,
"loss": 0.0256,
"step": 3550
},
{
"epoch": 3.8486486486486484,
"grad_norm": 0.3782758414745331,
"learning_rate": 2.1040434189519924e-06,
"loss": 0.0255,
"step": 3560
},
{
"epoch": 3.8594594594594596,
"grad_norm": 0.5362565517425537,
"learning_rate": 2.0771490591151734e-06,
"loss": 0.0282,
"step": 3570
},
{
"epoch": 3.8702702702702703,
"grad_norm": 0.6471264958381653,
"learning_rate": 2.0503825542472315e-06,
"loss": 0.0296,
"step": 3580
},
{
"epoch": 3.881081081081081,
"grad_norm": 0.4816526174545288,
"learning_rate": 2.023745075201805e-06,
"loss": 0.0243,
"step": 3590
},
{
"epoch": 3.891891891891892,
"grad_norm": 0.3072303533554077,
"learning_rate": 1.9972377871885157e-06,
"loss": 0.0235,
"step": 3600
},
{
"epoch": 3.902702702702703,
"grad_norm": 0.5302412509918213,
"learning_rate": 1.9708618497220173e-06,
"loss": 0.0241,
"step": 3610
},
{
"epoch": 3.9135135135135135,
"grad_norm": 0.5970901250839233,
"learning_rate": 1.944618416571259e-06,
"loss": 0.0251,
"step": 3620
},
{
"epoch": 3.924324324324324,
"grad_norm": 0.45257118344306946,
"learning_rate": 1.9185086357090217e-06,
"loss": 0.0254,
"step": 3630
},
{
"epoch": 3.935135135135135,
"grad_norm": 0.5031522512435913,
"learning_rate": 1.8925336492617057e-06,
"loss": 0.029,
"step": 3640
},
{
"epoch": 3.945945945945946,
"grad_norm": 0.6579412221908569,
"learning_rate": 1.8666945934593668e-06,
"loss": 0.0259,
"step": 3650
},
{
"epoch": 3.9567567567567568,
"grad_norm": 0.589546799659729,
"learning_rate": 1.8409925985860128e-06,
"loss": 0.0191,
"step": 3660
},
{
"epoch": 3.9675675675675675,
"grad_norm": 0.5495330095291138,
"learning_rate": 1.8154287889301604e-06,
"loss": 0.0244,
"step": 3670
},
{
"epoch": 3.9783783783783786,
"grad_norm": 0.49903082847595215,
"learning_rate": 1.7900042827356611e-06,
"loss": 0.0302,
"step": 3680
},
{
"epoch": 3.9891891891891893,
"grad_norm": 0.6340963840484619,
"learning_rate": 1.7647201921527802e-06,
"loss": 0.0257,
"step": 3690
},
{
"epoch": 4.0,
"grad_norm": 0.40171554684638977,
"learning_rate": 1.739577623189545e-06,
"loss": 0.0228,
"step": 3700
},
{
"epoch": 4.010810810810811,
"grad_norm": 0.4687245488166809,
"learning_rate": 1.714577675663377e-06,
"loss": 0.0306,
"step": 3710
},
{
"epoch": 4.021621621621621,
"grad_norm": 0.5827450156211853,
"learning_rate": 1.6897214431529647e-06,
"loss": 0.0256,
"step": 3720
},
{
"epoch": 4.032432432432432,
"grad_norm": 0.4635016620159149,
"learning_rate": 1.6650100129504477e-06,
"loss": 0.0204,
"step": 3730
},
{
"epoch": 4.043243243243243,
"grad_norm": 0.5446805953979492,
"learning_rate": 1.6404444660138335e-06,
"loss": 0.0246,
"step": 3740
},
{
"epoch": 4.054054054054054,
"grad_norm": 0.7499637603759766,
"learning_rate": 1.616025876919725e-06,
"loss": 0.0256,
"step": 3750
},
{
"epoch": 4.064864864864865,
"grad_norm": 0.4591582715511322,
"learning_rate": 1.5917553138163171e-06,
"loss": 0.0233,
"step": 3760
},
{
"epoch": 4.075675675675676,
"grad_norm": 0.57037353515625,
"learning_rate": 1.5676338383766632e-06,
"loss": 0.0265,
"step": 3770
},
{
"epoch": 4.0864864864864865,
"grad_norm": 0.3500991463661194,
"learning_rate": 1.5436625057522446e-06,
"loss": 0.0219,
"step": 3780
},
{
"epoch": 4.097297297297297,
"grad_norm": 0.39382967352867126,
"learning_rate": 1.519842364526804e-06,
"loss": 0.0211,
"step": 3790
},
{
"epoch": 4.108108108108108,
"grad_norm": 0.49823036789894104,
"learning_rate": 1.4961744566704855e-06,
"loss": 0.0223,
"step": 3800
},
{
"epoch": 4.118918918918919,
"grad_norm": 0.5071790218353271,
"learning_rate": 1.4726598174942553e-06,
"loss": 0.0227,
"step": 3810
},
{
"epoch": 4.12972972972973,
"grad_norm": 0.6194488406181335,
"learning_rate": 1.4492994756046036e-06,
"loss": 0.0278,
"step": 3820
},
{
"epoch": 4.140540540540541,
"grad_norm": 0.404752641916275,
"learning_rate": 1.4260944528585646e-06,
"loss": 0.0262,
"step": 3830
},
{
"epoch": 4.151351351351352,
"grad_norm": 0.577710747718811,
"learning_rate": 1.4030457643190048e-06,
"loss": 0.0247,
"step": 3840
},
{
"epoch": 4.162162162162162,
"grad_norm": 0.4997677505016327,
"learning_rate": 1.380154418210231e-06,
"loss": 0.0215,
"step": 3850
},
{
"epoch": 4.172972972972973,
"grad_norm": 0.5820034146308899,
"learning_rate": 1.3574214158738763e-06,
"loss": 0.0299,
"step": 3860
},
{
"epoch": 4.183783783783784,
"grad_norm": 0.38549038767814636,
"learning_rate": 1.3348477517251102e-06,
"loss": 0.0224,
"step": 3870
},
{
"epoch": 4.194594594594594,
"grad_norm": 0.278469979763031,
"learning_rate": 1.312434413209131e-06,
"loss": 0.0238,
"step": 3880
},
{
"epoch": 4.205405405405405,
"grad_norm": 0.5217724442481995,
"learning_rate": 1.2901823807579727e-06,
"loss": 0.0222,
"step": 3890
},
{
"epoch": 4.216216216216216,
"grad_norm": 0.45270097255706787,
"learning_rate": 1.2680926277476245e-06,
"loss": 0.0241,
"step": 3900
},
{
"epoch": 4.227027027027027,
"grad_norm": 0.4428935647010803,
"learning_rate": 1.2461661204554398e-06,
"loss": 0.0243,
"step": 3910
},
{
"epoch": 4.237837837837838,
"grad_norm": 0.35106682777404785,
"learning_rate": 1.2244038180178836e-06,
"loss": 0.0228,
"step": 3920
},
{
"epoch": 4.248648648648649,
"grad_norm": 0.4548755884170532,
"learning_rate": 1.2028066723885611e-06,
"loss": 0.0244,
"step": 3930
},
{
"epoch": 4.2594594594594595,
"grad_norm": 0.3868198096752167,
"learning_rate": 1.1813756282965887e-06,
"loss": 0.0238,
"step": 3940
},
{
"epoch": 4.27027027027027,
"grad_norm": 0.59125816822052,
"learning_rate": 1.1601116232052639e-06,
"loss": 0.0253,
"step": 3950
},
{
"epoch": 4.281081081081081,
"grad_norm": 0.49321606755256653,
"learning_rate": 1.1390155872710517e-06,
"loss": 0.0267,
"step": 3960
},
{
"epoch": 4.291891891891892,
"grad_norm": 0.5783228874206543,
"learning_rate": 1.1180884433029088e-06,
"loss": 0.0246,
"step": 3970
},
{
"epoch": 4.302702702702703,
"grad_norm": 0.5392751097679138,
"learning_rate": 1.097331106721904e-06,
"loss": 0.0221,
"step": 3980
},
{
"epoch": 4.313513513513514,
"grad_norm": 0.44086018204689026,
"learning_rate": 1.076744485521186e-06,
"loss": 0.0242,
"step": 3990
},
{
"epoch": 4.324324324324325,
"grad_norm": 0.6276214122772217,
"learning_rate": 1.0563294802262558e-06,
"loss": 0.027,
"step": 4000
},
{
"epoch": 4.335135135135135,
"grad_norm": 0.586881697177887,
"learning_rate": 1.036086983855581e-06,
"loss": 0.0237,
"step": 4010
},
{
"epoch": 4.345945945945946,
"grad_norm": 0.5574650764465332,
"learning_rate": 1.0160178818815314e-06,
"loss": 0.0239,
"step": 4020
},
{
"epoch": 4.356756756756757,
"grad_norm": 0.45856261253356934,
"learning_rate": 9.961230521916387e-07,
"loss": 0.0237,
"step": 4030
},
{
"epoch": 4.367567567567567,
"grad_norm": 0.6503721475601196,
"learning_rate": 9.764033650502076e-07,
"loss": 0.0257,
"step": 4040
},
{
"epoch": 4.378378378378378,
"grad_norm": 0.4516863226890564,
"learning_rate": 9.568596830602345e-07,
"loss": 0.0216,
"step": 4050
},
{
"epoch": 4.389189189189189,
"grad_norm": 0.6652976870536804,
"learning_rate": 9.374928611256811e-07,
"loss": 0.0298,
"step": 4060
},
{
"epoch": 4.4,
"grad_norm": 0.524724006652832,
"learning_rate": 9.183037464140804e-07,
"loss": 0.0258,
"step": 4070
},
{
"epoch": 4.410810810810811,
"grad_norm": 0.5315241813659668,
"learning_rate": 8.992931783194736e-07,
"loss": 0.0218,
"step": 4080
},
{
"epoch": 4.421621621621622,
"grad_norm": 0.5122963786125183,
"learning_rate": 8.804619884256959e-07,
"loss": 0.0234,
"step": 4090
},
{
"epoch": 4.4324324324324325,
"grad_norm": 0.533340573310852,
"learning_rate": 8.618110004699976e-07,
"loss": 0.0253,
"step": 4100
},
{
"epoch": 4.443243243243243,
"grad_norm": 0.4996013343334198,
"learning_rate": 8.4334103030701e-07,
"loss": 0.0257,
"step": 4110
},
{
"epoch": 4.454054054054054,
"grad_norm": 0.5451450943946838,
"learning_rate": 8.250528858730661e-07,
"loss": 0.0223,
"step": 4120
},
{
"epoch": 4.464864864864865,
"grad_norm": 0.5494437217712402,
"learning_rate": 8.06947367150846e-07,
"loss": 0.0231,
"step": 4130
},
{
"epoch": 4.475675675675676,
"grad_norm": 0.5594083666801453,
"learning_rate": 7.890252661343939e-07,
"loss": 0.0258,
"step": 4140
},
{
"epoch": 4.486486486486487,
"grad_norm": 0.5314719080924988,
"learning_rate": 7.712873667944681e-07,
"loss": 0.0239,
"step": 4150
},
{
"epoch": 4.4972972972972975,
"grad_norm": 0.6392211318016052,
"learning_rate": 7.537344450442469e-07,
"loss": 0.0198,
"step": 4160
},
{
"epoch": 4.508108108108108,
"grad_norm": 0.33640533685684204,
"learning_rate": 7.36367268705393e-07,
"loss": 0.0233,
"step": 4170
},
{
"epoch": 4.518918918918919,
"grad_norm": 0.6524032354354858,
"learning_rate": 7.191865974744599e-07,
"loss": 0.0244,
"step": 4180
},
{
"epoch": 4.52972972972973,
"grad_norm": 0.5606145262718201,
"learning_rate": 7.021931828896666e-07,
"loss": 0.0235,
"step": 4190
},
{
"epoch": 4.54054054054054,
"grad_norm": 0.4877568483352661,
"learning_rate": 6.853877682980159e-07,
"loss": 0.0189,
"step": 4200
},
{
"epoch": 4.551351351351351,
"grad_norm": 0.4097627103328705,
"learning_rate": 6.687710888227849e-07,
"loss": 0.0181,
"step": 4210
},
{
"epoch": 4.562162162162162,
"grad_norm": 0.4605877101421356,
"learning_rate": 6.523438713313656e-07,
"loss": 0.0219,
"step": 4220
},
{
"epoch": 4.572972972972973,
"grad_norm": 0.5829401016235352,
"learning_rate": 6.361068344034665e-07,
"loss": 0.023,
"step": 4230
},
{
"epoch": 4.583783783783784,
"grad_norm": 0.431805819272995,
"learning_rate": 6.200606882996846e-07,
"loss": 0.023,
"step": 4240
},
{
"epoch": 4.594594594594595,
"grad_norm": 0.3898727297782898,
"learning_rate": 6.042061349304313e-07,
"loss": 0.0229,
"step": 4250
},
{
"epoch": 4.605405405405405,
"grad_norm": 0.7912748456001282,
"learning_rate": 5.885438678252342e-07,
"loss": 0.0284,
"step": 4260
},
{
"epoch": 4.616216216216216,
"grad_norm": 0.47734588384628296,
"learning_rate": 5.730745721023939e-07,
"loss": 0.0214,
"step": 4270
},
{
"epoch": 4.627027027027027,
"grad_norm": 0.750998318195343,
"learning_rate": 5.577989244390192e-07,
"loss": 0.0237,
"step": 4280
},
{
"epoch": 4.6378378378378375,
"grad_norm": 0.3779522776603699,
"learning_rate": 5.427175930414264e-07,
"loss": 0.0225,
"step": 4290
},
{
"epoch": 4.648648648648649,
"grad_norm": 0.44585949182510376,
"learning_rate": 5.27831237615905e-07,
"loss": 0.0236,
"step": 4300
},
{
"epoch": 4.65945945945946,
"grad_norm": 0.38612884283065796,
"learning_rate": 5.131405093398694e-07,
"loss": 0.0204,
"step": 4310
},
{
"epoch": 4.6702702702702705,
"grad_norm": 0.4791419804096222,
"learning_rate": 4.986460508333635e-07,
"loss": 0.0203,
"step": 4320
},
{
"epoch": 4.681081081081081,
"grad_norm": 0.28379178047180176,
"learning_rate": 4.843484961309597e-07,
"loss": 0.0213,
"step": 4330
},
{
"epoch": 4.691891891891892,
"grad_norm": 0.628464937210083,
"learning_rate": 4.7024847065401614e-07,
"loss": 0.0235,
"step": 4340
},
{
"epoch": 4.702702702702703,
"grad_norm": 0.3909755349159241,
"learning_rate": 4.5634659118332594e-07,
"loss": 0.0221,
"step": 4350
},
{
"epoch": 4.713513513513513,
"grad_norm": 0.5916475653648376,
"learning_rate": 4.4264346583213435e-07,
"loss": 0.0196,
"step": 4360
},
{
"epoch": 4.724324324324324,
"grad_norm": 0.6053293943405151,
"learning_rate": 4.291396940195347e-07,
"loss": 0.0237,
"step": 4370
},
{
"epoch": 4.735135135135135,
"grad_norm": 0.5997756123542786,
"learning_rate": 4.15835866444253e-07,
"loss": 0.0214,
"step": 4380
},
{
"epoch": 4.745945945945946,
"grad_norm": 0.2547402083873749,
"learning_rate": 4.027325650588043e-07,
"loss": 0.024,
"step": 4390
},
{
"epoch": 4.756756756756757,
"grad_norm": 0.5118744969367981,
"learning_rate": 3.89830363044042e-07,
"loss": 0.0284,
"step": 4400
},
{
"epoch": 4.767567567567568,
"grad_norm": 0.5891205072402954,
"learning_rate": 3.771298247840788e-07,
"loss": 0.0265,
"step": 4410
},
{
"epoch": 4.778378378378378,
"grad_norm": 0.5021476149559021,
"learning_rate": 3.6463150584160056e-07,
"loss": 0.0241,
"step": 4420
},
{
"epoch": 4.789189189189189,
"grad_norm": 0.589209794998169,
"learning_rate": 3.523359529335696e-07,
"loss": 0.0271,
"step": 4430
},
{
"epoch": 4.8,
"grad_norm": 0.52953040599823,
"learning_rate": 3.402437039073003e-07,
"loss": 0.0274,
"step": 4440
},
{
"epoch": 4.8108108108108105,
"grad_norm": 0.32550954818725586,
"learning_rate": 3.283552877169399e-07,
"loss": 0.0212,
"step": 4450
},
{
"epoch": 4.821621621621622,
"grad_norm": 0.7259662747383118,
"learning_rate": 3.1667122440032506e-07,
"loss": 0.028,
"step": 4460
},
{
"epoch": 4.832432432432433,
"grad_norm": 0.39751487970352173,
"learning_rate": 3.0519202505623513e-07,
"loss": 0.0211,
"step": 4470
},
{
"epoch": 4.8432432432432435,
"grad_norm": 0.39766958355903625,
"learning_rate": 2.939181918220385e-07,
"loss": 0.0236,
"step": 4480
},
{
"epoch": 4.854054054054054,
"grad_norm": 0.40745213627815247,
"learning_rate": 2.828502178517223e-07,
"loss": 0.026,
"step": 4490
},
{
"epoch": 4.864864864864865,
"grad_norm": 0.6851739287376404,
"learning_rate": 2.719885872943229e-07,
"loss": 0.022,
"step": 4500
},
{
"epoch": 4.875675675675676,
"grad_norm": 0.8467708230018616,
"learning_rate": 2.6133377527274906e-07,
"loss": 0.0293,
"step": 4510
},
{
"epoch": 4.886486486486486,
"grad_norm": 0.5121775269508362,
"learning_rate": 2.508862478629936e-07,
"loss": 0.0249,
"step": 4520
},
{
"epoch": 4.897297297297297,
"grad_norm": 0.46212148666381836,
"learning_rate": 2.4064646207375307e-07,
"loss": 0.0259,
"step": 4530
},
{
"epoch": 4.908108108108108,
"grad_norm": 0.41044148802757263,
"learning_rate": 2.3061486582642733e-07,
"loss": 0.0226,
"step": 4540
},
{
"epoch": 4.918918918918919,
"grad_norm": 0.3751397132873535,
"learning_rate": 2.2079189793553667e-07,
"loss": 0.0178,
"step": 4550
},
{
"epoch": 4.92972972972973,
"grad_norm": 0.6501022577285767,
"learning_rate": 2.111779880895165e-07,
"loss": 0.0194,
"step": 4560
},
{
"epoch": 4.940540540540541,
"grad_norm": 0.4015115201473236,
"learning_rate": 2.01773556831929e-07,
"loss": 0.0232,
"step": 4570
},
{
"epoch": 4.951351351351351,
"grad_norm": 0.40367844700813293,
"learning_rate": 1.9257901554306514e-07,
"loss": 0.0229,
"step": 4580
},
{
"epoch": 4.962162162162162,
"grad_norm": 0.39636364579200745,
"learning_rate": 1.835947664219445e-07,
"loss": 0.0232,
"step": 4590
},
{
"epoch": 4.972972972972973,
"grad_norm": 0.49914997816085815,
"learning_rate": 1.748212024687307e-07,
"loss": 0.0188,
"step": 4600
},
{
"epoch": 4.9837837837837835,
"grad_norm": 0.32945120334625244,
"learning_rate": 1.6625870746753148e-07,
"loss": 0.0261,
"step": 4610
},
{
"epoch": 4.994594594594595,
"grad_norm": 0.5340284705162048,
"learning_rate": 1.5790765596961855e-07,
"loss": 0.0264,
"step": 4620
},
{
"epoch": 5.005405405405406,
"grad_norm": 0.5107668042182922,
"learning_rate": 1.4976841327703717e-07,
"loss": 0.02,
"step": 4630
},
{
"epoch": 5.0162162162162165,
"grad_norm": 0.5458788275718689,
"learning_rate": 1.4184133542663014e-07,
"loss": 0.0245,
"step": 4640
},
{
"epoch": 5.027027027027027,
"grad_norm": 0.4814177453517914,
"learning_rate": 1.341267691744641e-07,
"loss": 0.0225,
"step": 4650
},
{
"epoch": 5.037837837837838,
"grad_norm": 0.48016414046287537,
"learning_rate": 1.2662505198065667e-07,
"loss": 0.0281,
"step": 4660
},
{
"epoch": 5.048648648648649,
"grad_norm": 0.3474937081336975,
"learning_rate": 1.193365119946216e-07,
"loss": 0.0226,
"step": 4670
},
{
"epoch": 5.059459459459459,
"grad_norm": 0.46155011653900146,
"learning_rate": 1.122614680407086e-07,
"loss": 0.0169,
"step": 4680
},
{
"epoch": 5.07027027027027,
"grad_norm": 0.4519842863082886,
"learning_rate": 1.054002296042611e-07,
"loss": 0.0253,
"step": 4690
},
{
"epoch": 5.081081081081081,
"grad_norm": 0.43510937690734863,
"learning_rate": 9.875309681807443e-08,
"loss": 0.0273,
"step": 4700
},
{
"epoch": 5.091891891891892,
"grad_norm": 0.5046071410179138,
"learning_rate": 9.232036044927062e-08,
"loss": 0.023,
"step": 4710
},
{
"epoch": 5.102702702702703,
"grad_norm": 0.5468009114265442,
"learning_rate": 8.61023018865792e-08,
"loss": 0.026,
"step": 4720
},
{
"epoch": 5.113513513513514,
"grad_norm": 0.5225476026535034,
"learning_rate": 8.009919312802372e-08,
"loss": 0.0234,
"step": 4730
},
{
"epoch": 5.124324324324324,
"grad_norm": 0.49468499422073364,
"learning_rate": 7.431129676902905e-08,
"loss": 0.0185,
"step": 4740
},
{
"epoch": 5.135135135135135,
"grad_norm": 0.5294644236564636,
"learning_rate": 6.873886599093216e-08,
"loss": 0.0244,
"step": 4750
},
{
"epoch": 5.145945945945946,
"grad_norm": 0.4790719151496887,
"learning_rate": 6.338214454990776e-08,
"loss": 0.0225,
"step": 4760
},
{
"epoch": 5.1567567567567565,
"grad_norm": 0.44608423113822937,
"learning_rate": 5.82413667663051e-08,
"loss": 0.0224,
"step": 4770
},
{
"epoch": 5.167567567567567,
"grad_norm": 0.444612592458725,
"learning_rate": 5.3316757514397245e-08,
"loss": 0.025,
"step": 4780
},
{
"epoch": 5.178378378378379,
"grad_norm": 0.5216858386993408,
"learning_rate": 4.860853221254791e-08,
"loss": 0.0202,
"step": 4790
},
{
"epoch": 5.1891891891891895,
"grad_norm": 0.510924756526947,
"learning_rate": 4.411689681378284e-08,
"loss": 0.0255,
"step": 4800
},
{
"epoch": 5.2,
"grad_norm": 0.36953872442245483,
"learning_rate": 3.984204779678646e-08,
"loss": 0.0197,
"step": 4810
},
{
"epoch": 5.210810810810811,
"grad_norm": 0.6091222763061523,
"learning_rate": 3.578417215730323e-08,
"loss": 0.0188,
"step": 4820
},
{
"epoch": 5.221621621621622,
"grad_norm": 0.4898870289325714,
"learning_rate": 3.194344739995803e-08,
"loss": 0.0239,
"step": 4830
},
{
"epoch": 5.232432432432432,
"grad_norm": 0.4123137593269348,
"learning_rate": 2.8320041530495724e-08,
"loss": 0.0317,
"step": 4840
},
{
"epoch": 5.243243243243243,
"grad_norm": 0.5520591735839844,
"learning_rate": 2.4914113048425393e-08,
"loss": 0.0225,
"step": 4850
},
{
"epoch": 5.254054054054054,
"grad_norm": 0.4832994043827057,
"learning_rate": 2.1725810940094182e-08,
"loss": 0.0259,
"step": 4860
},
{
"epoch": 5.264864864864865,
"grad_norm": 0.4356898069381714,
"learning_rate": 1.8755274672164202e-08,
"loss": 0.0237,
"step": 4870
},
{
"epoch": 5.275675675675676,
"grad_norm": 0.41619211435317993,
"learning_rate": 1.600263418551573e-08,
"loss": 0.0241,
"step": 4880
},
{
"epoch": 5.286486486486487,
"grad_norm": 0.6041860580444336,
"learning_rate": 1.3468009889559541e-08,
"loss": 0.0242,
"step": 4890
},
{
"epoch": 5.297297297297297,
"grad_norm": 0.5045064687728882,
"learning_rate": 1.1151512656975006e-08,
"loss": 0.0267,
"step": 4900
},
{
"epoch": 5.308108108108108,
"grad_norm": 0.270536333322525,
"learning_rate": 9.053243818853974e-09,
"loss": 0.0214,
"step": 4910
},
{
"epoch": 5.318918918918919,
"grad_norm": 0.643278181552887,
"learning_rate": 7.173295160273763e-09,
"loss": 0.0222,
"step": 4920
},
{
"epoch": 5.3297297297297295,
"grad_norm": 0.44826021790504456,
"learning_rate": 5.511748916279258e-09,
"loss": 0.0253,
"step": 4930
},
{
"epoch": 5.34054054054054,
"grad_norm": 0.4237731993198395,
"learning_rate": 4.068677768285234e-09,
"loss": 0.0219,
"step": 4940
},
{
"epoch": 5.351351351351352,
"grad_norm": 0.4591931700706482,
"learning_rate": 2.844144840898344e-09,
"loss": 0.0239,
"step": 4950
},
{
"epoch": 5.3621621621621625,
"grad_norm": 0.4851166009902954,
"learning_rate": 1.8382036991559938e-09,
"loss": 0.0193,
"step": 4960
},
{
"epoch": 5.372972972972973,
"grad_norm": 0.5183833241462708,
"learning_rate": 1.0508983461832156e-09,
"loss": 0.0232,
"step": 4970
},
{
"epoch": 5.383783783783784,
"grad_norm": 0.4397503435611725,
"learning_rate": 4.822632212653222e-10,
"loss": 0.0255,
"step": 4980
},
{
"epoch": 5.394594594594595,
"grad_norm": 0.5476964712142944,
"learning_rate": 1.323231983463291e-10,
"loss": 0.0207,
"step": 4990
},
{
"epoch": 5.405405405405405,
"grad_norm": 0.46734490990638733,
"learning_rate": 1.0935849353854722e-12,
"loss": 0.0245,
"step": 5000
}
],
"logging_steps": 10,
"max_steps": 5000,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}