{
"best_metric": 2.9280490707606077e-05,
"best_model_checkpoint": "./results/models/checkpoint-156240",
"epoch": 8.0,
"eval_steps": 500,
"global_step": 156240,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.025601638504864313,
"grad_norm": 0.0004177093505859375,
"learning_rate": 0.0009994879672299028,
"loss": 0.023,
"step": 500
},
{
"epoch": 0.051203277009728626,
"grad_norm": 0.000274658203125,
"learning_rate": 0.0009989759344598056,
"loss": 0.0,
"step": 1000
},
{
"epoch": 0.07680491551459294,
"grad_norm": 0.000270843505859375,
"learning_rate": 0.0009984639016897081,
"loss": 0.0,
"step": 1500
},
{
"epoch": 0.10240655401945725,
"grad_norm": 0.0002231597900390625,
"learning_rate": 0.000997951868919611,
"loss": 0.0,
"step": 2000
},
{
"epoch": 0.12800819252432155,
"grad_norm": 0.00021457672119140625,
"learning_rate": 0.0009974398361495137,
"loss": 0.0,
"step": 2500
},
{
"epoch": 0.15360983102918588,
"grad_norm": 0.00020694732666015625,
"learning_rate": 0.0009969278033794163,
"loss": 0.0,
"step": 3000
},
{
"epoch": 0.17921146953405018,
"grad_norm": 0.00020313262939453125,
"learning_rate": 0.000996415770609319,
"loss": 0.0,
"step": 3500
},
{
"epoch": 0.2048131080389145,
"grad_norm": 0.000202178955078125,
"learning_rate": 0.0009959037378392218,
"loss": 0.0,
"step": 4000
},
{
"epoch": 0.2304147465437788,
"grad_norm": 0.0001926422119140625,
"learning_rate": 0.0009953917050691244,
"loss": 0.0,
"step": 4500
},
{
"epoch": 0.2560163850486431,
"grad_norm": 0.0024566650390625,
"learning_rate": 0.0009948796722990272,
"loss": 0.0813,
"step": 5000
},
{
"epoch": 0.2816180235535074,
"grad_norm": 0.0025482177734375,
"learning_rate": 0.00099436763952893,
"loss": 0.001,
"step": 5500
},
{
"epoch": 0.30721966205837176,
"grad_norm": 0.00048828125,
"learning_rate": 0.0009938556067588325,
"loss": 0.0005,
"step": 6000
},
{
"epoch": 0.33282130056323606,
"grad_norm": 0.00058746337890625,
"learning_rate": 0.0009933435739887353,
"loss": 0.0004,
"step": 6500
},
{
"epoch": 0.35842293906810035,
"grad_norm": 0.000614166259765625,
"learning_rate": 0.000992831541218638,
"loss": 0.0004,
"step": 7000
},
{
"epoch": 0.38402457757296465,
"grad_norm": 0.014404296875,
"learning_rate": 0.0009923195084485406,
"loss": 0.0002,
"step": 7500
},
{
"epoch": 0.409626216077829,
"grad_norm": 0.000820159912109375,
"learning_rate": 0.0009918074756784434,
"loss": 0.0002,
"step": 8000
},
{
"epoch": 0.4352278545826933,
"grad_norm": 0.000713348388671875,
"learning_rate": 0.0009912954429083462,
"loss": 0.0002,
"step": 8500
},
{
"epoch": 0.4608294930875576,
"grad_norm": 0.00057220458984375,
"learning_rate": 0.000990783410138249,
"loss": 0.0002,
"step": 9000
},
{
"epoch": 0.4864311315924219,
"grad_norm": 0.0026702880859375,
"learning_rate": 0.0009902713773681515,
"loss": 0.0002,
"step": 9500
},
{
"epoch": 0.5120327700972862,
"grad_norm": 0.0026702880859375,
"learning_rate": 0.0009897593445980543,
"loss": 0.0001,
"step": 10000
},
{
"epoch": 0.5376344086021505,
"grad_norm": 0.0005035400390625,
"learning_rate": 0.000989247311827957,
"loss": 0.0001,
"step": 10500
},
{
"epoch": 0.5632360471070148,
"grad_norm": 0.00052642822265625,
"learning_rate": 0.0009887352790578599,
"loss": 0.0002,
"step": 11000
},
{
"epoch": 0.5888376856118792,
"grad_norm": 0.000667572021484375,
"learning_rate": 0.0009882232462877624,
"loss": 0.0001,
"step": 11500
},
{
"epoch": 0.6144393241167435,
"grad_norm": 0.0003376007080078125,
"learning_rate": 0.0009877112135176652,
"loss": 0.0001,
"step": 12000
},
{
"epoch": 0.6400409626216078,
"grad_norm": 0.00029754638671875,
"learning_rate": 0.000987199180747568,
"loss": 0.0001,
"step": 12500
},
{
"epoch": 0.6656426011264721,
"grad_norm": 0.000308990478515625,
"learning_rate": 0.0009866871479774705,
"loss": 0.0001,
"step": 13000
},
{
"epoch": 0.6912442396313364,
"grad_norm": 0.0003147125244140625,
"learning_rate": 0.0009861751152073733,
"loss": 0.0001,
"step": 13500
},
{
"epoch": 0.7168458781362007,
"grad_norm": 0.0004367828369140625,
"learning_rate": 0.0009856630824372759,
"loss": 0.0001,
"step": 14000
},
{
"epoch": 0.742447516641065,
"grad_norm": 0.0002880096435546875,
"learning_rate": 0.0009851510496671787,
"loss": 0.0001,
"step": 14500
},
{
"epoch": 0.7680491551459293,
"grad_norm": 0.000274658203125,
"learning_rate": 0.0009846390168970814,
"loss": 0.0001,
"step": 15000
},
{
"epoch": 0.7936507936507936,
"grad_norm": 0.0003528594970703125,
"learning_rate": 0.000984126984126984,
"loss": 0.0001,
"step": 15500
},
{
"epoch": 0.819252432155658,
"grad_norm": 0.0002880096435546875,
"learning_rate": 0.0009836149513568868,
"loss": 0.0001,
"step": 16000
},
{
"epoch": 0.8448540706605223,
"grad_norm": 0.0002536773681640625,
"learning_rate": 0.0009831029185867896,
"loss": 0.0001,
"step": 16500
},
{
"epoch": 0.8704557091653866,
"grad_norm": 0.0003509521484375,
"learning_rate": 0.0009825908858166923,
"loss": 0.0001,
"step": 17000
},
{
"epoch": 0.8960573476702509,
"grad_norm": 0.000244140625,
"learning_rate": 0.0009820788530465951,
"loss": 0.0,
"step": 17500
},
{
"epoch": 0.9216589861751152,
"grad_norm": 0.000568389892578125,
"learning_rate": 0.0009815668202764977,
"loss": 0.0007,
"step": 18000
},
{
"epoch": 0.9472606246799795,
"grad_norm": 0.0005645751953125,
"learning_rate": 0.0009810547875064005,
"loss": 0.0001,
"step": 18500
},
{
"epoch": 0.9728622631848438,
"grad_norm": 0.00037384033203125,
"learning_rate": 0.0009805427547363032,
"loss": 0.0001,
"step": 19000
},
{
"epoch": 0.9984639016897081,
"grad_norm": 0.000507354736328125,
"learning_rate": 0.000980030721966206,
"loss": 0.0001,
"step": 19500
},
{
"epoch": 1.0,
"eval_loss": 6.09988892392721e-05,
"eval_runtime": 0.5651,
"eval_samples_per_second": 1769.543,
"eval_steps_per_second": 3.539,
"step": 19530
},
{
"epoch": 1.0240655401945724,
"grad_norm": 0.00133514404296875,
"learning_rate": 0.0009795186891961086,
"loss": 0.0001,
"step": 20000
},
{
"epoch": 1.0496671786994367,
"grad_norm": 0.0004024505615234375,
"learning_rate": 0.0009790066564260114,
"loss": 0.0001,
"step": 20500
},
{
"epoch": 1.075268817204301,
"grad_norm": 0.00031280517578125,
"learning_rate": 0.000978494623655914,
"loss": 0.0002,
"step": 21000
},
{
"epoch": 1.1008704557091653,
"grad_norm": 0.0002689361572265625,
"learning_rate": 0.0009779825908858167,
"loss": 0.0001,
"step": 21500
},
{
"epoch": 1.1264720942140296,
"grad_norm": 0.00023365020751953125,
"learning_rate": 0.0009774705581157195,
"loss": 0.0001,
"step": 22000
},
{
"epoch": 1.1520737327188941,
"grad_norm": 0.00032806396484375,
"learning_rate": 0.000976958525345622,
"loss": 0.0001,
"step": 22500
},
{
"epoch": 1.1776753712237582,
"grad_norm": 0.000274658203125,
"learning_rate": 0.0009764464925755249,
"loss": 0.0001,
"step": 23000
},
{
"epoch": 1.2032770097286227,
"grad_norm": 0.0002651214599609375,
"learning_rate": 0.0009759344598054276,
"loss": 0.0001,
"step": 23500
},
{
"epoch": 1.228878648233487,
"grad_norm": 0.00029754638671875,
"learning_rate": 0.0009754224270353303,
"loss": 0.0001,
"step": 24000
},
{
"epoch": 1.2544802867383513,
"grad_norm": 0.000308990478515625,
"learning_rate": 0.0009749103942652329,
"loss": 0.0001,
"step": 24500
},
{
"epoch": 1.2800819252432156,
"grad_norm": 0.000263214111328125,
"learning_rate": 0.0009743983614951357,
"loss": 0.0001,
"step": 25000
},
{
"epoch": 1.30568356374808,
"grad_norm": 0.00074005126953125,
"learning_rate": 0.0009738863287250385,
"loss": 0.0001,
"step": 25500
},
{
"epoch": 1.3312852022529442,
"grad_norm": 0.000354766845703125,
"learning_rate": 0.000973374295954941,
"loss": 0.0001,
"step": 26000
},
{
"epoch": 1.3568868407578085,
"grad_norm": 0.0003108978271484375,
"learning_rate": 0.0009728622631848438,
"loss": 0.0001,
"step": 26500
},
{
"epoch": 1.3824884792626728,
"grad_norm": 0.000278472900390625,
"learning_rate": 0.0009723502304147466,
"loss": 0.0001,
"step": 27000
},
{
"epoch": 1.4080901177675371,
"grad_norm": 0.000293731689453125,
"learning_rate": 0.0009718381976446493,
"loss": 0.0,
"step": 27500
},
{
"epoch": 1.4336917562724014,
"grad_norm": 0.000263214111328125,
"learning_rate": 0.000971326164874552,
"loss": 0.0,
"step": 28000
},
{
"epoch": 1.4592933947772657,
"grad_norm": 0.000293731689453125,
"learning_rate": 0.0009708141321044547,
"loss": 0.0,
"step": 28500
},
{
"epoch": 1.48489503328213,
"grad_norm": 0.000278472900390625,
"learning_rate": 0.0009703020993343574,
"loss": 0.0001,
"step": 29000
},
{
"epoch": 1.5104966717869943,
"grad_norm": 0.00030517578125,
"learning_rate": 0.0009697900665642602,
"loss": 0.0001,
"step": 29500
},
{
"epoch": 1.5360983102918588,
"grad_norm": 0.0002899169921875,
"learning_rate": 0.0009692780337941628,
"loss": 0.0,
"step": 30000
},
{
"epoch": 1.561699948796723,
"grad_norm": 0.000766754150390625,
"learning_rate": 0.0009687660010240655,
"loss": 0.0,
"step": 30500
},
{
"epoch": 1.5873015873015874,
"grad_norm": 0.00032806396484375,
"learning_rate": 0.0009682539682539683,
"loss": 0.0001,
"step": 31000
},
{
"epoch": 1.6129032258064515,
"grad_norm": 0.000301361083984375,
"learning_rate": 0.000967741935483871,
"loss": 0.0001,
"step": 31500
},
{
"epoch": 1.638504864311316,
"grad_norm": 0.003997802734375,
"learning_rate": 0.0009672299027137736,
"loss": 0.0001,
"step": 32000
},
{
"epoch": 1.66410650281618,
"grad_norm": 0.0003452301025390625,
"learning_rate": 0.0009667178699436764,
"loss": 0.0001,
"step": 32500
},
{
"epoch": 1.6897081413210446,
"grad_norm": 0.0002918243408203125,
"learning_rate": 0.0009662058371735791,
"loss": 0.0001,
"step": 33000
},
{
"epoch": 1.7153097798259087,
"grad_norm": 0.0002422332763671875,
"learning_rate": 0.0009656938044034819,
"loss": 0.0001,
"step": 33500
},
{
"epoch": 1.7409114183307732,
"grad_norm": 0.00124359130859375,
"learning_rate": 0.0009651817716333846,
"loss": 0.0,
"step": 34000
},
{
"epoch": 1.7665130568356375,
"grad_norm": 0.0009002685546875,
"learning_rate": 0.0009646697388632872,
"loss": 0.0001,
"step": 34500
},
{
"epoch": 1.7921146953405018,
"grad_norm": 0.00030517578125,
"learning_rate": 0.00096415770609319,
"loss": 0.0,
"step": 35000
},
{
"epoch": 1.8177163338453661,
"grad_norm": 0.00041961669921875,
"learning_rate": 0.0009636456733230928,
"loss": 0.0001,
"step": 35500
},
{
"epoch": 1.8433179723502304,
"grad_norm": 0.0003108978271484375,
"learning_rate": 0.0009631336405529954,
"loss": 0.0,
"step": 36000
},
{
"epoch": 1.8689196108550947,
"grad_norm": 0.0002727508544921875,
"learning_rate": 0.0009626216077828981,
"loss": 0.0,
"step": 36500
},
{
"epoch": 1.894521249359959,
"grad_norm": 0.0002613067626953125,
"learning_rate": 0.0009621095750128009,
"loss": 0.0,
"step": 37000
},
{
"epoch": 1.9201228878648233,
"grad_norm": 0.0003204345703125,
"learning_rate": 0.0009615975422427036,
"loss": 0.0,
"step": 37500
},
{
"epoch": 1.9457245263696876,
"grad_norm": 0.000255584716796875,
"learning_rate": 0.0009610855094726063,
"loss": 0.0,
"step": 38000
},
{
"epoch": 1.971326164874552,
"grad_norm": 0.000263214111328125,
"learning_rate": 0.0009605734767025089,
"loss": 0.0,
"step": 38500
},
{
"epoch": 1.9969278033794162,
"grad_norm": 0.00048828125,
"learning_rate": 0.0009600614439324117,
"loss": 0.0001,
"step": 39000
},
{
"epoch": 2.0,
"eval_loss": 3.46598717442248e-05,
"eval_runtime": 0.5684,
"eval_samples_per_second": 1759.346,
"eval_steps_per_second": 3.519,
"step": 39060
},
{
"epoch": 2.0225294418842807,
"grad_norm": 0.000423431396484375,
"learning_rate": 0.0009595494111623145,
"loss": 0.0,
"step": 39500
},
{
"epoch": 2.048131080389145,
"grad_norm": 0.0002899169921875,
"learning_rate": 0.0009590373783922171,
"loss": 0.0,
"step": 40000
},
{
"epoch": 2.0737327188940093,
"grad_norm": 0.0002593994140625,
"learning_rate": 0.0009585253456221198,
"loss": 0.0,
"step": 40500
},
{
"epoch": 2.0993343573988734,
"grad_norm": 0.00042724609375,
"learning_rate": 0.0009580133128520226,
"loss": 0.0,
"step": 41000
},
{
"epoch": 2.124935995903738,
"grad_norm": 0.00032806396484375,
"learning_rate": 0.0009575012800819252,
"loss": 0.0,
"step": 41500
},
{
"epoch": 2.150537634408602,
"grad_norm": 0.0002918243408203125,
"learning_rate": 0.000956989247311828,
"loss": 0.0,
"step": 42000
},
{
"epoch": 2.1761392729134665,
"grad_norm": 0.00250244140625,
"learning_rate": 0.0009564772145417307,
"loss": 0.0,
"step": 42500
},
{
"epoch": 2.2017409114183306,
"grad_norm": 0.0002803802490234375,
"learning_rate": 0.0009559651817716334,
"loss": 0.0,
"step": 43000
},
{
"epoch": 2.227342549923195,
"grad_norm": 0.0003681182861328125,
"learning_rate": 0.0009554531490015361,
"loss": 0.0,
"step": 43500
},
{
"epoch": 2.252944188428059,
"grad_norm": 0.0002689361572265625,
"learning_rate": 0.0009549411162314389,
"loss": 0.0,
"step": 44000
},
{
"epoch": 2.2785458269329237,
"grad_norm": 0.000370025634765625,
"learning_rate": 0.0009544290834613415,
"loss": 0.0001,
"step": 44500
},
{
"epoch": 2.3041474654377883,
"grad_norm": 0.0002536773681640625,
"learning_rate": 0.0009539170506912443,
"loss": 0.0,
"step": 45000
},
{
"epoch": 2.3297491039426523,
"grad_norm": 0.000591278076171875,
"learning_rate": 0.0009534050179211469,
"loss": 0.0,
"step": 45500
},
{
"epoch": 2.3553507424475164,
"grad_norm": 0.000514984130859375,
"learning_rate": 0.0009528929851510497,
"loss": 0.0,
"step": 46000
},
{
"epoch": 2.380952380952381,
"grad_norm": 0.00025177001953125,
"learning_rate": 0.0009523809523809524,
"loss": 0.0,
"step": 46500
},
{
"epoch": 2.4065540194572455,
"grad_norm": 0.0002384185791015625,
"learning_rate": 0.000951868919610855,
"loss": 0.0,
"step": 47000
},
{
"epoch": 2.4321556579621095,
"grad_norm": 0.000247955322265625,
"learning_rate": 0.0009513568868407578,
"loss": 0.0,
"step": 47500
},
{
"epoch": 2.457757296466974,
"grad_norm": 0.000240325927734375,
"learning_rate": 0.0009508448540706606,
"loss": 0.0,
"step": 48000
},
{
"epoch": 2.483358934971838,
"grad_norm": 0.000274658203125,
"learning_rate": 0.0009503328213005633,
"loss": 0.0,
"step": 48500
},
{
"epoch": 2.5089605734767026,
"grad_norm": 0.0002269744873046875,
"learning_rate": 0.000949820788530466,
"loss": 0.0,
"step": 49000
},
{
"epoch": 2.5345622119815667,
"grad_norm": 0.0003204345703125,
"learning_rate": 0.0009493087557603687,
"loss": 0.0,
"step": 49500
},
{
"epoch": 2.5601638504864312,
"grad_norm": 0.00787353515625,
"learning_rate": 0.0009487967229902714,
"loss": 0.0,
"step": 50000
},
{
"epoch": 2.5857654889912953,
"grad_norm": 0.00022983551025390625,
"learning_rate": 0.0009482846902201742,
"loss": 0.0,
"step": 50500
},
{
"epoch": 2.61136712749616,
"grad_norm": 0.0002651214599609375,
"learning_rate": 0.0009477726574500767,
"loss": 0.0,
"step": 51000
},
{
"epoch": 2.636968766001024,
"grad_norm": 0.0002346038818359375,
"learning_rate": 0.0009472606246799795,
"loss": 0.0,
"step": 51500
},
{
"epoch": 2.6625704045058884,
"grad_norm": 0.00021839141845703125,
"learning_rate": 0.0009467485919098823,
"loss": 0.0,
"step": 52000
},
{
"epoch": 2.688172043010753,
"grad_norm": 0.0004177093505859375,
"learning_rate": 0.000946236559139785,
"loss": 0.0,
"step": 52500
},
{
"epoch": 2.713773681515617,
"grad_norm": 0.000247955322265625,
"learning_rate": 0.0009457245263696876,
"loss": 0.0,
"step": 53000
},
{
"epoch": 2.739375320020481,
"grad_norm": 0.0002269744873046875,
"learning_rate": 0.0009452124935995904,
"loss": 0.0,
"step": 53500
},
{
"epoch": 2.7649769585253456,
"grad_norm": 0.00025177001953125,
"learning_rate": 0.0009447004608294931,
"loss": 0.0,
"step": 54000
},
{
"epoch": 2.79057859703021,
"grad_norm": 0.0002536773681640625,
"learning_rate": 0.0009441884280593959,
"loss": 0.0,
"step": 54500
},
{
"epoch": 2.8161802355350742,
"grad_norm": 0.0002346038818359375,
"learning_rate": 0.0009436763952892985,
"loss": 0.0,
"step": 55000
},
{
"epoch": 2.8417818740399383,
"grad_norm": 0.0013427734375,
"learning_rate": 0.0009431643625192012,
"loss": 0.0,
"step": 55500
},
{
"epoch": 2.867383512544803,
"grad_norm": 0.0003528594970703125,
"learning_rate": 0.000942652329749104,
"loss": 0.0,
"step": 56000
},
{
"epoch": 2.8929851510496674,
"grad_norm": 0.00023746490478515625,
"learning_rate": 0.0009421402969790068,
"loss": 0.0,
"step": 56500
},
{
"epoch": 2.9185867895545314,
"grad_norm": 0.00055694580078125,
"learning_rate": 0.0009416282642089093,
"loss": 0.0,
"step": 57000
},
{
"epoch": 2.944188428059396,
"grad_norm": 0.0002307891845703125,
"learning_rate": 0.0009411162314388121,
"loss": 0.0,
"step": 57500
},
{
"epoch": 2.96979006656426,
"grad_norm": 0.000286102294921875,
"learning_rate": 0.0009406041986687148,
"loss": 0.0,
"step": 58000
},
{
"epoch": 2.9953917050691246,
"grad_norm": 0.00019550323486328125,
"learning_rate": 0.0009400921658986176,
"loss": 0.0,
"step": 58500
},
{
"epoch": 3.0,
"eval_loss": 3.896626367350109e-05,
"eval_runtime": 0.5618,
"eval_samples_per_second": 1780.014,
"eval_steps_per_second": 3.56,
"step": 58590
},
{
"epoch": 3.0209933435739886,
"grad_norm": 0.000255584716796875,
"learning_rate": 0.0009395801331285202,
"loss": 0.0,
"step": 59000
},
{
"epoch": 3.046594982078853,
"grad_norm": 0.000209808349609375,
"learning_rate": 0.0009390681003584229,
"loss": 0.0,
"step": 59500
},
{
"epoch": 3.0721966205837172,
"grad_norm": 0.00021457672119140625,
"learning_rate": 0.0009385560675883257,
"loss": 0.0,
"step": 60000
},
{
"epoch": 3.0977982590885818,
"grad_norm": 0.0013580322265625,
"learning_rate": 0.0009380440348182285,
"loss": 0.0,
"step": 60500
},
{
"epoch": 3.123399897593446,
"grad_norm": 0.0002002716064453125,
"learning_rate": 0.000937532002048131,
"loss": 0.0,
"step": 61000
},
{
"epoch": 3.1490015360983103,
"grad_norm": 0.0002727508544921875,
"learning_rate": 0.0009370199692780338,
"loss": 0.0,
"step": 61500
},
{
"epoch": 3.1746031746031744,
"grad_norm": 0.0003662109375,
"learning_rate": 0.0009365079365079366,
"loss": 0.0,
"step": 62000
},
{
"epoch": 3.200204813108039,
"grad_norm": 0.0002040863037109375,
"learning_rate": 0.0009359959037378392,
"loss": 0.0,
"step": 62500
},
{
"epoch": 3.225806451612903,
"grad_norm": 0.00020694732666015625,
"learning_rate": 0.0009354838709677419,
"loss": 0.0,
"step": 63000
},
{
"epoch": 3.2514080901177675,
"grad_norm": 0.000213623046875,
"learning_rate": 0.0009349718381976447,
"loss": 0.0,
"step": 63500
},
{
"epoch": 3.277009728622632,
"grad_norm": 0.00020313262939453125,
"learning_rate": 0.0009344598054275474,
"loss": 0.0,
"step": 64000
},
{
"epoch": 3.302611367127496,
"grad_norm": 0.0001983642578125,
"learning_rate": 0.0009339477726574501,
"loss": 0.0,
"step": 64500
},
{
"epoch": 3.32821300563236,
"grad_norm": 0.0002117156982421875,
"learning_rate": 0.0009334357398873528,
"loss": 0.0,
"step": 65000
},
{
"epoch": 3.3538146441372247,
"grad_norm": 0.0004444122314453125,
"learning_rate": 0.0009329237071172555,
"loss": 0.0,
"step": 65500
},
{
"epoch": 3.3794162826420893,
"grad_norm": 0.00024318695068359375,
"learning_rate": 0.0009324116743471583,
"loss": 0.0,
"step": 66000
},
{
"epoch": 3.4050179211469533,
"grad_norm": 0.004608154296875,
"learning_rate": 0.0009318996415770609,
"loss": 0.0,
"step": 66500
},
{
"epoch": 3.430619559651818,
"grad_norm": 0.00022029876708984375,
"learning_rate": 0.0009313876088069637,
"loss": 0.0,
"step": 67000
},
{
"epoch": 3.456221198156682,
"grad_norm": 0.00021648406982421875,
"learning_rate": 0.0009308755760368664,
"loss": 0.0,
"step": 67500
},
{
"epoch": 3.4818228366615465,
"grad_norm": 0.0002841949462890625,
"learning_rate": 0.000930363543266769,
"loss": 0.0,
"step": 68000
},
{
"epoch": 3.5074244751664105,
"grad_norm": 0.0002269744873046875,
"learning_rate": 0.0009298515104966718,
"loss": 0.0,
"step": 68500
},
{
"epoch": 3.533026113671275,
"grad_norm": 0.00021648406982421875,
"learning_rate": 0.0009293394777265746,
"loss": 0.0,
"step": 69000
},
{
"epoch": 3.558627752176139,
"grad_norm": 0.0002079010009765625,
"learning_rate": 0.0009288274449564772,
"loss": 0.0,
"step": 69500
},
{
"epoch": 3.5842293906810037,
"grad_norm": 0.00022125244140625,
"learning_rate": 0.00092831541218638,
"loss": 0.0,
"step": 70000
},
{
"epoch": 3.6098310291858677,
"grad_norm": 0.00021457672119140625,
"learning_rate": 0.0009278033794162827,
"loss": 0.0,
"step": 70500
},
{
"epoch": 3.6354326676907323,
"grad_norm": 0.000209808349609375,
"learning_rate": 0.0009272913466461854,
"loss": 0.0,
"step": 71000
},
{
"epoch": 3.6610343061955968,
"grad_norm": 0.0002727508544921875,
"learning_rate": 0.0009267793138760881,
"loss": 0.0,
"step": 71500
},
{
"epoch": 3.686635944700461,
"grad_norm": 0.00020313262939453125,
"learning_rate": 0.0009262672811059907,
"loss": 0.0,
"step": 72000
},
{
"epoch": 3.712237583205325,
"grad_norm": 0.0002288818359375,
"learning_rate": 0.0009257552483358935,
"loss": 0.0,
"step": 72500
},
{
"epoch": 3.7378392217101895,
"grad_norm": 0.0010528564453125,
"learning_rate": 0.0009252432155657963,
"loss": 0.0,
"step": 73000
},
{
"epoch": 3.763440860215054,
"grad_norm": 0.00019550323486328125,
"learning_rate": 0.0009247311827956989,
"loss": 0.0,
"step": 73500
},
{
"epoch": 3.789042498719918,
"grad_norm": 0.00021266937255859375,
"learning_rate": 0.0009242191500256016,
"loss": 0.0,
"step": 74000
},
{
"epoch": 3.814644137224782,
"grad_norm": 0.0003509521484375,
"learning_rate": 0.0009237071172555044,
"loss": 0.0,
"step": 74500
},
{
"epoch": 3.8402457757296466,
"grad_norm": 0.00020599365234375,
"learning_rate": 0.0009231950844854071,
"loss": 0.0,
"step": 75000
},
{
"epoch": 3.865847414234511,
"grad_norm": 0.0002002716064453125,
"learning_rate": 0.0009226830517153098,
"loss": 0.0,
"step": 75500
},
{
"epoch": 3.8914490527393752,
"grad_norm": 0.00021076202392578125,
"learning_rate": 0.0009221710189452125,
"loss": 0.0,
"step": 76000
},
{
"epoch": 3.9170506912442398,
"grad_norm": 0.0002918243408203125,
"learning_rate": 0.0009216589861751152,
"loss": 0.0,
"step": 76500
},
{
"epoch": 3.942652329749104,
"grad_norm": 0.0003223419189453125,
"learning_rate": 0.000921146953405018,
"loss": 0.0,
"step": 77000
},
{
"epoch": 3.9682539682539684,
"grad_norm": 0.0003032684326171875,
"learning_rate": 0.0009206349206349207,
"loss": 0.0,
"step": 77500
},
{
"epoch": 3.9938556067588324,
"grad_norm": 0.000698089599609375,
"learning_rate": 0.0009201228878648233,
"loss": 0.0,
"step": 78000
},
{
"epoch": 4.0,
"eval_loss": 3.7267222069203854e-05,
"eval_runtime": 0.546,
"eval_samples_per_second": 1831.41,
"eval_steps_per_second": 3.663,
"step": 78120
},
{
"epoch": 4.0194572452636965,
"grad_norm": 0.0002651214599609375,
"learning_rate": 0.0009196108550947261,
"loss": 0.0,
"step": 78500
},
{
"epoch": 4.0450588837685615,
"grad_norm": 0.00020885467529296875,
"learning_rate": 0.0009190988223246288,
"loss": 0.0,
"step": 79000
},
{
"epoch": 4.070660522273426,
"grad_norm": 0.00019741058349609375,
"learning_rate": 0.0009185867895545314,
"loss": 0.0,
"step": 79500
},
{
"epoch": 4.09626216077829,
"grad_norm": 0.00021839141845703125,
"learning_rate": 0.0009180747567844342,
"loss": 0.0,
"step": 80000
},
{
"epoch": 4.121863799283154,
"grad_norm": 0.00020313262939453125,
"learning_rate": 0.0009175627240143369,
"loss": 0.0,
"step": 80500
},
{
"epoch": 4.147465437788019,
"grad_norm": 0.00023555755615234375,
"learning_rate": 0.0009170506912442397,
"loss": 0.0,
"step": 81000
},
{
"epoch": 4.173067076292883,
"grad_norm": 0.0003204345703125,
"learning_rate": 0.0009165386584741425,
"loss": 0.0,
"step": 81500
},
{
"epoch": 4.198668714797747,
"grad_norm": 0.00022411346435546875,
"learning_rate": 0.000916026625704045,
"loss": 0.0,
"step": 82000
},
{
"epoch": 4.224270353302611,
"grad_norm": 0.0004634857177734375,
"learning_rate": 0.0009155145929339478,
"loss": 0.0,
"step": 82500
},
{
"epoch": 4.249871991807476,
"grad_norm": 0.00023651123046875,
"learning_rate": 0.0009150025601638506,
"loss": 0.0,
"step": 83000
},
{
"epoch": 4.27547363031234,
"grad_norm": 0.000202178955078125,
"learning_rate": 0.0009144905273937532,
"loss": 0.0,
"step": 83500
},
{
"epoch": 4.301075268817204,
"grad_norm": 0.0003528594970703125,
"learning_rate": 0.0009139784946236559,
"loss": 0.0,
"step": 84000
},
{
"epoch": 4.326676907322069,
"grad_norm": 0.0003204345703125,
"learning_rate": 0.0009134664618535587,
"loss": 0.0,
"step": 84500
},
{
"epoch": 4.352278545826933,
"grad_norm": 0.000598907470703125,
"learning_rate": 0.0009129544290834614,
"loss": 0.0,
"step": 85000
},
{
"epoch": 4.377880184331797,
"grad_norm": 0.000286102294921875,
"learning_rate": 0.0009124423963133641,
"loss": 0.0,
"step": 85500
},
{
"epoch": 4.403481822836661,
"grad_norm": 0.00021839141845703125,
"learning_rate": 0.0009119303635432667,
"loss": 0.0,
"step": 86000
},
{
"epoch": 4.429083461341526,
"grad_norm": 0.000225067138671875,
"learning_rate": 0.0009114183307731695,
"loss": 0.0,
"step": 86500
},
{
"epoch": 4.45468509984639,
"grad_norm": 0.00141143798828125,
"learning_rate": 0.0009109062980030723,
"loss": 0.0,
"step": 87000
},
{
"epoch": 4.480286738351254,
"grad_norm": 0.00022602081298828125,
"learning_rate": 0.0009103942652329749,
"loss": 0.0,
"step": 87500
},
{
"epoch": 4.505888376856118,
"grad_norm": 0.000335693359375,
"learning_rate": 0.0009098822324628776,
"loss": 0.0,
"step": 88000
},
{
"epoch": 4.531490015360983,
"grad_norm": 0.00019073486328125,
"learning_rate": 0.0009093701996927804,
"loss": 0.0,
"step": 88500
},
{
"epoch": 4.5570916538658475,
"grad_norm": 0.00019931793212890625,
"learning_rate": 0.000908858166922683,
"loss": 0.0,
"step": 89000
},
{
"epoch": 4.5826932923707115,
"grad_norm": 0.00021839141845703125,
"learning_rate": 0.0009083461341525858,
"loss": 0.0,
"step": 89500
},
{
"epoch": 4.6082949308755765,
"grad_norm": 0.001983642578125,
"learning_rate": 0.0009078341013824885,
"loss": 0.0,
"step": 90000
},
{
"epoch": 4.633896569380441,
"grad_norm": 0.00020503997802734375,
"learning_rate": 0.0009073220686123912,
"loss": 0.0,
"step": 90500
},
{
"epoch": 4.659498207885305,
"grad_norm": 0.0002689361572265625,
"learning_rate": 0.000906810035842294,
"loss": 0.0,
"step": 91000
},
{
"epoch": 4.685099846390169,
"grad_norm": 0.0002307891845703125,
"learning_rate": 0.0009062980030721967,
"loss": 0.0,
"step": 91500
},
{
"epoch": 4.710701484895033,
"grad_norm": 0.00020885467529296875,
"learning_rate": 0.0009057859703020993,
"loss": 0.0,
"step": 92000
},
{
"epoch": 4.736303123399898,
"grad_norm": 0.0002536773681640625,
"learning_rate": 0.0009052739375320021,
"loss": 0.0,
"step": 92500
},
{
"epoch": 4.761904761904762,
"grad_norm": 0.0002040863037109375,
"learning_rate": 0.0009047619047619047,
"loss": 0.0,
"step": 93000
},
{
"epoch": 4.787506400409626,
"grad_norm": 0.00174713134765625,
"learning_rate": 0.0009042498719918075,
"loss": 0.0,
"step": 93500
},
{
"epoch": 4.813108038914491,
"grad_norm": 0.00020313262939453125,
"learning_rate": 0.0009037378392217102,
"loss": 0.0,
"step": 94000
},
{
"epoch": 4.838709677419355,
"grad_norm": 0.0001926422119140625,
"learning_rate": 0.0009032258064516129,
"loss": 0.0,
"step": 94500
},
{
"epoch": 4.864311315924219,
"grad_norm": 0.000186920166015625,
"learning_rate": 0.0009027137736815156,
"loss": 0.0,
"step": 95000
},
{
"epoch": 4.889912954429083,
"grad_norm": 0.00019359588623046875,
"learning_rate": 0.0009022017409114184,
"loss": 0.0,
"step": 95500
},
{
"epoch": 4.915514592933948,
"grad_norm": 0.00018310546875,
"learning_rate": 0.000901689708141321,
"loss": 0.0,
"step": 96000
},
{
"epoch": 4.941116231438812,
"grad_norm": 0.000579833984375,
"learning_rate": 0.0009011776753712238,
"loss": 0.0,
"step": 96500
},
{
"epoch": 4.966717869943676,
"grad_norm": 0.00017833709716796875,
"learning_rate": 0.0009006656426011265,
"loss": 0.0,
"step": 97000
},
{
"epoch": 4.99231950844854,
"grad_norm": 0.000507354736328125,
"learning_rate": 0.0009001536098310292,
"loss": 0.0,
"step": 97500
},
{
"epoch": 5.0,
"eval_loss": 3.1982614018488675e-05,
"eval_runtime": 0.5464,
"eval_samples_per_second": 1830.037,
"eval_steps_per_second": 3.66,
"step": 97650
},
{
"epoch": 5.017921146953405,
"grad_norm": 0.00020313262939453125,
"learning_rate": 0.000899641577060932,
"loss": 0.0,
"step": 98000
},
{
"epoch": 5.043522785458269,
"grad_norm": 0.0002651214599609375,
"learning_rate": 0.0008991295442908345,
"loss": 0.0,
"step": 98500
},
{
"epoch": 5.0691244239631335,
"grad_norm": 0.000335693359375,
"learning_rate": 0.0008986175115207373,
"loss": 0.0,
"step": 99000
},
{
"epoch": 5.0947260624679975,
"grad_norm": 0.000186920166015625,
"learning_rate": 0.0008981054787506401,
"loss": 0.0,
"step": 99500
},
{
"epoch": 5.1203277009728625,
"grad_norm": 0.0001850128173828125,
"learning_rate": 0.0008975934459805428,
"loss": 0.0,
"step": 100000
},
{
"epoch": 5.145929339477727,
"grad_norm": 0.00022411346435546875,
"learning_rate": 0.0008970814132104454,
"loss": 0.0,
"step": 100500
},
{
"epoch": 5.171530977982591,
"grad_norm": 0.0002002716064453125,
"learning_rate": 0.0008965693804403482,
"loss": 0.0,
"step": 101000
},
{
"epoch": 5.197132616487456,
"grad_norm": 0.00019550323486328125,
"learning_rate": 0.0008960573476702509,
"loss": 0.0,
"step": 101500
},
{
"epoch": 5.22273425499232,
"grad_norm": 0.00019931793212890625,
"learning_rate": 0.0008955453149001537,
"loss": 0.0,
"step": 102000
},
{
"epoch": 5.248335893497184,
"grad_norm": 0.0002918243408203125,
"learning_rate": 0.0008950332821300563,
"loss": 0.0,
"step": 102500
},
{
"epoch": 5.273937532002048,
"grad_norm": 0.000217437744140625,
"learning_rate": 0.000894521249359959,
"loss": 0.0,
"step": 103000
},
{
"epoch": 5.299539170506913,
"grad_norm": 0.00018405914306640625,
"learning_rate": 0.0008940092165898618,
"loss": 0.0,
"step": 103500
},
{
"epoch": 5.325140809011777,
"grad_norm": 0.0009918212890625,
"learning_rate": 0.0008934971838197646,
"loss": 0.0,
"step": 104000
},
{
"epoch": 5.350742447516641,
"grad_norm": 0.00086212158203125,
"learning_rate": 0.0008929851510496671,
"loss": 0.0,
"step": 104500
},
{
"epoch": 5.376344086021505,
"grad_norm": 0.00020122528076171875,
"learning_rate": 0.0008924731182795699,
"loss": 0.0,
"step": 105000
},
{
"epoch": 5.40194572452637,
"grad_norm": 0.00019168853759765625,
"learning_rate": 0.0008919610855094726,
"loss": 0.0,
"step": 105500
},
{
"epoch": 5.427547363031234,
"grad_norm": 0.0001964569091796875,
"learning_rate": 0.0008914490527393754,
"loss": 0.0,
"step": 106000
},
{
"epoch": 5.453149001536098,
"grad_norm": 0.0001811981201171875,
"learning_rate": 0.000890937019969278,
"loss": 0.0,
"step": 106500
},
{
"epoch": 5.478750640040962,
"grad_norm": 0.000179290771484375,
"learning_rate": 0.0008904249871991807,
"loss": 0.0,
"step": 107000
},
{
"epoch": 5.504352278545827,
"grad_norm": 0.0001926422119140625,
"learning_rate": 0.0008899129544290835,
"loss": 0.0,
"step": 107500
},
{
"epoch": 5.529953917050691,
"grad_norm": 0.00018310546875,
"learning_rate": 0.0008894009216589863,
"loss": 0.0,
"step": 108000
},
{
"epoch": 5.555555555555555,
"grad_norm": 0.00018596649169921875,
"learning_rate": 0.0008888888888888888,
"loss": 0.0,
"step": 108500
},
{
"epoch": 5.58115719406042,
"grad_norm": 0.0002689361572265625,
"learning_rate": 0.0008883768561187916,
"loss": 0.0,
"step": 109000
},
{
"epoch": 5.606758832565284,
"grad_norm": 0.00018215179443359375,
"learning_rate": 0.0008878648233486944,
"loss": 0.0,
"step": 109500
},
{
"epoch": 5.6323604710701485,
"grad_norm": 0.0002613067626953125,
"learning_rate": 0.000887352790578597,
"loss": 0.0,
"step": 110000
},
{
"epoch": 5.6579621095750126,
"grad_norm": 0.0002613067626953125,
"learning_rate": 0.0008868407578084997,
"loss": 0.0,
"step": 110500
},
{
"epoch": 5.683563748079877,
"grad_norm": 0.0001926422119140625,
"learning_rate": 0.0008863287250384025,
"loss": 0.0,
"step": 111000
},
{
"epoch": 5.709165386584742,
"grad_norm": 0.00018024444580078125,
"learning_rate": 0.0008858166922683052,
"loss": 0.0,
"step": 111500
},
{
"epoch": 5.734767025089606,
"grad_norm": 0.000186920166015625,
"learning_rate": 0.000885304659498208,
"loss": 0.0,
"step": 112000
},
{
"epoch": 5.76036866359447,
"grad_norm": 0.00018024444580078125,
"learning_rate": 0.0008847926267281106,
"loss": 0.0,
"step": 112500
},
{
"epoch": 5.785970302099335,
"grad_norm": 0.000347137451171875,
"learning_rate": 0.0008842805939580133,
"loss": 0.0,
"step": 113000
},
{
"epoch": 5.811571940604199,
"grad_norm": 0.00023174285888671875,
"learning_rate": 0.0008837685611879161,
"loss": 0.0,
"step": 113500
},
{
"epoch": 5.837173579109063,
"grad_norm": 0.000392913818359375,
"learning_rate": 0.0008832565284178187,
"loss": 0.0,
"step": 114000
},
{
"epoch": 5.862775217613927,
"grad_norm": 0.0003032684326171875,
"learning_rate": 0.0008827444956477215,
"loss": 0.0,
"step": 114500
},
{
"epoch": 5.888376856118792,
"grad_norm": 0.0001964569091796875,
"learning_rate": 0.0008822324628776242,
"loss": 0.0,
"step": 115000
},
{
"epoch": 5.913978494623656,
"grad_norm": 0.00020122528076171875,
"learning_rate": 0.0008817204301075269,
"loss": 0.0,
"step": 115500
},
{
"epoch": 5.93958013312852,
"grad_norm": 0.00019550323486328125,
"learning_rate": 0.0008812083973374296,
"loss": 0.0,
"step": 116000
},
{
"epoch": 5.965181771633384,
"grad_norm": 0.000179290771484375,
"learning_rate": 0.0008806963645673324,
"loss": 0.0,
"step": 116500
},
{
"epoch": 5.990783410138249,
"grad_norm": 0.00018024444580078125,
"learning_rate": 0.000880184331797235,
"loss": 0.0,
"step": 117000
},
{
"epoch": 6.0,
"eval_loss": 3.100566391367465e-05,
"eval_runtime": 0.5504,
"eval_samples_per_second": 1816.722,
"eval_steps_per_second": 3.633,
"step": 117180
},
{
"epoch": 6.016385048643113,
"grad_norm": 0.0001983642578125,
"learning_rate": 0.0008796722990271378,
"loss": 0.0,
"step": 117500
},
{
"epoch": 6.041986687147977,
"grad_norm": 0.00018978118896484375,
"learning_rate": 0.0008791602662570405,
"loss": 0.0,
"step": 118000
},
{
"epoch": 6.067588325652842,
"grad_norm": 0.00017833709716796875,
"learning_rate": 0.0008786482334869432,
"loss": 0.0,
"step": 118500
},
{
"epoch": 6.093189964157706,
"grad_norm": 0.00019168853759765625,
"learning_rate": 0.0008781362007168459,
"loss": 0.0,
"step": 119000
},
{
"epoch": 6.11879160266257,
"grad_norm": 0.000209808349609375,
"learning_rate": 0.0008776241679467485,
"loss": 0.0,
"step": 119500
},
{
"epoch": 6.1443932411674345,
"grad_norm": 0.0001964569091796875,
"learning_rate": 0.0008771121351766513,
"loss": 0.0,
"step": 120000
},
{
"epoch": 6.169994879672299,
"grad_norm": 0.0002002716064453125,
"learning_rate": 0.0008766001024065541,
"loss": 0.0,
"step": 120500
},
{
"epoch": 6.1955965181771635,
"grad_norm": 0.0001773834228515625,
"learning_rate": 0.0008760880696364567,
"loss": 0.0,
"step": 121000
},
{
"epoch": 6.221198156682028,
"grad_norm": 0.0001773834228515625,
"learning_rate": 0.0008755760368663594,
"loss": 0.0,
"step": 121500
},
{
"epoch": 6.246799795186892,
"grad_norm": 0.0002040863037109375,
"learning_rate": 0.0008750640040962622,
"loss": 0.0,
"step": 122000
},
{
"epoch": 6.272401433691757,
"grad_norm": 0.0002498626708984375,
"learning_rate": 0.0008745519713261649,
"loss": 0.0,
"step": 122500
},
{
"epoch": 6.298003072196621,
"grad_norm": 0.000179290771484375,
"learning_rate": 0.0008740399385560676,
"loss": 0.0,
"step": 123000
},
{
"epoch": 6.323604710701485,
"grad_norm": 0.00018405914306640625,
"learning_rate": 0.0008735279057859703,
"loss": 0.0,
"step": 123500
},
{
"epoch": 6.349206349206349,
"grad_norm": 0.00017452239990234375,
"learning_rate": 0.000873015873015873,
"loss": 0.0,
"step": 124000
},
{
"epoch": 6.374807987711214,
"grad_norm": 0.00018024444580078125,
"learning_rate": 0.0008725038402457758,
"loss": 0.0,
"step": 124500
},
{
"epoch": 6.400409626216078,
"grad_norm": 0.00026702880859375,
"learning_rate": 0.0008719918074756785,
"loss": 0.0,
"step": 125000
},
{
"epoch": 6.426011264720942,
"grad_norm": 0.00018978118896484375,
"learning_rate": 0.0008714797747055811,
"loss": 0.0,
"step": 125500
},
{
"epoch": 6.451612903225806,
"grad_norm": 0.00018405914306640625,
"learning_rate": 0.0008709677419354839,
"loss": 0.0,
"step": 126000
},
{
"epoch": 6.477214541730671,
"grad_norm": 0.00018596649169921875,
"learning_rate": 0.0008704557091653866,
"loss": 0.0,
"step": 126500
},
{
"epoch": 6.502816180235535,
"grad_norm": 0.000568389892578125,
"learning_rate": 0.0008699436763952893,
"loss": 0.0,
"step": 127000
},
{
"epoch": 6.528417818740399,
"grad_norm": 0.00060272216796875,
"learning_rate": 0.000869431643625192,
"loss": 0.0,
"step": 127500
},
{
"epoch": 6.554019457245264,
"grad_norm": 0.000179290771484375,
"learning_rate": 0.0008689196108550947,
"loss": 0.0,
"step": 128000
},
{
"epoch": 6.579621095750128,
"grad_norm": 0.000255584716796875,
"learning_rate": 0.0008684075780849975,
"loss": 0.0,
"step": 128500
},
{
"epoch": 6.605222734254992,
"grad_norm": 0.00018978118896484375,
"learning_rate": 0.0008678955453149003,
"loss": 0.0,
"step": 129000
},
{
"epoch": 6.630824372759856,
"grad_norm": 0.00019550323486328125,
"learning_rate": 0.0008673835125448028,
"loss": 0.0,
"step": 129500
},
{
"epoch": 6.65642601126472,
"grad_norm": 0.0004329681396484375,
"learning_rate": 0.0008668714797747056,
"loss": 0.0,
"step": 130000
},
{
"epoch": 6.682027649769585,
"grad_norm": 0.00040435791015625,
"learning_rate": 0.0008663594470046084,
"loss": 0.0,
"step": 130500
},
{
"epoch": 6.7076292882744495,
"grad_norm": 0.000392913818359375,
"learning_rate": 0.000865847414234511,
"loss": 0.0,
"step": 131000
},
{
"epoch": 6.733230926779314,
"grad_norm": 0.0001811981201171875,
"learning_rate": 0.0008653353814644137,
"loss": 0.0,
"step": 131500
},
{
"epoch": 6.7588325652841785,
"grad_norm": 0.00018310546875,
"learning_rate": 0.0008648233486943165,
"loss": 0.0,
"step": 132000
},
{
"epoch": 6.784434203789043,
"grad_norm": 0.00022125244140625,
"learning_rate": 0.0008643113159242192,
"loss": 0.0,
"step": 132500
},
{
"epoch": 6.810035842293907,
"grad_norm": 0.00018310546875,
"learning_rate": 0.000863799283154122,
"loss": 0.0,
"step": 133000
},
{
"epoch": 6.835637480798771,
"grad_norm": 0.00022125244140625,
"learning_rate": 0.0008632872503840245,
"loss": 0.0,
"step": 133500
},
{
"epoch": 6.861239119303636,
"grad_norm": 0.0001926422119140625,
"learning_rate": 0.0008627752176139273,
"loss": 0.0,
"step": 134000
},
{
"epoch": 6.8868407578085,
"grad_norm": 0.00023365020751953125,
"learning_rate": 0.0008622631848438301,
"loss": 0.0,
"step": 134500
},
{
"epoch": 6.912442396313364,
"grad_norm": 0.00018787384033203125,
"learning_rate": 0.0008617511520737327,
"loss": 0.0,
"step": 135000
},
{
"epoch": 6.938044034818228,
"grad_norm": 0.0003833770751953125,
"learning_rate": 0.0008612391193036354,
"loss": 0.0,
"step": 135500
},
{
"epoch": 6.963645673323093,
"grad_norm": 0.0003833770751953125,
"learning_rate": 0.0008607270865335382,
"loss": 0.0,
"step": 136000
},
{
"epoch": 6.989247311827957,
"grad_norm": 0.0004425048828125,
"learning_rate": 0.0008602150537634409,
"loss": 0.0,
"step": 136500
},
{
"epoch": 7.0,
"eval_loss": 3.138924512313679e-05,
"eval_runtime": 0.5584,
"eval_samples_per_second": 1790.97,
"eval_steps_per_second": 3.582,
"step": 136710
},
{
"epoch": 7.014848950332821,
"grad_norm": 0.0001811981201171875,
"learning_rate": 0.0008597030209933436,
"loss": 0.0,
"step": 137000
},
{
"epoch": 7.040450588837686,
"grad_norm": 0.0002536773681640625,
"learning_rate": 0.0008591909882232463,
"loss": 0.0,
"step": 137500
},
{
"epoch": 7.06605222734255,
"grad_norm": 0.0004558563232421875,
"learning_rate": 0.000858678955453149,
"loss": 0.0,
"step": 138000
},
{
"epoch": 7.091653865847414,
"grad_norm": 0.00017833709716796875,
"learning_rate": 0.0008581669226830518,
"loss": 0.0,
"step": 138500
},
{
"epoch": 7.117255504352278,
"grad_norm": 0.0001773834228515625,
"learning_rate": 0.0008576548899129545,
"loss": 0.0,
"step": 139000
},
{
"epoch": 7.142857142857143,
"grad_norm": 0.0004482269287109375,
"learning_rate": 0.0008571428571428571,
"loss": 0.0,
"step": 139500
},
{
"epoch": 7.168458781362007,
"grad_norm": 0.00019359588623046875,
"learning_rate": 0.0008566308243727599,
"loss": 0.0,
"step": 140000
},
{
"epoch": 7.194060419866871,
"grad_norm": 0.0001811981201171875,
"learning_rate": 0.0008561187916026625,
"loss": 0.0,
"step": 140500
},
{
"epoch": 7.2196620583717355,
"grad_norm": 0.000576019287109375,
"learning_rate": 0.0008556067588325653,
"loss": 0.0,
"step": 141000
},
{
"epoch": 7.2452636968766,
"grad_norm": 0.00018787384033203125,
"learning_rate": 0.000855094726062468,
"loss": 0.0,
"step": 141500
},
{
"epoch": 7.2708653353814645,
"grad_norm": 0.0002155303955078125,
"learning_rate": 0.0008545826932923707,
"loss": 0.0,
"step": 142000
},
{
"epoch": 7.296466973886329,
"grad_norm": 0.00018215179443359375,
"learning_rate": 0.0008540706605222734,
"loss": 0.0,
"step": 142500
},
{
"epoch": 7.322068612391193,
"grad_norm": 0.0002346038818359375,
"learning_rate": 0.0008535586277521762,
"loss": 0.0,
"step": 143000
},
{
"epoch": 7.347670250896058,
"grad_norm": 0.00023937225341796875,
"learning_rate": 0.0008530465949820788,
"loss": 0.0,
"step": 143500
},
{
"epoch": 7.373271889400922,
"grad_norm": 0.0002460479736328125,
"learning_rate": 0.0008525345622119816,
"loss": 0.0,
"step": 144000
},
{
"epoch": 7.398873527905786,
"grad_norm": 0.0002613067626953125,
"learning_rate": 0.0008520225294418843,
"loss": 0.0,
"step": 144500
},
{
"epoch": 7.42447516641065,
"grad_norm": 0.00017642974853515625,
"learning_rate": 0.000851510496671787,
"loss": 0.0,
"step": 145000
},
{
"epoch": 7.450076804915515,
"grad_norm": 0.000232696533203125,
"learning_rate": 0.0008509984639016898,
"loss": 0.0,
"step": 145500
},
{
"epoch": 7.475678443420379,
"grad_norm": 0.0001697540283203125,
"learning_rate": 0.0008504864311315924,
"loss": 0.0,
"step": 146000
},
{
"epoch": 7.501280081925243,
"grad_norm": 0.00017833709716796875,
"learning_rate": 0.0008499743983614951,
"loss": 0.0,
"step": 146500
},
{
"epoch": 7.526881720430108,
"grad_norm": 0.000377655029296875,
"learning_rate": 0.0008494623655913979,
"loss": 0.0,
"step": 147000
},
{
"epoch": 7.552483358934972,
"grad_norm": 0.000194549560546875,
"learning_rate": 0.0008489503328213006,
"loss": 0.0,
"step": 147500
},
{
"epoch": 7.578084997439836,
"grad_norm": 0.0004825592041015625,
"learning_rate": 0.0008484383000512033,
"loss": 0.0,
"step": 148000
},
{
"epoch": 7.6036866359447,
"grad_norm": 0.0001773834228515625,
"learning_rate": 0.000847926267281106,
"loss": 0.0,
"step": 148500
},
{
"epoch": 7.629288274449565,
"grad_norm": 0.0001926422119140625,
"learning_rate": 0.0008474142345110087,
"loss": 0.0,
"step": 149000
},
{
"epoch": 7.654889912954429,
"grad_norm": 0.00018215179443359375,
"learning_rate": 0.0008469022017409115,
"loss": 0.0,
"step": 149500
},
{
"epoch": 7.680491551459293,
"grad_norm": 0.00019073486328125,
"learning_rate": 0.0008463901689708142,
"loss": 0.0,
"step": 150000
},
{
"epoch": 7.706093189964157,
"grad_norm": 0.00017833709716796875,
"learning_rate": 0.0008458781362007168,
"loss": 0.0,
"step": 150500
},
{
"epoch": 7.731694828469022,
"grad_norm": 0.0001926422119140625,
"learning_rate": 0.0008453661034306196,
"loss": 0.0,
"step": 151000
},
{
"epoch": 7.757296466973886,
"grad_norm": 0.000274658203125,
"learning_rate": 0.0008448540706605224,
"loss": 0.0,
"step": 151500
},
{
"epoch": 7.7828981054787505,
"grad_norm": 0.00017452239990234375,
"learning_rate": 0.0008443420378904249,
"loss": 0.0,
"step": 152000
},
{
"epoch": 7.808499743983615,
"grad_norm": 0.00017833709716796875,
"learning_rate": 0.0008438300051203277,
"loss": 0.0,
"step": 152500
},
{
"epoch": 7.8341013824884795,
"grad_norm": 0.000301361083984375,
"learning_rate": 0.0008433179723502304,
"loss": 0.0,
"step": 153000
},
{
"epoch": 7.859703020993344,
"grad_norm": 0.0001697540283203125,
"learning_rate": 0.0008428059395801332,
"loss": 0.0,
"step": 153500
},
{
"epoch": 7.885304659498208,
"grad_norm": 0.00020313262939453125,
"learning_rate": 0.0008422939068100358,
"loss": 0.0,
"step": 154000
},
{
"epoch": 7.910906298003072,
"grad_norm": 0.0004425048828125,
"learning_rate": 0.0008417818740399385,
"loss": 0.0,
"step": 154500
},
{
"epoch": 7.936507936507937,
"grad_norm": 0.00017452239990234375,
"learning_rate": 0.0008412698412698413,
"loss": 0.0,
"step": 155000
},
{
"epoch": 7.962109575012801,
"grad_norm": 0.0002346038818359375,
"learning_rate": 0.0008407578084997441,
"loss": 0.0,
"step": 155500
},
{
"epoch": 7.987711213517665,
"grad_norm": 0.0001773834228515625,
"learning_rate": 0.0008402457757296466,
"loss": 0.0,
"step": 156000
},
{
"epoch": 8.0,
"eval_loss": 2.9280490707606077e-05,
"eval_runtime": 0.5361,
"eval_samples_per_second": 1865.365,
"eval_steps_per_second": 3.731,
"step": 156240
}
],
"logging_steps": 500,
"max_steps": 976500,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.132510754955264e+18,
"train_batch_size": 512,
"trial_name": null,
"trial_params": null
}