{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 905, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0055248618784530384, "grad_norm": 2.2495654788733823, "learning_rate": 5.4347826086956525e-06, "loss": 0.9695, "step": 5 }, { "epoch": 0.011049723756906077, "grad_norm": 1.9461642490164472, "learning_rate": 1.0869565217391305e-05, "loss": 0.9861, "step": 10 }, { "epoch": 0.016574585635359115, "grad_norm": 1.417204705816258, "learning_rate": 1.630434782608696e-05, "loss": 0.9362, "step": 15 }, { "epoch": 0.022099447513812154, "grad_norm": 1.1341587011781606, "learning_rate": 2.173913043478261e-05, "loss": 0.8601, "step": 20 }, { "epoch": 0.027624309392265192, "grad_norm": 1.0602684449366846, "learning_rate": 2.7173913043478262e-05, "loss": 0.7882, "step": 25 }, { "epoch": 0.03314917127071823, "grad_norm": 0.9548854170467792, "learning_rate": 3.260869565217392e-05, "loss": 0.7866, "step": 30 }, { "epoch": 0.03867403314917127, "grad_norm": 0.8472222008228629, "learning_rate": 3.804347826086957e-05, "loss": 0.7845, "step": 35 }, { "epoch": 0.04419889502762431, "grad_norm": 0.9685960177896414, "learning_rate": 4.347826086956522e-05, "loss": 0.8259, "step": 40 }, { "epoch": 0.049723756906077346, "grad_norm": 1.0119153562267356, "learning_rate": 4.891304347826087e-05, "loss": 0.783, "step": 45 }, { "epoch": 0.055248618784530384, "grad_norm": 1.0082850863037842, "learning_rate": 4.9997592434132936e-05, "loss": 0.8047, "step": 50 }, { "epoch": 0.06077348066298342, "grad_norm": 1.078699321374065, "learning_rate": 4.9987812580832524e-05, "loss": 0.7593, "step": 55 }, { "epoch": 0.06629834254143646, "grad_norm": 0.8486834251840161, "learning_rate": 4.9970513234911335e-05, "loss": 0.7781, "step": 60 }, { "epoch": 0.0718232044198895, "grad_norm": 0.8151943193010388, "learning_rate": 4.99457001809384e-05, "loss": 0.7678, "step": 65 }, { "epoch": 0.07734806629834254, "grad_norm": 0.999095249982963, "learning_rate": 4.991338171592238e-05, "loss": 0.7203, "step": 70 }, { "epoch": 0.08287292817679558, "grad_norm": 0.9570443791252086, "learning_rate": 4.9873568646537276e-05, "loss": 0.8225, "step": 75 }, { "epoch": 0.08839779005524862, "grad_norm": 1.0377587505108075, "learning_rate": 4.9826274285508836e-05, "loss": 0.7606, "step": 80 }, { "epoch": 0.09392265193370165, "grad_norm": 0.8916728595351986, "learning_rate": 4.977151444716302e-05, "loss": 0.7752, "step": 85 }, { "epoch": 0.09944751381215469, "grad_norm": 0.7828895660955932, "learning_rate": 4.970930744213807e-05, "loss": 0.7716, "step": 90 }, { "epoch": 0.10497237569060773, "grad_norm": 0.9255391724115476, "learning_rate": 4.963967407126167e-05, "loss": 0.7904, "step": 95 }, { "epoch": 0.11049723756906077, "grad_norm": 0.8282644001387807, "learning_rate": 4.956263761859566e-05, "loss": 0.7607, "step": 100 }, { "epoch": 0.11602209944751381, "grad_norm": 0.8855019219145566, "learning_rate": 4.947822384365024e-05, "loss": 0.7336, "step": 105 }, { "epoch": 0.12154696132596685, "grad_norm": 1.3326630713526908, "learning_rate": 4.9386460972770485e-05, "loss": 0.756, "step": 110 }, { "epoch": 0.1270718232044199, "grad_norm": 0.7623686644469245, "learning_rate": 4.9287379689697974e-05, "loss": 0.7787, "step": 115 }, { "epoch": 0.13259668508287292, "grad_norm": 0.6744035121841259, "learning_rate": 4.9181013125310804e-05, "loss": 0.7607, "step": 120 }, { "epoch": 0.13812154696132597, "grad_norm": 0.8801426064681885, "learning_rate": 4.9067396846545205e-05, "loss": 0.7879, "step": 125 }, { "epoch": 0.143646408839779, "grad_norm": 0.8104653989188151, "learning_rate": 4.894656884450265e-05, "loss": 0.7172, "step": 130 }, { "epoch": 0.14917127071823205, "grad_norm": 0.8442454603941927, "learning_rate": 4.881856952174638e-05, "loss": 0.8042, "step": 135 }, { "epoch": 0.15469613259668508, "grad_norm": 0.8046048783159924, "learning_rate": 4.868344167879152e-05, "loss": 0.7545, "step": 140 }, { "epoch": 0.16022099447513813, "grad_norm": 0.7342440517610522, "learning_rate": 4.8541230499793475e-05, "loss": 0.7239, "step": 145 }, { "epoch": 0.16574585635359115, "grad_norm": 0.8351674082098574, "learning_rate": 4.839198353743915e-05, "loss": 0.7782, "step": 150 }, { "epoch": 0.1712707182320442, "grad_norm": 0.8568746827943232, "learning_rate": 4.8235750697046296e-05, "loss": 0.7305, "step": 155 }, { "epoch": 0.17679558011049723, "grad_norm": 0.799075952862911, "learning_rate": 4.8072584219876086e-05, "loss": 0.736, "step": 160 }, { "epoch": 0.18232044198895028, "grad_norm": 0.7632219335279382, "learning_rate": 4.790253866566467e-05, "loss": 0.7413, "step": 165 }, { "epoch": 0.1878453038674033, "grad_norm": 0.7143875780259061, "learning_rate": 4.772567089437945e-05, "loss": 0.7112, "step": 170 }, { "epoch": 0.19337016574585636, "grad_norm": 0.757498467869332, "learning_rate": 4.7542040047206165e-05, "loss": 0.79, "step": 175 }, { "epoch": 0.19889502762430938, "grad_norm": 0.7359012444695264, "learning_rate": 4.735170752677327e-05, "loss": 0.7153, "step": 180 }, { "epoch": 0.20441988950276244, "grad_norm": 0.7088663879251825, "learning_rate": 4.715473697662001e-05, "loss": 0.7293, "step": 185 }, { "epoch": 0.20994475138121546, "grad_norm": 0.695608085898617, "learning_rate": 4.69511942599153e-05, "loss": 0.7251, "step": 190 }, { "epoch": 0.2154696132596685, "grad_norm": 0.632627534005913, "learning_rate": 4.6741147437434323e-05, "loss": 0.6947, "step": 195 }, { "epoch": 0.22099447513812154, "grad_norm": 0.704743849198389, "learning_rate": 4.6524666744800305e-05, "loss": 0.7147, "step": 200 }, { "epoch": 0.2265193370165746, "grad_norm": 0.7017435419792435, "learning_rate": 4.630182456899907e-05, "loss": 0.7302, "step": 205 }, { "epoch": 0.23204419889502761, "grad_norm": 0.7021833045223487, "learning_rate": 4.607269542417427e-05, "loss": 0.7304, "step": 210 }, { "epoch": 0.23756906077348067, "grad_norm": 0.7070770033781527, "learning_rate": 4.58373559267112e-05, "loss": 0.7373, "step": 215 }, { "epoch": 0.2430939226519337, "grad_norm": 0.6730958248948499, "learning_rate": 4.559588476961784e-05, "loss": 0.7126, "step": 220 }, { "epoch": 0.24861878453038674, "grad_norm": 0.6997491859738907, "learning_rate": 4.534836269621137e-05, "loss": 0.7402, "step": 225 }, { "epoch": 0.2541436464088398, "grad_norm": 0.5977862788008176, "learning_rate": 4.509487247311917e-05, "loss": 0.7245, "step": 230 }, { "epoch": 0.2596685082872928, "grad_norm": 0.6993869464122184, "learning_rate": 4.483549886260324e-05, "loss": 0.7241, "step": 235 }, { "epoch": 0.26519337016574585, "grad_norm": 0.7030894956399786, "learning_rate": 4.4570328594217356e-05, "loss": 0.7005, "step": 240 }, { "epoch": 0.27071823204419887, "grad_norm": 0.7057927516603153, "learning_rate": 4.429945033580633e-05, "loss": 0.7097, "step": 245 }, { "epoch": 0.27624309392265195, "grad_norm": 0.6391012673945178, "learning_rate": 4.4022954663857244e-05, "loss": 0.7134, "step": 250 }, { "epoch": 0.281767955801105, "grad_norm": 0.6744043202485268, "learning_rate": 4.374093403321233e-05, "loss": 0.7275, "step": 255 }, { "epoch": 0.287292817679558, "grad_norm": 0.7400160690377109, "learning_rate": 4.345348274615395e-05, "loss": 0.7233, "step": 260 }, { "epoch": 0.292817679558011, "grad_norm": 0.739209940661344, "learning_rate": 4.3160696920871605e-05, "loss": 0.7444, "step": 265 }, { "epoch": 0.2983425414364641, "grad_norm": 0.6742845794877194, "learning_rate": 4.2862674459321994e-05, "loss": 0.7316, "step": 270 }, { "epoch": 0.30386740331491713, "grad_norm": 0.6651549735204162, "learning_rate": 4.255951501449234e-05, "loss": 0.7301, "step": 275 }, { "epoch": 0.30939226519337015, "grad_norm": 0.6819675840699815, "learning_rate": 4.225131995707845e-05, "loss": 0.7179, "step": 280 }, { "epoch": 0.3149171270718232, "grad_norm": 0.6416784248554884, "learning_rate": 4.1938192341588265e-05, "loss": 0.729, "step": 285 }, { "epoch": 0.32044198895027626, "grad_norm": 0.727007416919034, "learning_rate": 4.162023687188238e-05, "loss": 0.7368, "step": 290 }, { "epoch": 0.3259668508287293, "grad_norm": 0.6219219552088717, "learning_rate": 4.129755986616315e-05, "loss": 0.7058, "step": 295 }, { "epoch": 0.3314917127071823, "grad_norm": 0.6741150513014229, "learning_rate": 4.097026922142389e-05, "loss": 0.7406, "step": 300 }, { "epoch": 0.3370165745856354, "grad_norm": 0.7472100039399484, "learning_rate": 4.0638474377370155e-05, "loss": 0.7021, "step": 305 }, { "epoch": 0.3425414364640884, "grad_norm": 0.627954564572662, "learning_rate": 4.0302286279825286e-05, "loss": 0.7014, "step": 310 }, { "epoch": 0.34806629834254144, "grad_norm": 0.734789301440309, "learning_rate": 3.996181734363218e-05, "loss": 0.7012, "step": 315 }, { "epoch": 0.35359116022099446, "grad_norm": 0.7242805946396662, "learning_rate": 3.961718141506398e-05, "loss": 0.7191, "step": 320 }, { "epoch": 0.35911602209944754, "grad_norm": 0.6457534652885096, "learning_rate": 3.926849373375604e-05, "loss": 0.7239, "step": 325 }, { "epoch": 0.36464088397790057, "grad_norm": 0.6599551130534848, "learning_rate": 3.891587089417201e-05, "loss": 0.7311, "step": 330 }, { "epoch": 0.3701657458563536, "grad_norm": 0.6375311137388923, "learning_rate": 3.855943080661688e-05, "loss": 0.6725, "step": 335 }, { "epoch": 0.3756906077348066, "grad_norm": 0.694826014055163, "learning_rate": 3.819929265781007e-05, "loss": 0.7053, "step": 340 }, { "epoch": 0.3812154696132597, "grad_norm": 0.7063030149524828, "learning_rate": 3.783557687103164e-05, "loss": 0.688, "step": 345 }, { "epoch": 0.3867403314917127, "grad_norm": 0.7229962452752301, "learning_rate": 3.7468405065855066e-05, "loss": 0.7466, "step": 350 }, { "epoch": 0.39226519337016574, "grad_norm": 0.6282377813832024, "learning_rate": 3.7097900017480025e-05, "loss": 0.6712, "step": 355 }, { "epoch": 0.39779005524861877, "grad_norm": 0.6936933940060245, "learning_rate": 3.672418561567867e-05, "loss": 0.705, "step": 360 }, { "epoch": 0.40331491712707185, "grad_norm": 0.6543353136837871, "learning_rate": 3.634738682336934e-05, "loss": 0.7115, "step": 365 }, { "epoch": 0.4088397790055249, "grad_norm": 0.5872861304195733, "learning_rate": 3.596762963483127e-05, "loss": 0.7152, "step": 370 }, { "epoch": 0.4143646408839779, "grad_norm": 0.6287395366045305, "learning_rate": 3.5585041033574615e-05, "loss": 0.7396, "step": 375 }, { "epoch": 0.4198895027624309, "grad_norm": 0.6173655969252787, "learning_rate": 3.5199748949879544e-05, "loss": 0.6718, "step": 380 }, { "epoch": 0.425414364640884, "grad_norm": 0.6115484539768465, "learning_rate": 3.4811882218018836e-05, "loss": 0.7041, "step": 385 }, { "epoch": 0.430939226519337, "grad_norm": 0.6592054769519977, "learning_rate": 3.442157053317817e-05, "loss": 0.6905, "step": 390 }, { "epoch": 0.43646408839779005, "grad_norm": 0.64695169140918, "learning_rate": 3.402894440808852e-05, "loss": 0.7249, "step": 395 }, { "epoch": 0.4419889502762431, "grad_norm": 0.6660351092999496, "learning_rate": 3.363413512938527e-05, "loss": 0.692, "step": 400 }, { "epoch": 0.44751381215469616, "grad_norm": 0.6625512350903074, "learning_rate": 3.32372747137084e-05, "loss": 0.7009, "step": 405 }, { "epoch": 0.4530386740331492, "grad_norm": 0.7130204785542628, "learning_rate": 3.2838495863558716e-05, "loss": 0.7196, "step": 410 }, { "epoch": 0.4585635359116022, "grad_norm": 0.7209339805921852, "learning_rate": 3.243793192292468e-05, "loss": 0.6964, "step": 415 }, { "epoch": 0.46408839779005523, "grad_norm": 0.6556622599922761, "learning_rate": 3.2035716832694705e-05, "loss": 0.6912, "step": 420 }, { "epoch": 0.4696132596685083, "grad_norm": 0.6372838951795533, "learning_rate": 3.163198508586993e-05, "loss": 0.7146, "step": 425 }, { "epoch": 0.47513812154696133, "grad_norm": 0.6188245445254615, "learning_rate": 3.122687168259233e-05, "loss": 0.6981, "step": 430 }, { "epoch": 0.48066298342541436, "grad_norm": 0.6253825885926813, "learning_rate": 3.082051208500324e-05, "loss": 0.7083, "step": 435 }, { "epoch": 0.4861878453038674, "grad_norm": 0.6385317812674182, "learning_rate": 3.0413042171947475e-05, "loss": 0.7183, "step": 440 }, { "epoch": 0.49171270718232046, "grad_norm": 0.5545979834442434, "learning_rate": 3.000459819353798e-05, "loss": 0.6757, "step": 445 }, { "epoch": 0.4972375690607735, "grad_norm": 0.6457896775039127, "learning_rate": 2.9595316725596485e-05, "loss": 0.7126, "step": 450 }, { "epoch": 0.5027624309392266, "grad_norm": 0.6026206696104952, "learning_rate": 2.918533462398509e-05, "loss": 0.6932, "step": 455 }, { "epoch": 0.5082872928176796, "grad_norm": 0.705985656607051, "learning_rate": 2.8774788978844374e-05, "loss": 0.7165, "step": 460 }, { "epoch": 0.5138121546961326, "grad_norm": 0.6822033180016115, "learning_rate": 2.8363817068753025e-05, "loss": 0.7232, "step": 465 }, { "epoch": 0.5193370165745856, "grad_norm": 0.6026435106196351, "learning_rate": 2.795255631482457e-05, "loss": 0.6923, "step": 470 }, { "epoch": 0.5248618784530387, "grad_norm": 0.6136085628375205, "learning_rate": 2.7541144234756327e-05, "loss": 0.6988, "step": 475 }, { "epoch": 0.5303867403314917, "grad_norm": 0.595949445957797, "learning_rate": 2.7129718396846216e-05, "loss": 0.6793, "step": 480 }, { "epoch": 0.5359116022099447, "grad_norm": 0.6198967125875174, "learning_rate": 2.671841637399249e-05, "loss": 0.6711, "step": 485 }, { "epoch": 0.5414364640883977, "grad_norm": 0.6448908365718997, "learning_rate": 2.6307375697692016e-05, "loss": 0.6963, "step": 490 }, { "epoch": 0.5469613259668509, "grad_norm": 0.5952230142125985, "learning_rate": 2.5896733812052403e-05, "loss": 0.6595, "step": 495 }, { "epoch": 0.5524861878453039, "grad_norm": 0.5692444802287681, "learning_rate": 2.5486628027833337e-05, "loss": 0.705, "step": 500 }, { "epoch": 0.5580110497237569, "grad_norm": 0.6583799728791444, "learning_rate": 2.5077195476532456e-05, "loss": 0.7108, "step": 505 }, { "epoch": 0.56353591160221, "grad_norm": 0.6118674538428357, "learning_rate": 2.4668573064531275e-05, "loss": 0.6779, "step": 510 }, { "epoch": 0.569060773480663, "grad_norm": 0.6865305146078687, "learning_rate": 2.4260897427316255e-05, "loss": 0.7115, "step": 515 }, { "epoch": 0.574585635359116, "grad_norm": 0.5634331597251886, "learning_rate": 2.3854304883790573e-05, "loss": 0.6763, "step": 520 }, { "epoch": 0.580110497237569, "grad_norm": 0.603347124007975, "learning_rate": 2.344893139069166e-05, "loss": 0.6728, "step": 525 }, { "epoch": 0.585635359116022, "grad_norm": 0.5780548338990102, "learning_rate": 2.304491249712992e-05, "loss": 0.6857, "step": 530 }, { "epoch": 0.5911602209944752, "grad_norm": 0.5838799744020187, "learning_rate": 2.2642383299263674e-05, "loss": 0.6961, "step": 535 }, { "epoch": 0.5966850828729282, "grad_norm": 0.610184172054768, "learning_rate": 2.224147839512562e-05, "loss": 0.7036, "step": 540 }, { "epoch": 0.6022099447513812, "grad_norm": 0.6249192777191065, "learning_rate": 2.184233183961582e-05, "loss": 0.6486, "step": 545 }, { "epoch": 0.6077348066298343, "grad_norm": 0.637226405901165, "learning_rate": 2.1445077099676346e-05, "loss": 0.6819, "step": 550 }, { "epoch": 0.6132596685082873, "grad_norm": 0.6326954384056243, "learning_rate": 2.1049847009662455e-05, "loss": 0.6957, "step": 555 }, { "epoch": 0.6187845303867403, "grad_norm": 0.621926214738313, "learning_rate": 2.065677372692536e-05, "loss": 0.6761, "step": 560 }, { "epoch": 0.6243093922651933, "grad_norm": 0.5897101308180229, "learning_rate": 2.0265988687621363e-05, "loss": 0.6872, "step": 565 }, { "epoch": 0.6298342541436464, "grad_norm": 0.5718564263286896, "learning_rate": 1.9877622562762088e-05, "loss": 0.6663, "step": 570 }, { "epoch": 0.6353591160220995, "grad_norm": 0.5822438329318962, "learning_rate": 1.949180521452064e-05, "loss": 0.6952, "step": 575 }, { "epoch": 0.6408839779005525, "grad_norm": 0.55538735160568, "learning_rate": 1.9108665652808177e-05, "loss": 0.6389, "step": 580 }, { "epoch": 0.6464088397790055, "grad_norm": 0.624144335344885, "learning_rate": 1.8728331992135457e-05, "loss": 0.6916, "step": 585 }, { "epoch": 0.6519337016574586, "grad_norm": 0.5931319057188856, "learning_rate": 1.835093140877383e-05, "loss": 0.6737, "step": 590 }, { "epoch": 0.6574585635359116, "grad_norm": 0.6049587715850621, "learning_rate": 1.7976590098229932e-05, "loss": 0.6985, "step": 595 }, { "epoch": 0.6629834254143646, "grad_norm": 0.5880589654245556, "learning_rate": 1.7605433233048325e-05, "loss": 0.6615, "step": 600 }, { "epoch": 0.6685082872928176, "grad_norm": 0.5512950866249651, "learning_rate": 1.7237584920956195e-05, "loss": 0.6676, "step": 605 }, { "epoch": 0.6740331491712708, "grad_norm": 0.6279974406839042, "learning_rate": 1.6873168163364126e-05, "loss": 0.6792, "step": 610 }, { "epoch": 0.6795580110497238, "grad_norm": 0.6298057725245987, "learning_rate": 1.651230481423677e-05, "loss": 0.6611, "step": 615 }, { "epoch": 0.6850828729281768, "grad_norm": 0.5801623465755368, "learning_rate": 1.615511553934726e-05, "loss": 0.6862, "step": 620 }, { "epoch": 0.6906077348066298, "grad_norm": 0.5791150492145345, "learning_rate": 1.5801719775928858e-05, "loss": 0.6777, "step": 625 }, { "epoch": 0.6961325966850829, "grad_norm": 0.5365861610338757, "learning_rate": 1.545223569273744e-05, "loss": 0.6619, "step": 630 }, { "epoch": 0.7016574585635359, "grad_norm": 0.5592674957340275, "learning_rate": 1.5106780150538164e-05, "loss": 0.6725, "step": 635 }, { "epoch": 0.7071823204419889, "grad_norm": 0.5874282755671061, "learning_rate": 1.4765468663029427e-05, "loss": 0.705, "step": 640 }, { "epoch": 0.712707182320442, "grad_norm": 0.5736802681033443, "learning_rate": 1.4428415358217348e-05, "loss": 0.6721, "step": 645 }, { "epoch": 0.7182320441988951, "grad_norm": 0.5911352248795241, "learning_rate": 1.409573294025354e-05, "loss": 0.7002, "step": 650 }, { "epoch": 0.7237569060773481, "grad_norm": 0.5731565558773776, "learning_rate": 1.3767532651748973e-05, "loss": 0.6584, "step": 655 }, { "epoch": 0.7292817679558011, "grad_norm": 0.5567247088749864, "learning_rate": 1.3443924236576643e-05, "loss": 0.7039, "step": 660 }, { "epoch": 0.7348066298342542, "grad_norm": 0.5760525309492333, "learning_rate": 1.3125015903175292e-05, "loss": 0.6675, "step": 665 }, { "epoch": 0.7403314917127072, "grad_norm": 0.5707270289177868, "learning_rate": 1.2810914288366571e-05, "loss": 0.6866, "step": 670 }, { "epoch": 0.7458563535911602, "grad_norm": 0.6426256664859457, "learning_rate": 1.2501724421697753e-05, "loss": 0.6701, "step": 675 }, { "epoch": 0.7513812154696132, "grad_norm": 0.5777718022675424, "learning_rate": 1.2197549690321886e-05, "loss": 0.7014, "step": 680 }, { "epoch": 0.7569060773480663, "grad_norm": 0.5477827409060126, "learning_rate": 1.1898491804427097e-05, "loss": 0.6319, "step": 685 }, { "epoch": 0.7624309392265194, "grad_norm": 0.674023091743246, "learning_rate": 1.1604650763226643e-05, "loss": 0.6525, "step": 690 }, { "epoch": 0.7679558011049724, "grad_norm": 0.607590999103845, "learning_rate": 1.131612482152113e-05, "loss": 0.667, "step": 695 }, { "epoch": 0.7734806629834254, "grad_norm": 0.6018921128449826, "learning_rate": 1.1033010456843956e-05, "loss": 0.6689, "step": 700 }, { "epoch": 0.7790055248618785, "grad_norm": 0.6519694089962578, "learning_rate": 1.075540233720112e-05, "loss": 0.7203, "step": 705 }, { "epoch": 0.7845303867403315, "grad_norm": 0.5735806870840707, "learning_rate": 1.048339328941601e-05, "loss": 0.6472, "step": 710 }, { "epoch": 0.7900552486187845, "grad_norm": 0.6397132166838564, "learning_rate": 1.0217074268089937e-05, "loss": 0.6769, "step": 715 }, { "epoch": 0.7955801104972375, "grad_norm": 0.5494095260135633, "learning_rate": 9.956534325188664e-06, "loss": 0.7047, "step": 720 }, { "epoch": 0.8011049723756906, "grad_norm": 0.6304028016016777, "learning_rate": 9.701860580265087e-06, "loss": 0.6768, "step": 725 }, { "epoch": 0.8066298342541437, "grad_norm": 0.5704657105301698, "learning_rate": 9.453138191328185e-06, "loss": 0.6683, "step": 730 }, { "epoch": 0.8121546961325967, "grad_norm": 0.5890416243667709, "learning_rate": 9.210450326367803e-06, "loss": 0.6728, "step": 735 }, { "epoch": 0.8176795580110497, "grad_norm": 0.5607066583139462, "learning_rate": 8.973878135544859e-06, "loss": 0.683, "step": 740 }, { "epoch": 0.8232044198895028, "grad_norm": 0.6271466709020599, "learning_rate": 8.743500724056313e-06, "loss": 0.6754, "step": 745 }, { "epoch": 0.8287292817679558, "grad_norm": 0.6131988976912456, "learning_rate": 8.519395125683873e-06, "loss": 0.6387, "step": 750 }, { "epoch": 0.8342541436464088, "grad_norm": 0.5615837426545646, "learning_rate": 8.30163627703541e-06, "loss": 0.6717, "step": 755 }, { "epoch": 0.8397790055248618, "grad_norm": 0.5158077036018013, "learning_rate": 8.090296992487588e-06, "loss": 0.6001, "step": 760 }, { "epoch": 0.8453038674033149, "grad_norm": 0.5367673789723251, "learning_rate": 7.885447939838128e-06, "loss": 0.675, "step": 765 }, { "epoch": 0.850828729281768, "grad_norm": 0.53629105700699, "learning_rate": 7.687157616675851e-06, "loss": 0.6333, "step": 770 }, { "epoch": 0.856353591160221, "grad_norm": 0.5672788900263016, "learning_rate": 7.495492327476418e-06, "loss": 0.6704, "step": 775 }, { "epoch": 0.861878453038674, "grad_norm": 0.5428010023883922, "learning_rate": 7.310516161431368e-06, "loss": 0.6473, "step": 780 }, { "epoch": 0.8674033149171271, "grad_norm": 0.5921147890300914, "learning_rate": 7.132290971017927e-06, "loss": 0.6815, "step": 785 }, { "epoch": 0.8729281767955801, "grad_norm": 0.5344053242706458, "learning_rate": 6.9608763513167336e-06, "loss": 0.6564, "step": 790 }, { "epoch": 0.8784530386740331, "grad_norm": 0.628627445655646, "learning_rate": 6.796329620084385e-06, "loss": 0.6537, "step": 795 }, { "epoch": 0.8839779005524862, "grad_norm": 0.5410896967839478, "learning_rate": 6.63870579858749e-06, "loss": 0.6215, "step": 800 }, { "epoch": 0.8895027624309392, "grad_norm": 0.553908457484728, "learning_rate": 6.488057593204589e-06, "loss": 0.6904, "step": 805 }, { "epoch": 0.8950276243093923, "grad_norm": 0.5653620781263345, "learning_rate": 6.344435377802178e-06, "loss": 0.6563, "step": 810 }, { "epoch": 0.9005524861878453, "grad_norm": 0.5298053711596861, "learning_rate": 6.207887176890645e-06, "loss": 0.6585, "step": 815 }, { "epoch": 0.9060773480662984, "grad_norm": 0.5934777330589273, "learning_rate": 6.0784586495658025e-06, "loss": 0.6779, "step": 820 }, { "epoch": 0.9116022099447514, "grad_norm": 0.5863688277507009, "learning_rate": 5.956193074241348e-06, "loss": 0.6686, "step": 825 }, { "epoch": 0.9171270718232044, "grad_norm": 0.49942899738158636, "learning_rate": 5.841131334177408e-06, "loss": 0.6206, "step": 830 }, { "epoch": 0.9226519337016574, "grad_norm": 0.542957527300465, "learning_rate": 5.733311903809964e-06, "loss": 0.6407, "step": 835 }, { "epoch": 0.9281767955801105, "grad_norm": 0.5772756241841824, "learning_rate": 5.6327708358857185e-06, "loss": 0.6838, "step": 840 }, { "epoch": 0.9337016574585635, "grad_norm": 0.6803622516332938, "learning_rate": 5.5395417494067696e-06, "loss": 0.6923, "step": 845 }, { "epoch": 0.9392265193370166, "grad_norm": 0.5292098020073351, "learning_rate": 5.453655818389058e-06, "loss": 0.6497, "step": 850 }, { "epoch": 0.9447513812154696, "grad_norm": 0.4934073985759796, "learning_rate": 5.37514176143837e-06, "loss": 0.6088, "step": 855 }, { "epoch": 0.9502762430939227, "grad_norm": 0.593336038543385, "learning_rate": 5.304025832147392e-06, "loss": 0.6514, "step": 860 }, { "epoch": 0.9558011049723757, "grad_norm": 0.5473762015068131, "learning_rate": 5.240331810317012e-06, "loss": 0.6576, "step": 865 }, { "epoch": 0.9613259668508287, "grad_norm": 0.6258926493703553, "learning_rate": 5.184080994004797e-06, "loss": 0.6733, "step": 870 }, { "epoch": 0.9668508287292817, "grad_norm": 0.5461645379290371, "learning_rate": 5.135292192403366e-06, "loss": 0.6529, "step": 875 }, { "epoch": 0.9723756906077348, "grad_norm": 0.6064073284247843, "learning_rate": 5.093981719550922e-06, "loss": 0.6559, "step": 880 }, { "epoch": 0.9779005524861878, "grad_norm": 0.5243053103650271, "learning_rate": 5.060163388876165e-06, "loss": 0.6214, "step": 885 }, { "epoch": 0.9834254143646409, "grad_norm": 0.5755014538442916, "learning_rate": 5.033848508579353e-06, "loss": 0.6603, "step": 890 }, { "epoch": 0.988950276243094, "grad_norm": 0.5488561027700054, "learning_rate": 5.015045877851049e-06, "loss": 0.6213, "step": 895 }, { "epoch": 0.994475138121547, "grad_norm": 0.5042073727528616, "learning_rate": 5.003761783929837e-06, "loss": 0.6301, "step": 900 }, { "epoch": 1.0, "grad_norm": 0.5266929188152082, "learning_rate": 5e-06, "loss": 0.6544, "step": 905 }, { "epoch": 1.0, "step": 905, "total_flos": 77344140165120.0, "train_loss": 0.7063807840505357, "train_runtime": 17902.8015, "train_samples_per_second": 0.303, "train_steps_per_second": 0.051 } ], "logging_steps": 5, "max_steps": 905, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 77344140165120.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }