{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.18053800324968405, "eval_steps": 500, "global_step": 20000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00045134500812421015, "grad_norm": 4.092976093292236, "learning_rate": 4.9977884094601916e-05, "loss": 5.2971, "step": 50 }, { "epoch": 0.0009026900162484203, "grad_norm": 3.788925886154175, "learning_rate": 4.99553168441957e-05, "loss": 5.2741, "step": 100 }, { "epoch": 0.0013540350243726304, "grad_norm": 5.105148792266846, "learning_rate": 4.9932749593789494e-05, "loss": 5.0557, "step": 150 }, { "epoch": 0.0018053800324968406, "grad_norm": 4.261813163757324, "learning_rate": 4.991018234338328e-05, "loss": 5.155, "step": 200 }, { "epoch": 0.0022567250406210506, "grad_norm": 6.269433498382568, "learning_rate": 4.988761509297707e-05, "loss": 5.098, "step": 250 }, { "epoch": 0.002708070048745261, "grad_norm": 4.494116306304932, "learning_rate": 4.986504784257086e-05, "loss": 5.0972, "step": 300 }, { "epoch": 0.003159415056869471, "grad_norm": 3.811136484146118, "learning_rate": 4.984248059216465e-05, "loss": 5.0968, "step": 350 }, { "epoch": 0.003610760064993681, "grad_norm": 5.116394996643066, "learning_rate": 4.9819913341758444e-05, "loss": 5.0053, "step": 400 }, { "epoch": 0.004062105073117891, "grad_norm": 3.4574902057647705, "learning_rate": 4.979734609135223e-05, "loss": 5.0443, "step": 450 }, { "epoch": 0.004513450081242101, "grad_norm": 3.924276113510132, "learning_rate": 4.977477884094602e-05, "loss": 4.9834, "step": 500 }, { "epoch": 0.004964795089366311, "grad_norm": 3.120497226715088, "learning_rate": 4.975221159053981e-05, "loss": 4.9859, "step": 550 }, { "epoch": 0.005416140097490522, "grad_norm": 5.467548847198486, "learning_rate": 4.97296443401336e-05, "loss": 5.0014, "step": 600 }, { "epoch": 0.005867485105614732, "grad_norm": 4.165292739868164, "learning_rate": 4.970707708972739e-05, "loss": 4.9381, "step": 650 }, { "epoch": 0.006318830113738942, "grad_norm": 5.348793029785156, "learning_rate": 4.968450983932118e-05, "loss": 4.8988, "step": 700 }, { "epoch": 0.006770175121863152, "grad_norm": 5.445329189300537, "learning_rate": 4.9661942588914965e-05, "loss": 4.882, "step": 750 }, { "epoch": 0.007221520129987362, "grad_norm": 3.731977939605713, "learning_rate": 4.963937533850876e-05, "loss": 4.8879, "step": 800 }, { "epoch": 0.007672865138111573, "grad_norm": 4.9821343421936035, "learning_rate": 4.9616808088102544e-05, "loss": 4.8933, "step": 850 }, { "epoch": 0.008124210146235782, "grad_norm": 6.4130401611328125, "learning_rate": 4.9594240837696337e-05, "loss": 4.8942, "step": 900 }, { "epoch": 0.008575555154359992, "grad_norm": 5.44791841506958, "learning_rate": 4.957167358729013e-05, "loss": 4.816, "step": 950 }, { "epoch": 0.009026900162484202, "grad_norm": 4.63847541809082, "learning_rate": 4.9549106336883915e-05, "loss": 4.8797, "step": 1000 }, { "epoch": 0.009478245170608413, "grad_norm": 4.616076946258545, "learning_rate": 4.952653908647771e-05, "loss": 4.8622, "step": 1050 }, { "epoch": 0.009929590178732623, "grad_norm": 4.847900390625, "learning_rate": 4.9503971836071494e-05, "loss": 4.8765, "step": 1100 }, { "epoch": 0.010380935186856833, "grad_norm": 3.968596935272217, "learning_rate": 4.9481404585665286e-05, "loss": 4.7834, "step": 1150 }, { "epoch": 0.010832280194981043, "grad_norm": 3.6416995525360107, "learning_rate": 4.945883733525907e-05, "loss": 4.8674, "step": 1200 }, { "epoch": 0.011283625203105253, "grad_norm": 6.88565731048584, "learning_rate": 4.9436270084852865e-05, "loss": 4.809, "step": 1250 }, { "epoch": 0.011734970211229464, "grad_norm": 5.064456462860107, "learning_rate": 4.941370283444665e-05, "loss": 4.8121, "step": 1300 }, { "epoch": 0.012186315219353674, "grad_norm": 4.556407451629639, "learning_rate": 4.939113558404044e-05, "loss": 4.799, "step": 1350 }, { "epoch": 0.012637660227477884, "grad_norm": 4.071566581726074, "learning_rate": 4.936856833363423e-05, "loss": 4.7471, "step": 1400 }, { "epoch": 0.013089005235602094, "grad_norm": 4.647943019866943, "learning_rate": 4.934600108322802e-05, "loss": 4.7783, "step": 1450 }, { "epoch": 0.013540350243726304, "grad_norm": 4.131853103637695, "learning_rate": 4.9323433832821814e-05, "loss": 4.8507, "step": 1500 }, { "epoch": 0.013991695251850515, "grad_norm": 6.93862771987915, "learning_rate": 4.93008665824156e-05, "loss": 4.8, "step": 1550 }, { "epoch": 0.014443040259974725, "grad_norm": 4.366854190826416, "learning_rate": 4.927829933200939e-05, "loss": 4.8794, "step": 1600 }, { "epoch": 0.014894385268098935, "grad_norm": 3.989370822906494, "learning_rate": 4.925573208160318e-05, "loss": 4.7972, "step": 1650 }, { "epoch": 0.015345730276223145, "grad_norm": 4.402428150177002, "learning_rate": 4.923316483119697e-05, "loss": 4.9, "step": 1700 }, { "epoch": 0.015797075284347355, "grad_norm": 4.536413192749023, "learning_rate": 4.921059758079076e-05, "loss": 4.7663, "step": 1750 }, { "epoch": 0.016248420292471564, "grad_norm": 6.875385284423828, "learning_rate": 4.918803033038455e-05, "loss": 4.8557, "step": 1800 }, { "epoch": 0.016699765300595776, "grad_norm": 2.8038690090179443, "learning_rate": 4.9165463079978336e-05, "loss": 4.8403, "step": 1850 }, { "epoch": 0.017151110308719984, "grad_norm": 4.83705997467041, "learning_rate": 4.914289582957213e-05, "loss": 4.8451, "step": 1900 }, { "epoch": 0.017602455316844196, "grad_norm": 3.359116315841675, "learning_rate": 4.9120328579165914e-05, "loss": 4.8559, "step": 1950 }, { "epoch": 0.018053800324968405, "grad_norm": 6.140733242034912, "learning_rate": 4.909776132875971e-05, "loss": 4.8633, "step": 2000 }, { "epoch": 0.018505145333092617, "grad_norm": 4.224785327911377, "learning_rate": 4.90751940783535e-05, "loss": 4.835, "step": 2050 }, { "epoch": 0.018956490341216825, "grad_norm": 3.613844394683838, "learning_rate": 4.9052626827947285e-05, "loss": 4.7293, "step": 2100 }, { "epoch": 0.019407835349341037, "grad_norm": 5.848568439483643, "learning_rate": 4.903005957754108e-05, "loss": 4.6969, "step": 2150 }, { "epoch": 0.019859180357465245, "grad_norm": 3.9656293392181396, "learning_rate": 4.9007492327134864e-05, "loss": 4.7284, "step": 2200 }, { "epoch": 0.020310525365589457, "grad_norm": 5.00789213180542, "learning_rate": 4.898492507672866e-05, "loss": 4.865, "step": 2250 }, { "epoch": 0.020761870373713666, "grad_norm": 4.17151403427124, "learning_rate": 4.896235782632244e-05, "loss": 4.7261, "step": 2300 }, { "epoch": 0.021213215381837878, "grad_norm": 3.966817617416382, "learning_rate": 4.8939790575916235e-05, "loss": 4.7437, "step": 2350 }, { "epoch": 0.021664560389962086, "grad_norm": 4.516706943511963, "learning_rate": 4.891722332551002e-05, "loss": 4.7352, "step": 2400 }, { "epoch": 0.022115905398086298, "grad_norm": 4.184154033660889, "learning_rate": 4.8894656075103814e-05, "loss": 4.6622, "step": 2450 }, { "epoch": 0.022567250406210507, "grad_norm": 6.0985188484191895, "learning_rate": 4.8872088824697606e-05, "loss": 4.7659, "step": 2500 }, { "epoch": 0.02301859541433472, "grad_norm": 4.630510330200195, "learning_rate": 4.884952157429139e-05, "loss": 4.8424, "step": 2550 }, { "epoch": 0.023469940422458927, "grad_norm": 4.261359214782715, "learning_rate": 4.8826954323885185e-05, "loss": 4.7576, "step": 2600 }, { "epoch": 0.02392128543058314, "grad_norm": 4.511416435241699, "learning_rate": 4.880438707347897e-05, "loss": 4.7644, "step": 2650 }, { "epoch": 0.024372630438707348, "grad_norm": 3.6945180892944336, "learning_rate": 4.878181982307276e-05, "loss": 4.7893, "step": 2700 }, { "epoch": 0.02482397544683156, "grad_norm": 5.0419511795043945, "learning_rate": 4.875925257266655e-05, "loss": 4.6466, "step": 2750 }, { "epoch": 0.025275320454955768, "grad_norm": 3.80349063873291, "learning_rate": 4.873668532226034e-05, "loss": 4.778, "step": 2800 }, { "epoch": 0.02572666546307998, "grad_norm": 3.5543832778930664, "learning_rate": 4.871411807185413e-05, "loss": 4.6788, "step": 2850 }, { "epoch": 0.02617801047120419, "grad_norm": 3.064133405685425, "learning_rate": 4.869155082144792e-05, "loss": 4.6796, "step": 2900 }, { "epoch": 0.0266293554793284, "grad_norm": 3.449727773666382, "learning_rate": 4.8668983571041706e-05, "loss": 4.7513, "step": 2950 }, { "epoch": 0.02708070048745261, "grad_norm": 3.831252098083496, "learning_rate": 4.86464163206355e-05, "loss": 4.6886, "step": 3000 }, { "epoch": 0.027532045495576817, "grad_norm": 6.98654842376709, "learning_rate": 4.862384907022929e-05, "loss": 4.7468, "step": 3050 }, { "epoch": 0.02798339050370103, "grad_norm": 3.842249870300293, "learning_rate": 4.860128181982308e-05, "loss": 4.6317, "step": 3100 }, { "epoch": 0.028434735511825238, "grad_norm": 8.266908645629883, "learning_rate": 4.857871456941686e-05, "loss": 4.7426, "step": 3150 }, { "epoch": 0.02888608051994945, "grad_norm": 5.496558666229248, "learning_rate": 4.8556147319010656e-05, "loss": 4.5784, "step": 3200 }, { "epoch": 0.029337425528073658, "grad_norm": 4.078311920166016, "learning_rate": 4.853358006860444e-05, "loss": 4.6739, "step": 3250 }, { "epoch": 0.02978877053619787, "grad_norm": 3.8962206840515137, "learning_rate": 4.8511012818198234e-05, "loss": 4.7384, "step": 3300 }, { "epoch": 0.03024011554432208, "grad_norm": 3.655855178833008, "learning_rate": 4.848844556779202e-05, "loss": 4.7782, "step": 3350 }, { "epoch": 0.03069146055244629, "grad_norm": 3.840287446975708, "learning_rate": 4.846587831738581e-05, "loss": 4.6969, "step": 3400 }, { "epoch": 0.0311428055605705, "grad_norm": 3.54238224029541, "learning_rate": 4.84433110669796e-05, "loss": 4.7998, "step": 3450 }, { "epoch": 0.03159415056869471, "grad_norm": 6.432263374328613, "learning_rate": 4.842074381657339e-05, "loss": 4.7554, "step": 3500 }, { "epoch": 0.03204549557681892, "grad_norm": 4.151718616485596, "learning_rate": 4.839817656616718e-05, "loss": 4.7455, "step": 3550 }, { "epoch": 0.03249684058494313, "grad_norm": 3.6925272941589355, "learning_rate": 4.837560931576097e-05, "loss": 4.7143, "step": 3600 }, { "epoch": 0.03294818559306734, "grad_norm": 5.515355110168457, "learning_rate": 4.8353042065354756e-05, "loss": 4.6842, "step": 3650 }, { "epoch": 0.03339953060119155, "grad_norm": 4.059805393218994, "learning_rate": 4.833047481494855e-05, "loss": 4.6799, "step": 3700 }, { "epoch": 0.03385087560931576, "grad_norm": 4.311253547668457, "learning_rate": 4.8307907564542334e-05, "loss": 4.7937, "step": 3750 }, { "epoch": 0.03430222061743997, "grad_norm": 3.7470786571502686, "learning_rate": 4.828534031413613e-05, "loss": 4.6706, "step": 3800 }, { "epoch": 0.034753565625564184, "grad_norm": 3.432297468185425, "learning_rate": 4.826277306372991e-05, "loss": 4.6627, "step": 3850 }, { "epoch": 0.03520491063368839, "grad_norm": 2.6612203121185303, "learning_rate": 4.8240205813323706e-05, "loss": 4.6027, "step": 3900 }, { "epoch": 0.0356562556418126, "grad_norm": 5.329100131988525, "learning_rate": 4.821763856291749e-05, "loss": 4.7471, "step": 3950 }, { "epoch": 0.03610760064993681, "grad_norm": 3.7262275218963623, "learning_rate": 4.8195071312511284e-05, "loss": 4.6032, "step": 4000 }, { "epoch": 0.036558945658061025, "grad_norm": 4.605144500732422, "learning_rate": 4.817250406210507e-05, "loss": 4.8456, "step": 4050 }, { "epoch": 0.03701029066618523, "grad_norm": 3.8024492263793945, "learning_rate": 4.814993681169886e-05, "loss": 4.7256, "step": 4100 }, { "epoch": 0.03746163567430944, "grad_norm": 7.693057060241699, "learning_rate": 4.8127369561292655e-05, "loss": 4.6224, "step": 4150 }, { "epoch": 0.03791298068243365, "grad_norm": 4.100279808044434, "learning_rate": 4.810480231088644e-05, "loss": 4.7119, "step": 4200 }, { "epoch": 0.038364325690557866, "grad_norm": 4.8026347160339355, "learning_rate": 4.8082235060480234e-05, "loss": 4.57, "step": 4250 }, { "epoch": 0.038815670698682074, "grad_norm": 5.2641119956970215, "learning_rate": 4.805966781007402e-05, "loss": 4.5887, "step": 4300 }, { "epoch": 0.03926701570680628, "grad_norm": 2.8225934505462646, "learning_rate": 4.803710055966781e-05, "loss": 4.6981, "step": 4350 }, { "epoch": 0.03971836071493049, "grad_norm": 3.3784983158111572, "learning_rate": 4.80145333092616e-05, "loss": 4.6752, "step": 4400 }, { "epoch": 0.040169705723054706, "grad_norm": 5.6406426429748535, "learning_rate": 4.799196605885539e-05, "loss": 4.6396, "step": 4450 }, { "epoch": 0.040621050731178915, "grad_norm": 4.564062595367432, "learning_rate": 4.7969398808449177e-05, "loss": 4.6158, "step": 4500 }, { "epoch": 0.04107239573930312, "grad_norm": 3.6431472301483154, "learning_rate": 4.794683155804297e-05, "loss": 4.6883, "step": 4550 }, { "epoch": 0.04152374074742733, "grad_norm": 5.026195526123047, "learning_rate": 4.792426430763676e-05, "loss": 4.7225, "step": 4600 }, { "epoch": 0.04197508575555154, "grad_norm": 4.776146411895752, "learning_rate": 4.790169705723055e-05, "loss": 4.6596, "step": 4650 }, { "epoch": 0.042426430763675756, "grad_norm": 4.838674545288086, "learning_rate": 4.787912980682434e-05, "loss": 4.6576, "step": 4700 }, { "epoch": 0.042877775771799964, "grad_norm": 4.529509544372559, "learning_rate": 4.7856562556418126e-05, "loss": 4.6721, "step": 4750 }, { "epoch": 0.04332912077992417, "grad_norm": 4.392935752868652, "learning_rate": 4.783399530601192e-05, "loss": 4.701, "step": 4800 }, { "epoch": 0.04378046578804838, "grad_norm": 4.331223011016846, "learning_rate": 4.7811428055605705e-05, "loss": 4.6795, "step": 4850 }, { "epoch": 0.044231810796172596, "grad_norm": 4.109352111816406, "learning_rate": 4.77888608051995e-05, "loss": 4.5997, "step": 4900 }, { "epoch": 0.044683155804296805, "grad_norm": 3.7418441772460938, "learning_rate": 4.776629355479328e-05, "loss": 4.6427, "step": 4950 }, { "epoch": 0.04513450081242101, "grad_norm": 3.0237081050872803, "learning_rate": 4.7743726304387076e-05, "loss": 4.7359, "step": 5000 }, { "epoch": 0.04558584582054522, "grad_norm": 3.9886231422424316, "learning_rate": 4.772115905398086e-05, "loss": 4.5842, "step": 5050 }, { "epoch": 0.04603719082866944, "grad_norm": 4.597533226013184, "learning_rate": 4.7698591803574654e-05, "loss": 4.7202, "step": 5100 }, { "epoch": 0.046488535836793646, "grad_norm": 4.520393371582031, "learning_rate": 4.767602455316845e-05, "loss": 4.5774, "step": 5150 }, { "epoch": 0.046939880844917854, "grad_norm": 3.2824018001556396, "learning_rate": 4.765345730276223e-05, "loss": 4.6084, "step": 5200 }, { "epoch": 0.04739122585304206, "grad_norm": 6.290219783782959, "learning_rate": 4.7630890052356026e-05, "loss": 4.6361, "step": 5250 }, { "epoch": 0.04784257086116628, "grad_norm": 4.844172954559326, "learning_rate": 4.760832280194981e-05, "loss": 4.6252, "step": 5300 }, { "epoch": 0.04829391586929049, "grad_norm": 4.8328962326049805, "learning_rate": 4.7585755551543604e-05, "loss": 4.5557, "step": 5350 }, { "epoch": 0.048745260877414695, "grad_norm": 4.386012077331543, "learning_rate": 4.756318830113739e-05, "loss": 4.6911, "step": 5400 }, { "epoch": 0.049196605885538904, "grad_norm": 4.393270969390869, "learning_rate": 4.754062105073118e-05, "loss": 4.4869, "step": 5450 }, { "epoch": 0.04964795089366312, "grad_norm": 3.9346606731414795, "learning_rate": 4.751805380032497e-05, "loss": 4.608, "step": 5500 }, { "epoch": 0.05009929590178733, "grad_norm": 5.140569686889648, "learning_rate": 4.749548654991876e-05, "loss": 4.6262, "step": 5550 }, { "epoch": 0.050550640909911536, "grad_norm": 3.2936654090881348, "learning_rate": 4.747291929951255e-05, "loss": 4.6565, "step": 5600 }, { "epoch": 0.051001985918035744, "grad_norm": 3.5564124584198, "learning_rate": 4.745035204910634e-05, "loss": 4.5727, "step": 5650 }, { "epoch": 0.05145333092615996, "grad_norm": 3.9385626316070557, "learning_rate": 4.742778479870013e-05, "loss": 4.7165, "step": 5700 }, { "epoch": 0.05190467593428417, "grad_norm": 3.736527681350708, "learning_rate": 4.740521754829392e-05, "loss": 4.6504, "step": 5750 }, { "epoch": 0.05235602094240838, "grad_norm": 3.3729724884033203, "learning_rate": 4.738265029788771e-05, "loss": 4.7029, "step": 5800 }, { "epoch": 0.052807365950532585, "grad_norm": 2.953383445739746, "learning_rate": 4.73600830474815e-05, "loss": 4.5483, "step": 5850 }, { "epoch": 0.0532587109586568, "grad_norm": 4.406127452850342, "learning_rate": 4.733751579707529e-05, "loss": 4.6443, "step": 5900 }, { "epoch": 0.05371005596678101, "grad_norm": 2.935302495956421, "learning_rate": 4.7314948546669075e-05, "loss": 4.61, "step": 5950 }, { "epoch": 0.05416140097490522, "grad_norm": 4.362770080566406, "learning_rate": 4.729238129626287e-05, "loss": 4.5821, "step": 6000 }, { "epoch": 0.054612745983029426, "grad_norm": 3.588181972503662, "learning_rate": 4.7269814045856654e-05, "loss": 4.6317, "step": 6050 }, { "epoch": 0.055064090991153634, "grad_norm": 2.7238504886627197, "learning_rate": 4.7247246795450446e-05, "loss": 4.6867, "step": 6100 }, { "epoch": 0.05551543599927785, "grad_norm": 3.66497802734375, "learning_rate": 4.722467954504423e-05, "loss": 4.557, "step": 6150 }, { "epoch": 0.05596678100740206, "grad_norm": 3.9344165325164795, "learning_rate": 4.7202112294638025e-05, "loss": 4.5099, "step": 6200 }, { "epoch": 0.05641812601552627, "grad_norm": 3.919712781906128, "learning_rate": 4.717954504423182e-05, "loss": 4.6521, "step": 6250 }, { "epoch": 0.056869471023650475, "grad_norm": 6.165071964263916, "learning_rate": 4.71569777938256e-05, "loss": 4.6656, "step": 6300 }, { "epoch": 0.05732081603177469, "grad_norm": 3.976167917251587, "learning_rate": 4.7134410543419396e-05, "loss": 4.6891, "step": 6350 }, { "epoch": 0.0577721610398989, "grad_norm": 3.4293136596679688, "learning_rate": 4.711184329301318e-05, "loss": 4.6213, "step": 6400 }, { "epoch": 0.05822350604802311, "grad_norm": 3.062398910522461, "learning_rate": 4.7089276042606975e-05, "loss": 4.6794, "step": 6450 }, { "epoch": 0.058674851056147316, "grad_norm": 3.9836747646331787, "learning_rate": 4.706670879220076e-05, "loss": 4.6722, "step": 6500 }, { "epoch": 0.05912619606427153, "grad_norm": 4.0859246253967285, "learning_rate": 4.704414154179455e-05, "loss": 4.6909, "step": 6550 }, { "epoch": 0.05957754107239574, "grad_norm": 4.478472709655762, "learning_rate": 4.702157429138834e-05, "loss": 4.4942, "step": 6600 }, { "epoch": 0.06002888608051995, "grad_norm": 5.508967399597168, "learning_rate": 4.699900704098213e-05, "loss": 4.6658, "step": 6650 }, { "epoch": 0.06048023108864416, "grad_norm": 3.933199644088745, "learning_rate": 4.697643979057592e-05, "loss": 4.5696, "step": 6700 }, { "epoch": 0.06093157609676837, "grad_norm": 3.0764100551605225, "learning_rate": 4.695387254016971e-05, "loss": 4.7047, "step": 6750 }, { "epoch": 0.06138292110489258, "grad_norm": 3.0718812942504883, "learning_rate": 4.69313052897635e-05, "loss": 4.6213, "step": 6800 }, { "epoch": 0.06183426611301679, "grad_norm": 3.2949626445770264, "learning_rate": 4.690873803935729e-05, "loss": 4.5174, "step": 6850 }, { "epoch": 0.062285611121141, "grad_norm": 3.5119667053222656, "learning_rate": 4.688617078895108e-05, "loss": 4.6313, "step": 6900 }, { "epoch": 0.0627369561292652, "grad_norm": 3.8293747901916504, "learning_rate": 4.686360353854487e-05, "loss": 4.6896, "step": 6950 }, { "epoch": 0.06318830113738942, "grad_norm": 3.223698139190674, "learning_rate": 4.684103628813866e-05, "loss": 4.6462, "step": 7000 }, { "epoch": 0.06363964614551364, "grad_norm": 3.7061171531677246, "learning_rate": 4.6818469037732446e-05, "loss": 4.5423, "step": 7050 }, { "epoch": 0.06409099115363784, "grad_norm": 3.9031214714050293, "learning_rate": 4.679590178732624e-05, "loss": 4.6688, "step": 7100 }, { "epoch": 0.06454233616176205, "grad_norm": 11.581488609313965, "learning_rate": 4.6773334536920024e-05, "loss": 4.6832, "step": 7150 }, { "epoch": 0.06499368116988626, "grad_norm": 3.9187841415405273, "learning_rate": 4.675076728651382e-05, "loss": 4.6451, "step": 7200 }, { "epoch": 0.06544502617801047, "grad_norm": 3.8191521167755127, "learning_rate": 4.67282000361076e-05, "loss": 4.5677, "step": 7250 }, { "epoch": 0.06589637118613469, "grad_norm": 3.5511984825134277, "learning_rate": 4.6705632785701395e-05, "loss": 4.5412, "step": 7300 }, { "epoch": 0.06634771619425889, "grad_norm": 4.853089809417725, "learning_rate": 4.668306553529518e-05, "loss": 4.6636, "step": 7350 }, { "epoch": 0.0667990612023831, "grad_norm": 2.9507358074188232, "learning_rate": 4.6660498284888974e-05, "loss": 4.6232, "step": 7400 }, { "epoch": 0.06725040621050732, "grad_norm": 4.20766019821167, "learning_rate": 4.663793103448276e-05, "loss": 4.6429, "step": 7450 }, { "epoch": 0.06770175121863152, "grad_norm": 2.9639532566070557, "learning_rate": 4.6615363784076546e-05, "loss": 4.5613, "step": 7500 }, { "epoch": 0.06815309622675574, "grad_norm": 4.452625751495361, "learning_rate": 4.659279653367034e-05, "loss": 4.7034, "step": 7550 }, { "epoch": 0.06860444123487994, "grad_norm": 4.076809883117676, "learning_rate": 4.6570229283264124e-05, "loss": 4.6244, "step": 7600 }, { "epoch": 0.06905578624300415, "grad_norm": 3.361752510070801, "learning_rate": 4.654766203285792e-05, "loss": 4.6122, "step": 7650 }, { "epoch": 0.06950713125112837, "grad_norm": 2.9916162490844727, "learning_rate": 4.65250947824517e-05, "loss": 4.5939, "step": 7700 }, { "epoch": 0.06995847625925257, "grad_norm": 4.1875200271606445, "learning_rate": 4.6502527532045495e-05, "loss": 4.5255, "step": 7750 }, { "epoch": 0.07040982126737678, "grad_norm": 2.9376866817474365, "learning_rate": 4.647996028163929e-05, "loss": 4.6222, "step": 7800 }, { "epoch": 0.07086116627550099, "grad_norm": 3.77079176902771, "learning_rate": 4.6457393031233074e-05, "loss": 4.5671, "step": 7850 }, { "epoch": 0.0713125112836252, "grad_norm": 6.709794044494629, "learning_rate": 4.6434825780826866e-05, "loss": 4.493, "step": 7900 }, { "epoch": 0.07176385629174942, "grad_norm": 4.273845195770264, "learning_rate": 4.641225853042065e-05, "loss": 4.6503, "step": 7950 }, { "epoch": 0.07221520129987362, "grad_norm": 3.1263434886932373, "learning_rate": 4.6389691280014445e-05, "loss": 4.6878, "step": 8000 }, { "epoch": 0.07266654630799783, "grad_norm": 4.049619674682617, "learning_rate": 4.636712402960823e-05, "loss": 4.6728, "step": 8050 }, { "epoch": 0.07311789131612205, "grad_norm": 4.419615745544434, "learning_rate": 4.6344556779202023e-05, "loss": 4.5648, "step": 8100 }, { "epoch": 0.07356923632424625, "grad_norm": 4.067174911499023, "learning_rate": 4.632198952879581e-05, "loss": 4.6475, "step": 8150 }, { "epoch": 0.07402058133237047, "grad_norm": 3.8273239135742188, "learning_rate": 4.62994222783896e-05, "loss": 4.4821, "step": 8200 }, { "epoch": 0.07447192634049467, "grad_norm": 2.988802433013916, "learning_rate": 4.627685502798339e-05, "loss": 4.4786, "step": 8250 }, { "epoch": 0.07492327134861888, "grad_norm": 4.000159740447998, "learning_rate": 4.625428777757718e-05, "loss": 4.6493, "step": 8300 }, { "epoch": 0.0753746163567431, "grad_norm": 4.026582717895508, "learning_rate": 4.623172052717097e-05, "loss": 4.6096, "step": 8350 }, { "epoch": 0.0758259613648673, "grad_norm": 3.3265931606292725, "learning_rate": 4.620915327676476e-05, "loss": 4.5148, "step": 8400 }, { "epoch": 0.07627730637299152, "grad_norm": 3.2252328395843506, "learning_rate": 4.618658602635855e-05, "loss": 4.6038, "step": 8450 }, { "epoch": 0.07672865138111573, "grad_norm": 3.4897453784942627, "learning_rate": 4.616401877595234e-05, "loss": 4.6121, "step": 8500 }, { "epoch": 0.07717999638923993, "grad_norm": 3.3298215866088867, "learning_rate": 4.614145152554613e-05, "loss": 4.5457, "step": 8550 }, { "epoch": 0.07763134139736415, "grad_norm": 3.875998020172119, "learning_rate": 4.6118884275139916e-05, "loss": 4.5236, "step": 8600 }, { "epoch": 0.07808268640548835, "grad_norm": 3.5962016582489014, "learning_rate": 4.609631702473371e-05, "loss": 4.528, "step": 8650 }, { "epoch": 0.07853403141361257, "grad_norm": 2.4850423336029053, "learning_rate": 4.6073749774327494e-05, "loss": 4.5441, "step": 8700 }, { "epoch": 0.07898537642173678, "grad_norm": 2.6482949256896973, "learning_rate": 4.605118252392129e-05, "loss": 4.5235, "step": 8750 }, { "epoch": 0.07943672142986098, "grad_norm": 3.3628525733947754, "learning_rate": 4.602861527351507e-05, "loss": 4.6373, "step": 8800 }, { "epoch": 0.0798880664379852, "grad_norm": 3.0251526832580566, "learning_rate": 4.6006048023108866e-05, "loss": 4.3972, "step": 8850 }, { "epoch": 0.08033941144610941, "grad_norm": 3.8248074054718018, "learning_rate": 4.598348077270266e-05, "loss": 4.5324, "step": 8900 }, { "epoch": 0.08079075645423361, "grad_norm": 3.5319507122039795, "learning_rate": 4.5960913522296444e-05, "loss": 4.6163, "step": 8950 }, { "epoch": 0.08124210146235783, "grad_norm": 5.563832759857178, "learning_rate": 4.593834627189024e-05, "loss": 4.5826, "step": 9000 }, { "epoch": 0.08169344647048203, "grad_norm": 3.98085355758667, "learning_rate": 4.591577902148402e-05, "loss": 4.6515, "step": 9050 }, { "epoch": 0.08214479147860625, "grad_norm": 6.063210964202881, "learning_rate": 4.5893211771077815e-05, "loss": 4.5158, "step": 9100 }, { "epoch": 0.08259613648673046, "grad_norm": 3.957599401473999, "learning_rate": 4.58706445206716e-05, "loss": 4.5528, "step": 9150 }, { "epoch": 0.08304748149485466, "grad_norm": 3.1111884117126465, "learning_rate": 4.5848077270265394e-05, "loss": 4.5484, "step": 9200 }, { "epoch": 0.08349882650297888, "grad_norm": 4.1915059089660645, "learning_rate": 4.582551001985918e-05, "loss": 4.595, "step": 9250 }, { "epoch": 0.08395017151110308, "grad_norm": 4.1448140144348145, "learning_rate": 4.580294276945297e-05, "loss": 4.6259, "step": 9300 }, { "epoch": 0.0844015165192273, "grad_norm": 3.6308369636535645, "learning_rate": 4.5780375519046765e-05, "loss": 4.6703, "step": 9350 }, { "epoch": 0.08485286152735151, "grad_norm": 6.079587459564209, "learning_rate": 4.575780826864055e-05, "loss": 4.6145, "step": 9400 }, { "epoch": 0.08530420653547571, "grad_norm": 3.5566651821136475, "learning_rate": 4.5735241018234343e-05, "loss": 4.5084, "step": 9450 }, { "epoch": 0.08575555154359993, "grad_norm": 4.733799934387207, "learning_rate": 4.571267376782813e-05, "loss": 4.5918, "step": 9500 }, { "epoch": 0.08620689655172414, "grad_norm": 3.1966097354888916, "learning_rate": 4.569010651742192e-05, "loss": 4.4592, "step": 9550 }, { "epoch": 0.08665824155984835, "grad_norm": 3.9291093349456787, "learning_rate": 4.566753926701571e-05, "loss": 4.5673, "step": 9600 }, { "epoch": 0.08710958656797256, "grad_norm": 5.446611404418945, "learning_rate": 4.56449720166095e-05, "loss": 4.6176, "step": 9650 }, { "epoch": 0.08756093157609676, "grad_norm": 3.054124355316162, "learning_rate": 4.5622404766203286e-05, "loss": 4.6921, "step": 9700 }, { "epoch": 0.08801227658422098, "grad_norm": 3.27416729927063, "learning_rate": 4.559983751579708e-05, "loss": 4.5667, "step": 9750 }, { "epoch": 0.08846362159234519, "grad_norm": 3.577589273452759, "learning_rate": 4.5577270265390865e-05, "loss": 4.442, "step": 9800 }, { "epoch": 0.0889149666004694, "grad_norm": 3.566028118133545, "learning_rate": 4.555470301498466e-05, "loss": 4.6025, "step": 9850 }, { "epoch": 0.08936631160859361, "grad_norm": 4.064197540283203, "learning_rate": 4.553213576457845e-05, "loss": 4.5812, "step": 9900 }, { "epoch": 0.08981765661671783, "grad_norm": 4.237987041473389, "learning_rate": 4.5509568514172236e-05, "loss": 4.6083, "step": 9950 }, { "epoch": 0.09026900162484203, "grad_norm": 3.0101680755615234, "learning_rate": 4.548700126376603e-05, "loss": 4.6099, "step": 10000 }, { "epoch": 0.09072034663296624, "grad_norm": 3.5102596282958984, "learning_rate": 4.5464434013359815e-05, "loss": 4.5643, "step": 10050 }, { "epoch": 0.09117169164109044, "grad_norm": 4.774995803833008, "learning_rate": 4.544186676295361e-05, "loss": 4.5663, "step": 10100 }, { "epoch": 0.09162303664921466, "grad_norm": 3.963777780532837, "learning_rate": 4.541929951254739e-05, "loss": 4.5603, "step": 10150 }, { "epoch": 0.09207438165733887, "grad_norm": 2.888615846633911, "learning_rate": 4.5396732262141186e-05, "loss": 4.5396, "step": 10200 }, { "epoch": 0.09252572666546308, "grad_norm": 4.281205177307129, "learning_rate": 4.537416501173497e-05, "loss": 4.5725, "step": 10250 }, { "epoch": 0.09297707167358729, "grad_norm": 4.1528472900390625, "learning_rate": 4.5351597761328764e-05, "loss": 4.6262, "step": 10300 }, { "epoch": 0.0934284166817115, "grad_norm": 3.966341972351074, "learning_rate": 4.532903051092255e-05, "loss": 4.5871, "step": 10350 }, { "epoch": 0.09387976168983571, "grad_norm": 3.1821911334991455, "learning_rate": 4.530646326051634e-05, "loss": 4.4732, "step": 10400 }, { "epoch": 0.09433110669795992, "grad_norm": 5.116222858428955, "learning_rate": 4.5283896010110135e-05, "loss": 4.6084, "step": 10450 }, { "epoch": 0.09478245170608413, "grad_norm": 5.254827976226807, "learning_rate": 4.526132875970392e-05, "loss": 4.5696, "step": 10500 }, { "epoch": 0.09523379671420834, "grad_norm": 3.6102991104125977, "learning_rate": 4.5238761509297714e-05, "loss": 4.5971, "step": 10550 }, { "epoch": 0.09568514172233256, "grad_norm": 3.348236322402954, "learning_rate": 4.52161942588915e-05, "loss": 4.5048, "step": 10600 }, { "epoch": 0.09613648673045676, "grad_norm": 3.1192493438720703, "learning_rate": 4.519362700848529e-05, "loss": 4.571, "step": 10650 }, { "epoch": 0.09658783173858097, "grad_norm": 2.9626996517181396, "learning_rate": 4.517105975807908e-05, "loss": 4.4975, "step": 10700 }, { "epoch": 0.09703917674670517, "grad_norm": 3.4130876064300537, "learning_rate": 4.514849250767287e-05, "loss": 4.6279, "step": 10750 }, { "epoch": 0.09749052175482939, "grad_norm": 2.4458179473876953, "learning_rate": 4.512592525726666e-05, "loss": 4.511, "step": 10800 }, { "epoch": 0.0979418667629536, "grad_norm": 5.223287105560303, "learning_rate": 4.510335800686045e-05, "loss": 4.5435, "step": 10850 }, { "epoch": 0.09839321177107781, "grad_norm": 4.481621742248535, "learning_rate": 4.5080790756454235e-05, "loss": 4.5177, "step": 10900 }, { "epoch": 0.09884455677920202, "grad_norm": 2.959305763244629, "learning_rate": 4.505822350604803e-05, "loss": 4.5746, "step": 10950 }, { "epoch": 0.09929590178732624, "grad_norm": 6.753904342651367, "learning_rate": 4.5035656255641814e-05, "loss": 4.5424, "step": 11000 }, { "epoch": 0.09974724679545044, "grad_norm": 3.4904415607452393, "learning_rate": 4.5013089005235606e-05, "loss": 4.5661, "step": 11050 }, { "epoch": 0.10019859180357465, "grad_norm": 3.316413164138794, "learning_rate": 4.499052175482939e-05, "loss": 4.5242, "step": 11100 }, { "epoch": 0.10064993681169886, "grad_norm": 3.974198579788208, "learning_rate": 4.4967954504423185e-05, "loss": 4.5251, "step": 11150 }, { "epoch": 0.10110128181982307, "grad_norm": 4.306400775909424, "learning_rate": 4.494538725401697e-05, "loss": 4.5209, "step": 11200 }, { "epoch": 0.10155262682794729, "grad_norm": 4.841123104095459, "learning_rate": 4.4922820003610764e-05, "loss": 4.5895, "step": 11250 }, { "epoch": 0.10200397183607149, "grad_norm": 3.6396520137786865, "learning_rate": 4.490025275320455e-05, "loss": 4.5788, "step": 11300 }, { "epoch": 0.1024553168441957, "grad_norm": 3.500455379486084, "learning_rate": 4.487768550279834e-05, "loss": 4.4844, "step": 11350 }, { "epoch": 0.10290666185231992, "grad_norm": 4.19438362121582, "learning_rate": 4.485511825239213e-05, "loss": 4.5723, "step": 11400 }, { "epoch": 0.10335800686044412, "grad_norm": 3.513514995574951, "learning_rate": 4.483255100198592e-05, "loss": 4.6457, "step": 11450 }, { "epoch": 0.10380935186856834, "grad_norm": 3.5381104946136475, "learning_rate": 4.4809983751579706e-05, "loss": 4.4716, "step": 11500 }, { "epoch": 0.10426069687669254, "grad_norm": 4.183605194091797, "learning_rate": 4.47874165011735e-05, "loss": 4.5554, "step": 11550 }, { "epoch": 0.10471204188481675, "grad_norm": 3.838669538497925, "learning_rate": 4.4764849250767285e-05, "loss": 4.5354, "step": 11600 }, { "epoch": 0.10516338689294097, "grad_norm": 3.651357889175415, "learning_rate": 4.474228200036108e-05, "loss": 4.5784, "step": 11650 }, { "epoch": 0.10561473190106517, "grad_norm": 3.6753928661346436, "learning_rate": 4.4719714749954863e-05, "loss": 4.5484, "step": 11700 }, { "epoch": 0.10606607690918939, "grad_norm": 4.5028228759765625, "learning_rate": 4.4697147499548656e-05, "loss": 4.5781, "step": 11750 }, { "epoch": 0.1065174219173136, "grad_norm": 7.304862022399902, "learning_rate": 4.467458024914244e-05, "loss": 4.5041, "step": 11800 }, { "epoch": 0.1069687669254378, "grad_norm": 4.280136585235596, "learning_rate": 4.4652012998736235e-05, "loss": 4.6027, "step": 11850 }, { "epoch": 0.10742011193356202, "grad_norm": 3.6763241291046143, "learning_rate": 4.462944574833002e-05, "loss": 4.5593, "step": 11900 }, { "epoch": 0.10787145694168622, "grad_norm": 3.8541440963745117, "learning_rate": 4.460687849792381e-05, "loss": 4.5144, "step": 11950 }, { "epoch": 0.10832280194981043, "grad_norm": 2.8991189002990723, "learning_rate": 4.4584311247517606e-05, "loss": 4.6064, "step": 12000 }, { "epoch": 0.10877414695793465, "grad_norm": 2.928452491760254, "learning_rate": 4.456174399711139e-05, "loss": 4.5631, "step": 12050 }, { "epoch": 0.10922549196605885, "grad_norm": 3.3975236415863037, "learning_rate": 4.4539176746705184e-05, "loss": 4.5839, "step": 12100 }, { "epoch": 0.10967683697418307, "grad_norm": 3.4614107608795166, "learning_rate": 4.451660949629897e-05, "loss": 4.5176, "step": 12150 }, { "epoch": 0.11012818198230727, "grad_norm": 3.582960605621338, "learning_rate": 4.449404224589276e-05, "loss": 4.6017, "step": 12200 }, { "epoch": 0.11057952699043148, "grad_norm": 5.049736499786377, "learning_rate": 4.447147499548655e-05, "loss": 4.5375, "step": 12250 }, { "epoch": 0.1110308719985557, "grad_norm": 4.15340518951416, "learning_rate": 4.444890774508034e-05, "loss": 4.6302, "step": 12300 }, { "epoch": 0.1114822170066799, "grad_norm": 3.0118372440338135, "learning_rate": 4.442634049467413e-05, "loss": 4.6187, "step": 12350 }, { "epoch": 0.11193356201480412, "grad_norm": 3.5457749366760254, "learning_rate": 4.440377324426792e-05, "loss": 4.5201, "step": 12400 }, { "epoch": 0.11238490702292833, "grad_norm": 3.9251248836517334, "learning_rate": 4.4381205993861706e-05, "loss": 4.6261, "step": 12450 }, { "epoch": 0.11283625203105253, "grad_norm": 3.2046866416931152, "learning_rate": 4.43586387434555e-05, "loss": 4.5479, "step": 12500 }, { "epoch": 0.11328759703917675, "grad_norm": 3.1684064865112305, "learning_rate": 4.433607149304929e-05, "loss": 4.5823, "step": 12550 }, { "epoch": 0.11373894204730095, "grad_norm": 4.124698638916016, "learning_rate": 4.431350424264308e-05, "loss": 4.6002, "step": 12600 }, { "epoch": 0.11419028705542517, "grad_norm": 3.9625906944274902, "learning_rate": 4.429093699223687e-05, "loss": 4.5571, "step": 12650 }, { "epoch": 0.11464163206354938, "grad_norm": 4.684337139129639, "learning_rate": 4.4268369741830655e-05, "loss": 4.5483, "step": 12700 }, { "epoch": 0.11509297707167358, "grad_norm": 4.30114221572876, "learning_rate": 4.424580249142445e-05, "loss": 4.5534, "step": 12750 }, { "epoch": 0.1155443220797978, "grad_norm": 3.673405647277832, "learning_rate": 4.4223235241018234e-05, "loss": 4.5645, "step": 12800 }, { "epoch": 0.11599566708792201, "grad_norm": 4.129467964172363, "learning_rate": 4.4200667990612027e-05, "loss": 4.5647, "step": 12850 }, { "epoch": 0.11644701209604622, "grad_norm": 2.8640856742858887, "learning_rate": 4.417810074020581e-05, "loss": 4.475, "step": 12900 }, { "epoch": 0.11689835710417043, "grad_norm": 3.1711478233337402, "learning_rate": 4.4155533489799605e-05, "loss": 4.5395, "step": 12950 }, { "epoch": 0.11734970211229463, "grad_norm": 4.4645586013793945, "learning_rate": 4.413296623939339e-05, "loss": 4.5149, "step": 13000 }, { "epoch": 0.11780104712041885, "grad_norm": 4.081081867218018, "learning_rate": 4.4110398988987184e-05, "loss": 4.3781, "step": 13050 }, { "epoch": 0.11825239212854306, "grad_norm": 3.4459915161132812, "learning_rate": 4.4087831738580976e-05, "loss": 4.4447, "step": 13100 }, { "epoch": 0.11870373713666726, "grad_norm": 4.382139205932617, "learning_rate": 4.406526448817476e-05, "loss": 4.5698, "step": 13150 }, { "epoch": 0.11915508214479148, "grad_norm": 3.9767699241638184, "learning_rate": 4.4042697237768555e-05, "loss": 4.7358, "step": 13200 }, { "epoch": 0.1196064271529157, "grad_norm": 2.903264284133911, "learning_rate": 4.402012998736234e-05, "loss": 4.4043, "step": 13250 }, { "epoch": 0.1200577721610399, "grad_norm": 3.7580466270446777, "learning_rate": 4.399756273695613e-05, "loss": 4.4899, "step": 13300 }, { "epoch": 0.12050911716916411, "grad_norm": 3.086916446685791, "learning_rate": 4.397499548654992e-05, "loss": 4.494, "step": 13350 }, { "epoch": 0.12096046217728831, "grad_norm": 3.9137027263641357, "learning_rate": 4.395242823614371e-05, "loss": 4.5933, "step": 13400 }, { "epoch": 0.12141180718541253, "grad_norm": 3.615917205810547, "learning_rate": 4.39298609857375e-05, "loss": 4.4373, "step": 13450 }, { "epoch": 0.12186315219353674, "grad_norm": 2.4744229316711426, "learning_rate": 4.390729373533129e-05, "loss": 4.6075, "step": 13500 }, { "epoch": 0.12231449720166095, "grad_norm": 3.469045639038086, "learning_rate": 4.3884726484925076e-05, "loss": 4.5208, "step": 13550 }, { "epoch": 0.12276584220978516, "grad_norm": 4.882166385650635, "learning_rate": 4.386215923451887e-05, "loss": 4.4989, "step": 13600 }, { "epoch": 0.12321718721790936, "grad_norm": 4.610581398010254, "learning_rate": 4.383959198411266e-05, "loss": 4.5306, "step": 13650 }, { "epoch": 0.12366853222603358, "grad_norm": 3.6969921588897705, "learning_rate": 4.381702473370645e-05, "loss": 4.5978, "step": 13700 }, { "epoch": 0.1241198772341578, "grad_norm": 4.886890888214111, "learning_rate": 4.379445748330024e-05, "loss": 4.4131, "step": 13750 }, { "epoch": 0.124571222242282, "grad_norm": 2.121551513671875, "learning_rate": 4.3771890232894026e-05, "loss": 4.5148, "step": 13800 }, { "epoch": 0.1250225672504062, "grad_norm": 3.1213953495025635, "learning_rate": 4.374932298248782e-05, "loss": 4.6455, "step": 13850 }, { "epoch": 0.1254739122585304, "grad_norm": 3.9660770893096924, "learning_rate": 4.3726755732081604e-05, "loss": 4.4807, "step": 13900 }, { "epoch": 0.12592525726665463, "grad_norm": 2.980980396270752, "learning_rate": 4.37041884816754e-05, "loss": 4.6299, "step": 13950 }, { "epoch": 0.12637660227477884, "grad_norm": 3.5488901138305664, "learning_rate": 4.368162123126918e-05, "loss": 4.5797, "step": 14000 }, { "epoch": 0.12682794728290306, "grad_norm": 2.9502065181732178, "learning_rate": 4.3659053980862975e-05, "loss": 4.5491, "step": 14050 }, { "epoch": 0.12727929229102727, "grad_norm": 2.8409996032714844, "learning_rate": 4.363648673045677e-05, "loss": 4.5958, "step": 14100 }, { "epoch": 0.12773063729915146, "grad_norm": 5.0700907707214355, "learning_rate": 4.3613919480050554e-05, "loss": 4.6128, "step": 14150 }, { "epoch": 0.12818198230727568, "grad_norm": 3.55629301071167, "learning_rate": 4.3591352229644347e-05, "loss": 4.492, "step": 14200 }, { "epoch": 0.1286333273153999, "grad_norm": 3.631505250930786, "learning_rate": 4.356878497923813e-05, "loss": 4.5023, "step": 14250 }, { "epoch": 0.1290846723235241, "grad_norm": 3.8898086547851562, "learning_rate": 4.3546217728831925e-05, "loss": 4.5478, "step": 14300 }, { "epoch": 0.12953601733164832, "grad_norm": 3.2403228282928467, "learning_rate": 4.352365047842571e-05, "loss": 4.4829, "step": 14350 }, { "epoch": 0.1299873623397725, "grad_norm": 3.5314269065856934, "learning_rate": 4.3501083228019504e-05, "loss": 4.5186, "step": 14400 }, { "epoch": 0.13043870734789673, "grad_norm": 3.769017457962036, "learning_rate": 4.347851597761329e-05, "loss": 4.4798, "step": 14450 }, { "epoch": 0.13089005235602094, "grad_norm": 3.4731597900390625, "learning_rate": 4.345594872720708e-05, "loss": 4.6109, "step": 14500 }, { "epoch": 0.13134139736414516, "grad_norm": 4.540064811706543, "learning_rate": 4.343338147680087e-05, "loss": 4.5537, "step": 14550 }, { "epoch": 0.13179274237226937, "grad_norm": 4.51099157333374, "learning_rate": 4.341081422639466e-05, "loss": 4.4937, "step": 14600 }, { "epoch": 0.13224408738039356, "grad_norm": 5.07973051071167, "learning_rate": 4.338824697598845e-05, "loss": 4.5471, "step": 14650 }, { "epoch": 0.13269543238851778, "grad_norm": 4.9902753829956055, "learning_rate": 4.336567972558224e-05, "loss": 4.5544, "step": 14700 }, { "epoch": 0.133146777396642, "grad_norm": 5.2365031242370605, "learning_rate": 4.334311247517603e-05, "loss": 4.4559, "step": 14750 }, { "epoch": 0.1335981224047662, "grad_norm": 4.138045787811279, "learning_rate": 4.332054522476982e-05, "loss": 4.3979, "step": 14800 }, { "epoch": 0.13404946741289042, "grad_norm": 3.637258529663086, "learning_rate": 4.329797797436361e-05, "loss": 4.5282, "step": 14850 }, { "epoch": 0.13450081242101464, "grad_norm": 3.374943256378174, "learning_rate": 4.3275410723957396e-05, "loss": 4.5585, "step": 14900 }, { "epoch": 0.13495215742913882, "grad_norm": 4.198739051818848, "learning_rate": 4.325284347355119e-05, "loss": 4.4996, "step": 14950 }, { "epoch": 0.13540350243726304, "grad_norm": 3.0009047985076904, "learning_rate": 4.3230276223144975e-05, "loss": 4.5361, "step": 15000 }, { "epoch": 0.13585484744538726, "grad_norm": 3.28633975982666, "learning_rate": 4.320770897273877e-05, "loss": 4.5367, "step": 15050 }, { "epoch": 0.13630619245351147, "grad_norm": 3.2945947647094727, "learning_rate": 4.318514172233255e-05, "loss": 4.5113, "step": 15100 }, { "epoch": 0.1367575374616357, "grad_norm": 5.111336708068848, "learning_rate": 4.3162574471926346e-05, "loss": 4.5238, "step": 15150 }, { "epoch": 0.13720888246975987, "grad_norm": 2.328876256942749, "learning_rate": 4.314000722152013e-05, "loss": 4.5512, "step": 15200 }, { "epoch": 0.1376602274778841, "grad_norm": 3.703890323638916, "learning_rate": 4.311743997111392e-05, "loss": 4.5192, "step": 15250 }, { "epoch": 0.1381115724860083, "grad_norm": 2.8396573066711426, "learning_rate": 4.309487272070771e-05, "loss": 4.5641, "step": 15300 }, { "epoch": 0.13856291749413252, "grad_norm": 3.3222029209136963, "learning_rate": 4.3072305470301496e-05, "loss": 4.5332, "step": 15350 }, { "epoch": 0.13901426250225674, "grad_norm": 3.652606725692749, "learning_rate": 4.304973821989529e-05, "loss": 4.5056, "step": 15400 }, { "epoch": 0.13946560751038092, "grad_norm": 7.847742080688477, "learning_rate": 4.3027170969489075e-05, "loss": 4.4884, "step": 15450 }, { "epoch": 0.13991695251850514, "grad_norm": 3.6494662761688232, "learning_rate": 4.300460371908287e-05, "loss": 4.4938, "step": 15500 }, { "epoch": 0.14036829752662935, "grad_norm": 4.544933795928955, "learning_rate": 4.298203646867665e-05, "loss": 4.5045, "step": 15550 }, { "epoch": 0.14081964253475357, "grad_norm": 3.429764986038208, "learning_rate": 4.2959469218270446e-05, "loss": 4.5033, "step": 15600 }, { "epoch": 0.14127098754287779, "grad_norm": 3.790017604827881, "learning_rate": 4.293690196786423e-05, "loss": 4.4775, "step": 15650 }, { "epoch": 0.14172233255100197, "grad_norm": 3.4987452030181885, "learning_rate": 4.2914334717458024e-05, "loss": 4.5988, "step": 15700 }, { "epoch": 0.1421736775591262, "grad_norm": 2.5895438194274902, "learning_rate": 4.289176746705182e-05, "loss": 4.5285, "step": 15750 }, { "epoch": 0.1426250225672504, "grad_norm": 4.709017276763916, "learning_rate": 4.28692002166456e-05, "loss": 4.4137, "step": 15800 }, { "epoch": 0.14307636757537462, "grad_norm": 3.9760525226593018, "learning_rate": 4.2846632966239395e-05, "loss": 4.5278, "step": 15850 }, { "epoch": 0.14352771258349883, "grad_norm": 4.445188045501709, "learning_rate": 4.282406571583318e-05, "loss": 4.5362, "step": 15900 }, { "epoch": 0.14397905759162305, "grad_norm": 4.021897792816162, "learning_rate": 4.2801498465426974e-05, "loss": 4.458, "step": 15950 }, { "epoch": 0.14443040259974724, "grad_norm": 4.263660907745361, "learning_rate": 4.277893121502076e-05, "loss": 4.4782, "step": 16000 }, { "epoch": 0.14488174760787145, "grad_norm": 3.184115171432495, "learning_rate": 4.275636396461455e-05, "loss": 4.4877, "step": 16050 }, { "epoch": 0.14533309261599567, "grad_norm": 3.6419224739074707, "learning_rate": 4.273379671420834e-05, "loss": 4.5329, "step": 16100 }, { "epoch": 0.14578443762411988, "grad_norm": 5.209333896636963, "learning_rate": 4.271122946380213e-05, "loss": 4.5252, "step": 16150 }, { "epoch": 0.1462357826322441, "grad_norm": 2.9980499744415283, "learning_rate": 4.2688662213395924e-05, "loss": 4.4491, "step": 16200 }, { "epoch": 0.1466871276403683, "grad_norm": 2.8836166858673096, "learning_rate": 4.266609496298971e-05, "loss": 4.524, "step": 16250 }, { "epoch": 0.1471384726484925, "grad_norm": 3.24406099319458, "learning_rate": 4.26435277125835e-05, "loss": 4.5629, "step": 16300 }, { "epoch": 0.14758981765661672, "grad_norm": 3.78409743309021, "learning_rate": 4.262096046217729e-05, "loss": 4.5051, "step": 16350 }, { "epoch": 0.14804116266474093, "grad_norm": 3.738863229751587, "learning_rate": 4.259839321177108e-05, "loss": 4.3699, "step": 16400 }, { "epoch": 0.14849250767286515, "grad_norm": 3.1949925422668457, "learning_rate": 4.2575825961364867e-05, "loss": 4.4681, "step": 16450 }, { "epoch": 0.14894385268098934, "grad_norm": 3.774017810821533, "learning_rate": 4.255325871095866e-05, "loss": 4.4382, "step": 16500 }, { "epoch": 0.14939519768911355, "grad_norm": 3.903379201889038, "learning_rate": 4.2530691460552445e-05, "loss": 4.4229, "step": 16550 }, { "epoch": 0.14984654269723777, "grad_norm": 2.8182575702667236, "learning_rate": 4.250812421014624e-05, "loss": 4.4755, "step": 16600 }, { "epoch": 0.15029788770536198, "grad_norm": 3.8375935554504395, "learning_rate": 4.2485556959740024e-05, "loss": 4.5113, "step": 16650 }, { "epoch": 0.1507492327134862, "grad_norm": 3.6683831214904785, "learning_rate": 4.2462989709333816e-05, "loss": 4.5386, "step": 16700 }, { "epoch": 0.15120057772161039, "grad_norm": 4.0321431159973145, "learning_rate": 4.244042245892761e-05, "loss": 4.4977, "step": 16750 }, { "epoch": 0.1516519227297346, "grad_norm": 3.8294458389282227, "learning_rate": 4.2417855208521395e-05, "loss": 4.4926, "step": 16800 }, { "epoch": 0.15210326773785882, "grad_norm": 3.6209237575531006, "learning_rate": 4.239528795811519e-05, "loss": 4.5028, "step": 16850 }, { "epoch": 0.15255461274598303, "grad_norm": 3.8138227462768555, "learning_rate": 4.237272070770897e-05, "loss": 4.4808, "step": 16900 }, { "epoch": 0.15300595775410725, "grad_norm": 4.5005927085876465, "learning_rate": 4.2350153457302766e-05, "loss": 4.4702, "step": 16950 }, { "epoch": 0.15345730276223146, "grad_norm": 3.48544979095459, "learning_rate": 4.232758620689655e-05, "loss": 4.4993, "step": 17000 }, { "epoch": 0.15390864777035565, "grad_norm": 3.5820982456207275, "learning_rate": 4.2305018956490344e-05, "loss": 4.5032, "step": 17050 }, { "epoch": 0.15435999277847987, "grad_norm": 4.8123555183410645, "learning_rate": 4.228245170608413e-05, "loss": 4.5196, "step": 17100 }, { "epoch": 0.15481133778660408, "grad_norm": 3.8024814128875732, "learning_rate": 4.225988445567792e-05, "loss": 4.5327, "step": 17150 }, { "epoch": 0.1552626827947283, "grad_norm": 5.407778263092041, "learning_rate": 4.223731720527171e-05, "loss": 4.5355, "step": 17200 }, { "epoch": 0.1557140278028525, "grad_norm": 3.6917614936828613, "learning_rate": 4.22147499548655e-05, "loss": 4.4072, "step": 17250 }, { "epoch": 0.1561653728109767, "grad_norm": 3.9421164989471436, "learning_rate": 4.2192182704459294e-05, "loss": 4.4812, "step": 17300 }, { "epoch": 0.15661671781910091, "grad_norm": 4.172101974487305, "learning_rate": 4.216961545405308e-05, "loss": 4.4737, "step": 17350 }, { "epoch": 0.15706806282722513, "grad_norm": 3.308185577392578, "learning_rate": 4.214704820364687e-05, "loss": 4.5789, "step": 17400 }, { "epoch": 0.15751940783534935, "grad_norm": 4.956492900848389, "learning_rate": 4.212448095324066e-05, "loss": 4.565, "step": 17450 }, { "epoch": 0.15797075284347356, "grad_norm": 3.411794900894165, "learning_rate": 4.210191370283445e-05, "loss": 4.5473, "step": 17500 }, { "epoch": 0.15842209785159775, "grad_norm": 4.067993640899658, "learning_rate": 4.207934645242824e-05, "loss": 4.4836, "step": 17550 }, { "epoch": 0.15887344285972196, "grad_norm": 2.9520280361175537, "learning_rate": 4.205677920202203e-05, "loss": 4.4962, "step": 17600 }, { "epoch": 0.15932478786784618, "grad_norm": 4.387596130371094, "learning_rate": 4.2034211951615815e-05, "loss": 4.513, "step": 17650 }, { "epoch": 0.1597761328759704, "grad_norm": 3.250239849090576, "learning_rate": 4.201164470120961e-05, "loss": 4.5496, "step": 17700 }, { "epoch": 0.1602274778840946, "grad_norm": 3.867882013320923, "learning_rate": 4.1989077450803394e-05, "loss": 4.5849, "step": 17750 }, { "epoch": 0.16067882289221883, "grad_norm": 3.7500853538513184, "learning_rate": 4.196651020039719e-05, "loss": 4.4585, "step": 17800 }, { "epoch": 0.161130167900343, "grad_norm": 3.8945131301879883, "learning_rate": 4.194394294999098e-05, "loss": 4.4149, "step": 17850 }, { "epoch": 0.16158151290846723, "grad_norm": 8.667535781860352, "learning_rate": 4.1921375699584765e-05, "loss": 4.44, "step": 17900 }, { "epoch": 0.16203285791659144, "grad_norm": 4.284276485443115, "learning_rate": 4.189880844917856e-05, "loss": 4.4561, "step": 17950 }, { "epoch": 0.16248420292471566, "grad_norm": 2.9393467903137207, "learning_rate": 4.1876241198772344e-05, "loss": 4.5887, "step": 18000 }, { "epoch": 0.16293554793283987, "grad_norm": 3.012742519378662, "learning_rate": 4.1853673948366136e-05, "loss": 4.4513, "step": 18050 }, { "epoch": 0.16338689294096406, "grad_norm": 5.467082500457764, "learning_rate": 4.183110669795992e-05, "loss": 4.5611, "step": 18100 }, { "epoch": 0.16383823794908828, "grad_norm": 3.46402907371521, "learning_rate": 4.1808539447553715e-05, "loss": 4.5312, "step": 18150 }, { "epoch": 0.1642895829572125, "grad_norm": 3.8491625785827637, "learning_rate": 4.17859721971475e-05, "loss": 4.4916, "step": 18200 }, { "epoch": 0.1647409279653367, "grad_norm": 5.8692450523376465, "learning_rate": 4.176340494674129e-05, "loss": 4.4869, "step": 18250 }, { "epoch": 0.16519227297346092, "grad_norm": 3.2287988662719727, "learning_rate": 4.174083769633508e-05, "loss": 4.4431, "step": 18300 }, { "epoch": 0.1656436179815851, "grad_norm": 4.350259304046631, "learning_rate": 4.171827044592887e-05, "loss": 4.4968, "step": 18350 }, { "epoch": 0.16609496298970933, "grad_norm": 3.7243659496307373, "learning_rate": 4.1695703195522664e-05, "loss": 4.4738, "step": 18400 }, { "epoch": 0.16654630799783354, "grad_norm": 4.834224224090576, "learning_rate": 4.167313594511645e-05, "loss": 4.5754, "step": 18450 }, { "epoch": 0.16699765300595776, "grad_norm": 6.014001846313477, "learning_rate": 4.165056869471024e-05, "loss": 4.5449, "step": 18500 }, { "epoch": 0.16744899801408197, "grad_norm": 3.7950220108032227, "learning_rate": 4.162800144430403e-05, "loss": 4.4007, "step": 18550 }, { "epoch": 0.16790034302220616, "grad_norm": 4.019992828369141, "learning_rate": 4.160543419389782e-05, "loss": 4.4231, "step": 18600 }, { "epoch": 0.16835168803033038, "grad_norm": 4.363696575164795, "learning_rate": 4.158286694349161e-05, "loss": 4.4445, "step": 18650 }, { "epoch": 0.1688030330384546, "grad_norm": 4.168088912963867, "learning_rate": 4.15602996930854e-05, "loss": 4.5064, "step": 18700 }, { "epoch": 0.1692543780465788, "grad_norm": 3.3574249744415283, "learning_rate": 4.1537732442679186e-05, "loss": 4.5161, "step": 18750 }, { "epoch": 0.16970572305470302, "grad_norm": 4.255382061004639, "learning_rate": 4.151516519227298e-05, "loss": 4.4809, "step": 18800 }, { "epoch": 0.17015706806282724, "grad_norm": 3.896949291229248, "learning_rate": 4.1492597941866764e-05, "loss": 4.488, "step": 18850 }, { "epoch": 0.17060841307095143, "grad_norm": 4.572742938995361, "learning_rate": 4.147003069146056e-05, "loss": 4.5692, "step": 18900 }, { "epoch": 0.17105975807907564, "grad_norm": 4.25124454498291, "learning_rate": 4.144746344105434e-05, "loss": 4.5341, "step": 18950 }, { "epoch": 0.17151110308719986, "grad_norm": 3.2986035346984863, "learning_rate": 4.1424896190648136e-05, "loss": 4.4489, "step": 19000 }, { "epoch": 0.17196244809532407, "grad_norm": 3.633592367172241, "learning_rate": 4.140232894024192e-05, "loss": 4.5124, "step": 19050 }, { "epoch": 0.1724137931034483, "grad_norm": 3.3687500953674316, "learning_rate": 4.1379761689835714e-05, "loss": 4.5165, "step": 19100 }, { "epoch": 0.17286513811157247, "grad_norm": 4.958398342132568, "learning_rate": 4.13571944394295e-05, "loss": 4.5043, "step": 19150 }, { "epoch": 0.1733164831196967, "grad_norm": 4.127295017242432, "learning_rate": 4.133462718902329e-05, "loss": 4.4783, "step": 19200 }, { "epoch": 0.1737678281278209, "grad_norm": 3.3556175231933594, "learning_rate": 4.131205993861708e-05, "loss": 4.4185, "step": 19250 }, { "epoch": 0.17421917313594512, "grad_norm": 4.382410049438477, "learning_rate": 4.128949268821087e-05, "loss": 4.5009, "step": 19300 }, { "epoch": 0.17467051814406934, "grad_norm": 3.7760777473449707, "learning_rate": 4.126692543780466e-05, "loss": 4.3572, "step": 19350 }, { "epoch": 0.17512186315219352, "grad_norm": 4.594768524169922, "learning_rate": 4.124435818739845e-05, "loss": 4.4793, "step": 19400 }, { "epoch": 0.17557320816031774, "grad_norm": 4.605646133422852, "learning_rate": 4.1221790936992235e-05, "loss": 4.4462, "step": 19450 }, { "epoch": 0.17602455316844196, "grad_norm": 3.358002185821533, "learning_rate": 4.119922368658603e-05, "loss": 4.4986, "step": 19500 }, { "epoch": 0.17647589817656617, "grad_norm": 3.7644100189208984, "learning_rate": 4.1176656436179814e-05, "loss": 4.5314, "step": 19550 }, { "epoch": 0.17692724318469039, "grad_norm": 4.109899044036865, "learning_rate": 4.115408918577361e-05, "loss": 4.5382, "step": 19600 }, { "epoch": 0.1773785881928146, "grad_norm": 8.232100486755371, "learning_rate": 4.113152193536739e-05, "loss": 4.5095, "step": 19650 }, { "epoch": 0.1778299332009388, "grad_norm": 3.442411422729492, "learning_rate": 4.1108954684961185e-05, "loss": 4.5861, "step": 19700 }, { "epoch": 0.178281278209063, "grad_norm": 2.404611825942993, "learning_rate": 4.108638743455497e-05, "loss": 4.4563, "step": 19750 }, { "epoch": 0.17873262321718722, "grad_norm": 3.3895816802978516, "learning_rate": 4.1063820184148764e-05, "loss": 4.4434, "step": 19800 }, { "epoch": 0.17918396822531144, "grad_norm": 2.9194042682647705, "learning_rate": 4.104125293374255e-05, "loss": 4.5463, "step": 19850 }, { "epoch": 0.17963531323343565, "grad_norm": 2.6337718963623047, "learning_rate": 4.101868568333634e-05, "loss": 4.4246, "step": 19900 }, { "epoch": 0.18008665824155984, "grad_norm": 5.921742916107178, "learning_rate": 4.0996118432930135e-05, "loss": 4.3955, "step": 19950 }, { "epoch": 0.18053800324968405, "grad_norm": 3.9008045196533203, "learning_rate": 4.097355118252392e-05, "loss": 4.6028, "step": 20000 } ], "logging_steps": 50, "max_steps": 110780, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.4772471541951488e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }