{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 5000,
"global_step": 542931,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00276278201097377,
"grad_norm": 2.595154047012329,
"learning_rate": 4.53088739087192e-07,
"loss": 3.7252,
"step": 500
},
{
"epoch": 0.00552556402194754,
"grad_norm": 3.3661036491394043,
"learning_rate": 9.135447747449074e-07,
"loss": 3.7548,
"step": 1000
},
{
"epoch": 0.00828834603292131,
"grad_norm": 3.4492340087890625,
"learning_rate": 1.3730798983313074e-06,
"loss": 3.7204,
"step": 1500
},
{
"epoch": 0.01105112804389508,
"grad_norm": 5.218665599822998,
"learning_rate": 1.8335359339890227e-06,
"loss": 3.7032,
"step": 2000
},
{
"epoch": 0.013813910054868851,
"grad_norm": 5.617623805999756,
"learning_rate": 2.293991969646738e-06,
"loss": 3.6474,
"step": 2500
},
{
"epoch": 0.01657669206584262,
"grad_norm": 5.512911319732666,
"learning_rate": 2.7544480053044535e-06,
"loss": 3.5624,
"step": 3000
},
{
"epoch": 0.019339474076816392,
"grad_norm": 5.604681015014648,
"learning_rate": 3.2139831288908536e-06,
"loss": 3.4398,
"step": 3500
},
{
"epoch": 0.02210225608779016,
"grad_norm": 6.751086235046387,
"learning_rate": 3.6744391645485687e-06,
"loss": 3.3622,
"step": 4000
},
{
"epoch": 0.02486503809876393,
"grad_norm": 8.164905548095703,
"learning_rate": 4.134895200206284e-06,
"loss": 3.1909,
"step": 4500
},
{
"epoch": 0.027627820109737702,
"grad_norm": 7.667040824890137,
"learning_rate": 4.595351235863999e-06,
"loss": 3.2288,
"step": 5000
},
{
"epoch": 0.027627820109737702,
"eval_runtime": 1422.3033,
"eval_samples_per_second": 254.484,
"eval_steps_per_second": 31.811,
"step": 5000
},
{
"epoch": 0.03039060212071147,
"grad_norm": 7.5037031173706055,
"learning_rate": 5.055807271521715e-06,
"loss": 3.1382,
"step": 5500
},
{
"epoch": 0.03315338413168524,
"grad_norm": 7.567819595336914,
"learning_rate": 5.51626330717943e-06,
"loss": 3.1615,
"step": 6000
},
{
"epoch": 0.03591616614265901,
"grad_norm": 9.968245506286621,
"learning_rate": 5.976719342837146e-06,
"loss": 3.102,
"step": 6500
},
{
"epoch": 0.038678948153632785,
"grad_norm": 9.482606887817383,
"learning_rate": 6.4371753784948614e-06,
"loss": 3.1102,
"step": 7000
},
{
"epoch": 0.041441730164606554,
"grad_norm": 7.074209213256836,
"learning_rate": 6.8967105020812615e-06,
"loss": 3.0017,
"step": 7500
},
{
"epoch": 0.04420451217558032,
"grad_norm": 8.135669708251953,
"learning_rate": 7.356245625667662e-06,
"loss": 3.0574,
"step": 8000
},
{
"epoch": 0.04696729418655409,
"grad_norm": 13.200604438781738,
"learning_rate": 7.816701661325378e-06,
"loss": 3.0581,
"step": 8500
},
{
"epoch": 0.04973007619752786,
"grad_norm": 8.470757484436035,
"learning_rate": 8.277157696983094e-06,
"loss": 2.9737,
"step": 9000
},
{
"epoch": 0.052492858208501636,
"grad_norm": 8.881372451782227,
"learning_rate": 8.737613732640809e-06,
"loss": 3.0145,
"step": 9500
},
{
"epoch": 0.055255640219475405,
"grad_norm": 7.54667854309082,
"learning_rate": 9.198069768298522e-06,
"loss": 3.0479,
"step": 10000
},
{
"epoch": 0.055255640219475405,
"eval_runtime": 1428.4865,
"eval_samples_per_second": 253.382,
"eval_steps_per_second": 31.673,
"step": 10000
},
{
"epoch": 0.058018422230449174,
"grad_norm": 9.400116920471191,
"learning_rate": 9.658525803956239e-06,
"loss": 2.9569,
"step": 10500
},
{
"epoch": 0.06078120424142294,
"grad_norm": 9.827701568603516,
"learning_rate": 1.0118981839613954e-05,
"loss": 2.9763,
"step": 11000
},
{
"epoch": 0.06354398625239671,
"grad_norm": 9.987720489501953,
"learning_rate": 1.0579437875271669e-05,
"loss": 2.9257,
"step": 11500
},
{
"epoch": 0.06630676826337048,
"grad_norm": 13.571234703063965,
"learning_rate": 1.1039893910929384e-05,
"loss": 2.9896,
"step": 12000
},
{
"epoch": 0.06906955027434425,
"grad_norm": 11.866579055786133,
"learning_rate": 1.1499429034515786e-05,
"loss": 2.978,
"step": 12500
},
{
"epoch": 0.07183233228531802,
"grad_norm": 15.338528633117676,
"learning_rate": 1.19598850701735e-05,
"loss": 2.9433,
"step": 13000
},
{
"epoch": 0.0745951142962918,
"grad_norm": 8.874979019165039,
"learning_rate": 1.2420341105831214e-05,
"loss": 2.9817,
"step": 13500
},
{
"epoch": 0.07735789630726557,
"grad_norm": 8.997846603393555,
"learning_rate": 1.2880797141488931e-05,
"loss": 2.8916,
"step": 14000
},
{
"epoch": 0.08012067831823934,
"grad_norm": 11.824705123901367,
"learning_rate": 1.3340332265075331e-05,
"loss": 2.9126,
"step": 14500
},
{
"epoch": 0.08288346032921311,
"grad_norm": 11.844013214111328,
"learning_rate": 1.3800788300733048e-05,
"loss": 2.9078,
"step": 15000
},
{
"epoch": 0.08288346032921311,
"eval_runtime": 1417.0351,
"eval_samples_per_second": 255.43,
"eval_steps_per_second": 31.929,
"step": 15000
},
{
"epoch": 0.08564624234018688,
"grad_norm": 9.543901443481445,
"learning_rate": 1.4261244336390762e-05,
"loss": 2.9336,
"step": 15500
},
{
"epoch": 0.08840902435116064,
"grad_norm": 10.792810440063477,
"learning_rate": 1.4721700372048478e-05,
"loss": 2.9225,
"step": 16000
},
{
"epoch": 0.09117180636213441,
"grad_norm": 9.682255744934082,
"learning_rate": 1.5181235495634877e-05,
"loss": 2.9388,
"step": 16500
},
{
"epoch": 0.09393458837310818,
"grad_norm": 9.591748237609863,
"learning_rate": 1.5641691531292592e-05,
"loss": 2.9116,
"step": 17000
},
{
"epoch": 0.09669737038408195,
"grad_norm": 15.961346626281738,
"learning_rate": 1.610214756695031e-05,
"loss": 2.8788,
"step": 17500
},
{
"epoch": 0.09946015239505572,
"grad_norm": 12.563713073730469,
"learning_rate": 1.6562603602608022e-05,
"loss": 2.868,
"step": 18000
},
{
"epoch": 0.10222293440602949,
"grad_norm": 9.912450790405273,
"learning_rate": 1.702305963826574e-05,
"loss": 2.8436,
"step": 18500
},
{
"epoch": 0.10498571641700327,
"grad_norm": 10.168437957763672,
"learning_rate": 1.748259476185214e-05,
"loss": 2.893,
"step": 19000
},
{
"epoch": 0.10774849842797704,
"grad_norm": 9.45627212524414,
"learning_rate": 1.7943050797509856e-05,
"loss": 2.9454,
"step": 19500
},
{
"epoch": 0.11051128043895081,
"grad_norm": 11.309224128723145,
"learning_rate": 1.840350683316757e-05,
"loss": 2.898,
"step": 20000
},
{
"epoch": 0.11051128043895081,
"eval_runtime": 1418.7718,
"eval_samples_per_second": 255.117,
"eval_steps_per_second": 31.89,
"step": 20000
},
{
"epoch": 0.11327406244992458,
"grad_norm": 9.24001407623291,
"learning_rate": 1.8863962868825286e-05,
"loss": 2.8393,
"step": 20500
},
{
"epoch": 0.11603684446089835,
"grad_norm": 11.295225143432617,
"learning_rate": 1.9324418904483e-05,
"loss": 2.8147,
"step": 21000
},
{
"epoch": 0.11879962647187212,
"grad_norm": 16.763473510742188,
"learning_rate": 1.97839540280694e-05,
"loss": 2.8566,
"step": 21500
},
{
"epoch": 0.12156240848284589,
"grad_norm": 11.213971138000488,
"learning_rate": 2.02434891516558e-05,
"loss": 2.8748,
"step": 22000
},
{
"epoch": 0.12432519049381965,
"grad_norm": 8.82392406463623,
"learning_rate": 2.0703945187313516e-05,
"loss": 2.8374,
"step": 22500
},
{
"epoch": 0.12708797250479342,
"grad_norm": 16.63780403137207,
"learning_rate": 2.116440122297123e-05,
"loss": 2.815,
"step": 23000
},
{
"epoch": 0.1298507545157672,
"grad_norm": 8.42003345489502,
"learning_rate": 2.1624857258628947e-05,
"loss": 2.8534,
"step": 23500
},
{
"epoch": 0.13261353652674096,
"grad_norm": 18.058134078979492,
"learning_rate": 2.2085313294286665e-05,
"loss": 2.8309,
"step": 24000
},
{
"epoch": 0.13537631853771473,
"grad_norm": 10.62288761138916,
"learning_rate": 2.2545769329944377e-05,
"loss": 2.7925,
"step": 24500
},
{
"epoch": 0.1381391005486885,
"grad_norm": 10.44122314453125,
"learning_rate": 2.3006225365602095e-05,
"loss": 2.8439,
"step": 25000
},
{
"epoch": 0.1381391005486885,
"eval_runtime": 1423.542,
"eval_samples_per_second": 254.262,
"eval_steps_per_second": 31.783,
"step": 25000
},
{
"epoch": 0.14090188255966227,
"grad_norm": 12.520238876342773,
"learning_rate": 2.346668140125981e-05,
"loss": 2.7922,
"step": 25500
},
{
"epoch": 0.14366466457063604,
"grad_norm": 9.882099151611328,
"learning_rate": 2.3927137436917525e-05,
"loss": 2.8186,
"step": 26000
},
{
"epoch": 0.1464274465816098,
"grad_norm": 9.38256549835205,
"learning_rate": 2.4386672560503922e-05,
"loss": 2.7843,
"step": 26500
},
{
"epoch": 0.1491902285925836,
"grad_norm": 9.543642044067383,
"learning_rate": 2.4846207684090326e-05,
"loss": 2.8285,
"step": 27000
},
{
"epoch": 0.15195301060355737,
"grad_norm": 11.416987419128418,
"learning_rate": 2.530666371974804e-05,
"loss": 2.8062,
"step": 27500
},
{
"epoch": 0.15471579261453114,
"grad_norm": 11.19566535949707,
"learning_rate": 2.5767119755405756e-05,
"loss": 2.8096,
"step": 28000
},
{
"epoch": 0.1574785746255049,
"grad_norm": 11.436910629272461,
"learning_rate": 2.6227575791063468e-05,
"loss": 2.7607,
"step": 28500
},
{
"epoch": 0.16024135663647868,
"grad_norm": 11.108380317687988,
"learning_rate": 2.668803182672119e-05,
"loss": 2.8115,
"step": 29000
},
{
"epoch": 0.16300413864745245,
"grad_norm": 8.149287223815918,
"learning_rate": 2.7147566950307586e-05,
"loss": 2.8093,
"step": 29500
},
{
"epoch": 0.16576692065842621,
"grad_norm": 9.921182632446289,
"learning_rate": 2.7608022985965305e-05,
"loss": 2.7958,
"step": 30000
},
{
"epoch": 0.16576692065842621,
"eval_runtime": 1420.6642,
"eval_samples_per_second": 254.777,
"eval_steps_per_second": 31.848,
"step": 30000
},
{
"epoch": 0.16852970266939998,
"grad_norm": 9.527510643005371,
"learning_rate": 2.8068479021623016e-05,
"loss": 2.8025,
"step": 30500
},
{
"epoch": 0.17129248468037375,
"grad_norm": 8.283650398254395,
"learning_rate": 2.852893505728073e-05,
"loss": 2.7871,
"step": 31000
},
{
"epoch": 0.17405526669134752,
"grad_norm": 10.247408866882324,
"learning_rate": 2.8989391092938447e-05,
"loss": 2.8239,
"step": 31500
},
{
"epoch": 0.1768180487023213,
"grad_norm": 9.047866821289062,
"learning_rate": 2.9449847128596165e-05,
"loss": 2.8061,
"step": 32000
},
{
"epoch": 0.17958083071329506,
"grad_norm": 12.305429458618164,
"learning_rate": 2.9910303164253877e-05,
"loss": 2.7637,
"step": 32500
},
{
"epoch": 0.18234361272426883,
"grad_norm": 8.979884147644043,
"learning_rate": 3.0370759199911592e-05,
"loss": 2.8169,
"step": 33000
},
{
"epoch": 0.1851063947352426,
"grad_norm": 10.300788879394531,
"learning_rate": 3.0830294323497995e-05,
"loss": 2.7801,
"step": 33500
},
{
"epoch": 0.18786917674621637,
"grad_norm": 11.265089988708496,
"learning_rate": 3.129075035915571e-05,
"loss": 2.7739,
"step": 34000
},
{
"epoch": 0.19063195875719013,
"grad_norm": 7.74532413482666,
"learning_rate": 3.1751206394813426e-05,
"loss": 2.8098,
"step": 34500
},
{
"epoch": 0.1933947407681639,
"grad_norm": 7.382881164550781,
"learning_rate": 3.221166243047114e-05,
"loss": 2.7901,
"step": 35000
},
{
"epoch": 0.1933947407681639,
"eval_runtime": 1425.7104,
"eval_samples_per_second": 253.876,
"eval_steps_per_second": 31.735,
"step": 35000
},
{
"epoch": 0.19615752277913767,
"grad_norm": 11.83242416381836,
"learning_rate": 3.2672118466128856e-05,
"loss": 2.7753,
"step": 35500
},
{
"epoch": 0.19892030479011144,
"grad_norm": 7.594019412994385,
"learning_rate": 3.313165358971525e-05,
"loss": 2.7747,
"step": 36000
},
{
"epoch": 0.2016830868010852,
"grad_norm": 7.169086933135986,
"learning_rate": 3.359210962537297e-05,
"loss": 2.7942,
"step": 36500
},
{
"epoch": 0.20444586881205898,
"grad_norm": 9.127911567687988,
"learning_rate": 3.405256566103069e-05,
"loss": 2.8336,
"step": 37000
},
{
"epoch": 0.20720865082303275,
"grad_norm": 9.519842147827148,
"learning_rate": 3.4512100784617086e-05,
"loss": 2.8049,
"step": 37500
},
{
"epoch": 0.20997143283400654,
"grad_norm": 7.047683238983154,
"learning_rate": 3.49725568202748e-05,
"loss": 2.7987,
"step": 38000
},
{
"epoch": 0.2127342148449803,
"grad_norm": 8.880785942077637,
"learning_rate": 3.5433012855932516e-05,
"loss": 2.7919,
"step": 38500
},
{
"epoch": 0.21549699685595408,
"grad_norm": 9.917975425720215,
"learning_rate": 3.589346889159023e-05,
"loss": 2.8002,
"step": 39000
},
{
"epoch": 0.21825977886692785,
"grad_norm": 8.076781272888184,
"learning_rate": 3.6353924927247947e-05,
"loss": 2.7868,
"step": 39500
},
{
"epoch": 0.22102256087790162,
"grad_norm": 8.734768867492676,
"learning_rate": 3.681438096290566e-05,
"loss": 2.7937,
"step": 40000
},
{
"epoch": 0.22102256087790162,
"eval_runtime": 1421.6361,
"eval_samples_per_second": 254.603,
"eval_steps_per_second": 31.826,
"step": 40000
},
{
"epoch": 0.2237853428888754,
"grad_norm": 10.838555335998535,
"learning_rate": 3.727483699856338e-05,
"loss": 2.8301,
"step": 40500
},
{
"epoch": 0.22654812489984916,
"grad_norm": 10.323356628417969,
"learning_rate": 3.773529303422109e-05,
"loss": 2.7691,
"step": 41000
},
{
"epoch": 0.22931090691082293,
"grad_norm": 10.885273933410645,
"learning_rate": 3.8195749069878814e-05,
"loss": 2.8109,
"step": 41500
},
{
"epoch": 0.2320736889217967,
"grad_norm": 7.563793659210205,
"learning_rate": 3.865620510553652e-05,
"loss": 2.7768,
"step": 42000
},
{
"epoch": 0.23483647093277046,
"grad_norm": 18.850767135620117,
"learning_rate": 3.911666114119424e-05,
"loss": 2.8198,
"step": 42500
},
{
"epoch": 0.23759925294374423,
"grad_norm": 6.142578601837158,
"learning_rate": 3.957619626478064e-05,
"loss": 2.7862,
"step": 43000
},
{
"epoch": 0.240362034954718,
"grad_norm": 5.065933704376221,
"learning_rate": 4.0036652300438356e-05,
"loss": 2.7652,
"step": 43500
},
{
"epoch": 0.24312481696569177,
"grad_norm": 7.932371139526367,
"learning_rate": 4.049710833609607e-05,
"loss": 2.7517,
"step": 44000
},
{
"epoch": 0.24588759897666554,
"grad_norm": 11.972217559814453,
"learning_rate": 4.0957564371753786e-05,
"loss": 2.8162,
"step": 44500
},
{
"epoch": 0.2486503809876393,
"grad_norm": 8.804488182067871,
"learning_rate": 4.141709949534019e-05,
"loss": 2.7491,
"step": 45000
},
{
"epoch": 0.2486503809876393,
"eval_runtime": 1414.8499,
"eval_samples_per_second": 255.824,
"eval_steps_per_second": 31.979,
"step": 45000
},
{
"epoch": 0.2514131629986131,
"grad_norm": 7.548344135284424,
"learning_rate": 4.1877555530997905e-05,
"loss": 2.8001,
"step": 45500
},
{
"epoch": 0.25417594500958685,
"grad_norm": 7.751644134521484,
"learning_rate": 4.233801156665562e-05,
"loss": 2.7754,
"step": 46000
},
{
"epoch": 0.25693872702056064,
"grad_norm": 7.323184013366699,
"learning_rate": 4.2798467602313335e-05,
"loss": 2.8117,
"step": 46500
},
{
"epoch": 0.2597015090315344,
"grad_norm": 10.177978515625,
"learning_rate": 4.325892363797105e-05,
"loss": 2.7637,
"step": 47000
},
{
"epoch": 0.2624642910425082,
"grad_norm": 9.300724029541016,
"learning_rate": 4.3719379673628765e-05,
"loss": 2.7798,
"step": 47500
},
{
"epoch": 0.2652270730534819,
"grad_norm": 7.969640731811523,
"learning_rate": 4.417983570928648e-05,
"loss": 2.7597,
"step": 48000
},
{
"epoch": 0.2679898550644557,
"grad_norm": 11.944114685058594,
"learning_rate": 4.463937083287288e-05,
"loss": 2.7944,
"step": 48500
},
{
"epoch": 0.27075263707542946,
"grad_norm": 8.067237854003906,
"learning_rate": 4.509982686853059e-05,
"loss": 2.797,
"step": 49000
},
{
"epoch": 0.27351541908640326,
"grad_norm": 8.342887878417969,
"learning_rate": 4.5560282904188314e-05,
"loss": 2.7753,
"step": 49500
},
{
"epoch": 0.276278201097377,
"grad_norm": 7.031680107116699,
"learning_rate": 4.602073893984602e-05,
"loss": 2.7716,
"step": 50000
},
{
"epoch": 0.276278201097377,
"eval_runtime": 1419.8091,
"eval_samples_per_second": 254.931,
"eval_steps_per_second": 31.867,
"step": 50000
},
{
"epoch": 0.2790409831083508,
"grad_norm": 6.716032028198242,
"learning_rate": 4.648119497550374e-05,
"loss": 2.762,
"step": 50500
},
{
"epoch": 0.28180376511932453,
"grad_norm": 6.020242214202881,
"learning_rate": 4.694073009909014e-05,
"loss": 2.7485,
"step": 51000
},
{
"epoch": 0.28456654713029833,
"grad_norm": 6.298365592956543,
"learning_rate": 4.7401186134747856e-05,
"loss": 2.733,
"step": 51500
},
{
"epoch": 0.28732932914127207,
"grad_norm": 11.52296257019043,
"learning_rate": 4.786164217040557e-05,
"loss": 2.7207,
"step": 52000
},
{
"epoch": 0.29009211115224587,
"grad_norm": 7.5664143562316895,
"learning_rate": 4.8322098206063286e-05,
"loss": 2.7784,
"step": 52500
},
{
"epoch": 0.2928548931632196,
"grad_norm": 7.839147567749023,
"learning_rate": 4.8782554241721e-05,
"loss": 2.75,
"step": 53000
},
{
"epoch": 0.2956176751741934,
"grad_norm": 7.68875789642334,
"learning_rate": 4.9242089365307405e-05,
"loss": 2.8226,
"step": 53500
},
{
"epoch": 0.2983804571851672,
"grad_norm": 9.220076560974121,
"learning_rate": 4.970254540096512e-05,
"loss": 2.7392,
"step": 54000
},
{
"epoch": 0.30114323919614094,
"grad_norm": 6.299682140350342,
"learning_rate": 4.998188839568023e-05,
"loss": 2.7663,
"step": 54500
},
{
"epoch": 0.30390602120711474,
"grad_norm": 7.302303791046143,
"learning_rate": 4.993072567161308e-05,
"loss": 2.7241,
"step": 55000
},
{
"epoch": 0.30390602120711474,
"eval_runtime": 1421.4385,
"eval_samples_per_second": 254.639,
"eval_steps_per_second": 31.83,
"step": 55000
},
{
"epoch": 0.3066688032180885,
"grad_norm": 7.086694240570068,
"learning_rate": 4.9879562947545934e-05,
"loss": 2.7075,
"step": 55500
},
{
"epoch": 0.3094315852290623,
"grad_norm": 8.3382568359375,
"learning_rate": 4.9828502548926916e-05,
"loss": 2.7592,
"step": 56000
},
{
"epoch": 0.312194367240036,
"grad_norm": 10.291159629821777,
"learning_rate": 4.977733982485976e-05,
"loss": 2.7674,
"step": 56500
},
{
"epoch": 0.3149571492510098,
"grad_norm": 10.337152481079102,
"learning_rate": 4.972617710079261e-05,
"loss": 2.75,
"step": 57000
},
{
"epoch": 0.31771993126198356,
"grad_norm": 5.782974720001221,
"learning_rate": 4.9675014376725465e-05,
"loss": 2.7324,
"step": 57500
},
{
"epoch": 0.32048271327295735,
"grad_norm": 6.272622108459473,
"learning_rate": 4.962385165265832e-05,
"loss": 2.746,
"step": 58000
},
{
"epoch": 0.3232454952839311,
"grad_norm": 6.653768539428711,
"learning_rate": 4.957268892859116e-05,
"loss": 2.7742,
"step": 58500
},
{
"epoch": 0.3260082772949049,
"grad_norm": 6.433887481689453,
"learning_rate": 4.9521526204524014e-05,
"loss": 2.7635,
"step": 59000
},
{
"epoch": 0.32877105930587863,
"grad_norm": 6.354071617126465,
"learning_rate": 4.9470363480456866e-05,
"loss": 2.7317,
"step": 59500
},
{
"epoch": 0.33153384131685243,
"grad_norm": 7.874678611755371,
"learning_rate": 4.941920075638972e-05,
"loss": 2.7431,
"step": 60000
},
{
"epoch": 0.33153384131685243,
"eval_runtime": 1414.971,
"eval_samples_per_second": 255.802,
"eval_steps_per_second": 31.976,
"step": 60000
},
{
"epoch": 0.33429662332782617,
"grad_norm": 9.324529647827148,
"learning_rate": 4.93681403577707e-05,
"loss": 2.7606,
"step": 60500
},
{
"epoch": 0.33705940533879997,
"grad_norm": 6.370975017547607,
"learning_rate": 4.9316977633703545e-05,
"loss": 2.7578,
"step": 61000
},
{
"epoch": 0.3398221873497737,
"grad_norm": 6.50999116897583,
"learning_rate": 4.92658149096364e-05,
"loss": 2.7627,
"step": 61500
},
{
"epoch": 0.3425849693607475,
"grad_norm": 6.26449728012085,
"learning_rate": 4.921465218556925e-05,
"loss": 2.7515,
"step": 62000
},
{
"epoch": 0.34534775137172125,
"grad_norm": 5.123514175415039,
"learning_rate": 4.916359178695024e-05,
"loss": 2.7553,
"step": 62500
},
{
"epoch": 0.34811053338269504,
"grad_norm": 7.093264102935791,
"learning_rate": 4.911242906288308e-05,
"loss": 2.7338,
"step": 63000
},
{
"epoch": 0.3508733153936688,
"grad_norm": 5.520063400268555,
"learning_rate": 4.906126633881593e-05,
"loss": 2.7567,
"step": 63500
},
{
"epoch": 0.3536360974046426,
"grad_norm": 6.911723613739014,
"learning_rate": 4.901010361474878e-05,
"loss": 2.7687,
"step": 64000
},
{
"epoch": 0.3563988794156164,
"grad_norm": 7.70906400680542,
"learning_rate": 4.895894089068163e-05,
"loss": 2.7228,
"step": 64500
},
{
"epoch": 0.3591616614265901,
"grad_norm": 6.372740745544434,
"learning_rate": 4.890788049206262e-05,
"loss": 2.733,
"step": 65000
},
{
"epoch": 0.3591616614265901,
"eval_runtime": 1420.2395,
"eval_samples_per_second": 254.854,
"eval_steps_per_second": 31.857,
"step": 65000
},
{
"epoch": 0.3619244434375639,
"grad_norm": 7.683312892913818,
"learning_rate": 4.885671776799547e-05,
"loss": 2.7507,
"step": 65500
},
{
"epoch": 0.36468722544853766,
"grad_norm": 6.729420185089111,
"learning_rate": 4.880555504392832e-05,
"loss": 2.7595,
"step": 66000
},
{
"epoch": 0.36745000745951145,
"grad_norm": 6.871359825134277,
"learning_rate": 4.875439231986117e-05,
"loss": 2.7179,
"step": 66500
},
{
"epoch": 0.3702127894704852,
"grad_norm": 6.755906581878662,
"learning_rate": 4.870343424669029e-05,
"loss": 2.7454,
"step": 67000
},
{
"epoch": 0.372975571481459,
"grad_norm": 5.853033065795898,
"learning_rate": 4.865227152262313e-05,
"loss": 2.7138,
"step": 67500
},
{
"epoch": 0.37573835349243273,
"grad_norm": 7.575068950653076,
"learning_rate": 4.8601108798555984e-05,
"loss": 2.7798,
"step": 68000
},
{
"epoch": 0.3785011355034065,
"grad_norm": 8.923949241638184,
"learning_rate": 4.8549946074488836e-05,
"loss": 2.7867,
"step": 68500
},
{
"epoch": 0.38126391751438027,
"grad_norm": 6.704534530639648,
"learning_rate": 4.849878335042168e-05,
"loss": 2.6885,
"step": 69000
},
{
"epoch": 0.38402669952535406,
"grad_norm": 6.3064117431640625,
"learning_rate": 4.8447620626354533e-05,
"loss": 2.75,
"step": 69500
},
{
"epoch": 0.3867894815363278,
"grad_norm": 5.934976100921631,
"learning_rate": 4.8396457902287385e-05,
"loss": 2.7623,
"step": 70000
},
{
"epoch": 0.3867894815363278,
"eval_runtime": 1418.267,
"eval_samples_per_second": 255.208,
"eval_steps_per_second": 31.902,
"step": 70000
},
{
"epoch": 0.3895522635473016,
"grad_norm": 8.18109130859375,
"learning_rate": 4.834529517822024e-05,
"loss": 2.7414,
"step": 70500
},
{
"epoch": 0.39231504555827534,
"grad_norm": 5.952932834625244,
"learning_rate": 4.829413245415308e-05,
"loss": 2.7224,
"step": 71000
},
{
"epoch": 0.39507782756924914,
"grad_norm": 6.907143592834473,
"learning_rate": 4.824307205553407e-05,
"loss": 2.7661,
"step": 71500
},
{
"epoch": 0.3978406095802229,
"grad_norm": 6.694629192352295,
"learning_rate": 4.8191909331466916e-05,
"loss": 2.7454,
"step": 72000
},
{
"epoch": 0.4006033915911967,
"grad_norm": 6.9917192459106445,
"learning_rate": 4.814074660739977e-05,
"loss": 2.7164,
"step": 72500
},
{
"epoch": 0.4033661736021704,
"grad_norm": 7.304172992706299,
"learning_rate": 4.808958388333262e-05,
"loss": 2.7476,
"step": 73000
},
{
"epoch": 0.4061289556131442,
"grad_norm": 9.685128211975098,
"learning_rate": 4.8038421159265465e-05,
"loss": 2.7332,
"step": 73500
},
{
"epoch": 0.40889173762411796,
"grad_norm": 5.05424165725708,
"learning_rate": 4.798725843519832e-05,
"loss": 2.7229,
"step": 74000
},
{
"epoch": 0.41165451963509175,
"grad_norm": 6.11020040512085,
"learning_rate": 4.7936198036579306e-05,
"loss": 2.75,
"step": 74500
},
{
"epoch": 0.4144173016460655,
"grad_norm": 5.443029403686523,
"learning_rate": 4.788503531251216e-05,
"loss": 2.7217,
"step": 75000
},
{
"epoch": 0.4144173016460655,
"eval_runtime": 1418.9418,
"eval_samples_per_second": 255.087,
"eval_steps_per_second": 31.886,
"step": 75000
},
{
"epoch": 0.4171800836570393,
"grad_norm": 6.66157865524292,
"learning_rate": 4.7833872588445e-05,
"loss": 2.6993,
"step": 75500
},
{
"epoch": 0.4199428656680131,
"grad_norm": 6.773935317993164,
"learning_rate": 4.7782709864377855e-05,
"loss": 2.7365,
"step": 76000
},
{
"epoch": 0.42270564767898683,
"grad_norm": 5.710464000701904,
"learning_rate": 4.77315471403107e-05,
"loss": 2.7073,
"step": 76500
},
{
"epoch": 0.4254684296899606,
"grad_norm": 5.999380111694336,
"learning_rate": 4.768048674169169e-05,
"loss": 2.7089,
"step": 77000
},
{
"epoch": 0.42823121170093437,
"grad_norm": 12.620460510253906,
"learning_rate": 4.762932401762454e-05,
"loss": 2.7287,
"step": 77500
},
{
"epoch": 0.43099399371190816,
"grad_norm": 5.68431282043457,
"learning_rate": 4.7578161293557385e-05,
"loss": 2.7433,
"step": 78000
},
{
"epoch": 0.4337567757228819,
"grad_norm": 4.173344135284424,
"learning_rate": 4.752699856949024e-05,
"loss": 2.7507,
"step": 78500
},
{
"epoch": 0.4365195577338557,
"grad_norm": 7.130237579345703,
"learning_rate": 4.747583584542309e-05,
"loss": 2.6849,
"step": 79000
},
{
"epoch": 0.43928233974482944,
"grad_norm": 7.622902870178223,
"learning_rate": 4.742467312135594e-05,
"loss": 2.7398,
"step": 79500
},
{
"epoch": 0.44204512175580324,
"grad_norm": 6.098598003387451,
"learning_rate": 4.7373510397288787e-05,
"loss": 2.7279,
"step": 80000
},
{
"epoch": 0.44204512175580324,
"eval_runtime": 1421.1349,
"eval_samples_per_second": 254.693,
"eval_steps_per_second": 31.837,
"step": 80000
},
{
"epoch": 0.444807903766777,
"grad_norm": 5.454360485076904,
"learning_rate": 4.732234767322164e-05,
"loss": 2.7256,
"step": 80500
},
{
"epoch": 0.4475706857777508,
"grad_norm": 9.29869556427002,
"learning_rate": 4.727128727460262e-05,
"loss": 2.7272,
"step": 81000
},
{
"epoch": 0.4503334677887245,
"grad_norm": 10.766260147094727,
"learning_rate": 4.722012455053547e-05,
"loss": 2.7266,
"step": 81500
},
{
"epoch": 0.4530962497996983,
"grad_norm": 5.04358434677124,
"learning_rate": 4.7168961826468324e-05,
"loss": 2.6443,
"step": 82000
},
{
"epoch": 0.45585903181067206,
"grad_norm": 6.527529716491699,
"learning_rate": 4.711779910240117e-05,
"loss": 2.7191,
"step": 82500
},
{
"epoch": 0.45862181382164585,
"grad_norm": 4.683417797088623,
"learning_rate": 4.706673870378215e-05,
"loss": 2.7299,
"step": 83000
},
{
"epoch": 0.4613845958326196,
"grad_norm": 6.090554237365723,
"learning_rate": 4.7015575979715e-05,
"loss": 2.759,
"step": 83500
},
{
"epoch": 0.4641473778435934,
"grad_norm": 6.470883369445801,
"learning_rate": 4.6964413255647855e-05,
"loss": 2.734,
"step": 84000
},
{
"epoch": 0.46691015985456713,
"grad_norm": 8.398398399353027,
"learning_rate": 4.691325053158071e-05,
"loss": 2.7007,
"step": 84500
},
{
"epoch": 0.4696729418655409,
"grad_norm": 5.122215270996094,
"learning_rate": 4.686208780751355e-05,
"loss": 2.6983,
"step": 85000
},
{
"epoch": 0.4696729418655409,
"eval_runtime": 1425.8823,
"eval_samples_per_second": 253.845,
"eval_steps_per_second": 31.731,
"step": 85000
},
{
"epoch": 0.47243572387651467,
"grad_norm": 5.900498867034912,
"learning_rate": 4.681102740889454e-05,
"loss": 2.7391,
"step": 85500
},
{
"epoch": 0.47519850588748846,
"grad_norm": 8.789809226989746,
"learning_rate": 4.6759864684827386e-05,
"loss": 2.6843,
"step": 86000
},
{
"epoch": 0.47796128789846226,
"grad_norm": 10.296858787536621,
"learning_rate": 4.670870196076024e-05,
"loss": 2.7054,
"step": 86500
},
{
"epoch": 0.480724069909436,
"grad_norm": 7.866740703582764,
"learning_rate": 4.665753923669309e-05,
"loss": 2.7313,
"step": 87000
},
{
"epoch": 0.4834868519204098,
"grad_norm": 5.662039756774902,
"learning_rate": 4.660647883807408e-05,
"loss": 2.6513,
"step": 87500
},
{
"epoch": 0.48624963393138354,
"grad_norm": 6.773980617523193,
"learning_rate": 4.655531611400692e-05,
"loss": 2.7071,
"step": 88000
},
{
"epoch": 0.48901241594235734,
"grad_norm": 9.490970611572266,
"learning_rate": 4.6504153389939775e-05,
"loss": 2.7469,
"step": 88500
},
{
"epoch": 0.4917751979533311,
"grad_norm": 6.021182537078857,
"learning_rate": 4.645299066587263e-05,
"loss": 2.7378,
"step": 89000
},
{
"epoch": 0.4945379799643049,
"grad_norm": 6.2784271240234375,
"learning_rate": 4.640182794180548e-05,
"loss": 2.691,
"step": 89500
},
{
"epoch": 0.4973007619752786,
"grad_norm": 6.208467960357666,
"learning_rate": 4.635076754318646e-05,
"loss": 2.6986,
"step": 90000
},
{
"epoch": 0.4973007619752786,
"eval_runtime": 1416.7509,
"eval_samples_per_second": 255.481,
"eval_steps_per_second": 31.936,
"step": 90000
},
{
"epoch": 0.5000635439862524,
"grad_norm": 7.096754550933838,
"learning_rate": 4.6299604819119306e-05,
"loss": 2.7249,
"step": 90500
},
{
"epoch": 0.5028263259972262,
"grad_norm": 6.615260601043701,
"learning_rate": 4.624844209505216e-05,
"loss": 2.691,
"step": 91000
},
{
"epoch": 0.5055891080081999,
"grad_norm": 5.5422043800354,
"learning_rate": 4.619727937098501e-05,
"loss": 2.7257,
"step": 91500
},
{
"epoch": 0.5083518900191737,
"grad_norm": 6.378222465515137,
"learning_rate": 4.614621897236599e-05,
"loss": 2.7359,
"step": 92000
},
{
"epoch": 0.5111146720301475,
"grad_norm": 7.653573513031006,
"learning_rate": 4.609505624829884e-05,
"loss": 2.7233,
"step": 92500
},
{
"epoch": 0.5138774540411213,
"grad_norm": 8.169157981872559,
"learning_rate": 4.604389352423169e-05,
"loss": 2.6786,
"step": 93000
},
{
"epoch": 0.516640236052095,
"grad_norm": 6.562656402587891,
"learning_rate": 4.599273080016454e-05,
"loss": 2.705,
"step": 93500
},
{
"epoch": 0.5194030180630688,
"grad_norm": 5.986241340637207,
"learning_rate": 4.594167040154553e-05,
"loss": 2.717,
"step": 94000
},
{
"epoch": 0.5221658000740426,
"grad_norm": 6.135688304901123,
"learning_rate": 4.589061000292651e-05,
"loss": 2.6973,
"step": 94500
},
{
"epoch": 0.5249285820850164,
"grad_norm": 5.69881534576416,
"learning_rate": 4.583944727885936e-05,
"loss": 2.7153,
"step": 95000
},
{
"epoch": 0.5249285820850164,
"eval_runtime": 1415.9132,
"eval_samples_per_second": 255.632,
"eval_steps_per_second": 31.955,
"step": 95000
},
{
"epoch": 0.52769136409599,
"grad_norm": 11.074015617370605,
"learning_rate": 4.578828455479221e-05,
"loss": 2.7301,
"step": 95500
},
{
"epoch": 0.5304541461069638,
"grad_norm": 4.483022212982178,
"learning_rate": 4.573712183072506e-05,
"loss": 2.6829,
"step": 96000
},
{
"epoch": 0.5332169281179376,
"grad_norm": 5.1948561668396,
"learning_rate": 4.568595910665791e-05,
"loss": 2.7031,
"step": 96500
},
{
"epoch": 0.5359797101289114,
"grad_norm": 7.523544788360596,
"learning_rate": 4.5634796382590763e-05,
"loss": 2.75,
"step": 97000
},
{
"epoch": 0.5387424921398852,
"grad_norm": 7.069555282592773,
"learning_rate": 4.558363365852361e-05,
"loss": 2.7049,
"step": 97500
},
{
"epoch": 0.5415052741508589,
"grad_norm": 9.187417984008789,
"learning_rate": 4.553247093445646e-05,
"loss": 2.7008,
"step": 98000
},
{
"epoch": 0.5442680561618327,
"grad_norm": 6.571780204772949,
"learning_rate": 4.548141053583744e-05,
"loss": 2.6672,
"step": 98500
},
{
"epoch": 0.5470308381728065,
"grad_norm": 6.857777118682861,
"learning_rate": 4.5430247811770294e-05,
"loss": 2.6805,
"step": 99000
},
{
"epoch": 0.5497936201837803,
"grad_norm": 4.911254405975342,
"learning_rate": 4.5379085087703146e-05,
"loss": 2.7083,
"step": 99500
},
{
"epoch": 0.552556402194754,
"grad_norm": 6.255260467529297,
"learning_rate": 4.532792236363599e-05,
"loss": 2.7035,
"step": 100000
},
{
"epoch": 0.552556402194754,
"eval_runtime": 1418.8162,
"eval_samples_per_second": 255.109,
"eval_steps_per_second": 31.889,
"step": 100000
},
{
"epoch": 0.5553191842057278,
"grad_norm": 5.266800403594971,
"learning_rate": 4.527686196501697e-05,
"loss": 2.7331,
"step": 100500
},
{
"epoch": 0.5580819662167016,
"grad_norm": 5.317836284637451,
"learning_rate": 4.5225699240949825e-05,
"loss": 2.7293,
"step": 101000
},
{
"epoch": 0.5608447482276754,
"grad_norm": 6.019017219543457,
"learning_rate": 4.517453651688268e-05,
"loss": 2.6752,
"step": 101500
},
{
"epoch": 0.5636075302386491,
"grad_norm": 9.754213333129883,
"learning_rate": 4.512337379281553e-05,
"loss": 2.7048,
"step": 102000
},
{
"epoch": 0.5663703122496229,
"grad_norm": 5.172014236450195,
"learning_rate": 4.507231339419651e-05,
"loss": 2.6733,
"step": 102500
},
{
"epoch": 0.5691330942605967,
"grad_norm": 23.669513702392578,
"learning_rate": 4.502115067012936e-05,
"loss": 2.6909,
"step": 103000
},
{
"epoch": 0.5718958762715705,
"grad_norm": 4.895296573638916,
"learning_rate": 4.4969987946062214e-05,
"loss": 2.6816,
"step": 103500
},
{
"epoch": 0.5746586582825441,
"grad_norm": 6.87628173828125,
"learning_rate": 4.4918825221995066e-05,
"loss": 2.7055,
"step": 104000
},
{
"epoch": 0.5774214402935179,
"grad_norm": 6.831465721130371,
"learning_rate": 4.486776482337605e-05,
"loss": 2.6939,
"step": 104500
},
{
"epoch": 0.5801842223044917,
"grad_norm": 6.297806739807129,
"learning_rate": 4.481670442475703e-05,
"loss": 2.7159,
"step": 105000
},
{
"epoch": 0.5801842223044917,
"eval_runtime": 1418.3876,
"eval_samples_per_second": 255.186,
"eval_steps_per_second": 31.899,
"step": 105000
},
{
"epoch": 0.5829470043154655,
"grad_norm": 5.875136852264404,
"learning_rate": 4.4765541700689875e-05,
"loss": 2.6865,
"step": 105500
},
{
"epoch": 0.5857097863264392,
"grad_norm": 7.282098770141602,
"learning_rate": 4.471437897662273e-05,
"loss": 2.6504,
"step": 106000
},
{
"epoch": 0.588472568337413,
"grad_norm": 7.123196125030518,
"learning_rate": 4.466321625255558e-05,
"loss": 2.6962,
"step": 106500
},
{
"epoch": 0.5912353503483868,
"grad_norm": 5.343898296356201,
"learning_rate": 4.461205352848843e-05,
"loss": 2.6918,
"step": 107000
},
{
"epoch": 0.5939981323593606,
"grad_norm": 7.826199054718018,
"learning_rate": 4.4560890804421276e-05,
"loss": 2.7273,
"step": 107500
},
{
"epoch": 0.5967609143703344,
"grad_norm": 4.318883895874023,
"learning_rate": 4.4509830405802264e-05,
"loss": 2.7267,
"step": 108000
},
{
"epoch": 0.5995236963813081,
"grad_norm": 7.37441873550415,
"learning_rate": 4.4458667681735116e-05,
"loss": 2.6595,
"step": 108500
},
{
"epoch": 0.6022864783922819,
"grad_norm": 8.249720573425293,
"learning_rate": 4.440750495766797e-05,
"loss": 2.7001,
"step": 109000
},
{
"epoch": 0.6050492604032557,
"grad_norm": 7.008593559265137,
"learning_rate": 4.435634223360081e-05,
"loss": 2.6998,
"step": 109500
},
{
"epoch": 0.6078120424142295,
"grad_norm": 5.00942850112915,
"learning_rate": 4.430517950953366e-05,
"loss": 2.6903,
"step": 110000
},
{
"epoch": 0.6078120424142295,
"eval_runtime": 1419.9436,
"eval_samples_per_second": 254.907,
"eval_steps_per_second": 31.864,
"step": 110000
},
{
"epoch": 0.6105748244252032,
"grad_norm": 9.650525093078613,
"learning_rate": 4.425401678546651e-05,
"loss": 2.6904,
"step": 110500
},
{
"epoch": 0.613337606436177,
"grad_norm": 8.574694633483887,
"learning_rate": 4.420285406139936e-05,
"loss": 2.7113,
"step": 111000
},
{
"epoch": 0.6161003884471508,
"grad_norm": 8.437103271484375,
"learning_rate": 4.4151691337332214e-05,
"loss": 2.6072,
"step": 111500
},
{
"epoch": 0.6188631704581246,
"grad_norm": 7.744716167449951,
"learning_rate": 4.41006309387132e-05,
"loss": 2.6759,
"step": 112000
},
{
"epoch": 0.6216259524690982,
"grad_norm": 6.564632892608643,
"learning_rate": 4.404946821464605e-05,
"loss": 2.6762,
"step": 112500
},
{
"epoch": 0.624388734480072,
"grad_norm": 8.12996768951416,
"learning_rate": 4.39983054905789e-05,
"loss": 2.6773,
"step": 113000
},
{
"epoch": 0.6271515164910458,
"grad_norm": 10.49181842803955,
"learning_rate": 4.394714276651175e-05,
"loss": 2.7151,
"step": 113500
},
{
"epoch": 0.6299142985020196,
"grad_norm": 10.710319519042969,
"learning_rate": 4.3896082367892734e-05,
"loss": 2.6788,
"step": 114000
},
{
"epoch": 0.6326770805129933,
"grad_norm": 4.202202796936035,
"learning_rate": 4.3844919643825586e-05,
"loss": 2.7281,
"step": 114500
},
{
"epoch": 0.6354398625239671,
"grad_norm": 5.33767557144165,
"learning_rate": 4.379375691975843e-05,
"loss": 2.7071,
"step": 115000
},
{
"epoch": 0.6354398625239671,
"eval_runtime": 1417.9967,
"eval_samples_per_second": 255.257,
"eval_steps_per_second": 31.908,
"step": 115000
},
{
"epoch": 0.6382026445349409,
"grad_norm": 7.461055278778076,
"learning_rate": 4.374259419569128e-05,
"loss": 2.7429,
"step": 115500
},
{
"epoch": 0.6409654265459147,
"grad_norm": 5.790754318237305,
"learning_rate": 4.3691533797072264e-05,
"loss": 2.6729,
"step": 116000
},
{
"epoch": 0.6437282085568884,
"grad_norm": 6.799173831939697,
"learning_rate": 4.3640371073005116e-05,
"loss": 2.6795,
"step": 116500
},
{
"epoch": 0.6464909905678622,
"grad_norm": 8.002934455871582,
"learning_rate": 4.358920834893797e-05,
"loss": 2.7077,
"step": 117000
},
{
"epoch": 0.649253772578836,
"grad_norm": 6.33392858505249,
"learning_rate": 4.353814795031895e-05,
"loss": 2.67,
"step": 117500
},
{
"epoch": 0.6520165545898098,
"grad_norm": 7.54648494720459,
"learning_rate": 4.34869852262518e-05,
"loss": 2.703,
"step": 118000
},
{
"epoch": 0.6547793366007836,
"grad_norm": 6.37369441986084,
"learning_rate": 4.343582250218465e-05,
"loss": 2.7147,
"step": 118500
},
{
"epoch": 0.6575421186117573,
"grad_norm": 6.073976516723633,
"learning_rate": 4.33846597781175e-05,
"loss": 2.6448,
"step": 119000
},
{
"epoch": 0.6603049006227311,
"grad_norm": 6.560715675354004,
"learning_rate": 4.333349705405035e-05,
"loss": 2.6999,
"step": 119500
},
{
"epoch": 0.6630676826337049,
"grad_norm": 5.832411766052246,
"learning_rate": 4.3282334329983196e-05,
"loss": 2.6517,
"step": 120000
},
{
"epoch": 0.6630676826337049,
"eval_runtime": 1414.7946,
"eval_samples_per_second": 255.834,
"eval_steps_per_second": 31.98,
"step": 120000
},
{
"epoch": 0.6658304646446787,
"grad_norm": 7.141829490661621,
"learning_rate": 4.323117160591605e-05,
"loss": 2.6821,
"step": 120500
},
{
"epoch": 0.6685932466556523,
"grad_norm": 10.832301139831543,
"learning_rate": 4.31800088818489e-05,
"loss": 2.6649,
"step": 121000
},
{
"epoch": 0.6713560286666261,
"grad_norm": 6.108252048492432,
"learning_rate": 4.312894848322989e-05,
"loss": 2.6853,
"step": 121500
},
{
"epoch": 0.6741188106775999,
"grad_norm": 6.897459983825684,
"learning_rate": 4.3077785759162734e-05,
"loss": 2.7084,
"step": 122000
},
{
"epoch": 0.6768815926885737,
"grad_norm": 7.211142063140869,
"learning_rate": 4.3026623035095586e-05,
"loss": 2.7029,
"step": 122500
},
{
"epoch": 0.6796443746995474,
"grad_norm": 6.063936710357666,
"learning_rate": 4.297546031102843e-05,
"loss": 2.6938,
"step": 123000
},
{
"epoch": 0.6824071567105212,
"grad_norm": 7.535489082336426,
"learning_rate": 4.292439991240942e-05,
"loss": 2.71,
"step": 123500
},
{
"epoch": 0.685169938721495,
"grad_norm": 6.275320529937744,
"learning_rate": 4.287323718834227e-05,
"loss": 2.6889,
"step": 124000
},
{
"epoch": 0.6879327207324688,
"grad_norm": 5.57111930847168,
"learning_rate": 4.2822074464275116e-05,
"loss": 2.6347,
"step": 124500
},
{
"epoch": 0.6906955027434425,
"grad_norm": 5.227652072906494,
"learning_rate": 4.277091174020797e-05,
"loss": 2.6816,
"step": 125000
},
{
"epoch": 0.6906955027434425,
"eval_runtime": 1444.2501,
"eval_samples_per_second": 250.617,
"eval_steps_per_second": 31.328,
"step": 125000
},
{
"epoch": 0.6934582847544163,
"grad_norm": 3.870896339416504,
"learning_rate": 4.271974901614082e-05,
"loss": 2.6564,
"step": 125500
},
{
"epoch": 0.6962210667653901,
"grad_norm": 5.954395771026611,
"learning_rate": 4.26686886175218e-05,
"loss": 2.7183,
"step": 126000
},
{
"epoch": 0.6989838487763639,
"grad_norm": 5.926782131195068,
"learning_rate": 4.2617525893454654e-05,
"loss": 2.7013,
"step": 126500
},
{
"epoch": 0.7017466307873376,
"grad_norm": 6.156914710998535,
"learning_rate": 4.25663631693875e-05,
"loss": 2.6723,
"step": 127000
},
{
"epoch": 0.7045094127983114,
"grad_norm": 5.73563289642334,
"learning_rate": 4.251520044532035e-05,
"loss": 2.7047,
"step": 127500
},
{
"epoch": 0.7072721948092852,
"grad_norm": 6.068446636199951,
"learning_rate": 4.246414004670133e-05,
"loss": 2.6729,
"step": 128000
},
{
"epoch": 0.710034976820259,
"grad_norm": 6.130403995513916,
"learning_rate": 4.2412977322634185e-05,
"loss": 2.7039,
"step": 128500
},
{
"epoch": 0.7127977588312328,
"grad_norm": 5.924908638000488,
"learning_rate": 4.236191692401517e-05,
"loss": 2.7127,
"step": 129000
},
{
"epoch": 0.7155605408422064,
"grad_norm": 7.866479396820068,
"learning_rate": 4.2310754199948025e-05,
"loss": 2.644,
"step": 129500
},
{
"epoch": 0.7183233228531802,
"grad_norm": 6.398780345916748,
"learning_rate": 4.225959147588087e-05,
"loss": 2.7028,
"step": 130000
},
{
"epoch": 0.7183233228531802,
"eval_runtime": 1453.3668,
"eval_samples_per_second": 249.044,
"eval_steps_per_second": 31.131,
"step": 130000
},
{
"epoch": 0.721086104864154,
"grad_norm": 7.091272830963135,
"learning_rate": 4.220842875181372e-05,
"loss": 2.6944,
"step": 130500
},
{
"epoch": 0.7238488868751278,
"grad_norm": 6.755273342132568,
"learning_rate": 4.2157266027746574e-05,
"loss": 2.7319,
"step": 131000
},
{
"epoch": 0.7266116688861015,
"grad_norm": 7.107387065887451,
"learning_rate": 4.2106103303679426e-05,
"loss": 2.686,
"step": 131500
},
{
"epoch": 0.7293744508970753,
"grad_norm": 5.511538982391357,
"learning_rate": 4.205494057961227e-05,
"loss": 2.7141,
"step": 132000
},
{
"epoch": 0.7321372329080491,
"grad_norm": 6.616804599761963,
"learning_rate": 4.2003777855545116e-05,
"loss": 2.6817,
"step": 132500
},
{
"epoch": 0.7349000149190229,
"grad_norm": 5.216026782989502,
"learning_rate": 4.1952717456926105e-05,
"loss": 2.6844,
"step": 133000
},
{
"epoch": 0.7376627969299966,
"grad_norm": 6.271154880523682,
"learning_rate": 4.190155473285896e-05,
"loss": 2.6845,
"step": 133500
},
{
"epoch": 0.7404255789409704,
"grad_norm": 7.05709981918335,
"learning_rate": 4.185039200879181e-05,
"loss": 2.6943,
"step": 134000
},
{
"epoch": 0.7431883609519442,
"grad_norm": 8.08059024810791,
"learning_rate": 4.1799229284724654e-05,
"loss": 2.6897,
"step": 134500
},
{
"epoch": 0.745951142962918,
"grad_norm": 9.127315521240234,
"learning_rate": 4.1748066560657506e-05,
"loss": 2.7364,
"step": 135000
},
{
"epoch": 0.745951142962918,
"eval_runtime": 1433.8013,
"eval_samples_per_second": 252.443,
"eval_steps_per_second": 31.556,
"step": 135000
},
{
"epoch": 0.7487139249738917,
"grad_norm": 6.763530254364014,
"learning_rate": 4.169700616203849e-05,
"loss": 2.6775,
"step": 135500
},
{
"epoch": 0.7514767069848655,
"grad_norm": 7.9728617668151855,
"learning_rate": 4.164584343797134e-05,
"loss": 2.6517,
"step": 136000
},
{
"epoch": 0.7542394889958393,
"grad_norm": 6.352534294128418,
"learning_rate": 4.159468071390419e-05,
"loss": 2.6726,
"step": 136500
},
{
"epoch": 0.757002271006813,
"grad_norm": 8.01561450958252,
"learning_rate": 4.154351798983704e-05,
"loss": 2.656,
"step": 137000
},
{
"epoch": 0.7597650530177867,
"grad_norm": 4.679101467132568,
"learning_rate": 4.149245759121802e-05,
"loss": 2.6909,
"step": 137500
},
{
"epoch": 0.7625278350287605,
"grad_norm": 8.915389060974121,
"learning_rate": 4.144129486715087e-05,
"loss": 2.7239,
"step": 138000
},
{
"epoch": 0.7652906170397343,
"grad_norm": 9.970344543457031,
"learning_rate": 4.139013214308372e-05,
"loss": 2.6624,
"step": 138500
},
{
"epoch": 0.7680533990507081,
"grad_norm": 4.899960994720459,
"learning_rate": 4.1338969419016574e-05,
"loss": 2.6651,
"step": 139000
},
{
"epoch": 0.7708161810616819,
"grad_norm": 6.549561023712158,
"learning_rate": 4.128780669494942e-05,
"loss": 2.6871,
"step": 139500
},
{
"epoch": 0.7735789630726556,
"grad_norm": 9.052062034606934,
"learning_rate": 4.123674629633041e-05,
"loss": 2.6841,
"step": 140000
},
{
"epoch": 0.7735789630726556,
"eval_runtime": 1389.3693,
"eval_samples_per_second": 260.516,
"eval_steps_per_second": 32.565,
"step": 140000
},
{
"epoch": 0.7763417450836294,
"grad_norm": 9.36550521850586,
"learning_rate": 4.118558357226326e-05,
"loss": 2.6732,
"step": 140500
},
{
"epoch": 0.7791045270946032,
"grad_norm": 6.052969932556152,
"learning_rate": 4.113442084819611e-05,
"loss": 2.6642,
"step": 141000
},
{
"epoch": 0.781867309105577,
"grad_norm": 5.193731307983398,
"learning_rate": 4.108325812412896e-05,
"loss": 2.6776,
"step": 141500
},
{
"epoch": 0.7846300911165507,
"grad_norm": 7.808539390563965,
"learning_rate": 4.10320954000618e-05,
"loss": 2.6496,
"step": 142000
},
{
"epoch": 0.7873928731275245,
"grad_norm": 6.747580051422119,
"learning_rate": 4.098103500144279e-05,
"loss": 2.6942,
"step": 142500
},
{
"epoch": 0.7901556551384983,
"grad_norm": 7.423492908477783,
"learning_rate": 4.092987227737564e-05,
"loss": 2.7079,
"step": 143000
},
{
"epoch": 0.7929184371494721,
"grad_norm": 8.380352973937988,
"learning_rate": 4.0878709553308494e-05,
"loss": 2.7226,
"step": 143500
},
{
"epoch": 0.7956812191604458,
"grad_norm": 5.976553440093994,
"learning_rate": 4.082754682924134e-05,
"loss": 2.6531,
"step": 144000
},
{
"epoch": 0.7984440011714196,
"grad_norm": 6.945559024810791,
"learning_rate": 4.077648643062232e-05,
"loss": 2.6888,
"step": 144500
},
{
"epoch": 0.8012067831823934,
"grad_norm": 5.81919002532959,
"learning_rate": 4.072532370655517e-05,
"loss": 2.7122,
"step": 145000
},
{
"epoch": 0.8012067831823934,
"eval_runtime": 1417.5368,
"eval_samples_per_second": 255.339,
"eval_steps_per_second": 31.918,
"step": 145000
},
{
"epoch": 0.8039695651933672,
"grad_norm": 4.8362908363342285,
"learning_rate": 4.0674160982488025e-05,
"loss": 2.7026,
"step": 145500
},
{
"epoch": 0.8067323472043408,
"grad_norm": 5.70801305770874,
"learning_rate": 4.062299825842088e-05,
"loss": 2.676,
"step": 146000
},
{
"epoch": 0.8094951292153146,
"grad_norm": 5.2062811851501465,
"learning_rate": 4.057193785980186e-05,
"loss": 2.6539,
"step": 146500
},
{
"epoch": 0.8122579112262884,
"grad_norm": 7.768016815185547,
"learning_rate": 4.0520775135734704e-05,
"loss": 2.6562,
"step": 147000
},
{
"epoch": 0.8150206932372622,
"grad_norm": 5.368041515350342,
"learning_rate": 4.0469612411667556e-05,
"loss": 2.6276,
"step": 147500
},
{
"epoch": 0.8177834752482359,
"grad_norm": 8.40014934539795,
"learning_rate": 4.041844968760041e-05,
"loss": 2.6833,
"step": 148000
},
{
"epoch": 0.8205462572592097,
"grad_norm": 5.6016740798950195,
"learning_rate": 4.036728696353326e-05,
"loss": 2.6668,
"step": 148500
},
{
"epoch": 0.8233090392701835,
"grad_norm": 7.395069599151611,
"learning_rate": 4.031622656491424e-05,
"loss": 2.6534,
"step": 149000
},
{
"epoch": 0.8260718212811573,
"grad_norm": 6.262421607971191,
"learning_rate": 4.026506384084709e-05,
"loss": 2.6487,
"step": 149500
},
{
"epoch": 0.828834603292131,
"grad_norm": 5.546928882598877,
"learning_rate": 4.0213901116779945e-05,
"loss": 2.678,
"step": 150000
},
{
"epoch": 0.828834603292131,
"eval_runtime": 1413.0655,
"eval_samples_per_second": 256.147,
"eval_steps_per_second": 32.019,
"step": 150000
},
{
"epoch": 0.8315973853031048,
"grad_norm": 6.29634428024292,
"learning_rate": 4.01627383927128e-05,
"loss": 2.7104,
"step": 150500
},
{
"epoch": 0.8343601673140786,
"grad_norm": 8.511784553527832,
"learning_rate": 4.011167799409378e-05,
"loss": 2.6784,
"step": 151000
},
{
"epoch": 0.8371229493250524,
"grad_norm": 6.0729451179504395,
"learning_rate": 4.0060515270026624e-05,
"loss": 2.6968,
"step": 151500
},
{
"epoch": 0.8398857313360262,
"grad_norm": 4.513732433319092,
"learning_rate": 4.0009352545959476e-05,
"loss": 2.661,
"step": 152000
},
{
"epoch": 0.8426485133469999,
"grad_norm": 7.522515773773193,
"learning_rate": 3.995818982189233e-05,
"loss": 2.6892,
"step": 152500
},
{
"epoch": 0.8454112953579737,
"grad_norm": 13.848055839538574,
"learning_rate": 3.990712942327331e-05,
"loss": 2.6444,
"step": 153000
},
{
"epoch": 0.8481740773689475,
"grad_norm": 7.082030296325684,
"learning_rate": 3.985596669920616e-05,
"loss": 2.6872,
"step": 153500
},
{
"epoch": 0.8509368593799213,
"grad_norm": 7.098601818084717,
"learning_rate": 3.980480397513901e-05,
"loss": 2.6684,
"step": 154000
},
{
"epoch": 0.8536996413908949,
"grad_norm": 5.784538269042969,
"learning_rate": 3.975364125107186e-05,
"loss": 2.7026,
"step": 154500
},
{
"epoch": 0.8564624234018687,
"grad_norm": 7.91291618347168,
"learning_rate": 3.970258085245285e-05,
"loss": 2.7019,
"step": 155000
},
{
"epoch": 0.8564624234018687,
"eval_runtime": 95374.473,
"eval_samples_per_second": 3.795,
"eval_steps_per_second": 0.474,
"step": 155000
},
{
"epoch": 0.8592252054128425,
"grad_norm": 6.4150710105896,
"learning_rate": 3.96514181283857e-05,
"loss": 2.6462,
"step": 155500
},
{
"epoch": 0.8619879874238163,
"grad_norm": 6.692925930023193,
"learning_rate": 3.9600255404318544e-05,
"loss": 2.6601,
"step": 156000
},
{
"epoch": 0.86475076943479,
"grad_norm": 6.283834934234619,
"learning_rate": 3.954909268025139e-05,
"loss": 2.6762,
"step": 156500
},
{
"epoch": 0.8675135514457638,
"grad_norm": 7.09011173248291,
"learning_rate": 3.949803228163238e-05,
"loss": 2.6903,
"step": 157000
},
{
"epoch": 0.8702763334567376,
"grad_norm": 5.456295490264893,
"learning_rate": 3.944686955756523e-05,
"loss": 2.6861,
"step": 157500
},
{
"epoch": 0.8730391154677114,
"grad_norm": 8.332560539245605,
"learning_rate": 3.939570683349808e-05,
"loss": 2.6292,
"step": 158000
},
{
"epoch": 0.8758018974786851,
"grad_norm": 11.606867790222168,
"learning_rate": 3.9344544109430934e-05,
"loss": 2.6681,
"step": 158500
},
{
"epoch": 0.8785646794896589,
"grad_norm": 6.087031364440918,
"learning_rate": 3.9293483710811915e-05,
"loss": 2.6429,
"step": 159000
},
{
"epoch": 0.8813274615006327,
"grad_norm": 7.540450572967529,
"learning_rate": 3.924232098674476e-05,
"loss": 2.6526,
"step": 159500
},
{
"epoch": 0.8840902435116065,
"grad_norm": 5.995485305786133,
"learning_rate": 3.919115826267761e-05,
"loss": 2.6561,
"step": 160000
},
{
"epoch": 0.8840902435116065,
"eval_runtime": 1415.0083,
"eval_samples_per_second": 255.796,
"eval_steps_per_second": 31.975,
"step": 160000
},
{
"epoch": 0.8868530255225802,
"grad_norm": 6.677637100219727,
"learning_rate": 3.9139995538610464e-05,
"loss": 2.726,
"step": 160500
},
{
"epoch": 0.889615807533554,
"grad_norm": 4.598792552947998,
"learning_rate": 3.9088832814543316e-05,
"loss": 2.6989,
"step": 161000
},
{
"epoch": 0.8923785895445278,
"grad_norm": 6.177221775054932,
"learning_rate": 3.90377724159243e-05,
"loss": 2.6515,
"step": 161500
},
{
"epoch": 0.8951413715555016,
"grad_norm": 4.76786994934082,
"learning_rate": 3.898660969185714e-05,
"loss": 2.6567,
"step": 162000
},
{
"epoch": 0.8979041535664753,
"grad_norm": 8.788933753967285,
"learning_rate": 3.8935446967789995e-05,
"loss": 2.6491,
"step": 162500
},
{
"epoch": 0.900666935577449,
"grad_norm": 5.806134223937988,
"learning_rate": 3.888428424372285e-05,
"loss": 2.638,
"step": 163000
},
{
"epoch": 0.9034297175884228,
"grad_norm": 6.518142223358154,
"learning_rate": 3.8833223845103836e-05,
"loss": 2.7087,
"step": 163500
},
{
"epoch": 0.9061924995993966,
"grad_norm": 5.603370189666748,
"learning_rate": 3.878206112103668e-05,
"loss": 2.6832,
"step": 164000
},
{
"epoch": 0.9089552816103704,
"grad_norm": 4.990660667419434,
"learning_rate": 3.873089839696953e-05,
"loss": 2.6847,
"step": 164500
},
{
"epoch": 0.9117180636213441,
"grad_norm": 7.145622730255127,
"learning_rate": 3.8679735672902385e-05,
"loss": 2.652,
"step": 165000
},
{
"epoch": 0.9117180636213441,
"eval_runtime": 1419.0902,
"eval_samples_per_second": 255.06,
"eval_steps_per_second": 31.883,
"step": 165000
},
{
"epoch": 0.9144808456323179,
"grad_norm": 6.253338813781738,
"learning_rate": 3.8628675274283366e-05,
"loss": 2.6564,
"step": 165500
},
{
"epoch": 0.9172436276432917,
"grad_norm": 9.034163475036621,
"learning_rate": 3.857761487566435e-05,
"loss": 2.6881,
"step": 166000
},
{
"epoch": 0.9200064096542655,
"grad_norm": 6.8109331130981445,
"learning_rate": 3.85264521515972e-05,
"loss": 2.6523,
"step": 166500
},
{
"epoch": 0.9227691916652392,
"grad_norm": 6.663868427276611,
"learning_rate": 3.8475289427530045e-05,
"loss": 2.6115,
"step": 167000
},
{
"epoch": 0.925531973676213,
"grad_norm": 6.121496677398682,
"learning_rate": 3.84241267034629e-05,
"loss": 2.6596,
"step": 167500
},
{
"epoch": 0.9282947556871868,
"grad_norm": 7.0516533851623535,
"learning_rate": 3.837296397939575e-05,
"loss": 2.7253,
"step": 168000
},
{
"epoch": 0.9310575376981606,
"grad_norm": 9.789468765258789,
"learning_rate": 3.83218012553286e-05,
"loss": 2.6783,
"step": 168500
},
{
"epoch": 0.9338203197091343,
"grad_norm": 6.72811222076416,
"learning_rate": 3.8270638531261446e-05,
"loss": 2.6639,
"step": 169000
},
{
"epoch": 0.9365831017201081,
"grad_norm": 6.445733070373535,
"learning_rate": 3.82194758071943e-05,
"loss": 2.6683,
"step": 169500
},
{
"epoch": 0.9393458837310819,
"grad_norm": 6.840031623840332,
"learning_rate": 3.816841540857528e-05,
"loss": 2.6877,
"step": 170000
},
{
"epoch": 0.9393458837310819,
"eval_runtime": 1422.4105,
"eval_samples_per_second": 254.465,
"eval_steps_per_second": 31.809,
"step": 170000
},
{
"epoch": 0.9421086657420557,
"grad_norm": 7.416438579559326,
"learning_rate": 3.811735500995627e-05,
"loss": 2.6762,
"step": 170500
},
{
"epoch": 0.9448714477530293,
"grad_norm": 7.09953498840332,
"learning_rate": 3.806619228588912e-05,
"loss": 2.6424,
"step": 171000
},
{
"epoch": 0.9476342297640031,
"grad_norm": 4.984237194061279,
"learning_rate": 3.801502956182197e-05,
"loss": 2.615,
"step": 171500
},
{
"epoch": 0.9503970117749769,
"grad_norm": 7.201922416687012,
"learning_rate": 3.796386683775482e-05,
"loss": 2.6765,
"step": 172000
},
{
"epoch": 0.9531597937859507,
"grad_norm": 5.911804676055908,
"learning_rate": 3.791270411368767e-05,
"loss": 2.6948,
"step": 172500
},
{
"epoch": 0.9559225757969245,
"grad_norm": 9.068735122680664,
"learning_rate": 3.786154138962052e-05,
"loss": 2.732,
"step": 173000
},
{
"epoch": 0.9586853578078982,
"grad_norm": 5.268345355987549,
"learning_rate": 3.781037866555337e-05,
"loss": 2.6845,
"step": 173500
},
{
"epoch": 0.961448139818872,
"grad_norm": 4.931880474090576,
"learning_rate": 3.775921594148622e-05,
"loss": 2.6898,
"step": 174000
},
{
"epoch": 0.9642109218298458,
"grad_norm": 6.40797233581543,
"learning_rate": 3.77081555428672e-05,
"loss": 2.6438,
"step": 174500
},
{
"epoch": 0.9669737038408196,
"grad_norm": 8.506610870361328,
"learning_rate": 3.765699281880005e-05,
"loss": 2.6475,
"step": 175000
},
{
"epoch": 0.9669737038408196,
"eval_runtime": 1421.0183,
"eval_samples_per_second": 254.714,
"eval_steps_per_second": 31.84,
"step": 175000
},
{
"epoch": 0.9697364858517933,
"grad_norm": 6.606350898742676,
"learning_rate": 3.7605932420181033e-05,
"loss": 2.649,
"step": 175500
},
{
"epoch": 0.9724992678627671,
"grad_norm": 7.5227580070495605,
"learning_rate": 3.7554769696113885e-05,
"loss": 2.6666,
"step": 176000
},
{
"epoch": 0.9752620498737409,
"grad_norm": 6.499021530151367,
"learning_rate": 3.750360697204674e-05,
"loss": 2.6652,
"step": 176500
},
{
"epoch": 0.9780248318847147,
"grad_norm": 6.094892501831055,
"learning_rate": 3.745244424797958e-05,
"loss": 2.6776,
"step": 177000
},
{
"epoch": 0.9807876138956884,
"grad_norm": 6.329995632171631,
"learning_rate": 3.7401281523912435e-05,
"loss": 2.7002,
"step": 177500
},
{
"epoch": 0.9835503959066622,
"grad_norm": 7.394835472106934,
"learning_rate": 3.7350118799845287e-05,
"loss": 2.6647,
"step": 178000
},
{
"epoch": 0.986313177917636,
"grad_norm": 9.899744987487793,
"learning_rate": 3.729895607577814e-05,
"loss": 2.6873,
"step": 178500
},
{
"epoch": 0.9890759599286097,
"grad_norm": 8.495482444763184,
"learning_rate": 3.7247793351710984e-05,
"loss": 2.6574,
"step": 179000
},
{
"epoch": 0.9918387419395834,
"grad_norm": 7.411177158355713,
"learning_rate": 3.7196630627643836e-05,
"loss": 2.6587,
"step": 179500
},
{
"epoch": 0.9946015239505572,
"grad_norm": 6.457353591918945,
"learning_rate": 3.714557022902482e-05,
"loss": 2.6607,
"step": 180000
},
{
"epoch": 0.9946015239505572,
"eval_runtime": 1418.3578,
"eval_samples_per_second": 255.192,
"eval_steps_per_second": 31.9,
"step": 180000
},
{
"epoch": 0.997364305961531,
"grad_norm": 4.6210618019104,
"learning_rate": 3.709440750495767e-05,
"loss": 2.6894,
"step": 180500
},
{
"epoch": 1.0001270879725048,
"grad_norm": 8.51563549041748,
"learning_rate": 3.704324478089052e-05,
"loss": 2.6624,
"step": 181000
},
{
"epoch": 1.0028898699834785,
"grad_norm": 7.084454536437988,
"learning_rate": 3.6992082056823366e-05,
"loss": 2.644,
"step": 181500
},
{
"epoch": 1.0056526519944524,
"grad_norm": 6.7502665519714355,
"learning_rate": 3.6941021658204355e-05,
"loss": 2.719,
"step": 182000
},
{
"epoch": 1.008415434005426,
"grad_norm": 9.962002754211426,
"learning_rate": 3.688985893413721e-05,
"loss": 2.6581,
"step": 182500
},
{
"epoch": 1.0111782160163998,
"grad_norm": 8.310935020446777,
"learning_rate": 3.683869621007006e-05,
"loss": 2.6834,
"step": 183000
},
{
"epoch": 1.0139409980273737,
"grad_norm": 6.0927557945251465,
"learning_rate": 3.6787533486002904e-05,
"loss": 2.6788,
"step": 183500
},
{
"epoch": 1.0167037800383474,
"grad_norm": 7.631921768188477,
"learning_rate": 3.6736473087383886e-05,
"loss": 2.6882,
"step": 184000
},
{
"epoch": 1.0194665620493213,
"grad_norm": 11.251723289489746,
"learning_rate": 3.668531036331674e-05,
"loss": 2.6699,
"step": 184500
},
{
"epoch": 1.022229344060295,
"grad_norm": 6.595937252044678,
"learning_rate": 3.663414763924959e-05,
"loss": 2.6798,
"step": 185000
},
{
"epoch": 1.022229344060295,
"eval_runtime": 1415.123,
"eval_samples_per_second": 255.775,
"eval_steps_per_second": 31.972,
"step": 185000
},
{
"epoch": 1.0249921260712687,
"grad_norm": 11.095263481140137,
"learning_rate": 3.658308724063057e-05,
"loss": 2.6569,
"step": 185500
},
{
"epoch": 1.0277549080822426,
"grad_norm": 6.58620023727417,
"learning_rate": 3.653192451656342e-05,
"loss": 2.6731,
"step": 186000
},
{
"epoch": 1.0305176900932163,
"grad_norm": 8.076006889343262,
"learning_rate": 3.648076179249627e-05,
"loss": 2.708,
"step": 186500
},
{
"epoch": 1.03328047210419,
"grad_norm": 6.5196990966796875,
"learning_rate": 3.642959906842912e-05,
"loss": 2.6235,
"step": 187000
},
{
"epoch": 1.0360432541151638,
"grad_norm": 5.743066787719727,
"learning_rate": 3.637843634436197e-05,
"loss": 2.6598,
"step": 187500
},
{
"epoch": 1.0388060361261375,
"grad_norm": 5.8798112869262695,
"learning_rate": 3.6327273620294824e-05,
"loss": 2.6463,
"step": 188000
},
{
"epoch": 1.0415688181371114,
"grad_norm": 5.750164031982422,
"learning_rate": 3.627611089622767e-05,
"loss": 2.6784,
"step": 188500
},
{
"epoch": 1.0443316001480851,
"grad_norm": 10.507771492004395,
"learning_rate": 3.622494817216052e-05,
"loss": 2.6642,
"step": 189000
},
{
"epoch": 1.0470943821590588,
"grad_norm": 6.932614803314209,
"learning_rate": 3.61738877735415e-05,
"loss": 2.6819,
"step": 189500
},
{
"epoch": 1.0498571641700327,
"grad_norm": 6.338881969451904,
"learning_rate": 3.6122725049474355e-05,
"loss": 2.6597,
"step": 190000
},
{
"epoch": 1.0498571641700327,
"eval_runtime": 1409.894,
"eval_samples_per_second": 256.724,
"eval_steps_per_second": 32.091,
"step": 190000
},
{
"epoch": 1.0526199461810064,
"grad_norm": 6.120572090148926,
"learning_rate": 3.607156232540721e-05,
"loss": 2.6625,
"step": 190500
},
{
"epoch": 1.05538272819198,
"grad_norm": 7.011288166046143,
"learning_rate": 3.602039960134005e-05,
"loss": 2.6853,
"step": 191000
},
{
"epoch": 1.058145510202954,
"grad_norm": 6.725677013397217,
"learning_rate": 3.5969236877272904e-05,
"loss": 2.715,
"step": 191500
},
{
"epoch": 1.0609082922139277,
"grad_norm": 5.5944132804870605,
"learning_rate": 3.5918074153205756e-05,
"loss": 2.6688,
"step": 192000
},
{
"epoch": 1.0636710742249016,
"grad_norm": 5.993505954742432,
"learning_rate": 3.5867013754586744e-05,
"loss": 2.6747,
"step": 192500
},
{
"epoch": 1.0664338562358753,
"grad_norm": 6.300167560577393,
"learning_rate": 3.581585103051959e-05,
"loss": 2.6711,
"step": 193000
},
{
"epoch": 1.069196638246849,
"grad_norm": 4.657979965209961,
"learning_rate": 3.5764688306452435e-05,
"loss": 2.6409,
"step": 193500
},
{
"epoch": 1.0719594202578229,
"grad_norm": 5.573739051818848,
"learning_rate": 3.571352558238529e-05,
"loss": 2.6507,
"step": 194000
},
{
"epoch": 1.0747222022687966,
"grad_norm": 9.203325271606445,
"learning_rate": 3.566236285831814e-05,
"loss": 2.6963,
"step": 194500
},
{
"epoch": 1.0774849842797702,
"grad_norm": 5.0021185874938965,
"learning_rate": 3.561120013425099e-05,
"loss": 2.6293,
"step": 195000
},
{
"epoch": 1.0774849842797702,
"eval_runtime": 1412.7471,
"eval_samples_per_second": 256.205,
"eval_steps_per_second": 32.026,
"step": 195000
},
{
"epoch": 1.0802477662907441,
"grad_norm": 5.483398914337158,
"learning_rate": 3.5560037410183836e-05,
"loss": 2.6449,
"step": 195500
},
{
"epoch": 1.0830105483017178,
"grad_norm": 6.608130931854248,
"learning_rate": 3.550887468611669e-05,
"loss": 2.7054,
"step": 196000
},
{
"epoch": 1.0857733303126917,
"grad_norm": 6.910079479217529,
"learning_rate": 3.5457814287497676e-05,
"loss": 2.6568,
"step": 196500
},
{
"epoch": 1.0885361123236654,
"grad_norm": 4.9066619873046875,
"learning_rate": 3.540665156343053e-05,
"loss": 2.6738,
"step": 197000
},
{
"epoch": 1.0912988943346391,
"grad_norm": 10.586669921875,
"learning_rate": 3.535548883936337e-05,
"loss": 2.6878,
"step": 197500
},
{
"epoch": 1.094061676345613,
"grad_norm": 6.523357391357422,
"learning_rate": 3.530432611529622e-05,
"loss": 2.6964,
"step": 198000
},
{
"epoch": 1.0968244583565867,
"grad_norm": 6.5124831199646,
"learning_rate": 3.525326571667721e-05,
"loss": 2.6358,
"step": 198500
},
{
"epoch": 1.0995872403675606,
"grad_norm": 5.297051429748535,
"learning_rate": 3.520220531805819e-05,
"loss": 2.609,
"step": 199000
},
{
"epoch": 1.1023500223785343,
"grad_norm": 9.636564254760742,
"learning_rate": 3.515104259399104e-05,
"loss": 2.6268,
"step": 199500
},
{
"epoch": 1.105112804389508,
"grad_norm": 6.009031772613525,
"learning_rate": 3.509987986992389e-05,
"loss": 2.6668,
"step": 200000
},
{
"epoch": 1.105112804389508,
"eval_runtime": 1415.3759,
"eval_samples_per_second": 255.729,
"eval_steps_per_second": 31.967,
"step": 200000
},
{
"epoch": 1.107875586400482,
"grad_norm": 6.43049430847168,
"learning_rate": 3.504871714585674e-05,
"loss": 2.5873,
"step": 200500
},
{
"epoch": 1.1106383684114556,
"grad_norm": 5.201763153076172,
"learning_rate": 3.4997656747237726e-05,
"loss": 2.6485,
"step": 201000
},
{
"epoch": 1.1134011504224293,
"grad_norm": 8.699557304382324,
"learning_rate": 3.494659634861871e-05,
"loss": 2.6795,
"step": 201500
},
{
"epoch": 1.1161639324334032,
"grad_norm": 6.473239421844482,
"learning_rate": 3.489543362455156e-05,
"loss": 2.6479,
"step": 202000
},
{
"epoch": 1.1189267144443769,
"grad_norm": 5.315208435058594,
"learning_rate": 3.484427090048441e-05,
"loss": 2.6804,
"step": 202500
},
{
"epoch": 1.1216894964553508,
"grad_norm": 5.999909400939941,
"learning_rate": 3.4793108176417264e-05,
"loss": 2.6287,
"step": 203000
},
{
"epoch": 1.1244522784663245,
"grad_norm": 5.583693504333496,
"learning_rate": 3.474194545235011e-05,
"loss": 2.6928,
"step": 203500
},
{
"epoch": 1.1272150604772981,
"grad_norm": 12.50479793548584,
"learning_rate": 3.469078272828296e-05,
"loss": 2.6234,
"step": 204000
},
{
"epoch": 1.129977842488272,
"grad_norm": 7.096823692321777,
"learning_rate": 3.463962000421581e-05,
"loss": 2.6131,
"step": 204500
},
{
"epoch": 1.1327406244992457,
"grad_norm": 8.93995475769043,
"learning_rate": 3.4588457280148665e-05,
"loss": 2.6765,
"step": 205000
},
{
"epoch": 1.1327406244992457,
"eval_runtime": 1410.6732,
"eval_samples_per_second": 256.582,
"eval_steps_per_second": 32.073,
"step": 205000
},
{
"epoch": 1.1355034065102196,
"grad_norm": 7.607180595397949,
"learning_rate": 3.453729455608151e-05,
"loss": 2.6506,
"step": 205500
},
{
"epoch": 1.1382661885211933,
"grad_norm": 5.653107166290283,
"learning_rate": 3.448613183201436e-05,
"loss": 2.6471,
"step": 206000
},
{
"epoch": 1.141028970532167,
"grad_norm": 5.887621879577637,
"learning_rate": 3.4434969107947214e-05,
"loss": 2.6733,
"step": 206500
},
{
"epoch": 1.143791752543141,
"grad_norm": 6.376918315887451,
"learning_rate": 3.438380638388006e-05,
"loss": 2.6874,
"step": 207000
},
{
"epoch": 1.1465545345541146,
"grad_norm": 5.099799156188965,
"learning_rate": 3.433274598526105e-05,
"loss": 2.6371,
"step": 207500
},
{
"epoch": 1.1493173165650883,
"grad_norm": 6.011958122253418,
"learning_rate": 3.428168558664203e-05,
"loss": 2.6578,
"step": 208000
},
{
"epoch": 1.1520800985760622,
"grad_norm": 6.049017429351807,
"learning_rate": 3.4230522862574874e-05,
"loss": 2.6667,
"step": 208500
},
{
"epoch": 1.1548428805870359,
"grad_norm": 7.92437744140625,
"learning_rate": 3.4179360138507726e-05,
"loss": 2.6889,
"step": 209000
},
{
"epoch": 1.1576056625980098,
"grad_norm": 6.145605087280273,
"learning_rate": 3.412819741444058e-05,
"loss": 2.6537,
"step": 209500
},
{
"epoch": 1.1603684446089835,
"grad_norm": 7.511498928070068,
"learning_rate": 3.407703469037343e-05,
"loss": 2.6694,
"step": 210000
},
{
"epoch": 1.1603684446089835,
"eval_runtime": 1410.6383,
"eval_samples_per_second": 256.588,
"eval_steps_per_second": 32.074,
"step": 210000
},
{
"epoch": 1.1631312266199572,
"grad_norm": 13.444281578063965,
"learning_rate": 3.4025871966306275e-05,
"loss": 2.6653,
"step": 210500
},
{
"epoch": 1.165894008630931,
"grad_norm": 6.2352399826049805,
"learning_rate": 3.397470924223913e-05,
"loss": 2.6504,
"step": 211000
},
{
"epoch": 1.1686567906419048,
"grad_norm": 6.435050964355469,
"learning_rate": 3.392354651817198e-05,
"loss": 2.6625,
"step": 211500
},
{
"epoch": 1.1714195726528787,
"grad_norm": 5.893118858337402,
"learning_rate": 3.387248611955296e-05,
"loss": 2.6887,
"step": 212000
},
{
"epoch": 1.1741823546638523,
"grad_norm": 7.816985607147217,
"learning_rate": 3.382132339548581e-05,
"loss": 2.6655,
"step": 212500
},
{
"epoch": 1.176945136674826,
"grad_norm": 6.382891654968262,
"learning_rate": 3.377016067141866e-05,
"loss": 2.675,
"step": 213000
},
{
"epoch": 1.1797079186858,
"grad_norm": 5.761401653289795,
"learning_rate": 3.371899794735151e-05,
"loss": 2.6846,
"step": 213500
},
{
"epoch": 1.1824707006967736,
"grad_norm": 6.107725620269775,
"learning_rate": 3.366783522328436e-05,
"loss": 2.6129,
"step": 214000
},
{
"epoch": 1.1852334827077473,
"grad_norm": 6.5399394035339355,
"learning_rate": 3.361677482466535e-05,
"loss": 2.6943,
"step": 214500
},
{
"epoch": 1.1879962647187212,
"grad_norm": 10.632125854492188,
"learning_rate": 3.356571442604633e-05,
"loss": 2.6334,
"step": 215000
},
{
"epoch": 1.1879962647187212,
"eval_runtime": 1416.4537,
"eval_samples_per_second": 255.535,
"eval_steps_per_second": 31.942,
"step": 215000
},
{
"epoch": 1.190759046729695,
"grad_norm": 8.258727073669434,
"learning_rate": 3.351455170197918e-05,
"loss": 2.6625,
"step": 215500
},
{
"epoch": 1.1935218287406686,
"grad_norm": 6.275572776794434,
"learning_rate": 3.346338897791203e-05,
"loss": 2.6586,
"step": 216000
},
{
"epoch": 1.1962846107516425,
"grad_norm": 5.8179144859313965,
"learning_rate": 3.341222625384488e-05,
"loss": 2.7107,
"step": 216500
},
{
"epoch": 1.1990473927626162,
"grad_norm": 7.048791885375977,
"learning_rate": 3.336106352977773e-05,
"loss": 2.6541,
"step": 217000
},
{
"epoch": 1.20181017477359,
"grad_norm": 5.731837272644043,
"learning_rate": 3.330990080571058e-05,
"loss": 2.6137,
"step": 217500
},
{
"epoch": 1.2045729567845638,
"grad_norm": 6.205833911895752,
"learning_rate": 3.325873808164343e-05,
"loss": 2.6608,
"step": 218000
},
{
"epoch": 1.2073357387955375,
"grad_norm": 7.904666900634766,
"learning_rate": 3.320757535757628e-05,
"loss": 2.6514,
"step": 218500
},
{
"epoch": 1.2100985208065114,
"grad_norm": 7.223947525024414,
"learning_rate": 3.3156514958957264e-05,
"loss": 2.7057,
"step": 219000
},
{
"epoch": 1.212861302817485,
"grad_norm": 5.2091569900512695,
"learning_rate": 3.3105352234890116e-05,
"loss": 2.6155,
"step": 219500
},
{
"epoch": 1.215624084828459,
"grad_norm": 13.029759407043457,
"learning_rate": 3.305418951082296e-05,
"loss": 2.6393,
"step": 220000
},
{
"epoch": 1.215624084828459,
"eval_runtime": 1409.2915,
"eval_samples_per_second": 256.833,
"eval_steps_per_second": 32.105,
"step": 220000
},
{
"epoch": 1.2183868668394326,
"grad_norm": 5.841119766235352,
"learning_rate": 3.300302678675581e-05,
"loss": 2.6914,
"step": 220500
},
{
"epoch": 1.2211496488504063,
"grad_norm": 5.692915916442871,
"learning_rate": 3.2951966388136794e-05,
"loss": 2.674,
"step": 221000
},
{
"epoch": 1.2239124308613802,
"grad_norm": 6.484999656677246,
"learning_rate": 3.2900803664069646e-05,
"loss": 2.6225,
"step": 221500
},
{
"epoch": 1.226675212872354,
"grad_norm": 8.07515811920166,
"learning_rate": 3.28496409400025e-05,
"loss": 2.6609,
"step": 222000
},
{
"epoch": 1.2294379948833276,
"grad_norm": 7.687187194824219,
"learning_rate": 3.2798478215935343e-05,
"loss": 2.6666,
"step": 222500
},
{
"epoch": 1.2322007768943015,
"grad_norm": 5.499644756317139,
"learning_rate": 3.274741781731633e-05,
"loss": 2.6853,
"step": 223000
},
{
"epoch": 1.2349635589052752,
"grad_norm": 6.094354629516602,
"learning_rate": 3.2696255093249184e-05,
"loss": 2.665,
"step": 223500
},
{
"epoch": 1.2377263409162491,
"grad_norm": 6.3320159912109375,
"learning_rate": 3.2645092369182036e-05,
"loss": 2.6958,
"step": 224000
},
{
"epoch": 1.2404891229272228,
"grad_norm": 5.882307529449463,
"learning_rate": 3.259392964511489e-05,
"loss": 2.6822,
"step": 224500
},
{
"epoch": 1.2432519049381965,
"grad_norm": 6.465645790100098,
"learning_rate": 3.254276692104773e-05,
"loss": 2.6648,
"step": 225000
},
{
"epoch": 1.2432519049381965,
"eval_runtime": 1411.6928,
"eval_samples_per_second": 256.396,
"eval_steps_per_second": 32.05,
"step": 225000
},
{
"epoch": 1.2460146869491704,
"grad_norm": 7.901124477386475,
"learning_rate": 3.2491706522428715e-05,
"loss": 2.6548,
"step": 225500
},
{
"epoch": 1.248777468960144,
"grad_norm": 11.486516952514648,
"learning_rate": 3.2440543798361567e-05,
"loss": 2.6362,
"step": 226000
},
{
"epoch": 1.251540250971118,
"grad_norm": 8.68649959564209,
"learning_rate": 3.238938107429442e-05,
"loss": 2.6175,
"step": 226500
},
{
"epoch": 1.2543030329820917,
"grad_norm": 9.17063045501709,
"learning_rate": 3.233821835022727e-05,
"loss": 2.6319,
"step": 227000
},
{
"epoch": 1.2570658149930654,
"grad_norm": 7.608591556549072,
"learning_rate": 3.228715795160825e-05,
"loss": 2.6814,
"step": 227500
},
{
"epoch": 1.2598285970040393,
"grad_norm": 6.055707931518555,
"learning_rate": 3.2236097552989234e-05,
"loss": 2.6131,
"step": 228000
},
{
"epoch": 1.262591379015013,
"grad_norm": 10.52354621887207,
"learning_rate": 3.2184934828922086e-05,
"loss": 2.6803,
"step": 228500
},
{
"epoch": 1.2653541610259866,
"grad_norm": 4.819145679473877,
"learning_rate": 3.213377210485494e-05,
"loss": 2.6665,
"step": 229000
},
{
"epoch": 1.2681169430369605,
"grad_norm": 6.71164608001709,
"learning_rate": 3.208260938078779e-05,
"loss": 2.6405,
"step": 229500
},
{
"epoch": 1.2708797250479342,
"grad_norm": 6.727443218231201,
"learning_rate": 3.2031446656720635e-05,
"loss": 2.6668,
"step": 230000
},
{
"epoch": 1.2708797250479342,
"eval_runtime": 1397.8356,
"eval_samples_per_second": 258.938,
"eval_steps_per_second": 32.368,
"step": 230000
},
{
"epoch": 1.273642507058908,
"grad_norm": 6.408928871154785,
"learning_rate": 3.198028393265348e-05,
"loss": 2.6714,
"step": 230500
},
{
"epoch": 1.2764052890698818,
"grad_norm": 7.555359363555908,
"learning_rate": 3.192912120858633e-05,
"loss": 2.6462,
"step": 231000
},
{
"epoch": 1.2791680710808555,
"grad_norm": 7.9627909660339355,
"learning_rate": 3.1877958484519184e-05,
"loss": 2.6649,
"step": 231500
},
{
"epoch": 1.2819308530918294,
"grad_norm": 5.883249759674072,
"learning_rate": 3.182689808590017e-05,
"loss": 2.6482,
"step": 232000
},
{
"epoch": 1.284693635102803,
"grad_norm": 6.337319850921631,
"learning_rate": 3.177573536183302e-05,
"loss": 2.653,
"step": 232500
},
{
"epoch": 1.287456417113777,
"grad_norm": 9.221954345703125,
"learning_rate": 3.172457263776587e-05,
"loss": 2.6429,
"step": 233000
},
{
"epoch": 1.2902191991247507,
"grad_norm": 8.365209579467773,
"learning_rate": 3.167340991369872e-05,
"loss": 2.6898,
"step": 233500
},
{
"epoch": 1.2929819811357244,
"grad_norm": 13.809211730957031,
"learning_rate": 3.16223495150797e-05,
"loss": 2.6034,
"step": 234000
},
{
"epoch": 1.2957447631466983,
"grad_norm": 6.561621189117432,
"learning_rate": 3.1571186791012555e-05,
"loss": 2.6831,
"step": 234500
},
{
"epoch": 1.298507545157672,
"grad_norm": 7.049484729766846,
"learning_rate": 3.15200240669454e-05,
"loss": 2.6741,
"step": 235000
},
{
"epoch": 1.298507545157672,
"eval_runtime": 1410.5655,
"eval_samples_per_second": 256.601,
"eval_steps_per_second": 32.076,
"step": 235000
},
{
"epoch": 1.3012703271686457,
"grad_norm": 7.707888126373291,
"learning_rate": 3.146886134287825e-05,
"loss": 2.6391,
"step": 235500
},
{
"epoch": 1.3040331091796196,
"grad_norm": 8.038480758666992,
"learning_rate": 3.1417800944259234e-05,
"loss": 2.6347,
"step": 236000
},
{
"epoch": 1.3067958911905933,
"grad_norm": 8.12757396697998,
"learning_rate": 3.1366638220192086e-05,
"loss": 2.6362,
"step": 236500
},
{
"epoch": 1.309558673201567,
"grad_norm": 14.125542640686035,
"learning_rate": 3.131547549612494e-05,
"loss": 2.6423,
"step": 237000
},
{
"epoch": 1.3123214552125408,
"grad_norm": 7.672112941741943,
"learning_rate": 3.126431277205778e-05,
"loss": 2.6628,
"step": 237500
},
{
"epoch": 1.3150842372235145,
"grad_norm": 5.344297409057617,
"learning_rate": 3.1213150047990635e-05,
"loss": 2.6825,
"step": 238000
},
{
"epoch": 1.3178470192344884,
"grad_norm": 5.527090549468994,
"learning_rate": 3.116208964937162e-05,
"loss": 2.6757,
"step": 238500
},
{
"epoch": 1.3206098012454621,
"grad_norm": 6.49380350112915,
"learning_rate": 3.1110926925304475e-05,
"loss": 2.6798,
"step": 239000
},
{
"epoch": 1.323372583256436,
"grad_norm": 6.0890069007873535,
"learning_rate": 3.105976420123732e-05,
"loss": 2.6273,
"step": 239500
},
{
"epoch": 1.3261353652674097,
"grad_norm": 6.064700126647949,
"learning_rate": 3.1008601477170166e-05,
"loss": 2.6461,
"step": 240000
},
{
"epoch": 1.3261353652674097,
"eval_runtime": 1411.2023,
"eval_samples_per_second": 256.486,
"eval_steps_per_second": 32.061,
"step": 240000
},
{
"epoch": 1.3288981472783834,
"grad_norm": 8.562914848327637,
"learning_rate": 3.0957541078551154e-05,
"loss": 2.6699,
"step": 240500
},
{
"epoch": 1.3316609292893573,
"grad_norm": 4.0414958000183105,
"learning_rate": 3.0906378354484006e-05,
"loss": 2.7082,
"step": 241000
},
{
"epoch": 1.334423711300331,
"grad_norm": 6.873857021331787,
"learning_rate": 3.085531795586499e-05,
"loss": 2.6486,
"step": 241500
},
{
"epoch": 1.3371864933113047,
"grad_norm": 7.180528163909912,
"learning_rate": 3.080415523179784e-05,
"loss": 2.6766,
"step": 242000
},
{
"epoch": 1.3399492753222786,
"grad_norm": 3.9526586532592773,
"learning_rate": 3.0752992507730685e-05,
"loss": 2.6523,
"step": 242500
},
{
"epoch": 1.3427120573332523,
"grad_norm": 7.868597030639648,
"learning_rate": 3.070182978366354e-05,
"loss": 2.7141,
"step": 243000
},
{
"epoch": 1.345474839344226,
"grad_norm": 6.393147945404053,
"learning_rate": 3.065066705959639e-05,
"loss": 2.661,
"step": 243500
},
{
"epoch": 1.3482376213551999,
"grad_norm": 6.155392646789551,
"learning_rate": 3.059950433552924e-05,
"loss": 2.6336,
"step": 244000
},
{
"epoch": 1.3510004033661736,
"grad_norm": 5.36915922164917,
"learning_rate": 3.0548341611462086e-05,
"loss": 2.6183,
"step": 244500
},
{
"epoch": 1.3537631853771472,
"grad_norm": 8.455395698547363,
"learning_rate": 3.0497178887394938e-05,
"loss": 2.6629,
"step": 245000
},
{
"epoch": 1.3537631853771472,
"eval_runtime": 1408.8422,
"eval_samples_per_second": 256.915,
"eval_steps_per_second": 32.115,
"step": 245000
},
{
"epoch": 1.3565259673881211,
"grad_norm": 7.414444446563721,
"learning_rate": 3.044601616332779e-05,
"loss": 2.6668,
"step": 245500
},
{
"epoch": 1.3592887493990948,
"grad_norm": 5.821547031402588,
"learning_rate": 3.039495576470877e-05,
"loss": 2.6888,
"step": 246000
},
{
"epoch": 1.3620515314100687,
"grad_norm": 6.702820301055908,
"learning_rate": 3.034379304064162e-05,
"loss": 2.6647,
"step": 246500
},
{
"epoch": 1.3648143134210424,
"grad_norm": 8.23851203918457,
"learning_rate": 3.0292630316574472e-05,
"loss": 2.6668,
"step": 247000
},
{
"epoch": 1.3675770954320163,
"grad_norm": 6.016136646270752,
"learning_rate": 3.024146759250732e-05,
"loss": 2.6509,
"step": 247500
},
{
"epoch": 1.37033987744299,
"grad_norm": 6.836232662200928,
"learning_rate": 3.019040719388831e-05,
"loss": 2.6904,
"step": 248000
},
{
"epoch": 1.3731026594539637,
"grad_norm": 7.973288059234619,
"learning_rate": 3.013924446982116e-05,
"loss": 2.6805,
"step": 248500
},
{
"epoch": 1.3758654414649376,
"grad_norm": 6.736196517944336,
"learning_rate": 3.0088081745754003e-05,
"loss": 2.6456,
"step": 249000
},
{
"epoch": 1.3786282234759113,
"grad_norm": 6.223706245422363,
"learning_rate": 3.0036919021686855e-05,
"loss": 2.6705,
"step": 249500
},
{
"epoch": 1.381391005486885,
"grad_norm": 6.599213600158691,
"learning_rate": 2.9985858623067843e-05,
"loss": 2.6088,
"step": 250000
},
{
"epoch": 1.381391005486885,
"eval_runtime": 1412.4913,
"eval_samples_per_second": 256.251,
"eval_steps_per_second": 32.032,
"step": 250000
},
{
"epoch": 1.384153787497859,
"grad_norm": 5.85990571975708,
"learning_rate": 2.993469589900069e-05,
"loss": 2.6636,
"step": 250500
},
{
"epoch": 1.3869165695088326,
"grad_norm": 8.427452087402344,
"learning_rate": 2.9883533174933544e-05,
"loss": 2.641,
"step": 251000
},
{
"epoch": 1.3896793515198063,
"grad_norm": 4.338545799255371,
"learning_rate": 2.9832472776314525e-05,
"loss": 2.6391,
"step": 251500
},
{
"epoch": 1.3924421335307802,
"grad_norm": 6.209786891937256,
"learning_rate": 2.9781310052247374e-05,
"loss": 2.6357,
"step": 252000
},
{
"epoch": 1.3952049155417539,
"grad_norm": 5.6584672927856445,
"learning_rate": 2.9730147328180226e-05,
"loss": 2.6903,
"step": 252500
},
{
"epoch": 1.3979676975527278,
"grad_norm": 6.956233501434326,
"learning_rate": 2.9678984604113074e-05,
"loss": 2.6452,
"step": 253000
},
{
"epoch": 1.4007304795637014,
"grad_norm": 7.020050048828125,
"learning_rate": 2.9627821880045926e-05,
"loss": 2.707,
"step": 253500
},
{
"epoch": 1.4034932615746754,
"grad_norm": 6.412283420562744,
"learning_rate": 2.9576659155978775e-05,
"loss": 2.5976,
"step": 254000
},
{
"epoch": 1.406256043585649,
"grad_norm": 6.7848711013793945,
"learning_rate": 2.9525598757359756e-05,
"loss": 2.6319,
"step": 254500
},
{
"epoch": 1.4090188255966227,
"grad_norm": 4.951188564300537,
"learning_rate": 2.947443603329261e-05,
"loss": 2.6127,
"step": 255000
},
{
"epoch": 1.4090188255966227,
"eval_runtime": 1426.0103,
"eval_samples_per_second": 253.822,
"eval_steps_per_second": 31.728,
"step": 255000
},
{
"epoch": 1.4117816076075966,
"grad_norm": 7.517430305480957,
"learning_rate": 2.9423273309225457e-05,
"loss": 2.6338,
"step": 255500
},
{
"epoch": 1.4145443896185703,
"grad_norm": 8.3431978225708,
"learning_rate": 2.937211058515831e-05,
"loss": 2.6741,
"step": 256000
},
{
"epoch": 1.417307171629544,
"grad_norm": 6.5295867919921875,
"learning_rate": 2.9321050186539294e-05,
"loss": 2.6293,
"step": 256500
},
{
"epoch": 1.420069953640518,
"grad_norm": 7.589269638061523,
"learning_rate": 2.9269887462472146e-05,
"loss": 2.6028,
"step": 257000
},
{
"epoch": 1.4228327356514916,
"grad_norm": 6.105846881866455,
"learning_rate": 2.9218724738404994e-05,
"loss": 2.6168,
"step": 257500
},
{
"epoch": 1.4255955176624653,
"grad_norm": 5.840164661407471,
"learning_rate": 2.9167562014337846e-05,
"loss": 2.642,
"step": 258000
},
{
"epoch": 1.4283582996734392,
"grad_norm": 7.00549840927124,
"learning_rate": 2.911639929027069e-05,
"loss": 2.6812,
"step": 258500
},
{
"epoch": 1.4311210816844129,
"grad_norm": 8.726187705993652,
"learning_rate": 2.906523656620354e-05,
"loss": 2.6415,
"step": 259000
},
{
"epoch": 1.4338838636953868,
"grad_norm": 7.919028282165527,
"learning_rate": 2.9014073842136392e-05,
"loss": 2.5798,
"step": 259500
},
{
"epoch": 1.4366466457063605,
"grad_norm": 4.848925590515137,
"learning_rate": 2.896291111806924e-05,
"loss": 2.6991,
"step": 260000
},
{
"epoch": 1.4366466457063605,
"eval_runtime": 1376.0192,
"eval_samples_per_second": 263.044,
"eval_steps_per_second": 32.881,
"step": 260000
},
{
"epoch": 1.4394094277173344,
"grad_norm": 7.420814514160156,
"learning_rate": 2.8911748394002093e-05,
"loss": 2.6541,
"step": 260500
},
{
"epoch": 1.442172209728308,
"grad_norm": 7.090695381164551,
"learning_rate": 2.8860687995383078e-05,
"loss": 2.6506,
"step": 261000
},
{
"epoch": 1.4449349917392817,
"grad_norm": 6.3192338943481445,
"learning_rate": 2.880952527131593e-05,
"loss": 2.6555,
"step": 261500
},
{
"epoch": 1.4476977737502557,
"grad_norm": 5.872584819793701,
"learning_rate": 2.8758362547248778e-05,
"loss": 2.615,
"step": 262000
},
{
"epoch": 1.4504605557612293,
"grad_norm": 7.909795761108398,
"learning_rate": 2.870719982318163e-05,
"loss": 2.6214,
"step": 262500
},
{
"epoch": 1.453223337772203,
"grad_norm": 6.419271469116211,
"learning_rate": 2.8656139424562612e-05,
"loss": 2.6541,
"step": 263000
},
{
"epoch": 1.455986119783177,
"grad_norm": 8.628451347351074,
"learning_rate": 2.860497670049546e-05,
"loss": 2.6178,
"step": 263500
},
{
"epoch": 1.4587489017941506,
"grad_norm": 6.384825706481934,
"learning_rate": 2.8553813976428312e-05,
"loss": 2.6617,
"step": 264000
},
{
"epoch": 1.4615116838051243,
"grad_norm": 7.782327651977539,
"learning_rate": 2.850265125236116e-05,
"loss": 2.6056,
"step": 264500
},
{
"epoch": 1.4642744658160982,
"grad_norm": 6.750179767608643,
"learning_rate": 2.8451590853742143e-05,
"loss": 2.6685,
"step": 265000
},
{
"epoch": 1.4642744658160982,
"eval_runtime": 1431.562,
"eval_samples_per_second": 252.838,
"eval_steps_per_second": 31.605,
"step": 265000
},
{
"epoch": 1.467037247827072,
"grad_norm": 6.400202751159668,
"learning_rate": 2.8400428129674995e-05,
"loss": 2.6467,
"step": 265500
},
{
"epoch": 1.4698000298380456,
"grad_norm": 9.842098236083984,
"learning_rate": 2.8349265405607843e-05,
"loss": 2.5767,
"step": 266000
},
{
"epoch": 1.4725628118490195,
"grad_norm": 6.3928680419921875,
"learning_rate": 2.8298102681540695e-05,
"loss": 2.643,
"step": 266500
},
{
"epoch": 1.4753255938599932,
"grad_norm": 5.774998188018799,
"learning_rate": 2.8246939957473544e-05,
"loss": 2.7011,
"step": 267000
},
{
"epoch": 1.478088375870967,
"grad_norm": 7.253933429718018,
"learning_rate": 2.8195879558854525e-05,
"loss": 2.6298,
"step": 267500
},
{
"epoch": 1.4808511578819408,
"grad_norm": 6.2444658279418945,
"learning_rate": 2.8144716834787377e-05,
"loss": 2.661,
"step": 268000
},
{
"epoch": 1.4836139398929147,
"grad_norm": 7.06601619720459,
"learning_rate": 2.8093554110720226e-05,
"loss": 2.6499,
"step": 268500
},
{
"epoch": 1.4863767219038884,
"grad_norm": 8.167895317077637,
"learning_rate": 2.8042391386653078e-05,
"loss": 2.6404,
"step": 269000
},
{
"epoch": 1.489139503914862,
"grad_norm": 6.798631191253662,
"learning_rate": 2.7991330988034066e-05,
"loss": 2.6585,
"step": 269500
},
{
"epoch": 1.491902285925836,
"grad_norm": 6.69813346862793,
"learning_rate": 2.7940168263966915e-05,
"loss": 2.6658,
"step": 270000
},
{
"epoch": 1.491902285925836,
"eval_runtime": 1470.9324,
"eval_samples_per_second": 246.07,
"eval_steps_per_second": 30.759,
"step": 270000
},
{
"epoch": 1.4946650679368096,
"grad_norm": 6.37416410446167,
"learning_rate": 2.7889005539899767e-05,
"loss": 2.6653,
"step": 270500
},
{
"epoch": 1.4974278499477833,
"grad_norm": 5.052603244781494,
"learning_rate": 2.7837842815832615e-05,
"loss": 2.6822,
"step": 271000
},
{
"epoch": 1.5001906319587572,
"grad_norm": 7.3138861656188965,
"learning_rate": 2.7786782417213597e-05,
"loss": 2.637,
"step": 271500
},
{
"epoch": 1.502953413969731,
"grad_norm": 12.509490013122559,
"learning_rate": 2.773561969314645e-05,
"loss": 2.6139,
"step": 272000
},
{
"epoch": 1.5057161959807046,
"grad_norm": 8.668211936950684,
"learning_rate": 2.7684456969079297e-05,
"loss": 2.6967,
"step": 272500
},
{
"epoch": 1.5084789779916785,
"grad_norm": 6.163717269897461,
"learning_rate": 2.763339657046028e-05,
"loss": 2.6344,
"step": 273000
},
{
"epoch": 1.5112417600026524,
"grad_norm": 5.849397659301758,
"learning_rate": 2.758223384639313e-05,
"loss": 2.7212,
"step": 273500
},
{
"epoch": 1.5140045420136259,
"grad_norm": 5.386920928955078,
"learning_rate": 2.753107112232598e-05,
"loss": 2.6579,
"step": 274000
},
{
"epoch": 1.5167673240245998,
"grad_norm": 7.916058540344238,
"learning_rate": 2.747990839825883e-05,
"loss": 2.6906,
"step": 274500
},
{
"epoch": 1.5195301060355737,
"grad_norm": 5.125283241271973,
"learning_rate": 2.742874567419168e-05,
"loss": 2.6266,
"step": 275000
},
{
"epoch": 1.5195301060355737,
"eval_runtime": 1424.4614,
"eval_samples_per_second": 254.098,
"eval_steps_per_second": 31.763,
"step": 275000
},
{
"epoch": 1.5222928880465474,
"grad_norm": 8.393288612365723,
"learning_rate": 2.7377582950124532e-05,
"loss": 2.6324,
"step": 275500
},
{
"epoch": 1.525055670057521,
"grad_norm": 6.936960697174072,
"learning_rate": 2.732642022605738e-05,
"loss": 2.6751,
"step": 276000
},
{
"epoch": 1.527818452068495,
"grad_norm": 7.31864595413208,
"learning_rate": 2.7275257501990233e-05,
"loss": 2.7046,
"step": 276500
},
{
"epoch": 1.5305812340794687,
"grad_norm": 7.4950151443481445,
"learning_rate": 2.7224197103371214e-05,
"loss": 2.6602,
"step": 277000
},
{
"epoch": 1.5333440160904424,
"grad_norm": 8.419631004333496,
"learning_rate": 2.7173034379304063e-05,
"loss": 2.6288,
"step": 277500
},
{
"epoch": 1.5361067981014163,
"grad_norm": 4.573643684387207,
"learning_rate": 2.7121871655236915e-05,
"loss": 2.6075,
"step": 278000
},
{
"epoch": 1.53886958011239,
"grad_norm": 6.746376991271973,
"learning_rate": 2.7070708931169763e-05,
"loss": 2.6455,
"step": 278500
},
{
"epoch": 1.5416323621233636,
"grad_norm": 7.2786478996276855,
"learning_rate": 2.7019546207102615e-05,
"loss": 2.6467,
"step": 279000
},
{
"epoch": 1.5443951441343375,
"grad_norm": 8.315926551818848,
"learning_rate": 2.69684858084836e-05,
"loss": 2.6329,
"step": 279500
},
{
"epoch": 1.5471579261453114,
"grad_norm": 6.038636207580566,
"learning_rate": 2.6917323084416452e-05,
"loss": 2.6422,
"step": 280000
},
{
"epoch": 1.5471579261453114,
"eval_runtime": 1397.1141,
"eval_samples_per_second": 259.072,
"eval_steps_per_second": 32.385,
"step": 280000
},
{
"epoch": 1.549920708156285,
"grad_norm": 5.000608921051025,
"learning_rate": 2.68661603603493e-05,
"loss": 2.5991,
"step": 280500
},
{
"epoch": 1.5526834901672588,
"grad_norm": 5.943995952606201,
"learning_rate": 2.6814997636282153e-05,
"loss": 2.5922,
"step": 281000
},
{
"epoch": 1.5554462721782327,
"grad_norm": 8.255182266235352,
"learning_rate": 2.6763834912214998e-05,
"loss": 2.6483,
"step": 281500
},
{
"epoch": 1.5582090541892064,
"grad_norm": 8.202108383178711,
"learning_rate": 2.6712774513595983e-05,
"loss": 2.6235,
"step": 282000
},
{
"epoch": 1.56097183620018,
"grad_norm": 5.840571880340576,
"learning_rate": 2.6661611789528835e-05,
"loss": 2.652,
"step": 282500
},
{
"epoch": 1.563734618211154,
"grad_norm": 5.939957141876221,
"learning_rate": 2.6610449065461684e-05,
"loss": 2.6501,
"step": 283000
},
{
"epoch": 1.5664974002221277,
"grad_norm": 5.314937114715576,
"learning_rate": 2.6559286341394536e-05,
"loss": 2.656,
"step": 283500
},
{
"epoch": 1.5692601822331014,
"grad_norm": 6.23870849609375,
"learning_rate": 2.6508123617327384e-05,
"loss": 2.6776,
"step": 284000
},
{
"epoch": 1.5720229642440753,
"grad_norm": 6.62495231628418,
"learning_rate": 2.6457063218708366e-05,
"loss": 2.6351,
"step": 284500
},
{
"epoch": 1.574785746255049,
"grad_norm": 6.557297706604004,
"learning_rate": 2.6406002820089354e-05,
"loss": 2.6637,
"step": 285000
},
{
"epoch": 1.574785746255049,
"eval_runtime": 1399.3188,
"eval_samples_per_second": 258.664,
"eval_steps_per_second": 32.334,
"step": 285000
},
{
"epoch": 1.5775485282660227,
"grad_norm": 7.950584411621094,
"learning_rate": 2.6354840096022203e-05,
"loss": 2.6292,
"step": 285500
},
{
"epoch": 1.5803113102769966,
"grad_norm": 6.725704193115234,
"learning_rate": 2.6303677371955055e-05,
"loss": 2.64,
"step": 286000
},
{
"epoch": 1.5830740922879702,
"grad_norm": 8.884140014648438,
"learning_rate": 2.62525146478879e-05,
"loss": 2.6596,
"step": 286500
},
{
"epoch": 1.585836874298944,
"grad_norm": 6.812872409820557,
"learning_rate": 2.620135192382075e-05,
"loss": 2.6034,
"step": 287000
},
{
"epoch": 1.5885996563099178,
"grad_norm": 7.91174840927124,
"learning_rate": 2.61501891997536e-05,
"loss": 2.6564,
"step": 287500
},
{
"epoch": 1.5913624383208917,
"grad_norm": 5.403963565826416,
"learning_rate": 2.6099128801134585e-05,
"loss": 2.6303,
"step": 288000
},
{
"epoch": 1.5941252203318652,
"grad_norm": 7.750992774963379,
"learning_rate": 2.6047966077067437e-05,
"loss": 2.6337,
"step": 288500
},
{
"epoch": 1.5968880023428391,
"grad_norm": 8.077462196350098,
"learning_rate": 2.5996803353000286e-05,
"loss": 2.6428,
"step": 289000
},
{
"epoch": 1.599650784353813,
"grad_norm": 5.293886184692383,
"learning_rate": 2.5945640628933138e-05,
"loss": 2.6432,
"step": 289500
},
{
"epoch": 1.6024135663647867,
"grad_norm": 6.613586902618408,
"learning_rate": 2.5894477904865986e-05,
"loss": 2.618,
"step": 290000
},
{
"epoch": 1.6024135663647867,
"eval_runtime": 1431.565,
"eval_samples_per_second": 252.837,
"eval_steps_per_second": 31.605,
"step": 290000
},
{
"epoch": 1.6051763483757604,
"grad_norm": 7.696370601654053,
"learning_rate": 2.584331518079884e-05,
"loss": 2.6043,
"step": 290500
},
{
"epoch": 1.6079391303867343,
"grad_norm": 14.686103820800781,
"learning_rate": 2.579225478217982e-05,
"loss": 2.6113,
"step": 291000
},
{
"epoch": 1.610701912397708,
"grad_norm": 7.173743724822998,
"learning_rate": 2.574109205811267e-05,
"loss": 2.6496,
"step": 291500
},
{
"epoch": 1.6134646944086817,
"grad_norm": 5.398017883300781,
"learning_rate": 2.568992933404552e-05,
"loss": 2.6616,
"step": 292000
},
{
"epoch": 1.6162274764196556,
"grad_norm": 4.810672760009766,
"learning_rate": 2.563876660997837e-05,
"loss": 2.6557,
"step": 292500
},
{
"epoch": 1.6189902584306293,
"grad_norm": 5.541525840759277,
"learning_rate": 2.558760388591122e-05,
"loss": 2.6657,
"step": 293000
},
{
"epoch": 1.621753040441603,
"grad_norm": 6.207642555236816,
"learning_rate": 2.5536543487292203e-05,
"loss": 2.6438,
"step": 293500
},
{
"epoch": 1.6245158224525769,
"grad_norm": 5.112069129943848,
"learning_rate": 2.548538076322505e-05,
"loss": 2.617,
"step": 294000
},
{
"epoch": 1.6272786044635508,
"grad_norm": 5.147789001464844,
"learning_rate": 2.5434218039157903e-05,
"loss": 2.6407,
"step": 294500
},
{
"epoch": 1.6300413864745242,
"grad_norm": 7.100889205932617,
"learning_rate": 2.5383055315090752e-05,
"loss": 2.6411,
"step": 295000
},
{
"epoch": 1.6300413864745242,
"eval_runtime": 1371.2042,
"eval_samples_per_second": 263.967,
"eval_steps_per_second": 32.997,
"step": 295000
},
{
"epoch": 1.6328041684854981,
"grad_norm": 8.297256469726562,
"learning_rate": 2.5331892591023604e-05,
"loss": 2.6792,
"step": 295500
},
{
"epoch": 1.635566950496472,
"grad_norm": 7.450379371643066,
"learning_rate": 2.5280729866956452e-05,
"loss": 2.6567,
"step": 296000
},
{
"epoch": 1.6383297325074457,
"grad_norm": 4.418615818023682,
"learning_rate": 2.5229567142889304e-05,
"loss": 2.5833,
"step": 296500
},
{
"epoch": 1.6410925145184194,
"grad_norm": 8.853099822998047,
"learning_rate": 2.5178404418822153e-05,
"loss": 2.6617,
"step": 297000
},
{
"epoch": 1.6438552965293933,
"grad_norm": 6.378116607666016,
"learning_rate": 2.5127344020203138e-05,
"loss": 2.6225,
"step": 297500
},
{
"epoch": 1.646618078540367,
"grad_norm": 9.61796760559082,
"learning_rate": 2.5076181296135987e-05,
"loss": 2.7009,
"step": 298000
},
{
"epoch": 1.6493808605513407,
"grad_norm": 6.160669803619385,
"learning_rate": 2.502501857206884e-05,
"loss": 2.667,
"step": 298500
},
{
"epoch": 1.6521436425623146,
"grad_norm": 5.313681602478027,
"learning_rate": 2.497395817344982e-05,
"loss": 2.5937,
"step": 299000
},
{
"epoch": 1.6549064245732883,
"grad_norm": 6.531844139099121,
"learning_rate": 2.4922795449382672e-05,
"loss": 2.6204,
"step": 299500
},
{
"epoch": 1.657669206584262,
"grad_norm": 7.415525436401367,
"learning_rate": 2.487163272531552e-05,
"loss": 2.6015,
"step": 300000
},
{
"epoch": 1.657669206584262,
"eval_runtime": 1437.4291,
"eval_samples_per_second": 251.806,
"eval_steps_per_second": 31.476,
"step": 300000
},
{
"epoch": 1.6604319885952359,
"grad_norm": 7.35875129699707,
"learning_rate": 2.4820470001248373e-05,
"loss": 2.611,
"step": 300500
},
{
"epoch": 1.6631947706062098,
"grad_norm": 6.453457355499268,
"learning_rate": 2.476930727718122e-05,
"loss": 2.6368,
"step": 301000
},
{
"epoch": 1.6659575526171833,
"grad_norm": 5.64149808883667,
"learning_rate": 2.4718144553114073e-05,
"loss": 2.6352,
"step": 301500
},
{
"epoch": 1.6687203346281572,
"grad_norm": 6.376221656799316,
"learning_rate": 2.4666981829046922e-05,
"loss": 2.6224,
"step": 302000
},
{
"epoch": 1.671483116639131,
"grad_norm": 7.666605472564697,
"learning_rate": 2.461581910497977e-05,
"loss": 2.6664,
"step": 302500
},
{
"epoch": 1.6742458986501048,
"grad_norm": 5.104877471923828,
"learning_rate": 2.4564758706360755e-05,
"loss": 2.5975,
"step": 303000
},
{
"epoch": 1.6770086806610784,
"grad_norm": 14.055898666381836,
"learning_rate": 2.4513595982293604e-05,
"loss": 2.628,
"step": 303500
},
{
"epoch": 1.6797714626720524,
"grad_norm": 9.033441543579102,
"learning_rate": 2.4462433258226456e-05,
"loss": 2.6281,
"step": 304000
},
{
"epoch": 1.682534244683026,
"grad_norm": 12.3050537109375,
"learning_rate": 2.4411270534159304e-05,
"loss": 2.6701,
"step": 304500
},
{
"epoch": 1.6852970266939997,
"grad_norm": 8.409795761108398,
"learning_rate": 2.436021013554029e-05,
"loss": 2.6023,
"step": 305000
},
{
"epoch": 1.6852970266939997,
"eval_runtime": 1387.5318,
"eval_samples_per_second": 260.861,
"eval_steps_per_second": 32.608,
"step": 305000
},
{
"epoch": 1.6880598087049736,
"grad_norm": 5.9802937507629395,
"learning_rate": 2.4309149736921274e-05,
"loss": 2.665,
"step": 305500
},
{
"epoch": 1.6908225907159473,
"grad_norm": 6.1783270835876465,
"learning_rate": 2.4257987012854126e-05,
"loss": 2.6356,
"step": 306000
},
{
"epoch": 1.693585372726921,
"grad_norm": 6.058241367340088,
"learning_rate": 2.4206824288786975e-05,
"loss": 2.6219,
"step": 306500
},
{
"epoch": 1.696348154737895,
"grad_norm": 6.79514741897583,
"learning_rate": 2.4155661564719824e-05,
"loss": 2.6272,
"step": 307000
},
{
"epoch": 1.6991109367488686,
"grad_norm": 9.22230052947998,
"learning_rate": 2.4104498840652672e-05,
"loss": 2.5919,
"step": 307500
},
{
"epoch": 1.7018737187598423,
"grad_norm": 5.048295021057129,
"learning_rate": 2.4053336116585524e-05,
"loss": 2.6379,
"step": 308000
},
{
"epoch": 1.7046365007708162,
"grad_norm": 7.282494068145752,
"learning_rate": 2.4002173392518373e-05,
"loss": 2.6391,
"step": 308500
},
{
"epoch": 1.70739928278179,
"grad_norm": 6.831259727478027,
"learning_rate": 2.3951010668451225e-05,
"loss": 2.6218,
"step": 309000
},
{
"epoch": 1.7101620647927636,
"grad_norm": 6.001838207244873,
"learning_rate": 2.389995026983221e-05,
"loss": 2.6372,
"step": 309500
},
{
"epoch": 1.7129248468037375,
"grad_norm": 5.721564769744873,
"learning_rate": 2.3848787545765058e-05,
"loss": 2.7011,
"step": 310000
},
{
"epoch": 1.7129248468037375,
"eval_runtime": 1416.3525,
"eval_samples_per_second": 255.553,
"eval_steps_per_second": 31.945,
"step": 310000
},
{
"epoch": 1.7156876288147114,
"grad_norm": 7.709352970123291,
"learning_rate": 2.379762482169791e-05,
"loss": 2.6193,
"step": 310500
},
{
"epoch": 1.718450410825685,
"grad_norm": 7.23681640625,
"learning_rate": 2.374646209763076e-05,
"loss": 2.6307,
"step": 311000
},
{
"epoch": 1.7212131928366587,
"grad_norm": 6.505390167236328,
"learning_rate": 2.3695401699011744e-05,
"loss": 2.6409,
"step": 311500
},
{
"epoch": 1.7239759748476327,
"grad_norm": 8.059307098388672,
"learning_rate": 2.3644238974944592e-05,
"loss": 2.6348,
"step": 312000
},
{
"epoch": 1.7267387568586063,
"grad_norm": 7.6500749588012695,
"learning_rate": 2.359307625087744e-05,
"loss": 2.6435,
"step": 312500
},
{
"epoch": 1.72950153886958,
"grad_norm": 9.657527923583984,
"learning_rate": 2.3541913526810293e-05,
"loss": 2.661,
"step": 313000
},
{
"epoch": 1.732264320880554,
"grad_norm": 5.21886682510376,
"learning_rate": 2.3490853128191275e-05,
"loss": 2.6794,
"step": 313500
},
{
"epoch": 1.7350271028915276,
"grad_norm": 6.33572244644165,
"learning_rate": 2.3439690404124126e-05,
"loss": 2.6372,
"step": 314000
},
{
"epoch": 1.7377898849025013,
"grad_norm": 6.692564010620117,
"learning_rate": 2.3388527680056975e-05,
"loss": 2.6583,
"step": 314500
},
{
"epoch": 1.7405526669134752,
"grad_norm": 5.055424690246582,
"learning_rate": 2.3337364955989827e-05,
"loss": 2.6386,
"step": 315000
},
{
"epoch": 1.7405526669134752,
"eval_runtime": 1401.7623,
"eval_samples_per_second": 258.213,
"eval_steps_per_second": 32.277,
"step": 315000
},
{
"epoch": 1.7433154489244491,
"grad_norm": 5.959291934967041,
"learning_rate": 2.3286202231922676e-05,
"loss": 2.6501,
"step": 315500
},
{
"epoch": 1.7460782309354226,
"grad_norm": 7.027371406555176,
"learning_rate": 2.323514183330366e-05,
"loss": 2.6522,
"step": 316000
},
{
"epoch": 1.7488410129463965,
"grad_norm": 6.8300557136535645,
"learning_rate": 2.318397910923651e-05,
"loss": 2.6386,
"step": 316500
},
{
"epoch": 1.7516037949573704,
"grad_norm": 5.422798156738281,
"learning_rate": 2.3132816385169358e-05,
"loss": 2.6825,
"step": 317000
},
{
"epoch": 1.754366576968344,
"grad_norm": 7.326968669891357,
"learning_rate": 2.308165366110221e-05,
"loss": 2.628,
"step": 317500
},
{
"epoch": 1.7571293589793178,
"grad_norm": 6.498944282531738,
"learning_rate": 2.3030490937035058e-05,
"loss": 2.6193,
"step": 318000
},
{
"epoch": 1.7598921409902917,
"grad_norm": 7.064229965209961,
"learning_rate": 2.297953286386418e-05,
"loss": 2.6416,
"step": 318500
},
{
"epoch": 1.7626549230012654,
"grad_norm": 6.315282344818115,
"learning_rate": 2.2928370139797028e-05,
"loss": 2.6887,
"step": 319000
},
{
"epoch": 1.765417705012239,
"grad_norm": 8.035995483398438,
"learning_rate": 2.287720741572988e-05,
"loss": 2.6238,
"step": 319500
},
{
"epoch": 1.768180487023213,
"grad_norm": 7.513897895812988,
"learning_rate": 2.282604469166273e-05,
"loss": 2.626,
"step": 320000
},
{
"epoch": 1.768180487023213,
"eval_runtime": 1385.5831,
"eval_samples_per_second": 261.228,
"eval_steps_per_second": 32.654,
"step": 320000
},
{
"epoch": 1.7709432690341866,
"grad_norm": 6.0088090896606445,
"learning_rate": 2.277488196759558e-05,
"loss": 2.6455,
"step": 320500
},
{
"epoch": 1.7737060510451603,
"grad_norm": 4.775638103485107,
"learning_rate": 2.2723719243528426e-05,
"loss": 2.6581,
"step": 321000
},
{
"epoch": 1.7764688330561342,
"grad_norm": 5.797138690948486,
"learning_rate": 2.2672556519461278e-05,
"loss": 2.6307,
"step": 321500
},
{
"epoch": 1.7792316150671081,
"grad_norm": 6.206060886383057,
"learning_rate": 2.2621393795394127e-05,
"loss": 2.6399,
"step": 322000
},
{
"epoch": 1.7819943970780816,
"grad_norm": 6.536865711212158,
"learning_rate": 2.257023107132698e-05,
"loss": 2.6514,
"step": 322500
},
{
"epoch": 1.7847571790890555,
"grad_norm": 5.105484962463379,
"learning_rate": 2.2519170672707964e-05,
"loss": 2.6481,
"step": 323000
},
{
"epoch": 1.7875199611000294,
"grad_norm": 5.873786926269531,
"learning_rate": 2.2468007948640812e-05,
"loss": 2.6467,
"step": 323500
},
{
"epoch": 1.790282743111003,
"grad_norm": 5.885590553283691,
"learning_rate": 2.2416947550021797e-05,
"loss": 2.6207,
"step": 324000
},
{
"epoch": 1.7930455251219768,
"grad_norm": 9.117544174194336,
"learning_rate": 2.2365784825954646e-05,
"loss": 2.6161,
"step": 324500
},
{
"epoch": 1.7958083071329507,
"grad_norm": 7.810193061828613,
"learning_rate": 2.2314622101887498e-05,
"loss": 2.6701,
"step": 325000
},
{
"epoch": 1.7958083071329507,
"eval_runtime": 1431.4837,
"eval_samples_per_second": 252.852,
"eval_steps_per_second": 31.607,
"step": 325000
},
{
"epoch": 1.7985710891439244,
"grad_norm": 5.142136573791504,
"learning_rate": 2.2263459377820346e-05,
"loss": 2.6457,
"step": 325500
},
{
"epoch": 1.801333871154898,
"grad_norm": 6.691473960876465,
"learning_rate": 2.2212398979201328e-05,
"loss": 2.6182,
"step": 326000
},
{
"epoch": 1.804096653165872,
"grad_norm": 10.478597640991211,
"learning_rate": 2.216123625513418e-05,
"loss": 2.6797,
"step": 326500
},
{
"epoch": 1.8068594351768457,
"grad_norm": 6.4801554679870605,
"learning_rate": 2.211007353106703e-05,
"loss": 2.6638,
"step": 327000
},
{
"epoch": 1.8096222171878193,
"grad_norm": 5.350027561187744,
"learning_rate": 2.205891080699988e-05,
"loss": 2.6383,
"step": 327500
},
{
"epoch": 1.8123849991987933,
"grad_norm": 7.608794689178467,
"learning_rate": 2.200774808293273e-05,
"loss": 2.6547,
"step": 328000
},
{
"epoch": 1.815147781209767,
"grad_norm": 7.434188365936279,
"learning_rate": 2.195658535886558e-05,
"loss": 2.6311,
"step": 328500
},
{
"epoch": 1.8179105632207406,
"grad_norm": 5.700359344482422,
"learning_rate": 2.190542263479843e-05,
"loss": 2.641,
"step": 329000
},
{
"epoch": 1.8206733452317145,
"grad_norm": 5.555663585662842,
"learning_rate": 2.185425991073128e-05,
"loss": 2.6202,
"step": 329500
},
{
"epoch": 1.8234361272426884,
"grad_norm": 6.527945518493652,
"learning_rate": 2.180309718666413e-05,
"loss": 2.6888,
"step": 330000
},
{
"epoch": 1.8234361272426884,
"eval_runtime": 1420.615,
"eval_samples_per_second": 254.786,
"eval_steps_per_second": 31.849,
"step": 330000
},
{
"epoch": 1.826198909253662,
"grad_norm": 8.153132438659668,
"learning_rate": 2.175193446259698e-05,
"loss": 2.6267,
"step": 330500
},
{
"epoch": 1.8289616912646358,
"grad_norm": 5.93485164642334,
"learning_rate": 2.170077173852983e-05,
"loss": 2.642,
"step": 331000
},
{
"epoch": 1.8317244732756097,
"grad_norm": 7.894295692443848,
"learning_rate": 2.164960901446268e-05,
"loss": 2.6249,
"step": 331500
},
{
"epoch": 1.8344872552865834,
"grad_norm": 7.009608268737793,
"learning_rate": 2.159844629039553e-05,
"loss": 2.6824,
"step": 332000
},
{
"epoch": 1.837250037297557,
"grad_norm": 23.06879997253418,
"learning_rate": 2.1547385891776513e-05,
"loss": 2.6113,
"step": 332500
},
{
"epoch": 1.840012819308531,
"grad_norm": 5.4768290519714355,
"learning_rate": 2.1496223167709365e-05,
"loss": 2.6372,
"step": 333000
},
{
"epoch": 1.8427756013195047,
"grad_norm": 5.850235939025879,
"learning_rate": 2.1445060443642213e-05,
"loss": 2.6308,
"step": 333500
},
{
"epoch": 1.8455383833304784,
"grad_norm": 6.897058963775635,
"learning_rate": 2.1393897719575065e-05,
"loss": 2.6435,
"step": 334000
},
{
"epoch": 1.8483011653414523,
"grad_norm": 7.006948947906494,
"learning_rate": 2.134283732095605e-05,
"loss": 2.6398,
"step": 334500
},
{
"epoch": 1.851063947352426,
"grad_norm": 5.789132595062256,
"learning_rate": 2.1291674596888895e-05,
"loss": 2.6249,
"step": 335000
},
{
"epoch": 1.851063947352426,
"eval_runtime": 1441.3667,
"eval_samples_per_second": 251.118,
"eval_steps_per_second": 31.39,
"step": 335000
},
{
"epoch": 1.8538267293633997,
"grad_norm": 7.754148006439209,
"learning_rate": 2.1240511872821747e-05,
"loss": 2.6279,
"step": 335500
},
{
"epoch": 1.8565895113743736,
"grad_norm": 5.5116071701049805,
"learning_rate": 2.1189349148754596e-05,
"loss": 2.6443,
"step": 336000
},
{
"epoch": 1.8593522933853475,
"grad_norm": 7.665276050567627,
"learning_rate": 2.113828875013558e-05,
"loss": 2.5898,
"step": 336500
},
{
"epoch": 1.862115075396321,
"grad_norm": 6.607998371124268,
"learning_rate": 2.1087126026068433e-05,
"loss": 2.6224,
"step": 337000
},
{
"epoch": 1.8648778574072948,
"grad_norm": 7.938060760498047,
"learning_rate": 2.103596330200128e-05,
"loss": 2.6023,
"step": 337500
},
{
"epoch": 1.8676406394182687,
"grad_norm": 5.741148948669434,
"learning_rate": 2.0984800577934133e-05,
"loss": 2.6425,
"step": 338000
},
{
"epoch": 1.8704034214292424,
"grad_norm": 7.50128173828125,
"learning_rate": 2.093374017931512e-05,
"loss": 2.6474,
"step": 338500
},
{
"epoch": 1.8731662034402161,
"grad_norm": 5.097824573516846,
"learning_rate": 2.0882577455247967e-05,
"loss": 2.682,
"step": 339000
},
{
"epoch": 1.87592898545119,
"grad_norm": 7.523733139038086,
"learning_rate": 2.0831414731180816e-05,
"loss": 2.6237,
"step": 339500
},
{
"epoch": 1.8786917674621637,
"grad_norm": 10.524862289428711,
"learning_rate": 2.0780252007113664e-05,
"loss": 2.6293,
"step": 340000
},
{
"epoch": 1.8786917674621637,
"eval_runtime": 1428.3224,
"eval_samples_per_second": 253.411,
"eval_steps_per_second": 31.677,
"step": 340000
},
{
"epoch": 1.8814545494731374,
"grad_norm": 5.725772380828857,
"learning_rate": 2.072919160849465e-05,
"loss": 2.6238,
"step": 340500
},
{
"epoch": 1.8842173314841113,
"grad_norm": 6.34156608581543,
"learning_rate": 2.06780288844275e-05,
"loss": 2.5932,
"step": 341000
},
{
"epoch": 1.886980113495085,
"grad_norm": 9.06069278717041,
"learning_rate": 2.062686616036035e-05,
"loss": 2.6577,
"step": 341500
},
{
"epoch": 1.8897428955060587,
"grad_norm": 7.7342329025268555,
"learning_rate": 2.05757034362932e-05,
"loss": 2.6486,
"step": 342000
},
{
"epoch": 1.8925056775170326,
"grad_norm": 7.23144006729126,
"learning_rate": 2.052454071222605e-05,
"loss": 2.6266,
"step": 342500
},
{
"epoch": 1.8952684595280063,
"grad_norm": 6.990833759307861,
"learning_rate": 2.0473480313607035e-05,
"loss": 2.6279,
"step": 343000
},
{
"epoch": 1.89803124153898,
"grad_norm": 9.42507553100586,
"learning_rate": 2.0422317589539884e-05,
"loss": 2.61,
"step": 343500
},
{
"epoch": 1.9007940235499539,
"grad_norm": 4.919162750244141,
"learning_rate": 2.037125719092087e-05,
"loss": 2.6752,
"step": 344000
},
{
"epoch": 1.9035568055609278,
"grad_norm": 6.697198390960693,
"learning_rate": 2.0320094466853717e-05,
"loss": 2.6724,
"step": 344500
},
{
"epoch": 1.9063195875719015,
"grad_norm": 8.001893043518066,
"learning_rate": 2.026893174278657e-05,
"loss": 2.6417,
"step": 345000
},
{
"epoch": 1.9063195875719015,
"eval_runtime": 1396.1909,
"eval_samples_per_second": 259.243,
"eval_steps_per_second": 32.406,
"step": 345000
},
{
"epoch": 1.9090823695828751,
"grad_norm": 6.076798915863037,
"learning_rate": 2.0217769018719418e-05,
"loss": 2.6188,
"step": 345500
},
{
"epoch": 1.911845151593849,
"grad_norm": 6.7825398445129395,
"learning_rate": 2.016660629465227e-05,
"loss": 2.6218,
"step": 346000
},
{
"epoch": 1.9146079336048227,
"grad_norm": 7.529403209686279,
"learning_rate": 2.011544357058512e-05,
"loss": 2.6696,
"step": 346500
},
{
"epoch": 1.9173707156157964,
"grad_norm": 6.596738815307617,
"learning_rate": 2.006428084651797e-05,
"loss": 2.5924,
"step": 347000
},
{
"epoch": 1.9201334976267703,
"grad_norm": 8.123401641845703,
"learning_rate": 2.001311812245082e-05,
"loss": 2.6367,
"step": 347500
},
{
"epoch": 1.922896279637744,
"grad_norm": 7.292053699493408,
"learning_rate": 1.996195539838367e-05,
"loss": 2.6255,
"step": 348000
},
{
"epoch": 1.9256590616487177,
"grad_norm": 8.276845932006836,
"learning_rate": 1.9910894999764653e-05,
"loss": 2.6787,
"step": 348500
},
{
"epoch": 1.9284218436596916,
"grad_norm": 7.750123023986816,
"learning_rate": 1.98597322756975e-05,
"loss": 2.6608,
"step": 349000
},
{
"epoch": 1.9311846256706653,
"grad_norm": 5.726837158203125,
"learning_rate": 1.9808569551630353e-05,
"loss": 2.6488,
"step": 349500
},
{
"epoch": 1.933947407681639,
"grad_norm": 6.344818592071533,
"learning_rate": 1.9757406827563202e-05,
"loss": 2.6391,
"step": 350000
},
{
"epoch": 1.933947407681639,
"eval_runtime": 1416.31,
"eval_samples_per_second": 255.561,
"eval_steps_per_second": 31.946,
"step": 350000
},
{
"epoch": 1.9367101896926129,
"grad_norm": 6.042297840118408,
"learning_rate": 1.9706346428944187e-05,
"loss": 2.6575,
"step": 350500
},
{
"epoch": 1.9394729717035868,
"grad_norm": 7.41777229309082,
"learning_rate": 1.9655183704877035e-05,
"loss": 2.6125,
"step": 351000
},
{
"epoch": 1.9422357537145603,
"grad_norm": 7.600329875946045,
"learning_rate": 1.960412330625802e-05,
"loss": 2.5912,
"step": 351500
},
{
"epoch": 1.9449985357255342,
"grad_norm": 8.74294376373291,
"learning_rate": 1.9552960582190872e-05,
"loss": 2.6447,
"step": 352000
},
{
"epoch": 1.947761317736508,
"grad_norm": 7.785200595855713,
"learning_rate": 1.950179785812372e-05,
"loss": 2.6272,
"step": 352500
},
{
"epoch": 1.9505240997474818,
"grad_norm": 5.1395263671875,
"learning_rate": 1.9450635134056573e-05,
"loss": 2.6262,
"step": 353000
},
{
"epoch": 1.9532868817584554,
"grad_norm": 6.273059368133545,
"learning_rate": 1.9399472409989418e-05,
"loss": 2.6474,
"step": 353500
},
{
"epoch": 1.9560496637694293,
"grad_norm": 7.929372787475586,
"learning_rate": 1.9348412011370403e-05,
"loss": 2.6075,
"step": 354000
},
{
"epoch": 1.958812445780403,
"grad_norm": 5.680710792541504,
"learning_rate": 1.9297249287303255e-05,
"loss": 2.6348,
"step": 354500
},
{
"epoch": 1.9615752277913767,
"grad_norm": 7.282249450683594,
"learning_rate": 1.9246086563236104e-05,
"loss": 2.6393,
"step": 355000
},
{
"epoch": 1.9615752277913767,
"eval_runtime": 1435.1937,
"eval_samples_per_second": 252.198,
"eval_steps_per_second": 31.525,
"step": 355000
},
{
"epoch": 1.9643380098023506,
"grad_norm": 7.032031536102295,
"learning_rate": 1.9194923839168956e-05,
"loss": 2.6573,
"step": 355500
},
{
"epoch": 1.9671007918133243,
"grad_norm": 7.789410591125488,
"learning_rate": 1.9143761115101804e-05,
"loss": 2.6282,
"step": 356000
},
{
"epoch": 1.969863573824298,
"grad_norm": 7.624570369720459,
"learning_rate": 1.9092598391034656e-05,
"loss": 2.6147,
"step": 356500
},
{
"epoch": 1.972626355835272,
"grad_norm": 7.583735942840576,
"learning_rate": 1.9041435666967505e-05,
"loss": 2.6087,
"step": 357000
},
{
"epoch": 1.9753891378462458,
"grad_norm": 7.545061111450195,
"learning_rate": 1.8990272942900357e-05,
"loss": 2.6363,
"step": 357500
},
{
"epoch": 1.9781519198572193,
"grad_norm": 8.852106094360352,
"learning_rate": 1.8939110218833202e-05,
"loss": 2.6375,
"step": 358000
},
{
"epoch": 1.9809147018681932,
"grad_norm": 9.523889541625977,
"learning_rate": 1.8888049820214187e-05,
"loss": 2.6647,
"step": 358500
},
{
"epoch": 1.983677483879167,
"grad_norm": 5.024425029754639,
"learning_rate": 1.883688709614704e-05,
"loss": 2.6493,
"step": 359000
},
{
"epoch": 1.9864402658901408,
"grad_norm": 6.810407638549805,
"learning_rate": 1.8785724372079887e-05,
"loss": 2.6589,
"step": 359500
},
{
"epoch": 1.9892030479011145,
"grad_norm": 7.492327690124512,
"learning_rate": 1.873456164801274e-05,
"loss": 2.6184,
"step": 360000
},
{
"epoch": 1.9892030479011145,
"eval_runtime": 1381.387,
"eval_samples_per_second": 262.021,
"eval_steps_per_second": 32.753,
"step": 360000
},
{
"epoch": 1.9919658299120884,
"grad_norm": 7.694727420806885,
"learning_rate": 1.8683501249393724e-05,
"loss": 2.6087,
"step": 360500
},
{
"epoch": 1.994728611923062,
"grad_norm": 6.121093273162842,
"learning_rate": 1.8632338525326573e-05,
"loss": 2.6543,
"step": 361000
},
{
"epoch": 1.9974913939340357,
"grad_norm": 11.24258041381836,
"learning_rate": 1.8581175801259425e-05,
"loss": 2.5845,
"step": 361500
},
{
"epoch": 2.0002541759450096,
"grad_norm": 6.372257709503174,
"learning_rate": 1.8530013077192273e-05,
"loss": 2.6031,
"step": 362000
},
{
"epoch": 2.0030169579559836,
"grad_norm": 6.463737964630127,
"learning_rate": 1.847895267857326e-05,
"loss": 2.6096,
"step": 362500
},
{
"epoch": 2.005779739966957,
"grad_norm": 5.803626537322998,
"learning_rate": 1.8427789954506107e-05,
"loss": 2.6181,
"step": 363000
},
{
"epoch": 2.008542521977931,
"grad_norm": 6.860798358917236,
"learning_rate": 1.8376627230438956e-05,
"loss": 2.6372,
"step": 363500
},
{
"epoch": 2.011305303988905,
"grad_norm": 6.21894645690918,
"learning_rate": 1.832556683181994e-05,
"loss": 2.5802,
"step": 364000
},
{
"epoch": 2.0140680859998783,
"grad_norm": 5.939208507537842,
"learning_rate": 1.827440410775279e-05,
"loss": 2.6489,
"step": 364500
},
{
"epoch": 2.016830868010852,
"grad_norm": 11.368240356445312,
"learning_rate": 1.822324138368564e-05,
"loss": 2.665,
"step": 365000
},
{
"epoch": 2.016830868010852,
"eval_runtime": 1404.9151,
"eval_samples_per_second": 257.633,
"eval_steps_per_second": 32.205,
"step": 365000
},
{
"epoch": 2.019593650021826,
"grad_norm": 6.604458808898926,
"learning_rate": 1.817207865961849e-05,
"loss": 2.6483,
"step": 365500
},
{
"epoch": 2.0223564320327996,
"grad_norm": 9.250690460205078,
"learning_rate": 1.812091593555134e-05,
"loss": 2.5957,
"step": 366000
},
{
"epoch": 2.0251192140437735,
"grad_norm": 4.720149517059326,
"learning_rate": 1.806975321148419e-05,
"loss": 2.6207,
"step": 366500
},
{
"epoch": 2.0278819960547474,
"grad_norm": 4.749586582183838,
"learning_rate": 1.801859048741704e-05,
"loss": 2.5841,
"step": 367000
},
{
"epoch": 2.030644778065721,
"grad_norm": 9.431694984436035,
"learning_rate": 1.796742776334989e-05,
"loss": 2.645,
"step": 367500
},
{
"epoch": 2.0334075600766948,
"grad_norm": 9.072124481201172,
"learning_rate": 1.7916367364730872e-05,
"loss": 2.6951,
"step": 368000
},
{
"epoch": 2.0361703420876687,
"grad_norm": 5.056208610534668,
"learning_rate": 1.7865204640663724e-05,
"loss": 2.5918,
"step": 368500
},
{
"epoch": 2.0389331240986426,
"grad_norm": 6.752665996551514,
"learning_rate": 1.7814041916596573e-05,
"loss": 2.6602,
"step": 369000
},
{
"epoch": 2.041695906109616,
"grad_norm": 10.628358840942383,
"learning_rate": 1.7762879192529425e-05,
"loss": 2.6218,
"step": 369500
},
{
"epoch": 2.04445868812059,
"grad_norm": 6.070361614227295,
"learning_rate": 1.7711716468462274e-05,
"loss": 2.7051,
"step": 370000
},
{
"epoch": 2.04445868812059,
"eval_runtime": 1429.0226,
"eval_samples_per_second": 253.287,
"eval_steps_per_second": 31.662,
"step": 370000
},
{
"epoch": 2.047221470131564,
"grad_norm": 7.342000961303711,
"learning_rate": 1.766065606984326e-05,
"loss": 2.6368,
"step": 370500
},
{
"epoch": 2.0499842521425373,
"grad_norm": 10.825027465820312,
"learning_rate": 1.760949334577611e-05,
"loss": 2.6431,
"step": 371000
},
{
"epoch": 2.0527470341535112,
"grad_norm": 5.528331756591797,
"learning_rate": 1.755833062170896e-05,
"loss": 2.6038,
"step": 371500
},
{
"epoch": 2.055509816164485,
"grad_norm": 5.892696380615234,
"learning_rate": 1.7507167897641808e-05,
"loss": 2.6242,
"step": 372000
},
{
"epoch": 2.0582725981754586,
"grad_norm": 6.996720790863037,
"learning_rate": 1.7456107499022793e-05,
"loss": 2.6515,
"step": 372500
},
{
"epoch": 2.0610353801864325,
"grad_norm": 6.8381757736206055,
"learning_rate": 1.740494477495564e-05,
"loss": 2.6302,
"step": 373000
},
{
"epoch": 2.0637981621974064,
"grad_norm": 8.656445503234863,
"learning_rate": 1.7353782050888493e-05,
"loss": 2.6484,
"step": 373500
},
{
"epoch": 2.06656094420838,
"grad_norm": 6.725839138031006,
"learning_rate": 1.7302619326821342e-05,
"loss": 2.6319,
"step": 374000
},
{
"epoch": 2.069323726219354,
"grad_norm": 5.520457744598389,
"learning_rate": 1.7251558928202327e-05,
"loss": 2.626,
"step": 374500
},
{
"epoch": 2.0720865082303277,
"grad_norm": 5.802083969116211,
"learning_rate": 1.720039620413518e-05,
"loss": 2.6161,
"step": 375000
},
{
"epoch": 2.0720865082303277,
"eval_runtime": 1404.6799,
"eval_samples_per_second": 257.676,
"eval_steps_per_second": 32.21,
"step": 375000
},
{
"epoch": 2.0748492902413016,
"grad_norm": 6.6860551834106445,
"learning_rate": 1.7149233480068027e-05,
"loss": 2.6162,
"step": 375500
},
{
"epoch": 2.077612072252275,
"grad_norm": 6.858133792877197,
"learning_rate": 1.709807075600088e-05,
"loss": 2.6101,
"step": 376000
},
{
"epoch": 2.080374854263249,
"grad_norm": 6.421977996826172,
"learning_rate": 1.7046908031933724e-05,
"loss": 2.6648,
"step": 376500
},
{
"epoch": 2.083137636274223,
"grad_norm": 5.524794578552246,
"learning_rate": 1.699584763331471e-05,
"loss": 2.616,
"step": 377000
},
{
"epoch": 2.0859004182851963,
"grad_norm": 11.743040084838867,
"learning_rate": 1.694468490924756e-05,
"loss": 2.5836,
"step": 377500
},
{
"epoch": 2.0886632002961703,
"grad_norm": 11.61206340789795,
"learning_rate": 1.689352218518041e-05,
"loss": 2.6349,
"step": 378000
},
{
"epoch": 2.091425982307144,
"grad_norm": 10.494318962097168,
"learning_rate": 1.6842359461113262e-05,
"loss": 2.6134,
"step": 378500
},
{
"epoch": 2.0941887643181176,
"grad_norm": 5.6387619972229,
"learning_rate": 1.679119673704611e-05,
"loss": 2.6245,
"step": 379000
},
{
"epoch": 2.0969515463290915,
"grad_norm": 8.307207107543945,
"learning_rate": 1.6740136338427096e-05,
"loss": 2.6092,
"step": 379500
},
{
"epoch": 2.0997143283400654,
"grad_norm": 8.203621864318848,
"learning_rate": 1.6688973614359944e-05,
"loss": 2.6709,
"step": 380000
},
{
"epoch": 2.0997143283400654,
"eval_runtime": 1387.6269,
"eval_samples_per_second": 260.843,
"eval_steps_per_second": 32.606,
"step": 380000
},
{
"epoch": 2.102477110351039,
"grad_norm": 7.687305927276611,
"learning_rate": 1.6637810890292796e-05,
"loss": 2.6405,
"step": 380500
},
{
"epoch": 2.105239892362013,
"grad_norm": 5.860333442687988,
"learning_rate": 1.6586648166225645e-05,
"loss": 2.6511,
"step": 381000
},
{
"epoch": 2.1080026743729867,
"grad_norm": 6.413809299468994,
"learning_rate": 1.6535587767606626e-05,
"loss": 2.634,
"step": 381500
},
{
"epoch": 2.11076545638396,
"grad_norm": 6.205860137939453,
"learning_rate": 1.6484425043539478e-05,
"loss": 2.6624,
"step": 382000
},
{
"epoch": 2.113528238394934,
"grad_norm": 7.94976806640625,
"learning_rate": 1.6433262319472327e-05,
"loss": 2.592,
"step": 382500
},
{
"epoch": 2.116291020405908,
"grad_norm": 6.3407793045043945,
"learning_rate": 1.638209959540518e-05,
"loss": 2.6391,
"step": 383000
},
{
"epoch": 2.119053802416882,
"grad_norm": 5.911262512207031,
"learning_rate": 1.6331039196786164e-05,
"loss": 2.6936,
"step": 383500
},
{
"epoch": 2.1218165844278554,
"grad_norm": 6.195751667022705,
"learning_rate": 1.6279876472719012e-05,
"loss": 2.6127,
"step": 384000
},
{
"epoch": 2.1245793664388293,
"grad_norm": 7.307173252105713,
"learning_rate": 1.6228713748651864e-05,
"loss": 2.6623,
"step": 384500
},
{
"epoch": 2.127342148449803,
"grad_norm": 7.353754043579102,
"learning_rate": 1.6177551024584713e-05,
"loss": 2.6114,
"step": 385000
},
{
"epoch": 2.127342148449803,
"eval_runtime": 1366.6131,
"eval_samples_per_second": 264.854,
"eval_steps_per_second": 33.107,
"step": 385000
},
{
"epoch": 2.1301049304607766,
"grad_norm": 6.172619342803955,
"learning_rate": 1.6126490625965698e-05,
"loss": 2.6657,
"step": 385500
},
{
"epoch": 2.1328677124717506,
"grad_norm": 7.605554580688477,
"learning_rate": 1.6075327901898546e-05,
"loss": 2.6024,
"step": 386000
},
{
"epoch": 2.1356304944827245,
"grad_norm": 8.817626953125,
"learning_rate": 1.6024165177831395e-05,
"loss": 2.6079,
"step": 386500
},
{
"epoch": 2.138393276493698,
"grad_norm": 7.332306861877441,
"learning_rate": 1.5973002453764247e-05,
"loss": 2.6872,
"step": 387000
},
{
"epoch": 2.141156058504672,
"grad_norm": 4.464954853057861,
"learning_rate": 1.5921839729697096e-05,
"loss": 2.6496,
"step": 387500
},
{
"epoch": 2.1439188405156457,
"grad_norm": 5.7703962326049805,
"learning_rate": 1.5870677005629948e-05,
"loss": 2.6303,
"step": 388000
},
{
"epoch": 2.146681622526619,
"grad_norm": 7.109230041503906,
"learning_rate": 1.5819616607010933e-05,
"loss": 2.6188,
"step": 388500
},
{
"epoch": 2.149444404537593,
"grad_norm": 10.503727912902832,
"learning_rate": 1.576845388294378e-05,
"loss": 2.6147,
"step": 389000
},
{
"epoch": 2.152207186548567,
"grad_norm": 7.042636394500732,
"learning_rate": 1.5717291158876633e-05,
"loss": 2.6058,
"step": 389500
},
{
"epoch": 2.1549699685595405,
"grad_norm": 6.6826252937316895,
"learning_rate": 1.5666230760257615e-05,
"loss": 2.6158,
"step": 390000
},
{
"epoch": 2.1549699685595405,
"eval_runtime": 1433.6301,
"eval_samples_per_second": 252.473,
"eval_steps_per_second": 31.56,
"step": 390000
},
{
"epoch": 2.1577327505705144,
"grad_norm": 8.209315299987793,
"learning_rate": 1.5615068036190463e-05,
"loss": 2.648,
"step": 390500
},
{
"epoch": 2.1604955325814883,
"grad_norm": 6.613926887512207,
"learning_rate": 1.5563905312123315e-05,
"loss": 2.6458,
"step": 391000
},
{
"epoch": 2.163258314592462,
"grad_norm": 7.016421794891357,
"learning_rate": 1.5512742588056164e-05,
"loss": 2.6386,
"step": 391500
},
{
"epoch": 2.1660210966034357,
"grad_norm": 6.227564811706543,
"learning_rate": 1.5461579863989016e-05,
"loss": 2.6648,
"step": 392000
},
{
"epoch": 2.1687838786144096,
"grad_norm": 6.598555088043213,
"learning_rate": 1.5410417139921864e-05,
"loss": 2.6187,
"step": 392500
},
{
"epoch": 2.1715466606253835,
"grad_norm": 8.013922691345215,
"learning_rate": 1.5359254415854716e-05,
"loss": 2.6079,
"step": 393000
},
{
"epoch": 2.174309442636357,
"grad_norm": 5.305454730987549,
"learning_rate": 1.53081940172357e-05,
"loss": 2.6232,
"step": 393500
},
{
"epoch": 2.177072224647331,
"grad_norm": 7.661605358123779,
"learning_rate": 1.525703129316855e-05,
"loss": 2.6102,
"step": 394000
},
{
"epoch": 2.1798350066583048,
"grad_norm": 7.636397838592529,
"learning_rate": 1.52058685691014e-05,
"loss": 2.635,
"step": 394500
},
{
"epoch": 2.1825977886692782,
"grad_norm": 9.935632705688477,
"learning_rate": 1.5154705845034247e-05,
"loss": 2.5679,
"step": 395000
},
{
"epoch": 2.1825977886692782,
"eval_runtime": 1450.0546,
"eval_samples_per_second": 249.613,
"eval_steps_per_second": 31.202,
"step": 395000
},
{
"epoch": 2.185360570680252,
"grad_norm": 6.385195255279541,
"learning_rate": 1.5103543120967097e-05,
"loss": 2.6055,
"step": 395500
},
{
"epoch": 2.188123352691226,
"grad_norm": 4.359088897705078,
"learning_rate": 1.5052380396899948e-05,
"loss": 2.6116,
"step": 396000
},
{
"epoch": 2.1908861347021995,
"grad_norm": 6.967292308807373,
"learning_rate": 1.5001217672832798e-05,
"loss": 2.5738,
"step": 396500
},
{
"epoch": 2.1936489167131734,
"grad_norm": 5.064013481140137,
"learning_rate": 1.4950054948765648e-05,
"loss": 2.59,
"step": 397000
},
{
"epoch": 2.1964116987241473,
"grad_norm": 7.287230014801025,
"learning_rate": 1.4898892224698498e-05,
"loss": 2.5935,
"step": 397500
},
{
"epoch": 2.1991744807351212,
"grad_norm": 5.277096271514893,
"learning_rate": 1.4847831826079483e-05,
"loss": 2.6603,
"step": 398000
},
{
"epoch": 2.2019372627460947,
"grad_norm": 5.027023792266846,
"learning_rate": 1.4796771427460468e-05,
"loss": 2.6749,
"step": 398500
},
{
"epoch": 2.2047000447570686,
"grad_norm": 5.998363494873047,
"learning_rate": 1.4745608703393319e-05,
"loss": 2.6365,
"step": 399000
},
{
"epoch": 2.2074628267680425,
"grad_norm": 5.35511589050293,
"learning_rate": 1.4694445979326169e-05,
"loss": 2.6342,
"step": 399500
},
{
"epoch": 2.210225608779016,
"grad_norm": 4.967937469482422,
"learning_rate": 1.4643283255259016e-05,
"loss": 2.6388,
"step": 400000
},
{
"epoch": 2.210225608779016,
"eval_runtime": 1400.0792,
"eval_samples_per_second": 258.523,
"eval_steps_per_second": 32.316,
"step": 400000
},
{
"epoch": 2.21298839078999,
"grad_norm": 5.5652055740356445,
"learning_rate": 1.4592120531191866e-05,
"loss": 2.629,
"step": 400500
},
{
"epoch": 2.215751172800964,
"grad_norm": 6.4792962074279785,
"learning_rate": 1.4540957807124716e-05,
"loss": 2.6253,
"step": 401000
},
{
"epoch": 2.2185139548119372,
"grad_norm": 5.494840621948242,
"learning_rate": 1.4489795083057567e-05,
"loss": 2.5912,
"step": 401500
},
{
"epoch": 2.221276736822911,
"grad_norm": 6.507066249847412,
"learning_rate": 1.4438632358990417e-05,
"loss": 2.5945,
"step": 402000
},
{
"epoch": 2.224039518833885,
"grad_norm": 5.496526718139648,
"learning_rate": 1.4387571960371402e-05,
"loss": 2.6173,
"step": 402500
},
{
"epoch": 2.2268023008448585,
"grad_norm": 6.222531795501709,
"learning_rate": 1.4336511561752385e-05,
"loss": 2.6482,
"step": 403000
},
{
"epoch": 2.2295650828558324,
"grad_norm": 5.230762481689453,
"learning_rate": 1.4285348837685236e-05,
"loss": 2.6678,
"step": 403500
},
{
"epoch": 2.2323278648668063,
"grad_norm": 7.752573490142822,
"learning_rate": 1.4234186113618086e-05,
"loss": 2.6552,
"step": 404000
},
{
"epoch": 2.2350906468777803,
"grad_norm": 6.026094436645508,
"learning_rate": 1.4183023389550934e-05,
"loss": 2.6387,
"step": 404500
},
{
"epoch": 2.2378534288887537,
"grad_norm": 8.043586730957031,
"learning_rate": 1.4131962990931918e-05,
"loss": 2.6137,
"step": 405000
},
{
"epoch": 2.2378534288887537,
"eval_runtime": 1371.0294,
"eval_samples_per_second": 264.001,
"eval_steps_per_second": 33.001,
"step": 405000
},
{
"epoch": 2.2406162108997276,
"grad_norm": 13.741228103637695,
"learning_rate": 1.4080800266864768e-05,
"loss": 2.621,
"step": 405500
},
{
"epoch": 2.2433789929107015,
"grad_norm": 7.889692306518555,
"learning_rate": 1.4029637542797618e-05,
"loss": 2.645,
"step": 406000
},
{
"epoch": 2.246141774921675,
"grad_norm": 7.462569236755371,
"learning_rate": 1.3978474818730469e-05,
"loss": 2.5896,
"step": 406500
},
{
"epoch": 2.248904556932649,
"grad_norm": 6.514028072357178,
"learning_rate": 1.3927312094663319e-05,
"loss": 2.6266,
"step": 407000
},
{
"epoch": 2.251667338943623,
"grad_norm": 6.088305950164795,
"learning_rate": 1.3876149370596169e-05,
"loss": 2.5982,
"step": 407500
},
{
"epoch": 2.2544301209545963,
"grad_norm": 8.39070987701416,
"learning_rate": 1.382498664652902e-05,
"loss": 2.6371,
"step": 408000
},
{
"epoch": 2.25719290296557,
"grad_norm": 7.8665361404418945,
"learning_rate": 1.377382392246187e-05,
"loss": 2.621,
"step": 408500
},
{
"epoch": 2.259955684976544,
"grad_norm": 6.9274373054504395,
"learning_rate": 1.3722661198394718e-05,
"loss": 2.6896,
"step": 409000
},
{
"epoch": 2.2627184669875176,
"grad_norm": 7.833282947540283,
"learning_rate": 1.3671498474327568e-05,
"loss": 2.6513,
"step": 409500
},
{
"epoch": 2.2654812489984915,
"grad_norm": 7.596132278442383,
"learning_rate": 1.3620438075708552e-05,
"loss": 2.6452,
"step": 410000
},
{
"epoch": 2.2654812489984915,
"eval_runtime": 1432.6263,
"eval_samples_per_second": 252.65,
"eval_steps_per_second": 31.582,
"step": 410000
},
{
"epoch": 2.2682440310094654,
"grad_norm": 7.686011791229248,
"learning_rate": 1.3569275351641402e-05,
"loss": 2.6487,
"step": 410500
},
{
"epoch": 2.2710068130204393,
"grad_norm": 10.180373191833496,
"learning_rate": 1.3518112627574252e-05,
"loss": 2.6355,
"step": 411000
},
{
"epoch": 2.2737695950314127,
"grad_norm": 5.1875410079956055,
"learning_rate": 1.3466949903507103e-05,
"loss": 2.6251,
"step": 411500
},
{
"epoch": 2.2765323770423866,
"grad_norm": 5.864450931549072,
"learning_rate": 1.3415787179439953e-05,
"loss": 2.5926,
"step": 412000
},
{
"epoch": 2.2792951590533606,
"grad_norm": 6.403237342834473,
"learning_rate": 1.3364624455372803e-05,
"loss": 2.5844,
"step": 412500
},
{
"epoch": 2.282057941064334,
"grad_norm": 6.299551963806152,
"learning_rate": 1.3313461731305653e-05,
"loss": 2.6534,
"step": 413000
},
{
"epoch": 2.284820723075308,
"grad_norm": 5.631259441375732,
"learning_rate": 1.3262299007238502e-05,
"loss": 2.6481,
"step": 413500
},
{
"epoch": 2.287583505086282,
"grad_norm": 6.804217338562012,
"learning_rate": 1.3211136283171352e-05,
"loss": 2.6417,
"step": 414000
},
{
"epoch": 2.2903462870972553,
"grad_norm": 6.593264102935791,
"learning_rate": 1.3160075884552337e-05,
"loss": 2.6293,
"step": 414500
},
{
"epoch": 2.293109069108229,
"grad_norm": 7.17709493637085,
"learning_rate": 1.3108913160485188e-05,
"loss": 2.6037,
"step": 415000
},
{
"epoch": 2.293109069108229,
"eval_runtime": 1394.243,
"eval_samples_per_second": 259.605,
"eval_steps_per_second": 32.451,
"step": 415000
},
{
"epoch": 2.295871851119203,
"grad_norm": 8.601012229919434,
"learning_rate": 1.3057750436418038e-05,
"loss": 2.6404,
"step": 415500
},
{
"epoch": 2.2986346331301766,
"grad_norm": 5.984838485717773,
"learning_rate": 1.3006587712350888e-05,
"loss": 2.6162,
"step": 416000
},
{
"epoch": 2.3013974151411505,
"grad_norm": 6.601894378662109,
"learning_rate": 1.2955629639180006e-05,
"loss": 2.5866,
"step": 416500
},
{
"epoch": 2.3041601971521244,
"grad_norm": 7.93733024597168,
"learning_rate": 1.2904466915112856e-05,
"loss": 2.6262,
"step": 417000
},
{
"epoch": 2.3069229791630983,
"grad_norm": 7.534053325653076,
"learning_rate": 1.2853304191045707e-05,
"loss": 2.6565,
"step": 417500
},
{
"epoch": 2.3096857611740718,
"grad_norm": 5.987677574157715,
"learning_rate": 1.2802141466978554e-05,
"loss": 2.6128,
"step": 418000
},
{
"epoch": 2.3124485431850457,
"grad_norm": 9.730072021484375,
"learning_rate": 1.2750978742911404e-05,
"loss": 2.6266,
"step": 418500
},
{
"epoch": 2.3152113251960196,
"grad_norm": 5.9827799797058105,
"learning_rate": 1.2699816018844254e-05,
"loss": 2.6411,
"step": 419000
},
{
"epoch": 2.317974107206993,
"grad_norm": 8.000412940979004,
"learning_rate": 1.2648653294777104e-05,
"loss": 2.6197,
"step": 419500
},
{
"epoch": 2.320736889217967,
"grad_norm": 5.673067092895508,
"learning_rate": 1.2597490570709955e-05,
"loss": 2.6396,
"step": 420000
},
{
"epoch": 2.320736889217967,
"eval_runtime": 1399.3763,
"eval_samples_per_second": 258.653,
"eval_steps_per_second": 32.332,
"step": 420000
},
{
"epoch": 2.323499671228941,
"grad_norm": 11.707938194274902,
"learning_rate": 1.254643017209094e-05,
"loss": 2.685,
"step": 420500
},
{
"epoch": 2.3262624532399143,
"grad_norm": 9.935530662536621,
"learning_rate": 1.249526744802379e-05,
"loss": 2.6652,
"step": 421000
},
{
"epoch": 2.3290252352508882,
"grad_norm": 7.645023345947266,
"learning_rate": 1.2444104723956638e-05,
"loss": 2.6288,
"step": 421500
},
{
"epoch": 2.331788017261862,
"grad_norm": 8.301952362060547,
"learning_rate": 1.2392941999889489e-05,
"loss": 2.6653,
"step": 422000
},
{
"epoch": 2.3345507992728356,
"grad_norm": 8.863719940185547,
"learning_rate": 1.2341881601270474e-05,
"loss": 2.6188,
"step": 422500
},
{
"epoch": 2.3373135812838095,
"grad_norm": 6.79737663269043,
"learning_rate": 1.2290718877203324e-05,
"loss": 2.6198,
"step": 423000
},
{
"epoch": 2.3400763632947834,
"grad_norm": 13.54198932647705,
"learning_rate": 1.2239556153136174e-05,
"loss": 2.6335,
"step": 423500
},
{
"epoch": 2.3428391453057573,
"grad_norm": 6.236546039581299,
"learning_rate": 1.2188393429069023e-05,
"loss": 2.5878,
"step": 424000
},
{
"epoch": 2.345601927316731,
"grad_norm": 6.494855880737305,
"learning_rate": 1.2137230705001873e-05,
"loss": 2.6116,
"step": 424500
},
{
"epoch": 2.3483647093277047,
"grad_norm": 5.994902610778809,
"learning_rate": 1.2086170306382858e-05,
"loss": 2.6368,
"step": 425000
},
{
"epoch": 2.3483647093277047,
"eval_runtime": 1385.8563,
"eval_samples_per_second": 261.176,
"eval_steps_per_second": 32.648,
"step": 425000
},
{
"epoch": 2.351127491338678,
"grad_norm": 6.626513481140137,
"learning_rate": 1.2035007582315708e-05,
"loss": 2.6542,
"step": 425500
},
{
"epoch": 2.353890273349652,
"grad_norm": 6.740534782409668,
"learning_rate": 1.1983844858248559e-05,
"loss": 2.6394,
"step": 426000
},
{
"epoch": 2.356653055360626,
"grad_norm": 6.561714172363281,
"learning_rate": 1.1932682134181407e-05,
"loss": 2.6508,
"step": 426500
},
{
"epoch": 2.3594158373716,
"grad_norm": 7.288315773010254,
"learning_rate": 1.188162173556239e-05,
"loss": 2.673,
"step": 427000
},
{
"epoch": 2.3621786193825733,
"grad_norm": 6.247045040130615,
"learning_rate": 1.183045901149524e-05,
"loss": 2.6219,
"step": 427500
},
{
"epoch": 2.3649414013935472,
"grad_norm": 6.165623664855957,
"learning_rate": 1.1779296287428091e-05,
"loss": 2.567,
"step": 428000
},
{
"epoch": 2.367704183404521,
"grad_norm": 7.5528717041015625,
"learning_rate": 1.1728133563360941e-05,
"loss": 2.6683,
"step": 428500
},
{
"epoch": 2.3704669654154946,
"grad_norm": 6.398986339569092,
"learning_rate": 1.1677073164741925e-05,
"loss": 2.5937,
"step": 429000
},
{
"epoch": 2.3732297474264685,
"grad_norm": 5.770337104797363,
"learning_rate": 1.162601276612291e-05,
"loss": 2.6698,
"step": 429500
},
{
"epoch": 2.3759925294374424,
"grad_norm": 11.233945846557617,
"learning_rate": 1.157485004205576e-05,
"loss": 2.6207,
"step": 430000
},
{
"epoch": 2.3759925294374424,
"eval_runtime": 1412.4375,
"eval_samples_per_second": 256.261,
"eval_steps_per_second": 32.033,
"step": 430000
},
{
"epoch": 2.3787553114484163,
"grad_norm": 8.665916442871094,
"learning_rate": 1.152368731798861e-05,
"loss": 2.6306,
"step": 430500
},
{
"epoch": 2.38151809345939,
"grad_norm": 5.553136825561523,
"learning_rate": 1.1472524593921459e-05,
"loss": 2.6256,
"step": 431000
},
{
"epoch": 2.3842808754703637,
"grad_norm": 5.725644588470459,
"learning_rate": 1.1421361869854309e-05,
"loss": 2.6169,
"step": 431500
},
{
"epoch": 2.387043657481337,
"grad_norm": 5.656550407409668,
"learning_rate": 1.137019914578716e-05,
"loss": 2.614,
"step": 432000
},
{
"epoch": 2.389806439492311,
"grad_norm": 5.470634460449219,
"learning_rate": 1.131903642172001e-05,
"loss": 2.6239,
"step": 432500
},
{
"epoch": 2.392569221503285,
"grad_norm": 6.324733257293701,
"learning_rate": 1.1267873697652858e-05,
"loss": 2.6097,
"step": 433000
},
{
"epoch": 2.395332003514259,
"grad_norm": 5.290309906005859,
"learning_rate": 1.1216710973585708e-05,
"loss": 2.6635,
"step": 433500
},
{
"epoch": 2.3980947855252324,
"grad_norm": 9.409131050109863,
"learning_rate": 1.1165650574966693e-05,
"loss": 2.6287,
"step": 434000
},
{
"epoch": 2.4008575675362063,
"grad_norm": 6.079099655151367,
"learning_rate": 1.1114487850899544e-05,
"loss": 2.5926,
"step": 434500
},
{
"epoch": 2.40362034954718,
"grad_norm": 5.747387886047363,
"learning_rate": 1.1063427452280529e-05,
"loss": 2.6502,
"step": 435000
},
{
"epoch": 2.40362034954718,
"eval_runtime": 1441.4109,
"eval_samples_per_second": 251.11,
"eval_steps_per_second": 31.389,
"step": 435000
},
{
"epoch": 2.4063831315581536,
"grad_norm": 5.655724048614502,
"learning_rate": 1.1012264728213379e-05,
"loss": 2.6302,
"step": 435500
},
{
"epoch": 2.4091459135691276,
"grad_norm": 7.379015922546387,
"learning_rate": 1.0961102004146228e-05,
"loss": 2.6013,
"step": 436000
},
{
"epoch": 2.4119086955801015,
"grad_norm": 5.20357608795166,
"learning_rate": 1.0909939280079078e-05,
"loss": 2.593,
"step": 436500
},
{
"epoch": 2.414671477591075,
"grad_norm": 7.364123344421387,
"learning_rate": 1.0858776556011928e-05,
"loss": 2.5955,
"step": 437000
},
{
"epoch": 2.417434259602049,
"grad_norm": 6.859920978546143,
"learning_rate": 1.0807716157392913e-05,
"loss": 2.6142,
"step": 437500
},
{
"epoch": 2.4201970416130227,
"grad_norm": 8.261401176452637,
"learning_rate": 1.0756553433325762e-05,
"loss": 2.6348,
"step": 438000
},
{
"epoch": 2.422959823623996,
"grad_norm": 6.4325852394104,
"learning_rate": 1.0705390709258612e-05,
"loss": 2.6208,
"step": 438500
},
{
"epoch": 2.42572260563497,
"grad_norm": 7.540378093719482,
"learning_rate": 1.0654227985191462e-05,
"loss": 2.6543,
"step": 439000
},
{
"epoch": 2.428485387645944,
"grad_norm": 4.978431701660156,
"learning_rate": 1.0603065261124313e-05,
"loss": 2.6658,
"step": 439500
},
{
"epoch": 2.431248169656918,
"grad_norm": 7.280527114868164,
"learning_rate": 1.0551902537057161e-05,
"loss": 2.6231,
"step": 440000
},
{
"epoch": 2.431248169656918,
"eval_runtime": 1447.4545,
"eval_samples_per_second": 250.062,
"eval_steps_per_second": 31.258,
"step": 440000
},
{
"epoch": 2.4340109516678914,
"grad_norm": 8.55695915222168,
"learning_rate": 1.0500739812990011e-05,
"loss": 2.6355,
"step": 440500
},
{
"epoch": 2.4367737336788653,
"grad_norm": 6.825678825378418,
"learning_rate": 1.0449577088922862e-05,
"loss": 2.5972,
"step": 441000
},
{
"epoch": 2.439536515689839,
"grad_norm": 9.022064208984375,
"learning_rate": 1.0398414364855712e-05,
"loss": 2.6085,
"step": 441500
},
{
"epoch": 2.4422992977008127,
"grad_norm": 7.041652202606201,
"learning_rate": 1.0347353966236697e-05,
"loss": 2.628,
"step": 442000
},
{
"epoch": 2.4450620797117866,
"grad_norm": 8.78257942199707,
"learning_rate": 1.029629356761768e-05,
"loss": 2.5947,
"step": 442500
},
{
"epoch": 2.4478248617227605,
"grad_norm": 3.7992634773254395,
"learning_rate": 1.0245130843550529e-05,
"loss": 2.6069,
"step": 443000
},
{
"epoch": 2.450587643733734,
"grad_norm": 5.678961753845215,
"learning_rate": 1.0193968119483379e-05,
"loss": 2.6333,
"step": 443500
},
{
"epoch": 2.453350425744708,
"grad_norm": 6.932492256164551,
"learning_rate": 1.014280539541623e-05,
"loss": 2.6285,
"step": 444000
},
{
"epoch": 2.4561132077556818,
"grad_norm": 8.48609447479248,
"learning_rate": 1.009164267134908e-05,
"loss": 2.6003,
"step": 444500
},
{
"epoch": 2.4588759897666552,
"grad_norm": 7.256680488586426,
"learning_rate": 1.004047994728193e-05,
"loss": 2.5648,
"step": 445000
},
{
"epoch": 2.4588759897666552,
"eval_runtime": 1380.4966,
"eval_samples_per_second": 262.19,
"eval_steps_per_second": 32.774,
"step": 445000
},
{
"epoch": 2.461638771777629,
"grad_norm": 10.294569969177246,
"learning_rate": 9.98931722321478e-06,
"loss": 2.6668,
"step": 445500
},
{
"epoch": 2.464401553788603,
"grad_norm": 7.309881687164307,
"learning_rate": 9.93815449914763e-06,
"loss": 2.6188,
"step": 446000
},
{
"epoch": 2.467164335799577,
"grad_norm": 8.109071731567383,
"learning_rate": 9.887094100528614e-06,
"loss": 2.5915,
"step": 446500
},
{
"epoch": 2.4699271178105504,
"grad_norm": 6.958956718444824,
"learning_rate": 9.835931376461462e-06,
"loss": 2.6043,
"step": 447000
},
{
"epoch": 2.4726898998215243,
"grad_norm": 5.835160732269287,
"learning_rate": 9.784768652394313e-06,
"loss": 2.6273,
"step": 447500
},
{
"epoch": 2.4754526818324982,
"grad_norm": 7.2995781898498535,
"learning_rate": 9.733605928327163e-06,
"loss": 2.6368,
"step": 448000
},
{
"epoch": 2.4782154638434717,
"grad_norm": 6.141138553619385,
"learning_rate": 9.682443204260013e-06,
"loss": 2.6429,
"step": 448500
},
{
"epoch": 2.4809782458544456,
"grad_norm": 7.309754371643066,
"learning_rate": 9.631382805640998e-06,
"loss": 2.6084,
"step": 449000
},
{
"epoch": 2.4837410278654195,
"grad_norm": 6.5357794761657715,
"learning_rate": 9.580220081573847e-06,
"loss": 2.6064,
"step": 449500
},
{
"epoch": 2.486503809876393,
"grad_norm": 5.566898822784424,
"learning_rate": 9.529057357506697e-06,
"loss": 2.6275,
"step": 450000
},
{
"epoch": 2.486503809876393,
"eval_runtime": 1385.5398,
"eval_samples_per_second": 261.236,
"eval_steps_per_second": 32.655,
"step": 450000
},
{
"epoch": 2.489266591887367,
"grad_norm": 7.904833793640137,
"learning_rate": 9.477894633439547e-06,
"loss": 2.6019,
"step": 450500
},
{
"epoch": 2.492029373898341,
"grad_norm": 7.342651844024658,
"learning_rate": 9.426731909372398e-06,
"loss": 2.675,
"step": 451000
},
{
"epoch": 2.4947921559093142,
"grad_norm": 6.255519390106201,
"learning_rate": 9.375569185305246e-06,
"loss": 2.6526,
"step": 451500
},
{
"epoch": 2.497554937920288,
"grad_norm": 5.555826663970947,
"learning_rate": 9.324406461238096e-06,
"loss": 2.6009,
"step": 452000
},
{
"epoch": 2.500317719931262,
"grad_norm": 9.351966857910156,
"learning_rate": 9.273346062619081e-06,
"loss": 2.6516,
"step": 452500
},
{
"epoch": 2.503080501942236,
"grad_norm": 13.138755798339844,
"learning_rate": 9.222183338551932e-06,
"loss": 2.5894,
"step": 453000
},
{
"epoch": 2.5058432839532094,
"grad_norm": 5.904870986938477,
"learning_rate": 9.171020614484782e-06,
"loss": 2.6366,
"step": 453500
},
{
"epoch": 2.5086060659641833,
"grad_norm": 7.674947261810303,
"learning_rate": 9.11985789041763e-06,
"loss": 2.6441,
"step": 454000
},
{
"epoch": 2.511368847975157,
"grad_norm": 7.656473636627197,
"learning_rate": 9.068797491798615e-06,
"loss": 2.6298,
"step": 454500
},
{
"epoch": 2.5141316299861307,
"grad_norm": 5.670429706573486,
"learning_rate": 9.017634767731466e-06,
"loss": 2.6246,
"step": 455000
},
{
"epoch": 2.5141316299861307,
"eval_runtime": 1400.9354,
"eval_samples_per_second": 258.365,
"eval_steps_per_second": 32.296,
"step": 455000
},
{
"epoch": 2.5168944119971046,
"grad_norm": 7.4133148193359375,
"learning_rate": 8.966472043664316e-06,
"loss": 2.6301,
"step": 455500
},
{
"epoch": 2.5196571940080785,
"grad_norm": 12.215228080749512,
"learning_rate": 8.915309319597166e-06,
"loss": 2.5877,
"step": 456000
},
{
"epoch": 2.522419976019052,
"grad_norm": 7.437780857086182,
"learning_rate": 8.864146595530015e-06,
"loss": 2.6486,
"step": 456500
},
{
"epoch": 2.525182758030026,
"grad_norm": 6.193426609039307,
"learning_rate": 8.812983871462865e-06,
"loss": 2.6423,
"step": 457000
},
{
"epoch": 2.527945540041,
"grad_norm": 6.621194362640381,
"learning_rate": 8.761821147395715e-06,
"loss": 2.6323,
"step": 457500
},
{
"epoch": 2.5307083220519733,
"grad_norm": 8.068601608276367,
"learning_rate": 8.710658423328566e-06,
"loss": 2.6727,
"step": 458000
},
{
"epoch": 2.533471104062947,
"grad_norm": 4.508535385131836,
"learning_rate": 8.659598024709549e-06,
"loss": 2.6195,
"step": 458500
},
{
"epoch": 2.536233886073921,
"grad_norm": 7.6524128913879395,
"learning_rate": 8.6084353006424e-06,
"loss": 2.6293,
"step": 459000
},
{
"epoch": 2.538996668084895,
"grad_norm": 6.510564804077148,
"learning_rate": 8.55727257657525e-06,
"loss": 2.5563,
"step": 459500
},
{
"epoch": 2.5417594500958685,
"grad_norm": 7.351913928985596,
"learning_rate": 8.5061098525081e-06,
"loss": 2.6318,
"step": 460000
},
{
"epoch": 2.5417594500958685,
"eval_runtime": 1443.1461,
"eval_samples_per_second": 250.808,
"eval_steps_per_second": 31.352,
"step": 460000
},
{
"epoch": 2.5445222321068424,
"grad_norm": 10.461015701293945,
"learning_rate": 8.45494712844095e-06,
"loss": 2.623,
"step": 460500
},
{
"epoch": 2.547285014117816,
"grad_norm": 7.475493907928467,
"learning_rate": 8.403886729821933e-06,
"loss": 2.6073,
"step": 461000
},
{
"epoch": 2.5500477961287897,
"grad_norm": 5.767341136932373,
"learning_rate": 8.352724005754784e-06,
"loss": 2.6477,
"step": 461500
},
{
"epoch": 2.5528105781397636,
"grad_norm": 6.720097541809082,
"learning_rate": 8.301561281687634e-06,
"loss": 2.6244,
"step": 462000
},
{
"epoch": 2.5555733601507375,
"grad_norm": 9.576379776000977,
"learning_rate": 8.250398557620484e-06,
"loss": 2.6118,
"step": 462500
},
{
"epoch": 2.558336142161711,
"grad_norm": 7.282556056976318,
"learning_rate": 8.19933815900147e-06,
"loss": 2.6138,
"step": 463000
},
{
"epoch": 2.561098924172685,
"grad_norm": 8.628390312194824,
"learning_rate": 8.148175434934318e-06,
"loss": 2.6197,
"step": 463500
},
{
"epoch": 2.563861706183659,
"grad_norm": 6.606767654418945,
"learning_rate": 8.097012710867168e-06,
"loss": 2.6339,
"step": 464000
},
{
"epoch": 2.5666244881946323,
"grad_norm": 5.061454772949219,
"learning_rate": 8.045849986800018e-06,
"loss": 2.6692,
"step": 464500
},
{
"epoch": 2.569387270205606,
"grad_norm": 5.3237104415893555,
"learning_rate": 7.994687262732869e-06,
"loss": 2.6403,
"step": 465000
},
{
"epoch": 2.569387270205606,
"eval_runtime": 1446.7808,
"eval_samples_per_second": 250.178,
"eval_steps_per_second": 31.273,
"step": 465000
},
{
"epoch": 2.57215005221658,
"grad_norm": 7.996010780334473,
"learning_rate": 7.943626864113852e-06,
"loss": 2.6071,
"step": 465500
},
{
"epoch": 2.574912834227554,
"grad_norm": 5.897533416748047,
"learning_rate": 7.892464140046702e-06,
"loss": 2.6137,
"step": 466000
},
{
"epoch": 2.5776756162385275,
"grad_norm": 6.0307488441467285,
"learning_rate": 7.841301415979552e-06,
"loss": 2.6316,
"step": 466500
},
{
"epoch": 2.5804383982495014,
"grad_norm": 8.005854606628418,
"learning_rate": 7.790138691912403e-06,
"loss": 2.6063,
"step": 467000
},
{
"epoch": 2.583201180260475,
"grad_norm": 5.740025997161865,
"learning_rate": 7.739078293293386e-06,
"loss": 2.615,
"step": 467500
},
{
"epoch": 2.5859639622714488,
"grad_norm": 8.913529396057129,
"learning_rate": 7.687915569226235e-06,
"loss": 2.619,
"step": 468000
},
{
"epoch": 2.5887267442824227,
"grad_norm": 7.639087677001953,
"learning_rate": 7.636752845159085e-06,
"loss": 2.6082,
"step": 468500
},
{
"epoch": 2.5914895262933966,
"grad_norm": 6.564584732055664,
"learning_rate": 7.585590121091936e-06,
"loss": 2.6301,
"step": 469000
},
{
"epoch": 2.59425230830437,
"grad_norm": 7.4024834632873535,
"learning_rate": 7.534427397024786e-06,
"loss": 2.63,
"step": 469500
},
{
"epoch": 2.597015090315344,
"grad_norm": 6.080173969268799,
"learning_rate": 7.4833669984057704e-06,
"loss": 2.64,
"step": 470000
},
{
"epoch": 2.597015090315344,
"eval_runtime": 1388.5169,
"eval_samples_per_second": 260.676,
"eval_steps_per_second": 32.585,
"step": 470000
},
{
"epoch": 2.599777872326318,
"grad_norm": 5.737242698669434,
"learning_rate": 7.43220427433862e-06,
"loss": 2.6081,
"step": 470500
},
{
"epoch": 2.6025406543372913,
"grad_norm": 5.815971851348877,
"learning_rate": 7.38104155027147e-06,
"loss": 2.6399,
"step": 471000
},
{
"epoch": 2.6053034363482652,
"grad_norm": 7.472295761108398,
"learning_rate": 7.32987882620432e-06,
"loss": 2.6019,
"step": 471500
},
{
"epoch": 2.608066218359239,
"grad_norm": 10.439508438110352,
"learning_rate": 7.278920753033438e-06,
"loss": 2.6021,
"step": 472000
},
{
"epoch": 2.610829000370213,
"grad_norm": 4.827859878540039,
"learning_rate": 7.227758028966289e-06,
"loss": 2.6533,
"step": 472500
},
{
"epoch": 2.6135917823811865,
"grad_norm": 7.332652568817139,
"learning_rate": 7.176595304899138e-06,
"loss": 2.6024,
"step": 473000
},
{
"epoch": 2.6163545643921604,
"grad_norm": 8.309117317199707,
"learning_rate": 7.125432580831988e-06,
"loss": 2.6284,
"step": 473500
},
{
"epoch": 2.619117346403134,
"grad_norm": 5.733798503875732,
"learning_rate": 7.074269856764839e-06,
"loss": 2.5479,
"step": 474000
},
{
"epoch": 2.621880128414108,
"grad_norm": 7.765644073486328,
"learning_rate": 7.023107132697689e-06,
"loss": 2.6764,
"step": 474500
},
{
"epoch": 2.6246429104250817,
"grad_norm": 7.007179260253906,
"learning_rate": 6.9719444086305375e-06,
"loss": 2.6276,
"step": 475000
},
{
"epoch": 2.6246429104250817,
"eval_runtime": 1395.7273,
"eval_samples_per_second": 259.329,
"eval_steps_per_second": 32.417,
"step": 475000
},
{
"epoch": 2.6274056924360556,
"grad_norm": 8.562434196472168,
"learning_rate": 6.920781684563388e-06,
"loss": 2.5855,
"step": 475500
},
{
"epoch": 2.630168474447029,
"grad_norm": 6.0895867347717285,
"learning_rate": 6.869618960496238e-06,
"loss": 2.622,
"step": 476000
},
{
"epoch": 2.632931256458003,
"grad_norm": 5.998204708099365,
"learning_rate": 6.818558561877222e-06,
"loss": 2.6498,
"step": 476500
},
{
"epoch": 2.635694038468977,
"grad_norm": 10.850279808044434,
"learning_rate": 6.7673958378100725e-06,
"loss": 2.6251,
"step": 477000
},
{
"epoch": 2.6384568204799503,
"grad_norm": 8.587841987609863,
"learning_rate": 6.716233113742922e-06,
"loss": 2.6448,
"step": 477500
},
{
"epoch": 2.6412196024909242,
"grad_norm": 7.590404510498047,
"learning_rate": 6.665070389675772e-06,
"loss": 2.6155,
"step": 478000
},
{
"epoch": 2.643982384501898,
"grad_norm": 9.195626258850098,
"learning_rate": 6.614009991056756e-06,
"loss": 2.6585,
"step": 478500
},
{
"epoch": 2.646745166512872,
"grad_norm": 7.80164909362793,
"learning_rate": 6.562847266989607e-06,
"loss": 2.6388,
"step": 479000
},
{
"epoch": 2.6495079485238455,
"grad_norm": 8.529934883117676,
"learning_rate": 6.511684542922455e-06,
"loss": 2.5775,
"step": 479500
},
{
"epoch": 2.6522707305348194,
"grad_norm": 4.80623722076416,
"learning_rate": 6.4605218188553055e-06,
"loss": 2.6404,
"step": 480000
},
{
"epoch": 2.6522707305348194,
"eval_runtime": 1401.4337,
"eval_samples_per_second": 258.273,
"eval_steps_per_second": 32.285,
"step": 480000
},
{
"epoch": 2.655033512545793,
"grad_norm": 7.216457366943359,
"learning_rate": 6.4094614202362905e-06,
"loss": 2.5943,
"step": 480500
},
{
"epoch": 2.657796294556767,
"grad_norm": 9.760845184326172,
"learning_rate": 6.358298696169141e-06,
"loss": 2.6424,
"step": 481000
},
{
"epoch": 2.6605590765677407,
"grad_norm": 5.927933692932129,
"learning_rate": 6.307135972101991e-06,
"loss": 2.6467,
"step": 481500
},
{
"epoch": 2.6633218585787146,
"grad_norm": 12.73469352722168,
"learning_rate": 6.25597324803484e-06,
"loss": 2.657,
"step": 482000
},
{
"epoch": 2.666084640589688,
"grad_norm": 10.275845527648926,
"learning_rate": 6.204912849415824e-06,
"loss": 2.6168,
"step": 482500
},
{
"epoch": 2.668847422600662,
"grad_norm": 6.86333703994751,
"learning_rate": 6.153750125348674e-06,
"loss": 2.6537,
"step": 483000
},
{
"epoch": 2.671610204611636,
"grad_norm": 7.216489315032959,
"learning_rate": 6.102587401281524e-06,
"loss": 2.6347,
"step": 483500
},
{
"epoch": 2.6743729866226094,
"grad_norm": 15.533758163452148,
"learning_rate": 6.0514246772143745e-06,
"loss": 2.6386,
"step": 484000
},
{
"epoch": 2.6771357686335833,
"grad_norm": 9.071037292480469,
"learning_rate": 6.000364278595359e-06,
"loss": 2.6346,
"step": 484500
},
{
"epoch": 2.679898550644557,
"grad_norm": 6.715532302856445,
"learning_rate": 5.949201554528208e-06,
"loss": 2.5978,
"step": 485000
},
{
"epoch": 2.679898550644557,
"eval_runtime": 1414.4263,
"eval_samples_per_second": 255.901,
"eval_steps_per_second": 31.988,
"step": 485000
},
{
"epoch": 2.682661332655531,
"grad_norm": 6.801553249359131,
"learning_rate": 5.898038830461058e-06,
"loss": 2.6455,
"step": 485500
},
{
"epoch": 2.6854241146665045,
"grad_norm": 8.986194610595703,
"learning_rate": 5.846876106393908e-06,
"loss": 2.6106,
"step": 486000
},
{
"epoch": 2.6881868966774785,
"grad_norm": 9.200784683227539,
"learning_rate": 5.795713382326758e-06,
"loss": 2.628,
"step": 486500
},
{
"epoch": 2.690949678688452,
"grad_norm": 12.348143577575684,
"learning_rate": 5.744652983707743e-06,
"loss": 2.6519,
"step": 487000
},
{
"epoch": 2.693712460699426,
"grad_norm": 5.376158237457275,
"learning_rate": 5.6934902596405925e-06,
"loss": 2.5868,
"step": 487500
},
{
"epoch": 2.6964752427103997,
"grad_norm": 6.574330806732178,
"learning_rate": 5.642327535573443e-06,
"loss": 2.6171,
"step": 488000
},
{
"epoch": 2.6992380247213736,
"grad_norm": 7.850590229034424,
"learning_rate": 5.591164811506292e-06,
"loss": 2.6335,
"step": 488500
},
{
"epoch": 2.702000806732347,
"grad_norm": 4.8061113357543945,
"learning_rate": 5.540104412887276e-06,
"loss": 2.6141,
"step": 489000
},
{
"epoch": 2.704763588743321,
"grad_norm": 5.521638870239258,
"learning_rate": 5.488941688820126e-06,
"loss": 2.5994,
"step": 489500
},
{
"epoch": 2.7075263707542945,
"grad_norm": 7.759128093719482,
"learning_rate": 5.437778964752976e-06,
"loss": 2.6212,
"step": 490000
},
{
"epoch": 2.7075263707542945,
"eval_runtime": 1444.9579,
"eval_samples_per_second": 250.494,
"eval_steps_per_second": 31.312,
"step": 490000
},
{
"epoch": 2.7102891527652684,
"grad_norm": 7.741880893707275,
"learning_rate": 5.386616240685826e-06,
"loss": 2.6377,
"step": 490500
},
{
"epoch": 2.7130519347762423,
"grad_norm": 6.901477813720703,
"learning_rate": 5.3355558420668105e-06,
"loss": 2.6113,
"step": 491000
},
{
"epoch": 2.715814716787216,
"grad_norm": 4.808909893035889,
"learning_rate": 5.284393117999661e-06,
"loss": 2.6073,
"step": 491500
},
{
"epoch": 2.7185774987981897,
"grad_norm": 5.946444034576416,
"learning_rate": 5.23323039393251e-06,
"loss": 2.5861,
"step": 492000
},
{
"epoch": 2.7213402808091636,
"grad_norm": 6.473993301391602,
"learning_rate": 5.1820676698653605e-06,
"loss": 2.5685,
"step": 492500
},
{
"epoch": 2.7241030628201375,
"grad_norm": 5.835826873779297,
"learning_rate": 5.131007271246345e-06,
"loss": 2.5697,
"step": 493000
},
{
"epoch": 2.726865844831111,
"grad_norm": 6.624295711517334,
"learning_rate": 5.079844547179195e-06,
"loss": 2.5782,
"step": 493500
},
{
"epoch": 2.729628626842085,
"grad_norm": 9.765020370483398,
"learning_rate": 5.028681823112045e-06,
"loss": 2.6423,
"step": 494000
},
{
"epoch": 2.7323914088530588,
"grad_norm": 7.017053127288818,
"learning_rate": 4.9775190990448946e-06,
"loss": 2.6669,
"step": 494500
},
{
"epoch": 2.7351541908640327,
"grad_norm": 6.12160587310791,
"learning_rate": 4.926356374977745e-06,
"loss": 2.6144,
"step": 495000
},
{
"epoch": 2.7351541908640327,
"eval_runtime": 1425.3262,
"eval_samples_per_second": 253.944,
"eval_steps_per_second": 31.744,
"step": 495000
},
{
"epoch": 2.737916972875006,
"grad_norm": 7.875446796417236,
"learning_rate": 4.875398301806864e-06,
"loss": 2.6609,
"step": 495500
},
{
"epoch": 2.74067975488598,
"grad_norm": 5.206502914428711,
"learning_rate": 4.824235577739713e-06,
"loss": 2.6007,
"step": 496000
},
{
"epoch": 2.7434425368969535,
"grad_norm": 6.697471618652344,
"learning_rate": 4.773072853672563e-06,
"loss": 2.6205,
"step": 496500
},
{
"epoch": 2.7462053189079274,
"grad_norm": 6.960028648376465,
"learning_rate": 4.721910129605413e-06,
"loss": 2.5956,
"step": 497000
},
{
"epoch": 2.7489681009189013,
"grad_norm": 5.795044422149658,
"learning_rate": 4.670747405538263e-06,
"loss": 2.6254,
"step": 497500
},
{
"epoch": 2.751730882929875,
"grad_norm": 5.511195182800293,
"learning_rate": 4.6195846814711125e-06,
"loss": 2.6216,
"step": 498000
},
{
"epoch": 2.7544936649408487,
"grad_norm": 7.889344215393066,
"learning_rate": 4.568421957403963e-06,
"loss": 2.5932,
"step": 498500
},
{
"epoch": 2.7572564469518226,
"grad_norm": 6.596147060394287,
"learning_rate": 4.517259233336812e-06,
"loss": 2.6322,
"step": 499000
},
{
"epoch": 2.7600192289627965,
"grad_norm": 8.392708778381348,
"learning_rate": 4.466198834717797e-06,
"loss": 2.6114,
"step": 499500
},
{
"epoch": 2.76278201097377,
"grad_norm": 5.404835224151611,
"learning_rate": 4.4150361106506475e-06,
"loss": 2.6497,
"step": 500000
},
{
"epoch": 2.76278201097377,
"eval_runtime": 1379.3919,
"eval_samples_per_second": 262.4,
"eval_steps_per_second": 32.801,
"step": 500000
},
{
"epoch": 2.765544792984744,
"grad_norm": 10.54916000366211,
"learning_rate": 4.363873386583497e-06,
"loss": 2.6555,
"step": 500500
},
{
"epoch": 2.768307574995718,
"grad_norm": 7.687312602996826,
"learning_rate": 4.312710662516347e-06,
"loss": 2.611,
"step": 501000
},
{
"epoch": 2.7710703570066917,
"grad_norm": 5.376524448394775,
"learning_rate": 4.261547938449197e-06,
"loss": 2.686,
"step": 501500
},
{
"epoch": 2.773833139017665,
"grad_norm": 6.104116439819336,
"learning_rate": 4.210487539830181e-06,
"loss": 2.5986,
"step": 502000
},
{
"epoch": 2.776595921028639,
"grad_norm": 6.0707597732543945,
"learning_rate": 4.15932481576303e-06,
"loss": 2.6303,
"step": 502500
},
{
"epoch": 2.7793587030396125,
"grad_norm": 7.738794803619385,
"learning_rate": 4.1081620916958805e-06,
"loss": 2.6115,
"step": 503000
},
{
"epoch": 2.7821214850505864,
"grad_norm": 6.483746528625488,
"learning_rate": 4.056999367628731e-06,
"loss": 2.6179,
"step": 503500
},
{
"epoch": 2.7848842670615603,
"grad_norm": 8.825859069824219,
"learning_rate": 4.005938969009715e-06,
"loss": 2.6217,
"step": 504000
},
{
"epoch": 2.7876470490725342,
"grad_norm": 6.525907039642334,
"learning_rate": 3.954776244942565e-06,
"loss": 2.6415,
"step": 504500
},
{
"epoch": 2.7904098310835077,
"grad_norm": 6.188871383666992,
"learning_rate": 3.903613520875415e-06,
"loss": 2.629,
"step": 505000
},
{
"epoch": 2.7904098310835077,
"eval_runtime": 1400.5744,
"eval_samples_per_second": 258.432,
"eval_steps_per_second": 32.305,
"step": 505000
},
{
"epoch": 2.7931726130944816,
"grad_norm": 5.803805828094482,
"learning_rate": 3.852450796808265e-06,
"loss": 2.6107,
"step": 505500
},
{
"epoch": 2.7959353951054555,
"grad_norm": 5.849481105804443,
"learning_rate": 3.8013903981892486e-06,
"loss": 2.6773,
"step": 506000
},
{
"epoch": 2.798698177116429,
"grad_norm": 5.97512674331665,
"learning_rate": 3.750227674122099e-06,
"loss": 2.622,
"step": 506500
},
{
"epoch": 2.801460959127403,
"grad_norm": 23.599891662597656,
"learning_rate": 3.6990649500549487e-06,
"loss": 2.6565,
"step": 507000
},
{
"epoch": 2.804223741138377,
"grad_norm": 8.95606803894043,
"learning_rate": 3.647902225987799e-06,
"loss": 2.6444,
"step": 507500
},
{
"epoch": 2.8069865231493507,
"grad_norm": 6.165973663330078,
"learning_rate": 3.5967395019206493e-06,
"loss": 2.6061,
"step": 508000
},
{
"epoch": 2.809749305160324,
"grad_norm": 5.899477958679199,
"learning_rate": 3.545679103301633e-06,
"loss": 2.6525,
"step": 508500
},
{
"epoch": 2.812512087171298,
"grad_norm": 12.357131958007812,
"learning_rate": 3.4945163792344833e-06,
"loss": 2.601,
"step": 509000
},
{
"epoch": 2.8152748691822715,
"grad_norm": 5.383533000946045,
"learning_rate": 3.4433536551673327e-06,
"loss": 2.6151,
"step": 509500
},
{
"epoch": 2.8180376511932455,
"grad_norm": 6.1412153244018555,
"learning_rate": 3.392190931100183e-06,
"loss": 2.6158,
"step": 510000
},
{
"epoch": 2.8180376511932455,
"eval_runtime": 1441.3572,
"eval_samples_per_second": 251.12,
"eval_steps_per_second": 31.391,
"step": 510000
},
{
"epoch": 2.8208004332042194,
"grad_norm": 9.799259185791016,
"learning_rate": 3.3411305324811675e-06,
"loss": 2.6548,
"step": 510500
},
{
"epoch": 2.8235632152151933,
"grad_norm": 6.84127140045166,
"learning_rate": 3.289967808414017e-06,
"loss": 2.6121,
"step": 511000
},
{
"epoch": 2.8263259972261667,
"grad_norm": 5.669933795928955,
"learning_rate": 3.2388050843468672e-06,
"loss": 2.647,
"step": 511500
},
{
"epoch": 2.8290887792371406,
"grad_norm": 4.693601608276367,
"learning_rate": 3.187744685727851e-06,
"loss": 2.6306,
"step": 512000
},
{
"epoch": 2.8318515612481145,
"grad_norm": 4.971369743347168,
"learning_rate": 3.1365819616607012e-06,
"loss": 2.6419,
"step": 512500
},
{
"epoch": 2.834614343259088,
"grad_norm": 6.460732460021973,
"learning_rate": 3.085419237593551e-06,
"loss": 2.5998,
"step": 513000
},
{
"epoch": 2.837377125270062,
"grad_norm": 7.711912155151367,
"learning_rate": 3.0342565135264013e-06,
"loss": 2.6082,
"step": 513500
},
{
"epoch": 2.840139907281036,
"grad_norm": 6.650845527648926,
"learning_rate": 2.983093789459251e-06,
"loss": 2.6067,
"step": 514000
},
{
"epoch": 2.8429026892920097,
"grad_norm": 6.2975664138793945,
"learning_rate": 2.931931065392101e-06,
"loss": 2.6717,
"step": 514500
},
{
"epoch": 2.845665471302983,
"grad_norm": 8.976877212524414,
"learning_rate": 2.880768341324951e-06,
"loss": 2.6582,
"step": 515000
},
{
"epoch": 2.845665471302983,
"eval_runtime": 1422.7904,
"eval_samples_per_second": 254.397,
"eval_steps_per_second": 31.8,
"step": 515000
},
{
"epoch": 2.848428253313957,
"grad_norm": 7.009723663330078,
"learning_rate": 2.8296056172578008e-06,
"loss": 2.6485,
"step": 515500
},
{
"epoch": 2.8511910353249306,
"grad_norm": 12.484458923339844,
"learning_rate": 2.7785452186387853e-06,
"loss": 2.5926,
"step": 516000
},
{
"epoch": 2.8539538173359045,
"grad_norm": 8.153068542480469,
"learning_rate": 2.727382494571635e-06,
"loss": 2.6376,
"step": 516500
},
{
"epoch": 2.8567165993468784,
"grad_norm": 8.453513145446777,
"learning_rate": 2.676219770504485e-06,
"loss": 2.5791,
"step": 517000
},
{
"epoch": 2.8594793813578523,
"grad_norm": 5.08599853515625,
"learning_rate": 2.6250570464373353e-06,
"loss": 2.6142,
"step": 517500
},
{
"epoch": 2.8622421633688258,
"grad_norm": 6.9135589599609375,
"learning_rate": 2.5740989732664537e-06,
"loss": 2.6435,
"step": 518000
},
{
"epoch": 2.8650049453797997,
"grad_norm": 7.915692329406738,
"learning_rate": 2.5229362491993036e-06,
"loss": 2.6459,
"step": 518500
},
{
"epoch": 2.8677677273907736,
"grad_norm": 6.149202823638916,
"learning_rate": 2.4717735251321534e-06,
"loss": 2.6444,
"step": 519000
},
{
"epoch": 2.870530509401747,
"grad_norm": 5.931761741638184,
"learning_rate": 2.4206108010650033e-06,
"loss": 2.6396,
"step": 519500
},
{
"epoch": 2.873293291412721,
"grad_norm": 7.582653045654297,
"learning_rate": 2.3694480769978536e-06,
"loss": 2.6285,
"step": 520000
},
{
"epoch": 2.873293291412721,
"eval_runtime": 1392.8266,
"eval_samples_per_second": 259.869,
"eval_steps_per_second": 32.484,
"step": 520000
},
{
"epoch": 2.876056073423695,
"grad_norm": 5.626070976257324,
"learning_rate": 2.3182853529307034e-06,
"loss": 2.5947,
"step": 520500
},
{
"epoch": 2.8788188554346688,
"grad_norm": 6.177218914031982,
"learning_rate": 2.2671226288635533e-06,
"loss": 2.6351,
"step": 521000
},
{
"epoch": 2.881581637445642,
"grad_norm": 7.875889778137207,
"learning_rate": 2.215959904796403e-06,
"loss": 2.6291,
"step": 521500
},
{
"epoch": 2.884344419456616,
"grad_norm": 6.308676242828369,
"learning_rate": 2.164797180729253e-06,
"loss": 2.5921,
"step": 522000
},
{
"epoch": 2.8871072014675896,
"grad_norm": 6.715153217315674,
"learning_rate": 2.1137367821102375e-06,
"loss": 2.6217,
"step": 522500
},
{
"epoch": 2.8898699834785635,
"grad_norm": 6.99116849899292,
"learning_rate": 2.0625740580430874e-06,
"loss": 2.645,
"step": 523000
},
{
"epoch": 2.8926327654895374,
"grad_norm": 7.715075969696045,
"learning_rate": 2.0114113339759372e-06,
"loss": 2.6156,
"step": 523500
},
{
"epoch": 2.8953955475005113,
"grad_norm": 8.207829475402832,
"learning_rate": 1.9602486099087875e-06,
"loss": 2.6027,
"step": 524000
},
{
"epoch": 2.8981583295114848,
"grad_norm": 6.43826150894165,
"learning_rate": 1.9091882112897712e-06,
"loss": 2.6053,
"step": 524500
},
{
"epoch": 2.9009211115224587,
"grad_norm": 9.398484230041504,
"learning_rate": 1.8580254872226213e-06,
"loss": 2.5925,
"step": 525000
},
{
"epoch": 2.9009211115224587,
"eval_runtime": 1412.1705,
"eval_samples_per_second": 256.31,
"eval_steps_per_second": 32.039,
"step": 525000
},
{
"epoch": 2.903683893533432,
"grad_norm": 6.568800926208496,
"learning_rate": 1.8068627631554713e-06,
"loss": 2.5867,
"step": 525500
},
{
"epoch": 2.906446675544406,
"grad_norm": 6.8241963386535645,
"learning_rate": 1.7557000390883212e-06,
"loss": 2.5929,
"step": 526000
},
{
"epoch": 2.90920945755538,
"grad_norm": 4.7916436195373535,
"learning_rate": 1.7046396404693056e-06,
"loss": 2.6441,
"step": 526500
},
{
"epoch": 2.911972239566354,
"grad_norm": 8.739401817321777,
"learning_rate": 1.6534769164021556e-06,
"loss": 2.6121,
"step": 527000
},
{
"epoch": 2.914735021577328,
"grad_norm": 8.750603675842285,
"learning_rate": 1.6023141923350055e-06,
"loss": 2.6541,
"step": 527500
},
{
"epoch": 2.9174978035883012,
"grad_norm": 7.22824239730835,
"learning_rate": 1.5511514682678553e-06,
"loss": 2.6455,
"step": 528000
},
{
"epoch": 2.920260585599275,
"grad_norm": 8.114001274108887,
"learning_rate": 1.4999887442007054e-06,
"loss": 2.5991,
"step": 528500
},
{
"epoch": 2.9230233676102486,
"grad_norm": 4.913358688354492,
"learning_rate": 1.4488260201335552e-06,
"loss": 2.6597,
"step": 529000
},
{
"epoch": 2.9257861496212225,
"grad_norm": 4.944784164428711,
"learning_rate": 1.397663296066405e-06,
"loss": 2.5788,
"step": 529500
},
{
"epoch": 2.9285489316321964,
"grad_norm": 6.435703754425049,
"learning_rate": 1.3465005719992551e-06,
"loss": 2.5839,
"step": 530000
},
{
"epoch": 2.9285489316321964,
"eval_runtime": 1465.041,
"eval_samples_per_second": 247.06,
"eval_steps_per_second": 30.883,
"step": 530000
},
{
"epoch": 2.9313117136431703,
"grad_norm": 6.146181583404541,
"learning_rate": 1.295337847932105e-06,
"loss": 2.5753,
"step": 530500
},
{
"epoch": 2.934074495654144,
"grad_norm": 6.585812568664551,
"learning_rate": 1.2442774493130894e-06,
"loss": 2.6388,
"step": 531000
},
{
"epoch": 2.9368372776651177,
"grad_norm": 5.040027618408203,
"learning_rate": 1.1931147252459392e-06,
"loss": 2.6404,
"step": 531500
},
{
"epoch": 2.939600059676091,
"grad_norm": 6.62531042098999,
"learning_rate": 1.141952001178789e-06,
"loss": 2.6357,
"step": 532000
},
{
"epoch": 2.942362841687065,
"grad_norm": 6.749676704406738,
"learning_rate": 1.0907892771116393e-06,
"loss": 2.6343,
"step": 532500
},
{
"epoch": 2.945125623698039,
"grad_norm": 6.896877765655518,
"learning_rate": 1.0397288784926235e-06,
"loss": 2.5909,
"step": 533000
},
{
"epoch": 2.947888405709013,
"grad_norm": 5.896803855895996,
"learning_rate": 9.885661544254733e-07,
"loss": 2.6058,
"step": 533500
},
{
"epoch": 2.9506511877199864,
"grad_norm": 7.961572170257568,
"learning_rate": 9.374034303583234e-07,
"loss": 2.5977,
"step": 534000
},
{
"epoch": 2.9534139697309603,
"grad_norm": 5.0403594970703125,
"learning_rate": 8.862407062911732e-07,
"loss": 2.6199,
"step": 534500
},
{
"epoch": 2.956176751741934,
"grad_norm": 5.320588111877441,
"learning_rate": 8.351803076721575e-07,
"loss": 2.6392,
"step": 535000
},
{
"epoch": 2.956176751741934,
"eval_runtime": 1384.9303,
"eval_samples_per_second": 261.351,
"eval_steps_per_second": 32.67,
"step": 535000
},
{
"epoch": 2.9589395337529076,
"grad_norm": 6.019600868225098,
"learning_rate": 7.840175836050074e-07,
"loss": 2.6634,
"step": 535500
},
{
"epoch": 2.9617023157638815,
"grad_norm": 8.975756645202637,
"learning_rate": 7.328548595378574e-07,
"loss": 2.6451,
"step": 536000
},
{
"epoch": 2.9644650977748555,
"grad_norm": 5.559593200683594,
"learning_rate": 6.816921354707074e-07,
"loss": 2.579,
"step": 536500
},
{
"epoch": 2.9672278797858294,
"grad_norm": 5.876176834106445,
"learning_rate": 6.305294114035572e-07,
"loss": 2.6417,
"step": 537000
},
{
"epoch": 2.969990661796803,
"grad_norm": 6.175969123840332,
"learning_rate": 5.793666873364073e-07,
"loss": 2.6182,
"step": 537500
},
{
"epoch": 2.9727534438077767,
"grad_norm": 7.48173713684082,
"learning_rate": 5.283062887173914e-07,
"loss": 2.6455,
"step": 538000
},
{
"epoch": 2.97551622581875,
"grad_norm": 7.331089496612549,
"learning_rate": 4.771435646502415e-07,
"loss": 2.625,
"step": 538500
},
{
"epoch": 2.978279007829724,
"grad_norm": 4.610020637512207,
"learning_rate": 4.259808405830914e-07,
"loss": 2.6334,
"step": 539000
},
{
"epoch": 2.981041789840698,
"grad_norm": 8.355249404907227,
"learning_rate": 3.748181165159413e-07,
"loss": 2.6034,
"step": 539500
},
{
"epoch": 2.983804571851672,
"grad_norm": 6.447065830230713,
"learning_rate": 3.2365539244879124e-07,
"loss": 2.6019,
"step": 540000
},
{
"epoch": 2.983804571851672,
"eval_runtime": 1372.3117,
"eval_samples_per_second": 263.754,
"eval_steps_per_second": 32.97,
"step": 540000
},
{
"epoch": 2.9865673538626454,
"grad_norm": 8.215147018432617,
"learning_rate": 2.724926683816412e-07,
"loss": 2.6186,
"step": 540500
},
{
"epoch": 2.9893301358736193,
"grad_norm": 5.153554439544678,
"learning_rate": 2.2132994431449112e-07,
"loss": 2.6318,
"step": 541000
},
{
"epoch": 2.992092917884593,
"grad_norm": 6.764389991760254,
"learning_rate": 1.7016722024734108e-07,
"loss": 2.6113,
"step": 541500
},
{
"epoch": 2.9948556998955667,
"grad_norm": 8.090177536010742,
"learning_rate": 1.1910682162832533e-07,
"loss": 2.6307,
"step": 542000
},
{
"epoch": 2.9976184819065406,
"grad_norm": 10.311904907226562,
"learning_rate": 6.794409756117527e-08,
"loss": 2.6157,
"step": 542500
}
],
"logging_steps": 500,
"max_steps": 542931,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.8505448008628067e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}