zhtw-en / trainer_state.json
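The JSON below is the raw Hugging Face Trainer state saved with this checkpoint: per-step training entries (loss, learning_rate, grad_norm, num_input_tokens_seen) plus periodic evaluation entries (eval_loss and throughput). A minimal sketch of how one might load it and pull out the loss curves and the recorded best metric — the local file path is an assumption; point it at wherever the downloaded trainer_state.json lives:

import json

# Assumed local path to the downloaded file.
with open("trainer_state.json", "r", encoding="utf-8") as f:
    state = json.load(f)

print("best metric:", state["best_metric"])
print("best checkpoint:", state["best_model_checkpoint"])

# Training-step entries carry "loss"; evaluation entries carry "eval_loss".
train_curve = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
eval_curve = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print("logged training points:", len(train_curve))
print("latest eval loss:", eval_curve[-1] if eval_curve else None)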
{
"best_global_step": 499760,
"best_metric": 0.9877662062644958,
"best_model_checkpoint": "/media/user/Expansion1/opus-mt-zhtw-en-google-translate3/checkpoint-499760",
"epoch": 5.0,
"eval_steps": 500,
"global_step": 499760,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005002401152553225,
"grad_norm": 11.430471420288086,
"learning_rate": 4.995007603649752e-05,
"loss": 2.0932,
"num_input_tokens_seen": 276616,
"step": 500
},
{
"epoch": 0.01000480230510645,
"grad_norm": 11.411553382873535,
"learning_rate": 4.990005202497199e-05,
"loss": 2.0509,
"num_input_tokens_seen": 545376,
"step": 1000
},
{
"epoch": 0.015007203457659676,
"grad_norm": 7.671140193939209,
"learning_rate": 4.985002801344646e-05,
"loss": 2.0225,
"num_input_tokens_seen": 812256,
"step": 1500
},
{
"epoch": 0.0200096046102129,
"grad_norm": 10.581878662109375,
"learning_rate": 4.9800004001920924e-05,
"loss": 1.9906,
"num_input_tokens_seen": 1089008,
"step": 2000
},
{
"epoch": 0.02501200576276613,
"grad_norm": 9.597127914428711,
"learning_rate": 4.974997999039539e-05,
"loss": 1.9507,
"num_input_tokens_seen": 1366424,
"step": 2500
},
{
"epoch": 0.030014406915319352,
"grad_norm": 8.959872245788574,
"learning_rate": 4.9699955978869863e-05,
"loss": 1.9346,
"num_input_tokens_seen": 1638376,
"step": 3000
},
{
"epoch": 0.03501680806787258,
"grad_norm": 8.698769569396973,
"learning_rate": 4.9649931967344327e-05,
"loss": 1.9318,
"num_input_tokens_seen": 1906088,
"step": 3500
},
{
"epoch": 0.0400192092204258,
"grad_norm": 9.283397674560547,
"learning_rate": 4.9599907955818796e-05,
"loss": 1.9044,
"num_input_tokens_seen": 2180336,
"step": 4000
},
{
"epoch": 0.04502161037297903,
"grad_norm": 8.18691635131836,
"learning_rate": 4.954988394429326e-05,
"loss": 1.8793,
"num_input_tokens_seen": 2446104,
"step": 4500
},
{
"epoch": 0.05002401152553226,
"grad_norm": 9.12602424621582,
"learning_rate": 4.949985993276773e-05,
"loss": 1.8827,
"num_input_tokens_seen": 2707352,
"step": 5000
},
{
"epoch": 0.05502641267808548,
"grad_norm": 7.385958194732666,
"learning_rate": 4.94498359212422e-05,
"loss": 1.8553,
"num_input_tokens_seen": 2980496,
"step": 5500
},
{
"epoch": 0.060028813830638704,
"grad_norm": 12.771719932556152,
"learning_rate": 4.939981190971666e-05,
"loss": 1.8257,
"num_input_tokens_seen": 3261104,
"step": 6000
},
{
"epoch": 0.06503121498319193,
"grad_norm": 7.724299430847168,
"learning_rate": 4.934978789819113e-05,
"loss": 1.82,
"num_input_tokens_seen": 3530600,
"step": 6500
},
{
"epoch": 0.07003361613574516,
"grad_norm": 10.434374809265137,
"learning_rate": 4.92997638866656e-05,
"loss": 1.8089,
"num_input_tokens_seen": 3798856,
"step": 7000
},
{
"epoch": 0.07503601728829838,
"grad_norm": 6.624173164367676,
"learning_rate": 4.924973987514007e-05,
"loss": 1.8068,
"num_input_tokens_seen": 4068760,
"step": 7500
},
{
"epoch": 0.0800384184408516,
"grad_norm": 7.316147327423096,
"learning_rate": 4.9199715863614536e-05,
"loss": 1.7867,
"num_input_tokens_seen": 4337376,
"step": 8000
},
{
"epoch": 0.08504081959340483,
"grad_norm": 7.759427547454834,
"learning_rate": 4.9149691852089006e-05,
"loss": 1.7814,
"num_input_tokens_seen": 4610160,
"step": 8500
},
{
"epoch": 0.09004322074595807,
"grad_norm": 7.609263896942139,
"learning_rate": 4.9099667840563476e-05,
"loss": 1.7981,
"num_input_tokens_seen": 4885232,
"step": 9000
},
{
"epoch": 0.09504562189851129,
"grad_norm": 6.2129669189453125,
"learning_rate": 4.904964382903794e-05,
"loss": 1.7689,
"num_input_tokens_seen": 5155216,
"step": 9500
},
{
"epoch": 0.10004802305106451,
"grad_norm": 6.697121620178223,
"learning_rate": 4.899961981751241e-05,
"loss": 1.7388,
"num_input_tokens_seen": 5428160,
"step": 10000
},
{
"epoch": 0.10505042420361774,
"grad_norm": 10.624402046203613,
"learning_rate": 4.894959580598688e-05,
"loss": 1.7443,
"num_input_tokens_seen": 5692136,
"step": 10500
},
{
"epoch": 0.11005282535617096,
"grad_norm": 8.177070617675781,
"learning_rate": 4.889957179446135e-05,
"loss": 1.7421,
"num_input_tokens_seen": 5965544,
"step": 11000
},
{
"epoch": 0.11505522650872418,
"grad_norm": 9.939962387084961,
"learning_rate": 4.884954778293581e-05,
"loss": 1.7128,
"num_input_tokens_seen": 6230272,
"step": 11500
},
{
"epoch": 0.12005762766127741,
"grad_norm": 8.003973007202148,
"learning_rate": 4.8799523771410275e-05,
"loss": 1.7197,
"num_input_tokens_seen": 6505160,
"step": 12000
},
{
"epoch": 0.12506002881383063,
"grad_norm": 6.709536075592041,
"learning_rate": 4.8749499759884745e-05,
"loss": 1.7021,
"num_input_tokens_seen": 6782752,
"step": 12500
},
{
"epoch": 0.13006242996638387,
"grad_norm": 7.609007835388184,
"learning_rate": 4.8699475748359215e-05,
"loss": 1.7083,
"num_input_tokens_seen": 7059144,
"step": 13000
},
{
"epoch": 0.13506483111893708,
"grad_norm": 7.746858596801758,
"learning_rate": 4.8649451736833685e-05,
"loss": 1.709,
"num_input_tokens_seen": 7327608,
"step": 13500
},
{
"epoch": 0.14006723227149032,
"grad_norm": 9.576034545898438,
"learning_rate": 4.859942772530815e-05,
"loss": 1.7204,
"num_input_tokens_seen": 7598784,
"step": 14000
},
{
"epoch": 0.14506963342404355,
"grad_norm": 7.995354175567627,
"learning_rate": 4.854940371378262e-05,
"loss": 1.6987,
"num_input_tokens_seen": 7866144,
"step": 14500
},
{
"epoch": 0.15007203457659676,
"grad_norm": 7.990232944488525,
"learning_rate": 4.849937970225709e-05,
"loss": 1.695,
"num_input_tokens_seen": 8132240,
"step": 15000
},
{
"epoch": 0.15507443572915,
"grad_norm": 6.7087507247924805,
"learning_rate": 4.844935569073155e-05,
"loss": 1.6766,
"num_input_tokens_seen": 8400496,
"step": 15500
},
{
"epoch": 0.1600768368817032,
"grad_norm": 6.8279900550842285,
"learning_rate": 4.839933167920602e-05,
"loss": 1.6662,
"num_input_tokens_seen": 8666120,
"step": 16000
},
{
"epoch": 0.16507923803425645,
"grad_norm": 8.427155494689941,
"learning_rate": 4.834930766768049e-05,
"loss": 1.6781,
"num_input_tokens_seen": 8935352,
"step": 16500
},
{
"epoch": 0.17008163918680966,
"grad_norm": 8.360432624816895,
"learning_rate": 4.829928365615496e-05,
"loss": 1.6682,
"num_input_tokens_seen": 9209648,
"step": 17000
},
{
"epoch": 0.1750840403393629,
"grad_norm": 7.38226842880249,
"learning_rate": 4.8249259644629424e-05,
"loss": 1.6475,
"num_input_tokens_seen": 9475336,
"step": 17500
},
{
"epoch": 0.18008644149191613,
"grad_norm": 8.226577758789062,
"learning_rate": 4.819923563310389e-05,
"loss": 1.6668,
"num_input_tokens_seen": 9752088,
"step": 18000
},
{
"epoch": 0.18508884264446934,
"grad_norm": 9.053323745727539,
"learning_rate": 4.8149211621578364e-05,
"loss": 1.6615,
"num_input_tokens_seen": 10021568,
"step": 18500
},
{
"epoch": 0.19009124379702258,
"grad_norm": 10.050972938537598,
"learning_rate": 4.809918761005283e-05,
"loss": 1.6453,
"num_input_tokens_seen": 10294200,
"step": 19000
},
{
"epoch": 0.1950936449495758,
"grad_norm": 12.779481887817383,
"learning_rate": 4.80491635985273e-05,
"loss": 1.6305,
"num_input_tokens_seen": 10563664,
"step": 19500
},
{
"epoch": 0.20009604610212903,
"grad_norm": 9.113444328308105,
"learning_rate": 4.799913958700176e-05,
"loss": 1.6374,
"num_input_tokens_seen": 10833928,
"step": 20000
},
{
"epoch": 0.20509844725468224,
"grad_norm": 9.230701446533203,
"learning_rate": 4.794911557547624e-05,
"loss": 1.633,
"num_input_tokens_seen": 11098416,
"step": 20500
},
{
"epoch": 0.21010084840723547,
"grad_norm": 7.994055271148682,
"learning_rate": 4.78990915639507e-05,
"loss": 1.6267,
"num_input_tokens_seen": 11368280,
"step": 21000
},
{
"epoch": 0.2151032495597887,
"grad_norm": 8.182880401611328,
"learning_rate": 4.784906755242516e-05,
"loss": 1.6199,
"num_input_tokens_seen": 11634848,
"step": 21500
},
{
"epoch": 0.22010565071234192,
"grad_norm": 8.053507804870605,
"learning_rate": 4.779904354089963e-05,
"loss": 1.6318,
"num_input_tokens_seen": 11912456,
"step": 22000
},
{
"epoch": 0.22510805186489516,
"grad_norm": 8.162029266357422,
"learning_rate": 4.77490195293741e-05,
"loss": 1.6156,
"num_input_tokens_seen": 12177848,
"step": 22500
},
{
"epoch": 0.23011045301744837,
"grad_norm": 7.447554588317871,
"learning_rate": 4.769899551784857e-05,
"loss": 1.6181,
"num_input_tokens_seen": 12448808,
"step": 23000
},
{
"epoch": 0.2351128541700016,
"grad_norm": 8.473464012145996,
"learning_rate": 4.7648971506323036e-05,
"loss": 1.6544,
"num_input_tokens_seen": 12724056,
"step": 23500
},
{
"epoch": 0.24011525532255482,
"grad_norm": 8.162822723388672,
"learning_rate": 4.7598947494797506e-05,
"loss": 1.6097,
"num_input_tokens_seen": 12992248,
"step": 24000
},
{
"epoch": 0.24511765647510805,
"grad_norm": 8.350125312805176,
"learning_rate": 4.7548923483271976e-05,
"loss": 1.5977,
"num_input_tokens_seen": 13267848,
"step": 24500
},
{
"epoch": 0.25012005762766126,
"grad_norm": 6.6851091384887695,
"learning_rate": 4.749889947174644e-05,
"loss": 1.6008,
"num_input_tokens_seen": 13544424,
"step": 25000
},
{
"epoch": 0.2551224587802145,
"grad_norm": 8.89284610748291,
"learning_rate": 4.744887546022091e-05,
"loss": 1.6099,
"num_input_tokens_seen": 13818272,
"step": 25500
},
{
"epoch": 0.26012485993276774,
"grad_norm": 8.554915428161621,
"learning_rate": 4.739885144869537e-05,
"loss": 1.6121,
"num_input_tokens_seen": 14088432,
"step": 26000
},
{
"epoch": 0.265127261085321,
"grad_norm": 7.6910529136657715,
"learning_rate": 4.734882743716985e-05,
"loss": 1.6073,
"num_input_tokens_seen": 14362896,
"step": 26500
},
{
"epoch": 0.27012966223787416,
"grad_norm": 6.856411457061768,
"learning_rate": 4.729880342564431e-05,
"loss": 1.604,
"num_input_tokens_seen": 14629232,
"step": 27000
},
{
"epoch": 0.2751320633904274,
"grad_norm": 8.767107009887695,
"learning_rate": 4.7248779414118775e-05,
"loss": 1.587,
"num_input_tokens_seen": 14904336,
"step": 27500
},
{
"epoch": 0.28013446454298063,
"grad_norm": 6.474203586578369,
"learning_rate": 4.7198755402593245e-05,
"loss": 1.5936,
"num_input_tokens_seen": 15171200,
"step": 28000
},
{
"epoch": 0.28513686569553387,
"grad_norm": 10.540902137756348,
"learning_rate": 4.7148731391067715e-05,
"loss": 1.5861,
"num_input_tokens_seen": 15438392,
"step": 28500
},
{
"epoch": 0.2901392668480871,
"grad_norm": 9.134960174560547,
"learning_rate": 4.7098707379542185e-05,
"loss": 1.5717,
"num_input_tokens_seen": 15707768,
"step": 29000
},
{
"epoch": 0.2951416680006403,
"grad_norm": 6.669952869415283,
"learning_rate": 4.704868336801665e-05,
"loss": 1.5989,
"num_input_tokens_seen": 15980896,
"step": 29500
},
{
"epoch": 0.3001440691531935,
"grad_norm": 6.744385242462158,
"learning_rate": 4.699865935649112e-05,
"loss": 1.5858,
"num_input_tokens_seen": 16249024,
"step": 30000
},
{
"epoch": 0.30514647030574676,
"grad_norm": 8.088431358337402,
"learning_rate": 4.694863534496559e-05,
"loss": 1.5612,
"num_input_tokens_seen": 16516448,
"step": 30500
},
{
"epoch": 0.3101488714583,
"grad_norm": 8.115087509155273,
"learning_rate": 4.689861133344005e-05,
"loss": 1.5995,
"num_input_tokens_seen": 16788416,
"step": 31000
},
{
"epoch": 0.3151512726108532,
"grad_norm": 8.037360191345215,
"learning_rate": 4.684858732191452e-05,
"loss": 1.5787,
"num_input_tokens_seen": 17057432,
"step": 31500
},
{
"epoch": 0.3201536737634064,
"grad_norm": 7.91753625869751,
"learning_rate": 4.6798563310388984e-05,
"loss": 1.5642,
"num_input_tokens_seen": 17324440,
"step": 32000
},
{
"epoch": 0.32515607491595966,
"grad_norm": 8.645842552185059,
"learning_rate": 4.674853929886346e-05,
"loss": 1.5685,
"num_input_tokens_seen": 17597472,
"step": 32500
},
{
"epoch": 0.3301584760685129,
"grad_norm": 7.731500148773193,
"learning_rate": 4.6698515287337924e-05,
"loss": 1.568,
"num_input_tokens_seen": 17862696,
"step": 33000
},
{
"epoch": 0.33516087722106613,
"grad_norm": 8.492512702941895,
"learning_rate": 4.6648491275812394e-05,
"loss": 1.5639,
"num_input_tokens_seen": 18132104,
"step": 33500
},
{
"epoch": 0.3401632783736193,
"grad_norm": 6.294936656951904,
"learning_rate": 4.659846726428686e-05,
"loss": 1.5651,
"num_input_tokens_seen": 18398680,
"step": 34000
},
{
"epoch": 0.34516567952617255,
"grad_norm": 6.749844551086426,
"learning_rate": 4.654844325276133e-05,
"loss": 1.5501,
"num_input_tokens_seen": 18664584,
"step": 34500
},
{
"epoch": 0.3501680806787258,
"grad_norm": 8.394216537475586,
"learning_rate": 4.64984192412358e-05,
"loss": 1.5655,
"num_input_tokens_seen": 18935000,
"step": 35000
},
{
"epoch": 0.35517048183127903,
"grad_norm": 7.8091607093811035,
"learning_rate": 4.644839522971026e-05,
"loss": 1.5545,
"num_input_tokens_seen": 19215376,
"step": 35500
},
{
"epoch": 0.36017288298383227,
"grad_norm": 8.398904800415039,
"learning_rate": 4.639837121818473e-05,
"loss": 1.5482,
"num_input_tokens_seen": 19493104,
"step": 36000
},
{
"epoch": 0.36517528413638545,
"grad_norm": 20.516576766967773,
"learning_rate": 4.63483472066592e-05,
"loss": 1.546,
"num_input_tokens_seen": 19761160,
"step": 36500
},
{
"epoch": 0.3701776852889387,
"grad_norm": 6.954863548278809,
"learning_rate": 4.629832319513366e-05,
"loss": 1.5581,
"num_input_tokens_seen": 20034376,
"step": 37000
},
{
"epoch": 0.3751800864414919,
"grad_norm": 7.736358642578125,
"learning_rate": 4.624829918360813e-05,
"loss": 1.5477,
"num_input_tokens_seen": 20303800,
"step": 37500
},
{
"epoch": 0.38018248759404516,
"grad_norm": 9.256717681884766,
"learning_rate": 4.61982751720826e-05,
"loss": 1.5379,
"num_input_tokens_seen": 20577472,
"step": 38000
},
{
"epoch": 0.38518488874659834,
"grad_norm": 7.987185478210449,
"learning_rate": 4.614825116055707e-05,
"loss": 1.5411,
"num_input_tokens_seen": 20844712,
"step": 38500
},
{
"epoch": 0.3901872898991516,
"grad_norm": 6.768152713775635,
"learning_rate": 4.6098227149031536e-05,
"loss": 1.5317,
"num_input_tokens_seen": 21112208,
"step": 39000
},
{
"epoch": 0.3951896910517048,
"grad_norm": 8.385108947753906,
"learning_rate": 4.6048203137506006e-05,
"loss": 1.522,
"num_input_tokens_seen": 21378704,
"step": 39500
},
{
"epoch": 0.40019209220425805,
"grad_norm": 6.166186809539795,
"learning_rate": 4.5998179125980476e-05,
"loss": 1.5167,
"num_input_tokens_seen": 21648840,
"step": 40000
},
{
"epoch": 0.4051944933568113,
"grad_norm": 6.178369045257568,
"learning_rate": 4.594815511445494e-05,
"loss": 1.5161,
"num_input_tokens_seen": 21917272,
"step": 40500
},
{
"epoch": 0.4101968945093645,
"grad_norm": 8.063447952270508,
"learning_rate": 4.589813110292941e-05,
"loss": 1.5335,
"num_input_tokens_seen": 22185960,
"step": 41000
},
{
"epoch": 0.4151992956619177,
"grad_norm": 9.296152114868164,
"learning_rate": 4.584810709140387e-05,
"loss": 1.5212,
"num_input_tokens_seen": 22451608,
"step": 41500
},
{
"epoch": 0.42020169681447095,
"grad_norm": 7.5216965675354,
"learning_rate": 4.579808307987834e-05,
"loss": 1.5275,
"num_input_tokens_seen": 22720912,
"step": 42000
},
{
"epoch": 0.4252040979670242,
"grad_norm": 7.390750408172607,
"learning_rate": 4.574805906835281e-05,
"loss": 1.5063,
"num_input_tokens_seen": 22985424,
"step": 42500
},
{
"epoch": 0.4302064991195774,
"grad_norm": 7.005786895751953,
"learning_rate": 4.569803505682728e-05,
"loss": 1.5166,
"num_input_tokens_seen": 23258384,
"step": 43000
},
{
"epoch": 0.4352089002721306,
"grad_norm": 8.231675148010254,
"learning_rate": 4.5648011045301745e-05,
"loss": 1.5198,
"num_input_tokens_seen": 23531648,
"step": 43500
},
{
"epoch": 0.44021130142468384,
"grad_norm": 8.228809356689453,
"learning_rate": 4.5597987033776215e-05,
"loss": 1.4984,
"num_input_tokens_seen": 23798648,
"step": 44000
},
{
"epoch": 0.4452137025772371,
"grad_norm": 7.101296901702881,
"learning_rate": 4.5547963022250685e-05,
"loss": 1.5261,
"num_input_tokens_seen": 24061032,
"step": 44500
},
{
"epoch": 0.4502161037297903,
"grad_norm": 7.954161167144775,
"learning_rate": 4.549793901072515e-05,
"loss": 1.5011,
"num_input_tokens_seen": 24329816,
"step": 45000
},
{
"epoch": 0.4552185048823435,
"grad_norm": 6.646183490753174,
"learning_rate": 4.544791499919962e-05,
"loss": 1.5281,
"num_input_tokens_seen": 24600296,
"step": 45500
},
{
"epoch": 0.46022090603489674,
"grad_norm": 6.38918924331665,
"learning_rate": 4.539789098767409e-05,
"loss": 1.5084,
"num_input_tokens_seen": 24864680,
"step": 46000
},
{
"epoch": 0.46522330718745,
"grad_norm": 9.28714656829834,
"learning_rate": 4.534786697614855e-05,
"loss": 1.5132,
"num_input_tokens_seen": 25138944,
"step": 46500
},
{
"epoch": 0.4702257083400032,
"grad_norm": 7.2790937423706055,
"learning_rate": 4.529784296462302e-05,
"loss": 1.4851,
"num_input_tokens_seen": 25405872,
"step": 47000
},
{
"epoch": 0.47522810949255645,
"grad_norm": 6.309217929840088,
"learning_rate": 4.5247818953097484e-05,
"loss": 1.4966,
"num_input_tokens_seen": 25672064,
"step": 47500
},
{
"epoch": 0.48023051064510963,
"grad_norm": 5.927547931671143,
"learning_rate": 4.519779494157196e-05,
"loss": 1.4871,
"num_input_tokens_seen": 25944256,
"step": 48000
},
{
"epoch": 0.48523291179766287,
"grad_norm": 7.525789260864258,
"learning_rate": 4.5147770930046424e-05,
"loss": 1.5077,
"num_input_tokens_seen": 26218880,
"step": 48500
},
{
"epoch": 0.4902353129502161,
"grad_norm": 7.0813517570495605,
"learning_rate": 4.5097746918520894e-05,
"loss": 1.5036,
"num_input_tokens_seen": 26492264,
"step": 49000
},
{
"epoch": 0.49523771410276934,
"grad_norm": 10.278409004211426,
"learning_rate": 4.504772290699536e-05,
"loss": 1.4888,
"num_input_tokens_seen": 26761928,
"step": 49500
},
{
"epoch": 0.5002401152553225,
"grad_norm": 10.73009967803955,
"learning_rate": 4.499769889546983e-05,
"loss": 1.4941,
"num_input_tokens_seen": 27033072,
"step": 50000
},
{
"epoch": 0.5052425164078758,
"grad_norm": 8.047172546386719,
"learning_rate": 4.49476748839443e-05,
"loss": 1.4882,
"num_input_tokens_seen": 27297448,
"step": 50500
},
{
"epoch": 0.510244917560429,
"grad_norm": 7.059236526489258,
"learning_rate": 4.489765087241876e-05,
"loss": 1.5093,
"num_input_tokens_seen": 27572696,
"step": 51000
},
{
"epoch": 0.5152473187129822,
"grad_norm": 6.407097816467285,
"learning_rate": 4.484762686089323e-05,
"loss": 1.4841,
"num_input_tokens_seen": 27850272,
"step": 51500
},
{
"epoch": 0.5202497198655355,
"grad_norm": 7.751036167144775,
"learning_rate": 4.47976028493677e-05,
"loss": 1.4797,
"num_input_tokens_seen": 28125984,
"step": 52000
},
{
"epoch": 0.5252521210180887,
"grad_norm": 7.869476795196533,
"learning_rate": 4.474757883784217e-05,
"loss": 1.4826,
"num_input_tokens_seen": 28400064,
"step": 52500
},
{
"epoch": 0.530254522170642,
"grad_norm": 6.702480316162109,
"learning_rate": 4.4697554826316633e-05,
"loss": 1.4888,
"num_input_tokens_seen": 28668520,
"step": 53000
},
{
"epoch": 0.5352569233231952,
"grad_norm": 5.916720867156982,
"learning_rate": 4.46475308147911e-05,
"loss": 1.4984,
"num_input_tokens_seen": 28938448,
"step": 53500
},
{
"epoch": 0.5402593244757483,
"grad_norm": 8.540026664733887,
"learning_rate": 4.459750680326557e-05,
"loss": 1.5038,
"num_input_tokens_seen": 29207464,
"step": 54000
},
{
"epoch": 0.5452617256283016,
"grad_norm": 6.425217151641846,
"learning_rate": 4.4547482791740036e-05,
"loss": 1.4771,
"num_input_tokens_seen": 29476896,
"step": 54500
},
{
"epoch": 0.5502641267808548,
"grad_norm": 5.669241428375244,
"learning_rate": 4.4497458780214506e-05,
"loss": 1.4618,
"num_input_tokens_seen": 29744952,
"step": 55000
},
{
"epoch": 0.555266527933408,
"grad_norm": 6.841111660003662,
"learning_rate": 4.444743476868897e-05,
"loss": 1.4899,
"num_input_tokens_seen": 30012448,
"step": 55500
},
{
"epoch": 0.5602689290859613,
"grad_norm": 6.233266353607178,
"learning_rate": 4.439741075716344e-05,
"loss": 1.5034,
"num_input_tokens_seen": 30282976,
"step": 56000
},
{
"epoch": 0.5652713302385145,
"grad_norm": 6.895115375518799,
"learning_rate": 4.434738674563791e-05,
"loss": 1.493,
"num_input_tokens_seen": 30555656,
"step": 56500
},
{
"epoch": 0.5702737313910677,
"grad_norm": 6.7635674476623535,
"learning_rate": 4.429736273411237e-05,
"loss": 1.4729,
"num_input_tokens_seen": 30825952,
"step": 57000
},
{
"epoch": 0.575276132543621,
"grad_norm": 8.052691459655762,
"learning_rate": 4.424733872258684e-05,
"loss": 1.4775,
"num_input_tokens_seen": 31095152,
"step": 57500
},
{
"epoch": 0.5802785336961742,
"grad_norm": 7.0079545974731445,
"learning_rate": 4.419731471106131e-05,
"loss": 1.4726,
"num_input_tokens_seen": 31368672,
"step": 58000
},
{
"epoch": 0.5852809348487273,
"grad_norm": 5.913256645202637,
"learning_rate": 4.414729069953578e-05,
"loss": 1.4758,
"num_input_tokens_seen": 31642464,
"step": 58500
},
{
"epoch": 0.5902833360012806,
"grad_norm": 9.248602867126465,
"learning_rate": 4.4097266688010246e-05,
"loss": 1.4534,
"num_input_tokens_seen": 31913336,
"step": 59000
},
{
"epoch": 0.5952857371538338,
"grad_norm": 6.6079936027526855,
"learning_rate": 4.4047242676484716e-05,
"loss": 1.4581,
"num_input_tokens_seen": 32186344,
"step": 59500
},
{
"epoch": 0.600288138306387,
"grad_norm": 6.47177791595459,
"learning_rate": 4.3997218664959185e-05,
"loss": 1.4505,
"num_input_tokens_seen": 32460808,
"step": 60000
},
{
"epoch": 0.6052905394589403,
"grad_norm": 6.106129169464111,
"learning_rate": 4.394719465343365e-05,
"loss": 1.4701,
"num_input_tokens_seen": 32728200,
"step": 60500
},
{
"epoch": 0.6102929406114935,
"grad_norm": 8.406516075134277,
"learning_rate": 4.389717064190812e-05,
"loss": 1.4591,
"num_input_tokens_seen": 33005896,
"step": 61000
},
{
"epoch": 0.6152953417640468,
"grad_norm": 9.166064262390137,
"learning_rate": 4.384714663038258e-05,
"loss": 1.4875,
"num_input_tokens_seen": 33269392,
"step": 61500
},
{
"epoch": 0.6202977429166,
"grad_norm": 7.39436149597168,
"learning_rate": 4.379712261885706e-05,
"loss": 1.4563,
"num_input_tokens_seen": 33541424,
"step": 62000
},
{
"epoch": 0.6253001440691532,
"grad_norm": 5.612057685852051,
"learning_rate": 4.374709860733152e-05,
"loss": 1.4761,
"num_input_tokens_seen": 33806192,
"step": 62500
},
{
"epoch": 0.6303025452217064,
"grad_norm": 7.853974342346191,
"learning_rate": 4.3697074595805985e-05,
"loss": 1.4617,
"num_input_tokens_seen": 34076920,
"step": 63000
},
{
"epoch": 0.6353049463742596,
"grad_norm": 8.041677474975586,
"learning_rate": 4.3647050584280455e-05,
"loss": 1.442,
"num_input_tokens_seen": 34340464,
"step": 63500
},
{
"epoch": 0.6403073475268128,
"grad_norm": 7.188261985778809,
"learning_rate": 4.3597026572754925e-05,
"loss": 1.4746,
"num_input_tokens_seen": 34608464,
"step": 64000
},
{
"epoch": 0.6453097486793661,
"grad_norm": 7.935949802398682,
"learning_rate": 4.3547002561229395e-05,
"loss": 1.4655,
"num_input_tokens_seen": 34878936,
"step": 64500
},
{
"epoch": 0.6503121498319193,
"grad_norm": 6.211294651031494,
"learning_rate": 4.349697854970386e-05,
"loss": 1.4481,
"num_input_tokens_seen": 35148000,
"step": 65000
},
{
"epoch": 0.6553145509844726,
"grad_norm": 8.786713600158691,
"learning_rate": 4.344695453817833e-05,
"loss": 1.4541,
"num_input_tokens_seen": 35425104,
"step": 65500
},
{
"epoch": 0.6603169521370258,
"grad_norm": 7.866344928741455,
"learning_rate": 4.33969305266528e-05,
"loss": 1.4537,
"num_input_tokens_seen": 35693952,
"step": 66000
},
{
"epoch": 0.665319353289579,
"grad_norm": 7.549289703369141,
"learning_rate": 4.334690651512726e-05,
"loss": 1.4569,
"num_input_tokens_seen": 35966608,
"step": 66500
},
{
"epoch": 0.6703217544421323,
"grad_norm": 6.588021278381348,
"learning_rate": 4.329688250360173e-05,
"loss": 1.4477,
"num_input_tokens_seen": 36238976,
"step": 67000
},
{
"epoch": 0.6753241555946854,
"grad_norm": 7.3665852546691895,
"learning_rate": 4.32468584920762e-05,
"loss": 1.4524,
"num_input_tokens_seen": 36504184,
"step": 67500
},
{
"epoch": 0.6803265567472386,
"grad_norm": 8.99618911743164,
"learning_rate": 4.319683448055067e-05,
"loss": 1.4559,
"num_input_tokens_seen": 36767640,
"step": 68000
},
{
"epoch": 0.6853289578997919,
"grad_norm": 8.354264259338379,
"learning_rate": 4.3146810469025134e-05,
"loss": 1.4552,
"num_input_tokens_seen": 37033640,
"step": 68500
},
{
"epoch": 0.6903313590523451,
"grad_norm": 6.712357521057129,
"learning_rate": 4.30967864574996e-05,
"loss": 1.4428,
"num_input_tokens_seen": 37301576,
"step": 69000
},
{
"epoch": 0.6953337602048983,
"grad_norm": 6.98289680480957,
"learning_rate": 4.3046762445974074e-05,
"loss": 1.4389,
"num_input_tokens_seen": 37575040,
"step": 69500
},
{
"epoch": 0.7003361613574516,
"grad_norm": 7.0615410804748535,
"learning_rate": 4.299673843444854e-05,
"loss": 1.4323,
"num_input_tokens_seen": 37844304,
"step": 70000
},
{
"epoch": 0.7053385625100048,
"grad_norm": 7.445618152618408,
"learning_rate": 4.294671442292301e-05,
"loss": 1.437,
"num_input_tokens_seen": 38118688,
"step": 70500
},
{
"epoch": 0.7103409636625581,
"grad_norm": 5.989320278167725,
"learning_rate": 4.289669041139747e-05,
"loss": 1.4525,
"num_input_tokens_seen": 38387808,
"step": 71000
},
{
"epoch": 0.7153433648151113,
"grad_norm": 6.6483283042907715,
"learning_rate": 4.284666639987194e-05,
"loss": 1.4389,
"num_input_tokens_seen": 38648568,
"step": 71500
},
{
"epoch": 0.7203457659676645,
"grad_norm": 7.410543918609619,
"learning_rate": 4.279664238834641e-05,
"loss": 1.4205,
"num_input_tokens_seen": 38924584,
"step": 72000
},
{
"epoch": 0.7253481671202177,
"grad_norm": 6.462342739105225,
"learning_rate": 4.274661837682087e-05,
"loss": 1.4286,
"num_input_tokens_seen": 39189080,
"step": 72500
},
{
"epoch": 0.7303505682727709,
"grad_norm": 6.404929161071777,
"learning_rate": 4.269659436529534e-05,
"loss": 1.4475,
"num_input_tokens_seen": 39456616,
"step": 73000
},
{
"epoch": 0.7353529694253241,
"grad_norm": 7.46643590927124,
"learning_rate": 4.264657035376981e-05,
"loss": 1.4374,
"num_input_tokens_seen": 39729608,
"step": 73500
},
{
"epoch": 0.7403553705778774,
"grad_norm": 6.626592636108398,
"learning_rate": 4.259654634224428e-05,
"loss": 1.4194,
"num_input_tokens_seen": 39998368,
"step": 74000
},
{
"epoch": 0.7453577717304306,
"grad_norm": 6.95764684677124,
"learning_rate": 4.2546522330718746e-05,
"loss": 1.4276,
"num_input_tokens_seen": 40271760,
"step": 74500
},
{
"epoch": 0.7503601728829838,
"grad_norm": 5.589132785797119,
"learning_rate": 4.249649831919321e-05,
"loss": 1.4304,
"num_input_tokens_seen": 40539392,
"step": 75000
},
{
"epoch": 0.7553625740355371,
"grad_norm": 6.854423999786377,
"learning_rate": 4.2446474307667686e-05,
"loss": 1.4259,
"num_input_tokens_seen": 40805536,
"step": 75500
},
{
"epoch": 0.7603649751880903,
"grad_norm": 7.378007888793945,
"learning_rate": 4.239645029614215e-05,
"loss": 1.4145,
"num_input_tokens_seen": 41074104,
"step": 76000
},
{
"epoch": 0.7653673763406436,
"grad_norm": 6.971609592437744,
"learning_rate": 4.234642628461662e-05,
"loss": 1.421,
"num_input_tokens_seen": 41344576,
"step": 76500
},
{
"epoch": 0.7703697774931967,
"grad_norm": 7.222036838531494,
"learning_rate": 4.229640227309108e-05,
"loss": 1.4314,
"num_input_tokens_seen": 41615376,
"step": 77000
},
{
"epoch": 0.7753721786457499,
"grad_norm": 6.561624526977539,
"learning_rate": 4.224637826156556e-05,
"loss": 1.409,
"num_input_tokens_seen": 41886272,
"step": 77500
},
{
"epoch": 0.7803745797983032,
"grad_norm": 6.4644646644592285,
"learning_rate": 4.219635425004002e-05,
"loss": 1.4101,
"num_input_tokens_seen": 42156112,
"step": 78000
},
{
"epoch": 0.7853769809508564,
"grad_norm": 6.069692611694336,
"learning_rate": 4.2146330238514485e-05,
"loss": 1.4213,
"num_input_tokens_seen": 42424448,
"step": 78500
},
{
"epoch": 0.7903793821034096,
"grad_norm": 6.701622486114502,
"learning_rate": 4.2096306226988955e-05,
"loss": 1.4204,
"num_input_tokens_seen": 42685864,
"step": 79000
},
{
"epoch": 0.7953817832559629,
"grad_norm": 8.732488632202148,
"learning_rate": 4.2046282215463425e-05,
"loss": 1.4225,
"num_input_tokens_seen": 42959264,
"step": 79500
},
{
"epoch": 0.8003841844085161,
"grad_norm": 7.264562129974365,
"learning_rate": 4.1996258203937895e-05,
"loss": 1.4044,
"num_input_tokens_seen": 43231192,
"step": 80000
},
{
"epoch": 0.8053865855610693,
"grad_norm": 7.394875526428223,
"learning_rate": 4.194623419241236e-05,
"loss": 1.4211,
"num_input_tokens_seen": 43500168,
"step": 80500
},
{
"epoch": 0.8103889867136226,
"grad_norm": 6.593264102935791,
"learning_rate": 4.189621018088683e-05,
"loss": 1.4179,
"num_input_tokens_seen": 43774064,
"step": 81000
},
{
"epoch": 0.8153913878661757,
"grad_norm": 7.966070175170898,
"learning_rate": 4.18461861693613e-05,
"loss": 1.4439,
"num_input_tokens_seen": 44046680,
"step": 81500
},
{
"epoch": 0.820393789018729,
"grad_norm": 10.988821029663086,
"learning_rate": 4.179616215783576e-05,
"loss": 1.43,
"num_input_tokens_seen": 44320912,
"step": 82000
},
{
"epoch": 0.8253961901712822,
"grad_norm": 6.874449729919434,
"learning_rate": 4.174613814631023e-05,
"loss": 1.4199,
"num_input_tokens_seen": 44593488,
"step": 82500
},
{
"epoch": 0.8303985913238354,
"grad_norm": 7.1776838302612305,
"learning_rate": 4.1696114134784694e-05,
"loss": 1.4131,
"num_input_tokens_seen": 44862096,
"step": 83000
},
{
"epoch": 0.8354009924763887,
"grad_norm": 7.381138801574707,
"learning_rate": 4.164609012325917e-05,
"loss": 1.4059,
"num_input_tokens_seen": 45130632,
"step": 83500
},
{
"epoch": 0.8404033936289419,
"grad_norm": 8.17155933380127,
"learning_rate": 4.1596066111733634e-05,
"loss": 1.4425,
"num_input_tokens_seen": 45400856,
"step": 84000
},
{
"epoch": 0.8454057947814951,
"grad_norm": 6.636998176574707,
"learning_rate": 4.1546042100208104e-05,
"loss": 1.3979,
"num_input_tokens_seen": 45671560,
"step": 84500
},
{
"epoch": 0.8504081959340484,
"grad_norm": 5.552203178405762,
"learning_rate": 4.149601808868257e-05,
"loss": 1.4255,
"num_input_tokens_seen": 45945496,
"step": 85000
},
{
"epoch": 0.8554105970866016,
"grad_norm": 7.160405158996582,
"learning_rate": 4.144599407715704e-05,
"loss": 1.4122,
"num_input_tokens_seen": 46213464,
"step": 85500
},
{
"epoch": 0.8604129982391548,
"grad_norm": 7.1668381690979,
"learning_rate": 4.139597006563151e-05,
"loss": 1.407,
"num_input_tokens_seen": 46482056,
"step": 86000
},
{
"epoch": 0.865415399391708,
"grad_norm": 6.595818996429443,
"learning_rate": 4.134594605410597e-05,
"loss": 1.4052,
"num_input_tokens_seen": 46754792,
"step": 86500
},
{
"epoch": 0.8704178005442612,
"grad_norm": 7.962093830108643,
"learning_rate": 4.129592204258044e-05,
"loss": 1.3779,
"num_input_tokens_seen": 47023888,
"step": 87000
},
{
"epoch": 0.8754202016968144,
"grad_norm": 5.4436421394348145,
"learning_rate": 4.124589803105491e-05,
"loss": 1.395,
"num_input_tokens_seen": 47294240,
"step": 87500
},
{
"epoch": 0.8804226028493677,
"grad_norm": 9.327848434448242,
"learning_rate": 4.119587401952937e-05,
"loss": 1.4015,
"num_input_tokens_seen": 47564360,
"step": 88000
},
{
"epoch": 0.8854250040019209,
"grad_norm": 5.366121768951416,
"learning_rate": 4.114585000800384e-05,
"loss": 1.4136,
"num_input_tokens_seen": 47846352,
"step": 88500
},
{
"epoch": 0.8904274051544742,
"grad_norm": 5.672398090362549,
"learning_rate": 4.109582599647831e-05,
"loss": 1.395,
"num_input_tokens_seen": 48117352,
"step": 89000
},
{
"epoch": 0.8954298063070274,
"grad_norm": 7.147487163543701,
"learning_rate": 4.104580198495278e-05,
"loss": 1.3946,
"num_input_tokens_seen": 48390832,
"step": 89500
},
{
"epoch": 0.9004322074595806,
"grad_norm": 9.567891120910645,
"learning_rate": 4.0995777973427246e-05,
"loss": 1.4111,
"num_input_tokens_seen": 48667984,
"step": 90000
},
{
"epoch": 0.9054346086121339,
"grad_norm": 7.761517524719238,
"learning_rate": 4.0945753961901716e-05,
"loss": 1.3972,
"num_input_tokens_seen": 48941240,
"step": 90500
},
{
"epoch": 0.910437009764687,
"grad_norm": 8.4068603515625,
"learning_rate": 4.089572995037618e-05,
"loss": 1.3894,
"num_input_tokens_seen": 49212696,
"step": 91000
},
{
"epoch": 0.9154394109172402,
"grad_norm": 5.621284008026123,
"learning_rate": 4.084570593885065e-05,
"loss": 1.3823,
"num_input_tokens_seen": 49481760,
"step": 91500
},
{
"epoch": 0.9204418120697935,
"grad_norm": 8.205471992492676,
"learning_rate": 4.079568192732512e-05,
"loss": 1.3886,
"num_input_tokens_seen": 49753376,
"step": 92000
},
{
"epoch": 0.9254442132223467,
"grad_norm": 8.143417358398438,
"learning_rate": 4.074565791579958e-05,
"loss": 1.384,
"num_input_tokens_seen": 50025128,
"step": 92500
},
{
"epoch": 0.9304466143749,
"grad_norm": 7.172451496124268,
"learning_rate": 4.069563390427405e-05,
"loss": 1.4011,
"num_input_tokens_seen": 50301448,
"step": 93000
},
{
"epoch": 0.9354490155274532,
"grad_norm": 7.71168851852417,
"learning_rate": 4.064560989274852e-05,
"loss": 1.3702,
"num_input_tokens_seen": 50569616,
"step": 93500
},
{
"epoch": 0.9404514166800064,
"grad_norm": 7.981653213500977,
"learning_rate": 4.059558588122299e-05,
"loss": 1.3897,
"num_input_tokens_seen": 50842808,
"step": 94000
},
{
"epoch": 0.9454538178325597,
"grad_norm": 6.760748386383057,
"learning_rate": 4.0545561869697455e-05,
"loss": 1.3878,
"num_input_tokens_seen": 51121760,
"step": 94500
},
{
"epoch": 0.9504562189851129,
"grad_norm": 7.034352779388428,
"learning_rate": 4.0495537858171925e-05,
"loss": 1.4073,
"num_input_tokens_seen": 51392648,
"step": 95000
},
{
"epoch": 0.955458620137666,
"grad_norm": 6.021711349487305,
"learning_rate": 4.0445513846646395e-05,
"loss": 1.4106,
"num_input_tokens_seen": 51657888,
"step": 95500
},
{
"epoch": 0.9604610212902193,
"grad_norm": 7.470587253570557,
"learning_rate": 4.039548983512086e-05,
"loss": 1.3982,
"num_input_tokens_seen": 51934352,
"step": 96000
},
{
"epoch": 0.9654634224427725,
"grad_norm": 6.424021244049072,
"learning_rate": 4.034546582359533e-05,
"loss": 1.3788,
"num_input_tokens_seen": 52209064,
"step": 96500
},
{
"epoch": 0.9704658235953257,
"grad_norm": 7.6357197761535645,
"learning_rate": 4.02954418120698e-05,
"loss": 1.3714,
"num_input_tokens_seen": 52474304,
"step": 97000
},
{
"epoch": 0.975468224747879,
"grad_norm": 8.156658172607422,
"learning_rate": 4.024541780054426e-05,
"loss": 1.3992,
"num_input_tokens_seen": 52747832,
"step": 97500
},
{
"epoch": 0.9804706259004322,
"grad_norm": 6.052001953125,
"learning_rate": 4.019539378901873e-05,
"loss": 1.3758,
"num_input_tokens_seen": 53024352,
"step": 98000
},
{
"epoch": 0.9854730270529855,
"grad_norm": 6.635683059692383,
"learning_rate": 4.0145369777493194e-05,
"loss": 1.3868,
"num_input_tokens_seen": 53296880,
"step": 98500
},
{
"epoch": 0.9904754282055387,
"grad_norm": 6.532413482666016,
"learning_rate": 4.009534576596767e-05,
"loss": 1.3582,
"num_input_tokens_seen": 53571800,
"step": 99000
},
{
"epoch": 0.9954778293580919,
"grad_norm": 6.029451370239258,
"learning_rate": 4.0045321754442134e-05,
"loss": 1.3772,
"num_input_tokens_seen": 53843520,
"step": 99500
},
{
"epoch": 1.0,
"eval_loss": 1.2156304121017456,
"eval_runtime": 188.3426,
"eval_samples_per_second": 1061.39,
"eval_steps_per_second": 132.678,
"num_input_tokens_seen": 54090088,
"step": 99952
},
{
"epoch": 1.000480230510645,
"grad_norm": 5.276439189910889,
"learning_rate": 3.9995297742916604e-05,
"loss": 1.2803,
"num_input_tokens_seen": 54117480,
"step": 100000
},
{
"epoch": 1.0054826316631984,
"grad_norm": 6.911218643188477,
"learning_rate": 3.994527373139107e-05,
"loss": 1.2695,
"num_input_tokens_seen": 54389512,
"step": 100500
},
{
"epoch": 1.0104850328157515,
"grad_norm": 9.460619926452637,
"learning_rate": 3.989524971986554e-05,
"loss": 1.2787,
"num_input_tokens_seen": 54665304,
"step": 101000
},
{
"epoch": 1.0154874339683049,
"grad_norm": 7.135129928588867,
"learning_rate": 3.984522570834001e-05,
"loss": 1.2616,
"num_input_tokens_seen": 54935144,
"step": 101500
},
{
"epoch": 1.020489835120858,
"grad_norm": 7.705801010131836,
"learning_rate": 3.979520169681447e-05,
"loss": 1.2673,
"num_input_tokens_seen": 55209312,
"step": 102000
},
{
"epoch": 1.0254922362734111,
"grad_norm": 7.493370532989502,
"learning_rate": 3.974517768528894e-05,
"loss": 1.2623,
"num_input_tokens_seen": 55478440,
"step": 102500
},
{
"epoch": 1.0304946374259645,
"grad_norm": 6.460716724395752,
"learning_rate": 3.969515367376341e-05,
"loss": 1.2881,
"num_input_tokens_seen": 55752896,
"step": 103000
},
{
"epoch": 1.0354970385785176,
"grad_norm": 7.391408443450928,
"learning_rate": 3.964512966223788e-05,
"loss": 1.2692,
"num_input_tokens_seen": 56028536,
"step": 103500
},
{
"epoch": 1.040499439731071,
"grad_norm": 8.04489803314209,
"learning_rate": 3.9595105650712343e-05,
"loss": 1.2582,
"num_input_tokens_seen": 56297360,
"step": 104000
},
{
"epoch": 1.045501840883624,
"grad_norm": 6.487476348876953,
"learning_rate": 3.9545081639186807e-05,
"loss": 1.2791,
"num_input_tokens_seen": 56565576,
"step": 104500
},
{
"epoch": 1.0505042420361774,
"grad_norm": 7.118215084075928,
"learning_rate": 3.949505762766128e-05,
"loss": 1.2822,
"num_input_tokens_seen": 56841624,
"step": 105000
},
{
"epoch": 1.0555066431887306,
"grad_norm": 6.419320583343506,
"learning_rate": 3.9445033616135746e-05,
"loss": 1.2533,
"num_input_tokens_seen": 57114504,
"step": 105500
},
{
"epoch": 1.060509044341284,
"grad_norm": 6.287978649139404,
"learning_rate": 3.9395009604610216e-05,
"loss": 1.2735,
"num_input_tokens_seen": 57384112,
"step": 106000
},
{
"epoch": 1.065511445493837,
"grad_norm": 6.397841930389404,
"learning_rate": 3.934498559308468e-05,
"loss": 1.2715,
"num_input_tokens_seen": 57658904,
"step": 106500
},
{
"epoch": 1.0705138466463904,
"grad_norm": 6.377140998840332,
"learning_rate": 3.929496158155915e-05,
"loss": 1.2764,
"num_input_tokens_seen": 57930592,
"step": 107000
},
{
"epoch": 1.0755162477989435,
"grad_norm": 7.9464850425720215,
"learning_rate": 3.924493757003362e-05,
"loss": 1.2818,
"num_input_tokens_seen": 58206432,
"step": 107500
},
{
"epoch": 1.0805186489514966,
"grad_norm": 5.806307792663574,
"learning_rate": 3.919491355850808e-05,
"loss": 1.2724,
"num_input_tokens_seen": 58476704,
"step": 108000
},
{
"epoch": 1.08552105010405,
"grad_norm": 7.807882308959961,
"learning_rate": 3.914488954698255e-05,
"loss": 1.2901,
"num_input_tokens_seen": 58749936,
"step": 108500
},
{
"epoch": 1.090523451256603,
"grad_norm": 6.01190185546875,
"learning_rate": 3.909486553545702e-05,
"loss": 1.2691,
"num_input_tokens_seen": 59014736,
"step": 109000
},
{
"epoch": 1.0955258524091565,
"grad_norm": 5.44499397277832,
"learning_rate": 3.904484152393149e-05,
"loss": 1.2589,
"num_input_tokens_seen": 59280912,
"step": 109500
},
{
"epoch": 1.1005282535617096,
"grad_norm": 7.433501243591309,
"learning_rate": 3.8994817512405956e-05,
"loss": 1.2686,
"num_input_tokens_seen": 59549544,
"step": 110000
},
{
"epoch": 1.105530654714263,
"grad_norm": 6.828175067901611,
"learning_rate": 3.8944793500880425e-05,
"loss": 1.2747,
"num_input_tokens_seen": 59820728,
"step": 110500
},
{
"epoch": 1.110533055866816,
"grad_norm": 7.450278282165527,
"learning_rate": 3.8894769489354895e-05,
"loss": 1.281,
"num_input_tokens_seen": 60091056,
"step": 111000
},
{
"epoch": 1.1155354570193694,
"grad_norm": 8.60688591003418,
"learning_rate": 3.884474547782936e-05,
"loss": 1.2745,
"num_input_tokens_seen": 60361616,
"step": 111500
},
{
"epoch": 1.1205378581719225,
"grad_norm": 6.874872207641602,
"learning_rate": 3.879472146630383e-05,
"loss": 1.2663,
"num_input_tokens_seen": 60628632,
"step": 112000
},
{
"epoch": 1.1255402593244757,
"grad_norm": 6.488132953643799,
"learning_rate": 3.874469745477829e-05,
"loss": 1.2733,
"num_input_tokens_seen": 60896840,
"step": 112500
},
{
"epoch": 1.130542660477029,
"grad_norm": 10.988000869750977,
"learning_rate": 3.869467344325277e-05,
"loss": 1.2686,
"num_input_tokens_seen": 61170424,
"step": 113000
},
{
"epoch": 1.1355450616295821,
"grad_norm": 6.758646011352539,
"learning_rate": 3.864464943172723e-05,
"loss": 1.2807,
"num_input_tokens_seen": 61437984,
"step": 113500
},
{
"epoch": 1.1405474627821355,
"grad_norm": 6.983493804931641,
"learning_rate": 3.8594625420201695e-05,
"loss": 1.2807,
"num_input_tokens_seen": 61710280,
"step": 114000
},
{
"epoch": 1.1455498639346886,
"grad_norm": 6.096587181091309,
"learning_rate": 3.8544601408676165e-05,
"loss": 1.2711,
"num_input_tokens_seen": 61980960,
"step": 114500
},
{
"epoch": 1.150552265087242,
"grad_norm": 6.4102373123168945,
"learning_rate": 3.8494577397150635e-05,
"loss": 1.2986,
"num_input_tokens_seen": 62254680,
"step": 115000
},
{
"epoch": 1.155554666239795,
"grad_norm": 7.1004638671875,
"learning_rate": 3.8444553385625105e-05,
"loss": 1.2837,
"num_input_tokens_seen": 62526128,
"step": 115500
},
{
"epoch": 1.1605570673923484,
"grad_norm": 6.682748794555664,
"learning_rate": 3.839452937409957e-05,
"loss": 1.2557,
"num_input_tokens_seen": 62793600,
"step": 116000
},
{
"epoch": 1.1655594685449016,
"grad_norm": 5.091439247131348,
"learning_rate": 3.834450536257404e-05,
"loss": 1.2786,
"num_input_tokens_seen": 63060392,
"step": 116500
},
{
"epoch": 1.1705618696974547,
"grad_norm": 6.379209041595459,
"learning_rate": 3.829448135104851e-05,
"loss": 1.2803,
"num_input_tokens_seen": 63324312,
"step": 117000
},
{
"epoch": 1.175564270850008,
"grad_norm": 6.799802780151367,
"learning_rate": 3.824445733952297e-05,
"loss": 1.2776,
"num_input_tokens_seen": 63596232,
"step": 117500
},
{
"epoch": 1.1805666720025612,
"grad_norm": 5.58148193359375,
"learning_rate": 3.819443332799744e-05,
"loss": 1.2875,
"num_input_tokens_seen": 63863096,
"step": 118000
},
{
"epoch": 1.1855690731551145,
"grad_norm": 6.822576999664307,
"learning_rate": 3.814440931647191e-05,
"loss": 1.2895,
"num_input_tokens_seen": 64138648,
"step": 118500
},
{
"epoch": 1.1905714743076676,
"grad_norm": 8.899248123168945,
"learning_rate": 3.809438530494638e-05,
"loss": 1.2788,
"num_input_tokens_seen": 64408856,
"step": 119000
},
{
"epoch": 1.195573875460221,
"grad_norm": 7.763192653656006,
"learning_rate": 3.8044361293420844e-05,
"loss": 1.2702,
"num_input_tokens_seen": 64679712,
"step": 119500
},
{
"epoch": 1.200576276612774,
"grad_norm": 8.2811861038208,
"learning_rate": 3.799433728189531e-05,
"loss": 1.2566,
"num_input_tokens_seen": 64945912,
"step": 120000
},
{
"epoch": 1.2055786777653275,
"grad_norm": 5.707862854003906,
"learning_rate": 3.794431327036978e-05,
"loss": 1.2667,
"num_input_tokens_seen": 65217232,
"step": 120500
},
{
"epoch": 1.2105810789178806,
"grad_norm": 6.428073406219482,
"learning_rate": 3.789428925884425e-05,
"loss": 1.2817,
"num_input_tokens_seen": 65479136,
"step": 121000
},
{
"epoch": 1.2155834800704337,
"grad_norm": 6.519428730010986,
"learning_rate": 3.784426524731872e-05,
"loss": 1.2719,
"num_input_tokens_seen": 65749968,
"step": 121500
},
{
"epoch": 1.220585881222987,
"grad_norm": 5.953312873840332,
"learning_rate": 3.779424123579318e-05,
"loss": 1.2701,
"num_input_tokens_seen": 66017680,
"step": 122000
},
{
"epoch": 1.2255882823755402,
"grad_norm": 6.453891277313232,
"learning_rate": 3.774421722426765e-05,
"loss": 1.2668,
"num_input_tokens_seen": 66288296,
"step": 122500
},
{
"epoch": 1.2305906835280935,
"grad_norm": 6.9297709465026855,
"learning_rate": 3.769419321274212e-05,
"loss": 1.2754,
"num_input_tokens_seen": 66555160,
"step": 123000
},
{
"epoch": 1.2355930846806467,
"grad_norm": 5.545460224151611,
"learning_rate": 3.764416920121658e-05,
"loss": 1.2636,
"num_input_tokens_seen": 66823296,
"step": 123500
},
{
"epoch": 1.2405954858332,
"grad_norm": 7.921981334686279,
"learning_rate": 3.759414518969105e-05,
"loss": 1.2671,
"num_input_tokens_seen": 67090360,
"step": 124000
},
{
"epoch": 1.2455978869857531,
"grad_norm": 7.033051490783691,
"learning_rate": 3.754412117816552e-05,
"loss": 1.2826,
"num_input_tokens_seen": 67363312,
"step": 124500
},
{
"epoch": 1.2506002881383065,
"grad_norm": 5.355251789093018,
"learning_rate": 3.749409716663999e-05,
"loss": 1.2712,
"num_input_tokens_seen": 67642752,
"step": 125000
},
{
"epoch": 1.2556026892908596,
"grad_norm": 7.9365081787109375,
"learning_rate": 3.7444073155114456e-05,
"loss": 1.2763,
"num_input_tokens_seen": 67905152,
"step": 125500
},
{
"epoch": 1.2606050904434127,
"grad_norm": 6.157983779907227,
"learning_rate": 3.739404914358892e-05,
"loss": 1.2615,
"num_input_tokens_seen": 68168720,
"step": 126000
},
{
"epoch": 1.265607491595966,
"grad_norm": 5.456648349761963,
"learning_rate": 3.7344025132063396e-05,
"loss": 1.2642,
"num_input_tokens_seen": 68433784,
"step": 126500
},
{
"epoch": 1.2706098927485192,
"grad_norm": 7.156668663024902,
"learning_rate": 3.729400112053786e-05,
"loss": 1.2866,
"num_input_tokens_seen": 68705336,
"step": 127000
},
{
"epoch": 1.2756122939010726,
"grad_norm": 6.959549903869629,
"learning_rate": 3.724397710901233e-05,
"loss": 1.2733,
"num_input_tokens_seen": 68968320,
"step": 127500
},
{
"epoch": 1.2806146950536257,
"grad_norm": 6.225592613220215,
"learning_rate": 3.719395309748679e-05,
"loss": 1.2684,
"num_input_tokens_seen": 69244592,
"step": 128000
},
{
"epoch": 1.285617096206179,
"grad_norm": 7.163039684295654,
"learning_rate": 3.714392908596127e-05,
"loss": 1.2823,
"num_input_tokens_seen": 69518384,
"step": 128500
},
{
"epoch": 1.2906194973587322,
"grad_norm": 5.474428176879883,
"learning_rate": 3.709390507443573e-05,
"loss": 1.2684,
"num_input_tokens_seen": 69789560,
"step": 129000
},
{
"epoch": 1.2956218985112855,
"grad_norm": 6.292562961578369,
"learning_rate": 3.7043881062910195e-05,
"loss": 1.2697,
"num_input_tokens_seen": 70058688,
"step": 129500
},
{
"epoch": 1.3006242996638386,
"grad_norm": 5.789345741271973,
"learning_rate": 3.6993857051384665e-05,
"loss": 1.2652,
"num_input_tokens_seen": 70330632,
"step": 130000
},
{
"epoch": 1.3056267008163918,
"grad_norm": 7.9446821212768555,
"learning_rate": 3.6943833039859135e-05,
"loss": 1.2928,
"num_input_tokens_seen": 70594088,
"step": 130500
},
{
"epoch": 1.310629101968945,
"grad_norm": 9.38175106048584,
"learning_rate": 3.6893809028333605e-05,
"loss": 1.2636,
"num_input_tokens_seen": 70858664,
"step": 131000
},
{
"epoch": 1.3156315031214985,
"grad_norm": 7.6178812980651855,
"learning_rate": 3.684378501680807e-05,
"loss": 1.2767,
"num_input_tokens_seen": 71132936,
"step": 131500
},
{
"epoch": 1.3206339042740516,
"grad_norm": 7.7378435134887695,
"learning_rate": 3.679376100528254e-05,
"loss": 1.2752,
"num_input_tokens_seen": 71398296,
"step": 132000
},
{
"epoch": 1.3256363054266047,
"grad_norm": 7.162954807281494,
"learning_rate": 3.674373699375701e-05,
"loss": 1.2534,
"num_input_tokens_seen": 71673192,
"step": 132500
},
{
"epoch": 1.330638706579158,
"grad_norm": 6.323235511779785,
"learning_rate": 3.669371298223147e-05,
"loss": 1.265,
"num_input_tokens_seen": 71949960,
"step": 133000
},
{
"epoch": 1.3356411077317112,
"grad_norm": 6.324786186218262,
"learning_rate": 3.664368897070594e-05,
"loss": 1.2661,
"num_input_tokens_seen": 72217256,
"step": 133500
},
{
"epoch": 1.3406435088842645,
"grad_norm": 7.326359748840332,
"learning_rate": 3.6593664959180404e-05,
"loss": 1.244,
"num_input_tokens_seen": 72480696,
"step": 134000
},
{
"epoch": 1.3456459100368177,
"grad_norm": 7.224339962005615,
"learning_rate": 3.654364094765488e-05,
"loss": 1.2677,
"num_input_tokens_seen": 72747888,
"step": 134500
},
{
"epoch": 1.3506483111893708,
"grad_norm": 6.521255970001221,
"learning_rate": 3.6493616936129344e-05,
"loss": 1.26,
"num_input_tokens_seen": 73016624,
"step": 135000
},
{
"epoch": 1.3556507123419241,
"grad_norm": 7.485130786895752,
"learning_rate": 3.644359292460381e-05,
"loss": 1.2703,
"num_input_tokens_seen": 73291936,
"step": 135500
},
{
"epoch": 1.3606531134944775,
"grad_norm": 7.1711626052856445,
"learning_rate": 3.639356891307828e-05,
"loss": 1.2798,
"num_input_tokens_seen": 73557640,
"step": 136000
},
{
"epoch": 1.3656555146470306,
"grad_norm": 6.981902599334717,
"learning_rate": 3.634354490155275e-05,
"loss": 1.2485,
"num_input_tokens_seen": 73827312,
"step": 136500
},
{
"epoch": 1.3706579157995837,
"grad_norm": 6.2199320793151855,
"learning_rate": 3.629352089002722e-05,
"loss": 1.2587,
"num_input_tokens_seen": 74098048,
"step": 137000
},
{
"epoch": 1.375660316952137,
"grad_norm": 6.726940155029297,
"learning_rate": 3.624349687850168e-05,
"loss": 1.2753,
"num_input_tokens_seen": 74365640,
"step": 137500
},
{
"epoch": 1.3806627181046902,
"grad_norm": 5.759517669677734,
"learning_rate": 3.619347286697615e-05,
"loss": 1.2549,
"num_input_tokens_seen": 74632424,
"step": 138000
},
{
"epoch": 1.3856651192572436,
"grad_norm": 6.594145774841309,
"learning_rate": 3.614344885545062e-05,
"loss": 1.2601,
"num_input_tokens_seen": 74906848,
"step": 138500
},
{
"epoch": 1.3906675204097967,
"grad_norm": 6.375233173370361,
"learning_rate": 3.609342484392508e-05,
"loss": 1.2682,
"num_input_tokens_seen": 75176912,
"step": 139000
},
{
"epoch": 1.3956699215623498,
"grad_norm": 6.785195827484131,
"learning_rate": 3.604340083239955e-05,
"loss": 1.2602,
"num_input_tokens_seen": 75447008,
"step": 139500
},
{
"epoch": 1.4006723227149032,
"grad_norm": 6.8394694328308105,
"learning_rate": 3.599337682087402e-05,
"loss": 1.2432,
"num_input_tokens_seen": 75720552,
"step": 140000
},
{
"epoch": 1.4056747238674565,
"grad_norm": 6.43784236907959,
"learning_rate": 3.594335280934849e-05,
"loss": 1.2576,
"num_input_tokens_seen": 75991000,
"step": 140500
},
{
"epoch": 1.4106771250200096,
"grad_norm": 6.5971479415893555,
"learning_rate": 3.5893328797822956e-05,
"loss": 1.2625,
"num_input_tokens_seen": 76262432,
"step": 141000
},
{
"epoch": 1.4156795261725628,
"grad_norm": 5.662843227386475,
"learning_rate": 3.5843304786297426e-05,
"loss": 1.2822,
"num_input_tokens_seen": 76537768,
"step": 141500
},
{
"epoch": 1.4206819273251161,
"grad_norm": 5.13416862487793,
"learning_rate": 3.579328077477189e-05,
"loss": 1.2639,
"num_input_tokens_seen": 76807000,
"step": 142000
},
{
"epoch": 1.4256843284776692,
"grad_norm": 6.224869728088379,
"learning_rate": 3.574325676324636e-05,
"loss": 1.2633,
"num_input_tokens_seen": 77077208,
"step": 142500
},
{
"epoch": 1.4306867296302226,
"grad_norm": 5.980476379394531,
"learning_rate": 3.569323275172083e-05,
"loss": 1.254,
"num_input_tokens_seen": 77348800,
"step": 143000
},
{
"epoch": 1.4356891307827757,
"grad_norm": 5.705311298370361,
"learning_rate": 3.564320874019529e-05,
"loss": 1.2641,
"num_input_tokens_seen": 77623192,
"step": 143500
},
{
"epoch": 1.4406915319353288,
"grad_norm": 5.703660488128662,
"learning_rate": 3.559318472866976e-05,
"loss": 1.2509,
"num_input_tokens_seen": 77892080,
"step": 144000
},
{
"epoch": 1.4456939330878822,
"grad_norm": 6.834238052368164,
"learning_rate": 3.554316071714423e-05,
"loss": 1.2455,
"num_input_tokens_seen": 78161320,
"step": 144500
},
{
"epoch": 1.4506963342404355,
"grad_norm": 5.70477294921875,
"learning_rate": 3.54931367056187e-05,
"loss": 1.2534,
"num_input_tokens_seen": 78426328,
"step": 145000
},
{
"epoch": 1.4556987353929887,
"grad_norm": 7.84694766998291,
"learning_rate": 3.5443112694093165e-05,
"loss": 1.2696,
"num_input_tokens_seen": 78700184,
"step": 145500
},
{
"epoch": 1.4607011365455418,
"grad_norm": 6.4869914054870605,
"learning_rate": 3.5393088682567635e-05,
"loss": 1.2548,
"num_input_tokens_seen": 78968056,
"step": 146000
},
{
"epoch": 1.4657035376980951,
"grad_norm": 7.2102251052856445,
"learning_rate": 3.5343064671042105e-05,
"loss": 1.2643,
"num_input_tokens_seen": 79236952,
"step": 146500
},
{
"epoch": 1.4707059388506483,
"grad_norm": 8.560874938964844,
"learning_rate": 3.529304065951657e-05,
"loss": 1.2674,
"num_input_tokens_seen": 79508800,
"step": 147000
},
{
"epoch": 1.4757083400032016,
"grad_norm": 6.250794410705566,
"learning_rate": 3.524301664799104e-05,
"loss": 1.2529,
"num_input_tokens_seen": 79781928,
"step": 147500
},
{
"epoch": 1.4807107411557547,
"grad_norm": 5.825743198394775,
"learning_rate": 3.519299263646551e-05,
"loss": 1.25,
"num_input_tokens_seen": 80044592,
"step": 148000
},
{
"epoch": 1.4857131423083079,
"grad_norm": 8.07386589050293,
"learning_rate": 3.514296862493997e-05,
"loss": 1.2546,
"num_input_tokens_seen": 80312648,
"step": 148500
},
{
"epoch": 1.4907155434608612,
"grad_norm": 6.903604984283447,
"learning_rate": 3.509294461341444e-05,
"loss": 1.2465,
"num_input_tokens_seen": 80580360,
"step": 149000
},
{
"epoch": 1.4957179446134146,
"grad_norm": 7.45670223236084,
"learning_rate": 3.5042920601888904e-05,
"loss": 1.2612,
"num_input_tokens_seen": 80854928,
"step": 149500
},
{
"epoch": 1.5007203457659677,
"grad_norm": 7.703638553619385,
"learning_rate": 3.4992896590363374e-05,
"loss": 1.2494,
"num_input_tokens_seen": 81122392,
"step": 150000
},
{
"epoch": 1.5057227469185208,
"grad_norm": 7.255218982696533,
"learning_rate": 3.4942872578837844e-05,
"loss": 1.2549,
"num_input_tokens_seen": 81393152,
"step": 150500
},
{
"epoch": 1.5107251480710742,
"grad_norm": 6.001245498657227,
"learning_rate": 3.4892848567312314e-05,
"loss": 1.2565,
"num_input_tokens_seen": 81663000,
"step": 151000
},
{
"epoch": 1.5157275492236273,
"grad_norm": 8.100776672363281,
"learning_rate": 3.484282455578678e-05,
"loss": 1.2662,
"num_input_tokens_seen": 81935248,
"step": 151500
},
{
"epoch": 1.5207299503761806,
"grad_norm": 7.566408157348633,
"learning_rate": 3.479280054426125e-05,
"loss": 1.2521,
"num_input_tokens_seen": 82198824,
"step": 152000
},
{
"epoch": 1.5257323515287338,
"grad_norm": 6.650607109069824,
"learning_rate": 3.474277653273572e-05,
"loss": 1.2516,
"num_input_tokens_seen": 82465776,
"step": 152500
},
{
"epoch": 1.530734752681287,
"grad_norm": 6.24419641494751,
"learning_rate": 3.469275252121018e-05,
"loss": 1.2504,
"num_input_tokens_seen": 82741480,
"step": 153000
},
{
"epoch": 1.5357371538338402,
"grad_norm": 5.1919403076171875,
"learning_rate": 3.464272850968465e-05,
"loss": 1.2403,
"num_input_tokens_seen": 83010176,
"step": 153500
},
{
"epoch": 1.5407395549863936,
"grad_norm": 7.3934407234191895,
"learning_rate": 3.459270449815912e-05,
"loss": 1.2455,
"num_input_tokens_seen": 83279712,
"step": 154000
},
{
"epoch": 1.5457419561389467,
"grad_norm": 5.885237693786621,
"learning_rate": 3.454268048663359e-05,
"loss": 1.2667,
"num_input_tokens_seen": 83546624,
"step": 154500
},
{
"epoch": 1.5507443572914998,
"grad_norm": 6.22340726852417,
"learning_rate": 3.449265647510805e-05,
"loss": 1.2341,
"num_input_tokens_seen": 83814832,
"step": 155000
},
{
"epoch": 1.5557467584440532,
"grad_norm": 7.060276508331299,
"learning_rate": 3.4442632463582516e-05,
"loss": 1.2419,
"num_input_tokens_seen": 84086544,
"step": 155500
},
{
"epoch": 1.5607491595966063,
"grad_norm": 6.495555400848389,
"learning_rate": 3.439260845205699e-05,
"loss": 1.2552,
"num_input_tokens_seen": 84352576,
"step": 156000
},
{
"epoch": 1.5657515607491597,
"grad_norm": 7.454058647155762,
"learning_rate": 3.4342584440531456e-05,
"loss": 1.2469,
"num_input_tokens_seen": 84619200,
"step": 156500
},
{
"epoch": 1.5707539619017128,
"grad_norm": 6.108017444610596,
"learning_rate": 3.4292560429005926e-05,
"loss": 1.2358,
"num_input_tokens_seen": 84891184,
"step": 157000
},
{
"epoch": 1.575756363054266,
"grad_norm": 9.97182559967041,
"learning_rate": 3.424253641748039e-05,
"loss": 1.2482,
"num_input_tokens_seen": 85164008,
"step": 157500
},
{
"epoch": 1.5807587642068193,
"grad_norm": 7.4442877769470215,
"learning_rate": 3.419251240595486e-05,
"loss": 1.2444,
"num_input_tokens_seen": 85431664,
"step": 158000
},
{
"epoch": 1.5857611653593726,
"grad_norm": 5.728388786315918,
"learning_rate": 3.414248839442933e-05,
"loss": 1.2496,
"num_input_tokens_seen": 85706112,
"step": 158500
},
{
"epoch": 1.5907635665119257,
"grad_norm": 5.5090861320495605,
"learning_rate": 3.409246438290379e-05,
"loss": 1.2468,
"num_input_tokens_seen": 85979600,
"step": 159000
},
{
"epoch": 1.5957659676644789,
"grad_norm": 7.548877716064453,
"learning_rate": 3.404244037137826e-05,
"loss": 1.2483,
"num_input_tokens_seen": 86249384,
"step": 159500
},
{
"epoch": 1.6007683688170322,
"grad_norm": 6.700185775756836,
"learning_rate": 3.399241635985273e-05,
"loss": 1.2493,
"num_input_tokens_seen": 86511120,
"step": 160000
},
{
"epoch": 1.6057707699695856,
"grad_norm": 6.892191410064697,
"learning_rate": 3.39423923483272e-05,
"loss": 1.2528,
"num_input_tokens_seen": 86784872,
"step": 160500
},
{
"epoch": 1.6107731711221387,
"grad_norm": 5.970468521118164,
"learning_rate": 3.3892368336801665e-05,
"loss": 1.2309,
"num_input_tokens_seen": 87058000,
"step": 161000
},
{
"epoch": 1.6157755722746918,
"grad_norm": 6.773517608642578,
"learning_rate": 3.384234432527613e-05,
"loss": 1.2449,
"num_input_tokens_seen": 87331744,
"step": 161500
},
{
"epoch": 1.620777973427245,
"grad_norm": 6.08986234664917,
"learning_rate": 3.3792320313750605e-05,
"loss": 1.2518,
"num_input_tokens_seen": 87604792,
"step": 162000
},
{
"epoch": 1.6257803745797983,
"grad_norm": 6.549533843994141,
"learning_rate": 3.374229630222507e-05,
"loss": 1.242,
"num_input_tokens_seen": 87871576,
"step": 162500
},
{
"epoch": 1.6307827757323516,
"grad_norm": 5.974827289581299,
"learning_rate": 3.369227229069954e-05,
"loss": 1.2347,
"num_input_tokens_seen": 88137032,
"step": 163000
},
{
"epoch": 1.6357851768849048,
"grad_norm": 6.639895915985107,
"learning_rate": 3.3642248279174e-05,
"loss": 1.2241,
"num_input_tokens_seen": 88403456,
"step": 163500
},
{
"epoch": 1.640787578037458,
"grad_norm": 8.600828170776367,
"learning_rate": 3.359222426764848e-05,
"loss": 1.2496,
"num_input_tokens_seen": 88676440,
"step": 164000
},
{
"epoch": 1.6457899791900112,
"grad_norm": 6.850368976593018,
"learning_rate": 3.354220025612294e-05,
"loss": 1.232,
"num_input_tokens_seen": 88947256,
"step": 164500
},
{
"epoch": 1.6507923803425646,
"grad_norm": 7.311619281768799,
"learning_rate": 3.3492176244597405e-05,
"loss": 1.2497,
"num_input_tokens_seen": 89215568,
"step": 165000
},
{
"epoch": 1.6557947814951177,
"grad_norm": 6.298097610473633,
"learning_rate": 3.3442152233071875e-05,
"loss": 1.2482,
"num_input_tokens_seen": 89487600,
"step": 165500
},
{
"epoch": 1.6607971826476708,
"grad_norm": 6.948307514190674,
"learning_rate": 3.3392128221546345e-05,
"loss": 1.2274,
"num_input_tokens_seen": 89760496,
"step": 166000
},
{
"epoch": 1.665799583800224,
"grad_norm": 6.305945873260498,
"learning_rate": 3.3342104210020814e-05,
"loss": 1.2462,
"num_input_tokens_seen": 90033976,
"step": 166500
},
{
"epoch": 1.6708019849527773,
"grad_norm": 6.428753852844238,
"learning_rate": 3.329208019849528e-05,
"loss": 1.2159,
"num_input_tokens_seen": 90309992,
"step": 167000
},
{
"epoch": 1.6758043861053307,
"grad_norm": 6.597380638122559,
"learning_rate": 3.324205618696975e-05,
"loss": 1.224,
"num_input_tokens_seen": 90577224,
"step": 167500
},
{
"epoch": 1.6808067872578838,
"grad_norm": 6.567870140075684,
"learning_rate": 3.319203217544422e-05,
"loss": 1.2214,
"num_input_tokens_seen": 90853608,
"step": 168000
},
{
"epoch": 1.685809188410437,
"grad_norm": 6.079522609710693,
"learning_rate": 3.314200816391868e-05,
"loss": 1.2465,
"num_input_tokens_seen": 91122360,
"step": 168500
},
{
"epoch": 1.6908115895629903,
"grad_norm": 5.641016006469727,
"learning_rate": 3.309198415239315e-05,
"loss": 1.2456,
"num_input_tokens_seen": 91395744,
"step": 169000
},
{
"epoch": 1.6958139907155436,
"grad_norm": 6.620981216430664,
"learning_rate": 3.304196014086762e-05,
"loss": 1.2232,
"num_input_tokens_seen": 91664688,
"step": 169500
},
{
"epoch": 1.7008163918680967,
"grad_norm": 5.642283916473389,
"learning_rate": 3.299193612934209e-05,
"loss": 1.2548,
"num_input_tokens_seen": 91937056,
"step": 170000
},
{
"epoch": 1.7058187930206499,
"grad_norm": 8.295174598693848,
"learning_rate": 3.2941912117816554e-05,
"loss": 1.2394,
"num_input_tokens_seen": 92211072,
"step": 170500
},
{
"epoch": 1.710821194173203,
"grad_norm": 6.525012493133545,
"learning_rate": 3.289188810629102e-05,
"loss": 1.2371,
"num_input_tokens_seen": 92479560,
"step": 171000
},
{
"epoch": 1.7158235953257563,
"grad_norm": 5.822702884674072,
"learning_rate": 3.284186409476549e-05,
"loss": 1.2512,
"num_input_tokens_seen": 92751592,
"step": 171500
},
{
"epoch": 1.7208259964783097,
"grad_norm": 7.12557315826416,
"learning_rate": 3.279184008323996e-05,
"loss": 1.2252,
"num_input_tokens_seen": 93027888,
"step": 172000
},
{
"epoch": 1.7258283976308628,
"grad_norm": 6.948513984680176,
"learning_rate": 3.2741816071714427e-05,
"loss": 1.2273,
"num_input_tokens_seen": 93305496,
"step": 172500
},
{
"epoch": 1.730830798783416,
"grad_norm": 6.272098064422607,
"learning_rate": 3.269179206018889e-05,
"loss": 1.2412,
"num_input_tokens_seen": 93571272,
"step": 173000
},
{
"epoch": 1.7358331999359693,
"grad_norm": 6.048664569854736,
"learning_rate": 3.264176804866336e-05,
"loss": 1.2192,
"num_input_tokens_seen": 93844768,
"step": 173500
},
{
"epoch": 1.7408356010885226,
"grad_norm": 7.0680999755859375,
"learning_rate": 3.259174403713783e-05,
"loss": 1.2389,
"num_input_tokens_seen": 94117208,
"step": 174000
},
{
"epoch": 1.7458380022410758,
"grad_norm": 4.265655517578125,
"learning_rate": 3.254172002561229e-05,
"loss": 1.2285,
"num_input_tokens_seen": 94388984,
"step": 174500
},
{
"epoch": 1.750840403393629,
"grad_norm": 5.9715118408203125,
"learning_rate": 3.249169601408676e-05,
"loss": 1.2427,
"num_input_tokens_seen": 94655112,
"step": 175000
},
{
"epoch": 1.755842804546182,
"grad_norm": 6.257503509521484,
"learning_rate": 3.244167200256123e-05,
"loss": 1.2361,
"num_input_tokens_seen": 94924496,
"step": 175500
},
{
"epoch": 1.7608452056987354,
"grad_norm": 8.316187858581543,
"learning_rate": 3.23916479910357e-05,
"loss": 1.2283,
"num_input_tokens_seen": 95192608,
"step": 176000
},
{
"epoch": 1.7658476068512887,
"grad_norm": 6.69648551940918,
"learning_rate": 3.2341623979510166e-05,
"loss": 1.2364,
"num_input_tokens_seen": 95459872,
"step": 176500
},
{
"epoch": 1.7708500080038418,
"grad_norm": 7.617880821228027,
"learning_rate": 3.229159996798463e-05,
"loss": 1.2265,
"num_input_tokens_seen": 95729504,
"step": 177000
},
{
"epoch": 1.775852409156395,
"grad_norm": 7.258569240570068,
"learning_rate": 3.2241575956459106e-05,
"loss": 1.235,
"num_input_tokens_seen": 95996688,
"step": 177500
},
{
"epoch": 1.7808548103089483,
"grad_norm": 5.590980052947998,
"learning_rate": 3.219155194493357e-05,
"loss": 1.2335,
"num_input_tokens_seen": 96263440,
"step": 178000
},
{
"epoch": 1.7858572114615017,
"grad_norm": 5.80760383605957,
"learning_rate": 3.214152793340804e-05,
"loss": 1.2179,
"num_input_tokens_seen": 96539352,
"step": 178500
},
{
"epoch": 1.7908596126140548,
"grad_norm": 5.532135486602783,
"learning_rate": 3.20915039218825e-05,
"loss": 1.1994,
"num_input_tokens_seen": 96805416,
"step": 179000
},
{
"epoch": 1.795862013766608,
"grad_norm": 5.589640140533447,
"learning_rate": 3.204147991035697e-05,
"loss": 1.2264,
"num_input_tokens_seen": 97073464,
"step": 179500
},
{
"epoch": 1.800864414919161,
"grad_norm": 6.577908039093018,
"learning_rate": 3.199145589883144e-05,
"loss": 1.23,
"num_input_tokens_seen": 97347808,
"step": 180000
},
{
"epoch": 1.8058668160717144,
"grad_norm": 6.8848724365234375,
"learning_rate": 3.1941431887305905e-05,
"loss": 1.2167,
"num_input_tokens_seen": 97615696,
"step": 180500
},
{
"epoch": 1.8108692172242677,
"grad_norm": 6.463140964508057,
"learning_rate": 3.1891407875780375e-05,
"loss": 1.2189,
"num_input_tokens_seen": 97886312,
"step": 181000
},
{
"epoch": 1.8158716183768209,
"grad_norm": 8.028167724609375,
"learning_rate": 3.1841383864254845e-05,
"loss": 1.2384,
"num_input_tokens_seen": 98151768,
"step": 181500
},
{
"epoch": 1.820874019529374,
"grad_norm": 7.106721878051758,
"learning_rate": 3.1791359852729315e-05,
"loss": 1.2264,
"num_input_tokens_seen": 98419192,
"step": 182000
},
{
"epoch": 1.8258764206819273,
"grad_norm": 5.771492004394531,
"learning_rate": 3.174133584120378e-05,
"loss": 1.2421,
"num_input_tokens_seen": 98695368,
"step": 182500
},
{
"epoch": 1.8308788218344807,
"grad_norm": 5.563631534576416,
"learning_rate": 3.169131182967825e-05,
"loss": 1.2325,
"num_input_tokens_seen": 98974312,
"step": 183000
},
{
"epoch": 1.8358812229870338,
"grad_norm": 7.004051208496094,
"learning_rate": 3.164128781815272e-05,
"loss": 1.2107,
"num_input_tokens_seen": 99244240,
"step": 183500
},
{
"epoch": 1.840883624139587,
"grad_norm": 6.410153865814209,
"learning_rate": 3.159126380662718e-05,
"loss": 1.2251,
"num_input_tokens_seen": 99515024,
"step": 184000
},
{
"epoch": 1.84588602529214,
"grad_norm": 4.9155354499816895,
"learning_rate": 3.154123979510165e-05,
"loss": 1.2192,
"num_input_tokens_seen": 99785496,
"step": 184500
},
{
"epoch": 1.8508884264446934,
"grad_norm": 7.882739067077637,
"learning_rate": 3.1491215783576114e-05,
"loss": 1.2405,
"num_input_tokens_seen": 100061088,
"step": 185000
},
{
"epoch": 1.8558908275972468,
"grad_norm": 5.929235935211182,
"learning_rate": 3.144119177205059e-05,
"loss": 1.2171,
"num_input_tokens_seen": 100330216,
"step": 185500
},
{
"epoch": 1.8608932287498,
"grad_norm": 5.840740203857422,
"learning_rate": 3.1391167760525054e-05,
"loss": 1.2093,
"num_input_tokens_seen": 100606080,
"step": 186000
},
{
"epoch": 1.865895629902353,
"grad_norm": 6.414222717285156,
"learning_rate": 3.134114374899952e-05,
"loss": 1.2154,
"num_input_tokens_seen": 100872752,
"step": 186500
},
{
"epoch": 1.8708980310549064,
"grad_norm": 7.030595779418945,
"learning_rate": 3.129111973747399e-05,
"loss": 1.2209,
"num_input_tokens_seen": 101144088,
"step": 187000
},
{
"epoch": 1.8759004322074597,
"grad_norm": 5.81058406829834,
"learning_rate": 3.124109572594846e-05,
"loss": 1.2068,
"num_input_tokens_seen": 101414120,
"step": 187500
},
{
"epoch": 1.8809028333600128,
"grad_norm": 5.672033309936523,
"learning_rate": 3.119107171442293e-05,
"loss": 1.2305,
"num_input_tokens_seen": 101679104,
"step": 188000
},
{
"epoch": 1.885905234512566,
"grad_norm": 6.247150421142578,
"learning_rate": 3.114104770289739e-05,
"loss": 1.2281,
"num_input_tokens_seen": 101950272,
"step": 188500
},
{
"epoch": 1.890907635665119,
"grad_norm": 6.070692539215088,
"learning_rate": 3.109102369137186e-05,
"loss": 1.2277,
"num_input_tokens_seen": 102222744,
"step": 189000
},
{
"epoch": 1.8959100368176725,
"grad_norm": 7.217655181884766,
"learning_rate": 3.104099967984633e-05,
"loss": 1.2227,
"num_input_tokens_seen": 102498072,
"step": 189500
},
{
"epoch": 1.9009124379702258,
"grad_norm": 6.292141914367676,
"learning_rate": 3.099097566832079e-05,
"loss": 1.226,
"num_input_tokens_seen": 102771184,
"step": 190000
},
{
"epoch": 1.905914839122779,
"grad_norm": 6.4393534660339355,
"learning_rate": 3.094095165679526e-05,
"loss": 1.2238,
"num_input_tokens_seen": 103041800,
"step": 190500
},
{
"epoch": 1.910917240275332,
"grad_norm": 6.367134094238281,
"learning_rate": 3.0890927645269726e-05,
"loss": 1.2277,
"num_input_tokens_seen": 103315416,
"step": 191000
},
{
"epoch": 1.9159196414278854,
"grad_norm": 5.803537368774414,
"learning_rate": 3.08409036337442e-05,
"loss": 1.2101,
"num_input_tokens_seen": 103584680,
"step": 191500
},
{
"epoch": 1.9209220425804387,
"grad_norm": 5.529000282287598,
"learning_rate": 3.0790879622218666e-05,
"loss": 1.2252,
"num_input_tokens_seen": 103854296,
"step": 192000
},
{
"epoch": 1.9259244437329919,
"grad_norm": 6.204425811767578,
"learning_rate": 3.0740855610693136e-05,
"loss": 1.2234,
"num_input_tokens_seen": 104127296,
"step": 192500
},
{
"epoch": 1.930926844885545,
"grad_norm": 6.076712131500244,
"learning_rate": 3.06908315991676e-05,
"loss": 1.2245,
"num_input_tokens_seen": 104402160,
"step": 193000
},
{
"epoch": 1.9359292460380981,
"grad_norm": 5.718363285064697,
"learning_rate": 3.064080758764207e-05,
"loss": 1.2288,
"num_input_tokens_seen": 104669408,
"step": 193500
},
{
"epoch": 1.9409316471906515,
"grad_norm": 5.174673080444336,
"learning_rate": 3.059078357611654e-05,
"loss": 1.2276,
"num_input_tokens_seen": 104944256,
"step": 194000
},
{
"epoch": 1.9459340483432048,
"grad_norm": 6.684966564178467,
"learning_rate": 3.0540759564591e-05,
"loss": 1.2341,
"num_input_tokens_seen": 105217600,
"step": 194500
},
{
"epoch": 1.950936449495758,
"grad_norm": 6.3069562911987305,
"learning_rate": 3.0490735553065475e-05,
"loss": 1.211,
"num_input_tokens_seen": 105491832,
"step": 195000
},
{
"epoch": 1.955938850648311,
"grad_norm": 8.71688461303711,
"learning_rate": 3.044071154153994e-05,
"loss": 1.231,
"num_input_tokens_seen": 105764832,
"step": 195500
},
{
"epoch": 1.9609412518008644,
"grad_norm": 8.65140438079834,
"learning_rate": 3.0390687530014405e-05,
"loss": 1.2142,
"num_input_tokens_seen": 106034576,
"step": 196000
},
{
"epoch": 1.9659436529534178,
"grad_norm": 5.5850725173950195,
"learning_rate": 3.0340663518488875e-05,
"loss": 1.2254,
"num_input_tokens_seen": 106305056,
"step": 196500
},
{
"epoch": 1.970946054105971,
"grad_norm": 6.236534118652344,
"learning_rate": 3.029063950696334e-05,
"loss": 1.2121,
"num_input_tokens_seen": 106574696,
"step": 197000
},
{
"epoch": 1.975948455258524,
"grad_norm": 6.221134185791016,
"learning_rate": 3.024061549543781e-05,
"loss": 1.2031,
"num_input_tokens_seen": 106850384,
"step": 197500
},
{
"epoch": 1.9809508564110772,
"grad_norm": 7.421369552612305,
"learning_rate": 3.0190591483912278e-05,
"loss": 1.2214,
"num_input_tokens_seen": 107126816,
"step": 198000
},
{
"epoch": 1.9859532575636305,
"grad_norm": 6.951572418212891,
"learning_rate": 3.0140567472386748e-05,
"loss": 1.2052,
"num_input_tokens_seen": 107392688,
"step": 198500
},
{
"epoch": 1.9909556587161839,
"grad_norm": 7.400991439819336,
"learning_rate": 3.0090543460861215e-05,
"loss": 1.2091,
"num_input_tokens_seen": 107668928,
"step": 199000
},
{
"epoch": 1.995958059868737,
"grad_norm": 6.747934818267822,
"learning_rate": 3.004051944933568e-05,
"loss": 1.2001,
"num_input_tokens_seen": 107939312,
"step": 199500
},
{
"epoch": 2.0,
"eval_loss": 1.1146966218948364,
"eval_runtime": 186.1963,
"eval_samples_per_second": 1073.625,
"eval_steps_per_second": 134.208,
"num_input_tokens_seen": 108157960,
"step": 199904
},
{
"epoch": 2.00096046102129,
"grad_norm": 5.809133052825928,
"learning_rate": 2.999049543781015e-05,
"loss": 1.21,
"num_input_tokens_seen": 108209480,
"step": 200000
},
{
"epoch": 2.0059628621738432,
"grad_norm": 7.035338401794434,
"learning_rate": 2.9940471426284618e-05,
"loss": 1.0955,
"num_input_tokens_seen": 108480360,
"step": 200500
},
{
"epoch": 2.010965263326397,
"grad_norm": 8.504626274108887,
"learning_rate": 2.9890447414759088e-05,
"loss": 1.1085,
"num_input_tokens_seen": 108745072,
"step": 201000
},
{
"epoch": 2.01596766447895,
"grad_norm": 7.347136497497559,
"learning_rate": 2.9840423403233554e-05,
"loss": 1.0983,
"num_input_tokens_seen": 109010096,
"step": 201500
},
{
"epoch": 2.020970065631503,
"grad_norm": 5.3891215324401855,
"learning_rate": 2.9790399391708024e-05,
"loss": 1.0941,
"num_input_tokens_seen": 109280096,
"step": 202000
},
{
"epoch": 2.025972466784056,
"grad_norm": 5.807075023651123,
"learning_rate": 2.974037538018249e-05,
"loss": 1.1066,
"num_input_tokens_seen": 109550512,
"step": 202500
},
{
"epoch": 2.0309748679366098,
"grad_norm": 7.344318866729736,
"learning_rate": 2.9690351368656954e-05,
"loss": 1.1149,
"num_input_tokens_seen": 109827584,
"step": 203000
},
{
"epoch": 2.035977269089163,
"grad_norm": 4.556800842285156,
"learning_rate": 2.9640327357131427e-05,
"loss": 1.1137,
"num_input_tokens_seen": 110094320,
"step": 203500
},
{
"epoch": 2.040979670241716,
"grad_norm": 6.238656044006348,
"learning_rate": 2.959030334560589e-05,
"loss": 1.1174,
"num_input_tokens_seen": 110369888,
"step": 204000
},
{
"epoch": 2.045982071394269,
"grad_norm": 6.009298801422119,
"learning_rate": 2.954027933408036e-05,
"loss": 1.0978,
"num_input_tokens_seen": 110638112,
"step": 204500
},
{
"epoch": 2.0509844725468223,
"grad_norm": 5.883254051208496,
"learning_rate": 2.9490255322554827e-05,
"loss": 1.1127,
"num_input_tokens_seen": 110907728,
"step": 205000
},
{
"epoch": 2.055986873699376,
"grad_norm": 5.4123125076293945,
"learning_rate": 2.9440231311029297e-05,
"loss": 1.116,
"num_input_tokens_seen": 111174608,
"step": 205500
},
{
"epoch": 2.060989274851929,
"grad_norm": 6.456712245941162,
"learning_rate": 2.9390207299503763e-05,
"loss": 1.1306,
"num_input_tokens_seen": 111443896,
"step": 206000
},
{
"epoch": 2.065991676004482,
"grad_norm": 7.134698390960693,
"learning_rate": 2.934018328797823e-05,
"loss": 1.1149,
"num_input_tokens_seen": 111722672,
"step": 206500
},
{
"epoch": 2.070994077157035,
"grad_norm": 5.317368984222412,
"learning_rate": 2.92901592764527e-05,
"loss": 1.093,
"num_input_tokens_seen": 111987688,
"step": 207000
},
{
"epoch": 2.0759964783095888,
"grad_norm": 5.929445743560791,
"learning_rate": 2.9240135264927166e-05,
"loss": 1.124,
"num_input_tokens_seen": 112256088,
"step": 207500
},
{
"epoch": 2.080998879462142,
"grad_norm": 6.658150672912598,
"learning_rate": 2.9190111253401636e-05,
"loss": 1.1068,
"num_input_tokens_seen": 112530064,
"step": 208000
},
{
"epoch": 2.086001280614695,
"grad_norm": 7.434782028198242,
"learning_rate": 2.9140087241876103e-05,
"loss": 1.11,
"num_input_tokens_seen": 112807296,
"step": 208500
},
{
"epoch": 2.091003681767248,
"grad_norm": 6.564949035644531,
"learning_rate": 2.9090063230350566e-05,
"loss": 1.1181,
"num_input_tokens_seen": 113075032,
"step": 209000
},
{
"epoch": 2.0960060829198017,
"grad_norm": 11.387114524841309,
"learning_rate": 2.904003921882504e-05,
"loss": 1.1097,
"num_input_tokens_seen": 113344824,
"step": 209500
},
{
"epoch": 2.101008484072355,
"grad_norm": 5.74482536315918,
"learning_rate": 2.8990015207299502e-05,
"loss": 1.1018,
"num_input_tokens_seen": 113618576,
"step": 210000
},
{
"epoch": 2.106010885224908,
"grad_norm": 5.009258270263672,
"learning_rate": 2.8939991195773976e-05,
"loss": 1.1039,
"num_input_tokens_seen": 113889800,
"step": 210500
},
{
"epoch": 2.111013286377461,
"grad_norm": 7.421350955963135,
"learning_rate": 2.888996718424844e-05,
"loss": 1.1173,
"num_input_tokens_seen": 114168104,
"step": 211000
},
{
"epoch": 2.1160156875300142,
"grad_norm": 6.955892086029053,
"learning_rate": 2.8839943172722912e-05,
"loss": 1.1217,
"num_input_tokens_seen": 114430440,
"step": 211500
},
{
"epoch": 2.121018088682568,
"grad_norm": 7.287781715393066,
"learning_rate": 2.8789919161197375e-05,
"loss": 1.1063,
"num_input_tokens_seen": 114706600,
"step": 212000
},
{
"epoch": 2.126020489835121,
"grad_norm": 7.426519870758057,
"learning_rate": 2.8739895149671842e-05,
"loss": 1.1181,
"num_input_tokens_seen": 114975360,
"step": 212500
},
{
"epoch": 2.131022890987674,
"grad_norm": 6.112298965454102,
"learning_rate": 2.8689871138146312e-05,
"loss": 1.1,
"num_input_tokens_seen": 115252480,
"step": 213000
},
{
"epoch": 2.136025292140227,
"grad_norm": 8.356368064880371,
"learning_rate": 2.863984712662078e-05,
"loss": 1.1382,
"num_input_tokens_seen": 115527840,
"step": 213500
},
{
"epoch": 2.1410276932927808,
"grad_norm": 5.211204528808594,
"learning_rate": 2.858982311509525e-05,
"loss": 1.1241,
"num_input_tokens_seen": 115795240,
"step": 214000
},
{
"epoch": 2.146030094445334,
"grad_norm": 7.513902187347412,
"learning_rate": 2.8539799103569715e-05,
"loss": 1.0984,
"num_input_tokens_seen": 116071656,
"step": 214500
},
{
"epoch": 2.151032495597887,
"grad_norm": 5.553924560546875,
"learning_rate": 2.8489775092044185e-05,
"loss": 1.1138,
"num_input_tokens_seen": 116337944,
"step": 215000
},
{
"epoch": 2.15603489675044,
"grad_norm": 5.7051920890808105,
"learning_rate": 2.843975108051865e-05,
"loss": 1.1193,
"num_input_tokens_seen": 116608560,
"step": 215500
},
{
"epoch": 2.1610372979029933,
"grad_norm": 6.199916362762451,
"learning_rate": 2.8389727068993115e-05,
"loss": 1.114,
"num_input_tokens_seen": 116876312,
"step": 216000
},
{
"epoch": 2.166039699055547,
"grad_norm": 6.054383754730225,
"learning_rate": 2.8339703057467588e-05,
"loss": 1.0965,
"num_input_tokens_seen": 117146704,
"step": 216500
},
{
"epoch": 2.1710421002081,
"grad_norm": 6.129938125610352,
"learning_rate": 2.828967904594205e-05,
"loss": 1.1191,
"num_input_tokens_seen": 117407640,
"step": 217000
},
{
"epoch": 2.176044501360653,
"grad_norm": 8.660636901855469,
"learning_rate": 2.8239655034416524e-05,
"loss": 1.1217,
"num_input_tokens_seen": 117675248,
"step": 217500
},
{
"epoch": 2.181046902513206,
"grad_norm": 5.140537261962891,
"learning_rate": 2.8189631022890988e-05,
"loss": 1.106,
"num_input_tokens_seen": 117946856,
"step": 218000
},
{
"epoch": 2.18604930366576,
"grad_norm": 8.983773231506348,
"learning_rate": 2.8139607011365454e-05,
"loss": 1.1055,
"num_input_tokens_seen": 118218576,
"step": 218500
},
{
"epoch": 2.191051704818313,
"grad_norm": 5.122745513916016,
"learning_rate": 2.8089582999839924e-05,
"loss": 1.1022,
"num_input_tokens_seen": 118493280,
"step": 219000
},
{
"epoch": 2.196054105970866,
"grad_norm": 5.861432075500488,
"learning_rate": 2.803955898831439e-05,
"loss": 1.0946,
"num_input_tokens_seen": 118764672,
"step": 219500
},
{
"epoch": 2.201056507123419,
"grad_norm": 5.456287384033203,
"learning_rate": 2.798953497678886e-05,
"loss": 1.1136,
"num_input_tokens_seen": 119033632,
"step": 220000
},
{
"epoch": 2.2060589082759723,
"grad_norm": 6.379229545593262,
"learning_rate": 2.7939510965263327e-05,
"loss": 1.1123,
"num_input_tokens_seen": 119302136,
"step": 220500
},
{
"epoch": 2.211061309428526,
"grad_norm": 7.430028438568115,
"learning_rate": 2.7889486953737797e-05,
"loss": 1.1079,
"num_input_tokens_seen": 119571112,
"step": 221000
},
{
"epoch": 2.216063710581079,
"grad_norm": 6.985309600830078,
"learning_rate": 2.7839462942212264e-05,
"loss": 1.142,
"num_input_tokens_seen": 119840376,
"step": 221500
},
{
"epoch": 2.221066111733632,
"grad_norm": 5.228456974029541,
"learning_rate": 2.778943893068673e-05,
"loss": 1.111,
"num_input_tokens_seen": 120108496,
"step": 222000
},
{
"epoch": 2.2260685128861852,
"grad_norm": 7.293130874633789,
"learning_rate": 2.77394149191612e-05,
"loss": 1.1067,
"num_input_tokens_seen": 120381296,
"step": 222500
},
{
"epoch": 2.231070914038739,
"grad_norm": 7.219442367553711,
"learning_rate": 2.7689390907635667e-05,
"loss": 1.1331,
"num_input_tokens_seen": 120646192,
"step": 223000
},
{
"epoch": 2.236073315191292,
"grad_norm": 6.636627197265625,
"learning_rate": 2.7639366896110137e-05,
"loss": 1.1266,
"num_input_tokens_seen": 120912232,
"step": 223500
},
{
"epoch": 2.241075716343845,
"grad_norm": 6.974771976470947,
"learning_rate": 2.75893428845846e-05,
"loss": 1.1432,
"num_input_tokens_seen": 121178320,
"step": 224000
},
{
"epoch": 2.246078117496398,
"grad_norm": 6.00003719329834,
"learning_rate": 2.7539318873059073e-05,
"loss": 1.1228,
"num_input_tokens_seen": 121450400,
"step": 224500
},
{
"epoch": 2.2510805186489513,
"grad_norm": 6.582889556884766,
"learning_rate": 2.7489294861533536e-05,
"loss": 1.1236,
"num_input_tokens_seen": 121722344,
"step": 225000
},
{
"epoch": 2.256082919801505,
"grad_norm": 4.923620700836182,
"learning_rate": 2.7439270850008003e-05,
"loss": 1.1133,
"num_input_tokens_seen": 121986208,
"step": 225500
},
{
"epoch": 2.261085320954058,
"grad_norm": 6.316877365112305,
"learning_rate": 2.7389246838482473e-05,
"loss": 1.1163,
"num_input_tokens_seen": 122249640,
"step": 226000
},
{
"epoch": 2.266087722106611,
"grad_norm": 6.2502241134643555,
"learning_rate": 2.733922282695694e-05,
"loss": 1.103,
"num_input_tokens_seen": 122518696,
"step": 226500
},
{
"epoch": 2.2710901232591643,
"grad_norm": 8.201516151428223,
"learning_rate": 2.728919881543141e-05,
"loss": 1.1212,
"num_input_tokens_seen": 122783616,
"step": 227000
},
{
"epoch": 2.276092524411718,
"grad_norm": 5.959327220916748,
"learning_rate": 2.7239174803905876e-05,
"loss": 1.1096,
"num_input_tokens_seen": 123053552,
"step": 227500
},
{
"epoch": 2.281094925564271,
"grad_norm": 9.138140678405762,
"learning_rate": 2.7189150792380342e-05,
"loss": 1.1051,
"num_input_tokens_seen": 123325184,
"step": 228000
},
{
"epoch": 2.286097326716824,
"grad_norm": 6.214888572692871,
"learning_rate": 2.7139126780854812e-05,
"loss": 1.1141,
"num_input_tokens_seen": 123600592,
"step": 228500
},
{
"epoch": 2.291099727869377,
"grad_norm": 6.67230224609375,
"learning_rate": 2.708910276932928e-05,
"loss": 1.1149,
"num_input_tokens_seen": 123876144,
"step": 229000
},
{
"epoch": 2.2961021290219303,
"grad_norm": 7.004880905151367,
"learning_rate": 2.703907875780375e-05,
"loss": 1.1136,
"num_input_tokens_seen": 124149576,
"step": 229500
},
{
"epoch": 2.301104530174484,
"grad_norm": 5.232549667358398,
"learning_rate": 2.6989054746278215e-05,
"loss": 1.136,
"num_input_tokens_seen": 124420360,
"step": 230000
},
{
"epoch": 2.306106931327037,
"grad_norm": 6.569345951080322,
"learning_rate": 2.6939030734752685e-05,
"loss": 1.1136,
"num_input_tokens_seen": 124694320,
"step": 230500
},
{
"epoch": 2.31110933247959,
"grad_norm": 4.602709770202637,
"learning_rate": 2.688900672322715e-05,
"loss": 1.131,
"num_input_tokens_seen": 124973608,
"step": 231000
},
{
"epoch": 2.3161117336321433,
"grad_norm": 7.659350872039795,
"learning_rate": 2.6838982711701615e-05,
"loss": 1.1237,
"num_input_tokens_seen": 125247480,
"step": 231500
},
{
"epoch": 2.321114134784697,
"grad_norm": 5.581116676330566,
"learning_rate": 2.6788958700176088e-05,
"loss": 1.1373,
"num_input_tokens_seen": 125514944,
"step": 232000
},
{
"epoch": 2.32611653593725,
"grad_norm": 6.8799238204956055,
"learning_rate": 2.673893468865055e-05,
"loss": 1.1185,
"num_input_tokens_seen": 125786544,
"step": 232500
},
{
"epoch": 2.331118937089803,
"grad_norm": 6.535116195678711,
"learning_rate": 2.6688910677125025e-05,
"loss": 1.1154,
"num_input_tokens_seen": 126061016,
"step": 233000
},
{
"epoch": 2.3361213382423562,
"grad_norm": 9.319666862487793,
"learning_rate": 2.6638886665599488e-05,
"loss": 1.1293,
"num_input_tokens_seen": 126326104,
"step": 233500
},
{
"epoch": 2.3411237393949094,
"grad_norm": 6.085050582885742,
"learning_rate": 2.6588862654073958e-05,
"loss": 1.1266,
"num_input_tokens_seen": 126600472,
"step": 234000
},
{
"epoch": 2.346126140547463,
"grad_norm": 7.938391208648682,
"learning_rate": 2.6538838642548424e-05,
"loss": 1.1222,
"num_input_tokens_seen": 126867552,
"step": 234500
},
{
"epoch": 2.351128541700016,
"grad_norm": 6.2780537605285645,
"learning_rate": 2.648881463102289e-05,
"loss": 1.0976,
"num_input_tokens_seen": 127135696,
"step": 235000
},
{
"epoch": 2.356130942852569,
"grad_norm": 6.0472731590271,
"learning_rate": 2.643879061949736e-05,
"loss": 1.1141,
"num_input_tokens_seen": 127406128,
"step": 235500
},
{
"epoch": 2.3611333440051223,
"grad_norm": 6.907486438751221,
"learning_rate": 2.6388766607971827e-05,
"loss": 1.1149,
"num_input_tokens_seen": 127678464,
"step": 236000
},
{
"epoch": 2.366135745157676,
"grad_norm": 6.429139137268066,
"learning_rate": 2.6338742596446297e-05,
"loss": 1.1154,
"num_input_tokens_seen": 127947800,
"step": 236500
},
{
"epoch": 2.371138146310229,
"grad_norm": 5.432641506195068,
"learning_rate": 2.6288718584920764e-05,
"loss": 1.1246,
"num_input_tokens_seen": 128221960,
"step": 237000
},
{
"epoch": 2.376140547462782,
"grad_norm": 6.486244201660156,
"learning_rate": 2.6238694573395227e-05,
"loss": 1.1295,
"num_input_tokens_seen": 128492192,
"step": 237500
},
{
"epoch": 2.3811429486153353,
"grad_norm": 6.889167308807373,
"learning_rate": 2.61886705618697e-05,
"loss": 1.1073,
"num_input_tokens_seen": 128761776,
"step": 238000
},
{
"epoch": 2.3861453497678884,
"grad_norm": 5.81854248046875,
"learning_rate": 2.6138646550344163e-05,
"loss": 1.0988,
"num_input_tokens_seen": 129033976,
"step": 238500
},
{
"epoch": 2.391147750920442,
"grad_norm": 6.5693864822387695,
"learning_rate": 2.6088622538818637e-05,
"loss": 1.1225,
"num_input_tokens_seen": 129304680,
"step": 239000
},
{
"epoch": 2.396150152072995,
"grad_norm": 7.3249969482421875,
"learning_rate": 2.60385985272931e-05,
"loss": 1.1134,
"num_input_tokens_seen": 129577608,
"step": 239500
},
{
"epoch": 2.401152553225548,
"grad_norm": 6.37844181060791,
"learning_rate": 2.5988574515767573e-05,
"loss": 1.116,
"num_input_tokens_seen": 129853352,
"step": 240000
},
{
"epoch": 2.4061549543781013,
"grad_norm": 6.640512943267822,
"learning_rate": 2.5938550504242036e-05,
"loss": 1.1158,
"num_input_tokens_seen": 130123560,
"step": 240500
},
{
"epoch": 2.411157355530655,
"grad_norm": 4.907979965209961,
"learning_rate": 2.5888526492716503e-05,
"loss": 1.1116,
"num_input_tokens_seen": 130395312,
"step": 241000
},
{
"epoch": 2.416159756683208,
"grad_norm": 5.592065811157227,
"learning_rate": 2.5838502481190973e-05,
"loss": 1.125,
"num_input_tokens_seen": 130667624,
"step": 241500
},
{
"epoch": 2.421162157835761,
"grad_norm": 6.227156639099121,
"learning_rate": 2.578847846966544e-05,
"loss": 1.1101,
"num_input_tokens_seen": 130936288,
"step": 242000
},
{
"epoch": 2.4261645589883143,
"grad_norm": 6.889796733856201,
"learning_rate": 2.573845445813991e-05,
"loss": 1.1098,
"num_input_tokens_seen": 131202792,
"step": 242500
},
{
"epoch": 2.4311669601408674,
"grad_norm": 6.005047798156738,
"learning_rate": 2.5688430446614376e-05,
"loss": 1.1227,
"num_input_tokens_seen": 131474608,
"step": 243000
},
{
"epoch": 2.436169361293421,
"grad_norm": 6.773987293243408,
"learning_rate": 2.5638406435088846e-05,
"loss": 1.1167,
"num_input_tokens_seen": 131739400,
"step": 243500
},
{
"epoch": 2.441171762445974,
"grad_norm": 8.459389686584473,
"learning_rate": 2.5588382423563312e-05,
"loss": 1.1275,
"num_input_tokens_seen": 132006752,
"step": 244000
},
{
"epoch": 2.4461741635985272,
"grad_norm": 6.094442367553711,
"learning_rate": 2.5538358412037776e-05,
"loss": 1.1035,
"num_input_tokens_seen": 132280008,
"step": 244500
},
{
"epoch": 2.4511765647510804,
"grad_norm": 9.516000747680664,
"learning_rate": 2.548833440051225e-05,
"loss": 1.1194,
"num_input_tokens_seen": 132548616,
"step": 245000
},
{
"epoch": 2.456178965903634,
"grad_norm": 9.94356918334961,
"learning_rate": 2.5438310388986712e-05,
"loss": 1.1136,
"num_input_tokens_seen": 132827096,
"step": 245500
},
{
"epoch": 2.461181367056187,
"grad_norm": 7.003009796142578,
"learning_rate": 2.5388286377461185e-05,
"loss": 1.1257,
"num_input_tokens_seen": 133094168,
"step": 246000
},
{
"epoch": 2.46618376820874,
"grad_norm": 6.280598163604736,
"learning_rate": 2.533826236593565e-05,
"loss": 1.1241,
"num_input_tokens_seen": 133362264,
"step": 246500
},
{
"epoch": 2.4711861693612933,
"grad_norm": 7.221234321594238,
"learning_rate": 2.5288238354410115e-05,
"loss": 1.1095,
"num_input_tokens_seen": 133628496,
"step": 247000
},
{
"epoch": 2.4761885705138464,
"grad_norm": 6.677853584289551,
"learning_rate": 2.5238214342884585e-05,
"loss": 1.1148,
"num_input_tokens_seen": 133902784,
"step": 247500
},
{
"epoch": 2.4811909716664,
"grad_norm": 6.834347248077393,
"learning_rate": 2.518819033135905e-05,
"loss": 1.1128,
"num_input_tokens_seen": 134174984,
"step": 248000
},
{
"epoch": 2.486193372818953,
"grad_norm": 5.890481948852539,
"learning_rate": 2.513816631983352e-05,
"loss": 1.1181,
"num_input_tokens_seen": 134442008,
"step": 248500
},
{
"epoch": 2.4911957739715063,
"grad_norm": 5.24491548538208,
"learning_rate": 2.5088142308307988e-05,
"loss": 1.112,
"num_input_tokens_seen": 134721144,
"step": 249000
},
{
"epoch": 2.4961981751240594,
"grad_norm": 6.424367904663086,
"learning_rate": 2.5038118296782458e-05,
"loss": 1.117,
"num_input_tokens_seen": 134996320,
"step": 249500
},
{
"epoch": 2.501200576276613,
"grad_norm": 5.759153366088867,
"learning_rate": 2.4988094285256925e-05,
"loss": 1.1146,
"num_input_tokens_seen": 135261680,
"step": 250000
},
{
"epoch": 2.506202977429166,
"grad_norm": 6.164818286895752,
"learning_rate": 2.493807027373139e-05,
"loss": 1.1171,
"num_input_tokens_seen": 135530080,
"step": 250500
},
{
"epoch": 2.511205378581719,
"grad_norm": 5.746749401092529,
"learning_rate": 2.488804626220586e-05,
"loss": 1.1156,
"num_input_tokens_seen": 135793120,
"step": 251000
},
{
"epoch": 2.5162077797342723,
"grad_norm": 8.123281478881836,
"learning_rate": 2.4838022250680328e-05,
"loss": 1.1139,
"num_input_tokens_seen": 136063864,
"step": 251500
},
{
"epoch": 2.5212101808868255,
"grad_norm": 5.1486287117004395,
"learning_rate": 2.4787998239154794e-05,
"loss": 1.1146,
"num_input_tokens_seen": 136332816,
"step": 252000
},
{
"epoch": 2.526212582039379,
"grad_norm": 5.926784038543701,
"learning_rate": 2.4737974227629264e-05,
"loss": 1.1072,
"num_input_tokens_seen": 136604928,
"step": 252500
},
{
"epoch": 2.531214983191932,
"grad_norm": 5.782299041748047,
"learning_rate": 2.468795021610373e-05,
"loss": 1.1114,
"num_input_tokens_seen": 136875976,
"step": 253000
},
{
"epoch": 2.5362173843444853,
"grad_norm": 6.699214935302734,
"learning_rate": 2.4637926204578197e-05,
"loss": 1.1129,
"num_input_tokens_seen": 137150704,
"step": 253500
},
{
"epoch": 2.5412197854970384,
"grad_norm": 6.502534866333008,
"learning_rate": 2.4587902193052667e-05,
"loss": 1.0949,
"num_input_tokens_seen": 137430640,
"step": 254000
},
{
"epoch": 2.546222186649592,
"grad_norm": 6.316598892211914,
"learning_rate": 2.4537878181527134e-05,
"loss": 1.1061,
"num_input_tokens_seen": 137704496,
"step": 254500
},
{
"epoch": 2.551224587802145,
"grad_norm": 6.855249881744385,
"learning_rate": 2.44878541700016e-05,
"loss": 1.1147,
"num_input_tokens_seen": 137973064,
"step": 255000
},
{
"epoch": 2.5562269889546982,
"grad_norm": 6.485804080963135,
"learning_rate": 2.443783015847607e-05,
"loss": 1.1081,
"num_input_tokens_seen": 138245472,
"step": 255500
},
{
"epoch": 2.5612293901072514,
"grad_norm": 5.901826858520508,
"learning_rate": 2.4387806146950537e-05,
"loss": 1.1316,
"num_input_tokens_seen": 138521368,
"step": 256000
},
{
"epoch": 2.5662317912598045,
"grad_norm": 10.232002258300781,
"learning_rate": 2.4337782135425007e-05,
"loss": 1.1237,
"num_input_tokens_seen": 138788592,
"step": 256500
},
{
"epoch": 2.571234192412358,
"grad_norm": 6.8045148849487305,
"learning_rate": 2.4287758123899473e-05,
"loss": 1.1205,
"num_input_tokens_seen": 139062416,
"step": 257000
},
{
"epoch": 2.576236593564911,
"grad_norm": 6.035918712615967,
"learning_rate": 2.4237734112373943e-05,
"loss": 1.1063,
"num_input_tokens_seen": 139341400,
"step": 257500
},
{
"epoch": 2.5812389947174643,
"grad_norm": 6.652617454528809,
"learning_rate": 2.418771010084841e-05,
"loss": 1.1018,
"num_input_tokens_seen": 139618008,
"step": 258000
},
{
"epoch": 2.5862413958700174,
"grad_norm": 5.396528720855713,
"learning_rate": 2.4137686089322876e-05,
"loss": 1.1036,
"num_input_tokens_seen": 139893680,
"step": 258500
},
{
"epoch": 2.591243797022571,
"grad_norm": 7.620987415313721,
"learning_rate": 2.4087662077797343e-05,
"loss": 1.1127,
"num_input_tokens_seen": 140161304,
"step": 259000
},
{
"epoch": 2.596246198175124,
"grad_norm": 6.9869279861450195,
"learning_rate": 2.4037638066271813e-05,
"loss": 1.115,
"num_input_tokens_seen": 140437008,
"step": 259500
},
{
"epoch": 2.6012485993276773,
"grad_norm": 6.20002555847168,
"learning_rate": 2.398761405474628e-05,
"loss": 1.1184,
"num_input_tokens_seen": 140703800,
"step": 260000
},
{
"epoch": 2.6062510004802304,
"grad_norm": 5.8140974044799805,
"learning_rate": 2.393759004322075e-05,
"loss": 1.1048,
"num_input_tokens_seen": 140976992,
"step": 260500
},
{
"epoch": 2.6112534016327835,
"grad_norm": 6.32145357131958,
"learning_rate": 2.3887566031695216e-05,
"loss": 1.1023,
"num_input_tokens_seen": 141245264,
"step": 261000
},
{
"epoch": 2.616255802785337,
"grad_norm": 6.455646991729736,
"learning_rate": 2.3837542020169682e-05,
"loss": 1.107,
"num_input_tokens_seen": 141517384,
"step": 261500
},
{
"epoch": 2.62125820393789,
"grad_norm": 6.573545455932617,
"learning_rate": 2.378751800864415e-05,
"loss": 1.1047,
"num_input_tokens_seen": 141791208,
"step": 262000
},
{
"epoch": 2.6262606050904433,
"grad_norm": 9.841447830200195,
"learning_rate": 2.373749399711862e-05,
"loss": 1.0933,
"num_input_tokens_seen": 142065992,
"step": 262500
},
{
"epoch": 2.631263006242997,
"grad_norm": 6.491105556488037,
"learning_rate": 2.3687469985593085e-05,
"loss": 1.1132,
"num_input_tokens_seen": 142333808,
"step": 263000
},
{
"epoch": 2.63626540739555,
"grad_norm": 5.914114952087402,
"learning_rate": 2.3637445974067555e-05,
"loss": 1.1106,
"num_input_tokens_seen": 142602960,
"step": 263500
},
{
"epoch": 2.641267808548103,
"grad_norm": 5.9673261642456055,
"learning_rate": 2.3587421962542022e-05,
"loss": 1.1387,
"num_input_tokens_seen": 142870032,
"step": 264000
},
{
"epoch": 2.6462702097006563,
"grad_norm": 6.37895393371582,
"learning_rate": 2.353739795101649e-05,
"loss": 1.1133,
"num_input_tokens_seen": 143142496,
"step": 264500
},
{
"epoch": 2.6512726108532094,
"grad_norm": 6.1890692710876465,
"learning_rate": 2.3487373939490955e-05,
"loss": 1.1088,
"num_input_tokens_seen": 143420168,
"step": 265000
},
{
"epoch": 2.6562750120057625,
"grad_norm": 5.707185745239258,
"learning_rate": 2.3437349927965425e-05,
"loss": 1.1127,
"num_input_tokens_seen": 143692648,
"step": 265500
},
{
"epoch": 2.661277413158316,
"grad_norm": 6.048717975616455,
"learning_rate": 2.338732591643989e-05,
"loss": 1.1068,
"num_input_tokens_seen": 143964856,
"step": 266000
},
{
"epoch": 2.6662798143108692,
"grad_norm": 5.904679775238037,
"learning_rate": 2.333730190491436e-05,
"loss": 1.1039,
"num_input_tokens_seen": 144228744,
"step": 266500
},
{
"epoch": 2.6712822154634224,
"grad_norm": 6.36087703704834,
"learning_rate": 2.3287277893388828e-05,
"loss": 1.1197,
"num_input_tokens_seen": 144501872,
"step": 267000
},
{
"epoch": 2.676284616615976,
"grad_norm": 5.9171576499938965,
"learning_rate": 2.3237253881863298e-05,
"loss": 1.1192,
"num_input_tokens_seen": 144768432,
"step": 267500
},
{
"epoch": 2.681287017768529,
"grad_norm": 6.9919514656066895,
"learning_rate": 2.318722987033776e-05,
"loss": 1.1093,
"num_input_tokens_seen": 145035880,
"step": 268000
},
{
"epoch": 2.686289418921082,
"grad_norm": 5.2417826652526855,
"learning_rate": 2.313720585881223e-05,
"loss": 1.1293,
"num_input_tokens_seen": 145311552,
"step": 268500
},
{
"epoch": 2.6912918200736353,
"grad_norm": 5.52398681640625,
"learning_rate": 2.3087181847286697e-05,
"loss": 1.107,
"num_input_tokens_seen": 145584192,
"step": 269000
},
{
"epoch": 2.6962942212261884,
"grad_norm": 6.279477119445801,
"learning_rate": 2.3037157835761167e-05,
"loss": 1.0918,
"num_input_tokens_seen": 145854640,
"step": 269500
},
{
"epoch": 2.7012966223787416,
"grad_norm": 8.50329303741455,
"learning_rate": 2.2987133824235634e-05,
"loss": 1.1232,
"num_input_tokens_seen": 146125568,
"step": 270000
},
{
"epoch": 2.706299023531295,
"grad_norm": 7.494457721710205,
"learning_rate": 2.2937109812710104e-05,
"loss": 1.1178,
"num_input_tokens_seen": 146388376,
"step": 270500
},
{
"epoch": 2.7113014246838483,
"grad_norm": 5.595491886138916,
"learning_rate": 2.2887085801184567e-05,
"loss": 1.1144,
"num_input_tokens_seen": 146655392,
"step": 271000
},
{
"epoch": 2.7163038258364014,
"grad_norm": 7.584702968597412,
"learning_rate": 2.2837061789659037e-05,
"loss": 1.107,
"num_input_tokens_seen": 146920408,
"step": 271500
},
{
"epoch": 2.721306226988955,
"grad_norm": 5.952847957611084,
"learning_rate": 2.2787037778133504e-05,
"loss": 1.0989,
"num_input_tokens_seen": 147193680,
"step": 272000
},
{
"epoch": 2.726308628141508,
"grad_norm": 5.385768413543701,
"learning_rate": 2.2737013766607973e-05,
"loss": 1.1169,
"num_input_tokens_seen": 147465336,
"step": 272500
},
{
"epoch": 2.731311029294061,
"grad_norm": 7.199370861053467,
"learning_rate": 2.268698975508244e-05,
"loss": 1.119,
"num_input_tokens_seen": 147733784,
"step": 273000
},
{
"epoch": 2.7363134304466143,
"grad_norm": 6.557952880859375,
"learning_rate": 2.263696574355691e-05,
"loss": 1.0966,
"num_input_tokens_seen": 147998984,
"step": 273500
},
{
"epoch": 2.7413158315991675,
"grad_norm": 6.291484355926514,
"learning_rate": 2.2586941732031373e-05,
"loss": 1.1039,
"num_input_tokens_seen": 148268168,
"step": 274000
},
{
"epoch": 2.7463182327517206,
"grad_norm": 5.747891426086426,
"learning_rate": 2.2536917720505843e-05,
"loss": 1.1006,
"num_input_tokens_seen": 148546456,
"step": 274500
},
{
"epoch": 2.751320633904274,
"grad_norm": 5.766910552978516,
"learning_rate": 2.248689370898031e-05,
"loss": 1.1216,
"num_input_tokens_seen": 148815712,
"step": 275000
},
{
"epoch": 2.7563230350568273,
"grad_norm": 6.185927391052246,
"learning_rate": 2.243686969745478e-05,
"loss": 1.1007,
"num_input_tokens_seen": 149086416,
"step": 275500
},
{
"epoch": 2.7613254362093804,
"grad_norm": 7.301943778991699,
"learning_rate": 2.2386845685929246e-05,
"loss": 1.1182,
"num_input_tokens_seen": 149351152,
"step": 276000
},
{
"epoch": 2.766327837361934,
"grad_norm": 4.440983295440674,
"learning_rate": 2.2336821674403716e-05,
"loss": 1.1125,
"num_input_tokens_seen": 149617360,
"step": 276500
},
{
"epoch": 2.771330238514487,
"grad_norm": 6.778481483459473,
"learning_rate": 2.2286797662878183e-05,
"loss": 1.1108,
"num_input_tokens_seen": 149884296,
"step": 277000
},
{
"epoch": 2.7763326396670402,
"grad_norm": 6.671989440917969,
"learning_rate": 2.223677365135265e-05,
"loss": 1.0942,
"num_input_tokens_seen": 150155088,
"step": 277500
},
{
"epoch": 2.7813350408195934,
"grad_norm": 6.532144069671631,
"learning_rate": 2.218674963982712e-05,
"loss": 1.1102,
"num_input_tokens_seen": 150424504,
"step": 278000
},
{
"epoch": 2.7863374419721465,
"grad_norm": 7.665340423583984,
"learning_rate": 2.2136725628301586e-05,
"loss": 1.0995,
"num_input_tokens_seen": 150697200,
"step": 278500
},
{
"epoch": 2.7913398431246996,
"grad_norm": 8.809953689575195,
"learning_rate": 2.2086701616776052e-05,
"loss": 1.0934,
"num_input_tokens_seen": 150963832,
"step": 279000
},
{
"epoch": 2.796342244277253,
"grad_norm": 6.865957260131836,
"learning_rate": 2.2036677605250522e-05,
"loss": 1.0941,
"num_input_tokens_seen": 151236880,
"step": 279500
},
{
"epoch": 2.8013446454298063,
"grad_norm": 8.230210304260254,
"learning_rate": 2.198665359372499e-05,
"loss": 1.105,
"num_input_tokens_seen": 151510320,
"step": 280000
},
{
"epoch": 2.8063470465823595,
"grad_norm": 5.514502048492432,
"learning_rate": 2.1936629582199455e-05,
"loss": 1.1129,
"num_input_tokens_seen": 151773160,
"step": 280500
},
{
"epoch": 2.811349447734913,
"grad_norm": 6.41658353805542,
"learning_rate": 2.1886605570673925e-05,
"loss": 1.1234,
"num_input_tokens_seen": 152043968,
"step": 281000
},
{
"epoch": 2.816351848887466,
"grad_norm": 10.474380493164062,
"learning_rate": 2.183658155914839e-05,
"loss": 1.1109,
"num_input_tokens_seen": 152312528,
"step": 281500
},
{
"epoch": 2.8213542500400193,
"grad_norm": 6.710339069366455,
"learning_rate": 2.178655754762286e-05,
"loss": 1.1007,
"num_input_tokens_seen": 152583128,
"step": 282000
},
{
"epoch": 2.8263566511925724,
"grad_norm": 6.992675304412842,
"learning_rate": 2.1736533536097328e-05,
"loss": 1.0936,
"num_input_tokens_seen": 152859696,
"step": 282500
},
{
"epoch": 2.8313590523451255,
"grad_norm": 5.590021133422852,
"learning_rate": 2.1686509524571795e-05,
"loss": 1.1047,
"num_input_tokens_seen": 153129472,
"step": 283000
},
{
"epoch": 2.8363614534976787,
"grad_norm": 5.853962421417236,
"learning_rate": 2.1636485513046265e-05,
"loss": 1.116,
"num_input_tokens_seen": 153397560,
"step": 283500
},
{
"epoch": 2.8413638546502322,
"grad_norm": 5.7029242515563965,
"learning_rate": 2.158646150152073e-05,
"loss": 1.1153,
"num_input_tokens_seen": 153671176,
"step": 284000
},
{
"epoch": 2.8463662558027853,
"grad_norm": 6.952505111694336,
"learning_rate": 2.1536437489995198e-05,
"loss": 1.1153,
"num_input_tokens_seen": 153937560,
"step": 284500
},
{
"epoch": 2.8513686569553385,
"grad_norm": 5.1992902755737305,
"learning_rate": 2.1486413478469668e-05,
"loss": 1.0892,
"num_input_tokens_seen": 154210728,
"step": 285000
},
{
"epoch": 2.856371058107892,
"grad_norm": 5.967268943786621,
"learning_rate": 2.1436389466944134e-05,
"loss": 1.0971,
"num_input_tokens_seen": 154484848,
"step": 285500
},
{
"epoch": 2.861373459260445,
"grad_norm": 7.573243618011475,
"learning_rate": 2.1386365455418604e-05,
"loss": 1.1015,
"num_input_tokens_seen": 154756800,
"step": 286000
},
{
"epoch": 2.8663758604129983,
"grad_norm": 6.0880584716796875,
"learning_rate": 2.133634144389307e-05,
"loss": 1.0941,
"num_input_tokens_seen": 155027568,
"step": 286500
},
{
"epoch": 2.8713782615655514,
"grad_norm": 8.533178329467773,
"learning_rate": 2.1286317432367537e-05,
"loss": 1.0951,
"num_input_tokens_seen": 155296992,
"step": 287000
},
{
"epoch": 2.8763806627181046,
"grad_norm": 6.032339096069336,
"learning_rate": 2.1236293420842004e-05,
"loss": 1.1088,
"num_input_tokens_seen": 155565816,
"step": 287500
},
{
"epoch": 2.8813830638706577,
"grad_norm": 7.005359649658203,
"learning_rate": 2.1186269409316474e-05,
"loss": 1.0966,
"num_input_tokens_seen": 155835736,
"step": 288000
},
{
"epoch": 2.8863854650232112,
"grad_norm": 6.709108829498291,
"learning_rate": 2.113624539779094e-05,
"loss": 1.0999,
"num_input_tokens_seen": 156105320,
"step": 288500
},
{
"epoch": 2.8913878661757644,
"grad_norm": 6.140367031097412,
"learning_rate": 2.108622138626541e-05,
"loss": 1.0919,
"num_input_tokens_seen": 156373336,
"step": 289000
},
{
"epoch": 2.8963902673283175,
"grad_norm": 6.799286365509033,
"learning_rate": 2.1036197374739877e-05,
"loss": 1.0977,
"num_input_tokens_seen": 156645480,
"step": 289500
},
{
"epoch": 2.901392668480871,
"grad_norm": 7.2591023445129395,
"learning_rate": 2.0986173363214343e-05,
"loss": 1.1045,
"num_input_tokens_seen": 156919152,
"step": 290000
},
{
"epoch": 2.906395069633424,
"grad_norm": 6.572688102722168,
"learning_rate": 2.093614935168881e-05,
"loss": 1.0954,
"num_input_tokens_seen": 157189944,
"step": 290500
},
{
"epoch": 2.9113974707859773,
"grad_norm": 8.598788261413574,
"learning_rate": 2.088612534016328e-05,
"loss": 1.1201,
"num_input_tokens_seen": 157462520,
"step": 291000
},
{
"epoch": 2.9163998719385305,
"grad_norm": 7.680613994598389,
"learning_rate": 2.0836101328637746e-05,
"loss": 1.0966,
"num_input_tokens_seen": 157734256,
"step": 291500
},
{
"epoch": 2.9214022730910836,
"grad_norm": 7.925107479095459,
"learning_rate": 2.0786077317112216e-05,
"loss": 1.0918,
"num_input_tokens_seen": 158006528,
"step": 292000
},
{
"epoch": 2.9264046742436367,
"grad_norm": 5.118693828582764,
"learning_rate": 2.0736053305586683e-05,
"loss": 1.0932,
"num_input_tokens_seen": 158275552,
"step": 292500
},
{
"epoch": 2.9314070753961903,
"grad_norm": 4.836045265197754,
"learning_rate": 2.0686029294061153e-05,
"loss": 1.0973,
"num_input_tokens_seen": 158547424,
"step": 293000
},
{
"epoch": 2.9364094765487434,
"grad_norm": 5.422683238983154,
"learning_rate": 2.0636005282535616e-05,
"loss": 1.1073,
"num_input_tokens_seen": 158818568,
"step": 293500
},
{
"epoch": 2.9414118777012965,
"grad_norm": 6.608382225036621,
"learning_rate": 2.0585981271010086e-05,
"loss": 1.0836,
"num_input_tokens_seen": 159088240,
"step": 294000
},
{
"epoch": 2.94641427885385,
"grad_norm": 5.50745153427124,
"learning_rate": 2.0535957259484552e-05,
"loss": 1.1019,
"num_input_tokens_seen": 159356336,
"step": 294500
},
{
"epoch": 2.9514166800064032,
"grad_norm": 5.7239251136779785,
"learning_rate": 2.0485933247959022e-05,
"loss": 1.0824,
"num_input_tokens_seen": 159630264,
"step": 295000
},
{
"epoch": 2.9564190811589564,
"grad_norm": 6.992796421051025,
"learning_rate": 2.043590923643349e-05,
"loss": 1.1014,
"num_input_tokens_seen": 159892336,
"step": 295500
},
{
"epoch": 2.9614214823115095,
"grad_norm": 6.0249433517456055,
"learning_rate": 2.038588522490796e-05,
"loss": 1.0831,
"num_input_tokens_seen": 160159512,
"step": 296000
},
{
"epoch": 2.9664238834640626,
"grad_norm": 5.429805755615234,
"learning_rate": 2.0335861213382422e-05,
"loss": 1.0991,
"num_input_tokens_seen": 160423896,
"step": 296500
},
{
"epoch": 2.9714262846166157,
"grad_norm": 7.0117034912109375,
"learning_rate": 2.0285837201856892e-05,
"loss": 1.0682,
"num_input_tokens_seen": 160699992,
"step": 297000
},
{
"epoch": 2.9764286857691693,
"grad_norm": 4.545111179351807,
"learning_rate": 2.023581319033136e-05,
"loss": 1.0976,
"num_input_tokens_seen": 160970504,
"step": 297500
},
{
"epoch": 2.9814310869217224,
"grad_norm": 7.641571998596191,
"learning_rate": 2.018578917880583e-05,
"loss": 1.1149,
"num_input_tokens_seen": 161244848,
"step": 298000
},
{
"epoch": 2.9864334880742756,
"grad_norm": 5.6191205978393555,
"learning_rate": 2.0135765167280295e-05,
"loss": 1.0841,
"num_input_tokens_seen": 161521312,
"step": 298500
},
{
"epoch": 2.991435889226829,
"grad_norm": 7.104705810546875,
"learning_rate": 2.0085741155754765e-05,
"loss": 1.1083,
"num_input_tokens_seen": 161787136,
"step": 299000
},
{
"epoch": 2.9964382903793823,
"grad_norm": 7.319199085235596,
"learning_rate": 2.0035717144229228e-05,
"loss": 1.0933,
"num_input_tokens_seen": 162058632,
"step": 299500
},
{
"epoch": 3.0,
"eval_loss": 1.0591504573822021,
"eval_runtime": 193.2048,
"eval_samples_per_second": 1034.679,
"eval_steps_per_second": 129.339,
"num_input_tokens_seen": 162248288,
"step": 299856
},
{
"epoch": 3.0014406915319354,
"grad_norm": 6.5569539070129395,
"learning_rate": 1.9985693132703698e-05,
"loss": 1.0801,
"num_input_tokens_seen": 162329952,
"step": 300000
},
{
"epoch": 3.0064430926844885,
"grad_norm": 4.675987720489502,
"learning_rate": 1.9935669121178165e-05,
"loss": 0.9946,
"num_input_tokens_seen": 162606936,
"step": 300500
},
{
"epoch": 3.0114454938370416,
"grad_norm": 9.786909103393555,
"learning_rate": 1.9885645109652635e-05,
"loss": 1.0173,
"num_input_tokens_seen": 162883112,
"step": 301000
},
{
"epoch": 3.016447894989595,
"grad_norm": 7.118892669677734,
"learning_rate": 1.98356210981271e-05,
"loss": 0.9956,
"num_input_tokens_seen": 163158976,
"step": 301500
},
{
"epoch": 3.0214502961421483,
"grad_norm": 4.8414411544799805,
"learning_rate": 1.978559708660157e-05,
"loss": 0.9909,
"num_input_tokens_seen": 163431480,
"step": 302000
},
{
"epoch": 3.0264526972947015,
"grad_norm": 6.550401210784912,
"learning_rate": 1.9735573075076038e-05,
"loss": 1.0088,
"num_input_tokens_seen": 163705656,
"step": 302500
},
{
"epoch": 3.0314550984472546,
"grad_norm": 6.2179694175720215,
"learning_rate": 1.9685549063550504e-05,
"loss": 1.0046,
"num_input_tokens_seen": 163978520,
"step": 303000
},
{
"epoch": 3.0364574995998077,
"grad_norm": 5.6524224281311035,
"learning_rate": 1.963552505202497e-05,
"loss": 0.9962,
"num_input_tokens_seen": 164246424,
"step": 303500
},
{
"epoch": 3.0414599007523613,
"grad_norm": 6.216259479522705,
"learning_rate": 1.958550104049944e-05,
"loss": 1.0032,
"num_input_tokens_seen": 164520760,
"step": 304000
},
{
"epoch": 3.0464623019049144,
"grad_norm": 5.4327311515808105,
"learning_rate": 1.9535477028973907e-05,
"loss": 0.9958,
"num_input_tokens_seen": 164791976,
"step": 304500
},
{
"epoch": 3.0514647030574675,
"grad_norm": 6.64623498916626,
"learning_rate": 1.9485453017448377e-05,
"loss": 1.0027,
"num_input_tokens_seen": 165064512,
"step": 305000
},
{
"epoch": 3.0564671042100207,
"grad_norm": 5.067431449890137,
"learning_rate": 1.9435429005922844e-05,
"loss": 1.0095,
"num_input_tokens_seen": 165335384,
"step": 305500
},
{
"epoch": 3.0614695053625742,
"grad_norm": 5.332586765289307,
"learning_rate": 1.938540499439731e-05,
"loss": 1.0001,
"num_input_tokens_seen": 165600752,
"step": 306000
},
{
"epoch": 3.0664719065151274,
"grad_norm": 6.432159900665283,
"learning_rate": 1.933538098287178e-05,
"loss": 0.9977,
"num_input_tokens_seen": 165870160,
"step": 306500
},
{
"epoch": 3.0714743076676805,
"grad_norm": 6.297356605529785,
"learning_rate": 1.9285356971346247e-05,
"loss": 0.9981,
"num_input_tokens_seen": 166137752,
"step": 307000
},
{
"epoch": 3.0764767088202336,
"grad_norm": 6.82805871963501,
"learning_rate": 1.9235332959820717e-05,
"loss": 1.008,
"num_input_tokens_seen": 166409344,
"step": 307500
},
{
"epoch": 3.0814791099727867,
"grad_norm": 5.371485710144043,
"learning_rate": 1.9185308948295183e-05,
"loss": 1.0077,
"num_input_tokens_seen": 166679600,
"step": 308000
},
{
"epoch": 3.0864815111253403,
"grad_norm": 5.552392482757568,
"learning_rate": 1.913528493676965e-05,
"loss": 1.0154,
"num_input_tokens_seen": 166951960,
"step": 308500
},
{
"epoch": 3.0914839122778934,
"grad_norm": 5.485569953918457,
"learning_rate": 1.908526092524412e-05,
"loss": 1.0183,
"num_input_tokens_seen": 167229768,
"step": 309000
},
{
"epoch": 3.0964863134304466,
"grad_norm": 7.161227226257324,
"learning_rate": 1.9035236913718586e-05,
"loss": 0.9953,
"num_input_tokens_seen": 167500816,
"step": 309500
},
{
"epoch": 3.1014887145829997,
"grad_norm": 6.685337543487549,
"learning_rate": 1.8985212902193053e-05,
"loss": 1.0126,
"num_input_tokens_seen": 167761944,
"step": 310000
},
{
"epoch": 3.1064911157355533,
"grad_norm": 7.007294178009033,
"learning_rate": 1.8935188890667523e-05,
"loss": 1.0168,
"num_input_tokens_seen": 168031200,
"step": 310500
},
{
"epoch": 3.1114935168881064,
"grad_norm": 4.6598615646362305,
"learning_rate": 1.888516487914199e-05,
"loss": 0.9972,
"num_input_tokens_seen": 168306704,
"step": 311000
},
{
"epoch": 3.1164959180406595,
"grad_norm": 6.378694534301758,
"learning_rate": 1.883514086761646e-05,
"loss": 1.0057,
"num_input_tokens_seen": 168578560,
"step": 311500
},
{
"epoch": 3.1214983191932126,
"grad_norm": 4.70497465133667,
"learning_rate": 1.8785116856090926e-05,
"loss": 0.9805,
"num_input_tokens_seen": 168857808,
"step": 312000
},
{
"epoch": 3.1265007203457658,
"grad_norm": 6.10917329788208,
"learning_rate": 1.8735092844565392e-05,
"loss": 1.0013,
"num_input_tokens_seen": 169130360,
"step": 312500
},
{
"epoch": 3.1315031214983193,
"grad_norm": 5.804021835327148,
"learning_rate": 1.868506883303986e-05,
"loss": 1.0111,
"num_input_tokens_seen": 169403088,
"step": 313000
},
{
"epoch": 3.1365055226508725,
"grad_norm": 6.582799911499023,
"learning_rate": 1.863504482151433e-05,
"loss": 1.0181,
"num_input_tokens_seen": 169678360,
"step": 313500
},
{
"epoch": 3.1415079238034256,
"grad_norm": 6.149540424346924,
"learning_rate": 1.8585020809988795e-05,
"loss": 1.0122,
"num_input_tokens_seen": 169944568,
"step": 314000
},
{
"epoch": 3.1465103249559787,
"grad_norm": 8.258193969726562,
"learning_rate": 1.8534996798463265e-05,
"loss": 0.9965,
"num_input_tokens_seen": 170213776,
"step": 314500
},
{
"epoch": 3.1515127261085323,
"grad_norm": 5.790067195892334,
"learning_rate": 1.8484972786937732e-05,
"loss": 1.0083,
"num_input_tokens_seen": 170487696,
"step": 315000
},
{
"epoch": 3.1565151272610854,
"grad_norm": 6.756849765777588,
"learning_rate": 1.84349487754122e-05,
"loss": 1.0112,
"num_input_tokens_seen": 170763864,
"step": 315500
},
{
"epoch": 3.1615175284136385,
"grad_norm": 8.113907814025879,
"learning_rate": 1.8384924763886665e-05,
"loss": 1.0113,
"num_input_tokens_seen": 171034920,
"step": 316000
},
{
"epoch": 3.1665199295661917,
"grad_norm": 6.584122180938721,
"learning_rate": 1.8334900752361135e-05,
"loss": 1.0071,
"num_input_tokens_seen": 171319328,
"step": 316500
},
{
"epoch": 3.1715223307187452,
"grad_norm": 7.648674964904785,
"learning_rate": 1.82848767408356e-05,
"loss": 1.0092,
"num_input_tokens_seen": 171587048,
"step": 317000
},
{
"epoch": 3.1765247318712984,
"grad_norm": 6.150393486022949,
"learning_rate": 1.823485272931007e-05,
"loss": 1.0213,
"num_input_tokens_seen": 171854768,
"step": 317500
},
{
"epoch": 3.1815271330238515,
"grad_norm": 6.078028202056885,
"learning_rate": 1.8184828717784538e-05,
"loss": 1.0129,
"num_input_tokens_seen": 172122080,
"step": 318000
},
{
"epoch": 3.1865295341764046,
"grad_norm": 6.360128402709961,
"learning_rate": 1.8134804706259008e-05,
"loss": 0.9966,
"num_input_tokens_seen": 172386096,
"step": 318500
},
{
"epoch": 3.1915319353289577,
"grad_norm": 7.205709934234619,
"learning_rate": 1.808478069473347e-05,
"loss": 1.019,
"num_input_tokens_seen": 172656264,
"step": 319000
},
{
"epoch": 3.1965343364815113,
"grad_norm": 6.01072883605957,
"learning_rate": 1.803475668320794e-05,
"loss": 1.0104,
"num_input_tokens_seen": 172928696,
"step": 319500
},
{
"epoch": 3.2015367376340644,
"grad_norm": 5.552466869354248,
"learning_rate": 1.7984732671682407e-05,
"loss": 1.0045,
"num_input_tokens_seen": 173193664,
"step": 320000
},
{
"epoch": 3.2065391387866176,
"grad_norm": 6.396902561187744,
"learning_rate": 1.7934708660156877e-05,
"loss": 1.007,
"num_input_tokens_seen": 173460776,
"step": 320500
},
{
"epoch": 3.2115415399391707,
"grad_norm": 5.7529778480529785,
"learning_rate": 1.7884684648631344e-05,
"loss": 1.0185,
"num_input_tokens_seen": 173728072,
"step": 321000
},
{
"epoch": 3.2165439410917243,
"grad_norm": 8.143234252929688,
"learning_rate": 1.7834660637105814e-05,
"loss": 1.0306,
"num_input_tokens_seen": 173997832,
"step": 321500
},
{
"epoch": 3.2215463422442774,
"grad_norm": 5.61393928527832,
"learning_rate": 1.7784636625580277e-05,
"loss": 1.0206,
"num_input_tokens_seen": 174268112,
"step": 322000
},
{
"epoch": 3.2265487433968305,
"grad_norm": 5.928481578826904,
"learning_rate": 1.7734612614054747e-05,
"loss": 0.9988,
"num_input_tokens_seen": 174533880,
"step": 322500
},
{
"epoch": 3.2315511445493836,
"grad_norm": 5.389233589172363,
"learning_rate": 1.7684588602529213e-05,
"loss": 1.02,
"num_input_tokens_seen": 174799568,
"step": 323000
},
{
"epoch": 3.2365535457019368,
"grad_norm": 7.281908988952637,
"learning_rate": 1.7634564591003683e-05,
"loss": 0.9986,
"num_input_tokens_seen": 175066168,
"step": 323500
},
{
"epoch": 3.2415559468544903,
"grad_norm": 6.380090713500977,
"learning_rate": 1.758454057947815e-05,
"loss": 1.0057,
"num_input_tokens_seen": 175336944,
"step": 324000
},
{
"epoch": 3.2465583480070435,
"grad_norm": 6.550302982330322,
"learning_rate": 1.753451656795262e-05,
"loss": 1.0171,
"num_input_tokens_seen": 175611424,
"step": 324500
},
{
"epoch": 3.2515607491595966,
"grad_norm": 5.882409572601318,
"learning_rate": 1.7484492556427083e-05,
"loss": 1.0112,
"num_input_tokens_seen": 175879768,
"step": 325000
},
{
"epoch": 3.2565631503121497,
"grad_norm": 6.047407627105713,
"learning_rate": 1.7434468544901553e-05,
"loss": 1.0332,
"num_input_tokens_seen": 176157824,
"step": 325500
},
{
"epoch": 3.2615655514647033,
"grad_norm": 6.9863691329956055,
"learning_rate": 1.738444453337602e-05,
"loss": 1.0008,
"num_input_tokens_seen": 176428696,
"step": 326000
},
{
"epoch": 3.2665679526172564,
"grad_norm": 6.332062721252441,
"learning_rate": 1.733442052185049e-05,
"loss": 1.0005,
"num_input_tokens_seen": 176707880,
"step": 326500
},
{
"epoch": 3.2715703537698095,
"grad_norm": 5.405006408691406,
"learning_rate": 1.7284396510324956e-05,
"loss": 1.0249,
"num_input_tokens_seen": 176978528,
"step": 327000
},
{
"epoch": 3.2765727549223627,
"grad_norm": 6.076756477355957,
"learning_rate": 1.7234372498799426e-05,
"loss": 1.0156,
"num_input_tokens_seen": 177248232,
"step": 327500
},
{
"epoch": 3.281575156074916,
"grad_norm": 7.379303932189941,
"learning_rate": 1.7184348487273893e-05,
"loss": 1.0039,
"num_input_tokens_seen": 177519848,
"step": 328000
},
{
"epoch": 3.2865775572274694,
"grad_norm": 6.788669109344482,
"learning_rate": 1.713432447574836e-05,
"loss": 1.0092,
"num_input_tokens_seen": 177789288,
"step": 328500
},
{
"epoch": 3.2915799583800225,
"grad_norm": 6.320953369140625,
"learning_rate": 1.7084300464222826e-05,
"loss": 1.0143,
"num_input_tokens_seen": 178056680,
"step": 329000
},
{
"epoch": 3.2965823595325756,
"grad_norm": 6.267603397369385,
"learning_rate": 1.7034276452697296e-05,
"loss": 1.0023,
"num_input_tokens_seen": 178329192,
"step": 329500
},
{
"epoch": 3.3015847606851287,
"grad_norm": 5.770685195922852,
"learning_rate": 1.6984252441171762e-05,
"loss": 1.0279,
"num_input_tokens_seen": 178597800,
"step": 330000
},
{
"epoch": 3.3065871618376823,
"grad_norm": 6.4189863204956055,
"learning_rate": 1.6934228429646232e-05,
"loss": 1.0052,
"num_input_tokens_seen": 178869992,
"step": 330500
},
{
"epoch": 3.3115895629902354,
"grad_norm": 5.872836589813232,
"learning_rate": 1.68842044181207e-05,
"loss": 0.9991,
"num_input_tokens_seen": 179139968,
"step": 331000
},
{
"epoch": 3.3165919641427886,
"grad_norm": 4.180021286010742,
"learning_rate": 1.6834180406595165e-05,
"loss": 1.0087,
"num_input_tokens_seen": 179411592,
"step": 331500
},
{
"epoch": 3.3215943652953417,
"grad_norm": 5.875650405883789,
"learning_rate": 1.6784156395069635e-05,
"loss": 1.0076,
"num_input_tokens_seen": 179672616,
"step": 332000
},
{
"epoch": 3.326596766447895,
"grad_norm": 5.798732280731201,
"learning_rate": 1.67341323835441e-05,
"loss": 1.0121,
"num_input_tokens_seen": 179943344,
"step": 332500
},
{
"epoch": 3.3315991676004484,
"grad_norm": 5.229135513305664,
"learning_rate": 1.6684108372018568e-05,
"loss": 1.018,
"num_input_tokens_seen": 180212504,
"step": 333000
},
{
"epoch": 3.3366015687530015,
"grad_norm": 6.477422714233398,
"learning_rate": 1.6634084360493038e-05,
"loss": 0.9992,
"num_input_tokens_seen": 180482184,
"step": 333500
},
{
"epoch": 3.3416039699055546,
"grad_norm": 6.4892497062683105,
"learning_rate": 1.6584060348967505e-05,
"loss": 1.0223,
"num_input_tokens_seen": 180754256,
"step": 334000
},
{
"epoch": 3.3466063710581078,
"grad_norm": 5.80246639251709,
"learning_rate": 1.653403633744197e-05,
"loss": 1.0102,
"num_input_tokens_seen": 181030712,
"step": 334500
},
{
"epoch": 3.3516087722106613,
"grad_norm": 5.75023078918457,
"learning_rate": 1.648401232591644e-05,
"loss": 1.0111,
"num_input_tokens_seen": 181295800,
"step": 335000
},
{
"epoch": 3.3566111733632145,
"grad_norm": 5.059189796447754,
"learning_rate": 1.6433988314390908e-05,
"loss": 1.01,
"num_input_tokens_seen": 181561000,
"step": 335500
},
{
"epoch": 3.3616135745157676,
"grad_norm": 5.798236846923828,
"learning_rate": 1.6383964302865378e-05,
"loss": 1.0201,
"num_input_tokens_seen": 181836528,
"step": 336000
},
{
"epoch": 3.3666159756683207,
"grad_norm": 6.402642250061035,
"learning_rate": 1.6333940291339844e-05,
"loss": 1.0191,
"num_input_tokens_seen": 182106640,
"step": 336500
},
{
"epoch": 3.371618376820874,
"grad_norm": 6.876767635345459,
"learning_rate": 1.6283916279814314e-05,
"loss": 1.0171,
"num_input_tokens_seen": 182385832,
"step": 337000
},
{
"epoch": 3.3766207779734274,
"grad_norm": 8.243651390075684,
"learning_rate": 1.623389226828878e-05,
"loss": 1.0136,
"num_input_tokens_seen": 182652112,
"step": 337500
},
{
"epoch": 3.3816231791259805,
"grad_norm": 6.981409072875977,
"learning_rate": 1.6183868256763247e-05,
"loss": 1.0027,
"num_input_tokens_seen": 182920216,
"step": 338000
},
{
"epoch": 3.3866255802785337,
"grad_norm": 5.377172470092773,
"learning_rate": 1.6133844245237714e-05,
"loss": 1.0129,
"num_input_tokens_seen": 183185600,
"step": 338500
},
{
"epoch": 3.391627981431087,
"grad_norm": 7.256112575531006,
"learning_rate": 1.6083820233712184e-05,
"loss": 1.0061,
"num_input_tokens_seen": 183453440,
"step": 339000
},
{
"epoch": 3.3966303825836404,
"grad_norm": 5.170373916625977,
"learning_rate": 1.603379622218665e-05,
"loss": 1.0127,
"num_input_tokens_seen": 183722560,
"step": 339500
},
{
"epoch": 3.4016327837361935,
"grad_norm": 5.4537248611450195,
"learning_rate": 1.598377221066112e-05,
"loss": 1.0084,
"num_input_tokens_seen": 183991520,
"step": 340000
},
{
"epoch": 3.4066351848887466,
"grad_norm": 6.006635665893555,
"learning_rate": 1.5933748199135587e-05,
"loss": 1.0017,
"num_input_tokens_seen": 184263976,
"step": 340500
},
{
"epoch": 3.4116375860412997,
"grad_norm": 5.732070446014404,
"learning_rate": 1.5883724187610053e-05,
"loss": 1.0124,
"num_input_tokens_seen": 184528864,
"step": 341000
},
{
"epoch": 3.416639987193853,
"grad_norm": 7.936917304992676,
"learning_rate": 1.583370017608452e-05,
"loss": 1.0183,
"num_input_tokens_seen": 184798528,
"step": 341500
},
{
"epoch": 3.4216423883464064,
"grad_norm": 7.584635257720947,
"learning_rate": 1.578367616455899e-05,
"loss": 1.0236,
"num_input_tokens_seen": 185063968,
"step": 342000
},
{
"epoch": 3.4266447894989596,
"grad_norm": 6.110123634338379,
"learning_rate": 1.5733652153033456e-05,
"loss": 1.0113,
"num_input_tokens_seen": 185328760,
"step": 342500
},
{
"epoch": 3.4316471906515127,
"grad_norm": 6.163844585418701,
"learning_rate": 1.5683628141507926e-05,
"loss": 1.0143,
"num_input_tokens_seen": 185601312,
"step": 343000
},
{
"epoch": 3.436649591804066,
"grad_norm": 6.428155899047852,
"learning_rate": 1.5633604129982393e-05,
"loss": 1.0231,
"num_input_tokens_seen": 185868856,
"step": 343500
},
{
"epoch": 3.4416519929566194,
"grad_norm": 4.938517093658447,
"learning_rate": 1.5583580118456863e-05,
"loss": 1.029,
"num_input_tokens_seen": 186141200,
"step": 344000
},
{
"epoch": 3.4466543941091725,
"grad_norm": 6.4214019775390625,
"learning_rate": 1.5533556106931326e-05,
"loss": 1.0117,
"num_input_tokens_seen": 186415152,
"step": 344500
},
{
"epoch": 3.4516567952617256,
"grad_norm": 5.303710460662842,
"learning_rate": 1.5483532095405796e-05,
"loss": 1.0094,
"num_input_tokens_seen": 186690176,
"step": 345000
},
{
"epoch": 3.4566591964142788,
"grad_norm": 4.86320161819458,
"learning_rate": 1.5433508083880262e-05,
"loss": 1.0027,
"num_input_tokens_seen": 186957432,
"step": 345500
},
{
"epoch": 3.461661597566832,
"grad_norm": 4.321279048919678,
"learning_rate": 1.5383484072354732e-05,
"loss": 1.0123,
"num_input_tokens_seen": 187226552,
"step": 346000
},
{
"epoch": 3.4666639987193855,
"grad_norm": 5.8327860832214355,
"learning_rate": 1.53334600608292e-05,
"loss": 1.0057,
"num_input_tokens_seen": 187497776,
"step": 346500
},
{
"epoch": 3.4716663998719386,
"grad_norm": 9.191901206970215,
"learning_rate": 1.528343604930367e-05,
"loss": 1.0008,
"num_input_tokens_seen": 187768112,
"step": 347000
},
{
"epoch": 3.4766688010244917,
"grad_norm": 6.153154373168945,
"learning_rate": 1.5233412037778134e-05,
"loss": 1.0,
"num_input_tokens_seen": 188036912,
"step": 347500
},
{
"epoch": 3.481671202177045,
"grad_norm": 5.405030250549316,
"learning_rate": 1.5183388026252602e-05,
"loss": 1.0119,
"num_input_tokens_seen": 188312952,
"step": 348000
},
{
"epoch": 3.4866736033295984,
"grad_norm": 6.342990398406982,
"learning_rate": 1.513336401472707e-05,
"loss": 1.0101,
"num_input_tokens_seen": 188585776,
"step": 348500
},
{
"epoch": 3.4916760044821515,
"grad_norm": 6.2145867347717285,
"learning_rate": 1.5083340003201538e-05,
"loss": 1.0018,
"num_input_tokens_seen": 188856160,
"step": 349000
},
{
"epoch": 3.4966784056347047,
"grad_norm": 5.479875564575195,
"learning_rate": 1.5033315991676007e-05,
"loss": 1.0092,
"num_input_tokens_seen": 189125536,
"step": 349500
},
{
"epoch": 3.501680806787258,
"grad_norm": 6.38485050201416,
"learning_rate": 1.4983291980150473e-05,
"loss": 1.0074,
"num_input_tokens_seen": 189397856,
"step": 350000
},
{
"epoch": 3.506683207939811,
"grad_norm": 5.203739166259766,
"learning_rate": 1.493326796862494e-05,
"loss": 0.9897,
"num_input_tokens_seen": 189664624,
"step": 350500
},
{
"epoch": 3.5116856090923645,
"grad_norm": 6.554189682006836,
"learning_rate": 1.4883243957099408e-05,
"loss": 1.0222,
"num_input_tokens_seen": 189931544,
"step": 351000
},
{
"epoch": 3.5166880102449176,
"grad_norm": 6.045382022857666,
"learning_rate": 1.4833219945573876e-05,
"loss": 1.0108,
"num_input_tokens_seen": 190201904,
"step": 351500
},
{
"epoch": 3.5216904113974707,
"grad_norm": 5.883347988128662,
"learning_rate": 1.4783195934048344e-05,
"loss": 1.0204,
"num_input_tokens_seen": 190469632,
"step": 352000
},
{
"epoch": 3.526692812550024,
"grad_norm": 5.156943321228027,
"learning_rate": 1.4733171922522813e-05,
"loss": 0.9963,
"num_input_tokens_seen": 190737512,
"step": 352500
},
{
"epoch": 3.5316952137025774,
"grad_norm": 5.740571975708008,
"learning_rate": 1.4683147910997281e-05,
"loss": 1.0157,
"num_input_tokens_seen": 191003792,
"step": 353000
},
{
"epoch": 3.5366976148551306,
"grad_norm": 5.744316101074219,
"learning_rate": 1.463312389947175e-05,
"loss": 1.0182,
"num_input_tokens_seen": 191276360,
"step": 353500
},
{
"epoch": 3.5417000160076837,
"grad_norm": 6.743235111236572,
"learning_rate": 1.4583099887946214e-05,
"loss": 1.0212,
"num_input_tokens_seen": 191547000,
"step": 354000
},
{
"epoch": 3.546702417160237,
"grad_norm": 6.034450531005859,
"learning_rate": 1.4533075876420682e-05,
"loss": 1.0159,
"num_input_tokens_seen": 191816024,
"step": 354500
},
{
"epoch": 3.55170481831279,
"grad_norm": 6.9873833656311035,
"learning_rate": 1.448305186489515e-05,
"loss": 0.9996,
"num_input_tokens_seen": 192078648,
"step": 355000
},
{
"epoch": 3.5567072194653435,
"grad_norm": 4.8513078689575195,
"learning_rate": 1.4433027853369619e-05,
"loss": 1.0022,
"num_input_tokens_seen": 192346960,
"step": 355500
},
{
"epoch": 3.5617096206178966,
"grad_norm": 6.602761268615723,
"learning_rate": 1.4383003841844087e-05,
"loss": 1.0182,
"num_input_tokens_seen": 192617800,
"step": 356000
},
{
"epoch": 3.5667120217704498,
"grad_norm": 5.9454851150512695,
"learning_rate": 1.4332979830318555e-05,
"loss": 1.0032,
"num_input_tokens_seen": 192887048,
"step": 356500
},
{
"epoch": 3.571714422923003,
"grad_norm": 4.868193626403809,
"learning_rate": 1.428295581879302e-05,
"loss": 0.9844,
"num_input_tokens_seen": 193153600,
"step": 357000
},
{
"epoch": 3.5767168240755565,
"grad_norm": 5.1398749351501465,
"learning_rate": 1.4232931807267488e-05,
"loss": 1.0119,
"num_input_tokens_seen": 193428088,
"step": 357500
},
{
"epoch": 3.5817192252281096,
"grad_norm": 5.984772682189941,
"learning_rate": 1.4182907795741957e-05,
"loss": 1.0014,
"num_input_tokens_seen": 193702344,
"step": 358000
},
{
"epoch": 3.5867216263806627,
"grad_norm": 5.537957191467285,
"learning_rate": 1.4132883784216425e-05,
"loss": 1.0158,
"num_input_tokens_seen": 193976088,
"step": 358500
},
{
"epoch": 3.591724027533216,
"grad_norm": 5.605039119720459,
"learning_rate": 1.4082859772690893e-05,
"loss": 1.0175,
"num_input_tokens_seen": 194241888,
"step": 359000
},
{
"epoch": 3.596726428685769,
"grad_norm": 8.43393611907959,
"learning_rate": 1.4032835761165361e-05,
"loss": 1.0001,
"num_input_tokens_seen": 194514168,
"step": 359500
},
{
"epoch": 3.6017288298383225,
"grad_norm": 6.197558403015137,
"learning_rate": 1.3982811749639826e-05,
"loss": 1.0006,
"num_input_tokens_seen": 194788128,
"step": 360000
},
{
"epoch": 3.6067312309908757,
"grad_norm": 6.9046430587768555,
"learning_rate": 1.3932787738114294e-05,
"loss": 1.0163,
"num_input_tokens_seen": 195057368,
"step": 360500
},
{
"epoch": 3.611733632143429,
"grad_norm": 6.350090026855469,
"learning_rate": 1.3882763726588763e-05,
"loss": 0.9951,
"num_input_tokens_seen": 195326896,
"step": 361000
},
{
"epoch": 3.616736033295982,
"grad_norm": 7.581150531768799,
"learning_rate": 1.3832739715063231e-05,
"loss": 1.0004,
"num_input_tokens_seen": 195599832,
"step": 361500
},
{
"epoch": 3.6217384344485355,
"grad_norm": 5.0561017990112305,
"learning_rate": 1.3782715703537699e-05,
"loss": 1.0211,
"num_input_tokens_seen": 195878880,
"step": 362000
},
{
"epoch": 3.6267408356010886,
"grad_norm": 6.046396732330322,
"learning_rate": 1.3732691692012167e-05,
"loss": 1.0113,
"num_input_tokens_seen": 196149072,
"step": 362500
},
{
"epoch": 3.6317432367536417,
"grad_norm": 6.726164817810059,
"learning_rate": 1.3682667680486636e-05,
"loss": 1.0031,
"num_input_tokens_seen": 196417928,
"step": 363000
},
{
"epoch": 3.636745637906195,
"grad_norm": 10.790836334228516,
"learning_rate": 1.36326436689611e-05,
"loss": 1.0076,
"num_input_tokens_seen": 196688920,
"step": 363500
},
{
"epoch": 3.641748039058748,
"grad_norm": 6.029910087585449,
"learning_rate": 1.3582619657435569e-05,
"loss": 1.0027,
"num_input_tokens_seen": 196965448,
"step": 364000
},
{
"epoch": 3.6467504402113016,
"grad_norm": 4.5111308097839355,
"learning_rate": 1.3532595645910037e-05,
"loss": 0.9984,
"num_input_tokens_seen": 197241664,
"step": 364500
},
{
"epoch": 3.6517528413638547,
"grad_norm": 7.830237865447998,
"learning_rate": 1.3482571634384505e-05,
"loss": 1.0109,
"num_input_tokens_seen": 197508048,
"step": 365000
},
{
"epoch": 3.656755242516408,
"grad_norm": 6.430135726928711,
"learning_rate": 1.3432547622858973e-05,
"loss": 1.0041,
"num_input_tokens_seen": 197782528,
"step": 365500
},
{
"epoch": 3.661757643668961,
"grad_norm": 8.756442070007324,
"learning_rate": 1.3382523611333442e-05,
"loss": 0.997,
"num_input_tokens_seen": 198054664,
"step": 366000
},
{
"epoch": 3.6667600448215145,
"grad_norm": 6.538286209106445,
"learning_rate": 1.3332499599807907e-05,
"loss": 1.0085,
"num_input_tokens_seen": 198320680,
"step": 366500
},
{
"epoch": 3.6717624459740676,
"grad_norm": 7.97443151473999,
"learning_rate": 1.3282475588282375e-05,
"loss": 1.0125,
"num_input_tokens_seen": 198590752,
"step": 367000
},
{
"epoch": 3.6767648471266208,
"grad_norm": 5.407761573791504,
"learning_rate": 1.3232451576756843e-05,
"loss": 1.029,
"num_input_tokens_seen": 198861544,
"step": 367500
},
{
"epoch": 3.681767248279174,
"grad_norm": 6.2920355796813965,
"learning_rate": 1.3182427565231311e-05,
"loss": 1.0131,
"num_input_tokens_seen": 199129704,
"step": 368000
},
{
"epoch": 3.686769649431727,
"grad_norm": 8.73907470703125,
"learning_rate": 1.313240355370578e-05,
"loss": 1.0051,
"num_input_tokens_seen": 199397264,
"step": 368500
},
{
"epoch": 3.6917720505842806,
"grad_norm": 6.030662536621094,
"learning_rate": 1.3082379542180248e-05,
"loss": 1.008,
"num_input_tokens_seen": 199664200,
"step": 369000
},
{
"epoch": 3.6967744517368337,
"grad_norm": 6.953051567077637,
"learning_rate": 1.3032355530654716e-05,
"loss": 1.0075,
"num_input_tokens_seen": 199933048,
"step": 369500
},
{
"epoch": 3.701776852889387,
"grad_norm": 6.4026618003845215,
"learning_rate": 1.2982331519129181e-05,
"loss": 0.9982,
"num_input_tokens_seen": 200203568,
"step": 370000
},
{
"epoch": 3.70677925404194,
"grad_norm": 8.120702743530273,
"learning_rate": 1.2932307507603649e-05,
"loss": 1.0093,
"num_input_tokens_seen": 200475136,
"step": 370500
},
{
"epoch": 3.7117816551944935,
"grad_norm": 4.819450855255127,
"learning_rate": 1.2882283496078117e-05,
"loss": 0.9915,
"num_input_tokens_seen": 200751456,
"step": 371000
},
{
"epoch": 3.7167840563470467,
"grad_norm": 6.006054878234863,
"learning_rate": 1.2832259484552586e-05,
"loss": 0.9993,
"num_input_tokens_seen": 201022872,
"step": 371500
},
{
"epoch": 3.7217864574996,
"grad_norm": 5.173740386962891,
"learning_rate": 1.2782235473027054e-05,
"loss": 1.0043,
"num_input_tokens_seen": 201293168,
"step": 372000
},
{
"epoch": 3.726788858652153,
"grad_norm": 4.628252983093262,
"learning_rate": 1.2732211461501522e-05,
"loss": 1.0047,
"num_input_tokens_seen": 201566544,
"step": 372500
},
{
"epoch": 3.731791259804706,
"grad_norm": 5.305530548095703,
"learning_rate": 1.2682187449975989e-05,
"loss": 1.0084,
"num_input_tokens_seen": 201835288,
"step": 373000
},
{
"epoch": 3.7367936609572596,
"grad_norm": 4.96281623840332,
"learning_rate": 1.2632163438450457e-05,
"loss": 1.009,
"num_input_tokens_seen": 202106240,
"step": 373500
},
{
"epoch": 3.7417960621098127,
"grad_norm": 5.782063007354736,
"learning_rate": 1.2582139426924925e-05,
"loss": 0.9932,
"num_input_tokens_seen": 202380080,
"step": 374000
},
{
"epoch": 3.746798463262366,
"grad_norm": 5.399883270263672,
"learning_rate": 1.2532115415399392e-05,
"loss": 1.0138,
"num_input_tokens_seen": 202647520,
"step": 374500
},
{
"epoch": 3.751800864414919,
"grad_norm": 5.419944763183594,
"learning_rate": 1.248209140387386e-05,
"loss": 0.9949,
"num_input_tokens_seen": 202913024,
"step": 375000
},
{
"epoch": 3.7568032655674726,
"grad_norm": 5.332630157470703,
"learning_rate": 1.2432067392348328e-05,
"loss": 1.0018,
"num_input_tokens_seen": 203175136,
"step": 375500
},
{
"epoch": 3.7618056667200257,
"grad_norm": 5.563337802886963,
"learning_rate": 1.2382043380822796e-05,
"loss": 1.0257,
"num_input_tokens_seen": 203444624,
"step": 376000
},
{
"epoch": 3.766808067872579,
"grad_norm": 7.322454452514648,
"learning_rate": 1.2332019369297265e-05,
"loss": 1.0097,
"num_input_tokens_seen": 203721152,
"step": 376500
},
{
"epoch": 3.771810469025132,
"grad_norm": 5.674718379974365,
"learning_rate": 1.2281995357771731e-05,
"loss": 0.9849,
"num_input_tokens_seen": 203993696,
"step": 377000
},
{
"epoch": 3.776812870177685,
"grad_norm": 6.736847877502441,
"learning_rate": 1.22319713462462e-05,
"loss": 0.9913,
"num_input_tokens_seen": 204263944,
"step": 377500
},
{
"epoch": 3.7818152713302386,
"grad_norm": 6.920697212219238,
"learning_rate": 1.2181947334720668e-05,
"loss": 0.9984,
"num_input_tokens_seen": 204534952,
"step": 378000
},
{
"epoch": 3.7868176724827918,
"grad_norm": 6.516974449157715,
"learning_rate": 1.2131923323195134e-05,
"loss": 1.01,
"num_input_tokens_seen": 204808160,
"step": 378500
},
{
"epoch": 3.791820073635345,
"grad_norm": 5.656439781188965,
"learning_rate": 1.2081899311669602e-05,
"loss": 1.0013,
"num_input_tokens_seen": 205082712,
"step": 379000
},
{
"epoch": 3.796822474787898,
"grad_norm": 4.886724472045898,
"learning_rate": 1.203187530014407e-05,
"loss": 1.011,
"num_input_tokens_seen": 205354224,
"step": 379500
},
{
"epoch": 3.8018248759404516,
"grad_norm": 8.127188682556152,
"learning_rate": 1.1981851288618537e-05,
"loss": 1.0004,
"num_input_tokens_seen": 205626120,
"step": 380000
},
{
"epoch": 3.8068272770930047,
"grad_norm": 7.59630823135376,
"learning_rate": 1.1931827277093005e-05,
"loss": 1.0078,
"num_input_tokens_seen": 205895872,
"step": 380500
},
{
"epoch": 3.811829678245558,
"grad_norm": 5.615649700164795,
"learning_rate": 1.1881803265567474e-05,
"loss": 0.9887,
"num_input_tokens_seen": 206161088,
"step": 381000
},
{
"epoch": 3.816832079398111,
"grad_norm": 5.560026168823242,
"learning_rate": 1.183177925404194e-05,
"loss": 0.9986,
"num_input_tokens_seen": 206434032,
"step": 381500
},
{
"epoch": 3.821834480550664,
"grad_norm": 4.2973480224609375,
"learning_rate": 1.1781755242516409e-05,
"loss": 0.9843,
"num_input_tokens_seen": 206711936,
"step": 382000
},
{
"epoch": 3.8268368817032177,
"grad_norm": 4.312121391296387,
"learning_rate": 1.1731731230990877e-05,
"loss": 0.995,
"num_input_tokens_seen": 206984224,
"step": 382500
},
{
"epoch": 3.831839282855771,
"grad_norm": 5.747461795806885,
"learning_rate": 1.1681707219465345e-05,
"loss": 1.0087,
"num_input_tokens_seen": 207254056,
"step": 383000
},
{
"epoch": 3.836841684008324,
"grad_norm": 5.281491279602051,
"learning_rate": 1.1631683207939812e-05,
"loss": 1.0027,
"num_input_tokens_seen": 207532144,
"step": 383500
},
{
"epoch": 3.8418440851608775,
"grad_norm": 6.745446681976318,
"learning_rate": 1.158165919641428e-05,
"loss": 1.0079,
"num_input_tokens_seen": 207801576,
"step": 384000
},
{
"epoch": 3.8468464863134306,
"grad_norm": 6.459105968475342,
"learning_rate": 1.1531635184888748e-05,
"loss": 1.0094,
"num_input_tokens_seen": 208065512,
"step": 384500
},
{
"epoch": 3.8518488874659838,
"grad_norm": 5.599144458770752,
"learning_rate": 1.1481611173363215e-05,
"loss": 0.9959,
"num_input_tokens_seen": 208339576,
"step": 385000
},
{
"epoch": 3.856851288618537,
"grad_norm": 7.585973739624023,
"learning_rate": 1.1431587161837683e-05,
"loss": 1.0037,
"num_input_tokens_seen": 208617584,
"step": 385500
},
{
"epoch": 3.86185368977109,
"grad_norm": 6.892619609832764,
"learning_rate": 1.1381563150312151e-05,
"loss": 0.9997,
"num_input_tokens_seen": 208880256,
"step": 386000
},
{
"epoch": 3.866856090923643,
"grad_norm": 7.016179084777832,
"learning_rate": 1.1331539138786618e-05,
"loss": 1.0165,
"num_input_tokens_seen": 209152592,
"step": 386500
},
{
"epoch": 3.8718584920761967,
"grad_norm": 7.874701976776123,
"learning_rate": 1.1281515127261086e-05,
"loss": 0.9907,
"num_input_tokens_seen": 209423248,
"step": 387000
},
{
"epoch": 3.87686089322875,
"grad_norm": 5.5261030197143555,
"learning_rate": 1.1231491115735554e-05,
"loss": 1.0029,
"num_input_tokens_seen": 209692384,
"step": 387500
},
{
"epoch": 3.881863294381303,
"grad_norm": 8.382930755615234,
"learning_rate": 1.118146710421002e-05,
"loss": 0.9865,
"num_input_tokens_seen": 209961656,
"step": 388000
},
{
"epoch": 3.8868656955338565,
"grad_norm": 4.629281044006348,
"learning_rate": 1.1131443092684489e-05,
"loss": 0.995,
"num_input_tokens_seen": 210227928,
"step": 388500
},
{
"epoch": 3.8918680966864097,
"grad_norm": 4.65085506439209,
"learning_rate": 1.1081419081158957e-05,
"loss": 0.9815,
"num_input_tokens_seen": 210494672,
"step": 389000
},
{
"epoch": 3.8968704978389628,
"grad_norm": 5.350659370422363,
"learning_rate": 1.1031395069633424e-05,
"loss": 0.9988,
"num_input_tokens_seen": 210757368,
"step": 389500
},
{
"epoch": 3.901872898991516,
"grad_norm": 6.074803829193115,
"learning_rate": 1.0981371058107892e-05,
"loss": 0.9996,
"num_input_tokens_seen": 211028544,
"step": 390000
},
{
"epoch": 3.906875300144069,
"grad_norm": 5.179644584655762,
"learning_rate": 1.093134704658236e-05,
"loss": 0.99,
"num_input_tokens_seen": 211297312,
"step": 390500
},
{
"epoch": 3.911877701296622,
"grad_norm": 6.687560081481934,
"learning_rate": 1.0881323035056828e-05,
"loss": 1.0047,
"num_input_tokens_seen": 211563536,
"step": 391000
},
{
"epoch": 3.9168801024491757,
"grad_norm": 5.4230570793151855,
"learning_rate": 1.0831299023531295e-05,
"loss": 1.0096,
"num_input_tokens_seen": 211837976,
"step": 391500
},
{
"epoch": 3.921882503601729,
"grad_norm": 5.730657577514648,
"learning_rate": 1.0781275012005763e-05,
"loss": 1.0044,
"num_input_tokens_seen": 212105824,
"step": 392000
},
{
"epoch": 3.926884904754282,
"grad_norm": 6.501159191131592,
"learning_rate": 1.0731251000480231e-05,
"loss": 0.9933,
"num_input_tokens_seen": 212377128,
"step": 392500
},
{
"epoch": 3.9318873059068356,
"grad_norm": 6.957518100738525,
"learning_rate": 1.0681226988954698e-05,
"loss": 0.9987,
"num_input_tokens_seen": 212653848,
"step": 393000
},
{
"epoch": 3.9368897070593887,
"grad_norm": 6.272824287414551,
"learning_rate": 1.0631202977429166e-05,
"loss": 0.9901,
"num_input_tokens_seen": 212924520,
"step": 393500
},
{
"epoch": 3.941892108211942,
"grad_norm": 7.048558712005615,
"learning_rate": 1.0581178965903634e-05,
"loss": 0.9943,
"num_input_tokens_seen": 213198256,
"step": 394000
},
{
"epoch": 3.946894509364495,
"grad_norm": 6.269680976867676,
"learning_rate": 1.0531154954378101e-05,
"loss": 0.9846,
"num_input_tokens_seen": 213471024,
"step": 394500
},
{
"epoch": 3.951896910517048,
"grad_norm": 5.69096565246582,
"learning_rate": 1.048113094285257e-05,
"loss": 0.9993,
"num_input_tokens_seen": 213740224,
"step": 395000
},
{
"epoch": 3.956899311669601,
"grad_norm": 6.711835861206055,
"learning_rate": 1.0431106931327038e-05,
"loss": 0.9994,
"num_input_tokens_seen": 214004440,
"step": 395500
},
{
"epoch": 3.9619017128221548,
"grad_norm": 9.549476623535156,
"learning_rate": 1.0381082919801504e-05,
"loss": 0.9853,
"num_input_tokens_seen": 214275152,
"step": 396000
},
{
"epoch": 3.966904113974708,
"grad_norm": 6.297016620635986,
"learning_rate": 1.0331058908275972e-05,
"loss": 0.9931,
"num_input_tokens_seen": 214549672,
"step": 396500
},
{
"epoch": 3.971906515127261,
"grad_norm": 5.232683181762695,
"learning_rate": 1.028103489675044e-05,
"loss": 0.9943,
"num_input_tokens_seen": 214823176,
"step": 397000
},
{
"epoch": 3.9769089162798146,
"grad_norm": 6.6180219650268555,
"learning_rate": 1.0231010885224907e-05,
"loss": 0.9965,
"num_input_tokens_seen": 215092304,
"step": 397500
},
{
"epoch": 3.9819113174323677,
"grad_norm": 6.272809982299805,
"learning_rate": 1.0180986873699375e-05,
"loss": 0.9773,
"num_input_tokens_seen": 215364504,
"step": 398000
},
{
"epoch": 3.986913718584921,
"grad_norm": 6.778554916381836,
"learning_rate": 1.0130962862173844e-05,
"loss": 1.0018,
"num_input_tokens_seen": 215636456,
"step": 398500
},
{
"epoch": 3.991916119737474,
"grad_norm": 5.8704071044921875,
"learning_rate": 1.0080938850648312e-05,
"loss": 0.9982,
"num_input_tokens_seen": 215902264,
"step": 399000
},
{
"epoch": 3.996918520890027,
"grad_norm": 6.430477142333984,
"learning_rate": 1.0030914839122778e-05,
"loss": 0.9897,
"num_input_tokens_seen": 216171688,
"step": 399500
},
{
"epoch": 4.0,
"eval_loss": 1.0106587409973145,
"eval_runtime": 189.1748,
"eval_samples_per_second": 1056.721,
"eval_steps_per_second": 132.095,
"num_input_tokens_seen": 216341560,
"step": 399808
},
{
"epoch": 4.00192092204258,
"grad_norm": 6.6500749588012695,
"learning_rate": 9.980890827597247e-06,
"loss": 0.9608,
"num_input_tokens_seen": 216448456,
"step": 400000
},
{
"epoch": 4.006923323195133,
"grad_norm": 5.360565662384033,
"learning_rate": 9.930866816071715e-06,
"loss": 0.9449,
"num_input_tokens_seen": 216721080,
"step": 400500
},
{
"epoch": 4.0119257243476865,
"grad_norm": 9.556127548217773,
"learning_rate": 9.880842804546183e-06,
"loss": 0.9209,
"num_input_tokens_seen": 216986544,
"step": 401000
},
{
"epoch": 4.0169281255002405,
"grad_norm": 5.1922502517700195,
"learning_rate": 9.830818793020651e-06,
"loss": 0.9175,
"num_input_tokens_seen": 217262208,
"step": 401500
},
{
"epoch": 4.021930526652794,
"grad_norm": 6.372469902038574,
"learning_rate": 9.780794781495118e-06,
"loss": 0.9079,
"num_input_tokens_seen": 217532376,
"step": 402000
},
{
"epoch": 4.026932927805347,
"grad_norm": 5.694700717926025,
"learning_rate": 9.730770769969586e-06,
"loss": 0.9205,
"num_input_tokens_seen": 217800456,
"step": 402500
},
{
"epoch": 4.0319353289579,
"grad_norm": 7.39500617980957,
"learning_rate": 9.680746758444054e-06,
"loss": 0.9262,
"num_input_tokens_seen": 218065808,
"step": 403000
},
{
"epoch": 4.036937730110453,
"grad_norm": 5.675652980804443,
"learning_rate": 9.630722746918523e-06,
"loss": 0.9128,
"num_input_tokens_seen": 218330520,
"step": 403500
},
{
"epoch": 4.041940131263006,
"grad_norm": 4.7625203132629395,
"learning_rate": 9.58069873539299e-06,
"loss": 0.9438,
"num_input_tokens_seen": 218603792,
"step": 404000
},
{
"epoch": 4.046942532415559,
"grad_norm": 5.834499835968018,
"learning_rate": 9.530674723867457e-06,
"loss": 0.9191,
"num_input_tokens_seen": 218870800,
"step": 404500
},
{
"epoch": 4.051944933568112,
"grad_norm": 7.351385116577148,
"learning_rate": 9.480650712341926e-06,
"loss": 0.9257,
"num_input_tokens_seen": 219142384,
"step": 405000
},
{
"epoch": 4.0569473347206655,
"grad_norm": 7.003774166107178,
"learning_rate": 9.430626700816392e-06,
"loss": 0.935,
"num_input_tokens_seen": 219404752,
"step": 405500
},
{
"epoch": 4.0619497358732195,
"grad_norm": 6.0088019371032715,
"learning_rate": 9.38060268929086e-06,
"loss": 0.9307,
"num_input_tokens_seen": 219675200,
"step": 406000
},
{
"epoch": 4.066952137025773,
"grad_norm": 6.115697383880615,
"learning_rate": 9.330578677765329e-06,
"loss": 0.9281,
"num_input_tokens_seen": 219950728,
"step": 406500
},
{
"epoch": 4.071954538178326,
"grad_norm": 5.1921868324279785,
"learning_rate": 9.280554666239795e-06,
"loss": 0.9391,
"num_input_tokens_seen": 220221608,
"step": 407000
},
{
"epoch": 4.076956939330879,
"grad_norm": 6.665195465087891,
"learning_rate": 9.230530654714263e-06,
"loss": 0.9296,
"num_input_tokens_seen": 220497552,
"step": 407500
},
{
"epoch": 4.081959340483432,
"grad_norm": 6.402169704437256,
"learning_rate": 9.180506643188732e-06,
"loss": 0.9199,
"num_input_tokens_seen": 220764032,
"step": 408000
},
{
"epoch": 4.086961741635985,
"grad_norm": 5.718327045440674,
"learning_rate": 9.1304826316632e-06,
"loss": 0.927,
"num_input_tokens_seen": 221033992,
"step": 408500
},
{
"epoch": 4.091964142788538,
"grad_norm": 6.588100433349609,
"learning_rate": 9.080458620137667e-06,
"loss": 0.9374,
"num_input_tokens_seen": 221308640,
"step": 409000
},
{
"epoch": 4.096966543941091,
"grad_norm": 5.30639123916626,
"learning_rate": 9.030434608612135e-06,
"loss": 0.9251,
"num_input_tokens_seen": 221581024,
"step": 409500
},
{
"epoch": 4.1019689450936445,
"grad_norm": 4.8298821449279785,
"learning_rate": 8.980410597086603e-06,
"loss": 0.9137,
"num_input_tokens_seen": 221842152,
"step": 410000
},
{
"epoch": 4.1069713462461985,
"grad_norm": 5.2428483963012695,
"learning_rate": 8.93038658556107e-06,
"loss": 0.9333,
"num_input_tokens_seen": 222117312,
"step": 410500
},
{
"epoch": 4.111973747398752,
"grad_norm": 7.0200114250183105,
"learning_rate": 8.880362574035538e-06,
"loss": 0.914,
"num_input_tokens_seen": 222386728,
"step": 411000
},
{
"epoch": 4.116976148551305,
"grad_norm": 5.761682510375977,
"learning_rate": 8.830338562510006e-06,
"loss": 0.9236,
"num_input_tokens_seen": 222656968,
"step": 411500
},
{
"epoch": 4.121978549703858,
"grad_norm": 5.959192752838135,
"learning_rate": 8.780314550984473e-06,
"loss": 0.9383,
"num_input_tokens_seen": 222931472,
"step": 412000
},
{
"epoch": 4.126980950856411,
"grad_norm": 7.374218940734863,
"learning_rate": 8.73029053945894e-06,
"loss": 0.918,
"num_input_tokens_seen": 223197176,
"step": 412500
},
{
"epoch": 4.131983352008964,
"grad_norm": 5.401734352111816,
"learning_rate": 8.680266527933409e-06,
"loss": 0.9134,
"num_input_tokens_seen": 223473808,
"step": 413000
},
{
"epoch": 4.136985753161517,
"grad_norm": 5.810543537139893,
"learning_rate": 8.630242516407876e-06,
"loss": 0.9219,
"num_input_tokens_seen": 223749560,
"step": 413500
},
{
"epoch": 4.14198815431407,
"grad_norm": 6.911441326141357,
"learning_rate": 8.580218504882344e-06,
"loss": 0.9371,
"num_input_tokens_seen": 224026048,
"step": 414000
},
{
"epoch": 4.146990555466624,
"grad_norm": 5.81462287902832,
"learning_rate": 8.530194493356812e-06,
"loss": 0.922,
"num_input_tokens_seen": 224296456,
"step": 414500
},
{
"epoch": 4.1519929566191776,
"grad_norm": 6.728647232055664,
"learning_rate": 8.480170481831279e-06,
"loss": 0.9261,
"num_input_tokens_seen": 224566384,
"step": 415000
},
{
"epoch": 4.156995357771731,
"grad_norm": 6.1371564865112305,
"learning_rate": 8.430146470305747e-06,
"loss": 0.9286,
"num_input_tokens_seen": 224841952,
"step": 415500
},
{
"epoch": 4.161997758924284,
"grad_norm": 7.179012775421143,
"learning_rate": 8.380122458780215e-06,
"loss": 0.9196,
"num_input_tokens_seen": 225112416,
"step": 416000
},
{
"epoch": 4.167000160076837,
"grad_norm": 6.620611667633057,
"learning_rate": 8.330098447254682e-06,
"loss": 0.9358,
"num_input_tokens_seen": 225389344,
"step": 416500
},
{
"epoch": 4.17200256122939,
"grad_norm": 6.2398905754089355,
"learning_rate": 8.28007443572915e-06,
"loss": 0.9178,
"num_input_tokens_seen": 225663376,
"step": 417000
},
{
"epoch": 4.177004962381943,
"grad_norm": 6.071476936340332,
"learning_rate": 8.230050424203618e-06,
"loss": 0.9319,
"num_input_tokens_seen": 225931480,
"step": 417500
},
{
"epoch": 4.182007363534496,
"grad_norm": 5.602684497833252,
"learning_rate": 8.180026412678086e-06,
"loss": 0.9326,
"num_input_tokens_seen": 226198008,
"step": 418000
},
{
"epoch": 4.187009764687049,
"grad_norm": 6.832788944244385,
"learning_rate": 8.130002401152553e-06,
"loss": 0.929,
"num_input_tokens_seen": 226466280,
"step": 418500
},
{
"epoch": 4.1920121658396035,
"grad_norm": 6.3188862800598145,
"learning_rate": 8.079978389627021e-06,
"loss": 0.94,
"num_input_tokens_seen": 226741632,
"step": 419000
},
{
"epoch": 4.197014566992157,
"grad_norm": 5.0520548820495605,
"learning_rate": 8.02995437810149e-06,
"loss": 0.9327,
"num_input_tokens_seen": 227006288,
"step": 419500
},
{
"epoch": 4.20201696814471,
"grad_norm": 6.5376410484313965,
"learning_rate": 7.979930366575956e-06,
"loss": 0.9042,
"num_input_tokens_seen": 227281352,
"step": 420000
},
{
"epoch": 4.207019369297263,
"grad_norm": 5.164760112762451,
"learning_rate": 7.929906355050424e-06,
"loss": 0.9231,
"num_input_tokens_seen": 227542968,
"step": 420500
},
{
"epoch": 4.212021770449816,
"grad_norm": 9.208584785461426,
"learning_rate": 7.879882343524892e-06,
"loss": 0.9333,
"num_input_tokens_seen": 227813704,
"step": 421000
},
{
"epoch": 4.217024171602369,
"grad_norm": 5.241026878356934,
"learning_rate": 7.829858331999359e-06,
"loss": 0.9278,
"num_input_tokens_seen": 228083120,
"step": 421500
},
{
"epoch": 4.222026572754922,
"grad_norm": 6.501145839691162,
"learning_rate": 7.779834320473827e-06,
"loss": 0.9219,
"num_input_tokens_seen": 228352616,
"step": 422000
},
{
"epoch": 4.227028973907475,
"grad_norm": 4.962836742401123,
"learning_rate": 7.729810308948296e-06,
"loss": 0.9176,
"num_input_tokens_seen": 228615376,
"step": 422500
},
{
"epoch": 4.2320313750600285,
"grad_norm": 5.714748859405518,
"learning_rate": 7.679786297422762e-06,
"loss": 0.9459,
"num_input_tokens_seen": 228890544,
"step": 423000
},
{
"epoch": 4.2370337762125825,
"grad_norm": 5.394798755645752,
"learning_rate": 7.62976228589723e-06,
"loss": 0.9305,
"num_input_tokens_seen": 229158352,
"step": 423500
},
{
"epoch": 4.242036177365136,
"grad_norm": 7.450530529022217,
"learning_rate": 7.579738274371699e-06,
"loss": 0.9342,
"num_input_tokens_seen": 229430896,
"step": 424000
},
{
"epoch": 4.247038578517689,
"grad_norm": 6.761574745178223,
"learning_rate": 7.529714262846166e-06,
"loss": 0.9324,
"num_input_tokens_seen": 229692632,
"step": 424500
},
{
"epoch": 4.252040979670242,
"grad_norm": 6.781697750091553,
"learning_rate": 7.479690251320634e-06,
"loss": 0.9299,
"num_input_tokens_seen": 229963336,
"step": 425000
},
{
"epoch": 4.257043380822795,
"grad_norm": 6.029842376708984,
"learning_rate": 7.4296662397951024e-06,
"loss": 0.9411,
"num_input_tokens_seen": 230235128,
"step": 425500
},
{
"epoch": 4.262045781975348,
"grad_norm": 6.796103477478027,
"learning_rate": 7.379642228269571e-06,
"loss": 0.9119,
"num_input_tokens_seen": 230508152,
"step": 426000
},
{
"epoch": 4.267048183127901,
"grad_norm": 5.397275447845459,
"learning_rate": 7.329618216744037e-06,
"loss": 0.9287,
"num_input_tokens_seen": 230778056,
"step": 426500
},
{
"epoch": 4.272050584280454,
"grad_norm": 6.785423755645752,
"learning_rate": 7.2795942052185055e-06,
"loss": 0.9249,
"num_input_tokens_seen": 231050672,
"step": 427000
},
{
"epoch": 4.2770529854330075,
"grad_norm": 6.21229362487793,
"learning_rate": 7.229570193692974e-06,
"loss": 0.9293,
"num_input_tokens_seen": 231322192,
"step": 427500
},
{
"epoch": 4.2820553865855615,
"grad_norm": 5.904947757720947,
"learning_rate": 7.17954618216744e-06,
"loss": 0.9361,
"num_input_tokens_seen": 231593960,
"step": 428000
},
{
"epoch": 4.287057787738115,
"grad_norm": 4.485384464263916,
"learning_rate": 7.1295221706419085e-06,
"loss": 0.9359,
"num_input_tokens_seen": 231857176,
"step": 428500
},
{
"epoch": 4.292060188890668,
"grad_norm": 5.395241737365723,
"learning_rate": 7.079498159116377e-06,
"loss": 0.9382,
"num_input_tokens_seen": 232135880,
"step": 429000
},
{
"epoch": 4.297062590043221,
"grad_norm": 6.106403827667236,
"learning_rate": 7.029474147590843e-06,
"loss": 0.9266,
"num_input_tokens_seen": 232409736,
"step": 429500
},
{
"epoch": 4.302064991195774,
"grad_norm": 5.79823637008667,
"learning_rate": 6.9794501360653115e-06,
"loss": 0.9083,
"num_input_tokens_seen": 232681632,
"step": 430000
},
{
"epoch": 4.307067392348327,
"grad_norm": 5.688789367675781,
"learning_rate": 6.92942612453978e-06,
"loss": 0.9145,
"num_input_tokens_seen": 232953312,
"step": 430500
},
{
"epoch": 4.31206979350088,
"grad_norm": 5.097667694091797,
"learning_rate": 6.879402113014246e-06,
"loss": 0.9459,
"num_input_tokens_seen": 233217560,
"step": 431000
},
{
"epoch": 4.317072194653433,
"grad_norm": 5.180954456329346,
"learning_rate": 6.829378101488715e-06,
"loss": 0.9302,
"num_input_tokens_seen": 233491872,
"step": 431500
},
{
"epoch": 4.3220745958059865,
"grad_norm": 5.185079574584961,
"learning_rate": 6.779354089963183e-06,
"loss": 0.9304,
"num_input_tokens_seen": 233757296,
"step": 432000
},
{
"epoch": 4.3270769969585405,
"grad_norm": 5.8646464347839355,
"learning_rate": 6.729330078437649e-06,
"loss": 0.917,
"num_input_tokens_seen": 234033848,
"step": 432500
},
{
"epoch": 4.332079398111094,
"grad_norm": 4.718979358673096,
"learning_rate": 6.679306066912118e-06,
"loss": 0.9355,
"num_input_tokens_seen": 234301952,
"step": 433000
},
{
"epoch": 4.337081799263647,
"grad_norm": 5.194594383239746,
"learning_rate": 6.629282055386586e-06,
"loss": 0.9178,
"num_input_tokens_seen": 234570432,
"step": 433500
},
{
"epoch": 4.3420842004162,
"grad_norm": 6.157474994659424,
"learning_rate": 6.579258043861053e-06,
"loss": 0.9306,
"num_input_tokens_seen": 234848568,
"step": 434000
},
{
"epoch": 4.347086601568753,
"grad_norm": 5.017276763916016,
"learning_rate": 6.5292340323355215e-06,
"loss": 0.9266,
"num_input_tokens_seen": 235120480,
"step": 434500
},
{
"epoch": 4.352089002721306,
"grad_norm": 6.485071659088135,
"learning_rate": 6.479210020809989e-06,
"loss": 0.9245,
"num_input_tokens_seen": 235385120,
"step": 435000
},
{
"epoch": 4.357091403873859,
"grad_norm": 6.405189514160156,
"learning_rate": 6.429186009284457e-06,
"loss": 0.9148,
"num_input_tokens_seen": 235653424,
"step": 435500
},
{
"epoch": 4.362093805026412,
"grad_norm": 7.216737747192383,
"learning_rate": 6.3791619977589245e-06,
"loss": 0.9171,
"num_input_tokens_seen": 235927792,
"step": 436000
},
{
"epoch": 4.3670962061789655,
"grad_norm": 5.484450340270996,
"learning_rate": 6.329137986233393e-06,
"loss": 0.9253,
"num_input_tokens_seen": 236198512,
"step": 436500
},
{
"epoch": 4.37209860733152,
"grad_norm": 5.4462971687316895,
"learning_rate": 6.279113974707861e-06,
"loss": 0.9052,
"num_input_tokens_seen": 236470920,
"step": 437000
},
{
"epoch": 4.377101008484073,
"grad_norm": 6.061979293823242,
"learning_rate": 6.229089963182328e-06,
"loss": 0.9344,
"num_input_tokens_seen": 236743128,
"step": 437500
},
{
"epoch": 4.382103409636626,
"grad_norm": 7.572735786437988,
"learning_rate": 6.179065951656796e-06,
"loss": 0.9262,
"num_input_tokens_seen": 237019048,
"step": 438000
},
{
"epoch": 4.387105810789179,
"grad_norm": 5.743752956390381,
"learning_rate": 6.129041940131263e-06,
"loss": 0.9256,
"num_input_tokens_seen": 237287696,
"step": 438500
},
{
"epoch": 4.392108211941732,
"grad_norm": 7.014731407165527,
"learning_rate": 6.0790179286057314e-06,
"loss": 0.9242,
"num_input_tokens_seen": 237552080,
"step": 439000
},
{
"epoch": 4.397110613094285,
"grad_norm": 7.045165061950684,
"learning_rate": 6.028993917080199e-06,
"loss": 0.9256,
"num_input_tokens_seen": 237822224,
"step": 439500
},
{
"epoch": 4.402113014246838,
"grad_norm": 9.347033500671387,
"learning_rate": 5.978969905554666e-06,
"loss": 0.9339,
"num_input_tokens_seen": 238090088,
"step": 440000
},
{
"epoch": 4.4071154153993914,
"grad_norm": 5.694771766662598,
"learning_rate": 5.9289458940291345e-06,
"loss": 0.9289,
"num_input_tokens_seen": 238362352,
"step": 440500
},
{
"epoch": 4.412117816551945,
"grad_norm": 6.164538383483887,
"learning_rate": 5.878921882503602e-06,
"loss": 0.9315,
"num_input_tokens_seen": 238629408,
"step": 441000
},
{
"epoch": 4.417120217704499,
"grad_norm": 6.145501136779785,
"learning_rate": 5.82889787097807e-06,
"loss": 0.9265,
"num_input_tokens_seen": 238897488,
"step": 441500
},
{
"epoch": 4.422122618857052,
"grad_norm": 8.055036544799805,
"learning_rate": 5.7788738594525375e-06,
"loss": 0.9459,
"num_input_tokens_seen": 239173024,
"step": 442000
},
{
"epoch": 4.427125020009605,
"grad_norm": 6.102746963500977,
"learning_rate": 5.728849847927005e-06,
"loss": 0.9413,
"num_input_tokens_seen": 239441080,
"step": 442500
},
{
"epoch": 4.432127421162158,
"grad_norm": 5.538390159606934,
"learning_rate": 5.678825836401473e-06,
"loss": 0.9356,
"num_input_tokens_seen": 239711624,
"step": 443000
},
{
"epoch": 4.437129822314711,
"grad_norm": 6.128843307495117,
"learning_rate": 5.6288018248759405e-06,
"loss": 0.9201,
"num_input_tokens_seen": 239985480,
"step": 443500
},
{
"epoch": 4.442132223467264,
"grad_norm": 6.007194995880127,
"learning_rate": 5.578777813350408e-06,
"loss": 0.9176,
"num_input_tokens_seen": 240250312,
"step": 444000
},
{
"epoch": 4.447134624619817,
"grad_norm": 8.6255521774292,
"learning_rate": 5.528753801824876e-06,
"loss": 0.9216,
"num_input_tokens_seen": 240515680,
"step": 444500
},
{
"epoch": 4.4521370257723705,
"grad_norm": 6.955540180206299,
"learning_rate": 5.478729790299344e-06,
"loss": 0.9196,
"num_input_tokens_seen": 240785288,
"step": 445000
},
{
"epoch": 4.457139426924924,
"grad_norm": 7.786896228790283,
"learning_rate": 5.428705778773812e-06,
"loss": 0.9337,
"num_input_tokens_seen": 241060592,
"step": 445500
},
{
"epoch": 4.462141828077478,
"grad_norm": 5.2199482917785645,
"learning_rate": 5.378681767248279e-06,
"loss": 0.9356,
"num_input_tokens_seen": 241326064,
"step": 446000
},
{
"epoch": 4.467144229230031,
"grad_norm": 4.975681781768799,
"learning_rate": 5.328657755722747e-06,
"loss": 0.911,
"num_input_tokens_seen": 241596704,
"step": 446500
},
{
"epoch": 4.472146630382584,
"grad_norm": 4.91240119934082,
"learning_rate": 5.278633744197215e-06,
"loss": 0.9276,
"num_input_tokens_seen": 241870056,
"step": 447000
},
{
"epoch": 4.477149031535137,
"grad_norm": 7.170393466949463,
"learning_rate": 5.228609732671682e-06,
"loss": 0.9212,
"num_input_tokens_seen": 242140224,
"step": 447500
},
{
"epoch": 4.48215143268769,
"grad_norm": 5.982038497924805,
"learning_rate": 5.1785857211461505e-06,
"loss": 0.9112,
"num_input_tokens_seen": 242405736,
"step": 448000
},
{
"epoch": 4.487153833840243,
"grad_norm": 6.960501670837402,
"learning_rate": 5.128561709620618e-06,
"loss": 0.9284,
"num_input_tokens_seen": 242672696,
"step": 448500
},
{
"epoch": 4.492156234992796,
"grad_norm": 5.97189474105835,
"learning_rate": 5.078537698095086e-06,
"loss": 0.9304,
"num_input_tokens_seen": 242944136,
"step": 449000
},
{
"epoch": 4.4971586361453495,
"grad_norm": 5.7404704093933105,
"learning_rate": 5.028513686569554e-06,
"loss": 0.9166,
"num_input_tokens_seen": 243217160,
"step": 449500
},
{
"epoch": 4.502161037297903,
"grad_norm": 14.095989227294922,
"learning_rate": 4.978489675044022e-06,
"loss": 0.9364,
"num_input_tokens_seen": 243492064,
"step": 450000
},
{
"epoch": 4.507163438450457,
"grad_norm": 6.327740669250488,
"learning_rate": 4.928465663518489e-06,
"loss": 0.9334,
"num_input_tokens_seen": 243765824,
"step": 450500
},
{
"epoch": 4.51216583960301,
"grad_norm": 5.019825458526611,
"learning_rate": 4.878441651992957e-06,
"loss": 0.9183,
"num_input_tokens_seen": 244045424,
"step": 451000
},
{
"epoch": 4.517168240755563,
"grad_norm": 5.382749557495117,
"learning_rate": 4.828417640467425e-06,
"loss": 0.9183,
"num_input_tokens_seen": 244317544,
"step": 451500
},
{
"epoch": 4.522170641908116,
"grad_norm": 6.0456461906433105,
"learning_rate": 4.778393628941892e-06,
"loss": 0.9176,
"num_input_tokens_seen": 244587752,
"step": 452000
},
{
"epoch": 4.527173043060669,
"grad_norm": 6.2013983726501465,
"learning_rate": 4.7283696174163604e-06,
"loss": 0.9253,
"num_input_tokens_seen": 244852400,
"step": 452500
},
{
"epoch": 4.532175444213222,
"grad_norm": 5.575494766235352,
"learning_rate": 4.678345605890828e-06,
"loss": 0.9315,
"num_input_tokens_seen": 245130392,
"step": 453000
},
{
"epoch": 4.537177845365775,
"grad_norm": 6.855820178985596,
"learning_rate": 4.628321594365295e-06,
"loss": 0.9255,
"num_input_tokens_seen": 245393352,
"step": 453500
},
{
"epoch": 4.5421802465183285,
"grad_norm": 5.4364728927612305,
"learning_rate": 4.5782975828397635e-06,
"loss": 0.9327,
"num_input_tokens_seen": 245667248,
"step": 454000
},
{
"epoch": 4.547182647670882,
"grad_norm": 7.509527206420898,
"learning_rate": 4.528273571314231e-06,
"loss": 0.9207,
"num_input_tokens_seen": 245940192,
"step": 454500
},
{
"epoch": 4.552185048823436,
"grad_norm": 5.191705226898193,
"learning_rate": 4.478249559788699e-06,
"loss": 0.9231,
"num_input_tokens_seen": 246211320,
"step": 455000
},
{
"epoch": 4.557187449975989,
"grad_norm": 6.908538341522217,
"learning_rate": 4.4282255482631665e-06,
"loss": 0.9256,
"num_input_tokens_seen": 246488984,
"step": 455500
},
{
"epoch": 4.562189851128542,
"grad_norm": 6.262028694152832,
"learning_rate": 4.378201536737634e-06,
"loss": 0.9219,
"num_input_tokens_seen": 246764632,
"step": 456000
},
{
"epoch": 4.567192252281095,
"grad_norm": 6.5729475021362305,
"learning_rate": 4.328177525212102e-06,
"loss": 0.9244,
"num_input_tokens_seen": 247035024,
"step": 456500
},
{
"epoch": 4.572194653433648,
"grad_norm": 7.030519008636475,
"learning_rate": 4.2781535136865695e-06,
"loss": 0.9096,
"num_input_tokens_seen": 247301664,
"step": 457000
},
{
"epoch": 4.577197054586201,
"grad_norm": 5.72337532043457,
"learning_rate": 4.228129502161037e-06,
"loss": 0.9243,
"num_input_tokens_seen": 247574992,
"step": 457500
},
{
"epoch": 4.582199455738754,
"grad_norm": 5.769835948944092,
"learning_rate": 4.178105490635505e-06,
"loss": 0.9378,
"num_input_tokens_seen": 247843024,
"step": 458000
},
{
"epoch": 4.5872018568913075,
"grad_norm": 4.633671760559082,
"learning_rate": 4.128081479109973e-06,
"loss": 0.9249,
"num_input_tokens_seen": 248112056,
"step": 458500
},
{
"epoch": 4.592204258043861,
"grad_norm": 6.9910569190979,
"learning_rate": 4.078057467584441e-06,
"loss": 0.9164,
"num_input_tokens_seen": 248380816,
"step": 459000
},
{
"epoch": 4.597206659196415,
"grad_norm": 5.471499919891357,
"learning_rate": 4.028033456058908e-06,
"loss": 0.9241,
"num_input_tokens_seen": 248657168,
"step": 459500
},
{
"epoch": 4.602209060348968,
"grad_norm": 5.17936897277832,
"learning_rate": 3.978009444533376e-06,
"loss": 0.9308,
"num_input_tokens_seen": 248924720,
"step": 460000
},
{
"epoch": 4.607211461501521,
"grad_norm": 7.616632461547852,
"learning_rate": 3.927985433007844e-06,
"loss": 0.9316,
"num_input_tokens_seen": 249194600,
"step": 460500
},
{
"epoch": 4.612213862654074,
"grad_norm": 7.818989276885986,
"learning_rate": 3.877961421482311e-06,
"loss": 0.9142,
"num_input_tokens_seen": 249462512,
"step": 461000
},
{
"epoch": 4.617216263806627,
"grad_norm": 6.754061698913574,
"learning_rate": 3.8279374099567795e-06,
"loss": 0.9213,
"num_input_tokens_seen": 249729824,
"step": 461500
},
{
"epoch": 4.62221866495918,
"grad_norm": 5.925983905792236,
"learning_rate": 3.7779133984312473e-06,
"loss": 0.9171,
"num_input_tokens_seen": 250002496,
"step": 462000
},
{
"epoch": 4.6272210661117334,
"grad_norm": 5.226542949676514,
"learning_rate": 3.7278893869057147e-06,
"loss": 0.9118,
"num_input_tokens_seen": 250268376,
"step": 462500
},
{
"epoch": 4.632223467264287,
"grad_norm": 5.781167984008789,
"learning_rate": 3.677865375380183e-06,
"loss": 0.9271,
"num_input_tokens_seen": 250530912,
"step": 463000
},
{
"epoch": 4.63722586841684,
"grad_norm": 4.7409210205078125,
"learning_rate": 3.6278413638546503e-06,
"loss": 0.9203,
"num_input_tokens_seen": 250803368,
"step": 463500
},
{
"epoch": 4.642228269569394,
"grad_norm": 4.875260353088379,
"learning_rate": 3.5778173523291177e-06,
"loss": 0.9235,
"num_input_tokens_seen": 251074960,
"step": 464000
},
{
"epoch": 4.647230670721947,
"grad_norm": 6.038626194000244,
"learning_rate": 3.527793340803586e-06,
"loss": 0.9274,
"num_input_tokens_seen": 251347688,
"step": 464500
},
{
"epoch": 4.6522330718745,
"grad_norm": 5.727837562561035,
"learning_rate": 3.477769329278054e-06,
"loss": 0.9173,
"num_input_tokens_seen": 251623264,
"step": 465000
},
{
"epoch": 4.657235473027053,
"grad_norm": 8.719578742980957,
"learning_rate": 3.427745317752521e-06,
"loss": 0.9149,
"num_input_tokens_seen": 251894136,
"step": 465500
},
{
"epoch": 4.662237874179606,
"grad_norm": 5.212377071380615,
"learning_rate": 3.3777213062269894e-06,
"loss": 0.9151,
"num_input_tokens_seen": 252168632,
"step": 466000
},
{
"epoch": 4.667240275332159,
"grad_norm": 5.5170440673828125,
"learning_rate": 3.327697294701457e-06,
"loss": 0.9177,
"num_input_tokens_seen": 252446776,
"step": 466500
},
{
"epoch": 4.6722426764847125,
"grad_norm": 5.481988906860352,
"learning_rate": 3.277673283175925e-06,
"loss": 0.922,
"num_input_tokens_seen": 252711248,
"step": 467000
},
{
"epoch": 4.677245077637266,
"grad_norm": 5.051156520843506,
"learning_rate": 3.2276492716503925e-06,
"loss": 0.9133,
"num_input_tokens_seen": 252977920,
"step": 467500
},
{
"epoch": 4.682247478789819,
"grad_norm": 5.823482036590576,
"learning_rate": 3.17762526012486e-06,
"loss": 0.9274,
"num_input_tokens_seen": 253243784,
"step": 468000
},
{
"epoch": 4.687249879942373,
"grad_norm": 5.723121166229248,
"learning_rate": 3.127601248599328e-06,
"loss": 0.9144,
"num_input_tokens_seen": 253513552,
"step": 468500
},
{
"epoch": 4.692252281094926,
"grad_norm": 6.516372203826904,
"learning_rate": 3.0775772370737955e-06,
"loss": 0.9162,
"num_input_tokens_seen": 253780008,
"step": 469000
},
{
"epoch": 4.697254682247479,
"grad_norm": 5.488427639007568,
"learning_rate": 3.0275532255482633e-06,
"loss": 0.9181,
"num_input_tokens_seen": 254050256,
"step": 469500
},
{
"epoch": 4.702257083400032,
"grad_norm": 6.543509006500244,
"learning_rate": 2.977529214022731e-06,
"loss": 0.9251,
"num_input_tokens_seen": 254325728,
"step": 470000
},
{
"epoch": 4.707259484552585,
"grad_norm": 6.277120590209961,
"learning_rate": 2.9275052024971985e-06,
"loss": 0.9332,
"num_input_tokens_seen": 254596720,
"step": 470500
},
{
"epoch": 4.712261885705138,
"grad_norm": 5.882318496704102,
"learning_rate": 2.8774811909716664e-06,
"loss": 0.9091,
"num_input_tokens_seen": 254877768,
"step": 471000
},
{
"epoch": 4.7172642868576915,
"grad_norm": 6.018533706665039,
"learning_rate": 2.827457179446134e-06,
"loss": 0.9112,
"num_input_tokens_seen": 255146616,
"step": 471500
},
{
"epoch": 4.722266688010245,
"grad_norm": 7.787155628204346,
"learning_rate": 2.777433167920602e-06,
"loss": 0.9204,
"num_input_tokens_seen": 255411528,
"step": 472000
},
{
"epoch": 4.727269089162798,
"grad_norm": 4.750833034515381,
"learning_rate": 2.72740915639507e-06,
"loss": 0.9229,
"num_input_tokens_seen": 255673056,
"step": 472500
},
{
"epoch": 4.732271490315352,
"grad_norm": 5.9136457443237305,
"learning_rate": 2.6773851448695376e-06,
"loss": 0.9291,
"num_input_tokens_seen": 255947496,
"step": 473000
},
{
"epoch": 4.737273891467905,
"grad_norm": 5.880224704742432,
"learning_rate": 2.6273611333440055e-06,
"loss": 0.9202,
"num_input_tokens_seen": 256214416,
"step": 473500
},
{
"epoch": 4.742276292620458,
"grad_norm": 4.128984451293945,
"learning_rate": 2.577337121818473e-06,
"loss": 0.9225,
"num_input_tokens_seen": 256493072,
"step": 474000
},
{
"epoch": 4.747278693773011,
"grad_norm": 4.8430914878845215,
"learning_rate": 2.5273131102929407e-06,
"loss": 0.9142,
"num_input_tokens_seen": 256764208,
"step": 474500
},
{
"epoch": 4.752281094925564,
"grad_norm": 4.830491542816162,
"learning_rate": 2.4772890987674085e-06,
"loss": 0.9118,
"num_input_tokens_seen": 257032808,
"step": 475000
},
{
"epoch": 4.757283496078117,
"grad_norm": 4.94685697555542,
"learning_rate": 2.4272650872418763e-06,
"loss": 0.9182,
"num_input_tokens_seen": 257296856,
"step": 475500
},
{
"epoch": 4.7622858972306705,
"grad_norm": 5.098095417022705,
"learning_rate": 2.3772410757163437e-06,
"loss": 0.9135,
"num_input_tokens_seen": 257564344,
"step": 476000
},
{
"epoch": 4.767288298383224,
"grad_norm": 6.16255521774292,
"learning_rate": 2.3272170641908115e-06,
"loss": 0.9081,
"num_input_tokens_seen": 257834360,
"step": 476500
},
{
"epoch": 4.772290699535777,
"grad_norm": 5.006162643432617,
"learning_rate": 2.2771930526652793e-06,
"loss": 0.924,
"num_input_tokens_seen": 258105976,
"step": 477000
},
{
"epoch": 4.777293100688331,
"grad_norm": 5.462359428405762,
"learning_rate": 2.227169041139747e-06,
"loss": 0.8951,
"num_input_tokens_seen": 258374328,
"step": 477500
},
{
"epoch": 4.782295501840884,
"grad_norm": 6.263942241668701,
"learning_rate": 2.177145029614215e-06,
"loss": 0.9173,
"num_input_tokens_seen": 258646752,
"step": 478000
},
{
"epoch": 4.787297902993437,
"grad_norm": 6.507811546325684,
"learning_rate": 2.127121018088683e-06,
"loss": 0.9137,
"num_input_tokens_seen": 258920128,
"step": 478500
},
{
"epoch": 4.79230030414599,
"grad_norm": 5.116788864135742,
"learning_rate": 2.0770970065631506e-06,
"loss": 0.9134,
"num_input_tokens_seen": 259189648,
"step": 479000
},
{
"epoch": 4.797302705298543,
"grad_norm": 5.995227336883545,
"learning_rate": 2.0270729950376184e-06,
"loss": 0.9169,
"num_input_tokens_seen": 259457416,
"step": 479500
},
{
"epoch": 4.802305106451096,
"grad_norm": 4.341572284698486,
"learning_rate": 1.977048983512086e-06,
"loss": 0.9121,
"num_input_tokens_seen": 259733960,
"step": 480000
},
{
"epoch": 4.8073075076036496,
"grad_norm": 6.331231117248535,
"learning_rate": 1.9270249719865537e-06,
"loss": 0.9096,
"num_input_tokens_seen": 260007160,
"step": 480500
},
{
"epoch": 4.812309908756203,
"grad_norm": 7.295907497406006,
"learning_rate": 1.8770009604610215e-06,
"loss": 0.9227,
"num_input_tokens_seen": 260281344,
"step": 481000
},
{
"epoch": 4.817312309908756,
"grad_norm": 5.934523105621338,
"learning_rate": 1.8269769489354893e-06,
"loss": 0.9225,
"num_input_tokens_seen": 260555592,
"step": 481500
},
{
"epoch": 4.82231471106131,
"grad_norm": 6.325069904327393,
"learning_rate": 1.7769529374099567e-06,
"loss": 0.9061,
"num_input_tokens_seen": 260826576,
"step": 482000
},
{
"epoch": 4.827317112213863,
"grad_norm": 4.8332977294921875,
"learning_rate": 1.7269289258844245e-06,
"loss": 0.9298,
"num_input_tokens_seen": 261100264,
"step": 482500
},
{
"epoch": 4.832319513366416,
"grad_norm": 6.445847988128662,
"learning_rate": 1.6769049143588923e-06,
"loss": 0.9189,
"num_input_tokens_seen": 261372184,
"step": 483000
},
{
"epoch": 4.837321914518969,
"grad_norm": 6.000613212585449,
"learning_rate": 1.6268809028333602e-06,
"loss": 0.9195,
"num_input_tokens_seen": 261635736,
"step": 483500
},
{
"epoch": 4.842324315671522,
"grad_norm": 5.839612007141113,
"learning_rate": 1.5768568913078278e-06,
"loss": 0.9109,
"num_input_tokens_seen": 261908384,
"step": 484000
},
{
"epoch": 4.8473267168240755,
"grad_norm": 5.0340471267700195,
"learning_rate": 1.5268328797822956e-06,
"loss": 0.914,
"num_input_tokens_seen": 262183640,
"step": 484500
},
{
"epoch": 4.852329117976629,
"grad_norm": 9.325509071350098,
"learning_rate": 1.4768088682567634e-06,
"loss": 0.9097,
"num_input_tokens_seen": 262457280,
"step": 485000
},
{
"epoch": 4.857331519129182,
"grad_norm": 4.7551703453063965,
"learning_rate": 1.426784856731231e-06,
"loss": 0.916,
"num_input_tokens_seen": 262736336,
"step": 485500
},
{
"epoch": 4.862333920281735,
"grad_norm": 5.392652988433838,
"learning_rate": 1.3767608452056988e-06,
"loss": 0.9184,
"num_input_tokens_seen": 263002616,
"step": 486000
},
{
"epoch": 4.867336321434289,
"grad_norm": 6.265237808227539,
"learning_rate": 1.3267368336801664e-06,
"loss": 0.9233,
"num_input_tokens_seen": 263268536,
"step": 486500
},
{
"epoch": 4.872338722586842,
"grad_norm": 9.598567008972168,
"learning_rate": 1.2767128221546343e-06,
"loss": 0.9266,
"num_input_tokens_seen": 263537000,
"step": 487000
},
{
"epoch": 4.877341123739395,
"grad_norm": 4.646302700042725,
"learning_rate": 1.226688810629102e-06,
"loss": 0.9115,
"num_input_tokens_seen": 263804248,
"step": 487500
},
{
"epoch": 4.882343524891948,
"grad_norm": 5.325549125671387,
"learning_rate": 1.1766647991035699e-06,
"loss": 0.9065,
"num_input_tokens_seen": 264071728,
"step": 488000
},
{
"epoch": 4.887345926044501,
"grad_norm": 5.929357528686523,
"learning_rate": 1.1266407875780375e-06,
"loss": 0.9124,
"num_input_tokens_seen": 264342480,
"step": 488500
},
{
"epoch": 4.8923483271970545,
"grad_norm": 7.279246807098389,
"learning_rate": 1.0766167760525053e-06,
"loss": 0.9112,
"num_input_tokens_seen": 264615696,
"step": 489000
},
{
"epoch": 4.897350728349608,
"grad_norm": 7.106380939483643,
"learning_rate": 1.026592764526973e-06,
"loss": 0.9099,
"num_input_tokens_seen": 264886296,
"step": 489500
},
{
"epoch": 4.902353129502161,
"grad_norm": 6.210377216339111,
"learning_rate": 9.765687530014407e-07,
"loss": 0.9227,
"num_input_tokens_seen": 265156384,
"step": 490000
},
{
"epoch": 4.907355530654714,
"grad_norm": 6.38276481628418,
"learning_rate": 9.265447414759085e-07,
"loss": 0.9205,
"num_input_tokens_seen": 265426248,
"step": 490500
},
{
"epoch": 4.912357931807268,
"grad_norm": 5.883709907531738,
"learning_rate": 8.765207299503763e-07,
"loss": 0.9309,
"num_input_tokens_seen": 265695040,
"step": 491000
},
{
"epoch": 4.917360332959821,
"grad_norm": 5.776634693145752,
"learning_rate": 8.26496718424844e-07,
"loss": 0.9042,
"num_input_tokens_seen": 265968648,
"step": 491500
},
{
"epoch": 4.922362734112374,
"grad_norm": 6.002242088317871,
"learning_rate": 7.764727068993117e-07,
"loss": 0.9151,
"num_input_tokens_seen": 266237776,
"step": 492000
},
{
"epoch": 4.927365135264927,
"grad_norm": 6.250047206878662,
"learning_rate": 7.264486953737794e-07,
"loss": 0.9021,
"num_input_tokens_seen": 266510616,
"step": 492500
},
{
"epoch": 4.93236753641748,
"grad_norm": 7.225757598876953,
"learning_rate": 6.764246838482472e-07,
"loss": 0.9197,
"num_input_tokens_seen": 266780560,
"step": 493000
},
{
"epoch": 4.9373699375700335,
"grad_norm": 6.095335006713867,
"learning_rate": 6.26400672322715e-07,
"loss": 0.916,
"num_input_tokens_seen": 267050712,
"step": 493500
},
{
"epoch": 4.942372338722587,
"grad_norm": 7.186228275299072,
"learning_rate": 5.763766607971827e-07,
"loss": 0.9027,
"num_input_tokens_seen": 267324136,
"step": 494000
},
{
"epoch": 4.94737473987514,
"grad_norm": 6.71329402923584,
"learning_rate": 5.263526492716505e-07,
"loss": 0.9205,
"num_input_tokens_seen": 267592264,
"step": 494500
},
{
"epoch": 4.952377141027693,
"grad_norm": 6.1677045822143555,
"learning_rate": 4.763286377461182e-07,
"loss": 0.9235,
"num_input_tokens_seen": 267861184,
"step": 495000
},
{
"epoch": 4.957379542180247,
"grad_norm": 5.632607460021973,
"learning_rate": 4.263046262205859e-07,
"loss": 0.9131,
"num_input_tokens_seen": 268133248,
"step": 495500
},
{
"epoch": 4.9623819433328,
"grad_norm": 7.572421073913574,
"learning_rate": 3.7628061469505367e-07,
"loss": 0.9228,
"num_input_tokens_seen": 268408144,
"step": 496000
},
{
"epoch": 4.967384344485353,
"grad_norm": 5.349244594573975,
"learning_rate": 3.262566031695214e-07,
"loss": 0.9038,
"num_input_tokens_seen": 268677424,
"step": 496500
},
{
"epoch": 4.972386745637906,
"grad_norm": 5.9130754470825195,
"learning_rate": 2.7623259164398915e-07,
"loss": 0.9116,
"num_input_tokens_seen": 268944976,
"step": 497000
},
{
"epoch": 4.977389146790459,
"grad_norm": 6.351227283477783,
"learning_rate": 2.2620858011845684e-07,
"loss": 0.9281,
"num_input_tokens_seen": 269220512,
"step": 497500
},
{
"epoch": 4.9823915479430125,
"grad_norm": 4.36036491394043,
"learning_rate": 1.761845685929246e-07,
"loss": 0.9158,
"num_input_tokens_seen": 269491032,
"step": 498000
},
{
"epoch": 4.987393949095566,
"grad_norm": 6.718998432159424,
"learning_rate": 1.2616055706739237e-07,
"loss": 0.923,
"num_input_tokens_seen": 269761928,
"step": 498500
},
{
"epoch": 4.992396350248119,
"grad_norm": 6.188157558441162,
"learning_rate": 7.61365455418601e-08,
"loss": 0.9097,
"num_input_tokens_seen": 270030752,
"step": 499000
},
{
"epoch": 4.997398751400672,
"grad_norm": 7.353320598602295,
"learning_rate": 2.6112534016327838e-08,
"loss": 0.9016,
"num_input_tokens_seen": 270298112,
"step": 499500
},
{
"epoch": 5.0,
"eval_loss": 0.9877662062644958,
"eval_runtime": 193.3937,
"eval_samples_per_second": 1033.668,
"eval_steps_per_second": 129.213,
"num_input_tokens_seen": 270444104,
"step": 499760
},
{
"epoch": 5.0,
"num_input_tokens_seen": 270444104,
"step": 499760,
"total_flos": 7.16219760157655e+16,
"train_loss": 0.8578685343122414,
"train_runtime": 15819.1918,
"train_samples_per_second": 252.736,
"train_steps_per_second": 31.592,
"train_tokens_per_second": 17096.35
}
],
"logging_steps": 500,
"max_steps": 499760,
"num_input_tokens_seen": 270444104,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.16219760157655e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}