diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.35021189938906927, + "epoch": 0.4002421707303649, "eval_steps": 500, - "global_step": 6363, + "global_step": 7272, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -44548,6 +44548,6369 @@ "learning_rate": 9.280914683069837e-06, "loss": 0.8314, "step": 6363 + }, + { + "epoch": 0.35026693819142496, + "grad_norm": 0.8059762716293335, + "learning_rate": 9.280690706573633e-06, + "loss": 0.7695, + "step": 6364 + }, + { + "epoch": 0.3503219769937806, + "grad_norm": 0.8053386807441711, + "learning_rate": 9.280466697904902e-06, + "loss": 0.8941, + "step": 6365 + }, + { + "epoch": 0.3503770157961363, + "grad_norm": 0.6703817248344421, + "learning_rate": 9.280242657065329e-06, + "loss": 0.5978, + "step": 6366 + }, + { + "epoch": 0.3504320545984919, + "grad_norm": 0.9359784722328186, + "learning_rate": 9.280018584056598e-06, + "loss": 0.8479, + "step": 6367 + }, + { + "epoch": 0.3504870934008476, + "grad_norm": 0.7692418098449707, + "learning_rate": 9.279794478880393e-06, + "loss": 0.7254, + "step": 6368 + }, + { + "epoch": 0.35054213220320324, + "grad_norm": 0.7992031574249268, + "learning_rate": 9.279570341538397e-06, + "loss": 0.6749, + "step": 6369 + }, + { + "epoch": 0.3505971710055589, + "grad_norm": 0.7735288739204407, + "learning_rate": 9.279346172032297e-06, + "loss": 0.8545, + "step": 6370 + }, + { + "epoch": 0.35065220980791456, + "grad_norm": 0.7124339938163757, + "learning_rate": 9.279121970363778e-06, + "loss": 0.8066, + "step": 6371 + }, + { + "epoch": 0.35070724861027025, + "grad_norm": 0.8116535544395447, + "learning_rate": 9.278897736534521e-06, + "loss": 0.8197, + "step": 6372 + }, + { + "epoch": 0.3507622874126259, + "grad_norm": 0.9377869963645935, + "learning_rate": 9.278673470546217e-06, + "loss": 0.74, + "step": 6373 + }, + { + "epoch": 0.3508173262149816, + "grad_norm": 0.6726253628730774, + "learning_rate": 9.278449172400548e-06, + "loss": 0.6389, + "step": 6374 + }, + { + "epoch": 0.3508723650173372, + "grad_norm": 0.8470593094825745, + "learning_rate": 9.278224842099198e-06, + "loss": 0.8059, + "step": 6375 + }, + { + "epoch": 0.3509274038196929, + "grad_norm": 0.7041867971420288, + "learning_rate": 9.278000479643857e-06, + "loss": 0.7409, + "step": 6376 + }, + { + "epoch": 0.35098244262204853, + "grad_norm": 0.7467322945594788, + "learning_rate": 9.27777608503621e-06, + "loss": 0.823, + "step": 6377 + }, + { + "epoch": 0.3510374814244042, + "grad_norm": 0.7211065888404846, + "learning_rate": 9.277551658277942e-06, + "loss": 0.7655, + "step": 6378 + }, + { + "epoch": 0.35109252022675985, + "grad_norm": 0.7709450125694275, + "learning_rate": 9.27732719937074e-06, + "loss": 0.8938, + "step": 6379 + }, + { + "epoch": 0.35114755902911554, + "grad_norm": 0.7672929167747498, + "learning_rate": 9.277102708316293e-06, + "loss": 0.6814, + "step": 6380 + }, + { + "epoch": 0.3512025978314712, + "grad_norm": 0.7334907650947571, + "learning_rate": 9.276878185116287e-06, + "loss": 0.6608, + "step": 6381 + }, + { + "epoch": 0.35125763663382686, + "grad_norm": 0.7011460065841675, + "learning_rate": 9.27665362977241e-06, + "loss": 0.8196, + "step": 6382 + }, + { + "epoch": 0.3513126754361825, + "grad_norm": 0.7388820052146912, + "learning_rate": 9.276429042286349e-06, + "loss": 0.8793, + "step": 6383 + }, + { + "epoch": 0.3513677142385382, + "grad_norm": 0.809725821018219, + "learning_rate": 9.27620442265979e-06, + "loss": 0.6976, + "step": 6384 + }, + { + "epoch": 0.3514227530408938, + "grad_norm": 0.6933012008666992, + "learning_rate": 9.275979770894424e-06, + "loss": 0.759, + "step": 6385 + }, + { + "epoch": 0.3514777918432495, + "grad_norm": 0.7928480505943298, + "learning_rate": 9.27575508699194e-06, + "loss": 0.7462, + "step": 6386 + }, + { + "epoch": 0.35153283064560514, + "grad_norm": 0.8461304903030396, + "learning_rate": 9.275530370954024e-06, + "loss": 0.8184, + "step": 6387 + }, + { + "epoch": 0.35158786944796083, + "grad_norm": 0.7624425292015076, + "learning_rate": 9.275305622782366e-06, + "loss": 0.7913, + "step": 6388 + }, + { + "epoch": 0.35164290825031647, + "grad_norm": 0.7103675007820129, + "learning_rate": 9.275080842478657e-06, + "loss": 0.7633, + "step": 6389 + }, + { + "epoch": 0.35169794705267216, + "grad_norm": 0.9002664089202881, + "learning_rate": 9.274856030044583e-06, + "loss": 0.7643, + "step": 6390 + }, + { + "epoch": 0.3517529858550278, + "grad_norm": 0.7658692002296448, + "learning_rate": 9.274631185481836e-06, + "loss": 0.8028, + "step": 6391 + }, + { + "epoch": 0.3518080246573835, + "grad_norm": 0.6747875809669495, + "learning_rate": 9.274406308792106e-06, + "loss": 0.695, + "step": 6392 + }, + { + "epoch": 0.3518630634597391, + "grad_norm": 0.8197165131568909, + "learning_rate": 9.27418139997708e-06, + "loss": 0.7218, + "step": 6393 + }, + { + "epoch": 0.3519181022620948, + "grad_norm": 0.7597750425338745, + "learning_rate": 9.273956459038453e-06, + "loss": 0.7738, + "step": 6394 + }, + { + "epoch": 0.35197314106445043, + "grad_norm": 0.7365928888320923, + "learning_rate": 9.273731485977912e-06, + "loss": 0.7906, + "step": 6395 + }, + { + "epoch": 0.3520281798668061, + "grad_norm": 0.7313928604125977, + "learning_rate": 9.273506480797151e-06, + "loss": 0.834, + "step": 6396 + }, + { + "epoch": 0.35208321866916176, + "grad_norm": 0.758886456489563, + "learning_rate": 9.273281443497858e-06, + "loss": 0.8883, + "step": 6397 + }, + { + "epoch": 0.35213825747151745, + "grad_norm": 0.7318256497383118, + "learning_rate": 9.273056374081726e-06, + "loss": 0.7463, + "step": 6398 + }, + { + "epoch": 0.3521932962738731, + "grad_norm": 0.778448224067688, + "learning_rate": 9.272831272550446e-06, + "loss": 0.6838, + "step": 6399 + }, + { + "epoch": 0.3522483350762287, + "grad_norm": 0.7392274141311646, + "learning_rate": 9.272606138905709e-06, + "loss": 0.7237, + "step": 6400 + }, + { + "epoch": 0.3523033738785844, + "grad_norm": 0.8803032040596008, + "learning_rate": 9.272380973149209e-06, + "loss": 0.7839, + "step": 6401 + }, + { + "epoch": 0.35235841268094004, + "grad_norm": 0.7506754994392395, + "learning_rate": 9.272155775282636e-06, + "loss": 0.7665, + "step": 6402 + }, + { + "epoch": 0.3524134514832957, + "grad_norm": 0.8136595487594604, + "learning_rate": 9.271930545307686e-06, + "loss": 0.9111, + "step": 6403 + }, + { + "epoch": 0.35246849028565136, + "grad_norm": 0.7976880073547363, + "learning_rate": 9.271705283226047e-06, + "loss": 0.735, + "step": 6404 + }, + { + "epoch": 0.35252352908800705, + "grad_norm": 0.89708411693573, + "learning_rate": 9.271479989039415e-06, + "loss": 0.7698, + "step": 6405 + }, + { + "epoch": 0.3525785678903627, + "grad_norm": 0.8618703484535217, + "learning_rate": 9.271254662749484e-06, + "loss": 0.9001, + "step": 6406 + }, + { + "epoch": 0.35263360669271837, + "grad_norm": 0.7143027186393738, + "learning_rate": 9.271029304357946e-06, + "loss": 0.8188, + "step": 6407 + }, + { + "epoch": 0.352688645495074, + "grad_norm": 0.795365571975708, + "learning_rate": 9.270803913866496e-06, + "loss": 0.7389, + "step": 6408 + }, + { + "epoch": 0.3527436842974297, + "grad_norm": 0.6947643756866455, + "learning_rate": 9.270578491276825e-06, + "loss": 0.7278, + "step": 6409 + }, + { + "epoch": 0.3527987230997853, + "grad_norm": 0.7806137204170227, + "learning_rate": 9.27035303659063e-06, + "loss": 0.808, + "step": 6410 + }, + { + "epoch": 0.352853761902141, + "grad_norm": 0.8908704519271851, + "learning_rate": 9.270127549809606e-06, + "loss": 0.8659, + "step": 6411 + }, + { + "epoch": 0.35290880070449665, + "grad_norm": 0.8171417713165283, + "learning_rate": 9.269902030935445e-06, + "loss": 0.7918, + "step": 6412 + }, + { + "epoch": 0.35296383950685234, + "grad_norm": 0.7556712627410889, + "learning_rate": 9.269676479969842e-06, + "loss": 0.7121, + "step": 6413 + }, + { + "epoch": 0.353018878309208, + "grad_norm": 0.8080483675003052, + "learning_rate": 9.269450896914495e-06, + "loss": 0.8185, + "step": 6414 + }, + { + "epoch": 0.35307391711156366, + "grad_norm": 0.8514583706855774, + "learning_rate": 9.2692252817711e-06, + "loss": 0.8055, + "step": 6415 + }, + { + "epoch": 0.3531289559139193, + "grad_norm": 0.7914162278175354, + "learning_rate": 9.268999634541347e-06, + "loss": 0.759, + "step": 6416 + }, + { + "epoch": 0.353183994716275, + "grad_norm": 0.6452118754386902, + "learning_rate": 9.268773955226937e-06, + "loss": 0.6797, + "step": 6417 + }, + { + "epoch": 0.3532390335186306, + "grad_norm": 0.6876220107078552, + "learning_rate": 9.268548243829565e-06, + "loss": 0.7365, + "step": 6418 + }, + { + "epoch": 0.3532940723209863, + "grad_norm": 0.758550226688385, + "learning_rate": 9.268322500350926e-06, + "loss": 0.7069, + "step": 6419 + }, + { + "epoch": 0.35334911112334194, + "grad_norm": 0.7905879020690918, + "learning_rate": 9.268096724792718e-06, + "loss": 0.8024, + "step": 6420 + }, + { + "epoch": 0.35340414992569763, + "grad_norm": 0.755253255367279, + "learning_rate": 9.267870917156638e-06, + "loss": 0.8018, + "step": 6421 + }, + { + "epoch": 0.35345918872805326, + "grad_norm": 0.6879923343658447, + "learning_rate": 9.267645077444382e-06, + "loss": 0.7267, + "step": 6422 + }, + { + "epoch": 0.35351422753040895, + "grad_norm": 0.766214907169342, + "learning_rate": 9.267419205657649e-06, + "loss": 0.7801, + "step": 6423 + }, + { + "epoch": 0.3535692663327646, + "grad_norm": 0.868776798248291, + "learning_rate": 9.267193301798135e-06, + "loss": 0.9234, + "step": 6424 + }, + { + "epoch": 0.3536243051351203, + "grad_norm": 1.2007492780685425, + "learning_rate": 9.266967365867536e-06, + "loss": 0.7743, + "step": 6425 + }, + { + "epoch": 0.3536793439374759, + "grad_norm": 0.7445551156997681, + "learning_rate": 9.266741397867556e-06, + "loss": 0.6755, + "step": 6426 + }, + { + "epoch": 0.3537343827398316, + "grad_norm": 0.7493785619735718, + "learning_rate": 9.266515397799889e-06, + "loss": 0.7891, + "step": 6427 + }, + { + "epoch": 0.35378942154218723, + "grad_norm": 0.6718230843544006, + "learning_rate": 9.266289365666234e-06, + "loss": 0.6908, + "step": 6428 + }, + { + "epoch": 0.3538444603445429, + "grad_norm": 0.7783547639846802, + "learning_rate": 9.266063301468289e-06, + "loss": 0.7115, + "step": 6429 + }, + { + "epoch": 0.35389949914689856, + "grad_norm": 0.745627224445343, + "learning_rate": 9.265837205207755e-06, + "loss": 0.8421, + "step": 6430 + }, + { + "epoch": 0.35395453794925424, + "grad_norm": 0.7314152717590332, + "learning_rate": 9.26561107688633e-06, + "loss": 0.807, + "step": 6431 + }, + { + "epoch": 0.3540095767516099, + "grad_norm": 0.6975863575935364, + "learning_rate": 9.265384916505714e-06, + "loss": 0.7787, + "step": 6432 + }, + { + "epoch": 0.35406461555396557, + "grad_norm": 0.9758319854736328, + "learning_rate": 9.265158724067608e-06, + "loss": 0.8668, + "step": 6433 + }, + { + "epoch": 0.3541196543563212, + "grad_norm": 0.7686764001846313, + "learning_rate": 9.264932499573711e-06, + "loss": 0.7428, + "step": 6434 + }, + { + "epoch": 0.3541746931586769, + "grad_norm": 0.8761935830116272, + "learning_rate": 9.26470624302572e-06, + "loss": 0.8022, + "step": 6435 + }, + { + "epoch": 0.3542297319610325, + "grad_norm": 0.9145118594169617, + "learning_rate": 9.264479954425341e-06, + "loss": 0.7994, + "step": 6436 + }, + { + "epoch": 0.3542847707633882, + "grad_norm": 0.8217951655387878, + "learning_rate": 9.264253633774271e-06, + "loss": 0.7235, + "step": 6437 + }, + { + "epoch": 0.35433980956574385, + "grad_norm": 0.7624716758728027, + "learning_rate": 9.264027281074214e-06, + "loss": 0.8238, + "step": 6438 + }, + { + "epoch": 0.35439484836809954, + "grad_norm": 0.7772085070610046, + "learning_rate": 9.26380089632687e-06, + "loss": 0.7941, + "step": 6439 + }, + { + "epoch": 0.35444988717045517, + "grad_norm": 1.0462371110916138, + "learning_rate": 9.263574479533937e-06, + "loss": 0.8255, + "step": 6440 + }, + { + "epoch": 0.35450492597281086, + "grad_norm": 0.8523101210594177, + "learning_rate": 9.263348030697119e-06, + "loss": 0.8489, + "step": 6441 + }, + { + "epoch": 0.3545599647751665, + "grad_norm": 1.0292255878448486, + "learning_rate": 9.26312154981812e-06, + "loss": 0.7989, + "step": 6442 + }, + { + "epoch": 0.3546150035775221, + "grad_norm": 0.7621143460273743, + "learning_rate": 9.262895036898641e-06, + "loss": 0.8154, + "step": 6443 + }, + { + "epoch": 0.3546700423798778, + "grad_norm": 0.7158074378967285, + "learning_rate": 9.262668491940382e-06, + "loss": 0.7821, + "step": 6444 + }, + { + "epoch": 0.35472508118223345, + "grad_norm": 0.7969478964805603, + "learning_rate": 9.26244191494505e-06, + "loss": 0.8535, + "step": 6445 + }, + { + "epoch": 0.35478011998458914, + "grad_norm": 0.9244762063026428, + "learning_rate": 9.262215305914345e-06, + "loss": 0.7585, + "step": 6446 + }, + { + "epoch": 0.35483515878694477, + "grad_norm": 0.6862454414367676, + "learning_rate": 9.26198866484997e-06, + "loss": 0.7294, + "step": 6447 + }, + { + "epoch": 0.35489019758930046, + "grad_norm": 0.6816834211349487, + "learning_rate": 9.261761991753629e-06, + "loss": 0.7763, + "step": 6448 + }, + { + "epoch": 0.3549452363916561, + "grad_norm": 0.792539119720459, + "learning_rate": 9.261535286627025e-06, + "loss": 0.7829, + "step": 6449 + }, + { + "epoch": 0.3550002751940118, + "grad_norm": 0.8563211560249329, + "learning_rate": 9.261308549471866e-06, + "loss": 0.8945, + "step": 6450 + }, + { + "epoch": 0.3550553139963674, + "grad_norm": 0.7241078019142151, + "learning_rate": 9.26108178028985e-06, + "loss": 0.6936, + "step": 6451 + }, + { + "epoch": 0.3551103527987231, + "grad_norm": 0.7150034308433533, + "learning_rate": 9.260854979082682e-06, + "loss": 0.7689, + "step": 6452 + }, + { + "epoch": 0.35516539160107874, + "grad_norm": 0.8630193471908569, + "learning_rate": 9.260628145852073e-06, + "loss": 0.8506, + "step": 6453 + }, + { + "epoch": 0.35522043040343443, + "grad_norm": 0.7133893370628357, + "learning_rate": 9.26040128059972e-06, + "loss": 0.7976, + "step": 6454 + }, + { + "epoch": 0.35527546920579006, + "grad_norm": 0.6984630823135376, + "learning_rate": 9.260174383327332e-06, + "loss": 0.7442, + "step": 6455 + }, + { + "epoch": 0.35533050800814575, + "grad_norm": 0.7166933417320251, + "learning_rate": 9.259947454036613e-06, + "loss": 0.813, + "step": 6456 + }, + { + "epoch": 0.3553855468105014, + "grad_norm": 0.7353581190109253, + "learning_rate": 9.259720492729272e-06, + "loss": 0.8157, + "step": 6457 + }, + { + "epoch": 0.3554405856128571, + "grad_norm": 0.6810038089752197, + "learning_rate": 9.259493499407011e-06, + "loss": 0.7423, + "step": 6458 + }, + { + "epoch": 0.3554956244152127, + "grad_norm": 1.1599586009979248, + "learning_rate": 9.259266474071535e-06, + "loss": 0.7159, + "step": 6459 + }, + { + "epoch": 0.3555506632175684, + "grad_norm": 0.7857629060745239, + "learning_rate": 9.259039416724554e-06, + "loss": 0.7846, + "step": 6460 + }, + { + "epoch": 0.35560570201992403, + "grad_norm": 0.705333948135376, + "learning_rate": 9.258812327367773e-06, + "loss": 0.751, + "step": 6461 + }, + { + "epoch": 0.3556607408222797, + "grad_norm": 0.6899998188018799, + "learning_rate": 9.258585206002897e-06, + "loss": 0.7303, + "step": 6462 + }, + { + "epoch": 0.35571577962463535, + "grad_norm": 0.8007912039756775, + "learning_rate": 9.258358052631637e-06, + "loss": 0.7363, + "step": 6463 + }, + { + "epoch": 0.35577081842699104, + "grad_norm": 0.9403146505355835, + "learning_rate": 9.258130867255695e-06, + "loss": 0.9096, + "step": 6464 + }, + { + "epoch": 0.3558258572293467, + "grad_norm": 0.7069174647331238, + "learning_rate": 9.257903649876782e-06, + "loss": 0.7362, + "step": 6465 + }, + { + "epoch": 0.35588089603170237, + "grad_norm": 0.770807683467865, + "learning_rate": 9.257676400496607e-06, + "loss": 0.7904, + "step": 6466 + }, + { + "epoch": 0.355935934834058, + "grad_norm": 0.8586871027946472, + "learning_rate": 9.257449119116874e-06, + "loss": 0.7596, + "step": 6467 + }, + { + "epoch": 0.3559909736364137, + "grad_norm": 0.6934101581573486, + "learning_rate": 9.257221805739294e-06, + "loss": 0.6655, + "step": 6468 + }, + { + "epoch": 0.3560460124387693, + "grad_norm": 0.9494497179985046, + "learning_rate": 9.256994460365573e-06, + "loss": 0.7923, + "step": 6469 + }, + { + "epoch": 0.356101051241125, + "grad_norm": 0.7131130695343018, + "learning_rate": 9.256767082997422e-06, + "loss": 0.819, + "step": 6470 + }, + { + "epoch": 0.35615609004348064, + "grad_norm": 0.8641398549079895, + "learning_rate": 9.25653967363655e-06, + "loss": 0.8275, + "step": 6471 + }, + { + "epoch": 0.35621112884583633, + "grad_norm": 0.7350367307662964, + "learning_rate": 9.256312232284665e-06, + "loss": 0.7991, + "step": 6472 + }, + { + "epoch": 0.35626616764819197, + "grad_norm": 0.8174671530723572, + "learning_rate": 9.256084758943476e-06, + "loss": 0.7147, + "step": 6473 + }, + { + "epoch": 0.35632120645054766, + "grad_norm": 0.7560263872146606, + "learning_rate": 9.255857253614693e-06, + "loss": 0.7435, + "step": 6474 + }, + { + "epoch": 0.3563762452529033, + "grad_norm": 0.7465197443962097, + "learning_rate": 9.255629716300025e-06, + "loss": 0.8228, + "step": 6475 + }, + { + "epoch": 0.356431284055259, + "grad_norm": 0.7130733728408813, + "learning_rate": 9.255402147001184e-06, + "loss": 0.8361, + "step": 6476 + }, + { + "epoch": 0.3564863228576146, + "grad_norm": 0.7200759053230286, + "learning_rate": 9.255174545719882e-06, + "loss": 0.7387, + "step": 6477 + }, + { + "epoch": 0.3565413616599703, + "grad_norm": 0.8387622237205505, + "learning_rate": 9.254946912457826e-06, + "loss": 0.8427, + "step": 6478 + }, + { + "epoch": 0.35659640046232594, + "grad_norm": 0.7263510823249817, + "learning_rate": 9.254719247216725e-06, + "loss": 0.712, + "step": 6479 + }, + { + "epoch": 0.3566514392646816, + "grad_norm": 0.7393862009048462, + "learning_rate": 9.254491549998296e-06, + "loss": 0.6916, + "step": 6480 + }, + { + "epoch": 0.35670647806703726, + "grad_norm": 0.7289569973945618, + "learning_rate": 9.254263820804246e-06, + "loss": 0.7561, + "step": 6481 + }, + { + "epoch": 0.35676151686939295, + "grad_norm": 0.7597448825836182, + "learning_rate": 9.254036059636288e-06, + "loss": 0.853, + "step": 6482 + }, + { + "epoch": 0.3568165556717486, + "grad_norm": 0.7652063369750977, + "learning_rate": 9.253808266496136e-06, + "loss": 0.7652, + "step": 6483 + }, + { + "epoch": 0.35687159447410427, + "grad_norm": 1.193938136100769, + "learning_rate": 9.253580441385497e-06, + "loss": 0.8288, + "step": 6484 + }, + { + "epoch": 0.3569266332764599, + "grad_norm": 0.9258719086647034, + "learning_rate": 9.253352584306087e-06, + "loss": 0.807, + "step": 6485 + }, + { + "epoch": 0.35698167207881554, + "grad_norm": 0.78384929895401, + "learning_rate": 9.253124695259617e-06, + "loss": 0.7785, + "step": 6486 + }, + { + "epoch": 0.3570367108811712, + "grad_norm": 0.801403284072876, + "learning_rate": 9.252896774247802e-06, + "loss": 0.8382, + "step": 6487 + }, + { + "epoch": 0.35709174968352686, + "grad_norm": 0.9472376108169556, + "learning_rate": 9.25266882127235e-06, + "loss": 0.8661, + "step": 6488 + }, + { + "epoch": 0.35714678848588255, + "grad_norm": 0.7575686573982239, + "learning_rate": 9.252440836334981e-06, + "loss": 0.8428, + "step": 6489 + }, + { + "epoch": 0.3572018272882382, + "grad_norm": 0.736282467842102, + "learning_rate": 9.252212819437402e-06, + "loss": 0.801, + "step": 6490 + }, + { + "epoch": 0.35725686609059387, + "grad_norm": 0.7420864701271057, + "learning_rate": 9.251984770581332e-06, + "loss": 0.8849, + "step": 6491 + }, + { + "epoch": 0.3573119048929495, + "grad_norm": 0.7129189372062683, + "learning_rate": 9.251756689768482e-06, + "loss": 0.7716, + "step": 6492 + }, + { + "epoch": 0.3573669436953052, + "grad_norm": 0.7777297496795654, + "learning_rate": 9.251528577000566e-06, + "loss": 0.8183, + "step": 6493 + }, + { + "epoch": 0.35742198249766083, + "grad_norm": 0.7644590139389038, + "learning_rate": 9.2513004322793e-06, + "loss": 0.6319, + "step": 6494 + }, + { + "epoch": 0.3574770213000165, + "grad_norm": 0.7112484574317932, + "learning_rate": 9.251072255606399e-06, + "loss": 0.8012, + "step": 6495 + }, + { + "epoch": 0.35753206010237215, + "grad_norm": 0.7772265076637268, + "learning_rate": 9.250844046983576e-06, + "loss": 0.8372, + "step": 6496 + }, + { + "epoch": 0.35758709890472784, + "grad_norm": 0.9530157446861267, + "learning_rate": 9.250615806412546e-06, + "loss": 0.8683, + "step": 6497 + }, + { + "epoch": 0.3576421377070835, + "grad_norm": 0.7249575257301331, + "learning_rate": 9.250387533895026e-06, + "loss": 0.7091, + "step": 6498 + }, + { + "epoch": 0.35769717650943916, + "grad_norm": 0.8549422025680542, + "learning_rate": 9.25015922943273e-06, + "loss": 0.8376, + "step": 6499 + }, + { + "epoch": 0.3577522153117948, + "grad_norm": 0.74477618932724, + "learning_rate": 9.249930893027376e-06, + "loss": 0.7594, + "step": 6500 + }, + { + "epoch": 0.3578072541141505, + "grad_norm": 0.8269739151000977, + "learning_rate": 9.24970252468068e-06, + "loss": 0.6473, + "step": 6501 + }, + { + "epoch": 0.3578622929165061, + "grad_norm": 0.8375437259674072, + "learning_rate": 9.249474124394358e-06, + "loss": 0.7631, + "step": 6502 + }, + { + "epoch": 0.3579173317188618, + "grad_norm": 0.8680340051651001, + "learning_rate": 9.249245692170123e-06, + "loss": 0.7863, + "step": 6503 + }, + { + "epoch": 0.35797237052121744, + "grad_norm": 0.7179692983627319, + "learning_rate": 9.249017228009696e-06, + "loss": 0.8022, + "step": 6504 + }, + { + "epoch": 0.35802740932357313, + "grad_norm": 0.7797464728355408, + "learning_rate": 9.248788731914794e-06, + "loss": 0.8067, + "step": 6505 + }, + { + "epoch": 0.35808244812592877, + "grad_norm": 0.8032993674278259, + "learning_rate": 9.248560203887133e-06, + "loss": 0.7383, + "step": 6506 + }, + { + "epoch": 0.35813748692828445, + "grad_norm": 0.7714722156524658, + "learning_rate": 9.24833164392843e-06, + "loss": 0.7149, + "step": 6507 + }, + { + "epoch": 0.3581925257306401, + "grad_norm": 0.7492430210113525, + "learning_rate": 9.248103052040404e-06, + "loss": 0.7645, + "step": 6508 + }, + { + "epoch": 0.3582475645329958, + "grad_norm": 0.6843901872634888, + "learning_rate": 9.247874428224773e-06, + "loss": 0.7183, + "step": 6509 + }, + { + "epoch": 0.3583026033353514, + "grad_norm": 0.8370186686515808, + "learning_rate": 9.247645772483254e-06, + "loss": 0.7832, + "step": 6510 + }, + { + "epoch": 0.3583576421377071, + "grad_norm": 0.7907791137695312, + "learning_rate": 9.247417084817567e-06, + "loss": 0.8742, + "step": 6511 + }, + { + "epoch": 0.35841268094006273, + "grad_norm": 0.7950869798660278, + "learning_rate": 9.247188365229428e-06, + "loss": 0.8705, + "step": 6512 + }, + { + "epoch": 0.3584677197424184, + "grad_norm": 0.7276936173439026, + "learning_rate": 9.24695961372056e-06, + "loss": 0.7629, + "step": 6513 + }, + { + "epoch": 0.35852275854477406, + "grad_norm": 0.7761141657829285, + "learning_rate": 9.24673083029268e-06, + "loss": 0.8813, + "step": 6514 + }, + { + "epoch": 0.35857779734712975, + "grad_norm": 0.7528283596038818, + "learning_rate": 9.24650201494751e-06, + "loss": 0.7885, + "step": 6515 + }, + { + "epoch": 0.3586328361494854, + "grad_norm": 0.8972534537315369, + "learning_rate": 9.246273167686765e-06, + "loss": 0.9081, + "step": 6516 + }, + { + "epoch": 0.35868787495184107, + "grad_norm": 0.7658557891845703, + "learning_rate": 9.246044288512168e-06, + "loss": 0.8451, + "step": 6517 + }, + { + "epoch": 0.3587429137541967, + "grad_norm": 0.8013193607330322, + "learning_rate": 9.245815377425438e-06, + "loss": 0.7236, + "step": 6518 + }, + { + "epoch": 0.3587979525565524, + "grad_norm": 0.8134163022041321, + "learning_rate": 9.245586434428298e-06, + "loss": 0.908, + "step": 6519 + }, + { + "epoch": 0.358852991358908, + "grad_norm": 0.6479801535606384, + "learning_rate": 9.245357459522466e-06, + "loss": 0.7397, + "step": 6520 + }, + { + "epoch": 0.3589080301612637, + "grad_norm": 0.70014488697052, + "learning_rate": 9.245128452709665e-06, + "loss": 0.6898, + "step": 6521 + }, + { + "epoch": 0.35896306896361935, + "grad_norm": 0.7645437717437744, + "learning_rate": 9.244899413991613e-06, + "loss": 0.8319, + "step": 6522 + }, + { + "epoch": 0.35901810776597504, + "grad_norm": 0.6812799572944641, + "learning_rate": 9.244670343370033e-06, + "loss": 0.7359, + "step": 6523 + }, + { + "epoch": 0.35907314656833067, + "grad_norm": 0.6573774218559265, + "learning_rate": 9.244441240846647e-06, + "loss": 0.742, + "step": 6524 + }, + { + "epoch": 0.35912818537068636, + "grad_norm": 0.7870661020278931, + "learning_rate": 9.244212106423178e-06, + "loss": 0.7307, + "step": 6525 + }, + { + "epoch": 0.359183224173042, + "grad_norm": 0.9163166284561157, + "learning_rate": 9.243982940101347e-06, + "loss": 0.8584, + "step": 6526 + }, + { + "epoch": 0.3592382629753977, + "grad_norm": 0.766888439655304, + "learning_rate": 9.243753741882874e-06, + "loss": 0.8093, + "step": 6527 + }, + { + "epoch": 0.3592933017777533, + "grad_norm": 0.7831236124038696, + "learning_rate": 9.243524511769486e-06, + "loss": 0.8665, + "step": 6528 + }, + { + "epoch": 0.35934834058010895, + "grad_norm": 0.7485133409500122, + "learning_rate": 9.243295249762904e-06, + "loss": 0.7336, + "step": 6529 + }, + { + "epoch": 0.35940337938246464, + "grad_norm": 0.7231502532958984, + "learning_rate": 9.24306595586485e-06, + "loss": 0.8095, + "step": 6530 + }, + { + "epoch": 0.35945841818482027, + "grad_norm": 0.821898877620697, + "learning_rate": 9.242836630077048e-06, + "loss": 0.831, + "step": 6531 + }, + { + "epoch": 0.35951345698717596, + "grad_norm": 0.6792737245559692, + "learning_rate": 9.242607272401223e-06, + "loss": 0.7183, + "step": 6532 + }, + { + "epoch": 0.3595684957895316, + "grad_norm": 0.7200430631637573, + "learning_rate": 9.242377882839095e-06, + "loss": 0.7256, + "step": 6533 + }, + { + "epoch": 0.3596235345918873, + "grad_norm": 0.6713700890541077, + "learning_rate": 9.242148461392393e-06, + "loss": 0.7416, + "step": 6534 + }, + { + "epoch": 0.3596785733942429, + "grad_norm": 0.7054564356803894, + "learning_rate": 9.241919008062836e-06, + "loss": 0.6856, + "step": 6535 + }, + { + "epoch": 0.3597336121965986, + "grad_norm": 0.7516196966171265, + "learning_rate": 9.241689522852152e-06, + "loss": 0.7149, + "step": 6536 + }, + { + "epoch": 0.35978865099895424, + "grad_norm": 0.8547651767730713, + "learning_rate": 9.241460005762067e-06, + "loss": 0.7075, + "step": 6537 + }, + { + "epoch": 0.35984368980130993, + "grad_norm": 0.6791819334030151, + "learning_rate": 9.241230456794302e-06, + "loss": 0.6449, + "step": 6538 + }, + { + "epoch": 0.35989872860366556, + "grad_norm": 0.8365122079849243, + "learning_rate": 9.241000875950583e-06, + "loss": 0.7619, + "step": 6539 + }, + { + "epoch": 0.35995376740602125, + "grad_norm": 0.763829231262207, + "learning_rate": 9.24077126323264e-06, + "loss": 0.71, + "step": 6540 + }, + { + "epoch": 0.3600088062083769, + "grad_norm": 0.7698483467102051, + "learning_rate": 9.240541618642193e-06, + "loss": 0.7949, + "step": 6541 + }, + { + "epoch": 0.3600638450107326, + "grad_norm": 0.7331508994102478, + "learning_rate": 9.24031194218097e-06, + "loss": 0.8292, + "step": 6542 + }, + { + "epoch": 0.3601188838130882, + "grad_norm": 0.7507451772689819, + "learning_rate": 9.2400822338507e-06, + "loss": 0.8651, + "step": 6543 + }, + { + "epoch": 0.3601739226154439, + "grad_norm": 0.8537001609802246, + "learning_rate": 9.239852493653104e-06, + "loss": 0.848, + "step": 6544 + }, + { + "epoch": 0.36022896141779953, + "grad_norm": 0.683311939239502, + "learning_rate": 9.239622721589913e-06, + "loss": 0.803, + "step": 6545 + }, + { + "epoch": 0.3602840002201552, + "grad_norm": 0.6916974186897278, + "learning_rate": 9.239392917662852e-06, + "loss": 0.8037, + "step": 6546 + }, + { + "epoch": 0.36033903902251085, + "grad_norm": 0.798795223236084, + "learning_rate": 9.23916308187365e-06, + "loss": 0.8037, + "step": 6547 + }, + { + "epoch": 0.36039407782486654, + "grad_norm": 0.7284069657325745, + "learning_rate": 9.238933214224032e-06, + "loss": 0.7365, + "step": 6548 + }, + { + "epoch": 0.3604491166272222, + "grad_norm": 0.7789250016212463, + "learning_rate": 9.238703314715727e-06, + "loss": 0.788, + "step": 6549 + }, + { + "epoch": 0.36050415542957787, + "grad_norm": 0.7029675841331482, + "learning_rate": 9.238473383350462e-06, + "loss": 0.7796, + "step": 6550 + }, + { + "epoch": 0.3605591942319335, + "grad_norm": 0.9094457626342773, + "learning_rate": 9.238243420129965e-06, + "loss": 0.7884, + "step": 6551 + }, + { + "epoch": 0.3606142330342892, + "grad_norm": 0.8253848552703857, + "learning_rate": 9.238013425055965e-06, + "loss": 0.7671, + "step": 6552 + }, + { + "epoch": 0.3606692718366448, + "grad_norm": 0.7052987813949585, + "learning_rate": 9.237783398130193e-06, + "loss": 0.7511, + "step": 6553 + }, + { + "epoch": 0.3607243106390005, + "grad_norm": 0.7506607174873352, + "learning_rate": 9.237553339354373e-06, + "loss": 0.6804, + "step": 6554 + }, + { + "epoch": 0.36077934944135615, + "grad_norm": 0.725106418132782, + "learning_rate": 9.237323248730237e-06, + "loss": 0.7658, + "step": 6555 + }, + { + "epoch": 0.36083438824371183, + "grad_norm": 0.8164945244789124, + "learning_rate": 9.237093126259515e-06, + "loss": 0.7857, + "step": 6556 + }, + { + "epoch": 0.36088942704606747, + "grad_norm": 0.6937377452850342, + "learning_rate": 9.236862971943934e-06, + "loss": 0.6985, + "step": 6557 + }, + { + "epoch": 0.36094446584842316, + "grad_norm": 0.7511105537414551, + "learning_rate": 9.236632785785225e-06, + "loss": 0.7891, + "step": 6558 + }, + { + "epoch": 0.3609995046507788, + "grad_norm": 0.7217637896537781, + "learning_rate": 9.236402567785118e-06, + "loss": 0.7942, + "step": 6559 + }, + { + "epoch": 0.3610545434531345, + "grad_norm": 1.1438478231430054, + "learning_rate": 9.236172317945343e-06, + "loss": 0.8311, + "step": 6560 + }, + { + "epoch": 0.3611095822554901, + "grad_norm": 0.7414245009422302, + "learning_rate": 9.23594203626763e-06, + "loss": 0.7726, + "step": 6561 + }, + { + "epoch": 0.3611646210578458, + "grad_norm": 0.7762154340744019, + "learning_rate": 9.235711722753712e-06, + "loss": 0.7891, + "step": 6562 + }, + { + "epoch": 0.36121965986020144, + "grad_norm": 0.7368801832199097, + "learning_rate": 9.23548137740532e-06, + "loss": 0.7656, + "step": 6563 + }, + { + "epoch": 0.3612746986625571, + "grad_norm": 0.7571502923965454, + "learning_rate": 9.235251000224181e-06, + "loss": 0.7845, + "step": 6564 + }, + { + "epoch": 0.36132973746491276, + "grad_norm": 0.8078309297561646, + "learning_rate": 9.235020591212031e-06, + "loss": 0.7969, + "step": 6565 + }, + { + "epoch": 0.36138477626726845, + "grad_norm": 0.6897913813591003, + "learning_rate": 9.234790150370599e-06, + "loss": 0.6922, + "step": 6566 + }, + { + "epoch": 0.3614398150696241, + "grad_norm": 0.8053449988365173, + "learning_rate": 9.234559677701618e-06, + "loss": 0.9126, + "step": 6567 + }, + { + "epoch": 0.36149485387197977, + "grad_norm": 0.8400903940200806, + "learning_rate": 9.23432917320682e-06, + "loss": 0.8144, + "step": 6568 + }, + { + "epoch": 0.3615498926743354, + "grad_norm": 0.7753110527992249, + "learning_rate": 9.234098636887935e-06, + "loss": 0.7025, + "step": 6569 + }, + { + "epoch": 0.3616049314766911, + "grad_norm": 0.7901243567466736, + "learning_rate": 9.233868068746702e-06, + "loss": 0.783, + "step": 6570 + }, + { + "epoch": 0.3616599702790467, + "grad_norm": 1.2297497987747192, + "learning_rate": 9.233637468784849e-06, + "loss": 0.8541, + "step": 6571 + }, + { + "epoch": 0.36171500908140236, + "grad_norm": 0.7590478658676147, + "learning_rate": 9.233406837004108e-06, + "loss": 0.7856, + "step": 6572 + }, + { + "epoch": 0.36177004788375805, + "grad_norm": 0.6651493310928345, + "learning_rate": 9.233176173406216e-06, + "loss": 0.6822, + "step": 6573 + }, + { + "epoch": 0.3618250866861137, + "grad_norm": 0.7760787010192871, + "learning_rate": 9.232945477992905e-06, + "loss": 0.8017, + "step": 6574 + }, + { + "epoch": 0.3618801254884694, + "grad_norm": 0.8788009285926819, + "learning_rate": 9.232714750765908e-06, + "loss": 0.7812, + "step": 6575 + }, + { + "epoch": 0.361935164290825, + "grad_norm": 0.7014517188072205, + "learning_rate": 9.232483991726961e-06, + "loss": 0.7293, + "step": 6576 + }, + { + "epoch": 0.3619902030931807, + "grad_norm": 0.7586061954498291, + "learning_rate": 9.232253200877797e-06, + "loss": 0.7953, + "step": 6577 + }, + { + "epoch": 0.36204524189553633, + "grad_norm": 0.8202564120292664, + "learning_rate": 9.232022378220151e-06, + "loss": 0.8545, + "step": 6578 + }, + { + "epoch": 0.362100280697892, + "grad_norm": 0.7816846966743469, + "learning_rate": 9.231791523755758e-06, + "loss": 0.8573, + "step": 6579 + }, + { + "epoch": 0.36215531950024765, + "grad_norm": 0.883222222328186, + "learning_rate": 9.23156063748635e-06, + "loss": 0.7733, + "step": 6580 + }, + { + "epoch": 0.36221035830260334, + "grad_norm": 0.8472830057144165, + "learning_rate": 9.231329719413668e-06, + "loss": 0.8931, + "step": 6581 + }, + { + "epoch": 0.362265397104959, + "grad_norm": 0.7916087508201599, + "learning_rate": 9.231098769539443e-06, + "loss": 0.8806, + "step": 6582 + }, + { + "epoch": 0.36232043590731466, + "grad_norm": 0.815339982509613, + "learning_rate": 9.230867787865414e-06, + "loss": 0.9081, + "step": 6583 + }, + { + "epoch": 0.3623754747096703, + "grad_norm": 1.2352560758590698, + "learning_rate": 9.230636774393312e-06, + "loss": 0.726, + "step": 6584 + }, + { + "epoch": 0.362430513512026, + "grad_norm": 0.759308397769928, + "learning_rate": 9.230405729124878e-06, + "loss": 0.7648, + "step": 6585 + }, + { + "epoch": 0.3624855523143816, + "grad_norm": 0.8285754323005676, + "learning_rate": 9.230174652061847e-06, + "loss": 0.7972, + "step": 6586 + }, + { + "epoch": 0.3625405911167373, + "grad_norm": 0.7393043041229248, + "learning_rate": 9.229943543205956e-06, + "loss": 0.7859, + "step": 6587 + }, + { + "epoch": 0.36259562991909294, + "grad_norm": 0.7354594469070435, + "learning_rate": 9.229712402558942e-06, + "loss": 0.6683, + "step": 6588 + }, + { + "epoch": 0.36265066872144863, + "grad_norm": 0.8244406580924988, + "learning_rate": 9.229481230122543e-06, + "loss": 0.6977, + "step": 6589 + }, + { + "epoch": 0.36270570752380427, + "grad_norm": 0.810565173625946, + "learning_rate": 9.229250025898493e-06, + "loss": 0.7278, + "step": 6590 + }, + { + "epoch": 0.36276074632615996, + "grad_norm": 0.7443352937698364, + "learning_rate": 9.229018789888532e-06, + "loss": 0.7821, + "step": 6591 + }, + { + "epoch": 0.3628157851285156, + "grad_norm": 0.9211748838424683, + "learning_rate": 9.228787522094398e-06, + "loss": 0.9174, + "step": 6592 + }, + { + "epoch": 0.3628708239308713, + "grad_norm": 0.7099255919456482, + "learning_rate": 9.22855622251783e-06, + "loss": 0.74, + "step": 6593 + }, + { + "epoch": 0.3629258627332269, + "grad_norm": 0.7373029589653015, + "learning_rate": 9.228324891160564e-06, + "loss": 0.7909, + "step": 6594 + }, + { + "epoch": 0.3629809015355826, + "grad_norm": 0.8774755001068115, + "learning_rate": 9.22809352802434e-06, + "loss": 0.8354, + "step": 6595 + }, + { + "epoch": 0.36303594033793823, + "grad_norm": 0.7547696232795715, + "learning_rate": 9.227862133110899e-06, + "loss": 0.6942, + "step": 6596 + }, + { + "epoch": 0.3630909791402939, + "grad_norm": 0.7868191003799438, + "learning_rate": 9.227630706421975e-06, + "loss": 0.7575, + "step": 6597 + }, + { + "epoch": 0.36314601794264956, + "grad_norm": 0.6753721237182617, + "learning_rate": 9.227399247959312e-06, + "loss": 0.7092, + "step": 6598 + }, + { + "epoch": 0.36320105674500525, + "grad_norm": 0.7317304611206055, + "learning_rate": 9.227167757724646e-06, + "loss": 0.8372, + "step": 6599 + }, + { + "epoch": 0.3632560955473609, + "grad_norm": 0.8928040266036987, + "learning_rate": 9.226936235719721e-06, + "loss": 0.8536, + "step": 6600 + }, + { + "epoch": 0.36331113434971657, + "grad_norm": 0.7178280353546143, + "learning_rate": 9.226704681946275e-06, + "loss": 0.7648, + "step": 6601 + }, + { + "epoch": 0.3633661731520722, + "grad_norm": 0.7439851760864258, + "learning_rate": 9.226473096406046e-06, + "loss": 0.8284, + "step": 6602 + }, + { + "epoch": 0.3634212119544279, + "grad_norm": 0.7000887989997864, + "learning_rate": 9.226241479100777e-06, + "loss": 0.7797, + "step": 6603 + }, + { + "epoch": 0.3634762507567835, + "grad_norm": 0.7882626056671143, + "learning_rate": 9.226009830032209e-06, + "loss": 0.72, + "step": 6604 + }, + { + "epoch": 0.3635312895591392, + "grad_norm": 0.6445927619934082, + "learning_rate": 9.225778149202081e-06, + "loss": 0.6785, + "step": 6605 + }, + { + "epoch": 0.36358632836149485, + "grad_norm": 0.7348469495773315, + "learning_rate": 9.225546436612137e-06, + "loss": 0.8117, + "step": 6606 + }, + { + "epoch": 0.36364136716385054, + "grad_norm": 0.7455001473426819, + "learning_rate": 9.225314692264118e-06, + "loss": 0.8196, + "step": 6607 + }, + { + "epoch": 0.36369640596620617, + "grad_norm": 0.7149390578269958, + "learning_rate": 9.225082916159762e-06, + "loss": 0.8841, + "step": 6608 + }, + { + "epoch": 0.36375144476856186, + "grad_norm": 0.7095748782157898, + "learning_rate": 9.224851108300816e-06, + "loss": 0.7336, + "step": 6609 + }, + { + "epoch": 0.3638064835709175, + "grad_norm": 0.7112231850624084, + "learning_rate": 9.224619268689019e-06, + "loss": 0.8606, + "step": 6610 + }, + { + "epoch": 0.3638615223732732, + "grad_norm": 0.8052846789360046, + "learning_rate": 9.224387397326115e-06, + "loss": 0.7838, + "step": 6611 + }, + { + "epoch": 0.3639165611756288, + "grad_norm": 0.7538836002349854, + "learning_rate": 9.224155494213846e-06, + "loss": 0.8252, + "step": 6612 + }, + { + "epoch": 0.3639715999779845, + "grad_norm": 0.6968722343444824, + "learning_rate": 9.223923559353956e-06, + "loss": 0.759, + "step": 6613 + }, + { + "epoch": 0.36402663878034014, + "grad_norm": 0.7797368168830872, + "learning_rate": 9.223691592748185e-06, + "loss": 0.8452, + "step": 6614 + }, + { + "epoch": 0.3640816775826958, + "grad_norm": 0.7738572955131531, + "learning_rate": 9.223459594398278e-06, + "loss": 0.806, + "step": 6615 + }, + { + "epoch": 0.36413671638505146, + "grad_norm": 0.7998547554016113, + "learning_rate": 9.223227564305983e-06, + "loss": 0.748, + "step": 6616 + }, + { + "epoch": 0.3641917551874071, + "grad_norm": 0.838666558265686, + "learning_rate": 9.222995502473037e-06, + "loss": 0.8252, + "step": 6617 + }, + { + "epoch": 0.3642467939897628, + "grad_norm": 1.1672697067260742, + "learning_rate": 9.222763408901189e-06, + "loss": 0.806, + "step": 6618 + }, + { + "epoch": 0.3643018327921184, + "grad_norm": 0.6721193194389343, + "learning_rate": 9.22253128359218e-06, + "loss": 0.6897, + "step": 6619 + }, + { + "epoch": 0.3643568715944741, + "grad_norm": 0.8152795433998108, + "learning_rate": 9.222299126547758e-06, + "loss": 0.8377, + "step": 6620 + }, + { + "epoch": 0.36441191039682974, + "grad_norm": 0.7959492206573486, + "learning_rate": 9.222066937769664e-06, + "loss": 0.8496, + "step": 6621 + }, + { + "epoch": 0.36446694919918543, + "grad_norm": 0.7759784460067749, + "learning_rate": 9.221834717259646e-06, + "loss": 0.7736, + "step": 6622 + }, + { + "epoch": 0.36452198800154106, + "grad_norm": 0.6929076313972473, + "learning_rate": 9.221602465019449e-06, + "loss": 0.7759, + "step": 6623 + }, + { + "epoch": 0.36457702680389675, + "grad_norm": 0.7323315143585205, + "learning_rate": 9.221370181050817e-06, + "loss": 0.7958, + "step": 6624 + }, + { + "epoch": 0.3646320656062524, + "grad_norm": 0.7177294492721558, + "learning_rate": 9.221137865355496e-06, + "loss": 0.8405, + "step": 6625 + }, + { + "epoch": 0.3646871044086081, + "grad_norm": 0.7425093650817871, + "learning_rate": 9.220905517935235e-06, + "loss": 0.7722, + "step": 6626 + }, + { + "epoch": 0.3647421432109637, + "grad_norm": 0.8761040568351746, + "learning_rate": 9.220673138791775e-06, + "loss": 0.8617, + "step": 6627 + }, + { + "epoch": 0.3647971820133194, + "grad_norm": 0.927509069442749, + "learning_rate": 9.220440727926869e-06, + "loss": 0.7839, + "step": 6628 + }, + { + "epoch": 0.36485222081567503, + "grad_norm": 0.874399721622467, + "learning_rate": 9.220208285342258e-06, + "loss": 0.9697, + "step": 6629 + }, + { + "epoch": 0.3649072596180307, + "grad_norm": 0.931384801864624, + "learning_rate": 9.219975811039691e-06, + "loss": 0.8142, + "step": 6630 + }, + { + "epoch": 0.36496229842038636, + "grad_norm": 0.8567885160446167, + "learning_rate": 9.219743305020916e-06, + "loss": 0.7623, + "step": 6631 + }, + { + "epoch": 0.36501733722274204, + "grad_norm": 0.7287514209747314, + "learning_rate": 9.21951076728768e-06, + "loss": 0.8044, + "step": 6632 + }, + { + "epoch": 0.3650723760250977, + "grad_norm": 0.7234703302383423, + "learning_rate": 9.21927819784173e-06, + "loss": 0.7736, + "step": 6633 + }, + { + "epoch": 0.36512741482745337, + "grad_norm": 0.7174978256225586, + "learning_rate": 9.219045596684815e-06, + "loss": 0.7658, + "step": 6634 + }, + { + "epoch": 0.365182453629809, + "grad_norm": 0.751075804233551, + "learning_rate": 9.218812963818682e-06, + "loss": 0.7586, + "step": 6635 + }, + { + "epoch": 0.3652374924321647, + "grad_norm": 0.755283534526825, + "learning_rate": 9.21858029924508e-06, + "loss": 0.8904, + "step": 6636 + }, + { + "epoch": 0.3652925312345203, + "grad_norm": 0.6439716815948486, + "learning_rate": 9.21834760296576e-06, + "loss": 0.7335, + "step": 6637 + }, + { + "epoch": 0.365347570036876, + "grad_norm": 0.735285758972168, + "learning_rate": 9.218114874982467e-06, + "loss": 0.7193, + "step": 6638 + }, + { + "epoch": 0.36540260883923165, + "grad_norm": 0.7724307775497437, + "learning_rate": 9.217882115296952e-06, + "loss": 0.8322, + "step": 6639 + }, + { + "epoch": 0.36545764764158734, + "grad_norm": 0.7771303653717041, + "learning_rate": 9.217649323910964e-06, + "loss": 0.7952, + "step": 6640 + }, + { + "epoch": 0.36551268644394297, + "grad_norm": 0.7753337621688843, + "learning_rate": 9.217416500826251e-06, + "loss": 0.8501, + "step": 6641 + }, + { + "epoch": 0.36556772524629866, + "grad_norm": 0.8104514479637146, + "learning_rate": 9.217183646044567e-06, + "loss": 0.8503, + "step": 6642 + }, + { + "epoch": 0.3656227640486543, + "grad_norm": 0.7191929221153259, + "learning_rate": 9.21695075956766e-06, + "loss": 0.7578, + "step": 6643 + }, + { + "epoch": 0.36567780285101, + "grad_norm": 0.745837926864624, + "learning_rate": 9.216717841397277e-06, + "loss": 0.819, + "step": 6644 + }, + { + "epoch": 0.3657328416533656, + "grad_norm": 0.7019662261009216, + "learning_rate": 9.216484891535174e-06, + "loss": 0.8024, + "step": 6645 + }, + { + "epoch": 0.3657878804557213, + "grad_norm": 0.9709738492965698, + "learning_rate": 9.216251909983095e-06, + "loss": 0.7653, + "step": 6646 + }, + { + "epoch": 0.36584291925807694, + "grad_norm": 0.7973032593727112, + "learning_rate": 9.2160188967428e-06, + "loss": 0.8071, + "step": 6647 + }, + { + "epoch": 0.3658979580604326, + "grad_norm": 0.6945796012878418, + "learning_rate": 9.215785851816034e-06, + "loss": 0.6831, + "step": 6648 + }, + { + "epoch": 0.36595299686278826, + "grad_norm": 0.8685100674629211, + "learning_rate": 9.21555277520455e-06, + "loss": 0.821, + "step": 6649 + }, + { + "epoch": 0.36600803566514395, + "grad_norm": 1.0164310932159424, + "learning_rate": 9.2153196669101e-06, + "loss": 0.7861, + "step": 6650 + }, + { + "epoch": 0.3660630744674996, + "grad_norm": 0.8572850227355957, + "learning_rate": 9.215086526934435e-06, + "loss": 0.7982, + "step": 6651 + }, + { + "epoch": 0.36611811326985527, + "grad_norm": 0.7481987476348877, + "learning_rate": 9.214853355279307e-06, + "loss": 0.8258, + "step": 6652 + }, + { + "epoch": 0.3661731520722109, + "grad_norm": 0.750344455242157, + "learning_rate": 9.214620151946472e-06, + "loss": 0.7842, + "step": 6653 + }, + { + "epoch": 0.3662281908745666, + "grad_norm": 1.0266414880752563, + "learning_rate": 9.214386916937678e-06, + "loss": 0.7313, + "step": 6654 + }, + { + "epoch": 0.36628322967692223, + "grad_norm": 0.7913589477539062, + "learning_rate": 9.214153650254682e-06, + "loss": 0.8251, + "step": 6655 + }, + { + "epoch": 0.3663382684792779, + "grad_norm": 0.7185465693473816, + "learning_rate": 9.213920351899235e-06, + "loss": 0.7145, + "step": 6656 + }, + { + "epoch": 0.36639330728163355, + "grad_norm": 0.7185063362121582, + "learning_rate": 9.213687021873088e-06, + "loss": 0.8321, + "step": 6657 + }, + { + "epoch": 0.3664483460839892, + "grad_norm": 0.8380091190338135, + "learning_rate": 9.213453660178e-06, + "loss": 0.8293, + "step": 6658 + }, + { + "epoch": 0.3665033848863449, + "grad_norm": 0.7569485306739807, + "learning_rate": 9.21322026681572e-06, + "loss": 0.7201, + "step": 6659 + }, + { + "epoch": 0.3665584236887005, + "grad_norm": 0.7212445735931396, + "learning_rate": 9.212986841788005e-06, + "loss": 0.7869, + "step": 6660 + }, + { + "epoch": 0.3666134624910562, + "grad_norm": 0.9435489773750305, + "learning_rate": 9.212753385096612e-06, + "loss": 0.8469, + "step": 6661 + }, + { + "epoch": 0.36666850129341183, + "grad_norm": 0.6609265208244324, + "learning_rate": 9.212519896743289e-06, + "loss": 0.6446, + "step": 6662 + }, + { + "epoch": 0.3667235400957675, + "grad_norm": 0.7232604026794434, + "learning_rate": 9.212286376729794e-06, + "loss": 0.7138, + "step": 6663 + }, + { + "epoch": 0.36677857889812315, + "grad_norm": 0.7276197075843811, + "learning_rate": 9.212052825057882e-06, + "loss": 0.725, + "step": 6664 + }, + { + "epoch": 0.36683361770047884, + "grad_norm": 0.7029727101325989, + "learning_rate": 9.21181924172931e-06, + "loss": 0.6973, + "step": 6665 + }, + { + "epoch": 0.3668886565028345, + "grad_norm": 0.7292968034744263, + "learning_rate": 9.21158562674583e-06, + "loss": 0.6984, + "step": 6666 + }, + { + "epoch": 0.36694369530519017, + "grad_norm": 0.6977009177207947, + "learning_rate": 9.2113519801092e-06, + "loss": 0.7752, + "step": 6667 + }, + { + "epoch": 0.3669987341075458, + "grad_norm": 0.8019471764564514, + "learning_rate": 9.211118301821176e-06, + "loss": 0.7481, + "step": 6668 + }, + { + "epoch": 0.3670537729099015, + "grad_norm": 0.8097867965698242, + "learning_rate": 9.210884591883516e-06, + "loss": 0.8077, + "step": 6669 + }, + { + "epoch": 0.3671088117122571, + "grad_norm": 1.1622828245162964, + "learning_rate": 9.210650850297973e-06, + "loss": 0.8053, + "step": 6670 + }, + { + "epoch": 0.3671638505146128, + "grad_norm": 0.8188957571983337, + "learning_rate": 9.210417077066304e-06, + "loss": 0.7731, + "step": 6671 + }, + { + "epoch": 0.36721888931696844, + "grad_norm": 0.8531584739685059, + "learning_rate": 9.210183272190269e-06, + "loss": 0.8183, + "step": 6672 + }, + { + "epoch": 0.36727392811932413, + "grad_norm": 0.8007203936576843, + "learning_rate": 9.209949435671624e-06, + "loss": 0.7906, + "step": 6673 + }, + { + "epoch": 0.36732896692167977, + "grad_norm": 0.8284860253334045, + "learning_rate": 9.209715567512126e-06, + "loss": 0.7845, + "step": 6674 + }, + { + "epoch": 0.36738400572403546, + "grad_norm": 0.7735304236412048, + "learning_rate": 9.209481667713533e-06, + "loss": 0.7333, + "step": 6675 + }, + { + "epoch": 0.3674390445263911, + "grad_norm": 0.7390912771224976, + "learning_rate": 9.209247736277601e-06, + "loss": 0.7992, + "step": 6676 + }, + { + "epoch": 0.3674940833287468, + "grad_norm": 0.6871926784515381, + "learning_rate": 9.209013773206091e-06, + "loss": 0.7765, + "step": 6677 + }, + { + "epoch": 0.3675491221311024, + "grad_norm": 0.7241746187210083, + "learning_rate": 9.208779778500758e-06, + "loss": 0.7124, + "step": 6678 + }, + { + "epoch": 0.3676041609334581, + "grad_norm": 0.7362630367279053, + "learning_rate": 9.208545752163365e-06, + "loss": 0.7695, + "step": 6679 + }, + { + "epoch": 0.36765919973581374, + "grad_norm": 0.7577944993972778, + "learning_rate": 9.208311694195669e-06, + "loss": 0.8302, + "step": 6680 + }, + { + "epoch": 0.3677142385381694, + "grad_norm": 0.7182355523109436, + "learning_rate": 9.208077604599427e-06, + "loss": 0.8182, + "step": 6681 + }, + { + "epoch": 0.36776927734052506, + "grad_norm": 0.7636679410934448, + "learning_rate": 9.207843483376402e-06, + "loss": 0.7266, + "step": 6682 + }, + { + "epoch": 0.36782431614288075, + "grad_norm": 0.7325936555862427, + "learning_rate": 9.207609330528349e-06, + "loss": 0.735, + "step": 6683 + }, + { + "epoch": 0.3678793549452364, + "grad_norm": 1.1119143962860107, + "learning_rate": 9.207375146057033e-06, + "loss": 1.0124, + "step": 6684 + }, + { + "epoch": 0.36793439374759207, + "grad_norm": 0.7694228291511536, + "learning_rate": 9.207140929964212e-06, + "loss": 0.7803, + "step": 6685 + }, + { + "epoch": 0.3679894325499477, + "grad_norm": 0.7628658413887024, + "learning_rate": 9.206906682251644e-06, + "loss": 0.8057, + "step": 6686 + }, + { + "epoch": 0.3680444713523034, + "grad_norm": 0.766266942024231, + "learning_rate": 9.206672402921092e-06, + "loss": 0.7827, + "step": 6687 + }, + { + "epoch": 0.368099510154659, + "grad_norm": 0.7355746626853943, + "learning_rate": 9.206438091974316e-06, + "loss": 0.8146, + "step": 6688 + }, + { + "epoch": 0.3681545489570147, + "grad_norm": 0.8464547395706177, + "learning_rate": 9.20620374941308e-06, + "loss": 0.8296, + "step": 6689 + }, + { + "epoch": 0.36820958775937035, + "grad_norm": 0.7113955616950989, + "learning_rate": 9.20596937523914e-06, + "loss": 0.7621, + "step": 6690 + }, + { + "epoch": 0.36826462656172604, + "grad_norm": 0.7141324877738953, + "learning_rate": 9.205734969454259e-06, + "loss": 0.738, + "step": 6691 + }, + { + "epoch": 0.36831966536408167, + "grad_norm": 0.7576237320899963, + "learning_rate": 9.2055005320602e-06, + "loss": 0.7727, + "step": 6692 + }, + { + "epoch": 0.36837470416643736, + "grad_norm": 0.7448444962501526, + "learning_rate": 9.205266063058727e-06, + "loss": 0.8238, + "step": 6693 + }, + { + "epoch": 0.368429742968793, + "grad_norm": 0.7441811561584473, + "learning_rate": 9.205031562451599e-06, + "loss": 0.7518, + "step": 6694 + }, + { + "epoch": 0.3684847817711487, + "grad_norm": 0.9284115433692932, + "learning_rate": 9.20479703024058e-06, + "loss": 0.817, + "step": 6695 + }, + { + "epoch": 0.3685398205735043, + "grad_norm": 0.7019243836402893, + "learning_rate": 9.204562466427431e-06, + "loss": 0.7403, + "step": 6696 + }, + { + "epoch": 0.36859485937586, + "grad_norm": 0.6345306634902954, + "learning_rate": 9.204327871013917e-06, + "loss": 0.7058, + "step": 6697 + }, + { + "epoch": 0.36864989817821564, + "grad_norm": 0.7375063300132751, + "learning_rate": 9.2040932440018e-06, + "loss": 0.831, + "step": 6698 + }, + { + "epoch": 0.36870493698057133, + "grad_norm": 0.8213731050491333, + "learning_rate": 9.203858585392842e-06, + "loss": 0.7677, + "step": 6699 + }, + { + "epoch": 0.36875997578292696, + "grad_norm": 0.7114601731300354, + "learning_rate": 9.203623895188809e-06, + "loss": 0.8015, + "step": 6700 + }, + { + "epoch": 0.3688150145852826, + "grad_norm": 0.7707667350769043, + "learning_rate": 9.203389173391463e-06, + "loss": 0.7758, + "step": 6701 + }, + { + "epoch": 0.3688700533876383, + "grad_norm": 0.7374396920204163, + "learning_rate": 9.203154420002572e-06, + "loss": 0.7583, + "step": 6702 + }, + { + "epoch": 0.3689250921899939, + "grad_norm": 0.7156866192817688, + "learning_rate": 9.202919635023895e-06, + "loss": 0.8173, + "step": 6703 + }, + { + "epoch": 0.3689801309923496, + "grad_norm": 0.6811904311180115, + "learning_rate": 9.2026848184572e-06, + "loss": 0.7441, + "step": 6704 + }, + { + "epoch": 0.36903516979470524, + "grad_norm": 0.7515163421630859, + "learning_rate": 9.20244997030425e-06, + "loss": 0.7927, + "step": 6705 + }, + { + "epoch": 0.36909020859706093, + "grad_norm": 0.761116087436676, + "learning_rate": 9.202215090566813e-06, + "loss": 0.7686, + "step": 6706 + }, + { + "epoch": 0.36914524739941657, + "grad_norm": 0.8726711869239807, + "learning_rate": 9.20198017924665e-06, + "loss": 0.7831, + "step": 6707 + }, + { + "epoch": 0.36920028620177225, + "grad_norm": 0.6868153810501099, + "learning_rate": 9.20174523634553e-06, + "loss": 0.7855, + "step": 6708 + }, + { + "epoch": 0.3692553250041279, + "grad_norm": 0.7140498757362366, + "learning_rate": 9.201510261865218e-06, + "loss": 0.8144, + "step": 6709 + }, + { + "epoch": 0.3693103638064836, + "grad_norm": 0.8745181560516357, + "learning_rate": 9.201275255807478e-06, + "loss": 0.9204, + "step": 6710 + }, + { + "epoch": 0.3693654026088392, + "grad_norm": 0.6535945534706116, + "learning_rate": 9.20104021817408e-06, + "loss": 0.7729, + "step": 6711 + }, + { + "epoch": 0.3694204414111949, + "grad_norm": 0.655857503414154, + "learning_rate": 9.200805148966785e-06, + "loss": 0.8373, + "step": 6712 + }, + { + "epoch": 0.36947548021355053, + "grad_norm": 0.8393271565437317, + "learning_rate": 9.200570048187365e-06, + "loss": 0.8532, + "step": 6713 + }, + { + "epoch": 0.3695305190159062, + "grad_norm": 0.7484574913978577, + "learning_rate": 9.200334915837585e-06, + "loss": 0.8411, + "step": 6714 + }, + { + "epoch": 0.36958555781826186, + "grad_norm": 0.9913665652275085, + "learning_rate": 9.200099751919212e-06, + "loss": 0.9011, + "step": 6715 + }, + { + "epoch": 0.36964059662061755, + "grad_norm": 0.7314063310623169, + "learning_rate": 9.199864556434013e-06, + "loss": 0.7184, + "step": 6716 + }, + { + "epoch": 0.3696956354229732, + "grad_norm": 0.7881553173065186, + "learning_rate": 9.199629329383758e-06, + "loss": 0.796, + "step": 6717 + }, + { + "epoch": 0.36975067422532887, + "grad_norm": 0.7440283298492432, + "learning_rate": 9.199394070770212e-06, + "loss": 0.7472, + "step": 6718 + }, + { + "epoch": 0.3698057130276845, + "grad_norm": 0.6916326880455017, + "learning_rate": 9.199158780595144e-06, + "loss": 0.6808, + "step": 6719 + }, + { + "epoch": 0.3698607518300402, + "grad_norm": 0.8482714295387268, + "learning_rate": 9.198923458860323e-06, + "loss": 0.7795, + "step": 6720 + }, + { + "epoch": 0.3699157906323958, + "grad_norm": 0.7541999816894531, + "learning_rate": 9.198688105567516e-06, + "loss": 0.7917, + "step": 6721 + }, + { + "epoch": 0.3699708294347515, + "grad_norm": 0.794335126876831, + "learning_rate": 9.198452720718494e-06, + "loss": 0.8463, + "step": 6722 + }, + { + "epoch": 0.37002586823710715, + "grad_norm": 0.7866827845573425, + "learning_rate": 9.198217304315025e-06, + "loss": 0.7938, + "step": 6723 + }, + { + "epoch": 0.37008090703946284, + "grad_norm": 0.7393556833267212, + "learning_rate": 9.19798185635888e-06, + "loss": 0.7825, + "step": 6724 + }, + { + "epoch": 0.37013594584181847, + "grad_norm": 0.7131090760231018, + "learning_rate": 9.197746376851825e-06, + "loss": 0.7184, + "step": 6725 + }, + { + "epoch": 0.37019098464417416, + "grad_norm": 0.7054039239883423, + "learning_rate": 9.197510865795634e-06, + "loss": 0.7458, + "step": 6726 + }, + { + "epoch": 0.3702460234465298, + "grad_norm": 0.7437009811401367, + "learning_rate": 9.197275323192073e-06, + "loss": 0.7921, + "step": 6727 + }, + { + "epoch": 0.3703010622488855, + "grad_norm": 1.0703076124191284, + "learning_rate": 9.197039749042916e-06, + "loss": 0.771, + "step": 6728 + }, + { + "epoch": 0.3703561010512411, + "grad_norm": 0.8278045654296875, + "learning_rate": 9.196804143349929e-06, + "loss": 0.8984, + "step": 6729 + }, + { + "epoch": 0.3704111398535968, + "grad_norm": 0.7713067531585693, + "learning_rate": 9.196568506114887e-06, + "loss": 0.7702, + "step": 6730 + }, + { + "epoch": 0.37046617865595244, + "grad_norm": 0.9040505290031433, + "learning_rate": 9.19633283733956e-06, + "loss": 0.7113, + "step": 6731 + }, + { + "epoch": 0.3705212174583081, + "grad_norm": 0.8853700757026672, + "learning_rate": 9.196097137025718e-06, + "loss": 0.8445, + "step": 6732 + }, + { + "epoch": 0.37057625626066376, + "grad_norm": 0.6870817542076111, + "learning_rate": 9.195861405175133e-06, + "loss": 0.7613, + "step": 6733 + }, + { + "epoch": 0.37063129506301945, + "grad_norm": 0.7539152503013611, + "learning_rate": 9.195625641789579e-06, + "loss": 0.7478, + "step": 6734 + }, + { + "epoch": 0.3706863338653751, + "grad_norm": 0.7084356546401978, + "learning_rate": 9.195389846870822e-06, + "loss": 0.7803, + "step": 6735 + }, + { + "epoch": 0.3707413726677308, + "grad_norm": 0.7883948087692261, + "learning_rate": 9.19515402042064e-06, + "loss": 0.8606, + "step": 6736 + }, + { + "epoch": 0.3707964114700864, + "grad_norm": 0.714948296546936, + "learning_rate": 9.194918162440804e-06, + "loss": 0.8066, + "step": 6737 + }, + { + "epoch": 0.3708514502724421, + "grad_norm": 0.7110786437988281, + "learning_rate": 9.194682272933085e-06, + "loss": 0.7439, + "step": 6738 + }, + { + "epoch": 0.37090648907479773, + "grad_norm": 0.7281045317649841, + "learning_rate": 9.194446351899257e-06, + "loss": 0.7772, + "step": 6739 + }, + { + "epoch": 0.3709615278771534, + "grad_norm": 0.7351245880126953, + "learning_rate": 9.194210399341093e-06, + "loss": 0.8777, + "step": 6740 + }, + { + "epoch": 0.37101656667950905, + "grad_norm": 0.8028532266616821, + "learning_rate": 9.193974415260367e-06, + "loss": 0.7461, + "step": 6741 + }, + { + "epoch": 0.37107160548186474, + "grad_norm": 0.8015451431274414, + "learning_rate": 9.19373839965885e-06, + "loss": 0.8006, + "step": 6742 + }, + { + "epoch": 0.3711266442842204, + "grad_norm": 0.9567442536354065, + "learning_rate": 9.193502352538321e-06, + "loss": 0.8636, + "step": 6743 + }, + { + "epoch": 0.371181683086576, + "grad_norm": 1.1413114070892334, + "learning_rate": 9.193266273900547e-06, + "loss": 0.8976, + "step": 6744 + }, + { + "epoch": 0.3712367218889317, + "grad_norm": 0.6971789002418518, + "learning_rate": 9.19303016374731e-06, + "loss": 0.7419, + "step": 6745 + }, + { + "epoch": 0.37129176069128733, + "grad_norm": 0.8117435574531555, + "learning_rate": 9.192794022080378e-06, + "loss": 0.8166, + "step": 6746 + }, + { + "epoch": 0.371346799493643, + "grad_norm": 0.7748119831085205, + "learning_rate": 9.19255784890153e-06, + "loss": 0.8073, + "step": 6747 + }, + { + "epoch": 0.37140183829599865, + "grad_norm": 0.6550068259239197, + "learning_rate": 9.192321644212539e-06, + "loss": 0.6976, + "step": 6748 + }, + { + "epoch": 0.37145687709835434, + "grad_norm": 0.7931404709815979, + "learning_rate": 9.19208540801518e-06, + "loss": 0.7153, + "step": 6749 + }, + { + "epoch": 0.37151191590071, + "grad_norm": 0.7107539176940918, + "learning_rate": 9.19184914031123e-06, + "loss": 0.7616, + "step": 6750 + }, + { + "epoch": 0.37156695470306567, + "grad_norm": 0.6983848810195923, + "learning_rate": 9.191612841102463e-06, + "loss": 0.6507, + "step": 6751 + }, + { + "epoch": 0.3716219935054213, + "grad_norm": 0.7653477787971497, + "learning_rate": 9.191376510390657e-06, + "loss": 0.708, + "step": 6752 + }, + { + "epoch": 0.371677032307777, + "grad_norm": 0.8903954029083252, + "learning_rate": 9.191140148177586e-06, + "loss": 0.8131, + "step": 6753 + }, + { + "epoch": 0.3717320711101326, + "grad_norm": 0.7584933042526245, + "learning_rate": 9.190903754465028e-06, + "loss": 0.8178, + "step": 6754 + }, + { + "epoch": 0.3717871099124883, + "grad_norm": 0.7338405847549438, + "learning_rate": 9.19066732925476e-06, + "loss": 0.7717, + "step": 6755 + }, + { + "epoch": 0.37184214871484395, + "grad_norm": 0.764944851398468, + "learning_rate": 9.190430872548557e-06, + "loss": 0.7762, + "step": 6756 + }, + { + "epoch": 0.37189718751719963, + "grad_norm": 0.7362231612205505, + "learning_rate": 9.190194384348199e-06, + "loss": 0.8277, + "step": 6757 + }, + { + "epoch": 0.37195222631955527, + "grad_norm": 0.7462226748466492, + "learning_rate": 9.18995786465546e-06, + "loss": 0.7362, + "step": 6758 + }, + { + "epoch": 0.37200726512191096, + "grad_norm": 0.7769725322723389, + "learning_rate": 9.18972131347212e-06, + "loss": 0.8217, + "step": 6759 + }, + { + "epoch": 0.3720623039242666, + "grad_norm": 0.7263969779014587, + "learning_rate": 9.189484730799956e-06, + "loss": 0.7719, + "step": 6760 + }, + { + "epoch": 0.3721173427266223, + "grad_norm": 0.7612473964691162, + "learning_rate": 9.189248116640746e-06, + "loss": 0.7149, + "step": 6761 + }, + { + "epoch": 0.3721723815289779, + "grad_norm": 0.6813042759895325, + "learning_rate": 9.189011470996268e-06, + "loss": 0.7119, + "step": 6762 + }, + { + "epoch": 0.3722274203313336, + "grad_norm": 0.7376571297645569, + "learning_rate": 9.188774793868302e-06, + "loss": 0.7998, + "step": 6763 + }, + { + "epoch": 0.37228245913368924, + "grad_norm": 0.8592102527618408, + "learning_rate": 9.188538085258626e-06, + "loss": 0.8026, + "step": 6764 + }, + { + "epoch": 0.3723374979360449, + "grad_norm": 0.7666613459587097, + "learning_rate": 9.188301345169017e-06, + "loss": 0.8571, + "step": 6765 + }, + { + "epoch": 0.37239253673840056, + "grad_norm": 0.7118985652923584, + "learning_rate": 9.188064573601258e-06, + "loss": 0.7637, + "step": 6766 + }, + { + "epoch": 0.37244757554075625, + "grad_norm": 0.8247082233428955, + "learning_rate": 9.187827770557127e-06, + "loss": 0.8209, + "step": 6767 + }, + { + "epoch": 0.3725026143431119, + "grad_norm": 0.7259567975997925, + "learning_rate": 9.187590936038403e-06, + "loss": 0.7918, + "step": 6768 + }, + { + "epoch": 0.37255765314546757, + "grad_norm": 0.7409893274307251, + "learning_rate": 9.187354070046867e-06, + "loss": 0.8004, + "step": 6769 + }, + { + "epoch": 0.3726126919478232, + "grad_norm": 0.8163084387779236, + "learning_rate": 9.187117172584298e-06, + "loss": 0.8452, + "step": 6770 + }, + { + "epoch": 0.3726677307501789, + "grad_norm": 0.9241586923599243, + "learning_rate": 9.186880243652477e-06, + "loss": 0.8939, + "step": 6771 + }, + { + "epoch": 0.3727227695525345, + "grad_norm": 0.710434079170227, + "learning_rate": 9.186643283253185e-06, + "loss": 0.7337, + "step": 6772 + }, + { + "epoch": 0.3727778083548902, + "grad_norm": 0.7850505709648132, + "learning_rate": 9.186406291388203e-06, + "loss": 0.7892, + "step": 6773 + }, + { + "epoch": 0.37283284715724585, + "grad_norm": 0.813979983329773, + "learning_rate": 9.186169268059311e-06, + "loss": 0.7993, + "step": 6774 + }, + { + "epoch": 0.37288788595960154, + "grad_norm": 0.7923213243484497, + "learning_rate": 9.185932213268292e-06, + "loss": 0.7501, + "step": 6775 + }, + { + "epoch": 0.3729429247619572, + "grad_norm": 0.7923155426979065, + "learning_rate": 9.185695127016928e-06, + "loss": 0.8435, + "step": 6776 + }, + { + "epoch": 0.37299796356431286, + "grad_norm": 0.69893479347229, + "learning_rate": 9.185458009306999e-06, + "loss": 0.7155, + "step": 6777 + }, + { + "epoch": 0.3730530023666685, + "grad_norm": 0.7848305106163025, + "learning_rate": 9.185220860140289e-06, + "loss": 0.7971, + "step": 6778 + }, + { + "epoch": 0.3731080411690242, + "grad_norm": 0.6707655787467957, + "learning_rate": 9.184983679518578e-06, + "loss": 0.6939, + "step": 6779 + }, + { + "epoch": 0.3731630799713798, + "grad_norm": 0.6612532734870911, + "learning_rate": 9.18474646744365e-06, + "loss": 0.7361, + "step": 6780 + }, + { + "epoch": 0.3732181187737355, + "grad_norm": 0.7753985524177551, + "learning_rate": 9.184509223917288e-06, + "loss": 0.7263, + "step": 6781 + }, + { + "epoch": 0.37327315757609114, + "grad_norm": 0.6856646537780762, + "learning_rate": 9.184271948941275e-06, + "loss": 0.6923, + "step": 6782 + }, + { + "epoch": 0.37332819637844683, + "grad_norm": 0.7223647832870483, + "learning_rate": 9.184034642517393e-06, + "loss": 0.793, + "step": 6783 + }, + { + "epoch": 0.37338323518080246, + "grad_norm": 0.7428838014602661, + "learning_rate": 9.183797304647428e-06, + "loss": 0.7781, + "step": 6784 + }, + { + "epoch": 0.37343827398315815, + "grad_norm": 0.7301773428916931, + "learning_rate": 9.183559935333161e-06, + "loss": 0.7964, + "step": 6785 + }, + { + "epoch": 0.3734933127855138, + "grad_norm": 0.7883384823799133, + "learning_rate": 9.183322534576378e-06, + "loss": 0.8904, + "step": 6786 + }, + { + "epoch": 0.3735483515878694, + "grad_norm": 0.7943564653396606, + "learning_rate": 9.183085102378864e-06, + "loss": 0.7229, + "step": 6787 + }, + { + "epoch": 0.3736033903902251, + "grad_norm": 0.7385129928588867, + "learning_rate": 9.1828476387424e-06, + "loss": 0.7967, + "step": 6788 + }, + { + "epoch": 0.37365842919258074, + "grad_norm": 0.7968102097511292, + "learning_rate": 9.182610143668775e-06, + "loss": 0.8016, + "step": 6789 + }, + { + "epoch": 0.37371346799493643, + "grad_norm": 0.7810283303260803, + "learning_rate": 9.18237261715977e-06, + "loss": 0.8956, + "step": 6790 + }, + { + "epoch": 0.37376850679729207, + "grad_norm": 0.7110065221786499, + "learning_rate": 9.182135059217172e-06, + "loss": 0.7808, + "step": 6791 + }, + { + "epoch": 0.37382354559964776, + "grad_norm": 0.7513633370399475, + "learning_rate": 9.181897469842767e-06, + "loss": 0.8236, + "step": 6792 + }, + { + "epoch": 0.3738785844020034, + "grad_norm": 0.7850426435470581, + "learning_rate": 9.18165984903834e-06, + "loss": 0.8642, + "step": 6793 + }, + { + "epoch": 0.3739336232043591, + "grad_norm": 1.4948225021362305, + "learning_rate": 9.181422196805676e-06, + "loss": 0.8765, + "step": 6794 + }, + { + "epoch": 0.3739886620067147, + "grad_norm": 0.8242343068122864, + "learning_rate": 9.181184513146563e-06, + "loss": 0.7213, + "step": 6795 + }, + { + "epoch": 0.3740437008090704, + "grad_norm": 0.8017476797103882, + "learning_rate": 9.180946798062786e-06, + "loss": 0.655, + "step": 6796 + }, + { + "epoch": 0.37409873961142603, + "grad_norm": 0.9573387503623962, + "learning_rate": 9.180709051556132e-06, + "loss": 0.8674, + "step": 6797 + }, + { + "epoch": 0.3741537784137817, + "grad_norm": 0.7575511932373047, + "learning_rate": 9.180471273628388e-06, + "loss": 0.8672, + "step": 6798 + }, + { + "epoch": 0.37420881721613736, + "grad_norm": 0.7723323702812195, + "learning_rate": 9.180233464281343e-06, + "loss": 0.7698, + "step": 6799 + }, + { + "epoch": 0.37426385601849305, + "grad_norm": 0.8352731466293335, + "learning_rate": 9.17999562351678e-06, + "loss": 0.9248, + "step": 6800 + }, + { + "epoch": 0.3743188948208487, + "grad_norm": 0.7459322214126587, + "learning_rate": 9.179757751336488e-06, + "loss": 0.7561, + "step": 6801 + }, + { + "epoch": 0.37437393362320437, + "grad_norm": 0.8053051829338074, + "learning_rate": 9.179519847742257e-06, + "loss": 0.8743, + "step": 6802 + }, + { + "epoch": 0.37442897242556, + "grad_norm": 0.7781768441200256, + "learning_rate": 9.179281912735873e-06, + "loss": 0.7426, + "step": 6803 + }, + { + "epoch": 0.3744840112279157, + "grad_norm": 0.6812007427215576, + "learning_rate": 9.179043946319126e-06, + "loss": 0.761, + "step": 6804 + }, + { + "epoch": 0.3745390500302713, + "grad_norm": 0.8327108025550842, + "learning_rate": 9.178805948493803e-06, + "loss": 0.7633, + "step": 6805 + }, + { + "epoch": 0.374594088832627, + "grad_norm": 0.7519007921218872, + "learning_rate": 9.178567919261692e-06, + "loss": 0.8268, + "step": 6806 + }, + { + "epoch": 0.37464912763498265, + "grad_norm": 0.7507897019386292, + "learning_rate": 9.178329858624584e-06, + "loss": 0.8734, + "step": 6807 + }, + { + "epoch": 0.37470416643733834, + "grad_norm": 0.6874666213989258, + "learning_rate": 9.178091766584267e-06, + "loss": 0.6669, + "step": 6808 + }, + { + "epoch": 0.37475920523969397, + "grad_norm": 0.6987403631210327, + "learning_rate": 9.17785364314253e-06, + "loss": 0.7627, + "step": 6809 + }, + { + "epoch": 0.37481424404204966, + "grad_norm": 0.7777343392372131, + "learning_rate": 9.177615488301163e-06, + "loss": 0.7637, + "step": 6810 + }, + { + "epoch": 0.3748692828444053, + "grad_norm": 0.71980881690979, + "learning_rate": 9.177377302061958e-06, + "loss": 0.7964, + "step": 6811 + }, + { + "epoch": 0.374924321646761, + "grad_norm": 0.627328634262085, + "learning_rate": 9.177139084426704e-06, + "loss": 0.6862, + "step": 6812 + }, + { + "epoch": 0.3749793604491166, + "grad_norm": 0.7099852561950684, + "learning_rate": 9.176900835397188e-06, + "loss": 0.7592, + "step": 6813 + }, + { + "epoch": 0.3750343992514723, + "grad_norm": 0.7880212664604187, + "learning_rate": 9.176662554975205e-06, + "loss": 0.756, + "step": 6814 + }, + { + "epoch": 0.37508943805382794, + "grad_norm": 0.7347460389137268, + "learning_rate": 9.176424243162546e-06, + "loss": 0.8537, + "step": 6815 + }, + { + "epoch": 0.37514447685618363, + "grad_norm": 0.7020999789237976, + "learning_rate": 9.176185899960996e-06, + "loss": 0.7844, + "step": 6816 + }, + { + "epoch": 0.37519951565853926, + "grad_norm": 0.6857696175575256, + "learning_rate": 9.175947525372355e-06, + "loss": 0.8491, + "step": 6817 + }, + { + "epoch": 0.37525455446089495, + "grad_norm": 0.6882391571998596, + "learning_rate": 9.175709119398409e-06, + "loss": 0.7797, + "step": 6818 + }, + { + "epoch": 0.3753095932632506, + "grad_norm": 0.7788485288619995, + "learning_rate": 9.17547068204095e-06, + "loss": 0.6898, + "step": 6819 + }, + { + "epoch": 0.3753646320656063, + "grad_norm": 0.8529300093650818, + "learning_rate": 9.17523221330177e-06, + "loss": 0.8113, + "step": 6820 + }, + { + "epoch": 0.3754196708679619, + "grad_norm": 0.6297540068626404, + "learning_rate": 9.174993713182663e-06, + "loss": 0.7133, + "step": 6821 + }, + { + "epoch": 0.3754747096703176, + "grad_norm": 0.8225051760673523, + "learning_rate": 9.174755181685422e-06, + "loss": 0.83, + "step": 6822 + }, + { + "epoch": 0.37552974847267323, + "grad_norm": 0.7445290684700012, + "learning_rate": 9.174516618811838e-06, + "loss": 0.8597, + "step": 6823 + }, + { + "epoch": 0.3755847872750289, + "grad_norm": 0.7890744209289551, + "learning_rate": 9.174278024563706e-06, + "loss": 0.8021, + "step": 6824 + }, + { + "epoch": 0.37563982607738455, + "grad_norm": 0.644434928894043, + "learning_rate": 9.174039398942815e-06, + "loss": 0.7154, + "step": 6825 + }, + { + "epoch": 0.37569486487974024, + "grad_norm": 0.7664980292320251, + "learning_rate": 9.173800741950962e-06, + "loss": 0.8496, + "step": 6826 + }, + { + "epoch": 0.3757499036820959, + "grad_norm": 0.8062339425086975, + "learning_rate": 9.173562053589942e-06, + "loss": 0.7736, + "step": 6827 + }, + { + "epoch": 0.37580494248445157, + "grad_norm": 0.6334213018417358, + "learning_rate": 9.173323333861543e-06, + "loss": 0.6513, + "step": 6828 + }, + { + "epoch": 0.3758599812868072, + "grad_norm": 0.6825501322746277, + "learning_rate": 9.173084582767567e-06, + "loss": 0.755, + "step": 6829 + }, + { + "epoch": 0.37591502008916283, + "grad_norm": 0.7353835105895996, + "learning_rate": 9.172845800309801e-06, + "loss": 0.7783, + "step": 6830 + }, + { + "epoch": 0.3759700588915185, + "grad_norm": 0.7830193638801575, + "learning_rate": 9.172606986490046e-06, + "loss": 0.7352, + "step": 6831 + }, + { + "epoch": 0.37602509769387416, + "grad_norm": 0.7464943528175354, + "learning_rate": 9.172368141310091e-06, + "loss": 0.6454, + "step": 6832 + }, + { + "epoch": 0.37608013649622984, + "grad_norm": 0.7171493172645569, + "learning_rate": 9.172129264771736e-06, + "loss": 0.7978, + "step": 6833 + }, + { + "epoch": 0.3761351752985855, + "grad_norm": 0.6929624676704407, + "learning_rate": 9.171890356876774e-06, + "loss": 0.8026, + "step": 6834 + }, + { + "epoch": 0.37619021410094117, + "grad_norm": 0.7240758538246155, + "learning_rate": 9.171651417627e-06, + "loss": 0.8469, + "step": 6835 + }, + { + "epoch": 0.3762452529032968, + "grad_norm": 0.7713736891746521, + "learning_rate": 9.17141244702421e-06, + "loss": 0.8307, + "step": 6836 + }, + { + "epoch": 0.3763002917056525, + "grad_norm": 0.7417639493942261, + "learning_rate": 9.171173445070203e-06, + "loss": 0.8165, + "step": 6837 + }, + { + "epoch": 0.3763553305080081, + "grad_norm": 0.811005711555481, + "learning_rate": 9.17093441176677e-06, + "loss": 0.8418, + "step": 6838 + }, + { + "epoch": 0.3764103693103638, + "grad_norm": 0.9996818900108337, + "learning_rate": 9.170695347115713e-06, + "loss": 0.851, + "step": 6839 + }, + { + "epoch": 0.37646540811271945, + "grad_norm": 0.7703381776809692, + "learning_rate": 9.170456251118824e-06, + "loss": 0.8308, + "step": 6840 + }, + { + "epoch": 0.37652044691507514, + "grad_norm": 0.7194466590881348, + "learning_rate": 9.170217123777904e-06, + "loss": 0.699, + "step": 6841 + }, + { + "epoch": 0.37657548571743077, + "grad_norm": 0.7146462202072144, + "learning_rate": 9.169977965094748e-06, + "loss": 0.8247, + "step": 6842 + }, + { + "epoch": 0.37663052451978646, + "grad_norm": 0.7490555047988892, + "learning_rate": 9.169738775071153e-06, + "loss": 0.8627, + "step": 6843 + }, + { + "epoch": 0.3766855633221421, + "grad_norm": 0.827996015548706, + "learning_rate": 9.169499553708919e-06, + "loss": 0.7454, + "step": 6844 + }, + { + "epoch": 0.3767406021244978, + "grad_norm": 0.7185913324356079, + "learning_rate": 9.16926030100984e-06, + "loss": 0.7018, + "step": 6845 + }, + { + "epoch": 0.3767956409268534, + "grad_norm": 0.7879654169082642, + "learning_rate": 9.169021016975718e-06, + "loss": 0.8144, + "step": 6846 + }, + { + "epoch": 0.3768506797292091, + "grad_norm": 0.7072417736053467, + "learning_rate": 9.168781701608352e-06, + "loss": 0.7572, + "step": 6847 + }, + { + "epoch": 0.37690571853156474, + "grad_norm": 0.7359803915023804, + "learning_rate": 9.168542354909536e-06, + "loss": 0.7712, + "step": 6848 + }, + { + "epoch": 0.3769607573339204, + "grad_norm": 0.7672479748725891, + "learning_rate": 9.168302976881072e-06, + "loss": 0.7696, + "step": 6849 + }, + { + "epoch": 0.37701579613627606, + "grad_norm": 0.7276006937026978, + "learning_rate": 9.168063567524758e-06, + "loss": 0.8235, + "step": 6850 + }, + { + "epoch": 0.37707083493863175, + "grad_norm": 0.673577606678009, + "learning_rate": 9.167824126842396e-06, + "loss": 0.6515, + "step": 6851 + }, + { + "epoch": 0.3771258737409874, + "grad_norm": 0.7257997989654541, + "learning_rate": 9.167584654835782e-06, + "loss": 0.729, + "step": 6852 + }, + { + "epoch": 0.37718091254334307, + "grad_norm": 0.6655071377754211, + "learning_rate": 9.167345151506717e-06, + "loss": 0.7917, + "step": 6853 + }, + { + "epoch": 0.3772359513456987, + "grad_norm": 0.7603726983070374, + "learning_rate": 9.167105616857002e-06, + "loss": 0.8383, + "step": 6854 + }, + { + "epoch": 0.3772909901480544, + "grad_norm": 0.7066939473152161, + "learning_rate": 9.166866050888437e-06, + "loss": 0.7589, + "step": 6855 + }, + { + "epoch": 0.37734602895041003, + "grad_norm": 0.7002355456352234, + "learning_rate": 9.16662645360282e-06, + "loss": 0.8305, + "step": 6856 + }, + { + "epoch": 0.3774010677527657, + "grad_norm": 0.9499780535697937, + "learning_rate": 9.166386825001957e-06, + "loss": 0.78, + "step": 6857 + }, + { + "epoch": 0.37745610655512135, + "grad_norm": 0.7136938571929932, + "learning_rate": 9.166147165087645e-06, + "loss": 0.7449, + "step": 6858 + }, + { + "epoch": 0.37751114535747704, + "grad_norm": 0.740443766117096, + "learning_rate": 9.165907473861687e-06, + "loss": 0.8228, + "step": 6859 + }, + { + "epoch": 0.3775661841598327, + "grad_norm": 0.7649856209754944, + "learning_rate": 9.165667751325879e-06, + "loss": 0.7762, + "step": 6860 + }, + { + "epoch": 0.37762122296218836, + "grad_norm": 0.743251383304596, + "learning_rate": 9.165427997482032e-06, + "loss": 0.7536, + "step": 6861 + }, + { + "epoch": 0.377676261764544, + "grad_norm": 0.7023851871490479, + "learning_rate": 9.165188212331941e-06, + "loss": 0.7327, + "step": 6862 + }, + { + "epoch": 0.3777313005668997, + "grad_norm": 0.7304333448410034, + "learning_rate": 9.164948395877411e-06, + "loss": 0.8816, + "step": 6863 + }, + { + "epoch": 0.3777863393692553, + "grad_norm": 0.6666659116744995, + "learning_rate": 9.164708548120244e-06, + "loss": 0.7821, + "step": 6864 + }, + { + "epoch": 0.377841378171611, + "grad_norm": 0.6542865037918091, + "learning_rate": 9.164468669062242e-06, + "loss": 0.7044, + "step": 6865 + }, + { + "epoch": 0.37789641697396664, + "grad_norm": 0.7436043620109558, + "learning_rate": 9.16422875870521e-06, + "loss": 0.8492, + "step": 6866 + }, + { + "epoch": 0.37795145577632233, + "grad_norm": 0.7660424709320068, + "learning_rate": 9.163988817050947e-06, + "loss": 0.7236, + "step": 6867 + }, + { + "epoch": 0.37800649457867797, + "grad_norm": 0.7288914918899536, + "learning_rate": 9.16374884410126e-06, + "loss": 0.6361, + "step": 6868 + }, + { + "epoch": 0.37806153338103365, + "grad_norm": 0.884832501411438, + "learning_rate": 9.163508839857948e-06, + "loss": 0.8112, + "step": 6869 + }, + { + "epoch": 0.3781165721833893, + "grad_norm": 0.937660813331604, + "learning_rate": 9.163268804322822e-06, + "loss": 0.6405, + "step": 6870 + }, + { + "epoch": 0.378171610985745, + "grad_norm": 0.8295212388038635, + "learning_rate": 9.16302873749768e-06, + "loss": 0.8107, + "step": 6871 + }, + { + "epoch": 0.3782266497881006, + "grad_norm": 1.0573647022247314, + "learning_rate": 9.16278863938433e-06, + "loss": 0.7792, + "step": 6872 + }, + { + "epoch": 0.37828168859045624, + "grad_norm": 0.8450027108192444, + "learning_rate": 9.162548509984574e-06, + "loss": 0.8103, + "step": 6873 + }, + { + "epoch": 0.37833672739281193, + "grad_norm": 0.7372947931289673, + "learning_rate": 9.162308349300218e-06, + "loss": 0.8232, + "step": 6874 + }, + { + "epoch": 0.37839176619516757, + "grad_norm": 0.7573776841163635, + "learning_rate": 9.162068157333066e-06, + "loss": 0.773, + "step": 6875 + }, + { + "epoch": 0.37844680499752326, + "grad_norm": 0.7883201241493225, + "learning_rate": 9.161827934084924e-06, + "loss": 0.7561, + "step": 6876 + }, + { + "epoch": 0.3785018437998789, + "grad_norm": 0.7195025086402893, + "learning_rate": 9.161587679557598e-06, + "loss": 0.798, + "step": 6877 + }, + { + "epoch": 0.3785568826022346, + "grad_norm": 0.7047843337059021, + "learning_rate": 9.161347393752891e-06, + "loss": 0.8122, + "step": 6878 + }, + { + "epoch": 0.3786119214045902, + "grad_norm": 0.7354363203048706, + "learning_rate": 9.161107076672613e-06, + "loss": 0.7296, + "step": 6879 + }, + { + "epoch": 0.3786669602069459, + "grad_norm": 0.7748313546180725, + "learning_rate": 9.160866728318567e-06, + "loss": 0.9576, + "step": 6880 + }, + { + "epoch": 0.37872199900930154, + "grad_norm": 0.7197638750076294, + "learning_rate": 9.16062634869256e-06, + "loss": 0.8054, + "step": 6881 + }, + { + "epoch": 0.3787770378116572, + "grad_norm": 0.7086492776870728, + "learning_rate": 9.1603859377964e-06, + "loss": 0.8938, + "step": 6882 + }, + { + "epoch": 0.37883207661401286, + "grad_norm": 0.7764425873756409, + "learning_rate": 9.160145495631894e-06, + "loss": 0.7562, + "step": 6883 + }, + { + "epoch": 0.37888711541636855, + "grad_norm": 0.7673479914665222, + "learning_rate": 9.159905022200846e-06, + "loss": 0.6783, + "step": 6884 + }, + { + "epoch": 0.3789421542187242, + "grad_norm": 0.7323669195175171, + "learning_rate": 9.159664517505067e-06, + "loss": 0.8274, + "step": 6885 + }, + { + "epoch": 0.37899719302107987, + "grad_norm": 0.8283136487007141, + "learning_rate": 9.159423981546362e-06, + "loss": 0.7184, + "step": 6886 + }, + { + "epoch": 0.3790522318234355, + "grad_norm": 0.6949145793914795, + "learning_rate": 9.15918341432654e-06, + "loss": 0.7843, + "step": 6887 + }, + { + "epoch": 0.3791072706257912, + "grad_norm": 0.8584639430046082, + "learning_rate": 9.158942815847408e-06, + "loss": 0.71, + "step": 6888 + }, + { + "epoch": 0.3791623094281468, + "grad_norm": 0.7125271558761597, + "learning_rate": 9.158702186110777e-06, + "loss": 0.7432, + "step": 6889 + }, + { + "epoch": 0.3792173482305025, + "grad_norm": 0.6657430529594421, + "learning_rate": 9.158461525118452e-06, + "loss": 0.6715, + "step": 6890 + }, + { + "epoch": 0.37927238703285815, + "grad_norm": 0.770226240158081, + "learning_rate": 9.158220832872243e-06, + "loss": 0.7029, + "step": 6891 + }, + { + "epoch": 0.37932742583521384, + "grad_norm": 0.7697272300720215, + "learning_rate": 9.15798010937396e-06, + "loss": 0.686, + "step": 6892 + }, + { + "epoch": 0.37938246463756947, + "grad_norm": 0.7693290710449219, + "learning_rate": 9.157739354625413e-06, + "loss": 0.7669, + "step": 6893 + }, + { + "epoch": 0.37943750343992516, + "grad_norm": 0.8365996479988098, + "learning_rate": 9.157498568628406e-06, + "loss": 0.8254, + "step": 6894 + }, + { + "epoch": 0.3794925422422808, + "grad_norm": 0.8075883388519287, + "learning_rate": 9.157257751384756e-06, + "loss": 0.8311, + "step": 6895 + }, + { + "epoch": 0.3795475810446365, + "grad_norm": 0.8422812819480896, + "learning_rate": 9.15701690289627e-06, + "loss": 0.9173, + "step": 6896 + }, + { + "epoch": 0.3796026198469921, + "grad_norm": 0.7930355072021484, + "learning_rate": 9.156776023164755e-06, + "loss": 0.9376, + "step": 6897 + }, + { + "epoch": 0.3796576586493478, + "grad_norm": 0.7877563238143921, + "learning_rate": 9.156535112192026e-06, + "loss": 0.8358, + "step": 6898 + }, + { + "epoch": 0.37971269745170344, + "grad_norm": 0.7712885141372681, + "learning_rate": 9.156294169979891e-06, + "loss": 0.8781, + "step": 6899 + }, + { + "epoch": 0.37976773625405913, + "grad_norm": 0.6953728199005127, + "learning_rate": 9.156053196530162e-06, + "loss": 0.7861, + "step": 6900 + }, + { + "epoch": 0.37982277505641476, + "grad_norm": 0.9581564664840698, + "learning_rate": 9.155812191844649e-06, + "loss": 0.8294, + "step": 6901 + }, + { + "epoch": 0.37987781385877045, + "grad_norm": 0.738571286201477, + "learning_rate": 9.155571155925166e-06, + "loss": 0.7998, + "step": 6902 + }, + { + "epoch": 0.3799328526611261, + "grad_norm": 0.7059765458106995, + "learning_rate": 9.155330088773519e-06, + "loss": 0.7877, + "step": 6903 + }, + { + "epoch": 0.3799878914634818, + "grad_norm": 0.8572642207145691, + "learning_rate": 9.155088990391527e-06, + "loss": 0.7333, + "step": 6904 + }, + { + "epoch": 0.3800429302658374, + "grad_norm": 0.7442637085914612, + "learning_rate": 9.154847860780996e-06, + "loss": 0.685, + "step": 6905 + }, + { + "epoch": 0.3800979690681931, + "grad_norm": 0.7787682414054871, + "learning_rate": 9.154606699943741e-06, + "loss": 0.7893, + "step": 6906 + }, + { + "epoch": 0.38015300787054873, + "grad_norm": 0.8973822593688965, + "learning_rate": 9.154365507881574e-06, + "loss": 0.8297, + "step": 6907 + }, + { + "epoch": 0.3802080466729044, + "grad_norm": 0.7759919166564941, + "learning_rate": 9.154124284596311e-06, + "loss": 0.8257, + "step": 6908 + }, + { + "epoch": 0.38026308547526005, + "grad_norm": 0.8042850494384766, + "learning_rate": 9.153883030089759e-06, + "loss": 0.8024, + "step": 6909 + }, + { + "epoch": 0.38031812427761574, + "grad_norm": 0.8285790085792542, + "learning_rate": 9.153641744363733e-06, + "loss": 0.7824, + "step": 6910 + }, + { + "epoch": 0.3803731630799714, + "grad_norm": 0.7225445508956909, + "learning_rate": 9.15340042742005e-06, + "loss": 0.8065, + "step": 6911 + }, + { + "epoch": 0.38042820188232707, + "grad_norm": 0.7685298919677734, + "learning_rate": 9.15315907926052e-06, + "loss": 0.8151, + "step": 6912 + }, + { + "epoch": 0.3804832406846827, + "grad_norm": 0.9005589485168457, + "learning_rate": 9.152917699886958e-06, + "loss": 0.8413, + "step": 6913 + }, + { + "epoch": 0.3805382794870384, + "grad_norm": 0.8715279698371887, + "learning_rate": 9.152676289301178e-06, + "loss": 0.7233, + "step": 6914 + }, + { + "epoch": 0.380593318289394, + "grad_norm": 0.8764133453369141, + "learning_rate": 9.152434847504996e-06, + "loss": 0.783, + "step": 6915 + }, + { + "epoch": 0.38064835709174966, + "grad_norm": 0.6847019195556641, + "learning_rate": 9.152193374500225e-06, + "loss": 0.7133, + "step": 6916 + }, + { + "epoch": 0.38070339589410535, + "grad_norm": 0.7562721371650696, + "learning_rate": 9.151951870288678e-06, + "loss": 0.8155, + "step": 6917 + }, + { + "epoch": 0.380758434696461, + "grad_norm": 0.6888439059257507, + "learning_rate": 9.151710334872173e-06, + "loss": 0.6395, + "step": 6918 + }, + { + "epoch": 0.38081347349881667, + "grad_norm": 1.0951511859893799, + "learning_rate": 9.151468768252525e-06, + "loss": 0.8936, + "step": 6919 + }, + { + "epoch": 0.3808685123011723, + "grad_norm": 0.7261115908622742, + "learning_rate": 9.151227170431549e-06, + "loss": 0.7864, + "step": 6920 + }, + { + "epoch": 0.380923551103528, + "grad_norm": 1.2851859331130981, + "learning_rate": 9.150985541411061e-06, + "loss": 0.9419, + "step": 6921 + }, + { + "epoch": 0.3809785899058836, + "grad_norm": 0.7621721625328064, + "learning_rate": 9.150743881192876e-06, + "loss": 0.7773, + "step": 6922 + }, + { + "epoch": 0.3810336287082393, + "grad_norm": 0.7605605721473694, + "learning_rate": 9.150502189778811e-06, + "loss": 0.8752, + "step": 6923 + }, + { + "epoch": 0.38108866751059495, + "grad_norm": 0.8422327041625977, + "learning_rate": 9.150260467170683e-06, + "loss": 0.8555, + "step": 6924 + }, + { + "epoch": 0.38114370631295064, + "grad_norm": 0.7227829098701477, + "learning_rate": 9.15001871337031e-06, + "loss": 0.7637, + "step": 6925 + }, + { + "epoch": 0.38119874511530627, + "grad_norm": 0.6568942666053772, + "learning_rate": 9.149776928379506e-06, + "loss": 0.6944, + "step": 6926 + }, + { + "epoch": 0.38125378391766196, + "grad_norm": 0.9317567944526672, + "learning_rate": 9.149535112200087e-06, + "loss": 0.8098, + "step": 6927 + }, + { + "epoch": 0.3813088227200176, + "grad_norm": 0.6374759674072266, + "learning_rate": 9.149293264833877e-06, + "loss": 0.6654, + "step": 6928 + }, + { + "epoch": 0.3813638615223733, + "grad_norm": 0.7276837825775146, + "learning_rate": 9.149051386282685e-06, + "loss": 0.7728, + "step": 6929 + }, + { + "epoch": 0.3814189003247289, + "grad_norm": 0.7573683261871338, + "learning_rate": 9.148809476548337e-06, + "loss": 0.7681, + "step": 6930 + }, + { + "epoch": 0.3814739391270846, + "grad_norm": 0.7535703778266907, + "learning_rate": 9.148567535632647e-06, + "loss": 0.8498, + "step": 6931 + }, + { + "epoch": 0.38152897792944024, + "grad_norm": 0.7510126233100891, + "learning_rate": 9.148325563537432e-06, + "loss": 0.7874, + "step": 6932 + }, + { + "epoch": 0.3815840167317959, + "grad_norm": 0.7809224724769592, + "learning_rate": 9.148083560264515e-06, + "loss": 0.7223, + "step": 6933 + }, + { + "epoch": 0.38163905553415156, + "grad_norm": 0.7433155179023743, + "learning_rate": 9.14784152581571e-06, + "loss": 0.7914, + "step": 6934 + }, + { + "epoch": 0.38169409433650725, + "grad_norm": 0.7142858505249023, + "learning_rate": 9.14759946019284e-06, + "loss": 0.781, + "step": 6935 + }, + { + "epoch": 0.3817491331388629, + "grad_norm": 0.7910202741622925, + "learning_rate": 9.147357363397721e-06, + "loss": 0.755, + "step": 6936 + }, + { + "epoch": 0.3818041719412186, + "grad_norm": 1.007727026939392, + "learning_rate": 9.147115235432176e-06, + "loss": 0.7809, + "step": 6937 + }, + { + "epoch": 0.3818592107435742, + "grad_norm": 0.7227005362510681, + "learning_rate": 9.146873076298024e-06, + "loss": 0.7276, + "step": 6938 + }, + { + "epoch": 0.3819142495459299, + "grad_norm": 0.6945967674255371, + "learning_rate": 9.146630885997081e-06, + "loss": 0.825, + "step": 6939 + }, + { + "epoch": 0.38196928834828553, + "grad_norm": 0.6719669103622437, + "learning_rate": 9.146388664531172e-06, + "loss": 0.6486, + "step": 6940 + }, + { + "epoch": 0.3820243271506412, + "grad_norm": 0.7528467178344727, + "learning_rate": 9.146146411902115e-06, + "loss": 0.8143, + "step": 6941 + }, + { + "epoch": 0.38207936595299685, + "grad_norm": 0.6835548877716064, + "learning_rate": 9.145904128111732e-06, + "loss": 0.7742, + "step": 6942 + }, + { + "epoch": 0.38213440475535254, + "grad_norm": 0.7829870581626892, + "learning_rate": 9.145661813161844e-06, + "loss": 0.8147, + "step": 6943 + }, + { + "epoch": 0.3821894435577082, + "grad_norm": 0.6833155155181885, + "learning_rate": 9.145419467054271e-06, + "loss": 0.7615, + "step": 6944 + }, + { + "epoch": 0.38224448236006386, + "grad_norm": 0.7577275037765503, + "learning_rate": 9.145177089790833e-06, + "loss": 0.8611, + "step": 6945 + }, + { + "epoch": 0.3822995211624195, + "grad_norm": 0.7102984189987183, + "learning_rate": 9.144934681373356e-06, + "loss": 0.8373, + "step": 6946 + }, + { + "epoch": 0.3823545599647752, + "grad_norm": 0.6906121373176575, + "learning_rate": 9.144692241803658e-06, + "loss": 0.8314, + "step": 6947 + }, + { + "epoch": 0.3824095987671308, + "grad_norm": 0.7790967226028442, + "learning_rate": 9.144449771083563e-06, + "loss": 0.8285, + "step": 6948 + }, + { + "epoch": 0.3824646375694865, + "grad_norm": 0.8420237898826599, + "learning_rate": 9.144207269214893e-06, + "loss": 0.8159, + "step": 6949 + }, + { + "epoch": 0.38251967637184214, + "grad_norm": 0.7944310307502747, + "learning_rate": 9.143964736199471e-06, + "loss": 0.7981, + "step": 6950 + }, + { + "epoch": 0.38257471517419783, + "grad_norm": 0.7610076069831848, + "learning_rate": 9.14372217203912e-06, + "loss": 0.8011, + "step": 6951 + }, + { + "epoch": 0.38262975397655347, + "grad_norm": 0.7183333039283752, + "learning_rate": 9.143479576735661e-06, + "loss": 0.7504, + "step": 6952 + }, + { + "epoch": 0.38268479277890916, + "grad_norm": 0.7363573312759399, + "learning_rate": 9.14323695029092e-06, + "loss": 0.7561, + "step": 6953 + }, + { + "epoch": 0.3827398315812648, + "grad_norm": 0.7330427765846252, + "learning_rate": 9.142994292706716e-06, + "loss": 0.754, + "step": 6954 + }, + { + "epoch": 0.3827948703836205, + "grad_norm": 0.8307509422302246, + "learning_rate": 9.142751603984879e-06, + "loss": 0.8059, + "step": 6955 + }, + { + "epoch": 0.3828499091859761, + "grad_norm": 0.7340347766876221, + "learning_rate": 9.142508884127228e-06, + "loss": 0.8636, + "step": 6956 + }, + { + "epoch": 0.3829049479883318, + "grad_norm": 0.7032678127288818, + "learning_rate": 9.14226613313559e-06, + "loss": 0.8237, + "step": 6957 + }, + { + "epoch": 0.38295998679068743, + "grad_norm": 0.769809365272522, + "learning_rate": 9.142023351011788e-06, + "loss": 0.7523, + "step": 6958 + }, + { + "epoch": 0.38301502559304307, + "grad_norm": 0.7446833252906799, + "learning_rate": 9.141780537757647e-06, + "loss": 0.8382, + "step": 6959 + }, + { + "epoch": 0.38307006439539876, + "grad_norm": 0.6926285028457642, + "learning_rate": 9.141537693374994e-06, + "loss": 0.7997, + "step": 6960 + }, + { + "epoch": 0.3831251031977544, + "grad_norm": 0.7303034067153931, + "learning_rate": 9.141294817865651e-06, + "loss": 0.794, + "step": 6961 + }, + { + "epoch": 0.3831801420001101, + "grad_norm": 0.7453297972679138, + "learning_rate": 9.141051911231445e-06, + "loss": 0.7031, + "step": 6962 + }, + { + "epoch": 0.3832351808024657, + "grad_norm": 0.8503912091255188, + "learning_rate": 9.140808973474201e-06, + "loss": 0.7855, + "step": 6963 + }, + { + "epoch": 0.3832902196048214, + "grad_norm": 0.7304036617279053, + "learning_rate": 9.140566004595746e-06, + "loss": 0.7062, + "step": 6964 + }, + { + "epoch": 0.38334525840717704, + "grad_norm": 0.7534968852996826, + "learning_rate": 9.140323004597904e-06, + "loss": 0.8138, + "step": 6965 + }, + { + "epoch": 0.3834002972095327, + "grad_norm": 0.8122013807296753, + "learning_rate": 9.140079973482503e-06, + "loss": 0.7769, + "step": 6966 + }, + { + "epoch": 0.38345533601188836, + "grad_norm": 0.7345744967460632, + "learning_rate": 9.13983691125137e-06, + "loss": 0.7588, + "step": 6967 + }, + { + "epoch": 0.38351037481424405, + "grad_norm": 0.7251620292663574, + "learning_rate": 9.13959381790633e-06, + "loss": 0.8027, + "step": 6968 + }, + { + "epoch": 0.3835654136165997, + "grad_norm": 0.7157594561576843, + "learning_rate": 9.139350693449212e-06, + "loss": 0.7233, + "step": 6969 + }, + { + "epoch": 0.38362045241895537, + "grad_norm": 0.8076621890068054, + "learning_rate": 9.139107537881842e-06, + "loss": 0.7256, + "step": 6970 + }, + { + "epoch": 0.383675491221311, + "grad_norm": 0.717182993888855, + "learning_rate": 9.138864351206047e-06, + "loss": 0.7003, + "step": 6971 + }, + { + "epoch": 0.3837305300236667, + "grad_norm": 0.7534194588661194, + "learning_rate": 9.138621133423656e-06, + "loss": 0.7315, + "step": 6972 + }, + { + "epoch": 0.3837855688260223, + "grad_norm": 0.6400160193443298, + "learning_rate": 9.138377884536494e-06, + "loss": 0.6814, + "step": 6973 + }, + { + "epoch": 0.383840607628378, + "grad_norm": 0.7319507002830505, + "learning_rate": 9.138134604546394e-06, + "loss": 0.7942, + "step": 6974 + }, + { + "epoch": 0.38389564643073365, + "grad_norm": 0.7109829783439636, + "learning_rate": 9.137891293455181e-06, + "loss": 0.7528, + "step": 6975 + }, + { + "epoch": 0.38395068523308934, + "grad_norm": 1.006724238395691, + "learning_rate": 9.137647951264685e-06, + "loss": 0.7652, + "step": 6976 + }, + { + "epoch": 0.384005724035445, + "grad_norm": 0.7080540060997009, + "learning_rate": 9.137404577976736e-06, + "loss": 0.7706, + "step": 6977 + }, + { + "epoch": 0.38406076283780066, + "grad_norm": 0.7551368474960327, + "learning_rate": 9.137161173593161e-06, + "loss": 0.8202, + "step": 6978 + }, + { + "epoch": 0.3841158016401563, + "grad_norm": 0.6624314785003662, + "learning_rate": 9.13691773811579e-06, + "loss": 0.7258, + "step": 6979 + }, + { + "epoch": 0.384170840442512, + "grad_norm": 0.9603848457336426, + "learning_rate": 9.136674271546451e-06, + "loss": 0.9415, + "step": 6980 + }, + { + "epoch": 0.3842258792448676, + "grad_norm": 0.6964829564094543, + "learning_rate": 9.136430773886977e-06, + "loss": 0.7604, + "step": 6981 + }, + { + "epoch": 0.3842809180472233, + "grad_norm": 0.6503588557243347, + "learning_rate": 9.136187245139197e-06, + "loss": 0.7141, + "step": 6982 + }, + { + "epoch": 0.38433595684957894, + "grad_norm": 0.9179829359054565, + "learning_rate": 9.13594368530494e-06, + "loss": 0.7619, + "step": 6983 + }, + { + "epoch": 0.38439099565193463, + "grad_norm": 0.7993278503417969, + "learning_rate": 9.135700094386038e-06, + "loss": 0.832, + "step": 6984 + }, + { + "epoch": 0.38444603445429026, + "grad_norm": 0.8136988282203674, + "learning_rate": 9.13545647238432e-06, + "loss": 0.8127, + "step": 6985 + }, + { + "epoch": 0.38450107325664595, + "grad_norm": 0.9918104410171509, + "learning_rate": 9.135212819301619e-06, + "loss": 0.836, + "step": 6986 + }, + { + "epoch": 0.3845561120590016, + "grad_norm": 0.7767511010169983, + "learning_rate": 9.134969135139765e-06, + "loss": 0.8391, + "step": 6987 + }, + { + "epoch": 0.3846111508613573, + "grad_norm": 0.6889285445213318, + "learning_rate": 9.134725419900589e-06, + "loss": 0.7639, + "step": 6988 + }, + { + "epoch": 0.3846661896637129, + "grad_norm": 1.803467035293579, + "learning_rate": 9.134481673585924e-06, + "loss": 0.7629, + "step": 6989 + }, + { + "epoch": 0.3847212284660686, + "grad_norm": 0.721581757068634, + "learning_rate": 9.134237896197603e-06, + "loss": 0.8194, + "step": 6990 + }, + { + "epoch": 0.38477626726842423, + "grad_norm": 0.8163189888000488, + "learning_rate": 9.133994087737456e-06, + "loss": 0.7789, + "step": 6991 + }, + { + "epoch": 0.3848313060707799, + "grad_norm": 0.7518420815467834, + "learning_rate": 9.133750248207315e-06, + "loss": 0.7529, + "step": 6992 + }, + { + "epoch": 0.38488634487313556, + "grad_norm": 0.7318000197410583, + "learning_rate": 9.133506377609015e-06, + "loss": 0.7829, + "step": 6993 + }, + { + "epoch": 0.38494138367549124, + "grad_norm": 0.7765058875083923, + "learning_rate": 9.133262475944386e-06, + "loss": 0.7902, + "step": 6994 + }, + { + "epoch": 0.3849964224778469, + "grad_norm": 0.845567524433136, + "learning_rate": 9.133018543215265e-06, + "loss": 0.8117, + "step": 6995 + }, + { + "epoch": 0.38505146128020257, + "grad_norm": 0.7081887125968933, + "learning_rate": 9.13277457942348e-06, + "loss": 0.8131, + "step": 6996 + }, + { + "epoch": 0.3851065000825582, + "grad_norm": 0.7447869777679443, + "learning_rate": 9.132530584570869e-06, + "loss": 0.7765, + "step": 6997 + }, + { + "epoch": 0.3851615388849139, + "grad_norm": 0.8554795384407043, + "learning_rate": 9.132286558659265e-06, + "loss": 0.8966, + "step": 6998 + }, + { + "epoch": 0.3852165776872695, + "grad_norm": 0.7117023468017578, + "learning_rate": 9.1320425016905e-06, + "loss": 0.7461, + "step": 6999 + }, + { + "epoch": 0.3852716164896252, + "grad_norm": 0.6965934038162231, + "learning_rate": 9.131798413666411e-06, + "loss": 0.6827, + "step": 7000 + }, + { + "epoch": 0.38532665529198085, + "grad_norm": 0.7449018359184265, + "learning_rate": 9.13155429458883e-06, + "loss": 0.7562, + "step": 7001 + }, + { + "epoch": 0.3853816940943365, + "grad_norm": 0.7764221429824829, + "learning_rate": 9.131310144459593e-06, + "loss": 0.7842, + "step": 7002 + }, + { + "epoch": 0.38543673289669217, + "grad_norm": 0.9788658618927002, + "learning_rate": 9.131065963280536e-06, + "loss": 0.7857, + "step": 7003 + }, + { + "epoch": 0.3854917716990478, + "grad_norm": 0.7900908589363098, + "learning_rate": 9.13082175105349e-06, + "loss": 0.8733, + "step": 7004 + }, + { + "epoch": 0.3855468105014035, + "grad_norm": 0.814822793006897, + "learning_rate": 9.130577507780298e-06, + "loss": 0.8032, + "step": 7005 + }, + { + "epoch": 0.3856018493037591, + "grad_norm": 1.0648475885391235, + "learning_rate": 9.130333233462789e-06, + "loss": 0.8078, + "step": 7006 + }, + { + "epoch": 0.3856568881061148, + "grad_norm": 0.7359917163848877, + "learning_rate": 9.130088928102799e-06, + "loss": 0.6491, + "step": 7007 + }, + { + "epoch": 0.38571192690847045, + "grad_norm": 0.7321771383285522, + "learning_rate": 9.129844591702169e-06, + "loss": 0.7663, + "step": 7008 + }, + { + "epoch": 0.38576696571082614, + "grad_norm": 0.6937146186828613, + "learning_rate": 9.129600224262732e-06, + "loss": 0.7835, + "step": 7009 + }, + { + "epoch": 0.38582200451318177, + "grad_norm": 0.7330107688903809, + "learning_rate": 9.129355825786323e-06, + "loss": 0.7626, + "step": 7010 + }, + { + "epoch": 0.38587704331553746, + "grad_norm": 0.7021715044975281, + "learning_rate": 9.129111396274783e-06, + "loss": 0.7115, + "step": 7011 + }, + { + "epoch": 0.3859320821178931, + "grad_norm": 0.6599563360214233, + "learning_rate": 9.128866935729947e-06, + "loss": 0.6554, + "step": 7012 + }, + { + "epoch": 0.3859871209202488, + "grad_norm": 0.7323513031005859, + "learning_rate": 9.128622444153652e-06, + "loss": 0.7392, + "step": 7013 + }, + { + "epoch": 0.3860421597226044, + "grad_norm": 0.681888222694397, + "learning_rate": 9.128377921547736e-06, + "loss": 0.7474, + "step": 7014 + }, + { + "epoch": 0.3860971985249601, + "grad_norm": 0.8454889059066772, + "learning_rate": 9.128133367914036e-06, + "loss": 0.8355, + "step": 7015 + }, + { + "epoch": 0.38615223732731574, + "grad_norm": 0.7514123916625977, + "learning_rate": 9.12788878325439e-06, + "loss": 0.7683, + "step": 7016 + }, + { + "epoch": 0.38620727612967143, + "grad_norm": 0.7317092418670654, + "learning_rate": 9.12764416757064e-06, + "loss": 0.7201, + "step": 7017 + }, + { + "epoch": 0.38626231493202706, + "grad_norm": 0.7626729011535645, + "learning_rate": 9.127399520864619e-06, + "loss": 0.7701, + "step": 7018 + }, + { + "epoch": 0.38631735373438275, + "grad_norm": 0.9790363311767578, + "learning_rate": 9.127154843138168e-06, + "loss": 0.8034, + "step": 7019 + }, + { + "epoch": 0.3863723925367384, + "grad_norm": 0.663593590259552, + "learning_rate": 9.126910134393125e-06, + "loss": 0.661, + "step": 7020 + }, + { + "epoch": 0.3864274313390941, + "grad_norm": 0.6599924564361572, + "learning_rate": 9.126665394631332e-06, + "loss": 0.7395, + "step": 7021 + }, + { + "epoch": 0.3864824701414497, + "grad_norm": 0.8493411540985107, + "learning_rate": 9.126420623854625e-06, + "loss": 0.8008, + "step": 7022 + }, + { + "epoch": 0.3865375089438054, + "grad_norm": 0.7587194442749023, + "learning_rate": 9.126175822064846e-06, + "loss": 0.7533, + "step": 7023 + }, + { + "epoch": 0.38659254774616103, + "grad_norm": 0.773764431476593, + "learning_rate": 9.125930989263835e-06, + "loss": 0.75, + "step": 7024 + }, + { + "epoch": 0.3866475865485167, + "grad_norm": 0.7126749753952026, + "learning_rate": 9.12568612545343e-06, + "loss": 0.7794, + "step": 7025 + }, + { + "epoch": 0.38670262535087235, + "grad_norm": 0.7404584884643555, + "learning_rate": 9.125441230635472e-06, + "loss": 0.7264, + "step": 7026 + }, + { + "epoch": 0.38675766415322804, + "grad_norm": 0.8057644367218018, + "learning_rate": 9.125196304811804e-06, + "loss": 0.8058, + "step": 7027 + }, + { + "epoch": 0.3868127029555837, + "grad_norm": 0.9586995840072632, + "learning_rate": 9.124951347984263e-06, + "loss": 0.7659, + "step": 7028 + }, + { + "epoch": 0.38686774175793937, + "grad_norm": 0.7567793726921082, + "learning_rate": 9.124706360154693e-06, + "loss": 0.8961, + "step": 7029 + }, + { + "epoch": 0.386922780560295, + "grad_norm": 0.8523182272911072, + "learning_rate": 9.124461341324934e-06, + "loss": 0.8815, + "step": 7030 + }, + { + "epoch": 0.3869778193626507, + "grad_norm": 0.7466379404067993, + "learning_rate": 9.124216291496826e-06, + "loss": 0.7817, + "step": 7031 + }, + { + "epoch": 0.3870328581650063, + "grad_norm": 0.6721325516700745, + "learning_rate": 9.123971210672214e-06, + "loss": 0.7637, + "step": 7032 + }, + { + "epoch": 0.387087896967362, + "grad_norm": 0.6620928049087524, + "learning_rate": 9.123726098852936e-06, + "loss": 0.6956, + "step": 7033 + }, + { + "epoch": 0.38714293576971764, + "grad_norm": 0.6784290671348572, + "learning_rate": 9.12348095604084e-06, + "loss": 0.7034, + "step": 7034 + }, + { + "epoch": 0.38719797457207333, + "grad_norm": 0.7138848304748535, + "learning_rate": 9.123235782237763e-06, + "loss": 0.6037, + "step": 7035 + }, + { + "epoch": 0.38725301337442897, + "grad_norm": 0.8473613858222961, + "learning_rate": 9.122990577445548e-06, + "loss": 0.8157, + "step": 7036 + }, + { + "epoch": 0.38730805217678466, + "grad_norm": 0.835381031036377, + "learning_rate": 9.122745341666041e-06, + "loss": 0.8736, + "step": 7037 + }, + { + "epoch": 0.3873630909791403, + "grad_norm": 0.8823271989822388, + "learning_rate": 9.122500074901083e-06, + "loss": 0.7448, + "step": 7038 + }, + { + "epoch": 0.387418129781496, + "grad_norm": 0.6494244933128357, + "learning_rate": 9.122254777152519e-06, + "loss": 0.7423, + "step": 7039 + }, + { + "epoch": 0.3874731685838516, + "grad_norm": 0.7232181429862976, + "learning_rate": 9.122009448422191e-06, + "loss": 0.8489, + "step": 7040 + }, + { + "epoch": 0.3875282073862073, + "grad_norm": 0.7357699275016785, + "learning_rate": 9.121764088711945e-06, + "loss": 0.8799, + "step": 7041 + }, + { + "epoch": 0.38758324618856294, + "grad_norm": 0.7638574838638306, + "learning_rate": 9.121518698023621e-06, + "loss": 0.8539, + "step": 7042 + }, + { + "epoch": 0.3876382849909186, + "grad_norm": 0.7407062649726868, + "learning_rate": 9.121273276359068e-06, + "loss": 0.7152, + "step": 7043 + }, + { + "epoch": 0.38769332379327426, + "grad_norm": 0.6945983171463013, + "learning_rate": 9.121027823720126e-06, + "loss": 0.8224, + "step": 7044 + }, + { + "epoch": 0.3877483625956299, + "grad_norm": 0.7163639068603516, + "learning_rate": 9.120782340108643e-06, + "loss": 0.808, + "step": 7045 + }, + { + "epoch": 0.3878034013979856, + "grad_norm": 0.7062035799026489, + "learning_rate": 9.120536825526463e-06, + "loss": 0.783, + "step": 7046 + }, + { + "epoch": 0.3878584402003412, + "grad_norm": 0.7459971308708191, + "learning_rate": 9.120291279975431e-06, + "loss": 0.8219, + "step": 7047 + }, + { + "epoch": 0.3879134790026969, + "grad_norm": 0.9016150236129761, + "learning_rate": 9.120045703457394e-06, + "loss": 0.8605, + "step": 7048 + }, + { + "epoch": 0.38796851780505254, + "grad_norm": 0.78440922498703, + "learning_rate": 9.119800095974193e-06, + "loss": 0.8424, + "step": 7049 + }, + { + "epoch": 0.3880235566074082, + "grad_norm": 0.751504123210907, + "learning_rate": 9.119554457527681e-06, + "loss": 0.701, + "step": 7050 + }, + { + "epoch": 0.38807859540976386, + "grad_norm": 0.7540284991264343, + "learning_rate": 9.119308788119698e-06, + "loss": 0.7912, + "step": 7051 + }, + { + "epoch": 0.38813363421211955, + "grad_norm": 0.7977007627487183, + "learning_rate": 9.119063087752094e-06, + "loss": 0.9297, + "step": 7052 + }, + { + "epoch": 0.3881886730144752, + "grad_norm": 0.6923508644104004, + "learning_rate": 9.118817356426715e-06, + "loss": 0.7458, + "step": 7053 + }, + { + "epoch": 0.38824371181683087, + "grad_norm": 0.7170272469520569, + "learning_rate": 9.118571594145406e-06, + "loss": 0.733, + "step": 7054 + }, + { + "epoch": 0.3882987506191865, + "grad_norm": 0.7547701001167297, + "learning_rate": 9.118325800910015e-06, + "loss": 0.7758, + "step": 7055 + }, + { + "epoch": 0.3883537894215422, + "grad_norm": 0.7921421527862549, + "learning_rate": 9.118079976722391e-06, + "loss": 0.8262, + "step": 7056 + }, + { + "epoch": 0.38840882822389783, + "grad_norm": 0.734470784664154, + "learning_rate": 9.117834121584379e-06, + "loss": 0.817, + "step": 7057 + }, + { + "epoch": 0.3884638670262535, + "grad_norm": 0.8106420040130615, + "learning_rate": 9.117588235497829e-06, + "loss": 0.8203, + "step": 7058 + }, + { + "epoch": 0.38851890582860915, + "grad_norm": 0.7355543375015259, + "learning_rate": 9.117342318464588e-06, + "loss": 0.8076, + "step": 7059 + }, + { + "epoch": 0.38857394463096484, + "grad_norm": 0.7665252685546875, + "learning_rate": 9.117096370486504e-06, + "loss": 0.7611, + "step": 7060 + }, + { + "epoch": 0.3886289834333205, + "grad_norm": 0.7968598008155823, + "learning_rate": 9.116850391565426e-06, + "loss": 0.6461, + "step": 7061 + }, + { + "epoch": 0.38868402223567616, + "grad_norm": 0.7187741994857788, + "learning_rate": 9.116604381703203e-06, + "loss": 0.7982, + "step": 7062 + }, + { + "epoch": 0.3887390610380318, + "grad_norm": 0.8566913604736328, + "learning_rate": 9.11635834090168e-06, + "loss": 0.9072, + "step": 7063 + }, + { + "epoch": 0.3887940998403875, + "grad_norm": 0.7120797038078308, + "learning_rate": 9.116112269162714e-06, + "loss": 0.7353, + "step": 7064 + }, + { + "epoch": 0.3888491386427431, + "grad_norm": 0.7230019569396973, + "learning_rate": 9.115866166488148e-06, + "loss": 0.7717, + "step": 7065 + }, + { + "epoch": 0.3889041774450988, + "grad_norm": 0.6650584936141968, + "learning_rate": 9.115620032879833e-06, + "loss": 0.7384, + "step": 7066 + }, + { + "epoch": 0.38895921624745444, + "grad_norm": 0.970750629901886, + "learning_rate": 9.115373868339621e-06, + "loss": 0.8478, + "step": 7067 + }, + { + "epoch": 0.38901425504981013, + "grad_norm": 0.7066280245780945, + "learning_rate": 9.115127672869359e-06, + "loss": 0.7638, + "step": 7068 + }, + { + "epoch": 0.38906929385216577, + "grad_norm": 0.6952232718467712, + "learning_rate": 9.1148814464709e-06, + "loss": 0.7869, + "step": 7069 + }, + { + "epoch": 0.38912433265452145, + "grad_norm": 0.804489254951477, + "learning_rate": 9.114635189146094e-06, + "loss": 0.7905, + "step": 7070 + }, + { + "epoch": 0.3891793714568771, + "grad_norm": 0.6988457441329956, + "learning_rate": 9.114388900896791e-06, + "loss": 0.7107, + "step": 7071 + }, + { + "epoch": 0.3892344102592328, + "grad_norm": 0.6379980444908142, + "learning_rate": 9.114142581724842e-06, + "loss": 0.733, + "step": 7072 + }, + { + "epoch": 0.3892894490615884, + "grad_norm": 0.7238649129867554, + "learning_rate": 9.113896231632098e-06, + "loss": 0.8252, + "step": 7073 + }, + { + "epoch": 0.3893444878639441, + "grad_norm": 0.7168585062026978, + "learning_rate": 9.113649850620412e-06, + "loss": 0.6459, + "step": 7074 + }, + { + "epoch": 0.38939952666629973, + "grad_norm": 0.7315915822982788, + "learning_rate": 9.113403438691634e-06, + "loss": 0.7557, + "step": 7075 + }, + { + "epoch": 0.3894545654686554, + "grad_norm": 0.7438754439353943, + "learning_rate": 9.11315699584762e-06, + "loss": 0.7938, + "step": 7076 + }, + { + "epoch": 0.38950960427101106, + "grad_norm": 0.7497848272323608, + "learning_rate": 9.112910522090215e-06, + "loss": 0.8232, + "step": 7077 + }, + { + "epoch": 0.38956464307336675, + "grad_norm": 0.8072896003723145, + "learning_rate": 9.112664017421277e-06, + "loss": 0.7974, + "step": 7078 + }, + { + "epoch": 0.3896196818757224, + "grad_norm": 0.7255920767784119, + "learning_rate": 9.112417481842657e-06, + "loss": 0.7658, + "step": 7079 + }, + { + "epoch": 0.38967472067807807, + "grad_norm": 0.6263132095336914, + "learning_rate": 9.112170915356209e-06, + "loss": 0.7188, + "step": 7080 + }, + { + "epoch": 0.3897297594804337, + "grad_norm": 0.6817660927772522, + "learning_rate": 9.111924317963785e-06, + "loss": 0.7406, + "step": 7081 + }, + { + "epoch": 0.3897847982827894, + "grad_norm": 0.7829134464263916, + "learning_rate": 9.111677689667238e-06, + "loss": 0.8406, + "step": 7082 + }, + { + "epoch": 0.389839837085145, + "grad_norm": 0.7122843861579895, + "learning_rate": 9.111431030468421e-06, + "loss": 0.7722, + "step": 7083 + }, + { + "epoch": 0.3898948758875007, + "grad_norm": 0.7041764259338379, + "learning_rate": 9.11118434036919e-06, + "loss": 0.8307, + "step": 7084 + }, + { + "epoch": 0.38994991468985635, + "grad_norm": 0.7582009434700012, + "learning_rate": 9.110937619371398e-06, + "loss": 0.7461, + "step": 7085 + }, + { + "epoch": 0.39000495349221204, + "grad_norm": 0.7156100273132324, + "learning_rate": 9.110690867476899e-06, + "loss": 0.7294, + "step": 7086 + }, + { + "epoch": 0.39005999229456767, + "grad_norm": 0.79449063539505, + "learning_rate": 9.110444084687549e-06, + "loss": 0.8652, + "step": 7087 + }, + { + "epoch": 0.3901150310969233, + "grad_norm": 0.7692831754684448, + "learning_rate": 9.1101972710052e-06, + "loss": 0.7899, + "step": 7088 + }, + { + "epoch": 0.390170069899279, + "grad_norm": 0.7189639806747437, + "learning_rate": 9.109950426431708e-06, + "loss": 0.726, + "step": 7089 + }, + { + "epoch": 0.3902251087016346, + "grad_norm": 0.7491177916526794, + "learning_rate": 9.10970355096893e-06, + "loss": 0.8881, + "step": 7090 + }, + { + "epoch": 0.3902801475039903, + "grad_norm": 0.783027172088623, + "learning_rate": 9.10945664461872e-06, + "loss": 0.7728, + "step": 7091 + }, + { + "epoch": 0.39033518630634595, + "grad_norm": 1.0871556997299194, + "learning_rate": 9.109209707382934e-06, + "loss": 0.8059, + "step": 7092 + }, + { + "epoch": 0.39039022510870164, + "grad_norm": 0.7287113666534424, + "learning_rate": 9.108962739263429e-06, + "loss": 0.7896, + "step": 7093 + }, + { + "epoch": 0.39044526391105727, + "grad_norm": 0.7801700234413147, + "learning_rate": 9.108715740262058e-06, + "loss": 0.8012, + "step": 7094 + }, + { + "epoch": 0.39050030271341296, + "grad_norm": 0.846709132194519, + "learning_rate": 9.10846871038068e-06, + "loss": 0.8392, + "step": 7095 + }, + { + "epoch": 0.3905553415157686, + "grad_norm": 0.7408092617988586, + "learning_rate": 9.10822164962115e-06, + "loss": 0.8657, + "step": 7096 + }, + { + "epoch": 0.3906103803181243, + "grad_norm": 0.6748743057250977, + "learning_rate": 9.107974557985328e-06, + "loss": 0.7659, + "step": 7097 + }, + { + "epoch": 0.3906654191204799, + "grad_norm": 0.7512170672416687, + "learning_rate": 9.107727435475067e-06, + "loss": 0.7704, + "step": 7098 + }, + { + "epoch": 0.3907204579228356, + "grad_norm": 0.9039596319198608, + "learning_rate": 9.107480282092227e-06, + "loss": 0.8412, + "step": 7099 + }, + { + "epoch": 0.39077549672519124, + "grad_norm": 0.829785943031311, + "learning_rate": 9.107233097838663e-06, + "loss": 0.8229, + "step": 7100 + }, + { + "epoch": 0.39083053552754693, + "grad_norm": 0.7597842812538147, + "learning_rate": 9.106985882716238e-06, + "loss": 0.7798, + "step": 7101 + }, + { + "epoch": 0.39088557432990256, + "grad_norm": 0.7619945406913757, + "learning_rate": 9.106738636726802e-06, + "loss": 0.7504, + "step": 7102 + }, + { + "epoch": 0.39094061313225825, + "grad_norm": 0.6791092157363892, + "learning_rate": 9.10649135987222e-06, + "loss": 0.8167, + "step": 7103 + }, + { + "epoch": 0.3909956519346139, + "grad_norm": 0.7977412343025208, + "learning_rate": 9.10624405215435e-06, + "loss": 0.8252, + "step": 7104 + }, + { + "epoch": 0.3910506907369696, + "grad_norm": 0.7329283356666565, + "learning_rate": 9.105996713575047e-06, + "loss": 0.7084, + "step": 7105 + }, + { + "epoch": 0.3911057295393252, + "grad_norm": 0.7125133872032166, + "learning_rate": 9.105749344136172e-06, + "loss": 0.6672, + "step": 7106 + }, + { + "epoch": 0.3911607683416809, + "grad_norm": 0.6974679827690125, + "learning_rate": 9.105501943839583e-06, + "loss": 0.7354, + "step": 7107 + }, + { + "epoch": 0.39121580714403653, + "grad_norm": 0.7191265225410461, + "learning_rate": 9.10525451268714e-06, + "loss": 0.8133, + "step": 7108 + }, + { + "epoch": 0.3912708459463922, + "grad_norm": 0.7188206911087036, + "learning_rate": 9.105007050680704e-06, + "loss": 0.7947, + "step": 7109 + }, + { + "epoch": 0.39132588474874785, + "grad_norm": 0.9017364382743835, + "learning_rate": 9.104759557822135e-06, + "loss": 0.7848, + "step": 7110 + }, + { + "epoch": 0.39138092355110354, + "grad_norm": 0.7551164031028748, + "learning_rate": 9.104512034113292e-06, + "loss": 0.8266, + "step": 7111 + }, + { + "epoch": 0.3914359623534592, + "grad_norm": 0.7810001969337463, + "learning_rate": 9.104264479556033e-06, + "loss": 0.7731, + "step": 7112 + }, + { + "epoch": 0.39149100115581487, + "grad_norm": 0.787723183631897, + "learning_rate": 9.104016894152223e-06, + "loss": 0.8008, + "step": 7113 + }, + { + "epoch": 0.3915460399581705, + "grad_norm": 0.7303524017333984, + "learning_rate": 9.103769277903718e-06, + "loss": 0.826, + "step": 7114 + }, + { + "epoch": 0.3916010787605262, + "grad_norm": 0.707759439945221, + "learning_rate": 9.103521630812384e-06, + "loss": 0.6303, + "step": 7115 + }, + { + "epoch": 0.3916561175628818, + "grad_norm": 0.6929940581321716, + "learning_rate": 9.10327395288008e-06, + "loss": 0.733, + "step": 7116 + }, + { + "epoch": 0.3917111563652375, + "grad_norm": 0.7133205533027649, + "learning_rate": 9.103026244108667e-06, + "loss": 0.8421, + "step": 7117 + }, + { + "epoch": 0.39176619516759315, + "grad_norm": 1.2049434185028076, + "learning_rate": 9.102778504500005e-06, + "loss": 0.8618, + "step": 7118 + }, + { + "epoch": 0.39182123396994883, + "grad_norm": 0.7792720198631287, + "learning_rate": 9.10253073405596e-06, + "loss": 0.717, + "step": 7119 + }, + { + "epoch": 0.39187627277230447, + "grad_norm": 0.7234412431716919, + "learning_rate": 9.10228293277839e-06, + "loss": 0.7547, + "step": 7120 + }, + { + "epoch": 0.39193131157466016, + "grad_norm": 0.6845420002937317, + "learning_rate": 9.102035100669162e-06, + "loss": 0.7255, + "step": 7121 + }, + { + "epoch": 0.3919863503770158, + "grad_norm": 0.7446799874305725, + "learning_rate": 9.101787237730135e-06, + "loss": 0.7947, + "step": 7122 + }, + { + "epoch": 0.3920413891793715, + "grad_norm": 0.812924325466156, + "learning_rate": 9.101539343963176e-06, + "loss": 0.843, + "step": 7123 + }, + { + "epoch": 0.3920964279817271, + "grad_norm": 0.7373847365379333, + "learning_rate": 9.101291419370141e-06, + "loss": 0.7703, + "step": 7124 + }, + { + "epoch": 0.3921514667840828, + "grad_norm": 0.8305120468139648, + "learning_rate": 9.101043463952899e-06, + "loss": 0.8904, + "step": 7125 + }, + { + "epoch": 0.39220650558643844, + "grad_norm": 0.7263030409812927, + "learning_rate": 9.100795477713313e-06, + "loss": 0.8319, + "step": 7126 + }, + { + "epoch": 0.3922615443887941, + "grad_norm": 0.8358581066131592, + "learning_rate": 9.100547460653245e-06, + "loss": 0.8305, + "step": 7127 + }, + { + "epoch": 0.39231658319114976, + "grad_norm": 0.6608800292015076, + "learning_rate": 9.10029941277456e-06, + "loss": 0.7815, + "step": 7128 + }, + { + "epoch": 0.39237162199350545, + "grad_norm": 0.8590257167816162, + "learning_rate": 9.100051334079122e-06, + "loss": 0.8292, + "step": 7129 + }, + { + "epoch": 0.3924266607958611, + "grad_norm": 0.6241755485534668, + "learning_rate": 9.099803224568797e-06, + "loss": 0.6568, + "step": 7130 + }, + { + "epoch": 0.3924816995982167, + "grad_norm": 0.7298059463500977, + "learning_rate": 9.099555084245447e-06, + "loss": 0.727, + "step": 7131 + }, + { + "epoch": 0.3925367384005724, + "grad_norm": 0.7741055488586426, + "learning_rate": 9.099306913110939e-06, + "loss": 0.8481, + "step": 7132 + }, + { + "epoch": 0.39259177720292804, + "grad_norm": 0.9674170613288879, + "learning_rate": 9.099058711167137e-06, + "loss": 0.8507, + "step": 7133 + }, + { + "epoch": 0.3926468160052837, + "grad_norm": 0.7285159826278687, + "learning_rate": 9.098810478415907e-06, + "loss": 0.766, + "step": 7134 + }, + { + "epoch": 0.39270185480763936, + "grad_norm": 0.7215660810470581, + "learning_rate": 9.098562214859115e-06, + "loss": 0.794, + "step": 7135 + }, + { + "epoch": 0.39275689360999505, + "grad_norm": 0.764437735080719, + "learning_rate": 9.098313920498627e-06, + "loss": 0.8228, + "step": 7136 + }, + { + "epoch": 0.3928119324123507, + "grad_norm": 0.7222796082496643, + "learning_rate": 9.098065595336309e-06, + "loss": 0.8064, + "step": 7137 + }, + { + "epoch": 0.3928669712147064, + "grad_norm": 0.7044625878334045, + "learning_rate": 9.097817239374024e-06, + "loss": 0.8017, + "step": 7138 + }, + { + "epoch": 0.392922010017062, + "grad_norm": 0.7929979562759399, + "learning_rate": 9.097568852613646e-06, + "loss": 0.7527, + "step": 7139 + }, + { + "epoch": 0.3929770488194177, + "grad_norm": 0.7833721041679382, + "learning_rate": 9.097320435057033e-06, + "loss": 0.8335, + "step": 7140 + }, + { + "epoch": 0.39303208762177333, + "grad_norm": 0.8365728259086609, + "learning_rate": 9.097071986706058e-06, + "loss": 0.6439, + "step": 7141 + }, + { + "epoch": 0.393087126424129, + "grad_norm": 0.7547842264175415, + "learning_rate": 9.096823507562588e-06, + "loss": 0.8316, + "step": 7142 + }, + { + "epoch": 0.39314216522648465, + "grad_norm": 0.6598891019821167, + "learning_rate": 9.09657499762849e-06, + "loss": 0.6547, + "step": 7143 + }, + { + "epoch": 0.39319720402884034, + "grad_norm": 0.7913638949394226, + "learning_rate": 9.096326456905627e-06, + "loss": 0.7964, + "step": 7144 + }, + { + "epoch": 0.393252242831196, + "grad_norm": 0.6927905082702637, + "learning_rate": 9.096077885395874e-06, + "loss": 0.7836, + "step": 7145 + }, + { + "epoch": 0.39330728163355166, + "grad_norm": 0.7505417466163635, + "learning_rate": 9.095829283101094e-06, + "loss": 0.7707, + "step": 7146 + }, + { + "epoch": 0.3933623204359073, + "grad_norm": 0.8797083497047424, + "learning_rate": 9.095580650023158e-06, + "loss": 0.866, + "step": 7147 + }, + { + "epoch": 0.393417359238263, + "grad_norm": 0.7023645639419556, + "learning_rate": 9.095331986163935e-06, + "loss": 0.7013, + "step": 7148 + }, + { + "epoch": 0.3934723980406186, + "grad_norm": 0.697354793548584, + "learning_rate": 9.095083291525293e-06, + "loss": 0.7691, + "step": 7149 + }, + { + "epoch": 0.3935274368429743, + "grad_norm": 0.7211105227470398, + "learning_rate": 9.094834566109101e-06, + "loss": 0.6816, + "step": 7150 + }, + { + "epoch": 0.39358247564532994, + "grad_norm": 0.8593278527259827, + "learning_rate": 9.094585809917227e-06, + "loss": 0.915, + "step": 7151 + }, + { + "epoch": 0.39363751444768563, + "grad_norm": 0.7406070828437805, + "learning_rate": 9.094337022951545e-06, + "loss": 0.7825, + "step": 7152 + }, + { + "epoch": 0.39369255325004127, + "grad_norm": 0.7644504308700562, + "learning_rate": 9.09408820521392e-06, + "loss": 0.6796, + "step": 7153 + }, + { + "epoch": 0.39374759205239696, + "grad_norm": 0.8239033222198486, + "learning_rate": 9.093839356706224e-06, + "loss": 0.8396, + "step": 7154 + }, + { + "epoch": 0.3938026308547526, + "grad_norm": 0.6433991193771362, + "learning_rate": 9.093590477430327e-06, + "loss": 0.6941, + "step": 7155 + }, + { + "epoch": 0.3938576696571083, + "grad_norm": 0.6979972124099731, + "learning_rate": 9.093341567388102e-06, + "loss": 0.8142, + "step": 7156 + }, + { + "epoch": 0.3939127084594639, + "grad_norm": 0.7062026262283325, + "learning_rate": 9.093092626581414e-06, + "loss": 0.804, + "step": 7157 + }, + { + "epoch": 0.3939677472618196, + "grad_norm": 0.7070814967155457, + "learning_rate": 9.09284365501214e-06, + "loss": 0.765, + "step": 7158 + }, + { + "epoch": 0.39402278606417523, + "grad_norm": 0.8577908873558044, + "learning_rate": 9.092594652682147e-06, + "loss": 0.7074, + "step": 7159 + }, + { + "epoch": 0.3940778248665309, + "grad_norm": 0.7386197447776794, + "learning_rate": 9.092345619593309e-06, + "loss": 0.7629, + "step": 7160 + }, + { + "epoch": 0.39413286366888656, + "grad_norm": 0.8048123121261597, + "learning_rate": 9.092096555747496e-06, + "loss": 0.9225, + "step": 7161 + }, + { + "epoch": 0.39418790247124225, + "grad_norm": 0.7479888200759888, + "learning_rate": 9.091847461146582e-06, + "loss": 0.7284, + "step": 7162 + }, + { + "epoch": 0.3942429412735979, + "grad_norm": 0.7448734045028687, + "learning_rate": 9.091598335792438e-06, + "loss": 0.8694, + "step": 7163 + }, + { + "epoch": 0.39429798007595357, + "grad_norm": 0.7511261701583862, + "learning_rate": 9.091349179686935e-06, + "loss": 0.7822, + "step": 7164 + }, + { + "epoch": 0.3943530188783092, + "grad_norm": 0.7079344391822815, + "learning_rate": 9.091099992831946e-06, + "loss": 0.7238, + "step": 7165 + }, + { + "epoch": 0.3944080576806649, + "grad_norm": 0.7007229328155518, + "learning_rate": 9.090850775229347e-06, + "loss": 0.7269, + "step": 7166 + }, + { + "epoch": 0.3944630964830205, + "grad_norm": 0.769800066947937, + "learning_rate": 9.090601526881007e-06, + "loss": 0.7894, + "step": 7167 + }, + { + "epoch": 0.3945181352853762, + "grad_norm": 0.7211676836013794, + "learning_rate": 9.090352247788801e-06, + "loss": 0.7998, + "step": 7168 + }, + { + "epoch": 0.39457317408773185, + "grad_norm": 0.6784254312515259, + "learning_rate": 9.090102937954602e-06, + "loss": 0.7576, + "step": 7169 + }, + { + "epoch": 0.39462821289008754, + "grad_norm": 0.7696946859359741, + "learning_rate": 9.089853597380285e-06, + "loss": 0.8395, + "step": 7170 + }, + { + "epoch": 0.39468325169244317, + "grad_norm": 0.8720405697822571, + "learning_rate": 9.089604226067723e-06, + "loss": 0.8971, + "step": 7171 + }, + { + "epoch": 0.39473829049479886, + "grad_norm": 0.8457947373390198, + "learning_rate": 9.08935482401879e-06, + "loss": 0.7002, + "step": 7172 + }, + { + "epoch": 0.3947933292971545, + "grad_norm": 0.8181997537612915, + "learning_rate": 9.089105391235361e-06, + "loss": 0.8949, + "step": 7173 + }, + { + "epoch": 0.3948483680995101, + "grad_norm": 0.7717136144638062, + "learning_rate": 9.08885592771931e-06, + "loss": 0.829, + "step": 7174 + }, + { + "epoch": 0.3949034069018658, + "grad_norm": 0.6941567063331604, + "learning_rate": 9.088606433472514e-06, + "loss": 0.7592, + "step": 7175 + }, + { + "epoch": 0.39495844570422145, + "grad_norm": 0.7358599901199341, + "learning_rate": 9.088356908496845e-06, + "loss": 0.8657, + "step": 7176 + }, + { + "epoch": 0.39501348450657714, + "grad_norm": 1.1329307556152344, + "learning_rate": 9.08810735279418e-06, + "loss": 0.8307, + "step": 7177 + }, + { + "epoch": 0.3950685233089328, + "grad_norm": 0.7011532187461853, + "learning_rate": 9.087857766366395e-06, + "loss": 0.7487, + "step": 7178 + }, + { + "epoch": 0.39512356211128846, + "grad_norm": 0.7390572428703308, + "learning_rate": 9.087608149215366e-06, + "loss": 0.8244, + "step": 7179 + }, + { + "epoch": 0.3951786009136441, + "grad_norm": 0.6907634735107422, + "learning_rate": 9.087358501342966e-06, + "loss": 0.751, + "step": 7180 + }, + { + "epoch": 0.3952336397159998, + "grad_norm": 0.7467379570007324, + "learning_rate": 9.087108822751076e-06, + "loss": 0.8549, + "step": 7181 + }, + { + "epoch": 0.3952886785183554, + "grad_norm": 0.7493302226066589, + "learning_rate": 9.086859113441568e-06, + "loss": 0.8332, + "step": 7182 + }, + { + "epoch": 0.3953437173207111, + "grad_norm": 0.8364959955215454, + "learning_rate": 9.086609373416321e-06, + "loss": 0.7873, + "step": 7183 + }, + { + "epoch": 0.39539875612306674, + "grad_norm": 0.7330418825149536, + "learning_rate": 9.086359602677214e-06, + "loss": 0.7861, + "step": 7184 + }, + { + "epoch": 0.39545379492542243, + "grad_norm": 0.7296311855316162, + "learning_rate": 9.086109801226121e-06, + "loss": 0.7946, + "step": 7185 + }, + { + "epoch": 0.39550883372777806, + "grad_norm": 0.7884660363197327, + "learning_rate": 9.085859969064921e-06, + "loss": 0.7851, + "step": 7186 + }, + { + "epoch": 0.39556387253013375, + "grad_norm": 0.7311955690383911, + "learning_rate": 9.08561010619549e-06, + "loss": 0.7645, + "step": 7187 + }, + { + "epoch": 0.3956189113324894, + "grad_norm": 0.7447296977043152, + "learning_rate": 9.085360212619707e-06, + "loss": 0.7446, + "step": 7188 + }, + { + "epoch": 0.3956739501348451, + "grad_norm": 0.755628228187561, + "learning_rate": 9.08511028833945e-06, + "loss": 0.8107, + "step": 7189 + }, + { + "epoch": 0.3957289889372007, + "grad_norm": 0.6800833940505981, + "learning_rate": 9.0848603333566e-06, + "loss": 0.7471, + "step": 7190 + }, + { + "epoch": 0.3957840277395564, + "grad_norm": 0.6396341919898987, + "learning_rate": 9.08461034767303e-06, + "loss": 0.6797, + "step": 7191 + }, + { + "epoch": 0.39583906654191203, + "grad_norm": 0.729680597782135, + "learning_rate": 9.084360331290625e-06, + "loss": 0.7224, + "step": 7192 + }, + { + "epoch": 0.3958941053442677, + "grad_norm": 0.7630584239959717, + "learning_rate": 9.084110284211259e-06, + "loss": 0.8203, + "step": 7193 + }, + { + "epoch": 0.39594914414662336, + "grad_norm": 0.8799235820770264, + "learning_rate": 9.083860206436813e-06, + "loss": 0.8312, + "step": 7194 + }, + { + "epoch": 0.39600418294897904, + "grad_norm": 0.797081708908081, + "learning_rate": 9.083610097969169e-06, + "loss": 0.7561, + "step": 7195 + }, + { + "epoch": 0.3960592217513347, + "grad_norm": 0.7408759593963623, + "learning_rate": 9.083359958810203e-06, + "loss": 0.7854, + "step": 7196 + }, + { + "epoch": 0.39611426055369037, + "grad_norm": 0.7552130222320557, + "learning_rate": 9.083109788961797e-06, + "loss": 0.8145, + "step": 7197 + }, + { + "epoch": 0.396169299356046, + "grad_norm": 0.7147447466850281, + "learning_rate": 9.08285958842583e-06, + "loss": 0.792, + "step": 7198 + }, + { + "epoch": 0.3962243381584017, + "grad_norm": 0.7416259050369263, + "learning_rate": 9.082609357204183e-06, + "loss": 0.7801, + "step": 7199 + }, + { + "epoch": 0.3962793769607573, + "grad_norm": 0.7551109194755554, + "learning_rate": 9.082359095298741e-06, + "loss": 0.8841, + "step": 7200 + }, + { + "epoch": 0.396334415763113, + "grad_norm": 0.761472225189209, + "learning_rate": 9.082108802711377e-06, + "loss": 0.9061, + "step": 7201 + }, + { + "epoch": 0.39638945456546865, + "grad_norm": 0.7234126329421997, + "learning_rate": 9.081858479443977e-06, + "loss": 0.8308, + "step": 7202 + }, + { + "epoch": 0.39644449336782434, + "grad_norm": 0.7204816341400146, + "learning_rate": 9.08160812549842e-06, + "loss": 0.7481, + "step": 7203 + }, + { + "epoch": 0.39649953217017997, + "grad_norm": 0.7207956910133362, + "learning_rate": 9.081357740876591e-06, + "loss": 0.762, + "step": 7204 + }, + { + "epoch": 0.39655457097253566, + "grad_norm": 0.7967123985290527, + "learning_rate": 9.081107325580367e-06, + "loss": 0.7931, + "step": 7205 + }, + { + "epoch": 0.3966096097748913, + "grad_norm": 0.9839354753494263, + "learning_rate": 9.080856879611635e-06, + "loss": 0.8182, + "step": 7206 + }, + { + "epoch": 0.396664648577247, + "grad_norm": 0.8468357920646667, + "learning_rate": 9.080606402972274e-06, + "loss": 0.7056, + "step": 7207 + }, + { + "epoch": 0.3967196873796026, + "grad_norm": 0.6549574136734009, + "learning_rate": 9.080355895664169e-06, + "loss": 0.7604, + "step": 7208 + }, + { + "epoch": 0.3967747261819583, + "grad_norm": 0.7475417256355286, + "learning_rate": 9.080105357689201e-06, + "loss": 0.7107, + "step": 7209 + }, + { + "epoch": 0.39682976498431394, + "grad_norm": 0.7464179992675781, + "learning_rate": 9.079854789049251e-06, + "loss": 0.793, + "step": 7210 + }, + { + "epoch": 0.3968848037866696, + "grad_norm": 0.8332071900367737, + "learning_rate": 9.079604189746207e-06, + "loss": 0.8383, + "step": 7211 + }, + { + "epoch": 0.39693984258902526, + "grad_norm": 0.722055196762085, + "learning_rate": 9.07935355978195e-06, + "loss": 0.8569, + "step": 7212 + }, + { + "epoch": 0.39699488139138095, + "grad_norm": 0.7442018389701843, + "learning_rate": 9.079102899158363e-06, + "loss": 0.8165, + "step": 7213 + }, + { + "epoch": 0.3970499201937366, + "grad_norm": 0.6986141204833984, + "learning_rate": 9.07885220787733e-06, + "loss": 0.7562, + "step": 7214 + }, + { + "epoch": 0.39710495899609227, + "grad_norm": 0.7718464732170105, + "learning_rate": 9.078601485940736e-06, + "loss": 0.8529, + "step": 7215 + }, + { + "epoch": 0.3971599977984479, + "grad_norm": 0.7583653330802917, + "learning_rate": 9.078350733350464e-06, + "loss": 0.7855, + "step": 7216 + }, + { + "epoch": 0.39721503660080354, + "grad_norm": 0.7699223160743713, + "learning_rate": 9.078099950108401e-06, + "loss": 0.8061, + "step": 7217 + }, + { + "epoch": 0.39727007540315923, + "grad_norm": 0.7374141812324524, + "learning_rate": 9.07784913621643e-06, + "loss": 0.789, + "step": 7218 + }, + { + "epoch": 0.39732511420551486, + "grad_norm": 0.7446104884147644, + "learning_rate": 9.077598291676436e-06, + "loss": 0.8381, + "step": 7219 + }, + { + "epoch": 0.39738015300787055, + "grad_norm": 0.7017301917076111, + "learning_rate": 9.077347416490305e-06, + "loss": 0.7153, + "step": 7220 + }, + { + "epoch": 0.3974351918102262, + "grad_norm": 0.7676172852516174, + "learning_rate": 9.077096510659922e-06, + "loss": 0.8029, + "step": 7221 + }, + { + "epoch": 0.3974902306125819, + "grad_norm": 0.9340602159500122, + "learning_rate": 9.076845574187174e-06, + "loss": 0.7865, + "step": 7222 + }, + { + "epoch": 0.3975452694149375, + "grad_norm": 0.8634235262870789, + "learning_rate": 9.076594607073945e-06, + "loss": 0.7606, + "step": 7223 + }, + { + "epoch": 0.3976003082172932, + "grad_norm": 0.8967369198799133, + "learning_rate": 9.076343609322123e-06, + "loss": 0.7011, + "step": 7224 + }, + { + "epoch": 0.39765534701964883, + "grad_norm": 0.7269352078437805, + "learning_rate": 9.076092580933594e-06, + "loss": 0.8043, + "step": 7225 + }, + { + "epoch": 0.3977103858220045, + "grad_norm": 0.7550628781318665, + "learning_rate": 9.075841521910243e-06, + "loss": 0.7344, + "step": 7226 + }, + { + "epoch": 0.39776542462436015, + "grad_norm": 0.6973844766616821, + "learning_rate": 9.075590432253958e-06, + "loss": 0.6995, + "step": 7227 + }, + { + "epoch": 0.39782046342671584, + "grad_norm": 0.648560643196106, + "learning_rate": 9.075339311966627e-06, + "loss": 0.6997, + "step": 7228 + }, + { + "epoch": 0.3978755022290715, + "grad_norm": 0.8457548022270203, + "learning_rate": 9.075088161050134e-06, + "loss": 0.8548, + "step": 7229 + }, + { + "epoch": 0.39793054103142717, + "grad_norm": 0.7644637823104858, + "learning_rate": 9.074836979506373e-06, + "loss": 0.6966, + "step": 7230 + }, + { + "epoch": 0.3979855798337828, + "grad_norm": 0.7146210670471191, + "learning_rate": 9.074585767337227e-06, + "loss": 0.7673, + "step": 7231 + }, + { + "epoch": 0.3980406186361385, + "grad_norm": 0.8570694327354431, + "learning_rate": 9.074334524544585e-06, + "loss": 0.8233, + "step": 7232 + }, + { + "epoch": 0.3980956574384941, + "grad_norm": 0.7257633805274963, + "learning_rate": 9.074083251130334e-06, + "loss": 0.7464, + "step": 7233 + }, + { + "epoch": 0.3981506962408498, + "grad_norm": 0.9377032518386841, + "learning_rate": 9.073831947096365e-06, + "loss": 0.7814, + "step": 7234 + }, + { + "epoch": 0.39820573504320544, + "grad_norm": 0.8105629086494446, + "learning_rate": 9.073580612444566e-06, + "loss": 0.8069, + "step": 7235 + }, + { + "epoch": 0.39826077384556113, + "grad_norm": 0.7874456644058228, + "learning_rate": 9.073329247176824e-06, + "loss": 0.8414, + "step": 7236 + }, + { + "epoch": 0.39831581264791677, + "grad_norm": 0.6829617023468018, + "learning_rate": 9.07307785129503e-06, + "loss": 0.7633, + "step": 7237 + }, + { + "epoch": 0.39837085145027246, + "grad_norm": 0.6838501691818237, + "learning_rate": 9.072826424801075e-06, + "loss": 0.6972, + "step": 7238 + }, + { + "epoch": 0.3984258902526281, + "grad_norm": 0.7054216861724854, + "learning_rate": 9.072574967696845e-06, + "loss": 0.8049, + "step": 7239 + }, + { + "epoch": 0.3984809290549838, + "grad_norm": 0.9462615847587585, + "learning_rate": 9.072323479984232e-06, + "loss": 0.7988, + "step": 7240 + }, + { + "epoch": 0.3985359678573394, + "grad_norm": 0.7334465980529785, + "learning_rate": 9.072071961665128e-06, + "loss": 0.7538, + "step": 7241 + }, + { + "epoch": 0.3985910066596951, + "grad_norm": 0.7506609559059143, + "learning_rate": 9.071820412741418e-06, + "loss": 0.7991, + "step": 7242 + }, + { + "epoch": 0.39864604546205074, + "grad_norm": 0.6858688592910767, + "learning_rate": 9.071568833214998e-06, + "loss": 0.7258, + "step": 7243 + }, + { + "epoch": 0.3987010842644064, + "grad_norm": 0.8117396235466003, + "learning_rate": 9.071317223087754e-06, + "loss": 0.752, + "step": 7244 + }, + { + "epoch": 0.39875612306676206, + "grad_norm": 0.7772389054298401, + "learning_rate": 9.071065582361582e-06, + "loss": 0.7444, + "step": 7245 + }, + { + "epoch": 0.39881116186911775, + "grad_norm": 0.7221882939338684, + "learning_rate": 9.07081391103837e-06, + "loss": 0.8035, + "step": 7246 + }, + { + "epoch": 0.3988662006714734, + "grad_norm": 0.8113289475440979, + "learning_rate": 9.07056220912001e-06, + "loss": 0.7623, + "step": 7247 + }, + { + "epoch": 0.39892123947382907, + "grad_norm": 0.730823278427124, + "learning_rate": 9.070310476608395e-06, + "loss": 0.7872, + "step": 7248 + }, + { + "epoch": 0.3989762782761847, + "grad_norm": 0.7690893411636353, + "learning_rate": 9.070058713505415e-06, + "loss": 0.7402, + "step": 7249 + }, + { + "epoch": 0.3990313170785404, + "grad_norm": 0.6768597364425659, + "learning_rate": 9.069806919812963e-06, + "loss": 0.7283, + "step": 7250 + }, + { + "epoch": 0.399086355880896, + "grad_norm": 0.6938686370849609, + "learning_rate": 9.069555095532932e-06, + "loss": 0.7209, + "step": 7251 + }, + { + "epoch": 0.3991413946832517, + "grad_norm": 0.7162025570869446, + "learning_rate": 9.069303240667215e-06, + "loss": 0.7915, + "step": 7252 + }, + { + "epoch": 0.39919643348560735, + "grad_norm": 0.9170399308204651, + "learning_rate": 9.069051355217704e-06, + "loss": 0.8399, + "step": 7253 + }, + { + "epoch": 0.39925147228796304, + "grad_norm": 0.7080186009407043, + "learning_rate": 9.068799439186291e-06, + "loss": 0.8678, + "step": 7254 + }, + { + "epoch": 0.39930651109031867, + "grad_norm": 1.013613224029541, + "learning_rate": 9.068547492574872e-06, + "loss": 0.817, + "step": 7255 + }, + { + "epoch": 0.39936154989267436, + "grad_norm": 0.6911013722419739, + "learning_rate": 9.068295515385337e-06, + "loss": 0.7048, + "step": 7256 + }, + { + "epoch": 0.39941658869503, + "grad_norm": 0.748219907283783, + "learning_rate": 9.068043507619584e-06, + "loss": 0.8115, + "step": 7257 + }, + { + "epoch": 0.3994716274973857, + "grad_norm": 0.6763347387313843, + "learning_rate": 9.067791469279504e-06, + "loss": 0.763, + "step": 7258 + }, + { + "epoch": 0.3995266662997413, + "grad_norm": 0.7291030287742615, + "learning_rate": 9.067539400366993e-06, + "loss": 0.7319, + "step": 7259 + }, + { + "epoch": 0.39958170510209695, + "grad_norm": 0.6515628695487976, + "learning_rate": 9.067287300883945e-06, + "loss": 0.7903, + "step": 7260 + }, + { + "epoch": 0.39963674390445264, + "grad_norm": 0.7815985679626465, + "learning_rate": 9.067035170832253e-06, + "loss": 0.8241, + "step": 7261 + }, + { + "epoch": 0.3996917827068083, + "grad_norm": 0.6747417449951172, + "learning_rate": 9.066783010213812e-06, + "loss": 0.7544, + "step": 7262 + }, + { + "epoch": 0.39974682150916396, + "grad_norm": 0.6568340063095093, + "learning_rate": 9.066530819030522e-06, + "loss": 0.7754, + "step": 7263 + }, + { + "epoch": 0.3998018603115196, + "grad_norm": 0.6703339219093323, + "learning_rate": 9.066278597284273e-06, + "loss": 0.7581, + "step": 7264 + }, + { + "epoch": 0.3998568991138753, + "grad_norm": 0.7421279549598694, + "learning_rate": 9.066026344976962e-06, + "loss": 0.7974, + "step": 7265 + }, + { + "epoch": 0.3999119379162309, + "grad_norm": 0.7226015329360962, + "learning_rate": 9.065774062110486e-06, + "loss": 0.7777, + "step": 7266 + }, + { + "epoch": 0.3999669767185866, + "grad_norm": 0.7092894911766052, + "learning_rate": 9.06552174868674e-06, + "loss": 0.7885, + "step": 7267 + }, + { + "epoch": 0.40002201552094224, + "grad_norm": 0.837902307510376, + "learning_rate": 9.065269404707622e-06, + "loss": 0.7425, + "step": 7268 + }, + { + "epoch": 0.40007705432329793, + "grad_norm": 0.803811252117157, + "learning_rate": 9.065017030175027e-06, + "loss": 0.8418, + "step": 7269 + }, + { + "epoch": 0.40013209312565357, + "grad_norm": 0.8110278248786926, + "learning_rate": 9.064764625090854e-06, + "loss": 0.7724, + "step": 7270 + }, + { + "epoch": 0.40018713192800925, + "grad_norm": 0.7305173277854919, + "learning_rate": 9.064512189456995e-06, + "loss": 0.7465, + "step": 7271 + }, + { + "epoch": 0.4002421707303649, + "grad_norm": 0.7312467694282532, + "learning_rate": 9.06425972327535e-06, + "loss": 0.8406, + "step": 7272 } ], "logging_steps": 1, @@ -44567,7 +50930,7 @@ "attributes": {} } }, - "total_flos": 1.877760299885396e+19, + "total_flos": 2.1460117712975954e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null