{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.997289972899729, "eval_steps": 500, "global_step": 276, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0036133694670280035, "grad_norm": 0.3959366977214813, "learning_rate": 6.25e-06, "loss": 0.9323, "step": 1 }, { "epoch": 0.007226738934056007, "grad_norm": 0.45551198720932007, "learning_rate": 1.25e-05, "loss": 1.0507, "step": 2 }, { "epoch": 0.01084010840108401, "grad_norm": 0.2823091745376587, "learning_rate": 1.8750000000000002e-05, "loss": 0.8491, "step": 3 }, { "epoch": 0.014453477868112014, "grad_norm": 0.46047303080558777, "learning_rate": 2.5e-05, "loss": 1.0142, "step": 4 }, { "epoch": 0.018066847335140017, "grad_norm": 0.4086349606513977, "learning_rate": 3.125e-05, "loss": 0.947, "step": 5 }, { "epoch": 0.02168021680216802, "grad_norm": 0.457003116607666, "learning_rate": 3.7500000000000003e-05, "loss": 0.9485, "step": 6 }, { "epoch": 0.025293586269196026, "grad_norm": 0.35562458634376526, "learning_rate": 4.375e-05, "loss": 0.8449, "step": 7 }, { "epoch": 0.028906955736224028, "grad_norm": 0.33805516362190247, "learning_rate": 5e-05, "loss": 0.7379, "step": 8 }, { "epoch": 0.032520325203252036, "grad_norm": 0.3412623703479767, "learning_rate": 4.9998282347929784e-05, "loss": 0.6282, "step": 9 }, { "epoch": 0.036133694670280034, "grad_norm": 0.2843680679798126, "learning_rate": 4.99931296277454e-05, "loss": 0.5503, "step": 10 }, { "epoch": 0.03974706413730804, "grad_norm": 0.17628777027130127, "learning_rate": 4.998454254749331e-05, "loss": 0.512, "step": 11 }, { "epoch": 0.04336043360433604, "grad_norm": 0.19055013358592987, "learning_rate": 4.997252228714279e-05, "loss": 0.5397, "step": 12 }, { "epoch": 0.04697380307136405, "grad_norm": 0.08906977623701096, "learning_rate": 4.9957070498423854e-05, "loss": 0.5458, "step": 13 }, { "epoch": 0.05058717253839205, "grad_norm": 0.0917251780629158, "learning_rate": 4.993818930460026e-05, "loss": 0.5269, "step": 14 }, { "epoch": 0.05420054200542006, "grad_norm": 0.0985497236251831, "learning_rate": 4.9915881300177725e-05, "loss": 0.4135, "step": 15 }, { "epoch": 0.057813911472448055, "grad_norm": 0.1111132949590683, "learning_rate": 4.9890149550547454e-05, "loss": 0.5064, "step": 16 }, { "epoch": 0.06142728093947606, "grad_norm": 0.0649256557226181, "learning_rate": 4.98609975915649e-05, "loss": 0.4804, "step": 17 }, { "epoch": 0.06504065040650407, "grad_norm": 0.09687516838312149, "learning_rate": 4.982842942906386e-05, "loss": 0.3706, "step": 18 }, { "epoch": 0.06865401987353206, "grad_norm": 0.14679567515850067, "learning_rate": 4.979244953830608e-05, "loss": 0.4105, "step": 19 }, { "epoch": 0.07226738934056007, "grad_norm": 0.14155593514442444, "learning_rate": 4.9753062863366276e-05, "loss": 0.4886, "step": 20 }, { "epoch": 0.07588075880758807, "grad_norm": 0.14684930443763733, "learning_rate": 4.971027481645274e-05, "loss": 0.4044, "step": 21 }, { "epoch": 0.07949412827461608, "grad_norm": 0.11222010105848312, "learning_rate": 4.966409127716367e-05, "loss": 0.4361, "step": 22 }, { "epoch": 0.08310749774164408, "grad_norm": 0.058118775486946106, "learning_rate": 4.96145185916792e-05, "loss": 0.4176, "step": 23 }, { "epoch": 0.08672086720867209, "grad_norm": 0.06764644384384155, "learning_rate": 4.95615635718894e-05, "loss": 0.4683, "step": 24 }, { "epoch": 0.09033423667570009, "grad_norm": 0.06886276602745056, "learning_rate": 4.950523349445824e-05, "loss": 0.418, "step": 25 }, { "epoch": 0.0939476061427281, "grad_norm": 0.0706636980175972, "learning_rate": 4.944553609982363e-05, "loss": 0.3967, "step": 26 }, { "epoch": 0.0975609756097561, "grad_norm": 0.04914792627096176, "learning_rate": 4.938247959113386e-05, "loss": 0.4623, "step": 27 }, { "epoch": 0.1011743450767841, "grad_norm": 0.05717244744300842, "learning_rate": 4.931607263312032e-05, "loss": 0.4047, "step": 28 }, { "epoch": 0.10478771454381211, "grad_norm": 0.05677526444196701, "learning_rate": 4.924632435090696e-05, "loss": 0.4251, "step": 29 }, { "epoch": 0.10840108401084012, "grad_norm": 0.051282044500112534, "learning_rate": 4.917324432875627e-05, "loss": 0.4101, "step": 30 }, { "epoch": 0.1120144534778681, "grad_norm": 0.05558260530233383, "learning_rate": 4.909684260875235e-05, "loss": 0.4425, "step": 31 }, { "epoch": 0.11562782294489611, "grad_norm": 0.05362090840935707, "learning_rate": 4.9017129689421e-05, "loss": 0.383, "step": 32 }, { "epoch": 0.11924119241192412, "grad_norm": 0.050591859966516495, "learning_rate": 4.893411652428712e-05, "loss": 0.3988, "step": 33 }, { "epoch": 0.12285456187895212, "grad_norm": 0.07354583591222763, "learning_rate": 4.8847814520369475e-05, "loss": 0.473, "step": 34 }, { "epoch": 0.12646793134598014, "grad_norm": 0.07448670268058777, "learning_rate": 4.875823553661334e-05, "loss": 0.3609, "step": 35 }, { "epoch": 0.13008130081300814, "grad_norm": 0.09399361908435822, "learning_rate": 4.8665391882260856e-05, "loss": 0.3927, "step": 36 }, { "epoch": 0.13369467028003612, "grad_norm": 0.061091382056474686, "learning_rate": 4.856929631515964e-05, "loss": 0.4512, "step": 37 }, { "epoch": 0.13730803974706413, "grad_norm": 0.06277038156986237, "learning_rate": 4.846996204000967e-05, "loss": 0.3961, "step": 38 }, { "epoch": 0.14092140921409213, "grad_norm": 0.05277445912361145, "learning_rate": 4.8367402706548805e-05, "loss": 0.3885, "step": 39 }, { "epoch": 0.14453477868112014, "grad_norm": 0.06335710734128952, "learning_rate": 4.8261632407677174e-05, "loss": 0.4663, "step": 40 }, { "epoch": 0.14814814814814814, "grad_norm": 0.05149435997009277, "learning_rate": 4.815266567752059e-05, "loss": 0.4012, "step": 41 }, { "epoch": 0.15176151761517614, "grad_norm": 0.052154790610075, "learning_rate": 4.804051748943343e-05, "loss": 0.377, "step": 42 }, { "epoch": 0.15537488708220415, "grad_norm": 0.06229854002594948, "learning_rate": 4.792520325394111e-05, "loss": 0.4677, "step": 43 }, { "epoch": 0.15898825654923215, "grad_norm": 0.050992563366889954, "learning_rate": 4.780673881662242e-05, "loss": 0.4271, "step": 44 }, { "epoch": 0.16260162601626016, "grad_norm": 0.057579364627599716, "learning_rate": 4.7685140455932267e-05, "loss": 0.4096, "step": 45 }, { "epoch": 0.16621499548328816, "grad_norm": 0.05966678634285927, "learning_rate": 4.756042488096471e-05, "loss": 0.4075, "step": 46 }, { "epoch": 0.16982836495031617, "grad_norm": 0.055218473076820374, "learning_rate": 4.743260922915701e-05, "loss": 0.459, "step": 47 }, { "epoch": 0.17344173441734417, "grad_norm": 0.05127694830298424, "learning_rate": 4.730171106393466e-05, "loss": 0.4086, "step": 48 }, { "epoch": 0.17705510388437218, "grad_norm": 0.06519781798124313, "learning_rate": 4.716774837229804e-05, "loss": 0.4418, "step": 49 }, { "epoch": 0.18066847335140018, "grad_norm": 0.05895975977182388, "learning_rate": 4.7030739562350713e-05, "loss": 0.4013, "step": 50 }, { "epoch": 0.1842818428184282, "grad_norm": 0.061492159962654114, "learning_rate": 4.6890703460769955e-05, "loss": 0.3726, "step": 51 }, { "epoch": 0.1878952122854562, "grad_norm": 0.05051853135228157, "learning_rate": 4.674765931021976e-05, "loss": 0.4354, "step": 52 }, { "epoch": 0.1915085817524842, "grad_norm": 0.05664265528321266, "learning_rate": 4.6601626766706626e-05, "loss": 0.4137, "step": 53 }, { "epoch": 0.1951219512195122, "grad_norm": 0.06020362302660942, "learning_rate": 4.645262589687861e-05, "loss": 0.4171, "step": 54 }, { "epoch": 0.1987353206865402, "grad_norm": 0.06303560733795166, "learning_rate": 4.6300677175267914e-05, "loss": 0.3724, "step": 55 }, { "epoch": 0.2023486901535682, "grad_norm": 0.06793845444917679, "learning_rate": 4.614580148147744e-05, "loss": 0.3711, "step": 56 }, { "epoch": 0.20596205962059622, "grad_norm": 0.07107391953468323, "learning_rate": 4.598802009731167e-05, "loss": 0.4428, "step": 57 }, { "epoch": 0.20957542908762422, "grad_norm": 0.06567548215389252, "learning_rate": 4.582735470385229e-05, "loss": 0.3774, "step": 58 }, { "epoch": 0.21318879855465223, "grad_norm": 0.05056913569569588, "learning_rate": 4.5663827378478975e-05, "loss": 0.3584, "step": 59 }, { "epoch": 0.21680216802168023, "grad_norm": 0.08128344267606735, "learning_rate": 4.5497460591835615e-05, "loss": 0.3983, "step": 60 }, { "epoch": 0.2204155374887082, "grad_norm": 0.05856931954622269, "learning_rate": 4.532827720474268e-05, "loss": 0.3486, "step": 61 }, { "epoch": 0.2240289069557362, "grad_norm": 0.05503028631210327, "learning_rate": 4.515630046505575e-05, "loss": 0.3896, "step": 62 }, { "epoch": 0.22764227642276422, "grad_norm": 0.047534190118312836, "learning_rate": 4.498155400447107e-05, "loss": 0.4463, "step": 63 }, { "epoch": 0.23125564588979222, "grad_norm": 0.0638430267572403, "learning_rate": 4.480406183527823e-05, "loss": 0.3977, "step": 64 }, { "epoch": 0.23486901535682023, "grad_norm": 0.04974055290222168, "learning_rate": 4.462384834706058e-05, "loss": 0.3999, "step": 65 }, { "epoch": 0.23848238482384823, "grad_norm": 0.06309591233730316, "learning_rate": 4.4440938303343804e-05, "loss": 0.4275, "step": 66 }, { "epoch": 0.24209575429087624, "grad_norm": 0.05192544683814049, "learning_rate": 4.425535683819312e-05, "loss": 0.4096, "step": 67 }, { "epoch": 0.24570912375790424, "grad_norm": 0.057684604078531265, "learning_rate": 4.406712945275955e-05, "loss": 0.41, "step": 68 }, { "epoch": 0.24932249322493225, "grad_norm": 0.0514802448451519, "learning_rate": 4.387628201177577e-05, "loss": 0.3372, "step": 69 }, { "epoch": 0.2529358626919603, "grad_norm": 0.056559968739748, "learning_rate": 4.368284074000193e-05, "loss": 0.3929, "step": 70 }, { "epoch": 0.2565492321589883, "grad_norm": 0.0645717978477478, "learning_rate": 4.348683221862212e-05, "loss": 0.4353, "step": 71 }, { "epoch": 0.2601626016260163, "grad_norm": 0.08638172596693039, "learning_rate": 4.328828338159173e-05, "loss": 0.3978, "step": 72 }, { "epoch": 0.26377597109304424, "grad_norm": 0.05915065109729767, "learning_rate": 4.3087221511936434e-05, "loss": 0.393, "step": 73 }, { "epoch": 0.26738934056007224, "grad_norm": 0.061671093106269836, "learning_rate": 4.288367423800319e-05, "loss": 0.4187, "step": 74 }, { "epoch": 0.27100271002710025, "grad_norm": 0.07420554012060165, "learning_rate": 4.267766952966369e-05, "loss": 0.3939, "step": 75 }, { "epoch": 0.27461607949412825, "grad_norm": 0.07052630186080933, "learning_rate": 4.2469235694471043e-05, "loss": 0.3435, "step": 76 }, { "epoch": 0.27822944896115626, "grad_norm": 0.06885933130979538, "learning_rate": 4.225840137376993e-05, "loss": 0.4363, "step": 77 }, { "epoch": 0.28184281842818426, "grad_norm": 0.05735473707318306, "learning_rate": 4.204519553876095e-05, "loss": 0.3509, "step": 78 }, { "epoch": 0.28545618789521227, "grad_norm": 0.06102309376001358, "learning_rate": 4.1829647486519596e-05, "loss": 0.3369, "step": 79 }, { "epoch": 0.28906955736224027, "grad_norm": 0.06527422368526459, "learning_rate": 4.161178683597054e-05, "loss": 0.4052, "step": 80 }, { "epoch": 0.2926829268292683, "grad_norm": 0.06578138470649719, "learning_rate": 4.139164352381758e-05, "loss": 0.3586, "step": 81 }, { "epoch": 0.2962962962962963, "grad_norm": 0.05465536564588547, "learning_rate": 4.116924780042997e-05, "loss": 0.3759, "step": 82 }, { "epoch": 0.2999096657633243, "grad_norm": 0.08491545915603638, "learning_rate": 4.094463022568569e-05, "loss": 0.3611, "step": 83 }, { "epoch": 0.3035230352303523, "grad_norm": 0.06035340949892998, "learning_rate": 4.071782166477213e-05, "loss": 0.3537, "step": 84 }, { "epoch": 0.3071364046973803, "grad_norm": 0.06220124289393425, "learning_rate": 4.0488853283944806e-05, "loss": 0.3878, "step": 85 }, { "epoch": 0.3107497741644083, "grad_norm": 0.05434149503707886, "learning_rate": 4.0257756546244804e-05, "loss": 0.3765, "step": 86 }, { "epoch": 0.3143631436314363, "grad_norm": 0.06244641914963722, "learning_rate": 4.0024563207175316e-05, "loss": 0.3668, "step": 87 }, { "epoch": 0.3179765130984643, "grad_norm": 0.08008646965026855, "learning_rate": 3.978930531033807e-05, "loss": 0.3883, "step": 88 }, { "epoch": 0.3215898825654923, "grad_norm": 0.06990881264209747, "learning_rate": 3.9552015183030136e-05, "loss": 0.4611, "step": 89 }, { "epoch": 0.3252032520325203, "grad_norm": 0.05660560727119446, "learning_rate": 3.93127254318018e-05, "loss": 0.3865, "step": 90 }, { "epoch": 0.3288166214995483, "grad_norm": 0.05711934715509415, "learning_rate": 3.907146893797599e-05, "loss": 0.4223, "step": 91 }, { "epoch": 0.3324299909665763, "grad_norm": 0.06767363101243973, "learning_rate": 3.882827885312999e-05, "loss": 0.3481, "step": 92 }, { "epoch": 0.33604336043360433, "grad_norm": 0.05866090953350067, "learning_rate": 3.858318859454001e-05, "loss": 0.4195, "step": 93 }, { "epoch": 0.33965672990063234, "grad_norm": 0.05316139757633209, "learning_rate": 3.833623184058926e-05, "loss": 0.4042, "step": 94 }, { "epoch": 0.34327009936766034, "grad_norm": 0.06730002164840698, "learning_rate": 3.808744252614012e-05, "loss": 0.3717, "step": 95 }, { "epoch": 0.34688346883468835, "grad_norm": 0.07342930138111115, "learning_rate": 3.783685483787105e-05, "loss": 0.4075, "step": 96 }, { "epoch": 0.35049683830171635, "grad_norm": 0.07083098590373993, "learning_rate": 3.758450320957899e-05, "loss": 0.3864, "step": 97 }, { "epoch": 0.35411020776874436, "grad_norm": 0.07677371054887772, "learning_rate": 3.7330422317447685e-05, "loss": 0.393, "step": 98 }, { "epoch": 0.35772357723577236, "grad_norm": 0.0808129534125328, "learning_rate": 3.707464707528275e-05, "loss": 0.3801, "step": 99 }, { "epoch": 0.36133694670280037, "grad_norm": 0.06672363728284836, "learning_rate": 3.681721262971413e-05, "loss": 0.4472, "step": 100 }, { "epoch": 0.36495031616982837, "grad_norm": 0.05534950643777847, "learning_rate": 3.6558154355366506e-05, "loss": 0.3683, "step": 101 }, { "epoch": 0.3685636856368564, "grad_norm": 0.06686428934335709, "learning_rate": 3.6297507849998344e-05, "loss": 0.3455, "step": 102 }, { "epoch": 0.3721770551038844, "grad_norm": 0.07248938828706741, "learning_rate": 3.6035308929610446e-05, "loss": 0.4083, "step": 103 }, { "epoch": 0.3757904245709124, "grad_norm": 0.06316327303647995, "learning_rate": 3.5771593623524265e-05, "loss": 0.3661, "step": 104 }, { "epoch": 0.3794037940379404, "grad_norm": 0.08561142534017563, "learning_rate": 3.550639816943111e-05, "loss": 0.3693, "step": 105 }, { "epoch": 0.3830171635049684, "grad_norm": 0.05884739011526108, "learning_rate": 3.5239759008412666e-05, "loss": 0.4326, "step": 106 }, { "epoch": 0.3866305329719964, "grad_norm": 0.06861259788274765, "learning_rate": 3.497171277993346e-05, "loss": 0.3423, "step": 107 }, { "epoch": 0.3902439024390244, "grad_norm": 0.06908590346574783, "learning_rate": 3.4702296316806244e-05, "loss": 0.4494, "step": 108 }, { "epoch": 0.3938572719060524, "grad_norm": 0.07454199343919754, "learning_rate": 3.443154664013067e-05, "loss": 0.4488, "step": 109 }, { "epoch": 0.3974706413730804, "grad_norm": 0.07938794046640396, "learning_rate": 3.415950095420616e-05, "loss": 0.3938, "step": 110 }, { "epoch": 0.4010840108401084, "grad_norm": 0.08505871146917343, "learning_rate": 3.3886196641419545e-05, "loss": 0.4004, "step": 111 }, { "epoch": 0.4046973803071364, "grad_norm": 0.0625777617096901, "learning_rate": 3.361167125710832e-05, "loss": 0.3863, "step": 112 }, { "epoch": 0.4083107497741644, "grad_norm": 0.07772816717624664, "learning_rate": 3.333596252440008e-05, "loss": 0.3981, "step": 113 }, { "epoch": 0.41192411924119243, "grad_norm": 0.06656523048877716, "learning_rate": 3.305910832902884e-05, "loss": 0.3705, "step": 114 }, { "epoch": 0.41553748870822044, "grad_norm": 0.07238256186246872, "learning_rate": 3.278114671412917e-05, "loss": 0.412, "step": 115 }, { "epoch": 0.41915085817524844, "grad_norm": 0.06601731479167938, "learning_rate": 3.2502115875008524e-05, "loss": 0.3716, "step": 116 }, { "epoch": 0.42276422764227645, "grad_norm": 0.0684824138879776, "learning_rate": 3.222205415389877e-05, "loss": 0.4183, "step": 117 }, { "epoch": 0.42637759710930445, "grad_norm": 0.0698830783367157, "learning_rate": 3.1941000034687515e-05, "loss": 0.3517, "step": 118 }, { "epoch": 0.42999096657633246, "grad_norm": 0.05978047475218773, "learning_rate": 3.165899213762995e-05, "loss": 0.3852, "step": 119 }, { "epoch": 0.43360433604336046, "grad_norm": 0.07572682201862335, "learning_rate": 3.1376069214041913e-05, "loss": 0.4022, "step": 120 }, { "epoch": 0.4372177055103884, "grad_norm": 0.07104960829019547, "learning_rate": 3.109227014097505e-05, "loss": 0.4185, "step": 121 }, { "epoch": 0.4408310749774164, "grad_norm": 0.06828156113624573, "learning_rate": 3.0807633915874584e-05, "loss": 0.4239, "step": 122 }, { "epoch": 0.4444444444444444, "grad_norm": 0.057690802961587906, "learning_rate": 3.052219965122062e-05, "loss": 0.4109, "step": 123 }, { "epoch": 0.4480578139114724, "grad_norm": 0.06580954045057297, "learning_rate": 3.0236006569153617e-05, "loss": 0.359, "step": 124 }, { "epoch": 0.45167118337850043, "grad_norm": 0.060349613428115845, "learning_rate": 2.9949093996084747e-05, "loss": 0.3775, "step": 125 }, { "epoch": 0.45528455284552843, "grad_norm": 0.07335729151964188, "learning_rate": 2.9661501357292033e-05, "loss": 0.4043, "step": 126 }, { "epoch": 0.45889792231255644, "grad_norm": 0.04954389110207558, "learning_rate": 2.9373268171502777e-05, "loss": 0.3537, "step": 127 }, { "epoch": 0.46251129177958444, "grad_norm": 0.07528957724571228, "learning_rate": 2.9084434045463255e-05, "loss": 0.467, "step": 128 }, { "epoch": 0.46612466124661245, "grad_norm": 0.06106121093034744, "learning_rate": 2.8795038668496222e-05, "loss": 0.4323, "step": 129 }, { "epoch": 0.46973803071364045, "grad_norm": 0.08181653916835785, "learning_rate": 2.850512180704715e-05, "loss": 0.4208, "step": 130 }, { "epoch": 0.47335140018066846, "grad_norm": 0.07354505360126495, "learning_rate": 2.821472329921981e-05, "loss": 0.3909, "step": 131 }, { "epoch": 0.47696476964769646, "grad_norm": 0.09099866449832916, "learning_rate": 2.792388304930207e-05, "loss": 0.4296, "step": 132 }, { "epoch": 0.48057813911472447, "grad_norm": 0.08062151074409485, "learning_rate": 2.7632641022282502e-05, "loss": 0.4106, "step": 133 }, { "epoch": 0.48419150858175247, "grad_norm": 0.09198120981454849, "learning_rate": 2.7341037238358774e-05, "loss": 0.4064, "step": 134 }, { "epoch": 0.4878048780487805, "grad_norm": 0.05343058705329895, "learning_rate": 2.704911176743833e-05, "loss": 0.404, "step": 135 }, { "epoch": 0.4914182475158085, "grad_norm": 0.0657978504896164, "learning_rate": 2.6756904723632324e-05, "loss": 0.3993, "step": 136 }, { "epoch": 0.4950316169828365, "grad_norm": 0.057678401470184326, "learning_rate": 2.646445625974347e-05, "loss": 0.3804, "step": 137 }, { "epoch": 0.4986449864498645, "grad_norm": 0.06898088753223419, "learning_rate": 2.6171806561748502e-05, "loss": 0.4452, "step": 138 }, { "epoch": 0.5022583559168925, "grad_norm": 0.09333262592554092, "learning_rate": 2.5878995843276204e-05, "loss": 0.3304, "step": 139 }, { "epoch": 0.5058717253839206, "grad_norm": 0.06717183440923691, "learning_rate": 2.5586064340081516e-05, "loss": 0.326, "step": 140 }, { "epoch": 0.5094850948509485, "grad_norm": 0.06729979068040848, "learning_rate": 2.529305230451666e-05, "loss": 0.3934, "step": 141 }, { "epoch": 0.5130984643179766, "grad_norm": 0.09550358355045319, "learning_rate": 2.5e-05, "loss": 0.4733, "step": 142 }, { "epoch": 0.5167118337850045, "grad_norm": 0.07080523669719696, "learning_rate": 2.4706947695483348e-05, "loss": 0.4039, "step": 143 }, { "epoch": 0.5203252032520326, "grad_norm": 0.055423106998205185, "learning_rate": 2.441393565991849e-05, "loss": 0.3275, "step": 144 }, { "epoch": 0.5239385727190605, "grad_norm": 0.06483904272317886, "learning_rate": 2.4121004156723802e-05, "loss": 0.4377, "step": 145 }, { "epoch": 0.5275519421860885, "grad_norm": 0.06614437699317932, "learning_rate": 2.3828193438251497e-05, "loss": 0.3935, "step": 146 }, { "epoch": 0.5311653116531165, "grad_norm": 0.08745498955249786, "learning_rate": 2.3535543740256536e-05, "loss": 0.4348, "step": 147 }, { "epoch": 0.5347786811201445, "grad_norm": 0.07158234715461731, "learning_rate": 2.3243095276367685e-05, "loss": 0.3286, "step": 148 }, { "epoch": 0.5383920505871725, "grad_norm": 0.06448652595281601, "learning_rate": 2.2950888232561672e-05, "loss": 0.4108, "step": 149 }, { "epoch": 0.5420054200542005, "grad_norm": 0.07621192187070847, "learning_rate": 2.2658962761641232e-05, "loss": 0.4317, "step": 150 }, { "epoch": 0.5456187895212286, "grad_norm": 0.07459475100040436, "learning_rate": 2.23673589777175e-05, "loss": 0.3876, "step": 151 }, { "epoch": 0.5492321589882565, "grad_norm": 0.07355853170156479, "learning_rate": 2.207611695069794e-05, "loss": 0.3506, "step": 152 }, { "epoch": 0.5528455284552846, "grad_norm": 0.07565652579069138, "learning_rate": 2.17852767007802e-05, "loss": 0.4221, "step": 153 }, { "epoch": 0.5564588979223125, "grad_norm": 0.07433846592903137, "learning_rate": 2.1494878192952855e-05, "loss": 0.3913, "step": 154 }, { "epoch": 0.5600722673893406, "grad_norm": 0.07123446464538574, "learning_rate": 2.1204961331503787e-05, "loss": 0.4106, "step": 155 }, { "epoch": 0.5636856368563685, "grad_norm": 0.0848294198513031, "learning_rate": 2.0915565954536744e-05, "loss": 0.3171, "step": 156 }, { "epoch": 0.5672990063233966, "grad_norm": 0.06394634395837784, "learning_rate": 2.0626731828497225e-05, "loss": 0.4106, "step": 157 }, { "epoch": 0.5709123757904245, "grad_norm": 0.06601906567811966, "learning_rate": 2.0338498642707977e-05, "loss": 0.3651, "step": 158 }, { "epoch": 0.5745257452574526, "grad_norm": 0.0734376311302185, "learning_rate": 2.005090600391526e-05, "loss": 0.3906, "step": 159 }, { "epoch": 0.5781391147244805, "grad_norm": 0.07122786343097687, "learning_rate": 1.9763993430846395e-05, "loss": 0.4157, "step": 160 }, { "epoch": 0.5817524841915086, "grad_norm": 0.06590158492326736, "learning_rate": 1.947780034877938e-05, "loss": 0.4267, "step": 161 }, { "epoch": 0.5853658536585366, "grad_norm": 0.07380690425634384, "learning_rate": 1.9192366084125425e-05, "loss": 0.3748, "step": 162 }, { "epoch": 0.5889792231255646, "grad_norm": 0.054361093789339066, "learning_rate": 1.890772985902496e-05, "loss": 0.3637, "step": 163 }, { "epoch": 0.5925925925925926, "grad_norm": 0.06896340101957321, "learning_rate": 1.8623930785958092e-05, "loss": 0.4319, "step": 164 }, { "epoch": 0.5962059620596206, "grad_norm": 0.08140537887811661, "learning_rate": 1.8341007862370056e-05, "loss": 0.3942, "step": 165 }, { "epoch": 0.5998193315266486, "grad_norm": 0.07021729648113251, "learning_rate": 1.8058999965312484e-05, "loss": 0.3917, "step": 166 }, { "epoch": 0.6034327009936766, "grad_norm": 0.06319273263216019, "learning_rate": 1.777794584610124e-05, "loss": 0.3833, "step": 167 }, { "epoch": 0.6070460704607046, "grad_norm": 0.07088933885097504, "learning_rate": 1.749788412499149e-05, "loss": 0.3326, "step": 168 }, { "epoch": 0.6106594399277326, "grad_norm": 0.06848324090242386, "learning_rate": 1.721885328587083e-05, "loss": 0.5018, "step": 169 }, { "epoch": 0.6142728093947606, "grad_norm": 0.07163573056459427, "learning_rate": 1.694089167097116e-05, "loss": 0.3624, "step": 170 }, { "epoch": 0.6178861788617886, "grad_norm": 0.06683260202407837, "learning_rate": 1.6664037475599923e-05, "loss": 0.4198, "step": 171 }, { "epoch": 0.6214995483288166, "grad_norm": 0.06273495405912399, "learning_rate": 1.638832874289168e-05, "loss": 0.3388, "step": 172 }, { "epoch": 0.6251129177958447, "grad_norm": 0.06024303659796715, "learning_rate": 1.611380335858047e-05, "loss": 0.4156, "step": 173 }, { "epoch": 0.6287262872628726, "grad_norm": 0.08732262253761292, "learning_rate": 1.5840499045793843e-05, "loss": 0.3883, "step": 174 }, { "epoch": 0.6323396567299007, "grad_norm": 0.06800790876150131, "learning_rate": 1.5568453359869334e-05, "loss": 0.3636, "step": 175 }, { "epoch": 0.6359530261969286, "grad_norm": 0.08514184504747391, "learning_rate": 1.5297703683193752e-05, "loss": 0.3664, "step": 176 }, { "epoch": 0.6395663956639567, "grad_norm": 0.0805889442563057, "learning_rate": 1.502828722006655e-05, "loss": 0.3912, "step": 177 }, { "epoch": 0.6431797651309846, "grad_norm": 0.07321416586637497, "learning_rate": 1.4760240991587337e-05, "loss": 0.4077, "step": 178 }, { "epoch": 0.6467931345980127, "grad_norm": 0.06993624567985535, "learning_rate": 1.4493601830568887e-05, "loss": 0.3728, "step": 179 }, { "epoch": 0.6504065040650406, "grad_norm": 0.07736963033676147, "learning_rate": 1.4228406376475742e-05, "loss": 0.3644, "step": 180 }, { "epoch": 0.6540198735320687, "grad_norm": 0.06840698421001434, "learning_rate": 1.396469107038956e-05, "loss": 0.3936, "step": 181 }, { "epoch": 0.6576332429990966, "grad_norm": 0.07498890906572342, "learning_rate": 1.3702492150001659e-05, "loss": 0.3948, "step": 182 }, { "epoch": 0.6612466124661247, "grad_norm": 0.06307978183031082, "learning_rate": 1.34418456446335e-05, "loss": 0.398, "step": 183 }, { "epoch": 0.6648599819331527, "grad_norm": 0.0843866616487503, "learning_rate": 1.3182787370285865e-05, "loss": 0.3891, "step": 184 }, { "epoch": 0.6684733514001807, "grad_norm": 0.07880077511072159, "learning_rate": 1.292535292471726e-05, "loss": 0.3812, "step": 185 }, { "epoch": 0.6720867208672087, "grad_norm": 0.06986968219280243, "learning_rate": 1.2669577682552319e-05, "loss": 0.3851, "step": 186 }, { "epoch": 0.6757000903342367, "grad_norm": 0.07602784037590027, "learning_rate": 1.2415496790421011e-05, "loss": 0.3956, "step": 187 }, { "epoch": 0.6793134598012647, "grad_norm": 0.06611546874046326, "learning_rate": 1.2163145162128947e-05, "loss": 0.3629, "step": 188 }, { "epoch": 0.6829268292682927, "grad_norm": 0.07958898693323135, "learning_rate": 1.1912557473859895e-05, "loss": 0.3647, "step": 189 }, { "epoch": 0.6865401987353207, "grad_norm": 0.06264237314462662, "learning_rate": 1.1663768159410748e-05, "loss": 0.3797, "step": 190 }, { "epoch": 0.6901535682023487, "grad_norm": 0.08303744345903397, "learning_rate": 1.1416811405459993e-05, "loss": 0.3754, "step": 191 }, { "epoch": 0.6937669376693767, "grad_norm": 0.07206673175096512, "learning_rate": 1.1171721146870015e-05, "loss": 0.327, "step": 192 }, { "epoch": 0.6973803071364046, "grad_norm": 0.06349314749240875, "learning_rate": 1.0928531062024017e-05, "loss": 0.3902, "step": 193 }, { "epoch": 0.7009936766034327, "grad_norm": 0.07241489738225937, "learning_rate": 1.0687274568198208e-05, "loss": 0.3845, "step": 194 }, { "epoch": 0.7046070460704607, "grad_norm": 0.06357239931821823, "learning_rate": 1.0447984816969874e-05, "loss": 0.3881, "step": 195 }, { "epoch": 0.7082204155374887, "grad_norm": 0.06316613405942917, "learning_rate": 1.021069468966194e-05, "loss": 0.4735, "step": 196 }, { "epoch": 0.7118337850045167, "grad_norm": 0.08076903223991394, "learning_rate": 9.975436792824691e-06, "loss": 0.43, "step": 197 }, { "epoch": 0.7154471544715447, "grad_norm": 0.0836021676659584, "learning_rate": 9.742243453755202e-06, "loss": 0.3818, "step": 198 }, { "epoch": 0.7190605239385727, "grad_norm": 0.0713673084974289, "learning_rate": 9.5111467160552e-06, "loss": 0.3846, "step": 199 }, { "epoch": 0.7226738934056007, "grad_norm": 0.08711904287338257, "learning_rate": 9.282178335227884e-06, "loss": 0.4817, "step": 200 }, { "epoch": 0.7262872628726287, "grad_norm": 0.05264454334974289, "learning_rate": 9.05536977431431e-06, "loss": 0.3995, "step": 201 }, { "epoch": 0.7299006323396567, "grad_norm": 0.07466941326856613, "learning_rate": 8.830752199570033e-06, "loss": 0.3718, "step": 202 }, { "epoch": 0.7335140018066847, "grad_norm": 0.07776648551225662, "learning_rate": 8.608356476182424e-06, "loss": 0.4786, "step": 203 }, { "epoch": 0.7371273712737128, "grad_norm": 0.06611160188913345, "learning_rate": 8.38821316402946e-06, "loss": 0.3668, "step": 204 }, { "epoch": 0.7407407407407407, "grad_norm": 0.07174837589263916, "learning_rate": 8.170352513480408e-06, "loss": 0.4016, "step": 205 }, { "epoch": 0.7443541102077688, "grad_norm": 0.0830477848649025, "learning_rate": 7.954804461239053e-06, "loss": 0.4162, "step": 206 }, { "epoch": 0.7479674796747967, "grad_norm": 0.08300362527370453, "learning_rate": 7.741598626230079e-06, "loss": 0.3738, "step": 207 }, { "epoch": 0.7515808491418248, "grad_norm": 0.07526036351919174, "learning_rate": 7.530764305528959e-06, "loss": 0.3576, "step": 208 }, { "epoch": 0.7551942186088527, "grad_norm": 0.06786955147981644, "learning_rate": 7.3223304703363135e-06, "loss": 0.4152, "step": 209 }, { "epoch": 0.7588075880758808, "grad_norm": 0.08544765412807465, "learning_rate": 7.116325761996817e-06, "loss": 0.3735, "step": 210 }, { "epoch": 0.7624209575429087, "grad_norm": 0.06077965721487999, "learning_rate": 6.91277848806356e-06, "loss": 0.3486, "step": 211 }, { "epoch": 0.7660343270099368, "grad_norm": 0.07332652807235718, "learning_rate": 6.711716618408281e-06, "loss": 0.3734, "step": 212 }, { "epoch": 0.7696476964769647, "grad_norm": 0.07848729193210602, "learning_rate": 6.513167781377885e-06, "loss": 0.4231, "step": 213 }, { "epoch": 0.7732610659439928, "grad_norm": 0.07897993177175522, "learning_rate": 6.317159259998073e-06, "loss": 0.3513, "step": 214 }, { "epoch": 0.7768744354110207, "grad_norm": 0.07235241681337357, "learning_rate": 6.123717988224237e-06, "loss": 0.4069, "step": 215 }, { "epoch": 0.7804878048780488, "grad_norm": 0.09085345268249512, "learning_rate": 5.932870547240454e-06, "loss": 0.3849, "step": 216 }, { "epoch": 0.7841011743450768, "grad_norm": 0.07704368233680725, "learning_rate": 5.74464316180689e-06, "loss": 0.4261, "step": 217 }, { "epoch": 0.7877145438121048, "grad_norm": 0.057720448821783066, "learning_rate": 5.559061696656198e-06, "loss": 0.3711, "step": 218 }, { "epoch": 0.7913279132791328, "grad_norm": 0.06448069959878922, "learning_rate": 5.37615165293942e-06, "loss": 0.4027, "step": 219 }, { "epoch": 0.7949412827461608, "grad_norm": 0.08539154380559921, "learning_rate": 5.1959381647217666e-06, "loss": 0.388, "step": 220 }, { "epoch": 0.7985546522131888, "grad_norm": 0.07000590115785599, "learning_rate": 5.018445995528931e-06, "loss": 0.4122, "step": 221 }, { "epoch": 0.8021680216802168, "grad_norm": 0.07643178850412369, "learning_rate": 4.843699534944257e-06, "loss": 0.3749, "step": 222 }, { "epoch": 0.8057813911472448, "grad_norm": 0.06629081815481186, "learning_rate": 4.671722795257327e-06, "loss": 0.3817, "step": 223 }, { "epoch": 0.8093947606142728, "grad_norm": 0.06171542406082153, "learning_rate": 4.502539408164386e-06, "loss": 0.3474, "step": 224 }, { "epoch": 0.8130081300813008, "grad_norm": 0.06734922528266907, "learning_rate": 4.336172621521034e-06, "loss": 0.3328, "step": 225 }, { "epoch": 0.8166214995483289, "grad_norm": 0.09524697810411453, "learning_rate": 4.1726452961477146e-06, "loss": 0.3433, "step": 226 }, { "epoch": 0.8202348690153568, "grad_norm": 0.06357850879430771, "learning_rate": 4.01197990268834e-06, "loss": 0.3992, "step": 227 }, { "epoch": 0.8238482384823849, "grad_norm": 0.07560393214225769, "learning_rate": 3.8541985185225645e-06, "loss": 0.3575, "step": 228 }, { "epoch": 0.8274616079494128, "grad_norm": 0.06906560808420181, "learning_rate": 3.6993228247320877e-06, "loss": 0.3287, "step": 229 }, { "epoch": 0.8310749774164409, "grad_norm": 0.08411566913127899, "learning_rate": 3.547374103121398e-06, "loss": 0.4115, "step": 230 }, { "epoch": 0.8346883468834688, "grad_norm": 0.08515972644090652, "learning_rate": 3.398373233293378e-06, "loss": 0.3709, "step": 231 }, { "epoch": 0.8383017163504969, "grad_norm": 0.06780155003070831, "learning_rate": 3.252340689780245e-06, "loss": 0.3599, "step": 232 }, { "epoch": 0.8419150858175248, "grad_norm": 0.08019706606864929, "learning_rate": 3.1092965392300417e-06, "loss": 0.3869, "step": 233 }, { "epoch": 0.8455284552845529, "grad_norm": 0.0702086016535759, "learning_rate": 2.969260437649293e-06, "loss": 0.3846, "step": 234 }, { "epoch": 0.8491418247515808, "grad_norm": 0.0851154550909996, "learning_rate": 2.8322516277019624e-06, "loss": 0.3434, "step": 235 }, { "epoch": 0.8527551942186089, "grad_norm": 0.06722518056631088, "learning_rate": 2.6982889360653377e-06, "loss": 0.3349, "step": 236 }, { "epoch": 0.8563685636856369, "grad_norm": 0.06803542375564575, "learning_rate": 2.5673907708429976e-06, "loss": 0.3526, "step": 237 }, { "epoch": 0.8599819331526649, "grad_norm": 0.08029063045978546, "learning_rate": 2.4395751190352924e-06, "loss": 0.4286, "step": 238 }, { "epoch": 0.8635953026196929, "grad_norm": 0.08042778819799423, "learning_rate": 2.3148595440677405e-06, "loss": 0.3739, "step": 239 }, { "epoch": 0.8672086720867209, "grad_norm": 0.07175204902887344, "learning_rate": 2.1932611833775846e-06, "loss": 0.4156, "step": 240 }, { "epoch": 0.8708220415537489, "grad_norm": 0.058878783136606216, "learning_rate": 2.074796746058896e-06, "loss": 0.3636, "step": 241 }, { "epoch": 0.8744354110207768, "grad_norm": 0.08569607883691788, "learning_rate": 1.9594825105665654e-06, "loss": 0.3889, "step": 242 }, { "epoch": 0.8780487804878049, "grad_norm": 0.07353324443101883, "learning_rate": 1.847334322479413e-06, "loss": 0.4352, "step": 243 }, { "epoch": 0.8816621499548328, "grad_norm": 0.07135035842657089, "learning_rate": 1.738367592322837e-06, "loss": 0.4265, "step": 244 }, { "epoch": 0.8852755194218609, "grad_norm": 0.06918162852525711, "learning_rate": 1.6325972934512018e-06, "loss": 0.4295, "step": 245 }, { "epoch": 0.8888888888888888, "grad_norm": 0.07300789654254913, "learning_rate": 1.5300379599903409e-06, "loss": 0.4226, "step": 246 }, { "epoch": 0.8925022583559169, "grad_norm": 0.06973148882389069, "learning_rate": 1.4307036848403648e-06, "loss": 0.3368, "step": 247 }, { "epoch": 0.8961156278229448, "grad_norm": 0.07200148701667786, "learning_rate": 1.3346081177391472e-06, "loss": 0.3924, "step": 248 }, { "epoch": 0.8997289972899729, "grad_norm": 0.07833510637283325, "learning_rate": 1.2417644633866632e-06, "loss": 0.3274, "step": 249 }, { "epoch": 0.9033423667570009, "grad_norm": 0.061651114374399185, "learning_rate": 1.1521854796305242e-06, "loss": 0.3705, "step": 250 }, { "epoch": 0.9069557362240289, "grad_norm": 0.07440148293972015, "learning_rate": 1.0658834757128838e-06, "loss": 0.3715, "step": 251 }, { "epoch": 0.9105691056910569, "grad_norm": 0.0720466673374176, "learning_rate": 9.828703105789983e-07, "loss": 0.3361, "step": 252 }, { "epoch": 0.9141824751580849, "grad_norm": 0.08179104328155518, "learning_rate": 9.031573912476554e-07, "loss": 0.3393, "step": 253 }, { "epoch": 0.9177958446251129, "grad_norm": 0.058865226805210114, "learning_rate": 8.267556712437341e-07, "loss": 0.4249, "step": 254 }, { "epoch": 0.9214092140921409, "grad_norm": 0.07929901778697968, "learning_rate": 7.536756490930358e-07, "loss": 0.4341, "step": 255 }, { "epoch": 0.9250225835591689, "grad_norm": 0.07914505153894424, "learning_rate": 6.839273668796747e-07, "loss": 0.3942, "step": 256 }, { "epoch": 0.928635953026197, "grad_norm": 0.08146975934505463, "learning_rate": 6.175204088661485e-07, "loss": 0.3562, "step": 257 }, { "epoch": 0.9322493224932249, "grad_norm": 0.08726157248020172, "learning_rate": 5.544639001763718e-07, "loss": 0.4314, "step": 258 }, { "epoch": 0.935862691960253, "grad_norm": 0.09031800180673599, "learning_rate": 4.947665055417605e-07, "loss": 0.3842, "step": 259 }, { "epoch": 0.9394760614272809, "grad_norm": 0.0922897681593895, "learning_rate": 4.3843642811059737e-07, "loss": 0.3285, "step": 260 }, { "epoch": 0.943089430894309, "grad_norm": 0.07188927382230759, "learning_rate": 3.854814083208064e-07, "loss": 0.3839, "step": 261 }, { "epoch": 0.9467028003613369, "grad_norm": 0.08181816339492798, "learning_rate": 3.3590872283633944e-07, "loss": 0.3651, "step": 262 }, { "epoch": 0.950316169828365, "grad_norm": 0.0699373111128807, "learning_rate": 2.8972518354725977e-07, "loss": 0.457, "step": 263 }, { "epoch": 0.9539295392953929, "grad_norm": 0.08292391151189804, "learning_rate": 2.4693713663372644e-07, "loss": 0.4105, "step": 264 }, { "epoch": 0.957542908762421, "grad_norm": 0.07387669384479523, "learning_rate": 2.0755046169392e-07, "loss": 0.3846, "step": 265 }, { "epoch": 0.9611562782294489, "grad_norm": 0.08278100937604904, "learning_rate": 1.7157057093614703e-07, "loss": 0.4334, "step": 266 }, { "epoch": 0.964769647696477, "grad_norm": 0.06216645613312721, "learning_rate": 1.3900240843510993e-07, "loss": 0.4007, "step": 267 }, { "epoch": 0.9683830171635049, "grad_norm": 0.07292906939983368, "learning_rate": 1.0985044945254764e-07, "loss": 0.4152, "step": 268 }, { "epoch": 0.971996386630533, "grad_norm": 0.07897216826677322, "learning_rate": 8.411869982228038e-08, "loss": 0.3954, "step": 269 }, { "epoch": 0.975609756097561, "grad_norm": 0.0776594951748848, "learning_rate": 6.181069539974716e-08, "loss": 0.3449, "step": 270 }, { "epoch": 0.979223125564589, "grad_norm": 0.07104814052581787, "learning_rate": 4.292950157614717e-08, "loss": 0.3476, "step": 271 }, { "epoch": 0.982836495031617, "grad_norm": 0.07420724630355835, "learning_rate": 2.7477712857215677e-08, "loss": 0.4095, "step": 272 }, { "epoch": 0.986449864498645, "grad_norm": 0.06806948781013489, "learning_rate": 1.5457452506698056e-08, "loss": 0.3879, "step": 273 }, { "epoch": 0.990063233965673, "grad_norm": 0.08909036219120026, "learning_rate": 6.870372254602631e-09, "loss": 0.3327, "step": 274 }, { "epoch": 0.993676603432701, "grad_norm": 0.07509468495845795, "learning_rate": 1.7176520702238964e-09, "loss": 0.4033, "step": 275 }, { "epoch": 0.997289972899729, "grad_norm": 0.06269805878400803, "learning_rate": 0.0, "loss": 0.4076, "step": 276 }, { "epoch": 0.997289972899729, "eval_loss": 0.35787180066108704, "eval_runtime": 515.6409, "eval_samples_per_second": 1.422, "eval_steps_per_second": 0.357, "step": 276 } ], "logging_steps": 1, "max_steps": 276, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.247726843172225e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }