{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.997289972899729,
  "eval_steps": 500,
  "global_step": 276,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0036133694670280035,
      "grad_norm": 0.3959366977214813,
      "learning_rate": 6.25e-06,
      "loss": 0.9323,
      "step": 1
    },
    {
      "epoch": 0.007226738934056007,
      "grad_norm": 0.45551198720932007,
      "learning_rate": 1.25e-05,
      "loss": 1.0507,
      "step": 2
    },
    {
      "epoch": 0.01084010840108401,
      "grad_norm": 0.2823091745376587,
      "learning_rate": 1.8750000000000002e-05,
      "loss": 0.8491,
      "step": 3
    },
    {
      "epoch": 0.014453477868112014,
      "grad_norm": 0.46047303080558777,
      "learning_rate": 2.5e-05,
      "loss": 1.0142,
      "step": 4
    },
    {
      "epoch": 0.018066847335140017,
      "grad_norm": 0.4086349606513977,
      "learning_rate": 3.125e-05,
      "loss": 0.947,
      "step": 5
    },
    {
      "epoch": 0.02168021680216802,
      "grad_norm": 0.457003116607666,
      "learning_rate": 3.7500000000000003e-05,
      "loss": 0.9485,
      "step": 6
    },
    {
      "epoch": 0.025293586269196026,
      "grad_norm": 0.35562458634376526,
      "learning_rate": 4.375e-05,
      "loss": 0.8449,
      "step": 7
    },
    {
      "epoch": 0.028906955736224028,
      "grad_norm": 0.33805516362190247,
      "learning_rate": 5e-05,
      "loss": 0.7379,
      "step": 8
    },
    {
      "epoch": 0.032520325203252036,
      "grad_norm": 0.3412623703479767,
      "learning_rate": 4.9998282347929784e-05,
      "loss": 0.6282,
      "step": 9
    },
    {
      "epoch": 0.036133694670280034,
      "grad_norm": 0.2843680679798126,
      "learning_rate": 4.99931296277454e-05,
      "loss": 0.5503,
      "step": 10
    },
    {
      "epoch": 0.03974706413730804,
      "grad_norm": 0.17628777027130127,
      "learning_rate": 4.998454254749331e-05,
      "loss": 0.512,
      "step": 11
    },
    {
      "epoch": 0.04336043360433604,
      "grad_norm": 0.19055013358592987,
      "learning_rate": 4.997252228714279e-05,
      "loss": 0.5397,
      "step": 12
    },
    {
      "epoch": 0.04697380307136405,
      "grad_norm": 0.08906977623701096,
      "learning_rate": 4.9957070498423854e-05,
      "loss": 0.5458,
      "step": 13
    },
    {
      "epoch": 0.05058717253839205,
      "grad_norm": 0.0917251780629158,
      "learning_rate": 4.993818930460026e-05,
      "loss": 0.5269,
      "step": 14
    },
    {
      "epoch": 0.05420054200542006,
      "grad_norm": 0.0985497236251831,
      "learning_rate": 4.9915881300177725e-05,
      "loss": 0.4135,
      "step": 15
    },
    {
      "epoch": 0.057813911472448055,
      "grad_norm": 0.1111132949590683,
      "learning_rate": 4.9890149550547454e-05,
      "loss": 0.5064,
      "step": 16
    },
    {
      "epoch": 0.06142728093947606,
      "grad_norm": 0.0649256557226181,
      "learning_rate": 4.98609975915649e-05,
      "loss": 0.4804,
      "step": 17
    },
    {
      "epoch": 0.06504065040650407,
      "grad_norm": 0.09687516838312149,
      "learning_rate": 4.982842942906386e-05,
      "loss": 0.3706,
      "step": 18
    },
    {
      "epoch": 0.06865401987353206,
      "grad_norm": 0.14679567515850067,
      "learning_rate": 4.979244953830608e-05,
      "loss": 0.4105,
      "step": 19
    },
    {
      "epoch": 0.07226738934056007,
      "grad_norm": 0.14155593514442444,
      "learning_rate": 4.9753062863366276e-05,
      "loss": 0.4886,
      "step": 20
    },
    {
      "epoch": 0.07588075880758807,
      "grad_norm": 0.14684930443763733,
      "learning_rate": 4.971027481645274e-05,
      "loss": 0.4044,
      "step": 21
    },
    {
      "epoch": 0.07949412827461608,
      "grad_norm": 0.11222010105848312,
      "learning_rate": 4.966409127716367e-05,
      "loss": 0.4361,
      "step": 22
    },
    {
      "epoch": 0.08310749774164408,
      "grad_norm": 0.058118775486946106,
      "learning_rate": 4.96145185916792e-05,
      "loss": 0.4176,
      "step": 23
    },
    {
      "epoch": 0.08672086720867209,
      "grad_norm": 0.06764644384384155,
      "learning_rate": 4.95615635718894e-05,
      "loss": 0.4683,
      "step": 24
    },
    {
      "epoch": 0.09033423667570009,
      "grad_norm": 0.06886276602745056,
      "learning_rate": 4.950523349445824e-05,
      "loss": 0.418,
      "step": 25
    },
    {
      "epoch": 0.0939476061427281,
      "grad_norm": 0.0706636980175972,
      "learning_rate": 4.944553609982363e-05,
      "loss": 0.3967,
      "step": 26
    },
    {
      "epoch": 0.0975609756097561,
      "grad_norm": 0.04914792627096176,
      "learning_rate": 4.938247959113386e-05,
      "loss": 0.4623,
      "step": 27
    },
    {
      "epoch": 0.1011743450767841,
      "grad_norm": 0.05717244744300842,
      "learning_rate": 4.931607263312032e-05,
      "loss": 0.4047,
      "step": 28
    },
    {
      "epoch": 0.10478771454381211,
      "grad_norm": 0.05677526444196701,
      "learning_rate": 4.924632435090696e-05,
      "loss": 0.4251,
      "step": 29
    },
    {
      "epoch": 0.10840108401084012,
      "grad_norm": 0.051282044500112534,
      "learning_rate": 4.917324432875627e-05,
      "loss": 0.4101,
      "step": 30
    },
    {
      "epoch": 0.1120144534778681,
      "grad_norm": 0.05558260530233383,
      "learning_rate": 4.909684260875235e-05,
      "loss": 0.4425,
      "step": 31
    },
    {
      "epoch": 0.11562782294489611,
      "grad_norm": 0.05362090840935707,
      "learning_rate": 4.9017129689421e-05,
      "loss": 0.383,
      "step": 32
    },
    {
      "epoch": 0.11924119241192412,
      "grad_norm": 0.050591859966516495,
      "learning_rate": 4.893411652428712e-05,
      "loss": 0.3988,
      "step": 33
    },
    {
      "epoch": 0.12285456187895212,
      "grad_norm": 0.07354583591222763,
      "learning_rate": 4.8847814520369475e-05,
      "loss": 0.473,
      "step": 34
    },
    {
      "epoch": 0.12646793134598014,
      "grad_norm": 0.07448670268058777,
      "learning_rate": 4.875823553661334e-05,
      "loss": 0.3609,
      "step": 35
    },
    {
      "epoch": 0.13008130081300814,
      "grad_norm": 0.09399361908435822,
      "learning_rate": 4.8665391882260856e-05,
      "loss": 0.3927,
      "step": 36
    },
    {
      "epoch": 0.13369467028003612,
      "grad_norm": 0.061091382056474686,
      "learning_rate": 4.856929631515964e-05,
      "loss": 0.4512,
      "step": 37
    },
    {
      "epoch": 0.13730803974706413,
      "grad_norm": 0.06277038156986237,
      "learning_rate": 4.846996204000967e-05,
      "loss": 0.3961,
      "step": 38
    },
    {
      "epoch": 0.14092140921409213,
      "grad_norm": 0.05277445912361145,
      "learning_rate": 4.8367402706548805e-05,
      "loss": 0.3885,
      "step": 39
    },
    {
      "epoch": 0.14453477868112014,
      "grad_norm": 0.06335710734128952,
      "learning_rate": 4.8261632407677174e-05,
      "loss": 0.4663,
      "step": 40
    },
    {
      "epoch": 0.14814814814814814,
      "grad_norm": 0.05149435997009277,
      "learning_rate": 4.815266567752059e-05,
      "loss": 0.4012,
      "step": 41
    },
    {
      "epoch": 0.15176151761517614,
      "grad_norm": 0.052154790610075,
      "learning_rate": 4.804051748943343e-05,
      "loss": 0.377,
      "step": 42
    },
    {
      "epoch": 0.15537488708220415,
      "grad_norm": 0.06229854002594948,
      "learning_rate": 4.792520325394111e-05,
      "loss": 0.4677,
      "step": 43
    },
    {
      "epoch": 0.15898825654923215,
      "grad_norm": 0.050992563366889954,
      "learning_rate": 4.780673881662242e-05,
      "loss": 0.4271,
      "step": 44
    },
    {
      "epoch": 0.16260162601626016,
      "grad_norm": 0.057579364627599716,
      "learning_rate": 4.7685140455932267e-05,
      "loss": 0.4096,
      "step": 45
    },
    {
      "epoch": 0.16621499548328816,
      "grad_norm": 0.05966678634285927,
      "learning_rate": 4.756042488096471e-05,
      "loss": 0.4075,
      "step": 46
    },
    {
      "epoch": 0.16982836495031617,
      "grad_norm": 0.055218473076820374,
      "learning_rate": 4.743260922915701e-05,
      "loss": 0.459,
      "step": 47
    },
    {
      "epoch": 0.17344173441734417,
      "grad_norm": 0.05127694830298424,
      "learning_rate": 4.730171106393466e-05,
      "loss": 0.4086,
      "step": 48
    },
    {
      "epoch": 0.17705510388437218,
      "grad_norm": 0.06519781798124313,
      "learning_rate": 4.716774837229804e-05,
      "loss": 0.4418,
      "step": 49
    },
    {
      "epoch": 0.18066847335140018,
      "grad_norm": 0.05895975977182388,
      "learning_rate": 4.7030739562350713e-05,
      "loss": 0.4013,
      "step": 50
    },
    {
      "epoch": 0.1842818428184282,
      "grad_norm": 0.061492159962654114,
      "learning_rate": 4.6890703460769955e-05,
      "loss": 0.3726,
      "step": 51
    },
    {
      "epoch": 0.1878952122854562,
      "grad_norm": 0.05051853135228157,
      "learning_rate": 4.674765931021976e-05,
      "loss": 0.4354,
      "step": 52
    },
    {
      "epoch": 0.1915085817524842,
      "grad_norm": 0.05664265528321266,
      "learning_rate": 4.6601626766706626e-05,
      "loss": 0.4137,
      "step": 53
    },
    {
      "epoch": 0.1951219512195122,
      "grad_norm": 0.06020362302660942,
      "learning_rate": 4.645262589687861e-05,
      "loss": 0.4171,
      "step": 54
    },
    {
      "epoch": 0.1987353206865402,
      "grad_norm": 0.06303560733795166,
      "learning_rate": 4.6300677175267914e-05,
      "loss": 0.3724,
      "step": 55
    },
    {
      "epoch": 0.2023486901535682,
      "grad_norm": 0.06793845444917679,
      "learning_rate": 4.614580148147744e-05,
      "loss": 0.3711,
      "step": 56
    },
    {
      "epoch": 0.20596205962059622,
      "grad_norm": 0.07107391953468323,
      "learning_rate": 4.598802009731167e-05,
      "loss": 0.4428,
      "step": 57
    },
    {
      "epoch": 0.20957542908762422,
      "grad_norm": 0.06567548215389252,
      "learning_rate": 4.582735470385229e-05,
      "loss": 0.3774,
      "step": 58
    },
    {
      "epoch": 0.21318879855465223,
      "grad_norm": 0.05056913569569588,
      "learning_rate": 4.5663827378478975e-05,
      "loss": 0.3584,
      "step": 59
    },
    {
      "epoch": 0.21680216802168023,
      "grad_norm": 0.08128344267606735,
      "learning_rate": 4.5497460591835615e-05,
      "loss": 0.3983,
      "step": 60
    },
    {
      "epoch": 0.2204155374887082,
      "grad_norm": 0.05856931954622269,
      "learning_rate": 4.532827720474268e-05,
      "loss": 0.3486,
      "step": 61
    },
    {
      "epoch": 0.2240289069557362,
      "grad_norm": 0.05503028631210327,
      "learning_rate": 4.515630046505575e-05,
      "loss": 0.3896,
      "step": 62
    },
    {
      "epoch": 0.22764227642276422,
      "grad_norm": 0.047534190118312836,
      "learning_rate": 4.498155400447107e-05,
      "loss": 0.4463,
      "step": 63
    },
    {
      "epoch": 0.23125564588979222,
      "grad_norm": 0.0638430267572403,
      "learning_rate": 4.480406183527823e-05,
      "loss": 0.3977,
      "step": 64
    },
    {
      "epoch": 0.23486901535682023,
      "grad_norm": 0.04974055290222168,
      "learning_rate": 4.462384834706058e-05,
      "loss": 0.3999,
      "step": 65
    },
    {
      "epoch": 0.23848238482384823,
      "grad_norm": 0.06309591233730316,
      "learning_rate": 4.4440938303343804e-05,
      "loss": 0.4275,
      "step": 66
    },
    {
      "epoch": 0.24209575429087624,
      "grad_norm": 0.05192544683814049,
      "learning_rate": 4.425535683819312e-05,
      "loss": 0.4096,
      "step": 67
    },
    {
      "epoch": 0.24570912375790424,
      "grad_norm": 0.057684604078531265,
      "learning_rate": 4.406712945275955e-05,
      "loss": 0.41,
      "step": 68
    },
    {
      "epoch": 0.24932249322493225,
      "grad_norm": 0.0514802448451519,
      "learning_rate": 4.387628201177577e-05,
      "loss": 0.3372,
      "step": 69
    },
    {
      "epoch": 0.2529358626919603,
      "grad_norm": 0.056559968739748,
      "learning_rate": 4.368284074000193e-05,
      "loss": 0.3929,
      "step": 70
    },
    {
      "epoch": 0.2565492321589883,
      "grad_norm": 0.0645717978477478,
      "learning_rate": 4.348683221862212e-05,
      "loss": 0.4353,
      "step": 71
    },
    {
      "epoch": 0.2601626016260163,
      "grad_norm": 0.08638172596693039,
      "learning_rate": 4.328828338159173e-05,
      "loss": 0.3978,
      "step": 72
    },
    {
      "epoch": 0.26377597109304424,
      "grad_norm": 0.05915065109729767,
      "learning_rate": 4.3087221511936434e-05,
      "loss": 0.393,
      "step": 73
    },
    {
      "epoch": 0.26738934056007224,
      "grad_norm": 0.061671093106269836,
      "learning_rate": 4.288367423800319e-05,
      "loss": 0.4187,
      "step": 74
    },
    {
      "epoch": 0.27100271002710025,
      "grad_norm": 0.07420554012060165,
      "learning_rate": 4.267766952966369e-05,
      "loss": 0.3939,
      "step": 75
    },
    {
      "epoch": 0.27461607949412825,
      "grad_norm": 0.07052630186080933,
      "learning_rate": 4.2469235694471043e-05,
      "loss": 0.3435,
      "step": 76
    },
    {
      "epoch": 0.27822944896115626,
      "grad_norm": 0.06885933130979538,
      "learning_rate": 4.225840137376993e-05,
      "loss": 0.4363,
      "step": 77
    },
    {
      "epoch": 0.28184281842818426,
      "grad_norm": 0.05735473707318306,
      "learning_rate": 4.204519553876095e-05,
      "loss": 0.3509,
      "step": 78
    },
    {
      "epoch": 0.28545618789521227,
      "grad_norm": 0.06102309376001358,
      "learning_rate": 4.1829647486519596e-05,
      "loss": 0.3369,
      "step": 79
    },
    {
      "epoch": 0.28906955736224027,
      "grad_norm": 0.06527422368526459,
      "learning_rate": 4.161178683597054e-05,
      "loss": 0.4052,
      "step": 80
    },
    {
      "epoch": 0.2926829268292683,
      "grad_norm": 0.06578138470649719,
      "learning_rate": 4.139164352381758e-05,
      "loss": 0.3586,
      "step": 81
    },
    {
      "epoch": 0.2962962962962963,
      "grad_norm": 0.05465536564588547,
      "learning_rate": 4.116924780042997e-05,
      "loss": 0.3759,
      "step": 82
    },
    {
      "epoch": 0.2999096657633243,
      "grad_norm": 0.08491545915603638,
      "learning_rate": 4.094463022568569e-05,
      "loss": 0.3611,
      "step": 83
    },
    {
      "epoch": 0.3035230352303523,
      "grad_norm": 0.06035340949892998,
      "learning_rate": 4.071782166477213e-05,
      "loss": 0.3537,
      "step": 84
    },
    {
      "epoch": 0.3071364046973803,
      "grad_norm": 0.06220124289393425,
      "learning_rate": 4.0488853283944806e-05,
      "loss": 0.3878,
      "step": 85
    },
    {
      "epoch": 0.3107497741644083,
      "grad_norm": 0.05434149503707886,
      "learning_rate": 4.0257756546244804e-05,
      "loss": 0.3765,
      "step": 86
    },
    {
      "epoch": 0.3143631436314363,
      "grad_norm": 0.06244641914963722,
      "learning_rate": 4.0024563207175316e-05,
      "loss": 0.3668,
      "step": 87
    },
    {
      "epoch": 0.3179765130984643,
      "grad_norm": 0.08008646965026855,
      "learning_rate": 3.978930531033807e-05,
      "loss": 0.3883,
      "step": 88
    },
    {
      "epoch": 0.3215898825654923,
      "grad_norm": 0.06990881264209747,
      "learning_rate": 3.9552015183030136e-05,
      "loss": 0.4611,
      "step": 89
    },
    {
      "epoch": 0.3252032520325203,
      "grad_norm": 0.05660560727119446,
      "learning_rate": 3.93127254318018e-05,
      "loss": 0.3865,
      "step": 90
    },
    {
      "epoch": 0.3288166214995483,
      "grad_norm": 0.05711934715509415,
      "learning_rate": 3.907146893797599e-05,
      "loss": 0.4223,
      "step": 91
    },
    {
      "epoch": 0.3324299909665763,
      "grad_norm": 0.06767363101243973,
      "learning_rate": 3.882827885312999e-05,
      "loss": 0.3481,
      "step": 92
    },
    {
      "epoch": 0.33604336043360433,
      "grad_norm": 0.05866090953350067,
      "learning_rate": 3.858318859454001e-05,
      "loss": 0.4195,
      "step": 93
    },
    {
      "epoch": 0.33965672990063234,
      "grad_norm": 0.05316139757633209,
      "learning_rate": 3.833623184058926e-05,
      "loss": 0.4042,
      "step": 94
    },
    {
      "epoch": 0.34327009936766034,
      "grad_norm": 0.06730002164840698,
      "learning_rate": 3.808744252614012e-05,
      "loss": 0.3717,
      "step": 95
    },
    {
      "epoch": 0.34688346883468835,
      "grad_norm": 0.07342930138111115,
      "learning_rate": 3.783685483787105e-05,
      "loss": 0.4075,
      "step": 96
    },
    {
      "epoch": 0.35049683830171635,
      "grad_norm": 0.07083098590373993,
      "learning_rate": 3.758450320957899e-05,
      "loss": 0.3864,
      "step": 97
    },
    {
      "epoch": 0.35411020776874436,
      "grad_norm": 0.07677371054887772,
      "learning_rate": 3.7330422317447685e-05,
      "loss": 0.393,
      "step": 98
    },
    {
      "epoch": 0.35772357723577236,
      "grad_norm": 0.0808129534125328,
      "learning_rate": 3.707464707528275e-05,
      "loss": 0.3801,
      "step": 99
    },
    {
      "epoch": 0.36133694670280037,
      "grad_norm": 0.06672363728284836,
      "learning_rate": 3.681721262971413e-05,
      "loss": 0.4472,
      "step": 100
    },
    {
      "epoch": 0.36495031616982837,
      "grad_norm": 0.05534950643777847,
      "learning_rate": 3.6558154355366506e-05,
      "loss": 0.3683,
      "step": 101
    },
    {
      "epoch": 0.3685636856368564,
      "grad_norm": 0.06686428934335709,
      "learning_rate": 3.6297507849998344e-05,
      "loss": 0.3455,
      "step": 102
    },
    {
      "epoch": 0.3721770551038844,
      "grad_norm": 0.07248938828706741,
      "learning_rate": 3.6035308929610446e-05,
      "loss": 0.4083,
      "step": 103
    },
    {
      "epoch": 0.3757904245709124,
      "grad_norm": 0.06316327303647995,
      "learning_rate": 3.5771593623524265e-05,
      "loss": 0.3661,
      "step": 104
    },
    {
      "epoch": 0.3794037940379404,
      "grad_norm": 0.08561142534017563,
      "learning_rate": 3.550639816943111e-05,
      "loss": 0.3693,
      "step": 105
    },
    {
      "epoch": 0.3830171635049684,
      "grad_norm": 0.05884739011526108,
      "learning_rate": 3.5239759008412666e-05,
      "loss": 0.4326,
      "step": 106
    },
    {
      "epoch": 0.3866305329719964,
      "grad_norm": 0.06861259788274765,
      "learning_rate": 3.497171277993346e-05,
      "loss": 0.3423,
      "step": 107
    },
    {
      "epoch": 0.3902439024390244,
      "grad_norm": 0.06908590346574783,
      "learning_rate": 3.4702296316806244e-05,
      "loss": 0.4494,
      "step": 108
    },
    {
      "epoch": 0.3938572719060524,
      "grad_norm": 0.07454199343919754,
      "learning_rate": 3.443154664013067e-05,
      "loss": 0.4488,
      "step": 109
    },
    {
      "epoch": 0.3974706413730804,
      "grad_norm": 0.07938794046640396,
      "learning_rate": 3.415950095420616e-05,
      "loss": 0.3938,
      "step": 110
    },
    {
      "epoch": 0.4010840108401084,
      "grad_norm": 0.08505871146917343,
      "learning_rate": 3.3886196641419545e-05,
      "loss": 0.4004,
      "step": 111
    },
    {
      "epoch": 0.4046973803071364,
      "grad_norm": 0.0625777617096901,
      "learning_rate": 3.361167125710832e-05,
      "loss": 0.3863,
      "step": 112
    },
    {
      "epoch": 0.4083107497741644,
      "grad_norm": 0.07772816717624664,
      "learning_rate": 3.333596252440008e-05,
      "loss": 0.3981,
      "step": 113
    },
    {
      "epoch": 0.41192411924119243,
      "grad_norm": 0.06656523048877716,
      "learning_rate": 3.305910832902884e-05,
      "loss": 0.3705,
      "step": 114
    },
    {
      "epoch": 0.41553748870822044,
      "grad_norm": 0.07238256186246872,
      "learning_rate": 3.278114671412917e-05,
      "loss": 0.412,
      "step": 115
    },
    {
      "epoch": 0.41915085817524844,
      "grad_norm": 0.06601731479167938,
      "learning_rate": 3.2502115875008524e-05,
      "loss": 0.3716,
      "step": 116
    },
    {
      "epoch": 0.42276422764227645,
      "grad_norm": 0.0684824138879776,
      "learning_rate": 3.222205415389877e-05,
      "loss": 0.4183,
      "step": 117
    },
    {
      "epoch": 0.42637759710930445,
      "grad_norm": 0.0698830783367157,
      "learning_rate": 3.1941000034687515e-05,
      "loss": 0.3517,
      "step": 118
    },
    {
      "epoch": 0.42999096657633246,
      "grad_norm": 0.05978047475218773,
      "learning_rate": 3.165899213762995e-05,
      "loss": 0.3852,
      "step": 119
    },
    {
      "epoch": 0.43360433604336046,
      "grad_norm": 0.07572682201862335,
      "learning_rate": 3.1376069214041913e-05,
      "loss": 0.4022,
      "step": 120
    },
    {
      "epoch": 0.4372177055103884,
      "grad_norm": 0.07104960829019547,
      "learning_rate": 3.109227014097505e-05,
      "loss": 0.4185,
      "step": 121
    },
    {
      "epoch": 0.4408310749774164,
      "grad_norm": 0.06828156113624573,
      "learning_rate": 3.0807633915874584e-05,
      "loss": 0.4239,
      "step": 122
    },
    {
      "epoch": 0.4444444444444444,
      "grad_norm": 0.057690802961587906,
      "learning_rate": 3.052219965122062e-05,
      "loss": 0.4109,
      "step": 123
    },
    {
      "epoch": 0.4480578139114724,
      "grad_norm": 0.06580954045057297,
      "learning_rate": 3.0236006569153617e-05,
      "loss": 0.359,
      "step": 124
    },
    {
      "epoch": 0.45167118337850043,
      "grad_norm": 0.060349613428115845,
      "learning_rate": 2.9949093996084747e-05,
      "loss": 0.3775,
      "step": 125
    },
    {
      "epoch": 0.45528455284552843,
      "grad_norm": 0.07335729151964188,
      "learning_rate": 2.9661501357292033e-05,
      "loss": 0.4043,
      "step": 126
    },
    {
      "epoch": 0.45889792231255644,
      "grad_norm": 0.04954389110207558,
      "learning_rate": 2.9373268171502777e-05,
      "loss": 0.3537,
      "step": 127
    },
    {
      "epoch": 0.46251129177958444,
      "grad_norm": 0.07528957724571228,
      "learning_rate": 2.9084434045463255e-05,
      "loss": 0.467,
      "step": 128
    },
    {
      "epoch": 0.46612466124661245,
      "grad_norm": 0.06106121093034744,
      "learning_rate": 2.8795038668496222e-05,
      "loss": 0.4323,
      "step": 129
    },
    {
      "epoch": 0.46973803071364045,
      "grad_norm": 0.08181653916835785,
      "learning_rate": 2.850512180704715e-05,
      "loss": 0.4208,
      "step": 130
    },
    {
      "epoch": 0.47335140018066846,
      "grad_norm": 0.07354505360126495,
      "learning_rate": 2.821472329921981e-05,
      "loss": 0.3909,
      "step": 131
    },
    {
      "epoch": 0.47696476964769646,
      "grad_norm": 0.09099866449832916,
      "learning_rate": 2.792388304930207e-05,
      "loss": 0.4296,
      "step": 132
    },
    {
      "epoch": 0.48057813911472447,
      "grad_norm": 0.08062151074409485,
      "learning_rate": 2.7632641022282502e-05,
      "loss": 0.4106,
      "step": 133
    },
    {
      "epoch": 0.48419150858175247,
      "grad_norm": 0.09198120981454849,
      "learning_rate": 2.7341037238358774e-05,
      "loss": 0.4064,
      "step": 134
    },
    {
      "epoch": 0.4878048780487805,
      "grad_norm": 0.05343058705329895,
      "learning_rate": 2.704911176743833e-05,
      "loss": 0.404,
      "step": 135
    },
    {
      "epoch": 0.4914182475158085,
      "grad_norm": 0.0657978504896164,
      "learning_rate": 2.6756904723632324e-05,
      "loss": 0.3993,
      "step": 136
    },
    {
      "epoch": 0.4950316169828365,
      "grad_norm": 0.057678401470184326,
      "learning_rate": 2.646445625974347e-05,
      "loss": 0.3804,
      "step": 137
    },
    {
      "epoch": 0.4986449864498645,
      "grad_norm": 0.06898088753223419,
      "learning_rate": 2.6171806561748502e-05,
      "loss": 0.4452,
      "step": 138
    },
    {
      "epoch": 0.5022583559168925,
      "grad_norm": 0.09333262592554092,
      "learning_rate": 2.5878995843276204e-05,
      "loss": 0.3304,
      "step": 139
    },
    {
      "epoch": 0.5058717253839206,
      "grad_norm": 0.06717183440923691,
      "learning_rate": 2.5586064340081516e-05,
      "loss": 0.326,
      "step": 140
    },
    {
      "epoch": 0.5094850948509485,
      "grad_norm": 0.06729979068040848,
      "learning_rate": 2.529305230451666e-05,
      "loss": 0.3934,
      "step": 141
    },
    {
      "epoch": 0.5130984643179766,
      "grad_norm": 0.09550358355045319,
      "learning_rate": 2.5e-05,
      "loss": 0.4733,
      "step": 142
    },
    {
      "epoch": 0.5167118337850045,
      "grad_norm": 0.07080523669719696,
      "learning_rate": 2.4706947695483348e-05,
      "loss": 0.4039,
      "step": 143
    },
    {
      "epoch": 0.5203252032520326,
      "grad_norm": 0.055423106998205185,
      "learning_rate": 2.441393565991849e-05,
      "loss": 0.3275,
      "step": 144
    },
    {
      "epoch": 0.5239385727190605,
      "grad_norm": 0.06483904272317886,
      "learning_rate": 2.4121004156723802e-05,
      "loss": 0.4377,
      "step": 145
    },
    {
      "epoch": 0.5275519421860885,
      "grad_norm": 0.06614437699317932,
      "learning_rate": 2.3828193438251497e-05,
      "loss": 0.3935,
      "step": 146
    },
    {
      "epoch": 0.5311653116531165,
      "grad_norm": 0.08745498955249786,
      "learning_rate": 2.3535543740256536e-05,
      "loss": 0.4348,
      "step": 147
    },
    {
      "epoch": 0.5347786811201445,
      "grad_norm": 0.07158234715461731,
      "learning_rate": 2.3243095276367685e-05,
      "loss": 0.3286,
      "step": 148
    },
    {
      "epoch": 0.5383920505871725,
      "grad_norm": 0.06448652595281601,
      "learning_rate": 2.2950888232561672e-05,
      "loss": 0.4108,
      "step": 149
    },
    {
      "epoch": 0.5420054200542005,
      "grad_norm": 0.07621192187070847,
      "learning_rate": 2.2658962761641232e-05,
      "loss": 0.4317,
      "step": 150
    },
    {
      "epoch": 0.5456187895212286,
      "grad_norm": 0.07459475100040436,
      "learning_rate": 2.23673589777175e-05,
      "loss": 0.3876,
      "step": 151
    },
    {
      "epoch": 0.5492321589882565,
      "grad_norm": 0.07355853170156479,
      "learning_rate": 2.207611695069794e-05,
      "loss": 0.3506,
      "step": 152
    },
    {
      "epoch": 0.5528455284552846,
      "grad_norm": 0.07565652579069138,
      "learning_rate": 2.17852767007802e-05,
      "loss": 0.4221,
      "step": 153
    },
    {
      "epoch": 0.5564588979223125,
      "grad_norm": 0.07433846592903137,
      "learning_rate": 2.1494878192952855e-05,
      "loss": 0.3913,
      "step": 154
    },
    {
      "epoch": 0.5600722673893406,
      "grad_norm": 0.07123446464538574,
      "learning_rate": 2.1204961331503787e-05,
      "loss": 0.4106,
      "step": 155
    },
    {
      "epoch": 0.5636856368563685,
      "grad_norm": 0.0848294198513031,
      "learning_rate": 2.0915565954536744e-05,
      "loss": 0.3171,
      "step": 156
    },
    {
      "epoch": 0.5672990063233966,
      "grad_norm": 0.06394634395837784,
      "learning_rate": 2.0626731828497225e-05,
      "loss": 0.4106,
      "step": 157
    },
    {
      "epoch": 0.5709123757904245,
      "grad_norm": 0.06601906567811966,
      "learning_rate": 2.0338498642707977e-05,
      "loss": 0.3651,
      "step": 158
    },
    {
      "epoch": 0.5745257452574526,
      "grad_norm": 0.0734376311302185,
      "learning_rate": 2.005090600391526e-05,
      "loss": 0.3906,
      "step": 159
    },
    {
      "epoch": 0.5781391147244805,
      "grad_norm": 0.07122786343097687,
      "learning_rate": 1.9763993430846395e-05,
      "loss": 0.4157,
      "step": 160
    },
    {
      "epoch": 0.5817524841915086,
      "grad_norm": 0.06590158492326736,
      "learning_rate": 1.947780034877938e-05,
      "loss": 0.4267,
      "step": 161
    },
    {
      "epoch": 0.5853658536585366,
      "grad_norm": 0.07380690425634384,
      "learning_rate": 1.9192366084125425e-05,
      "loss": 0.3748,
      "step": 162
    },
    {
      "epoch": 0.5889792231255646,
      "grad_norm": 0.054361093789339066,
      "learning_rate": 1.890772985902496e-05,
      "loss": 0.3637,
      "step": 163
    },
    {
      "epoch": 0.5925925925925926,
      "grad_norm": 0.06896340101957321,
      "learning_rate": 1.8623930785958092e-05,
      "loss": 0.4319,
      "step": 164
    },
    {
      "epoch": 0.5962059620596206,
      "grad_norm": 0.08140537887811661,
      "learning_rate": 1.8341007862370056e-05,
      "loss": 0.3942,
      "step": 165
    },
    {
      "epoch": 0.5998193315266486,
      "grad_norm": 0.07021729648113251,
      "learning_rate": 1.8058999965312484e-05,
      "loss": 0.3917,
      "step": 166
    },
    {
      "epoch": 0.6034327009936766,
      "grad_norm": 0.06319273263216019,
      "learning_rate": 1.777794584610124e-05,
      "loss": 0.3833,
      "step": 167
    },
    {
      "epoch": 0.6070460704607046,
      "grad_norm": 0.07088933885097504,
      "learning_rate": 1.749788412499149e-05,
      "loss": 0.3326,
      "step": 168
    },
    {
      "epoch": 0.6106594399277326,
      "grad_norm": 0.06848324090242386,
      "learning_rate": 1.721885328587083e-05,
      "loss": 0.5018,
      "step": 169
    },
    {
      "epoch": 0.6142728093947606,
      "grad_norm": 0.07163573056459427,
      "learning_rate": 1.694089167097116e-05,
      "loss": 0.3624,
      "step": 170
    },
    {
      "epoch": 0.6178861788617886,
      "grad_norm": 0.06683260202407837,
      "learning_rate": 1.6664037475599923e-05,
      "loss": 0.4198,
      "step": 171
    },
    {
      "epoch": 0.6214995483288166,
      "grad_norm": 0.06273495405912399,
      "learning_rate": 1.638832874289168e-05,
      "loss": 0.3388,
      "step": 172
    },
    {
      "epoch": 0.6251129177958447,
      "grad_norm": 0.06024303659796715,
      "learning_rate": 1.611380335858047e-05,
      "loss": 0.4156,
      "step": 173
    },
    {
      "epoch": 0.6287262872628726,
      "grad_norm": 0.08732262253761292,
      "learning_rate": 1.5840499045793843e-05,
      "loss": 0.3883,
      "step": 174
    },
    {
      "epoch": 0.6323396567299007,
      "grad_norm": 0.06800790876150131,
      "learning_rate": 1.5568453359869334e-05,
      "loss": 0.3636,
      "step": 175
    },
    {
      "epoch": 0.6359530261969286,
      "grad_norm": 0.08514184504747391,
      "learning_rate": 1.5297703683193752e-05,
      "loss": 0.3664,
      "step": 176
    },
    {
      "epoch": 0.6395663956639567,
      "grad_norm": 0.0805889442563057,
      "learning_rate": 1.502828722006655e-05,
      "loss": 0.3912,
      "step": 177
    },
    {
      "epoch": 0.6431797651309846,
      "grad_norm": 0.07321416586637497,
      "learning_rate": 1.4760240991587337e-05,
      "loss": 0.4077,
      "step": 178
    },
    {
      "epoch": 0.6467931345980127,
      "grad_norm": 0.06993624567985535,
      "learning_rate": 1.4493601830568887e-05,
      "loss": 0.3728,
      "step": 179
    },
    {
      "epoch": 0.6504065040650406,
      "grad_norm": 0.07736963033676147,
      "learning_rate": 1.4228406376475742e-05,
      "loss": 0.3644,
      "step": 180
    },
    {
      "epoch": 0.6540198735320687,
      "grad_norm": 0.06840698421001434,
      "learning_rate": 1.396469107038956e-05,
      "loss": 0.3936,
      "step": 181
    },
    {
      "epoch": 0.6576332429990966,
      "grad_norm": 0.07498890906572342,
      "learning_rate": 1.3702492150001659e-05,
      "loss": 0.3948,
      "step": 182
    },
    {
      "epoch": 0.6612466124661247,
      "grad_norm": 0.06307978183031082,
      "learning_rate": 1.34418456446335e-05,
      "loss": 0.398,
      "step": 183
    },
    {
      "epoch": 0.6648599819331527,
      "grad_norm": 0.0843866616487503,
      "learning_rate": 1.3182787370285865e-05,
      "loss": 0.3891,
      "step": 184
    },
    {
      "epoch": 0.6684733514001807,
      "grad_norm": 0.07880077511072159,
      "learning_rate": 1.292535292471726e-05,
      "loss": 0.3812,
      "step": 185
    },
    {
      "epoch": 0.6720867208672087,
      "grad_norm": 0.06986968219280243,
      "learning_rate": 1.2669577682552319e-05,
      "loss": 0.3851,
      "step": 186
    },
    {
      "epoch": 0.6757000903342367,
      "grad_norm": 0.07602784037590027,
      "learning_rate": 1.2415496790421011e-05,
      "loss": 0.3956,
      "step": 187
    },
    {
      "epoch": 0.6793134598012647,
      "grad_norm": 0.06611546874046326,
      "learning_rate": 1.2163145162128947e-05,
      "loss": 0.3629,
      "step": 188
    },
    {
      "epoch": 0.6829268292682927,
      "grad_norm": 0.07958898693323135,
      "learning_rate": 1.1912557473859895e-05,
      "loss": 0.3647,
      "step": 189
    },
    {
      "epoch": 0.6865401987353207,
      "grad_norm": 0.06264237314462662,
      "learning_rate": 1.1663768159410748e-05,
      "loss": 0.3797,
      "step": 190
    },
    {
      "epoch": 0.6901535682023487,
      "grad_norm": 0.08303744345903397,
      "learning_rate": 1.1416811405459993e-05,
      "loss": 0.3754,
      "step": 191
    },
    {
      "epoch": 0.6937669376693767,
      "grad_norm": 0.07206673175096512,
      "learning_rate": 1.1171721146870015e-05,
      "loss": 0.327,
      "step": 192
    },
    {
      "epoch": 0.6973803071364046,
      "grad_norm": 0.06349314749240875,
      "learning_rate": 1.0928531062024017e-05,
      "loss": 0.3902,
      "step": 193
    },
    {
      "epoch": 0.7009936766034327,
      "grad_norm": 0.07241489738225937,
      "learning_rate": 1.0687274568198208e-05,
      "loss": 0.3845,
      "step": 194
    },
    {
      "epoch": 0.7046070460704607,
      "grad_norm": 0.06357239931821823,
      "learning_rate": 1.0447984816969874e-05,
      "loss": 0.3881,
      "step": 195
    },
    {
      "epoch": 0.7082204155374887,
      "grad_norm": 0.06316613405942917,
      "learning_rate": 1.021069468966194e-05,
      "loss": 0.4735,
      "step": 196
    },
    {
      "epoch": 0.7118337850045167,
      "grad_norm": 0.08076903223991394,
      "learning_rate": 9.975436792824691e-06,
      "loss": 0.43,
      "step": 197
    },
    {
      "epoch": 0.7154471544715447,
      "grad_norm": 0.0836021676659584,
      "learning_rate": 9.742243453755202e-06,
      "loss": 0.3818,
      "step": 198
    },
    {
      "epoch": 0.7190605239385727,
      "grad_norm": 0.0713673084974289,
      "learning_rate": 9.5111467160552e-06,
      "loss": 0.3846,
      "step": 199
    },
    {
      "epoch": 0.7226738934056007,
      "grad_norm": 0.08711904287338257,
      "learning_rate": 9.282178335227884e-06,
      "loss": 0.4817,
      "step": 200
    },
    {
      "epoch": 0.7262872628726287,
      "grad_norm": 0.05264454334974289,
      "learning_rate": 9.05536977431431e-06,
      "loss": 0.3995,
      "step": 201
    },
    {
      "epoch": 0.7299006323396567,
      "grad_norm": 0.07466941326856613,
      "learning_rate": 8.830752199570033e-06,
      "loss": 0.3718,
      "step": 202
    },
    {
      "epoch": 0.7335140018066847,
      "grad_norm": 0.07776648551225662,
      "learning_rate": 8.608356476182424e-06,
      "loss": 0.4786,
      "step": 203
    },
    {
      "epoch": 0.7371273712737128,
      "grad_norm": 0.06611160188913345,
      "learning_rate": 8.38821316402946e-06,
      "loss": 0.3668,
      "step": 204
    },
    {
      "epoch": 0.7407407407407407,
      "grad_norm": 0.07174837589263916,
      "learning_rate": 8.170352513480408e-06,
      "loss": 0.4016,
      "step": 205
    },
    {
      "epoch": 0.7443541102077688,
      "grad_norm": 0.0830477848649025,
      "learning_rate": 7.954804461239053e-06,
      "loss": 0.4162,
      "step": 206
    },
    {
      "epoch": 0.7479674796747967,
      "grad_norm": 0.08300362527370453,
      "learning_rate": 7.741598626230079e-06,
      "loss": 0.3738,
      "step": 207
    },
    {
      "epoch": 0.7515808491418248,
      "grad_norm": 0.07526036351919174,
      "learning_rate": 7.530764305528959e-06,
      "loss": 0.3576,
      "step": 208
    },
    {
      "epoch": 0.7551942186088527,
      "grad_norm": 0.06786955147981644,
      "learning_rate": 7.3223304703363135e-06,
      "loss": 0.4152,
      "step": 209
    },
    {
      "epoch": 0.7588075880758808,
      "grad_norm": 0.08544765412807465,
      "learning_rate": 7.116325761996817e-06,
      "loss": 0.3735,
      "step": 210
    },
    {
      "epoch": 0.7624209575429087,
      "grad_norm": 0.06077965721487999,
      "learning_rate": 6.91277848806356e-06,
      "loss": 0.3486,
      "step": 211
    },
    {
      "epoch": 0.7660343270099368,
      "grad_norm": 0.07332652807235718,
      "learning_rate": 6.711716618408281e-06,
      "loss": 0.3734,
      "step": 212
    },
    {
      "epoch": 0.7696476964769647,
      "grad_norm": 0.07848729193210602,
      "learning_rate": 6.513167781377885e-06,
      "loss": 0.4231,
      "step": 213
    },
    {
      "epoch": 0.7732610659439928,
      "grad_norm": 0.07897993177175522,
      "learning_rate": 6.317159259998073e-06,
      "loss": 0.3513,
      "step": 214
    },
    {
      "epoch": 0.7768744354110207,
      "grad_norm": 0.07235241681337357,
      "learning_rate": 6.123717988224237e-06,
      "loss": 0.4069,
      "step": 215
    },
    {
      "epoch": 0.7804878048780488,
      "grad_norm": 0.09085345268249512,
      "learning_rate": 5.932870547240454e-06,
      "loss": 0.3849,
      "step": 216
    },
    {
      "epoch": 0.7841011743450768,
      "grad_norm": 0.07704368233680725,
      "learning_rate": 5.74464316180689e-06,
      "loss": 0.4261,
      "step": 217
    },
    {
      "epoch": 0.7877145438121048,
      "grad_norm": 0.057720448821783066,
      "learning_rate": 5.559061696656198e-06,
      "loss": 0.3711,
      "step": 218
    },
    {
      "epoch": 0.7913279132791328,
      "grad_norm": 0.06448069959878922,
      "learning_rate": 5.37615165293942e-06,
      "loss": 0.4027,
      "step": 219
    },
    {
      "epoch": 0.7949412827461608,
      "grad_norm": 0.08539154380559921,
      "learning_rate": 5.1959381647217666e-06,
      "loss": 0.388,
      "step": 220
    },
    {
      "epoch": 0.7985546522131888,
      "grad_norm": 0.07000590115785599,
      "learning_rate": 5.018445995528931e-06,
      "loss": 0.4122,
      "step": 221
    },
    {
      "epoch": 0.8021680216802168,
      "grad_norm": 0.07643178850412369,
      "learning_rate": 4.843699534944257e-06,
      "loss": 0.3749,
      "step": 222
    },
    {
      "epoch": 0.8057813911472448,
      "grad_norm": 0.06629081815481186,
      "learning_rate": 4.671722795257327e-06,
      "loss": 0.3817,
      "step": 223
    },
    {
      "epoch": 0.8093947606142728,
      "grad_norm": 0.06171542406082153,
      "learning_rate": 4.502539408164386e-06,
      "loss": 0.3474,
      "step": 224
    },
    {
      "epoch": 0.8130081300813008,
      "grad_norm": 0.06734922528266907,
      "learning_rate": 4.336172621521034e-06,
      "loss": 0.3328,
      "step": 225
    },
    {
      "epoch": 0.8166214995483289,
      "grad_norm": 0.09524697810411453,
      "learning_rate": 4.1726452961477146e-06,
      "loss": 0.3433,
      "step": 226
    },
    {
      "epoch": 0.8202348690153568,
      "grad_norm": 0.06357850879430771,
      "learning_rate": 4.01197990268834e-06,
      "loss": 0.3992,
      "step": 227
    },
    {
      "epoch": 0.8238482384823849,
      "grad_norm": 0.07560393214225769,
      "learning_rate": 3.8541985185225645e-06,
      "loss": 0.3575,
      "step": 228
    },
    {
      "epoch": 0.8274616079494128,
      "grad_norm": 0.06906560808420181,
      "learning_rate": 3.6993228247320877e-06,
      "loss": 0.3287,
      "step": 229
    },
    {
      "epoch": 0.8310749774164409,
      "grad_norm": 0.08411566913127899,
      "learning_rate": 3.547374103121398e-06,
      "loss": 0.4115,
      "step": 230
    },
    {
      "epoch": 0.8346883468834688,
      "grad_norm": 0.08515972644090652,
      "learning_rate": 3.398373233293378e-06,
      "loss": 0.3709,
      "step": 231
    },
    {
      "epoch": 0.8383017163504969,
      "grad_norm": 0.06780155003070831,
      "learning_rate": 3.252340689780245e-06,
      "loss": 0.3599,
      "step": 232
    },
    {
      "epoch": 0.8419150858175248,
      "grad_norm": 0.08019706606864929,
      "learning_rate": 3.1092965392300417e-06,
      "loss": 0.3869,
      "step": 233
    },
    {
      "epoch": 0.8455284552845529,
      "grad_norm": 0.0702086016535759,
      "learning_rate": 2.969260437649293e-06,
      "loss": 0.3846,
      "step": 234
    },
    {
      "epoch": 0.8491418247515808,
      "grad_norm": 0.0851154550909996,
      "learning_rate": 2.8322516277019624e-06,
      "loss": 0.3434,
      "step": 235
    },
    {
      "epoch": 0.8527551942186089,
      "grad_norm": 0.06722518056631088,
      "learning_rate": 2.6982889360653377e-06,
      "loss": 0.3349,
      "step": 236
    },
    {
      "epoch": 0.8563685636856369,
      "grad_norm": 0.06803542375564575,
      "learning_rate": 2.5673907708429976e-06,
      "loss": 0.3526,
      "step": 237
    },
    {
      "epoch": 0.8599819331526649,
      "grad_norm": 0.08029063045978546,
      "learning_rate": 2.4395751190352924e-06,
      "loss": 0.4286,
      "step": 238
    },
    {
      "epoch": 0.8635953026196929,
      "grad_norm": 0.08042778819799423,
      "learning_rate": 2.3148595440677405e-06,
      "loss": 0.3739,
      "step": 239
    },
    {
      "epoch": 0.8672086720867209,
      "grad_norm": 0.07175204902887344,
      "learning_rate": 2.1932611833775846e-06,
      "loss": 0.4156,
      "step": 240
    },
    {
      "epoch": 0.8708220415537489,
      "grad_norm": 0.058878783136606216,
      "learning_rate": 2.074796746058896e-06,
      "loss": 0.3636,
      "step": 241
    },
    {
      "epoch": 0.8744354110207768,
      "grad_norm": 0.08569607883691788,
      "learning_rate": 1.9594825105665654e-06,
      "loss": 0.3889,
      "step": 242
    },
    {
      "epoch": 0.8780487804878049,
      "grad_norm": 0.07353324443101883,
      "learning_rate": 1.847334322479413e-06,
      "loss": 0.4352,
      "step": 243
    },
    {
      "epoch": 0.8816621499548328,
      "grad_norm": 0.07135035842657089,
      "learning_rate": 1.738367592322837e-06,
      "loss": 0.4265,
      "step": 244
    },
    {
      "epoch": 0.8852755194218609,
      "grad_norm": 0.06918162852525711,
      "learning_rate": 1.6325972934512018e-06,
      "loss": 0.4295,
      "step": 245
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 0.07300789654254913,
      "learning_rate": 1.5300379599903409e-06,
      "loss": 0.4226,
      "step": 246
    },
    {
      "epoch": 0.8925022583559169,
      "grad_norm": 0.06973148882389069,
      "learning_rate": 1.4307036848403648e-06,
      "loss": 0.3368,
      "step": 247
    },
    {
      "epoch": 0.8961156278229448,
      "grad_norm": 0.07200148701667786,
      "learning_rate": 1.3346081177391472e-06,
      "loss": 0.3924,
      "step": 248
    },
    {
      "epoch": 0.8997289972899729,
      "grad_norm": 0.07833510637283325,
      "learning_rate": 1.2417644633866632e-06,
      "loss": 0.3274,
      "step": 249
    },
    {
      "epoch": 0.9033423667570009,
      "grad_norm": 0.061651114374399185,
      "learning_rate": 1.1521854796305242e-06,
      "loss": 0.3705,
      "step": 250
    },
    {
      "epoch": 0.9069557362240289,
      "grad_norm": 0.07440148293972015,
      "learning_rate": 1.0658834757128838e-06,
      "loss": 0.3715,
      "step": 251
    },
    {
      "epoch": 0.9105691056910569,
      "grad_norm": 0.0720466673374176,
      "learning_rate": 9.828703105789983e-07,
      "loss": 0.3361,
      "step": 252
    },
    {
      "epoch": 0.9141824751580849,
      "grad_norm": 0.08179104328155518,
      "learning_rate": 9.031573912476554e-07,
      "loss": 0.3393,
      "step": 253
    },
    {
      "epoch": 0.9177958446251129,
      "grad_norm": 0.058865226805210114,
      "learning_rate": 8.267556712437341e-07,
      "loss": 0.4249,
      "step": 254
    },
    {
      "epoch": 0.9214092140921409,
      "grad_norm": 0.07929901778697968,
      "learning_rate": 7.536756490930358e-07,
      "loss": 0.4341,
      "step": 255
    },
    {
      "epoch": 0.9250225835591689,
      "grad_norm": 0.07914505153894424,
      "learning_rate": 6.839273668796747e-07,
      "loss": 0.3942,
      "step": 256
    },
    {
      "epoch": 0.928635953026197,
      "grad_norm": 0.08146975934505463,
      "learning_rate": 6.175204088661485e-07,
      "loss": 0.3562,
      "step": 257
    },
    {
      "epoch": 0.9322493224932249,
      "grad_norm": 0.08726157248020172,
      "learning_rate": 5.544639001763718e-07,
      "loss": 0.4314,
      "step": 258
    },
    {
      "epoch": 0.935862691960253,
      "grad_norm": 0.09031800180673599,
      "learning_rate": 4.947665055417605e-07,
      "loss": 0.3842,
      "step": 259
    },
    {
      "epoch": 0.9394760614272809,
      "grad_norm": 0.0922897681593895,
      "learning_rate": 4.3843642811059737e-07,
      "loss": 0.3285,
      "step": 260
    },
    {
      "epoch": 0.943089430894309,
      "grad_norm": 0.07188927382230759,
      "learning_rate": 3.854814083208064e-07,
      "loss": 0.3839,
      "step": 261
    },
    {
      "epoch": 0.9467028003613369,
      "grad_norm": 0.08181816339492798,
      "learning_rate": 3.3590872283633944e-07,
      "loss": 0.3651,
      "step": 262
    },
    {
      "epoch": 0.950316169828365,
      "grad_norm": 0.0699373111128807,
      "learning_rate": 2.8972518354725977e-07,
      "loss": 0.457,
      "step": 263
    },
    {
      "epoch": 0.9539295392953929,
      "grad_norm": 0.08292391151189804,
      "learning_rate": 2.4693713663372644e-07,
      "loss": 0.4105,
      "step": 264
    },
    {
      "epoch": 0.957542908762421,
      "grad_norm": 0.07387669384479523,
      "learning_rate": 2.0755046169392e-07,
      "loss": 0.3846,
      "step": 265
    },
    {
      "epoch": 0.9611562782294489,
      "grad_norm": 0.08278100937604904,
      "learning_rate": 1.7157057093614703e-07,
      "loss": 0.4334,
      "step": 266
    },
    {
      "epoch": 0.964769647696477,
      "grad_norm": 0.06216645613312721,
      "learning_rate": 1.3900240843510993e-07,
      "loss": 0.4007,
      "step": 267
    },
    {
      "epoch": 0.9683830171635049,
      "grad_norm": 0.07292906939983368,
      "learning_rate": 1.0985044945254764e-07,
      "loss": 0.4152,
      "step": 268
    },
    {
      "epoch": 0.971996386630533,
      "grad_norm": 0.07897216826677322,
      "learning_rate": 8.411869982228038e-08,
      "loss": 0.3954,
      "step": 269
    },
    {
      "epoch": 0.975609756097561,
      "grad_norm": 0.0776594951748848,
      "learning_rate": 6.181069539974716e-08,
      "loss": 0.3449,
      "step": 270
    },
    {
      "epoch": 0.979223125564589,
      "grad_norm": 0.07104814052581787,
      "learning_rate": 4.292950157614717e-08,
      "loss": 0.3476,
      "step": 271
    },
    {
      "epoch": 0.982836495031617,
      "grad_norm": 0.07420724630355835,
      "learning_rate": 2.7477712857215677e-08,
      "loss": 0.4095,
      "step": 272
    },
    {
      "epoch": 0.986449864498645,
      "grad_norm": 0.06806948781013489,
      "learning_rate": 1.5457452506698056e-08,
      "loss": 0.3879,
      "step": 273
    },
    {
      "epoch": 0.990063233965673,
      "grad_norm": 0.08909036219120026,
      "learning_rate": 6.870372254602631e-09,
      "loss": 0.3327,
      "step": 274
    },
    {
      "epoch": 0.993676603432701,
      "grad_norm": 0.07509468495845795,
      "learning_rate": 1.7176520702238964e-09,
      "loss": 0.4033,
      "step": 275
    },
    {
      "epoch": 0.997289972899729,
      "grad_norm": 0.06269805878400803,
      "learning_rate": 0.0,
      "loss": 0.4076,
      "step": 276
    },
    {
      "epoch": 0.997289972899729,
      "eval_loss": 0.35787180066108704,
      "eval_runtime": 515.6409,
      "eval_samples_per_second": 1.422,
      "eval_steps_per_second": 0.357,
      "step": 276
    }
  ],
  "logging_steps": 1,
  "max_steps": 276,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.247726843172225e+18,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}