{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0526315789473686, "eval_steps": 500, "global_step": 174, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.017543859649122806, "grad_norm": 0.2277653039057954, "learning_rate": 1.6666666666666667e-06, "loss": 1.0079, "step": 1 }, { "epoch": 0.03508771929824561, "grad_norm": 0.22929541131469036, "learning_rate": 3.3333333333333333e-06, "loss": 1.0155, "step": 2 }, { "epoch": 0.05263157894736842, "grad_norm": 0.23320532182877252, "learning_rate": 5e-06, "loss": 1.042, "step": 3 }, { "epoch": 0.07017543859649122, "grad_norm": 0.2327235097386226, "learning_rate": 6.666666666666667e-06, "loss": 1.0147, "step": 4 }, { "epoch": 0.08771929824561403, "grad_norm": 0.22899683750189437, "learning_rate": 8.333333333333334e-06, "loss": 1.0155, "step": 5 }, { "epoch": 0.10526315789473684, "grad_norm": 0.2346984759899663, "learning_rate": 1e-05, "loss": 1.0471, "step": 6 }, { "epoch": 0.12280701754385964, "grad_norm": 0.22126116958454167, "learning_rate": 1.1666666666666668e-05, "loss": 1.0295, "step": 7 }, { "epoch": 0.14035087719298245, "grad_norm": 0.21457628974392648, "learning_rate": 1.3333333333333333e-05, "loss": 1.0319, "step": 8 }, { "epoch": 0.15789473684210525, "grad_norm": 0.21253345072366, "learning_rate": 1.5e-05, "loss": 1.0166, "step": 9 }, { "epoch": 0.17543859649122806, "grad_norm": 0.21642169088434604, "learning_rate": 1.6666666666666667e-05, "loss": 1.0331, "step": 10 }, { "epoch": 0.19298245614035087, "grad_norm": 0.18849235769492945, "learning_rate": 1.8333333333333333e-05, "loss": 1.0142, "step": 11 }, { "epoch": 0.21052631578947367, "grad_norm": 0.17642654464303906, "learning_rate": 2e-05, "loss": 0.9902, "step": 12 }, { "epoch": 0.22807017543859648, "grad_norm": 0.17187933882719988, "learning_rate": 2.1666666666666667e-05, "loss": 1.017, "step": 13 }, { "epoch": 0.24561403508771928, "grad_norm": 0.17103598555992858, "learning_rate": 2.3333333333333336e-05, "loss": 0.9751, "step": 14 }, { "epoch": 0.2631578947368421, "grad_norm": 0.16014487415950107, "learning_rate": 2.5e-05, "loss": 0.9881, "step": 15 }, { "epoch": 0.2807017543859649, "grad_norm": 0.14028695923022452, "learning_rate": 2.4998640395219987e-05, "loss": 0.9778, "step": 16 }, { "epoch": 0.2982456140350877, "grad_norm": 0.12551729140438972, "learning_rate": 2.499456187664396e-05, "loss": 0.9689, "step": 17 }, { "epoch": 0.3157894736842105, "grad_norm": 0.1251340971956454, "learning_rate": 2.4987765331499672e-05, "loss": 0.9429, "step": 18 }, { "epoch": 0.3333333333333333, "grad_norm": 0.1296210035785423, "learning_rate": 2.497825223828555e-05, "loss": 0.946, "step": 19 }, { "epoch": 0.3508771929824561, "grad_norm": 0.11329484685623345, "learning_rate": 2.4966024666449125e-05, "loss": 0.9366, "step": 20 }, { "epoch": 0.3684210526315789, "grad_norm": 0.10321338855040195, "learning_rate": 2.495108527593681e-05, "loss": 0.9259, "step": 21 }, { "epoch": 0.38596491228070173, "grad_norm": 0.09404432805330766, "learning_rate": 2.493343731661529e-05, "loss": 0.9482, "step": 22 }, { "epoch": 0.40350877192982454, "grad_norm": 0.09243083734470846, "learning_rate": 2.4913084627564535e-05, "loss": 0.9065, "step": 23 }, { "epoch": 0.42105263157894735, "grad_norm": 0.08906226913721381, "learning_rate": 2.4890031636242685e-05, "loss": 0.8938, "step": 24 }, { "epoch": 0.43859649122807015, "grad_norm": 0.08389891645958811, "learning_rate": 2.486428335752288e-05, "loss": 0.916, "step": 25 }, { "epoch": 0.45614035087719296, "grad_norm": 0.08425667259579685, "learning_rate": 2.483584539260238e-05, "loss": 0.8779, "step": 26 }, { "epoch": 0.47368421052631576, "grad_norm": 0.07744492406671652, "learning_rate": 2.480472392778407e-05, "loss": 0.8834, "step": 27 }, { "epoch": 0.49122807017543857, "grad_norm": 0.08767059933162678, "learning_rate": 2.4770925733130725e-05, "loss": 0.9148, "step": 28 }, { "epoch": 0.5087719298245614, "grad_norm": 0.07726724919511256, "learning_rate": 2.473445816099226e-05, "loss": 0.9088, "step": 29 }, { "epoch": 0.5263157894736842, "grad_norm": 0.07836879806212735, "learning_rate": 2.4695329144406337e-05, "loss": 0.8944, "step": 30 }, { "epoch": 0.543859649122807, "grad_norm": 0.07205578520489196, "learning_rate": 2.465354719537264e-05, "loss": 0.8966, "step": 31 }, { "epoch": 0.5614035087719298, "grad_norm": 0.0732976565919632, "learning_rate": 2.460912140300119e-05, "loss": 0.8933, "step": 32 }, { "epoch": 0.5789473684210527, "grad_norm": 0.06413069817944542, "learning_rate": 2.4562061431535128e-05, "loss": 0.8687, "step": 33 }, { "epoch": 0.5964912280701754, "grad_norm": 0.061655130915948715, "learning_rate": 2.4512377518248398e-05, "loss": 0.8757, "step": 34 }, { "epoch": 0.6140350877192983, "grad_norm": 0.06005143040792398, "learning_rate": 2.4460080471218766e-05, "loss": 0.8763, "step": 35 }, { "epoch": 0.631578947368421, "grad_norm": 0.059268901460994255, "learning_rate": 2.4405181666976646e-05, "loss": 0.8691, "step": 36 }, { "epoch": 0.6491228070175439, "grad_norm": 0.0632968952247683, "learning_rate": 2.43476930480303e-05, "loss": 0.876, "step": 37 }, { "epoch": 0.6666666666666666, "grad_norm": 0.05479140319445762, "learning_rate": 2.428762712026792e-05, "loss": 0.8682, "step": 38 }, { "epoch": 0.6842105263157895, "grad_norm": 0.05563512390428038, "learning_rate": 2.4224996950237093e-05, "loss": 0.8841, "step": 39 }, { "epoch": 0.7017543859649122, "grad_norm": 0.05247972036076721, "learning_rate": 2.4159816162302394e-05, "loss": 0.8787, "step": 40 }, { "epoch": 0.7192982456140351, "grad_norm": 0.061183366717530226, "learning_rate": 2.4092098935681556e-05, "loss": 0.8549, "step": 41 }, { "epoch": 0.7368421052631579, "grad_norm": 0.05375757865160034, "learning_rate": 2.402186000136098e-05, "loss": 0.8528, "step": 42 }, { "epoch": 0.7543859649122807, "grad_norm": 0.05057045758521559, "learning_rate": 2.39491146388912e-05, "loss": 0.8536, "step": 43 }, { "epoch": 0.7719298245614035, "grad_norm": 0.04649702726117417, "learning_rate": 2.387387867306302e-05, "loss": 0.8488, "step": 44 }, { "epoch": 0.7894736842105263, "grad_norm": 0.04787518315340449, "learning_rate": 2.379616847046505e-05, "loss": 0.8573, "step": 45 }, { "epoch": 0.8070175438596491, "grad_norm": 0.045213502324060025, "learning_rate": 2.371600093592335e-05, "loss": 0.8727, "step": 46 }, { "epoch": 0.8245614035087719, "grad_norm": 0.04738126021150567, "learning_rate": 2.3633393508824022e-05, "loss": 0.8633, "step": 47 }, { "epoch": 0.8421052631578947, "grad_norm": 0.043833518738075415, "learning_rate": 2.3548364159319513e-05, "loss": 0.868, "step": 48 }, { "epoch": 0.8596491228070176, "grad_norm": 0.042510793627425734, "learning_rate": 2.3460931384419427e-05, "loss": 0.852, "step": 49 }, { "epoch": 0.8771929824561403, "grad_norm": 0.0408258194280563, "learning_rate": 2.3371114203966756e-05, "loss": 0.8595, "step": 50 }, { "epoch": 0.8947368421052632, "grad_norm": 0.041284443987598174, "learning_rate": 2.3278932156500348e-05, "loss": 0.8701, "step": 51 }, { "epoch": 0.9122807017543859, "grad_norm": 0.04015399468491511, "learning_rate": 2.3184405295004592e-05, "loss": 0.8378, "step": 52 }, { "epoch": 0.9298245614035088, "grad_norm": 0.0470194493755998, "learning_rate": 2.3087554182547123e-05, "loss": 0.8522, "step": 53 }, { "epoch": 0.9473684210526315, "grad_norm": 0.040346271463175765, "learning_rate": 2.298839988780561e-05, "loss": 0.8571, "step": 54 }, { "epoch": 0.9649122807017544, "grad_norm": 0.03907150168014673, "learning_rate": 2.288696398048455e-05, "loss": 0.8389, "step": 55 }, { "epoch": 0.9824561403508771, "grad_norm": 0.03966173521753987, "learning_rate": 2.278326852662305e-05, "loss": 0.8473, "step": 56 }, { "epoch": 1.0, "grad_norm": 0.03792422830368885, "learning_rate": 2.267733608379468e-05, "loss": 0.8308, "step": 57 }, { "epoch": 1.0175438596491229, "grad_norm": 0.039916995591762185, "learning_rate": 2.2569189696200327e-05, "loss": 0.8363, "step": 58 }, { "epoch": 1.0350877192982457, "grad_norm": 0.04222978922823946, "learning_rate": 2.2458852889655284e-05, "loss": 0.8248, "step": 59 }, { "epoch": 1.0526315789473684, "grad_norm": 0.037766008270626816, "learning_rate": 2.234634966647148e-05, "loss": 0.8108, "step": 60 }, { "epoch": 1.0701754385964912, "grad_norm": 0.03910137890933386, "learning_rate": 2.2231704500236117e-05, "loss": 0.8111, "step": 61 }, { "epoch": 1.087719298245614, "grad_norm": 0.03723938187511672, "learning_rate": 2.211494233048776e-05, "loss": 0.7961, "step": 62 }, { "epoch": 1.1052631578947367, "grad_norm": 0.03536618534586841, "learning_rate": 2.1996088557291062e-05, "loss": 0.8083, "step": 63 }, { "epoch": 1.1228070175438596, "grad_norm": 0.03509557198121232, "learning_rate": 2.1875169035711335e-05, "loss": 0.8301, "step": 64 }, { "epoch": 1.1403508771929824, "grad_norm": 0.03631360422996243, "learning_rate": 2.1752210070190106e-05, "loss": 0.8119, "step": 65 }, { "epoch": 1.1578947368421053, "grad_norm": 0.034923838848099804, "learning_rate": 2.162723840882293e-05, "loss": 0.8351, "step": 66 }, { "epoch": 1.1754385964912282, "grad_norm": 0.034222477158642954, "learning_rate": 2.150028123754072e-05, "loss": 0.8396, "step": 67 }, { "epoch": 1.1929824561403508, "grad_norm": 0.044099006560021574, "learning_rate": 2.137136617419578e-05, "loss": 0.8132, "step": 68 }, { "epoch": 1.2105263157894737, "grad_norm": 0.03760194901086737, "learning_rate": 2.1240521262553927e-05, "loss": 0.8277, "step": 69 }, { "epoch": 1.2280701754385965, "grad_norm": 0.034132230637497686, "learning_rate": 2.1107774966193932e-05, "loss": 0.8231, "step": 70 }, { "epoch": 1.2456140350877192, "grad_norm": 0.034145371494878535, "learning_rate": 2.097315616231564e-05, "loss": 0.8116, "step": 71 }, { "epoch": 1.263157894736842, "grad_norm": 0.03402745474331636, "learning_rate": 2.0836694135458136e-05, "loss": 0.8283, "step": 72 }, { "epoch": 1.280701754385965, "grad_norm": 0.046074062113807, "learning_rate": 2.0698418571129255e-05, "loss": 0.8161, "step": 73 }, { "epoch": 1.2982456140350878, "grad_norm": 0.03409175484451008, "learning_rate": 2.055835954934791e-05, "loss": 0.8056, "step": 74 }, { "epoch": 1.3157894736842106, "grad_norm": 0.03601398897730395, "learning_rate": 2.041654753810059e-05, "loss": 0.8139, "step": 75 }, { "epoch": 1.3333333333333333, "grad_norm": 0.035605249354124874, "learning_rate": 2.027301338671342e-05, "loss": 0.7993, "step": 76 }, { "epoch": 1.3508771929824561, "grad_norm": 0.03851629705577501, "learning_rate": 2.0127788319141345e-05, "loss": 0.8192, "step": 77 }, { "epoch": 1.368421052631579, "grad_norm": 0.03560123297297108, "learning_rate": 1.998090392717572e-05, "loss": 0.8194, "step": 78 }, { "epoch": 1.3859649122807016, "grad_norm": 0.03338440818080332, "learning_rate": 1.9832392163571977e-05, "loss": 0.823, "step": 79 }, { "epoch": 1.4035087719298245, "grad_norm": 0.039320599103418945, "learning_rate": 1.968228533509871e-05, "loss": 0.7991, "step": 80 }, { "epoch": 1.4210526315789473, "grad_norm": 0.03413703166680613, "learning_rate": 1.953061609550976e-05, "loss": 0.8122, "step": 81 }, { "epoch": 1.4385964912280702, "grad_norm": 0.03632660780988978, "learning_rate": 1.937741743844082e-05, "loss": 0.8051, "step": 82 }, { "epoch": 1.456140350877193, "grad_norm": 0.2720784291107051, "learning_rate": 1.9222722690232124e-05, "loss": 0.7982, "step": 83 }, { "epoch": 1.4736842105263157, "grad_norm": 0.033948193629099205, "learning_rate": 1.9066565502678735e-05, "loss": 0.8244, "step": 84 }, { "epoch": 1.4912280701754386, "grad_norm": 0.04015231276799685, "learning_rate": 1.8908979845710028e-05, "loss": 0.802, "step": 85 }, { "epoch": 1.5087719298245614, "grad_norm": 0.0334280910595663, "learning_rate": 1.8750000000000002e-05, "loss": 0.7944, "step": 86 }, { "epoch": 1.526315789473684, "grad_norm": 0.033433004016842426, "learning_rate": 1.8589660549509958e-05, "loss": 0.8086, "step": 87 }, { "epoch": 1.543859649122807, "grad_norm": 0.03647767785675323, "learning_rate": 1.842799637396523e-05, "loss": 0.8005, "step": 88 }, { "epoch": 1.5614035087719298, "grad_norm": 0.034851294076943595, "learning_rate": 1.8265042641267543e-05, "loss": 0.7697, "step": 89 }, { "epoch": 1.5789473684210527, "grad_norm": 0.035934237530251795, "learning_rate": 1.8100834799844733e-05, "loss": 0.8017, "step": 90 }, { "epoch": 1.5964912280701755, "grad_norm": 0.035583328708530516, "learning_rate": 1.793540857093937e-05, "loss": 0.8035, "step": 91 }, { "epoch": 1.6140350877192984, "grad_norm": 0.035602801094138097, "learning_rate": 1.77687999408381e-05, "loss": 0.7785, "step": 92 }, { "epoch": 1.631578947368421, "grad_norm": 0.03338943294215932, "learning_rate": 1.760104515304331e-05, "loss": 0.809, "step": 93 }, { "epoch": 1.6491228070175439, "grad_norm": 0.03440455754366396, "learning_rate": 1.743218070038882e-05, "loss": 0.7835, "step": 94 }, { "epoch": 1.6666666666666665, "grad_norm": 0.03413755186702014, "learning_rate": 1.7262243317101342e-05, "loss": 0.7857, "step": 95 }, { "epoch": 1.6842105263157894, "grad_norm": 0.03450977935766268, "learning_rate": 1.709126997080946e-05, "loss": 0.8045, "step": 96 }, { "epoch": 1.7017543859649122, "grad_norm": 0.03521585021316178, "learning_rate": 1.6919297854501793e-05, "loss": 0.7935, "step": 97 }, { "epoch": 1.719298245614035, "grad_norm": 0.03493887488862163, "learning_rate": 1.674636437843616e-05, "loss": 0.798, "step": 98 }, { "epoch": 1.736842105263158, "grad_norm": 0.035529760900503735, "learning_rate": 1.6572507162001472e-05, "loss": 0.799, "step": 99 }, { "epoch": 1.7543859649122808, "grad_norm": 0.033513087382252796, "learning_rate": 1.6397764025534122e-05, "loss": 0.7894, "step": 100 }, { "epoch": 1.7719298245614035, "grad_norm": 0.19122994636460844, "learning_rate": 1.6222172982090696e-05, "loss": 0.7804, "step": 101 }, { "epoch": 1.7894736842105263, "grad_norm": 0.035807687623747184, "learning_rate": 1.604577222917871e-05, "loss": 0.7951, "step": 102 }, { "epoch": 1.807017543859649, "grad_norm": 0.03259107722786136, "learning_rate": 1.586860014044726e-05, "loss": 0.7781, "step": 103 }, { "epoch": 1.8245614035087718, "grad_norm": 0.036809691518080494, "learning_rate": 1.5690695257339348e-05, "loss": 0.8008, "step": 104 }, { "epoch": 1.8421052631578947, "grad_norm": 0.035038305488987835, "learning_rate": 1.551209628070768e-05, "loss": 0.7753, "step": 105 }, { "epoch": 1.8596491228070176, "grad_norm": 0.03563704680282362, "learning_rate": 1.5332842062395837e-05, "loss": 0.8109, "step": 106 }, { "epoch": 1.8771929824561404, "grad_norm": 0.03337909268033773, "learning_rate": 1.5152971596786539e-05, "loss": 0.8074, "step": 107 }, { "epoch": 1.8947368421052633, "grad_norm": 0.03364182376321439, "learning_rate": 1.4972524012318968e-05, "loss": 0.7814, "step": 108 }, { "epoch": 1.912280701754386, "grad_norm": 0.03394899793236395, "learning_rate": 1.4791538562976858e-05, "loss": 0.8046, "step": 109 }, { "epoch": 1.9298245614035088, "grad_norm": 0.03276078625186682, "learning_rate": 1.4610054619749335e-05, "loss": 0.7923, "step": 110 }, { "epoch": 1.9473684210526314, "grad_norm": 0.035403995130927304, "learning_rate": 1.442811166206628e-05, "loss": 0.8036, "step": 111 }, { "epoch": 1.9649122807017543, "grad_norm": 0.036133327261938186, "learning_rate": 1.4245749269210077e-05, "loss": 0.7875, "step": 112 }, { "epoch": 1.9824561403508771, "grad_norm": 0.036600172448979534, "learning_rate": 1.40630071117057e-05, "loss": 0.7697, "step": 113 }, { "epoch": 2.0, "grad_norm": 0.037601350569099724, "learning_rate": 1.3879924942690875e-05, "loss": 0.8189, "step": 114 }, { "epoch": 2.017543859649123, "grad_norm": 0.03302530011499694, "learning_rate": 1.3696542589268343e-05, "loss": 0.7611, "step": 115 }, { "epoch": 2.0350877192982457, "grad_norm": 0.03512883427203999, "learning_rate": 1.3512899943842001e-05, "loss": 0.8027, "step": 116 }, { "epoch": 2.0526315789473686, "grad_norm": 0.032765823734136675, "learning_rate": 1.3329036955438801e-05, "loss": 0.7739, "step": 117 }, { "epoch": 2.0701754385964914, "grad_norm": 0.0325717443826363, "learning_rate": 1.3144993621018414e-05, "loss": 0.7749, "step": 118 }, { "epoch": 2.087719298245614, "grad_norm": 0.03351967483422044, "learning_rate": 1.2960809976772395e-05, "loss": 0.7776, "step": 119 }, { "epoch": 2.1052631578947367, "grad_norm": 0.03412426117127494, "learning_rate": 1.2776526089414836e-05, "loss": 0.7604, "step": 120 }, { "epoch": 2.1228070175438596, "grad_norm": 0.034817647751256633, "learning_rate": 1.2592182047466405e-05, "loss": 0.7701, "step": 121 }, { "epoch": 2.1403508771929824, "grad_norm": 0.03260110561553477, "learning_rate": 1.2407817952533594e-05, "loss": 0.7741, "step": 122 }, { "epoch": 2.1578947368421053, "grad_norm": 0.03193494436049472, "learning_rate": 1.2223473910585165e-05, "loss": 0.7645, "step": 123 }, { "epoch": 2.175438596491228, "grad_norm": 0.03312398775341158, "learning_rate": 1.2039190023227611e-05, "loss": 0.7585, "step": 124 }, { "epoch": 2.192982456140351, "grad_norm": 0.03343964045395972, "learning_rate": 1.1855006378981588e-05, "loss": 0.7921, "step": 125 }, { "epoch": 2.2105263157894735, "grad_norm": 0.033686542752496544, "learning_rate": 1.1670963044561205e-05, "loss": 0.7827, "step": 126 }, { "epoch": 2.2280701754385963, "grad_norm": 0.03497709388430689, "learning_rate": 1.1487100056158e-05, "loss": 0.7867, "step": 127 }, { "epoch": 2.245614035087719, "grad_norm": 0.03909574941588132, "learning_rate": 1.1303457410731658e-05, "loss": 0.7651, "step": 128 }, { "epoch": 2.263157894736842, "grad_norm": 0.03295131774552763, "learning_rate": 1.112007505730913e-05, "loss": 0.7716, "step": 129 }, { "epoch": 2.280701754385965, "grad_norm": 0.03535332720570186, "learning_rate": 1.0936992888294304e-05, "loss": 0.7519, "step": 130 }, { "epoch": 2.2982456140350878, "grad_norm": 0.034260157940805745, "learning_rate": 1.0754250730789925e-05, "loss": 0.7778, "step": 131 }, { "epoch": 2.3157894736842106, "grad_norm": 0.03778502943480454, "learning_rate": 1.057188833793372e-05, "loss": 0.7785, "step": 132 }, { "epoch": 2.3333333333333335, "grad_norm": 0.03282521968583762, "learning_rate": 1.0389945380250666e-05, "loss": 0.7822, "step": 133 }, { "epoch": 2.3508771929824563, "grad_norm": 0.03439059832810125, "learning_rate": 1.0208461437023146e-05, "loss": 0.7774, "step": 134 }, { "epoch": 2.3684210526315788, "grad_norm": 0.03579842875417821, "learning_rate": 1.0027475987681033e-05, "loss": 0.7626, "step": 135 }, { "epoch": 2.3859649122807016, "grad_norm": 0.04487526535583229, "learning_rate": 9.847028403213464e-06, "loss": 0.785, "step": 136 }, { "epoch": 2.4035087719298245, "grad_norm": 0.03582408613012423, "learning_rate": 9.667157937604165e-06, "loss": 0.772, "step": 137 }, { "epoch": 2.4210526315789473, "grad_norm": 0.035176969293327615, "learning_rate": 9.487903719292321e-06, "loss": 0.7777, "step": 138 }, { "epoch": 2.43859649122807, "grad_norm": 0.03346578608050542, "learning_rate": 9.309304742660656e-06, "loss": 0.7577, "step": 139 }, { "epoch": 2.456140350877193, "grad_norm": 0.039503770377912154, "learning_rate": 9.131399859552739e-06, "loss": 0.7901, "step": 140 }, { "epoch": 2.473684210526316, "grad_norm": 0.03677116076069421, "learning_rate": 8.954227770821292e-06, "loss": 0.7723, "step": 141 }, { "epoch": 2.4912280701754383, "grad_norm": 0.03338240458340637, "learning_rate": 8.77782701790931e-06, "loss": 0.7617, "step": 142 }, { "epoch": 2.5087719298245617, "grad_norm": 0.033837659971756015, "learning_rate": 8.60223597446588e-06, "loss": 0.7713, "step": 143 }, { "epoch": 2.526315789473684, "grad_norm": 0.03820384430127108, "learning_rate": 8.427492837998533e-06, "loss": 0.7557, "step": 144 }, { "epoch": 2.543859649122807, "grad_norm": 0.03252410748125962, "learning_rate": 8.25363562156384e-06, "loss": 0.7805, "step": 145 }, { "epoch": 2.56140350877193, "grad_norm": 0.03706815897768721, "learning_rate": 8.080702145498206e-06, "loss": 0.7645, "step": 146 }, { "epoch": 2.5789473684210527, "grad_norm": 0.035598168147733984, "learning_rate": 7.908730029190544e-06, "loss": 0.7877, "step": 147 }, { "epoch": 2.5964912280701755, "grad_norm": 0.0333688858025551, "learning_rate": 7.737756682898659e-06, "loss": 0.7591, "step": 148 }, { "epoch": 2.6140350877192984, "grad_norm": 0.044640638601682346, "learning_rate": 7.567819299611184e-06, "loss": 0.7658, "step": 149 }, { "epoch": 2.6315789473684212, "grad_norm": 0.033158357578421456, "learning_rate": 7.398954846956688e-06, "loss": 0.7719, "step": 150 }, { "epoch": 2.6491228070175437, "grad_norm": 0.033194259963536865, "learning_rate": 7.231200059161899e-06, "loss": 0.7806, "step": 151 }, { "epoch": 2.6666666666666665, "grad_norm": 0.03518173471203294, "learning_rate": 7.064591429060635e-06, "loss": 0.7679, "step": 152 }, { "epoch": 2.6842105263157894, "grad_norm": 0.03276207653785537, "learning_rate": 6.8991652001552695e-06, "loss": 0.7728, "step": 153 }, { "epoch": 2.7017543859649122, "grad_norm": 0.0347231674496661, "learning_rate": 6.734957358732458e-06, "loss": 0.7741, "step": 154 }, { "epoch": 2.719298245614035, "grad_norm": 0.033258910497780264, "learning_rate": 6.572003626034776e-06, "loss": 0.7728, "step": 155 }, { "epoch": 2.736842105263158, "grad_norm": 0.033923018163736, "learning_rate": 6.410339450490047e-06, "loss": 0.7838, "step": 156 }, { "epoch": 2.754385964912281, "grad_norm": 0.03499101652185436, "learning_rate": 6.250000000000003e-06, "loss": 0.7836, "step": 157 }, { "epoch": 2.7719298245614032, "grad_norm": 0.037455923489755065, "learning_rate": 6.091020154289971e-06, "loss": 0.786, "step": 158 }, { "epoch": 2.7894736842105265, "grad_norm": 0.03406798967875397, "learning_rate": 5.933434497321268e-06, "loss": 0.7607, "step": 159 }, { "epoch": 2.807017543859649, "grad_norm": 0.03263055408168355, "learning_rate": 5.777277309767873e-06, "loss": 0.7835, "step": 160 }, { "epoch": 2.824561403508772, "grad_norm": 0.0370087111535257, "learning_rate": 5.62258256155918e-06, "loss": 0.7506, "step": 161 }, { "epoch": 2.8421052631578947, "grad_norm": 0.03253506069052104, "learning_rate": 5.469383904490243e-06, "loss": 0.7849, "step": 162 }, { "epoch": 2.8596491228070176, "grad_norm": 0.034164574715283676, "learning_rate": 5.317714664901289e-06, "loss": 0.7665, "step": 163 }, { "epoch": 2.8771929824561404, "grad_norm": 0.03375521327460909, "learning_rate": 5.167607836428023e-06, "loss": 0.7497, "step": 164 }, { "epoch": 2.8947368421052633, "grad_norm": 0.035848240948616814, "learning_rate": 5.0190960728242834e-06, "loss": 0.7904, "step": 165 }, { "epoch": 2.912280701754386, "grad_norm": 0.03266391216652421, "learning_rate": 4.872211680858662e-06, "loss": 0.7592, "step": 166 }, { "epoch": 2.9298245614035086, "grad_norm": 0.03349205497732682, "learning_rate": 4.726986613286583e-06, "loss": 0.7666, "step": 167 }, { "epoch": 2.9473684210526314, "grad_norm": 0.034893605290688134, "learning_rate": 4.5834524618994106e-06, "loss": 0.7676, "step": 168 }, { "epoch": 2.9649122807017543, "grad_norm": 0.034664258032328234, "learning_rate": 4.441640450652093e-06, "loss": 0.7675, "step": 169 }, { "epoch": 2.982456140350877, "grad_norm": 0.032567147685664447, "learning_rate": 4.30158142887075e-06, "loss": 0.7607, "step": 170 }, { "epoch": 3.0, "grad_norm": 0.034965409248096775, "learning_rate": 4.163305864541865e-06, "loss": 0.7622, "step": 171 }, { "epoch": 3.017543859649123, "grad_norm": 0.034735445589064905, "learning_rate": 4.026843837684359e-06, "loss": 0.7767, "step": 172 }, { "epoch": 3.0350877192982457, "grad_norm": 0.061447912336751793, "learning_rate": 3.89222503380607e-06, "loss": 0.7423, "step": 173 }, { "epoch": 3.0526315789473686, "grad_norm": 0.03296491017047373, "learning_rate": 3.7594787374460747e-06, "loss": 0.7608, "step": 174 } ], "logging_steps": 1, "max_steps": 228, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 29, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.1115148672434176e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }