{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0526315789473686,
  "eval_steps": 500,
  "global_step": 174,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.017543859649122806,
      "grad_norm": 0.2277653039057954,
      "learning_rate": 1.6666666666666667e-06,
      "loss": 1.0079,
      "step": 1
    },
    {
      "epoch": 0.03508771929824561,
      "grad_norm": 0.22929541131469036,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 1.0155,
      "step": 2
    },
    {
      "epoch": 0.05263157894736842,
      "grad_norm": 0.23320532182877252,
      "learning_rate": 5e-06,
      "loss": 1.042,
      "step": 3
    },
    {
      "epoch": 0.07017543859649122,
      "grad_norm": 0.2327235097386226,
      "learning_rate": 6.666666666666667e-06,
      "loss": 1.0147,
      "step": 4
    },
    {
      "epoch": 0.08771929824561403,
      "grad_norm": 0.22899683750189437,
      "learning_rate": 8.333333333333334e-06,
      "loss": 1.0155,
      "step": 5
    },
    {
      "epoch": 0.10526315789473684,
      "grad_norm": 0.2346984759899663,
      "learning_rate": 1e-05,
      "loss": 1.0471,
      "step": 6
    },
    {
      "epoch": 0.12280701754385964,
      "grad_norm": 0.22126116958454167,
      "learning_rate": 1.1666666666666668e-05,
      "loss": 1.0295,
      "step": 7
    },
    {
      "epoch": 0.14035087719298245,
      "grad_norm": 0.21457628974392648,
      "learning_rate": 1.3333333333333333e-05,
      "loss": 1.0319,
      "step": 8
    },
    {
      "epoch": 0.15789473684210525,
      "grad_norm": 0.21253345072366,
      "learning_rate": 1.5e-05,
      "loss": 1.0166,
      "step": 9
    },
    {
      "epoch": 0.17543859649122806,
      "grad_norm": 0.21642169088434604,
      "learning_rate": 1.6666666666666667e-05,
      "loss": 1.0331,
      "step": 10
    },
    {
      "epoch": 0.19298245614035087,
      "grad_norm": 0.18849235769492945,
      "learning_rate": 1.8333333333333333e-05,
      "loss": 1.0142,
      "step": 11
    },
    {
      "epoch": 0.21052631578947367,
      "grad_norm": 0.17642654464303906,
      "learning_rate": 2e-05,
      "loss": 0.9902,
      "step": 12
    },
    {
      "epoch": 0.22807017543859648,
      "grad_norm": 0.17187933882719988,
      "learning_rate": 2.1666666666666667e-05,
      "loss": 1.017,
      "step": 13
    },
    {
      "epoch": 0.24561403508771928,
      "grad_norm": 0.17103598555992858,
      "learning_rate": 2.3333333333333336e-05,
      "loss": 0.9751,
      "step": 14
    },
    {
      "epoch": 0.2631578947368421,
      "grad_norm": 0.16014487415950107,
      "learning_rate": 2.5e-05,
      "loss": 0.9881,
      "step": 15
    },
    {
      "epoch": 0.2807017543859649,
      "grad_norm": 0.14028695923022452,
      "learning_rate": 2.4998640395219987e-05,
      "loss": 0.9778,
      "step": 16
    },
    {
      "epoch": 0.2982456140350877,
      "grad_norm": 0.12551729140438972,
      "learning_rate": 2.499456187664396e-05,
      "loss": 0.9689,
      "step": 17
    },
    {
      "epoch": 0.3157894736842105,
      "grad_norm": 0.1251340971956454,
      "learning_rate": 2.4987765331499672e-05,
      "loss": 0.9429,
      "step": 18
    },
    {
      "epoch": 0.3333333333333333,
      "grad_norm": 0.1296210035785423,
      "learning_rate": 2.497825223828555e-05,
      "loss": 0.946,
      "step": 19
    },
    {
      "epoch": 0.3508771929824561,
      "grad_norm": 0.11329484685623345,
      "learning_rate": 2.4966024666449125e-05,
      "loss": 0.9366,
      "step": 20
    },
    {
      "epoch": 0.3684210526315789,
      "grad_norm": 0.10321338855040195,
      "learning_rate": 2.495108527593681e-05,
      "loss": 0.9259,
      "step": 21
    },
    {
      "epoch": 0.38596491228070173,
      "grad_norm": 0.09404432805330766,
      "learning_rate": 2.493343731661529e-05,
      "loss": 0.9482,
      "step": 22
    },
    {
      "epoch": 0.40350877192982454,
      "grad_norm": 0.09243083734470846,
      "learning_rate": 2.4913084627564535e-05,
      "loss": 0.9065,
      "step": 23
    },
    {
      "epoch": 0.42105263157894735,
      "grad_norm": 0.08906226913721381,
      "learning_rate": 2.4890031636242685e-05,
      "loss": 0.8938,
      "step": 24
    },
    {
      "epoch": 0.43859649122807015,
      "grad_norm": 0.08389891645958811,
      "learning_rate": 2.486428335752288e-05,
      "loss": 0.916,
      "step": 25
    },
    {
      "epoch": 0.45614035087719296,
      "grad_norm": 0.08425667259579685,
      "learning_rate": 2.483584539260238e-05,
      "loss": 0.8779,
      "step": 26
    },
    {
      "epoch": 0.47368421052631576,
      "grad_norm": 0.07744492406671652,
      "learning_rate": 2.480472392778407e-05,
      "loss": 0.8834,
      "step": 27
    },
    {
      "epoch": 0.49122807017543857,
      "grad_norm": 0.08767059933162678,
      "learning_rate": 2.4770925733130725e-05,
      "loss": 0.9148,
      "step": 28
    },
    {
      "epoch": 0.5087719298245614,
      "grad_norm": 0.07726724919511256,
      "learning_rate": 2.473445816099226e-05,
      "loss": 0.9088,
      "step": 29
    },
    {
      "epoch": 0.5263157894736842,
      "grad_norm": 0.07836879806212735,
      "learning_rate": 2.4695329144406337e-05,
      "loss": 0.8944,
      "step": 30
    },
    {
      "epoch": 0.543859649122807,
      "grad_norm": 0.07205578520489196,
      "learning_rate": 2.465354719537264e-05,
      "loss": 0.8966,
      "step": 31
    },
    {
      "epoch": 0.5614035087719298,
      "grad_norm": 0.0732976565919632,
      "learning_rate": 2.460912140300119e-05,
      "loss": 0.8933,
      "step": 32
    },
    {
      "epoch": 0.5789473684210527,
      "grad_norm": 0.06413069817944542,
      "learning_rate": 2.4562061431535128e-05,
      "loss": 0.8687,
      "step": 33
    },
    {
      "epoch": 0.5964912280701754,
      "grad_norm": 0.061655130915948715,
      "learning_rate": 2.4512377518248398e-05,
      "loss": 0.8757,
      "step": 34
    },
    {
      "epoch": 0.6140350877192983,
      "grad_norm": 0.06005143040792398,
      "learning_rate": 2.4460080471218766e-05,
      "loss": 0.8763,
      "step": 35
    },
    {
      "epoch": 0.631578947368421,
      "grad_norm": 0.059268901460994255,
      "learning_rate": 2.4405181666976646e-05,
      "loss": 0.8691,
      "step": 36
    },
    {
      "epoch": 0.6491228070175439,
      "grad_norm": 0.0632968952247683,
      "learning_rate": 2.43476930480303e-05,
      "loss": 0.876,
      "step": 37
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 0.05479140319445762,
      "learning_rate": 2.428762712026792e-05,
      "loss": 0.8682,
      "step": 38
    },
    {
      "epoch": 0.6842105263157895,
      "grad_norm": 0.05563512390428038,
      "learning_rate": 2.4224996950237093e-05,
      "loss": 0.8841,
      "step": 39
    },
    {
      "epoch": 0.7017543859649122,
      "grad_norm": 0.05247972036076721,
      "learning_rate": 2.4159816162302394e-05,
      "loss": 0.8787,
      "step": 40
    },
    {
      "epoch": 0.7192982456140351,
      "grad_norm": 0.061183366717530226,
      "learning_rate": 2.4092098935681556e-05,
      "loss": 0.8549,
      "step": 41
    },
    {
      "epoch": 0.7368421052631579,
      "grad_norm": 0.05375757865160034,
      "learning_rate": 2.402186000136098e-05,
      "loss": 0.8528,
      "step": 42
    },
    {
      "epoch": 0.7543859649122807,
      "grad_norm": 0.05057045758521559,
      "learning_rate": 2.39491146388912e-05,
      "loss": 0.8536,
      "step": 43
    },
    {
      "epoch": 0.7719298245614035,
      "grad_norm": 0.04649702726117417,
      "learning_rate": 2.387387867306302e-05,
      "loss": 0.8488,
      "step": 44
    },
    {
      "epoch": 0.7894736842105263,
      "grad_norm": 0.04787518315340449,
      "learning_rate": 2.379616847046505e-05,
      "loss": 0.8573,
      "step": 45
    },
    {
      "epoch": 0.8070175438596491,
      "grad_norm": 0.045213502324060025,
      "learning_rate": 2.371600093592335e-05,
      "loss": 0.8727,
      "step": 46
    },
    {
      "epoch": 0.8245614035087719,
      "grad_norm": 0.04738126021150567,
      "learning_rate": 2.3633393508824022e-05,
      "loss": 0.8633,
      "step": 47
    },
    {
      "epoch": 0.8421052631578947,
      "grad_norm": 0.043833518738075415,
      "learning_rate": 2.3548364159319513e-05,
      "loss": 0.868,
      "step": 48
    },
    {
      "epoch": 0.8596491228070176,
      "grad_norm": 0.042510793627425734,
      "learning_rate": 2.3460931384419427e-05,
      "loss": 0.852,
      "step": 49
    },
    {
      "epoch": 0.8771929824561403,
      "grad_norm": 0.0408258194280563,
      "learning_rate": 2.3371114203966756e-05,
      "loss": 0.8595,
      "step": 50
    },
    {
      "epoch": 0.8947368421052632,
      "grad_norm": 0.041284443987598174,
      "learning_rate": 2.3278932156500348e-05,
      "loss": 0.8701,
      "step": 51
    },
    {
      "epoch": 0.9122807017543859,
      "grad_norm": 0.04015399468491511,
      "learning_rate": 2.3184405295004592e-05,
      "loss": 0.8378,
      "step": 52
    },
    {
      "epoch": 0.9298245614035088,
      "grad_norm": 0.0470194493755998,
      "learning_rate": 2.3087554182547123e-05,
      "loss": 0.8522,
      "step": 53
    },
    {
      "epoch": 0.9473684210526315,
      "grad_norm": 0.040346271463175765,
      "learning_rate": 2.298839988780561e-05,
      "loss": 0.8571,
      "step": 54
    },
    {
      "epoch": 0.9649122807017544,
      "grad_norm": 0.03907150168014673,
      "learning_rate": 2.288696398048455e-05,
      "loss": 0.8389,
      "step": 55
    },
    {
      "epoch": 0.9824561403508771,
      "grad_norm": 0.03966173521753987,
      "learning_rate": 2.278326852662305e-05,
      "loss": 0.8473,
      "step": 56
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.03792422830368885,
      "learning_rate": 2.267733608379468e-05,
      "loss": 0.8308,
      "step": 57
    },
    {
      "epoch": 1.0175438596491229,
      "grad_norm": 0.039916995591762185,
      "learning_rate": 2.2569189696200327e-05,
      "loss": 0.8363,
      "step": 58
    },
    {
      "epoch": 1.0350877192982457,
      "grad_norm": 0.04222978922823946,
      "learning_rate": 2.2458852889655284e-05,
      "loss": 0.8248,
      "step": 59
    },
    {
      "epoch": 1.0526315789473684,
      "grad_norm": 0.037766008270626816,
      "learning_rate": 2.234634966647148e-05,
      "loss": 0.8108,
      "step": 60
    },
    {
      "epoch": 1.0701754385964912,
      "grad_norm": 0.03910137890933386,
      "learning_rate": 2.2231704500236117e-05,
      "loss": 0.8111,
      "step": 61
    },
    {
      "epoch": 1.087719298245614,
      "grad_norm": 0.03723938187511672,
      "learning_rate": 2.211494233048776e-05,
      "loss": 0.7961,
      "step": 62
    },
    {
      "epoch": 1.1052631578947367,
      "grad_norm": 0.03536618534586841,
      "learning_rate": 2.1996088557291062e-05,
      "loss": 0.8083,
      "step": 63
    },
    {
      "epoch": 1.1228070175438596,
      "grad_norm": 0.03509557198121232,
      "learning_rate": 2.1875169035711335e-05,
      "loss": 0.8301,
      "step": 64
    },
    {
      "epoch": 1.1403508771929824,
      "grad_norm": 0.03631360422996243,
      "learning_rate": 2.1752210070190106e-05,
      "loss": 0.8119,
      "step": 65
    },
    {
      "epoch": 1.1578947368421053,
      "grad_norm": 0.034923838848099804,
      "learning_rate": 2.162723840882293e-05,
      "loss": 0.8351,
      "step": 66
    },
    {
      "epoch": 1.1754385964912282,
      "grad_norm": 0.034222477158642954,
      "learning_rate": 2.150028123754072e-05,
      "loss": 0.8396,
      "step": 67
    },
    {
      "epoch": 1.1929824561403508,
      "grad_norm": 0.044099006560021574,
      "learning_rate": 2.137136617419578e-05,
      "loss": 0.8132,
      "step": 68
    },
    {
      "epoch": 1.2105263157894737,
      "grad_norm": 0.03760194901086737,
      "learning_rate": 2.1240521262553927e-05,
      "loss": 0.8277,
      "step": 69
    },
    {
      "epoch": 1.2280701754385965,
      "grad_norm": 0.034132230637497686,
      "learning_rate": 2.1107774966193932e-05,
      "loss": 0.8231,
      "step": 70
    },
    {
      "epoch": 1.2456140350877192,
      "grad_norm": 0.034145371494878535,
      "learning_rate": 2.097315616231564e-05,
      "loss": 0.8116,
      "step": 71
    },
    {
      "epoch": 1.263157894736842,
      "grad_norm": 0.03402745474331636,
      "learning_rate": 2.0836694135458136e-05,
      "loss": 0.8283,
      "step": 72
    },
    {
      "epoch": 1.280701754385965,
      "grad_norm": 0.046074062113807,
      "learning_rate": 2.0698418571129255e-05,
      "loss": 0.8161,
      "step": 73
    },
    {
      "epoch": 1.2982456140350878,
      "grad_norm": 0.03409175484451008,
      "learning_rate": 2.055835954934791e-05,
      "loss": 0.8056,
      "step": 74
    },
    {
      "epoch": 1.3157894736842106,
      "grad_norm": 0.03601398897730395,
      "learning_rate": 2.041654753810059e-05,
      "loss": 0.8139,
      "step": 75
    },
    {
      "epoch": 1.3333333333333333,
      "grad_norm": 0.035605249354124874,
      "learning_rate": 2.027301338671342e-05,
      "loss": 0.7993,
      "step": 76
    },
    {
      "epoch": 1.3508771929824561,
      "grad_norm": 0.03851629705577501,
      "learning_rate": 2.0127788319141345e-05,
      "loss": 0.8192,
      "step": 77
    },
    {
      "epoch": 1.368421052631579,
      "grad_norm": 0.03560123297297108,
      "learning_rate": 1.998090392717572e-05,
      "loss": 0.8194,
      "step": 78
    },
    {
      "epoch": 1.3859649122807016,
      "grad_norm": 0.03338440818080332,
      "learning_rate": 1.9832392163571977e-05,
      "loss": 0.823,
      "step": 79
    },
    {
      "epoch": 1.4035087719298245,
      "grad_norm": 0.039320599103418945,
      "learning_rate": 1.968228533509871e-05,
      "loss": 0.7991,
      "step": 80
    },
    {
      "epoch": 1.4210526315789473,
      "grad_norm": 0.03413703166680613,
      "learning_rate": 1.953061609550976e-05,
      "loss": 0.8122,
      "step": 81
    },
    {
      "epoch": 1.4385964912280702,
      "grad_norm": 0.03632660780988978,
      "learning_rate": 1.937741743844082e-05,
      "loss": 0.8051,
      "step": 82
    },
    {
      "epoch": 1.456140350877193,
      "grad_norm": 0.2720784291107051,
      "learning_rate": 1.9222722690232124e-05,
      "loss": 0.7982,
      "step": 83
    },
    {
      "epoch": 1.4736842105263157,
      "grad_norm": 0.033948193629099205,
      "learning_rate": 1.9066565502678735e-05,
      "loss": 0.8244,
      "step": 84
    },
    {
      "epoch": 1.4912280701754386,
      "grad_norm": 0.04015231276799685,
      "learning_rate": 1.8908979845710028e-05,
      "loss": 0.802,
      "step": 85
    },
    {
      "epoch": 1.5087719298245614,
      "grad_norm": 0.0334280910595663,
      "learning_rate": 1.8750000000000002e-05,
      "loss": 0.7944,
      "step": 86
    },
    {
      "epoch": 1.526315789473684,
      "grad_norm": 0.033433004016842426,
      "learning_rate": 1.8589660549509958e-05,
      "loss": 0.8086,
      "step": 87
    },
    {
      "epoch": 1.543859649122807,
      "grad_norm": 0.03647767785675323,
      "learning_rate": 1.842799637396523e-05,
      "loss": 0.8005,
      "step": 88
    },
    {
      "epoch": 1.5614035087719298,
      "grad_norm": 0.034851294076943595,
      "learning_rate": 1.8265042641267543e-05,
      "loss": 0.7697,
      "step": 89
    },
    {
      "epoch": 1.5789473684210527,
      "grad_norm": 0.035934237530251795,
      "learning_rate": 1.8100834799844733e-05,
      "loss": 0.8017,
      "step": 90
    },
    {
      "epoch": 1.5964912280701755,
      "grad_norm": 0.035583328708530516,
      "learning_rate": 1.793540857093937e-05,
      "loss": 0.8035,
      "step": 91
    },
    {
      "epoch": 1.6140350877192984,
      "grad_norm": 0.035602801094138097,
      "learning_rate": 1.77687999408381e-05,
      "loss": 0.7785,
      "step": 92
    },
    {
      "epoch": 1.631578947368421,
      "grad_norm": 0.03338943294215932,
      "learning_rate": 1.760104515304331e-05,
      "loss": 0.809,
      "step": 93
    },
    {
      "epoch": 1.6491228070175439,
      "grad_norm": 0.03440455754366396,
      "learning_rate": 1.743218070038882e-05,
      "loss": 0.7835,
      "step": 94
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 0.03413755186702014,
      "learning_rate": 1.7262243317101342e-05,
      "loss": 0.7857,
      "step": 95
    },
    {
      "epoch": 1.6842105263157894,
      "grad_norm": 0.03450977935766268,
      "learning_rate": 1.709126997080946e-05,
      "loss": 0.8045,
      "step": 96
    },
    {
      "epoch": 1.7017543859649122,
      "grad_norm": 0.03521585021316178,
      "learning_rate": 1.6919297854501793e-05,
      "loss": 0.7935,
      "step": 97
    },
    {
      "epoch": 1.719298245614035,
      "grad_norm": 0.03493887488862163,
      "learning_rate": 1.674636437843616e-05,
      "loss": 0.798,
      "step": 98
    },
    {
      "epoch": 1.736842105263158,
      "grad_norm": 0.035529760900503735,
      "learning_rate": 1.6572507162001472e-05,
      "loss": 0.799,
      "step": 99
    },
    {
      "epoch": 1.7543859649122808,
      "grad_norm": 0.033513087382252796,
      "learning_rate": 1.6397764025534122e-05,
      "loss": 0.7894,
      "step": 100
    },
    {
      "epoch": 1.7719298245614035,
      "grad_norm": 0.19122994636460844,
      "learning_rate": 1.6222172982090696e-05,
      "loss": 0.7804,
      "step": 101
    },
    {
      "epoch": 1.7894736842105263,
      "grad_norm": 0.035807687623747184,
      "learning_rate": 1.604577222917871e-05,
      "loss": 0.7951,
      "step": 102
    },
    {
      "epoch": 1.807017543859649,
      "grad_norm": 0.03259107722786136,
      "learning_rate": 1.586860014044726e-05,
      "loss": 0.7781,
      "step": 103
    },
    {
      "epoch": 1.8245614035087718,
      "grad_norm": 0.036809691518080494,
      "learning_rate": 1.5690695257339348e-05,
      "loss": 0.8008,
      "step": 104
    },
    {
      "epoch": 1.8421052631578947,
      "grad_norm": 0.035038305488987835,
      "learning_rate": 1.551209628070768e-05,
      "loss": 0.7753,
      "step": 105
    },
    {
      "epoch": 1.8596491228070176,
      "grad_norm": 0.03563704680282362,
      "learning_rate": 1.5332842062395837e-05,
      "loss": 0.8109,
      "step": 106
    },
    {
      "epoch": 1.8771929824561404,
      "grad_norm": 0.03337909268033773,
      "learning_rate": 1.5152971596786539e-05,
      "loss": 0.8074,
      "step": 107
    },
    {
      "epoch": 1.8947368421052633,
      "grad_norm": 0.03364182376321439,
      "learning_rate": 1.4972524012318968e-05,
      "loss": 0.7814,
      "step": 108
    },
    {
      "epoch": 1.912280701754386,
      "grad_norm": 0.03394899793236395,
      "learning_rate": 1.4791538562976858e-05,
      "loss": 0.8046,
      "step": 109
    },
    {
      "epoch": 1.9298245614035088,
      "grad_norm": 0.03276078625186682,
      "learning_rate": 1.4610054619749335e-05,
      "loss": 0.7923,
      "step": 110
    },
    {
      "epoch": 1.9473684210526314,
      "grad_norm": 0.035403995130927304,
      "learning_rate": 1.442811166206628e-05,
      "loss": 0.8036,
      "step": 111
    },
    {
      "epoch": 1.9649122807017543,
      "grad_norm": 0.036133327261938186,
      "learning_rate": 1.4245749269210077e-05,
      "loss": 0.7875,
      "step": 112
    },
    {
      "epoch": 1.9824561403508771,
      "grad_norm": 0.036600172448979534,
      "learning_rate": 1.40630071117057e-05,
      "loss": 0.7697,
      "step": 113
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.037601350569099724,
      "learning_rate": 1.3879924942690875e-05,
      "loss": 0.8189,
      "step": 114
    },
    {
      "epoch": 2.017543859649123,
      "grad_norm": 0.03302530011499694,
      "learning_rate": 1.3696542589268343e-05,
      "loss": 0.7611,
      "step": 115
    },
    {
      "epoch": 2.0350877192982457,
      "grad_norm": 0.03512883427203999,
      "learning_rate": 1.3512899943842001e-05,
      "loss": 0.8027,
      "step": 116
    },
    {
      "epoch": 2.0526315789473686,
      "grad_norm": 0.032765823734136675,
      "learning_rate": 1.3329036955438801e-05,
      "loss": 0.7739,
      "step": 117
    },
    {
      "epoch": 2.0701754385964914,
      "grad_norm": 0.0325717443826363,
      "learning_rate": 1.3144993621018414e-05,
      "loss": 0.7749,
      "step": 118
    },
    {
      "epoch": 2.087719298245614,
      "grad_norm": 0.03351967483422044,
      "learning_rate": 1.2960809976772395e-05,
      "loss": 0.7776,
      "step": 119
    },
    {
      "epoch": 2.1052631578947367,
      "grad_norm": 0.03412426117127494,
      "learning_rate": 1.2776526089414836e-05,
      "loss": 0.7604,
      "step": 120
    },
    {
      "epoch": 2.1228070175438596,
      "grad_norm": 0.034817647751256633,
      "learning_rate": 1.2592182047466405e-05,
      "loss": 0.7701,
      "step": 121
    },
    {
      "epoch": 2.1403508771929824,
      "grad_norm": 0.03260110561553477,
      "learning_rate": 1.2407817952533594e-05,
      "loss": 0.7741,
      "step": 122
    },
    {
      "epoch": 2.1578947368421053,
      "grad_norm": 0.03193494436049472,
      "learning_rate": 1.2223473910585165e-05,
      "loss": 0.7645,
      "step": 123
    },
    {
      "epoch": 2.175438596491228,
      "grad_norm": 0.03312398775341158,
      "learning_rate": 1.2039190023227611e-05,
      "loss": 0.7585,
      "step": 124
    },
    {
      "epoch": 2.192982456140351,
      "grad_norm": 0.03343964045395972,
      "learning_rate": 1.1855006378981588e-05,
      "loss": 0.7921,
      "step": 125
    },
    {
      "epoch": 2.2105263157894735,
      "grad_norm": 0.033686542752496544,
      "learning_rate": 1.1670963044561205e-05,
      "loss": 0.7827,
      "step": 126
    },
    {
      "epoch": 2.2280701754385963,
      "grad_norm": 0.03497709388430689,
      "learning_rate": 1.1487100056158e-05,
      "loss": 0.7867,
      "step": 127
    },
    {
      "epoch": 2.245614035087719,
      "grad_norm": 0.03909574941588132,
      "learning_rate": 1.1303457410731658e-05,
      "loss": 0.7651,
      "step": 128
    },
    {
      "epoch": 2.263157894736842,
      "grad_norm": 0.03295131774552763,
      "learning_rate": 1.112007505730913e-05,
      "loss": 0.7716,
      "step": 129
    },
    {
      "epoch": 2.280701754385965,
      "grad_norm": 0.03535332720570186,
      "learning_rate": 1.0936992888294304e-05,
      "loss": 0.7519,
      "step": 130
    },
    {
      "epoch": 2.2982456140350878,
      "grad_norm": 0.034260157940805745,
      "learning_rate": 1.0754250730789925e-05,
      "loss": 0.7778,
      "step": 131
    },
    {
      "epoch": 2.3157894736842106,
      "grad_norm": 0.03778502943480454,
      "learning_rate": 1.057188833793372e-05,
      "loss": 0.7785,
      "step": 132
    },
    {
      "epoch": 2.3333333333333335,
      "grad_norm": 0.03282521968583762,
      "learning_rate": 1.0389945380250666e-05,
      "loss": 0.7822,
      "step": 133
    },
    {
      "epoch": 2.3508771929824563,
      "grad_norm": 0.03439059832810125,
      "learning_rate": 1.0208461437023146e-05,
      "loss": 0.7774,
      "step": 134
    },
    {
      "epoch": 2.3684210526315788,
      "grad_norm": 0.03579842875417821,
      "learning_rate": 1.0027475987681033e-05,
      "loss": 0.7626,
      "step": 135
    },
    {
      "epoch": 2.3859649122807016,
      "grad_norm": 0.04487526535583229,
      "learning_rate": 9.847028403213464e-06,
      "loss": 0.785,
      "step": 136
    },
    {
      "epoch": 2.4035087719298245,
      "grad_norm": 0.03582408613012423,
      "learning_rate": 9.667157937604165e-06,
      "loss": 0.772,
      "step": 137
    },
    {
      "epoch": 2.4210526315789473,
      "grad_norm": 0.035176969293327615,
      "learning_rate": 9.487903719292321e-06,
      "loss": 0.7777,
      "step": 138
    },
    {
      "epoch": 2.43859649122807,
      "grad_norm": 0.03346578608050542,
      "learning_rate": 9.309304742660656e-06,
      "loss": 0.7577,
      "step": 139
    },
    {
      "epoch": 2.456140350877193,
      "grad_norm": 0.039503770377912154,
      "learning_rate": 9.131399859552739e-06,
      "loss": 0.7901,
      "step": 140
    },
    {
      "epoch": 2.473684210526316,
      "grad_norm": 0.03677116076069421,
      "learning_rate": 8.954227770821292e-06,
      "loss": 0.7723,
      "step": 141
    },
    {
      "epoch": 2.4912280701754383,
      "grad_norm": 0.03338240458340637,
      "learning_rate": 8.77782701790931e-06,
      "loss": 0.7617,
      "step": 142
    },
    {
      "epoch": 2.5087719298245617,
      "grad_norm": 0.033837659971756015,
      "learning_rate": 8.60223597446588e-06,
      "loss": 0.7713,
      "step": 143
    },
    {
      "epoch": 2.526315789473684,
      "grad_norm": 0.03820384430127108,
      "learning_rate": 8.427492837998533e-06,
      "loss": 0.7557,
      "step": 144
    },
    {
      "epoch": 2.543859649122807,
      "grad_norm": 0.03252410748125962,
      "learning_rate": 8.25363562156384e-06,
      "loss": 0.7805,
      "step": 145
    },
    {
      "epoch": 2.56140350877193,
      "grad_norm": 0.03706815897768721,
      "learning_rate": 8.080702145498206e-06,
      "loss": 0.7645,
      "step": 146
    },
    {
      "epoch": 2.5789473684210527,
      "grad_norm": 0.035598168147733984,
      "learning_rate": 7.908730029190544e-06,
      "loss": 0.7877,
      "step": 147
    },
    {
      "epoch": 2.5964912280701755,
      "grad_norm": 0.0333688858025551,
      "learning_rate": 7.737756682898659e-06,
      "loss": 0.7591,
      "step": 148
    },
    {
      "epoch": 2.6140350877192984,
      "grad_norm": 0.044640638601682346,
      "learning_rate": 7.567819299611184e-06,
      "loss": 0.7658,
      "step": 149
    },
    {
      "epoch": 2.6315789473684212,
      "grad_norm": 0.033158357578421456,
      "learning_rate": 7.398954846956688e-06,
      "loss": 0.7719,
      "step": 150
    },
    {
      "epoch": 2.6491228070175437,
      "grad_norm": 0.033194259963536865,
      "learning_rate": 7.231200059161899e-06,
      "loss": 0.7806,
      "step": 151
    },
    {
      "epoch": 2.6666666666666665,
      "grad_norm": 0.03518173471203294,
      "learning_rate": 7.064591429060635e-06,
      "loss": 0.7679,
      "step": 152
    },
    {
      "epoch": 2.6842105263157894,
      "grad_norm": 0.03276207653785537,
      "learning_rate": 6.8991652001552695e-06,
      "loss": 0.7728,
      "step": 153
    },
    {
      "epoch": 2.7017543859649122,
      "grad_norm": 0.0347231674496661,
      "learning_rate": 6.734957358732458e-06,
      "loss": 0.7741,
      "step": 154
    },
    {
      "epoch": 2.719298245614035,
      "grad_norm": 0.033258910497780264,
      "learning_rate": 6.572003626034776e-06,
      "loss": 0.7728,
      "step": 155
    },
    {
      "epoch": 2.736842105263158,
      "grad_norm": 0.033923018163736,
      "learning_rate": 6.410339450490047e-06,
      "loss": 0.7838,
      "step": 156
    },
    {
      "epoch": 2.754385964912281,
      "grad_norm": 0.03499101652185436,
      "learning_rate": 6.250000000000003e-06,
      "loss": 0.7836,
      "step": 157
    },
    {
      "epoch": 2.7719298245614032,
      "grad_norm": 0.037455923489755065,
      "learning_rate": 6.091020154289971e-06,
      "loss": 0.786,
      "step": 158
    },
    {
      "epoch": 2.7894736842105265,
      "grad_norm": 0.03406798967875397,
      "learning_rate": 5.933434497321268e-06,
      "loss": 0.7607,
      "step": 159
    },
    {
      "epoch": 2.807017543859649,
      "grad_norm": 0.03263055408168355,
      "learning_rate": 5.777277309767873e-06,
      "loss": 0.7835,
      "step": 160
    },
    {
      "epoch": 2.824561403508772,
      "grad_norm": 0.0370087111535257,
      "learning_rate": 5.62258256155918e-06,
      "loss": 0.7506,
      "step": 161
    },
    {
      "epoch": 2.8421052631578947,
      "grad_norm": 0.03253506069052104,
      "learning_rate": 5.469383904490243e-06,
      "loss": 0.7849,
      "step": 162
    },
    {
      "epoch": 2.8596491228070176,
      "grad_norm": 0.034164574715283676,
      "learning_rate": 5.317714664901289e-06,
      "loss": 0.7665,
      "step": 163
    },
    {
      "epoch": 2.8771929824561404,
      "grad_norm": 0.03375521327460909,
      "learning_rate": 5.167607836428023e-06,
      "loss": 0.7497,
      "step": 164
    },
    {
      "epoch": 2.8947368421052633,
      "grad_norm": 0.035848240948616814,
      "learning_rate": 5.0190960728242834e-06,
      "loss": 0.7904,
      "step": 165
    },
    {
      "epoch": 2.912280701754386,
      "grad_norm": 0.03266391216652421,
      "learning_rate": 4.872211680858662e-06,
      "loss": 0.7592,
      "step": 166
    },
    {
      "epoch": 2.9298245614035086,
      "grad_norm": 0.03349205497732682,
      "learning_rate": 4.726986613286583e-06,
      "loss": 0.7666,
      "step": 167
    },
    {
      "epoch": 2.9473684210526314,
      "grad_norm": 0.034893605290688134,
      "learning_rate": 4.5834524618994106e-06,
      "loss": 0.7676,
      "step": 168
    },
    {
      "epoch": 2.9649122807017543,
      "grad_norm": 0.034664258032328234,
      "learning_rate": 4.441640450652093e-06,
      "loss": 0.7675,
      "step": 169
    },
    {
      "epoch": 2.982456140350877,
      "grad_norm": 0.032567147685664447,
      "learning_rate": 4.30158142887075e-06,
      "loss": 0.7607,
      "step": 170
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.034965409248096775,
      "learning_rate": 4.163305864541865e-06,
      "loss": 0.7622,
      "step": 171
    },
    {
      "epoch": 3.017543859649123,
      "grad_norm": 0.034735445589064905,
      "learning_rate": 4.026843837684359e-06,
      "loss": 0.7767,
      "step": 172
    },
    {
      "epoch": 3.0350877192982457,
      "grad_norm": 0.061447912336751793,
      "learning_rate": 3.89222503380607e-06,
      "loss": 0.7423,
      "step": 173
    },
    {
      "epoch": 3.0526315789473686,
      "grad_norm": 0.03296491017047373,
      "learning_rate": 3.7594787374460747e-06,
      "loss": 0.7608,
      "step": 174
    }
  ],
  "logging_steps": 1,
  "max_steps": 228,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 29,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.1115148672434176e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}