| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 5.0, | |
| "eval_steps": 5000, | |
| "global_step": 87895, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.05688605722737357, | |
| "grad_norm": 2.3711910247802734, | |
| "learning_rate": 0.0007909073326127766, | |
| "loss": 2.6366, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.11377211445474714, | |
| "grad_norm": 2.2273147106170654, | |
| "learning_rate": 0.0007818055634563969, | |
| "loss": 1.7361, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.17065817168212072, | |
| "grad_norm": 2.9114110469818115, | |
| "learning_rate": 0.000772703794300017, | |
| "loss": 1.5903, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.22754422890949427, | |
| "grad_norm": 1.7726603746414185, | |
| "learning_rate": 0.0007636020251436373, | |
| "loss": 1.5127, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.2844302861368678, | |
| "grad_norm": 1.8174991607666016, | |
| "learning_rate": 0.0007545002559872575, | |
| "loss": 1.4609, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.2844302861368678, | |
| "eval_accuracy": 0.653196, | |
| "eval_loss": 1.3989018201828003, | |
| "eval_runtime": 65.7885, | |
| "eval_samples_per_second": 3800.055, | |
| "eval_steps_per_second": 14.851, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.34131634336424144, | |
| "grad_norm": 1.7002882957458496, | |
| "learning_rate": 0.0007453984868308778, | |
| "loss": 1.4214, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.398202400591615, | |
| "grad_norm": 1.6060094833374023, | |
| "learning_rate": 0.0007362967176744981, | |
| "loss": 1.3803, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.45508845781898855, | |
| "grad_norm": 2.100240468978882, | |
| "learning_rate": 0.0007271949485181182, | |
| "loss": 1.358, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.5119745150463622, | |
| "grad_norm": 1.507076621055603, | |
| "learning_rate": 0.0007180931793617385, | |
| "loss": 1.3392, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.5688605722737357, | |
| "grad_norm": 1.8028790950775146, | |
| "learning_rate": 0.0007089914102053587, | |
| "loss": 1.3211, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.5688605722737357, | |
| "eval_accuracy": 0.680348, | |
| "eval_loss": 1.2739007472991943, | |
| "eval_runtime": 64.9042, | |
| "eval_samples_per_second": 3851.83, | |
| "eval_steps_per_second": 15.053, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.6257466295011093, | |
| "grad_norm": 1.699574589729309, | |
| "learning_rate": 0.000699889641048979, | |
| "loss": 1.3131, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.6826326867284829, | |
| "grad_norm": 1.6491554975509644, | |
| "learning_rate": 0.0006907878718925991, | |
| "loss": 1.2837, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.7395187439558564, | |
| "grad_norm": 1.8563138246536255, | |
| "learning_rate": 0.0006816861027362194, | |
| "loss": 1.276, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.79640480118323, | |
| "grad_norm": 1.5511844158172607, | |
| "learning_rate": 0.0006725843335798396, | |
| "loss": 1.2678, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.8532908584106036, | |
| "grad_norm": 1.3686333894729614, | |
| "learning_rate": 0.0006634825644234599, | |
| "loss": 1.2531, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.8532908584106036, | |
| "eval_accuracy": 0.694232, | |
| "eval_loss": 1.2132482528686523, | |
| "eval_runtime": 64.8716, | |
| "eval_samples_per_second": 3853.765, | |
| "eval_steps_per_second": 15.061, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.9101769156379771, | |
| "grad_norm": 1.958629846572876, | |
| "learning_rate": 0.00065438079526708, | |
| "loss": 1.2457, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.9670629728653507, | |
| "grad_norm": 1.528414011001587, | |
| "learning_rate": 0.0006452790261107003, | |
| "loss": 1.2338, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 1.0239490300927243, | |
| "grad_norm": 1.2693781852722168, | |
| "learning_rate": 0.0006361772569543205, | |
| "loss": 1.2142, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 1.0808350873200978, | |
| "grad_norm": 1.4573434591293335, | |
| "learning_rate": 0.0006270754877979408, | |
| "loss": 1.19, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 1.1377211445474713, | |
| "grad_norm": 1.236939787864685, | |
| "learning_rate": 0.0006179737186415609, | |
| "loss": 1.1875, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 1.1377211445474713, | |
| "eval_accuracy": 0.704068, | |
| "eval_loss": 1.1761754751205444, | |
| "eval_runtime": 65.8177, | |
| "eval_samples_per_second": 3798.369, | |
| "eval_steps_per_second": 14.844, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 1.194607201774845, | |
| "grad_norm": 1.241289496421814, | |
| "learning_rate": 0.0006088719494851812, | |
| "loss": 1.1814, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 1.2514932590022185, | |
| "grad_norm": 1.483782410621643, | |
| "learning_rate": 0.0005997701803288014, | |
| "loss": 1.1822, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 1.3083793162295922, | |
| "grad_norm": 1.5755152702331543, | |
| "learning_rate": 0.0005906684111724217, | |
| "loss": 1.1767, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 1.3652653734569657, | |
| "grad_norm": 1.333516001701355, | |
| "learning_rate": 0.0005815666420160419, | |
| "loss": 1.1731, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 1.4221514306843392, | |
| "grad_norm": 1.8660708665847778, | |
| "learning_rate": 0.0005724648728596621, | |
| "loss": 1.157, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 1.4221514306843392, | |
| "eval_accuracy": 0.711072, | |
| "eval_loss": 1.145967960357666, | |
| "eval_runtime": 63.5002, | |
| "eval_samples_per_second": 3936.992, | |
| "eval_steps_per_second": 15.386, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 1.4790374879117127, | |
| "grad_norm": 1.3808480501174927, | |
| "learning_rate": 0.0005633631037032824, | |
| "loss": 1.1574, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 1.5359235451390862, | |
| "grad_norm": 1.1691391468048096, | |
| "learning_rate": 0.0005542613345469026, | |
| "loss": 1.1554, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 1.59280960236646, | |
| "grad_norm": 1.4390947818756104, | |
| "learning_rate": 0.0005451595653905228, | |
| "loss": 1.1497, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 1.6496956595938337, | |
| "grad_norm": 1.3637901544570923, | |
| "learning_rate": 0.000536057796234143, | |
| "loss": 1.1452, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 1.7065817168212072, | |
| "grad_norm": 1.2076903581619263, | |
| "learning_rate": 0.0005269560270777633, | |
| "loss": 1.144, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 1.7065817168212072, | |
| "eval_accuracy": 0.716336, | |
| "eval_loss": 1.11836576461792, | |
| "eval_runtime": 64.1718, | |
| "eval_samples_per_second": 3895.791, | |
| "eval_steps_per_second": 15.225, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 1.7634677740485807, | |
| "grad_norm": 1.349098801612854, | |
| "learning_rate": 0.0005178542579213835, | |
| "loss": 1.1383, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 1.8203538312759542, | |
| "grad_norm": 1.4453612565994263, | |
| "learning_rate": 0.0005087524887650037, | |
| "loss": 1.1391, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 1.8772398885033277, | |
| "grad_norm": 1.0392345190048218, | |
| "learning_rate": 0.0004996507196086239, | |
| "loss": 1.1328, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 1.9341259457307014, | |
| "grad_norm": 1.1520024538040161, | |
| "learning_rate": 0.0004905489504522442, | |
| "loss": 1.1238, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 1.9910120029580751, | |
| "grad_norm": 1.515512228012085, | |
| "learning_rate": 0.0004814471812958644, | |
| "loss": 1.1217, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 1.9910120029580751, | |
| "eval_accuracy": 0.724676, | |
| "eval_loss": 1.0880111455917358, | |
| "eval_runtime": 64.3813, | |
| "eval_samples_per_second": 3883.115, | |
| "eval_steps_per_second": 15.175, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 2.0478980601854486, | |
| "grad_norm": 1.4771007299423218, | |
| "learning_rate": 0.00047234541213948464, | |
| "loss": 1.0919, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 2.104784117412822, | |
| "grad_norm": 1.3845994472503662, | |
| "learning_rate": 0.00046324364298310487, | |
| "loss": 1.0838, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 2.1616701746401956, | |
| "grad_norm": 1.250450611114502, | |
| "learning_rate": 0.00045414187382672515, | |
| "loss": 1.0785, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 2.218556231867569, | |
| "grad_norm": 1.5783060789108276, | |
| "learning_rate": 0.0004450401046703453, | |
| "loss": 1.0753, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 2.2754422890949426, | |
| "grad_norm": 1.7228904962539673, | |
| "learning_rate": 0.0004359383355139656, | |
| "loss": 1.0831, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 2.2754422890949426, | |
| "eval_accuracy": 0.727968, | |
| "eval_loss": 1.0728965997695923, | |
| "eval_runtime": 64.3156, | |
| "eval_samples_per_second": 3887.084, | |
| "eval_steps_per_second": 15.191, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 2.3323283463223166, | |
| "grad_norm": 1.333543062210083, | |
| "learning_rate": 0.00042683656635758577, | |
| "loss": 1.0798, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 2.38921440354969, | |
| "grad_norm": 1.3213781118392944, | |
| "learning_rate": 0.00041773479720120594, | |
| "loss": 1.0804, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 2.4461004607770636, | |
| "grad_norm": 1.43584406375885, | |
| "learning_rate": 0.0004086330280448262, | |
| "loss": 1.0713, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 2.502986518004437, | |
| "grad_norm": 1.2614803314208984, | |
| "learning_rate": 0.0003995312588884465, | |
| "loss": 1.0697, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 2.5598725752318106, | |
| "grad_norm": 1.1319971084594727, | |
| "learning_rate": 0.0003904294897320667, | |
| "loss": 1.0761, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 2.5598725752318106, | |
| "eval_accuracy": 0.731168, | |
| "eval_loss": 1.0593221187591553, | |
| "eval_runtime": 64.6765, | |
| "eval_samples_per_second": 3865.393, | |
| "eval_steps_per_second": 15.106, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 2.6167586324591845, | |
| "grad_norm": 1.2045773267745972, | |
| "learning_rate": 0.00038132772057568694, | |
| "loss": 1.0723, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 2.673644689686558, | |
| "grad_norm": 1.3462469577789307, | |
| "learning_rate": 0.00037222595141930717, | |
| "loss": 1.067, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 2.7305307469139315, | |
| "grad_norm": 1.3573272228240967, | |
| "learning_rate": 0.0003631241822629274, | |
| "loss": 1.0636, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 2.787416804141305, | |
| "grad_norm": 1.2870041131973267, | |
| "learning_rate": 0.0003540224131065476, | |
| "loss": 1.0655, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 2.8443028613686785, | |
| "grad_norm": 1.3287382125854492, | |
| "learning_rate": 0.0003449206439501678, | |
| "loss": 1.0565, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 2.8443028613686785, | |
| "eval_accuracy": 0.734552, | |
| "eval_loss": 1.0479968786239624, | |
| "eval_runtime": 65.2161, | |
| "eval_samples_per_second": 3833.412, | |
| "eval_steps_per_second": 14.981, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 2.901188918596052, | |
| "grad_norm": 1.384717345237732, | |
| "learning_rate": 0.000335818874793788, | |
| "loss": 1.0529, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 2.9580749758234255, | |
| "grad_norm": 1.1834776401519775, | |
| "learning_rate": 0.0003267171056374083, | |
| "loss": 1.0608, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 3.0149610330507994, | |
| "grad_norm": 1.0646686553955078, | |
| "learning_rate": 0.0003176153364810285, | |
| "loss": 1.0417, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 3.071847090278173, | |
| "grad_norm": 1.348777174949646, | |
| "learning_rate": 0.00030851356732464874, | |
| "loss": 1.0168, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 3.1287331475055464, | |
| "grad_norm": 1.2929068803787231, | |
| "learning_rate": 0.00029941179816826897, | |
| "loss": 1.0149, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 3.1287331475055464, | |
| "eval_accuracy": 0.73796, | |
| "eval_loss": 1.0355563163757324, | |
| "eval_runtime": 66.0157, | |
| "eval_samples_per_second": 3786.979, | |
| "eval_steps_per_second": 14.8, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 3.18561920473292, | |
| "grad_norm": 1.3426847457885742, | |
| "learning_rate": 0.0002903100290118892, | |
| "loss": 1.0145, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 3.2425052619602934, | |
| "grad_norm": 1.3112365007400513, | |
| "learning_rate": 0.0002812082598555094, | |
| "loss": 1.013, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 3.299391319187667, | |
| "grad_norm": 1.3956024646759033, | |
| "learning_rate": 0.00027210649069912964, | |
| "loss": 1.0117, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 3.356277376415041, | |
| "grad_norm": 1.2679752111434937, | |
| "learning_rate": 0.00026300472154274987, | |
| "loss": 1.0155, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 3.4131634336424144, | |
| "grad_norm": 1.5014774799346924, | |
| "learning_rate": 0.0002539029523863701, | |
| "loss": 1.0102, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 3.4131634336424144, | |
| "eval_accuracy": 0.74012, | |
| "eval_loss": 1.0263450145721436, | |
| "eval_runtime": 64.1919, | |
| "eval_samples_per_second": 3894.574, | |
| "eval_steps_per_second": 15.22, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 3.470049490869788, | |
| "grad_norm": 1.4669406414031982, | |
| "learning_rate": 0.0002448011832299904, | |
| "loss": 1.0145, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 3.5269355480971614, | |
| "grad_norm": 1.3615577220916748, | |
| "learning_rate": 0.00023569941407361057, | |
| "loss": 1.0173, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 3.583821605324535, | |
| "grad_norm": 1.126437783241272, | |
| "learning_rate": 0.00022659764491723082, | |
| "loss": 1.0125, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 3.6407076625519084, | |
| "grad_norm": 1.2467857599258423, | |
| "learning_rate": 0.00021749587576085105, | |
| "loss": 1.0133, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 3.697593719779282, | |
| "grad_norm": 1.3474713563919067, | |
| "learning_rate": 0.00020839410660447127, | |
| "loss": 1.0014, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 3.697593719779282, | |
| "eval_accuracy": 0.743688, | |
| "eval_loss": 1.0122489929199219, | |
| "eval_runtime": 64.6438, | |
| "eval_samples_per_second": 3867.347, | |
| "eval_steps_per_second": 15.114, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 3.754479777006656, | |
| "grad_norm": 1.3319435119628906, | |
| "learning_rate": 0.00019929233744809147, | |
| "loss": 1.0034, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 3.8113658342340293, | |
| "grad_norm": 1.9685286283493042, | |
| "learning_rate": 0.00019019056829171172, | |
| "loss": 0.995, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 3.868251891461403, | |
| "grad_norm": 1.2180532217025757, | |
| "learning_rate": 0.00018108879913533195, | |
| "loss": 1.0069, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 3.9251379486887763, | |
| "grad_norm": 1.3233805894851685, | |
| "learning_rate": 0.00017198702997895217, | |
| "loss": 0.9983, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 3.98202400591615, | |
| "grad_norm": 1.7491425275802612, | |
| "learning_rate": 0.0001628852608225724, | |
| "loss": 0.9972, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 3.98202400591615, | |
| "eval_accuracy": 0.745936, | |
| "eval_loss": 1.0027811527252197, | |
| "eval_runtime": 65.7257, | |
| "eval_samples_per_second": 3803.688, | |
| "eval_steps_per_second": 14.865, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 4.038910063143524, | |
| "grad_norm": 1.1467124223709106, | |
| "learning_rate": 0.00015378349166619262, | |
| "loss": 0.9752, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 4.095796120370897, | |
| "grad_norm": 1.2129188776016235, | |
| "learning_rate": 0.00014468172250981285, | |
| "loss": 0.9652, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 4.152682177598271, | |
| "grad_norm": 1.3177002668380737, | |
| "learning_rate": 0.00013557995335343307, | |
| "loss": 0.9615, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 4.209568234825644, | |
| "grad_norm": 1.1324489116668701, | |
| "learning_rate": 0.0001264781841970533, | |
| "loss": 0.9629, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 4.266454292053018, | |
| "grad_norm": 1.2428852319717407, | |
| "learning_rate": 0.00011737641504067354, | |
| "loss": 0.9556, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 4.266454292053018, | |
| "eval_accuracy": 0.747436, | |
| "eval_loss": 0.9971279501914978, | |
| "eval_runtime": 66.3789, | |
| "eval_samples_per_second": 3766.258, | |
| "eval_steps_per_second": 14.719, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 4.323340349280391, | |
| "grad_norm": 1.4413901567459106, | |
| "learning_rate": 0.00010827464588429376, | |
| "loss": 0.9616, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 4.380226406507765, | |
| "grad_norm": 1.312136173248291, | |
| "learning_rate": 9.917287672791399e-05, | |
| "loss": 0.9657, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 4.437112463735138, | |
| "grad_norm": 1.3660274744033813, | |
| "learning_rate": 9.007110757153423e-05, | |
| "loss": 0.9613, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 4.493998520962512, | |
| "grad_norm": 1.4278331995010376, | |
| "learning_rate": 8.096933841515445e-05, | |
| "loss": 0.9576, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 4.550884578189885, | |
| "grad_norm": 1.20628821849823, | |
| "learning_rate": 7.186756925877468e-05, | |
| "loss": 0.9606, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 4.550884578189885, | |
| "eval_accuracy": 0.749644, | |
| "eval_loss": 0.990385890007019, | |
| "eval_runtime": 65.0093, | |
| "eval_samples_per_second": 3845.605, | |
| "eval_steps_per_second": 15.029, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 4.607770635417259, | |
| "grad_norm": 1.8617701530456543, | |
| "learning_rate": 6.27658001023949e-05, | |
| "loss": 0.954, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 4.664656692644633, | |
| "grad_norm": 1.352597951889038, | |
| "learning_rate": 5.366403094601513e-05, | |
| "loss": 0.957, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 4.721542749872007, | |
| "grad_norm": 1.4314864873886108, | |
| "learning_rate": 4.4562261789635364e-05, | |
| "loss": 0.9541, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 4.77842880709938, | |
| "grad_norm": 1.2464176416397095, | |
| "learning_rate": 3.5460492633255596e-05, | |
| "loss": 0.9545, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 4.835314864326754, | |
| "grad_norm": 1.4721029996871948, | |
| "learning_rate": 2.6358723476875817e-05, | |
| "loss": 0.9544, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 4.835314864326754, | |
| "eval_accuracy": 0.750732, | |
| "eval_loss": 0.9842203259468079, | |
| "eval_runtime": 65.3657, | |
| "eval_samples_per_second": 3824.637, | |
| "eval_steps_per_second": 14.947, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 4.892200921554127, | |
| "grad_norm": 1.383285403251648, | |
| "learning_rate": 1.7256954320496046e-05, | |
| "loss": 0.9556, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 4.949086978781501, | |
| "grad_norm": 1.3051174879074097, | |
| "learning_rate": 8.155185164116276e-06, | |
| "loss": 0.9503, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "step": 87895, | |
| "total_flos": 5.4597447576e+17, | |
| "train_loss": 1.1272559640920097, | |
| "train_runtime": 10316.2205, | |
| "train_samples_per_second": 2181.031, | |
| "train_steps_per_second": 8.52 | |
| } | |
| ], | |
| "logging_steps": 1000, | |
| "max_steps": 87895, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 5000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5.4597447576e+17, | |
| "train_batch_size": 256, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |