{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 5000, "global_step": 87895, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05688605722737357, "grad_norm": 2.3711910247802734, "learning_rate": 0.0007909073326127766, "loss": 2.6366, "step": 1000 }, { "epoch": 0.11377211445474714, "grad_norm": 2.2273147106170654, "learning_rate": 0.0007818055634563969, "loss": 1.7361, "step": 2000 }, { "epoch": 0.17065817168212072, "grad_norm": 2.9114110469818115, "learning_rate": 0.000772703794300017, "loss": 1.5903, "step": 3000 }, { "epoch": 0.22754422890949427, "grad_norm": 1.7726603746414185, "learning_rate": 0.0007636020251436373, "loss": 1.5127, "step": 4000 }, { "epoch": 0.2844302861368678, "grad_norm": 1.8174991607666016, "learning_rate": 0.0007545002559872575, "loss": 1.4609, "step": 5000 }, { "epoch": 0.2844302861368678, "eval_accuracy": 0.653196, "eval_loss": 1.3989018201828003, "eval_runtime": 65.7885, "eval_samples_per_second": 3800.055, "eval_steps_per_second": 14.851, "step": 5000 }, { "epoch": 0.34131634336424144, "grad_norm": 1.7002882957458496, "learning_rate": 0.0007453984868308778, "loss": 1.4214, "step": 6000 }, { "epoch": 0.398202400591615, "grad_norm": 1.6060094833374023, "learning_rate": 0.0007362967176744981, "loss": 1.3803, "step": 7000 }, { "epoch": 0.45508845781898855, "grad_norm": 2.100240468978882, "learning_rate": 0.0007271949485181182, "loss": 1.358, "step": 8000 }, { "epoch": 0.5119745150463622, "grad_norm": 1.507076621055603, "learning_rate": 0.0007180931793617385, "loss": 1.3392, "step": 9000 }, { "epoch": 0.5688605722737357, "grad_norm": 1.8028790950775146, "learning_rate": 0.0007089914102053587, "loss": 1.3211, "step": 10000 }, { "epoch": 0.5688605722737357, "eval_accuracy": 0.680348, "eval_loss": 1.2739007472991943, "eval_runtime": 64.9042, "eval_samples_per_second": 3851.83, "eval_steps_per_second": 15.053, "step": 10000 }, { "epoch": 0.6257466295011093, "grad_norm": 1.699574589729309, "learning_rate": 0.000699889641048979, "loss": 1.3131, "step": 11000 }, { "epoch": 0.6826326867284829, "grad_norm": 1.6491554975509644, "learning_rate": 0.0006907878718925991, "loss": 1.2837, "step": 12000 }, { "epoch": 0.7395187439558564, "grad_norm": 1.8563138246536255, "learning_rate": 0.0006816861027362194, "loss": 1.276, "step": 13000 }, { "epoch": 0.79640480118323, "grad_norm": 1.5511844158172607, "learning_rate": 0.0006725843335798396, "loss": 1.2678, "step": 14000 }, { "epoch": 0.8532908584106036, "grad_norm": 1.3686333894729614, "learning_rate": 0.0006634825644234599, "loss": 1.2531, "step": 15000 }, { "epoch": 0.8532908584106036, "eval_accuracy": 0.694232, "eval_loss": 1.2132482528686523, "eval_runtime": 64.8716, "eval_samples_per_second": 3853.765, "eval_steps_per_second": 15.061, "step": 15000 }, { "epoch": 0.9101769156379771, "grad_norm": 1.958629846572876, "learning_rate": 0.00065438079526708, "loss": 1.2457, "step": 16000 }, { "epoch": 0.9670629728653507, "grad_norm": 1.528414011001587, "learning_rate": 0.0006452790261107003, "loss": 1.2338, "step": 17000 }, { "epoch": 1.0239490300927243, "grad_norm": 1.2693781852722168, "learning_rate": 0.0006361772569543205, "loss": 1.2142, "step": 18000 }, { "epoch": 1.0808350873200978, "grad_norm": 1.4573434591293335, "learning_rate": 0.0006270754877979408, "loss": 1.19, "step": 19000 }, { "epoch": 1.1377211445474713, "grad_norm": 1.236939787864685, "learning_rate": 0.0006179737186415609, "loss": 1.1875, "step": 20000 }, { "epoch": 1.1377211445474713, "eval_accuracy": 0.704068, "eval_loss": 1.1761754751205444, "eval_runtime": 65.8177, "eval_samples_per_second": 3798.369, "eval_steps_per_second": 14.844, "step": 20000 }, { "epoch": 1.194607201774845, "grad_norm": 1.241289496421814, "learning_rate": 0.0006088719494851812, "loss": 1.1814, "step": 21000 }, { "epoch": 1.2514932590022185, "grad_norm": 1.483782410621643, "learning_rate": 0.0005997701803288014, "loss": 1.1822, "step": 22000 }, { "epoch": 1.3083793162295922, "grad_norm": 1.5755152702331543, "learning_rate": 0.0005906684111724217, "loss": 1.1767, "step": 23000 }, { "epoch": 1.3652653734569657, "grad_norm": 1.333516001701355, "learning_rate": 0.0005815666420160419, "loss": 1.1731, "step": 24000 }, { "epoch": 1.4221514306843392, "grad_norm": 1.8660708665847778, "learning_rate": 0.0005724648728596621, "loss": 1.157, "step": 25000 }, { "epoch": 1.4221514306843392, "eval_accuracy": 0.711072, "eval_loss": 1.145967960357666, "eval_runtime": 63.5002, "eval_samples_per_second": 3936.992, "eval_steps_per_second": 15.386, "step": 25000 }, { "epoch": 1.4790374879117127, "grad_norm": 1.3808480501174927, "learning_rate": 0.0005633631037032824, "loss": 1.1574, "step": 26000 }, { "epoch": 1.5359235451390862, "grad_norm": 1.1691391468048096, "learning_rate": 0.0005542613345469026, "loss": 1.1554, "step": 27000 }, { "epoch": 1.59280960236646, "grad_norm": 1.4390947818756104, "learning_rate": 0.0005451595653905228, "loss": 1.1497, "step": 28000 }, { "epoch": 1.6496956595938337, "grad_norm": 1.3637901544570923, "learning_rate": 0.000536057796234143, "loss": 1.1452, "step": 29000 }, { "epoch": 1.7065817168212072, "grad_norm": 1.2076903581619263, "learning_rate": 0.0005269560270777633, "loss": 1.144, "step": 30000 }, { "epoch": 1.7065817168212072, "eval_accuracy": 0.716336, "eval_loss": 1.11836576461792, "eval_runtime": 64.1718, "eval_samples_per_second": 3895.791, "eval_steps_per_second": 15.225, "step": 30000 }, { "epoch": 1.7634677740485807, "grad_norm": 1.349098801612854, "learning_rate": 0.0005178542579213835, "loss": 1.1383, "step": 31000 }, { "epoch": 1.8203538312759542, "grad_norm": 1.4453612565994263, "learning_rate": 0.0005087524887650037, "loss": 1.1391, "step": 32000 }, { "epoch": 1.8772398885033277, "grad_norm": 1.0392345190048218, "learning_rate": 0.0004996507196086239, "loss": 1.1328, "step": 33000 }, { "epoch": 1.9341259457307014, "grad_norm": 1.1520024538040161, "learning_rate": 0.0004905489504522442, "loss": 1.1238, "step": 34000 }, { "epoch": 1.9910120029580751, "grad_norm": 1.515512228012085, "learning_rate": 0.0004814471812958644, "loss": 1.1217, "step": 35000 }, { "epoch": 1.9910120029580751, "eval_accuracy": 0.724676, "eval_loss": 1.0880111455917358, "eval_runtime": 64.3813, "eval_samples_per_second": 3883.115, "eval_steps_per_second": 15.175, "step": 35000 }, { "epoch": 2.0478980601854486, "grad_norm": 1.4771007299423218, "learning_rate": 0.00047234541213948464, "loss": 1.0919, "step": 36000 }, { "epoch": 2.104784117412822, "grad_norm": 1.3845994472503662, "learning_rate": 0.00046324364298310487, "loss": 1.0838, "step": 37000 }, { "epoch": 2.1616701746401956, "grad_norm": 1.250450611114502, "learning_rate": 0.00045414187382672515, "loss": 1.0785, "step": 38000 }, { "epoch": 2.218556231867569, "grad_norm": 1.5783060789108276, "learning_rate": 0.0004450401046703453, "loss": 1.0753, "step": 39000 }, { "epoch": 2.2754422890949426, "grad_norm": 1.7228904962539673, "learning_rate": 0.0004359383355139656, "loss": 1.0831, "step": 40000 }, { "epoch": 2.2754422890949426, "eval_accuracy": 0.727968, "eval_loss": 1.0728965997695923, "eval_runtime": 64.3156, "eval_samples_per_second": 3887.084, "eval_steps_per_second": 15.191, "step": 40000 }, { "epoch": 2.3323283463223166, "grad_norm": 1.333543062210083, "learning_rate": 0.00042683656635758577, "loss": 1.0798, "step": 41000 }, { "epoch": 2.38921440354969, "grad_norm": 1.3213781118392944, "learning_rate": 0.00041773479720120594, "loss": 1.0804, "step": 42000 }, { "epoch": 2.4461004607770636, "grad_norm": 1.43584406375885, "learning_rate": 0.0004086330280448262, "loss": 1.0713, "step": 43000 }, { "epoch": 2.502986518004437, "grad_norm": 1.2614803314208984, "learning_rate": 0.0003995312588884465, "loss": 1.0697, "step": 44000 }, { "epoch": 2.5598725752318106, "grad_norm": 1.1319971084594727, "learning_rate": 0.0003904294897320667, "loss": 1.0761, "step": 45000 }, { "epoch": 2.5598725752318106, "eval_accuracy": 0.731168, "eval_loss": 1.0593221187591553, "eval_runtime": 64.6765, "eval_samples_per_second": 3865.393, "eval_steps_per_second": 15.106, "step": 45000 }, { "epoch": 2.6167586324591845, "grad_norm": 1.2045773267745972, "learning_rate": 0.00038132772057568694, "loss": 1.0723, "step": 46000 }, { "epoch": 2.673644689686558, "grad_norm": 1.3462469577789307, "learning_rate": 0.00037222595141930717, "loss": 1.067, "step": 47000 }, { "epoch": 2.7305307469139315, "grad_norm": 1.3573272228240967, "learning_rate": 0.0003631241822629274, "loss": 1.0636, "step": 48000 }, { "epoch": 2.787416804141305, "grad_norm": 1.2870041131973267, "learning_rate": 0.0003540224131065476, "loss": 1.0655, "step": 49000 }, { "epoch": 2.8443028613686785, "grad_norm": 1.3287382125854492, "learning_rate": 0.0003449206439501678, "loss": 1.0565, "step": 50000 }, { "epoch": 2.8443028613686785, "eval_accuracy": 0.734552, "eval_loss": 1.0479968786239624, "eval_runtime": 65.2161, "eval_samples_per_second": 3833.412, "eval_steps_per_second": 14.981, "step": 50000 }, { "epoch": 2.901188918596052, "grad_norm": 1.384717345237732, "learning_rate": 0.000335818874793788, "loss": 1.0529, "step": 51000 }, { "epoch": 2.9580749758234255, "grad_norm": 1.1834776401519775, "learning_rate": 0.0003267171056374083, "loss": 1.0608, "step": 52000 }, { "epoch": 3.0149610330507994, "grad_norm": 1.0646686553955078, "learning_rate": 0.0003176153364810285, "loss": 1.0417, "step": 53000 }, { "epoch": 3.071847090278173, "grad_norm": 1.348777174949646, "learning_rate": 0.00030851356732464874, "loss": 1.0168, "step": 54000 }, { "epoch": 3.1287331475055464, "grad_norm": 1.2929068803787231, "learning_rate": 0.00029941179816826897, "loss": 1.0149, "step": 55000 }, { "epoch": 3.1287331475055464, "eval_accuracy": 0.73796, "eval_loss": 1.0355563163757324, "eval_runtime": 66.0157, "eval_samples_per_second": 3786.979, "eval_steps_per_second": 14.8, "step": 55000 }, { "epoch": 3.18561920473292, "grad_norm": 1.3426847457885742, "learning_rate": 0.0002903100290118892, "loss": 1.0145, "step": 56000 }, { "epoch": 3.2425052619602934, "grad_norm": 1.3112365007400513, "learning_rate": 0.0002812082598555094, "loss": 1.013, "step": 57000 }, { "epoch": 3.299391319187667, "grad_norm": 1.3956024646759033, "learning_rate": 0.00027210649069912964, "loss": 1.0117, "step": 58000 }, { "epoch": 3.356277376415041, "grad_norm": 1.2679752111434937, "learning_rate": 0.00026300472154274987, "loss": 1.0155, "step": 59000 }, { "epoch": 3.4131634336424144, "grad_norm": 1.5014774799346924, "learning_rate": 0.0002539029523863701, "loss": 1.0102, "step": 60000 }, { "epoch": 3.4131634336424144, "eval_accuracy": 0.74012, "eval_loss": 1.0263450145721436, "eval_runtime": 64.1919, "eval_samples_per_second": 3894.574, "eval_steps_per_second": 15.22, "step": 60000 }, { "epoch": 3.470049490869788, "grad_norm": 1.4669406414031982, "learning_rate": 0.0002448011832299904, "loss": 1.0145, "step": 61000 }, { "epoch": 3.5269355480971614, "grad_norm": 1.3615577220916748, "learning_rate": 0.00023569941407361057, "loss": 1.0173, "step": 62000 }, { "epoch": 3.583821605324535, "grad_norm": 1.126437783241272, "learning_rate": 0.00022659764491723082, "loss": 1.0125, "step": 63000 }, { "epoch": 3.6407076625519084, "grad_norm": 1.2467857599258423, "learning_rate": 0.00021749587576085105, "loss": 1.0133, "step": 64000 }, { "epoch": 3.697593719779282, "grad_norm": 1.3474713563919067, "learning_rate": 0.00020839410660447127, "loss": 1.0014, "step": 65000 }, { "epoch": 3.697593719779282, "eval_accuracy": 0.743688, "eval_loss": 1.0122489929199219, "eval_runtime": 64.6438, "eval_samples_per_second": 3867.347, "eval_steps_per_second": 15.114, "step": 65000 }, { "epoch": 3.754479777006656, "grad_norm": 1.3319435119628906, "learning_rate": 0.00019929233744809147, "loss": 1.0034, "step": 66000 }, { "epoch": 3.8113658342340293, "grad_norm": 1.9685286283493042, "learning_rate": 0.00019019056829171172, "loss": 0.995, "step": 67000 }, { "epoch": 3.868251891461403, "grad_norm": 1.2180532217025757, "learning_rate": 0.00018108879913533195, "loss": 1.0069, "step": 68000 }, { "epoch": 3.9251379486887763, "grad_norm": 1.3233805894851685, "learning_rate": 0.00017198702997895217, "loss": 0.9983, "step": 69000 }, { "epoch": 3.98202400591615, "grad_norm": 1.7491425275802612, "learning_rate": 0.0001628852608225724, "loss": 0.9972, "step": 70000 }, { "epoch": 3.98202400591615, "eval_accuracy": 0.745936, "eval_loss": 1.0027811527252197, "eval_runtime": 65.7257, "eval_samples_per_second": 3803.688, "eval_steps_per_second": 14.865, "step": 70000 }, { "epoch": 4.038910063143524, "grad_norm": 1.1467124223709106, "learning_rate": 0.00015378349166619262, "loss": 0.9752, "step": 71000 }, { "epoch": 4.095796120370897, "grad_norm": 1.2129188776016235, "learning_rate": 0.00014468172250981285, "loss": 0.9652, "step": 72000 }, { "epoch": 4.152682177598271, "grad_norm": 1.3177002668380737, "learning_rate": 0.00013557995335343307, "loss": 0.9615, "step": 73000 }, { "epoch": 4.209568234825644, "grad_norm": 1.1324489116668701, "learning_rate": 0.0001264781841970533, "loss": 0.9629, "step": 74000 }, { "epoch": 4.266454292053018, "grad_norm": 1.2428852319717407, "learning_rate": 0.00011737641504067354, "loss": 0.9556, "step": 75000 }, { "epoch": 4.266454292053018, "eval_accuracy": 0.747436, "eval_loss": 0.9971279501914978, "eval_runtime": 66.3789, "eval_samples_per_second": 3766.258, "eval_steps_per_second": 14.719, "step": 75000 }, { "epoch": 4.323340349280391, "grad_norm": 1.4413901567459106, "learning_rate": 0.00010827464588429376, "loss": 0.9616, "step": 76000 }, { "epoch": 4.380226406507765, "grad_norm": 1.312136173248291, "learning_rate": 9.917287672791399e-05, "loss": 0.9657, "step": 77000 }, { "epoch": 4.437112463735138, "grad_norm": 1.3660274744033813, "learning_rate": 9.007110757153423e-05, "loss": 0.9613, "step": 78000 }, { "epoch": 4.493998520962512, "grad_norm": 1.4278331995010376, "learning_rate": 8.096933841515445e-05, "loss": 0.9576, "step": 79000 }, { "epoch": 4.550884578189885, "grad_norm": 1.20628821849823, "learning_rate": 7.186756925877468e-05, "loss": 0.9606, "step": 80000 }, { "epoch": 4.550884578189885, "eval_accuracy": 0.749644, "eval_loss": 0.990385890007019, "eval_runtime": 65.0093, "eval_samples_per_second": 3845.605, "eval_steps_per_second": 15.029, "step": 80000 }, { "epoch": 4.607770635417259, "grad_norm": 1.8617701530456543, "learning_rate": 6.27658001023949e-05, "loss": 0.954, "step": 81000 }, { "epoch": 4.664656692644633, "grad_norm": 1.352597951889038, "learning_rate": 5.366403094601513e-05, "loss": 0.957, "step": 82000 }, { "epoch": 4.721542749872007, "grad_norm": 1.4314864873886108, "learning_rate": 4.4562261789635364e-05, "loss": 0.9541, "step": 83000 }, { "epoch": 4.77842880709938, "grad_norm": 1.2464176416397095, "learning_rate": 3.5460492633255596e-05, "loss": 0.9545, "step": 84000 }, { "epoch": 4.835314864326754, "grad_norm": 1.4721029996871948, "learning_rate": 2.6358723476875817e-05, "loss": 0.9544, "step": 85000 }, { "epoch": 4.835314864326754, "eval_accuracy": 0.750732, "eval_loss": 0.9842203259468079, "eval_runtime": 65.3657, "eval_samples_per_second": 3824.637, "eval_steps_per_second": 14.947, "step": 85000 }, { "epoch": 4.892200921554127, "grad_norm": 1.383285403251648, "learning_rate": 1.7256954320496046e-05, "loss": 0.9556, "step": 86000 }, { "epoch": 4.949086978781501, "grad_norm": 1.3051174879074097, "learning_rate": 8.155185164116276e-06, "loss": 0.9503, "step": 87000 }, { "epoch": 5.0, "step": 87895, "total_flos": 5.4597447576e+17, "train_loss": 1.1272559640920097, "train_runtime": 10316.2205, "train_samples_per_second": 2181.031, "train_steps_per_second": 8.52 } ], "logging_steps": 1000, "max_steps": 87895, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.4597447576e+17, "train_batch_size": 256, "trial_name": null, "trial_params": null }