{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 50, "global_step": 708, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0423728813559322, "grad_norm": 1.7643470764160156, "learning_rate": 4.936440677966102e-05, "loss": 1.7478, "step": 10 }, { "epoch": 0.0847457627118644, "grad_norm": 1.352023959159851, "learning_rate": 4.8658192090395484e-05, "loss": 1.2556, "step": 20 }, { "epoch": 0.1271186440677966, "grad_norm": 1.205060601234436, "learning_rate": 4.795197740112994e-05, "loss": 1.1919, "step": 30 }, { "epoch": 0.1694915254237288, "grad_norm": 1.019239902496338, "learning_rate": 4.724576271186441e-05, "loss": 1.061, "step": 40 }, { "epoch": 0.211864406779661, "grad_norm": 1.0925650596618652, "learning_rate": 4.6539548022598875e-05, "loss": 1.1092, "step": 50 }, { "epoch": 0.211864406779661, "eval_loss": 1.0438729524612427, "eval_runtime": 22.5506, "eval_samples_per_second": 95.385, "eval_steps_per_second": 1.508, "step": 50 }, { "epoch": 0.2542372881355932, "grad_norm": 0.9862654209136963, "learning_rate": 4.5833333333333334e-05, "loss": 1.0876, "step": 60 }, { "epoch": 0.2966101694915254, "grad_norm": 0.9759798645973206, "learning_rate": 4.51271186440678e-05, "loss": 1.0237, "step": 70 }, { "epoch": 0.3389830508474576, "grad_norm": 1.0690054893493652, "learning_rate": 4.442090395480226e-05, "loss": 1.0092, "step": 80 }, { "epoch": 0.3813559322033898, "grad_norm": 1.1084680557250977, "learning_rate": 4.3714689265536725e-05, "loss": 1.0229, "step": 90 }, { "epoch": 0.423728813559322, "grad_norm": 1.081882357597351, "learning_rate": 4.300847457627119e-05, "loss": 1.0233, "step": 100 }, { "epoch": 0.423728813559322, "eval_loss": 1.003989815711975, "eval_runtime": 22.3839, "eval_samples_per_second": 96.096, "eval_steps_per_second": 1.519, "step": 100 }, { "epoch": 0.4661016949152542, "grad_norm": 1.1931794881820679, "learning_rate": 4.230225988700565e-05, "loss": 0.9807, "step": 110 }, { "epoch": 0.5084745762711864, "grad_norm": 1.1798979043960571, "learning_rate": 4.1596045197740115e-05, "loss": 1.037, "step": 120 }, { "epoch": 0.5508474576271186, "grad_norm": 1.236763596534729, "learning_rate": 4.088983050847458e-05, "loss": 1.0088, "step": 130 }, { "epoch": 0.5932203389830508, "grad_norm": 1.031708836555481, "learning_rate": 4.018361581920904e-05, "loss": 0.9605, "step": 140 }, { "epoch": 0.635593220338983, "grad_norm": 1.0881413221359253, "learning_rate": 3.9477401129943506e-05, "loss": 0.9669, "step": 150 }, { "epoch": 0.635593220338983, "eval_loss": 0.9852223992347717, "eval_runtime": 22.645, "eval_samples_per_second": 94.988, "eval_steps_per_second": 1.501, "step": 150 }, { "epoch": 0.6779661016949152, "grad_norm": 1.0248509645462036, "learning_rate": 3.877118644067797e-05, "loss": 0.9584, "step": 160 }, { "epoch": 0.7203389830508474, "grad_norm": 1.1003658771514893, "learning_rate": 3.806497175141243e-05, "loss": 1.0191, "step": 170 }, { "epoch": 0.7627118644067796, "grad_norm": 0.9632079005241394, "learning_rate": 3.735875706214689e-05, "loss": 1.0048, "step": 180 }, { "epoch": 0.8050847457627118, "grad_norm": 1.2166016101837158, "learning_rate": 3.665254237288136e-05, "loss": 1.0035, "step": 190 }, { "epoch": 0.847457627118644, "grad_norm": 1.132070779800415, "learning_rate": 3.594632768361582e-05, "loss": 0.9939, "step": 200 }, { "epoch": 0.847457627118644, "eval_loss": 0.9768873453140259, "eval_runtime": 23.2377, "eval_samples_per_second": 92.565, "eval_steps_per_second": 1.463, "step": 200 }, { "epoch": 0.8898305084745762, "grad_norm": 1.0186951160430908, "learning_rate": 3.524011299435028e-05, "loss": 0.9749, "step": 210 }, { "epoch": 0.9322033898305084, "grad_norm": 1.2069514989852905, "learning_rate": 3.4533898305084746e-05, "loss": 0.9756, "step": 220 }, { "epoch": 0.9745762711864406, "grad_norm": 1.1548621654510498, "learning_rate": 3.382768361581921e-05, "loss": 0.9723, "step": 230 }, { "epoch": 1.0169491525423728, "grad_norm": 1.0338218212127686, "learning_rate": 3.312146892655367e-05, "loss": 0.9304, "step": 240 }, { "epoch": 1.0593220338983051, "grad_norm": 1.1850943565368652, "learning_rate": 3.241525423728814e-05, "loss": 0.9512, "step": 250 }, { "epoch": 1.0593220338983051, "eval_loss": 0.9682268500328064, "eval_runtime": 22.9025, "eval_samples_per_second": 93.92, "eval_steps_per_second": 1.485, "step": 250 }, { "epoch": 1.1016949152542372, "grad_norm": 1.0937676429748535, "learning_rate": 3.17090395480226e-05, "loss": 0.9241, "step": 260 }, { "epoch": 1.1440677966101696, "grad_norm": 1.296743631362915, "learning_rate": 3.100282485875706e-05, "loss": 0.9508, "step": 270 }, { "epoch": 1.1864406779661016, "grad_norm": 1.145988941192627, "learning_rate": 3.0296610169491528e-05, "loss": 0.9629, "step": 280 }, { "epoch": 1.228813559322034, "grad_norm": 1.2783371210098267, "learning_rate": 2.959039548022599e-05, "loss": 0.9621, "step": 290 }, { "epoch": 1.271186440677966, "grad_norm": 1.3129595518112183, "learning_rate": 2.8884180790960453e-05, "loss": 0.9667, "step": 300 }, { "epoch": 1.271186440677966, "eval_loss": 0.9666466116905212, "eval_runtime": 23.5102, "eval_samples_per_second": 91.492, "eval_steps_per_second": 1.446, "step": 300 }, { "epoch": 1.3135593220338984, "grad_norm": 1.2464239597320557, "learning_rate": 2.817796610169492e-05, "loss": 0.9359, "step": 310 }, { "epoch": 1.3559322033898304, "grad_norm": 1.3320660591125488, "learning_rate": 2.747175141242938e-05, "loss": 0.9232, "step": 320 }, { "epoch": 1.3983050847457628, "grad_norm": 1.3213363885879517, "learning_rate": 2.6765536723163843e-05, "loss": 0.9064, "step": 330 }, { "epoch": 1.4406779661016949, "grad_norm": 1.4874022006988525, "learning_rate": 2.605932203389831e-05, "loss": 0.9367, "step": 340 }, { "epoch": 1.4830508474576272, "grad_norm": 1.308205246925354, "learning_rate": 2.535310734463277e-05, "loss": 0.9058, "step": 350 }, { "epoch": 1.4830508474576272, "eval_loss": 0.964046061038971, "eval_runtime": 22.9381, "eval_samples_per_second": 93.774, "eval_steps_per_second": 1.482, "step": 350 }, { "epoch": 1.5254237288135593, "grad_norm": 1.3957818746566772, "learning_rate": 2.464689265536723e-05, "loss": 0.9232, "step": 360 }, { "epoch": 1.5677966101694916, "grad_norm": 1.3040454387664795, "learning_rate": 2.3940677966101697e-05, "loss": 0.9376, "step": 370 }, { "epoch": 1.6101694915254239, "grad_norm": 1.4122378826141357, "learning_rate": 2.323446327683616e-05, "loss": 0.9115, "step": 380 }, { "epoch": 1.652542372881356, "grad_norm": 1.3442506790161133, "learning_rate": 2.252824858757062e-05, "loss": 0.9599, "step": 390 }, { "epoch": 1.694915254237288, "grad_norm": 1.5409705638885498, "learning_rate": 2.1822033898305087e-05, "loss": 0.9258, "step": 400 }, { "epoch": 1.694915254237288, "eval_loss": 0.9589577913284302, "eval_runtime": 22.6608, "eval_samples_per_second": 94.922, "eval_steps_per_second": 1.5, "step": 400 }, { "epoch": 1.7372881355932204, "grad_norm": 1.315023422241211, "learning_rate": 2.111581920903955e-05, "loss": 0.9088, "step": 410 }, { "epoch": 1.7796610169491527, "grad_norm": 1.4532520771026611, "learning_rate": 2.0409604519774012e-05, "loss": 0.9351, "step": 420 }, { "epoch": 1.8220338983050848, "grad_norm": 1.3708842992782593, "learning_rate": 1.9703389830508475e-05, "loss": 0.95, "step": 430 }, { "epoch": 1.8644067796610169, "grad_norm": 1.3897417783737183, "learning_rate": 1.899717514124294e-05, "loss": 0.8943, "step": 440 }, { "epoch": 1.9067796610169492, "grad_norm": 1.2795928716659546, "learning_rate": 1.8290960451977403e-05, "loss": 0.8856, "step": 450 }, { "epoch": 1.9067796610169492, "eval_loss": 0.9577222466468811, "eval_runtime": 23.1528, "eval_samples_per_second": 92.905, "eval_steps_per_second": 1.469, "step": 450 }, { "epoch": 1.9491525423728815, "grad_norm": 1.2920231819152832, "learning_rate": 1.7584745762711865e-05, "loss": 0.9415, "step": 460 }, { "epoch": 1.9915254237288136, "grad_norm": 1.3519489765167236, "learning_rate": 1.687853107344633e-05, "loss": 0.9708, "step": 470 }, { "epoch": 2.0338983050847457, "grad_norm": 1.3836076259613037, "learning_rate": 1.617231638418079e-05, "loss": 0.8801, "step": 480 }, { "epoch": 2.0762711864406778, "grad_norm": 1.478256344795227, "learning_rate": 1.5466101694915256e-05, "loss": 0.8908, "step": 490 }, { "epoch": 2.1186440677966103, "grad_norm": 1.571032166481018, "learning_rate": 1.475988700564972e-05, "loss": 0.9013, "step": 500 }, { "epoch": 2.1186440677966103, "eval_loss": 0.9608597755432129, "eval_runtime": 22.5872, "eval_samples_per_second": 95.231, "eval_steps_per_second": 1.505, "step": 500 }, { "epoch": 2.1610169491525424, "grad_norm": 1.466437578201294, "learning_rate": 1.4053672316384181e-05, "loss": 0.8926, "step": 510 }, { "epoch": 2.2033898305084745, "grad_norm": 1.6073771715164185, "learning_rate": 1.3347457627118645e-05, "loss": 0.8717, "step": 520 }, { "epoch": 2.2457627118644066, "grad_norm": 1.5890705585479736, "learning_rate": 1.2641242937853107e-05, "loss": 0.8633, "step": 530 }, { "epoch": 2.288135593220339, "grad_norm": 1.6749075651168823, "learning_rate": 1.1935028248587572e-05, "loss": 0.8941, "step": 540 }, { "epoch": 2.330508474576271, "grad_norm": 1.6727330684661865, "learning_rate": 1.1228813559322036e-05, "loss": 0.8929, "step": 550 }, { "epoch": 2.330508474576271, "eval_loss": 0.9595548510551453, "eval_runtime": 22.9206, "eval_samples_per_second": 93.846, "eval_steps_per_second": 1.483, "step": 550 }, { "epoch": 2.3728813559322033, "grad_norm": 1.5609760284423828, "learning_rate": 1.0522598870056498e-05, "loss": 0.8298, "step": 560 }, { "epoch": 2.415254237288136, "grad_norm": 1.7489114999771118, "learning_rate": 9.81638418079096e-06, "loss": 0.8618, "step": 570 }, { "epoch": 2.457627118644068, "grad_norm": 1.7662324905395508, "learning_rate": 9.110169491525423e-06, "loss": 0.8932, "step": 580 }, { "epoch": 2.5, "grad_norm": 1.9018930196762085, "learning_rate": 8.403954802259887e-06, "loss": 0.8743, "step": 590 }, { "epoch": 2.542372881355932, "grad_norm": 1.6308236122131348, "learning_rate": 7.697740112994351e-06, "loss": 0.8752, "step": 600 }, { "epoch": 2.542372881355932, "eval_loss": 0.9617831707000732, "eval_runtime": 23.2374, "eval_samples_per_second": 92.566, "eval_steps_per_second": 1.463, "step": 600 }, { "epoch": 2.584745762711864, "grad_norm": 1.5900993347167969, "learning_rate": 6.991525423728814e-06, "loss": 0.8653, "step": 610 }, { "epoch": 2.6271186440677967, "grad_norm": 1.8414711952209473, "learning_rate": 6.285310734463278e-06, "loss": 0.8877, "step": 620 }, { "epoch": 2.669491525423729, "grad_norm": 1.7117115259170532, "learning_rate": 5.57909604519774e-06, "loss": 0.8691, "step": 630 }, { "epoch": 2.711864406779661, "grad_norm": 1.690845012664795, "learning_rate": 4.872881355932204e-06, "loss": 0.8728, "step": 640 }, { "epoch": 2.7542372881355934, "grad_norm": 1.727298378944397, "learning_rate": 4.166666666666667e-06, "loss": 0.9005, "step": 650 }, { "epoch": 2.7542372881355934, "eval_loss": 0.959761381149292, "eval_runtime": 22.8543, "eval_samples_per_second": 94.118, "eval_steps_per_second": 1.488, "step": 650 }, { "epoch": 2.7966101694915255, "grad_norm": 1.6035046577453613, "learning_rate": 3.46045197740113e-06, "loss": 0.9181, "step": 660 }, { "epoch": 2.8389830508474576, "grad_norm": 1.7235994338989258, "learning_rate": 2.7542372881355934e-06, "loss": 0.9003, "step": 670 }, { "epoch": 2.8813559322033897, "grad_norm": 1.7914451360702515, "learning_rate": 2.0480225988700563e-06, "loss": 0.951, "step": 680 }, { "epoch": 2.923728813559322, "grad_norm": 1.8638156652450562, "learning_rate": 1.3418079096045198e-06, "loss": 0.8739, "step": 690 }, { "epoch": 2.9661016949152543, "grad_norm": 1.6826245784759521, "learning_rate": 6.355932203389831e-07, "loss": 0.8375, "step": 700 }, { "epoch": 2.9661016949152543, "eval_loss": 0.9599277973175049, "eval_runtime": 22.8971, "eval_samples_per_second": 93.942, "eval_steps_per_second": 1.485, "step": 700 } ], "logging_steps": 10, "max_steps": 708, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.8604011452104704e+16, "train_batch_size": 32, "trial_name": null, "trial_params": null }