{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 50,
  "global_step": 708,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0423728813559322,
      "grad_norm": 1.7643470764160156,
      "learning_rate": 4.936440677966102e-05,
      "loss": 1.7478,
      "step": 10
    },
    {
      "epoch": 0.0847457627118644,
      "grad_norm": 1.352023959159851,
      "learning_rate": 4.8658192090395484e-05,
      "loss": 1.2556,
      "step": 20
    },
    {
      "epoch": 0.1271186440677966,
      "grad_norm": 1.205060601234436,
      "learning_rate": 4.795197740112994e-05,
      "loss": 1.1919,
      "step": 30
    },
    {
      "epoch": 0.1694915254237288,
      "grad_norm": 1.019239902496338,
      "learning_rate": 4.724576271186441e-05,
      "loss": 1.061,
      "step": 40
    },
    {
      "epoch": 0.211864406779661,
      "grad_norm": 1.0925650596618652,
      "learning_rate": 4.6539548022598875e-05,
      "loss": 1.1092,
      "step": 50
    },
    {
      "epoch": 0.211864406779661,
      "eval_loss": 1.0438729524612427,
      "eval_runtime": 22.5506,
      "eval_samples_per_second": 95.385,
      "eval_steps_per_second": 1.508,
      "step": 50
    },
    {
      "epoch": 0.2542372881355932,
      "grad_norm": 0.9862654209136963,
      "learning_rate": 4.5833333333333334e-05,
      "loss": 1.0876,
      "step": 60
    },
    {
      "epoch": 0.2966101694915254,
      "grad_norm": 0.9759798645973206,
      "learning_rate": 4.51271186440678e-05,
      "loss": 1.0237,
      "step": 70
    },
    {
      "epoch": 0.3389830508474576,
      "grad_norm": 1.0690054893493652,
      "learning_rate": 4.442090395480226e-05,
      "loss": 1.0092,
      "step": 80
    },
    {
      "epoch": 0.3813559322033898,
      "grad_norm": 1.1084680557250977,
      "learning_rate": 4.3714689265536725e-05,
      "loss": 1.0229,
      "step": 90
    },
    {
      "epoch": 0.423728813559322,
      "grad_norm": 1.081882357597351,
      "learning_rate": 4.300847457627119e-05,
      "loss": 1.0233,
      "step": 100
    },
    {
      "epoch": 0.423728813559322,
      "eval_loss": 1.003989815711975,
      "eval_runtime": 22.3839,
      "eval_samples_per_second": 96.096,
      "eval_steps_per_second": 1.519,
      "step": 100
    },
    {
      "epoch": 0.4661016949152542,
      "grad_norm": 1.1931794881820679,
      "learning_rate": 4.230225988700565e-05,
      "loss": 0.9807,
      "step": 110
    },
    {
      "epoch": 0.5084745762711864,
      "grad_norm": 1.1798979043960571,
      "learning_rate": 4.1596045197740115e-05,
      "loss": 1.037,
      "step": 120
    },
    {
      "epoch": 0.5508474576271186,
      "grad_norm": 1.236763596534729,
      "learning_rate": 4.088983050847458e-05,
      "loss": 1.0088,
      "step": 130
    },
    {
      "epoch": 0.5932203389830508,
      "grad_norm": 1.031708836555481,
      "learning_rate": 4.018361581920904e-05,
      "loss": 0.9605,
      "step": 140
    },
    {
      "epoch": 0.635593220338983,
      "grad_norm": 1.0881413221359253,
      "learning_rate": 3.9477401129943506e-05,
      "loss": 0.9669,
      "step": 150
    },
    {
      "epoch": 0.635593220338983,
      "eval_loss": 0.9852223992347717,
      "eval_runtime": 22.645,
      "eval_samples_per_second": 94.988,
      "eval_steps_per_second": 1.501,
      "step": 150
    },
    {
      "epoch": 0.6779661016949152,
      "grad_norm": 1.0248509645462036,
      "learning_rate": 3.877118644067797e-05,
      "loss": 0.9584,
      "step": 160
    },
    {
      "epoch": 0.7203389830508474,
      "grad_norm": 1.1003658771514893,
      "learning_rate": 3.806497175141243e-05,
      "loss": 1.0191,
      "step": 170
    },
    {
      "epoch": 0.7627118644067796,
      "grad_norm": 0.9632079005241394,
      "learning_rate": 3.735875706214689e-05,
      "loss": 1.0048,
      "step": 180
    },
    {
      "epoch": 0.8050847457627118,
      "grad_norm": 1.2166016101837158,
      "learning_rate": 3.665254237288136e-05,
      "loss": 1.0035,
      "step": 190
    },
    {
      "epoch": 0.847457627118644,
      "grad_norm": 1.132070779800415,
      "learning_rate": 3.594632768361582e-05,
      "loss": 0.9939,
      "step": 200
    },
    {
      "epoch": 0.847457627118644,
      "eval_loss": 0.9768873453140259,
      "eval_runtime": 23.2377,
      "eval_samples_per_second": 92.565,
      "eval_steps_per_second": 1.463,
      "step": 200
    },
    {
      "epoch": 0.8898305084745762,
      "grad_norm": 1.0186951160430908,
      "learning_rate": 3.524011299435028e-05,
      "loss": 0.9749,
      "step": 210
    },
    {
      "epoch": 0.9322033898305084,
      "grad_norm": 1.2069514989852905,
      "learning_rate": 3.4533898305084746e-05,
      "loss": 0.9756,
      "step": 220
    },
    {
      "epoch": 0.9745762711864406,
      "grad_norm": 1.1548621654510498,
      "learning_rate": 3.382768361581921e-05,
      "loss": 0.9723,
      "step": 230
    },
    {
      "epoch": 1.0169491525423728,
      "grad_norm": 1.0338218212127686,
      "learning_rate": 3.312146892655367e-05,
      "loss": 0.9304,
      "step": 240
    },
    {
      "epoch": 1.0593220338983051,
      "grad_norm": 1.1850943565368652,
      "learning_rate": 3.241525423728814e-05,
      "loss": 0.9512,
      "step": 250
    },
    {
      "epoch": 1.0593220338983051,
      "eval_loss": 0.9682268500328064,
      "eval_runtime": 22.9025,
      "eval_samples_per_second": 93.92,
      "eval_steps_per_second": 1.485,
      "step": 250
    },
    {
      "epoch": 1.1016949152542372,
      "grad_norm": 1.0937676429748535,
      "learning_rate": 3.17090395480226e-05,
      "loss": 0.9241,
      "step": 260
    },
    {
      "epoch": 1.1440677966101696,
      "grad_norm": 1.296743631362915,
      "learning_rate": 3.100282485875706e-05,
      "loss": 0.9508,
      "step": 270
    },
    {
      "epoch": 1.1864406779661016,
      "grad_norm": 1.145988941192627,
      "learning_rate": 3.0296610169491528e-05,
      "loss": 0.9629,
      "step": 280
    },
    {
      "epoch": 1.228813559322034,
      "grad_norm": 1.2783371210098267,
      "learning_rate": 2.959039548022599e-05,
      "loss": 0.9621,
      "step": 290
    },
    {
      "epoch": 1.271186440677966,
      "grad_norm": 1.3129595518112183,
      "learning_rate": 2.8884180790960453e-05,
      "loss": 0.9667,
      "step": 300
    },
    {
      "epoch": 1.271186440677966,
      "eval_loss": 0.9666466116905212,
      "eval_runtime": 23.5102,
      "eval_samples_per_second": 91.492,
      "eval_steps_per_second": 1.446,
      "step": 300
    },
    {
      "epoch": 1.3135593220338984,
      "grad_norm": 1.2464239597320557,
      "learning_rate": 2.817796610169492e-05,
      "loss": 0.9359,
      "step": 310
    },
    {
      "epoch": 1.3559322033898304,
      "grad_norm": 1.3320660591125488,
      "learning_rate": 2.747175141242938e-05,
      "loss": 0.9232,
      "step": 320
    },
    {
      "epoch": 1.3983050847457628,
      "grad_norm": 1.3213363885879517,
      "learning_rate": 2.6765536723163843e-05,
      "loss": 0.9064,
      "step": 330
    },
    {
      "epoch": 1.4406779661016949,
      "grad_norm": 1.4874022006988525,
      "learning_rate": 2.605932203389831e-05,
      "loss": 0.9367,
      "step": 340
    },
    {
      "epoch": 1.4830508474576272,
      "grad_norm": 1.308205246925354,
      "learning_rate": 2.535310734463277e-05,
      "loss": 0.9058,
      "step": 350
    },
    {
      "epoch": 1.4830508474576272,
      "eval_loss": 0.964046061038971,
      "eval_runtime": 22.9381,
      "eval_samples_per_second": 93.774,
      "eval_steps_per_second": 1.482,
      "step": 350
    },
    {
      "epoch": 1.5254237288135593,
      "grad_norm": 1.3957818746566772,
      "learning_rate": 2.464689265536723e-05,
      "loss": 0.9232,
      "step": 360
    },
    {
      "epoch": 1.5677966101694916,
      "grad_norm": 1.3040454387664795,
      "learning_rate": 2.3940677966101697e-05,
      "loss": 0.9376,
      "step": 370
    },
    {
      "epoch": 1.6101694915254239,
      "grad_norm": 1.4122378826141357,
      "learning_rate": 2.323446327683616e-05,
      "loss": 0.9115,
      "step": 380
    },
    {
      "epoch": 1.652542372881356,
      "grad_norm": 1.3442506790161133,
      "learning_rate": 2.252824858757062e-05,
      "loss": 0.9599,
      "step": 390
    },
    {
      "epoch": 1.694915254237288,
      "grad_norm": 1.5409705638885498,
      "learning_rate": 2.1822033898305087e-05,
      "loss": 0.9258,
      "step": 400
    },
    {
      "epoch": 1.694915254237288,
      "eval_loss": 0.9589577913284302,
      "eval_runtime": 22.6608,
      "eval_samples_per_second": 94.922,
      "eval_steps_per_second": 1.5,
      "step": 400
    },
    {
      "epoch": 1.7372881355932204,
      "grad_norm": 1.315023422241211,
      "learning_rate": 2.111581920903955e-05,
      "loss": 0.9088,
      "step": 410
    },
    {
      "epoch": 1.7796610169491527,
      "grad_norm": 1.4532520771026611,
      "learning_rate": 2.0409604519774012e-05,
      "loss": 0.9351,
      "step": 420
    },
    {
      "epoch": 1.8220338983050848,
      "grad_norm": 1.3708842992782593,
      "learning_rate": 1.9703389830508475e-05,
      "loss": 0.95,
      "step": 430
    },
    {
      "epoch": 1.8644067796610169,
      "grad_norm": 1.3897417783737183,
      "learning_rate": 1.899717514124294e-05,
      "loss": 0.8943,
      "step": 440
    },
    {
      "epoch": 1.9067796610169492,
      "grad_norm": 1.2795928716659546,
      "learning_rate": 1.8290960451977403e-05,
      "loss": 0.8856,
      "step": 450
    },
    {
      "epoch": 1.9067796610169492,
      "eval_loss": 0.9577222466468811,
      "eval_runtime": 23.1528,
      "eval_samples_per_second": 92.905,
      "eval_steps_per_second": 1.469,
      "step": 450
    },
    {
      "epoch": 1.9491525423728815,
      "grad_norm": 1.2920231819152832,
      "learning_rate": 1.7584745762711865e-05,
      "loss": 0.9415,
      "step": 460
    },
    {
      "epoch": 1.9915254237288136,
      "grad_norm": 1.3519489765167236,
      "learning_rate": 1.687853107344633e-05,
      "loss": 0.9708,
      "step": 470
    },
    {
      "epoch": 2.0338983050847457,
      "grad_norm": 1.3836076259613037,
      "learning_rate": 1.617231638418079e-05,
      "loss": 0.8801,
      "step": 480
    },
    {
      "epoch": 2.0762711864406778,
      "grad_norm": 1.478256344795227,
      "learning_rate": 1.5466101694915256e-05,
      "loss": 0.8908,
      "step": 490
    },
    {
      "epoch": 2.1186440677966103,
      "grad_norm": 1.571032166481018,
      "learning_rate": 1.475988700564972e-05,
      "loss": 0.9013,
      "step": 500
    },
    {
      "epoch": 2.1186440677966103,
      "eval_loss": 0.9608597755432129,
      "eval_runtime": 22.5872,
      "eval_samples_per_second": 95.231,
      "eval_steps_per_second": 1.505,
      "step": 500
    },
    {
      "epoch": 2.1610169491525424,
      "grad_norm": 1.466437578201294,
      "learning_rate": 1.4053672316384181e-05,
      "loss": 0.8926,
      "step": 510
    },
    {
      "epoch": 2.2033898305084745,
      "grad_norm": 1.6073771715164185,
      "learning_rate": 1.3347457627118645e-05,
      "loss": 0.8717,
      "step": 520
    },
    {
      "epoch": 2.2457627118644066,
      "grad_norm": 1.5890705585479736,
      "learning_rate": 1.2641242937853107e-05,
      "loss": 0.8633,
      "step": 530
    },
    {
      "epoch": 2.288135593220339,
      "grad_norm": 1.6749075651168823,
      "learning_rate": 1.1935028248587572e-05,
      "loss": 0.8941,
      "step": 540
    },
    {
      "epoch": 2.330508474576271,
      "grad_norm": 1.6727330684661865,
      "learning_rate": 1.1228813559322036e-05,
      "loss": 0.8929,
      "step": 550
    },
    {
      "epoch": 2.330508474576271,
      "eval_loss": 0.9595548510551453,
      "eval_runtime": 22.9206,
      "eval_samples_per_second": 93.846,
      "eval_steps_per_second": 1.483,
      "step": 550
    },
    {
      "epoch": 2.3728813559322033,
      "grad_norm": 1.5609760284423828,
      "learning_rate": 1.0522598870056498e-05,
      "loss": 0.8298,
      "step": 560
    },
    {
      "epoch": 2.415254237288136,
      "grad_norm": 1.7489114999771118,
      "learning_rate": 9.81638418079096e-06,
      "loss": 0.8618,
      "step": 570
    },
    {
      "epoch": 2.457627118644068,
      "grad_norm": 1.7662324905395508,
      "learning_rate": 9.110169491525423e-06,
      "loss": 0.8932,
      "step": 580
    },
    {
      "epoch": 2.5,
      "grad_norm": 1.9018930196762085,
      "learning_rate": 8.403954802259887e-06,
      "loss": 0.8743,
      "step": 590
    },
    {
      "epoch": 2.542372881355932,
      "grad_norm": 1.6308236122131348,
      "learning_rate": 7.697740112994351e-06,
      "loss": 0.8752,
      "step": 600
    },
    {
      "epoch": 2.542372881355932,
      "eval_loss": 0.9617831707000732,
      "eval_runtime": 23.2374,
      "eval_samples_per_second": 92.566,
      "eval_steps_per_second": 1.463,
      "step": 600
    },
    {
      "epoch": 2.584745762711864,
      "grad_norm": 1.5900993347167969,
      "learning_rate": 6.991525423728814e-06,
      "loss": 0.8653,
      "step": 610
    },
    {
      "epoch": 2.6271186440677967,
      "grad_norm": 1.8414711952209473,
      "learning_rate": 6.285310734463278e-06,
      "loss": 0.8877,
      "step": 620
    },
    {
      "epoch": 2.669491525423729,
      "grad_norm": 1.7117115259170532,
      "learning_rate": 5.57909604519774e-06,
      "loss": 0.8691,
      "step": 630
    },
    {
      "epoch": 2.711864406779661,
      "grad_norm": 1.690845012664795,
      "learning_rate": 4.872881355932204e-06,
      "loss": 0.8728,
      "step": 640
    },
    {
      "epoch": 2.7542372881355934,
      "grad_norm": 1.727298378944397,
      "learning_rate": 4.166666666666667e-06,
      "loss": 0.9005,
      "step": 650
    },
    {
      "epoch": 2.7542372881355934,
      "eval_loss": 0.959761381149292,
      "eval_runtime": 22.8543,
      "eval_samples_per_second": 94.118,
      "eval_steps_per_second": 1.488,
      "step": 650
    },
    {
      "epoch": 2.7966101694915255,
      "grad_norm": 1.6035046577453613,
      "learning_rate": 3.46045197740113e-06,
      "loss": 0.9181,
      "step": 660
    },
    {
      "epoch": 2.8389830508474576,
      "grad_norm": 1.7235994338989258,
      "learning_rate": 2.7542372881355934e-06,
      "loss": 0.9003,
      "step": 670
    },
    {
      "epoch": 2.8813559322033897,
      "grad_norm": 1.7914451360702515,
      "learning_rate": 2.0480225988700563e-06,
      "loss": 0.951,
      "step": 680
    },
    {
      "epoch": 2.923728813559322,
      "grad_norm": 1.8638156652450562,
      "learning_rate": 1.3418079096045198e-06,
      "loss": 0.8739,
      "step": 690
    },
    {
      "epoch": 2.9661016949152543,
      "grad_norm": 1.6826245784759521,
      "learning_rate": 6.355932203389831e-07,
      "loss": 0.8375,
      "step": 700
    },
    {
      "epoch": 2.9661016949152543,
      "eval_loss": 0.9599277973175049,
      "eval_runtime": 22.8971,
      "eval_samples_per_second": 93.942,
      "eval_steps_per_second": 1.485,
      "step": 700
    }
  ],
  "logging_steps": 10,
  "max_steps": 708,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.8604011452104704e+16,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}