| { |
| "best_global_step": 550, |
| "best_metric": 0.334078311920166, |
| "best_model_checkpoint": "./output_training_h200/checkpoint-550", |
| "epoch": 3.225806451612903, |
| "eval_steps": 50, |
| "global_step": 600, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.053763440860215055, |
| "grad_norm": 0.05421804264187813, |
| "learning_rate": 4.5e-06, |
| "loss": 0.5785, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.10752688172043011, |
| "grad_norm": 0.06676478683948517, |
| "learning_rate": 9.5e-06, |
| "loss": 0.6234, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.16129032258064516, |
| "grad_norm": 0.062416791915893555, |
| "learning_rate": 1.45e-05, |
| "loss": 0.6716, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.21505376344086022, |
| "grad_norm": 0.05097672715783119, |
| "learning_rate": 1.9500000000000003e-05, |
| "loss": 0.6736, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.26881720430107525, |
| "grad_norm": 0.03752289339900017, |
| "learning_rate": 2.45e-05, |
| "loss": 0.6243, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.26881720430107525, |
| "eval_loss": 0.5304985642433167, |
| "eval_runtime": 1155.6426, |
| "eval_samples_per_second": 0.303, |
| "eval_steps_per_second": 0.151, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.3225806451612903, |
| "grad_norm": 0.0321248322725296, |
| "learning_rate": 2.95e-05, |
| "loss": 0.4349, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.3763440860215054, |
| "grad_norm": 0.02994599938392639, |
| "learning_rate": 3.45e-05, |
| "loss": 0.4682, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.43010752688172044, |
| "grad_norm": 0.03603624552488327, |
| "learning_rate": 3.9500000000000005e-05, |
| "loss": 0.4734, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.4838709677419355, |
| "grad_norm": 0.0552607886493206, |
| "learning_rate": 4.4500000000000004e-05, |
| "loss": 0.5035, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.5376344086021505, |
| "grad_norm": 0.04917095974087715, |
| "learning_rate": 4.9500000000000004e-05, |
| "loss": 0.4024, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.5376344086021505, |
| "eval_loss": 0.41749581694602966, |
| "eval_runtime": 1156.2817, |
| "eval_samples_per_second": 0.303, |
| "eval_steps_per_second": 0.151, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.5913978494623656, |
| "grad_norm": 0.043786872178316116, |
| "learning_rate": 4.998549570601475e-05, |
| "loss": 0.3703, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.6451612903225806, |
| "grad_norm": 0.05016686022281647, |
| "learning_rate": 4.9935379012568985e-05, |
| "loss": 0.4031, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.6989247311827957, |
| "grad_norm": 0.056300703436136246, |
| "learning_rate": 4.984954262888182e-05, |
| "loss": 0.4134, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.7526881720430108, |
| "grad_norm": 0.04982742667198181, |
| "learning_rate": 4.972810951474605e-05, |
| "loss": 0.4282, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.8064516129032258, |
| "grad_norm": 0.042826056480407715, |
| "learning_rate": 4.957125362192794e-05, |
| "loss": 0.3259, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.8064516129032258, |
| "eval_loss": 0.3815489113330841, |
| "eval_runtime": 1148.3707, |
| "eval_samples_per_second": 0.305, |
| "eval_steps_per_second": 0.152, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.8602150537634409, |
| "grad_norm": 0.0555887334048748, |
| "learning_rate": 4.937919964498302e-05, |
| "loss": 0.3549, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.9139784946236559, |
| "grad_norm": 0.0595487505197525, |
| "learning_rate": 4.9152222699383273e-05, |
| "loss": 0.3793, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.967741935483871, |
| "grad_norm": 0.07630524784326553, |
| "learning_rate": 4.8890647927416887e-05, |
| "loss": 0.3904, |
| "step": 180 |
| }, |
| { |
| "epoch": 1.021505376344086, |
| "grad_norm": 0.054408393800258636, |
| "learning_rate": 4.859485003242503e-05, |
| "loss": 0.3848, |
| "step": 190 |
| }, |
| { |
| "epoch": 1.075268817204301, |
| "grad_norm": 0.052903175354003906, |
| "learning_rate": 4.8265252742042965e-05, |
| "loss": 0.3173, |
| "step": 200 |
| }, |
| { |
| "epoch": 1.075268817204301, |
| "eval_loss": 0.3656960725784302, |
| "eval_runtime": 1157.3762, |
| "eval_samples_per_second": 0.302, |
| "eval_steps_per_second": 0.151, |
| "step": 200 |
| }, |
| { |
| "epoch": 1.129032258064516, |
| "grad_norm": 0.06489476561546326, |
| "learning_rate": 4.7902328201214256e-05, |
| "loss": 0.3467, |
| "step": 210 |
| }, |
| { |
| "epoch": 1.1827956989247312, |
| "grad_norm": 0.06930793821811676, |
| "learning_rate": 4.7506596295847716e-05, |
| "loss": 0.3572, |
| "step": 220 |
| }, |
| { |
| "epoch": 1.2365591397849462, |
| "grad_norm": 0.08778736740350723, |
| "learning_rate": 4.7078623908085825e-05, |
| "loss": 0.3915, |
| "step": 230 |
| }, |
| { |
| "epoch": 1.2903225806451613, |
| "grad_norm": 0.07511554658412933, |
| "learning_rate": 4.661902410425155e-05, |
| "loss": 0.3341, |
| "step": 240 |
| }, |
| { |
| "epoch": 1.3440860215053765, |
| "grad_norm": 0.061772529035806656, |
| "learning_rate": 4.6128455256636706e-05, |
| "loss": 0.3135, |
| "step": 250 |
| }, |
| { |
| "epoch": 1.3440860215053765, |
| "eval_loss": 0.3562982678413391, |
| "eval_runtime": 1141.9386, |
| "eval_samples_per_second": 0.306, |
| "eval_steps_per_second": 0.153, |
| "step": 250 |
| }, |
| { |
| "epoch": 1.3978494623655915, |
| "grad_norm": 0.06834172457456589, |
| "learning_rate": 4.560762010039001e-05, |
| "loss": 0.347, |
| "step": 260 |
| }, |
| { |
| "epoch": 1.4516129032258065, |
| "grad_norm": 0.07726427167654037, |
| "learning_rate": 4.5057264726855765e-05, |
| "loss": 0.3572, |
| "step": 270 |
| }, |
| { |
| "epoch": 1.5053763440860215, |
| "grad_norm": 0.06627994775772095, |
| "learning_rate": 4.4478177514805166e-05, |
| "loss": 0.3861, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.5591397849462365, |
| "grad_norm": 0.06034032255411148, |
| "learning_rate": 4.387118800109133e-05, |
| "loss": 0.2919, |
| "step": 290 |
| }, |
| { |
| "epoch": 1.6129032258064515, |
| "grad_norm": 0.07344155013561249, |
| "learning_rate": 4.323716569234572e-05, |
| "loss": 0.3211, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.6129032258064515, |
| "eval_loss": 0.3491288125514984, |
| "eval_runtime": 1137.4889, |
| "eval_samples_per_second": 0.308, |
| "eval_steps_per_second": 0.154, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.6666666666666665, |
| "grad_norm": 0.07450632750988007, |
| "learning_rate": 4.2577018819418296e-05, |
| "loss": 0.3446, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.7204301075268817, |
| "grad_norm": 0.09049148857593536, |
| "learning_rate": 4.189169303634555e-05, |
| "loss": 0.3587, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.7741935483870968, |
| "grad_norm": 0.06568805873394012, |
| "learning_rate": 4.1182170065710226e-05, |
| "loss": 0.3463, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.827956989247312, |
| "grad_norm": 0.0814325287938118, |
| "learning_rate": 4.044946629233316e-05, |
| "loss": 0.2897, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.881720430107527, |
| "grad_norm": 0.07297977060079575, |
| "learning_rate": 3.969463130731183e-05, |
| "loss": 0.3354, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.881720430107527, |
| "eval_loss": 0.34430938959121704, |
| "eval_runtime": 1149.1219, |
| "eval_samples_per_second": 0.305, |
| "eval_steps_per_second": 0.152, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.935483870967742, |
| "grad_norm": 0.08211623132228851, |
| "learning_rate": 3.89187464044912e-05, |
| "loss": 0.3559, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.989247311827957, |
| "grad_norm": 0.12685473263263702, |
| "learning_rate": 3.8122923031520735e-05, |
| "loss": 0.3879, |
| "step": 370 |
| }, |
| { |
| "epoch": 2.043010752688172, |
| "grad_norm": 0.07243961095809937, |
| "learning_rate": 3.730830119771631e-05, |
| "loss": 0.2912, |
| "step": 380 |
| }, |
| { |
| "epoch": 2.096774193548387, |
| "grad_norm": 0.0748642310500145, |
| "learning_rate": 3.647604784100787e-05, |
| "loss": 0.2902, |
| "step": 390 |
| }, |
| { |
| "epoch": 2.150537634408602, |
| "grad_norm": 0.0815594270825386, |
| "learning_rate": 3.5627355156312084e-05, |
| "loss": 0.3234, |
| "step": 400 |
| }, |
| { |
| "epoch": 2.150537634408602, |
| "eval_loss": 0.34095144271850586, |
| "eval_runtime": 1133.2669, |
| "eval_samples_per_second": 0.309, |
| "eval_steps_per_second": 0.154, |
| "step": 400 |
| }, |
| { |
| "epoch": 2.204301075268817, |
| "grad_norm": 0.09361624717712402, |
| "learning_rate": 3.476343888772461e-05, |
| "loss": 0.3324, |
| "step": 410 |
| }, |
| { |
| "epoch": 2.258064516129032, |
| "grad_norm": 0.09291868656873703, |
| "learning_rate": 3.3885536586978394e-05, |
| "loss": 0.3436, |
| "step": 420 |
| }, |
| { |
| "epoch": 2.3118279569892475, |
| "grad_norm": 0.07615092396736145, |
| "learning_rate": 3.2994905840662696e-05, |
| "loss": 0.2794, |
| "step": 430 |
| }, |
| { |
| "epoch": 2.3655913978494625, |
| "grad_norm": 0.08865945041179657, |
| "learning_rate": 3.20928224687425e-05, |
| "loss": 0.3047, |
| "step": 440 |
| }, |
| { |
| "epoch": 2.4193548387096775, |
| "grad_norm": 0.0916745513677597, |
| "learning_rate": 3.118057869695858e-05, |
| "loss": 0.3222, |
| "step": 450 |
| }, |
| { |
| "epoch": 2.4193548387096775, |
| "eval_loss": 0.33828824758529663, |
| "eval_runtime": 1135.3808, |
| "eval_samples_per_second": 0.308, |
| "eval_steps_per_second": 0.154, |
| "step": 450 |
| }, |
| { |
| "epoch": 2.4731182795698925, |
| "grad_norm": 0.12153404206037521, |
| "learning_rate": 3.0259481305726705e-05, |
| "loss": 0.3446, |
| "step": 460 |
| }, |
| { |
| "epoch": 2.5268817204301075, |
| "grad_norm": 0.08235576003789902, |
| "learning_rate": 2.9330849758187195e-05, |
| "loss": 0.3188, |
| "step": 470 |
| }, |
| { |
| "epoch": 2.5806451612903225, |
| "grad_norm": 0.08115874975919724, |
| "learning_rate": 2.839601431008669e-05, |
| "loss": 0.2803, |
| "step": 480 |
| }, |
| { |
| "epoch": 2.6344086021505375, |
| "grad_norm": 0.08219080418348312, |
| "learning_rate": 2.7456314104199647e-05, |
| "loss": 0.3101, |
| "step": 490 |
| }, |
| { |
| "epoch": 2.688172043010753, |
| "grad_norm": 0.09740650653839111, |
| "learning_rate": 2.651309525201918e-05, |
| "loss": 0.3297, |
| "step": 500 |
| }, |
| { |
| "epoch": 2.688172043010753, |
| "eval_loss": 0.33599451184272766, |
| "eval_runtime": 1141.2743, |
| "eval_samples_per_second": 0.307, |
| "eval_steps_per_second": 0.153, |
| "step": 500 |
| }, |
| { |
| "epoch": 2.741935483870968, |
| "grad_norm": 0.16821204125881195, |
| "learning_rate": 2.5567708905465337e-05, |
| "loss": 0.3602, |
| "step": 510 |
| }, |
| { |
| "epoch": 2.795698924731183, |
| "grad_norm": 0.07880295068025589, |
| "learning_rate": 2.4621509321372963e-05, |
| "loss": 0.279, |
| "step": 520 |
| }, |
| { |
| "epoch": 2.849462365591398, |
| "grad_norm": 0.08798631280660629, |
| "learning_rate": 2.3675851921531854e-05, |
| "loss": 0.2905, |
| "step": 530 |
| }, |
| { |
| "epoch": 2.903225806451613, |
| "grad_norm": 0.09074675291776657, |
| "learning_rate": 2.2732091351058068e-05, |
| "loss": 0.3102, |
| "step": 540 |
| }, |
| { |
| "epoch": 2.956989247311828, |
| "grad_norm": 0.09948832541704178, |
| "learning_rate": 2.1791579537877894e-05, |
| "loss": 0.3296, |
| "step": 550 |
| }, |
| { |
| "epoch": 2.956989247311828, |
| "eval_loss": 0.334078311920166, |
| "eval_runtime": 1150.9489, |
| "eval_samples_per_second": 0.304, |
| "eval_steps_per_second": 0.152, |
| "step": 550 |
| }, |
| { |
| "epoch": 3.010752688172043, |
| "grad_norm": 0.08561868220567703, |
| "learning_rate": 2.085566375610409e-05, |
| "loss": 0.3341, |
| "step": 560 |
| }, |
| { |
| "epoch": 3.064516129032258, |
| "grad_norm": 0.0797228142619133, |
| "learning_rate": 1.9925684696078656e-05, |
| "loss": 0.2616, |
| "step": 570 |
| }, |
| { |
| "epoch": 3.118279569892473, |
| "grad_norm": 0.09251473098993301, |
| "learning_rate": 1.9002974543846926e-05, |
| "loss": 0.2976, |
| "step": 580 |
| }, |
| { |
| "epoch": 3.172043010752688, |
| "grad_norm": 0.09707269072532654, |
| "learning_rate": 1.8088855072813818e-05, |
| "loss": 0.3086, |
| "step": 590 |
| }, |
| { |
| "epoch": 3.225806451612903, |
| "grad_norm": 0.1216258853673935, |
| "learning_rate": 1.7184635750316168e-05, |
| "loss": 0.3279, |
| "step": 600 |
| }, |
| { |
| "epoch": 3.225806451612903, |
| "eval_loss": 0.3347454369068146, |
| "eval_runtime": 1152.9274, |
| "eval_samples_per_second": 0.304, |
| "eval_steps_per_second": 0.152, |
| "step": 600 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 930, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 50, |
| "stateful_callbacks": { |
| "EarlyStoppingCallback": { |
| "args": { |
| "early_stopping_patience": 5, |
| "early_stopping_threshold": 0.001 |
| }, |
| "attributes": { |
| "early_stopping_patience_counter": 1 |
| } |
| }, |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.3295802405463953e+19, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|