| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.5106382978723404, | |
| "eval_steps": 500, | |
| "global_step": 42, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0121580547112462, | |
| "grad_norm": 0.0986328125, | |
| "learning_rate": 2e-05, | |
| "loss": 0.8436, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.0243161094224924, | |
| "grad_norm": 0.09765625, | |
| "learning_rate": 4e-05, | |
| "loss": 0.9406, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.0364741641337386, | |
| "grad_norm": 0.11083984375, | |
| "learning_rate": 6e-05, | |
| "loss": 1.1161, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.0486322188449848, | |
| "grad_norm": 0.09521484375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.9467, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.060790273556231005, | |
| "grad_norm": 0.10009765625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8456, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.0729483282674772, | |
| "grad_norm": 0.10009765625, | |
| "learning_rate": 0.00012, | |
| "loss": 0.9215, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.0851063829787234, | |
| "grad_norm": 0.107421875, | |
| "learning_rate": 0.00014, | |
| "loss": 0.8381, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.0972644376899696, | |
| "grad_norm": 0.1123046875, | |
| "learning_rate": 0.00016, | |
| "loss": 0.9411, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.1094224924012158, | |
| "grad_norm": 0.11572265625, | |
| "learning_rate": 0.00018, | |
| "loss": 0.9348, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.12158054711246201, | |
| "grad_norm": 0.1279296875, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9752, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.1337386018237082, | |
| "grad_norm": 0.1416015625, | |
| "learning_rate": 0.0001999048221581858, | |
| "loss": 1.1067, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.1458966565349544, | |
| "grad_norm": 0.1328125, | |
| "learning_rate": 0.00019961946980917456, | |
| "loss": 0.904, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.1580547112462006, | |
| "grad_norm": 0.146484375, | |
| "learning_rate": 0.00019914448613738106, | |
| "loss": 0.9718, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.1702127659574468, | |
| "grad_norm": 0.1435546875, | |
| "learning_rate": 0.00019848077530122083, | |
| "loss": 0.9354, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.182370820668693, | |
| "grad_norm": 0.12890625, | |
| "learning_rate": 0.00019762960071199333, | |
| "loss": 1.0536, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.1945288753799392, | |
| "grad_norm": 0.1259765625, | |
| "learning_rate": 0.00019659258262890683, | |
| "loss": 1.0129, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.2066869300911854, | |
| "grad_norm": 0.1328125, | |
| "learning_rate": 0.0001953716950748227, | |
| "loss": 1.0696, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.2188449848024316, | |
| "grad_norm": 0.1025390625, | |
| "learning_rate": 0.00019396926207859084, | |
| "loss": 0.802, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.23100303951367782, | |
| "grad_norm": 0.12255859375, | |
| "learning_rate": 0.0001923879532511287, | |
| "loss": 0.9757, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.24316109422492402, | |
| "grad_norm": 0.123046875, | |
| "learning_rate": 0.000190630778703665, | |
| "loss": 1.1032, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.2553191489361702, | |
| "grad_norm": 0.12158203125, | |
| "learning_rate": 0.00018870108331782217, | |
| "loss": 0.9319, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.2674772036474164, | |
| "grad_norm": 0.115234375, | |
| "learning_rate": 0.00018660254037844388, | |
| "loss": 0.8929, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.2796352583586626, | |
| "grad_norm": 0.10888671875, | |
| "learning_rate": 0.0001843391445812886, | |
| "loss": 0.9112, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.2917933130699088, | |
| "grad_norm": 0.11376953125, | |
| "learning_rate": 0.0001819152044288992, | |
| "loss": 0.9391, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.303951367781155, | |
| "grad_norm": 0.10107421875, | |
| "learning_rate": 0.00017933533402912354, | |
| "loss": 0.7715, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.3161094224924012, | |
| "grad_norm": 0.12353515625, | |
| "learning_rate": 0.0001766044443118978, | |
| "loss": 0.9815, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.3282674772036474, | |
| "grad_norm": 0.11181640625, | |
| "learning_rate": 0.0001737277336810124, | |
| "loss": 0.869, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.3404255319148936, | |
| "grad_norm": 0.11181640625, | |
| "learning_rate": 0.00017071067811865476, | |
| "loss": 0.9439, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.3525835866261398, | |
| "grad_norm": 0.1162109375, | |
| "learning_rate": 0.00016755902076156604, | |
| "loss": 0.9866, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.364741641337386, | |
| "grad_norm": 0.1181640625, | |
| "learning_rate": 0.00016427876096865394, | |
| "loss": 1.0274, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.3768996960486322, | |
| "grad_norm": 0.1337890625, | |
| "learning_rate": 0.00016087614290087208, | |
| "loss": 0.9795, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.3890577507598784, | |
| "grad_norm": 0.119140625, | |
| "learning_rate": 0.0001573576436351046, | |
| "loss": 0.9229, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.4012158054711246, | |
| "grad_norm": 0.11279296875, | |
| "learning_rate": 0.0001537299608346824, | |
| "loss": 0.8489, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.4133738601823708, | |
| "grad_norm": 0.11181640625, | |
| "learning_rate": 0.00015000000000000001, | |
| "loss": 0.8989, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.425531914893617, | |
| "grad_norm": 0.12109375, | |
| "learning_rate": 0.00014617486132350343, | |
| "loss": 0.9865, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.4376899696048632, | |
| "grad_norm": 0.126953125, | |
| "learning_rate": 0.00014226182617406996, | |
| "loss": 0.9563, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.44984802431610943, | |
| "grad_norm": 0.1181640625, | |
| "learning_rate": 0.000138268343236509, | |
| "loss": 0.9357, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.46200607902735563, | |
| "grad_norm": 0.1171875, | |
| "learning_rate": 0.00013420201433256689, | |
| "loss": 0.9433, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.47416413373860183, | |
| "grad_norm": 0.1279296875, | |
| "learning_rate": 0.00013007057995042732, | |
| "loss": 1.0711, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.48632218844984804, | |
| "grad_norm": 0.12353515625, | |
| "learning_rate": 0.00012588190451025207, | |
| "loss": 0.977, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.49848024316109424, | |
| "grad_norm": 0.11083984375, | |
| "learning_rate": 0.00012164396139381029, | |
| "loss": 0.8356, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.5106382978723404, | |
| "grad_norm": 0.1259765625, | |
| "learning_rate": 0.00011736481776669306, | |
| "loss": 0.9512, | |
| "step": 42 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 82, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 21, | |
| "total_flos": 3.1159265909538816e+16, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |