{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5106382978723404, "eval_steps": 500, "global_step": 42, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0121580547112462, "grad_norm": 0.0986328125, "learning_rate": 2e-05, "loss": 0.8436, "step": 1 }, { "epoch": 0.0243161094224924, "grad_norm": 0.09765625, "learning_rate": 4e-05, "loss": 0.9406, "step": 2 }, { "epoch": 0.0364741641337386, "grad_norm": 0.11083984375, "learning_rate": 6e-05, "loss": 1.1161, "step": 3 }, { "epoch": 0.0486322188449848, "grad_norm": 0.09521484375, "learning_rate": 8e-05, "loss": 0.9467, "step": 4 }, { "epoch": 0.060790273556231005, "grad_norm": 0.10009765625, "learning_rate": 0.0001, "loss": 0.8456, "step": 5 }, { "epoch": 0.0729483282674772, "grad_norm": 0.10009765625, "learning_rate": 0.00012, "loss": 0.9215, "step": 6 }, { "epoch": 0.0851063829787234, "grad_norm": 0.107421875, "learning_rate": 0.00014, "loss": 0.8381, "step": 7 }, { "epoch": 0.0972644376899696, "grad_norm": 0.1123046875, "learning_rate": 0.00016, "loss": 0.9411, "step": 8 }, { "epoch": 0.1094224924012158, "grad_norm": 0.11572265625, "learning_rate": 0.00018, "loss": 0.9348, "step": 9 }, { "epoch": 0.12158054711246201, "grad_norm": 0.1279296875, "learning_rate": 0.0002, "loss": 0.9752, "step": 10 }, { "epoch": 0.1337386018237082, "grad_norm": 0.1416015625, "learning_rate": 0.0001999048221581858, "loss": 1.1067, "step": 11 }, { "epoch": 0.1458966565349544, "grad_norm": 0.1328125, "learning_rate": 0.00019961946980917456, "loss": 0.904, "step": 12 }, { "epoch": 0.1580547112462006, "grad_norm": 0.146484375, "learning_rate": 0.00019914448613738106, "loss": 0.9718, "step": 13 }, { "epoch": 0.1702127659574468, "grad_norm": 0.1435546875, "learning_rate": 0.00019848077530122083, "loss": 0.9354, "step": 14 }, { "epoch": 0.182370820668693, "grad_norm": 0.12890625, "learning_rate": 0.00019762960071199333, "loss": 1.0536, "step": 15 }, { "epoch": 0.1945288753799392, "grad_norm": 0.1259765625, "learning_rate": 0.00019659258262890683, "loss": 1.0129, "step": 16 }, { "epoch": 0.2066869300911854, "grad_norm": 0.1328125, "learning_rate": 0.0001953716950748227, "loss": 1.0696, "step": 17 }, { "epoch": 0.2188449848024316, "grad_norm": 0.1025390625, "learning_rate": 0.00019396926207859084, "loss": 0.802, "step": 18 }, { "epoch": 0.23100303951367782, "grad_norm": 0.12255859375, "learning_rate": 0.0001923879532511287, "loss": 0.9757, "step": 19 }, { "epoch": 0.24316109422492402, "grad_norm": 0.123046875, "learning_rate": 0.000190630778703665, "loss": 1.1032, "step": 20 }, { "epoch": 0.2553191489361702, "grad_norm": 0.12158203125, "learning_rate": 0.00018870108331782217, "loss": 0.9319, "step": 21 }, { "epoch": 0.2674772036474164, "grad_norm": 0.115234375, "learning_rate": 0.00018660254037844388, "loss": 0.8929, "step": 22 }, { "epoch": 0.2796352583586626, "grad_norm": 0.10888671875, "learning_rate": 0.0001843391445812886, "loss": 0.9112, "step": 23 }, { "epoch": 0.2917933130699088, "grad_norm": 0.11376953125, "learning_rate": 0.0001819152044288992, "loss": 0.9391, "step": 24 }, { "epoch": 0.303951367781155, "grad_norm": 0.10107421875, "learning_rate": 0.00017933533402912354, "loss": 0.7715, "step": 25 }, { "epoch": 0.3161094224924012, "grad_norm": 0.12353515625, "learning_rate": 0.0001766044443118978, "loss": 0.9815, "step": 26 }, { "epoch": 0.3282674772036474, "grad_norm": 0.11181640625, "learning_rate": 0.0001737277336810124, "loss": 0.869, "step": 27 }, { "epoch": 0.3404255319148936, "grad_norm": 0.11181640625, "learning_rate": 0.00017071067811865476, "loss": 0.9439, "step": 28 }, { "epoch": 0.3525835866261398, "grad_norm": 0.1162109375, "learning_rate": 0.00016755902076156604, "loss": 0.9866, "step": 29 }, { "epoch": 0.364741641337386, "grad_norm": 0.1181640625, "learning_rate": 0.00016427876096865394, "loss": 1.0274, "step": 30 }, { "epoch": 0.3768996960486322, "grad_norm": 0.1337890625, "learning_rate": 0.00016087614290087208, "loss": 0.9795, "step": 31 }, { "epoch": 0.3890577507598784, "grad_norm": 0.119140625, "learning_rate": 0.0001573576436351046, "loss": 0.9229, "step": 32 }, { "epoch": 0.4012158054711246, "grad_norm": 0.11279296875, "learning_rate": 0.0001537299608346824, "loss": 0.8489, "step": 33 }, { "epoch": 0.4133738601823708, "grad_norm": 0.11181640625, "learning_rate": 0.00015000000000000001, "loss": 0.8989, "step": 34 }, { "epoch": 0.425531914893617, "grad_norm": 0.12109375, "learning_rate": 0.00014617486132350343, "loss": 0.9865, "step": 35 }, { "epoch": 0.4376899696048632, "grad_norm": 0.126953125, "learning_rate": 0.00014226182617406996, "loss": 0.9563, "step": 36 }, { "epoch": 0.44984802431610943, "grad_norm": 0.1181640625, "learning_rate": 0.000138268343236509, "loss": 0.9357, "step": 37 }, { "epoch": 0.46200607902735563, "grad_norm": 0.1171875, "learning_rate": 0.00013420201433256689, "loss": 0.9433, "step": 38 }, { "epoch": 0.47416413373860183, "grad_norm": 0.1279296875, "learning_rate": 0.00013007057995042732, "loss": 1.0711, "step": 39 }, { "epoch": 0.48632218844984804, "grad_norm": 0.12353515625, "learning_rate": 0.00012588190451025207, "loss": 0.977, "step": 40 }, { "epoch": 0.49848024316109424, "grad_norm": 0.11083984375, "learning_rate": 0.00012164396139381029, "loss": 0.8356, "step": 41 }, { "epoch": 0.5106382978723404, "grad_norm": 0.1259765625, "learning_rate": 0.00011736481776669306, "loss": 0.9512, "step": 42 } ], "logging_steps": 1, "max_steps": 82, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 21, "total_flos": 3.1159265909538816e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }