{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9957567185289957, "eval_steps": 500, "global_step": 176, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005657708628005658, "grad_norm": 0.43130557078322906, "learning_rate": 1.1111111111111112e-05, "loss": 1.9358, "step": 1 }, { "epoch": 0.028288543140028287, "grad_norm": 0.47181074868090295, "learning_rate": 5.555555555555556e-05, "loss": 1.8755, "step": 5 }, { "epoch": 0.056577086280056574, "grad_norm": 0.24499816695787435, "learning_rate": 0.00011111111111111112, "loss": 1.8709, "step": 10 }, { "epoch": 0.08486562942008487, "grad_norm": 0.1946906656901304, "learning_rate": 0.0001666666666666667, "loss": 1.783, "step": 15 }, { "epoch": 0.11315417256011315, "grad_norm": 0.16173800943880282, "learning_rate": 0.00019992093972273018, "loss": 1.6992, "step": 20 }, { "epoch": 0.14144271570014144, "grad_norm": 0.13810234079187747, "learning_rate": 0.0001990329466472502, "loss": 1.6291, "step": 25 }, { "epoch": 0.16973125884016974, "grad_norm": 0.11800201667039435, "learning_rate": 0.000197166934004041, "loss": 1.6379, "step": 30 }, { "epoch": 0.19801980198019803, "grad_norm": 0.11256043626490136, "learning_rate": 0.00019434132997221345, "loss": 1.5736, "step": 35 }, { "epoch": 0.2263083451202263, "grad_norm": 0.10718182461540707, "learning_rate": 0.00019058403936655233, "loss": 1.5491, "step": 40 }, { "epoch": 0.2545968882602546, "grad_norm": 0.112326110136943, "learning_rate": 0.00018593216805796612, "loss": 1.51, "step": 45 }, { "epoch": 0.2828854314002829, "grad_norm": 0.08515607635399414, "learning_rate": 0.00018043165652707649, "loss": 1.5018, "step": 50 }, { "epoch": 0.31117397454031115, "grad_norm": 0.08598762497133733, "learning_rate": 0.00017413682616986185, "loss": 1.4985, "step": 55 }, { "epoch": 0.33946251768033947, "grad_norm": 0.08605574729719911, "learning_rate": 0.0001671098428359037, "loss": 1.4957, "step": 60 }, { "epoch": 0.36775106082036774, "grad_norm": 0.08270099201021953, "learning_rate": 0.00015942010289717105, "loss": 1.4897, "step": 65 }, { "epoch": 0.39603960396039606, "grad_norm": 0.09139373323353543, "learning_rate": 0.00015114354791034225, "loss": 1.4661, "step": 70 }, { "epoch": 0.4243281471004243, "grad_norm": 0.09213917858670279, "learning_rate": 0.00014236191464085286, "loss": 1.4825, "step": 75 }, { "epoch": 0.4526166902404526, "grad_norm": 0.09741017460001462, "learning_rate": 0.0001331619278552068, "loss": 1.4796, "step": 80 }, { "epoch": 0.4809052333804809, "grad_norm": 0.09522302084100907, "learning_rate": 0.0001236344438532905, "loss": 1.4522, "step": 85 }, { "epoch": 0.5091937765205092, "grad_norm": 0.10005523175818275, "learning_rate": 0.00011387355319890685, "loss": 1.4512, "step": 90 }, { "epoch": 0.5374823196605375, "grad_norm": 0.09774894920042008, "learning_rate": 0.0001039756515096926, "loss": 1.4531, "step": 95 }, { "epoch": 0.5657708628005658, "grad_norm": 0.09337090709938024, "learning_rate": 9.403848748301802e-05, "loss": 1.431, "step": 100 }, { "epoch": 0.594059405940594, "grad_norm": 0.10862536785181011, "learning_rate": 8.416019755927851e-05, "loss": 1.4497, "step": 105 }, { "epoch": 0.6223479490806223, "grad_norm": 0.09931981935895794, "learning_rate": 7.443833675595255e-05, "loss": 1.4277, "step": 110 }, { "epoch": 0.6506364922206507, "grad_norm": 0.09784818782800696, "learning_rate": 6.496891524361757e-05, "loss": 1.4467, "step": 115 }, { "epoch": 0.6789250353606789, "grad_norm": 0.09291305679558234, "learning_rate": 5.584545017840885e-05, "loss": 1.4384, "step": 120 }, { "epoch": 0.7072135785007072, "grad_norm": 0.10423529826360275, "learning_rate": 4.715804215473809e-05, "loss": 1.4476, "step": 125 }, { "epoch": 0.7355021216407355, "grad_norm": 0.10820562792911577, "learning_rate": 3.899248539894757e-05, "loss": 1.4327, "step": 130 }, { "epoch": 0.7637906647807637, "grad_norm": 0.09010400983561788, "learning_rate": 3.14294204913587e-05, "loss": 1.4426, "step": 135 }, { "epoch": 0.7920792079207921, "grad_norm": 0.093194757033809, "learning_rate": 2.4543537984176978e-05, "loss": 1.4434, "step": 140 }, { "epoch": 0.8203677510608204, "grad_norm": 0.09797367971916428, "learning_rate": 1.840284078008393e-05, "loss": 1.4363, "step": 145 }, { "epoch": 0.8486562942008486, "grad_norm": 0.09241502492056575, "learning_rate": 1.3067972556041752e-05, "loss": 1.4347, "step": 150 }, { "epoch": 0.8769448373408769, "grad_norm": 0.0929511573879462, "learning_rate": 8.59161886459654e-06, "loss": 1.4129, "step": 155 }, { "epoch": 0.9052333804809052, "grad_norm": 0.08984575236333624, "learning_rate": 5.017986827221733e-06, "loss": 1.4555, "step": 160 }, { "epoch": 0.9335219236209336, "grad_norm": 0.0905814734943296, "learning_rate": 2.3823685580949273e-06, "loss": 1.4418, "step": 165 }, { "epoch": 0.9618104667609618, "grad_norm": 0.08926337770672606, "learning_rate": 7.10792629802659e-07, "loss": 1.4399, "step": 170 }, { "epoch": 0.9900990099009901, "grad_norm": 0.09160289967908435, "learning_rate": 1.976702299344435e-08, "loss": 1.4333, "step": 175 }, { "epoch": 0.9957567185289957, "eval_loss": 1.235967755317688, "eval_runtime": 1555.7732, "eval_samples_per_second": 8.599, "eval_steps_per_second": 0.538, "step": 176 }, { "epoch": 0.9957567185289957, "step": 176, "total_flos": 2207623390691328.0, "train_loss": 1.0380965369668873, "train_runtime": 4181.7688, "train_samples_per_second": 2.703, "train_steps_per_second": 0.042 } ], "logging_steps": 5, "max_steps": 176, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2207623390691328.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }