{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 50, "global_step": 1701, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.11757789535567313, "grad_norm": 1.5434420108795166, "learning_rate": 2.9411764705882355e-06, "loss": 0.7456, "step": 50 }, { "epoch": 0.23515579071134626, "grad_norm": 0.963267982006073, "learning_rate": 5.823529411764706e-06, "loss": 0.5784, "step": 100 }, { "epoch": 0.3527336860670194, "grad_norm": 1.1193718910217285, "learning_rate": 8.705882352941177e-06, "loss": 0.3618, "step": 150 }, { "epoch": 0.4703115814226925, "grad_norm": 0.7008563876152039, "learning_rate": 1.1647058823529412e-05, "loss": 0.3389, "step": 200 }, { "epoch": 0.5878894767783657, "grad_norm": 0.6045801639556885, "learning_rate": 1.4588235294117647e-05, "loss": 0.3334, "step": 250 }, { "epoch": 0.7054673721340388, "grad_norm": 0.5706155300140381, "learning_rate": 1.7529411764705884e-05, "loss": 0.2923, "step": 300 }, { "epoch": 0.823045267489712, "grad_norm": 0.6757872104644775, "learning_rate": 2.047058823529412e-05, "loss": 0.2901, "step": 350 }, { "epoch": 0.940623162845385, "grad_norm": 0.5101656317710876, "learning_rate": 2.3411764705882354e-05, "loss": 0.2893, "step": 400 }, { "epoch": 0.9994121105232217, "eval_loss": 0.4672001004219055, "eval_runtime": 174.9103, "eval_samples_per_second": 1.081, "eval_steps_per_second": 0.137, "step": 425 }, { "epoch": 1.0582010582010581, "grad_norm": 0.5830298066139221, "learning_rate": 2.6352941176470592e-05, "loss": 0.2832, "step": 450 }, { "epoch": 1.1757789535567313, "grad_norm": 0.5612862706184387, "learning_rate": 2.9294117647058827e-05, "loss": 0.2486, "step": 500 }, { "epoch": 1.2933568489124045, "grad_norm": 0.5751911997795105, "learning_rate": 3.223529411764706e-05, "loss": 0.2924, "step": 550 }, { "epoch": 1.4109347442680775, "grad_norm": 0.6772521734237671, "learning_rate": 3.5176470588235294e-05, "loss": 0.2899, "step": 600 }, { "epoch": 1.5285126396237507, "grad_norm": 0.6256781220436096, "learning_rate": 3.811764705882353e-05, "loss": 0.2733, "step": 650 }, { "epoch": 1.646090534979424, "grad_norm": 0.7451750040054321, "learning_rate": 4.1058823529411764e-05, "loss": 0.2874, "step": 700 }, { "epoch": 1.763668430335097, "grad_norm": 0.6674547791481018, "learning_rate": 4.4000000000000006e-05, "loss": 0.2668, "step": 750 }, { "epoch": 1.88124632569077, "grad_norm": 0.5857487916946411, "learning_rate": 4.694117647058824e-05, "loss": 0.2699, "step": 800 }, { "epoch": 1.9988242210464433, "grad_norm": 0.4972004294395447, "learning_rate": 4.9882352941176476e-05, "loss": 0.2867, "step": 850 }, { "epoch": 1.9988242210464433, "eval_loss": 0.43864762783050537, "eval_runtime": 172.7658, "eval_samples_per_second": 1.094, "eval_steps_per_second": 0.139, "step": 850 }, { "epoch": 2.1164021164021163, "grad_norm": 0.43465057015419006, "learning_rate": 4.929411764705882e-05, "loss": 0.2874, "step": 900 }, { "epoch": 2.2339800117577897, "grad_norm": 0.45625609159469604, "learning_rate": 4.855882352941177e-05, "loss": 0.2549, "step": 950 }, { "epoch": 2.3515579071134627, "grad_norm": 0.4708680510520935, "learning_rate": 4.7823529411764704e-05, "loss": 0.2435, "step": 1000 }, { "epoch": 2.4691358024691357, "grad_norm": 0.4648517370223999, "learning_rate": 4.708823529411765e-05, "loss": 0.2397, "step": 1050 }, { "epoch": 2.586713697824809, "grad_norm": 0.39246800541877747, "learning_rate": 4.635294117647059e-05, "loss": 0.2676, "step": 1100 }, { "epoch": 2.704291593180482, "grad_norm": 0.579401433467865, "learning_rate": 4.5617647058823535e-05, "loss": 0.2207, "step": 1150 }, { "epoch": 2.821869488536155, "grad_norm": 0.3959507942199707, "learning_rate": 4.4882352941176476e-05, "loss": 0.2247, "step": 1200 }, { "epoch": 2.9394473838918285, "grad_norm": 0.3841804265975952, "learning_rate": 4.414705882352941e-05, "loss": 0.2639, "step": 1250 }, { "epoch": 2.998236331569665, "eval_loss": 0.42491579055786133, "eval_runtime": 172.7491, "eval_samples_per_second": 1.094, "eval_steps_per_second": 0.139, "step": 1275 }, { "epoch": 3.0570252792475014, "grad_norm": 0.3887556791305542, "learning_rate": 4.341176470588236e-05, "loss": 0.2318, "step": 1300 }, { "epoch": 3.1746031746031744, "grad_norm": 0.2942847013473511, "learning_rate": 4.267647058823529e-05, "loss": 0.2351, "step": 1350 }, { "epoch": 3.292181069958848, "grad_norm": 0.41963356733322144, "learning_rate": 4.194117647058824e-05, "loss": 0.2449, "step": 1400 }, { "epoch": 3.409758965314521, "grad_norm": 0.41531622409820557, "learning_rate": 4.1205882352941176e-05, "loss": 0.2484, "step": 1450 }, { "epoch": 3.527336860670194, "grad_norm": 0.35044431686401367, "learning_rate": 4.0470588235294124e-05, "loss": 0.2331, "step": 1500 }, { "epoch": 3.6449147560258672, "grad_norm": 0.3828336000442505, "learning_rate": 3.973529411764706e-05, "loss": 0.2287, "step": 1550 }, { "epoch": 3.76249265138154, "grad_norm": 0.4572592079639435, "learning_rate": 3.9000000000000006e-05, "loss": 0.2297, "step": 1600 }, { "epoch": 3.8800705467372136, "grad_norm": 0.47478222846984863, "learning_rate": 3.826470588235294e-05, "loss": 0.2285, "step": 1650 }, { "epoch": 3.9976484420928866, "grad_norm": 0.39389339089393616, "learning_rate": 3.752941176470588e-05, "loss": 0.2096, "step": 1700 }, { "epoch": 4.0, "eval_loss": 0.4230487048625946, "eval_runtime": 172.772, "eval_samples_per_second": 1.094, "eval_steps_per_second": 0.139, "step": 1701 } ], "logging_steps": 50, "max_steps": 4250, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 8.215788830190797e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }