{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.0,
  "eval_steps": 50,
  "global_step": 1701,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.11757789535567313,
      "grad_norm": 1.5434420108795166,
      "learning_rate": 2.9411764705882355e-06,
      "loss": 0.7456,
      "step": 50
    },
    {
      "epoch": 0.23515579071134626,
      "grad_norm": 0.963267982006073,
      "learning_rate": 5.823529411764706e-06,
      "loss": 0.5784,
      "step": 100
    },
    {
      "epoch": 0.3527336860670194,
      "grad_norm": 1.1193718910217285,
      "learning_rate": 8.705882352941177e-06,
      "loss": 0.3618,
      "step": 150
    },
    {
      "epoch": 0.4703115814226925,
      "grad_norm": 0.7008563876152039,
      "learning_rate": 1.1647058823529412e-05,
      "loss": 0.3389,
      "step": 200
    },
    {
      "epoch": 0.5878894767783657,
      "grad_norm": 0.6045801639556885,
      "learning_rate": 1.4588235294117647e-05,
      "loss": 0.3334,
      "step": 250
    },
    {
      "epoch": 0.7054673721340388,
      "grad_norm": 0.5706155300140381,
      "learning_rate": 1.7529411764705884e-05,
      "loss": 0.2923,
      "step": 300
    },
    {
      "epoch": 0.823045267489712,
      "grad_norm": 0.6757872104644775,
      "learning_rate": 2.047058823529412e-05,
      "loss": 0.2901,
      "step": 350
    },
    {
      "epoch": 0.940623162845385,
      "grad_norm": 0.5101656317710876,
      "learning_rate": 2.3411764705882354e-05,
      "loss": 0.2893,
      "step": 400
    },
    {
      "epoch": 0.9994121105232217,
      "eval_loss": 0.4672001004219055,
      "eval_runtime": 174.9103,
      "eval_samples_per_second": 1.081,
      "eval_steps_per_second": 0.137,
      "step": 425
    },
    {
      "epoch": 1.0582010582010581,
      "grad_norm": 0.5830298066139221,
      "learning_rate": 2.6352941176470592e-05,
      "loss": 0.2832,
      "step": 450
    },
    {
      "epoch": 1.1757789535567313,
      "grad_norm": 0.5612862706184387,
      "learning_rate": 2.9294117647058827e-05,
      "loss": 0.2486,
      "step": 500
    },
    {
      "epoch": 1.2933568489124045,
      "grad_norm": 0.5751911997795105,
      "learning_rate": 3.223529411764706e-05,
      "loss": 0.2924,
      "step": 550
    },
    {
      "epoch": 1.4109347442680775,
      "grad_norm": 0.6772521734237671,
      "learning_rate": 3.5176470588235294e-05,
      "loss": 0.2899,
      "step": 600
    },
    {
      "epoch": 1.5285126396237507,
      "grad_norm": 0.6256781220436096,
      "learning_rate": 3.811764705882353e-05,
      "loss": 0.2733,
      "step": 650
    },
    {
      "epoch": 1.646090534979424,
      "grad_norm": 0.7451750040054321,
      "learning_rate": 4.1058823529411764e-05,
      "loss": 0.2874,
      "step": 700
    },
    {
      "epoch": 1.763668430335097,
      "grad_norm": 0.6674547791481018,
      "learning_rate": 4.4000000000000006e-05,
      "loss": 0.2668,
      "step": 750
    },
    {
      "epoch": 1.88124632569077,
      "grad_norm": 0.5857487916946411,
      "learning_rate": 4.694117647058824e-05,
      "loss": 0.2699,
      "step": 800
    },
    {
      "epoch": 1.9988242210464433,
      "grad_norm": 0.4972004294395447,
      "learning_rate": 4.9882352941176476e-05,
      "loss": 0.2867,
      "step": 850
    },
    {
      "epoch": 1.9988242210464433,
      "eval_loss": 0.43864762783050537,
      "eval_runtime": 172.7658,
      "eval_samples_per_second": 1.094,
      "eval_steps_per_second": 0.139,
      "step": 850
    },
    {
      "epoch": 2.1164021164021163,
      "grad_norm": 0.43465057015419006,
      "learning_rate": 4.929411764705882e-05,
      "loss": 0.2874,
      "step": 900
    },
    {
      "epoch": 2.2339800117577897,
      "grad_norm": 0.45625609159469604,
      "learning_rate": 4.855882352941177e-05,
      "loss": 0.2549,
      "step": 950
    },
    {
      "epoch": 2.3515579071134627,
      "grad_norm": 0.4708680510520935,
      "learning_rate": 4.7823529411764704e-05,
      "loss": 0.2435,
      "step": 1000
    },
    {
      "epoch": 2.4691358024691357,
      "grad_norm": 0.4648517370223999,
      "learning_rate": 4.708823529411765e-05,
      "loss": 0.2397,
      "step": 1050
    },
    {
      "epoch": 2.586713697824809,
      "grad_norm": 0.39246800541877747,
      "learning_rate": 4.635294117647059e-05,
      "loss": 0.2676,
      "step": 1100
    },
    {
      "epoch": 2.704291593180482,
      "grad_norm": 0.579401433467865,
      "learning_rate": 4.5617647058823535e-05,
      "loss": 0.2207,
      "step": 1150
    },
    {
      "epoch": 2.821869488536155,
      "grad_norm": 0.3959507942199707,
      "learning_rate": 4.4882352941176476e-05,
      "loss": 0.2247,
      "step": 1200
    },
    {
      "epoch": 2.9394473838918285,
      "grad_norm": 0.3841804265975952,
      "learning_rate": 4.414705882352941e-05,
      "loss": 0.2639,
      "step": 1250
    },
    {
      "epoch": 2.998236331569665,
      "eval_loss": 0.42491579055786133,
      "eval_runtime": 172.7491,
      "eval_samples_per_second": 1.094,
      "eval_steps_per_second": 0.139,
      "step": 1275
    },
    {
      "epoch": 3.0570252792475014,
      "grad_norm": 0.3887556791305542,
      "learning_rate": 4.341176470588236e-05,
      "loss": 0.2318,
      "step": 1300
    },
    {
      "epoch": 3.1746031746031744,
      "grad_norm": 0.2942847013473511,
      "learning_rate": 4.267647058823529e-05,
      "loss": 0.2351,
      "step": 1350
    },
    {
      "epoch": 3.292181069958848,
      "grad_norm": 0.41963356733322144,
      "learning_rate": 4.194117647058824e-05,
      "loss": 0.2449,
      "step": 1400
    },
    {
      "epoch": 3.409758965314521,
      "grad_norm": 0.41531622409820557,
      "learning_rate": 4.1205882352941176e-05,
      "loss": 0.2484,
      "step": 1450
    },
    {
      "epoch": 3.527336860670194,
      "grad_norm": 0.35044431686401367,
      "learning_rate": 4.0470588235294124e-05,
      "loss": 0.2331,
      "step": 1500
    },
    {
      "epoch": 3.6449147560258672,
      "grad_norm": 0.3828336000442505,
      "learning_rate": 3.973529411764706e-05,
      "loss": 0.2287,
      "step": 1550
    },
    {
      "epoch": 3.76249265138154,
      "grad_norm": 0.4572592079639435,
      "learning_rate": 3.9000000000000006e-05,
      "loss": 0.2297,
      "step": 1600
    },
    {
      "epoch": 3.8800705467372136,
      "grad_norm": 0.47478222846984863,
      "learning_rate": 3.826470588235294e-05,
      "loss": 0.2285,
      "step": 1650
    },
    {
      "epoch": 3.9976484420928866,
      "grad_norm": 0.39389339089393616,
      "learning_rate": 3.752941176470588e-05,
      "loss": 0.2096,
      "step": 1700
    },
    {
      "epoch": 4.0,
      "eval_loss": 0.4230487048625946,
      "eval_runtime": 172.772,
      "eval_samples_per_second": 1.094,
      "eval_steps_per_second": 0.139,
      "step": 1701
    }
  ],
  "logging_steps": 50,
  "max_steps": 4250,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 500,
  "total_flos": 8.215788830190797e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}