| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 4.0, |
| "eval_steps": 50, |
| "global_step": 1701, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.11757789535567313, |
| "grad_norm": 1.5434420108795166, |
| "learning_rate": 2.9411764705882355e-06, |
| "loss": 0.7456, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.23515579071134626, |
| "grad_norm": 0.963267982006073, |
| "learning_rate": 5.823529411764706e-06, |
| "loss": 0.5784, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.3527336860670194, |
| "grad_norm": 1.1193718910217285, |
| "learning_rate": 8.705882352941177e-06, |
| "loss": 0.3618, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.4703115814226925, |
| "grad_norm": 0.7008563876152039, |
| "learning_rate": 1.1647058823529412e-05, |
| "loss": 0.3389, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.5878894767783657, |
| "grad_norm": 0.6045801639556885, |
| "learning_rate": 1.4588235294117647e-05, |
| "loss": 0.3334, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.7054673721340388, |
| "grad_norm": 0.5706155300140381, |
| "learning_rate": 1.7529411764705884e-05, |
| "loss": 0.2923, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.823045267489712, |
| "grad_norm": 0.6757872104644775, |
| "learning_rate": 2.047058823529412e-05, |
| "loss": 0.2901, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.940623162845385, |
| "grad_norm": 0.5101656317710876, |
| "learning_rate": 2.3411764705882354e-05, |
| "loss": 0.2893, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.9994121105232217, |
| "eval_loss": 0.4672001004219055, |
| "eval_runtime": 174.9103, |
| "eval_samples_per_second": 1.081, |
| "eval_steps_per_second": 0.137, |
| "step": 425 |
| }, |
| { |
| "epoch": 1.0582010582010581, |
| "grad_norm": 0.5830298066139221, |
| "learning_rate": 2.6352941176470592e-05, |
| "loss": 0.2832, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.1757789535567313, |
| "grad_norm": 0.5612862706184387, |
| "learning_rate": 2.9294117647058827e-05, |
| "loss": 0.2486, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.2933568489124045, |
| "grad_norm": 0.5751911997795105, |
| "learning_rate": 3.223529411764706e-05, |
| "loss": 0.2924, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.4109347442680775, |
| "grad_norm": 0.6772521734237671, |
| "learning_rate": 3.5176470588235294e-05, |
| "loss": 0.2899, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.5285126396237507, |
| "grad_norm": 0.6256781220436096, |
| "learning_rate": 3.811764705882353e-05, |
| "loss": 0.2733, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.646090534979424, |
| "grad_norm": 0.7451750040054321, |
| "learning_rate": 4.1058823529411764e-05, |
| "loss": 0.2874, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.763668430335097, |
| "grad_norm": 0.6674547791481018, |
| "learning_rate": 4.4000000000000006e-05, |
| "loss": 0.2668, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.88124632569077, |
| "grad_norm": 0.5857487916946411, |
| "learning_rate": 4.694117647058824e-05, |
| "loss": 0.2699, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.9988242210464433, |
| "grad_norm": 0.4972004294395447, |
| "learning_rate": 4.9882352941176476e-05, |
| "loss": 0.2867, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.9988242210464433, |
| "eval_loss": 0.43864762783050537, |
| "eval_runtime": 172.7658, |
| "eval_samples_per_second": 1.094, |
| "eval_steps_per_second": 0.139, |
| "step": 850 |
| }, |
| { |
| "epoch": 2.1164021164021163, |
| "grad_norm": 0.43465057015419006, |
| "learning_rate": 4.929411764705882e-05, |
| "loss": 0.2874, |
| "step": 900 |
| }, |
| { |
| "epoch": 2.2339800117577897, |
| "grad_norm": 0.45625609159469604, |
| "learning_rate": 4.855882352941177e-05, |
| "loss": 0.2549, |
| "step": 950 |
| }, |
| { |
| "epoch": 2.3515579071134627, |
| "grad_norm": 0.4708680510520935, |
| "learning_rate": 4.7823529411764704e-05, |
| "loss": 0.2435, |
| "step": 1000 |
| }, |
| { |
| "epoch": 2.4691358024691357, |
| "grad_norm": 0.4648517370223999, |
| "learning_rate": 4.708823529411765e-05, |
| "loss": 0.2397, |
| "step": 1050 |
| }, |
| { |
| "epoch": 2.586713697824809, |
| "grad_norm": 0.39246800541877747, |
| "learning_rate": 4.635294117647059e-05, |
| "loss": 0.2676, |
| "step": 1100 |
| }, |
| { |
| "epoch": 2.704291593180482, |
| "grad_norm": 0.579401433467865, |
| "learning_rate": 4.5617647058823535e-05, |
| "loss": 0.2207, |
| "step": 1150 |
| }, |
| { |
| "epoch": 2.821869488536155, |
| "grad_norm": 0.3959507942199707, |
| "learning_rate": 4.4882352941176476e-05, |
| "loss": 0.2247, |
| "step": 1200 |
| }, |
| { |
| "epoch": 2.9394473838918285, |
| "grad_norm": 0.3841804265975952, |
| "learning_rate": 4.414705882352941e-05, |
| "loss": 0.2639, |
| "step": 1250 |
| }, |
| { |
| "epoch": 2.998236331569665, |
| "eval_loss": 0.42491579055786133, |
| "eval_runtime": 172.7491, |
| "eval_samples_per_second": 1.094, |
| "eval_steps_per_second": 0.139, |
| "step": 1275 |
| }, |
| { |
| "epoch": 3.0570252792475014, |
| "grad_norm": 0.3887556791305542, |
| "learning_rate": 4.341176470588236e-05, |
| "loss": 0.2318, |
| "step": 1300 |
| }, |
| { |
| "epoch": 3.1746031746031744, |
| "grad_norm": 0.2942847013473511, |
| "learning_rate": 4.267647058823529e-05, |
| "loss": 0.2351, |
| "step": 1350 |
| }, |
| { |
| "epoch": 3.292181069958848, |
| "grad_norm": 0.41963356733322144, |
| "learning_rate": 4.194117647058824e-05, |
| "loss": 0.2449, |
| "step": 1400 |
| }, |
| { |
| "epoch": 3.409758965314521, |
| "grad_norm": 0.41531622409820557, |
| "learning_rate": 4.1205882352941176e-05, |
| "loss": 0.2484, |
| "step": 1450 |
| }, |
| { |
| "epoch": 3.527336860670194, |
| "grad_norm": 0.35044431686401367, |
| "learning_rate": 4.0470588235294124e-05, |
| "loss": 0.2331, |
| "step": 1500 |
| }, |
| { |
| "epoch": 3.6449147560258672, |
| "grad_norm": 0.3828336000442505, |
| "learning_rate": 3.973529411764706e-05, |
| "loss": 0.2287, |
| "step": 1550 |
| }, |
| { |
| "epoch": 3.76249265138154, |
| "grad_norm": 0.4572592079639435, |
| "learning_rate": 3.9000000000000006e-05, |
| "loss": 0.2297, |
| "step": 1600 |
| }, |
| { |
| "epoch": 3.8800705467372136, |
| "grad_norm": 0.47478222846984863, |
| "learning_rate": 3.826470588235294e-05, |
| "loss": 0.2285, |
| "step": 1650 |
| }, |
| { |
| "epoch": 3.9976484420928866, |
| "grad_norm": 0.39389339089393616, |
| "learning_rate": 3.752941176470588e-05, |
| "loss": 0.2096, |
| "step": 1700 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_loss": 0.4230487048625946, |
| "eval_runtime": 172.772, |
| "eval_samples_per_second": 1.094, |
| "eval_steps_per_second": 0.139, |
| "step": 1701 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 4250, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 10, |
| "save_steps": 500, |
| "total_flos": 8.215788830190797e+17, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|