| { | |
| "best_global_step": 5900, | |
| "best_metric": 2.4210917949676514, | |
| "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_7x1024_mem32_bs64_hf_armt_dmem64/run_30/checkpoint-5000", | |
| "epoch": 0.12, | |
| "eval_steps": 100, | |
| "global_step": 6000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0005, | |
| "grad_norm": 2.624103276270124, | |
| "learning_rate": 4.8e-08, | |
| "loss": 4.0893, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.001, | |
| "grad_norm": 1.3629568986234561, | |
| "learning_rate": 9.8e-08, | |
| "loss": 3.9543, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.0015, | |
| "grad_norm": 0.8050128701430977, | |
| "learning_rate": 1.4800000000000003e-07, | |
| "loss": 3.6763, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.002, | |
| "grad_norm": 0.3690286383727022, | |
| "learning_rate": 1.9800000000000003e-07, | |
| "loss": 3.327, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.002, | |
| "eval_loss": 3.100055694580078, | |
| "eval_runtime": 32.7706, | |
| "eval_samples_per_second": 3.57, | |
| "eval_steps_per_second": 1.8, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.0025, | |
| "grad_norm": 0.24011694167100578, | |
| "learning_rate": 2.48e-07, | |
| "loss": 3.1322, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.003, | |
| "grad_norm": 0.149511940963387, | |
| "learning_rate": 2.9800000000000005e-07, | |
| "loss": 2.9672, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.0035, | |
| "grad_norm": 0.10071711520195754, | |
| "learning_rate": 3.48e-07, | |
| "loss": 2.8684, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.004, | |
| "grad_norm": 0.09695377414070089, | |
| "learning_rate": 3.9800000000000004e-07, | |
| "loss": 2.8244, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.004, | |
| "eval_loss": 2.7518060207366943, | |
| "eval_runtime": 32.9203, | |
| "eval_samples_per_second": 3.554, | |
| "eval_steps_per_second": 1.792, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.0045, | |
| "grad_norm": 0.06541174981920718, | |
| "learning_rate": 4.4800000000000004e-07, | |
| "loss": 2.7736, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.005, | |
| "grad_norm": 0.061297886999798934, | |
| "learning_rate": 4.98e-07, | |
| "loss": 2.7392, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.0055, | |
| "grad_norm": 0.07881073149840945, | |
| "learning_rate": 5.480000000000001e-07, | |
| "loss": 2.7194, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.006, | |
| "grad_norm": 0.05125386617161651, | |
| "learning_rate": 5.98e-07, | |
| "loss": 2.6982, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.006, | |
| "eval_loss": 2.6622018814086914, | |
| "eval_runtime": 32.9076, | |
| "eval_samples_per_second": 3.555, | |
| "eval_steps_per_second": 1.793, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.0065, | |
| "grad_norm": 0.04659366450077996, | |
| "learning_rate": 6.48e-07, | |
| "loss": 2.6725, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.007, | |
| "grad_norm": 0.04588097652548341, | |
| "learning_rate": 6.98e-07, | |
| "loss": 2.6592, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.0075, | |
| "grad_norm": 0.058421958212028904, | |
| "learning_rate": 7.480000000000001e-07, | |
| "loss": 2.6481, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.008, | |
| "grad_norm": 0.04289575736155661, | |
| "learning_rate": 7.98e-07, | |
| "loss": 2.6257, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.008, | |
| "eval_loss": 2.6052613258361816, | |
| "eval_runtime": 32.8227, | |
| "eval_samples_per_second": 3.565, | |
| "eval_steps_per_second": 1.798, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.0085, | |
| "grad_norm": 0.041602666338794385, | |
| "learning_rate": 8.480000000000001e-07, | |
| "loss": 2.6089, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.009, | |
| "grad_norm": 0.040090024026539266, | |
| "learning_rate": 8.980000000000001e-07, | |
| "loss": 2.5985, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.0095, | |
| "grad_norm": 0.05346463020318845, | |
| "learning_rate": 9.480000000000001e-07, | |
| "loss": 2.5858, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 0.03240197247016216, | |
| "learning_rate": 9.98e-07, | |
| "loss": 2.5773, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "eval_loss": 2.5677218437194824, | |
| "eval_runtime": 32.9146, | |
| "eval_samples_per_second": 3.555, | |
| "eval_steps_per_second": 1.793, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.0105, | |
| "grad_norm": 0.030627609315729644, | |
| "learning_rate": 1.0480000000000002e-06, | |
| "loss": 2.5695, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.011, | |
| "grad_norm": 0.03146801435404312, | |
| "learning_rate": 1.0980000000000001e-06, | |
| "loss": 2.558, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.0115, | |
| "grad_norm": 0.028453864143727626, | |
| "learning_rate": 1.148e-06, | |
| "loss": 2.5645, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.012, | |
| "grad_norm": 0.03026805511159676, | |
| "learning_rate": 1.1980000000000002e-06, | |
| "loss": 2.5645, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.012, | |
| "eval_loss": 2.546586275100708, | |
| "eval_runtime": 32.8424, | |
| "eval_samples_per_second": 3.562, | |
| "eval_steps_per_second": 1.796, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.0125, | |
| "grad_norm": 0.032033771539522, | |
| "learning_rate": 1.248e-06, | |
| "loss": 2.5424, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.013, | |
| "grad_norm": 0.0281966122475446, | |
| "learning_rate": 1.2980000000000001e-06, | |
| "loss": 2.5409, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.0135, | |
| "grad_norm": 0.02887428243284281, | |
| "learning_rate": 1.348e-06, | |
| "loss": 2.543, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 0.014, | |
| "grad_norm": 0.027672621753278132, | |
| "learning_rate": 1.3980000000000002e-06, | |
| "loss": 2.5385, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.014, | |
| "eval_loss": 2.530237913131714, | |
| "eval_runtime": 32.7994, | |
| "eval_samples_per_second": 3.567, | |
| "eval_steps_per_second": 1.799, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.0145, | |
| "grad_norm": 0.030815191380069624, | |
| "learning_rate": 1.4480000000000002e-06, | |
| "loss": 2.5302, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 0.015, | |
| "grad_norm": 0.0336387385604783, | |
| "learning_rate": 1.498e-06, | |
| "loss": 2.531, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.0155, | |
| "grad_norm": 0.02858543320323233, | |
| "learning_rate": 1.548e-06, | |
| "loss": 2.5184, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 0.016, | |
| "grad_norm": 0.028120393653995705, | |
| "learning_rate": 1.5980000000000002e-06, | |
| "loss": 2.5101, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.016, | |
| "eval_loss": 2.5182888507843018, | |
| "eval_runtime": 33.2135, | |
| "eval_samples_per_second": 3.523, | |
| "eval_steps_per_second": 1.776, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.0165, | |
| "grad_norm": 0.03014167593156162, | |
| "learning_rate": 1.6480000000000001e-06, | |
| "loss": 2.5232, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 0.017, | |
| "grad_norm": 0.028528349033195077, | |
| "learning_rate": 1.6980000000000003e-06, | |
| "loss": 2.5162, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.0175, | |
| "grad_norm": 0.031230193601244804, | |
| "learning_rate": 1.7480000000000002e-06, | |
| "loss": 2.4995, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 0.018, | |
| "grad_norm": 0.03555060954716827, | |
| "learning_rate": 1.798e-06, | |
| "loss": 2.5064, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.018, | |
| "eval_loss": 2.5070879459381104, | |
| "eval_runtime": 33.3807, | |
| "eval_samples_per_second": 3.505, | |
| "eval_steps_per_second": 1.767, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.0185, | |
| "grad_norm": 0.03561871969060444, | |
| "learning_rate": 1.8480000000000001e-06, | |
| "loss": 2.5004, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 0.019, | |
| "grad_norm": 0.03094584673111385, | |
| "learning_rate": 1.898e-06, | |
| "loss": 2.4959, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.0195, | |
| "grad_norm": 0.035545021685136444, | |
| "learning_rate": 1.9480000000000002e-06, | |
| "loss": 2.4982, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 0.0370422613473599, | |
| "learning_rate": 1.998e-06, | |
| "loss": 2.4927, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "eval_loss": 2.4966063499450684, | |
| "eval_runtime": 33.3038, | |
| "eval_samples_per_second": 3.513, | |
| "eval_steps_per_second": 1.772, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.0205, | |
| "grad_norm": 0.04199895036530391, | |
| "learning_rate": 2.048e-06, | |
| "loss": 2.4847, | |
| "step": 1025 | |
| }, | |
| { | |
| "epoch": 0.021, | |
| "grad_norm": 0.0384304039845165, | |
| "learning_rate": 2.098e-06, | |
| "loss": 2.4916, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.0215, | |
| "grad_norm": 0.03291684378446945, | |
| "learning_rate": 2.148e-06, | |
| "loss": 2.4891, | |
| "step": 1075 | |
| }, | |
| { | |
| "epoch": 0.022, | |
| "grad_norm": 0.03376054787167217, | |
| "learning_rate": 2.198e-06, | |
| "loss": 2.4896, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.022, | |
| "eval_loss": 2.488358974456787, | |
| "eval_runtime": 33.2437, | |
| "eval_samples_per_second": 3.519, | |
| "eval_steps_per_second": 1.775, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.0225, | |
| "grad_norm": 0.04001450258151374, | |
| "learning_rate": 2.2480000000000003e-06, | |
| "loss": 2.4855, | |
| "step": 1125 | |
| }, | |
| { | |
| "epoch": 0.023, | |
| "grad_norm": 0.036190398257348835, | |
| "learning_rate": 2.2980000000000003e-06, | |
| "loss": 2.4834, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.0235, | |
| "grad_norm": 0.03806535632489679, | |
| "learning_rate": 2.3480000000000002e-06, | |
| "loss": 2.48, | |
| "step": 1175 | |
| }, | |
| { | |
| "epoch": 0.024, | |
| "grad_norm": 0.039255476981030824, | |
| "learning_rate": 2.398e-06, | |
| "loss": 2.4853, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.024, | |
| "eval_loss": 2.481823205947876, | |
| "eval_runtime": 33.3121, | |
| "eval_samples_per_second": 3.512, | |
| "eval_steps_per_second": 1.771, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.0245, | |
| "grad_norm": 0.037361446323077335, | |
| "learning_rate": 2.448e-06, | |
| "loss": 2.4776, | |
| "step": 1225 | |
| }, | |
| { | |
| "epoch": 0.025, | |
| "grad_norm": 0.03410866644624654, | |
| "learning_rate": 2.498e-06, | |
| "loss": 2.4672, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.0255, | |
| "grad_norm": 0.03501276078614437, | |
| "learning_rate": 2.5480000000000004e-06, | |
| "loss": 2.4633, | |
| "step": 1275 | |
| }, | |
| { | |
| "epoch": 0.026, | |
| "grad_norm": 0.035383520468643466, | |
| "learning_rate": 2.598e-06, | |
| "loss": 2.4647, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.026, | |
| "eval_loss": 2.476562976837158, | |
| "eval_runtime": 33.4013, | |
| "eval_samples_per_second": 3.503, | |
| "eval_steps_per_second": 1.766, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.0265, | |
| "grad_norm": 0.03467179176189109, | |
| "learning_rate": 2.648e-06, | |
| "loss": 2.476, | |
| "step": 1325 | |
| }, | |
| { | |
| "epoch": 0.027, | |
| "grad_norm": 0.03925271631713796, | |
| "learning_rate": 2.6980000000000003e-06, | |
| "loss": 2.4675, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.0275, | |
| "grad_norm": 0.03419652940921129, | |
| "learning_rate": 2.748e-06, | |
| "loss": 2.4708, | |
| "step": 1375 | |
| }, | |
| { | |
| "epoch": 0.028, | |
| "grad_norm": 0.03764216373530557, | |
| "learning_rate": 2.798e-06, | |
| "loss": 2.4709, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.028, | |
| "eval_loss": 2.471618175506592, | |
| "eval_runtime": 33.0936, | |
| "eval_samples_per_second": 3.535, | |
| "eval_steps_per_second": 1.783, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.0285, | |
| "grad_norm": 0.03802047455035515, | |
| "learning_rate": 2.848e-06, | |
| "loss": 2.4608, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 0.029, | |
| "grad_norm": 0.03323072329115027, | |
| "learning_rate": 2.8980000000000005e-06, | |
| "loss": 2.4695, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.0295, | |
| "grad_norm": 0.03693054288365918, | |
| "learning_rate": 2.9480000000000004e-06, | |
| "loss": 2.4635, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 0.06509796100945928, | |
| "learning_rate": 2.9980000000000003e-06, | |
| "loss": 2.467, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "eval_loss": 2.467376232147217, | |
| "eval_runtime": 33.1827, | |
| "eval_samples_per_second": 3.526, | |
| "eval_steps_per_second": 1.778, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.0305, | |
| "grad_norm": 0.030120041993102375, | |
| "learning_rate": 3.0480000000000003e-06, | |
| "loss": 2.463, | |
| "step": 1525 | |
| }, | |
| { | |
| "epoch": 0.031, | |
| "grad_norm": 0.039881744916892024, | |
| "learning_rate": 3.0980000000000007e-06, | |
| "loss": 2.4533, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.0315, | |
| "grad_norm": 0.029950518864288997, | |
| "learning_rate": 3.1480000000000006e-06, | |
| "loss": 2.4585, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 0.032, | |
| "grad_norm": 0.07753499473514511, | |
| "learning_rate": 3.198e-06, | |
| "loss": 2.4502, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.032, | |
| "eval_loss": 2.4625656604766846, | |
| "eval_runtime": 33.2433, | |
| "eval_samples_per_second": 3.52, | |
| "eval_steps_per_second": 1.775, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.0325, | |
| "grad_norm": 0.048526204949902306, | |
| "learning_rate": 3.248e-06, | |
| "loss": 2.45, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 0.033, | |
| "grad_norm": 0.0378506235382453, | |
| "learning_rate": 3.298e-06, | |
| "loss": 2.4488, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.0335, | |
| "grad_norm": 0.03228564469275673, | |
| "learning_rate": 3.348e-06, | |
| "loss": 2.4568, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 0.034, | |
| "grad_norm": 0.03417826301349761, | |
| "learning_rate": 3.3980000000000003e-06, | |
| "loss": 2.4514, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.034, | |
| "eval_loss": 2.459094762802124, | |
| "eval_runtime": 33.1684, | |
| "eval_samples_per_second": 3.527, | |
| "eval_steps_per_second": 1.779, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.0345, | |
| "grad_norm": 0.03119990821359214, | |
| "learning_rate": 3.4480000000000003e-06, | |
| "loss": 2.4447, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 0.035, | |
| "grad_norm": 0.032737257559355144, | |
| "learning_rate": 3.4980000000000002e-06, | |
| "loss": 2.4531, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.0355, | |
| "grad_norm": 0.03341768726028273, | |
| "learning_rate": 3.548e-06, | |
| "loss": 2.4476, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 0.036, | |
| "grad_norm": 0.03225090122428514, | |
| "learning_rate": 3.5980000000000005e-06, | |
| "loss": 2.4403, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.036, | |
| "eval_loss": 2.455217123031616, | |
| "eval_runtime": 32.9783, | |
| "eval_samples_per_second": 3.548, | |
| "eval_steps_per_second": 1.789, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.0365, | |
| "grad_norm": 0.030979620558740147, | |
| "learning_rate": 3.6480000000000005e-06, | |
| "loss": 2.4379, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 0.037, | |
| "grad_norm": 0.04044689712503281, | |
| "learning_rate": 3.6980000000000004e-06, | |
| "loss": 2.455, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.0375, | |
| "grad_norm": 0.034557037951751954, | |
| "learning_rate": 3.7480000000000004e-06, | |
| "loss": 2.4517, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 0.038, | |
| "grad_norm": 0.02821125825480679, | |
| "learning_rate": 3.7980000000000007e-06, | |
| "loss": 2.4429, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.038, | |
| "eval_loss": 2.4529292583465576, | |
| "eval_runtime": 33.4058, | |
| "eval_samples_per_second": 3.502, | |
| "eval_steps_per_second": 1.766, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.0385, | |
| "grad_norm": 0.029890640830031213, | |
| "learning_rate": 3.848e-06, | |
| "loss": 2.4437, | |
| "step": 1925 | |
| }, | |
| { | |
| "epoch": 0.039, | |
| "grad_norm": 0.03265759623511245, | |
| "learning_rate": 3.898e-06, | |
| "loss": 2.438, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.0395, | |
| "grad_norm": 0.10385356338699042, | |
| "learning_rate": 3.948e-06, | |
| "loss": 2.4442, | |
| "step": 1975 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 0.03233294644174686, | |
| "learning_rate": 3.9980000000000005e-06, | |
| "loss": 2.4451, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "eval_loss": 2.450512647628784, | |
| "eval_runtime": 33.274, | |
| "eval_samples_per_second": 3.516, | |
| "eval_steps_per_second": 1.773, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.0405, | |
| "grad_norm": 0.034945541932647324, | |
| "learning_rate": 4.048e-06, | |
| "loss": 2.4357, | |
| "step": 2025 | |
| }, | |
| { | |
| "epoch": 0.041, | |
| "grad_norm": 0.029322959861707003, | |
| "learning_rate": 4.098e-06, | |
| "loss": 2.4373, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.0415, | |
| "grad_norm": 0.027365033479394632, | |
| "learning_rate": 4.148000000000001e-06, | |
| "loss": 2.442, | |
| "step": 2075 | |
| }, | |
| { | |
| "epoch": 0.042, | |
| "grad_norm": 0.042214130565513416, | |
| "learning_rate": 4.198e-06, | |
| "loss": 2.4362, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.042, | |
| "eval_loss": 2.448322296142578, | |
| "eval_runtime": 33.466, | |
| "eval_samples_per_second": 3.496, | |
| "eval_steps_per_second": 1.763, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.0425, | |
| "grad_norm": 0.028874346576168566, | |
| "learning_rate": 4.248000000000001e-06, | |
| "loss": 2.4428, | |
| "step": 2125 | |
| }, | |
| { | |
| "epoch": 0.043, | |
| "grad_norm": 0.029771861998040296, | |
| "learning_rate": 4.298e-06, | |
| "loss": 2.4298, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.0435, | |
| "grad_norm": 0.029668415484575914, | |
| "learning_rate": 4.3480000000000006e-06, | |
| "loss": 2.4352, | |
| "step": 2175 | |
| }, | |
| { | |
| "epoch": 0.044, | |
| "grad_norm": 0.02564927582570633, | |
| "learning_rate": 4.398000000000001e-06, | |
| "loss": 2.4349, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.044, | |
| "eval_loss": 2.4465889930725098, | |
| "eval_runtime": 33.3555, | |
| "eval_samples_per_second": 3.508, | |
| "eval_steps_per_second": 1.769, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.0445, | |
| "grad_norm": 0.024797235968250814, | |
| "learning_rate": 4.4480000000000004e-06, | |
| "loss": 2.4409, | |
| "step": 2225 | |
| }, | |
| { | |
| "epoch": 0.045, | |
| "grad_norm": 0.02813189377877088, | |
| "learning_rate": 4.498e-06, | |
| "loss": 2.4367, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.0455, | |
| "grad_norm": 0.02750903211389184, | |
| "learning_rate": 4.548e-06, | |
| "loss": 2.4326, | |
| "step": 2275 | |
| }, | |
| { | |
| "epoch": 0.046, | |
| "grad_norm": 0.027737559952553607, | |
| "learning_rate": 4.598e-06, | |
| "loss": 2.4375, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.046, | |
| "eval_loss": 2.4448626041412354, | |
| "eval_runtime": 33.2658, | |
| "eval_samples_per_second": 3.517, | |
| "eval_steps_per_second": 1.774, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.0465, | |
| "grad_norm": 0.02630663299301831, | |
| "learning_rate": 4.648e-06, | |
| "loss": 2.4392, | |
| "step": 2325 | |
| }, | |
| { | |
| "epoch": 0.047, | |
| "grad_norm": 0.027929449055597393, | |
| "learning_rate": 4.698000000000001e-06, | |
| "loss": 2.4256, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.0475, | |
| "grad_norm": 0.0283193243102273, | |
| "learning_rate": 4.748e-06, | |
| "loss": 2.429, | |
| "step": 2375 | |
| }, | |
| { | |
| "epoch": 0.048, | |
| "grad_norm": 0.029295313451333963, | |
| "learning_rate": 4.7980000000000005e-06, | |
| "loss": 2.4393, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.048, | |
| "eval_loss": 2.4432175159454346, | |
| "eval_runtime": 33.3067, | |
| "eval_samples_per_second": 3.513, | |
| "eval_steps_per_second": 1.771, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.0485, | |
| "grad_norm": 0.025382897552394503, | |
| "learning_rate": 4.848000000000001e-06, | |
| "loss": 2.4322, | |
| "step": 2425 | |
| }, | |
| { | |
| "epoch": 0.049, | |
| "grad_norm": 0.02450548193909556, | |
| "learning_rate": 4.898e-06, | |
| "loss": 2.4314, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.0495, | |
| "grad_norm": 0.033065483070063684, | |
| "learning_rate": 4.948000000000001e-06, | |
| "loss": 2.4338, | |
| "step": 2475 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 0.027543894857825314, | |
| "learning_rate": 4.998e-06, | |
| "loss": 2.4333, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "eval_loss": 2.441807985305786, | |
| "eval_runtime": 33.0379, | |
| "eval_samples_per_second": 3.541, | |
| "eval_steps_per_second": 1.786, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.0505, | |
| "grad_norm": 0.027354239436717945, | |
| "learning_rate": 5.048000000000001e-06, | |
| "loss": 2.439, | |
| "step": 2525 | |
| }, | |
| { | |
| "epoch": 0.051, | |
| "grad_norm": 0.022458884368301627, | |
| "learning_rate": 5.098000000000001e-06, | |
| "loss": 2.427, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.0515, | |
| "grad_norm": 0.033350881745701555, | |
| "learning_rate": 5.1480000000000005e-06, | |
| "loss": 2.4275, | |
| "step": 2575 | |
| }, | |
| { | |
| "epoch": 0.052, | |
| "grad_norm": 0.025032545530163004, | |
| "learning_rate": 5.198000000000001e-06, | |
| "loss": 2.4275, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.052, | |
| "eval_loss": 2.440882444381714, | |
| "eval_runtime": 33.1835, | |
| "eval_samples_per_second": 3.526, | |
| "eval_steps_per_second": 1.778, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.0525, | |
| "grad_norm": 0.026294170044068685, | |
| "learning_rate": 5.248000000000001e-06, | |
| "loss": 2.4312, | |
| "step": 2625 | |
| }, | |
| { | |
| "epoch": 0.053, | |
| "grad_norm": 0.03301155351988982, | |
| "learning_rate": 5.298000000000001e-06, | |
| "loss": 2.4203, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.0535, | |
| "grad_norm": 0.02389586194961339, | |
| "learning_rate": 5.348000000000001e-06, | |
| "loss": 2.4332, | |
| "step": 2675 | |
| }, | |
| { | |
| "epoch": 0.054, | |
| "grad_norm": 0.056862279743176244, | |
| "learning_rate": 5.398e-06, | |
| "loss": 2.4313, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.054, | |
| "eval_loss": 2.4402644634246826, | |
| "eval_runtime": 33.2071, | |
| "eval_samples_per_second": 3.523, | |
| "eval_steps_per_second": 1.777, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.0545, | |
| "grad_norm": 0.025636671246445756, | |
| "learning_rate": 5.448e-06, | |
| "loss": 2.4311, | |
| "step": 2725 | |
| }, | |
| { | |
| "epoch": 0.055, | |
| "grad_norm": 0.022083605910153424, | |
| "learning_rate": 5.498e-06, | |
| "loss": 2.4357, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.0555, | |
| "grad_norm": 0.024223735712298522, | |
| "learning_rate": 5.548e-06, | |
| "loss": 2.4294, | |
| "step": 2775 | |
| }, | |
| { | |
| "epoch": 0.056, | |
| "grad_norm": 0.029847698463432104, | |
| "learning_rate": 5.5980000000000004e-06, | |
| "loss": 2.4344, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.056, | |
| "eval_loss": 2.4389007091522217, | |
| "eval_runtime": 33.2705, | |
| "eval_samples_per_second": 3.517, | |
| "eval_steps_per_second": 1.773, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.0565, | |
| "grad_norm": 0.032144633236930065, | |
| "learning_rate": 5.648e-06, | |
| "loss": 2.4282, | |
| "step": 2825 | |
| }, | |
| { | |
| "epoch": 0.057, | |
| "grad_norm": 0.02355863809037046, | |
| "learning_rate": 5.698e-06, | |
| "loss": 2.4322, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.0575, | |
| "grad_norm": 0.023728744427970416, | |
| "learning_rate": 5.748e-06, | |
| "loss": 2.4286, | |
| "step": 2875 | |
| }, | |
| { | |
| "epoch": 0.058, | |
| "grad_norm": 0.025539915034515293, | |
| "learning_rate": 5.798e-06, | |
| "loss": 2.4287, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.058, | |
| "eval_loss": 2.4376914501190186, | |
| "eval_runtime": 33.3179, | |
| "eval_samples_per_second": 3.512, | |
| "eval_steps_per_second": 1.771, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.0585, | |
| "grad_norm": 0.023457547558388747, | |
| "learning_rate": 5.848000000000001e-06, | |
| "loss": 2.4289, | |
| "step": 2925 | |
| }, | |
| { | |
| "epoch": 0.059, | |
| "grad_norm": 0.025297710201421797, | |
| "learning_rate": 5.898e-06, | |
| "loss": 2.4274, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.0595, | |
| "grad_norm": 0.024155176530161276, | |
| "learning_rate": 5.9480000000000005e-06, | |
| "loss": 2.4169, | |
| "step": 2975 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 0.023954841726960448, | |
| "learning_rate": 5.998000000000001e-06, | |
| "loss": 2.4244, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "eval_loss": 2.436969041824341, | |
| "eval_runtime": 33.2713, | |
| "eval_samples_per_second": 3.517, | |
| "eval_steps_per_second": 1.773, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.0605, | |
| "grad_norm": 0.025507916252978883, | |
| "learning_rate": 6.048e-06, | |
| "loss": 2.4192, | |
| "step": 3025 | |
| }, | |
| { | |
| "epoch": 0.061, | |
| "grad_norm": 0.02126046028834697, | |
| "learning_rate": 6.098000000000001e-06, | |
| "loss": 2.4233, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.0615, | |
| "grad_norm": 0.026235681014214807, | |
| "learning_rate": 6.148e-06, | |
| "loss": 2.4215, | |
| "step": 3075 | |
| }, | |
| { | |
| "epoch": 0.062, | |
| "grad_norm": 0.026243691288249413, | |
| "learning_rate": 6.198000000000001e-06, | |
| "loss": 2.4134, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.062, | |
| "eval_loss": 2.435988664627075, | |
| "eval_runtime": 33.0276, | |
| "eval_samples_per_second": 3.542, | |
| "eval_steps_per_second": 1.786, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.0625, | |
| "grad_norm": 0.02496599291141367, | |
| "learning_rate": 6.248000000000001e-06, | |
| "loss": 2.4241, | |
| "step": 3125 | |
| }, | |
| { | |
| "epoch": 0.063, | |
| "grad_norm": 0.0236951365360608, | |
| "learning_rate": 6.2980000000000005e-06, | |
| "loss": 2.4252, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.0635, | |
| "grad_norm": 0.022752035914773892, | |
| "learning_rate": 6.348000000000001e-06, | |
| "loss": 2.4244, | |
| "step": 3175 | |
| }, | |
| { | |
| "epoch": 0.064, | |
| "grad_norm": 0.021656953860252137, | |
| "learning_rate": 6.398000000000001e-06, | |
| "loss": 2.4227, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.064, | |
| "eval_loss": 2.43520450592041, | |
| "eval_runtime": 33.136, | |
| "eval_samples_per_second": 3.531, | |
| "eval_steps_per_second": 1.781, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.0645, | |
| "grad_norm": 0.021188520683488872, | |
| "learning_rate": 6.448000000000001e-06, | |
| "loss": 2.4248, | |
| "step": 3225 | |
| }, | |
| { | |
| "epoch": 0.065, | |
| "grad_norm": 0.02274972468402099, | |
| "learning_rate": 6.498000000000001e-06, | |
| "loss": 2.4215, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.0655, | |
| "grad_norm": 0.024046700552500286, | |
| "learning_rate": 6.548000000000001e-06, | |
| "loss": 2.4169, | |
| "step": 3275 | |
| }, | |
| { | |
| "epoch": 0.066, | |
| "grad_norm": 0.022071385618052216, | |
| "learning_rate": 6.598000000000001e-06, | |
| "loss": 2.4199, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.066, | |
| "eval_loss": 2.4344840049743652, | |
| "eval_runtime": 33.1729, | |
| "eval_samples_per_second": 3.527, | |
| "eval_steps_per_second": 1.779, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.0665, | |
| "grad_norm": 0.02931021842271797, | |
| "learning_rate": 6.648e-06, | |
| "loss": 2.4253, | |
| "step": 3325 | |
| }, | |
| { | |
| "epoch": 0.067, | |
| "grad_norm": 0.021754527434557868, | |
| "learning_rate": 6.698e-06, | |
| "loss": 2.4281, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.0675, | |
| "grad_norm": 0.022651522972508432, | |
| "learning_rate": 6.7480000000000004e-06, | |
| "loss": 2.4208, | |
| "step": 3375 | |
| }, | |
| { | |
| "epoch": 0.068, | |
| "grad_norm": 0.022676405563792287, | |
| "learning_rate": 6.798e-06, | |
| "loss": 2.4222, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.068, | |
| "eval_loss": 2.43371844291687, | |
| "eval_runtime": 33.1293, | |
| "eval_samples_per_second": 3.532, | |
| "eval_steps_per_second": 1.781, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.0685, | |
| "grad_norm": 0.021100680573628707, | |
| "learning_rate": 6.848e-06, | |
| "loss": 2.4243, | |
| "step": 3425 | |
| }, | |
| { | |
| "epoch": 0.069, | |
| "grad_norm": 0.02101417038408147, | |
| "learning_rate": 6.898e-06, | |
| "loss": 2.4242, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.0695, | |
| "grad_norm": 0.022432735157488455, | |
| "learning_rate": 6.948e-06, | |
| "loss": 2.4224, | |
| "step": 3475 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 0.02164716008760555, | |
| "learning_rate": 6.998000000000001e-06, | |
| "loss": 2.4202, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "eval_loss": 2.433281898498535, | |
| "eval_runtime": 33.0783, | |
| "eval_samples_per_second": 3.537, | |
| "eval_steps_per_second": 1.784, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.0705, | |
| "grad_norm": 0.022412840176404082, | |
| "learning_rate": 7.048e-06, | |
| "loss": 2.4184, | |
| "step": 3525 | |
| }, | |
| { | |
| "epoch": 0.071, | |
| "grad_norm": 0.025300113537910857, | |
| "learning_rate": 7.0980000000000005e-06, | |
| "loss": 2.421, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.0715, | |
| "grad_norm": 0.022085711512698558, | |
| "learning_rate": 7.148000000000001e-06, | |
| "loss": 2.415, | |
| "step": 3575 | |
| }, | |
| { | |
| "epoch": 0.072, | |
| "grad_norm": 0.021041258769866313, | |
| "learning_rate": 7.198e-06, | |
| "loss": 2.4157, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.072, | |
| "eval_loss": 2.4324123859405518, | |
| "eval_runtime": 34.1633, | |
| "eval_samples_per_second": 3.425, | |
| "eval_steps_per_second": 1.727, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.0725, | |
| "grad_norm": 0.021694681795354324, | |
| "learning_rate": 7.248000000000001e-06, | |
| "loss": 2.4152, | |
| "step": 3625 | |
| }, | |
| { | |
| "epoch": 0.073, | |
| "grad_norm": 0.03056130171104773, | |
| "learning_rate": 7.298e-06, | |
| "loss": 2.4151, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.0735, | |
| "grad_norm": 0.02112814663770162, | |
| "learning_rate": 7.348000000000001e-06, | |
| "loss": 2.4163, | |
| "step": 3675 | |
| }, | |
| { | |
| "epoch": 0.074, | |
| "grad_norm": 0.024883267721069864, | |
| "learning_rate": 7.398000000000001e-06, | |
| "loss": 2.4258, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.074, | |
| "eval_loss": 2.4319984912872314, | |
| "eval_runtime": 33.2699, | |
| "eval_samples_per_second": 3.517, | |
| "eval_steps_per_second": 1.773, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.0745, | |
| "grad_norm": 0.02062910451612879, | |
| "learning_rate": 7.4480000000000005e-06, | |
| "loss": 2.4263, | |
| "step": 3725 | |
| }, | |
| { | |
| "epoch": 0.075, | |
| "grad_norm": 0.021068085012478772, | |
| "learning_rate": 7.498000000000001e-06, | |
| "loss": 2.4216, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.0755, | |
| "grad_norm": 0.020665118516629687, | |
| "learning_rate": 7.548000000000001e-06, | |
| "loss": 2.4285, | |
| "step": 3775 | |
| }, | |
| { | |
| "epoch": 0.076, | |
| "grad_norm": 0.02425992757924128, | |
| "learning_rate": 7.598000000000001e-06, | |
| "loss": 2.4174, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.076, | |
| "eval_loss": 2.4310436248779297, | |
| "eval_runtime": 35.0728, | |
| "eval_samples_per_second": 3.336, | |
| "eval_steps_per_second": 1.682, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.0765, | |
| "grad_norm": 0.021337004595007786, | |
| "learning_rate": 7.648e-06, | |
| "loss": 2.4303, | |
| "step": 3825 | |
| }, | |
| { | |
| "epoch": 0.077, | |
| "grad_norm": 0.020168500131750186, | |
| "learning_rate": 7.698000000000002e-06, | |
| "loss": 2.4298, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.0775, | |
| "grad_norm": 0.020089032493824672, | |
| "learning_rate": 7.748000000000001e-06, | |
| "loss": 2.4151, | |
| "step": 3875 | |
| }, | |
| { | |
| "epoch": 0.078, | |
| "grad_norm": 0.02462630071931115, | |
| "learning_rate": 7.798e-06, | |
| "loss": 2.4235, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.078, | |
| "eval_loss": 2.431330442428589, | |
| "eval_runtime": 33.093, | |
| "eval_samples_per_second": 3.535, | |
| "eval_steps_per_second": 1.783, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.0785, | |
| "grad_norm": 0.0226705620922379, | |
| "learning_rate": 7.848000000000002e-06, | |
| "loss": 2.4185, | |
| "step": 3925 | |
| }, | |
| { | |
| "epoch": 0.079, | |
| "grad_norm": 0.022075041269811142, | |
| "learning_rate": 7.898e-06, | |
| "loss": 2.4344, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.0795, | |
| "grad_norm": 0.03932607113814955, | |
| "learning_rate": 7.948e-06, | |
| "loss": 2.4228, | |
| "step": 3975 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 0.020604342831921824, | |
| "learning_rate": 7.998e-06, | |
| "loss": 2.4289, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "eval_loss": 2.430954933166504, | |
| "eval_runtime": 33.1216, | |
| "eval_samples_per_second": 3.532, | |
| "eval_steps_per_second": 1.781, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.0805, | |
| "grad_norm": 0.021865944897834468, | |
| "learning_rate": 8.048e-06, | |
| "loss": 2.4283, | |
| "step": 4025 | |
| }, | |
| { | |
| "epoch": 0.081, | |
| "grad_norm": 0.020393010409248808, | |
| "learning_rate": 8.098000000000001e-06, | |
| "loss": 2.4142, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.0815, | |
| "grad_norm": 0.02279155824698799, | |
| "learning_rate": 8.148e-06, | |
| "loss": 2.4208, | |
| "step": 4075 | |
| }, | |
| { | |
| "epoch": 0.082, | |
| "grad_norm": 0.021110562493101104, | |
| "learning_rate": 8.198e-06, | |
| "loss": 2.4093, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.082, | |
| "eval_loss": 2.4299628734588623, | |
| "eval_runtime": 33.2215, | |
| "eval_samples_per_second": 3.522, | |
| "eval_steps_per_second": 1.776, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.0825, | |
| "grad_norm": 0.019752507861163327, | |
| "learning_rate": 8.248e-06, | |
| "loss": 2.4073, | |
| "step": 4125 | |
| }, | |
| { | |
| "epoch": 0.083, | |
| "grad_norm": 0.019897433088879975, | |
| "learning_rate": 8.298000000000001e-06, | |
| "loss": 2.4129, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.0835, | |
| "grad_norm": 0.02275241957806373, | |
| "learning_rate": 8.348e-06, | |
| "loss": 2.4243, | |
| "step": 4175 | |
| }, | |
| { | |
| "epoch": 0.084, | |
| "grad_norm": 0.02009113389579191, | |
| "learning_rate": 8.398e-06, | |
| "loss": 2.4138, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.084, | |
| "eval_loss": 2.4301230907440186, | |
| "eval_runtime": 33.0641, | |
| "eval_samples_per_second": 3.539, | |
| "eval_steps_per_second": 1.784, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.0845, | |
| "grad_norm": 0.021259070586902896, | |
| "learning_rate": 8.448000000000001e-06, | |
| "loss": 2.4212, | |
| "step": 4225 | |
| }, | |
| { | |
| "epoch": 0.085, | |
| "grad_norm": 0.021461643865178466, | |
| "learning_rate": 8.498e-06, | |
| "loss": 2.4242, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.0855, | |
| "grad_norm": 0.02129278617109427, | |
| "learning_rate": 8.548e-06, | |
| "loss": 2.4153, | |
| "step": 4275 | |
| }, | |
| { | |
| "epoch": 0.086, | |
| "grad_norm": 0.019884381961586706, | |
| "learning_rate": 8.598000000000001e-06, | |
| "loss": 2.4107, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.086, | |
| "eval_loss": 2.429638385772705, | |
| "eval_runtime": 33.1452, | |
| "eval_samples_per_second": 3.53, | |
| "eval_steps_per_second": 1.78, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.0865, | |
| "grad_norm": 0.02127578557225418, | |
| "learning_rate": 8.648000000000001e-06, | |
| "loss": 2.4202, | |
| "step": 4325 | |
| }, | |
| { | |
| "epoch": 0.087, | |
| "grad_norm": 0.021749788475476855, | |
| "learning_rate": 8.698e-06, | |
| "loss": 2.4274, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.0875, | |
| "grad_norm": 0.021521494708913836, | |
| "learning_rate": 8.748000000000002e-06, | |
| "loss": 2.4189, | |
| "step": 4375 | |
| }, | |
| { | |
| "epoch": 0.088, | |
| "grad_norm": 0.021276426458537334, | |
| "learning_rate": 8.798000000000001e-06, | |
| "loss": 2.4152, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.088, | |
| "eval_loss": 2.4292917251586914, | |
| "eval_runtime": 33.1057, | |
| "eval_samples_per_second": 3.534, | |
| "eval_steps_per_second": 1.782, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.0885, | |
| "grad_norm": 0.019843371943772815, | |
| "learning_rate": 8.848e-06, | |
| "loss": 2.421, | |
| "step": 4425 | |
| }, | |
| { | |
| "epoch": 0.089, | |
| "grad_norm": 0.02031045171970109, | |
| "learning_rate": 8.898000000000002e-06, | |
| "loss": 2.4201, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.0895, | |
| "grad_norm": 0.018642717079241176, | |
| "learning_rate": 8.948000000000001e-06, | |
| "loss": 2.4171, | |
| "step": 4475 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 0.021016901396559935, | |
| "learning_rate": 8.998000000000001e-06, | |
| "loss": 2.4257, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "eval_loss": 2.4288113117218018, | |
| "eval_runtime": 33.1217, | |
| "eval_samples_per_second": 3.532, | |
| "eval_steps_per_second": 1.781, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.0905, | |
| "grad_norm": 0.021595090834222327, | |
| "learning_rate": 9.048e-06, | |
| "loss": 2.4209, | |
| "step": 4525 | |
| }, | |
| { | |
| "epoch": 0.091, | |
| "grad_norm": 0.020500341653961213, | |
| "learning_rate": 9.098000000000002e-06, | |
| "loss": 2.4093, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 0.0915, | |
| "grad_norm": 0.021134665935359346, | |
| "learning_rate": 9.148e-06, | |
| "loss": 2.4238, | |
| "step": 4575 | |
| }, | |
| { | |
| "epoch": 0.092, | |
| "grad_norm": 0.018064298488706988, | |
| "learning_rate": 9.198e-06, | |
| "loss": 2.4163, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.092, | |
| "eval_loss": 2.428257465362549, | |
| "eval_runtime": 33.451, | |
| "eval_samples_per_second": 3.498, | |
| "eval_steps_per_second": 1.764, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.0925, | |
| "grad_norm": 0.019704962175624032, | |
| "learning_rate": 9.248e-06, | |
| "loss": 2.4082, | |
| "step": 4625 | |
| }, | |
| { | |
| "epoch": 0.093, | |
| "grad_norm": 0.019712333508134283, | |
| "learning_rate": 9.298e-06, | |
| "loss": 2.4089, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 0.0935, | |
| "grad_norm": 0.021269463834833153, | |
| "learning_rate": 9.348000000000001e-06, | |
| "loss": 2.408, | |
| "step": 4675 | |
| }, | |
| { | |
| "epoch": 0.094, | |
| "grad_norm": 0.021278662940784676, | |
| "learning_rate": 9.398e-06, | |
| "loss": 2.4189, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.094, | |
| "eval_loss": 2.4279165267944336, | |
| "eval_runtime": 33.1606, | |
| "eval_samples_per_second": 3.528, | |
| "eval_steps_per_second": 1.779, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.0945, | |
| "grad_norm": 0.018504564797986272, | |
| "learning_rate": 9.448e-06, | |
| "loss": 2.4254, | |
| "step": 4725 | |
| }, | |
| { | |
| "epoch": 0.095, | |
| "grad_norm": 0.01917099113509997, | |
| "learning_rate": 9.498000000000001e-06, | |
| "loss": 2.411, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.0955, | |
| "grad_norm": 0.019097394482211122, | |
| "learning_rate": 9.548e-06, | |
| "loss": 2.4209, | |
| "step": 4775 | |
| }, | |
| { | |
| "epoch": 0.096, | |
| "grad_norm": 0.020220692469392707, | |
| "learning_rate": 9.598e-06, | |
| "loss": 2.4066, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.096, | |
| "eval_loss": 2.4273650646209717, | |
| "eval_runtime": 33.1079, | |
| "eval_samples_per_second": 3.534, | |
| "eval_steps_per_second": 1.782, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.0965, | |
| "grad_norm": 0.019607148490934756, | |
| "learning_rate": 9.648000000000001e-06, | |
| "loss": 2.4132, | |
| "step": 4825 | |
| }, | |
| { | |
| "epoch": 0.097, | |
| "grad_norm": 0.019388710503851023, | |
| "learning_rate": 9.698000000000001e-06, | |
| "loss": 2.4096, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 0.0975, | |
| "grad_norm": 0.019593746411763164, | |
| "learning_rate": 9.748e-06, | |
| "loss": 2.4064, | |
| "step": 4875 | |
| }, | |
| { | |
| "epoch": 0.098, | |
| "grad_norm": 0.018761734791343965, | |
| "learning_rate": 9.798e-06, | |
| "loss": 2.4033, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.098, | |
| "eval_loss": 2.4270286560058594, | |
| "eval_runtime": 33.0269, | |
| "eval_samples_per_second": 3.543, | |
| "eval_steps_per_second": 1.786, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.0985, | |
| "grad_norm": 0.018964507342139367, | |
| "learning_rate": 9.848000000000001e-06, | |
| "loss": 2.4211, | |
| "step": 4925 | |
| }, | |
| { | |
| "epoch": 0.099, | |
| "grad_norm": 0.01858861943184826, | |
| "learning_rate": 9.898e-06, | |
| "loss": 2.4032, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 0.0995, | |
| "grad_norm": 0.01821023564956819, | |
| "learning_rate": 9.948e-06, | |
| "loss": 2.4031, | |
| "step": 4975 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 0.018839474555921314, | |
| "learning_rate": 9.998000000000002e-06, | |
| "loss": 2.4112, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "eval_loss": 2.426590919494629, | |
| "eval_runtime": 33.0133, | |
| "eval_samples_per_second": 3.544, | |
| "eval_steps_per_second": 1.787, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.1005, | |
| "grad_norm": 0.0187590945164155, | |
| "learning_rate": 9.994666666666668e-06, | |
| "loss": 2.4164, | |
| "step": 5025 | |
| }, | |
| { | |
| "epoch": 0.101, | |
| "grad_norm": 0.018683158146542603, | |
| "learning_rate": 9.989111111111111e-06, | |
| "loss": 2.4082, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 0.1015, | |
| "grad_norm": 0.017610949419625762, | |
| "learning_rate": 9.983555555555556e-06, | |
| "loss": 2.4124, | |
| "step": 5075 | |
| }, | |
| { | |
| "epoch": 0.102, | |
| "grad_norm": 0.01862298073358942, | |
| "learning_rate": 9.978000000000002e-06, | |
| "loss": 2.409, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.102, | |
| "eval_loss": 2.425841808319092, | |
| "eval_runtime": 33.063, | |
| "eval_samples_per_second": 3.539, | |
| "eval_steps_per_second": 1.784, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.1025, | |
| "grad_norm": 0.025407800531065724, | |
| "learning_rate": 9.972444444444445e-06, | |
| "loss": 2.4051, | |
| "step": 5125 | |
| }, | |
| { | |
| "epoch": 0.103, | |
| "grad_norm": 0.01838713779514561, | |
| "learning_rate": 9.966888888888889e-06, | |
| "loss": 2.4105, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 0.1035, | |
| "grad_norm": 0.018921321521659856, | |
| "learning_rate": 9.961333333333334e-06, | |
| "loss": 2.4191, | |
| "step": 5175 | |
| }, | |
| { | |
| "epoch": 0.104, | |
| "grad_norm": 0.01824666535901335, | |
| "learning_rate": 9.95577777777778e-06, | |
| "loss": 2.4115, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.104, | |
| "eval_loss": 2.4254310131073, | |
| "eval_runtime": 33.141, | |
| "eval_samples_per_second": 3.53, | |
| "eval_steps_per_second": 1.78, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.1045, | |
| "grad_norm": 0.018794067362196056, | |
| "learning_rate": 9.950222222222223e-06, | |
| "loss": 2.4062, | |
| "step": 5225 | |
| }, | |
| { | |
| "epoch": 0.105, | |
| "grad_norm": 0.01825837669653065, | |
| "learning_rate": 9.944666666666668e-06, | |
| "loss": 2.4154, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 0.1055, | |
| "grad_norm": 0.01843310767671649, | |
| "learning_rate": 9.939111111111112e-06, | |
| "loss": 2.4201, | |
| "step": 5275 | |
| }, | |
| { | |
| "epoch": 0.106, | |
| "grad_norm": 0.018304681522005508, | |
| "learning_rate": 9.933555555555557e-06, | |
| "loss": 2.4089, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.106, | |
| "eval_loss": 2.424731492996216, | |
| "eval_runtime": 33.0325, | |
| "eval_samples_per_second": 3.542, | |
| "eval_steps_per_second": 1.786, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.1065, | |
| "grad_norm": 0.01846362790517963, | |
| "learning_rate": 9.928e-06, | |
| "loss": 2.4118, | |
| "step": 5325 | |
| }, | |
| { | |
| "epoch": 0.107, | |
| "grad_norm": 0.01872825463357926, | |
| "learning_rate": 9.922444444444446e-06, | |
| "loss": 2.4045, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 0.1075, | |
| "grad_norm": 0.017781011104963246, | |
| "learning_rate": 9.91688888888889e-06, | |
| "loss": 2.4145, | |
| "step": 5375 | |
| }, | |
| { | |
| "epoch": 0.108, | |
| "grad_norm": 0.018840752543683545, | |
| "learning_rate": 9.911333333333335e-06, | |
| "loss": 2.416, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.108, | |
| "eval_loss": 2.423886775970459, | |
| "eval_runtime": 33.1239, | |
| "eval_samples_per_second": 3.532, | |
| "eval_steps_per_second": 1.781, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.1085, | |
| "grad_norm": 0.019278786947294697, | |
| "learning_rate": 9.905777777777778e-06, | |
| "loss": 2.4117, | |
| "step": 5425 | |
| }, | |
| { | |
| "epoch": 0.109, | |
| "grad_norm": 0.018430470806705172, | |
| "learning_rate": 9.900222222222223e-06, | |
| "loss": 2.4114, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 0.1095, | |
| "grad_norm": 0.018464088455141334, | |
| "learning_rate": 9.894666666666669e-06, | |
| "loss": 2.4185, | |
| "step": 5475 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 0.01866239126789079, | |
| "learning_rate": 9.889111111111112e-06, | |
| "loss": 2.4099, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "eval_loss": 2.423039197921753, | |
| "eval_runtime": 35.4471, | |
| "eval_samples_per_second": 3.301, | |
| "eval_steps_per_second": 1.664, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.1105, | |
| "grad_norm": 0.01827370320895024, | |
| "learning_rate": 9.883555555555556e-06, | |
| "loss": 2.4078, | |
| "step": 5525 | |
| }, | |
| { | |
| "epoch": 0.111, | |
| "grad_norm": 0.01863057836209491, | |
| "learning_rate": 9.878000000000001e-06, | |
| "loss": 2.4044, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 0.1115, | |
| "grad_norm": 0.018262835671926946, | |
| "learning_rate": 9.872444444444446e-06, | |
| "loss": 2.4123, | |
| "step": 5575 | |
| }, | |
| { | |
| "epoch": 0.112, | |
| "grad_norm": 0.017655227692766756, | |
| "learning_rate": 9.86688888888889e-06, | |
| "loss": 2.4118, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.112, | |
| "eval_loss": 2.4225943088531494, | |
| "eval_runtime": 33.2709, | |
| "eval_samples_per_second": 3.517, | |
| "eval_steps_per_second": 1.773, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.1125, | |
| "grad_norm": 0.01812962067528887, | |
| "learning_rate": 9.861333333333333e-06, | |
| "loss": 2.4017, | |
| "step": 5625 | |
| }, | |
| { | |
| "epoch": 0.113, | |
| "grad_norm": 0.018265397582930686, | |
| "learning_rate": 9.855777777777779e-06, | |
| "loss": 2.4166, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 0.1135, | |
| "grad_norm": 0.018207114017877214, | |
| "learning_rate": 9.850222222222224e-06, | |
| "loss": 2.413, | |
| "step": 5675 | |
| }, | |
| { | |
| "epoch": 0.114, | |
| "grad_norm": 0.01952225079171619, | |
| "learning_rate": 9.844666666666667e-06, | |
| "loss": 2.4022, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.114, | |
| "eval_loss": 2.42179274559021, | |
| "eval_runtime": 33.0648, | |
| "eval_samples_per_second": 3.539, | |
| "eval_steps_per_second": 1.784, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.1145, | |
| "grad_norm": 0.01780836124763766, | |
| "learning_rate": 9.839111111111111e-06, | |
| "loss": 2.4128, | |
| "step": 5725 | |
| }, | |
| { | |
| "epoch": 0.115, | |
| "grad_norm": 0.018290904429709265, | |
| "learning_rate": 9.833555555555556e-06, | |
| "loss": 2.4119, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 0.1155, | |
| "grad_norm": 0.019359740861514655, | |
| "learning_rate": 9.828000000000001e-06, | |
| "loss": 2.4019, | |
| "step": 5775 | |
| }, | |
| { | |
| "epoch": 0.116, | |
| "grad_norm": 0.018278231474623628, | |
| "learning_rate": 9.822444444444445e-06, | |
| "loss": 2.4072, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.116, | |
| "eval_loss": 2.4214675426483154, | |
| "eval_runtime": 33.0642, | |
| "eval_samples_per_second": 3.539, | |
| "eval_steps_per_second": 1.784, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.1165, | |
| "grad_norm": 0.017493007146383306, | |
| "learning_rate": 9.81688888888889e-06, | |
| "loss": 2.4134, | |
| "step": 5825 | |
| }, | |
| { | |
| "epoch": 0.117, | |
| "grad_norm": 0.018399348008473985, | |
| "learning_rate": 9.811333333333334e-06, | |
| "loss": 2.4082, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 0.1175, | |
| "grad_norm": 0.0186494867742927, | |
| "learning_rate": 9.805777777777779e-06, | |
| "loss": 2.4131, | |
| "step": 5875 | |
| }, | |
| { | |
| "epoch": 0.118, | |
| "grad_norm": 0.017842605036949514, | |
| "learning_rate": 9.800222222222223e-06, | |
| "loss": 2.4134, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.118, | |
| "eval_loss": 2.4210917949676514, | |
| "eval_runtime": 33.1318, | |
| "eval_samples_per_second": 3.531, | |
| "eval_steps_per_second": 1.781, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.1185, | |
| "grad_norm": 0.01835138877842204, | |
| "learning_rate": 9.794666666666668e-06, | |
| "loss": 2.4017, | |
| "step": 5925 | |
| }, | |
| { | |
| "epoch": 0.119, | |
| "grad_norm": 0.018202303746487493, | |
| "learning_rate": 9.789111111111111e-06, | |
| "loss": 2.4103, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 0.1195, | |
| "grad_norm": 0.0176777777086958, | |
| "learning_rate": 9.783555555555557e-06, | |
| "loss": 2.4023, | |
| "step": 5975 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 0.019351209333625233, | |
| "learning_rate": 9.778e-06, | |
| "loss": 2.4053, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "eval_loss": 2.421157121658325, | |
| "eval_runtime": 33.0891, | |
| "eval_samples_per_second": 3.536, | |
| "eval_steps_per_second": 1.783, | |
| "step": 6000 | |
| } | |
| ], | |
| "logging_steps": 25, | |
| "max_steps": 50000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 9223372036854775807, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.6711811550821745e+19, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |