{ "best_global_step": 5900, "best_metric": 2.4210917949676514, "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_7x1024_mem32_bs64_hf_armt_dmem64/run_30/checkpoint-5000", "epoch": 0.12, "eval_steps": 100, "global_step": 6000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005, "grad_norm": 2.624103276270124, "learning_rate": 4.8e-08, "loss": 4.0893, "step": 25 }, { "epoch": 0.001, "grad_norm": 1.3629568986234561, "learning_rate": 9.8e-08, "loss": 3.9543, "step": 50 }, { "epoch": 0.0015, "grad_norm": 0.8050128701430977, "learning_rate": 1.4800000000000003e-07, "loss": 3.6763, "step": 75 }, { "epoch": 0.002, "grad_norm": 0.3690286383727022, "learning_rate": 1.9800000000000003e-07, "loss": 3.327, "step": 100 }, { "epoch": 0.002, "eval_loss": 3.100055694580078, "eval_runtime": 32.7706, "eval_samples_per_second": 3.57, "eval_steps_per_second": 1.8, "step": 100 }, { "epoch": 0.0025, "grad_norm": 0.24011694167100578, "learning_rate": 2.48e-07, "loss": 3.1322, "step": 125 }, { "epoch": 0.003, "grad_norm": 0.149511940963387, "learning_rate": 2.9800000000000005e-07, "loss": 2.9672, "step": 150 }, { "epoch": 0.0035, "grad_norm": 0.10071711520195754, "learning_rate": 3.48e-07, "loss": 2.8684, "step": 175 }, { "epoch": 0.004, "grad_norm": 0.09695377414070089, "learning_rate": 3.9800000000000004e-07, "loss": 2.8244, "step": 200 }, { "epoch": 0.004, "eval_loss": 2.7518060207366943, "eval_runtime": 32.9203, "eval_samples_per_second": 3.554, "eval_steps_per_second": 1.792, "step": 200 }, { "epoch": 0.0045, "grad_norm": 0.06541174981920718, "learning_rate": 4.4800000000000004e-07, "loss": 2.7736, "step": 225 }, { "epoch": 0.005, "grad_norm": 0.061297886999798934, "learning_rate": 4.98e-07, "loss": 2.7392, "step": 250 }, { "epoch": 0.0055, "grad_norm": 0.07881073149840945, "learning_rate": 5.480000000000001e-07, "loss": 2.7194, "step": 275 }, { "epoch": 0.006, "grad_norm": 0.05125386617161651, "learning_rate": 5.98e-07, "loss": 2.6982, "step": 300 }, { "epoch": 0.006, "eval_loss": 2.6622018814086914, "eval_runtime": 32.9076, "eval_samples_per_second": 3.555, "eval_steps_per_second": 1.793, "step": 300 }, { "epoch": 0.0065, "grad_norm": 0.04659366450077996, "learning_rate": 6.48e-07, "loss": 2.6725, "step": 325 }, { "epoch": 0.007, "grad_norm": 0.04588097652548341, "learning_rate": 6.98e-07, "loss": 2.6592, "step": 350 }, { "epoch": 0.0075, "grad_norm": 0.058421958212028904, "learning_rate": 7.480000000000001e-07, "loss": 2.6481, "step": 375 }, { "epoch": 0.008, "grad_norm": 0.04289575736155661, "learning_rate": 7.98e-07, "loss": 2.6257, "step": 400 }, { "epoch": 0.008, "eval_loss": 2.6052613258361816, "eval_runtime": 32.8227, "eval_samples_per_second": 3.565, "eval_steps_per_second": 1.798, "step": 400 }, { "epoch": 0.0085, "grad_norm": 0.041602666338794385, "learning_rate": 8.480000000000001e-07, "loss": 2.6089, "step": 425 }, { "epoch": 0.009, "grad_norm": 0.040090024026539266, "learning_rate": 8.980000000000001e-07, "loss": 2.5985, "step": 450 }, { "epoch": 0.0095, "grad_norm": 0.05346463020318845, "learning_rate": 9.480000000000001e-07, "loss": 2.5858, "step": 475 }, { "epoch": 0.01, "grad_norm": 0.03240197247016216, "learning_rate": 9.98e-07, "loss": 2.5773, "step": 500 }, { "epoch": 0.01, "eval_loss": 2.5677218437194824, "eval_runtime": 32.9146, "eval_samples_per_second": 3.555, "eval_steps_per_second": 1.793, "step": 500 }, { "epoch": 0.0105, "grad_norm": 0.030627609315729644, "learning_rate": 1.0480000000000002e-06, "loss": 2.5695, "step": 525 }, { "epoch": 0.011, "grad_norm": 0.03146801435404312, "learning_rate": 1.0980000000000001e-06, "loss": 2.558, "step": 550 }, { "epoch": 0.0115, "grad_norm": 0.028453864143727626, "learning_rate": 1.148e-06, "loss": 2.5645, "step": 575 }, { "epoch": 0.012, "grad_norm": 0.03026805511159676, "learning_rate": 1.1980000000000002e-06, "loss": 2.5645, "step": 600 }, { "epoch": 0.012, "eval_loss": 2.546586275100708, "eval_runtime": 32.8424, "eval_samples_per_second": 3.562, "eval_steps_per_second": 1.796, "step": 600 }, { "epoch": 0.0125, "grad_norm": 0.032033771539522, "learning_rate": 1.248e-06, "loss": 2.5424, "step": 625 }, { "epoch": 0.013, "grad_norm": 0.0281966122475446, "learning_rate": 1.2980000000000001e-06, "loss": 2.5409, "step": 650 }, { "epoch": 0.0135, "grad_norm": 0.02887428243284281, "learning_rate": 1.348e-06, "loss": 2.543, "step": 675 }, { "epoch": 0.014, "grad_norm": 0.027672621753278132, "learning_rate": 1.3980000000000002e-06, "loss": 2.5385, "step": 700 }, { "epoch": 0.014, "eval_loss": 2.530237913131714, "eval_runtime": 32.7994, "eval_samples_per_second": 3.567, "eval_steps_per_second": 1.799, "step": 700 }, { "epoch": 0.0145, "grad_norm": 0.030815191380069624, "learning_rate": 1.4480000000000002e-06, "loss": 2.5302, "step": 725 }, { "epoch": 0.015, "grad_norm": 0.0336387385604783, "learning_rate": 1.498e-06, "loss": 2.531, "step": 750 }, { "epoch": 0.0155, "grad_norm": 0.02858543320323233, "learning_rate": 1.548e-06, "loss": 2.5184, "step": 775 }, { "epoch": 0.016, "grad_norm": 0.028120393653995705, "learning_rate": 1.5980000000000002e-06, "loss": 2.5101, "step": 800 }, { "epoch": 0.016, "eval_loss": 2.5182888507843018, "eval_runtime": 33.2135, "eval_samples_per_second": 3.523, "eval_steps_per_second": 1.776, "step": 800 }, { "epoch": 0.0165, "grad_norm": 0.03014167593156162, "learning_rate": 1.6480000000000001e-06, "loss": 2.5232, "step": 825 }, { "epoch": 0.017, "grad_norm": 0.028528349033195077, "learning_rate": 1.6980000000000003e-06, "loss": 2.5162, "step": 850 }, { "epoch": 0.0175, "grad_norm": 0.031230193601244804, "learning_rate": 1.7480000000000002e-06, "loss": 2.4995, "step": 875 }, { "epoch": 0.018, "grad_norm": 0.03555060954716827, "learning_rate": 1.798e-06, "loss": 2.5064, "step": 900 }, { "epoch": 0.018, "eval_loss": 2.5070879459381104, "eval_runtime": 33.3807, "eval_samples_per_second": 3.505, "eval_steps_per_second": 1.767, "step": 900 }, { "epoch": 0.0185, "grad_norm": 0.03561871969060444, "learning_rate": 1.8480000000000001e-06, "loss": 2.5004, "step": 925 }, { "epoch": 0.019, "grad_norm": 0.03094584673111385, "learning_rate": 1.898e-06, "loss": 2.4959, "step": 950 }, { "epoch": 0.0195, "grad_norm": 0.035545021685136444, "learning_rate": 1.9480000000000002e-06, "loss": 2.4982, "step": 975 }, { "epoch": 0.02, "grad_norm": 0.0370422613473599, "learning_rate": 1.998e-06, "loss": 2.4927, "step": 1000 }, { "epoch": 0.02, "eval_loss": 2.4966063499450684, "eval_runtime": 33.3038, "eval_samples_per_second": 3.513, "eval_steps_per_second": 1.772, "step": 1000 }, { "epoch": 0.0205, "grad_norm": 0.04199895036530391, "learning_rate": 2.048e-06, "loss": 2.4847, "step": 1025 }, { "epoch": 0.021, "grad_norm": 0.0384304039845165, "learning_rate": 2.098e-06, "loss": 2.4916, "step": 1050 }, { "epoch": 0.0215, "grad_norm": 0.03291684378446945, "learning_rate": 2.148e-06, "loss": 2.4891, "step": 1075 }, { "epoch": 0.022, "grad_norm": 0.03376054787167217, "learning_rate": 2.198e-06, "loss": 2.4896, "step": 1100 }, { "epoch": 0.022, "eval_loss": 2.488358974456787, "eval_runtime": 33.2437, "eval_samples_per_second": 3.519, "eval_steps_per_second": 1.775, "step": 1100 }, { "epoch": 0.0225, "grad_norm": 0.04001450258151374, "learning_rate": 2.2480000000000003e-06, "loss": 2.4855, "step": 1125 }, { "epoch": 0.023, "grad_norm": 0.036190398257348835, "learning_rate": 2.2980000000000003e-06, "loss": 2.4834, "step": 1150 }, { "epoch": 0.0235, "grad_norm": 0.03806535632489679, "learning_rate": 2.3480000000000002e-06, "loss": 2.48, "step": 1175 }, { "epoch": 0.024, "grad_norm": 0.039255476981030824, "learning_rate": 2.398e-06, "loss": 2.4853, "step": 1200 }, { "epoch": 0.024, "eval_loss": 2.481823205947876, "eval_runtime": 33.3121, "eval_samples_per_second": 3.512, "eval_steps_per_second": 1.771, "step": 1200 }, { "epoch": 0.0245, "grad_norm": 0.037361446323077335, "learning_rate": 2.448e-06, "loss": 2.4776, "step": 1225 }, { "epoch": 0.025, "grad_norm": 0.03410866644624654, "learning_rate": 2.498e-06, "loss": 2.4672, "step": 1250 }, { "epoch": 0.0255, "grad_norm": 0.03501276078614437, "learning_rate": 2.5480000000000004e-06, "loss": 2.4633, "step": 1275 }, { "epoch": 0.026, "grad_norm": 0.035383520468643466, "learning_rate": 2.598e-06, "loss": 2.4647, "step": 1300 }, { "epoch": 0.026, "eval_loss": 2.476562976837158, "eval_runtime": 33.4013, "eval_samples_per_second": 3.503, "eval_steps_per_second": 1.766, "step": 1300 }, { "epoch": 0.0265, "grad_norm": 0.03467179176189109, "learning_rate": 2.648e-06, "loss": 2.476, "step": 1325 }, { "epoch": 0.027, "grad_norm": 0.03925271631713796, "learning_rate": 2.6980000000000003e-06, "loss": 2.4675, "step": 1350 }, { "epoch": 0.0275, "grad_norm": 0.03419652940921129, "learning_rate": 2.748e-06, "loss": 2.4708, "step": 1375 }, { "epoch": 0.028, "grad_norm": 0.03764216373530557, "learning_rate": 2.798e-06, "loss": 2.4709, "step": 1400 }, { "epoch": 0.028, "eval_loss": 2.471618175506592, "eval_runtime": 33.0936, "eval_samples_per_second": 3.535, "eval_steps_per_second": 1.783, "step": 1400 }, { "epoch": 0.0285, "grad_norm": 0.03802047455035515, "learning_rate": 2.848e-06, "loss": 2.4608, "step": 1425 }, { "epoch": 0.029, "grad_norm": 0.03323072329115027, "learning_rate": 2.8980000000000005e-06, "loss": 2.4695, "step": 1450 }, { "epoch": 0.0295, "grad_norm": 0.03693054288365918, "learning_rate": 2.9480000000000004e-06, "loss": 2.4635, "step": 1475 }, { "epoch": 0.03, "grad_norm": 0.06509796100945928, "learning_rate": 2.9980000000000003e-06, "loss": 2.467, "step": 1500 }, { "epoch": 0.03, "eval_loss": 2.467376232147217, "eval_runtime": 33.1827, "eval_samples_per_second": 3.526, "eval_steps_per_second": 1.778, "step": 1500 }, { "epoch": 0.0305, "grad_norm": 0.030120041993102375, "learning_rate": 3.0480000000000003e-06, "loss": 2.463, "step": 1525 }, { "epoch": 0.031, "grad_norm": 0.039881744916892024, "learning_rate": 3.0980000000000007e-06, "loss": 2.4533, "step": 1550 }, { "epoch": 0.0315, "grad_norm": 0.029950518864288997, "learning_rate": 3.1480000000000006e-06, "loss": 2.4585, "step": 1575 }, { "epoch": 0.032, "grad_norm": 0.07753499473514511, "learning_rate": 3.198e-06, "loss": 2.4502, "step": 1600 }, { "epoch": 0.032, "eval_loss": 2.4625656604766846, "eval_runtime": 33.2433, "eval_samples_per_second": 3.52, "eval_steps_per_second": 1.775, "step": 1600 }, { "epoch": 0.0325, "grad_norm": 0.048526204949902306, "learning_rate": 3.248e-06, "loss": 2.45, "step": 1625 }, { "epoch": 0.033, "grad_norm": 0.0378506235382453, "learning_rate": 3.298e-06, "loss": 2.4488, "step": 1650 }, { "epoch": 0.0335, "grad_norm": 0.03228564469275673, "learning_rate": 3.348e-06, "loss": 2.4568, "step": 1675 }, { "epoch": 0.034, "grad_norm": 0.03417826301349761, "learning_rate": 3.3980000000000003e-06, "loss": 2.4514, "step": 1700 }, { "epoch": 0.034, "eval_loss": 2.459094762802124, "eval_runtime": 33.1684, "eval_samples_per_second": 3.527, "eval_steps_per_second": 1.779, "step": 1700 }, { "epoch": 0.0345, "grad_norm": 0.03119990821359214, "learning_rate": 3.4480000000000003e-06, "loss": 2.4447, "step": 1725 }, { "epoch": 0.035, "grad_norm": 0.032737257559355144, "learning_rate": 3.4980000000000002e-06, "loss": 2.4531, "step": 1750 }, { "epoch": 0.0355, "grad_norm": 0.03341768726028273, "learning_rate": 3.548e-06, "loss": 2.4476, "step": 1775 }, { "epoch": 0.036, "grad_norm": 0.03225090122428514, "learning_rate": 3.5980000000000005e-06, "loss": 2.4403, "step": 1800 }, { "epoch": 0.036, "eval_loss": 2.455217123031616, "eval_runtime": 32.9783, "eval_samples_per_second": 3.548, "eval_steps_per_second": 1.789, "step": 1800 }, { "epoch": 0.0365, "grad_norm": 0.030979620558740147, "learning_rate": 3.6480000000000005e-06, "loss": 2.4379, "step": 1825 }, { "epoch": 0.037, "grad_norm": 0.04044689712503281, "learning_rate": 3.6980000000000004e-06, "loss": 2.455, "step": 1850 }, { "epoch": 0.0375, "grad_norm": 0.034557037951751954, "learning_rate": 3.7480000000000004e-06, "loss": 2.4517, "step": 1875 }, { "epoch": 0.038, "grad_norm": 0.02821125825480679, "learning_rate": 3.7980000000000007e-06, "loss": 2.4429, "step": 1900 }, { "epoch": 0.038, "eval_loss": 2.4529292583465576, "eval_runtime": 33.4058, "eval_samples_per_second": 3.502, "eval_steps_per_second": 1.766, "step": 1900 }, { "epoch": 0.0385, "grad_norm": 0.029890640830031213, "learning_rate": 3.848e-06, "loss": 2.4437, "step": 1925 }, { "epoch": 0.039, "grad_norm": 0.03265759623511245, "learning_rate": 3.898e-06, "loss": 2.438, "step": 1950 }, { "epoch": 0.0395, "grad_norm": 0.10385356338699042, "learning_rate": 3.948e-06, "loss": 2.4442, "step": 1975 }, { "epoch": 0.04, "grad_norm": 0.03233294644174686, "learning_rate": 3.9980000000000005e-06, "loss": 2.4451, "step": 2000 }, { "epoch": 0.04, "eval_loss": 2.450512647628784, "eval_runtime": 33.274, "eval_samples_per_second": 3.516, "eval_steps_per_second": 1.773, "step": 2000 }, { "epoch": 0.0405, "grad_norm": 0.034945541932647324, "learning_rate": 4.048e-06, "loss": 2.4357, "step": 2025 }, { "epoch": 0.041, "grad_norm": 0.029322959861707003, "learning_rate": 4.098e-06, "loss": 2.4373, "step": 2050 }, { "epoch": 0.0415, "grad_norm": 0.027365033479394632, "learning_rate": 4.148000000000001e-06, "loss": 2.442, "step": 2075 }, { "epoch": 0.042, "grad_norm": 0.042214130565513416, "learning_rate": 4.198e-06, "loss": 2.4362, "step": 2100 }, { "epoch": 0.042, "eval_loss": 2.448322296142578, "eval_runtime": 33.466, "eval_samples_per_second": 3.496, "eval_steps_per_second": 1.763, "step": 2100 }, { "epoch": 0.0425, "grad_norm": 0.028874346576168566, "learning_rate": 4.248000000000001e-06, "loss": 2.4428, "step": 2125 }, { "epoch": 0.043, "grad_norm": 0.029771861998040296, "learning_rate": 4.298e-06, "loss": 2.4298, "step": 2150 }, { "epoch": 0.0435, "grad_norm": 0.029668415484575914, "learning_rate": 4.3480000000000006e-06, "loss": 2.4352, "step": 2175 }, { "epoch": 0.044, "grad_norm": 0.02564927582570633, "learning_rate": 4.398000000000001e-06, "loss": 2.4349, "step": 2200 }, { "epoch": 0.044, "eval_loss": 2.4465889930725098, "eval_runtime": 33.3555, "eval_samples_per_second": 3.508, "eval_steps_per_second": 1.769, "step": 2200 }, { "epoch": 0.0445, "grad_norm": 0.024797235968250814, "learning_rate": 4.4480000000000004e-06, "loss": 2.4409, "step": 2225 }, { "epoch": 0.045, "grad_norm": 0.02813189377877088, "learning_rate": 4.498e-06, "loss": 2.4367, "step": 2250 }, { "epoch": 0.0455, "grad_norm": 0.02750903211389184, "learning_rate": 4.548e-06, "loss": 2.4326, "step": 2275 }, { "epoch": 0.046, "grad_norm": 0.027737559952553607, "learning_rate": 4.598e-06, "loss": 2.4375, "step": 2300 }, { "epoch": 0.046, "eval_loss": 2.4448626041412354, "eval_runtime": 33.2658, "eval_samples_per_second": 3.517, "eval_steps_per_second": 1.774, "step": 2300 }, { "epoch": 0.0465, "grad_norm": 0.02630663299301831, "learning_rate": 4.648e-06, "loss": 2.4392, "step": 2325 }, { "epoch": 0.047, "grad_norm": 0.027929449055597393, "learning_rate": 4.698000000000001e-06, "loss": 2.4256, "step": 2350 }, { "epoch": 0.0475, "grad_norm": 0.0283193243102273, "learning_rate": 4.748e-06, "loss": 2.429, "step": 2375 }, { "epoch": 0.048, "grad_norm": 0.029295313451333963, "learning_rate": 4.7980000000000005e-06, "loss": 2.4393, "step": 2400 }, { "epoch": 0.048, "eval_loss": 2.4432175159454346, "eval_runtime": 33.3067, "eval_samples_per_second": 3.513, "eval_steps_per_second": 1.771, "step": 2400 }, { "epoch": 0.0485, "grad_norm": 0.025382897552394503, "learning_rate": 4.848000000000001e-06, "loss": 2.4322, "step": 2425 }, { "epoch": 0.049, "grad_norm": 0.02450548193909556, "learning_rate": 4.898e-06, "loss": 2.4314, "step": 2450 }, { "epoch": 0.0495, "grad_norm": 0.033065483070063684, "learning_rate": 4.948000000000001e-06, "loss": 2.4338, "step": 2475 }, { "epoch": 0.05, "grad_norm": 0.027543894857825314, "learning_rate": 4.998e-06, "loss": 2.4333, "step": 2500 }, { "epoch": 0.05, "eval_loss": 2.441807985305786, "eval_runtime": 33.0379, "eval_samples_per_second": 3.541, "eval_steps_per_second": 1.786, "step": 2500 }, { "epoch": 0.0505, "grad_norm": 0.027354239436717945, "learning_rate": 5.048000000000001e-06, "loss": 2.439, "step": 2525 }, { "epoch": 0.051, "grad_norm": 0.022458884368301627, "learning_rate": 5.098000000000001e-06, "loss": 2.427, "step": 2550 }, { "epoch": 0.0515, "grad_norm": 0.033350881745701555, "learning_rate": 5.1480000000000005e-06, "loss": 2.4275, "step": 2575 }, { "epoch": 0.052, "grad_norm": 0.025032545530163004, "learning_rate": 5.198000000000001e-06, "loss": 2.4275, "step": 2600 }, { "epoch": 0.052, "eval_loss": 2.440882444381714, "eval_runtime": 33.1835, "eval_samples_per_second": 3.526, "eval_steps_per_second": 1.778, "step": 2600 }, { "epoch": 0.0525, "grad_norm": 0.026294170044068685, "learning_rate": 5.248000000000001e-06, "loss": 2.4312, "step": 2625 }, { "epoch": 0.053, "grad_norm": 0.03301155351988982, "learning_rate": 5.298000000000001e-06, "loss": 2.4203, "step": 2650 }, { "epoch": 0.0535, "grad_norm": 0.02389586194961339, "learning_rate": 5.348000000000001e-06, "loss": 2.4332, "step": 2675 }, { "epoch": 0.054, "grad_norm": 0.056862279743176244, "learning_rate": 5.398e-06, "loss": 2.4313, "step": 2700 }, { "epoch": 0.054, "eval_loss": 2.4402644634246826, "eval_runtime": 33.2071, "eval_samples_per_second": 3.523, "eval_steps_per_second": 1.777, "step": 2700 }, { "epoch": 0.0545, "grad_norm": 0.025636671246445756, "learning_rate": 5.448e-06, "loss": 2.4311, "step": 2725 }, { "epoch": 0.055, "grad_norm": 0.022083605910153424, "learning_rate": 5.498e-06, "loss": 2.4357, "step": 2750 }, { "epoch": 0.0555, "grad_norm": 0.024223735712298522, "learning_rate": 5.548e-06, "loss": 2.4294, "step": 2775 }, { "epoch": 0.056, "grad_norm": 0.029847698463432104, "learning_rate": 5.5980000000000004e-06, "loss": 2.4344, "step": 2800 }, { "epoch": 0.056, "eval_loss": 2.4389007091522217, "eval_runtime": 33.2705, "eval_samples_per_second": 3.517, "eval_steps_per_second": 1.773, "step": 2800 }, { "epoch": 0.0565, "grad_norm": 0.032144633236930065, "learning_rate": 5.648e-06, "loss": 2.4282, "step": 2825 }, { "epoch": 0.057, "grad_norm": 0.02355863809037046, "learning_rate": 5.698e-06, "loss": 2.4322, "step": 2850 }, { "epoch": 0.0575, "grad_norm": 0.023728744427970416, "learning_rate": 5.748e-06, "loss": 2.4286, "step": 2875 }, { "epoch": 0.058, "grad_norm": 0.025539915034515293, "learning_rate": 5.798e-06, "loss": 2.4287, "step": 2900 }, { "epoch": 0.058, "eval_loss": 2.4376914501190186, "eval_runtime": 33.3179, "eval_samples_per_second": 3.512, "eval_steps_per_second": 1.771, "step": 2900 }, { "epoch": 0.0585, "grad_norm": 0.023457547558388747, "learning_rate": 5.848000000000001e-06, "loss": 2.4289, "step": 2925 }, { "epoch": 0.059, "grad_norm": 0.025297710201421797, "learning_rate": 5.898e-06, "loss": 2.4274, "step": 2950 }, { "epoch": 0.0595, "grad_norm": 0.024155176530161276, "learning_rate": 5.9480000000000005e-06, "loss": 2.4169, "step": 2975 }, { "epoch": 0.06, "grad_norm": 0.023954841726960448, "learning_rate": 5.998000000000001e-06, "loss": 2.4244, "step": 3000 }, { "epoch": 0.06, "eval_loss": 2.436969041824341, "eval_runtime": 33.2713, "eval_samples_per_second": 3.517, "eval_steps_per_second": 1.773, "step": 3000 }, { "epoch": 0.0605, "grad_norm": 0.025507916252978883, "learning_rate": 6.048e-06, "loss": 2.4192, "step": 3025 }, { "epoch": 0.061, "grad_norm": 0.02126046028834697, "learning_rate": 6.098000000000001e-06, "loss": 2.4233, "step": 3050 }, { "epoch": 0.0615, "grad_norm": 0.026235681014214807, "learning_rate": 6.148e-06, "loss": 2.4215, "step": 3075 }, { "epoch": 0.062, "grad_norm": 0.026243691288249413, "learning_rate": 6.198000000000001e-06, "loss": 2.4134, "step": 3100 }, { "epoch": 0.062, "eval_loss": 2.435988664627075, "eval_runtime": 33.0276, "eval_samples_per_second": 3.542, "eval_steps_per_second": 1.786, "step": 3100 }, { "epoch": 0.0625, "grad_norm": 0.02496599291141367, "learning_rate": 6.248000000000001e-06, "loss": 2.4241, "step": 3125 }, { "epoch": 0.063, "grad_norm": 0.0236951365360608, "learning_rate": 6.2980000000000005e-06, "loss": 2.4252, "step": 3150 }, { "epoch": 0.0635, "grad_norm": 0.022752035914773892, "learning_rate": 6.348000000000001e-06, "loss": 2.4244, "step": 3175 }, { "epoch": 0.064, "grad_norm": 0.021656953860252137, "learning_rate": 6.398000000000001e-06, "loss": 2.4227, "step": 3200 }, { "epoch": 0.064, "eval_loss": 2.43520450592041, "eval_runtime": 33.136, "eval_samples_per_second": 3.531, "eval_steps_per_second": 1.781, "step": 3200 }, { "epoch": 0.0645, "grad_norm": 0.021188520683488872, "learning_rate": 6.448000000000001e-06, "loss": 2.4248, "step": 3225 }, { "epoch": 0.065, "grad_norm": 0.02274972468402099, "learning_rate": 6.498000000000001e-06, "loss": 2.4215, "step": 3250 }, { "epoch": 0.0655, "grad_norm": 0.024046700552500286, "learning_rate": 6.548000000000001e-06, "loss": 2.4169, "step": 3275 }, { "epoch": 0.066, "grad_norm": 0.022071385618052216, "learning_rate": 6.598000000000001e-06, "loss": 2.4199, "step": 3300 }, { "epoch": 0.066, "eval_loss": 2.4344840049743652, "eval_runtime": 33.1729, "eval_samples_per_second": 3.527, "eval_steps_per_second": 1.779, "step": 3300 }, { "epoch": 0.0665, "grad_norm": 0.02931021842271797, "learning_rate": 6.648e-06, "loss": 2.4253, "step": 3325 }, { "epoch": 0.067, "grad_norm": 0.021754527434557868, "learning_rate": 6.698e-06, "loss": 2.4281, "step": 3350 }, { "epoch": 0.0675, "grad_norm": 0.022651522972508432, "learning_rate": 6.7480000000000004e-06, "loss": 2.4208, "step": 3375 }, { "epoch": 0.068, "grad_norm": 0.022676405563792287, "learning_rate": 6.798e-06, "loss": 2.4222, "step": 3400 }, { "epoch": 0.068, "eval_loss": 2.43371844291687, "eval_runtime": 33.1293, "eval_samples_per_second": 3.532, "eval_steps_per_second": 1.781, "step": 3400 }, { "epoch": 0.0685, "grad_norm": 0.021100680573628707, "learning_rate": 6.848e-06, "loss": 2.4243, "step": 3425 }, { "epoch": 0.069, "grad_norm": 0.02101417038408147, "learning_rate": 6.898e-06, "loss": 2.4242, "step": 3450 }, { "epoch": 0.0695, "grad_norm": 0.022432735157488455, "learning_rate": 6.948e-06, "loss": 2.4224, "step": 3475 }, { "epoch": 0.07, "grad_norm": 0.02164716008760555, "learning_rate": 6.998000000000001e-06, "loss": 2.4202, "step": 3500 }, { "epoch": 0.07, "eval_loss": 2.433281898498535, "eval_runtime": 33.0783, "eval_samples_per_second": 3.537, "eval_steps_per_second": 1.784, "step": 3500 }, { "epoch": 0.0705, "grad_norm": 0.022412840176404082, "learning_rate": 7.048e-06, "loss": 2.4184, "step": 3525 }, { "epoch": 0.071, "grad_norm": 0.025300113537910857, "learning_rate": 7.0980000000000005e-06, "loss": 2.421, "step": 3550 }, { "epoch": 0.0715, "grad_norm": 0.022085711512698558, "learning_rate": 7.148000000000001e-06, "loss": 2.415, "step": 3575 }, { "epoch": 0.072, "grad_norm": 0.021041258769866313, "learning_rate": 7.198e-06, "loss": 2.4157, "step": 3600 }, { "epoch": 0.072, "eval_loss": 2.4324123859405518, "eval_runtime": 34.1633, "eval_samples_per_second": 3.425, "eval_steps_per_second": 1.727, "step": 3600 }, { "epoch": 0.0725, "grad_norm": 0.021694681795354324, "learning_rate": 7.248000000000001e-06, "loss": 2.4152, "step": 3625 }, { "epoch": 0.073, "grad_norm": 0.03056130171104773, "learning_rate": 7.298e-06, "loss": 2.4151, "step": 3650 }, { "epoch": 0.0735, "grad_norm": 0.02112814663770162, "learning_rate": 7.348000000000001e-06, "loss": 2.4163, "step": 3675 }, { "epoch": 0.074, "grad_norm": 0.024883267721069864, "learning_rate": 7.398000000000001e-06, "loss": 2.4258, "step": 3700 }, { "epoch": 0.074, "eval_loss": 2.4319984912872314, "eval_runtime": 33.2699, "eval_samples_per_second": 3.517, "eval_steps_per_second": 1.773, "step": 3700 }, { "epoch": 0.0745, "grad_norm": 0.02062910451612879, "learning_rate": 7.4480000000000005e-06, "loss": 2.4263, "step": 3725 }, { "epoch": 0.075, "grad_norm": 0.021068085012478772, "learning_rate": 7.498000000000001e-06, "loss": 2.4216, "step": 3750 }, { "epoch": 0.0755, "grad_norm": 0.020665118516629687, "learning_rate": 7.548000000000001e-06, "loss": 2.4285, "step": 3775 }, { "epoch": 0.076, "grad_norm": 0.02425992757924128, "learning_rate": 7.598000000000001e-06, "loss": 2.4174, "step": 3800 }, { "epoch": 0.076, "eval_loss": 2.4310436248779297, "eval_runtime": 35.0728, "eval_samples_per_second": 3.336, "eval_steps_per_second": 1.682, "step": 3800 }, { "epoch": 0.0765, "grad_norm": 0.021337004595007786, "learning_rate": 7.648e-06, "loss": 2.4303, "step": 3825 }, { "epoch": 0.077, "grad_norm": 0.020168500131750186, "learning_rate": 7.698000000000002e-06, "loss": 2.4298, "step": 3850 }, { "epoch": 0.0775, "grad_norm": 0.020089032493824672, "learning_rate": 7.748000000000001e-06, "loss": 2.4151, "step": 3875 }, { "epoch": 0.078, "grad_norm": 0.02462630071931115, "learning_rate": 7.798e-06, "loss": 2.4235, "step": 3900 }, { "epoch": 0.078, "eval_loss": 2.431330442428589, "eval_runtime": 33.093, "eval_samples_per_second": 3.535, "eval_steps_per_second": 1.783, "step": 3900 }, { "epoch": 0.0785, "grad_norm": 0.0226705620922379, "learning_rate": 7.848000000000002e-06, "loss": 2.4185, "step": 3925 }, { "epoch": 0.079, "grad_norm": 0.022075041269811142, "learning_rate": 7.898e-06, "loss": 2.4344, "step": 3950 }, { "epoch": 0.0795, "grad_norm": 0.03932607113814955, "learning_rate": 7.948e-06, "loss": 2.4228, "step": 3975 }, { "epoch": 0.08, "grad_norm": 0.020604342831921824, "learning_rate": 7.998e-06, "loss": 2.4289, "step": 4000 }, { "epoch": 0.08, "eval_loss": 2.430954933166504, "eval_runtime": 33.1216, "eval_samples_per_second": 3.532, "eval_steps_per_second": 1.781, "step": 4000 }, { "epoch": 0.0805, "grad_norm": 0.021865944897834468, "learning_rate": 8.048e-06, "loss": 2.4283, "step": 4025 }, { "epoch": 0.081, "grad_norm": 0.020393010409248808, "learning_rate": 8.098000000000001e-06, "loss": 2.4142, "step": 4050 }, { "epoch": 0.0815, "grad_norm": 0.02279155824698799, "learning_rate": 8.148e-06, "loss": 2.4208, "step": 4075 }, { "epoch": 0.082, "grad_norm": 0.021110562493101104, "learning_rate": 8.198e-06, "loss": 2.4093, "step": 4100 }, { "epoch": 0.082, "eval_loss": 2.4299628734588623, "eval_runtime": 33.2215, "eval_samples_per_second": 3.522, "eval_steps_per_second": 1.776, "step": 4100 }, { "epoch": 0.0825, "grad_norm": 0.019752507861163327, "learning_rate": 8.248e-06, "loss": 2.4073, "step": 4125 }, { "epoch": 0.083, "grad_norm": 0.019897433088879975, "learning_rate": 8.298000000000001e-06, "loss": 2.4129, "step": 4150 }, { "epoch": 0.0835, "grad_norm": 0.02275241957806373, "learning_rate": 8.348e-06, "loss": 2.4243, "step": 4175 }, { "epoch": 0.084, "grad_norm": 0.02009113389579191, "learning_rate": 8.398e-06, "loss": 2.4138, "step": 4200 }, { "epoch": 0.084, "eval_loss": 2.4301230907440186, "eval_runtime": 33.0641, "eval_samples_per_second": 3.539, "eval_steps_per_second": 1.784, "step": 4200 }, { "epoch": 0.0845, "grad_norm": 0.021259070586902896, "learning_rate": 8.448000000000001e-06, "loss": 2.4212, "step": 4225 }, { "epoch": 0.085, "grad_norm": 0.021461643865178466, "learning_rate": 8.498e-06, "loss": 2.4242, "step": 4250 }, { "epoch": 0.0855, "grad_norm": 0.02129278617109427, "learning_rate": 8.548e-06, "loss": 2.4153, "step": 4275 }, { "epoch": 0.086, "grad_norm": 0.019884381961586706, "learning_rate": 8.598000000000001e-06, "loss": 2.4107, "step": 4300 }, { "epoch": 0.086, "eval_loss": 2.429638385772705, "eval_runtime": 33.1452, "eval_samples_per_second": 3.53, "eval_steps_per_second": 1.78, "step": 4300 }, { "epoch": 0.0865, "grad_norm": 0.02127578557225418, "learning_rate": 8.648000000000001e-06, "loss": 2.4202, "step": 4325 }, { "epoch": 0.087, "grad_norm": 0.021749788475476855, "learning_rate": 8.698e-06, "loss": 2.4274, "step": 4350 }, { "epoch": 0.0875, "grad_norm": 0.021521494708913836, "learning_rate": 8.748000000000002e-06, "loss": 2.4189, "step": 4375 }, { "epoch": 0.088, "grad_norm": 0.021276426458537334, "learning_rate": 8.798000000000001e-06, "loss": 2.4152, "step": 4400 }, { "epoch": 0.088, "eval_loss": 2.4292917251586914, "eval_runtime": 33.1057, "eval_samples_per_second": 3.534, "eval_steps_per_second": 1.782, "step": 4400 }, { "epoch": 0.0885, "grad_norm": 0.019843371943772815, "learning_rate": 8.848e-06, "loss": 2.421, "step": 4425 }, { "epoch": 0.089, "grad_norm": 0.02031045171970109, "learning_rate": 8.898000000000002e-06, "loss": 2.4201, "step": 4450 }, { "epoch": 0.0895, "grad_norm": 0.018642717079241176, "learning_rate": 8.948000000000001e-06, "loss": 2.4171, "step": 4475 }, { "epoch": 0.09, "grad_norm": 0.021016901396559935, "learning_rate": 8.998000000000001e-06, "loss": 2.4257, "step": 4500 }, { "epoch": 0.09, "eval_loss": 2.4288113117218018, "eval_runtime": 33.1217, "eval_samples_per_second": 3.532, "eval_steps_per_second": 1.781, "step": 4500 }, { "epoch": 0.0905, "grad_norm": 0.021595090834222327, "learning_rate": 9.048e-06, "loss": 2.4209, "step": 4525 }, { "epoch": 0.091, "grad_norm": 0.020500341653961213, "learning_rate": 9.098000000000002e-06, "loss": 2.4093, "step": 4550 }, { "epoch": 0.0915, "grad_norm": 0.021134665935359346, "learning_rate": 9.148e-06, "loss": 2.4238, "step": 4575 }, { "epoch": 0.092, "grad_norm": 0.018064298488706988, "learning_rate": 9.198e-06, "loss": 2.4163, "step": 4600 }, { "epoch": 0.092, "eval_loss": 2.428257465362549, "eval_runtime": 33.451, "eval_samples_per_second": 3.498, "eval_steps_per_second": 1.764, "step": 4600 }, { "epoch": 0.0925, "grad_norm": 0.019704962175624032, "learning_rate": 9.248e-06, "loss": 2.4082, "step": 4625 }, { "epoch": 0.093, "grad_norm": 0.019712333508134283, "learning_rate": 9.298e-06, "loss": 2.4089, "step": 4650 }, { "epoch": 0.0935, "grad_norm": 0.021269463834833153, "learning_rate": 9.348000000000001e-06, "loss": 2.408, "step": 4675 }, { "epoch": 0.094, "grad_norm": 0.021278662940784676, "learning_rate": 9.398e-06, "loss": 2.4189, "step": 4700 }, { "epoch": 0.094, "eval_loss": 2.4279165267944336, "eval_runtime": 33.1606, "eval_samples_per_second": 3.528, "eval_steps_per_second": 1.779, "step": 4700 }, { "epoch": 0.0945, "grad_norm": 0.018504564797986272, "learning_rate": 9.448e-06, "loss": 2.4254, "step": 4725 }, { "epoch": 0.095, "grad_norm": 0.01917099113509997, "learning_rate": 9.498000000000001e-06, "loss": 2.411, "step": 4750 }, { "epoch": 0.0955, "grad_norm": 0.019097394482211122, "learning_rate": 9.548e-06, "loss": 2.4209, "step": 4775 }, { "epoch": 0.096, "grad_norm": 0.020220692469392707, "learning_rate": 9.598e-06, "loss": 2.4066, "step": 4800 }, { "epoch": 0.096, "eval_loss": 2.4273650646209717, "eval_runtime": 33.1079, "eval_samples_per_second": 3.534, "eval_steps_per_second": 1.782, "step": 4800 }, { "epoch": 0.0965, "grad_norm": 0.019607148490934756, "learning_rate": 9.648000000000001e-06, "loss": 2.4132, "step": 4825 }, { "epoch": 0.097, "grad_norm": 0.019388710503851023, "learning_rate": 9.698000000000001e-06, "loss": 2.4096, "step": 4850 }, { "epoch": 0.0975, "grad_norm": 0.019593746411763164, "learning_rate": 9.748e-06, "loss": 2.4064, "step": 4875 }, { "epoch": 0.098, "grad_norm": 0.018761734791343965, "learning_rate": 9.798e-06, "loss": 2.4033, "step": 4900 }, { "epoch": 0.098, "eval_loss": 2.4270286560058594, "eval_runtime": 33.0269, "eval_samples_per_second": 3.543, "eval_steps_per_second": 1.786, "step": 4900 }, { "epoch": 0.0985, "grad_norm": 0.018964507342139367, "learning_rate": 9.848000000000001e-06, "loss": 2.4211, "step": 4925 }, { "epoch": 0.099, "grad_norm": 0.01858861943184826, "learning_rate": 9.898e-06, "loss": 2.4032, "step": 4950 }, { "epoch": 0.0995, "grad_norm": 0.01821023564956819, "learning_rate": 9.948e-06, "loss": 2.4031, "step": 4975 }, { "epoch": 0.1, "grad_norm": 0.018839474555921314, "learning_rate": 9.998000000000002e-06, "loss": 2.4112, "step": 5000 }, { "epoch": 0.1, "eval_loss": 2.426590919494629, "eval_runtime": 33.0133, "eval_samples_per_second": 3.544, "eval_steps_per_second": 1.787, "step": 5000 }, { "epoch": 0.1005, "grad_norm": 0.0187590945164155, "learning_rate": 9.994666666666668e-06, "loss": 2.4164, "step": 5025 }, { "epoch": 0.101, "grad_norm": 0.018683158146542603, "learning_rate": 9.989111111111111e-06, "loss": 2.4082, "step": 5050 }, { "epoch": 0.1015, "grad_norm": 0.017610949419625762, "learning_rate": 9.983555555555556e-06, "loss": 2.4124, "step": 5075 }, { "epoch": 0.102, "grad_norm": 0.01862298073358942, "learning_rate": 9.978000000000002e-06, "loss": 2.409, "step": 5100 }, { "epoch": 0.102, "eval_loss": 2.425841808319092, "eval_runtime": 33.063, "eval_samples_per_second": 3.539, "eval_steps_per_second": 1.784, "step": 5100 }, { "epoch": 0.1025, "grad_norm": 0.025407800531065724, "learning_rate": 9.972444444444445e-06, "loss": 2.4051, "step": 5125 }, { "epoch": 0.103, "grad_norm": 0.01838713779514561, "learning_rate": 9.966888888888889e-06, "loss": 2.4105, "step": 5150 }, { "epoch": 0.1035, "grad_norm": 0.018921321521659856, "learning_rate": 9.961333333333334e-06, "loss": 2.4191, "step": 5175 }, { "epoch": 0.104, "grad_norm": 0.01824666535901335, "learning_rate": 9.95577777777778e-06, "loss": 2.4115, "step": 5200 }, { "epoch": 0.104, "eval_loss": 2.4254310131073, "eval_runtime": 33.141, "eval_samples_per_second": 3.53, "eval_steps_per_second": 1.78, "step": 5200 }, { "epoch": 0.1045, "grad_norm": 0.018794067362196056, "learning_rate": 9.950222222222223e-06, "loss": 2.4062, "step": 5225 }, { "epoch": 0.105, "grad_norm": 0.01825837669653065, "learning_rate": 9.944666666666668e-06, "loss": 2.4154, "step": 5250 }, { "epoch": 0.1055, "grad_norm": 0.01843310767671649, "learning_rate": 9.939111111111112e-06, "loss": 2.4201, "step": 5275 }, { "epoch": 0.106, "grad_norm": 0.018304681522005508, "learning_rate": 9.933555555555557e-06, "loss": 2.4089, "step": 5300 }, { "epoch": 0.106, "eval_loss": 2.424731492996216, "eval_runtime": 33.0325, "eval_samples_per_second": 3.542, "eval_steps_per_second": 1.786, "step": 5300 }, { "epoch": 0.1065, "grad_norm": 0.01846362790517963, "learning_rate": 9.928e-06, "loss": 2.4118, "step": 5325 }, { "epoch": 0.107, "grad_norm": 0.01872825463357926, "learning_rate": 9.922444444444446e-06, "loss": 2.4045, "step": 5350 }, { "epoch": 0.1075, "grad_norm": 0.017781011104963246, "learning_rate": 9.91688888888889e-06, "loss": 2.4145, "step": 5375 }, { "epoch": 0.108, "grad_norm": 0.018840752543683545, "learning_rate": 9.911333333333335e-06, "loss": 2.416, "step": 5400 }, { "epoch": 0.108, "eval_loss": 2.423886775970459, "eval_runtime": 33.1239, "eval_samples_per_second": 3.532, "eval_steps_per_second": 1.781, "step": 5400 }, { "epoch": 0.1085, "grad_norm": 0.019278786947294697, "learning_rate": 9.905777777777778e-06, "loss": 2.4117, "step": 5425 }, { "epoch": 0.109, "grad_norm": 0.018430470806705172, "learning_rate": 9.900222222222223e-06, "loss": 2.4114, "step": 5450 }, { "epoch": 0.1095, "grad_norm": 0.018464088455141334, "learning_rate": 9.894666666666669e-06, "loss": 2.4185, "step": 5475 }, { "epoch": 0.11, "grad_norm": 0.01866239126789079, "learning_rate": 9.889111111111112e-06, "loss": 2.4099, "step": 5500 }, { "epoch": 0.11, "eval_loss": 2.423039197921753, "eval_runtime": 35.4471, "eval_samples_per_second": 3.301, "eval_steps_per_second": 1.664, "step": 5500 }, { "epoch": 0.1105, "grad_norm": 0.01827370320895024, "learning_rate": 9.883555555555556e-06, "loss": 2.4078, "step": 5525 }, { "epoch": 0.111, "grad_norm": 0.01863057836209491, "learning_rate": 9.878000000000001e-06, "loss": 2.4044, "step": 5550 }, { "epoch": 0.1115, "grad_norm": 0.018262835671926946, "learning_rate": 9.872444444444446e-06, "loss": 2.4123, "step": 5575 }, { "epoch": 0.112, "grad_norm": 0.017655227692766756, "learning_rate": 9.86688888888889e-06, "loss": 2.4118, "step": 5600 }, { "epoch": 0.112, "eval_loss": 2.4225943088531494, "eval_runtime": 33.2709, "eval_samples_per_second": 3.517, "eval_steps_per_second": 1.773, "step": 5600 }, { "epoch": 0.1125, "grad_norm": 0.01812962067528887, "learning_rate": 9.861333333333333e-06, "loss": 2.4017, "step": 5625 }, { "epoch": 0.113, "grad_norm": 0.018265397582930686, "learning_rate": 9.855777777777779e-06, "loss": 2.4166, "step": 5650 }, { "epoch": 0.1135, "grad_norm": 0.018207114017877214, "learning_rate": 9.850222222222224e-06, "loss": 2.413, "step": 5675 }, { "epoch": 0.114, "grad_norm": 0.01952225079171619, "learning_rate": 9.844666666666667e-06, "loss": 2.4022, "step": 5700 }, { "epoch": 0.114, "eval_loss": 2.42179274559021, "eval_runtime": 33.0648, "eval_samples_per_second": 3.539, "eval_steps_per_second": 1.784, "step": 5700 }, { "epoch": 0.1145, "grad_norm": 0.01780836124763766, "learning_rate": 9.839111111111111e-06, "loss": 2.4128, "step": 5725 }, { "epoch": 0.115, "grad_norm": 0.018290904429709265, "learning_rate": 9.833555555555556e-06, "loss": 2.4119, "step": 5750 }, { "epoch": 0.1155, "grad_norm": 0.019359740861514655, "learning_rate": 9.828000000000001e-06, "loss": 2.4019, "step": 5775 }, { "epoch": 0.116, "grad_norm": 0.018278231474623628, "learning_rate": 9.822444444444445e-06, "loss": 2.4072, "step": 5800 }, { "epoch": 0.116, "eval_loss": 2.4214675426483154, "eval_runtime": 33.0642, "eval_samples_per_second": 3.539, "eval_steps_per_second": 1.784, "step": 5800 }, { "epoch": 0.1165, "grad_norm": 0.017493007146383306, "learning_rate": 9.81688888888889e-06, "loss": 2.4134, "step": 5825 }, { "epoch": 0.117, "grad_norm": 0.018399348008473985, "learning_rate": 9.811333333333334e-06, "loss": 2.4082, "step": 5850 }, { "epoch": 0.1175, "grad_norm": 0.0186494867742927, "learning_rate": 9.805777777777779e-06, "loss": 2.4131, "step": 5875 }, { "epoch": 0.118, "grad_norm": 0.017842605036949514, "learning_rate": 9.800222222222223e-06, "loss": 2.4134, "step": 5900 }, { "epoch": 0.118, "eval_loss": 2.4210917949676514, "eval_runtime": 33.1318, "eval_samples_per_second": 3.531, "eval_steps_per_second": 1.781, "step": 5900 }, { "epoch": 0.1185, "grad_norm": 0.01835138877842204, "learning_rate": 9.794666666666668e-06, "loss": 2.4017, "step": 5925 }, { "epoch": 0.119, "grad_norm": 0.018202303746487493, "learning_rate": 9.789111111111111e-06, "loss": 2.4103, "step": 5950 }, { "epoch": 0.1195, "grad_norm": 0.0176777777086958, "learning_rate": 9.783555555555557e-06, "loss": 2.4023, "step": 5975 }, { "epoch": 0.12, "grad_norm": 0.019351209333625233, "learning_rate": 9.778e-06, "loss": 2.4053, "step": 6000 }, { "epoch": 0.12, "eval_loss": 2.421157121658325, "eval_runtime": 33.0891, "eval_samples_per_second": 3.536, "eval_steps_per_second": 1.783, "step": 6000 } ], "logging_steps": 25, "max_steps": 50000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.6711811550821745e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }